llama/compat: add llama4 text + clip handlers
Text side (handle_llama4): standard skip of `v.*` / `mm.*` from the
text loader so the n_tensors check passes. Text-side KVs and tensor
names match upstream verbatim. Tested: llama4:scout 17B-16E generates
correctly with `-c 4096` (the default 10M context OOMs even on 110GB
of unified memory; that's a memory-budget issue, not a compat one).
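The prefix-skip plumbing that `add_skip_prefix` feeds is outside this
commit. A minimal sketch of how it could be wired (the `skip_prefixes`
store and the simplified signatures, without the `llama_model_loader`
parameter, are assumptions rather than the actual loader internals):

```cpp
#include <cstring>
#include <string>
#include <vector>

// Hypothetical stand-in for the real llama/compat plumbing (not shown in
// this diff): collect prefixes, then let the loader ask per tensor.
static std::vector<std::string> skip_prefixes;

static void add_skip_prefix(const char * prefix) {
    skip_prefixes.emplace_back(prefix);
}

// The text loader would consult this before counting/loading a tensor,
// which is how the embedded `v.*`/`mm.*` vision tensors stay out of
// n_tensors.
static bool should_skip_tensor(const char * name) {
    for (const auto & p : skip_prefixes) {
        if (std::strncmp(name, p.c_str(), p.size()) == 0) return true;
    }
    return false;
}
```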
Clip side (handle_llama4_clip): rewrites the arch to `clip` with
`projector_type=llama4`, copies `llama4.vision.*` → `clip.vision.*`,
projects `llama4.embedding_length` → `clip.vision.projection_dim`, and
injects pixtral-style image mean/std defaults plus `projector.scale_factor=2`.
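Each KV projection is a read-then-rewrite on the gguf context. A sketch
of the likely shape of `copy_u32_kv` (the helper body is an assumption;
`gguf_find_key`/`gguf_get_val_u32`/`gguf_set_val_u32` are real ggml API):

```cpp
#include <cstdint>
#include "gguf.h"  // ggml's GGUF header; include path may vary by vendoring

// Read a u32 KV if present and write it back under a new key; a missing
// source key leaves the destination unset.
static void copy_u32_kv(gguf_context * meta, const char * from, const char * to) {
    const int64_t kid = gguf_find_key(meta, from);
    if (kid < 0) {
        return;
    }
    gguf_set_val_u32(meta, to, gguf_get_val_u32(meta, kid));
}
```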
Tensor renames cover three scopes:

* Vision adapter MLP → MM-MLP slots (run BEFORE the generic
  `.mlp.fc{1,2}` rename so the substring match stays anchored to the
  adapter prefix; see the ordering sketch after this list):
  `v.vision_adapter.mlp.fc{1,2}` → `mm.model.mlp.{1,2}`.
* Main projector: `mm.linear_1` → `mm.model.fc`.
* Vision tower: `class_embedding`/`layernorm_pre`/`layernorm_post`/
  `patch_embedding` → `class_embd`/`pre_ln`/`post_ln`/`patch_embd`,
  plus `positional_embedding_vlm` → `position_embd.weight` (an exact
  rename; Ollama's name carries no `.weight` suffix). Block leaves get
  the same treatment as gemma3/mistral3: `attn_output` → `attn_out`,
  `attn_norm`/`ffn_norm` → `ln1`/`ln2`, `mlp.fc{1,2}` → `ffn_{up,down}`.
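A stand-alone sketch of why the table order matters (the real
`rename_tensors_containing` rewrites names in the gguf/ggml contexts;
this hypothetical `rename_containing` only shows the string transform):

```cpp
#include <cstring>
#include <string>

// Replace the first occurrence of `from` inside a tensor name.
static std::string rename_containing(std::string name, const char * from, const char * to) {
    const size_t pos = name.find(from);
    if (pos != std::string::npos) {
        name.replace(pos, std::strlen(from), to);
    }
    return name;
}

// Applied in table order:
//   "v.vision_adapter.mlp.fc1.weight" -> "mm.model.mlp.1.weight"
//     (adapter-specific rule fires first)
//   "v.blk.0.mlp.fc1.weight"          -> "v.blk.0.ffn_up.weight"
//     (generic rule; the adapter tensor no longer matches)
// If the generic `.mlp.fc1` rule ran first, the adapter tensor would become
// "v.vision_adapter.ffn_up.weight" and the adapter rule would never match.
```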
Adds `llama4` to the Go-side `compatClipArches` allowlist.
Status: clip metadata + tensors load successfully; the name mapping is
verified to match the upstream-converted reference mmproj exactly.
mtmd_encode_chunk currently fails at runtime with "failed to encode
image slice" (cause not yet diagnosed; tensor shapes/dtypes look
right, so it may be a graph-build mismatch in upstream's llama4 clip
path). Text-side llama4 is fully functional.
parent 2c7850dbaf
commit 9e3b542257

2 changed files with 100 additions and 0 deletions

llama/compat/llama-ollama-compat.cpp (vendored): +99
@@ -350,6 +350,34 @@ void handle_nemotron_h_moe(const llama_model_loader * ml, gguf_context * meta, g
    add_skip_prefix(ml, "mtp.");
}

// =========================================================================
// llama4 (text side)
// =========================================================================
//
// Same arch name on both sides. Ollama publishes a monolithic GGUF that
// embeds the vision encoder + projector inline. Text-side KVs/tensor
// names match upstream verbatim — only fix is to hide `v.*`/`mm.*` from
// the text loader so n_tensors lines up.

bool detect_ollama_llama4(const gguf_context * meta, const ggml_context * ctx) {
    const int64_t arch_kid = gguf_find_key(meta, "general.architecture");
    if (arch_kid < 0) return false;
    if (std::strcmp(gguf_get_val_str(meta, arch_kid), "llama4") != 0) return false;
    return any_tensor_with_prefix(ctx, "v.")
        || any_tensor_with_prefix(ctx, "mm.");
}

void handle_llama4(const llama_model_loader * ml, gguf_context * meta, ggml_context * ctx) {
    if (!detect_ollama_llama4(meta, ctx)) return;
    (void) meta;
    (void) ctx;

    LLAMA_LOG_INFO("%s: detected Ollama-format llama4 GGUF; applying compatibility fixes\n", __func__);

    add_skip_prefix(ml, "v.");
    add_skip_prefix(ml, "mm.");
}

// =========================================================================
// gpt-oss (text only)
// =========================================================================
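`any_tensor_with_prefix` is used by the detector but lives outside this
hunk; a plausible sketch against the public ggml API (the helper body is
an assumption, the `ggml_*` calls are real):

```cpp
#include <cstring>
#include "ggml.h"

// Walk every tensor in the context and test its name against the prefix.
static bool any_tensor_with_prefix(const ggml_context * ctx, const char * prefix) {
    const size_t len = std::strlen(prefix);
    for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr;
         t = ggml_get_next_tensor(ctx, t)) {
        if (std::strncmp(ggml_get_name(t), prefix, len) == 0) {
            return true;
        }
    }
    return false;
}
```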
@@ -795,6 +823,72 @@ void handle_deepseekocr_clip(gguf_context * meta, ggml_context * ctx) {
    promote_tensor_to_f32(meta, ctx, "v.position_embd.weight");
}

// =========================================================================
// llama4 (clip side)
// =========================================================================
//
// Ollama's monolithic llama4 GGUF embeds the CLIP-style ViT and a 3-layer
// projector (`mm.linear_1` + `v.vision_adapter.mlp.fc1/fc2`). Upstream's
// PROJECTOR_TYPE_LLAMA4 expects the projector under `mm.model.fc` /
// `mm.model.mlp.{1,2}` and standard CLIP block leaf names.

constexpr std::pair<const char *, const char *> kLlama4ClipRenames[] = {
    // Vision-adapter MLP -> upstream's MM-MLP slots. Run BEFORE the generic
    // `.mlp.fc{1,2}` -> `.ffn_{up,down}` rename so the substring match stays
    // pinned to the adapter prefix.
    {"v.vision_adapter.mlp.fc1", "mm.model.mlp.1"},
    {"v.vision_adapter.mlp.fc2", "mm.model.mlp.2"},

    // Main projector.
    {"mm.linear_1", "mm.model.fc"},

    // Vision tower non-blk.
    {"v.class_embedding", "v.class_embd"},
    {"v.layernorm_post", "v.post_ln"},
    {"v.layernorm_pre", "v.pre_ln"},
    {"v.patch_embedding", "v.patch_embd"},

    // Vision-tower block leaves.
    {".attn_output", ".attn_out"},
    {".attn_norm", ".ln1"},
    {".ffn_norm", ".ln2"},
    {".mlp.fc1", ".ffn_up"},
    {".mlp.fc2", ".ffn_down"},
};

void handle_llama4_clip(gguf_context * meta, ggml_context * ctx) {
    LLAMA_LOG_INFO("%s: detected Ollama-format llama4 GGUF used as mmproj; translating\n", __func__);

    copy_u32_kv(meta, "llama4.vision.block_count", "clip.vision.block_count");
    copy_u32_kv(meta, "llama4.vision.embedding_length", "clip.vision.embedding_length");
    copy_u32_kv(meta, "llama4.vision.feed_forward_length", "clip.vision.feed_forward_length");
    copy_u32_kv(meta, "llama4.vision.attention.head_count", "clip.vision.attention.head_count");
    copy_u32_kv(meta, "llama4.vision.image_size", "clip.vision.image_size");
    copy_u32_kv(meta, "llama4.vision.patch_size", "clip.vision.patch_size");
    copy_f32_kv(meta, "llama4.vision.layer_norm_epsilon", "clip.vision.attention.layer_norm_epsilon");
    // projection_dim = LM embedding length (= mm.model.fc output dim).
    copy_u32_kv(meta, "llama4.embedding_length", "clip.vision.projection_dim");

    // Defaults (match the upstream-converted reference mmproj).
    inject_u32_if_missing(meta, "clip.vision.projector.scale_factor", 2);

    static const float kHalfHalfHalf[3] = {0.5f, 0.5f, 0.5f};
    inject_f32_arr_if_missing(meta, "clip.vision.image_mean", kHalfHalfHalf, 3);
    inject_f32_arr_if_missing(meta, "clip.vision.image_std", kHalfHalfHalf, 3);

    inject_bool_if_missing(meta, "clip.has_vision_encoder", true);
    inject_bool_if_missing(meta, "clip.use_gelu", true);
    gguf_set_val_str(meta, "clip.projector_type", "llama4");
    gguf_set_val_str(meta, "general.architecture", "clip");

    // Position embedding has no `.weight` suffix in Ollama; rename exactly.
    rename_tensor(meta, ctx, "v.positional_embedding_vlm", "v.position_embd.weight");

    for (const auto & [from, to] : kLlama4ClipRenames) {
        rename_tensors_containing(meta, ctx, from, to);
    }
}

// =========================================================================
// mistral3 (clip side — pixtral projector)
// =========================================================================
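Likewise, the `inject_*_if_missing` helpers called above are outside this
hunk; a sketch of the pattern (helper bodies are assumptions;
`gguf_find_key`/`gguf_set_val_u32`/`gguf_set_arr_data` are real ggml API):

```cpp
#include <cstddef>
#include <cstdint>
#include "gguf.h"

// Write a default only when the key is absent, so a GGUF that already
// carries the value keeps it.
static void inject_u32_if_missing(gguf_context * meta, const char * key, uint32_t val) {
    if (gguf_find_key(meta, key) < 0) {
        gguf_set_val_u32(meta, key, val);
    }
}

static void inject_f32_arr_if_missing(gguf_context * meta, const char * key,
                                      const float * data, size_t n) {
    if (gguf_find_key(meta, key) < 0) {
        gguf_set_arr_data(meta, key, GGUF_TYPE_FLOAT32, data, n);
    }
}
```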
@@ -981,6 +1075,7 @@ void translate_metadata(const llama_model_loader * ml,
    if (arch_name == "mistral3")       handle_mistral3      (ml, meta, ctx);
    if (arch_name == "deepseekocr")    handle_deepseekocr   (ml, meta, ctx, arch_name);
    if (arch_name == "nemotron_h_moe") handle_nemotron_h_moe(ml, meta, ctx);
    if (arch_name == "llama4")         handle_llama4        (ml, meta, ctx);
    // Dispatch. Add more arches as they are wired up.
}
@@ -1005,6 +1100,10 @@ void translate_clip_metadata(gguf_context * meta, ggml_context * ctx) {
        handle_deepseekocr_clip(meta, ctx);
        return;
    }
    if (detect_ollama_llama4(meta, ctx)) {
        handle_llama4_clip(meta, ctx);
        return;
    }
}

bool should_skip_tensor(const llama_model_loader * ml, const char * tensor_name) {
@@ -437,6 +437,7 @@ func NewLlamaServerRunner(
    "qwen35moe": true,
    "mistral3": true,
    "deepseekocr": true,
    "llama4": true,
    // Add entries as llama/compat grows clip handlers.
}
if len(projectors) == 0 &&