mirror of https://github.com/ollama/ollama.git (synced 2026-05-13 14:27:00 +00:00)
llama/compat: add gemma4 clip handler (gemma4v projector)
Translates Ollama's monolithic gemma4 GGUF into the upstream gemma4v mmproj shape so `--mmproj <same-blob>` loads alongside the text model.

KV synthesis: rewrite the arch to `clip` with `clip.vision.projector_type = gemma4v`, copy `gemma4.vision.*` → `clip.vision.*`, and project `gemma4.embedding_length` → `clip.vision.projection_dim` (= the `mm.input_projection` output dim).

gemma4 vision uses image_mean=[0,0,0] / image_std=[1,1,1] (the model does its own per-image normalization via the v.std_bias / v.std_scale tensors), unlike the [0.5,0.5,0.5] defaults most arches use; this handler injects those values.

No tensor renames are needed: every `v.*` and `mm.*` tensor name (incl. v.patch_embd, v.position_embd, v.std_bias/scale, mm.input_projection, plus the per-block ln1/ln2/attn_post_norm/ffn_post_norm/attn_q_norm/attn_k_norm) already matches the upstream-converted reference mmproj exactly. v.patch_embd.weight is promoted to F32 (a Metal IM2COL requirement, same as gemma3 / mistral3 / deepseek-ocr).

Adds `gemma4` to the Go-side `compatClipArches` allowlist so `ollama run gemma4 -i image.jpg ...` auto-passes the same blob as `--mmproj`. Covers all four published sizes (E2B, E4B, 26B-A4B MoE, 31B dense).

Verified loading: 1189 tensors loaded from the monolithic blob; hparams (projector=gemma4v, n_embd=1152, n_head=16, n_ff=4304, n_layer=27, n_merge=3, image_size=224, patch_size=16) match the migration mmproj exactly. Image encoding/decoding completes (264 vision tokens).

End-to-end testing through llama-server's chat completions surfaces a chat-template detection issue in llama-server: Ollama's gemma4 GGUFs ship without `tokenizer.chat_template`, so llama-server falls back to ChatML, which the model doesn't understand. That issue is orthogonal to compat; `ollama run` supplies the correct template via the Modelfile.
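One detail worth spelling out is why injecting mean=[0,0,0] / std=[1,1,1] is correct. A generic SigLIP/CLIP preprocessor normalizes each channel as (pixel - mean[c]) / std[c], so zeros and ones make that step an identity and defer normalization to the model's own v.std_bias / v.std_scale tensors. An illustrative snippet of just that arithmetic (not the actual clip.cpp preprocessing code):

static inline float normalize_px(float px, float mean, float stddev) {
    // with mean = 0.0f and stddev = 1.0f this returns px unchanged,
    // leaving per-image normalization to v.std_bias / v.std_scale
    return (px - mean) / stddev;
}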
This commit is contained in:
parent 9e3b542257
commit 034fee349c

2 changed files with 47 additions and 0 deletions
llama/compat/llama-ollama-compat.cpp (vendored): 46 additions
@@ -823,6 +823,48 @@ void handle_deepseekocr_clip(gguf_context * meta, ggml_context * ctx) {
    promote_tensor_to_f32(meta, ctx, "v.position_embd.weight");
}

// =========================================================================
// gemma4 (clip side — gemma4v projector)
// =========================================================================
//
// Ollama's monolithic gemma4 GGUF embeds a SigLIP-style ViT plus the
// gemma4v projector (a single `mm.input_projection`). All v.* / mm.*
// tensor names already match upstream's PROJECTOR_TYPE_GEMMA4V — this
// handler only needs KV translation and an F32 promote of the patch
// embedding (Metal IM2COL).
//
// gemma4 vision uses image normalization mean=[0,0,0] / std=[1,1,1]
// (the LM does its own per-image normalization via v.std_bias /
// v.std_scale tensors) — different from the [0.5,0.5,0.5] used by
// most other arches.

void handle_gemma4_clip(gguf_context * meta, ggml_context * ctx) {
    LLAMA_LOG_INFO("%s: detected Ollama-format gemma4 GGUF used as mmproj; translating\n", __func__);

    copy_u32_kv(meta, "gemma4.vision.block_count", "clip.vision.block_count");
    copy_u32_kv(meta, "gemma4.vision.embedding_length", "clip.vision.embedding_length");
    copy_u32_kv(meta, "gemma4.vision.feed_forward_length", "clip.vision.feed_forward_length");
    copy_u32_kv(meta, "gemma4.vision.attention.head_count", "clip.vision.attention.head_count");
    copy_f32_kv(meta, "gemma4.vision.attention.layer_norm_epsilon", "clip.vision.attention.layer_norm_epsilon");
    copy_u32_kv(meta, "gemma4.vision.patch_size", "clip.vision.patch_size");
    // gemma4 vision is fixed at 224x224 patches.
    inject_u32_if_missing(meta, "clip.vision.image_size", 224);
    // projection_dim = LM embedding length.
    copy_u32_kv(meta, "gemma4.embedding_length", "clip.vision.projection_dim");

    static const float kZeros[3] = {0.0f, 0.0f, 0.0f};
    static const float kOnes [3] = {1.0f, 1.0f, 1.0f};
    inject_f32_arr_if_missing(meta, "clip.vision.image_mean", kZeros, 3);
    inject_f32_arr_if_missing(meta, "clip.vision.image_std", kOnes, 3);

    inject_bool_if_missing(meta, "clip.has_vision_encoder", true);
    gguf_set_val_str(meta, "clip.vision.projector_type", "gemma4v");
    gguf_set_val_str(meta, "general.architecture", "clip");

    // Metal IM2COL needs F32 patch_embd weights (same as other arches).
    promote_tensor_to_f32(meta, ctx, "v.patch_embd.weight");
}

// =========================================================================
// llama4 (clip side)
// =========================================================================
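The copy/inject helpers the handler calls are defined earlier in llama-ollama-compat.cpp and are not part of this hunk. A minimal sketch of their likely shape against ggml's public gguf API (the names come from the diff; the bodies are assumptions):

#include <cstdint>
#include "gguf.h"

static void copy_u32_kv(gguf_context * meta, const char * src, const char * dst) {
    const int64_t id = gguf_find_key(meta, src); // negative if the key is absent
    if (id >= 0) {
        gguf_set_val_u32(meta, dst, gguf_get_val_u32(meta, id));
    }
}

static void inject_u32_if_missing(gguf_context * meta, const char * key, uint32_t v) {
    if (gguf_find_key(meta, key) < 0) {
        gguf_set_val_u32(meta, key, v);
    }
}

static void inject_f32_arr_if_missing(gguf_context * meta, const char * key,
                                      const float * data, size_t n) {
    if (gguf_find_key(meta, key) < 0) {
        gguf_set_arr_data(meta, key, GGUF_TYPE_FLOAT32, data, n);
    }
}

copy_f32_kv and inject_bool_if_missing would follow the same pattern via gguf_get_val_f32 / gguf_set_val_f32 and gguf_set_val_bool.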
@@ -1104,6 +1146,10 @@ void translate_clip_metadata(gguf_context * meta, ggml_context * ctx) {
        handle_llama4_clip(meta, ctx);
        return;
    }
    if (detect_ollama_gemma4(meta, ctx)) {
        handle_gemma4_clip(meta, ctx);
        return;
    }
}

bool should_skip_tensor(const llama_model_loader * ml, const char * tensor_name) {
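detect_ollama_gemma4 is likewise outside this hunk. Going by how the handler is dispatched, a plausible shape (hypothetical, not the committed code) is an architecture check plus a probe for the vision-tower KVs:

#include <cstring>

static bool detect_ollama_gemma4(const gguf_context * meta, ggml_context * /*ctx*/) {
    // Ollama-format monolithic blob: general.architecture is still "gemma4", not "clip".
    const int64_t arch = gguf_find_key(meta, "general.architecture");
    if (arch < 0 || strcmp(gguf_get_val_str(meta, arch), "gemma4") != 0) {
        return false;
    }
    // Only an mmproj candidate if the vision-tower metadata is actually present.
    return gguf_find_key(meta, "gemma4.vision.block_count") >= 0;
}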
@@ -434,6 +434,7 @@ func NewLlamaServerRunner(
    // the compat layer's clip-side coverage in llama/compat/.
    compatClipArches := map[string]bool{
        "gemma3":      true,
        "gemma4":      true,
        "qwen35moe":   true,
        "mistral3":    true,
        "deepseekocr": true,