llama/compat: add glm-ocr clip handler (glm4v projector)

GLM-OCR ships a vision tower whose Ollama-format names already mostly
match upstream's PROJECTOR_TYPE_GLM4V expectations:

  v.blk.X.attn_qkv / attn_out / attn_q_norm / attn_k_norm    ✓
  v.blk.X.ln1 / ln2 / ffn_{gate,up,down}                     ✓
  mm.model.fc, mm.up, mm.gate, mm.down,
  mm.post_norm, mm.patch_merger                              ✓

Two differences to fix (sketched just after this list):
  * Patch-embed temporal pair: Ollama uses `v.patch_embd_0.weight` /
    `v.patch_embd_1.weight` (underscore-suffixed); upstream uses
    `v.patch_embd.weight` (unsuffixed) + `v.patch_embd.weight.1`
    (with TN_PATCH_EMBD_1). Exact rename for both.
  * F32 promote of `v.patch_embd.weight{,.1}` for Metal IM2COL
    (same fix as gemma3 / mistral3 / deepseek-ocr).
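
As a shape for the patch-embed fix, a table-driven sketch equivalent to
the two pairs of calls in the handler below. The `rename_entry` type and
the wrapper function are illustrative only, not the committed code:

  // Illustrative restatement of the rename + F32-promote pair; the
  // committed handler performs the same four calls explicitly.
  struct rename_entry { const char * src; const char * dst; };
  static void fix_glmocr_patch_embd(gguf_context * meta, ggml_context * ctx) {
      static const rename_entry pairs[] = {
          { "v.patch_embd_0.weight", "v.patch_embd.weight"   },  // temporal slot 0
          { "v.patch_embd_1.weight", "v.patch_embd.weight.1" },  // temporal slot 1
      };
      for (const auto & e : pairs) {
          rename_tensor(meta, ctx, e.src, e.dst);   // exact rename
          promote_tensor_to_f32(meta, ctx, e.dst);  // Metal IM2COL wants F32
      }
  }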

KV synthesis: rewrite arch to `clip` with `projector_type=glm4v`,
copy `glmocr.vision.*` → `clip.vision.*` (incl. spatial_merge_size,
out_hidden_size → projection_dim, intermediate_size →
feed_forward_length, layer_norm_rms_epsilon →
attention.layer_norm_epsilon), copy through Ollama's `image_mean` /
`image_std` arrays, set `clip.use_silu=true`.
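
The copies go through small helpers defined earlier in llama/compat and
not shown in this diff. A minimal sketch of the u32 variant, assuming the
stock gguf.h API; the committed helper may differ in detail:

  // Sketch of copy_u32_kv (illustrative; the real helper lives earlier
  // in the file): mirror a u32 KV from the Ollama-side key to the
  // clip-side key, doing nothing if the source key is absent.
  static void copy_u32_kv(gguf_context * meta, const char * src, const char * dst) {
      const int64_t kid = gguf_find_key(meta, src);
      if (kid >= 0) {
          gguf_set_val_u32(meta, dst, gguf_get_val_u32(meta, kid));
      }
  }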

Adds `glmocr` to the Go-side `compatClipArches` allowlist.

Verified: with --mmproj pointing at the same blob, the mmproj loads
cleanly with all glm4v hparams set and patch_embd promoted to F32.
End-to-end testing goes through `ollama run glm-ocr`, which supplies the
proper chat template via its Modelfile and is the user-facing flow.
jmorganca 2026-04-19 17:12:28 -07:00
parent 7e07653271
commit f1bd1a25ac
2 changed files with 66 additions and 0 deletions


@@ -995,6 +995,67 @@ void handle_gemma4_clip(gguf_context * meta, ggml_context * ctx) {
    promote_tensor_to_f32(meta, ctx, "v.patch_embd.weight");
}
// =========================================================================
// glm-ocr (clip side — glm4v projector)
// =========================================================================
//
// Ollama stores the GLM4V vision tower with v.blk.X.* tensor names that
// already match upstream's expectations (`attn_qkv`, `attn_out`,
// `attn_q_norm`, `attn_k_norm`, `ln1`/`ln2`, `ffn_{gate,up,down}`).
// Most of mm.* (mm.model.fc, mm.up/gate/down, mm.post_norm,
// mm.patch_merger) is also already named correctly. The two diffs:
// * `v.patch_embd_0.weight` / `v.patch_embd_1.weight` → upstream's
//   pixel-shuffle patch-embed pair `v.patch_embd.weight` /
//   `v.patch_embd.weight.1`.
// * F32 promote of patch_embd weights (Metal IM2COL).
void handle_glmocr_clip(gguf_context * meta, ggml_context * ctx) {
    LLAMA_LOG_INFO("%s: detected Ollama-format glm-ocr GGUF used as mmproj; translating\n", __func__);
    copy_u32_kv(meta, "glmocr.vision.block_count", "clip.vision.block_count");
    copy_u32_kv(meta, "glmocr.vision.embedding_length", "clip.vision.embedding_length");
    copy_u32_kv(meta, "glmocr.vision.intermediate_size", "clip.vision.feed_forward_length");
    copy_u32_kv(meta, "glmocr.vision.attention.head_count", "clip.vision.attention.head_count");
    copy_f32_kv(meta, "glmocr.vision.attention.layer_norm_rms_epsilon", "clip.vision.attention.layer_norm_epsilon");
    copy_u32_kv(meta, "glmocr.vision.image_size", "clip.vision.image_size");
    copy_u32_kv(meta, "glmocr.vision.patch_size", "clip.vision.patch_size");
    copy_u32_kv(meta, "glmocr.vision.spatial_merge_size", "clip.vision.spatial_merge_size");
    copy_u32_kv(meta, "glmocr.vision.out_hidden_size", "clip.vision.projection_dim");
    // Ollama already shipped image_mean / image_std under glmocr.vision.*;
    // copy them through.
    {
        const int64_t kid = gguf_find_key(meta, "glmocr.vision.image_mean");
        if (kid >= 0 && !has_key(meta, "clip.vision.image_mean")) {
            const size_t n = gguf_get_arr_n(meta, kid);
            gguf_set_arr_data(meta, "clip.vision.image_mean", GGUF_TYPE_FLOAT32,
                              gguf_get_arr_data(meta, kid), n);
        }
    }
    {
        const int64_t kid = gguf_find_key(meta, "glmocr.vision.image_std");
        if (kid >= 0 && !has_key(meta, "clip.vision.image_std")) {
            const size_t n = gguf_get_arr_n(meta, kid);
            gguf_set_arr_data(meta, "clip.vision.image_std", GGUF_TYPE_FLOAT32,
                              gguf_get_arr_data(meta, kid), n);
        }
    }
    inject_bool_if_missing(meta, "clip.has_vision_encoder", true);
    inject_bool_if_missing(meta, "clip.use_silu", true);
    gguf_set_val_str(meta, "clip.projector_type", "glm4v");
    gguf_set_val_str(meta, "general.architecture", "clip");
    // Patch-embed temporal pair: Ollama uses _0/_1 suffixes, upstream uses
    // unsuffixed/.1.
    rename_tensor(meta, ctx, "v.patch_embd_0.weight", "v.patch_embd.weight");
    rename_tensor(meta, ctx, "v.patch_embd_1.weight", "v.patch_embd.weight.1");
    // F32 promote for IM2COL on Metal (same fix as gemma3 / mistral3).
    promote_tensor_to_f32(meta, ctx, "v.patch_embd.weight");
    promote_tensor_to_f32(meta, ctx, "v.patch_embd.weight.1");
}
// =========================================================================
// llama4 (clip side)
// =========================================================================
@@ -1281,6 +1342,10 @@ void translate_clip_metadata(gguf_context * meta, ggml_context * ctx) {
        handle_gemma4_clip(meta, ctx);
        return;
    }
    if (detect_ollama_glmocr(meta)) {
        handle_glmocr_clip(meta, ctx);
        return;
    }
}
bool should_skip_tensor(const llama_model_loader * ml, const char * tensor_name) {
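
The dispatch above gates on `detect_ollama_glmocr`, which is defined
alongside the other detectors earlier in the file and is not part of this
diff. A minimal sketch of such a check, assuming it probes the
pre-translation architecture string (the committed detector may also
require the presence of glmocr.vision.* keys):

  // Hypothetical sketch only; not the committed detector. Before the
  // translation runs, general.architecture still reads "glmocr".
  // (Needs <cstring> for strcmp.)
  static bool detect_ollama_glmocr(const gguf_context * meta) {
      const int64_t kid = gguf_find_key(meta, "general.architecture");
      return kid >= 0 && strcmp(gguf_get_val_str(meta, kid), "glmocr") == 0;
  }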


@@ -438,6 +438,7 @@ func NewLlamaServerRunner(
		"qwen35moe": true,
		"mistral3": true,
		"deepseekocr": true,
		"glmocr": true,
		"llama4": true,
		// Add entries as llama/compat grows clip handlers.
	}