mirror of https://github.com/ollama/ollama.git
llama/compat: add glm-ocr text handler + text-loader load-op hook
glm-ocr (text side):
* Arch rename `glmocr` → `glm4` (incl. KV prefix); upstream supports
GLM-OCR via LLM_ARCH_GLM4 with n_layer=17 (16 main + 1 nextn). We
report n_layer=16 and leave nextn_predict_layers absent — the
Ollama blob doesn't ship the nextn layer's weights.
* M-RoPE: pad `rope.mrope_section` (3 elements) →
`rope.dimension_sections` (4 elements with trailing 0).
* Inject `rope.dimension_count = key_length`.
* Tokenizer pre-tokenizer rename `llama-bpe` → `chatglm-bpe`.
* Tensor renames: `attn_out`→`attn_output`, `post_attn_norm`→
`post_attention_norm`, `post_ffn_norm`→`post_ffw_norm`.
* **Per-block FFN concat**: GLM4 expects fused
`ffn_up.weight: [n_embd, n_ff*2]` (gate || up). Ollama writes
separate `ffn_gate.weight` + `ffn_up.weight` (each `[n_embd, n_ff]`).
Register a load-time concat op that stitches gate+up into the fused
upstream slot, then add a per-block skip-prefix for the orphan
`blk.X.ffn_gate.` so the n_tensors check lines up (a byte-level
sketch of the stitch follows this list).
* Hide embedded `v.*`/`mm.*` from the text loader.
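For illustration, a minimal standalone sketch of the byte-level stitch. It assumes the usual ggml layout (row-major with ne[0] contiguous, so fusing along ne[1] is a plain byte concatenation of the two sources); `read_at` and `concat_load` are hypothetical helpers for this sketch, not the real load-op API:

    #include <cstddef>
    #include <cstdio>
    #include <vector>

    // Hypothetical helper (sketch only): read `size` bytes at absolute
    // file offset `off` into `dst`.
    static bool read_at(std::FILE * f, long off, void * dst, std::size_t size) {
        return std::fseek(f, off, SEEK_SET) == 0 &&
               std::fread(dst, 1, size, f) == size;
    }

    // Stitch gate || up into one contiguous buffer. Gate goes first so
    // upstream's swiglu, silu(first_half) * second_half, yields
    // silu(gate) * up.
    static bool concat_load(std::FILE * f,
                            long gate_off, std::size_t gate_size,
                            long up_off,   std::size_t up_size,
                            std::vector<char> & dst) {
        dst.resize(gate_size + up_size);
        return read_at(f, gate_off, dst.data(), gate_size) &&
               read_at(f, up_off,   dst.data() + gate_size, up_size);
    }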
This is the first text-side compat that needs custom load-time tensor
data (the FFN concat). Until now load-op support only covered the
clip side. New plumbing:
* `set_loader_path(ml, fname)` — store the model file path on a
per-loader registry, called from the loader constructor.
* `maybe_load_text_tensor(ml, cur, off, buft)` — the text-side
counterpart to `maybe_load_tensor`; looks up the path from the
registry then delegates to the existing load-op machinery.
* Upstream patch grows two new lines: a `set_loader_path` call in
the constructor and a `maybe_load_text_tensor` hook in
`load_all_data` (before the use_mmap branch). A condensed
sketch of the registry pattern follows this list.
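Condensed, the registry is just shared state keyed by the loader pointer. A sketch with a stand-in `Loader` type (the real code keys on `const llama_model_loader *`):

    #include <mutex>
    #include <string>
    #include <unordered_map>

    struct Loader {};  // stand-in for llama_model_loader (sketch only)

    static std::mutex s_mu;
    static std::unordered_map<const Loader *, std::string> s_paths;

    // Called where the file name is in scope (the loader constructor).
    void remember_path(const Loader * l, const char * fname) {
        std::lock_guard<std::mutex> lk(s_mu);
        s_paths[l] = fname ? fname : "";
    }

    // Called at the read site, which only has the loader pointer.
    bool recall_path(const Loader * l, std::string & out) {
        std::lock_guard<std::mutex> lk(s_mu);
        auto it = s_paths.find(l);
        if (it == s_paths.end() || it->second.empty()) return false;
        out = it->second;
        return true;
    }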
Verified: with --no-mmap, glm-ocr's blk.X.ffn_up.weight load fires
the concat op (28MB per block on the 1B variant) and the model emits
coherent text. Through `ollama run` the proper chat template applies.
Note: vision (clip) handler is a follow-up.
parent 5d45391016
commit 7e07653271
3 changed files with 157 additions and 2 deletions
136 llama/compat/llama-ollama-compat.cpp (vendored)
@@ -5,7 +5,9 @@
#include <cstdio>
#include <cstring>
#include <mutex>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
@@ -397,6 +399,115 @@ void handle_llama4(const llama_model_loader * ml, gguf_context * meta, ggml_cont
    add_skip_prefix(ml, "mm.");
}

// =========================================================================
// glm-ocr (text side)
// =========================================================================
//
// Ollama uses arch name "glmocr" / KV prefix "glmocr.*" with 16 blocks.
// Upstream uses "glm4" / "glm4.*" — the GLM-OCR variant of LLM_ARCH_GLM4
// is identified by `n_layer = 17` (16 main + 1 nextn predict layer).
// Ollama drops the nextn layer entirely, so we report n_layer = 16 and
// leave `nextn_predict_layers` absent (defaults to 0 = no nextn path).
//
// Bigger surgery: GLM4 expects fused gate+up MLP weights stored at
// `blk.X.ffn_up.weight` with shape `[n_embd, n_ff*2]`. Ollama writes
// the gate and up halves as separate `ffn_gate.weight` / `ffn_up.weight`
// tensors (each `[n_embd, n_ff]`). We register a concat load op that
// reads gate+up bytes and stitches them into the fused upstream slot.

// Per-block: register a concat load that fuses Ollama's separate
// ffn_gate + ffn_up into upstream's single `blk.X.ffn_up.weight`
// tensor with doubled out dim. Capture source file offsets BEFORE any
// renames invalidate them (same pattern as qwen35moe QKV merge).
void register_glm4_ffn_concat(gguf_context * meta, ggml_context * ctx, int block_idx) {
    char gate_n[64], up_n[64];
    std::snprintf(gate_n, sizeof(gate_n), "blk.%d.ffn_gate.weight", block_idx);
    std::snprintf(up_n,   sizeof(up_n),   "blk.%d.ffn_up.weight",   block_idx);

    if (!ggml_get_tensor(ctx, gate_n) || !ggml_get_tensor(ctx, up_n)) return;

    // GLM4's fused ffn_up has gate as first half, up as second half
    // (so ggml_swiglu's silu(first_half) * second_half gives silu(gate) * up).
    register_concat_load(meta, up_n, {gate_n, up_n});

    if (ggml_tensor * t = ggml_get_tensor(ctx, up_n)) {
        set_tensor_shape(t, {t->ne[0], t->ne[1] * 2});
    }
}

bool detect_ollama_glmocr(const gguf_context * meta) {
    const int64_t arch_kid = gguf_find_key(meta, "general.architecture");
    if (arch_kid < 0) return false;
    return std::strcmp(gguf_get_val_str(meta, arch_kid), "glmocr") == 0;
}

void handle_glmocr(const llama_model_loader * ml, gguf_context * meta,
                   ggml_context * ctx, std::string & arch_name) {
    if (!detect_ollama_glmocr(meta)) return;

    LLAMA_LOG_INFO("%s: detected Ollama-format glmocr GGUF; applying compatibility fixes\n", __func__);

    gguf_set_val_str(meta, "general.architecture", "glm4");
    rename_kv_prefix(meta, "glmocr.", "glm4.");
    arch_name = "glm4";

    // M-RoPE: Ollama writes a 3-element `rope.mrope_section`, upstream expects
    // a 4-element `rope.dimension_sections` (pad trailing 0).
    {
        const int64_t kid = gguf_find_key(meta, "glm4.rope.mrope_section");
        if (kid >= 0 && gguf_get_arr_n(meta, kid) == 3) {
            const auto * src = static_cast<const int32_t *>(gguf_get_arr_data(meta, kid));
            const int32_t padded[4] = { src[0], src[1], src[2], 0 };
            gguf_set_arr_data(meta, "glm4.rope.dimension_sections",
                              GGUF_TYPE_INT32, padded, 4);
        }
    }

    // Inject `rope.dimension_count` from key_length (used as the rope dim).
    if (!has_key(meta, "glm4.rope.dimension_count")) {
        const int64_t kid = gguf_find_key(meta, "glm4.attention.key_length");
        if (kid >= 0) {
            gguf_set_val_u32(meta, "glm4.rope.dimension_count",
                             gguf_get_val_u32(meta, kid));
        }
    }

    // Tokenizer pre-tokenizer: Ollama wrote `llama-bpe`, but glm-ocr uses
    // `chatglm-bpe` (different regex split — wrong pre-tokenization can
    // fragment GLM's special tokens).
    {
        const int64_t kid = gguf_find_key(meta, "tokenizer.ggml.pre");
        if (kid >= 0) {
            const char * cur = gguf_get_val_str(meta, kid);
            if (cur && std::strcmp(cur, "chatglm-bpe") != 0) {
                gguf_set_val_str(meta, "tokenizer.ggml.pre", "chatglm-bpe");
            }
        }
    }

    // Tensor renames (substring): each leaf appears once per block and
    // doesn't overlap the others.
    rename_tensors_containing(meta, ctx, ".attn_out",       ".attn_output");
    rename_tensors_containing(meta, ctx, ".post_attn_norm", ".post_attention_norm");
    rename_tensors_containing(meta, ctx, ".post_ffn_norm",  ".post_ffw_norm");

    // Fuse ffn_gate + ffn_up → ffn_up[:, 2*n_ff] for every block, then mark
    // the orphan ffn_gate tensors as skip so n_tensors lines up.
    {
        const int64_t n_blk_kid = gguf_find_key(meta, "glm4.block_count");
        const uint32_t n_blocks = n_blk_kid >= 0 ? gguf_get_val_u32(meta, n_blk_kid) : 16;
        for (uint32_t b = 0; b < n_blocks; ++b) {
            register_glm4_ffn_concat(meta, ctx, (int) b);
            char skip_pref[64];
            std::snprintf(skip_pref, sizeof(skip_pref), "blk.%u.ffn_gate.", b);
            add_skip_prefix(ml, skip_pref);
        }
    }

    // Hide embedded vision + projector tensors from the text loader.
    add_skip_prefix(ml, "v.");
    add_skip_prefix(ml, "mm.");
}

// =========================================================================
// gpt-oss (text only)
// =========================================================================
@@ -1137,6 +1248,7 @@ void translate_metadata(const llama_model_loader * ml,
    if (arch_name == "deepseekocr")    handle_deepseekocr   (ml, meta, ctx, arch_name);
    if (arch_name == "nemotron_h_moe") handle_nemotron_h_moe(ml, meta, ctx);
    if (arch_name == "llama4")         handle_llama4        (ml, meta, ctx);
    if (arch_name == "glmocr")         handle_glmocr        (ml, meta, ctx, arch_name);
    // Dispatch. Add more arches as they are wired up.
}
@@ -1198,4 +1310,28 @@ bool maybe_load_tensor(ggml_tensor * cur,
    return true;
}

namespace {
std::mutex g_loader_path_mutex;
std::unordered_map<const llama_model_loader *, std::string> g_loader_paths;
}

void set_loader_path(const llama_model_loader * ml, const char * fname) {
    std::lock_guard<std::mutex> lk(g_loader_path_mutex);
    g_loader_paths[ml] = fname ? fname : "";
}

bool maybe_load_text_tensor(const llama_model_loader * ml,
                            ggml_tensor * cur,
                            size_t file_offset,
                            ggml_backend_buffer_type_t buft) {
    std::string path;
    {
        std::lock_guard<std::mutex> lk(g_loader_path_mutex);
        auto it = g_loader_paths.find(ml);
        if (it == g_loader_paths.end() || it->second.empty()) return false;
        path = it->second;
    }
    return maybe_load_tensor(cur, path.c_str(), file_offset, buft);
}

} // namespace llama_ollama_compat
11 llama/compat/llama-ollama-compat.h (vendored)
@@ -60,4 +60,15 @@ bool maybe_load_tensor(ggml_tensor * cur,
                       size_t file_offset,
                       ggml_backend_buffer_type_t buft);

// Same as maybe_load_tensor but for the text-side llama_model_loader,
// which doesn't have the clip loader's `fname` in scope at the read
// site. Looks up the model's file path from a per-loader registry
// populated by `set_loader_path` (called from the model loader's
// constructor right after `fname` is in scope).
bool maybe_load_text_tensor(const llama_model_loader * ml,
                            ggml_tensor * cur,
                            size_t file_offset,
                            ggml_backend_buffer_type_t buft);
void set_loader_path(const llama_model_loader * ml, const char * fname);

} // namespace llama_ollama_compat
@@ -10,10 +10,11 @@ index 4e65a45a5..75836c683 100644
#include <algorithm>
#include <array>
-@@ -549,6 +550,7 @@ llama_model_loader::llama_model_loader(
+@@ -549,6 +550,8 @@ llama_model_loader::llama_model_loader(
    }

    get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
+   llama_ollama_compat::set_loader_path(this, fname.c_str());
+   llama_ollama_compat::translate_metadata(this, metadata, ctx, arch_name);
    llm_kv = LLM_KV(llm_arch_from_string(arch_name));

@@ -38,6 +39,13 @@ index 4e65a45a5..75836c683 100644
    // make sure there is no duplicated tensor names
    if (weights_map.find(tensor_name) != weights_map.end()) {
        throw std::runtime_error(format("invalid model: tensor '%s' is duplicated", ggml_get_name(cur)));
@@ -1535,3 +1543,6 @@ bool llama_model_loader::load_all_data(
        size_t n_size = ggml_nbytes(cur);

+       if (llama_ollama_compat::maybe_load_text_tensor(this, cur, weight->offs, cur->buffer ? ggml_backend_buffer_get_type(cur->buffer) : nullptr)) {
+           continue;
+       }
        if (use_mmap) {
diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index f0e8786b6..35defa89d 100644
--- a/tools/mtmd/clip.cpp