llama/compat: collapse text-loader hook back to one new patch line

The previous glm-ocr change added 4 lines to upstream-edits.patch
(1 in the constructor for set_loader_path + 3 in load_all_data for the
maybe_load_text_tensor if/continue/closing-brace block). Refactor down
to +1 line:

  * Fold path capture into translate_metadata: signature now takes
    `const char * fname` and stashes it on the per-loader registry
    internally. The constructor still has just one llama_ollama_compat
    call (we just changed the argument list); see the sketch after
    this list.
  * Make maybe_load_text_tensor self-contained: it derives the buffer
    type from `cur->buffer` rather than the caller passing it. The
    hook line in load_all_data collapses to a single
    `if (llama_ollama_compat::maybe_load_text_tensor(this, cur, weight->offs)) continue;`.
  * Drop the public `set_loader_path` symbol — it's now an internal
    detail of translate_metadata.

Net patch growth from glm-ocr support: +1 line, leaving the total
upstream patch just one line over its original 17-line surface.

Functional: glm-ocr text generation still works ("Paris" via raw
turn-template completion), concat ops still fire (28MB per block).
jmorganca 2026-04-19 17:23:50 -07:00
parent f1bd1a25ac
commit 4b5cf3420a
3 changed files with 29 additions and 29 deletions


@@ -17,6 +17,12 @@ using namespace llama_ollama_compat::detail; // pull detail:: helpers into scope
 namespace {
+// Per-loader file path registry — set by translate_metadata, read by
+// maybe_load_text_tensor so it can pass the path to load ops without a
+// separate patch insertion in the model loader's load_all_data path.
+std::mutex g_loader_path_mutex;
+std::unordered_map<const llama_model_loader *, std::string> g_loader_paths;
+
 // =========================================================================
 // gemma3 (text side)
 // =========================================================================
@@ -1297,8 +1303,13 @@ void handle_mistral3_clip(gguf_context * meta, ggml_context * ctx) {
 void translate_metadata(const llama_model_loader * ml,
                         gguf_context * meta,
                         ggml_context * ctx,
-                        std::string & arch_name) {
+                        std::string & arch_name,
+                        const char * fname) {
     if (!meta) return;
+    {
+        std::lock_guard<std::mutex> lk(g_loader_path_mutex);
+        g_loader_paths[ml] = fname ? fname : "";
+    }
     if (arch_name == "gemma3")    handle_gemma3   (ml, meta, ctx);
     if (arch_name == "gemma4")    handle_gemma4   (ml, meta, ctx);
     if (arch_name == "qwen35moe") handle_qwen35moe(ml, meta, ctx);
@@ -1375,20 +1386,9 @@ bool maybe_load_tensor(ggml_tensor * cur,
     return true;
 }
 
-namespace {
-std::mutex g_loader_path_mutex;
-std::unordered_map<const llama_model_loader *, std::string> g_loader_paths;
-}
-
-void set_loader_path(const llama_model_loader * ml, const char * fname) {
-    std::lock_guard<std::mutex> lk(g_loader_path_mutex);
-    g_loader_paths[ml] = fname ? fname : "";
-}
-
 bool maybe_load_text_tensor(const llama_model_loader * ml,
                             ggml_tensor * cur,
-                            size_t file_offset,
-                            ggml_backend_buffer_type_t buft) {
+                            size_t file_offset) {
     std::string path;
     {
         std::lock_guard<std::mutex> lk(g_loader_path_mutex);
@@ -1396,6 +1396,9 @@ bool maybe_load_text_tensor(const llama_model_loader * ml,
         if (it == g_loader_paths.end() || it->second.empty()) return false;
         path = it->second;
     }
+    ggml_backend_buffer_type_t buft = cur->buffer
+        ? ggml_backend_buffer_get_type(cur->buffer)
+        : nullptr;
     return maybe_load_tensor(cur, path.c_str(), file_offset, buft);
 }


@@ -35,10 +35,13 @@ struct llama_model_loader;
 
 namespace llama_ollama_compat {
 
 // Called from llama_model_loader's constructor, right after the arch is read.
+// `fname` is the model file path, captured here so later load-time hooks
+// (maybe_load_text_tensor) can read raw bytes from it.
 void translate_metadata(const llama_model_loader * ml,
                         gguf_context * meta,
                         ggml_context * ctx,
-                        std::string & arch_name);
+                        std::string & arch_name,
+                        const char * fname);
 
 // Called from llama_model_loader's weights_map population loop. Returns
 // true to drop a tensor from the loader — used to hide embedded vision
@@ -60,15 +63,12 @@ bool maybe_load_tensor(ggml_tensor * cur,
                       size_t file_offset,
                       ggml_backend_buffer_type_t buft);
 
-// Same as maybe_load_tensor but for the text-side llama_model_loader,
-// which doesn't have the clip loader's `fname` in scope at the read
-// site. Looks up the model's file path from a per-loader registry
-// populated by `set_loader_path` (called from the model loader's
-// constructor right after `fname` is in scope).
+// Text-side counterpart to maybe_load_tensor. Self-contained: looks up
+// the model file path from the per-loader registry populated by
+// translate_metadata, and derives the buffer type from cur->buffer
+// internally — keeps the call site (and the upstream patch) to one line.
 bool maybe_load_text_tensor(const llama_model_loader * ml,
                             ggml_tensor * cur,
-                            size_t file_offset,
-                            ggml_backend_buffer_type_t buft);
-void set_loader_path(const llama_model_loader * ml, const char * fname);
+                            size_t file_offset);
 
 } // namespace llama_ollama_compat


@@ -10,12 +10,11 @@ index 4e65a45a5..75836c683 100644
  #include <algorithm>
  #include <array>
-@@ -549,6 +550,8 @@ llama_model_loader::llama_model_loader(
+@@ -549,6 +550,7 @@ llama_model_loader::llama_model_loader(
      }
      get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
-+    llama_ollama_compat::set_loader_path(this, fname.c_str());
-+    llama_ollama_compat::translate_metadata(this, metadata, ctx, arch_name);
++    llama_ollama_compat::translate_metadata(this, metadata, ctx, arch_name, fname.c_str());
      llm_kv = LLM_KV(llm_arch_from_string(arch_name));
      files.emplace_back(new llama_file(fname.c_str(), "rb", use_direct_io));
@@ -39,12 +38,10 @@ index 4e65a45a5..75836c683 100644
      // make sure there is no duplicated tensor names
      if (weights_map.find(tensor_name) != weights_map.end()) {
          throw std::runtime_error(format("invalid model: tensor '%s' is duplicated", ggml_get_name(cur)));
-@@ -1535,3 +1543,6 @@ bool llama_model_loader::load_all_data(
+@@ -1535,3 +1542,4 @@ bool llama_model_loader::load_all_data(
      size_t n_size = ggml_nbytes(cur);
-+    if (llama_ollama_compat::maybe_load_text_tensor(this, cur, weight->offs, cur->buffer ? ggml_backend_buffer_get_type(cur->buffer) : nullptr)) {
-+        continue;
-+    }
++    if (llama_ollama_compat::maybe_load_text_tensor(this, cur, weight->offs)) continue;
      if (use_mmap) {
diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index f0e8786b6..35defa89d 100644