llama/compat: collapse text-loader hook back to one new patch line

The previous glm-ocr change added 4 lines to upstream-edits.patch
(1 in the constructor for set_loader_path + 3 in load_all_data for the
maybe_load_text_tensor if/continue/closing-brace block). Refactor down
to +1 line:

  * Fold path capture into translate_metadata: signature now takes
    `const char * fname` and stashes it on the per-loader registry
    internally. The constructor still has just one llama_ollama_compat
    call (we just changed the argument list); see the sketch after
    this list.
  * Make maybe_load_text_tensor self-contained: it derives the buffer
    type from `cur->buffer` rather than the caller passing it. The
    hook line in load_all_data collapses to a single
    `if (llama_ollama_compat::maybe_load_text_tensor(this, cur, weight->offs)) continue;`.
  * Drop the public `set_loader_path` symbol — it's now an internal
    detail of translate_metadata.

Net patch growth from glm-ocr support: +1 line, leaving the total
upstream patch just one line over its original 17-line surface.

Functional: glm-ocr text generation still works ("Paris" via raw
turn-template completion), concat ops still fire (28MB per block).
jmorganca 2026-04-19 17:23:50 -07:00
parent f1bd1a25ac
commit 4b5cf3420a
3 changed files with 29 additions and 29 deletions


@@ -17,6 +17,12 @@ using namespace llama_ollama_compat::detail; // pull detail:: helpers into scope
 namespace {
+// Per-loader file path registry — set by translate_metadata, read by
+// maybe_load_text_tensor so it can pass the path to load ops without a
+// separate patch insertion in the model loader's load_all_data path.
+std::mutex g_loader_path_mutex;
+std::unordered_map<const llama_model_loader *, std::string> g_loader_paths;
+
 // =========================================================================
 // gemma3 (text side)
 // =========================================================================
@@ -1297,8 +1303,13 @@ void handle_mistral3_clip(gguf_context * meta, ggml_context * ctx) {
 void translate_metadata(const llama_model_loader * ml,
                         gguf_context * meta,
                         ggml_context * ctx,
-                        std::string & arch_name) {
+                        std::string & arch_name,
+                        const char * fname) {
     if (!meta) return;
+    {
+        std::lock_guard<std::mutex> lk(g_loader_path_mutex);
+        g_loader_paths[ml] = fname ? fname : "";
+    }
     if (arch_name == "gemma3")    handle_gemma3   (ml, meta, ctx);
     if (arch_name == "gemma4")    handle_gemma4   (ml, meta, ctx);
     if (arch_name == "qwen35moe") handle_qwen35moe(ml, meta, ctx);
@@ -1375,20 +1386,9 @@ bool maybe_load_tensor(ggml_tensor * cur,
     return true;
 }
 
-namespace {
-std::mutex g_loader_path_mutex;
-std::unordered_map<const llama_model_loader *, std::string> g_loader_paths;
-}
-
-void set_loader_path(const llama_model_loader * ml, const char * fname) {
-    std::lock_guard<std::mutex> lk(g_loader_path_mutex);
-    g_loader_paths[ml] = fname ? fname : "";
-}
-
 bool maybe_load_text_tensor(const llama_model_loader * ml,
                             ggml_tensor * cur,
-                            size_t file_offset,
-                            ggml_backend_buffer_type_t buft) {
+                            size_t file_offset) {
     std::string path;
     {
         std::lock_guard<std::mutex> lk(g_loader_path_mutex);
@@ -1396,6 +1396,9 @@ bool maybe_load_text_tensor(const llama_model_loader * ml,
         if (it == g_loader_paths.end() || it->second.empty()) return false;
         path = it->second;
     }
+    ggml_backend_buffer_type_t buft = cur->buffer
+        ? ggml_backend_buffer_get_type(cur->buffer)
+        : nullptr;
     return maybe_load_tensor(cur, path.c_str(), file_offset, buft);
 }


@@ -35,10 +35,13 @@ struct llama_model_loader;
 
 namespace llama_ollama_compat {
 
 // Called from llama_model_loader's constructor, right after the arch is read.
+// `fname` is the model file path, captured here so later load-time hooks
+// (maybe_load_text_tensor) can read raw bytes from it.
 void translate_metadata(const llama_model_loader * ml,
                         gguf_context * meta,
                         ggml_context * ctx,
-                        std::string & arch_name);
+                        std::string & arch_name,
+                        const char * fname);
 
 // Called from llama_model_loader's weights_map population loop. Returns
 // true to drop a tensor from the loader — used to hide embedded vision
@@ -60,15 +63,12 @@ bool maybe_load_tensor(ggml_tensor * cur,
                       size_t file_offset,
                       ggml_backend_buffer_type_t buft);
 
-// Same as maybe_load_tensor but for the text-side llama_model_loader,
-// which doesn't have the clip loader's `fname` in scope at the read
-// site. Looks up the model's file path from a per-loader registry
-// populated by `set_loader_path` (called from the model loader's
-// constructor right after `fname` is in scope).
+// Text-side counterpart to maybe_load_tensor. Self-contained: looks up
+// the model file path from the per-loader registry populated by
+// translate_metadata, and derives the buffer type from cur->buffer
+// internally — keeps the call site (and the upstream patch) to one line.
 bool maybe_load_text_tensor(const llama_model_loader * ml,
                             ggml_tensor * cur,
-                            size_t file_offset,
-                            ggml_backend_buffer_type_t buft);
-void set_loader_path(const llama_model_loader * ml, const char * fname);
+                            size_t file_offset);
 
 } // namespace llama_ollama_compat


@@ -10,12 +10,11 @@ index 4e65a45a5..75836c683 100644
  #include <algorithm>
  #include <array>
-@@ -549,6 +550,8 @@ llama_model_loader::llama_model_loader(
+@@ -549,6 +550,7 @@ llama_model_loader::llama_model_loader(
      }
      get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
-+    llama_ollama_compat::set_loader_path(this, fname.c_str());
-+    llama_ollama_compat::translate_metadata(this, metadata, ctx, arch_name);
++    llama_ollama_compat::translate_metadata(this, metadata, ctx, arch_name, fname.c_str());
      llm_kv = LLM_KV(llm_arch_from_string(arch_name));
      files.emplace_back(new llama_file(fname.c_str(), "rb", use_direct_io));
@@ -39,12 +38,10 @@ index 4e65a45a5..75836c683 100644
      // make sure there is no duplicated tensor names
      if (weights_map.find(tensor_name) != weights_map.end()) {
          throw std::runtime_error(format("invalid model: tensor '%s' is duplicated", ggml_get_name(cur)));
-@@ -1535,3 +1543,6 @@ bool llama_model_loader::load_all_data(
+@@ -1535,3 +1542,4 @@ bool llama_model_loader::load_all_data(
      size_t n_size = ggml_nbytes(cur);
-+    if (llama_ollama_compat::maybe_load_text_tensor(this, cur, weight->offs, cur->buffer ? ggml_backend_buffer_get_type(cur->buffer) : nullptr)) {
-+        continue;
-+    }
++    if (llama_ollama_compat::maybe_load_text_tensor(this, cur, weight->offs)) continue;
      if (use_mmap) {
diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index f0e8786b6..35defa89d 100644