llama/compat: simplify shim (gemma3-tested)

A round of cleanup now that the first architecture has landed. No
behavioral change: text + vision still pass end-to-end on Ollama
gemma3:latest (4B), 1B, and 270m.

Removes:
  - apply_tensor_transforms() and the TransformSpec registry. It registered
    the +1 RMSNorm shift at one point, but empirically Ollama's blobs
    already have it baked in. YAGNI until an arch actually needs a
    tensor-data transform; the hook can come back data-driven (rough
    sketch after this list).
  - Corresponding llama-model.cpp patch hunk (include + post-load call).
  - copy_kv<> template plumbing. Direct copy_u32_kv / copy_f32_kv are
    shorter and more readable.
  - set_str wrapper (one-liner around gguf_set_val_str).
  - find_tensor helper (only used once; inlined as a loop).
  - convert_f16_to_f32 helper (one-line inner loop, inlined).
  - set_f32_if_missing wrapper (inlined at 3 call sites).
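
A rough sketch of the shape a data-driven transform hook could take if an
arch ever needs one again. Illustrative only, not code in this commit; the
struct and names below are made up:

    struct TensorTransform {
        const char * name_suffix;               // which tensors it applies to
        void (*apply)(float * data, size_t n);  // in-place edit of F32 data
    };

    // e.g. the +1 RMSNorm shift that gemma3 turned out not to need
    static void add_one(float * d, size_t n) {
        for (size_t i = 0; i < n; ++i) d[i] += 1.0f;
    }

    static const TensorTransform kExampleTransforms[] = {
        { "_norm.weight", add_one },
    };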

Tightens:
  - Clip tensor renames are now a table (kGemma3ClipRenames) iterated by
    handle_gemma3_clip. Adding a rename is one row (see the sketch after
    this list).
  - translate_clip_metadata now reuses detect_ollama_gemma3 for its
    "is this an Ollama blob?" check instead of a separate ad-hoc check.

Net:
  llama-ollama-compat.cpp   488 -> 343 lines (-30%)
  llama-ollama-compat.h      83 ->  63 lines (-24%)
  upstream-edits.patch       20 ->  16 lines of real edits, 3 -> 2 files
jmorganca 2026-04-19 12:05:11 -07:00
parent 61b367ec29
commit 36049361cd
3 changed files with 233 additions and 434 deletions

@@ -7,9 +7,8 @@
#include "llama-model-loader.h"
#include <cstdint>
#include <cstring>
#include <cstdio>
#include <functional>
#include <cstring>
#include <mutex>
#include <string>
#include <unordered_map>
@@ -17,120 +16,148 @@
#include <vector>
namespace llama_ollama_compat {
namespace {
// ---- helpers -------------------------------------------------------------
// -------------------------------------------------------------------------
// tiny gguf_context helpers
// -------------------------------------------------------------------------
bool has_key(const gguf_context * meta, const char * key) {
return gguf_find_key(meta, key) >= 0;
}
void set_f32_if_missing(gguf_context * meta, const char * key, float value) {
if (!has_key(meta, key)) {
gguf_set_val_f32(meta, key, value);
}
}
bool any_tensor_with_prefix(const ggml_context * ctx, const char * prefix) {
const size_t plen = std::strlen(prefix);
for (ggml_tensor * t = ggml_get_first_tensor(ctx); t; t = ggml_get_next_tensor(ctx, t)) {
if (std::strncmp(ggml_get_name(t), prefix, plen) == 0) {
return true;
}
if (std::strncmp(ggml_get_name(t), prefix, plen) == 0) return true;
}
return false;
}
const ggml_tensor * find_tensor(const ggml_context * ctx, const char * name) {
for (ggml_tensor * t = ggml_get_first_tensor(ctx); t; t = ggml_get_next_tensor(ctx, t)) {
if (std::strcmp(ggml_get_name(t), name) == 0) return t;
}
return nullptr;
// Copy a uint32 KV from src to dst if src exists and dst doesn't.
void copy_u32_kv(gguf_context * meta, const char * src, const char * dst) {
if (has_key(meta, dst)) return;
const int64_t k = gguf_find_key(meta, src);
if (k < 0) return;
gguf_set_val_u32(meta, dst, gguf_get_val_u32(meta, k));
}
// Truncate a string-typed KV array to `new_n` entries. No-op if absent or
// already that size or smaller.
// Copy a float32 KV from src to dst if src exists and dst doesn't.
void copy_f32_kv(gguf_context * meta, const char * src, const char * dst) {
if (has_key(meta, dst)) return;
const int64_t k = gguf_find_key(meta, src);
if (k < 0) return;
gguf_set_val_f32(meta, dst, gguf_get_val_f32(meta, k));
}
// Truncate a string-typed KV array to `new_n` entries.
void truncate_str_arr(gguf_context * meta, const char * key, size_t new_n) {
const int64_t kid = gguf_find_key(meta, key);
if (kid < 0) return;
const size_t cur_n = gguf_get_arr_n(meta, kid);
if (new_n >= cur_n) return;
if (kid < 0 || new_n >= gguf_get_arr_n(meta, kid)) return;
std::vector<std::string> owned;
owned.reserve(new_n);
std::vector<const char *> ptrs;
ptrs.reserve(new_n);
for (size_t i = 0; i < new_n; ++i) {
owned.emplace_back(gguf_get_arr_str(meta, kid, i));
}
for (size_t i = 0; i < new_n; ++i) owned.emplace_back(gguf_get_arr_str(meta, kid, i));
for (const auto & s : owned) ptrs.push_back(s.c_str());
gguf_set_arr_str(meta, key, ptrs.data(), new_n);
}
// Truncate a primitive-typed KV array to `new_n` entries.
void truncate_data_arr(gguf_context * meta, const char * key, gguf_type elem_type, size_t elem_size, size_t new_n) {
void truncate_data_arr(gguf_context * meta, const char * key,
gguf_type elem_type, size_t elem_size, size_t new_n) {
const int64_t kid = gguf_find_key(meta, key);
if (kid < 0) return;
const size_t cur_n = gguf_get_arr_n(meta, kid);
if (new_n >= cur_n) return;
if (kid < 0 || new_n >= gguf_get_arr_n(meta, kid)) return;
const void * data = gguf_get_arr_data(meta, kid);
std::vector<uint8_t> copy(elem_size * new_n);
std::memcpy(copy.data(), data, elem_size * new_n);
std::memcpy(copy.data(), gguf_get_arr_data(meta, kid), elem_size * new_n);
gguf_set_arr_data(meta, key, elem_type, copy.data(), new_n);
}
// ---- per-loader state (skip lists + tensor transforms) -------------------
// Rename a tensor in BOTH the gguf_context and the ggml_context so that all
// name-based lookups agree. gguf_get_tensor_name returns a pointer into a
// mutable `char[GGML_MAX_NAME]` inside a std::vector element; the const on
// the return type is API courtesy, so writing through const_cast is defined.
void rename_tensor(gguf_context * meta, ggml_context * ctx,
const char * old_name, const char * new_name) {
const int64_t id = gguf_find_tensor(meta, old_name);
if (id < 0) return;
if (char * p = const_cast<char *>(gguf_get_tensor_name(meta, id))) {
std::strncpy(p, new_name, GGML_MAX_NAME - 1);
p[GGML_MAX_NAME - 1] = '\0';
}
if (ggml_tensor * t = ggml_get_tensor(ctx, old_name)) ggml_set_name(t, new_name);
}
struct TransformSpec {
std::function<bool(const std::string &)> matches;
std::function<void(void *, size_t, ggml_type)> apply;
const char * description;
};
// Rename every tensor whose name contains `needle` (covers `.weight` + `.bias`).
void rename_tensors_containing(gguf_context * meta, ggml_context * ctx,
const char * needle, const char * replacement) {
std::vector<std::pair<std::string, std::string>> renames;
const int64_t n = gguf_get_n_tensors(meta);
const size_t needle_len = std::strlen(needle);
for (int64_t i = 0; i < n; ++i) {
std::string s(gguf_get_tensor_name(meta, i));
const size_t pos = s.find(needle);
if (pos == std::string::npos) continue;
std::string ns = s;
ns.replace(pos, needle_len, replacement);
renames.emplace_back(std::move(s), std::move(ns));
}
for (const auto & [from, to] : renames) rename_tensor(meta, ctx, from.c_str(), to.c_str());
}
struct LoaderState {
std::vector<TransformSpec> transforms;
std::vector<std::string> skip_prefixes;
};
// -------------------------------------------------------------------------
// per-loader state (currently just the "drop these tensor prefixes" list)
// -------------------------------------------------------------------------
std::mutex g_registry_mutex;
std::unordered_map<const llama_model_loader *, LoaderState> g_registry;
std::unordered_map<const llama_model_loader *, std::vector<std::string>> g_skip_prefixes;
void add_skip_prefix(const llama_model_loader * ml, std::string prefix) {
std::lock_guard<std::mutex> lk(g_registry_mutex);
g_registry[ml].skip_prefixes.push_back(std::move(prefix));
g_skip_prefixes[ml].push_back(std::move(prefix));
}
// ---- gemma3 --------------------------------------------------------------
// -------------------------------------------------------------------------
// F16 -> F32 tensor promotion (needed for Metal IM2COL on gemma3 conv weights)
// -------------------------------------------------------------------------
// Returns true if this looks like an Ollama-format gemma3 blob. We collect
// several independent markers because different Ollama converter versions
// produced different quirks (the 4B has embedded vision, the 1B has
// non-standard rope key names, etc.) — any one marker flips detection on.
std::mutex g_promote_mutex;
std::unordered_set<std::string> g_promote_f16_to_f32;
// Set a tensor's type + strides in a ggml_context. The companion to this is
// the `maybe_load_tensor` read hook, which converts F16 bytes from disk into
// the newly-wider F32 buffer at load time.
void promote_tensor_to_f32(ggml_context * ctx, const char * name) {
ggml_tensor * t = ggml_get_tensor(ctx, name);
if (!t) return;
t->type = GGML_TYPE_F32;
t->nb[0] = ggml_type_size(GGML_TYPE_F32);
t->nb[1] = t->nb[0] * (t->ne[0] / ggml_blck_size(GGML_TYPE_F32));
for (int i = 2; i < GGML_MAX_DIMS; ++i) t->nb[i] = t->nb[i - 1] * t->ne[i - 1];
std::lock_guard<std::mutex> lk(g_promote_mutex);
g_promote_f16_to_f32.insert(name);
}
// -------------------------------------------------------------------------
// gemma3 (text side)
// -------------------------------------------------------------------------
// Returns true if this looks like an Ollama-format gemma3 blob. Different
// Ollama converter versions produced different quirks (4B/12B/27B have
// embedded vision + mm KVs; 1B uses non-standard rope key names; all of
// them omit layer_norm_rms_epsilon). Any single marker trips detection.
bool detect_ollama_gemma3(const gguf_context * meta, const ggml_context * ctx) {
// Vision-capable gemma3 (4B/12B/27B): Ollama writes this key.
if (has_key(meta, "gemma3.mm.tokens_per_image")) return true;
// Embedded vision tensors in the main file. Upstream stores vision in
// a separate mmproj file.
if (any_tensor_with_prefix(ctx, "v.") ||
any_tensor_with_prefix(ctx, "mm.")) return true;
// Non-standard rope key names. Ollama's 1B converter used
// `gemma3.rope.{global,local}.freq_base` instead of upstream's flat
// `gemma3.rope.freq_base` / `gemma3.rope.freq_base_swa`.
if (has_key(meta, "gemma3.rope.global.freq_base")) return true;
if (has_key(meta, "gemma3.rope.local.freq_base")) return true;
// Tokenizer KVs Ollama writes but upstream doesn't.
if (has_key(meta, "tokenizer.ggml.add_padding_token")) return true;
if (has_key(meta, "tokenizer.ggml.add_unknown_token")) return true;
// Required KV upstream always writes — its absence is a strong marker.
if (!has_key(meta, "gemma3.attention.layer_norm_rms_epsilon")) return true;
return false;
return has_key(meta, "gemma3.mm.tokens_per_image")
|| any_tensor_with_prefix(ctx, "v.")
|| any_tensor_with_prefix(ctx, "mm.")
|| has_key(meta, "gemma3.rope.global.freq_base")
|| has_key(meta, "gemma3.rope.local.freq_base")
|| has_key(meta, "tokenizer.ggml.add_padding_token")
|| has_key(meta, "tokenizer.ggml.add_unknown_token")
|| !has_key(meta, "gemma3.attention.layer_norm_rms_epsilon");
}
void handle_gemma3(const llama_model_loader * ml, gguf_context * meta, ggml_context * ctx) {
@@ -138,211 +165,75 @@ void handle_gemma3(const llama_model_loader * ml, gguf_context * meta, ggml_cont
LLAMA_LOG_INFO("%s: detected Ollama-format gemma3 GGUF; applying compatibility fixes\n", __func__);
// 1. Inject required KVs that Ollama's old converter omitted. Defaults
// are the gemma3 standard values; only injected if missing, so explicit
// values in a file take precedence.
//
// Some older Ollama converters also used the non-standard keys
// `gemma3.rope.global.freq_base` and `gemma3.rope.local.freq_base`.
// llama.cpp reads only the flat names, so copy those over first so
// the has_key checks below don't trample real values.
if (!has_key(meta, "gemma3.rope.freq_base")) {
const int64_t k = gguf_find_key(meta, "gemma3.rope.global.freq_base");
if (k >= 0) {
gguf_set_val_f32(meta, "gemma3.rope.freq_base", gguf_get_val_f32(meta, k));
}
// Old Ollama converters sometimes used nested rope key names. Copy
// them to the flat names upstream expects. Copy-if-missing order
// matters: we want real values to take priority over injected defaults.
copy_f32_kv(meta, "gemma3.rope.global.freq_base", "gemma3.rope.freq_base");
copy_f32_kv(meta, "gemma3.rope.local.freq_base", "gemma3.rope.freq_base_swa");
// Inject required KVs with their standard gemma3 defaults (no-op if
// already present).
if (!has_key(meta, "gemma3.attention.layer_norm_rms_epsilon"))
gguf_set_val_f32(meta, "gemma3.attention.layer_norm_rms_epsilon", 1e-6f);
if (!has_key(meta, "gemma3.rope.freq_base"))
gguf_set_val_f32(meta, "gemma3.rope.freq_base", 1000000.0f);
if (!has_key(meta, "gemma3.rope.freq_base_swa"))
gguf_set_val_f32(meta, "gemma3.rope.freq_base_swa", 10000.0f);
// Gemma3 4B/12B/27B ship with {type: "linear", factor: 8.0} rope scaling
// in their HF config to extend the 16k trained context to 131072. Ollama's
// old converter didn't write these. The 1B has no scaling — detect by
// context length.
int64_t ctx_key = gguf_find_key(meta, "gemma3.context_length");
if (ctx_key >= 0 && gguf_get_val_u32(meta, ctx_key) >= 131072
&& !has_key(meta, "gemma3.rope.scaling.factor")) {
gguf_set_val_str(meta, "gemma3.rope.scaling.type", "linear");
gguf_set_val_f32(meta, "gemma3.rope.scaling.factor", 8.0f);
}
if (!has_key(meta, "gemma3.rope.freq_base_swa")) {
const int64_t k = gguf_find_key(meta, "gemma3.rope.local.freq_base");
if (k >= 0) {
gguf_set_val_f32(meta, "gemma3.rope.freq_base_swa", gguf_get_val_f32(meta, k));
// Tokenizer vocab size vs embedding rows mismatch: Ollama leaves extra
// multimodal tokens (e.g. <image_soft_token>) in the tokenizer arrays.
// Truncate to match token_embd rows so llama.cpp's dim check passes.
for (ggml_tensor * t = ggml_get_first_tensor(ctx); t; t = ggml_get_next_tensor(ctx, t)) {
if (std::strcmp(ggml_get_name(t), "token_embd.weight") == 0) {
const size_t rows = t->ne[1]; // shape is [n_embd, n_vocab]
truncate_str_arr (meta, "tokenizer.ggml.tokens", rows);
truncate_data_arr(meta, "tokenizer.ggml.scores", GGUF_TYPE_FLOAT32, sizeof(float), rows);
truncate_data_arr(meta, "tokenizer.ggml.token_type", GGUF_TYPE_INT32, sizeof(int32_t), rows);
break;
}
}
set_f32_if_missing(meta, "gemma3.attention.layer_norm_rms_epsilon", 1e-6f);
set_f32_if_missing(meta, "gemma3.rope.freq_base", 1000000.0f);
set_f32_if_missing(meta, "gemma3.rope.freq_base_swa", 10000.0f);
// RoPE linear scaling: gemma3 4B/12B/27B ship with
// rope_scaling = { type: "linear", factor: 8.0 }
// in their HF config. This extends the native ~16k trained context to
// the declared 131072 token context. Ollama's old converter didn't
// write these KVs; without them llama.cpp uses factor=1.0 which makes
// all positional embeddings subtly wrong (coherent but off-distribution
// output). The 1B variant has no rope_scaling — detect by context
// length.
{
const int64_t ctx_key = gguf_find_key(meta, "gemma3.context_length");
const uint32_t ctx_len = ctx_key >= 0 ? gguf_get_val_u32(meta, ctx_key) : 0;
if (ctx_len >= 131072 && !has_key(meta, "gemma3.rope.scaling.factor")) {
gguf_set_val_str(meta, "gemma3.rope.scaling.type", "linear");
gguf_set_val_f32(meta, "gemma3.rope.scaling.factor", 8.0f);
}
}
// 2. Tokenizer vocab size vs. embedding dim mismatch. Ollama's old
// converter leaves special/multimodal tokens (e.g. <image_soft_token>)
// in the tokenizer arrays even though the embedding matrix doesn't
// cover them. Truncate the tokenizer to match the embedding rows.
if (const ggml_tensor * tok = find_tensor(ctx, "token_embd.weight")) {
const size_t embd_rows = tok->ne[1]; // shape is [n_embd, n_vocab]
truncate_str_arr (meta, "tokenizer.ggml.tokens", embd_rows);
truncate_data_arr(meta, "tokenizer.ggml.scores", GGUF_TYPE_FLOAT32, sizeof(float), embd_rows);
truncate_data_arr(meta, "tokenizer.ggml.token_type", GGUF_TYPE_INT32, sizeof(int32_t), embd_rows);
}
// 3. Drop embedded vision/projector tensors from the text loader.
// Ollama's Go wrapper extracts them to a sidecar mmproj file before
// passing --mmproj to llama-server.
// Hide embedded vision tensors from the text loader. Ollama's Go side
// re-passes the same blob as --mmproj so the clip loader picks them up.
add_skip_prefix(ml, "v.");
add_skip_prefix(ml, "mm.");
// Note: no RMSNorm weight shift is required. Ollama's published gemma3
// blobs already have the +1 shift baked in at conversion time — same as
// upstream llama.cpp's convert_hf_to_gguf.py.
}
} // anonymous namespace
void translate_metadata(const llama_model_loader * ml,
gguf_context * meta,
ggml_context * ctx,
std::string & arch_name) {
if (!meta) return;
if (arch_name == "gemma3") {
handle_gemma3(ml, meta, ctx);
}
// Dispatch. Add more arches as they are wired up.
// Note: no RMSNorm weight shift needed. Ollama's published gemma3 blobs
// already have the +1 shift baked in, same as upstream's convert_hf.
}
// -------------------------------------------------------------------------
// Clip-side (mmproj) translation
// gemma3 (clip side)
// -------------------------------------------------------------------------
namespace {
// Rename a tensor in BOTH the gguf_context and the ggml_context so that all
// name-based lookups — offset map, ggml_get_tensor, tensor.name — agree.
//
// The gguf_context side is a bit sneaky: gguf_get_tensor_name returns a
// pointer into the embedded ggml_tensor's `name[GGML_MAX_NAME]` buffer.
// That buffer is non-const storage inside a std::vector element; the const
// on the return type is just API hygiene. Casting it away and strncpy'ing
// a new name is well-defined and avoids needing to patch gguf's internals.
void rename_tensor(gguf_context * meta, ggml_context * ctx,
const char * old_name, const char * new_name) {
const int64_t id = gguf_find_tensor(meta, old_name);
if (id < 0) return;
// Update the gguf-side name (what gguf_get_tensor_name returns later).
if (char * name_ptr = const_cast<char *>(gguf_get_tensor_name(meta, id))) {
std::strncpy(name_ptr, new_name, GGML_MAX_NAME - 1);
name_ptr[GGML_MAX_NAME - 1] = '\0';
}
// Update the ggml-side name (what ggml_get_tensor looks up by).
if (ggml_tensor * t = ggml_get_tensor(ctx, old_name)) {
ggml_set_name(t, new_name);
}
}
// Rename every tensor whose name contains `needle` by replacing that
// substring with `replacement`. Applies to both `.weight` and `.bias`.
void rename_tensors_containing(gguf_context * meta, ggml_context * ctx,
const char * needle, const char * replacement) {
// Collect names first — renaming while iterating would shift indices.
std::vector<std::string> renames; // old -> new
const int64_t n = gguf_get_n_tensors(meta);
for (int64_t i = 0; i < n; ++i) {
const char * name = gguf_get_tensor_name(meta, i);
std::string s(name);
size_t pos = s.find(needle);
if (pos == std::string::npos) continue;
std::string new_s = s;
new_s.replace(pos, std::strlen(needle), replacement);
renames.push_back(s);
renames.push_back(std::move(new_s));
}
for (size_t i = 0; i + 1 < renames.size(); i += 2) {
rename_tensor(meta, ctx, renames[i].c_str(), renames[i + 1].c_str());
}
}
// Copy a KV from src_key to dst_key if src_key exists and dst_key doesn't.
template <typename Getter, typename Setter>
bool copy_kv(gguf_context * meta, const char * src_key, const char * dst_key,
Getter get, Setter set) {
if (has_key(meta, dst_key)) return true; // already set, keep explicit values
const int64_t kid = gguf_find_key(meta, src_key);
if (kid < 0) return false;
set(meta, dst_key, get(meta, kid));
return true;
}
void copy_u32_kv(gguf_context * meta, const char * src_key, const char * dst_key) {
copy_kv(meta, src_key, dst_key,
gguf_get_val_u32,
[](gguf_context * m, const char * k, uint32_t v){ gguf_set_val_u32(m, k, v); });
}
void copy_f32_kv(gguf_context * meta, const char * src_key, const char * dst_key) {
copy_kv(meta, src_key, dst_key,
gguf_get_val_f32,
[](gguf_context * m, const char * k, float v){ gguf_set_val_f32(m, k, v); });
}
void set_str(gguf_context * meta, const char * key, const char * value) {
gguf_set_val_str(meta, key, value);
}
// Tensors marked for F16→F32 promotion. Looked up by tensor name.
// Populated by handle_gemma3_clip; consumed by supply_promoted_tensor_data.
std::mutex g_promote_mutex;
std::unordered_set<std::string> g_promote_f16_to_f32;
void mark_promote_f16_to_f32(const std::string & name) {
std::lock_guard<std::mutex> lk(g_promote_mutex);
g_promote_f16_to_f32.insert(name);
}
// Change a tensor's type in the ggml_context. Updates type and strides so
// that ggml_nbytes(t) returns the new-type size, and ggml_dup_tensor
// propagates the new type to any copies.
void set_tensor_type_in_ctx(ggml_context * ctx, const char * name, ggml_type new_type) {
ggml_tensor * t = ggml_get_tensor(ctx, name);
if (!t) return;
t->type = new_type;
t->nb[0] = ggml_type_size(new_type);
t->nb[1] = t->nb[0] * (t->ne[0] / ggml_blck_size(new_type));
for (int i = 2; i < GGML_MAX_DIMS; ++i) {
t->nb[i] = t->nb[i - 1] * t->ne[i - 1];
}
}
// Promote a tensor's type in both gguf_context and ggml_context. Used for
// F16→F32 conversion of conv weights that Metal requires as F32.
void promote_tensor_to_f32(gguf_context * meta, ggml_context * ctx, const char * name) {
// Update ggml_context (clip.cpp reads type from here via ggml_dup_tensor).
set_tensor_type_in_ctx(ctx, name, GGML_TYPE_F32);
// Note: we do NOT call gguf_set_tensor_type on `meta`, because that
// recomputes tensor data offsets based on the new type — but we still
// have F16 bytes at the original offset. clip.cpp reads the offset from
// its own tensor_offset map (populated from gguf_context BEFORE this
// promotion), so leaving meta's offset alone preserves the correct
// source location. We also don't use meta's type for sizing.
mark_promote_f16_to_f32(name);
}
// Convert F16 → F32 in place.
void convert_f16_to_f32(const uint16_t * src, float * dst, size_t n) {
for (size_t i = 0; i < n; ++i) {
dst[i] = ggml_fp16_to_fp32(src[i]);
}
}
// Ollama -> upstream tensor-name renames. Applied via substring match, so
// both `.weight` and `.bias` variants are covered with one entry each.
constexpr std::pair<const char *, const char *> kGemma3ClipRenames[] = {
{"v.patch_embedding", "v.patch_embd"},
{"v.position_embedding", "v.position_embd"},
{"v.post_layernorm", "v.post_ln"},
{".layer_norm1", ".ln1"},
{".layer_norm2", ".ln2"},
{".attn_output", ".attn_out"},
{".mlp.fc1", ".ffn_down"},
{".mlp.fc2", ".ffn_up"},
{"mm.mm_input_projection", "mm.input_projection"},
{"mm.mm_soft_emb_norm", "mm.soft_emb_norm"},
};
void handle_gemma3_clip(gguf_context * meta, ggml_context * ctx) {
// Build clip.* KVs from the gemma3.vision.* KVs already in the file.
// Synthesize clip.vision.* from gemma3.vision.* (same values, different key).
copy_u32_kv(meta, "gemma3.vision.block_count", "clip.vision.block_count");
copy_u32_kv(meta, "gemma3.vision.embedding_length", "clip.vision.embedding_length");
copy_u32_kv(meta, "gemma3.vision.feed_forward_length", "clip.vision.feed_forward_length");
@@ -350,11 +241,10 @@ void handle_gemma3_clip(gguf_context * meta, ggml_context * ctx) {
copy_u32_kv(meta, "gemma3.vision.patch_size", "clip.vision.patch_size");
copy_u32_kv(meta, "gemma3.vision.attention.head_count", "clip.vision.attention.head_count");
copy_f32_kv(meta, "gemma3.vision.attention.layer_norm_epsilon", "clip.vision.attention.layer_norm_epsilon");
// projection_dim is the TEXT model's embedding_length (the mmproj
// output dim == language model input dim).
// projection_dim = text model's embedding_length (mmproj out == LM in).
copy_u32_kv(meta, "gemma3.embedding_length", "clip.vision.projection_dim");
// image_mean / image_std — constant defaults for gemma3 vision.
// image_mean / image_std are constants for gemma3 vision.
if (!has_key(meta, "clip.vision.image_mean")) {
const float mean[3] = {0.5f, 0.5f, 0.5f};
gguf_set_arr_data(meta, "clip.vision.image_mean", GGUF_TYPE_FLOAT32, mean, 3);
@@ -364,63 +254,64 @@ void handle_gemma3_clip(gguf_context * meta, ggml_context * ctx) {
gguf_set_arr_data(meta, "clip.vision.image_std", GGUF_TYPE_FLOAT32, std_, 3);
}
// Top-level clip flags.
if (!has_key(meta, "clip.has_vision_encoder")) {
gguf_set_val_bool(meta, "clip.has_vision_encoder", true);
}
if (!has_key(meta, "clip.use_gelu")) {
gguf_set_val_bool(meta, "clip.use_gelu", true);
}
set_str(meta, "clip.projector_type", "gemma3");
set_str(meta, "general.architecture", "clip");
if (!has_key(meta, "clip.has_vision_encoder")) gguf_set_val_bool(meta, "clip.has_vision_encoder", true);
if (!has_key(meta, "clip.use_gelu")) gguf_set_val_bool(meta, "clip.use_gelu", true);
gguf_set_val_str(meta, "clip.projector_type", "gemma3");
gguf_set_val_str(meta, "general.architecture", "clip");
// Tensor name translation (Ollama -> upstream mtmd convention).
rename_tensors_containing(meta, ctx, "v.patch_embedding", "v.patch_embd");
rename_tensors_containing(meta, ctx, "v.position_embedding", "v.position_embd");
rename_tensors_containing(meta, ctx, "v.post_layernorm", "v.post_ln");
rename_tensors_containing(meta, ctx, ".layer_norm1", ".ln1");
rename_tensors_containing(meta, ctx, ".layer_norm2", ".ln2");
rename_tensors_containing(meta, ctx, ".attn_output", ".attn_out");
rename_tensors_containing(meta, ctx, ".mlp.fc1", ".ffn_down");
rename_tensors_containing(meta, ctx, ".mlp.fc2", ".ffn_up");
rename_tensors_containing(meta, ctx, "mm.mm_input_projection", "mm.input_projection");
rename_tensors_containing(meta, ctx, "mm.mm_soft_emb_norm", "mm.soft_emb_norm");
for (const auto & [from, to] : kGemma3ClipRenames) {
rename_tensors_containing(meta, ctx, from, to);
}
// Promote F16 patch-embed / position-embed to F32. Upstream stores these
// as F32 (see Gemma3VisionModel.tensor_force_quant in convert_hf_to_gguf.py).
// Metal's IM2COL op requires F32 for these convolution inputs.
promote_tensor_to_f32(meta, ctx, "v.patch_embd.weight");
promote_tensor_to_f32(meta, ctx, "v.position_embd.weight");
// Upstream stores patch_embd/position_embd as F32 (Gemma3VisionModel
// tensor_force_quant); Ollama stored F16. Metal's IM2COL convolution
// requires F32, so promote both at load time.
promote_tensor_to_f32(ctx, "v.patch_embd.weight");
promote_tensor_to_f32(ctx, "v.position_embd.weight");
}
} // anonymous namespace
// -------------------------------------------------------------------------
// public entry points
// -------------------------------------------------------------------------
void translate_metadata(const llama_model_loader * ml,
gguf_context * meta,
ggml_context * ctx,
std::string & arch_name) {
if (!meta) return;
if (arch_name == "gemma3") handle_gemma3(ml, meta, ctx);
// Dispatch. Add more arches as they are wired up.
}
void translate_clip_metadata(gguf_context * meta, ggml_context * ctx) {
if (!meta) return;
// Detection: Ollama-format gemma3 blob has `gemma3.mm.tokens_per_image`
// plus embedded `v.*` tensors. Upstream mmproj files use `general.architecture=clip`
// and don't have gemma3.* KVs.
if (has_key(meta, "gemma3.mm.tokens_per_image") &&
any_tensor_with_prefix(ctx, "v.")) {
// Require both the gemma3 markers AND embedded vision tensors to fire.
if (detect_ollama_gemma3(meta, ctx) && any_tensor_with_prefix(ctx, "v.")) {
LLAMA_LOG_INFO("%s: detected Ollama-format gemma3 GGUF used as mmproj; translating\n", __func__);
handle_gemma3_clip(meta, ctx);
}
}
bool should_skip_tensor(const llama_model_loader * ml, const char * tensor_name) {
std::lock_guard<std::mutex> lk(g_registry_mutex);
auto it = g_skip_prefixes.find(ml);
if (it == g_skip_prefixes.end()) return false;
for (const auto & prefix : it->second) {
if (std::strncmp(tensor_name, prefix.c_str(), prefix.size()) == 0) return true;
}
return false;
}
bool maybe_load_tensor(ggml_tensor * cur,
const char * source_file,
size_t file_offset,
ggml_backend_buffer_type_t buft) {
// Check registry: is this tensor marked for F16→F32 promotion?
{
std::lock_guard<std::mutex> lk(g_promote_mutex);
if (g_promote_f16_to_f32.find(ggml_get_name(cur)) == g_promote_f16_to_f32.end()) {
return false;
}
if (g_promote_f16_to_f32.find(ggml_get_name(cur)) == g_promote_f16_to_f32.end()) return false;
}
// Destination was promoted to F32 by translate_clip_metadata. Source
// bytes on disk are still F16 at file_offset.
if (cur->type != GGML_TYPE_F32) return false;
const size_t n_elem = ggml_nelements(cur);
@@ -428,76 +319,25 @@ bool maybe_load_tensor(ggml_tensor * cur,
const size_t dst_size = n_elem * sizeof(float);
std::vector<uint8_t> src(src_size);
FILE * f = std::fopen(source_file, "rb");
if (!f) {
LLAMA_LOG_ERROR("%s: failed to open '%s'\n", __func__, source_file);
return false;
}
if (std::fseek(f, (long) file_offset, SEEK_SET) != 0 ||
std::fread(src.data(), 1, src_size, f) != src_size) {
std::fclose(f);
LLAMA_LOG_ERROR("%s: failed to read %zu bytes for '%s'\n",
__func__, src_size, ggml_get_name(cur));
if (!f || std::fseek(f, (long) file_offset, SEEK_SET) != 0
|| std::fread(src.data(), 1, src_size, f) != src_size) {
if (f) std::fclose(f);
LLAMA_LOG_ERROR("%s: failed to read F16 bytes for '%s'\n", __func__, ggml_get_name(cur));
return false;
}
std::fclose(f);
std::vector<uint8_t> dst(dst_size);
convert_f16_to_f32(reinterpret_cast<const uint16_t *>(src.data()),
reinterpret_cast<float *>(dst.data()),
n_elem);
const uint16_t * sp = reinterpret_cast<const uint16_t *>(src.data());
float * dp = reinterpret_cast<float *>(dst.data());
for (size_t i = 0; i < n_elem; ++i) dp[i] = ggml_fp16_to_fp32(sp[i]);
// Deliver the converted bytes to the tensor's final backend buffer.
if (ggml_backend_buft_is_host(buft)) {
std::memcpy(cur->data, dst.data(), dst_size);
} else {
ggml_backend_tensor_set(cur, dst.data(), 0, dst_size);
}
if (ggml_backend_buft_is_host(buft)) std::memcpy(cur->data, dst.data(), dst_size);
else ggml_backend_tensor_set(cur, dst.data(), 0, dst_size);
LLAMA_LOG_INFO("%s: promoted F16->F32 for %s (%zu elems)\n",
__func__, ggml_get_name(cur), n_elem);
LLAMA_LOG_INFO("%s: promoted F16->F32 for %s (%zu elems)\n", __func__, ggml_get_name(cur), n_elem);
return true;
}
bool should_skip_tensor(const llama_model_loader * ml, const char * tensor_name) {
std::lock_guard<std::mutex> lk(g_registry_mutex);
auto it = g_registry.find(ml);
if (it == g_registry.end()) return false;
for (const auto & prefix : it->second.skip_prefixes) {
if (std::strncmp(tensor_name, prefix.c_str(), prefix.size()) == 0) {
return true;
}
}
return false;
}
void apply_tensor_transforms(const llama_model_loader * ml, ggml_context * ctx) {
std::vector<TransformSpec> specs;
{
std::lock_guard<std::mutex> lk(g_registry_mutex);
auto it = g_registry.find(ml);
if (it == g_registry.end()) return;
specs = it->second.transforms;
}
if (specs.empty()) return;
std::vector<uint8_t> buf;
for (ggml_tensor * t = ggml_get_first_tensor(ctx); t; t = ggml_get_next_tensor(ctx, t)) {
if (!t->buffer) continue;
const std::string name = ggml_get_name(t);
for (const auto & spec : specs) {
if (!spec.matches(name)) continue;
const size_t nbytes = ggml_nbytes(t);
const size_t n_elem = ggml_nelements(t);
buf.resize(nbytes);
ggml_backend_tensor_get(t, buf.data(), 0, nbytes);
spec.apply(buf.data(), n_elem, t->type);
ggml_backend_tensor_set(t, buf.data(), 0, nbytes);
}
}
}
} // namespace llama_ollama_compat

@@ -3,20 +3,27 @@
// Ollama-format GGUF compatibility shim.
//
// Older Ollama builds ship GGUFs that differ from upstream in a handful of
// ways per-architecture. This shim detects those files during load and
// translates them in-memory so the rest of llama.cpp can load them
// unmodified. Single entry point per hook; all logic is data-driven from
// per-architecture rules.
// ways per-architecture (arch names, KV keys, tensor names, file layout).
// This shim detects those files during load and translates them in-memory
// so the rest of llama.cpp can load them unmodified.
//
// Two hooks:
// 1. translate_metadata() — runs after gguf_init_from_file, mutates KVs
// and (optionally) tensor names on the gguf_context / ggml_context.
// 2. apply_tensor_transforms() — runs after load_all_data, rewrites
// tensor data that differs numerically (e.g. gemma3 RMSNorm +1).
// Four upstream hook points call into this namespace — one per insertion:
//
// 1. llama-model-loader.cpp (main model load):
// translate_metadata() — mutate KVs / tensor metadata
// should_skip_tensor() — filter weights_map population
//
// 2. tools/mtmd/clip.cpp (mmproj load):
// translate_clip_metadata() — rewrite KVs + tensor names for clip
// maybe_load_tensor() — override file read (e.g. F16->F32)
//
// Detection is per-arch; for any non-Ollama file every entry point is a
// no-op. Per-arch logic lives in anonymous-namespace handle_<arch>()
// functions in the .cpp; adding a new arch is a new handler plus one
// dispatch line in each translate_* entry point.
#include <cstdint>
#include <cstddef>
#include <string>
#include <vector>
#include "ggml-backend.h" // for ggml_backend_buffer_type_t
@@ -27,53 +34,27 @@ struct llama_model_loader;
namespace llama_ollama_compat {
// Inspect and mutate the just-loaded gguf_context. May update arch_name if
// the file uses an Ollama-specific architecture name. Safe to call for any
// model — no-op when no Ollama markers are present.
//
// If compat was applied, registers any tensor transforms against `ml` for
// apply_tensor_transforms() to consume later.
// Called from llama_model_loader's constructor, right after the arch is read.
void translate_metadata(const llama_model_loader * ml,
gguf_context * meta,
ggml_context * ctx,
std::string & arch_name);
// Returns true if the loader should skip this tensor entirely (not add to
// weights_map, not count toward n_tensors). Used to drop embedded vision
// tensors from the text model without physically removing them.
// Called from llama_model_loader's weights_map population loop. Returns
// true to drop a tensor from the loader — used to hide embedded vision
// tensors from the text model's view without modifying the gguf_context.
bool should_skip_tensor(const llama_model_loader * ml, const char * tensor_name);
// Called after load_all_data returns for a model context. Applies any
// registered transforms (read tensor data from the backend buffer, modify,
// write back) to tensors in `ctx`. Call once per model context.
void apply_tensor_transforms(const llama_model_loader * ml, ggml_context * ctx);
// Called from the clip loader (tools/mtmd/clip.cpp). If the file is an
// Ollama-format monolithic GGUF (text + embedded vision), rewrites the
// clip-facing view of the metadata so the clip loader sees it as a normal
// mmproj file. Safe to call unconditionally — no-op when not an Ollama file.
//
// Operations:
// - sets general.architecture = "clip"
// - sets clip.has_vision_encoder, clip.projector_type, clip.use_gelu
// - copies gemma3.vision.* KVs into clip.vision.*
// - renames vision tensors (v.patch_embedding -> v.patch_embd, etc.)
// - promotes specific F16 tensors to F32 in the ggml_context so clip
// allocates the correct buffer size
//
// Non-vision text tensors remain in the gguf but are never looked up by
// clip, so they cost nothing.
// Called from clip_model_loader's constructor. Rewrites the clip-facing
// view of the metadata (arch=clip, clip.vision.* KVs, renamed tensors)
// so the rest of clip.cpp can load an Ollama monolithic GGUF unchanged.
void translate_clip_metadata(gguf_context * meta, ggml_context * ctx);
// Called from clip.cpp's tensor-loading loop, before reading bytes from the
// file. If this tensor was marked for type promotion by translate_clip_metadata
// (e.g. F16→F32), reads the source bytes, converts them, and writes the
// result directly into `cur` (choosing host copy vs. backend upload based
// on `buft`). Returns true if the tensor was handled — caller should skip
// its normal file-read path. Returns false otherwise; caller loads normally.
//
// `file_offset` is the absolute file offset of the original (pre-promotion)
// tensor data in the source GGUF.
// Called from clip.cpp's tensor-loading loop, before the normal file read.
// If this tensor was marked for type promotion by translate_clip_metadata
// (e.g. F16->F32), performs the conversion and writes the result into
// `cur` (host memcpy or backend_tensor_set based on `buft`). Returns true
// when the tensor was handled — caller should skip its normal read path.
bool maybe_load_tensor(ggml_tensor * cur,
const char * source_file,
size_t file_offset,

@@ -38,28 +38,6 @@ index 4e65a45a5..75836c683 100644
// make sure there is no duplicated tensor names
if (weights_map.find(tensor_name) != weights_map.end()) {
throw std::runtime_error(format("invalid model: tensor '%s' is duplicated", ggml_get_name(cur)));
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 4ded484dd..7d3509c23 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -6,6 +6,7 @@
#include "llama-mmap.h"
#include "llama-cparams.h"
#include "llama-model-loader.h"
+#include "llama-ollama-compat.h"
#include "llama-kv-cache.h"
#include "llama-kv-cache-iswa.h"
@@ -8023,6 +8024,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
if (!ml.load_all_data(ctx, buf_map, use_mlock ? &pimpl->mlock_mmaps : NULL, params.progress_callback, params.progress_callback_user_data)) {
return false;
}
+ // Apply any Ollama-format numerical fixups (e.g. gemma3 RMSNorm +1)
+ // while the data is in its final backend buffers.
+ llama_ollama_compat::apply_tensor_transforms(&ml, ctx);
}
if (use_mmap_buffer) {
diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index f0e8786b6..35defa89d 100644
--- a/tools/mtmd/clip.cpp