llama/compat: split shared infra into a util TU

The main translation unit (llama-ollama-compat.cpp) is now purely per-arch
dispatch: a detect_* + handle_* pair for each arch, plus the four public
entry points. It drops from 724 lines to 430.

Everything that doesn't depend on a specific arch moves to
llama-ollama-compat-util.{h,cpp}:
  - gguf KV helpers (has_key, copy_{u32,f32}_kv, inject_{u32,f32,str,bool,
    f32_arr}_if_missing, truncate_{str,data}_arr)
  - ggml tensor helpers (any_tensor_with_prefix, rename_tensor,
    rename_tensors_containing, set_tensor_{type,shape}, reclaim_slot_as,
    tensor_file_offset)
  - per-loader skip-prefix registry (add_skip_prefix, should_skip_tensor_prefix)
  - LoadOp registry (register_load_op, take_load_op, read_at)
  - common high-level transforms (promote_tensor_to_f32, register_concat_load)

New helpers introduced while splitting:
  - inject_{u32,f32,str,bool,f32_arr}_if_missing — replaces the
    has_key + gguf_set_val_* idiom we were using 20+ times (see the
    short before/after example below this list).
  - reclaim_slot_as — extracts the "rename an orphan tensor slot as a
    synthesized one" pattern used by qwen35moe's patch_embed split. The
    name plus a header comment explain the workaround.
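
For example, the gemma3 handler's default injection (further down in
this diff) shrinks from

    if (!has_key(meta, "gemma3.rope.freq_base"))
        gguf_set_val_f32(meta, "gemma3.rope.freq_base", 1000000.0f);

to

    inject_f32_if_missing(meta, "gemma3.rope.freq_base", 1000000.0f);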

CMake: target_sources now globs llama/compat/*.cpp (CONFIGURE_DEPENDS),
so new .cpp files are picked up without CMake edits.

No behavioral change. Verified that gemma3 and qwen3.5 text + vision
still work end-to-end after a clean rebuild.
jmorganca 2026-04-19 12:59:21 -07:00
parent db0c745308
commit 2a388da77b
4 changed files with 449 additions and 389 deletions

llama/compat/llama-ollama-compat-util.cpp vendored Normal file

@ -0,0 +1,268 @@
#include "llama-ollama-compat-util.h"
#include "llama-impl.h"
#include "llama-model-loader.h"
#include <cstdio>
#include <cstring>
#include <mutex>
#include <unordered_map>
namespace llama_ollama_compat::detail {
// -------------------------------------------------------------------------
// gguf_context KV helpers
// -------------------------------------------------------------------------
bool has_key(const gguf_context * meta, const char * key) {
return gguf_find_key(meta, key) >= 0;
}
void copy_u32_kv(gguf_context * meta, const char * src, const char * dst) {
if (has_key(meta, dst)) return;
const int64_t k = gguf_find_key(meta, src);
if (k < 0) return;
gguf_set_val_u32(meta, dst, gguf_get_val_u32(meta, k));
}
void copy_f32_kv(gguf_context * meta, const char * src, const char * dst) {
if (has_key(meta, dst)) return;
const int64_t k = gguf_find_key(meta, src);
if (k < 0) return;
gguf_set_val_f32(meta, dst, gguf_get_val_f32(meta, k));
}
void inject_u32_if_missing (gguf_context * meta, const char * key, uint32_t v) {
if (!has_key(meta, key)) gguf_set_val_u32(meta, key, v);
}
void inject_f32_if_missing (gguf_context * meta, const char * key, float v) {
if (!has_key(meta, key)) gguf_set_val_f32(meta, key, v);
}
void inject_str_if_missing (gguf_context * meta, const char * key, const char * v) {
if (!has_key(meta, key)) gguf_set_val_str(meta, key, v);
}
void inject_bool_if_missing(gguf_context * meta, const char * key, bool v) {
if (!has_key(meta, key)) gguf_set_val_bool(meta, key, v);
}
void inject_f32_arr_if_missing(gguf_context * meta, const char * key,
const float * data, size_t n) {
if (!has_key(meta, key)) gguf_set_arr_data(meta, key, GGUF_TYPE_FLOAT32, data, n);
}
void truncate_str_arr(gguf_context * meta, const char * key, size_t new_n) {
const int64_t kid = gguf_find_key(meta, key);
if (kid < 0 || new_n >= gguf_get_arr_n(meta, kid)) return;
std::vector<std::string> owned;
owned.reserve(new_n);
std::vector<const char *> ptrs;
ptrs.reserve(new_n);
for (size_t i = 0; i < new_n; ++i) owned.emplace_back(gguf_get_arr_str(meta, kid, i));
for (const auto & s : owned) ptrs.push_back(s.c_str());
gguf_set_arr_str(meta, key, ptrs.data(), new_n);
}
void truncate_data_arr(gguf_context * meta, const char * key,
gguf_type elem_type, size_t elem_size, size_t new_n) {
const int64_t kid = gguf_find_key(meta, key);
if (kid < 0 || new_n >= gguf_get_arr_n(meta, kid)) return;
std::vector<uint8_t> copy(elem_size * new_n);
std::memcpy(copy.data(), gguf_get_arr_data(meta, kid), elem_size * new_n);
gguf_set_arr_data(meta, key, elem_type, copy.data(), new_n);
}
// -------------------------------------------------------------------------
// ggml_context tensor scans
// -------------------------------------------------------------------------
bool any_tensor_with_prefix(const ggml_context * ctx, const char * prefix) {
const size_t plen = std::strlen(prefix);
for (ggml_tensor * t = ggml_get_first_tensor(ctx); t; t = ggml_get_next_tensor(ctx, t)) {
if (std::strncmp(ggml_get_name(t), prefix, plen) == 0) return true;
}
return false;
}
// -------------------------------------------------------------------------
// Tensor renaming / reshaping (mutates both contexts)
// -------------------------------------------------------------------------
// gguf_get_tensor_name returns a pointer into a mutable `char[GGML_MAX_NAME]`
// inside a std::vector element; the const on the return type is API
// courtesy, so writing through const_cast is defined.
void rename_tensor(gguf_context * meta, ggml_context * ctx,
const char * old_name, const char * new_name) {
const int64_t id = gguf_find_tensor(meta, old_name);
if (id < 0) return;
if (char * p = const_cast<char *>(gguf_get_tensor_name(meta, id))) {
std::strncpy(p, new_name, GGML_MAX_NAME - 1);
p[GGML_MAX_NAME - 1] = '\0';
}
if (ggml_tensor * t = ggml_get_tensor(ctx, old_name)) ggml_set_name(t, new_name);
}
void rename_tensors_containing(gguf_context * meta, ggml_context * ctx,
const char * needle, const char * replacement) {
std::vector<std::pair<std::string, std::string>> renames;
const int64_t n = gguf_get_n_tensors(meta);
const size_t needle_len = std::strlen(needle);
for (int64_t i = 0; i < n; ++i) {
std::string s(gguf_get_tensor_name(meta, i));
const size_t pos = s.find(needle);
if (pos == std::string::npos) continue;
std::string ns = s;
ns.replace(pos, needle_len, replacement);
renames.emplace_back(std::move(s), std::move(ns));
}
for (const auto & [from, to] : renames) rename_tensor(meta, ctx, from.c_str(), to.c_str());
}
void set_tensor_type(ggml_tensor * t, ggml_type type) {
t->type = type;
t->nb[0] = ggml_type_size(type);
t->nb[1] = t->nb[0] * (t->ne[0] / ggml_blck_size(type));
for (int i = 2; i < GGML_MAX_DIMS; ++i) t->nb[i] = t->nb[i - 1] * t->ne[i - 1];
}
void set_tensor_shape(ggml_tensor * t, std::initializer_list<int64_t> shape) {
int i = 0;
for (auto v : shape) t->ne[i++] = v;
for (; i < GGML_MAX_DIMS; ++i) t->ne[i] = 1;
set_tensor_type(t, t->type);
}
// Rename an orphan tensor slot as a new synthesized tensor. See header for
// why this is the workaround of choice (clip's ctx_meta has no spare capacity).
bool reclaim_slot_as(gguf_context * meta, ggml_context * ctx,
const char * orphan_name, const char * new_name,
std::initializer_list<int64_t> shape, ggml_type type) {
if (gguf_find_tensor(meta, orphan_name) < 0) return false;
rename_tensor(meta, ctx, orphan_name, new_name);
ggml_tensor * t = ggml_get_tensor(ctx, new_name);
if (!t) return false;
set_tensor_shape(t, shape);
set_tensor_type (t, type);
return true;
}
size_t tensor_file_offset(const gguf_context * meta, const char * name) {
const int64_t id = gguf_find_tensor(meta, name);
if (id < 0) return 0;
return gguf_get_data_offset(meta) + gguf_get_tensor_offset(meta, id);
}
// -------------------------------------------------------------------------
// Per-loader skip-prefix registry
// -------------------------------------------------------------------------
namespace {
std::mutex g_skip_mutex;
std::unordered_map<const llama_model_loader *, std::vector<std::string>> g_skip_prefixes;
} // anon
void add_skip_prefix(const llama_model_loader * ml, std::string prefix) {
std::lock_guard<std::mutex> lk(g_skip_mutex);
g_skip_prefixes[ml].push_back(std::move(prefix));
}
bool should_skip_tensor_prefix(const llama_model_loader * ml, const char * name) {
std::lock_guard<std::mutex> lk(g_skip_mutex);
auto it = g_skip_prefixes.find(ml);
if (it == g_skip_prefixes.end()) return false;
for (const auto & prefix : it->second) {
if (std::strncmp(name, prefix.c_str(), prefix.size()) == 0) return true;
}
return false;
}
// -------------------------------------------------------------------------
// Load-time transform registry
// -------------------------------------------------------------------------
namespace {
std::mutex g_loadop_mutex;
std::unordered_map<std::string, LoadOp> g_loadops;
} // anon
void register_load_op(std::string dest_name, LoadOp op) {
std::lock_guard<std::mutex> lk(g_loadop_mutex);
g_loadops[std::move(dest_name)] = std::move(op);
}
bool take_load_op(const char * dest_name, LoadOp & out) {
std::lock_guard<std::mutex> lk(g_loadop_mutex);
auto it = g_loadops.find(dest_name);
if (it == g_loadops.end()) return false;
out = std::move(it->second);
g_loadops.erase(it);
return true;
}
bool read_at(const char * path, size_t offset, void * dst, size_t size) {
FILE * f = std::fopen(path, "rb");
if (!f) return false;
bool ok = (std::fseek(f, (long) offset, SEEK_SET) == 0
&& std::fread(dst, 1, size, f) == size);
std::fclose(f);
return ok;
}
// -------------------------------------------------------------------------
// Common high-level transforms
// -------------------------------------------------------------------------
void promote_tensor_to_f32(gguf_context * meta, ggml_context * ctx, const char * name) {
const int64_t tid = gguf_find_tensor(meta, name);
if (tid < 0) return;
ggml_tensor * t = ggml_get_tensor(ctx, name);
if (!t || t->type != GGML_TYPE_F16) return;
const size_t src_offset = tensor_file_offset(meta, name);
const size_t n_elem = ggml_nelements(t);
const size_t src_size = n_elem * sizeof(uint16_t);
set_tensor_type(t, GGML_TYPE_F32);
register_load_op(name, LoadOp{
[src_offset, src_size, n_elem](const char * path, void * dst, size_t dst_size) {
(void) dst_size;
std::vector<uint8_t> src(src_size);
if (!read_at(path, src_offset, src.data(), src_size)) return false;
const uint16_t * sp = reinterpret_cast<const uint16_t *>(src.data());
float * dp = reinterpret_cast<float *>(dst);
for (size_t i = 0; i < n_elem; ++i) dp[i] = ggml_fp16_to_fp32(sp[i]);
return true;
},
"F16->F32 promote",
});
}
void register_concat_load(const gguf_context * meta, std::string dest_name,
const std::vector<std::string> & src_names) {
std::vector<std::pair<size_t, size_t>> regions;
regions.reserve(src_names.size());
for (const auto & n : src_names) {
const int64_t id = gguf_find_tensor(meta, n.c_str());
if (id < 0) return;
regions.emplace_back(
gguf_get_data_offset(meta) + gguf_get_tensor_offset(meta, id),
gguf_get_tensor_size(meta, id));
}
register_load_op(std::move(dest_name), LoadOp{
[regions](const char * path, void * dst, size_t dst_size) {
size_t total = 0;
for (auto & [_, sz] : regions) total += sz;
if (total != dst_size) return false;
uint8_t * p = static_cast<uint8_t *>(dst);
for (auto & [off, sz] : regions) {
if (!read_at(path, off, p, sz)) return false;
p += sz;
}
return true;
},
"concat sources",
});
}
} // namespace llama_ollama_compat::detail

llama/compat/llama-ollama-compat-util.h vendored Normal file

@ -0,0 +1,85 @@
#pragma once
// Internal helpers shared by the per-architecture handlers in
// llama-ollama-compat.cpp. Not part of the public API.
//
// Everything lives under namespace llama_ollama_compat::detail. The
// definitions live in llama-ollama-compat-util.cpp, which also owns the
// registry globals (tensor skip list, load-op table) that need a single
// translation unit.
#include <cstddef>
#include <cstdint>
#include <functional>
#include <initializer_list>
#include <string>
#include <vector>
#include "ggml.h"
#include "ggml-backend.h"
#include "gguf.h"
struct llama_model_loader;
namespace llama_ollama_compat::detail {
// -- gguf_context KV helpers --
bool has_key(const gguf_context * meta, const char * key);
void copy_u32_kv(gguf_context * meta, const char * src, const char * dst);
void copy_f32_kv(gguf_context * meta, const char * src, const char * dst);
void inject_u32_if_missing (gguf_context * meta, const char * key, uint32_t v);
void inject_f32_if_missing (gguf_context * meta, const char * key, float v);
void inject_str_if_missing (gguf_context * meta, const char * key, const char * v);
void inject_bool_if_missing(gguf_context * meta, const char * key, bool v);
void inject_f32_arr_if_missing(gguf_context * meta, const char * key,
const float * data, size_t n);
void truncate_str_arr (gguf_context * meta, const char * key, size_t new_n);
void truncate_data_arr(gguf_context * meta, const char * key,
gguf_type elem_type, size_t elem_size, size_t new_n);
// -- ggml_context tensor scans --
bool any_tensor_with_prefix(const ggml_context * ctx, const char * prefix);
// -- Tensor renaming / reshaping (mutates both gguf_context and ggml_context) --
void rename_tensor(gguf_context * meta, ggml_context * ctx,
const char * old_name, const char * new_name);
void rename_tensors_containing(gguf_context * meta, ggml_context * ctx,
const char * needle, const char * replacement);
void set_tensor_type (ggml_tensor * t, ggml_type type);
void set_tensor_shape(ggml_tensor * t, std::initializer_list<int64_t> shape);
bool reclaim_slot_as (gguf_context * meta, ggml_context * ctx,
const char * orphan_name, const char * new_name,
std::initializer_list<int64_t> shape, ggml_type type);
// -- File-offset capture (before rename) --
size_t tensor_file_offset(const gguf_context * meta, const char * name);
// -- Per-loader skip-prefix registry --
void add_skip_prefix(const llama_model_loader * ml, std::string prefix);
bool should_skip_tensor_prefix(const llama_model_loader * ml, const char * name);
// -- Load-time transform registry --
struct LoadOp {
std::function<bool(const char * src_file, void * dst, size_t dst_size)> apply;
const char * description;
};
void register_load_op(std::string dest_name, LoadOp op);
bool take_load_op (const char * dest_name, LoadOp & out); // removes + returns
// Read `size` bytes at `offset` from `path` into `dst`. Used by LoadOps.
bool read_at(const char * path, size_t offset, void * dst, size_t size);
// -- Common high-level transforms --
// F16 -> F32 promotion. Captures the source file offset at registration
// time so later renames/reshapes of this tensor don't invalidate the read.
void promote_tensor_to_f32(gguf_context * meta, ggml_context * ctx, const char * name);
// Concatenate N source tensors into one destination. Captures each source's
// file offset + byte size at registration time. Layout assumption: sources
// concatenate cleanly along the destination's slow ggml axis, which in
// C order means the destination bytes are src[0] || src[1] || ... .
void register_concat_load(const gguf_context * meta, std::string dest_name,
const std::vector<std::string> & src_names);
} // namespace llama_ollama_compat::detail
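
To make the LoadOp contract above concrete, here is a minimal sketch of
both sides of the registry. It is not part of this commit: the handler
function, the tensor name, and the final ggml_backend_tensor_set upload
are illustrative assumptions; only the registry calls themselves come
from the header above.

    #include "llama-ollama-compat-util.h"

    using namespace llama_ollama_compat::detail;

    // Handler side (hypothetical arch fixup): register a pass-through
    // LoadOp for one tensor, capturing its file offset before any
    // rename/reshape could invalidate it.
    static void fixup_example(gguf_context * meta, ggml_context * ctx) {
        const char * name = "blk.0.example.weight"; // illustrative name
        const ggml_tensor * t = ggml_get_tensor(ctx, name);
        if (!t) return;
        const size_t off = tensor_file_offset(meta, name);
        const size_t sz  = ggml_nbytes(t);
        register_load_op(name, LoadOp{
            [off, sz](const char * path, void * dst, size_t dst_size) {
                return dst_size == sz && read_at(path, off, dst, dst_size);
            },
            "example pass-through",
        });
    }

    // Loader side, roughly mirroring maybe_load_tensor below: take the
    // op (take_load_op removes it, so each op fires once), let it fill
    // a staging buffer, then upload. The backend upload call is an
    // assumption, not shown in this diff.
    static bool load_with_op(ggml_tensor * cur, const char * src_path) {
        LoadOp op;
        if (!take_load_op(ggml_get_name(cur), op)) return false;
        std::vector<uint8_t> buf(ggml_nbytes(cur));
        if (!op.apply(src_path, buf.data(), buf.size())) return false;
        ggml_backend_tensor_set(cur, buf.data(), 0, buf.size());
        return true;
    }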

llama/compat/llama-ollama-compat.cpp vendored

@ -1,259 +1,33 @@
#include "llama-ollama-compat.h"
#include "llama-ollama-compat-util.h"
#include "ggml.h"
#include "ggml-backend.h"
#include "gguf.h"
#include "llama-impl.h"
#include "llama-model-loader.h"
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <mutex>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>
namespace llama_ollama_compat {
using namespace llama_ollama_compat::detail; // pull detail:: helpers into scope
namespace {
// -------------------------------------------------------------------------
// tiny gguf_context helpers
// -------------------------------------------------------------------------
bool has_key(const gguf_context * meta, const char * key) {
return gguf_find_key(meta, key) >= 0;
}
bool any_tensor_with_prefix(const ggml_context * ctx, const char * prefix) {
const size_t plen = std::strlen(prefix);
for (ggml_tensor * t = ggml_get_first_tensor(ctx); t; t = ggml_get_next_tensor(ctx, t)) {
if (std::strncmp(ggml_get_name(t), prefix, plen) == 0) return true;
}
return false;
}
// Copy a uint32 KV from src to dst if src exists and dst doesn't.
void copy_u32_kv(gguf_context * meta, const char * src, const char * dst) {
if (has_key(meta, dst)) return;
const int64_t k = gguf_find_key(meta, src);
if (k < 0) return;
gguf_set_val_u32(meta, dst, gguf_get_val_u32(meta, k));
}
// Copy a float32 KV from src to dst if src exists and dst doesn't.
void copy_f32_kv(gguf_context * meta, const char * src, const char * dst) {
if (has_key(meta, dst)) return;
const int64_t k = gguf_find_key(meta, src);
if (k < 0) return;
gguf_set_val_f32(meta, dst, gguf_get_val_f32(meta, k));
}
// Truncate a string-typed KV array to `new_n` entries.
void truncate_str_arr(gguf_context * meta, const char * key, size_t new_n) {
const int64_t kid = gguf_find_key(meta, key);
if (kid < 0 || new_n >= gguf_get_arr_n(meta, kid)) return;
std::vector<std::string> owned;
owned.reserve(new_n);
std::vector<const char *> ptrs;
ptrs.reserve(new_n);
for (size_t i = 0; i < new_n; ++i) owned.emplace_back(gguf_get_arr_str(meta, kid, i));
for (const auto & s : owned) ptrs.push_back(s.c_str());
gguf_set_arr_str(meta, key, ptrs.data(), new_n);
}
// Truncate a primitive-typed KV array to `new_n` entries.
void truncate_data_arr(gguf_context * meta, const char * key,
gguf_type elem_type, size_t elem_size, size_t new_n) {
const int64_t kid = gguf_find_key(meta, key);
if (kid < 0 || new_n >= gguf_get_arr_n(meta, kid)) return;
std::vector<uint8_t> copy(elem_size * new_n);
std::memcpy(copy.data(), gguf_get_arr_data(meta, kid), elem_size * new_n);
gguf_set_arr_data(meta, key, elem_type, copy.data(), new_n);
}
// Rename a tensor in BOTH the gguf_context and the ggml_context so that all
// name-based lookups agree. gguf_get_tensor_name returns a pointer into a
// mutable `char[GGML_MAX_NAME]` inside a std::vector element; the const on
// the return type is API courtesy, so writing through const_cast is defined.
void rename_tensor(gguf_context * meta, ggml_context * ctx,
const char * old_name, const char * new_name) {
const int64_t id = gguf_find_tensor(meta, old_name);
if (id < 0) return;
if (char * p = const_cast<char *>(gguf_get_tensor_name(meta, id))) {
std::strncpy(p, new_name, GGML_MAX_NAME - 1);
p[GGML_MAX_NAME - 1] = '\0';
}
if (ggml_tensor * t = ggml_get_tensor(ctx, old_name)) ggml_set_name(t, new_name);
}
// Rename every tensor whose name contains `needle` (covers `.weight` + `.bias`).
void rename_tensors_containing(gguf_context * meta, ggml_context * ctx,
const char * needle, const char * replacement) {
std::vector<std::pair<std::string, std::string>> renames;
const int64_t n = gguf_get_n_tensors(meta);
const size_t needle_len = std::strlen(needle);
for (int64_t i = 0; i < n; ++i) {
std::string s(gguf_get_tensor_name(meta, i));
const size_t pos = s.find(needle);
if (pos == std::string::npos) continue;
std::string ns = s;
ns.replace(pos, needle_len, replacement);
renames.emplace_back(std::move(s), std::move(ns));
}
for (const auto & [from, to] : renames) rename_tensor(meta, ctx, from.c_str(), to.c_str());
}
// -------------------------------------------------------------------------
// per-loader state (currently just the "drop these tensor prefixes" list)
// -------------------------------------------------------------------------
std::mutex g_registry_mutex;
std::unordered_map<const llama_model_loader *, std::vector<std::string>> g_skip_prefixes;
void add_skip_prefix(const llama_model_loader * ml, std::string prefix) {
std::lock_guard<std::mutex> lk(g_registry_mutex);
g_skip_prefixes[ml].push_back(std::move(prefix));
}
// -------------------------------------------------------------------------
// Load-time tensor transforms (registry consumed by maybe_load_tensor)
//
// Each registered op produces the final bytes for a single destination
// tensor by reading + transforming bytes from the source GGUF file.
// Used for F16->F32 promotion, QKV merging, and patch-embed splitting.
// -------------------------------------------------------------------------
struct LoadOp {
// apply() reads what it needs from `src_file` and fills `dst` (dst_size
// bytes). Returns false on failure.
std::function<bool(const char * src_file, void * dst, size_t dst_size)> apply;
const char * description;
};
std::mutex g_loadop_mutex;
std::unordered_map<std::string, LoadOp> g_loadops;
void register_load_op(std::string dest_name, LoadOp op) {
std::lock_guard<std::mutex> lk(g_loadop_mutex);
g_loadops[std::move(dest_name)] = std::move(op);
}
// Helper: read `size` bytes at `offset` from `path` into `dst`.
bool read_at(const char * path, size_t offset, void * dst, size_t size) {
FILE * f = std::fopen(path, "rb");
if (!f) return false;
bool ok = (std::fseek(f, (long) offset, SEEK_SET) == 0
&& std::fread(dst, 1, size, f) == size);
std::fclose(f);
return ok;
}
// Capture a tensor's absolute file offset BEFORE any rename or reshape.
size_t tensor_file_offset(const gguf_context * meta, const char * name) {
const int64_t id = gguf_find_tensor(meta, name);
if (id < 0) return 0;
return gguf_get_data_offset(meta) + gguf_get_tensor_offset(meta, id);
}
// Set a tensor's type and recompute strides in a ggml_context.
void set_tensor_type(ggml_tensor * t, ggml_type type) {
t->type = type;
t->nb[0] = ggml_type_size(type);
t->nb[1] = t->nb[0] * (t->ne[0] / ggml_blck_size(type));
for (int i = 2; i < GGML_MAX_DIMS; ++i) t->nb[i] = t->nb[i - 1] * t->ne[i - 1];
}
// Set a tensor's shape and recompute strides in a ggml_context.
void set_tensor_shape(ggml_tensor * t, std::initializer_list<int64_t> shape) {
int i = 0;
for (auto v : shape) t->ne[i++] = v;
for (; i < GGML_MAX_DIMS; ++i) t->ne[i] = 1;
set_tensor_type(t, t->type);
}
// Promote a tensor F16 -> F32. The disk bytes stay F16; we register a
// load op that converts on read.
void promote_tensor_to_f32(gguf_context * meta, ggml_context * ctx, const char * name) {
const int64_t tid = gguf_find_tensor(meta, name);
if (tid < 0) return;
ggml_tensor * t = ggml_get_tensor(ctx, name);
if (!t || t->type != GGML_TYPE_F16) return;
const size_t src_offset = tensor_file_offset(meta, name);
const size_t n_elem = ggml_nelements(t);
const size_t src_size = n_elem * sizeof(uint16_t);
set_tensor_type(t, GGML_TYPE_F32);
register_load_op(name, LoadOp{
[src_offset, src_size, n_elem](const char * path, void * dst, size_t dst_size) {
(void) dst_size;
std::vector<uint8_t> src(src_size);
if (!read_at(path, src_offset, src.data(), src_size)) return false;
const uint16_t * sp = reinterpret_cast<const uint16_t *>(src.data());
float * dp = reinterpret_cast<float *>(dst);
for (size_t i = 0; i < n_elem; ++i) dp[i] = ggml_fp16_to_fp32(sp[i]);
return true;
},
"F16->F32 promote",
});
}
// Concatenate N source tensors into one destination tensor. Captures
// source file offsets and sizes at registration time so later renames or
// reshapes don't affect the read. Layout assumption: the source tensors
// concatenate cleanly along their slowest dim, which in C/ggml order
// means the destination's bytes are just src[0] || src[1] || ... .
void register_concat_load(const gguf_context * meta, std::string dest_name,
const std::vector<std::string> & src_names) {
std::vector<std::pair<size_t, size_t>> regions; // (offset, size)
regions.reserve(src_names.size());
for (const auto & n : src_names) {
const int64_t id = gguf_find_tensor(meta, n.c_str());
if (id < 0) return; // bail; downstream will fail loudly
regions.emplace_back(
gguf_get_data_offset(meta) + gguf_get_tensor_offset(meta, id),
gguf_get_tensor_size(meta, id));
}
register_load_op(std::move(dest_name), LoadOp{
[regions](const char * path, void * dst, size_t dst_size) {
size_t total = 0;
for (auto & [_, sz] : regions) total += sz;
if (total != dst_size) return false;
uint8_t * p = static_cast<uint8_t *>(dst);
for (auto & [off, sz] : regions) {
if (!read_at(path, off, p, sz)) return false;
p += sz;
}
return true;
},
"concat sources",
});
}
// -------------------------------------------------------------------------
// =========================================================================
// gemma3 (text side)
// -------------------------------------------------------------------------
// =========================================================================
// Returns true if this looks like an Ollama-format gemma3 blob. Requires
// the file to declare itself gemma3 (either via general.architecture or
// by having at least one gemma3.* KV), AND to exhibit at least one Ollama
// quirk. Different Ollama converter versions produced different quirks
// (4B/12B/27B have embedded vision + mm KVs; 1B uses non-standard rope
// key names; all of them omit layer_norm_rms_epsilon).
// An Ollama-format gemma3 file declares arch="gemma3" AND exhibits at
// least one converter quirk. Different converter versions produced
// different quirks (4B/12B/27B have embedded vision + mm KVs; 1B uses
// non-standard rope key names; all of them omit layer_norm_rms_epsilon).
bool detect_ollama_gemma3(const gguf_context * meta, const ggml_context * ctx) {
// Claim #1: the file is gemma3.
const int64_t arch_kid = gguf_find_key(meta, "general.architecture");
if (arch_kid < 0) return false;
if (std::strcmp(gguf_get_val_str(meta, arch_kid), "gemma3") != 0) return false;
// Claim #2: at least one Ollama-ism. An upstream-converted gemma3 would
// have none of these (except possibly the v./mm. prefixes, which upstream
// never ships in the text file — they live in a separate mmproj).
return has_key(meta, "gemma3.mm.tokens_per_image")
|| any_tensor_with_prefix(ctx, "v.")
|| any_tensor_with_prefix(ctx, "mm.")
@ -270,29 +44,23 @@ void handle_gemma3(const llama_model_loader * ml, gguf_context * meta, ggml_cont
LLAMA_LOG_INFO("%s: detected Ollama-format gemma3 GGUF; applying compatibility fixes\n", __func__);
// Old Ollama converters sometimes used nested rope key names. Copy
// them to the flat names upstream expects. Copy-if-missing order
// matters: we want real values to take priority over injected defaults.
// them to the flat names upstream expects BEFORE injecting defaults.
copy_f32_kv(meta, "gemma3.rope.global.freq_base", "gemma3.rope.freq_base");
copy_f32_kv(meta, "gemma3.rope.local.freq_base", "gemma3.rope.freq_base_swa");
// Inject required KVs with their standard gemma3 defaults (no-op if
// already present).
if (!has_key(meta, "gemma3.attention.layer_norm_rms_epsilon"))
gguf_set_val_f32(meta, "gemma3.attention.layer_norm_rms_epsilon", 1e-6f);
if (!has_key(meta, "gemma3.rope.freq_base"))
gguf_set_val_f32(meta, "gemma3.rope.freq_base", 1000000.0f);
if (!has_key(meta, "gemma3.rope.freq_base_swa"))
gguf_set_val_f32(meta, "gemma3.rope.freq_base_swa", 10000.0f);
// Inject required KVs with their standard gemma3 defaults.
inject_f32_if_missing(meta, "gemma3.attention.layer_norm_rms_epsilon", 1e-6f);
inject_f32_if_missing(meta, "gemma3.rope.freq_base", 1000000.0f);
inject_f32_if_missing(meta, "gemma3.rope.freq_base_swa", 10000.0f);
// Gemma3 4B/12B/27B ship with {type: "linear", factor: 8.0} rope scaling
// in their HF config to extend the 16k trained context to 131072. Ollama's
// old converter didn't write these. The 1B has no scaling — detect by
// context length.
int64_t ctx_key = gguf_find_key(meta, "gemma3.context_length");
if (ctx_key >= 0 && gguf_get_val_u32(meta, ctx_key) >= 131072
&& !has_key(meta, "gemma3.rope.scaling.factor")) {
gguf_set_val_str(meta, "gemma3.rope.scaling.type", "linear");
gguf_set_val_f32(meta, "gemma3.rope.scaling.factor", 8.0f);
const int64_t ctx_key = gguf_find_key(meta, "gemma3.context_length");
if (ctx_key >= 0 && gguf_get_val_u32(meta, ctx_key) >= 131072) {
inject_str_if_missing(meta, "gemma3.rope.scaling.type", "linear");
inject_f32_if_missing(meta, "gemma3.rope.scaling.factor", 8.0f);
}
// Tokenizer vocab size vs embedding rows mismatch: Ollama leaves extra
@ -317,20 +85,19 @@ void handle_gemma3(const llama_model_loader * ml, gguf_context * meta, ggml_cont
// already have the +1 shift baked in, same as upstream's convert_hf.
}
// -------------------------------------------------------------------------
// =========================================================================
// qwen35moe (text side)
// -------------------------------------------------------------------------
// =========================================================================
bool detect_ollama_qwen35moe(const gguf_context * meta, const ggml_context * ctx) {
// Require the file to declare itself qwen35moe first.
const int64_t arch_kid = gguf_find_key(meta, "general.architecture");
if (arch_kid < 0) return false;
if (std::strcmp(gguf_get_val_str(meta, arch_kid), "qwen35moe") != 0) return false;
// Then: at least one Ollama-ism. Upstream qwen35moe text files have none
// of these — the vision KVs move to mmproj, MTP tensors are dropped,
// head_count_kv is a scalar not an array, and the various extra rope /
// ssm KVs below are either absent or stored differently.
// Any Ollama-ism. Upstream qwen35moe files have none of these — the
// vision KVs live in a separate mmproj, MTP tensors are dropped,
// head_count_kv is a scalar, and the extra rope / ssm / feed_forward
// KVs are either absent or stored differently.
return has_key(meta, "qwen35moe.vision.block_count")
|| has_key(meta, "qwen35moe.image_token_id")
|| has_key(meta, "qwen35moe.ssm.v_head_reordered")
@ -346,8 +113,8 @@ void handle_qwen35moe(const llama_model_loader * ml, gguf_context * meta, ggml_c
LLAMA_LOG_INFO("%s: detected Ollama-format qwen35moe GGUF; applying compatibility fixes\n", __func__);
// 1. attention.head_count_kv — upstream expects UINT32; Ollama wrote
// an array (one entry per layer, 0 for SSM layers, 2 for attention
// layers). Collapse to the max non-zero value.
// an array (one entry per layer, 0 for SSM layers, 2 for attention).
// Collapse to the max non-zero value.
{
const int64_t kid = gguf_find_key(meta, "qwen35moe.attention.head_count_kv");
if (kid >= 0 && gguf_get_kv_type(meta, kid) == GGUF_TYPE_ARRAY) {
@ -356,8 +123,8 @@ void handle_qwen35moe(const llama_model_loader * ml, gguf_context * meta, ggml_c
uint32_t max_kv = 0;
for (size_t i = 0; i < n; ++i) if (arr[i] > max_kv) max_kv = arr[i];
if (max_kv == 0) max_kv = 2; // safety fallback
gguf_remove_key(meta, "qwen35moe.attention.head_count_kv");
gguf_set_val_u32(meta, "qwen35moe.attention.head_count_kv", max_kv);
gguf_remove_key (meta, "qwen35moe.attention.head_count_kv");
gguf_set_val_u32 (meta, "qwen35moe.attention.head_count_kv", max_kv);
}
}
@ -373,8 +140,8 @@ void handle_qwen35moe(const llama_model_loader * ml, gguf_context * meta, ggml_c
}
}
// 3. Tensor rename: Ollama's `blk.N.ssm_dt` corresponds to upstream's
// `blk.N.ssm_dt.bias` (same shape, F32 [32]). 40 layers.
// 3. Tensor rename: Ollama's `blk.N.ssm_dt` is upstream's
// `blk.N.ssm_dt.bias` (same shape). 40 layers.
{
std::vector<std::string> targets;
const int64_t n = gguf_get_n_tensors(meta);
@ -393,18 +160,15 @@ void handle_qwen35moe(const llama_model_loader * ml, gguf_context * meta, ggml_c
}
// 4. Drop embedded vision + MTP + projector tensors from the text loader.
// (vision goes to clip via --mmproj; MTP isn't used by upstream.)
add_skip_prefix(ml, "v.");
add_skip_prefix(ml, "mm.");
add_skip_prefix(ml, "mtp.");
}
// -------------------------------------------------------------------------
// =========================================================================
// gemma3 (clip side)
// -------------------------------------------------------------------------
// =========================================================================
// Ollama -> upstream tensor-name renames. Applied via substring match, so
// both `.weight` and `.bias` variants are covered with one entry each.
constexpr std::pair<const char *, const char *> kGemma3ClipRenames[] = {
{"v.patch_embedding", "v.patch_embd"},
{"v.position_embedding", "v.position_embd"},
@ -419,7 +183,6 @@ constexpr std::pair<const char *, const char *> kGemma3ClipRenames[] = {
};
void handle_gemma3_clip(gguf_context * meta, ggml_context * ctx) {
// Synthesize clip.vision.* from gemma3.vision.* (same values, different key).
copy_u32_kv(meta, "gemma3.vision.block_count", "clip.vision.block_count");
copy_u32_kv(meta, "gemma3.vision.embedding_length", "clip.vision.embedding_length");
copy_u32_kv(meta, "gemma3.vision.feed_forward_length", "clip.vision.feed_forward_length");
@ -430,18 +193,12 @@ void handle_gemma3_clip(gguf_context * meta, ggml_context * ctx) {
// projection_dim = text model's embedding_length (mmproj out == LM in).
copy_u32_kv(meta, "gemma3.embedding_length", "clip.vision.projection_dim");
// image_mean / image_std are constants for gemma3 vision.
if (!has_key(meta, "clip.vision.image_mean")) {
const float mean[3] = {0.5f, 0.5f, 0.5f};
gguf_set_arr_data(meta, "clip.vision.image_mean", GGUF_TYPE_FLOAT32, mean, 3);
}
if (!has_key(meta, "clip.vision.image_std")) {
const float std_[3] = {0.5f, 0.5f, 0.5f};
gguf_set_arr_data(meta, "clip.vision.image_std", GGUF_TYPE_FLOAT32, std_, 3);
}
static const float kHalfHalfHalf[3] = {0.5f, 0.5f, 0.5f};
inject_f32_arr_if_missing(meta, "clip.vision.image_mean", kHalfHalfHalf, 3);
inject_f32_arr_if_missing(meta, "clip.vision.image_std", kHalfHalfHalf, 3);
if (!has_key(meta, "clip.has_vision_encoder")) gguf_set_val_bool(meta, "clip.has_vision_encoder", true);
if (!has_key(meta, "clip.use_gelu")) gguf_set_val_bool(meta, "clip.use_gelu", true);
inject_bool_if_missing(meta, "clip.has_vision_encoder", true);
inject_bool_if_missing(meta, "clip.use_gelu", true);
gguf_set_val_str(meta, "clip.projector_type", "gemma3");
gguf_set_val_str(meta, "general.architecture", "clip");
@ -456,11 +213,10 @@ void handle_gemma3_clip(gguf_context * meta, ggml_context * ctx) {
promote_tensor_to_f32(meta, ctx, "v.position_embd.weight");
}
// -------------------------------------------------------------------------
// =========================================================================
// qwen35moe (clip side)
// -------------------------------------------------------------------------
// =========================================================================
// Substring renames. One entry handles both `.weight` and `.bias` variants.
constexpr std::pair<const char *, const char *> kQwen35moeClipRenames[] = {
{"v.pos_embed", "v.position_embd"},
{"v.patch_embed", "v.patch_embd"},
@ -473,41 +229,31 @@ constexpr std::pair<const char *, const char *> kQwen35moeClipRenames[] = {
{".norm2", ".ln2"},
};
// Register a QKV merge for a single block: Ollama has separate attn_q,
// attn_k, attn_v tensors; upstream wants them concatenated along their
// slow axis. Capture source file offsets BEFORE renaming.
// Register a QKV merge for a single vision block: Ollama has separate
// attn_q, attn_k, attn_v tensors; upstream wants them concatenated along
// their slow axis. Capture source file offsets BEFORE renaming attn_q.
void register_qwen35moe_qkv_merge(gguf_context * meta, ggml_context * ctx, int block_idx) {
char qname[64], kname[64], vname[64];
std::snprintf(qname, sizeof(qname), "v.blk.%d.attn_q.weight", block_idx);
std::snprintf(kname, sizeof(kname), "v.blk.%d.attn_k.weight", block_idx);
std::snprintf(vname, sizeof(vname), "v.blk.%d.attn_v.weight", block_idx);
const ggml_tensor * q = ggml_get_tensor(ctx, qname);
if (!q) return; // not a qwen35moe vision block
// Set up the destination tensor. We rename attn_q -> attn_qkv and
// widen its slow axis from [1152, 1152] to [1152, 3456] (3 * hidden).
char qkv_w[64], qkv_b[64], qbias[64], kbias[64], vbias[64];
std::snprintf(qkv_w, sizeof(qkv_w), "v.blk.%d.attn_qkv.weight", block_idx);
std::snprintf(qkv_b, sizeof(qkv_b), "v.blk.%d.attn_qkv.bias", block_idx);
char q[64], k[64], v[64], qbias[64], kbias[64], vbias[64], qkv_w[64], qkv_b[64];
std::snprintf(q, sizeof(q), "v.blk.%d.attn_q.weight", block_idx);
std::snprintf(k, sizeof(k), "v.blk.%d.attn_k.weight", block_idx);
std::snprintf(v, sizeof(v), "v.blk.%d.attn_v.weight", block_idx);
std::snprintf(qbias, sizeof(qbias), "v.blk.%d.attn_q.bias", block_idx);
std::snprintf(kbias, sizeof(kbias), "v.blk.%d.attn_k.bias", block_idx);
std::snprintf(vbias, sizeof(vbias), "v.blk.%d.attn_v.bias", block_idx);
std::snprintf(qkv_w, sizeof(qkv_w), "v.blk.%d.attn_qkv.weight", block_idx);
std::snprintf(qkv_b, sizeof(qkv_b), "v.blk.%d.attn_qkv.bias", block_idx);
if (!ggml_get_tensor(ctx, q)) return; // no vision block at this index
// Capture source offsets for the concat BEFORE renaming.
register_concat_load(meta, qkv_w, {qname, kname, vname});
register_concat_load(meta, qkv_w, {q, k, v});
register_concat_load(meta, qkv_b, {qbias, kbias, vbias});
// Rename attn_q -> attn_qkv and widen shape.
rename_tensor(meta, ctx, qname, qkv_w);
if (ggml_tensor * t = ggml_get_tensor(ctx, qkv_w)) {
set_tensor_shape(t, {t->ne[0], t->ne[1] * 3});
}
// Rename attn_q.bias -> attn_qkv.bias and widen from [1152] to [3456].
// Rename attn_q -> attn_qkv and widen from [hidden, hidden] to [hidden, 3*hidden].
rename_tensor(meta, ctx, q, qkv_w);
if (ggml_tensor * t = ggml_get_tensor(ctx, qkv_w)) set_tensor_shape(t, {t->ne[0], t->ne[1] * 3});
rename_tensor(meta, ctx, qbias, qkv_b);
if (ggml_tensor * t = ggml_get_tensor(ctx, qkv_b)) {
set_tensor_shape(t, {t->ne[0] * 3});
}
if (ggml_tensor * t = ggml_get_tensor(ctx, qkv_b)) set_tensor_shape(t, {t->ne[0] * 3});
}
// Register the patch_embed reshape + split + F16->F32.
@ -518,21 +264,15 @@ void register_qwen35moe_qkv_merge(gguf_context * meta, ggml_context * ctx, int b
// [out_c=1152, in_c=3, ...] dim pair, so packed_c = c_out*3 + c_in.
//
// Destination: two upstream tensors with ggml shape
// [h=16, w=16, c_in=3, c_out=1152] F32 each,
// one per temporal slice. Matches upstream's
// yield data_torch[:, :, 0, ...] # PyTorch [1152, 3, 16, 16]
// yield data_torch[:, :, 1, ...]
// which reverses to ggml ne=[16, 16, 3, 1152] per slice.
// [h=16, w=16, c_in=3, c_out=1152] F32 each, one per temporal slice.
//
// For each output element (h, w, c_in, c_out):
// src_idx = h + w*W + t*W*H + (c_out*C_in + c_in)*W*H*T
// dst_idx = h + w*W + c_in*W*H + c_out*W*H*C_in
void register_qwen35moe_patch_embed_split(gguf_context * meta, ggml_context * ctx) {
const char * src_name = "v.patch_embed.weight";
const int64_t tid = gguf_find_tensor(meta, src_name);
if (tid < 0) return;
ggml_tensor * src_t = ggml_get_tensor(ctx, src_name);
if (gguf_find_tensor(meta, src_name) < 0) return;
const ggml_tensor * src_t = ggml_get_tensor(ctx, src_name);
if (!src_t) return;
const size_t src_offset = tensor_file_offset(meta, src_name);
@ -552,8 +292,8 @@ void register_qwen35moe_patch_embed_split(gguf_context * meta, ggml_context * ct
for (int c_out = 0; c_out < COUT; ++c_out) {
for (int c_in = 0; c_in < CIN; ++c_in) {
const size_t packed = (size_t) c_out * CIN + c_in;
const uint16_t * in_base = sp + HW * (slice_idx + T * packed);
float * out_base = dp + HW * (c_in + CIN * c_out);
const uint16_t * in_base = sp + HW * (slice_idx + T * packed);
float * out_base = dp + HW * (c_in + CIN * c_out);
for (size_t i = 0; i < HW; ++i) out_base[i] = ggml_fp16_to_fp32(in_base[i]);
}
}
@ -565,105 +305,82 @@ void register_qwen35moe_patch_embed_split(gguf_context * meta, ggml_context * ct
};
// Rename src -> `v.patch_embd.weight`, reshape to dest layout, register
// the slice-0 load op against its new name.
// the slice-0 load op.
rename_tensor(meta, ctx, src_name, "v.patch_embd.weight");
ggml_tensor * dest0 = ggml_get_tensor(ctx, "v.patch_embd.weight");
if (!dest0) return;
set_tensor_shape(dest0, {16, 16, 3, 1152});
set_tensor_type (dest0, GGML_TYPE_F32);
if (ggml_tensor * dest0 = ggml_get_tensor(ctx, "v.patch_embd.weight")) {
set_tensor_shape(dest0, {H, W, CIN, COUT});
set_tensor_type (dest0, GGML_TYPE_F32);
}
register_load_op("v.patch_embd.weight", make_slice_op(0));
// We need a sibling tensor `v.patch_embd.weight.1` in ctx_meta so clip's
// get_tensor() can find it. ggml_new_tensor() would blow ctx_meta's
// fixed memory pool (sized exactly for the original tensor count).
// Instead, steal an unused slot: after the QKV merge, `v.blk.0.attn_k`
// is orphaned in ctx_meta — clip never looks it up because it asks for
// the merged `attn_qkv`. Rename it to our sibling and reshape.
rename_tensor(meta, ctx, "v.blk.0.attn_k.weight", "v.patch_embd.weight.1");
ggml_tensor * dest1 = ggml_get_tensor(ctx, "v.patch_embd.weight.1");
if (!dest1) return;
set_tensor_shape(dest1, {16, 16, 3, 1152});
set_tensor_type (dest1, GGML_TYPE_F32);
// Reclaim the `v.blk.0.attn_k.weight` slot (orphaned by the QKV merge)
// as the sibling `v.patch_embd.weight.1`.
reclaim_slot_as(meta, ctx,
"v.blk.0.attn_k.weight", "v.patch_embd.weight.1",
{H, W, CIN, COUT}, GGML_TYPE_F32);
register_load_op("v.patch_embd.weight.1", make_slice_op(1));
}
void handle_qwen35moe_clip(gguf_context * meta, ggml_context * ctx) {
LLAMA_LOG_INFO("%s: detected Ollama-format qwen35moe GGUF used as mmproj; translating\n", __func__);
// KV synthesis: clip.vision.* from qwen35moe.vision.* (plus defaults).
copy_u32_kv(meta, "qwen35moe.vision.block_count", "clip.vision.block_count");
copy_u32_kv(meta, "qwen35moe.vision.embedding_length", "clip.vision.embedding_length");
copy_u32_kv(meta, "qwen35moe.vision.attention.head_count", "clip.vision.attention.head_count");
copy_u32_kv(meta, "qwen35moe.vision.patch_size", "clip.vision.patch_size");
copy_u32_kv(meta, "qwen35moe.vision.spatial_merge_size", "clip.vision.spatial_merge_size");
copy_u32_kv(meta, "qwen35moe.vision.num_channels", "clip.vision.num_channels");
// projection_dim is the text model's embedding_length (merger out dim).
// projection_dim = text model's embedding_length.
copy_u32_kv(meta, "qwen35moe.embedding_length", "clip.vision.projection_dim");
// Ollama omitted these; defaults match reference (ref_Q3.5-35B-A3B mmproj).
if (!has_key(meta, "clip.vision.feed_forward_length"))
gguf_set_val_u32(meta, "clip.vision.feed_forward_length", 4304);
if (!has_key(meta, "clip.vision.image_size"))
gguf_set_val_u32(meta, "clip.vision.image_size", 768);
if (!has_key(meta, "clip.vision.attention.layer_norm_epsilon"))
gguf_set_val_f32(meta, "clip.vision.attention.layer_norm_epsilon", 1e-6f);
// Defaults for KVs Ollama omitted (match the Qwen3.5-35B-A3B reference mmproj).
inject_u32_if_missing(meta, "clip.vision.feed_forward_length", 4304);
inject_u32_if_missing(meta, "clip.vision.image_size", 768);
inject_f32_if_missing(meta, "clip.vision.attention.layer_norm_epsilon", 1e-6f);
// image_mean / image_std — constants for qwen3.5 vision.
if (!has_key(meta, "clip.vision.image_mean")) {
const float v[3] = {0.5f, 0.5f, 0.5f};
gguf_set_arr_data(meta, "clip.vision.image_mean", GGUF_TYPE_FLOAT32, v, 3);
}
if (!has_key(meta, "clip.vision.image_std")) {
const float v[3] = {0.5f, 0.5f, 0.5f};
gguf_set_arr_data(meta, "clip.vision.image_std", GGUF_TYPE_FLOAT32, v, 3);
}
static const float kHalfHalfHalf[3] = {0.5f, 0.5f, 0.5f};
inject_f32_arr_if_missing(meta, "clip.vision.image_mean", kHalfHalfHalf, 3);
inject_f32_arr_if_missing(meta, "clip.vision.image_std", kHalfHalfHalf, 3);
// is_deepstack_layers: qwen3.5 35B has no deepstack layers. Set a
// 27-element array of False matching clip.vision.block_count.
// is_deepstack_layers: qwen3.5 35B has no deepstack layers. Set 27 False.
if (!has_key(meta, "clip.vision.is_deepstack_layers")) {
uint8_t bools[27] = {};
gguf_set_arr_data(meta, "clip.vision.is_deepstack_layers", GGUF_TYPE_BOOL, bools, 27);
}
if (!has_key(meta, "clip.has_vision_encoder")) gguf_set_val_bool(meta, "clip.has_vision_encoder", true);
if (!has_key(meta, "clip.use_gelu")) gguf_set_val_bool(meta, "clip.use_gelu", true);
inject_bool_if_missing(meta, "clip.has_vision_encoder", true);
inject_bool_if_missing(meta, "clip.use_gelu", true);
gguf_set_val_str(meta, "clip.projector_type", "qwen3vl_merger");
gguf_set_val_str(meta, "general.architecture", "clip");
// QKV merge per block. Runs BEFORE the substring renames so we can
// reliably find attn_q / attn_k / attn_v by name.
// QKV merge runs BEFORE substring renames so it can find attn_q/k/v by name.
const int64_t n_blocks_key = gguf_find_key(meta, "clip.vision.block_count");
const uint32_t n_blocks = n_blocks_key >= 0 ? gguf_get_val_u32(meta, n_blocks_key) : 27;
for (uint32_t b = 0; b < n_blocks; ++b) {
register_qwen35moe_qkv_merge(meta, ctx, (int) b);
}
for (uint32_t b = 0; b < n_blocks; ++b) register_qwen35moe_qkv_merge(meta, ctx, (int) b);
// patch_embed: reshape + temporal split + F16->F32. Also BEFORE renames
// because it references `v.patch_embed.weight` by name.
// Also before renames: patch_embed references the source by name.
register_qwen35moe_patch_embed_split(meta, ctx);
// Substring renames (last). These handle the simple pos_embed, merger.*,
// linear_fc1/2, norm1/2 conversions.
// Simple substring renames.
for (const auto & [from, to] : kQwen35moeClipRenames) {
rename_tensors_containing(meta, ctx, from, to);
}
// F16 -> F32 on position_embd after rename.
promote_tensor_to_f32(meta, ctx, "v.position_embd.weight");
}
} // anonymous namespace
// -------------------------------------------------------------------------
// public entry points
// -------------------------------------------------------------------------
// =========================================================================
// Public entry points
// =========================================================================
void translate_metadata(const llama_model_loader * ml,
gguf_context * meta,
ggml_context * ctx,
std::string & arch_name) {
if (!meta) return;
if (arch_name == "gemma3") handle_gemma3(ml, meta, ctx);
if (arch_name == "gemma3") handle_gemma3 (ml, meta, ctx);
if (arch_name == "qwen35moe") handle_qwen35moe(ml, meta, ctx);
// Dispatch. Add more arches as they are wired up.
}
@ -684,13 +401,7 @@ void translate_clip_metadata(gguf_context * meta, ggml_context * ctx) {
}
bool should_skip_tensor(const llama_model_loader * ml, const char * tensor_name) {
std::lock_guard<std::mutex> lk(g_registry_mutex);
auto it = g_skip_prefixes.find(ml);
if (it == g_skip_prefixes.end()) return false;
for (const auto & prefix : it->second) {
if (std::strncmp(tensor_name, prefix.c_str(), prefix.size()) == 0) return true;
}
return false;
return should_skip_tensor_prefix(ml, tensor_name);
}
bool maybe_load_tensor(ggml_tensor * cur,
@ -700,12 +411,7 @@ bool maybe_load_tensor(ggml_tensor * cur,
(void) file_offset; // registered ops capture their own offsets
LoadOp op;
{
std::lock_guard<std::mutex> lk(g_loadop_mutex);
auto it = g_loadops.find(ggml_get_name(cur));
if (it == g_loadops.end()) return false;
op = it->second;
}
if (!take_load_op(ggml_get_name(cur), op)) return false;
const size_t dst_size = ggml_nbytes(cur);
std::vector<uint8_t> dst(dst_size);
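
For reference, wiring up another architecture follows the same shape as
the two existing handlers: a detect_* probe over the gguf/ggml contexts,
a handle_* that mutates the metadata through the util helpers, and one
dispatch line in translate_metadata. The sketch below is hypothetical
("myarch" and its quirk key are made up), not code from this commit.

    // Hypothetical sketch only; mirrors the gemma3/qwen35moe structure.
    // (Assumes the same includes as llama-ollama-compat.cpp.)
    static bool detect_ollama_myarch(const gguf_context * meta, const ggml_context * ctx) {
        const int64_t arch_kid = gguf_find_key(meta, "general.architecture");
        if (arch_kid < 0) return false;
        if (std::strcmp(gguf_get_val_str(meta, arch_kid), "myarch") != 0) return false;
        // Require at least one Ollama-ism, like the existing detectors do.
        return has_key(meta, "myarch.some_ollama_only_key")
            || any_tensor_with_prefix(ctx, "mtp.");
    }

    static void handle_myarch(const llama_model_loader * ml, gguf_context * meta, ggml_context * ctx) {
        if (!detect_ollama_myarch(meta, ctx)) return;
        inject_f32_if_missing(meta, "myarch.attention.layer_norm_rms_epsilon", 1e-6f);
        rename_tensors_containing(meta, ctx, ".old_suffix", ".new_suffix");
        add_skip_prefix(ml, "mtp.");
    }

    // plus, in translate_metadata():
    //     if (arch_name == "myarch") handle_myarch(ml, meta, ctx);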


@ -75,8 +75,9 @@ FetchContent_MakeAvailable(llama_cpp)
# Kept separate from the upstream-edits patch so our .cpp/.h stay
# on-disk in llama/compat/ rather than being copied into _deps/.
if(DEFINED OLLAMA_LLAMA_CPP_COMPAT_DIR)
target_sources(llama PRIVATE
${OLLAMA_LLAMA_CPP_COMPAT_DIR}/llama-ollama-compat.cpp)
file(GLOB _compat_sources CONFIGURE_DEPENDS
${OLLAMA_LLAMA_CPP_COMPAT_DIR}/*.cpp)
target_sources(llama PRIVATE ${_compat_sources})
target_include_directories(llama PRIVATE
${OLLAMA_LLAMA_CPP_COMPAT_DIR})
# mtmd's clip.cpp #include's the compat header too — add the same dir