llama/compat: split shared infra into a util TU

The main translation unit (llama-ollama-compat.cpp) is now purely per-arch
dispatch: a detect_* + handle_* pair for each arch, plus the four public
entry points. It drops from 724 lines to 430.

Everything that doesn't depend on a specific arch moves to
llama-ollama-compat-util.{h,cpp}:
  - gguf KV helpers (has_key, copy_{u32,f32}_kv, inject_{u32,f32,str,bool,
    f32_arr}_if_missing, truncate_{str,data}_arr)
  - ggml tensor helpers (any_tensor_with_prefix, rename_tensor,
    rename_tensors_containing, set_tensor_{type,shape}, reclaim_slot_as,
    tensor_file_offset)
  - per-loader skip-prefix registry (add_skip_prefix, should_skip_tensor_prefix)
  - LoadOp registry (register_load_op, take_load_op, read_at)
  - common high-level transforms (promote_tensor_to_f32, register_concat_load)

New helpers introduced while splitting:
  - inject_{u32,f32,str,bool,f32_arr}_if_missing — replaces the
    has_key + gguf_set_val_* idiom we were using 20+ times (see the
    short before/after example below this list).
  - reclaim_slot_as — extracts the "rename an orphan tensor slot as a
    synthesized one" pattern used by qwen35moe's patch_embed split. The
    name plus a header comment explain the workaround.
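
For example, the gemma3 handler's default injection (further down in
this diff) shrinks from

    if (!has_key(meta, "gemma3.rope.freq_base"))
        gguf_set_val_f32(meta, "gemma3.rope.freq_base", 1000000.0f);

to

    inject_f32_if_missing(meta, "gemma3.rope.freq_base", 1000000.0f);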

CMake: target_sources now globs llama/compat/*.cpp (CONFIGURE_DEPENDS),
so new .cpp files are picked up without CMake edits.

No behavioral change. Verified that gemma3 and qwen3.5 text + vision
still work end-to-end after a clean rebuild.
jmorganca 2026-04-19 12:59:21 -07:00
parent db0c745308
commit 2a388da77b
4 changed files with 449 additions and 389 deletions

llama/compat/llama-ollama-compat-util.cpp vendored Normal file

@ -0,0 +1,268 @@
#include "llama-ollama-compat-util.h"
#include "llama-impl.h"
#include "llama-model-loader.h"
#include <cstdio>
#include <cstring>
#include <mutex>
#include <unordered_map>
namespace llama_ollama_compat::detail {
// -------------------------------------------------------------------------
// gguf_context KV helpers
// -------------------------------------------------------------------------
bool has_key(const gguf_context * meta, const char * key) {
return gguf_find_key(meta, key) >= 0;
}
void copy_u32_kv(gguf_context * meta, const char * src, const char * dst) {
if (has_key(meta, dst)) return;
const int64_t k = gguf_find_key(meta, src);
if (k < 0) return;
gguf_set_val_u32(meta, dst, gguf_get_val_u32(meta, k));
}
void copy_f32_kv(gguf_context * meta, const char * src, const char * dst) {
if (has_key(meta, dst)) return;
const int64_t k = gguf_find_key(meta, src);
if (k < 0) return;
gguf_set_val_f32(meta, dst, gguf_get_val_f32(meta, k));
}
void inject_u32_if_missing (gguf_context * meta, const char * key, uint32_t v) {
if (!has_key(meta, key)) gguf_set_val_u32(meta, key, v);
}
void inject_f32_if_missing (gguf_context * meta, const char * key, float v) {
if (!has_key(meta, key)) gguf_set_val_f32(meta, key, v);
}
void inject_str_if_missing (gguf_context * meta, const char * key, const char * v) {
if (!has_key(meta, key)) gguf_set_val_str(meta, key, v);
}
void inject_bool_if_missing(gguf_context * meta, const char * key, bool v) {
if (!has_key(meta, key)) gguf_set_val_bool(meta, key, v);
}
void inject_f32_arr_if_missing(gguf_context * meta, const char * key,
const float * data, size_t n) {
if (!has_key(meta, key)) gguf_set_arr_data(meta, key, GGUF_TYPE_FLOAT32, data, n);
}
void truncate_str_arr(gguf_context * meta, const char * key, size_t new_n) {
const int64_t kid = gguf_find_key(meta, key);
if (kid < 0 || new_n >= gguf_get_arr_n(meta, kid)) return;
std::vector<std::string> owned;
owned.reserve(new_n);
std::vector<const char *> ptrs;
ptrs.reserve(new_n);
for (size_t i = 0; i < new_n; ++i) owned.emplace_back(gguf_get_arr_str(meta, kid, i));
for (const auto & s : owned) ptrs.push_back(s.c_str());
gguf_set_arr_str(meta, key, ptrs.data(), new_n);
}
void truncate_data_arr(gguf_context * meta, const char * key,
gguf_type elem_type, size_t elem_size, size_t new_n) {
const int64_t kid = gguf_find_key(meta, key);
if (kid < 0 || new_n >= gguf_get_arr_n(meta, kid)) return;
std::vector<uint8_t> copy(elem_size * new_n);
std::memcpy(copy.data(), gguf_get_arr_data(meta, kid), elem_size * new_n);
gguf_set_arr_data(meta, key, elem_type, copy.data(), new_n);
}
// -------------------------------------------------------------------------
// ggml_context tensor scans
// -------------------------------------------------------------------------
bool any_tensor_with_prefix(const ggml_context * ctx, const char * prefix) {
const size_t plen = std::strlen(prefix);
for (ggml_tensor * t = ggml_get_first_tensor(ctx); t; t = ggml_get_next_tensor(ctx, t)) {
if (std::strncmp(ggml_get_name(t), prefix, plen) == 0) return true;
}
return false;
}
// -------------------------------------------------------------------------
// Tensor renaming / reshaping (mutates both contexts)
// -------------------------------------------------------------------------
// gguf_get_tensor_name returns a pointer into a mutable `char[GGML_MAX_NAME]`
// inside a std::vector element; the const on the return type is API
// courtesy, so writing through const_cast is defined.
void rename_tensor(gguf_context * meta, ggml_context * ctx,
const char * old_name, const char * new_name) {
const int64_t id = gguf_find_tensor(meta, old_name);
if (id < 0) return;
if (char * p = const_cast<char *>(gguf_get_tensor_name(meta, id))) {
std::strncpy(p, new_name, GGML_MAX_NAME - 1);
p[GGML_MAX_NAME - 1] = '\0';
}
if (ggml_tensor * t = ggml_get_tensor(ctx, old_name)) ggml_set_name(t, new_name);
}
void rename_tensors_containing(gguf_context * meta, ggml_context * ctx,
const char * needle, const char * replacement) {
std::vector<std::pair<std::string, std::string>> renames;
const int64_t n = gguf_get_n_tensors(meta);
const size_t needle_len = std::strlen(needle);
for (int64_t i = 0; i < n; ++i) {
std::string s(gguf_get_tensor_name(meta, i));
const size_t pos = s.find(needle);
if (pos == std::string::npos) continue;
std::string ns = s;
ns.replace(pos, needle_len, replacement);
renames.emplace_back(std::move(s), std::move(ns));
}
for (const auto & [from, to] : renames) rename_tensor(meta, ctx, from.c_str(), to.c_str());
}
void set_tensor_type(ggml_tensor * t, ggml_type type) {
t->type = type;
t->nb[0] = ggml_type_size(type);
t->nb[1] = t->nb[0] * (t->ne[0] / ggml_blck_size(type));
for (int i = 2; i < GGML_MAX_DIMS; ++i) t->nb[i] = t->nb[i - 1] * t->ne[i - 1];
}
void set_tensor_shape(ggml_tensor * t, std::initializer_list<int64_t> shape) {
int i = 0;
for (auto v : shape) t->ne[i++] = v;
for (; i < GGML_MAX_DIMS; ++i) t->ne[i] = 1;
set_tensor_type(t, t->type);
}
// Rename an orphan tensor slot as a new synthesized tensor. See header for
// why this is the workaround of choice (clip's ctx_meta has no spare capacity).
bool reclaim_slot_as(gguf_context * meta, ggml_context * ctx,
const char * orphan_name, const char * new_name,
std::initializer_list<int64_t> shape, ggml_type type) {
if (gguf_find_tensor(meta, orphan_name) < 0) return false;
rename_tensor(meta, ctx, orphan_name, new_name);
ggml_tensor * t = ggml_get_tensor(ctx, new_name);
if (!t) return false;
set_tensor_shape(t, shape);
set_tensor_type (t, type);
return true;
}
size_t tensor_file_offset(const gguf_context * meta, const char * name) {
const int64_t id = gguf_find_tensor(meta, name);
if (id < 0) return 0;
return gguf_get_data_offset(meta) + gguf_get_tensor_offset(meta, id);
}
// -------------------------------------------------------------------------
// Per-loader skip-prefix registry
// -------------------------------------------------------------------------
namespace {
std::mutex g_skip_mutex;
std::unordered_map<const llama_model_loader *, std::vector<std::string>> g_skip_prefixes;
} // anon
void add_skip_prefix(const llama_model_loader * ml, std::string prefix) {
std::lock_guard<std::mutex> lk(g_skip_mutex);
g_skip_prefixes[ml].push_back(std::move(prefix));
}
bool should_skip_tensor_prefix(const llama_model_loader * ml, const char * name) {
std::lock_guard<std::mutex> lk(g_skip_mutex);
auto it = g_skip_prefixes.find(ml);
if (it == g_skip_prefixes.end()) return false;
for (const auto & prefix : it->second) {
if (std::strncmp(name, prefix.c_str(), prefix.size()) == 0) return true;
}
return false;
}
// -------------------------------------------------------------------------
// Load-time transform registry
// -------------------------------------------------------------------------
namespace {
std::mutex g_loadop_mutex;
std::unordered_map<std::string, LoadOp> g_loadops;
} // anon
void register_load_op(std::string dest_name, LoadOp op) {
std::lock_guard<std::mutex> lk(g_loadop_mutex);
g_loadops[std::move(dest_name)] = std::move(op);
}
bool take_load_op(const char * dest_name, LoadOp & out) {
std::lock_guard<std::mutex> lk(g_loadop_mutex);
auto it = g_loadops.find(dest_name);
if (it == g_loadops.end()) return false;
out = std::move(it->second);
g_loadops.erase(it);
return true;
}
bool read_at(const char * path, size_t offset, void * dst, size_t size) {
FILE * f = std::fopen(path, "rb");
if (!f) return false;
bool ok = (std::fseek(f, (long) offset, SEEK_SET) == 0
&& std::fread(dst, 1, size, f) == size);
std::fclose(f);
return ok;
}
// -------------------------------------------------------------------------
// Common high-level transforms
// -------------------------------------------------------------------------
void promote_tensor_to_f32(gguf_context * meta, ggml_context * ctx, const char * name) {
const int64_t tid = gguf_find_tensor(meta, name);
if (tid < 0) return;
ggml_tensor * t = ggml_get_tensor(ctx, name);
if (!t || t->type != GGML_TYPE_F16) return;
const size_t src_offset = tensor_file_offset(meta, name);
const size_t n_elem = ggml_nelements(t);
const size_t src_size = n_elem * sizeof(uint16_t);
set_tensor_type(t, GGML_TYPE_F32);
register_load_op(name, LoadOp{
[src_offset, src_size, n_elem](const char * path, void * dst, size_t dst_size) {
(void) dst_size;
std::vector<uint8_t> src(src_size);
if (!read_at(path, src_offset, src.data(), src_size)) return false;
const uint16_t * sp = reinterpret_cast<const uint16_t *>(src.data());
float * dp = reinterpret_cast<float *>(dst);
for (size_t i = 0; i < n_elem; ++i) dp[i] = ggml_fp16_to_fp32(sp[i]);
return true;
},
"F16->F32 promote",
});
}
void register_concat_load(const gguf_context * meta, std::string dest_name,
const std::vector<std::string> & src_names) {
std::vector<std::pair<size_t, size_t>> regions;
regions.reserve(src_names.size());
for (const auto & n : src_names) {
const int64_t id = gguf_find_tensor(meta, n.c_str());
if (id < 0) return;
regions.emplace_back(
gguf_get_data_offset(meta) + gguf_get_tensor_offset(meta, id),
gguf_get_tensor_size(meta, id));
}
register_load_op(std::move(dest_name), LoadOp{
[regions](const char * path, void * dst, size_t dst_size) {
size_t total = 0;
for (auto & [_, sz] : regions) total += sz;
if (total != dst_size) return false;
uint8_t * p = static_cast<uint8_t *>(dst);
for (auto & [off, sz] : regions) {
if (!read_at(path, off, p, sz)) return false;
p += sz;
}
return true;
},
"concat sources",
});
}
} // namespace llama_ollama_compat::detail

llama/compat/llama-ollama-compat-util.h vendored Normal file

@ -0,0 +1,85 @@
#pragma once
// Internal helpers shared by the per-architecture handlers in
// llama-ollama-compat.cpp. Not part of the public API.
//
// Everything lives under namespace llama_ollama_compat::detail. The
// definitions live in llama-ollama-compat-util.cpp, which also owns the
// registry globals (tensor skip list, load-op table) that need a single
// translation unit.
#include <cstddef>
#include <cstdint>
#include <functional>
#include <initializer_list>
#include <string>
#include <vector>
#include "ggml.h"
#include "ggml-backend.h"
#include "gguf.h"
struct llama_model_loader;
namespace llama_ollama_compat::detail {
// -- gguf_context KV helpers --
bool has_key(const gguf_context * meta, const char * key);
void copy_u32_kv(gguf_context * meta, const char * src, const char * dst);
void copy_f32_kv(gguf_context * meta, const char * src, const char * dst);
void inject_u32_if_missing (gguf_context * meta, const char * key, uint32_t v);
void inject_f32_if_missing (gguf_context * meta, const char * key, float v);
void inject_str_if_missing (gguf_context * meta, const char * key, const char * v);
void inject_bool_if_missing(gguf_context * meta, const char * key, bool v);
void inject_f32_arr_if_missing(gguf_context * meta, const char * key,
const float * data, size_t n);
void truncate_str_arr (gguf_context * meta, const char * key, size_t new_n);
void truncate_data_arr(gguf_context * meta, const char * key,
gguf_type elem_type, size_t elem_size, size_t new_n);
// -- ggml_context tensor scans --
bool any_tensor_with_prefix(const ggml_context * ctx, const char * prefix);
// -- Tensor renaming / reshaping (mutates both gguf_context and ggml_context) --
void rename_tensor(gguf_context * meta, ggml_context * ctx,
const char * old_name, const char * new_name);
void rename_tensors_containing(gguf_context * meta, ggml_context * ctx,
const char * needle, const char * replacement);
void set_tensor_type (ggml_tensor * t, ggml_type type);
void set_tensor_shape(ggml_tensor * t, std::initializer_list<int64_t> shape);
bool reclaim_slot_as (gguf_context * meta, ggml_context * ctx,
const char * orphan_name, const char * new_name,
std::initializer_list<int64_t> shape, ggml_type type);
// -- File-offset capture (before rename) --
size_t tensor_file_offset(const gguf_context * meta, const char * name);
// -- Per-loader skip-prefix registry --
void add_skip_prefix(const llama_model_loader * ml, std::string prefix);
bool should_skip_tensor_prefix(const llama_model_loader * ml, const char * name);
// -- Load-time transform registry --
struct LoadOp {
std::function<bool(const char * src_file, void * dst, size_t dst_size)> apply;
const char * description;
};
void register_load_op(std::string dest_name, LoadOp op);
bool take_load_op (const char * dest_name, LoadOp & out); // removes + returns
// Read `size` bytes at `offset` from `path` into `dst`. Used by LoadOps.
bool read_at(const char * path, size_t offset, void * dst, size_t size);
// -- Common high-level transforms --
// F16 -> F32 promotion. Captures the source file offset at registration
// time so later renames/reshapes of this tensor don't invalidate the read.
void promote_tensor_to_f32(gguf_context * meta, ggml_context * ctx, const char * name);
// Concatenate N source tensors into one destination. Captures each source's
// file offset + byte size at registration time. Layout assumption: sources
// concatenate cleanly along the destination's slow ggml axis, which in
// C order means the destination bytes are src[0] || src[1] || ... .
void register_concat_load(const gguf_context * meta, std::string dest_name,
const std::vector<std::string> & src_names);
} // namespace llama_ollama_compat::detail
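
To make the LoadOp contract above concrete, here is a minimal sketch of
both sides of the registry. It is not part of this commit: the handler
function, the tensor name, and the final ggml_backend_tensor_set upload
are illustrative assumptions; only the registry calls themselves come
from the header above.

    #include "llama-ollama-compat-util.h"

    using namespace llama_ollama_compat::detail;

    // Handler side (hypothetical arch fixup): register a pass-through
    // LoadOp for one tensor, capturing its file offset before any
    // rename/reshape could invalidate it.
    static void fixup_example(gguf_context * meta, ggml_context * ctx) {
        const char * name = "blk.0.example.weight"; // illustrative name
        const ggml_tensor * t = ggml_get_tensor(ctx, name);
        if (!t) return;
        const size_t off = tensor_file_offset(meta, name);
        const size_t sz  = ggml_nbytes(t);
        register_load_op(name, LoadOp{
            [off, sz](const char * path, void * dst, size_t dst_size) {
                return dst_size == sz && read_at(path, off, dst, dst_size);
            },
            "example pass-through",
        });
    }

    // Loader side, roughly mirroring maybe_load_tensor below: take the
    // op (take_load_op removes it, so each op fires once), let it fill
    // a staging buffer, then upload. The backend upload call is an
    // assumption, not shown in this diff.
    static bool load_with_op(ggml_tensor * cur, const char * src_path) {
        LoadOp op;
        if (!take_load_op(ggml_get_name(cur), op)) return false;
        std::vector<uint8_t> buf(ggml_nbytes(cur));
        if (!op.apply(src_path, buf.data(), buf.size())) return false;
        ggml_backend_tensor_set(cur, buf.data(), 0, buf.size());
        return true;
    }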

llama/compat/llama-ollama-compat.cpp vendored

@ -1,259 +1,33 @@
#include "llama-ollama-compat.h"
#include "llama-ollama-compat-util.h"
#include "ggml.h"
#include "ggml-backend.h"
#include "gguf.h"
#include "llama-impl.h"
#include "llama-model-loader.h"
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <mutex>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>
namespace llama_ollama_compat {
using namespace llama_ollama_compat::detail; // pull detail:: helpers into scope
namespace {
// -------------------------------------------------------------------------
// tiny gguf_context helpers
// -------------------------------------------------------------------------
bool has_key(const gguf_context * meta, const char * key) {
return gguf_find_key(meta, key) >= 0;
}
bool any_tensor_with_prefix(const ggml_context * ctx, const char * prefix) {
const size_t plen = std::strlen(prefix);
for (ggml_tensor * t = ggml_get_first_tensor(ctx); t; t = ggml_get_next_tensor(ctx, t)) {
if (std::strncmp(ggml_get_name(t), prefix, plen) == 0) return true;
}
return false;
}
// Copy a uint32 KV from src to dst if src exists and dst doesn't.
void copy_u32_kv(gguf_context * meta, const char * src, const char * dst) {
if (has_key(meta, dst)) return;
const int64_t k = gguf_find_key(meta, src);
if (k < 0) return;
gguf_set_val_u32(meta, dst, gguf_get_val_u32(meta, k));
}
// Copy a float32 KV from src to dst if src exists and dst doesn't.
void copy_f32_kv(gguf_context * meta, const char * src, const char * dst) {
if (has_key(meta, dst)) return;
const int64_t k = gguf_find_key(meta, src);
if (k < 0) return;
gguf_set_val_f32(meta, dst, gguf_get_val_f32(meta, k));
}
// Truncate a string-typed KV array to `new_n` entries.
void truncate_str_arr(gguf_context * meta, const char * key, size_t new_n) {
const int64_t kid = gguf_find_key(meta, key);
if (kid < 0 || new_n >= gguf_get_arr_n(meta, kid)) return;
std::vector<std::string> owned;
owned.reserve(new_n);
std::vector<const char *> ptrs;
ptrs.reserve(new_n);
for (size_t i = 0; i < new_n; ++i) owned.emplace_back(gguf_get_arr_str(meta, kid, i));
for (const auto & s : owned) ptrs.push_back(s.c_str());
gguf_set_arr_str(meta, key, ptrs.data(), new_n);
}
// Truncate a primitive-typed KV array to `new_n` entries.
void truncate_data_arr(gguf_context * meta, const char * key,
gguf_type elem_type, size_t elem_size, size_t new_n) {
const int64_t kid = gguf_find_key(meta, key);
if (kid < 0 || new_n >= gguf_get_arr_n(meta, kid)) return;
std::vector<uint8_t> copy(elem_size * new_n);
std::memcpy(copy.data(), gguf_get_arr_data(meta, kid), elem_size * new_n);
gguf_set_arr_data(meta, key, elem_type, copy.data(), new_n);
}
// Rename a tensor in BOTH the gguf_context and the ggml_context so that all
// name-based lookups agree. gguf_get_tensor_name returns a pointer into a
// mutable `char[GGML_MAX_NAME]` inside a std::vector element; the const on
// the return type is API courtesy, so writing through const_cast is defined.
void rename_tensor(gguf_context * meta, ggml_context * ctx,
const char * old_name, const char * new_name) {
const int64_t id = gguf_find_tensor(meta, old_name);
if (id < 0) return;
if (char * p = const_cast<char *>(gguf_get_tensor_name(meta, id))) {
std::strncpy(p, new_name, GGML_MAX_NAME - 1);
p[GGML_MAX_NAME - 1] = '\0';
}
if (ggml_tensor * t = ggml_get_tensor(ctx, old_name)) ggml_set_name(t, new_name);
}
// Rename every tensor whose name contains `needle` (covers `.weight` + `.bias`).
void rename_tensors_containing(gguf_context * meta, ggml_context * ctx,
const char * needle, const char * replacement) {
std::vector<std::pair<std::string, std::string>> renames;
const int64_t n = gguf_get_n_tensors(meta);
const size_t needle_len = std::strlen(needle);
for (int64_t i = 0; i < n; ++i) {
std::string s(gguf_get_tensor_name(meta, i));
const size_t pos = s.find(needle);
if (pos == std::string::npos) continue;
std::string ns = s;
ns.replace(pos, needle_len, replacement);
renames.emplace_back(std::move(s), std::move(ns));
}
for (const auto & [from, to] : renames) rename_tensor(meta, ctx, from.c_str(), to.c_str());
}
// -------------------------------------------------------------------------
// per-loader state (currently just the "drop these tensor prefixes" list)
// -------------------------------------------------------------------------
std::mutex g_registry_mutex;
std::unordered_map<const llama_model_loader *, std::vector<std::string>> g_skip_prefixes;
void add_skip_prefix(const llama_model_loader * ml, std::string prefix) {
std::lock_guard<std::mutex> lk(g_registry_mutex);
g_skip_prefixes[ml].push_back(std::move(prefix));
}
// -------------------------------------------------------------------------
// Load-time tensor transforms (registry consumed by maybe_load_tensor)
//
// Each registered op produces the final bytes for a single destination
// tensor by reading + transforming bytes from the source GGUF file.
// Used for F16->F32 promotion, QKV merging, and patch-embed splitting.
// -------------------------------------------------------------------------
struct LoadOp {
// apply() reads what it needs from `src_file` and fills `dst` (dst_size
// bytes). Returns false on failure.
std::function<bool(const char * src_file, void * dst, size_t dst_size)> apply;
const char * description;
};
std::mutex g_loadop_mutex;
std::unordered_map<std::string, LoadOp> g_loadops;
void register_load_op(std::string dest_name, LoadOp op) {
std::lock_guard<std::mutex> lk(g_loadop_mutex);
g_loadops[std::move(dest_name)] = std::move(op);
}
// Helper: read `size` bytes at `offset` from `path` into `dst`.
bool read_at(const char * path, size_t offset, void * dst, size_t size) {
FILE * f = std::fopen(path, "rb");
if (!f) return false;
bool ok = (std::fseek(f, (long) offset, SEEK_SET) == 0
&& std::fread(dst, 1, size, f) == size);
std::fclose(f);
return ok;
}
// Capture a tensor's absolute file offset BEFORE any rename or reshape.
size_t tensor_file_offset(const gguf_context * meta, const char * name) {
const int64_t id = gguf_find_tensor(meta, name);
if (id < 0) return 0;
return gguf_get_data_offset(meta) + gguf_get_tensor_offset(meta, id);
}
// Set a tensor's type and recompute strides in a ggml_context.
void set_tensor_type(ggml_tensor * t, ggml_type type) {
t->type = type;
t->nb[0] = ggml_type_size(type);
t->nb[1] = t->nb[0] * (t->ne[0] / ggml_blck_size(type));
for (int i = 2; i < GGML_MAX_DIMS; ++i) t->nb[i] = t->nb[i - 1] * t->ne[i - 1];
}
// Set a tensor's shape and recompute strides in a ggml_context.
void set_tensor_shape(ggml_tensor * t, std::initializer_list<int64_t> shape) {
int i = 0;
for (auto v : shape) t->ne[i++] = v;
for (; i < GGML_MAX_DIMS; ++i) t->ne[i] = 1;
set_tensor_type(t, t->type);
}
// Promote a tensor F16 -> F32. The disk bytes stay F16; we register a
// load op that converts on read.
void promote_tensor_to_f32(gguf_context * meta, ggml_context * ctx, const char * name) {
const int64_t tid = gguf_find_tensor(meta, name);
if (tid < 0) return;
ggml_tensor * t = ggml_get_tensor(ctx, name);
if (!t || t->type != GGML_TYPE_F16) return;
const size_t src_offset = tensor_file_offset(meta, name);
const size_t n_elem = ggml_nelements(t);
const size_t src_size = n_elem * sizeof(uint16_t);
set_tensor_type(t, GGML_TYPE_F32);
register_load_op(name, LoadOp{
[src_offset, src_size, n_elem](const char * path, void * dst, size_t dst_size) {
(void) dst_size;
std::vector<uint8_t> src(src_size);
if (!read_at(path, src_offset, src.data(), src_size)) return false;
const uint16_t * sp = reinterpret_cast<const uint16_t *>(src.data());
float * dp = reinterpret_cast<float *>(dst);
for (size_t i = 0; i < n_elem; ++i) dp[i] = ggml_fp16_to_fp32(sp[i]);
return true;
},
"F16->F32 promote",
});
}
// Concatenate N source tensors into one destination tensor. Captures
// source file offsets and sizes at registration time so later renames or
// reshapes don't affect the read. Layout assumption: the source tensors
// concatenate cleanly along their slowest dim, which in C/ggml order
// means the destination's bytes are just src[0] || src[1] || ... .
void register_concat_load(const gguf_context * meta, std::string dest_name,
const std::vector<std::string> & src_names) {
std::vector<std::pair<size_t, size_t>> regions; // (offset, size)
regions.reserve(src_names.size());
for (const auto & n : src_names) {
const int64_t id = gguf_find_tensor(meta, n.c_str());
if (id < 0) return; // bail; downstream will fail loudly
regions.emplace_back(
gguf_get_data_offset(meta) + gguf_get_tensor_offset(meta, id),
gguf_get_tensor_size(meta, id));
}
register_load_op(std::move(dest_name), LoadOp{
[regions](const char * path, void * dst, size_t dst_size) {
size_t total = 0;
for (auto & [_, sz] : regions) total += sz;
if (total != dst_size) return false;
uint8_t * p = static_cast<uint8_t *>(dst);
for (auto & [off, sz] : regions) {
if (!read_at(path, off, p, sz)) return false;
p += sz;
}
return true;
},
"concat sources",
});
}
// -------------------------------------------------------------------------
// =========================================================================
// gemma3 (text side)
// -------------------------------------------------------------------------
// =========================================================================
// Returns true if this looks like an Ollama-format gemma3 blob. Requires
// the file to declare itself gemma3 (either via general.architecture or
// by having at least one gemma3.* KV), AND to exhibit at least one Ollama
// quirk. Different Ollama converter versions produced different quirks
// (4B/12B/27B have embedded vision + mm KVs; 1B uses non-standard rope
// key names; all of them omit layer_norm_rms_epsilon).
// An Ollama-format gemma3 file declares arch="gemma3" AND exhibits at
// least one converter quirk. Different converter versions produced
// different quirks (4B/12B/27B have embedded vision + mm KVs; 1B uses
// non-standard rope key names; all of them omit layer_norm_rms_epsilon).
bool detect_ollama_gemma3(const gguf_context * meta, const ggml_context * ctx) {
// Claim #1: the file is gemma3.
const int64_t arch_kid = gguf_find_key(meta, "general.architecture");
if (arch_kid < 0) return false;
if (std::strcmp(gguf_get_val_str(meta, arch_kid), "gemma3") != 0) return false;
// Claim #2: at least one Ollama-ism. An upstream-converted gemma3 would
// have none of these (except possibly the v./mm. prefixes, which upstream
// never ships in the text file — they live in a separate mmproj).
return has_key(meta, "gemma3.mm.tokens_per_image")
|| any_tensor_with_prefix(ctx, "v.")
|| any_tensor_with_prefix(ctx, "mm.")
@ -270,29 +44,23 @@ void handle_gemma3(const llama_model_loader * ml, gguf_context * meta, ggml_cont
LLAMA_LOG_INFO("%s: detected Ollama-format gemma3 GGUF; applying compatibility fixes\n", __func__);
// Old Ollama converters sometimes used nested rope key names. Copy
// them to the flat names upstream expects. Copy-if-missing order
// matters: we want real values to take priority over injected defaults.
// them to the flat names upstream expects BEFORE injecting defaults.
copy_f32_kv(meta, "gemma3.rope.global.freq_base", "gemma3.rope.freq_base");
copy_f32_kv(meta, "gemma3.rope.local.freq_base", "gemma3.rope.freq_base_swa");
// Inject required KVs with their standard gemma3 defaults (no-op if
// already present).
if (!has_key(meta, "gemma3.attention.layer_norm_rms_epsilon"))
gguf_set_val_f32(meta, "gemma3.attention.layer_norm_rms_epsilon", 1e-6f);
if (!has_key(meta, "gemma3.rope.freq_base"))
gguf_set_val_f32(meta, "gemma3.rope.freq_base", 1000000.0f);
if (!has_key(meta, "gemma3.rope.freq_base_swa"))
gguf_set_val_f32(meta, "gemma3.rope.freq_base_swa", 10000.0f);
// Inject required KVs with their standard gemma3 defaults.
inject_f32_if_missing(meta, "gemma3.attention.layer_norm_rms_epsilon", 1e-6f);
inject_f32_if_missing(meta, "gemma3.rope.freq_base", 1000000.0f);
inject_f32_if_missing(meta, "gemma3.rope.freq_base_swa", 10000.0f);
// Gemma3 4B/12B/27B ship with {type: "linear", factor: 8.0} rope scaling
// in their HF config to extend the 16k trained context to 131072. Ollama's
// old converter didn't write these. The 1B has no scaling — detect by
// context length.
int64_t ctx_key = gguf_find_key(meta, "gemma3.context_length");
if (ctx_key >= 0 && gguf_get_val_u32(meta, ctx_key) >= 131072
&& !has_key(meta, "gemma3.rope.scaling.factor")) {
gguf_set_val_str(meta, "gemma3.rope.scaling.type", "linear");
gguf_set_val_f32(meta, "gemma3.rope.scaling.factor", 8.0f);
const int64_t ctx_key = gguf_find_key(meta, "gemma3.context_length");
if (ctx_key >= 0 && gguf_get_val_u32(meta, ctx_key) >= 131072) {
inject_str_if_missing(meta, "gemma3.rope.scaling.type", "linear");
inject_f32_if_missing(meta, "gemma3.rope.scaling.factor", 8.0f);
}
// Tokenizer vocab size vs embedding rows mismatch: Ollama leaves extra
@ -317,20 +85,19 @@ void handle_gemma3(const llama_model_loader * ml, gguf_context * meta, ggml_cont
// already have the +1 shift baked in, same as upstream's convert_hf.
}
// -------------------------------------------------------------------------
// =========================================================================
// qwen35moe (text side)
// -------------------------------------------------------------------------
// =========================================================================
bool detect_ollama_qwen35moe(const gguf_context * meta, const ggml_context * ctx) {
// Require the file to declare itself qwen35moe first.
const int64_t arch_kid = gguf_find_key(meta, "general.architecture");
if (arch_kid < 0) return false;
if (std::strcmp(gguf_get_val_str(meta, arch_kid), "qwen35moe") != 0) return false;
// Then: at least one Ollama-ism. Upstream qwen35moe text files have none
// of these — the vision KVs move to mmproj, MTP tensors are dropped,
// head_count_kv is a scalar not an array, and the various extra rope /
// ssm KVs below are either absent or stored differently.
// Any Ollama-ism. Upstream qwen35moe files have none of these — the
// vision KVs live in a separate mmproj, MTP tensors are dropped,
// head_count_kv is a scalar, and the extra rope / ssm / feed_forward
// KVs are either absent or stored differently.
return has_key(meta, "qwen35moe.vision.block_count")
|| has_key(meta, "qwen35moe.image_token_id")
|| has_key(meta, "qwen35moe.ssm.v_head_reordered")
@ -346,8 +113,8 @@ void handle_qwen35moe(const llama_model_loader * ml, gguf_context * meta, ggml_c
LLAMA_LOG_INFO("%s: detected Ollama-format qwen35moe GGUF; applying compatibility fixes\n", __func__);
// 1. attention.head_count_kv — upstream expects UINT32; Ollama wrote
// an array (one entry per layer, 0 for SSM layers, 2 for attention
// layers). Collapse to the max non-zero value.
// an array (one entry per layer, 0 for SSM layers, 2 for attention).
// Collapse to the max non-zero value.
{
const int64_t kid = gguf_find_key(meta, "qwen35moe.attention.head_count_kv");
if (kid >= 0 && gguf_get_kv_type(meta, kid) == GGUF_TYPE_ARRAY) {
@ -356,8 +123,8 @@ void handle_qwen35moe(const llama_model_loader * ml, gguf_context * meta, ggml_c
uint32_t max_kv = 0;
for (size_t i = 0; i < n; ++i) if (arr[i] > max_kv) max_kv = arr[i];
if (max_kv == 0) max_kv = 2; // safety fallback
gguf_remove_key(meta, "qwen35moe.attention.head_count_kv");
gguf_set_val_u32(meta, "qwen35moe.attention.head_count_kv", max_kv);
gguf_remove_key (meta, "qwen35moe.attention.head_count_kv");
gguf_set_val_u32 (meta, "qwen35moe.attention.head_count_kv", max_kv);
}
}
@ -373,8 +140,8 @@ void handle_qwen35moe(const llama_model_loader * ml, gguf_context * meta, ggml_c
}
}
// 3. Tensor rename: Ollama's `blk.N.ssm_dt` corresponds to upstream's
// `blk.N.ssm_dt.bias` (same shape, F32 [32]). 40 layers.
// 3. Tensor rename: Ollama's `blk.N.ssm_dt` is upstream's
// `blk.N.ssm_dt.bias` (same shape). 40 layers.
{
std::vector<std::string> targets;
const int64_t n = gguf_get_n_tensors(meta);
@ -393,18 +160,15 @@ void handle_qwen35moe(const llama_model_loader * ml, gguf_context * meta, ggml_c
}
// 4. Drop embedded vision + MTP + projector tensors from the text loader.
// (vision goes to clip via --mmproj; MTP isn't used by upstream.)
add_skip_prefix(ml, "v.");
add_skip_prefix(ml, "mm.");
add_skip_prefix(ml, "mtp.");
}
// -------------------------------------------------------------------------
// =========================================================================
// gemma3 (clip side)
// -------------------------------------------------------------------------
// =========================================================================
// Ollama -> upstream tensor-name renames. Applied via substring match, so
// both `.weight` and `.bias` variants are covered with one entry each.
constexpr std::pair<const char *, const char *> kGemma3ClipRenames[] = {
{"v.patch_embedding", "v.patch_embd"},
{"v.position_embedding", "v.position_embd"},
@ -419,7 +183,6 @@ constexpr std::pair<const char *, const char *> kGemma3ClipRenames[] = {
};
void handle_gemma3_clip(gguf_context * meta, ggml_context * ctx) {
// Synthesize clip.vision.* from gemma3.vision.* (same values, different key).
copy_u32_kv(meta, "gemma3.vision.block_count", "clip.vision.block_count");
copy_u32_kv(meta, "gemma3.vision.embedding_length", "clip.vision.embedding_length");
copy_u32_kv(meta, "gemma3.vision.feed_forward_length", "clip.vision.feed_forward_length");
@ -430,18 +193,12 @@ void handle_gemma3_clip(gguf_context * meta, ggml_context * ctx) {
// projection_dim = text model's embedding_length (mmproj out == LM in).
copy_u32_kv(meta, "gemma3.embedding_length", "clip.vision.projection_dim");
// image_mean / image_std are constants for gemma3 vision.
if (!has_key(meta, "clip.vision.image_mean")) {
const float mean[3] = {0.5f, 0.5f, 0.5f};
gguf_set_arr_data(meta, "clip.vision.image_mean", GGUF_TYPE_FLOAT32, mean, 3);
}
if (!has_key(meta, "clip.vision.image_std")) {
const float std_[3] = {0.5f, 0.5f, 0.5f};
gguf_set_arr_data(meta, "clip.vision.image_std", GGUF_TYPE_FLOAT32, std_, 3);
}
static const float kHalfHalfHalf[3] = {0.5f, 0.5f, 0.5f};
inject_f32_arr_if_missing(meta, "clip.vision.image_mean", kHalfHalfHalf, 3);
inject_f32_arr_if_missing(meta, "clip.vision.image_std", kHalfHalfHalf, 3);
if (!has_key(meta, "clip.has_vision_encoder")) gguf_set_val_bool(meta, "clip.has_vision_encoder", true);
if (!has_key(meta, "clip.use_gelu")) gguf_set_val_bool(meta, "clip.use_gelu", true);
inject_bool_if_missing(meta, "clip.has_vision_encoder", true);
inject_bool_if_missing(meta, "clip.use_gelu", true);
gguf_set_val_str(meta, "clip.projector_type", "gemma3");
gguf_set_val_str(meta, "general.architecture", "clip");
@ -456,11 +213,10 @@ void handle_gemma3_clip(gguf_context * meta, ggml_context * ctx) {
promote_tensor_to_f32(meta, ctx, "v.position_embd.weight");
}
// -------------------------------------------------------------------------
// =========================================================================
// qwen35moe (clip side)
// -------------------------------------------------------------------------
// =========================================================================
// Substring renames. One entry handles both `.weight` and `.bias` variants.
constexpr std::pair<const char *, const char *> kQwen35moeClipRenames[] = {
{"v.pos_embed", "v.position_embd"},
{"v.patch_embed", "v.patch_embd"},
@ -473,41 +229,31 @@ constexpr std::pair<const char *, const char *> kQwen35moeClipRenames[] = {
{".norm2", ".ln2"},
};
// Register a QKV merge for a single block: Ollama has separate attn_q,
// attn_k, attn_v tensors; upstream wants them concatenated along their
// slow axis. Capture source file offsets BEFORE renaming.
// Register a QKV merge for a single vision block: Ollama has separate
// attn_q, attn_k, attn_v tensors; upstream wants them concatenated along
// their slow axis. Capture source file offsets BEFORE renaming attn_q.
void register_qwen35moe_qkv_merge(gguf_context * meta, ggml_context * ctx, int block_idx) {
char qname[64], kname[64], vname[64];
std::snprintf(qname, sizeof(qname), "v.blk.%d.attn_q.weight", block_idx);
std::snprintf(kname, sizeof(kname), "v.blk.%d.attn_k.weight", block_idx);
std::snprintf(vname, sizeof(vname), "v.blk.%d.attn_v.weight", block_idx);
const ggml_tensor * q = ggml_get_tensor(ctx, qname);
if (!q) return; // not a qwen35moe vision block
// Set up the destination tensor. We rename attn_q -> attn_qkv and
// widen its slow axis from [1152, 1152] to [1152, 3456] (3 * hidden).
char qkv_w[64], qkv_b[64], qbias[64], kbias[64], vbias[64];
std::snprintf(qkv_w, sizeof(qkv_w), "v.blk.%d.attn_qkv.weight", block_idx);
std::snprintf(qkv_b, sizeof(qkv_b), "v.blk.%d.attn_qkv.bias", block_idx);
char q[64], k[64], v[64], qbias[64], kbias[64], vbias[64], qkv_w[64], qkv_b[64];
std::snprintf(q, sizeof(q), "v.blk.%d.attn_q.weight", block_idx);
std::snprintf(k, sizeof(k), "v.blk.%d.attn_k.weight", block_idx);
std::snprintf(v, sizeof(v), "v.blk.%d.attn_v.weight", block_idx);
std::snprintf(qbias, sizeof(qbias), "v.blk.%d.attn_q.bias", block_idx);
std::snprintf(kbias, sizeof(kbias), "v.blk.%d.attn_k.bias", block_idx);
std::snprintf(vbias, sizeof(vbias), "v.blk.%d.attn_v.bias", block_idx);
std::snprintf(qkv_w, sizeof(qkv_w), "v.blk.%d.attn_qkv.weight", block_idx);
std::snprintf(qkv_b, sizeof(qkv_b), "v.blk.%d.attn_qkv.bias", block_idx);
if (!ggml_get_tensor(ctx, q)) return; // no vision block at this index
// Capture source offsets for the concat BEFORE renaming.
register_concat_load(meta, qkv_w, {qname, kname, vname});
register_concat_load(meta, qkv_w, {q, k, v});
register_concat_load(meta, qkv_b, {qbias, kbias, vbias});
// Rename attn_q -> attn_qkv and widen shape.
rename_tensor(meta, ctx, qname, qkv_w);
if (ggml_tensor * t = ggml_get_tensor(ctx, qkv_w)) {
set_tensor_shape(t, {t->ne[0], t->ne[1] * 3});
}
// Rename attn_q.bias -> attn_qkv.bias and widen from [1152] to [3456].
// Rename attn_q -> attn_qkv and widen from [hidden, hidden] to [hidden, 3*hidden].
rename_tensor(meta, ctx, q, qkv_w);
if (ggml_tensor * t = ggml_get_tensor(ctx, qkv_w)) set_tensor_shape(t, {t->ne[0], t->ne[1] * 3});
rename_tensor(meta, ctx, qbias, qkv_b);
if (ggml_tensor * t = ggml_get_tensor(ctx, qkv_b)) {
set_tensor_shape(t, {t->ne[0] * 3});
}
if (ggml_tensor * t = ggml_get_tensor(ctx, qkv_b)) set_tensor_shape(t, {t->ne[0] * 3});
}
// Register the patch_embed reshape + split + F16->F32.
@ -518,21 +264,15 @@ void register_qwen35moe_qkv_merge(gguf_context * meta, ggml_context * ctx, int b
// [out_c=1152, in_c=3, ...] dim pair, so packed_c = c_out*3 + c_in.
//
// Destination: two upstream tensors with ggml shape
// [h=16, w=16, c_in=3, c_out=1152] F32 each,
// one per temporal slice. Matches upstream's
// yield data_torch[:, :, 0, ...] # PyTorch [1152, 3, 16, 16]
// yield data_torch[:, :, 1, ...]
// which reverses to ggml ne=[16, 16, 3, 1152] per slice.
// [h=16, w=16, c_in=3, c_out=1152] F32 each, one per temporal slice.
//
// For each output element (h, w, c_in, c_out):
// src_idx = h + w*W + t*W*H + (c_out*C_in + c_in)*W*H*T
// dst_idx = h + w*W + c_in*W*H + c_out*W*H*C_in
void register_qwen35moe_patch_embed_split(gguf_context * meta, ggml_context * ctx) {
const char * src_name = "v.patch_embed.weight";
const int64_t tid = gguf_find_tensor(meta, src_name);
if (tid < 0) return;
ggml_tensor * src_t = ggml_get_tensor(ctx, src_name);
if (gguf_find_tensor(meta, src_name) < 0) return;
const ggml_tensor * src_t = ggml_get_tensor(ctx, src_name);
if (!src_t) return;
const size_t src_offset = tensor_file_offset(meta, src_name);
@ -552,8 +292,8 @@ void register_qwen35moe_patch_embed_split(gguf_context * meta, ggml_context * ct
for (int c_out = 0; c_out < COUT; ++c_out) {
for (int c_in = 0; c_in < CIN; ++c_in) {
const size_t packed = (size_t) c_out * CIN + c_in;
const uint16_t * in_base = sp + HW * (slice_idx + T * packed);
float * out_base = dp + HW * (c_in + CIN * c_out);
const uint16_t * in_base = sp + HW * (slice_idx + T * packed);
float * out_base = dp + HW * (c_in + CIN * c_out);
for (size_t i = 0; i < HW; ++i) out_base[i] = ggml_fp16_to_fp32(in_base[i]);
}
}
@ -565,105 +305,82 @@ void register_qwen35moe_patch_embed_split(gguf_context * meta, ggml_context * ct
};
// Rename src -> `v.patch_embd.weight`, reshape to dest layout, register
// the slice-0 load op against its new name.
// the slice-0 load op.
rename_tensor(meta, ctx, src_name, "v.patch_embd.weight");
ggml_tensor * dest0 = ggml_get_tensor(ctx, "v.patch_embd.weight");
if (!dest0) return;
set_tensor_shape(dest0, {16, 16, 3, 1152});
set_tensor_type (dest0, GGML_TYPE_F32);
if (ggml_tensor * dest0 = ggml_get_tensor(ctx, "v.patch_embd.weight")) {
set_tensor_shape(dest0, {H, W, CIN, COUT});
set_tensor_type (dest0, GGML_TYPE_F32);
}
register_load_op("v.patch_embd.weight", make_slice_op(0));
// We need a sibling tensor `v.patch_embd.weight.1` in ctx_meta so clip's
// get_tensor() can find it. ggml_new_tensor() would blow ctx_meta's
// fixed memory pool (sized exactly for the original tensor count).
// Instead, steal an unused slot: after the QKV merge, `v.blk.0.attn_k`
// is orphaned in ctx_meta — clip never looks it up because it asks for
// the merged `attn_qkv`. Rename it to our sibling and reshape.
rename_tensor(meta, ctx, "v.blk.0.attn_k.weight", "v.patch_embd.weight.1");
ggml_tensor * dest1 = ggml_get_tensor(ctx, "v.patch_embd.weight.1");
if (!dest1) return;
set_tensor_shape(dest1, {16, 16, 3, 1152});
set_tensor_type (dest1, GGML_TYPE_F32);
// Reclaim the `v.blk.0.attn_k.weight` slot (orphaned by the QKV merge)
// as the sibling `v.patch_embd.weight.1`.
reclaim_slot_as(meta, ctx,
"v.blk.0.attn_k.weight", "v.patch_embd.weight.1",
{H, W, CIN, COUT}, GGML_TYPE_F32);
register_load_op("v.patch_embd.weight.1", make_slice_op(1));
}
void handle_qwen35moe_clip(gguf_context * meta, ggml_context * ctx) {
LLAMA_LOG_INFO("%s: detected Ollama-format qwen35moe GGUF used as mmproj; translating\n", __func__);
// KV synthesis: clip.vision.* from qwen35moe.vision.* (plus defaults).
copy_u32_kv(meta, "qwen35moe.vision.block_count", "clip.vision.block_count");
copy_u32_kv(meta, "qwen35moe.vision.embedding_length", "clip.vision.embedding_length");
copy_u32_kv(meta, "qwen35moe.vision.attention.head_count", "clip.vision.attention.head_count");
copy_u32_kv(meta, "qwen35moe.vision.patch_size", "clip.vision.patch_size");
copy_u32_kv(meta, "qwen35moe.vision.spatial_merge_size", "clip.vision.spatial_merge_size");
copy_u32_kv(meta, "qwen35moe.vision.num_channels", "clip.vision.num_channels");
// projection_dim is the text model's embedding_length (merger out dim).
// projection_dim = text model's embedding_length.
copy_u32_kv(meta, "qwen35moe.embedding_length", "clip.vision.projection_dim");
// Ollama omitted these; defaults match reference (ref_Q3.5-35B-A3B mmproj).
if (!has_key(meta, "clip.vision.feed_forward_length"))
gguf_set_val_u32(meta, "clip.vision.feed_forward_length", 4304);
if (!has_key(meta, "clip.vision.image_size"))
gguf_set_val_u32(meta, "clip.vision.image_size", 768);
if (!has_key(meta, "clip.vision.attention.layer_norm_epsilon"))
gguf_set_val_f32(meta, "clip.vision.attention.layer_norm_epsilon", 1e-6f);
// Defaults for KVs Ollama omitted (match the Qwen3.5-35B-A3B reference mmproj).
inject_u32_if_missing(meta, "clip.vision.feed_forward_length", 4304);
inject_u32_if_missing(meta, "clip.vision.image_size", 768);
inject_f32_if_missing(meta, "clip.vision.attention.layer_norm_epsilon", 1e-6f);
// image_mean / image_std — constants for qwen3.5 vision.
if (!has_key(meta, "clip.vision.image_mean")) {
const float v[3] = {0.5f, 0.5f, 0.5f};
gguf_set_arr_data(meta, "clip.vision.image_mean", GGUF_TYPE_FLOAT32, v, 3);
}
if (!has_key(meta, "clip.vision.image_std")) {
const float v[3] = {0.5f, 0.5f, 0.5f};
gguf_set_arr_data(meta, "clip.vision.image_std", GGUF_TYPE_FLOAT32, v, 3);
}
static const float kHalfHalfHalf[3] = {0.5f, 0.5f, 0.5f};
inject_f32_arr_if_missing(meta, "clip.vision.image_mean", kHalfHalfHalf, 3);
inject_f32_arr_if_missing(meta, "clip.vision.image_std", kHalfHalfHalf, 3);
// is_deepstack_layers: qwen3.5 35B has no deepstack layers. Set a
// 27-element array of False matching clip.vision.block_count.
// is_deepstack_layers: qwen3.5 35B has no deepstack layers. Set 27 False.
if (!has_key(meta, "clip.vision.is_deepstack_layers")) {
uint8_t bools[27] = {};
gguf_set_arr_data(meta, "clip.vision.is_deepstack_layers", GGUF_TYPE_BOOL, bools, 27);
}
if (!has_key(meta, "clip.has_vision_encoder")) gguf_set_val_bool(meta, "clip.has_vision_encoder", true);
if (!has_key(meta, "clip.use_gelu")) gguf_set_val_bool(meta, "clip.use_gelu", true);
inject_bool_if_missing(meta, "clip.has_vision_encoder", true);
inject_bool_if_missing(meta, "clip.use_gelu", true);
gguf_set_val_str(meta, "clip.projector_type", "qwen3vl_merger");
gguf_set_val_str(meta, "general.architecture", "clip");
// QKV merge per block. Runs BEFORE the substring renames so we can
// reliably find attn_q / attn_k / attn_v by name.
// QKV merge runs BEFORE substring renames so it can find attn_q/k/v by name.
const int64_t n_blocks_key = gguf_find_key(meta, "clip.vision.block_count");
const uint32_t n_blocks = n_blocks_key >= 0 ? gguf_get_val_u32(meta, n_blocks_key) : 27;
for (uint32_t b = 0; b < n_blocks; ++b) {
register_qwen35moe_qkv_merge(meta, ctx, (int) b);
}
for (uint32_t b = 0; b < n_blocks; ++b) register_qwen35moe_qkv_merge(meta, ctx, (int) b);
// patch_embed: reshape + temporal split + F16->F32. Also BEFORE renames
// because it references `v.patch_embed.weight` by name.
// Also before renames: patch_embed references the source by name.
register_qwen35moe_patch_embed_split(meta, ctx);
// Substring renames (last). These handle the simple pos_embed, merger.*,
// linear_fc1/2, norm1/2 conversions.
// Simple substring renames.
for (const auto & [from, to] : kQwen35moeClipRenames) {
rename_tensors_containing(meta, ctx, from, to);
}
// F16 -> F32 on position_embd after rename.
promote_tensor_to_f32(meta, ctx, "v.position_embd.weight");
}
} // anonymous namespace
// -------------------------------------------------------------------------
// public entry points
// -------------------------------------------------------------------------
// =========================================================================
// Public entry points
// =========================================================================
void translate_metadata(const llama_model_loader * ml,
gguf_context * meta,
ggml_context * ctx,
std::string & arch_name) {
if (!meta) return;
if (arch_name == "gemma3") handle_gemma3(ml, meta, ctx);
if (arch_name == "gemma3") handle_gemma3 (ml, meta, ctx);
if (arch_name == "qwen35moe") handle_qwen35moe(ml, meta, ctx);
// Dispatch. Add more arches as they are wired up.
}
@ -684,13 +401,7 @@ void translate_clip_metadata(gguf_context * meta, ggml_context * ctx) {
}
bool should_skip_tensor(const llama_model_loader * ml, const char * tensor_name) {
std::lock_guard<std::mutex> lk(g_registry_mutex);
auto it = g_skip_prefixes.find(ml);
if (it == g_skip_prefixes.end()) return false;
for (const auto & prefix : it->second) {
if (std::strncmp(tensor_name, prefix.c_str(), prefix.size()) == 0) return true;
}
return false;
return should_skip_tensor_prefix(ml, tensor_name);
}
bool maybe_load_tensor(ggml_tensor * cur,
@ -700,12 +411,7 @@ bool maybe_load_tensor(ggml_tensor * cur,
(void) file_offset; // registered ops capture their own offsets
LoadOp op;
{
std::lock_guard<std::mutex> lk(g_loadop_mutex);
auto it = g_loadops.find(ggml_get_name(cur));
if (it == g_loadops.end()) return false;
op = it->second;
}
if (!take_load_op(ggml_get_name(cur), op)) return false;
const size_t dst_size = ggml_nbytes(cur);
std::vector<uint8_t> dst(dst_size);
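
For reference, wiring up another architecture follows the same shape as
the two existing handlers: a detect_* probe over the gguf/ggml contexts,
a handle_* that mutates the metadata through the util helpers, and one
dispatch line in translate_metadata. The sketch below is hypothetical
("myarch" and its quirk key are made up), not code from this commit.

    // Hypothetical sketch only; mirrors the gemma3/qwen35moe structure.
    // (Assumes the same includes as llama-ollama-compat.cpp.)
    static bool detect_ollama_myarch(const gguf_context * meta, const ggml_context * ctx) {
        const int64_t arch_kid = gguf_find_key(meta, "general.architecture");
        if (arch_kid < 0) return false;
        if (std::strcmp(gguf_get_val_str(meta, arch_kid), "myarch") != 0) return false;
        // Require at least one Ollama-ism, like the existing detectors do.
        return has_key(meta, "myarch.some_ollama_only_key")
            || any_tensor_with_prefix(ctx, "mtp.");
    }

    static void handle_myarch(const llama_model_loader * ml, gguf_context * meta, ggml_context * ctx) {
        if (!detect_ollama_myarch(meta, ctx)) return;
        inject_f32_if_missing(meta, "myarch.attention.layer_norm_rms_epsilon", 1e-6f);
        rename_tensors_containing(meta, ctx, ".old_suffix", ".new_suffix");
        add_skip_prefix(ml, "mtp.");
    }

    // plus, in translate_metadata():
    //     if (arch_name == "myarch") handle_myarch(ml, meta, ctx);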


@ -75,8 +75,9 @@ FetchContent_MakeAvailable(llama_cpp)
# Kept separate from the upstream-edits patch so our .cpp/.h stay
# on-disk in llama/compat/ rather than being copied into _deps/.
if(DEFINED OLLAMA_LLAMA_CPP_COMPAT_DIR)
target_sources(llama PRIVATE
${OLLAMA_LLAMA_CPP_COMPAT_DIR}/llama-ollama-compat.cpp)
file(GLOB _compat_sources CONFIGURE_DEPENDS
${OLLAMA_LLAMA_CPP_COMPAT_DIR}/*.cpp)
target_sources(llama PRIVATE ${_compat_sources})
target_include_directories(llama PRIVATE
${OLLAMA_LLAMA_CPP_COMPAT_DIR})
# mtmd's clip.cpp #include's the compat header too — add the same dir