llama/compat: add gpt-oss and lfm2 handlers

gpt-oss: rename arch "gptoss" -> "gpt-oss" (incl. KV prefix), inject
the missing `expert_feed_forward_length` from the ffn_gate_exps shape,
and rename `attn_out`/`attn_sinks`/`ffn_norm` tensors to upstream's
`attn_output`/`attn_sinks.weight`/`post_attention_norm`. Also remove
the library/gpt-oss -> dhiltgen/gpt-oss redirect now that the compat
shim handles it directly.

lfm2: rename `output_norm.weight` -> `token_embd_norm.weight` and fix
a stale `lfm2.feed_forward_length` (some Ollama blobs claim 12288 on
a model whose ffn_gate is [2048, 8192]) by reading the real value off
the ffn_gate tensor shape.
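
"Reading the real value off the tensor shape" boils down to one ne[] lookup:
ggml stores ffn_gate as {n_embd, n_ff}, so ne[1] is the FFN width. Sketch of
what the handler does (tensor/key names as in the diff below):

    // recover the real FFN width from the gate tensor and overwrite the stale KV
    if (ggml_tensor * gate = ggml_get_tensor(ctx, "blk.0.ffn_gate.weight")) {
        gguf_set_val_u32(meta, "lfm2.feed_forward_length", (uint32_t) gate->ne[1]);  // 8192 here
    }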

Adds two helpers to compat-util: `copy_kv` (type-preserving generic
KV copy) and `rename_kv_prefix` (bulk-copy every KV with a given
prefix to a new prefix). Old keys are left in place — harmless because
the loader queries by exact name and only the new prefix matters.
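
A handler migrates a whole KV namespace with one call (sketch; the context
key below is just an illustrative example):

    // copy a single key, preserving its gguf_type
    copy_kv(meta, "gptoss.context_length", "gpt-oss.context_length");
    // or bulk-copy everything under the old prefix
    rename_kv_prefix(meta, "gptoss.", "gpt-oss.");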

Tested locally: gpt-oss:20b and lfm2.5-thinking now load + generate
coherently against an unmodified upstream llama-server build.
jmorganca 2026-04-19 13:43:52 -07:00
parent 9a69a17dc2
commit d0f38a915a
5 changed files with 160 additions and 1 deletion

@@ -40,6 +40,9 @@ an immediate no-op.
| Arch | Text loader | Clip (mmproj) loader |
|---|---|---|
| `gemma3` | KV injection (`layer_norm_rms_epsilon`, `rope.freq_base`, `rope.freq_base_swa`), tokenizer vocab truncation, drop `v.*`/`mm.*` tensors | Arch rewrite to `clip`, KV synthesis (`clip.vision.*`, `clip.projector_type=gemma3`), tensor renames (`v.patch_embedding` → `v.patch_embd`, `mlp.fc{1,2}` → `ffn_{down,up}`, etc.), F16→F32 promotion for patch/position embeddings (Metal IM2COL requirement) |
| `qwen35moe` | head_count_kv array → scalar, rope dimension_sections pad 3→4, `ssm_dt` → `ssm_dt.bias` rename, drop `v.*`/`mm.*`/`mtp.*` tensors | Arch rewrite to `clip`, KV synthesis (`clip.vision.*`, `clip.projector_type=qwen3vl_merger`), per-block QKV merge (concat at load time), patch_embed reshape + F16→F32 + slice-as-temporal-pair (reclaiming an orphan `v.blk.0.attn_k` slot for the second pair) |
| `gptoss` | Arch rename `gptoss` → `gpt-oss` (incl. KV prefix), inject `gpt-oss.expert_feed_forward_length` from `ffn_gate_exps` shape, tensor renames (`attn_out` → `attn_output`, `attn_sinks` → `attn_sinks.weight`, `ffn_norm` → `post_attention_norm`) | n/a |
| `lfm2` | Tensor rename `output_norm.weight` → `token_embd_norm.weight`, fix stale `lfm2.feed_forward_length` from `ffn_gate` shape | n/a |
Usage:

@@ -32,6 +32,60 @@ void copy_f32_kv(gguf_context * meta, const char * src, const char * dst) {
    gguf_set_val_f32(meta, dst, gguf_get_val_f32(meta, k));
}

void copy_kv(gguf_context * meta, const char * src, const char * dst) {
    if (has_key(meta, dst)) return;
    const int64_t kid = gguf_find_key(meta, src);
    if (kid < 0) return;
    const enum gguf_type t = gguf_get_kv_type(meta, kid);
    switch (t) {
        case GGUF_TYPE_UINT8:   gguf_set_val_u8  (meta, dst, gguf_get_val_u8  (meta, kid)); break;
        case GGUF_TYPE_INT8:    gguf_set_val_i8  (meta, dst, gguf_get_val_i8  (meta, kid)); break;
        case GGUF_TYPE_UINT16:  gguf_set_val_u16 (meta, dst, gguf_get_val_u16 (meta, kid)); break;
        case GGUF_TYPE_INT16:   gguf_set_val_i16 (meta, dst, gguf_get_val_i16 (meta, kid)); break;
        case GGUF_TYPE_UINT32:  gguf_set_val_u32 (meta, dst, gguf_get_val_u32 (meta, kid)); break;
        case GGUF_TYPE_INT32:   gguf_set_val_i32 (meta, dst, gguf_get_val_i32 (meta, kid)); break;
        case GGUF_TYPE_FLOAT32: gguf_set_val_f32 (meta, dst, gguf_get_val_f32 (meta, kid)); break;
        case GGUF_TYPE_BOOL:    gguf_set_val_bool(meta, dst, gguf_get_val_bool(meta, kid)); break;
        case GGUF_TYPE_STRING:  gguf_set_val_str (meta, dst, gguf_get_val_str (meta, kid)); break;
        case GGUF_TYPE_UINT64:  gguf_set_val_u64 (meta, dst, gguf_get_val_u64 (meta, kid)); break;
        case GGUF_TYPE_INT64:   gguf_set_val_i64 (meta, dst, gguf_get_val_i64 (meta, kid)); break;
        case GGUF_TYPE_FLOAT64: gguf_set_val_f64 (meta, dst, gguf_get_val_f64 (meta, kid)); break;
        case GGUF_TYPE_ARRAY: {
            const enum gguf_type et = gguf_get_arr_type(meta, kid);
            const size_t n = gguf_get_arr_n(meta, kid);
            if (et == GGUF_TYPE_STRING) {
                std::vector<std::string> owned;
                owned.reserve(n);
                std::vector<const char *> ptrs;
                ptrs.reserve(n);
                for (size_t i = 0; i < n; ++i) owned.emplace_back(gguf_get_arr_str(meta, kid, i));
                for (const auto & s : owned) ptrs.push_back(s.c_str());
                gguf_set_arr_str(meta, dst, ptrs.data(), n);
            } else {
                gguf_set_arr_data(meta, dst, et, gguf_get_arr_data(meta, kid), n);
            }
            break;
        }
        default: break;
    }
}

void rename_kv_prefix(gguf_context * meta, const char * old_prefix,
                      const char * new_prefix) {
    const size_t old_len = std::strlen(old_prefix);
    // Snapshot keys first; copy_kv() invalidates the kv index by appending.
    std::vector<std::string> matches;
    const int64_t n = gguf_get_n_kv(meta);
    for (int64_t i = 0; i < n; ++i) {
        const char * k = gguf_get_key(meta, i);
        if (std::strncmp(k, old_prefix, old_len) == 0) matches.emplace_back(k);
    }
    for (const auto & old_key : matches) {
        copy_kv(meta, old_key.c_str(),
                (std::string(new_prefix) + old_key.substr(old_len)).c_str());
    }
}

void inject_u32_if_missing (gguf_context * meta, const char * key, uint32_t v) {
    if (!has_key(meta, key)) gguf_set_val_u32(meta, key, v);
}

@@ -53,6 +53,15 @@ namespace llama_ollama_compat::detail {
bool has_key(const gguf_context * meta, const char * key);
void copy_u32_kv(gguf_context * meta, const char * src, const char * dst);
void copy_f32_kv(gguf_context * meta, const char * src, const char * dst);
// Generic copy that preserves the source's gguf_type. Skips if `src` is
// missing or `dst` is already present. Arrays are copied verbatim
// (including element type).
void copy_kv(gguf_context * meta, const char * src, const char * dst);
// Copy every KV whose key starts with `old_prefix` to a new key under
// `new_prefix`. Old keys are left in place — harmless because the loader
// looks up keys by exact name and only queries the new prefix.
void rename_kv_prefix(gguf_context * meta, const char * old_prefix,
                      const char * new_prefix);
void inject_u32_if_missing (gguf_context * meta, const char * key, uint32_t v);
void inject_f32_if_missing (gguf_context * meta, const char * key, float v);
void inject_str_if_missing (gguf_context * meta, const char * key, const char * v);

@@ -165,6 +165,96 @@ void handle_qwen35moe(const llama_model_loader * ml, gguf_context * meta, ggml_c
    add_skip_prefix(ml, "mtp.");
}

// =========================================================================
// gpt-oss (text only)
// =========================================================================
//
// Ollama uses arch name "gptoss" (no hyphen) and KV prefix "gptoss.*".
// Upstream uses "gpt-oss" / "gpt-oss.*". Same tensor layout otherwise,
// except:
//   * `blk.X.attn_sinks`      -> `blk.X.attn_sinks.weight` (missing suffix)
//   * `blk.X.ffn_norm.weight` -> `blk.X.post_attention_norm.weight`
//     (the second-norm-per-block names differ between converters)

bool detect_ollama_gptoss(const gguf_context * meta) {
    const int64_t arch_kid = gguf_find_key(meta, "general.architecture");
    if (arch_kid < 0) return false;
    return std::strcmp(gguf_get_val_str(meta, arch_kid), "gptoss") == 0;
}

// `arch_name` is mutated to "gpt-oss" so the caller's subsequent
// LLM_KV lookups query the renamed prefix.
void handle_gptoss(const llama_model_loader * ml, gguf_context * meta,
                   ggml_context * ctx, std::string & arch_name) {
    if (!detect_ollama_gptoss(meta)) return;
    (void) ml;

    LLAMA_LOG_INFO("%s: detected Ollama-format gpt-oss GGUF; applying compatibility fixes\n", __func__);

    gguf_set_val_str(meta, "general.architecture", "gpt-oss");
    rename_kv_prefix(meta, "gptoss.", "gpt-oss.");
    arch_name = "gpt-oss";

    // Upstream's gpt-oss loader requires `gpt-oss.expert_feed_forward_length`
    // (n_ff_exp). Ollama omitted it; recover from the ffn_gate_exps tensor
    // shape — for gpt-oss the tensor is created as {n_embd, n_ff_exp, n_expert}
    // so ne[1] is the per-expert FFN dim.
    if (!has_key(meta, "gpt-oss.expert_feed_forward_length")) {
        if (ggml_tensor * t = ggml_get_tensor(ctx, "blk.0.ffn_gate_exps.weight")) {
            gguf_set_val_u32(meta, "gpt-oss.expert_feed_forward_length", (uint32_t) t->ne[1]);
        }
    }

    // Tensor renames. `rename_tensors_containing` does a substring replace
    // on first occurrence — each needle below appears exactly once per
    // tensor name and the needles don't overlap each other.
    rename_tensors_containing(meta, ctx, ".attn_out",
                              ".attn_output");         // wo: out -> output
    rename_tensors_containing(meta, ctx, ".attn_sinks",
                              ".attn_sinks.weight");   // add missing suffix
    rename_tensors_containing(meta, ctx, ".ffn_norm",
                              ".post_attention_norm");
}

// =========================================================================
// lfm2 (text only)
// =========================================================================
//
// Same arch name ("lfm2") on both sides. Only difference is the
// pre-output-projection norm: Ollama writes `output_norm.weight`,
// upstream writes `token_embd_norm.weight` (with the LFM2-specific
// LLM_TENSOR_OUTPUT_NORM_LFM2 mapping). One tensor rename.

bool detect_ollama_lfm2(const gguf_context * meta, const ggml_context * ctx) {
    const int64_t arch_kid = gguf_find_key(meta, "general.architecture");
    if (arch_kid < 0) return false;
    if (std::strcmp(gguf_get_val_str(meta, arch_kid), "lfm2") != 0) return false;

    // Marker: Ollama-converted lfm2 has output_norm.weight, upstream has
    // token_embd_norm.weight instead.
    return ggml_get_tensor(const_cast<ggml_context *>(ctx), "output_norm.weight") != nullptr
        && ggml_get_tensor(const_cast<ggml_context *>(ctx), "token_embd_norm.weight") == nullptr;
}

void handle_lfm2(const llama_model_loader * ml, gguf_context * meta, ggml_context * ctx) {
    if (!detect_ollama_lfm2(meta, ctx)) return;
    (void) ml;

    LLAMA_LOG_INFO("%s: detected Ollama-format lfm2 GGUF; applying compatibility fixes\n", __func__);

    rename_tensor(meta, ctx, "output_norm.weight", "token_embd_norm.weight");

    // Older Ollama converters wrote a stale `lfm2.feed_forward_length` that
    // didn't match the actual ffn_gate tensor shape (e.g. claimed 12288 on
    // a model whose ffn_gate is [2048, 8192]). Fix from the tensor shape.
    if (ggml_tensor * t = ggml_get_tensor(ctx, "blk.0.ffn_gate.weight")) {
        const uint32_t real_n_ff = (uint32_t) t->ne[1];
        const int64_t kid = gguf_find_key(meta, "lfm2.feed_forward_length");
        if (kid < 0 || gguf_get_val_u32(meta, kid) != real_n_ff) {
            gguf_set_val_u32(meta, "lfm2.feed_forward_length", real_n_ff);
        }
    }
}

// =========================================================================
// gemma3 (clip side)
// =========================================================================
@@ -382,6 +472,8 @@ void translate_metadata(const llama_model_loader * ml,
    if (!meta) return;
    if (arch_name == "gemma3")    handle_gemma3   (ml, meta, ctx);
    if (arch_name == "qwen35moe") handle_qwen35moe(ml, meta, ctx);
    if (arch_name == "gptoss")    handle_gptoss   (ml, meta, ctx, arch_name);
    if (arch_name == "lfm2")      handle_lfm2     (ml, meta, ctx);
    // Dispatch. Add more arches as they are wired up.
}

@@ -15,8 +15,9 @@ import (
// entry should be removed from this list — the compat layer translates the
// original library/ blob in memory so no republish is needed.
var compatModelRedirects = []struct{ from, to string }{
{"library/gpt-oss", "dhiltgen/gpt-oss"},
// library/gpt-oss — handled by llama/compat (text only).
// library/gemma3 — handled by llama/compat (text + vision).
// library/lfm2.5-thinking — handled by llama/compat (text only).
{"library/embeddinggemma", "dhiltgen/embeddinggemma"},
{"library/snowflake-arctic-embed2", "dhiltgen/snowflake-arctic-embed2"},
{"library/gemma3n", "dhiltgen/gemma3n"},