llama/compat: add gpt-oss and lfm2 handlers

gpt-oss: rename arch "gptoss" -> "gpt-oss" (incl. KV prefix), inject
the missing `expert_feed_forward_length` from the ffn_gate_exps shape,
and rename `attn_out`/`attn_sinks`/`ffn_norm` tensors to upstream's
`attn_output`/`attn_sinks.weight`/`post_attention_norm`. Also remove
the library/gpt-oss -> dhiltgen/gpt-oss redirect now that the compat
shim handles it directly.

lfm2: rename `output_norm.weight` -> `token_embd_norm.weight` and fix
a stale `lfm2.feed_forward_length` (some Ollama blobs claim 12288 on
a model whose ffn_gate is [2048, 8192]) by reading the real value off
the ffn_gate tensor shape.
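
"Reading the real value off the tensor shape" boils down to one ne[] lookup:
ggml stores ffn_gate as {n_embd, n_ff}, so ne[1] is the FFN width. Sketch of
what the handler does (tensor/key names as in the diff below):

    // recover the real FFN width from the gate tensor and overwrite the stale KV
    if (ggml_tensor * gate = ggml_get_tensor(ctx, "blk.0.ffn_gate.weight")) {
        gguf_set_val_u32(meta, "lfm2.feed_forward_length", (uint32_t) gate->ne[1]);  // 8192 here
    }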

Adds two helpers to compat-util: `copy_kv` (type-preserving generic
KV copy) and `rename_kv_prefix` (bulk-copy every KV with a given
prefix to a new prefix). Old keys are left in place — harmless because
the loader queries by exact name and only the new prefix matters.
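
A handler migrates a whole KV namespace with one call (sketch; the context
key below is just an illustrative example):

    // copy a single key, preserving its gguf_type
    copy_kv(meta, "gptoss.context_length", "gpt-oss.context_length");
    // or bulk-copy everything under the old prefix
    rename_kv_prefix(meta, "gptoss.", "gpt-oss.");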

Tested locally: gpt-oss:20b and lfm2.5-thinking now load + generate
coherently against an unmodified upstream llama-server build.
jmorganca 2026-04-19 13:43:52 -07:00
parent 9a69a17dc2
commit d0f38a915a
5 changed files with 160 additions and 1 deletion

@@ -40,6 +40,9 @@ an immediate no-op.
| Arch | Text loader | Clip (mmproj) loader |
|---|---|---|
| `gemma3` | KV injection (`layer_norm_rms_epsilon`, `rope.freq_base`, `rope.freq_base_swa`), tokenizer vocab truncation, drop `v.*`/`mm.*` tensors | Arch rewrite to `clip`, KV synthesis (`clip.vision.*`, `clip.projector_type=gemma3`), tensor renames (`v.patch_embedding` → `v.patch_embd`, `mlp.fc{1,2}` → `ffn_{down,up}`, etc.), F16→F32 promotion for patch/position embeddings (Metal IM2COL requirement) |
| `qwen35moe` | head_count_kv array → scalar, rope dimension_sections pad 3→4, `ssm_dt` → `ssm_dt.bias` rename, drop `v.*`/`mm.*`/`mtp.*` tensors | Arch rewrite to `clip`, KV synthesis (`clip.vision.*`, `clip.projector_type=qwen3vl_merger`), per-block QKV merge (concat at load time), patch_embed reshape + F16→F32 + slice-as-temporal-pair (reclaiming an orphan `v.blk.0.attn_k` slot for the second pair) |
| `gptoss` | Arch rename `gptoss` → `gpt-oss` (incl. KV prefix), inject `gpt-oss.expert_feed_forward_length` from `ffn_gate_exps` shape, tensor renames (`attn_out` → `attn_output`, `attn_sinks` → `attn_sinks.weight`, `ffn_norm` → `post_attention_norm`) | n/a |
| `lfm2` | Tensor rename `output_norm.weight` → `token_embd_norm.weight`, fix stale `lfm2.feed_forward_length` from `ffn_gate` shape | n/a |
Usage:

@@ -32,6 +32,60 @@ void copy_f32_kv(gguf_context * meta, const char * src, const char * dst) {
    gguf_set_val_f32(meta, dst, gguf_get_val_f32(meta, k));
}

void copy_kv(gguf_context * meta, const char * src, const char * dst) {
    if (has_key(meta, dst)) return;
    const int64_t kid = gguf_find_key(meta, src);
    if (kid < 0) return;
    const enum gguf_type t = gguf_get_kv_type(meta, kid);
    switch (t) {
        case GGUF_TYPE_UINT8:   gguf_set_val_u8  (meta, dst, gguf_get_val_u8  (meta, kid)); break;
        case GGUF_TYPE_INT8:    gguf_set_val_i8  (meta, dst, gguf_get_val_i8  (meta, kid)); break;
        case GGUF_TYPE_UINT16:  gguf_set_val_u16 (meta, dst, gguf_get_val_u16 (meta, kid)); break;
        case GGUF_TYPE_INT16:   gguf_set_val_i16 (meta, dst, gguf_get_val_i16 (meta, kid)); break;
        case GGUF_TYPE_UINT32:  gguf_set_val_u32 (meta, dst, gguf_get_val_u32 (meta, kid)); break;
        case GGUF_TYPE_INT32:   gguf_set_val_i32 (meta, dst, gguf_get_val_i32 (meta, kid)); break;
        case GGUF_TYPE_FLOAT32: gguf_set_val_f32 (meta, dst, gguf_get_val_f32 (meta, kid)); break;
        case GGUF_TYPE_BOOL:    gguf_set_val_bool(meta, dst, gguf_get_val_bool(meta, kid)); break;
        case GGUF_TYPE_STRING:  gguf_set_val_str (meta, dst, gguf_get_val_str (meta, kid)); break;
        case GGUF_TYPE_UINT64:  gguf_set_val_u64 (meta, dst, gguf_get_val_u64 (meta, kid)); break;
        case GGUF_TYPE_INT64:   gguf_set_val_i64 (meta, dst, gguf_get_val_i64 (meta, kid)); break;
        case GGUF_TYPE_FLOAT64: gguf_set_val_f64 (meta, dst, gguf_get_val_f64 (meta, kid)); break;
        case GGUF_TYPE_ARRAY: {
            const enum gguf_type et = gguf_get_arr_type(meta, kid);
            const size_t n = gguf_get_arr_n(meta, kid);
            if (et == GGUF_TYPE_STRING) {
                std::vector<std::string> owned;
                owned.reserve(n);
                std::vector<const char *> ptrs;
                ptrs.reserve(n);
                for (size_t i = 0; i < n; ++i) owned.emplace_back(gguf_get_arr_str(meta, kid, i));
                for (const auto & s : owned) ptrs.push_back(s.c_str());
                gguf_set_arr_str(meta, dst, ptrs.data(), n);
            } else {
                gguf_set_arr_data(meta, dst, et, gguf_get_arr_data(meta, kid), n);
            }
            break;
        }
        default: break;
    }
}

void rename_kv_prefix(gguf_context * meta, const char * old_prefix,
                      const char * new_prefix) {
    const size_t old_len = std::strlen(old_prefix);
    // Snapshot keys first; copy_kv() invalidates the kv index by appending.
    std::vector<std::string> matches;
    const int64_t n = gguf_get_n_kv(meta);
    for (int64_t i = 0; i < n; ++i) {
        const char * k = gguf_get_key(meta, i);
        if (std::strncmp(k, old_prefix, old_len) == 0) matches.emplace_back(k);
    }
    for (const auto & old_key : matches) {
        copy_kv(meta, old_key.c_str(),
                (std::string(new_prefix) + old_key.substr(old_len)).c_str());
    }
}

void inject_u32_if_missing (gguf_context * meta, const char * key, uint32_t v) {
    if (!has_key(meta, key)) gguf_set_val_u32(meta, key, v);
}

@@ -53,6 +53,15 @@ namespace llama_ollama_compat::detail {
bool has_key(const gguf_context * meta, const char * key);
void copy_u32_kv(gguf_context * meta, const char * src, const char * dst);
void copy_f32_kv(gguf_context * meta, const char * src, const char * dst);
// Generic copy that preserves the source's gguf_type. Skips if `src` is
// missing or `dst` is already present. Arrays are copied verbatim
// (including element type).
void copy_kv(gguf_context * meta, const char * src, const char * dst);
// Copy every KV whose key starts with `old_prefix` to a new key under
// `new_prefix`. Old keys are left in place — harmless because the loader
// looks up keys by exact name and only queries the new prefix.
void rename_kv_prefix(gguf_context * meta, const char * old_prefix,
                      const char * new_prefix);
void inject_u32_if_missing (gguf_context * meta, const char * key, uint32_t v);
void inject_f32_if_missing (gguf_context * meta, const char * key, float v);
void inject_str_if_missing (gguf_context * meta, const char * key, const char * v);

@@ -165,6 +165,96 @@ void handle_qwen35moe(const llama_model_loader * ml, gguf_context * meta, ggml_c
    add_skip_prefix(ml, "mtp.");
}

// =========================================================================
// gpt-oss (text only)
// =========================================================================
//
// Ollama uses arch name "gptoss" (no hyphen) and KV prefix "gptoss.*".
// Upstream uses "gpt-oss" / "gpt-oss.*". Same tensor layout otherwise,
// except:
//   * `blk.X.attn_sinks`      -> `blk.X.attn_sinks.weight` (missing suffix)
//   * `blk.X.ffn_norm.weight` -> `blk.X.post_attention_norm.weight`
//     (the second-norm-per-block names differ between converters)

bool detect_ollama_gptoss(const gguf_context * meta) {
    const int64_t arch_kid = gguf_find_key(meta, "general.architecture");
    if (arch_kid < 0) return false;
    return std::strcmp(gguf_get_val_str(meta, arch_kid), "gptoss") == 0;
}

// `arch_name` is mutated to "gpt-oss" so the caller's subsequent
// LLM_KV lookups query the renamed prefix.
void handle_gptoss(const llama_model_loader * ml, gguf_context * meta,
                   ggml_context * ctx, std::string & arch_name) {
    if (!detect_ollama_gptoss(meta)) return;
    (void) ml;

    LLAMA_LOG_INFO("%s: detected Ollama-format gpt-oss GGUF; applying compatibility fixes\n", __func__);

    gguf_set_val_str(meta, "general.architecture", "gpt-oss");
    rename_kv_prefix(meta, "gptoss.", "gpt-oss.");
    arch_name = "gpt-oss";

    // Upstream's gpt-oss loader requires `gpt-oss.expert_feed_forward_length`
    // (n_ff_exp). Ollama omitted it; recover from the ffn_gate_exps tensor
    // shape — for gpt-oss the tensor is created as {n_embd, n_ff_exp, n_expert}
    // so ne[1] is the per-expert FFN dim.
    if (!has_key(meta, "gpt-oss.expert_feed_forward_length")) {
        if (ggml_tensor * t = ggml_get_tensor(ctx, "blk.0.ffn_gate_exps.weight")) {
            gguf_set_val_u32(meta, "gpt-oss.expert_feed_forward_length", (uint32_t) t->ne[1]);
        }
    }

    // Tensor renames. `rename_tensors_containing` does a substring replace
    // on first occurrence — each needle below appears exactly once per
    // tensor name and the needles don't overlap each other.
    rename_tensors_containing(meta, ctx, ".attn_out",
                              ".attn_output");         // wo: out -> output
    rename_tensors_containing(meta, ctx, ".attn_sinks",
                              ".attn_sinks.weight");   // add missing suffix
    rename_tensors_containing(meta, ctx, ".ffn_norm",
                              ".post_attention_norm");
}

// =========================================================================
// lfm2 (text only)
// =========================================================================
//
// Same arch name ("lfm2") on both sides. Only difference is the
// pre-output-projection norm: Ollama writes `output_norm.weight`,
// upstream writes `token_embd_norm.weight` (with the LFM2-specific
// LLM_TENSOR_OUTPUT_NORM_LFM2 mapping). One tensor rename.

bool detect_ollama_lfm2(const gguf_context * meta, const ggml_context * ctx) {
    const int64_t arch_kid = gguf_find_key(meta, "general.architecture");
    if (arch_kid < 0) return false;
    if (std::strcmp(gguf_get_val_str(meta, arch_kid), "lfm2") != 0) return false;

    // Marker: Ollama-converted lfm2 has output_norm.weight, upstream has
    // token_embd_norm.weight instead.
    return ggml_get_tensor(const_cast<ggml_context *>(ctx), "output_norm.weight") != nullptr
        && ggml_get_tensor(const_cast<ggml_context *>(ctx), "token_embd_norm.weight") == nullptr;
}

void handle_lfm2(const llama_model_loader * ml, gguf_context * meta, ggml_context * ctx) {
    if (!detect_ollama_lfm2(meta, ctx)) return;
    (void) ml;

    LLAMA_LOG_INFO("%s: detected Ollama-format lfm2 GGUF; applying compatibility fixes\n", __func__);

    rename_tensor(meta, ctx, "output_norm.weight", "token_embd_norm.weight");

    // Older Ollama converters wrote a stale `lfm2.feed_forward_length` that
    // didn't match the actual ffn_gate tensor shape (e.g. claimed 12288 on
    // a model whose ffn_gate is [2048, 8192]). Fix from the tensor shape.
    if (ggml_tensor * t = ggml_get_tensor(ctx, "blk.0.ffn_gate.weight")) {
        const uint32_t real_n_ff = (uint32_t) t->ne[1];
        const int64_t kid = gguf_find_key(meta, "lfm2.feed_forward_length");
        if (kid < 0 || gguf_get_val_u32(meta, kid) != real_n_ff) {
            gguf_set_val_u32(meta, "lfm2.feed_forward_length", real_n_ff);
        }
    }
}

// =========================================================================
// gemma3 (clip side)
// =========================================================================
@@ -382,6 +472,8 @@ void translate_metadata(const llama_model_loader * ml,
    if (!meta) return;
    if (arch_name == "gemma3")    handle_gemma3   (ml, meta, ctx);
    if (arch_name == "qwen35moe") handle_qwen35moe(ml, meta, ctx);
    if (arch_name == "gptoss")    handle_gptoss   (ml, meta, ctx, arch_name);
    if (arch_name == "lfm2")      handle_lfm2     (ml, meta, ctx);
    // Dispatch. Add more arches as they are wired up.
}

@@ -15,8 +15,9 @@ import (
// entry should be removed from this list — the compat layer translates the
// original library/ blob in memory so no republish is needed.
var compatModelRedirects = []struct{ from, to string }{
{"library/gpt-oss", "dhiltgen/gpt-oss"},
// library/gpt-oss — handled by llama/compat (text only).
// library/gemma3 — handled by llama/compat (text + vision).
// library/lfm2.5-thinking — handled by llama/compat (text only).
{"library/embeddinggemma", "dhiltgen/embeddinggemma"},
{"library/snowflake-arctic-embed2", "dhiltgen/snowflake-arctic-embed2"},
{"library/gemma3n", "dhiltgen/gemma3n"},