llama/compat: add qwen35moe text handler

Text-only support for Ollama's qwen3.5 (qwen35moe) blobs. Vision is more
involved (QKV merge, patch_embed reshape/split) and follows in a later
commit.

Detection markers: qwen35moe.vision.*, qwen35moe.image_token_id,
qwen35moe.ssm.v_head_reordered, qwen35moe.feed_forward_length, mtp.*
tensors, embedded v.* tensors.

Fixes applied:
  - attention.head_count_kv ARRAY -> UINT32 scalar (Ollama wrote a
    per-layer array [0,0,0,2,...] over 40 layers; upstream expects a
    scalar, so collapse to the max non-zero value).
  - rope.dimension_sections padded from [11,11,10] to [11,11,10,0]
    (4-element M-RoPE convention).
  - blk.N.ssm_dt -> blk.N.ssm_dt.bias rename for all 40 layers.
  - skip prefixes v.*, mm.*, mtp.* from the text loader.
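
In GGUF-metadata terms, the first three fixups take

    qwen35moe.attention.head_count_kv   ARRAY[40]  [0,0,0,2,...]
    qwen35moe.rope.dimension_sections   ARRAY[3]   [11,11,10]
    blk.N.ssm_dt                        F32 [32]

to the shape upstream expects:

    qwen35moe.attention.head_count_kv   UINT32     2
    qwen35moe.rope.dimension_sections   ARRAY[4]   [11,11,10,0]
    blk.N.ssm_dt.bias                   F32 [32]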

Tightens detect_ollama_gemma3 so it only matches files whose
general.architecture is actually "gemma3" — without this, qwen3.5 was
triggering gemma3 clip translation on the auto-mmproj pass and crashing
on tensor-shape mismatches.

Go side: gates the auto-mmproj behavior on a small arch allowlist
(currently just gemma3). When a clip handler exists for an arch, add it
to the map; until then the model runs text-only.

Verified end-to-end via `ollama run qwen3.5:35b-a3b-q4_K_M` on the
existing library/qwen3.5 blob — answers "2 + 2 equals **4**" with the
text loader's compat fixups firing and no --mmproj flag.
jmorganca 2026-04-19 12:24:16 -07:00
parent 36049361cd
commit 8fa6648650
2 changed files with 113 additions and 10 deletions


@@ -145,11 +145,21 @@ void promote_tensor_to_f32(ggml_context * ctx, const char * name) {
// gemma3 (text side)
// -------------------------------------------------------------------------
// Returns true if this looks like an Ollama-format gemma3 blob. Different
// Ollama converter versions produced different quirks (4B/12B/27B have
// embedded vision + mm KVs; 1B uses non-standard rope key names; all of
// them omit layer_norm_rms_epsilon). Any single marker trips detection.
// Returns true if this looks like an Ollama-format gemma3 blob. Requires
// the file to declare itself gemma3 (general.architecture == "gemma3"),
// AND to exhibit at least one Ollama quirk. Different Ollama converter
// versions produced different quirks (4B/12B/27B have embedded vision +
// mm KVs; 1B uses non-standard rope key names; all of them omit
// layer_norm_rms_epsilon).
bool detect_ollama_gemma3(const gguf_context * meta, const ggml_context * ctx) {
    // Claim #1: the file is gemma3.
    const int64_t arch_kid = gguf_find_key(meta, "general.architecture");
    if (arch_kid < 0) return false;
    if (std::strcmp(gguf_get_val_str(meta, arch_kid), "gemma3") != 0) return false;
    // Claim #2: at least one Ollama-ism. An upstream-converted gemma3 file
    // would have none of these; in particular, upstream never ships v./mm.
    // tensors in the text file (they live in a separate mmproj).
    return has_key(meta, "gemma3.mm.tokens_per_image")
        || any_tensor_with_prefix(ctx, "v.")
        || any_tensor_with_prefix(ctx, "mm.")
@@ -213,6 +223,87 @@ void handle_gemma3(const llama_model_loader * ml, gguf_context * meta, ggml_cont
    // already have the +1 shift baked in, same as upstream's convert_hf.
}
// -------------------------------------------------------------------------
// qwen35moe (text side)
// -------------------------------------------------------------------------
bool detect_ollama_qwen35moe(const gguf_context * meta, const ggml_context * ctx) {
    // Strongest markers: vision KVs live in-file (upstream splits to mmproj)
    // or MTP tensors are present (upstream strips them).
    if (has_key(meta, "qwen35moe.vision.block_count")) return true;
    if (has_key(meta, "qwen35moe.image_token_id")) return true;
    if (has_key(meta, "qwen35moe.ssm.v_head_reordered")) return true;
    if (has_key(meta, "qwen35moe.feed_forward_length")) return true; // upstream omits this key; Ollama stores it (as 0)
    if (has_key(meta, "qwen35moe.rope.mrope_interleaved")) return true;
    if (any_tensor_with_prefix(ctx, "mtp.")) return true;
    if (any_tensor_with_prefix(ctx, "v.")) return true;
    // Scalar-vs-array: upstream writes head_count_kv as a UINT32 scalar;
    // Ollama wrote it as a per-layer array. We don't key detection off the
    // value type (has_key can't see it); instead, whenever any marker above
    // fires, the handler normalizes the key. Missing an Ollama blob here
    // would mean a type-mismatch crash downstream, which is worse than
    // over-detecting.
    return false;
}
void handle_qwen35moe(const llama_model_loader * ml, gguf_context * meta, ggml_context * ctx) {
    if (!detect_ollama_qwen35moe(meta, ctx)) return;
    LLAMA_LOG_INFO("%s: detected Ollama-format qwen35moe GGUF; applying compatibility fixes\n", __func__);
    // 1. attention.head_count_kv — upstream expects UINT32; Ollama wrote
    //    an array (one entry per layer, 0 for SSM layers, 2 for attention
    //    layers). Collapse to the max non-zero value.
    {
        const int64_t kid = gguf_find_key(meta, "qwen35moe.attention.head_count_kv");
        if (kid >= 0 && gguf_get_kv_type(meta, kid) == GGUF_TYPE_ARRAY) {
            const size_t n = gguf_get_arr_n(meta, kid);
            const auto * arr = static_cast<const uint32_t *>(gguf_get_arr_data(meta, kid));
            uint32_t max_kv = 0;
            for (size_t i = 0; i < n; ++i) if (arr[i] > max_kv) max_kv = arr[i];
            if (max_kv == 0) max_kv = 2; // safety fallback
            gguf_remove_key(meta, "qwen35moe.attention.head_count_kv");
            gguf_set_val_u32(meta, "qwen35moe.attention.head_count_kv", max_kv);
        }
    }
    // 2. rope.dimension_sections — upstream expects a 4-element array
    //    (M-RoPE convention); Ollama wrote 3 elements. Pad with a trailing 0.
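    //    (The four sections split the rotary dims across the M-RoPE axes;
    //    a zero-width trailing section is valid and simply unused, so the
    //    pad is behavior-preserving.)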
    {
        const int64_t kid = gguf_find_key(meta, "qwen35moe.rope.dimension_sections");
        if (kid >= 0 && gguf_get_kv_type(meta, kid) == GGUF_TYPE_ARRAY
            && gguf_get_arr_n(meta, kid) == 3) {
            const auto * src = static_cast<const int32_t *>(gguf_get_arr_data(meta, kid));
            const int32_t padded[4] = { src[0], src[1], src[2], 0 };
            gguf_set_arr_data(meta, "qwen35moe.rope.dimension_sections",
                              GGUF_TYPE_INT32, padded, 4);
        }
    }
    // 3. Tensor rename: Ollama's `blk.N.ssm_dt` corresponds to upstream's
    //    `blk.N.ssm_dt.bias` (same shape, F32 [32]). 40 layers.
    {
        std::vector<std::string> targets;
        const int64_t n = gguf_get_n_tensors(meta);
        static const char suffix[] = ".ssm_dt";
        const size_t slen = sizeof(suffix) - 1;
        for (int64_t i = 0; i < n; ++i) {
            std::string name(gguf_get_tensor_name(meta, i));
            if (name.size() >= slen
                && name.compare(name.size() - slen, slen, suffix) == 0) {
                targets.push_back(std::move(name));
            }
        }
        for (const auto & from : targets) {
            rename_tensor(meta, ctx, from.c_str(), (from + ".bias").c_str());
        }
    }
    // 4. Drop embedded vision + MTP + projector tensors from the text loader.
    //    (vision goes to clip via --mmproj; MTP isn't used by upstream.)
    add_skip_prefix(ml, "v.");
    add_skip_prefix(ml, "mm.");
    add_skip_prefix(ml, "mtp.");
}
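// Note: has_key and any_tensor_with_prefix are small helpers defined
// elsewhere in llama/compat (not shown in this diff). A minimal sketch of
// what they are assumed to wrap, in terms of the public gguf/ggml API:
//
//     bool has_key(const gguf_context * meta, const char * key) {
//         return gguf_find_key(meta, key) >= 0;
//     }
//
//     bool any_tensor_with_prefix(const ggml_context * ctx, const char * prefix) {
//         const size_t plen = strlen(prefix);
//         for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr;
//              t = ggml_get_next_tensor(ctx, t)) {
//             if (strncmp(t->name, prefix, plen) == 0) return true;
//         }
//         return false;
//     }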
// -------------------------------------------------------------------------
// gemma3 (clip side)
// -------------------------------------------------------------------------
@@ -281,7 +372,8 @@ void translate_metadata(const llama_model_loader * ml,
                        ggml_context * ctx,
                        std::string & arch_name) {
    if (!meta) return;
    if (arch_name == "gemma3")    handle_gemma3(ml, meta, ctx);
    if (arch_name == "qwen35moe") handle_qwen35moe(ml, meta, ctx);
    // Dispatch. Add more arches as they are wired up.
}


@@ -424,11 +424,22 @@ func NewLlamaServerRunner(
    _, isEmbedding := f.KV()[fmt.Sprintf("%s.pooling_type", f.KV().Architecture())]
    // Older Ollama-format GGUFs store vision tensors (v.*, mm.*) inline in
    // the main model file rather than in a separate projector layer. Detect
    // this case and point --mmproj at the model itself — the in-process
    // llama.cpp compat shim translates the same file into both a text-only
    // view and a clip-mmproj view. See llama/compat/ for details.
    if len(projectors) == 0 && len(f.Tensors().Items("v.")) > 0 {
    // the main model file rather than in a separate projector layer. When
    // the arch has a llama/compat clip handler, we can point --mmproj at
    // the same file and the in-process shim translates the two views.
    //
    // If we auto-enable --mmproj for an arch whose clip handler doesn't
    // exist yet, upstream's clip loader sees un-translated Ollama tensors
    // and aborts model load. So gate on an explicit allowlist that mirrors
    // the compat layer's clip-side coverage in llama/compat/.
    compatClipArches := map[string]bool{
        "gemma3": true,
        // Add entries as llama/compat grows clip handlers.
        // "qwen35moe": true,
    }
    if len(projectors) == 0 &&
        len(f.Tensors().Items("v.")) > 0 &&
        compatClipArches[f.KV().Architecture()] {
        projectors = []string{modelPath}
    }