mirror of
https://github.com/ollama/ollama.git
synced 2026-05-13 14:27:00 +00:00
llama/compat: apply LLaMA RoPE permute to mistral3 vision Q/K
The previous mistral3 vision handler loaded but produced hallucinated
descriptions of real images (e.g. described a clear photo of a person
in a suit handing money as "abstract hexagonal pattern"). Solid colors
and large color blocks worked, suggesting the vision tower was producing
*some* signal but the spatial/feature relationships were scrambled.
Root cause: Ollama's mistral3 converter only applies its LLaMA-style
RoPE repack to TEXT-side `attn_q`/`attn_k` tensors — the
`if !HasPrefix(name, "v.")` guard in convert/convert_mistral.go::Tensors
skips vision tensors entirely. Vision Q/K therefore leave the converter
in raw HF/PyTorch order. Upstream's HF→GGUF flow does permute vision
Q/K with the vision head count, because pixtral's clip graph uses
`ggml_rope_ext` in mode 0 which expects the [n_head, head_dim/2, 2, ...]
LLaMA layout.
Fix: register a load-time op that applies LlamaModel.permute equivalently
on each F16 vision attn_q.weight / attn_k.weight (24 layers × 2 = 48
tensors for ministral-3 8B). Capture file offsets *before* renames
invalidate them, same pattern as promote_tensor_to_f32 and
register_concat_load.
Verified against the upstream-format migration mmproj (which gets the
same image right) — both 8B Ollama+compat and 14B migration now produce
matching, accurate descriptions ("hand giving money to person in suit,
cartoonish star/burst above their head, blue grid background").
This commit is contained in:
parent
63bde9ff73
commit
3a57b89d54
1 changed file with 80 additions and 0 deletions
80
llama/compat/llama-ollama-compat.cpp
vendored
80
llama/compat/llama-ollama-compat.cpp
vendored
|
|
@ -546,6 +546,61 @@ constexpr std::pair<const char *, const char *> kMistral3ClipRenames[] = {
|
|||
{"mm.norm", "mm.input_norm"},
|
||||
};
|
||||
|
||||
// Apply the LLaMA-style RoPE permutation to Ollama's vision Q/K weight.
|
||||
//
|
||||
// Ollama's mistral3 converter (convert/convert_mistral.go) only applies
|
||||
// its repack to TEXT-side attn_q/attn_k (the `if !HasPrefix(name, "v.")`
|
||||
// guard skips vision tensors). So vision Q/K leave the converter in raw
|
||||
// HF/PyTorch order. Upstream's HF→GGUF flow (convert_hf_to_gguf.py
|
||||
// Mistral3 path) DOES permute vision Q/K with the vision head count,
|
||||
// because pixtral's clip graph uses `ggml_rope_ext` in mode 0 which
|
||||
// expects the [n_head, head_dim/2, 2, ...] layout.
|
||||
//
|
||||
// To bridge the two: apply LlamaModel.permute equivalently — reshape
|
||||
// to [n_head, 2, head_dim/2, in], swap axes 1↔2, reshape back. The
|
||||
// permutation acts only on the output dim, which is ne[1] for ggml
|
||||
// weights stored as [in_dim, out_dim], so we shuffle whole rows.
|
||||
//
|
||||
// Permutation formula: oa = h*head_dim + dp*2 + half (post-permute idx)
|
||||
// ob = h*head_dim + half*(head_dim/2) + dp (HF idx)
|
||||
// copy row ob in src → row oa in dst.
|
||||
//
|
||||
// Only F16 Q/K rows handled (V is not RoPE'd; quantized rows would need
|
||||
// block-aware shuffling — Ollama keeps Q/K F16 for mistral3 8B).
|
||||
void register_mistral3_vision_qk_permute(gguf_context * meta, ggml_context * ctx,
|
||||
const char * tensor_name, int n_head) {
|
||||
ggml_tensor * t = ggml_get_tensor(ctx, tensor_name);
|
||||
if (!t || t->type != GGML_TYPE_F16) return;
|
||||
|
||||
const int total_out = (int) t->ne[1];
|
||||
if (total_out % n_head != 0) return;
|
||||
const size_t row_bytes = ggml_row_size(t->type, t->ne[0]);
|
||||
const size_t total_bytes = ggml_nbytes(t);
|
||||
const size_t src_offset = tensor_file_offset(meta, tensor_name);
|
||||
|
||||
const int head_dim = total_out / n_head;
|
||||
const int head_dim2 = head_dim / 2;
|
||||
|
||||
register_load_op(tensor_name, LoadOp{
|
||||
[=](const char * path, void * dst, size_t dst_size) {
|
||||
if (dst_size != total_bytes) return false;
|
||||
std::vector<uint8_t> src(total_bytes);
|
||||
if (!read_at(path, src_offset, src.data(), total_bytes)) return false;
|
||||
uint8_t * dp = static_cast<uint8_t *>(dst);
|
||||
for (int oa = 0; oa < total_out; ++oa) {
|
||||
const int h = oa / head_dim;
|
||||
const int dp_ = (oa % head_dim) / 2;
|
||||
const int hf = oa % 2;
|
||||
const int ob = h * head_dim + hf * head_dim2 + dp_;
|
||||
std::memcpy(dp + (size_t) oa * row_bytes,
|
||||
src.data() + (size_t) ob * row_bytes, row_bytes);
|
||||
}
|
||||
return true;
|
||||
},
|
||||
"vision Q/K LLaMA permute",
|
||||
});
|
||||
}
|
||||
|
||||
void handle_mistral3_clip(gguf_context * meta, ggml_context * ctx) {
|
||||
LLAMA_LOG_INFO("%s: detected Ollama-format mistral3 GGUF used as mmproj; translating\n", __func__);
|
||||
|
||||
|
|
@ -591,9 +646,34 @@ void handle_mistral3_clip(gguf_context * meta, ggml_context * ctx) {
|
|||
});
|
||||
}
|
||||
|
||||
// Apply LLaMA-style RoPE permutation to vision Q/K BEFORE renames
|
||||
// (we capture offsets by current name). Ollama's converter only
|
||||
// repacks TEXT-side q/k (skipping `v.*`), but pixtral's clip graph
|
||||
// expects HF→GGUF's permuted layout for vision Q/K.
|
||||
{
|
||||
const int64_t v_hk = gguf_find_key(meta, "mistral3.vision.attention.head_count");
|
||||
const int64_t n_blk_k = gguf_find_key(meta, "mistral3.vision.block_count");
|
||||
if (v_hk >= 0 && n_blk_k >= 0) {
|
||||
const int n_head = (int) gguf_get_val_u32(meta, v_hk);
|
||||
const uint32_t n_blocks = gguf_get_val_u32(meta, n_blk_k);
|
||||
for (uint32_t b = 0; b < n_blocks; ++b) {
|
||||
char qn[64], kn[64];
|
||||
std::snprintf(qn, sizeof(qn), "v.blk.%u.attn_q.weight", b);
|
||||
std::snprintf(kn, sizeof(kn), "v.blk.%u.attn_k.weight", b);
|
||||
register_mistral3_vision_qk_permute(meta, ctx, qn, n_head);
|
||||
register_mistral3_vision_qk_permute(meta, ctx, kn, n_head);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (const auto & [from, to] : kMistral3ClipRenames) {
|
||||
rename_tensors_containing(meta, ctx, from, to);
|
||||
}
|
||||
|
||||
// Upstream stores patch_embd as F32; Ollama stored F16. Metal's
|
||||
// IM2COL convolution silently produces garbage with F16 weights
|
||||
// (same issue as gemma3 — see handle_gemma3_clip). Promote to F32.
|
||||
promote_tensor_to_f32(meta, ctx, "v.patch_embd.weight");
|
||||
}
|
||||
|
||||
} // anonymous namespace
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue