llama/compat: add nemotron_h_moe handler (latent FFN + MTP skip)
Covers two variants of the same arch:
- nemotron-3-super (120B-A12B): uses a latent-FFN variant where
  experts operate in a compressed `moe_latent_size`-dim space rather
  than the full embed dim. Inject `nemotron_h_moe.moe_latent_size`
  from the first `ffn_latent_in.weight`'s ne[1] (see the shape sketch
  after this list), and rename Ollama's `ffn_latent_in`/`ffn_latent_out`
  → upstream's `ffn_latent_down`/`ffn_latent_up`.
- nemotron-cascade-2 (30B-A3B): no latent FFN; loads with just the
MTP skip described below.
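
A toy shape check for the ne[1] derivation referenced above: GGML
stores a matmul weight that maps n_embd → latent as a 2-D tensor with
ne[0] = input dim and ne[1] = output dim, so the latent size falls
straight out of ne[1]. The dims below are made up for illustration:

```cpp
#include <cstdio>
#include "ggml.h"

int main() {
    ggml_init_params ip = { /*mem_size   =*/ 16u * 1024 * 1024,
                            /*mem_buffer =*/ nullptr,
                            /*no_alloc   =*/ true };   // metadata only, no tensor data
    ggml_context * ctx = ggml_init(ip);

    const int64_t n_embd = 8192, latent = 2048;        // hypothetical dims
    ggml_tensor * w = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, n_embd, latent);
    ggml_set_name(w, "blk.0.ffn_latent_in.weight");

    std::printf("moe_latent_size = %lld\n", (long long) w->ne[1]);   // prints 2048
    ggml_free(ctx);
    return 0;
}
```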
Both variants ship MTP (Multi-Token Prediction) layers that Ollama
emits as one tensor per expert
(`mtp.layers.X.mixer.experts.Y.{up,down}_proj.weight`; 1040 extras on
the 120B). Upstream's nemotron_h_moe loader doesn't claim them, so
without the skip they trigger
`done_getting_tensors: wrong number of tensors` (a toy sketch of that
check follows).
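
A minimal sketch of the count check, with simplified names (the real
check lives in llama.cpp's model loader, `done_getting_tensors`, which
compares claimed against present tensors and throws on mismatch):

```cpp
#include <stdexcept>

struct toy_loader {
    int n_tensors = 0;  // tensors present in the GGUF (minus skipped prefixes)
    int n_created = 0;  // tensors the arch's tensor map actually claimed

    void done_getting_tensors() const {
        if (n_created != n_tensors) {
            // the ~1040 unclaimed mtp.* tensors land here unless the
            // compat layer removes them from n_tensors via a skip prefix
            throw std::runtime_error("done_getting_tensors: wrong number of tensors");
        }
    }
};
```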
Also tested:
- nomic-embed-text-v2-moe (`nomic-bert-moe`): loads + serves 768-dim
embeddings without any compat. No handler needed.
commit 2c7850dbaf (parent 99cb874396)
2 changed files with 57 additions and 4 deletions
@@ -47,6 +47,7 @@ an immediate no-op.
 | `qwen35` | Same fixes as `qwen35moe` (head_count_kv array→scalar, rope dimension_sections pad 3→4, `ssm_dt`→`ssm_dt.bias`, drop `v.*`/`mm.*`/`mtp.*`) but for the non-MoE qwen3.5 (e.g. 9B). Both arches share `apply_qwen35_text_fixes`. | n/a |
 | `gemma4` | Drop `a.*`/`v.*`/`mm.*` (audio + vision + projector) from the text loader. Covers both E2B/E4B (dense) and 26B-A4B (MoE). | n/a |
 | `deepseekocr` | Arch rename `deepseekocr`→`deepseek2-ocr` (incl. KV prefix), inject `expert_feed_forward_length` from `ffn_down_exps` shape, `expert_shared_count` from `ffn_down_shexp` shape, default `attention.layer_norm_rms_epsilon`, drop `s.*`/`v.*`/`mm.*` | Arch rewrite to `clip`, KV synthesis (`clip.vision.*`, `clip.vision.sam.*`, `clip.projector_type=deepseekocr`, defaults for `feed_forward_length`/`projection_dim`/`window_size`/image stats), prefix-only rename `s.*`→`v.sam.*` (substring rename would corrupt `mm.layers`), CLIP leaf renames (`self_attn.{out,qkv}_proj`→`attn_{out,qkv}`, `layer_norm{1,2}`→`ln{1,2}`, `mlp.fc{1,2}`→`ffn_{up,down}`, `pre_layrnorm`→`pre_ln`), SAM leaf renames (`attn.proj`→`attn.out`, `attn.rel_pos_{h,w}`→`attn.pos_{h,w}.weight`, `norm{1,2}`→`{pre,post}_ln`), projector renames (`mm.layers`→`mm.model.fc`, `mm.image_newline`/`view_seperator`→`v.*`), F32 promote of `v.patch_embd.weight`, `v.sam.patch_embd.weight`, `v.position_embd.weight` |
+| `nemotron_h_moe` | For latent-FFN variants (e.g. nemotron-3-super 120B-A12B): inject `moe_latent_size` from `ffn_latent_in.weight` ne[1], rename `ffn_latent_{in,out}`→`ffn_latent_{down,up}`. For all variants: drop `mtp.*` (Multi-Token Prediction tensors that Ollama emits as one-tensor-per-expert; ~1040 extras on the 120B). Standard variants (e.g. nemotron-cascade-2 30B-A3B) load with no rename, only the MTP skip. | n/a |
 
 Usage:
 
llama/compat/llama-ollama-compat.cpp (vendored, 60 changes)
@@ -299,6 +299,57 @@ void handle_deepseekocr(const llama_model_loader * ml, gguf_context * meta,
     add_skip_prefix(ml, "mm.");
 }
 
+// =========================================================================
+// nemotron_h_moe (text only)
+// =========================================================================
+//
+// Same arch name on both sides. Most variants (e.g. nemotron-cascade-2)
+// load as-is. The latent-FFN variants (e.g. nemotron-3-super 120B-A12B)
+// rename `ffn_latent_in` / `ffn_latent_out` to `ffn_latent_down` /
+// `ffn_latent_up`, and need `moe_latent_size` injected (derived from
+// the latent tensor shape).
+
+bool detect_ollama_nemotron_h_moe(const gguf_context * meta, const ggml_context * ctx) {
+    const int64_t arch_kid = gguf_find_key(meta, "general.architecture");
+    if (arch_kid < 0) return false;
+    if (std::strcmp(gguf_get_val_str(meta, arch_kid), "nemotron_h_moe") != 0) return false;
+    return any_tensor_with_prefix(ctx, "blk.1.ffn_latent_in")
+        || any_tensor_with_prefix(ctx, "blk.0.ffn_latent_in")
+        || any_tensor_with_prefix(ctx, "mtp.");
+}
+
+void handle_nemotron_h_moe(const llama_model_loader * ml, gguf_context * meta, ggml_context * ctx) {
+    if (!detect_ollama_nemotron_h_moe(meta, ctx)) return;
+
+    LLAMA_LOG_INFO("%s: detected Ollama-format nemotron_h_moe GGUF; applying compatibility fixes\n", __func__);
+
+    // Inject moe_latent_size for latent-FFN variants (e.g. super 120B-A12B).
+    // Standard variants (e.g. cascade-2 30B-A3B) have no latent tensors and
+    // use n_embd as the MoE inner dim — leave the key absent.
+    if (!has_key(meta, "nemotron_h_moe.moe_latent_size")) {
+        for (uint32_t b = 0; b < 1024; ++b) {
+            char name[64];
+            std::snprintf(name, sizeof(name), "blk.%u.ffn_latent_in.weight", b);
+            if (ggml_tensor * t = ggml_get_tensor(ctx, name)) {
+                gguf_set_val_u32(meta, "nemotron_h_moe.moe_latent_size",
+                                 (uint32_t) t->ne[1]);
+                break;
+            }
+        }
+    }
+
+    // Rename the latent projection tensors to upstream's naming (no-op when
+    // the file has no latent tensors).
+    rename_tensors_containing(meta, ctx, ".ffn_latent_in", ".ffn_latent_down");
+    rename_tensors_containing(meta, ctx, ".ffn_latent_out", ".ffn_latent_up");
+
+    // Drop MTP (Multi-Token Prediction) tensors — Ollama's converter emits
+    // them as one-tensor-per-expert (`mtp.layers.X.mixer.experts.Y.{up,down}_proj`)
+    // which upstream's nemotron_h_moe loader doesn't claim. Total: ~1040 extra
+    // tensors on super 120B.
+    add_skip_prefix(ml, "mtp.");
+}
+
 // =========================================================================
 // gpt-oss (text only)
 // =========================================================================
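
The handler leans on a few file-local helpers this hunk doesn't show
(`any_tensor_with_prefix`, `has_key`, `add_skip_prefix`,
`rename_tensors_containing`). A minimal sketch of the first two,
assuming they are thin wrappers over the public ggml/gguf API; the
real definitions live elsewhere in llama-ollama-compat.cpp:

```cpp
#include <cstring>
#include "ggml.h"
#include "gguf.h"   // gguf API split out of ggml.h in newer trees

static bool any_tensor_with_prefix(const ggml_context * ctx, const char * prefix) {
    const size_t len = std::strlen(prefix);
    // walk every tensor registered in the context, match on name prefix
    for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr;
         t = ggml_get_next_tensor(ctx, t)) {
        if (std::strncmp(ggml_get_name(t), prefix, len) == 0) {
            return true;
        }
    }
    return false;
}

static bool has_key(const gguf_context * meta, const char * key) {
    return gguf_find_key(meta, key) >= 0;
}
```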
@@ -925,10 +976,11 @@ void translate_metadata(const llama_model_loader * ml,
     if (arch_name == "gemma4")         handle_gemma4     (ml, meta, ctx);
     if (arch_name == "qwen35moe")      handle_qwen35moe  (ml, meta, ctx);
     if (arch_name == "qwen35")         handle_qwen35     (ml, meta, ctx);
-    if (arch_name == "gptoss")         handle_gptoss     (ml, meta, ctx, arch_name);
-    if (arch_name == "lfm2")           handle_lfm2       (ml, meta, ctx);
-    if (arch_name == "mistral3")       handle_mistral3   (ml, meta, ctx);
-    if (arch_name == "deepseekocr")    handle_deepseekocr(ml, meta, ctx, arch_name);
+    if (arch_name == "gptoss")         handle_gptoss        (ml, meta, ctx, arch_name);
+    if (arch_name == "lfm2")           handle_lfm2          (ml, meta, ctx);
+    if (arch_name == "mistral3")       handle_mistral3      (ml, meta, ctx);
+    if (arch_name == "deepseekocr")    handle_deepseekocr   (ml, meta, ctx, arch_name);
+    if (arch_name == "nemotron_h_moe") handle_nemotron_h_moe(ml, meta, ctx);
 
     // Dispatch. Add more arches as they are wired up.
 }
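
To sanity-check which path a given file will take before loading it,
one can scan the GGUF tensor table directly. A hypothetical standalone
snippet (not part of the commit; the file name is made up) against the
stock gguf API:

```cpp
#include <cstdio>
#include <cstring>
#include "gguf.h"

int main() {
    // load metadata only; tensor data stays on disk
    gguf_init_params params = { /*no_alloc =*/ true, /*ctx =*/ nullptr };
    gguf_context * meta = gguf_init_from_file("nemotron.gguf", params);  // hypothetical path
    if (!meta) return 1;

    bool has_latent = false, has_mtp = false;
    for (int64_t i = 0; i < gguf_get_n_tensors(meta); ++i) {
        const char * name = gguf_get_tensor_name(meta, i);
        has_latent |= std::strstr(name, ".ffn_latent_in") != nullptr;
        has_mtp    |= std::strncmp(name, "mtp.", 4) == 0;
    }
    std::printf("latent FFN: %d, MTP tensors: %d\n", has_latent, has_mtp);
    gguf_free(meta);
    return 0;
}
```

A file with `.ffn_latent_in` tensors gets the key injection plus the
renames; one with only `mtp.*` gets just the skip.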