create: keep linear_attn in_proj_qkv and in_proj_z in BF16 for NVFP4/MXFP4/MXFP8
The fused QKV projection for linear attention layers has K-projection rows with small magnitudes (0.01-0.04) in early layers. When quantized to NVFP4 with group_size=16, the global scale is too large for these values, rounding them to the FP4 zero codepoint. Layer 0 shows 100% zeroed K-rows; layer 1 shows ~88%. This corrupts the recurrent state and produces garbage output.

Add .linear_attn.in_proj_qkv.weight and .linear_attn.in_proj_z.weight to the BF16 exemption list in qwen35ShouldKeepBF16ForDirectNonAffine. The upstream source (RedHatAI/Qwen3.6-35B-A3B-NVFP4) keeps all linear_attn projections in BF16, with only the bulk MoE expert weights in NVFP4. This aligns the exemption list with that recipe: in_proj_a and in_proj_b were already covered; in_proj_qkv and in_proj_z were missing.

Fixes #15866
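For intuition, here is a minimal, self-contained sketch of the failure mode. This is not ollama's quantizer: the group of 8 values, the sample magnitudes, and the single shared scale are illustrative assumptions standing in for NVFP4's group_size=16 scaling.

package main

import (
	"fmt"
	"math"
)

// fp4Levels are the non-negative E2M1 codepoint magnitudes used by FP4 formats.
var fp4Levels = []float64{0, 0.5, 1, 1.5, 2, 3, 4, 6}

// quantizeFP4 divides v by scale, snaps to the nearest codepoint, and rescales.
func quantizeFP4(v, scale float64) float64 {
	x := math.Abs(v) / scale
	best := fp4Levels[0]
	for _, l := range fp4Levels {
		if math.Abs(x-l) < math.Abs(x-best) {
			best = l
		}
	}
	return math.Copysign(best*scale, v)
}

func main() {
	// One group from a fused QKV row: Q/V entries near 1.0, K entries near 0.02.
	group := []float64{0.9, -1.1, 0.02, 0.015, -0.03, 1.0, 0.04, -0.01}

	// Scale chosen so the group's max magnitude maps to the top codepoint (6).
	maxAbs := 0.0
	for _, v := range group {
		maxAbs = math.Max(maxAbs, math.Abs(v))
	}
	scale := maxAbs / 6

	for _, v := range group {
		fmt.Printf("%+.3f -> %+.3f\n", v, quantizeFP4(v, scale))
	}
	// With scale ≈ 0.183, entries like 0.02 land at ≈ 0.109 in codepoint space
	// and round to the zero codepoint, wiping the K projection while Q/V survive.
}

Keeping these projections in BF16 sidesteps the problem entirely, rather than trying to tune scales per row.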
parent c7c2837c96
commit 0e1a7b4700

2 changed files with 8 additions and 0 deletions
@@ -1544,6 +1544,8 @@ func TestCreateSafetensorsModel_Qwen35DirectNonAffineKeepsSensitiveWeightsBF16(t
 		st.NewTensorDataFromBytes("lm_head.weight", "BF16", []int32{64, 64}, make([]byte, 64*64*2)),
 		st.NewTensorDataFromBytes("model.language_model.layers.0.linear_attn.in_proj_a.weight", "BF16", []int32{32, 64}, make([]byte, 32*64*2)),
 		st.NewTensorDataFromBytes("model.language_model.layers.0.linear_attn.in_proj_b.weight", "BF16", []int32{32, 64}, make([]byte, 32*64*2)),
+		st.NewTensorDataFromBytes("model.language_model.layers.0.linear_attn.in_proj_qkv.weight", "BF16", []int32{64, 64}, make([]byte, 64*64*2)),
+		st.NewTensorDataFromBytes("model.language_model.layers.0.linear_attn.in_proj_z.weight", "BF16", []int32{64, 64}, make([]byte, 64*64*2)),
 		st.NewTensorDataFromBytes("model.language_model.layers.0.mlp.gate.weight", "BF16", []int32{64, 64}, make([]byte, 64*64*2)),
 		st.NewTensorDataFromBytes("model.language_model.layers.0.mlp.shared_expert_gate.weight", "BF16", []int32{1, 64}, make([]byte, 64*2)),
 		st.NewTensorDataFromBytes("model.language_model.layers.0.self_attn.q_proj.weight", "BF16", []int32{64, 64}, make([]byte, 64*64*2)),
@@ -1598,6 +1600,8 @@ func TestCreateSafetensorsModel_Qwen35DirectNonAffineKeepsSensitiveWeightsBF16(t
 		"language_model.lm_head.weight",
 		"language_model.model.layers.0.linear_attn.in_proj_a.weight",
 		"language_model.model.layers.0.linear_attn.in_proj_b.weight",
+		"language_model.model.layers.0.linear_attn.in_proj_qkv.weight",
+		"language_model.model.layers.0.linear_attn.in_proj_z.weight",
 		"language_model.model.layers.0.mlp.gate.weight",
 		"language_model.model.layers.0.mlp.shared_expert_gate.weight",
 	} {
@@ -99,6 +99,10 @@ func qwen35ShouldKeepBF16ForDirectNonAffine(name string) bool {
 		return true
 	case strings.HasSuffix(name, ".linear_attn.in_proj_ba.weight"):
 		return true
+	case strings.HasSuffix(name, ".linear_attn.in_proj_qkv.weight"):
+		return true
+	case strings.HasSuffix(name, ".linear_attn.in_proj_z.weight"):
+		return true
 	case strings.HasSuffix(name, ".mlp.gate.weight") && !strings.Contains(name, "_proj"):
 		return true
 	case strings.HasSuffix(name, ".mlp.shared_expert_gate.weight"):
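For context, a hypothetical sketch of how a caller could consult this predicate when picking a per-tensor type; tensorTypeFor is an assumption for illustration, not ollama's actual conversion code path.

// tensorTypeFor returns the storage type for one tensor: exempted weights stay
// BF16, everything else gets the requested quantized type.
func tensorTypeFor(name, requested string) string {
	if qwen35ShouldKeepBF16ForDirectNonAffine(name) {
		return "BF16" // sensitive linear_attn / gate weights stay unquantized
	}
	return requested // e.g. "NVFP4", "MXFP4", "MXFP8" for bulk MoE expert weights
}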