create: keep linear_attn in_proj_qkv and in_proj_z in BF16 for NVFP4/MXFP4/MXFP8

The fused QKV projection in linear attention layers has K-projection rows
with small weight magnitudes (0.01-0.04) in early layers. When quantized to
NVFP4 with group_size=16, the global scale is too large for these values
and rounds them to the FP4 zero codepoint. Layer 0 shows 100% of K-rows
zeroed; layer 1 shows ~88%. The zeroed rows corrupt the recurrent state and
produce garbage output.
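
As an illustration (a minimal sketch, not the converter's actual
quantization code; the single tensor-wide scale and the sample weights are
assumed), FP4 (E2M1) represents only the magnitudes {0, 0.5, 1, 1.5, 2, 3,
4, 6} times the scale, so once the scale is sized for the large rows of the
fused tensor, any weight below 0.25x the scale snaps to the zero codepoint:

package main

import (
	"fmt"
	"math"
)

// Non-negative magnitudes representable in FP4 (E2M1), as multiples of the scale.
var fp4Levels = []float64{0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0}

// quantizeFP4 rounds x to the nearest representable FP4 value under the given scale.
func quantizeFP4(x, scale float64) float64 {
	v := math.Abs(x) / scale
	best := 0.0
	for _, level := range fp4Levels {
		if math.Abs(v-level) < math.Abs(v-best) {
			best = level
		}
	}
	return math.Copysign(best*scale, x)
}

func main() {
	// Assumed scale, sized for the large Q/V rows of the fused tensor (amax ~6 -> scale 1.0).
	scale := 1.0
	// Assumed sample weights for a small-magnitude early-layer K row.
	kRow := []float64{0.013, -0.027, 0.038, -0.009}
	for _, w := range kRow {
		fmt.Printf("%+.3f -> %.3f\n", w, quantizeFP4(w, scale)) // every element hits the zero codepoint
	}
}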

Add .linear_attn.in_proj_qkv.weight and .linear_attn.in_proj_z.weight to
the BF16 exemption list in qwen35ShouldKeepBF16ForDirectNonAffine. The
upstream source (RedHatAI/Qwen3.6-35B-A3B-NVFP4) keeps all linear_attn
projections in BF16, with only the bulk MoE expert weights in NVFP4. This
aligns the exemption list with that recipe: in_proj_a and in_proj_b were
already covered; in_proj_qkv and in_proj_z were missing.
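
For reference, a self-contained sketch of the new matching behavior (the
stub and tensor names below mirror the test; it is not the converter's
full function):

package main

import (
	"fmt"
	"strings"
)

// Stub mirroring only the two suffix cases added here; the real
// qwen35ShouldKeepBF16ForDirectNonAffine carries the full exemption list.
func keepBF16(name string) bool {
	return strings.HasSuffix(name, ".linear_attn.in_proj_qkv.weight") ||
		strings.HasSuffix(name, ".linear_attn.in_proj_z.weight")
}

func main() {
	for _, name := range []string{
		"model.language_model.layers.0.linear_attn.in_proj_qkv.weight",
		"model.language_model.layers.0.linear_attn.in_proj_z.weight",
	} {
		fmt.Println(name, "->", keepBF16(name)) // both now report true and stay BF16
	}
}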

Fixes #15866
ArkaD171717 2026-04-30 11:05:00 -05:00 committed by Arkadeep Dutta
parent c7c2837c96
commit 0e1a7b4700
2 changed files with 8 additions and 0 deletions

@@ -1544,6 +1544,8 @@ func TestCreateSafetensorsModel_Qwen35DirectNonAffineKeepsSensitiveWeightsBF16(t
 		st.NewTensorDataFromBytes("lm_head.weight", "BF16", []int32{64, 64}, make([]byte, 64*64*2)),
 		st.NewTensorDataFromBytes("model.language_model.layers.0.linear_attn.in_proj_a.weight", "BF16", []int32{32, 64}, make([]byte, 32*64*2)),
 		st.NewTensorDataFromBytes("model.language_model.layers.0.linear_attn.in_proj_b.weight", "BF16", []int32{32, 64}, make([]byte, 32*64*2)),
+		st.NewTensorDataFromBytes("model.language_model.layers.0.linear_attn.in_proj_qkv.weight", "BF16", []int32{64, 64}, make([]byte, 64*64*2)),
+		st.NewTensorDataFromBytes("model.language_model.layers.0.linear_attn.in_proj_z.weight", "BF16", []int32{64, 64}, make([]byte, 64*64*2)),
 		st.NewTensorDataFromBytes("model.language_model.layers.0.mlp.gate.weight", "BF16", []int32{64, 64}, make([]byte, 64*64*2)),
 		st.NewTensorDataFromBytes("model.language_model.layers.0.mlp.shared_expert_gate.weight", "BF16", []int32{1, 64}, make([]byte, 64*2)),
 		st.NewTensorDataFromBytes("model.language_model.layers.0.self_attn.q_proj.weight", "BF16", []int32{64, 64}, make([]byte, 64*64*2)),
@@ -1598,6 +1600,8 @@ func TestCreateSafetensorsModel_Qwen35DirectNonAffineKeepsSensitiveWeightsBF16(t
 		"language_model.lm_head.weight",
 		"language_model.model.layers.0.linear_attn.in_proj_a.weight",
 		"language_model.model.layers.0.linear_attn.in_proj_b.weight",
+		"language_model.model.layers.0.linear_attn.in_proj_qkv.weight",
+		"language_model.model.layers.0.linear_attn.in_proj_z.weight",
 		"language_model.model.layers.0.mlp.gate.weight",
 		"language_model.model.layers.0.mlp.shared_expert_gate.weight",
 	} {

@@ -99,6 +99,10 @@ func qwen35ShouldKeepBF16ForDirectNonAffine(name string) bool {
 		return true
 	case strings.HasSuffix(name, ".linear_attn.in_proj_ba.weight"):
 		return true
+	case strings.HasSuffix(name, ".linear_attn.in_proj_qkv.weight"):
+		return true
+	case strings.HasSuffix(name, ".linear_attn.in_proj_z.weight"):
+		return true
 	case strings.HasSuffix(name, ".mlp.gate.weight") && !strings.Contains(name, "_proj"):
 		return true
 	case strings.HasSuffix(name, ".mlp.shared_expert_gate.weight"):