glm 4.7 flash support on experimental engine (#13838)

2026-05-13 06:21:28 +00:00 · 2026-02-02 15:22:11 -08:00 · 2026-02-02 15:22:11 -08:00 · d8cc798c2b
commit d8cc798c2b
parent 6582f6da5c
33 changed files with 3879 additions and 2406 deletions
--- a/x/server/show.go
+++ b/x/server/show.go
@ -163,9 +163,18 @@ func GetSafetensorsTensorInfo(name model.Name) ([]api.Tensor, error) {

 // getTensorInfoFromManifest extracts tensor info from a manifest.
 // This is separated for testability.
+// For quantized models, groups weight/scale/qbias into single entries with detected quantization type.
 func getTensorInfoFromManifest(mf *manifest.Manifest) ([]api.Tensor, error) {
 	var tensors []api.Tensor

+	// First pass: collect all tensor info and identify scale tensors
+	type tensorData struct {
+		info   *safetensorsTensorInfo
+		digest string
+	}
+	tensorMap := make(map[string]*tensorData)
+	scaleMap := make(map[string]*tensorData) // base name -> scale tensor info
+
 	for _, layer := range mf.Layers {
 		if layer.MediaType != manifest.MediaTypeImageTensor {
 			continue
@ -178,28 +187,96 @@ func getTensorInfoFromManifest(mf *manifest.Manifest) ([]api.Tensor, error) {
 		}
 		info, err := readSafetensorsHeader(blobPath)
 		if err != nil {
-			// Skip tensors we can't read
 			continue
 		}

-		// Convert shape from int to uint64
-		shape := make([]uint64, len(info.Shape))
-		for i, s := range info.Shape {
-			shape[i] = uint64(s)
+		td := &tensorData{info: info, digest: layer.Digest}
+
+		if strings.HasSuffix(layer.Name, "_scale") {
+			baseName := strings.TrimSuffix(layer.Name, "_scale")
+			scaleMap[baseName] = td
+		} else if strings.HasSuffix(layer.Name, "_qbias") {
+			// Skip qbias tensors - they're included with the quantized weight
+			continue
+		} else {
+			tensorMap[layer.Name] = td
+		}
+	}
+
+	// Second pass: build tensor list with quantization info
+	for _, layer := range mf.Layers {
+		if layer.MediaType != manifest.MediaTypeImageTensor {
+			continue
 		}

-		tensors = append(tensors, api.Tensor{
-			Name:  layer.Name,
-			Type:  info.Dtype,
-			Shape: shape,
-		})
+		// Skip scale and qbias tensors
+		if strings.HasSuffix(layer.Name, "_scale") || strings.HasSuffix(layer.Name, "_qbias") {
+			continue
+		}
+
+		td := tensorMap[layer.Name]
+		if td == nil {
+			continue
+		}
+
+		// Check if this tensor has a corresponding scale tensor (quantized)
+		scaleTd := scaleMap[layer.Name]
+		if scaleTd != nil && len(td.info.Shape) >= 2 && len(scaleTd.info.Shape) >= 2 {
+			// Quantized tensor - detect bits from shapes
+			weightCols := td.info.Shape[len(td.info.Shape)-1]
+			scaleCols := scaleTd.info.Shape[len(scaleTd.info.Shape)-1]
+
+			// Detect quantization: Q4 has pack_factor=8, Q8 has pack_factor=4
+			// Q4 uses group_size=32: weightCols * 8 / scaleCols = 32
+			// Q8 uses group_size=64: weightCols * 4 / scaleCols = 64
+			var bits int
+			var quantType string
+			if weightCols*8/scaleCols == 32 {
+				bits = 4
+				quantType = "Q4"
+			} else if weightCols*4/scaleCols == 64 {
+				bits = 8
+				quantType = "Q8"
+			} else {
+				// Unknown quantization, show raw
+				quantType = td.info.Dtype
+			}
+
+			// Calculate unpacked shape
+			shape := make([]uint64, len(td.info.Shape))
+			for i, s := range td.info.Shape {
+				shape[i] = uint64(s)
+			}
+			if bits > 0 {
+				packFactor := int64(32 / bits)
+				shape[len(shape)-1] = uint64(td.info.Shape[len(td.info.Shape)-1] * packFactor)
+			}
+
+			tensors = append(tensors, api.Tensor{
+				Name:  layer.Name,
+				Type:  quantType,
+				Shape: shape,
+			})
+		} else {
+			// Non-quantized tensor
+			shape := make([]uint64, len(td.info.Shape))
+			for i, s := range td.info.Shape {
+				shape[i] = uint64(s)
+			}
+
+			tensors = append(tensors, api.Tensor{
+				Name:  layer.Name,
+				Type:  td.info.Dtype,
+				Shape: shape,
+			})
+		}
 	}

 	return tensors, nil
 }

 // GetSafetensorsDtype returns the quantization type for a safetensors model.
-// If the model is quantized (has _scale tensors), returns the quantization type (e.g., "FP8").
+// Reads the quantization from model_index.json first; failing that, a model with _scale tensors has its type inferred from tensor names.
 // Otherwise returns the torch_dtype from config.json.
 func GetSafetensorsDtype(name model.Name) (string, error) {
 	mf, err := manifest.ParseNamedManifest(name)
@ -207,16 +284,38 @@ func GetSafetensorsDtype(name model.Name) (string, error) {
 		return "", fmt.Errorf("failed to load manifest: %w", err)
 	}

-	// Check if model is quantized by looking for _scale tensors
+	// First try to read quantization from model_index.json
+	var modelIndex struct {
+		Quantization string `json:"quantization"`
+	}
+	if err := mf.ReadConfigJSON("model_index.json", &modelIndex); err == nil && modelIndex.Quantization != "" {
+		return modelIndex.Quantization, nil
+	}
+
+	// Fallback: detect from tensor names
+	hasScales := false
+	hasQBias := false
 	for _, layer := range mf.Layers {
 		if layer.MediaType == manifest.MediaTypeImageTensor {
 			if strings.HasSuffix(layer.Name, "_scale") {
-				// Model is quantized - return FP8 (affine quantization)
-				return "FP8", nil
+				hasScales = true
+			}
+			if strings.HasSuffix(layer.Name, "_qbias") {
+				hasQBias = true
 			}
 		}
 	}

+	if hasScales {
+		if hasQBias {
+			// Affine mode (has scale + qbias) - could be Q4 or Q8
+			// Default to Q4 as it's more common
+			return "Q4", nil
+		}
+		// No qbias = NVFP4
+		return "NVFP4", nil
+	}
+
 	// Not quantized - return torch_dtype from config.json
 	var cfg struct {
 		TorchDtype string `json:"torch_dtype"`