mirror of
https://github.com/ollama/ollama.git
synced 2026-05-13 06:21:28 +00:00
glm 4.7 flash support on experimental engine (#13838)
This commit is contained in:
parent
6582f6da5c
commit
d8cc798c2b
33 changed files with 3879 additions and 2406 deletions
127
x/server/show.go
127
x/server/show.go
|
|
@ -163,9 +163,18 @@ func GetSafetensorsTensorInfo(name model.Name) ([]api.Tensor, error) {
|
|||
|
||||
// getTensorInfoFromManifest extracts tensor info from a manifest.
|
||||
// This is separated for testability.
|
||||
// For quantized models, groups weight/scale/qbias into single entries with detected quantization type.
|
||||
func getTensorInfoFromManifest(mf *manifest.Manifest) ([]api.Tensor, error) {
|
||||
var tensors []api.Tensor
|
||||
|
||||
// First pass: collect all tensor info and identify scale tensors
|
||||
type tensorData struct {
|
||||
info *safetensorsTensorInfo
|
||||
digest string
|
||||
}
|
||||
tensorMap := make(map[string]*tensorData)
|
||||
scaleMap := make(map[string]*tensorData) // base name -> scale tensor info
|
||||
|
||||
for _, layer := range mf.Layers {
|
||||
if layer.MediaType != manifest.MediaTypeImageTensor {
|
||||
continue
|
||||
|
|
@ -178,28 +187,96 @@ func getTensorInfoFromManifest(mf *manifest.Manifest) ([]api.Tensor, error) {
|
|||
}
|
||||
info, err := readSafetensorsHeader(blobPath)
|
||||
if err != nil {
|
||||
// Skip tensors we can't read
|
||||
continue
|
||||
}
|
||||
|
||||
// Convert shape from int to uint64
|
||||
shape := make([]uint64, len(info.Shape))
|
||||
for i, s := range info.Shape {
|
||||
shape[i] = uint64(s)
|
||||
td := &tensorData{info: info, digest: layer.Digest}
|
||||
|
||||
if strings.HasSuffix(layer.Name, "_scale") {
|
||||
baseName := strings.TrimSuffix(layer.Name, "_scale")
|
||||
scaleMap[baseName] = td
|
||||
} else if strings.HasSuffix(layer.Name, "_qbias") {
|
||||
// Skip qbias tensors - they're included with the quantized weight
|
||||
continue
|
||||
} else {
|
||||
tensorMap[layer.Name] = td
|
||||
}
|
||||
}
|
||||
|
||||
// Second pass: build tensor list with quantization info
|
||||
for _, layer := range mf.Layers {
|
||||
if layer.MediaType != manifest.MediaTypeImageTensor {
|
||||
continue
|
||||
}
|
||||
|
||||
tensors = append(tensors, api.Tensor{
|
||||
Name: layer.Name,
|
||||
Type: info.Dtype,
|
||||
Shape: shape,
|
||||
})
|
||||
// Skip scale and qbias tensors
|
||||
if strings.HasSuffix(layer.Name, "_scale") || strings.HasSuffix(layer.Name, "_qbias") {
|
||||
continue
|
||||
}
|
||||
|
||||
td := tensorMap[layer.Name]
|
||||
if td == nil {
|
||||
continue
|
||||
}
|
||||
|
||||
// Check if this tensor has a corresponding scale tensor (quantized)
|
||||
scaleTd := scaleMap[layer.Name]
|
||||
if scaleTd != nil && len(td.info.Shape) >= 2 && len(scaleTd.info.Shape) >= 2 {
|
||||
// Quantized tensor - detect bits from shapes
|
||||
weightCols := td.info.Shape[len(td.info.Shape)-1]
|
||||
scaleCols := scaleTd.info.Shape[len(scaleTd.info.Shape)-1]
|
||||
|
||||
// Detect quantization: Q4 has pack_factor=8, Q8 has pack_factor=4
|
||||
// Q4 uses group_size=32: weightCols * 8 / scaleCols = 32
|
||||
// Q8 uses group_size=64: weightCols * 4 / scaleCols = 64
|
||||
var bits int
|
||||
var quantType string
|
||||
if weightCols*8/scaleCols == 32 {
|
||||
bits = 4
|
||||
quantType = "Q4"
|
||||
} else if weightCols*4/scaleCols == 64 {
|
||||
bits = 8
|
||||
quantType = "Q8"
|
||||
} else {
|
||||
// Unknown quantization, show raw
|
||||
quantType = td.info.Dtype
|
||||
}
|
||||
|
||||
// Calculate unpacked shape
|
||||
shape := make([]uint64, len(td.info.Shape))
|
||||
for i, s := range td.info.Shape {
|
||||
shape[i] = uint64(s)
|
||||
}
|
||||
if bits > 0 {
|
||||
packFactor := int64(32 / bits)
|
||||
shape[len(shape)-1] = uint64(td.info.Shape[len(td.info.Shape)-1] * packFactor)
|
||||
}
|
||||
|
||||
tensors = append(tensors, api.Tensor{
|
||||
Name: layer.Name,
|
||||
Type: quantType,
|
||||
Shape: shape,
|
||||
})
|
||||
} else {
|
||||
// Non-quantized tensor
|
||||
shape := make([]uint64, len(td.info.Shape))
|
||||
for i, s := range td.info.Shape {
|
||||
shape[i] = uint64(s)
|
||||
}
|
||||
|
||||
tensors = append(tensors, api.Tensor{
|
||||
Name: layer.Name,
|
||||
Type: td.info.Dtype,
|
||||
Shape: shape,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
return tensors, nil
|
||||
}
|
||||
|
||||
// GetSafetensorsDtype returns the quantization type for a safetensors model.
|
||||
// If the model is quantized (has _scale tensors), returns the quantization type (e.g., "FP8").
|
||||
// Reads from model_index.json first, falls back to detection from tensor names.
|
||||
// Otherwise returns the torch_dtype from config.json.
|
||||
func GetSafetensorsDtype(name model.Name) (string, error) {
|
||||
mf, err := manifest.ParseNamedManifest(name)
|
||||
|
|
@ -207,16 +284,38 @@ func GetSafetensorsDtype(name model.Name) (string, error) {
|
|||
return "", fmt.Errorf("failed to load manifest: %w", err)
|
||||
}
|
||||
|
||||
// Check if model is quantized by looking for _scale tensors
|
||||
// First try to read quantization from model_index.json
|
||||
var modelIndex struct {
|
||||
Quantization string `json:"quantization"`
|
||||
}
|
||||
if err := mf.ReadConfigJSON("model_index.json", &modelIndex); err == nil && modelIndex.Quantization != "" {
|
||||
return modelIndex.Quantization, nil
|
||||
}
|
||||
|
||||
// Fallback: detect from tensor names
|
||||
hasScales := false
|
||||
hasQBias := false
|
||||
for _, layer := range mf.Layers {
|
||||
if layer.MediaType == manifest.MediaTypeImageTensor {
|
||||
if strings.HasSuffix(layer.Name, "_scale") {
|
||||
// Model is quantized - return FP8 (affine quantization)
|
||||
return "FP8", nil
|
||||
hasScales = true
|
||||
}
|
||||
if strings.HasSuffix(layer.Name, "_qbias") {
|
||||
hasQBias = true
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if hasScales {
|
||||
if hasQBias {
|
||||
// Affine mode (has scale + qbias) - could be Q4 or Q8
|
||||
// Default to Q4 as it's more common
|
||||
return "Q4", nil
|
||||
}
|
||||
// No qbias = NVFP4
|
||||
return "NVFP4", nil
|
||||
}
|
||||
|
||||
// Not quantized - return torch_dtype from config.json
|
||||
var cfg struct {
|
||||
TorchDtype string `json:"torch_dtype"`
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue