mirror of
https://github.com/ollama/ollama.git
synced 2026-05-13 14:27:00 +00:00
* mlx: add laguna model support * convert: support fp8 safetensors import Decode HF F8_E4M3 safetensors with block scale companions into GGUF-supported tensor types, and record which output tensors came from FP8 source weights. Use that source-precision metadata during create quantization: default FP8-sourced GGUFs to Q8_0, keep non-FP8 tensors at their original precision for Q8_0, and promote non-FP8 quantizable tensors to Q8_0 for Q4_K requests. * ggml: add laguna model support * server: preserve generate logprobs with builtin parsers Generate requests were dropping logprob-only chunks whenever a builtin parser buffered visible content. Chat already handled this case, but generate only forwarded chunks with visible response, thinking, or tool-call output. Keep generate chunks that carry logprobs even when the builtin parser has not flushed visible content yet, and add a regression test that exercises the behavior with a generic thinking parser. * review comments - perf improvements * ggml: implement nemotron 3 nano omni * add poolside integration * update poolside doc * adapt to new cache setup * fix test * fix test --------- Co-authored-by: Eva Ho <hoyyeva@gmail.com>
604 lines
20 KiB
Go
604 lines
20 KiB
Go
package convert
|
|
|
|
import (
|
|
"cmp"
|
|
"encoding/json"
|
|
"fmt"
|
|
iofs "io/fs"
|
|
"math"
|
|
"strings"
|
|
|
|
"github.com/ollama/ollama/fs/ggml"
|
|
)
|
|
|
|
// lagunaModel holds the Laguna hyperparameters read from the model's JSON
// configuration (field names follow the json tags). It is filled in by its
// custom UnmarshalJSON, which decodes into rawLagunaModel and applies defaults,
// and it is turned into "laguna.*" metadata by KV.
type lagunaModel struct {
	ModelParameters

	// Core transformer and attention settings.
	NumHiddenLayers       uint32           `json:"num_hidden_layers"`
	HiddenSize            uint32           `json:"hidden_size"`
	IntermediateSize      uint32           `json:"intermediate_size"`
	NumAttentionHeads     uint32           `json:"num_attention_heads"`
	NumKeyValueHeads      uint32           `json:"num_key_value_heads"`
	HeadDim               uint32           `json:"head_dim"`
	RMSNormEPS            float32          `json:"rms_norm_eps"`
	MaxPositionEmbeddings uint32           `json:"max_position_embeddings"`
	SlidingWindow         uint32           `json:"sliding_window"`
	PartialRotaryFactor   float32          `json:"partial_rotary_factor"`
	Gating                lagunaGatingMode `json:"gating"`
	QKNormType            string           `json:"qk_norm_type"`

	// Optional per-layer overrides; when set, validate requires one entry per
	// hidden layer.
	LayerTypes                []string `json:"layer_types"`
	NumAttentionHeadsPerLayer []uint32 `json:"num_attention_heads_per_layer"`

	// Mixture-of-experts settings. MLPOnlyLayers lists the dense (non-MoE)
	// layer indices.
	NumExperts                   uint32   `json:"num_experts"`
	NumExpertsPerTok             uint32   `json:"num_experts_per_tok"`
	MoEIntermediateSize          uint32   `json:"moe_intermediate_size"`
	SharedExpertIntermediateSize uint32   `json:"shared_expert_intermediate_size"`
	NormTopKProb                 bool     `json:"norm_topk_prob"`
	MoeRoutedScalingFactor       float32  `json:"moe_routed_scaling_factor"`
	MoERouterUseSigmoid          bool     `json:"moe_router_use_sigmoid"`
	MoEApplyRouterWeightOnInput  bool     `json:"moe_apply_router_weight_on_input"`
	DecoderSparseStep            uint32   `json:"decoder_sparse_step"`
	MLPOnlyLayers                []uint32 `json:"mlp_only_layers"`
	MLPLayerTypes                []string `json:"mlp_layer_types"`

	// RoPE settings for full-attention and sliding-window-attention layers.
	RopeParameters    lagunaRopeParameters `json:"rope_parameters"`
	SwaRopeParameters lagunaRopeParameters `json:"swa_rope_parameters"`

	// validate rejects configs where this is true.
	SwaAttentionSinkEnabled bool `json:"swa_attention_sink_enabled"`
}
|
|
|
|
// lagunaGatingMode is the attention "gating" setting from the config. Its
// UnmarshalJSON accepts a JSON string or a JSON boolean (stored as "true" or
// "false"); perHead reports whether the value selects per-head gating.
type lagunaGatingMode string
|
|
|
|
// lagunaRopeParameters is one set of RoPE settings as it appears in the config
// JSON. The scaling type may be given under either "rope_type" or "type";
// ropeType prefers RopeType and falls back to Type.
type lagunaRopeParameters struct {
	RopeTheta                     float32 `json:"rope_theta"`
	RopeType                      string  `json:"rope_type"`
	Type                          string  `json:"type"`
	Factor                        float32 `json:"factor"`
	OriginalMaxPositionEmbeddings uint32  `json:"original_max_position_embeddings"`
	BetaSlow                      float32 `json:"beta_slow"`
	BetaFast                      float32 `json:"beta_fast"`
	AttentionFactor               float32 `json:"attention_factor"`
	PartialRotaryFactor           float32 `json:"partial_rotary_factor"`
}
|
|
|
|
// lagunaRopeConfig is the decoded form of the "rope_parameters" config value,
// which is either a single flat parameter object or an object nested per
// attention type.
//
// flat holds the parameters decoded from a non-nested object; full and sliding
// hold the per-attention-type parameters of a nested object; nested records
// which of the two layouts was detected by UnmarshalJSON.
type lagunaRopeConfig struct {
	flat    lagunaRopeParameters
	full    lagunaRopeParameters
	sliding lagunaRopeParameters
	nested  bool
}
|
|
|
|
func (g *lagunaGatingMode) UnmarshalJSON(b []byte) error {
|
|
var s string
|
|
if err := json.Unmarshal(b, &s); err == nil {
|
|
*g = lagunaGatingMode(s)
|
|
return nil
|
|
}
|
|
|
|
var enabled bool
|
|
if err := json.Unmarshal(b, &enabled); err == nil {
|
|
if enabled {
|
|
*g = "true"
|
|
} else {
|
|
*g = "false"
|
|
}
|
|
return nil
|
|
}
|
|
|
|
if string(b) == "null" {
|
|
return nil
|
|
}
|
|
return fmt.Errorf("unsupported Laguna gating JSON value %s", string(b))
|
|
}
|
|
|
|
func (g lagunaGatingMode) perHead() bool {
|
|
return strings.EqualFold(string(g), "per-head") || strings.EqualFold(string(g), "true")
|
|
}
|
|
|
|
func (r *lagunaRopeConfig) UnmarshalJSON(b []byte) error {
|
|
if string(b) == "null" {
|
|
return nil
|
|
}
|
|
|
|
var probe map[string]json.RawMessage
|
|
if err := json.Unmarshal(b, &probe); err != nil {
|
|
return err
|
|
}
|
|
|
|
if len(probe) == 0 {
|
|
return nil
|
|
}
|
|
|
|
if raw, ok := probe["full_attention"]; ok {
|
|
r.nested = true
|
|
if err := json.Unmarshal(raw, &r.full); err != nil {
|
|
return err
|
|
}
|
|
if raw = probe["sliding_attention"]; raw != nil {
|
|
if err := json.Unmarshal(raw, &r.sliding); err != nil {
|
|
return err
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
|
|
if raw, ok := probe["global_attention"]; ok {
|
|
r.nested = true
|
|
if err := json.Unmarshal(raw, &r.full); err != nil {
|
|
return err
|
|
}
|
|
if raw = probe["sliding_attention"]; raw != nil {
|
|
if err := json.Unmarshal(raw, &r.sliding); err != nil {
|
|
return err
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
|
|
return json.Unmarshal(b, &r.flat)
|
|
}
|
|
|
|
func (r lagunaRopeConfig) fullParams() lagunaRopeParameters {
|
|
if r.nested {
|
|
return r.full
|
|
}
|
|
return r.flat
|
|
}
|
|
|
|
func (r lagunaRopeConfig) slidingParams() (lagunaRopeParameters, bool) {
|
|
if !r.nested {
|
|
return lagunaRopeParameters{}, false
|
|
}
|
|
return r.sliding, true
|
|
}
|
|
|
|
func (r lagunaRopeParameters) ropeType() string {
|
|
return cmp.Or(r.RopeType, r.Type)
|
|
}
|
|
|
|
func (r lagunaRopeParameters) withDefaultPartialRotaryFactor(v float32) lagunaRopeParameters {
|
|
if r.PartialRotaryFactor == 0 {
|
|
r.PartialRotaryFactor = v
|
|
}
|
|
return r
|
|
}
|
|
|
|
func (r lagunaRopeParameters) empty() bool {
|
|
return r == (lagunaRopeParameters{})
|
|
}
|
|
|
|
// rawLagunaModel mirrors lagunaModel for JSON decoding. It differs only where
// lagunaModel.UnmarshalJSON needs more information than the final field
// carries: NormTopKProb and MoERouterUseSigmoid are pointers so that a missing
// value can be told apart from false, and RopeParameters uses lagunaRopeConfig
// so both the flat and the nested "rope_parameters" layouts can be read.
type rawLagunaModel struct {
	ModelParameters

	// Core transformer and attention settings.
	NumHiddenLayers       uint32           `json:"num_hidden_layers"`
	HiddenSize            uint32           `json:"hidden_size"`
	IntermediateSize      uint32           `json:"intermediate_size"`
	NumAttentionHeads     uint32           `json:"num_attention_heads"`
	NumKeyValueHeads      uint32           `json:"num_key_value_heads"`
	HeadDim               uint32           `json:"head_dim"`
	RMSNormEPS            float32          `json:"rms_norm_eps"`
	MaxPositionEmbeddings uint32           `json:"max_position_embeddings"`
	SlidingWindow         uint32           `json:"sliding_window"`
	PartialRotaryFactor   float32          `json:"partial_rotary_factor"`
	Gating                lagunaGatingMode `json:"gating"`
	QKNormType            string           `json:"qk_norm_type"`

	// Optional per-layer overrides.
	LayerTypes                []string `json:"layer_types"`
	NumAttentionHeadsPerLayer []uint32 `json:"num_attention_heads_per_layer"`

	// Mixture-of-experts settings.
	NumExperts                   uint32   `json:"num_experts"`
	NumExpertsPerTok             uint32   `json:"num_experts_per_tok"`
	MoEIntermediateSize          uint32   `json:"moe_intermediate_size"`
	SharedExpertIntermediateSize uint32   `json:"shared_expert_intermediate_size"`
	NormTopKProb                 *bool    `json:"norm_topk_prob"`
	MoeRoutedScalingFactor       float32  `json:"moe_routed_scaling_factor"`
	MoERouterUseSigmoid          *bool    `json:"moe_router_use_sigmoid"`
	MoEApplyRouterWeightOnInput  bool     `json:"moe_apply_router_weight_on_input"`
	DecoderSparseStep            uint32   `json:"decoder_sparse_step"`
	MLPOnlyLayers                []uint32 `json:"mlp_only_layers"`
	MLPLayerTypes                []string `json:"mlp_layer_types"`

	// RoPE settings; the sliding-window parameters may also arrive nested
	// inside RopeParameters.
	RopeParameters    lagunaRopeConfig     `json:"rope_parameters"`
	SwaRopeParameters lagunaRopeParameters `json:"swa_rope_parameters"`

	SwaAttentionSinkEnabled bool `json:"swa_attention_sink_enabled"`
}
|
|
|
|
func (p *lagunaModel) UnmarshalJSON(b []byte) error {
|
|
var raw rawLagunaModel
|
|
if err := json.Unmarshal(b, &raw); err != nil {
|
|
return err
|
|
}
|
|
|
|
mlpOnlyLayers, err := lagunaDenseLayers(raw.MLPOnlyLayers, raw.MLPLayerTypes)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
fullRope := raw.RopeParameters.fullParams().withDefaultPartialRotaryFactor(cmp.Or(raw.PartialRotaryFactor, float32(1)))
|
|
swaRope := raw.SwaRopeParameters
|
|
if nestedSwa, ok := raw.RopeParameters.slidingParams(); ok && !nestedSwa.empty() {
|
|
swaRope = nestedSwa
|
|
}
|
|
swaRope = swaRope.withDefaultPartialRotaryFactor(cmp.Or(fullRope.PartialRotaryFactor, float32(1)))
|
|
|
|
*p = lagunaModel{
|
|
ModelParameters: raw.ModelParameters,
|
|
NumHiddenLayers: raw.NumHiddenLayers,
|
|
HiddenSize: raw.HiddenSize,
|
|
IntermediateSize: raw.IntermediateSize,
|
|
NumAttentionHeads: raw.NumAttentionHeads,
|
|
NumKeyValueHeads: raw.NumKeyValueHeads,
|
|
HeadDim: raw.HeadDim,
|
|
RMSNormEPS: raw.RMSNormEPS,
|
|
MaxPositionEmbeddings: raw.MaxPositionEmbeddings,
|
|
SlidingWindow: raw.SlidingWindow,
|
|
PartialRotaryFactor: cmp.Or(raw.PartialRotaryFactor, fullRope.PartialRotaryFactor),
|
|
Gating: raw.Gating,
|
|
QKNormType: cmp.Or(raw.QKNormType, "rmsnorm"),
|
|
LayerTypes: raw.LayerTypes,
|
|
NumAttentionHeadsPerLayer: raw.NumAttentionHeadsPerLayer,
|
|
NumExperts: raw.NumExperts,
|
|
NumExpertsPerTok: raw.NumExpertsPerTok,
|
|
MoEIntermediateSize: raw.MoEIntermediateSize,
|
|
SharedExpertIntermediateSize: raw.SharedExpertIntermediateSize,
|
|
NormTopKProb: defaultBool(raw.NormTopKProb, true),
|
|
MoeRoutedScalingFactor: raw.MoeRoutedScalingFactor,
|
|
MoERouterUseSigmoid: defaultBool(raw.MoERouterUseSigmoid, true),
|
|
MoEApplyRouterWeightOnInput: raw.MoEApplyRouterWeightOnInput,
|
|
DecoderSparseStep: raw.DecoderSparseStep,
|
|
MLPOnlyLayers: mlpOnlyLayers,
|
|
MLPLayerTypes: raw.MLPLayerTypes,
|
|
RopeParameters: fullRope,
|
|
SwaRopeParameters: swaRope,
|
|
SwaAttentionSinkEnabled: raw.SwaAttentionSinkEnabled,
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// defaultBool dereferences v, or returns fallback when v is nil.
func defaultBool(v *bool, fallback bool) bool {
	if v != nil {
		return *v
	}
	return fallback
}
|
|
|
|
// Expert router gating functions, as written to "laguna.expert_gating_func".
const (
	lagunaGatingFuncSoftmax uint32 = 1
	lagunaGatingFuncSigmoid uint32 = 2
)

// Per-layer attention kinds, as written to "laguna.attention.layer_types".
const (
	lagunaLayerTypeGlobal  uint32 = 0
	lagunaLayerTypeSliding uint32 = 1
)
|
|
|
|
func (p *lagunaModel) KV(t *Tokenizer) KV {
|
|
kv := p.ModelParameters.KV(t)
|
|
kv["general.architecture"] = "laguna"
|
|
// Laguna's chat template and built-in renderer both emit the leading
|
|
// special token explicitly. Auto-prepending BOS here would duplicate it.
|
|
kv["tokenizer.ggml.add_bos_token"] = false
|
|
kv["tokenizer.ggml.pre"] = "laguna"
|
|
// Laguna does not need tokenizer.chat_template at runtime: Ollama create
|
|
// sets the Laguna renderer/parser from the architecture, and the renderer
|
|
// owns prompt formatting.
|
|
delete(kv, "tokenizer.chat_template")
|
|
|
|
kv["laguna.block_count"] = p.NumHiddenLayers
|
|
kv["laguna.context_length"] = p.MaxPositionEmbeddings
|
|
kv["laguna.embedding_length"] = p.HiddenSize
|
|
kv["laguna.feed_forward_length"] = p.IntermediateSize
|
|
|
|
if len(p.NumAttentionHeadsPerLayer) == int(p.NumHiddenLayers) {
|
|
kv["laguna.attention.head_count"] = p.NumAttentionHeadsPerLayer
|
|
} else {
|
|
kv["laguna.attention.head_count"] = p.NumAttentionHeads
|
|
}
|
|
kv["laguna.attention.head_count_kv"] = p.NumKeyValueHeads
|
|
kv["laguna.attention.key_length"] = p.HeadDim
|
|
kv["laguna.attention.value_length"] = p.HeadDim
|
|
kv["laguna.attention.layer_norm_rms_epsilon"] = p.RMSNormEPS
|
|
kv["laguna.attention.sliding_window"] = p.SlidingWindow
|
|
kv["laguna.attention.sink_enabled"] = p.SwaAttentionSinkEnabled
|
|
|
|
if len(p.LayerTypes) > 0 {
|
|
encoded := make([]uint32, len(p.LayerTypes))
|
|
slidingPattern := make([]bool, len(p.LayerTypes))
|
|
for i, layerType := range p.LayerTypes {
|
|
if lagunaLayerIsSliding(layerType) {
|
|
encoded[i] = lagunaLayerTypeSliding
|
|
slidingPattern[i] = true
|
|
} else {
|
|
encoded[i] = lagunaLayerTypeGlobal
|
|
}
|
|
}
|
|
kv["laguna.attention.layer_types"] = encoded
|
|
kv["laguna.attention.sliding_window_pattern"] = slidingPattern
|
|
}
|
|
|
|
if p.Gating.perHead() {
|
|
kv["laguna.attention.gating_type"] = uint32(1)
|
|
} else {
|
|
kv["laguna.attention.gating_type"] = uint32(0)
|
|
}
|
|
kv["laguna.attention.qk_norm"] = p.QKNormType == "rmsnorm"
|
|
|
|
kv["laguna.expert_count"] = p.NumExperts
|
|
kv["laguna.expert_used_count"] = p.NumExpertsPerTok
|
|
kv["laguna.expert_feed_forward_length"] = p.MoEIntermediateSize
|
|
kv["laguna.expert_shared_feed_forward_length"] = p.SharedExpertIntermediateSize
|
|
kv["laguna.expert_shared_count"] = uint32(1)
|
|
kv["laguna.expert_weights_norm"] = p.NormTopKProb
|
|
kv["laguna.expert_weights_scale"] = p.MoeRoutedScalingFactor
|
|
kv["laguna.expert_gating_func"] = lagunaMoeGatingFunc(p.MoERouterUseSigmoid)
|
|
kv["laguna.decoder_sparse_step"] = cmp.Or(p.DecoderSparseStep, uint32(1))
|
|
|
|
if leading, ok := lagunaLeadingDensePrefix(p.MLPOnlyLayers); ok {
|
|
kv["laguna.leading_dense_block_count"] = leading
|
|
}
|
|
if len(p.MLPOnlyLayers) > 0 {
|
|
kv["laguna.dense_layers"] = p.MLPOnlyLayers
|
|
}
|
|
|
|
ropeType := p.RopeParameters.ropeType()
|
|
kv["laguna.rope.freq_base"] = cmp.Or(p.RopeParameters.RopeTheta, float32(10000))
|
|
kv["laguna.rope.scaling.type"] = ropeType
|
|
ropeFactor := cmp.Or(p.RopeParameters.Factor, float32(1))
|
|
kv["laguna.rope.scaling.factor"] = ropeFactor
|
|
kv["laguna.rope.scaling.original_context_length"] = p.RopeParameters.OriginalMaxPositionEmbeddings
|
|
kv["laguna.rope.scaling.beta_fast"] = p.RopeParameters.BetaFast
|
|
kv["laguna.rope.scaling.beta_slow"] = p.RopeParameters.BetaSlow
|
|
kv["laguna.rope.scaling.attn_factor"] = lagunaAttentionFactor(ropeType, ropeFactor, p.RopeParameters.AttentionFactor)
|
|
kv["laguna.rope.partial_rotary_factor"] = cmp.Or(p.PartialRotaryFactor, float32(1))
|
|
|
|
swaRopeType := p.SwaRopeParameters.ropeType()
|
|
kv["laguna.rope.swa.freq_base"] = cmp.Or(p.SwaRopeParameters.RopeTheta, float32(10000))
|
|
kv["laguna.rope.swa.scaling.type"] = cmp.Or(swaRopeType, "linear")
|
|
kv["laguna.rope.swa.scaling.factor"] = cmp.Or(p.SwaRopeParameters.Factor, float32(1))
|
|
kv["laguna.rope.swa.partial_rotary_factor"] = cmp.Or(p.SwaRopeParameters.PartialRotaryFactor, float32(1))
|
|
|
|
headDim := p.HeadDim
|
|
if headDim == 0 && p.NumAttentionHeads > 0 {
|
|
headDim = p.HiddenSize / p.NumAttentionHeads
|
|
}
|
|
kv["laguna.rope.dimension_count"] = lagunaRopeDim(headDim, cmp.Or(p.PartialRotaryFactor, float32(1)))
|
|
kv["laguna.rope.swa.dimension_count"] = lagunaRopeDim(headDim, cmp.Or(p.SwaRopeParameters.PartialRotaryFactor, float32(1)))
|
|
|
|
return kv
|
|
}
|
|
|
|
func (p *lagunaModel) parseMore(_ iofs.FS) error {
|
|
return p.validate()
|
|
}
|
|
|
|
func (p *lagunaModel) validate() error {
|
|
if p.NumHiddenLayers == 0 {
|
|
return fmt.Errorf("laguna: num_hidden_layers must be set")
|
|
}
|
|
if p.HiddenSize == 0 {
|
|
return fmt.Errorf("laguna: hidden_size must be set")
|
|
}
|
|
if p.HeadDim == 0 {
|
|
return fmt.Errorf("laguna: head_dim must be set")
|
|
}
|
|
if p.NumKeyValueHeads == 0 {
|
|
return fmt.Errorf("laguna: num_key_value_heads must be set")
|
|
}
|
|
if p.SwaAttentionSinkEnabled {
|
|
return fmt.Errorf("laguna: unsupported swa_attention_sink_enabled=true")
|
|
}
|
|
if !p.Gating.perHead() {
|
|
return fmt.Errorf("laguna: unsupported attention gating %q: only gating=\"per-head\" is supported", p.Gating)
|
|
}
|
|
if p.QKNormType != "rmsnorm" {
|
|
return fmt.Errorf("laguna: unsupported qk_norm_type %q: only rmsnorm is supported", p.QKNormType)
|
|
}
|
|
if !p.MoERouterUseSigmoid {
|
|
return fmt.Errorf("laguna: unsupported moe_router_use_sigmoid=false")
|
|
}
|
|
if p.MoEApplyRouterWeightOnInput {
|
|
return fmt.Errorf("laguna: unsupported moe_apply_router_weight_on_input=true")
|
|
}
|
|
if p.DecoderSparseStep != 0 && p.DecoderSparseStep != 1 {
|
|
return fmt.Errorf("laguna: unsupported decoder_sparse_step=%d: only 1 is supported", p.DecoderSparseStep)
|
|
}
|
|
if len(p.MLPOnlyLayers) != 1 || p.MLPOnlyLayers[0] != 0 {
|
|
return fmt.Errorf("laguna: unsupported mlp_only_layers=%v: only [0] is supported", p.MLPOnlyLayers)
|
|
}
|
|
if p.NumExperts == 0 {
|
|
return fmt.Errorf("laguna: num_experts must be set")
|
|
}
|
|
if p.NumExpertsPerTok == 0 {
|
|
return fmt.Errorf("laguna: num_experts_per_tok must be set")
|
|
}
|
|
if p.MoEIntermediateSize == 0 {
|
|
return fmt.Errorf("laguna: moe_intermediate_size must be set")
|
|
}
|
|
if p.SharedExpertIntermediateSize == 0 {
|
|
return fmt.Errorf("laguna: shared_expert_intermediate_size must be set")
|
|
}
|
|
|
|
if len(p.LayerTypes) > 0 && len(p.LayerTypes) != int(p.NumHiddenLayers) {
|
|
return fmt.Errorf("laguna: layer_types has %d entries, expected %d", len(p.LayerTypes), p.NumHiddenLayers)
|
|
}
|
|
for i, layerType := range p.LayerTypes {
|
|
if !lagunaLayerIsGlobal(layerType) && !lagunaLayerIsSliding(layerType) {
|
|
return fmt.Errorf("laguna: unsupported layer_types[%d]=%q", i, layerType)
|
|
}
|
|
}
|
|
if len(p.NumAttentionHeadsPerLayer) > 0 && len(p.NumAttentionHeadsPerLayer) != int(p.NumHiddenLayers) {
|
|
return fmt.Errorf("laguna: num_attention_heads_per_layer has %d entries, expected %d", len(p.NumAttentionHeadsPerLayer), p.NumHiddenLayers)
|
|
}
|
|
if len(p.NumAttentionHeadsPerLayer) == 0 && p.NumAttentionHeads == 0 {
|
|
return fmt.Errorf("laguna: num_attention_heads or num_attention_heads_per_layer must be set")
|
|
}
|
|
for i, heads := range p.NumAttentionHeadsPerLayer {
|
|
if heads == 0 {
|
|
return fmt.Errorf("laguna: num_attention_heads_per_layer[%d] must be non-zero", i)
|
|
}
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
func (p *lagunaModel) numHeadsForLayer(layer uint32) uint32 {
|
|
if len(p.NumAttentionHeadsPerLayer) > int(layer) && p.NumAttentionHeadsPerLayer[layer] > 0 {
|
|
return p.NumAttentionHeadsPerLayer[layer]
|
|
}
|
|
return p.NumAttentionHeads
|
|
}
|
|
|
|
func (p *lagunaModel) layerUsesMoE(layer uint32) bool {
|
|
for _, denseLayer := range p.MLPOnlyLayers {
|
|
if denseLayer == layer {
|
|
return false
|
|
}
|
|
}
|
|
step := cmp.Or(p.DecoderSparseStep, uint32(1))
|
|
return p.NumExperts > 0 && (layer+1)%step == 0
|
|
}
|
|
|
|
func (p *lagunaModel) Replacements() []string {
|
|
return []string{
|
|
"lm_head", "output",
|
|
"model.embed_tokens", "token_embd",
|
|
"model.norm", "output_norm",
|
|
"model.layers", "blk",
|
|
"input_layernorm", "attn_norm",
|
|
"post_attention_layernorm", "ffn_norm",
|
|
"self_attn.q_proj", "attn_q",
|
|
"self_attn.k_proj", "attn_k",
|
|
"self_attn.v_proj", "attn_v",
|
|
"self_attn.o_proj", "attn_output",
|
|
"self_attn.g_proj", "attn_g",
|
|
"self_attn.q_norm", "attn_q_norm",
|
|
"self_attn.k_norm", "attn_k_norm",
|
|
"mlp.gate_proj", "ffn_gate",
|
|
"mlp.up_proj", "ffn_up",
|
|
"mlp.down_proj", "ffn_down",
|
|
"mlp.gate.weight", "ffn_gate_inp.weight",
|
|
"mlp.experts.e_score_correction_bias", "exp_probs_b.bias",
|
|
"mlp.shared_expert.gate_proj", "ffn_gate_shexp",
|
|
"mlp.shared_expert.up_proj", "ffn_up_shexp",
|
|
"mlp.shared_expert.down_proj", "ffn_down_shexp",
|
|
"mlp.experts.*.gate_proj", "ffn_gate_exps",
|
|
"mlp.experts.*.up_proj", "ffn_up_exps",
|
|
"mlp.experts.*.down_proj", "ffn_down_exps",
|
|
}
|
|
}
|
|
|
|
func (p *lagunaModel) Tensors(ts []Tensor) []*ggml.Tensor {
|
|
// Current Laguna drops store routed MoE experts as separate per-expert
|
|
// tensors. GGUF stores each projection as one stacked tensor. If future
|
|
// drops change expert naming or layout, update these patterns with a
|
|
// focused conversion test using the new tensor names.
|
|
merges := make([]merge, 0, p.NumHiddenLayers*3)
|
|
for i := range p.NumHiddenLayers {
|
|
merges = append(merges,
|
|
merge{
|
|
fmt.Sprintf("blk.%d.mlp.experts.*.gate_proj.weight", i),
|
|
fmt.Sprintf("blk.%d.ffn_gate_exps.weight", i),
|
|
},
|
|
merge{
|
|
fmt.Sprintf("blk.%d.mlp.experts.*.up_proj.weight", i),
|
|
fmt.Sprintf("blk.%d.ffn_up_exps.weight", i),
|
|
},
|
|
merge{
|
|
fmt.Sprintf("blk.%d.mlp.experts.*.down_proj.weight", i),
|
|
fmt.Sprintf("blk.%d.ffn_down_exps.weight", i),
|
|
},
|
|
)
|
|
}
|
|
|
|
out, rest := mergeTensors(ts, merges...)
|
|
for _, t := range rest {
|
|
out = append(out, &ggml.Tensor{
|
|
Name: t.Name(),
|
|
Kind: t.Kind(),
|
|
Shape: t.Shape(),
|
|
WriterTo: t,
|
|
})
|
|
}
|
|
return out
|
|
}
|
|
|
|
func (p *lagunaModel) specialTokenTypes() []string {
|
|
return []string{"bos", "eos", "pad", "unk"}
|
|
}
|
|
|
|
// lagunaLayerIsSliding reports whether layerType names a sliding-window
// attention layer ("sliding_attention", ignoring case).
func lagunaLayerIsSliding(layerType string) bool {
	const sliding = "sliding_attention"
	return strings.EqualFold(sliding, layerType)
}
|
|
|
|
// lagunaLayerIsGlobal reports whether layerType names a full-attention layer:
// "full_attention" or "global_attention", ignoring case.
func lagunaLayerIsGlobal(layerType string) bool {
	switch {
	case strings.EqualFold(layerType, "full_attention"),
		strings.EqualFold(layerType, "global_attention"):
		return true
	}
	return false
}
|
|
|
|
// lagunaLeadingDensePrefix checks whether layers is exactly 0, 1, ..., n-1.
// If so it returns n and true (an empty list gives 0 and true); otherwise it
// returns 0 and false.
func lagunaLeadingDensePrefix(layers []uint32) (uint32, bool) {
	for idx := 0; idx < len(layers); idx++ {
		if layers[idx] != uint32(idx) {
			return 0, false
		}
	}
	return uint32(len(layers)), true
}
|
|
|
|
// lagunaDenseLayers returns the indices of the dense (non-MoE) layers.
//
// A non-empty mlpOnlyLayers is returned as is. Otherwise the indices are
// derived from mlpLayerTypes, whose entries must be "dense" or "sparse"
// (ignoring case); any other entry produces an error. When both inputs are
// empty the result is nil.
func lagunaDenseLayers(mlpOnlyLayers []uint32, mlpLayerTypes []string) ([]uint32, error) {
	switch {
	case len(mlpOnlyLayers) > 0:
		return mlpOnlyLayers, nil
	case len(mlpLayerTypes) == 0:
		return nil, nil
	}

	dense := make([]uint32, 0, len(mlpLayerTypes))
	for idx, kind := range mlpLayerTypes {
		if strings.EqualFold(kind, "dense") {
			dense = append(dense, uint32(idx))
			continue
		}
		if !strings.EqualFold(kind, "sparse") {
			return nil, fmt.Errorf("laguna: unsupported mlp_layer_types[%d]=%q", idx, kind)
		}
	}
	return dense, nil
}
|
|
|
|
func lagunaMoeGatingFunc(useSigmoid bool) uint32 {
|
|
if useSigmoid {
|
|
return lagunaGatingFuncSigmoid
|
|
}
|
|
return lagunaGatingFuncSoftmax
|
|
}
|
|
|
|
// lagunaAttentionFactor returns the RoPE attention scaling factor. An explicit
// non-zero attentionFactor is used as given. Otherwise, for rope type "yarn"
// (ignoring case) with scaleFactor above 1, it is 0.1*ln(scaleFactor) + 1, and
// in every other case it is 1.
func lagunaAttentionFactor(ropeType string, scaleFactor, attentionFactor float32) float32 {
	switch {
	case attentionFactor != 0:
		return attentionFactor
	case scaleFactor > 1 && strings.EqualFold(ropeType, "yarn"):
		return float32(0.1*math.Log(float64(scaleFactor)) + 1)
	default:
		return 1
	}
}
|
|
|
|
// lagunaRopeDim returns the number of rotary dimensions for a head of size
// headDim and the given partial rotary factor.
//
// The product headDim*partialRotaryFactor is truncated to an integer; a result
// of zero or above headDim falls back to headDim. The value is then rounded
// down to an even number, and if that leaves zero, headDim is returned. A zero
// headDim always yields zero.
func lagunaRopeDim(headDim uint32, partialRotaryFactor float32) uint32 {
	if headDim == 0 {
		return 0
	}

	rotary := uint32(float32(headDim) * partialRotaryFactor)
	if rotary == 0 || rotary > headDim {
		rotary = headDim
	}

	// Drop the odd remainder so the count is even.
	rotary -= rotary % 2
	if rotary == 0 {
		return headDim
	}
	return rotary
}
|
|
|
|
var (
|
|
_ ModelConverter = (*lagunaModel)(nil)
|
|
_ moreParser = (*lagunaModel)(nil)
|
|
)
|