ollama/x/tokenizer/tokenizer_load_test.go
Daniel Hiltgen 87288ced4f
New models (#15861)
* mlx: add laguna model support

* convert: support fp8 safetensors import

Decode HF F8_E4M3 safetensors with block scale companions into GGUF-supported tensor types, and record which output tensors came from FP8 source weights.

Use that source-precision metadata during create quantization: default FP8-sourced GGUFs to Q8_0, keep non-FP8 tensors at their original precision for Q8_0, and promote non-FP8 quantizable tensors to Q8_0 for Q4_K requests.

* ggml: add laguna model support

* server: preserve generate logprobs with builtin parsers

Generate requests were dropping logprob-only chunks whenever a builtin parser buffered visible content. Chat already handled this case, but generate only forwarded chunks with visible response, thinking, or tool-call output.

Keep generate chunks that carry logprobs even when the builtin parser has not flushed visible content yet, and add a regression test that exercises the behavior with a generic thinking parser.

* review comments - perf improvements

* ggml: implement nemotron 3 nano omni

* add poolside integration

* update poolside doc

* adapt to new cache setup

* fix test

* fix test

---------

Co-authored-by: Eva Ho <hoyyeva@gmail.com>
2026-04-28 11:50:12 -07:00

52 lines
1.1 KiB
Go

package tokenizer
import (
"strings"
"testing"
)
func TestLoadFromBytesRejectsWordPiece(t *testing.T) {
data := []byte(`{
"model": {
"type": "WordPiece",
"vocab": {"[UNK]": 0, "hello": 1}
},
"added_tokens": []
}`)
_, err := LoadFromBytes(data)
if err == nil {
t.Fatal("expected WordPiece load to fail")
}
if !strings.Contains(err.Error(), "unsupported tokenizer type: WordPiece") {
t.Fatalf("unexpected error: %v", err)
}
}
func TestExtractPretokenizerSkipsUnsupportedSequenceSplit(t *testing.T) {
data := []byte(`{
"type": "Sequence",
"pretokenizers": [
{
"type": "Split",
"pattern": {
"Regex": "(?:\\r?\\n)+(?!\\r?\\n)"
}
},
{
"type": "Split",
"pattern": {
"Regex": "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
}
}
]
}`)
pattern := extractPretokenizer(data)
if pattern == "" {
t.Fatal("expected supported Split pretokenizer")
}
if strings.Contains(pattern, `(?!\r?\n)`) {
t.Fatalf("selected unsupported newline splitter: %q", pattern)
}
}