mirror of
https://github.com/ollama/ollama.git
synced 2026-05-13 14:27:00 +00:00
tokenizer: fix multi-regex BPE offset handling (#15844)
Use the current fragment offset when emitting unmatched spans during multi-regex BPE splitting. This avoids duplicating earlier prompt text and inflating token counts for multi-stage BPE tokenizers.
This commit is contained in:
parent
4656a07e56
commit
ec9b4e9e47
2 changed files with 56 additions and 1 deletion
|
|
@@ -84,7 +84,7 @@ func (bpe *BytePairEncoding) split(s string) iter.Seq[string] {
|
|||
var offset int
|
||||
for m, _ := re.FindRunesMatch(r); m != nil; m, _ = re.FindNextMatch(m) {
|
||||
if offset-m.Index != 0 {
|
||||
- if !yield(string(r[:m.Index])) {
|
||||
+ if !yield(string(r[offset:m.Index])) {
|
||||
return
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@@ -545,6 +545,61 @@ func BenchmarkBytePairEncoding(b *testing.B) {
|
|||
}
|
||||
}
|
||||
|
||||
func TestBytePairEncodingSplitMultipleRegexpsPreservesOffsets(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
bpe := NewBytePairEncoding(
|
||||
nil,
|
||||
`(?:\r?\n)+(?!\r?\n)`,
|
||||
`(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`,
|
||||
)
|
||||
|
||||
input := "One line\nTwo lines\n\nThree"
|
||||
got := slices.Collect(bpe.split(input))
|
||||
want := []string{"One", " line", "\n", "Two", " lines", "\n\n", "Three"}
|
||||
|
||||
if diff := cmp.Diff(want, got); diff != "" {
|
||||
t.Fatalf("split mismatch (-want +got):\n%s", diff)
|
||||
}
|
||||
}
|
||||
|
||||
func TestBytePairEncodingSplitRefactPreservesOffsets(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
bpe := NewBytePairEncoding(
|
||||
nil,
|
||||
`\p{N}`,
|
||||
`'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+`,
|
||||
)
|
||||
|
||||
input := "One line\nTwo lines\n\nThree"
|
||||
got := slices.Collect(bpe.split(input))
|
||||
want := []string{"One", " line", "\n", "Two", " lines", "\n", "\n", "Three"}
|
||||
|
||||
if diff := cmp.Diff(want, got); diff != "" {
|
||||
t.Fatalf("split mismatch (-want +got):\n%s", diff)
|
||||
}
|
||||
}
|
||||
|
||||
func TestBytePairEncodingSplitDeepSeekV3PreservesOffsets(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
bpe := NewBytePairEncoding(
|
||||
nil,
|
||||
"\\p{N}{1,3}",
|
||||
`[一-龥\u3040-ゟ゠-ヿ]+`,
|
||||
"[!\"#$%&'()*+,\\-./:;<=>?@\\[\\\\\\]^_`{|}~][A-Za-z]+|[^\\r\\n\\p{L}\\p{P}\\p{S}]?[\\p{L}\\p{M}]+| ?[\\p{P}\\p{S}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
|
||||
)
|
||||
|
||||
input := "One line\nTwo lines\n\nThree"
|
||||
got := slices.Collect(bpe.split(input))
|
||||
want := []string{"One", " line", "\n", "Two", " lines", "\n\n", "Three"}
|
||||
|
||||
if diff := cmp.Diff(want, got); diff != "" {
|
||||
t.Fatalf("split mismatch (-want +got):\n%s", diff)
|
||||
}
|
||||
}
|
||||
|
||||
func TestSplit(t *testing.T) {
|
||||
cases := []struct {
|
||||
name string
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue