tokenizer: fix multi-regex BPE offset handling (#15844)

Use the current fragment offset when emitting unmatched spans during multi-regex BPE splitting. This avoids duplicating earlier prompt text and inflating token counts for multi-stage BPE tokenizers.
This commit is contained in:
Daniel Hiltgen 2026-04-27 14:14:27 -07:00 committed by GitHub
parent 4656a07e56
commit ec9b4e9e47
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 56 additions and 1 deletion

View file

@ -84,7 +84,7 @@ func (bpe *BytePairEncoding) split(s string) iter.Seq[string] {
var offset int
for m, _ := re.FindRunesMatch(r); m != nil; m, _ = re.FindNextMatch(m) {
if offset-m.Index != 0 {
if !yield(string(r[:m.Index])) {
if !yield(string(r[offset:m.Index])) {
return
}
}

View file

@ -545,6 +545,61 @@ func BenchmarkBytePairEncoding(b *testing.B) {
}
}
// TestBytePairEncodingSplitMultipleRegexpsPreservesOffsets checks that split,
// when configured with two pretokenizer patterns, yields each fragment of the
// input exactly once and in input order.
func TestBytePairEncodingSplitMultipleRegexpsPreservesOffsets(t *testing.T) {
	t.Parallel()

	// Two-stage configuration: the first pattern targets newline runs, the
	// second performs the general word/number/punctuation/whitespace split.
	enc := NewBytePairEncoding(
		nil,
		`(?:\r?\n)+(?!\r?\n)`,
		`(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`,
	)

	// The double newline is expected to stay together as a single fragment.
	want := []string{"One", " line", "\n", "Two", " lines", "\n\n", "Three"}
	got := slices.Collect(enc.split("One line\nTwo lines\n\nThree"))

	diff := cmp.Diff(want, got)
	if diff == "" {
		return
	}
	t.Fatalf("split mismatch (-want +got):\n%s", diff)
}
// TestBytePairEncodingSplitRefactPreservesOffsets checks split with a
// digit-only first pattern followed by a general splitter: the input contains
// no digits, and every fragment must still be produced exactly once, in order.
func TestBytePairEncodingSplitRefactPreservesOffsets(t *testing.T) {
	t.Parallel()

	enc := NewBytePairEncoding(
		nil,
		`\p{N}`,
		`'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+`,
	)

	// With this pattern set the blank line is expected to come out as two
	// separate "\n" fragments rather than one "\n\n".
	want := []string{"One", " line", "\n", "Two", " lines", "\n", "\n", "Three"}
	got := slices.Collect(enc.split("One line\nTwo lines\n\nThree"))

	diff := cmp.Diff(want, got)
	if diff == "" {
		return
	}
	t.Fatalf("split mismatch (-want +got):\n%s", diff)
}
// TestBytePairEncodingSplitDeepSeekV3PreservesOffsets checks split with three
// chained pretokenizer patterns (digit groups, a CJK/kana range, then a
// general splitter) and verifies that each fragment of the input is produced
// exactly once, in order.
func TestBytePairEncodingSplitDeepSeekV3PreservesOffsets(t *testing.T) {
	t.Parallel()

	enc := NewBytePairEncoding(
		nil,
		"\\p{N}{1,3}",
		`[一-龥぀-ゟ゠-ヿ]+`,
		"[!\"#$%&'()*+,\\-./:;<=>?@\\[\\\\\\]^_`{|}~][A-Za-z]+|[^\\r\\n\\p{L}\\p{P}\\p{S}]?[\\p{L}\\p{M}]+| ?[\\p{P}\\p{S}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
	)

	// The input has no digits or CJK text, so only the last pattern matches;
	// the double newline is expected to stay together as one fragment.
	want := []string{"One", " line", "\n", "Two", " lines", "\n\n", "Three"}
	got := slices.Collect(enc.split("One line\nTwo lines\n\nThree"))

	diff := cmp.Diff(want, got)
	if diff == "" {
		return
	}
	t.Fatalf("split mismatch (-want +got):\n%s", diff)
}
func TestSplit(t *testing.T) {
cases := []struct {
name string