Skip to content

Commit f31a37e

Browse files
authored
Merge pull request #8 from adoprog/bug/merge-em-dashes
Fix slice bounds panic in findMerged when combined symbol length exceeds maxPieceLength
2 parents dd59fe9 + 97c06ba commit f31a37e

File tree

2 files changed

+47
-1
lines changed

2 files changed

+47
-1
lines changed

processor.go

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -230,7 +230,13 @@ func (proc *Processor) Encode(text string) []Token {
230230
// merge two strings together without allocations.
231231
buf := make([]byte, proc.maxPieceLength)
232232
findMerged := func(x, y symListElem) (string, int, bool) {
233-
buf = buf[:len(x.symbol)+len(y.symbol)]
233+
combinedLen := len(x.symbol) + len(y.symbol)
234+
if combinedLen > cap(buf) {
235+
// Combined symbol can't possibly exist in vocabulary
236+
// since it's longer than the longest piece.
237+
return "", 0, false
238+
}
239+
buf = buf[:combinedLen]
234240
copy(buf, x.symbol)
235241
copy(buf[len(x.symbol):], y.symbol)
236242
if id, found := proc.pieces[string(buf)]; found {

processor_test.go

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ import (
44
"fmt"
55
"os"
66
"slices"
7+
"strings"
78
"testing"
89
)
910

@@ -241,3 +242,42 @@ func TestInfo(t *testing.T) {
241242
t.Errorf("got %v, want %v", info.UnknownID, wantUNK)
242243
}
243244
}
245+
246+
// TestMergedSymbolExceedsMaxPieceLength tests that encoding doesn't panic
247+
// when BPE attempts to merge two symbols whose combined length exceeds
248+
// maxPieceLength. This was a bug where findMerged would panic with
249+
// "slice bounds out of range" when trying to reslice a buffer that was
250+
// allocated with maxPieceLength capacity.
251+
//
252+
// The bug is triggered by repeated em dashes (—) or ellipsis (…) characters
253+
// which, during BPE merging, can create adjacent intermediate symbols that
254+
// each are ~48 bytes. When findMerged tries to check if they can merge,
255+
// the combined length (96 bytes) exceeds maxPieceLength (93 bytes for Gemma).
256+
func TestMergedSymbolExceedsMaxPieceLength(t *testing.T) {
257+
proc := createProcessor(t)
258+
259+
// These test cases previously caused a panic:
260+
// panic: runtime error: slice bounds out of range [:96] with capacity 93
261+
testCases := []string{
262+
strings.Repeat("—", 32), // 32 em dashes (U+2014, 3 bytes each = 96 bytes)
263+
strings.Repeat("…", 32), // 32 ellipses (U+2026, 3 bytes each = 96 bytes)
264+
strings.Repeat("—", 64), // More em dashes
265+
strings.Repeat("…", 64), // More ellipses
266+
}
267+
268+
for _, text := range testCases {
269+
t.Run(fmt.Sprintf("len=%d", len(text)), func(t *testing.T) {
270+
// Should not panic
271+
tokens := proc.Encode(text)
272+
if len(tokens) == 0 {
273+
t.Errorf("expected at least one token for input of length %d", len(text))
274+
}
275+
276+
// Verify round-trip works
277+
decoded := proc.DecodeTokens(tokens)
278+
if decoded != text {
279+
t.Errorf("round-trip failed: got %q, want %q", decoded, text)
280+
}
281+
})
282+
}
283+
}

0 commit comments

Comments
 (0)