|
4 | 4 | "fmt" |
5 | 5 | "os" |
6 | 6 | "slices" |
| 7 | + "strings" |
7 | 8 | "testing" |
8 | 9 | ) |
9 | 10 |
|
@@ -241,3 +242,42 @@ func TestInfo(t *testing.T) { |
241 | 242 | t.Errorf("got %v, want %v", info.UnknownID, wantUNK) |
242 | 243 | } |
243 | 244 | } |
| 245 | + |
| 246 | +// TestMergedSymbolExceedsMaxPieceLength tests that encoding doesn't panic |
| 247 | +// when BPE attempts to merge two symbols whose combined length exceeds |
| 248 | +// maxPieceLength. This was a bug where findMerged would panic with |
| 249 | +// "slice bounds out of range" when trying to reslice a buffer that was |
| 250 | +// allocated with maxPieceLength capacity. |
| 251 | +// |
| 252 | +// The bug is triggered by repeated em dashes (—) or ellipsis (…) characters |
| 253 | +// which, during BPE merging, can create adjacent intermediate symbols that |
| 254 | +// each are ~48 bytes. When findMerged tries to check if they can merge, |
| 255 | +// the combined length (96 bytes) exceeds maxPieceLength (93 bytes for Gemma). |
| 256 | +func TestMergedSymbolExceedsMaxPieceLength(t *testing.T) { |
| 257 | + proc := createProcessor(t) |
| 258 | + |
| 259 | + // These test cases previously caused a panic: |
| 260 | + // panic: runtime error: slice bounds out of range [:96] with capacity 93 |
| 261 | + testCases := []string{ |
| 262 | + strings.Repeat("—", 32), // 32 em dashes (U+2014, 3 bytes each = 96 bytes) |
| 263 | + strings.Repeat("…", 32), // 32 ellipses (U+2026, 3 bytes each = 96 bytes) |
| 264 | + strings.Repeat("—", 64), // More em dashes |
| 265 | + strings.Repeat("…", 64), // More ellipses |
| 266 | + } |
| 267 | + |
| 268 | + for _, text := range testCases { |
| 269 | + t.Run(fmt.Sprintf("len=%d", len(text)), func(t *testing.T) { |
| 270 | + // Should not panic |
| 271 | + tokens := proc.Encode(text) |
| 272 | + if len(tokens) == 0 { |
| 273 | + t.Errorf("expected at least one token for input of length %d", len(text)) |
| 274 | + } |
| 275 | + |
| 276 | + // Verify round-trip works |
| 277 | + decoded := proc.DecodeTokens(tokens) |
| 278 | + if decoded != text { |
| 279 | + t.Errorf("round-trip failed: got %q, want %q", decoded, text) |
| 280 | + } |
| 281 | + }) |
| 282 | + } |
| 283 | +} |
0 commit comments