Skip to content

Commit 0af2152

Browse files
wesm and claude authored
fix: skip oversized JSONL lines instead of failing the session (#20)
## Summary

- Replace `bufio.Scanner` with a custom `lineReader` that uses `bufio.Reader.ReadLine()` to skip lines exceeding 64MB instead of aborting the entire session parse
- Propagate I/O errors through `lineReader.Err()` (matching `bufio.Scanner` pattern) rather than silently swallowing them
- Replace all three scan sites (Claude parser, Claude hints extractor, Codex parser)

Closes #12

## Test plan

- [x] `TestLineReader` — normal lines, oversized skip, all oversized, empty input, blank lines, no trailing newline, exact limit, one over limit
- [x] `TestLineReaderIOError` — custom `io.Reader` that yields data then returns a non-EOF error; verifies lines are returned and `Err()` exposes the I/O error
- [x] `TestParseCodexSessionOversizedLineSkipped` — integration test placing an oversized line between normal JSONL entries
- [x] All Go tests pass (`CGO_ENABLED=1 go test -tags fts5 ./...`)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

---------

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
1 parent b8ae7c8 commit 0af2152

File tree

5 files changed

+287
-83
lines changed

5 files changed

+287
-83
lines changed

internal/parser/claude.go

Lines changed: 17 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
package parser
22

33
import (
4-
"bufio"
54
"fmt"
5+
"log"
66
"os"
77
"path/filepath"
88
"strings"
@@ -13,7 +13,7 @@ import (
1313

1414
const (
1515
initialScanBufSize = 64 * 1024 // 64KB
16-
maxScanTokenSize = 20 * 1024 * 1024 // 20MB
16+
maxLineSize = 64 * 1024 * 1024 // 64MB
1717
)
1818

1919
// ParseClaudeSession parses a Claude Code JSONL session file.
@@ -43,15 +43,12 @@ func ParseClaudeSession(
4343
ordinal int
4444
)
4545

46-
scanner := bufio.NewScanner(f)
47-
scanner.Buffer(
48-
make([]byte, 0, initialScanBufSize), maxScanTokenSize,
49-
)
46+
lr := newLineReader(f, maxLineSize)
5047

51-
for scanner.Scan() {
52-
line := scanner.Text()
53-
if strings.TrimSpace(line) == "" {
54-
continue
48+
for {
49+
line, ok := lr.next()
50+
if !ok {
51+
break
5552
}
5653

5754
if !gjson.Valid(line) {
@@ -137,9 +134,9 @@ func ParseClaudeSession(
137134
}
138135
}
139136

140-
if err := scanner.Err(); err != nil {
137+
if err := lr.Err(); err != nil {
141138
return ParsedSession{}, nil,
142-
fmt.Errorf("scanning %s: %w", path, err)
139+
fmt.Errorf("reading %s: %w", path, err)
143140
}
144141

145142
sess := ParsedSession{
@@ -173,15 +170,12 @@ func ExtractClaudeProjectHints(
173170
}
174171
defer f.Close()
175172

176-
scanner := bufio.NewScanner(f)
177-
scanner.Buffer(
178-
make([]byte, 0, initialScanBufSize), maxScanTokenSize,
179-
)
173+
lr := newLineReader(f, maxLineSize)
180174

181-
for scanner.Scan() {
182-
line := scanner.Text()
183-
if strings.TrimSpace(line) == "" {
184-
continue
175+
for {
176+
line, ok := lr.next()
177+
if !ok {
178+
break
185179
}
186180
if !gjson.Valid(line) {
187181
continue
@@ -198,6 +192,9 @@ func ExtractClaudeProjectHints(
198192
}
199193
}
200194
}
195+
if err := lr.Err(); err != nil {
196+
log.Printf("reading hints from %s: %v", path, err)
197+
}
201198
return cwd, gitBranch
202199
}
203200

internal/parser/codex.go

Lines changed: 7 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
package parser
22

33
import (
4-
"bufio"
54
"fmt"
65
"os"
76
"path/filepath"
@@ -504,17 +503,13 @@ func ParseCodexSession(
504503
}
505504
defer f.Close()
506505

507-
scanner := bufio.NewScanner(f)
508-
scanner.Buffer(
509-
make([]byte, 0, initialScanBufSize), maxScanTokenSize,
510-
)
511-
506+
lr := newLineReader(f, maxLineSize)
512507
b := newCodexSessionBuilder(includeExec)
513508

514-
for scanner.Scan() {
515-
line := scanner.Text()
516-
if strings.TrimSpace(line) == "" {
517-
continue
509+
for {
510+
line, ok := lr.next()
511+
if !ok {
512+
break
518513
}
519514
if !gjson.Valid(line) {
520515
continue
@@ -524,9 +519,9 @@ func ParseCodexSession(
524519
}
525520
}
526521

527-
if err := scanner.Err(); err != nil {
522+
if err := lr.Err(); err != nil {
528523
return nil, nil,
529-
fmt.Errorf("scanning codex %s: %w", path, err)
524+
fmt.Errorf("reading codex %s: %w", path, err)
530525
}
531526

532527
sessionID := b.sessionID

internal/parser/linereader.go

Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
package parser
2+
3+
import (
4+
"bufio"
5+
"io"
6+
)
7+
8+
// lineReader reads JSONL files line by line, skipping lines that
9+
// exceed maxLen rather than aborting. The buffer starts small and
10+
// grows on demand up to maxLen. After iteration, call Err() to
11+
// check for I/O errors (as opposed to normal EOF).
12+
type lineReader struct {
13+
r *bufio.Reader
14+
maxLen int
15+
buf []byte
16+
err error
17+
}
18+
19+
func newLineReader(r io.Reader, maxLen int) *lineReader {
20+
return &lineReader{
21+
r: bufio.NewReaderSize(r, initialScanBufSize),
22+
maxLen: maxLen,
23+
buf: make([]byte, 0, initialScanBufSize),
24+
}
25+
}
26+
27+
// next returns the next line (without trailing newline) and true,
28+
// or ("", false) at EOF or read error. After the loop, call Err()
29+
// to distinguish EOF from I/O failure.
30+
func (lr *lineReader) next() (string, bool) {
31+
for {
32+
line, err := lr.readLine()
33+
if err != nil {
34+
if err != io.EOF {
35+
lr.err = err
36+
}
37+
return "", false
38+
}
39+
if line != "" {
40+
return line, true
41+
}
42+
// Empty line or skipped oversized line — continue.
43+
}
44+
}
45+
46+
// Err returns the first non-EOF read error encountered, or nil.
47+
func (lr *lineReader) Err() error {
48+
return lr.err
49+
}
50+
51+
// readLine reads a full line, returning "" for blank/oversized
52+
// lines and a non-nil error only at EOF or read failure.
53+
func (lr *lineReader) readLine() (string, error) {
54+
lr.buf = lr.buf[:0]
55+
oversized := false
56+
57+
for {
58+
chunk, isPrefix, err := lr.r.ReadLine()
59+
if err != nil {
60+
if len(lr.buf) > 0 && err == io.EOF {
61+
break
62+
}
63+
return "", err
64+
}
65+
66+
if oversized {
67+
if !isPrefix {
68+
return "", nil // done skipping
69+
}
70+
continue
71+
}
72+
73+
lr.buf = append(lr.buf, chunk...)
74+
75+
if len(lr.buf) > lr.maxLen {
76+
oversized = true
77+
lr.buf = lr.buf[:0]
78+
if !isPrefix {
79+
return "", nil
80+
}
81+
continue
82+
}
83+
84+
if !isPrefix {
85+
break
86+
}
87+
}
88+
89+
return string(lr.buf), nil
90+
}

internal/parser/linereader_test.go

Lines changed: 141 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,141 @@
1+
package parser
2+
3+
import (
4+
"errors"
5+
"io"
6+
"strings"
7+
"testing"
8+
)
9+
10+
func TestLineReader(t *testing.T) {
11+
tests := []struct {
12+
name string
13+
input string
14+
maxLen int
15+
want []string
16+
}{
17+
{
18+
"normal lines",
19+
"aaa\nbbb\nccc\n",
20+
100,
21+
[]string{"aaa", "bbb", "ccc"},
22+
},
23+
{
24+
"skips oversized line",
25+
"short\n" + strings.Repeat("x", 50) + "\nafter\n",
26+
30,
27+
[]string{"short", "after"},
28+
},
29+
{
30+
"all lines oversized",
31+
strings.Repeat("a", 50) + "\n" +
32+
strings.Repeat("b", 50) + "\n",
33+
30,
34+
nil,
35+
},
36+
{
37+
"empty input",
38+
"",
39+
100,
40+
nil,
41+
},
42+
{
43+
"blank lines skipped",
44+
"aaa\n\n\nbbb\n",
45+
100,
46+
[]string{"aaa", "bbb"},
47+
},
48+
{
49+
"line without trailing newline",
50+
"aaa\nbbb",
51+
100,
52+
[]string{"aaa", "bbb"},
53+
},
54+
{
55+
"exact limit kept",
56+
strings.Repeat("x", 30) + "\n",
57+
30,
58+
[]string{strings.Repeat("x", 30)},
59+
},
60+
{
61+
"one over limit skipped",
62+
strings.Repeat("x", 31) + "\n",
63+
30,
64+
nil,
65+
},
66+
}
67+
68+
for _, tt := range tests {
69+
t.Run(tt.name, func(t *testing.T) {
70+
lr := newLineReader(
71+
strings.NewReader(tt.input), tt.maxLen,
72+
)
73+
var got []string
74+
for {
75+
line, ok := lr.next()
76+
if !ok {
77+
break
78+
}
79+
got = append(got, line)
80+
}
81+
if len(got) != len(tt.want) {
82+
t.Fatalf("got %d lines, want %d: %v",
83+
len(got), len(tt.want), got)
84+
}
85+
for i := range got {
86+
if got[i] != tt.want[i] {
87+
t.Errorf("line[%d] = %q, want %q",
88+
i, got[i], tt.want[i])
89+
}
90+
}
91+
})
92+
}
93+
}
94+
95+
// errAfterReader is a test io.Reader: it serves the bytes held in
// buf and, once buf is exhausted, reports errIO in place of io.EOF
// on that read and on every read afterwards.
type errAfterReader struct {
	buf   *strings.Reader // data handed out before the failure
	errIO error           // error substituted for io.EOF
	done  bool            // true once buf has reported io.EOF
}

// Read copies from buf into p until buf reports io.EOF; from then
// on it returns errIO.
func (er *errAfterReader) Read(p []byte) (int, error) {
	if !er.done {
		n, err := er.buf.Read(p)
		if err != io.EOF {
			return n, err
		}
		// buf is exhausted: remember that and swap EOF for errIO.
		er.done = true
		return n, er.errIO
	}
	return 0, er.errIO
}
114+
115+
func TestLineReaderIOError(t *testing.T) {
116+
ioErr := errors.New("disk read failed")
117+
r := &errAfterReader{
118+
buf: strings.NewReader("aaa\nbbb\n"),
119+
errIO: ioErr,
120+
}
121+
122+
lr := newLineReader(r, 100)
123+
var got []string
124+
for {
125+
line, ok := lr.next()
126+
if !ok {
127+
break
128+
}
129+
got = append(got, line)
130+
}
131+
132+
if len(got) != 2 {
133+
t.Fatalf("got %d lines, want 2: %v", len(got), got)
134+
}
135+
if lr.Err() == nil {
136+
t.Fatal("expected non-nil Err() after I/O failure")
137+
}
138+
if !errors.Is(lr.Err(), ioErr) {
139+
t.Fatalf("Err() = %v, want %v", lr.Err(), ioErr)
140+
}
141+
}

0 commit comments

Comments
 (0)