Skip to content

Commit 5ae654e

Browse files
committed
refactor: optimize chunk processing and introduce fast path for no-quote scenarios
1 parent cc7857a commit 5ae654e

File tree

5 files changed

+158
-95
lines changed

5 files changed

+158
-95
lines changed

field_parser.go

Lines changed: 43 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -294,12 +294,20 @@ func processChunkMasks(
294294
state *parserState, result *parseResult,
295295
rowFirstField, lineNum *int,
296296
) {
297-
for {
298-
combined := sepMask | nlMask | quoteMask
299-
if combined == 0 {
300-
return
301-
}
297+
combined := sepMask | nlMask | quoteMask
298+
if combined == 0 {
299+
return
300+
}
301+
302+
// Fast path: no quotes in this chunk and not inside a quoted field.
303+
// Avoids quote-related event classification overhead.
304+
if quoteMask == 0 && !state.quoted {
305+
processChunkMasksNoQuotes(buf, offset, sepMask, nlMask, state, result, rowFirstField, lineNum)
306+
return
307+
}
302308

309+
// Standard path: process all structural characters in position order
310+
for combined != 0 {
303311
pos := bits.TrailingZeros64(combined)
304312
bit := uint64(1) << pos
305313
absPos := offset + uint64(pos)
@@ -318,6 +326,36 @@ func processChunkMasks(
318326
handleNewlineEvent(buf, absPos, state, result, rowFirstField, lineNum)
319327
nlMask &^= bit
320328
}
329+
330+
combined = sepMask | nlMask | quoteMask
331+
}
332+
}
333+
334+
// processChunkMasksNoQuotes is a fast path for chunks without quotes.
335+
// Avoids quote-related checks and event classification overhead.
336+
func processChunkMasksNoQuotes(
337+
buf []byte, offset uint64,
338+
sepMask, nlMask uint64,
339+
state *parserState, result *parseResult,
340+
rowFirstField, lineNum *int,
341+
) {
342+
combined := sepMask | nlMask
343+
for combined != 0 {
344+
pos := bits.TrailingZeros64(combined)
345+
bit := uint64(1) << pos
346+
absPos := offset + uint64(pos)
347+
348+
if sepMask&bit != 0 {
349+
// Separator - always record field (not quoted)
350+
recordField(buf, absPos, state, result, false)
351+
sepMask &^= bit
352+
} else {
353+
// Newline - record field and row
354+
handleNewlineEvent(buf, absPos, state, result, rowFirstField, lineNum)
355+
nlMask &^= bit
356+
}
357+
358+
combined = sepMask | nlMask
321359
}
322360
}
323361

parse.go

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -97,19 +97,23 @@ func buildRecords(buf []byte, pr *parseResult, hasCR bool) [][]string {
9797
}
9898

9999
// buildRecordZeroCopy creates a record with zero-copy strings from buf.
100-
// Only safe when no transformation (unescape/CRLF) is needed.
100+
// Safety: Only call when no transformation (unescape/CRLF) is needed.
101+
// The returned strings reference buf directly, so buf must outlive the record.
101102
func buildRecordZeroCopy(buf []byte, pr *parseResult, row rowInfo) []string {
102103
if row.fieldCount == 0 {
103104
return nil
104105
}
106+
105107
record := make([]string, row.fieldCount)
106108
bufLen := uint32(len(buf))
107-
for i := 0; i < row.fieldCount; i++ {
108-
fieldIdx := row.firstField + i
109-
if fieldIdx >= len(pr.fields) {
110-
break
111-
}
112-
field := pr.fields[fieldIdx]
109+
110+
endIdx := row.firstField + row.fieldCount
111+
if endIdx > len(pr.fields) {
112+
endIdx = len(pr.fields)
113+
}
114+
fields := pr.fields[row.firstField:endIdx]
115+
116+
for i, field := range fields {
113117
if field.length == 0 {
114118
continue
115119
}

record_builder.go

Lines changed: 81 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,8 @@ func (r *Reader) buildRecordWithValidation(row rowInfo, rowIdx int) ([]string, e
6565
return r.buildFinalRecord(fieldCount), nil
6666
}
6767

68-
// buildRecordWithValidationZeroCopy builds a record with zero-copy strings while still validating.
68+
// buildRecordWithValidationZeroCopy builds a record with zero-copy strings while validating.
69+
// Zero-copy is safe here because rawBuffer outlives the returned record strings.
6970
func (r *Reader) buildRecordWithValidationZeroCopy(row rowInfo, fields []fieldInfo) ([]string, error) {
7071
fieldCount := row.fieldCount
7172
record := r.allocateRecord(fieldCount)
@@ -75,108 +76,126 @@ func (r *Reader) buildRecordWithValidationZeroCopy(row rowInfo, fields []fieldIn
7576
bufLen := uint32(len(buf))
7677

7778
for i, field := range fields {
78-
// Validate even in zero-copy path
7979
if err := r.validateFieldIfNeeded(field, row.lineNum); err != nil {
8080
return record[:i], err
8181
}
8282

83-
start := field.start
84-
end := start + field.length
85-
if start >= bufLen {
86-
record[i] = ""
87-
} else {
88-
if end > bufLen {
89-
end = bufLen
90-
}
91-
// Zero-copy string from rawBuffer
92-
record[i] = unsafe.String(&buf[start], int(end-start))
93-
}
83+
record[i] = r.extractFieldString(buf, bufLen, field)
9484
r.state.fieldPositions[i] = position{line: row.lineNum, column: int(field.rawStart()) + 1}
9585
}
9686
return record, nil
9787
}
9888

89+
// extractFieldString returns a zero-copy string for the field content.
90+
// Returns empty string if field is out of bounds.
91+
func (r *Reader) extractFieldString(buf []byte, bufLen uint32, field fieldInfo) string {
92+
start := field.start
93+
end := start + field.length
94+
if start >= bufLen {
95+
return ""
96+
}
97+
if end > bufLen {
98+
end = bufLen
99+
}
100+
return unsafe.String(&buf[start], int(end-start))
101+
}
102+
99103
// buildRecordNoQuotes builds a record when the input contains no quotes.
100104
// Uses a single row string to avoid per-field allocations.
105+
// Zero-copy when TrimLeadingSpace is disabled; copies when trimming is needed.
101106
func (r *Reader) buildRecordNoQuotes(row rowInfo) []string {
102107
fieldCount := row.fieldCount
103108
record := r.allocateRecord(fieldCount)
104109
r.state.fieldPositions = r.ensureFieldPositionsCapacity(fieldCount)
105110

106111
fields := r.getFieldsForRow(row, fieldCount)
107-
buf := r.state.rawBuffer
108-
bufLen := uint32(len(buf))
109-
110112
if len(fields) == 0 {
111113
return record
112114
}
113115

116+
buf := r.state.rawBuffer
117+
bufLen := uint32(len(buf))
118+
119+
// Calculate row span in buffer
114120
rowStart := fields[0].rawStart()
115121
rowEnd := fields[len(fields)-1].rawEnd()
122+
123+
// Handle out-of-bounds row
116124
if rowStart >= bufLen {
117125
for i, field := range fields {
118126
record[i] = ""
119127
r.state.fieldPositions[i] = position{line: row.lineNum, column: int(field.rawStart()) + 1}
120128
}
121129
return record
122130
}
123-
if rowEnd > bufLen {
124-
rowEnd = bufLen
131+
132+
// Clamp row bounds
133+
rowEnd = clampUint32(rowEnd, rowStart, bufLen)
134+
135+
// Create row string (copy if trimming, zero-copy otherwise)
136+
rowStr := r.createRowString(buf, rowStart, rowEnd)
137+
rowStrLen := len(rowStr)
138+
139+
// Extract fields from row string
140+
for i, field := range fields {
141+
record[i] = r.extractFieldFromRow(buf, bufLen, rowStr, rowStrLen, rowStart, field)
142+
r.state.fieldPositions[i] = position{line: row.lineNum, column: int(field.rawStart()) + 1}
143+
}
144+
return record
145+
}
146+
147+
// clampUint32 clamps value to [minVal, maxVal].
148+
func clampUint32(value, minVal, maxVal uint32) uint32 {
149+
if value < minVal {
150+
return minVal
125151
}
126-
if rowEnd < rowStart {
127-
rowEnd = rowStart
152+
if value > maxVal {
153+
return maxVal
128154
}
155+
return value
156+
}
129157

130-
var rowStr string
158+
// createRowString creates a string for the row span.
159+
// Copies when TrimLeadingSpace is enabled; zero-copy otherwise.
160+
func (r *Reader) createRowString(buf []byte, rowStart, rowEnd uint32) string {
131161
if r.TrimLeadingSpace {
132-
rowStr = string(buf[rowStart:rowEnd])
133-
} else {
134-
// Zero-copy string from rawBuffer - safe because rawBuffer outlives record.
135-
rowStr = unsafe.String(&buf[rowStart], int(rowEnd-rowStart))
162+
return string(buf[rowStart:rowEnd])
136163
}
137-
rowStrLen := len(rowStr)
138-
139-
for i, field := range fields {
140-
start := field.start
141-
end := start + field.length
142-
rawStart := field.rawStart()
164+
return unsafe.String(&buf[rowStart], int(rowEnd-rowStart))
165+
}
143166

144-
if start < bufLen {
145-
if end > bufLen {
146-
end = bufLen
147-
}
148-
if r.TrimLeadingSpace && start < end {
149-
for start < end && (buf[start] == ' ' || buf[start] == '\t') {
150-
start++
151-
}
152-
}
153-
}
167+
// extractFieldFromRow extracts a field string from the row string.
168+
func (r *Reader) extractFieldFromRow(buf []byte, bufLen uint32, rowStr string, rowStrLen int, rowStart uint32, field fieldInfo) string {
169+
start := field.start
170+
end := start + field.length
154171

155-
if start < rowStart {
156-
start = rowStart
157-
}
158-
if end < start {
159-
end = start
160-
}
161-
relStart := int(start - rowStart)
162-
relEnd := int(end - rowStart)
163-
if relStart < 0 {
164-
relStart = 0
172+
// Apply trimming if needed
173+
if r.TrimLeadingSpace && start < bufLen && start < end {
174+
trimEnd := end
175+
if trimEnd > bufLen {
176+
trimEnd = bufLen
165177
}
166-
if relStart > rowStrLen {
167-
relStart = rowStrLen
168-
}
169-
if relEnd < relStart {
170-
relEnd = relStart
171-
}
172-
if relEnd > rowStrLen {
173-
relEnd = rowStrLen
178+
for start < trimEnd && (buf[start] == ' ' || buf[start] == '\t') {
179+
start++
174180
}
181+
}
175182

176-
record[i] = rowStr[relStart:relEnd]
177-
r.state.fieldPositions[i] = position{line: row.lineNum, column: int(rawStart) + 1}
183+
// Calculate relative positions in row string
184+
relStart := clampInt(int(start)-int(rowStart), 0, rowStrLen)
185+
relEnd := clampInt(int(end)-int(rowStart), relStart, rowStrLen)
186+
187+
return rowStr[relStart:relEnd]
188+
}
189+
190+
// clampInt clamps value to [minVal, maxVal].
191+
func clampInt(value, minVal, maxVal int) int {
192+
if value < minVal {
193+
return minVal
178194
}
179-
return record
195+
if value > maxVal {
196+
return maxVal
197+
}
198+
return value
180199
}
181200

182201
// getFieldsForRow extracts the slice of fieldInfo for the given row.

simd_scanner.go

Lines changed: 20 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,8 @@ var scanResultPool = sync.Pool{
101101
}
102102

103103
// scanResultLargeCache retains a single large scanResult across GC cycles.
104+
// This prevents repeated large allocations when processing files > 1MB,
105+
// as sync.Pool may evict large objects during GC.
104106
var scanResultLargeCache struct {
105107
mu sync.Mutex
106108
sr *scanResult
@@ -123,20 +125,28 @@ func (sr *scanResult) reset() {
123125
}
124126

125127
// releaseScanResult returns a scanResult to the pool for reuse.
128+
// Large results (>= scanResultLargeThreshold) are cached separately to survive GC.
126129
func releaseScanResult(sr *scanResult) {
127-
if sr != nil {
128-
sr.reset()
129-
if cap(sr.quoteMasks) >= scanResultLargeThreshold {
130-
scanResultLargeCache.mu.Lock()
131-
if scanResultLargeCache.sr == nil || cap(scanResultLargeCache.sr.quoteMasks) < cap(sr.quoteMasks) {
132-
scanResultLargeCache.sr = sr
133-
scanResultLargeCache.mu.Unlock()
134-
return
135-
}
130+
if sr == nil {
131+
return
132+
}
133+
134+
sr.reset()
135+
136+
// Cache large results separately to prevent GC eviction
137+
if cap(sr.quoteMasks) >= scanResultLargeThreshold {
138+
scanResultLargeCache.mu.Lock()
139+
shouldCache := scanResultLargeCache.sr == nil ||
140+
cap(scanResultLargeCache.sr.quoteMasks) < cap(sr.quoteMasks)
141+
if shouldCache {
142+
scanResultLargeCache.sr = sr
136143
scanResultLargeCache.mu.Unlock()
144+
return
137145
}
138-
scanResultPool.Put(sr)
146+
scanResultLargeCache.mu.Unlock()
139147
}
148+
149+
scanResultPool.Put(sr)
140150
}
141151

142152
// =============================================================================

validation.go

Lines changed: 3 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22

33
package simdcsv
44

5+
import "bytes"
6+
57
// =============================================================================
68
// Validation Policy - Configurable behavior decisions
79
// =============================================================================
@@ -164,24 +166,14 @@ func (r *Reader) validateQuotedField(raw []byte, rawStart uint64, lineNum int) e
164166
// validateUnquotedField validates a field that does not start with a quote.
165167
// Reports ErrBareQuote if quotes appear in unquoted fields.
166168
func (r *Reader) validateUnquotedField(raw []byte, rawStart uint64, lineNum int) error {
167-
quotePos := findBareQuote(raw)
169+
quotePos := bytes.IndexByte(raw, '"')
168170
if quotePos == -1 {
169171
return nil
170172
}
171173
col := int(rawStart) + quotePos + 1 //nolint:gosec // G115
172174
return &ParseError{StartLine: lineNum, Line: lineNum, Column: col, Err: ErrBareQuote}
173175
}
174176

175-
// findBareQuote returns the index of the first quote in data, or -1 if none found.
176-
func findBareQuote(data []byte) int {
177-
for i, b := range data {
178-
if b == '"' {
179-
return i
180-
}
181-
}
182-
return -1
183-
}
184-
185177
// =============================================================================
186178
// Quote Structure Validators - Single responsibility functions
187179
// =============================================================================

0 commit comments

Comments
 (0)