Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 43 additions & 5 deletions field_parser.go
Original file line number Diff line number Diff line change
Expand Up @@ -294,12 +294,20 @@ func processChunkMasks(
state *parserState, result *parseResult,
rowFirstField, lineNum *int,
) {
for {
combined := sepMask | nlMask | quoteMask
if combined == 0 {
return
}
combined := sepMask | nlMask | quoteMask
if combined == 0 {
return
}

// Fast path: no quotes in this chunk and not inside a quoted field.
// Avoids quote-related event classification overhead.
if quoteMask == 0 && !state.quoted {
processChunkMasksNoQuotes(buf, offset, sepMask, nlMask, state, result, rowFirstField, lineNum)
return
}

// Standard path: process all structural characters in position order
for combined != 0 {
pos := bits.TrailingZeros64(combined)
bit := uint64(1) << pos
absPos := offset + uint64(pos)
Expand All @@ -318,6 +326,36 @@ func processChunkMasks(
handleNewlineEvent(buf, absPos, state, result, rowFirstField, lineNum)
nlMask &^= bit
}

combined = sepMask | nlMask | quoteMask
}
}

// processChunkMasksNoQuotes is a fast path for chunks without quotes.
// Avoids quote-related checks and event classification overhead.
func processChunkMasksNoQuotes(
	buf []byte, offset uint64,
	sepMask, nlMask uint64,
	state *parserState, result *parseResult,
	rowFirstField, lineNum *int,
) {
	// Consume structural positions in ascending order until both masks are drained.
	for pending := sepMask | nlMask; pending != 0; pending = sepMask | nlMask {
		idx := bits.TrailingZeros64(pending)
		mask := uint64(1) << idx
		absPos := offset + uint64(idx)

		if sepMask&mask == 0 {
			// Newline - record field and row.
			handleNewlineEvent(buf, absPos, state, result, rowFirstField, lineNum)
			nlMask &^= mask
			continue
		}
		// Separator - always record field (never quoted on this path).
		recordField(buf, absPos, state, result, false)
		sepMask &^= mask
	}
}

Expand Down
18 changes: 11 additions & 7 deletions parse.go
Original file line number Diff line number Diff line change
Expand Up @@ -97,19 +97,23 @@ func buildRecords(buf []byte, pr *parseResult, hasCR bool) [][]string {
}

// buildRecordZeroCopy creates a record with zero-copy strings from buf.
// Only safe when no transformation (unescape/CRLF) is needed.
// Safety: Only call when no transformation (unescape/CRLF) is needed.
// The returned strings reference buf directly, so buf must outlive the record.
func buildRecordZeroCopy(buf []byte, pr *parseResult, row rowInfo) []string {
if row.fieldCount == 0 {
return nil
}

record := make([]string, row.fieldCount)
bufLen := uint32(len(buf))
for i := 0; i < row.fieldCount; i++ {
fieldIdx := row.firstField + i
if fieldIdx >= len(pr.fields) {
break
}
field := pr.fields[fieldIdx]

endIdx := row.firstField + row.fieldCount
if endIdx > len(pr.fields) {
endIdx = len(pr.fields)
}
fields := pr.fields[row.firstField:endIdx]

for i, field := range fields {
if field.length == 0 {
continue
}
Expand Down
143 changes: 81 additions & 62 deletions record_builder.go
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,8 @@ func (r *Reader) buildRecordWithValidation(row rowInfo, rowIdx int) ([]string, e
return r.buildFinalRecord(fieldCount), nil
}

// buildRecordWithValidationZeroCopy builds a record with zero-copy strings while still validating.
// buildRecordWithValidationZeroCopy builds a record with zero-copy strings while validating.
// Zero-copy is safe here because rawBuffer outlives the returned record strings.
func (r *Reader) buildRecordWithValidationZeroCopy(row rowInfo, fields []fieldInfo) ([]string, error) {
fieldCount := row.fieldCount
record := r.allocateRecord(fieldCount)
Expand All @@ -75,108 +76,126 @@ func (r *Reader) buildRecordWithValidationZeroCopy(row rowInfo, fields []fieldIn
bufLen := uint32(len(buf))

for i, field := range fields {
// Validate even in zero-copy path
if err := r.validateFieldIfNeeded(field, row.lineNum); err != nil {
return record[:i], err
}

start := field.start
end := start + field.length
if start >= bufLen {
record[i] = ""
} else {
if end > bufLen {
end = bufLen
}
// Zero-copy string from rawBuffer
record[i] = unsafe.String(&buf[start], int(end-start))
}
record[i] = r.extractFieldString(buf, bufLen, field)
r.state.fieldPositions[i] = position{line: row.lineNum, column: int(field.rawStart()) + 1}
}
return record, nil
}

// extractFieldString returns a zero-copy string for the field content.
// Returns empty string if field is out of bounds.
func (r *Reader) extractFieldString(buf []byte, bufLen uint32, field fieldInfo) string {
	start := field.start
	if start >= bufLen {
		// Field begins past the buffer - nothing to reference.
		return ""
	}
	end := start + field.length
	if end > bufLen {
		end = bufLen
	}
	// Zero-copy view into buf; caller guarantees buf outlives the string.
	return unsafe.String(&buf[start], int(end-start))
}

// buildRecordNoQuotes builds a record when the input contains no quotes.
// Uses a single row string to avoid per-field allocations.
// Zero-copy when TrimLeadingSpace is disabled; copies when trimming is needed.
func (r *Reader) buildRecordNoQuotes(row rowInfo) []string {
fieldCount := row.fieldCount
record := r.allocateRecord(fieldCount)
r.state.fieldPositions = r.ensureFieldPositionsCapacity(fieldCount)

fields := r.getFieldsForRow(row, fieldCount)
buf := r.state.rawBuffer
bufLen := uint32(len(buf))

if len(fields) == 0 {
return record
}

buf := r.state.rawBuffer
bufLen := uint32(len(buf))

// Calculate row span in buffer
rowStart := fields[0].rawStart()
rowEnd := fields[len(fields)-1].rawEnd()

// Handle out-of-bounds row
if rowStart >= bufLen {
for i, field := range fields {
record[i] = ""
r.state.fieldPositions[i] = position{line: row.lineNum, column: int(field.rawStart()) + 1}
}
return record
}
if rowEnd > bufLen {
rowEnd = bufLen

// Clamp row bounds
rowEnd = clampUint32(rowEnd, rowStart, bufLen)

// Create row string (copy if trimming, zero-copy otherwise)
rowStr := r.createRowString(buf, rowStart, rowEnd)
rowStrLen := len(rowStr)

// Extract fields from row string
for i, field := range fields {
record[i] = r.extractFieldFromRow(buf, bufLen, rowStr, rowStrLen, rowStart, field)
r.state.fieldPositions[i] = position{line: row.lineNum, column: int(field.rawStart()) + 1}
}
return record
}

// clampUint32 clamps value to [minVal, maxVal].
// Note: if minVal > maxVal, minVal wins (callers must pass a valid range).
func clampUint32(value, minVal, maxVal uint32) uint32 {
	if value < minVal {
		return minVal
	}
	if value > maxVal {
		return maxVal
	}
	return value
}

var rowStr string
// createRowString creates a string for the row span.
// Copies when TrimLeadingSpace is enabled; zero-copy otherwise.
func (r *Reader) createRowString(buf []byte, rowStart, rowEnd uint32) string {
if r.TrimLeadingSpace {
rowStr = string(buf[rowStart:rowEnd])
} else {
// Zero-copy string from rawBuffer - safe because rawBuffer outlives record.
rowStr = unsafe.String(&buf[rowStart], int(rowEnd-rowStart))
return string(buf[rowStart:rowEnd])
}
rowStrLen := len(rowStr)

for i, field := range fields {
start := field.start
end := start + field.length
rawStart := field.rawStart()
return unsafe.String(&buf[rowStart], int(rowEnd-rowStart))
}

if start < bufLen {
if end > bufLen {
end = bufLen
}
if r.TrimLeadingSpace && start < end {
for start < end && (buf[start] == ' ' || buf[start] == '\t') {
start++
}
}
}
// extractFieldFromRow extracts a field string from the row string.
func (r *Reader) extractFieldFromRow(buf []byte, bufLen uint32, rowStr string, rowStrLen int, rowStart uint32, field fieldInfo) string {
start := field.start
end := start + field.length

if start < rowStart {
start = rowStart
}
if end < start {
end = start
}
relStart := int(start - rowStart)
relEnd := int(end - rowStart)
if relStart < 0 {
relStart = 0
// Apply trimming if needed
if r.TrimLeadingSpace && start < bufLen && start < end {
trimEnd := end
if trimEnd > bufLen {
trimEnd = bufLen
}
if relStart > rowStrLen {
relStart = rowStrLen
}
if relEnd < relStart {
relEnd = relStart
}
if relEnd > rowStrLen {
relEnd = rowStrLen
for start < trimEnd && (buf[start] == ' ' || buf[start] == '\t') {
start++
}
}

record[i] = rowStr[relStart:relEnd]
r.state.fieldPositions[i] = position{line: row.lineNum, column: int(rawStart) + 1}
// Calculate relative positions in row string
relStart := clampInt(int(start)-int(rowStart), 0, rowStrLen)
relEnd := clampInt(int(end)-int(rowStart), relStart, rowStrLen)

return rowStr[relStart:relEnd]
}

// clampInt clamps value to [minVal, maxVal].
// Note: if minVal > maxVal, minVal wins (callers must pass a valid range).
func clampInt(value, minVal, maxVal int) int {
	if value < minVal {
		return minVal
	}
	if value > maxVal {
		return maxVal
	}
	return value
}

// getFieldsForRow extracts the slice of fieldInfo for the given row.
Expand Down
30 changes: 20 additions & 10 deletions simd_scanner.go
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,8 @@ var scanResultPool = sync.Pool{
}

// scanResultLargeCache retains a single large scanResult across GC cycles.
// This prevents repeated large allocations when processing files > 1MB,
// as sync.Pool may evict large objects during GC.
var scanResultLargeCache struct {
mu sync.Mutex
sr *scanResult
Expand All @@ -123,20 +125,28 @@ func (sr *scanResult) reset() {
}

// releaseScanResult returns a scanResult to the pool for reuse.
// Large results (>= scanResultLargeThreshold) are cached separately to survive GC.
func releaseScanResult(sr *scanResult) {
if sr != nil {
sr.reset()
if cap(sr.quoteMasks) >= scanResultLargeThreshold {
scanResultLargeCache.mu.Lock()
if scanResultLargeCache.sr == nil || cap(scanResultLargeCache.sr.quoteMasks) < cap(sr.quoteMasks) {
scanResultLargeCache.sr = sr
scanResultLargeCache.mu.Unlock()
return
}
if sr == nil {
return
}

sr.reset()

// Cache large results separately to prevent GC eviction
if cap(sr.quoteMasks) >= scanResultLargeThreshold {
scanResultLargeCache.mu.Lock()
shouldCache := scanResultLargeCache.sr == nil ||
cap(scanResultLargeCache.sr.quoteMasks) < cap(sr.quoteMasks)
if shouldCache {
scanResultLargeCache.sr = sr
scanResultLargeCache.mu.Unlock()
return
}
scanResultPool.Put(sr)
scanResultLargeCache.mu.Unlock()
}

scanResultPool.Put(sr)
}

// =============================================================================
Expand Down
14 changes: 3 additions & 11 deletions validation.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

package simdcsv

import "bytes"

// =============================================================================
// Validation Policy - Configurable behavior decisions
// =============================================================================
Expand Down Expand Up @@ -164,24 +166,14 @@ func (r *Reader) validateQuotedField(raw []byte, rawStart uint64, lineNum int) e
// validateUnquotedField validates a field that does not start with a quote.
// Reports ErrBareQuote if quotes appear in unquoted fields.
func (r *Reader) validateUnquotedField(raw []byte, rawStart uint64, lineNum int) error {
	// bytes.IndexByte is an optimized single-byte scan (SIMD on most platforms).
	quotePos := bytes.IndexByte(raw, '"')
	if quotePos == -1 {
		return nil
	}
	col := int(rawStart) + quotePos + 1 //nolint:gosec // G115
	return &ParseError{StartLine: lineNum, Line: lineNum, Column: col, Err: ErrBareQuote}
}

// findBareQuote returns the index of the first quote in data, or -1 if none found.
// Delegates to bytes.IndexByte, which performs an optimized (often SIMD) scan
// instead of a byte-at-a-time Go loop.
func findBareQuote(data []byte) int {
	return bytes.IndexByte(data, '"')
}

// =============================================================================
// Quote Structure Validators - Single responsibility functions
// =============================================================================
Expand Down