Skip to content

Commit 105dd1d

Browse files
authored
refactor: improve memory allocation and parsing efficiency in CSV processing (#64)
1 parent a0aa03c commit 105dd1d

File tree

4 files changed

+234
-34
lines changed

4 files changed

+234
-34
lines changed

field_parser.go

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -203,7 +203,23 @@ func estimateCounts(bufLen int, sr *scanResult) (estimatedFields, estimatedRows
203203
}
204204

205205
// ensureResultCapacity ensures result slices have sufficient capacity.
206+
// Uses scan counts for accurate pre-allocation when available.
206207
func ensureResultCapacity(result *parseResult, bufLen int, sr *scanResult) {
208+
// Use exact counts from scan when available (most accurate)
209+
if sr != nil && sr.separatorCount > 0 {
210+
estimatedFields := sr.separatorCount + sr.newlineCount + 1
211+
estimatedRows := sr.newlineCount + 1
212+
213+
if cap(result.fields) < estimatedFields {
214+
result.fields = make([]fieldInfo, 0, estimatedFields)
215+
}
216+
if cap(result.rows) < estimatedRows {
217+
result.rows = make([]rowInfo, 0, estimatedRows)
218+
}
219+
return
220+
}
221+
222+
// Fallback: conservative estimate from buffer size
207223
estimatedFields, estimatedRows := estimateCounts(bufLen, sr)
208224

209225
if cap(result.fields) < estimatedFields {

parse.go

Lines changed: 61 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -58,18 +58,36 @@ func ParseBytesStreaming(data []byte, comma rune, callback func([]string) error)
5858
// ============================================================================
5959

6060
// buildRecords converts a parseResult to [][]string.
61-
// Optimizes memory by accumulating fields into a single buffer per record,
62-
// then using zero-copy slicing after a single string conversion.
61+
// Fast path: zero-copy when no transformation needed.
62+
// Slow path: accumulate into buffer when unescape/CRLF handling required.
6363
func buildRecords(buf []byte, pr *parseResult, hasCR bool) [][]string {
6464
if pr == nil || len(pr.rows) == 0 {
6565
return nil
6666
}
6767

6868
records := make([][]string, len(pr.rows))
6969

70-
// fieldEnds can be reused, but recordBuf must be unique per record for unsafe.String
71-
var fieldEnds []int
70+
// Check if any field needs transformation
71+
needsTransform := hasCR
72+
if !needsTransform {
73+
for _, f := range pr.fields {
74+
if f.needsUnescape() {
75+
needsTransform = true
76+
break
77+
}
78+
}
79+
}
80+
81+
// Fast path: zero-copy direct from buffer when no transformation needed
82+
if !needsTransform {
83+
for i, row := range pr.rows {
84+
records[i] = buildRecordZeroCopy(buf, pr, row)
85+
}
86+
return records
87+
}
7288

89+
// Slow path: accumulate with transformation
90+
var fieldEnds []int
7391
for i, row := range pr.rows {
7492
var recordBuf []byte
7593
recordBuf, fieldEnds = accumulateFields(buf, pr, row, hasCR, recordBuf, fieldEnds[:0])
@@ -78,6 +96,36 @@ func buildRecords(buf []byte, pr *parseResult, hasCR bool) [][]string {
7896
return records
7997
}
8098

99+
// buildRecordZeroCopy creates a record with zero-copy strings from buf.
100+
// Only safe when no transformation (unescape/CRLF) is needed.
101+
func buildRecordZeroCopy(buf []byte, pr *parseResult, row rowInfo) []string {
102+
if row.fieldCount == 0 {
103+
return nil
104+
}
105+
record := make([]string, row.fieldCount)
106+
bufLen := uint32(len(buf))
107+
for i := 0; i < row.fieldCount; i++ {
108+
fieldIdx := row.firstField + i
109+
if fieldIdx >= len(pr.fields) {
110+
break
111+
}
112+
field := pr.fields[fieldIdx]
113+
if field.length == 0 {
114+
continue
115+
}
116+
start := field.start
117+
end := start + field.length
118+
if start >= bufLen {
119+
continue
120+
}
121+
if end > bufLen {
122+
end = bufLen
123+
}
124+
record[i] = unsafe.String(&buf[start], int(end-start))
125+
}
126+
return record
127+
}
128+
81129
// buildRecord builds a single record from a rowInfo (for streaming API).
82130
func buildRecord(buf []byte, pr *parseResult, row rowInfo, hasCR bool) []string {
83131
recordBuf, fieldEnds := accumulateFields(buf, pr, row, hasCR, nil, nil)
@@ -101,15 +149,20 @@ func accumulateFields(buf []byte, pr *parseResult, row rowInfo, hasCR bool, reco
101149
// sliceFieldsFromBuffer converts the accumulated buffer to individual field strings.
102150
// Uses unsafe.String for zero-copy conversion. Caller must ensure recordBuf is not reused.
103151
func sliceFieldsFromBuffer(recordBuf []byte, fieldEnds []int) []string {
152+
fieldCount := len(fieldEnds)
153+
if fieldCount == 0 {
154+
return nil
155+
}
156+
record := make([]string, fieldCount)
104157
if len(recordBuf) == 0 {
105-
return make([]string, len(fieldEnds))
158+
return record
106159
}
107160
// Zero-copy string conversion - safe because recordBuf is unique per record
108-
str := unsafe.String(unsafe.SliceData(recordBuf), len(recordBuf))
109-
record := make([]string, len(fieldEnds))
110161
prevEnd := 0
111162
for i, end := range fieldEnds {
112-
record[i] = str[prevEnd:end]
163+
if prevEnd < end && prevEnd < len(recordBuf) {
164+
record[i] = unsafe.String(&recordBuf[prevEnd], end-prevEnd)
165+
}
113166
prevEnd = end
114167
}
115168
return record

record_builder.go

Lines changed: 116 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,10 @@
33
//nolint:gosec // G115: Integer conversions are safe - buffer size bounded by DefaultMaxInputSize (2GB)
44
package simdcsv
55

6-
import "bytes"
6+
import (
7+
"bytes"
8+
"unsafe"
9+
)
710

811
// Buffer allocation constants for reducing reallocations in hot path.
912
const (
@@ -30,10 +33,27 @@ const (
3033
// This matches encoding/csv behavior and allows callers to recover partial data.
3134
func (r *Reader) buildRecordWithValidation(row rowInfo, rowIdx int) ([]string, error) {
3235
fieldCount := row.fieldCount
33-
r.prepareBuffers(row, fieldCount)
34-
3536
fields := r.getFieldsForRow(row, fieldCount)
3637

38+
// Fast path: check if any field needs transformation
39+
needsTransform := r.state.hasCR
40+
if !needsTransform {
41+
for _, field := range fields {
42+
if field.needsUnescape() {
43+
needsTransform = true
44+
break
45+
}
46+
}
47+
}
48+
49+
// Fast path: zero-copy when no transformation needed (but still validate)
50+
if !needsTransform && !r.TrimLeadingSpace {
51+
return r.buildRecordWithValidationZeroCopy(row, fields)
52+
}
53+
54+
// Standard path with transformation
55+
r.prepareBuffers(row, fieldCount)
56+
3757
for i, field := range fields {
3858
if err := r.validateFieldIfNeeded(field, row.lineNum); err != nil {
3959
return r.buildPartialRecord(i), err
@@ -45,36 +65,116 @@ func (r *Reader) buildRecordWithValidation(row rowInfo, rowIdx int) ([]string, e
4565
return r.buildFinalRecord(fieldCount), nil
4666
}
4767

48-
// buildRecordNoQuotes builds a record when the input contains no quotes.
49-
// It avoids the recordBuffer copy path and mirrors appendSimpleContent behavior.
50-
func (r *Reader) buildRecordNoQuotes(row rowInfo) []string {
68+
// buildRecordWithValidationZeroCopy builds a record with zero-copy strings while still validating.
69+
func (r *Reader) buildRecordWithValidationZeroCopy(row rowInfo, fields []fieldInfo) ([]string, error) {
5170
fieldCount := row.fieldCount
5271
record := r.allocateRecord(fieldCount)
5372
r.state.fieldPositions = r.ensureFieldPositionsCapacity(fieldCount)
5473

55-
fields := r.getFieldsForRow(row, fieldCount)
5674
buf := r.state.rawBuffer
5775
bufLen := uint32(len(buf))
5876

5977
for i, field := range fields {
78+
// Validate even in zero-copy path
79+
if err := r.validateFieldIfNeeded(field, row.lineNum); err != nil {
80+
return record[:i], err
81+
}
82+
6083
start := field.start
6184
end := start + field.length
6285
if start >= bufLen {
6386
record[i] = ""
64-
r.state.fieldPositions[i] = position{line: row.lineNum, column: int(start) + 1}
65-
continue
87+
} else {
88+
if end > bufLen {
89+
end = bufLen
90+
}
91+
// Zero-copy string from rawBuffer
92+
record[i] = unsafe.String(&buf[start], int(end-start))
6693
}
67-
if end > bufLen {
68-
end = bufLen
94+
r.state.fieldPositions[i] = position{line: row.lineNum, column: int(field.rawStart()) + 1}
95+
}
96+
return record, nil
97+
}
98+
99+
// buildRecordNoQuotes builds a record when the input contains no quotes.
// Uses a single row string to avoid per-field allocations: the whole row is
// converted (or zero-copy aliased) once, and each field becomes a substring
// of that row string.
func (r *Reader) buildRecordNoQuotes(row rowInfo) []string {
	fieldCount := row.fieldCount
	record := r.allocateRecord(fieldCount)
	r.state.fieldPositions = r.ensureFieldPositionsCapacity(fieldCount)

	fields := r.getFieldsForRow(row, fieldCount)
	buf := r.state.rawBuffer
	bufLen := uint32(len(buf))

	if len(fields) == 0 {
		return record
	}

	// The row spans from the first field's raw start to the last field's raw end.
	rowStart := fields[0].rawStart()
	rowEnd := fields[len(fields)-1].rawEnd()
	// Degenerate case: the row starts beyond the buffer — emit empty fields,
	// still recording per-field positions.
	if rowStart >= bufLen {
		for i, field := range fields {
			record[i] = ""
			r.state.fieldPositions[i] = position{line: row.lineNum, column: int(field.rawStart()) + 1}
		}
		return record
	}
	// Clamp the row span to the buffer.
	if rowEnd > bufLen {
		rowEnd = bufLen
	}
	if rowEnd < rowStart {
		rowEnd = rowStart
	}

	var rowStr string
	if r.TrimLeadingSpace {
		// NOTE(review): the trimming path makes a copy of the row while the
		// non-trimming path aliases rawBuffer; confirm whether the copy is
		// required or merely conservative.
		rowStr = string(buf[rowStart:rowEnd])
	} else {
		// Zero-copy string from rawBuffer - safe because rawBuffer outlives record.
		rowStr = unsafe.String(&buf[rowStart], int(rowEnd-rowStart))
	}
	rowStrLen := len(rowStr)

	for i, field := range fields {
		start := field.start
		end := start + field.length
		rawStart := field.rawStart()

		if start < bufLen {
			if end > bufLen {
				end = bufLen
			}
			// Advance start past leading spaces/tabs when trimming is enabled.
			if r.TrimLeadingSpace && start < end {
				for start < end && (buf[start] == ' ' || buf[start] == '\t') {
					start++
				}
			}
		}

		// Clamp the field span into [rowStart, rowEnd], then convert to
		// indices relative to rowStr so out-of-range spans yield "".
		if start < rowStart {
			start = rowStart
		}
		if end < start {
			end = start
		}
		relStart := int(start - rowStart)
		relEnd := int(end - rowStart)
		if relStart < 0 {
			relStart = 0
		}
		if relStart > rowStrLen {
			relStart = rowStrLen
		}
		if relEnd < relStart {
			relEnd = relStart
		}
		if relEnd > rowStrLen {
			relEnd = rowStrLen
		}

		// Substring of rowStr: no per-field allocation.
		record[i] = rowStr[relStart:relEnd]
		// Column is reported from the raw (pre-trim) field start, 1-based.
		r.state.fieldPositions[i] = position{line: row.lineNum, column: int(rawStart) + 1}
	}
	return record
}

simd_scanner.go

Lines changed: 41 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -80,8 +80,12 @@ type chunkMasks struct {
8080
// =============================================================================
8181

8282
// scanResultPoolCapacity is the pre-allocated capacity for pooled scanResult slices.
83-
// 512 chunks = ~32KB input, balancing small and large file performance.
84-
const scanResultPoolCapacity = 512
83+
// 4096 chunks = ~256KB input, reducing allocations for typical CSV sizes.
84+
const scanResultPoolCapacity = 4096
85+
86+
// scanResultLargeThreshold retains large scanResults to avoid repeated allocations across GCs.
87+
// 16384 chunks = ~1MB input.
88+
const scanResultLargeThreshold = 16384
8589

8690
// scanResultPool provides reusable scanResult objects to reduce allocations.
8791
var scanResultPool = sync.Pool{
@@ -96,6 +100,12 @@ var scanResultPool = sync.Pool{
96100
},
97101
}
98102

103+
// scanResultLargeCache retains a single large scanResult across GC cycles.
104+
var scanResultLargeCache struct {
105+
mu sync.Mutex
106+
sr *scanResult
107+
}
108+
99109
// reset clears the scanResult for reuse while preserving slice capacity.
100110
func (sr *scanResult) reset() {
101111
sr.quoteMasks = sr.quoteMasks[:0]
@@ -116,6 +126,15 @@ func (sr *scanResult) reset() {
116126
func releaseScanResult(sr *scanResult) {
117127
if sr != nil {
118128
sr.reset()
129+
if cap(sr.quoteMasks) >= scanResultLargeThreshold {
130+
scanResultLargeCache.mu.Lock()
131+
if scanResultLargeCache.sr == nil || cap(scanResultLargeCache.sr.quoteMasks) < cap(sr.quoteMasks) {
132+
scanResultLargeCache.sr = sr
133+
scanResultLargeCache.mu.Unlock()
134+
return
135+
}
136+
scanResultLargeCache.mu.Unlock()
137+
}
119138
scanResultPool.Put(sr)
120139
}
121140
}
@@ -125,27 +144,25 @@ func releaseScanResult(sr *scanResult) {
125144
// =============================================================================
126145

127146
// ensureUint64SliceCap returns s resized to required length, reusing the
// existing backing array when its capacity allows. Reused elements are NOT
// cleared; on reallocation an exact-size slice is made to avoid
// over-allocating for small inputs.
func ensureUint64SliceCap(s []uint64, required int) []uint64 {
	if cap(s) < required {
		// Fresh allocation; make zeroes the new elements.
		return make([]uint64, required)
	}
	return s[:required]
}
137155

138156
// ensureBoolSliceCap returns s resized to required length with every element
// cleared to false, reusing the existing backing array when capacity allows.
// On reallocation an exact-size slice is made to avoid over-allocating for
// small inputs.
func ensureBoolSliceCap(s []bool, required int) []bool {
	if cap(s) < required {
		// A fresh make is already zeroed; no explicit clear needed.
		return make([]bool, required)
	}
	s = s[:required]
	clear(s)
	return s
}
150167

151168
// =============================================================================
@@ -481,6 +498,20 @@ func scanBufferWithGenerator(buf []byte, gen maskGenerator) *scanResult {
481498

482499
// acquireScanResult gets a pooled scanResult and initializes it for the given chunk count.
483500
func acquireScanResult(chunkCount int) *scanResult {
501+
if chunkCount >= scanResultLargeThreshold {
502+
scanResultLargeCache.mu.Lock()
503+
result := scanResultLargeCache.sr
504+
if result != nil && cap(result.quoteMasks) >= chunkCount {
505+
scanResultLargeCache.sr = nil
506+
scanResultLargeCache.mu.Unlock()
507+
result.reset()
508+
result.chunkCount = chunkCount
509+
initScanResultSlices(result, chunkCount)
510+
return result
511+
}
512+
scanResultLargeCache.mu.Unlock()
513+
}
514+
484515
result := scanResultPool.Get().(*scanResult)
485516
result.reset()
486517
result.chunkCount = chunkCount

0 commit comments

Comments
 (0)