Skip to content

Commit a0aa03c

Browse files
authored
refactor: enhance record parsing efficiency by introducing no-quote path and optimizing memory usage (#63)
1 parent 7303bc1 commit a0aa03c

File tree

6 files changed

+81
-16
lines changed

6 files changed

+81
-16
lines changed

parse.go

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33
//nolint:gosec // G115: Integer conversions are safe - buffer size bounded by DefaultMaxInputSize (2GB)
44
package simdcsv
55

6+
import "unsafe"
7+
68
// ============================================================================
79
// Public API - Direct Parsing
810
// ============================================================================
@@ -65,12 +67,12 @@ func buildRecords(buf []byte, pr *parseResult, hasCR bool) [][]string {
6567

6668
records := make([][]string, len(pr.rows))
6769

68-
// Shared buffers reused across records
69-
var recordBuf []byte
70+
// fieldEnds can be reused, but recordBuf must be unique per record for unsafe.String
7071
var fieldEnds []int
7172

7273
for i, row := range pr.rows {
73-
recordBuf, fieldEnds = accumulateFields(buf, pr, row, hasCR, recordBuf[:0], fieldEnds[:0])
74+
var recordBuf []byte
75+
recordBuf, fieldEnds = accumulateFields(buf, pr, row, hasCR, recordBuf, fieldEnds[:0])
7476
records[i] = sliceFieldsFromBuffer(recordBuf, fieldEnds)
7577
}
7678
return records
@@ -97,9 +99,13 @@ func accumulateFields(buf []byte, pr *parseResult, row rowInfo, hasCR bool, reco
9799
}
98100

99101
// sliceFieldsFromBuffer converts the accumulated buffer to individual field strings.
100-
// Uses a single string conversion followed by zero-copy slicing.
102+
// Uses unsafe.String for zero-copy conversion: the returned strings alias recordBuf's memory, so the caller must not modify or reuse recordBuf afterwards.
101103
func sliceFieldsFromBuffer(recordBuf []byte, fieldEnds []int) []string {
102-
str := string(recordBuf)
104+
if len(recordBuf) == 0 {
105+
return make([]string, len(fieldEnds))
106+
}
107+
// Zero-copy string conversion - safe because recordBuf is unique per record
108+
str := unsafe.String(unsafe.SliceData(recordBuf), len(recordBuf))
103109
record := make([]string, len(fieldEnds))
104110
prevEnd := 0
105111
for i, end := range fieldEnds {

quote.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@ package simdcsv
44

55
import (
66
"math/bits"
7-
"unsafe"
87

98
"simd/archsimd"
109
)
@@ -99,10 +98,11 @@ func findClosingQuoteScalar(data []byte, startAfterOpenQuote int) int {
9998
// findClosingQuoteSIMD uses AVX-512 to find the closing quote in simdHalfChunk-byte chunks.
10099
func findClosingQuoteSIMD(data []byte, startAfterOpenQuote int) int {
101100
quoteCmp := archsimd.BroadcastInt8x32('"')
101+
int8Data := bytesToInt8Slice(data)
102102
i := startAfterOpenQuote
103103

104104
for i+simdHalfChunk <= len(data) {
105-
chunk := archsimd.LoadInt8x32((*[simdHalfChunk]int8)(unsafe.Pointer(&data[i])))
105+
chunk := archsimd.LoadInt8x32Slice(int8Data[i : i+simdHalfChunk])
106106
mask := chunk.Equal(quoteCmp).ToBits()
107107

108108
if mask == 0 {

reader.go

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -252,6 +252,16 @@ func (r *Reader) readNextRecord() ([]string, error) {
252252
continue
253253
}
254254

255+
// Fast path: no quotes anywhere, so no unescape/validation needed.
256+
if !r.state.hasQuotes {
257+
record := r.buildRecordNoQuotes(rowInfo)
258+
if err := r.validateFieldCount(record, rowInfo); err != nil {
259+
return record, err
260+
}
261+
r.state.nonCommentRecordCount++
262+
return record, nil
263+
}
264+
255265
record, err := r.buildRecordWithValidation(rowInfo, rowIdx)
256266
if err != nil {
257267
return record, err

record_builder.go

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,40 @@ func (r *Reader) buildRecordWithValidation(row rowInfo, rowIdx int) ([]string, e
4545
return r.buildFinalRecord(fieldCount), nil
4646
}
4747

48+
// buildRecordNoQuotes builds a record when the input contains no quotes.
// Each field is taken as the byte range [start, start+length) of r.state.rawBuffer,
// clamped to the buffer length, and copied into its own string; no unescaping or
// quote validation is performed.
49+
// It avoids the recordBuffer copy path and mirrors appendSimpleContent behavior.
//
// Parameters:
//   - row: supplies the field count (row.fieldCount), the line number recorded in
//     field positions (row.lineNum), and is passed to getFieldsForRow to locate the fields.
//
// Returns the slice obtained from r.allocateRecord(fieldCount), filled in for every
// field yielded by getFieldsForRow.
//
// Side effects: replaces r.state.fieldPositions with the result of
// ensureFieldPositionsCapacity and writes one position per processed field.
50+
func (r *Reader) buildRecordNoQuotes(row rowInfo) []string {
51+
fieldCount := row.fieldCount
52+
record := r.allocateRecord(fieldCount)
53+
r.state.fieldPositions = r.ensureFieldPositionsCapacity(fieldCount)
54+
55+
fields := r.getFieldsForRow(row, fieldCount)
56+
buf := r.state.rawBuffer
57+
// Offsets are compared in uint32 to match the type of field.start / field.length.
bufLen := uint32(len(buf))
58+
59+
for i, field := range fields {
60+
start := field.start
61+
// NOTE(review): this sum is computed in uint32 with no wraparound check;
// presumably safe because the input size is bounded elsewhere — confirm.
end := start + field.length
62+
// A field that starts at or beyond the end of the buffer yields an empty string
// but still gets a position entry.
if start >= bufLen {
63+
record[i] = ""
64+
r.state.fieldPositions[i] = position{line: row.lineNum, column: int(start) + 1}
65+
continue
66+
}
67+
// Clamp a field that would run past the buffer instead of panicking on the slice below.
if end > bufLen {
68+
end = bufLen
69+
}
70+
71+
content := buf[start:end]
72+
if r.TrimLeadingSpace {
73+
// presumably strips leading whitespace from the field; trimLeftBytes is defined elsewhere — verify
content = trimLeftBytes(content)
74+
}
75+
76+
// string(content) copies the bytes, so the record does not alias rawBuffer.
record[i] = string(content)
77+
// The column is the field's untrimmed start offset in rawBuffer plus one.
// NOTE(review): this is a buffer offset, not an offset within the line — confirm it
// matches what appendSimpleContent reports.
r.state.fieldPositions[i] = position{line: row.lineNum, column: int(start) + 1}
78+
}
79+
return record
80+
}
81+
4882
// getFieldsForRow extracts the slice of fieldInfo for the given row.
4983
func (r *Reader) getFieldsForRow(row rowInfo, fieldCount int) []fieldInfo {
5084
endIdx := row.firstField + fieldCount

simd_scanner.go

Lines changed: 18 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,15 @@ import (
99
"unsafe"
1010
)
1111

12+
// bytesToInt8Slice converts a byte slice to an int8 slice without copying.
// The result has the same length as b and is built with unsafe.Slice over b's
// backing array, so it aliases b: a write through either slice is visible
// through the other. An empty or nil b yields nil.
13+
// This enables use of LoadInt8xNSlice functions which are safer than pointer casts.
14+
func bytesToInt8Slice(b []byte) []int8 {
15+
if len(b) == 0 {
16+
return nil // nothing to view, so skip the unsafe conversion entirely
17+
}
18+
// byte and int8 have the same size, so the data pointer can be retyped and re-sliced to len(b).
return unsafe.Slice((*int8)(unsafe.Pointer(unsafe.SliceData(b))), len(b))
19+
}
20+
1221
// useAVX512 indicates whether AVX-512 instructions are available at runtime.
1322
var useAVX512 bool
1423

@@ -177,7 +186,7 @@ func generateMasksAVX512(data []byte, separator byte) (quote, sep, cr, nl uint64
177186

178187
// generateMasksAVX512WithCmp generates masks reusing pre-broadcasted comparators.
179188
func generateMasksAVX512WithCmp(data []byte, quoteCmp, sepCmp, crCmp, nlCmp archsimd.Int8x64) (quote, sep, cr, nl uint64) {
180-
chunk := archsimd.LoadInt8x64((*[simdChunkSize]int8)(unsafe.Pointer(&data[0])))
189+
chunk := archsimd.LoadInt8x64Slice(bytesToInt8Slice(data))
181190
return chunk.Equal(quoteCmp).ToBits(),
182191
chunk.Equal(sepCmp).ToBits(),
183192
chunk.Equal(crCmp).ToBits(),
@@ -222,17 +231,21 @@ func generateMasksPadded(data []byte, separator byte) (quote, sep, cr, nl uint64
222231
}
223232

224233
// generateMasksPaddedWithCmp is the AVX-512 version of generateMasksPadded.
234+
// Uses LoadInt8x64SlicePart to safely load partial chunks without manual padding.
225235
func generateMasksPaddedWithCmp(data []byte, quoteCmp, sepCmp, crCmp, nlCmp archsimd.Int8x64) (quote, sep, cr, nl uint64, validBits int) {
226236
validBits = len(data)
227237
if validBits == 0 {
228238
return 0, 0, 0, 0, 0
229239
}
230240

231-
var padded [simdChunkSize]byte
232-
copy(padded[:], data)
233-
234-
quote, sep, cr, nl = generateMasksAVX512WithCmp(padded[:], quoteCmp, sepCmp, crCmp, nlCmp)
241+
// SlicePart safely loads partial data, zero-filling unused lanes
242+
chunk := archsimd.LoadInt8x64SlicePart(bytesToInt8Slice(data))
243+
quote = chunk.Equal(quoteCmp).ToBits()
244+
sep = chunk.Equal(sepCmp).ToBits()
245+
cr = chunk.Equal(crCmp).ToBits()
246+
nl = chunk.Equal(nlCmp).ToBits()
235247

248+
// Mask out bits beyond valid data
236249
if validBits < simdChunkSize {
237250
mask := (uint64(1) << validBits) - 1
238251
quote &= mask

writer.go

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -125,6 +125,7 @@ func (w *Writer) fieldNeedsQuotesScalar(field string) bool {
125125
// fieldNeedsQuotesSIMD uses SIMD to detect special characters requiring quoting.
126126
func (w *Writer) fieldNeedsQuotesSIMD(field string) bool {
127127
data := unsafe.Slice(unsafe.StringData(field), len(field))
128+
int8Data := bytesToInt8Slice(data)
128129

129130
commaCmp := archsimd.BroadcastInt8x32(int8(w.Comma))
130131
newlineCmp := archsimd.BroadcastInt8x32('\n')
@@ -134,7 +135,7 @@ func (w *Writer) fieldNeedsQuotesSIMD(field string) bool {
134135
// Process 32-byte chunks
135136
offset := 0
136137
for offset+32 <= len(data) {
137-
chunk := archsimd.LoadInt8x32((*[32]int8)(unsafe.Pointer(&data[offset])))
138+
chunk := archsimd.LoadInt8x32Slice(int8Data[offset : offset+32])
138139

139140
commaMask := chunk.Equal(commaCmp).ToBits()
140141
newlineMask := chunk.Equal(newlineCmp).ToBits()
@@ -147,7 +148,7 @@ func (w *Writer) fieldNeedsQuotesSIMD(field string) bool {
147148
offset += 32
148149
}
149150

150-
// Process remaining bytes
151+
// Process remaining bytes (< 32 bytes, scalar is sufficient)
151152
for ; offset < len(data); offset++ {
152153
c := data[offset]
153154
if c == byte(w.Comma) || c == '\n' || c == '\r' || c == '"' {
@@ -187,14 +188,15 @@ func (w *Writer) writeQuotedFieldScalar(field string) error {
187188
// writeQuotedFieldSIMD escapes quotes using SIMD to find quote positions.
188189
func (w *Writer) writeQuotedFieldSIMD(field string) error {
189190
data := unsafe.Slice(unsafe.StringData(field), len(field))
191+
int8Data := bytesToInt8Slice(data)
190192
quoteCmp := archsimd.BroadcastInt8x32('"')
191193

192194
offset := 0
193195
lastWritten := 0
194196

195197
// Process 32-byte chunks
196198
for offset+32 <= len(data) {
197-
chunk := archsimd.LoadInt8x32((*[32]int8)(unsafe.Pointer(&data[offset])))
199+
chunk := archsimd.LoadInt8x32Slice(int8Data[offset : offset+32])
198200
mask := chunk.Equal(quoteCmp).ToBits()
199201

200202
for mask != 0 {
@@ -215,7 +217,7 @@ func (w *Writer) writeQuotedFieldSIMD(field string) error {
215217
offset += 32
216218
}
217219

218-
// Process remaining bytes
220+
// Process remaining bytes (< 32 bytes, scalar is sufficient)
219221
for ; offset < len(data); offset++ {
220222
if data[offset] == '"' {
221223
if _, err := w.w.WriteString(field[lastWritten : offset+1]); err != nil {

0 commit comments

Comments
 (0)