Skip to content

Commit 225e49c

Browse files
authored
refactor: update comments and constants for SIMD processing (#24)
1 parent 3822fa5 commit 225e49c

File tree

9 files changed

+152
-149
lines changed

9 files changed

+152
-149
lines changed

errors.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ var (
1414
ErrFieldCount = errors.New("wrong number of fields")
1515
)
1616

17-
// ParseError represents a parsing error with location information
17+
// ParseError represents a parsing error with location information.
1818
type ParseError struct {
1919
StartLine int // Record start line
2020
Line int // Error line

field_parser.go

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -59,9 +59,8 @@ func parseBuffer(buf []byte, sr *scanResult) *parseResult {
5959
}
6060

6161
// Initialize result with estimated capacities
62-
// Assume average field length of 10 bytes and row length of 50 bytes
63-
estimatedFields := len(buf) / 10
64-
estimatedRows := len(buf) / 50
62+
estimatedFields := len(buf) / avgFieldLenEstimate
63+
estimatedRows := len(buf) / avgRowLenEstimate
6564
result := newParseResult(estimatedFields, estimatedRows)
6665

6766
// Initialize state with lastSeparatorOrDelimiter = -1
@@ -74,7 +73,7 @@ func parseBuffer(buf []byte, sr *scanResult) *parseResult {
7473

7574
// Loop through all chunks, calling processChunkMasks for each
7675
for chunkIdx := 0; chunkIdx < sr.chunkCount; chunkIdx++ {
77-
offset := uint64(chunkIdx * 64)
76+
offset := uint64(chunkIdx * simdChunkSize)
7877
sepMask := sr.separatorMasks[chunkIdx]
7978
nlMask := sr.newlineMasks[chunkIdx]
8079

@@ -258,8 +257,8 @@ func finalizeLastField(buf []byte, state *parserState, result *parseResult, curr
258257
// unescapeDoubleQuotes collapses each escaped double-quote pair ("") into a single double-quote character (").
259258
// Dispatches to SIMD or scalar implementation based on CPU support and string size.
260259
func unescapeDoubleQuotes(s string) string {
261-
// Use SIMD for strings >= 32 bytes
262-
if useAVX512 && len(s) >= 32 {
260+
// Use SIMD for strings >= simdMinThreshold bytes
261+
if useAVX512 && len(s) >= simdMinThreshold {
263262
return unescapeDoubleQuotesSIMD(s)
264263
}
265264
return unescapeDoubleQuotesScalar(s)
@@ -379,7 +378,7 @@ func postProcessFields(buf []byte, result *parseResult, postProcChunks []int) {
379378

380379
// For each chunk that needs post-processing, find overlapping fields
381380
for _, chunkIdx := range postProcChunks {
382-
chunkStart := uint64(chunkIdx * 64)
381+
chunkStart := uint64(chunkIdx * simdChunkSize)
383382
chunkEnd := chunkStart + 64
384383

385384
// Search for fields that start within this chunk range

parse.go

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -3,20 +3,20 @@
33
package simdcsv
44

55
// ParseBytes parses a byte slice directly (zero-copy).
6-
// This function runs Stage 1 and Stage 2 processing and returns all records.
6+
// This function runs scanBuffer and parseBuffer processing and returns all records.
77
func ParseBytes(data []byte, comma rune) ([][]string, error) {
88
if len(data) == 0 {
99
return nil, nil
1010
}
1111

12-
// Stage 1: Structural analysis using SIMD (generates bitmasks)
12+
// Scan: Structural analysis using SIMD (generates bitmasks)
1313
separatorChar := byte(comma)
1414
sr := scanBuffer(data, separatorChar)
1515

16-
// Stage 2: Extract fields and rows from scan result
16+
// Parse: Extract fields and rows from scan result
1717
pr := parseBuffer(data, sr)
1818

19-
// Stage 3: Convert parseResult to [][]string
19+
// Build: Convert parseResult to [][]string
2020
return buildRecords(data, pr), nil
2121
}
2222

@@ -28,18 +28,18 @@ func ParseBytesStreaming(data []byte, comma rune, callback func([]string) error)
2828
return nil
2929
}
3030

31-
// Stage 1: Structural analysis using SIMD (generates bitmasks)
31+
// Scan: Structural analysis using SIMD (generates bitmasks)
3232
separatorChar := byte(comma)
3333
sr := scanBuffer(data, separatorChar)
3434

35-
// Stage 2: Extract fields and rows from scan result
35+
// Parse: Extract fields and rows from scan result
3636
pr := parseBuffer(data, sr)
3737

3838
if pr == nil || len(pr.rows) == 0 {
3939
return nil
4040
}
4141

42-
// Stage 3: Invoke callback for each record
42+
// Build: Invoke callback for each record
4343
for _, row := range pr.rows {
4444
record := buildRecord(data, pr, row)
4545
if err := callback(record); err != nil {

quote.go

Lines changed: 15 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -46,8 +46,8 @@ func isQuotedFieldStart(data []byte, trimLeadingSpace bool) (bool, int) {
4646
// Dispatches to SIMD or scalar implementation based on CPU support and data size.
4747
func findClosingQuote(data []byte, startAfterOpenQuote int) int {
4848
remaining := len(data) - startAfterOpenQuote
49-
// Use SIMD for data >= 32 bytes, otherwise scalar is faster
50-
if useAVX512 && remaining >= 32 {
49+
// Use SIMD for data >= simdMinThreshold bytes, otherwise scalar is faster
50+
if useAVX512 && remaining >= simdMinThreshold {
5151
return findClosingQuoteSIMD(data, startAfterOpenQuote)
5252
}
5353
return findClosingQuoteScalar(data, startAfterOpenQuote)
@@ -72,14 +72,14 @@ func findClosingQuoteScalar(data []byte, startAfterOpenQuote int) int {
7272
}
7373

7474
// findClosingQuoteSIMD uses SIMD to find the closing quote.
75-
// It searches for quote characters in 32-byte chunks using AVX-512.
75+
// It searches for quote characters in simdHalfChunk-byte chunks using AVX-512.
7676
func findClosingQuoteSIMD(data []byte, startAfterOpenQuote int) int {
7777
quoteCmp := archsimd.BroadcastInt8x32('"')
7878
i := startAfterOpenQuote
7979

80-
// Process 32-byte chunks
81-
for i+32 <= len(data) {
82-
chunk := archsimd.LoadInt8x32((*[32]int8)(unsafe.Pointer(&data[i])))
80+
// Process simdHalfChunk-byte chunks
81+
for i+simdHalfChunk <= len(data) {
82+
chunk := archsimd.LoadInt8x32((*[simdHalfChunk]int8)(unsafe.Pointer(&data[i])))
8383
mask := chunk.Equal(quoteCmp).ToBits()
8484

8585
if mask != 0 {
@@ -94,12 +94,15 @@ func findClosingQuoteSIMD(data []byte, startAfterOpenQuote int) int {
9494
// This is an escaped quote, skip both quotes
9595
// Clear this bit and the next (if in same chunk)
9696
mask &= ^(uint32(1) << pos)
97-
if pos+1 < 32 {
97+
if pos+1 < simdHalfChunk {
9898
mask &= ^(uint32(1) << (pos + 1))
9999
}
100-
// If next quote is in the next chunk, we need to skip it
101-
if pos == 31 {
102-
i += 32
100+
// If next quote is in the next chunk, we need to skip it.
101+
// Using goto here for performance: it allows us to skip the normal
102+
// i += simdHalfChunk increment and immediately continue with the
103+
// already-adjusted i value after handling boundary double quotes.
104+
if pos == simdHalfChunk-1 {
105+
i += simdHalfChunk
103106
// Skip the first quote of the next iteration
104107
if i < len(data) && data[i] == '"' {
105108
i++
@@ -112,7 +115,7 @@ func findClosingQuoteSIMD(data []byte, startAfterOpenQuote int) int {
112115
return absPos
113116
}
114117
}
115-
i += 32
118+
i += simdHalfChunk
116119
continueLoop:
117120
}
118121

@@ -127,6 +130,5 @@ func extractQuotedContent(data []byte, closingQuoteIdx int) string {
127130
if closingQuoteIdx <= 1 {
128131
return ""
129132
}
130-
content := string(data[1:closingQuoteIdx])
131-
return content
133+
return string(data[1:closingQuoteIdx])
132134
}

reader.go

Lines changed: 51 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -59,16 +59,20 @@ type Reader struct {
5959
lastRecord []string
6060

6161
// SIMD processing state
62-
scanResult *scanResult // Scan result (structural character masks)
63-
parseResult *parseResult // Parse result (extracted fields/rows)
64-
currentRecordIndex int // Current record index in parseResult.rows
65-
initialized bool // Whether scan/parse have been run
62+
scanResult *scanResult // Scan result (structural character masks)
63+
parseResult *parseResult // Parse result (extracted fields/rows)
64+
currentRecordIndex int // Current record index in parseResult.rows
65+
nonCommentRecordCount int // Count of non-comment records returned (for O(1) first record detection)
66+
initialized bool // Whether scan/parse have been run
6667

6768
// Extended options (set via NewReaderWithOptions)
68-
skipBOM bool // Skip UTF-8 BOM if present
69-
bufferSize int // Buffer size hint (reserved for future use)
70-
chunkSize int // Chunk size hint (reserved for future use)
71-
zeroCopy bool // Zero-copy mode hint (reserved for future use)
69+
skipBOM bool // Skip UTF-8 BOM if present
70+
71+
// Reserved fields for future streaming/chunked processing implementation.
72+
// These fields are accepted by NewReaderWithOptions but currently have no effect.
73+
bufferSize int // Buffer size hint (not yet implemented)
74+
chunkSize int // Chunk size hint (not yet implemented)
75+
zeroCopy bool // Zero-copy mode hint (not yet implemented)
7276
}
7377

7478
// position represents a position in the input.
@@ -95,7 +99,7 @@ func NewReader(r io.Reader) *Reader {
9599
// If ReuseRecord is true, the returned slice may be shared
96100
// between multiple calls to Read.
97101
func (r *Reader) Read() (record []string, err error) {
98-
// Initialize on first call: read all input and run Stage 1 + Stage 2
102+
// Initialize on first call: read all input and run scanBuffer + parseBuffer
99103
if !r.initialized {
100104
if err := r.initialize(); err != nil {
101105
return nil, err
@@ -108,18 +112,19 @@ func (r *Reader) Read() (record []string, err error) {
108112
return nil, io.EOF
109113
}
110114

111-
// Get current row info
112-
rowInfo := r.parseResult.rows[r.currentRecordIndex]
115+
// Get current row info and index
116+
rowIdx := r.currentRecordIndex
117+
rowInfo := r.parseResult.rows[rowIdx]
113118
r.currentRecordIndex++
114119

115120
// Check for comment line (line starting with Comment character)
116-
if r.Comment != 0 && r.isCommentLine(rowInfo) {
121+
if r.Comment != 0 && r.isCommentLine(rowInfo, rowIdx) {
117122
// Skip this line and continue to next
118123
continue
119124
}
120125

121126
// Build record from fields with validation
122-
record, err = r.buildRecordWithValidation(rowInfo)
127+
record, err = r.buildRecordWithValidation(rowInfo, rowIdx)
123128
if err != nil {
124129
return record, err
125130
}
@@ -152,24 +157,19 @@ func (r *Reader) Read() (record []string, err error) {
152157
}
153158
// If FieldsPerRecord < 0, no check is performed
154159

160+
r.nonCommentRecordCount++
155161
return record, nil
156162
}
157163
}
158164

159-
// isFirstNonCommentRecord checks if this is the first non-comment record being returned
165+
// isFirstNonCommentRecord checks if this is the first non-comment record being returned.
166+
// Uses O(1) counter instead of O(n) re-scanning.
160167
func (r *Reader) isFirstNonCommentRecord() bool {
161-
// Count how many non-comment records we've processed
162-
nonCommentCount := 0
163-
for i := 0; i < r.currentRecordIndex; i++ {
164-
if i < len(r.parseResult.rows) && !r.isCommentLine(r.parseResult.rows[i]) {
165-
nonCommentCount++
166-
}
167-
}
168-
return nonCommentCount == 1
168+
return r.nonCommentRecordCount == 0
169169
}
170170

171171
// isCommentLine checks if a row is a comment line
172-
func (r *Reader) isCommentLine(row rowInfo) bool {
172+
func (r *Reader) isCommentLine(row rowInfo, rowIdx int) bool {
173173
if r.Comment == 0 || row.fieldCount == 0 {
174174
return false
175175
}
@@ -186,15 +186,16 @@ func (r *Reader) isCommentLine(row rowInfo) bool {
186186
return false
187187
}
188188
// Get the raw start position (the original field start in rawBuffer)
189-
rawStart := r.getRawFieldStart(row, firstFieldIdx)
189+
rawStart := r.getRawFieldStart(row, rowIdx, firstFieldIdx)
190190
if rawStart < uint64(len(r.rawBuffer)) {
191191
return r.rawBuffer[rawStart] == byte(r.Comment)
192192
}
193193
return false
194194
}
195195

196-
// getRawFieldStart gets the original field start position before quote adjustment
197-
func (r *Reader) getRawFieldStart(row rowInfo, fieldIdx int) uint64 {
196+
// getRawFieldStart gets the original field start position before quote adjustment.
197+
// Uses O(1) lookup with rowIdx instead of O(n) search.
198+
func (r *Reader) getRawFieldStart(row rowInfo, rowIdx, fieldIdx int) uint64 {
198199
// For the first field of a row, we need to find the actual start
199200
// which is either:
200201
// - 0 for the first row
@@ -208,15 +209,8 @@ func (r *Reader) getRawFieldStart(row rowInfo, fieldIdx int) uint64 {
208209
// If quoteAdjust was applied, start is field.start - 1
209210
// But for comment detection, we need the actual line start
210211
// We can find it by looking at the previous row's end position
211-
prevRowIdx := -1
212-
for i, r := range r.parseResult.rows {
213-
if r.firstField == row.firstField {
214-
prevRowIdx = i - 1
215-
break
216-
}
217-
}
218-
if prevRowIdx >= 0 {
219-
prevRow := r.parseResult.rows[prevRowIdx]
212+
if rowIdx > 0 {
213+
prevRow := r.parseResult.rows[rowIdx-1]
220214
lastFieldIdx := prevRow.firstField + prevRow.fieldCount - 1
221215
if lastFieldIdx >= 0 && lastFieldIdx < len(r.parseResult.fields) {
222216
lastField := r.parseResult.fields[lastFieldIdx]
@@ -227,7 +221,7 @@ func (r *Reader) getRawFieldStart(row rowInfo, fieldIdx int) uint64 {
227221
return field.start
228222
}
229223

230-
// initialize reads all input and runs Stage 1 and Stage 2 processing.
224+
// initialize reads all input and runs scanBuffer and parseBuffer processing.
231225
func (r *Reader) initialize() error {
232226
r.initialized = true
233227

@@ -269,7 +263,7 @@ func (r *Reader) initialize() error {
269263
}
270264

271265
// buildRecordWithValidation constructs a []string record from a rowInfo with quote validation
272-
func (r *Reader) buildRecordWithValidation(row rowInfo) ([]string, error) {
266+
func (r *Reader) buildRecordWithValidation(row rowInfo, rowIdx int) ([]string, error) {
273267
fieldCount := row.fieldCount
274268
record := r.allocateRecord(fieldCount)
275269

@@ -283,7 +277,7 @@ func (r *Reader) buildRecordWithValidation(row rowInfo) ([]string, error) {
283277
field := r.parseResult.fields[fieldIdx]
284278

285279
// Get raw field data for validation
286-
rawStart, rawEnd := r.getFieldRawBounds(row, fieldIdx, i)
280+
rawStart, rawEnd := r.getFieldRawBounds(row, rowIdx, fieldIdx, i)
287281

288282
// Validate quotes unless LazyQuotes is enabled
289283
if !r.LazyQuotes {
@@ -306,7 +300,7 @@ func (r *Reader) buildRecordWithValidation(row rowInfo) ([]string, error) {
306300
}
307301

308302
// getFieldRawBounds returns the raw start and end positions for a field in the buffer
309-
func (r *Reader) getFieldRawBounds(row rowInfo, fieldIdx, fieldNum int) (uint64, uint64) {
303+
func (r *Reader) getFieldRawBounds(row rowInfo, rowIdx, fieldIdx, fieldNum int) (uint64, uint64) {
310304
field := r.parseResult.fields[fieldIdx]
311305

312306
// Calculate raw start (before any quote adjustment)
@@ -317,7 +311,7 @@ func (r *Reader) getFieldRawBounds(row rowInfo, fieldIdx, fieldNum int) (uint64,
317311
rawStart = 0
318312
} else {
319313
// Find the position after the previous newline
320-
rawStart = r.findLineStart(row)
314+
rawStart = r.findLineStart(rowIdx)
321315
}
322316
} else {
323317
// For non-first fields, find the position after the previous separator
@@ -384,17 +378,9 @@ func (r *Reader) findRawFieldEnd(start uint64, isLastField bool) uint64 {
384378
return bufLen
385379
}
386380

387-
// findLineStart finds the start position of a line
388-
func (r *Reader) findLineStart(row rowInfo) uint64 {
389-
// Find the row index
390-
rowIdx := -1
391-
for i, ri := range r.parseResult.rows {
392-
if ri.firstField == row.firstField && ri.lineNum == row.lineNum {
393-
rowIdx = i
394-
break
395-
}
396-
}
397-
381+
// findLineStart finds the start position of a line.
382+
// Uses O(1) lookup with rowIdx instead of O(n) search.
383+
func (r *Reader) findLineStart(rowIdx int) uint64 {
398384
if rowIdx <= 0 {
399385
return 0
400386
}
@@ -517,10 +503,20 @@ func (r *Reader) InputOffset() int64 {
517503

518504
// ReaderOptions contains extended configuration options for [Reader].
519505
type ReaderOptions struct {
520-
BufferSize int // BufferSize specifies the internal buffer size in bytes. Default is 64KB.
521-
ChunkSize int // Parallel processing chunk size
522-
ZeroCopy bool // Zero-copy optimization (default: false)
523-
SkipBOM bool // Skip UTF-8 BOM (default: false)
506+
// SkipBOM skips UTF-8 BOM (EF BB BF) at the beginning of input if present.
507+
SkipBOM bool
508+
509+
// BufferSize specifies the internal buffer size hint in bytes.
510+
// NOTE: Not yet implemented; reserved for future streaming support.
511+
BufferSize int
512+
513+
// ChunkSize specifies the parallel processing chunk size.
514+
// NOTE: Not yet implemented; reserved for future streaming support.
515+
ChunkSize int
516+
517+
// ZeroCopy enables zero-copy optimization.
518+
// NOTE: Not yet implemented; reserved for future optimization.
519+
ZeroCopy bool
524520
}
525521

526522
// NewReaderWithOptions creates a Reader with extended options.

0 commit comments

Comments
 (0)