go-simdcsv/parse.go at main · nnnkkk7/go-simdcsv · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
//go:build goexperiment.simd && amd64

//nolint:gosec // G115: Integer conversions are safe - buffer size bounded by DefaultMaxInputSize (2GB)
package simdcsv

import "unsafe"

// ============================================================================
// Public API - Direct Parsing
// ============================================================================

// ParseBytes parses the CSV document held in data and returns all of its
// records.
//
// comma is the field separator. It is converted with byte(comma), so only
// single-byte separators are represented faithfully; a rune above 0xFF is
// silently truncated to its low byte.
//
// Empty (or nil) data yields (nil, nil). The error result is always nil in the
// current implementation.
//
// Zero-copy: when sr.hasCR is false and no parsed field reports
// needsUnescape, the returned strings alias data rather than copying it (see
// buildRecordZeroCopy). Callers must not modify data while the returned
// records are still in use.
func ParseBytes(data []byte, comma rune) ([][]string, error) {
	if len(data) == 0 {
		return nil, nil
	}

	sr := scanBuffer(data, byte(comma))
	pr := parseBuffer(data, sr)
	// Release through defer so both results are released even if record
	// building panics, matching ParseBytesStreaming. Deferred calls run
	// last-in-first-out, so pr is still released before sr.
	defer sr.release()
	defer pr.release()

	return buildRecords(data, pr, sr.hasCR), nil
}

// ParseBytesStreaming parses the CSV document in data and hands each record to
// callback, one row at a time and in row order.
//
// comma is the field separator (converted with byte(comma)). Empty input, or a
// parse that produces no rows, returns nil without invoking callback. The
// first non-nil error returned by callback stops the iteration and is returned
// unchanged.
func ParseBytesStreaming(data []byte, comma rune, callback func([]string) error) error {
	if len(data) == 0 {
		return nil
	}

	sr := scanBuffer(data, byte(comma))
	pr := parseBuffer(data, sr)
	defer pr.release()
	defer sr.release()

	if pr == nil {
		return nil
	}

	// An empty rows slice simply skips the loop.
	rows := pr.rows
	for i := range rows {
		if err := callback(buildRecord(data, pr, rows[i], sr.hasCR)); err != nil {
			return err
		}
	}
	return nil
}

// ============================================================================
// Internal - Record Building (for direct API)
// ============================================================================

// buildRecords turns every row of pr into a []string and returns them in row
// order. A nil pr or a pr without rows yields nil.
//
// Two strategies are used:
//   - zero-copy: when hasCR is false and no field reports needsUnescape, each
//     record is built by buildRecordZeroCopy and its strings alias buf;
//   - copying: otherwise each row is accumulated into its own byte buffer via
//     accumulateFields and sliced into strings by sliceFieldsFromBuffer.
func buildRecords(buf []byte, pr *parseResult, hasCR bool) [][]string {
	if pr == nil {
		return nil
	}
	rowCount := len(pr.rows)
	if rowCount == 0 {
		return nil
	}

	// The zero-copy path is only valid if nothing has to be rewritten.
	zeroCopy := !hasCR
	for i := 0; zeroCopy && i < len(pr.fields); i++ {
		if pr.fields[i].needsUnescape() {
			zeroCopy = false
		}
	}

	out := make([][]string, rowCount)

	if zeroCopy {
		for i := range out {
			out[i] = buildRecordZeroCopy(buf, pr, pr.rows[i])
		}
		return out
	}

	// Copying path: the offsets slice is recycled between rows, but every row
	// gets a fresh byte buffer because the record's strings point into it.
	var ends []int
	for i := range out {
		var rowBuf []byte
		rowBuf, ends = accumulateFields(buf, pr, pr.rows[i], hasCR, nil, ends[:0])
		out[i] = sliceFieldsFromBuffer(rowBuf, ends)
	}
	return out
}

// buildRecordZeroCopy builds the record for row using strings that point
// directly into buf (no copying).
//
// Safety: only call this when no field needs transformation (unescape/CRLF).
// The returned strings reference buf, so buf must outlive the record and must
// not be modified while the record is in use.
//
// The result has row.fieldCount entries (nil when fieldCount <= 0). Entries
// stay "" for fields that are empty, that start at or beyond the end of buf,
// or whose index lies outside pr.fields. A field that extends past the end of
// buf is truncated at the end of buf.
func buildRecordZeroCopy(buf []byte, pr *parseResult, row rowInfo) []string {
	if row.fieldCount <= 0 {
		return nil
	}

	record := make([]string, row.fieldCount)

	// Clamp the row's field range to pr.fields. A start index outside the
	// slice leaves every entry empty instead of panicking on the slice
	// expression below.
	first := row.firstField
	if first < 0 || first >= len(pr.fields) {
		return record
	}
	last := first + row.fieldCount
	if last > len(pr.fields) {
		last = len(pr.fields)
	}

	bufLen := uint32(len(buf))
	for i, field := range pr.fields[first:last] {
		if field.length == 0 || field.start >= bufLen {
			continue
		}
		// Clamp against the bytes remaining after start rather than computing
		// start+length: that sum is uint32 arithmetic and can wrap around,
		// which would hand unsafe.String an out-of-bounds length.
		n := field.length
		if remaining := bufLen - field.start; n > remaining {
			n = remaining
		}
		record[i] = unsafe.String(&buf[field.start], int(n))
	}
	return record
}

// buildRecord materializes one row as a []string (used by the streaming API).
// Field bytes are appended to a buffer that starts out nil for every call, so
// the returned strings never alias buf.
func buildRecord(buf []byte, pr *parseResult, row rowInfo, hasCR bool) []string {
	return sliceFieldsFromBuffer(accumulateFields(buf, pr, row, hasCR, nil, nil))
}

// accumulateFields appends the content of each field in row to recordBuf, in
// field order, and records in fieldEnds the length of recordBuf after each
// field. Field indexes at or beyond len(pr.fields) are skipped, so fewer than
// row.fieldCount entries may be produced.
//
// It returns the grown recordBuf and fieldEnds; both may be reallocated by
// append, so callers must use the returned values.
func accumulateFields(buf []byte, pr *parseResult, row rowInfo, hasCR bool, recordBuf []byte, fieldEnds []int) ([]byte, []int) {
	// Exclusive upper bound of the row's fields, capped at what pr holds.
	last := row.firstField + row.fieldCount
	if available := len(pr.fields); last > available {
		last = available
	}

	for idx := row.firstField; idx < last; idx++ {
		recordBuf = appendFieldContent(buf, pr.fields[idx], recordBuf, hasCR)
		fieldEnds = append(fieldEnds, len(recordBuf))
	}
	return recordBuf, fieldEnds
}

// sliceFieldsFromBuffer splits recordBuf into one string per entry of
// fieldEnds. fieldEnds[i] is the exclusive end offset of field i in recordBuf;
// field i starts where field i-1 ended (field 0 starts at offset 0).
//
// The strings are created with unsafe.String and therefore alias recordBuf:
// the caller must not modify or reuse recordBuf afterwards.
//
// It returns nil when fieldEnds is empty. Fields with no bytes are "". An end
// offset beyond len(recordBuf) is truncated to len(recordBuf), so a returned
// string never extends past the buffer.
func sliceFieldsFromBuffer(recordBuf []byte, fieldEnds []int) []string {
	if len(fieldEnds) == 0 {
		return nil
	}
	record := make([]string, len(fieldEnds))
	bufLen := len(recordBuf)
	if bufLen == 0 {
		return record
	}

	start := 0
	for i, end := range fieldEnds {
		// Clamp the end offset: unsafe.String performs no bounds check, so an
		// oversized end would expose memory beyond recordBuf.
		stop := end
		if stop > bufLen {
			stop = bufLen
		}
		// start < stop <= bufLen also guarantees start is a valid index.
		if start < stop {
			record[i] = unsafe.String(&recordBuf[start], stop-start)
		}
		start = end
	}
	return record
}

// ============================================================================
// Internal - Field Content Extraction
// ============================================================================

// appendFieldContent appends the content of field (taken from buf) to
// recordBuf and returns the extended slice.
//
// Policy: the bytes are rewritten through transformContent when the field
// reports needsUnescape, or when hasCR is set and containsCRLFBytes reports
// true for the content; otherwise they are appended verbatim. A field for
// which extractFieldBytes returns nil leaves recordBuf unchanged.
func appendFieldContent(buf []byte, field fieldInfo, recordBuf []byte, hasCR bool) []byte {
	raw := extractFieldBytes(buf, field)
	if raw == nil {
		return recordBuf
	}

	if field.needsUnescape() || (hasCR && containsCRLFBytes(raw)) {
		return transformContent(raw, recordBuf)
	}
	return append(recordBuf, raw...)
}

// extractFieldBytes returns the raw bytes of field as a sub-slice of buf (no
// copy, no transformation).
//
// It returns nil when the field is empty or starts at or beyond the end of
// buf. A field that extends past the end of buf is truncated at the end of
// buf.
func extractFieldBytes(buf []byte, field fieldInfo) []byte {
	if field.length == 0 {
		return nil
	}

	bufLen := uint32(len(buf))
	start := field.start
	if start >= bufLen {
		return nil
	}

	// Clamp against the bytes remaining after start instead of computing
	// start+length first: that sum is uint32 arithmetic and can wrap around,
	// giving an end below start and a slice-bounds panic.
	length := field.length
	if remaining := bufLen - start; length > remaining {
		length = remaining
	}
	return buf[start : start+length]
}

// transformContent appends content to dst while collapsing two kinds of byte
// pairs: a doubled quote `""` becomes a single `"`, and "\r\n" becomes "\n".
// Every other byte (including a lone quote or a lone '\r') is copied as is.
// It returns the extended dst; no policy decisions are made here.
func transformContent(content, dst []byte) []byte {
	n := len(content)
	for pos := 0; pos < n; pos++ {
		cur := content[pos]
		if pos+1 < n {
			next := content[pos+1]
			if (cur == '"' && next == '"') || (cur == '\r' && next == '\n') {
				// In both pairs the byte to keep is the second one.
				dst = append(dst, next)
				pos++
				continue
			}
		}
		dst = append(dst, cur)
	}
	return dst
}