@@ -21,6 +21,13 @@ func bytesToInt8Slice(b []byte) []int8 {
// useAVX512 indicates whether AVX-512 instructions are available at runtime.
// It is assigned in init from archsimd.X86.AVX512() and is false until then.
var useAVX512 bool
2323
24+ // Cached broadcast values for fixed characters (initialized in init()).
25+ var (
26+ cachedQuoteCmp archsimd.Int8x64
27+ cachedCrCmp archsimd.Int8x64
28+ cachedNlCmp archsimd.Int8x64
29+ )
30+
2431// SIMD processing constants.
2532const (
2633 simdChunkSize = 64 // bytes per AVX-512 iteration
@@ -32,6 +39,12 @@ const (
3239
3340func init () {
3441 useAVX512 = archsimd .X86 .AVX512 ()
42+ if useAVX512 {
43+ // Pre-broadcast fixed characters to avoid repeated BroadcastInt8x64 calls
44+ cachedQuoteCmp = archsimd .BroadcastInt8x64 ('"' )
45+ cachedCrCmp = archsimd .BroadcastInt8x64 ('\r' )
46+ cachedNlCmp = archsimd .BroadcastInt8x64 ('\n' )
47+ }
3548}
3649
3750// =============================================================================
@@ -124,9 +137,9 @@ func (sr *scanResult) reset() {
124137 sr .newlineCount = 0
125138}
126139
127- // releaseScanResult returns a scanResult to the pool for reuse.
140+ // release returns the scanResult to the pool for reuse.
128141// Large results (>= scanResultLargeThreshold) are cached separately to survive GC.
129- func releaseScanResult (sr * scanResult ) {
142+ func (sr * scanResult ) release ( ) {
130143 if sr == nil {
131144 return
132145 }
@@ -203,12 +216,11 @@ func generateMasksScalar(data []byte, separator byte) (quote, sep, cr, nl uint64
203216
204217// generateMasksAVX512 generates masks using AVX-512 SIMD instructions.
205218// Requires AVX-512BW for ToBits() which uses VPMOVB2M instruction.
219+ // Uses cached broadcast values for fixed characters (quote, CR, NL) to avoid
220+ // repeated BroadcastInt8x64 calls.
206221func generateMasksAVX512 (data []byte , separator byte ) (quote , sep , cr , nl uint64 ) {
207- quoteCmp := archsimd .BroadcastInt8x64 ('"' )
208222 sepCmp := archsimd .BroadcastInt8x64 (int8 (separator ))
209- crCmp := archsimd .BroadcastInt8x64 ('\r' )
210- nlCmp := archsimd .BroadcastInt8x64 ('\n' )
211- return generateMasksAVX512WithCmp (data , quoteCmp , sepCmp , crCmp , nlCmp )
223+ return generateMasksAVX512WithCmp (data , cachedQuoteCmp , sepCmp , cachedCrCmp , cachedNlCmp )
212224}
213225
214226// generateMasksAVX512WithCmp generates masks reusing pre-broadcasted comparators.
@@ -446,10 +458,10 @@ type avx512MaskGenerator struct {
446458
447459func newAVX512MaskGenerator (separator byte ) * avx512MaskGenerator {
448460 return & avx512MaskGenerator {
449- quoteCmp : archsimd . BroadcastInt8x64 ( '"' ) ,
461+ quoteCmp : cachedQuoteCmp ,
450462 sepCmp : archsimd .BroadcastInt8x64 (int8 (separator )),
451- crCmp : archsimd . BroadcastInt8x64 ( '\r' ) ,
452- nlCmp : archsimd . BroadcastInt8x64 ( '\n' ) ,
463+ crCmp : cachedCrCmp ,
464+ nlCmp : cachedNlCmp ,
453465 }
454466}
455467
@@ -492,20 +504,34 @@ func scanBufferWithGenerator(buf []byte, gen maskGenerator) *scanResult {
492504 result := acquireScanResult (chunkCount )
493505 state := scanState {}
494506
495- curMasks , curValidBits := generateFirstChunkMasks (buf , gen , result )
496- nextMasks := generateSecondChunkMasks (buf , chunkCount , gen , result )
507+ sc := bufferScanContext {
508+ buf : buf ,
509+ gen : gen ,
510+ result : result ,
511+ chunkCount : chunkCount ,
512+ }
513+
514+ curMasks , curValidBits := sc .generateFirstChunkMasks ()
515+ nextMasks := sc .generateSecondChunkMasks ()
497516
498517 for chunkIdx := 0 ; chunkIdx < chunkCount ; chunkIdx ++ {
499518 processChunk (chunkIdx , curMasks , nextMasks , curValidBits , & state , result )
500519
501520 curMasks = nextMasks
502- nextMasks , curValidBits = generateNextLookahead (buf , chunkIdx , chunkCount , gen , result )
521+ nextMasks , curValidBits = sc . generateNextLookahead (chunkIdx )
503522 }
504523
505524 result .finalQuoted = state .quoted
506525 return result
507526}
508527
528+ type bufferScanContext struct {
529+ buf []byte
530+ gen maskGenerator
531+ result * scanResult
532+ chunkCount int
533+ }
534+
509535// acquireScanResult gets a pooled scanResult and initializes it for the given chunk count.
510536func acquireScanResult (chunkCount int ) * scanResult {
511537 if chunkCount >= scanResultLargeThreshold {
@@ -531,61 +557,61 @@ func acquireScanResult(chunkCount int) *scanResult {
531557
532558// generateFirstChunkMasks generates masks for the first chunk of the buffer.
533559// Handles both full chunks and partial (padded) chunks.
534- func generateFirstChunkMasks ( buf [] byte , gen maskGenerator , result * scanResult ) (chunkMasks , int ) {
535- if len (buf ) >= simdChunkSize {
536- return gen .generateFull (buf [0 :simdChunkSize ]), simdChunkSize
560+ func ( sc * bufferScanContext ) generateFirstChunkMasks ( ) (chunkMasks , int ) {
561+ if len (sc . buf ) >= simdChunkSize {
562+ return sc . gen .generateFull (sc . buf [0 :simdChunkSize ]), simdChunkSize
537563 }
538564
539- masks , validBits := gen .generatePadded (buf )
540- result .lastChunkBits = validBits
565+ masks , validBits := sc . gen .generatePadded (sc . buf )
566+ sc . result .lastChunkBits = validBits
541567 return masks , validBits
542568}
543569
544570// generateSecondChunkMasks generates lookahead masks for the second chunk if it exists.
545571// Returns empty masks if there is no second chunk.
546- func generateSecondChunkMasks ( buf [] byte , chunkCount int , gen maskGenerator , result * scanResult ) chunkMasks {
547- if chunkCount <= 1 || len (buf ) <= simdChunkSize {
572+ func ( sc * bufferScanContext ) generateSecondChunkMasks ( ) chunkMasks {
573+ if sc . chunkCount <= 1 || len (sc . buf ) <= simdChunkSize {
548574 return chunkMasks {}
549575 }
550576
551- if len (buf ) >= 2 * simdChunkSize {
552- return gen .generateFull (buf [simdChunkSize : 2 * simdChunkSize ])
577+ if len (sc . buf ) >= 2 * simdChunkSize {
578+ return sc . gen .generateFull (sc . buf [simdChunkSize : 2 * simdChunkSize ])
553579 }
554580
555- masks , validBits := gen .generatePadded (buf [simdChunkSize :])
556- if chunkCount == 2 {
557- result .lastChunkBits = validBits
581+ masks , validBits := sc . gen .generatePadded (sc . buf [simdChunkSize :])
582+ if sc . chunkCount == 2 {
583+ sc . result .lastChunkBits = validBits
558584 }
559585 return masks
560586}
561587
562588// generateNextLookahead generates masks for the chunk two positions ahead (lookahead).
563589// This enables processing current chunk while knowing what comes next.
564- func generateNextLookahead ( buf [] byte , chunkIdx , chunkCount int , gen maskGenerator , result * scanResult ) (chunkMasks , int ) {
590+ func ( sc * bufferScanContext ) generateNextLookahead ( chunkIdx int ) (chunkMasks , int ) {
565591 lookaheadIdx := chunkIdx + 2
566- if lookaheadIdx >= chunkCount {
567- return handleFinalChunkValidBits (buf , chunkIdx , chunkCount , result )
592+ if lookaheadIdx >= sc . chunkCount {
593+ return sc . handleFinalChunkValidBits (chunkIdx )
568594 }
569595
570596 offset := lookaheadIdx * simdChunkSize
571- remaining := len (buf ) - offset
597+ remaining := len (sc . buf ) - offset
572598
573599 if remaining >= simdChunkSize {
574- return gen .generateFull (buf [offset : offset + simdChunkSize ]), simdChunkSize
600+ return sc . gen .generateFull (sc . buf [offset : offset + simdChunkSize ]), simdChunkSize
575601 }
576602
577- masks , validBits := gen .generatePadded (buf [offset :])
578- result .lastChunkBits = validBits
603+ masks , validBits := sc . gen .generatePadded (sc . buf [offset :])
604+ sc . result .lastChunkBits = validBits
579605 return masks , validBits
580606}
581607
582608// handleFinalChunkValidBits computes valid bits when no more lookahead chunks exist.
583- func handleFinalChunkValidBits ( buf [] byte , chunkIdx , chunkCount int , result * scanResult ) (chunkMasks , int ) {
609+ func ( sc * bufferScanContext ) handleFinalChunkValidBits ( chunkIdx int ) (chunkMasks , int ) {
584610 validBits := simdChunkSize
585611
586- if chunkIdx + 1 == chunkCount - 1 && len (buf )% simdChunkSize != 0 {
587- validBits = len (buf ) % simdChunkSize
588- result .lastChunkBits = validBits
612+ if chunkIdx + 1 == sc . chunkCount - 1 && len (sc . buf )% simdChunkSize != 0 {
613+ validBits = len (sc . buf ) % simdChunkSize
614+ sc . result .lastChunkBits = validBits
589615 }
590616
591617 return chunkMasks {}, validBits
0 commit comments