@@ -21,6 +21,14 @@ func bytesToInt8Slice(b []byte) []int8 {
2121// useAVX512 indicates whether AVX-512 instructions are available at runtime.
2222var useAVX512 bool
2323
24+ // Cached broadcast values for fixed characters (initialized in init()).
25+ var (
26+ // AVX-512 (64-byte) cached values
27+ cachedQuoteCmp archsimd.Int8x64
28+ cachedCrCmp archsimd.Int8x64
29+ cachedNlCmp archsimd.Int8x64
30+ )
31+
2432// SIMD processing constants.
2533const (
2634 simdChunkSize = 64 // bytes per AVX-512 iteration
@@ -32,6 +40,12 @@ const (
3240
3341func init () {
3442 useAVX512 = archsimd .X86 .AVX512 ()
43+ if useAVX512 {
44+ // Pre-broadcast fixed characters to avoid repeated BroadcastInt8x64 calls
45+ cachedQuoteCmp = archsimd .BroadcastInt8x64 ('"' )
46+ cachedCrCmp = archsimd .BroadcastInt8x64 ('\r' )
47+ cachedNlCmp = archsimd .BroadcastInt8x64 ('\n' )
48+ }
3549}
3650
3751// =============================================================================
@@ -124,9 +138,9 @@ func (sr *scanResult) reset() {
124138 sr .newlineCount = 0
125139}
126140
127- // releaseScanResult returns a scanResult to the pool for reuse.
141+ // release returns the scanResult to the pool for reuse.
128142// Large results (>= scanResultLargeThreshold) are cached separately to survive GC.
129- func releaseScanResult (sr * scanResult ) {
143+ func (sr * scanResult ) release ( ) {
130144 if sr == nil {
131145 return
132146 }
@@ -203,12 +217,11 @@ func generateMasksScalar(data []byte, separator byte) (quote, sep, cr, nl uint64
203217
204218// generateMasksAVX512 generates masks using AVX-512 SIMD instructions.
205219// Requires AVX-512BW for ToBits() which uses VPMOVB2M instruction.
220+ // Uses cached broadcast values for fixed characters (quote, CR, NL) to avoid
221+ // repeated BroadcastInt8x64 calls.
206222func generateMasksAVX512 (data []byte , separator byte ) (quote , sep , cr , nl uint64 ) {
207- quoteCmp := archsimd .BroadcastInt8x64 ('"' )
208223 sepCmp := archsimd .BroadcastInt8x64 (int8 (separator ))
209- crCmp := archsimd .BroadcastInt8x64 ('\r' )
210- nlCmp := archsimd .BroadcastInt8x64 ('\n' )
211- return generateMasksAVX512WithCmp (data , quoteCmp , sepCmp , crCmp , nlCmp )
224+ return generateMasksAVX512WithCmp (data , cachedQuoteCmp , sepCmp , cachedCrCmp , cachedNlCmp )
212225}
213226
214227// generateMasksAVX512WithCmp generates masks reusing pre-broadcasted comparators.
@@ -446,10 +459,10 @@ type avx512MaskGenerator struct {
446459
447460func newAVX512MaskGenerator (separator byte ) * avx512MaskGenerator {
448461 return & avx512MaskGenerator {
449- quoteCmp : archsimd . BroadcastInt8x64 ( '"' ) ,
462+ quoteCmp : cachedQuoteCmp ,
450463 sepCmp : archsimd .BroadcastInt8x64 (int8 (separator )),
451- crCmp : archsimd . BroadcastInt8x64 ( '\r' ) ,
452- nlCmp : archsimd . BroadcastInt8x64 ( '\n' ) ,
464+ crCmp : cachedCrCmp ,
465+ nlCmp : cachedNlCmp ,
453466 }
454467}
455468
@@ -492,20 +505,34 @@ func scanBufferWithGenerator(buf []byte, gen maskGenerator) *scanResult {
492505 result := acquireScanResult (chunkCount )
493506 state := scanState {}
494507
495- curMasks , curValidBits := generateFirstChunkMasks (buf , gen , result )
496- nextMasks := generateSecondChunkMasks (buf , chunkCount , gen , result )
508+ sc := bufferScanContext {
509+ buf : buf ,
510+ gen : gen ,
511+ result : result ,
512+ chunkCount : chunkCount ,
513+ }
514+
515+ curMasks , curValidBits := sc .generateFirstChunkMasks ()
516+ nextMasks := sc .generateSecondChunkMasks ()
497517
498518 for chunkIdx := 0 ; chunkIdx < chunkCount ; chunkIdx ++ {
499519 processChunk (chunkIdx , curMasks , nextMasks , curValidBits , & state , result )
500520
501521 curMasks = nextMasks
502- nextMasks , curValidBits = generateNextLookahead (buf , chunkIdx , chunkCount , gen , result )
522+ nextMasks , curValidBits = sc . generateNextLookahead (chunkIdx )
503523 }
504524
505525 result .finalQuoted = state .quoted
506526 return result
507527}
508528
529+ type bufferScanContext struct {
530+ buf []byte
531+ gen maskGenerator
532+ result * scanResult
533+ chunkCount int
534+ }
535+
509536// acquireScanResult gets a pooled scanResult and initializes it for the given chunk count.
510537func acquireScanResult (chunkCount int ) * scanResult {
511538 if chunkCount >= scanResultLargeThreshold {
@@ -531,61 +558,61 @@ func acquireScanResult(chunkCount int) *scanResult {
531558
532559// generateFirstChunkMasks generates masks for the first chunk of the buffer.
533560// Handles both full chunks and partial (padded) chunks.
534- func generateFirstChunkMasks ( buf [] byte , gen maskGenerator , result * scanResult ) (chunkMasks , int ) {
535- if len (buf ) >= simdChunkSize {
536- return gen .generateFull (buf [0 :simdChunkSize ]), simdChunkSize
561+ func ( sc * bufferScanContext ) generateFirstChunkMasks ( ) (chunkMasks , int ) {
562+ if len (sc . buf ) >= simdChunkSize {
563+ return sc . gen .generateFull (sc . buf [0 :simdChunkSize ]), simdChunkSize
537564 }
538565
539- masks , validBits := gen .generatePadded (buf )
540- result .lastChunkBits = validBits
566+ masks , validBits := sc . gen .generatePadded (sc . buf )
567+ sc . result .lastChunkBits = validBits
541568 return masks , validBits
542569}
543570
544571// generateSecondChunkMasks generates lookahead masks for the second chunk if it exists.
545572// Returns empty masks if there is no second chunk.
546- func generateSecondChunkMasks ( buf [] byte , chunkCount int , gen maskGenerator , result * scanResult ) chunkMasks {
547- if chunkCount <= 1 || len (buf ) <= simdChunkSize {
573+ func ( sc * bufferScanContext ) generateSecondChunkMasks ( ) chunkMasks {
574+ if sc . chunkCount <= 1 || len (sc . buf ) <= simdChunkSize {
548575 return chunkMasks {}
549576 }
550577
551- if len (buf ) >= 2 * simdChunkSize {
552- return gen .generateFull (buf [simdChunkSize : 2 * simdChunkSize ])
578+ if len (sc . buf ) >= 2 * simdChunkSize {
579+ return sc . gen .generateFull (sc . buf [simdChunkSize : 2 * simdChunkSize ])
553580 }
554581
555- masks , validBits := gen .generatePadded (buf [simdChunkSize :])
556- if chunkCount == 2 {
557- result .lastChunkBits = validBits
582+ masks , validBits := sc . gen .generatePadded (sc . buf [simdChunkSize :])
583+ if sc . chunkCount == 2 {
584+ sc . result .lastChunkBits = validBits
558585 }
559586 return masks
560587}
561588
562589// generateNextLookahead generates masks for the chunk two positions ahead (lookahead).
563590// This enables processing current chunk while knowing what comes next.
564- func generateNextLookahead ( buf [] byte , chunkIdx , chunkCount int , gen maskGenerator , result * scanResult ) (chunkMasks , int ) {
591+ func ( sc * bufferScanContext ) generateNextLookahead ( chunkIdx int ) (chunkMasks , int ) {
565592 lookaheadIdx := chunkIdx + 2
566- if lookaheadIdx >= chunkCount {
567- return handleFinalChunkValidBits (buf , chunkIdx , chunkCount , result )
593+ if lookaheadIdx >= sc . chunkCount {
594+ return sc . handleFinalChunkValidBits (chunkIdx )
568595 }
569596
570597 offset := lookaheadIdx * simdChunkSize
571- remaining := len (buf ) - offset
598+ remaining := len (sc . buf ) - offset
572599
573600 if remaining >= simdChunkSize {
574- return gen .generateFull (buf [offset : offset + simdChunkSize ]), simdChunkSize
601+ return sc . gen .generateFull (sc . buf [offset : offset + simdChunkSize ]), simdChunkSize
575602 }
576603
577- masks , validBits := gen .generatePadded (buf [offset :])
578- result .lastChunkBits = validBits
604+ masks , validBits := sc . gen .generatePadded (sc . buf [offset :])
605+ sc . result .lastChunkBits = validBits
579606 return masks , validBits
580607}
581608
582609// handleFinalChunkValidBits computes valid bits when no more lookahead chunks exist.
583- func handleFinalChunkValidBits ( buf [] byte , chunkIdx , chunkCount int , result * scanResult ) (chunkMasks , int ) {
610+ func ( sc * bufferScanContext ) handleFinalChunkValidBits ( chunkIdx int ) (chunkMasks , int ) {
584611 validBits := simdChunkSize
585612
586- if chunkIdx + 1 == chunkCount - 1 && len (buf )% simdChunkSize != 0 {
587- validBits = len (buf ) % simdChunkSize
588- result .lastChunkBits = validBits
613+ if chunkIdx + 1 == sc . chunkCount - 1 && len (sc . buf )% simdChunkSize != 0 {
614+ validBits = len (sc . buf ) % simdChunkSize
615+ sc . result .lastChunkBits = validBits
589616 }
590617
591618 return chunkMasks {}, validBits
0 commit comments