Skip to content

Commit dfe96bb

Browse files
authored
feat: use PCLMULQDQ (#77)
* feat: use PCLMULQDQ * fix: use useAVX512 flag
1 parent e7f1f96 commit dfe96bb

File tree

3 files changed

+274
-18
lines changed

3 files changed

+274
-18
lines changed

benchmark_test.go

Lines changed: 62 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -570,7 +570,7 @@ func BenchmarkGenerateMasks(b *testing.B) {
570570
copy(data, []byte(`"field1","field2","field3","field4","field5","field6","fie"`))
571571

572572
b.ResetTimer()
573-
for i := 0; i < b.N; i++ {
573+
for b.Loop() {
574574
generateMasks(data, ',')
575575
}
576576
}
@@ -590,7 +590,7 @@ func BenchmarkGenerateMasksPadded(b *testing.B) {
590590
}
591591

592592
b.ResetTimer()
593-
for i := 0; i < b.N; i++ {
593+
for b.Loop() {
594594
generateMasksPadded(data, ',')
595595
}
596596
})
@@ -617,7 +617,7 @@ func BenchmarkScanBuffer(b *testing.B) {
617617

618618
b.ResetTimer()
619619
b.SetBytes(int64(size))
620-
for i := 0; i < b.N; i++ {
620+
for b.Loop() {
621621
scanBuffer(data, ',')
622622
}
623623
})
@@ -665,7 +665,65 @@ func BenchmarkParseBuffer(b *testing.B) {
665665
b.ResetTimer()
666666
b.ReportAllocs()
667667

668-
for i := 0; i < b.N; i++ {
668+
for b.Loop() {
669669
_ = parseBuffer(data, sr)
670670
}
671671
}
672+
673+
// =============================================================================
674+
// prefixXOR Benchmarks - PCLMULQDQ
675+
// =============================================================================
676+
677+
func BenchmarkPrefixXOR(b *testing.B) {
678+
// Create test masks with varying densities
679+
testCases := []struct {
680+
name string
681+
mask uint64
682+
}{
683+
{"empty", 0},
684+
{"single_bit", 1},
685+
{"sparse", 0x0001000100010001}, // few bits set
686+
{"medium", 0x5555555555555555}, // alternating bits
687+
{"dense", 0xFFFFFFFFFFFFFFFF}, // all bits set
688+
{"realistic", 0b0100010001000100010001000100010001000100010001000100010001000100}, // quote-like pattern
689+
}
690+
691+
for _, tc := range testCases {
692+
b.Run(tc.name, func(b *testing.B) {
693+
for b.Loop() {
694+
_ = prefixXOR(tc.mask)
695+
}
696+
})
697+
}
698+
}
699+
700+
// BenchmarkPrefixXORThroughput measures throughput with sequential masks
701+
func BenchmarkPrefixXORThroughput(b *testing.B) {
702+
// Pre-generate masks to avoid setup overhead
703+
masks := make([]uint64, 1024)
704+
state := uint64(0xDEADBEEFCAFEBABE)
705+
for i := range masks {
706+
state ^= state << 13
707+
state ^= state >> 7
708+
state ^= state << 17
709+
masks[i] = state
710+
}
711+
712+
idx := 0
713+
for b.Loop() {
714+
_ = prefixXOR(masks[idx%len(masks)])
715+
idx++
716+
}
717+
}
718+
719+
// BenchmarkPrefixXORLatencyChain measures latency when each call depends on previous
720+
func BenchmarkPrefixXORLatencyChain(b *testing.B) {
721+
mask := uint64(0x5555555555555555)
722+
for b.Loop() {
723+
mask = prefixXOR(mask)
724+
}
725+
// Prevent compiler from optimizing away
726+
if mask == 0 {
727+
b.Fatal("unexpected zero")
728+
}
729+
}

simd_scanner.go

Lines changed: 50 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,9 @@ var (
2828
cachedCrCmp archsimd.Int8x64
2929
cachedNlCmp archsimd.Int8x64
3030
cachedSepCmp [cachedSepCmpCount]archsimd.Int8x64
31+
32+
// PCLMULQDQ cached value: all-ones for carryless multiplication
33+
cachedAllOnes archsimd.Uint64x2
3134
)
3235

3336
// SIMD processing constants.
@@ -50,6 +53,13 @@ func init() {
5053
cachedQuoteCmp = cachedSepCmp['"']
5154
cachedCrCmp = cachedSepCmp['\r']
5255
cachedNlCmp = cachedSepCmp['\n']
56+
57+
// Pre-load all-ones value for carryless multiplication (PCLMULQDQ)
58+
// Used in prefixXOR: mask × 0xFFFFFFFFFFFFFFFF computes prefix XOR
59+
cachedAllOnes = archsimd.LoadUint64x2(&[2]uint64{
60+
0xFFFFFFFFFFFFFFFF,
61+
0xFFFFFFFFFFFFFFFF,
62+
})
5363
}
5464
}
5565

@@ -63,6 +73,42 @@ func shouldUseSIMD(dataLen int) bool {
6373
return useAVX512 && dataLen >= simdMinThreshold
6474
}
6575

76+
// =============================================================================
77+
// Prefix XOR (Quote Region Mask Computation)
78+
// =============================================================================
79+
80+
// prefixXOR computes the prefix XOR of the input mask.
81+
// For each bit position i, the result bit is the XOR of bits 0 through i.
82+
//
83+
// This is used to convert a quote position mask into an "inside quotes" mask:
84+
//
85+
// input: 0b01001010 (quote positions at 1, 3, 6)
86+
// output: 0b11000110 (inside quote regions)
87+
//
88+
// When AVX-512 is available, uses PCLMULQDQ instruction for ~3x speedup.
89+
// Mathematical basis (carryless multiplication in GF(2)):
90+
//
91+
// mask × all_ones = mask × (1 + 2 + 4 + ... + 2^63)
92+
// = mask ^ (mask << 1) ^ (mask << 2) ^ ... ^ (mask << 63)
93+
//
94+
// The lower 64 bits of this product give us the prefix XOR.
95+
func prefixXOR(mask uint64) uint64 {
96+
if useAVX512 {
97+
// PCLMULQDQ path: ~3-4 instructions
98+
maskVec := archsimd.LoadUint64x2(&[2]uint64{mask, 0})
99+
result := maskVec.CarrylessMultiply(0, 0, cachedAllOnes)
100+
return result.GetElem(0)
101+
}
102+
// Scalar path: 6 shifts + 6 XORs = 12 instructions
103+
mask ^= mask << 1
104+
mask ^= mask << 2
105+
mask ^= mask << 4
106+
mask ^= mask << 8
107+
mask ^= mask << 16
108+
mask ^= mask << 32
109+
return mask
110+
}
111+
66112
// =============================================================================
67113
// Core Data Structures
68114
// =============================================================================
@@ -391,13 +437,8 @@ func processQuotesAndSeparators(quoteMask, sepMask, nextQuoteMask uint64, state
391437
state.quoted = quoted
392438

393439
// Invalidate separators using prefix XOR on cleaned quote mask
394-
inQuote := quoteMaskOut
395-
inQuote ^= inQuote << 1
396-
inQuote ^= inQuote << 2
397-
inQuote ^= inQuote << 4
398-
inQuote ^= inQuote << 8
399-
inQuote ^= inQuote << 16
400-
inQuote ^= inQuote << 32
440+
// Uses PCLMULQDQ when available for ~3x fewer instructions
441+
inQuote := prefixXOR(quoteMaskOut)
401442

402443
if initialQuoted != 0 {
403444
inQuote = ^inQuote
@@ -410,13 +451,8 @@ func processQuotesAndSeparators(quoteMask, sepMask, nextQuoteMask uint64, state
410451
// invalidateNewlinesInQuotes removes newline bits that are inside quoted regions.
411452
func invalidateNewlinesInQuotes(quoteMask, newlineMask uint64, state *scanState) uint64 {
412453
// Prefix XOR: inQuote[i] = 1 iff positions 0..i have odd number of quotes
413-
inQuote := quoteMask
414-
inQuote ^= inQuote << 1
415-
inQuote ^= inQuote << 2
416-
inQuote ^= inQuote << 4
417-
inQuote ^= inQuote << 8
418-
inQuote ^= inQuote << 16
419-
inQuote ^= inQuote << 32
454+
// Uses PCLMULQDQ when available for ~3x fewer instructions
455+
inQuote := prefixXOR(quoteMask)
420456

421457
// If we started inside a quoted region, invert the mask
422458
if state.quoted != 0 {

simd_scanner_test.go

Lines changed: 162 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1160,3 +1160,165 @@ func popcount(x uint64) int {
11601160
}
11611161
return count
11621162
}
1163+
1164+
// ============================================================================
1165+
// TestPrefixXOR - Test prefix XOR computation (PCLMULQDQ optimization)
1166+
// ============================================================================
1167+
1168+
func TestPrefixXOR(t *testing.T) {
1169+
// prefixXOR computes the cumulative XOR: result[i] = XOR of bits 0..i
1170+
// This is used for quote region detection: bit i is set if there's an
1171+
// odd number of quotes at positions 0 through i (inclusive).
1172+
tests := []struct {
1173+
name string
1174+
input uint64
1175+
want uint64
1176+
}{
1177+
{
1178+
name: "empty",
1179+
input: 0,
1180+
want: 0, // no quotes = no regions
1181+
},
1182+
{
1183+
name: "single_bit_0",
1184+
input: 1, // quote at position 0
1185+
// All positions 0..63 have odd count (1) → all bits set
1186+
want: 0xFFFFFFFFFFFFFFFF,
1187+
},
1188+
{
1189+
name: "single_bit_1",
1190+
input: 2, // quote at position 1 (0b10)
1191+
// pos 0: 0 quotes → 0; pos 1+: 1 quote → 1
1192+
want: 0xFFFFFFFFFFFFFFFE,
1193+
},
1194+
{
1195+
name: "single_bit_2",
1196+
input: 4, // quote at position 2 (0b100)
1197+
// pos 0,1: 0 quotes → 0; pos 2+: 1 quote → 1
1198+
want: 0xFFFFFFFFFFFFFFFC,
1199+
},
1200+
{
1201+
name: "two_adjacent_bits",
1202+
input: 0b11, // quotes at positions 0,1
1203+
// pos 0: 1 quote → 1; pos 1+: 2 quotes → 0
1204+
want: 0x0000000000000001,
1205+
},
1206+
{
1207+
name: "alternating_bits_8bit",
1208+
input: 0xAA, // 0b10101010 = quotes at positions 1,3,5,7
1209+
// pos 0: 0→0, pos 1: 1→1, pos 2: 1→1, pos 3: 2→0,
1210+
// pos 4: 2→0, pos 5: 3→1, pos 6: 3→1, pos 7: 4→0
1211+
// Low 8 bits: 0b01100110 = 0x66
1212+
// pos 8+: 4 quotes (even) → 0
1213+
want: 0x0000000000000066,
1214+
},
1215+
{
1216+
name: "quote_example",
1217+
input: 0x4A, // 0b01001010 = quotes at positions 1, 3, 6
1218+
// pos 0: 0→0, pos 1: 1→1, pos 2: 1→1, pos 3: 2→0,
1219+
// pos 4: 2→0, pos 5: 2→0, pos 6: 3→1, pos 7+: 3→1 (odd)
1220+
// Low 8 bits: 0b11000110 = 0xC6
1221+
// Upper bits: all 1 (odd count continues)
1222+
want: 0xFFFFFFFFFFFFFFC6,
1223+
},
1224+
{
1225+
name: "all_ones_8bit",
1226+
input: 0xFF, // quotes at all positions 0-7
1227+
// pos 0: 1→1, pos 1: 2→0, pos 2: 3→1, pos 3: 4→0, ...
1228+
// Pattern: 10101010... = 0x55 for low 8 bits
1229+
// pos 8+: 8 quotes (even) → 0
1230+
want: 0x0000000000000055,
1231+
},
1232+
{
1233+
name: "high_bit_only",
1234+
input: uint64(1) << 63, // quote at position 63 only
1235+
// pos 0-62: 0 quotes → 0; pos 63: 1 quote → 1
1236+
want: 0x8000000000000000,
1237+
},
1238+
{
1239+
name: "bits_0_and_63",
1240+
input: 1 | (uint64(1) << 63), // quotes at positions 0 and 63
1241+
// pos 0: 1→1, pos 1-62: 1→1 (odd), pos 63: 2→0
1242+
want: 0x7FFFFFFFFFFFFFFF,
1243+
},
1244+
}
1245+
1246+
for _, tt := range tests {
1247+
t.Run(tt.name, func(t *testing.T) {
1248+
got := prefixXOR(tt.input)
1249+
if got != tt.want {
1250+
t.Errorf("prefixXOR(0x%016x) = 0x%016x, want 0x%016x",
1251+
tt.input, got, tt.want)
1252+
}
1253+
})
1254+
}
1255+
}
1256+
1257+
// TestPrefixXORQuoteRegions tests the actual use case: converting quote positions to regions
1258+
func TestPrefixXORQuoteRegions(t *testing.T) {
1259+
// prefixXOR result[i] = 1 means odd number of quotes at positions 0..i (inclusive)
1260+
// In CSV parsing context:
1261+
// - Position i has inQuote[i]=1 if we're at or after an opening quote but not past a closing quote
1262+
// - Quote chars themselves are considered "in quote" for masking purposes
1263+
tests := []struct {
1264+
name string
1265+
quotePos []int // positions where quotes appear
1266+
wantInQuote []int // positions where inQuote bit should be 1
1267+
wantOutQuote []int // positions where inQuote bit should be 0
1268+
initialQuoted bool // if true, we start inside a quoted region
1269+
}{
1270+
{
1271+
name: "simple_quoted_field",
1272+
quotePos: []int{0, 5}, // "hello" - quotes at 0 and 5
1273+
// pos 0: 1 quote → 1; pos 1-4: 1 quote → 1; pos 5: 2 quotes → 0
1274+
wantInQuote: []int{0, 1, 2, 3, 4},
1275+
wantOutQuote: []int{5, 6, 7, 8},
1276+
},
1277+
{
1278+
name: "two_quoted_fields",
1279+
quotePos: []int{0, 3, 5, 8}, // "ab","cd" - pattern at 0,3,5,8
1280+
// pos 0-2: 1 quote → 1; pos 3-4: 2 quotes → 0; pos 5-7: 3 quotes → 1; pos 8+: 4 quotes → 0
1281+
wantInQuote: []int{0, 1, 2, 5, 6, 7},
1282+
wantOutQuote: []int{3, 4, 8, 9, 10},
1283+
},
1284+
{
1285+
name: "start_inside_quote",
1286+
quotePos: []int{5}, // closing quote at 5
1287+
// Without inversion: pos 0-4: 0 quotes → 0; pos 5+: 1 quote → 1
1288+
// With inversion (initialQuoted=true): pos 0-4: 1; pos 5+: 0
1289+
wantInQuote: []int{0, 1, 2, 3, 4},
1290+
wantOutQuote: []int{5, 6, 7, 8},
1291+
initialQuoted: true,
1292+
},
1293+
}
1294+
1295+
for _, tt := range tests {
1296+
t.Run(tt.name, func(t *testing.T) {
1297+
// Build quote mask
1298+
var quoteMask uint64
1299+
for _, pos := range tt.quotePos {
1300+
quoteMask |= uint64(1) << pos
1301+
}
1302+
1303+
// Compute in-quote mask using prefixXOR
1304+
inQuote := prefixXOR(quoteMask)
1305+
if tt.initialQuoted {
1306+
inQuote = ^inQuote
1307+
}
1308+
1309+
// Verify expected in-quote positions
1310+
for _, pos := range tt.wantInQuote {
1311+
if inQuote&(uint64(1)<<pos) == 0 {
1312+
t.Errorf("position %d should be inside quotes (bit=1), but bit is 0. inQuote=0x%016x", pos, inQuote)
1313+
}
1314+
}
1315+
1316+
// Verify expected out-quote positions
1317+
for _, pos := range tt.wantOutQuote {
1318+
if inQuote&(uint64(1)<<pos) != 0 {
1319+
t.Errorf("position %d should be outside quotes (bit=0), but bit is 1. inQuote=0x%016x", pos, inQuote)
1320+
}
1321+
}
1322+
})
1323+
}
1324+
}

0 commit comments

Comments
 (0)