Skip to content

Commit 18dbe5b

Browse files
klauspost authored and gopherbot committed
hash/crc32: add AVX512 IEEE CRC32 calculation
Benchmark:

goos: windows
goarch: amd64
pkg: hash/crc32
cpu: AMD Ryzen 9 9950X 16-Core Processor
benchmark                                        old MB/s   new MB/s   speedup
BenchmarkCRC32/poly=IEEE/size=15/align=0-32      1081.48    1089.42    1.01x
BenchmarkCRC32/poly=IEEE/size=15/align=1-32      1085.87    1082.61    1.00x
BenchmarkCRC32/poly=IEEE/size=40/align=0-32      2756.33    2752.37    1.00x
BenchmarkCRC32/poly=IEEE/size=40/align=1-32      2758.27    2756.99    1.00x
BenchmarkCRC32/poly=IEEE/size=512/align=0-32     18133.44   18076.52   1.00x
BenchmarkCRC32/poly=IEEE/size=512/align=1-32     18151.05   18055.41   0.99x
BenchmarkCRC32/poly=IEEE/size=1kB/align=0-32     19902.93   48581.07   2.44x
BenchmarkCRC32/poly=IEEE/size=1kB/align=1-32     19966.99   48393.25   2.42x
BenchmarkCRC32/poly=IEEE/size=4kB/align=0-32     21690.33   51679.25   2.38x
BenchmarkCRC32/poly=IEEE/size=4kB/align=1-32     21655.30   51731.22   2.39x
BenchmarkCRC32/poly=IEEE/size=32kB/align=0-32    22046.57   46406.90   2.10x
BenchmarkCRC32/poly=IEEE/size=32kB/align=1-32    21986.22   46250.66   2.10x

AVX512 is enabled for inputs of 1KB and above. This rather high limit is chosen because AVX512 may be slower to ramp up than the regular SSE4 implementation for smaller inputs. This is not reflected in the benchmarks, since consecutive calls mean that the CPU is already "hot".

The 'HasAVX512VPCLMULQDQ' name mirrors the one in golang.org/x/sys/cpu.

Change-Id: Id23685d8e3cc412b6d397a7d70056844bdb79271
Change-Id: Id23685d8e3cc412b6d397a7d70056844bdb79271
GitHub-Last-Rev: 6639f07
GitHub-Pull-Request: #74701
Reviewed-on: https://go-review.googlesource.com/c/go/+/689435
Reviewed-by: Keith Randall <[email protected]>
LUCI-TryBot-Result: Go LUCI <[email protected]>
Reviewed-by: Michael Knyszek <[email protected]>
Auto-Submit: Keith Randall <[email protected]>
Auto-Submit: Michael Knyszek <[email protected]>
Reviewed-by: Keith Randall <[email protected]>
1 parent c641900 commit 18dbe5b

File tree

4 files changed

+84
-25
lines changed

4 files changed

+84
-25
lines changed

src/hash/crc32/crc32_amd64.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,11 @@ import (
1313
"unsafe"
1414
)
1515

16+
// Offset into internal/cpu records for use in assembly.
17+
const (
18+
offsetX86HasAVX512VPCLMULQDQL = unsafe.Offsetof(cpu.X86.HasAVX512VPCLMULQDQ)
19+
)
20+
1621
// This file contains the code to call the SSE 4.2 version of the Castagnoli
1722
// and IEEE CRC.
1823

src/hash/crc32/crc32_amd64.s

Lines changed: 48 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
// license that can be found in the LICENSE file.
44

55
#include "textflag.h"
6+
#include "go_asm.h"
67

78
// castagnoliSSE42 updates the (non-inverted) crc with the given buffer.
89
//
@@ -136,15 +137,23 @@ loop:
136137
// Linux kernel, since they avoid the costly
137138
// PSHUFB 16 byte reversal proposed in the
138139
// original Intel paper.
140+
// The 16-byte r2r1 pair is repeated four times (64 bytes in total) so that a single VMOVDQU64 can load it into every 128-bit lane of a ZMM register.
139141
DATA r2r1<>+0(SB)/8, $0x154442bd4
140142
DATA r2r1<>+8(SB)/8, $0x1c6e41596
143+
DATA r2r1<>+16(SB)/8, $0x154442bd4
144+
DATA r2r1<>+24(SB)/8, $0x1c6e41596
145+
DATA r2r1<>+32(SB)/8, $0x154442bd4
146+
DATA r2r1<>+40(SB)/8, $0x1c6e41596
147+
DATA r2r1<>+48(SB)/8, $0x154442bd4
148+
DATA r2r1<>+56(SB)/8, $0x1c6e41596
149+
141150
DATA r4r3<>+0(SB)/8, $0x1751997d0
142151
DATA r4r3<>+8(SB)/8, $0x0ccaa009e
143152
DATA rupoly<>+0(SB)/8, $0x1db710641
144153
DATA rupoly<>+8(SB)/8, $0x1f7011641
145154
DATA r5<>+0(SB)/8, $0x163cd6124
146155

147-
GLOBL r2r1<>(SB),RODATA,$16
156+
GLOBL r2r1<>(SB), RODATA, $64
148157
GLOBL r4r3<>(SB),RODATA,$16
149158
GLOBL rupoly<>(SB),RODATA,$16
150159
GLOBL r5<>(SB),RODATA,$8
@@ -158,6 +167,43 @@ TEXT ·ieeeCLMUL(SB),NOSPLIT,$0
158167
MOVQ p+8(FP), SI // data pointer
159168
MOVQ p_len+16(FP), CX // len(p)
160169

170+
// Take the AVX512 path only when VPCLMULQDQ is supported and len(p) >= 1024 bytes; otherwise jump to useSSE42.
171+
CMPB internal∕cpu·X86+const_offsetX86HasAVX512VPCLMULQDQL(SB), $1
172+
JNE useSSE42
173+
CMPQ CX, $1024
174+
JL useSSE42
175+
176+
// Use AVX512
177+
VPXORQ Z0, Z0, Z0
178+
VMOVQ AX, X0
179+
VMOVDQU64 (SI), Z1
180+
VPXORQ Z0, Z1, Z1 // Merge initial CRC value into Z1
181+
ADDQ $64, SI // buf+=64
182+
SUBQ $64, CX // len-=64
183+
184+
VMOVDQU64 r2r1<>+0(SB), Z0
185+
186+
loopback64Avx512:
187+
VMOVDQU64 (SI), Z11 // Load next
188+
VPCLMULQDQ $0x11, Z0, Z1, Z5
189+
VPCLMULQDQ $0, Z0, Z1, Z1
190+
VPTERNLOGD $0x96, Z11, Z5, Z1 // Combine results with xor into Z1
191+
192+
ADDQ $0x40, DI
193+
ADDQ $64, SI // buf+=64
194+
SUBQ $64, CX // len-=64
195+
CMPQ CX, $64 // Less than 64 bytes left?
196+
JGE loopback64Avx512
197+
198+
// Unfold result into XMM1-XMM4 to match SSE4 code.
199+
VEXTRACTF32X4 $1, Z1, X2 // X2: Second 128-bit lane
200+
VEXTRACTF32X4 $2, Z1, X3 // X3: Third 128-bit lane
201+
VEXTRACTF32X4 $3, Z1, X4 // X4: Fourth 128-bit lane
202+
VZEROUPPER
203+
JMP remain64
204+
205+
PCALIGN $16
206+
useSSE42:
161207
MOVOU (SI), X1
162208
MOVOU 16(SI), X2
163209
MOVOU 32(SI), X3
@@ -207,6 +253,7 @@ loopback64:
207253
CMPQ CX, $64 // Less than 64 bytes left?
208254
JGE loopback64
209255

256+
PCALIGN $16
210257
/* Fold result into a single register (X1) */
211258
remain64:
212259
MOVOA r4r3<>+0(SB), X0

src/internal/cpu/cpu.go

Lines changed: 24 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -26,29 +26,30 @@ var CacheLineSize uintptr = CacheLinePadSize
2626
// in addition to the cpuid feature bit being set.
2727
// The struct is padded to avoid false sharing.
2828
var X86 struct {
29-
_ CacheLinePad
30-
HasAES bool
31-
HasADX bool
32-
HasAVX bool
33-
HasAVX2 bool
34-
HasAVX512F bool
35-
HasAVX512BW bool
36-
HasAVX512VL bool
37-
HasBMI1 bool
38-
HasBMI2 bool
39-
HasERMS bool
40-
HasFSRM bool
41-
HasFMA bool
42-
HasOSXSAVE bool
43-
HasPCLMULQDQ bool
44-
HasPOPCNT bool
45-
HasRDTSCP bool
46-
HasSHA bool
47-
HasSSE3 bool
48-
HasSSSE3 bool
49-
HasSSE41 bool
50-
HasSSE42 bool
51-
_ CacheLinePad
29+
_ CacheLinePad
30+
HasAES bool
31+
HasADX bool
32+
HasAVX bool
33+
HasAVX2 bool
34+
HasAVX512F bool
35+
HasAVX512BW bool
36+
HasAVX512VL bool
37+
HasBMI1 bool
38+
HasBMI2 bool
39+
HasERMS bool
40+
HasFSRM bool
41+
HasFMA bool
42+
HasOSXSAVE bool
43+
HasPCLMULQDQ bool
44+
HasPOPCNT bool
45+
HasRDTSCP bool
46+
HasSHA bool
47+
HasSSE3 bool
48+
HasSSSE3 bool
49+
HasSSE41 bool
50+
HasSSE42 bool
51+
HasAVX512VPCLMULQDQ bool
52+
_ CacheLinePad
5253
}
5354

5455
// The booleans in ARM contain the correspondingly named cpu feature bit.

src/internal/cpu/cpu_x86.go

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,10 @@ const (
4040
cpuid_SHA = 1 << 29
4141
cpuid_AVX512BW = 1 << 30
4242
cpuid_AVX512VL = 1 << 31
43+
44+
// ecx bits
45+
cpuid_AVX512VPCLMULQDQ = 1 << 10
46+
4347
// edx bits
4448
cpuid_FSRM = 1 << 4
4549
// edx bits for CPUID 0x80000001
@@ -57,6 +61,7 @@ func doinit() {
5761
{Name: "pclmulqdq", Feature: &X86.HasPCLMULQDQ},
5862
{Name: "rdtscp", Feature: &X86.HasRDTSCP},
5963
{Name: "sha", Feature: &X86.HasSHA},
64+
{Name: "vpclmulqdq", Feature: &X86.HasAVX512VPCLMULQDQ},
6065
}
6166
level := getGOAMD64level()
6267
if level < 2 {
@@ -139,7 +144,7 @@ func doinit() {
139144
return
140145
}
141146

142-
_, ebx7, _, edx7 := cpuid(7, 0)
147+
_, ebx7, ecx7, edx7 := cpuid(7, 0)
143148
X86.HasBMI1 = isSet(ebx7, cpuid_BMI1)
144149
X86.HasAVX2 = isSet(ebx7, cpuid_AVX2) && osSupportsAVX
145150
X86.HasBMI2 = isSet(ebx7, cpuid_BMI2)
@@ -151,6 +156,7 @@ func doinit() {
151156
if X86.HasAVX512F {
152157
X86.HasAVX512BW = isSet(ebx7, cpuid_AVX512BW)
153158
X86.HasAVX512VL = isSet(ebx7, cpuid_AVX512VL)
159+
X86.HasAVX512VPCLMULQDQ = isSet(ecx7, cpuid_AVX512VPCLMULQDQ)
154160
}
155161

156162
X86.HasFSRM = isSet(edx7, cpuid_FSRM)

0 commit comments

Comments
 (0)