20
20
#define USE_SLICING
21
21
#endif
22
22
23
- static uint crc_tables[8 ][256 ]; // Tables for Slicing-by-8.
23
+ static uint crc_tables[16 ][256 ]; // Tables for Slicing-by-16.
24
+
25
+ #ifdef USE_NEON_CRC32
26
+ static bool CRC_Neon;
27
+ #endif
24
28
25
29
26
30
// Build the classic CRC32 lookup table.
@@ -36,6 +40,19 @@ void InitCRC32(uint *CRCTab)
36
40
C=(C & 1 ) ? (C>>1 )^0xEDB88320 : (C>>1 );
37
41
CRCTab[I]=C;
38
42
}
43
+
44
+ #ifdef USE_NEON_CRC32
45
+ #ifdef _APPLE
46
+ // getauxval isn't available in OS X
47
+ uint Value=0 ;
48
+ size_t Size=sizeof (Value);
49
+ int RetCode=sysctlbyname (" hw.optional.armv8_crc32" ,&Value,&Size,NULL ,0 );
50
+ CRC_Neon=RetCode==0 && Value!=0 ;
51
+ #else
52
+ CRC_Neon=(getauxval (AT_HWCAP) & HWCAP_CRC32)!=0 ;
53
+ #endif
54
+ #endif
55
+
39
56
}
40
57
41
58
@@ -47,7 +64,7 @@ static void InitTables()
47
64
for (uint I=0 ;I<256 ;I++) // Build additional lookup tables.
48
65
{
49
66
uint C=crc_tables[0 ][I];
50
- for (uint J=1 ;J<8 ;J++)
67
+ for (uint J=1 ;J<16 ;J++)
51
68
{
52
69
C=crc_tables[0 ][(byte)C]^(C>>8 );
53
70
crc_tables[J][I]=C;
@@ -63,29 +80,66 @@ uint CRC32(uint StartCRC,const void *Addr,size_t Size)
63
80
{
64
81
byte *Data=(byte *)Addr;
65
82
83
+ #ifdef USE_NEON_CRC32
84
+ if (CRC_Neon)
85
+ {
86
+ for (;Size>=8 ;Size-=8 ,Data+=8 )
87
+ #ifdef __clang__
88
+ StartCRC = __builtin_arm_crc32d (StartCRC, RawGet8 (Data));
89
+ #else
90
+ StartCRC = __builtin_aarch64_crc32x (StartCRC, RawGet8 (Data));
91
+ #endif
92
+ for (;Size>0 ;Size--,Data++) // Process left data.
93
+ #ifdef __clang__
94
+ StartCRC = __builtin_arm_crc32b (StartCRC, *Data);
95
+ #else
96
+ StartCRC = __builtin_aarch64_crc32b (StartCRC, *Data);
97
+ #endif
98
+ return StartCRC;
99
+ }
100
+ #endif
101
+
66
102
#ifdef USE_SLICING
67
- // Align Data to 8 for better performance and to avoid ALLOW_MISALIGNED
103
+ // Align Data to 16 for better performance and to avoid ALLOW_MISALIGNED
68
104
// check below.
69
- for (;Size>0 && ((size_t )Data & 7 ) ;Size--,Data++)
105
+ for (;Size>0 && ((size_t )Data & 15 )!= 0 ;Size--,Data++)
70
106
StartCRC=crc_tables[0 ][(byte)(StartCRC^Data[0 ])]^(StartCRC>>8 );
71
107
72
- for (;Size>=8 ;Size-=8 ,Data+=8 )
108
+ // 2023.12.06: We switched to slicing-by-16, which seems to be faster than
109
+ // slicing-by-8 on modern CPUs. Slicing-by-32 would require 32 KB for tables
110
+ // and could be limited by L1 cache size on some CPUs.
111
+ for (;Size>=16 ;Size-=16 ,Data+=16 )
73
112
{
74
113
#ifdef BIG_ENDIAN
75
- StartCRC ^= Data[0 ]|(Data[1 ] << 8 )|(Data[2 ] << 16 )|(Data[3 ] << 24 );
76
- uint NextData = Data[4 ]|(Data[5 ] << 8 )|(Data[6 ] << 16 )|(Data[7 ] << 24 );
114
+ StartCRC ^= RawGet4 (Data);
115
+ uint D1 = RawGet4 (Data+4 );
116
+ uint D2 = RawGet4 (Data+8 );
117
+ uint D3 = RawGet4 (Data+12 );
77
118
#else
119
+ // We avoid RawGet4 here for performance reasons, to access uint32
120
+ // directly even if ALLOW_MISALIGNED isn't defined. We can do it,
121
+ // because we aligned 'Data' above.
78
122
StartCRC ^= *(uint32 *) Data;
79
- uint NextData = *(uint32 *) (Data+4 );
123
+ uint D1 = *(uint32 *) (Data+4 );
124
+ uint D2 = *(uint32 *) (Data+8 );
125
+ uint D3 = *(uint32 *) (Data+12 );
80
126
#endif
81
- StartCRC = crc_tables[7 ][(byte) StartCRC ] ^
82
- crc_tables[6 ][(byte)(StartCRC >> 8 ) ] ^
83
- crc_tables[5 ][(byte)(StartCRC >> 16 )] ^
84
- crc_tables[4 ][(byte)(StartCRC >> 24 )] ^
85
- crc_tables[3 ][(byte) NextData ] ^
86
- crc_tables[2 ][(byte)(NextData >> 8 ) ] ^
87
- crc_tables[1 ][(byte)(NextData >> 16 )] ^
88
- crc_tables[0 ][(byte)(NextData >> 24 )];
127
+ StartCRC = crc_tables[15 ][(byte) StartCRC ] ^
128
+ crc_tables[14 ][(byte)(StartCRC >> 8 ) ] ^
129
+ crc_tables[13 ][(byte)(StartCRC >> 16 )] ^
130
+ crc_tables[12 ][(byte)(StartCRC >> 24 )] ^
131
+ crc_tables[11 ][(byte) D1 ] ^
132
+ crc_tables[10 ][(byte)(D1 >> 8 ) ] ^
133
+ crc_tables[ 9 ][(byte)(D1 >> 16 )] ^
134
+ crc_tables[ 8 ][(byte)(D1 >> 24 )] ^
135
+ crc_tables[ 7 ][(byte) D2 ] ^
136
+ crc_tables[ 6 ][(byte)(D2 >> 8 )] ^
137
+ crc_tables[ 5 ][(byte)(D2 >> 16 )] ^
138
+ crc_tables[ 4 ][(byte)(D2 >> 24 )] ^
139
+ crc_tables[ 3 ][(byte) D3 ] ^
140
+ crc_tables[ 2 ][(byte)(D3 >> 8 )] ^
141
+ crc_tables[ 1 ][(byte)(D3 >> 16 )] ^
142
+ crc_tables[ 0 ][(byte)(D3 >> 24 )];
89
143
}
90
144
#endif
91
145
@@ -111,74 +165,6 @@ ushort Checksum14(ushort StartCRC,const void *Addr,size_t Size)
111
165
#endif
112
166
113
167
114
- #if 0
115
- static uint64 crc64_tables[8][256]; // Tables for Slicing-by-8 for CRC64.
116
-
117
- void InitCRC64(uint64 *CRCTab)
118
- {
119
- const uint64 poly=INT32TO64(0xC96C5795, 0xD7870F42); // 0xC96C5795D7870F42;
120
- for (uint I=0;I<256;I++)
121
- {
122
- uint64 C=I;
123
- for (uint J=0;J<8;J++)
124
- C=(C & 1) ? (C>>1)^poly: (C>>1);
125
- CRCTab[I]=C;
126
- }
127
- }
128
-
129
-
130
- static void InitTables64()
131
- {
132
- InitCRC64(crc64_tables[0]);
133
-
134
- for (uint I=0;I<256;I++) // Build additional lookup tables.
135
- {
136
- uint64 C=crc64_tables[0][I];
137
- for (uint J=1;J<8;J++)
138
- {
139
- C=crc64_tables[0][(byte)C]^(C>>8);
140
- crc64_tables[J][I]=C;
141
- }
142
- }
143
- }
144
-
145
-
146
- // We cannot place the initialization in CRC64(), because we use this function
147
- // in multithreaded mode and it conflicts with multithreading.
148
- struct CallInitCRC64 {CallInitCRC64() {InitTables64();}} static CallInit64;
149
-
150
- uint64 CRC64(uint64 StartCRC,const void *Addr,size_t Size)
151
- {
152
- byte *Data=(byte *)Addr;
153
-
154
- // Align Data to 8 for better performance.
155
- for (;Size>0 && ((size_t)Data & 7)!=0;Size--,Data++)
156
- StartCRC=crc64_tables[0][(byte)(StartCRC^Data[0])]^(StartCRC>>8);
157
-
158
- for (byte *DataEnd=Data+Size/8*8; Data<DataEnd; Data+=8 )
159
- {
160
- uint64 Index=StartCRC;
161
- #ifdef BIG_ENDIAN
162
- Index ^= (uint64(Data[0])|(uint64(Data[1])<<8)|(uint64(Data[2])<<16)|(uint64(Data[3])<<24))|
163
- (uint64(Data[4])<<32)|(uint64(Data[5])<<40)|(uint64(Data[6])<<48)|(uint64(Data[7])<<56);
164
- #else
165
- Index ^= *(uint64 *)Data;
166
- #endif
167
- StartCRC = crc64_tables[ 7 ] [ ( byte ) (Index ) ] ^
168
- crc64_tables[ 6 ] [ ( byte ) (Index >> 8 ) ] ^
169
- crc64_tables[ 5 ] [ ( byte ) (Index >> 16 ) ] ^
170
- crc64_tables[ 4 ] [ ( byte ) (Index >> 24 ) ] ^
171
- crc64_tables[ 3 ] [ ( byte ) (Index >> 32 ) ] ^
172
- crc64_tables[ 2 ] [ ( byte ) (Index >> 40 ) ] ^
173
- crc64_tables[ 1 ] [ ( byte ) (Index >> 48 ) ] ^
174
- crc64_tables[ 0 ] [ ( byte ) (Index >> 56 ) ] ;
175
- }
176
-
177
- for (Size%=8;Size>0;Size--,Data++) // Process left data.
178
- StartCRC=crc64_tables[0][(byte)(StartCRC^Data[0])]^(StartCRC>>8);
179
-
180
- return StartCRC;
181
- }
182
168
183
169
184
170
#if 0
@@ -187,6 +173,11 @@ struct TestCRCStruct {TestCRCStruct() {TestCRC();exit(0);}} GlobalTesCRC;
187
173
188
174
void TestCRC()
189
175
{
176
+ // This function is invoked from a global object and _SSE_Version is global
177
+ // and can be initialized after this function. So we explicitly initialize
178
+ // it here to enable SSE support in Blake2sp.
179
+ _SSE_Version=GetSSEVersion();
180
+
190
181
const uint FirstSize=300;
191
182
byte b[FirstSize];
192
183
@@ -252,23 +243,38 @@ void TestCRC()
252
243
253
244
const size_t BufSize=0x100000;
254
245
byte *Buf=new byte[BufSize];
255
- memset (Buf,0 ,BufSize);
246
+ GetRnd (Buf,BufSize);
256
247
257
248
clock_t StartTime=clock();
258
249
r32=0xffffffff;
259
- const uint BufCount=5000;
250
+ const uint64 BufCount=5000;
260
251
for (uint I=0;I<BufCount;I++)
261
252
r32=CRC32(r32,Buf,BufSize);
262
253
if (r32!=0) // Otherwise compiler optimizer removes CRC calculation.
263
- mprintf(L"\nCRC32 speed: %d MB/s",BufCount*1000/(clock()-StartTime));
254
+ mprintf(L"\nCRC32 speed: %llu MB/s",BufCount*CLOCKS_PER_SEC/(clock()-StartTime));
255
+
256
+ StartTime=clock();
257
+ DataHash Hash;
258
+ Hash.Init(HASH_CRC32,MaxPoolThreads);
259
+ const uint64 BufCountMT=20000;
260
+ for (uint I=0;I<BufCountMT;I++)
261
+ Hash.Update(Buf,BufSize);
262
+ HashValue Result;
263
+ Hash.Result(&Result);
264
+ mprintf(L"\nCRC32 MT speed: %llu MB/s",BufCountMT*CLOCKS_PER_SEC/(clock()-StartTime));
265
+
266
+ StartTime=clock();
267
+ Hash.Init(HASH_BLAKE2,MaxPoolThreads);
268
+ for (uint I=0;I<BufCount;I++)
269
+ Hash.Update(Buf,BufSize);
270
+ Hash.Result(&Result);
271
+ mprintf(L"\nBlake2sp speed: %llu MB/s",BufCount*CLOCKS_PER_SEC/(clock()-StartTime));
264
272
265
273
StartTime=clock();
266
274
r64=0xffffffffffffffff;
267
275
for (uint I=0;I<BufCount;I++)
268
276
r64=CRC64(r64,Buf,BufSize);
269
277
if (r64!=0) // Otherwise compiler optimizer removes CRC calculation.
270
- mprintf(L"\nCRC64 speed: %d MB/s",BufCount*1000 /(clock()-StartTime));
278
+ mprintf(L"\nCRC64 speed: %llu MB/s",BufCount*CLOCKS_PER_SEC /(clock()-StartTime));
271
279
}
272
280
#endif
273
-
274
- #endif
0 commit comments