|
51 | 51 | %define FUNCTION_NAME crc32_ieee_by16_10
|
52 | 52 | %endif
|
53 | 53 |
|
; Prefetch tuning defaults. Each symbol is defined here only when it has not
; already been defined before this point, so an earlier %define takes
; precedence over the default below.
;   fetch_dist - byte offset added to arg2 in the PREFETCH operands of the
;                256-byte fold loop (default 1536)
;   PREFETCH   - instruction mnemonic emitted for those prefetches
;                (default prefetcht0)
| 54 | +%ifndef fetch_dist |
| 55 | +%define fetch_dist 1536 |
| 56 | +%endif |
| 57 | + |
| 58 | +%ifndef PREFETCH |
| 59 | +%define PREFETCH prefetcht0 |
| 60 | +%endif |
| 61 | + |
54 | 62 | [bits 64]
|
55 | 63 | default rel
|
56 | 64 |
|
@@ -129,24 +137,28 @@ align 16
|
129 | 137 | .fold_256_B_loop:
|
130 | 138 | add arg2, 256
|
131 | 139 | vmovdqu8 zmm3, [arg2+16*0]
|
| 140 | + PREFETCH [arg2+fetch_dist+0] |
132 | 141 | vpshufb zmm3, zmm3, zmm18
|
133 | 142 | vpclmulqdq zmm1, zmm0, zmm16, 0x00
|
134 | 143 | vpclmulqdq zmm0, zmm0, zmm16, 0x11
|
135 | 144 | vpternlogq zmm0, zmm1, zmm3, 0x96
|
136 | 145 |
|
137 | 146 | vmovdqu8 zmm9, [arg2+16*4]
|
| 147 | + PREFETCH [arg2+fetch_dist+64] |
138 | 148 | vpshufb zmm9, zmm9, zmm18
|
139 | 149 | vpclmulqdq zmm5, zmm4, zmm16, 0x00
|
140 | 150 | vpclmulqdq zmm4, zmm4, zmm16, 0x11
|
141 | 151 | vpternlogq zmm4, zmm5, zmm9, 0x96
|
142 | 152 |
|
143 | 153 | vmovdqu8 zmm11, [arg2+16*8]
|
| 154 | + PREFETCH [arg2+fetch_dist+64*2] |
144 | 155 | vpshufb zmm11, zmm11, zmm18
|
145 | 156 | vpclmulqdq zmm12, zmm7, zmm16, 0x00
|
146 | 157 | vpclmulqdq zmm7, zmm7, zmm16, 0x11
|
147 | 158 | vpternlogq zmm7, zmm12, zmm11, 0x96
|
148 | 159 |
|
149 | 160 | vmovdqu8 zmm17, [arg2+16*12]
|
| 161 | + PREFETCH [arg2+fetch_dist+64*3] |
150 | 162 | vpshufb zmm17, zmm17, zmm18
|
151 | 163 | vpclmulqdq zmm14, zmm8, zmm16, 0x00
|
152 | 164 | vpclmulqdq zmm8, zmm8, zmm16, 0x11
|
|
0 commit comments