%include "reg_sizes.asm"

- %define fetch_dist 1024
+ %ifndef fetch_dist
+ %define fetch_dist 4096
+ %endif
+
+ %ifndef PREFETCH
+ %define PREFETCH prefetcht1
+ %endif

[bits 64]
default rel
@@ -135,7 +141,7 @@ crc32_gzip_refl_by8_02:
; fold 128B at a time. This section of the code folds 8 xmm registers in parallel
.fold_128_B_loop:
add arg2, 128
- prefetchnta [arg2+fetch_dist+0]
+ PREFETCH [arg2+fetch_dist+0]
vmovdqu xmm9, [arg2+16*0]
vmovdqu xmm12, [arg2+16*1]
vpclmulqdq xmm8, xmm0, xmm10, 0x10
@@ -147,7 +153,7 @@ crc32_gzip_refl_by8_02:
vpxor xmm1, xmm12
vxorps xmm1, xmm13

- prefetchnta [arg2+fetch_dist+32]
+ PREFETCH [arg2+fetch_dist+32]
vmovdqu xmm9, [arg2+16*2]
vmovdqu xmm12, [arg2+16*3]
vpclmulqdq xmm8, xmm2, xmm10, 0x10
@@ -159,7 +165,7 @@ crc32_gzip_refl_by8_02:
vpxor xmm3, xmm12
vxorps xmm3, xmm13

- prefetchnta [arg2+fetch_dist+64]
+ PREFETCH [arg2+fetch_dist+64]
vmovdqu xmm9, [arg2+16*4]
vmovdqu xmm12, [arg2+16*5]
vpclmulqdq xmm8, xmm4, xmm10, 0x10
@@ -171,7 +177,7 @@ crc32_gzip_refl_by8_02:
vpxor xmm5, xmm12
vxorps xmm5, xmm13

- prefetchnta [arg2+fetch_dist+96]
+ PREFETCH [arg2+fetch_dist+96]
vmovdqu xmm9, [arg2+16*6]
vmovdqu xmm12, [arg2+16*7]
vpclmulqdq xmm8, xmm6, xmm10, 0x10
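
Note: because both knobs are wrapped in %ifndef guards, the prefetch distance and the prefetch instruction (prefetcht1 by default, replacing the old prefetchnta) can be overridden at assembly time without editing the source. A minimal sketch of such an override, assuming an ELF64 build and this file name:

    nasm -f elf64 -Dfetch_dist=2048 -DPREFETCH=prefetcht0 crc32_gzip_refl_by8_02.asm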