@@ -59,6 +59,10 @@ blake3_hash_many_avx512:
5959 sub rsp , 144
6060 and rsp , 0xFFFFFFFFFFFFFFC0
6161 neg r9
/* _ILP32 build only: a write to a 32-bit register clears bits 63:32 of the
   full 64-bit register, so these self-moves zero-extend esi and edx into
   rsi and rdx before the 64-bit uses of those registers further down.
   NOTE(review): presumably esi/edx carry 32-bit size_t arguments whose upper
   halves are not guaranteed clean on entry -- confirm against the C
   prototype and the x32 calling convention. */
62+ #ifdef _ILP32
63+ mov esi , esi
64+ mov edx , edx
65+ #endif
6266 kmovw k1 , r9d
6367 vmovd xmm0 , r8d
6468 vpbroadcastd ymm0 , xmm0
@@ -107,6 +111,7 @@ blake3_hash_many_avx512:
107111 cmp rdx , qword ptr [ rsp + 0x80 ]
108112 cmove eax , ebx
109113 mov dword ptr [ rsp + 0x88 ], eax
/* Fetch table entries from [rdi] into r8-r15. The values are used as base
   addresses (indexed by rdx) by the vmovdqu32 / vinserti64x4 loads that
   follow. Default build: each entry is read as a qword. */
114+ #ifndef _ILP32
110115 mov r8 , qword ptr [ rdi ]
111116 mov r9 , qword ptr [ rdi + 0x8 ]
112117 mov r10 , qword ptr [ rdi + 0x10 ]
@@ -115,6 +120,16 @@ blake3_hash_many_avx512:
115120 mov r13 , qword ptr [ rdi + 0x48 ]
116121 mov r14 , qword ptr [ rdi + 0x50 ]
117122 mov r15 , qword ptr [ rdi + 0x58 ]
123+ #else
/* _ILP32: entries are read as dwords at a 4-byte stride, i.e. entries 0-3
   into r8-r11 and entries 8-11 into r12-r15. Writing the 32-bit register
   names zero-extends each value into the full 64-bit register used for
   addressing below. */
124+ mov r8d , dword ptr [ rdi ]
125+ mov r9d , dword ptr [ rdi + 0x4 ]
126+ mov r10d , dword ptr [ rdi + 0x8 ]
127+ mov r11d , dword ptr [ rdi + 0xc ]
128+ mov r12d , dword ptr [ rdi + 0x20 ]
129+ mov r13d , dword ptr [ rdi + 0x24 ]
130+ mov r14d , dword ptr [ rdi + 0x28 ]
131+ mov r15d , dword ptr [ rdi + 0x2c ]
132+ #endif
118133 vmovdqu32 ymm16 , ymmword ptr [ rdx + r8 - 0x2 * 0x20 ]
119134 vinserti64x4 zmm16 , zmm16 , ymmword ptr [ rdx + r12 - 0x2 * 0x20 ], 0x01
120135 vmovdqu32 ymm17 , ymmword ptr [ rdx + r9 - 0x2 * 0x20 ]
@@ -127,6 +142,7 @@ blake3_hash_many_avx512:
127142 vinserti64x4 zmm19 , zmm19 , ymmword ptr [ rdx + r15 - 0x2 * 0x20 ], 0x01
128143 vpunpcklqdq zmm10 , zmm18 , zmm19
129144 vpunpckhqdq zmm11 , zmm18 , zmm19
/* Reload r8-r15 from the table at [rdi], this time with the entries that
   start at byte offset 0x20 (qword entries in the default build). The values
   are again used as base addresses by the vector loads that follow. */
145+ #ifndef _ILP32
130146 mov r8 , qword ptr [ rdi + 0x20 ]
131147 mov r9 , qword ptr [ rdi + 0x28 ]
132148 mov r10 , qword ptr [ rdi + 0x30 ]
@@ -135,6 +151,16 @@ blake3_hash_many_avx512:
135151 mov r13 , qword ptr [ rdi + 0x68 ]
136152 mov r14 , qword ptr [ rdi + 0x70 ]
137153 mov r15 , qword ptr [ rdi + 0x78 ]
154+ #else
/* _ILP32: dword entries at a 4-byte stride -- entries 4-7 into r8-r11 and
   entries 12-15 into r12-r15; the 32-bit destination zero-extends into the
   64-bit register. */
155+ mov r8d , dword ptr [ rdi + 0x10 ]
156+ mov r9d , dword ptr [ rdi + 0x14 ]
157+ mov r10d , dword ptr [ rdi + 0x18 ]
158+ mov r11d , dword ptr [ rdi + 0x1c ]
159+ mov r12d , dword ptr [ rdi + 0x30 ]
160+ mov r13d , dword ptr [ rdi + 0x34 ]
161+ mov r14d , dword ptr [ rdi + 0x38 ]
162+ mov r15d , dword ptr [ rdi + 0x3c ]
163+ #endif
138164 vmovdqu32 ymm16 , ymmword ptr [ rdx + r8 - 0x2 * 0x20 ]
139165 vinserti64x4 zmm16 , zmm16 , ymmword ptr [ rdx + r12 - 0x2 * 0x20 ], 0x01
140166 vmovdqu32 ymm17 , ymmword ptr [ rdx + r9 - 0x2 * 0x20 ]
@@ -169,6 +195,7 @@ blake3_hash_many_avx512:
169195 vmovdqa32 zmm23 , zmm19
170196 vpermt2d zmm19 , zmm27 , zmm8
171197 vpermt2d zmm23 , zmm31 , zmm8
/* Fetch table entries from [rdi] into r8-r15 again; the vector loads that
   follow read at [reg + rdx - 0x1 * 0x20]. Default build: qword entries. */
198+ #ifndef _ILP32
172199 mov r8 , qword ptr [ rdi ]
173200 mov r9 , qword ptr [ rdi + 0x8 ]
174201 mov r10 , qword ptr [ rdi + 0x10 ]
@@ -177,6 +204,16 @@ blake3_hash_many_avx512:
177204 mov r13 , qword ptr [ rdi + 0x48 ]
178205 mov r14 , qword ptr [ rdi + 0x50 ]
179206 mov r15 , qword ptr [ rdi + 0x58 ]
207+ #else
/* _ILP32: dword entries at a 4-byte stride -- entries 0-3 into r8-r11 and
   entries 8-11 into r12-r15, zero-extended by the 32-bit register write. */
208+ mov r8d , dword ptr [ rdi ]
209+ mov r9d , dword ptr [ rdi + 0x4 ]
210+ mov r10d , dword ptr [ rdi + 0x8 ]
211+ mov r11d , dword ptr [ rdi + 0xc ]
212+ mov r12d , dword ptr [ rdi + 0x20 ]
213+ mov r13d , dword ptr [ rdi + 0x24 ]
214+ mov r14d , dword ptr [ rdi + 0x28 ]
215+ mov r15d , dword ptr [ rdi + 0x2c ]
216+ #endif
180217 vmovdqu32 ymm24 , ymmword ptr [ r8 + rdx - 0x1 * 0x20 ]
181218 vinserti64x4 zmm24 , zmm24 , ymmword ptr [ r12 + rdx - 0x1 * 0x20 ], 0x01
182219 vmovdqu32 ymm25 , ymmword ptr [ r9 + rdx - 0x1 * 0x20 ]
@@ -197,6 +234,7 @@ blake3_hash_many_avx512:
197234 prefetcht0 [ r14 + rdx + 0x80 ]
198235 prefetcht0 [ r11 + rdx + 0x80 ]
199236 prefetcht0 [ r15 + rdx + 0x80 ]
/* Reload r8-r15 with the table entries that start at byte offset 0x20 of
   [rdi] (qword entries in the default build); the vector loads that follow
   read at [reg + rdx - 0x1 * 0x20]. */
237+ #ifndef _ILP32
200238 mov r8 , qword ptr [ rdi + 0x20 ]
201239 mov r9 , qword ptr [ rdi + 0x28 ]
202240 mov r10 , qword ptr [ rdi + 0x30 ]
@@ -205,6 +243,16 @@ blake3_hash_many_avx512:
205243 mov r13 , qword ptr [ rdi + 0x68 ]
206244 mov r14 , qword ptr [ rdi + 0x70 ]
207245 mov r15 , qword ptr [ rdi + 0x78 ]
246+ #else
/* _ILP32: dword entries at a 4-byte stride -- entries 4-7 into r8-r11 and
   entries 12-15 into r12-r15, zero-extended by the 32-bit register write. */
247+ mov r8d , dword ptr [ rdi + 0x10 ]
248+ mov r9d , dword ptr [ rdi + 0x14 ]
249+ mov r10d , dword ptr [ rdi + 0x18 ]
250+ mov r11d , dword ptr [ rdi + 0x1c ]
251+ mov r12d , dword ptr [ rdi + 0x30 ]
252+ mov r13d , dword ptr [ rdi + 0x34 ]
253+ mov r14d , dword ptr [ rdi + 0x38 ]
254+ mov r15d , dword ptr [ rdi + 0x3c ]
255+ #endif
208256 vmovdqu32 ymm24 , ymmword ptr [ r8 + rdx - 0x1 * 0x20 ]
209257 vinserti64x4 zmm24 , zmm24 , ymmword ptr [ r12 + rdx - 0x1 * 0x20 ], 0x01
210258 vmovdqu32 ymm25 , ymmword ptr [ r9 + rdx - 0x1 * 0x20 ]
@@ -1095,7 +1143,11 @@ blake3_hash_many_avx512:
10951143 vpaddd zmm1 {k2} , zmm1 , dword ptr [ ADD1 + rip ] {1to16}
10961144 vmovdqa32 zmmword ptr [ rsp ], zmm2
10971145 vmovdqa32 zmmword ptr [ rsp + 0x1 * 0x40 ], zmm1
/* Step rdi past the 16 table entries handled on this path (rsi is reduced by
   16 just below): 16 x 8 bytes in the default build, 16 x 4 bytes under
   _ILP32 where the entries are read as dwords. */
1146+ #ifndef _ILP32
10981147 add rdi , 128
1148+ #else
1149+ add rdi , 64
1150+ #endif
10991151 add rbx , 512
11001152 mov qword ptr [ rbp + 0x50 ], rbx
11011153 sub rsi , 16
@@ -1125,6 +1177,7 @@ blake3_hash_many_avx512:
11251177 vpbroadcastd ymm5 , dword ptr [ rcx + 0x14 ]
11261178 vpbroadcastd ymm6 , dword ptr [ rcx + 0x18 ]
11271179 vpbroadcastd ymm7 , dword ptr [ rcx + 0x1C ]
/* Load table entries from [rdi] into r8-r15 (qword entries in the default
   build). NOTE(review): the instructions that consume r8-r15 are outside
   this view; presumably they are base addresses as on the 16-entry path. */
1180+ #ifndef _ILP32
11281181 mov r8 , qword ptr [ rdi ]
11291182 mov r9 , qword ptr [ rdi + 0x8 ]
11301183 mov r10 , qword ptr [ rdi + 0x10 ]
@@ -1133,6 +1186,16 @@ blake3_hash_many_avx512:
11331186 mov r13 , qword ptr [ rdi + 0x28 ]
11341187 mov r14 , qword ptr [ rdi + 0x30 ]
11351188 mov r15 , qword ptr [ rdi + 0x38 ]
1189+ #else
/* _ILP32: eight consecutive dword entries (0-7) at a 4-byte stride; the
   32-bit destination zero-extends into the 64-bit register. */
1190+ mov r8d , dword ptr [ rdi ]
1191+ mov r9d , dword ptr [ rdi + 0x4 ]
1192+ mov r10d , dword ptr [ rdi + 0x8 ]
1193+ mov r11d , dword ptr [ rdi + 0xc ]
1194+ mov r12d , dword ptr [ rdi + 0x10 ]
1195+ mov r13d , dword ptr [ rdi + 0x14 ]
1196+ mov r14d , dword ptr [ rdi + 0x18 ]
1197+ mov r15d , dword ptr [ rdi + 0x1c ]
1198+ #endif
11361199 movzx eax , byte ptr [ rbp + 0x38 ]
11371200 movzx ebx , byte ptr [ rbp + 0x40 ]
11381201 or eax , ebx
@@ -2055,7 +2118,11 @@ blake3_hash_many_avx512:
20552118 vmovdqa ymmword ptr [ rsp + 0x2 * 0x20 ], ymm2
20562119 add rbx , 256
20572120 mov qword ptr [ rbp + 0x50 ], rbx
/* Step rdi past the 8 table entries handled on this path (rsi is reduced by
   8 just below): 8 x 8 bytes in the default build, 8 x 4 bytes under
   _ILP32. */
2121+ #ifndef _ILP32
20582122 add rdi , 64
2123+ #else
2124+ add rdi , 32
2125+ #endif
20592126 sub rsi , 8
206021273 :
20612128 mov rbx , qword ptr [ rbp + 0x50 ]
@@ -2078,10 +2145,17 @@ blake3_hash_many_avx512:
20782145 kmovw k2 , eax
20792146 vpblendmd zmm13 {k2} , zmm13 , zmm12
20802147 vbroadcasti32x4 zmm15 , xmmword ptr [ BLAKE3_IV + rip ]
/* Load table entries 0-3 from [rdi] into r8-r11: qword entries in the
   default build, dword entries (zero-extended by the 32-bit register write)
   under _ILP32. NOTE(review): the consumers of r8-r11 are outside this
   view. */
2148+ #ifndef _ILP32
20812149 mov r8 , qword ptr [ rdi ]
20822150 mov r9 , qword ptr [ rdi + 0x8 ]
20832151 mov r10 , qword ptr [ rdi + 0x10 ]
20842152 mov r11 , qword ptr [ rdi + 0x18 ]
2153+ #else
2154+ mov r8d , dword ptr [ rdi ]
2155+ mov r9d , dword ptr [ rdi + 0x4 ]
2156+ mov r10d , dword ptr [ rdi + 0x8 ]
2157+ mov r11d , dword ptr [ rdi + 0xc ]
2158+ #endif
20852159 mov eax , 43690
20862160 kmovw k3 , eax
20872161 mov eax , 34952
@@ -2195,7 +2269,11 @@ blake3_hash_many_avx512:
21952269 vmovdqa xmmword ptr [ rsp ], xmm0
21962270 vmovdqa xmmword ptr [ rsp + 0x40 ], xmm2
21972271 add rbx , 128
/* Step rdi past the 4 table entries handled on this path (rsi is reduced by
   4 just below): 4 x 8 bytes in the default build, 4 x 4 bytes under
   _ILP32. */
2272+ #ifndef _ILP32
21982273 add rdi , 32
2274+ #else
2275+ add rdi , 16
2276+ #endif
21992277 sub rsi , 4
220022783 :
22012279 test esi , 0x2
@@ -2209,8 +2287,13 @@ blake3_hash_many_avx512:
22092287 vpinsrd xmm14 , xmm14 , dword ptr [ rsp + 0x44 ], 1
22102288 vpinsrd xmm14 , xmm14 , dword ptr [ BLAKE3_BLOCK_LEN + rip ], 2
22112289 vinserti128 ymm13 , ymm13 , xmm14 , 0x01
/* Load table entries 0-1 from [rdi] into r8 and r9: qword entries in the
   default build, dword entries (zero-extended by the 32-bit register write)
   under _ILP32. NOTE(review): the consumers of r8/r9 are outside this
   view. */
2290+ #ifndef _ILP32
22122291 mov r8 , qword ptr [ rdi ]
22132292 mov r9 , qword ptr [ rdi + 0x8 ]
2293+ #else
2294+ mov r8d , dword ptr [ rdi ]
2295+ mov r9d , dword ptr [ rdi + 0x4 ]
2296+ #endif
22142297 movzx eax , byte ptr [ rbp + 0x40 ]
22152298 or eax , r13d
22162299 xor edx , edx
@@ -2308,7 +2391,11 @@ blake3_hash_many_avx512:
23082391 vmovdqa xmmword ptr [ rsp ], xmm0
23092392 vmovdqa xmmword ptr [ rsp + 0x4 * 0x10 ], xmm2
23102393 add rbx , 64
/* Step rdi past the 2 table entries handled on this path (rsi is reduced by
   2 just below): 2 x 8 bytes in the default build, 2 x 4 bytes under
   _ILP32. */
2394+ #ifndef _ILP32
23112395 add rdi , 16
2396+ #else
2397+ add rdi , 8
2398+ #endif
23122399 sub rsi , 2
231324003 :
23142401 test esi , 0x1
@@ -2319,7 +2406,11 @@ blake3_hash_many_avx512:
23192406 vpinsrd xmm14 , xmm14 , dword ptr [ rsp + 0x40 ], 1
23202407 vpinsrd xmm14 , xmm14 , dword ptr [ BLAKE3_BLOCK_LEN + rip ], 2
23212408 vmovdqa xmm15 , xmmword ptr [ BLAKE3_IV + rip ]
/* Load table entry 0 from [rdi] into r8: a qword in the default build, a
   dword (zero-extended by the 32-bit register write) under _ILP32.
   NOTE(review): the consumer of r8 is outside this view. */
2409+ #ifndef _ILP32
23222410 mov r8 , qword ptr [ rdi ]
2411+ #else
2412+ mov r8d , dword ptr [ rdi ]
2413+ #endif
23232414 movzx eax , byte ptr [ rbp + 0x40 ]
23242415 or eax , r13d
23252416 xor edx , edx
0 commit comments