@@ -84,15 +84,19 @@
 
 #define lookup_32bit(src, dst, op1, op2, op3, interleave_op, il_reg) \
 	movzbl src ## bh, RID1d; \
+	leaq s1(%rip), RID2; \
+	movl (RID2,RID1,4), dst ## d; \
 	movzbl src ## bl, RID2d; \
+	leaq s2(%rip), RID1; \
+	op1 (RID1,RID2,4), dst ## d; \
 	shrq $16, src; \
-	movl s1(, RID1, 4), dst ## d; \
-	op1 s2(, RID2, 4), dst ## d; \
 	movzbl src ## bh, RID1d; \
+	leaq s3(%rip), RID2; \
+	op2 (RID2,RID1,4), dst ## d; \
 	movzbl src ## bl, RID2d; \
 	interleave_op(il_reg); \
-	op2 s3(, RID1, 4), dst ## d; \
-	op3 s4(, RID2, 4), dst ## d;
+	leaq s4(%rip), RID1; \
+	op3 (RID1,RID2,4), dst ## d;
 
 #define dummy(d) /* do nothing */
 
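Note on the lookup_32bit hunk above: x86-64 has no addressing mode that combines a RIP-relative base with an index register, so each S-box reference is split into a leaq that materializes the table address and an indexed access through that register. Below is a minimal standalone sketch of the same pattern, not the kernel macro itself; the table name my_table and the function name lookup32 are invented for illustration (GNU as syntax, builds with gcc -c).

/* rip_lookup_sketch.S -- illustration only; my_table and lookup32 are made up. */
	.section .rodata
	.align 4
my_table:
	.long 0x11, 0x22, 0x33, 0x44

	.text
	.globl lookup32
	.type lookup32, @function
/* uint32_t lookup32(unsigned idx): idx arrives in %edi (SysV ABI), idx assumed < 4. */
lookup32:
	movl	%edi, %ecx		/* 32-bit move zero-extends idx into %rcx */
	/* Absolute, non-PIE form would be: movl my_table(, %rcx, 4), %eax */
	leaq	my_table(%rip), %rdx	/* materialize the table base RIP-relatively */
	movl	(%rdx, %rcx, 4), %eax	/* then index off the register base */
	ret
	.size	lookup32, .-lookup32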
@@ -151,15 +155,15 @@
 	subround(l ## 3, r ## 3, l ## 4, r ## 4, f);
 
 #define enc_preload_rkr() \
-	vbroadcastss .L16_mask, RKR; \
+	vbroadcastss .L16_mask(%rip), RKR; \
 	/* add 16-bit rotation to key rotations (mod 32) */ \
 	vpxor kr(CTX), RKR, RKR;
 
 #define dec_preload_rkr() \
-	vbroadcastss .L16_mask, RKR; \
+	vbroadcastss .L16_mask(%rip), RKR; \
 	/* add 16-bit rotation to key rotations (mod 32) */ \
 	vpxor kr(CTX), RKR, RKR; \
-	vpshufb .Lbswap128_mask, RKR, RKR;
+	vpshufb .Lbswap128_mask(%rip), RKR, RKR;
 
 #define transpose_2x4(x0, x1, t0, t1) \
 	vpunpckldq x1, x0, t0; \
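For the constant loads in enc_preload_rkr/dec_preload_rkr (and the vmovdqa/vmovd sites in the hunks below), no index register is involved, so the RIP-relative reference can sit directly in the instruction's memory operand and no extra leaq is needed. A hedged standalone sketch under the same assumptions; the label my_bswap_mask and the function load_mask are invented (GNU as + AVX):

/* rip_const_sketch.S -- illustration only; my_bswap_mask and load_mask are made up. */
	.section .rodata
	.align 16			/* vmovdqa requires a 16-byte aligned source */
my_bswap_mask:
	.byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12

	.text
	.globl load_mask
	.type load_mask, @function
/* Loads the 16-byte constant into %xmm0 via a RIP-relative memory operand. */
load_mask:
	vmovdqa	my_bswap_mask(%rip), %xmm0
	ret
	.size	load_mask, .-load_mask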
@@ -235,9 +239,9 @@ SYM_FUNC_START_LOCAL(__cast5_enc_blk16)
 
 	movq %rdi, CTX;
 
-	vmovdqa .Lbswap_mask, RKM;
-	vmovd .Lfirst_mask, R1ST;
-	vmovd .L32_mask, R32;
+	vmovdqa .Lbswap_mask(%rip), RKM;
+	vmovd .Lfirst_mask(%rip), R1ST;
+	vmovd .L32_mask(%rip), R32;
 	enc_preload_rkr();
 
 	inpack_blocks(RL1, RR1, RTMP, RX, RKM);
@@ -271,7 +275,7 @@ SYM_FUNC_START_LOCAL(__cast5_enc_blk16)
 	popq %rbx;
 	popq %r15;
 
-	vmovdqa .Lbswap_mask, RKM;
+	vmovdqa .Lbswap_mask(%rip), RKM;
 
 	outunpack_blocks(RR1, RL1, RTMP, RX, RKM);
 	outunpack_blocks(RR2, RL2, RTMP, RX, RKM);
@@ -308,9 +312,9 @@ SYM_FUNC_START_LOCAL(__cast5_dec_blk16)
 
	movq %rdi, CTX;
 
-	vmovdqa .Lbswap_mask, RKM;
-	vmovd .Lfirst_mask, R1ST;
-	vmovd .L32_mask, R32;
+	vmovdqa .Lbswap_mask(%rip), RKM;
+	vmovd .Lfirst_mask(%rip), R1ST;
+	vmovd .L32_mask(%rip), R32;
 	dec_preload_rkr();
 
 	inpack_blocks(RL1, RR1, RTMP, RX, RKM);
@@ -341,7 +345,7 @@ SYM_FUNC_START_LOCAL(__cast5_dec_blk16)
 	round(RL, RR, 1, 2);
 	round(RR, RL, 0, 1);
 
-	vmovdqa .Lbswap_mask, RKM;
+	vmovdqa .Lbswap_mask(%rip), RKM;
 	popq %rbx;
 	popq %r15;
 
@@ -504,8 +508,8 @@ SYM_FUNC_START(cast5_ctr_16way)
 
 	vpcmpeqd RKR, RKR, RKR;
 	vpaddq RKR, RKR, RKR; /* low: -2, high: -2 */
-	vmovdqa .Lbswap_iv_mask, R1ST;
-	vmovdqa .Lbswap128_mask, RKM;
+	vmovdqa .Lbswap_iv_mask(%rip), R1ST;
+	vmovdqa .Lbswap128_mask(%rip), RKM;
 
 	/* load IV and byteswap */
 	vmovq (%rcx), RX;