Commit 0dcc778

ardbiesheuvel authored and herbertx committed
crypto: x86/cast5 - Use RIP-relative addressing
Prefer RIP-relative addressing where possible, which removes the need
for boot time relocation fixups.

Co-developed-by: Thomas Garnier <[email protected]>
Signed-off-by: Thomas Garnier <[email protected]>
Signed-off-by: Ard Biesheuvel <[email protected]>
Signed-off-by: Herbert Xu <[email protected]>
1 parent 24ff1e9 commit 0dcc778
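The mechanical change is the same throughout the file: loads that used a bare symbol as an absolute 32-bit displacement now address the symbol relative to the instruction pointer. A minimal before/after sketch, with instructions taken from the hunks below (RKM is a register alias defined elsewhere in this file):

	/* before: absolute displacement; the address of .Lbswap_mask is
	 * baked into the instruction and must be patched by the boot-time
	 * relocation pass when the kernel is loaded at a different address */
	vmovdqa .Lbswap_mask, RKM;

	/* after: the displacement is an offset from the next instruction,
	 * so the load works unmodified wherever the kernel lands */
	vmovdqa .Lbswap_mask(%rip), RKM;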

File tree

1 file changed

+21 -17 lines changed

arch/x86/crypto/cast5-avx-x86_64-asm_64.S

Lines changed: 21 additions & 17 deletions
@@ -84,15 +84,19 @@
 
 #define lookup_32bit(src, dst, op1, op2, op3, interleave_op, il_reg) \
 	movzbl src ## bh, RID1d; \
+	leaq s1(%rip), RID2; \
+	movl (RID2,RID1,4), dst ## d; \
 	movzbl src ## bl, RID2d; \
+	leaq s2(%rip), RID1; \
+	op1 (RID1,RID2,4), dst ## d; \
 	shrq $16, src; \
-	movl s1(, RID1, 4), dst ## d; \
-	op1 s2(, RID2, 4), dst ## d; \
 	movzbl src ## bh, RID1d; \
+	leaq s3(%rip), RID2; \
+	op2 (RID2,RID1,4), dst ## d; \
 	movzbl src ## bl, RID2d; \
 	interleave_op(il_reg); \
-	op2 s3(, RID1, 4), dst ## d; \
-	op3 s4(, RID2, 4), dst ## d;
+	leaq s4(%rip), RID1; \
+	op3 (RID1,RID2,4), dst ## d;
 
 #define dummy(d) /* do nothing */
 
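One step in this hunk deserves a note: x86-64 cannot combine an index register with %rip-relative addressing, so the indexed s-box loads could not simply gain a (%rip) suffix. Each lookup instead materializes the table base with leaq and then indexes off that register, which is why the macro grows from 15 to 19 lines. A sketch of the constraint, using the file's RID1/RID2 register aliases:

	/* not encodable: %rip cannot be paired with an index register
	 *	movl s1(%rip, RID1, 4), %eax
	 */

	/* instead: take the table address PC-relatively, then index it */
	leaq s1(%rip), RID2;		/* RID2 = &s1, no relocation fixup needed */
	movl (RID2, RID1, 4), %eax;	/* %eax = s1[RID1], 4-byte entries */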
@@ -151,15 +155,15 @@
 	subround(l ## 3, r ## 3, l ## 4, r ## 4, f);
 
 #define enc_preload_rkr() \
-	vbroadcastss .L16_mask, RKR; \
+	vbroadcastss .L16_mask(%rip), RKR; \
 	/* add 16-bit rotation to key rotations (mod 32) */ \
 	vpxor kr(CTX), RKR, RKR;
 
 #define dec_preload_rkr() \
-	vbroadcastss .L16_mask, RKR; \
+	vbroadcastss .L16_mask(%rip), RKR; \
 	/* add 16-bit rotation to key rotations (mod 32) */ \
 	vpxor kr(CTX), RKR, RKR; \
-	vpshufb .Lbswap128_mask, RKR, RKR;
+	vpshufb .Lbswap128_mask(%rip), RKR, RKR;
 
 #define transpose_2x4(x0, x1, t0, t1) \
 	vpunpckldq x1, x0, t0; \
@@ -235,9 +239,9 @@ SYM_FUNC_START_LOCAL(__cast5_enc_blk16)
 
 	movq %rdi, CTX;
 
-	vmovdqa .Lbswap_mask, RKM;
-	vmovd .Lfirst_mask, R1ST;
-	vmovd .L32_mask, R32;
+	vmovdqa .Lbswap_mask(%rip), RKM;
+	vmovd .Lfirst_mask(%rip), R1ST;
+	vmovd .L32_mask(%rip), R32;
 	enc_preload_rkr();
 
 	inpack_blocks(RL1, RR1, RTMP, RX, RKM);
@@ -271,7 +275,7 @@ SYM_FUNC_START_LOCAL(__cast5_enc_blk16)
 	popq %rbx;
 	popq %r15;
 
-	vmovdqa .Lbswap_mask, RKM;
+	vmovdqa .Lbswap_mask(%rip), RKM;
 
 	outunpack_blocks(RR1, RL1, RTMP, RX, RKM);
 	outunpack_blocks(RR2, RL2, RTMP, RX, RKM);
@@ -308,9 +312,9 @@ SYM_FUNC_START_LOCAL(__cast5_dec_blk16)
 
 	movq %rdi, CTX;
 
-	vmovdqa .Lbswap_mask, RKM;
-	vmovd .Lfirst_mask, R1ST;
-	vmovd .L32_mask, R32;
+	vmovdqa .Lbswap_mask(%rip), RKM;
+	vmovd .Lfirst_mask(%rip), R1ST;
+	vmovd .L32_mask(%rip), R32;
 	dec_preload_rkr();
 
 	inpack_blocks(RL1, RR1, RTMP, RX, RKM);
@@ -341,7 +345,7 @@ SYM_FUNC_START_LOCAL(__cast5_dec_blk16)
 	round(RL, RR, 1, 2);
 	round(RR, RL, 0, 1);
 
-	vmovdqa .Lbswap_mask, RKM;
+	vmovdqa .Lbswap_mask(%rip), RKM;
 	popq %rbx;
 	popq %r15;
 
@@ -504,8 +508,8 @@ SYM_FUNC_START(cast5_ctr_16way)
 
 	vpcmpeqd RKR, RKR, RKR;
 	vpaddq RKR, RKR, RKR; /* low: -2, high: -2 */
-	vmovdqa .Lbswap_iv_mask, R1ST;
-	vmovdqa .Lbswap128_mask, RKM;
+	vmovdqa .Lbswap_iv_mask(%rip), R1ST;
+	vmovdqa .Lbswap128_mask(%rip), RKM;
 
 	/* load IV and byteswap */
 	vmovq (%rcx), RX;
