Skip to content

Commit 1956f66

Browse files
committed
fix #582: GCC-compatible inline ARM64 ASM
1 parent d6aae1e commit 1956f66

File tree

2 files changed

+15
-31
lines changed

2 files changed

+15
-31
lines changed

constantine/math/arithmetic/assembly/limbs_asm_mul_mont_arm64.nim

Lines changed: 14 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -52,21 +52,18 @@ macro mulMont_CIOS_sparebit_gen[N: static int](
5252

5353
aaSym = ident"aa"
5454
aa = asmArray(aaSym, N, ElemsInReg, asmInputOutput) # used as buffer for final subtraction
55-
mSym = ident"m"
56-
m = asmValue(mSym, Reg, asmOutputEarlyClobber)
5755

5856
uSym = ident"u"
59-
vSym = ident"v"
6057

61-
var # Break dependencies chain
62-
u = asmValue(uSym, Reg, asmOutputEarlyClobber)
63-
v = asmValue(vSym, Reg, asmOutputEarlyClobber)
58+
# Note: We might want to use an extra register to break dependency chains and expose more ILP
59+
# but then we run into GCC limitations https://github.com/mratsim/constantine/issues/582
60+
var u = asmValue(uSym, Reg, asmOutputEarlyClobber)
6461

6562
# Prologue
6663
result.add quote do:
6764
var `tSym`{.noinit, used.}: typeof(`r_PIR`)
68-
var `aSym`{.noinit.}, `biSym`{.noInit.}, `mSym`{.noinit.}: BaseType
69-
var `uSym`{.noinit.}, `vSym`{.noInit.}: BaseType
65+
var `aSym`{.noinit.}, `biSym`{.noInit.}: BaseType
66+
var `uSym`{.noinit.}: BaseType
7067

7168
let `aaSym` {.noinit, used.} = `a_PIR`
7269

@@ -111,24 +108,19 @@ macro mulMont_CIOS_sparebit_gen[N: static int](
111108
template mulloadd_co(ctx, dst, lhs, rhs, addend) {.dirty.} =
112109
ctx.mul u, lhs, rhs
113110
ctx.adds dst, addend, u
114-
swap(u, v)
115111
template mulloadd_cio(ctx, dst, lhs, rhs, addend) {.dirty.} =
116112
ctx.mul u, lhs, rhs
117113
ctx.adcs dst, addend, u
118-
swap(u, v)
119114

120115
template mulhiadd_co(ctx, dst, lhs, rhs, addend) {.dirty.} =
121116
ctx.umulh u, lhs, rhs
122117
ctx.adds dst, addend, u
123-
swap(u, v)
124118
template mulhiadd_cio(ctx, dst, lhs, rhs, addend) {.dirty.} =
125119
ctx.umulh u, lhs, rhs
126120
ctx.adcs dst, addend, u
127-
swap(u, v)
128121
template mulhiadd_ci(ctx, dst, lhs, rhs, addend) {.dirty.} =
129122
ctx.umulh u, lhs, rhs
130123
ctx.adc dst, addend, u
131-
swap(u, v)
132124

133125
doAssert N >= 2
134126

@@ -200,11 +192,14 @@ macro mulMont_CIOS_sparebit_gen[N: static int](
200192
# t[1] = t[2] + (m*M[2]).lo + (m*M[1]).hi
201193
# t[2] = t[3] + (m*M[2]).hi + (m*M[3]).lo
202194
# t[3] = A + carry + (m*M[3]).hi
195+
196+
# Note: we might lose some cycles per iteration if we reuse bi here compared to perfect usage of ILP.
197+
# We accept that cost to work around a GCC limitation, see https://github.com/mratsim/constantine/issues/582
198+
template m: untyped = bi
203199

204200
ctx.mul m, t[0], m0ninv
205201
ctx.mul u, m, M[0]
206202
ctx.cmn t[0], u # TODO: bad latency chain, hopefully done parallel to prev loop
207-
swap(u, v)
208203

209204
for j in 1 ..< N:
210205
ctx.mulloadd_cio(t[j-1], m, M[j], t[j])
@@ -298,34 +293,29 @@ macro sumprodMont_CIOS_spare2bits_gen[N, K: static int](
298293
b = scratch[1].as2dArrayAddr(b_PIR, rows = K, cols = N, memIndirect = memRead) # Store the `b` operand
299294
tN = scratch[2] # High part of extended precision multiplication
300295
A = scratch[3] # Carry during mul step (A)
296+
297+
# Same slot to save registers
301298
bi = scratch[4] # Stores b[i] during mul and u during reduction
302-
m = scratch[5] # Red step: (t[0] * m0ninv) mod 2ʷ
299+
m = scratch[4] # Red step: (t[0] * m0ninv) mod 2ʷ
303300

304-
var # break dependency chains
305-
u = scratch[6]
306-
v = scratch[7]
301+
var u = scratch[5]
307302

308303
template mulloadd_co(ctx, dst, lhs, rhs, addend) {.dirty.} =
309304
ctx.mul u, lhs, rhs
310305
ctx.adds dst, addend, u
311-
swap(u, v)
312306
template mulloadd_cio(ctx, dst, lhs, rhs, addend) {.dirty.} =
313307
ctx.mul u, lhs, rhs
314308
ctx.adcs dst, addend, u
315-
swap(u, v)
316309

317310
template mulhiadd_co(ctx, dst, lhs, rhs, addend) {.dirty.} =
318311
ctx.umulh u, lhs, rhs
319312
ctx.adds dst, addend, u
320-
swap(u, v)
321313
template mulhiadd_cio(ctx, dst, lhs, rhs, addend) {.dirty.} =
322314
ctx.umulh u, lhs, rhs
323315
ctx.adcs dst, addend, u
324-
swap(u, v)
325316
template mulhiadd_ci(ctx, dst, lhs, rhs, addend) {.dirty.} =
326317
ctx.umulh u, lhs, rhs
327318
ctx.adc dst, addend, u
328-
swap(u, v)
329319

330320
result.add quote do:
331321
static: doAssert: sizeof(SecretWord) == sizeof(ByteAddress)
@@ -392,12 +382,12 @@ macro sumprodMont_CIOS_spare2bits_gen[N, K: static int](
392382

393383
# Reduction step
394384
# -------------------------------
385+
# Note: bi and m alias the same scratch slot (scratch[4]), so writing m overwrites bi
395386
ctx.comment " Reduction step"
396387

397388
ctx.mul m, t[0], m0ninv
398389
ctx.mul u, m, M[0]
399390
ctx.cmn t[0], u # TODO: bad latency chain, hopefully done parallel to prev loop
400-
swap(u, v)
401391

402392
for j in 1 ..< N:
403393
ctx.mulloadd_cio(t[j-1], m, M[j], t[j])

constantine/math/arithmetic/assembly/limbs_asm_redc_mont_arm64.nim

Lines changed: 1 addition & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -65,25 +65,21 @@ macro redc2xMont_gen[N: static int](
6565
let m0ninv = v[1]
6666
let m = v[2]
6767
var t0 = v[3]
68-
var t1 = v[4]
68+
# var t1 = v[4] # Disabled: we might lose some cycles compared to perfect ILP, but this works around a GCC limitation, see https://github.com/mratsim/constantine/issues/582
6969

7070
template mulloadd_cio(ctx, dst, lhs, rhs, addend) {.dirty.} =
7171
ctx.mul t0, lhs, rhs
7272
ctx.adcs dst, addend, t0
73-
swap(t0, t1)
7473

7574
template mulhiadd_co(ctx, dst, lhs, rhs, addend) {.dirty.} =
7675
ctx.umulh t0, lhs, rhs
7776
ctx.adds dst, addend, t0
78-
swap(t0, t1)
7977
template mulhiadd_cio(ctx, dst, lhs, rhs, addend) {.dirty.} =
8078
ctx.umulh t0, lhs, rhs
8179
ctx.adcs dst, addend, t0
82-
swap(t0, t1)
8380
template mulhiadd_ci(ctx, dst, lhs, rhs, addend) {.dirty.} =
8481
ctx.umulh t0, lhs, rhs
8582
ctx.adc dst, addend, t0
86-
swap(t0, t1)
8783

8884
# Algorithm
8985
# ---------------------------------------------------------
@@ -109,7 +105,6 @@ macro redc2xMont_gen[N: static int](
109105
ctx.comment "---- Reduction " & $i
110106
ctx.mul t0, m, M[0]
111107
ctx.cmn u[0], t0
112-
swap(t0, t1)
113108
ctx.mov u[N], xzr
114109

115110
for j in 0 ..< N:
@@ -136,7 +131,6 @@ macro redc2xMont_gen[N: static int](
136131
ctx.adc u[i], u[i], t0
137132
else:
138133
ctx.adcs u[i], u[i], t0
139-
swap(t0, t1)
140134

141135
if spareBits >= 2 and lazyReduce:
142136
for i in 0 ..< N:

0 commit comments

Comments
 (0)