@@ -52,21 +52,18 @@ macro mulMont_CIOS_sparebit_gen[N: static int](
5252
5353 aaSym = ident " aa"
5454 aa = asmArray (aaSym, N, ElemsInReg , asmInputOutput) # used as buffer for final subtraction
55- mSym = ident " m"
56- m = asmValue (mSym, Reg , asmOutputEarlyClobber)
5755
5856 uSym = ident " u"
59- vSym = ident " v"
6057
61- var # Break dependencies chain
62- u = asmValue (uSym, Reg , asmOutputEarlyClobber)
63- v = asmValue (vSym , Reg , asmOutputEarlyClobber)
58+ # Note: We might want to use an extra register to break dependency chains and expose more ILP,
59+ # but then we run into GCC limitations: https://github.com/mratsim/constantine/issues/582
60+ var u = asmValue (uSym , Reg , asmOutputEarlyClobber)
6461
6562 # Prologue
6663 result .add quote do :
6764 var `tSym`{.noinit , used .}: typeof (`r_PIR`)
68- var `aSym`{.noinit .}, `biSym`{.noInit .}, `mSym`{. noinit .} : BaseType
69- var `uSym`{.noinit .}, `vSym`{. noInit .} : BaseType
65+ var `aSym`{.noinit .}, `biSym`{.noInit .}: BaseType
66+ var `uSym`{.noinit .}: BaseType
7067
7168 let `aaSym` {.noinit , used .} = `a_PIR`
7269
@@ -111,24 +108,19 @@ macro mulMont_CIOS_sparebit_gen[N: static int](
111108 template mulloadd_co (ctx, dst, lhs, rhs, addend) {.dirty .} =
112109 ctx.mul u, lhs, rhs
113110 ctx.adds dst, addend, u
114- swap (u, v)
115111 template mulloadd_cio (ctx, dst, lhs, rhs, addend) {.dirty .} =
116112 ctx.mul u, lhs, rhs
117113 ctx.adcs dst, addend, u
118- swap (u, v)
119114
120115 template mulhiadd_co (ctx, dst, lhs, rhs, addend) {.dirty .} =
121116 ctx.umulh u, lhs, rhs
122117 ctx.adds dst, addend, u
123- swap (u, v)
124118 template mulhiadd_cio (ctx, dst, lhs, rhs, addend) {.dirty .} =
125119 ctx.umulh u, lhs, rhs
126120 ctx.adcs dst, addend, u
127- swap (u, v)
128121 template mulhiadd_ci (ctx, dst, lhs, rhs, addend) {.dirty .} =
129122 ctx.umulh u, lhs, rhs
130123 ctx.adc dst, addend, u
131- swap (u, v)
132124
133125 doAssert N >= 2
134126
@@ -200,11 +192,14 @@ macro mulMont_CIOS_sparebit_gen[N: static int](
200192 # t[1] = t[2] + (m*M[2]).lo + (m*M[1]).hi
201193 # t[2] = t[3] + (m*M[2]).hi + (m*M[3]).lo
202194 # t[3] = A + carry + (m*M[3]).hi
195+
196+ # Note: reusing bi as m here might cost some cycles per iteration compared to perfect usage of ILP,
197+ # but using an extra register runs into a GCC limitation: https://github.com/mratsim/constantine/issues/582
198+ template m : untyped = bi
203199
204200 ctx.mul m, t[0 ], m0ninv
205201 ctx.mul u, m, M[0 ]
206202 ctx.cmn t[0 ], u # TODO : bad latency chain, hopefully done parallel to prev loop
207- swap (u, v)
208203
209204 for j in 1 ..< N:
210205 ctx.mulloadd_cio (t[j- 1 ], m, M[j], t[j])
@@ -298,34 +293,29 @@ macro sumprodMont_CIOS_spare2bits_gen[N, K: static int](
298293 b = scratch[1 ].as2dArrayAddr (b_PIR, rows = K, cols = N, memIndirect = memRead) # Store the `b` operand
299294 tN = scratch[2 ] # High part of extended precision multiplication
300295 A = scratch[3 ] # Carry during mul step (A)
296+
297+ # Same slot to save registers
301298 bi = scratch [4 ] # Stores b[i] during mul and m during reduction
302- m = scratch[5 ] # Red step: (t[0] * m0ninv) mod 2ʷ
299+ m = scratch[4 ] # Red step: (t[0] * m0ninv) mod 2ʷ
303300
304- var # break dependency chains
305- u = scratch[6 ]
306- v = scratch[7 ]
301+ var u = scratch[5 ]
307302
308303 template mulloadd_co (ctx, dst, lhs, rhs, addend) {.dirty .} =
309304 ctx.mul u, lhs, rhs
310305 ctx.adds dst, addend, u
311- swap (u, v)
312306 template mulloadd_cio (ctx, dst, lhs, rhs, addend) {.dirty .} =
313307 ctx.mul u, lhs, rhs
314308 ctx.adcs dst, addend, u
315- swap (u, v)
316309
317310 template mulhiadd_co (ctx, dst, lhs, rhs, addend) {.dirty .} =
318311 ctx.umulh u, lhs, rhs
319312 ctx.adds dst, addend, u
320- swap (u, v)
321313 template mulhiadd_cio (ctx, dst, lhs, rhs, addend) {.dirty .} =
322314 ctx.umulh u, lhs, rhs
323315 ctx.adcs dst, addend, u
324- swap (u, v)
325316 template mulhiadd_ci (ctx, dst, lhs, rhs, addend) {.dirty .} =
326317 ctx.umulh u, lhs, rhs
327318 ctx.adc dst, addend, u
328- swap (u, v)
329319
330320 result .add quote do :
331321 static : doAssert: sizeof (SecretWord ) == sizeof (ByteAddress )
@@ -392,12 +382,12 @@ macro sumprodMont_CIOS_spare2bits_gen[N, K: static int](
392382
393383 # Reduction step
394384 # -------------------------------
385+ # bi and m alias the same scratch slot: writing m below overwrites bi
395386 ctx.comment " Reduction step"
396387
397388 ctx.mul m, t[0 ], m0ninv
398389 ctx.mul u, m, M[0 ]
399390 ctx.cmn t[0 ], u # TODO : bad latency chain, hopefully done parallel to prev loop
400- swap (u, v)
401391
402392 for j in 1 ..< N:
403393 ctx.mulloadd_cio (t[j- 1 ], m, M[j], t[j])
0 commit comments