fix #582: GCC-compatible inline ARM64 ASM

mratsim · mratsim · commit 1956f6650791 · 2025-10-21T09:39:56.000+02:00
diff --git a/constantine/math/arithmetic/assembly/limbs_asm_mul_mont_arm64.nim b/constantine/math/arithmetic/assembly/limbs_asm_mul_mont_arm64.nim
@@ -52,21 +52,18 @@ macro mulMont_CIOS_sparebit_gen[N: static int](
 
     aaSym = ident"aa"
     aa = asmArray(aaSym, N, ElemsInReg, asmInputOutput) # used as buffer for final substraction
-    mSym = ident"m"
-    m = asmValue(mSym, Reg, asmOutputEarlyClobber)
 
     uSym = ident"u"
-    vSym = ident"v"
 
-  var # Break dependencies chain
-    u = asmValue(uSym, Reg, asmOutputEarlyClobber)
-    v = asmValue(vSym, Reg, asmOutputEarlyClobber)
+  # Note: We might want to use an extra register to break dependency chains and expose more ILP
+  #       but then we run into GCC limitations https://github.com/mratsim/constantine/issues/582
+  var u = asmValue(uSym, Reg, asmOutputEarlyClobber)
 
   # Prologue
   result.add quote do:
     var `tSym`{.noinit, used.}: typeof(`r_PIR`)
-    var `aSym`{.noinit.}, `biSym`{.noInit.}, `mSym`{.noinit.}: BaseType
-    var `uSym`{.noinit.}, `vSym`{.noInit.}: BaseType
+    var `aSym`{.noinit.}, `biSym`{.noInit.}: BaseType
+    var `uSym`{.noinit.}: BaseType
 
     let `aaSym` {.noinit, used.} = `a_PIR`
 
@@ -111,24 +108,19 @@ macro mulMont_CIOS_sparebit_gen[N: static int](
   template mulloadd_co(ctx, dst, lhs, rhs, addend) {.dirty.} =
     ctx.mul u, lhs, rhs
     ctx.adds dst, addend, u
-    swap(u, v)
   template mulloadd_cio(ctx, dst, lhs, rhs, addend) {.dirty.} =
     ctx.mul u, lhs, rhs
     ctx.adcs dst, addend, u
-    swap(u, v)
 
   template mulhiadd_co(ctx, dst, lhs, rhs, addend) {.dirty.} =
     ctx.umulh u, lhs, rhs
     ctx.adds dst, addend, u
-    swap(u, v)
   template mulhiadd_cio(ctx, dst, lhs, rhs, addend) {.dirty.} =
     ctx.umulh u, lhs, rhs
     ctx.adcs dst, addend, u
-    swap(u, v)
   template mulhiadd_ci(ctx, dst, lhs, rhs, addend) {.dirty.} =
     ctx.umulh u, lhs, rhs
     ctx.adc dst, addend, u
-    swap(u, v)
 
   doAssert N >= 2
 
@@ -200,11 +192,14 @@ macro mulMont_CIOS_sparebit_gen[N: static int](
     #  t[1] = t[2] + (m*M[2]).lo + (m*M[1]).hi
     #  t[2] = t[3] + (m*M[2]).hi + (m*M[3]).lo
     #  t[3] = A + carry          + (m*M[3]).hi
+    
+    # Note: we might lose some cycles per iteration if we reuse bi here compared to perfect usage of ILP.
+    # but GCC limitation https://github.com/mratsim/constantine/issues/582
+    template m: untyped = bi 
 
     ctx.mul m, t[0], m0ninv
     ctx.mul u, m, M[0]
     ctx.cmn t[0], u         # TODO: bad latency chain, hopefully done parallel to prev loop
-    swap(u, v)
 
     for j in 1 ..< N:
       ctx.mulloadd_cio(t[j-1], m, M[j], t[j])
@@ -298,34 +293,29 @@ macro sumprodMont_CIOS_spare2bits_gen[N, K: static int](
     b = scratch[1].as2dArrayAddr(b_PIR, rows = K, cols = N, memIndirect = memRead) # Store the `b` operand
     tN = scratch[2]                                  # High part of extended precision multiplication
     A = scratch[3]                                   # Carry during mul step (A)
+    
+    # Same slot to save registers
     bi = scratch[4]                                  # Stores b[i] during mul and u during reduction
-    m = scratch[5]                                   # Red step: (t[0] * m0ninv) mod 2ʷ
+    m = scratch[4]                                   # Red step: (t[0] * m0ninv) mod 2ʷ
 
-  var # break dependency chains
-    u = scratch[6]
-    v = scratch[7]
+  var u = scratch[5]
 
   template mulloadd_co(ctx, dst, lhs, rhs, addend) {.dirty.} =
     ctx.mul u, lhs, rhs
     ctx.adds dst, addend, u
-    swap(u, v)
   template mulloadd_cio(ctx, dst, lhs, rhs, addend) {.dirty.} =
     ctx.mul u, lhs, rhs
     ctx.adcs dst, addend, u
-    swap(u, v)
 
   template mulhiadd_co(ctx, dst, lhs, rhs, addend) {.dirty.} =
     ctx.umulh u, lhs, rhs
     ctx.adds dst, addend, u
-    swap(u, v)
   template mulhiadd_cio(ctx, dst, lhs, rhs, addend) {.dirty.} =
     ctx.umulh u, lhs, rhs
     ctx.adcs dst, addend, u
-    swap(u, v)
   template mulhiadd_ci(ctx, dst, lhs, rhs, addend) {.dirty.} =
     ctx.umulh u, lhs, rhs
     ctx.adc dst, addend, u
-    swap(u, v)
 
   result.add quote do:
     static: doAssert: sizeof(SecretWord) == sizeof(ByteAddress)
@@ -392,12 +382,12 @@ macro sumprodMont_CIOS_spare2bits_gen[N, K: static int](
 
     # Reduction step
     # -------------------------------
+    # bi and m are aliasing
     ctx.comment "  Reduction step"
 
     ctx.mul m, t[0], m0ninv
     ctx.mul u, m, M[0]
     ctx.cmn t[0], u         # TODO: bad latency chain, hopefully done parallel to prev loop
-    swap(u, v)
 
     for j in 1 ..< N:
       ctx.mulloadd_cio(t[j-1], m, M[j], t[j])
diff --git a/constantine/math/arithmetic/assembly/limbs_asm_redc_mont_arm64.nim b/constantine/math/arithmetic/assembly/limbs_asm_redc_mont_arm64.nim
@@ -65,25 +65,21 @@ macro redc2xMont_gen[N: static int](
   let m0ninv = v[1]
   let m = v[2]
   var t0 = v[3]
-  var t1 = v[4]
+  # var t1 = v[4] # We might lose some cycles compared to perfect ILP but GCC limitation https://github.com/mratsim/constantine/issues/582
 
   template mulloadd_cio(ctx, dst, lhs, rhs, addend) {.dirty.} =
     ctx.mul t0, lhs, rhs
     ctx.adcs dst, addend, t0
-    swap(t0, t1)
 
   template mulhiadd_co(ctx, dst, lhs, rhs, addend) {.dirty.} =
     ctx.umulh t0, lhs, rhs
     ctx.adds dst, addend, t0
-    swap(t0, t1)
   template mulhiadd_cio(ctx, dst, lhs, rhs, addend) {.dirty.} =
     ctx.umulh t0, lhs, rhs
     ctx.adcs dst, addend, t0
-    swap(t0, t1)
   template mulhiadd_ci(ctx, dst, lhs, rhs, addend) {.dirty.} =
     ctx.umulh t0, lhs, rhs
     ctx.adc dst, addend, t0
-    swap(t0, t1)
 
   # Algorithm
   # ---------------------------------------------------------
@@ -109,7 +105,6 @@ macro redc2xMont_gen[N: static int](
     ctx.comment "---- Reduction " & $i
     ctx.mul t0, m, M[0]
     ctx.cmn u[0], t0
-    swap(t0, t1)
     ctx.mov u[N], xzr
 
     for j in 0 ..< N:
@@ -136,7 +131,6 @@ macro redc2xMont_gen[N: static int](
       ctx.adc u[i], u[i], t0
     else:
       ctx.adcs u[i], u[i], t0
-    swap(t0, t1)
 
   if spareBits >= 2 and lazyReduce:
     for i in 0 ..< N: