Skip to content

Commit bd64e91

Browse files
committed
asm/ct*_inverse_mod_*.pl: harmonize commentary with implementation.
1 parent 01d167c commit bd64e91

File tree

5 files changed

+24
-9
lines changed

5 files changed

+24
-9
lines changed

src/asm/ct_inverse_mod_256-armv8.pl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,7 @@
6363
v += mod
6464
if v < 0:
6565
v += mod
66-
elif v == 1<<512
66+
elif v == 1<<512:
6767
v -= mod
6868
6969
return v & (2**512 - 1) # to be reduced % mod

src/asm/ct_inverse_mod_256-x86_64.pl

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@
6262
v += mod
6363
if v < 0:
6464
v += mod
65-
elif v == 1<<512
65+
elif v == 1<<512:
6666
v -= mod
6767
6868
return v & (2**512 - 1) # to be reduced % mod
@@ -366,10 +366,10 @@
366366
# bit-length of the |f?| and |g?| single-limb multiplicands. However!
367367
# The latter should not be taken literally, as they are always chosen so
368368
# that "bad things" don't happen. For example, there comes a point when
369-
# |v| grows beyond 383 bits, while |u| remains 383 bits wide. Yet, we
370-
# always call __smul_383x63 to perform |u|*|f0|+|v|*|g0| step. This is
369+
# |v| grows beyond 256 bits, while |u| remains 256 bits wide. Yet, we
370+
# always call __smulq_256x63 to perform |u|*|f0|+|v|*|g0| step. This is
371371
# because past that point |f0| is always 1 and |g0| is always 0. And,
372-
# since |u| never grows beyond 383 bits, __smul_767x63 doesn't have to
372+
# since |u| never grows beyond 256 bits, __smulq_512x63 doesn't have to
373373
# perform full-width |u|*|f1| multiplication, half-width one with sign
374374
# extension is sufficient...
375375
$code.=<<___;

src/asm/ct_inverse_mod_384-armv8.pl

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@
4444
if b < 0:
4545
b, f1, g1 = -b, -f1, -g1
4646
47-
# __smul_767x63
47+
# __smul_768x63
4848
u, v = u*f0 + v*g0, u*f1 + v*g1
4949
5050
if 768 % k:
@@ -58,8 +58,13 @@
5858
5959
v = u*f1 + v*g1
6060
61+
mod <<= 768 - mod.bit_length() # align to the left
6162
if v < 0:
62-
v += mod << (768 - mod.bit_length()) # left aligned
63+
v += mod
64+
if v < 0:
65+
v += mod
66+
elif v == 1<<768:
67+
v -= mod
6368
6469
return v & (2**768 - 1) # to be reduced % mod
6570
___

src/asm/ctq_inverse_mod_384-x86_64.pl

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,8 +58,13 @@
5858
5959
v = u*f1 + v*g1
6060
61+
mod <<= 768 - mod.bit_length() # align to the left
6162
if v < 0:
62-
v += mod << (768 - mod.bit_length()) # left aligned
63+
v += mod
64+
if v < 0:
65+
v += mod
66+
elif v == 1<<768:
67+
v -= mod
6368
6469
return v & (2**768 - 1) # to be reduced % mod
6570
___

src/asm/ctx_inverse_mod_384-x86_64.pl

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -57,8 +57,13 @@
5757
5858
v = u*f1 + v*g1
5959
60+
mod <<= 768 - mod.bit_length() # align to the left
6061
if v < 0:
61-
v += mod << (768 - mod.bit_length()) # left aligned
62+
v += mod
63+
if v < 0:
64+
v += mod
65+
elif v == 1<<768:
66+
v -= mod
6267
6368
return v & (2**768 - 1) # to be reduced % mod
6469
___

0 commit comments

Comments (0)