refac(mod_arith): use MulUIExtendedOp for MontReduce

batzor · batzor · commit c4e1f9068a21 · 2025-04-23T09:44:46.000+09:00
With this change, we avoid i512 operations (i.e. when adding `mN`) and
instead use extended operations for multiplying/adding i256 values.
diff --git a/tests/Dialect/ModArith/mod_arith_runner.mlir b/tests/Dialect/ModArith/mod_arith_runner.mlir
@@ -35,22 +35,26 @@ func.func @test_lower_inverse() {
 #Fq_mont = #mod_arith.montgomery<!Fq>
 
 func.func @test_lower_mont_reduce() {
-  %p = arith.constant 3723 : i512
-  %p_mont = mod_arith.mont_reduce %p {montgomery=#Fq_mont} : i512 -> !Fq
+  %p = arith.constant 2188824287183927522224640574525727508854836440041603434369820418657580849561 : i256
+  %zero = arith.constant 0 : i256
+  // `pR` is `p` << 256 so just give `p` as `high` and set `low` to 0
+  %p_mont = mod_arith.mont_reduce %zero, %p {montgomery=#Fq_mont} : i256 -> !Fq
 
   %2 = mod_arith.extract %p_mont : !Fq -> i256
-  %3 = vector.from_elements %2 : vector<1xi256>
-  %4 = vector.bitcast %3 : vector<1xi256> to vector<8xi32>
-  %mem = memref.alloc() : memref<8xi32>
+  // check if mod_arith.mont_reduce(pR) == p
+  %true = arith.cmpi eq, %2, %p : i256
+  %trueExt = arith.extui %true : i1 to i32
+  %3 = vector.from_elements %trueExt : vector<1xi32>
+  %mem = memref.alloc() : memref<1xi32>
   %idx_0 = arith.constant 0 : index
-  vector.store %4, %mem[%idx_0] : memref<8xi32>, vector<8xi32>
+  vector.store %3, %mem[%idx_0] : memref<1xi32>, vector<1xi32>
 
-  %U = memref.cast %mem : memref<8xi32> to memref<*xi32>
+  %U = memref.cast %mem : memref<1xi32> to memref<*xi32>
   func.call @printMemrefI32(%U) : (memref<*xi32>) -> ()
   return
 }
 
-// CHECK_TEST_MONT_REDUCE: [-1635059004, -1772563805, -2074116324, -156049350, 156881531, -524227392, -1359481138, 438709201]
+// CHECK_TEST_MONT_REDUCE: [1]
 
 func.func @test_lower_mont_mul() {
   %p = mod_arith.constant 17221657567640823606390383439573883756117969501024189775361 : !Fq
diff --git a/zkir/Dialect/ModArith/Conversions/ModArithToArith/ModArithToArith.cpp b/zkir/Dialect/ModArith/Conversions/ModArithToArith/ModArithToArith.cpp
@@ -163,7 +163,8 @@ struct ConvertMontReduce : public OpConversionPattern<MontReduceOp> {
 
     // `T` is the operand (e.g. the result of a multiplication, twice the
     // bitwidth of modulus).
-    Value T = adaptor.getOperands()[0];
+    Value tLow = adaptor.getOperands()[0];
+    Value tHigh = adaptor.getOperands()[1];
 
     // Extract Montgomery constants: `nPrime` and `modulus`.
     IntegerAttr nPrimeAttr = op.getMontgomeryAttr().getNPrime();
@@ -178,41 +179,49 @@ struct ConvertMontReduce : public OpConversionPattern<MontReduceOp> {
     const unsigned limbWidth = APInt::APINT_BITS_PER_WORD;
     unsigned numLimbs = (modBitWidth + limbWidth - 1) / limbWidth;
 
-    // Arith operations require the operands to be of same bit width
-    Value modExt = b.create<arith::ExtUIOp>(T.getType(), mod);
-
     // Prepare constants for limb operations.
-    Value limbWidthConst =
-        b.create<arith::ConstantOp>(b.getIntegerAttr(T.getType(), limbWidth));
-
-    // Because the number of limbs (numLimbs) is known at compile time, we can
-    // unroll the loop as a straight-line chain of operations. Let `u` be the
-    // current working value, initially `T`.
-    Value u = T;
+    auto limbWidthConst = b.create<arith::ConstantOp>(
+        b.getIntegerAttr(tLow.getType(), limbWidth));
+    auto lowLimbMask = b.create<arith::ConstantOp>(b.getIntegerAttr(
+        tLow.getType(), APInt::getAllOnes(limbWidth).zext(modBitWidth)));
+    auto lowLimbShift = b.create<arith::ConstantOp>(
+        b.getIntegerAttr(tLow.getType(), (numLimbs - 1) * limbWidth));
+
+    // Because the number of limbs (`numLimbs`) is known at compile time, we can
+    // unroll the loop as a straight-line chain of operations.
     for (unsigned i = 0; i < numLimbs; ++i) {
-      // Extract the current lowest limb: `u` (mod `base`)
-      Value lowerLimb = b.create<arith::TruncIOp>(nPrimeAttr.getType(), u);
+      // Extract the current lowest limb: `tLow` (mod `base`)
+      auto lowerLimb = b.create<arith::TruncIOp>(nPrimeAttr.getType(), tLow);
       // Compute `m` = `lowerLimb` * `nPrime` (mod `base`)
-      Value m = b.create<arith::MulIOp>(lowerLimb, nPrime);
+      auto m = b.create<arith::MulIOp>(lowerLimb, nPrime);
       // Compute `m` * `N` , where `N` is modulus
-      Value mExt = b.create<arith::ExtUIOp>(T.getType(), m);
-      Value mN = b.create<arith::MulIOp>(modExt, mExt);
-      // Add the product to `u`.
-      Value sum = b.create<arith::AddIOp>(u, mN);
+      auto mExt = b.create<arith::ExtUIOp>(mod.getType(), m);
+      auto mN = b.create<arith::MulUIExtendedOp>(mod, mExt);
+      // Add the product to `T`.
+      auto sum = b.create<arith::AddUIExtendedOp>(tLow, mN.getLow());
+      tLow = sum.getSum();
+      tHigh = b.create<arith::AddIOp>(tHigh, mN.getHigh());
+      // Add carry from the `sum` to `tHigh`.
+      auto carryExt =
+          b.create<arith::ExtUIOp>(tHigh.getType(), sum.getOverflow());
+      tHigh = b.create<arith::AddIOp>(tHigh, carryExt);
       // Shift right by `limbWidth` to discard the zeroed limb.
-      u = b.create<arith::ShRUIOp>(sum, limbWidthConst);
+      tLow = b.create<arith::ShRUIOp>(tLow, limbWidthConst);
+      // copy the lowest limb of `tHigh` to the highest limb of `tLow`
+      Value tHighLimb = b.create<arith::AndIOp>(tHigh, lowLimbMask);
+      tHighLimb = b.create<arith::ShLIOp>(tHighLimb, lowLimbShift);
+      tLow = b.create<arith::OrIOp>(tLow, tHighLimb);
+      // Shift right `tHigh` by `limbWidth`.
+      tHigh = b.create<arith::ShRUIOp>(tHigh, limbWidthConst);
     }
 
-    // Final conditional subtraction: if (`u_final` >= modulus) then subtract
-    // modulus.
-    Value cmp = b.create<arith::CmpIOp>(arith::CmpIPredicate::uge, u, modExt);
-    Value sub = b.create<arith::SubIOp>(u, modExt);
-    Value result = b.create<arith::SelectOp>(cmp, sub, u);
+    // Final conditional subtraction: if (`tLow` >= `modulus`) then subtract
+    // `modulus`.
+    auto cmp = b.create<arith::CmpIOp>(arith::CmpIPredicate::uge, tLow, mod);
+    auto sub = b.create<arith::SubIOp>(tLow, mod);
+    auto result = b.create<arith::SelectOp>(cmp, sub, tLow);
 
-    // Truncate the result to the bitwidth of the modulus.
-    Value truncated = b.create<arith::TruncIOp>(mod.getType(), result);
-
-    rewriter.replaceOp(op, truncated);
+    rewriter.replaceOp(op, result);
     return success();
   }
 };
@@ -227,20 +236,15 @@ struct ConvertToMont : public OpConversionPattern<ToMontOp> {
       ToMontOp op, OpAdaptor adaptor,
       ConversionPatternRewriter &rewriter) const override {
     ImplicitLocOpBuilder b(op.getLoc(), rewriter);
-    IntegerAttr rSquaredAttr = op.getMontgomery().getRSquared();
 
     // x * R = REDC(x * rSquared)
     auto rSquared =
         b.create<arith::ConstantOp>(op.getMontgomery().getRSquared());
-    auto extended = b.create<arith::ExtUIOp>(rSquaredAttr.getType(),
-                                             adaptor.getOperands()[0]);
-
-    // TODO(batzor): Use extended multiplication to avoid full length
-    // multiplication. Now we extend both operands to 2x the bitwidth of the
-    // modulus to avoid the truncation in multiplication.
-    auto product = b.create<arith::MulIOp>(extended, rSquared);
-    auto reduced = b.create<MontReduceOp>(op.getResult().getType(), product,
-                                          op.getMontgomery());
+    auto product =
+        b.create<arith::MulUIExtendedOp>(adaptor.getOperands()[0], rSquared);
+    auto reduced =
+        b.create<MontReduceOp>(op.getResult().getType(), product.getLow(),
+                               product.getHigh(), op.getMontgomery());
     rewriter.replaceOp(op, reduced);
     return success();
   }
@@ -258,10 +262,11 @@ struct ConvertFromMont : public OpConversionPattern<FromMontOp> {
     ImplicitLocOpBuilder b(op.getLoc(), rewriter);
 
     // x * R⁻¹ = REDC(x)
-    auto extended = b.create<arith::ExtUIOp>(
-        op.getMontgomery().getRSquared().getType(), adaptor.getOperands()[0]);
-    auto reduced = b.create<MontReduceOp>(op.getResult().getType(), extended,
-                                          op.getMontgomery());
+    auto zeroHighConst = b.create<arith::ConstantOp>(
+        IntegerAttr::get(op.getMontgomery().getRSquared().getType(), 0));
+    auto reduced = b.create<MontReduceOp>(op.getResult().getType(),
+                                          adaptor.getOperands()[0],
+                                          zeroHighConst, op.getMontgomery());
     rewriter.replaceOp(op, reduced);
     return success();
   }
@@ -492,13 +497,11 @@ struct ConvertMontMul : public OpConversionPattern<MontMulOp> {
       ConversionPatternRewriter &rewriter) const override {
     ImplicitLocOpBuilder b(op.getLoc(), rewriter);
 
-    auto lhs =
-        b.create<arith::ExtUIOp>(modulusType(op, true), adaptor.getLhs());
-    auto rhs =
-        b.create<arith::ExtUIOp>(modulusType(op, true), adaptor.getRhs());
-    auto mul = b.create<arith::MulIOp>(lhs, rhs);
+    auto mul =
+        b.create<arith::MulUIExtendedOp>(adaptor.getLhs(), adaptor.getRhs());
     auto reduced = b.create<mod_arith::MontReduceOp>(
-        getResultModArithType(op), mul.getResult(), op.getMontgomery());
+        getResultModArithType(op), mul.getLow(), mul.getHigh(),
+        op.getMontgomery());
 
     rewriter.replaceOp(op, reduced);
     return success();
diff --git a/zkir/Dialect/ModArith/IR/ModArithAttributes.cpp b/zkir/Dialect/ModArith/IR/ModArithAttributes.cpp
@@ -61,12 +61,9 @@ MontgomeryAttrStorage *MontgomeryAttrStorage::construct(
   // Construct the `rInvAttr` with the bitwidth of the modulus
   IntegerAttr rInvAttr = IntegerAttr::get(modType.getModulus().getType(), rInv);
 
-  // Construct the `rSquaredAttr` with 2x the bitwidth of the modulus
-  // NOTE(batzor): It is currently 2x bitwidth due to how the `ToMontOp` works
-  // but should be later changed.
-  IntegerAttr rSquaredAttr = IntegerAttr::get(
-      IntegerType::get(modType.getContext(), modulus.getBitWidth() * 2),
-      rSquared.zext(modulus.getBitWidth() * 2));
+  // Construct the `rSquaredAttr` with the bitwidth of the modulus
+  IntegerAttr rSquaredAttr =
+      IntegerAttr::get(modType.getModulus().getType(), rSquared);
 
   // Construct the `nPrimeAttr` with the bitwidth `w`
   IntegerAttr nPrimeAttr = IntegerAttr::get(
diff --git a/zkir/Dialect/ModArith/IR/ModArithDialect.cpp b/zkir/Dialect/ModArith/IR/ModArithDialect.cpp
@@ -108,14 +108,14 @@ LogicalResult ReduceOp::verify() {
 }
 
 LogicalResult MontReduceOp::verify() {
-  IntegerType integerType = getOperandIntegerType(*this);
+  IntegerType integerType =
+      cast<IntegerType>(getElementTypeOrSelf(this->getLow().getType()));
   ModArithType modArithType = getResultModArithType(*this);
   unsigned intWidth = integerType.getWidth();
   unsigned modWidth = modArithType.getModulus().getValue().getBitWidth();
-  if (intWidth != 2 * modWidth)
-    return emitOpError() << "Expected operand width to be " << 2 * modWidth
-                         << ", but got " << intWidth
-                         << " while modulus width is " << modWidth << ".";
+  if (intWidth != modWidth)
+    return emitOpError() << "Expected operand width to be " << modWidth
+                         << ", but got " << intWidth << " instead.";
   return success();
 }
 
diff --git a/zkir/Dialect/ModArith/IR/ModArithOps.td b/zkir/Dialect/ModArith/IR/ModArithOps.td
@@ -123,7 +123,7 @@ def ModArith_ReduceOp : ModArith_Op<"reduce", [Pure, ElementwiseMappable, SameOp
   let assemblyFormat = "operands attr-dict `:` type($output)";
 }
 
-def ModArith_MontReduceOp : ModArith_Op<"mont_reduce", [Pure, ElementwiseMappable]> {
+def ModArith_MontReduceOp : ModArith_Op<"mont_reduce", [Pure, ElementwiseMappable, SameTypeOperands]> {
   let summary = "applies montgomery reduction to the integer of twice the modulus bitwidth";
 
   let description = [{
@@ -137,12 +137,13 @@ def ModArith_MontReduceOp : ModArith_Op<"mont_reduce", [Pure, ElementwiseMappabl
   }];
 
   let arguments = (ins
-    SignlessIntegerLike:$input,
+    SignlessIntegerLike:$low,
+    SignlessIntegerLike:$high,
     ModArith_MontgomeryAttr:$montgomery
   );
   let results = (outs ModArithLike:$output);
   let hasVerifier = 1;
-  let assemblyFormat = "operands attr-dict `:` type($input) `->` type($output)";
+  let assemblyFormat = "operands attr-dict `:` type($low) `->` type($output)";
 }
 
 def ModArith_ToMontOp : ModArith_Op<"to_mont", [Pure, ElementwiseMappable, SameOperandsAndResultType]> {