perf(mod_arith): remove remainder ops if possible

batzor · batzor · commit f4df557421fd · 2025-04-14T11:52:19.000+09:00
diff --git a/benchmark/ntt/ntt_benchmark_test.cc b/benchmark/ntt/ntt_benchmark_test.cc
@@ -113,16 +113,19 @@ BENCHMARK(BM_intt_mont_benchmark)->Iterations(1)->Unit(::benchmark::kSecond);
 }  // namespace
 }  // namespace zkir
 
+// clang-format off
+// NOLINTBEGIN(whitespace/line_length)
 // Run on (14 X 24 MHz CPU s)
 // CPU Caches:
-// L1 Data 64 KiB
-// L1 Instruction 128 KiB
-// L2 Unified 4096 KiB (x14)
-// Load Average: 9.50, 8.31, 8.95
+//   L1 Data 64 KiB
+//   L1 Instruction 128 KiB
+//   L2 Unified 4096 KiB (x14)
+// Load Average: 27.66, 13.59, 9.67
 // ------------------------------------------------------------------------------
-// Benchmark                                    Time             CPU Iterations
+// Benchmark                                    Time             CPU   Iterations
 // ------------------------------------------------------------------------------
-// BM_ntt_benchmark                         0.339 s         0.333 s 2
-// BM_intt_benchmark/iterations:1           0.501 s         0.493 s 1
-// BM_ntt_mont_benchmark                    0.379 s         0.372 s 2
-// BM_intt_mont_benchmark/iterations:1      0.510 s         0.504 s 1
+// BM_ntt_benchmark                         0.190 s         0.183 s             4
+// BM_intt_benchmark/iterations:1           0.381 s         0.368 s             1
+// BM_ntt_mont_benchmark                    0.221 s         0.214 s             3
+// BM_intt_mont_benchmark/iterations:1      0.415 s         0.396 s             1
+// NOLINTEND()
diff --git a/tests/Dialect/ModArith/mod_arith_to_arith.mlir b/tests/Dialect/ModArith/mod_arith_to_arith.mlir
@@ -7,10 +7,8 @@
 // CHECK-SAME: () -> [[T:.*]] {
 func.func @test_lower_constant() -> !mod_arith.int<3 : i5> {
   // CHECK-NOT: mod_arith.constant
-  // CHECK: %[[CVAL:.*]] = arith.constant 5 : [[T]]
-  // CHECK: %[[CMOD:.*]] = arith.constant 3 : [[T]]
-  // CHECK: %[[REMU:.*]] = arith.remui %[[CVAL]], %[[CMOD]] : [[T]]
-  // CHECK: return %[[REMU]] : [[T]]
+  // CHECK: %[[CVAL:.*]] = arith.constant 2 : [[T]]
+  // CHECK: return %[[CVAL]] : [[T]]
   %res = mod_arith.constant 5:  !mod_arith.int<3 : i5>
   return %res: !mod_arith.int<3 : i5>
 }
@@ -116,7 +114,9 @@ func.func @test_lower_add(%lhs : !Zp, %rhs : !Zp) -> !Zp {
   // CHECK-NOT: mod_arith.add
   // CHECK: %[[CMOD:.*]] = arith.constant 65537 : [[T]]
   // CHECK: %[[ADD:.*]] = arith.addi %[[LHS]], %[[RHS]] : [[T]]
-  // CHECK: %[[REM:.*]] = arith.remui %[[ADD]], %[[CMOD]] : [[T]]
+  // CHECK: %[[IFGE:.*]] = arith.cmpi uge, %[[ADD]], %[[CMOD]] : [[T]]
+  // CHECK: %[[SUB:.*]] = arith.subi %[[ADD]], %[[CMOD]] : [[T]]
+  // CHECK: %[[REM:.*]] = arith.select %[[IFGE]], %[[SUB]], %[[ADD]] : [[T]]
   // CHECK: return %[[REM]] : [[T]]
   %res = mod_arith.add %lhs, %rhs : !Zp
   return %res : !Zp
@@ -128,7 +128,9 @@ func.func @test_lower_add_vec(%lhs : !Zpv, %rhs : !Zpv) -> !Zpv {
   // CHECK-NOT: mod_arith.add
   // CHECK: %[[CMOD:.*]] = arith.constant dense<65537> : [[T]]
   // CHECK: %[[ADD:.*]] = arith.addi %[[LHS]], %[[RHS]] : [[T]]
-  // CHECK: %[[REM:.*]] = arith.remui %[[ADD]], %[[CMOD]] : [[T]]
+  // CHECK: %[[IFGE:.*]] = arith.cmpi uge, %[[ADD]], %[[CMOD]] : [[T]]
+  // CHECK: %[[SUB:.*]] = arith.subi %[[ADD]], %[[CMOD]] : [[T]]
+  // CHECK: %[[REM:.*]] = arith.select %[[IFGE]], %[[SUB]], %[[ADD]] : tensor<4xi1>, [[T]]
   // CHECK: return %[[REM]] : [[T]]
   %res = mod_arith.add %lhs, %rhs : !Zpv
   return %res : !Zpv
@@ -141,8 +143,9 @@ func.func @test_lower_sub(%lhs : !Zp, %rhs : !Zp) -> !Zp {
   // CHECK: %[[CMOD:.*]] = arith.constant 65537 : [[T]]
   // CHECK: %[[SUB:.*]] = arith.subi %[[LHS]], %[[RHS]] : [[T]]
   // CHECK: %[[ADD:.*]] = arith.addi %[[SUB]], %[[CMOD]] : [[T]]
-  // CHECK: %[[REM:.*]] = arith.remui %[[ADD]], %[[CMOD]] : [[T]]
-  // CHECK: return %[[REM]] : [[T]]
+  // CHECK: %[[IFGE:.*]] = arith.cmpi uge, %[[LHS]], %[[RHS]] : [[T]]
+  // CHECK: %[[SELECT:.*]] = arith.select %[[IFGE]], %[[SUB]], %[[ADD]] : [[T]]
+  // CHECK: return %[[SELECT]] : [[T]]
   %res = mod_arith.sub %lhs, %rhs : !Zp
   return %res : !Zp
 }
@@ -154,8 +157,9 @@ func.func @test_lower_sub_vec(%lhs : !Zpv, %rhs : !Zpv) -> !Zpv {
   // CHECK: %[[CMOD:.*]] = arith.constant dense<65537> : [[T]]
   // CHECK: %[[SUB:.*]] = arith.subi %[[LHS]], %[[RHS]] : [[T]]
   // CHECK: %[[ADD:.*]] = arith.addi %[[SUB]], %[[CMOD]] : [[T]]
-  // CHECK: %[[REM:.*]] = arith.remui %[[ADD]], %[[CMOD]] : [[T]]
-  // CHECK: return %[[REM]] : [[T]]
+  // CHECK: %[[IFGE:.*]] = arith.cmpi uge, %[[LHS]], %[[RHS]] : [[T]]
+  // CHECK: %[[SELECT:.*]] = arith.select %[[IFGE]], %[[SUB]], %[[ADD]] : tensor<4xi1>, [[T]]
+  // CHECK: return %[[SELECT]] : [[T]]
   %res = mod_arith.sub %lhs, %rhs : !Zpv
   return %res : !Zpv
 }
@@ -195,10 +199,8 @@ func.func @test_lower_mul_vec(%lhs : !Zpv, %rhs : !Zpv) -> !Zpv {
 func.func @test_lower_constant_tensor() -> !Zpv {
   // CHECK-NOT: mod_arith.constant
   // CHECK: %[[C0:.*]] = arith.constant 5 : [[INT:.*]]
-  // CHECK: %[[C1:.*]] = arith.constant 65537 : [[INT]]
-  // CHECK: %[[C2:.*]] = arith.remui %[[C0]], %[[C1]] : [[INT]]
   %c0 = mod_arith.constant 5:  !Zp
-  // CHECK: %[[RES:.*]] = tensor.from_elements %[[C2]], %[[C2]], %[[C2]], %[[C2]] : [[T]]
+  // CHECK: %[[RES:.*]] = tensor.from_elements %[[C0]], %[[C0]], %[[C0]], %[[C0]] : [[T]]
   %res = tensor.from_elements %c0, %c0, %c0, %c0 : !Zpv
   // CHECK: return %[[RES]] : [[T]]
   return %res : !Zpv
diff --git a/zkir/Dialect/ModArith/Conversions/ModArithToArith/ModArithToArith.cpp b/zkir/Dialect/ModArith/Conversions/ModArithToArith/ModArithToArith.cpp
@@ -123,9 +123,7 @@ struct ConvertConstant : public OpConversionPattern<ConstantOp> {
     ImplicitLocOpBuilder b(op.getLoc(), rewriter);
 
     auto cval = b.create<arith::ConstantOp>(op.getLoc(), adaptor.getValue());
-    auto cmod = b.create<arith::ConstantOp>(modulusAttr(op));
-    auto remu = b.create<arith::RemUIOp>(cval, cmod);
-    rewriter.replaceOp(op, remu);
+    rewriter.replaceOp(op, cval);
     return success();
   }
 };
@@ -398,9 +396,11 @@ struct ConvertAdd : public OpConversionPattern<AddOp> {
 
     auto cmod = b.create<arith::ConstantOp>(modulusAttr(op));
     auto add = b.create<arith::AddIOp>(adaptor.getLhs(), adaptor.getRhs());
-    auto remu = b.create<arith::RemUIOp>(add, cmod);
+    auto ifge = b.create<arith::CmpIOp>(arith::CmpIPredicate::uge, add, cmod);
+    auto sub = b.create<arith::SubIOp>(add, cmod);
+    auto select = b.create<arith::SelectOp>(ifge, sub, add);
 
-    rewriter.replaceOp(op, remu);
+    rewriter.replaceOp(op, select);
     return success();
   }
 };
@@ -419,9 +419,11 @@ struct ConvertSub : public OpConversionPattern<SubOp> {
     auto cmod = b.create<arith::ConstantOp>(modulusAttr(op));
     auto sub = b.create<arith::SubIOp>(adaptor.getLhs(), adaptor.getRhs());
     auto add = b.create<arith::AddIOp>(sub, cmod);
-    auto remu = b.create<arith::RemUIOp>(add, cmod);
+    auto ifge = b.create<arith::CmpIOp>(arith::CmpIPredicate::uge,
+                                        adaptor.getLhs(), adaptor.getRhs());
+    auto select = b.create<arith::SelectOp>(ifge, sub, add);
 
-    rewriter.replaceOp(op, remu);
+    rewriter.replaceOp(op, select);
     return success();
   }
 };
diff --git a/zkir/Dialect/ModArith/IR/ModArithDialect.cpp b/zkir/Dialect/ModArith/IR/ModArithDialect.cpp
@@ -178,11 +178,10 @@ ParseResult ConstantOp::parse(OpAsmParser &parser, OperationState &result) {
   }
 
-  // zero-extend or truncate to the correct bitwidth
+  // zero-extend/truncate to the output bitwidth, then reduce mod the modulus
-  parsedInt = parsedInt.zextOrTrunc(outputBitWidth);
+  parsedInt = parsedInt.zextOrTrunc(outputBitWidth).urem(modulus);
   result.addAttribute(
       "value",
-      IntegerAttr::get(IntegerType::get(parser.getContext(), outputBitWidth),
-                       parsedInt));
+      IntegerAttr::get(modArithType.getModulus().getType(), parsedInt));
   result.addTypes(parsedType);
   return success();
 }

Original file line number	Diff line number	Diff line change
`@@ -178,11 +178,10 @@ ParseResult ConstantOp::parse(OpAsmParser &parser, OperationState &result) {`
`178`	`178`	`}`
`179`	`179`
`180`	`180`	`// zero-extend or truncate to the correct bitwidth`
`181`		`- parsedInt = parsedInt.zextOrTrunc(outputBitWidth);`
	`181`	`+ parsedInt = parsedInt.zextOrTrunc(outputBitWidth).urem(modulus);`
`182`	`182`	`result.addAttribute(`
`183`	`183`	`"value",`
`184`		`- IntegerAttr::get(IntegerType::get(parser.getContext(), outputBitWidth),`
`185`		`- parsedInt));`
	`184`	`+ IntegerAttr::get(modArithType.getModulus().getType(), parsedInt));`
`186`	`185`	`result.addTypes(parsedType);`
`187`	`186`	`return success();`
`188`	`187`	`}`