fractalyze
diff --git a/‎.github/workflows/ci.yml‎
Lines changed: 2 additions & 2 deletions b/‎.github/workflows/ci.yml‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎benchmark/ntt/ntt_benchmark.mlir‎
Lines changed: 17 additions & 26 deletions b/‎benchmark/ntt/ntt_benchmark.mlir‎
Lines changed: 17 additions & 26 deletions
diff --git a/‎benchmark/ntt/ntt_benchmark_test.cc‎
Lines changed: 49 additions & 44 deletions b/‎benchmark/ntt/ntt_benchmark_test.cc‎
Lines changed: 49 additions & 44 deletions
diff --git a/‎tests/Dialect/Poly/poly_canonicalization.mlir‎
Lines changed: 8 additions & 6 deletions b/‎tests/Dialect/Poly/poly_canonicalization.mlir‎
Lines changed: 8 additions & 6 deletions
diff --git a/‎tests/Dialect/Poly/poly_ntt_runner.mlir‎
Lines changed: 4 additions & 4 deletions b/‎tests/Dialect/Poly/poly_ntt_runner.mlir‎
Lines changed: 4 additions & 4 deletions
diff --git a/‎tests/Dialect/Poly/poly_to_field.mlir‎
Lines changed: 6 additions & 6 deletions b/‎tests/Dialect/Poly/poly_to_field.mlir‎
Lines changed: 6 additions & 6 deletions
diff --git a/‎zkir/Dialect/Field/Conversions/FieldToModArith/BUILD.bazel‎
Lines changed: 1 addition & 0 deletions b/‎zkir/Dialect/Field/Conversions/FieldToModArith/BUILD.bazel‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎zkir/Dialect/Field/Conversions/FieldToModArith/FieldToModArith.cpp‎
Lines changed: 9 additions & 5 deletions b/‎zkir/Dialect/Field/Conversions/FieldToModArith/FieldToModArith.cpp‎
Lines changed: 9 additions & 5 deletions
diff --git a/‎zkir/Dialect/ModArith/Conversions/ModArithToArith/BUILD.bazel‎
Lines changed: 1 addition & 0 deletions b/‎zkir/Dialect/ModArith/Conversions/ModArithToArith/BUILD.bazel‎
Lines changed: 1 addition & 0 deletions
@@ -28,8 +28,8 @@ jobs:
         with:
           bazelisk-cache: true
           disk-cache: ${{ runner.os }}-zkir_bazelbuild
-          repository-cache: false
-          external-cache: false
+          repository-cache: true
+          external-cache: true
 
       - name: Run `bazel build`
         run: |
 
@@ -1,7 +1,6 @@
 !coeff_ty = !field.pf<21888242871839275222246405745257275088548364400416034343698204186575808495617 : i256>
-!poly_ty = !poly.polynomial<!coeff_ty, 1048575>
 !coefft_ty = tensor<1048576x!coeff_ty>
-!intt_ty = tensor<1048576xi256>
+!memref_ty = memref<1048576x!coeff_ty>
 
 #root_elem = #field.pf_elem<17220337697351015657950521176323262483320249231368149235373741788599650842711:i256> : !coeff_ty
 #root = #poly.primitive_root<root=#root_elem, degree=1048576:i256>
@@ -10,34 +9,26 @@
 #mont = #mod_arith.montgomery<!mod>
 #root_mont = #poly.primitive_root<root=#root_elem, degree=1048576:i256, montgomery=#mont>
 
-func.func @input_generation() -> !poly_ty attributes { llvm.emit_c_interface } {
-  %c42 = arith.constant 6420 : i256
-  %full = tensor.splat %c42 : !intt_ty
-  %coeffs = field.pf.encapsulate %full : !intt_ty -> !coefft_ty
-  %poly = poly.from_tensor %coeffs : !coefft_ty -> !poly_ty
-  return %poly : !poly_ty
+func.func @ntt(%arg0 : !memref_ty) attributes { llvm.emit_c_interface } {  // C-interface entry point: one memref argument, no return value
+  %t = bufferization.to_tensor %arg0 restrict writable : !memref_ty to !coefft_ty  // view the caller's buffer as a tensor; `writable` marks it as allowed to be written in place
+  poly.ntt %t {root=#root} : !coefft_ty  // forward NTT using #root; no result is bound, presumably the caller's buffer is updated in place (TODO confirm against the lowering)
+  return
 }
 
-func.func @ntt(%arg0 : !poly_ty) -> !intt_ty attributes { llvm.emit_c_interface } {
-  %0 = poly.ntt %arg0 {root=#root} : !poly_ty -> !coefft_ty
-  %1 = field.pf.extract %0 : !coefft_ty -> !intt_ty
-  return %1 : !intt_ty
+func.func @intt(%arg0 : !memref_ty) attributes { llvm.emit_c_interface } {  // C-interface entry point: one memref argument, no return value
+  %t = bufferization.to_tensor %arg0 restrict writable : !memref_ty to !coefft_ty  // view the caller's buffer as a tensor; `writable` marks it as allowed to be written in place
+  poly.intt %t {root=#root} : !coefft_ty  // inverse NTT using #root; no result is bound, presumably the caller's buffer is updated in place (TODO confirm against the lowering)
+  return
 }
 
-func.func @intt(%arg0 : !intt_ty) -> !poly_ty attributes { llvm.emit_c_interface } {
-  %0 = field.pf.encapsulate %arg0 : !intt_ty -> !coefft_ty
-  %1 = poly.intt %0 {root=#root} : !coefft_ty -> !poly_ty
-  return %1 :!poly_ty
+func.func @ntt_mont(%arg0 : !memref_ty) attributes { llvm.emit_c_interface } {  // same shape as @ntt, but parameterised with the Montgomery root attribute
+  %t = bufferization.to_tensor %arg0 restrict writable : !memref_ty to !coefft_ty  // view the caller's buffer as a tensor; `writable` marks it as allowed to be written in place
+  poly.ntt %t {root=#root_mont} : !coefft_ty  // forward NTT using #root_mont (the root attribute that carries montgomery=#mont)
+  return
 }
 
-func.func @ntt_mont(%arg0 : !poly_ty) -> !intt_ty attributes { llvm.emit_c_interface } {
-  %0 = poly.ntt %arg0 {root=#root_mont} : !poly_ty -> !coefft_ty
-  %1 = field.pf.extract %0 : !coefft_ty -> !intt_ty
-  return %1 : !intt_ty
-}
-
-func.func @intt_mont(%arg0 : !intt_ty) -> !poly_ty attributes { llvm.emit_c_interface } {
-  %0 = field.pf.encapsulate %arg0 : !intt_ty -> !coefft_ty
-  %1 = poly.intt %0 {root=#root_mont} : !coefft_ty -> !poly_ty
-  return %1 :!poly_ty
+func.func @intt_mont(%arg0 : !memref_ty) attributes { llvm.emit_c_interface } {  // same shape as @intt, but parameterised with the Montgomery root attribute
+  %t = bufferization.to_tensor %arg0 restrict writable : !memref_ty to !coefft_ty  // view the caller's buffer as a tensor; `writable` marks it as allowed to be written in place
+  poly.intt %t {root=#root_mont} : !coefft_ty  // inverse NTT using #root_mont (the root attribute that carries montgomery=#mont)
+  return
 }
@@ -25,101 +25,106 @@ static void fillWithRandom(Memref<i256> *input, const i256 &kPrime) {
   std::mt19937_64 rng(std::random_device{}());  // NOLINT(whitespace/braces)
   std::uniform_int_distribution<uint64_t> dist(0, UINT64_MAX);
   for (int i = 0; i < NUM_COEFFS; i++) {
-    *input->pget(0, i) = i256::randomLT(kPrime, rng, dist);
+    *input->pget(i, 0) = i256::randomLT(kPrime, rng, dist);
   }
 }
 
-extern "C" void _mlir_ciface_input_generation(Memref<i256> *output);
-extern "C" void _mlir_ciface_ntt(Memref<i256> *output, Memref<i256> *input);
-extern "C" void _mlir_ciface_intt(Memref<i256> *output, Memref<i256> *input);
+extern "C" void _mlir_ciface_ntt(Memref<i256> *buffer);
+extern "C" void _mlir_ciface_intt(Memref<i256> *buffer);
 
-extern "C" void _mlir_ciface_ntt_mont(Memref<i256> *output,
-                                      Memref<i256> *input);
-extern "C" void _mlir_ciface_intt_mont(Memref<i256> *output,
-                                       Memref<i256> *input);
+extern "C" void _mlir_ciface_ntt_mont(Memref<i256> *buffer);
+extern "C" void _mlir_ciface_intt_mont(Memref<i256> *buffer);
 
 void BM_ntt_benchmark(::benchmark::State &state) {
-  Memref<i256> input(1, NUM_COEFFS);
-  _mlir_ciface_input_generation(&input);
+  Memref<i256> input(NUM_COEFFS, 1);  // reference data for the NTT -> INTT round-trip check at the end; never passed to a kernel
   fillWithRandom(&input, kPrime);
 
-  Memref<i256> ntt(1, NUM_COEFFS);
+  Memref<i256> ntt(NUM_COEFFS, 1);  // working buffer handed to the kernels
   for (auto _ : state) {
-    _mlir_ciface_ntt(&ntt, &input);
+    state.PauseTiming();  // the buffer reset below is excluded from the measurement
+    memcpy(ntt.pget(0, 0), input.pget(0, 0), sizeof(i256) * NUM_COEFFS);  // reload the original input before every timed call
+    state.ResumeTiming();
+    _mlir_ciface_ntt(&ntt);  // timed call; presumably overwrites ntt in place (TODO confirm against the MLIR @ntt lowering)
   }
 
-  Memref<i256> intt(1, NUM_COEFFS);
-  _mlir_ciface_intt(&intt, &ntt);
+  _mlir_ciface_intt(&ntt);  // untimed inverse call on the same buffer
 
   for (int i = 0; i < NUM_COEFFS; i++) {
     for (int j = 0; j < 4; j++) {
-      EXPECT_EQ(intt.pget(0, i)->limbs[j], input.pget(0, i)->limbs[j]);
+      EXPECT_EQ(ntt.pget(i, 0)->limbs[j], input.pget(i, 0)->limbs[j]);  // after the inverse call the buffer must equal input, limb by limb
     }
   }
 }
 
 BENCHMARK(BM_ntt_benchmark)->Unit(::benchmark::kMillisecond);
 
 void BM_intt_benchmark(::benchmark::State &state) {
-  Memref<i256> input(1, NUM_COEFFS);
-  _mlir_ciface_input_generation(&input);
+  // Times the inverse transform alone: the forward transform of a random
+  // input is computed once up front, and every iteration runs the inverse on
+  // a fresh copy of that result.
+  Memref<i256> input(NUM_COEFFS, 1);
   fillWithRandom(&input, kPrime);
 
-  Memref<i256> ntt(1, NUM_COEFFS);
-  _mlir_ciface_ntt(&ntt, &input);
+  // Forward-transform a copy of the input once; `input` itself is kept as the
+  // reference for the final check.
+  Memref<i256> ntt(NUM_COEFFS, 1);
+  memcpy(ntt.pget(0, 0), input.pget(0, 0), sizeof(i256) * NUM_COEFFS);
+  _mlir_ciface_ntt(&ntt);
 
-  Memref<i256> intt(1, NUM_COEFFS);
+  Memref<i256> intt(NUM_COEFFS, 1);
   for (auto _ : state) {
-    _mlir_ciface_intt(&intt, &ntt);
+    // Refill the scratch buffer outside the timed region so each iteration
+    // starts from the same forward-transformed data.
+    state.PauseTiming();
+    memcpy(intt.pget(0, 0), ntt.pget(0, 0), sizeof(i256) * NUM_COEFFS);
+    state.ResumeTiming();
+    // Run the kernel on the scratch copy, not on `ntt`: `ntt` is the source
+    // of the per-iteration refill and must stay untouched, otherwise every
+    // iteration after the first would operate on already-inverted data.
+    _mlir_ciface_intt(&intt);
   }
 
   for (int i = 0; i < NUM_COEFFS; i++) {
     for (int j = 0; j < 4; j++) {
-      EXPECT_EQ(intt.pget(0, i)->limbs[j], input.pget(0, i)->limbs[j]);
+      // The inverse of the forward transform must reproduce the input.
+      EXPECT_EQ(intt.pget(i, 0)->limbs[j], input.pget(i, 0)->limbs[j]);
     }
   }
 }
 
 BENCHMARK(BM_intt_benchmark)->Unit(::benchmark::kMillisecond);
 
 void BM_ntt_mont_benchmark(::benchmark::State &state) {
-  Memref<i256> input(1, NUM_COEFFS);
-  _mlir_ciface_input_generation(&input);
+  Memref<i256> input(NUM_COEFFS, 1);  // reference data for the round-trip check at the end; never passed to a kernel
   fillWithRandom(&input, kPrime);
 
-  Memref<i256> ntt(1, NUM_COEFFS);
+  Memref<i256> ntt(NUM_COEFFS, 1);  // working buffer handed to the Montgomery-variant kernels
   for (auto _ : state) {
-    _mlir_ciface_ntt_mont(&ntt, &input);
+    state.PauseTiming();  // the buffer reset below is excluded from the measurement
+    memcpy(ntt.pget(0, 0), input.pget(0, 0), sizeof(i256) * NUM_COEFFS);  // reload the original input before every timed call
+    state.ResumeTiming();
+    _mlir_ciface_ntt_mont(&ntt);  // timed call; presumably overwrites ntt in place (TODO confirm against the MLIR @ntt_mont lowering)
   }
 
-  Memref<i256> intt(1, NUM_COEFFS);
-  _mlir_ciface_intt_mont(&intt, &ntt);
+  _mlir_ciface_intt_mont(&ntt);  // untimed inverse call on the same buffer
 
   for (int i = 0; i < NUM_COEFFS; i++) {
     for (int j = 0; j < 4; j++) {
-      EXPECT_EQ(intt.pget(0, i)->limbs[j], input.pget(0, i)->limbs[j]);
+      EXPECT_EQ(ntt.pget(i, 0)->limbs[j], input.pget(i, 0)->limbs[j]);  // after the inverse call the buffer must equal input, limb by limb
     }
   }
 }
 
 BENCHMARK(BM_ntt_mont_benchmark)->Unit(::benchmark::kMillisecond);
 
 void BM_intt_mont_benchmark(::benchmark::State &state) {
-  Memref<i256> input(1, NUM_COEFFS);
-  _mlir_ciface_input_generation(&input);
+  Memref<i256> input(NUM_COEFFS, 1);  // reference data for the round-trip check at the end; never passed to a kernel
   fillWithRandom(&input, kPrime);
 
-  Memref<i256> ntt(1, NUM_COEFFS);
-  _mlir_ciface_ntt_mont(&ntt, &input);
+  Memref<i256> ntt(NUM_COEFFS, 1);  // holds the forward-transformed data; only read inside the loop
+  memcpy(ntt.pget(0, 0), input.pget(0, 0), sizeof(i256) * NUM_COEFFS);  // start from a copy so input stays intact
+  _mlir_ciface_ntt_mont(&ntt);  // one untimed forward call to prepare the inverse's input
 
-  Memref<i256> intt(1, NUM_COEFFS);
+  Memref<i256> intt(NUM_COEFFS, 1);  // scratch buffer the timed inverse call runs on
   for (auto _ : state) {
-    _mlir_ciface_intt_mont(&intt, &ntt);
+    state.PauseTiming();  // the scratch refill below is excluded from the measurement
+    memcpy(intt.pget(0, 0), ntt.pget(0, 0), sizeof(i256) * NUM_COEFFS);  // every iteration starts from the same forward-transformed data
+    state.ResumeTiming();
+    _mlir_ciface_intt_mont(&intt);  // timed call on the scratch copy, leaving ntt untouched for the next refill
   }
 
   for (int i = 0; i < NUM_COEFFS; i++) {
     for (int j = 0; j < 4; j++) {
-      EXPECT_EQ(intt.pget(0, i)->limbs[j], input.pget(0, i)->limbs[j]);
+      EXPECT_EQ(intt.pget(i, 0)->limbs[j], input.pget(i, 0)->limbs[j]);  // the inverse result must equal input, limb by limb
     }
   }
 }
@@ -136,12 +141,12 @@ BENCHMARK(BM_intt_mont_benchmark)->Unit(::benchmark::kMillisecond);
 //   L1 Data 64 KiB
 //   L1 Instruction 128 KiB
 //   L2 Unified 4096 KiB (x14)
-// Load Average: 6.49, 5.64, 5.49
-// -------------------------------------------------------------------------
-// Benchmark                               Time             CPU   Iterations
-// -------------------------------------------------------------------------
-// BM_ntt_benchmark                     1656 ms         1050 ms            1
-// BM_intt_benchmark/iterations:1       1791 ms         1090 ms            1
-// BM_ntt_mont_benchmark                38.6 ms         18.6 ms           40
-// BM_intt_mont_benchmark               99.4 ms         56.4 ms           11
+// Load Average: 8.66, 7.19, 7.37
+// -----------------------------------------------------------------
+// Benchmark                       Time             CPU   Iterations
+// -----------------------------------------------------------------
+// BM_ntt_benchmark             1603 ms         1085 ms            1
+// BM_intt_benchmark            1585 ms         1120 ms            1
+// BM_ntt_mont_benchmark        34.7 ms         16.8 ms           42
+// BM_intt_mont_benchmark       33.8 ms         16.6 ms           42
 // NOLINTEND()
@@ -12,8 +12,10 @@ func.func @test_canonicalize_intt_after_ntt(%p0 : !poly_ty) -> !poly_ty {
   // CHECK-NOT: poly.ntt
   // CHECK-NOT: poly.intt
   // CHECK: %[[RESULT:.*]] = poly.add %[[P]], %[[P]]  : [[T]]
-  %t0 = poly.ntt %p0 {root=#root} : !poly_ty -> !tensor_ty
-  %p1 = poly.intt %t0 {root=#root} : !tensor_ty -> !poly_ty
+  %coeffs = poly.to_tensor %p0 : !poly_ty -> !tensor_ty
+  %evals = poly.ntt %coeffs {root=#root} : !tensor_ty
+  %coeffs1 = poly.intt %evals {root=#root} : !tensor_ty
+  %p1 = poly.from_tensor %coeffs1 : !tensor_ty -> !poly_ty
   %p2 = poly.add %p1, %p1 : !poly_ty
   // CHECK: return %[[RESULT]] : [[T]]
   return %p2 : !poly_ty
@@ -25,9 +27,9 @@ func.func @test_canonicalize_ntt_after_intt(%t0 : !tensor_ty) -> !tensor_ty {
   // CHECK-NOT: poly.intt
   // CHECK-NOT: poly.ntt
   // CHECK: %[[RESULT:.*]] = field.pf.add %[[X]], %[[X]] : [[T]]
-  %p0 = poly.intt %t0 {root=#root} : !tensor_ty -> !poly_ty
-  %t1 = poly.ntt %p0 {root=#root} : !poly_ty -> !tensor_ty
-  %t2 = field.pf.add %t1, %t1 : !tensor_ty
+  %coeffs = poly.intt %t0 {root=#root} : !tensor_ty
+  %evals = poly.ntt %coeffs {root=#root} : !tensor_ty
+  %evals2 = field.pf.add %evals, %evals : !tensor_ty
   // CHECK: return %[[RESULT]] : [[T]]
-  return %t2 : !tensor_ty
+  return %evals2 : !tensor_ty
 }
@@ -14,16 +14,16 @@ func.func private @printMemrefI32(memref<*xi32>) attributes { llvm.emit_c_interf
 func.func @test_poly_ntt() {
   %coeffsRaw = arith.constant dense<[1,2,3,4]> : tensor<4xi32>
   %coeffs = field.pf.encapsulate %coeffsRaw : tensor<4xi32> -> tensor<4x!coeff_ty>
-  %poly = poly.from_tensor %coeffs : tensor<4x!coeff_ty> -> !poly_ty
-  %res = poly.ntt %poly {root=#root} : !poly_ty -> tensor<4x!coeff_ty>
+  %res = poly.ntt %coeffs {root=#root} : tensor<4x!coeff_ty>
 
   %extract = field.pf.extract %res : tensor<4x!coeff_ty> -> tensor<4xi32>
   %1 = bufferization.to_memref %extract : tensor<4xi32> to memref<4xi32>
   %U = memref.cast %1 : memref<4xi32> to memref<*xi32>
   func.call @printMemrefI32(%U) : (memref<*xi32>) -> ()
 
-  %intt = poly.intt %res {root=#root} : tensor<4x!coeff_ty> -> !poly_ty
-  %res2 = poly.to_tensor %intt : !poly_ty -> tensor<4x!coeff_ty>
+  %intt = poly.intt %res {root=#root} : tensor<4x!coeff_ty>
+  %poly = poly.from_tensor %intt : tensor<4x!coeff_ty> -> !poly_ty
+  %res2 = poly.to_tensor %poly : !poly_ty -> tensor<4x!coeff_ty>
   %extract2 = field.pf.extract %res2 : tensor<4x!coeff_ty> -> tensor<4xi32>
   %2= bufferization.to_memref %extract2 : tensor<4xi32> to memref<4xi32>
   %U2 = memref.cast %2 : memref<4xi32> to memref<*xi32>
 
@@ -67,17 +67,17 @@ func.func @test_lower_from_tensor(%t : tensor<4x!PF1>) -> !poly_ty1 {
 }
 
 // CHECK-LABEL: @test_lower_ntt
-// CHECK-SAME: (%[[INPUT:.*]]: [[P:.*]]) -> [[T:.*]] {
-func.func @test_lower_ntt(%input : !poly_ty1) -> tensor<4x!PF1> {
+// CHECK-SAME: (%[[INPUT:.*]]: [[T:.*]]) -> [[T]] {
+func.func @test_lower_ntt(%input : tensor<4x!PF1>) -> tensor<4x!PF1> {  // argument and result share one tensor type, which the CHECK-SAME above pins via [[T]]
   // CHECK-NOT: poly.ntt
-  %res = poly.ntt %input {root=#root} : !poly_ty1 -> tensor<4x!PF1>
+  %res = poly.ntt %input {root=#root} : tensor<4x!PF1>  // tensor in, tensor out: a single type is spelled for the op
   return %res: tensor<4x!PF1>
 }
 
 // CHECK-LABEL: @test_lower_intt
-// CHECK-SAME: (%[[INPUT:.*]]: [[T:.*]]) -> [[P:.*]] {
-func.func @test_lower_intt(%input : tensor<4x!PF1>) -> !poly_ty1 {
+// CHECK-SAME: (%[[INPUT:.*]]: [[T:.*]]) -> [[T]] {
+func.func @test_lower_intt(%input : tensor<4x!PF1>) -> tensor<4x!PF1> {  // argument and result share one tensor type, which the CHECK-SAME above pins via [[T]]
   // CHECK-NOT: poly.intt
-  %res = poly.intt %input {root=#root} : tensor<4x!PF1> -> !poly_ty1
-  return %res: !poly_ty1
+  %res = poly.intt %input {root=#root} : tensor<4x!PF1>  // tensor in, tensor out: a single type is spelled for the op
+  return %res: tensor<4x!PF1>
 }
@@ -21,6 +21,7 @@ cc_library(
         "@llvm-project//mlir:BufferizationDialect",
         "@llvm-project//mlir:IR",
         "@llvm-project//mlir:LinalgDialect",
+        "@llvm-project//mlir:MemRefDialect",
         "@llvm-project//mlir:Pass",
         "@llvm-project//mlir:Support",
         "@llvm-project//mlir:TensorDialect",
 
@@ -6,6 +6,7 @@
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Dialect/Bufferization/IR/Bufferization.h"
 #include "mlir/Dialect/Linalg/IR/Linalg.h"
+#include "mlir/Dialect/MemRef/IR/MemRef.h"
 #include "mlir/Dialect/Tensor/IR/Tensor.h"
 #include "mlir/IR/BuiltinAttributeInterfaces.h"
 #include "mlir/IR/BuiltinAttributes.h"
@@ -351,8 +352,10 @@ void PrimeFieldToModArith::runOnOperation() {
       ConvertAny<affine::AffineForOp>, ConvertAny<affine::AffineParallelOp>,
       ConvertAny<affine::AffineLoadOp>, ConvertAny<affine::AffineStoreOp>,
       ConvertAny<affine::AffineYieldOp>, ConvertAny<linalg::GenericOp>,
-      ConvertAny<linalg::YieldOp>, ConvertAny<tensor::CastOp>,
-      ConvertAny<tensor::ExtractOp>, ConvertAny<tensor::FromElementsOp>,
+      ConvertAny<linalg::MapOp>, ConvertAny<memref::LoadOp>,
+      ConvertAny<memref::StoreOp>, ConvertAny<linalg::YieldOp>,
+      ConvertAny<tensor::CastOp>, ConvertAny<tensor::ExtractOp>,
+      ConvertAny<tensor::FromElementsOp>,
       ConvertAny<bufferization::MaterializeInDestinationOp>,
       ConvertAny<bufferization::ToMemrefOp>,
       ConvertAny<bufferization::ToTensorOp>, ConvertAny<tensor::InsertOp>>(
@@ -364,9 +367,10 @@ void PrimeFieldToModArith::runOnOperation() {
       affine::AffineForOp, affine::AffineParallelOp, affine::AffineLoadOp,
       affine::AffineStoreOp, affine::AffineYieldOp,
       bufferization::MaterializeInDestinationOp, bufferization::ToMemrefOp,
-      bufferization::ToTensorOp, linalg::GenericOp, linalg::YieldOp,
-      tensor::CastOp, tensor::ExtractOp, tensor::FromElementsOp,
-      tensor::InsertOp>([&](auto op) { return typeConverter.isLegal(op); });
+      bufferization::ToTensorOp, linalg::GenericOp, linalg::MapOp,
+      linalg::YieldOp, memref::LoadOp, memref::StoreOp, tensor::CastOp,
+      tensor::ExtractOp, tensor::FromElementsOp, tensor::InsertOp>(
+      [&](auto op) { return typeConverter.isLegal(op); });
 
   if (failed(applyPartialConversion(module, target, std::move(patterns)))) {
     signalPassFailure();
 
@@ -21,6 +21,7 @@ cc_library(
         "@llvm-project//mlir:BufferizationDialect",
         "@llvm-project//mlir:IR",
         "@llvm-project//mlir:LinalgDialect",
+        "@llvm-project//mlir:MemRefDialect",
         "@llvm-project//mlir:Pass",
         "@llvm-project//mlir:SCFDialect",
         "@llvm-project//mlir:Support",