#include "common.cuh"
2+
namespace airbender::ops::cub::device_reduce {

// Emits one host-callable wrapper `ab_reduce_<op>_<arg_t>` around
// cub::DeviceReduce::Reduce: reduces `num_items` elements of `d_in` into
// `d_out[0]` using functor `op<arg_t>` with identity `op<arg_t>::init()`.
// Standard CUB two-phase protocol: call with d_temp_storage == nullptr to
// query the required temp_storage_bytes, then call again with the buffer.
// Runs asynchronously on `stream`; returns the CUB/CUDA error code.
#define REDUCE(op, arg_t)                                                                                                                             \
  EXTERN cudaError_t ab_reduce_##op##_##arg_t(void *d_temp_storage, size_t &temp_storage_bytes, const arg_t *d_in, arg_t *d_out, const int num_items, \
                                              const cudaStream_t stream) {                                                                            \
    return DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, op<arg_t>(), op<arg_t>::init(), stream);                  \
  }

REDUCE(add, bf);
REDUCE(add, e2);
REDUCE(add, e4);
REDUCE(add, e6);
REDUCE(mul, bf);
REDUCE(mul, e2);
REDUCE(mul, e4);
REDUCE(mul, e6);

// Minimal random-access "iterator" yielding `offset + idx * stride`, used as
// the begin/end offset sequences for DeviceSegmentedReduce below: segment i
// spans element indices [i * stride, i * stride + num_items).
// NOTE(review): CUB >= 2.3 inspects iterator traits, hence the version-gated
// typedefs; older CUB only requires operator[].
struct offset_iterator {
#if CUB_VERSION >= 200300
  using iterator_category = cuda::std::random_access_iterator_tag;
  using value_type = int;
  using difference_type = int;
  using pointer = int *;
  using reference = int &;
#endif
  const int offset;
  const int stride;
  DEVICE_FORCEINLINE int operator[](const int idx) const { return offset + idx * stride; }
};

// Emits `ab_segmented_reduce_<op>_<arg_t>` around DeviceSegmentedReduce::Reduce:
// reduces each of `num_segments` segments of `d_in` into d_out[segment].
// Segment i covers `num_items` consecutive elements starting at i * d_in.stride
// (presumably one column of the matrix view — confirm matrix_accessor layout).
// Same CUB two-phase temp-storage protocol as REDUCE above.
#define SEGMENTED_REDUCE(op, arg_t)                                                                                                                   \
  EXTERN cudaError_t ab_segmented_reduce_##op##_##arg_t(void *d_temp_storage, size_t &temp_storage_bytes, const matrix_accessor<arg_t> d_in,          \
                                                        arg_t *d_out, const int num_segments, const int num_items, const cudaStream_t stream) {       \
    const int stride = static_cast<int>(d_in.stride);                                                                                                 \
    const offset_iterator d_begin_offsets{0, stride};                                                                                                 \
    const offset_iterator d_end_offsets{num_items, stride};                                                                                           \
    return DeviceSegmentedReduce::Reduce(d_temp_storage, temp_storage_bytes, d_in.ptr, d_out, num_segments, d_begin_offsets, d_end_offsets,           \
                                         op<arg_t>(), op<arg_t>::init(), stream);                                                                     \
  }

SEGMENTED_REDUCE(add, bf);
SEGMENTED_REDUCE(add, e2);
SEGMENTED_REDUCE(add, e4);
SEGMENTED_REDUCE(add, e6);
SEGMENTED_REDUCE(mul, bf);
SEGMENTED_REDUCE(mul, e2);
SEGMENTED_REDUCE(mul, e4);
SEGMENTED_REDUCE(mul, e6);

} // namespace airbender::ops::cub::device_reduce