Skip to content

Commit 0b76678

Browse files
committed
wip
1 parent ff72c7b commit 0b76678

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

46 files changed

+3690
-11
lines changed

gpu_prover_new/Cargo.toml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,10 +11,14 @@ categories.workspace = true
1111
build = "build/main.rs"
1212

1313
[dependencies]
14+
fft = { workspace = true }
1415
field = { workspace = true }
1516

1617
era_cudart = "0.154"
1718
era_cudart_sys = "0.154"
19+
itertools = "*"
20+
log = "0.4.29"
21+
rayon = "*"
1822

1923
[build-dependencies]
2024
cmake = "0.1"

gpu_prover_new/native/CMakeLists.txt

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -6,16 +6,13 @@ if ((NOT DEFINED CMAKE_CUDA_ARCHITECTURES) OR (CMAKE_CUDA_ARCHITECTURES STREQUAL
66
set(CMAKE_CUDA_ARCHITECTURES native)
77
endif ()
88
add_library(gpu_prover_new_native STATIC
9-
bench/field.cu
10-
ops/blake2s.cu
11-
ops/complex.cu
12-
ops/complex.cuh
13-
ops/simple.cu
149
common.cuh
1510
field.cuh
1611
memory.cuh
1712
ptx.cuh
1813
)
14+
add_subdirectory(bench)
15+
add_subdirectory(ops)
1916
set_target_properties(gpu_prover_new_native PROPERTIES CUDA_STANDARD 20)
2017
set_target_properties(gpu_prover_new_native PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
2118
set_target_properties(gpu_prover_new_native PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS ON)
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
target_sources(gpu_prover_new_native PRIVATE field.cu)
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
target_sources(gpu_prover_new_native PRIVATE
2+
blake2s.cu
3+
complex.cu
4+
complex.cuh
5+
simple.cu
6+
)
7+
add_subdirectory(cub)

gpu_prover_new/native/ops/complex.cu

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -169,7 +169,6 @@ DEVICE_FORCEINLINE void bit_reverse_naive(const matrix_getter<T, ld_modifier::cs
169169
BIT_REVERSE_NAIVE(bf);
170170
BIT_REVERSE_NAIVE(e2);
171171
BIT_REVERSE_NAIVE(e4);
172-
BIT_REVERSE_NAIVE(e6);
173172
BIT_REVERSE_NAIVE(dg);
174173

175174
DEVICE_FORCEINLINE uint2 triangular_index_flat_to_two_dim(const unsigned index, const unsigned m) {
@@ -235,7 +234,6 @@ DEVICE_FORCEINLINE void bit_reverse(const matrix_getter<T, ld_modifier::cs> src,
235234
BIT_REVERSE(bf, bf, 0);
236235
BIT_REVERSE(e2, e2, 0);
237236
BIT_REVERSE(e4, e4, 0);
238-
BIT_REVERSE(e6, e6, 1);
239237
BIT_REVERSE(dg, e4, 1);
240238

241239
// EXTERN __global__ void ab_fold_kernel(const e4 *challenge, const e4 *src, e4 *dst, const unsigned root_offset, const unsigned log_count) {
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
target_sources(gpu_prover_new_native PRIVATE
2+
common.cuh
3+
device_radix_sort.cu
4+
device_reduce.cu
5+
device_run_length_encode.cu
6+
)
7+
add_subdirectory(device_scan)
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
#pragma once
2+
3+
#include "../../field.cuh"
4+
#include "../../memory.cuh"
5+
#include <cub/cub.cuh>
6+
7+
using namespace ::cub;
8+
using namespace ::airbender::field;
9+
using namespace ::airbender::memory;
10+
11+
namespace airbender::ops::cub {
12+
13+
#define BINARY_OP(op, init_fn) \
14+
template <typename T> struct op { \
15+
DEVICE_FORCEINLINE T operator()(const T &a, const T &b) const { return T::op(a, b); } \
16+
static HOST_DEVICE_FORCEINLINE T init() { return T::init_fn(); } \
17+
}
18+
19+
BINARY_OP(add, ZERO);
20+
BINARY_OP(mul, ONE);
21+
22+
template <> struct add<u32> {
23+
DEVICE_FORCEINLINE u32 operator()(const u32 &a, const u32 &b) const { return a + b; }
24+
static HOST_DEVICE_FORCEINLINE u32 init() { return 0; }
25+
};
26+
27+
template <> struct mul<u32> {
28+
DEVICE_FORCEINLINE u32 operator()(const u32 &a, const u32 &b) const { return a * b; }
29+
static HOST_DEVICE_FORCEINLINE u32 init() { return 1; }
30+
};
31+
32+
} // namespace airbender::ops::cub
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
#include "common.cuh"
2+
3+
namespace airbender::ops::cub::device_radix_sort {
4+
5+
#define SORT_KEYS(dir, arg_t, method) \
6+
EXTERN cudaError_t ab_sort_keys_##dir##_##arg_t(void *d_temp_storage, size_t &temp_storage_bytes, const arg_t *d_keys_in, arg_t *d_keys_out, \
7+
const unsigned num_items, const int begin_bit, const int end_bit, const cudaStream_t stream) { \
8+
return DeviceRadixSort::method(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items, begin_bit, end_bit, stream); \
9+
}
10+
11+
SORT_KEYS(a, u32, SortKeys);
12+
SORT_KEYS(d, u32, SortKeysDescending);
13+
14+
#define SORT_PAIRS(dir, arg_k_t, arg_v_t, method) \
15+
EXTERN cudaError_t ab_sort_pairs_##dir##_##arg_k_t##_##arg_v_t( \
16+
void *d_temp_storage, size_t &temp_storage_bytes, const arg_k_t *d_keys_in, arg_k_t *d_keys_out, const arg_v_t *d_values_in, arg_v_t *d_values_out, \
17+
const unsigned num_items, const int begin_bit, const int end_bit, const cudaStream_t stream) { \
18+
return DeviceRadixSort::method(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, d_values_in, d_values_out, num_items, begin_bit, end_bit, \
19+
stream); \
20+
}
21+
22+
SORT_PAIRS(a, u32, u32, SortPairs);
23+
SORT_PAIRS(d, u32, u32, SortPairsDescending);
24+
25+
} // namespace airbender::ops::cub::device_radix_sort
Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
#include "common.cuh"
2+
3+
namespace airbender::ops::cub::device_reduce {
4+
5+
#define REDUCE(op, arg_t) \
6+
EXTERN cudaError_t ab_reduce_##op##_##arg_t(void *d_temp_storage, size_t &temp_storage_bytes, const arg_t *d_in, arg_t *d_out, const int num_items, \
7+
const cudaStream_t stream) { \
8+
return DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, op<arg_t>(), op<arg_t>::init(), stream); \
9+
}
10+
11+
REDUCE(add, bf);
12+
REDUCE(add, e2);
13+
REDUCE(add, e4);
14+
REDUCE(add, e6);
15+
REDUCE(mul, bf);
16+
REDUCE(mul, e2);
17+
REDUCE(mul, e4);
18+
REDUCE(mul, e6);
19+
20+
struct offset_iterator {
21+
#if CUB_VERSION >= 200300
22+
using iterator_category = cuda::std::random_access_iterator_tag;
23+
using value_type = int;
24+
using difference_type = int;
25+
using pointer = int *;
26+
using reference = int &;
27+
#endif
28+
const int offset;
29+
const int stride;
30+
DEVICE_FORCEINLINE int operator[](const int idx) const { return offset + idx * stride; }
31+
};
32+
33+
#define SEGMENTED_REDUCE(op, arg_t) \
34+
EXTERN cudaError_t ab_segmented_reduce_##op##_##arg_t(void *d_temp_storage, size_t &temp_storage_bytes, const matrix_accessor<arg_t> d_in, arg_t *d_out, \
35+
const int num_segments, const int num_items, const cudaStream_t stream) { \
36+
const int stride = static_cast<int>(d_in.stride); \
37+
const offset_iterator d_begin_offsets{0, stride}; \
38+
const offset_iterator d_end_offsets{num_items, stride}; \
39+
return DeviceSegmentedReduce::Reduce(d_temp_storage, temp_storage_bytes, d_in.ptr, d_out, num_segments, d_begin_offsets, d_end_offsets, op<arg_t>(), \
40+
op<arg_t>::init(), stream); \
41+
}
42+
43+
SEGMENTED_REDUCE(add, bf);
44+
SEGMENTED_REDUCE(add, e2);
45+
SEGMENTED_REDUCE(add, e4);
46+
SEGMENTED_REDUCE(add, e6);
47+
SEGMENTED_REDUCE(mul, bf);
48+
SEGMENTED_REDUCE(mul, e2);
49+
SEGMENTED_REDUCE(mul, e4);
50+
SEGMENTED_REDUCE(mul, e6);
51+
52+
} // namespace airbender::ops::cub::device_reduce
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
#include "common.cuh"
2+
3+
namespace airbender::ops::cub::device_run_length_encode {
4+
5+
EXTERN cudaError_t ab_encode_u32(void *d_temp_storage, size_t &temp_storage_bytes, const u32 *d_in, u32 *d_unique_out, unsigned *d_counts_out,
6+
unsigned *d_num_runs_out, const int num_items, const cudaStream_t stream) {
7+
return DeviceRunLengthEncode::Encode(d_temp_storage, temp_storage_bytes, d_in, d_unique_out, d_counts_out, d_num_runs_out, num_items, stream);
8+
}
9+
10+
} // namespace airbender::ops::cub::device_run_length_encode

0 commit comments

Comments
 (0)