Skip to content
Open
Show file tree
Hide file tree
Changes from 15 commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
5b3078e
test identity case
qwang98 Nov 17, 2025
26f4389
fix compilation
qwang98 Nov 17, 2025
2acdfa2
remove unwanted fill_trace_new
qwang98 Nov 17, 2025
94f022a
added offset support for APC and sub; currently backward compatible w…
qwang98 Nov 18, 2025
87ff97c
changes so far
qwang98 Nov 19, 2025
6ac91a2
wip still debugging but first execution of the single apc in the fibo…
qwang98 Nov 25, 2025
a913881
trace for all rows generated
qwang98 Nov 26, 2025
67331b4
finally fixed the memory misalignment bug
qwang98 Nov 26, 2025
7f6ee7b
pass APC width and commented out some prints
qwang98 Nov 27, 2025
d1b735d
removed gap calculation from write array
qwang98 Dec 2, 2025
c44dafe
simplified COL_WRITE_VALUE_NEW by removing subs and perform in write_…
qwang98 Dec 2, 2025
3b297e8
use write_new for limbs wirte
qwang98 Dec 2, 2025
64a946a
skip add_count if we in apc
qwang98 Dec 2, 2025
effb1d0
remove subs from most if not all API
qwang98 Dec 2, 2025
fd69e72
finalize decompose_new
qwang98 Dec 2, 2025
46394a1
finalized adapter.fill_trace_row_new
qwang98 Dec 2, 2025
51d2efd
finalize core.fill_trace_row
qwang98 Dec 2, 2025
6063ea4
reordered cuda and rust APIs
qwang98 Dec 2, 2025
cead7cb
reordered rust and cuda ABIs
qwang98 Dec 2, 2025
54aad09
removed all prints
qwang98 Dec 2, 2025
2f1f303
fixed bug on fill zero for padding rows
qwang98 Dec 3, 2025
9898818
finally fixed keccak
qwang98 Dec 4, 2025
897e5d4
added opcode flag optimization
qwang98 Dec 4, 2025
fd2dd1a
pass nullptr for offsets for non apc
qwang98 Dec 4, 2025
894f3c2
optimization: check is_apc in slice_from, write_new, and write_array_…
qwang98 Dec 4, 2025
3e77951
fix thread error; keccak fully fixed and tested
qwang98 Dec 5, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 27 additions & 0 deletions crates/circuits/primitives/cuda/include/primitives/histogram.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,33 @@ struct VariableRangeChecker {
}
#ifdef CUDA_DEBUG
assert(bits_remaining == 0 && x == 0);
#endif
}

__device__ __forceinline__ void decompose_new(
uint32_t x,
size_t bits,
RowSliceNew limbs,
const size_t limbs_len
) {
size_t range_max_bits = max_bits();
#ifdef CUDA_DEBUG
assert(limbs_len >= d_div_ceil(bits, range_max_bits));
#endif
uint32_t mask = (1 << range_max_bits) - 1;
size_t bits_remaining = bits;
#pragma unroll
for (int i = 0; i < limbs_len; i++) {
uint32_t limb_u32 = x & mask;
limbs.write_new(i, limb_u32);
if (!limbs.is_apc) {
add_count(limb_u32, min(bits_remaining, range_max_bits));
}
Comment on lines +101 to +104
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is the only decompose_new difference from decompose, which has:

limbs[i] = limb_u32;
add_count(limb_u32, min(bits_remaining, range_max_bits));

x >>= range_max_bits;
bits_remaining -= min(bits_remaining, range_max_bits);
}
#ifdef CUDA_DEBUG
assert(bits_remaining == 0 && x == 0);
#endif
}
};
Expand Down
11 changes: 11 additions & 0 deletions crates/circuits/primitives/cuda/include/primitives/less_than.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,17 @@ __device__ __forceinline__ void generate_subrow(
) {
rc.decompose(y - x - 1, max_bits, lower_decomp, lower_decomp_len);
}

__device__ __forceinline__ void generate_subrow_new(
VariableRangeChecker &rc,
const uint32_t max_bits,
uint32_t x,
uint32_t y,
const size_t lower_decomp_len,
RowSliceNew lower_decomp
) {
rc.decompose_new(y - x - 1, max_bits, lower_decomp, lower_decomp_len);
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No difference from generate_subrow_new except that we use decompose_new.

}
} // namespace AssertLessThan

namespace IsLessThan {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
#pragma once

#include <cstdio>

// Utility buffer to print a single APC row atomically from device code.
struct RowPrintBuffer {
static constexpr int kCapacity = 8192;
char data[kCapacity];
int len;

__device__ __forceinline__ void reset() { len = 0; }

__device__ __forceinline__ void append_char(char c) {
if (len < kCapacity - 1) {
data[len++] = c;
}
}

__device__ __forceinline__ void append_literal(const char *literal) {
for (const char *ptr = literal; *ptr != '\0'; ++ptr) {
append_char(*ptr);
}
}

__device__ __forceinline__ void append_uint(unsigned long long value) {
char tmp[32];
int tmp_len = 0;

if (value == 0) {
tmp[tmp_len++] = '0';
} else {
while (value > 0 && tmp_len < static_cast<int>(sizeof(tmp))) {
tmp[tmp_len++] = static_cast<char>('0' + (value % 10));
value /= 10;
}
}

for (int i = tmp_len - 1; i >= 0; --i) {
append_char(tmp[i]);
}
}

__device__ __forceinline__ void flush() {
data[len] = '\0';
printf("%s", data);
}

// Execute `fn` with this buffer after clearing it, then flush.
// `fn` must be a device callable accepting `RowPrintBuffer &`.
template <typename Fn>
__device__ __forceinline__ void write_with(Fn fn) {
reset();
fn(*this);
flush();
}
};
196 changes: 196 additions & 0 deletions crates/circuits/primitives/cuda/include/primitives/trace_access.h
Original file line number Diff line number Diff line change
@@ -1,7 +1,127 @@
#pragma once

#include "fp.h"
#include "primitives/row_print_buffer.cuh"
#include <cstddef>
#include <cstdint>
#include <type_traits>


__device__ __forceinline__ size_t number_of_gaps_in(const uint32_t *sub, size_t start, size_t len);

/// A RowSlice is a contiguous section of a row in col-based trace.
struct RowSliceNew {
Fp *ptr;
size_t stride;
size_t optimized_offset;
size_t dummy_offset;
Comment on lines +13 to +17
Copy link
Author

@qwang98 qwang98 Nov 18, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added this new RowSliceNew struct that stores the offsets.

optimized_offset is the smaller one, basically the cumulative dummy_offset subtracted by gap.

dummy_offset is the larger one, basically the cumulative COL_INDEX of original columns.

uint32_t *subs;
bool is_apc;


__device__ RowSliceNew(Fp *ptr, size_t stride, size_t optimized_offset, size_t dummy_offset, uint32_t *subs, bool is_apc) : ptr(ptr), stride(stride), optimized_offset(optimized_offset), dummy_offset(dummy_offset), subs(subs), is_apc(is_apc) {}

__device__ __forceinline__ Fp &operator[](size_t column_index) const {
// While implementing tracegen for SHA256, we encountered what we believe to be an nvcc
// compiler bug. Occasionally, at various non-zero PTXAS optimization levels the compiler
// tries to replace this multiplication with a series of SHL, ADD, and AND instructions
// that we believe erroneously adds ~2^49 to the final address via an improper carry
// propagation. To read more, see https://github.com/stephenh-axiom-xyz/cuda-illegal.
return ptr[column_index * stride];
}

__device__ static RowSliceNew null() { return RowSliceNew(nullptr, 0, 0, 0, nullptr, false); }

__device__ bool is_valid() const { return ptr != nullptr; }

template <typename T>
__device__ __forceinline__ void write(size_t column_index, T value) const {
ptr[column_index * stride] = value;
}


// #define COL_WRITE_VALUE_NEW(ROW, STRUCT, FIELD, VALUE, SUB) \
// do { \
// auto _row_ref = (ROW); \
// const auto *_sub_ptr = (SUB); \
// const size_t _col_idx = COL_INDEX(STRUCT, FIELD); \
// const auto _apc_idx = _sub_ptr[_col_idx + _row_ref.dummy_offset]; \
// const auto _value_tmp = (VALUE); \
// if (_apc_idx != UINT32_MAX) { \
// _row_ref.write(_apc_idx - _row_ref.optimized_offset, _value_tmp); \
// }


// /// Write a single value into `FIELD` of struct `STRUCT<T>` at a given row.
// #define COL_WRITE_VALUE(ROW, STRUCT, FIELD, VALUE) (ROW).write(COL_INDEX(STRUCT, FIELD), VALUE)


template <typename T>
__device__ __forceinline__ void write_new(size_t column_index, T value) const {
const uint32_t apc_idx = subs[dummy_offset + column_index];
if (apc_idx != UINT32_MAX) {
ptr[(apc_idx - optimized_offset) * stride] = value;
}
}

template <typename T>
__device__ __forceinline__ void write_array(size_t column_index, size_t length, const T *values)
const {
#pragma unroll
for (size_t i = 0; i < length; i++) {
ptr[(column_index + i) * stride] = values[i];
}
}

template <typename T>
__device__ __forceinline__ void write_array_new(size_t column_index, size_t length, const T *values)
const {
#pragma unroll
for (size_t i = 0; i < length; i++) {
const uint32_t apc_idx = subs[dummy_offset + column_index + i];
if (apc_idx != UINT32_MAX) {
ptr[(apc_idx - optimized_offset) * stride] = values[i];
}
}
}

template <typename T>
__device__ __forceinline__ void write_bits(size_t column_index, const T value) const {
#pragma unroll
for (size_t i = 0; i < sizeof(T) * 8; i++) {
ptr[(column_index + i) * stride] = (value >> i) & 1;
}
}

__device__ __forceinline__ void fill_zero(size_t column_index_from, size_t length) const {
#pragma unroll
for (size_t i = 0, c = column_index_from; i < length; i++, c++) {
ptr[c * stride] = 0;
}
}

__device__ __forceinline__ RowSliceNew slice_from(size_t column_index) const {
uint32_t gap = number_of_gaps_in(subs, dummy_offset, column_index);
// RowPrintBuffer buffer;
// buffer.reset();
// buffer.append_literal("slice_from: optimized_offset before ");
// buffer.append_uint(optimized_offset);
// buffer.append_literal(" | dummy_offset before ");
// buffer.append_uint(dummy_offset);
// buffer.append_literal(" | column_index ");
// buffer.append_uint(column_index);
// buffer.append_literal(" | gap ");
// buffer.append_uint(gap);
// buffer.append_literal("\n");
// buffer.flush();

return RowSliceNew(ptr + (column_index - gap) * stride, stride, optimized_offset + column_index - gap, dummy_offset + column_index, subs, is_apc);
}

__device__ __forceinline__ RowSliceNew shift_row(size_t n) const {
return RowSliceNew(ptr + n, stride, optimized_offset, dummy_offset, subs, is_apc);
}
};

/// A RowSlice is a contiguous section of a row in col-based trace.
struct RowSlice {
Expand Down Expand Up @@ -61,6 +181,51 @@ struct RowSlice {
}
};

template <typename T>
__device__ __forceinline__ unsigned long long to_debug_uint(T value) {
using Base = std::remove_cv_t<std::remove_reference_t<T>>;
if constexpr (std::is_same_v<Base, Fp>) {
return static_cast<unsigned long long>(value.asRaw());
} else {
return static_cast<unsigned long long>(value);
}
}

template <typename RowT, typename ValueT>
__device__ __forceinline__ void debug_log_col_write_new(
const RowT &row,
size_t column_index,
uint32_t apc_idx,
ValueT value
) {
RowPrintBuffer buffer;
buffer.reset();
buffer.append_literal("COL_WRITE VALUE ");
buffer.append_uint(to_debug_uint(value));
buffer.append_literal(" from col_idx ");
buffer.append_uint(static_cast<unsigned long long>(column_index));
buffer.append_literal(" which is absolute col_idx ");
buffer.append_uint(
static_cast<unsigned long long>(column_index + row.dummy_offset)
);
if (apc_idx != UINT32_MAX) {
buffer.append_literal(" to apc_idx ");
buffer.append_uint(apc_idx);
buffer.append_literal(" which is relative apc_idx ");
long long relative = static_cast<long long>(apc_idx)
- static_cast<long long>(row.optimized_offset);
if (relative >= 0) {
buffer.append_uint(static_cast<unsigned long long>(relative));
} else {
buffer.append_literal("(negative)");
}
} else {
buffer.append_literal(" (skipped; apc_idx == UINT32_MAX)");
}
buffer.append_literal("\n");
buffer.flush();
}

/// Compute the 0-based column index of member `FIELD` within struct template `STRUCT<T>`,
/// by instantiating it as `STRUCT<uint8_t>` so that offsetof yields the element index.
#define COL_INDEX(STRUCT, FIELD) (offsetof(STRUCT<uint8_t>, FIELD))
Expand All @@ -71,10 +236,30 @@ struct RowSlice {
/// Write a single value into `FIELD` of struct `STRUCT<T>` at a given row.
#define COL_WRITE_VALUE(ROW, STRUCT, FIELD, VALUE) (ROW).write(COL_INDEX(STRUCT, FIELD), VALUE)

/// Conditionally write a single value into `FIELD` based on APC sub-columns.
/// TODO: move gating to write
/// #define COL_WRITE_VALUE_NEW(ROW, STRUCT, FIELD, VALUE, SUB)
/// do {
/// auto _row_ref = (ROW);
/// const auto *_sub_ptr = (SUB);
/// const size_t _col_idx = COL_INDEX(STRUCT, FIELD);
/// const auto _apc_idx = _sub_ptr[_col_idx + _row_ref.dummy_offset];
/// const auto _value_tmp = (VALUE);
/// if (_apc_idx != UINT32_MAX) {
/// _row_ref.write(_apc_idx - _row_ref.optimized_offset, _value_tmp);
/// }
/// } while (0)
/// debug_log_col_write_new(_row_ref, _col_idx, _apc_idx, _value_tmp);
#define COL_WRITE_VALUE_NEW(ROW, STRUCT, FIELD, VALUE) (ROW).write_new(COL_INDEX(STRUCT, FIELD), VALUE)

/// Write an array of values into the fixed‐length `FIELD` array of `STRUCT<T>` for one row.
#define COL_WRITE_ARRAY(ROW, STRUCT, FIELD, VALUES) \
(ROW).write_array(COL_INDEX(STRUCT, FIELD), COL_ARRAY_LEN(STRUCT, FIELD), VALUES)

/// Write an array of values into the fixed‐length `FIELD` array of `STRUCT<T>` for one row.
#define COL_WRITE_ARRAY_NEW(ROW, STRUCT, FIELD, VALUES) \
(ROW).write_array_new(COL_INDEX(STRUCT, FIELD), COL_ARRAY_LEN(STRUCT, FIELD), VALUES)

/// Write a single value bits into `FIELD` of struct `STRUCT<T>` at a given row.
#define COL_WRITE_BITS(ROW, STRUCT, FIELD, VALUE) (ROW).write_bits(COL_INDEX(STRUCT, FIELD), VALUE)

Expand All @@ -83,3 +268,14 @@ struct RowSlice {
(ROW).fill_zero( \
COL_INDEX(STRUCT, FIELD), sizeof(static_cast<STRUCT<uint8_t> *>(nullptr)->FIELD) \
)

__device__ __forceinline__ size_t number_of_gaps_in(const uint32_t *sub, size_t start, size_t len) {
size_t gaps = 0;
#pragma unroll
for (size_t i = start; i < start + len; ++i) {
if (sub[i] == UINT32_MAX) {
++gaps;
}
}
return gaps;
}
10 changes: 10 additions & 0 deletions crates/circuits/primitives/cuda/src/range_tuple.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
#include <cstdio>

#include "fp.h"
#include "launcher.cuh"
#include "primitives/row_print_buffer.cuh"

__global__ void range_tuple_checker_tracegen(
const uint32_t *count,
Expand All @@ -8,7 +11,14 @@ __global__ void range_tuple_checker_tracegen(
size_t num_bins
) {
uint32_t idx = blockIdx.x * blockDim.x + threadIdx.x;
// RowPrintBuffer buffer;
// buffer.reset();
// buffer.append_literal("num_bins=");
// buffer.append_uint(num_bins);
// buffer.append_literal("\n");
// buffer.flush();
if (idx < num_bins) {
// printf("idx=%u\n", idx);
trace[idx] = Fp(count[idx] + (cpu_count ? cpu_count[idx] : 0));
}
}
Expand Down
3 changes: 3 additions & 0 deletions crates/circuits/primitives/src/range_tuple/cuda.rs
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,9 @@ impl<RA, const N: usize> Chip<RA, GpuBackend> for RangeTupleCheckerChipGPU<N> {
// ATTENTION: we create a new buffer to copy `count` into because this chip is stateful and
// `count` will be reused.
let trace = DeviceMatrix::<F>::with_capacity(self.count.len(), NUM_RANGE_TUPLE_COLS);
println!("d_count len: {}", self.count.len());
println!("trace len: {}", trace.buffer().len());
println!("cpu_count len: {}", cpu_count.as_ref().map_or(0, |b| b.len()));
unsafe {
tracegen(&self.count, &cpu_count, trace.buffer()).unwrap();
}
Expand Down
12 changes: 12 additions & 0 deletions crates/vm/cuda/include/system/memory/controller.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,18 @@ struct MemoryAuxColsFactory {
COL_WRITE_VALUE(row, MemoryBaseAuxCols, prev_timestamp, prev_timestamp);
}

__device__ void fill_new(RowSliceNew row, uint32_t prev_timestamp, uint32_t timestamp) {
AssertLessThan::generate_subrow_new(
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The same as fill except that we use generate_subrow_new here.

range_checker,
timestamp_max_bits,
prev_timestamp,
timestamp,
AUX_LEN,
row.slice_from(COL_INDEX(MemoryBaseAuxCols, timestamp_lt_aux))
);
COL_WRITE_VALUE_NEW(row, MemoryBaseAuxCols, prev_timestamp, prev_timestamp);
}

__device__ void fill_zero(RowSlice row) {
row.fill_zero(0, sizeof(MemoryBaseAuxCols<uint8_t>));
}
Expand Down
Loading
Loading