-
Notifications
You must be signed in to change notification settings - Fork 0
[OVM GPU] direct to APC trace gen #50
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: 1.4.1-upgrade
Are you sure you want to change the base?
Changes from 15 commits
5b3078e
26f4389
2acdfa2
94f022a
87ff97c
6ac91a2
a913881
67331b4
7f6ee7b
d1b735d
c44dafe
3b297e8
64a946a
effb1d0
fd69e72
46394a1
51d2efd
6063ea4
cead7cb
54aad09
2f1f303
9898818
897e5d4
fd2dd1a
894f3c2
3e77951
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -80,6 +80,33 @@ struct VariableRangeChecker { | |
| } | ||
| #ifdef CUDA_DEBUG | ||
| assert(bits_remaining == 0 && x == 0); | ||
| #endif | ||
| } | ||
|
|
||
/// Split `x` into `limbs_len` limbs of `max_bits()` bits each (least-significant limb first)
/// and write limb `i` to column `i` of `limbs` through `RowSliceNew::write_new`.
///
/// For every limb a range-check count is recorded with `add_count`, using the number of bits
/// of `bits` still uncovered (capped at `max_bits()`). The count is skipped when
/// `limbs.is_apc` is set; the limb write itself is always attempted.
///
/// @param x          value to decompose
/// @param bits       total bit width of `x` that has to be covered by the limbs
/// @param limbs      destination slice; column `i` receives limb `i`
/// @param limbs_len  number of limbs to emit; must be at least ceil(bits / max_bits())
///
/// NOTE(review): the shifts below assume max_bits() < 32 — confirm against the range
/// checker configuration.
__device__ __forceinline__ void decompose_new(
    uint32_t x,
    size_t bits,
    RowSliceNew limbs,
    const size_t limbs_len
) {
    const size_t range_max_bits = max_bits();
#ifdef CUDA_DEBUG
    assert(limbs_len >= d_div_ceil(bits, range_max_bits));
#endif
    // Build the mask from an unsigned literal: `1 << n` is evaluated as a signed int and
    // overflows once n reaches 31.
    const uint32_t mask = (uint32_t{1} << range_max_bits) - 1;
    size_t bits_remaining = bits;
#pragma unroll
    for (size_t i = 0; i < limbs_len; i++) {
        const uint32_t limb_u32 = x & mask;
        // Width of this limb: a full limb, or whatever is left of `bits` for the last ones.
        const auto limb_bits = min(bits_remaining, range_max_bits);
        limbs.write_new(i, limb_u32);
        // Range-check counts are only recorded for non-APC slices.
        if (!limbs.is_apc) {
            add_count(limb_u32, limb_bits);
        }
        x >>= range_max_bits;
        bits_remaining -= limb_bits;
    }
#ifdef CUDA_DEBUG
    // Every requested bit must have been consumed and nothing of `x` may be left over.
    assert(bits_remaining == 0 && x == 0);
#endif
}
| }; | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -40,6 +40,17 @@ __device__ __forceinline__ void generate_subrow( | |
| ) { | ||
| rc.decompose(y - x - 1, max_bits, lower_decomp, lower_decomp_len); | ||
| } | ||
|
|
||
/// `RowSliceNew` counterpart of `generate_subrow`: decomposes `y - x - 1` into
/// `lower_decomp_len` limbs of `lower_decomp` via `VariableRangeChecker::decompose_new`.
///
/// @param rc                range checker performing the decomposition
/// @param max_bits          bit width passed to the decomposition
/// @param x                 first operand
/// @param y                 second operand
/// @param lower_decomp_len  number of limbs to write
/// @param lower_decomp      destination slice for the limbs
__device__ __forceinline__ void generate_subrow_new(
    VariableRangeChecker &rc,
    const uint32_t max_bits,
    uint32_t x,
    uint32_t y,
    const size_t lower_decomp_len,
    RowSliceNew lower_decomp
) {
    // Unsigned 32-bit arithmetic: the difference wraps around when y <= x.
    const uint32_t diff_minus_one = y - x - 1;
    rc.decompose_new(diff_minus_one, max_bits, lower_decomp, lower_decomp_len);
}
| } // namespace AssertLessThan | ||
|
|
||
| namespace IsLessThan { | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,56 @@ | ||
| #pragma once | ||
|
|
||
| #include <cstdio> | ||
|
|
||
// Fixed-capacity text buffer for device code: a whole line of debug output is assembled
// here first and then emitted with one printf call.
//
// `len` has no initializer; call reset() (or use write_with) before appending.
struct RowPrintBuffer {
    static constexpr int kCapacity = 8192;
    char data[kCapacity];
    int len;

    // Drop any buffered text.
    __device__ __forceinline__ void reset() { len = 0; }

    // Append one character. When the buffer is full the character is silently dropped;
    // the last byte of `data` is kept free for the terminator written by flush().
    __device__ __forceinline__ void append_char(char c) {
        if (len >= kCapacity - 1) {
            return;
        }
        data[len] = c;
        ++len;
    }

    // Append a NUL-terminated string (the terminator itself is not copied).
    __device__ __forceinline__ void append_literal(const char *literal) {
        while (*literal != '\0') {
            append_char(*literal);
            ++literal;
        }
    }

    // Append `value` in decimal notation.
    __device__ __forceinline__ void append_uint(unsigned long long value) {
        char digits[32];
        int count = 0;

        // Produce digits least-significant first; the do/while makes zero print as "0".
        do {
            digits[count++] = static_cast<char>('0' + (value % 10));
            value /= 10;
        } while (value > 0 && count < static_cast<int>(sizeof(digits)));

        // Emit them in reading order.
        while (count > 0) {
            append_char(digits[--count]);
        }
    }

    // NUL-terminate the buffered text and print it. The buffer is not cleared.
    __device__ __forceinline__ void flush() {
        data[len] = '\0';
        printf("%s", data);
    }

    // Clear the buffer, let `fn` fill it, then print the result.
    // `fn` must be a device callable accepting `RowPrintBuffer &`.
    template <typename Fn>
    __device__ __forceinline__ void write_with(Fn fn) {
        reset();
        fn(*this);
        flush();
    }
};
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,7 +1,127 @@ | ||
| #pragma once | ||
|
|
||
| #include "fp.h" | ||
| #include "primitives/row_print_buffer.cuh" | ||
| #include <cstddef> | ||
| #include <cstdint> | ||
| #include <type_traits> | ||
|
|
||
|
|
||
| __device__ __forceinline__ size_t number_of_gaps_in(const uint32_t *sub, size_t start, size_t len); | ||
|
|
||
/// A RowSliceNew is a contiguous section of a row in a col-based trace, paired with a column
/// substitution table (`subs`) that the `*_new` writers consult.
///
/// The `*_new` writers read `subs[dummy_offset + column_index]`. `UINT32_MAX` means the write
/// is skipped; any other value `apc_idx` redirects the write to
/// `ptr[(apc_idx - optimized_offset) * stride]`. The remaining accessors (`operator[]`,
/// `write`, `write_array`, `write_bits`, `fill_zero`) index `ptr` directly and never look at
/// `subs`.
struct RowSliceNew {
    Fp *ptr;                 // base of the slice; column c is stored at ptr[c * stride]
    size_t stride;           // distance, in elements, between consecutive columns
    size_t optimized_offset; // subtracted from a `subs` entry to get the column relative to ptr
    size_t dummy_offset;     // added to a column index to get its position in `subs`
    uint32_t *subs;          // substitution table; UINT32_MAX marks a column that is skipped
    bool is_apc;             // not read by any method here; copied through slice_from/shift_row

    __device__ RowSliceNew(Fp *ptr, size_t stride, size_t optimized_offset, size_t dummy_offset, uint32_t *subs, bool is_apc) : ptr(ptr), stride(stride), optimized_offset(optimized_offset), dummy_offset(dummy_offset), subs(subs), is_apc(is_apc) {}

    /// Direct (unsubstituted) access to column `column_index`.
    __device__ __forceinline__ Fp &operator[](size_t column_index) const {
        // While implementing tracegen for SHA256, we encountered what we believe to be an nvcc
        // compiler bug. Occasionally, at various non-zero PTXAS optimization levels the compiler
        // tries to replace this multiplication with a series of SHL, ADD, and AND instructions
        // that we believe erroneously adds ~2^49 to the final address via an improper carry
        // propagation. To read more, see https://github.com/stephenh-axiom-xyz/cuda-illegal.
        return ptr[column_index * stride];
    }

    /// A slice with null `ptr` and `subs`; `is_valid()` returns false for it.
    __device__ static RowSliceNew null() { return RowSliceNew(nullptr, 0, 0, 0, nullptr, false); }

    __device__ bool is_valid() const { return ptr != nullptr; }

    /// Direct write of `value` to column `column_index` (no substitution lookup).
    template <typename T>
    __device__ __forceinline__ void write(size_t column_index, T value) const {
        ptr[column_index * stride] = value;
    }

    /// Substituted write: look `column_index` up in `subs` and store `value` at the mapped
    /// column, or do nothing when the entry is `UINT32_MAX`.
    template <typename T>
    __device__ __forceinline__ void write_new(size_t column_index, T value) const {
        const uint32_t apc_idx = subs[dummy_offset + column_index];
        if (apc_idx != UINT32_MAX) {
            ptr[(apc_idx - optimized_offset) * stride] = value;
        }
    }

    /// Direct write of `length` consecutive columns starting at `column_index`.
    template <typename T>
    __device__ __forceinline__ void write_array(size_t column_index, size_t length, const T *values)
        const {
#pragma unroll
        for (size_t i = 0; i < length; i++) {
            ptr[(column_index + i) * stride] = values[i];
        }
    }

    /// Substituted write of `length` consecutive columns starting at `column_index`; each
    /// element goes through the same lookup as `write_new`.
    template <typename T>
    __device__ __forceinline__ void write_array_new(size_t column_index, size_t length, const T *values)
        const {
#pragma unroll
        for (size_t i = 0; i < length; i++) {
            write_new(column_index + i, values[i]);
        }
    }

    /// Direct write of the bits of `value`, least-significant first, one bit per column,
    /// into `sizeof(T) * 8` consecutive columns starting at `column_index`.
    template <typename T>
    __device__ __forceinline__ void write_bits(size_t column_index, const T value) const {
#pragma unroll
        for (size_t i = 0; i < sizeof(T) * 8; i++) {
            ptr[(column_index + i) * stride] = (value >> i) & 1;
        }
    }

    /// Directly zero `length` consecutive columns starting at `column_index_from`.
    __device__ __forceinline__ void fill_zero(size_t column_index_from, size_t length) const {
#pragma unroll
        for (size_t i = 0, c = column_index_from; i < length; i++, c++) {
            ptr[c * stride] = 0;
        }
    }

    /// Sub-slice starting at column `column_index` of this slice.
    ///
    /// Columns whose `subs` entry is `UINT32_MAX` ("gaps") are not counted when advancing:
    /// `ptr` and `optimized_offset` both move forward by the number of non-gap columns in
    /// `[0, column_index)`, while `dummy_offset` moves by the full `column_index`.
    __device__ __forceinline__ RowSliceNew slice_from(size_t column_index) const {
        // Keep the count in size_t, the type number_of_gaps_in returns, to avoid narrowing.
        const size_t gap = number_of_gaps_in(subs, dummy_offset, column_index);
        const size_t kept = column_index - gap;
        return RowSliceNew(ptr + kept * stride, stride, optimized_offset + kept, dummy_offset + column_index, subs, is_apc);
    }

    /// Same slice with the base pointer advanced by `n` elements; all offsets are unchanged.
    __device__ __forceinline__ RowSliceNew shift_row(size_t n) const {
        return RowSliceNew(ptr + n, stride, optimized_offset, dummy_offset, subs, is_apc);
    }
};
|
|
||
| /// A RowSlice is a contiguous section of a row in col-based trace. | ||
| struct RowSlice { | ||
|
|
@@ -61,6 +181,51 @@ struct RowSlice { | |
| } | ||
| }; | ||
|
|
||
/// Convert `value` to `unsigned long long` for debug printing. An `Fp` is converted through
/// `asRaw()`; every other type is converted with a plain `static_cast`.
template <typename T>
__device__ __forceinline__ unsigned long long to_debug_uint(T value) {
    using Plain = std::remove_cv_t<std::remove_reference_t<T>>;
    constexpr bool is_field_element = std::is_same_v<Plain, Fp>;
    // `if constexpr` is required: `asRaw()` only exists on Fp.
    if constexpr (!is_field_element) {
        return static_cast<unsigned long long>(value);
    } else {
        return static_cast<unsigned long long>(value.asRaw());
    }
}
|
|
||
/// Print one debug line describing a substituted column write: the value, the column index
/// relative to `row`, the same index plus `row.dummy_offset`, and either the target
/// `apc_idx` (absolute and relative to `row.optimized_offset`) or a note that the write was
/// skipped because `apc_idx == UINT32_MAX`.
///
/// `RowT` must expose `dummy_offset` and `optimized_offset` members.
template <typename RowT, typename ValueT>
__device__ __forceinline__ void debug_log_col_write_new(
    const RowT &row,
    size_t column_index,
    uint32_t apc_idx,
    ValueT value
) {
    const auto absolute_col = static_cast<unsigned long long>(column_index + row.dummy_offset);

    RowPrintBuffer line;
    line.reset();

    line.append_literal("COL_WRITE VALUE ");
    line.append_uint(to_debug_uint(value));
    line.append_literal(" from col_idx ");
    line.append_uint(static_cast<unsigned long long>(column_index));
    line.append_literal(" which is absolute col_idx ");
    line.append_uint(absolute_col);

    if (apc_idx == UINT32_MAX) {
        line.append_literal(" (skipped; apc_idx == UINT32_MAX)");
    } else {
        line.append_literal(" to apc_idx ");
        line.append_uint(apc_idx);
        line.append_literal(" which is relative apc_idx ");
        // Signed arithmetic so that an index below the offset can be detected and reported.
        const long long relative =
            static_cast<long long>(apc_idx) - static_cast<long long>(row.optimized_offset);
        if (relative < 0) {
            line.append_literal("(negative)");
        } else {
            line.append_uint(static_cast<unsigned long long>(relative));
        }
    }

    line.append_literal("\n");
    line.flush();
}
|
|
||
| /// Compute the 0-based column index of member `FIELD` within struct template `STRUCT<T>`, | ||
| /// by instantiating it as `STRUCT<uint8_t>` so that offsetof yields the element index. | ||
| #define COL_INDEX(STRUCT, FIELD) (offsetof(STRUCT<uint8_t>, FIELD)) | ||
|
|
@@ -71,10 +236,30 @@ struct RowSlice { | |
| /// Write a single value into `FIELD` of struct `STRUCT<T>` at a given row. | ||
| #define COL_WRITE_VALUE(ROW, STRUCT, FIELD, VALUE) (ROW).write(COL_INDEX(STRUCT, FIELD), VALUE) | ||
|
|
||
| /// Conditionally write a single value into `FIELD` of struct `STRUCT<T>` at a given row. | ||
| /// The gating is performed by `write_new`: the column is looked up in the row's `subs` table | ||
| /// and the write is skipped when the entry is `UINT32_MAX`. (An earlier version of this macro | ||
| /// took the table as an extra `SUB` argument and did the lookup inline.) | ||
| #define COL_WRITE_VALUE_NEW(ROW, STRUCT, FIELD, VALUE) (ROW).write_new(COL_INDEX(STRUCT, FIELD), VALUE) | ||
|
|
||
| /// Write an array of values into the fixed‐length `FIELD` array of `STRUCT<T>` for one row. | ||
| #define COL_WRITE_ARRAY(ROW, STRUCT, FIELD, VALUES) \ | ||
| (ROW).write_array(COL_INDEX(STRUCT, FIELD), COL_ARRAY_LEN(STRUCT, FIELD), VALUES) | ||
|
|
||
| /// Write an array of values into the fixed-length `FIELD` array of `STRUCT<T>` for one row via | ||
| /// `write_array_new`, which skips every element whose `subs` entry is `UINT32_MAX`. | ||
| #define COL_WRITE_ARRAY_NEW(ROW, STRUCT, FIELD, VALUES) \ | ||
| (ROW).write_array_new(COL_INDEX(STRUCT, FIELD), COL_ARRAY_LEN(STRUCT, FIELD), VALUES) | ||
|
|
||
| /// Write a single value bits into `FIELD` of struct `STRUCT<T>` at a given row. | ||
| #define COL_WRITE_BITS(ROW, STRUCT, FIELD, VALUE) (ROW).write_bits(COL_INDEX(STRUCT, FIELD), VALUE) | ||
|
|
||
|
|
@@ -83,3 +268,14 @@ struct RowSlice { | |
| (ROW).fill_zero( \ | ||
| COL_INDEX(STRUCT, FIELD), sizeof(static_cast<STRUCT<uint8_t> *>(nullptr)->FIELD) \ | ||
| ) | ||
|
|
||
/// Count the entries equal to `UINT32_MAX` in `sub[start .. start + len)`.
///
/// @param sub    table to scan
/// @param start  index of the first entry to inspect
/// @param len    number of entries to inspect
/// @return number of inspected entries that equal `UINT32_MAX`
__device__ __forceinline__ size_t number_of_gaps_in(const uint32_t *sub, size_t start, size_t len) {
    const size_t end = start + len;
    size_t gap_count = 0;
#pragma unroll
    for (size_t idx = start; idx < end; ++idx) {
        gap_count += (sub[idx] == UINT32_MAX) ? 1 : 0;
    }
    return gap_count;
}
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -23,6 +23,18 @@ struct MemoryAuxColsFactory { | |
| COL_WRITE_VALUE(row, MemoryBaseAuxCols, prev_timestamp, prev_timestamp); | ||
| } | ||
|
|
||
/// `RowSliceNew` counterpart of `fill`: writes the `timestamp_lt_aux` limbs for the pair
/// (`prev_timestamp`, `timestamp`) through `AssertLessThan::generate_subrow_new`, then
/// writes `prev_timestamp` into its column with `COL_WRITE_VALUE_NEW`.
///
/// @param row             slice covering the `MemoryBaseAuxCols` columns
/// @param prev_timestamp  earlier timestamp; also the value stored in the row
/// @param timestamp       later timestamp
__device__ void fill_new(RowSliceNew row, uint32_t prev_timestamp, uint32_t timestamp) {
    // Sub-slice that starts at the `timestamp_lt_aux` columns.
    const size_t lt_aux_col = COL_INDEX(MemoryBaseAuxCols, timestamp_lt_aux);
    RowSliceNew lt_aux = row.slice_from(lt_aux_col);

    AssertLessThan::generate_subrow_new(
        range_checker, timestamp_max_bits, prev_timestamp, timestamp, AUX_LEN, lt_aux
    );

    COL_WRITE_VALUE_NEW(row, MemoryBaseAuxCols, prev_timestamp, prev_timestamp);
}
|
|
||
/// Zero the columns of `row` from index 0 up to `sizeof(MemoryBaseAuxCols<uint8_t>)`.
/// NOTE(review): presumably that size equals the number of `MemoryBaseAuxCols` columns, as
/// the `uint8_t` instantiation is used for column indexing elsewhere in this file — confirm.
__device__ void fill_zero(RowSlice row) {
    constexpr size_t kNumCols = sizeof(MemoryBaseAuxCols<uint8_t>);
    row.fill_zero(0, kNumCols);
}
|
|
||
Uh oh!
There was an error while loading. Please reload this page.