File tree Expand file tree Collapse file tree 1 file changed +7
-6
lines changed
tracegen-gpu/cuda/include Expand file tree Collapse file tree 1 file changed +7
-6
lines changed Original file line number Diff line number Diff line change 22
33#include " fp.h"
44#include < cstddef>
5- #include < cstdint>
65
76// / A RowSlice is a contiguous section of a row in col-based trace.
87struct RowSlice {
@@ -12,6 +11,11 @@ struct RowSlice {
1211 __device__ RowSlice (Fp *ptr, size_t stride) : ptr(ptr), stride(stride) {}
1312
1413 __device__ __forceinline__ Fp &operator [](size_t column_index) const {
14+ // While implementing tracegen for SHA256, we encountered what we believe to be an nvcc
15+ // compiler bug. Occasionally, at various non-zero PTXAS optimization levels the compiler
16+ // tries to replace this multiplication with a series of SHL, ADD, and AND instructions
17+ // that we believe erroneously adds ~2^49 to the final address via an improper carry
18+ // propagation. To read more, see https://github.com/stephenh-axiom-xyz/cuda-illegal.
1519 return ptr[column_index * stride];
1620 }
1721
@@ -25,11 +29,8 @@ struct RowSlice {
2529 }
2630
2731 template <typename T>
28- __device__ __forceinline__ void write_array (
29- size_t column_index,
30- size_t length,
31- const T *values
32- ) const {
32+ __device__ __forceinline__ void write_array (size_t column_index, size_t length, const T *values)
33+ const {
3334#pragma unroll
3435 for (size_t i = 0 ; i < length; i++) {
3536 ptr[(column_index + i) * stride] = values[i];
You can’t perform that action at this time.
0 commit comments