Skip to content

Commit b9defdb

Browse files
stephenh-axiom-xyzjonathanpwang
authored andcommitted
docs: add comment about potentially found CUDA bug (#223)
Resolves INT-4711.
1 parent ad13305 commit b9defdb

File tree

1 file changed

+7
-6
lines changed

1 file changed

+7
-6
lines changed

tracegen-gpu/cuda/include/trace_access.h

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@
22

33
#include "fp.h"
44
#include <cstddef>
5-
#include <cstdint>
65

76
/// A RowSlice is a contiguous section of a row in col-based trace.
87
struct RowSlice {
@@ -12,6 +11,11 @@ struct RowSlice {
1211
__device__ RowSlice(Fp *ptr, size_t stride) : ptr(ptr), stride(stride) {}
1312

1413
__device__ __forceinline__ Fp &operator[](size_t column_index) const {
14+
// While implementing tracegen for SHA256, we encountered what we believe to be an nvcc
15+
// compiler bug. Occasionally, at various non-zero PTXAS optimization levels the compiler
16+
// tries to replace this multiplication with a series of SHL, ADD, and AND instructions
17+
// that we believe erroneously adds ~2^49 to the final address via an improper carry
18+
// propagation. To read more, see https://github.com/stephenh-axiom-xyz/cuda-illegal.
1519
return ptr[column_index * stride];
1620
}
1721

@@ -25,11 +29,8 @@ struct RowSlice {
2529
}
2630

2731
template <typename T>
28-
__device__ __forceinline__ void write_array(
29-
size_t column_index,
30-
size_t length,
31-
const T *values
32-
) const {
32+
__device__ __forceinline__ void write_array(size_t column_index, size_t length, const T *values)
33+
const {
3334
#pragma unroll
3435
for (size_t i = 0; i < length; i++) {
3536
ptr[(column_index + i) * stride] = values[i];

0 commit comments

Comments
 (0)