Skip to content

Commit 1920972

Browse files
authored
ROCm: Fix int32 overflow for blocksize quantization (#1796)
1 parent 45dcd4d commit 1920972

File tree

2 files changed

+41
-39
lines changed

2 files changed

+41
-39
lines changed

csrc/kernels.hip

Lines changed: 15 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -348,16 +348,17 @@ template<typename T, int BLOCK_SIZE, int NUM_PER_TH, int STOCHASTIC, int DATA_TY
348348
//__launch_bounds__(TH, 4)
349349
__global__ void kQuantizeBlockwise(float * code, T * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n)
350350
{
351-
const int n_full = gridDim.x * BLOCK_SIZE;
352-
int valid_items = 0;
353-
const int base_idx = (blockIdx.x * BLOCK_SIZE);
351+
// This can overflow, so we clamp to INT32_MAX. We won't have more elements than this.
352+
const int n_full = min(gridDim.x * BLOCK_SIZE, INT32_MAX);
353+
int valid_items = 0;
354+
const int base_idx = blockIdx.x * BLOCK_SIZE;
355+
356+
T vals[NUM_PER_TH];
357+
float rand_vals[NUM_PER_TH];
358+
unsigned char qvals[(DATA_TYPE > 0) ? NUM_PER_TH / 2 : NUM_PER_TH];
354359

355-
T vals[NUM_PER_TH];
356-
float rand_vals[NUM_PER_TH];
357-
unsigned char qvals[(DATA_TYPE > 0) ? NUM_PER_TH/2 : NUM_PER_TH];
358-
//float local_abs_max = -FLT_MAX;
359-
float local_abs_max = 0.0f;
360-
int local_rand_idx = 0;
360+
float local_abs_max = 0.0f;
361+
int local_rand_idx = 0;
361362

362363
typedef hipcub::BlockLoad<T, BLOCK_SIZE/NUM_PER_TH, NUM_PER_TH, hipcub::BLOCK_LOAD_WARP_TRANSPOSE> LoadT;
363364
typedef hipcub::BlockStore<unsigned char, BLOCK_SIZE/NUM_PER_TH, (DATA_TYPE > 0) ? NUM_PER_TH/2 : NUM_PER_TH, hipcub::BLOCK_STORE_WARP_TRANSPOSE> StoreChar;
@@ -375,9 +376,9 @@ __global__ void kQuantizeBlockwise(float * code, T * __restrict__ const A, float
375376
for(int i = threadIdx.x; i < 256; i+=blockDim.x)
376377
smem_code[i] = code[i];
377378

378-
for (int i = base_idx; i < n_full; i += gridDim.x*BLOCK_SIZE)
379-
{
380-
valid_items = n - i > BLOCK_SIZE ? BLOCK_SIZE : n - i;
379+
380+
for (int64_t i = base_idx; i < n_full; i += gridDim.x * BLOCK_SIZE) {
381+
valid_items = min(BLOCK_SIZE, static_cast<int>(n - i));
381382
local_abs_max = -FLT_MAX;
382383

383384
__syncthreads();
@@ -465,7 +466,8 @@ __global__ void kDequantizeBlockwise(float *code, unsigned char * A, float * abs
465466
{
466467
if (DATA_TYPE > 0)
467468
{
468-
valid_items_load = min(TILE_SIZE, (n + 1) / 2 - i);
469+
// Cast n to int64_t to avoid overflow for large n
470+
valid_items_load = min(TILE_SIZE, static_cast<int>((static_cast<int64_t>(n) + 1) / 2) - i);
469471
valid_items_store = min(TILE_SIZE * 2, n - i * 2);
470472
}
471473
else

csrc/ops.hip

Lines changed: 26 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ void quantize(float *code, float *A, unsigned char *out, int n)
3434
{
3535
int num_blocks = n/1024;
3636
num_blocks = n % 1024 == 0 ? num_blocks : num_blocks + 1;
37-
hipLaunchKernelGGL(( kQuantize), dim3(num_blocks), dim3(1024), 0, 0, code, A, out, n);
37+
hipLaunchKernelGGL(( kQuantize), dim3(num_blocks), dim3(1024), 0, 0, code, A, out, n);
3838
CUDA_CHECK_RETURN(hipPeekAtLastError());
3939
}
4040

@@ -72,21 +72,21 @@ template <typename T, int STOCHASTIC, int DATA_TYPE> void quantizeBlockwise(floa
7272

7373
template<typename T, int DATA_TYPE> void dequantizeBlockwise(float *code, unsigned char *A, float *absmax, T *out, int blocksize, const int n, hipStream_t stream)
7474
{
75-
int num_blocks = n/blocksize;
76-
num_blocks = n % blocksize == 0 ? num_blocks : num_blocks + 1;
7775
int tile_size = (DATA_TYPE > 0) ? 1024 : 512;
7876

77+
// Upcast to int64 to avoid overflow for large n
78+
int grid_blocks = ((int64_t)n + tile_size - 1) / tile_size;
79+
7980
if(DATA_TYPE > 0)
80-
hipLaunchKernelGGL(( kDequantizeBlockwise<T, 512, 64, 8, DATA_TYPE>), dim3((n+tile_size-1)/tile_size), dim3(64), 0, stream, code, A, absmax, out, blocksize/2, n);
81+
hipLaunchKernelGGL(( kDequantizeBlockwise<T, 512, 64, 8, DATA_TYPE>), dim3(grid_blocks), dim3(64), 0, stream, code, A, absmax, out, blocksize / 2, n);
8182
else
82-
hipLaunchKernelGGL(( kDequantizeBlockwise<T, 512, 64, 8, DATA_TYPE>), dim3((n+tile_size-1)/tile_size), dim3(64), 0, stream, code, A, absmax, out, blocksize, n);
83+
hipLaunchKernelGGL(( kDequantizeBlockwise<T, 512, 64, 8, DATA_TYPE>), dim3(grid_blocks), dim3(64), 0, stream, code, A, absmax, out, blocksize, n);
8384

8485
CUDA_CHECK_RETURN(hipPeekAtLastError());
8586
}
8687

8788

8889

89-
9090
template<typename T, int OPTIMIZER> void optimizer32bit(T* g, T* p,
9191
float* state1, float* state2, float *unorm, float max_unorm, float param_norm,
9292
const float beta1, const float beta2, const float beta3, const float alpha,
@@ -102,10 +102,10 @@ template<typename T, int OPTIMIZER> void optimizer32bit(T* g, T* p,
102102
if(max_unorm > 0.0f)
103103
{
104104
CUDA_CHECK_RETURN(hipMemset(unorm, 0, 1*sizeof(float)));
105-
hipLaunchKernelGGL(( kPreconditionOptimizer32bit2State<T, OPTIMIZER, 4096, 8>), dim3(num_blocks), dim3(512), 0, 0, g, p, state1, state2, unorm, beta1, beta2, eps, weight_decay, step, lr, gnorm_scale, n);
105+
hipLaunchKernelGGL(( kPreconditionOptimizer32bit2State<T, OPTIMIZER, 4096, 8>), dim3(num_blocks), dim3(512), 0, 0, g, p, state1, state2, unorm, beta1, beta2, eps, weight_decay, step, lr, gnorm_scale, n);
106106
CUDA_CHECK_RETURN(hipPeekAtLastError());
107107
}
108-
hipLaunchKernelGGL(( kOptimizer32bit2State<T, OPTIMIZER>), dim3(num_blocks), dim3(1024), 0, 0, g, p, state1, state2, unorm, max_unorm, param_norm, beta1, beta2, beta3, alpha, eps, weight_decay, step, lr, gnorm_scale, skip_zeros, n);
108+
hipLaunchKernelGGL(( kOptimizer32bit2State<T, OPTIMIZER>), dim3(num_blocks), dim3(1024), 0, 0, g, p, state1, state2, unorm, max_unorm, param_norm, beta1, beta2, beta3, alpha, eps, weight_decay, step, lr, gnorm_scale, skip_zeros, n);
109109
CUDA_CHECK_RETURN(hipPeekAtLastError());
110110
break;
111111
case MOMENTUM:
@@ -114,22 +114,22 @@ template<typename T, int OPTIMIZER> void optimizer32bit(T* g, T* p,
114114
if(max_unorm > 0.0f)
115115
{
116116
CUDA_CHECK_RETURN(hipMemset(unorm, 0, 1*sizeof(float)));
117-
hipLaunchKernelGGL(( kPreconditionOptimizer32bit1State<T, OPTIMIZER, 4096, 8>), dim3(num_blocks), dim3(512), 0, 0, g, p, state1, unorm, beta1, beta2, eps, weight_decay, step, lr, gnorm_scale, n);
117+
hipLaunchKernelGGL(( kPreconditionOptimizer32bit1State<T, OPTIMIZER, 4096, 8>), dim3(num_blocks), dim3(512), 0, 0, g, p, state1, unorm, beta1, beta2, eps, weight_decay, step, lr, gnorm_scale, n);
118118
CUDA_CHECK_RETURN(hipPeekAtLastError());
119119
}
120120

121-
hipLaunchKernelGGL(( kOptimizer32bit1State<T, OPTIMIZER>), dim3(num_blocks), dim3(1024), 0, 0, g, p, state1, unorm, max_unorm, param_norm, beta1, beta2, eps, weight_decay, step, lr, gnorm_scale, skip_zeros, n);
121+
hipLaunchKernelGGL(( kOptimizer32bit1State<T, OPTIMIZER>), dim3(num_blocks), dim3(1024), 0, 0, g, p, state1, unorm, max_unorm, param_norm, beta1, beta2, eps, weight_decay, step, lr, gnorm_scale, skip_zeros, n);
122122
CUDA_CHECK_RETURN(hipPeekAtLastError());
123123
break;
124124
case LION:
125125
// in lion, the momentum update after the parameter update
126-
hipLaunchKernelGGL(( kOptimizer32bit1State<T, OPTIMIZER>), dim3(num_blocks), dim3(1024), 0, 0, g, p, state1, unorm, max_unorm, param_norm, beta1, beta2, eps, weight_decay, step, lr, gnorm_scale, skip_zeros, n);
126+
hipLaunchKernelGGL(( kOptimizer32bit1State<T, OPTIMIZER>), dim3(num_blocks), dim3(1024), 0, 0, g, p, state1, unorm, max_unorm, param_norm, beta1, beta2, eps, weight_decay, step, lr, gnorm_scale, skip_zeros, n);
127127
CUDA_CHECK_RETURN(hipPeekAtLastError());
128128

129129
if(max_unorm > 0.0f)
130130
{
131131
CUDA_CHECK_RETURN(hipMemset(unorm, 0, 1*sizeof(float)));
132-
hipLaunchKernelGGL(( kPreconditionOptimizer32bit1State<T, OPTIMIZER, 4096, 8>), dim3(num_blocks), dim3(512), 0, 0, g, p, state1, unorm, beta1, beta2, eps, weight_decay, step, lr, gnorm_scale, n);
132+
hipLaunchKernelGGL(( kPreconditionOptimizer32bit1State<T, OPTIMIZER, 4096, 8>), dim3(num_blocks), dim3(512), 0, 0, g, p, state1, unorm, beta1, beta2, eps, weight_decay, step, lr, gnorm_scale, n);
133133
CUDA_CHECK_RETURN(hipPeekAtLastError());
134134
}
135135
break;
@@ -156,30 +156,30 @@ template<typename T, int OPTIMIZER> void optimizerStatic8bit(T* p, T* g,
156156
case ADAM:
157157
CUDA_CHECK_RETURN(hipMemset(new_max1, 0, 1*sizeof(float)));
158158
CUDA_CHECK_RETURN(hipMemset(new_max2, 0, 1*sizeof(float)));
159-
hipLaunchKernelGGL(( kPreconditionOptimizerStatic8bit2State<T, OPTIMIZER>), dim3(num_blocks), dim3(256), 0, 0, p, g, state1, state2, unorm, beta1, beta2, eps, step, quantiles1, quantiles2, max1, max2, new_max1, new_max2, gnorm_scale, n);
159+
hipLaunchKernelGGL(( kPreconditionOptimizerStatic8bit2State<T, OPTIMIZER>), dim3(num_blocks), dim3(256), 0, 0, p, g, state1, state2, unorm, beta1, beta2, eps, step, quantiles1, quantiles2, max1, max2, new_max1, new_max2, gnorm_scale, n);
160160
CUDA_CHECK_RETURN(hipPeekAtLastError());
161-
hipLaunchKernelGGL(( kOptimizerStatic8bit2State<T, OPTIMIZER>), dim3(num_blocks), dim3(1024), 0, 0, p, g, state1, state2, unorm, max_unorm, param_norm, beta1, beta2, eps, step, lr,
161+
hipLaunchKernelGGL(( kOptimizerStatic8bit2State<T, OPTIMIZER>), dim3(num_blocks), dim3(1024), 0, 0, p, g, state1, state2, unorm, max_unorm, param_norm, beta1, beta2, eps, step, lr,
162162
quantiles1, quantiles2, max1, max2, new_max1, new_max2, weight_decay, gnorm_scale, n);
163163
CUDA_CHECK_RETURN(hipPeekAtLastError());
164164
break;
165165
case MOMENTUM:
166166
case RMSPROP:
167167
case ADAGRAD:
168168
CUDA_CHECK_RETURN(hipMemset(new_max1, 0, 1*sizeof(float)));
169-
hipLaunchKernelGGL(( kPreconditionOptimizerStatic8bit1State<T, OPTIMIZER>), dim3(num_blocks), dim3(256), 0, 0, p, g, state1, unorm, beta1, beta2, eps, step, quantiles1, max1, new_max1, weight_decay, gnorm_scale, n);
169+
hipLaunchKernelGGL(( kPreconditionOptimizerStatic8bit1State<T, OPTIMIZER>), dim3(num_blocks), dim3(256), 0, 0, p, g, state1, unorm, beta1, beta2, eps, step, quantiles1, max1, new_max1, weight_decay, gnorm_scale, n);
170170
CUDA_CHECK_RETURN(hipPeekAtLastError());
171-
hipLaunchKernelGGL(( kOptimizerStatic8bit1State<T, OPTIMIZER>), dim3(num_blocks), dim3(1024), 0, 0, p, g, state1, unorm, max_unorm, param_norm, beta1, beta2, eps, step, lr,
171+
hipLaunchKernelGGL(( kOptimizerStatic8bit1State<T, OPTIMIZER>), dim3(num_blocks), dim3(1024), 0, 0, p, g, state1, unorm, max_unorm, param_norm, beta1, beta2, eps, step, lr,
172172
quantiles1, max1, new_max1, weight_decay, gnorm_scale, n);
173173
CUDA_CHECK_RETURN(hipPeekAtLastError());
174174
break;
175175
case LION:
176176
// in lion, the momentum update happens after the parameter update
177-
hipLaunchKernelGGL(( kOptimizerStatic8bit1State<T, OPTIMIZER>), dim3(num_blocks), dim3(1024), 0, 0, p, g, state1, unorm, max_unorm, param_norm, beta1, beta2, eps, step, lr,
177+
hipLaunchKernelGGL(( kOptimizerStatic8bit1State<T, OPTIMIZER>), dim3(num_blocks), dim3(1024), 0, 0, p, g, state1, unorm, max_unorm, param_norm, beta1, beta2, eps, step, lr,
178178
quantiles1, max1, new_max1, weight_decay, gnorm_scale, n);
179179
CUDA_CHECK_RETURN(hipPeekAtLastError());
180180

181181
CUDA_CHECK_RETURN(hipMemset(new_max1, 0, 1*sizeof(float)));
182-
hipLaunchKernelGGL(( kPreconditionOptimizerStatic8bit1State<T, OPTIMIZER>), dim3(num_blocks), dim3(256), 0, 0, p, g, state1, unorm, beta1, beta2, eps, step, quantiles1, max1, new_max1, weight_decay, gnorm_scale, n);
182+
hipLaunchKernelGGL(( kPreconditionOptimizerStatic8bit1State<T, OPTIMIZER>), dim3(num_blocks), dim3(256), 0, 0, p, g, state1, unorm, beta1, beta2, eps, step, quantiles1, max1, new_max1, weight_decay, gnorm_scale, n);
183183
CUDA_CHECK_RETURN(hipPeekAtLastError());
184184
break;
185185
default:
@@ -221,7 +221,7 @@ template<typename T, int OPTIMIZER> void optimizerStatic8bitBlockwise(
221221
case ADEMAMIX:
222222
num_blocks = n/BLOCKSIZE_2STATE;
223223
num_blocks = n % BLOCKSIZE_2STATE == 0 ? num_blocks : num_blocks + 1;
224-
hipLaunchKernelGGL(( kOptimizerStatic8bit2StateBlockwise<T, OPTIMIZER, BLOCKSIZE_2STATE, NUM_2STATE>), dim3(num_blocks), dim3(BLOCKSIZE_2STATE/NUM_2STATE), 0, 0, p, g, state1, state2, beta1, beta2, beta3, alpha, eps, step, lr,
224+
hipLaunchKernelGGL(( kOptimizerStatic8bit2StateBlockwise<T, OPTIMIZER, BLOCKSIZE_2STATE, NUM_2STATE>), dim3(num_blocks), dim3(BLOCKSIZE_2STATE/NUM_2STATE), 0, 0, p, g, state1, state2, beta1, beta2, beta3, alpha, eps, step, lr,
225225
quantiles1, quantiles2, absmax1, absmax2, weight_decay, gnorm_scale, skip_zeros, n);
226226
CUDA_CHECK_RETURN(hipPeekAtLastError());
227227
break;
@@ -231,7 +231,7 @@ template<typename T, int OPTIMIZER> void optimizerStatic8bitBlockwise(
231231
case LION:
232232
num_blocks = n/BLOCKSIZE_1STATE;
233233
num_blocks = n % BLOCKSIZE_1STATE == 0 ? num_blocks : num_blocks + 1;
234-
hipLaunchKernelGGL(( kOptimizerStatic8bit1StateBlockwise<T, OPTIMIZER, BLOCKSIZE_1STATE, NUM_1STATE>), dim3(num_blocks), dim3(BLOCKSIZE_1STATE/NUM_1STATE), 0, 0, p, g, state1, beta1, beta2, eps, step, lr,
234+
hipLaunchKernelGGL(( kOptimizerStatic8bit1StateBlockwise<T, OPTIMIZER, BLOCKSIZE_1STATE, NUM_1STATE>), dim3(num_blocks), dim3(BLOCKSIZE_1STATE/NUM_1STATE), 0, 0, p, g, state1, beta1, beta2, eps, step, lr,
235235
quantiles1, absmax1, weight_decay, gnorm_scale, skip_zeros, n);
236236
CUDA_CHECK_RETURN(hipPeekAtLastError());
237237
break;
@@ -245,7 +245,7 @@ template<typename T> void percentileClipping(T * g, float *gnorm_vec, int step,
245245
int num_blocks = n/2048;
246246
num_blocks = n % 2048 == 0 ? num_blocks : num_blocks + 1;
247247
CUDA_CHECK_RETURN(hipMemset(&gnorm_vec[step % 100], 0, 1*sizeof(float)));
248-
hipLaunchKernelGGL(( kPercentileClipping<T, 2048, 4>), dim3(num_blocks), dim3(512), 0, 0, g, gnorm_vec, step, n);
248+
hipLaunchKernelGGL(( kPercentileClipping<T, 2048, 4>), dim3(num_blocks), dim3(512), 0, 0, g, gnorm_vec, step, n);
249249
CUDA_CHECK_RETURN(hipPeekAtLastError());
250250
}
251251

@@ -669,7 +669,7 @@ void spmm_coo(hipsparseHandle_t handle, int *A_rowidx, int *A_colidx, half *A_va
669669
template <typename T, int BITS> void spmm_coo_very_sparse_naive(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, T *B, half *out, float *dequant_stats, int nnz_rows, int nnz, int rowsA, int rowsB, int colsB)
670670
{
671671

672-
hipLaunchKernelGGL(( kspmm_coo_very_sparse_naive<T, 8, BITS>), dim3(nnz_rows), dim3(256), 0, 0, max_count, max_idx, offset_rowidx, rowidx, colidx, values, B, out, dequant_stats, nnz, rowsA, rowsB, colsB);
672+
hipLaunchKernelGGL(( kspmm_coo_very_sparse_naive<T, 8, BITS>), dim3(nnz_rows), dim3(256), 0, 0, max_count, max_idx, offset_rowidx, rowidx, colidx, values, B, out, dequant_stats, nnz, rowsA, rowsB, colsB);
673673
CUDA_CHECK_RETURN(hipPeekAtLastError());
674674
}
675675

@@ -679,17 +679,17 @@ template <typename T> void gemm_host(int m, int n, int k, T * A, T* B, T * out
679679
int num_blocks = (m+31)/32;
680680

681681
if(bits == 32)
682-
hipLaunchKernelGGL(( gemm_device<T, 32, 32>), dim3(num_blocks), dim3(32), 0, 0, m, n, k, A, B, out, lda, ldb, ldc);
682+
hipLaunchKernelGGL(( gemm_device<T, 32, 32>), dim3(num_blocks), dim3(32), 0, 0, m, n, k, A, B, out, lda, ldb, ldc);
683683
if(bits == 16)
684-
hipLaunchKernelGGL(( gemm_device<T, 16, 160>), dim3(num_blocks), dim3(160), 0, 0, m, n, k, A, B, out, lda, ldb, ldc);
684+
hipLaunchKernelGGL(( gemm_device<T, 16, 160>), dim3(num_blocks), dim3(160), 0, 0, m, n, k, A, B, out, lda, ldb, ldc);
685685
}
686686

687687
template <typename T> void gemm_4bit_inference(int m, int n, int k, T * A, unsigned char* B, float *absmax, T * out, int lda, int ldb, int ldc, int blocksize)
688688
{
689689

690690
int num_blocks = (m+31)/32;
691691

692-
hipLaunchKernelGGL(( kgemm_4bit_inference<T, 96>), dim3(num_blocks), dim3(96), 0, 0, m, n, k, A, B, absmax, out, lda, ldb, ldc, blocksize);
692+
hipLaunchKernelGGL(( kgemm_4bit_inference<T, 96>), dim3(num_blocks), dim3(96), 0, 0, m, n, k, A, B, absmax, out, lda, ldb, ldc, blocksize);
693693
}
694694

695695
template <typename T, int BITS> void gemm_4bit_inference_naive(int m, int n, int k, T * A, unsigned char* B, float *absmax, float *datatype, T * out, int lda, int ldb, int ldc, int blocksize, hipStream_t stream)
@@ -712,7 +712,7 @@ template <typename T, int FUNC> void func(T *A, T *B, T value, long n)
712712
int blocks = n/threads;
713713
blocks = n % threads == 0 ? blocks : blocks + 1;
714714
blocks = blocks > 65535 ? 65535 : blocks;
715-
hipLaunchKernelGGL(( kfunc<T, FUNC>), dim3(blocks), dim3(512), 0, 0, A, B, value, n);
715+
hipLaunchKernelGGL(( kfunc<T, FUNC>), dim3(blocks), dim3(512), 0, 0, A, B, value, n);
716716
CUDA_CHECK_RETURN(hipPeekAtLastError());
717717
}
718718

0 commit comments

Comments
 (0)