@@ -34,7 +34,7 @@ void quantize(float *code, float *A, unsigned char *out, int n)
3434{
3535 int num_blocks = n/1024 ;
3636 num_blocks = n % 1024 == 0 ? num_blocks : num_blocks + 1 ;
37- hipLaunchKernelGGL (( kQuantize ), dim3 (num_blocks), dim3 (1024 ), 0 , 0 , code, A, out, n);
37+ hipLaunchKernelGGL (( kQuantize ), dim3 (num_blocks), dim3 (1024 ), 0 , 0 , code, A, out, n);
3838 CUDA_CHECK_RETURN (hipPeekAtLastError ());
3939}
4040
@@ -72,21 +72,21 @@ template <typename T, int STOCHASTIC, int DATA_TYPE> void quantizeBlockwise(floa
7272
7373template <typename T, int DATA_TYPE> void dequantizeBlockwise (float *code, unsigned char *A, float *absmax, T *out, int blocksize, const int n, hipStream_t stream)
7474{
75- int num_blocks = n/blocksize;
76- num_blocks = n % blocksize == 0 ? num_blocks : num_blocks + 1 ;
7775 int tile_size = (DATA_TYPE > 0 ) ? 1024 : 512 ;
7876
77+ // Upcast to int64 to avoid overflow for large n
78+ int grid_blocks = ((int64_t )n + tile_size - 1 ) / tile_size;
79+
7980 if (DATA_TYPE > 0 )
80- hipLaunchKernelGGL (( kDequantizeBlockwise <T, 512 , 64 , 8 , DATA_TYPE>), dim3 ((n+tile_size- 1 )/tile_size) , dim3 (64 ), 0 , stream, code, A, absmax, out, blocksize/ 2 , n);
81+ hipLaunchKernelGGL (( kDequantizeBlockwise <T, 512 , 64 , 8 , DATA_TYPE>), dim3 (grid_blocks) , dim3 (64 ), 0 , stream, code, A, absmax, out, blocksize / 2 , n);
8182 else
82- hipLaunchKernelGGL (( kDequantizeBlockwise <T, 512 , 64 , 8 , DATA_TYPE>), dim3 ((n+tile_size- 1 )/tile_size ), dim3 (64 ), 0 , stream, code, A, absmax, out, blocksize, n);
83+ hipLaunchKernelGGL (( kDequantizeBlockwise <T, 512 , 64 , 8 , DATA_TYPE>), dim3 (grid_blocks ), dim3 (64 ), 0 , stream, code, A, absmax, out, blocksize, n);
8384
8485 CUDA_CHECK_RETURN (hipPeekAtLastError ());
8586}
8687
8788
8889
89-
9090template <typename T, int OPTIMIZER> void optimizer32bit (T* g, T* p,
9191 float * state1, float * state2, float *unorm, float max_unorm, float param_norm,
9292 const float beta1, const float beta2, const float beta3, const float alpha,
@@ -102,10 +102,10 @@ template<typename T, int OPTIMIZER> void optimizer32bit(T* g, T* p,
102102 if (max_unorm > 0 .0f )
103103 {
104104 CUDA_CHECK_RETURN (hipMemset (unorm, 0 , 1 *sizeof (float )));
105- hipLaunchKernelGGL (( kPreconditionOptimizer32bit2State <T, OPTIMIZER, 4096 , 8 >), dim3 (num_blocks), dim3 (512 ), 0 , 0 , g, p, state1, state2, unorm, beta1, beta2, eps, weight_decay, step, lr, gnorm_scale, n);
105+ hipLaunchKernelGGL (( kPreconditionOptimizer32bit2State <T, OPTIMIZER, 4096 , 8 >), dim3 (num_blocks), dim3 (512 ), 0 , 0 , g, p, state1, state2, unorm, beta1, beta2, eps, weight_decay, step, lr, gnorm_scale, n);
106106 CUDA_CHECK_RETURN (hipPeekAtLastError ());
107107 }
108- hipLaunchKernelGGL (( kOptimizer32bit2State <T, OPTIMIZER>), dim3 (num_blocks), dim3 (1024 ), 0 , 0 , g, p, state1, state2, unorm, max_unorm, param_norm, beta1, beta2, beta3, alpha, eps, weight_decay, step, lr, gnorm_scale, skip_zeros, n);
108+ hipLaunchKernelGGL (( kOptimizer32bit2State <T, OPTIMIZER>), dim3 (num_blocks), dim3 (1024 ), 0 , 0 , g, p, state1, state2, unorm, max_unorm, param_norm, beta1, beta2, beta3, alpha, eps, weight_decay, step, lr, gnorm_scale, skip_zeros, n);
109109 CUDA_CHECK_RETURN (hipPeekAtLastError ());
110110 break ;
111111 case MOMENTUM:
@@ -114,22 +114,22 @@ template<typename T, int OPTIMIZER> void optimizer32bit(T* g, T* p,
114114 if (max_unorm > 0 .0f )
115115 {
116116 CUDA_CHECK_RETURN (hipMemset (unorm, 0 , 1 *sizeof (float )));
117- hipLaunchKernelGGL (( kPreconditionOptimizer32bit1State <T, OPTIMIZER, 4096 , 8 >), dim3 (num_blocks), dim3 (512 ), 0 , 0 , g, p, state1, unorm, beta1, beta2, eps, weight_decay, step, lr, gnorm_scale, n);
117+ hipLaunchKernelGGL (( kPreconditionOptimizer32bit1State <T, OPTIMIZER, 4096 , 8 >), dim3 (num_blocks), dim3 (512 ), 0 , 0 , g, p, state1, unorm, beta1, beta2, eps, weight_decay, step, lr, gnorm_scale, n);
118118 CUDA_CHECK_RETURN (hipPeekAtLastError ());
119119 }
120120
121- hipLaunchKernelGGL (( kOptimizer32bit1State <T, OPTIMIZER>), dim3 (num_blocks), dim3 (1024 ), 0 , 0 , g, p, state1, unorm, max_unorm, param_norm, beta1, beta2, eps, weight_decay, step, lr, gnorm_scale, skip_zeros, n);
121+ hipLaunchKernelGGL (( kOptimizer32bit1State <T, OPTIMIZER>), dim3 (num_blocks), dim3 (1024 ), 0 , 0 , g, p, state1, unorm, max_unorm, param_norm, beta1, beta2, eps, weight_decay, step, lr, gnorm_scale, skip_zeros, n);
122122 CUDA_CHECK_RETURN (hipPeekAtLastError ());
123123 break ;
124124 case LION:
125125 // in lion, the momentum update after the parameter update
126- hipLaunchKernelGGL (( kOptimizer32bit1State <T, OPTIMIZER>), dim3 (num_blocks), dim3 (1024 ), 0 , 0 , g, p, state1, unorm, max_unorm, param_norm, beta1, beta2, eps, weight_decay, step, lr, gnorm_scale, skip_zeros, n);
126+ hipLaunchKernelGGL (( kOptimizer32bit1State <T, OPTIMIZER>), dim3 (num_blocks), dim3 (1024 ), 0 , 0 , g, p, state1, unorm, max_unorm, param_norm, beta1, beta2, eps, weight_decay, step, lr, gnorm_scale, skip_zeros, n);
127127 CUDA_CHECK_RETURN (hipPeekAtLastError ());
128128
129129 if (max_unorm > 0 .0f )
130130 {
131131 CUDA_CHECK_RETURN (hipMemset (unorm, 0 , 1 *sizeof (float )));
132- hipLaunchKernelGGL (( kPreconditionOptimizer32bit1State <T, OPTIMIZER, 4096 , 8 >), dim3 (num_blocks), dim3 (512 ), 0 , 0 , g, p, state1, unorm, beta1, beta2, eps, weight_decay, step, lr, gnorm_scale, n);
132+ hipLaunchKernelGGL (( kPreconditionOptimizer32bit1State <T, OPTIMIZER, 4096 , 8 >), dim3 (num_blocks), dim3 (512 ), 0 , 0 , g, p, state1, unorm, beta1, beta2, eps, weight_decay, step, lr, gnorm_scale, n);
133133 CUDA_CHECK_RETURN (hipPeekAtLastError ());
134134 }
135135 break ;
@@ -156,30 +156,30 @@ template<typename T, int OPTIMIZER> void optimizerStatic8bit(T* p, T* g,
156156 case ADAM:
157157 CUDA_CHECK_RETURN (hipMemset (new_max1, 0 , 1 *sizeof (float )));
158158 CUDA_CHECK_RETURN (hipMemset (new_max2, 0 , 1 *sizeof (float )));
159- hipLaunchKernelGGL (( kPreconditionOptimizerStatic8bit2State <T, OPTIMIZER>), dim3 (num_blocks), dim3 (256 ), 0 , 0 , p, g, state1, state2, unorm, beta1, beta2, eps, step, quantiles1, quantiles2, max1, max2, new_max1, new_max2, gnorm_scale, n);
159+ hipLaunchKernelGGL (( kPreconditionOptimizerStatic8bit2State <T, OPTIMIZER>), dim3 (num_blocks), dim3 (256 ), 0 , 0 , p, g, state1, state2, unorm, beta1, beta2, eps, step, quantiles1, quantiles2, max1, max2, new_max1, new_max2, gnorm_scale, n);
160160 CUDA_CHECK_RETURN (hipPeekAtLastError ());
161- hipLaunchKernelGGL (( kOptimizerStatic8bit2State <T, OPTIMIZER>), dim3 (num_blocks), dim3 (1024 ), 0 , 0 , p, g, state1, state2, unorm, max_unorm, param_norm, beta1, beta2, eps, step, lr,
161+ hipLaunchKernelGGL (( kOptimizerStatic8bit2State <T, OPTIMIZER>), dim3 (num_blocks), dim3 (1024 ), 0 , 0 , p, g, state1, state2, unorm, max_unorm, param_norm, beta1, beta2, eps, step, lr,
162162 quantiles1, quantiles2, max1, max2, new_max1, new_max2, weight_decay, gnorm_scale, n);
163163 CUDA_CHECK_RETURN (hipPeekAtLastError ());
164164 break ;
165165 case MOMENTUM:
166166 case RMSPROP:
167167 case ADAGRAD:
168168 CUDA_CHECK_RETURN (hipMemset (new_max1, 0 , 1 *sizeof (float )));
169- hipLaunchKernelGGL (( kPreconditionOptimizerStatic8bit1State <T, OPTIMIZER>), dim3 (num_blocks), dim3 (256 ), 0 , 0 , p, g, state1, unorm, beta1, beta2, eps, step, quantiles1, max1, new_max1, weight_decay, gnorm_scale, n);
169+ hipLaunchKernelGGL (( kPreconditionOptimizerStatic8bit1State <T, OPTIMIZER>), dim3 (num_blocks), dim3 (256 ), 0 , 0 , p, g, state1, unorm, beta1, beta2, eps, step, quantiles1, max1, new_max1, weight_decay, gnorm_scale, n);
170170 CUDA_CHECK_RETURN (hipPeekAtLastError ());
171- hipLaunchKernelGGL (( kOptimizerStatic8bit1State <T, OPTIMIZER>), dim3 (num_blocks), dim3 (1024 ), 0 , 0 , p, g, state1, unorm, max_unorm, param_norm, beta1, beta2, eps, step, lr,
171+ hipLaunchKernelGGL (( kOptimizerStatic8bit1State <T, OPTIMIZER>), dim3 (num_blocks), dim3 (1024 ), 0 , 0 , p, g, state1, unorm, max_unorm, param_norm, beta1, beta2, eps, step, lr,
172172 quantiles1, max1, new_max1, weight_decay, gnorm_scale, n);
173173 CUDA_CHECK_RETURN (hipPeekAtLastError ());
174174 break ;
175175 case LION:
176176 // in lion, the momentum update happens after the parameter update
177- hipLaunchKernelGGL (( kOptimizerStatic8bit1State <T, OPTIMIZER>), dim3 (num_blocks), dim3 (1024 ), 0 , 0 , p, g, state1, unorm, max_unorm, param_norm, beta1, beta2, eps, step, lr,
177+ hipLaunchKernelGGL (( kOptimizerStatic8bit1State <T, OPTIMIZER>), dim3 (num_blocks), dim3 (1024 ), 0 , 0 , p, g, state1, unorm, max_unorm, param_norm, beta1, beta2, eps, step, lr,
178178 quantiles1, max1, new_max1, weight_decay, gnorm_scale, n);
179179 CUDA_CHECK_RETURN (hipPeekAtLastError ());
180180
181181 CUDA_CHECK_RETURN (hipMemset (new_max1, 0 , 1 *sizeof (float )));
182- hipLaunchKernelGGL (( kPreconditionOptimizerStatic8bit1State <T, OPTIMIZER>), dim3 (num_blocks), dim3 (256 ), 0 , 0 , p, g, state1, unorm, beta1, beta2, eps, step, quantiles1, max1, new_max1, weight_decay, gnorm_scale, n);
182+ hipLaunchKernelGGL (( kPreconditionOptimizerStatic8bit1State <T, OPTIMIZER>), dim3 (num_blocks), dim3 (256 ), 0 , 0 , p, g, state1, unorm, beta1, beta2, eps, step, quantiles1, max1, new_max1, weight_decay, gnorm_scale, n);
183183 CUDA_CHECK_RETURN (hipPeekAtLastError ());
184184 break ;
185185 default :
@@ -221,7 +221,7 @@ template<typename T, int OPTIMIZER> void optimizerStatic8bitBlockwise(
221221 case ADEMAMIX:
222222 num_blocks = n/BLOCKSIZE_2STATE;
223223 num_blocks = n % BLOCKSIZE_2STATE == 0 ? num_blocks : num_blocks + 1 ;
224- hipLaunchKernelGGL (( kOptimizerStatic8bit2StateBlockwise <T, OPTIMIZER, BLOCKSIZE_2STATE, NUM_2STATE>), dim3 (num_blocks), dim3 (BLOCKSIZE_2STATE/NUM_2STATE), 0 , 0 , p, g, state1, state2, beta1, beta2, beta3, alpha, eps, step, lr,
224+ hipLaunchKernelGGL (( kOptimizerStatic8bit2StateBlockwise <T, OPTIMIZER, BLOCKSIZE_2STATE, NUM_2STATE>), dim3 (num_blocks), dim3 (BLOCKSIZE_2STATE/NUM_2STATE), 0 , 0 , p, g, state1, state2, beta1, beta2, beta3, alpha, eps, step, lr,
225225 quantiles1, quantiles2, absmax1, absmax2, weight_decay, gnorm_scale, skip_zeros, n);
226226 CUDA_CHECK_RETURN (hipPeekAtLastError ());
227227 break ;
@@ -231,7 +231,7 @@ template<typename T, int OPTIMIZER> void optimizerStatic8bitBlockwise(
231231 case LION:
232232 num_blocks = n/BLOCKSIZE_1STATE;
233233 num_blocks = n % BLOCKSIZE_1STATE == 0 ? num_blocks : num_blocks + 1 ;
234- hipLaunchKernelGGL (( kOptimizerStatic8bit1StateBlockwise <T, OPTIMIZER, BLOCKSIZE_1STATE, NUM_1STATE>), dim3 (num_blocks), dim3 (BLOCKSIZE_1STATE/NUM_1STATE), 0 , 0 , p, g, state1, beta1, beta2, eps, step, lr,
234+ hipLaunchKernelGGL (( kOptimizerStatic8bit1StateBlockwise <T, OPTIMIZER, BLOCKSIZE_1STATE, NUM_1STATE>), dim3 (num_blocks), dim3 (BLOCKSIZE_1STATE/NUM_1STATE), 0 , 0 , p, g, state1, beta1, beta2, eps, step, lr,
235235 quantiles1, absmax1, weight_decay, gnorm_scale, skip_zeros, n);
236236 CUDA_CHECK_RETURN (hipPeekAtLastError ());
237237 break ;
@@ -245,7 +245,7 @@ template<typename T> void percentileClipping(T * g, float *gnorm_vec, int step,
245245 int num_blocks = n/2048 ;
246246 num_blocks = n % 2048 == 0 ? num_blocks : num_blocks + 1 ;
247247 CUDA_CHECK_RETURN (hipMemset (&gnorm_vec[step % 100 ], 0 , 1 *sizeof (float )));
248- hipLaunchKernelGGL (( kPercentileClipping <T, 2048 , 4 >), dim3 (num_blocks), dim3 (512 ), 0 , 0 , g, gnorm_vec, step, n);
248+ hipLaunchKernelGGL (( kPercentileClipping <T, 2048 , 4 >), dim3 (num_blocks), dim3 (512 ), 0 , 0 , g, gnorm_vec, step, n);
249249 CUDA_CHECK_RETURN (hipPeekAtLastError ());
250250}
251251
@@ -669,7 +669,7 @@ void spmm_coo(hipsparseHandle_t handle, int *A_rowidx, int *A_colidx, half *A_va
669669template <typename T, int BITS> void spmm_coo_very_sparse_naive (int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, T *B, half *out, float *dequant_stats, int nnz_rows, int nnz, int rowsA, int rowsB, int colsB)
670670{
671671
672- hipLaunchKernelGGL (( kspmm_coo_very_sparse_naive<T, 8 , BITS>), dim3 (nnz_rows), dim3 (256 ), 0 , 0 , max_count, max_idx, offset_rowidx, rowidx, colidx, values, B, out, dequant_stats, nnz, rowsA, rowsB, colsB);
672+ hipLaunchKernelGGL (( kspmm_coo_very_sparse_naive<T, 8 , BITS>), dim3 (nnz_rows), dim3 (256 ), 0 , 0 , max_count, max_idx, offset_rowidx, rowidx, colidx, values, B, out, dequant_stats, nnz, rowsA, rowsB, colsB);
673673 CUDA_CHECK_RETURN (hipPeekAtLastError ());
674674}
675675
@@ -679,17 +679,17 @@ template <typename T> void gemm_host(int m, int n, int k, T * A, T* B, T * out
679679 int num_blocks = (m+31 )/32 ;
680680
681681 if (bits == 32 )
682- hipLaunchKernelGGL (( gemm_device<T, 32 , 32 >), dim3 (num_blocks), dim3 (32 ), 0 , 0 , m, n, k, A, B, out, lda, ldb, ldc);
682+ hipLaunchKernelGGL (( gemm_device<T, 32 , 32 >), dim3 (num_blocks), dim3 (32 ), 0 , 0 , m, n, k, A, B, out, lda, ldb, ldc);
683683 if (bits == 16 )
684- hipLaunchKernelGGL (( gemm_device<T, 16 , 160 >), dim3 (num_blocks), dim3 (160 ), 0 , 0 , m, n, k, A, B, out, lda, ldb, ldc);
684+ hipLaunchKernelGGL (( gemm_device<T, 16 , 160 >), dim3 (num_blocks), dim3 (160 ), 0 , 0 , m, n, k, A, B, out, lda, ldb, ldc);
685685}
686686
687687template <typename T> void gemm_4bit_inference (int m, int n, int k, T * A, unsigned char * B, float *absmax, T * out, int lda, int ldb, int ldc, int blocksize)
688688{
689689
690690 int num_blocks = (m+31 )/32 ;
691691
692- hipLaunchKernelGGL (( kgemm_4bit_inference<T, 96 >), dim3 (num_blocks), dim3 (96 ), 0 , 0 , m, n, k, A, B, absmax, out, lda, ldb, ldc, blocksize);
692+ hipLaunchKernelGGL (( kgemm_4bit_inference<T, 96 >), dim3 (num_blocks), dim3 (96 ), 0 , 0 , m, n, k, A, B, absmax, out, lda, ldb, ldc, blocksize);
693693}
694694
695695template <typename T, int BITS> void gemm_4bit_inference_naive (int m, int n, int k, T * A, unsigned char * B, float *absmax, float *datatype, T * out, int lda, int ldb, int ldc, int blocksize, hipStream_t stream)
@@ -712,7 +712,7 @@ template <typename T, int FUNC> void func(T *A, T *B, T value, long n)
712712 int blocks = n/threads;
713713 blocks = n % threads == 0 ? blocks : blocks + 1 ;
714714 blocks = blocks > 65535 ? 65535 : blocks;
715- hipLaunchKernelGGL (( kfunc<T, FUNC>), dim3 (blocks), dim3 (512 ), 0 , 0 , A, B, value, n);
715+ hipLaunchKernelGGL (( kfunc<T, FUNC>), dim3 (blocks), dim3 (512 ), 0 , 0 , A, B, value, n);
716716 CUDA_CHECK_RETURN (hipPeekAtLastError ());
717717}
718718
0 commit comments