Commit dfc4668

int8: specify CUDA stream for int8 ops
1 parent d231db7 commit dfc4668

6 files changed: +62 -57 lines changed
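Why the change matters: PyTorch enqueues its own kernels on the current CUDA stream, which is not necessarily the legacy default stream. Before this commit, the int8 kernels were launched with the two-argument <<<grid, block>>> configuration and cublasLtMatmul received stream 0, so all of that work went to the default stream regardless of what PyTorch was doing. A minimal sketch of the hazard (assumes any CUDA-enabled PyTorch; not specific to bitsandbytes):

import torch

s = torch.cuda.Stream()
with torch.cuda.stream(s):
    A = torch.randn(32, 64, device="cuda", dtype=torch.float16)
    # Everything PyTorch enqueues here goes onto `s`. PyTorch streams are
    # typically created non-blocking, so a kernel launched on the default
    # stream is not ordered after the randn above and could read `A` too
    # early.
    assert torch.cuda.current_stream(A.device) == s
torch.cuda.synchronize()

The commit threads the stream returned by torch.cuda.current_stream through every int8 entry point, file by file below.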

bitsandbytes/functional.py

Lines changed: 22 additions & 16 deletions
@@ -442,8 +442,7 @@ def is_on_gpu(tensors: Iterable[torch.Tensor]):
 
 
 def get_tensor_stream(tensor: Tensor) -> torch.cuda.Stream:
-    stream = torch.cuda.current_stream(tensor.device)
-    return stream
+    return torch.cuda.current_stream(tensor.device)
 
 
 def get_ptr(A: Optional[Tensor]) -> Optional[ct.c_void_p]:
@@ -461,8 +460,8 @@ def get_ptr(A: Optional[Tensor]) -> Optional[ct.c_void_p]:
     """
     if A is None:
         return None
-    else:
-        return ct.c_void_p(A.data.data_ptr())
+
+    return ct.c_void_p(A.data_ptr())
 
 
 def pre_call(device):
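Note on the get_ptr cleanup: Tensor.data is a legacy alias that shares storage with the tensor itself, so dropping it does not change the address that reaches C. A quick check (assumes any CUDA tensor):

import ctypes as ct
import torch

A = torch.arange(4, device="cuda", dtype=torch.int8)
# `.data` shares storage with `A`, so both expressions yield the same raw
# device address; the new code simply skips the redundant hop.
assert A.data_ptr() == A.data.data_ptr()
ptr = ct.c_void_p(A.data_ptr())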
@@ -2323,11 +2322,12 @@ def igemmlt(A, B, out=None, Sout=None, dtype=torch.int32):
     ptrC = get_ptr(out)
     ptrRowScale = get_ptr(None)
     m, n, k, lda, ldb, ldc = map(ct.c_int32, (m, n, k, lda, ldb, ldc))
+    stream = get_tensor_stream(A)
 
     if dtype == torch.int32:
-        has_error = lib.cigemmlt_32(ctx, m, n, k, ptrA, ptrB, ptrC, ptrRowScale, lda, ldb, ldc)
+        has_error = lib.cigemmlt_32(ctx, m, n, k, ptrA, ptrB, ptrC, ptrRowScale, lda, ldb, ldc, stream)
     else:
-        has_error = lib.cigemmlt_8(ctx, m, n, k, ptrA, ptrB, ptrC, ptrRowScale, lda, ldb, ldc)
+        has_error = lib.cigemmlt_8(ctx, m, n, k, ptrA, ptrB, ptrC, ptrRowScale, lda, ldb, ldc, stream)
 
     if has_error == 100:  # `ERR_NOT_IMPLEMENTED` is defined as 100 in `ops.cu`
         raise NotImplementedError("igemmlt not implemented!")
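Note: get_tensor_stream returns a torch.cuda.Stream object, which is what the ctypes calls above receive. For reference, the raw cudaStream_t handle is exposed as Stream.cuda_stream; a hypothetical helper that passes the handle explicitly as an opaque pointer (get_tensor_stream_handle is illustrative, not part of this commit) would look like:

import ctypes as ct
import torch

def get_tensor_stream_handle(tensor: torch.Tensor) -> ct.c_void_p:
    # Stream.cuda_stream is the integer value of the underlying
    # cudaStream_t, which ctypes can hand to C as a void*.
    return ct.c_void_p(torch.cuda.current_stream(tensor.device).cuda_stream)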
@@ -2373,13 +2373,7 @@ def mm_dequant(
 
     with torch.cuda.device_of(A):
         lib.cdequant_mm_int32_fp16(
-            ptrA,
-            ptrRowStats,
-            ptrColStats,
-            ptrOut,
-            ptrBias,
-            numRows,
-            numCols,
+            ptrA, ptrRowStats, ptrColStats, ptrOut, ptrBias, numRows, numCols, get_tensor_stream(A)
         )
 
     return out
@@ -2428,7 +2422,14 @@ def get_row_absmax(A, threshold=0.0):
     is_on_gpu([A])
 
     with torch.cuda.device_of(A):
-        lib.cget_row_stats(get_ptr(A), get_ptr(row_stats), ct.c_float(threshold), ct.c_int32(rows), ct.c_int32(cols))
+        lib.cget_row_stats(
+            get_ptr(A),
+            get_ptr(row_stats),
+            ct.c_float(threshold),
+            ct.c_int32(rows),
+            ct.c_int32(cols),
+            get_tensor_stream(A),
+        )
 
     return row_stats
 
@@ -2547,12 +2548,16 @@ def int8_vectorwise_quant(A: torch.Tensor, threshold=0.0):
     rows = prod(A.shape[:-1])
     cols = A.shape[-1]
 
-    row_stats = torch.empty((rows,), device=A.device, dtype=torch.float32)
+    row_stats = torch.empty(rows, device=A.device, dtype=torch.float32)
     out_row = torch.empty(A.shape, device=A.device, dtype=torch.int8)
 
     if threshold > 0.0:
         # TODO we could improve perf of this
-        coo_tensor = extract_outliers_new(A, threshold)
+
+        # A.masked_fill(A.abs() < threshold, 0.0).to_sparse_coo()
+        # coo_tensor = extract_outliers_new(A, threshold)
+        coo_tensor = torch.masked_fill(A, A.abs() < threshold, 0.0).to_sparse_coo()
+
     else:
         coo_tensor = None
 
@@ -2564,6 +2569,7 @@ def int8_vectorwise_quant(A: torch.Tensor, threshold=0.0):
             ct.c_float(threshold),
             ct.c_int32(rows),
             ct.c_int32(cols),
+            get_tensor_stream(A),
         )
 
     return out_row, row_stats, coo_tensor
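The threshold branch above now builds the outlier COO tensor with plain PyTorch ops instead of extract_outliers_new: zero out every entry below the threshold, then convert the survivors to sparse COO. A worked example of that one-liner (values chosen arbitrarily):

import torch

A = torch.tensor([[0.5, 8.0, -0.1],
                  [-7.5, 0.2, 0.0]], device="cuda", dtype=torch.float16)
threshold = 6.0
# Entries with |a| < threshold are zeroed; to_sparse_coo() then keeps only
# the nonzero outliers: 8.0 at (0, 1) and -7.5 at (1, 0).
coo = torch.masked_fill(A, A.abs() < threshold, 0.0).to_sparse_coo()
print(coo.indices())  # tensor([[0, 1], [1, 0]], device='cuda:0')
print(coo.values())   # tensor([ 8.0000, -7.5000], ...)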

bitsandbytes/nn/modules.py

Lines changed: 1 addition & 3 deletions
@@ -481,10 +481,8 @@ def forward(self, x: torch.Tensor):
             x = x.to(self.compute_dtype)
 
         bias = None if self.bias is None else self.bias.to(self.compute_dtype)
-        out = bnb.matmul_4bit(x, self.weight.t(), bias=bias, quant_state=self.weight.quant_state)
-
-        out = out.to(inp_dtype)
 
+        out = bnb.matmul_4bit(x, self.weight.t(), bias=bias, quant_state=self.weight.quant_state).to(inp_dtype)
         return out
 
 
csrc/kernels.cu

Lines changed: 1 addition & 1 deletion
@@ -3558,6 +3558,7 @@ template <typename T, int THREADS, int BITS> __global__ void kgemm_4bit_inferenc
   const int warp_idx = threadIdx.x / 32;
   const int warp_lane = threadIdx.x % 32;
   const int row_B = (THREADS/32)*blockIdx.x + warp_idx;
+  const int offset_B = ldb*row_B;
   const int num_values_8bit = num_values_4bit/2;
   float local_C = 0.0f;
 
@@ -3578,7 +3579,6 @@
   for(int inner_idx = warp_lane*num_values_4bit; inner_idx < K; inner_idx += 32*num_values_4bit)
   {
     const int inner_idx_halved = inner_idx/2;
-    const int offset_B = ldb*row_B;
     const int absidx = ((2*offset_B)+inner_idx) >> (31 - __clz(blocksize));
     //int absidx = ((2*offset_B)+inner_idx)/blocksize;
     local_absmax = __ldg(&(absmax[absidx]));
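(This hunk is independent of the stream work: it hoists the loop-invariant offset_B = ldb*row_B out of the per-warp inner loop of the 4-bit inference GEMM kernel, so it is computed once per thread instead of on every iteration.)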

csrc/ops.cu

Lines changed: 16 additions & 15 deletions
@@ -423,7 +423,8 @@ template <int DTYPE_OUT, int SCALE_ROWS> int igemmlt(
     const int8_t * B,
     void * C,
     float * row_scale,
-    int lda, int ldb, int ldc
+    int lda, int ldb, int ldc,
+    cudaStream_t stream
 ) {
 
     // Calculate C = A^T @ B, in col-major layout.
@@ -461,7 +462,7 @@ template <int DTYPE_OUT, int SCALE_ROWS> int igemmlt(
       B, bDesc, &beta,
       (int32_t*)C, cDesc,
       (int32_t*)C, cDesc,
-      NULL, NULL, 0, 0
+      NULL, NULL, 0, stream
     ));
   } else {
     if (!SCALE_ROWS) {
@@ -472,7 +473,7 @@ template <int DTYPE_OUT, int SCALE_ROWS> int igemmlt(
         B, bDesc, &beta,
         (int8_t*)C, cDesc,
         (int8_t*)C, cDesc,
-        NULL, NULL, 0, 0
+        NULL, NULL, 0, stream
       ));
     } else {
       cublasLtPointerMode_t alphaVec = CUBLASLT_POINTER_MODE_ALPHA_DEVICE_VECTOR_BETA_HOST;
@@ -489,7 +490,7 @@ template <int DTYPE_OUT, int SCALE_ROWS> int igemmlt(
         B, bDesc, &beta,
         (int8_t*)C, cDesc,
         (int8_t*)C, cDesc,
-        NULL, NULL, 0, 0
+        NULL, NULL, 0, stream
       ));
     }
   }
@@ -510,23 +511,23 @@ int fill_up_to_nearest_multiple(int value, int multiple)
   return value + (value % multiple == 0 ? 0 : (multiple - (value % multiple)));
 }
 
-void dequant_mm_int32_fp16(int *A, float *rowStats, float *colStats, half *out, half *bias, int numRows, int numCols)
+void dequant_mm_int32_fp16(int *A, float *rowStats, float *colStats, half *out, half *bias, int numRows, int numCols, cudaStream_t stream)
 {
   const int threads = 512;
   const int num_per_thread = 4;
   const int num_per_block = threads * num_per_thread;
   const int n = numRows*numCols;
   const int num_blocks = (n + num_per_block - 1) / num_per_block;
 
-  kdequant_mm_int32_fp16<num_per_thread, threads><<<num_blocks, threads>>>(A, rowStats, colStats, out, bias, numRows, numCols, n);
+  kdequant_mm_int32_fp16<num_per_thread, threads><<<num_blocks, threads, 0, stream>>>(A, rowStats, colStats, out, bias, numRows, numCols, n);
   CUDA_CHECK_RETURN(cudaPeekAtLastError());
 }
 
-void int8VectorQuant(half * __restrict__ A, int8_t *out, float *rowStats, float threshold, int rows, int cols) {
+void int8VectorQuant(half * __restrict__ A, int8_t *out, float *rowStats, float threshold, int rows, int cols, cudaStream_t stream) {
   if (threshold == 0.0) {
-    kInt8VectorQuant<half, 1024, 0><<<rows, 1024>>>(A, out, rowStats, threshold, rows, cols);
+    kInt8VectorQuant<half, 1024, 0><<<rows, 1024, 0, stream>>>(A, out, rowStats, threshold, rows, cols);
   } else {
-    kInt8VectorQuant<half, 1024, 1><<<rows, 1024>>>(A, out, rowStats, threshold, rows, cols);
+    kInt8VectorQuant<half, 1024, 1><<<rows, 1024, 0, stream>>>(A, out, rowStats, threshold, rows, cols);
  }
   CUDA_CHECK_RETURN(cudaPeekAtLastError());
 }
@@ -553,11 +554,11 @@ void getColRowStats(half * A, float *rowStats, float *colStats, int *nnz_count_r
 
 }
 
-void getRowStats(half *A, float *rowStats, float threshold, int rows, int cols) {
+void getRowStats(half *A, float *rowStats, float threshold, int rows, int cols, cudaStream_t stream) {
   if (threshold == 0.0)
-    kgetRowStats<half, 1024, 0><<<rows, 1024>>>(A, rowStats, threshold, rows, cols);
+    kgetRowStats<half, 1024, 0><<<rows, 1024, 0, stream>>>(A, rowStats, threshold, rows, cols);
   else
-    kgetRowStats<half, 1024, 1><<<rows, 1024>>>(A, rowStats, threshold, rows, cols);
+    kgetRowStats<half, 1024, 1><<<rows, 1024, 0, stream>>>(A, rowStats, threshold, rows, cols);
   CUDA_CHECK_RETURN(cudaPeekAtLastError());
 }
 
@@ -795,9 +796,9 @@ template void extractOutliers<COL_AMPERE>(char * A, int *idx, char *out, int idx
 template void spmm_coo_very_sparse_naive<half, 16>(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, half *B, half *out, float *dequant_stats, int nnz_rows, int nnz, int rowsA, int rowsB, int colsB);
 template void spmm_coo_very_sparse_naive<signed char, 8>(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, signed char *B, half *out, float *dequant_stats, int nnz_rows, int nnz, int rowsA, int rowsB, int colsB);
 
-template int igemmlt<32, 0>(cublasLtHandle_t ltHandle, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc);
-template int igemmlt<8, 0>(cublasLtHandle_t ltHandle, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc);
-template int igemmlt<8, 1>(cublasLtHandle_t ltHandle, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc);
+template int igemmlt<32, 0>(cublasLtHandle_t ltHandle, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc, cudaStream_t stream);
+template int igemmlt<8, 0>(cublasLtHandle_t ltHandle, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc, cudaStream_t stream);
+template int igemmlt<8, 1>(cublasLtHandle_t ltHandle, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc, cudaStream_t stream);
 
 template void transformRowToFormat<COL32, 0>(char * A, char *out, int rows, int cols);
 template void transformRowToFormat<COL32, 1>(char * A, char *out, int rows, int cols);
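In the launch configuration <<<blocks, threads, 0, stream>>> above, the third argument is dynamic shared memory (unused here) and the fourth is the stream; cublasLtMatmul likewise takes the stream as its final argument. With every launch now on the caller's stream, the int8 path composes with user-selected streams. A usage sketch (assumes a bitsandbytes build containing this commit):

import torch
import bitsandbytes.functional as F

s = torch.cuda.Stream()
with torch.cuda.stream(s):
    A = torch.randn(32, 64, device="cuda", dtype=torch.float16)
    # The quantization kernel is launched on `s` via get_tensor_stream(A),
    # so it is correctly ordered after the randn above.
    out_row, row_stats, coo = F.int8_vectorwise_quant(A)
torch.cuda.synchronize()
print(out_row.dtype, row_stats.shape)  # torch.int8 torch.Size([32])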

csrc/ops.cuh

Lines changed: 4 additions & 4 deletions
@@ -171,16 +171,16 @@ void gemmex(Context * context, bool transposeA, bool transposeB, int m, int n, i
 void strided_gemmex(Context *context, bool transposeA, bool transposeB, int m, int n, int k, void *A, void *B, void *C, int lda, int ldb, int ldc,
                     long long int strideA, long long int strideB, long long int strideC, int batchCount);
 
-template <int DTYPE_OUT, int SCALE_ROWS> int igemmlt(cublasLtHandle_t ltHandle, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc);
+template <int DTYPE_OUT, int SCALE_ROWS> int igemmlt(cublasLtHandle_t ltHandle, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc, cudaStream_t stream);
 
 template <typename T, int SRC, int TARGET, bool transpose, int DTYPE> void transform(cublasLtHandle_t ltHandle, T *A, T *out, int dim1, int dim2);
 void cutlass_igemm(bool transposeA, bool transposeB, int m, int n, int k, void *A, void *B, void *C, int lda, int ldb, int ldc);
-void dequant_mm_int32_fp16(int *A, float *rowStats, float *colStats, half *out, half* bias, int numRows, int numCols);
+void dequant_mm_int32_fp16(int *A, float *rowStats, float *colStats, half *out, half* bias, int numRows, int numCols, cudaStream_t stream);
 void getColRowStats(half * A, float *rowStats, float *colStats, int *nnz_count_row, float nnz_threshold, int rows, int cols);
-void getRowStats(half *A, float *rowStats, float threshold, int rows, int cols);
+void getRowStats(half *A, float *rowStats, float threshold, int rows, int cols, cudaStream_t stream);
 void doubleRowColQuant(half * A, float *rowStats, float *colStats, char *out_col_normed, char *out_row_normed,
                        int *rowidx, int *colidx, half *val, int *nnz_block_ptr, float threshold, int rows, int cols);
-void int8VectorQuant(half * __restrict__ A, int8_t *out, float *rowStats, float threshold, int rows, int cols);
+void int8VectorQuant(half * __restrict__ A, int8_t *out, float *rowStats, float threshold, int rows, int cols, cudaStream_t stream);
 
 template <int FORMAT, int TRANSPOSE> void transformRowToFormat(char * A, char *out, int rows, int cols);
 
csrc/pythonInterface.cpp

Lines changed: 18 additions & 18 deletions
@@ -175,14 +175,14 @@ void transform_row2ampereT(char * A, char *out, int rows, int cols){ transformRo
 void extractOutliers_turing(char * A, int *idx, char *out, int idx_size, int rows, int cols){ extractOutliers<COL_TURING>(A, idx, out, idx_size, rows, cols); }
 void extractOutliers_ampere(char * A, int *idx, char *out, int idx_size, int rows, int cols){ extractOutliers<COL_AMPERE>(A, idx, out, idx_size, rows, cols); }
 
-int igemmlt_32(cublasLtHandle_t ltHandle, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc) {
-  return igemmlt<32, 0>(ltHandle, m, n, k, A, B, C, row_scale, lda, ldb, ldc);
+int igemmlt_32(cublasLtHandle_t ltHandle, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc, cudaStream_t stream) {
+  return igemmlt<32, 0>(ltHandle, m, n, k, A, B, C, row_scale, lda, ldb, ldc, stream);
 }
-int igemmlt_8(cublasLtHandle_t ltHandle, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc) {
-  return igemmlt<8, 0>(ltHandle, m, n, k, A, B, C, row_scale, lda, ldb, ldc);
+int igemmlt_8(cublasLtHandle_t ltHandle, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc, cudaStream_t stream) {
+  return igemmlt<8, 0>(ltHandle, m, n, k, A, B, C, row_scale, lda, ldb, ldc, stream);
 }
-int igemmlt_8_rowscale(cublasLtHandle_t ltHandle, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc) {
-  return igemmlt<8, 1>(ltHandle, m, n, k, A, B, C, row_scale, lda, ldb, ldc);
+int igemmlt_8_rowscale(cublasLtHandle_t ltHandle, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc, cudaStream_t stream) {
+  return igemmlt<8, 1>(ltHandle, m, n, k, A, B, C, row_scale, lda, ldb, ldc, stream);
 }
 
 void spmm_coo_very_sparse_naive_fp16(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, half *B, half *out, float *dequant_stats, int nnz_rows, int nnz, int rowsA, int rowsB, int colsB)
@@ -308,14 +308,14 @@ extern "C"
   Context *get_context(){ return new Context(); }
   ContextCusparse *get_cusparse(){ return new ContextCusparse(); }
 
-  int cigemmlt_32(Context *context, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc) {
-    return igemmlt_32((cublasLtHandle_t) context->m_handle, m, n, k, A, B, C, row_scale, lda, ldb, ldc);
+  int cigemmlt_32(Context *context, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc, cudaStream_t stream) {
+    return igemmlt_32((cublasLtHandle_t) context->m_handle, m, n, k, A, B, C, row_scale, lda, ldb, ldc, stream);
   }
-  int cigemmlt_8(Context *context, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc) {
-    return igemmlt_8((cublasLtHandle_t) context->m_handle, m, n, k, A, B, C, row_scale, lda, ldb, ldc);
+  int cigemmlt_8(Context *context, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc, cudaStream_t stream) {
+    return igemmlt_8((cublasLtHandle_t) context->m_handle, m, n, k, A, B, C, row_scale, lda, ldb, ldc, stream);
   }
-  int cigemmlt_8_rowscale(Context *context, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc) {
-    return igemmlt_8_rowscale((cublasLtHandle_t) context->m_handle, m, n, k, A, B, C, row_scale, lda, ldb, ldc);
+  int cigemmlt_8_rowscale(Context *context, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc, cudaStream_t stream) {
+    return igemmlt_8_rowscale((cublasLtHandle_t) context->m_handle, m, n, k, A, B, C, row_scale, lda, ldb, ldc, stream);
   }
 
   #define MAKE_FUNC_CTRANSFORM(fbits, fsrc, ftrgt, ftranspose, dtype, src, target, transpose, bits) \
@@ -333,15 +333,15 @@ extern "C"
   MAKE_FUNC_CTRANSFORM(8, col32, row, n, int8_t, COL32, ROW, false, 8)
   MAKE_FUNC_CTRANSFORM(32, col32, row, n, int32_t, COL32, ROW, false, 32)
 
-  void cdequant_mm_int32_fp16(int *A, float *rowStats, float *colStats, half *out, half* bias, int numRows, int numCols)
-  { dequant_mm_int32_fp16(A, rowStats, colStats, out, bias, numRows, numCols); }
+  void cdequant_mm_int32_fp16(int *A, float *rowStats, float *colStats, half *out, half* bias, int numRows, int numCols, cudaStream_t stream)
+  { dequant_mm_int32_fp16(A, rowStats, colStats, out, bias, numRows, numCols, stream); }
   void cget_col_row_stats(half * A, float *rowStats, float *colStats, int *nnz_count_row, float nnz_threshold, int rows, int cols)
   { getColRowStats(A, rowStats, colStats, nnz_count_row, nnz_threshold, rows, cols); }
-  void cget_row_stats(half *A, float *rowStats, float threshold, int rows, int cols) {
-    getRowStats(A, rowStats, threshold, rows, cols);
+  void cget_row_stats(half *A, float *rowStats, float threshold, int rows, int cols, cudaStream_t stream) {
+    getRowStats(A, rowStats, threshold, rows, cols, stream);
   }
-  void cint8_vector_quant(half * __restrict__ A, int8_t *out, float *rowStats, float threshold, int rows, int cols) {
-    int8VectorQuant(A, out, rowStats, threshold, rows, cols);
+  void cint8_vector_quant(half * __restrict__ A, int8_t *out, float *rowStats, float threshold, int rows, int cols, cudaStream_t stream) {
+    int8VectorQuant(A, out, rowStats, threshold, rows, cols, stream);
   }
   void cdouble_rowcol_quant(half * A, float *rowStats, float *colStats, char *out_col_normed, char *out_row_normed, int *rowidx, int *colidx, half *val, int *nnz_row_ptr, float threshold, int rows, int cols)
   { doubleRowColQuant(A, rowStats, colStats, out_col_normed, out_row_normed, rowidx, colidx, val, nnz_row_ptr, threshold, rows, cols); }