@@ -2205,333 +2205,6 @@ __global__ void kdequant_mm_int32_fp16(
  }
}

-template <int THREADS, int ITEMS_PER_THREAD, int TILE_ROWS, int TILE_COLS, int TRANSPOSE, int FORMAT> __global__ void kTransformRowToFormat(char *__restrict__ const A, char *out, int rows, int cols, int tiledCols, int outRows, int outCols)
-{
-
-  // 0. Load data into 32*32 shared memory tiles
-  // 1. transpose / reorder in shared memory
-  // 2. store
-
-  // COL32 FORMAT:
-  // rows*32 tiles
-
-  // TURING FORMAT:
-  // 8*32 tiles with 4*4 subtiles
-  // the 8*32 tile has first all 4*4 subtiles of even rows (max 4*4*4 = 64 elements);
-  // the subsequent 4*4 subtiles are for all odd rows; if some rows/columns are empty, the values are zero
-  // the tiles repeat after the 8*32 tile in column-major order, meaning the next 8 rows are A[8:16, 0:32];
-  // the next tile is the next 8 rows for the same 32 columns. Once all rows are finished, the column
-  // index increases by 32
-
-  // AMPERE FORMAT:
-  // 32*32 tiles with 8*32 subtiles. The rows are interleaved in pairs of two rows with an offset of 8 between pairs:
-  // row idx (each number stands for 32 values): [0 1 8 9 16 17 24 25] [2 3 10 11 18 19 26 27]...
-  // the tiles are column-major ordered, so after 1024*1024 values we process: A[32:64, 0:32]
-
-
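The three target layouts above can be summarized as closed-form index maps. The helper functions below are my own distillation of the store paths in the switch statement further down (the names are invented, and `rows` is assumed to be the padded row count the kernel calls `outRows`); they are a reading aid, not code from this commit:

```cuda
// COL32: panels of 32 columns; each panel is stored contiguously (rows*32
// values), row-major within the panel.
static int col32_index(int row, int col, int rows) {
    return (col / 32) * (32 * rows) + row * 32 + (col % 32);
}

// COL_TURING: 8x32 tiles (256 values) in column-major tile order; within a
// tile, the 4x4 subtiles of the even rows come first, then those of the odd rows.
static int turing_index(int row, int col, int rows) {
    int tile = (col / 32) * (32 * rows) + (row / 8) * 256;
    int sub  = ((col % 32) / 4) * 16 + (col % 4);       // position in a 4x4 subtile group
    int r    = row % 8;
    return tile + sub + ((r % 2) ? 128 + (r - 1) * 2    // odd rows fill the second half
                                 : r * 2);              // even rows fill the first half
}

// COL_AMPERE: 32x32 tiles (1024 values) in column-major tile order; rows are
// interleaved in pairs with stride 8: [0 1 8 9 16 17 24 25][2 3 10 11 ...].
static int ampere_index(int row, int col, int rows) {
    int r = row % 32;
    int interleaved = ((r % 8) / 2) * 8 + (r / 8) * 2 + (r % 2);
    return (col / 32) * (32 * rows) + (row / 32) * 1024 + interleaved * 32 + (col % 32);
}
```

For example, `ampere_index(9, 0, 32)` evaluates to `3*32`, matching the `[0 1 8 9 ...]` pattern quoted above.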
-  // To have efficient loads and stores if we transpose, we need 128 consecutive bytes, which at 1 byte each are 128 values
-  // As such we need:
-  // at least 32*4 shared memory tiles for col32; preferably 32*32
-  // at least 32*6 shared memory tiles for col32_ampere; preferably 32*32
-  // at least 32*8 shared memory tiles for col4_turing; preferably 32*32
-  // for efficient loading of row-major data we need to load 128 elements and repeat this for 32 rows
-  // this would imply a 32x128 shared memory tile -> 4kb
-  // It is more efficient to have more than 1 warp, so with 64 threads we need 32x128 -> 8 kb
-  // we have 64k shared mem per SM in Turing which is 8 blocks per SM which is 2*8 = 32 warps = 100% occupancy
-  // for Turing and 50% for A100 and 75% for RTX 30s / A40 which is probably good enough
-  // register pressure should be low with: 8 registers from local memory per block and 64 registers per SM
-  //
-  // to make the shared memory work with that occupancy we might need to union the block loads/stores
-
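As a sanity check on the shared-memory numbers above, here is the arithmetic for the `THREADS = 256, ITEMS_PER_THREAD = 8` instantiations at the bottom of this file (my figures, derived from the `smem_data` declaration below):

```cuda
constexpr int ITEMS      = 8;                // ITEMS_PER_THREAD in the instantiations below
constexpr int TILE_BYTES = 32 * 32 * ITEMS;  // 8192 B payload: a 32x256 char tile
constexpr int SMEM_BYTES = 32 * 33 * ITEMS;  // 8448 B as declared: one padding char per
                                             // 32-wide row slice to sidestep bank conflicts
static_assert(SMEM_BYTES - TILE_BYTES == 256, "padding is 256 bytes per block");
// With 64 KB of shared memory per SM, at most 65536 / 8448 = 7 such blocks fit.
```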
-  // each block loads TILE_COLS columns and TILE_ROWS rows
-  // after reading a tile, the row counter increases by TILE_ROWS
-  // the col counter resets after reading TILE_COLS elements
-  const int base_row = ((blockIdx.x*TILE_COLS)/tiledCols)*TILE_ROWS;
-  // col increases by TILE_COLS for each block and wraps back to 0 after tiledCols is reached
-  const int base_col = (blockIdx.x*TILE_COLS) % tiledCols;
-  const int base_idx = (base_row*cols) + base_col;
-
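A quick worked example of this block-to-tile mapping, with illustrative values `tiledCols = 512`, `TILE_ROWS = 32`, `TILE_COLS = 256` (chosen by me, not from the source):

```cuda
// blockIdx.x = 0 -> base_row =  0, base_col =   0   (first tile of rows 0..31)
// blockIdx.x = 1 -> base_row =  0, base_col = 256   (same rows, next 256 columns)
// blockIdx.x = 2 -> base_row = 32, base_col =   0   (col counter wraps, row band advances)
// blockIdx.x = 3 -> base_row = 32, base_col = 256
```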
-  // we load 128 bytes per warp, and 32 rows for transposes that fill col32 types,
-  // so that we can have contiguous stores
-  __shared__ char smem_data[32*33*ITEMS_PER_THREAD];
-  char local_data[ITEMS_PER_THREAD];
-  typedef cub::BlockExchange<char, THREADS, ITEMS_PER_THREAD> BlockExchange;
-
-  // we load row after row from the base position
-  int warps = blockDim.x/32;
-  int warp_id = threadIdx.x/32;
-  int warp_lane = threadIdx.x % 32;
-  int offset = 0;
-
-  int smem_row = 0;
-  // each warp loads one row of 128 bytes
-  for (int row = warp_id; row < TILE_ROWS; row += warps)
-  {
-    int i = base_idx + (row*cols);
-    // we load up to 128 bytes/items per load
-    int valid_items = cols - base_col > 32*ITEMS_PER_THREAD ? 32*ITEMS_PER_THREAD : cols - base_col;
-
-    // 0. Load data into 32*32 shared memory tiles
-    if (base_row + row < rows)
-    {
-      #pragma unroll ITEMS_PER_THREAD
-      for (int j = 0; j < ITEMS_PER_THREAD; j++)
-      {
-        int col_idx = warp_lane + (j*32);
-        if (col_idx < valid_items)
-          local_data[j] = A[i + col_idx];
-        else
-          local_data[j] = 0;
-      }
-    }
-    else
-    {
-      #pragma unroll ITEMS_PER_THREAD
-      for (int j = 0; j < ITEMS_PER_THREAD; j++)
-        local_data[j] = 0;
-    }
-
-    if (TRANSPOSE)
-    {
-      #pragma unroll ITEMS_PER_THREAD
-      for (int j = 0; j < ITEMS_PER_THREAD; j++)
-      {
-        int local_col = (32*j) + warp_lane;
-        // int local_row = row;
-        // store as 256x32
-        smem_data[(local_col*33) + row] = local_data[j];
-      }
-    }
-    else
-    {
-      // treat smem as 32x256, that is 32 rows and 256 columns
-      #pragma unroll ITEMS_PER_THREAD
-      for (int j = 0; j < ITEMS_PER_THREAD; j++)
-        smem_data[row*32*ITEMS_PER_THREAD + warp_lane + (j*32)] = local_data[j];
-    }
-
-    smem_row += warps;
-
-    // 1. transpose / reorder in shared memory
-    if (smem_row % 32 == 0)
-    {
-      smem_row = 0;
-      __syncthreads();
-
-      for (int subrow = warp_id; subrow < 32; subrow += warps)
-      {
-        for (int j = 0; j < ITEMS_PER_THREAD; j++)
-        {
-          switch (FORMAT)
-          {
-            case COL32:
-              if (TRANSPOSE)
-              {
-                // data lies in shared memory in the following way:
-                // row0 [col0 col1 ... col31]
-                // row1 [col0 col1 ... col31]
-                // ...
-                //
-                // As such we read consecutive entries with 256 threads (8 rows x 32 columns)
-                // as j increases, the row increases by a factor of 8
-                // We load 8 rows per subrow loop, and subrow increases by 8 per loop
-                // so we have an offset of 8 rows every loop, or (subrow/warps)*8 = (subrow/8)*8
-                const int jrow = j*ITEMS_PER_THREAD; // 8 rows per j
-                const int subrow_loop_row = (subrow/warps)*ITEMS_PER_THREAD*ITEMS_PER_THREAD; // 8 rows per j; 8j per subrow loop (subrow/warps)
-                // const int local_row = warp_id; // each warp_id is one row
-                // const int block_row = base_col; // block offset for row
-                // const int local_col = warp_lane
-                // const int global_col = base_row; // block offset for col
-                if ((base_col + subrow_loop_row + jrow + warp_id < outRows) && (base_row + warp_lane < rows))
-                {
-                  // each row has 32 columns and is offset by 1 to prevent bank conflicts during stores into smem
-                  char data = smem_data[(subrow_loop_row + jrow + warp_id)*33 + warp_lane];
-
-                  // every 32 columns we have a new tile
-                  // each tile has size outRows*32, and base_row advances in increments of 32
-                  offset = base_row*outRows;
-                  out[offset + (base_col + jrow + subrow_loop_row)*32 + threadIdx.x] = data;
-                }
-              }
-              else
-              {
-                if (((base_row + subrow) < rows) && (base_col + (j*32) + warp_lane < outCols))
-                {
-                  offset = (base_col/32)*(32*rows);
-                  char data = smem_data[(subrow*32*ITEMS_PER_THREAD) + (j*32) + warp_lane];
-                  out[offset + (base_row + subrow)*32 + (j*rows*32) + warp_lane] = data;
-                }
-              }
-              break;
-            case COL_TURING:
-              // TURING FORMAT:
-              // 8*32 tiles with 4*4 subtiles
-              // the 8*32 tile has first all 4*4 subtiles of even rows (max 4*4*4 = 64 elements);
-              // the subsequent 4*4 subtiles are for all odd rows; if some rows/columns are empty, the values are zero
-              // the tiles repeat after the 8*32 tile in column-major order, meaning the next 8 rows are A[8:16, 0:32];
-              // the next tile is the next 8 rows for the same 32 columns. Once all rows are finished, the column
-              // index increases by 32
-              //
-              // [0 0 0 0, 2 2 2 2, 4 4 4 4, 6 6 6 6, 0 0 0 0 ...]
-              if (TRANSPOSE)
-              {
-                const int jrow = j*ITEMS_PER_THREAD; // 8 rows per j
-                const int subrow_loop_row = (subrow/warps)*ITEMS_PER_THREAD*ITEMS_PER_THREAD; // 8 rows per j; 8j per subrow loop (subrow/warps)
-                // const int local_row = warp_id; // each warp_id is one row
-                // const int block_row = base_col; // block offset for row
-                // const int local_col = warp_lane
-                // const int global_col = base_row; // block offset for col
-                if ((base_col + subrow_loop_row + jrow + warp_id < outRows) && (base_row + warp_lane < rows))
-                {
-                  // each row has 32 columns and is offset by 1 to prevent bank conflicts during stores into smem
-                  char data = smem_data[(subrow_loop_row + jrow + warp_id)*33 + warp_lane];
-
-                  // every 32 columns we have a new tile
-                  // each tile has a size offset of 8*32 = 256 elements
-                  // for each row offset of 8 we advance the tile first;
-                  // after all rows are exhausted, we increase the col
-                  int row_offset = ((base_col + jrow + subrow_loop_row + warp_id)/8)*256; // global_row + jrow + subrow_loop_row + local_row; advance one tile (=256) every 8 rows
-
-                  // we increase by row_tile_column every 32 columns
-                  // base_row increases in increments of 32
-                  // int row_tile_column = 256*outRows/8; // there are outRows/8 row tiles, and each tile is 256 elements
-                  // int col_offset = (base_row/32)*row_tile_column;
-                  // -> we can remove the divisions to speed up compute since outRows is always a multiple of 8
-                  // 256*outRows/8*base_row/32 = outRows*base_row
-                  int col_offset = outRows*base_row;
-
-                  offset = row_offset + col_offset;
-
-                  // since we process an even number of rows with each j (8) and with each subrow (8j), we can determine
-                  // odd or even rows with the warp_id (each warp processes one row)
-                  // the col is warp_lane (max 32 columns per row) and the row is warp_id
-                  if (warp_id % 2 == 1)
-                    // odd
-                    offset += 128 + (warp_lane/4)*16 + (warp_lane%4) + (((warp_id%8)-1)*2);
-                  else
-                    // even
-                    offset += 0 + (warp_lane/4)*16 + (warp_lane%4) + ((warp_id%8)*2);
-
-                  out[offset] = data;
-                }
-              }
-              else
-              {
-                if (((base_row + subrow) < rows) && (base_col + (j*32) + warp_lane < outCols))
-                {
-                  char data = smem_data[(subrow*32*ITEMS_PER_THREAD) + (j*32) + warp_lane];
-                  // the offset designates the tile offset among the 8*32 tiles
-                  // we first increase rows and then columns. Since we load 128 columns at once,
-                  // we increase the offset by outRows*32 every 32 columns
-                  // additionally, we increase the offset by 8*32 = 256 every 8 rows
-                  offset = ((base_col + (j*32))/32)*outRows*32 + (((base_row + subrow)/8)*256); // global offset (8x32 tile)
-                  // the first 4 rows are reserved for even rows [0, 2, 4, 6], the next 4 for odd rows
-                  // each of these has 32 values, in total 32*4 = 128 as offset if odd
-                  // every set of 4 columns increases the total offset by 16
-                  // each even row increases the offset by 4: for example, row 2 is offset by 4, row 4 by 8, etc., so (subrow/2)*4 = subrow*2
-                  // this happens anew every 8 rows (subrow % 8)
-                  // one writes 4 columns at once, that is (col % 4), for the particular index in the subtile
-                  int subcol = warp_lane;
-
-                  // add local offset (4x4 subtile)
-                  if (subrow % 2 == 1)
-                    // odd
-                    offset += 128 + (subcol/4)*16 + (subcol%4) + (((subrow%8)-1)*2);
-                  else
-                    // even
-                    offset += 0 + (subcol/4)*16 + (subcol%4) + ((subrow%8)*2);
-
-                  out[offset] = data;
-                }
-              }
-              break;
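To make the even/odd subtile arithmetic concrete, here is one hand-computed example following the expressions above (my numbers, not from the source):

```cuda
// Element (row 5, col 13) inside the first 8x32 tile: subrow = 5 (odd), subcol = 13
//   offset = 128              // odd rows occupy the second half of the 256-value tile
//          + (13/4)*16        // = 48, selects the fourth 4x4 subtile group
//          + (13%4)           // =  1, column within the subtile
//          + ((5%8)-1)*2      // =  8, row within the subtile group
//          = 185
```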
-            case COL_AMPERE:
-              // AMPERE FORMAT:
-              // 32*32 tiles with 8*32 subtiles. The rows are interleaved in pairs of two rows with an offset of 8 between pairs:
-              // row idx (each number stands for 32 values): [0 1 8 9 16 17 24 25] [2 3 10 11 18 19 26 27]...
-              // the tiles are column-major ordered, so after 1024*1024 values we process: A[32:64, 0:32]
-              if (TRANSPOSE)
-              {
-                const int jrow = j*ITEMS_PER_THREAD; // 8 rows per j
-                const int subrow_loop_row = (subrow/warps)*ITEMS_PER_THREAD*ITEMS_PER_THREAD; // 8 rows per j; 8j per subrow loop (subrow/warps)
-                // const int local_row = warp_id; // each warp_id is one row
-                // const int block_row = base_col; // block offset for row
-                // const int local_col = warp_lane
-                // const int global_col = base_row; // block offset for col
-                if ((base_col + subrow_loop_row + jrow + warp_id < outRows) && (base_row + warp_lane < rows))
-                {
-                  // each row has 32 columns and is offset by 1 to prevent bank conflicts during stores into smem
-                  char data = smem_data[(subrow_loop_row + jrow + warp_id)*33 + warp_lane];
-
-                  // every 32 columns we have a new tile
-                  // each tile has a size offset of 32*32 = 1024 elements
-                  // for each row offset of 32 we advance the tile first;
-                  // after all rows are exhausted, we increase the col
-                  int row_offset = ((base_col + jrow + subrow_loop_row + warp_id)/32)*1024; // global_row + jrow + subrow_loop_row + local_row; advance one tile (=1024) every 32 rows
-
-                  // we increase by row_tile_column every 32 columns
-                  // base_row increases in increments of 32
-                  // int row_tile_column = 1024*outRows/32; // there are outRows/32 row tiles, and each tile is 1024 elements
-                  // int col_offset = (base_row/32)*row_tile_column;
-                  // -> we can remove the divisions to speed up compute since outRows is always a multiple of 8
-                  // 1024*outRows/32*base_row/32 = outRows*base_row
-                  int col_offset = outRows*base_row;
-
-                  offset = row_offset + col_offset;
-
-                  // same as in the non-transpose case (see below);
-                  // the difference is that now rows = cols
-                  // in this case warp_id = subrow
-
-                  // [0 1 8 9 16 17 24 25] [2 3 10 11 18 19 26 27]...
-                  // subrow % 8 -> [0,1] in tile0, [2,3] in tile 1, etc.
-                  // subrow % 2 -> 0 for the 1st row in the pair, 1 for the 2nd row
-                  // every 2 rows, the offset increases by two [0, 1, 8, 9...]
-                  // every 2 rows, the row index increases by 8 [0, 1, 8, 9...]
-                  int local_row = (jrow + warp_id) % 32; // offset for row > 32 is already calculated into row_offset
-                  int ampere_row = ((local_row % 8)/2)*8 + (local_row/8)*2 + (local_row % 2);
-
-                  // global offset + row with 32 cols each + 32 cols per j + col_idx = warp_lane
-                  out[offset + (ampere_row*32) + warp_lane] = data;
-                }
-              }
-              else
-              {
-                if (((base_row + subrow) < rows) && (base_col + (j*32) + warp_lane < outCols))
-                {
-                  char data = smem_data[(subrow*32*ITEMS_PER_THREAD) + (j*32) + warp_lane];
-
-                  // the offset designates the tile offset among the 32*32 tiles
-                  // we first increase rows and then columns. Since we load 128 columns at once,
-                  // we increase the offset by outRows*32 every 32 columns
-                  // additionally, we increase the offset by 32*32 = 1024 every 32 rows
-                  offset = ((base_col + (j*32))/32)*outRows*32 + (((base_row + subrow)/32)*1024); // global offset (32x32 tile)
-
-                  // [0 1 8 9 16 17 24 25] [2 3 10 11 18 19 26 27]...
-                  // subrow % 8 -> [0,1] in tile0, [2,3] in tile 1, etc.
-                  // subrow % 2 -> 0 for the 1st row in the pair, 1 for the 2nd row
-                  // every 2 rows, the offset increases by two [0, 1, 8, 9...]
-                  // every 2 rows, the row index increases by 8 [0, 1, 8, 9...]
-                  int local_row = ((subrow % 8)/2)*8 + (subrow/8)*2 + (subrow % 2);
-
-                  // global offset + row with 32 cols each + 32 cols per j + col_idx
-                  out[offset + (local_row*32) + warp_lane] = data;
-                }
-              }
-              break;
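Evaluating the `local_row` expression for the first few values of `subrow` reproduces the interleaving pattern quoted in the comments (a hand check):

```cuda
// subrow:    0  1  2  3   4   5   6   7   8  9  10  11 ...
// local_row: 0  1  8  9  16  17  24  25   2  3  10  11 ...
// e.g. subrow = 8: ((8%8)/2)*8 + (8/8)*2 + (8%2) = 0 + 2 + 0 = 2
```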
-          }
-        }
-      }
-    }
-  }
-}
-
#define DENORM 1.0f/127.0f
#define MAX_SPARSE_COUNT 32
#define SMEM_SIZE 8*256
@@ -3386,13 +3059,6 @@ template __global__ void kspmm_coo_very_sparse_naive<signed char, 8, 8>(int *max
template __global__ void kspmm_coo_very_sparse_naive<signed char, 16, 8>(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, signed char *B, half *out, float * __restrict__ const dequant_stats, int nnz, int rowsA, int rowsB, int colsB);
template __global__ void kspmm_coo_very_sparse_naive<signed char, 32, 8>(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, signed char *B, half *out, float * __restrict__ const dequant_stats, int nnz, int rowsA, int rowsB, int colsB);

-template __global__ void kTransformRowToFormat<256, 8, 32, 32*8, 0, COL32>(char *__restrict__ const A, char *out, int rows, int cols, int tiledCols, int outRows, int outCols);
-template __global__ void kTransformRowToFormat<256, 8, 32, 32*8, 1, COL32>(char *__restrict__ const A, char *out, int rows, int cols, int tiledCols, int outRows, int outCols);
-template __global__ void kTransformRowToFormat<256, 8, 32, 32*8, 0, COL_TURING>(char *__restrict__ const A, char *out, int rows, int cols, int tiledCols, int outRows, int outCols);
-template __global__ void kTransformRowToFormat<256, 8, 32, 32*8, 1, COL_TURING>(char *__restrict__ const A, char *out, int rows, int cols, int tiledCols, int outRows, int outCols);
-template __global__ void kTransformRowToFormat<256, 8, 32, 32*8, 0, COL_AMPERE>(char *__restrict__ const A, char *out, int rows, int cols, int tiledCols, int outRows, int outCols);
-template __global__ void kTransformRowToFormat<256, 8, 32, 32*8, 1, COL_AMPERE>(char *__restrict__ const A, char *out, int rows, int cols, int tiledCols, int outRows, int outCols);
-
template __global__ void kdequant_mm_int32_fp16<4, 512>(int *__restrict__ const A, float *__restrict__ const rowStats, float *__restrict__ const colStats, half *out, half * __restrict__ const bias, const int numRows, const int numCols, const int n);

template __device__ unsigned char dQuantize<0>(float * smem_code, const float rand, float x);