You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
template<typename T, int THREADS, int ITEMS_PER_THREAD, int TILE_ROWS, int TILE_COLS, int SPARSE_DECOMP> __global__voidkgetColRowStats(T * __restrict__ A, float *rowStats, float *colStats, int * nnz_count_row, float nnz_threshold, int rows, int cols, int tiledRows, int tiledCols)
// Explicit instantiation of the kgetColRowStats kernel template for T = half,
// THREADS = 64, ITEMS_PER_THREAD = 4, TILE_ROWS = 16, TILE_COLS = 64*4 and
// SPARSE_DECOMP = 0.
template __global__ void kgetColRowStats<half, 64, 4, 16, 64*4, 0>(half * __restrict__ A, float *rowStats, float *colStats, int * nnz_count_row, float nnz_threshold, int rows, int cols, int tiledRows, int tiledCols);
2389
-
// Explicit instantiation of the kgetColRowStats kernel template for T = half,
// THREADS = 64, ITEMS_PER_THREAD = 4, TILE_ROWS = 16, TILE_COLS = 64*4 and
// SPARSE_DECOMP = 1.
template __global__ void kgetColRowStats<half, 64, 4, 16, 64*4, 1>(half * __restrict__ A, float *rowStats, float *colStats, int * nnz_count_row, float nnz_threshold, int rows, int cols, int tiledRows, int tiledCols);
2390
2236
// Explicit instantiation of the kgetRowStats kernel template for T = half,
// THREADS = 1024 and SPARSE_DECOMP = 0.
template __global__ void kgetRowStats<half, 1024, 0>(half * __restrict__ A, float *rowStats, float threshold, int rows, int cols);
2391
2237
// Explicit instantiation of the kgetRowStats kernel template for T = half,
// THREADS = 1024 and SPARSE_DECOMP = 1.
template __global__ void kgetRowStats<half, 1024, 1>(half * __restrict__ A, float *rowStats, float threshold, int rows, int cols);
template <int THREADS, int ITEMS_PER_THREAD, int TILE_ROWS, int TILE_COLS, int SPARSE_DECOMP> __global__voidkDoubleRowColQuant(half *__restrict__const A, float *__restrict__const rowStats, float * __restrict__const colStats, char *out_col_normed, char *out_row_normed, int *rowidx, int *colidx, half *val, int * __restrict__ nnz_block_ptr, float threshold, int rows, int cols, int tiledCols)
2462
-
{
2463
-
// assumes TILE_SIZE == THREADS*ITEMS_PER_THREAD
2464
-
// Each thread reads the same column but multiple rows
2465
-
// Rows are loaded in shared memory and access is shared across the threadblock (broadcast)
2466
-
2467
-
// 0. Load row stats data into shared memory; load col stat (1 fixed per thread)
2468
-
// 1. Load data row by row (should be at least with TILE_SIZE = 512)
2469
-
// 2. quantize data with row/col stats
2470
-
// 3. Store data (TILE_SIZE = 512 is a bit slow, but should still be close enough to good performance)
2471
-
2472
-
// each block loads TILE_COLS columns and TILE_ROWS rows
2473
-
// after reading a tile the row counter increases by TILE_ROWS
2474
-
// the col counter resets after reading TILE_COLS elements
template <int THREADS, int ITEMS_PER_THREAD, int TILE_ROWS, int TILE_COLS, int TRANSPOSE, int FORMAT> __global__voidkTransformRowToFormat(char *__restrict__const A, char *out, int rows, int cols, int tiledCols, int outRows, int outCols)
// Forward declaration of the kgetColRowStats CUDA kernel template.
// T is the element type of the input A; the remaining template arguments are
// compile-time integers. The kernel body is defined elsewhere.
// NOTE(review): presumably writes per-row/per-column statistics to rowStats and
// colStats and counts to nnz_count_row -- confirm against the definition.
template<typename T, int THREADS, int ITEMS_PER_THREAD, int TILE_ROWS, int TILE_COLS, int SPARSE_DECOMP> __global__ void kgetColRowStats(T * __restrict__ A, float *rowStats, float *colStats, int * nnz_count_row, float nnz_threshold, int rows, int cols, int tiledRows, int tiledCols);
120
119
// Forward declaration of the kgetRowStats CUDA kernel template.
// T is the element type of the input A; THREADS and SPARSE_DECOMP are
// compile-time integers. The kernel body is defined elsewhere.
// NOTE(review): presumably writes one statistic per row to rowStats -- confirm
// against the definition.
template<typename T, int THREADS, int SPARSE_DECOMP> __global__ void kgetRowStats(T * __restrict__ A, float *rowStats, float threshold, int rows, int cols);
121
120
// Forward declaration of the kInt8VectorQuant CUDA kernel template.
// T is the element type of the input A; out is an int8_t buffer. THREADS and
// SPARSE_DECOMP are compile-time integers. The kernel body is defined elsewhere.
// NOTE(review): presumably quantizes A to int8 using rowStats -- confirm against
// the definition.
template<typename T, int THREADS, int SPARSE_DECOMP> __global__ void kInt8VectorQuant(T * __restrict__ A, int8_t *out, float *rowStats, float threshold, int rows, int cols);
122
121
123
-
// Forward declaration of the kDoubleRowColQuant CUDA kernel template.
// A, rowStats and colStats are const, __restrict__-qualified pointers; the
// remaining pointer arguments are outputs or work buffers whose exact roles are
// not visible from this declaration. The kernel body is defined elsewhere.
template <int THREADS, int ITEMS_PER_THREAD, int TILE_ROWS, int TILE_COLS, int SPARSE_DECOMP> __global__ void kDoubleRowColQuant(half *__restrict__ const A, float *__restrict__ const rowStats, float * __restrict__ const colStats, char *out_col_normed, char *out_row_normed, int *rowidx, int *colidx, half *val, int * __restrict__ nnz_block_ptr, float threshold, int rows, int cols, int tiledCols);
124
-
125
122
// Forward declaration of the kTransformRowToFormat CUDA kernel template.
// A is a const, __restrict__-qualified input pointer and out is the output
// buffer. TRANSPOSE and FORMAT are compile-time integers whose accepted values
// are not visible from this declaration. The kernel body is defined elsewhere.
template <int THREADS, int ITEMS_PER_THREAD, int TILE_ROWS, int TILE_COLS, int TRANSPOSE, int FORMAT> __global__ void kTransformRowToFormat(char *__restrict__ const A, char *out, int rows, int cols, int tiledCols, int outRows, int outCols);
126
123
127
124
// Forward declaration of the kExtractOutliers CUDA kernel template.
// FORMAT is a compile-time integer whose accepted values are not visible from
// this declaration. The kernel body is defined elsewhere.
// NOTE(review): presumably copies the entries of A selected by idx (idx_size
// entries) into out -- confirm against the definition.
template <int FORMAT> __global__ void kExtractOutliers(char *A, int *idx, char *out, int idx_size, int rowsA, int colsA, int tiledRowsA, int tiledColsA);
@@ -562,29 +542,6 @@ void getRowStats(half *A, float *rowStats, float threshold, int rows, int cols,
562
542
CUDA_CHECK_RETURN(cudaPeekAtLastError());
563
543
}
564
544
565
-
voiddoubleRowColQuant(half * A, float *rowStats, float *colStats, char *out_col_normed, char *out_row_normed, int *rowidx, int *colidx, half *val, int *nnz_block_ptr, float threshold, int rows, int cols)
566
-
{
567
-
int threads = 64;
568
-
int items_per_thread = 4;
569
-
int tile_cols = threads*items_per_thread;
570
-
int tile_rows = 16;
571
-
int tiledCols = fill_up_to_nearest_multiple(cols, tile_cols);
572
-
int tiledRows = fill_up_to_nearest_multiple(rows, tile_rows);
0 commit comments