Skip to content

Commit 5892223

Browse files
cleanup: remove unused kernels/C++ code (#1458)
* (chore) Remove unused dotfiles
* cleanup: remove unused kernels/C++ code
1 parent 5b01589 commit 5892223

File tree

2 files changed

+0
-247
lines changed

2 files changed

+0
-247
lines changed

csrc/kernels.cu

Lines changed: 0 additions & 222 deletions
Original file line numberDiff line numberDiff line change
@@ -37,60 +37,6 @@ __device__ float atomicMax(float* address, float val) {
3737
return __int_as_float(old);
3838
}
3939

40-
__device__ float atomicMin(float* address, float val) {
41-
int* address_as_i = reinterpret_cast<int*>(address);
42-
int old = *address_as_i, assumed;
43-
do {
44-
assumed = old;
45-
old = atomicCAS(
46-
reinterpret_cast<int*>(address), assumed,
47-
__float_as_int(fminf(val, __int_as_float(assumed))));
48-
} while (assumed != old);
49-
return __int_as_float(old);
50-
}
51-
52-
__device__ float dDequantizeFP4(unsigned char val, float absmax)
53-
{
54-
float sign = (val & 0b1000) == 8 ? -1.0f : 1.0f;
55-
if((val & 0b0110) == 0)
56-
{
57-
// subnormal
58-
if((val & 0b0001) == 0)
59-
return 0.0f;
60-
else
61-
return sign*0.0625f*absmax;
62-
}
63-
else
64-
{
65-
// normal
66-
float exponent = ((val & 0b0100) == 4 ? 2.0f : 8.0f) + ((val & 0b0010) == 2 ? 0.0f : 2.0f);
67-
float fraction = (val & 0b0001) == 1 ? 1.5f : 1.0f;
68-
69-
return sign*exponent*fraction*absmax;
70-
}
71-
}
72-
73-
__device__ float d2DequantizeFP4(unsigned char val)
74-
{
75-
float sign = (val & 0b1000) == 8 ? -1.0f : 1.0f;
76-
if((val & 0b0110) == 0)
77-
{
78-
// subnormal
79-
if((val & 0b0001) == 0)
80-
return 0.0f;
81-
else
82-
return sign*0.0625f;
83-
}
84-
else
85-
{
86-
// normal
87-
float exponent = ((val & 0b0100) == 4 ? 2.0f : 8.0f) + ((val & 0b0010) == 2 ? 0.0f : 2.0f);
88-
float fraction = (val & 0b0001) == 1 ? 1.5f : 1.0f;
89-
90-
return sign*exponent*fraction;
91-
}
92-
}
93-
9440
__device__ float dDequantizeFP4Tree(unsigned char val, float absmax)
9541
{
9642
float sign = (val & 0b1000) == 8 ? -1.0f : 1.0f;
@@ -167,60 +113,6 @@ __device__ unsigned char dQuantizeFP4(float x)
167113
return 0b0000+sign;
168114
}
169115

170-
__device__ half dhDequantizeNF4(unsigned char val)
171-
{
172-
// the values for this tree was generated by test_normal_map_tree
173-
// in the file tests/test_functional.py
174-
if((val & 0b1000) == 8)
175-
if((val & 0b0100) == 4) // 1
176-
if((val & 0b0010) == 2) // 11
177-
if((val & 0b0001) == 1) // 111
178-
return 1.0f;
179-
else
180-
return 0.7229568362236023f;
181-
else
182-
if((val & 0b0001) == 1) // 110
183-
return 0.5626170039176941f;
184-
else
185-
return 0.44070982933044434f;
186-
else
187-
if((val & 0b0010) == 2) //10
188-
if((val & 0b0001) == 1) // 101
189-
return 0.33791524171829224f;
190-
else
191-
return 0.24611230194568634f;
192-
else
193-
if((val & 0b0001) == 1) // 100
194-
return 0.16093020141124725f;
195-
else
196-
return 0.07958029955625534f;
197-
198-
else
199-
if((val & 0b0100) == 4) // 0
200-
if((val & 0b0010) == 2) //01
201-
if((val & 0b0001) == 1) // 011
202-
return 0.0f;
203-
else
204-
return -0.09105003625154495f;
205-
else
206-
if((val & 0b0001) == 1) // 010
207-
return -0.18477343022823334f;
208-
else
209-
return -0.28444138169288635f;
210-
else
211-
if((val & 0b0010) == 2) //00
212-
if((val & 0b0001) == 1) // 001
213-
return -0.39491748809814453f;
214-
else
215-
return -0.5250730514526367f;
216-
else
217-
if((val & 0b0001) == 1) // 000
218-
return -0.6961928009986877f;
219-
else
220-
return -1.0f;
221-
222-
}
223-
224116
__device__ __forceinline__ float dDequantizeNF4(unsigned char val)
225117
{
226118

@@ -3424,118 +3316,6 @@ template <typename T, int THREADS, int BITS> __global__ void kgemm_4bit_inferenc
34243316

34253317
}
34263318

3427-
3428-
//#define ROWS 2
3429-
//template <typename T, int ITEMS, int THREADS> __global__ void gemm_device(int M, int N, int K, T const* A, T* B, T * out, int lda, int ldb, int ldc)
3430-
//{
3431-
//// 0. We want to fill a 8x128 tile for a thread block so we have 8x16 tile for each warp
3432-
//// 1. Load dataB into register
3433-
//// 2. Dequantize B
3434-
//// 3. Fetch data from A and multiply
3435-
//
3436-
// typedef cub::BlockLoad<T, THREADS , ITEMS, cub::BLOCK_LOAD_WARP_TRANSPOSE> LoadA;
3437-
// //__shared__ typename LoadA::TempStorage loada;
3438-
// typedef cub::BlockLoad<T, THREADS , ITEMS, cub::BLOCK_LOAD_WARP_TRANSPOSE> LoadB;
3439-
// //__shared__ typename LoadB::TempStorage loadb;
3440-
// typedef cub::BlockReduce<T, THREADS> BlockReduce;
3441-
// // Allocate shared memory for BlockReduce
3442-
// //__shared__ typename BlockReduce::TempStorage reduce;
3443-
//
3444-
// __shared__ union {
3445-
// typename BlockReduce::TempStorage reduce;
3446-
// typename LoadB::TempStorage loadb;
3447-
// typename LoadA::TempStorage loada;
3448-
// } temp_storage;
3449-
//
3450-
//
3451-
// T dataA[ITEMS];
3452-
// T local_B[ITEMS];
3453-
// T local_accC[ROWS];
3454-
// int valid_items = 0;
3455-
// const int col_offset = blockIdx.x * 8;
3456-
//
3457-
// __shared__ T tileA[ROWS*THREADS*ITEMS];
3458-
// __shared__ T accumulatorC[ROWS*8];
3459-
//
3460-
// //#pragma unroll 8
3461-
// //for(int i = 0; i < 8; i++)
3462-
// // tileA[threadIdx.x + (i*256)] = 0.0f;
3463-
// //__syncthreads();
3464-
// if(threadIdx.x < 64)
3465-
// accumulatorC[threadIdx.x] = 0.0f;
3466-
// __syncthreads();
3467-
//
3468-
//
3469-
// for(int inner_idx = 0; inner_idx < K; inner_idx+= THREADS*ITEMS)
3470-
// {
3471-
// valid_items = K - inner_idx > THREADS*ITEMS ? THREADS*ITEMS : K - inner_idx;
3472-
// int baserow = 0;
3473-
// for(int row = baserow; row < (baserow+ROWS) && row < N; row++)
3474-
// {
3475-
// LoadA(temp_storage.loada).Load(&(A[(row*K) + inner_idx]), dataA, valid_items, 0.0f);
3476-
//
3477-
// #pragma unroll ITEMS
3478-
// for(int k = 0; k < ITEMS; k++)
3479-
// tileA[row*THREADS*ITEMS + threadIdx.x + (k*THREADS)] = dataA[k];
3480-
//
3481-
// __syncthreads();
3482-
// }
3483-
// baserow += ROWS;
3484-
//
3485-
// // load 16 columns from B at a time. B is transposed, so its like loading rows
3486-
// // each warp loads one row
3487-
// // each thread loads 128 byte
3488-
//
3489-
// // col: inner_idx + warp_lane
3490-
// // row: ldb*(offset + warp_id)
3491-
// for(int col = 0; col < 8 && (col_offset + col) < M; col++)
3492-
// {
3493-
// int colB = col_offset + col;
3494-
//
3495-
// for(int k = 0; k < ROWS; k++)
3496-
// local_accC[k] = 0.0f;
3497-
//
3498-
// int base_idxB = ldb*colB;
3499-
// valid_items = K - inner_idx > THREADS*ITEMS ? THREADS*ITEMS : K - inner_idx;
3500-
// LoadB(temp_storage.loadb).Load(&(B[base_idxB + inner_idx]), local_B, valid_items, 0.0f);
3501-
// __syncthreads();
3502-
//
3503-
// for(int row = 0; row < ROWS && row < N; row++)
3504-
// {
3505-
// #pragma unroll ITEMS
3506-
// for(int k = 0; k < ITEMS; k++)
3507-
// {
3508-
// int idxA = row*THREADS*ITEMS + threadIdx.x + (THREADS*k);
3509-
// local_accC[row] += tileA[idxA]*local_B[k];
3510-
// }
3511-
//
3512-
// local_accC[row] = BlockReduce(temp_storage.reduce).Reduce(local_accC[row], cub::Sum());
3513-
// if(threadIdx.x == 0)
3514-
// atomicAdd(&accumulatorC[row*8 + col], local_accC[row]);
3515-
// }
3516-
// }
3517-
// }
3518-
//
3519-
// for(int row = 0; row < ROWS && row < N; row++)
3520-
// {
3521-
// int out_idx = ldc*row + col_offset;
3522-
//
3523-
// //if(threadIdx.x < 8)
3524-
// // if(accumulatorC[row*8 + threadIdx.x] != 0.0)
3525-
// // printf("%i %i %i %i %f idx %i %i %i\n", row, col_offset, threadIdx.x, N, accumulatorC[row*8 + threadIdx.x], ldc, out_idx, blockIdx.x);
3526-
//
3527-
// if(threadIdx.x < 8 && (col_offset + threadIdx.x) < M)
3528-
// {
3529-
// //printf("%i %i %i %i %f idx %i %i\n", row, col_offset, threadIdx.x, N, accumulatorC[row*8 + threadIdx.x], ldc, out_idx);
3530-
// out[out_idx + threadIdx.x] = accumulatorC[row*8 + threadIdx.x];
3531-
// }
3532-
// }
3533-
//
3534-
//
3535-
//
3536-
//}
3537-
3538-
35393319
template <typename T, int FUNC> __global__ void kfunc(T *A, T *B, T value, long n)
35403320
{
35413321
for(long i = (blockDim.x*blockIdx.x) + threadIdx.x; i < n; i+=(blockDim.x*gridDim.x))
@@ -3756,8 +3536,6 @@ MAKE_optimizerStatic8bit2State(ADAM, float)
37563536

37573537
template __global__ void kPercentileClipping<float, 2048, 4>(float * __restrict__ g, float *gnorm_vec, int step, const int n);
37583538
template __global__ void kPercentileClipping<half, 2048, 4>(half * __restrict__ g, float *gnorm_vec, int step, const int n);
3759-
// template __global__ void kPercentileClipping<float, 128, 4>(float * __restrict__ g, float *gnorm_vec, int step, const int n);
3760-
// template __global__ void kPercentileClipping<half, 128, 4>(half * __restrict__ g, float *gnorm_vec, int step, const int n);
37613539

37623540
#define MAKE_kQuantizeBlockwise(dtype, blocksize, num_per_thread, stochastic, data_type_name) \
37633541
template __global__ void kQuantizeBlockwise<dtype, blocksize, num_per_thread, stochastic, data_type_name>(float * code, dtype * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n); \

csrc/ops.cu

Lines changed: 0 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -674,43 +674,18 @@ template <typename T> void gemm_host(int m, int n, int k, T * A, T* B, T * out
674674

675675
int num_blocks = (m+31)/32;
676676

677-
//cout << num_blocks << endl;
678-
//cout << lda << endl;
679-
//cout << ldb << endl;
680-
//cout << ldc << endl;
681-
682-
//cout << m << endl;
683-
//cout << n << endl;
684-
//cout << k << endl;
685677
if(bits == 32)
686-
//gemm_device<T, 32, 128><<< num_blocks, 128, 0, 0 >>>(m, n, k, A, B, out, lda, ldb, ldc);
687678
gemm_device<T, 32, 32><<< num_blocks, 32, 0, 0 >>>(m, n, k, A, B, out, lda, ldb, ldc);
688679
if(bits == 16)
689-
//gemm_device<T, 16, 256><<< num_blocks, 256, 0, 0 >>>(m, n, k, A, B, out, lda, ldb, ldc);
690680
gemm_device<T, 16, 160><<< num_blocks, 160, 0, 0 >>>(m, n, k, A, B, out, lda, ldb, ldc);
691-
//gemm_device<T, 16, 128><<< num_blocks, 128, 0, 0 >>>(m, n, k, A, B, out, lda, ldb, ldc);
692-
//gemm_device<T, 16, 96><<< num_blocks, 96, 0, 0 >>>(m, n, k, A, B, out, lda, ldb, ldc);
693-
//gemm_device<T, 16, 32><<< num_blocks, 32, 0, 0 >>>(m, n, k, A, B, out, lda, ldb, ldc);
694-
//gemm_device<T, 16, 64><<< num_blocks, 64, 0, 0 >>>(m, n, k, A, B, out, lda, ldb, ldc);
695681
}
696682

697683
template <typename T> void gemm_4bit_inference(int m, int n, int k, T * A, unsigned char* B, float *absmax, T * out, int lda, int ldb, int ldc, int blocksize)
698684
{
699685

700686
int num_blocks = (m+31)/32;
701687

702-
//cout << num_blocks << endl;
703-
//cout << lda << endl;
704-
//cout << ldb << endl;
705-
//cout << ldc << endl;
706-
707-
//cout << m << endl;
708-
//cout << n << endl;
709-
//cout << k << endl;
710688
kgemm_4bit_inference<T, 96><<< num_blocks, 96, 0, 0 >>>(m, n, k, A, B, absmax, out, lda, ldb, ldc, blocksize);
711-
//kgemm_4bit_inference<T, 256><<< num_blocks, 256, 0, 0 >>>(m, n, k, A, B, absmax, out, lda, ldb, ldc, blocksize);
712-
//kgemm_4bit_inference<T, 160><<< num_blocks, 160, 0, 0 >>>(m, n, k, A, B, absmax, out, lda, ldb, ldc, blocksize);
713-
//kgemm_4bit_inference<T, 32><<< num_blocks, 32, 0, 0 >>>(m, n, k, A, B, absmax, out, lda, ldb, ldc, blocksize);
714689
}
715690

716691
template <typename T, int BITS> void gemm_4bit_inference_naive(int m, int n, int k, T * A, unsigned char* B, float *absmax, float *datatype, T * out, int lda, int ldb, int ldc, int blocksize, cudaStream_t stream)

0 commit comments

Comments (0)