Skip to content

Commit ac5550a

Browse files
committed
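Added changes for deployment: dropped the sm_72 build target from the Makefile, removed the CUDA 10.2 build step from deploy.sh, and wrapped the bodies of the gemm_device and kgemm_4bit_inference kernels in `#if __CUDA_ARCH__ >= 750`.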
1 parent 0f40fa3 commit ac5550a

File tree

3 files changed

+7
-15
lines changed

3 files changed

+7
-15
lines changed

Makefile

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,6 @@ COMPUTE_CAPABILITY += -gencode arch=compute_52,code=sm_52 # Maxwell
3333
COMPUTE_CAPABILITY += -gencode arch=compute_60,code=sm_60 # Pascal
3434
COMPUTE_CAPABILITY += -gencode arch=compute_61,code=sm_61 # Pascal
3535
COMPUTE_CAPABILITY += -gencode arch=compute_70,code=sm_70 # Volta
36-
COMPUTE_CAPABILITY += -gencode arch=compute_72,code=sm_72 # Volta
3736

3837
CC_KEPLER := -gencode arch=compute_35,code=sm_35 # Kepler
3938
CC_KEPLER += -gencode arch=compute_37,code=sm_37 # Kepler

csrc/kernels.cu

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,15 +16,12 @@
1616
#include <thrust/device_vector.h>
1717
#include <mma.h>
1818

19-
#include <cooperative_groups/memcpy_async.h>
20-
#include <cuda/pipeline>
2119

2220
#define HLF_MAX 65504
2321
#define TH 1024
2422
#define NUM 4
2523
#define NUM_BLOCK 4096
2624

27-
using namespace nvcuda;
2825

2926
// source: https://stackoverflow.com/questions/17399119/how-do-i-use-atomicmax-on-floating-point-values-in-cuda
3027
__device__ float atomicMax(float* address, float val) {
@@ -3094,6 +3091,9 @@ template <typename T, typename TCAST, int ITEMS> __device__ inline void vector_l
30943091
#define WARPS 5
30953092
template <typename T, int BITS, int THREADS> __global__ void gemm_device(int M, int N, int K, T * __restrict__ const A, T* B, T * out, int lda, int ldb, int ldc)
30963093
{
3094+
3095+
#if __CUDA_ARCH__ >= 750
3096+
using namespace nvcuda;
30973097
int col_offset = blockIdx.x *32;
30983098
const int warp_id = threadIdx.x / 32;
30993099
const int half_warp_id = threadIdx.x / 16;
@@ -3294,11 +3294,14 @@ template <typename T, int BITS, int THREADS> __global__ void gemm_device(int M,
32943294

32953295
if(col_offset + warp_lane < M)
32963296
out[col_offset + warp_lane] = smem_A[warp_lane];
3297+
#endif
32973298
}
32983299

32993300
template <typename T, int THREADS> __global__ void kgemm_4bit_inference(int M, int N, int K, T * __restrict__ const A, unsigned char *B, float *absmax, T * out, int lda, int ldb, int ldc, int blocksize)
33003301
{
33013302

3303+
#if __CUDA_ARCH__ >= 750
3304+
using namespace nvcuda;
33023305
int col_offset = blockIdx.x *32;
33033306
const int warp_id = threadIdx.x / 32;
33043307
const int half_warp_id = threadIdx.x / 16;
@@ -3459,6 +3462,7 @@ template <typename T, int THREADS> __global__ void kgemm_4bit_inference(int M, i
34593462

34603463
if(col_offset + warp_lane < M)
34613464
out[col_offset + warp_lane] = smem_A[warp_lane];
3465+
#endif
34623466
}
34633467

34643468
//#define ROWS 2

deploy.sh

Lines changed: 0 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -139,17 +139,6 @@ if [ ! -f "./bitsandbytes/libbitsandbytes_cuda121.so" ]; then
139139
fi
140140

141141

142-
make clean
143-
export CUDA_HOME=$BASE_PATH/cuda-10.2
144-
make cuda10x_nomatmul CUDA_VERSION=102
145-
146-
if [ ! -f "./bitsandbytes/libbitsandbytes_cuda102_nocublaslt.so" ]; then
147-
# Control will enter here if $DIRECTORY doesn't exist.
148-
echo "Compilation unsuccessul!" 1>&2
149-
exit 64
150-
fi
151-
152-
153142
make clean
154143
export CUDA_HOME=$BASE_PATH/cuda-11.0
155144
make cuda110_nomatmul CUDA_VERSION=110

0 commit comments

Comments
 (0)