
Commit 1083fd0

njhill, Narsil, cyang49, and tjohnson31415 committed

Support for serving GPTQ quantized models

Adapted from corresponding changes to HF TGI (pre license-change)

Co-authored-by: Nicolas Patry <[email protected]>
Co-authored-by: Jamie Yang <[email protected]>
Co-authored-by: Travis Johnson <[email protected]>
1 parent a8926f6 commit 1083fd0


41 files changed (+3,488 / -84 lines)

Dockerfile

Lines changed: 14 additions & 1 deletion
@@ -213,6 +213,16 @@ FROM python-builder as build
 COPY server/custom_kernels/ /usr/src/.
 RUN cd /usr/src && python setup.py build_ext && python setup.py install

+
+## Build transformers exllama kernels ##########################################
+FROM python-builder as exllama-kernels-builder
+
+WORKDIR /usr/src
+
+COPY server/exllama_kernels/ .
+RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" python setup.py build
+
+
 ## Flash attention cached build image ##########################################
 FROM base as flash-att-cache
 COPY --from=flash-att-builder /usr/src/flash-attention/build /usr/src/flash-attention/build

@@ -249,10 +259,13 @@ COPY --from=flash-att-cache /usr/src/flash-attention/csrc/rotary/build/lib.linux
 # Copy build artifacts from flash attention v2 builder
 COPY --from=flash-att-v2-cache /usr/src/flash-attention-v2/build/lib.linux-x86_64-cpython-* ${SITE_PACKAGES}

+# Copy build artifacts from exllama kernels builder
+COPY --from=exllama-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-* ${SITE_PACKAGES}
+
 # Install server
 COPY proto proto
 COPY server server
-RUN cd server && make gen-server && pip install ".[accelerate, onnx-gpu]" --no-cache-dir
+RUN cd server && make gen-server && pip install ".[accelerate, onnx-gpu, quantize]" --no-cache-dir

 # Patch codegen model changes into transformers 4.34.0
 RUN cp server/transformers_patch/modeling_codegen.py ${SITE_PACKAGES}/transformers/models/codegen/modeling_codegen.py

Makefile

Lines changed: 2 additions & 0 deletions
@@ -56,5 +56,7 @@ python-tests: build-test-image
 		-e HUGGINGFACE_HUB_CACHE=/transformers_cache \
 		-e TRANSFORMERS_CACHE=/transformers_cache cpu-tests:0 pytest -sv --ignore=server/tests/test_utils.py server/tests

+clean:
+	rm -rf target

 .PHONY: build build-test-image integration-tests python-tests

launcher/src/main.rs

Lines changed: 9 additions & 0 deletions
@@ -33,6 +33,8 @@ struct Args {
     dtype: Option<String>,
     #[clap(default_value = None, long, env)]
     dtype_str: Option<String>,
+    #[clap(default_value = None, long, env)]
+    quantize: Option<String>,
     #[clap(long, env)]
     num_shard: Option<usize>,
     #[clap(default_value = "96", long, env)]

@@ -156,6 +158,7 @@ fn main() -> ExitCode {
         args.revision,
         args.deployment_framework,
         args.dtype.or(args.dtype_str),
+        args.quantize,
         args.max_sequence_length,
         args.max_new_tokens,
         args.max_batch_size,

@@ -396,6 +399,7 @@ fn shard_manager(
     revision: Option<String>,
     deployment_framework: String,
     dtype: Option<String>,
+    quantize: Option<String>,
     max_sequence_length: usize,
     max_new_tokens: usize,
     max_batch_size: usize,

@@ -442,6 +446,11 @@
         shard_argv.push(dtype);
     }

+    if let Some(quantize) = quantize {
+        shard_argv.push("--quantize".to_string());
+        shard_argv.push(quantize);
+    }
+
     // Activate tensor parallelism
     if world_size > 1 {
         shard_argv.push("--sharded".to_string());

cuda_buffers.cu (new file)

Lines changed: 71 additions & 0 deletions
// Adapted from turboderp exllama: https://github.com/turboderp/exllama

#define _cuda_buffers_cu
#include "cuda_buffers.cuh"

CudaBuffers* g_buffers[CUDA_MAX_DEVICES] = {NULL};
// __constant__ half2 q4_table[16][256];
// half2 q4_table_host[16][256];
// bool q4_table_init = false;

CudaBuffers::CudaBuffers
(
    int _device,
    half* _temp_state,
    half* _temp_dq
) :
    device(_device),
    temp_state(_temp_state),
    temp_dq(_temp_dq)
{
    cudaSetDevice(_device);

    cudaStreamCreate(&alt_stream_1);
    cudaStreamCreate(&alt_stream_2);
    cudaStreamCreate(&alt_stream_3);
    cudaEventCreate(&alt_stream_1_done);
    cudaEventCreate(&alt_stream_2_done);
    cudaEventCreate(&alt_stream_3_done);
}

CudaBuffers::~CudaBuffers()
{
    cudaStreamDestroy(alt_stream_1);
    cudaStreamDestroy(alt_stream_2);
    cudaStreamDestroy(alt_stream_3);
    cudaEventDestroy(alt_stream_1_done);
    cudaEventDestroy(alt_stream_2_done);
    cudaEventDestroy(alt_stream_3_done);
}

CudaBuffers* get_buffers(const int device_index)
{
    return g_buffers[device_index];
}

void prepare_buffers_cuda
(
    int _device,
    half* _temp_state,
    half* _temp_dq
)
{
    CudaBuffers* buffers = new CudaBuffers
    (
        _device,
        _temp_state,
        _temp_dq
    );

    g_buffers[_device] = buffers;
}

void cleanup_buffers_cuda()
{
    for (int i = 0; i < CUDA_MAX_DEVICES; i++)
    {
        if (!g_buffers[i]) continue;
        delete g_buffers[i];
        g_buffers[i] = NULL;
    }
}
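
For orientation only (this is not part of the commit), here is a minimal host-side sketch of how the lifecycle above might be driven: prepare_buffers_cuda registers pre-allocated scratch buffers and creates the alternate streams and events for a device, get_buffers looks them up, and cleanup_buffers_cuda destroys the streams and events without freeing the caller's memory. The buffer sizes and the single-device setup below are placeholder assumptions.

// Illustrative sketch, not part of the commit: exercising the cuda_buffers interface.
#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include "cuda_buffers.cuh"

int main()
{
    const int device = 0;
    const size_t temp_state_elems = 4096 * 4096;  // placeholder for max_hidden_rows * intermediate_size
    const size_t temp_dq_elems    = 4096 * 4096;  // placeholder for largest quant tensor * 8

    half* temp_state;
    half* temp_dq;
    cudaSetDevice(device);
    cudaMalloc(&temp_state, temp_state_elems * sizeof(half));
    cudaMalloc(&temp_dq, temp_dq_elems * sizeof(half));

    // Register the scratch buffers and create the alternate streams/events for
    // this device; kernels elsewhere look them up through get_buffers().
    prepare_buffers_cuda(device, temp_state, temp_dq);

    CudaBuffers* buffers = get_buffers(device);
    (void)buffers;  // the quantized matmul kernels would consume buffers->temp_state / temp_dq here

    // Destroys the streams/events only; the caller still owns the cudaMalloc'd memory.
    cleanup_buffers_cuda();
    cudaFree(temp_state);
    cudaFree(temp_dq);
    return 0;
}

In the extension these calls are expected to be reached through the Python bindings rather than a standalone main(), so treat this purely as an illustration of the C++ interface.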

cuda_buffers.cuh (new file)

Lines changed: 52 additions & 0 deletions
// Adapted from turboderp exllama: https://github.com/turboderp/exllama

#ifndef _cuda_buffers_cuh
#define _cuda_buffers_cuh

#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include <cstdint>
#include <cstdio>

const int CUDA_MAX_DEVICES = 16;

// #ifndef _cuda_buffers_cu
// extern __constant__ half2 q4_table[16][256];
// #endif

class CudaBuffers
{
public:
    int device;

    half* temp_state;   // [max_hidden_rows * intermediate_size]
    half* temp_dq;      // size of largest quant tensor * 8

    cudaStream_t alt_stream_1;
    cudaStream_t alt_stream_2;
    cudaStream_t alt_stream_3;
    cudaEvent_t alt_stream_1_done;
    cudaEvent_t alt_stream_2_done;
    cudaEvent_t alt_stream_3_done;

    CudaBuffers
    (
        int _device,
        half* _temp_state,
        half* _temp_dq
    );
    ~CudaBuffers();
};

CudaBuffers* get_buffers(const int device_index);

void prepare_buffers_cuda
(
    int _device,
    half* _temp_state,
    half* _temp_dq
);

void cleanup_buffers_cuda();

#endif

cuda_compat.cuh (new file)

Lines changed: 58 additions & 0 deletions
// Adapted from turboderp exllama: https://github.com/turboderp/exllama

#ifndef _cuda_compat_cuh
#define _cuda_compat_cuh

// atomicAdd for half types, to support CC < 7.x

__device__ __forceinline__ void atomicAdd_half(half* address, half val)
{
    unsigned int * address_as_ui = (unsigned int *) ((char *)address - ((size_t)address & 2));
    unsigned int old = *address_as_ui;
    unsigned int assumed;

    do
    {
        assumed = old;
        __half_raw hsum;
        hsum.x = (size_t)address & 2 ? (old >> 16) : (old & 0xffff);
        half tmpres = __hadd(hsum, val);
        hsum = __half_raw(tmpres);
        old = (size_t)address & 2 ? (old & 0xffff) | (hsum.x << 16) : (old & 0xffff0000) | hsum.x;
        old = atomicCAS(address_as_ui, assumed, old);
    }
    while (assumed != old);
}

// atomicAdd for half2 types

__device__ __forceinline__ void atomicAdd_half2(half2* address, half2 val)
{
    unsigned int* address_as_ui = (unsigned int*)address;
    unsigned int old = *address_as_ui;
    unsigned int assumed;
    do
    {
        assumed = old;
        half2 old_val = *((half2*)&old);
        half2 new_val = __hadd2(old_val, val);
        old = atomicCAS(address_as_ui, assumed, *((unsigned int*)&new_val));
    }
    while (assumed != old);
}

//

#if defined(__CUDA_ARCH__)
#if __CUDA_ARCH__ < 700

__device__ __forceinline__ void atomicAdd(half* address, half val) { atomicAdd_half(address, val); }

#if __CUDA_ARCH__ < 600
__device__ __forceinline__ void atomicAdd(half2* address, half2 val) { atomicAdd_half2(address, val); }
#endif

#endif
#endif

#endif
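
As a hedged illustration of what this compatibility header buys (the kernel below is a made-up example, not code from the extension): a kernel that accumulates half values with atomicAdd compiles unchanged for older GPUs, because the overloads above supply an atomicCAS-based fallback when the hardware half atomic is unavailable.

// Illustrative sketch, not part of the commit: half accumulation that works across architectures.
#include <cuda_fp16.h>
#include "cuda_compat.cuh"

__global__ void accumulate_half(const half* __restrict__ in, half* __restrict__ out, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
    {
        // On compute capability 7.0+ this resolves to CUDA's native half atomicAdd;
        // below that, it resolves to the atomicCAS-based atomicAdd_half defined above.
        atomicAdd(out, in[i]);
    }
}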

column_remap.cu (new file)

Lines changed: 61 additions & 0 deletions
// Adapted from turboderp exllama: https://github.com/turboderp/exllama

#include "column_remap.cuh"
#include "../util.cuh"

const int SHUF_BLOCKSIZE_X = 256;
const int SHUF_BLOCKSIZE_Y = 16;

__global__ void column_remap_kernel
(
    const half* __restrict__ x,
    half* __restrict__ x_new,
    const int x_width,
    const int x_height,
    const uint32_t* x_map
)
{
    int x_column = SHUF_BLOCKSIZE_X * blockIdx.x + threadIdx.x;
    int x_row = SHUF_BLOCKSIZE_Y * blockIdx.y;

    int x_stride = x_width;
    int x_idx = x_row * x_stride + x_column;

    int x_row_end = min(x_row + SHUF_BLOCKSIZE_Y, x_height);
    int x_idx_end = x_row_end * x_stride + x_column;

    int s_column = x_map[x_column];
    int s_idx = x_row * x_stride + s_column;

    while (x_idx < x_idx_end)
    {
        x_new[x_idx] = x[s_idx];
        x_idx += x_stride;
        s_idx += x_stride;
    }
}

// Remap columns in x to correspond to sequential group index before matmul
//
// perform x -> seq_x such that seq_x @ seq_w == x @ w

void column_remap_cuda
(
    const half* x,
    half* x_new,
    const int x_height,
    const int x_width,
    const uint32_t* x_map
)
{
    dim3 threads(SHUF_BLOCKSIZE_X, 1, 1);

    dim3 blocks
    (
        (x_width + SHUF_BLOCKSIZE_X - 1) / SHUF_BLOCKSIZE_X,
        (x_height + SHUF_BLOCKSIZE_Y - 1) / SHUF_BLOCKSIZE_Y,
        1
    );

    column_remap_kernel<<<blocks, threads>>>(x, x_new, x_width, x_height, x_map);
}
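
As a rough usage sketch (not part of the commit), the host-side entry point can be exercised as below. The shapes, the identity x_map, and the suggestion that the map comes from the quantized weight's group ordering are illustrative assumptions; the kernel itself simply gathers columns so that x_new[:, c] = x[:, x_map[c]], which is what lets seq_x @ seq_w == x @ w hold once the weight is stored in the matching permuted order.

// Illustrative sketch, not part of the commit: calling column_remap_cuda.
#include <cstdint>
#include <vector>
#include <numeric>
#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include "column_remap.cuh"

int main()
{
    const int x_height = 8;      // placeholder row count (batch * sequence)
    const int x_width  = 4096;   // placeholder hidden size; a multiple of SHUF_BLOCKSIZE_X,
                                 // since the kernel has no column bounds check

    // x_map[new_column] = source_column; identity here as a stand-in for the
    // permutation derived from the quantized weight's group ordering.
    std::vector<uint32_t> h_map(x_width);
    std::iota(h_map.begin(), h_map.end(), 0u);

    half *d_x, *d_x_new;
    uint32_t* d_map;
    cudaMalloc(&d_x, (size_t)x_height * x_width * sizeof(half));
    cudaMalloc(&d_x_new, (size_t)x_height * x_width * sizeof(half));
    cudaMalloc(&d_map, x_width * sizeof(uint32_t));
    cudaMemcpy(d_map, h_map.data(), x_width * sizeof(uint32_t), cudaMemcpyHostToDevice);

    // ... fill d_x with activations ...

    // Gather: x_new[:, c] = x[:, x_map[c]] for every column c.
    column_remap_cuda(d_x, d_x_new, x_height, x_width, d_map);
    cudaDeviceSynchronize();

    cudaFree(d_x);
    cudaFree(d_x_new);
    cudaFree(d_map);
    return 0;
}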

column_remap.cuh (new file)

Lines changed: 19 additions & 0 deletions
// Adapted from turboderp exllama: https://github.com/turboderp/exllama

#ifndef _column_remap_cuh
#define _column_remap_cuh

#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include <cstdint>

void column_remap_cuda
(
    const half* x,
    half* x_new,
    const int x_height,
    const int x_width,
    const uint32_t* x_map
);

#endif
