Skip to content

Commit 6c670dd

Browse files
cyang49 and njhill committed
Adding exllamav2 support for GPTQ models
This PR adds exllamav2 kernels. The added changes are adapted from two open source repositories:
- https://github.com/turboderp/exllamav2
- https://github.com/PanQiWei/AutoGPTQ

Co-authored-by: Nick Hill <[email protected]>
1 parent 27e0952 commit 6c670dd

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

47 files changed

+6574
-28
lines changed

Dockerfile

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -222,6 +222,13 @@ WORKDIR /usr/src
222222
COPY server/exllama_kernels/ .
223223
RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" python setup.py build
224224

225+
## Build transformers exllamav2 kernels ########################################
226+
FROM python-builder as exllamav2-kernels-builder
227+
228+
WORKDIR /usr/src
229+
230+
COPY server/exllamav2_kernels/ .
231+
RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" python setup.py build
225232

226233
## Flash attention cached build image ##########################################
227234
FROM base as flash-att-cache
@@ -262,6 +269,9 @@ COPY --from=flash-att-v2-cache /usr/src/flash-attention-v2/build/lib.linux-x86_6
262269
# Copy build artifacts from exllama kernels builder
263270
COPY --from=exllama-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-* ${SITE_PACKAGES}
264271

272+
# Copy build artifacts from exllamav2 kernels builder
273+
COPY --from=exllamav2-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-* ${SITE_PACKAGES}
274+
265275
# Install server
266276
COPY proto proto
267277
COPY server server
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
// Build-time configuration for the exllamav2 CUDA kernels.
//
// NOTE(review): renamed the include guard — `_config_h` begins with an
// underscore at global scope, which is reserved to the implementation by the
// C/C++ standards ([lex.name]).
#ifndef EXLLAMAV2_CONFIG_H
#define EXLLAMAV2_CONFIG_H

// Row-count threshold used by the quantized GEMM dispatch — presumably inputs
// with more rows take a different (dequantize + cuBLAS) path. TODO confirm
// against the kernel dispatch code.
#define MAX_Q_GEMM_ROWS 50

// Per-bit-width quantization mode selectors (kernel variant toggles, adapted
// from upstream exllamav2 — confirm meaning against the packing kernels).
#define QMODE_2BIT 1
#define QMODE_3BIT 1
#define QMODE_4BIT 1
#define QMODE_5BIT 1
#define QMODE_6BIT 0
#define QMODE_8BIT 0

#endif // EXLLAMAV2_CONFIG_H
Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
#include "quantize_func.h"
2+
#include "../cuda/quantize.cuh"
3+
4+
// Quantizes rows [a, b) of `weights` in place, one row at a time, and
// propagates the quantization error of each row into the rows that have not
// been quantized yet (GPTQ-style error compensation driven by the inverse
// Hessian). Finally, the accumulated error of the whole block [a, b) is
// applied to all rows from b onward.
//
// quant       : out — (de)quantized weight values, same layout as `weights`
// scale       : quantization scales passed straight to the CUDA kernel
// out_q       : out — integer quantized values (uint16 storage); pass a meta
//               tensor to skip writing them
// qzero, maxq : quantization zero point and maximum quantized value
// hessian_inv : inverse Hessian; only size(1) and row slices are used here
// weights     : in/out — updated in place with propagated error
// error       : out — per-row quantization error, filled by the CUDA kernel
// a, b        : row range to quantize, [a, b)
void quantize_range
(
    torch::Tensor quant,
    torch::Tensor scale,
    torch::Tensor out_q,
    float qzero,
    float maxq,
    torch::Tensor hessian_inv,
    torch::Tensor weights,
    torch::Tensor error,
    int a,
    int b
)
{
    int columns = weights.size(1);
    int hcolumns = hessian_inv.size(1);

    for (int c = a; c < b; c++)
    {
        // Quantize the single row c (rows = 1, cols = columns). A meta
        // `out_q` has no storage, so NULL tells the kernel not to write the
        // integer outputs.
        quantize_cuda
        (
            ((const float*) weights.data_ptr()) + c * columns,
            ((float*) quant.data_ptr()) + c * columns,
            (const float*) scale.data_ptr(),
            out_q.device().is_meta() ? NULL : ((uint16_t*) out_q.data_ptr()) + c * columns,
            1,
            columns,
            qzero,
            maxq
        );

        // Fill error row c from weights[c] and quant[c]; presumably
        // error[c] = (weights[c] - quant[c]) / hessian_inv[c, c] — the exact
        // formula lives in ../cuda/quantize.cuh, TODO confirm.
        adjust_error_row_cuda
        (
            (const float*) hessian_inv.data_ptr(),
            (float*) error.data_ptr(),
            (const float*) weights.data_ptr(),
            (const float*) quant.data_ptr(),
            c,
            columns,
            hcolumns
        );

        // Update the b - c not-yet-finalized rows starting at row c:
        // pointers are hessian_inv[c, c], error[c] and weights[c], so this
        // presumably computes weights[c:b] -= hessian_inv[c, c:b] (outer)
        // error[c] — verify against the vv_mul_sub kernel.
        vv_mul_sub_cuda
        (
            ((const float*) hessian_inv.data_ptr()) + c * hcolumns + c,
            ((const float*) error.data_ptr()) + c * columns,
            ((float*) weights.data_ptr()) + c * columns,
            b - c,
            columns
        );
    }

    // Propagate the block's accumulated error to every row past b:
    //   weights[b:] -= hessian_inv[a:b, b:].T @ error[a:b]
    // (addmm_ with beta = 1.0f keeps the existing values, alpha = -1.0f
    // subtracts the matrix product).
    torch::Tensor x = hessian_inv.slice(0, a, b).slice(1, b).transpose(0, 1);
    torch::Tensor y = error.slice(0, a, b);
    weights.slice(0, b).addmm_(x, y, 1.0f, -1.0f);
}
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
// Host-side entry point for GPTQ-style block quantization, built on the CUDA
// kernels in ../cuda/quantize.cuh.
//
// NOTE(review): renamed the include guard — `_quantize_func_h` begins with an
// underscore at global scope, which is reserved to the implementation by the
// C/C++ standards ([lex.name]).
#ifndef QUANTIZE_FUNC_H
#define QUANTIZE_FUNC_H

#include <torch/extension.h>
#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include <ATen/cuda/CUDAContext.h>
#include <cstdint>
#include <cstdio>

// Quantizes rows [a, b) of `weights` in place, writing the (de)quantized
// values to `quant` and the integer codes to `out_q` (pass a meta tensor to
// skip them), and propagates the per-row quantization error — stored in
// `error` — to the remaining rows via `hessian_inv`. `qzero`/`maxq` are the
// quantization zero point and maximum quantized value.
void quantize_range
(
    torch::Tensor quant,
    torch::Tensor scale,
    torch::Tensor out_q,
    float qzero,
    float maxq,
    torch::Tensor hessian_inv,
    torch::Tensor weights,
    torch::Tensor error,
    int a,
    int b
);

#endif // QUANTIZE_FUNC_H

0 commit comments

Comments
 (0)