Commit c51ce60 (parent: 2be858a)

gptqmodel has renamed its compiled extension packages: exllama_kernels is now gptqmodel_exllama_kernels, and exllamav2_kernels is now gptqmodel_exllamav2_kernels. Update all references accordingly.

4 files changed: +11 / -9 lines
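Because only the importable module names changed, an environment pinned to an older gptqmodel still ships the pre-rename names. A minimal compatibility shim for code that must span both versions — this helper is illustrative and is not part of the commit:

# Illustrative shim, not part of this commit: prefer the renamed packages
# and fall back to the legacy names for older gptqmodel installs.
import importlib

def _first_importable(new_name: str, old_name: str):
    """Return the renamed module if present, else the legacy one."""
    try:
        return importlib.import_module(new_name)
    except ImportError:
        return importlib.import_module(old_name)

exllama_kernels = _first_importable(
    "gptqmodel_exllama_kernels", "exllama_kernels"
)
exllamav2_kernels = _first_importable(
    "gptqmodel_exllamav2_kernels", "exllamav2_kernels"
)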

.pylintrc

Lines changed: 2 additions & 2 deletions

@@ -64,8 +64,8 @@ ignore-patterns=^\.#
 # and thus existing member attributes cannot be deduced by static analysis). It
 # supports qualified module names, as well as Unix pattern matching.
 ignored-modules=gptqmodel,
-                exllama_kernels,
-                exllamav2_kernels,
+                gptqmodel_exllama_kernels,
+                gptqmodel_exllamav2_kernels,
                 llmcompressor,
                 cutlass_mm,
                 pygraphviz,

fms_mo/custom_ext_kernels/utils.py

Lines changed: 6 additions & 4 deletions

@@ -529,8 +529,8 @@ def exllama_ops_load_and_reg(qcfg=None, run_unit_test=False):
         return

     # Third Party
-    import exllama_kernels
-    import exllamav2_kernels
+    import gptqmodel_exllama_kernels
+    import gptqmodel_exllamav2_kernels

     # Register op
     @reg_op(f"{namespace}::exv1_i4f16")
@@ -547,7 +547,7 @@ def exv1_i4f16_impl(x, q4, q4_width):
             (x.shape[0], q4_width), dtype=torch.float16, device=x.device
         )

-        exllama_kernels.q4_matmul(x, q4, output)
+        gptqmodel_exllama_kernels.q4_matmul(x, q4, output)
         return output.view(outshape)

     # Abstract implementation
@@ -575,7 +575,9 @@ def exv2_i4f16_impl(x, q_handle, q4_width, force_cuda):
             (x.shape[0], q4_width), dtype=torch.float16, device=x.device
        )

-        exllamav2_kernels.gemm_half_q_half(x, q_handle, output, force_cuda)
+        gptqmodel_exllamav2_kernels.gemm_half_q_half(
+            x, q_handle, output, force_cuda
+        )
         return output.view(outshape)

     # Abstract implementation
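Here the renamed modules are consumed through custom-op wrappers registered with the repo's own reg_op helper, which this diff does not show. A sketch of an equivalent registration using PyTorch 2.4+'s torch.library.custom_op as a stand-in — the "fms_mo" namespace, the fake-kernel wiring, and the argument types are assumptions; only the q4_matmul call shape comes from the diff:

# Sketch only: reg_op is the repo's own helper (not shown in this diff);
# torch.library.custom_op is used here as a stand-in for the same pattern.
import torch
import gptqmodel_exllama_kernels  # assumes the renamed kernels are installed

@torch.library.custom_op("fms_mo::exv1_i4f16", mutates_args=())
def exv1_i4f16(x: torch.Tensor, q4: int, q4_width: int) -> torch.Tensor:
    # Flatten leading dims, run the int4 x fp16 matmul, restore the shape.
    outshape = x.shape[:-1] + (q4_width,)
    x = x.view(-1, x.shape[-1])
    output = torch.empty(
        (x.shape[0], q4_width), dtype=torch.float16, device=x.device
    )
    gptqmodel_exllama_kernels.q4_matmul(x, q4, output)
    return output.view(outshape)

# Fake (meta) implementation so tracing can infer shapes without
# touching the real CUDA kernel.
@exv1_i4f16.register_fake
def _(x, q4, q4_width):
    return x.new_empty(x.shape[:-1] + (q4_width,), dtype=torch.float16)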

fms_mo/modules/linear.py

Lines changed: 1 addition & 1 deletion

@@ -1501,14 +1501,14 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:

 try:
     # Third Party
-    from exllama_kernels import prepare_buffers, set_tuning_params
     from gptqmodel.nn_modules.qlinear.exllama import (
         ExllamaQuantLinear as QLinearExllamaV1,
     )
     from gptqmodel.nn_modules.qlinear.exllamav2 import (
         ExllamaV2QuantLinear as QLinearExllamaV2,
     )
     from gptqmodel.nn_modules.qlinear.exllamav2 import ext_gemm_half_q_half
+    from gptqmodel_exllama_kernels import prepare_buffers, set_tuning_params
     from transformers.pytorch_utils import Conv1D

 class QLinearExv1WI4AF16(QLinearExllamaV1):
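Only the import source of prepare_buffers and set_tuning_params changes here; the deletion and addition are the same import re-sorted to its new alphabetical position. These are the exllama v1 setup entry points, and a sketch of how they are typically invoked, following the upstream exllama integration (buffer shapes and tuning values below are illustrative placeholders, not values from this repo):

# Illustrative setup for the exllama v1 kernels under the new package name.
# Buffer shapes and tuning values are placeholders, not taken from fms_mo.
import torch
from gptqmodel_exllama_kernels import prepare_buffers, set_tuning_params

device = torch.device("cuda:0")
max_tokens = 2048   # assumed maximum tokens per forward pass
max_width = 4096    # assumed widest hidden dimension in the model

# Scratch buffers shared by every exllama v1 linear layer on this device.
temp_state = torch.zeros(
    (max_tokens, max_width), dtype=torch.float16, device=device
)
temp_dq = torch.zeros(
    (1, max_width * max_width), dtype=torch.float16, device=device
)
prepare_buffers(device, temp_state, temp_dq)

# Arguments: matmul_recons_thd, matmul_fused_remap, matmul_no_half2.
set_tuning_params(8, False, False)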

fms_mo/utils/import_utils.py

Lines changed: 2 additions & 2 deletions

@@ -22,8 +22,8 @@

 optional_packages = [
     "gptqmodel",
-    "exllama_kernels",
-    "exllamav2_kernels",
+    "gptqmodel_exllama_kernels",
+    "gptqmodel_exllamav2_kernels",
     "llmcompressor",
     "mx",
     "matplotlib",
