Merge pull request #78 from chichun-charlie-liu/main

chichun-charlie-liu · web-flow · commit b5f84d94dc64 · 2025-03-25T10:07:46.000-04:00
fix: make triton optional for systems without GPUs
diff --git a/fms_mo/modules/linear.py b/fms_mo/modules/linear.py
@@ -27,9 +27,6 @@
 import torch.nn.functional as F
 
 # Local
-from fms_mo.custom_ext_kernels.triton_kernels import (
-    tl_matmul_chunk_truncate as tl_matmul,
-)
 from fms_mo.custom_ext_kernels.utils import pack_vectorized
 from fms_mo.quant.quantizers import (
     HardPrune,
@@ -39,6 +36,13 @@
     get_weight_quantizer,
     mask_fc_kij,
 )
+from fms_mo.utils.import_utils import available_packages
+
+if available_packages["triton"]:
+    # Local
+    from fms_mo.custom_ext_kernels.triton_kernels import (
+        tl_matmul_chunk_truncate as tl_matmul,
+    )
 
 logger = logging.getLogger(__name__)
 
@@ -879,7 +883,9 @@ def from_torch_iW(cls, nnlin_iW, prec, a_cv, a_cvn, w_cv, zero_shift, **kwargs):
         qlinear_iW.nbits_w = 8
         qlinear_iW.acc_dtype = kwargs.get("acc_dtype", torch.float)
         qlinear_iW.usePTnativeQfunc = kwargs.get("use_PT_native_Qfunc", True)
-        qlinear_iW.use_int_kernel = kwargs.get("use_int_kernel", "triton")
+        qlinear_iW.use_int_kernel = kwargs.get(
+            "use_int_kernel", "triton" if available_packages["triton"] else False
+        )
         qlinear_iW.weight = nn.Parameter(
             nnlin_iW.weight.to(torch.int8), requires_grad=False
         )
@@ -1119,15 +1125,15 @@ def set_matmul_op(self):
                 imatmul_ops_reg,
             )
 
-            if self.use_int_kernel == "triton":
+            if self.use_int_kernel == "triton" and available_packages["triton"]:
                 # will use real imatmul written in triton
                 imm_func = partial(
                     tl_matmul,
                     chunk_trun_bits=self.truncate_lsb,
                     chunk_size=self.chunk_size,
                 )
 
-            elif self.use_int_kernel == "cutlass":
+            elif self.use_int_kernel == "cutlass" and available_packages["cutlass"]:
                 # will use real imatmul written in cutlass
                 cutlass_ops_load_and_reg()
                 # Third Party
diff --git a/fms_mo/run_quant.py b/fms_mo/run_quant.py
@@ -92,6 +92,7 @@ def quantize(
                 "auto_gptq module not found. For more instructions on installing the appropriate "
                 "package, see https://github.com/AutoGPTQ/AutoGPTQ?tab=readme-ov-file#installation"
             )
+        gptq_args.use_triton = gptq_args.use_triton and available_packages["triton"]
         run_gptq(model_args, data_args, opt_args, gptq_args)
     elif opt_args.quant_method == "fp8":
         if not available_packages["llmcompressor"]:
diff --git a/fms_mo/utils/import_utils.py b/fms_mo/utils/import_utils.py
@@ -29,6 +29,7 @@
     "graphviz",
     "pygraphviz",
     "fms",
+    "triton",
 ]
 
 available_packages = {}
diff --git a/pyproject.toml b/pyproject.toml
@@ -37,7 +37,8 @@ dependencies = [
 "huggingface_hub",
 "pandas",
 "safetensors",
-"ibm-fms>=0.0.8"
+"ibm-fms>=0.0.8",
+"pkginfo>1.10"
 ]
 
 [project.optional-dependencies]
diff --git a/tests/models/conftest.py b/tests/models/conftest.py
@@ -1079,7 +1079,7 @@ def input_bert():
         torch.FloatTensor: BERT sample input
     """
     text = "Replace me by any text you'd like."
-    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
+    tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased")
     return tokenizer(text, return_tensors="pt")
 
 
@@ -1091,4 +1091,4 @@ def model_bert():
     Returns:
         transformers.models.bert.modeling_bert.BertModel: BERT model
     """
-    return BertModel.from_pretrained("bert-base-uncased", torchscript=True)
+    return BertModel.from_pretrained("google-bert/bert-base-uncased", torchscript=True)
diff --git a/tests/triton_kernels/test_triton_mm.py b/tests/triton_kernels/test_triton_mm.py
@@ -18,10 +18,18 @@
 import torch
 
 # Local
-from fms_mo.custom_ext_kernels.triton_kernels import (
-    tl_matmul_chunk_truncate as tl_matmul,
-)
 from fms_mo.modules.linear import LinearFPxAcc
+from fms_mo.utils.import_utils import available_packages
+
+if available_packages["triton"]:
+    # Local
+    from fms_mo.custom_ext_kernels.triton_kernels import (
+        tl_matmul_chunk_truncate as tl_matmul,
+    )
+else:
+    raise ImportError(
+        "triton python package is not available, please check your installation."
+    )
 
 
 @pytest.mark.parametrize("mkn", [64, 256, 1024, 4096])

Original file line number	Diff line number	Diff line change
`@@ -92,6 +92,7 @@ def quantize(`
`92`	`92`	`"auto_gptq module not found. For more instructions on installing the appropriate "`
`93`	`93`	`"package, see https://github.com/AutoGPTQ/AutoGPTQ?tab=readme-ov-file#installation"`
`94`	`94`	`)`
	`95`	`+ gptq_args.use_triton = gptq_args.use_triton and available_packages["triton"]`
`95`	`96`	`run_gptq(model_args, data_args, opt_args, gptq_args)`
`96`	`97`	`elif opt_args.quant_method == "fp8":`
`97`	`98`	`if not available_packages["llmcompressor"]:`
Original file line number	Diff line number	Diff line change
`@@ -29,6 +29,7 @@`
`29`	`29`	`"graphviz",`
`30`	`30`	`"pygraphviz",`
`31`	`31`	`"fms",`
	`32`	`+ "triton",`
`32`	`33`	`]`
`33`	`34`
`34`	`35`	`available_packages = {}`
Original file line number	Diff line number	Diff line change
`@@ -37,7 +37,8 @@ dependencies = [`
`37`	`37`	`"huggingface_hub",`
`38`	`38`	`"pandas",`
`39`	`39`	`"safetensors",`
`40`		`-"ibm-fms>=0.0.8"`
	`40`	`+"ibm-fms>=0.0.8",`
	`41`	`+"pkginfo>1.10"`
`41`	`42`	`]`
`42`	`43`
`43`	`44`	`[project.optional-dependencies]`