Commit 2e1e58d

Enable gptq quantization through quantize API
Signed-off-by: Thara Palanivel <[email protected]>
1 parent 0f69223 commit 2e1e58d

File tree: 5 files changed (+15, -14 lines)


fms_mo/run_quant.py

Lines changed: 7 additions & 6 deletions
@@ -85,11 +85,11 @@ def quantize(
     logger.info(f"{fms_mo_args}\n{opt_args.quant_method}\n")

     if opt_args.quant_method == "gptq":
-        if not available_packages["auto_gptq"]:
+        if not available_packages["gptqmodel"]:
             raise ImportError(
                 "Quantization method has been selected as gptq but unable to use external library, "
-                "auto_gptq module not found. For more instructions on installing the appropriate "
-                "package, see https://github.com/AutoGPTQ/AutoGPTQ?tab=readme-ov-file#installation"
+                "gptqmodel module not found. For more instructions on installing the appropriate "
+                "package, see https://github.com/ModelCloud/GPTQModel/tree/main?tab=readme-ov-file#install"
             )
         run_gptq(model_args, data_args, opt_args, gptq_args)
     elif opt_args.quant_method == "fp8":
@@ -127,6 +127,7 @@ def run_gptq(model_args, data_args, opt_args, gptq_args):
     from gptqmodel import GPTQModel, QuantizeConfig
     from gptqmodel.models._const import SUPPORTED_MODELS
     from gptqmodel.models.auto import MODEL_MAP
+    from gptqmodel.utils.backend import BACKEND

     # Local
     from fms_mo.utils.custom_gptq_models import custom_gptq_classes
@@ -164,17 +165,17 @@ def run_gptq(model_args, data_args, opt_args, gptq_args):
     start_time = time.time()
     model.quantize(
         data,
-        use_triton=gptq_args.use_triton,
+        backend=BACKEND.TRITON if gptq_args.use_triton else BACKEND.AUTO,
         batch_size=gptq_args.batch_size,
-        cache_examples_on_gpu=gptq_args.cache_examples_on_gpu,
+        calibration_enable_gpu_cache=gptq_args.cache_examples_on_gpu,
     )

     logger.info(
         f"Time to quantize model at {opt_args.output_dir}: {time.time() - start_time}"
     )

     logger.info(f"Saving quantized model and tokenizer to {opt_args.output_dir}")
-    model.save_quantized(opt_args.output_dir, use_safetensors=True)
+    model.save_quantized(opt_args.output_dir)
     tokenizer.save_pretrained(opt_args.output_dir)
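For reference, the gptqmodel-backed flow that run_gptq now uses roughly follows the sketch below. The quantize() and save_quantized() calls mirror the diff above; GPTQModel.load, the QuantizeConfig fields, and the model id are assumptions drawn from the gptqmodel README rather than code in this commit:

# Hedged sketch of the calibrate-and-save flow this commit switches to.
from gptqmodel import GPTQModel, QuantizeConfig
from gptqmodel.utils.backend import BACKEND

# QuantizeConfig fields and GPTQModel.load are assumed from the gptqmodel
# README; the model id and calibration text are purely illustrative.
quant_config = QuantizeConfig(bits=4, group_size=128)
model = GPTQModel.load("ibm-granite/granite-7b-base", quant_config)

calibration_data = ["example calibration text ..."]  # hypothetical dataset
model.quantize(
    calibration_data,
    backend=BACKEND.AUTO,  # BACKEND.TRITON when use_triton is requested
    batch_size=1,
    calibration_enable_gpu_cache=False,
)
model.save_quantized("quantized_model")

Note that save_quantized no longer takes use_safetensors: dropping the flag is consistent with the test change below, which now expects HF-style model*.safetensors shards by default.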

fms_mo/utils/custom_gptq_models.py

Lines changed: 3 additions & 3 deletions
@@ -15,10 +15,10 @@
 """Allow users to add new GPTQ classes for their custom models easily."""

 # Third Party
-from auto_gptq.modeling import BaseGPTQForCausalLM
+from gptqmodel.models.base import BaseGPTQModel


-class GraniteGPTQForCausalLM(BaseGPTQForCausalLM):
+class GraniteGPTQForCausalLM(BaseGPTQModel):
     """Enable Granite for GPTQ."""

     layer_type = "GraniteDecoderLayer"
@@ -32,7 +32,7 @@ class GraniteGPTQForCausalLM(BaseGPTQForCausalLM):
     ]


-class GraniteMoeGPTQForCausalLM(BaseGPTQForCausalLM):
+class GraniteMoeGPTQForCausalLM(BaseGPTQModel):
     """Enable Granite MOE for GPTQ."""

     layer_type = "GraniteMoeDecoderLayer"

fms_mo/utils/import_utils.py

Lines changed: 1 addition & 1 deletion
@@ -21,7 +21,7 @@
 import torch

 optional_packages = [
-    "auto_gptq",
+    "gptqmodel",
     "exllama_kernels",
     "exllamav2_kernels",
     "llmcompressor",

pyproject.toml

Lines changed: 2 additions & 2 deletions
@@ -23,7 +23,7 @@ classifiers=[
 dynamic = ["version"]
 dependencies = [
     "numpy>=1.26.4,<2.3.0",
-    "accelerate>=0.20.3,!=0.34,<1.1",
+    "accelerate>=1.2.1,!=0.34",
     "transformers>=4.45,<4.48",
     "torch>=2.2.0,<2.4",
     "tqdm>=4.66.2,<5.0",
@@ -41,7 +41,7 @@ dependencies = [
 [project.optional-dependencies]
 dev = ["pre-commit>=3.0.4,<5.0"]
 fp8 = ["llmcompressor"]
-gptq = ["gptqmodel"]
+gptq = ["Cython", "gptqmodel>=1.7.3"]
 visualize = ["matplotlib", "graphviz", "pygraphviz"]
 flash-attn = ["flash-attn>=2.5.3,<3.0"]
 opt = ["fms-model-optimizer[fp8, gptq]"]
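With these pins, the GPTQ path is still pulled in through the existing optional extra, e.g. pip install "fms-model-optimizer[gptq]", which now resolves to Cython plus gptqmodel>=1.7.3 instead of auto-gptq. Cython is presumably required to build gptqmodel's compiled kernels at install time; the commit itself does not state the reason.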

tests/build/test_launch_script.py

Lines changed: 2 additions & 2 deletions
@@ -86,7 +86,7 @@ def cleanup_env():


 @pytest.mark.skipif(
-    not available_packages["auto_gptq"],
+    not available_packages["gptqmodel"],
     reason="Only runs if auto-gptq package is installed",
 )
 def test_successful_gptq():
@@ -254,7 +254,7 @@ def _validate_quantization_output(base_dir, quant_method):

     # Check quantized model files exist
     if quant_method == "gptq":
-        assert len(glob.glob(os.path.join(base_dir, "gptq_model-*.safetensors"))) > 0
+        assert len(glob.glob(os.path.join(base_dir, "model*.safetensors"))) > 0
         assert os.path.exists(os.path.join(base_dir, "quantize_config.json")) is True
         assert os.path.exists(os.path.join(base_dir, "config.json")) is True
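A matching reload smoke test might look like the hedged sketch below; GPTQModel.load as the reload entry point is an assumption based on the gptqmodel README, while the file patterns come from the updated assertions above:

import glob
import os

from gptqmodel import GPTQModel

base_dir = "output_dir"  # hypothetical quantization output directory

# File layout checked by the updated test: HF-style safetensors shards
# plus the quantization and model configs.
assert glob.glob(os.path.join(base_dir, "model*.safetensors"))
assert os.path.exists(os.path.join(base_dir, "quantize_config.json"))
assert os.path.exists(os.path.join(base_dir, "config.json"))

model = GPTQModel.load(base_dir)  # assumed reload API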
