
Commit 39e7ff2

Merge branch 'main' into dependabot/pip/transformers-gte-4.45-and-lt-4.51
Signed-off-by: chichun-charlie-liu <[email protected]>
2 parents: cf81bca + f265075

18 files changed: +124 −53 lines changed

.github/workflows/test.yml

Lines changed: 1 addition & 1 deletion
@@ -40,9 +40,9 @@ jobs:
     strategy:
       matrix:
         python:
-          - "3.9"
           - "3.10"
           - "3.11"
+          - "3.12"
         platform:
           - "ubuntu-latest"

README.md

Lines changed: 7 additions & 6 deletions
@@ -36,9 +36,7 @@ FMS Model Optimizer is a framework for developing reduced precision neural netwo
 ### Requirements
 
 1. **🐧 Linux system with Nvidia GPU (V100/A100/H100)**
-2. Python 3.9 to Python 3.11
-
-   📋 Python 3.12 is currently not supported due to PyTorch Dynamo constraint
+2. Python 3.10 to Python 3.12
 3. CUDA >=12
 
 *Optional packages based on optimization functionality required:*
@@ -47,9 +45,12 @@ FMS Model Optimizer is a framework for developing reduced precision neural netwo
 - [auto_gptq](https://pypi.org/project/auto-gptq/) or build from [source](https://github.com/AutoGPTQ/AutoGPTQ)
 - If you want to experiment with **INT8** deployment in [QAT](./examples/QAT_INT8/) and [PTQ](./examples/PTQ_INT8/) examples:
   - Nvidia GPU with compute capability > 8.0 (A100 family or higher)
-  - [Ninja](https://ninja-build.org/)
-  - Clone the [CUTLASS](https://github.com/NVIDIA/cutlass) repository
-  - `PyTorch 2.3.1` (as newer version will cause issue for the custom CUDA kernel used in these examples)
+  - Option 1:
+    - [Ninja](https://ninja-build.org/)
+    - Clone the [CUTLASS](https://github.com/NVIDIA/cutlass) repository
+    - `PyTorch 2.3.1` (newer versions will cause issues for the custom CUDA kernel used in these examples)
+  - Option 2:
+    - Use the included Triton kernel. Note that this kernel is currently not faster than FP16.
 - **FP8** is a reduced precision format like **INT8**:
   - Nvidia A100 family or higher
   - [llm-compressor](https://github.com/vllm-project/llm-compressor)
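
Editor's note: to make the Option 1 / Option 2 split above concrete, a minimal pre-flight check along these lines can tell you which path is available. This is an illustrative snippet, not part of the repository.

# Hypothetical environment check for the INT8 example requirements above.
import importlib.util

import torch

if torch.cuda.is_available():
    major, minor = torch.cuda.get_device_capability()
    print(f"GPU compute capability: {major}.{minor} (A100 family or higher needed)")
else:
    print("No CUDA GPU detected")

# Option 2 only needs triton, which ships with recent PyTorch builds.
print("triton available:", importlib.util.find_spec("triton") is not None)
print("torch version:", torch.__version__, "(Option 1 expects 2.3.1)")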

fms_mo/fx/dynamo_utils.py

Lines changed: 53 additions & 22 deletions
@@ -33,6 +33,28 @@
 logger = logging.getLogger(__name__)
 
 
+def run_fwd_once(model, sample_inp):
+    """Convenience function to run the model once, unpacking the example input correctly."""
+    out = None  # stays None if the fallback call below fails
+    with torch.no_grad():
+        if isinstance(sample_inp, dict) or all(
+            hasattr(sample_inp, k) for k in ("keys", "values", "items")
+        ):
+            out = model(**sample_inp)
+        elif isinstance(sample_inp, tuple):
+            out = model(*sample_inp)
+        elif isinstance(sample_inp, torch.Tensor):
+            out = model(sample_inp)
+        else:
+            try:
+                # assume the user-provided input is ready to run
+                out = model(sample_inp)
+            except RuntimeError:
+                logger.info(
+                    f"Unknown data structure for example_input: {type(sample_inp)}. Please check."
+                )
+    return out
+
+
 def dfs_gm(
     gm,
     targetOp=None,
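
Editor's note: a rough usage sketch of the new helper. The toy module and inputs below are illustrative and assume run_fwd_once from the hunk above is in scope; the point is the dispatch on the example input's structure (mapping expands to keyword arguments, tuple to positional arguments, a bare tensor is passed directly).

# Sketch only, not part of the commit.
import torch


class Toy(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.lin = torch.nn.Linear(4, 2)

    def forward(self, x, scale=1.0):
        return self.lin(x) * scale


toy = Toy()
out_tensor = run_fwd_once(toy, torch.randn(1, 4))                            # model(x)
out_args = run_fwd_once(toy, (torch.randn(1, 4), 2.0))                       # model(*args)
out_kwargs = run_fwd_once(toy, {"x": torch.randn(1, 4), "scale": 2.0})       # model(**kwargs)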
@@ -229,7 +251,9 @@ def _dfs(curr_node, depth):
 
 
 def find_conv_on_shortcut_gm(
-    gm: torch.fx.GraphModule, lut_fx_mod_name_to_org: Optional[Dict[str, str]] = None
+    gm: torch.fx.GraphModule,
+    lut_fx_mod_name_to_org: Optional[Dict[str, str]] = None,
+    lut_name_to_mod=None,
 ):
     """Identify Conv on shortcut using FX GM DFS
     It's (almost) specific for ResNet-like CNNs, will return a list of module names (as used in the
@@ -254,6 +278,9 @@ def find_conv_on_shortcut_gm(
     5. count levels of each branch, decide which one is the shortcut
     """
 
+    if lut_name_to_mod is None:
+        lut_name_to_mod = {}
+
     # 1. Find "add" nodes, including inplace add as some may use "out+=shortcut"
     nodes_add = dfs_gm(gm, ["add"], return_nodes=True)
 
@@ -337,9 +364,13 @@ def find_conv_on_shortcut_gm(
             if n_conv_i.op == "call_module":
                 conv_mod = gm.get_submodule(n_conv_i.target)
             else:
-                conv_mod = get_org_mod_name_of_fx_node(
+                # in case aten IR is being used
+                conv_mod_name = get_org_mod_name_of_fx_node(
                     n_conv_i, lut_fx2org=lut_fx_mod_name_to_org
                 )
+                conv_mod = lut_name_to_mod.get(conv_mod_name, None)
+            if not isinstance(conv_mod, torch.nn.Conv2d):
+                continue
             if conv_mod.out_channels > conv_mod.in_channels:  # see Note 2
                 qconv_candidate.append(
                     get_org_mod_name_of_fx_node(
@@ -1003,8 +1034,17 @@ def cus_backend_model_analyzer(
         for _, m in gm_fx.named_modules()
         if isinstance(m, torch.nn.Conv2d) or issubclass(type(m), torch.nn.Conv2d)
     ]
-    if len(all_conv) > 0:
-        skip_candidates += find_conv_on_shortcut_gm(gm_fx, lut_fx_mod_name_to_org)
+    # if gm is using aten IR, only ops can be seen, no modules.
+    conv_ops = dfs_gm(
+        gm_fx,
+        targetOp=[torch.nn.Conv2d, torch.nn.functional.conv2d],
+        return_nodes=True,
+    )
+    lut_name_to_mod = {n: m for m, n in qcfg["LUTmodule_name"].items()}
+    if len(all_conv) > 0 or len(conv_ops) > 0:
+        skip_candidates += find_conv_on_shortcut_gm(
+            gm_fx, lut_fx_mod_name_to_org, lut_name_to_mod
+        )
 
     # Check 2. first/last, see Note 2 and 3, NOTE that transformers are handled differently
     if qcfg["N_backend_called"] > 1:
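
Editor's note: the lut_name_to_mod mapping built above is simply the inverse of a module-to-name lookup. A small standalone sketch of the two lookups (illustrative only, not the actual qcfg plumbing):

# Illustrative: module<->name lookups like qcfg["LUTmodule_name"] and its inversion above.
import torch

model = torch.nn.Sequential(torch.nn.Conv2d(3, 8, 3), torch.nn.ReLU())
lut_module_to_name = {m: n for n, m in model.named_modules() if n}  # module -> name
lut_name_to_mod = {n: m for m, n in lut_module_to_name.items()}     # name -> module, as used above

conv = lut_name_to_mod.get("0")
print(isinstance(conv, torch.nn.Conv2d))  # True, so it would pass the Conv2d check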
@@ -1064,6 +1104,7 @@ def cus_backend_model_analyzer(
     from functools import partial
 
     # Third Party
+    from torchvision.models import VisionTransformer
     from transformers import PreTrainedModel
 
     if issubclass(type(model), torch.nn.Module):
@@ -1075,7 +1116,7 @@ def cus_backend_model_analyzer(
         model_to_be_traced = model
         model_param_size = 999
 
-    is_transformers = issubclass(type(model), PreTrainedModel)
+    is_transformers = issubclass(type(model), (PreTrainedModel, VisionTransformer))
     if model_param_size > 1:
         # Standard
         import sys
@@ -1111,35 +1152,25 @@ def call_seq_hook(mod, *_args, **_kwargs):
             h_hooks.append(m.register_forward_hook(call_seq_hook))
 
         with torch.no_grad():
-            model(**sample_inp)
+            run_fwd_once(model, sample_inp)
 
         for h in h_hooks:
             h.remove()
 
         # only add last layer
         qcfg["qskip_layer_name"] += [qcfg["mod_call_seq"][-1]]
+        # if it's a ViT, also skip the first Conv (patch embedding)
+        if issubclass(type(model), VisionTransformer) and isinstance(
+            model.get_submodule(qcfg["mod_call_seq"][0]), torch.nn.Conv2d
+        ):
+            qcfg["qskip_layer_name"] += [qcfg["mod_call_seq"][0]]
 
     with torch.no_grad():
         model_opt = torch.compile(
             model_to_be_traced,
             backend=cus_bknd,
         )
-        if isinstance(sample_inp, dict) or all(
-            hasattr(sample_inp, k) for k in ("keys", "values", "items")
-        ):
-            model_opt(**sample_inp)
-        elif isinstance(sample_inp, tuple):
-            model_opt(*sample_inp)
-        elif isinstance(sample_inp, torch.Tensor):
-            model_opt(sample_inp)
-        else:
-            try:
-                # assume user provided input is ready-to-run...
-                model_opt(sample_inp)
-            except RuntimeError:
-                logger.info(
-                    f"Unknown data structure for example_input.{type(sample_inp)} Please check."
-                )
+        run_fwd_once(model_opt, sample_inp)
 
     del model_opt
 
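
Editor's note: the analyzer records module call order with forward hooks, then compiles with a custom Dynamo backend. A minimal standalone sketch of that pattern (names and the trivial backend are illustrative, not the actual fms_mo API; assumes run_fwd_once from the first hunk):

# Sketch of the hook-then-compile pattern used by the analyzer.
import torch

model = torch.nn.Sequential(torch.nn.Linear(4, 4), torch.nn.ReLU(), torch.nn.Linear(4, 2))
call_seq, hooks = [], []

for name, mod in model.named_modules():
    if name:  # skip the root module
        hooks.append(mod.register_forward_hook(lambda m, *_, _n=name: call_seq.append(_n)))

run_fwd_once(model, torch.randn(1, 4))  # record the call order once
for h in hooks:
    h.remove()
print(call_seq)  # e.g. ['0', '1', '2']; the last entry is a typical "skip quantization" candidate


def my_backend(gm, example_inputs):
    # a trivial custom backend: inspect gm here, then return a callable
    return gm.forward


compiled = torch.compile(model, backend=my_backend)
run_fwd_once(compiled, torch.randn(1, 4))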

fms_mo/fx/utils.py

Lines changed: 3 additions & 1 deletion
@@ -343,6 +343,8 @@ def get_org_mod_name_of_fx_node(
         str: corresponding name on original graph
     """
     org_name = f"Unknown:{node.name}"
+    if lut_fx2org is None:
+        lut_fx2org = {}
     if "nn_module_stack" in node.meta:
         n_fx_mod_name = list(node.meta["nn_module_stack"].keys())[-1]
         n_fx_org_mod_name = list(node.meta["nn_module_stack"].values())[-1][0]
@@ -360,7 +362,7 @@ def get_org_mod_name_of_fx_node(
                 org_name = v[: -len(suffix)]
                 break
 
-    if org_name is None:
+    if org_name.startswith("Unknown:"):
         org_name = lname_to_org_name(n_fx_org_mod_name)
 
     return org_name

fms_mo/modules/linear.py

Lines changed: 12 additions & 6 deletions
@@ -27,9 +27,6 @@
 import torch.nn.functional as F
 
 # Local
-from fms_mo.custom_ext_kernels.triton_kernels import (
-    tl_matmul_chunk_truncate as tl_matmul,
-)
 from fms_mo.custom_ext_kernels.utils import pack_vectorized
 from fms_mo.quant.quantizers import (
     HardPrune,
@@ -39,6 +36,13 @@
     get_weight_quantizer,
     mask_fc_kij,
 )
+from fms_mo.utils.import_utils import available_packages
+
+if available_packages["triton"]:
+    # Local
+    from fms_mo.custom_ext_kernels.triton_kernels import (
+        tl_matmul_chunk_truncate as tl_matmul,
+    )
 
 logger = logging.getLogger(__name__)
 
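
Editor's note: the available_packages guard above follows a common optional-dependency pattern. A self-contained approximation (not the actual fms_mo.utils.import_utils implementation) looks roughly like this:

# Rough approximation of an availability lookup such as available_packages.
import importlib.util

candidate_packages = ["triton", "ninja", "llmcompressor"]
available_packages = {p: importlib.util.find_spec(p) is not None for p in candidate_packages}

if available_packages["triton"]:
    import triton  # only imported when the wheel is actually installed
else:
    triton = None  # callers fall back to a non-triton kernel path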

@@ -879,7 +883,9 @@ def from_torch_iW(cls, nnlin_iW, prec, a_cv, a_cvn, w_cv, zero_shift, **kwargs):
         qlinear_iW.nbits_w = 8
         qlinear_iW.acc_dtype = kwargs.get("acc_dtype", torch.float)
         qlinear_iW.usePTnativeQfunc = kwargs.get("use_PT_native_Qfunc", True)
-        qlinear_iW.use_int_kernel = kwargs.get("use_int_kernel", "triton")
+        qlinear_iW.use_int_kernel = kwargs.get(
+            "use_int_kernel", "triton" if available_packages["triton"] else False
+        )
         qlinear_iW.weight = nn.Parameter(
             nnlin_iW.weight.to(torch.int8), requires_grad=False
         )
@@ -1119,15 +1125,15 @@ def set_matmul_op(self):
             imatmul_ops_reg,
         )
 
-        if self.use_int_kernel == "triton":
+        if self.use_int_kernel == "triton" and available_packages["triton"]:
             # will use real imatmul written in triton
             imm_func = partial(
                 tl_matmul,
                 chunk_trun_bits=self.truncate_lsb,
                 chunk_size=self.chunk_size,
             )
 
-        elif self.use_int_kernel == "cutlass":
+        elif self.use_int_kernel == "cutlass" and available_packages["cutlass"]:
            # will use real imatmul written in cutlass
            cutlass_ops_load_and_reg()
            # Third Party
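
Editor's note: set_matmul_op pre-binds the kernel configuration with functools.partial so the call site only passes tensors. A short illustrative sketch of that idea (the stand-in int_matmul below is hypothetical, not the real triton kernel):

# Sketch: pre-binding kernel configuration with functools.partial.
from functools import partial

import torch


def int_matmul(a, b, chunk_trun_bits=0, chunk_size=32):
    # stand-in for tl_matmul_chunk_truncate; the real kernel runs on GPU via triton
    return a @ b


imm_func = partial(int_matmul, chunk_trun_bits=4, chunk_size=64)
out = imm_func(torch.randn(2, 3), torch.randn(3, 5))  # call site only supplies the operands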

fms_mo/run_quant.py

Lines changed: 1 addition & 0 deletions
@@ -92,6 +92,7 @@ def quantize(
             "auto_gptq module not found. For more instructions on installing the appropriate "
             "package, see https://github.com/AutoGPTQ/AutoGPTQ?tab=readme-ov-file#installation"
         )
+        gptq_args.use_triton = gptq_args.use_triton and available_packages["triton"]
         run_gptq(model_args, data_args, opt_args, gptq_args)
     elif opt_args.quant_method == "fp8":
         if not available_packages["llmcompressor"]:

fms_mo/utils/import_utils.py

Lines changed: 1 addition & 0 deletions
@@ -29,6 +29,7 @@
     "graphviz",
     "pygraphviz",
     "fms",
+    "triton",
 ]
 
 available_packages = {}

pyproject.toml

Lines changed: 6 additions & 5 deletions
@@ -7,25 +7,25 @@ name = "fms-model-optimizer"
 description = "Quantization Techniques"
 readme = "README.md"
 license = {text = "Apache-2.0"}
-requires-python = ">=3.9,<3.12"
+requires-python = ">3.9,<3.13"
 classifiers=[
     "Development Status :: 3 - Alpha",
     "License :: OSI Approved :: Apache Software License",
     "License :: OSI Approved :: MIT License",
     "Operating System :: POSIX :: Linux",
     "Topic :: Scientific/Engineering :: Artificial Intelligence",
     "Programming Language :: Python :: 3",
-    "Programming Language :: Python :: 3.9",
     "Programming Language :: Python :: 3.10",
     "Programming Language :: Python :: 3.11",
+    "Programming Language :: Python :: 3.12",
     "Programming Language :: Python :: Implementation :: CPython",
 ]
 dynamic = ["version"]
 dependencies = [
     "numpy>=1.26.4,<2.3.0",
-    "accelerate>=0.20.3,!=0.34,<1.4",
+    "accelerate>=0.20.3,!=0.34,<1.7",
     "transformers>=4.45,<4.51",
-    "torch>=2.2.0,<2.5",
+    "torch>=2.2.0,<2.6",
     "triton>=3.0,<3.2",
     "tqdm>=4.66.2,<5.0",
     "datasets>=3.0.0,<4.0",
@@ -37,7 +37,8 @@ dependencies = [
     "huggingface_hub",
     "pandas",
     "safetensors",
-    "ibm-fms>=0.0.8"
+    "ibm-fms>=0.0.8",
+    "pkginfo>1.10"
 ]
 
 [project.optional-dependencies]

tests/aiu_addons/conftest.py

Lines changed: 25 additions & 5 deletions
@@ -74,11 +74,31 @@ def get_gptq_gemm_inputs(request) -> tuple[torch.Tensor, ...]:
         "atype": "per_tensor_symm",  # per_tensor_asymm, per_token
         "smoothquant": False,
     },
-    # {
-    #     "wtype": "per_channel",  # per_channel
-    #     "atype": "per_tensor_symm",  # per_tensor_asymm, per_token
-    #     "smoothquant": False,
-    # },
+    {
+        "wtype": "per_tensor",  # per_channel
+        "atype": "per_tensor_asymm",  # per_tensor_asymm, per_token
+        "smoothquant": False,
+    },
+    {
+        "wtype": "per_channel",  # per_channel
+        "atype": "per_tensor_symm",  # per_tensor_asymm, per_token
+        "smoothquant": False,
+    },
+    {
+        "wtype": "per_tensor",  # per_channel
+        "atype": "per_token",  # per_tensor_asymm, per_token
+        "smoothquant": False,
+    },
+    {
+        "wtype": "per_channel",  # per_channel
+        "atype": "per_tensor_asymm",  # per_tensor_asymm, per_token
+        "smoothquant": False,
+    },
+    {
+        "wtype": "per_channel",  # per_channel
+        "atype": "per_token",  # per_tensor_asymm, per_token
+        "smoothquant": False,
+    },
 ]
 
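
Editor's note: config dicts like the ones re-enabled above are typically consumed as fixture params via request.param. A hedged sketch of that pytest pattern (illustrative only, not the actual get_gptq_gemm_inputs fixture):

# Illustrative pytest pattern for sweeping quantization configs like those listed above.
import pytest

QUANT_CONFIGS = [
    {"wtype": "per_tensor", "atype": "per_tensor_symm", "smoothquant": False},
    {"wtype": "per_channel", "atype": "per_token", "smoothquant": False},
]


@pytest.fixture(params=QUANT_CONFIGS, ids=lambda c: f"{c['wtype']}-{c['atype']}")
def quant_config(request):
    return request.param


def test_gptq_gemm_config(quant_config):
    assert quant_config["wtype"] in ("per_tensor", "per_channel")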

Binary file not shown.
