Commit f8b7a76

Merge branch 'foundation-model-stack:main' into main
2 parents 45c8ded + 4705c75

File tree: 8 files changed (+291, -143 lines)


.github/workflows/labelpr.yaml

Lines changed: 2 additions & 2 deletions

@@ -13,7 +13,7 @@ jobs:
           github-token: ${{ secrets.GITHUB_TOKEN }}
           script: |
             // https://github.com/commitizen/conventional-commit-types
-            const valid_pr_types = ['feat', 'fix', 'docs', 'style', 'refactor', 'perf', 'test', 'build', 'ci', 'chore', 'revert'];
+            const valid_pr_types = ['feat', 'fix', 'docs', 'style', 'refactor', 'perf', 'test', 'build', 'ci', 'chore', 'revert', 'dependencies'];


             const title = context.payload.pull_request.title;
@@ -28,4 +28,4 @@ jobs:
             const labels = context.payload.pull_request.labels;
             const new_labels = labels.filter(label => !valid_pr_types.includes(label.name)); // keep all labels that are not in valid_pr_types
             new_labels.push({name: pr_type});
-            await github.rest.issues.update({ ...context.repo, issue_number: context.payload.number, labels: new_labels });
+            await github.rest.issues.update({ ...context.repo, issue_number: context.payload.number, labels: new_labels });
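The second hunk relies on a `pr_type` value parsed from the PR title earlier in the script, outside the visible context. As a rough illustration of the conventional-commit convention being enforced, here is a hypothetical Python equivalent of that title check; the real check runs as JavaScript inside `actions/github-script`, and the regex and function name below are assumptions, not taken from the workflow:

```python
import re
from typing import Optional

# Hypothetical Python rendering of the PR-title check; the workflow itself runs
# equivalent JavaScript inside actions/github-script. Regex and names are assumptions.
VALID_PR_TYPES = [
    "feat", "fix", "docs", "style", "refactor", "perf",
    "test", "build", "ci", "chore", "revert", "dependencies",
]


def extract_pr_type(title: str) -> Optional[str]:
    """Return the conventional-commit type prefix of a PR title, or None if invalid."""
    # Accepts e.g. "fix: ...", "feat(scope): ...", "chore!: ..."
    match = re.match(r"^(\w+)(\([^)]*\))?!?:", title)
    if match and match.group(1) in VALID_PR_TYPES:
        return match.group(1)
    return None


print(extract_pr_type("fix(aiu): correct smoothquant_scale key name"))  # fix
print(extract_pr_type("update readme"))                                 # None
```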

.github/workflows/pypi.yml

Lines changed: 3 additions & 3 deletions

@@ -44,7 +44,7 @@ jobs:
           # for setuptools-scm
           fetch-depth: 0

-      - uses: hynek/build-and-inspect-python-package@v2
+      - uses: hynek/build-and-inspect-python-package@b5076c307dc91924a82ad150cdd1533b444d3310 # v2.12.0

   # push to Test PyPI on
   # - a new GitHub release is published
@@ -77,7 +77,7 @@ jobs:
           path: dist

       - name: Upload to Test PyPI
-        uses: pypa/gh-action-pypi-publish@v12.2.4
+        uses: pypa/gh-action-pypi-publish@76f52bc884231f62b9a034ebfe128415bbaabdfc # v1.12.4
         with:
           repository-url: https://test.pypi.org/legacy/

@@ -122,4 +122,4 @@ jobs:
         run: rm ./dist/*.sigstore.json

       - name: Upload to PyPI
-        uses: pypa/gh-action-pypi-publish@v12.2.4
+        uses: pypa/gh-action-pypi-publish@76f52bc884231f62b9a034ebfe128415bbaabdfc # v1.12.4
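Both workflow edits replace floating version tags with full commit SHAs plus a human-readable version comment, so the action that runs is immutable even if the tag is later moved. The sketch below is a hypothetical stand-alone checker, not part of this repository, for spotting workflow steps that are still unpinned:

```python
import re
from pathlib import Path

# Hypothetical stand-alone checker (not part of this repository): list `uses:` entries
# in workflow files that are not pinned to a full 40-character commit SHA, the practice
# this commit adopts. Local and docker:// references are also flagged; good enough for
# a quick audit.
PINNED = re.compile(r"uses:\s*[\w.-]+/[\w.-]+@[0-9a-f]{40}\b")
ANY_USES = re.compile(r"uses:\s*(\S+)")


def unpinned_actions(workflow_dir: str = ".github/workflows") -> list[str]:
    findings = []
    for path in Path(workflow_dir).glob("*.y*ml"):
        for lineno, line in enumerate(path.read_text().splitlines(), start=1):
            used = ANY_USES.search(line)
            if used and not PINNED.search(line):
                findings.append(f"{path}:{lineno}: {used.group(1)}")
    return findings


if __name__ == "__main__":
    print("\n".join(unpinned_actions()))
```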

fms_mo/aiu_addons/gptq/gptq_aiu_op.py

Lines changed: 59 additions & 10 deletions

@@ -17,6 +17,7 @@
 import logging

 # Third Party
+from packaging.version import Version
 import torch

 # pylint: disable=unused-argument
@@ -25,6 +26,36 @@
 logger = logging.getLogger(__name__)


+def implement_op_decorator(op_namespace_id):
+    """Version-dependent decorator for custom op implementation.
+    Always compare against pytorch version in current environment.
+    """
+
+    torch_version = Version(torch.__version__.split("+", maxsplit=1)[0])
+
+    def decorator(func):
+        if torch_version < Version("2.4"):
+            return torch.library.impl(op_namespace_id, "default")(func)
+        return torch.library.custom_op(op_namespace_id, mutates_args=())(func)
+
+    return decorator
+
+
+def register_op_decorator(op_namespace_id):
+    """Version-dependent decorator for custom op registration.
+    Always compare against pytorch version in current environment.
+    """
+
+    torch_version = Version(torch.__version__.split("+", maxsplit=1)[0])
+
+    def decorator(func):
+        if torch_version < Version("2.4"):
+            return torch.library.impl_abstract(op_namespace_id)(func)
+        return torch.library.register_fake(op_namespace_id)(func)
+
+    return decorator
+
+
 def register_aiu_gptq_op():
     """Register AIU-specific op to enable torch compile without graph break.
     The op preserves I/O shapes of a `X @ W^T` matmul but performs no operation.
@@ -36,17 +67,33 @@ def register_aiu_gptq_op():
     ):
         logger.warning("AIU op has already been registered")
         return
-
     op_namespace_id = "gptq_gemm::i4f16_fxinputs_aiu"
-    torch.library.define(
-        op_namespace_id,
-        "(Tensor x, Tensor qw, Tensor qzeros, Tensor scales, Tensor g_idx) -> Tensor",
-    )
+    if Version(torch.__version__.split("+", maxsplit=1)[0]) < Version("2.4"):
+        torch.library.define(
+            op_namespace_id,
+            "(Tensor x, Tensor qw, Tensor qzeros, "
+            "Tensor scales, Tensor g_idx) -> Tensor",
+        )

     # Add implementations for the operator
-    @torch.library.impl(op_namespace_id, "default")
-    def i4f16_fxinputs_aiu(x, qw, qzeros, scales, g_idx):
-        # on AIU, GPTQ qw is [out_feat, in_feat]
+    @implement_op_decorator(op_namespace_id)
+    def i4f16_fxinputs_aiu(
+        x: torch.Tensor,
+        qw: torch.Tensor,
+        qzeros: torch.Tensor,
+        scales: torch.Tensor,
+        g_idx: torch.Tensor,
+    ) -> torch.Tensor:
+        """Implement fake processing of GPTQ W4A16 matmul. The purpose is to create a
+        node on the computational graph to be captured during compiling for AIU.
+
+        Instead of computing the weight decompression and matmul, this function returns
+        a zero tensor with the expected shape.
+
+        NOTE: on AIU, GPTQ qw is [out_feat, in_feat], while AutoGPTQ saves the quantized
+        weights as [in_feat, out_feat]
+        """
+
         outshape = x.shape[:-1] + (qw.shape[0],)
         x = x.view(-1, x.shape[-1])
         output = torch.zeros(
@@ -56,8 +103,10 @@ def i4f16_fxinputs_aiu(x, qw, qzeros, scales, g_idx):
         )
         return output.view(outshape)

-    @torch.library.impl_abstract(op_namespace_id)
-    def i4f16_fxinputs_aiu_abstract(x, qw, qzeros, scales, g_idx):
+    @register_op_decorator(op_namespace_id)
+    def _(x, qw, qzeros, scales, g_idx):
+        """OP template of I/O sizes"""
+
         outshape = x.shape[:-1] + (qw.shape[0],)
         return torch.empty(
             outshape,
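The registration path is now gated on the installed PyTorch version: before 2.4 the op is declared with `torch.library.define` and wired up via `torch.library.impl` / `impl_abstract`, while 2.4+ uses `torch.library.custom_op` and `torch.library.register_fake`, which infer the schema from the added type annotations. Either way the op lands under `torch.ops.gptq_gemm.i4f16_fxinputs_aiu`. A minimal usage sketch follows; the tensor shapes and dtypes are illustrative assumptions, not AIU-validated values:

```python
# Minimal usage sketch; shapes and dtypes are illustrative assumptions, not
# AIU-validated values. The op performs no real computation and returns zeros
# with the shape x.shape[:-1] + (qw.shape[0],).
import torch

from fms_mo.aiu_addons.gptq.gptq_aiu_op import register_aiu_gptq_op

register_aiu_gptq_op()

bs, seq, in_feat, out_feat = 2, 8, 128, 256
x = torch.randn(bs, seq, in_feat, dtype=torch.float16)
qw = torch.zeros(out_feat, in_feat, dtype=torch.int32)  # [out_feat, in_feat] layout (AIU convention)
qzeros = torch.zeros(out_feat, dtype=torch.int32)       # placeholder quantization metadata
scales = torch.ones(out_feat, dtype=torch.float16)
g_idx = torch.zeros(in_feat, dtype=torch.int32)

out = torch.ops.gptq_gemm.i4f16_fxinputs_aiu(x, qw, qzeros, scales, g_idx)
print(out.shape)  # torch.Size([2, 8, 256])
```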

fms_mo/aiu_addons/i8i8/i8i8_aiu_adapter.py

Lines changed: 17 additions & 36 deletions

@@ -97,41 +97,22 @@ def _add_defaults_and_concat(
     )


-# registration of new adapter steps for each architecture
-serialization.register_adapter_step("llama", "int8_qparams_aiu", _int8_qparams_aiu)
-serialization.register_adapter_step(
-    "gpt_bigcode", "int8_qparams_aiu", _int8_qparams_aiu
-)
-serialization.register_adapter_step("roberta", "int8_qparams_aiu", _int8_qparams_aiu)
-serialization.register_adapter_step(
-    "roberta_question_answering",
-    "int8_qparams_aiu",
-    _int8_qparams_aiu,
-)
-
-# registration of multi-step adapter for each architecture
-serialization.register_adapter(
+# registration of new adapter step and adapter for each architecture
+for arch in [
     "llama",
-    "fms_mo",
-    [
-        "hf_to_fms_names",
-        "hf_to_fms_rope",
-        "weight_fusion",
-        "int8_qparams_aiu",
-    ],
-)
-serialization.register_adapter(
-    "gpt_bigcode", "fms_mo", ["hf_to_fms_names", "weight_fusion", "int8_qparams_aiu"]
-)
-serialization.register_adapter(
-    "roberta", "fms_mo", ["hf_to_fms_names", "weight_fusion", "int8_qparams_aiu"]
-)
-serialization.register_adapter(
+    "gpt_bigcode",
+    "granite",
+    "roberta",
     "roberta_question_answering",
-    "fms_mo",
-    [
-        "hf_to_fms_names",
-        "weight_fusion",
-        "int8_qparams_aiu",
-    ],
-)
+]:
+    serialization.register_adapter_step(arch, "int8_qparams_aiu", _int8_qparams_aiu)
+    if arch in ["llama", "granite"]:
+        steps_to_register = [
+            "hf_to_fms_names",
+            "hf_to_fms_rope",
+            "weight_fusion",
+            "int8_qparams_aiu",
+        ]
+    else:
+        steps_to_register = ["hf_to_fms_names", "weight_fusion", "int8_qparams_aiu"]
+    serialization.register_adapter(arch, "fms_mo", steps_to_register)
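The refactor folds eight near-identical registration calls into one loop and adds the `granite` architecture; `hf_to_fms_rope` is included only for `llama` and `granite`, presumably because only those architectures use rotary position embeddings. A dict-driven rendering of the same mapping, illustrative only since the module keeps the loop form:

```python
# Dict-driven rendering of the same registration mapping (illustrative only; the
# module keeps the for-loop form above, and `serialization` / `_int8_qparams_aiu`
# are its own objects).
ADAPTER_STEPS: dict[str, list[str]] = {
    arch: (
        ["hf_to_fms_names", "hf_to_fms_rope", "weight_fusion", "int8_qparams_aiu"]
        if arch in ("llama", "granite")
        else ["hf_to_fms_names", "weight_fusion", "int8_qparams_aiu"]
    )
    for arch in (
        "llama",
        "gpt_bigcode",
        "granite",
        "roberta",
        "roberta_question_answering",
    )
}

# for arch, steps in ADAPTER_STEPS.items():
#     serialization.register_adapter_step(arch, "int8_qparams_aiu", _int8_qparams_aiu)
#     serialization.register_adapter(arch, "fms_mo", steps)
```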

fms_mo/aiu_addons/i8i8/i8i8_aiu_linear.py

Lines changed: 106 additions & 29 deletions

@@ -84,10 +84,10 @@ def __init__(
             "weight",
             torch.zeros(out_features, in_features, dtype=torch.int8),
         )
-        if bias:
-            self.register_buffer(
-                "bias", torch.zeros((out_features), dtype=torch.float16)
-            )
+
+        self.has_bias = bias
+        bias_size = out_features if self.has_bias else 1
+        self.register_buffer("bias", torch.zeros((bias_size), dtype=torch.float16))

         if config.weight_per_channel:
             w_clip_size = out_features
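The constructor now always registers a `bias` buffer, using a 1-element placeholder when the layer is configured without bias, and records the real setting in `self.has_bias`. A plausible motivation, inferred rather than stated in the diff, is keeping the buffer set and `state_dict` keys identical across configurations, with the placeholder recognizable by its single element (see the `bias.numel() > 1` check in the sharding code further below). A toy sketch of that behavior:

```python
import torch


# Toy module, not the fms-mo class: the 1-element placeholder keeps the "bias" buffer
# (and its state_dict key) present whether or not the layer actually has a bias, while
# self.has_bias records the real configuration.
class ToyLinear(torch.nn.Module):
    def __init__(self, out_features: int, bias: bool):
        super().__init__()
        self.has_bias = bias
        bias_size = out_features if bias else 1
        self.register_buffer("bias", torch.zeros(bias_size, dtype=torch.float16))


print(ToyLinear(4, bias=True).state_dict().keys())   # odict_keys(['bias'])
print(ToyLinear(4, bias=False).state_dict().keys())  # odict_keys(['bias'])
```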
@@ -188,11 +188,32 @@ def forward(self, x):
             self.smoothquant,
         )

+    def re_register_qdata(self) -> None:
+        """Remove existing self.qdata tensor and register it again as a buffer.
+        This method is used during TP, after other quantization metadata have been
+        updated.
+        """
+
+        del self.qdata
+        self.register_buffer(
+            "qdata",
+            torch.cat(
+                (
+                    self.w_clip_val,
+                    self.w_clip_valn,
+                    self.a_clip_val,
+                    self.a_clip_valn,
+                    self.zero_shift,
+                    self.smoothquant_scale,
+                )
+            ),
+        )
+
     def __repr__(self) -> str:
         return (
             f"{self.__class__.__name__}"
             f"(in={self.in_features}, out={self.out_features}, "
-            f"bias={self.bias is not None}, wq={self.weight_quant_type}, "
+            f"bias={self.has_bias}, wq={self.weight_quant_type}, "
             f"aq={self.activ_quant_type}, smoothq={self.smoothquant}, "
             f"op={self.aiu_op})"
         )
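`qdata` is the single concatenated buffer of quantization metadata handed to the AIU; once tensor-parallel sharding has resized some of its components, the old concatenation is stale and must be rebuilt, which is what `re_register_qdata()` does. A toy illustration of the size mismatch (all sizes below are made up):

```python
import torch

# Toy illustration (all sizes made up): qdata is the concatenation of six 1-D metadata
# tensors, so once w_clip_val / w_clip_valn shrink from out_feat to out_feat / world_size
# under per-channel sharding, the previously registered concatenation no longer matches
# the module's buffers.
out_feat, world_size = 8, 2
full = {
    "w_clip_val": torch.ones(out_feat),
    "w_clip_valn": -torch.ones(out_feat),
    "a_clip_val": torch.ones(1),
    "a_clip_valn": torch.zeros(1),
    "zero_shift": torch.zeros(1),
    "smoothquant_scale": torch.ones(1),
}
qdata_before = torch.cat(tuple(full.values()))

sharded = dict(full)
sharded["w_clip_val"] = full["w_clip_val"][: out_feat // world_size]
sharded["w_clip_valn"] = full["w_clip_valn"][: out_feat // world_size]
qdata_after = torch.cat(tuple(sharded.values()))

print(qdata_before.numel(), qdata_after.numel())  # 20 12
```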
@@ -222,7 +243,7 @@ def get_int8_aiu_linear(
     # Preprocess linear_config if its linear_type field is a callable
     # (which would not initialize correctly the dataclass parameters).
     # We don't want to alter the original linear_config though.
-    linear_config_for_dataclass: Optional[dict[Union[str, Callable], Any]] = None
+    linear_config_for_dataclass = None
     if callable(linear_config["linear_type"]):
         linear_config_for_dataclass = update_from_partial(linear_config)
         linear_config_for_dataclass["linear_type"] = linear_type
@@ -240,6 +261,36 @@
     return linear


+def is_w_clip_per_channel(
+    w_clip: torch.Tensor,
+) -> bool:
+    """Determine whether the weight clip value in use for INT8 quantization of the
+    provided linear module is:
+    - per-tensor (1 element, 1-dim tensor), or
+    - per-channel (out_feat elements, 1-dim tensor).
+    """
+
+    if w_clip.dim() != 1:
+        raise ValueError(
+            f"TP error: weight clip value dimensions {str(list(w_clip.size()))} are "
+            "incompatible with expected per-tensor or per-channel quantization."
+        )
+    return w_clip.numel() > 1
+
+
+def is_smoothquant_enabled(
+    smoothquant_scale: torch.Tensor,
+) -> bool:
+    """Determine whether smoothquant is enabled on a module."""
+
+    if smoothquant_scale.dim() != 1:
+        raise ValueError(
+            "TP error: smoothquant_scale array should always be 1-dimensional but "
+            f"has size {str(list(smoothquant_scale.size()))}"
+        )
+    return smoothquant_scale.numel() > 1
+
+
 def shard_int8_aiu_linear(
     tensor_values: dict[str, torch.Tensor],
     tp_module: TPModule,
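The two new helpers answer the question left open by the removed FIXME further down: per-tensor vs. per-channel weight clips, and smoothquant on vs. off, are distinguished purely by the number of elements in the corresponding 1-D tensor. A quick check, assuming `fms-mo` and its `fms` dependency are installed so the module imports:

```python
import torch

from fms_mo.aiu_addons.i8i8.i8i8_aiu_linear import (
    is_smoothquant_enabled,
    is_w_clip_per_channel,
)

out_feat = 4096
print(is_w_clip_per_channel(torch.zeros(1)))         # False -> per-tensor, CLONE
print(is_w_clip_per_channel(torch.zeros(out_feat)))  # True  -> per-channel, may be SHARDed
print(is_smoothquant_enabled(torch.zeros(1)))        # False -> smoothquant disabled
print(is_smoothquant_enabled(torch.zeros(512)))      # True  -> smoothquant enabled

# Anything that is not 1-dimensional is rejected outright:
# is_w_clip_per_channel(torch.zeros(4, 4))  -> ValueError("TP error: ...")
```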
@@ -259,49 +310,73 @@ def shard_int8_aiu_linear(
     | bias | 0 | - |
     | others* | N | - |

-    Other quantization parameters: w_clip_val, w_clip_valn,
-    a_clip_val, a_clip_valn, zero_shift, smoothquant_scale
-    No sharding on all these parameters, except w_clip_val and w_clip_valn when
-    per-channel quantization is used
+    Other quantization parameters: w_clip_val, w_clip_valn, a_clip_val, a_clip_valn,
+    zero_shift, smoothquant_scale
+
+    No sharding on any of these parameters (they are CLONED on each rank), with the
+    exception of:
+    - w_clip_val and w_clip_valn, only column-sharding and only when per-channel
+      quantization is used
+    - smoothquant_scale, only row-sharding and only if smoothquant in use
+
+    These parameters are 1-dimensional, so if sharding is needed, it is always applied
+    on dim=0.
     """
+
     param_sharding_info: dict[str, dict[str, LinearParameterShardingInfo]] = {}
+    w_clip_linear_param = None
     for module_name, module_info in module_sharding_info.items():
-        int8_aiu_mod = module_info.linear_module
+        int8_aiu_module = module_info.linear_module
+
+        # check every module if per-channel in use (sharding depends on module)
+        if is_w_clip_per_channel(module_info.linear_module.w_clip_val):
+            w_clip_linear_param = LinearParameterShardingInfo(
+                0,
+                ShardType.SHARD if module_info.sharding_dim == 0 else ShardType.CLONE,
+            )
+        else:
+            w_clip_linear_param = LinearParameterShardingInfo(0, ShardType.CLONE)
+
+        # check for every linear module if smoothquant is enabled
+        if is_smoothquant_enabled(module_info.linear_module.smoothquant_scale):
+            smoothquant_linear_param = LinearParameterShardingInfo(
+                0, ShardType.SHARD if module_info.sharding_dim == 1 else ShardType.CLONE
+            )
+        else:
+            smoothquant_linear_param = LinearParameterShardingInfo(0, ShardType.CLONE)
+
         params: dict[str, LinearParameterShardingInfo] = {
             "weight": LinearParameterShardingInfo(
                 module_info.sharding_dim, ShardType.SHARD
             ),
-            # FIXME: with per-channel W, clips need to be sharded
-            # but if per-tensor w, there should be no sharding
-            # HOW CAN WE DISCRIMINATE THE TWO CASES?
-            "w_clip_val": LinearParameterShardingInfo(0, ShardType.CLONE),
-            "w_clip_valn": LinearParameterShardingInfo(0, ShardType.CLONE),
-            # "w_clip_val": LinearParameterShardingInfo(
-            #     module_info.sharding_dim,
-            #     ShardType.SHARD if module_info.sharding_dim == 0 else ShardType.RANK0,
-            # ),
-            # "w_clip_valn": LinearParameterShardingInfo(
-            #     module_info.sharding_dim,
-            #     ShardType.SHARD if module_info.sharding_dim == 0 else ShardType.RANK0,
-            # ),
+            "w_clip_val": w_clip_linear_param,
+            "w_clip_valn": w_clip_linear_param,
             "a_clip_val": LinearParameterShardingInfo(0, ShardType.CLONE),
             "a_clip_valn": LinearParameterShardingInfo(0, ShardType.CLONE),
             "zero_shift": LinearParameterShardingInfo(0, ShardType.CLONE),
-            "smooqthquant_scale": LinearParameterShardingInfo(0, ShardType.CLONE),
+            "smoothquant_scale": smoothquant_linear_param,
         }
-        if int8_aiu_mod.bias is not None:
+        if int8_aiu_module.bias is not None and int8_aiu_module.bias.numel() > 1:
             params["bias"] = LinearParameterShardingInfo(
-                module_info.sharding_dim,
+                0,
                 ShardType.SHARD if module_info.sharding_dim == 0 else ShardType.RANK0,
             )
         param_sharding_info[module_name] = params

+    # trim qdata from dictionary of tensors to be copied on sharded modules.
+    # if not trimmed, qdata wouldn't be copied but the keys would be marked as unused
+    tensor_values = {k: v for k, v in tensor_values.items() if "qdata" not in k}
+
     unused_keys = shard_base_linear(
         tensor_values, tp_module, module_sharding_info, param_sharding_info
     )

-    raise NotImplementedError("TP not yet supported for INT8. Work in progress")
-    # return unused_keys
+    # qdata contains all quantization metadata to pass to the AIU and needs to be
+    # updated post-sharding, after metadata tensor have changed
+    for module_name, module_info in module_sharding_info.items():
+        module_info.linear_module.re_register_qdata()
+
+    return unused_keys


 register_linear_type_to_module_map(
@@ -320,4 +395,6 @@ def shard_int8_aiu_linear(
         use_smoothquant=True,
     ),
 )
+
+# int8 linear with and w/o smoothquant share a common sharding map
 register_linear_type_to_sharding_map("int8_aiu", shard_int8_aiu_linear)
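With the helpers in place, sharding decisions follow the weight: for a column-parallel module (`sharding_dim == 0`) a per-channel clip vector is split along its only dimension so it stays aligned with the local weight rows, while per-tensor values are cloned; for a row-parallel module only `smoothquant_scale`, the parameter tied to the input features, is sharded. The sketch below imitates that alignment by hand; the world size, shapes, and manual slicing are assumptions standing in for what `ShardType.SHARD` does:

```python
import torch

# Illustrative only: world size, shapes, and the manual slicing below are assumptions
# standing in for what ShardType.SHARD does. For a column-parallel module
# (sharding_dim == 0), each rank keeps a slice of the weight's output rows, so a
# per-channel w_clip_val of length out_feat must be sliced identically, while a
# per-tensor clip (1 element) is simply cloned on every rank.
world_size, rank = 2, 0
out_feat, in_feat = 8, 16

weight = torch.randn(out_feat, in_feat)
w_clip_per_channel = torch.rand(out_feat)
w_clip_per_tensor = torch.rand(1)

rows = out_feat // world_size
weight_shard = weight[rank * rows : (rank + 1) * rows]            # SHARD on dim 0
clip_shard = w_clip_per_channel[rank * rows : (rank + 1) * rows]  # SHARD on dim 0
clip_clone = w_clip_per_tensor                                    # CLONE

assert weight_shard.shape[0] == clip_shard.shape[0]  # clips stay aligned with local rows
print(weight_shard.shape, clip_shard.shape, clip_clone.shape)
```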
