foundation-model-stack
diff --git a/‎.gitignore‎
Lines changed: 7 additions & 1 deletion b/‎.gitignore‎
Lines changed: 7 additions & 1 deletion
diff --git a/‎fms_mo/aiu_addons/gptq/__init__.py‎ b/‎fms_mo/aiu_addons/gptq/__init__.py‎
diff --git a/‎fms_mo/aiu_addons/gptq/gptq_aiu_adapter.py‎
Lines changed: 59 additions & 0 deletions b/‎fms_mo/aiu_addons/gptq/gptq_aiu_adapter.py‎
Lines changed: 59 additions & 0 deletions
diff --git a/‎fms_mo/aiu_addons/gptq/gptq_aiu_linear.py‎
Lines changed: 210 additions & 0 deletions b/‎fms_mo/aiu_addons/gptq/gptq_aiu_linear.py‎
Lines changed: 210 additions & 0 deletions
diff --git a/‎fms_mo/aiu_addons/gptq/gptq_aiu_op.py‎
Lines changed: 67 additions & 0 deletions b/‎fms_mo/aiu_addons/gptq/gptq_aiu_op.py‎
Lines changed: 67 additions & 0 deletions
diff --git a/‎fms_mo/aiu_addons/i8i8/__init__.py‎ b/‎fms_mo/aiu_addons/i8i8/__init__.py‎
@@ -38,4 +38,10 @@ venv/
 dictionary.dic
 
 # Generated error log
-error.log
+error.log
+
+# Files generated from running examples
+fms_mo.log
+data_train/
+data_test/
+act_scales/
@@ -0,0 +1,59 @@
+# Copyright The FMS Model Optimizer Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Implement FMS adapter for GPTQ W4A16 checkpoints"""
+
+# Standard
+from typing import Mapping
+
+# Third Party
+from fms.utils import serialization
+import torch
+
+
+def _gptq_qweights_transpose_aiu(
+    input_sd: Mapping[str, torch.Tensor],
+) -> Mapping[str, torch.Tensor]:
+    """Adapt GPTQ checkpoint tensors to the layout used on AIU.
+
+    Returns a new dict with the same keys, in the same order, as `input_sd`:
+
+    * tensors whose name contains "qweight" are transposed;
+    * tensors whose name contains "g_idx" are replaced by a one-element
+      int32 zero tensor placed on the same device as the original;
+    * every other tensor is passed through unchanged.
+    """
+    adapted = {}
+    for key, tensor in input_sd.items():
+        if "qweight" in key:
+            # for AIU, qweights are needed as [out_feat, in_feat]
+            adapted[key] = tensor.t()
+        elif "g_idx" in key:
+            adapted[key] = torch.zeros(1, dtype=torch.int32, device=tensor.device)
+        else:
+            adapted[key] = tensor
+    return adapted
+
+
+# Expose the AIU qweight-transpose step under the same step name for both
+# architectures handled by this module ("llama" and "gpt_bigcode").
+serialization.register_adapter_step(
+    "llama", "gptq_qweights_transpose_aiu", _gptq_qweights_transpose_aiu
+)
+serialization.register_adapter_step(
+    "gpt_bigcode", "gptq_qweights_transpose_aiu", _gptq_qweights_transpose_aiu
+)
+# Compose the "hf_gptq_aiu" adapter for each architecture; the AIU-specific
+# transpose step is listed last in both step lists.
+# NOTE(review): the remaining step names are not defined in this file and are
+# presumably registered by fms itself -- confirm they exist for both models.
+serialization.register_adapter(
+    "llama",
+    "hf_gptq_aiu",
+    [
+        "hf_to_fms_names",
+        "hf_to_fms_rope",
+        "hf_gptq_fusion_check",
+        "weight_fusion",
+        "gptq_qweights_transpose_aiu",
+    ],
+)
+serialization.register_adapter(
+    "gpt_bigcode",
+    "hf_gptq_aiu",
+    ["hf_to_fms_names", "weight_fusion", "gptq_qweights_transpose_aiu"],
+)
@@ -0,0 +1,210 @@
+# Copyright The FMS Model Optimizer Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Implement GPTQ W4A16 linear module compatible with AIU compiler"""
+
+# Standard
+from typing import Any, Mapping, Optional
+import math
+
+# Third Party
+from fms.modules.linear import (
+    LinearModuleShardingInfo,
+    LinearParameterShardingInfo,
+    register_linear_type_to_module_map,
+    register_linear_type_to_sharding_map,
+    shard_base_linear,
+)
+from fms.modules.tp import ShardType, TPModule
+from fms.utils.gptq import GPTQLinearConfig
+import torch
+import torch.nn as nn
+
+# Local
+from fms_mo.aiu_addons.gptq.gptq_aiu_op import register_aiu_gptq_op
+
+# Register the custom AIU op as an import-time side effect, so that the op
+# lookup performed when a GPTQLinearAIU module is constructed can succeed.
+register_aiu_gptq_op()
+
+
+class GPTQLinearAIU(nn.Module):
+    """GPTQ W4A16 linear module laid out for the AIU compiler.
+
+    The quantization parameters (`qweight`, `qzeros`, `scales`, `g_idx` and,
+    optionally, `bias`) are held as zero-initialized buffers. The forward pass
+    hands them, together with the fp16-cast input, to the custom op
+    `gptq_gemm.i4f16_fxinputs_aiu`, which must be registered beforehand.
+    """
+
+    def __init__(
+        self,
+        in_features: int,
+        out_features: int,
+        bias: bool,
+        config: GPTQLinearConfig,
+    ):
+        """Validate the configuration and allocate the quantization buffers.
+
+        Args:
+            in_features: size of the last dimension of the input.
+            out_features: size of the last dimension of the output.
+            bias: whether a `bias` buffer is allocated.
+            config: GPTQ settings; `bits`, `group_size` and `desc_act` are
+                read. A `group_size` of -1 selects a single group spanning
+                all of `in_features`.
+
+        Raises:
+            NotImplementedError: if `bits` is not 4, or `desc_act` is enabled.
+            ValueError: if the resolved `group_size` is not positive, if
+                `in_features` is not divisible by `group_size`, if either
+                feature size is not divisible by 32, or if the custom AIU op
+                has not been registered.
+        """
+        super().__init__()
+
+        self.in_features = in_features
+        self.out_features = out_features
+        self.bits = config.bits
+        self.group_size = config.group_size if config.group_size != -1 else in_features
+        self.desc_act = config.desc_act
+
+        if self.bits not in [4]:
+            raise NotImplementedError(
+                "AIU GPTQLinear only supports 4 bits quantization."
+            )
+        # Reject non-positive group sizes explicitly: zero would otherwise
+        # surface as a ZeroDivisionError in the modulo check below, and other
+        # negative values as an obscure tensor-allocation failure.
+        if self.group_size <= 0:
+            raise ValueError("`group_size` must be a positive integer or -1.")
+        if in_features % self.group_size != 0:
+            raise ValueError("`in_features` must be divisible by `group_size`.")
+        if in_features % 32 or out_features % 32:
+            raise ValueError("`in_features` and `out_features` must be divisible by 32")
+        if self.desc_act:
+            raise NotImplementedError(
+                "AIU GPTQLinear does not support activation reordering (`desc_act`)"
+            )
+
+        # Exact, because divisibility by `group_size` was enforced above
+        num_groups = in_features // self.group_size
+
+        # Register quantization parameters
+        self.register_buffer(
+            "qweight",
+            torch.zeros(
+                # transposed w.r.t. GPTQ ckpt (AIU requirement)
+                (out_features, in_features // 32 * self.bits),
+                dtype=torch.int32,
+            ),
+        )
+        self.register_buffer(
+            "qzeros",
+            torch.zeros(
+                (num_groups, out_features // 32 * self.bits),
+                dtype=torch.int32,
+            ),
+        )
+        self.register_buffer(
+            "scales",
+            torch.zeros((num_groups, out_features), dtype=torch.float16),
+        )
+        # AIU requirement
+        self.register_buffer("g_idx", torch.tensor([0], dtype=torch.int32))
+        if bias:
+            self.register_buffer(
+                "bias",
+                torch.zeros(out_features, dtype=torch.float16),
+            )
+        else:
+            self.bias = None
+
+        # Look up the custom op; it has to be registered before construction
+        if not hasattr(torch.ops, "gptq_gemm") or not hasattr(
+            torch.ops.gptq_gemm, "i4f16_fxinputs_aiu"
+        ):
+            raise ValueError(
+                "Custom AIU op `gptq_gemm.i4f16_fxinputs_aiu` has not been registered."
+            )
+        self.aiu_op = torch.ops.gptq_gemm.i4f16_fxinputs_aiu
+
+    def forward(self, x):
+        """Cast `x` to fp16, run the custom AIU op and add the bias, if any."""
+        x = self.aiu_op(
+            x.half(),
+            self.qweight,
+            self.qzeros,
+            self.scales,
+            self.g_idx,
+        )
+        if self.bias is not None:
+            # In-place add on the tensor returned by the op
+            x.add_(self.bias)
+        return x
+
+    def __repr__(self) -> str:
+        """Summarize feature sizes, bias presence, group size and the op."""
+        return (
+            f"{self.__class__.__name__}"
+            f"(in={self.in_features}, out={self.out_features}, "
+            f"bias={self.bias is not None}, group={self.group_size}, "
+            f"op={self.aiu_op})"
+        )
+
+
+def get_gptq_aiu_linear(
+    in_features: int,
+    out_features: int,
+    bias: bool,
+    linear_config: Optional[Mapping[str, Any]] = None,
+):
+    """Build a `GPTQLinearAIU` module from a linear configuration mapping.
+
+    Args:
+        in_features: size of the last dimension of the input.
+        out_features: size of the last dimension of the output.
+        bias: whether the module carries a bias.
+        linear_config: keyword arguments for `GPTQLinearConfig`. `None` is
+            treated as an empty mapping, leaving the configuration at its own
+            defaults.
+            NOTE(review): assumes every `GPTQLinearConfig` field has a
+            default -- confirm.
+
+    Returns:
+        The constructed `GPTQLinearAIU` module.
+
+    Raises:
+        NotImplementedError: if the configuration enables `desc_act`.
+    """
+    # `**None` is a TypeError, so honor the declared `None` default explicitly
+    gptq_config = GPTQLinearConfig(**(linear_config or {}))
+    if gptq_config.desc_act:
+        raise NotImplementedError(
+            "Activation reordering (desc_act=True) not supported on AIU"
+        )
+    linear = GPTQLinearAIU(
+        in_features=in_features,
+        out_features=out_features,
+        bias=bias,
+        config=gptq_config,
+    )
+    linear.desc_act = gptq_config.desc_act
+    return linear
+
+
+def shard_gptq_aiu_linear(
+    tensor_values: dict[str, torch.Tensor],
+    tp_module: TPModule,
+    module_sharding_info: dict[str, LinearModuleShardingInfo],
+) -> Optional[set]:
+    """
+    Describe how each GPTQ quantization parameter of the AIU-compliant
+    linear modules is distributed, then delegate to `shard_base_linear`.
+
+    `qweight` follows the module's sharding dimension, `scales` and `qzeros`
+    use the opposite dimension, and `g_idx` is cloned on every shard.
+    A bias, when present, is sharded if the sharding dimension is 0 and is
+    otherwise assigned to rank 0 only.
+
+                         |     AIU     |
+    sharding  | qparam   | shard | dim |
+    ----------+----------+-------+-----|
+    colwise   | qweight  |   Y   |  0  |
+              | bias     |   Y   |  0  |
+              | scales   |   Y   |  1  |
+              | qzeros   |   Y   |  1  |
+              | g_idx    |   N   |  -  |
+    ----------+----------+-------+-----|
+    rowwise   | qweight  |   Y   |  1  |
+              | bias     | rank0 |  -  |
+              | scales   |   Y   |  0  |
+              | qzeros   |   Y   |  0  |
+              | g_idx    |   N   |  -  |
+
+    Returns whatever `shard_base_linear` returns (the unused keys).
+    """
+    param_sharding_info: dict[str, dict[str, LinearParameterShardingInfo]] = {}
+    for mod_name, mod_info in module_sharding_info.items():
+        linear_mod = mod_info.linear_module
+        shard_dim = mod_info.sharding_dim
+        opposite_dim = 1 - shard_dim
+
+        per_param: dict[str, LinearParameterShardingInfo] = {}
+        per_param["qweight"] = LinearParameterShardingInfo(shard_dim, ShardType.SHARD)
+        per_param["scales"] = LinearParameterShardingInfo(
+            opposite_dim, ShardType.SHARD
+        )
+        per_param["qzeros"] = LinearParameterShardingInfo(
+            opposite_dim, ShardType.SHARD
+        )
+        # g_idx on aiu is 1-dim zero tensor, always cloned on each shard
+        per_param["g_idx"] = LinearParameterShardingInfo(0, ShardType.CLONE)
+
+        if linear_mod.bias is not None:
+            if shard_dim == 0:
+                bias_shard_type = ShardType.SHARD
+            else:
+                bias_shard_type = ShardType.RANK0
+            per_param["bias"] = LinearParameterShardingInfo(shard_dim, bias_shard_type)
+
+        param_sharding_info[mod_name] = per_param
+
+    return shard_base_linear(
+        tensor_values, tp_module, module_sharding_info, param_sharding_info
+    )
+
+
+# Register the "gptq_aiu" linear type: its module factory and its
+# tensor-parallel sharding function (import-time side effect).
+register_linear_type_to_module_map("gptq_aiu", get_gptq_aiu_linear)
+register_linear_type_to_sharding_map("gptq_aiu", shard_gptq_aiu_linear)
@@ -0,0 +1,67 @@
+# Copyright The FMS Model Optimizer Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Registration of GPTQ W4A16 node compatible with AIU compiler"""
+
+# Standard
+import logging
+
+# Third Party
+import torch
+
+logger = logging.getLogger(__name__)
+
+
+def register_aiu_gptq_op():
+    """Register AIU-specific op to enable torch compile without graph break.
+    The op preserves I/O shapes of a `X @ W^T` matmul but performs no operation.
+    Quantization parameters are taken as arguments, so that they end up attached to
+    the computational graph.
+
+    Calling this function again once the op exists only logs a warning.
+    """
+    if hasattr(torch.ops, "gptq_gemm") and hasattr(
+        torch.ops.gptq_gemm, "i4f16_fxinputs_aiu"
+    ):
+        logger.warning("AIU op has already been registered")
+        return
+
+    op_namespace_id = "gptq_gemm::i4f16_fxinputs_aiu"
+    torch.library.define(
+        op_namespace_id,
+        "(Tensor x, Tensor qw, Tensor qzeros, Tensor scales, Tensor g_idx) -> Tensor",
+    )
+
+    # Eager implementation: returns fp16 zeros with the matmul output shape
+    @torch.library.impl(op_namespace_id, "default")
+    def i4f16_fxinputs_aiu(x, qw, qzeros, scales, g_idx):
+        # on AIU, GPTQ qw is [out_feat, in_feat]
+        outshape = x.shape[:-1] + (qw.shape[0],)
+        # Allocate directly in the output shape; flattening `x` through a view
+        # is unnecessary and would fail for non-contiguous inputs.
+        return torch.zeros(outshape, dtype=torch.float16, device=x.device)
+
+    # `impl_abstract` is deprecated in favor of `register_fake` in newer
+    # PyTorch releases; use the new name when available, else fall back.
+    register_fake = getattr(torch.library, "register_fake", None)
+    if register_fake is None:
+        register_fake = torch.library.impl_abstract
+
+    # Shape/dtype-only implementation used when tracing
+    @register_fake(op_namespace_id)
+    def i4f16_fxinputs_aiu_abstract(x, qw, qzeros, scales, g_idx):
+        outshape = x.shape[:-1] + (qw.shape[0],)
+        return torch.empty(
+            outshape,
+            dtype=torch.float16,
+            device=x.device,
+            requires_grad=False,
+        )
+
+    logger.info("GPTQ op 'i4f16_fxinputs_aiu' has been registered")