
Commit 12e2fdd

[custom ops] Implement proper AOT dynamic shape support for custom ops. (#637)
After re-reading Edward's excellent dynamic shapes manual (https://bit.ly/3Q7Dc18), I realized some things weren't quite right. Those are corrected, and all of the LLM ops are now tested for eager and AOT export with dynamic shapes.

* Remove the assert in AOT custom ops for dynamic shapes not being implemented.
* Use FakeTensorMode and ShapeEnv to create FakeTensors instead of real ones for AOT codegen.
* Fix a bug where we were using signless types in AOT custom ops vs sign-carrying (showed up in a uint8 test case).
* Added a KernelSelection.return_new_tensor() API to return a new tensor that is properly symbolic.
* Broke all of the LLM kernels into individual files and tests.
* Rewrote all of the shape checking code in the LLM kernels to use torch._check (a short sketch of the idiom follows the file stats below). Aside from being more ergonomic, this matches internal PyTorch conventions and has super powers: it adds shape constraints to the solver, resulting in very detailed shape info.
* Added export test coverage for LLM ops.
1 parent e541aef commit 12e2fdd

14 files changed (+879, -564 lines)
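The torch._check convention mentioned above deserves a concrete illustration. The snippet below is a minimal sketch, not taken from this commit (the function and tensor names are hypothetical): each shape relationship is asserted with torch._check, which raises a descriptive error in eager mode and, under export, also records the constraint with the symbolic-shapes solver.

# Minimal torch._check sketch; names are illustrative, not from this diff.
import torch

def check_mmt_shapes(a: torch.Tensor, b: torch.Tensor):
    # Validate a batched "matmul with transposed RHS" contraction: a @ b.T.
    torch._check(a.dim() == 3, lambda: f"expected rank-3 lhs, got rank {a.dim()}")
    torch._check(b.dim() == 2, lambda: f"expected rank-2 rhs, got rank {b.dim()}")
    # Relating two dims also teaches the export-time shape solver that they are
    # equal, so downstream ops see the refined symbolic shapes.
    torch._check(
        a.shape[2] == b.shape[1],
        lambda: f"reduction dim mismatch: {a.shape[2]} vs {b.shape[1]}",
    )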

core/shark_turbine/runtime/op_reg/base.py

Lines changed: 8 additions & 0 deletions
@@ -350,6 +350,14 @@ def return_tensor(self, t: Tensor) -> "TensorArg":
         """
         ...
 
+    def return_new_tensor(self, size: list, dtype: torch.dtype) -> "TensorArg":
+        """Constructs a new symbolic tensor and marks the next result as returning it.
+
+        This delegates to `return_tensor` but takes care of some easy to mess
+        up boilerplate for dynamic shapes.
+        """
+        return self.return_tensor(torch.empty(size, dtype=dtype, device="meta"))
+
 
 class EagerKernelSelection(KernelSelection):
     """Kernel selection specialized for eager arguments."""

core/shark_turbine/transforms/general/custom_op_expansion.py

Lines changed: 56 additions & 9 deletions
@@ -4,8 +4,12 @@
 # See https://llvm.org/LICENSE.txt for license information.
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
+from typing import Callable
+
 import torch
 from torch import Tensor
+from torch._subclasses.fake_tensor import FakeTensorMode
+from torch.fx.experimental.symbolic_shapes import ShapeEnv
 
 from ...dynamo.type_conversion import (
     NativeTypeConverter,
@@ -52,6 +56,8 @@ def __init__(
         self.ops_to_delete: dict[Operation, None] = {}
         self.type_converter = NativeTypeConverter(root_op.context)
         self.symbol_table = SymbolTable(root_op)
+        self.shape_env = ShapeEnv()
+        self.fake_mode = FakeTensorMode(shape_env=self.shape_env)
 
     def delete_op(self, op):
         self.ops_to_delete[op.operation] = None
@@ -86,9 +92,15 @@ def expand_func(self, func_op: Operation):
     def expand_custom_op(self, op_reg: CustomOp, op: Operation):
         original_operands: list[Value] = list(op.operands)
         ksel = AOTKernelSelection(
-            op_reg, original_operands, list(op.results), self.type_converter
+            op_reg,
+            original_operands,
+            list(op.results),
+            self.type_converter,
+            self.shape_env,
         )
-        op_reg.select(ksel)
+        with self.fake_mode:
+            op_reg.select(ksel)
+        ksel._run_validators()
 
         module_body = self.root_op.regions[0].blocks[0]
         kb = InlineKernelBuilder(
@@ -110,6 +122,8 @@ class AOTKernelSelection(KernelSelection):
         "operands",
         "results",
         "type_converter",
+        "shape_env",
+        "_validators",
     ]
 
     def __init__(
@@ -118,11 +132,18 @@ def __init__(
         operands: list[Value],
         results: list[Value],
         type_converter: NativeTypeConverter,
+        shape_env: ShapeEnv,
     ):
         super().__init__(op, len(operands))
         self.operands = operands
         self.results = results
         self.type_converter = type_converter
+        self.shape_env = shape_env
+        self._validators: list[Callable] = []
+
+    def _run_validators(self):
+        for v in self._validators:
+            v()
 
     def arg_tensor(self, arg: int, *, inplace_tied: bool = False) -> TensorArg:
         # This is annoying: We have to go from the Torch MLIR type system to the
@@ -133,29 +154,55 @@ def arg_tensor(self, arg: int, *, inplace_tied: bool = False) -> TensorArg:
         arg_descs = self.arg_descs
         assert arg_descs[arg] is None, f"Already constrained argument {arg}"
         operand = self.operands[arg]
-        signed_native_type = self.type_converter.torch_type_to_native(operand.type)
+        signed_native_type = self.type_converter.torch_type_to_native(
+            operand.type, signless=False
+        )
         try:
             rtt = RankedTensorType(signed_native_type)
-            # TODO: We need to do the FakeMode/ShapeEnv dance to create a symbolic
-            # fake tensor here.
         except TypeError as e:
             raise TypeError(
                 f"Argument type mismatch from Torch IR for arg {arg}: Expected ranked tensor, got {signed_native_type}"
             ) from e
-        assert not any(
-            rtt.is_dynamic_dim(i) for i in range(rtt.rank)
-        ), "NYI: Dynamic shape tensors in custom op AOT mode"
         element_type_asm = str(rtt.element_type)
         try:
             dtype = MLIR_TYPE_ASM_TO_TORCH_DTYPE[element_type_asm]
         except KeyError as e:
             raise AssertionError(
                 f"Could not find dtype mapping for {element_type_asm} in MLIR_TYPE_ASM_TO_TORCH_DTYPE"
             )
-        t = torch.empty(rtt.shape, dtype=dtype, device="meta")
+
+        # Because we are operating in fake_mode, replace MLIR dyn dims with
+        # symints for the PyTorch type system.
+        shape_env = self.shape_env
+        sym_shape = [
+            d if d >= 0 else shape_env.create_unbacked_symint() for d in rtt.shape
+        ]
+        t = torch.empty(sym_shape, dtype=dtype)
         arg_descs[arg] = desc = TensorArg(t)
         if inplace_tied:
             self.inplace_tied_arg_descs.append(desc)
+
+        def validator():
+            rank = rtt.rank
+            for i in range(rank):
+                spec_dim = desc.spec_dims[i]
+                if rtt.is_dynamic_dim(i):
+                    # Make sure that it wasn't specialized.
+                    if spec_dim is not None:
+                        raise ValueError(
+                            f"Custom op {self.op}, arg {arg} requires a static dim "
+                            f"at index {i} but it is dynamic: {rtt}"
+                        )
+                else:
+                    # Make sure specialized dim matches.
+                    actual_dim = rtt.get_dim_size(i)
+                    if spec_dim is not None and actual_dim != spec_dim:
+                        raise ValueError(
+                            f"Custom op {self.op}, arg {arg} has a mismatched static "
+                            f"dim at index {i}: actual = {actual_dim}, expected = {spec_dim}"
+                        )
+
+        self._validators.append(validator)
         return desc
 
     def arg_tensor_list(self, arg: int) -> TensorListArg:
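For readers unfamiliar with the machinery wired up above, here is a small standalone sketch (not part of the commit) of the FakeTensorMode/ShapeEnv pattern that arg_tensor now uses: the ShapeEnv mints unbacked SymInts for dimensions that are dynamic in the MLIR type, and allocating under the FakeTensorMode yields fake tensors that carry those symbolic sizes instead of real storage.

# Standalone sketch of the FakeTensorMode/ShapeEnv dance used in arg_tensor above.
import torch
from torch._subclasses.fake_tensor import FakeTensorMode
from torch.fx.experimental.symbolic_shapes import ShapeEnv

shape_env = ShapeEnv()
fake_mode = FakeTensorMode(shape_env=shape_env)

# Suppose the Torch IR type was tensor<?x?x3200xf32>: the two leading dims are
# dynamic (reported as negative sizes in RankedTensorType.shape), the last is static.
mlir_shape = [-1, -1, 3200]
with fake_mode:
    sym_shape = [
        d if d >= 0 else shape_env.create_unbacked_symint() for d in mlir_shape
    ]
    t = torch.empty(sym_shape, dtype=torch.float32)  # a FakeTensor, no real data
# t.shape now mixes unbacked SymInts with the static trailing dim, e.g. [u0, u1, 3200].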

llm/tests/ops/matmul_test.py

Lines changed: 0 additions & 103 deletions
This file was deleted.
Lines changed: 67 additions & 0 deletions
@@ -0,0 +1,67 @@
# Copyright 2024 Advanced Micro Devices, Inc
#
# Licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

import logging

logging.basicConfig(level=logging.DEBUG)

import unittest

import torch

from shark_turbine import aot
from turbine_llm import ops
from turbine_llm.types import layout_utils


class mmt_block_scaled_offset_q4_unsigned_test(unittest.TestCase):
    def setUp(self):
        torch.manual_seed(42)

    def test_basic(self):
        a = torch.rand([4, 16, 3200], dtype=torch.float32)
        d = torch.rand([3200, 100, 1], dtype=torch.float16)
        qs = (torch.rand([3200, 100, 16], dtype=torch.float32) * 32).to(torch.uint8)
        m = torch.rand([3200, 100, 1], dtype=torch.float16)
        result = ops.mmt_block_scaled_offset_q4_unsigned(a, d, qs, m)

        # Dequantize and test with normal matmul.
        # Tolerances are empirical and results are not expected to match exactly.
        qs_i8 = layout_utils.promote_linear_i4_block_to_i8(qs)
        b = (d.to(torch.float32) * qs_i8.to(torch.float32) + m).flatten(1)
        torch.testing.assert_close(result, torch.matmul(a, b.T), atol=1e-1, rtol=1e-5)

    def testExportDynamicDims(self):
        class MyModule(torch.nn.Module):
            def forward(self, a, d, qs, m):
                return ops.mmt_block_scaled_offset_q4_unsigned(a, d, qs, m)

        mod = MyModule()
        batch = torch.export.Dim("batch")
        m = torch.export.Dim("m")
        ep = torch.export.export(
            mod,
            args=(
                torch.rand([4, 16, 3200], dtype=torch.float32),
                torch.rand([3200, 100, 1], dtype=torch.float16),
                (torch.rand([3200, 100, 16], dtype=torch.float32) * 32).to(torch.uint8),
                torch.rand([3200, 100, 1], dtype=torch.float16),
            ),
            dynamic_shapes={
                "a": {0: batch, 1: m},
                "d": {},
                "qs": {},
                "m": {},
            },
        )
        asm = str(aot.export(ep).mlir_module)
        self.assertIn(
            "@turbine_llm_mmt_block_scaled_offset_q4_unsigned_3d_3200_3200_32_f32", asm
        )


if __name__ == "__main__":
    unittest.main()
Lines changed: 62 additions & 0 deletions
@@ -0,0 +1,62 @@
# Copyright 2024 Advanced Micro Devices, Inc
#
# Licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

import logging

logging.basicConfig(level=logging.DEBUG)

import unittest

import torch

from shark_turbine import aot
from turbine_llm import ops


class mmt_block_scaled_q8_test(unittest.TestCase):
    def setUp(self):
        torch.manual_seed(42)

    def testF32BS32(self):
        a = torch.rand([4, 16, 3200], dtype=torch.float32)
        d = torch.rand([3200, 100, 1], dtype=torch.float16)
        qs = (torch.rand([3200, 100, 32], dtype=torch.float32) * 32.0).to(torch.int8)
        result = ops.mmt_block_scaled_q8(a, d, qs)

        # Dequantize and test with normal matmul.
        # Tolerances are empirical and results are not expected to match exactly.
        b = (d.to(torch.float32) * qs.to(torch.float32)).flatten(1)
        torch.testing.assert_close(result, torch.matmul(a, b.T), atol=1e-1, rtol=1e-5)

    def testExportDynamicDims(self):
        class MyModule(torch.nn.Module):
            def forward(self, a, b, qs):
                return ops.mmt_block_scaled_q8(a, b, qs)

        mod = MyModule()
        batch = torch.export.Dim("batch")
        m = torch.export.Dim("m")
        ep = torch.export.export(
            mod,
            args=(
                torch.rand([4, 16, 3200], dtype=torch.float32),
                torch.rand([3200, 100, 1], dtype=torch.float16),
                (torch.rand([3200, 100, 32], dtype=torch.float32) * 32.0).to(
                    torch.int8
                ),
            ),
            dynamic_shapes={
                "a": {0: batch, 1: m},
                "b": {},
                "qs": {},
            },
        )
        asm = str(aot.export(ep).mlir_module)
        self.assertIn("@turbine_llm_mmt_block_scaled_q8_3d_3200_3200_32_f32", asm)


if __name__ == "__main__":
    unittest.main()
