Commit e541aef

[llm] Add Q4_K quantization. (#628)
Parent: 9ca50de

File tree: 11 files changed, +735 / -14 lines


core/shark_turbine/runtime/op_reg/base.py

Lines changed: 17 additions & 2 deletions
@@ -8,7 +8,7 @@
 dispatcher.
 """
 
-from typing import Any, Callable, Optional, Sequence, Type, Union, cast
+from typing import Any, Callable, List, Optional, Sequence, Type, Union, cast
 
 from abc import ABC, abstractmethod
 import functools
@@ -478,6 +478,9 @@ def mlir_type_asm(self) -> str:
         return "i64"
 
 
+_NoneInt: Optional[int] = None
+
+
 class TensorArg:
     __slots__ = [
         "t",
@@ -491,13 +494,25 @@ class TensorArg:
     def __init__(self, t: Tensor):
         self.t = t
         # Any static dims that we are specializing. Defaults to all dynamic.
-        self.spec_dims: Sequence[Optional[int]] = len(t.shape) * [None]
+        self.spec_dims = len(t.shape) * [_NoneInt]
         # All descriptors have an attribute to indicate their value
         # as a tensor, and those that aren't are fixated to None.
         # This is to enable fast lookup in the hot path of determining
         # how to dispatch.
         self.maybe_tensor_value: Tensor = t
 
+    def specialize_all_dims(self):
+        """Marks all dimensions as specialized."""
+        self.spec_dims = list(self.t.shape)
+
+    def specialize_dims(self, *indices: int):
+        """Specializes individual dimensions.
+
+        `i` can have negative indexing.
+        """
+        for i in indices:
+            self.spec_dims[i] = self.t.size(i)
+
     def __repr__(self):
         return (
             f"TensorArg(shape={self.t.shape}, dtype={self.t.dtype}, "
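The new `specialize_dims` / `specialize_all_dims` helpers are what the Q4_K kernel's `select()` (further down, in `matmul.py`) uses to pin static dimensions while leaving the rest dynamic. A minimal, self-contained sketch of the behavior; the trimmed-down `TensorArg` below is copied from the hunk above for illustration, not imported from `shark_turbine`:

```python
from typing import List, Optional

import torch

_NoneInt: Optional[int] = None


class TensorArg:
    """Cut-down stand-in for shark_turbine's TensorArg, enough to show spec_dims."""

    def __init__(self, t: torch.Tensor):
        self.t = t
        # Any static dims that we are specializing. Defaults to all dynamic.
        self.spec_dims: List[Optional[int]] = len(t.shape) * [_NoneInt]

    def specialize_all_dims(self):
        """Marks all dimensions as specialized."""
        self.spec_dims = list(self.t.shape)

    def specialize_dims(self, *indices: int):
        """Specializes individual dimensions; negative indices are allowed."""
        for i in indices:
            self.spec_dims[i] = self.t.size(i)


arg = TensorArg(torch.rand([4, 16, 3200]))
print(arg.spec_dims)     # [None, None, None] -> fully dynamic
arg.specialize_dims(-1)  # pin only the last (K) dimension
print(arg.spec_dims)     # [None, None, 3200]
```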

llm/tests/ops/matmul_test.py

Lines changed: 38 additions & 0 deletions
@@ -34,6 +34,9 @@ def test3DF32(self):
 
 
 class mmt_block_scaled_q8_test(unittest.TestCase):
+    def setUp(self):
+        torch.manual_seed(42)
+
     def testF32BS32(self):
         a = torch.rand([4, 16, 3200], dtype=torch.float32)
         d = torch.rand([3200, 100, 1], dtype=torch.float16)
@@ -47,6 +50,9 @@ def testF32BS32(self):
 
 
 class mmt_block_scaled_offset_q4_unsigned_test(unittest.TestCase):
+    def setUp(self):
+        torch.manual_seed(42)
+
     def test_basic(self):
         a = torch.rand([4, 16, 3200], dtype=torch.float32)
         d = torch.rand([3200, 100, 1], dtype=torch.float16)
@@ -61,5 +67,37 @@ def test_basic(self):
         torch.testing.assert_close(result, torch.matmul(a, b.T), atol=1e-1, rtol=1e-5)
 
 
+class mmt_super_block_scaled_offset_q4_unsigned(unittest.TestCase):
+    def setUp(self):
+        torch.manual_seed(42)
+
+    @unittest.skip(
+        "compiler bad tile selection:"
+        "https://github.com/openxla/iree/issues/17078#issuecomment-2062331207"
+    )
+    def test_basic(self):
+        # n = 2560, k = 5120, sup = 20, sub = 8, bs = 32
+        a = torch.rand([4, 16, 5120], dtype=torch.float32)
+        d = torch.rand([2560, 20, 1], dtype=torch.float16)
+        dmin = torch.rand([2560, 20, 1], dtype=torch.float16)
+        sb_scales_hi = (torch.rand([2560, 20, 2], dtype=torch.float32) * 127).to(
+            torch.uint8
+        )
+        sb_scales_low = (torch.rand([2560, 20, 4], dtype=torch.float32) * 127).to(
+            torch.uint8
+        )
+        sb_mins_hi = (torch.rand([2560, 20, 2], dtype=torch.float32) * 127).to(
+            torch.uint8
+        )
+        sb_mins_low = (torch.rand([2560, 20, 4], dtype=torch.float32) * 127).to(
+            torch.uint8
+        )
+        qs = (torch.rand([2560, 20, 8, 16], dtype=torch.float32) * 127).to(torch.uint8)
+        result = ops.mmt_super_block_scaled_offset_q4_unsigned(
+            a, d, dmin, sb_scales_hi, sb_scales_low, sb_mins_hi, sb_mins_low, qs
+        )
+        # TODO: Validate numerics once enabled and crash bug fixed.
+
+
 if __name__ == "__main__":
     unittest.main()
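For anyone decoding the magic numbers in the skipped super-block test: every operand shape follows from the parameters in its leading comment and the op's documented layout (`K == SUP_COUNT * SUB_COUNT * BS`). A small illustrative check, not part of the test suite:

```python
# Illustrative shape bookkeeping for the skipped test above (assumed, not asserted by the suite).
n, sup_count, sub_count, bs = 2560, 20, 8, 32
k = sup_count * sub_count * bs  # 5120, must equal a.shape[-1]
expected_shapes = {
    "a": (4, 16, k),
    "d": (n, sup_count, 1),
    "dmin": (n, sup_count, 1),
    "sb_scales_hi": (n, sup_count, sub_count // 4),   # packed high bits
    "sb_scales_low": (n, sup_count, sub_count // 2),  # packed low bits
    "sb_mins_hi": (n, sup_count, sub_count // 4),
    "sb_mins_low": (n, sup_count, sub_count // 2),
    "qs": (n, sup_count, sub_count, bs // 2),         # two 4-bit values per byte
}
assert k == 5120
```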

llm/tests/types/layout_utils_test.py

Lines changed: 29 additions & 0 deletions
@@ -64,6 +64,35 @@ def test_promote_i4_block_to_i8_signed(self):
             r0,
         )
 
+    def test_promote_i2_block_to_i8(self):
+        data = torch.tensor([[0xC1, 0xB2, 0xA3, 0x94, 0x85]], dtype=torch.uint8)
+        expected = torch.tensor(
+            # fmt: off
+            [[
+                1, 0, 0, 3, # 0xC1
+                2, 0, 3, 2, # 0xB2
+                3, 0, 2, 2, # 0xA3
+                0, 1, 1, 2, # 0x94
+                1, 1, 0, 2  # 0x85
+            ]],
+            dtype=torch.uint8,
+            # fmt: on
+        )
+        r0 = promote_linear_i2_block_to_i8(data)
+        torch.testing.assert_close(r0, expected)
+
+    def test_promote_i6_block_to_i8(self):
+        # High 2 bit values: 0, 3, 1, 3, 1, 3, 0, 3
+        high = torch.tensor([[0xDC, 0xCD]], dtype=torch.uint8)
+        # Low 4 bit values:
+        # '0xb', '0xc', '0x2', '0x3', '0x1', '0x1', '0x6', '0x7'
+        low = torch.tensor([[0xCB, 0x32, 0x11, 0x76]], dtype=torch.uint8)
+        r0 = promote_linear_i6_block_to_i8(high, low)
+        r_debug = repr(debug_map_tensor_as_hex_string(r0))
+        self.assertEqual(
+            r_debug, "[['0xb', '0x3c', '0x12', '0x33', '0x11', '0x31', '0x6', '0x37']]"
+        )
+
 
 if __name__ == "__main__":
     unittest.main()
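The `test_promote_i2_block_to_i8` expectations pin down the packing order: each byte carries four 2-bit values, lowest bit pair first. As an illustration only (not the library's actual `promote_linear_i2_block_to_i8` implementation), a pure-PyTorch unpacking that reproduces the expected tensor could look like this:

```python
import torch


def promote_i2_block_to_i8_sketch(data: torch.Tensor) -> torch.Tensor:
    """Unpack four little-endian 2-bit values from every uint8 (illustrative sketch)."""
    shifts = torch.tensor([0, 2, 4, 6], device=data.device)
    expanded = (data.unsqueeze(-1) >> shifts) & 0x3  # [..., bytes, 4]
    return expanded.flatten(-2).to(torch.uint8)      # [..., bytes * 4]


data = torch.tensor([[0xC1, 0xB2, 0xA3, 0x94, 0x85]], dtype=torch.uint8)
print(promote_i2_block_to_i8_sketch(data))
# tensor([[1, 0, 0, 3, 2, 0, 3, 2, 3, 0, 2, 2, 0, 1, 1, 2, 1, 1, 0, 2]], dtype=torch.uint8)
```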

llm/turbine_llm/ops/custom_inference_ops.py

Lines changed: 24 additions & 3 deletions
@@ -18,13 +18,15 @@
     InferenceTensor,
     PrimitiveTensor,
     QuantizedTensor,
+    SuperBlockOffsetScaled_4_6_Layout,
     gguf_interop,
 )
 
 from .matmul import (
     mmtfp,
     mmt_block_scaled_offset_q4_unsigned,
     mmt_block_scaled_q8,
+    mmt_super_block_scaled_offset_q4_unsigned,
 )
 
 __all__ = [
@@ -59,7 +61,7 @@ def _matmul(
         return NotImplemented
 
     # Handle quantized tensor layout switched.
-    handler = _QMMT_DISPATCH.get(type(rhs))
+    handler = _QMMT_DISPATCH.get(rhs.layout_type)
     if handler is None:
         return NotImplemented
     return handler(lhs, rhs)
@@ -87,7 +89,26 @@ def _mmt_block_scaled_q4(lhs: torch.Tensor, rhs: QuantizedTensor[BlockScaledI4La
     )
 
 
+def _mmt_super_block_offset_scaled_4_6_q4(
+    lhs: torch.Tensor, rhs: QuantizedTensor[SuperBlockOffsetScaled_4_6_Layout]
+):
+    rhs_unpacked = rhs.unpack()
+    sb_scales_hi, sb_scales_low = rhs_unpacked.sb_scales_bit_packed
+    sb_mins_hi, sb_mins_low = rhs_unpacked.sb_mins_bit_packed
+    return mmt_super_block_scaled_offset_q4_unsigned(
+        lhs,
+        rhs_unpacked.d,
+        rhs_unpacked.dmin,
+        sb_scales_hi,
+        sb_scales_low,
+        sb_mins_hi,
+        sb_mins_low,
+        rhs_unpacked.qs_bit_packed,
+    )
+
+
 _QMMT_DISPATCH: dict[type, Callable] = {
-    gguf_interop.Q4_1: _mmt_block_scaled_q4,
-    gguf_interop.Q8_0: _mmt_block_scaled,
+    BlockScaledI4Layout: _mmt_block_scaled_q4,
+    BlockScaledLayout: _mmt_block_scaled,
+    SuperBlockOffsetScaled_4_6_Layout: _mmt_super_block_offset_scaled_4_6_q4,
 }
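The dispatch change above keys `_QMMT_DISPATCH` on a tensor's `layout_type` rather than its concrete class, so any storage format that unpacks to a given layout can share one handler. A toy, self-contained sketch of that pattern; every name below is a stand-in, not a turbine_llm API:

```python
from typing import Callable, Dict, Type

import torch


# Toy layouts: several on-disk formats could all report the same layout class
# and therefore share a single matmul handler.
class BlockScaledLayoutDemo: ...
class SuperBlockLayoutDemo: ...


class DemoQuantizedTensor:
    def __init__(self, layout_type: Type):
        self.layout_type = layout_type


def _mmt_block_scaled_demo(lhs, rhs):
    return f"block-scaled kernel, lhs shape {tuple(lhs.shape)}"


def _mmt_super_block_demo(lhs, rhs):
    return f"super-block kernel, lhs shape {tuple(lhs.shape)}"


_DEMO_DISPATCH: Dict[Type, Callable] = {
    BlockScaledLayoutDemo: _mmt_block_scaled_demo,
    SuperBlockLayoutDemo: _mmt_super_block_demo,
}


def demo_matmul(lhs: torch.Tensor, rhs: DemoQuantizedTensor):
    handler = _DEMO_DISPATCH.get(rhs.layout_type)  # key on layout, not tensor class
    if handler is None:
        return NotImplemented
    return handler(lhs, rhs)


print(demo_matmul(torch.rand(4, 16, 3200), DemoQuantizedTensor(SuperBlockLayoutDemo)))
```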

llm/turbine_llm/ops/matmul.py

Lines changed: 156 additions & 0 deletions
@@ -12,6 +12,7 @@
     "mmtfp",
     "mmt_block_scaled_offset_q4_unsigned",
     "mmt_block_scaled_q8",
+    "mmt_super_block_scaled_offset_q4_unsigned",
 ]
 
 
@@ -95,6 +96,161 @@ def generate(self, ksel: KernelSelection, kb: KernelBuilder):
         kb.yield_results(*call_function(target_function, *kb.arg_bindings))
 
 
+@CustomOp.register(library=LIBRARY)
+class mmt_super_block_scaled_offset_q4_unsigned(CustomOp):
+    """Super block scaled q4 matmul with transposed RHS.
+
+    Arguments:
+
+    * `a`: [B, M, K]
+    * `d`: [N, SUP_COUNT, 1]
+    * `dmin`: [N, SUP_COUNT, 1]
+    * `sb_scales_hi`: [N, SUP_COUNT, SUB_COUNT // 4]
+    * `sb_scales_low`: [N, SUP_COUNT, SUB_COUNT // 2]
+    * `sb_mins_hi`: [N, SUP_COUNT, SUB_COUNT // 4]
+    * `sb_mins_low`: [N, SUP_COUNT, SUB_COUNT // 2]
+    * `qs`: [N, SUP_COUNT, SUB_COUNT, BS // 2]
+
+    Where: `K == SUP_COUNT * SUB_COUNT * BS`
+
+    With the hi/lo halves combined into single values, the dequantization
+    formula is:
+
+    ```
+    d_scaled = (d * sb_scales).unsqueeze(-1)
+    dmin_scaled = (dmin * sb_mins).unsqueeze(-1)
+    return d_scaled * qs - dmin_scaled
+    ```
+    """
+
+    signature = (
+        "mmt_super_block_scaled_offset_q4_unsigned("
+        "Tensor a, Tensor d, Tensor dmin, "
+        "Tensor sb_scales_hi, Tensor sb_scales_low, "
+        "Tensor sb_mins_hi, Tensor sb_mins_low, "
+        "Tensor qs"
+        ") -> (Tensor)"
+    )
+
+    def select(self, ksel: KernelSelection):
+        a_desc = ksel.arg_tensor(0)
+        d_desc = ksel.arg_tensor(1)
+        dmin_desc = ksel.arg_tensor(2)
+        sb_scales_hi_desc = ksel.arg_tensor(3)
+        sb_scales_low_desc = ksel.arg_tensor(4)
+        sb_mins_hi_desc = ksel.arg_tensor(5)
+        sb_mins_low_desc = ksel.arg_tensor(6)
+        qs_desc = ksel.arg_tensor(7)
+
+        # a arg
+        *batch_dims, m, k = a_desc.t.shape
+        a_desc.specialize_dims(-1)
+        if not a_desc.t.dtype.is_floating_point:
+            raise ValueError(
+                f"mmt_super_block_scaled_offset_q4_unsigned arg 'a': Expected floating point (got {a_desc.t.dtype})"
+            )
+        if len(batch_dims) != 1:
+            raise ValueError(
+                f"mmt_super_block_scaled_offset_q4_unsigned arg 'a': Expected 3d tensor (got {a_desc.t.shape})"
+            )
+
+        # qs arg
+        n, sup_count, sub_count, bs_div2 = qs_desc.t.shape
+        qs_desc.specialize_all_dims()
+        bs = bs_div2 * 2
+        if k != (sup_count * sub_count * bs):
+            raise ValueError(
+                f"mmt_super_block_scaled_offset_q4_unsigned arg 'qs': Incorrect shape (got {qs_desc.t.shape}, k={k})"
+            )
+
+        # d arg
+        v_n, v_sup_count, one = d_desc.t.shape
+        d_desc.specialize_all_dims()
+        if v_n != n or v_sup_count != sup_count or one != 1:
+            raise ValueError(
+                f"mmt_super_block_scaled_offset_q4_unsigned arg 'd': Incorrect shape (got {d_desc.t.shape})"
+            )
+
+        # dmin arg
+        v_n, v_sup_count, one = dmin_desc.t.shape
+        dmin_desc.specialize_all_dims()
+        if v_n != n or v_sup_count != sup_count or one != 1:
+            raise ValueError(
+                f"mmt_super_block_scaled_offset_q4_unsigned arg 'dmin': Incorrect shape (got {dmin_desc.t.shape})"
+            )
+
+        # sb_scales_hi arg
+        v_n, v_sup_count, v_sub_div4 = sb_scales_hi_desc.t.shape
+        sb_scales_hi_desc.specialize_all_dims()
+        if v_n != n or v_sup_count != sup_count or v_sub_div4 != (sub_count // 4):
+            raise ValueError(
+                f"mmt_super_block_scaled_offset_q4_unsigned arg 'sb_scales_hi': Incorrect shape (got {sb_scales_hi_desc.t.shape})"
+            )
+
+        # sb_scales_low arg
+        v_n, v_sup_count, v_sub_div2 = sb_scales_low_desc.t.shape
+        sb_scales_low_desc.specialize_all_dims()
+        if v_n != n or v_sup_count != sup_count or v_sub_div2 != (sub_count // 2):
+            raise ValueError(
+                f"mmt_super_block_scaled_offset_q4_unsigned arg 'sb_scales_low': Incorrect shape (got {sb_scales_low_desc.t.shape})"
+            )
+
+        # sb_mins_hi arg
+        v_n, v_sup_count, v_sub_div4 = sb_mins_hi_desc.t.shape
+        sb_mins_hi_desc.specialize_all_dims()
+        if v_n != n or v_sup_count != sup_count or v_sub_div4 != (sub_count // 4):
+            raise ValueError(
+                f"mmt_super_block_scaled_offset_q4_unsigned arg 'sb_mins_hi': Incorrect shape (got {sb_mins_hi_desc.t.shape})"
+            )
+
+        # sb_mins_low arg
+        v_n, v_sup_count, v_sub_div2 = sb_mins_low_desc.t.shape
+        sb_mins_low_desc.specialize_all_dims()
+        if v_n != n or v_sup_count != sup_count or v_sub_div2 != (sub_count // 2):
+            raise ValueError(
+                f"mmt_super_block_scaled_offset_q4_unsigned arg 'sb_mins_low': Incorrect shape (got {sb_mins_low_desc.t.shape})"
+            )
+
+        # c return
+        c = torch.empty(batch_dims + [m, n], dtype=a_desc.t.dtype)
+        c_desc = ksel.return_tensor(c)  # Shape batch..., m, n
+        c_desc.specialize_dims(-1)
+
+    def generate(self, ksel: KernelSelection, kb: KernelBuilder):
+        a = kb.arg_value(0)
+        a_tensor_type = RankedTensorType(a.type)
+        *_, k = a_tensor_type.shape
+        d = kb.arg_value(1)
+        d_tensor_type = RankedTensorType(d.type)
+        qs = kb.arg_value(7)
+        qs_tensor_type = RankedTensorType(qs.type)
+        n, sup_count, sub_count, bs_div2 = qs_tensor_type.shape
+        bs = bs_div2 * 2
+        a_type_str = str(a_tensor_type.element_type)
+        scale_type_str = str(d_tensor_type.element_type)
+
+        template_file = "mmt_super_block_scaled_offset_q4_unsigned_3d.mlir"
+        target_function_name = f"mmt_super_block_scaled_offset_q4_unsigned_3d_{n}_{k}_{sup_count}_{sub_count}_{bs}_{a_type_str}"
+
+        target_function = inline_template_function(
+            kb,
+            template_file,
+            target_function_name,
+            n=n,
+            k=k,
+            sup_count=sup_count,
+            sub_count=sub_count,
+            sub_div4=sub_count // 4,
+            sub_div2=sub_count // 2,
+            bs=bs,
+            bs_div2=bs_div2,
+            a_type=a_type_str,
+            scale_type=scale_type_str,
+        )
+        kb.yield_results(*call_function(target_function, *kb.arg_bindings))
+        print(kb.module_body.owner)
+
+
 @CustomOp.register(library=LIBRARY)
 class mmt_block_scaled_q8(CustomOp):
     """Generic block scaled matmul with transposed RHS.

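The dequantization formula in the new op's docstring can be exercised in plain PyTorch once the bit-packed scales, mins, and q4 values have been promoted to integers (the real code gets these from the layout's `unpack()`; the promotion itself is omitted here). A hedged reference sketch with illustrative names, shapes taken from the docstring:

```python
import torch


def dequant_super_block_sketch(d, dmin, sb_scales, sb_mins, qs):
    """Reference for the formula in the op docstring.

    Assumed (already unpacked) shapes:
      d, dmin:            [N, SUP_COUNT, 1]
      sb_scales, sb_mins: [N, SUP_COUNT, SUB_COUNT]
      qs:                 [N, SUP_COUNT, SUB_COUNT, BS]
    """
    d_scaled = (d * sb_scales).unsqueeze(-1)      # [N, SUP_COUNT, SUB_COUNT, 1]
    dmin_scaled = (dmin * sb_mins).unsqueeze(-1)  # [N, SUP_COUNT, SUB_COUNT, 1]
    return d_scaled * qs - dmin_scaled            # [N, SUP_COUNT, SUB_COUNT, BS]


# Tiny end-to-end check of the mmt contract: K == SUP_COUNT * SUB_COUNT * BS
# and the RHS is transposed.
n, sup_count, sub_count, bs = 2560, 20, 8, 32
d = torch.rand(n, sup_count, 1, dtype=torch.float16)
dmin = torch.rand(n, sup_count, 1, dtype=torch.float16)
sb_scales = torch.randint(0, 64, (n, sup_count, sub_count))   # 6-bit values
sb_mins = torch.randint(0, 64, (n, sup_count, sub_count))
qs = torch.randint(0, 16, (n, sup_count, sub_count, bs))      # unsigned 4-bit values
weights = dequant_super_block_sketch(d, dmin, sb_scales, sb_mins, qs)
a = torch.rand(4, 16, sup_count * sub_count * bs, dtype=torch.float32)
ref = torch.matmul(a, weights.reshape(n, -1).t().to(torch.float32))  # [4, 16, N]
```

Whether this matches the kernel bit-for-bit depends on the exact unpacking and flattening order defined by `SuperBlockOffsetScaled_4_6_Layout` and the MLIR template, not on this sketch.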