Commit 7871be2

[Fix] fallback in specialize even for native types (#8122)
- should address a regression introduced in #7771
- adds a unit test which ideally covers all supported specializations, to spot other edge cases sooner (currently not exhaustive for dtypes and descriptor layouts)
1 parent 6fa1dd6 commit 7871be2
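
For context, a minimal Python sketch of why a trailing fallback helps; this is an illustration, not the C++ implementation. CPython's PyLong_Check and PyFloat_Check accept subclasses of int and float, so running them last catches values such as numpy.float64 that more specific dispatch earlier in specialize_arg may not match. The handle_long_type / handle_float_type helpers here are simplified stand-ins; the real handlers also take the backend and the specialization flags, as the diff below shows.

# Illustrative sketch only; not the Triton C++ code.
import numpy


def handle_long_type(arg):
    # Simplified width buckets, mirroring the reference implementation in the
    # new unit test: i32 when the value fits in 32 bits, i64 otherwise.
    return "i32" if -(2**31) <= arg <= 2**31 - 1 else "i64"


def handle_float_type(arg):
    return "fp32"


def specialize_sketch(arg):
    # ... bool / tensor / tuple / descriptor handling elided ...
    # Fallback for default types: isinstance() plays the role of
    # PyLong_Check / PyFloat_Check, which also accept subclasses.
    if isinstance(arg, int):
        return handle_long_type(arg)
    if isinstance(arg, float):
        return handle_float_type(arg)
    return None


assert specialize_sketch(numpy.float64(1.0)) == "fp32"  # float subclass
assert specialize_sketch(2**40) == "i64"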

2 files changed, +181 -0 lines

python/src/specialize.cc

Lines changed: 8 additions & 0 deletions
@@ -506,6 +506,14 @@ std::pair<py::object, py::object> specialize_arg(PyObject *backend,
     return handle_tensor(backend, arg, is_const, specialize_value, align);
   }
 
+  // fallback for default types
+  if (PyLong_Check(arg)) {
+    return handle_long_type(backend, arg, is_const, specialize_value, align);
+  }
+  if (PyFloat_Check(arg)) {
+    return handle_float_type(backend, arg, is_const, specialize_value, align);
+  }
+
   return {};
 }
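
For reference, the (type, key) pairs that the reference implementation in the new unit test (below) assigns to a few plain Python values; a hedged summary for specialize_value=False, where no backend key is computed.

# Derived from reference_specialize_impl in the new test below, with
# specialize_value=False (align unused on these paths), so every key is None.
expected_native_specializations = [
    (None, ("constexpr", None)),  # None always becomes a constexpr
    (True, ("u1", None)),         # bool maps to u1
    (1.0, ("fp32", None)),        # Python floats map to fp32
    (17, ("i32", None)),          # fits in 32 bits
    (2**40, ("i64", None)),       # beyond i32, still signed 64-bit
    (2**63, ("u64", None)),       # only representable as unsigned 64-bit
]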

Lines changed: 173 additions & 0 deletions
@@ -0,0 +1,173 @@
import numpy
import pytest
import torch
from collections import namedtuple
from triton._C.libtriton import native_specialize_impl
from triton.runtime.jit import MockTensor, JITCallable
from triton._utils import canonicalize_dtype
from triton.backends.nvidia.compiler import CUDABackend
from triton.backends.amd.compiler import HIPBackend
from triton.language import constexpr
from triton.tools.tensor_descriptor import TensorDescriptor
from triton.experimental.gluon.nvidia.hopper import TensorDescriptor as GluonTensorDescriptor
from triton.experimental.gluon.language._layouts import NVMMASharedLayout


def mock_tensor_from_tensor(tensor):
    return MockTensor(tensor.dtype, tensor.shape)


class MockJITCallable(JITCallable):

    def __init__(self):
        pass

    def cache_key(self):
        return "mock_jit_callable"


class MockFloat(float):

    def __new__(cls, value):
        return super().__new__(cls, value)


class MockInt(int):

    def __new__(cls, value):
        return super().__new__(cls, value)


def reference_specialize_impl(backend, arg, is_const, specialize_value, align):
    if arg is None:
        return ("constexpr", None)
    elif isinstance(arg, bool):
        return ("u1", None)
    elif isinstance(arg, int):
        key = backend.get_int_specialization(arg, align=align) if specialize_value else None
        if arg == 1 and specialize_value:
            return ("constexpr", 1)
        elif -(2**31) <= arg and arg <= 2**31 - 1:
            return ("i32", key)
        elif 2**63 <= arg and arg <= 2**64 - 1:
            return ("u64", key)
        else:
            return ("i64", key)
    elif isinstance(arg, float):
        return ("fp32", None)
    elif hasattr(arg, "data_ptr"):
        dsk = (arg.dtype, is_const)
        res = ("*k" if dsk[1] else "*") + canonicalize_dtype(dsk[0])
        key = backend.get_tensor_specialization(arg, align=align) if specialize_value else None
        return (res, key)
    elif isinstance(arg, JITCallable):
        return ("constexpr", arg.cache_key)
    elif isinstance(arg, constexpr):
        return ("constexpr", arg)
    elif isinstance(arg, tuple):
        spec = [reference_specialize_impl(backend, x, False, True, True) for x in arg]
        make_tuple = lambda vals: type(arg)(*vals) if hasattr(arg, "_fields") else tuple(vals)
        tys = make_tuple([x[0] for x in spec])
        keys = make_tuple([x[1] for x in spec])
        return (tys, keys)
    elif isinstance(arg, TensorDescriptor):
        assert hasattr(arg.base, "data_ptr")
        inner = canonicalize_dtype(arg.base.dtype)
        return (f"tensordesc<{inner}{list(arg.block_shape)}>", None)
    elif isinstance(arg, GluonTensorDescriptor):
        assert hasattr(arg.base, "data_ptr")
        inner = canonicalize_dtype(arg.base.dtype)
        return (f"tensordesc<{inner}{list(arg.block_shape)},{arg.layout!r}>", None)
    else:
        raise TypeError("Unsupported type: %s" % type(arg))


def native_inputs_to_specialize():
    return [
        1.0,
        None,
        False,
        True,
        1,
        0,
        -1,
        16,
        17,
        2**31 - 1,
        2**31,
        -(2**31) - 1,
        2**63 - 1,
        2**63,
        2**63 + 1,
        2**64 - 1,
    ]


def derived_inputs_to_specialize():
    return [
        constexpr(1),
        constexpr(False),
        constexpr(1.0),
        numpy.float64(1.0),
        MockFloat(1.0),
        MockInt(1),
        MockJITCallable(),
    ]


def tuples_to_specialize():
    return [
        (1, 1),
        (False, True),
        namedtuple('strides', ['x', 'y'])(1, 1),
        namedtuple('flags', ['x', 'y'])(False, True),
    ]


def tensors_to_specialize():
    return [
        torch.empty(shape, dtype=dtype, device="cpu")
        for shape in [(1, ), (1, 1), (16, ), (16, 16), (128, ), (128, 128)]
        for dtype in [torch.float64, torch.float32, torch.float16, torch.bfloat16, torch.int32, torch.int64]
    ]


def tensordescriptors_to_specialize():
    return [
        TensorDescriptor.from_tensor(tensor, block_shape=tensor.shape)
        for tensor in tensors_to_specialize()
        if tensor.shape[-1] % 16 == 0
    ]


def gluon_tensordescriptors_to_specialize():
    return [
        GluonTensorDescriptor.from_tensor(
            tensor,
            block_shape=tensor.shape,
            layout=NVMMASharedLayout(0, tensor.dtype.itemsize * 8, len(tensor.shape)),
        ) for tensor in tensors_to_specialize() if tensor.shape[-1] % 16 == 0
    ]


def mock_tensors_to_specialize():
    return [mock_tensor_from_tensor(tensor) for tensor in tensors_to_specialize()]


@pytest.mark.parametrize("input_generator", [
    native_inputs_to_specialize,
    tuples_to_specialize,
    tensors_to_specialize,
    tensordescriptors_to_specialize,
    gluon_tensordescriptors_to_specialize,
    mock_tensors_to_specialize,
])
@pytest.mark.parametrize("backend", [CUDABackend, HIPBackend])
@pytest.mark.parametrize("is_const", [True, False])
@pytest.mark.parametrize("specialize_value", [True, False])
@pytest.mark.parametrize("align", [True, False])
def test_specialize_impl(input_generator, backend, is_const, specialize_value, align):
    for arg in input_generator():
        result = native_specialize_impl(backend, arg, is_const, specialize_value, align)
        expected = reference_specialize_impl(backend, arg, is_const, specialize_value, align)
        assert result == expected
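
A minimal usage sketch mirroring the call in test_specialize_impl above; it assumes a local Triton build with the NVIDIA backend importable and, as in the test, passes the backend class itself rather than an instance.

# Sketch only; mirrors the call pattern used by test_specialize_impl.
from triton._C.libtriton import native_specialize_impl
from triton.backends.nvidia.compiler import CUDABackend

ty, key = native_specialize_impl(CUDABackend, 2**63, False, False, False)
# Per reference_specialize_impl, 2**63 only fits in an unsigned 64-bit slot,
# so ty == "u64"; with specialize_value=False no backend key is computed.
assert (ty, key) == ("u64", None)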
