[FRONTEND] Fix floating-point argument passing (#7439)

ShawnZhong · web-flow · commit 1748b81063da · 2025-07-10T17:00:02.000-07:00
Fix #6176

```python
@triton.jit
def kernel(ptr, val: tl.float16):
    tl.store(ptr, val)

ptr = torch.tensor([0.0], device="cuda:0")
kernel[1,](ptr, 42.0)
print(ptr)
# Expected: tensor([42.], device='cuda:0')
# Actual: tensor([0.], device='cuda:0')
```

The issue is caused by naively passing a Python float to a Triton kernel that accepts `tl.float16`.

Before this PR, the conversion chain for a float input looked like the following:

```
         PyArg_ParseTuple          incorrectly passed to
PyFloat ================> float ----------!!!----------> kernel that accepts tl.float16
```

This PR makes `PyArg_ParseTuple` always parse a Python float into a C double, and then calls [`PyFloat_Pack{2,4,8}`](https://docs.python.org/3/c-api/float.html#pack-functions) to convert it to its proper storage type:

```
          PyArg_ParseTuple           PyFloat_Pack{2,4,8}                        passed to
PyFloat ==================> double ====================> uint{16,32,64}_t -------------> kernel that accepts tl.float{16,32,64}
```

The generated code snippet looks something like this (for the AMD backend):

```c
double _arg1;
PyArg_ParseTuple(args, "piiiKKOOOOOd", ..., &_arg1);
uint16_t _arg1_storage = 0;
PyFloat_Pack2(_arg1, (void*)&_arg1_storage, 1);
_launch(gridX, gridY, gridZ, ..., _arg1_storage);
```

- [x] Fix AMD backend
- [x] Fix NVIDIA backend
- [x] Add tests
diff --git a/python/test/unit/language/test_annotations.py b/python/test/unit/language/test_annotations.py
@@ -3,6 +3,7 @@
 import triton
 import triton.language as tl
 import pytest
+import numpy as np
 
 
 def annotated_function(return_type=None, **arg_types):
@@ -49,3 +50,36 @@ def _kernel(X: torch.Tensor, N: int, BLOCK_SIZE: tl.constexpr):
         _kernel[(1, )](x.shape[0], x.shape[0], 32)
     except AttributeError:
         pass
+
+
+# Test float annotations are properly respected
+@pytest.mark.parametrize(
+    ("dtype", "test_val"),
+    [(dtype, test_val)
+     for dtype in [tl.float16, tl.bfloat16, tl.float32, tl.float64]
+     for test_val in [0.0, 42.0, float("inf"), float("nan")]],
+)
+def test_float_annotation(device, dtype, test_val):
+
+    @triton.jit
+    @annotated_function(val=dtype)
+    def _kernel(ptr, val):
+        tl.static_assert(val.dtype == dtype)
+        tl.store(ptr, val)
+
+    ptr = torch.empty(1, device=device, dtype=torch.float32)
+    h = _kernel[(1, )](ptr, test_val)
+    np.testing.assert_allclose(ptr.cpu().numpy(), [test_val], atol=1e-6)
+
+    # Check that the type is properly emitted in the IR
+    if dtype == tl.float16:
+        assert "%arg1: f16" in h.asm["ttir"]
+        assert "arith.extf %arg1 : f16 to f32" in h.asm["ttir"]
+    elif dtype == tl.bfloat16:
+        assert "%arg1: bf16" in h.asm["ttir"]
+        assert "arith.extf %arg1 : bf16 to f32" in h.asm["ttir"]
+    elif dtype == tl.float32:
+        assert "%arg1: f32" in h.asm["ttir"]
+    elif dtype == tl.float64:
+        assert "%arg1: f64" in h.asm["ttir"]
+        assert "arith.truncf %arg1 : f64 to f32" in h.asm["ttir"]
diff --git a/third_party/amd/backend/driver.py b/third_party/amd/backend/driver.py
@@ -163,14 +163,29 @@ def ty_to_cpp(ty):
         "u16": "uint16_t",
         "u32": "uint32_t",
         "u64": "uint64_t",
-        "fp16": "float",
-        "bf16": "float",
-        "fp32": "float",
-        "f32": "float",
+        "fp16": "double",
+        "bf16": "double",
+        "fp32": "double",
+        "f32": "double",
         "fp64": "double",
     }[ty]
 
 
+FLOAT_STORAGE_TYPE = {
+    "fp16": "uint16_t",
+    "bf16": "uint16_t",
+    "fp32": "uint32_t",
+    "f32": "uint32_t",
+    "fp64": "uint64_t",
+}
+FLOAT_PACK_FUNCTION = {
+    "fp16": "pack_fp16",
+    "bf16": "pack_bf16",
+    "fp32": "pack_fp32",
+    "f32": "pack_fp32",
+    "fp64": "pack_fp64",
+}
+
 _BASE_ARGS_FORMAT = "piiiKKOOOO"
 
 
@@ -226,7 +241,6 @@ def format_of(ty):
         if ty == "constexpr":
             return "O"
         return {
-            "float": "f",
             "double": "d",
             "long": "l",
             "int8_t": "b",
@@ -249,13 +263,30 @@ def format_of(ty):
     args_list = ', ' + ', '.join(f"&_arg{i}" for i, ty in signature.items()) if len(signature) > 0 else ''
     # Record the end of regular arguments;
     # subsequent arguments are architecture-specific descriptors, such as tensor descriptors for CUDA.
-    arg_decls = ', '.join(f"{ty_to_cpp(ty)} arg{i}" for i, ty in signature.items() if ty != "constexpr")
+    arg_decl_list = []
+    for i, ty in signature.items():
+        if ty == "constexpr":
+            continue
+        if ty in FLOAT_STORAGE_TYPE:
+            arg_decl_list.append(f"{FLOAT_STORAGE_TYPE[ty]} arg{i}")
+        else:
+            arg_decl_list.append(f"{ty_to_cpp(ty)} arg{i}")
+    arg_decls = ', '.join(arg_decl_list)
     internal_args_list = []
     for i, ty in signature.items():
         if ty[0] == "*":
             internal_args_list.append(f"ptr_info{i}.dev_ptr")
+        elif ty in FLOAT_STORAGE_TYPE:
+            internal_args_list.append(f"_arg{i}_storage")
         elif ty != "constexpr":
             internal_args_list.append(f"_arg{i}")
+
+    float_storage_decls = [
+        f"{FLOAT_STORAGE_TYPE[ty]} _arg{i}_storage = {FLOAT_PACK_FUNCTION[ty]}(_arg{i});"
+        for i, ty in signature.items()
+        if ty in FLOAT_STORAGE_TYPE
+    ]
+
     libhip_path = _get_path_to_hip_runtime_dylib()
 
     # generate glue code
@@ -309,9 +340,6 @@ def format_of(ty):
 bool initSymbolTable() {{
   // Use the HIP runtime library loaded into the existing process if it exits.
   void *lib = dlopen("libamdhip64.so", RTLD_NOLOAD);
-  if (lib) {{
-    // printf("[triton] chosen loaded libamdhip64.so in the process\\n");
-  }}
 
   // Otherwise, go through the list of search paths to dlopen the first HIP
   // driver library.
@@ -321,7 +349,6 @@ def format_of(ty):
       void *handle = dlopen(hipLibSearchPaths[i], RTLD_LAZY | RTLD_LOCAL);
       if (handle) {{
         lib = handle;
-        // printf("[triton] chosen %s\\n", hipLibSearchPaths[i]);
       }}
     }}
   }}
@@ -382,7 +409,6 @@ def format_of(ty):
 #define HIP_CHECK(ans) {{ gpuAssert((ans), __FILE__, __LINE__); }}
 
 static void _launch(int gridX, int gridY, int gridZ, int num_warps, int num_ctas, int launch_cooperative_grid, int clusterDimX, int clusterDimY, int clusterDimZ, int shared_memory, hipStream_t stream, hipFunction_t function{', ' + arg_decls if len(arg_decls) > 0 else ''}) {{
-  // printf("_launch hip kernel\\n");
   hipDeviceptr_t global_scratch = 0;
   void *params[] = {{ {', '.join(params)} }};
   if (gridX*gridY*gridZ > 0 && launch_cooperative_grid) {{
@@ -440,8 +466,33 @@ def format_of(ty):
   return ptr_info;
 }}
 
+// Convert a C double to IEEE 754 binary16 and return the raw 16 bits.
+// PyFloat_Pack2 returns -1 with a Python exception set when the conversion
+// fails (e.g. the value is too large for binary16). Its return value is not
+// inspected here, so `result` is zero-initialized to guarantee a defined
+// return value in that case; the pending exception is left for the caller.
+// NOTE(review): le=1 requests little-endian output; storing those bytes in a
+// uint16_t presumably assumes a little-endian host - confirm.
+static uint16_t pack_fp16(double f) {{
+    uint16_t result = 0;
+    // from https://github.com/python/pythoncapi-compat
+#if 0x030600B1 <= PY_VERSION_HEX && PY_VERSION_HEX <= 0x030B00A1 && !defined(PYPY_VERSION)
+    _PyFloat_Pack2(f, (void*)&result, 1);
+#else
+    PyFloat_Pack2(f, (void*)&result, 1);
+#endif
+    return result;
+}}
+
+// Return the raw bfloat16 bits for f: narrow to float, then keep the upper
+// 16 bits of the binary32 encoding.
+// NOTE(review): dropping the low 16 bits truncates rather than rounding to
+// nearest; confirm this is the intended rounding mode.
+static uint16_t pack_bf16(double f) {{
+    float f32 = (float)f;
+    uint32_t u32;
+    // memcpy instead of *(uint32_t*)&f32: reading a float object through a
+    // uint32_t lvalue violates C strict-aliasing rules (undefined behavior).
+    memcpy(&u32, &f32, sizeof(u32));
+    return (uint16_t)(u32 >> 16);
+}}
+
+// Return the raw binary32 bits of f after narrowing it to float.
+static uint32_t pack_fp32(double f) {{
+    float f32 = (float)f;
+    uint32_t bits;
+    // memcpy instead of *(uint32_t*)&f32: reading a float object through a
+    // uint32_t lvalue violates C strict-aliasing rules (undefined behavior).
+    memcpy(&bits, &f32, sizeof(bits));
+    return bits;
+}}
+
+// Return the raw binary64 bits of f.
+static uint64_t pack_fp64(double f) {{
+    uint64_t bits;
+    // memcpy instead of *(uint64_t*)&f: reading a double object through a
+    // uint64_t lvalue violates C strict-aliasing rules (undefined behavior).
+    memcpy(&bits, &f, sizeof(bits));
+    return bits;
+}}
+
 static PyObject* launch(PyObject* self, PyObject* args) {{
-   // printf("launch\\n");
   int gridX, gridY, gridZ;
   uint64_t _stream;
   uint64_t _function;
@@ -458,6 +509,8 @@ def format_of(ty):
     return NULL;
   }}
 
+  {' '.join(float_storage_decls)}
+
   // extract kernel metadata
   int num_warps, num_ctas, shared_memory, clusterDimX, clusterDimY, clusterDimZ;
   if (!PyArg_ParseTuple(kernel_metadata, \"iiiiii\", &num_warps, &num_ctas, &shared_memory, &clusterDimX, &clusterDimY, &clusterDimZ)) {{
diff --git a/third_party/nvidia/backend/driver.py b/third_party/nvidia/backend/driver.py
@@ -94,15 +94,30 @@ def ty_to_cpp(ty):
         "u16": "uint16_t",
         "u32": "uint32_t",
         "u64": "uint64_t",
-        "fp16": "float",
-        "bf16": "float",
-        "fp32": "float",
-        "f32": "float",
+        "fp16": "double",
+        "bf16": "double",
+        "fp32": "double",
+        "f32": "double",
         "fp64": "double",
         "nvTmaDesc": "CUtensorMap",
     }[ty]
 
 
+FLOAT_STORAGE_TYPE = {
+    "fp16": "uint16_t",
+    "bf16": "uint16_t",
+    "fp32": "uint32_t",
+    "f32": "uint32_t",
+    "fp64": "uint64_t",
+}
+FLOAT_PACK_FUNCTION = {
+    "fp16": "pack_fp16",
+    "bf16": "pack_bf16",
+    "fp32": "pack_fp32",
+    "f32": "pack_fp32",
+    "fp64": "pack_fp64",
+}
+
 _BASE_ARGS_FORMAT = "iiiKKppOOOOO"
 
 
@@ -175,7 +190,6 @@ def format_of(ty):
         if ty.startswith("tensordesc"):
             return "O"
         return {
-            "float": "f",
             "double": "d",
             "long": "l",
             "int8_t": "b",
@@ -201,11 +215,21 @@ def format_of(ty):
     args_list = ', ' + ', '.join(f"&_arg{i}" for i, ty in signature.items()) if len(signature) > 0 else ''
     # Record the end of regular arguments;
     # subsequent arguments are architecture-specific descriptors, such as tensor descriptors for CUDA.
-    arg_decls = ', '.join(f"{ty_to_cpp(ty)} arg{i}" for i, ty in signature.items() if ty != "constexpr")
+    arg_decl_list = []
+    for i, ty in signature.items():
+        if ty == "constexpr":
+            continue
+        if ty in FLOAT_STORAGE_TYPE:
+            arg_decl_list.append(f"{FLOAT_STORAGE_TYPE[ty]} arg{i}")
+        else:
+            arg_decl_list.append(f"{ty_to_cpp(ty)} arg{i}")
+    arg_decls = ', '.join(arg_decl_list)
     internal_args_list = []
     for i, ty in signature.items():
         if ty[0] == "*":
             internal_args_list.append(f"ptr_info{i}.dev_ptr")
+        elif ty in FLOAT_STORAGE_TYPE:
+            internal_args_list.append(f"_arg{i}_storage")
         elif ty == "nvTmaDesc":
             # Note: we have to dereference the pointer
             internal_args_list.append(f"*tma_ptr{i}")
@@ -224,6 +248,11 @@ def format_of(ty):
         f"CUtensorMap* tma_ptr{i} = getTmaDesc(_arg{i}); if (!tma_ptr{i}) return NULL;" for i, ty in signature.items()
         if ty == "nvTmaDesc"
     ]
+    float_storage_decls = [
+        f"{FLOAT_STORAGE_TYPE[ty]} _arg{i}_storage = {FLOAT_PACK_FUNCTION[ty]}(_arg{i});"
+        for i, ty in signature.items()
+        if ty in FLOAT_STORAGE_TYPE
+    ]
     params = [f"&arg{i}" for i, ty in signature.items() if ty != "constexpr"]
     params.append("&global_scratch")
     src = f"""
@@ -442,6 +471,32 @@ def format_of(ty):
   }}
 }}
 
+// Convert a C double to IEEE 754 binary16 and return the raw 16 bits.
+// PyFloat_Pack2 returns -1 with a Python exception set when the conversion
+// fails (e.g. the value is too large for binary16). Its return value is not
+// inspected here, so `result` is zero-initialized to guarantee a defined
+// return value in that case; the pending exception is left for the caller.
+// NOTE(review): le=1 requests little-endian output; storing those bytes in a
+// uint16_t presumably assumes a little-endian host - confirm.
+static uint16_t pack_fp16(double f) {{
+    uint16_t result = 0;
+    // from https://github.com/python/pythoncapi-compat
+#if 0x030600B1 <= PY_VERSION_HEX && PY_VERSION_HEX <= 0x030B00A1 && !defined(PYPY_VERSION)
+    _PyFloat_Pack2(f, (void*)&result, 1);
+#else
+    PyFloat_Pack2(f, (void*)&result, 1);
+#endif
+    return result;
+}}
+
+// Return the raw bfloat16 bits for f: narrow to float, then keep the upper
+// 16 bits of the binary32 encoding.
+// NOTE(review): dropping the low 16 bits truncates rather than rounding to
+// nearest; confirm this is the intended rounding mode.
+static uint16_t pack_bf16(double f) {{
+    float f32 = (float)f;
+    uint32_t u32;
+    // memcpy instead of *(uint32_t*)&f32: reading a float object through a
+    // uint32_t lvalue violates C strict-aliasing rules (undefined behavior).
+    memcpy(&u32, &f32, sizeof(u32));
+    return (uint16_t)(u32 >> 16);
+}}
+
+// Return the raw binary32 bits of f after narrowing it to float.
+static uint32_t pack_fp32(double f) {{
+    float f32 = (float)f;
+    uint32_t bits;
+    // memcpy instead of *(uint32_t*)&f32: reading a float object through a
+    // uint32_t lvalue violates C strict-aliasing rules (undefined behavior).
+    memcpy(&bits, &f32, sizeof(bits));
+    return bits;
+}}
+
+// Return the raw binary64 bits of f.
+static uint64_t pack_fp64(double f) {{
+    uint64_t bits;
+    // memcpy instead of *(uint64_t*)&f: reading a double object through a
+    // uint64_t lvalue violates C strict-aliasing rules (undefined behavior).
+    memcpy(&bits, &f, sizeof(bits));
+    return bits;
+}}
+
 static PyObject* launch(PyObject* self, PyObject* args) {{
   // ensure cuda context is valid before calling any CUDA APIs, e.g. before getPointer calls cuPointerGetAttributes
   ensureCudaContext();
@@ -492,6 +547,7 @@ def format_of(ty):
   // raise exception asap
   {newline.join(ptr_decls)}
   {newline.join(tma_decls)}
+  {newline.join(float_storage_decls)}
   Py_BEGIN_ALLOW_THREADS;
   _launch(gridX, gridY, gridZ, num_warps, num_ctas, launch_cooperative_grid, launch_pdl, clusterDimX, clusterDimY, clusterDimZ, shared_memory, (CUstream)_stream, (CUfunction)_function, global_scratch{', ' + ', '.join(internal_args_list) if len(internal_args_list) > 0 else ''});
   Py_END_ALLOW_THREADS;