Skip to content

Commit e5e0081

Browse files
authored
[FRONTEND] Improve errors for TMA desc failure (#8462)
Catch empty tensors and also print out all the arguments when there is an unexpected failure.
1 parent 5745035 commit e5e0081

File tree

4 files changed

+56
-4
lines changed

4 files changed

+56
-4
lines changed

python/test/unit/cuda/test_tma_descriptor.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ def test_1d_tma_descriptor_exception(M, BLOCK_M, expect_error):
2323
_ = TensorDescriptor.from_tensor(x, [BLOCK_M])
2424

2525

26-
@pytest.mark.parametrize("M, BLOCK_M, expect_error_m", [(128, 32, False), (125, 33, True)])
26+
@pytest.mark.parametrize("M, BLOCK_M, expect_error_m", [(128, 32, False), (125, 33, True), (0, 32, False)])
2727
@pytest.mark.parametrize("N, BLOCK_N, expect_error_n", [(128, 32, False), (128, 30, True), (127, 32, False)])
2828
def test_2d_tma_descriptor_exception(M, N, BLOCK_M, BLOCK_N, expect_error_n, expect_error_m):
2929
if not torch.cuda.is_available() or not torch.cuda.get_device_capability()[0] >= 9:
@@ -39,10 +39,14 @@ def test_2d_tma_descriptor_exception(M, N, BLOCK_M, BLOCK_N, expect_error_n, exp
3939

4040
shape_error = expect_error_n or expect_error_m
4141
error_alignment = (N % 16) != 0
42-
expect_error = shape_error or error_alignment
42+
zero_shape_error = M <= 0 or N <= 0
43+
expect_error = shape_error or error_alignment or zero_shape_error
4344

4445
exc_type = ValueError if shape_error else AssertionError
4546
match = "Shape element . must be a power of 2" if shape_error else "strides must be 16-byte aligned"
47+
if zero_shape_error and not shape_error and not error_alignment:
48+
match = "shape must be positive"
49+
exc_type = AssertionError
4650
ctx = pytest.raises(exc_type, match=match) if expect_error else nullcontext()
4751
with ctx:
4852
_ = TensorDescriptor.from_tensor(A, [BLOCK_M, BLOCK_N])

python/triton/experimental/gluon/nvidia/hopper.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,8 @@ def __post_init__(self):
2727
elem_bytes = get_primitive_bitwidth(dtype_str) // 8
2828
for stride in self.strides[:-1]:
2929
assert (stride * elem_bytes) % 16 == 0, "strides must be 16-byte aligned"
30+
for shape_dim in self.shape:
31+
assert shape_dim > 0, "shape must be positive"
3032
assert self.strides[-1] == 1, "Last dimension must be contiguous"
3133
assert isinstance(self.layout, NVMMASharedLayout), "Layout must be NVMMASharedLayout"
3234
assert self.padding == "zero" or self.padding == "nan", "Illegal value for padding"

python/triton/tools/tensor_descriptor.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,8 @@ def __post_init__(self):
2424
elem_bytes = self.base.dtype.itemsize
2525
for stride in self.strides[:-1]:
2626
assert (stride * elem_bytes) % 16 == 0, "strides must be 16-byte aligned"
27+
for shape_dim in self.shape:
28+
assert shape_dim > 0, "shape must be positive"
2729
assert self.strides[-1] == 1, "Last dimension must be contiguous"
2830
assert self.padding == "zero" or self.padding == "nan", "Illegal value for padding"
2931
if self.padding == "nan":

third_party/nvidia/backend/driver.c

Lines changed: 46 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
#include "cuda.h"
22
#include <dlfcn.h>
33
#include <stdbool.h>
4+
#include <stdio.h>
45
#include <stdlib.h>
56
#define PY_SSIZE_T_CLEAN
67
#include <Python.h>
@@ -420,10 +421,53 @@ static PyObject *fillTMADescriptor(PyObject *self, PyObject *args) {
420421
static cuTensorMapEncodeTiled_t cuTensorMapEncodeTiled = NULL;
421422
INITIALIZE_FUNCTION_POINTER_IF_NULL(cuTensorMapEncodeTiled,
422423
getCuTensorMapEncodeTiledHandle);
423-
CUDA_CHECK_AND_RETURN_NULL(cuTensorMapEncodeTiled(
424+
CUresult res = cuTensorMapEncodeTiled(
424425
&desc->tensorMap, elemType, rank, (void *)global_address, shapeInt,
425426
stridesLL, blockSizeInt, elementStrides, CU_TENSOR_MAP_INTERLEAVE_NONE,
426-
swizzle, CU_TENSOR_MAP_L2_PROMOTION_L2_128B, fill));
427+
swizzle, CU_TENSOR_MAP_L2_PROMOTION_L2_128B, fill);
428+
if (res != CUDA_SUCCESS) {
429+
const char *str;
430+
cuGetErrorString(res, &str);
431+
char err[4096] = {0};
432+
size_t off = 0;
433+
off += snprintf(
434+
err + off, sizeof(err) - off,
435+
"Triton Error [CUDA]: Failed to create tensor map descriptor: %s\n",
436+
str ? str : "Unknown error");
437+
off += snprintf(err + off, sizeof(err) - off,
438+
"elemType=%d rank=%d global_address=0x%llx elemSize=%d "
439+
"swizzle=%d padding=%d\n",
440+
elemType, rank, (unsigned long long)global_address,
441+
elemSize, swizzle, padding);
442+
off += snprintf(err + off, sizeof(err) - off, "shape=[");
443+
for (int i = 0; i < rank; ++i) {
444+
off +=
445+
snprintf(err + off, sizeof(err) - off, "%llu%s",
446+
(unsigned long long)shapeInt[i], (i + 1 < rank) ? ", " : "");
447+
}
448+
off += snprintf(err + off, sizeof(err) - off, "]\n");
449+
off += snprintf(err + off, sizeof(err) - off, "strides=[");
450+
for (int i = 0; i < rank; ++i) {
451+
off += snprintf(err + off, sizeof(err) - off, "%llu%s",
452+
(unsigned long long)stridesLL[i],
453+
(i + 1 < rank) ? ", " : "");
454+
}
455+
off += snprintf(err + off, sizeof(err) - off, "]\n");
456+
off += snprintf(err + off, sizeof(err) - off, "blockSize=[");
457+
for (int i = 0; i < rank; ++i) {
458+
off += snprintf(err + off, sizeof(err) - off, "%u%s",
459+
(unsigned)blockSizeInt[i], (i + 1 < rank) ? ", " : "");
460+
}
461+
off += snprintf(err + off, sizeof(err) - off, "] elementStrides=[");
462+
for (int i = 0; i < rank; ++i) {
463+
off += snprintf(err + off, sizeof(err) - off, "%u%s",
464+
(unsigned)elementStrides[i], (i + 1 < rank) ? ", " : "");
465+
}
466+
off += snprintf(err + off, sizeof(err) - off, "]\n");
467+
PyErr_SetString(PyExc_RuntimeError, err);
468+
469+
goto cleanup;
470+
}
427471

428472
return (PyObject *)desc;
429473

0 commit comments

Comments
 (0)