Commit 286e91f

Add implicit downcast in TMA descriptor store (#6236)
#### Description

This fixes a missing implicit downcast when storing blocks through TMA descriptors. Previously, widening the result of a descriptor load (e.g. from `float16` to `float32`) and then storing it back via the descriptor raised an MLIR verification error because the block types no longer matched:

```python
# ptr.element_ty is tl.float16
desc = tl._experimental_make_tensor_descriptor(ptr, shape=..., strides=..., block_shape=...)
value = desc.load([off_x, off_y]).to(tl.float32)
# 'tt.experimental_descriptor_store' op tensor desciptor block and tensor types must match
desc.store([off_x, off_y], value)
```

The pointer/`tl.store` path already casts values to the target element type; descriptor stores should behave the same.

#### Changes

* Updated `descriptor_store` in `python/triton/language/semantic.py` to cast the incoming tensor to the descriptor's element type before emitting the `create_descriptor_store` IR node.
* Added a regression test, `test_tensor_descriptor_store_downcast`, in `python/test/unit/language/test_tensor_descriptor.py`, which widens a `float16`/`bfloat16` block to `float32` and stores it back via the descriptor.
* Ran `pre-commit` hooks to keep formatting consistent.

A quick check under `TRITON_INTERPRET=1` shows the new downcast path works:

```
True # torch.equal(a, out) when storing a widened float16 block
True # bfloat16 as well
```

#### Checklist

- [x] I am not making a trivial change, such as fixing a typo in a comment.
- [x] I have written a PR description following these [rules](https://cbea.ms/git-commit/#why-not-how).
- [x] I have run `pre-commit run --from-ref origin/main --to-ref HEAD`.
- [x] I have added tests under `python/test`.
- [x] I have not added any `lit` tests.

---

This fix aligns descriptor stores with pointer-store semantics and avoids an IR verifier failure when the stored block's element type is wider than the descriptor's element type.

Co-authored-by: Thomas Raoux <[email protected]>
1 parent ba3ec66 commit 286e91f

File tree

2 files changed: +29 −0 lines changed

python/test/unit/language/test_tensor_descriptor.py

Lines changed: 27 additions & 0 deletions
```diff
@@ -1671,3 +1671,30 @@ def test_host_tensor_descriptor_matmul(num_stages, num_ctas, BLOCK_M, BLOCK_N, B
     # Only a subset of TMEM and stmatrix layout pairs are compatible, for example 16x256bx2 and m8n8x4.
     assert "stmatrix.sync.aligned.m8n8.x4.shared.b16" in kernel.asm[
         "ptx"] or "stmatrix.sync.aligned.x4.m8n8.shared.b16" in kernel.asm["ptx"]
+
+
+@pytest.mark.interpreter
+@pytest.mark.parametrize("dtype_str", ["float16", "bfloat16"])
+def test_tensor_descriptor_store_downcast(dtype_str, device):
+
+    @triton.jit
+    def kernel(desc, M, N, M_BLOCK: tl.constexpr, N_BLOCK: tl.constexpr):
+        moffset = tl.program_id(axis=0) * M_BLOCK
+        noffset = tl.program_id(axis=1) * N_BLOCK
+        midx = moffset + tl.arange(0, M_BLOCK)[:, None]
+        nidx = noffset + tl.arange(0, N_BLOCK)[None, :]
+        val_f32 = (midx * N + nidx).to(tl.float32)
+        # implicit downcast in the store.
+        desc.store([moffset, noffset], val_f32)
+
+    M, N = 32, 128
+    torch_dtype = getattr(torch, dtype_str)
+    M_BLOCK = 8
+    N_BLOCK = 32
+    grid_m = M // M_BLOCK
+    grid_n = N // N_BLOCK
+    out = torch.empty((M, N), dtype=torch_dtype, device=device)
+    desc = TensorDescriptor(out, out.shape, out.stride(), [M_BLOCK, N_BLOCK])
+    kernel[(grid_m, grid_n)](desc, M, N, M_BLOCK=M_BLOCK, N_BLOCK=N_BLOCK)
+    ref = torch.arange(M * N, dtype=torch.float32, device=device).reshape(M, N).to(torch_dtype)
+    torch.testing.assert_close(out, ref)
```
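Each program instance in the test kernel fills its block with row-major linear indices (`midx * N + nidx`), so tiling the full grid reproduces `arange(M * N).reshape(M, N)`. A small NumPy sketch of that index arithmetic (NumPy stands in for the Triton ops here, purely for illustration):

```python
import numpy as np

# Same shapes as the regression test.
M, N, M_BLOCK, N_BLOCK = 32, 128, 8, 32

out = np.empty((M, N), dtype=np.float32)
for pid_m in range(M // M_BLOCK):
    for pid_n in range(N // N_BLOCK):
        # Mirrors tl.program_id(...) * BLOCK in the kernel.
        moffset = pid_m * M_BLOCK
        noffset = pid_n * N_BLOCK
        # Mirrors tl.arange with [:, None] / [None, :] broadcasting.
        midx = moffset + np.arange(M_BLOCK)[:, None]
        nidx = noffset + np.arange(N_BLOCK)[None, :]
        # Row-major linear index for this block.
        out[moffset:moffset + M_BLOCK, noffset:noffset + N_BLOCK] = midx * N + nidx

ref = np.arange(M * N, dtype=np.float32).reshape(M, N)
assert np.array_equal(out, ref)
```

This is why the test can compare against a plain `torch.arange` reference regardless of how the grid is tiled.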

python/triton/language/semantic.py

Lines changed: 2 additions & 0 deletions
```diff
@@ -1107,6 +1107,8 @@ def validate_store_like(self, desc: tl.tensor_descriptor_base, value: TensorTy,
 
 def descriptor_store(self, desc: tl.tensor_descriptor_base, value: TensorTy, offsets) -> TensorTy:
     self.validate_store_like(desc, value, offsets)
+    # implicitly cast to the descriptor's type
+    value = self.cast(value, desc.dtype)
     offsets = self._convert_to_ir_values(offsets, require_i64=False)
     return self.tensor(self.builder.create_descriptor_store(desc.handle, value.handle, offsets), tl.void)
```
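A pure-Python model of the cast-before-store behavior this hunk adds. `ToyDescriptor` is a hypothetical stand-in (not Triton's actual API), with NumPy playing the role of the IR-level tensors:

```python
import numpy as np

class ToyDescriptor:
    """Illustrative model of a TMA-style descriptor over a fixed-dtype buffer."""

    def __init__(self, buffer, block_shape):
        self.buffer = buffer            # backing tensor; its dtype is the descriptor's element type
        self.block_shape = block_shape  # (M_BLOCK, N_BLOCK)

    def store(self, offsets, block):
        m, n = offsets
        bm, bn = self.block_shape
        # The fix: implicitly cast the incoming block to the descriptor's
        # element type before storing, instead of failing on a type mismatch.
        block = block.astype(self.buffer.dtype)
        self.buffer[m:m + bm, n:n + bn] = block

out = np.zeros((4, 8), dtype=np.float16)
desc = ToyDescriptor(out, (2, 4))
widened = np.arange(8, dtype=np.float32).reshape(2, 4)  # a float32 block
desc.store((0, 0), widened)  # float32 -> float16 downcast happens implicitly
```

This mirrors how the pointer/`tl.store` path already behaved: the store site, not the caller, is responsible for converging on the destination's element type.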
