Skip to content

Commit 6d5fb9f

Browse files
authored
[Gluon][Blackwell] Add _reinterpret to tmem descriptor and fix its lowering for TMEM (#7160)
1 parent 0e868f8 commit 6d5fb9f

File tree

8 files changed

+71
-9
lines changed

8 files changed

+71
-9
lines changed

include/triton/Dialect/TritonGPU/IR/TritonGPUOps.td

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -298,6 +298,7 @@ def TTG_MemDescReinterpretOp : TTG_Op<"memdesc_reinterpret", [Pure, MemDescViewT
298298
}];
299299

300300
let hasVerifier = 1;
301+
let hasFolder = 1;
301302
}
302303

303304
def TTG_LocalLoadOp : TTG_Op<"local_load"> {

lib/Dialect/TritonGPU/IR/Ops.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -495,6 +495,12 @@ LogicalResult MemDescReinterpretOp::verify() {
495495
return success();
496496
}
497497

498+
/// Fold an identity reinterpret: when the result type is exactly the source
/// type, the cast is a no-op and the source value can be forwarded as-is.
OpFoldResult MemDescReinterpretOp::fold(FoldAdaptor adaptor) {
  if (getSrc().getType() != getType())
    return {};
  return getSrc();
}
503+
498504
// LocalAllocOp
499505
void LocalAllocOp::getEffects(
500506
SmallVectorImpl<SideEffects::EffectInstance<MemoryEffects::Effect>>

python/triton/experimental/gluon/language/nvidia/blackwell/__init__.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -157,6 +157,16 @@ def index(self, index, _semantic: GluonSemantic = None) -> tensor_memory_descrip
157157
ret.handle = builder.create_memdesc_subview(ret.type.to_ir(builder), self.handle, offsets)
158158
return ret
159159

160+
@builtin
def _reinterpret(self, dtype, shape, layout, _semantic: GluonSemantic = None) -> tensor_memory_descriptor:
    """Create a view of this TMEM descriptor with a new element type, shape,
    and layout. Only the descriptor type changes; no data is moved.
    """
    elem_ty = _unwrap_if_constexpr(dtype)
    view_shape = [_unwrap_if_constexpr(dim) for dim in shape]
    view_layout = _unwrap_if_constexpr(layout)

    builder = _semantic.builder
    # The reinterpreted view's allocation shape matches its logical shape.
    view_ty = tensor_memory_descriptor_type(elem_ty, view_shape, view_layout, view_shape)
    handle = builder.create_memdesc_reinterpret(view_ty.to_ir(builder), self.handle)
    return tensor_memory_descriptor(handle, **view_ty.__dict__)
160170

161171
@builtin
162172
def allocate_tensor_memory(element_ty, shape, layout, value=None, _semantic=None):

test/Conversion/tritongpu_to_llvm_blackwell.mlir

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -591,3 +591,18 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32} {
591591
tt.return
592592
}
593593
}
594+
595+
// -----

#tmem = #ttng.tensor_memory_encoding<blockM = 128, blockN = 128, unpacked = true>

module attributes {"ttg.num-warps" = 4 : i32} {

// CHECK-LABEL: @reinterpret
tt.func private @reinterpret(%arg0: !ttg.memdesc<128xf32, #tmem, #ttng.tensor_memory>) -> !ttg.memdesc<16x16xf16, #tmem, #ttng.tensor_memory> {
  %0 = ttg.memdesc_reinterpret %arg0 : !ttg.memdesc<128xf32, #tmem, #ttng.tensor_memory> -> !ttg.memdesc<16x16xf16, #tmem, #ttng.tensor_memory>
  // CHECK-NEXT: return %arg0
  tt.return %0 : !ttg.memdesc<16x16xf16, #tmem, #ttng.tensor_memory>
}

}
Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,23 @@
11
// RUN: triton-opt %s -canonicalize | FileCheck %s
22

3-
// CHECK-LABEL: @test_dce_tmem_alloc
4-
// CHECK-NOT: ttng.tmem_alloc
5-
// CHECK: tt.return
63
#linear = #ttg.linear<{register = [[0, 1], [0, 2], [32, 0], [64, 0]], lane = [[1, 0], [2, 0], [4, 0], [8, 0], [16, 0]], warp = [[0, 0], [0, 0]], block = []}>
74
#tmem_scales = #ttng.tensor_memory_scales_encoding<>
5+
#tmem = #ttng.tensor_memory_encoding<blockM = 128, blockN = 128, unpacked = true>
86
module attributes {"ttg.num-warps" = 8 : i32, "ttg.num-ctas" = 1 : i32, "ttg.target" = "cuda:80"} {
7+
8+
// CHECK-LABEL: @test_dce_tmem_alloc
99
tt.func @test_dce_tmem_alloc(%arg: tensor<128x4xi8, #linear>) {
10-
%a = ttng.tmem_alloc %arg : (tensor<128x4xi8, #linear>) -> !ttg.memdesc<128x4xi8, #tmem_scales, #ttng.tensor_memory>
11-
tt.return
10+
// CHECK-NOT: ttng.tmem_alloc
11+
%a = ttng.tmem_alloc %arg : (tensor<128x4xi8, #linear>) -> !ttg.memdesc<128x4xi8, #tmem_scales, #ttng.tensor_memory>
12+
// CHECK-NEXT: tt.return
13+
tt.return
1214
}
15+
16+
// CHECK-LABEL: @reinterpret_fold
17+
tt.func @reinterpret_fold(%arg0: !ttg.memdesc<128xf32, #tmem, #ttng.tensor_memory>) -> !ttg.memdesc<128xf32, #tmem, #ttng.tensor_memory> {
18+
%0 = ttg.memdesc_reinterpret %arg0 : !ttg.memdesc<128xf32, #tmem, #ttng.tensor_memory> -> !ttg.memdesc<128xf32, #tmem, #ttng.tensor_memory>
19+
// CHECK-NEXT: return %arg0
20+
tt.return %0 : !ttg.memdesc<128xf32, #tmem, #ttng.tensor_memory>
21+
}
22+
1323
} // end module

third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,4 +27,5 @@ add_triton_library(TritonNVIDIAGPUToLLVM
2727
LINK_LIBS PUBLIC
2828
TritonGPUToLLVM
2929
TritonProtonToLLVM
30+
MLIRReconcileUnrealizedCasts
3031
)

third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/ConvertWarpSpecializeToLLVM.cpp

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,12 @@
11
#include "TargetInfo.h"
22
#include "TritonNVIDIAGPUToLLVM/PTXAsmFormat.h"
33
#include "mlir/Analysis/TopologicalSortUtils.h"
4+
#include "mlir/Conversion/Passes.h"
45
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
56
#include "mlir/Dialect/LLVMIR/NVVMDialect.h"
67
#include "mlir/IR/BuiltinOps.h"
78
#include "mlir/IR/ImplicitLocOpBuilder.h"
9+
#include "mlir/Pass/PassManager.h"
810
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
911
#include "triton/Conversion/TritonGPUToLLVM/Passes.h"
1012
#include "triton/Conversion/TritonGPUToLLVM/TypeConverter.h"
@@ -561,10 +563,9 @@ struct ConvertWarpSpecializeToLLVM
561563
if (isa<WarpSpecializeOp, WarpSpecializePartitionsOp, WarpYieldOp>(op))
562564
convertOpTypes(op, typeConverter);
563565
});
564-
RewritePatternSet patterns(&getContext());
565-
UnrealizedConversionCastOp::getCanonicalizationPatterns(patterns,
566-
&getContext());
567-
if (failed(applyPatternsGreedily(mod, std::move(patterns))))
566+
OpPassManager pm;
567+
pm.addPass(createReconcileUnrealizedCastsPass());
568+
if (failed(runPipeline(pm, mod)))
568569
return signalPassFailure();
569570

570571
SmallVector<LLVM::LLVMFuncOp> kernels;

third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/TensorMemoryToLLVM.cpp

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -884,6 +884,23 @@ struct MemDescSubviewOpConversion
884884
}
885885
};
886886

887+
class MemDescReinterpretOpConversion
888+
: public ConvertOpToLLVMPattern<MemDescReinterpretOp> {
889+
public:
890+
using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern;
891+
892+
LogicalResult
893+
matchAndRewrite(MemDescReinterpretOp op, OpAdaptor adaptor,
894+
ConversionPatternRewriter &rewriter) const override {
895+
if (!isa<triton::nvidia_gpu::TensorMemoryEncodingAttr>(
896+
op.getSrc().getType().getEncoding())) {
897+
return failure();
898+
}
899+
rewriter.replaceOp(op, adaptor.getSrc());
900+
return success();
901+
}
902+
};
903+
887904
struct TMEMSubSliceOpConversion
888905
: public ConvertOpToLLVMPattern<triton::nvidia_gpu::TMEMSubSliceOp> {
889906
using ConvertOpToLLVMPattern<
@@ -937,5 +954,6 @@ void mlir::triton::NVIDIA::populateTensorMemorySubviewOpToLLVMPattern(
937954
LLVMTypeConverter &typeConverter, RewritePatternSet &patterns,
938955
PatternBenefit benefit) {
939956
patterns.add<MemDescSubviewOpConversion>(typeConverter, benefit);
957+
patterns.add<MemDescReinterpretOpConversion>(typeConverter, benefit);
940958
return;
941959
}

0 commit comments

Comments
 (0)