
Commit 3a832d6

[BACKEND] Fix memdesc of pointers (#8515)
Prevent a crash when lowering a memdesc of pointers.
1 parent 4d6ce4e commit 3a832d6
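
The crash came from querying the bit width of element types that are neither integer nor float, such as !tt.ptr<f16>: Type::getIntOrFloatBitWidth() (also reached via getElementTypeBitWidth()) asserts on such types. The fix adds a pointer-aware helper, getIntOrFloatOrPtrBitWidth(), which treats LLVM-dialect and Triton pointer types as 64-bit, routes the affected call sites through it, and makes SplatOp lowering use ptrtoint instead of bitcast when the splatted constant is a pointer.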

13 files changed (+57 −19 lines)

include/triton/Dialect/Triton/IR/Utility.h

Lines changed: 10 additions & 0 deletions
@@ -1,6 +1,8 @@
 #ifndef TRITON_IR_UTILITY_H_
 #define TRITON_IR_UTILITY_H_
 
+#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
+#include "mlir/IR/BuiltinTypes.h"
 #include "triton/Dialect/Triton/IR/Dialect.h"
 #include <algorithm>
 #include <numeric>
@@ -10,6 +12,14 @@ namespace mlir {
 // Bitwidth of pointers
 constexpr int kPtrBitWidth = 64;
 
+// Returns the bit width of a type, treating pointer-like types as 64-bit.
+// This handles LLVM dialect pointer types.
+inline int getIntOrFloatOrPtrBitWidth(Type type) {
+  if (isa<LLVM::LLVMPointerType, triton::PointerType>(type))
+    return kPtrBitWidth;
+  return type.getIntOrFloatBitWidth();
+}
+
 template <typename T, typename U> SmallVector<T> convertType(ArrayRef<U> in) {
   SmallVector<T> out;
   for (const auto &i : in)
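
For reference, a minimal sketch (not part of the commit) of how the new helper behaves; the context and concrete types below are illustrative only:

// Illustrative only: assumes an MLIRContext *ctx is in scope.
Type f16Ty = Float16Type::get(ctx);
Type ptrTy = triton::PointerType::get(f16Ty, /*addressSpace=*/1);
int wScalar = getIntOrFloatOrPtrBitWidth(f16Ty); // 16, falls through to getIntOrFloatBitWidth()
int wPtr = getIntOrFloatOrPtrBitWidth(ptrTy);    // 64, i.e. kPtrBitWidth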

lib/Analysis/Allocation.cpp

Lines changed: 2 additions & 1 deletion
@@ -152,7 +152,8 @@ class AllocationAnalysis {
       auto shapePerCTA = gpu::getAllocationShapePerCTA(allocType);
       numElems = product<int64_t>(shapePerCTA);
     }
-    int64_t bytes = numElems * allocType.getElementTypeBitWidth() / 8;
+    int64_t bytes =
+        numElems * getIntOrFloatOrPtrBitWidth(allocType.getElementType()) / 8;
 
     auto alignment = alloc.getAlignmentOrDefault();
     allocation->addBuffer<BufferT::BufferKind::Explicit>(alloc, bytes,
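
As a worked check against the new test case added below: a 32x16 memdesc of !tt.ptr<f16> holds 512 elements and pointers count as 64-bit, so bytes = 512 * 64 / 8 = 4096; the 1x16x16 memdesc gives 256 * 64 / 8 = 2048, and the two live buffers add up to the expected total of 6144. The replaced expression went through allocType.getElementTypeBitWidth(), which only accepts integer or float element types.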

lib/Conversion/TritonGPUToLLVM/ConvertLayoutOpToLLVM.cpp

Lines changed: 1 addition & 2 deletions
@@ -271,8 +271,7 @@ struct ConvertLayoutOpConversion
     StringAttr kReg = str_attr("register");
     StringAttr kLane = str_attr("lane");
     auto elemTy = getTypeConverter()->convertType(srcTy.getElementType());
-    int bitwidth =
-        elemTy.isIntOrFloat() ? elemTy.getIntOrFloatBitWidth() : kPtrBitWidth;
+    int bitwidth = getIntOrFloatOrPtrBitWidth(elemTy);
 
     auto factors = getWarpLayoutConvertDecomposition(srcTy, dstTy, bitwidth);
     auto &[pReg, pLane, mixedTranspositions, nPack] = factors;

lib/Conversion/TritonGPUToLLVM/ElementwiseOpToLLVM.cpp

Lines changed: 1 addition & 1 deletion
@@ -276,7 +276,7 @@ struct ElementwiseInlineAsmOpConversion
     auto ty = getTypeConverter()->convertType(getElementType(result));
 
     // Pack return elements into 32-bits.
-    unsigned bitWidth = ty.isIntOrFloat() ? ty.getIntOrFloatBitWidth() : 64;
+    unsigned bitWidth = getIntOrFloatOrPtrBitWidth(ty);
     unsigned numElemsPerReg =
         std::min(std::max(32 / bitWidth, 1u), op.getPackedElement());
     assert(op.getPackedElement() % numElemsPerReg == 0);
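
With 64-bit pointer elements, 32 / bitWidth evaluates to 0 and the std::max clamps numElemsPerReg to 1, so pointer results are simply not packed and the assert holds trivially.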

lib/Conversion/TritonGPUToLLVM/Utility.cpp

Lines changed: 2 additions & 2 deletions
@@ -540,7 +540,7 @@ SmallVector<Value> lowerLdSt(
   auto kLane = str_attr("lane");
   auto kWarp = str_attr("warp");
   auto kOffset = str_attr("offset");
-  auto bitwidth = llvmElemTy.getIntOrFloatBitWidth();
+  auto bitwidth = getIntOrFloatOrPtrBitWidth(llvmElemTy);
 
   auto [elemsPerVec, permutation] =
       largestVectorisation(ctx, cvt, bitwidth, maybeMaxVecElems);
@@ -625,7 +625,7 @@ lowerLocalLdSt(Location loc, MLIRContext *ctx,
   assert(*cvt.getOutDimNames().begin() == str_attr("offset"));
   auto calcPaddedOffset = [&](Value smemOffset) {
     TritonLLVMOpBuilder b(loc, rewriter);
-    auto bitwidth = llvmElemTy.getIntOrFloatBitWidth();
+    auto bitwidth = getIntOrFloatOrPtrBitWidth(llvmElemTy);
     if (auto paddedEnc = dyn_cast<triton::gpu::PaddedSharedEncodingAttr>(
             srcTy.getEncoding())) {
       // Apply the offset needed for padding.

lib/Conversion/TritonGPUToLLVM/ViewOpToLLVM.cpp

Lines changed: 12 additions & 2 deletions
@@ -11,6 +11,16 @@ using namespace mlir::triton;
 using namespace mlir::triton::gpu;
 using ::mlir::LLVM::getSharedMemoryObjectFromStruct;
 namespace {
+
+Value bitOrPtrCast(Value val, Type type, TritonLLVMOpBuilder &b) {
+  if (isa<LLVM::LLVMPointerType>(val.getType()) &&
+      !isa<LLVM::LLVMPointerType>(type)) {
+    return b.ptrtoint(type, val);
+  } else {
+    return b.bitcast(val, type);
+  }
+}
+
 struct SplatOpConversion : public ConvertOpToLLVMPattern<triton::SplatOp> {
   using ConvertOpToLLVMPattern<triton::SplatOp>::ConvertOpToLLVMPattern;
   // Convert SplatOp or arith::ConstantOp with SplatElementsAttr to a
@@ -39,13 +49,13 @@ struct SplatOpConversion : public ConvertOpToLLVMPattern<triton::SplatOp> {
       unsigned ratio = srcBitWidth / cstBitWidth;
       Type intTy = IntegerType::get(elemType.getContext(), cstBitWidth);
       VectorType vecType = VectorType::get(ratio, intTy);
-      Value intCst = b.bitcast(constVal, intTy);
+      Value intCst = bitOrPtrCast(constVal, intTy, b);
       Value vec = b.undef(vecType);
       for (unsigned i = 0; i < ratio; ++i)
        vec = b.insert_element(vecType, vec, intCst, b.int_val(32, i));
       constVal = vec;
     }
-    auto llSrc = b.bitcast(constVal, srcType);
+    Value llSrc = bitOrPtrCast(constVal, srcType, b);
     size_t elemsPerThread = getTotalElemsPerThread(tensorTy);
     llvm::SmallVector<Value> elems(elemsPerThread, llSrc);
     return packLLElements(loc, typeConverter, elems, rewriter, resType);
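
The bitOrPtrCast helper is needed because the LLVM dialect, like LLVM IR, does not allow a bitcast between pointer and integer types; converting a pointer value to an integer requires ptrtoint. Splatting a pointer constant therefore goes through ptrtoint before the value is broadcast, while every other element type keeps the original bitcast path.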

test/Analysis/test-allocation.mlir

Lines changed: 14 additions & 2 deletions
@@ -155,6 +155,18 @@ tt.func @preallocate(%A : !tt.ptr<f16>) {
   tt.return
 }
 
+// expected-remark @below {{memdesc_ptr}}
+// expected-remark @below {{size = 6144}}
+tt.func @memdesc_ptr() {
+  // expected-remark @below {{offset = 0, size = 4096}}
+  %a0 = ttg.local_alloc : () -> !ttg.memdesc<32x16x!tt.ptr<f16>, #A_SHARED, #ttg.shared_memory, mutable>
+  // expected-remark @below {{offset = 4096, size = 2048}}
+  %a1 = ttg.local_alloc : () -> !ttg.memdesc<1x16x16x!tt.ptr<f16>, #A_SHARED, #ttg.shared_memory, mutable>
+  ttg.local_dealloc %a0 : !ttg.memdesc<32x16x!tt.ptr<f16>, #A_SHARED, #ttg.shared_memory, mutable>
+  ttg.local_dealloc %a1 : !ttg.memdesc<1x16x16x!tt.ptr<f16>, #A_SHARED, #ttg.shared_memory, mutable>
+  tt.return
+}
+
 // Unused tensors are immediately released
 // expected-remark @below {{unused}}
 // expected-remark @below {{size = 1024}}
@@ -279,9 +291,9 @@ tt.func @multi_color_multi_rounds(%arg0: !tt.ptr<f16>) {
 }
 
 
-// expected-remark @below {{alloc}}
+// expected-remark @below {{alloc_ptr}}
 // expected-remark @below {{size = 512}}
-tt.func @alloc(%A : !tt.ptr<f16>) {
+tt.func @alloc_ptr(%A : !tt.ptr<f16>) {
   // expected-remark @below {{offset = 0, size = 512}}
   %cst0 = ttg.local_alloc : () -> !ttg.memdesc<16x16xf16, #A_SHARED, #ttg.shared_memory, mutable>
   %cst1 = arith.constant dense<0.000000e+00> : tensor<16x32xf16, #AL>

third_party/amd/lib/TritonAMDGPUTransforms/Utility.cpp

Lines changed: 2 additions & 1 deletion
@@ -1,6 +1,7 @@
 #include "Utility.h"
 
 #include "mlir/Dialect/SCF/IR/SCF.h"
+#include "triton/Dialect/Triton/IR/Utility.h"
 #include "triton/Tools/LayoutUtils.h"
 
 #include <limits>
@@ -159,7 +160,7 @@ ttg::PaddedSharedEncodingAttr composePaddedLayoutForAsyncCopyCDNA4(
     return {};
   }
 
-  unsigned bitWidth = srcTy.getElementType().getIntOrFloatBitWidth();
+  unsigned bitWidth = getIntOrFloatOrPtrBitWidth(srcTy.getElementType());
  unsigned elemByteWidth = std::max(bitWidth / 8u, 1u);
  auto loadBytes = shape[0] * shape[1] * elemByteWidth;
  if (loadBytes < 16384) {

third_party/nvidia/lib/Dialect/NVWS/Transforms/InsertAref.cpp

Lines changed: 1 addition & 1 deletion
@@ -145,7 +145,7 @@ int getTxCount(Operation *descOp) {
   auto encoding = getEncodingFromDescriptor(descOp, tensorType, desc);
   auto shapePerCTA = getShapePerCTA(encoding, tensorType.getShape());
   return product(shapePerCTA) *
-         tensorType.getElementType().getIntOrFloatBitWidth() / 8;
+         getIntOrFloatOrPtrBitWidth(tensorType.getElementType()) / 8;
 }
 
 void createNVWSDescriptorLoadOp(OpBuilder &builder, Operation *ttDescLoadOp,

third_party/nvidia/lib/NVGPUToLLVM/NVGPUToLLVMPass.cpp

Lines changed: 2 additions & 1 deletion
@@ -256,7 +256,8 @@ class LoadAcquireOpPattern : public OpRewritePattern<ttn::LoadAcquireOp> {
     auto loc = op->getLoc();
     auto b = TritonLLVMOpBuilder(loc, rewriter);
     Type valueTy = op.getType();
-    const unsigned valueNBits = std::max(8u, valueTy.getIntOrFloatBitWidth());
+    const unsigned valueNBits =
+        std::max(8u, (unsigned)getIntOrFloatOrPtrBitWidth(valueTy));
     const size_t maxWordWidth = std::max<size_t>(32, valueNBits);
     const size_t width = std::min((size_t)valueNBits, maxWordWidth);
