Merge commit 'f7f5b3af26d283348e78f3593b337a39267f7ff9'

whitneywhtsang · whitneywhtsang · commit e625b79d5b48 · 2025-02-28T00:03:52.000Z
diff --git a/include/triton/Conversion/TritonGPUToLLVM/TargetInfoBase.h b/include/triton/Conversion/TritonGPUToLLVM/TargetInfoBase.h
@@ -89,6 +89,8 @@ class TargetInfoBase {
 
   virtual int getSharedAddressSpace() const = 0;
 
+  virtual int getAddressSpace(Attribute addressSpace) const = 0;
+
   virtual bool supportVectorizedAtomics() const = 0;
 
   // Helper used by targets to annotate store operations during lowering to
diff --git a/lib/Conversion/TritonGPUToLLVM/TypeConverter.cpp b/lib/Conversion/TritonGPUToLLVM/TypeConverter.cpp
@@ -54,8 +54,8 @@ Type TritonGPUToLLVMTypeConverter::convertMemDescType(
     MemDescType type, const TargetInfoBase &targetInfo) {
   auto ctx = type.getContext();
   // base ptr
-  auto ptrType =
-      LLVM::LLVMPointerType::get(ctx, targetInfo.getSharedAddressSpace());
+  auto ptrType = LLVM::LLVMPointerType::get(
+      ctx, targetInfo.getAddressSpace(type.getMemorySpace()));
 
   if (isa<triton::nvidia_gpu::TensorMemoryEncodingAttr,
           triton::nvidia_gpu::TensorMemoryScalesEncodingAttr>(
diff --git a/lib/Dialect/Triton/IR/Ops.cpp b/lib/Dialect/Triton/IR/Ops.cpp
@@ -882,7 +882,7 @@ void MakeTensorPtrOp::build(OpBuilder &builder, OperationState &state,
   auto tensorType = RankedTensorType::get(
       SmallVector<int64_t>(tensorShape.begin(), tensorShape.end()),
       pointerType.getPointeeType());
-  auto result = PointerType::get(tensorType, 1);
+  auto result = PointerType::get(tensorType, pointerType.getAddressSpace());
 
   return build(builder, state, result, base, shape, strides, offsets,
                builder.getDenseI32ArrayAttr(order));
diff --git a/lib/Dialect/TritonGPU/IR/Dialect.cpp b/lib/Dialect/TritonGPU/IR/Dialect.cpp
@@ -153,6 +153,20 @@ SmallVector<unsigned> getShapePerCTATile(Attribute layout) {
 }
 
 bool isExpensiveView(Type srcType, Type dstType) {
+  auto tensorSrcType = cast<RankedTensorType>(srcType);
+  auto tensorDstType = cast<RankedTensorType>(dstType);
+  auto llSrc =
+      toLinearLayout(tensorSrcType.getShape(), tensorSrcType.getEncoding());
+  auto llDst =
+      toLinearLayout(tensorDstType.getShape(), tensorDstType.getEncoding());
+  // In case there are replicated values, we need to make sure the new and old
+  // layouts have matching masks.
+  for (auto [srcMask, dstMask] :
+       llvm::zip(llSrc.getFreeVariableMasks(), llDst.getFreeVariableMasks())) {
+    assert(srcMask.first == dstMask.first);
+    if (srcMask.second != dstMask.second)
+      return true;
+  }
   return getTotalElemsPerThread(srcType) != getTotalElemsPerThread(dstType);
 }
 
diff --git a/test/TritonGPU/canonicalize.mlir b/test/TritonGPU/canonicalize.mlir
@@ -40,6 +40,25 @@ tt.func @test_canonicalize_convert_expensive_view(%arg0: tensor<256x16xf32, #blo
 
 // -----
 
+// test that the convert doesn't get combined with the view if the resulting operation
+// is an expensive view which would require moving data across threads.
+// CHECK-LABEL: @test_canonicalize_convert_expensive_view
+// CHECK-SAME: (%[[ARG:.+]]: tensor<2xf32
+//       CHECK:   %[[C:.+]] = ttg.convert_layout %[[ARG]]
+//       CHECK:   %[[V:.+]] = tt.reshape %[[C]] allow_reorder
+//       CHECK:   tt.return %[[V]]
+#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [8, 4], warpsPerCTA = [4, 1], order = [1, 0]}>
+#blocked1 = #ttg.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}>
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:80"} {
+  tt.func @test_canonicalize_convert_expensive_view2(%arg0: tensor<2xf32, #ttg.slice<{dim = 1, parent = #blocked}>>) -> tensor<2xf32, #blocked1> {
+    %c = ttg.convert_layout %arg0 : tensor<2xf32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<2xf32, #blocked1>
+    %r = tt.reshape %c allow_reorder : tensor<2xf32, #blocked1> -> tensor<2xf32, #blocked1>
+    tt.return %r : tensor<2xf32, #blocked1>
+  }
+}
+
+// -----
+
 // test that the convert does get combined with the view even if the resulting operation
 // is an efficient view.
 // CHECK-LABEL: @test_canonicalize_convert_view
diff --git a/third_party/amd/lib/TritonAMDGPUToLLVM/TargetInfo.cpp b/third_party/amd/lib/TritonAMDGPUToLLVM/TargetInfo.cpp
@@ -460,6 +460,16 @@ Value TargetInfo::getStackPointer(RewriterBase &rewriter,
   return rewriter.create<LLVM::AddressOfOp>(funcOp.getLoc(), globalBase);
 }
 
+int TargetInfo::getAddressSpace(Attribute addressSpace) const {
+  int spaceId = 0;
+  if (isa<triton::gpu::SharedMemorySpaceAttr>(addressSpace)) {
+    spaceId = 3;
+  } else {
+    llvm::report_fatal_error("Only support SharedMemorySpace for now");
+  }
+  return spaceId;
+}
+
 bool TargetInfo::supportVectorizedAtomics() const {
   // Note: not currently tested or used, but AMD generally supports vectorized
   // atomics.
diff --git a/third_party/amd/lib/TritonAMDGPUToLLVM/TargetInfo.h b/third_party/amd/lib/TritonAMDGPUToLLVM/TargetInfo.h
@@ -65,8 +65,11 @@ class TargetInfo : public mlir::triton::TargetInfoBase {
 
   void assertFail(RewriterBase &rewriter, Location loc, StringRef message,
                   StringRef file, StringRef func, int line) const override;
+
   int getSharedAddressSpace() const override;
 
+  int getAddressSpace(Attribute addressSpace) const override;
+
   bool supportVectorizedAtomics() const override;
 
   void storeOpAnnotation(triton::gpu::LocalStoreOp op, size_t localStoreOpCount,
diff --git a/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/TargetInfo.cpp b/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/TargetInfo.cpp
@@ -630,6 +630,18 @@ Value TargetInfo::getStackPointer(RewriterBase &rewriter,
   return rewriter.create<LLVM::AddressOfOp>(funcOp.getLoc(), globalBase);
 }
 
+int TargetInfo::getAddressSpace(Attribute addressSpace) const {
+  int spaceId = 0;
+  if (isa<triton::gpu::SharedMemorySpaceAttr,
+          triton::nvidia_gpu::TensorMemorySpaceAttr>(addressSpace)) {
+    spaceId = 3;
+  } else {
+    llvm::report_fatal_error(
+        "Only support SharedMemorySpace, TensorMemorySpace for now");
+  }
+  return spaceId;
+}
+
 bool TargetInfo::supportVectorizedAtomics() const {
   return computeCapability >= 90 && ptxVersion >= 81;
 }
diff --git a/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/TargetInfo.h b/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/TargetInfo.h
@@ -58,11 +58,14 @@ class TargetInfo : public mlir::triton::TargetInfoBase {
 
   void assertFail(RewriterBase &rewriter, Location loc, StringRef message,
                   StringRef file, StringRef func, int line) const override;
+
   int getSharedAddressSpace() const override;
 
   Value getStackPointer(RewriterBase &rewriter,
                         FunctionOpInterface funcOp) const override;
 
+  int getAddressSpace(Attribute addressSpace) const override;
+
   bool supportVectorizedAtomics() const override;
 
   int getPtxVersion() const { return ptxVersion; }