Merge OpenAI Triton commit 3a93d6f (#5266)

whitneywhtsang · web-flow · commit 3b3a787bca25 · 2025-10-07T17:18:41.000-04:00
This PR changes the Triton base from bea27e3 to 3a93d6f (Oct 1). Pass rate: 94.2%
diff --git a/.github/workflows/llvm-build.yml b/.github/workflows/llvm-build.yml
@@ -106,7 +106,7 @@ jobs:
         -DLLVM_BUILD_UTILS=ON
         -DLLVM_BUILD_TOOLS=ON
         -DLLVM_ENABLE_ASSERTIONS=ON
-        -DMLIR_ENABLE_BINDINGS_PYTHON=ON
+        -DMLIR_ENABLE_BINDINGS_PYTHON=OFF
         -DLLVM_ENABLE_PROJECTS="mlir;lld"
         -DLLVM_INSTALL_UTILS=ON
         -DLLVM_TARGETS_TO_BUILD="host;NVPTX;AMDGPU"
@@ -130,7 +130,7 @@ jobs:
         -DLLVM_BUILD_UTILS=ON
         -DLLVM_BUILD_TOOLS=ON
         -DLLVM_ENABLE_ASSERTIONS=ON
-        -DMLIR_ENABLE_BINDINGS_PYTHON=ON
+        -DMLIR_ENABLE_BINDINGS_PYTHON=OFF
         -DLLVM_ENABLE_PROJECTS="mlir;llvm;lld"
         -DLLVM_ENABLE_DIA_SDK=OFF
         -DLLVM_INSTALL_UTILS=ON
@@ -179,7 +179,7 @@ jobs:
         -DCLANG_TABLEGEN=$HOST_TOOLS/clang-tblgen \
         -DLLVM_ENABLE_ASSERTIONS=ON \
         -DCMAKE_LINKER=$LINKER \
-        -DMLIR_ENABLE_BINDINGS_PYTHON=ON \
+        -DMLIR_ENABLE_BINDINGS_PYTHON=OFF \
         -DLLVM_ENABLE_ZSTD=OFF \
         -DLLVM_ABI_BREAKING_CHECKS=FORCE_OFF \
         -DLLVM_INSTALL_UTILS=ON \
@@ -202,12 +202,6 @@ jobs:
         -DLLVM_ENABLE_TERMINFO=OFF \
         llvm-project/llvm
         ninja -C llvm-project/build install
-        CURR_PWD="$(pwd)"
-        cd "${{ env.llvm_install_dir }}/python_packages/mlir_core/mlir/_mlir_libs/"
-        for file in *x86_64*; do
-          mv "$file" "${file/x86_64/aarch64}"
-        done
-        cd $CURR_PWD
         tar czf "${{ env.llvm_install_dir }}.tar.gz" "${{ env.llvm_install_dir }}"
 
     - name: Configure, Build, and Install LLVM (macOS arm64)
@@ -225,7 +219,7 @@ jobs:
         -DLLVM_BUILD_UTILS=ON
         -DLLVM_BUILD_TOOLS=ON
         -DLLVM_ENABLE_ASSERTIONS=ON
-        -DMLIR_ENABLE_BINDINGS_PYTHON=ON
+        -DMLIR_ENABLE_BINDINGS_PYTHON=OFF
         -DLLVM_ENABLE_PROJECTS="mlir;lld"
         -DLLVM_ENABLE_ZSTD=OFF
         -DLLVM_INSTALL_UTILS=ON
diff --git a/.github/workflows/llvm-build/almalinux.Dockerfile b/.github/workflows/llvm-build/almalinux.Dockerfile
@@ -29,7 +29,7 @@ RUN cmake -GNinja -Bbuild \
   -DLLVM_BUILD_UTILS=ON \
   -DLLVM_BUILD_TOOLS=ON \
   -DLLVM_ENABLE_ASSERTIONS=ON \
-  -DMLIR_ENABLE_BINDINGS_PYTHON=ON \
+  -DMLIR_ENABLE_BINDINGS_PYTHON=OFF \
   -DLLVM_ENABLE_PROJECTS="mlir;lld" \
   -DLLVM_ENABLE_TERMINFO=OFF \
   -DLLVM_INSTALL_UTILS=ON \
diff --git a/.github/workflows/llvm-build/centos.Dockerfile b/.github/workflows/llvm-build/centos.Dockerfile
@@ -46,7 +46,7 @@ RUN cmake -GNinja -Bbuild \
   -DLLVM_BUILD_UTILS=ON \
   -DLLVM_BUILD_TOOLS=ON \
   -DLLVM_ENABLE_ASSERTIONS=ON \
-  -DMLIR_ENABLE_BINDINGS_PYTHON=ON \
+  -DMLIR_ENABLE_BINDINGS_PYTHON=OFF \
   -DLLVM_ENABLE_PROJECTS="mlir;lld" \
   -DLLVM_ENABLE_TERMINFO=OFF \
   -DLLVM_INSTALL_UTILS=ON \
diff --git a/cmake/llvm-hash.txt b/cmake/llvm-hash.txt
@@ -1 +1 @@
-064f02dac0c81c19350a74415b3245f42fed09dc
+f6ded0be897e2878612dd903f7e8bb85448269e5
diff --git a/scripts/build-llvm-project.sh b/scripts/build-llvm-project.sh
@@ -22,6 +22,7 @@ if [ -z "$CMAKE_ARGS" ]; then
               -DCMAKE_CXX_COMPILER=clang++
               -DLLVM_ENABLE_LLD=ON
               -DLLVM_OPTIMIZED_TABLEGEN=ON
+              -DMLIR_ENABLE_BINDINGS_PYTHON=OFF
               -DLLVM_TARGETS_TO_BUILD="$LLVM_TARGETS"
               -DCMAKE_EXPORT_COMPILE_COMMANDS=1
               -DLLVM_ENABLE_PROJECTS="$LLVM_PROJECTS"
diff --git a/test/Conversion/amd/wmma-v2-shortcut.mlir b/test/Conversion/amd/wmma-v2-shortcut.mlir
@@ -1,4 +1,4 @@
-// RUN: triton-opt %s --tritongpu-reduce-data-duplication --allocate-shared-memory --convert-triton-amdgpu-to-llvm=arch="gfx1200" -split-input-file | FileCheck %s
+// RUN: triton-opt %s --tritongpu-reduce-data-duplication --allocate-shared-memory --convert-triton-amdgpu-to-llvm=arch="gfx1200" -reconcile-unrealized-casts -split-input-file | FileCheck %s
 
 #wmmaTv2 = #ttg.amd_wmma<{version = 2, warpsPerCTA = [1, 1], isTranspose = true}>
 #dotop0v2 = #ttg.dot_op<{opIdx = 0, parent = #wmmaTv2, kWidth=8}>
diff --git a/test/Conversion/tritongpu_to_llvm.mlir b/test/Conversion/tritongpu_to_llvm.mlir
@@ -1,4 +1,4 @@
-// RUN: triton-opt %s -split-input-file --allocate-shared-memory-nv --convert-triton-gpu-to-llvm 2>/dev/null | FileCheck %s --dump-input-context 20
+// RUN: triton-opt %s -split-input-file --allocate-shared-memory-nv --convert-triton-gpu-to-llvm -reconcile-unrealized-casts 2>/dev/null | FileCheck %s --dump-input-context 20
 
 module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
   // CHECK: llvm.func @test_empty_kernel(%arg0: i32, %arg1: !llvm.ptr<1>, %arg2: !llvm.ptr<1>, %arg3: !llvm.ptr<1>)
diff --git a/test/Conversion/tritonnvidiagpu_to_llvm.mlir b/test/Conversion/tritonnvidiagpu_to_llvm.mlir
@@ -1,4 +1,4 @@
-// RUN: triton-opt %s -split-input-file --convert-triton-gpu-to-llvm=compute-capability=90 | FileCheck %s
+// RUN: triton-opt %s -split-input-file --convert-triton-gpu-to-llvm=compute-capability=90 -reconcile-unrealized-casts | FileCheck %s
 
 #shared0 = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
 #smem = #ttg.shared_memory
diff --git a/test/LLVMIR/convert-to-llvmir-with-dbg-info.mlir b/test/LLVMIR/convert-to-llvmir-with-dbg-info.mlir
@@ -29,6 +29,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
                         %arg2: !llvm.ptr<1>, %arg3: i32, %arg4: !llvm.ptr<1>) {
     %constant_i32 = llvm.mlir.constant(9 : i32) : i32
     %constant_i16 = llvm.mlir.constant(0 : i16) : i16
+    %constant_i64 = llvm.mlir.constant(9 : i64) : i64
 
     // CHECK: !DILocalVariable(name: "pid", scope:
     %pid = rocdl.workgroup.id.x : i32 loc(#loc14)
@@ -49,14 +50,14 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
 
     // CHECK: !DILocalVariable(name: "x", scope:
     %x_ptr = llvm.getelementptr %arg0[%block_start] : (!llvm.ptr<1>, i32) -> !llvm.ptr<1>, f32
-    %x_buffer_ptr = rocdl.make.buffer.rsrc %x_ptr, %constant_i16, %constant_i32, %constant_i32 : <1> to <8> loc(#loc18)
+    %x_buffer_ptr = rocdl.make.buffer.rsrc %x_ptr, %constant_i16, %constant_i64, %constant_i32 : <1> to <8> loc(#loc18)
     llvm.intr.dbg.value #di_local_variable4 = %x_buffer_ptr : !llvm.ptr<8> loc(#loc8)
     %x_val = rocdl.raw.ptr.buffer.load %x_buffer_ptr, %mask_i1, %constant_i32, %constant_i32 : vector<4xf32> loc(#loc18)
     %x_scalar = llvm.extractelement %x_val[%constant_i32 : i32] : vector<4xf32> loc(#loc18)
 
     // CHECK: !DILocalVariable(name: "y", scope:
     %y_ptr = llvm.getelementptr %arg1[%block_start] : (!llvm.ptr<1>, i32) -> !llvm.ptr<1>, f32
-    %y_buffer_ptr = rocdl.make.buffer.rsrc %y_ptr, %constant_i16, %constant_i32, %constant_i32 : <1> to <8> loc(#loc19)
+    %y_buffer_ptr = rocdl.make.buffer.rsrc %y_ptr, %constant_i16, %constant_i64, %constant_i32 : <1> to <8> loc(#loc19)
     llvm.intr.dbg.value #di_local_variable5 = %y_buffer_ptr : !llvm.ptr<8> loc(#loc10)
     %y_val = rocdl.raw.ptr.buffer.load %y_buffer_ptr, %mask_i1, %constant_i32, %constant_i32 : vector<4xf32> loc(#loc19)
     %y_scalar = llvm.extractelement %y_val[%constant_i32 : i32] : vector<4xf32> loc(#loc19)
diff --git a/third_party/amd/lib/TritonAMDGPUToLLVM/BufferOpsEmitter.cpp b/third_party/amd/lib/TritonAMDGPUToLLVM/BufferOpsEmitter.cpp
@@ -86,7 +86,7 @@ Value BufferEmitter::createResourceDescriptor(Value basePtr,
 
   Value flagsConst = b.int_val(32, flags);
   Type rsrcType = LLVM::LLVMPointerType::get(rewriter.getContext(), 8);
-  Value numRecordsByte = b.int_val(32, std::numeric_limits<int>::max() - 1);
+  Value numRecordsByte = b.int_val(64, std::numeric_limits<int>::max() - 1);
 
   Value resource = rewriter.createOrFold<ROCDL::MakeBufferRsrcOp>(
       loc, rsrcType, basePtr, stride, numRecordsByte, flagsConst);
diff --git a/third_party/amd/lib/TritonAMDGPUToLLVM/DotOpToLLVM/MFMA.cpp b/third_party/amd/lib/TritonAMDGPUToLLVM/DotOpToLLVM/MFMA.cpp
@@ -425,21 +425,9 @@ struct DotOpMFMAConversionHelper {
     // Now we have a vector of kBase elements of desired type.
     // Then we need to prepare vec for results.
     if (type.getIntOrFloatBitWidth() == 8) {
-      if (1 == kBase) {
+      if (1 == kBase)
         // This is only for the scale operands of scaled mfma on CDNA4
-        if (isConstantScale) {
-          // If the scale is constant(created by arith::ConstantOp), it will
-          // be put in a sgpr instead of vgpr. In that case, instead of
-          // vgpr[7:0], the instruction reads sgpr[30:23] as the scale value.
-          // So we need to manually left shift the scale by 23 bits to meet
-          // the requirement.
-          results = b.shl(i32_ty, b.zext(i32_ty, b.bitcast(vec, i8_ty)),
-                          b.i32_val(23));
-        } else {
-          results = b.zext(i32_ty, b.bitcast(vec, i8_ty));
-        }
-      }
-
+        results = b.zext(i32_ty, b.bitcast(vec, i8_ty));
       if (2 == kBase)
         // This case can occur during scale tensor packing when there aren't
         // enough elements to fill all 4 opSel slots. For example, with an A
diff --git a/third_party/amd/lib/TritonAMDGPUToLLVM/TritonGPUToLLVM.cpp b/third_party/amd/lib/TritonAMDGPUToLLVM/TritonGPUToLLVM.cpp
@@ -254,7 +254,7 @@ struct ConvertTritonAMDGPUToLLVM
         loc, arrayTy, /*isConstant=*/false, LLVM::Linkage::External,
         "global_smem", /*value=*/Attribute(), /*alignment=*/16,
         // Add ROCm support.
-        static_cast<unsigned>(NVVM::NVVMMemorySpace::kSharedMemorySpace));
+        static_cast<unsigned>(NVVM::NVVMMemorySpace::Shared));
   }
 };
 
diff --git a/third_party/amd/lib/TritonAMDGPUTransforms/CanonicalizePointers.cpp b/third_party/amd/lib/TritonAMDGPUTransforms/CanonicalizePointers.cpp
@@ -1302,15 +1302,16 @@ class ConvertCFCondBranch
     SmallVector<Value> trueOperands = flattenValues(remappedTrueOperands);
     SmallVector<Value> falseOperands = flattenValues(remappedFalseOperands);
 
-    rewriter.replaceOpWithNewOp<cf::CondBranchOp>(
-        branchOp, branchOp.getCondition(), branchOp.getTrueDest(), trueOperands,
-        branchOp.getFalseDest(), falseOperands);
+    auto newOp = rewriter.create<cf::CondBranchOp>(
+        branchOp.getLoc(), branchOp.getCondition(), branchOp.getTrueDest(),
+        trueOperands, branchOp.getFalseDest(), falseOperands);
 
     convertSimpleBlockSignature(branchOp.getTrueDest(), remappedTrueOperands,
                                 rewriter, fatPtrs);
     convertSimpleBlockSignature(branchOp.getFalseDest(), remappedFalseOperands,
                                 rewriter, fatPtrs);
 
+    rewriter.replaceOp(branchOp, newOp);
     return success();
   }
 };
@@ -1481,10 +1482,11 @@ class ConvertCFBranch : public PointerCanonicalizationPattern<cf::BranchOp> {
     ArrayRef<ValueRange> remappedDestOperands = adaptor.getDestOperands();
     SmallVector<Value> trueOperands = flattenValues(remappedDestOperands);
 
-    rewriter.replaceOpWithNewOp<cf::BranchOp>(branchOp, branchOp.getDest(),
-                                              trueOperands);
+    auto newOp = rewriter.create<cf::BranchOp>(
+        branchOp.getLoc(), branchOp.getDest(), trueOperands);
     convertSimpleBlockSignature(branchOp.getDest(), remappedDestOperands,
                                 rewriter, fatPtrs);
+    rewriter.replaceOp(branchOp, newOp);
     return success();
   }
 };
@@ -1919,6 +1921,7 @@ void TritonAMDGPUCanonicalizePointersPass::runOnOperation() {
 
   ConversionConfig config;
   config.buildMaterializations = false;
+  config.allowPatternRollback = false;
   ConversionTarget target(getContext());
   auto isLegal = [&opsToRewrite](Operation *op) {
     if (auto ifOp = llvm::dyn_cast<scf::IfOp>(op)) {
diff --git a/third_party/amd/lib/TritonAMDGPUTransforms/Utility.cpp b/third_party/amd/lib/TritonAMDGPUTransforms/Utility.cpp
@@ -24,11 +24,14 @@ int deduceMinCountBetweeOps(Operation *beginOp, Operation *endOp,
           deduceMinCountInBlock(ifOp.getElseRegion().front(), countFunc);
       count += std::min(minThen, minElse);
     } else if (auto forOp = llvm::dyn_cast<scf::ForOp>(op)) {
-      auto tripCount = constantTripCount(forOp.getLowerBound(),
-                                         forOp.getUpperBound(), forOp.getStep())
-                           .value_or(0);
-      if (tripCount > 0) {
-        count += tripCount * deduceMinCountInBlock(*forOp.getBody(), countFunc);
+      if (std::optional<APInt> tripCount = forOp.getStaticTripCount()) {
+        uint64_t tcVal = 0;
+        if (forOp.getUnsignedCmp() && tripCount->ugt(0))
+          tcVal = tripCount->getZExtValue();
+        else if (!forOp.getUnsignedCmp() && tripCount->sgt(0))
+          tcVal = tripCount->getSExtValue();
+        if (tcVal > 0)
+          count += tcVal * deduceMinCountInBlock(*forOp.getBody(), countFunc);
       }
     } else {
       count += countFunc(op);
diff --git a/third_party/amd/python/triton_amd.cc b/third_party/amd/python/triton_amd.cc
@@ -419,11 +419,11 @@ void init_triton_amd(py::module &&m) {
 
         const llvm::MCTargetOptions mcOptions;
         std::unique_ptr<llvm::MCRegisterInfo> mri(
-            target->createMCRegInfo(amdTargetTriple));
+            target->createMCRegInfo(triple));
         std::unique_ptr<llvm::MCAsmInfo> mai(
-            target->createMCAsmInfo(*mri, amdTargetTriple, mcOptions));
+            target->createMCAsmInfo(*mri, triple, mcOptions));
         std::unique_ptr<llvm::MCSubtargetInfo> sti(
-            target->createMCSubtargetInfo(amdTargetTriple, arch, features));
+            target->createMCSubtargetInfo(triple, arch, features));
 
         llvm::MCContext ctx(triple, mai.get(), mri.get(), sti.get(), &srcMgr,
                             &mcOptions);
@@ -472,7 +472,7 @@ void init_triton_amd(py::module &&m) {
     if (!target)
       throw std::runtime_error("target lookup error: " + error);
     std::unique_ptr<llvm::MCSubtargetInfo> sti(
-        target->createMCSubtargetInfo(amdTargetTriple, arch, ""));
+        target->createMCSubtargetInfo(triple, arch, ""));
     return sti->checkFeatures("+architected-sgprs");
   });
 
diff --git a/third_party/nvidia/backend/compiler.py b/third_party/nvidia/backend/compiler.py
@@ -439,7 +439,8 @@ def make_ptx(self, src, metadata, opt, capability):
         triple = 'nvptx64-nvidia-cuda'
         proc = sm_arch_from_capability(capability)
         features = get_features(opt, self.target.arch)
-        ret = llvm.translate_to_asm(src, triple, proc, features, [], opt.enable_fp_fusion, False)
+        flags = ["nvptx-mad-wide-opt"]
+        ret = llvm.translate_to_asm(src, triple, proc, features, flags, opt.enable_fp_fusion, False)
         # Find kernel names (there should only be one)
         names = re.findall(r".visible .entry ([a-zA-Z_][a-zA-Z0-9_]*)", ret)
         assert len(names) == 1
diff --git a/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/TritonGPUToLLVM.cpp b/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/TritonGPUToLLVM.cpp
@@ -219,7 +219,7 @@ struct ConvertTritonGPUToLLVM
         loc, arrayTy, /*isConstant=*/false, LLVM::Linkage::External,
         "global_smem", /*value=*/Attribute(), /*alignment=*/16,
         // Add ROCm support.
-        static_cast<unsigned>(NVVM::NVVMMemorySpace::kSharedMemorySpace));
+        static_cast<unsigned>(NVVM::NVVMMemorySpace::Shared));
   }
 };
 

Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-064f02dac0c81c19350a74415b3245f42fed09dc`
	`1`	`+f6ded0be897e2878612dd903f7e8bb85448269e5`
Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-// RUN: triton-opt %s --tritongpu-reduce-data-duplication --allocate-shared-memory --convert-triton-amdgpu-to-llvm=arch="gfx1200" -split-input-file \| FileCheck %s`
	`1`	`+// RUN: triton-opt %s --tritongpu-reduce-data-duplication --allocate-shared-memory --convert-triton-amdgpu-to-llvm=arch="gfx1200" -reconcile-unrealized-casts -split-input-file \| FileCheck %s`
`2`	`2`
`3`	`3`	`#wmmaTv2 = #ttg.amd_wmma<{version = 2, warpsPerCTA = [1, 1], isTranspose = true}>`
`4`	`4`	`#dotop0v2 = #ttg.dot_op<{opIdx = 0, parent = #wmmaTv2, kWidth=8}>`
Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-// RUN: triton-opt %s -split-input-file --allocate-shared-memory-nv --convert-triton-gpu-to-llvm 2>/dev/null \| FileCheck %s --dump-input-context 20`
	`1`	`+// RUN: triton-opt %s -split-input-file --allocate-shared-memory-nv --convert-triton-gpu-to-llvm -reconcile-unrealized-casts 2>/dev/null \| FileCheck %s --dump-input-context 20`
`2`	`2`
`3`	`3`	`module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {`
`4`	`4`	`// CHECK: llvm.func @test_empty_kernel(%arg0: i32, %arg1: !llvm.ptr<1>, %arg2: !llvm.ptr<1>, %arg3: !llvm.ptr<1>)`