diff --git a/cmake/llvm-hash.txt b/cmake/llvm-hash.txt index af3c4032f076..20726f76e504 100644 --- a/cmake/llvm-hash.txt +++ b/cmake/llvm-hash.txt @@ -1 +1 @@ -e12cbd8339b89563059c2bb2a312579b652560d0 +8957e64a20fc7f4277565c6cfe3e555c119783ce diff --git a/lib/Analysis/Utility.cpp b/lib/Analysis/Utility.cpp index 0800530c8338..2e42a43ef4c8 100644 --- a/lib/Analysis/Utility.cpp +++ b/lib/Analysis/Utility.cpp @@ -953,7 +953,7 @@ SetVector multiRootGetSlice(Operation *op, BackwardSliceOptions opt; opt.omitBlockArguments = true; opt.filter = backwardFilter; - getBackwardSlice(currentOp, &backwardSlice, opt); + (void)getBackwardSlice(currentOp, &backwardSlice, opt); slice.insert(backwardSlice.begin(), backwardSlice.end()); // Compute and insert the forwardSlice starting from currentOp. diff --git a/lib/Conversion/TritonGPUToLLVM/ElementwiseOpToLLVM.cpp b/lib/Conversion/TritonGPUToLLVM/ElementwiseOpToLLVM.cpp index 4155bccf9924..fb78360adaac 100644 --- a/lib/Conversion/TritonGPUToLLVM/ElementwiseOpToLLVM.cpp +++ b/lib/Conversion/TritonGPUToLLVM/ElementwiseOpToLLVM.cpp @@ -298,7 +298,7 @@ struct ElementwiseInlineAsmOpConversion /*asm_string=*/op.getAsmString(), /*constraints=*/op.getConstraints(), /*has_side_effects=*/!op.getPure(), - /*is_align_stack=*/false, + /*is_align_stack=*/false, LLVM::TailCallKind::None, /*asm_dialect=*/ LLVM::AsmDialectAttr::get(rewriter.getContext(), LLVM::AsmDialect::AD_ATT), diff --git a/lib/Dialect/TritonGPU/Transforms/AccelerateMatmul.cpp b/lib/Dialect/TritonGPU/Transforms/AccelerateMatmul.cpp index fafb58231351..54c40a1ff3de 100644 --- a/lib/Dialect/TritonGPU/Transforms/AccelerateMatmul.cpp +++ b/lib/Dialect/TritonGPU/Transforms/AccelerateMatmul.cpp @@ -257,7 +257,7 @@ static int computeOrigBitWidth(Value x) { mlir::BackwardSliceOptions opt; opt.omitBlockArguments = true; opt.filter = bwdFilter; - getBackwardSlice(x, &slice, opt); + (void)getBackwardSlice(x, &slice, opt); // TODO: This heuristic may be a bit too coarse and may need improving // If the chain contains a fp4 to fp16/bf16 conversion, then the original diff --git a/lib/Dialect/TritonGPU/Transforms/Pipeliner/ScheduleLoops.cpp b/lib/Dialect/TritonGPU/Transforms/Pipeliner/ScheduleLoops.cpp index c3a159b48527..ff81a3edc59b 100644 --- a/lib/Dialect/TritonGPU/Transforms/Pipeliner/ScheduleLoops.cpp +++ b/lib/Dialect/TritonGPU/Transforms/Pipeliner/ScheduleLoops.cpp @@ -271,7 +271,7 @@ CoarseSchedule::Cluster schedulePrologueAndEpilogue(scf::ForOp forOp, BackwardSliceOptions opt; opt.omitBlockArguments = true; opt.omitUsesFromAbove = false; - getBackwardSlice((Operation *)op, &backwardSlice, opt); + (void)getBackwardSlice((Operation *)op, &backwardSlice, opt); for (auto op : backwardSlice) { if (auto ifOp = dyn_cast(op)) { diff --git a/lib/Dialect/TritonGPU/Transforms/Pipeliner/WGMMAPipeline.cpp b/lib/Dialect/TritonGPU/Transforms/Pipeliner/WGMMAPipeline.cpp index 5aefa9f5e5b1..7cdecf15071d 100644 --- a/lib/Dialect/TritonGPU/Transforms/Pipeliner/WGMMAPipeline.cpp +++ b/lib/Dialect/TritonGPU/Transforms/Pipeliner/WGMMAPipeline.cpp @@ -197,7 +197,7 @@ static void threadValuesThroughWait(ttng::WarpGroupDotWaitOp wait, return op->getBlock() == wait->getBlock(); }; SetVector slice; - getBackwardSlice(v, &slice, options); + (void)getBackwardSlice(v, &slice, options); } for (ttng::WarpGroupDotOp dot : asyncDots) { diff --git a/python/test/unit/language/test_core.py b/python/test/unit/language/test_core.py index 5119786a0be9..3dba2efa32c2 100644 --- a/python/test/unit/language/test_core.py +++ b/python/test/unit/language/test_core.py @@ -6891,7 +6891,7 @@ def test_tl_range_num_stages(device): if capability[0] >= 8: ptx = pgm.asm['ptx'] # check that the loop got pipelined with the right number of stages. - assert 'cp.async.wait_group 6' in ptx + assert 'cp.async.wait_group \t6' in ptx def test_tl_range_fuse(): diff --git a/test/Triton/reproducer.mlir b/test/Triton/reproducer.mlir index 5a6747d217a9..7c62decd88bc 100644 --- a/test/Triton/reproducer.mlir +++ b/test/Triton/reproducer.mlir @@ -17,4 +17,4 @@ module attributes {"ttg.target" = "cuda:90", "ttg.num-ctas" = 1 : i32, "ttg.num- #-} // CHECK: Pass Manager with -// CHECK-NEXT: convert-triton-gpu-to-llvm +// CHECK: convert-triton-gpu-to-llvm diff --git a/third_party/amd/lib/TritonAMDGPUToLLVM/ElementwiseOpToLLVM.cpp b/third_party/amd/lib/TritonAMDGPUToLLVM/ElementwiseOpToLLVM.cpp index f77abbf66771..ba0957b4df0e 100644 --- a/third_party/amd/lib/TritonAMDGPUToLLVM/ElementwiseOpToLLVM.cpp +++ b/third_party/amd/lib/TritonAMDGPUToLLVM/ElementwiseOpToLLVM.cpp @@ -37,30 +37,25 @@ cvtScalePkUpcastFromFp8(Location loc, ConversionPatternRewriter &rewriter, fp8x4Vec = b.insert_element(fp8x4VecTy, fp8x4Vec, v1, idx1); auto i32v = b.bitcast(fp8x4Vec, i32_ty); - auto resType = i32_ty; - auto dstType = f32_ty; + Type resElemType; if constexpr (std::is_same_v || std::is_same_v) { - resType = i64_ty; - dstType = f32_ty; + resElemType = f32_ty; } else if constexpr (std::is_same_v || std::is_same_v) { - resType = i32_ty; - dstType = f16_ty; + resElemType = f16_ty; } else { - resType = i32_ty; - dstType = bf16_ty; + resElemType = bf16_ty; } + Type resType = vec_ty(resElemType, 2); Value scale = b.f32_val(1); - Value select = b.false_val(); - auto result = rewriter.create(loc, resType, i32v, scale, select); - auto retVecTy = vec_ty(dstType, 2); - auto retVec = b.bitcast(result, retVecTy); + auto result = rewriter.create(loc, resType, i32v, scale, + /*srcLoHiSel=*/false); SmallVector ret(2); - ret[0] = b.extract_element(dstType, retVec, idx0); - ret[1] = b.extract_element(dstType, retVec, idx1); + ret[0] = b.extract_element(resElemType, result, idx0); + ret[1] = b.extract_element(resElemType, result, idx1); return ret; } @@ -73,13 +68,12 @@ cvtScalePkDowncastToFp8(Location loc, ConversionPatternRewriter &rewriter, Type v2I16Ty = vec_ty(i16_ty, 2); Value v2I16Vec = b.undef(v2I16Ty); Value scale = b.f32_val(1); - Value select = b.false_val(); Value result; if constexpr (std::is_same_v || std::is_same_v) { result = rewriter.create(loc, v2I16Ty, v2I16Vec, v0, v1, scale, - select); + /*dstLoHiSel=*/false); } else { Type v2F16Ty = vec_ty(v0.getType(), 2); Value srcVec = b.undef(v2F16Ty); @@ -88,7 +82,7 @@ cvtScalePkDowncastToFp8(Location loc, ConversionPatternRewriter &rewriter, srcVec = b.insert_element(v2F16Ty, srcVec, v0, idx0); srcVec = b.insert_element(v2F16Ty, srcVec, v1, idx1); result = rewriter.create(loc, v2I16Ty, v2I16Vec, srcVec, scale, - select); + /*dstLoHiSel=*/false); } auto fp8x4VecTy = vec_ty(i8_ty, 4); auto fp8x4Vec = b.bitcast(result, fp8x4VecTy); @@ -312,8 +306,8 @@ static SmallVector cvtPkF8ToFp32(Location loc, auto resType = i64_ty; auto dstType = f32_ty; - Value select = b.false_val(); - auto result = rewriter.create(loc, resType, i32v, select); + auto result = + rewriter.create(loc, resType, i32v, /*wordSel=*/false); auto f32x2VecTy = vec_ty(dstType, 2); auto retVec = b.bitcast(result, f32x2VecTy); SmallVector ret(2); @@ -330,10 +324,10 @@ static SmallVector cvtPkFp32ToF8(Location loc, auto b = TritonLLVMOpBuilder(loc, rewriter); Type v2I16Ty = vec_ty(i16_ty, 2); Value old = b.undef(i32_ty); - Value select = b.false_val(); Value result; - result = rewriter.create(loc, v2I16Ty, v0, v1, old, select); + result = + rewriter.create(loc, v2I16Ty, v0, v1, old, /*wordSel=*/false); auto fp8x4VecTy = vec_ty(i8_ty, 4); auto fp8x4Vec = b.bitcast(result, fp8x4VecTy); SmallVector ret(2); diff --git a/third_party/amd/lib/TritonAMDGPUToLLVM/GCNAsmFormat.cpp b/third_party/amd/lib/TritonAMDGPUToLLVM/GCNAsmFormat.cpp index 2de1c0f3d23d..c10db663c0e3 100644 --- a/third_party/amd/lib/TritonAMDGPUToLLVM/GCNAsmFormat.cpp +++ b/third_party/amd/lib/TritonAMDGPUToLLVM/GCNAsmFormat.cpp @@ -82,6 +82,7 @@ mlir::Value GCNBuilder::launch(RewriterBase &rewriter, Location loc, Type resTy, getConstraints(), // constraints hasSideEffect, // has_side_effects isAlignStack, // is_align_stack + LLVM::TailCallKind::None, LLVM::AsmDialectAttr::get(ctx, LLVM::AsmDialect::AD_ATT), // asm_dialect ArrayAttr::get(ctx, attrs) // operand_attrs diff --git a/third_party/amd/lib/TritonAMDGPUToLLVM/Utility.cpp b/third_party/amd/lib/TritonAMDGPUToLLVM/Utility.cpp index a72c5d639863..e62f0c4c9c98 100644 --- a/third_party/amd/lib/TritonAMDGPUToLLVM/Utility.cpp +++ b/third_party/amd/lib/TritonAMDGPUToLLVM/Utility.cpp @@ -691,7 +691,7 @@ bool isChainDotTail(tt::DotOpInterface dotOp) { Operation *opA = dotOp.getA().getDefiningOp(); if (!opA) return false; - getBackwardSlice(opA, &bwdSlices, bwdOpt); + (void)getBackwardSlice(opA, &bwdSlices, bwdOpt); if (llvm::find_if(bwdSlices, [](Operation *op) { return isa(op); }) != bwdSlices.end()) diff --git a/third_party/amd/lib/TritonAMDGPUTransforms/BlockPingpong.cpp b/third_party/amd/lib/TritonAMDGPUTransforms/BlockPingpong.cpp index f04d3b0928bf..3abc8d72dbdf 100644 --- a/third_party/amd/lib/TritonAMDGPUTransforms/BlockPingpong.cpp +++ b/third_party/amd/lib/TritonAMDGPUTransforms/BlockPingpong.cpp @@ -131,7 +131,7 @@ void Pingponger::moveOpAndPredecessorsUpSameBlock(Operation *op) { return op->getBlock() == checkedOp->getBlock() && checkedOp->isBeforeInBlock(op); }; - getBackwardSlice(op, &backwardSlice, opt); + (void)getBackwardSlice(op, &backwardSlice, opt); for (auto predOp : backwardSlice) appendOp(predOp); appendOp(op); diff --git a/third_party/amd/lib/TritonAMDGPUTransforms/ReorderInstructions.cpp b/third_party/amd/lib/TritonAMDGPUTransforms/ReorderInstructions.cpp index 9a728e32f921..b0fb0e54db69 100644 --- a/third_party/amd/lib/TritonAMDGPUTransforms/ReorderInstructions.cpp +++ b/third_party/amd/lib/TritonAMDGPUTransforms/ReorderInstructions.cpp @@ -190,7 +190,7 @@ static void moveUpGlobalLoadInPrologue(triton::FuncOp funcOp) { // Only move ops residing in the same block. return defBlock == block; }; - mlir::getBackwardSlice(op.getOperation(), &backwardSet, options); + (void)mlir::getBackwardSlice(op.getOperation(), &backwardSet, options); backwardSet.insert(op); auto ipoint = findEarlyInsertionPoint(block, op); diff --git a/third_party/nvidia/hopper/lib/Transforms/WarpSpecialization/WSCodePartition.cpp b/third_party/nvidia/hopper/lib/Transforms/WarpSpecialization/WSCodePartition.cpp index bda1530c43bc..67aa392d2e5d 100644 --- a/third_party/nvidia/hopper/lib/Transforms/WarpSpecialization/WSCodePartition.cpp +++ b/third_party/nvidia/hopper/lib/Transforms/WarpSpecialization/WSCodePartition.cpp @@ -479,7 +479,7 @@ void reorderProducerOps(SmallVector &channels) { BackwardSliceOptions opt; opt.omitBlockArguments = true; SetVector backwardSlice; - getBackwardSlice(channel->getSrcOp(), &backwardSlice, opt); + (void)getBackwardSlice(channel->getSrcOp(), &backwardSlice, opt); for (auto &op : backwardSlice) { if (op->getBlock() == block) op->moveBefore(channel->getSrcOp()); diff --git a/third_party/nvidia/hopper/lib/Transforms/WarpSpecialization/WSDataPartition.cpp b/third_party/nvidia/hopper/lib/Transforms/WarpSpecialization/WSDataPartition.cpp index 910f496f4e78..972072a2ff1f 100644 --- a/third_party/nvidia/hopper/lib/Transforms/WarpSpecialization/WSDataPartition.cpp +++ b/third_party/nvidia/hopper/lib/Transforms/WarpSpecialization/WSDataPartition.cpp @@ -219,7 +219,7 @@ static bool rematerializeOp(Operation *op, DataPartitionScheme &partitionScheme, SetVector slice; BackwardSliceOptions opt; opt.omitBlockArguments = true; - getBackwardSlice(op, &slice); + (void)getBackwardSlice(op, &slice); for (auto depOp : slice) partitionScheme.undoPartition(depOp); return true; diff --git a/third_party/nvidia/hopper/lib/Transforms/WarpSpecialization/WSTaskPartition.cpp b/third_party/nvidia/hopper/lib/Transforms/WarpSpecialization/WSTaskPartition.cpp index 5f14d50eea0e..4952e4d346eb 100644 --- a/third_party/nvidia/hopper/lib/Transforms/WarpSpecialization/WSTaskPartition.cpp +++ b/third_party/nvidia/hopper/lib/Transforms/WarpSpecialization/WSTaskPartition.cpp @@ -80,8 +80,8 @@ void doTaskPartition(triton::FuncOp &funcOp, unsigned numWarpGroups) { if (!dotOp) continue; SetVector backwardSlice; - getBackwardSlice(dotOp.getA(), &backwardSlice, opt); - getBackwardSlice(dotOp.getB(), &backwardSlice, opt); + (void)getBackwardSlice(dotOp.getA(), &backwardSlice, opt); + (void)getBackwardSlice(dotOp.getB(), &backwardSlice, opt); for (auto depOp : backwardSlice) { if (isa(depOp)) { producerOps.insert(depOp); diff --git a/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/PTXAsmFormat.cpp b/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/PTXAsmFormat.cpp index 2f4f03007aa0..27ab8f2675ee 100644 --- a/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/PTXAsmFormat.cpp +++ b/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/PTXAsmFormat.cpp @@ -105,6 +105,7 @@ mlir::Value PTXBuilder::launch(OpBuilder &rewriter, Location loc, Type resTy, getConstraints(), // constraints hasSideEffect, // has_side_effects isAlignStack, // is_align_stack + LLVM::TailCallKind::None, LLVM::AsmDialectAttr::get(ctx, LLVM::AsmDialect::AD_ATT), // asm_dialect ArrayAttr::get(ctx, attrs) // operand_attrs