Experimental to None: drop the `Experimental` prefix from Triton op class names

jbdalido · jbdalido · commit 3262d4c7f671 · 2025-04-02T10:31:38.000+02:00
diff --git a/include/triton/Dialect/Triton/IR/TritonOps.td b/include/triton/Dialect/Triton/IR/TritonOps.td
@@ -1362,7 +1362,7 @@ def TT_DescriptorScatterOp : TT_Op<"descriptor_scatter", [
   }];
 }
 
-def TT_ExperimentalTensormapCreateOp: TT_Op<
+def TT_TensormapCreateOp: TT_Op<
   "experimental_tensormap_create",
   [
     MemoryEffects<[MemRead<GlobalMemory>, MemWrite<GlobalMemory>]>,
diff --git a/include/triton/Dialect/TritonNvidiaGPU/Transforms/TMAUtilities.h b/include/triton/Dialect/TritonNvidiaGPU/Transforms/TMAUtilities.h
@@ -222,7 +222,7 @@ mlir::LogicalResult createTMADesc(mlir::Value tmaPtr,
     }
   }
 
-  builder.template create<triton::ExperimentalTensormapCreateOp>(
+  builder.template create<triton::TensormapCreateOp>(
       loc,
       /*desc_ptr=*/tmaPtr,
       /*global_address=*/op.getBase(),
diff --git a/lib/Analysis/Allocation.cpp b/lib/Analysis/Allocation.cpp
@@ -210,7 +210,7 @@ unsigned defaultAllocationAnalysisScratchSizeFn(Operation *op) {
     assert(!isa<PointerType>(elemTy) && "unexpected pointer type");
     return elems * std::max<int>(8, elemTy.getIntOrFloatBitWidth()) / 8;
   }
-  if (isa<ExperimentalTensormapCreateOp>(op)) {
+  if (isa<TensormapCreateOp>(op)) {
     constexpr int32_t kTMASize = 128;
     return kTMASize;
   }
diff --git a/lib/Conversion/TritonToTritonGPU/TritonToTritonGPUPass.cpp b/lib/Conversion/TritonToTritonGPU/TritonToTritonGPUPass.cpp
@@ -629,7 +629,7 @@ void populateTritonPatterns(TritonGPUTypeConverter &typeConverter,
       GenericOpPattern<triton::AtomicRMWOp>, GenericOpPattern<ReturnOp>,
       GenericOpPattern<triton::DescriptorLoadOp>,
       GenericOpPattern<triton::DescriptorStoreOp>,
-      GenericOpPattern<triton::ExperimentalTensormapCreateOp>,
+      GenericOpPattern<triton::TensormapCreateOp>,
       GenericOpPattern<triton::ExperimentalTensormapFenceproxyAcquireOp>,
       // this assumes the right layout will be set later for dot scaled.
       GenericOpPattern<triton::DotScaledOp>, GenericOpPattern<triton::CallOp>,
diff --git a/lib/Dialect/Triton/IR/Ops.cpp b/lib/Dialect/Triton/IR/Ops.cpp
@@ -1341,8 +1341,8 @@ LogicalResult DescriptorStoreOp::verify() {
                                       getSrc().getType());
 }
 
-// -- ExperimentalTensormapCreateOp --
-LogicalResult ExperimentalTensormapCreateOp::verify() {
+// -- TensormapCreateOp --
+LogicalResult TensormapCreateOp::verify() {
   auto rank = getBoxDim().size();
   if (getGlobalDim().size() != rank) {
     return emitError("Rank mismatch for global dim. Got ")
diff --git a/lib/Dialect/TritonGPU/Transforms/TaskIdPropagate.cpp b/lib/Dialect/TritonGPU/Transforms/TaskIdPropagate.cpp
@@ -193,7 +193,7 @@ bool verifyTaskId(triton::FuncOp &funcOp,
     }
 
     auto partitionShouldBeUsedSpecified = [](Operation *op) {
-      if (isa<StoreOp, ExperimentalDescriptorLoadOp>(op))
+      if (isa<StoreOp, DescriptorLoadOp>(op))
         return true;
       if (isa<AtomicRMWOp, AtomicCASOp>(op))
         return true;
@@ -218,7 +218,7 @@ bool verifyTaskId(triton::FuncOp &funcOp,
       Operation *defOp = operand.getDefiningOp();
       if (!defOp)
         continue;
-      if (llvm::isa<tt::LoadOp, tt::ExperimentalDescriptorLoadOp>(defOp))
+      if (llvm::isa<tt::LoadOp, tt::DescriptorLoadOp>(defOp))
         continue;
       auto defTaskIds = getAsyncTaskIds(defOp);
       // Make sure defTaskIds cover asyncTaskIds. Call addAsyncTaskIds if
diff --git a/lib/Dialect/TritonGPU/Transforms/WSCodePartition.cpp b/lib/Dialect/TritonGPU/Transforms/WSCodePartition.cpp
@@ -671,7 +671,7 @@ void getTransitiveUsers(Value root,
 void collectAsyncChannels(SmallVector<std::unique_ptr<Channel>> &channels,
                           triton::FuncOp &funcOp, unsigned numBuffers) {
   funcOp.walk([&](Operation *op) {
-    if (isa<tt::LoadOp, tt::ExperimentalDescriptorLoadOp>(op) ||
+    if (isa<tt::LoadOp, tt::DescriptorLoadOp>(op) ||
         isa<mlir::triton::DotOpInterface>(op)) {
       auto producerTaskIds = getAsyncTaskIds(op);
       if (producerTaskIds.empty() || producerTaskIds.size() > 1) {
@@ -1611,7 +1611,7 @@ DenseMap<Channel *, DenseMap<int, Value>> createToken(
       auto copyOp = copyOpMap.find(channel)->second.first;
       if (isa<ttg::AsyncCopyGlobalToLocalOp>(copyOp)) {
         tokenLoadType = ttng::TokenLoadType::AsyncLoadOp;
-      } else if (isa<ExperimentalDescriptorLoadOp>(copyOp)) {
+      } else if (isa<DescriptorLoadOp>(copyOp)) {
         tokenLoadType = ttng::TokenLoadType::TMALoadOp;
       } else if (isa<LocalStoreOp>(copyOp)) {
         tokenLoadType = ttng::TokenLoadType::LocalStoreOp;
@@ -1636,7 +1636,7 @@ DenseMap<Channel *, DenseMap<int, Value>> createToken(
       }
 
       auto producerOp = it->second.front()->getSrcOp();
-      if (isa<tt::ExperimentalDescriptorLoadOp>(producerOp)) {
+      if (isa<tt::DescriptorLoadOp>(producerOp)) {
         Value bAlloc = createBarrierAlloc(funcOp, channel->numBuffers);
         // Channels in the group share the same set of tokens.
         for (auto &c : it->second) {
@@ -1863,7 +1863,7 @@ createLocalCopy(const DenseMap<Channel *, Value> &bufferMap, Channel *channel,
   return {copy, sharedLoad};
 }
 
-static int getTMALoadSize(tt::ExperimentalDescriptorLoadOp &tmaLoad) {
+static int getTMALoadSize(tt::DescriptorLoadOp &tmaLoad) {
   auto tensorTy = cast<RankedTensorType>(tmaLoad->getResult(0).getType());
   int loadSize = product(tensorTy.getShape());
   return loadSize * tensorTy.getElementType().getIntOrFloatBitWidth() / 8;
@@ -1921,7 +1921,7 @@ Value getBufferForPipelineStage(OpBuilderWithAsyncTaskIds &builder,
 
 Operation *
 optimizeTMALoads(OpBuilderWithAsyncTaskIds &builder,
-                 SmallVector<tt::ExperimentalDescriptorLoadOp> &tmaLoads,
+                 SmallVector<tt::DescriptorLoadOp> &tmaLoads,
                  SmallVector<Value> &buffers, Value barrierAlloc,
                  Value bufferIdx, Value bufferIdxExtract, Value phase,
                  Operation *headProducer, Operation *headConsumer) {
@@ -2168,7 +2168,7 @@ void insertAsyncComm(
 
       // Insert ProducerCommitOp if producer is LoadOp. For TMA, TMA lowering
       // will handle the ProducerCommit.
-      if (!isa<tt::ExperimentalDescriptorLoadOp>(headProducer)) {
+      if (!isa<tt::DescriptorLoadOp>(headProducer)) {
         builder.setInsertionPointAfter(tailProducer);
         builder.createWithAsyncTaskIds<ttng::ProducerCommitOp>(
             tailProducer->getLoc(), token.second, bufferIdx);
@@ -2178,7 +2178,7 @@ void insertAsyncComm(
     for (auto token : tokens) {
       builder.setAsynTaskIdsFromArray(token.first);
       // Insert ConsumerWaitOp
-      if (!isa<tt::ExperimentalDescriptorLoadOp>(headProducer)) {
+      if (!isa<tt::DescriptorLoadOp>(headProducer)) {
         auto consumerWaitPoint = getSameLevelOp(headProducer, headConsumer);
         builder.setInsertionPoint(consumerWaitPoint);
         builder.createWithAsyncTaskIds<ttng::ConsumerWaitOp>(
@@ -2193,13 +2193,13 @@ void insertAsyncComm(
           consumerReleasePoint->getLoc(), token.second, bufferIdx);
     }
 
-    SmallVector<tt::ExperimentalDescriptorLoadOp> tmaLoads;
+    SmallVector<tt::DescriptorLoadOp> tmaLoads;
     SmallVector<Value> buffers;
     DenseMap<Operation *, Operation *> producerCopyMap;
     // Go through all channels in this channel group.
     for (auto &c : kv.second) {
       if (auto tmaLoad =
-              dyn_cast<tt::ExperimentalDescriptorLoadOp>(c->getSrcOp())) {
+              dyn_cast<tt::DescriptorLoadOp>(c->getSrcOp())) {
         tmaLoads.push_back(tmaLoad);
         buffers.push_back(bufferMap.find(c)->second);
       }
@@ -2278,7 +2278,7 @@ void insertAsyncCopy(
 
     // No need to create async copy for TMA load which will be handled in
     // insertAsyncComm.
-    if (isa<tt::ExperimentalDescriptorLoadOp>(srcOp)) {
+    if (isa<tt::DescriptorLoadOp>(srcOp)) {
       producerConsumerOps = {srcOp, domininatingChannel->getDstOp()};
     } else if (isa<triton::LoadOp>(srcOp)) {
       SmallVector<AsyncTaskId> asyncTasksPC = getAsyncTaskIds(srcOp);
diff --git a/lib/Dialect/TritonGPU/Transforms/WSDataPartition.cpp b/lib/Dialect/TritonGPU/Transforms/WSDataPartition.cpp
@@ -58,7 +58,7 @@ void fixTaskId(triton::FuncOp &funcOp) {
       if (!defOp)
         continue;
       // Do not update loads.
-      if (isa<tt::LoadOp, tt::ExperimentalDescriptorLoadOp>(defOp))
+      if (isa<tt::LoadOp, tt::DescriptorLoadOp>(defOp))
         continue;
       auto defTaskIds = getAsyncTaskIds(defOp);
       // Make sure defTaskIds cover asyncTaskIds. Call addAsyncTaskIds if
@@ -131,7 +131,7 @@ void getBackwardSliceToPartition(Value root, unsigned dim, int sliceSize,
             isa<arith::ConstantOp, arith::ExtSIOp, arith::ExtUIOp,
                 arith::ExtFOp, BroadcastOp, ExpandDimsOp, MakeRangeOp, SplatOp,
                 ConvertLayoutOp, triton::gpu::LocalAllocOp, LoadOp,
-                ExperimentalDescriptorLoadOp, nvidia_gpu::TMEMAllocOp,
+                DescriptorLoadOp, nvidia_gpu::TMEMAllocOp,
                 nvidia_gpu::TMEMLoadOp>(op)) {
           for (Value operand : op->getOperands())
             queue.push_back(operand);
@@ -592,11 +592,11 @@ Operation *sliceOp(Operation *op, int offset, IRMapping &mappings,
       sliceOp(operand, offset, mappings, reverseMappings, partitionScheme);
     // TODO: slice store base ptr
     newOp = cloneAndSetResultType(op);
-  } else if (isa<ExperimentalDescriptorLoadOp, ExperimentalDescriptorStoreOp>(
+  } else if (isa<DescriptorLoadOp, ExperimentalDescriptorStoreOp>(
                  op)) {
     SmallVector<int64_t> shape;
     Value coordVal;
-    if (auto loadOp = dyn_cast<ExperimentalDescriptorLoadOp>(op)) {
+    if (auto loadOp = dyn_cast<DescriptorLoadOp>(op)) {
       sliceOp(loadOp.getDesc(), offset, mappings, reverseMappings,
               partitionScheme);
       coordVal = loadOp.getIndices()[dim];
@@ -619,7 +619,7 @@ Operation *sliceOp(Operation *op, int offset, IRMapping &mappings,
     }
 
     newOp = cloneAndSetResultType(op);
-    if (isa<ExperimentalDescriptorLoadOp>(op)) {
+    if (isa<DescriptorLoadOp>(op)) {
       // map load result
       auto v = op->getResult(0);
       auto newV = newOp->getResult(0);
diff --git a/lib/Dialect/TritonGPU/Transforms/WSTaskPartition.cpp b/lib/Dialect/TritonGPU/Transforms/WSTaskPartition.cpp
@@ -62,7 +62,7 @@ void doPartition(triton::FuncOp &funcOp, unsigned numConsumerGroups) {
       loops.push_back(forOp);
     else if (isa<nvidia_gpu::WarpGroupDotOp>(op))
       dots.push_back(op);
-    else if (isa<triton::LoadOp, ExperimentalDescriptorLoadOp>(op))
+    else if (isa<triton::LoadOp, DescriptorLoadOp>(op))
       loads.push_back(op);
   });
 
@@ -100,7 +100,7 @@ void doPartition(triton::FuncOp &funcOp, unsigned numConsumerGroups) {
     getBackwardSlice(dotOp.getA(), &backwardSlice, opt);
     getBackwardSlice(dotOp.getB(), &backwardSlice, opt);
     for (auto depOp : backwardSlice) {
-      if (isa<ExperimentalDescriptorLoadOp>(depOp)) {
+      if (isa<DescriptorLoadOp>(depOp)) {
         producerOps.insert(depOp);
       } else if (isa<triton::LoadOp>(depOp) && isExpensiveLoadOrStore(depOp)) {
         producerOps.insert(depOp);
diff --git a/lib/Dialect/TritonNvidiaGPU/Transforms/TMALowering.cpp b/lib/Dialect/TritonNvidiaGPU/Transforms/TMALowering.cpp
@@ -111,7 +111,7 @@ class TMALoadLowering : public OpRewritePattern<DescriptorLoadOp> {
 public:
   using OpRewritePattern::OpRewritePattern;
 
-  LogicalResult matchAndRewrite(ExperimentalDescriptorLoadOp op,
+  LogicalResult matchAndRewrite(DescriptorLoadOp op,
                                 PatternRewriter &baseRewriter) const override {
     PatternRewriterWithAsyncTaskIds rewriter(baseRewriter, op);
     auto createLoad = [&](Value tmaPtr, Value barrierAlloc, Value alloc,
@@ -131,7 +131,7 @@ class TMALoadLowering : public OpRewritePattern<DescriptorLoadOp> {
 struct TMAGatherLowering : public OpRewritePattern<DescriptorGatherOp> {
   using OpRewritePattern::OpRewritePattern;
 
-  LogicalResult matchAndRewrite(ExperimentalDescriptorGatherOp op,
+  LogicalResult matchAndRewrite(DescriptorGatherOp op,
                                 PatternRewriter &baseRewriter) const override {
     PatternRewriterWithAsyncTaskIds rewriter(baseRewriter, op);
     auto createLoad = [&](Value tmaPtr, Value barrierAlloc, Value alloc,
diff --git a/python/src/ir.cc b/python/src/ir.cc
@@ -1447,7 +1447,7 @@ void init_triton_ir(py::module &&m) {
               std::vector<Value> element_stride, int32_t elem_type,
               int32_t interleave_layout, int32_t swizzle_mode,
               int32_t fill_mode) {
-             self.create<ExperimentalTensormapCreateOp>(
+             self.create<TensormapCreateOp>(
                  desc_ptr, global_address, box_dim, global_dim, global_stride,
                  element_stride, elem_type, interleave_layout, swizzle_mode,
                  fill_mode);
diff --git a/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/TMAToLLVM.cpp b/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/TMAToLLVM.cpp
@@ -236,17 +236,17 @@ void zero_fill_tma(Location loc, MLIRContext *ctx,
   LLVM::NVIDIA::createSyncWarp(loc, rewriter);
 }
 
-struct ExperimentalTensormapCreateOpConversion
-    : public ConvertOpToLLVMPattern<ExperimentalTensormapCreateOp> {
+struct TensormapCreateOpConversion
+    : public ConvertOpToLLVMPattern<TensormapCreateOp> {
   const NVIDIA::TargetInfo &targetInfo;
 
-  ExperimentalTensormapCreateOpConversion(LLVMTypeConverter &converter,
+  TensormapCreateOpConversion(LLVMTypeConverter &converter,
                                           const NVIDIA::TargetInfo &targetInfo,
                                           PatternBenefit benefit)
       : ConvertOpToLLVMPattern(converter, benefit), targetInfo(targetInfo) {}
 
   LogicalResult
-  matchAndRewrite(triton::ExperimentalTensormapCreateOp op, OpAdaptor adaptor,
+  matchAndRewrite(triton::TensormapCreateOp op, OpAdaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
     Location loc = op->getLoc();
     auto b = TritonLLVMOpBuilder(loc, rewriter);
@@ -328,7 +328,7 @@ struct TensorDescToTMAPtrOpConversion
 void mlir::triton::NVIDIA::populateTMAToLLVMPatterns(
     LLVMTypeConverter &typeConverter, const TargetInfo &targetInfo,
     RewritePatternSet &patterns, PatternBenefit benefit) {
-  patterns.add<ExperimentalTensormapCreateOpConversion>(typeConverter,
+  patterns.add<TensormapCreateOpConversion>(typeConverter,
                                                         targetInfo, benefit);
   patterns
       .add<ExperimentalTensormapFenceproxyAcquireOpConversion,

Original file line number	Diff line number	Diff line change
`@@ -1362,7 +1362,7 @@ def TT_DescriptorScatterOp : TT_Op<"descriptor_scatter", [`
`1362`	`1362`	`}];`
`1363`	`1363`	`}`
`1364`	`1364`
`1365`		`-def TT_ExperimentalTensormapCreateOp: TT_Op<`
	`1365`	`+def TT_TensormapCreateOp: TT_Op<`
`1366`	`1366`	`"experimental_tensormap_create",`
`1367`	`1367`	`[`
`1368`	`1368`	`MemoryEffects<[MemRead<GlobalMemory>, MemWrite<GlobalMemory>]>,`
Original file line number	Diff line number	Diff line change
`@@ -222,7 +222,7 @@ mlir::LogicalResult createTMADesc(mlir::Value tmaPtr,`
`222`	`222`	`}`
`223`	`223`	`}`
`224`	`224`
`225`		`- builder.template create<triton::ExperimentalTensormapCreateOp>(`
	`225`	`+ builder.template create<triton::TensormapCreateOp>(`
`226`	`226`	`loc,`
`227`	`227`	`/*desc_ptr=*/tmaPtr,`
`228`	`228`	`/*global_address=*/op.getBase(),`
Original file line number	Diff line number	Diff line change
`@@ -210,7 +210,7 @@ unsigned defaultAllocationAnalysisScratchSizeFn(Operation *op) {`
`210`	`210`	`assert(!isa<PointerType>(elemTy) && "unexpected pointer type");`
`211`	`211`	`return elems * std::max<int>(8, elemTy.getIntOrFloatBitWidth()) / 8;`
`212`	`212`	`}`
`213`		`- if (isa<ExperimentalTensormapCreateOp>(op)) {`
	`213`	`+ if (isa<TensormapCreateOp>(op)) {`
`214`	`214`	`constexpr int32_t kTMASize = 128;`
`215`	`215`	`return kTMASize;`
`216`	`216`	`}`