Skip to content

Commit 3262d4c

Browse files
committed
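Experimental to None (drop the `Experimental` prefix from Triton op class names, e.g. `ExperimentalTensormapCreateOp` → `TensormapCreateOp`, `ExperimentalDescriptorLoadOp` → `DescriptorLoadOp`)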
1 parent c16b855 commit 3262d4c

File tree

12 files changed

+33
-33
lines changed

12 files changed

+33
-33
lines changed

include/triton/Dialect/Triton/IR/TritonOps.td

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1362,7 +1362,7 @@ def TT_DescriptorScatterOp : TT_Op<"descriptor_scatter", [
13621362
}];
13631363
}
13641364

1365-
def TT_ExperimentalTensormapCreateOp: TT_Op<
1365+
def TT_TensormapCreateOp: TT_Op<
13661366
"experimental_tensormap_create",
13671367
[
13681368
MemoryEffects<[MemRead<GlobalMemory>, MemWrite<GlobalMemory>]>,

include/triton/Dialect/TritonNvidiaGPU/Transforms/TMAUtilities.h

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -222,7 +222,7 @@ mlir::LogicalResult createTMADesc(mlir::Value tmaPtr,
222222
}
223223
}
224224

225-
builder.template create<triton::ExperimentalTensormapCreateOp>(
225+
builder.template create<triton::TensormapCreateOp>(
226226
loc,
227227
/*desc_ptr=*/tmaPtr,
228228
/*global_address=*/op.getBase(),

lib/Analysis/Allocation.cpp

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -210,7 +210,7 @@ unsigned defaultAllocationAnalysisScratchSizeFn(Operation *op) {
210210
assert(!isa<PointerType>(elemTy) && "unexpected pointer type");
211211
return elems * std::max<int>(8, elemTy.getIntOrFloatBitWidth()) / 8;
212212
}
213-
if (isa<ExperimentalTensormapCreateOp>(op)) {
213+
if (isa<TensormapCreateOp>(op)) {
214214
constexpr int32_t kTMASize = 128;
215215
return kTMASize;
216216
}

lib/Conversion/TritonToTritonGPU/TritonToTritonGPUPass.cpp

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -629,7 +629,7 @@ void populateTritonPatterns(TritonGPUTypeConverter &typeConverter,
629629
GenericOpPattern<triton::AtomicRMWOp>, GenericOpPattern<ReturnOp>,
630630
GenericOpPattern<triton::DescriptorLoadOp>,
631631
GenericOpPattern<triton::DescriptorStoreOp>,
632-
GenericOpPattern<triton::ExperimentalTensormapCreateOp>,
632+
GenericOpPattern<triton::TensormapCreateOp>,
633633
GenericOpPattern<triton::ExperimentalTensormapFenceproxyAcquireOp>,
634634
// this assumes the right layout will be set later for dot scaled.
635635
GenericOpPattern<triton::DotScaledOp>, GenericOpPattern<triton::CallOp>,

lib/Dialect/Triton/IR/Ops.cpp

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1341,8 +1341,8 @@ LogicalResult DescriptorStoreOp::verify() {
13411341
getSrc().getType());
13421342
}
13431343

1344-
// -- ExperimentalTensormapCreateOp --
1345-
LogicalResult ExperimentalTensormapCreateOp::verify() {
1344+
// -- TensormapCreateOp --
1345+
LogicalResult TensormapCreateOp::verify() {
13461346
auto rank = getBoxDim().size();
13471347
if (getGlobalDim().size() != rank) {
13481348
return emitError("Rank mismatch for global dim. Got ")

lib/Dialect/TritonGPU/Transforms/TaskIdPropagate.cpp

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -193,7 +193,7 @@ bool verifyTaskId(triton::FuncOp &funcOp,
193193
}
194194

195195
auto partitionShouldBeUsedSpecified = [](Operation *op) {
196-
if (isa<StoreOp, ExperimentalDescriptorLoadOp>(op))
196+
if (isa<StoreOp, DescriptorLoadOp>(op))
197197
return true;
198198
if (isa<AtomicRMWOp, AtomicCASOp>(op))
199199
return true;
@@ -218,7 +218,7 @@ bool verifyTaskId(triton::FuncOp &funcOp,
218218
Operation *defOp = operand.getDefiningOp();
219219
if (!defOp)
220220
continue;
221-
if (llvm::isa<tt::LoadOp, tt::ExperimentalDescriptorLoadOp>(defOp))
221+
if (llvm::isa<tt::LoadOp, tt::DescriptorLoadOp>(defOp))
222222
continue;
223223
auto defTaskIds = getAsyncTaskIds(defOp);
224224
// Make sure defTaskIds cover asyncTaskIds. Call addAsyncTaskIds if

lib/Dialect/TritonGPU/Transforms/WSCodePartition.cpp

Lines changed: 10 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -671,7 +671,7 @@ void getTransitiveUsers(Value root,
671671
void collectAsyncChannels(SmallVector<std::unique_ptr<Channel>> &channels,
672672
triton::FuncOp &funcOp, unsigned numBuffers) {
673673
funcOp.walk([&](Operation *op) {
674-
if (isa<tt::LoadOp, tt::ExperimentalDescriptorLoadOp>(op) ||
674+
if (isa<tt::LoadOp, tt::DescriptorLoadOp>(op) ||
675675
isa<mlir::triton::DotOpInterface>(op)) {
676676
auto producerTaskIds = getAsyncTaskIds(op);
677677
if (producerTaskIds.empty() || producerTaskIds.size() > 1) {
@@ -1611,7 +1611,7 @@ DenseMap<Channel *, DenseMap<int, Value>> createToken(
16111611
auto copyOp = copyOpMap.find(channel)->second.first;
16121612
if (isa<ttg::AsyncCopyGlobalToLocalOp>(copyOp)) {
16131613
tokenLoadType = ttng::TokenLoadType::AsyncLoadOp;
1614-
} else if (isa<ExperimentalDescriptorLoadOp>(copyOp)) {
1614+
} else if (isa<DescriptorLoadOp>(copyOp)) {
16151615
tokenLoadType = ttng::TokenLoadType::TMALoadOp;
16161616
} else if (isa<LocalStoreOp>(copyOp)) {
16171617
tokenLoadType = ttng::TokenLoadType::LocalStoreOp;
@@ -1636,7 +1636,7 @@ DenseMap<Channel *, DenseMap<int, Value>> createToken(
16361636
}
16371637

16381638
auto producerOp = it->second.front()->getSrcOp();
1639-
if (isa<tt::ExperimentalDescriptorLoadOp>(producerOp)) {
1639+
if (isa<tt::DescriptorLoadOp>(producerOp)) {
16401640
Value bAlloc = createBarrierAlloc(funcOp, channel->numBuffers);
16411641
// Channels in the group share the same set of tokens.
16421642
for (auto &c : it->second) {
@@ -1863,7 +1863,7 @@ createLocalCopy(const DenseMap<Channel *, Value> &bufferMap, Channel *channel,
18631863
return {copy, sharedLoad};
18641864
}
18651865

1866-
static int getTMALoadSize(tt::ExperimentalDescriptorLoadOp &tmaLoad) {
1866+
static int getTMALoadSize(tt::DescriptorLoadOp &tmaLoad) {
18671867
auto tensorTy = cast<RankedTensorType>(tmaLoad->getResult(0).getType());
18681868
int loadSize = product(tensorTy.getShape());
18691869
return loadSize * tensorTy.getElementType().getIntOrFloatBitWidth() / 8;
@@ -1921,7 +1921,7 @@ Value getBufferForPipelineStage(OpBuilderWithAsyncTaskIds &builder,
19211921

19221922
Operation *
19231923
optimizeTMALoads(OpBuilderWithAsyncTaskIds &builder,
1924-
SmallVector<tt::ExperimentalDescriptorLoadOp> &tmaLoads,
1924+
SmallVector<tt::DescriptorLoadOp> &tmaLoads,
19251925
SmallVector<Value> &buffers, Value barrierAlloc,
19261926
Value bufferIdx, Value bufferIdxExtract, Value phase,
19271927
Operation *headProducer, Operation *headConsumer) {
@@ -2168,7 +2168,7 @@ void insertAsyncComm(
21682168

21692169
// Insert ProducerCommitOp if producer is LoadOp. For TMA, TMA lowering
21702170
// will handle the ProducerCommit.
2171-
if (!isa<tt::ExperimentalDescriptorLoadOp>(headProducer)) {
2171+
if (!isa<tt::DescriptorLoadOp>(headProducer)) {
21722172
builder.setInsertionPointAfter(tailProducer);
21732173
builder.createWithAsyncTaskIds<ttng::ProducerCommitOp>(
21742174
tailProducer->getLoc(), token.second, bufferIdx);
@@ -2178,7 +2178,7 @@ void insertAsyncComm(
21782178
for (auto token : tokens) {
21792179
builder.setAsynTaskIdsFromArray(token.first);
21802180
// Insert ConsumerWaitOp
2181-
if (!isa<tt::ExperimentalDescriptorLoadOp>(headProducer)) {
2181+
if (!isa<tt::DescriptorLoadOp>(headProducer)) {
21822182
auto consumerWaitPoint = getSameLevelOp(headProducer, headConsumer);
21832183
builder.setInsertionPoint(consumerWaitPoint);
21842184
builder.createWithAsyncTaskIds<ttng::ConsumerWaitOp>(
@@ -2193,13 +2193,13 @@ void insertAsyncComm(
21932193
consumerReleasePoint->getLoc(), token.second, bufferIdx);
21942194
}
21952195

2196-
SmallVector<tt::ExperimentalDescriptorLoadOp> tmaLoads;
2196+
SmallVector<tt::DescriptorLoadOp> tmaLoads;
21972197
SmallVector<Value> buffers;
21982198
DenseMap<Operation *, Operation *> producerCopyMap;
21992199
// Go through all channels in this channel group.
22002200
for (auto &c : kv.second) {
22012201
if (auto tmaLoad =
2202-
dyn_cast<tt::ExperimentalDescriptorLoadOp>(c->getSrcOp())) {
2202+
dyn_cast<tt::DescriptorLoadOp>(c->getSrcOp())) {
22032203
tmaLoads.push_back(tmaLoad);
22042204
buffers.push_back(bufferMap.find(c)->second);
22052205
}
@@ -2278,7 +2278,7 @@ void insertAsyncCopy(
22782278

22792279
// No need to create async copy for TMA load which will be handled in
22802280
// insertAsyncComm.
2281-
if (isa<tt::ExperimentalDescriptorLoadOp>(srcOp)) {
2281+
if (isa<tt::DescriptorLoadOp>(srcOp)) {
22822282
producerConsumerOps = {srcOp, domininatingChannel->getDstOp()};
22832283
} else if (isa<triton::LoadOp>(srcOp)) {
22842284
SmallVector<AsyncTaskId> asyncTasksPC = getAsyncTaskIds(srcOp);

lib/Dialect/TritonGPU/Transforms/WSDataPartition.cpp

Lines changed: 5 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -58,7 +58,7 @@ void fixTaskId(triton::FuncOp &funcOp) {
5858
if (!defOp)
5959
continue;
6060
// Do not update loads.
61-
if (isa<tt::LoadOp, tt::ExperimentalDescriptorLoadOp>(defOp))
61+
if (isa<tt::LoadOp, tt::DescriptorLoadOp>(defOp))
6262
continue;
6363
auto defTaskIds = getAsyncTaskIds(defOp);
6464
// Make sure defTaskIds cover asyncTaskIds. Call addAsyncTaskIds if
@@ -131,7 +131,7 @@ void getBackwardSliceToPartition(Value root, unsigned dim, int sliceSize,
131131
isa<arith::ConstantOp, arith::ExtSIOp, arith::ExtUIOp,
132132
arith::ExtFOp, BroadcastOp, ExpandDimsOp, MakeRangeOp, SplatOp,
133133
ConvertLayoutOp, triton::gpu::LocalAllocOp, LoadOp,
134-
ExperimentalDescriptorLoadOp, nvidia_gpu::TMEMAllocOp,
134+
DescriptorLoadOp, nvidia_gpu::TMEMAllocOp,
135135
nvidia_gpu::TMEMLoadOp>(op)) {
136136
for (Value operand : op->getOperands())
137137
queue.push_back(operand);
@@ -592,11 +592,11 @@ Operation *sliceOp(Operation *op, int offset, IRMapping &mappings,
592592
sliceOp(operand, offset, mappings, reverseMappings, partitionScheme);
593593
// TODO: slice store base ptr
594594
newOp = cloneAndSetResultType(op);
595-
} else if (isa<ExperimentalDescriptorLoadOp, ExperimentalDescriptorStoreOp>(
595+
} else if (isa<DescriptorLoadOp, ExperimentalDescriptorStoreOp>(
596596
op)) {
597597
SmallVector<int64_t> shape;
598598
Value coordVal;
599-
if (auto loadOp = dyn_cast<ExperimentalDescriptorLoadOp>(op)) {
599+
if (auto loadOp = dyn_cast<DescriptorLoadOp>(op)) {
600600
sliceOp(loadOp.getDesc(), offset, mappings, reverseMappings,
601601
partitionScheme);
602602
coordVal = loadOp.getIndices()[dim];
@@ -619,7 +619,7 @@ Operation *sliceOp(Operation *op, int offset, IRMapping &mappings,
619619
}
620620

621621
newOp = cloneAndSetResultType(op);
622-
if (isa<ExperimentalDescriptorLoadOp>(op)) {
622+
if (isa<DescriptorLoadOp>(op)) {
623623
// map load result
624624
auto v = op->getResult(0);
625625
auto newV = newOp->getResult(0);

lib/Dialect/TritonGPU/Transforms/WSTaskPartition.cpp

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -62,7 +62,7 @@ void doPartition(triton::FuncOp &funcOp, unsigned numConsumerGroups) {
6262
loops.push_back(forOp);
6363
else if (isa<nvidia_gpu::WarpGroupDotOp>(op))
6464
dots.push_back(op);
65-
else if (isa<triton::LoadOp, ExperimentalDescriptorLoadOp>(op))
65+
else if (isa<triton::LoadOp, DescriptorLoadOp>(op))
6666
loads.push_back(op);
6767
});
6868

@@ -100,7 +100,7 @@ void doPartition(triton::FuncOp &funcOp, unsigned numConsumerGroups) {
100100
getBackwardSlice(dotOp.getA(), &backwardSlice, opt);
101101
getBackwardSlice(dotOp.getB(), &backwardSlice, opt);
102102
for (auto depOp : backwardSlice) {
103-
if (isa<ExperimentalDescriptorLoadOp>(depOp)) {
103+
if (isa<DescriptorLoadOp>(depOp)) {
104104
producerOps.insert(depOp);
105105
} else if (isa<triton::LoadOp>(depOp) && isExpensiveLoadOrStore(depOp)) {
106106
producerOps.insert(depOp);

lib/Dialect/TritonNvidiaGPU/Transforms/TMALowering.cpp

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -111,7 +111,7 @@ class TMALoadLowering : public OpRewritePattern<DescriptorLoadOp> {
111111
public:
112112
using OpRewritePattern::OpRewritePattern;
113113

114-
LogicalResult matchAndRewrite(ExperimentalDescriptorLoadOp op,
114+
LogicalResult matchAndRewrite(DescriptorLoadOp op,
115115
PatternRewriter &baseRewriter) const override {
116116
PatternRewriterWithAsyncTaskIds rewriter(baseRewriter, op);
117117
auto createLoad = [&](Value tmaPtr, Value barrierAlloc, Value alloc,
@@ -131,7 +131,7 @@ class TMALoadLowering : public OpRewritePattern<DescriptorLoadOp> {
131131
struct TMAGatherLowering : public OpRewritePattern<DescriptorGatherOp> {
132132
using OpRewritePattern::OpRewritePattern;
133133

134-
LogicalResult matchAndRewrite(ExperimentalDescriptorGatherOp op,
134+
LogicalResult matchAndRewrite(DescriptorGatherOp op,
135135
PatternRewriter &baseRewriter) const override {
136136
PatternRewriterWithAsyncTaskIds rewriter(baseRewriter, op);
137137
auto createLoad = [&](Value tmaPtr, Value barrierAlloc, Value alloc,

0 commit comments

Comments
 (0)