Commit da26bee

Merge OpenAI Triton commit 7a3247b (#3804)

This PR changes the Triton base from f4e780c to 7a3247b (Mar 31). Pass rate: 90.48% -> 90.38% (#3821)

2 parents 6a8daa1 + 529dff1, commit da26bee

18 files changed: +191 -165 lines changed
.github/workflows/create_release.yml

Lines changed: 76 additions & 0 deletions

@@ -0,0 +1,76 @@
+name: Create Release
+
+on:
+  push:
+    branches:
+      - main
+      - release/*
+    tags:
+      # Final Release tags look like: v1.11.0
+      - v[0-9]+.[0-9]+.[0-9]+
+      # Release candidate tags look like: v1.11.0-rc1
+      - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+
+  release:
+    types: [published]
+  pull_request:
+    paths: [.github/workflows/create_release.yml]
+
+jobs:
+
+  release:
+    if: ${{ github.repository == 'triton-lang/triton' }}
+    name: Create Release
+    runs-on: ubuntu-latest
+    permissions:
+      contents: write
+    outputs:
+      release_name: "${{ steps.release_name.outputs.name }}"
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          show-progress: false
+          submodules: 'recursive'
+          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      - name: Fake name for PRs
+        if: ${{ github.event_name == 'pull_request' }}
+        run: echo "PT_GITHUB_REF=refs/tags/pr-tag" >> "$GITHUB_ENV"
+      - name: Real name for non-PRs
+        if: ${{ github.event_name != 'pull_request' }}
+        run: echo "PT_GITHUB_REF=$GITHUB_REF" >> "$GITHUB_ENV"
+      - name: Set filenames
+        run: |
+          tag_or_branch="${PT_GITHUB_REF#refs/tags/}"
+          tag_or_branch="${tag_or_branch#refs/heads/}"
+          # replace directory separators with _ in branch name
+          tag_or_branch="${tag_or_branch//\//_}"
+          echo "RELEASE_NAME=triton-$tag_or_branch" >> "$GITHUB_ENV"
+          echo "RELEASE_FILE=triton-$tag_or_branch.tar.gz" >> "$GITHUB_ENV"
+      - name: Create source distribution
+        run: |
+          # Create new folder with specified name so extracting the archive yields that
+          rm -rf "/tmp/$RELEASE_NAME"
+          cp -r "$PWD" "/tmp/$RELEASE_NAME"
+          mv "/tmp/$RELEASE_NAME" .
+          # Cleanup
+          find "$RELEASE_NAME" -name '.git*' -exec rm -rv {} \; || true
+          # Create archive
+          tar -czf "$RELEASE_FILE" "$RELEASE_NAME"
+          echo "Created source archive $RELEASE_FILE with content: $(ls -a "$RELEASE_NAME")"
+      - name: Upload source distribution for release
+        if: ${{ github.event_name == 'release' }}
+        uses: softprops/action-gh-release@v2
+        with:
+          files: ${{env.RELEASE_FILE}}
+      - name: Upload source distribution to GHA artifacts for release tags
+        if: ${{ github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') && contains(github.ref, 'rc') }}
+        uses: actions/[email protected]
+        with:
+          name: ${{ env.RELEASE_FILE }}
+          path: ${{ env.RELEASE_FILE }}
+      - name: Set output
+        id: release_name
+        run: echo "name=release_name::${{ env.RELEASE_NAME }}.tar.gz" >> "${GITHUB_OUTPUT}"
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name }}
+  cancel-in-progress: true

cmake/llvm-hash.txt

Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-71a977d0d611f3e9f6137a6b8a26b730b2886ce9
+1d4801f22ab1fd6205b1cf625b690aefc554cd4c

include/triton/Dialect/TritonGPU/IR/Dialect.h

Lines changed: 4 additions & 0 deletions
@@ -270,6 +270,10 @@ llvm::SmallVector<T> expandMatrixShapeWithBatch(llvm::ArrayRef<T> s);
 
 llvm::SmallVector<unsigned>
 expandMatrixOrderWithBatch(llvm::ArrayRef<unsigned> o);
+
+// Return true if the two layouts represent the exact same mapping.
+bool areLayoutsEquivalent(ArrayRef<int64_t> shape, Attribute lhs,
+                          Attribute rhs);
 } // namespace mlir::triton::gpu
 
 #endif // TRITON_DIALECT_TRITONGPU_IR_DIALECT_H_

include/triton/Tools/Sys/GetEnv.hpp

Lines changed: 0 additions & 1 deletion
@@ -43,7 +43,6 @@ inline const std::set<std::string> CACHE_INVALIDATING_ENV_VARS = {
     "NVPTX_ENABLE_DUMP",
     "STORE_TMEM_TO_GLOBAL_BYPASS_SMEM",
     "ALLOW_LHS_TMEM_LAYOUT_CONVERSION",
-    "ENABLE_LHS_TO_TMEM",
     "TRITON_F32_DEFAULT",
     "ENABLE_MMA_V5_ATT_PIPELINE",
     "TRITON_INTEL_ADVANCED_PATH",

lib/Dialect/TritonGPU/IR/Dialect.cpp

Lines changed: 9 additions & 3 deletions
@@ -2469,10 +2469,9 @@ struct TritonGPUInferLayoutInterface
     }
     if (!expected || !got)
       return failure();
+
     // Check whether the encodings are structurally the same.
-    auto expectedLL = triton::gpu::toLinearLayout(shape, expected);
-    auto gotLL = triton::gpu::toLinearLayout(shape, got);
-    if (expectedLL != gotLL) {
+    if (!areLayoutsEquivalent(shape, expected, got)) {
       return emitOptionalError(loc, "Expected result encoding ", expected,
                                " but was ", got);
     }
@@ -3208,3 +3207,10 @@ int triton::gpu::lookupThreadsPerWarp(OpBuilder &rewriter) {
   assert(op && "cannot create thread ID outside of module");
   return triton::gpu::TritonGPUDialect::getThreadsPerWarp(cast<ModuleOp>(op));
 }
+
+bool triton::gpu::areLayoutsEquivalent(ArrayRef<int64_t> shape, Attribute lhs,
+                                       Attribute rhs) {
+  auto lhsLL = triton::gpu::toLinearLayout(shape, lhs);
+  auto rhsLL = triton::gpu::toLinearLayout(shape, rhs);
+  return lhsLL == rhsLL;
+}
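
A minimal usage sketch of the new helper, assuming a caller that already has a shape and two encoding attributes in hand; the wrapper function below and its name are illustrative only, not part of this diff:

// Illustrative sketch only: `checkEncodingsMatch` is not part of this commit.
#include "mlir/IR/Diagnostics.h"
#include "triton/Dialect/TritonGPU/IR/Dialect.h"

static mlir::LogicalResult checkEncodingsMatch(mlir::Location loc,
                                               llvm::ArrayRef<int64_t> shape,
                                               mlir::Attribute expected,
                                               mlir::Attribute got) {
  // areLayoutsEquivalent lowers both encodings to linear layouts and compares
  // the resulting mappings, so two attributes that spell the same mapping
  // differently still compare equal.
  if (!mlir::triton::gpu::areLayoutsEquivalent(shape, expected, got)) {
    mlir::emitError(loc) << "expected encoding " << expected << " but got "
                         << got;
    return mlir::failure();
  }
  return mlir::success();
}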

lib/Dialect/TritonGPU/Transforms/WarpSpecialization/AutomaticWarpSpecialization.cpp

Lines changed: 3 additions & 3 deletions
@@ -35,9 +35,9 @@ void AutomaticWarpSpecialization::runOnOperation() {
   OpPassManager pm;
   pm.addPass(createTritonGPULoadMMASpecialization({numStages}));
   pm.addPass(createTritonGPURewritePartitionDependencies());
-  // `int-range-optimizations` and SCCP are good at cleaning up loop arithmetic.
-  // FIXME: Re-enable integer range analysis once it is fixed.
-  // pm.addPass(arith::createIntRangeOptimizationsPass());
+  // `int-range-optimizations` is good at cleaning up loop arithmetic involving
+  // circular buffers.
+  pm.addPass(arith::createIntRangeOptimizationsPass());
   pm.addPass(createSCCPPass());
   pm.addPass(createCSEPass());
   pm.addPass(createTritonGPUPartitionLoops());

lib/Dialect/TritonNvidiaGPU/IR/Dialect.cpp

Lines changed: 12 additions & 55 deletions
@@ -171,69 +171,26 @@ bool isDistributedLayoutTMemCompatible(Operation *op,
   int numWarps = lookupNumWarps(op);
   assert(numWarps % 4 == 0);
   int numWarpGroups = numWarps / 4;
-
-  int blockM = 0;
-  int blockN = 0;
-  bool scalesEncoding = false;
-  if (auto attr = dyn_cast<triton::nvidia_gpu::TensorMemoryEncodingAttr>(
+  if (isa<triton::nvidia_gpu::TensorMemoryScalesEncodingAttr>(
           memType.getEncoding())) {
-    blockM = attr.getBlockM();
-    blockN = attr.getBlockN();
-  } else {
-    assert(isa<triton::nvidia_gpu::TensorMemoryScalesEncodingAttr>(
-               memType.getEncoding()) &&
-           "Expecting a tensor memory encoding attribute");
     return tensorType.getEncoding() ==
            triton::gpu::LinearEncodingAttr::get(
                tensorType.getContext(),
                getScaleTMEMStoreLinearLayout(tensorType, numWarps));
   }
+  auto attr =
+      cast<triton::nvidia_gpu::TensorMemoryEncodingAttr>(memType.getEncoding());
+  int blockM = attr.getBlockM();
+  int blockN = attr.getBlockN();
   if (isDistributedLayoutSplitMTmemLoadStore(tensorType, memType, numWarps))
     return true;
-  auto shapePerCTA = mlir::triton::gpu::getShapePerCTA(tensorType);
-  int numElements = product(shapePerCTA);
-  int numBlocks = ceil<int>(numElements, blockM * blockN);
-  bool useStridedMessage = blockM == 64;
-
-  int numWarpGroupsPerBlock = ceil<int>(numWarpGroups, numBlocks);
-
-  auto tensorEncoding =
-      cast<triton::gpu::BlockedEncodingAttr>(tensorType.getEncoding());
-  auto sizePerThread = tensorEncoding.getSizePerThread();
-  auto threadsPerWarp = tensorEncoding.getThreadsPerWarp();
-  auto warpsPerCTA = tensorEncoding.getWarpsPerCTA();
-  auto order = tensorEncoding.getOrder();
-
-  if (order.size() != 2)
-    return false;
-
-  if (order[0] != 0 || order[1] != 1)
-    return false;
-
-  if (useStridedMessage) {
-    // For blockM=64 we need to use 16x32bx2 message, meaning the distributed
-    // layout needs to be organized into 16x2 threads per warp and one row
-    // access per thread.
-    if (threadsPerWarp[0] != 16 || threadsPerWarp[1] != 2 ||
-        sizePerThread[0] != 1)
-      return false;
-
-    if (numBlocks == 1) {
-      // with blockM=64 and just single block we cannot split along the M
-      // dimension. Check that if we split, we split along N.
-      if (numWarpGroupsPerBlock > 1) {
-        if (warpsPerCTA[1] == 1)
-          return false;
-      }
-    }
-  } else {
-    // For blockM=128, we need to use a 32x32b message, which requires 32
-    // threads to be sequentially ordered across the M dimension, ensuring
-    // that each thread accesses a single and unique TMEM datapath.
-    if (threadsPerWarp[0] != 32 || sizePerThread[0] != 1)
-      return false;
-  }
-  return true;
+  Attribute layout =
+      nvidia_gpu::getTmemCompatibleLayout(blockM, blockN, tensorType, numWarps);
+  // TODO: Add support for more layouts compatible with tmem load/store. There
+  // will only be a discrete set of layouts possible due to the limitations of
+  // tmem_load/store.
+  return areLayoutsEquivalent(tensorType.getShape(), layout,
+                              tensorType.getEncoding());
 }
 
 } // namespace nvidia_gpu
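
The rewritten check above drops the hand-rolled 16x32bx2 / 32x32b message rules in favor of comparing against the canonical TMEM-compatible layout. A hedged sketch of that pattern, assuming the `getTmemCompatibleLayout` and `areLayoutsEquivalent` helpers referenced in the diff (the standalone wrapper and its exact parameter order are illustrative, not part of this commit):

// Illustrative sketch only: mirrors the comparison pattern used above.
static bool matchesCanonicalTmemLayout(mlir::RankedTensorType tensorType,
                                       int blockM, int blockN, int numWarps) {
  // Ask the dialect for the distributed layout it considers canonical for
  // this TMEM block shape and warp count...
  mlir::Attribute layout = mlir::triton::nvidia_gpu::getTmemCompatibleLayout(
      blockM, blockN, tensorType, numWarps);
  // ...then accept the tensor's encoding only if it describes the exact same
  // element mapping, instead of re-deriving the message-shape rules by hand.
  return mlir::triton::gpu::areLayoutsEquivalent(
      tensorType.getShape(), layout, tensorType.getEncoding());
}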

lib/Dialect/TritonNvidiaGPU/Transforms/PromoteLHSToTMem.cpp

Lines changed: 16 additions & 17 deletions
@@ -52,9 +52,23 @@ template <class MMAOpTy> class LHSToTMem : public OpRewritePattern<MMAOpTy> {
       return failure();
     Value src = localAllocOp.getSrc();
     auto srcType = cast<RankedTensorType>(src.getType());
-    auto srcLayout = cast<ttg::BlockedEncodingAttr>(srcType.getEncoding());
+    auto srcLayout = srcType.getEncoding();
+    auto accTMemEncoding = dyn_cast<ttng::TensorMemoryEncodingAttr>(
+        tcGen5MMAOp.getD().getType().getEncoding());
+    ArrayRef<unsigned> CTASplitNum =
+        triton::gpu::getCTALayout(srcLayout).getCTASplitNum();
+    // TMem encoding for A operand is the same as for D (Acc), but packed.
+    auto aTMemEncoding = ttng::TensorMemoryEncodingAttr::get(
+        context, accTMemEncoding.getBlockM(), lhs.getType().getShape()[1],
+        /*unpacked=*/false, CTASplitNum[0], CTASplitNum[1]);
+    Attribute tensorMemorySpace =
+        triton::nvidia_gpu::TensorMemorySpaceAttr::get(context);
+    ttg::MemDescType lhsMemDescType = ttg::MemDescType::get(
+        lhs.getType().getShape(), lhs.getType().getElementType(), aTMemEncoding,
+        tensorMemorySpace,
+        /*mutableMemory=*/false);
     bool layoutTmemCompatible = ttng::isDistributedLayoutTMemCompatible(
-        tcGen5MMAOp, srcType, tcGen5MMAOp.getD().getType());
+        tcGen5MMAOp, srcType, lhsMemDescType);
     Attribute newLayout = srcLayout;
     if (!layoutTmemCompatible) {
       if (triton::tools::getBoolEnv("ALLOW_LHS_TMEM_LAYOUT_CONVERSION")) {
@@ -70,19 +84,6 @@ template <class MMAOpTy> class LHSToTMem : public OpRewritePattern<MMAOpTy> {
           RankedTensorType::get(ty.getShape(), ty.getElementType(), newLayout);
       src = rewriter.create<ttg::ConvertLayoutOp>(loc, newTy, src);
     }
-    auto accTMemEncoding = dyn_cast<ttng::TensorMemoryEncodingAttr>(
-        tcGen5MMAOp.getD().getType().getEncoding());
-    ArrayRef<unsigned> CTASplitNum = srcLayout.getCTALayout().getCTASplitNum();
-    // TMem encoding for A operand is the same as for D (Acc), but unpacked.
-    auto aTMemEncoding = ttng::TensorMemoryEncodingAttr::get(
-        context, accTMemEncoding.getBlockM(), lhs.getType().getShape()[1],
-        /*unpacked=*/false, CTASplitNum[0], CTASplitNum[1]);
-    Attribute tensorMemorySpace =
-        triton::nvidia_gpu::TensorMemorySpaceAttr::get(context);
-    Type lhsMemDescType = triton::gpu::MemDescType::get(
-        lhs.getType().getShape(), lhs.getType().getElementType(), aTMemEncoding,
-        tensorMemorySpace,
-        /*mutableMemory=*/false);
     Value tMemAlloc =
         rewriter.create<ttng::TMEMAllocOp>(loc, lhsMemDescType, src);
     tcGen5MMAOp.getAMutable().assign(tMemAlloc);
@@ -100,8 +101,6 @@ class TritonNvidiaGPUPromoteLHSToTMemPass
       TritonNvidiaGPUPromoteLHSToTMemPassBase;
 
   void runOnOperation() override {
-    if (!triton::tools::getBoolEnv("ENABLE_LHS_TO_TMEM"))
-      return;
     MLIRContext *context = &getContext();
     ModuleOp m = getOperation();
