
Commit 8ef63b9

Merge branch 'main' into buffer_load_base_opt
2 parents 8733bda + de8e715

130 files changed: +7185 −5664 lines

(Large commit: some diffs are hidden by default, so only a subset of the 130 changed files appears below.)

.github/workflows/llvm-build.yml

Lines changed: 15 additions & 8 deletions
@@ -9,6 +9,8 @@ on:
   pull_request:
     paths:
       - .github/workflows/llvm-build.yml
+      - .github/workflows/llvm-build/almalinux.Dockerfile
+      - .github/workflows/llvm-build/centos.Dockerfile
   workflow_dispatch:
 
 env:
@@ -135,6 +137,7 @@ jobs:
           -DLLVM_INSTALL_UTILS=ON
           -DLLVM_TARGETS_TO_BUILD="host;NVPTX;AMDGPU"
           -DLLVM_ENABLE_TERMINFO=OFF
+          -DLLVM_ENABLE_ZSTD=OFF
           llvm-project/llvm
 
         ninja -C llvm-project/build check-mlir install
@@ -237,7 +240,11 @@ jobs:
       run: |
         # if this step crashes, it can leave behind a stale docker container
         docker container prune -f
-        docker rmi -f $(docker images -q)
+
+        images=$(docker images -q)
+        if [ -n "$images" ]; then
+          docker rmi -f $images
+        fi
 
         docker build --tag llvm-build --build-arg llvm_dir=llvm-project \
           -f llvm-build/.github/workflows/llvm-build/almalinux.Dockerfile .
@@ -264,16 +271,16 @@ jobs:
         path: |
           ${{ github.workspace }}/llvm-*-${{ matrix.config.target-os }}-${{ matrix.config.arch }}.tar.gz
 
-      - name: Azure Login
-        if: ${{ (github.repository == 'triton-lang/triton') }}
+      - name: Azure login
+        if: ${{ (github.repository == 'triton-lang/triton') && github.ref_name == 'llvm-head' }}
         uses: azure/login@v2
         with:
-          client-id: ${{ secrets.AZURE_CLIENT_ID }}
-          tenant-id: ${{ secrets.AZURE_TENANT_ID }}
-          subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }}
+          client-id: ${{ secrets.AZURE_CLIENT_ID_LLVM }}
+          tenant-id: ${{ secrets.AZURE_TENANT_ID_LLVM }}
+          subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID_LLVM }}
 
       - name: Upload LLVM Artifacts to Azure
-        if: ${{ (github.repository == 'triton-lang/triton') }}
+        if: ${{ (github.repository == 'triton-lang/triton') && github.ref_name == 'llvm-head' }}
         shell: bash -el {0}
         run: |
           az storage blob upload --account-name oaitriton --auth-mode login --container-name public --file "${{ env.llvm_install_dir }}.tar.gz" --name "llvm-builds/${{ env.llvm_install_dir }}.tar.gz" --overwrite
@@ -282,7 +289,7 @@ jobs:
           echo "Blob URL: ${URL}"
 
       - name: Azure Logout
-        if: ${{ (github.repository == 'triton-lang/triton') }}
+        if: ${{ (github.repository == 'triton-lang/triton') && github.ref_name == 'llvm-head' }}
         run: |
           az logout
           az cache purge
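Two notes on the behavior captured by this diff: `docker rmi -f $(docker images -q)` fails outright when `docker images -q` prints nothing, because `docker rmi` requires at least one argument, so capturing the list and guarding it with `[ -n "$images" ]` turns the cleanup into a no-op on an already-clean runner. The `github.ref_name == 'llvm-head'` conjunct added to the Azure login, upload, and logout steps restricts uploads to builds of the `llvm-head` branch, and the credentials move to the dedicated `*_LLVM` secrets.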

.github/workflows/llvm-build/almalinux.Dockerfile

Lines changed: 6 additions & 1 deletion
@@ -1,4 +1,5 @@
-FROM almalinux:8
+# https://github.com/AlmaLinux/container-images/blob/9f9b3c8c8cf4a57fd42f362570ff47c75788031f/default/amd64/Dockerfile
+FROM almalinux:8.10-20250411
 ARG llvm_dir=llvm-project
 # Add the cache artifacts and the LLVM source tree to the container
 ADD sccache /sccache
@@ -8,6 +9,7 @@ ENV SCCACHE_CACHE_SIZE="2G"
 
 RUN dnf install --assumeyes llvm-toolset
 RUN dnf install --assumeyes python38-pip python38-devel git
+RUN alternatives --set python3 /usr/bin/python3.8
 
 RUN python3 -m pip install --upgrade pip
 RUN python3 -m pip install --upgrade cmake ninja sccache lit
@@ -26,6 +28,8 @@ RUN cmake -GNinja -Bbuild \
     -DCMAKE_CXX_FLAGS="-Wno-everything" \
     -DCMAKE_LINKER=lld \
     -DCMAKE_INSTALL_PREFIX="/install" \
+    -DPython3_EXECUTABLE="/usr/bin/python3.8" \
+    -DPython_EXECUTABLE="/usr/bin/python3.8" \
     -DLLVM_BUILD_UTILS=ON \
     -DLLVM_BUILD_TOOLS=ON \
     -DLLVM_ENABLE_ASSERTIONS=ON \
@@ -34,6 +38,7 @@ RUN cmake -GNinja -Bbuild \
     -DLLVM_ENABLE_TERMINFO=OFF \
     -DLLVM_INSTALL_UTILS=ON \
     -DLLVM_TARGETS_TO_BUILD="host;NVPTX;AMDGPU" \
+    -DLLVM_ENABLE_ZSTD=OFF \
     /source/llvm-project/llvm
 
 RUN ninja -C build install
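The base image is pinned from the floating `almalinux:8` tag to the dated `almalinux:8.10-20250411`, with the added comment linking the exact upstream Dockerfile, presumably to keep rebuilds reproducible. `alternatives --set python3 /usr/bin/python3.8` makes the bare `python3` command resolve to the python38 packages installed above, and the new `Python3_EXECUTABLE`/`Python_EXECUTABLE` CMake variables point LLVM's build at that same interpreter. `-DLLVM_ENABLE_ZSTD=OFF` mirrors the flag added in llvm-build.yml above.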

include/triton/Analysis/Utility.h

Lines changed: 0 additions & 13 deletions
@@ -268,19 +268,6 @@ bool cvtNeedsSharedMemory(RankedTensorType srcTy, RankedTensorType dstTy);
 // ConvertLayoutOpHelper in the future
 bool shouldUseDistSmem(Attribute srcLayout, Attribute dstLayout);
 
-/// Multi-root DAG topological sort.
-/// Performs a topological sort of the Operation in the `toSort` SetVector.
-/// Returns a topologically sorted SetVector.
-/// It is faster than mlir::topologicalSort because it prunes nodes that have
-/// been visited before.
-SetVector<Operation *>
-multiRootTopologicalSort(const SetVector<Operation *> &toSort);
-
-/// This uses the toplogicalSort above
-SetVector<Operation *>
-multiRootGetSlice(Operation *op, TransitiveFilter backwardFilter = nullptr,
-                  TransitiveFilter forwardFilter = nullptr);
-
 /// Create a basic DataFlowSolver with constant and dead code analysis included.
 std::unique_ptr<DataFlowSolver> createDataFlowSolver();

include/triton/Dialect/Triton/IR/TritonOps.td

Lines changed: 1 addition & 0 deletions
@@ -732,6 +732,7 @@ def TT_DotScaledOp : TT_Op<"dot_scaled", [Pure,
     `lhs` `=` $a_elem_type `rhs` `=` $b_elem_type attr-dict
     `:` type($a) (`,` type($a_scale)^)? `*` type($b) (`,` type($b_scale)^)? `->` type($d)
   }];
+  let hasVerifier = 1;
 }
 
 //
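In ODS, `let hasVerifier = 1;` makes TableGen declare a `verify()` hook on the generated `DotScaledOp` class, which the dialect must then implement in C++. A minimal sketch of that hook's shape, with an illustrative check only (this page does not show the verifier body the commit actually adds, and the `getA()`/`getD()` accessors are assumed from the op's `$a`/`$d` names):

// Sketch only: the real tt.dot_scaled verifier is not shown in this diff.
// Assumes ODS-generated accessors getA()/getD() for the $a operand and
// the $d result, both ranked tensors.
mlir::LogicalResult mlir::triton::DotScaledOp::verify() {
  auto aTy = llvm::cast<mlir::RankedTensorType>(getA().getType());
  auto dTy = llvm::cast<mlir::RankedTensorType>(getD().getType());
  // Example invariant: the result must agree with the left-hand operand
  // on the M dimension.
  if (aTy.getDimSize(0) != dTy.getDimSize(0))
    return emitOpError("expected a and d to agree on the M dimension");
  return mlir::success();
}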

include/triton/Dialect/TritonGPU/IR/Dialect.h

Lines changed: 9 additions & 0 deletions
@@ -45,6 +45,11 @@ constexpr static char AttrNumWarpsName[] = "ttg.num-warps";
 constexpr static char AttrNumCTAsName[] = "ttg.num-ctas";
 constexpr static char AttrTargetName[] = "ttg.target";
 constexpr static char AttrNumThreadsPerWarp[] = "ttg.threads-per-warp";
+// FIXME: rename to match above
+constexpr static char kPartitionAttrName[] = "ttg.partition";
+constexpr static char kPartitionOutputsAttrName[] = "ttg.partition.outputs";
+constexpr static char kPartitionStagesAttrName[] = "ttg.partition.stages";
+constexpr static char kWarpSpecializeTagAttrName[] = "ttg.warp_specialize.tag";
 
 // Find the contextual number of warps on which this operation is executed.
 int lookupNumWarps(Operation *op);
@@ -293,6 +298,10 @@ LogicalResult verifyMemoryOpTypes(Operation *op, ShapedType srcTy,
                                   ShapedType dstTy);
 // Verify a memory allocation operation.
 LogicalResult verifyAllocOp(Operation *op, Value src, MemDescType dstTy);
+
+std::optional<SetVector<int>> getPartitionIds(Operation *op);
+std::optional<int> getNumOutputPartitionIds(Operation *op);
+std::optional<SetVector<int>> getOutputPartitionIds(Operation *op, int idx);
 } // namespace mlir::triton::gpu
 
 #endif // TRITON_DIALECT_TRITONGPU_IR_DIALECT_H_
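The implementations of these accessors are not shown on this page. As a minimal sketch, assuming the `ttg.partition` attribute is stored as a `DenseI32ArrayAttr` (an assumption about the encoding, not something this diff shows), a reader like `getPartitionIds` could look like:

#include "mlir/IR/BuiltinAttributes.h"
#include "mlir/IR/Operation.h"
#include "llvm/ADT/SetVector.h"
#include <optional>

// Sketch only: assumes "ttg.partition" holds a DenseI32ArrayAttr of
// partition ids; the real storage format may differ.
std::optional<llvm::SetVector<int>>
readPartitionIds(mlir::Operation *op) {
  auto attr = op->getAttrOfType<mlir::DenseI32ArrayAttr>("ttg.partition");
  if (!attr)
    return std::nullopt; // op does not belong to any partition
  llvm::SetVector<int> ids;
  for (int32_t id : attr.asArrayRef())
    ids.insert(id); // SetVector deduplicates while preserving order
  return ids;
}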

include/triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h

Lines changed: 10 additions & 13 deletions
@@ -117,19 +117,6 @@ chooseDsReadTrLayout(Attribute enc, ArrayRef<int64_t> shape,
                      int32_t elemBitWidth, unsigned instBitWidth,
                      unsigned numLanesInShuffleGroup);
 
-LinearLayout getScaleTMEMStoreLinearLayout(RankedTensorType scaleType,
-                                           int numWarps);
-
-std::optional<LinearLayout>
-getTmemLoadStoreLayout16x256(int M, int N, RankedTensorType oldType,
-                             int numWarps);
-
-// Return a layout valid for TMemLoad op for a tmem layout of block MxN that
-// distribute the data long M for the warp groups. This doesn't affect the TMem
-// layout it just returns a distributed layout compatible for tmem_load.
-LinearLayout getTmemLoadLayoutSplitLongM(int M, int N, RankedTensorType oldType,
-                                         int numWarps);
-
 // Create LinearLayout for scale in scaled mfma.
 LinearLayout chooseScaledMfmaScaleLayout(MLIRContext *ctx, int dotOperandIdx,
                                          ArrayRef<int64_t> dotOperandShape,
@@ -161,5 +148,15 @@ std::optional<LinearLayout> chooseMfmaLikeStoreLayout(RankedTensorType valType);
 LinearLayout getCoreMatrixLinearLayout(NVMMASharedEncodingAttr shared,
                                        bool disableSwizzle);
 
+// Make a LinearLayout that maps a block-id to an N-dimensional index.
+//
+// The tensor is split up into CTAsPerCGA pieces, which are distributed among
+// the CTAsPerCGA CTAs (i.e. blocks) in the CGA (i.e. groups).
+//
+// See the nomenclature note at the top of the LinearLayoutConversions.cpp file
+// for an explanation of why this is called makeCgaLayout when it accepts a
+// CTALayoutAttr.
+LinearLayout makeCgaLayout(CTALayoutAttr layout);
+
 } // namespace mlir::triton::gpu
 #endif // TRITON_DIALECT_TRITONGPU_IR_LINEARLAYOUTCONVERSIONS_H
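For orientation (general Triton background, not stated in this diff): a `LinearLayout` maps hardware index spaces such as register, lane, warp, and block to tensor coordinates, and `makeCgaLayout` supplies only the block dimension of such a map, derived from the `CTALayoutAttr` describing how CTAs tile the CGA. The TMEM helpers `getScaleTMEMStoreLinearLayout`, `getTmemLoadStoreLayout16x256`, and `getTmemLoadLayoutSplitLongM` are removed from the public header by this commit.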

include/triton/Dialect/TritonGPU/Transforms/Partition.h

Lines changed: 4 additions & 7 deletions
@@ -16,11 +16,6 @@ class ForOp;
 } // namespace scf
 } // namespace mlir
 
-static constexpr char kPartitionAttrName[] = "ttg.partition";
-static constexpr char kPartitionOutputsAttrName[] = "ttg.partition.outputs";
-static constexpr char kPartitionStagesAttrName[] = "ttg.partition.stages";
-static constexpr char kWarpSpecializeTagAttrName[] = "ttg.warp_specialize.tag";
-
 //===----------------------------------------------------------------------===//
 // PartitionSet
 //===----------------------------------------------------------------------===//
@@ -40,6 +35,7 @@ class Partition {
   ArrayRef<Operation *> getOps() const { return ops; }
   void addOp(Operation *op) { ops.push_back(op); }
   bool hasOp(Operation *op) const;
+  bool empty() const { return ops.empty(); }
 
   // Iterate the inputs of the partition. Input values are those that originate
   // from a different partition or a previous iteration of the current
@@ -127,8 +123,9 @@ void setPartition(Operation *op, const SetVector<Partition *> &partitions);
 // which does not work with Partition instances and iterate* functions, since
 // it does not keep the op attributes and the op list of a partition in sync.
 void setPartition(Operation *op, const SetVector<int> &partitionIds);
-
-std::optional<SetVector<int>> getPartitionIds(Operation *op);
+void setPartitionOutputs(Operation *op,
+                         ArrayRef<SetVector<int>> partitionOutputsIds);
+SmallVector<SetVector<int>, 4> getPartitionOutputs(Operation *op);
 
 } // namespace mlir::triton::gpu
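Read together with the Dialect.h hunk above: the `ttg.partition*` attribute-name constants and the `getPartitionIds` declaration move out of Transforms/Partition.h into the TritonGPU dialect header, presumably so code that only queries partition attributes need not depend on the transforms header, while Partition.h gains the `setPartitionOutputs`/`getPartitionOutputs` pair and `Partition::empty()`.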
