Skip to content

Commit 3c9f49d

Browse files
Merge commit 'a4f185405ef049459dcba5d38dc24eca1584a14a'
2 parents a876742 + a4f1854 commit 3c9f49d

File tree

24 files changed

+1435
-467
lines changed

24 files changed

+1435
-467
lines changed

.github/workflows/llvm-build.yml

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -30,10 +30,9 @@ jobs:
3030
- {runner: 'Ubuntu 20.04 ARM64', runs_on: 'ubuntu-20.04', target-os: 'ubuntu', arch: 'arm64'}
3131
- {runner: 'CentOS 7', runs_on: ['self-hosted', 'CPU'], target-os: 'centos', arch: 'x64'}
3232
- {runner: 'AlmaLinux 8', runs_on: ['self-hosted', 'CPU'], target-os: 'almalinux', arch: 'x64'}
33-
- {runner: 'MacOS X64', runs_on: 'macos-12', target-os: 'macos', arch: 'x64'}
34-
- {runner: 'MacOS ARM64', runs_on: 'macos-12', target-os: 'macos', arch: 'arm64'}
35-
# TODO(#2805): add back once the workflow works and runs in comparable time to the other ones
36-
# - {runner: 'Windows Latest', runs_on: 'windows-latest', target-os: 'windows', arch: 'x64'}
33+
- {runner: 'MacOS X64', runs_on: 'macos-13', target-os: 'macos', arch: 'x64'}
34+
- {runner: 'MacOS ARM64', runs_on: 'macos-13', target-os: 'macos', arch: 'arm64'}
35+
- {runner: 'Windows Latest', runs_on: 'windows-latest', target-os: 'windows', arch: 'x64'}
3736

3837
steps:
3938

@@ -127,7 +126,8 @@ jobs:
127126
-DLLVM_BUILD_TOOLS=ON
128127
-DLLVM_ENABLE_ASSERTIONS=ON
129128
-DMLIR_ENABLE_BINDINGS_PYTHON=ON
130-
-DLLVM_ENABLE_PROJECTS="clang;mlir"
129+
-DLLVM_ENABLE_PROJECTS="mlir;llvm"
130+
-DLLVM_ENABLE_DIA_SDK=OFF
131131
-DLLVM_INSTALL_UTILS=ON
132132
-DLLVM_TARGETS_TO_BUILD="host;NVPTX;AMDGPU"
133133
-DLLVM_ENABLE_TERMINFO=OFF
@@ -300,6 +300,7 @@ jobs:
300300

301301
- name: Upload LLVM Artifacts to Azure
302302
if: ${{ (github.repository == 'triton-lang/triton') }}
303+
shell: bash -el {0}
303304
run: |
304305
az storage blob upload --account-name oaitriton --auth-mode login --container-name public --file "${{ env.llvm_install_dir }}.tar.gz" --name "llvm-builds/${{ env.llvm_install_dir }}.tar.gz" --overwrite
305306

include/triton/Dialect/Triton/IR/Dialect.h

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,16 @@ class DialectInferLayoutInterface
7878
Attribute operandEncodingB) const = 0;
7979
};
8080

81+
class DialectVerifyTensorLayoutInterface
82+
: public DialectInterface::Base<DialectVerifyTensorLayoutInterface> {
83+
public:
84+
DialectVerifyTensorLayoutInterface(Dialect *dialect) : Base(dialect) {}
85+
86+
virtual LogicalResult
87+
verifyTensorLayout(Attribute layout, RankedTensorType type, ModuleOp module,
88+
function_ref<InFlightDiagnostic()> emitError) const = 0;
89+
};
90+
8191
} // namespace triton
8292
} // namespace mlir
8393

include/triton/Dialect/TritonGPU/Transforms/Passes.td

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,38 @@ def TritonGPUPipeline : Pass<"tritongpu-pipeline", "mlir::ModuleOp"> {
2323
];
2424
}
2525

26+
def TritonGPUTestPipelineAssignLatencies : Pass<"tritongpu-test-pipeline-assign-latencies", "mlir::ModuleOp"> {
27+
let summary = "test assigning latencies to interesting ops ahead of pipelining";
28+
29+
let description = [{
30+
This is a test pass that tests `assignLatencies` method of `TritonGPULoopScheduling`.
31+
}];
32+
33+
let dependentDialects = ["mlir::triton::gpu::TritonGPUDialect",
34+
"mlir::triton::nvidia_gpu::TritonNvidiaGPUDialect",
35+
"mlir::scf::SCFDialect",
36+
"mlir::arith::ArithDialect"];
37+
38+
let options = [
39+
Option<"numStages", "num-stages",
40+
"int32_t", /*default*/"3",
41+
"number of pipeline stages">
42+
];
43+
}
44+
45+
def TritonGPUTestPipelineScheduleLoop : Pass<"tritongpu-test-pipeline-schedule-loop", "mlir::ModuleOp"> {
46+
let summary = "test scheduling a loop for software pipelining";
47+
48+
let description = [{
49+
This is a test pass that tests `scheduleLoop` method of `TritonGPULoopScheduling`.
50+
}];
51+
52+
let dependentDialects = ["mlir::triton::gpu::TritonGPUDialect",
53+
"mlir::triton::nvidia_gpu::TritonNvidiaGPUDialect",
54+
"mlir::scf::SCFDialect",
55+
"mlir::arith::ArithDialect"];
56+
}
57+
2658
def TritonGPUF32DotTC : Pass<"tritongpu-F32DotTC", "mlir::ModuleOp"> {
2759
let summary = "3xTF32 trick";
2860

include/triton/Dialect/TritonGPU/Transforms/PipeliningUtility.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,9 @@ static const char *kNumStagesAttrName = "tt.num_stages";
1111
static const char *kLoopStageAttrName = "loop.stage";
1212
static const char *kLoopClusterAttrName = "loop.cluster";
1313

14+
bool loopHasDistGreaterThanOne(scf::ForOp forOp);
15+
bool isOuterLoop(scf::ForOp forOp);
16+
1417
/// Function to mask operations during scheduling.
1518
Operation *predicateOp(RewriterBase &rewriter, Operation *op, Value pred);
1619

include/triton/Dialect/TritonGPU/Transforms/Schedule.h

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,18 @@
1111
namespace mlir {
1212
namespace triton {
1313

14+
namespace gpu {
15+
16+
/// Discover operations that should become async and assign latencies to them
17+
/// based on the numStages value provided by the user.
18+
DenseMap<Operation *, int> assignLatencies(ModuleOp forOp, int numStages);
19+
20+
/// Schedule the loop based on the latencies assigned to the operations.
21+
void scheduleLoop(scf::ForOp forOp,
22+
const DenseMap<Operation *, int> &opLatency);
23+
24+
}; // namespace gpu
25+
1426
/// This fills out the pipelining options including schedule and annotations
1527
/// for wait ops. This also does pre-processing by converting some of the
1628
/// loads into async loads so that the IR is ready to be pipelined.
@@ -108,8 +120,7 @@ class CoarseSchedule {
108120

109121
// Add dependencies of anchor ops to the coarse schedule. Schedule them to
110122
// the same stage and ordering cluster as the anchor op.
111-
void scheduleDependencies(scf::ForOp forOp, CoarseSchedule &schedule,
112-
int numStages);
123+
void scheduleDependencies(scf::ForOp forOp, CoarseSchedule &schedule);
113124

114125
} // namespace triton
115126
} // namespace mlir

lib/Dialect/Triton/IR/Traits.cpp

Lines changed: 6 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -5,11 +5,9 @@
55
#include "mlir/IR/TypeUtilities.h"
66
#include "triton/Dialect/Triton/IR/Types.h"
77
#include "triton/Dialect/Triton/IR/Utility.h"
8-
#include "triton/Dialect/TritonGPU/IR/Dialect.h"
98
#include "llvm/Support/ErrorHandling.h"
109

1110
using namespace mlir;
12-
namespace ttg = mlir::triton::gpu;
1311

1412
static LogicalResult verifySameEncoding(Type typeA, Type typeB,
1513
bool allowTensorPointerType) {
@@ -118,53 +116,12 @@ LogicalResult OpTrait::impl::verifyTensorLayouts(Operation *op) {
118116
if (!layout)
119117
return success();
120118

121-
if (isa<ttg::SharedEncodingAttr>(layout))
122-
return makeErr() << "Shared layout is not allowed on tensor type.";
123-
// TODO(jlebar): Currently this only checks blocked layouts, but other
124-
// layouts also have invariants!
125-
126-
// TODO(jlebar): Handle the case when the encoding is nested within tt.ptr.
127-
if (auto blocked = dyn_cast<ttg::BlockedEncodingAttr>(layout)) {
128-
// A different verifier should have checked that the layout itself is
129-
// valid, including that threads-per-warp has the same rank as
130-
// warps-per-block etc.
131-
auto layoutRank = blocked.getThreadsPerWarp().size();
132-
if (layoutRank != rankedTy.getRank()) {
133-
return makeErr() << layout << ".\nLayout has rank " << layoutRank
134-
<< ", but the tensor it's attached to has rank "
135-
<< rankedTy.getRank() << ".";
136-
}
137-
138-
int moduleThreadsPerWarp =
139-
ttg::TritonGPUDialect::getThreadsPerWarp(module);
140-
int64_t layoutThreadsPerWarp = product(blocked.getThreadsPerWarp());
141-
if (layoutThreadsPerWarp != moduleThreadsPerWarp) {
142-
return makeErr() << layout << ".\nLayout has a total of "
143-
<< layoutThreadsPerWarp
144-
<< " threads per warp, but the module specifies "
145-
<< moduleThreadsPerWarp << " threads per warp.";
146-
}
147-
148-
int moduleWarpsPerCTA = ttg::TritonGPUDialect::getNumWarps(module);
149-
int64_t layoutWarpsPerCTA = product(blocked.getWarpsPerCTA());
150-
if (layoutWarpsPerCTA != moduleWarpsPerCTA) {
151-
return makeErr() << layout << ".\nLayout has a total of "
152-
<< layoutWarpsPerCTA
153-
<< " warps per CTA, but the module specifies "
154-
<< moduleWarpsPerCTA << " warps per CTA.";
155-
}
156-
157-
if (blocked.getCTALayout().getCTAsPerCGA().size() > 0) {
158-
int moduleCTAsPerCGA = ttg::TritonGPUDialect::getNumCTAs(module);
159-
int64_t layoutCTAsPerCGA =
160-
product(blocked.getCTALayout().getCTAsPerCGA());
161-
if (layoutCTAsPerCGA != moduleCTAsPerCGA) {
162-
return makeErr() << layout << ".\nLayout has a total of "
163-
<< layoutCTAsPerCGA
164-
<< " CTAs per CGA, but the module specifies "
165-
<< moduleCTAsPerCGA << " CTAs per CGA.";
166-
}
167-
}
119+
Dialect &dialect = layout.getDialect();
120+
auto verifyLayoutInterface =
121+
dyn_cast<mlir::triton::DialectVerifyTensorLayoutInterface>(&dialect);
122+
if (verifyLayoutInterface) {
123+
return verifyLayoutInterface->verifyTensorLayout(layout, rankedTy, module,
124+
makeErr);
168125
}
169126

170127
return success();

lib/Dialect/TritonGPU/IR/Dialect.cpp

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3058,6 +3058,68 @@ struct TritonGPUInferLayoutInterface
30583058
}
30593059
};
30603060

3061+
struct TritonGPUVerifyTensorLayoutInterface
3062+
: public triton::DialectVerifyTensorLayoutInterface {
3063+
using DialectVerifyTensorLayoutInterface::DialectVerifyTensorLayoutInterface;
3064+
3065+
LogicalResult verifyTensorLayout(
3066+
Attribute layout, RankedTensorType rankedTy, ModuleOp module,
3067+
function_ref<InFlightDiagnostic()> makeErr) const override {
3068+
if (isa<triton::gpu::SharedEncodingAttr>(layout))
3069+
return makeErr() << "Shared layout is not allowed on tensor type.";
3070+
// TODO(jlebar): Currently this only checks blocked layouts, but other
3071+
// layouts also have invariants!
3072+
3073+
// TODO(jlebar): Handle the case when the encoding is nested within tt.ptr.
3074+
if (auto blocked = dyn_cast<triton::gpu::BlockedEncodingAttr>(layout)) {
3075+
// A different verifier should have checked that the layout itself is
3076+
// valid, including that threads-per-warp has the same rank as
3077+
// warps-per-block etc.
3078+
auto layoutRank = blocked.getThreadsPerWarp().size();
3079+
if (layoutRank != rankedTy.getRank()) {
3080+
return makeErr() << layout << ".\nLayout has rank " << layoutRank
3081+
<< ", but the tensor it's attached to has rank "
3082+
<< rankedTy.getRank() << ".";
3083+
}
3084+
3085+
int moduleThreadsPerWarp =
3086+
triton::gpu::TritonGPUDialect::getThreadsPerWarp(module);
3087+
int64_t layoutThreadsPerWarp = product(blocked.getThreadsPerWarp());
3088+
if (layoutThreadsPerWarp != moduleThreadsPerWarp) {
3089+
return makeErr() << layout << ".\nLayout has a total of "
3090+
<< layoutThreadsPerWarp
3091+
<< " threads per warp, but the module specifies "
3092+
<< moduleThreadsPerWarp << " threads per warp.";
3093+
}
3094+
3095+
int moduleWarpsPerCTA =
3096+
triton::gpu::TritonGPUDialect::getNumWarps(module);
3097+
int64_t layoutWarpsPerCTA = product(blocked.getWarpsPerCTA());
3098+
if (layoutWarpsPerCTA != moduleWarpsPerCTA) {
3099+
return makeErr() << layout << ".\nLayout has a total of "
3100+
<< layoutWarpsPerCTA
3101+
<< " warps per CTA, but the module specifies "
3102+
<< moduleWarpsPerCTA << " warps per CTA.";
3103+
}
3104+
3105+
if (blocked.getCTALayout().getCTAsPerCGA().size() > 0) {
3106+
int moduleCTAsPerCGA =
3107+
triton::gpu::TritonGPUDialect::getNumCTAs(module);
3108+
int64_t layoutCTAsPerCGA =
3109+
product(blocked.getCTALayout().getCTAsPerCGA());
3110+
if (layoutCTAsPerCGA != moduleCTAsPerCGA) {
3111+
return makeErr() << layout << ".\nLayout has a total of "
3112+
<< layoutCTAsPerCGA
3113+
<< " CTAs per CGA, but the module specifies "
3114+
<< moduleCTAsPerCGA << " CTAs per CGA.";
3115+
}
3116+
}
3117+
}
3118+
3119+
return success();
3120+
}
3121+
};
3122+
30613123
//===----------------------------------------------------------------------===//
30623124
// Canonicalizer
30633125
//===----------------------------------------------------------------------===//
@@ -3798,6 +3860,7 @@ void TritonGPUDialect::initialize() {
37983860
>();
37993861
addInterfaces<TritonGPUOpAsmInterface>();
38003862
addInterfaces<TritonGPUInferLayoutInterface>();
3863+
addInterfaces<TritonGPUVerifyTensorLayoutInterface>();
38013864

38023865
RankedTensorType::attachInterface<TensorModel>(*getContext());
38033866
MemDescType::attachInterface<MemDescModel>(*getContext());

lib/Dialect/TritonGPU/Transforms/CMakeLists.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,12 @@ add_triton_library(TritonGPUTransforms
88
OptimizeAccumulatorInit.cpp
99
OptimizeDotOperands.cpp
1010
OptimizeThreadLocality.cpp
11+
Pipeliner/AssignLatencies.cpp
1112
Pipeliner/MatmulLoopPipeline.cpp
1213
Pipeliner/OuterLoopPipeline.cpp
1314
Pipeliner/PipelineExpander.cpp
15+
Pipeliner/TestPipelineAssignLatencies.cpp
16+
Pipeliner/TestPipelineScheduleLoop.cpp
1417
Pipeliner/SoftwarePipeliner.cpp
1518
Pipeliner/TMAStoresPipeline.cpp
1619
Pipeliner/PipeliningUtility.cpp

0 commit comments

Comments
 (0)