
Commit c0e155d

Merge OpenAI Triton commit 0173f75 (#5260)
This PR changes the Triton base from bea27e3 to 0173f75 (Oct 6). Pass rate: 94.2% -> 94.21%.
2 parents (3b3a787 + f21b341), commit c0e155d

File tree

97 files changed (+4699, -2311 lines)


.github/workflows/integration-tests-amd.yml

Lines changed: 1 addition & 0 deletions
@@ -13,6 +13,7 @@ jobs:
   integration-tests-amd:
     runs-on: ${{ matrix.runner }}
     timeout-minutes: 45
+    continue-on-error: ${{ matrix.runner[1] == 'gfx90a' }}
     strategy:
       matrix:
         runner: ${{ fromJson(inputs.matrix) }}
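The added continue-on-error expression marks matrix jobs whose second runner label is gfx90a (evidently the GPU architecture label) as non-blocking: GitHub Actions evaluates the expression once per matrix job, so failures on those runners no longer fail the workflow as a whole.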

CMakeLists.txt

Lines changed: 0 additions & 4 deletions
@@ -89,10 +89,6 @@ if(NOT CMAKE_BUILD_TYPE)
   set(CMAKE_BUILD_TYPE "Release")
 endif()
 
-if(NOT WIN32)
-  find_library(TERMINFO_LIBRARY tinfo)
-endif()
-
 if(TRITON_BUILD_UT)
   # This is an aggregate target for all unit tests.
   add_custom_target(TritonUnitTests)

include/triton/Conversion/TritonGPUToLLVM/Utility.h

Lines changed: 0 additions & 26 deletions
@@ -528,32 +528,6 @@ Value emitPadding(Location loc, RewriterBase &rewriter,
                   triton::gpu::PaddedSharedEncodingAttr layout,
                   unsigned bitwidth, Value smemOffset, bool offsetInBytes);
 
-// Emits IR to load data from shared memory into registers, or to store data
-// from registers into shared memory.
-//
-// You supply perVectorCallback, which is called once per group of register
-// elements to transfer. You can use this callback to emit IR to load or store
-// data from or to shared memory.
-//
-// elemLlvmTy should be dstTy's element type converted to an LLVM-dialect type.
-//
-// If maxVecElems is provided, we won't vectorize more than this many elements.
-//
-// Returns true on success.
-[[nodiscard]] bool emitTransferBetweenRegistersAndShared(
-    RankedTensorType registerTy, triton::gpu::MemDescType sharedTy,
-    Type elemLlvmTy, std::optional<int32_t> maxVecElems,
-    const SharedMemoryObject &smemObj, Location loc, RewriterBase &rewriter,
-    const TargetInfoBase &target,
-    std::function<void(VectorType, Value /*shmemAddr*/)> perVectorCallback);
-
-[[nodiscard]] bool emitTransferBetweenRegistersAndShared(
-    LinearLayout &regLayout, triton::gpu::MemDescType sharedTy, Type elemLlvmTy,
-    std::optional<int32_t> maxVecElems, const SharedMemoryObject &smemObj,
-    Location loc, RewriterBase &rewriter, const TargetInfoBase &target,
-    Value laneId, Value warpId,
-    std::function<void(VectorType, Value /*shmemAddr*/)> perVectorCallback);
-
 // Close cousin of lowerLdStMatrix in MemoryOpToLLVM.cpp
 // We might want to merge them at some point, but having to support
 // ldmatrix.trans makes the code in lowerLdStMatrix a bit specific
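For reference, the deleted doc comment describes a callback-driven API: the helper computes vectorized shared-memory addresses and invokes perVectorCallback once per group of register elements. A hypothetical call site sketched from the deleted signature (variable names are illustrative, not code from this commit):

  // Sketch: load each vectorized group from shared memory into registers.
  SmallVector<Value> loadedVals;
  bool ok = emitTransferBetweenRegistersAndShared(
      registerTy, sharedTy, elemLlvmTy, /*maxVecElems=*/std::nullopt, smemObj,
      loc, rewriter, target, [&](VectorType vecTy, Value shmemAddr) {
        // perVectorCallback: emit the actual load and keep the result; a
        // store would instead write a register vector to shmemAddr.
        loadedVals.push_back(
            rewriter.create<LLVM::LoadOp>(loc, vecTy, shmemAddr));
      });
  assert(ok && "unsupported layout for register<->shared transfer");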

include/triton/Dialect/TritonGPU/Transforms/Schedule.h

Lines changed: 5 additions & 1 deletion
@@ -54,6 +54,7 @@ class CoarseSchedule {
   iterator end() { return orderClusters.end(); }
   const_iterator end() const { return orderClusters.end(); }
   size_t size() const { return orderClusters.size(); }
+  void clear() { orderClusters.clear(); }
   iterator newAtBack() {
     orderClusters.push_back(orderClusters.size());
     return std::prev(orderClusters.end());
@@ -157,7 +158,10 @@ class CoarseSchedule {
   // Set <stage, cluster> based on CoarseSchedule.
   void serialize(scf::ForOp &forOp) const;
   // Create a CoarseSchedule based on forOp's <stage, cluster>.
-  LogicalResult deSerialize(scf::ForOp &forOp);
+  // If normalizeClusterId is true, clusters [minClusterId, maxClusterId] will
+  // be remapped to [0, maxClusterId - minClusterId].
+  // If false, it won't remap and clusters [0, maxClusterId] will be created.
+  LogicalResult deSerialize(scf::ForOp &forOp, bool normalizeClusterId = true);
 
   static ClusterHash hashCluster(Cluster cluster) {
     return reinterpret_cast<ClusterHash>(&*cluster);
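The remapping described in the new deSerialize comment is a shift by the minimum observed cluster id. A minimal standalone sketch of that normalization (illustrative only, not the Triton implementation):

  #include <algorithm>
  #include <cassert>
  #include <vector>

  // Remap cluster ids in [minClusterId, maxClusterId] to
  // [0, maxClusterId - minClusterId] by subtracting the minimum.
  std::vector<int> normalizeClusterIds(std::vector<int> ids) {
    assert(!ids.empty() && "expected at least one cluster id");
    int minId = *std::min_element(ids.begin(), ids.end());
    for (int &id : ids)
      id -= minId;
    return ids;
  }

With normalizeClusterId == false, ids {3, 5, 4} would leave clusters 0 through 5 in play; with normalization they become {0, 2, 1} and only three clusters are created.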

include/triton/Tools/LayoutUtils.h

Lines changed: 35 additions & 0 deletions
@@ -147,6 +147,41 @@ std::pair<int, ColumnAction>
 largestVectorisation(MLIRContext *ctx, const LinearLayout &cvt, int bitwidth,
                      std::optional<int> maybeMaxVecElems = std::nullopt);
 
+// Close cousin of doing zerosLike(tile) * divideLeft(cvt, tile)
+// This one is a tad more general in the sense that it allows to divide
+// cvt:
+//  - register=1 -> (0, 1)
+//    register=2 -> (8, 0)
+//    register=4 -> (0, 8)
+//    register=8 -> (0, 16)
+//    register=16 -> (0, 32)
+//    register=32 -> (0, 64)
+//    register=64 -> (16, 0)
+//  - lane=1 -> (0, 2)
+//    lane=2 -> (0, 4)
+//    lane=4 -> (1, 0)
+//    lane=8 -> (2, 0)
+//    lane=16 -> (4, 0)
+//  - warp=1 -> (32, 0)
+//    warp=2 -> (64, 0)
+//  - block is a size 1 dimension
+// where out dims are: [row (size 128), col (size 128)]
+// tile:
+//  - register=1 -> (0, 1)
+//    register=2 -> (8, 0)
+//  - lane=1 -> (0, 2)
+//    lane=2 -> (0, 4)
+//    lane=4 -> (1, 0)
+//    lane=8 -> (2, 0)
+//    lane=16 -> (4, 0)
+//  - warp=1 -> (32, 0)
+//    warp=2 -> (64, 0)
+// where out dims are: [row (size 128), col (size 8)]
+// which would not be possible to lower via the divideLeft approach as we
+// cannot divide by the tile given the `register=64 -> (16, 0)` basis.
+std::optional<LinearLayout> getReps(const LinearLayout &cvt,
+                                    const LinearLayout &tile);
+
 } // namespace mlir::triton
 
 #endif // TRITON_TOOLS_LAYOUTUTILS_H
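Reading the example: the tile accounts for the first two register bases and all lane and warp bases of cvt, so the leftover register bases (0, 8), (0, 16), (0, 32), (0, 64), (16, 0) describe how the 128x8 tile repeats across the 128x128 output. A simplified standalone model of the per-dimension division (an assumption about the shape of the check, not the real LinearLayout API):

  #include <optional>
  #include <utility>
  #include <vector>

  using Basis = std::pair<int, int>;   // e.g. register=2 -> (8, 0)
  using DimBases = std::vector<Basis>; // bases of one input dimension

  // If the tile's bases form a prefix of cvt's bases for this input
  // dimension, return the leftover bases, i.e. the repetitions of the
  // tile; otherwise the division fails.
  std::optional<DimBases> repsForDim(const DimBases &cvt,
                                     const DimBases &tile) {
    if (tile.size() > cvt.size())
      return std::nullopt;
    for (size_t i = 0; i < tile.size(); ++i)
      if (cvt[i] != tile[i])
        return std::nullopt;
    return DimBases(cvt.begin() + tile.size(), cvt.end());
  }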

include/triton/Tools/Sys/GetEnv.hpp

Lines changed: 1 addition & 0 deletions
@@ -43,6 +43,7 @@ inline const std::set<std::string> CACHE_INVALIDATING_ENV_VARS = {
     "ALLOW_LHS_TMEM_LAYOUT_CONVERSION",
     "TRITON_F32_DEFAULT",
     "TRITON_PREFER_TMEM_16x256_LAYOUT",
+    "TRITON_ENABLE_EXPERIMENTAL_CONSAN",
     "TRITON_INTEL_AGGRESSIVE_DPAS_REUSE",
     "TRITON_INTEL_ENABLE_BLOCK_IO_ALL_LAYOUTS",
     "TRITON_INTEL_ENABLE_DPAS_FOR_WARP_SIZE_32",
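Variables in CACHE_INVALIDATING_ENV_VARS feed into Triton's compilation cache key, so changing TRITON_ENABLE_EXPERIMENTAL_CONSAN now forces recompilation instead of reusing cached kernels. A minimal sketch of the idea (illustrative only; Triton's actual key derivation differs):

  #include <cstdlib>
  #include <functional>
  #include <set>
  #include <string>

  // Fold the current values of cache-invalidating env vars into a cache
  // key, so changing any of them yields a different key.
  std::size_t envCacheKey(const std::set<std::string> &vars) {
    std::string blob;
    for (const auto &name : vars) {
      const char *val = std::getenv(name.c_str());
      blob += name + "=" + (val ? val : "") + ";";
    }
    return std::hash<std::string>{}(blob);
  }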

lib/Conversion/TritonGPUToLLVM/Utility.cpp

Lines changed: 0 additions & 104 deletions
@@ -706,110 +706,6 @@ lowerLocalLdSt(Location loc, MLIRContext *ctx,
                maybeMaxVecElems, localLoadOp);
 }
 
-bool emitTransferBetweenRegistersAndShared(
-    LinearLayout &regLayout, triton::gpu::MemDescType sharedTy, Type elemLlvmTy,
-    std::optional<int32_t> maxVecElems, const SharedMemoryObject &smemObj,
-    Location loc, RewriterBase &rewriter, const TargetInfoBase &target,
-    Value laneId, Value warpId,
-    std::function<void(VectorType, Value /*shmemAddr*/)> perVectorCallback) {
-  MLIRContext *ctx = rewriter.getContext();
-  auto b = TritonLLVMOpBuilder(loc, rewriter);
-
-  StringAttr kBlock = str_attr("block");
-  StringAttr kRegister = str_attr("register");
-  StringAttr kLane = str_attr("lane");
-  StringAttr kWarp = str_attr("warp");
-  StringAttr kOffset = str_attr("offset");
-
-  auto shape = sharedTy.getShape();
-  auto paddedEnc =
-      dyn_cast<triton::gpu::PaddedSharedEncodingAttr>(sharedTy.getEncoding());
-  LinearLayout regToSharedLayout = LinearLayout::empty();
-  if (paddedEnc) {
-    const auto &sharedLL = paddedEnc.getLinearComponent();
-    regToSharedLayout = regLayout.invertAndCompose(sharedLL);
-  } else {
-    auto sharedLL = triton::gpu::toLinearLayout(sharedTy);
-    regToSharedLayout = regLayout.invertAndCompose(sharedLL);
-  }
-
-  // TODO(jlebar): We don't currently support loading from shared memory in a
-  // different CTA. We'd need to emit `mapa.shared::cluster` instructions.
-  if (regToSharedLayout.hasInDim(kBlock) &&
-      regToSharedLayout.hasOutDim(kBlock) &&
-      !regToSharedLayout.isTrivialOver({kBlock})) {
-    return false;
-  }
-
-  // Determine how many consecutive registers map to consecutive shmem elements
-  // in out-dimension offsetN. This is our load instruction's vector width.
-  //
-  // It's OK if the vector width we choose here is wider than the hardware
-  // supports; LLVM will legalize it.
-  int vecElems =
-      std::min({regToSharedLayout.getNumConsecutiveInOut(),
-                maxVecElems.value_or(std::numeric_limits<int>::max())});
-  if (paddedEnc) {
-    vecElems = std::min(vecElems, int(paddedEnc.getMinInterval()));
-  }
-
-  auto withCTAOffset = triton::gpu::getNumCTAs(sharedTy.getEncoding()) > 1;
-  Value blockId =
-      withCTAOffset ? target.getClusterCTAId(rewriter, loc) : b.i32_val(0);
-
-  int numElems = regToSharedLayout.getInDimSize(kRegister);
-  auto vecTy = vec_ty(elemLlvmTy, vecElems);
-  SmallVector<uint32_t> regIds;
-  for (int i = 0; i < numElems / vecElems; i++) {
-    regIds.push_back(i * vecElems);
-  }
-
-  auto smemBase = smemObj.getBase();
-
-  auto indicesVec = applyLinearLayoutVec(loc, rewriter, regToSharedLayout,
-                                         {{kRegister, b.i32_val(0)},
-                                          {kLane, laneId},
-                                          {kWarp, warpId},
-                                          {kBlock, blockId}},
-                                         regIds);
-
-  // Compute affine offset given by memdesc_subslice
-  auto offset = smemObj.getShmemOffset(loc, rewriter, sharedTy);
-  SmallVector<Value> vecAddrVec;
-  for (auto &indices : indicesVec) {
-    Value smemOffset = indices[0].second;
-    smemOffset = b.xor_(smemOffset, offset);
-    if (paddedEnc) {
-      // Apply the offset needed for padding.
-      auto bitwidth = elemLlvmTy.getIntOrFloatBitWidth();
-      Value padOffset = emitPadding(loc, rewriter, paddedEnc, bitwidth,
-                                    smemOffset, /*offsetInBytes=*/false);
-      smemOffset = b.add(smemOffset, padOffset);
-    }
-    auto vecAddr = b.gep(smemBase.getType(), elemLlvmTy, smemBase, smemOffset,
-                         LLVM::GEPNoWrapFlags::inbounds);
-    vecAddrVec.push_back(vecAddr);
-  }
-
-  for (Value &vecAddr : vecAddrVec) {
-    perVectorCallback(vecTy, vecAddr);
-  }
-  return true;
-}
-
-bool emitTransferBetweenRegistersAndShared(
-    RankedTensorType registerTy, triton::gpu::MemDescType sharedTy,
-    Type elemLlvmTy, std::optional<int32_t> maxVecElems,
-    const SharedMemoryObject &smemObj, Location loc, RewriterBase &rewriter,
-    const TargetInfoBase &target,
-    std::function<void(VectorType, Value /*shmemAddr*/)> perVectorCallback) {
-  auto regLayout = triton::gpu::toLinearLayout(registerTy);
-  auto [laneId, warpId] = getLaneAndWarpId(rewriter, loc);
-  return emitTransferBetweenRegistersAndShared(
-      regLayout, sharedTy, elemLlvmTy, maxVecElems, smemObj, loc, rewriter,
-      target, laneId, warpId, perVectorCallback);
-}
-
 SmallVector<Value> unpackLLElements(Location loc, Value llvmStruct,
                                     RewriterBase &rewriter) {
   assert(bool(llvmStruct) && "can not unpack null values");
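One detail of the deleted implementation worth noting: the vector width is the number of consecutive elements in the composed register-to-shared layout, clamped by the caller's maxVecElems and, for padded shared encodings, by the minimum padding interval. Isolated as a standalone function (mirrors the deleted lines; not code from this commit):

  #include <algorithm>
  #include <limits>
  #include <optional>

  // Vector width for register<->shared transfers: consecutive elements in
  // the register->shared mapping, clamped by maxVecElems and (for padded
  // layouts) by the minimum interval between padding insertions.
  int vectorWidth(int numConsecutiveInOut, std::optional<int> maxVecElems,
                  std::optional<int> minPadInterval) {
    int vec = std::min(numConsecutiveInOut,
                       maxVecElems.value_or(std::numeric_limits<int>::max()));
    if (minPadInterval)
      vec = std::min(vec, *minPadInterval);
    return vec;
  }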
