Skip to content

Commit b94c6e1

Browse files
Garra1980 and chencha3 authored
[LLVM Pulldown] LLVM bump to rev 6b7e1b97f4bbb6dd6fca04ee4caccbf54ec92f09 (#1091)
LLVM pulldown to rev 6b7e1b97f4bbb6dd6fca04ee4caccbf54ec92f09.
Co-authored-by: Chao Chen <[email protected]>
1 parent 6bad678 commit b94c6e1

27 files changed

+152
-94
lines changed

build_tools/llvm_version.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
c539ec0db53ac850d121f1420fc9da72a5bf8891
1+
6b7e1b97f4bbb6dd6fca04ee4caccbf54ec92f09

include/imex/Dialect/Region/IR/RegionOps.td

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -120,8 +120,8 @@ def EnvironmentRegionOp : Region_Op<"env_region", [
120120

121121
let builders = [
122122
OpBuilder<(ins "::mlir::Attribute":$environment,
123-
CArg<"::mlir::ValueRange", "std::nullopt">:$args,
124-
CArg<"::mlir::TypeRange", "std::nullopt">:$results,
123+
CArg<"::mlir::ValueRange", "{}">:$args,
124+
CArg<"::mlir::TypeRange", "{}">:$results,
125125
CArg<"::llvm::function_ref<void(::mlir::OpBuilder &, ::mlir::Location)>", "nullptr">)>
126126
];
127127

lib/Conversion/NDArrayToLinalg/NDArrayToLinalg.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -102,7 +102,7 @@ struct CopyLowering : public ::mlir::OpRewritePattern<::imex::ndarray::CopyOp> {
102102
// create a region with given env, add copy op within it
103103
auto env = rewriter.getStringAttr("protect_copy_op");
104104
rewriter.create<::imex::region::EnvironmentRegionOp>(
105-
loc, env, std::nullopt, std::nullopt,
105+
loc, env, llvm::ArrayRef<mlir::Value>(), llvm::ArrayRef<mlir::Type>(),
106106
[&srcMR, &mr](::mlir::OpBuilder &builder, ::mlir::Location loc) {
107107
(void)builder.create<::mlir::memref::CopyOp>(loc, srcMR, mr);
108108
(void)builder.create<::imex::region::EnvironmentRegionYieldOp>(loc);

lib/Conversion/XeGPUToVC/LSCPatterns.cpp

Lines changed: 36 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,13 @@ namespace imex {
4646

4747
namespace LSC {
4848

49+
static SmallVector<int64_t> generateFullPermutation(int rank) {
50+
SmallVector<int64_t> permutation;
51+
for (int i = rank - 1; i >= 0; --i)
52+
permutation.push_back(i);
53+
return permutation;
54+
}
55+
4956
static int getCacheEncoding(std::optional<xegpu::CachePolicy> hint) {
5057

5158
if (!hint.has_value())
@@ -1139,6 +1146,21 @@ class LoadGatherPattern : public OpConversionPattern<LoadGatherOp> {
11391146
auto newValue = genLoadIntrinsicCallWithC32BConversion(
11401147
rewriter, loc, resultTy, simd_lanes, op.getMask(), l1hint, l3hint,
11411148
elemTy, chunkSize, tdescTy.getMemorySpace(), adaptor.getTensorDesc());
1149+
1150+
// transpose the result because of the difference between hardware
1151+
// implementation and the XeGPU definition.
1152+
if (resultTy.getRank() > 1) {
1153+
SmallVector<int64_t> permutation =
1154+
generateFullPermutation(resultTy.getRank());
1155+
llvm::ArrayRef<int64_t> shape = resultTy.getShape();
1156+
auto intrinsicTy =
1157+
VectorType::get(applyPermutation(shape, permutation), elemTy);
1158+
newValue =
1159+
rewriter.create<vector::ShapeCastOp>(loc, intrinsicTy, newValue);
1160+
newValue =
1161+
rewriter.create<vector::TransposeOp>(loc, newValue, permutation);
1162+
}
1163+
11421164
rewriter.replaceOp(op, newValue);
11431165

11441166
return success();
@@ -1220,10 +1242,22 @@ class StoreScatterPattern : public OpConversionPattern<StoreScatterOp> {
12201242
auto l1hint = op.getL1Hint();
12211243
// auto l2hint = op.getL2Hint();
12221244
auto l3hint = op.getL3Hint();
1245+
1246+
Value data = adaptor.getValue();
1247+
// transpose the value because of the difference between hardware
1248+
// implementation and the XeGPU definition.
1249+
if (tdescTy.getRank() > 1) {
1250+
Type flatVecTy =
1251+
data.getType(); // 1D VectorType expected by the intrinsic
1252+
SmallVector<int64_t> permutation =
1253+
generateFullPermutation(tdescTy.getRank());
1254+
data = rewriter.create<vector::ShapeCastOp>(loc, op.getValueType(), data);
1255+
data = rewriter.create<vector::TransposeOp>(loc, data, permutation);
1256+
data = rewriter.create<vector::ShapeCastOp>(loc, flatVecTy, data);
1257+
}
12231258
auto callOp = genStoreIntrinsicCallWithC32BConversion(
12241259
rewriter, loc, simd_lanes, op.getMask(), l1hint, l3hint, elemTy,
1225-
chunkSize, tdescTy.getMemorySpace(), adaptor.getTensorDesc(),
1226-
adaptor.getValue());
1260+
chunkSize, tdescTy.getMemorySpace(), adaptor.getTensorDesc(), data);
12271261

12281262
rewriter.replaceOp(op, callOp);
12291263
return success();

lib/Conversion/XeTileToXeGPU/XeTileToXeGPU.cpp

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -535,12 +535,11 @@ class GatherOpPattern : public OpConversionPattern<xetile::LoadGatherOp> {
535535
auto ldTy = VectorType::get(type.getNumElements(), elemTy);
536536
auto maskTy =
537537
VectorType::get(type.getNumElements(), rewriter.getIntegerType(1));
538-
auto transposeAttr = UnitAttr();
539538
auto [L1, L2, L3] = getCachePolicy(op);
540539
auto mask =
541540
rewriter.create<vector::ShapeCastOp>(loc, maskTy, adaptor.getMask());
542541
auto ldOp = rewriter.create<xegpu::LoadGatherOp>(
543-
loc, ldTy, adaptor.getTile(), mask, transposeAttr, L1, L2, L3);
542+
loc, ldTy, adaptor.getTile(), mask, L1, L2, L3);
544543
auto v = rewriter.create<vector::ShapeCastOp>(loc, op.getType(), ldOp);
545544
rewriter.replaceOp(op, v);
546545
return success();
@@ -574,9 +573,15 @@ class StoreOpPattern : public OpConversionPattern<xetile::StoreTileOp> {
574573
auto maskTy = VectorType::get(tileTy.getShape()[1], rewriter.getI1Type());
575574
auto mask = rewriter.create<arith::ConstantOp>(
576575
loc, DenseElementsAttr::get(maskTy, rewriter.getBoolAttr(true)));
577-
auto transAttr = rewriter.getUnitAttr();
576+
577+
if (tileTy.getRank() > 1) {
578+
SmallVector<int64_t> permutation = llvm::to_vector(
579+
llvm::reverse(llvm::seq<int64_t>(tileTy.getRank())));
580+
value = rewriter.create<vector::TransposeOp>(loc, value, permutation);
581+
}
582+
578583
rewriter.replaceOpWithNewOp<xegpu::StoreScatterOp>(
579-
op, value, adaptor.getTile(), mask, transAttr, L1, L2, L3);
584+
op, value, adaptor.getTile(), mask, L1, L2, L3);
580585
} else {
581586
// Since the low-level instruction works on 1D vector of 32-bits data, the
582587
// data to be stored need to be linearized and bitcasted.
@@ -605,12 +610,11 @@ class ScatterOpPattern : public OpConversionPattern<xetile::StoreScatterOp> {
605610
auto numElems = tileTy.getNumElements();
606611
auto valTy = VectorType::get(numElems, tileTy.getElementType());
607612
auto maskTy = VectorType::get(numElems, rewriter.getIntegerType(1));
608-
auto transposeAttr = UnitAttr();
609613
auto [L1, L2, L3] = getCachePolicy(op, xegpu::CachePolicy::WRITE_BACK);
610614
mask = rewriter.create<vector::ShapeCastOp>(op.getLoc(), maskTy, mask);
611615
value = rewriter.create<vector::ShapeCastOp>(op.getLoc(), valTy, value);
612-
rewriter.replaceOpWithNewOp<xegpu::StoreScatterOp>(
613-
op, value, tdesc, mask, transposeAttr, L1, L2, L3);
616+
rewriter.replaceOpWithNewOp<xegpu::StoreScatterOp>(op, value, tdesc, mask,
617+
L1, L2, L3);
614618
return success();
615619
}
616620
};

lib/Dialect/LLVMIR/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ add_imex_dialect_library(MLIRXeVMDialect
1616
Core
1717

1818
LINK_LIBS PUBLIC
19+
MLIRDialectUtils
1920
MLIRIR
2021
MLIRLLVMDialect
2122
MLIRSideEffectInterfaces

lib/Dialect/NDArray/Transforms/AddGPURegions.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,7 @@ static ::mlir::LogicalResult matchAndRewritePTOP(::mlir::Operation *op,
6565

6666
// create a region with given env and clone creator op within and yield it
6767
rewriter.replaceOpWithNewOp<::imex::region::EnvironmentRegionOp>(
68-
op, env, std::nullopt, op->getResultTypes(),
68+
op, env, llvm::ArrayRef<mlir::Value>(), op->getResultTypes(),
6969
[op](::mlir::OpBuilder &builder, ::mlir::Location loc) {
7070
auto cOp = builder.clone(*op);
7171
(void)builder.create<::imex::region::EnvironmentRegionYieldOp>(

lib/Transforms/CMakeLists.txt

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,6 @@ add_mlir_library(IMEXTransforms
3131
MLIRSupport
3232
MLIRTransformUtils
3333
MLIRVectorTransforms
34-
MLIRCopyOpInterface
3534

3635
DEPENDS
3736
IMEXTransformsPassIncGen

lib/Transforms/InsertGPUAllocs.cpp

Lines changed: 15 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -104,8 +104,9 @@ class InsertGPUAllocsPass final
104104
builder.setInsertionPoint(alloc);
105105
auto allocResult = builder.create<::mlir::gpu::AllocOp>(
106106
alloc.getLoc(), alloc.getType(), /*asyncToken*/ nullptr,
107-
/*asyncDependencies*/ std::nullopt, alloc.getDynamicSizes(),
108-
alloc.getSymbolOperands(), /*hostShared*/ hostShared);
107+
/*asyncDependencies*/ llvm::ArrayRef<mlir::Value>(),
108+
alloc.getDynamicSizes(), alloc.getSymbolOperands(),
109+
/*hostShared*/ hostShared);
109110
alloc.replaceAllUsesWith(allocResult);
110111
alloc.erase();
111112
}
@@ -114,7 +115,8 @@ class InsertGPUAllocsPass final
114115
for (auto dealloc : deallocOpsInGpuRegion) {
115116
builder.setInsertionPoint(dealloc);
116117
(void)builder.create<::mlir::gpu::DeallocOp>(
117-
dealloc.getLoc(), std::nullopt /*async*/, dealloc.getMemref());
118+
dealloc.getLoc(), llvm::ArrayRef<mlir::Type>() /*async*/,
119+
dealloc.getMemref());
118120
dealloc.erase();
119121
}
120122

@@ -429,8 +431,8 @@ class InsertGPUAllocsPass final
429431
bool hostShared = access.hostRead || access.hostWrite;
430432
auto gpuAlloc = builder.create<mlir::gpu::AllocOp>(
431433
loc, alloc.getType(), /*asyncToken*/ nullptr,
432-
/*asyncDependencies*/ std::nullopt, alloc.getDynamicSizes(),
433-
alloc.getSymbolOperands(), hostShared);
434+
/*asyncDependencies*/ llvm::ArrayRef<mlir::Value>(),
435+
alloc.getDynamicSizes(), alloc.getSymbolOperands(), hostShared);
434436
auto allocResult = gpuAlloc.getResult(0);
435437
builder.setInsertionPoint(term);
436438
for (mlir::OpOperand &use : alloc.getResult().getUses()) {
@@ -453,7 +455,8 @@ class InsertGPUAllocsPass final
453455
}
454456

455457
alloc.replaceAllUsesWith(allocResult);
456-
builder.create<mlir::gpu::DeallocOp>(loc, std::nullopt, allocResult);
458+
builder.create<mlir::gpu::DeallocOp>(loc, llvm::ArrayRef<mlir::Type>(),
459+
allocResult);
457460
alloc.erase();
458461
}
459462
}
@@ -484,8 +487,8 @@ class InsertGPUAllocsPass final
484487
bool hostShared = access.hostRead || access.hostWrite;
485488
auto gpuAlloc = builder.create<mlir::gpu::AllocOp>(
486489
loc, allocType, /*asyncToken*/ nullptr,
487-
/*asyncDependencies*/ std::nullopt, dims,
488-
/*symbolOperands*/ std::nullopt, hostShared);
490+
/*asyncDependencies*/ llvm::ArrayRef<mlir::Value>(), dims,
491+
/*symbolOperands*/ llvm::ArrayRef<mlir::Value>(), hostShared);
489492
auto allocResult = gpuAlloc.getResult(0);
490493
if (access.hostWrite && access.deviceRead) {
491494
auto copy =
@@ -502,15 +505,16 @@ class InsertGPUAllocsPass final
502505
if (access.hostRead && access.deviceWrite) {
503506
builder.create<mlir::memref::CopyOp>(loc, castedAllocResult, op);
504507
}
505-
builder.create<mlir::gpu::DeallocOp>(loc, std::nullopt,
506-
castedAllocResult);
508+
builder.create<mlir::gpu::DeallocOp>(
509+
loc, llvm::ArrayRef<mlir::Type>(), castedAllocResult);
507510
} else {
508511
op.replaceAllUsesExcept(allocResult, filter);
509512
builder.setInsertionPoint(term);
510513
if (access.hostRead && access.deviceWrite) {
511514
builder.create<mlir::memref::CopyOp>(loc, allocResult, op);
512515
}
513-
builder.create<mlir::gpu::DeallocOp>(loc, std::nullopt, allocResult);
516+
builder.create<mlir::gpu::DeallocOp>(
517+
loc, llvm::ArrayRef<mlir::Type>(), allocResult);
514518
}
515519
} else if (m_clientAPI == "vulkan") {
516520
auto gpuAlloc =

lib/Transforms/OptimizeTranspose.cpp

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -372,8 +372,8 @@ static void createStoreScatter(Value data, Value slm, Value base,
372372

373373
auto loc = data.getLoc();
374374
auto shape = type.getShape();
375-
auto chunkSize = type.getRank() == 2 ? shape[0] : 1;
376-
auto simdLanes = type.getRank() == 2 ? shape[1] : shape[0];
375+
auto chunkSize = type.getRank() == 2 ? shape[1] : 1;
376+
auto simdLanes = type.getRank() == 2 ? shape[0] : shape[1];
377377

378378
llvm::SmallVector<int64_t> staticOffsets;
379379
for (auto i = 0; i < simdLanes; i++) {
@@ -392,13 +392,11 @@ static void createStoreScatter(Value data, Value slm, Value base,
392392
chunkSize, xegpu::MemorySpace::SLM);
393393
auto desc = rewriter.create<xegpu::CreateDescOp>(loc, tdescTy, slm, offsets);
394394

395-
auto transposeAttr = rewriter.getUnitAttr();
396395
auto maskTy = VectorType::get(simdLanes, rewriter.getI1Type());
397396
auto mask = rewriter.create<arith::ConstantOp>(
398397
loc, DenseElementsAttr::get(maskTy, rewriter.getBoolAttr(true)));
399-
rewriter.create<xegpu::StoreScatterOp>(loc, data, desc, mask, transposeAttr,
400-
nullptr /*L1*/, nullptr /*L2*/,
401-
nullptr /*L3*/);
398+
rewriter.create<xegpu::StoreScatterOp>(loc, data, desc, mask, nullptr /*L1*/,
399+
nullptr /*L2*/, nullptr /*L3*/);
402400
}
403401

404402
static Value createBlockLoad(TypedValue<MemRefType> slm, Value base,
@@ -660,7 +658,7 @@ struct UpdateNdOffsetOpPattern final
660658
//
661659
// Following:
662660
// clang-format off
663-
// %0 = load ...
661+
// %0 = load %t ...
664662
// %1 = transpose %0 ...
665663
// %2 = shape_cast %1 ...
666664
// %3 = shuffle %2 ...
@@ -670,8 +668,7 @@ struct UpdateNdOffsetOpPattern final
670668
//
671669
// is replaced with:
672670
// clang-format off
673-
// %0 = load ...
674-
// %1 = load+transpose %0 ...
671+
// %1 = load+transpose %t...
675672
// ... DPAS B usage ...
676673
// clang-format on
677674
struct TransposeRewritePattern : public OpRewritePattern<vector::TransposeOp> {
@@ -798,6 +795,9 @@ struct TransposeRewritePattern : public OpRewritePattern<vector::TransposeOp> {
798795
auto offset = rewriter.create<arith::MulIOp>(
799796
loc, sgId, index_val(numElems), nullptr /* overflowFlags */);
800797

798+
data =
799+
rewriter.create<vector::TransposeOp>(loc, data, op.getPermutation());
800+
801801
// store data using store_scatter to SLM at the given offset.
802802
createStoreScatter(data, slm, offset, rewriter);
803803

@@ -955,7 +955,7 @@ struct OptimizeTransposePass final
955955
GreedyRewriteConfig config;
956956
config.setRegionSimplificationLevel(GreedySimplifyRegionLevel::Disabled);
957957
config.setUseTopDownTraversal(true);
958-
config.setStrictness(GreedyRewriteStrictness::ExistingAndNewOps);
958+
config.setStrictness(GreedyRewriteStrictness::ExistingOps);
959959
patterns.add<TransposeRewritePattern>(context, analysis, uArchInterface);
960960
if (failed(applyPatternsGreedily(getOperation(), std::move(patterns),
961961
config))) {

0 commit comments

Comments
 (0)