Skip to content

Commit 6df7456

Browse files
authored
Switch to new xegpu-to-vc SPIRV pass pipeline for XeGPU integration tests (#734)
This change contains the following: - Switch to the new SPIRV pass pipeline to use XeGPU to Func dialect lowering. - Runner command-line changes for XeGPU test cases to use the new pass pipeline. - Minor fixes to lower dynamic shapes in the create_nd and update_nd ops. - TODO: the xegpu_to_vc pass still needs to add lowering for xegpu.create_nbarrier, so some of the gemm_4k* test cases still use the old XeGPUToSPIRV pass. This will be addressed in a follow-up PR (5 test cases are pending transition).
1 parent 6e29bc0 commit 6df7456

23 files changed

+167
-64
lines changed

CMakeLists.txt

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -268,9 +268,6 @@ message(STATUS "LLVM_DEFINITIONS: ${LLVM_DEFINITIONS}")
268268
add_definitions(${LLVM_DEFINITIONS})
269269

270270
set(LLVM_LIT_ARGS "-sv" CACHE STRING "lit default options")
271-
if (IMEX_ENABLE_SYCL_RUNTIME OR IMEX_ENABLE_L0_RUNTIME)
272-
set(LLVM_LIT_ARGS "-j 4 ${LLVM_LIT_ARGS}") # do not stress GPU
273-
endif()
274271

275272
set(IMEX_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR})
276273
# LLVM_EXTERNAL_PROJECTS build puts library, executables and tools in LLVM's CMAKE_BINARY_DIR

lib/Conversion/XeGPUToVC/XeGPUToVC.cpp

Lines changed: 33 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,9 @@
1616
#include <imex/Conversion/XeGPUToVC/XeGPUToVC.h>
1717

1818
#include "mlir/Dialect/SPIRV/IR/SPIRVAttributes.h"
19+
#include "mlir/IR/BuiltinDialect.h"
20+
#include "mlir/IR/BuiltinOps.h"
21+
#include "mlir/IR/BuiltinTypes.h"
1922
#include "mlir/Pass/Pass.h"
2023
#include "mlir/Pass/PassManager.h"
2124

@@ -155,14 +158,17 @@ struct CreateNdDescPattern
155158
// compute surface width
156159
auto bytesPerElem = createIntConstant(bitWidth / 8);
157160
auto one = createIntConstant(1);
158-
surfaceW = rewriter.create<arith::ExtUIOp>(loc, i32Type,
159-
adaptor.getShape()[1]);
160-
surfaceW = rewriter.create<arith::MulIOp>(loc, surfaceW, bytesPerElem);
161+
auto surfaceWCast = rewriter.create<arith::IndexCastUIOp>(
162+
loc, i32Type, adaptor.getShape()[1]);
163+
164+
surfaceW =
165+
rewriter.create<arith::MulIOp>(loc, surfaceWCast, bytesPerElem);
161166
surfaceW = rewriter.create<arith::SubIOp>(loc, surfaceW, one);
162167
// compute surface height
163-
surfaceH = rewriter.create<arith::ExtUIOp>(loc, i32Type,
164-
adaptor.getShape()[0]);
165-
surfaceH = rewriter.create<arith::SubIOp>(loc, surfaceH, one);
168+
169+
auto surfaceHCast = rewriter.create<arith::IndexCastUIOp>(
170+
loc, i32Type, adaptor.getShape()[0]);
171+
surfaceH = rewriter.create<arith::SubIOp>(loc, surfaceHCast, one);
166172
// fixme: pitch = width for now
167173
surfaceP = surfaceW;
168174
}
@@ -210,15 +216,15 @@ class UpdateNDOffsetToVCPattern
210216

211217
auto loc = op.getLoc();
212218
auto i32Type = rewriter.getI32Type();
213-
auto offsets = adaptor.getOffsets();
219+
auto offsets = op.getOffsets();
214220

215221
// Get Payload
216222
auto desc = adaptor.getTensorDesc();
217-
218223
for (size_t i = 0; i < offsets.size(); i++) {
219224
auto offset = offsets[i];
220-
if (auto cst = dyn_cast<arith::ConstantOp>(offset.getDefiningOp()))
221-
if (auto attr = dyn_cast<mlir::IntegerAttr>(cst.getValue());
225+
if (auto cst =
226+
dyn_cast_if_present<arith::ConstantOp>(offset.getDefiningOp()))
227+
if (auto attr = dyn_cast_if_present<mlir::IntegerAttr>(cst.getValue());
222228
attr && attr.getInt() == 0)
223229
continue;
224230

@@ -227,7 +233,8 @@ class UpdateNDOffsetToVCPattern
227233
// offset.
228234
int32_t idx = i == 0 ? 6 : 5;
229235
auto oldOffset = rewriter.create<vector::ExtractOp>(loc, desc, idx);
230-
offset = rewriter.create<arith::TruncIOp>(loc, i32Type, offset);
236+
offset = rewriter.create<arith::IndexCastUIOp>(loc, i32Type, offset);
237+
231238
auto newOffset = rewriter.create<arith::AddIOp>(loc, oldOffset, offset);
232239

233240
// Update new 2D Block OffsetX/OffsetY in Payload descriptor.
@@ -630,6 +637,7 @@ struct DpasPattern : public OpConversionPattern<::mlir::xegpu::DpasOp> {
630637
auto infoAttr = rewriter.getIntegerAttr(rewriter.getI32Type(), infoVal);
631638
auto info = rewriter.create<arith::ConstantOp>(loc, rewriter.getI32Type(),
632639
infoAttr);
640+
633641
auto newResultType = encodeVectorType(rewriter, resultType).second;
634642
SmallVector<Value, 4> args{adaptor.getRhs(), adaptor.getLhs(), info};
635643
std::string funcName = "llvm.genx.dpas.nosrc0.";
@@ -1106,7 +1114,7 @@ class FenceToVCPattern : public OpConversionPattern<::mlir::xegpu::FenceOp> {
11061114
}
11071115
};
11081116

1109-
struct VectorShapeCast final
1117+
struct VectorShapeCastVC final
11101118
: public OpConversionPattern<mlir::vector::ShapeCastOp> {
11111119
using OpConversionPattern<mlir::vector::ShapeCastOp>::OpConversionPattern;
11121120

@@ -1130,7 +1138,7 @@ struct VectorShapeCast final
11301138
}
11311139
};
11321140

1133-
struct VectorExtract final
1141+
struct VectorExtractVC final
11341142
: public OpConversionPattern<mlir::vector::ExtractOp> {
11351143
using OpConversionPattern<mlir::vector::ExtractOp>::OpConversionPattern;
11361144

@@ -1139,6 +1147,7 @@ struct VectorExtract final
11391147
ConversionPatternRewriter &rewriter) const override {
11401148

11411149
auto *converter = getTypeConverter();
1150+
11421151
auto dstTy = converter->convertType(extractOp.getType());
11431152
if (!dstTy)
11441153
return failure();
@@ -1200,7 +1209,7 @@ static uint64_t getFirstIntValue(mlir::ArrayAttr attr) {
12001209
return (*attr.getAsValueRange<IntegerAttr>().begin()).getZExtValue();
12011210
};
12021211

1203-
struct VectorExtractStridedSlice final
1212+
struct VectorExtractStridedSliceVC final
12041213
: public OpConversionPattern<vector::ExtractStridedSliceOp> {
12051214
using OpConversionPattern<vector::ExtractStridedSliceOp>::OpConversionPattern;
12061215
LogicalResult
@@ -1298,7 +1307,7 @@ struct VectorExtractStridedSlice final
12981307
}
12991308
};
13001309

1301-
struct VectorShuffle final
1310+
struct VectorShuffleVC final
13021311
: public OpConversionPattern<mlir::vector::ShuffleOp> {
13031312
using OpConversionPattern<mlir::vector::ShuffleOp>::OpConversionPattern;
13041313

@@ -1387,7 +1396,7 @@ struct SCFForOpBlockVCPattern final
13871396

13881397
rewriter.applySignatureConversion(&op.getRegion(), signatureConverter);
13891398

1390-
newOp.getBody()->erase();
1399+
rewriter.eraseBlock(newOp.getBody());
13911400
rewriter.inlineRegionBefore(op.getRegion(), newOp.getRegion(),
13921401
newOp.getRegion().end());
13931402
rewriter.replaceOp(op, newOp.getResults());
@@ -1453,8 +1462,8 @@ struct XeGPUToVCPass : public ::imex::ConvertXeGPUToVCBase<XeGPUToVCPass> {
14531462
target.addDynamicallyLegalDialect<mlir::scf::SCFDialect>(
14541463
[&](mlir::Operation *op) { return isLegalXeGPUSCFOp(op); });
14551464

1456-
target.addIllegalOp<::mlir::vector::ShapeCastOp>();
1457-
target.addIllegalOp<::mlir::vector::ExtractStridedSliceOp>();
1465+
target.addIllegalOp<::mlir::vector::ShapeCastOp,
1466+
::mlir::vector::ExtractStridedSliceOp>();
14581467

14591468
typeConverter.addConversion(
14601469
[&](xegpu::TensorDescType type) -> ::mlir::Type {
@@ -1507,11 +1516,12 @@ struct XeGPUToVCPass : public ::imex::ConvertXeGPUToVCBase<XeGPUToVCPass> {
15071516
CompilerHintToVCPattern, FenceToVCPattern,
15081517
UpdateNDOffsetToVCPattern, SCFYieldOpVCPattern>(
15091518
patterns.getContext());
1510-
patterns.add<GatherScatterToRawSend<xegpu::LoadGatherOp>,
1511-
GatherScatterToRawSend<xegpu::StoreScatterOp>, AtomicToLsc,
1512-
VectorShapeCast, VectorExtract, VectorExtractStridedSlice,
1513-
VectorShuffle, SCFForOpBlockVCPattern>(typeConverter,
1514-
patterns.getContext());
1519+
patterns
1520+
.add<GatherScatterToRawSend<xegpu::LoadGatherOp>,
1521+
GatherScatterToRawSend<xegpu::StoreScatterOp>, AtomicToLsc,
1522+
VectorShapeCastVC, VectorExtractVC, VectorExtractStridedSliceVC,
1523+
VectorShuffleVC, SCFForOpBlockVCPattern>(typeConverter,
1524+
patterns.getContext());
15151525

15161526
if (this->useRawSend) {
15171527
patterns.add<LoadStorePrefetchNdToRawSendPattern<xegpu::LoadNdOp>,

test/Integration/Dialect/XeGPU/dynamic_memref.vc.mlir

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
1-
// RUN: %python_executable %imex_runner --requires=l0-runtime -i %s --pass-pipeline-file=%p/xegpu-to-llvm.pp \
1+
// RUN: %python_executable %imex_runner --requires=l0-runtime -i %s --pass-pipeline-file=%p/xegpu-to-func-vc.pp \
22
// RUN: --runner imex-cpu-runner -e main \
33
// RUN: --entry-point-result=void \
44
// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%levelzero_runtime --filecheck
5-
// RUN: %python_executable %imex_runner --requires=sycl-runtime -i %s --pass-pipeline-file=%p/xegpu-to-llvm.pp \
5+
// RUN: %python_executable %imex_runner --requires=sycl-runtime -i %s --pass-pipeline-file=%p/xegpu-to-func-vc.pp \
66
// RUN: --runner imex-cpu-runner -e main \
77
// RUN: --entry-point-result=void \
88
// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck

test/Integration/Dialect/XeGPU/exp_f32.vc.mlir

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
1-
// RUN: %python_executable %imex_runner --requires=l0-runtime -i %s --pass-pipeline-file=%p/xegpu-to-llvm.pp \
1+
// RUN: %python_executable %imex_runner --requires=l0-runtime -i %s --pass-pipeline-file=%p/xegpu-to-func-vc.pp \
22
// RUN: --runner imex-cpu-runner -e main \
33
// RUN: --entry-point-result=void \
44
// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%levelzero_runtime --filecheck
5-
// RUN: %python_executable %imex_runner --requires=sycl-runtime -i %s --pass-pipeline-file=%p/xegpu-to-llvm.pp \
5+
// RUN: %python_executable %imex_runner --requires=sycl-runtime -i %s --pass-pipeline-file=%p/xegpu-to-func-vc.pp \
66
// RUN: --runner imex-cpu-runner -e main \
77
// RUN: --entry-point-result=void \
88
// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck

test/Integration/Dialect/XeGPU/fmax_f32.vc.mlir

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
1-
// RUN: %python_executable %imex_runner --requires=l0-runtime -i %s --pass-pipeline-file=%p/xegpu-to-llvm.pp \
1+
// RUN: %python_executable %imex_runner --requires=l0-runtime -i %s --pass-pipeline-file=%p/xegpu-to-func-vc.pp \
22
// RUN: --runner imex-cpu-runner -e main \
33
// RUN: --entry-point-result=void \
44
// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%levelzero_runtime --filecheck
5-
// RUN: %python_executable %imex_runner --requires=sycl-runtime -i %s --pass-pipeline-file=%p/xegpu-to-llvm.pp \
5+
// RUN: %python_executable %imex_runner --requires=sycl-runtime -i %s --pass-pipeline-file=%p/xegpu-to-func-vc.pp \
66
// RUN: --runner imex-cpu-runner -e main \
77
// RUN: --entry-point-result=void \
88
// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck

test/Integration/Dialect/XeGPU/gemm_1024x1016x1016_f16_f16_f32.mlir

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
1-
// RUN: %python_executable %imex_runner --requires=l0-runtime -i %s --pass-pipeline-file=%p/xegpu-to-llvm.pp \
1+
// RUN: %python_executable %imex_runner --requires=l0-runtime -i %s --pass-pipeline-file=%p/xegpu-to-func-vc.pp \
22
// RUN: --runner imex-cpu-runner -e main \
33
// RUN: --entry-point-result=void \
44
// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%levelzero_runtime --filecheck
5-
// RUN: %python_executable %imex_runner --requires=sycl-runtime -i %s --pass-pipeline-file=%p/xegpu-to-llvm.pp \
5+
// RUN: %python_executable %imex_runner --requires=sycl-runtime -i %s --pass-pipeline-file=%p/xegpu-to-func-vc.pp \
66
// RUN: --runner imex-cpu-runner -e main \
77
// RUN: --entry-point-result=void \
88
// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck

test/Integration/Dialect/XeGPU/gemm_1024x1024xf16.mlir

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
1-
// RUN: %python_executable %imex_runner --requires=l0-runtime -i %s --pass-pipeline-file=%p/xegpu-to-llvm.pp \
1+
// RUN: %python_executable %imex_runner --requires=l0-runtime -i %s --pass-pipeline-file=%p/xegpu-to-func-vc.pp \
22
// RUN: --runner imex-cpu-runner -e main \
33
// RUN: --entry-point-result=void \
44
// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%levelzero_runtime --filecheck
5-
// RUN: %python_executable %imex_runner --requires=sycl-runtime -i %s --pass-pipeline-file=%p/xegpu-to-llvm.pp \
5+
// RUN: %python_executable %imex_runner --requires=sycl-runtime -i %s --pass-pipeline-file=%p/xegpu-to-func-vc.pp \
66
// RUN: --runner imex-cpu-runner -e main \
77
// RUN: --entry-point-result=void \
88
// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck

test/Integration/Dialect/XeGPU/gemm_1024x1024xf16.using.updateoffset.mlir

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
1-
// RUN: %python_executable %imex_runner --requires=l0-runtime -i %s --pass-pipeline-file=%p/xegpu-to-llvm.pp \
1+
// RUN: %python_executable %imex_runner --requires=l0-runtime -i %s --pass-pipeline-file=%p/xegpu-to-func-vc.pp \
22
// RUN: --runner imex-cpu-runner -e main \
33
// RUN: --entry-point-result=void \
44
// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%levelzero_runtime --filecheck
5-
// RUN: %python_executable %imex_runner --requires=sycl-runtime -i %s --pass-pipeline-file=%p/xegpu-to-llvm.pp \
5+
// RUN: %python_executable %imex_runner --requires=sycl-runtime -i %s --pass-pipeline-file=%p/xegpu-to-func-vc.pp \
66
// RUN: --runner imex-cpu-runner -e main \
77
// RUN: --entry-point-result=void \
88
// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck

test/Integration/Dialect/XeGPU/gemm_with_transposed_B_1kx1kx1k_f16_f16_f32.mlir

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
1-
// RUN: %python_executable %imex_runner --requires=l0-runtime -i %s --pass-pipeline-file=%p/xegpu-to-llvm.pp \
1+
// RUN: %python_executable %imex_runner --requires=l0-runtime -i %s --pass-pipeline-file=%p/xegpu-to-func-vc.pp \
22
// RUN: --runner imex-cpu-runner -e main \
33
// RUN: --entry-point-result=void \
44
// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%levelzero_runtime --filecheck
5-
// RUN: %python_executable %imex_runner --requires=sycl-runtime -i %s --pass-pipeline-file=%p/xegpu-to-llvm.pp \
5+
// RUN: %python_executable %imex_runner --requires=sycl-runtime -i %s --pass-pipeline-file=%p/xegpu-to-func-vc.pp \
66
// RUN: --runner imex-cpu-runner -e main \
77
// RUN: --entry-point-result=void \
88
// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck

test/Integration/Dialect/XeGPU/large_stores_8x32xf16_w_1d_vector_shuffle.vc.mlir

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
1-
// RUN: %python_executable %imex_runner --requires=l0-runtime -i %s --pass-pipeline-file=%p/xegpu-to-llvm.pp \
1+
// RUN: %python_executable %imex_runner --requires=l0-runtime -i %s --pass-pipeline-file=%p/xegpu-to-func-vc.pp \
22
// RUN: --runner imex-cpu-runner -e main \
33
// RUN: --entry-point-result=void \
44
// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%levelzero_runtime --filecheck
5-
// RUN: %python_executable %imex_runner --requires=sycl-runtime -i %s --pass-pipeline-file=%p/xegpu-to-llvm.pp \
5+
// RUN: %python_executable %imex_runner --requires=sycl-runtime -i %s --pass-pipeline-file=%p/xegpu-to-func-vc.pp \
66
// RUN: --runner imex-cpu-runner -e main \
77
// RUN: --entry-point-result=void \
88
// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck

0 commit comments

Comments
 (0)