
Commit 08ade3f

clean up

1 parent 6aa4aef

3 files changed, +2 -202 lines

mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
Lines changed: 1 addition & 1 deletion

@@ -409,7 +409,7 @@ def XeGPU_StoreNdOp : XeGPU_Op<"store_nd", [
 }
 
 def XeGPU_UpdateNdOffsetOp : XeGPU_Op<"update_nd_offset",
-                [Pure, AllTypesMatch<["TensorDesc", "result"]>]> {
+                [AllTypesMatch<["TensorDesc", "result"]>]> {
   let summary = "It updates the offsets for the TensorDesc.";
   let description = [{The op updates the offset of the given TensorDesc.
     The offsets are relative offset to the current position in the number
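For reference, a minimal usage sketch of xegpu.update_nd_offset as declared above (illustrative only; the memref type, tile shape, and SSA value names are assumptions that mirror the tests touched later in this commit):

  %c0 = arith.constant 0 : index
  %c32 = arith.constant 32 : index
  // Build a 16x16 tile descriptor, then advance it by a relative [32, 32] offset.
  %tdesc = xegpu.create_nd_tdesc %src[%c0, %c0] : memref<256x256xf32> -> !xegpu.tensor_desc<16x16xf32>
  %next = xegpu.update_nd_offset %tdesc, [%c32, %c32] : !xegpu.tensor_desc<16x16xf32>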

mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
Lines changed: 1 addition & 134 deletions

@@ -301,10 +301,6 @@ class LayoutInfoPropagation
                         ArrayRef<LayoutInfoLattice *> operands,
                         ArrayRef<const LayoutInfoLattice *> results);
 
-  void visitPrefetchNdOp(xegpu::PrefetchNdOp prefetch,
-                         ArrayRef<LayoutInfoLattice *> operands,
-                         ArrayRef<const LayoutInfoLattice *> results);
-
   void visitVectorMultiReductionOp(vector::MultiDimReductionOp reduction,
                                    ArrayRef<LayoutInfoLattice *> operands,
                                    ArrayRef<const LayoutInfoLattice *> results);

@@ -356,9 +352,6 @@ LogicalResult LayoutInfoPropagation::visitOperation(
       .Case<xegpu::UpdateNdOffsetOp>([&](auto updateNdOffsetOp) {
        visitUpdateNdOffsetOp(updateNdOffsetOp, operands, results);
       })
-      .Case<xegpu::PrefetchNdOp>([&](auto prefetchNdOp) {
-        visitPrefetchNdOp(prefetchNdOp, operands, results);
-      })
       // No need to propagate the layout to operands in CreateNdDescOp because
       // they are scalars (offsets, sizes, etc.).
       .Case<xegpu::CreateNdDescOp>([&](auto createNdDescOp) {})

@@ -388,18 +381,6 @@ LogicalResult LayoutInfoPropagation::visitOperation(
   return success();
 }
 
-void LayoutInfoPropagation::visitPrefetchNdOp(
-    xegpu::PrefetchNdOp prefetch, ArrayRef<LayoutInfoLattice *> operands,
-    ArrayRef<const LayoutInfoLattice *> results) {
-  // Here we assign the default layout to the tensor descriptor operand of
-  // prefetch.
-  auto tdescTy = prefetch.getTensorDescType();
-  auto prefetchLayout = getDefaultLayoutInfo(
-      VectorType::get(tdescTy.getShape(), tdescTy.getElementType()));
-  // Propagate the layout to the source tensor descriptor.
-  propagateIfChanged(operands[0], operands[0]->meet(prefetchLayout));
-}
-
 void LayoutInfoPropagation::visitVectorMultiReductionOp(
     vector::MultiDimReductionOp reduction,
     ArrayRef<LayoutInfoLattice *> operands,
     ArrayRef<const LayoutInfoLattice *> results) {

@@ -1431,119 +1412,6 @@ struct DpasDistribution final : public gpu::WarpDistributionPattern {
   }
 };
 
-/// Sink an update_nd_offset op feeding into yield op of an enclosing
-/// `gpu.warp_execute_on_lane_0` region. The warp op will still contain the
-/// original op that will not be used by the yield op (and should be cleaned
-/// up later). The yield op will bypass the updateOp's arguments. The tensor
-/// descriptor type is not distributed. Appropriate cast ops are inserted if
-/// the distributed types does not match expected xegpu SIMT types.
-/// Example:
-/// ```
-///   #lo0 = #xegpu.layout<wi_layout = [1, 8], wi_data = [1, 1]>
-///   %r = gpu.warp_execute_on_lane_0(%laneid) ->
-///                   (!xegpu.tensor_desc<4x8xf32, #lo0>) {
-///     ...
-///     %update = xegpu.update_nd_offset %arg0, [%c32, %c16]:
-///       !xegpu.tensor_desc<4x8xf32, #lo0>
-///     gpu.yield %update
-///   }
-///   ...
-/// ```
-/// To
-/// ```
-///   %r:2 = gpu.warp_execute_on_lane_0(%laneid) -> (vector<4x1xf32>,
-///   !xegpu.tensor_desc<4x8xf32, #lo0>) {
-///     ...
-///     %dead = xegpu.update_nd_offset %arg0, [%c32, %c16]:
-///       !xegpu.tensor_desc<4x8xf32, #lo0> gpu.yield %dead, %arg0
-///     gup.yield %dead, %arg0, %c32, %c16
-///   }
-///   %0 = xegpu.unrealized_conversion_cast %r#1: !xegpu.tensor_desc<4x8xf32,
-///   #lo0> -> !xegpu.tensor_desc<4x8xf32>
-///   %1 = xegpu.update_nd_offset %0, [%c32, %c16]:
-///     !xegpu.tensor_desc<4x8xf32>
-///   ...
-/// ```
-struct UpdateNdOffsetDistribution final : public gpu::WarpDistributionPattern {
-  using gpu::WarpDistributionPattern::WarpDistributionPattern;
-  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op subgroupOp,
-                                PatternRewriter &rewriter) const override {
-    OpOperand *operand =
-        getWarpResult(subgroupOp, llvm::IsaPred<xegpu::UpdateNdOffsetOp>);
-    if (!operand)
-      return rewriter.notifyMatchFailure(
-          subgroupOp, "warp result is not a xegpu::UpdateNdOffset op");
-    auto updateOp = operand->get().getDefiningOp<xegpu::UpdateNdOffsetOp>();
-    unsigned operandIdx = operand->getOperandNumber();
-    xegpu::TensorDescType newTensorDescTy =
-        dropLayouts(updateOp.getTensorDescType());
-
-    SmallVector<Value, 3> newYieldValues;
-    SmallVector<Type, 3> newYieldTypes;
-    for (Value operand : updateOp->getOperands()) {
-      newYieldValues.push_back(operand);
-      if (isa<xegpu::TensorDescType>(operand.getType())) {
-        newYieldTypes.push_back(newTensorDescTy);
-      } else {
-        newYieldTypes.push_back(operand.getType());
-      }
-    }
-    SmallVector<size_t> newRetIndices;
-    gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
-        rewriter, subgroupOp, newYieldValues, newYieldTypes, newRetIndices);
-    rewriter.setInsertionPointAfter(newWarpOp);
-    SmallVector<Value> newUpdateOperands;
-    for (size_t i : newRetIndices) {
-      if (isa<xegpu::TensorDescType>(newWarpOp.getResult(i).getType())) {
-        newUpdateOperands.push_back(resolveDistributedTy(
-            newWarpOp.getResult(i), newTensorDescTy, rewriter));
-      } else {
-        newUpdateOperands.push_back(newWarpOp.getResult(i));
-      }
-    }
-    auto newUpdateOp = rewriter.create<xegpu::UpdateNdOffsetOp>(
-        newWarpOp.getLoc(), newTensorDescTy, newUpdateOperands,
-        removeTemporaryLayoutAttributes(updateOp->getAttrs()));
-    Value distributedVal = newWarpOp.getResult(operandIdx);
-    rewriter.replaceAllUsesWith(distributedVal, newUpdateOp);
-    return success();
-  }
-};
-
-struct PrefetchNdDistribution final : public gpu::WarpDistributionPattern {
-  using gpu::WarpDistributionPattern::WarpDistributionPattern;
-  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op subgroupOp,
-                                PatternRewriter &rewriter) const override {
-    auto yield = cast<gpu::YieldOp>(
-        subgroupOp.getBodyRegion().getBlocks().begin()->getTerminator());
-    Operation *lastNode = yield->getPrevNode();
-    auto prefetchOp = dyn_cast_or_null<xegpu::PrefetchNdOp>(lastNode);
-    if (!prefetchOp)
-      return failure();
-    xegpu::LayoutAttr layout = prefetchOp.getTensorDescType().getLayoutAttr();
-    if (!layout)
-      return rewriter.notifyMatchFailure(
-          prefetchOp, "the source tensor descriptor lacks layout attribute");
-
-    SmallVector<Value, 1> newYieldValues = {prefetchOp.getTensorDesc()};
-    SmallVector<Type, 1> newYieldTypes = {prefetchOp.getTensorDescType()};
-    SmallVector<size_t> newRetIndices;
-    gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
-        rewriter, subgroupOp, newYieldValues, newYieldTypes, newRetIndices);
-
-    xegpu::TensorDescType newTensorDescTy =
-        dropLayouts(prefetchOp.getTensorDescType());
-    rewriter.setInsertionPointAfter(newWarpOp);
-    SmallVector<Value> newPrefetchOperands = {resolveDistributedTy(
-        newWarpOp.getResult(newRetIndices[0]), newTensorDescTy, rewriter)};
-    rewriter.create<xegpu::PrefetchNdOp>(
-        newWarpOp.getLoc(), TypeRange{}, newPrefetchOperands,
-        removeTemporaryLayoutAttributes(prefetchOp->getAttrs()));
-    rewriter.eraseOp(prefetchOp);
-    return success();
-  }
-};
-
 /// Generic pattern for sinking a GPU index operations feeding into yield op
 /// of an enclosing `gpu.warp_execute_on_lane_0` region. The original index op
 /// becomes dead and an equivalent copy of the index op is created outside the

@@ -1619,8 +1487,7 @@ struct XeGPUSubgroupDistributePass final
 void xegpu::populateXeGPUSubgroupDistributePatterns(
     RewritePatternSet &patterns) {
   patterns.add<CreateNdDescDistribution, StoreNdDistribution,
-               LoadNdDistribution, DpasDistribution, PrefetchNdDistribution,
-               UpdateNdOffsetDistribution>(patterns.getContext());
+               LoadNdDistribution, DpasDistribution>(patterns.getContext());
   // TODO: Is this the right place to add these patterns?
   patterns.add<GpuIndexOpDistribution<gpu::BlockIdOp>,
                GpuIndexOpDistribution<gpu::BlockDimOp>,
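As a rough sketch of what the distribution patterns registered here do (illustrative only; the 16-element shapes and value names are assumptions that follow the store_nd CHECK lines in the test file below), subgroup-level SIMD ops are rewritten into per-lane SIMT form:

  // Subgroup-level form before distribution: one op covers the whole 16-wide tile.
  xegpu.store_nd %vals, %tdesc : vector<16xf32>, !xegpu.tensor_desc<16xf32>
  // Per-lane form after distribution: each lane stores its own one-element slice
  // through a tensor descriptor whose layout attribute has been dropped.
  xegpu.store_nd %lane_val, %lane_tdesc : vector<1xf32>, !xegpu.tensor_desc<16xf32>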

mlir/test/Dialect/XeGPU/subgroup-distribution.mlir
Lines changed: 0 additions & 67 deletions

@@ -161,73 +161,6 @@ gpu.func @create_nd_tdesc_non_memref(%arg0: ui64, %arg1: ui64,
 }
 }
 
-// -----
-// CHECK-LABEL: gpu.func @update_nd_offset_1d(
-// CHECK: %[[ARG0:[0-9a-zA-Z]+]]: memref<256xf32>) {
-// CHECK: %[[CST:.*]] = arith.constant dense<1.000000e+00> : vector<1xf32>
-// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<256xf32> -> !xegpu.tensor_desc<16xf32>
-// CHECK: %[[T1:.*]] = xegpu.update_nd_offset %[[T0]], [%c32] : !xegpu.tensor_desc<16xf32>
-// CHECK: xegpu.store_nd %[[CST]], %[[T1]] : vector<1xf32>, !xegpu.tensor_desc<16xf32>
-gpu.module @test {
-gpu.func @update_nd_offset_1d(%arg0: memref<256xf32>){
-  %c0 = arith.constant 0 : index
-  %c32 = arith.constant 32 : index
-  %1 = arith.constant dense<1.000000e+00> : vector<16xf32>
-  %0 = xegpu.create_nd_tdesc %arg0[%c0] : memref<256xf32> -> !xegpu.tensor_desc<16xf32>
-  %2 = xegpu.update_nd_offset %0, [%c32] : !xegpu.tensor_desc<16xf32>
-  xegpu.store_nd %1, %2 : vector<16xf32>, !xegpu.tensor_desc<16xf32>
-  gpu.return
-}
-}
-
-// -----
-// CHECK-LABEL: gpu.func @update_nd_offset_2d
-// CHECK: %[[ARG0:[0-9a-zA-Z]+]]: memref<256x256xf32>) {
-// CHECK: %[[CST:.*]] = arith.constant dense<1.000000e+00> : vector<16xf32>
-// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<256x256xf32> -> !xegpu.tensor_desc<16x16xf32>
-// CHECK: %[[T1:.*]] = xegpu.update_nd_offset %[[T0]], [%c32, %c32] : !xegpu.tensor_desc<16x16xf32>
-// CHECK: xegpu.store_nd %[[CST]], %[[T1]] : vector<16xf32>, !xegpu.tensor_desc<16x16xf32>
-gpu.module @test {
-gpu.func @update_nd_offset_2d(%arg0: memref<256x256xf32>){
-  %c0 = arith.constant 0 : index
-  %c32 = arith.constant 32 : index
-  %1 = arith.constant dense<1.000000e+00> : vector<16x16xf32>
-  %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<256x256xf32> -> !xegpu.tensor_desc<16x16xf32>
-  %2 = xegpu.update_nd_offset %0, [%c32, %c32] : !xegpu.tensor_desc<16x16xf32>
-  xegpu.store_nd %1, %2 : vector<16x16xf32>, !xegpu.tensor_desc<16x16xf32>
-  gpu.return
-}
-}
-
-// -----
-// CHECK-LABEL: gpu.func @prefetch_2d
-// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<256x256xf16>) {
-// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<256x256xf16> -> !xegpu.tensor_desc<16x16xf16>
-// CHECK: xegpu.prefetch_nd %[[T0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16>
-gpu.module @test {
-gpu.func @prefetch_2d(%arg0: memref<256x256xf16>){
-  %c0 = arith.constant 0 : index
-  %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<256x256xf16> -> !xegpu.tensor_desc<16x16xf16>
-  xegpu.prefetch_nd %0 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>: !xegpu.tensor_desc<16x16xf16>
-  gpu.return
-}
-}
-
-// -----
-// CHECK-LABEL: gpu.func @prefetch_1d
-// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<256xf16>) {
-// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<256xf16> -> !xegpu.tensor_desc<16xf16>
-// CHECK: xegpu.prefetch_nd %[[T0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16xf16>
-gpu.module @test {
-gpu.func @prefetch_1d(%arg0: memref<256xf16>){
-  %c0 = arith.constant 0 : index
-  %0 = xegpu.create_nd_tdesc %arg0[%c0] : memref<256xf16> -> !xegpu.tensor_desc<16xf16>
-  xegpu.prefetch_nd %0 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>: !xegpu.tensor_desc<16xf16>
-  gpu.return
-}
-}
-
-
 // -----
 // CHECK-LABEL: gpu.func @gemm_loop
 // CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<1024x1024xbf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<1024x1024xbf16>, %[[ARG2:[0-9a-zA-Z]+]]: memref<1024x1024xf32>) {
