@@ -301,10 +301,6 @@ class LayoutInfoPropagation
                           ArrayRef<LayoutInfoLattice *> operands,
                           ArrayRef<const LayoutInfoLattice *> results);
 
-  void visitPrefetchNdOp(xegpu::PrefetchNdOp prefetch,
-                         ArrayRef<LayoutInfoLattice *> operands,
-                         ArrayRef<const LayoutInfoLattice *> results);
-
   void visitVectorMultiReductionOp(vector::MultiDimReductionOp reduction,
                                    ArrayRef<LayoutInfoLattice *> operands,
                                    ArrayRef<const LayoutInfoLattice *> results);
@@ -356,9 +352,6 @@ LogicalResult LayoutInfoPropagation::visitOperation(
       .Case<xegpu::UpdateNdOffsetOp>([&](auto updateNdOffsetOp) {
         visitUpdateNdOffsetOp(updateNdOffsetOp, operands, results);
       })
-      .Case<xegpu::PrefetchNdOp>([&](auto prefetchNdOp) {
-        visitPrefetchNdOp(prefetchNdOp, operands, results);
-      })
       // No need to propagate the layout to operands in CreateNdDescOp because
       // they are scalars (offsets, sizes, etc.).
       .Case<xegpu::CreateNdDescOp>([&](auto createNdDescOp) {})
@@ -388,18 +381,6 @@ LogicalResult LayoutInfoPropagation::visitOperation(
   return success();
 }
 
-void LayoutInfoPropagation::visitPrefetchNdOp(
-    xegpu::PrefetchNdOp prefetch, ArrayRef<LayoutInfoLattice *> operands,
-    ArrayRef<const LayoutInfoLattice *> results) {
-  // Here we assign the default layout to the tensor descriptor operand of
-  // prefetch.
-  auto tdescTy = prefetch.getTensorDescType();
-  auto prefetchLayout = getDefaultLayoutInfo(
-      VectorType::get(tdescTy.getShape(), tdescTy.getElementType()));
-  // Propagate the layout to the source tensor descriptor.
-  propagateIfChanged(operands[0], operands[0]->meet(prefetchLayout));
-}
-
 void LayoutInfoPropagation::visitVectorMultiReductionOp(
     vector::MultiDimReductionOp reduction,
     ArrayRef<LayoutInfoLattice *> operands,
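For reference, the removed `visitPrefetchNdOp` hook simply assigned the default layout to the prefetch's tensor descriptor operand during propagation. A minimal sketch of the intended effect follows; the shape, value names, and the concrete `wi_layout`/`wi_data` values are illustrative only (the real values come from `getDefaultLayoutInfo`):

    // The descriptor feeding the prefetch starts out without a layout ...
    %tdesc = xegpu.create_nd_tdesc %src[%c0, %c0]
        : memref<64x64xf32> -> !xegpu.tensor_desc<4x8xf32>
    xegpu.prefetch_nd %tdesc : !xegpu.tensor_desc<4x8xf32>
    // ... and propagation associates a default layout with it, e.g.
    // #xegpu.layout<wi_layout = [1, 8], wi_data = [1, 1]>, so that later
    // distribution patterns know how the descriptor maps to lanes.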
@@ -1431,119 +1412,6 @@ struct DpasDistribution final : public gpu::WarpDistributionPattern {
   }
 };
 
-/// Sink an update_nd_offset op feeding into yield op of an enclosing
-/// `gpu.warp_execute_on_lane_0` region. The warp op will still contain the
-/// original op that will not be used by the yield op (and should be cleaned
-/// up later). The yield op will bypass the updateOp's arguments. The tensor
-/// descriptor type is not distributed. Appropriate cast ops are inserted if
-/// the distributed types do not match the expected xegpu SIMT types.
-/// Example:
-/// ```
-///   #lo0 = #xegpu.layout<wi_layout = [1, 8], wi_data = [1, 1]>
-///   %r = gpu.warp_execute_on_lane_0(%laneid) ->
-///                   (!xegpu.tensor_desc<4x8xf32, #lo0>) {
-///     ...
-///     %update = xegpu.update_nd_offset %arg0, [%c32, %c16]:
-///       !xegpu.tensor_desc<4x8xf32, #lo0>
-///     gpu.yield %update
-///   }
-///   ...
-/// ```
-/// To
-/// ```
-///   %r:2 = gpu.warp_execute_on_lane_0(%laneid) -> (vector<4x1xf32>,
-///   !xegpu.tensor_desc<4x8xf32, #lo0>) {
-///     ...
-///     %dead = xegpu.update_nd_offset %arg0, [%c32, %c16]:
-///       !xegpu.tensor_desc<4x8xf32, #lo0>
-///     gpu.yield %dead, %arg0, %c32, %c16
-///   }
-///   %0 = xegpu.unrealized_conversion_cast %r#1: !xegpu.tensor_desc<4x8xf32,
-///        #lo0> -> !xegpu.tensor_desc<4x8xf32>
-///   %1 = xegpu.update_nd_offset %0, [%c32, %c16]:
-///     !xegpu.tensor_desc<4x8xf32>
-///   ...
-/// ```
-struct UpdateNdOffsetDistribution final : public gpu::WarpDistributionPattern {
-  using gpu::WarpDistributionPattern::WarpDistributionPattern;
-  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op subgroupOp,
-                                PatternRewriter &rewriter) const override {
-    OpOperand *operand =
-        getWarpResult(subgroupOp, llvm::IsaPred<xegpu::UpdateNdOffsetOp>);
-    if (!operand)
-      return rewriter.notifyMatchFailure(
-          subgroupOp, "warp result is not a xegpu::UpdateNdOffset op");
-    auto updateOp = operand->get().getDefiningOp<xegpu::UpdateNdOffsetOp>();
-    unsigned operandIdx = operand->getOperandNumber();
-    xegpu::TensorDescType newTensorDescTy =
-        dropLayouts(updateOp.getTensorDescType());
-
-    SmallVector<Value, 3> newYieldValues;
-    SmallVector<Type, 3> newYieldTypes;
-    for (Value operand : updateOp->getOperands()) {
-      newYieldValues.push_back(operand);
-      if (isa<xegpu::TensorDescType>(operand.getType())) {
-        newYieldTypes.push_back(newTensorDescTy);
-      } else {
-        newYieldTypes.push_back(operand.getType());
-      }
-    }
-    SmallVector<size_t> newRetIndices;
-    gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
-        rewriter, subgroupOp, newYieldValues, newYieldTypes, newRetIndices);
-    rewriter.setInsertionPointAfter(newWarpOp);
-    SmallVector<Value> newUpdateOperands;
-    for (size_t i : newRetIndices) {
-      if (isa<xegpu::TensorDescType>(newWarpOp.getResult(i).getType())) {
-        newUpdateOperands.push_back(resolveDistributedTy(
-            newWarpOp.getResult(i), newTensorDescTy, rewriter));
-      } else {
-        newUpdateOperands.push_back(newWarpOp.getResult(i));
-      }
-    }
-    auto newUpdateOp = rewriter.create<xegpu::UpdateNdOffsetOp>(
-        newWarpOp.getLoc(), newTensorDescTy, newUpdateOperands,
-        removeTemporaryLayoutAttributes(updateOp->getAttrs()));
-    Value distributedVal = newWarpOp.getResult(operandIdx);
-    rewriter.replaceAllUsesWith(distributedVal, newUpdateOp);
-    return success();
-  }
-};
-
-struct PrefetchNdDistribution final : public gpu::WarpDistributionPattern {
-  using gpu::WarpDistributionPattern::WarpDistributionPattern;
-  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op subgroupOp,
-                                PatternRewriter &rewriter) const override {
-    auto yield = cast<gpu::YieldOp>(
-        subgroupOp.getBodyRegion().getBlocks().begin()->getTerminator());
-    Operation *lastNode = yield->getPrevNode();
-    auto prefetchOp = dyn_cast_or_null<xegpu::PrefetchNdOp>(lastNode);
-    if (!prefetchOp)
-      return failure();
-    xegpu::LayoutAttr layout = prefetchOp.getTensorDescType().getLayoutAttr();
-    if (!layout)
-      return rewriter.notifyMatchFailure(
-          prefetchOp, "the source tensor descriptor lacks layout attribute");
-
-    SmallVector<Value, 1> newYieldValues = {prefetchOp.getTensorDesc()};
-    SmallVector<Type, 1> newYieldTypes = {prefetchOp.getTensorDescType()};
-    SmallVector<size_t> newRetIndices;
-    gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
-        rewriter, subgroupOp, newYieldValues, newYieldTypes, newRetIndices);
-
-    xegpu::TensorDescType newTensorDescTy =
-        dropLayouts(prefetchOp.getTensorDescType());
-    rewriter.setInsertionPointAfter(newWarpOp);
-    SmallVector<Value> newPrefetchOperands = {resolveDistributedTy(
-        newWarpOp.getResult(newRetIndices[0]), newTensorDescTy, rewriter)};
-    rewriter.create<xegpu::PrefetchNdOp>(
-        newWarpOp.getLoc(), TypeRange{}, newPrefetchOperands,
-        removeTemporaryLayoutAttributes(prefetchOp->getAttrs()));
-    rewriter.eraseOp(prefetchOp);
-    return success();
-  }
-};
-
 /// Generic pattern for sinking a GPU index operation feeding into the yield op
 /// of an enclosing `gpu.warp_execute_on_lane_0` region. The original index op
 /// becomes dead and an equivalent copy of the index op is created outside the
@@ -1619,8 +1487,7 @@ struct XeGPUSubgroupDistributePass final
 void xegpu::populateXeGPUSubgroupDistributePatterns(
     RewritePatternSet &patterns) {
   patterns.add<CreateNdDescDistribution, StoreNdDistribution,
-               LoadNdDistribution, DpasDistribution, PrefetchNdDistribution,
-               UpdateNdOffsetDistribution>(patterns.getContext());
+               LoadNdDistribution, DpasDistribution>(patterns.getContext());
   // TODO: Is this the right place to add these patterns?
   patterns.add<GpuIndexOpDistribution<gpu::BlockIdOp>,
                GpuIndexOpDistribution<gpu::BlockDimOp>,
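The removed `PrefetchNdDistribution` pattern mirrored the other sink patterns in this file: it yielded the prefetch's tensor descriptor out of the `gpu.warp_execute_on_lane_0` region, dropped the layout from the descriptor type, re-created the prefetch after the warp op, and erased the original. A minimal before/after sketch in the style of the doc comments above; the value names, the 4x8xf32 shape, and the `#lo0` layout are illustrative only:

    #lo0 = #xegpu.layout<wi_layout = [1, 8], wi_data = [1, 1]>
    // Before distribution: the prefetch is the last op before the terminator.
    gpu.warp_execute_on_lane_0(%laneid) {
      ...
      xegpu.prefetch_nd %arg0 : !xegpu.tensor_desc<4x8xf32, #lo0>
      gpu.yield
    }
    // After distribution: the descriptor is yielded from the region, a cast
    // drops the layout from its type, and the prefetch runs after the warp op.
    %r = gpu.warp_execute_on_lane_0(%laneid) ->
                    (!xegpu.tensor_desc<4x8xf32, #lo0>) {
      ...
      gpu.yield %arg0
    }
    %0 = builtin.unrealized_conversion_cast %r
        : !xegpu.tensor_desc<4x8xf32, #lo0> to !xegpu.tensor_desc<4x8xf32>
    xegpu.prefetch_nd %0 : !xegpu.tensor_desc<4x8xf32>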