@@ -268,7 +268,7 @@ struct MoveFuncBodyToWarpExecuteOnLane0
 ///   %r = gpu.warp_execute_on_lane_0(%laneid) ->
 ///         (!xegpu.tensor_desc<4x8xf32, #layout0>) {
 ///     ...
-///     %td = xegpu.create_nd_tdesc %arg0[0, 0]
+///     %td = xegpu.create_nd_tdesc %arg0
 ///       : memref<4x8xf32> -> !xegpu.tensor_desc<4x8xf32, #layout0>
 ///     vector.yield %td
 ///   }
@@ -277,11 +277,11 @@ struct MoveFuncBodyToWarpExecuteOnLane0
 /// ```
 ///   %r:2 = gpu.warp_execute_on_lane_0(%laneid) -> (...) {
 ///     ...
-///     %dead = xegpu.create_nd_tdesc %arg0[0, 0]
+///     %dead = xegpu.create_nd_tdesc %arg0
 ///       : memref<4x8xf32> -> !xegpu.tensor_desc<4x8xf32, #layout0>
 ///     vector.yield %arg0, %dead
 ///   }
-///   %td = xegpu.create_nd_tdesc %r#0[0, 0] : memref<4x8xf32>
+///   %td = xegpu.create_nd_tdesc %r#0: memref<4x8xf32>
 ///     -> !xegpu.tensor_desc<4x8xf32>
 ///
 /// ```
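Taken together with the load/store/prefetch hunks below, the convention after this patch is that `xegpu.create_nd_tdesc` describes only the base memref, and the offsets travel with the access op instead. A minimal sketch of input IR under that convention (the shapes, `%x`/`%y`, and `#layout0` are illustrative, not copied from the patch):

```mlir
#layout0 = #xegpu.layout<wi_layout = [1, 8], wi_data = [1, 1]>
// The descriptor no longer carries offsets.
%td = xegpu.create_nd_tdesc %arg0
  : memref<4x8xf32> -> !xegpu.tensor_desc<4x8xf32, #layout0>
// Offsets are supplied at the point of access.
xegpu.store_nd %val, %td [%x, %y]
  : vector<4x8xf32>, !xegpu.tensor_desc<4x8xf32, #layout0>
```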
@@ -301,6 +301,10 @@ struct CreateNdDescDistribution final : public gpu::WarpDistributionPattern {
     if (!layout)
       return rewriter.notifyMatchFailure(
           descOp, "the tensor descriptor lacks layout attribute");
+    // CreateNdOp must not have offsets.
+    if (descOp.getMixedOffsets().size())
+      return rewriter.notifyMatchFailure(
+          descOp, "xegpu::CreateNdDescOp must not have offsets");

     SmallVector<size_t> newRetIndices;
     rewriter.setInsertionPoint(warpOp);
@@ -339,22 +343,23 @@ struct CreateNdDescDistribution final : public gpu::WarpDistributionPattern {
 ///   #layout0 = #xegpu.layout<wi_layout = [1, 8], wi_data = [1, 1]>
 ///   gpu.warp_execute_on_lane_0(%laneid) -> () {
 ///     ...
-///     xegpu.store_nd %arg0, %arg1: vector<4x8xf32>,
+///     xegpu.store_nd %arg0, %arg1 [%x, %y] : vector<4x8xf32>,
 ///       !xegpu.tensor_desc<4x8xf32, #layout0>
 ///   }
 /// ```
 /// To
 /// ```
 ///   %r:2 = gpu.warp_execute_on_lane_0(%laneid) -> (vector<4x1xf32>,
-///     !xegpu.tensor_desc<4x8xf32, #layout0>) {
-///     gpu.yield %arg0, %arg1: vector<4x8xf32>, !xegpu.tensor_desc<4x8xf32,
-///     #layout0>
+///     !xegpu.tensor_desc<4x8xf32, #layout0>, index, index) {
+///     ...
+///     gpu.yield %arg0, %arg1, %x, %y: vector<4x8xf32>,
+///       !xegpu.tensor_desc<4x8xf32, #layout0>, index, index
 ///   }
 ///   %0 = vector.shape_cast %r#0: vector<4x1xf32> to vector<4xf32>
 ///   %1 = unrealized_conversion_cast %r#1: !xegpu.tensor_desc<4x8xf32,
 ///     #layout0>
 ///     -> !xegpu.tensor_desc<4x8xf32>
-///   xegpu.store_nd %0, %1: vector<4xf32>,
+///   xegpu.store_nd %0, %1 [%r#2, %r#3] : vector<4xf32>,
 ///     !xegpu.tensor_desc<4x8xf32>
 ///
 /// ```
@@ -368,10 +373,15 @@ struct StoreNdDistribution final : public gpu::WarpDistributionPattern {
     if (!storeOp)
       return failure();

-    int64_t offsetSize = static_cast<int64_t>(storeOp.getOffsets().size());
-    if ((offsetSize != 0) || storeOp.getConstOffsetsAttr())
-      return failure();
-
+    SmallVector<OpFoldResult> offsets = storeOp.getMixedOffsets();
+    // Expecting offsets to be present.
+    if (offsets.empty())
+      return rewriter.notifyMatchFailure(storeOp,
+                                         "the store op must have offsets");
+    SmallVector<Value> offsetsAsValues =
+        vector::getAsValues(rewriter, storeOp.getLoc(), offsets);
+    SmallVector<Type> offsetTypes = llvm::to_vector(
+        llvm::map_range(offsetsAsValues, [](Value v) { return v.getType(); }));
     xegpu::TensorDescType tensorDescTy = storeOp.getTensorDescType();
     xegpu::LayoutAttr layout = tensorDescTy.getLayoutAttr();
     if (!layout)
@@ -387,13 +397,13 @@ struct StoreNdDistribution final : public gpu::WarpDistributionPattern {
         distributedTypeByWarpOpOrFailure.value();

     SmallVector<size_t> newRetIndices;
+    SmallVector<Value> newYieldedValues = {storeOp.getValue(),
+                                           storeOp.getTensorDesc()};
+    SmallVector<Type> newYieldedTypes = {distributedTypeByWarpOp, tensorDescTy};
+    newYieldedValues.append(offsetsAsValues.begin(), offsetsAsValues.end());
+    newYieldedTypes.append(offsetTypes.begin(), offsetTypes.end());
     gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
-        rewriter, warpOp,
-        /* new yielded values = */
-        ValueRange{storeOp.getValue(), storeOp.getTensorDesc()},
-        /* new yielded types = */
-        TypeRange{distributedTypeByWarpOp, storeOp.getTensorDescType()},
-        newRetIndices);
+        rewriter, warpOp, newYieldedValues, newYieldedTypes, newRetIndices);
     // Create a new store op outside the warp op with the distributed vector
     // type. Tensor descriptor is not distributed.
     rewriter.setInsertionPointAfter(newWarpOp);
@@ -418,6 +428,9 @@ struct StoreNdDistribution final : public gpu::WarpDistributionPattern {
     newStoreOperands.push_back(
         resolveDistributedTy(newWarpOp.getResult(newRetIndices[1]),
                              distributedTensorDescTy, rewriter));
+    // Collect offsets.
+    for (size_t i = 2; i < newRetIndices.size(); ++i)
+      newStoreOperands.push_back(newWarpOp.getResult(newRetIndices[i]));

     auto newStoreOp =
         xegpu::StoreNdOp::create(rewriter, newWarpOp.getLoc(), TypeRange{},
@@ -491,9 +504,15 @@ struct LoadNdDistribution final : public gpu::WarpDistributionPattern {
           loadOp,
           "xegpu::LoadNdOp require chip information to determine transpose "
           "requirement");
-    int64_t offsetSize = static_cast<int64_t>(loadOp.getOffsets().size());
-    if ((offsetSize != 0) || loadOp.getConstOffsetsAttr())
-      return failure();
+    // Expecting offsets to be present.
+    SmallVector<OpFoldResult> offsets = loadOp.getMixedOffsets();
+    if (offsets.empty())
+      return rewriter.notifyMatchFailure(loadOp,
+                                         "the load op must have offsets");
+    SmallVector<Value> offsetsAsValues =
+        vector::getAsValues(rewriter, loadOp.getLoc(), offsets);
+    SmallVector<Type> offsetTypes = llvm::to_vector(
+        llvm::map_range(offsetsAsValues, [](Value v) { return v.getType(); }));

     xegpu::TensorDescType tensorDescTy = loadOp.getTensorDescType();
     xegpu::LayoutAttr layout = tensorDescTy.getLayoutAttr();
@@ -506,10 +525,12 @@ struct LoadNdDistribution final : public gpu::WarpDistributionPattern {
         cast<VectorType>(warpOp.getResult(operandIdx).getType());

     SmallVector<size_t> newRetIndices;
+    SmallVector<Value> newYieldedValues = {loadOp.getTensorDesc()};
+    SmallVector<Type> newYieldedTypes = {tensorDescTy};
+    newYieldedValues.append(offsetsAsValues.begin(), offsetsAsValues.end());
+    newYieldedTypes.append(offsetTypes.begin(), offsetTypes.end());
     gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
-        rewriter, warpOp,
-        /* new yielded values = */ loadOp.getTensorDesc(),
-        /* new yielded types = */ tensorDescTy, newRetIndices);
+        rewriter, warpOp, newYieldedValues, newYieldedTypes, newRetIndices);

     // Create a new load op outside the warp op with the distributed vector
     // type.
@@ -523,11 +544,15 @@ struct LoadNdDistribution final : public gpu::WarpDistributionPattern {
         loadOp.getTensorDescType().dropLayouts(); // Distributed tensor
                                                   // descriptor type does not
                                                   // contain layout info.
+    SmallVector<Value> newLoadOperands{
+        resolveDistributedTy(newWarpOp.getResult(newRetIndices[0]),
+                             distributedTensorDescTy, rewriter)};
+    // Collect offsets.
+    for (size_t i = 1; i < newRetIndices.size(); ++i)
+      newLoadOperands.push_back(newWarpOp.getResult(newRetIndices[i]));
     auto newLoadOp = xegpu::LoadNdOp::create(
         rewriter, newWarpOp.getLoc(), loadNdDistValueTyOrFailure.value(),
-        resolveDistributedTy(newWarpOp->getResult(newRetIndices[0]),
-                             distributedTensorDescTy, rewriter),
-        loadOp->getAttrs());
+        newLoadOperands, loadOp->getAttrs());
     xegpu::removeLayoutAttrs(newLoadOp);
     // Set the packed attribute if the layout requires it.
     newLoadOp.setPacked(requirePacked(layout));
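For reference, the rewritten load now mirrors the store_nd example earlier in this patch: the tensor descriptor and the offsets are yielded out of the warp region and re-attached to the new `xegpu.load_nd` created outside it. A sketch of the expected result, assembled by analogy with the store_nd/prefetch_nd doc comments rather than copied from the patch:

```mlir
%r:3 = gpu.warp_execute_on_lane_0(%laneid) -> (
    !xegpu.tensor_desc<4x8xf32, #layout0>, index, index) {
  ...
  gpu.yield %tdesc, %x, %y : !xegpu.tensor_desc<4x8xf32, #layout0>, index, index
}
%0 = unrealized_conversion_cast %r#0 : !xegpu.tensor_desc<4x8xf32, #layout0>
  -> !xegpu.tensor_desc<4x8xf32>
%1 = xegpu.load_nd %0 [%r#1, %r#2] : !xegpu.tensor_desc<4x8xf32> -> vector<4xf32>
```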
@@ -677,85 +702,6 @@ struct DpasDistribution final : public gpu::WarpDistributionPattern {
   }
 };

-/// Sink an update_nd_offset op feeding into yield op of an enclosing
-/// `gpu.warp_execute_on_lane_0` region. The warp op will still contain the
-/// original op that will not be used by the yield op (and should be cleaned
-/// up later). The yield op will bypass the updateOp's arguments. The tensor
-/// descriptor type is not distributed. Appropriate cast ops are inserted if
-/// the distributed types does not match expected xegpu SIMT types.
-/// Example:
-/// ```
-///   #layout0 = #xegpu.layout<wi_layout = [1, 8], wi_data = [1, 1]>
-///   %r = gpu.warp_execute_on_lane_0(%laneid) ->
-///         (!xegpu.tensor_desc<4x8xf32, #layout0>) {
-///     ...
-///     %update = xegpu.update_nd_offset %arg0, [%c32, %c16]:
-///       !xegpu.tensor_desc<4x8xf32, #layout0>
-///     gpu.yield %update
-///   }
-///   ...
-/// ```
-/// To
-/// ```
-///   %r:2 = gpu.warp_execute_on_lane_0(%laneid) -> (
-///     !xegpu.tensor_desc<4x8xf32, #layout0>,
-///     !xegpu.tensor_desc<4x8xf32, #layout0>, index, index) {
-///     ...
-///     %dead = xegpu.update_nd_offset %arg0, [%c32, %c16]:
-///       !xegpu.tensor_desc<4x8xf32, #layout0> gpu.yield %dead, %arg0
-///     gpu.yield %dead, %arg0, %c32, %c16
-///   }
-///   %0 = xegpu.unrealized_conversion_cast %r#1: !xegpu.tensor_desc<4x8xf32,
-///     #layout0> -> !xegpu.tensor_desc<4x8xf32>
-///   %1 = xegpu.update_nd_offset %0, [%r#2, %r#3]:
-///     !xegpu.tensor_desc<4x8xf32>
-///   ...
-/// ```
-struct UpdateNdOffsetDistribution final : public gpu::WarpDistributionPattern {
-  using gpu::WarpDistributionPattern::WarpDistributionPattern;
-  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
-                                PatternRewriter &rewriter) const override {
-    OpOperand *operand =
-        getWarpResult(warpOp, llvm::IsaPred<xegpu::UpdateNdOffsetOp>);
-    if (!operand)
-      return rewriter.notifyMatchFailure(
-          warpOp, "warp result is not a xegpu::UpdateNdOffset op");
-    auto updateOp = operand->get().getDefiningOp<xegpu::UpdateNdOffsetOp>();
-    unsigned operandIdx = operand->getOperandNumber();
-
-    SmallVector<size_t> newRetIndices;
-    gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
-        rewriter, warpOp, updateOp->getOperands(), updateOp.getOperandTypes(),
-        newRetIndices);
-    rewriter.setInsertionPointAfter(newWarpOp);
-    // new update op does not have layout attribute.
-    xegpu::TensorDescType distributedTensorDescTy =
-        updateOp.getTensorDescType().dropLayouts();
-    SmallVector<Value> newUpdateOperands =
-        llvm::map_to_vector(newRetIndices, [&](size_t i) {
-          // For the tensor descriptor operand, the layout attribute is
-          // dropped after distribution. Types needs to be resolved in this
-          // case.
-          if (isa<xegpu::TensorDescType>(newWarpOp.getResult(i).getType())) {
-            return resolveDistributedTy(newWarpOp.getResult(i),
-                                        distributedTensorDescTy, rewriter);
-          }
-          return newWarpOp.getResult(i);
-        });
-    // Create a new update op outside the warp op.
-    auto newUpdateOp = xegpu::UpdateNdOffsetOp::create(
-        rewriter, newWarpOp.getLoc(), distributedTensorDescTy,
-        newUpdateOperands, updateOp->getAttrs());
-    xegpu::removeLayoutAttrs(newUpdateOp);
-    Value distributedVal = newWarpOp.getResult(operandIdx);
-    // Resolve the distributed type with the original type.
-    Value typeResolved = resolveDistributedTy(
-        newUpdateOp.getResult(), distributedVal.getType(), rewriter);
-    rewriter.replaceAllUsesWith(distributedVal, typeResolved);
-    return success();
-  }
-};
-
 /// Distribute a prefetch_nd op at the end of enclosing
 /// `gpu.warp_execute_on_lane_0`. In case arguments for the prefetch are passed
 /// through the warp op interface they would be propagated as returned values.
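The UpdateNdOffsetDistribution pattern deleted above used to sink `xegpu.update_nd_offset` out of the warp region; with offsets now carried by the access ops themselves, descriptor advancement no longer needs a distribution pattern of its own (it is also dropped from the pattern registration at the end of this diff). A sketch of the two addressing styles, with illustrative operands not taken from the patch:

```mlir
// Previously: advance the descriptor, then access it without offsets.
%next = xegpu.update_nd_offset %td, [%c32, %c16] : !xegpu.tensor_desc<4x8xf32, #layout0>
%a = xegpu.load_nd %next : !xegpu.tensor_desc<4x8xf32, #layout0> -> vector<4x8xf32>
// With this patch: keep the descriptor fixed and pass the offsets at the access.
%b = xegpu.load_nd %td [%c32, %c16] : !xegpu.tensor_desc<4x8xf32, #layout0> -> vector<4x8xf32>
```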
@@ -769,18 +715,19 @@ struct UpdateNdOffsetDistribution final : public gpu::WarpDistributionPattern {
 ///   #layout0 = #xegpu.layout<wi_layout = [1, 8], wi_data = [1, 1]>
 ///   gpu.warp_execute_on_lane_0(%laneid) -> () {
 ///     ...
-///     xegpu.prefetch_nd %arg0 : !xegpu.tensor_desc<4x8xf32, #layout0>
+///     xegpu.prefetch_nd %arg0 [%x, %y] : !xegpu.tensor_desc<4x8xf32, #layout0>
 ///   }
 /// ```
 /// To
 /// ```
 ///   %r:1 = gpu.warp_execute_on_lane_0(%laneid) -> (
-///     !xegpu.tensor_desc<4x8xf32, #layout0>) {
-///     gpu.yield %arg0: !xegpu.tensor_desc<4x8xf32, #layout0>
+///     !xegpu.tensor_desc<4x8xf32, #layout0>, index, index) {
+///     gpu.yield %arg0, %x, %y: !xegpu.tensor_desc<4x8xf32, #layout0>, index,
+///     index
 ///   }
 ///   %1 = unrealized_conversion_cast %r#0: !xegpu.tensor_desc<4x8xf32,
 ///     #layout0> -> !xegpu.tensor_desc<4x8xf32>
-///   xegpu.prefetch_nd %1 : !xegpu.tensor_desc<4x8xf32>
+///   xegpu.prefetch_nd %1 [%r#1, %r#2] : !xegpu.tensor_desc<4x8xf32>
 ///
 /// ```
 struct PrefetchNdDistribution final : public gpu::WarpDistributionPattern {
@@ -793,17 +740,25 @@ struct PrefetchNdDistribution final : public gpu::WarpDistributionPattern {
     if (!prefetchOp)
       return failure();

-    int64_t offsetSize = static_cast<int64_t>(prefetchOp.getOffsets().size());
-    if ((offsetSize != 0) || prefetchOp.getConstOffsetsAttr())
-      return failure();
+    SmallVector<OpFoldResult> offsets = prefetchOp.getMixedOffsets();
+    // PrefetchNdOp must have offsets.
+    if (offsets.empty())
+      return rewriter.notifyMatchFailure(prefetchOp,
+                                         "the prefetch op must have offsets");
+    SmallVector<Value> offsetsAsValues =
+        vector::getAsValues(rewriter, prefetchOp.getLoc(), offsets);
+    SmallVector<Type> offsetTypes = llvm::to_vector(
+        llvm::map_range(offsetsAsValues, [](Value v) { return v.getType(); }));

     xegpu::LayoutAttr layout = prefetchOp.getTensorDescType().getLayoutAttr();
     if (!layout)
       return rewriter.notifyMatchFailure(
           prefetchOp, "the source tensor descriptor lacks layout attribute");

-    SmallVector<Value, 1> newYieldValues = {prefetchOp.getTensorDesc()};
-    SmallVector<Type, 1> newYieldTypes = {prefetchOp.getTensorDescType()};
+    SmallVector<Value> newYieldValues = {prefetchOp.getTensorDesc()};
+    SmallVector<Type> newYieldTypes = {prefetchOp.getTensorDescType()};
+    newYieldValues.append(offsetsAsValues.begin(), offsetsAsValues.end());
+    newYieldTypes.append(offsetTypes.begin(), offsetTypes.end());
     SmallVector<size_t> newRetIndices;
     gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
         rewriter, warpOp, newYieldValues, newYieldTypes, newRetIndices);
@@ -814,6 +769,9 @@ struct PrefetchNdDistribution final : public gpu::WarpDistributionPattern {
     rewriter.setInsertionPointAfter(newWarpOp);
     SmallVector<Value> newPrefetchOperands = {resolveDistributedTy(
         newWarpOp.getResult(newRetIndices[0]), newTensorDescTy, rewriter)};
+    // Collect offsets.
+    for (size_t i = 1; i < newRetIndices.size(); ++i)
+      newPrefetchOperands.push_back(newWarpOp.getResult(newRetIndices[i]));
     xegpu::PrefetchNdOp::create(rewriter, newWarpOp.getLoc(), TypeRange{},
                                 newPrefetchOperands, prefetchOp->getAttrs());
     xegpu::removeLayoutAttrs(prefetchOp);
@@ -1456,15 +1414,14 @@ struct XeGPUSubgroupDistributePass final

 void xegpu::populateXeGPUSubgroupDistributePatterns(
     RewritePatternSet &patterns) {
-  patterns
-      .add<CreateNdDescDistribution, StoreNdDistribution, LoadNdDistribution,
-           DpasDistribution, PrefetchNdDistribution, UpdateNdOffsetDistribution,
-           GpuBarrierDistribution, VectorMultiReductionDistribution,
-           LoadDistribution, StoreDistribution, VectorTransposeDistribution,
-           VectorBitcastDistribution,
-           MemrefExtractAlignedPointerAsIndexDistribution>(
-          patterns.getContext(),
-          /* pattern benefit=*/regularPatternBenefit);
+  patterns.add<CreateNdDescDistribution, StoreNdDistribution,
+               LoadNdDistribution, DpasDistribution, PrefetchNdDistribution,
+               GpuBarrierDistribution, VectorMultiReductionDistribution,
+               LoadDistribution, StoreDistribution, VectorTransposeDistribution,
+               VectorBitcastDistribution,
+               MemrefExtractAlignedPointerAsIndexDistribution>(
+      patterns.getContext(),
+      /* pattern benefit=*/regularPatternBenefit);
   patterns.add<VectorShapeCastDistribution>(
       patterns.getContext(),
       /* pattern benefit=*/highPatternBenefit);