Skip to content

Commit 1aa368f

Browse files
committed
Do not make tdm_descriptor parametric
1 parent 0b42ea0 commit 1aa368f

File tree

3 files changed

+73
-105
lines changed

3 files changed

+73
-105
lines changed

mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp

Lines changed: 3 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -2705,7 +2705,7 @@ struct AMDGPUMakeDmaDescriptorLowering
27052705
}
27062706
};
27072707

2708-
template <typename SourceOp, typename TargetD2Op, typename TargetOp>
2708+
template <typename SourceOp, typename TargetOp>
27092709
struct AMDGPUTensorLoadStoreOpLowering
27102710
: public ConvertOpToLLVMPattern<SourceOp> {
27112711
using ConvertOpToLLVMPattern<SourceOp>::ConvertOpToLLVMPattern;
@@ -2727,20 +2727,11 @@ struct AMDGPUTensorLoadStoreOpLowering
27272727
uint32_t cacheScope = static_cast<uint32_t>(op.getCacheScope());
27282728
int32_t cachePolicy = cacheScope | temporalHint << 2 | nonVolatile << 5;
27292729

2730-
if (op.getDesc().getType().getSize() == 2) {
2731-
rewriter.replaceOpWithNewOp<TargetD2Op>(op, desc[0], desc[1], cachePolicy,
2732-
/*alias_scopes=*/nullptr,
2733-
/*noalias_scopes=*/nullptr,
2734-
/*tbaa=*/nullptr);
2735-
return success();
2736-
}
2737-
27382730
rewriter.replaceOpWithNewOp<TargetOp>(op, desc[0], desc[1], desc[2],
27392731
desc[3], cachePolicy,
27402732
/*alias_scopes=*/nullptr,
27412733
/*noalias_scopes=*/nullptr,
27422734
/*tbaa=*/nullptr);
2743-
27442735
return success();
27452736
}
27462737
};
@@ -2771,10 +2762,8 @@ struct ConvertAMDGPUToROCDLPass
27712762
Type v8i32 = converter.convertType(VectorType::get(8, i32));
27722763
result.push_back(v4i32);
27732764
result.push_back(v8i32);
2774-
if (type.getSize() != 2) {
2775-
result.push_back(v4i32);
2776-
result.push_back(v4i32);
2777-
}
2765+
result.push_back(v4i32);
2766+
result.push_back(v4i32);
27782767
return success();
27792768
});
27802769

@@ -2845,10 +2834,8 @@ void mlir::populateAMDGPUToROCDLConversionPatterns(LLVMTypeConverter &converter,
28452834
GatherToLDSOpLowering, TransposeLoadOpLowering, AMDGPUPermlaneLowering,
28462835
AMDGPUMakeDmaBaseLowering, AMDGPUMakeDmaDescriptorLowering,
28472836
AMDGPUTensorLoadStoreOpLowering<TensorLoadToLDSOp,
2848-
ROCDL::TensorLoadToLDSD2Op,
28492837
ROCDL::TensorLoadToLDSOp>,
28502838
AMDGPUTensorLoadStoreOpLowering<TensorStoreFromLDSOp,
2851-
ROCDL::TensorStoreFromLDSD2Op,
28522839
ROCDL::TensorStoreFromLDSOp>>(converter,
28532840
chipset);
28542841
patterns.add<AMDGPUSwizzleBitModeLowering>(converter);

mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir

Lines changed: 68 additions & 87 deletions
Original file line number | Diff line number | Diff line change
@@ -369,7 +369,7 @@ func.func @make_dma_descriptor_atomic_barrier(%base: !amdgpu.tdm_base<i32>, %bar
369369

370370
// CHECK-LABEL: func @make_dma_descriptor_workgroup_mask
371371
// CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base<i32>, %[[WG_MASK:.+]]: i16, %[[TIMEOUT:.+]]: i1)
372-
func.func @make_dma_descriptor_workgroup_mask(%base: !amdgpu.tdm_base<i32>, %wg_mask: i16, %timeout: i1) -> !amdgpu.tdm_descriptor<2> {
372+
func.func @make_dma_descriptor_workgroup_mask(%base: !amdgpu.tdm_base<i32>, %wg_mask: i16, %timeout: i1) -> !amdgpu.tdm_descriptor {
373373
// CHECK-DAG: %[[DGROUP0:.+]] = builtin.unrealized_conversion_cast %[[BASE]]
374374

375375
// CHECK-DAG: %[[C0:.+]] = llvm.mlir.constant(0 : i32)
@@ -440,126 +440,107 @@ func.func @make_dma_descriptor_workgroup_mask(%base: !amdgpu.tdm_base<i32>, %wg_
440440
// CHECK: %[[DGROUP1:.+]] = llvm.insertelement %[[SGPR7]], %[[DGROUP1_6]][%[[C7]] : i32]
441441

442442
// CHECK: %[[DGROUPS:.+]] = builtin.unrealized_conversion_cast %[[DGROUP0]], %[[DGROUP1]] : vector<4xi32>, vector<8xi32> to !amdgpu.tdm_descriptor
443-
%descriptor = amdgpu.make_dma_descriptor %base globalSize [128, 64] globalStride [64, 1] sharedSize [128, 64] workgroupMask %wg_mask earlyTimeout %timeout : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor<2>
444-
func.return %descriptor : !amdgpu.tdm_descriptor<2>
443+
%descriptor = amdgpu.make_dma_descriptor %base globalSize [128, 64] globalStride [64, 1] sharedSize [128, 64] workgroupMask %wg_mask earlyTimeout %timeout : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
444+
func.return %descriptor : !amdgpu.tdm_descriptor
445445
}
446446

447-
// CHECK-LABEL: func @tensor_load_to_lds_d2
448-
// CHECK-SAME: (%[[DESC:.+]]: !amdgpu.tdm_descriptor<2>)
449-
func.func @tensor_load_to_lds_d2(%desc: !amdgpu.tdm_descriptor<2>) {
450-
// CHECK: %[[DGROUPS:.+]]:2 = builtin.unrealized_conversion_cast %[[DESC]]
451-
// CHECK: rocdl.tensor.load.to.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 0 : vector<4xi32>, vector<8xi32>
452-
amdgpu.tensor_load_to_lds %desc : !amdgpu.tdm_descriptor<2>
447+
// CHECK-LABEL: func @tensor_load_to_lds
448+
// CHECK-SAME: (%[[DESC:.+]]: !amdgpu.tdm_descriptor)
449+
func.func @tensor_load_to_lds(%desc: !amdgpu.tdm_descriptor) {
450+
// CHECK: %[[DGROUPS:.+]]:4 = builtin.unrealized_conversion_cast %[[DESC]]
451+
// CHECK: rocdl.tensor.load.to.lds %[[DGROUPS]]#0, %[[DGROUPS]]#1, %[[DGROUPS]]#2, %[[DGROUPS]]#3 cachepolicy 0 : vector<4xi32>, vector<8xi32>
452+
amdgpu.tensor_load_to_lds %desc : !amdgpu.tdm_descriptor
453453

454-
// CHECK: rocdl.tensor.load.to.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 0 : vector<4xi32>, vector<8xi32>
455-
amdgpu.tensor_load_to_lds %desc { cache_scope = #amdgpu.cache_scope<workgroup> } : !amdgpu.tdm_descriptor<2>
454+
// CHECK: rocdl.tensor.load.to.lds %[[DGROUPS]]#0, %[[DGROUPS]]#1, %[[DGROUPS]]#2, %[[DGROUPS]]#3 cachepolicy 0 : vector<4xi32>, vector<8xi32>
455+
amdgpu.tensor_load_to_lds %desc { cache_scope = #amdgpu.cache_scope<workgroup> } : !amdgpu.tdm_descriptor
456456

457-
// CHECK: rocdl.tensor.load.to.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 1 : vector<4xi32>, vector<8xi32>
458-
amdgpu.tensor_load_to_lds %desc { cache_scope = #amdgpu.cache_scope<shader_engine> } : !amdgpu.tdm_descriptor<2>
457+
// CHECK: rocdl.tensor.load.to.lds %[[DGROUPS]]#0, %[[DGROUPS]]#1, %[[DGROUPS]]#2, %[[DGROUPS]]#3 cachepolicy 1 : vector<4xi32>, vector<8xi32>
458+
amdgpu.tensor_load_to_lds %desc { cache_scope = #amdgpu.cache_scope<shader_engine> } : !amdgpu.tdm_descriptor
459459

460-
// CHECK: rocdl.tensor.load.to.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 2 : vector<4xi32>, vector<8xi32>
461-
amdgpu.tensor_load_to_lds %desc { cache_scope = #amdgpu.cache_scope<device> } : !amdgpu.tdm_descriptor<2>
460+
// CHECK: rocdl.tensor.load.to.lds %[[DGROUPS]]#0, %[[DGROUPS]]#1, %[[DGROUPS]]#2, %[[DGROUPS]]#3 cachepolicy 2 : vector<4xi32>, vector<8xi32>
461+
amdgpu.tensor_load_to_lds %desc { cache_scope = #amdgpu.cache_scope<device> } : !amdgpu.tdm_descriptor
462462

463-
// CHECK: rocdl.tensor.load.to.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 3 : vector<4xi32>, vector<8xi32>
464-
amdgpu.tensor_load_to_lds %desc { cache_scope = #amdgpu.cache_scope<system> } : !amdgpu.tdm_descriptor<2>
463+
// CHECK: rocdl.tensor.load.to.lds %[[DGROUPS]]#0, %[[DGROUPS]]#1, %[[DGROUPS]]#2, %[[DGROUPS]]#3 cachepolicy 3 : vector<4xi32>, vector<8xi32>
464+
amdgpu.tensor_load_to_lds %desc { cache_scope = #amdgpu.cache_scope<system> } : !amdgpu.tdm_descriptor
465465

466-
// CHECK: rocdl.tensor.load.to.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 0 : vector<4xi32>, vector<8xi32>
467-
amdgpu.tensor_load_to_lds %desc { temporal_hint = #amdgpu.temporal_load_hint<regular> } : !amdgpu.tdm_descriptor<2>
466+
// CHECK: rocdl.tensor.load.to.lds %[[DGROUPS]]#0, %[[DGROUPS]]#1, %[[DGROUPS]]#2, %[[DGROUPS]]#3 cachepolicy 0 : vector<4xi32>, vector<8xi32>
467+
amdgpu.tensor_load_to_lds %desc { temporal_hint = #amdgpu.temporal_load_hint<regular> } : !amdgpu.tdm_descriptor
468468

469-
// CHECK: rocdl.tensor.load.to.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 4 : vector<4xi32>, vector<8xi32>
470-
amdgpu.tensor_load_to_lds %desc { temporal_hint = #amdgpu.temporal_load_hint<nontemporal> } : !amdgpu.tdm_descriptor<2>
469+
// CHECK: rocdl.tensor.load.to.lds %[[DGROUPS]]#0, %[[DGROUPS]]#1, %[[DGROUPS]]#2, %[[DGROUPS]]#3 cachepolicy 4 : vector<4xi32>, vector<8xi32>
470+
amdgpu.tensor_load_to_lds %desc { temporal_hint = #amdgpu.temporal_load_hint<nontemporal> } : !amdgpu.tdm_descriptor
471471

472-
// CHECK: rocdl.tensor.load.to.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 8 : vector<4xi32>, vector<8xi32>
473-
amdgpu.tensor_load_to_lds %desc { temporal_hint = #amdgpu.temporal_load_hint<highpriority> } : !amdgpu.tdm_descriptor<2>
472+
// CHECK: rocdl.tensor.load.to.lds %[[DGROUPS]]#0, %[[DGROUPS]]#1, %[[DGROUPS]]#2, %[[DGROUPS]]#3 cachepolicy 8 : vector<4xi32>, vector<8xi32>
473+
amdgpu.tensor_load_to_lds %desc { temporal_hint = #amdgpu.temporal_load_hint<highpriority> } : !amdgpu.tdm_descriptor
474474

475-
// CHECK: rocdl.tensor.load.to.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 12 : vector<4xi32>, vector<8xi32>
476-
amdgpu.tensor_load_to_lds %desc { temporal_hint = #amdgpu.temporal_load_hint<lastuse> } : !amdgpu.tdm_descriptor<2>
475+
// CHECK: rocdl.tensor.load.to.lds %[[DGROUPS]]#0, %[[DGROUPS]]#1, %[[DGROUPS]]#2, %[[DGROUPS]]#3 cachepolicy 12 : vector<4xi32>, vector<8xi32>
476+
amdgpu.tensor_load_to_lds %desc { temporal_hint = #amdgpu.temporal_load_hint<lastuse> } : !amdgpu.tdm_descriptor
477477

478-
// CHECK: rocdl.tensor.load.to.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 16 : vector<4xi32>, vector<8xi32>
479-
amdgpu.tensor_load_to_lds %desc { temporal_hint = #amdgpu.temporal_load_hint<nontemporal_regular> } : !amdgpu.tdm_descriptor<2>
478+
// CHECK: rocdl.tensor.load.to.lds %[[DGROUPS]]#0, %[[DGROUPS]]#1, %[[DGROUPS]]#2, %[[DGROUPS]]#3 cachepolicy 16 : vector<4xi32>, vector<8xi32>
479+
amdgpu.tensor_load_to_lds %desc { temporal_hint = #amdgpu.temporal_load_hint<nontemporal_regular> } : !amdgpu.tdm_descriptor
480480

481-
// CHECK: rocdl.tensor.load.to.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 20 : vector<4xi32>, vector<8xi32>
482-
amdgpu.tensor_load_to_lds %desc { temporal_hint = #amdgpu.temporal_load_hint<regular_nontemporal> } : !amdgpu.tdm_descriptor<2>
481+
// CHECK: rocdl.tensor.load.to.lds %[[DGROUPS]]#0, %[[DGROUPS]]#1, %[[DGROUPS]]#2, %[[DGROUPS]]#3 cachepolicy 20 : vector<4xi32>, vector<8xi32>
482+
amdgpu.tensor_load_to_lds %desc { temporal_hint = #amdgpu.temporal_load_hint<regular_nontemporal> } : !amdgpu.tdm_descriptor
483483

484-
// CHECK: rocdl.tensor.load.to.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 24 : vector<4xi32>, vector<8xi32>
485-
amdgpu.tensor_load_to_lds %desc { temporal_hint = #amdgpu.temporal_load_hint<nontemporal_highpriority> } : !amdgpu.tdm_descriptor<2>
484+
// CHECK: rocdl.tensor.load.to.lds %[[DGROUPS]]#0, %[[DGROUPS]]#1, %[[DGROUPS]]#2, %[[DGROUPS]]#3 cachepolicy 24 : vector<4xi32>, vector<8xi32>
485+
amdgpu.tensor_load_to_lds %desc { temporal_hint = #amdgpu.temporal_load_hint<nontemporal_highpriority> } : !amdgpu.tdm_descriptor
486486

487-
// CHECK: rocdl.tensor.load.to.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 0 : vector<4xi32>, vector<8xi32>
488-
amdgpu.tensor_load_to_lds %desc { non_volatile = false } : !amdgpu.tdm_descriptor<2>
487+
// CHECK: rocdl.tensor.load.to.lds %[[DGROUPS]]#0, %[[DGROUPS]]#1, %[[DGROUPS]]#2, %[[DGROUPS]]#3 cachepolicy 0 : vector<4xi32>, vector<8xi32>
488+
amdgpu.tensor_load_to_lds %desc { non_volatile = false } : !amdgpu.tdm_descriptor
489489

490-
// CHECK: rocdl.tensor.load.to.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 32 : vector<4xi32>, vector<8xi32>
491-
amdgpu.tensor_load_to_lds %desc { non_volatile = true } : !amdgpu.tdm_descriptor<2>
490+
// CHECK: rocdl.tensor.load.to.lds %[[DGROUPS]]#0, %[[DGROUPS]]#1, %[[DGROUPS]]#2, %[[DGROUPS]]#3 cachepolicy 32 : vector<4xi32>, vector<8xi32>
491+
amdgpu.tensor_load_to_lds %desc { non_volatile = true } : !amdgpu.tdm_descriptor
492492

493493
func.return
494494
}
495495

496-
// CHECK-LABEL: func @tensor_load_to_lds
497-
// CHECK-SAME: (%[[DESC:.+]]: !amdgpu.tdm_descriptor<4>)
498-
func.func @tensor_load_to_lds(%desc: !amdgpu.tdm_descriptor<4>) {
496+
// CHECK-LABEL: func @tensor_store_from_lds
497+
// CHECK-SAME: (%[[DESC:.+]]: !amdgpu.tdm_descriptor)
498+
func.func @tensor_store_from_lds(%desc: !amdgpu.tdm_descriptor) {
499499
// CHECK: %[[DGROUPS:.+]]:4 = builtin.unrealized_conversion_cast %[[DESC]]
500-
// CHECK: rocdl.tensor.load.to.lds %[[DGROUPS]]#0, %[[DGROUPS]]#1, %[[DGROUPS]]#2, %[[DGROUPS]]#3 cachepolicy 0 : vector<4xi32>, vector<8xi32>
501-
amdgpu.tensor_load_to_lds %desc : !amdgpu.tdm_descriptor<4>
502-
func.return
503-
}
504-
505-
// CHECK-LABEL: func @tensor_store_from_lds_d2
506-
// CHECK-SAME: (%[[DESC:.+]]: !amdgpu.tdm_descriptor<2>)
507-
func.func @tensor_store_from_lds_d2(%desc: !amdgpu.tdm_descriptor<2>) {
508-
// CHECK: %[[DGROUPS:.+]]:2 = builtin.unrealized_conversion_cast %[[DESC]]
509-
// CHECK: rocdl.tensor.store.from.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 0 : vector<4xi32>, vector<8xi32>
510-
amdgpu.tensor_store_from_lds %desc : !amdgpu.tdm_descriptor<2>
511-
512-
// CHECK: rocdl.tensor.store.from.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 0 : vector<4xi32>, vector<8xi32>
513-
amdgpu.tensor_store_from_lds %desc { cache_scope = #amdgpu.cache_scope<workgroup> } : !amdgpu.tdm_descriptor<2>
514-
515-
// CHECK: rocdl.tensor.store.from.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 1 : vector<4xi32>, vector<8xi32>
516-
amdgpu.tensor_store_from_lds %desc { cache_scope = #amdgpu.cache_scope<shader_engine> } : !amdgpu.tdm_descriptor<2>
517-
518-
// CHECK: rocdl.tensor.store.from.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 2 : vector<4xi32>, vector<8xi32>
519-
amdgpu.tensor_store_from_lds %desc { cache_scope = #amdgpu.cache_scope<device> } : !amdgpu.tdm_descriptor<2>
500+
// CHECK: rocdl.tensor.store.from.lds %[[DGROUPS]]#0, %[[DGROUPS]]#1, %[[DGROUPS]]#2, %[[DGROUPS]]#3 cachepolicy 0 : vector<4xi32>, vector<8xi32>
501+
amdgpu.tensor_store_from_lds %desc : !amdgpu.tdm_descriptor
520502

521-
// CHECK: rocdl.tensor.store.from.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 3 : vector<4xi32>, vector<8xi32>
522-
amdgpu.tensor_store_from_lds %desc { cache_scope = #amdgpu.cache_scope<system> } : !amdgpu.tdm_descriptor<2>
503+
// CHECK: rocdl.tensor.store.from.lds %[[DGROUPS]]#0, %[[DGROUPS]]#1, %[[DGROUPS]]#2, %[[DGROUPS]]#3 cachepolicy 0 : vector<4xi32>, vector<8xi32>
504+
amdgpu.tensor_store_from_lds %desc { cache_scope = #amdgpu.cache_scope<workgroup> } : !amdgpu.tdm_descriptor
523505

524-
// CHECK: rocdl.tensor.store.from.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 0 : vector<4xi32>, vector<8xi32>
525-
amdgpu.tensor_store_from_lds %desc { temporal_hint = #amdgpu.temporal_store_hint<regular> } : !amdgpu.tdm_descriptor<2>
506+
// CHECK: rocdl.tensor.store.from.lds %[[DGROUPS]]#0, %[[DGROUPS]]#1, %[[DGROUPS]]#2, %[[DGROUPS]]#3 cachepolicy 1 : vector<4xi32>, vector<8xi32>
507+
amdgpu.tensor_store_from_lds %desc { cache_scope = #amdgpu.cache_scope<shader_engine> } : !amdgpu.tdm_descriptor
526508

527-
// CHECK: rocdl.tensor.store.from.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 4 : vector<4xi32>, vector<8xi32>
528-
amdgpu.tensor_store_from_lds %desc { temporal_hint = #amdgpu.temporal_store_hint<nontemporal> } : !amdgpu.tdm_descriptor<2>
509+
// CHECK: rocdl.tensor.store.from.lds %[[DGROUPS]]#0, %[[DGROUPS]]#1, %[[DGROUPS]]#2, %[[DGROUPS]]#3 cachepolicy 2 : vector<4xi32>, vector<8xi32>
510+
amdgpu.tensor_store_from_lds %desc { cache_scope = #amdgpu.cache_scope<device> } : !amdgpu.tdm_descriptor
529511

530-
// CHECK: rocdl.tensor.store.from.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 8 : vector<4xi32>, vector<8xi32>
531-
amdgpu.tensor_store_from_lds %desc { temporal_hint = #amdgpu.temporal_store_hint<highpriority> } : !amdgpu.tdm_descriptor<2>
512+
// CHECK: rocdl.tensor.store.from.lds %[[DGROUPS]]#0, %[[DGROUPS]]#1, %[[DGROUPS]]#2, %[[DGROUPS]]#3 cachepolicy 3 : vector<4xi32>, vector<8xi32>
513+
amdgpu.tensor_store_from_lds %desc { cache_scope = #amdgpu.cache_scope<system> } : !amdgpu.tdm_descriptor
532514

533-
// CHECK: rocdl.tensor.store.from.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 12 : vector<4xi32>, vector<8xi32>
534-
amdgpu.tensor_store_from_lds %desc { temporal_hint = #amdgpu.temporal_store_hint<writeback> } : !amdgpu.tdm_descriptor<2>
515+
// CHECK: rocdl.tensor.store.from.lds %[[DGROUPS]]#0, %[[DGROUPS]]#1, %[[DGROUPS]]#2, %[[DGROUPS]]#3 cachepolicy 0 : vector<4xi32>, vector<8xi32>
516+
amdgpu.tensor_store_from_lds %desc { temporal_hint = #amdgpu.temporal_store_hint<regular> } : !amdgpu.tdm_descriptor
535517

536-
// CHECK: rocdl.tensor.store.from.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 16 : vector<4xi32>, vector<8xi32>
537-
amdgpu.tensor_store_from_lds %desc { temporal_hint = #amdgpu.temporal_store_hint<nontemporal_regular> } : !amdgpu.tdm_descriptor<2>
518+
// CHECK: rocdl.tensor.store.from.lds %[[DGROUPS]]#0, %[[DGROUPS]]#1, %[[DGROUPS]]#2, %[[DGROUPS]]#3 cachepolicy 4 : vector<4xi32>, vector<8xi32>
519+
amdgpu.tensor_store_from_lds %desc { temporal_hint = #amdgpu.temporal_store_hint<nontemporal> } : !amdgpu.tdm_descriptor
538520

539-
// CHECK: rocdl.tensor.store.from.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 20 : vector<4xi32>, vector<8xi32>
540-
amdgpu.tensor_store_from_lds %desc { temporal_hint = #amdgpu.temporal_store_hint<regular_nontemporal> } : !amdgpu.tdm_descriptor<2>
521+
// CHECK: rocdl.tensor.store.from.lds %[[DGROUPS]]#0, %[[DGROUPS]]#1, %[[DGROUPS]]#2, %[[DGROUPS]]#3 cachepolicy 8 : vector<4xi32>, vector<8xi32>
522+
amdgpu.tensor_store_from_lds %desc { temporal_hint = #amdgpu.temporal_store_hint<highpriority> } : !amdgpu.tdm_descriptor
541523

542-
// CHECK: rocdl.tensor.store.from.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 24 : vector<4xi32>, vector<8xi32>
543-
amdgpu.tensor_store_from_lds %desc { temporal_hint = #amdgpu.temporal_store_hint<nontemporal_highpriority> } : !amdgpu.tdm_descriptor<2>
524+
// CHECK: rocdl.tensor.store.from.lds %[[DGROUPS]]#0, %[[DGROUPS]]#1, %[[DGROUPS]]#2, %[[DGROUPS]]#3 cachepolicy 12 : vector<4xi32>, vector<8xi32>
525+
amdgpu.tensor_store_from_lds %desc { temporal_hint = #amdgpu.temporal_store_hint<writeback> } : !amdgpu.tdm_descriptor
544526

545-
// CHECK: rocdl.tensor.store.from.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 28 : vector<4xi32>, vector<8xi32>
546-
amdgpu.tensor_store_from_lds %desc { temporal_hint = #amdgpu.temporal_store_hint<nontemporal_writeback> } : !amdgpu.tdm_descriptor<2>
527+
// CHECK: rocdl.tensor.store.from.lds %[[DGROUPS]]#0, %[[DGROUPS]]#1, %[[DGROUPS]]#2, %[[DGROUPS]]#3 cachepolicy 16 : vector<4xi32>, vector<8xi32>
528+
amdgpu.tensor_store_from_lds %desc { temporal_hint = #amdgpu.temporal_store_hint<nontemporal_regular> } : !amdgpu.tdm_descriptor
547529

548-
// CHECK: rocdl.tensor.store.from.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 0 : vector<4xi32>, vector<8xi32>
549-
amdgpu.tensor_store_from_lds %desc { non_volatile = false } : !amdgpu.tdm_descriptor<2>
530+
// CHECK: rocdl.tensor.store.from.lds %[[DGROUPS]]#0, %[[DGROUPS]]#1, %[[DGROUPS]]#2, %[[DGROUPS]]#3 cachepolicy 20 : vector<4xi32>, vector<8xi32>
531+
amdgpu.tensor_store_from_lds %desc { temporal_hint = #amdgpu.temporal_store_hint<regular_nontemporal> } : !amdgpu.tdm_descriptor
550532

551-
// CHECK: rocdl.tensor.store.from.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 32 : vector<4xi32>, vector<8xi32>
552-
amdgpu.tensor_store_from_lds %desc { non_volatile = true } : !amdgpu.tdm_descriptor<2>
553-
func.return
554-
}
533+
// CHECK: rocdl.tensor.store.from.lds %[[DGROUPS]]#0, %[[DGROUPS]]#1, %[[DGROUPS]]#2, %[[DGROUPS]]#3 cachepolicy 24 : vector<4xi32>, vector<8xi32>
534+
amdgpu.tensor_store_from_lds %desc { temporal_hint = #amdgpu.temporal_store_hint<nontemporal_highpriority> } : !amdgpu.tdm_descriptor
555535

536+
// CHECK: rocdl.tensor.store.from.lds %[[DGROUPS]]#0, %[[DGROUPS]]#1, %[[DGROUPS]]#2, %[[DGROUPS]]#3 cachepolicy 28 : vector<4xi32>, vector<8xi32>
537+
amdgpu.tensor_store_from_lds %desc { temporal_hint = #amdgpu.temporal_store_hint<nontemporal_writeback> } : !amdgpu.tdm_descriptor
556538

557-
// CHECK-LABEL: func @tensor_store_from_lds
558-
// CHECK-SAME: (%[[DESC:.+]]: !amdgpu.tdm_descriptor<4>)
559-
func.func @tensor_store_from_lds(%desc: !amdgpu.tdm_descriptor<4>) {
560-
// CHECK: %[[DGROUPS:.+]]:4 = builtin.unrealized_conversion_cast %[[DESC]]
561539
// CHECK: rocdl.tensor.store.from.lds %[[DGROUPS]]#0, %[[DGROUPS]]#1, %[[DGROUPS]]#2, %[[DGROUPS]]#3 cachepolicy 0 : vector<4xi32>, vector<8xi32>
562-
amdgpu.tensor_store_from_lds %desc : !amdgpu.tdm_descriptor<4>
540+
amdgpu.tensor_store_from_lds %desc { non_volatile = false } : !amdgpu.tdm_descriptor
541+
542+
// CHECK: rocdl.tensor.store.from.lds %[[DGROUPS]]#0, %[[DGROUPS]]#1, %[[DGROUPS]]#2, %[[DGROUPS]]#3 cachepolicy 32 : vector<4xi32>, vector<8xi32>
543+
amdgpu.tensor_store_from_lds %desc { non_volatile = true } : !amdgpu.tdm_descriptor
563544
func.return
564545
}
565546

0 commit comments

Comments
 (0)