Commit e49b8f2

[mlir][amdgpu] Lower tensor load store ops.

* Adds cache scope and temporal hint attributes.
* Makes tdm_descriptor parametric.
* Lowers tensor_load_to_lds and tensor_store_from_lds.

1 parent b8cec0a

File tree: 4 files changed (+326, -7 lines)


mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td

Lines changed: 128 additions & 0 deletions
@@ -80,6 +80,97 @@ def AMDGPU_AddressSpaceAttr : EnumAttr<AMDGPU_Dialect, AMDGPU_AddressSpace,
   let assemblyFormat = "`<` $value `>`";
 }
 
+def AMDGPU_TemporalLoadHints : I32EnumAttr<"TemporalLoadHints",
+    "AMDGPU-specific temporal load hints",
+    [
+      I32EnumAttrCase<"RegularTemporal", 0, "regular">,
+      I32EnumAttrCase<"NonTemporal", 1, "nontemporal">,
+      I32EnumAttrCase<"HighPriorityTemporal", 2, "highpriority">,
+      I32EnumAttrCase<"LastUse", 3, "lastuse">,
+      I32EnumAttrCase<"NT_RT", 4, "nontemporal_regular">,
+      I32EnumAttrCase<"RT_NT", 5, "regular_nontemporal">,
+      I32EnumAttrCase<"NT_HT", 6, "nontemporal_highpriority">,
+    ]> {
+  let genSpecializedAttr = 0;
+  let cppNamespace = "::mlir::amdgpu";
+}
+
+def AMDGPU_TemporalLoadHintsAttr : EnumAttr<AMDGPU_Dialect, AMDGPU_TemporalLoadHints,
+    "temporal_load_hint"> {
+  let description = [{
+    AMDGPU-specific temporal load hints.
+
+    - `regular` (default): regular temporal locality.
+    - `nontemporal`: reuse of the data is not expected.
+    - `highpriority`: takes precedence over `regular`.
+    - `lastuse`: the access is the last use of the data.
+    - `nontemporal_regular`: non-temporal for near cache(s) and regular for far caches.
+    - `regular_nontemporal`: regular for near cache(s) and non-temporal for far caches.
+    - `nontemporal_highpriority`: non-temporal for near cache(s) and high priority for far caches.
+  }];
+  let assemblyFormat = "`<` $value `>`";
+}
+
+def AMDGPU_TemporalStoreHints : I32EnumAttr<"TemporalStoreHints",
+    "AMDGPU-specific temporal store hints",
+    [
+      I32EnumAttrCase<"RegularTemporal", 0, "regular">,
+      I32EnumAttrCase<"NonTemporal", 1, "nontemporal">,
+      I32EnumAttrCase<"HighPriorityTemporal", 2, "highpriority">,
+      I32EnumAttrCase<"WriteBack", 3, "writeback">,
+      I32EnumAttrCase<"NT_RT", 4, "nontemporal_regular">,
+      I32EnumAttrCase<"RT_NT", 5, "regular_nontemporal">,
+      I32EnumAttrCase<"NT_HT", 6, "nontemporal_highpriority">,
+      I32EnumAttrCase<"NT_WB", 7, "nontemporal_writeback">,
+    ]> {
+  let genSpecializedAttr = 0;
+  let cppNamespace = "::mlir::amdgpu";
+}
+
+def AMDGPU_TemporalStoreHintsAttr : EnumAttr<AMDGPU_Dialect, AMDGPU_TemporalStoreHints,
+    "temporal_store_hint"> {
+  let description = [{
+    AMDGPU-specific temporal store hints.
+
+    - `regular` (default): regular temporal locality.
+    - `nontemporal`: reuse of the data is not expected.
+    - `highpriority`: takes precedence over `regular`.
+    - `writeback`: same as `highpriority`, but additionally overrides write-rinse in the far cache, forcing the line to stay dirty in cache.
+    - `nontemporal_regular`: non-temporal for near cache(s) and regular for far caches.
+    - `regular_nontemporal`: regular for near cache(s) and non-temporal for far caches.
+    - `nontemporal_highpriority`: non-temporal for near cache(s) and high priority for far caches.
+    - `nontemporal_writeback`: non-temporal for near cache(s) and write-back for far caches.
+  }];
+
+  let assemblyFormat = "`<` $value `>`";
+}
+
+def AMDGPU_CacheScope : I32EnumAttr<"CacheScope",
+    "Cache scope control enums.",
+    [
+      I32EnumAttrCase<"Workgroup", 0, "workgroup">,
+      I32EnumAttrCase<"ShaderEngine", 1, "shader_engine">,
+      I32EnumAttrCase<"Device", 2, "device">,
+      I32EnumAttrCase<"System", 3, "system">,
+    ]> {
+  let genSpecializedAttr = 0;
+  let cppNamespace = "::mlir::amdgpu";
+}
+
+def AMDGPU_CacheScopeAttr : EnumAttr<AMDGPU_Dialect, AMDGPU_CacheScope,
+    "cache_scope"> {
+  let description = [{
+    AMDGPU cache scope control enums.
+
+    - `workgroup`: coherent among all VMEM threads in a workgroup.
+    - `shader_engine`: coherent among all clients (threads) sharing an SE-cache.
+    - `device`: coherent among all threads on the same device.
+    - `system`: coherent across the entire system.
+  }];
+
+  let assemblyFormat = "`<` $value `>`";
+}
+
 //===----------------------------------------------------------------------===//
 // AMDGPU Type definitions
 //===----------------------------------------------------------------------===//
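For reference: the lowering later in this commit packs these three attributes into a single cache-policy immediate, with the cache scope in bits 0-1, the temporal hint in bits 2-4, and the non-volatile flag in bit 5. A minimal C++ sketch of that encoding (the helper name `encodeCachePolicy` is hypothetical, not part of the patch):

#include <cstdint>

// Cache scope in bits 0-1, temporal hint in bits 2-4, non-volatile in bit 5,
// mirroring the packing done in AMDGPUToROCDL.cpp below.
constexpr uint32_t encodeCachePolicy(uint32_t cacheScope, uint32_t temporalHint,
                                     bool nonVolatile) {
  return cacheScope | (temporalHint << 2) |
         (static_cast<uint32_t>(nonVolatile) << 5);
}

// Example: shader_engine scope (1) with a nontemporal hint (1) -> 0b000101 = 5.
static_assert(encodeCachePolicy(1, 1, false) == 5, "scope+hint packing");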
@@ -1394,4 +1485,41 @@ def AMDGPU_MakeDmaDescriptorOp :
   let hasFolder = 1;
 }
 
+def AMDGPU_TensorLoadToLDSOp :
+    AMDGPU_Op<"tensor_load_to_lds", [MemoryEffects<[MemWrite]>, MemoryEffects<[MemRead]>]>,
+    Arguments<(ins AMDGPU_TDMDescriptorType: $desc,
+        DefaultValuedOptionalAttr<AMDGPU_CacheScopeAttr, "CacheScope::Workgroup">: $cache_scope,
+        DefaultValuedOptionalAttr<AMDGPU_TemporalLoadHintsAttr, "TemporalLoadHints::RegularTemporal">: $temporal_hint,
+        DefaultValuedOptionalAttr<BoolAttr, "false">: $non_volatile)> {
+  let summary = "Load tensors from global memory to LDS.";
+  let description = [{
+    Load tensors of up to five dimensions from global memory to LDS.
+
+    This operation was introduced in gfx1250.
+  }];
+
+  let assemblyFormat = [{
+    $desc attr-dict `:` qualified(type($desc))
+  }];
+}
+
+def AMDGPU_TensorStoreFromLDSOp :
+    AMDGPU_Op<"tensor_store_from_lds", [MemoryEffects<[MemWrite]>, MemoryEffects<[MemRead]>]>,
+    Arguments<(ins AMDGPU_TDMDescriptorType: $desc,
+        DefaultValuedOptionalAttr<AMDGPU_CacheScopeAttr, "CacheScope::Workgroup">: $cache_scope,
+        DefaultValuedOptionalAttr<AMDGPU_TemporalStoreHintsAttr, "TemporalStoreHints::RegularTemporal">: $temporal_hint,
+        DefaultValuedOptionalAttr<BoolAttr, "false">: $non_volatile)> {
+
+  let summary = "Store tensors from LDS to global memory.";
+  let description = [{
+    Store tensors of up to five dimensions from LDS to global memory.
+
+    This operation was introduced in gfx1250.
+  }];
+
+  let assemblyFormat = [{
+    $desc attr-dict `:` qualified(type($desc))
+  }];
+}
+
 #endif // AMDGPU

mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp

Lines changed: 73 additions & 2 deletions
@@ -2705,6 +2705,47 @@ struct AMDGPUMakeDmaDescriptorLowering
   }
 };
 
+template <typename SourceOp, typename TargetD2Op, typename TargetOp>
+struct AMDGPUTensorLoadStoreOpLowering
+    : public ConvertOpToLLVMPattern<SourceOp> {
+  using ConvertOpToLLVMPattern<SourceOp>::ConvertOpToLLVMPattern;
+  using Adaptor = typename ConvertOpToLLVMPattern<SourceOp>::OneToNOpAdaptor;
+  AMDGPUTensorLoadStoreOpLowering(const LLVMTypeConverter &converter,
+                                  Chipset chipset)
+      : ConvertOpToLLVMPattern<SourceOp>(converter), chipset(chipset) {}
+  Chipset chipset;
+
+  LogicalResult
+  matchAndRewrite(SourceOp op, Adaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    if (chipset < kGfx1250)
+      return op->emitOpError("is only supported on gfx1250");
+
+    ValueRange desc = adaptor.getDesc();
+    uint32_t temporalHint = static_cast<uint32_t>(op.getTemporalHint());
+    bool nonVolatile = static_cast<bool>(op.getNonVolatile());
+    uint32_t cacheScope = static_cast<uint32_t>(op.getCacheScope());
+    int32_t cachePolicy = cacheScope | temporalHint << 2 | nonVolatile << 5;
+
+    if (op.getDesc().getType().getSize() == 2) {
+      rewriter.replaceOpWithNewOp<TargetD2Op>(op, desc[0], desc[1],
+                                              cachePolicy,
+                                              /*alias_scopes=*/nullptr,
+                                              /*noalias_scopes=*/nullptr,
+                                              /*tbaa=*/nullptr);
+      return success();
+    }
+
+    rewriter.replaceOpWithNewOp<TargetOp>(op, desc[0], desc[1], desc[2],
+                                          desc[3], cachePolicy,
+                                          /*alias_scopes=*/nullptr,
+                                          /*noalias_scopes=*/nullptr,
+                                          /*tbaa=*/nullptr);
+
+    return success();
+  }
+};
+
 struct ConvertAMDGPUToROCDLPass
     : public impl::ConvertAMDGPUToROCDLPassBase<ConvertAMDGPUToROCDLPass> {
   using Base::Base;
@@ -2723,6 +2764,30 @@ struct ConvertAMDGPUToROCDLPass
       Type i32 = IntegerType::get(type.getContext(), 32);
       return converter.convertType(VectorType::get(4, i32));
     });
+    converter.addConversion(
+        [&](TDMDescriptorType type,
+            SmallVectorImpl<Type> &result) -> std::optional<LogicalResult> {
+          Type i32 = IntegerType::get(type.getContext(), 32);
+          Type v4i32 = converter.convertType(VectorType::get(4, i32));
+          Type v8i32 = converter.convertType(VectorType::get(8, i32));
+          result.push_back(v4i32);
+          result.push_back(v8i32);
+          if (type.getSize() != 2) {
+            result.push_back(v4i32);
+            result.push_back(v4i32);
+          }
+          return success();
+        });
+
+    auto addUnrealizedCast = [](OpBuilder &builder, TypeRange types,
+                                ValueRange inputs,
+                                Location loc) -> SmallVector<Value> {
+      auto cast =
+          UnrealizedConversionCastOp::create(builder, loc, types, inputs);
+      return cast.getResults();
+    };
+
+    converter.addTargetMaterialization(addUnrealizedCast);
 
     populateAMDGPUToROCDLConversionPatterns(converter, patterns, *maybeChipset);
    LLVMConversionTarget target(getContext());
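To make the 1:N conversion above concrete: a `!amdgpu.tdm_descriptor<2>` decomposes into two values (`vector<4xi32>`, `vector<8xi32>`), which is the operand shape of the `.d2` ROCDL ops, while any other descriptor size gains two additional `vector<4xi32>` groups for the four-operand variants. A small standalone sketch (the function is hypothetical, for illustration only):

#include <string>
#include <vector>

// Illustration of the decomposition performed by the addConversion above
// for a !amdgpu.tdm_descriptor of the given size.
std::vector<std::string> descriptorGroupTypes(unsigned size) {
  std::vector<std::string> groups = {"vector<4xi32>", "vector<8xi32>"};
  if (size != 2) {
    // The full (non-d2) variants carry two extra vector<4xi32> groups.
    groups.push_back("vector<4xi32>");
    groups.push_back("vector<4xi32>");
  }
  return groups;
}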
@@ -2779,7 +2844,13 @@ void mlir::populateAMDGPUToROCDLConversionPatterns(LLVMTypeConverter &converter,
            ScaledExtPackedOpLowering, PackedScaledTruncOpLowering,
            PackedTrunc2xFp8OpLowering, PackedStochRoundFp8OpLowering,
            GatherToLDSOpLowering, TransposeLoadOpLowering, AMDGPUPermlaneLowering,
-           AMDGPUMakeDmaBaseLowering, AMDGPUMakeDmaDescriptorLowering>(converter,
-                                                                       chipset);
+           AMDGPUMakeDmaBaseLowering, AMDGPUMakeDmaDescriptorLowering,
+           AMDGPUTensorLoadStoreOpLowering<TensorLoadToLDSOp,
+                                           ROCDL::TensorLoadToLDSD2Op,
+                                           ROCDL::TensorLoadToLDSOp>,
+           AMDGPUTensorLoadStoreOpLowering<TensorStoreFromLDSOp,
+                                           ROCDL::TensorStoreFromLDSD2Op,
+                                           ROCDL::TensorStoreFromLDSOp>>(converter,
+                                                                         chipset);
   patterns.add<AMDGPUSwizzleBitModeLowering>(converter);
 }
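The `cachepolicy` immediates expected by the tests below follow directly from the bit packing in `matchAndRewrite` above; a few spot checks, reusing the hypothetical `encodeCachePolicy` sketch from earlier:

static_assert(encodeCachePolicy(3, 0, false) == 3, "cache_scope<system>");
static_assert(encodeCachePolicy(0, 6, false) == 24, "nontemporal_highpriority");
static_assert(encodeCachePolicy(0, 7, false) == 28, "nontemporal_writeback");
static_assert(encodeCachePolicy(0, 0, true) == 32, "non_volatile");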

mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir

Lines changed: 123 additions & 3 deletions
@@ -369,7 +369,7 @@ func.func @make_dma_descriptor_atomic_barrier(%base: !amdgpu.tdm_base<i32>, %bar
 
 // CHECK-LABEL: func @make_dma_descriptor_workgroup_mask
 // CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base<i32>, %[[WG_MASK:.+]]: i16, %[[TIMEOUT:.+]]: i1)
-func.func @make_dma_descriptor_workgroup_mask(%base: !amdgpu.tdm_base<i32>, %wg_mask: i16, %timeout: i1) -> !amdgpu.tdm_descriptor {
+func.func @make_dma_descriptor_workgroup_mask(%base: !amdgpu.tdm_base<i32>, %wg_mask: i16, %timeout: i1) -> !amdgpu.tdm_descriptor<2> {
   // CHECK-DAG: %[[DGROUP0:.+]] = builtin.unrealized_conversion_cast %[[BASE]]
 
   // CHECK-DAG: %[[C0:.+]] = llvm.mlir.constant(0 : i32)
@@ -440,6 +440,126 @@ func.func @make_dma_descriptor_workgroup_mask(%base: !amdgpu.tdm_base<i32>, %wg_
   // CHECK: %[[DGROUP1:.+]] = llvm.insertelement %[[SGPR7]], %[[DGROUP1_6]][%[[C7]] : i32]
 
   // CHECK: %[[DGROUPS:.+]] = builtin.unrealized_conversion_cast %[[DGROUP0]], %[[DGROUP1]] : vector<4xi32>, vector<8xi32> to !amdgpu.tdm_descriptor
-  %descriptor = amdgpu.make_dma_descriptor %base globalSize [128, 64] globalStride [64, 1] sharedSize [128, 64] workgroupMask %wg_mask earlyTimeout %timeout : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
-  func.return %descriptor : !amdgpu.tdm_descriptor
+  %descriptor = amdgpu.make_dma_descriptor %base globalSize [128, 64] globalStride [64, 1] sharedSize [128, 64] workgroupMask %wg_mask earlyTimeout %timeout : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor<2>
+  func.return %descriptor : !amdgpu.tdm_descriptor<2>
+}
+
+// CHECK-LABEL: func @tensor_load_to_lds_d2
+// CHECK-SAME: (%[[DESC:.+]]: !amdgpu.tdm_descriptor<2>)
+func.func @tensor_load_to_lds_d2(%desc: !amdgpu.tdm_descriptor<2>) {
+  // CHECK: %[[DGROUPS:.+]]:2 = builtin.unrealized_conversion_cast %[[DESC]]
+  // CHECK: rocdl.tensor.load.to.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 0 : vector<4xi32>, vector<8xi32>
+  amdgpu.tensor_load_to_lds %desc : !amdgpu.tdm_descriptor<2>
+
+  // CHECK: rocdl.tensor.load.to.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 0 : vector<4xi32>, vector<8xi32>
+  amdgpu.tensor_load_to_lds %desc { cache_scope = #amdgpu.cache_scope<workgroup> } : !amdgpu.tdm_descriptor<2>
+
+  // CHECK: rocdl.tensor.load.to.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 1 : vector<4xi32>, vector<8xi32>
+  amdgpu.tensor_load_to_lds %desc { cache_scope = #amdgpu.cache_scope<shader_engine> } : !amdgpu.tdm_descriptor<2>
+
+  // CHECK: rocdl.tensor.load.to.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 2 : vector<4xi32>, vector<8xi32>
+  amdgpu.tensor_load_to_lds %desc { cache_scope = #amdgpu.cache_scope<device> } : !amdgpu.tdm_descriptor<2>
+
+  // CHECK: rocdl.tensor.load.to.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 3 : vector<4xi32>, vector<8xi32>
+  amdgpu.tensor_load_to_lds %desc { cache_scope = #amdgpu.cache_scope<system> } : !amdgpu.tdm_descriptor<2>
+
+  // CHECK: rocdl.tensor.load.to.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 0 : vector<4xi32>, vector<8xi32>
+  amdgpu.tensor_load_to_lds %desc { temporal_hint = #amdgpu.temporal_load_hint<regular> } : !amdgpu.tdm_descriptor<2>
+
+  // CHECK: rocdl.tensor.load.to.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 4 : vector<4xi32>, vector<8xi32>
+  amdgpu.tensor_load_to_lds %desc { temporal_hint = #amdgpu.temporal_load_hint<nontemporal> } : !amdgpu.tdm_descriptor<2>
+
+  // CHECK: rocdl.tensor.load.to.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 8 : vector<4xi32>, vector<8xi32>
+  amdgpu.tensor_load_to_lds %desc { temporal_hint = #amdgpu.temporal_load_hint<highpriority> } : !amdgpu.tdm_descriptor<2>
+
+  // CHECK: rocdl.tensor.load.to.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 12 : vector<4xi32>, vector<8xi32>
+  amdgpu.tensor_load_to_lds %desc { temporal_hint = #amdgpu.temporal_load_hint<lastuse> } : !amdgpu.tdm_descriptor<2>
+
+  // CHECK: rocdl.tensor.load.to.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 16 : vector<4xi32>, vector<8xi32>
+  amdgpu.tensor_load_to_lds %desc { temporal_hint = #amdgpu.temporal_load_hint<nontemporal_regular> } : !amdgpu.tdm_descriptor<2>
+
+  // CHECK: rocdl.tensor.load.to.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 20 : vector<4xi32>, vector<8xi32>
+  amdgpu.tensor_load_to_lds %desc { temporal_hint = #amdgpu.temporal_load_hint<regular_nontemporal> } : !amdgpu.tdm_descriptor<2>
+
+  // CHECK: rocdl.tensor.load.to.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 24 : vector<4xi32>, vector<8xi32>
+  amdgpu.tensor_load_to_lds %desc { temporal_hint = #amdgpu.temporal_load_hint<nontemporal_highpriority> } : !amdgpu.tdm_descriptor<2>
+
+  // CHECK: rocdl.tensor.load.to.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 0 : vector<4xi32>, vector<8xi32>
+  amdgpu.tensor_load_to_lds %desc { non_volatile = false } : !amdgpu.tdm_descriptor<2>
+
+  // CHECK: rocdl.tensor.load.to.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 32 : vector<4xi32>, vector<8xi32>
+  amdgpu.tensor_load_to_lds %desc { non_volatile = true } : !amdgpu.tdm_descriptor<2>
+
+  func.return
 }
+
+// CHECK-LABEL: func @tensor_load_to_lds
+// CHECK-SAME: (%[[DESC:.+]]: !amdgpu.tdm_descriptor<4>)
+func.func @tensor_load_to_lds(%desc: !amdgpu.tdm_descriptor<4>) {
+  // CHECK: %[[DGROUPS:.+]]:4 = builtin.unrealized_conversion_cast %[[DESC]]
+  // CHECK: rocdl.tensor.load.to.lds %[[DGROUPS]]#0, %[[DGROUPS]]#1, %[[DGROUPS]]#2, %[[DGROUPS]]#3 cachepolicy 0 : vector<4xi32>, vector<8xi32>
+  amdgpu.tensor_load_to_lds %desc : !amdgpu.tdm_descriptor<4>
+  func.return
+}
+
+// CHECK-LABEL: func @tensor_store_from_lds_d2
+// CHECK-SAME: (%[[DESC:.+]]: !amdgpu.tdm_descriptor<2>)
+func.func @tensor_store_from_lds_d2(%desc: !amdgpu.tdm_descriptor<2>) {
+  // CHECK: %[[DGROUPS:.+]]:2 = builtin.unrealized_conversion_cast %[[DESC]]
+  // CHECK: rocdl.tensor.store.from.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 0 : vector<4xi32>, vector<8xi32>
+  amdgpu.tensor_store_from_lds %desc : !amdgpu.tdm_descriptor<2>
+
+  // CHECK: rocdl.tensor.store.from.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 0 : vector<4xi32>, vector<8xi32>
+  amdgpu.tensor_store_from_lds %desc { cache_scope = #amdgpu.cache_scope<workgroup> } : !amdgpu.tdm_descriptor<2>
+
+  // CHECK: rocdl.tensor.store.from.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 1 : vector<4xi32>, vector<8xi32>
+  amdgpu.tensor_store_from_lds %desc { cache_scope = #amdgpu.cache_scope<shader_engine> } : !amdgpu.tdm_descriptor<2>
+
+  // CHECK: rocdl.tensor.store.from.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 2 : vector<4xi32>, vector<8xi32>
+  amdgpu.tensor_store_from_lds %desc { cache_scope = #amdgpu.cache_scope<device> } : !amdgpu.tdm_descriptor<2>
+
+  // CHECK: rocdl.tensor.store.from.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 3 : vector<4xi32>, vector<8xi32>
+  amdgpu.tensor_store_from_lds %desc { cache_scope = #amdgpu.cache_scope<system> } : !amdgpu.tdm_descriptor<2>
+
+  // CHECK: rocdl.tensor.store.from.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 0 : vector<4xi32>, vector<8xi32>
+  amdgpu.tensor_store_from_lds %desc { temporal_hint = #amdgpu.temporal_store_hint<regular> } : !amdgpu.tdm_descriptor<2>
+
+  // CHECK: rocdl.tensor.store.from.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 4 : vector<4xi32>, vector<8xi32>
+  amdgpu.tensor_store_from_lds %desc { temporal_hint = #amdgpu.temporal_store_hint<nontemporal> } : !amdgpu.tdm_descriptor<2>
+
+  // CHECK: rocdl.tensor.store.from.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 8 : vector<4xi32>, vector<8xi32>
+  amdgpu.tensor_store_from_lds %desc { temporal_hint = #amdgpu.temporal_store_hint<highpriority> } : !amdgpu.tdm_descriptor<2>
+
+  // CHECK: rocdl.tensor.store.from.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 12 : vector<4xi32>, vector<8xi32>
+  amdgpu.tensor_store_from_lds %desc { temporal_hint = #amdgpu.temporal_store_hint<writeback> } : !amdgpu.tdm_descriptor<2>
+
+  // CHECK: rocdl.tensor.store.from.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 16 : vector<4xi32>, vector<8xi32>
+  amdgpu.tensor_store_from_lds %desc { temporal_hint = #amdgpu.temporal_store_hint<nontemporal_regular> } : !amdgpu.tdm_descriptor<2>
+
+  // CHECK: rocdl.tensor.store.from.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 20 : vector<4xi32>, vector<8xi32>
+  amdgpu.tensor_store_from_lds %desc { temporal_hint = #amdgpu.temporal_store_hint<regular_nontemporal> } : !amdgpu.tdm_descriptor<2>
+
+  // CHECK: rocdl.tensor.store.from.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 24 : vector<4xi32>, vector<8xi32>
+  amdgpu.tensor_store_from_lds %desc { temporal_hint = #amdgpu.temporal_store_hint<nontemporal_highpriority> } : !amdgpu.tdm_descriptor<2>
+
+  // CHECK: rocdl.tensor.store.from.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 28 : vector<4xi32>, vector<8xi32>
+  amdgpu.tensor_store_from_lds %desc { temporal_hint = #amdgpu.temporal_store_hint<nontemporal_writeback> } : !amdgpu.tdm_descriptor<2>
+
+  // CHECK: rocdl.tensor.store.from.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 0 : vector<4xi32>, vector<8xi32>
+  amdgpu.tensor_store_from_lds %desc { non_volatile = false } : !amdgpu.tdm_descriptor<2>
+
+  // CHECK: rocdl.tensor.store.from.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 32 : vector<4xi32>, vector<8xi32>
+  amdgpu.tensor_store_from_lds %desc { non_volatile = true } : !amdgpu.tdm_descriptor<2>
+  func.return
+}
+
+// CHECK-LABEL: func @tensor_store_from_lds
+// CHECK-SAME: (%[[DESC:.+]]: !amdgpu.tdm_descriptor<4>)
+func.func @tensor_store_from_lds(%desc: !amdgpu.tdm_descriptor<4>) {
+  // CHECK: %[[DGROUPS:.+]]:4 = builtin.unrealized_conversion_cast %[[DESC]]
+  // CHECK: rocdl.tensor.store.from.lds %[[DGROUPS]]#0, %[[DGROUPS]]#1, %[[DGROUPS]]#2, %[[DGROUPS]]#3 cachepolicy 0 : vector<4xi32>, vector<8xi32>
+  amdgpu.tensor_store_from_lds %desc : !amdgpu.tdm_descriptor<4>
+  func.return
+}
