@@ -369,7 +369,7 @@ func.func @make_dma_descriptor_atomic_barrier(%base: !amdgpu.tdm_base<i32>, %bar
 
 // CHECK-LABEL: func @make_dma_descriptor_workgroup_mask
 // CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base<i32>, %[[WG_MASK:.+]]: i16, %[[TIMEOUT:.+]]: i1)
-func.func @make_dma_descriptor_workgroup_mask(%base: !amdgpu.tdm_base<i32>, %wg_mask: i16, %timeout: i1) -> !amdgpu.tdm_descriptor {
+func.func @make_dma_descriptor_workgroup_mask(%base: !amdgpu.tdm_base<i32>, %wg_mask: i16, %timeout: i1) -> !amdgpu.tdm_descriptor<2> {
   // CHECK-DAG: %[[DGROUP0:.+]] = builtin.unrealized_conversion_cast %[[BASE]]
 
   // CHECK-DAG: %[[C0:.+]] = llvm.mlir.constant(0 : i32)
@@ -440,6 +440,126 @@ func.func @make_dma_descriptor_workgroup_mask(%base: !amdgpu.tdm_base<i32>, %wg_
   // CHECK: %[[DGROUP1:.+]] = llvm.insertelement %[[SGPR7]], %[[DGROUP1_6]][%[[C7]] : i32]
 
   // CHECK: %[[DGROUPS:.+]] = builtin.unrealized_conversion_cast %[[DGROUP0]], %[[DGROUP1]] : vector<4xi32>, vector<8xi32> to !amdgpu.tdm_descriptor
-  %descriptor = amdgpu.make_dma_descriptor %base globalSize [128, 64] globalStride [64, 1] sharedSize [128, 64] workgroupMask %wg_mask earlyTimeout %timeout : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
-  func.return %descriptor : !amdgpu.tdm_descriptor
+  %descriptor = amdgpu.make_dma_descriptor %base globalSize [128, 64] globalStride [64, 1] sharedSize [128, 64] workgroupMask %wg_mask earlyTimeout %timeout : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor<2>
+  func.return %descriptor : !amdgpu.tdm_descriptor<2>
+}
+
+// CHECK-LABEL: func @tensor_load_to_lds_d2
+// CHECK-SAME: (%[[DESC:.+]]: !amdgpu.tdm_descriptor<2>)
+func.func @tensor_load_to_lds_d2(%desc: !amdgpu.tdm_descriptor<2>) {
+  // CHECK: %[[DGROUPS:.+]]:2 = builtin.unrealized_conversion_cast %[[DESC]]
+  // CHECK: rocdl.tensor.load.to.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 0 : vector<4xi32>, vector<8xi32>
+  amdgpu.tensor_load_to_lds %desc : !amdgpu.tdm_descriptor<2>
+
+  // CHECK: rocdl.tensor.load.to.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 0 : vector<4xi32>, vector<8xi32>
+  amdgpu.tensor_load_to_lds %desc {cache_scope = #amdgpu.cache_scope<workgroup>} : !amdgpu.tdm_descriptor<2>
+
+  // CHECK: rocdl.tensor.load.to.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 1 : vector<4xi32>, vector<8xi32>
+  amdgpu.tensor_load_to_lds %desc {cache_scope = #amdgpu.cache_scope<shader_engine>} : !amdgpu.tdm_descriptor<2>
+
+  // CHECK: rocdl.tensor.load.to.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 2 : vector<4xi32>, vector<8xi32>
+  amdgpu.tensor_load_to_lds %desc {cache_scope = #amdgpu.cache_scope<device>} : !amdgpu.tdm_descriptor<2>
+
+  // CHECK: rocdl.tensor.load.to.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 3 : vector<4xi32>, vector<8xi32>
+  amdgpu.tensor_load_to_lds %desc {cache_scope = #amdgpu.cache_scope<system>} : !amdgpu.tdm_descriptor<2>
+
+  // CHECK: rocdl.tensor.load.to.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 0 : vector<4xi32>, vector<8xi32>
+  amdgpu.tensor_load_to_lds %desc {temporal_hint = #amdgpu.temporal_load_hint<regular>} : !amdgpu.tdm_descriptor<2>
+
+  // CHECK: rocdl.tensor.load.to.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 4 : vector<4xi32>, vector<8xi32>
+  amdgpu.tensor_load_to_lds %desc {temporal_hint = #amdgpu.temporal_load_hint<nontemporal>} : !amdgpu.tdm_descriptor<2>
+
+  // CHECK: rocdl.tensor.load.to.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 8 : vector<4xi32>, vector<8xi32>
+  amdgpu.tensor_load_to_lds %desc {temporal_hint = #amdgpu.temporal_load_hint<highpriority>} : !amdgpu.tdm_descriptor<2>
+
+  // CHECK: rocdl.tensor.load.to.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 12 : vector<4xi32>, vector<8xi32>
+  amdgpu.tensor_load_to_lds %desc {temporal_hint = #amdgpu.temporal_load_hint<lastuse>} : !amdgpu.tdm_descriptor<2>
+
+  // CHECK: rocdl.tensor.load.to.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 16 : vector<4xi32>, vector<8xi32>
+  amdgpu.tensor_load_to_lds %desc {temporal_hint = #amdgpu.temporal_load_hint<nontemporal_regular>} : !amdgpu.tdm_descriptor<2>
+
+  // CHECK: rocdl.tensor.load.to.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 20 : vector<4xi32>, vector<8xi32>
+  amdgpu.tensor_load_to_lds %desc {temporal_hint = #amdgpu.temporal_load_hint<regular_nontemporal>} : !amdgpu.tdm_descriptor<2>
+
+  // CHECK: rocdl.tensor.load.to.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 24 : vector<4xi32>, vector<8xi32>
+  amdgpu.tensor_load_to_lds %desc {temporal_hint = #amdgpu.temporal_load_hint<nontemporal_highpriority>} : !amdgpu.tdm_descriptor<2>
+
+  // CHECK: rocdl.tensor.load.to.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 0 : vector<4xi32>, vector<8xi32>
+  amdgpu.tensor_load_to_lds %desc {non_volatile = false} : !amdgpu.tdm_descriptor<2>
+
+  // CHECK: rocdl.tensor.load.to.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 32 : vector<4xi32>, vector<8xi32>
+  amdgpu.tensor_load_to_lds %desc {non_volatile = true} : !amdgpu.tdm_descriptor<2>
+
+  func.return
 }
+
+// CHECK-LABEL: func @tensor_load_to_lds
+// CHECK-SAME: (%[[DESC:.+]]: !amdgpu.tdm_descriptor<4>)
+func.func @tensor_load_to_lds(%desc: !amdgpu.tdm_descriptor<4>) {
+  // CHECK: %[[DGROUPS:.+]]:4 = builtin.unrealized_conversion_cast %[[DESC]]
+  // CHECK: rocdl.tensor.load.to.lds %[[DGROUPS]]#0, %[[DGROUPS]]#1, %[[DGROUPS]]#2, %[[DGROUPS]]#3 cachepolicy 0 : vector<4xi32>, vector<8xi32>
+  amdgpu.tensor_load_to_lds %desc : !amdgpu.tdm_descriptor<4>
+  func.return
+}
+
+// CHECK-LABEL: func @tensor_store_from_lds_d2
+// CHECK-SAME: (%[[DESC:.+]]: !amdgpu.tdm_descriptor<2>)
+func.func @tensor_store_from_lds_d2(%desc: !amdgpu.tdm_descriptor<2>) {
+  // CHECK: %[[DGROUPS:.+]]:2 = builtin.unrealized_conversion_cast %[[DESC]]
+  // CHECK: rocdl.tensor.store.from.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 0 : vector<4xi32>, vector<8xi32>
+  amdgpu.tensor_store_from_lds %desc : !amdgpu.tdm_descriptor<2>
+
+  // CHECK: rocdl.tensor.store.from.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 0 : vector<4xi32>, vector<8xi32>
+  amdgpu.tensor_store_from_lds %desc {cache_scope = #amdgpu.cache_scope<workgroup>} : !amdgpu.tdm_descriptor<2>
+
+  // CHECK: rocdl.tensor.store.from.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 1 : vector<4xi32>, vector<8xi32>
+  amdgpu.tensor_store_from_lds %desc {cache_scope = #amdgpu.cache_scope<shader_engine>} : !amdgpu.tdm_descriptor<2>
+
+  // CHECK: rocdl.tensor.store.from.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 2 : vector<4xi32>, vector<8xi32>
+  amdgpu.tensor_store_from_lds %desc {cache_scope = #amdgpu.cache_scope<device>} : !amdgpu.tdm_descriptor<2>
+
+  // CHECK: rocdl.tensor.store.from.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 3 : vector<4xi32>, vector<8xi32>
+  amdgpu.tensor_store_from_lds %desc {cache_scope = #amdgpu.cache_scope<system>} : !amdgpu.tdm_descriptor<2>
+
+  // CHECK: rocdl.tensor.store.from.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 0 : vector<4xi32>, vector<8xi32>
+  amdgpu.tensor_store_from_lds %desc {temporal_hint = #amdgpu.temporal_store_hint<regular>} : !amdgpu.tdm_descriptor<2>
+
+  // CHECK: rocdl.tensor.store.from.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 4 : vector<4xi32>, vector<8xi32>
+  amdgpu.tensor_store_from_lds %desc {temporal_hint = #amdgpu.temporal_store_hint<nontemporal>} : !amdgpu.tdm_descriptor<2>
+
+  // CHECK: rocdl.tensor.store.from.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 8 : vector<4xi32>, vector<8xi32>
+  amdgpu.tensor_store_from_lds %desc {temporal_hint = #amdgpu.temporal_store_hint<highpriority>} : !amdgpu.tdm_descriptor<2>
+
+  // CHECK: rocdl.tensor.store.from.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 12 : vector<4xi32>, vector<8xi32>
+  amdgpu.tensor_store_from_lds %desc {temporal_hint = #amdgpu.temporal_store_hint<writeback>} : !amdgpu.tdm_descriptor<2>
+
+  // CHECK: rocdl.tensor.store.from.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 16 : vector<4xi32>, vector<8xi32>
+  amdgpu.tensor_store_from_lds %desc {temporal_hint = #amdgpu.temporal_store_hint<nontemporal_regular>} : !amdgpu.tdm_descriptor<2>
+
+  // CHECK: rocdl.tensor.store.from.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 20 : vector<4xi32>, vector<8xi32>
+  amdgpu.tensor_store_from_lds %desc {temporal_hint = #amdgpu.temporal_store_hint<regular_nontemporal>} : !amdgpu.tdm_descriptor<2>
+
+  // CHECK: rocdl.tensor.store.from.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 24 : vector<4xi32>, vector<8xi32>
+  amdgpu.tensor_store_from_lds %desc {temporal_hint = #amdgpu.temporal_store_hint<nontemporal_highpriority>} : !amdgpu.tdm_descriptor<2>
+
+  // CHECK: rocdl.tensor.store.from.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 28 : vector<4xi32>, vector<8xi32>
+  amdgpu.tensor_store_from_lds %desc {temporal_hint = #amdgpu.temporal_store_hint<nontemporal_writeback>} : !amdgpu.tdm_descriptor<2>
+
+  // CHECK: rocdl.tensor.store.from.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 0 : vector<4xi32>, vector<8xi32>
+  amdgpu.tensor_store_from_lds %desc {non_volatile = false} : !amdgpu.tdm_descriptor<2>
+
+  // CHECK: rocdl.tensor.store.from.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 32 : vector<4xi32>, vector<8xi32>
+  amdgpu.tensor_store_from_lds %desc {non_volatile = true} : !amdgpu.tdm_descriptor<2>
+  func.return
+}
+
+
+// CHECK-LABEL: func @tensor_store_from_lds
+// CHECK-SAME: (%[[DESC:.+]]: !amdgpu.tdm_descriptor<4>)
+func.func @tensor_store_from_lds(%desc: !amdgpu.tdm_descriptor<4>) {
+  // CHECK: %[[DGROUPS:.+]]:4 = builtin.unrealized_conversion_cast %[[DESC]]
+  // CHECK: rocdl.tensor.store.from.lds %[[DGROUPS]]#0, %[[DGROUPS]]#1, %[[DGROUPS]]#2, %[[DGROUPS]]#3 cachepolicy 0 : vector<4xi32>, vector<8xi32>
+  amdgpu.tensor_store_from_lds %desc : !amdgpu.tdm_descriptor<4>
+  func.return
+}
+