@@ -369,31 +369,68 @@ multiclass FLAT_Global_Store_Pseudo_t16<string opName> {
369369 }
370370}
371371
372- class FLAT_Global_Load_LDS_Pseudo <string opName, bit EnableSaddr = 0> : FLAT_Pseudo<
372+ // Async loads, introduced in gfx1250, will store directly
373+ // to a DS address in vdst (they will not use M0 for DS address).
374+ class FLAT_Global_Load_LDS_Pseudo <string opName, bit EnableSaddr = 0, bit IsAsync = 0> : FLAT_Pseudo< // IsAsync = 1 selects the async form; the default 0 keeps the form that uses M0
373375 opName,
374376 (outs ),
375377 !con(
376- !if(EnableSaddr, (ins SReg_64:$saddr, VGPR_32:$vaddr), (ins VReg_64:$vaddr)),
377- (ins flat_offset:$offset, CPol_0:$cpol)),
378- " $vaddr"#!if(EnableSaddr, ", $saddr", ", off")#"$offset$cpol"> {
379- let LGKM_CNT = 1;
378+ !if(IsAsync, (ins VGPR_32:$vdst), (ins)), // async form only: a leading VGPR_32 $vdst input; otherwise contributes no operand
379+ !if(EnableSaddr, (ins SReg_64:$saddr, VGPR_32:$vaddr), (ins VReg_64:$vaddr)), // SADDR form: 64-bit SGPR $saddr plus 32-bit $vaddr; otherwise a 64-bit $vaddr
380+ (ins flat_offset:$offset, CPol_0:$cpol)),
381+ !if(IsAsync, " $vdst,", "")#" $vaddr"#!if(EnableSaddr, ", $saddr", ", off")#"$offset$cpol"> { // asm string: optional "$vdst," prefix, then $vaddr, then $saddr or the literal "off"
382+ let LGKM_CNT = !not(IsAsync); // set only for the non-async form
383+ let VM_CNT = !not(IsAsync); // set only for the non-async form
384+ let ASYNC_CNT = IsAsync; // set only for the async form
380385 let is_flat_global = 1;
381386 let lds = 1;
382387 let has_data = 0;
388+ let has_vdst = IsAsync; // vdst for ds address with IsAsync
389+ let mayLoad = 1;
390+ let mayStore = 1;
391+ let has_saddr = 1;
392+ let enabled_saddr = EnableSaddr;
393+ let VALU = 1;
394+ let PseudoInstr = opName#!if(EnableSaddr, "_SADDR", ""); // opName, with a "_SADDR" suffix for the SADDR form
395+ let Uses = !if(IsAsync, [EXEC, ASYNCcnt], [M0, EXEC]); // async form reads ASYNCcnt in place of M0
396+ let Defs = !if(IsAsync, [ASYNCcnt], []); // async form also defines ASYNCcnt; non-async form defines nothing here
397+ let SchedRW = [WriteVMEM, WriteLDS];
398+ }
399+
400+ multiclass FLAT_Global_Load_LDS_Pseudo<string opName, bit IsAsync = 0> { // emits two records per opName (unsuffixed and _SADDR), forwarding IsAsync to both
401+ def "" : FLAT_Global_Load_LDS_Pseudo<opName, 0, IsAsync>, // EnableSaddr = 0
402+ GlobalSaddrTable<0, opName>;
403+ def _SADDR : FLAT_Global_Load_LDS_Pseudo<opName, 1, IsAsync>, // EnableSaddr = 1
404+ GlobalSaddrTable<1, opName>;
405+ }
406+
407+ class FLAT_Global_STORE_LDS_Pseudo <string opName, bit EnableSaddr = 0> : FLAT_Pseudo< // no IsAsync parameter: the ASYNC_CNT / ASYNCcnt settings below are unconditional
408+ opName,
409+ (outs ),
410+ !con(
411+ !if(EnableSaddr, (ins SReg_64:$saddr, VGPR_32:$vaddr), (ins VReg_64:$vaddr)), (ins VGPR_32:$vdata), // address operands first, then a VGPR_32 $vdata input
412+ (ins flat_offset:$offset, CPol_0:$cpol)),
413+ " $vaddr, $vdata"#!if(EnableSaddr, ", $saddr", ", off")#"$offset$cpol"> { // asm string: $vaddr, $vdata, then $saddr or the literal "off"
414+ let VM_CNT = 0;
415+ let ASYNC_CNT = 1;
416+ let is_flat_global = 1;
417+ let lds = 1;
418+ let has_data = 1; // vdata for ds address
383419 let has_vdst = 0;
384420 let mayLoad = 1;
385421 let mayStore = 1;
386422 let has_saddr = 1;
387423 let enabled_saddr = EnableSaddr;
388424 let VALU = 1;
389- let Uses = [M0, EXEC];
425+ let Uses = [EXEC, ASYNCcnt]; // M0 is not listed as a use for this class
426+ let Defs = [ASYNCcnt];
390427 let SchedRW = [WriteVMEM, WriteLDS];
391428}
392429
393- multiclass FLAT_Global_Load_LDS_Pseudo <string opName> {
394- def "" : FLAT_Global_Load_LDS_Pseudo <opName>,
430+ multiclass FLAT_Global_STORE_LDS_Pseudo <string opName> { // emits two records per opName: unsuffixed and _SADDR
431+ def "" : FLAT_Global_STORE_LDS_Pseudo <opName>, // EnableSaddr left at its default of 0
395432 GlobalSaddrTable<0, opName>;
396- def _SADDR : FLAT_Global_Load_LDS_Pseudo <opName, 1>,
433+ def _SADDR : FLAT_Global_STORE_LDS_Pseudo <opName, 1>, // EnableSaddr = 1
397434 GlobalSaddrTable<1, opName>;
398435}
399436
@@ -1156,6 +1193,15 @@ let SubtargetPredicate = isGFX12Plus in {
11561193
11571194let SubtargetPredicate = isGFX1250Plus in {
11581195
1196+ defm GLOBAL_LOAD_ASYNC_TO_LDS_B8 : FLAT_Global_Load_LDS_Pseudo<"global_load_async_to_lds_b8", 1>; // second argument 1 = IsAsync; same for the three loads below
1197+ defm GLOBAL_LOAD_ASYNC_TO_LDS_B32 : FLAT_Global_Load_LDS_Pseudo<"global_load_async_to_lds_b32", 1>;
1198+ defm GLOBAL_LOAD_ASYNC_TO_LDS_B64 : FLAT_Global_Load_LDS_Pseudo<"global_load_async_to_lds_b64", 1>;
1199+ defm GLOBAL_LOAD_ASYNC_TO_LDS_B128 : FLAT_Global_Load_LDS_Pseudo<"global_load_async_to_lds_b128", 1>;
1200+ defm GLOBAL_STORE_ASYNC_FROM_LDS_B8 : FLAT_Global_STORE_LDS_Pseudo<"global_store_async_from_lds_b8">; // store multiclass takes only the mnemonic; it has no IsAsync argument
1201+ defm GLOBAL_STORE_ASYNC_FROM_LDS_B32 : FLAT_Global_STORE_LDS_Pseudo<"global_store_async_from_lds_b32">;
1202+ defm GLOBAL_STORE_ASYNC_FROM_LDS_B64 : FLAT_Global_STORE_LDS_Pseudo<"global_store_async_from_lds_b64">;
1203+ defm GLOBAL_STORE_ASYNC_FROM_LDS_B128 : FLAT_Global_STORE_LDS_Pseudo<"global_store_async_from_lds_b128">;
1204+
11591205def TENSOR_SAVE : FLAT_Global_Tensor_Pseudo<"tensor_save", 1>;
11601206def TENSOR_STOP : FLAT_Global_Tensor_Pseudo<"tensor_stop">;
11611207} // End SubtargetPredicate = isGFX1250Plus
@@ -3374,6 +3420,15 @@ defm GLOBAL_LOAD_MONITOR_B32 : VFLAT_Real_AllAddr_gfx1250<0x070>;
33743420defm GLOBAL_LOAD_MONITOR_B64 : VFLAT_Real_AllAddr_gfx1250<0x071>;
33753421defm GLOBAL_LOAD_MONITOR_B128 : VFLAT_Real_AllAddr_gfx1250<0x072>;
33763422
3423+ defm GLOBAL_LOAD_ASYNC_TO_LDS_B8 : VFLAT_Real_AllAddr_gfx1250<0x5f>; // async loads take 0x5f-0x62 (presumably the opcode values; confirm against VFLAT_Real_AllAddr_gfx1250)
3424+ defm GLOBAL_LOAD_ASYNC_TO_LDS_B32 : VFLAT_Real_AllAddr_gfx1250<0x60>;
3425+ defm GLOBAL_LOAD_ASYNC_TO_LDS_B64 : VFLAT_Real_AllAddr_gfx1250<0x61>;
3426+ defm GLOBAL_LOAD_ASYNC_TO_LDS_B128 : VFLAT_Real_AllAddr_gfx1250<0x62>;
3427+ defm GLOBAL_STORE_ASYNC_FROM_LDS_B8 : VFLAT_Real_AllAddr_gfx1250<0x63>; // async stores continue the same numeric run, 0x63-0x66
3428+ defm GLOBAL_STORE_ASYNC_FROM_LDS_B32 : VFLAT_Real_AllAddr_gfx1250<0x64>;
3429+ defm GLOBAL_STORE_ASYNC_FROM_LDS_B64 : VFLAT_Real_AllAddr_gfx1250<0x65>;
3430+ defm GLOBAL_STORE_ASYNC_FROM_LDS_B128 : VFLAT_Real_AllAddr_gfx1250<0x66>;
3431+
33773432defm GLOBAL_LOAD_TR_B128_w32 : VFLAT_Real_AllAddr_gfx1250<0x057, "global_load_tr16_b128">;
33783433defm GLOBAL_LOAD_TR_B64_w32 : VFLAT_Real_AllAddr_gfx1250<0x058, "global_load_tr8_b64">;
33793434
0 commit comments