From 430b60ebe05f4215d1d5c54caa45b70157621634 Mon Sep 17 00:00:00 2001 From: vigneshwar jayakumar Date: Mon, 16 Jun 2025 14:29:16 -0500 Subject: [PATCH 1/2] [AMDGPU] Fix to prevent sinking of PERMLANE_SWAP instruction Permlane_swap instruction depends on exec mask, added isConvergent flag to prevent sinking of instruction. Fixes SWDEV-537232 --- llvm/lib/Target/AMDGPU/VOP1Instructions.td | 6 +- llvm/lib/Target/AMDGPU/VOPInstructions.td | 1 + .../machine-sink-ignorable-exec-use.mir | 61 +++++++++++++++++++ 3 files changed, 67 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td index 7fdd951ecbd3c..d6718c556db1e 100644 --- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -775,7 +775,8 @@ defm V_PRNG_B32 : VOP1Inst <"v_prng_b32", VOP_I32_I32, int_amdgcn_prng_b32>; let Constraints = "$vdst = $vdst_in, $src0_out = $src0", DisableEncoding="$vdst_in,$src0_out", - SchedRW = [Write32Bit, Write32Bit] in { + SchedRW = [Write32Bit, Write32Bit], + isConvergent = 1 in { let SubtargetPredicate = HasPermlane16Swap in { defm V_PERMLANE16_SWAP_B32 : VOP1Inst<"v_permlane16_swap_b32", VOP_PERMLANE_SWAP>; } @@ -1550,8 +1551,11 @@ defm V_CVT_PK_F32_FP8 : VOP1_Real_NoDstSel_SDWA_gfx9<0x56>; defm V_CVT_PK_F32_BF8 : VOP1_Real_NoDstSel_SDWA_gfx9<0x57>; defm V_PRNG_B32 : VOP1_Real_gfx9 <0x58>; + +let isConvergent = 1 in { defm V_PERMLANE16_SWAP_B32 : VOP1_OpSel_Real_e32e64_gfx9<0x059>; defm V_PERMLANE32_SWAP_B32 : VOP1_OpSel_Real_e32e64_gfx9<0x05a>; +} class MovDPP8Pattern : GCNPat < (vt (int_amdgcn_mov_dpp8 vt:$src, timm:$dpp8)), diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td index 952ee2fe2c955..c0c874300ef0f 100644 --- a/llvm/lib/Target/AMDGPU/VOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td @@ -15,6 +15,7 @@ class LetDummies { bit isConvertibleToThreeAddress; bit isMoveImm; bit isReMaterializable; + bit isConvergent; bit isAsCheapAsAMove; bit FPDPRounding; Predicate SubtargetPredicate; diff --git a/llvm/test/CodeGen/AMDGPU/machine-sink-ignorable-exec-use.mir b/llvm/test/CodeGen/AMDGPU/machine-sink-ignorable-exec-use.mir index 0fc31ea9d6437..3cdba54026ccc 100644 --- a/llvm/test/CodeGen/AMDGPU/machine-sink-ignorable-exec-use.mir +++ b/llvm/test/CodeGen/AMDGPU/machine-sink-ignorable-exec-use.mir @@ -733,3 +733,64 @@ body: | liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc S_ENDPGM 0 ... +--- +name: test_no_sink_permlane_swap +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true +body: | + ; GFX9-LABEL: name: test_no_sink_permlane_swap + ; GFX9: bb.0: + ; GFX9-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; GFX9-NEXT: liveins: $vgpr0 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX9-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX9-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 + ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B64_]] + ; GFX9-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD killed [[COPY1]], 0, 0, implicit $exec :: (load (s32), addrspace 1) + ; GFX9-NEXT: [[V_PERMLANE32_SWAP_B32_e64_:%[0-9]+]]:vgpr_32, [[V_PERMLANE32_SWAP_B32_e64_1:%[0-9]+]]:vgpr_32 = V_PERMLANE32_SWAP_B32_e64 [[GLOBAL_LOAD_DWORD]], [[GLOBAL_LOAD_DWORD]], 0, 0, implicit $exec + ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 + ; GFX9-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1 + ; GFX9-NEXT: [[V_CMP_LT_I32_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_I32_e64 [[COPY2]](s32), [[S_MOV_B32_]], implicit $exec + ; GFX9-NEXT: [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF [[V_CMP_LT_I32_e64_]], %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX9-NEXT: S_BRANCH %bb.1 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: bb.1: + ; GFX9-NEXT: successors: %bb.2(0x80000000) + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: [[V_MAX_I32_e64_:%[0-9]+]]:vgpr_32 = V_MAX_I32_e64 [[V_PERMLANE32_SWAP_B32_e64_]], [[V_PERMLANE32_SWAP_B32_e64_1]], implicit $exec + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: bb.2: + ; GFX9-NEXT: successors: %bb.3(0x80000000) + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI [[V_MOV_B32_e32_]], %bb.0, [[V_MAX_I32_e64_]], %bb.1 + ; GFX9-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: bb.3: + ; GFX9-NEXT: S_ENDPGM 0, implicit [[PHI]] + bb.0: + liveins: $vgpr0 + %1:vgpr_32 = COPY $vgpr0 + %3:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %5:sreg_64 = S_MOV_B64 0 + %7:vreg_64 = COPY %5 + %9:vgpr_32 = GLOBAL_LOAD_DWORD killed %7, 0, 0, implicit $exec :: (load (s32), addrspace 1) + %10:vgpr_32, %11:vgpr_32 = V_PERMLANE32_SWAP_B32_e64 %9:vgpr_32, %9:vgpr_32, 0, 0, implicit $exec + %15:vgpr_32(s32) = COPY $vgpr0 + %16:sreg_32 = S_MOV_B32 1 + %17:sreg_64 = V_CMP_LT_I32_e64 %15(s32), %16, implicit $exec + %18:sreg_64 = COPY %17 + %19:sreg_64 = SI_IF %18, %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + S_BRANCH %bb.1 + + bb.1: + %20:vgpr_32 = V_MAX_I32_e64 %10:vgpr_32, %11:vgpr_32, implicit $exec + + bb.2: + %22:vgpr_32 = PHI %3, %bb.0, %20, %bb.1 + SI_END_CF %19, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + + bb.3: + S_ENDPGM 0, implicit %22 +... From 6913acf1e4eb5dea9e93142837d1c1822d7be695 Mon Sep 17 00:00:00 2001 From: vigneshwar jayakumar Date: Tue, 17 Jun 2025 10:45:03 -0500 Subject: [PATCH 2/2] review changes --- .../machine-sink-ignorable-exec-use.mir | 40 +++++++++++-------- 1 file changed, 23 insertions(+), 17 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/machine-sink-ignorable-exec-use.mir b/llvm/test/CodeGen/AMDGPU/machine-sink-ignorable-exec-use.mir index 3cdba54026ccc..ed22b353b0664 100644 --- a/llvm/test/CodeGen/AMDGPU/machine-sink-ignorable-exec-use.mir +++ b/llvm/test/CodeGen/AMDGPU/machine-sink-ignorable-exec-use.mir @@ -750,9 +750,9 @@ body: | ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B64_]] ; GFX9-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD killed [[COPY1]], 0, 0, implicit $exec :: (load (s32), addrspace 1) ; GFX9-NEXT: [[V_PERMLANE32_SWAP_B32_e64_:%[0-9]+]]:vgpr_32, [[V_PERMLANE32_SWAP_B32_e64_1:%[0-9]+]]:vgpr_32 = V_PERMLANE32_SWAP_B32_e64 [[GLOBAL_LOAD_DWORD]], [[GLOBAL_LOAD_DWORD]], 0, 0, implicit $exec - ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 + ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1 - ; GFX9-NEXT: [[V_CMP_LT_I32_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_I32_e64 [[COPY2]](s32), [[S_MOV_B32_]], implicit $exec + ; GFX9-NEXT: [[V_CMP_LT_I32_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_I32_e64 [[COPY2]], [[S_MOV_B32_]], implicit $exec ; GFX9-NEXT: [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF [[V_CMP_LT_I32_e64_]], %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GFX9-NEXT: S_BRANCH %bb.1 ; GFX9-NEXT: {{ $}} @@ -770,27 +770,33 @@ body: | ; GFX9-NEXT: bb.3: ; GFX9-NEXT: S_ENDPGM 0, implicit [[PHI]] bb.0: + successors: %bb.2(0x40000000), %bb.1(0x40000000) liveins: $vgpr0 - %1:vgpr_32 = COPY $vgpr0 - %3:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %5:sreg_64 = S_MOV_B64 0 - %7:vreg_64 = COPY %5 - %9:vgpr_32 = GLOBAL_LOAD_DWORD killed %7, 0, 0, implicit $exec :: (load (s32), addrspace 1) - %10:vgpr_32, %11:vgpr_32 = V_PERMLANE32_SWAP_B32_e64 %9:vgpr_32, %9:vgpr_32, 0, 0, implicit $exec - %15:vgpr_32(s32) = COPY $vgpr0 - %16:sreg_32 = S_MOV_B32 1 - %17:sreg_64 = V_CMP_LT_I32_e64 %15(s32), %16, implicit $exec - %18:sreg_64 = COPY %17 - %19:sreg_64 = SI_IF %18, %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + + %0:vgpr_32 = COPY $vgpr0 + %1:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %2:sreg_64 = S_MOV_B64 0 + %3:vreg_64 = COPY %2 + %4:vgpr_32 = GLOBAL_LOAD_DWORD killed %3, 0, 0, implicit $exec :: (load (s32), addrspace 1) + %5:vgpr_32, %6:vgpr_32 = V_PERMLANE32_SWAP_B32_e64 %4, %4, 0, 0, implicit $exec + %7:vgpr_32 = COPY $vgpr0 + %8:sreg_32 = S_MOV_B32 1 + %9:sreg_64 = V_CMP_LT_I32_e64 %7, %8, implicit $exec + %10:sreg_64 = COPY %9 + %11:sreg_64 = SI_IF %10, %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec S_BRANCH %bb.1 bb.1: - %20:vgpr_32 = V_MAX_I32_e64 %10:vgpr_32, %11:vgpr_32, implicit $exec + successors: %bb.2(0x80000000) + + %12:vgpr_32 = V_MAX_I32_e64 %5, %6, implicit $exec bb.2: - %22:vgpr_32 = PHI %3, %bb.0, %20, %bb.1 - SI_END_CF %19, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + successors: %bb.3(0x80000000) + + %13:vgpr_32 = PHI %1, %bb.0, %12, %bb.1 + SI_END_CF %11, implicit-def dead $exec, implicit-def dead $scc, implicit $exec bb.3: - S_ENDPGM 0, implicit %22 + S_ENDPGM 0, implicit %13 ...