[AMDGPU] Add pattern to select scalar ops for fshr with uniform operands #165295
base: main
Conversation
@llvm/pr-subscribers-backend-amdgpu

Author: Akash Dutta (akadutta)

Changes

Reasoning behind the proposed change: this helps us move away from selecting v_alignbit for fshr with uniform operands.

V_ALIGNBIT is defined in the ISA as:

D0.u32 = 32'U(({ S0.u32, S1.u32 } >> S2.u32[4 : 0]) & 0xffffffffLL)

I interpret that as concat(S0, S1) >> S2, where the low 5 bits of S2 (the 0x1F mask) give the shift amount and the 0xffffffff mask returns the lower 32 bits.

fshr:

fshr i32 %src0, i32 %src1, i32 %src2

So I can expand the V_ALIGNBIT through bit manipulation as:

(%src1 >> %src2) | (%src0 << (32 - %src2))

Additionally, src2 in the fshr pattern should be masked to its low 5 bits: the extra S_ANDs limit the shift amount to the last 5 bits.

Patch is 30.04 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/165295.diff

2 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 6f1feb1dc2996..6e64a1514d27d 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -2763,6 +2763,11 @@ def : GCNPat<(fshr i32:$src0, i32:$src1, i32:$src2),
/* src2_modifiers */ 0,
$src2, /* clamp */ 0, /* op_sel */ 0)
>;
+
+def : GCNPat<(UniformTernaryFrag<fshr> i32:$src0, i32:$src1, i32:$src2),
+ (S_OR_B32 (S_LSHR_B32 $src1, (S_AND_B32 $src2, (i32 0xffffffff))), (S_LSHL_B32 $src0, (S_SUB_I32 (i32 32), (S_AND_B32 $src2, (i32 0xffffffff)))))
+>;
+
} // end True16Predicate = UseFakeTrue16Insts
/********** ====================== **********/
diff --git a/llvm/test/CodeGen/AMDGPU/fshr.ll b/llvm/test/CodeGen/AMDGPU/fshr.ll
index ef68f44bac203..4be928f0ef6f1 100644
--- a/llvm/test/CodeGen/AMDGPU/fshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/fshr.ll
@@ -103,10 +103,15 @@ define amdgpu_kernel void @fshr_i32(ptr addrspace(1) %in, i32 %x, i32 %y, i32 %z
; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_alignbit_b32 v0, s0, s1, v0
-; GFX11-FAKE16-NEXT: global_store_b32 v1, v0, s[4:5]
+; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, -1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_sub_i32 s3, 32, s2
+; GFX11-FAKE16-NEXT: s_lshr_b32 s1, s1, s2
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, s3
+; GFX11-FAKE16-NEXT: s_or_b32 s0, s1, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-FAKE16-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-FAKE16-NEXT: s_endpgm
;
; GFX12-TRUE16-LABEL: fshr_i32:
@@ -128,10 +133,15 @@ define amdgpu_kernel void @fshr_i32(ptr addrspace(1) %in, i32 %x, i32 %y, i32 %z
; GFX12-FAKE16-NEXT: s_load_b96 s[0:2], s[4:5], 0x2c
; GFX12-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-FAKE16-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_alignbit_b32 v0, s0, s1, v0
-; GFX12-FAKE16-NEXT: global_store_b32 v1, v0, s[4:5]
+; GFX12-FAKE16-NEXT: s_and_b32 s2, s2, -1
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX12-FAKE16-NEXT: s_sub_co_i32 s3, 32, s2
+; GFX12-FAKE16-NEXT: s_lshr_b32 s1, s1, s2
+; GFX12-FAKE16-NEXT: s_lshl_b32 s0, s0, s3
+; GFX12-FAKE16-NEXT: s_or_b32 s0, s1, s0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX12-FAKE16-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-FAKE16-NEXT: s_endpgm
entry:
%0 = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 %z)
@@ -195,23 +205,49 @@ define amdgpu_kernel void @fshr_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) {
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
;
-; GFX11-LABEL: fshr_i32_imm:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_alignbit_b32 v1, s2, s3, 7
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_endpgm
-;
-; GFX12-LABEL: fshr_i32_imm:
-; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-NEXT: v_mov_b32_e32 v0, 0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_alignbit_b32 v1, s2, s3, 7
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: fshr_i32_imm:
+; GFX11-TRUE16: ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_alignbit_b32 v1, s2, s3, 7
+; GFX11-TRUE16-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: fshr_i32_imm:
+; GFX11-FAKE16: ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-FAKE16-NEXT: s_sub_i32 s4, 32, 7
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, s4
+; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s3, 7
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_or_b32 s2, s3, s2
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-FAKE16-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-FAKE16-NEXT: s_endpgm
+;
+; GFX12-TRUE16-LABEL: fshr_i32_imm:
+; GFX12-TRUE16: ; %bb.0: ; %entry
+; GFX12-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_alignbit_b32 v1, s2, s3, 7
+; GFX12-TRUE16-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-TRUE16-NEXT: s_endpgm
+;
+; GFX12-FAKE16-LABEL: fshr_i32_imm:
+; GFX12-FAKE16: ; %bb.0: ; %entry
+; GFX12-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-FAKE16-NEXT: s_sub_co_i32 s4, 32, 7
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: s_lshl_b32 s2, s2, s4
+; GFX12-FAKE16-NEXT: s_lshr_b32 s3, s3, 7
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-FAKE16-NEXT: s_or_b32 s2, s3, s2
+; GFX12-FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-FAKE16-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-FAKE16-NEXT: s_endpgm
entry:
%0 = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 7)
store i32 %0, ptr addrspace(1) %in
@@ -321,12 +357,20 @@ define amdgpu_kernel void @fshr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i
; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s7
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s6
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_alignbit_b32 v1, s1, s3, v0
-; GFX11-FAKE16-NEXT: v_alignbit_b32 v0, s0, s2, v2
-; GFX11-FAKE16-NEXT: global_store_b64 v3, v[0:1], s[4:5]
+; GFX11-FAKE16-NEXT: s_and_b32 s7, s7, -1
+; GFX11-FAKE16-NEXT: s_and_b32 s6, s6, -1
+; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s3, s7
+; GFX11-FAKE16-NEXT: s_sub_i32 s7, 32, s7
+; GFX11-FAKE16-NEXT: s_sub_i32 s8, 32, s6
+; GFX11-FAKE16-NEXT: s_lshr_b32 s2, s2, s6
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, s8
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, s7
+; GFX11-FAKE16-NEXT: s_or_b32 s0, s2, s0
+; GFX11-FAKE16-NEXT: s_or_b32 s1, s3, s1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-FAKE16-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-FAKE16-NEXT: s_endpgm
;
; GFX12-TRUE16-LABEL: fshr_v2i32:
@@ -352,12 +396,20 @@ define amdgpu_kernel void @fshr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i
; GFX12-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
; GFX12-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-FAKE16-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s7
-; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, s6
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT: v_alignbit_b32 v1, s1, s3, v0
-; GFX12-FAKE16-NEXT: v_alignbit_b32 v0, s0, s2, v2
-; GFX12-FAKE16-NEXT: global_store_b64 v3, v[0:1], s[4:5]
+; GFX12-FAKE16-NEXT: s_and_b32 s7, s7, -1
+; GFX12-FAKE16-NEXT: s_and_b32 s6, s6, -1
+; GFX12-FAKE16-NEXT: s_lshr_b32 s3, s3, s7
+; GFX12-FAKE16-NEXT: s_sub_co_i32 s7, 32, s7
+; GFX12-FAKE16-NEXT: s_sub_co_i32 s8, 32, s6
+; GFX12-FAKE16-NEXT: s_lshr_b32 s2, s2, s6
+; GFX12-FAKE16-NEXT: s_lshl_b32 s0, s0, s8
+; GFX12-FAKE16-NEXT: s_lshl_b32 s1, s1, s7
+; GFX12-FAKE16-NEXT: s_or_b32 s0, s2, s0
+; GFX12-FAKE16-NEXT: s_or_b32 s1, s3, s1
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-FAKE16-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, s0
+; GFX12-FAKE16-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX12-FAKE16-NEXT: s_endpgm
entry:
%0 = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %z)
@@ -433,29 +485,69 @@ define amdgpu_kernel void @fshr_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX10-NEXT: s_endpgm
;
-; GFX11-LABEL: fshr_v2i32_imm:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
-; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
-; GFX11-NEXT: v_mov_b32_e32 v2, 0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_alignbit_b32 v1, s1, s3, 9
-; GFX11-NEXT: v_alignbit_b32 v0, s0, s2, 7
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
-; GFX11-NEXT: s_endpgm
-;
-; GFX12-LABEL: fshr_v2i32_imm:
-; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
-; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
-; GFX12-NEXT: v_mov_b32_e32 v2, 0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_alignbit_b32 v1, s1, s3, 9
-; GFX12-NEXT: v_alignbit_b32 v0, s0, s2, 7
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
-; GFX12-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: fshr_v2i32_imm:
+; GFX11-TRUE16: ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT: s_clause 0x1
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
+; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_alignbit_b32 v1, s1, s3, 9
+; GFX11-TRUE16-NEXT: v_alignbit_b32 v0, s0, s2, 7
+; GFX11-TRUE16-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: fshr_v2i32_imm:
+; GFX11-FAKE16: ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT: s_clause 0x1
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
+; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
+; GFX11-FAKE16-NEXT: s_sub_i32 s6, 32, 9
+; GFX11-FAKE16-NEXT: s_sub_i32 s7, 32, 7
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, s6
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, s7
+; GFX11-FAKE16-NEXT: s_lshr_b32 s2, s2, 7
+; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s3, 9
+; GFX11-FAKE16-NEXT: s_or_b32 s0, s2, s0
+; GFX11-FAKE16-NEXT: s_or_b32 s1, s3, s1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-FAKE16-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX11-FAKE16-NEXT: s_endpgm
+;
+; GFX12-TRUE16-LABEL: fshr_v2i32_imm:
+; GFX12-TRUE16: ; %bb.0: ; %entry
+; GFX12-TRUE16-NEXT: s_clause 0x1
+; GFX12-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
+; GFX12-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_alignbit_b32 v1, s1, s3, 9
+; GFX12-TRUE16-NEXT: v_alignbit_b32 v0, s0, s2, 7
+; GFX12-TRUE16-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX12-TRUE16-NEXT: s_endpgm
+;
+; GFX12-FAKE16-LABEL: fshr_v2i32_imm:
+; GFX12-FAKE16: ; %bb.0: ; %entry
+; GFX12-FAKE16-NEXT: s_clause 0x1
+; GFX12-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
+; GFX12-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
+; GFX12-FAKE16-NEXT: s_sub_co_i32 s6, 32, 9
+; GFX12-FAKE16-NEXT: s_sub_co_i32 s7, 32, 7
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: s_lshl_b32 s1, s1, s6
+; GFX12-FAKE16-NEXT: s_lshl_b32 s0, s0, s7
+; GFX12-FAKE16-NEXT: s_lshr_b32 s2, s2, 7
+; GFX12-FAKE16-NEXT: s_lshr_b32 s3, s3, 9
+; GFX12-FAKE16-NEXT: s_or_b32 s0, s2, s0
+; GFX12-FAKE16-NEXT: s_or_b32 s1, s3, s1
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-FAKE16-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, s0
+; GFX12-FAKE16-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX12-FAKE16-NEXT: s_endpgm
entry:
%0 = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> <i32 7, i32 9>)
store <2 x i32> %0, ptr addrspace(1) %in
@@ -595,17 +687,32 @@ define amdgpu_kernel void @fshr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i
; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x54
; GFX11-FAKE16-NEXT: s_load_b256 s[8:15], s[4:5], 0x34
; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, 0
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s2
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v5, s0
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT: v_alignbit_b32 v3, s11, s15, v0
-; GFX11-FAKE16-NEXT: v_alignbit_b32 v2, s10, s14, v1
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT: v_alignbit_b32 v1, s9, s13, v4
-; GFX11-FAKE16-NEXT: v_alignbit_b32 v0, s8, s12, v5
-; GFX11-FAKE16-NEXT: global_store_b128 v6, v[0:3], s[4:5]
+; GFX11-FAKE16-NEXT: s_and_b32 s3, s3, -1
+; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, -1
+; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, -1
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, -1
+; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s15, s3
+; GFX11-FAKE16-NEXT: s_sub_i32 s3, 32, s3
+; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s14, s2
+; GFX11-FAKE16-NEXT: s_sub_i32 s2, 32, s2
+; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s13, s1
+; GFX11-FAKE16-NEXT: s_sub_i32 s1, 32, s1
+; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s12, s0
+; GFX11-FAKE16-NEXT: s_sub_i32 s0, 32, s0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s11, s3
+; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s10, s2
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s9, s1
+; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s8, s0
+; GFX11-FAKE16-NEXT: s_or_b32 s3, s6, s3
+; GFX11-FAKE16-NEXT: s_or_b32 s2, s7, s2
+; GFX11-FAKE16-NEXT: s_or_b32 s0, s12, s0
+; GFX11-FAKE16-NEXT: s_or_b32 s1, s13, s1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-FAKE16-NEXT: global_store_b128 v4, v[0:3], s[4:5]
; GFX11-FAKE16-NEXT: s_endpgm
;
; GFX12-TRUE16-LABEL: fshr_v4i32:
@@ -635,17 +742,32 @@ define amdgpu_kernel void @fshr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i
; GFX12-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x54
; GFX12-FAKE16-NEXT: s_load_b256 s[8:15], s[4:5], 0x34
; GFX12-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
-; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, 0
; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-FAKE16-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s2
-; GFX12-FAKE16-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v5, s0
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-FAKE16-NEXT: v_alignbit_b32 v3, s11, s15, v0
-; GFX12-FAKE16-NEXT: v_alignbit_b32 v2, s10, s14, v1
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-FAKE16-NEXT: v_alignbit_b32 v1, s9, s13, v4
-; GFX12-FAKE16-NEXT: v_alignbit_b32 v0, s8, s12, v5
-; GFX12-FAKE16-NEXT: global_store_b128 v6, v[0:3], s[4:5]
+; GFX12-FAKE16-NEXT: s_and_b32 s3, s3, -1
+; GFX12-FAKE16-NEXT: s_and_b32 s2, s2, -1
+; GFX12-FAKE16-NEXT: s_and_b32 s1, s1, -1
+; GFX12-FAKE16-NEXT: s_and_b32 s0, s0, -1
+; GFX12-FAKE16-NEXT: s_lshr_b32 s6, s15, s3
+; GFX12-FAKE16-NEXT: s_sub_co_i32 s3, 32, s3
+; GFX12-FAKE16-NEXT: s_lshr_b32 s7, s14, s2
+; GFX12-FAKE16-NEXT: s_sub_co_i32 s2, 32, s2
+; GFX12-FAKE16-NEXT: s_lshr_b32 s13, s13, s1
+; GFX12-FAKE16-NEXT: s_sub_co_i32 s1, 32, s1
+; GFX12-FAKE16-NEXT: s_lshr_b32 s12, s12, s0
+; GFX12-FAKE16-NEXT: s_sub_co_i32 s0, 32, s0
+; GFX12-FAKE16-NEXT: s_lshl_b32 s3, s11, s3
+; GFX12-FAKE16-NEXT: s_lshl_b32 s2, s10, s2
+; GFX12-FAKE16-NEXT: s_lshl_b32 s1, s9, s1
+; GFX12-FAKE16-NEXT: s_lshl_b32 s0, s8, s0
+; GFX12-FAKE16-NEXT: s_or_b32 s3, s6, s3
+; GFX12-FAKE16-NEXT: s_or_b32 s2, s7, s2
+; GFX12-FAKE16-NEXT: s_or_b32 s0, s12, s0
+; GFX12-FAKE16-NEXT: s_or_b32 s1, s13, s1
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-FAKE16-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s1
+; GFX12-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, s2
+; GFX12-FAKE16-NEXT: global_store_b128 v4, v[0:3], s[4:5]
; GFX12-FAKE16-NEXT: s_endpgm
entry:
%0 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z)
@@ -737,33 +859,89 @@ define amdgpu_kernel void @fshr_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4
; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX10-NEXT: s_endpgm
;
-; GFX11-LABEL: fshr_v4i32_imm:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b256 s[8:15], s[4:5], 0x34
-; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-NEXT: v_mov_b32_e32 v4, 0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_alignbit_b32 v3, s11, s15, 1
-; GFX11-NEXT: v_alignbit_b32 v2, s10, s14, 9
-; GFX11-NEXT: v_alignbit_b32 v1, s9, s13, 7
-; GFX11-NEXT: v_alignbit_b32 v0, s8, s12, 1
-; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1]
-; GFX11-NEXT: s_endpgm
-;
-; GFX12-LABEL: fshr_v4i32_imm:
-; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b256 s[8:15], s[4:5], 0x34
-; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-NEXT: v_mov_b32_e32 v4, 0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_alignbit_b32 v3, s11, s15, 1
-; GFX12-NEXT: v_alignbit_b32 v2, s10, s14, 9
-; GFX12-NEXT: v_alignbit_b32 v1, s9, s13, 7
-; GFX12-NEXT: v_alignbit_b32 v0, s8, s12, 1
-; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1]
-; GFX12-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: fshr_v4i32_imm:
+; GFX11-TRUE16: ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT: s_clause 0x1
+; GFX11-TRUE16-NEXT: s_load_b256 s[8:15], s[4:5], 0x34
+; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, 0
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_alignbit_b32 v3, s11, s15, 1
+; GFX11-TRUE16-NEXT: v_alignbit_b32 v2, s10, s14, 9
+; GFX11-TRUE16-NEXT: v_alignbit_b32 v1, s9, s13, 7
+; GFX11-TRUE16-NEXT: v_alignbit_b32 v0, s8, s12, 1
+; GFX11-TRUE16-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: fshr_v4i32_imm:
+; GFX11-FAKE16: ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT: s_clause 0x1
+; GFX11-FAKE16-NEXT: s_load_b256 s[8:15], s[4:5], 0x34
+; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-FAKE16-NEXT: s_sub_i32 s2, 32, 1
+; GFX11-FAKE16-NEXT: s_sub_i32 s3, 32, 9
+; GFX11-FAKE16-NEXT: s_sub_i32 s6, 32, 7
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s11, s2
+; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s15, 1
+; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s10, s3
+; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s14, 9
+; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s9, s6
+; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s13, 7
+; GFX11-FAKE16-NEXT: s_lshl_b32 s2,...
[truncated]
@jrbyrnes @hidekisaito @carlobertolli: requesting review on this change.

@aleksandar-amd has also been working on this, see #143551.
>;

def : GCNPat<(UniformTernaryFrag<fshr> i32:$src0, i32:$src1, i32:$src2),
  (S_OR_B32 (S_LSHR_B32 $src1, (S_AND_B32 $src2, (i32 0xffffffff))), (S_LSHL_B32 $src0, (S_SUB_I32 (i32 32), (S_AND_B32 $src2, (i32 0xffffffff)))))
A 32-bit AND with 0xffffffff is a no-op. Do not generate this.
Thank you for pointing me to this PR. I took a look at it. To me, the two don't seem to overlap: this one simply adds a new set of scalar instructions for the fshr operation with uniform operands, whereas #143551 targets the rotr operation. Or am I missing something?
fshr and rotr are very similar operations. Yes, #143551 only handles rotr, but with very small changes it could handle fshr as well (or instead). It is more sophisticated than the current patch because it tries to optimize the common case where the shift/rotate amount is constant.
Yes. That makes sense. Thank you for explaining.
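For illustration (a host-side C++ sketch with my own helper names, not code from either PR): rotr is just the special case of fshr where both data operands are the same value, which is why the two patches cover such similar ground.

#include <cassert>
#include <cstdint>

// Reference fshr.i32: low 32 bits of (concat(hi, lo) >> (s mod 32)).
static uint32_t fshr32(uint32_t hi, uint32_t lo, uint32_t s) {
  uint64_t concat = (uint64_t(hi) << 32) | lo;
  return uint32_t(concat >> (s & 31u));
}

// rotr(x, s) == fshr(x, x, s): rotate-right is a funnel shift of x against itself.
static uint32_t rotr32(uint32_t x, uint32_t s) {
  return fshr32(x, x, s);
}

int main() {
  assert(rotr32(0x80000001u, 1) == 0xC0000000u);
  assert(fshr32(0x12345678u, 0x9ABCDEF0u, 8) == 0x789ABCDEu);
  return 0;
}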
I think this patch is probably OK as-is but there are things that could be improved:
Eyeballing the test, it doesn't look like we have coverage for all the permutations of scalar / vector / constant in each operand position. Given the pattern's complexity, we should probably test all of these.
llvm/test/CodeGen/AMDGPU/fshr.ll (outdated)
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, 0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_alignbit_b32 v1, s2, s3, 7
Why doesn't the uniform pattern work for true16? Maybe you need to mark the divergent patterns with DivergentTernaryFrag, or use AddedComplexity to ensure that the uniform pattern is preferred when both versions would match.
It is so far limited to the "True16Predicate = UseFakeTrue16Insts" condition.
Please fix that. The uniform pattern has nothing to do with true16, so it should not be inside any True16Predicate = ... section.
llvm/test/CodeGen/AMDGPU/fshr.ll (outdated)
; GFX11-FAKE16-NEXT: s_mov_b32 s4, s3
; GFX11-FAKE16-NEXT: s_mov_b32 s5, s2
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_lshr_b64 s[2:3], s[4:5], 7
I guess SIFoldOperands managed to fold away the s_and_b32 you generated here? It would still be better not to generate it in the first place, when the shift amount is constant.
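To make the constant-amount point concrete (my own example, not from the review): with an immediate shift, the mask and the 32 - n subtraction are compile-time constants, so only the two shifts and the OR should survive. For a shift of 7:

#include <cassert>
#include <cstdint>

// fshr by a constant 7: (7 & 31) and (32 - 7) fold away, leaving shr/shl/or.
static uint32_t fshr_by_7(uint32_t src0, uint32_t src1) {
  return (src1 >> 7) | (src0 << 25); // 25 == 32 - 7
}

int main() {
  // Low 32 bits of 0x123456789ABCDEF0 >> 7.
  assert(fshr_by_7(0x12345678u, 0x9ABCDEF0u) == 0xF13579BDu);
  return 0;
}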
// The commented out code has been left intentionally to aid the review process, if needed.
// Will delete before landing.
Please just remove it. We can always look at older commits in this PR if we want to see it.
I added a few new tests with immediates in different arg positions.
Please don't add three copies of your new patterns. Just put them somewhere outside of any True16Predicate = ... clause.
Reasoning behind the proposed change: this helps us move away from selecting v_alignbit for fshr with uniform operands.
V_ALIGNBIT is defined in the ISA as:
D0.u32 = 32'U(({ S0.u32, S1.u32 } >> S2.u32[4 : 0]) & 0xffffffffLL)
Note: S0 carries the MSBs and S1 carries the LSBs of the value being aligned.
I interpret that as concat(S0, S1) >> S2, where the low 5 bits of S2 (the [4:0] / 0x1F mask) give the shift amount and the 0xffffffff mask returns the lower 32 bits.
fshr:
fshr i32 %src0, i32 %src1, i32 %src2
Where:
concat(%src0, %src1) represents the 64-bit value formed by %src0 as the high 32 bits and %src1 as the low 32 bits.
%src2 is the shift amount.
Only the lower 32 bits are returned.
So these two are identical.
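As a sanity check (host-side C++, illustrative only; the helper names are mine), the ISA pseudocode for V_ALIGNBIT_B32 and the fshr semantics above can be compared directly:

#include <cassert>
#include <cstdint>

// V_ALIGNBIT_B32 per the ISA pseudocode: ({S0, S1} >> S2[4:0]) & 0xffffffff.
static uint32_t alignbit(uint32_t s0, uint32_t s1, uint32_t s2) {
  uint64_t concat = (uint64_t(s0) << 32) | s1;
  return uint32_t((concat >> (s2 & 0x1fu)) & 0xffffffffull);
}

// llvm.fshr.i32: funnel shift right; the shift amount is taken modulo 32.
static uint32_t fshr32(uint32_t src0, uint32_t src1, uint32_t src2) {
  uint64_t concat = (uint64_t(src0) << 32) | src1;
  return uint32_t(concat >> (src2 % 32u));
}

int main() {
  const uint32_t a = 0x12345678u, b = 0x9ABCDEF0u;
  for (uint32_t s = 0; s < 64; ++s)
    assert(alignbit(a, b, s) == fshr32(a, b, s));
  return 0;
}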
So, I can expand the V_ALIGNBIT through bit manipulation as:
Concat: S1 | (S0 << 32)
Shift: ((S1 | (S0 << 32)) >> S2)
Break the shift: (S1 >> S2) | (S0 << (32 - S2))
The proposed pattern does exactly this.
Additionally, src2 in the fshr pattern should be masked to its low 5 bits: the extra S_ANDs limit the shift amount to the last 5 bits.
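A minimal host-side sketch of that expansion (illustrative; variable names are mine, and n == 0 is special-cased only because a 32-bit shift by 32 is undefined in C++, not to suggest how the selected pattern handles it):

#include <cassert>
#include <cstdint>

// Mirrors the emitted scalar sequence:
//   n   = src2 & 31          (s_and_b32)
//   lo  = src1 >> n          (s_lshr_b32)
//   hi  = src0 << (32 - n)   (s_sub_i32 + s_lshl_b32)
//   res = lo | hi            (s_or_b32)
static uint32_t fshr_scalar(uint32_t src0, uint32_t src1, uint32_t src2) {
  uint32_t n = src2 & 31u;
  if (n == 0)
    return src1; // fshr by 0 (mod 32) returns src1; this guard keeps the C++ shift defined
  return (src1 >> n) | (src0 << (32u - n));
}

// Reference: low 32 bits of the 64-bit concat shifted right.
static uint32_t fshr_ref(uint32_t src0, uint32_t src1, uint32_t src2) {
  return uint32_t((((uint64_t(src0) << 32) | src1)) >> (src2 & 31u));
}

int main() {
  for (uint32_t s = 0; s < 64; ++s)
    assert(fshr_scalar(0xDEADBEEFu, 0x01234567u, s) == fshr_ref(0xDEADBEEFu, 0x01234567u, s));
  return 0;
}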