
Commit b8add37

[AMDGPU] Add pattern to select scalar ops for fshr with uniform operands (#165295)
Reasoning behind the proposed change: this helps us move away from selecting v_alignbit for fshr with uniform operands.

V_ALIGNBIT is defined in the ISA as:

  D0.u32 = 32'U(({ S0.u32, S1.u32 } >> S2.u32[4:0]) & 0xffffffffLL)

Note: S0 carries the MSBs and S1 carries the LSBs of the value being aligned. I interpret that as concat(S0, S1) >> (S2 & 0x1F), with the & 0xffffffff returning the lower 32 bits.

fshr has the form:

  fshr i32 %src0, i32 %src1, i32 %src2

where concat(%src0, %src1) is the 64-bit value formed by %src0 as the high 32 bits and %src1 as the low 32 bits, %src2 is the shift amount, and only the lower 32 bits are returned. So the two operations are identical.

V_ALIGNBIT can therefore be expanded through bit manipulation:

  Concat:          S1 | (S0 << 32)
  Shift:           (S1 | (S0 << 32)) >> S2
  Break the shift: (S1 >> S2) | (S0 << (32 - S2))

The proposed pattern does exactly this. Additionally, src2 in the fshr pattern:

* must be in the range 0-31;
* if the shift is >= 32, hardware semantics differ, and it must be handled with extra instructions.

The extra S_ANDs limit the selection to the low 5 bits of the shift amount.
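To make the equivalence concrete, here is a minimal standalone C++ check of the identity above. It is an illustrative sketch, not code from this patch; fshr32 and expanded are hypothetical helper names.

#include <cassert>
#include <cstdint>

// Reference fshr semantics for i32: the low 32 bits of
// concat(hi, lo) >> (amt mod 32).
static uint32_t fshr32(uint32_t Hi, uint32_t Lo, uint32_t Amt) {
  uint64_t Concat = ((uint64_t)Hi << 32) | Lo;
  return (uint32_t)(Concat >> (Amt & 31));
}

// The broken-up shift from the message: (S1 >> S2) | (S0 << (32 - S2)).
// S2 == 0 is special-cased because a 32-bit shift by 32 is undefined in C++.
static uint32_t expanded(uint32_t S0, uint32_t S1, uint32_t S2) {
  S2 &= 31; // the role the extra S_AND plays in the selected pattern
  return S2 == 0 ? S1 : (S1 >> S2) | (S0 << (32 - S2));
}

int main() {
  for (uint32_t Amt = 0; Amt < 64; ++Amt)
    assert(fshr32(0xDEADBEEFu, 0x12345678u, Amt) ==
           expanded(0xDEADBEEFu, 0x12345678u, Amt));
  return 0;
}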
1 parent ea56ca2 commit b8add37

22 files changed: +22167 −19400 lines

llvm/lib/Target/AMDGPU/SIInstructions.td

Lines changed: 11 additions & 0 deletions
@@ -791,6 +791,17 @@ def : GCNPat<
   (SI_CALL_ISEL $src0, (i64 0))
 >;
 
+// Funnel shift right (fshr) patterns for uniform inputs.
+// These patterns implement this using scalar instructions by constructing a 64-bit
+// value {a, b} and performing a single right shift.
+def : GCNPat<(UniformTernaryFrag<fshr> i32:$src0, i32:$src1, i32:$src2),
+  (i32 (EXTRACT_SUBREG (S_LSHR_B64 (REG_SEQUENCE SReg_64, $src1, sub0, $src0, sub1), (S_AND_B32 $src2, (i32 31))), sub0))
+>;
+
+def : GCNPat<(UniformTernaryFrag<fshr> i32:$src0, i32:$src1, (i32 ShiftAmt32Imm:$src2)),
+  (i32 (EXTRACT_SUBREG (S_LSHR_B64 (REG_SEQUENCE SReg_64, $src1, sub0, $src0, sub1), $src2), sub0))
+>;
+
 // Wrapper around s_swappc_b64 with extra $callee parameter to track
 // the called function after regalloc.
 def SI_CALL : SPseudoInstSI <
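Read operationally, the first pattern computes the following, sketched here in C++ with the machine operation each step mirrors noted in the comments (the function name is illustrative, under the interpretation above):

#include <cstdint>

// Step-by-step model of the scalar sequence the first pattern selects
// for fshr(Src0, Src1, Src2) with uniform operands.
uint32_t scalarFshrModel(uint32_t Src0, uint32_t Src1, uint32_t Src2) {
  // REG_SEQUENCE SReg_64, $src1, sub0, $src0, sub1: pack {src0:src1}.
  uint64_t Pair = ((uint64_t)Src0 << 32) | Src1;
  // S_AND_B32 $src2, 31: keep only the low 5 bits of the shift amount.
  uint32_t Amt = Src2 & 31;
  // S_LSHR_B64: a single 64-bit logical right shift does the funnel shift.
  uint64_t Shifted = Pair >> Amt;
  // EXTRACT_SUBREG ..., sub0: the result is the low 32 bits.
  return (uint32_t)Shifted;
}

The second pattern covers shift amounts that are already suitable immediates (ShiftAmt32Imm), where the masking S_AND_B32 can be dropped.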

llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll

Lines changed: 13018 additions & 11672 deletions
Large diffs are not rendered by default.

llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll

Lines changed: 681 additions & 655 deletions
Large diffs are not rendered by default.

llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll

Lines changed: 1345 additions & 1282 deletions
Large diffs are not rendered by default.

llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll

Lines changed: 109 additions & 107 deletions
Large diffs are not rendered by default.

llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.48bit.ll

Lines changed: 46 additions & 46 deletions
@@ -290,34 +290,34 @@ define inreg <3 x half> @bitcast_v3bf16_to_v3f16_scalar(<3 x bfloat> inreg %a, i
 ; VI-NEXT: s_cbranch_execnz .LBB1_4
 ; VI-NEXT: .LBB1_2: ; %cmp.true
 ; VI-NEXT: s_lshl_b32 s4, s17, 16
-; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
+; VI-NEXT: v_mov_b32_e32 v1, 0x40c00000
+; VI-NEXT: v_add_f32_e32 v0, s4, v1
+; VI-NEXT: v_bfe_u32 v2, v0, 16, 1
+; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0
 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
 ; VI-NEXT: s_lshl_b32 s4, s16, 16
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_add_f32_e32 v0, s4, v0
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
+; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: v_add_f32_e32 v0, s4, v1
+; VI-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
 ; VI-NEXT: v_bfe_u32 v3, v0, 16, 1
 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v0
 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
+; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v0
 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: v_add_f32_e32 v1, s4, v1
 ; VI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; VI-NEXT: v_alignbit_b32 v0, v0, v2, 16
-; VI-NEXT: v_mov_b32_e32 v2, 0x7fc00000
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_bfe_u32 v3, v1, 16, 1
+; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1
+; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
+; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; VI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1]
+; VI-NEXT: v_mov_b32_e32 v1, 0x7fc00000
+; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; VI-NEXT: s_setpc_b64 s[30:31]
 ; VI-NEXT: .LBB1_3:
 ; VI-NEXT: s_branch .LBB1_2
@@ -964,16 +964,16 @@ define inreg <3 x i16> @bitcast_v3bf16_to_v3i16_scalar(<3 x bfloat> inreg %a, i3
 ; SI-NEXT: s_cbranch_execnz .LBB5_3
 ; SI-NEXT: .LBB5_2: ; %cmp.true
 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5
+; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v1
+; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v4
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5
-; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
 ; SI-NEXT: v_lshr_b64 v[3:4], v[1:2], 16
-; SI-NEXT: v_alignbit_b32 v0, v5, v0, 16
 ; SI-NEXT: .LBB5_3: ; %end
 ; SI-NEXT: v_mov_b32_e32 v1, v3
 ; SI-NEXT: s_setpc_b64 s[30:31]
@@ -992,34 +992,34 @@ define inreg <3 x i16> @bitcast_v3bf16_to_v3i16_scalar(<3 x bfloat> inreg %a, i3
 ; VI-NEXT: s_cbranch_execnz .LBB5_4
 ; VI-NEXT: .LBB5_2: ; %cmp.true
 ; VI-NEXT: s_lshl_b32 s4, s17, 16
-; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000
-; VI-NEXT: v_add_f32_e32 v1, s4, v0
-; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
+; VI-NEXT: v_mov_b32_e32 v1, 0x40c00000
+; VI-NEXT: v_add_f32_e32 v0, s4, v1
+; VI-NEXT: v_bfe_u32 v2, v0, 16, 1
+; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0
 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
 ; VI-NEXT: s_lshl_b32 s4, s16, 16
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; VI-NEXT: v_add_f32_e32 v2, s4, v0
-; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
-; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
-; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; VI-NEXT: v_add_f32_e32 v0, s4, v0
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
+; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: v_add_f32_e32 v0, s4, v1
+; VI-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
 ; VI-NEXT: v_bfe_u32 v3, v0, 16, 1
 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v0
 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
+; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v0
 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: v_add_f32_e32 v1, s4, v1
 ; VI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; VI-NEXT: v_alignbit_b32 v0, v0, v2, 16
-; VI-NEXT: v_mov_b32_e32 v2, 0x7fc00000
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_bfe_u32 v3, v1, 16, 1
+; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1
+; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
+; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; VI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1]
+; VI-NEXT: v_mov_b32_e32 v1, 0x7fc00000
+; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; VI-NEXT: s_setpc_b64 s[30:31]
 ; VI-NEXT: .LBB5_3:
 ; VI-NEXT: s_branch .LBB5_2
