Skip to content

Commit 7b96cd7

Browse files
authored
[AMDGPU] Use "v_bfi_b32 x, y, -1" to implement (y | ~x) (#156653)
1 parent 8f1c39f commit 7b96cd7

File tree

8 files changed

+47
-60
lines changed

8 files changed

+47
-60
lines changed

llvm/lib/Target/AMDGPU/SIInstructions.td

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2496,6 +2496,22 @@ def : AMDGPUPatIgnoreCopies <
24962496
(i32 (EXTRACT_SUBREG VReg_64:$z, sub1))), sub1)
24972497
>;
24982498

2499+
// (y | ~x) -> v_bfi_b32 x, y, -1, since BFI(x, y, -1) = (x & y) | (~x & -1) = y | ~x.
2500+
def : AMDGPUPatIgnoreCopies <
2501+
(DivergentBinFrag<or> i32:$y, (not_oneuse i32:$x)),
2502+
(V_BFI_B32_e64 VSrc_b32:$x, VSrc_b32:$y, (i32 -1))
2503+
>;
2504+
2505+
// 64-bit version of (y | ~x): one V_BFI_B32 per 32-bit half (sub0, sub1) of $x and $y, recombined with REG_SEQUENCE.
2506+
def : AMDGPUPatIgnoreCopies <
2507+
(DivergentBinFrag<or> i64:$y, (not_oneuse i64:$x)),
2508+
(REG_SEQUENCE VReg_64,
2509+
(V_BFI_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$x, sub0)),
2510+
(i32 (EXTRACT_SUBREG VReg_64:$y, sub0)), (i32 -1)), sub0,
2511+
(V_BFI_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$x, sub1)),
2512+
(i32 (EXTRACT_SUBREG VReg_64:$y, sub1)), (i32 -1)), sub1)
2513+
>;
2514+
24992515
// SHA-256 Ch function
25002516
// z ^ (x & (y ^ z))
25012517
def : AMDGPUPatIgnoreCopies <

llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll

Lines changed: 14 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -99,15 +99,13 @@ define i32 @v_orn2_i32(i32 %src0, i32 %src1) {
9999
; GCN-LABEL: v_orn2_i32:
100100
; GCN: ; %bb.0:
101101
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
102-
; GCN-NEXT: v_not_b32_e32 v1, v1
103-
; GCN-NEXT: v_or_b32_e32 v0, v0, v1
102+
; GCN-NEXT: v_bfi_b32 v0, v1, v0, -1
104103
; GCN-NEXT: s_setpc_b64 s[30:31]
105104
;
106105
; GFX10PLUS-LABEL: v_orn2_i32:
107106
; GFX10PLUS: ; %bb.0:
108107
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
109-
; GFX10PLUS-NEXT: v_not_b32_e32 v1, v1
110-
; GFX10PLUS-NEXT: v_or_b32_e32 v0, v0, v1
108+
; GFX10PLUS-NEXT: v_bfi_b32 v0, v1, v0, -1
111109
; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
112110
%not.src1 = xor i32 %src1, -1
113111
%or = or i32 %src0, %not.src1
@@ -117,14 +115,12 @@ define i32 @v_orn2_i32(i32 %src0, i32 %src1) {
117115
define amdgpu_ps float @v_orn2_i32_sv(i32 inreg %src0, i32 %src1) {
118116
; GCN-LABEL: v_orn2_i32_sv:
119117
; GCN: ; %bb.0:
120-
; GCN-NEXT: v_not_b32_e32 v0, v0
121-
; GCN-NEXT: v_or_b32_e32 v0, s2, v0
118+
; GCN-NEXT: v_bfi_b32 v0, v0, s2, -1
122119
; GCN-NEXT: ; return to shader part epilog
123120
;
124121
; GFX10PLUS-LABEL: v_orn2_i32_sv:
125122
; GFX10PLUS: ; %bb.0:
126-
; GFX10PLUS-NEXT: v_not_b32_e32 v0, v0
127-
; GFX10PLUS-NEXT: v_or_b32_e32 v0, s2, v0
123+
; GFX10PLUS-NEXT: v_bfi_b32 v0, v0, s2, -1
128124
; GFX10PLUS-NEXT: ; return to shader part epilog
129125
%not.src1 = xor i32 %src1, -1
130126
%or = or i32 %src0, %not.src1
@@ -135,14 +131,12 @@ define amdgpu_ps float @v_orn2_i32_sv(i32 inreg %src0, i32 %src1) {
135131
define amdgpu_ps float @v_orn2_i32_vs(i32 %src0, i32 inreg %src1) {
136132
; GCN-LABEL: v_orn2_i32_vs:
137133
; GCN: ; %bb.0:
138-
; GCN-NEXT: s_not_b32 s0, s2
139-
; GCN-NEXT: v_or_b32_e32 v0, s0, v0
134+
; GCN-NEXT: v_bfi_b32 v0, s2, v0, -1
140135
; GCN-NEXT: ; return to shader part epilog
141136
;
142137
; GFX10PLUS-LABEL: v_orn2_i32_vs:
143138
; GFX10PLUS: ; %bb.0:
144-
; GFX10PLUS-NEXT: s_not_b32 s0, s2
145-
; GFX10PLUS-NEXT: v_or_b32_e32 v0, s0, v0
139+
; GFX10PLUS-NEXT: v_bfi_b32 v0, s2, v0, -1
146140
; GFX10PLUS-NEXT: ; return to shader part epilog
147141
%not.src1 = xor i32 %src1, -1
148142
%or = or i32 %src0, %not.src1
@@ -247,19 +241,15 @@ define i64 @v_orn2_i64(i64 %src0, i64 %src1) {
247241
; GCN-LABEL: v_orn2_i64:
248242
; GCN: ; %bb.0:
249243
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
250-
; GCN-NEXT: v_not_b32_e32 v2, v2
251-
; GCN-NEXT: v_not_b32_e32 v3, v3
252-
; GCN-NEXT: v_or_b32_e32 v0, v0, v2
253-
; GCN-NEXT: v_or_b32_e32 v1, v1, v3
244+
; GCN-NEXT: v_bfi_b32 v0, v2, v0, -1
245+
; GCN-NEXT: v_bfi_b32 v1, v3, v1, -1
254246
; GCN-NEXT: s_setpc_b64 s[30:31]
255247
;
256248
; GFX10PLUS-LABEL: v_orn2_i64:
257249
; GFX10PLUS: ; %bb.0:
258250
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
259-
; GFX10PLUS-NEXT: v_not_b32_e32 v2, v2
260-
; GFX10PLUS-NEXT: v_not_b32_e32 v3, v3
261-
; GFX10PLUS-NEXT: v_or_b32_e32 v0, v0, v2
262-
; GFX10PLUS-NEXT: v_or_b32_e32 v1, v1, v3
251+
; GFX10PLUS-NEXT: v_bfi_b32 v0, v2, v0, -1
252+
; GFX10PLUS-NEXT: v_bfi_b32 v1, v3, v1, -1
263253
; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
264254
%not.src1 = xor i64 %src1, -1
265255
%or = or i64 %src0, %not.src1
@@ -269,18 +259,14 @@ define i64 @v_orn2_i64(i64 %src0, i64 %src1) {
269259
define amdgpu_ps <2 x float> @v_orn2_i64_sv(i64 inreg %src0, i64 %src1) {
270260
; GCN-LABEL: v_orn2_i64_sv:
271261
; GCN: ; %bb.0:
272-
; GCN-NEXT: v_not_b32_e32 v0, v0
273-
; GCN-NEXT: v_not_b32_e32 v1, v1
274-
; GCN-NEXT: v_or_b32_e32 v0, s2, v0
275-
; GCN-NEXT: v_or_b32_e32 v1, s3, v1
262+
; GCN-NEXT: v_bfi_b32 v0, v0, s2, -1
263+
; GCN-NEXT: v_bfi_b32 v1, v1, s3, -1
276264
; GCN-NEXT: ; return to shader part epilog
277265
;
278266
; GFX10PLUS-LABEL: v_orn2_i64_sv:
279267
; GFX10PLUS: ; %bb.0:
280-
; GFX10PLUS-NEXT: v_not_b32_e32 v0, v0
281-
; GFX10PLUS-NEXT: v_not_b32_e32 v1, v1
282-
; GFX10PLUS-NEXT: v_or_b32_e32 v0, s2, v0
283-
; GFX10PLUS-NEXT: v_or_b32_e32 v1, s3, v1
268+
; GFX10PLUS-NEXT: v_bfi_b32 v0, v0, s2, -1
269+
; GFX10PLUS-NEXT: v_bfi_b32 v1, v1, s3, -1
284270
; GFX10PLUS-NEXT: ; return to shader part epilog
285271
%not.src1 = xor i64 %src1, -1
286272
%or = or i64 %src0, %not.src1

llvm/test/CodeGen/AMDGPU/andorn2.ll

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -72,8 +72,7 @@ entry:
7272
}
7373

7474
; GCN-LABEL: {{^}}vector_orn2_i32_s_v_one_use
75-
; GCN: v_not_b32
76-
; GCN: v_or_b32
75+
; GCN: v_bfi_b32
7776
define amdgpu_kernel void @vector_orn2_i32_s_v_one_use(
7877
ptr addrspace(1) %r0, i32 %s) {
7978
entry:
@@ -85,8 +84,7 @@ entry:
8584
}
8685

8786
; GCN-LABEL: {{^}}vector_orn2_i32_v_s_one_use
88-
; GCN: s_not_b32
89-
; GCN: v_or_b32
87+
; GCN: v_bfi_b32
9088
define amdgpu_kernel void @vector_orn2_i32_v_s_one_use(
9189
ptr addrspace(1) %r0, i32 %s) {
9290
entry:

llvm/test/CodeGen/AMDGPU/atomicrmw-nand.ll

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,7 @@ define i32 @atomic_nand_i32_lds(ptr addrspace(3) %ptr) nounwind {
1111
; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
1212
; GCN-NEXT: s_waitcnt lgkmcnt(0)
1313
; GCN-NEXT: v_mov_b32_e32 v2, v1
14-
; GCN-NEXT: v_not_b32_e32 v1, v2
15-
; GCN-NEXT: v_or_b32_e32 v1, -5, v1
14+
; GCN-NEXT: v_bfi_b32 v1, v2, -5, -1
1615
; GCN-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1
1716
; GCN-NEXT: s_waitcnt lgkmcnt(0)
1817
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
@@ -37,8 +36,7 @@ define i32 @atomic_nand_i32_global(ptr addrspace(1) %ptr) nounwind {
3736
; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
3837
; GCN-NEXT: s_waitcnt vmcnt(0)
3938
; GCN-NEXT: v_mov_b32_e32 v3, v2
40-
; GCN-NEXT: v_not_b32_e32 v2, v3
41-
; GCN-NEXT: v_or_b32_e32 v2, -5, v2
39+
; GCN-NEXT: v_bfi_b32 v2, v3, -5, -1
4240
; GCN-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
4341
; GCN-NEXT: s_waitcnt vmcnt(0)
4442
; GCN-NEXT: buffer_wbinvl1_vol
@@ -64,8 +62,7 @@ define i32 @atomic_nand_i32_flat(ptr %ptr) nounwind {
6462
; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
6563
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6664
; GCN-NEXT: v_mov_b32_e32 v3, v2
67-
; GCN-NEXT: v_not_b32_e32 v2, v3
68-
; GCN-NEXT: v_or_b32_e32 v2, -5, v2
65+
; GCN-NEXT: v_bfi_b32 v2, v3, -5, -1
6966
; GCN-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
7067
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7168
; GCN-NEXT: buffer_wbinvl1_vol

llvm/test/CodeGen/AMDGPU/bitop3.ll

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -183,9 +183,8 @@ define amdgpu_ps float @test_63(i32 %a, i32 %b) {
183183
;
184184
; GFX950-GISEL-LABEL: test_63:
185185
; GFX950-GISEL: ; %bb.0:
186-
; GFX950-GISEL-NEXT: v_not_b32_e32 v0, v0
187186
; GFX950-GISEL-NEXT: v_not_b32_e32 v1, v1
188-
; GFX950-GISEL-NEXT: v_or_b32_e32 v0, v0, v1
187+
; GFX950-GISEL-NEXT: v_bfi_b32 v0, v0, v1, -1
189188
; GFX950-GISEL-NEXT: ; return to shader part epilog
190189
;
191190
; GFX1250-SDAG-LABEL: test_63:
@@ -195,10 +194,9 @@ define amdgpu_ps float @test_63(i32 %a, i32 %b) {
195194
;
196195
; GFX1250-GISEL-LABEL: test_63:
197196
; GFX1250-GISEL: ; %bb.0:
198-
; GFX1250-GISEL-NEXT: v_not_b32_e32 v0, v0
199197
; GFX1250-GISEL-NEXT: v_not_b32_e32 v1, v1
200198
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
201-
; GFX1250-GISEL-NEXT: v_or_b32_e32 v0, v0, v1
199+
; GFX1250-GISEL-NEXT: v_bfi_b32 v0, v0, v1, -1
202200
; GFX1250-GISEL-NEXT: ; return to shader part epilog
203201
%nota = xor i32 %a, -1
204202
%notb = xor i32 %b, -1

llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -128,8 +128,7 @@ define protected amdgpu_kernel void @nand(ptr addrspace(1) %p, ptr addrspace(1)
128128
; CHECK-NEXT: .LBB5_1: ; %atomicrmw.start
129129
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
130130
; CHECK-NEXT: v_mov_b32_e32 v3, v0
131-
; CHECK-NEXT: v_not_b32_e32 v0, v3
132-
; CHECK-NEXT: v_or_b32_e32 v2, -2, v0
131+
; CHECK-NEXT: v_bfi_b32 v2, v3, -2, -1
133132
; CHECK-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[0:1] glc
134133
; CHECK-NEXT: s_waitcnt vmcnt(0)
135134
; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3

llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll

Lines changed: 8 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -146,8 +146,7 @@ define i32 @atomic_nand_i32_global(ptr addrspace(1) %ptr) nounwind {
146146
; GFX9-NEXT: .LBB1_1: ; %atomicrmw.start
147147
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
148148
; GFX9-NEXT: v_mov_b32_e32 v3, v2
149-
; GFX9-NEXT: v_not_b32_e32 v2, v3
150-
; GFX9-NEXT: v_or_b32_e32 v2, -5, v2
149+
; GFX9-NEXT: v_bfi_b32 v2, v3, -5, -1
151150
; GFX9-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
152151
; GFX9-NEXT: s_waitcnt vmcnt(0)
153152
; GFX9-NEXT: buffer_wbinvl1_vol
@@ -169,8 +168,7 @@ define i32 @atomic_nand_i32_global(ptr addrspace(1) %ptr) nounwind {
169168
; GFX90A-NEXT: .LBB1_1: ; %atomicrmw.start
170169
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
171170
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
172-
; GFX90A-NEXT: v_not_b32_e32 v2, v3
173-
; GFX90A-NEXT: v_or_b32_e32 v2, -5, v2
171+
; GFX90A-NEXT: v_bfi_b32 v2, v3, -5, -1
174172
; GFX90A-NEXT: buffer_wbl2
175173
; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
176174
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -194,8 +192,7 @@ define i32 @atomic_nand_i32_global(ptr addrspace(1) %ptr) nounwind {
194192
; GFX10-NEXT: .LBB1_1: ; %atomicrmw.start
195193
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
196194
; GFX10-NEXT: v_mov_b32_e32 v3, v2
197-
; GFX10-NEXT: v_not_b32_e32 v2, v3
198-
; GFX10-NEXT: v_or_b32_e32 v2, -5, v2
195+
; GFX10-NEXT: v_bfi_b32 v2, v3, -5, -1
199196
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
200197
; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
201198
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -219,8 +216,7 @@ define i32 @atomic_nand_i32_global(ptr addrspace(1) %ptr) nounwind {
219216
; GFX9-FLATSCR-NEXT: .LBB1_1: ; %atomicrmw.start
220217
; GFX9-FLATSCR-NEXT: ; =>This Inner Loop Header: Depth=1
221218
; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, v2
222-
; GFX9-FLATSCR-NEXT: v_not_b32_e32 v2, v3
223-
; GFX9-FLATSCR-NEXT: v_or_b32_e32 v2, -5, v2
219+
; GFX9-FLATSCR-NEXT: v_bfi_b32 v2, v3, -5, -1
224220
; GFX9-FLATSCR-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
225221
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
226222
; GFX9-FLATSCR-NEXT: buffer_wbinvl1_vol
@@ -242,9 +238,8 @@ define i32 @atomic_nand_i32_global(ptr addrspace(1) %ptr) nounwind {
242238
; GFX11-NEXT: .LBB1_1: ; %atomicrmw.start
243239
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
244240
; GFX11-NEXT: v_mov_b32_e32 v3, v2
245-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
246-
; GFX11-NEXT: v_not_b32_e32 v2, v3
247-
; GFX11-NEXT: v_or_b32_e32 v2, -5, v2
241+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
242+
; GFX11-NEXT: v_bfi_b32 v2, v3, -5, -1
248243
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
249244
; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
250245
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -273,9 +268,8 @@ define i32 @atomic_nand_i32_global(ptr addrspace(1) %ptr) nounwind {
273268
; GFX12-NEXT: .LBB1_1: ; %atomicrmw.start
274269
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
275270
; GFX12-NEXT: v_mov_b32_e32 v3, v2
276-
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
277-
; GFX12-NEXT: v_not_b32_e32 v2, v3
278-
; GFX12-NEXT: v_or_b32_e32 v2, -5, v2
271+
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
272+
; GFX12-NEXT: v_bfi_b32 v2, v3, -5, -1
279273
; GFX12-NEXT: global_wb scope:SCOPE_SYS
280274
; GFX12-NEXT: s_wait_storecnt 0x0
281275
; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS

llvm/test/CodeGen/AMDGPU/private-memory-atomics.ll

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -282,8 +282,7 @@ define i32 @atomicrmw_nand_private_i32(ptr addrspace(5) %ptr) {
282282
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
283283
; GCN-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen
284284
; GCN-NEXT: s_waitcnt vmcnt(0)
285-
; GCN-NEXT: v_not_b32_e32 v2, v1
286-
; GCN-NEXT: v_or_b32_e32 v2, -5, v2
285+
; GCN-NEXT: v_bfi_b32 v2, v1, -5, -1
287286
; GCN-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
288287
; GCN-NEXT: v_mov_b32_e32 v0, v1
289288
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)

0 commit comments

Comments
 (0)