Skip to content

Commit d4de780

Browse files
authored
[AMDGPU] Use "v_bfi_b32 x, 0, z" to implement (z & ~x) (#156636)
1 parent a1bfa2f commit d4de780

File tree

10 files changed

+1274
-1456
lines changed

10 files changed

+1274
-1456
lines changed

llvm/lib/Target/AMDGPU/SIInstructions.td

Lines changed: 16 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -2480,6 +2480,22 @@ def : AMDGPUPatIgnoreCopies <
24802480
(i32 (EXTRACT_SUBREG VReg_64:$z, sub1))), sub1)
24812481
>;
24822482

2483+
// (z & ~x)
// V_BFI_B32 computes (src0 & src1) | (~src0 & src2), so with src1 = 0 it
// yields (~x & z) in one VALU instruction instead of a v_not + v_and pair.
// NOTE(review): not_oneuse presumably restricts the match to a `not` whose
// result has a single use -- confirm against its PatFrag definition.
2484+
def : AMDGPUPatIgnoreCopies <
2485+
(DivergentBinFrag<and> i32:$z, (not_oneuse i32:$x)),
2486+
(V_BFI_B32_e64 VSrc_b32:$x, (i32 0), VSrc_b32:$z)
2487+
>;
2488+
2489+
// 64-bit version
// Splits $x and $z into their sub0/sub1 32-bit halves, applies the 32-bit
// BFI with a zero middle operand to each half, and reassembles the two
// results into a VReg_64 with REG_SEQUENCE.
2490+
def : AMDGPUPatIgnoreCopies <
2491+
(DivergentBinFrag<and> i64:$z, (not_oneuse i64:$x)),
2492+
(REG_SEQUENCE VReg_64,
2493+
(V_BFI_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$x, sub0)), (i32 0),
2494+
(i32 (EXTRACT_SUBREG VReg_64:$z, sub0))), sub0,
2495+
(V_BFI_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$x, sub1)), (i32 0),
2496+
(i32 (EXTRACT_SUBREG VReg_64:$z, sub1))), sub1)
2497+
>;
2498+
24832499
// SHA-256 Ch function
24842500
// z ^ (x & (y ^ z))
24852501
def : AMDGPUPatIgnoreCopies <

llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll

Lines changed: 14 additions & 28 deletions
Original file line number | Diff line number | Diff line change
@@ -99,15 +99,13 @@ define i32 @v_andn2_i32(i32 %src0, i32 %src1) {
9999
; GCN-LABEL: v_andn2_i32:
100100
; GCN: ; %bb.0:
101101
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
102-
; GCN-NEXT: v_not_b32_e32 v1, v1
103-
; GCN-NEXT: v_and_b32_e32 v0, v0, v1
102+
; GCN-NEXT: v_bfi_b32 v0, v1, 0, v0
104103
; GCN-NEXT: s_setpc_b64 s[30:31]
105104
;
106105
; GFX10PLUS-LABEL: v_andn2_i32:
107106
; GFX10PLUS: ; %bb.0:
108107
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
109-
; GFX10PLUS-NEXT: v_not_b32_e32 v1, v1
110-
; GFX10PLUS-NEXT: v_and_b32_e32 v0, v0, v1
108+
; GFX10PLUS-NEXT: v_bfi_b32 v0, v1, 0, v0
111109
; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
112110
%not.src1 = xor i32 %src1, -1
113111
%and = and i32 %src0, %not.src1
@@ -117,14 +115,12 @@ define i32 @v_andn2_i32(i32 %src0, i32 %src1) {
117115
define amdgpu_ps float @v_andn2_i32_sv(i32 inreg %src0, i32 %src1) {
118116
; GCN-LABEL: v_andn2_i32_sv:
119117
; GCN: ; %bb.0:
120-
; GCN-NEXT: v_not_b32_e32 v0, v0
121-
; GCN-NEXT: v_and_b32_e32 v0, s2, v0
118+
; GCN-NEXT: v_bfi_b32 v0, v0, 0, s2
122119
; GCN-NEXT: ; return to shader part epilog
123120
;
124121
; GFX10PLUS-LABEL: v_andn2_i32_sv:
125122
; GFX10PLUS: ; %bb.0:
126-
; GFX10PLUS-NEXT: v_not_b32_e32 v0, v0
127-
; GFX10PLUS-NEXT: v_and_b32_e32 v0, s2, v0
123+
; GFX10PLUS-NEXT: v_bfi_b32 v0, v0, 0, s2
128124
; GFX10PLUS-NEXT: ; return to shader part epilog
129125
%not.src1 = xor i32 %src1, -1
130126
%and = and i32 %src0, %not.src1
@@ -135,14 +131,12 @@ define amdgpu_ps float @v_andn2_i32_sv(i32 inreg %src0, i32 %src1) {
135131
define amdgpu_ps float @v_andn2_i32_vs(i32 %src0, i32 inreg %src1) {
136132
; GCN-LABEL: v_andn2_i32_vs:
137133
; GCN: ; %bb.0:
138-
; GCN-NEXT: s_not_b32 s0, s2
139-
; GCN-NEXT: v_and_b32_e32 v0, s0, v0
134+
; GCN-NEXT: v_bfi_b32 v0, s2, 0, v0
140135
; GCN-NEXT: ; return to shader part epilog
141136
;
142137
; GFX10PLUS-LABEL: v_andn2_i32_vs:
143138
; GFX10PLUS: ; %bb.0:
144-
; GFX10PLUS-NEXT: s_not_b32 s0, s2
145-
; GFX10PLUS-NEXT: v_and_b32_e32 v0, s0, v0
139+
; GFX10PLUS-NEXT: v_bfi_b32 v0, s2, 0, v0
146140
; GFX10PLUS-NEXT: ; return to shader part epilog
147141
%not.src1 = xor i32 %src1, -1
148142
%and = and i32 %src0, %not.src1
@@ -247,19 +241,15 @@ define i64 @v_andn2_i64(i64 %src0, i64 %src1) {
247241
; GCN-LABEL: v_andn2_i64:
248242
; GCN: ; %bb.0:
249243
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
250-
; GCN-NEXT: v_not_b32_e32 v2, v2
251-
; GCN-NEXT: v_not_b32_e32 v3, v3
252-
; GCN-NEXT: v_and_b32_e32 v0, v0, v2
253-
; GCN-NEXT: v_and_b32_e32 v1, v1, v3
244+
; GCN-NEXT: v_bfi_b32 v0, v2, 0, v0
245+
; GCN-NEXT: v_bfi_b32 v1, v3, 0, v1
254246
; GCN-NEXT: s_setpc_b64 s[30:31]
255247
;
256248
; GFX10PLUS-LABEL: v_andn2_i64:
257249
; GFX10PLUS: ; %bb.0:
258250
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
259-
; GFX10PLUS-NEXT: v_not_b32_e32 v2, v2
260-
; GFX10PLUS-NEXT: v_not_b32_e32 v3, v3
261-
; GFX10PLUS-NEXT: v_and_b32_e32 v0, v0, v2
262-
; GFX10PLUS-NEXT: v_and_b32_e32 v1, v1, v3
251+
; GFX10PLUS-NEXT: v_bfi_b32 v0, v2, 0, v0
252+
; GFX10PLUS-NEXT: v_bfi_b32 v1, v3, 0, v1
263253
; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
264254
%not.src1 = xor i64 %src1, -1
265255
%and = and i64 %src0, %not.src1
@@ -269,18 +259,14 @@ define i64 @v_andn2_i64(i64 %src0, i64 %src1) {
269259
define amdgpu_ps <2 x float> @v_andn2_i64_sv(i64 inreg %src0, i64 %src1) {
270260
; GCN-LABEL: v_andn2_i64_sv:
271261
; GCN: ; %bb.0:
272-
; GCN-NEXT: v_not_b32_e32 v0, v0
273-
; GCN-NEXT: v_not_b32_e32 v1, v1
274-
; GCN-NEXT: v_and_b32_e32 v0, s2, v0
275-
; GCN-NEXT: v_and_b32_e32 v1, s3, v1
262+
; GCN-NEXT: v_bfi_b32 v0, v0, 0, s2
263+
; GCN-NEXT: v_bfi_b32 v1, v1, 0, s3
276264
; GCN-NEXT: ; return to shader part epilog
277265
;
278266
; GFX10PLUS-LABEL: v_andn2_i64_sv:
279267
; GFX10PLUS: ; %bb.0:
280-
; GFX10PLUS-NEXT: v_not_b32_e32 v0, v0
281-
; GFX10PLUS-NEXT: v_not_b32_e32 v1, v1
282-
; GFX10PLUS-NEXT: v_and_b32_e32 v0, s2, v0
283-
; GFX10PLUS-NEXT: v_and_b32_e32 v1, s3, v1
268+
; GFX10PLUS-NEXT: v_bfi_b32 v0, v0, 0, s2
269+
; GFX10PLUS-NEXT: v_bfi_b32 v1, v1, 0, s3
284270
; GFX10PLUS-NEXT: ; return to shader part epilog
285271
%not.src1 = xor i64 %src1, -1
286272
%and = and i64 %src0, %not.src1

0 commit comments

Comments
 (0)