Skip to content

Commit 7a9cac2

Browse files
committed
Allow shifting instead of masks if the types are legal.
1 parent bf34b2e commit 7a9cac2

File tree

3 files changed

+40
-28
lines changed

3 files changed

+40
-28
lines changed

llvm/include/llvm/CodeGen/TargetLowering.h

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -839,8 +839,11 @@ class LLVM_ABI TargetLoweringBase {
839839
/// Return true if the variant with 2 variable shifts is preferred.
840840
/// Return false if there is no preference.
841841
virtual bool shouldFoldMaskToVariableShiftPair(SDValue X) const {
842-
// By default, let's assume that no one prefers shifts.
843-
return false;
842+
// By default, prefer the shift pair only when a shift is legal for X's type.
843+
EVT VT = X.getValueType();
844+
845+
// Prefer shifts when ISD::SHL is a legal operation for this value type.
846+
return isOperationLegal(ISD::SHL, VT);
844847
}
845848

846849
/// Return true if it is profitable to fold a pair of shifts into a mask.

llvm/test/CodeGen/AMDGPU/extract-lowbits.ll

Lines changed: 29 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -103,16 +103,16 @@ define i32 @bzhi32_c0(i32 %val, i32 %numlowbits) nounwind {
103103
; SI: ; %bb.0:
104104
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
105105
; SI-NEXT: v_sub_i32_e32 v1, vcc, 32, v1
106-
; SI-NEXT: v_lshr_b32_e32 v1, -1, v1
107-
; SI-NEXT: v_and_b32_e32 v0, v1, v0
106+
; SI-NEXT: v_lshlrev_b32_e32 v0, v1, v0
107+
; SI-NEXT: v_lshrrev_b32_e32 v0, v1, v0
108108
; SI-NEXT: s_setpc_b64 s[30:31]
109109
;
110110
; VI-LABEL: bzhi32_c0:
111111
; VI: ; %bb.0:
112112
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
113113
; VI-NEXT: v_sub_u32_e32 v1, vcc, 32, v1
114-
; VI-NEXT: v_lshrrev_b32_e64 v1, v1, -1
115-
; VI-NEXT: v_and_b32_e32 v0, v1, v0
114+
; VI-NEXT: v_lshlrev_b32_e32 v0, v1, v0
115+
; VI-NEXT: v_lshrrev_b32_e32 v0, v1, v0
116116
; VI-NEXT: s_setpc_b64 s[30:31]
117117
%numhighbits = sub i32 32, %numlowbits
118118
%mask = lshr i32 -1, %numhighbits
@@ -121,12 +121,23 @@ define i32 @bzhi32_c0(i32 %val, i32 %numlowbits) nounwind {
121121
}
122122

123123
define i32 @bzhi32_c0_clamp(i32 %val, i32 %numlowbits) nounwind {
124-
; GCN-LABEL: bzhi32_c0_clamp:
125-
; GCN: ; %bb.0:
126-
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
127-
; GCN-NEXT: v_and_b32_e32 v1, 31, v1
128-
; GCN-NEXT: v_bfe_u32 v0, v0, 0, v1
129-
; GCN-NEXT: s_setpc_b64 s[30:31]
124+
; SI-LABEL: bzhi32_c0_clamp:
125+
; SI: ; %bb.0:
126+
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
127+
; SI-NEXT: v_and_b32_e32 v1, 31, v1
128+
; SI-NEXT: v_sub_i32_e32 v1, vcc, 32, v1
129+
; SI-NEXT: v_lshlrev_b32_e32 v0, v1, v0
130+
; SI-NEXT: v_lshrrev_b32_e32 v0, v1, v0
131+
; SI-NEXT: s_setpc_b64 s[30:31]
132+
;
133+
; VI-LABEL: bzhi32_c0_clamp:
134+
; VI: ; %bb.0:
135+
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
136+
; VI-NEXT: v_and_b32_e32 v1, 31, v1
137+
; VI-NEXT: v_sub_u32_e32 v1, vcc, 32, v1
138+
; VI-NEXT: v_lshlrev_b32_e32 v0, v1, v0
139+
; VI-NEXT: v_lshrrev_b32_e32 v0, v1, v0
140+
; VI-NEXT: s_setpc_b64 s[30:31]
130141
%low5bits = and i32 %numlowbits, 31
131142
%numhighbits = sub i32 32, %low5bits
132143
%mask = lshr i32 -1, %numhighbits
@@ -139,16 +150,16 @@ define i32 @bzhi32_c1_indexzext(i32 %val, i8 %numlowbits) nounwind {
139150
; SI: ; %bb.0:
140151
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
141152
; SI-NEXT: v_sub_i32_e32 v1, vcc, 32, v1
142-
; SI-NEXT: v_lshr_b32_e32 v1, -1, v1
143-
; SI-NEXT: v_and_b32_e32 v0, v1, v0
153+
; SI-NEXT: v_lshlrev_b32_e32 v0, v1, v0
154+
; SI-NEXT: v_lshrrev_b32_e32 v0, v1, v0
144155
; SI-NEXT: s_setpc_b64 s[30:31]
145156
;
146157
; VI-LABEL: bzhi32_c1_indexzext:
147158
; VI: ; %bb.0:
148159
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
149160
; VI-NEXT: v_sub_u16_e32 v1, 32, v1
150-
; VI-NEXT: v_lshrrev_b32_e64 v1, v1, -1
151-
; VI-NEXT: v_and_b32_e32 v0, v1, v0
161+
; VI-NEXT: v_lshlrev_b32_e32 v0, v1, v0
162+
; VI-NEXT: v_lshrrev_b32_e32 v0, v1, v0
152163
; VI-NEXT: s_setpc_b64 s[30:31]
153164
%numhighbits = sub i8 32, %numlowbits
154165
%sh_prom = zext i8 %numhighbits to i32
@@ -162,16 +173,16 @@ define i32 @bzhi32_c4_commutative(i32 %val, i32 %numlowbits) nounwind {
162173
; SI: ; %bb.0:
163174
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
164175
; SI-NEXT: v_sub_i32_e32 v1, vcc, 32, v1
165-
; SI-NEXT: v_lshr_b32_e32 v1, -1, v1
166-
; SI-NEXT: v_and_b32_e32 v0, v0, v1
176+
; SI-NEXT: v_lshlrev_b32_e32 v0, v1, v0
177+
; SI-NEXT: v_lshrrev_b32_e32 v0, v1, v0
167178
; SI-NEXT: s_setpc_b64 s[30:31]
168179
;
169180
; VI-LABEL: bzhi32_c4_commutative:
170181
; VI: ; %bb.0:
171182
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
172183
; VI-NEXT: v_sub_u32_e32 v1, vcc, 32, v1
173-
; VI-NEXT: v_lshrrev_b32_e64 v1, v1, -1
174-
; VI-NEXT: v_and_b32_e32 v0, v0, v1
184+
; VI-NEXT: v_lshlrev_b32_e32 v0, v1, v0
185+
; VI-NEXT: v_lshrrev_b32_e32 v0, v1, v0
175186
; VI-NEXT: s_setpc_b64 s[30:31]
176187
%numhighbits = sub i32 32, %numlowbits
177188
%mask = lshr i32 -1, %numhighbits

llvm/test/CodeGen/AMDGPU/r600.extract-lowbits.ll

Lines changed: 6 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -266,7 +266,7 @@ define amdgpu_kernel void @bzhi32_c1_indexzext(i32 %val, i8 %numlowbits, ptr add
266266
; EG: ; %bb.0:
267267
; EG-NEXT: ALU 0, @8, KC0[], KC1[]
268268
; EG-NEXT: TEX 0 @6
269-
; EG-NEXT: ALU 8, @9, KC0[CB0:0-32], KC1[]
269+
; EG-NEXT: ALU 7, @9, KC0[CB0:0-32], KC1[]
270270
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
271271
; EG-NEXT: CF_END
272272
; EG-NEXT: PAD
@@ -279,17 +279,16 @@ define amdgpu_kernel void @bzhi32_c1_indexzext(i32 %val, i8 %numlowbits, ptr add
279279
; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
280280
; EG-NEXT: AND_INT * T0.W, PV.W, literal.x,
281281
; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
282-
; EG-NEXT: LSHR * T0.W, literal.x, PV.W,
283-
; EG-NEXT: -1(nan), 0(0.000000e+00)
284-
; EG-NEXT: AND_INT T0.X, PV.W, KC0[2].Y,
282+
; EG-NEXT: LSHL * T1.W, KC0[2].Y, PV.W,
283+
; EG-NEXT: LSHR T0.X, PV.W, T0.W,
285284
; EG-NEXT: LSHR * T1.X, KC0[2].W, literal.x,
286285
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
287286
;
288287
; CM-LABEL: bzhi32_c1_indexzext:
289288
; CM: ; %bb.0:
290289
; CM-NEXT: ALU 0, @8, KC0[], KC1[]
291290
; CM-NEXT: TEX 0 @6
292-
; CM-NEXT: ALU 8, @9, KC0[CB0:0-32], KC1[]
291+
; CM-NEXT: ALU 7, @9, KC0[CB0:0-32], KC1[]
293292
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
294293
; CM-NEXT: CF_END
295294
; CM-NEXT: PAD
@@ -302,9 +301,8 @@ define amdgpu_kernel void @bzhi32_c1_indexzext(i32 %val, i8 %numlowbits, ptr add
302301
; CM-NEXT: 32(4.484155e-44), 0(0.000000e+00)
303302
; CM-NEXT: AND_INT * T0.W, PV.W, literal.x,
304303
; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
305-
; CM-NEXT: LSHR * T0.W, literal.x, PV.W,
306-
; CM-NEXT: -1(nan), 0(0.000000e+00)
307-
; CM-NEXT: AND_INT * T0.X, PV.W, KC0[2].Y,
304+
; CM-NEXT: LSHL * T1.W, KC0[2].Y, PV.W,
305+
; CM-NEXT: LSHR * T0.X, PV.W, T0.W,
308306
; CM-NEXT: LSHR * T1.X, KC0[2].W, literal.x,
309307
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
310308
%numhighbits = sub i8 32, %numlowbits

0 commit comments

Comments
 (0)