diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 88691b931a8d8..9865143edbdfb 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -839,8 +839,11 @@ class LLVM_ABI TargetLoweringBase {
   /// Return true if the variant with 2 variable shifts is preferred.
   /// Return false if there is no preference.
   virtual bool shouldFoldMaskToVariableShiftPair(SDValue X) const {
-    // By default, let's assume that no one prefers shifts.
-    return false;
+    // By default, let's assume that no one prefers shifts for vectors
+    EVT VT = X.getValueType();
+
+    // Prefer shifts for legal types
+    return isOperationLegal(ISD::SHL, VT);
   }
 
   /// Return true if it is profitable to fold a pair of shifts into a mask.
diff --git a/llvm/test/CodeGen/AMDGPU/extract-lowbits.ll b/llvm/test/CodeGen/AMDGPU/extract-lowbits.ll
index 89bd5f1dc7969..59bc1dd019de3 100644
--- a/llvm/test/CodeGen/AMDGPU/extract-lowbits.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract-lowbits.ll
@@ -103,16 +103,16 @@ define i32 @bzhi32_c0(i32 %val, i32 %numlowbits) nounwind {
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_sub_i32_e32 v1, vcc, 32, v1
-; SI-NEXT:    v_lshr_b32_e32 v1, -1, v1
-; SI-NEXT:    v_and_b32_e32 v0, v1, v0
+; SI-NEXT:    v_lshlrev_b32_e32 v0, v1, v0
+; SI-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: bzhi32_c0:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_sub_u32_e32 v1, vcc, 32, v1
-; VI-NEXT:    v_lshrrev_b32_e64 v1, v1, -1
-; VI-NEXT:    v_and_b32_e32 v0, v1, v0
+; VI-NEXT:    v_lshlrev_b32_e32 v0, v1, v0
+; VI-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
 ; VI-NEXT:    s_setpc_b64 s[30:31]
   %numhighbits = sub i32 32, %numlowbits
   %mask = lshr i32 -1, %numhighbits
@@ -121,12 +121,23 @@ define i32 @bzhi32_c0(i32 %val, i32 %numlowbits) nounwind {
 }
 
 define i32 @bzhi32_c0_clamp(i32 %val, i32 %numlowbits) nounwind {
-; GCN-LABEL: bzhi32_c0_clamp:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_and_b32_e32 v1, 31, v1
-; GCN-NEXT:    v_bfe_u32 v0, v0, 0, v1
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SI-LABEL: bzhi32_c0_clamp:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 31, v1
+; SI-NEXT:    v_sub_i32_e32 v1, vcc, 32, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v0, v1, v0
+; SI-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
+; SI-NEXT:    s_setpc_b64 s[30:31]
+;
+; VI-LABEL: bzhi32_c0_clamp:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    v_and_b32_e32 v1, 31, v1
+; VI-NEXT:    v_sub_u32_e32 v1, vcc, 32, v1
+; VI-NEXT:    v_lshlrev_b32_e32 v0, v1, v0
+; VI-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
+; VI-NEXT:    s_setpc_b64 s[30:31]
   %low5bits = and i32 %numlowbits, 31
   %numhighbits = sub i32 32, %low5bits
   %mask = lshr i32 -1, %numhighbits
@@ -139,16 +150,16 @@ define i32 @bzhi32_c1_indexzext(i32 %val, i8 %numlowbits) nounwind {
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_sub_i32_e32 v1, vcc, 32, v1
-; SI-NEXT:    v_lshr_b32_e32 v1, -1, v1
-; SI-NEXT:    v_and_b32_e32 v0, v1, v0
+; SI-NEXT:    v_lshlrev_b32_e32 v0, v1, v0
+; SI-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: bzhi32_c1_indexzext:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_sub_u16_e32 v1, 32, v1
-; VI-NEXT:    v_lshrrev_b32_e64 v1, v1, -1
-; VI-NEXT:    v_and_b32_e32 v0, v1, v0
+; VI-NEXT:    v_lshlrev_b32_e32 v0, v1, v0
+; VI-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
 ; VI-NEXT:    s_setpc_b64 s[30:31]
   %numhighbits = sub i8 32, %numlowbits
   %sh_prom = zext i8 %numhighbits to i32
@@ -162,16 +173,16 @@ define i32 @bzhi32_c4_commutative(i32 %val, i32 %numlowbits) nounwind {
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_sub_i32_e32 v1, vcc, 32, v1
-; SI-NEXT:    v_lshr_b32_e32 v1, -1, v1
-; SI-NEXT:    v_and_b32_e32 v0, v0, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v0, v1, v0
+; SI-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: bzhi32_c4_commutative:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_sub_u32_e32 v1, vcc, 32, v1
-; VI-NEXT:    v_lshrrev_b32_e64 v1, v1, -1
-; VI-NEXT:    v_and_b32_e32 v0, v0, v1
+; VI-NEXT:    v_lshlrev_b32_e32 v0, v1, v0
+; VI-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
 ; VI-NEXT:    s_setpc_b64 s[30:31]
   %numhighbits = sub i32 32, %numlowbits
   %mask = lshr i32 -1, %numhighbits
diff --git a/llvm/test/CodeGen/AMDGPU/r600.extract-lowbits.ll b/llvm/test/CodeGen/AMDGPU/r600.extract-lowbits.ll
index 5b21a3652d11c..1055fc101a6c5 100644
--- a/llvm/test/CodeGen/AMDGPU/r600.extract-lowbits.ll
+++ b/llvm/test/CodeGen/AMDGPU/r600.extract-lowbits.ll
@@ -266,7 +266,7 @@ define amdgpu_kernel void @bzhi32_c1_indexzext(i32 %val, i8 %numlowbits, ptr add
 ; EG:       ; %bb.0:
 ; EG-NEXT:    ALU 0, @8, KC0[], KC1[]
 ; EG-NEXT:    TEX 0 @6
-; EG-NEXT:    ALU 8, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 7, @9, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
 ; EG-NEXT:    CF_END
 ; EG-NEXT:    PAD
@@ -279,9 +279,8 @@ define amdgpu_kernel void @bzhi32_c1_indexzext(i32 %val, i8 %numlowbits, ptr add
 ; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
 ; EG-NEXT:    AND_INT * T0.W, PV.W, literal.x,
 ; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
-; EG-NEXT:    LSHR * T0.W, literal.x, PV.W,
-; EG-NEXT:    -1(nan), 0(0.000000e+00)
-; EG-NEXT:    AND_INT T0.X, PV.W, KC0[2].Y,
+; EG-NEXT:    LSHL * T1.W, KC0[2].Y, PV.W,
+; EG-NEXT:    LSHR T0.X, PV.W, T0.W,
 ; EG-NEXT:    LSHR * T1.X, KC0[2].W, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
 ;
@@ -289,7 +288,7 @@ define amdgpu_kernel void @bzhi32_c1_indexzext(i32 %val, i8 %numlowbits, ptr add
 ; CM:       ; %bb.0:
 ; CM-NEXT:    ALU 0, @8, KC0[], KC1[]
 ; CM-NEXT:    TEX 0 @6
-; CM-NEXT:    ALU 8, @9, KC0[CB0:0-32], KC1[]
+; CM-NEXT:    ALU 7, @9, KC0[CB0:0-32], KC1[]
 ; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
 ; CM-NEXT:    CF_END
 ; CM-NEXT:    PAD
@@ -302,9 +301,8 @@ define amdgpu_kernel void @bzhi32_c1_indexzext(i32 %val, i8 %numlowbits, ptr add
 ; CM-NEXT:    32(4.484155e-44), 0(0.000000e+00)
 ; CM-NEXT:    AND_INT * T0.W, PV.W, literal.x,
 ; CM-NEXT:    255(3.573311e-43), 0(0.000000e+00)
-; CM-NEXT:    LSHR * T0.W, literal.x, PV.W,
-; CM-NEXT:    -1(nan), 0(0.000000e+00)
-; CM-NEXT:    AND_INT * T0.X, PV.W, KC0[2].Y,
+; CM-NEXT:    LSHL * T1.W, KC0[2].Y, PV.W,
+; CM-NEXT:    LSHR * T0.X, PV.W, T0.W,
 ; CM-NEXT:    LSHR * T1.X, KC0[2].W, literal.x,
 ; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
   %numhighbits = sub i8 32, %numlowbits
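
Note on the change: with the new default, shouldFoldMaskToVariableShiftPair() returns true whenever ISD::SHL is legal for the value type, so the generic DAG combine rewrites and(x, lshr(-1, 32 - n)) into the two-shift form (x << (32 - n)) >> (32 - n). That is exactly the churn in the check lines above: a v_lshr of -1 plus v_and on SI/VI, and an LSHR of -1 plus AND_INT on R600, become an lshl/lshr pair that reuses the already-computed shift amount instead of materializing a variable mask in a register.

For intuition, here is a minimal standalone C++ sketch of the scalar equivalence the hook gates. It is not part of the patch; the bzhi_* names are illustrative, and inputs are restricted to 1 <= numlowbits <= 32 so that neither form shifts by the full bit width.

#include <cassert>
#include <cstdint>

// Mask form, as written in the IR tests:
//   %mask   = lshr i32 -1, (32 - %numlowbits)
//   %masked = and i32 %mask, %val
std::uint32_t bzhi_mask(std::uint32_t val, std::uint32_t numlowbits) {
  std::uint32_t numhighbits = 32u - numlowbits;
  return val & (UINT32_MAX >> numhighbits);
}

// Shift-pair form the fold produces: push the unwanted high bits out the
// top, then shift the value back down. The shift amount is used twice, so
// no variable mask has to be built at runtime.
std::uint32_t bzhi_shift_pair(std::uint32_t val, std::uint32_t numlowbits) {
  std::uint32_t numhighbits = 32u - numlowbits;
  return (val << numhighbits) >> numhighbits;
}

int main() {
  // Spot-check the equivalence for every bit count that avoids a
  // full-width (undefined) shift.
  for (std::uint32_t n = 1; n <= 32; ++n)
    assert(bzhi_mask(0xDEADBEEFu, n) == bzhi_shift_pair(0xDEADBEEFu, n));
  return 0;
}

Both functions return the low numlowbits bits of val; the shift-pair form trades the mask constant for a second use of the shift amount, which is the trade-off this hook now lets targets take by default whenever the shift is legal.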