Skip to content

Commit 8a9e333

Browse files
authored
s390x: optimize 128-bit fshl and fshr by high values (#154919)
Turn a funnel shift by a constant amount N in the range `121..127` (inclusive; the amount is taken modulo 128, so 128 itself never occurs) into a funnel shift in the opposite direction by `128 - N`. Because there are dedicated instructions for funnel shifts by amounts smaller than 8, this emits fewer instructions. This additional rule is useful because LLVM appears to canonicalize `fshr` into `fshl`, meaning that the rules for `fshr` by amounts less than 8 would not match on organic input.
1 parent 8c6b7af commit 8a9e333

File tree

2 files changed

+68
-1
lines changed

2 files changed

+68
-1
lines changed

llvm/lib/Target/SystemZ/SystemZISelLowering.cpp

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6725,6 +6725,14 @@ SDValue SystemZTargetLowering::lowerFSHL(SDValue Op, SelectionDAG &DAG) const {
67256725
if ((ShiftAmt & 7) == 0 || Subtarget.hasVectorEnhancements2()) {
67266726
SDValue Op0 = DAG.getBitcast(MVT::v16i8, Op.getOperand(0));
67276727
SDValue Op1 = DAG.getBitcast(MVT::v16i8, Op.getOperand(1));
6728+
if (ShiftAmt > 120) {
6729+
// For N in 121..127, fshl N == fshr (128 - N), and for amounts 1..7
6730+
// SHR_DOUBLE_BIT emits fewer instructions.
6731+
SDValue Val =
6732+
DAG.getNode(SystemZISD::SHR_DOUBLE_BIT, DL, MVT::v16i8, Op0, Op1,
6733+
DAG.getTargetConstant(128 - ShiftAmt, DL, MVT::i32));
6734+
return DAG.getBitcast(MVT::i128, Val);
6735+
}
67286736
SmallVector<int, 16> Mask(16);
67296737
for (unsigned Elt = 0; Elt < 16; Elt++)
67306738
Mask[Elt] = (ShiftAmt >> 3) + Elt;
@@ -6748,13 +6756,21 @@ SDValue SystemZTargetLowering::lowerFSHR(SDValue Op, SelectionDAG &DAG) const {
67486756
// i128 FSHR with a constant amount that is a multiple of 8 can be
67496757
// implemented via VECTOR_SHUFFLE. If we have the vector-enhancements-2
67506758
// facility, FSHR with a constant amount less than 8 can be implemented
6751-
// via SHL_DOUBLE_BIT, and FSHR with other constant amounts by a
6759+
// via SHR_DOUBLE_BIT, and FSHR with other constant amounts by a
67526760
// combination of the two.
67536761
if (auto *ShiftAmtNode = dyn_cast<ConstantSDNode>(Op.getOperand(2))) {
67546762
uint64_t ShiftAmt = ShiftAmtNode->getZExtValue() & 127;
67556763
if ((ShiftAmt & 7) == 0 || Subtarget.hasVectorEnhancements2()) {
67566764
SDValue Op0 = DAG.getBitcast(MVT::v16i8, Op.getOperand(0));
67576765
SDValue Op1 = DAG.getBitcast(MVT::v16i8, Op.getOperand(1));
6766+
if (ShiftAmt > 120) {
6767+
// For N in 121..127, fshr N == fshl (128 - N), and for amounts 1..7
6768+
// SHL_DOUBLE_BIT emits fewer instructions.
6769+
SDValue Val =
6770+
DAG.getNode(SystemZISD::SHL_DOUBLE_BIT, DL, MVT::v16i8, Op0, Op1,
6771+
DAG.getTargetConstant(128 - ShiftAmt, DL, MVT::i32));
6772+
return DAG.getBitcast(MVT::i128, Val);
6773+
}
67586774
SmallVector<int, 16> Mask(16);
67596775
for (unsigned Elt = 0; Elt < 16; Elt++)
67606776
Mask[Elt] = 16 - (ShiftAmt >> 3) + Elt;

llvm/test/CodeGen/SystemZ/shift-17.ll

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -249,3 +249,54 @@ define i128 @f8(i128 %a, i128 %b, i128 %sh) {
249249
ret i128 %res
250250
}
251251

252+
; Funnel shift left by a constant N in 121..127; in such cases fshl N == fshr (128 - N).
253+
define i128 @f9(i128 %a, i128 %b) {
254+
; CHECK-LABEL: f9:
255+
; CHECK: # %bb.0:
256+
; CHECK-NEXT: vl %v1, 0(%r4), 3
257+
; CHECK-NEXT: vl %v0, 0(%r3), 3
258+
; CHECK-NEXT: vrepib %v2, 5
259+
; CHECK-NEXT: vsrl %v1, %v1, %v2
260+
; CHECK-NEXT: vrepib %v2, 123
261+
; CHECK-NEXT: vslb %v0, %v0, %v2
262+
; CHECK-NEXT: vsl %v0, %v0, %v2
263+
; CHECK-NEXT: vo %v0, %v0, %v1
264+
; CHECK-NEXT: vst %v0, 0(%r2), 3
265+
; CHECK-NEXT: br %r14
266+
;
267+
; Z15-LABEL: f9:
268+
; Z15: # %bb.0:
269+
; Z15-NEXT: vl %v0, 0(%r4), 3
270+
; Z15-NEXT: vl %v1, 0(%r3), 3
271+
; Z15-NEXT: vsrd %v0, %v1, %v0, 5
272+
; Z15-NEXT: vst %v0, 0(%r2), 3
273+
; Z15-NEXT: br %r14
274+
%res = tail call i128 @llvm.fshl.i128(i128 %a, i128 %b, i128 123)
275+
ret i128 %res
276+
}
277+
278+
; Funnel shift right by a constant N in 121..127; in such cases fshr N == fshl (128 - N).
279+
define i128 @f10(i128 %a, i128 %b) {
280+
; CHECK-LABEL: f10:
281+
; CHECK: # %bb.0:
282+
; CHECK-NEXT: vl %v1, 0(%r3), 3
283+
; CHECK-NEXT: vl %v0, 0(%r4), 3
284+
; CHECK-NEXT: vrepib %v2, 5
285+
; CHECK-NEXT: vsl %v1, %v1, %v2
286+
; CHECK-NEXT: vrepib %v2, 123
287+
; CHECK-NEXT: vsrlb %v0, %v0, %v2
288+
; CHECK-NEXT: vsrl %v0, %v0, %v2
289+
; CHECK-NEXT: vo %v0, %v1, %v0
290+
; CHECK-NEXT: vst %v0, 0(%r2), 3
291+
; CHECK-NEXT: br %r14
292+
;
293+
; Z15-LABEL: f10:
294+
; Z15: # %bb.0:
295+
; Z15-NEXT: vl %v0, 0(%r4), 3
296+
; Z15-NEXT: vl %v1, 0(%r3), 3
297+
; Z15-NEXT: vsld %v0, %v1, %v0, 5
298+
; Z15-NEXT: vst %v0, 0(%r2), 3
299+
; Z15-NEXT: br %r14
300+
%res = tail call i128 @llvm.fshr.i128(i128 %a, i128 %b, i128 123)
301+
ret i128 %res
302+
}

0 commit comments

Comments
 (0)