Skip to content

Commit bec726f

Browse files
authored
[X86] optimize ssse3 horizontal saturating add/sub (#169591)
Currently LLVM fails to recognize a manual implementation of `phaddsw` (the SSSE3 horizontal saturating add): https://godbolt.org/z/zozrssaWb

```llvm
declare <8 x i16> @llvm.x86.ssse3.phadd.sw.128(<8 x i16>, <8 x i16>)
declare <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16>, <8 x i16>)

define <8 x i16> @phaddsw_v8i16_intrinsic(<8 x i16> %a, <8 x i16> %b) {
entry:
  %res = call <8 x i16> @llvm.x86.ssse3.phadd.sw.128(<8 x i16> %a, <8 x i16> %b)
  ret <8 x i16> %res
}

define <8 x i16> @phaddsw_v8i16_generic(<8 x i16> %a, <8 x i16> %b) {
entry:
  %even = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %odd = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %sum = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> %even, <8 x i16> %odd)
  ret <8 x i16> %sum
}
```

The intrinsic version compiles to a single instruction, while the generic version is expanded into a shuffle/pack sequence followed by `paddsw`:

```asm
phaddsw_v8i16_intrinsic: # @phaddsw_v8i16_intrinsic
  phaddsw xmm0, xmm1
  ret
phaddsw_v8i16_generic: # @phaddsw_v8i16_generic
  movdqa xmm2, xmmword ptr [rip + .LCPI1_0] # xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
  movdqa xmm3, xmm1
  pshufb xmm3, xmm2
  movdqa xmm4, xmm0
  pshufb xmm4, xmm2
  punpcklqdq xmm4, xmm3 # xmm4 = xmm4[0],xmm3[0]
  psrad xmm1, 16
  psrad xmm0, 16
  packssdw xmm0, xmm1
  paddsw xmm0, xmm4
  ret
```

This PR makes LLVM recognize the pattern: a `sadd.sat`/`ssub.sat` of the even and odd elements of two `v8i16` or `v16i16` vectors is now combined into a horizontal saturating add/sub and selected as `phaddsw`/`phsubsw`.
1 parent 0dbedd1 commit bec726f

File tree

6 files changed

+150
-24
lines changed

6 files changed

+150
-24
lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 23 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2668,6 +2668,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
26682668
ISD::AVGFLOORU,
26692669
ISD::BITREVERSE,
26702670
ISD::ADD,
2671+
ISD::SADDSAT,
2672+
ISD::SSUBSAT,
26712673
ISD::FADD,
26722674
ISD::FSUB,
26732675
ISD::FNEG,
@@ -8151,6 +8153,8 @@ static SDValue LowerBUILD_VECTORvXi1(SDValue Op, const SDLoc &dl,
81518153
case X86ISD::FHSUB:
81528154
case X86ISD::HADD:
81538155
case X86ISD::HSUB:
8156+
case X86ISD::HADDS:
8157+
case X86ISD::HSUBS:
81548158
return true;
81558159
}
81568160
return false;
@@ -35121,6 +35125,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
3512135125
NODE_NAME_CASE(BLENDV)
3512235126
NODE_NAME_CASE(HADD)
3512335127
NODE_NAME_CASE(HSUB)
35128+
NODE_NAME_CASE(HADDS)
35129+
NODE_NAME_CASE(HSUBS)
3512435130
NODE_NAME_CASE(FHADD)
3512535131
NODE_NAME_CASE(FHSUB)
3512635132
NODE_NAME_CASE(CONFLICT)
@@ -40897,8 +40903,9 @@ static SDValue canonicalizeShuffleMaskWithHorizOp(
4089740903
}))
4089840904
return SDValue();
4089940905

40900-
bool isHoriz = (Opcode0 == X86ISD::FHADD || Opcode0 == X86ISD::HADD ||
40901-
Opcode0 == X86ISD::FHSUB || Opcode0 == X86ISD::HSUB);
40906+
bool isHoriz = (Opcode0 == X86ISD::FHADD || Opcode0 == X86ISD::FHSUB ||
40907+
Opcode0 == X86ISD::HADD || Opcode0 == X86ISD::HSUB ||
40908+
Opcode0 == X86ISD::HADDS || Opcode0 == X86ISD::HSUBS);
4090240909
bool isPack = (Opcode0 == X86ISD::PACKSS || Opcode0 == X86ISD::PACKUS);
4090340910
if (!isHoriz && !isPack)
4090440911
return SDValue();
@@ -54231,7 +54238,9 @@ static SDValue combineToHorizontalAddSub(SDNode *N, SelectionDAG &DAG,
5423154238
const X86Subtarget &Subtarget) {
5423254239
EVT VT = N->getValueType(0);
5423354240
unsigned Opcode = N->getOpcode();
54234-
bool IsAdd = (Opcode == ISD::FADD) || (Opcode == ISD::ADD);
54241+
bool IsAdd =
54242+
(Opcode == ISD::FADD) || (Opcode == ISD::ADD) || (Opcode == ISD::SADDSAT);
54243+
bool IsSat = (Opcode == ISD::SADDSAT) || (Opcode == ISD::SSUBSAT);
5423554244
SmallVector<int, 8> PostShuffleMask;
5423654245

5423754246
auto MergableHorizOp = [N](unsigned HorizOpcode) {
@@ -54261,11 +54270,17 @@ static SDValue combineToHorizontalAddSub(SDNode *N, SelectionDAG &DAG,
5426154270
break;
5426254271
case ISD::ADD:
5426354272
case ISD::SUB:
54264-
if (Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32 ||
54265-
VT == MVT::v16i16 || VT == MVT::v8i32)) {
54273+
case ISD::SADDSAT:
54274+
case ISD::SSUBSAT:
54275+
if (!Subtarget.hasSSSE3())
54276+
break;
54277+
if (VT == MVT::v8i16 || VT == MVT::v16i16 ||
54278+
(!IsSat && (VT == MVT::v4i32 || VT == MVT::v8i32))) {
54279+
5426654280
SDValue LHS = N->getOperand(0);
5426754281
SDValue RHS = N->getOperand(1);
54268-
auto HorizOpcode = IsAdd ? X86ISD::HADD : X86ISD::HSUB;
54282+
auto HorizOpcode = IsSat ? (IsAdd ? X86ISD::HADDS : X86ISD::HSUBS)
54283+
: (IsAdd ? X86ISD::HADD : X86ISD::HSUB);
5426954284
if (isHorizontalBinOp(HorizOpcode, LHS, RHS, DAG, Subtarget, IsAdd,
5427054285
PostShuffleMask, MergableHorizOp(HorizOpcode))) {
5427154286
auto HOpBuilder = [HorizOpcode](SelectionDAG &DAG, const SDLoc &DL,
@@ -61052,6 +61067,8 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
6105261067
case ISD::SUB: return combineSub(N, DAG, DCI, Subtarget);
6105361068
case X86ISD::ADD:
6105461069
case X86ISD::SUB: return combineX86AddSub(N, DAG, DCI, Subtarget);
61070+
case ISD::SADDSAT:
61071+
case ISD::SSUBSAT: return combineToHorizontalAddSub(N, DAG, Subtarget);
6105561072
case X86ISD::CLOAD:
6105661073
case X86ISD::CSTORE: return combineX86CloadCstore(N, DAG);
6105761074
case X86ISD::SBB: return combineSBB(N, DAG);

llvm/lib/Target/X86/X86ISelLowering.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -270,6 +270,10 @@ namespace llvm {
270270
HADD,
271271
HSUB,
272272

273+
/// Integer horizontal saturating add/sub.
274+
HADDS,
275+
HSUBS,
276+
273277
/// Floating point horizontal add/sub.
274278
FHADD,
275279
FHSUB,

llvm/lib/Target/X86/X86InstrFragmentsSIMD.td

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,8 @@ def X86fhadd : SDNode<"X86ISD::FHADD", SDTFPBinOp>;
7171
def X86fhsub : SDNode<"X86ISD::FHSUB", SDTFPBinOp>;
7272
def X86hadd : SDNode<"X86ISD::HADD", SDTIntBinOp>;
7373
def X86hsub : SDNode<"X86ISD::HSUB", SDTIntBinOp>;
74+
def X86hadds : SDNode<"X86ISD::HADDS", SDTIntBinOp>;
75+
def X86hsubs : SDNode<"X86ISD::HSUBS", SDTIntBinOp>;
7476
def X86comi : SDNode<"X86ISD::COMI", SDTX86FCmp>;
7577
def X86ucomi : SDNode<"X86ISD::UCOMI", SDTX86FCmp>;
7678
def X86comi512 : SDNode<"X86ISD::COMX", SDTX86FCmp>;

llvm/lib/Target/X86/X86InstrSSE.td

Lines changed: 16 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -4864,12 +4864,12 @@ let isCommutable = 0 in {
48644864
defm VPSIGND : SS3I_binop_rm_int<0x0A, "vpsignd",
48654865
int_x86_ssse3_psign_d_128,
48664866
SchedWriteVecALU.XMM, load, 0>, VEX, VVVV, WIG;
4867-
defm VPHADDSW : SS3I_binop_rm_int<0x03, "vphaddsw",
4868-
int_x86_ssse3_phadd_sw_128,
4869-
SchedWritePHAdd.XMM, load, 0>, VEX, VVVV, WIG;
4870-
defm VPHSUBSW : SS3I_binop_rm_int<0x07, "vphsubsw",
4871-
int_x86_ssse3_phsub_sw_128,
4872-
SchedWritePHAdd.XMM, load, 0>, VEX, VVVV, WIG;
4867+
defm VPHADDSW : SS3I_binop_rm<0x03, "vphaddsw", X86hadds, v8i16, v8i16, VR128,
4868+
load, i128mem,
4869+
SchedWritePHAdd.XMM, 0>, VEX, VVVV, WIG;
4870+
defm VPHSUBSW : SS3I_binop_rm<0x07, "vphsubsw", X86hsubs, v8i16, v8i16, VR128,
4871+
load, i128mem,
4872+
SchedWritePHAdd.XMM, 0>, VEX, VVVV, WIG;
48734873
}
48744874
}
48754875

@@ -4907,12 +4907,12 @@ let isCommutable = 0 in {
49074907
SchedWriteVecALU.YMM>, VEX, VVVV, VEX_L, WIG;
49084908
defm VPSIGND : SS3I_binop_rm_int_y<0x0A, "vpsignd", int_x86_avx2_psign_d,
49094909
SchedWriteVecALU.YMM>, VEX, VVVV, VEX_L, WIG;
4910-
defm VPHADDSW : SS3I_binop_rm_int_y<0x03, "vphaddsw",
4911-
int_x86_avx2_phadd_sw,
4912-
SchedWritePHAdd.YMM>, VEX, VVVV, VEX_L, WIG;
4913-
defm VPHSUBSW : SS3I_binop_rm_int_y<0x07, "vphsubsw",
4914-
int_x86_avx2_phsub_sw,
4915-
SchedWritePHAdd.YMM>, VEX, VVVV, VEX_L, WIG;
4910+
defm VPHADDSWY : SS3I_binop_rm<0x03, "vphaddsw", X86hadds, v16i16, v16i16,
4911+
VR256, load, i256mem,
4912+
SchedWritePHAdd.YMM, 0>, VEX, VVVV, VEX_L, WIG;
4913+
defm VPHSUBSWY : SS3I_binop_rm<0x07, "vphsubsw", X86hsubs, v16i16, v16i16,
4914+
VR256, load, i256mem,
4915+
SchedWritePHAdd.YMM, 0>, VEX, VVVV, VEX_L, WIG;
49164916
}
49174917
}
49184918

@@ -4935,12 +4935,10 @@ let isCommutable = 0 in {
49354935
SchedWriteVecALU.XMM, memop>;
49364936
defm PSHUFB : SS3I_binop_rm<0x00, "pshufb", X86pshufb, v16i8, v16i8, VR128,
49374937
memop, i128mem, SchedWriteVarShuffle.XMM>;
4938-
defm PHADDSW : SS3I_binop_rm_int<0x03, "phaddsw",
4939-
int_x86_ssse3_phadd_sw_128,
4940-
SchedWritePHAdd.XMM, memop>;
4941-
defm PHSUBSW : SS3I_binop_rm_int<0x07, "phsubsw",
4942-
int_x86_ssse3_phsub_sw_128,
4943-
SchedWritePHAdd.XMM, memop>;
4938+
defm PHADDSW : SS3I_binop_rm<0x03, "phaddsw", X86hadds, v8i16, v8i16, VR128,
4939+
memop, i128mem, SchedWritePHAdd.XMM>;
4940+
defm PHSUBSW : SS3I_binop_rm<0x07, "phsubsw", X86hsubs, v8i16, v8i16, VR128,
4941+
memop, i128mem, SchedWritePHAdd.XMM>;
49444942
defm PMADDUBSW : SS3I_binop_rm<0x04, "pmaddubsw", X86vpmaddubsw, v8i16,
49454943
v16i8, VR128, memop, i128mem,
49464944
SchedWriteVecIMul.XMM>;

llvm/lib/Target/X86/X86IntrinsicsInfo.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -724,8 +724,10 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
724724
X86_INTRINSIC_DATA(avx2_permd, VPERM_2OP, X86ISD::VPERMV, 0),
725725
X86_INTRINSIC_DATA(avx2_permps, VPERM_2OP, X86ISD::VPERMV, 0),
726726
X86_INTRINSIC_DATA(avx2_phadd_d, INTR_TYPE_2OP, X86ISD::HADD, 0),
727+
X86_INTRINSIC_DATA(avx2_phadd_sw, INTR_TYPE_2OP, X86ISD::HADDS, 0),
727728
X86_INTRINSIC_DATA(avx2_phadd_w, INTR_TYPE_2OP, X86ISD::HADD, 0),
728729
X86_INTRINSIC_DATA(avx2_phsub_d, INTR_TYPE_2OP, X86ISD::HSUB, 0),
730+
X86_INTRINSIC_DATA(avx2_phsub_sw, INTR_TYPE_2OP, X86ISD::HSUBS, 0),
729731
X86_INTRINSIC_DATA(avx2_phsub_w, INTR_TYPE_2OP, X86ISD::HSUB, 0),
730732
X86_INTRINSIC_DATA(avx2_pmadd_ub_sw, INTR_TYPE_2OP, X86ISD::VPMADDUBSW, 0),
731733
X86_INTRINSIC_DATA(avx2_pmadd_wd, INTR_TYPE_2OP, X86ISD::VPMADDWD, 0),
@@ -2017,11 +2019,13 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
20172019
X86_INTRINSIC_DATA(ssse3_phadd_d, INTR_TYPE_CAST_MMX, 0, 0),
20182020
X86_INTRINSIC_DATA(ssse3_phadd_d_128, INTR_TYPE_2OP, X86ISD::HADD, 0),
20192021
X86_INTRINSIC_DATA(ssse3_phadd_sw, INTR_TYPE_CAST_MMX, 0, 0),
2022+
X86_INTRINSIC_DATA(ssse3_phadd_sw_128, INTR_TYPE_2OP, X86ISD::HADDS, 0),
20202023
X86_INTRINSIC_DATA(ssse3_phadd_w, INTR_TYPE_CAST_MMX, 0, 0),
20212024
X86_INTRINSIC_DATA(ssse3_phadd_w_128, INTR_TYPE_2OP, X86ISD::HADD, 0),
20222025
X86_INTRINSIC_DATA(ssse3_phsub_d, INTR_TYPE_CAST_MMX, 0, 0),
20232026
X86_INTRINSIC_DATA(ssse3_phsub_d_128, INTR_TYPE_2OP, X86ISD::HSUB, 0),
20242027
X86_INTRINSIC_DATA(ssse3_phsub_sw, INTR_TYPE_CAST_MMX, 0, 0),
2028+
X86_INTRINSIC_DATA(ssse3_phsub_sw_128, INTR_TYPE_2OP, X86ISD::HSUBS, 0),
20252029
X86_INTRINSIC_DATA(ssse3_phsub_w, INTR_TYPE_CAST_MMX, 0, 0),
20262030
X86_INTRINSIC_DATA(ssse3_phsub_w_128, INTR_TYPE_2OP, X86ISD::HSUB, 0),
20272031
X86_INTRINSIC_DATA(ssse3_pmadd_ub_sw, INTR_TYPE_CAST_MMX, 0, 0),
Lines changed: 101 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,101 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2+
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3 | FileCheck %s -check-prefix=SSSE3
3+
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s -check-prefix=AVX2
4+
5+
define <8 x i16> @phaddsw_v8i16_intrinsic(<8 x i16> %a, <8 x i16> %b) {
6+
; SSSE3-LABEL: phaddsw_v8i16_intrinsic:
7+
; SSSE3: # %bb.0:
8+
; SSSE3-NEXT: phaddsw %xmm1, %xmm0
9+
; SSSE3-NEXT: retq
10+
;
11+
; AVX2-LABEL: phaddsw_v8i16_intrinsic:
12+
; AVX2: # %bb.0:
13+
; AVX2-NEXT: vphaddsw %xmm1, %xmm0, %xmm0
14+
; AVX2-NEXT: retq
15+
%res = call <8 x i16> @llvm.x86.ssse3.phadd.sw.128(<8 x i16> %a, <8 x i16> %b)
16+
ret <8 x i16> %res
17+
}
18+
19+
define <8 x i16> @phaddsw_v8i16_generic(<8 x i16> %a, <8 x i16> %b) {
20+
; SSSE3-LABEL: phaddsw_v8i16_generic:
21+
; SSSE3: # %bb.0:
22+
; SSSE3-NEXT: phaddsw %xmm1, %xmm0
23+
; SSSE3-NEXT: retq
24+
;
25+
; AVX2-LABEL: phaddsw_v8i16_generic:
26+
; AVX2: # %bb.0:
27+
; AVX2-NEXT: vphaddsw %xmm1, %xmm0, %xmm0
28+
; AVX2-NEXT: retq
29+
%even = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
30+
%odd = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
31+
%sum = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> %even, <8 x i16> %odd)
32+
ret <8 x i16> %sum
33+
}
34+
35+
define <16 x i16> @phaddsw_v16i16_generic(<16 x i16> %a, <16 x i16> %b) {
36+
; SSSE3-LABEL: phaddsw_v16i16_generic:
37+
; SSSE3: # %bb.0:
38+
; SSSE3-NEXT: phaddsw %xmm1, %xmm0
39+
; SSSE3-NEXT: phaddsw %xmm3, %xmm2
40+
; SSSE3-NEXT: movdqa %xmm2, %xmm1
41+
; SSSE3-NEXT: retq
42+
;
43+
; AVX2-LABEL: phaddsw_v16i16_generic:
44+
; AVX2: # %bb.0:
45+
; AVX2-NEXT: vphaddsw %ymm1, %ymm0, %ymm0
46+
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
47+
; AVX2-NEXT: retq
48+
%even = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
49+
%odd = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
50+
%sum = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> %even, <16 x i16> %odd)
51+
ret <16 x i16> %sum
52+
}
53+
54+
define <8 x i16> @phsubsw_v8i16_intrinsic(<8 x i16> %a, <8 x i16> %b) {
55+
; SSSE3-LABEL: phsubsw_v8i16_intrinsic:
56+
; SSSE3: # %bb.0:
57+
; SSSE3-NEXT: phsubsw %xmm1, %xmm0
58+
; SSSE3-NEXT: retq
59+
;
60+
; AVX2-LABEL: phsubsw_v8i16_intrinsic:
61+
; AVX2: # %bb.0:
62+
; AVX2-NEXT: vphsubsw %xmm1, %xmm0, %xmm0
63+
; AVX2-NEXT: retq
64+
%res = call <8 x i16> @llvm.x86.ssse3.phsub.sw.128(<8 x i16> %a, <8 x i16> %b)
65+
ret <8 x i16> %res
66+
}
67+
68+
define <8 x i16> @phsubsw_v8i16_generic(<8 x i16> %a, <8 x i16> %b) {
69+
; SSSE3-LABEL: phsubsw_v8i16_generic:
70+
; SSSE3: # %bb.0:
71+
; SSSE3-NEXT: phsubsw %xmm1, %xmm0
72+
; SSSE3-NEXT: retq
73+
;
74+
; AVX2-LABEL: phsubsw_v8i16_generic:
75+
; AVX2: # %bb.0:
76+
; AVX2-NEXT: vphsubsw %xmm1, %xmm0, %xmm0
77+
; AVX2-NEXT: retq
78+
%even = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
79+
%odd = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
80+
%diff = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> %even, <8 x i16> %odd)
81+
ret <8 x i16> %diff
82+
}
83+
84+
define <16 x i16> @phsubsw_v16i16_generic(<16 x i16> %a, <16 x i16> %b) {
85+
; SSSE3-LABEL: phsubsw_v16i16_generic:
86+
; SSSE3: # %bb.0:
87+
; SSSE3-NEXT: phsubsw %xmm1, %xmm0
88+
; SSSE3-NEXT: phsubsw %xmm3, %xmm2
89+
; SSSE3-NEXT: movdqa %xmm2, %xmm1
90+
; SSSE3-NEXT: retq
91+
;
92+
; AVX2-LABEL: phsubsw_v16i16_generic:
93+
; AVX2: # %bb.0:
94+
; AVX2-NEXT: vphsubsw %ymm1, %ymm0, %ymm0
95+
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
96+
; AVX2-NEXT: retq
97+
%even = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
98+
%odd = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
99+
%diff = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> %even, <16 x i16> %odd)
100+
ret <16 x i16> %diff
101+
}

0 commit comments

Comments
 (0)