Skip to content
Open
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
84 changes: 83 additions & 1 deletion llvm/lib/Target/X86/X86ISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1829,6 +1829,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FCOPYSIGN, VT, Custom);
setOperationAction(ISD::FCANONICALIZE, VT, Custom);
}

setOperationAction(ISD::LRINT, MVT::v16f32,
Subtarget.hasDQI() ? Legal : Custom);
setOperationAction(ISD::LRINT, MVT::v8f64,
Expand Down Expand Up @@ -2101,6 +2102,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
// These operations are handled on non-VLX by artificially widening in
// isel patterns.

for (MVT VT : {MVT::f16, MVT::f32, MVT::f64, MVT::v8f16, MVT::v4f32,
MVT::v2f64, MVT::v16f16, MVT::v8f32, MVT::v4f64, MVT::v32f16,
MVT::v16f32, MVT::v8f64})
setOperationAction(ISD::FLDEXP, VT, Custom);

setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i32, Custom);
setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v4i32, Custom);
setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2i32, Custom);
Expand Down Expand Up @@ -19149,6 +19155,81 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
return SDValue();
}

static SDValue LowerFLDEXP(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SDLoc DL(Op);
SDValue X = Op.getOperand(0);
MVT XTy = X.getSimpleValueType();
SDValue Exp = Op.getOperand(1);
MVT ExtVT;

switch (XTy.SimpleTy) {
default:
return SDValue();
case MVT::f16:
if (!Subtarget.hasFP16())
X = DAG.getFPExtendOrRound(X, DL, MVT::f32);
[[fallthrough]];
case MVT::f32:
case MVT::f64: {
MVT VT = MVT::getVectorVT(X.getSimpleValueType(),
128 / X.getSimpleValueType().getSizeInBits());
Exp = DAG.getNode(ISD::SINT_TO_FP, DL, X.getValueType(), Exp);
SDValue VX = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, X);
SDValue VExp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Exp);
SDValue Scalefs = DAG.getNode(X86ISD::SCALEFS, DL, VT, VX, VExp, VX);
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why SCALEFS?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Mistake on my part, should be SCALEF.

SDValue Final = DAG.getExtractVectorElt(DL, X.getValueType(), Scalefs, 0);
if (X.getValueType() != XTy)
Final = DAG.getFPExtendOrRound(Final, DL, XTy);
return Final;
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just use this directly:

Suggested change
return Final;
return DAG.getFPExtendOrRound(Final, DL, XTy);

getFPExtendOrRound will just return a no-op for matching types anyway

}
case MVT::v4f32:
case MVT::v2f64:
case MVT::v8f32:
case MVT::v4f64:
case MVT::v16f32:
case MVT::v8f64:
Exp = DAG.getNode(ISD::SINT_TO_FP, DL, XTy, Exp);
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why are we converting Exp here?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you mean why convert Exp before the widenSubVector? For the non-fp16 vector cases, it seems simplest to convert Exp to fp as opposed to converting WideExp to fp. Otherwise, Exp and X could have different element counts:
v4f64 -> v8f64
v4i32 -> v16i32

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

My main concern is that after the switch, Exp might be an fp OR an int vector type.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

After the switch, the only case where Exp is an integer is for vector fp16, but I do convert it to an fp after widening. I tried converting i16 vector exponents before widening, but that seemed very difficult.

Would it be preferable to convert Exp after the switch statement like so?

  case MVT::v8f64:
    // Exp = DAG.getNode(ISD::SINT_TO_FP, DL, XTy, Exp);
    if (XTy.getSizeInBits() == 512 || Subtarget.hasVLX()) {
      Exp = DAG.getNode(ISD::SINT_TO_FP, DL, XTy, Exp);
      return DAG.getNode(X86ISD::SCALEF, DL, XTy, X, Exp);
    }
    break;
    
 ...
 
  if (X.getValueType().getScalarType() != MVT::f16)
    Exp = DAG.getNode(ISD::SINT_TO_FP, DL, XTy, Exp);
  SDValue WideX = widenSubVector(X, true, Subtarget, DAG, DL, 512);
  SDValue WideExp = widenSubVector(Exp, true, Subtarget, DAG, DL, 512);

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Alternatively for the vector fp16 cases, I realized it's possible to convert Exp to fp after extending it to i32. So Exp would be fp for all cases after the switch statement.

    ExtVT = XTy.changeVectorElementType(MVT::f32);
    X = DAG.getFPExtendOrRound(X, DL, ExtVT);
    Exp = DAG.getSExtOrTrunc(Exp, DL, ExtVT.changeTypeToInteger());
    Exp = DAG.getNode(ISD::SINT_TO_FP, DL, ExtVT, Exp);

if (XTy.getSizeInBits() == 512 || Subtarget.hasVLX())
return DAG.getNode(X86ISD::SCALEF, DL, XTy, X, Exp, X);
break;
case MVT::v8f16:
case MVT::v16f16:
if (Subtarget.hasFP16()) {
if (Subtarget.hasVLX()) {
Exp = DAG.getNode(ISD::SINT_TO_FP, DL, XTy, Exp);
return DAG.getNode(X86ISD::SCALEF, DL, XTy, X, Exp, X);
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why does SCALEF/S take 3 operands?

}
break;
}
ExtVT = XTy.changeVectorElementType(MVT::f32);
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Use DAG.getInsertVectorElt

X = DAG.getFPExtendOrRound(X, DL, ExtVT);
Exp = DAG.getSExtOrTrunc(Exp, DL, ExtVT.changeTypeToInteger());
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You should be able to just use widenSubVector and tell it to widen to 512-bits instead of recomputing XVT/ExptVT

break;
case MVT::v32f16:
if (Subtarget.hasFP16()) {
Exp = DAG.getNode(ISD::SINT_TO_FP, DL, XTy, Exp);
return DAG.getNode(X86ISD::SCALEF, DL, XTy, X, Exp, X);
}
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is this the same as XTy != X.getValueType()?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Will fix.

return splitVectorOp(Op, DAG, DL);
}

SDValue WideX = widenSubVector(X, true, Subtarget, DAG, DL, 512);
SDValue WideExp = widenSubVector(Exp, true, Subtarget, DAG, DL, 512);
if (X.getValueType() != XTy) {
WideExp =
DAG.getNode(ISD::SINT_TO_FP, DL, WideX.getSimpleValueType(), WideExp);
SDValue Scalef = DAG.getNode(X86ISD::SCALEF, DL, WideX.getValueType(),
WideX, WideExp, WideX);
SDValue Final =
DAG.getExtractSubvector(DL, X.getSimpleValueType(), Scalef, 0);
return DAG.getFPExtendOrRound(Final, DL, XTy);
}
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(style) - don't use if-else chains if every block returns:

if (XTy.isVector()) {
  ...
  return DAG.getExtractSubvector(DL, XTy, Scalef, 0);
}

MVT VT = MVT::getVectorVT(X.getSimpleValueType(),
...

Alternatively you could move the scalar handling into the f32/f64 switch case above (and have f16 fallthrough)?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Will fix. I'll opt for the latter.

SDValue Scalef = DAG.getNode(X86ISD::SCALEF, DL, WideX.getValueType(), WideX,
WideExp, WideX);
return DAG.getExtractSubvector(DL, XTy, Scalef, 0);
}
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are these insertions just getNode(ISD::SCALAR_TO_VECTOR)?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, I can switch to ISD::SCALAR_TO_VECTOR.


Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You might be able to use splitVectorOp here?

static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SDLoc dl(Op);
Expand Down Expand Up @@ -33681,7 +33762,8 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::ADDRSPACECAST: return LowerADDRSPACECAST(Op, DAG);
case X86ISD::CVTPS2PH: return LowerCVTPS2PH(Op, DAG);
case ISD::PREFETCH: return LowerPREFETCH(Op, Subtarget, DAG);
// clang-format on
case ISD::FLDEXP: return LowerFLDEXP(Op, Subtarget, DAG);
// clang-format on
}
}

Expand Down
159 changes: 54 additions & 105 deletions llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll
Original file line number Diff line number Diff line change
Expand Up @@ -79,38 +79,54 @@ define <4 x float> @fmul_pow2_ldexp_4xfloat(<4 x i32> %i) {
; CHECK-SSE-NEXT: .cfi_def_cfa_offset 8
; CHECK-SSE-NEXT: retq
;
; CHECK-AVX-LABEL: fmul_pow2_ldexp_4xfloat:
; CHECK-AVX: # %bb.0:
; CHECK-AVX-NEXT: subq $40, %rsp
; CHECK-AVX-NEXT: .cfi_def_cfa_offset 48
; CHECK-AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-AVX-NEXT: vextractps $1, %xmm0, %edi
; CHECK-AVX-NEXT: vmovss {{.*#+}} xmm0 = [9.0E+0,0.0E+0,0.0E+0,0.0E+0]
; CHECK-AVX-NEXT: callq ldexpf@PLT
; CHECK-AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; CHECK-AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; CHECK-AVX-NEXT: vmovd %xmm0, %edi
; CHECK-AVX-NEXT: vmovss {{.*#+}} xmm0 = [9.0E+0,0.0E+0,0.0E+0,0.0E+0]
; CHECK-AVX-NEXT: callq ldexpf@PLT
; CHECK-AVX-NEXT: vinsertps $16, (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[2,3]
; CHECK-AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; CHECK-AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; CHECK-AVX-NEXT: vextractps $2, %xmm0, %edi
; CHECK-AVX-NEXT: vmovss {{.*#+}} xmm0 = [9.0E+0,0.0E+0,0.0E+0,0.0E+0]
; CHECK-AVX-NEXT: callq ldexpf@PLT
; CHECK-AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
; CHECK-AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; CHECK-AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; CHECK-AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; CHECK-AVX-NEXT: vextractps $3, %xmm0, %edi
; CHECK-AVX-NEXT: vmovss {{.*#+}} xmm0 = [9.0E+0,0.0E+0,0.0E+0,0.0E+0]
; CHECK-AVX-NEXT: callq ldexpf@PLT
; CHECK-AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
; CHECK-AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; CHECK-AVX-NEXT: addq $40, %rsp
; CHECK-AVX-NEXT: .cfi_def_cfa_offset 8
; CHECK-AVX-NEXT: retq
; CHECK-AVX2-LABEL: fmul_pow2_ldexp_4xfloat:
; CHECK-AVX2: # %bb.0:
; CHECK-AVX2-NEXT: subq $40, %rsp
; CHECK-AVX2-NEXT: .cfi_def_cfa_offset 48
; CHECK-AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-AVX2-NEXT: vextractps $1, %xmm0, %edi
; CHECK-AVX2-NEXT: vmovss {{.*#+}} xmm0 = [9.0E+0,0.0E+0,0.0E+0,0.0E+0]
; CHECK-AVX2-NEXT: callq ldexpf@PLT
; CHECK-AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; CHECK-AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; CHECK-AVX2-NEXT: vmovd %xmm0, %edi
; CHECK-AVX2-NEXT: vmovss {{.*#+}} xmm0 = [9.0E+0,0.0E+0,0.0E+0,0.0E+0]
; CHECK-AVX2-NEXT: callq ldexpf@PLT
; CHECK-AVX2-NEXT: vinsertps $16, (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[2,3]
; CHECK-AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; CHECK-AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; CHECK-AVX2-NEXT: vextractps $2, %xmm0, %edi
; CHECK-AVX2-NEXT: vmovss {{.*#+}} xmm0 = [9.0E+0,0.0E+0,0.0E+0,0.0E+0]
; CHECK-AVX2-NEXT: callq ldexpf@PLT
; CHECK-AVX2-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
; CHECK-AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; CHECK-AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; CHECK-AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; CHECK-AVX2-NEXT: vextractps $3, %xmm0, %edi
; CHECK-AVX2-NEXT: vmovss {{.*#+}} xmm0 = [9.0E+0,0.0E+0,0.0E+0,0.0E+0]
; CHECK-AVX2-NEXT: callq ldexpf@PLT
; CHECK-AVX2-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
; CHECK-AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; CHECK-AVX2-NEXT: addq $40, %rsp
; CHECK-AVX2-NEXT: .cfi_def_cfa_offset 8
; CHECK-AVX2-NEXT: retq
;
; CHECK-ONLY-AVX512F-LABEL: fmul_pow2_ldexp_4xfloat:
; CHECK-ONLY-AVX512F: # %bb.0:
; CHECK-ONLY-AVX512F-NEXT: vbroadcastss {{.*#+}} xmm1 = [9.0E+0,9.0E+0,9.0E+0,9.0E+0]
; CHECK-ONLY-AVX512F-NEXT: vcvtdq2ps %xmm0, %xmm0
; CHECK-ONLY-AVX512F-NEXT: vscalefps %zmm0, %zmm1, %zmm0
; CHECK-ONLY-AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; CHECK-ONLY-AVX512F-NEXT: vzeroupper
; CHECK-ONLY-AVX512F-NEXT: retq
;
; CHECK-SKX-LABEL: fmul_pow2_ldexp_4xfloat:
; CHECK-SKX: # %bb.0:
; CHECK-SKX-NEXT: vcvtdq2ps %xmm0, %xmm0
; CHECK-SKX-NEXT: vbroadcastss {{.*#+}} xmm1 = [9.0E+0,9.0E+0,9.0E+0,9.0E+0]
; CHECK-SKX-NEXT: vscalefps %xmm0, %xmm1, %xmm0
; CHECK-SKX-NEXT: retq
%r = call <4 x float> @llvm.ldexp.v4f32.v4i32(<4 x float> <float 9.000000e+00, float 9.000000e+00, float 9.000000e+00, float 9.000000e+00>, <4 x i32> %i)
ret <4 x float> %r
}
Expand Down Expand Up @@ -562,79 +578,12 @@ define <8 x half> @fmul_pow2_ldexp_8xhalf(<8 x i16> %i) {
;
; CHECK-AVX512F-LABEL: fmul_pow2_ldexp_8xhalf:
; CHECK-AVX512F: # %bb.0:
; CHECK-AVX512F-NEXT: subq $72, %rsp
; CHECK-AVX512F-NEXT: .cfi_def_cfa_offset 80
; CHECK-AVX512F-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
; CHECK-AVX512F-NEXT: vpextrw $7, %xmm0, %eax
; CHECK-AVX512F-NEXT: movswl %ax, %edi
; CHECK-AVX512F-NEXT: vmovss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0]
; CHECK-AVX512F-NEXT: callq ldexpf@PLT
; CHECK-AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; CHECK-AVX512F-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-AVX512F-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
; CHECK-AVX512F-NEXT: vpextrw $6, %xmm0, %eax
; CHECK-AVX512F-NEXT: movswl %ax, %edi
; CHECK-AVX512F-NEXT: vmovss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0]
; CHECK-AVX512F-NEXT: callq ldexpf@PLT
; CHECK-AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; CHECK-AVX512F-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-AVX512F-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; CHECK-AVX512F-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-AVX512F-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
; CHECK-AVX512F-NEXT: vpextrw $5, %xmm0, %eax
; CHECK-AVX512F-NEXT: movswl %ax, %edi
; CHECK-AVX512F-NEXT: vmovss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0]
; CHECK-AVX512F-NEXT: callq ldexpf@PLT
; CHECK-AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; CHECK-AVX512F-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-AVX512F-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
; CHECK-AVX512F-NEXT: vpextrw $4, %xmm0, %eax
; CHECK-AVX512F-NEXT: movswl %ax, %edi
; CHECK-AVX512F-NEXT: vmovss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0]
; CHECK-AVX512F-NEXT: callq ldexpf@PLT
; CHECK-AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; CHECK-AVX512F-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-AVX512F-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; CHECK-AVX512F-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-AVX512F-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; CHECK-AVX512F-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-AVX512F-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
; CHECK-AVX512F-NEXT: vpextrw $3, %xmm0, %eax
; CHECK-AVX512F-NEXT: movswl %ax, %edi
; CHECK-AVX512F-NEXT: vmovss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0]
; CHECK-AVX512F-NEXT: callq ldexpf@PLT
; CHECK-AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; CHECK-AVX512F-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-AVX512F-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
; CHECK-AVX512F-NEXT: vpextrw $2, %xmm0, %eax
; CHECK-AVX512F-NEXT: movswl %ax, %edi
; CHECK-AVX512F-NEXT: vmovss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0]
; CHECK-AVX512F-NEXT: callq ldexpf@PLT
; CHECK-AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; CHECK-AVX512F-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-AVX512F-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; CHECK-AVX512F-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-AVX512F-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
; CHECK-AVX512F-NEXT: vpextrw $1, %xmm0, %eax
; CHECK-AVX512F-NEXT: movswl %ax, %edi
; CHECK-AVX512F-NEXT: vmovss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0]
; CHECK-AVX512F-NEXT: callq ldexpf@PLT
; CHECK-AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; CHECK-AVX512F-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-AVX512F-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
; CHECK-AVX512F-NEXT: vmovd %xmm0, %eax
; CHECK-AVX512F-NEXT: movswl %ax, %edi
; CHECK-AVX512F-NEXT: vmovss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0]
; CHECK-AVX512F-NEXT: callq ldexpf@PLT
; CHECK-AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; CHECK-AVX512F-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-AVX512F-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; CHECK-AVX512F-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-AVX512F-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; CHECK-AVX512F-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-AVX512F-NEXT: # xmm0 = xmm0[0],mem[0]
; CHECK-AVX512F-NEXT: addq $72, %rsp
; CHECK-AVX512F-NEXT: .cfi_def_cfa_offset 8
; CHECK-AVX512F-NEXT: vbroadcastss {{.*#+}} ymm1 = [8.192E+3,8.192E+3,8.192E+3,8.192E+3,8.192E+3,8.192E+3,8.192E+3,8.192E+3]
; CHECK-AVX512F-NEXT: vpmovsxwd %xmm0, %ymm0
; CHECK-AVX512F-NEXT: vcvtdq2ps %zmm0, %zmm0
; CHECK-AVX512F-NEXT: vscalefps %zmm0, %zmm1, %zmm0
; CHECK-AVX512F-NEXT: vcvtps2ph $4, %ymm0, %xmm0
; CHECK-AVX512F-NEXT: vzeroupper
; CHECK-AVX512F-NEXT: retq
%r = call <8 x half> @llvm.ldexp.v8f16.v8i16(<8 x half> <half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000>, <8 x i16> %i)
ret <8 x half> %r
Expand Down
Loading
Loading