diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp index 413ef0136d5c0..bae223243b3dc 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -2296,7 +2296,10 @@ InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, { 1, 1, 1, 1 } }, { ISD::FP_EXTEND, MVT::v8f64, MVT::v16f32, { 3, 1, 1, 1 } }, { ISD::FP_EXTEND, MVT::v16f64, MVT::v16f32, { 4, 1, 1, 1 } }, // 2*vcvtps2pd+vextractf64x4 + { ISD::FP_EXTEND, MVT::v16f32, MVT::v16f16, { 1, 1, 1, 1 } }, // vcvtph2ps + { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f16, { 2, 1, 1, 1 } }, // vcvtph2ps+vcvtps2pd { ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, { 1, 1, 1, 1 } }, + { ISD::FP_ROUND, MVT::v16f16, MVT::v16f32, { 1, 1, 1, 1 } }, // vcvtps2ph { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd @@ -2973,6 +2976,17 @@ InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, { ISD::TRUNCATE, MVT::v4i32, MVT::v2i64, { 1, 1, 1, 1 } }, // PSHUFD }; + static const TypeConversionCostKindTblEntry F16ConversionTbl[] = { + { ISD::FP_ROUND, MVT::f16, MVT::f32, { 1, 1, 1, 1 } }, + { ISD::FP_ROUND, MVT::v8f16, MVT::v8f32, { 1, 1, 1, 1 } }, + { ISD::FP_ROUND, MVT::v4f16, MVT::v4f32, { 1, 1, 1, 1 } }, + { ISD::FP_EXTEND, MVT::f32, MVT::f16, { 1, 1, 1, 1 } }, + { ISD::FP_EXTEND, MVT::f64, MVT::f16, { 2, 1, 1, 1 } }, // vcvtph2ps+vcvtps2pd + { ISD::FP_EXTEND, MVT::v8f32, MVT::v8f16, { 1, 1, 1, 1 } }, + { ISD::FP_EXTEND, MVT::v4f32, MVT::v4f16, { 1, 1, 1, 1 } }, + { ISD::FP_EXTEND, MVT::v4f64, MVT::v4f16, { 2, 1, 1, 1 } }, // vcvtph2ps+vcvtps2pd + }; + // Attempt to map directly to (simple) MVT types to let us match custom entries. 
EVT SrcTy = TLI->getValueType(DL, Src); EVT DstTy = TLI->getValueType(DL, Dst); @@ -3034,6 +3048,13 @@ InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, return *KindCost; } + if (ST->hasF16C()) { + if (const auto *Entry = ConvertCostTableLookup(F16ConversionTbl, ISD, + SimpleDstTy, SimpleSrcTy)) + if (auto KindCost = Entry->Cost[CostKind]) + return *KindCost; + } + if (ST->hasSSE41()) { if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD, SimpleDstTy, SimpleSrcTy)) @@ -3107,6 +3128,13 @@ InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, if (auto KindCost = Entry->Cost[CostKind]) return std::max(LTSrc.first, LTDest.first) * *KindCost; + if (ST->hasF16C()) { + if (const auto *Entry = ConvertCostTableLookup(F16ConversionTbl, ISD, + LTDest.second, LTSrc.second)) + if (auto KindCost = Entry->Cost[CostKind]) + return std::max(LTSrc.first, LTDest.first) * *KindCost; + } + if (ST->hasSSE41()) if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD, LTDest.second, LTSrc.second)) @@ -3146,6 +3174,11 @@ InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, TTI::CastContextHint::None, CostKind); } + if (ISD == ISD::FP_ROUND && LTDest.second.getScalarType() == MVT::f16) { + // Conversion requires a libcall. + return InstructionCost::getInvalid(); + } + // TODO: Allow non-throughput costs that aren't binary. 
auto AdjustCost = [&CostKind](InstructionCost Cost, InstructionCost N = 1) -> InstructionCost { @@ -6923,6 +6956,14 @@ bool X86TTIImpl::isVectorShiftByScalarCheap(Type *Ty) const { return true; } +unsigned X86TTIImpl::getStoreMinimumVF(unsigned VF, Type *ScalarMemTy, + Type *ScalarValTy) const { + if (ST->hasF16C() && ScalarMemTy->isHalfTy()) { + return 4; + } + return BaseT::getStoreMinimumVF(VF, ScalarMemTy, ScalarValTy); +} + bool X86TTIImpl::isProfitableToSinkOperands(Instruction *I, SmallVectorImpl<Use *> &Ops) const { using namespace llvm::PatternMatch; diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h index 0100f328ab4bd..36d00cee0d18b 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.h +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h @@ -302,6 +302,9 @@ class X86TTIImpl : public BasicTTIImplBase<X86TTIImpl> { bool isVectorShiftByScalarCheap(Type *Ty) const; + unsigned getStoreMinimumVF(unsigned VF, Type *ScalarMemTy, + Type *ScalarValTy) const; + private: bool supportsGather() const; InstructionCost getGSVectorCost(unsigned Opcode, TTI::TargetCostKind CostKind, diff --git a/llvm/test/Transforms/SLPVectorizer/X86/conversion-fp16.ll b/llvm/test/Transforms/SLPVectorizer/X86/conversion-fp16.ll new file mode 100644 index 0000000000000..bcea147d724f5 --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/X86/conversion-fp16.ll @@ -0,0 +1,606 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt < %s -mtriple=x86_64-- -passes=slp-vectorizer -S -mattr=+avx2 | FileCheck %s --check-prefix=CHECK +; RUN: opt < %s -mtriple=x86_64-- -passes=slp-vectorizer -S -mattr=+avx2 -mattr=+f16c | FileCheck %s --check-prefix=CHECK-F16C +; RUN: opt < %s -mtriple=x86_64-- -passes=slp-vectorizer -S -mattr=+avx512f | FileCheck %s --check-prefix=CHECK-AVX512 + +define void @fpext_v4xf16_v4xf32(ptr %s0, ptr %d0) { +; CHECK-LABEL: define void @fpext_v4xf16_v4xf32( +; CHECK-SAME: ptr 
[[S0:%.*]], ptr [[D0:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[S1:%.*]] = getelementptr inbounds half, ptr [[S0]], i64 1 +; CHECK-NEXT: [[S2:%.*]] = getelementptr inbounds half, ptr [[S0]], i64 2 +; CHECK-NEXT: [[S3:%.*]] = getelementptr inbounds half, ptr [[S0]], i64 3 +; CHECK-NEXT: [[L0:%.*]] = load half, ptr [[S0]], align 2 +; CHECK-NEXT: [[L1:%.*]] = load half, ptr [[S1]], align 2 +; CHECK-NEXT: [[L2:%.*]] = load half, ptr [[S2]], align 2 +; CHECK-NEXT: [[L3:%.*]] = load half, ptr [[S3]], align 2 +; CHECK-NEXT: [[E0:%.*]] = fpext half [[L0]] to float +; CHECK-NEXT: [[E1:%.*]] = fpext half [[L1]] to float +; CHECK-NEXT: [[E2:%.*]] = fpext half [[L2]] to float +; CHECK-NEXT: [[E3:%.*]] = fpext half [[L3]] to float +; CHECK-NEXT: [[D1:%.*]] = getelementptr inbounds float, ptr [[D0]], i64 1 +; CHECK-NEXT: [[D2:%.*]] = getelementptr inbounds float, ptr [[D0]], i64 2 +; CHECK-NEXT: [[D3:%.*]] = getelementptr inbounds float, ptr [[D0]], i64 3 +; CHECK-NEXT: store float [[E0]], ptr [[D0]], align 8 +; CHECK-NEXT: store float [[E1]], ptr [[D1]], align 8 +; CHECK-NEXT: store float [[E2]], ptr [[D2]], align 8 +; CHECK-NEXT: store float [[E3]], ptr [[D3]], align 8 +; CHECK-NEXT: ret void +; +; CHECK-F16C-LABEL: define void @fpext_v4xf16_v4xf32( +; CHECK-F16C-SAME: ptr [[S0:%.*]], ptr [[D0:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-F16C-NEXT: [[TMP1:%.*]] = load <4 x half>, ptr [[S0]], align 2 +; CHECK-F16C-NEXT: [[TMP2:%.*]] = fpext <4 x half> [[TMP1]] to <4 x float> +; CHECK-F16C-NEXT: store <4 x float> [[TMP2]], ptr [[D0]], align 8 +; CHECK-F16C-NEXT: ret void +; +; CHECK-AVX512-LABEL: define void @fpext_v4xf16_v4xf32( +; CHECK-AVX512-SAME: ptr [[S0:%.*]], ptr [[D0:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-AVX512-NEXT: [[TMP1:%.*]] = load <4 x half>, ptr [[S0]], align 2 +; CHECK-AVX512-NEXT: [[TMP2:%.*]] = fpext <4 x half> [[TMP1]] to <4 x float> +; CHECK-AVX512-NEXT: store <4 x float> [[TMP2]], ptr [[D0]], align 8 +; CHECK-AVX512-NEXT: ret void +; + %s1 = getelementptr inbounds 
half, ptr %s0, i64 1 + %s2 = getelementptr inbounds half, ptr %s0, i64 2 + %s3 = getelementptr inbounds half, ptr %s0, i64 3 + %l0 = load half, ptr %s0, align 2 + %l1 = load half, ptr %s1, align 2 + %l2 = load half, ptr %s2, align 2 + %l3 = load half, ptr %s3, align 2 + + %e0 = fpext half %l0 to float + %e1 = fpext half %l1 to float + %e2 = fpext half %l2 to float + %e3 = fpext half %l3 to float + + %d1 = getelementptr inbounds float, ptr %d0, i64 1 + %d2 = getelementptr inbounds float, ptr %d0, i64 2 + %d3 = getelementptr inbounds float, ptr %d0, i64 3 + store float %e0, ptr %d0, align 8 + store float %e1, ptr %d1, align 8 + store float %e2, ptr %d2, align 8 + store float %e3, ptr %d3, align 8 + ret void +} + +define void @fpext_v4xf16_v4xf64(ptr %s0, ptr %d0) { +; CHECK-LABEL: define void @fpext_v4xf16_v4xf64( +; CHECK-SAME: ptr [[S0:%.*]], ptr [[D0:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[S1:%.*]] = getelementptr inbounds half, ptr [[S0]], i64 1 +; CHECK-NEXT: [[S2:%.*]] = getelementptr inbounds half, ptr [[S0]], i64 2 +; CHECK-NEXT: [[S3:%.*]] = getelementptr inbounds half, ptr [[S0]], i64 3 +; CHECK-NEXT: [[L0:%.*]] = load half, ptr [[S0]], align 2 +; CHECK-NEXT: [[L1:%.*]] = load half, ptr [[S1]], align 2 +; CHECK-NEXT: [[L2:%.*]] = load half, ptr [[S2]], align 2 +; CHECK-NEXT: [[L3:%.*]] = load half, ptr [[S3]], align 2 +; CHECK-NEXT: [[E0:%.*]] = fpext half [[L0]] to double +; CHECK-NEXT: [[E1:%.*]] = fpext half [[L1]] to double +; CHECK-NEXT: [[E2:%.*]] = fpext half [[L2]] to double +; CHECK-NEXT: [[E3:%.*]] = fpext half [[L3]] to double +; CHECK-NEXT: [[D1:%.*]] = getelementptr inbounds double, ptr [[D0]], i64 1 +; CHECK-NEXT: [[D2:%.*]] = getelementptr inbounds double, ptr [[D0]], i64 2 +; CHECK-NEXT: [[D3:%.*]] = getelementptr inbounds double, ptr [[D0]], i64 3 +; CHECK-NEXT: store double [[E0]], ptr [[D0]], align 8 +; CHECK-NEXT: store double [[E1]], ptr [[D1]], align 8 +; CHECK-NEXT: store double [[E2]], ptr [[D2]], align 8 +; CHECK-NEXT: store double 
[[E3]], ptr [[D3]], align 8 +; CHECK-NEXT: ret void +; +; CHECK-F16C-LABEL: define void @fpext_v4xf16_v4xf64( +; CHECK-F16C-SAME: ptr [[S0:%.*]], ptr [[D0:%.*]]) #[[ATTR0]] { +; CHECK-F16C-NEXT: [[TMP1:%.*]] = load <4 x half>, ptr [[S0]], align 2 +; CHECK-F16C-NEXT: [[TMP2:%.*]] = fpext <4 x half> [[TMP1]] to <4 x double> +; CHECK-F16C-NEXT: store <4 x double> [[TMP2]], ptr [[D0]], align 8 +; CHECK-F16C-NEXT: ret void +; +; CHECK-AVX512-LABEL: define void @fpext_v4xf16_v4xf64( +; CHECK-AVX512-SAME: ptr [[S0:%.*]], ptr [[D0:%.*]]) #[[ATTR0]] { +; CHECK-AVX512-NEXT: [[TMP1:%.*]] = load <4 x half>, ptr [[S0]], align 2 +; CHECK-AVX512-NEXT: [[TMP2:%.*]] = fpext <4 x half> [[TMP1]] to <4 x double> +; CHECK-AVX512-NEXT: store <4 x double> [[TMP2]], ptr [[D0]], align 8 +; CHECK-AVX512-NEXT: ret void +; + %s1 = getelementptr inbounds half, ptr %s0, i64 1 + %s2 = getelementptr inbounds half, ptr %s0, i64 2 + %s3 = getelementptr inbounds half, ptr %s0, i64 3 + %l0 = load half, ptr %s0, align 2 + %l1 = load half, ptr %s1, align 2 + %l2 = load half, ptr %s2, align 2 + %l3 = load half, ptr %s3, align 2 + + %e0 = fpext half %l0 to double + %e1 = fpext half %l1 to double + %e2 = fpext half %l2 to double + %e3 = fpext half %l3 to double + + %d1 = getelementptr inbounds double, ptr %d0, i64 1 + %d2 = getelementptr inbounds double, ptr %d0, i64 2 + %d3 = getelementptr inbounds double, ptr %d0, i64 3 + store double %e0, ptr %d0, align 8 + store double %e1, ptr %d1, align 8 + store double %e2, ptr %d2, align 8 + store double %e3, ptr %d3, align 8 + ret void +} + +define void @fpext_v16xf16_v16xf32(ptr %s0, ptr %d0) { +; CHECK-LABEL: define void @fpext_v16xf16_v16xf32( +; CHECK-SAME: ptr [[S0:%.*]], ptr [[D0:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[S1:%.*]] = getelementptr inbounds half, ptr [[S0]], i64 1 +; CHECK-NEXT: [[S2:%.*]] = getelementptr inbounds half, ptr [[S0]], i64 2 +; CHECK-NEXT: [[S3:%.*]] = getelementptr inbounds half, ptr [[S0]], i64 3 +; CHECK-NEXT: [[S4:%.*]] = 
getelementptr inbounds half, ptr [[S0]], i64 4 +; CHECK-NEXT: [[S5:%.*]] = getelementptr inbounds half, ptr [[S0]], i64 5 +; CHECK-NEXT: [[S6:%.*]] = getelementptr inbounds half, ptr [[S0]], i64 6 +; CHECK-NEXT: [[S7:%.*]] = getelementptr inbounds half, ptr [[S0]], i64 7 +; CHECK-NEXT: [[S8:%.*]] = getelementptr inbounds half, ptr [[S0]], i64 8 +; CHECK-NEXT: [[S9:%.*]] = getelementptr inbounds half, ptr [[S0]], i64 9 +; CHECK-NEXT: [[S10:%.*]] = getelementptr inbounds half, ptr [[S0]], i64 10 +; CHECK-NEXT: [[S11:%.*]] = getelementptr inbounds half, ptr [[S0]], i64 11 +; CHECK-NEXT: [[S12:%.*]] = getelementptr inbounds half, ptr [[S0]], i64 12 +; CHECK-NEXT: [[S13:%.*]] = getelementptr inbounds half, ptr [[S0]], i64 13 +; CHECK-NEXT: [[S14:%.*]] = getelementptr inbounds half, ptr [[S0]], i64 14 +; CHECK-NEXT: [[S15:%.*]] = getelementptr inbounds half, ptr [[S0]], i64 15 +; CHECK-NEXT: [[L0:%.*]] = load half, ptr [[S0]], align 2 +; CHECK-NEXT: [[L1:%.*]] = load half, ptr [[S1]], align 2 +; CHECK-NEXT: [[L2:%.*]] = load half, ptr [[S2]], align 2 +; CHECK-NEXT: [[L3:%.*]] = load half, ptr [[S3]], align 2 +; CHECK-NEXT: [[L4:%.*]] = load half, ptr [[S4]], align 2 +; CHECK-NEXT: [[L5:%.*]] = load half, ptr [[S5]], align 2 +; CHECK-NEXT: [[L6:%.*]] = load half, ptr [[S6]], align 2 +; CHECK-NEXT: [[L7:%.*]] = load half, ptr [[S7]], align 2 +; CHECK-NEXT: [[L8:%.*]] = load half, ptr [[S8]], align 2 +; CHECK-NEXT: [[L9:%.*]] = load half, ptr [[S9]], align 2 +; CHECK-NEXT: [[L10:%.*]] = load half, ptr [[S10]], align 2 +; CHECK-NEXT: [[L11:%.*]] = load half, ptr [[S11]], align 2 +; CHECK-NEXT: [[L12:%.*]] = load half, ptr [[S12]], align 2 +; CHECK-NEXT: [[L13:%.*]] = load half, ptr [[S13]], align 2 +; CHECK-NEXT: [[L14:%.*]] = load half, ptr [[S14]], align 2 +; CHECK-NEXT: [[L15:%.*]] = load half, ptr [[S15]], align 2 +; CHECK-NEXT: [[E0:%.*]] = fpext half [[L0]] to float +; CHECK-NEXT: [[E1:%.*]] = fpext half [[L1]] to float +; CHECK-NEXT: [[E2:%.*]] = fpext half [[L2]] to 
float +; CHECK-NEXT: [[E3:%.*]] = fpext half [[L3]] to float +; CHECK-NEXT: [[E4:%.*]] = fpext half [[L4]] to float +; CHECK-NEXT: [[E5:%.*]] = fpext half [[L5]] to float +; CHECK-NEXT: [[E6:%.*]] = fpext half [[L6]] to float +; CHECK-NEXT: [[E7:%.*]] = fpext half [[L7]] to float +; CHECK-NEXT: [[E8:%.*]] = fpext half [[L8]] to float +; CHECK-NEXT: [[E9:%.*]] = fpext half [[L9]] to float +; CHECK-NEXT: [[E10:%.*]] = fpext half [[L10]] to float +; CHECK-NEXT: [[E11:%.*]] = fpext half [[L11]] to float +; CHECK-NEXT: [[E12:%.*]] = fpext half [[L12]] to float +; CHECK-NEXT: [[E13:%.*]] = fpext half [[L13]] to float +; CHECK-NEXT: [[E14:%.*]] = fpext half [[L14]] to float +; CHECK-NEXT: [[E15:%.*]] = fpext half [[L15]] to float +; CHECK-NEXT: [[D1:%.*]] = getelementptr inbounds float, ptr [[D0]], i64 1 +; CHECK-NEXT: [[D2:%.*]] = getelementptr inbounds float, ptr [[D0]], i64 2 +; CHECK-NEXT: [[D15:%.*]] = getelementptr inbounds float, ptr [[D0]], i64 3 +; CHECK-NEXT: [[D4:%.*]] = getelementptr inbounds float, ptr [[D0]], i64 4 +; CHECK-NEXT: [[D5:%.*]] = getelementptr inbounds float, ptr [[D0]], i64 5 +; CHECK-NEXT: [[D6:%.*]] = getelementptr inbounds float, ptr [[D0]], i64 6 +; CHECK-NEXT: [[D7:%.*]] = getelementptr inbounds float, ptr [[D0]], i64 7 +; CHECK-NEXT: [[D8:%.*]] = getelementptr inbounds float, ptr [[D0]], i64 8 +; CHECK-NEXT: [[D9:%.*]] = getelementptr inbounds float, ptr [[D0]], i64 9 +; CHECK-NEXT: [[D10:%.*]] = getelementptr inbounds float, ptr [[D0]], i64 10 +; CHECK-NEXT: [[D11:%.*]] = getelementptr inbounds float, ptr [[D0]], i64 11 +; CHECK-NEXT: [[D12:%.*]] = getelementptr inbounds float, ptr [[D0]], i64 12 +; CHECK-NEXT: [[D13:%.*]] = getelementptr inbounds float, ptr [[D0]], i64 13 +; CHECK-NEXT: [[D14:%.*]] = getelementptr inbounds float, ptr [[D0]], i64 14 +; CHECK-NEXT: [[D16:%.*]] = getelementptr inbounds float, ptr [[D0]], i64 15 +; CHECK-NEXT: store float [[E0]], ptr [[D0]], align 8 +; CHECK-NEXT: store float [[E1]], ptr [[D1]], align 8 +; 
CHECK-NEXT: store float [[E2]], ptr [[D2]], align 8 +; CHECK-NEXT: store float [[E3]], ptr [[D15]], align 8 +; CHECK-NEXT: store float [[E4]], ptr [[D4]], align 8 +; CHECK-NEXT: store float [[E5]], ptr [[D5]], align 8 +; CHECK-NEXT: store float [[E6]], ptr [[D6]], align 8 +; CHECK-NEXT: store float [[E7]], ptr [[D7]], align 8 +; CHECK-NEXT: store float [[E8]], ptr [[D8]], align 8 +; CHECK-NEXT: store float [[E9]], ptr [[D9]], align 8 +; CHECK-NEXT: store float [[E10]], ptr [[D10]], align 8 +; CHECK-NEXT: store float [[E11]], ptr [[D11]], align 8 +; CHECK-NEXT: store float [[E12]], ptr [[D12]], align 8 +; CHECK-NEXT: store float [[E13]], ptr [[D13]], align 8 +; CHECK-NEXT: store float [[E14]], ptr [[D14]], align 8 +; CHECK-NEXT: store float [[E15]], ptr [[D16]], align 8 +; CHECK-NEXT: ret void +; +; CHECK-F16C-LABEL: define void @fpext_v16xf16_v16xf32( +; CHECK-F16C-SAME: ptr [[S0:%.*]], ptr [[D0:%.*]]) #[[ATTR0]] { +; CHECK-F16C-NEXT: [[S8:%.*]] = getelementptr inbounds half, ptr [[S0]], i64 8 +; CHECK-F16C-NEXT: [[D8:%.*]] = getelementptr inbounds float, ptr [[D0]], i64 8 +; CHECK-F16C-NEXT: [[TMP1:%.*]] = load <8 x half>, ptr [[S0]], align 2 +; CHECK-F16C-NEXT: [[TMP2:%.*]] = fpext <8 x half> [[TMP1]] to <8 x float> +; CHECK-F16C-NEXT: [[TMP3:%.*]] = load <8 x half>, ptr [[S8]], align 2 +; CHECK-F16C-NEXT: [[TMP4:%.*]] = fpext <8 x half> [[TMP3]] to <8 x float> +; CHECK-F16C-NEXT: store <8 x float> [[TMP2]], ptr [[D0]], align 8 +; CHECK-F16C-NEXT: store <8 x float> [[TMP4]], ptr [[D8]], align 8 +; CHECK-F16C-NEXT: ret void +; +; CHECK-AVX512-LABEL: define void @fpext_v16xf16_v16xf32( +; CHECK-AVX512-SAME: ptr [[S0:%.*]], ptr [[D0:%.*]]) #[[ATTR0]] { +; CHECK-AVX512-NEXT: [[TMP1:%.*]] = load <16 x half>, ptr [[S0]], align 2 +; CHECK-AVX512-NEXT: [[TMP2:%.*]] = fpext <16 x half> [[TMP1]] to <16 x float> +; CHECK-AVX512-NEXT: store <16 x float> [[TMP2]], ptr [[D0]], align 8 +; CHECK-AVX512-NEXT: ret void +; + %s1 = getelementptr inbounds half, ptr %s0, i64 1 + %s2 = 
getelementptr inbounds half, ptr %s0, i64 2 + %s3 = getelementptr inbounds half, ptr %s0, i64 3 + %s4 = getelementptr inbounds half, ptr %s0, i64 4 + %s5 = getelementptr inbounds half, ptr %s0, i64 5 + %s6 = getelementptr inbounds half, ptr %s0, i64 6 + %s7 = getelementptr inbounds half, ptr %s0, i64 7 + %s8 = getelementptr inbounds half, ptr %s0, i64 8 + %s9 = getelementptr inbounds half, ptr %s0, i64 9 + %s10 = getelementptr inbounds half, ptr %s0, i64 10 + %s11 = getelementptr inbounds half, ptr %s0, i64 11 + %s12 = getelementptr inbounds half, ptr %s0, i64 12 + %s13 = getelementptr inbounds half, ptr %s0, i64 13 + %s14 = getelementptr inbounds half, ptr %s0, i64 14 + %s15 = getelementptr inbounds half, ptr %s0, i64 15 + %l0 = load half, ptr %s0, align 2 + %l1 = load half, ptr %s1, align 2 + %l2 = load half, ptr %s2, align 2 + %l3 = load half, ptr %s3, align 2 + %l4 = load half, ptr %s4, align 2 + %l5 = load half, ptr %s5, align 2 + %l6 = load half, ptr %s6, align 2 + %l7 = load half, ptr %s7, align 2 + %l8 = load half, ptr %s8, align 2 + %l9 = load half, ptr %s9, align 2 + %l10 = load half, ptr %s10, align 2 + %l11 = load half, ptr %s11, align 2 + %l12 = load half, ptr %s12, align 2 + %l13 = load half, ptr %s13, align 2 + %l14 = load half, ptr %s14, align 2 + %l15 = load half, ptr %s15, align 2 + + %e0 = fpext half %l0 to float + %e1 = fpext half %l1 to float + %e2 = fpext half %l2 to float + %e3 = fpext half %l3 to float + %e4 = fpext half %l4 to float + %e5 = fpext half %l5 to float + %e6 = fpext half %l6 to float + %e7 = fpext half %l7 to float + %e8 = fpext half %l8 to float + %e9 = fpext half %l9 to float + %e10 = fpext half %l10 to float + %e11 = fpext half %l11 to float + %e12 = fpext half %l12 to float + %e13 = fpext half %l13 to float + %e14 = fpext half %l14 to float + %e15 = fpext half %l15 to float + + %d1 = getelementptr inbounds float, ptr %d0, i64 1 + %d2 = getelementptr inbounds float, ptr %d0, i64 2 + %d3 = getelementptr inbounds float, ptr 
%d0, i64 3 + %d4 = getelementptr inbounds float, ptr %d0, i64 4 + %d5 = getelementptr inbounds float, ptr %d0, i64 5 + %d6 = getelementptr inbounds float, ptr %d0, i64 6 + %d7 = getelementptr inbounds float, ptr %d0, i64 7 + %d8 = getelementptr inbounds float, ptr %d0, i64 8 + %d9 = getelementptr inbounds float, ptr %d0, i64 9 + %d10 = getelementptr inbounds float, ptr %d0, i64 10 + %d11 = getelementptr inbounds float, ptr %d0, i64 11 + %d12 = getelementptr inbounds float, ptr %d0, i64 12 + %d13 = getelementptr inbounds float, ptr %d0, i64 13 + %d14 = getelementptr inbounds float, ptr %d0, i64 14 + %d15 = getelementptr inbounds float, ptr %d0, i64 15 + store float %e0, ptr %d0, align 8 + store float %e1, ptr %d1, align 8 + store float %e2, ptr %d2, align 8 + store float %e3, ptr %d3, align 8 + store float %e4, ptr %d4, align 8 + store float %e5, ptr %d5, align 8 + store float %e6, ptr %d6, align 8 + store float %e7, ptr %d7, align 8 + store float %e8, ptr %d8, align 8 + store float %e9, ptr %d9, align 8 + store float %e10, ptr %d10, align 8 + store float %e11, ptr %d11, align 8 + store float %e12, ptr %d12, align 8 + store float %e13, ptr %d13, align 8 + store float %e14, ptr %d14, align 8 + store float %e15, ptr %d15, align 8 + ret void +} + +define void @fpround_v4xf32_v4xf16(ptr %s0, ptr %d0) { +; CHECK-LABEL: define void @fpround_v4xf32_v4xf16( +; CHECK-SAME: ptr [[S0:%.*]], ptr [[D0:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[S1:%.*]] = getelementptr inbounds float, ptr [[S0]], i64 1 +; CHECK-NEXT: [[S2:%.*]] = getelementptr inbounds float, ptr [[S0]], i64 2 +; CHECK-NEXT: [[S3:%.*]] = getelementptr inbounds float, ptr [[S0]], i64 3 +; CHECK-NEXT: [[L0:%.*]] = load float, ptr [[S0]], align 4 +; CHECK-NEXT: [[L1:%.*]] = load float, ptr [[S1]], align 4 +; CHECK-NEXT: [[L2:%.*]] = load float, ptr [[S2]], align 4 +; CHECK-NEXT: [[L3:%.*]] = load float, ptr [[S3]], align 4 +; CHECK-NEXT: [[T0:%.*]] = fptrunc float [[L0]] to half +; CHECK-NEXT: [[T1:%.*]] = fptrunc float 
[[L1]] to half +; CHECK-NEXT: [[T2:%.*]] = fptrunc float [[L2]] to half +; CHECK-NEXT: [[T3:%.*]] = fptrunc float [[L3]] to half +; CHECK-NEXT: [[D1:%.*]] = getelementptr inbounds half, ptr [[D0]], i64 1 +; CHECK-NEXT: [[D2:%.*]] = getelementptr inbounds half, ptr [[D0]], i64 2 +; CHECK-NEXT: [[D3:%.*]] = getelementptr inbounds half, ptr [[D0]], i64 3 +; CHECK-NEXT: store half [[T0]], ptr [[D0]], align 2 +; CHECK-NEXT: store half [[T1]], ptr [[D1]], align 2 +; CHECK-NEXT: store half [[T2]], ptr [[D2]], align 2 +; CHECK-NEXT: store half [[T3]], ptr [[D3]], align 2 +; CHECK-NEXT: ret void +; +; CHECK-F16C-LABEL: define void @fpround_v4xf32_v4xf16( +; CHECK-F16C-SAME: ptr [[S0:%.*]], ptr [[D0:%.*]]) #[[ATTR0]] { +; CHECK-F16C-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[S0]], align 4 +; CHECK-F16C-NEXT: [[TMP2:%.*]] = fptrunc <4 x float> [[TMP1]] to <4 x half> +; CHECK-F16C-NEXT: store <4 x half> [[TMP2]], ptr [[D0]], align 2 +; CHECK-F16C-NEXT: ret void +; +; CHECK-AVX512-LABEL: define void @fpround_v4xf32_v4xf16( +; CHECK-AVX512-SAME: ptr [[S0:%.*]], ptr [[D0:%.*]]) #[[ATTR0]] { +; CHECK-AVX512-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[S0]], align 4 +; CHECK-AVX512-NEXT: [[TMP2:%.*]] = fptrunc <4 x float> [[TMP1]] to <4 x half> +; CHECK-AVX512-NEXT: store <4 x half> [[TMP2]], ptr [[D0]], align 2 +; CHECK-AVX512-NEXT: ret void +; + %s1 = getelementptr inbounds float, ptr %s0, i64 1 + %s2 = getelementptr inbounds float, ptr %s0, i64 2 + %s3 = getelementptr inbounds float, ptr %s0, i64 3 + %l0 = load float, ptr %s0, align 4 + %l1 = load float, ptr %s1, align 4 + %l2 = load float, ptr %s2, align 4 + %l3 = load float, ptr %s3, align 4 + + %t0 = fptrunc float %l0 to half + %t1 = fptrunc float %l1 to half + %t2 = fptrunc float %l2 to half + %t3 = fptrunc float %l3 to half + + %d1 = getelementptr inbounds half, ptr %d0, i64 1 + %d2 = getelementptr inbounds half, ptr %d0, i64 2 + %d3 = getelementptr inbounds half, ptr %d0, i64 3 + store half %t0, ptr %d0, align 2 + store 
half %t1, ptr %d1, align 2 + store half %t2, ptr %d2, align 2 + store half %t3, ptr %d3, align 2 + ret void +} + +define void @fpround_v16xf32_v16xf16(ptr %s0, ptr %d0) { +; CHECK-LABEL: define void @fpround_v16xf32_v16xf16( +; CHECK-SAME: ptr [[S0:%.*]], ptr [[D0:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[S1:%.*]] = getelementptr inbounds float, ptr [[S0]], i64 1 +; CHECK-NEXT: [[S2:%.*]] = getelementptr inbounds float, ptr [[S0]], i64 2 +; CHECK-NEXT: [[S3:%.*]] = getelementptr inbounds float, ptr [[S0]], i64 3 +; CHECK-NEXT: [[S4:%.*]] = getelementptr inbounds float, ptr [[S0]], i64 4 +; CHECK-NEXT: [[S5:%.*]] = getelementptr inbounds float, ptr [[S0]], i64 5 +; CHECK-NEXT: [[S6:%.*]] = getelementptr inbounds float, ptr [[S0]], i64 6 +; CHECK-NEXT: [[S7:%.*]] = getelementptr inbounds float, ptr [[S0]], i64 7 +; CHECK-NEXT: [[S8:%.*]] = getelementptr inbounds float, ptr [[S0]], i64 8 +; CHECK-NEXT: [[S9:%.*]] = getelementptr inbounds float, ptr [[S0]], i64 9 +; CHECK-NEXT: [[S10:%.*]] = getelementptr inbounds float, ptr [[S0]], i64 10 +; CHECK-NEXT: [[S11:%.*]] = getelementptr inbounds float, ptr [[S0]], i64 11 +; CHECK-NEXT: [[S12:%.*]] = getelementptr inbounds float, ptr [[S0]], i64 12 +; CHECK-NEXT: [[S13:%.*]] = getelementptr inbounds float, ptr [[S0]], i64 13 +; CHECK-NEXT: [[S14:%.*]] = getelementptr inbounds float, ptr [[S0]], i64 14 +; CHECK-NEXT: [[S15:%.*]] = getelementptr inbounds float, ptr [[S0]], i64 15 +; CHECK-NEXT: [[L0:%.*]] = load float, ptr [[S0]], align 4 +; CHECK-NEXT: [[L1:%.*]] = load float, ptr [[S1]], align 4 +; CHECK-NEXT: [[L2:%.*]] = load float, ptr [[S2]], align 4 +; CHECK-NEXT: [[L3:%.*]] = load float, ptr [[S3]], align 4 +; CHECK-NEXT: [[L4:%.*]] = load float, ptr [[S4]], align 4 +; CHECK-NEXT: [[L5:%.*]] = load float, ptr [[S5]], align 4 +; CHECK-NEXT: [[L6:%.*]] = load float, ptr [[S6]], align 4 +; CHECK-NEXT: [[L7:%.*]] = load float, ptr [[S7]], align 4 +; CHECK-NEXT: [[L8:%.*]] = load float, ptr [[S8]], align 4 +; CHECK-NEXT: 
[[L9:%.*]] = load float, ptr [[S9]], align 4 +; CHECK-NEXT: [[L10:%.*]] = load float, ptr [[S10]], align 4 +; CHECK-NEXT: [[L11:%.*]] = load float, ptr [[S11]], align 4 +; CHECK-NEXT: [[L12:%.*]] = load float, ptr [[S12]], align 4 +; CHECK-NEXT: [[L13:%.*]] = load float, ptr [[S13]], align 4 +; CHECK-NEXT: [[L14:%.*]] = load float, ptr [[S14]], align 4 +; CHECK-NEXT: [[L15:%.*]] = load float, ptr [[S15]], align 4 +; CHECK-NEXT: [[T0:%.*]] = fptrunc float [[L0]] to half +; CHECK-NEXT: [[T1:%.*]] = fptrunc float [[L1]] to half +; CHECK-NEXT: [[T2:%.*]] = fptrunc float [[L2]] to half +; CHECK-NEXT: [[T3:%.*]] = fptrunc float [[L3]] to half +; CHECK-NEXT: [[T4:%.*]] = fptrunc float [[L4]] to half +; CHECK-NEXT: [[T5:%.*]] = fptrunc float [[L5]] to half +; CHECK-NEXT: [[T6:%.*]] = fptrunc float [[L6]] to half +; CHECK-NEXT: [[T7:%.*]] = fptrunc float [[L7]] to half +; CHECK-NEXT: [[T8:%.*]] = fptrunc float [[L8]] to half +; CHECK-NEXT: [[T9:%.*]] = fptrunc float [[L9]] to half +; CHECK-NEXT: [[T10:%.*]] = fptrunc float [[L10]] to half +; CHECK-NEXT: [[T11:%.*]] = fptrunc float [[L11]] to half +; CHECK-NEXT: [[T12:%.*]] = fptrunc float [[L12]] to half +; CHECK-NEXT: [[T13:%.*]] = fptrunc float [[L13]] to half +; CHECK-NEXT: [[T14:%.*]] = fptrunc float [[L14]] to half +; CHECK-NEXT: [[T15:%.*]] = fptrunc float [[L15]] to half +; CHECK-NEXT: [[D1:%.*]] = getelementptr inbounds half, ptr [[D0]], i64 1 +; CHECK-NEXT: [[D2:%.*]] = getelementptr inbounds half, ptr [[D0]], i64 2 +; CHECK-NEXT: [[D3:%.*]] = getelementptr inbounds half, ptr [[D0]], i64 3 +; CHECK-NEXT: [[D4:%.*]] = getelementptr inbounds half, ptr [[D0]], i64 4 +; CHECK-NEXT: [[D5:%.*]] = getelementptr inbounds half, ptr [[D0]], i64 5 +; CHECK-NEXT: [[D6:%.*]] = getelementptr inbounds half, ptr [[D0]], i64 6 +; CHECK-NEXT: [[D7:%.*]] = getelementptr inbounds half, ptr [[D0]], i64 7 +; CHECK-NEXT: [[D8:%.*]] = getelementptr inbounds half, ptr [[D0]], i64 8 +; CHECK-NEXT: [[D9:%.*]] = getelementptr inbounds half, 
ptr [[D0]], i64 9 +; CHECK-NEXT: [[D10:%.*]] = getelementptr inbounds half, ptr [[D0]], i64 10 +; CHECK-NEXT: [[D11:%.*]] = getelementptr inbounds half, ptr [[D0]], i64 11 +; CHECK-NEXT: [[D12:%.*]] = getelementptr inbounds half, ptr [[D0]], i64 12 +; CHECK-NEXT: [[D13:%.*]] = getelementptr inbounds half, ptr [[D0]], i64 13 +; CHECK-NEXT: [[D14:%.*]] = getelementptr inbounds half, ptr [[D0]], i64 14 +; CHECK-NEXT: [[D15:%.*]] = getelementptr inbounds half, ptr [[D0]], i64 15 +; CHECK-NEXT: store half [[T0]], ptr [[D0]], align 2 +; CHECK-NEXT: store half [[T1]], ptr [[D1]], align 2 +; CHECK-NEXT: store half [[T2]], ptr [[D2]], align 2 +; CHECK-NEXT: store half [[T3]], ptr [[D3]], align 2 +; CHECK-NEXT: store half [[T4]], ptr [[D4]], align 2 +; CHECK-NEXT: store half [[T5]], ptr [[D5]], align 2 +; CHECK-NEXT: store half [[T6]], ptr [[D6]], align 2 +; CHECK-NEXT: store half [[T7]], ptr [[D7]], align 2 +; CHECK-NEXT: store half [[T8]], ptr [[D8]], align 2 +; CHECK-NEXT: store half [[T9]], ptr [[D9]], align 2 +; CHECK-NEXT: store half [[T10]], ptr [[D10]], align 2 +; CHECK-NEXT: store half [[T11]], ptr [[D11]], align 2 +; CHECK-NEXT: store half [[T12]], ptr [[D12]], align 2 +; CHECK-NEXT: store half [[T13]], ptr [[D13]], align 2 +; CHECK-NEXT: store half [[T14]], ptr [[D14]], align 2 +; CHECK-NEXT: store half [[T15]], ptr [[D15]], align 2 +; CHECK-NEXT: ret void +; +; CHECK-F16C-LABEL: define void @fpround_v16xf32_v16xf16( +; CHECK-F16C-SAME: ptr [[S0:%.*]], ptr [[D0:%.*]]) #[[ATTR0]] { +; CHECK-F16C-NEXT: [[S8:%.*]] = getelementptr inbounds float, ptr [[S0]], i64 8 +; CHECK-F16C-NEXT: [[D8:%.*]] = getelementptr inbounds half, ptr [[D0]], i64 8 +; CHECK-F16C-NEXT: [[TMP1:%.*]] = load <8 x float>, ptr [[S0]], align 4 +; CHECK-F16C-NEXT: [[TMP2:%.*]] = fptrunc <8 x float> [[TMP1]] to <8 x half> +; CHECK-F16C-NEXT: [[TMP3:%.*]] = load <8 x float>, ptr [[S8]], align 4 +; CHECK-F16C-NEXT: [[TMP4:%.*]] = fptrunc <8 x float> [[TMP3]] to <8 x half> +; CHECK-F16C-NEXT: store <8 
x half> [[TMP2]], ptr [[D0]], align 2 +; CHECK-F16C-NEXT: store <8 x half> [[TMP4]], ptr [[D8]], align 2 +; CHECK-F16C-NEXT: ret void +; +; CHECK-AVX512-LABEL: define void @fpround_v16xf32_v16xf16( +; CHECK-AVX512-SAME: ptr [[S0:%.*]], ptr [[D0:%.*]]) #[[ATTR0]] { +; CHECK-AVX512-NEXT: [[TMP1:%.*]] = load <16 x float>, ptr [[S0]], align 4 +; CHECK-AVX512-NEXT: [[TMP2:%.*]] = fptrunc <16 x float> [[TMP1]] to <16 x half> +; CHECK-AVX512-NEXT: store <16 x half> [[TMP2]], ptr [[D0]], align 2 +; CHECK-AVX512-NEXT: ret void +; + %s1 = getelementptr inbounds float, ptr %s0, i64 1 + %s2 = getelementptr inbounds float, ptr %s0, i64 2 + %s3 = getelementptr inbounds float, ptr %s0, i64 3 + %s4 = getelementptr inbounds float, ptr %s0, i64 4 + %s5 = getelementptr inbounds float, ptr %s0, i64 5 + %s6 = getelementptr inbounds float, ptr %s0, i64 6 + %s7 = getelementptr inbounds float, ptr %s0, i64 7 + %s8 = getelementptr inbounds float, ptr %s0, i64 8 + %s9 = getelementptr inbounds float, ptr %s0, i64 9 + %s10 = getelementptr inbounds float, ptr %s0, i64 10 + %s11 = getelementptr inbounds float, ptr %s0, i64 11 + %s12 = getelementptr inbounds float, ptr %s0, i64 12 + %s13 = getelementptr inbounds float, ptr %s0, i64 13 + %s14 = getelementptr inbounds float, ptr %s0, i64 14 + %s15 = getelementptr inbounds float, ptr %s0, i64 15 + %l0 = load float, ptr %s0, align 4 + %l1 = load float, ptr %s1, align 4 + %l2 = load float, ptr %s2, align 4 + %l3 = load float, ptr %s3, align 4 + %l4 = load float, ptr %s4, align 4 + %l5 = load float, ptr %s5, align 4 + %l6 = load float, ptr %s6, align 4 + %l7 = load float, ptr %s7, align 4 + %l8 = load float, ptr %s8, align 4 + %l9 = load float, ptr %s9, align 4 + %l10 = load float, ptr %s10, align 4 + %l11 = load float, ptr %s11, align 4 + %l12 = load float, ptr %s12, align 4 + %l13 = load float, ptr %s13, align 4 + %l14 = load float, ptr %s14, align 4 + %l15 = load float, ptr %s15, align 4 + + %t0 = fptrunc float %l0 to half + %t1 = fptrunc float %l1 
to half + %t2 = fptrunc float %l2 to half + %t3 = fptrunc float %l3 to half + %t4 = fptrunc float %l4 to half + %t5 = fptrunc float %l5 to half + %t6 = fptrunc float %l6 to half + %t7 = fptrunc float %l7 to half + %t8 = fptrunc float %l8 to half + %t9 = fptrunc float %l9 to half + %t10 = fptrunc float %l10 to half + %t11 = fptrunc float %l11 to half + %t12 = fptrunc float %l12 to half + %t13 = fptrunc float %l13 to half + %t14 = fptrunc float %l14 to half + %t15 = fptrunc float %l15 to half + + %d1 = getelementptr inbounds half, ptr %d0, i64 1 + %d2 = getelementptr inbounds half, ptr %d0, i64 2 + %d3 = getelementptr inbounds half, ptr %d0, i64 3 + %d4 = getelementptr inbounds half, ptr %d0, i64 4 + %d5 = getelementptr inbounds half, ptr %d0, i64 5 + %d6 = getelementptr inbounds half, ptr %d0, i64 6 + %d7 = getelementptr inbounds half, ptr %d0, i64 7 + %d8 = getelementptr inbounds half, ptr %d0, i64 8 + %d9 = getelementptr inbounds half, ptr %d0, i64 9 + %d10 = getelementptr inbounds half, ptr %d0, i64 10 + %d11 = getelementptr inbounds half, ptr %d0, i64 11 + %d12 = getelementptr inbounds half, ptr %d0, i64 12 + %d13 = getelementptr inbounds half, ptr %d0, i64 13 + %d14 = getelementptr inbounds half, ptr %d0, i64 14 + %d15 = getelementptr inbounds half, ptr %d0, i64 15 + store half %t0, ptr %d0, align 2 + store half %t1, ptr %d1, align 2 + store half %t2, ptr %d2, align 2 + store half %t3, ptr %d3, align 2 + store half %t4, ptr %d4, align 2 + store half %t5, ptr %d5, align 2 + store half %t6, ptr %d6, align 2 + store half %t7, ptr %d7, align 2 + store half %t8, ptr %d8, align 2 + store half %t9, ptr %d9, align 2 + store half %t10, ptr %d10, align 2 + store half %t11, ptr %d11, align 2 + store half %t12, ptr %d12, align 2 + store half %t13, ptr %d13, align 2 + store half %t14, ptr %d14, align 2 + store half %t15, ptr %d15, align 2 + ret void + +} + +; There is no instruction to round f64 to f16; this should not get vectorized! 
+define void @fpround_v2xf64_v2xf16(ptr %s0, ptr %d0) { +; CHECK-LABEL: define void @fpround_v2xf64_v2xf16( +; CHECK-SAME: ptr [[S0:%.*]], ptr [[D0:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[S1:%.*]] = getelementptr inbounds double, ptr [[S0]], i64 1 +; CHECK-NEXT: [[L0:%.*]] = load double, ptr [[S0]], align 4 +; CHECK-NEXT: [[L1:%.*]] = load double, ptr [[S1]], align 4 +; CHECK-NEXT: [[T0:%.*]] = fptrunc double [[L0]] to half +; CHECK-NEXT: [[T1:%.*]] = fptrunc double [[L1]] to half +; CHECK-NEXT: [[D1:%.*]] = getelementptr inbounds half, ptr [[D0]], i64 1 +; CHECK-NEXT: store half [[T0]], ptr [[D0]], align 2 +; CHECK-NEXT: store half [[T1]], ptr [[D1]], align 2 +; CHECK-NEXT: ret void +; +; CHECK-F16C-LABEL: define void @fpround_v2xf64_v2xf16( +; CHECK-F16C-SAME: ptr [[S0:%.*]], ptr [[D0:%.*]]) #[[ATTR0]] { +; CHECK-F16C-NEXT: [[S1:%.*]] = getelementptr inbounds double, ptr [[S0]], i64 1 +; CHECK-F16C-NEXT: [[L0:%.*]] = load double, ptr [[S0]], align 4 +; CHECK-F16C-NEXT: [[L1:%.*]] = load double, ptr [[S1]], align 4 +; CHECK-F16C-NEXT: [[T0:%.*]] = fptrunc double [[L0]] to half +; CHECK-F16C-NEXT: [[T1:%.*]] = fptrunc double [[L1]] to half +; CHECK-F16C-NEXT: [[D1:%.*]] = getelementptr inbounds half, ptr [[D0]], i64 1 +; CHECK-F16C-NEXT: store half [[T0]], ptr [[D0]], align 2 +; CHECK-F16C-NEXT: store half [[T1]], ptr [[D1]], align 2 +; CHECK-F16C-NEXT: ret void +; +; CHECK-AVX512-LABEL: define void @fpround_v2xf64_v2xf16( +; CHECK-AVX512-SAME: ptr [[S0:%.*]], ptr [[D0:%.*]]) #[[ATTR0]] { +; CHECK-AVX512-NEXT: [[S1:%.*]] = getelementptr inbounds double, ptr [[S0]], i64 1 +; CHECK-AVX512-NEXT: [[L0:%.*]] = load double, ptr [[S0]], align 4 +; CHECK-AVX512-NEXT: [[L1:%.*]] = load double, ptr [[S1]], align 4 +; CHECK-AVX512-NEXT: [[T0:%.*]] = fptrunc double [[L0]] to half +; CHECK-AVX512-NEXT: [[T1:%.*]] = fptrunc double [[L1]] to half +; CHECK-AVX512-NEXT: [[D1:%.*]] = getelementptr inbounds half, ptr [[D0]], i64 1 +; CHECK-AVX512-NEXT: store half [[T0]], ptr [[D0]], 
align 2 +; CHECK-AVX512-NEXT: store half [[T1]], ptr [[D1]], align 2 +; CHECK-AVX512-NEXT: ret void +; + %s1 = getelementptr inbounds double, ptr %s0, i64 1 + %l0 = load double, ptr %s0, align 4 + %l1 = load double, ptr %s1, align 4 + + %t0 = fptrunc double %l0 to half + %t1 = fptrunc double %l1 to half + + %d1 = getelementptr inbounds half, ptr %d0, i64 1 + store half %t0, ptr %d0, align 2 + store half %t1, ptr %d1, align 2 + ret void +}