diff --git a/llvm/lib/Target/AIE/AIELegalizerHelper.cpp b/llvm/lib/Target/AIE/AIELegalizerHelper.cpp
index 24ef97844444..02eaad1ca441 100644
--- a/llvm/lib/Target/AIE/AIELegalizerHelper.cpp
+++ b/llvm/lib/Target/AIE/AIELegalizerHelper.cpp
@@ -1196,6 +1196,7 @@ bool AIELegalizerHelper::legalizeG_FPTRUNC(LegalizerHelper &Helper,
 bool AIELegalizerHelper::legalizeG_FPEXT(LegalizerHelper &Helper,
                                          MachineInstr &MI) const {
+  const AIEBaseInstrInfo *II = ST.getInstrInfo();
   MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
   MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
@@ -1206,6 +1207,68 @@ bool AIELegalizerHelper::legalizeG_FPEXT(LegalizerHelper &Helper,
   LLT DstTy = MRI.getType(DstReg);
   LLT SrcTy = MRI.getType(SrcReg);
+  /* Vectors:
+     VDst = G_FPEXT VSrc
+     converts to
+     ZeroVec = G_AIE_BROADCAST_VECTOR Zero
+     VShuffleLow = G_AIE_SHUFFLE_VECTOR ZeroVec, VSrc, 18
+     VShuffleHigh = G_AIE_SHUFFLE_VECTOR ZeroVec, VSrc, 19
+     VShuffleLowCast = G_BITCAST VShuffleLow
+     VShuffleHighCast = G_BITCAST VShuffleHigh
+     VDst = G_CONCAT_VECTORS VShuffleLowCast, VShuffleHighCast
+  */
+  if (DstTy.isVector() && SrcTy.isVector()) {
+    // Extract the type information
+    auto DstElementType = DstTy.getElementType();
+    auto SrcNumElements = SrcTy.getNumElements();
+    // Create constants for the shuffle modes
+    Register Mode18 = MIRBuilder.buildConstant(S32, 18).getReg(0);
+    Register Mode19 = MIRBuilder.buildConstant(S32, 19).getReg(0);
+    Register Zero = MIRBuilder.buildConstant(S32, 0).getReg(0);
+    // Get the target-specific opcodes
+    const unsigned BroadcastOpc = II->getGenericBroadcastVectorOpcode();
+    const unsigned VShuffleOpc = II->getGenericShuffleVectorOpcode();
+
+    // Step 1: Create a zero vector using a broadcast
+    Register ZeroVec =
+        MIRBuilder.buildInstr(BroadcastOpc, {SrcTy}, {Zero}).getReg(0);
+    // Step 2: Create the VSHUFFLE for the low 512 bits (mode 18)
+    Register VShuffleLow =
+        MIRBuilder.buildInstr(VShuffleOpc, {SrcTy}, {ZeroVec, SrcReg, Mode18})
+            .getReg(0);
+    // Step 3: Create the VSHUFFLE for the high 512 bits (mode 19)
+    Register VShuffleHigh =
+        MIRBuilder.buildInstr(VShuffleOpc, {SrcTy}, {ZeroVec, SrcReg, Mode19})
+            .getReg(0);
+    // Step 4: Bitcast VShuffleLow and VShuffleHigh to the destination element
+    // type. Example: <32 x s16> -> <16 x s32>
+    LLT CastToNewTy =
+        LLT::vector(ElementCount::getFixed(SrcNumElements / 2), DstElementType);
+    if (CastToNewTy.getSizeInBits() !=
+            MRI.getType(VShuffleLow).getSizeInBits() ||
+        CastToNewTy.getSizeInBits() !=
+            MRI.getType(VShuffleHigh).getSizeInBits()) {
+      llvm::errs()
+          << "Error: Size mismatch in vector bitcast for G_FPEXT. Expected: "
+          << CastToNewTy.getSizeInBits()
+          << " bits, got: " << MRI.getType(VShuffleLow).getSizeInBits()
+          << " and " << MRI.getType(VShuffleHigh).getSizeInBits() << " bits\n";
+      return false;
+    }
+    auto VShuffleLowCast =
+        MIRBuilder.buildCast(CastToNewTy, VShuffleLow).getReg(0);
+    auto VShuffleHighCast =
+        MIRBuilder.buildCast(CastToNewTy, VShuffleHigh).getReg(0);
+    // Step 5: Concatenate the two shuffled vectors into the destination
+    // vector.
+    // TODO: Verify the (low, high) operand order of this concatenation.
+    MIRBuilder.buildConcatVectors(DstReg, {VShuffleLowCast, VShuffleHighCast});
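+    // For example (a sketch; register names are illustrative), a <32 x s16>
+    // source extends to <32 x s32> as:
+    //   %c18:_(s32) = G_CONSTANT i32 18
+    //   %c19:_(s32) = G_CONSTANT i32 19
+    //   %zero:_(s32) = G_CONSTANT i32 0
+    //   %zvec:_(<32 x s16>) = G_AIE_BROADCAST_VECTOR %zero(s32)
+    //   %lo:_(<32 x s16>) = G_AIE_SHUFFLE_VECTOR %zvec, %src, %c18(s32)
+    //   %hi:_(<32 x s16>) = G_AIE_SHUFFLE_VECTOR %zvec, %src, %c19(s32)
+    //   %lo32:_(<16 x s32>) = G_BITCAST %lo(<32 x s16>)
+    //   %hi32:_(<16 x s32>) = G_BITCAST %hi(<32 x s16>)
+    //   %dst:_(<32 x s32>) = G_CONCAT_VECTORS %lo32(<16 x s32>), %hi32(<16 x s32>)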
+
+    MI.eraseFromParent();
+    return true;
+  }
+
+  // Scalars
   // We only handle bfloat16 to single precision conversion
   if (DstTy != LLT::scalar(32) || SrcTy != LLT::scalar(16))
     return false;
@@ -1300,6 +1363,12 @@ bool AIELegalizerHelper::legalizeG_FMUL(LegalizerHelper &Helper,
   MI.eraseFromParent();
   return true;
 }
+static bool isBF16Vector(const LLT Ty) {
+  return Ty.isVector() && Ty.getScalarSizeInBits() == 16;
+}
+static bool isF32Vector(const LLT Ty) {
+  return Ty.isVector() && Ty.getScalarSizeInBits() == 32;
+}
 
 bool AIELegalizerHelper::legalizeG_FADD_G_FSUB(LegalizerHelper &Helper,
                                                MachineInstr &MI) const {
@@ -1309,6 +1378,138 @@ bool AIELegalizerHelper::legalizeG_FADD_G_FSUB(LegalizerHelper &Helper,
   const Register DstReg = MI.getOperand(0).getReg();
   Register SrcLHS = MI.getOperand(1).getReg();
   Register SrcRHS = MI.getOperand(2).getReg();
+  const LLT SrcLHSTy = MRI.getType(SrcLHS);
+  const LLT SrcRHSTy = MRI.getType(SrcRHS);
+
+  // TODO: This can be combined with the bf16 vector case below.
+  if (isF32Vector(SrcLHSTy) && isF32Vector(SrcRHSTy)) {
+    assert(SrcLHSTy.getNumElements() == 32 && SrcRHSTy.getNumElements() == 32 &&
+           "Expected <32 x f32> inputs for G_FADD/G_FSUB");
+
+    const LLT F32VecTy = SrcLHSTy;
+
+    // Step 1: The input is <32 x f32>; pad it to <64 x f32> for AIE2P, as
+    // AccV64S32 is legal on AIE2P.
+    if (ST.isAIE2P()) {
+      const Register UndefVec = MIRBuilder.buildUndef(F32VecTy).getReg(0);
+      const Register ConcatLHS = MRI.createGenericVirtualRegister(V64FP32);
+      const Register ConcatRHS = MRI.createGenericVirtualRegister(V64FP32);
+      MIRBuilder.buildConcatVectors(ConcatLHS, {SrcLHS, UndefVec});
+      MIRBuilder.buildConcatVectors(ConcatRHS, {SrcRHS, UndefVec});
+      SrcLHS = ConcatLHS;
+      SrcRHS = ConcatRHS;
+    }
+
+    // Step 2: Perform the floating point operation
+    Register Res = MIRBuilder
+                       .buildInstr(MI.getOpcode(), {MRI.getType(SrcLHS)},
+                                   {SrcLHS, SrcRHS})
+                       .getReg(0);
+
+    // Step 3: Handle the accumulator conversion based on the target
+    if (ST.isAIE2()) {
+      Res = MIRBuilder.buildBitcast(V8ACC64, Res).getReg(0);
+    } else if (ST.isAIE2P()) {
+      // Unmerge to get two <32 x f32> vectors, as the FADD/FSUB was done on
+      // <64 x f32>. Take the first vector; the other half is just padding.
+      SmallVector<Register, 4> UnmergedRegs;
+      const auto Unmerge = MIRBuilder.buildUnmerge(F32VecTy, Res);
+      getUnmergeResults(UnmergedRegs, *Unmerge);
+      Res = UnmergedRegs[0];
+    }
+
+    MIRBuilder.buildCopy(DstReg, Res);
+
+    MI.eraseFromParent();
+    return true;
+  }
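+
+  // For illustration (a sketch, assuming AIE2P and <32 x s32> operands), the
+  // f32 path above emits generic MIR of roughly this shape:
+  //   %undef:_(<32 x s32>) = G_IMPLICIT_DEF
+  //   %lhs:_(<64 x s32>) = G_CONCAT_VECTORS %src1, %undef
+  //   %rhs:_(<64 x s32>) = G_CONCAT_VECTORS %src2, %undef
+  //   %sum:_(<64 x s32>) = G_FADD %lhs, %rhs
+  //   %res:_(<32 x s32>), %pad:_(<32 x s32>) = G_UNMERGE_VALUES %sum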
+
+  // Handle bf16 vectors. This code assumes the input is <32 x bf16>; the
+  // LegalizerInfo makes sure that the input is either padded or unmerged to
+  // <32 x bf16>.
+  if (isBF16Vector(SrcLHSTy) && isBF16Vector(SrcRHSTy)) {
+    assert(SrcLHSTy.getNumElements() == 32 && SrcRHSTy.getNumElements() == 32 &&
+           "Expected <32 x bf16> inputs for G_FADD/G_FSUB");
+
+    // Step 1: Convert the bf16 vectors to f32 vectors using FPExt
+    const LLT F32VecTy =
+        LLT::fixed_vector(SrcLHSTy.getNumElements(), LLT::scalar(32));
+    Register SrcLHSF32 = MRI.createGenericVirtualRegister(F32VecTy);
+    Register SrcRHSF32 = MRI.createGenericVirtualRegister(F32VecTy);
+    MIRBuilder.buildFPExt(SrcLHSF32, SrcLHS);
+    MIRBuilder.buildFPExt(SrcRHSF32, SrcRHS);
+
+    // Step 2: The extended input is <32 x f32>; pad it to <64 x f32> for
+    // AIE2P, as AccV64S32 is legal on AIE2P.
+    if (ST.isAIE2P()) {
+      const Register UndefVec = MIRBuilder.buildUndef(F32VecTy).getReg(0);
+      const Register ConcatLHS = MRI.createGenericVirtualRegister(V64FP32);
+      const Register ConcatRHS = MRI.createGenericVirtualRegister(V64FP32);
+      MIRBuilder.buildConcatVectors(ConcatLHS, {SrcLHSF32, UndefVec});
+      MIRBuilder.buildConcatVectors(ConcatRHS, {SrcRHSF32, UndefVec});
+      SrcLHSF32 = ConcatLHS;
+      SrcRHSF32 = ConcatRHS;
+    }
+
+    // Step 3: Perform the floating point operation
+    Register Res = MIRBuilder
+                       .buildInstr(MI.getOpcode(), {MRI.getType(SrcLHSF32)},
+                                   {SrcLHSF32, SrcRHSF32})
+                       .getReg(0);
+
+    // Step 4: Handle the accumulator conversion based on the target
+    if (ST.isAIE2()) {
+      Res = MIRBuilder.buildBitcast(V8ACC64, Res).getReg(0);
+    } else if (ST.isAIE2P()) {
+      // Unmerge to get two <32 x f32> vectors, as the FADD/FSUB was done on
+      // <64 x f32>. Take the first vector; the other half is just padding.
+      SmallVector<Register, 4> UnmergedRegs;
+      const auto Unmerge = MIRBuilder.buildUnmerge(F32VecTy, Res);
+      getUnmergeResults(UnmergedRegs, *Unmerge);
+      Res = UnmergedRegs[0];
+    }
+
+    // Step 5: Convert back to bf16 using the truncation intrinsic
+    const int VecSize = MRI.getType(Res).getSizeInBits();
+    const LLT DstLLT = ST.isAIE2P() ? V32BF16 : V16BF16;
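+    // A sketch of the expected generic MIR for this step on AIE2P (names are
+    // illustrative):
+    //   %bf:_(<32 x s16>) = G_INTRINSIC_W_SIDE_EFFECTS
+    //       intrinsic(@llvm.aie2p.v32accfloat.to.v32bf16), %res(<32 x s32>)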
+    Res = MIRBuilder
+              .buildIntrinsic(getFpTrunc32ToBF16IntrID(ST, VecSize), {DstLLT},
+                              true, false)
+              .addUse(Res)
+              .getReg(0);
+
+    // Handle AIE2 padding
+    if (ST.isAIE2()) {
+      Res = emitPadUndefVector(MRI, MIRBuilder, V32BF16, Res);
+    }
+
+    MIRBuilder.buildCopy(DstReg, Res);
+
+    MI.eraseFromParent();
+    return true;
+  }
 
   assert(MRI.getType(DstReg) == LLT::scalar(16) &&
          "Expected bfloat16 type in custom legalization.");
diff --git a/llvm/lib/Target/AIE/aie2p/AIE2PLegalizerInfo.cpp b/llvm/lib/Target/AIE/aie2p/AIE2PLegalizerInfo.cpp
index 5ea47376c667..97c95e135d3f 100644
--- a/llvm/lib/Target/AIE/aie2p/AIE2PLegalizerInfo.cpp
+++ b/llvm/lib/Target/AIE/aie2p/AIE2PLegalizerInfo.cpp
@@ -73,6 +73,31 @@ static LegalityPredicate isValidVectorAIEP(const unsigned TypeIdx) {
   };
 }
 
+// `V2 = G_FPEXT V1` on vectors is valid iff:
+// - V1 and V2 are floating-point vectors
+// - V2 is wider than V1 in total vector size
+// - Both vectors have the same number of elements
+// - The element size of V2 is twice the element size of V1
+static LegalityPredicate isValidVectorFPEXT(const unsigned TypeIdxDst,
+                                            const unsigned TypeIdxSrc) {
+  return [=](const LegalityQuery &Query) {
+    const LLT DstTy = Query.Types[TypeIdxDst];
+    const LLT SrcTy = Query.Types[TypeIdxSrc];
+    if (DstTy.isVector() && SrcTy.isVector()) {
+      auto DstElementCount = DstTy.getElementCount();
+      auto SrcElementCount = SrcTy.getElementCount();
+      auto DstElementSize = DstTy.getElementType().getSizeInBits();
+      auto SrcElementSize = SrcTy.getElementType().getSizeInBits();
+      return DstTy.getSizeInBits() > SrcTy.getSizeInBits() &&
+             DstElementCount == SrcElementCount &&
+             DstElementSize == (SrcElementSize * 2);
+    }
+    return false;
+  };
+}
+
 static LegalityPredicate
 negatePredicate(const std::function<bool(const LegalityQuery &)> &Func) {
   return [=](const LegalityQuery &Query) { return !Func(Query); };
@@ -219,6 +244,13 @@ AIE2PLegalizerInfo::AIE2PLegalizerInfo(const AIE2PSubtarget &ST)
   getActionDefinitionsBuilder(G_FPEXT)
       .libcallFor({{S64, S32}})
       .customFor({{S32, S16}})
+      // Add support for vector types: widen vectors to at least 512 bits
+      .clampMinNumElements(1, S8, 64)
+      .clampMinNumElements(1, S16, 32)
+      .clampMinNumElements(1, S32, 16)
+      .customIf(isValidVectorFPEXT(0 /* Dst */, 1 /* Src */))
       .narrowScalarFor({{S64, S16}},
                        llvm::LegalizeMutations::changeTo(0, S32));
   getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
@@ -241,7 +273,35 @@ AIE2PLegalizerInfo::AIE2PLegalizerInfo(const AIE2PSubtarget &ST)
   getActionDefinitionsBuilder({G_FADD, G_FSUB})
       .legalFor({AccV64S32})
-      .customFor({S16})
+      // Handle the custom bf16/f32 cases for both scalar and vector types
+      .customFor({S16, V32S16, V32S32})
+      // Widen small f32/bf16 vectors to a legal size (<64 x f32> or
+      // <32 x bf16>); the element type is unchanged.
+      .moreElementsIf(
+          [=](const LegalityQuery &Query) {
+            const LLT &Ty = Query.Types[0];
+            return Ty.isVector() &&
+                   (Ty.getScalarSizeInBits() == 32 ||
+                    Ty.getScalarSizeInBits() == 16) &&
+                   Ty.getNumElements() <= 32;
+          },
+          [=](const LegalityQuery &Query) {
+            if (Query.Types[0].getScalarSizeInBits() == 32) {
+              // Note: can cause a slowdown, as BUILD_VECTOR adds scalars
+              return std::make_pair(0, LLT::fixed_vector(64, S32));
+            } else {
+              return std::make_pair(0, LLT::fixed_vector(32, S16));
+            }
+          })
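+      // For example (illustrative): with these rules a G_FADD on <8 x s32>
+      // is widened to <64 x s32>, which is legal as AccV64S32, while a
+      // G_FADD on <16 x s16> is widened to <32 x s16> and then custom-lowered
+      // above.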
+      // Split <64 x bf16> into two chunks of <32 x bf16>
+      .fewerElementsIf(
+          [=](const LegalityQuery &Query) {
+            const LLT &Ty = Query.Types[0];
+            return Ty.isVector() && Ty.getScalarSizeInBits() == 16 &&
+                   Ty.getNumElements() == 64;
+          },
+          [=](const LegalityQuery &Query) {
+            return std::make_pair(0, LLT::fixed_vector(32, S16));
+          })
       .libcallFor({S32, S64});
 
   getActionDefinitionsBuilder({G_FDIV, G_FREM})
diff --git a/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/legalize-vector-fadd.ll b/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/legalize-vector-fadd.ll
new file mode 100644
index 000000000000..c6e7feae1338
--- /dev/null
+++ b/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/legalize-vector-fadd.ll
@@ -0,0 +1,56 @@
+; RUN: llc -mtriple=aie2p -O0 -stop-after=legalizer %s -o - 2>&1 | FileCheck %s
+; This test is carved out, for upstreaming, from
+; iree-amd-aie/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/multi_reduction_to_reduction_sizes_types.mlir
+
+; Ideally the reduction should legalize as follows (with minor changes for
+; each shape):
+;   Input1: <32 x bf16>, Input2: <32 x bf16>
+;   Extended1: <32 x f32> = fpext Input1
+;   Extended2: <32 x f32> = fpext Input2
+;   Pad: <32 x f32> = implicit_def
+;   Out1: <64 x f32> = concat Extended1, Pad
+;   Out2: <64 x f32> = concat Extended2, Pad
+;   Result: <64 x f32> = fadd Out1, Out2
+;   R1: <32 x f32>, R2: <32 x f32> = unmerge Result
+;   R2 holds only the padding lanes and is dropped
+;   R1: <32 x bf16> = trunc R1
+
+; TODO: add checks for the vadd.f lowering, the padding to <32 x bf16>,
+; checks similar to the <32 x bf16> case, and the unpadding of the result.
+define bfloat @multi_reduction_1d_16_bf16(<16 x bfloat> %0, bfloat %1) {
+  %3 = call reassoc bfloat @llvm.vector.reduce.fadd.v16bf16(bfloat %1, <16 x bfloat> %0)
+  ret bfloat %3
+}
+
+; CHECK-LABEL: name: multi_reduction_1d_32_bf16
+; CHECK: G_CONSTANT i32 18
+; CHECK: G_CONSTANT i32 19
+; CHECK: G_CONSTANT i32 0
+; CHECK: G_AIE_BROADCAST_VECTOR %{{[0-9]+}}(s32)
+; CHECK: G_AIE_SHUFFLE_VECTOR %{{[0-9]+}}, %{{[0-9]+}}, %{{[0-9]+}}(s32)
+; CHECK: G_AIE_SHUFFLE_VECTOR %{{[0-9]+}}, %{{[0-9]+}}, %{{[0-9]+}}(s32)
+; CHECK: G_BITCAST %{{[0-9]+}}(<32 x s16>)
+; CHECK: G_BITCAST %{{[0-9]+}}(<32 x s16>)
+; CHECK: G_CONCAT_VECTORS %{{[0-9]+}}(<16 x s32>), %{{[0-9]+}}(<16 x s32>)
+; CHECK: G_IMPLICIT_DEF
+; CHECK: G_CONCAT_VECTORS %{{[0-9]+}}(<32 x s32>), %{{[0-9]+}}(<32 x s32>)
+; CHECK: G_FADD %{{[0-9]+}}, %{{[0-9]+}}
+; CHECK: G_UNMERGE_VALUES %{{[0-9]+}}(<64 x s32>)
+; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.aie2p.v32accfloat.to.v32bf16), %{{[0-9]+}}(<32 x s32>)
+define bfloat @multi_reduction_1d_32_bf16(<32 x bfloat> %0, bfloat %1) {
+  %3 = call reassoc bfloat @llvm.vector.reduce.fadd.v32bf16(bfloat %1, <32 x bfloat> %0)
+  ret bfloat %3
+}
+
+; Converted to chunks of <32 x bf16>.
+; TODO: check that the input is split into two chunks of <32 x bf16>, check
+; each chunk as in the <32 x bf16> case, and check that both results are
+; concatenated back to <64 x bf16>.
+define bfloat @multi_reduction_1d_64_bf16(<64 x bfloat> %0, bfloat %1) {
+  %3 = call reassoc bfloat @llvm.vector.reduce.fadd.v64bf16(bfloat %1, <64 x bfloat> %0)
+  ret bfloat %3
+}
diff --git a/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/legalize-vector-fpext.ll b/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/legalize-vector-fpext.ll
new file mode 100644
index 000000000000..6eb8fa08279f
--- /dev/null
+++ b/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/legalize-vector-fpext.ll
@@ -0,0 +1,73 @@
+; RUN: llc -mtriple=aie2p -O0 -stop-after=legalizer %s -o - 2>&1 | FileCheck %s
+
+; Validates bfloat -> float vector legalization.
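+; A sketch of the expected legalized sequence checked below (register names
+; are illustrative):
+;   %bcast:_(<32 x s16>) = G_AIE_BROADCAST_VECTOR %c0(s32)
+;   %lo:_(<32 x s16>) = G_AIE_SHUFFLE_VECTOR %bcast, %src, %c18(s32)
+;   %hi:_(<32 x s16>) = G_AIE_SHUFFLE_VECTOR %bcast, %src, %c19(s32)
+;   %dst:_(<32 x s32>) = G_CONCAT_VECTORS %lo32(<16 x s32>), %hi32(<16 x s32>)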
+
+; CHECK-LABEL: name: extend
+; CHECK: [[COPY:%[0-9]+]]:_(<32 x s16>) = COPY $x0
+; CHECK-NOT: G_SHL
+; CHECK-NEXT: [[C18:%[0-9]+]]:_(s32) = G_CONSTANT i32 18
+; CHECK-NEXT: [[C19:%[0-9]+]]:_(s32) = G_CONSTANT i32 19
+; CHECK-NEXT: [[C0:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+; CHECK-NEXT: [[BCAST:%[0-9]+]]:_(<32 x s16>) = G_AIE_BROADCAST_VECTOR [[C0]](s32)
+; CHECK-NEXT: [[SHUF1:%[0-9]+]]:_(<32 x s16>) = G_AIE_SHUFFLE_VECTOR [[BCAST]], [[COPY]], [[C18]](s32)
+; CHECK-NEXT: [[SHUF2:%[0-9]+]]:_(<32 x s16>) = G_AIE_SHUFFLE_VECTOR [[BCAST]], [[COPY]], [[C19]](s32)
+; CHECK-NEXT: [[BIT1:%[0-9]+]]:_(<16 x s32>) = G_BITCAST [[SHUF1]](<32 x s16>)
+; CHECK-NEXT: [[BIT2:%[0-9]+]]:_(<16 x s32>) = G_BITCAST [[SHUF2]](<32 x s16>)
+; CHECK-NEXT: [[CONCAT:%[0-9]+]]:_(<32 x s32>) = G_CONCAT_VECTORS [[BIT1]](<16 x s32>), [[BIT2]](<16 x s32>)
+
+define <32 x float> @extend(bfloat %o, <32 x bfloat> %in) nounwind {
+  %X = fpext <32 x bfloat> %in to <32 x float>
+  ret <32 x float> %X
+}
+
+; Pads the 17 valid elements with undefined values to form a 32-element
+; vector.
+
+; CHECK-LABEL: name: extend_non_power_of_2
+; CHECK: [[COPY:%[0-9]+]]:_(<32 x s16>) = COPY $x0
+; CHECK-COUNT-17: G_AIE_SEXT_EXTRACT_VECTOR_ELT
+; CHECK-COUNT-32: G_AIE_ADD_VECTOR_ELT_HI
+; CHECK-NEXT: [[C18:%[0-9]+]]:_(s32) = G_CONSTANT i32 18
+; CHECK-NEXT: [[C19:%[0-9]+]]:_(s32) = G_CONSTANT i32 19
+; CHECK-NEXT: [[C0:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+; CHECK-NEXT: [[BCAST:%[0-9]+]]:_(<32 x s16>) = G_AIE_BROADCAST_VECTOR [[C0]](s32)
+; CHECK-NEXT: [[SHUF1:%[0-9]+]]:_(<32 x s16>) = G_AIE_SHUFFLE_VECTOR [[BCAST]], %{{[0-9]+}}, [[C18]](s32)
+; CHECK-NEXT: [[SHUF2:%[0-9]+]]:_(<32 x s16>) = G_AIE_SHUFFLE_VECTOR [[BCAST]], %{{[0-9]+}}, [[C19]](s32)
+; CHECK-NEXT: [[BIT1:%[0-9]+]]:_(<16 x s32>) = G_BITCAST [[SHUF1]](<32 x s16>)
+; CHECK-NEXT: [[BIT2:%[0-9]+]]:_(<16 x s32>) = G_BITCAST [[SHUF2]](<32 x s16>)
+; CHECK-COUNT-17: G_AIE_SEXT_EXTRACT_VECTOR_ELT
+; CHECK-COUNT-32: G_AIE_ADD_VECTOR_ELT_HI
+; CHECK-NEXT: [[CONCAT:%[0-9]+]]:_(<32 x s32>) = G_CONCAT_VECTORS %{{[0-9]+}}(<16 x s32>), %{{[0-9]+}}(<16 x s32>)
+define <17 x float> @extend_non_power_of_2(<17 x bfloat> %in) nounwind {
+  %X = fpext <17 x bfloat> %in to <17 x float>
+  ret <17 x float> %X
+}
+
+; Validates inputs smaller than 512 bits, which are padded to <32 x s16>.
+
+; CHECK-LABEL: name: fpext_bf16_to_f32
+; CHECK: bb.1
+; CHECK: [[VEC_CONCAT:%[0-9]+]]:_(<32 x s16>) = G_CONCAT_VECTORS
+; CHECK: G_AIE_SEXT_EXTRACT_VECTOR_ELT [[VEC_CONCAT]]
+; CHECK: G_AIE_ADD_VECTOR_ELT_HI
+; CHECK: [[SHUFFLE_VEC:%[0-9]+]]:_(<32 x s16>) = G_AIE_SHUFFLE_VECTOR
+; CHECK-NOT: G_AIE_SHUFFLE_VECTOR
+; CHECK: [[BITCAST:%[0-9]+]]:_(<16 x s32>) = G_BITCAST [[SHUFFLE_VEC]]
+; CHECK: $x0 = COPY [[BITCAST]]
+define <16 x float> @fpext_bf16_to_f32(<16 x bfloat> %in) nounwind {
+  %X = fpext <16 x bfloat> %in to <16 x float>
+  ret <16 x float> %X
+}
+
+; Validates the scalar path.
+; CHECK-LABEL: name: fpext_scalar_bf16_to_f32
+; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $r1
+; CHECK-NEXT: [[C16:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY]], [[C16]](s32)
+; CHECK-NOT: G_AIE_SHUFFLE_VECTOR
+; CHECK-NEXT: $r0 = COPY [[SHL]](s32)
+; CHECK-NEXT: PseudoRET implicit $lr, implicit $r0
+
+define float @fpext_scalar_bf16_to_f32(bfloat %in) nounwind {
+  %X = fpext bfloat %in to float
+  ret float %X
+}
diff --git a/llvm/test/CodeGen/AIE/aie2p/end-to-end/llvm_vec_reduce_intrinsic_vectorized.ll b/llvm/test/CodeGen/AIE/aie2p/end-to-end/llvm_vec_reduce_intrinsic_vectorized.ll
new file mode 100644
index 000000000000..1fb1b165b543
--- /dev/null
+++ b/llvm/test/CodeGen/AIE/aie2p/end-to-end/llvm_vec_reduce_intrinsic_vectorized.ll
@@ -0,0 +1,338 @@
+; RUN: llc -mtriple=aie2p %s -o - | FileCheck %s
+
+; Test that the vector reduction intrinsics are lowered to vadd instructions:
+; 1. The number of vadd.f instructions should not equal the vector length for
+;    floats, i.e. the float reductions must stay vectorized.
+; 2. Integer reductions use vadd.32 and float reductions use vadd.f.
+; 3. The vadd.32 instructions should be placed in the proper issue slots.
+
+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+
+; CHECK-LABEL: multi_reduction_1d_16_i32:
+; CHECK: vadd.32 x0, x0, x2
+define i32 @multi_reduction_1d_16_i32(<16 x i32> %0, i32 %1) {
+  %3 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %0)
+  %4 = add i32 %1, %3
+  ret i32 %4
+}
+
+; CHECK-LABEL: multi_reduction_1d_8_i32:
+; CHECK: vadd.32 x0, x0, x2
+define i32 @multi_reduction_1d_8_i32(<8 x i32> %0, i32 %1) {
+  %3 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %0)
+  %4 = add i32 %1, %3
+  ret i32 %4
+}
+
+; CHECK-LABEL: multi_reduction_1d_64_i32:
+; CHECK: vadd.32 x0, x0, x2
+define i32 @multi_reduction_1d_64_i32(<64 x i32> %0, i32 %1) {
+  %3 = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> %0)
+  %4 = add i32 %1, %3
+  ret i32 %4
+}
+
+; CHECK-LABEL: multi_reduction_1d_16_bf16:
+; CHECK: vadd.f dm0, dm0, dm1
+define bfloat @multi_reduction_1d_16_bf16(<16 x bfloat> %0, bfloat %1) {
+  %3 = call reassoc bfloat @llvm.vector.reduce.fadd.v16bf16(bfloat %1, <16 x bfloat> %0)
+  ret bfloat %3
+}
+
+; CHECK-LABEL: multi_reduction_1d_32_bf16:
+; CHECK: vadd.f dm0, dm0, dm1
+define bfloat @multi_reduction_1d_32_bf16(<32 x bfloat> %0, bfloat %1) {
+  %3 = call reassoc bfloat @llvm.vector.reduce.fadd.v32bf16(bfloat %1, <32 x bfloat> %0)
+  ret bfloat %3
+}
+
+; CHECK-LABEL: multi_reduction_1d_64_bf16:
+; CHECK: vadd.f dm0, dm0, dm1
+define bfloat @multi_reduction_1d_64_bf16(<64 x bfloat> %0, bfloat %1) {
+  %3 = call reassoc bfloat @llvm.vector.reduce.fadd.v64bf16(bfloat %1, <64 x bfloat> %0)
+  ret bfloat %3
+}
+
+; CHECK-LABEL: multi_reduction_1d_16_f32:
+; CHECK: vadd.f dm0, dm0, dm1
+define float @multi_reduction_1d_16_f32(<16 x float> %0, float %1) {
+  %3 = call reassoc float @llvm.vector.reduce.fadd.v16f32(float %1, <16 x float> %0)
+  ret float %3
+}
+
+; CHECK-LABEL: multi_reduction_1d_32_f32:
+; CHECK: vadd.f dm0, dm0, dm1
+define float @multi_reduction_1d_32_f32(<32 x float> %0, float %1) {
+  %3 = call reassoc float @llvm.vector.reduce.fadd.v32f32(float %1, <32 x float> %0)
+  ret float %3
+}
+
+; CHECK-LABEL: multi_reduction_1d_64_f32:
+; CHECK: vadd.f dm0, dm0, dm1
+define float @multi_reduction_1d_64_f32(<64 x float> %0, float %1) {
+  %3 = call reassoc float @llvm.vector.reduce.fadd.v64f32(float %1, <64 x float> %0)
+  ret float %3
+}
+
+; CHECK-LABEL: multi_reduction_2d_4x4_i32:
+; CHECK: vadd.32 x0, x0, x2
+define i32 @multi_reduction_2d_4x4_i32([4 x <4 x i32>] %0, i32 %1) {
+  %3 = extractvalue [4 x <4 x i32>] %0, 0
+  %4 = shufflevector <4 x i32> %3, <4 x i32> %3, <16 x i32>
+  %5 = shufflevector <16 x i32> %4, <16 x i32> poison, <16 x i32>
+  %6 = extractvalue [4 x <4 x i32>] %0, 1
+  %7 = shufflevector <4 x i32> %6, <4 x i32> %6, <16 x i32>
+  %8 = shufflevector <16 x i32> %7, <16 x i32> %5, <16 x i32>
+  %9 = extractvalue [4 x <4 x i32>] %0, 2
+  %10 = shufflevector <4 x i32> %9, <4 x i32> %9, <16 x i32>
+  %11 = shufflevector <16 x i32> %10, <16 x i32> %8, <16 x i32>
+  %12 = extractvalue [4 x <4 x i32>] %0, 3
+  %13 = shufflevector <4 x i32> %12, <4 x i32> %12, <16 x i32>
+  %14 = shufflevector <16 x i32> %13, <16 x i32> %11, <16 x i32>
+  %15 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32>
%14) + %16 = add i32 %1, %15 + ret i32 %16 +} + +; CHECK-LABEL: multi_reduction_2d_2x4_i32: +; CHECK: vadd.32 x0, x0, x2 +define i32 @multi_reduction_2d_2x4_i32([2 x <4 x i32>] %0, i32 %1) { + %3 = extractvalue [2 x <4 x i32>] %0, 0 + %4 = shufflevector <4 x i32> %3, <4 x i32> %3, <8 x i32> + %5 = shufflevector <8 x i32> %4, <8 x i32> poison, <8 x i32> + %6 = extractvalue [2 x <4 x i32>] %0, 1 + %7 = shufflevector <4 x i32> %6, <4 x i32> %6, <8 x i32> + %8 = shufflevector <8 x i32> %7, <8 x i32> %5, <8 x i32> + %9 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %8) + %10 = add i32 %1, %9 + ret i32 %10 +} + +; CHECK-LABEL: multi_reduction_2d_8x8_i32: +; CHECK: vadd.32 x0, x0, x2 +define i32 @multi_reduction_2d_8x8_i32([8 x <8 x i32>] %0, i32 %1) { + %3 = extractvalue [8 x <8 x i32>] %0, 0 + %4 = shufflevector <8 x i32> %3, <8 x i32> %3, <64 x i32> + %5 = shufflevector <64 x i32> %4, <64 x i32> poison, <64 x i32> + %6 = extractvalue [8 x <8 x i32>] %0, 1 + %7 = shufflevector <8 x i32> %6, <8 x i32> %6, <64 x i32> + %8 = shufflevector <64 x i32> %7, <64 x i32> %5, <64 x i32> + %9 = extractvalue [8 x <8 x i32>] %0, 2 + %10 = shufflevector <8 x i32> %9, <8 x i32> %9, <64 x i32> + %11 = shufflevector <64 x i32> %10, <64 x i32> %8, <64 x i32> + %12 = extractvalue [8 x <8 x i32>] %0, 3 + %13 = shufflevector <8 x i32> %12, <8 x i32> %12, <64 x i32> + %14 = shufflevector <64 x i32> %13, <64 x i32> %11, <64 x i32> + %15 = extractvalue [8 x <8 x i32>] %0, 4 + %16 = shufflevector <8 x i32> %15, <8 x i32> %15, <64 x i32> + %17 = shufflevector <64 x i32> %16, <64 x i32> %14, <64 x i32> + %18 = extractvalue [8 x <8 x i32>] %0, 5 + %19 = shufflevector <8 x i32> %18, <8 x i32> %18, <64 x i32> + %20 = shufflevector <64 x i32> %19, <64 x i32> %17, <64 x i32> + %21 = extractvalue [8 x <8 x i32>] %0, 6 + %22 = shufflevector <8 x i32> %21, <8 x i32> %21, <64 x i32> + %23 = shufflevector <64 x i32> %22, <64 x i32> %20, <64 x i32> + %24 = extractvalue [8 x <8 x i32>] %0, 7 + %25 = shufflevector <8 x i32> %24, <8 x i32> %24, <64 x i32> + %26 = shufflevector <64 x i32> %25, <64 x i32> %23, <64 x i32> + %27 = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> %26) + %28 = add i32 %1, %27 + ret i32 %28 +} + +; CHECK-LABEL: multi_reduction_2d_4x4_bf16: +; CHECK: vadd.f dm0, dm0, dm1 +define bfloat @multi_reduction_2d_4x4_bf16([4 x <4 x bfloat>] %0, bfloat %1) { + %3 = extractvalue [4 x <4 x bfloat>] %0, 0 + %4 = shufflevector <4 x bfloat> %3, <4 x bfloat> %3, <16 x i32> + %5 = shufflevector <16 x bfloat> %4, <16 x bfloat> poison, <16 x i32> + %6 = extractvalue [4 x <4 x bfloat>] %0, 1 + %7 = shufflevector <4 x bfloat> %6, <4 x bfloat> %6, <16 x i32> + %8 = shufflevector <16 x bfloat> %7, <16 x bfloat> %5, <16 x i32> + %9 = extractvalue [4 x <4 x bfloat>] %0, 2 + %10 = shufflevector <4 x bfloat> %9, <4 x bfloat> %9, <16 x i32> + %11 = shufflevector <16 x bfloat> %10, <16 x bfloat> %8, <16 x i32> + %12 = extractvalue [4 x <4 x bfloat>] %0, 3 + %13 = shufflevector <4 x bfloat> %12, <4 x bfloat> %12, <16 x i32> + %14 = shufflevector <16 x bfloat> %13, <16 x bfloat> %11, <16 x i32> + %15 = call reassoc bfloat @llvm.vector.reduce.fadd.v16bf16(bfloat %1, <16 x bfloat> %14) + ret bfloat %15 +} + +; CHECK-LABEL: multi_reduction_2d_8x4_bf16: +; CHECK: vadd.f dm0, dm0, dm1 +define bfloat @multi_reduction_2d_8x4_bf16([8 x <4 x bfloat>] %0, bfloat %1) { + %3 = extractvalue [8 x <4 x bfloat>] %0, 0 + %4 = shufflevector <4 x bfloat> %3, <4 x bfloat> %3, <32 x i32> + %5 = shufflevector <32 x bfloat> %4, <32 x bfloat> poison, <32 x 
i32> + %6 = extractvalue [8 x <4 x bfloat>] %0, 1 + %7 = shufflevector <4 x bfloat> %6, <4 x bfloat> %6, <32 x i32> + %8 = shufflevector <32 x bfloat> %7, <32 x bfloat> %5, <32 x i32> + %9 = extractvalue [8 x <4 x bfloat>] %0, 2 + %10 = shufflevector <4 x bfloat> %9, <4 x bfloat> %9, <32 x i32> + %11 = shufflevector <32 x bfloat> %10, <32 x bfloat> %8, <32 x i32> + %12 = extractvalue [8 x <4 x bfloat>] %0, 3 + %13 = shufflevector <4 x bfloat> %12, <4 x bfloat> %12, <32 x i32> + %14 = shufflevector <32 x bfloat> %13, <32 x bfloat> %11, <32 x i32> + %15 = extractvalue [8 x <4 x bfloat>] %0, 4 + %16 = shufflevector <4 x bfloat> %15, <4 x bfloat> %15, <32 x i32> + %17 = shufflevector <32 x bfloat> %16, <32 x bfloat> %14, <32 x i32> + %18 = extractvalue [8 x <4 x bfloat>] %0, 5 + %19 = shufflevector <4 x bfloat> %18, <4 x bfloat> %18, <32 x i32> + %20 = shufflevector <32 x bfloat> %19, <32 x bfloat> %17, <32 x i32> + %21 = extractvalue [8 x <4 x bfloat>] %0, 6 + %22 = shufflevector <4 x bfloat> %21, <4 x bfloat> %21, <32 x i32> + %23 = shufflevector <32 x bfloat> %22, <32 x bfloat> %20, <32 x i32> + %24 = extractvalue [8 x <4 x bfloat>] %0, 7 + %25 = shufflevector <4 x bfloat> %24, <4 x bfloat> %24, <32 x i32> + %26 = shufflevector <32 x bfloat> %25, <32 x bfloat> %23, <32 x i32> + %27 = call reassoc bfloat @llvm.vector.reduce.fadd.v32bf16(bfloat %1, <32 x bfloat> %26) + ret bfloat %27 +} + +; CHECK-LABEL: multi_reduction_2d_8x8_bf16: +; CHECK: vadd.f dm0, dm0, dm1 +define bfloat @multi_reduction_2d_8x8_bf16([8 x <8 x bfloat>] %0, bfloat %1) { + %3 = extractvalue [8 x <8 x bfloat>] %0, 0 + %4 = shufflevector <8 x bfloat> %3, <8 x bfloat> %3, <64 x i32> + %5 = shufflevector <64 x bfloat> %4, <64 x bfloat> poison, <64 x i32> + %6 = extractvalue [8 x <8 x bfloat>] %0, 1 + %7 = shufflevector <8 x bfloat> %6, <8 x bfloat> %6, <64 x i32> + %8 = shufflevector <64 x bfloat> %7, <64 x bfloat> %5, <64 x i32> + %9 = extractvalue [8 x <8 x bfloat>] %0, 2 + %10 = shufflevector <8 x bfloat> %9, <8 x bfloat> %9, <64 x i32> + %11 = shufflevector <64 x bfloat> %10, <64 x bfloat> %8, <64 x i32> + %12 = extractvalue [8 x <8 x bfloat>] %0, 3 + %13 = shufflevector <8 x bfloat> %12, <8 x bfloat> %12, <64 x i32> + %14 = shufflevector <64 x bfloat> %13, <64 x bfloat> %11, <64 x i32> + %15 = extractvalue [8 x <8 x bfloat>] %0, 4 + %16 = shufflevector <8 x bfloat> %15, <8 x bfloat> %15, <64 x i32> + %17 = shufflevector <64 x bfloat> %16, <64 x bfloat> %14, <64 x i32> + %18 = extractvalue [8 x <8 x bfloat>] %0, 5 + %19 = shufflevector <8 x bfloat> %18, <8 x bfloat> %18, <64 x i32> + %20 = shufflevector <64 x bfloat> %19, <64 x bfloat> %17, <64 x i32> + %21 = extractvalue [8 x <8 x bfloat>] %0, 6 + %22 = shufflevector <8 x bfloat> %21, <8 x bfloat> %21, <64 x i32> + %23 = shufflevector <64 x bfloat> %22, <64 x bfloat> %20, <64 x i32> + %24 = extractvalue [8 x <8 x bfloat>] %0, 7 + %25 = shufflevector <8 x bfloat> %24, <8 x bfloat> %24, <64 x i32> + %26 = shufflevector <64 x bfloat> %25, <64 x bfloat> %23, <64 x i32> + %27 = call reassoc bfloat @llvm.vector.reduce.fadd.v64bf16(bfloat %1, <64 x bfloat> %26) + ret bfloat %27 +} + +; CHECK-LABEL: multi_reduction_2d_4x4_f32: +; CHECK: vadd.f dm0, dm0, dm1 +define float @multi_reduction_2d_4x4_f32([4 x <4 x float>] %0, float %1) { + %3 = extractvalue [4 x <4 x float>] %0, 0 + %4 = shufflevector <4 x float> %3, <4 x float> %3, <16 x i32> + %5 = shufflevector <16 x float> %4, <16 x float> poison, <16 x i32> + %6 = extractvalue [4 x <4 x float>] %0, 1 + %7 = shufflevector <4 x 
float> %6, <4 x float> %6, <16 x i32> + %8 = shufflevector <16 x float> %7, <16 x float> %5, <16 x i32> + %9 = extractvalue [4 x <4 x float>] %0, 2 + %10 = shufflevector <4 x float> %9, <4 x float> %9, <16 x i32> + %11 = shufflevector <16 x float> %10, <16 x float> %8, <16 x i32> + %12 = extractvalue [4 x <4 x float>] %0, 3 + %13 = shufflevector <4 x float> %12, <4 x float> %12, <16 x i32> + %14 = shufflevector <16 x float> %13, <16 x float> %11, <16 x i32> + %15 = call reassoc float @llvm.vector.reduce.fadd.v16f32(float %1, <16 x float> %14) + ret float %15 +} + +; CHECK-LABEL: multi_reduction_2d_8x4_f32: +; CHECK: vadd.f dm0, dm0, dm1 +define float @multi_reduction_2d_8x4_f32([8 x <4 x float>] %0, float %1) { + %3 = extractvalue [8 x <4 x float>] %0, 0 + %4 = shufflevector <4 x float> %3, <4 x float> %3, <32 x i32> + %5 = shufflevector <32 x float> %4, <32 x float> poison, <32 x i32> + %6 = extractvalue [8 x <4 x float>] %0, 1 + %7 = shufflevector <4 x float> %6, <4 x float> %6, <32 x i32> + %8 = shufflevector <32 x float> %7, <32 x float> %5, <32 x i32> + %9 = extractvalue [8 x <4 x float>] %0, 2 + %10 = shufflevector <4 x float> %9, <4 x float> %9, <32 x i32> + %11 = shufflevector <32 x float> %10, <32 x float> %8, <32 x i32> + %12 = extractvalue [8 x <4 x float>] %0, 3 + %13 = shufflevector <4 x float> %12, <4 x float> %12, <32 x i32> + %14 = shufflevector <32 x float> %13, <32 x float> %11, <32 x i32> + %15 = extractvalue [8 x <4 x float>] %0, 4 + %16 = shufflevector <4 x float> %15, <4 x float> %15, <32 x i32> + %17 = shufflevector <32 x float> %16, <32 x float> %14, <32 x i32> + %18 = extractvalue [8 x <4 x float>] %0, 5 + %19 = shufflevector <4 x float> %18, <4 x float> %18, <32 x i32> + %20 = shufflevector <32 x float> %19, <32 x float> %17, <32 x i32> + %21 = extractvalue [8 x <4 x float>] %0, 6 + %22 = shufflevector <4 x float> %21, <4 x float> %21, <32 x i32> + %23 = shufflevector <32 x float> %22, <32 x float> %20, <32 x i32> + %24 = extractvalue [8 x <4 x float>] %0, 7 + %25 = shufflevector <4 x float> %24, <4 x float> %24, <32 x i32> + %26 = shufflevector <32 x float> %25, <32 x float> %23, <32 x i32> + %27 = call reassoc float @llvm.vector.reduce.fadd.v32f32(float %1, <32 x float> %26) + ret float %27 +} + +; CHECK-LABEL: multi_reduction_2d_8x8_f32: +; CHECK: vadd.f dm0, dm0, dm1 +define float @multi_reduction_2d_8x8_f32([8 x <8 x float>] %0, float %1) { + %3 = extractvalue [8 x <8 x float>] %0, 0 + %4 = shufflevector <8 x float> %3, <8 x float> %3, <64 x i32> + %5 = shufflevector <64 x float> %4, <64 x float> poison, <64 x i32> + %6 = extractvalue [8 x <8 x float>] %0, 1 + %7 = shufflevector <8 x float> %6, <8 x float> %6, <64 x i32> + %8 = shufflevector <64 x float> %7, <64 x float> %5, <64 x i32> + %9 = extractvalue [8 x <8 x float>] %0, 2 + %10 = shufflevector <8 x float> %9, <8 x float> %9, <64 x i32> + %11 = shufflevector <64 x float> %10, <64 x float> %8, <64 x i32> + %12 = extractvalue [8 x <8 x float>] %0, 3 + %13 = shufflevector <8 x float> %12, <8 x float> %12, <64 x i32> + %14 = shufflevector <64 x float> %13, <64 x float> %11, <64 x i32> + %15 = extractvalue [8 x <8 x float>] %0, 4 + %16 = shufflevector <8 x float> %15, <8 x float> %15, <64 x i32> + %17 = shufflevector <64 x float> %16, <64 x float> %14, <64 x i32> + %18 = extractvalue [8 x <8 x float>] %0, 5 + %19 = shufflevector <8 x float> %18, <8 x float> %18, <64 x i32> + %20 = shufflevector <64 x float> %19, <64 x float> %17, <64 x i32> + %21 = extractvalue [8 x <8 x float>] %0, 6 + %22 = shufflevector 
<8 x float> %21, <8 x float> %21, <64 x i32> + %23 = shufflevector <64 x float> %22, <64 x float> %20, <64 x i32> + %24 = extractvalue [8 x <8 x float>] %0, 7 + %25 = shufflevector <8 x float> %24, <8 x float> %24, <64 x i32> + %26 = shufflevector <64 x float> %25, <64 x float> %23, <64 x i32> + %27 = call reassoc float @llvm.vector.reduce.fadd.v64f32(float %1, <64 x float> %26) + ret float %27 +} + +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>) #0 + +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>) #0 + +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare i32 @llvm.vector.reduce.add.v64i32(<64 x i32>) #0 + +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare bfloat @llvm.vector.reduce.fadd.v16bf16(bfloat, <16 x bfloat>) #0 + +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare bfloat @llvm.vector.reduce.fadd.v32bf16(bfloat, <32 x bfloat>) #0 + +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare bfloat @llvm.vector.reduce.fadd.v64bf16(bfloat, <64 x bfloat>) #0 + +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare float @llvm.vector.reduce.fadd.v16f32(float, <16 x float>) #0 + +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare float @llvm.vector.reduce.fadd.v32f32(float, <32 x float>) #0 + +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare float @llvm.vector.reduce.fadd.v64f32(float, <64 x float>) #0 + +attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } + +!llvm.module.flags = !{!0} + +!0 = !{i32 2, !"Debug Info Version", i32 3}