@@ -2720,34 +2720,55 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
27202720 // of elements.
27212721 //
27222722 // For example, suppose we have:
2723- // VectorA: <a1, a2, a3, a4, a5, a6>
2724- // VectorB: <b1, b2, b3, b4, b5, b6>
2725- // ReductionFactor: 3.
2723+ // VectorA: <a0, a1, a2, a3, a4, a5>
2724+ // VectorB: <b0, b1, b2, b3, b4, b5>
2725+ // ReductionFactor: 3
2726+ // Shards: 1
27262727 // The output would be:
2727- // <a1|a2|a3, a4|a5|a6, b1|b2|b3, b4|b5|b6>
2728+ // <a0|a1|a2, a3|a4|a5, b0|b1|b2, b3|b4|b5>
2729+ //
2730+ // If we have:
2731+ // VectorA: <a0, a1, a2, a3, a4, a5, a6, a7>
2732+ // VectorB: <b0, b1, b2, b3, b4, b5, b6, b7>
2733+ // ReductionFactor: 2
2734+ // Shards: 2
2735+ // then a and b each have 2 "shards", resulting in the output being
2736+ // interleaved:
2737+ // <a0|a1, a2|a3, b0|b1, b2|b3, a4|a5, a6|a7, b4|b5, b6|b7>
27282738 //
27292739 // This is convenient for instrumenting horizontal add/sub.
27302740 // For bitwise OR on "vertical" pairs, see maybeHandleSimpleNomemIntrinsic().
27312741 Value *horizontalReduce (IntrinsicInst &I, unsigned ReductionFactor,
2732- Value *VectorA, Value *VectorB) {
2742+ unsigned Shards, Value *VectorA, Value *VectorB) {
27332743 assert (isa<FixedVectorType>(VectorA->getType ()));
2734- unsigned TotalNumElems =
2744+ unsigned NumElems =
27352745 cast<FixedVectorType>(VectorA->getType ())->getNumElements ();
27362746
2747+ [[maybe_unused]] unsigned TotalNumElems = NumElems;
27372748 if (VectorB) {
27382749 assert (VectorA->getType () == VectorB->getType ());
2739- TotalNumElems = TotalNumElems * 2 ;
2750+ TotalNumElems *= 2 ;
27402751 }
27412752
2742- assert (TotalNumElems % ReductionFactor == 0 );
2753+ assert (NumElems % ( ReductionFactor * Shards) == 0 );
27432754
27442755 Value *Or = nullptr ;
27452756
27462757 IRBuilder<> IRB (&I);
27472758 for (unsigned i = 0 ; i < ReductionFactor; i++) {
27482759 SmallVector<int , 16 > Mask;
2749- for (unsigned X = 0 ; X < TotalNumElems; X += ReductionFactor)
2750- Mask.push_back (X + i);
2760+
2761+ for (unsigned j = 0 ; j < Shards; j++) {
2762+ unsigned Offset = NumElems / Shards * j;
2763+
2764+ for (unsigned X = 0 ; X < NumElems / Shards; X += ReductionFactor)
2765+ Mask.push_back (Offset + X + i);
2766+
2767+ if (VectorB) {
2768+ for (unsigned X = 0 ; X < NumElems / Shards; X += ReductionFactor)
2769+ Mask.push_back (NumElems + Offset + X + i);
2770+ }
2771+ }
27512772
27522773 Value *Masked;
27532774 if (VectorB)
@@ -2769,7 +2790,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
27692790 // /
27702791 // / e.g., <2 x i32> @llvm.aarch64.neon.saddlp.v2i32.v4i16(<4 x i16>)
27712792 // / <16 x i8> @llvm.aarch64.neon.addp.v16i8(<16 x i8>, <16 x i8>)
2772- void handlePairwiseShadowOrIntrinsic (IntrinsicInst &I) {
2793+ void handlePairwiseShadowOrIntrinsic (IntrinsicInst &I, unsigned Shards ) {
27732794 assert (I.arg_size () == 1 || I.arg_size () == 2 );
27742795
27752796 assert (I.getType ()->isVectorTy ());
@@ -2792,8 +2813,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
27922813 if (I.arg_size () == 2 )
27932814 SecondArgShadow = getShadow (&I, 1 );
27942815
2795- Value *OrShadow = horizontalReduce (I, /* ReductionFactor=*/ 2 , FirstArgShadow ,
2796- SecondArgShadow);
2816+ Value *OrShadow = horizontalReduce (I, /* ReductionFactor=*/ 2 , Shards ,
2817+ FirstArgShadow, SecondArgShadow);
27972818
27982819 OrShadow = CreateShadowCast (IRB, OrShadow, getShadowTy (&I));
27992820
@@ -2808,7 +2829,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
28082829 // / conceptually operates on
28092830 // / (<4 x i16> [[VAR1]], <4 x i16> [[VAR2]])
28102831 // / and can be handled with ReinterpretElemWidth == 16.
2811- void handlePairwiseShadowOrIntrinsic (IntrinsicInst &I,
2832+ void handlePairwiseShadowOrIntrinsic (IntrinsicInst &I, unsigned Shards,
28122833 int ReinterpretElemWidth) {
28132834 assert (I.arg_size () == 1 || I.arg_size () == 2 );
28142835
@@ -2852,8 +2873,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
28522873 SecondArgShadow = IRB.CreateBitCast (SecondArgShadow, ReinterpretShadowTy);
28532874 }
28542875
2855- Value *OrShadow = horizontalReduce (I, /* ReductionFactor=*/ 2 , FirstArgShadow ,
2856- SecondArgShadow);
2876+ Value *OrShadow = horizontalReduce (I, /* ReductionFactor=*/ 2 , Shards ,
2877+ FirstArgShadow, SecondArgShadow);
28572878
28582879 OrShadow = CreateShadowCast (IRB, OrShadow, getShadowTy (&I));
28592880
@@ -6036,48 +6057,66 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
60366057 // Packed Horizontal Add/Subtract
60376058 case Intrinsic::x86_ssse3_phadd_w:
60386059 case Intrinsic::x86_ssse3_phadd_w_128:
6039- case Intrinsic::x86_avx2_phadd_w:
60406060 case Intrinsic::x86_ssse3_phsub_w:
60416061 case Intrinsic::x86_ssse3_phsub_w_128:
6042- case Intrinsic::x86_avx2_phsub_w: {
6043- handlePairwiseShadowOrIntrinsic (I, /* ReinterpretElemWidth=*/ 16 );
6062+ handlePairwiseShadowOrIntrinsic (I, /* Shards=*/ 1 ,
6063+ /* ReinterpretElemWidth=*/ 16 );
6064+ break ;
6065+
6066+ case Intrinsic::x86_avx2_phadd_w:
6067+ case Intrinsic::x86_avx2_phsub_w:
6068+ // TODO: Shards = 2
6069+ handlePairwiseShadowOrIntrinsic (I, /* Shards=*/ 1 ,
6070+ /* ReinterpretElemWidth=*/ 16 );
60446071 break ;
6045- }
60466072
60476073 // Packed Horizontal Add/Subtract
60486074 case Intrinsic::x86_ssse3_phadd_d:
60496075 case Intrinsic::x86_ssse3_phadd_d_128:
6050- case Intrinsic::x86_avx2_phadd_d:
60516076 case Intrinsic::x86_ssse3_phsub_d:
60526077 case Intrinsic::x86_ssse3_phsub_d_128:
6053- case Intrinsic::x86_avx2_phsub_d: {
6054- handlePairwiseShadowOrIntrinsic (I, /* ReinterpretElemWidth=*/ 32 );
6078+ handlePairwiseShadowOrIntrinsic (I, /* Shards=*/ 1 ,
6079+ /* ReinterpretElemWidth=*/ 32 );
6080+ break ;
6081+
6082+ case Intrinsic::x86_avx2_phadd_d:
6083+ case Intrinsic::x86_avx2_phsub_d:
6084+ // TODO: Shards = 2
6085+ handlePairwiseShadowOrIntrinsic (I, /* Shards=*/ 1 ,
6086+ /* ReinterpretElemWidth=*/ 32 );
60556087 break ;
6056- }
60576088
60586089 // Packed Horizontal Add/Subtract and Saturate
60596090 case Intrinsic::x86_ssse3_phadd_sw:
60606091 case Intrinsic::x86_ssse3_phadd_sw_128:
6061- case Intrinsic::x86_avx2_phadd_sw:
60626092 case Intrinsic::x86_ssse3_phsub_sw:
60636093 case Intrinsic::x86_ssse3_phsub_sw_128:
6064- case Intrinsic::x86_avx2_phsub_sw: {
6065- handlePairwiseShadowOrIntrinsic (I, /* ReinterpretElemWidth=*/ 16 );
6094+ handlePairwiseShadowOrIntrinsic (I, /* Shards=*/ 1 ,
6095+ /* ReinterpretElemWidth=*/ 16 );
6096+ break ;
6097+
6098+ case Intrinsic::x86_avx2_phadd_sw:
6099+ case Intrinsic::x86_avx2_phsub_sw:
6100+ // TODO: Shards = 2
6101+ handlePairwiseShadowOrIntrinsic (I, /* Shards=*/ 1 ,
6102+ /* ReinterpretElemWidth=*/ 16 );
60666103 break ;
6067- }
60686104
60696105 // Packed Single/Double Precision Floating-Point Horizontal Add
60706106 case Intrinsic::x86_sse3_hadd_ps:
60716107 case Intrinsic::x86_sse3_hadd_pd:
6072- case Intrinsic::x86_avx_hadd_pd_256:
6073- case Intrinsic::x86_avx_hadd_ps_256:
60746108 case Intrinsic::x86_sse3_hsub_ps:
60756109 case Intrinsic::x86_sse3_hsub_pd:
6110+ handlePairwiseShadowOrIntrinsic (I, /* Shards=*/ 1 );
6111+ break ;
6112+
6113+ case Intrinsic::x86_avx_hadd_pd_256:
6114+ case Intrinsic::x86_avx_hadd_ps_256:
60766115 case Intrinsic::x86_avx_hsub_pd_256:
6077- case Intrinsic::x86_avx_hsub_ps_256: {
6078- handlePairwiseShadowOrIntrinsic (I);
6116+ case Intrinsic::x86_avx_hsub_ps_256:
6117+ // TODO: Shards = 2
6118+ handlePairwiseShadowOrIntrinsic (I, /* Shards=*/ 1 );
60796119 break ;
6080- }
60816120
60826121 case Intrinsic::x86_avx_maskstore_ps:
60836122 case Intrinsic::x86_avx_maskstore_pd:
@@ -6460,7 +6499,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
64606499 // Add Long Pairwise
64616500 case Intrinsic::aarch64_neon_saddlp:
64626501 case Intrinsic::aarch64_neon_uaddlp: {
6463- handlePairwiseShadowOrIntrinsic (I);
6502+ handlePairwiseShadowOrIntrinsic (I, /* Shards= */ 1 );
64646503 break ;
64656504 }
64666505
0 commit comments