@@ -68,6 +68,35 @@ static mlir::Value emitVectorFCmp(CIRGenBuilderTy &builder,
   return bitCast;
 }
 
+static cir::VecShuffleOp emitPshufW(CIRGenFunction &cgf,
+                                    CIRGenBuilderTy &builder,
+                                    llvm::SmallVector<mlir::Value> &ops,
+                                    const CallExpr *expr, const bool isLow) {
+  uint32_t imm = cgf.getZExtIntValueFromConstOp(ops[1]);
+
+  auto vecTy = cast<cir::VectorType>(ops[0].getType());
+  unsigned numElts = vecTy.getSize();
+
+  unsigned firstHalfStart = isLow ? 0 : 4;
+  unsigned secondHalfStart = 4 - firstHalfStart;
+
+  // Splat the 8-bits of immediate 4 times to help the loop wrap around.
+  imm = (imm & 0xff) * 0x01010101;
+
+  int64_t indices[32];
+  for (unsigned l = 0; l != numElts; l += 8) {
+    for (unsigned i = firstHalfStart; i != firstHalfStart + 4; ++i) {
+      indices[l + i] = l + (imm & 3) + firstHalfStart;
+      imm /= 4;
+    }
+    for (unsigned i = secondHalfStart; i != secondHalfStart + 4; ++i)
+      indices[l + i] = l + i;
+  }
+
+  return builder.createVecShuffle(cgf.getLoc(expr->getExprLoc()), ops[0],
+                                  ArrayRef(indices, numElts));
+}
+
 mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID,
                                                const CallExpr *expr) {
   if (builtinID == Builtin::BI__builtin_cpu_is) {
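To make the index math above concrete: emitPshufW splats the 8-bit immediate across a 32-bit value so the inner loop can peel off two bits per element and wrap cleanly at each 128-bit lane. The following standalone sketch (not part of the patch; the helper name and driver values are invented for illustration) reproduces the mask it would build for an 8 x i16 vector and the immediate 0x1B, which reverses the low four elements:

// Standalone illustration only, assuming a hypothetical 8 x i16 vector and
// immediate 0x1B (_mm_shufflelo_epi16's "reverse the low half" encoding).
#include <cstdint>
#include <cstdio>

static void computePshufIndices(uint32_t imm, unsigned numElts, bool isLow,
                                int64_t *indices) {
  unsigned firstHalfStart = isLow ? 0 : 4;
  unsigned secondHalfStart = 4 - firstHalfStart;
  // Splat the low 8 bits so the divisions below wrap across 128-bit lanes.
  imm = (imm & 0xff) * 0x01010101;
  for (unsigned l = 0; l != numElts; l += 8) {
    for (unsigned i = firstHalfStart; i != firstHalfStart + 4; ++i) {
      indices[l + i] = l + (imm & 3) + firstHalfStart; // 2 bits per element.
      imm /= 4;
    }
    for (unsigned i = secondHalfStart; i != secondHalfStart + 4; ++i)
      indices[l + i] = l + i; // The other half passes through unchanged.
  }
}

int main() {
  int64_t indices[8];
  computePshufIndices(/*imm=*/0x1B, /*numElts=*/8, /*isLow=*/true, indices);
  for (unsigned i = 0; i != 8; ++i)
    printf("%lld ", (long long)indices[i]); // Prints: 3 2 1 0 4 5 6 7
  printf("\n");
  return 0;
}
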
@@ -163,9 +192,7 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID,
   case X86::BI__builtin_ia32_vec_ext_v4di: {
     unsigned numElts = cast<cir::VectorType>(ops[0].getType()).getSize();
 
-    uint64_t index =
-        ops[1].getDefiningOp<cir::ConstantOp>().getIntValue().getZExtValue();
-
+    uint64_t index = getZExtIntValueFromConstOp(ops[1]);
     index &= numElts - 1;
 
     cir::ConstantOp indexVal =
@@ -497,6 +524,10 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID,
   case X86::BI__builtin_ia32_extracti64x2_256_mask:
   case X86::BI__builtin_ia32_extractf64x2_512_mask:
   case X86::BI__builtin_ia32_extracti64x2_512_mask:
+    cgm.errorNYI(expr->getSourceRange(),
+                 std::string("unimplemented X86 builtin call: ") +
+                     getContext().BuiltinInfo.getName(builtinID));
+    return {};
   case X86::BI__builtin_ia32_vinsertf128_pd256:
   case X86::BI__builtin_ia32_vinsertf128_ps256:
   case X86::BI__builtin_ia32_vinsertf128_si256:
@@ -512,23 +543,69 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID,
   case X86::BI__builtin_ia32_insertf64x2_256:
   case X86::BI__builtin_ia32_inserti64x2_256:
   case X86::BI__builtin_ia32_insertf64x2_512:
-  case X86::BI__builtin_ia32_inserti64x2_512:
+  case X86::BI__builtin_ia32_inserti64x2_512: {
+    unsigned dstNumElts = cast<cir::VectorType>(ops[0].getType()).getSize();
+    unsigned srcNumElts = cast<cir::VectorType>(ops[1].getType()).getSize();
+    unsigned subVectors = dstNumElts / srcNumElts;
+    assert(llvm::isPowerOf2_32(subVectors) && "Expected power of 2 subvectors");
+
+    uint64_t index = getZExtIntValueFromConstOp(ops[2]);
+    index &= subVectors - 1; // Remove any extra bits.
+    index *= srcNumElts;
+
+    int64_t indices[16];
+    for (unsigned i = 0; i != dstNumElts; ++i)
+      indices[i] = (i >= srcNumElts) ? srcNumElts + (i % srcNumElts) : i;
+
+    mlir::Value op1 = builder.createVecShuffle(
+        getLoc(expr->getExprLoc()), ops[1], ArrayRef(indices, dstNumElts));
+
+    for (unsigned i = 0; i != dstNumElts; ++i) {
+      if (i >= index && i < (index + srcNumElts))
+        indices[i] = (i - index) + dstNumElts;
+      else
+        indices[i] = i;
+    }
+
+    return builder.createVecShuffle(getLoc(expr->getExprLoc()), ops[0], op1,
+                                    ArrayRef(indices, dstNumElts));
+  }
   case X86::BI__builtin_ia32_pmovqd512_mask:
   case X86::BI__builtin_ia32_pmovwb512_mask:
+    cgm.errorNYI(expr->getSourceRange(),
+                 std::string("unimplemented X86 builtin call: ") +
+                     getContext().BuiltinInfo.getName(builtinID));
+    return {};
   case X86::BI__builtin_ia32_pblendw128:
   case X86::BI__builtin_ia32_blendpd:
   case X86::BI__builtin_ia32_blendps:
   case X86::BI__builtin_ia32_blendpd256:
   case X86::BI__builtin_ia32_blendps256:
   case X86::BI__builtin_ia32_pblendw256:
   case X86::BI__builtin_ia32_pblendd128:
-  case X86::BI__builtin_ia32_pblendd256:
+  case X86::BI__builtin_ia32_pblendd256: {
+    uint32_t imm = getZExtIntValueFromConstOp(ops[2]);
+    unsigned numElts = cast<cir::VectorType>(ops[0].getType()).getSize();
+
+    int64_t indices[16];
+    // If there are more than 8 elements, the immediate is used twice so make
+    // sure we handle that.
+    for (unsigned i = 0; i != numElts; ++i)
+      indices[i] = ((imm >> (i % 8)) & 0x1) ? numElts + i : i;
+
+    return builder.createVecShuffle(getLoc(expr->getExprLoc()), ops[0], ops[1],
+                                    ArrayRef(indices, numElts));
+  }
   case X86::BI__builtin_ia32_pshuflw:
   case X86::BI__builtin_ia32_pshuflw256:
-  case X86::BI__builtin_ia32_pshuflw512:
+  case X86::BI__builtin_ia32_pshuflw512: {
+    return emitPshufW(*this, builder, ops, expr, true);
+  }
   case X86::BI__builtin_ia32_pshufhw:
   case X86::BI__builtin_ia32_pshufhw256:
-  case X86::BI__builtin_ia32_pshufhw512:
+  case X86::BI__builtin_ia32_pshufhw512: {
+    return emitPshufW(*this, builder, ops, expr, false);
+  }
   case X86::BI__builtin_ia32_pshufd:
   case X86::BI__builtin_ia32_pshufd256:
   case X86::BI__builtin_ia32_pshufd512:
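For the vinsert* cases above, the lowering is two shuffles: the first widens the source subvector to the destination width, and the second blends it into the destination at the position selected by the immediate. A minimal standalone sketch (not part of the patch), assuming an 8-element destination, a 4-element source, and immediate 1, shows the two masks that result:

// Standalone illustration only; the concrete element counts and immediate
// are assumptions chosen to show the shape of the two masks.
#include <cstdint>
#include <cstdio>

int main() {
  unsigned dstNumElts = 8, srcNumElts = 4;
  unsigned subVectors = dstNumElts / srcNumElts; // 2 insertable positions.
  uint64_t index = 1;
  index &= subVectors - 1; // Drop any extra immediate bits.
  index *= srcNumElts;     // First destination element to overwrite: 4.

  // Mask 1 widens the source to destination width; positions past the source
  // length select poison elements that the second shuffle never reads.
  int64_t widen[8];
  for (unsigned i = 0; i != dstNumElts; ++i)
    widen[i] = (i >= srcNumElts) ? srcNumElts + (i % srcNumElts) : i;

  // Mask 2 blends the widened source (elements dstNumElts..) into the
  // destination (elements 0..dstNumElts-1) starting at `index`.
  int64_t blend[8];
  for (unsigned i = 0; i != dstNumElts; ++i) {
    if (i >= index && i < index + srcNumElts)
      blend[i] = (i - index) + dstNumElts; // Take from the widened source.
    else
      blend[i] = i;                        // Keep the destination element.
  }

  printf("widen: ");
  for (unsigned i = 0; i != dstNumElts; ++i)
    printf("%lld ", (long long)widen[i]); // Prints: 0 1 2 3 4 5 6 7
  printf("\nblend: ");
  for (unsigned i = 0; i != dstNumElts; ++i)
    printf("%lld ", (long long)blend[i]); // Prints: 0 1 2 3 8 9 10 11
  printf("\n");
  return 0;
}
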
@@ -537,20 +614,106 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID,
   case X86::BI__builtin_ia32_vpermilpd256:
   case X86::BI__builtin_ia32_vpermilps256:
   case X86::BI__builtin_ia32_vpermilpd512:
-  case X86::BI__builtin_ia32_vpermilps512:
+  case X86::BI__builtin_ia32_vpermilps512: {
+    // TODO: Add tests for this branch.
+    uint32_t imm = getSExtIntValueFromConstOp(ops[1]);
+
+    auto vecTy = cast<cir::VectorType>(ops[0].getType());
+    unsigned numElts = vecTy.getSize();
+    auto eltTy = vecTy.getElementType();
+
+    unsigned eltBitWidth = getTypeSizeInBits(eltTy).getFixedValue();
+    unsigned numLaneElts = 128 / eltBitWidth;
+
+    // Splat the 8-bits of immediate 4 times to help the loop wrap around.
+    imm = (imm & 0xff) * 0x01010101;
+
+    llvm::SmallVector<int64_t, 16> indices;
+    for (unsigned l = 0; l != numElts; l += numLaneElts) {
+      for (unsigned i = 0; i != numLaneElts; ++i) {
+        indices.push_back((imm % numLaneElts) + l);
+        imm /= numLaneElts;
+      }
+    }
+
+    return builder.createVecShuffle(getLoc(expr->getExprLoc()), ops[0],
+                                    indices);
+  }
   case X86::BI__builtin_ia32_shufpd:
   case X86::BI__builtin_ia32_shufpd256:
   case X86::BI__builtin_ia32_shufpd512:
   case X86::BI__builtin_ia32_shufps:
   case X86::BI__builtin_ia32_shufps256:
-  case X86::BI__builtin_ia32_shufps512:
+  case X86::BI__builtin_ia32_shufps512: {
+    uint32_t imm = getZExtIntValueFromConstOp(ops[2]);
+
+    auto vecTy = cast<cir::VectorType>(ops[0].getType());
+    unsigned numElts = vecTy.getSize();
+    unsigned numLanes = cgm.getDataLayout().getTypeSizeInBits(vecTy) / 128;
+    unsigned numLaneElts = numElts / numLanes;
+
+    // Splat the 8-bits of immediate 4 times to help the loop wrap around.
+    imm = (imm & 0xff) * 0x01010101;
+
+    int64_t indices[16];
+    for (unsigned l = 0; l != numElts; l += numLaneElts) {
+      for (unsigned i = 0; i != numLaneElts; ++i) {
+        uint32_t idx = imm % numLaneElts;
+        imm /= numLaneElts;
+        if (i >= (numLaneElts / 2))
+          idx += numElts;
+        indices[l + i] = l + idx;
+      }
+    }
+
+    return builder.createVecShuffle(getLoc(expr->getExprLoc()), ops[0], ops[1],
+                                    ArrayRef(indices, numElts));
+  }
   case X86::BI__builtin_ia32_permdi256:
   case X86::BI__builtin_ia32_permdf256:
   case X86::BI__builtin_ia32_permdi512:
   case X86::BI__builtin_ia32_permdf512:
+    cgm.errorNYI(expr->getSourceRange(),
+                 std::string("unimplemented X86 builtin call: ") +
+                     getContext().BuiltinInfo.getName(builtinID));
+    return {};
   case X86::BI__builtin_ia32_palignr128:
   case X86::BI__builtin_ia32_palignr256:
-  case X86::BI__builtin_ia32_palignr512:
+  case X86::BI__builtin_ia32_palignr512: {
+    uint32_t shiftVal = getZExtIntValueFromConstOp(ops[2]) & 0xff;
+
+    unsigned numElts = cast<cir::VectorType>(ops[0].getType()).getSize();
+    assert(numElts % 16 == 0);
+
+    // If palignr is shifting the pair of vectors more than the size of two
+    // lanes, emit zero.
+    if (shiftVal >= 32)
+      return builder.getNullValue(convertType(expr->getType()),
+                                  getLoc(expr->getExprLoc()));
+
+    // If palignr is shifting the pair of input vectors more than one lane,
+    // but less than two lanes, convert to shifting in zeroes.
+    if (shiftVal > 16) {
+      shiftVal -= 16;
+      ops[1] = ops[0];
+      ops[0] =
+          builder.getNullValue(ops[0].getType(), getLoc(expr->getExprLoc()));
+    }
+
+    int64_t indices[64];
+    // 256-bit palignr operates on 128-bit lanes so we need to handle that
+    for (unsigned l = 0; l != numElts; l += 16) {
+      for (unsigned i = 0; i != 16; ++i) {
+        uint32_t idx = shiftVal + i;
+        if (idx >= 16)
+          idx += numElts - 16; // End of lane, switch operand.
+        indices[l + i] = l + idx;
+      }
+    }
+
+    return builder.createVecShuffle(getLoc(expr->getExprLoc()), ops[1], ops[0],
+                                    ArrayRef(indices, numElts));
+  }
   case X86::BI__builtin_ia32_alignd128:
   case X86::BI__builtin_ia32_alignd256:
   case X86::BI__builtin_ia32_alignd512:
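The palignr cases treat each pair of 128-bit lanes as one concatenated value and select a sliding byte window starting at the immediate; an index that runs past the lane boundary is redirected to the other operand, matching the ops[1], ops[0] order passed to createVecShuffle. A standalone sketch (not part of the patch) of the mask for one 16-byte lane with an assumed shift of 4 follows; larger shifts are reduced or zeroed earlier in the case body:

// Standalone illustration only; numElts and shiftVal are assumed values.
#include <cstdint>
#include <cstdio>

int main() {
  unsigned numElts = 16; // One 128-bit lane of bytes.
  uint32_t shiftVal = 4; // Assumed immediate, already known to be <= 16.
  int64_t indices[64];
  for (unsigned l = 0; l != numElts; l += 16) {
    for (unsigned i = 0; i != 16; ++i) {
      uint32_t idx = shiftVal + i;
      if (idx >= 16)
        idx += numElts - 16; // Past the lane boundary: read from ops[0].
      indices[l + i] = l + idx;
    }
  }
  for (unsigned i = 0; i != numElts; ++i)
    printf("%lld ", (long long)indices[i]);
  // Prints: 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
  printf("\n");
  return 0;
}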