Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 11 additions & 8 deletions clang/include/clang/Basic/BuiltinsX86.td
Original file line number Diff line number Diff line change
Expand Up @@ -123,13 +123,16 @@ let Attributes = [Const, NoThrow, RequiredVectorWidth<128>] in {
def Op#d128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">;
}

def pmaddubsw128 : X86Builtin<"_Vector<8, short>(_Vector<16, char>, _Vector<16, char>)">;
def pmulhrsw128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>)">;
def pshufb128 : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>)">;
def psignb128 : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>)">;
def psignw128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>)">;
def psignd128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">;
}

let Features = "ssse3", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in {
def pmaddubsw128 : X86Builtin<"_Vector<8, short>(_Vector<16, char>, _Vector<16, char>)">;
}
}

// AVX
Expand Down Expand Up @@ -278,13 +281,14 @@ let Features = "sse2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] i
def psllw128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>)">;
def pslld128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">;
def psllq128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>)">;
def pmaddwd128 : X86Builtin<"_Vector<4, int>(_Vector<8, short>, _Vector<8, short>)">;
def pslldqi128_byteshift : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Constant int)">;
def psrldqi128_byteshift : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Constant int)">;
}

let Features = "sse2",
Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in {
def pmaddwd128 : X86Builtin<"_Vector<4, int>(_Vector<8, short>, _Vector<8, short>)">;

def pmuludq128 : X86Builtin<"_Vector<2, long long int>(_Vector<4, int>, _Vector<4, int>)">;

def psllwi128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, int)">;
Expand Down Expand Up @@ -581,8 +585,6 @@ let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] i
def phsubw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">;
def phsubd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>)">;
def phsubsw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">;
def pmaddubsw256 : X86Builtin<"_Vector<16, short>(_Vector<32, char>, _Vector<32, char>)">;
def pmaddwd256 : X86Builtin<"_Vector<8, int>(_Vector<16, short>, _Vector<16, short>)">;
def pmovmskb256 : X86Builtin<"int(_Vector<32, char>)">;
def pmulhrsw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">;
def psadbw256 : X86Builtin<"_Vector<4, long long int>(_Vector<32, char>, _Vector<32, char>)">;
Expand Down Expand Up @@ -619,6 +621,9 @@ let Features = "avx2", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWi

def pblendvb256 : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>, _Vector<32, char>)">;

def pmaddubsw256 : X86Builtin<"_Vector<16, short>(_Vector<32, char>, _Vector<32, char>)">;
def pmaddwd256 : X86Builtin<"_Vector<8, int>(_Vector<16, short>, _Vector<16, short>)">;

def pmuldq256 : X86Builtin<"_Vector<4, long long int>(_Vector<8, int>, _Vector<8, int>)">;
def pmuludq256 : X86Builtin<"_Vector<4, long long int>(_Vector<8, int>, _Vector<8, int>)">;

Expand Down Expand Up @@ -1378,10 +1383,6 @@ let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<512>
def subps512 : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, float>, _Constant int)">;
}

let Features = "avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
def pmaddubsw512 : X86Builtin<"_Vector<32, short>(_Vector<64, char>, _Vector<64, char>)">;
def pmaddwd512 : X86Builtin<"_Vector<16, int>(_Vector<32, short>, _Vector<32, short>)">;
}

let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
def addss_round_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>, unsigned char, _Constant int)">;
Expand Down Expand Up @@ -1999,6 +2000,8 @@ let Features = "avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<512
}

let Features = "avx512bw", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in {
def pmaddubsw512 : X86Builtin<"_Vector<32, short>(_Vector<64, char>, _Vector<64, char>)">;
def pmaddwd512 : X86Builtin<"_Vector<16, int>(_Vector<32, short>, _Vector<32, short>)">;
def psllv32hi : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Vector<32, short>)">;
def pshufhw512 : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Constant int)">;
def pshuflw512 : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Constant int)">;
Expand Down
50 changes: 50 additions & 0 deletions clang/lib/AST/ByteCode/InterpBuiltin.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2579,6 +2579,32 @@ static bool interp__builtin_elementwise_maxmin(InterpState &S, CodePtr OpPC,
return true;
}

static bool interp__builtin_ia32_pmadd(
InterpState &S, CodePtr OpPC, const CallExpr *Call,
llvm::function_ref<APInt(const APSInt &, const APSInt &)> Fn) {
assert(Call->getArg(0)->getType()->isVectorType() &&
Call->getArg(1)->getType()->isVectorType());
const Pointer &RHS = S.Stk.pop<Pointer>();
const Pointer &LHS = S.Stk.pop<Pointer>();
const Pointer &Dst = S.Stk.peek<Pointer>();

const auto *VT = Call->getArg(0)->getType()->castAs<VectorType>();
PrimType ElemT = *S.getContext().classify(VT->getElementType());
unsigned NumElems = VT->getNumElements();
bool DestUnsigned = Call->getType()->isUnsignedIntegerOrEnumerationType();

for (unsigned I = 0; I != NumElems; ++I) {
INT_TYPE_SWITCH_NO_BOOL(ElemT, {
APSInt Elem1 = LHS.elem<T>(I).toAPSInt();
APSInt Elem2 = RHS.elem<T>(I).toAPSInt();
Dst.elem<T>(I) = static_cast<T>(APSInt(Fn(Elem1, Elem2), DestUnsigned));
});
}

Dst.initializeAllElements();
return true;
}

static bool interp__builtin_ia32_pmul(InterpState &S, CodePtr OpPC,
const CallExpr *Call,
unsigned BuiltinID) {
Expand Down Expand Up @@ -3448,6 +3474,30 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
return interp__builtin_elementwise_int_binop(S, OpPC, Call,
llvm::APIntOps::avgCeilU);

case clang::X86::BI__builtin_ia32_pmaddubsw128:
case clang::X86::BI__builtin_ia32_pmaddubsw256:
case clang::X86::BI__builtin_ia32_pmaddubsw512:
return interp__builtin_ia32_pmadd(
S, OpPC, Call,
[](const APSInt &LoLHS, const APSInt &HiLHS, const APSInt &LoRHS,
const APSInt &HiRHS) {
unsigned BitWidth = 2 * LHS.getBitWidth();
return (LoLHS.zext(BitWidth) * LoRHS.sext(BitWidth))
.sadd_sat((HiLHS.zext(BitWidth) * HiRHS.sext(BitWidth)));
});

case clang::X86::BI__builtin_ia32_pmaddwd128:
case clang::X86::BI__builtin_ia32_pmaddwd256:
case clang::X86::BI__builtin_ia32_pmaddwd512:
return interp__builtin_ia32_pmadd(
S, OpPC, Call,
[](const APSInt &LoLHS, const APSInt &HiLHS, const APSInt &LoRHS,
const APSInt &HiRHS) {
unsigned BitWidth = 2 * LHS.getBitWidth();
return (LoLHS.sext(BitWidth) * LoRHS.sext(BitWidth)) +
(HiLHS.sext(BitWidth) * HiRHS.sext(BitWidth));
});

case clang::X86::BI__builtin_ia32_pmulhuw128:
case clang::X86::BI__builtin_ia32_pmulhuw256:
case clang::X86::BI__builtin_ia32_pmulhuw512:
Expand Down
47 changes: 47 additions & 0 deletions clang/lib/AST/ExprConstant.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11778,6 +11778,53 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
case clang::X86::BI__builtin_ia32_pavgw512:
return EvaluateBinOpExpr(llvm::APIntOps::avgCeilU);

case clang::X86::BI__builtin_ia32_pmaddubsw128:
case clang::X86::BI__builtin_ia32_pmaddubsw256:
case clang::X86::BI__builtin_ia32_pmaddubsw512:
case clang::X86::BI__builtin_ia32_pmaddwd128:
case clang::X86::BI__builtin_ia32_pmaddwd256:
case clang::X86::BI__builtin_ia32_pmaddwd512: {
APValue SourceLHS, SourceRHS;
if (!EvaluateAsRValue(Info, E->getArg(0), SourceLHS) ||
!EvaluateAsRValue(Info, E->getArg(1), SourceRHS))
return false;

unsigned SourceLen = SourceLHS.getVectorLength();
bool DestUnsigned = DestEltTy->isUnsignedIntegerOrEnumerationType();
SmallVector<APValue, 4> ResultElements;
ResultElements.reserve(SourceLen / 2);

for (unsigned EltNum = 0; EltNum < SourceLen; ++EltNum) {
const APSInt &LoLHS = SourceLHS.getVectorElt(EltNum).getInt();
const APSInt &LoRHS = SourceRHS.getVectorElt(EltNum).getInt();
const APSInt &HiLHS = SourceLHS.getVectorElt(EltNum).getInt();
const APSInt &HiRHS = SourceRHS.getVectorElt(EltNum).getInt();
unsigned BitWidth = 2 * LHS.getBitWidth();

switch (E->getBuiltinCallee()) {
case clang::X86::BI__builtin_ia32_pmaddubsw128:
case clang::X86::BI__builtin_ia32_pmaddubsw256:
case clang::X86::BI__builtin_ia32_pmaddubsw512:
ResultElements.push_back(APValue(
APSInt(
(LoLHS.zext(BitWidth) * LoRHS.sext(BitWidth))
.sadd_sat((HiLHS.zext(BitWidth) * HiRHS.sext(BitWidth)))),
DestUnsigned));
break;
case clang::X86::BI__builtin_ia32_pmaddwd128:
case clang::X86::BI__builtin_ia32_pmaddwd256:
case clang::X86::BI__builtin_ia32_pmaddwd512:
ResultElements.push_back(
APValue(APSInt((LoLHS.sext(BitWidth) * LoRHS.sext(BitWidth)) +
(HiLHS.sext(BitWidth) * HiRHS.sext(BitWidth))),
DestUnsigned));
break;
}
}

return Success(APValue(ResultElements.data(), ResultElements.size()), E);
}

case clang::X86::BI__builtin_ia32_pmulhuw128:
case clang::X86::BI__builtin_ia32_pmulhuw256:
case clang::X86::BI__builtin_ia32_pmulhuw512:
Expand Down
12 changes: 5 additions & 7 deletions clang/lib/Headers/avx2intrin.h
Original file line number Diff line number Diff line change
Expand Up @@ -1035,10 +1035,9 @@ _mm256_hsubs_epi16(__m256i __a, __m256i __b)
/// \param __b
/// A 256-bit vector containing one of the source operands.
/// \returns A 256-bit vector of [16 x i16] containing the result.
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maddubs_epi16(__m256i __a, __m256i __b)
{
return (__m256i)__builtin_ia32_pmaddubsw256((__v32qi)__a, (__v32qi)__b);
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
_mm256_maddubs_epi16(__m256i __a, __m256i __b) {
return (__m256i)__builtin_ia32_pmaddubsw256((__v32qi)__a, (__v32qi)__b);
}

/// Multiplies corresponding 16-bit elements of two 256-bit vectors of
Expand Down Expand Up @@ -1067,9 +1066,8 @@ _mm256_maddubs_epi16(__m256i __a, __m256i __b)
/// \param __b
/// A 256-bit vector of [16 x i16] containing one of the source operands.
/// \returns A 256-bit vector of [8 x i32] containing the result.
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_madd_epi16(__m256i __a, __m256i __b)
{
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
_mm256_madd_epi16(__m256i __a, __m256i __b) {
return (__m256i)__builtin_ia32_pmaddwd256((__v16hi)__a, (__v16hi)__b);
}

Expand Down
12 changes: 6 additions & 6 deletions clang/lib/Headers/avx512bwintrin.h
Original file line number Diff line number Diff line change
Expand Up @@ -1064,39 +1064,39 @@ _mm512_maskz_mulhi_epu16(__mmask32 __U, __m512i __A, __m512i __B) {
(__v32hi)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
_mm512_maddubs_epi16(__m512i __X, __m512i __Y) {
return (__m512i)__builtin_ia32_pmaddubsw512((__v64qi)__X, (__v64qi)__Y);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
_mm512_mask_maddubs_epi16(__m512i __W, __mmask32 __U, __m512i __X,
__m512i __Y) {
return (__m512i)__builtin_ia32_selectw_512((__mmask32) __U,
(__v32hi)_mm512_maddubs_epi16(__X, __Y),
(__v32hi)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
_mm512_maskz_maddubs_epi16(__mmask32 __U, __m512i __X, __m512i __Y) {
return (__m512i)__builtin_ia32_selectw_512((__mmask32) __U,
(__v32hi)_mm512_maddubs_epi16(__X, __Y),
(__v32hi)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
_mm512_madd_epi16(__m512i __A, __m512i __B) {
return (__m512i)__builtin_ia32_pmaddwd512((__v32hi)__A, (__v32hi)__B);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
_mm512_mask_madd_epi16(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) {
return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
(__v16si)_mm512_madd_epi16(__A, __B),
(__v16si)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
_mm512_maskz_madd_epi16(__mmask16 __U, __m512i __A, __m512i __B) {
return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
(__v16si)_mm512_madd_epi16(__A, __B),
Expand Down
16 changes: 8 additions & 8 deletions clang/lib/Headers/avx512vlbwintrin.h
Original file line number Diff line number Diff line change
Expand Up @@ -1295,57 +1295,57 @@ _mm256_maskz_permutex2var_epi16 (__mmask16 __U, __m256i __A, __m256i __I,
(__v16hi)_mm256_setzero_si256());
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
_mm_mask_maddubs_epi16(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) {
return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
(__v8hi)_mm_maddubs_epi16(__X, __Y),
(__v8hi)__W);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
_mm_maskz_maddubs_epi16(__mmask8 __U, __m128i __X, __m128i __Y) {
return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
(__v8hi)_mm_maddubs_epi16(__X, __Y),
(__v8hi)_mm_setzero_si128());
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
_mm256_mask_maddubs_epi16(__m256i __W, __mmask16 __U, __m256i __X,
__m256i __Y) {
return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
(__v16hi)_mm256_maddubs_epi16(__X, __Y),
(__v16hi)__W);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
_mm256_maskz_maddubs_epi16(__mmask16 __U, __m256i __X, __m256i __Y) {
return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
(__v16hi)_mm256_maddubs_epi16(__X, __Y),
(__v16hi)_mm256_setzero_si256());
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
_mm_mask_madd_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
(__v4si)_mm_madd_epi16(__A, __B),
(__v4si)__W);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
_mm_maskz_madd_epi16(__mmask8 __U, __m128i __A, __m128i __B) {
return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
(__v4si)_mm_madd_epi16(__A, __B),
(__v4si)_mm_setzero_si128());
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
_mm256_mask_madd_epi16(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
(__v8si)_mm256_madd_epi16(__A, __B),
(__v8si)__W);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
_mm256_maskz_madd_epi16(__mmask8 __U, __m256i __A, __m256i __B) {
return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
(__v8si)_mm256_madd_epi16(__A, __B),
Expand Down
4 changes: 2 additions & 2 deletions clang/lib/Headers/emmintrin.h
Original file line number Diff line number Diff line change
Expand Up @@ -2290,8 +2290,8 @@ _mm_avg_epu16(__m128i __a, __m128i __b) {
/// A 128-bit signed [8 x i16] vector.
/// \returns A 128-bit signed [4 x i32] vector containing the sums of products
/// of both parameters.
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_madd_epi16(__m128i __a,
__m128i __b) {
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_madd_epi16(__m128i __a, __m128i __b) {
return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)__a, (__v8hi)__b);
}

Expand Down
9 changes: 4 additions & 5 deletions clang/lib/Headers/mmintrin.h
Original file line number Diff line number Diff line change
Expand Up @@ -679,11 +679,10 @@ _mm_subs_pu16(__m64 __m1, __m64 __m2) {
/// A 64-bit integer vector of [4 x i16].
/// \returns A 64-bit integer vector of [2 x i32] containing the sums of
/// products of both parameters.
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_madd_pi16(__m64 __m1, __m64 __m2)
{
return __trunc64(__builtin_ia32_pmaddwd128((__v8hi)__anyext128(__m1),
(__v8hi)__anyext128(__m2)));
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
_mm_madd_pi16(__m64 __m1, __m64 __m2) {
return __trunc64(__builtin_ia32_pmaddwd128((__v8hi)__zext128(__m1),
(__v8hi)__zext128(__m2)));
}

/// Multiplies each 16-bit signed integer element of the first 64-bit
Expand Down
19 changes: 10 additions & 9 deletions clang/lib/Headers/tmmintrin.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,9 @@

#define __trunc64(x) \
(__m64) __builtin_shufflevector((__v2di)(x), __extension__(__v2di){}, 0)
#define __zext128(x) \
(__m128i) __builtin_shufflevector((__v2si)(x), __extension__(__v2si){}, 0, \
1, 2, 3)
#define __anyext128(x) \
(__m128i) __builtin_shufflevector((__v2si)(x), __extension__(__v2si){}, 0, \
1, -1, -1)
Expand Down Expand Up @@ -504,10 +507,9 @@ _mm_hsubs_pi16(__m64 __a, __m64 __b)
/// \a R5 := (\a __a10 * \a __b10) + (\a __a11 * \a __b11) \n
/// \a R6 := (\a __a12 * \a __b12) + (\a __a13 * \a __b13) \n
/// \a R7 := (\a __a14 * \a __b14) + (\a __a15 * \a __b15)
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_maddubs_epi16(__m128i __a, __m128i __b)
{
return (__m128i)__builtin_ia32_pmaddubsw128((__v16qi)__a, (__v16qi)__b);
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_maddubs_epi16(__m128i __a, __m128i __b) {
return (__m128i)__builtin_ia32_pmaddubsw128((__v16qi)__a, (__v16qi)__b);
}

/// Multiplies corresponding pairs of packed 8-bit unsigned integer
Expand All @@ -534,11 +536,10 @@ _mm_maddubs_epi16(__m128i __a, __m128i __b)
/// \a R1 := (\a __a2 * \a __b2) + (\a __a3 * \a __b3) \n
/// \a R2 := (\a __a4 * \a __b4) + (\a __a5 * \a __b5) \n
/// \a R3 := (\a __a6 * \a __b6) + (\a __a7 * \a __b7)
static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_maddubs_pi16(__m64 __a, __m64 __b)
{
return __trunc64(__builtin_ia32_pmaddubsw128((__v16qi)__anyext128(__a),
(__v16qi)__anyext128(__b)));
static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_maddubs_pi16(__m64 __a, __m64 __b) {
return __trunc64(__builtin_ia32_pmaddubsw128((__v16qi)__zext128(__a),
(__v16qi)__zext128(__b)));
}

/// Multiplies packed 16-bit signed integer values, truncates the 32-bit
Expand Down
Loading