Skip to content

Commit 4bc2341

Browse files
author
whyuuwang
committed
merge dispersed operations
1 parent b25aa5e commit 4bc2341

File tree

2 files changed

+58
-118
lines changed

2 files changed

+58
-118
lines changed

clang/include/clang/Basic/BuiltinsX86.td

Lines changed: 1 addition & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -124,7 +124,6 @@ let Attributes = [Const, NoThrow, RequiredVectorWidth<128>] in {
124124
}
125125

126126
let Features = "ssse3" in {
127-
def pmaddubsw128 : X86Builtin<"_Vector<8, short>(_Vector<16, char>, _Vector<16, char>)">;
128127
def pmulhrsw128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>)">;
129128
def pshufb128 : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>)">;
130129
def psignb128 : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>)">;
@@ -594,14 +593,7 @@ let Features = "avx", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWid
594593
let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
595594
def mpsadbw256 : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>, _Constant char)">;
596595
def palignr256 : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>, _Constant int)">;
597-
def pmaddubsw256 : X86Builtin<"_Vector<16, short>(_Vector<32, char>, _Vector<32, char>)">;
598-
def pmaddwd256 : X86Builtin<"_Vector<8, int>(_Vector<16, short>, _Vector<16, short>)">;
599-
def phaddw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">;
600-
def phaddd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>)">;
601-
def phaddsw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">;
602-
def phsubw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">;
603-
def phsubd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>)">;
604-
def phsubsw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">;
596+
605597
def pmovmskb256 : X86Builtin<"int(_Vector<32, char>)">;
606598
def pmulhrsw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">;
607599
def psadbw256 : X86Builtin<"_Vector<4, long long int>(_Vector<32, char>, _Vector<32, char>)">;

clang/lib/AST/ExprConstant.cpp

Lines changed: 57 additions & 109 deletions
Original file line numberDiff line numberDiff line change
@@ -12420,9 +12420,7 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
1242012420
}
1242112421
case clang::X86::BI__builtin_ia32_phaddsw128:
1242212422
case clang::X86::BI__builtin_ia32_phaddsw256: {
12423-
APSInt Res(LHSA.isSigned() ? LHSA.sadd_sat(LHSB)
12424-
: LHSA.uadd_sat(LHSB),
12425-
DestUnsigned);
12423+
APSInt Res(LHSA.sadd_sat(LHSB));
1242612424
ResultElements.push_back(APValue(Res));
1242712425
break;
1242812426
}
@@ -12436,9 +12434,7 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
1243612434
}
1243712435
case clang::X86::BI__builtin_ia32_phsubsw128:
1243812436
case clang::X86::BI__builtin_ia32_phsubsw256: {
12439-
APSInt Res(LHSA.isSigned() ? LHSA.ssub_sat(LHSB)
12440-
: LHSA.usub_sat(LHSB),
12441-
DestUnsigned);
12437+
APSInt Res(LHSA.ssub_sat(LHSB));
1244212438
ResultElements.push_back(APValue(Res));
1244312439
break;
1244412440
}
@@ -12458,9 +12454,7 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
1245812454
}
1245912455
case clang::X86::BI__builtin_ia32_phaddsw128:
1246012456
case clang::X86::BI__builtin_ia32_phaddsw256: {
12461-
APSInt Res(RHSA.isSigned() ? RHSA.sadd_sat(RHSB)
12462-
: RHSA.uadd_sat(RHSB),
12463-
DestUnsigned);
12457+
APSInt Res(RHSA.sadd_sat(RHSB));
1246412458
ResultElements.push_back(APValue(Res));
1246512459
break;
1246612460
}
@@ -12474,9 +12468,7 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
1247412468
}
1247512469
case clang::X86::BI__builtin_ia32_phsubsw128:
1247612470
case clang::X86::BI__builtin_ia32_phsubsw256: {
12477-
APSInt Res(RHSA.isSigned() ? RHSA.ssub_sat(RHSB)
12478-
: RHSA.usub_sat(RHSB),
12479-
DestUnsigned);
12471+
APSInt Res(RHSA.ssub_sat(RHSB));
1248012472
ResultElements.push_back(APValue(Res));
1248112473
break;
1248212474
}
@@ -12486,110 +12478,66 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
1248612478
return Success(APValue(ResultElements.data(), ResultElements.size()), E);
1248712479
}
1248812480
case clang::X86::BI__builtin_ia32_haddpd:
12489-
case clang::X86::BI__builtin_ia32_haddps: {
12490-
APValue SourceLHS, SourceRHS;
12491-
if (!EvaluateAsRValue(Info, E->getArg(0), SourceLHS) ||
12492-
!EvaluateAsRValue(Info, E->getArg(1), SourceRHS))
12493-
return false;
12494-
unsigned SourceLen = SourceLHS.getVectorLength();
12495-
SmallVector<APValue, 4> ResultElements;
12496-
ResultElements.reserve(SourceLen);
12497-
llvm::RoundingMode RM = getActiveRoundingMode(getEvalInfo(), E);
12498-
for (unsigned EltNum = 0; EltNum < SourceLen; EltNum += 2) {
12499-
APFloat LHSA = SourceLHS.getVectorElt(EltNum).getFloat();
12500-
APFloat LHSB = SourceLHS.getVectorElt(EltNum + 1).getFloat();
12501-
LHSA.add(LHSB, RM);
12502-
ResultElements.push_back(APValue(LHSA));
12503-
}
12504-
for (unsigned EltNum = 0; EltNum < SourceLen; EltNum += 2) {
12505-
APFloat RHSA = SourceRHS.getVectorElt(EltNum).getFloat();
12506-
APFloat RHSB = SourceRHS.getVectorElt(EltNum + 1).getFloat();
12507-
RHSA.add(RHSB, RM);
12508-
ResultElements.push_back(APValue(RHSA));
12509-
}
12510-
return Success(APValue(ResultElements.data(), ResultElements.size()), E);
12511-
}
12512-
case clang::X86::BI__builtin_ia32_hsubpd:
12513-
case clang::X86::BI__builtin_ia32_hsubps: {
12514-
APValue SourceLHS, SourceRHS;
12515-
if (!EvaluateAsRValue(Info, E->getArg(0), SourceLHS) ||
12516-
!EvaluateAsRValue(Info, E->getArg(1), SourceRHS))
12517-
return false;
12518-
unsigned SourceLen = SourceLHS.getVectorLength();
12519-
SmallVector<APValue, 4> ResultElements;
12520-
ResultElements.reserve(SourceLen);
12521-
llvm::RoundingMode RM = getActiveRoundingMode(getEvalInfo(), E);
12522-
for (unsigned EltNum = 0; EltNum < SourceLen; EltNum += 2) {
12523-
APFloat LHSA = SourceLHS.getVectorElt(EltNum).getFloat();
12524-
APFloat LHSB = SourceLHS.getVectorElt(EltNum + 1).getFloat();
12525-
LHSA.subtract(LHSB, RM);
12526-
ResultElements.push_back(APValue(LHSA));
12527-
}
12528-
for (unsigned EltNum = 0; EltNum < SourceLen; EltNum += 2) {
12529-
APFloat RHSA = SourceRHS.getVectorElt(EltNum).getFloat();
12530-
APFloat RHSB = SourceRHS.getVectorElt(EltNum + 1).getFloat();
12531-
RHSA.subtract(RHSB, RM);
12532-
ResultElements.push_back(APValue(RHSA));
12533-
}
12534-
return Success(APValue(ResultElements.data(), ResultElements.size()), E);
12535-
}
12481+
case clang::X86::BI__builtin_ia32_haddps:
12482+
case clang::X86::BI__builtin_ia32_haddps256:
1253612483
case clang::X86::BI__builtin_ia32_haddpd256:
12484+
case clang::X86::BI__builtin_ia32_hsubpd:
12485+
case clang::X86::BI__builtin_ia32_hsubps:
12486+
case clang::X86::BI__builtin_ia32_hsubps256:
1253712487
case clang::X86::BI__builtin_ia32_hsubpd256: {
1253812488
APValue SourceLHS, SourceRHS;
1253912489
if (!EvaluateAsRValue(Info, E->getArg(0), SourceLHS) ||
1254012490
!EvaluateAsRValue(Info, E->getArg(1), SourceRHS))
1254112491
return false;
12542-
SmallVector<APValue, 4> ResultElements(4);
12543-
llvm::RoundingMode RM = getActiveRoundingMode(getEvalInfo(), E);
12544-
for (unsigned i = 0; i < 2; ++i) {
12545-
APFloat A = SourceLHS.getVectorElt(2 * i).getFloat();
12546-
APFloat B = SourceLHS.getVectorElt(2 * i + 1).getFloat();
12547-
if (E->getBuiltinCallee() == clang::X86::BI__builtin_ia32_haddpd256)
12548-
A.add(B, RM);
12549-
else
12550-
A.subtract(B, RM);
12551-
ResultElements[2 * i] = APValue(A);
12552-
}
12553-
for (unsigned i = 0; i < 2; ++i) {
12554-
APFloat A = SourceRHS.getVectorElt(2 * i).getFloat();
12555-
APFloat B = SourceRHS.getVectorElt(2 * i + 1).getFloat();
12556-
if (E->getBuiltinCallee() == clang::X86::BI__builtin_ia32_haddpd256)
12557-
A.add(B, RM);
12558-
else
12559-
A.subtract(B, RM);
12560-
ResultElements[2 * i + 1] = APValue(A);
12561-
}
12562-
return Success(APValue(ResultElements.data(), ResultElements.size()), E);
12563-
}
12564-
case clang::X86::BI__builtin_ia32_haddps256:
12565-
case clang::X86::BI__builtin_ia32_hsubps256: {
12566-
APValue SourceLHS, SourceRHS;
12567-
if (!EvaluateAsRValue(Info, E->getArg(0), SourceLHS) ||
12568-
!EvaluateAsRValue(Info, E->getArg(1), SourceRHS))
12569-
return false;
12570-
SmallVector<APValue, 4> ResultElements(8);
12492+
unsigned NumElts = SourceLHS.getVectorLength();
12493+
SmallVector<APValue, 4> ResultElements;
12494+
ResultElements.reserve(NumElts);
1257112495
llvm::RoundingMode RM = getActiveRoundingMode(getEvalInfo(), E);
12572-
for (unsigned i = 0; i < 4; ++i) {
12573-
unsigned SrcIdx = 2 * i;
12574-
unsigned DestIdx = (i < 2) ? i : (i + 2);
12575-
APFloat A = SourceLHS.getVectorElt(SrcIdx).getFloat();
12576-
APFloat B = SourceLHS.getVectorElt(SrcIdx + 1).getFloat();
12577-
if (E->getBuiltinCallee() == clang::X86::BI__builtin_ia32_haddps256)
12578-
A.add(B, RM);
12579-
else
12580-
A.subtract(B, RM);
12581-
ResultElements[DestIdx] = APValue(A);
12582-
}
12583-
for (unsigned i = 0; i < 4; ++i) {
12584-
unsigned SrcIdx = 2 * i;
12585-
unsigned DestIdx = (i < 2) ? (i + 2) : (i + 4);
12586-
APFloat A = SourceRHS.getVectorElt(SrcIdx).getFloat();
12587-
APFloat B = SourceRHS.getVectorElt(SrcIdx + 1).getFloat();
12588-
if (E->getBuiltinCallee() == clang::X86::BI__builtin_ia32_haddps256)
12589-
A.add(B, RM);
12590-
else
12591-
A.subtract(B, RM);
12592-
ResultElements[DestIdx] = APValue(A);
12496+
QualType DestEltTy = E->getType()->castAs<VectorType>()->getElementType();
12497+
unsigned EltBits = Info.Ctx.getTypeSize(DestEltTy);
12498+
unsigned NumLanes = NumElts * EltBits / 128;
12499+
unsigned NumElemsPerLane = NumElts / NumLanes;
12500+
unsigned HalfElemsPerLane = NumElemsPerLane / 2;
12501+
12502+
for (unsigned L = 0; L != NumElts; L += NumElemsPerLane) {
12503+
for (unsigned I = 0; I != HalfElemsPerLane; ++I) {
12504+
APFloat LHSA = SourceLHS.getVectorElt(L + (2 * I) + 0).getFloat();
12505+
APFloat LHSB = SourceLHS.getVectorElt(L + (2 * I) + 1).getFloat();
12506+
switch (E->getBuiltinCallee()) {
12507+
case clang::X86::BI__builtin_ia32_haddpd:
12508+
case clang::X86::BI__builtin_ia32_haddps:
12509+
case clang::X86::BI__builtin_ia32_haddps256:
12510+
case clang::X86::BI__builtin_ia32_haddpd256:
12511+
LHSA.add(LHSB, RM);
12512+
break;
12513+
case clang::X86::BI__builtin_ia32_hsubpd:
12514+
case clang::X86::BI__builtin_ia32_hsubps:
12515+
case clang::X86::BI__builtin_ia32_hsubps256:
12516+
case clang::X86::BI__builtin_ia32_hsubpd256:
12517+
LHSA.subtract(LHSB, RM);
12518+
break;
12519+
}
12520+
ResultElements.push_back(APValue(LHSA));
12521+
}
12522+
for (unsigned I = 0; I != HalfElemsPerLane; ++I) {
12523+
APFloat RHSA = SourceRHS.getVectorElt(L + (2 * I) + 0).getFloat();
12524+
APFloat RHSB = SourceRHS.getVectorElt(L + (2 * I) + 1).getFloat();
12525+
switch (E->getBuiltinCallee()) {
12526+
case clang::X86::BI__builtin_ia32_haddpd:
12527+
case clang::X86::BI__builtin_ia32_haddps:
12528+
case clang::X86::BI__builtin_ia32_haddps256:
12529+
case clang::X86::BI__builtin_ia32_haddpd256:
12530+
RHSA.add(RHSB, RM);
12531+
break;
12532+
case clang::X86::BI__builtin_ia32_hsubpd:
12533+
case clang::X86::BI__builtin_ia32_hsubps:
12534+
case clang::X86::BI__builtin_ia32_hsubps256:
12535+
case clang::X86::BI__builtin_ia32_hsubpd256:
12536+
RHSA.subtract(RHSB, RM);
12537+
break;
12538+
}
12539+
ResultElements.push_back(APValue(RHSA));
12540+
}
1259312541
}
1259412542
return Success(APValue(ResultElements.data(), ResultElements.size()), E);
1259512543
}

0 commit comments

Comments
 (0)