Skip to content
Open
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 6 additions & 3 deletions clang/include/clang/Basic/BuiltinsX86.td
Original file line number Diff line number Diff line change
Expand Up @@ -444,15 +444,18 @@ let Features = "avx512f,gfni", Attributes = [NoThrow, Const, RequiredVectorWidth
def vgf2p8mulb_v64qi : X86Builtin<"_Vector<64, char>(_Vector<64, char>, _Vector<64, char>)">;
}

let Features = "pclmul", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
let Features = "pclmul",
Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in {
def pclmulqdq128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>, _Constant char)">;
}

let Features = "vpclmulqdq", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
let Features = "vpclmulqdq",
Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in {
def pclmulqdq256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>, _Constant char)">;
}

let Features = "avx512f,vpclmulqdq", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
let Features = "avx512f,vpclmulqdq",
Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in {
def pclmulqdq512 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Vector<8, long long int>, _Constant char)">;
}

Expand Down
71 changes: 71 additions & 0 deletions clang/lib/AST/ByteCode/InterpBuiltin.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2745,6 +2745,72 @@ static bool interp__builtin_ia32_addsub(InterpState &S, CodePtr OpPC,
return true;
}

static bool interp__builtin_ia32_pclmulqdq(InterpState &S, CodePtr OpPC,
const CallExpr *Call) {
// PCLMULQDQ: carry-less multiplication of selected 64-bit halves
// imm8 bit 0: selects lower (0) or upper (1) 64 bits of first operand
// imm8 bit 4: selects lower (0) or upper (1) 64 bits of second operand
assert(Call->getArg(0)->getType()->isVectorType() &&
Call->getArg(1)->getType()->isVectorType());

// Extract imm8 argument
APSInt Imm8 = popToAPSInt(S, Call->getArg(2));
bool SelectUpperA = (Imm8.getZExtValue() & 0x01) != 0;
bool SelectUpperB = (Imm8.getZExtValue() & 0x10) != 0;

const Pointer &RHS = S.Stk.pop<Pointer>();
const Pointer &LHS = S.Stk.pop<Pointer>();
const Pointer &Dst = S.Stk.peek<Pointer>();

const auto *VT = Call->getArg(0)->getType()->castAs<VectorType>();
PrimType ElemT = *S.getContext().classify(VT->getElementType());
unsigned NumElems = VT->getNumElements();
const auto *DestVT = Call->getType()->castAs<VectorType>();
PrimType DestElemT = *S.getContext().classify(DestVT->getElementType());
bool DestUnsigned = Call->getType()->isUnsignedIntegerOrEnumerationType();

// Process each 128-bit lane (2 elements at a time)
for (unsigned Lane = 0; Lane < NumElems; Lane += 2) {
APSInt A0, A1, B0, B1;
INT_TYPE_SWITCH_NO_BOOL(ElemT, {
A0 = LHS.elem<T>(Lane + 0).toAPSInt();
A1 = LHS.elem<T>(Lane + 1).toAPSInt();
B0 = RHS.elem<T>(Lane + 0).toAPSInt();
B1 = RHS.elem<T>(Lane + 1).toAPSInt();
});

// Select the appropriate 64-bit values based on imm8
APSInt A = SelectUpperA ? A1 : A0;
APSInt B = SelectUpperB ? B1 : B0;

// Perform carry-less multiplication (polynomial multiplication in GF(2^64))
// This multiplies two 64-bit values to produce a 128-bit result
APInt AVal = A.extOrTrunc(64);
APInt BVal = B.extOrTrunc(64);
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe make A + B both APInt and still use zextOrTrunc - I never trust APSInt signedness....

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Great idea , updated it

APInt Result(128, 0);

// For each bit in A, if set, XOR B shifted left by that bit position
for (unsigned i = 0; i < 64; ++i) {
if (AVal[i]) {
APInt ShiftedB = BVal.zext(128) << i;
Result ^= ShiftedB;
}
}

// Split the 128-bit result into two 64-bit halves
APSInt ResultLow(Result.extractBits(64, 0), DestUnsigned);
APSInt ResultHigh(Result.extractBits(64, 64), DestUnsigned);

INT_TYPE_SWITCH_NO_BOOL(DestElemT, {
Dst.elem<T>(Lane + 0) = static_cast<T>(ResultLow);
Dst.elem<T>(Lane + 1) = static_cast<T>(ResultHigh);
});
}

Dst.initializeAllElements();
return true;
}

static bool interp__builtin_elementwise_triop_fp(
InterpState &S, CodePtr OpPC, const CallExpr *Call,
llvm::function_ref<APFloat(const APFloat &, const APFloat &,
Expand Down Expand Up @@ -4366,6 +4432,11 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
return llvm::APIntOps::muluExtended(LoLHS, LoRHS);
});

case clang::X86::BI__builtin_ia32_pclmulqdq128:
case clang::X86::BI__builtin_ia32_pclmulqdq256:
case clang::X86::BI__builtin_ia32_pclmulqdq512:
return interp__builtin_ia32_pclmulqdq(S, OpPC, Call);

case Builtin::BI__builtin_elementwise_fma:
return interp__builtin_elementwise_triop_fp(
S, OpPC, Call,
Expand Down
62 changes: 62 additions & 0 deletions clang/lib/AST/ExprConstant.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13483,6 +13483,68 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
}
return Success(APValue(ResultElements.data(), ResultElements.size()), E);
}
case clang::X86::BI__builtin_ia32_pclmulqdq128:
case clang::X86::BI__builtin_ia32_pclmulqdq256:
case clang::X86::BI__builtin_ia32_pclmulqdq512: {
// PCLMULQDQ: carry-less multiplication of selected 64-bit halves
// imm8 bit 0: selects lower (0) or upper (1) 64 bits of first operand
// imm8 bit 4: selects lower (0) or upper (1) 64 bits of second operand
APValue SourceLHS, SourceRHS;
if (!EvaluateAsRValue(Info, E->getArg(0), SourceLHS) ||
!EvaluateAsRValue(Info, E->getArg(1), SourceRHS))
return false;

APSInt Imm8;
if (!EvaluateInteger(E->getArg(2), Imm8, Info))
return false;

// Extract bits 0 and 4 from imm8
bool SelectUpperA = (Imm8.getZExtValue() & 0x01) != 0;
bool SelectUpperB = (Imm8.getZExtValue() & 0x10) != 0;

unsigned NumElems = SourceLHS.getVectorLength();
SmallVector<APValue, 8> ResultElements;
ResultElements.reserve(NumElems);
QualType DestEltTy = E->getType()->castAs<VectorType>()->getElementType();
bool DestUnsigned = DestEltTy->isUnsignedIntegerOrEnumerationType();

// Process each 128-bit lane
for (unsigned Lane = 0; Lane < NumElems; Lane += 2) {
// Get the two 64-bit halves of the first operand
APSInt A0 = SourceLHS.getVectorElt(Lane + 0).getInt();
APSInt A1 = SourceLHS.getVectorElt(Lane + 1).getInt();
// Get the two 64-bit halves of the second operand
APSInt B0 = SourceRHS.getVectorElt(Lane + 0).getInt();
APSInt B1 = SourceRHS.getVectorElt(Lane + 1).getInt();

// Select the appropriate 64-bit values based on imm8
APSInt A = SelectUpperA ? A1 : A0;
APSInt B = SelectUpperB ? B1 : B0;

// Perform carry-less multiplication (polynomial multiplication in
// GF(2^64)) This multiplies two 64-bit values to produce a 128-bit result
APInt AVal = A.extOrTrunc(64);
APInt BVal = B.extOrTrunc(64);
APInt Result(128, 0);

// For each bit in A, if set, XOR B shifted left by that bit position
for (unsigned i = 0; i < 64; ++i) {
if (AVal[i]) {
APInt ShiftedB = BVal.zext(128) << i;
Result ^= ShiftedB;
}
}

// Split the 128-bit result into two 64-bit halves
APSInt ResultLow(Result.extractBits(64, 0), DestUnsigned);
APSInt ResultHigh(Result.extractBits(64, 64), DestUnsigned);

ResultElements.push_back(APValue(ResultLow));
ResultElements.push_back(APValue(ResultHigh));
}

return Success(APValue(ResultElements.data(), ResultElements.size()), E);
}
case Builtin::BI__builtin_elementwise_fshl:
case Builtin::BI__builtin_elementwise_fshr: {
APValue SourceHi, SourceLo, SourceShift;
Expand Down
18 changes: 17 additions & 1 deletion clang/test/CodeGen/X86/pclmul-builtins.c
Original file line number Diff line number Diff line change
@@ -1,9 +1,25 @@
// RUN: %clang_cc1 -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +pclmul -emit-llvm -o - | FileCheck %s

// RUN: %clang_cc1 -x c++ -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +pclmul -emit-llvm -o - -std=c++11 | FileCheck %s
// RUN: %clang_cc1 -x c++ -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +pclmul -emit-llvm -o - -std=c++11 -fexperimental-new-constant-interpreter | FileCheck %s

#include <wmmintrin.h>
#include "builtin_test_helpers.h"

__m128i test_mm_clmulepi64_si128(__m128i a, __m128i b) {
// CHECK: @llvm.x86.pclmulqdq
return _mm_clmulepi64_si128(a, b, 0);
}

// Test constexpr evaluation for _mm_clmulepi64_si128
// imm8=0x00: lower 64 bits of both operands
// Test case: 0x1 * 0x3 = 0x3 (carry-less multiplication)
TEST_CONSTEXPR(match_m128i(_mm_clmulepi64_si128((__m128i){0x1ULL, 0x0ULL}, (__m128i){0x3ULL, 0x0ULL}, 0x00), 0x3ULL, 0x0ULL));

// imm8=0x01: upper 64 bits of first operand, lower 64 bits of second
TEST_CONSTEXPR(match_m128i(_mm_clmulepi64_si128((__m128i){0x0ULL, 0x1ULL}, (__m128i){0x3ULL, 0x0ULL}, 0x01), 0x3ULL, 0x0ULL));

// imm8=0x10: lower 64 bits of first operand, upper 64 bits of second
TEST_CONSTEXPR(match_m128i(_mm_clmulepi64_si128((__m128i){0x1ULL, 0x0ULL}, (__m128i){0x0ULL, 0x3ULL}, 0x10), 0x3ULL, 0x0ULL));

// imm8=0x11: upper 64 bits of both operands
TEST_CONSTEXPR(match_m128i(_mm_clmulepi64_si128((__m128i){0x0ULL, 0x1ULL}, (__m128i){0x0ULL, 0x3ULL}, 0x11), 0x3ULL, 0x0ULL));
13 changes: 13 additions & 0 deletions clang/test/CodeGen/X86/vpclmulqdq-builtins.c
Original file line number Diff line number Diff line change
@@ -1,17 +1,30 @@
// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +vpclmulqdq -emit-llvm -o - | FileCheck %s --check-prefix AVX
// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +vpclmulqdq -target-feature +avx512f -emit-llvm -o - | FileCheck %s --check-prefixes AVX,AVX512
// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +vpclmulqdq -emit-llvm -o - -std=c++11 | FileCheck %s --check-prefix AVX
// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +vpclmulqdq -target-feature +avx512f -emit-llvm -o - -std=c++11 | FileCheck %s --check-prefixes AVX,AVX512
// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +vpclmulqdq -emit-llvm -o - -std=c++11 -fexperimental-new-constant-interpreter | FileCheck %s --check-prefix AVX
// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +vpclmulqdq -target-feature +avx512f -emit-llvm -o - -std=c++11 -fexperimental-new-constant-interpreter | FileCheck %s --check-prefixes AVX,AVX512

#include <immintrin.h>
#include "builtin_test_helpers.h"

__m256i test_mm256_clmulepi64_epi128(__m256i A, __m256i B) {
// AVX: @llvm.x86.pclmulqdq.256
return _mm256_clmulepi64_epi128(A, B, 0);
}

// Test constexpr evaluation for _mm256_clmulepi64_epi128
// Each 128-bit lane is processed independently
TEST_CONSTEXPR(match_m256i(_mm256_clmulepi64_epi128((__m256i){0x1ULL, 0x0ULL, 0x2ULL, 0x0ULL}, (__m256i){0x3ULL, 0x0ULL, 0x5ULL, 0x0ULL}, 0x00), 0x3ULL, 0x0ULL, 0xaULL, 0x0ULL));

#ifdef __AVX512F__
__m512i test_mm512_clmulepi64_epi128(__m512i A, __m512i B) {
// AVX512: @llvm.x86.pclmulqdq.512
return _mm512_clmulepi64_epi128(A, B, 0);
}

// Test constexpr evaluation for _mm512_clmulepi64_epi128
// Each 128-bit lane is processed independently
TEST_CONSTEXPR(match_m512i(_mm512_clmulepi64_epi128((__m512i){0x1ULL, 0x0ULL, 0x2ULL, 0x0ULL, 0x4ULL, 0x0ULL, 0x8ULL, 0x0ULL}, (__m512i){0x3ULL, 0x0ULL, 0x5ULL, 0x0ULL, 0x7ULL, 0x0ULL, 0x9ULL, 0x0ULL}, 0x00), 0x3ULL, 0x0ULL, 0xaULL, 0x0ULL, 0x1cULL, 0x0ULL, 0x48ULL, 0x0ULL));
#endif

Loading