Skip to content
Merged
8 changes: 6 additions & 2 deletions clang/include/clang/Basic/BuiltinsX86.td
Original file line number Diff line number Diff line change
Expand Up @@ -577,11 +577,15 @@ let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] i
def psadbw256
: X86Builtin<
"_Vector<4, long long int>(_Vector<32, char>, _Vector<32, char>)">;
def permdf256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Constant int)">;
def permti256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>, _Constant int)">;
def permdi256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Constant int)">;
}

let Features = "avx2",
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

don't create yet another block - we have existing avx2 blocks with the same feature/attribute set

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks updated in latest commit

Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in {
def permdf256
: X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Constant int)">;
def permdi256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Constant int)">;
}

let Features = "avx2", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in {
def pmovmskb256 : X86Builtin<"int(_Vector<32, char>)">;
Expand Down
10 changes: 10 additions & 0 deletions clang/lib/AST/ByteCode/InterpBuiltin.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4944,6 +4944,16 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
return std::make_pair(0, static_cast<int>(LaneOffset + Index));
});

case X86::BI__builtin_ia32_permdf256:
case X86::BI__builtin_ia32_permdi256:
return interp__builtin_ia32_shuffle_generic(
S, OpPC, Call, [](unsigned DstIdx, unsigned Control) {
// permute4x64 operates on 4 64-bit elements
// For element i (0-3), extract bits [2*i+1:2*i] from Control
unsigned Index = (Control >> (2 * DstIdx)) & 0x3;
return std::make_pair(0, static_cast<int>(Index));
});

case X86::BI__builtin_ia32_vpmultishiftqb128:
case X86::BI__builtin_ia32_vpmultishiftqb256:
case X86::BI__builtin_ia32_vpmultishiftqb512:
Expand Down
13 changes: 13 additions & 0 deletions clang/lib/AST/ExprConstant.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13122,6 +13122,19 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
return Success(R, E);
}

case X86::BI__builtin_ia32_permdf256:
case X86::BI__builtin_ia32_permdi256: {
APValue R;
if (!evalShuffleGeneric(Info, E, R, [](unsigned DstIdx, unsigned Control) {
// permute4x64 operates on 4 64-bit elements
// For element i (0-3), extract bits [2*i+1:2*i] from Control
unsigned Index = (Control >> (2 * DstIdx)) & 0x3;
return std::make_pair(0, static_cast<int>(Index));
}))
return false;
return Success(R, E);
}

case X86::BI__builtin_ia32_vpermilvarps:
case X86::BI__builtin_ia32_vpermilvarps256:
case X86::BI__builtin_ia32_vpermilvarps512: {
Expand Down
4 changes: 2 additions & 2 deletions clang/lib/Headers/avx2intrin.h
Copy link
Contributor Author

@ahmednoursphinx ahmednoursphinx Dec 3, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just formatting will revert

Original file line number Diff line number Diff line change
Expand Up @@ -3238,7 +3238,7 @@ _mm256_permutevar8x32_epi32(__m256i __a, __m256i __b) {
/// \a M[1:0] specifies the index in \a a for element 0 of the result,
/// \a M[3:2] specifies the index for element 1, and so forth.
/// \returns A 256-bit vector of [4 x double] containing the result.
#define _mm256_permute4x64_pd(V, M) \
#define _mm256_permute4x64_pd(V, M) \
((__m256d)__builtin_ia32_permdf256((__v4df)(__m256d)(V), (int)(M)))

/// Sets the result's 256-bit vector of [8 x float] to copies of elements of
Expand Down Expand Up @@ -3295,7 +3295,7 @@ _mm256_permutevar8x32_ps(__m256 __a, __m256i __b) {
/// \a M[1:0] specifies the index in \a a for element 0 of the result,
/// \a M[3:2] specifies the index for element 1, and so forth.
/// \returns A 256-bit vector of [4 x i64] containing the result.
#define _mm256_permute4x64_epi64(V, M) \
#define _mm256_permute4x64_epi64(V, M) \
((__m256i)__builtin_ia32_permdi256((__v4di)(__m256i)(V), (int)(M)))

/// Sets each half of the 256-bit result either to zero or to one of the
Expand Down
22 changes: 22 additions & 0 deletions clang/test/CodeGen/X86/avx2-builtins.c
Original file line number Diff line number Diff line change
Expand Up @@ -1111,12 +1111,34 @@ __m256i test_mm256_permute4x64_epi64(__m256i a) {
// CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> poison, <4 x i32> <i32 3, i32 0, i32 2, i32 0>
return _mm256_permute4x64_epi64(a, 35);
}
// Control value 0x00: [0,0,0,0] -> broadcast element 0
TEST_CONSTEXPR(match_v4di(_mm256_permute4x64_epi64(_mm256_set_epi64x(40LL, 30LL, 20LL, 10LL), 0x00), 10LL, 10LL, 10LL, 10LL));
// Control value 0x1B: [0,1,2,3] -> reverse order [3,2,1,0] = [D,C,B,A]
TEST_CONSTEXPR(match_v4di(_mm256_permute4x64_epi64(_mm256_set_epi64x(40LL, 30LL, 20LL, 10LL), 0x1B), 40LL, 30LL, 20LL, 10LL));
// Control value 0x39: [1,2,3,0] -> rotate left [B,C,D,A]
TEST_CONSTEXPR(match_v4di(_mm256_permute4x64_epi64(_mm256_set_epi64x(40LL, 30LL, 20LL, 10LL), 0x39), 20LL, 30LL, 40LL, 10LL));
// Control value 0x12: [2,0,1,0] -> [C,A,B,A]
TEST_CONSTEXPR(match_v4di(_mm256_permute4x64_epi64(_mm256_set_epi64x(40LL, 30LL, 20LL, 10LL), 0x12), 30LL, 10LL, 20LL, 10LL));
// Control value 0xE4: [3,2,1,0] -> identity [A,B,C,D]
TEST_CONSTEXPR(match_v4di(_mm256_permute4x64_epi64(_mm256_set_epi64x(40LL, 30LL, 20LL, 10LL), 0xE4), 10LL, 20LL, 30LL, 40LL));
// Test with negative values
TEST_CONSTEXPR(match_v4di(_mm256_permute4x64_epi64(_mm256_set_epi64x(-40LL, -30LL, -20LL, -10LL), 0x1B), -40LL, -30LL, -20LL, -10LL));
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Don't use _mm256_set_epi64x - use brace initialisation


__m256d test_mm256_permute4x64_pd(__m256d a) {
// CHECK-LABEL: test_mm256_permute4x64_pd
// CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> poison, <4 x i32> <i32 1, i32 2, i32 1, i32 0>
return _mm256_permute4x64_pd(a, 25);
}
// Control value 0x00: [0,0,0,0] -> broadcast element 0
TEST_CONSTEXPR(match_m256d(_mm256_permute4x64_pd(_mm256_set_pd(4.0, 3.0, 2.0, 1.0), 0x00), 1.0, 1.0, 1.0, 1.0));
// Control value 0x1B: [0,1,2,3] -> reverse order [3,2,1,0] = [D,C,B,A]
TEST_CONSTEXPR(match_m256d(_mm256_permute4x64_pd(_mm256_set_pd(4.0, 3.0, 2.0, 1.0), 0x1B), 4.0, 3.0, 2.0, 1.0));
// Control value 0x39: [1,2,3,0] -> rotate left [B,C,D,A]
TEST_CONSTEXPR(match_m256d(_mm256_permute4x64_pd(_mm256_set_pd(4.0, 3.0, 2.0, 1.0), 0x39), 2.0, 3.0, 4.0, 1.0));
// Control value 0x12: [2,0,1,0] -> [C,A,B,A]
TEST_CONSTEXPR(match_m256d(_mm256_permute4x64_pd(_mm256_set_pd(4.0, 3.0, 2.0, 1.0), 0x12), 3.0, 1.0, 2.0, 1.0));
// Control value 0xE4: [3,2,1,0] -> identity [A,B,C,D]
TEST_CONSTEXPR(match_m256d(_mm256_permute4x64_pd(_mm256_set_pd(4.0, 3.0, 2.0, 1.0), 0xE4), 1.0, 2.0, 3.0, 4.0));
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Don't use _mm256_set_pd - use brace initialisation


__m256i test_mm256_permutevar8x32_epi32(__m256i a, __m256i b) {
// CHECK-LABEL: test_mm256_permutevar8x32_epi32
Expand Down
Loading