Skip to content

Commit 7f2ae3e

Browse files
author
Guillaume Piolat
committed
no message
1 parent 8a9cef7 commit 7f2ae3e

File tree

1 file changed

+77
-25
lines changed

1 file changed

+77
-25
lines changed

source/inteli/avx2intrin.d

Lines changed: 77 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -2374,8 +2374,83 @@ unittest
23742374
assert(C.array == correctC);
23752375
}
23762376

2377-
// TODO __m128i _mm256_i64gather_epi32 (int const* base_addr, __m256i vindex, const int scale) pure @safe
2378-
// TODO __m128i _mm256_mask_i64gather_epi32 (__m128i src, int const* base_addr, __m256i vindex, __m128i mask, const int scale) pure @safe
2377+
/// Gather 32-bit integers from memory using 64-bit indices. 32-bit elements are loaded
2378+
/// from addresses starting at `base_addr` and offset by each 64-bit element in `vindex`
2379+
/// (each index is scaled by the factor in `scale`). Return gathered elements.
2380+
/// `scale` should be 1, 2, 4 or 8.
2381+
__m128i _mm256_i64gather_epi32(int scale)(const(int)* base_addr, __m256i vindex) @system
2382+
{
2383+
__m128i src;
2384+
return _mm256_mask_i64gather_epi32!scale(src, base_addr, vindex, _mm_set1_epi32(-1));
2385+
}
2386+
unittest
2387+
{
2388+
int[8] data = [0, 1, 2, 3,
2389+
4, 5, 6, 7];
2390+
__m256i vindex = _mm256_setr_epi64(-2, 4, 0, 2);
2391+
int4 A = cast(int4) _mm256_i64gather_epi32!2(&data[1], vindex);
2392+
int[4] correctA = [0, 3, 1, 2];
2393+
assert(A.array == correctA);
2394+
}
2395+
2396+
/// Gather 32-bit integers from memory using 64-bit indices. 32-bit elements are loaded
2397+
/// from addresses starting at `base_addr` and offset by each 64-bit element in `vindex`
2398+
/// (each index is scaled by the factor in `scale`). Gathered elements are merged using
2399+
/// `mask` (elements are copied from `src` when the highest bit is not set in the
2400+
/// corresponding element). `scale` should be 1, 2, 4 or 8.
2401+
__m128i _mm256_mask_i64gather_epi32(int scale)(__m128i src, const(int)* base_addr, __m256i vindex, __m128i mask) @system
2402+
{
2403+
static assert(isValidSIBScale(scale));
2404+
2405+
static if (GDC_with_AVX2)
2406+
{
2407+
return cast(__m128i) __builtin_ia32_gatherdiv4si256(cast(int4)src, base_addr, cast(long4)vindex, cast(int4)mask, scale);
2408+
}
2409+
else static if (LDC_with_AVX2)
2410+
{
2411+
return cast(__m128i) __builtin_ia32_gatherq_d256(cast(int4)src, base_addr, cast(long4)vindex, cast(int4)mask, cast(ubyte)scale);
2412+
}
2413+
else
2414+
{
2415+
__m128i r = src;
2416+
long4 vindexl = cast(long4)vindex;
2417+
int4 srci = cast(int4)src;
2418+
int4 maski = cast(int4)mask;
2419+
for (int n = 0; n < 4; ++n)
2420+
{
2421+
long index = vindexl.array[n];
2422+
long offset = index * scale;
2423+
void* p = cast(void*)(base_addr);
2424+
if (maski.array[n] < 0)
2425+
r.ptr[n] = *cast(int*)(p + offset);
2426+
else
2427+
r.ptr[n] = srci.ptr[n];
2428+
}
2429+
return r;
2430+
}
2431+
}
2432+
unittest
2433+
{
2434+
int[24] data = [0, 1, 2, 3,
2435+
4, 5, 6, 7,
2436+
8, 9, 10, 11,
2437+
12, 13, 14, 15,
2438+
16, 17, 18, 19,
2439+
20, 21, 22, 23];
2440+
__m128i src = _mm_setr_epi32(-1, -2, -3, -4);
2441+
__m128i mask = _mm_setr_epi32(-4, 4, -1, -2);
2442+
__m256i vindex = _mm256_setr_epi64(-4, 8, 0, 12);
2443+
2444+
int4 A = cast(int4) _mm256_mask_i64gather_epi32!1(src, &data[10], vindex, mask);
2445+
int4 B = cast(int4) _mm256_mask_i64gather_epi32!2(src, &data[10], vindex, mask);
2446+
int4 C = cast(int4) _mm256_mask_i64gather_epi32!4(src, &data[10], vindex, mask);
2447+
int[4] correctA = [9, -2, 10, 13];
2448+
int[4] correctB = [8, -2, 10, 16];
2449+
int[4] correctC = [6, -2, 10, 22];
2450+
assert(A.array == correctA);
2451+
assert(B.array == correctB);
2452+
assert(C.array == correctC);
2453+
}
23792454
// TODO __m128i _mm_i64gather_epi64 (__int64 const* base_addr, __m128i vindex, const int scale) pure @safe
23802455
// TODO __m128i _mm_mask_i64gather_epi64 (__m128i src, __int64 const* base_addr, __m128i vindex, __m128i mask, const int scale) pure @safe
23812456
// TODO __m256i _mm256_i64gather_epi64 (__int64 const* base_addr, __m256i vindex, const int scale) pure @safe
@@ -5534,27 +5609,4 @@ long2 __builtin_ia32_gatherq_q(long2, const void*, long2, long2, byte);
55345609
pragma(LDC_intrinsic, "llvm.x86.avx2.gather.q.q.256")
55355610
long4 __builtin_ia32_gatherq_q256(long4, const void*, long4, long4, byte);
55365611
5537-
pragma(LDC_intrinsic, "llvm.x86.avx2.maskstore.d")
5538-
void __builtin_ia32_maskstored(void*, int4, int4);
5539-
5540-
pragma(LDC_intrinsic, "llvm.x86.avx2.maskstore.d.256")
5541-
void __builtin_ia32_maskstored256(void*, int8, int8);
5542-
5543-
pragma(LDC_intrinsic, "llvm.x86.avx2.maskstore.q")
5544-
void __builtin_ia32_maskstoreq(void*, long2, long2);
5545-
5546-
pragma(LDC_intrinsic, "llvm.x86.avx2.maskstore.q.256")
5547-
void __builtin_ia32_maskstoreq256(void*, long4, long4);
5548-
5549-
5550-
pragma(LDC_intrinsic, "llvm.x86.avx2.permd")
5551-
int8 __builtin_ia32_permvarsi256(int8, int8) pure @safe;
5552-
5553-
pragma(LDC_intrinsic, "llvm.x86.avx2.permps")
5554-
float8 __builtin_ia32_permvarsf256(float8, int8) pure @safe;
5555-
5556-
5557-
pragma(LDC_intrinsic, "llvm.x86.avx2.pmovmskb")
5558-
int __builtin_ia32_pmovmskb256(byte32) pure @safe;
5559-
55605612
+/

0 commit comments

Comments
 (0)