@@ -2374,8 +2374,83 @@ unittest
23742374 assert (C.array == correctC);
23752375}
23762376
2377- // TODO __m128i _mm256_i64gather_epi32 (int const* base_addr, __m256i vindex, const int scale) pure @safe
2378- // TODO __m128i _mm256_mask_i64gather_epi32 (__m128i src, int const* base_addr, __m256i vindex, __m128i mask, const int scale) pure @safe
2377+ // / Gather 32-bit integers from memory using 64-bit indices. 32-bit elements are loaded
2378+ // / from addresses starting at `base_addr` and offset by each 64-bit element in `vindex`
2379+ // / (each index is scaled by the factor in `scale`). Return gathered elements.
2380+ // / `scale` should be 1, 2, 4 or 8.
2381+ __m128i _mm256_i64gather_epi32 (int scale)(const (int )* base_addr, __m256i vindex) @system
2382+ {
2383+ __m128i src;
2384+ return _mm256_mask_i64gather_epi32! scale(src, base_addr, vindex, _mm_set1_epi32(- 1 ));
2385+ }
2386+ unittest
2387+ {
2388+ int [8 ] data = [0 , 1 , 2 , 3 ,
2389+ 4 , 5 , 6 , 7 ];
2390+ __m256i vindex = _mm256_setr_epi64(- 2 , 4 , 0 , 2 );
2391+ int4 A = cast (int4) _mm256_i64gather_epi32! 2 (&data[1 ], vindex);
2392+ int [4 ] correctA = [0 , 3 , 1 , 2 ];
2393+ assert (A.array == correctA);
2394+ }
2395+
2396+ // / Gather 32-bit integers from memory using 64-bit indices. 32-bit elements are loaded
2397+ // / from addresses starting at `base_addr` and offset by each 64-bit element in `vindex`
2398+ // / (each index is scaled by the factor in `scale`). Gathered elements are merged using
2399+ // / `mask` (elements are copied from `src` when the highest bit is not set in the
2400+ // / corresponding element). `scale` should be 1, 2, 4 or 8.
2401+ __m128i _mm256_mask_i64gather_epi32 (int scale)(__m128i src, const (int )* base_addr, __m256i vindex, __m128i mask) @system
2402+ {
2403+ static assert (isValidSIBScale(scale));
2404+
2405+ static if (GDC_with_AVX2)
2406+ {
2407+ return cast (__m128i) __builtin_ia32_gatherdiv4si256(cast (int4)src, base_addr, cast (long4)vindex, cast (int4)mask, scale);
2408+ }
2409+ else static if (LDC_with_AVX2)
2410+ {
2411+ return cast (__m128i) __builtin_ia32_gatherq_d256(cast (int4)src, base_addr, cast (long4)vindex, cast (int4)mask, cast (ubyte )scale);
2412+ }
2413+ else
2414+ {
2415+ __m128i r = src;
2416+ long4 vindexl = cast (long4)vindex;
2417+ int4 srci = cast (int4)src;
2418+ int4 maski = cast (int4)mask;
2419+ for (int n = 0 ; n < 4 ; ++ n)
2420+ {
2421+ long index = vindexl.array[n];
2422+ long offset = index * scale;
2423+ void * p = cast (void * )(base_addr);
2424+ if (maski.array[n] < 0 )
2425+ r.ptr[n] = * cast (int * )(p + offset);
2426+ else
2427+ r.ptr[n] = srci.ptr[n];
2428+ }
2429+ return r;
2430+ }
2431+ }
2432+ unittest
2433+ {
2434+ int [24 ] data = [0 , 1 , 2 , 3 ,
2435+ 4 , 5 , 6 , 7 ,
2436+ 8 , 9 , 10 , 11 ,
2437+ 12 , 13 , 14 , 15 ,
2438+ 16 , 17 , 18 , 19 ,
2439+ 20 , 21 , 22 , 23 ];
2440+ __m128i src = _mm_setr_epi32(- 1 , - 2 , - 3 , - 4 );
2441+ __m128i mask = _mm_setr_epi32(- 4 , 4 , - 1 , - 2 );
2442+ __m256i vindex = _mm256_setr_epi64(- 4 , 8 , 0 , 12 );
2443+
2444+ int4 A = cast (int4) _mm256_mask_i64gather_epi32! 1 (src, &data[10 ], vindex, mask);
2445+ int4 B = cast (int4) _mm256_mask_i64gather_epi32! 2 (src, &data[10 ], vindex, mask);
2446+ int4 C = cast (int4) _mm256_mask_i64gather_epi32! 4 (src, &data[10 ], vindex, mask);
2447+ int [4 ] correctA = [9 , - 2 , 10 , 13 ];
2448+ int [4 ] correctB = [8 , - 2 , 10 , 16 ];
2449+ int [4 ] correctC = [6 , - 2 , 10 , 22 ];
2450+ assert (A.array == correctA);
2451+ assert (B.array == correctB);
2452+ assert (C.array == correctC);
2453+ }
23792454// TODO __m128i _mm_i64gather_epi64 (__int64 const* base_addr, __m128i vindex, const int scale) pure @safe
23802455// TODO __m128i _mm_mask_i64gather_epi64 (__m128i src, __int64 const* base_addr, __m128i vindex, __m128i mask, const int scale) pure @safe
23812456// TODO __m256i _mm256_i64gather_epi64 (__int64 const* base_addr, __m256i vindex, const int scale) pure @safe
@@ -5534,27 +5609,4 @@ long2 __builtin_ia32_gatherq_q(long2, const void*, long2, long2, byte);
55345609pragma(LDC_intrinsic, "llvm.x86.avx2.gather.q.q.256")
55355610long4 __builtin_ia32_gatherq_q256(long4, const void*, long4, long4, byte);
55365611
5537- pragma(LDC_intrinsic, "llvm.x86.avx2.maskstore.d")
5538- void __builtin_ia32_maskstored(void*, int4, int4);
5539-
5540- pragma(LDC_intrinsic, "llvm.x86.avx2.maskstore.d.256")
5541- void __builtin_ia32_maskstored256(void*, int8, int8);
5542-
5543- pragma(LDC_intrinsic, "llvm.x86.avx2.maskstore.q")
5544- void __builtin_ia32_maskstoreq(void*, long2, long2);
5545-
5546- pragma(LDC_intrinsic, "llvm.x86.avx2.maskstore.q.256")
5547- void __builtin_ia32_maskstoreq256(void*, long4, long4);
5548-
5549-
5550- pragma(LDC_intrinsic, "llvm.x86.avx2.permd")
5551- int8 __builtin_ia32_permvarsi256(int8, int8) pure @safe;
5552-
5553- pragma(LDC_intrinsic, "llvm.x86.avx2.permps")
5554- float8 __builtin_ia32_permvarsf256(float8, int8) pure @safe;
5555-
5556-
5557- pragma(LDC_intrinsic, "llvm.x86.avx2.pmovmskb")
5558- int __builtin_ia32_pmovmskb256(byte32) pure @safe;
5559-
55605612+/
0 commit comments