@@ -24,22 +24,22 @@ namespace nda::simd {
2424 __m512i _tmpe = _mm512_unpacklo_epi32 (simd_block[14 ], simd_block[15 ]);
2525 __m512i _tmpf = _mm512_unpackhi_epi32 (simd_block[14 ], simd_block[15 ]);
2626
27- __m512i _tmpg = _mm512_shuffle_epi32 ( _tmp0, _tmp2, _MM_SHUFFLE (1 , 0 , 1 , 0 ));
28- __m512i _tmph = _mm512_shuffle_epi32 ( _tmp0, _tmp2, _MM_SHUFFLE (3 , 2 , 3 , 2 ));
29- __m512i _tmpi = _mm512_shuffle_epi32 ( _tmp1, _tmp3, _MM_SHUFFLE (1 , 0 , 1 , 0 ));
30- __m512i _tmpj = _mm512_shuffle_epi32 ( _tmp1, _tmp3, _MM_SHUFFLE (3 , 2 , 3 , 2 ));
31- __m512i _tmpk = _mm512_shuffle_epi32 ( _tmp4, _tmp6, _MM_SHUFFLE (1 , 0 , 1 , 0 ));
32- __m512i _tmpl = _mm512_shuffle_epi32 ( _tmp4, _tmp6, _MM_SHUFFLE (3 , 2 , 3 , 2 ));
33- __m512i _tmpm = _mm512_shuffle_epi32 ( _tmp5, _tmp7, _MM_SHUFFLE (1 , 0 , 1 , 0 ));
34- __m512i _tmpn = _mm512_shuffle_epi32 ( _tmp5, _tmp7, _MM_SHUFFLE (3 , 2 , 3 , 2 ));
35- __m512i _tmpo = _mm512_shuffle_epi32 ( _tmp8, _tmpa, _MM_SHUFFLE (1 , 0 , 1 , 0 ));
36- __m512i _tmpp = _mm512_shuffle_epi32 ( _tmp8, _tmpa, _MM_SHUFFLE (3 , 2 , 3 , 2 ));
37- __m512i _tmpq = _mm512_shuffle_epi32 ( _tmp9, _tmpb, _MM_SHUFFLE (1 , 0 , 1 , 0 ));
38- __m512i _tmpr = _mm512_shuffle_epi32 ( _tmp9, _tmpb, _MM_SHUFFLE (3 , 2 , 3 , 2 ));
39- __m512i _tmps = _mm512_shuffle_epi32 ( _tmpc, _tmpe, _MM_SHUFFLE (1 , 0 , 1 , 0 ));
40- __m512i _tmpt = _mm512_shuffle_epi32 ( _tmpc, _tmpe, _MM_SHUFFLE (3 , 2 , 3 , 2 ));
41- __m512i _tmpu = _mm512_shuffle_epi32 ( _tmpd, _tmpf, _MM_SHUFFLE (1 , 0 , 1 , 0 ));
42- __m512i _tmpv = _mm512_shuffle_epi32 ( _tmpd, _tmpf, _MM_SHUFFLE (3 , 2 , 3 , 2 ));
27+ __m512i _tmpg = _mm512_castps_si512 ( _mm512_shuffle_ps ( _mm512_castsi512_ps ( _tmp0), _mm512_castsi512_ps ( _tmp2) , _MM_SHUFFLE (1 , 0 , 1 , 0 ) ));
28+ __m512i _tmph = _mm512_castps_si512 ( _mm512_shuffle_ps ( _mm512_castsi512_ps ( _tmp0), _mm512_castsi512_ps ( _tmp2) , _MM_SHUFFLE (3 , 2 , 3 , 2 ) ));
29+ __m512i _tmpi = _mm512_castps_si512 ( _mm512_shuffle_ps ( _mm512_castsi512_ps ( _tmp1), _mm512_castsi512_ps ( _tmp3) , _MM_SHUFFLE (1 , 0 , 1 , 0 ) ));
30+ __m512i _tmpj = _mm512_castps_si512 ( _mm512_shuffle_ps ( _mm512_castsi512_ps ( _tmp1), _mm512_castsi512_ps ( _tmp3) , _MM_SHUFFLE (3 , 2 , 3 , 2 ) ));
31+ __m512i _tmpk = _mm512_castps_si512 ( _mm512_shuffle_ps ( _mm512_castsi512_ps ( _tmp4), _mm512_castsi512_ps ( _tmp6) , _MM_SHUFFLE (1 , 0 , 1 , 0 ) ));
32+ __m512i _tmpl = _mm512_castps_si512 ( _mm512_shuffle_ps ( _mm512_castsi512_ps ( _tmp4), _mm512_castsi512_ps ( _tmp6) , _MM_SHUFFLE (3 , 2 , 3 , 2 ) ));
33+ __m512i _tmpm = _mm512_castps_si512 ( _mm512_shuffle_ps ( _mm512_castsi512_ps ( _tmp5), _mm512_castsi512_ps ( _tmp7) , _MM_SHUFFLE (1 , 0 , 1 , 0 ) ));
34+ __m512i _tmpn = _mm512_castps_si512 ( _mm512_shuffle_ps ( _mm512_castsi512_ps ( _tmp5), _mm512_castsi512_ps ( _tmp7) , _MM_SHUFFLE (3 , 2 , 3 , 2 ) ));
35+ __m512i _tmpo = _mm512_castps_si512 ( _mm512_shuffle_ps ( _mm512_castsi512_ps ( _tmp8), _mm512_castsi512_ps ( _tmpa) , _MM_SHUFFLE (1 , 0 , 1 , 0 ) ));
36+ __m512i _tmpp = _mm512_castps_si512 ( _mm512_shuffle_ps ( _mm512_castsi512_ps ( _tmp8), _mm512_castsi512_ps ( _tmpa) , _MM_SHUFFLE (3 , 2 , 3 , 2 ) ));
37+ __m512i _tmpq = _mm512_castps_si512 ( _mm512_shuffle_ps ( _mm512_castsi512_ps ( _tmp9), _mm512_castsi512_ps ( _tmpb) , _MM_SHUFFLE (1 , 0 , 1 , 0 ) ));
38+ __m512i _tmpr = _mm512_castps_si512 ( _mm512_shuffle_ps ( _mm512_castsi512_ps ( _tmp9), _mm512_castsi512_ps ( _tmpb) , _MM_SHUFFLE (3 , 2 , 3 , 2 ) ));
39+ __m512i _tmps = _mm512_castps_si512 ( _mm512_shuffle_ps ( _mm512_castsi512_ps ( _tmpc), _mm512_castsi512_ps ( _tmpe) , _MM_SHUFFLE (1 , 0 , 1 , 0 ) ));
40+ __m512i _tmpt = _mm512_castps_si512 ( _mm512_shuffle_ps ( _mm512_castsi512_ps ( _tmpc), _mm512_castsi512_ps ( _tmpe) , _MM_SHUFFLE (3 , 2 , 3 , 2 ) ));
41+ __m512i _tmpu = _mm512_castps_si512 ( _mm512_shuffle_ps ( _mm512_castsi512_ps ( _tmpd), _mm512_castsi512_ps ( _tmpf) , _MM_SHUFFLE (1 , 0 , 1 , 0 ) ));
42+ __m512i _tmpv = _mm512_castps_si512 ( _mm512_shuffle_ps ( _mm512_castsi512_ps ( _tmpd), _mm512_castsi512_ps ( _tmpf) , _MM_SHUFFLE (3 , 2 , 3 , 2 ) ));
4343
4444 _tmp0 = _mm512_shuffle_i32x4 (_tmpg, _tmpk, _MM_SHUFFLE (2 , 0 , 2 , 0 ));
4545 _tmp1 = _mm512_shuffle_i32x4 (_tmpo, _tmps, _MM_SHUFFLE (2 , 0 , 2 , 0 ));
0 commit comments