Skip to content

Commit 701c8c8

Browse files
author
Fikret Ardal
committed
simd_i16 kernel_transpose fix
1 parent 1319f0b commit 701c8c8

File tree

1 file changed

+16
-16
lines changed

1 file changed

+16
-16
lines changed

c++/nda/simd/arch/AVX512/kernel.hpp

Lines changed: 16 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -24,22 +24,22 @@ namespace nda::simd {
2424
__m512i _tmpe = _mm512_unpacklo_epi32(simd_block[14], simd_block[15]);
2525
__m512i _tmpf = _mm512_unpackhi_epi32(simd_block[14], simd_block[15]);
2626

27-
__m512i _tmpg = _mm512_shuffle_epi32(_tmp0, _tmp2, _MM_SHUFFLE(1, 0, 1, 0));
28-
__m512i _tmph = _mm512_shuffle_epi32(_tmp0, _tmp2, _MM_SHUFFLE(3, 2, 3, 2));
29-
__m512i _tmpi = _mm512_shuffle_epi32(_tmp1, _tmp3, _MM_SHUFFLE(1, 0, 1, 0));
30-
__m512i _tmpj = _mm512_shuffle_epi32(_tmp1, _tmp3, _MM_SHUFFLE(3, 2, 3, 2));
31-
__m512i _tmpk = _mm512_shuffle_epi32(_tmp4, _tmp6, _MM_SHUFFLE(1, 0, 1, 0));
32-
__m512i _tmpl = _mm512_shuffle_epi32(_tmp4, _tmp6, _MM_SHUFFLE(3, 2, 3, 2));
33-
__m512i _tmpm = _mm512_shuffle_epi32(_tmp5, _tmp7, _MM_SHUFFLE(1, 0, 1, 0));
34-
__m512i _tmpn = _mm512_shuffle_epi32(_tmp5, _tmp7, _MM_SHUFFLE(3, 2, 3, 2));
35-
__m512i _tmpo = _mm512_shuffle_epi32(_tmp8, _tmpa, _MM_SHUFFLE(1, 0, 1, 0));
36-
__m512i _tmpp = _mm512_shuffle_epi32(_tmp8, _tmpa, _MM_SHUFFLE(3, 2, 3, 2));
37-
__m512i _tmpq = _mm512_shuffle_epi32(_tmp9, _tmpb, _MM_SHUFFLE(1, 0, 1, 0));
38-
__m512i _tmpr = _mm512_shuffle_epi32(_tmp9, _tmpb, _MM_SHUFFLE(3, 2, 3, 2));
39-
__m512i _tmps = _mm512_shuffle_epi32(_tmpc, _tmpe, _MM_SHUFFLE(1, 0, 1, 0));
40-
__m512i _tmpt = _mm512_shuffle_epi32(_tmpc, _tmpe, _MM_SHUFFLE(3, 2, 3, 2));
41-
__m512i _tmpu = _mm512_shuffle_epi32(_tmpd, _tmpf, _MM_SHUFFLE(1, 0, 1, 0));
42-
__m512i _tmpv = _mm512_shuffle_epi32(_tmpd, _tmpf, _MM_SHUFFLE(3, 2, 3, 2));
27+
__m512i _tmpg = _mm512_castps_si512(_mm512_shuffle_ps(_mm512_castsi512_ps(_tmp0), _mm512_castsi512_ps(_tmp2), _MM_SHUFFLE(1, 0, 1, 0)));
28+
__m512i _tmph = _mm512_castps_si512(_mm512_shuffle_ps(_mm512_castsi512_ps(_tmp0), _mm512_castsi512_ps(_tmp2), _MM_SHUFFLE(3, 2, 3, 2)));
29+
__m512i _tmpi = _mm512_castps_si512(_mm512_shuffle_ps(_mm512_castsi512_ps(_tmp1), _mm512_castsi512_ps(_tmp3), _MM_SHUFFLE(1, 0, 1, 0)));
30+
__m512i _tmpj = _mm512_castps_si512(_mm512_shuffle_ps(_mm512_castsi512_ps(_tmp1), _mm512_castsi512_ps(_tmp3), _MM_SHUFFLE(3, 2, 3, 2)));
31+
__m512i _tmpk = _mm512_castps_si512(_mm512_shuffle_ps(_mm512_castsi512_ps(_tmp4), _mm512_castsi512_ps(_tmp6), _MM_SHUFFLE(1, 0, 1, 0)));
32+
__m512i _tmpl = _mm512_castps_si512(_mm512_shuffle_ps(_mm512_castsi512_ps(_tmp4), _mm512_castsi512_ps(_tmp6), _MM_SHUFFLE(3, 2, 3, 2)));
33+
__m512i _tmpm = _mm512_castps_si512(_mm512_shuffle_ps(_mm512_castsi512_ps(_tmp5), _mm512_castsi512_ps(_tmp7), _MM_SHUFFLE(1, 0, 1, 0)));
34+
__m512i _tmpn = _mm512_castps_si512(_mm512_shuffle_ps(_mm512_castsi512_ps(_tmp5), _mm512_castsi512_ps(_tmp7), _MM_SHUFFLE(3, 2, 3, 2)));
35+
__m512i _tmpo = _mm512_castps_si512(_mm512_shuffle_ps(_mm512_castsi512_ps(_tmp8), _mm512_castsi512_ps(_tmpa), _MM_SHUFFLE(1, 0, 1, 0)));
36+
__m512i _tmpp = _mm512_castps_si512(_mm512_shuffle_ps(_mm512_castsi512_ps(_tmp8), _mm512_castsi512_ps(_tmpa), _MM_SHUFFLE(3, 2, 3, 2)));
37+
__m512i _tmpq = _mm512_castps_si512(_mm512_shuffle_ps(_mm512_castsi512_ps(_tmp9), _mm512_castsi512_ps(_tmpb), _MM_SHUFFLE(1, 0, 1, 0)));
38+
__m512i _tmpr = _mm512_castps_si512(_mm512_shuffle_ps(_mm512_castsi512_ps(_tmp9), _mm512_castsi512_ps(_tmpb), _MM_SHUFFLE(3, 2, 3, 2)));
39+
__m512i _tmps = _mm512_castps_si512(_mm512_shuffle_ps(_mm512_castsi512_ps(_tmpc), _mm512_castsi512_ps(_tmpe), _MM_SHUFFLE(1, 0, 1, 0)));
40+
__m512i _tmpt = _mm512_castps_si512(_mm512_shuffle_ps(_mm512_castsi512_ps(_tmpc), _mm512_castsi512_ps(_tmpe), _MM_SHUFFLE(3, 2, 3, 2)));
41+
__m512i _tmpu = _mm512_castps_si512(_mm512_shuffle_ps(_mm512_castsi512_ps(_tmpd), _mm512_castsi512_ps(_tmpf), _MM_SHUFFLE(1, 0, 1, 0)));
42+
__m512i _tmpv = _mm512_castps_si512(_mm512_shuffle_ps(_mm512_castsi512_ps(_tmpd), _mm512_castsi512_ps(_tmpf), _MM_SHUFFLE(3, 2, 3, 2)));
4343

4444
_tmp0 = _mm512_shuffle_i32x4(_tmpg, _tmpk, _MM_SHUFFLE(2, 0, 2, 0));
4545
_tmp1 = _mm512_shuffle_i32x4(_tmpo, _tmps, _MM_SHUFFLE(2, 0, 2, 0));

0 commit comments

Comments
 (0)