Skip to content

Commit 38a2da1

Browse files
author
Fikret Ardal
committed
kernel_transpose for avx512 long/double
1 parent c8cb288 commit 38a2da1

File tree

2 files changed

+127
-6
lines changed

2 files changed

+127
-6
lines changed

c++/nda/simd/arch/AVX512/kernel.hpp

Lines changed: 126 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,10 +9,71 @@ namespace nda::simd {
99
return simd_block;
1010
}
1111

12-
1312
template <>
1413
inline std::array<simd_l8, 8> kernel_transpose(const std::array<simd_l8, 8> &simd_block) {
15-
return simd_block;
14+
simd_l8 a0a1a2a3a4a5a6a7 = simd_block[0];
15+
simd_l8 b0b1b2b3b4b5b6b7 = simd_block[1];
16+
simd_l8 c0c1c2c3c4c5c6c7 = simd_block[2];
17+
simd_l8 d0d1d2d3d4d5d6d7 = simd_block[3];
18+
simd_l8 e0e1e2e3e4e5e6e7 = simd_block[4];
19+
simd_l8 f0f1f2f3f4f5f6f7 = simd_block[5];
20+
simd_l8 g0g1g2g3g4g5g6g7 = simd_block[6];
21+
simd_l8 h0h1h2h3h4h5h6h7 = simd_block[7];
22+
23+
__m512i a0b0a2b2a4b4a6b6 = _mm512_unpacklo_epi64(a0a1a2a3a4a5a6a7, b0b1b2b3b4b5b6b7);
24+
__m512i a1b1a3b3a5b5a7b7 = _mm512_unpackhi_epi64(a0a1a2a3a4a5a6a7, b0b1b2b3b4b5b6b7);
25+
__m512i c0d0c2d2c4d4c6d6 = _mm512_unpacklo_epi64(c0c1c2c3c4c5c6c7, d0d1d2d3d4d5d6d7);
26+
__m512i c1d1c3d3c5d5c7d7 = _mm512_unpackhi_epi64(c0c1c2c3c4c5c6c7, d0d1d2d3d4d5d6d7);
27+
__m512i e0f0e2f2e4f4e6f6 = _mm512_unpacklo_epi64(e0e1e2e3e4e5e6e7, f0f1f2f3f4f5f6f7);
28+
__m512i e1f1e3f3e5f5e5f7 = _mm512_unpackhi_epi64(e0e1e2e3e4e5e6e7, f0f1f2f3f4f5f6f7);
29+
__m512i g0h0g2h2g4h4g6h6 = _mm512_unpacklo_epi64(g0g1g2g3g4g5g6g7, h0h1h2h3h4h5h6h7);
30+
__m512i g1h1g3h3g5h5g7h7 = _mm512_unpackhi_epi64(g0g1g2g3g4g5g6g7, h0h1h2h3h4h5h6h7);
31+
32+
__m512i a2b2a0b0a6b6a4b4 = _mm512_shuffle_i64x2(a0b0a2b2a4b4a6b6, a0b0a2b2a4b4a6b6, NDA_SHUFFLE_MASK4(1, 0, 3, 2));
33+
__m512i a3b3a1b1a7b7a5b5 = _mm512_shuffle_i64x2(a1b1a3b3a5b5a7b7, a1b1a3b3a5b5a7b7, NDA_SHUFFLE_MASK4(1, 0, 3, 2));
34+
__m512i c2d2c0d0c6d6c4d4 = _mm512_shuffle_i64x2(c0d0c2d2c4d4c6d6, c0d0c2d2c4d4c6d6, NDA_SHUFFLE_MASK4(1, 0, 3, 2));
35+
__m512i c3d3c1d1c7d7c5d5 = _mm512_shuffle_i64x2(c1d1c3d3c5d5c7d7, c1d1c3d3c5d5c7d7, NDA_SHUFFLE_MASK4(1, 0, 3, 2));
36+
__m512i e2f2e0f0e6f6e4f4 = _mm512_shuffle_i64x2(e0f0e2f2e4f4e6f6, e0f0e2f2e4f4e6f6, NDA_SHUFFLE_MASK4(1, 0, 3, 2));
37+
__m512i e3f3e1f1e7f7e5f5 = _mm512_shuffle_i64x2(e1f1e3f3e5f5e5f7, e1f1e3f3e5f5e5f7, NDA_SHUFFLE_MASK4(1, 0, 3, 2));
38+
__m512i g2h2g0h0g6h6g4h4 = _mm512_shuffle_i64x2(g0h0g2h2g4h4g6h6, g0h0g2h2g4h4g6h6, NDA_SHUFFLE_MASK4(1, 0, 3, 2));
39+
__m512i g3h3g1h1g7h7g5h5 = _mm512_shuffle_i64x2(g1h1g3h3g5h5g7h7, g1h1g3h3g5h5g7h7, NDA_SHUFFLE_MASK4(1, 0, 3, 2));
40+
41+
__m512i a2b2c2d2a6b6c6d6 = _mm512_mask_blend_epi64(0b11001100, a2b2a0b0a6b6a4b4, c0d0c2d2c4d4c6d6);
42+
__m512i a3b3c3d3a7b7c7d7 = _mm512_mask_blend_epi64(0b11001100, a3b3a1b1a7b7a5b5, c1d1c3d3c5d5c7d7);
43+
__m512i a0b0c0d0a4b4c4d4 = _mm512_mask_blend_epi64(0b11001100, a0b0a2b2a4b4a6b6, c2d2c0d0c6d6c4d4);
44+
__m512i a1b1c1d1a5b5c5d5 = _mm512_mask_blend_epi64(0b11001100, a1b1a3b3a5b5a7b7, c3d3c1d1c7d7c5d5);
45+
__m512i e2f2g2h2e6f6g6h6 = _mm512_mask_blend_epi64(0b11001100, e2f2e0f0e6f6e4f4, g0h0g2h2g4h4g6h6);
46+
__m512i e3f3g3h3e7f7g7h7 = _mm512_mask_blend_epi64(0b11001100, e3f3e1f1e7f7e5f5, g1h1g3h3g5h5g7h7);
47+
__m512i e0f0g0h0e4f4g4h4 = _mm512_mask_blend_epi64(0b11001100, e0f0e2f2e4f4e6f6, g2h2g0h0g6h6g4h4);
48+
__m512i e1f1g1h1e5f5g5h5 = _mm512_mask_blend_epi64(0b11001100, e1f1e3f3e5f5e5f7, g3h3g1h1g7h7g5h5);
49+
50+
__m512i e0f0g0h0e0f0g0h0 = _mm512_shuffle_i64x2(e0f0g0h0e4f4g4h4, e0f0g0h0e4f4g4h4, NDA_SHUFFLE_MASK4(0, 1, 0, 1));
51+
__m512i e1f1g1h1e1f1g1h1 = _mm512_shuffle_i64x2(e1f1g1h1e5f5g5h5, e1f1g1h1e5f5g5h5, NDA_SHUFFLE_MASK4(0, 1, 0, 1));
52+
__m512i e2f2g2h2e2f2g2h2 = _mm512_shuffle_i64x2(e2f2g2h2e6f6g6h6, e2f2g2h2e6f6g6h6, NDA_SHUFFLE_MASK4(0, 1, 0, 1));
53+
__m512i e3f3g3h3e3f3g3h3 = _mm512_shuffle_i64x2(e3f3g3h3e7f7g7h7, e3f3g3h3e7f7g7h7, NDA_SHUFFLE_MASK4(0, 1, 0, 1));
54+
__m512i a4b4c4d4a4b4c4d4 = _mm512_shuffle_i64x2(a0b0c0d0a4b4c4d4, a0b0c0d0a4b4c4d4, NDA_SHUFFLE_MASK4(2, 3, 2, 3));
55+
__m512i a5b5c5d5a5b5c5d5 = _mm512_shuffle_i64x2(a1b1c1d1a5b5c5d5, a1b1c1d1a5b5c5d5, NDA_SHUFFLE_MASK4(2, 3, 2, 3));
56+
__m512i a6b6c6d6a6b6c6d6 = _mm512_shuffle_i64x2(a2b2c2d2a6b6c6d6, a2b2c2d2a6b6c6d6, NDA_SHUFFLE_MASK4(2, 3, 2, 3));
57+
__m512i a7b7c7d7a7b7c7d7 = _mm512_shuffle_i64x2(a3b3c3d3a7b7c7d7, a3b3c3d3a7b7c7d7, NDA_SHUFFLE_MASK4(2, 3, 2, 3));
58+
59+
simd_l8 a0b0c0d0e0f0g0h0(_mm512_mask_blend_epi64(0b11110000, a0b0c0d0a4b4c4d4, e0f0g0h0e0f0g0h0));
60+
61+
simd_l8 a1b1c1d1e1f1g1h1(_mm512_mask_blend_epi64(0b11110000, a1b1c1d1a5b5c5d5, e1f1g1h1e1f1g1h1));
62+
63+
simd_l8 a2b2c2d2e2f2g2h2(_mm512_mask_blend_epi64(0b11110000, a2b2c2d2a6b6c6d6, e2f2g2h2e2f2g2h2));
64+
65+
simd_l8 a3b3c3d3e3f3g3h3(_mm512_mask_blend_epi64(0b11110000, a3b3c3d3a7b7c7d7, e3f3g3h3e3f3g3h3));
66+
67+
simd_l8 a4b4c4d4e4f4g4h4(_mm512_mask_blend_epi64(0b11110000, a4b4c4d4a4b4c4d4, e0f0g0h0e4f4g4h4));
68+
69+
simd_l8 a5b5c5d5e5f5g5h5(_mm512_mask_blend_epi64(0b11110000, a5b5c5d5a5b5c5d5, e1f1g1h1e5f5g5h5));
70+
71+
simd_l8 a6b6c6d6e6f6g6h6(_mm512_mask_blend_epi64(0b11110000, a6b6c6d6a6b6c6d6, e2f2g2h2e6f6g6h6));
72+
73+
simd_l8 a7b7c7d7e7f7g7h7(_mm512_mask_blend_epi64(0b11110000, a7b7c7d7a7b7c7d7, e3f3g3h3e7f7g7h7));
74+
75+
return {a0b0c0d0e0f0g0h0, a1b1c1d1e1f1g1h1, a2b2c2d2e2f2g2h2, a3b3c3d3e3f3g3h3,
76+
a4b4c4d4e4f4g4h4, a5b5c5d5e5f5g5h5, a6b6c6d6e6f6g6h6, a7b7c7d7e7f7g7h7};
1677
}
1778

1879
template <>
@@ -22,7 +83,69 @@ namespace nda::simd {
2283

2384
template <>
2485
inline std::array<simd_d8, 8> kernel_transpose(const std::array<simd_d8, 8> &simd_block) {
25-
return simd_block;
86+
simd_d8 a0a1a2a3a4a5a6a7 = simd_block[0];
87+
simd_d8 b0b1b2b3b4b5b6b7 = simd_block[1];
88+
simd_d8 c0c1c2c3c4c5c6c7 = simd_block[2];
89+
simd_d8 d0d1d2d3d4d5d6d7 = simd_block[3];
90+
simd_d8 e0e1e2e3e4e5e6e7 = simd_block[4];
91+
simd_d8 f0f1f2f3f4f5f6f7 = simd_block[5];
92+
simd_d8 g0g1g2g3g4g5g6g7 = simd_block[6];
93+
simd_d8 h0h1h2h3h4h5h6h7 = simd_block[7];
94+
95+
__m512d a0b0a2b2a4b4a6b6 = _mm512_unpacklo_pd(a0a1a2a3a4a5a6a7, b0b1b2b3b4b5b6b7);
96+
__m512d a1b1a3b3a5b5a7b7 = _mm512_unpackhi_pd(a0a1a2a3a4a5a6a7, b0b1b2b3b4b5b6b7);
97+
__m512d c0d0c2d2c4d4c6d6 = _mm512_unpacklo_pd(c0c1c2c3c4c5c6c7, d0d1d2d3d4d5d6d7);
98+
__m512d c1d1c3d3c5d5c7d7 = _mm512_unpackhi_pd(c0c1c2c3c4c5c6c7, d0d1d2d3d4d5d6d7);
99+
__m512d e0f0e2f2e4f4e6f6 = _mm512_unpacklo_pd(e0e1e2e3e4e5e6e7, f0f1f2f3f4f5f6f7);
100+
__m512d e1f1e3f3e5f5e5f7 = _mm512_unpackhi_pd(e0e1e2e3e4e5e6e7, f0f1f2f3f4f5f6f7);
101+
__m512d g0h0g2h2g4h4g6h6 = _mm512_unpacklo_pd(g0g1g2g3g4g5g6g7, h0h1h2h3h4h5h6h7);
102+
__m512d g1h1g3h3g5h5g7h7 = _mm512_unpackhi_pd(g0g1g2g3g4g5g6g7, h0h1h2h3h4h5h6h7);
103+
104+
__m512d a2b2a0b0a6b6a4b4 = _mm512_shuffle_f64x2(a0b0a2b2a4b4a6b6, a0b0a2b2a4b4a6b6, NDA_SHUFFLE_MASK4(1, 0, 3, 2));
105+
__m512d a3b3a1b1a7b7a5b5 = _mm512_shuffle_f64x2(a1b1a3b3a5b5a7b7, a1b1a3b3a5b5a7b7, NDA_SHUFFLE_MASK4(1, 0, 3, 2));
106+
__m512d c2d2c0d0c6d6c4d4 = _mm512_shuffle_f64x2(c0d0c2d2c4d4c6d6, c0d0c2d2c4d4c6d6, NDA_SHUFFLE_MASK4(1, 0, 3, 2));
107+
__m512d c3d3c1d1c7d7c5d5 = _mm512_shuffle_f64x2(c1d1c3d3c5d5c7d7, c1d1c3d3c5d5c7d7, NDA_SHUFFLE_MASK4(1, 0, 3, 2));
108+
__m512d e2f2e0f0e6f6e4f4 = _mm512_shuffle_f64x2(e0f0e2f2e4f4e6f6, e0f0e2f2e4f4e6f6, NDA_SHUFFLE_MASK4(1, 0, 3, 2));
109+
__m512d e3f3e1f1e7f7e5f5 = _mm512_shuffle_f64x2(e1f1e3f3e5f5e5f7, e1f1e3f3e5f5e5f7, NDA_SHUFFLE_MASK4(1, 0, 3, 2));
110+
__m512d g2h2g0h0g6h6g4h4 = _mm512_shuffle_f64x2(g0h0g2h2g4h4g6h6, g0h0g2h2g4h4g6h6, NDA_SHUFFLE_MASK4(1, 0, 3, 2));
111+
__m512d g3h3g1h1g7h7g5h5 = _mm512_shuffle_f64x2(g1h1g3h3g5h5g7h7, g1h1g3h3g5h5g7h7, NDA_SHUFFLE_MASK4(1, 0, 3, 2));
112+
113+
__m512d a2b2c2d2a6b6c6d6 = _mm512_mask_blend_pd(0b11001100, a2b2a0b0a6b6a4b4, c0d0c2d2c4d4c6d6);
114+
__m512d a3b3c3d3a7b7c7d7 = _mm512_mask_blend_pd(0b11001100, a3b3a1b1a7b7a5b5, c1d1c3d3c5d5c7d7);
115+
__m512d a0b0c0d0a4b4c4d4 = _mm512_mask_blend_pd(0b11001100, a0b0a2b2a4b4a6b6, c2d2c0d0c6d6c4d4);
116+
__m512d a1b1c1d1a5b5c5d5 = _mm512_mask_blend_pd(0b11001100, a1b1a3b3a5b5a7b7, c3d3c1d1c7d7c5d5);
117+
__m512d e2f2g2h2e6f6g6h6 = _mm512_mask_blend_pd(0b11001100, e2f2e0f0e6f6e4f4, g0h0g2h2g4h4g6h6);
118+
__m512d e3f3g3h3e7f7g7h7 = _mm512_mask_blend_pd(0b11001100, e3f3e1f1e7f7e5f5, g1h1g3h3g5h5g7h7);
119+
__m512d e0f0g0h0e4f4g4h4 = _mm512_mask_blend_pd(0b11001100, e0f0e2f2e4f4e6f6, g2h2g0h0g6h6g4h4);
120+
__m512d e1f1g1h1e5f5g5h5 = _mm512_mask_blend_pd(0b11001100, e1f1e3f3e5f5e5f7, g3h3g1h1g7h7g5h5);
121+
122+
__m512d e0f0g0h0e0f0g0h0 = _mm512_shuffle_f64x2(e0f0g0h0e4f4g4h4, e0f0g0h0e4f4g4h4, NDA_SHUFFLE_MASK4(0, 1, 0, 1));
123+
__m512d e1f1g1h1e1f1g1h1 = _mm512_shuffle_f64x2(e1f1g1h1e5f5g5h5, e1f1g1h1e5f5g5h5, NDA_SHUFFLE_MASK4(0, 1, 0, 1));
124+
__m512d e2f2g2h2e2f2g2h2 = _mm512_shuffle_f64x2(e2f2g2h2e6f6g6h6, e2f2g2h2e6f6g6h6, NDA_SHUFFLE_MASK4(0, 1, 0, 1));
125+
__m512d e3f3g3h3e3f3g3h3 = _mm512_shuffle_f64x2(e3f3g3h3e7f7g7h7, e3f3g3h3e7f7g7h7, NDA_SHUFFLE_MASK4(0, 1, 0, 1));
126+
__m512d a4b4c4d4a4b4c4d4 = _mm512_shuffle_f64x2(a0b0c0d0a4b4c4d4, a0b0c0d0a4b4c4d4, NDA_SHUFFLE_MASK4(2, 3, 2, 3));
127+
__m512d a5b5c5d5a5b5c5d5 = _mm512_shuffle_f64x2(a1b1c1d1a5b5c5d5, a1b1c1d1a5b5c5d5, NDA_SHUFFLE_MASK4(2, 3, 2, 3));
128+
__m512d a6b6c6d6a6b6c6d6 = _mm512_shuffle_f64x2(a2b2c2d2a6b6c6d6, a2b2c2d2a6b6c6d6, NDA_SHUFFLE_MASK4(2, 3, 2, 3));
129+
__m512d a7b7c7d7a7b7c7d7 = _mm512_shuffle_f64x2(a3b3c3d3a7b7c7d7, a3b3c3d3a7b7c7d7, NDA_SHUFFLE_MASK4(2, 3, 2, 3));
130+
131+
simd_d8 a0b0c0d0e0f0g0h0(_mm512_mask_blend_pd(0b11110000, a0b0c0d0a4b4c4d4, e0f0g0h0e0f0g0h0));
132+
133+
simd_d8 a1b1c1d1e1f1g1h1(_mm512_mask_blend_pd(0b11110000, a1b1c1d1a5b5c5d5, e1f1g1h1e1f1g1h1));
134+
135+
simd_d8 a2b2c2d2e2f2g2h2(_mm512_mask_blend_pd(0b11110000, a2b2c2d2a6b6c6d6, e2f2g2h2e2f2g2h2));
136+
137+
simd_d8 a3b3c3d3e3f3g3h3(_mm512_mask_blend_pd(0b11110000, a3b3c3d3a7b7c7d7, e3f3g3h3e3f3g3h3));
138+
139+
simd_d8 a4b4c4d4e4f4g4h4(_mm512_mask_blend_pd(0b11110000, a4b4c4d4a4b4c4d4, e0f0g0h0e4f4g4h4));
140+
141+
simd_d8 a5b5c5d5e5f5g5h5(_mm512_mask_blend_pd(0b11110000, a5b5c5d5a5b5c5d5, e1f1g1h1e5f5g5h5));
142+
143+
simd_d8 a6b6c6d6e6f6g6h6(_mm512_mask_blend_pd(0b11110000, a6b6c6d6a6b6c6d6, e2f2g2h2e6f6g6h6));
144+
145+
simd_d8 a7b7c7d7e7f7g7h7(_mm512_mask_blend_pd(0b11110000, a7b7c7d7a7b7c7d7, e3f3g3h3e7f7g7h7));
146+
147+
return {a0b0c0d0e0f0g0h0, a1b1c1d1e1f1g1h1, a2b2c2d2e2f2g2h2, a3b3c3d3e3f3g3h3,
148+
a4b4c4d4e4f4g4h4, a5b5c5d5e5f5g5h5, a6b6c6d6e6f6g6h6, a7b7c7d7e7f7g7h7};
26149
}
27150

28151
template <>

test/c++/nda_simd.cpp

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -665,8 +665,6 @@ void simd_scatter_function() {
665665
simd::scatter(scatter, scattered_array.data(), n);
666666
gather = simd::gather<simd_t>(scattered_array.data(), n);
667667
EXPECT_TRUE(gather == scatter);
668-
// gather.store_unaligned(gather_array.data());
669-
// check_simd_array_equal(scatter, gather_array);
670668
}
671669
}
672670

@@ -1652,7 +1650,7 @@ TEST(NDA, SimdKernelTranspose) {
16521650
#ifdef __AVX512F__
16531651
// AVX512 SIMD types
16541652
// simd_kernel_transpose<float, 16, abi_tag::AVX512>();
1655-
// simd_kernel_transpose<double, 8, abi_tag::AVX512>();
1653+
simd_kernel_transpose<double, 8, abi_tag::AVX512>();
16561654
// simd_kernel_transpose<int32_t, 16, abi_tag::AVX512>();
16571655
// simd_kernel_transpose<int64_t, 8, abi_tag::AVX512>();
16581656
simd_kernel_transpose<std::complex<float>, 8, abi_tag::AVX512>();

0 commit comments

Comments
 (0)