@@ -9,10 +9,71 @@ namespace nda::simd {
99 return simd_block;
1010 }
1111
12-
1312 template <>
1413 inline std::array<simd_l8, 8 > kernel_transpose (const std::array<simd_l8, 8 > &simd_block) {
15- return simd_block;
14+ simd_l8 a0a1a2a3a4a5a6a7 = simd_block[0 ];
15+ simd_l8 b0b1b2b3b4b5b6b7 = simd_block[1 ];
16+ simd_l8 c0c1c2c3c4c5c6c7 = simd_block[2 ];
17+ simd_l8 d0d1d2d3d4d5d6d7 = simd_block[3 ];
18+ simd_l8 e0e1e2e3e4e5e6e7 = simd_block[4 ];
19+ simd_l8 f0f1f2f3f4f5f6f7 = simd_block[5 ];
20+ simd_l8 g0g1g2g3g4g5g6g7 = simd_block[6 ];
21+ simd_l8 h0h1h2h3h4h5h6h7 = simd_block[7 ];
22+
23+ __m512i a0b0a2b2a4b4a6b6 = _mm512_unpacklo_epi64 (a0a1a2a3a4a5a6a7, b0b1b2b3b4b5b6b7);
24+ __m512i a1b1a3b3a5b5a7b7 = _mm512_unpackhi_epi64 (a0a1a2a3a4a5a6a7, b0b1b2b3b4b5b6b7);
25+ __m512i c0d0c2d2c4d4c6d6 = _mm512_unpacklo_epi64 (c0c1c2c3c4c5c6c7, d0d1d2d3d4d5d6d7);
26+ __m512i c1d1c3d3c5d5c7d7 = _mm512_unpackhi_epi64 (c0c1c2c3c4c5c6c7, d0d1d2d3d4d5d6d7);
27+ __m512i e0f0e2f2e4f4e6f6 = _mm512_unpacklo_epi64 (e0e1e2e3e4e5e6e7, f0f1f2f3f4f5f6f7);
28+ __m512i e1f1e3f3e5f5e5f7 = _mm512_unpackhi_epi64 (e0e1e2e3e4e5e6e7, f0f1f2f3f4f5f6f7);
29+ __m512i g0h0g2h2g4h4g6h6 = _mm512_unpacklo_epi64 (g0g1g2g3g4g5g6g7, h0h1h2h3h4h5h6h7);
30+ __m512i g1h1g3h3g5h5g7h7 = _mm512_unpackhi_epi64 (g0g1g2g3g4g5g6g7, h0h1h2h3h4h5h6h7);
31+
32+ __m512i a2b2a0b0a6b6a4b4 = _mm512_shuffle_i64x2 (a0b0a2b2a4b4a6b6, a0b0a2b2a4b4a6b6, NDA_SHUFFLE_MASK4 (1 , 0 , 3 , 2 ));
33+ __m512i a3b3a1b1a7b7a5b5 = _mm512_shuffle_i64x2 (a1b1a3b3a5b5a7b7, a1b1a3b3a5b5a7b7, NDA_SHUFFLE_MASK4 (1 , 0 , 3 , 2 ));
34+ __m512i c2d2c0d0c6d6c4d4 = _mm512_shuffle_i64x2 (c0d0c2d2c4d4c6d6, c0d0c2d2c4d4c6d6, NDA_SHUFFLE_MASK4 (1 , 0 , 3 , 2 ));
35+ __m512i c3d3c1d1c7d7c5d5 = _mm512_shuffle_i64x2 (c1d1c3d3c5d5c7d7, c1d1c3d3c5d5c7d7, NDA_SHUFFLE_MASK4 (1 , 0 , 3 , 2 ));
36+ __m512i e2f2e0f0e6f6e4f4 = _mm512_shuffle_i64x2 (e0f0e2f2e4f4e6f6, e0f0e2f2e4f4e6f6, NDA_SHUFFLE_MASK4 (1 , 0 , 3 , 2 ));
37+ __m512i e3f3e1f1e7f7e5f5 = _mm512_shuffle_i64x2 (e1f1e3f3e5f5e5f7, e1f1e3f3e5f5e5f7, NDA_SHUFFLE_MASK4 (1 , 0 , 3 , 2 ));
38+ __m512i g2h2g0h0g6h6g4h4 = _mm512_shuffle_i64x2 (g0h0g2h2g4h4g6h6, g0h0g2h2g4h4g6h6, NDA_SHUFFLE_MASK4 (1 , 0 , 3 , 2 ));
39+ __m512i g3h3g1h1g7h7g5h5 = _mm512_shuffle_i64x2 (g1h1g3h3g5h5g7h7, g1h1g3h3g5h5g7h7, NDA_SHUFFLE_MASK4 (1 , 0 , 3 , 2 ));
40+
41+ __m512i a2b2c2d2a6b6c6d6 = _mm512_mask_blend_epi64 (0b11001100 , a2b2a0b0a6b6a4b4, c0d0c2d2c4d4c6d6);
42+ __m512i a3b3c3d3a7b7c7d7 = _mm512_mask_blend_epi64 (0b11001100 , a3b3a1b1a7b7a5b5, c1d1c3d3c5d5c7d7);
43+ __m512i a0b0c0d0a4b4c4d4 = _mm512_mask_blend_epi64 (0b11001100 , a0b0a2b2a4b4a6b6, c2d2c0d0c6d6c4d4);
44+ __m512i a1b1c1d1a5b5c5d5 = _mm512_mask_blend_epi64 (0b11001100 , a1b1a3b3a5b5a7b7, c3d3c1d1c7d7c5d5);
45+ __m512i e2f2g2h2e6f6g6h6 = _mm512_mask_blend_epi64 (0b11001100 , e2f2e0f0e6f6e4f4, g0h0g2h2g4h4g6h6);
46+ __m512i e3f3g3h3e7f7g7h7 = _mm512_mask_blend_epi64 (0b11001100 , e3f3e1f1e7f7e5f5, g1h1g3h3g5h5g7h7);
47+ __m512i e0f0g0h0e4f4g4h4 = _mm512_mask_blend_epi64 (0b11001100 , e0f0e2f2e4f4e6f6, g2h2g0h0g6h6g4h4);
48+ __m512i e1f1g1h1e5f5g5h5 = _mm512_mask_blend_epi64 (0b11001100 , e1f1e3f3e5f5e5f7, g3h3g1h1g7h7g5h5);
49+
50+ __m512i e0f0g0h0e0f0g0h0 = _mm512_shuffle_i64x2 (e0f0g0h0e4f4g4h4, e0f0g0h0e4f4g4h4, NDA_SHUFFLE_MASK4 (0 , 1 , 0 , 1 ));
51+ __m512i e1f1g1h1e1f1g1h1 = _mm512_shuffle_i64x2 (e1f1g1h1e5f5g5h5, e1f1g1h1e5f5g5h5, NDA_SHUFFLE_MASK4 (0 , 1 , 0 , 1 ));
52+ __m512i e2f2g2h2e2f2g2h2 = _mm512_shuffle_i64x2 (e2f2g2h2e6f6g6h6, e2f2g2h2e6f6g6h6, NDA_SHUFFLE_MASK4 (0 , 1 , 0 , 1 ));
53+ __m512i e3f3g3h3e3f3g3h3 = _mm512_shuffle_i64x2 (e3f3g3h3e7f7g7h7, e3f3g3h3e7f7g7h7, NDA_SHUFFLE_MASK4 (0 , 1 , 0 , 1 ));
54+ __m512i a4b4c4d4a4b4c4d4 = _mm512_shuffle_i64x2 (a0b0c0d0a4b4c4d4, a0b0c0d0a4b4c4d4, NDA_SHUFFLE_MASK4 (2 , 3 , 2 , 3 ));
55+ __m512i a5b5c5d5a5b5c5d5 = _mm512_shuffle_i64x2 (a1b1c1d1a5b5c5d5, a1b1c1d1a5b5c5d5, NDA_SHUFFLE_MASK4 (2 , 3 , 2 , 3 ));
56+ __m512i a6b6c6d6a6b6c6d6 = _mm512_shuffle_i64x2 (a2b2c2d2a6b6c6d6, a2b2c2d2a6b6c6d6, NDA_SHUFFLE_MASK4 (2 , 3 , 2 , 3 ));
57+ __m512i a7b7c7d7a7b7c7d7 = _mm512_shuffle_i64x2 (a3b3c3d3a7b7c7d7, a3b3c3d3a7b7c7d7, NDA_SHUFFLE_MASK4 (2 , 3 , 2 , 3 ));
58+
59+ simd_l8 a0b0c0d0e0f0g0h0 (_mm512_mask_blend_epi64 (0b11110000 , a0b0c0d0a4b4c4d4, e0f0g0h0e0f0g0h0));
60+
61+ simd_l8 a1b1c1d1e1f1g1h1 (_mm512_mask_blend_epi64 (0b11110000 , a1b1c1d1a5b5c5d5, e1f1g1h1e1f1g1h1));
62+
63+ simd_l8 a2b2c2d2e2f2g2h2 (_mm512_mask_blend_epi64 (0b11110000 , a2b2c2d2a6b6c6d6, e2f2g2h2e2f2g2h2));
64+
65+ simd_l8 a3b3c3d3e3f3g3h3 (_mm512_mask_blend_epi64 (0b11110000 , a3b3c3d3a7b7c7d7, e3f3g3h3e3f3g3h3));
66+
67+ simd_l8 a4b4c4d4e4f4g4h4 (_mm512_mask_blend_epi64 (0b11110000 , a4b4c4d4a4b4c4d4, e0f0g0h0e4f4g4h4));
68+
69+ simd_l8 a5b5c5d5e5f5g5h5 (_mm512_mask_blend_epi64 (0b11110000 , a5b5c5d5a5b5c5d5, e1f1g1h1e5f5g5h5));
70+
71+ simd_l8 a6b6c6d6e6f6g6h6 (_mm512_mask_blend_epi64 (0b11110000 , a6b6c6d6a6b6c6d6, e2f2g2h2e6f6g6h6));
72+
73+ simd_l8 a7b7c7d7e7f7g7h7 (_mm512_mask_blend_epi64 (0b11110000 , a7b7c7d7a7b7c7d7, e3f3g3h3e7f7g7h7));
74+
75+ return {a0b0c0d0e0f0g0h0, a1b1c1d1e1f1g1h1, a2b2c2d2e2f2g2h2, a3b3c3d3e3f3g3h3,
76+ a4b4c4d4e4f4g4h4, a5b5c5d5e5f5g5h5, a6b6c6d6e6f6g6h6, a7b7c7d7e7f7g7h7};
1677 }
1778
1879 template <>
@@ -22,7 +83,69 @@ namespace nda::simd {
2283
2384 template <>
2485 inline std::array<simd_d8, 8 > kernel_transpose (const std::array<simd_d8, 8 > &simd_block) {
25- return simd_block;
86+ simd_d8 a0a1a2a3a4a5a6a7 = simd_block[0 ];
87+ simd_d8 b0b1b2b3b4b5b6b7 = simd_block[1 ];
88+ simd_d8 c0c1c2c3c4c5c6c7 = simd_block[2 ];
89+ simd_d8 d0d1d2d3d4d5d6d7 = simd_block[3 ];
90+ simd_d8 e0e1e2e3e4e5e6e7 = simd_block[4 ];
91+ simd_d8 f0f1f2f3f4f5f6f7 = simd_block[5 ];
92+ simd_d8 g0g1g2g3g4g5g6g7 = simd_block[6 ];
93+ simd_d8 h0h1h2h3h4h5h6h7 = simd_block[7 ];
94+
95+ __m512d a0b0a2b2a4b4a6b6 = _mm512_unpacklo_pd (a0a1a2a3a4a5a6a7, b0b1b2b3b4b5b6b7);
96+ __m512d a1b1a3b3a5b5a7b7 = _mm512_unpackhi_pd (a0a1a2a3a4a5a6a7, b0b1b2b3b4b5b6b7);
97+ __m512d c0d0c2d2c4d4c6d6 = _mm512_unpacklo_pd (c0c1c2c3c4c5c6c7, d0d1d2d3d4d5d6d7);
98+ __m512d c1d1c3d3c5d5c7d7 = _mm512_unpackhi_pd (c0c1c2c3c4c5c6c7, d0d1d2d3d4d5d6d7);
99+ __m512d e0f0e2f2e4f4e6f6 = _mm512_unpacklo_pd (e0e1e2e3e4e5e6e7, f0f1f2f3f4f5f6f7);
100+ __m512d e1f1e3f3e5f5e5f7 = _mm512_unpackhi_pd (e0e1e2e3e4e5e6e7, f0f1f2f3f4f5f6f7);
101+ __m512d g0h0g2h2g4h4g6h6 = _mm512_unpacklo_pd (g0g1g2g3g4g5g6g7, h0h1h2h3h4h5h6h7);
102+ __m512d g1h1g3h3g5h5g7h7 = _mm512_unpackhi_pd (g0g1g2g3g4g5g6g7, h0h1h2h3h4h5h6h7);
103+
104+ __m512d a2b2a0b0a6b6a4b4 = _mm512_shuffle_f64x2 (a0b0a2b2a4b4a6b6, a0b0a2b2a4b4a6b6, NDA_SHUFFLE_MASK4 (1 , 0 , 3 , 2 ));
105+ __m512d a3b3a1b1a7b7a5b5 = _mm512_shuffle_f64x2 (a1b1a3b3a5b5a7b7, a1b1a3b3a5b5a7b7, NDA_SHUFFLE_MASK4 (1 , 0 , 3 , 2 ));
106+ __m512d c2d2c0d0c6d6c4d4 = _mm512_shuffle_f64x2 (c0d0c2d2c4d4c6d6, c0d0c2d2c4d4c6d6, NDA_SHUFFLE_MASK4 (1 , 0 , 3 , 2 ));
107+ __m512d c3d3c1d1c7d7c5d5 = _mm512_shuffle_f64x2 (c1d1c3d3c5d5c7d7, c1d1c3d3c5d5c7d7, NDA_SHUFFLE_MASK4 (1 , 0 , 3 , 2 ));
108+ __m512d e2f2e0f0e6f6e4f4 = _mm512_shuffle_f64x2 (e0f0e2f2e4f4e6f6, e0f0e2f2e4f4e6f6, NDA_SHUFFLE_MASK4 (1 , 0 , 3 , 2 ));
109+ __m512d e3f3e1f1e7f7e5f5 = _mm512_shuffle_f64x2 (e1f1e3f3e5f5e5f7, e1f1e3f3e5f5e5f7, NDA_SHUFFLE_MASK4 (1 , 0 , 3 , 2 ));
110+ __m512d g2h2g0h0g6h6g4h4 = _mm512_shuffle_f64x2 (g0h0g2h2g4h4g6h6, g0h0g2h2g4h4g6h6, NDA_SHUFFLE_MASK4 (1 , 0 , 3 , 2 ));
111+ __m512d g3h3g1h1g7h7g5h5 = _mm512_shuffle_f64x2 (g1h1g3h3g5h5g7h7, g1h1g3h3g5h5g7h7, NDA_SHUFFLE_MASK4 (1 , 0 , 3 , 2 ));
112+
113+ __m512d a2b2c2d2a6b6c6d6 = _mm512_mask_blend_pd (0b11001100 , a2b2a0b0a6b6a4b4, c0d0c2d2c4d4c6d6);
114+ __m512d a3b3c3d3a7b7c7d7 = _mm512_mask_blend_pd (0b11001100 , a3b3a1b1a7b7a5b5, c1d1c3d3c5d5c7d7);
115+ __m512d a0b0c0d0a4b4c4d4 = _mm512_mask_blend_pd (0b11001100 , a0b0a2b2a4b4a6b6, c2d2c0d0c6d6c4d4);
116+ __m512d a1b1c1d1a5b5c5d5 = _mm512_mask_blend_pd (0b11001100 , a1b1a3b3a5b5a7b7, c3d3c1d1c7d7c5d5);
117+ __m512d e2f2g2h2e6f6g6h6 = _mm512_mask_blend_pd (0b11001100 , e2f2e0f0e6f6e4f4, g0h0g2h2g4h4g6h6);
118+ __m512d e3f3g3h3e7f7g7h7 = _mm512_mask_blend_pd (0b11001100 , e3f3e1f1e7f7e5f5, g1h1g3h3g5h5g7h7);
119+ __m512d e0f0g0h0e4f4g4h4 = _mm512_mask_blend_pd (0b11001100 , e0f0e2f2e4f4e6f6, g2h2g0h0g6h6g4h4);
120+ __m512d e1f1g1h1e5f5g5h5 = _mm512_mask_blend_pd (0b11001100 , e1f1e3f3e5f5e5f7, g3h3g1h1g7h7g5h5);
121+
122+ __m512d e0f0g0h0e0f0g0h0 = _mm512_shuffle_f64x2 (e0f0g0h0e4f4g4h4, e0f0g0h0e4f4g4h4, NDA_SHUFFLE_MASK4 (0 , 1 , 0 , 1 ));
123+ __m512d e1f1g1h1e1f1g1h1 = _mm512_shuffle_f64x2 (e1f1g1h1e5f5g5h5, e1f1g1h1e5f5g5h5, NDA_SHUFFLE_MASK4 (0 , 1 , 0 , 1 ));
124+ __m512d e2f2g2h2e2f2g2h2 = _mm512_shuffle_f64x2 (e2f2g2h2e6f6g6h6, e2f2g2h2e6f6g6h6, NDA_SHUFFLE_MASK4 (0 , 1 , 0 , 1 ));
125+ __m512d e3f3g3h3e3f3g3h3 = _mm512_shuffle_f64x2 (e3f3g3h3e7f7g7h7, e3f3g3h3e7f7g7h7, NDA_SHUFFLE_MASK4 (0 , 1 , 0 , 1 ));
126+ __m512d a4b4c4d4a4b4c4d4 = _mm512_shuffle_f64x2 (a0b0c0d0a4b4c4d4, a0b0c0d0a4b4c4d4, NDA_SHUFFLE_MASK4 (2 , 3 , 2 , 3 ));
127+ __m512d a5b5c5d5a5b5c5d5 = _mm512_shuffle_f64x2 (a1b1c1d1a5b5c5d5, a1b1c1d1a5b5c5d5, NDA_SHUFFLE_MASK4 (2 , 3 , 2 , 3 ));
128+ __m512d a6b6c6d6a6b6c6d6 = _mm512_shuffle_f64x2 (a2b2c2d2a6b6c6d6, a2b2c2d2a6b6c6d6, NDA_SHUFFLE_MASK4 (2 , 3 , 2 , 3 ));
129+ __m512d a7b7c7d7a7b7c7d7 = _mm512_shuffle_f64x2 (a3b3c3d3a7b7c7d7, a3b3c3d3a7b7c7d7, NDA_SHUFFLE_MASK4 (2 , 3 , 2 , 3 ));
130+
131+ simd_d8 a0b0c0d0e0f0g0h0 (_mm512_mask_blend_pd (0b11110000 , a0b0c0d0a4b4c4d4, e0f0g0h0e0f0g0h0));
132+
133+ simd_d8 a1b1c1d1e1f1g1h1 (_mm512_mask_blend_pd (0b11110000 , a1b1c1d1a5b5c5d5, e1f1g1h1e1f1g1h1));
134+
135+ simd_d8 a2b2c2d2e2f2g2h2 (_mm512_mask_blend_pd (0b11110000 , a2b2c2d2a6b6c6d6, e2f2g2h2e2f2g2h2));
136+
137+ simd_d8 a3b3c3d3e3f3g3h3 (_mm512_mask_blend_pd (0b11110000 , a3b3c3d3a7b7c7d7, e3f3g3h3e3f3g3h3));
138+
139+ simd_d8 a4b4c4d4e4f4g4h4 (_mm512_mask_blend_pd (0b11110000 , a4b4c4d4a4b4c4d4, e0f0g0h0e4f4g4h4));
140+
141+ simd_d8 a5b5c5d5e5f5g5h5 (_mm512_mask_blend_pd (0b11110000 , a5b5c5d5a5b5c5d5, e1f1g1h1e5f5g5h5));
142+
143+ simd_d8 a6b6c6d6e6f6g6h6 (_mm512_mask_blend_pd (0b11110000 , a6b6c6d6a6b6c6d6, e2f2g2h2e6f6g6h6));
144+
145+ simd_d8 a7b7c7d7e7f7g7h7 (_mm512_mask_blend_pd (0b11110000 , a7b7c7d7a7b7c7d7, e3f3g3h3e7f7g7h7));
146+
147+ return {a0b0c0d0e0f0g0h0, a1b1c1d1e1f1g1h1, a2b2c2d2e2f2g2h2, a3b3c3d3e3f3g3h3,
148+ a4b4c4d4e4f4g4h4, a5b5c5d5e5f5g5h5, a6b6c6d6e6f6g6h6, a7b7c7d7e7f7g7h7};
26149 }
27150
28151 template <>
0 commit comments