@@ -1751,6 +1751,23 @@ unittest
17511751 assert (R.array == correct);
17521752}
17531753
/// Move the upper 2 32-bit integer elements from `b` to the lower 2 elements of result, and
/// copy the upper 2 elements from `a` to the upper 2 elements of result.
/// Returns: `[b[2], b[3], a[2], a[3]]`.
__m128i _mm_movehl_epi32 (__m128i a, __m128i b) pure @trusted
{
    // `a` is received by value, so it doubles as the result: only its two
    // lower lanes are overwritten, the upper lanes are left as passed in.
    foreach (lane; 0 .. 2)
        a.ptr[lane] = b.array[lane + 2];
    return a;
}
unittest
{
    // Lower half of the result comes from the upper half of the second
    // operand; upper half is kept from the first operand.
    __m128i first  = _mm_setr_epi32(1, 2, 3, 4);
    __m128i second = _mm_setr_epi32(5, 6, 7, 8);
    int[4] expected = [7, 8, 3, 4];
    __m128i got = _mm_movehl_epi32(first, second);
    assert(got.array == expected);
}
1770+
17541771// / Move the lower 2 single-precision (32-bit) floating-point elements from `b` to the upper 2 elements of result, and
17551772// / copy the lower 2 elements from `a` to the lower 2 elements of result
17561773__m128 _mm_movelh_ps (__m128 a, __m128 b) pure @trusted
@@ -1776,6 +1793,23 @@ unittest
17761793 assert (R.array == correct);
17771794}
17781795
/// Move the lower 2 32-bit integer elements from `b` to the upper 2 elements of result, and
/// copy the lower 2 elements from `a` to the lower 2 elements of result.
/// Returns: `[a[0], a[1], b[0], b[1]]`.
__m128i _mm_movelh_epi32 (__m128i a, __m128i b) pure @trusted // #BONUS
{
    // `a` is received by value, so it doubles as the result: only its two
    // upper lanes are overwritten, the lower lanes are left as passed in.
    foreach (lane; 0 .. 2)
        a.ptr[lane + 2] = b.array[lane];
    return a;
}
unittest
{
    // Lower half of the result is kept from the first operand; upper half
    // comes from the lower half of the second operand.
    __m128i first  = _mm_setr_epi32(1, 2, 3, 4);
    __m128i second = _mm_setr_epi32(5, 6, 7, 8);
    int[4] expected = [1, 2, 5, 6];
    __m128i got = _mm_movelh_epi32(first, second);
    assert(got.array == expected);
}
1812+
17791813// / Create mask from the most significant bit of each 8-bit element in `a`.
17801814int _mm_movemask_pi8 (__m64 a) pure @safe
17811815{
@@ -2972,6 +3006,37 @@ unittest
29723006 assert (l3.array == r3);
29733007}
29743008
/// Transpose, in place, the 4x4 matrix of 32-bit integers whose rows are `row0`, `row1`,
/// `row2` and `row3`. On return `row0` holds the former column 0, `row1` the former
/// column 1, and so on.
void _MM_TRANSPOSE4_EPI32 (ref __m128i row0, ref __m128i row1, ref __m128i row2, ref __m128i row3) pure @safe // #BONUS
{
    // Stage 1: interleave rows pairwise. All four inputs are read here,
    // before any of the `ref` parameters is overwritten below.
    __m128i lowTop     = _mm_unpacklo_epi32(row0, row1);
    __m128i lowBottom  = _mm_unpacklo_epi32(row2, row3);
    __m128i highTop    = _mm_unpackhi_epi32(row0, row1);
    __m128i highBottom = _mm_unpackhi_epi32(row2, row3);

    // Stage 2: stitch 64-bit halves of the interleaved vectors together so
    // that each output gathers one column of the original matrix.
    row0 = _mm_movelh_epi32(lowTop, lowBottom);
    row1 = _mm_movehl_epi32(lowBottom, lowTop);
    row2 = _mm_movelh_epi32(highTop, highBottom);
    row3 = _mm_movehl_epi32(highBottom, highTop);
}
unittest
{
    // Input matrix, one vector per row. The second row is built with
    // `_mm_set_epi32`, whose arguments are given in the opposite order
    // from `_mm_setr_epi32`.
    __m128i rowA = _mm_setr_epi32(0, 1, 2, 3);
    __m128i rowB = _mm_set_epi32(7, 6, 5, 4);
    __m128i rowC = _mm_setr_epi32(8, 9, 10, 11);
    __m128i rowD = _mm_setr_epi32(12, 13, 14, 15);

    _MM_TRANSPOSE4_EPI32(rowA, rowB, rowC, rowD);

    // After the call each vector must hold one column of the input.
    int[4] column0 = [0, 4, 8, 12];
    int[4] column1 = [1, 5, 9, 13];
    int[4] column2 = [2, 6, 10, 14];
    int[4] column3 = [3, 7, 11, 15];
    assert(rowA.array == column0);
    assert(rowB.array == column1);
    assert(rowC.array == column2);
    assert(rowD.array == column3);
}
3039+
29753040// Note: the only difference between these intrinsics is the signalling
29763041// behaviour of quiet NaNs. This is incorrect but the case where
29773042// you would want to differentiate between qNaN and sNaN and then
0 commit comments