-
Couldn't load subscription status.
- Fork 15k
Open
Description
This code:
#include<stdint.h>
#include<immintrin.h>
/*
 * Reproducer: byte-shuffles a and b with one shared mask, then interleaves
 * the 64-bit halves of the two shuffled vectors and stores both results.
 *
 * a, b - input vectors.
 * dst  - receives two __m256i values (dst[0] and dst[1]) via unaligned stores.
 */
void foo(__m256i a, __m256i b, __m256i* dst) {
// Shared shuffle mask. Within each 128-bit lane, the even-numbered 16-bit
// words (bytes 0,1,4,5,8,9,12,13) are gathered into the low 64 bits and the
// odd-numbered words (bytes 2,3,6,7,10,11,14,15) into the high 64 bits.
// The upper-lane entries are the lower-lane entries plus 16.
__m256i v3 = _mm256_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, 16, 17, 20, 21, 24, 25, 28, 29, 18, 19, 22, 23, 26, 27, 30, 31);
// The same mask is applied to both inputs: only two shuffles are needed.
__m256i v4 = _mm256_shuffle_epi8(a, v3);
__m256i v6 = _mm256_shuffle_epi8(b, v3);
// Per 128-bit lane: low 64 bits of v4 followed by low 64 bits of v6.
__m256i v9 = _mm256_unpacklo_epi64(v4, v6);
// Per 128-bit lane: high 64 bits of v4 followed by high 64 bits of v6.
__m256i v10 = _mm256_unpackhi_epi64(v4, v6);
// Unaligned stores to two consecutive __m256i slots.
_mm256_storeu_si256(dst, v9);
_mm256_storeu_si256(dst+1, v10);
}
via -O3 -march=haswell compiles to:
foo:
vmovdqa ymm2, ymmword ptr [rip + .LCPI0_0] ; 0,1,4,5,z,z,z,z,8,9,12,13,z,z,z,z,16,17,20,21,z,z,z,z,24,25,28,29,z,z,z,z
vpshufb ymm3, ymm1, ymm2
vpshufb ymm2, ymm0, ymm2
vshufps ymm2, ymm2, ymm3, 136
vmovdqa ymm3, ymmword ptr [rip + .LCPI0_1] ; 2,3,6,7,10,11,14,15,2,3,6,7,10,11,14,15,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31
vpshufb ymm1, ymm1, ymm3
vpshufb ymm0, ymm0, ymm3
vpblendd ymm0, ymm0, ymm1, 204
vmovups ymmword ptr [rdi], ymm2
vmovdqu ymmword ptr [rdi + 32], ymm0
vzeroupper
ret
This does not take advantage of the reusable vpshufb's (four are emitted where the source needs only two), as the code shows is possible.