Skip to content

[X86] Suboptimal reusable vpshufb #161980

@dzaima

Description

@dzaima

This code:

#include<stdint.h>
#include<immintrin.h>

void foo(__m256i a, __m256i b, __m256i* dst) {
  /* Shuffle mask, identical for both inputs: within each 128-bit lane it
   * gathers the even-indexed 16-bit words (bytes 0,1 4,5 8,9 12,13) into the
   * low 64 bits and the odd-indexed words (bytes 2,3 6,7 10,11 14,15) into
   * the high 64 bits. The issue is that the compiler splits this single
   * reusable mask into two separate masks/shuffles. */
  __m256i v3 = _mm256_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, 16, 17, 20, 21, 24, 25, 28, 29, 18, 19, 22, 23, 26, 27, 30, 31);
  
  /* One vpshufb per input, both using the same mask v3. */
  __m256i v4 = _mm256_shuffle_epi8(a, v3);
  __m256i v6 = _mm256_shuffle_epi8(b, v3);
  
  /* Merge the pre-shuffled halves: v9 = even words of a/b (low qwords),
   * v10 = odd words of a/b (high qwords), interleaved per lane. */
  __m256i v9 = _mm256_unpacklo_epi64(v4, v6);
  __m256i v10 = _mm256_unpackhi_epi64(v4, v6);
  _mm256_storeu_si256(dst, v9);
  _mm256_storeu_si256(dst+1, v10);
}

via -O3 -march=haswell compiles to:

foo:
        vmovdqa ymm2, ymmword ptr [rip + .LCPI0_0] ; 0,1,4,5,z,z,z,z,8,9,12,13,z,z,z,z,16,17,20,21,z,z,z,z,24,25,28,29,z,z,z,z
        vpshufb ymm3, ymm1, ymm2
        vpshufb ymm2, ymm0, ymm2
        vshufps ymm2, ymm2, ymm3, 136
        vmovdqa ymm3, ymmword ptr [rip + .LCPI0_1] ; 2,3,6,7,10,11,14,15,2,3,6,7,10,11,14,15,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31
        vpshufb ymm1, ymm1, ymm3
        vpshufb ymm0, ymm0, ymm3
        vpblendd        ymm0, ymm0, ymm1, 204
        vmovups ymmword ptr [rdi], ymm2
        vmovdqu ymmword ptr [rdi + 32], ymm0
        vzeroupper
        ret

This does not take advantage of the reusable vpshufb results: as the intrinsic source shows, a single shared shuffle mask (two vpshufb's total) followed by the qword unpacks suffices, yet the compiler materializes two different masks and performs four vpshufb's.

https://c.godbolt.org/z/79Td33z9c

Metadata

Metadata

Assignees

No one assigned

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions