The code:
#include <immintrin.h>
#include <stdint.h>
/**
 * Count, per byte lane, how many of the input vectors hold a negative
 * (signed 8-bit) value in that lane.
 *
 * @param src  Pointer to the first of n 256-bit vectors; read with unaligned
 *             loads, so no 32-byte alignment is required.
 * @param n    Number of 256-bit vectors to read from src.
 * @return     A vector of 32 independent 8-bit counters, one per byte lane.
 *             The counters use wrapping 8-bit arithmetic, so each count is
 *             only exact modulo 256.
 */
__m256i count_negative_values(__m256i* src, size_t n) {
// Start with every per-lane counter at zero.
__m256i acc = _mm256_setzero_si256();
for (size_t i = 0; i < n; i++) {
__m256i c = _mm256_loadu_si256(src + i);
// 0 > c as signed bytes: lanes with a negative value become 0xFF (-1), others 0.
__m256i m = _mm256_cmpgt_epi8(_mm256_setzero_si256(), c);
// Subtracting the -1 mask adds 1 to the counter of every negative lane.
acc = _mm256_sub_epi8(acc, m);
}
return acc;
}
When compiled with -O3 -march=haswell, this generates a vpsrlw …,7 + vpand + vpaddb sequence to add to the accumulator, whereas it could produce just vpcmpgtb + vpsubb. (If desired, in the unrolled case, it could avoid a long vpsubb chain when adding e.g. a, b, c, d to the accumulator by doing acc -= (a+b)+(c+d).)
https://godbolt.org/z/Wo7qE8MjG