@@ -129,68 +129,33 @@ dnl AC_CHECK_LIB([bsc], [bsc_compress], [
129129dnl LIBS="-lbsc $LIBS"
130130dnl AC_DEFINE([HAVE_LIBBSC],1,[Define to 1 if you have the libbsc library.])])
131131
132- dnl Count parts needed to build rANS_static32x16pr_sse4.c
133- sse4_prerequisites=""
134-
135- dnl Check if we can use our SSSE3 implementations of rANS 32x16 codec.
136- HTS_CHECK_COMPILE_FLAGS_NEEDED([ ssse3] , [ -mssse3] , [ AC_LANG_PROGRAM ( [ [
137- #ifdef __x86_64__
138- #include "x86intrin.h"
139- #endif
140- ] ] ,[ [
141- #ifdef __x86_64__
142- __m128i a = _mm_set_epi32(1, 2, 3, 4), b = _mm_set_epi32(4, 3, 2, 1);
143- __m128i c = _mm_shuffle_epi8(a, b);
144- return *((char *) &c);
145- #endif
146- ] ] ) ] , [
147- MSSSE3="$flags_needed"
148- sse4_prerequisites="o$sse4_prerequisites"
149- AC_SUBST ( [ MSSSE3] )
150- AC_DEFINE ( [ HAVE_SSSE3] ,1 ,[ Defined to 1 if rANS source using SSSE3 can be compiled.] )
151- ] )
152-
153- dnl Check if we can use popcnt instructions
154- HTS_CHECK_COMPILE_FLAGS_NEEDED([ popcnt] , [ -mpopcnt] , [ AC_LANG_PROGRAM ( [ [
155- #ifdef __x86_64__
156- #include "x86intrin.h"
157- #endif
158- ] ] ,[ [
159- #ifdef __x86_64__
160- unsigned int i = _mm_popcnt_u32(1);
161- return i != 1;
162- #endif
163- ] ] ) ] , [
164- MPOPCNT="$flags_needed"
165- sse4_prerequisites="o$sse4_prerequisites"
166- AC_SUBST ( [ MPOPCNT] )
167- AC_DEFINE ( [ HAVE_POPCNT] ,1 ,[ Defined to 1 if rANS source using popcnt can be compiled.] )
168- ] )
169-
170- dnl Check if we can use our SSE4.1 too. This *may* always imply SSSE3?
171- dnl It may be easier just to target an old era of cpu than -mssse3 -msse4.1
172- dnl -mpopcnt. Eg -march=nehalem. I don't know how wide spread that is.
173- HTS_CHECK_COMPILE_FLAGS_NEEDED([ sse4.1] , [ -msse4.1] , [ AC_LANG_PROGRAM ( [ [
132+ dnl Check if we can build our SSE4 codec: it uses SSE4.1, SSSE3 (shuffle) and
133+ dnl POPCNT, so all three flags are probed together with one test program.
134+ dnl This helps Zig builds, which don't work well if we test each individually.
135+ dnl NOTE(review): no build_rans_sse4=no default is visible here (cf. avx2/avx512).
136+ HTS_CHECK_COMPILE_FLAGS_NEEDED([ sse4.1] , [ -msse4.1 -mssse3 -mpopcnt] , [ AC_LANG_PROGRAM ( [ [
174137 #ifdef __x86_64__
175138 #include "x86intrin.h"
176139 #endif
177140 ] ] ,[ [
178141 #ifdef __x86_64__
179142 __m128i a = _mm_set_epi32(1, 2, 3, 4), b = _mm_set_epi32(4, 3, 2, 1);
180- __m128i c = _mm_max_epu32(a, b);
181- return *((char *) &c);
143+ __m128i c = _mm_shuffle_epi8( _mm_max_epu32(a, b) , b);
144+ return _mm_popcnt_u32( *((char *) &c) );
182145 #endif
183146 ] ] ) ] , [
184147 MSSE4_1="$flags_needed"
185- sse4_prerequisites="o$sse4_prerequisites"
148+ build_rans_sse4=yes
186149 AC_SUBST ( [ MSSE4_1] )
187150 AC_DEFINE ( [ HAVE_SSE4_1] ,1 ,[ Defined to 1 if rANS source using SSE4.1 can be compiled.] )
151+ AC_DEFINE ( [ HAVE_SSSE3] ,1 ,[ Defined to 1 if rANS source using SSSE3 can be compiled.] )
152+ AC_DEFINE ( [ HAVE_POPCNT] ,1 ,[ Defined to 1 if rANS source using popcnt can be compiled.] )
188153] )
189- AM_CONDITIONAL([ RANS_32x16_SSE4] ,[ test "x$sse4_prerequisites " = "xooo" ] )
154+ AM_CONDITIONAL([ RANS_32x16_SSE4] ,[ test "$build_rans_sse4" = yes] )
190155
191156dnl Check if we can use our AVX2 implementations.
192157build_rans_avx2=no
193- HTS_CHECK_COMPILE_FLAGS_NEEDED([ avx2] , [ -mavx2] , [ AC_LANG_PROGRAM ( [ [
158+ HTS_CHECK_COMPILE_FLAGS_NEEDED([ avx2] , [ -mavx2 -mpopcnt ] , [ AC_LANG_PROGRAM ( [ [
194159 #ifdef __x86_64__
195160 #include "x86intrin.h"
196161 #endif
@@ -199,34 +164,36 @@ HTS_CHECK_COMPILE_FLAGS_NEEDED([avx2], [-mavx2], [AC_LANG_PROGRAM([[
199164 __m256i a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
200165 __m256i b = _mm256_add_epi32(a, a);
201166 long long c = _mm256_extract_epi64(b, 0);
202- return ( int) c ;
167+ return _mm_popcnt_u32(( int)c) ;
203168 #endif
204169 ] ] ) ] , [
205170 MAVX2="$flags_needed"
206171 build_rans_avx2=yes
207172 AC_SUBST ( [ MAVX2] )
208173 AC_DEFINE ( [ HAVE_AVX2] ,1 ,[ Defined to 1 if rANS source using AVX2 can be compiled.] )
174+ AC_DEFINE ( [ HAVE_POPCNT] ,1 ,[ Defined to 1 if rANS source using popcnt can be compiled.] )
209175] )
210176AM_CONDITIONAL([ RANS_32x16_AVX2] ,[ test "$build_rans_avx2" = yes] )
211177
212- dnl Check also if we have AVX512. If so this overrides AVX2
178+ dnl Check also if we have AVX512F; HAVE_POPCNT is defined only if this probe succeeds.
213179build_rans_avx512=no
214- HTS_CHECK_COMPILE_FLAGS_NEEDED([ avx512f] , [ -mavx512f] , [ AC_LANG_PROGRAM ( [ [
180+ HTS_CHECK_COMPILE_FLAGS_NEEDED([ avx512f] , [ -mavx512f -mpopcnt ] , [ AC_LANG_PROGRAM ( [ [
215181 #ifdef __x86_64__
216182 #include "x86intrin.h"
217183 #endif
218184 ] ] ,[ [
219185 #ifdef __x86_64__
220186 __m512i a = _mm512_set1_epi32(1);
221187 __m512i b = _mm512_add_epi32(a, a);
222- return *((char *) &b);
188+ return _mm_popcnt_u32( *((char *) &b) );
223189 #endif
224190 ] ] ) ] , [
225191 MAVX512="$flags_needed"
226192 build_rans_avx512=yes
227193 AC_SUBST ( [ MAVX512] )
228194 AC_DEFINE ( [ HAVE_AVX512] ,1 ,[ Defined to 1 if rANS source using AVX512F can be compiled.] )
195+ AC_DEFINE ( [ HAVE_POPCNT] ,1 ,[ Defined to 1 if rANS source using popcnt can be compiled.] )
229196] )
230197AM_CONDITIONAL([ RANS_32x16_AVX512] ,[ test "$build_rans_avx512" = yes] )
231198
232199AC_SUBST ( [ HTSCODECS_SIMD_SRC] )
0 commit comments