@@ -30,18 +30,24 @@ constexpr bool hasFlag(AlignmentFlags allFlags, AlignmentFlags flag) {
3030}
3131
// Reports whether `ptr` sits on a 32-byte boundary, i.e. whether the low five
// bits of its address are all zero.
//
// The pointer is only inspected, never dereferenced, so it is taken as
// `const void*`; callers that pass a mutable `void*` keep working unchanged.
__attribute__((target("avx2")))
inline bool isAligned32(const void* ptr)
{
    return (reinterpret_cast<std::uintptr_t>(ptr) & 31) == 0;
}
37+
// Loads 32 bytes starting at `ptr` into a 256-bit integer vector using the
// aligned-load intrinsic.
//
// Precondition: `ptr` must be 32-byte aligned. The function promises this to
// the compiler via `__builtin_assume_aligned(ptr, 32)` and does not check it.
__attribute__((target("avx2")))
inline __m256i loadAligned(const void* ptr)
{
    const auto* alignedPtr =
        static_cast<const __m256i*>(__builtin_assume_aligned(ptr, 32));
    return _mm256_load_si256(alignedPtr);
}
44+
// Loads 32 bytes starting at `ptr` into a 256-bit integer vector using the
// unaligned-load intrinsic.
__attribute__((target("avx2")))
inline __m256i loadUnaligned(const void* ptr)
{
    return _mm256_loadu_si256(static_cast<const __m256i*>(ptr));
}
50+
4551template <bool alligned>
4652__attribute__ ((target(" avx2" )))
4753inline __m256i Load(const void * ptr)
@@ -59,15 +65,19 @@ inline void GetChannels3_16x16_2(uint8_t* ptr, __m256i& rl, __m256i& gl, __m256i
5965 const __m256i blendMask0 = _mm256_setr_epi8 (
6066 0 , 0 , -1 , 0 , 0 , -1 , 0 , 0 , -1 , 0 , 0 , -1 , 0 , 0 , -1 , 0 ,
6167 0 , -1 , 0 , 0 , -1 , 0 , 0 , -1 , 0 , 0 , -1 , 0 , 0 , -1 , 0 , 0 );
68+
6269 const __m256i blendMask1 = _mm256_setr_epi8 (
6370 0 , -1 , 0 , 0 , -1 , 0 , 0 , -1 , 0 , 0 , -1 , 0 , 0 , -1 , 0 , 0 ,
6471 -1 , 0 , 0 , -1 , 0 , 0 , -1 , 0 , 0 , -1 , 0 , 0 , -1 , 0 , 0 , -1 );
72+
6573 const __m256i shuffleMaskR = _mm256_setr_epi8 (
6674 0 , 3 , 6 , 9 , 12 , 15 , 2 , 5 , 8 , 11 , 14 , 1 , 4 , 7 , 10 , 13 ,
6775 0 , 3 , 6 , 9 , 12 , 15 , 2 , 5 , 8 , 11 , 14 , 1 , 4 , 7 , 10 , 13 );
76+
6877 const __m256i shuffleMaskG = _mm256_setr_epi8 (
6978 1 , 4 , 7 , 10 , 13 , 0 , 3 , 6 , 9 , 12 , 15 , 2 , 5 , 8 , 11 , 14 ,
7079 1 , 4 , 7 , 10 , 13 , 0 , 3 , 6 , 9 , 12 , 15 , 2 , 5 , 8 , 11 , 14 );
80+
7181 const __m256i shuffleMaskB = _mm256_setr_epi8 (
7282 2 , 5 , 8 , 11 , 14 , 1 , 4 , 7 , 10 , 13 , 0 , 3 , 6 , 9 , 12 , 15 ,
7383 2 , 5 , 8 , 11 , 14 , 1 , 4 , 7 , 10 , 13 , 0 , 3 , 6 , 9 , 12 , 15 );
@@ -103,8 +113,10 @@ inline void GetChannels3_16x16_2(uint8_t* ptr, __m256i& rl, __m256i& gl, __m256i
103113__attribute__ ((target(" avx2" )))
104114inline void GetChannels4_16x16_2(const uint8_t * ptr, __m256i& rl, __m256i& gl, __m256i& bl, __m256i& rh, __m256i& gh, __m256i& bh)
105115{
106- const __m256i rgbaShuffleMask = _mm256_setr_epi8 (0 , 4 , 8 , 12 , 1 , 5 , 9 , 13 , 2 , 6 , 10 , 14 , 3 , 7 , 11 , 15 ,
116+ const __m256i rgbaShuffleMask = _mm256_setr_epi8 (
117+ 0 , 4 , 8 , 12 , 1 , 5 , 9 , 13 , 2 , 6 , 10 , 14 , 3 , 7 , 11 , 15 ,
107118 0 , 4 , 8 , 12 , 1 , 5 , 9 , 13 , 2 , 6 , 10 , 14 , 3 , 7 , 11 , 15 );
119+
108120 __m256i rgb1 = _mm256_loadu_si256 ((const __m256i*)ptr);
109121 __m256i rgb2 = _mm256_loadu_si256 ((const __m256i*)(ptr + 32 ));
110122 __m256i rgb3 = _mm256_loadu_si256 ((const __m256i*)(ptr + 64 ));
@@ -295,7 +307,6 @@ inline void GetChannels3_16x16(uint8_t* RESTRICT input, __m256i& rl, __m256i& gl
295307 8 , 11 , 14 , 1 , 4 , 7 , 10 , 13
296308 );
297309
298- // Define blend mask directly in AVX registers
299310 const __m256i blendMask = _mm256_setr_epi8 (
300311 -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 ,
301312 -1 , -1 , -1 , 0 , 0 , 0 , 0 , 0 ,
@@ -358,12 +369,15 @@ inline void GetChannels4_16x16(uint8_t* RESTRICT src, __m256i& rl, __m256i& gl,
358369 const __m256i rmask = _mm256_setr_epi8 (
359370 0 , -1 , -1 , -1 , 4 , -1 , -1 , -1 , 8 , -1 , -1 , -1 , 12 , -1 , -1 , -1 , 16 ,
360371 -1 , -1 , -1 , 20 , -1 , -1 , -1 , 24 , -1 , -1 , -1 , 28 , -1 , -1 , -1 );
372+
361373 const __m256i gmask = _mm256_setr_epi8 (
362374 1 , -1 , -1 , -1 , 5 , -1 , -1 , -1 , 9 , -1 , -1 , -1 , 13 , -1 , -1 , -1 , 17 ,
363375 -1 , -1 , -1 , 21 , -1 , -1 , -1 , 25 , -1 , -1 , -1 , 29 , -1 , -1 , -1 );
376+
364377 const __m256i bmask = _mm256_setr_epi8 (
365378 2 , -1 , -1 , -1 , 6 , -1 , -1 , -1 , 10 , -1 , -1 , -1 , 14 , -1 , -1 , -1 , 18 ,
366379 -1 , -1 , -1 , 22 , -1 , -1 , -1 , 26 , -1 , -1 , -1 , 30 , -1 , -1 , -1 );
380+
367381 __m256i rgb1 = _mm256_loadu_si256 ((__m256i*)src);
368382 __m256i rgb2 = _mm256_loadu_si256 ((__m256i*)(src + 32 ));
369383 __m256i rgb3 = _mm256_loadu_si256 ((__m256i*)(src + 64 ));
0 commit comments