@@ -1047,13 +1047,26 @@ INLINE void transpose_msg_vecs16(const uint8_t *const *inputs,
10471047INLINE void load_counters16 (uint64_t counter , bool increment_counter ,
10481048 __m512i * out_lo , __m512i * out_hi ) {
10491049 const __m512i mask = _mm512_set1_epi32 (- (int32_t )increment_counter );
1050- const __m512i add0 = _mm512_set_epi32 (15 , 14 , 13 , 12 , 11 , 10 , 9 , 8 , 7 , 6 , 5 , 4 , 3 , 2 , 1 , 0 );
1051- const __m512i add1 = _mm512_and_si512 (mask , add0 );
1052- __m512i l = _mm512_add_epi32 (_mm512_set1_epi32 ((int32_t )counter ), add1 );
1053- __mmask16 carry = _mm512_cmp_epu32_mask (l , add1 , _MM_CMPINT_LT );
1054- __m512i h = _mm512_mask_add_epi32 (_mm512_set1_epi32 ((int32_t )(counter >> 32 )), carry , _mm512_set1_epi32 ((int32_t )(counter >> 32 )), _mm512_set1_epi32 (1 ));
1055- * out_lo = l ;
1056- * out_hi = h ;
1050+ const __m512i deltas = _mm512_set_epi32 (15 , 14 , 13 , 12 , 11 , 10 , 9 , 8 , 7 , 6 , 5 , 4 , 3 , 2 , 1 , 0 );
1051+ const __m512i masked_deltas = _mm512_and_si512 (deltas , mask );
1052+ const __m512i low_words = _mm512_add_epi32 (
1053+ _mm512_set1_epi32 ((int32_t )counter ),
1054+ masked_deltas );
1055+ // The carry bit is 1 if the high bit of the word was 1 before addition and is
1056+ // 0 after.
1057+ // NOTE: It would be a bit more natural to use _mm512_cmp_epu32_mask to
1058+ // compute the carry bits here, and originally we did, but that intrinsic is
1059+ // broken under GCC 5.4. See https://github.com/BLAKE3-team/BLAKE3/issues/271.
1060+ const __m512i carries = _mm512_srli_epi32 (
1061+ _mm512_andnot_si512 (
1062+ low_words , // 0 after (gets inverted by andnot)
1063+ _mm512_set1_epi32 ((int32_t )counter )), // and 1 before
1064+ 31 );
1065+ const __m512i high_words = _mm512_add_epi32 (
1066+ _mm512_set1_epi32 ((int32_t )(counter >> 32 )),
1067+ carries );
1068+ * out_lo = low_words ;
1069+ * out_hi = high_words ;
10571070}
10581071
10591072static
0 commit comments