@@ -183,38 +183,38 @@ static BLAKE2_INLINE __m128i fBlaMka(__m128i x, __m128i y) {
 
 //#include <immintrin.h>
 
-#define rotr32(x) _mm256_shuffle_epi32(x, _MM_SHUFFLE(2, 3, 0, 1))
-#define rotr24(x) _mm256_shuffle_epi8(x, _mm256_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10, 3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10))
-#define rotr16(x) _mm256_shuffle_epi8(x, _mm256_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9, 2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9))
-#define rotr63(x) _mm256_xor_si256(_mm256_srli_epi64((x), 63), _mm256_add_epi64((x), (x)))
+#define rotr32_avx2(x) _mm256_shuffle_epi32(x, _MM_SHUFFLE(2, 3, 0, 1))
+#define rotr24_avx2(x) _mm256_shuffle_epi8(x, _mm256_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10, 3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10))
+#define rotr16_avx2(x) _mm256_shuffle_epi8(x, _mm256_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9, 2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9))
+#define rotr63_avx2(x) _mm256_xor_si256(_mm256_srli_epi64((x), 63), _mm256_add_epi64((x), (x)))
 
 #define G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
     do { \
         __m256i ml = _mm256_mul_epu32(A0, B0); \
         ml = _mm256_add_epi64(ml, ml); \
         A0 = _mm256_add_epi64(A0, _mm256_add_epi64(B0, ml)); \
         D0 = _mm256_xor_si256(D0, A0); \
-        D0 = rotr32(D0); \
+        D0 = rotr32_avx2(D0); \
         \
         ml = _mm256_mul_epu32(C0, D0); \
         ml = _mm256_add_epi64(ml, ml); \
         C0 = _mm256_add_epi64(C0, _mm256_add_epi64(D0, ml)); \
         \
         B0 = _mm256_xor_si256(B0, C0); \
-        B0 = rotr24(B0); \
+        B0 = rotr24_avx2(B0); \
         \
         ml = _mm256_mul_epu32(A1, B1); \
         ml = _mm256_add_epi64(ml, ml); \
         A1 = _mm256_add_epi64(A1, _mm256_add_epi64(B1, ml)); \
         D1 = _mm256_xor_si256(D1, A1); \
-        D1 = rotr32(D1); \
+        D1 = rotr32_avx2(D1); \
         \
         ml = _mm256_mul_epu32(C1, D1); \
         ml = _mm256_add_epi64(ml, ml); \
         C1 = _mm256_add_epi64(C1, _mm256_add_epi64(D1, ml)); \
         \
         B1 = _mm256_xor_si256(B1, C1); \
-        B1 = rotr24(B1); \
+        B1 = rotr24_avx2(B1); \
     } while((void)0, 0);
 
 #define G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
@@ -223,25 +223,25 @@ static BLAKE2_INLINE __m128i fBlaMka(__m128i x, __m128i y) {
         ml = _mm256_add_epi64(ml, ml); \
         A0 = _mm256_add_epi64(A0, _mm256_add_epi64(B0, ml)); \
         D0 = _mm256_xor_si256(D0, A0); \
-        D0 = rotr16(D0); \
+        D0 = rotr16_avx2(D0); \
         \
         ml = _mm256_mul_epu32(C0, D0); \
         ml = _mm256_add_epi64(ml, ml); \
         C0 = _mm256_add_epi64(C0, _mm256_add_epi64(D0, ml)); \
         B0 = _mm256_xor_si256(B0, C0); \
-        B0 = rotr63(B0); \
+        B0 = rotr63_avx2(B0); \
         \
         ml = _mm256_mul_epu32(A1, B1); \
         ml = _mm256_add_epi64(ml, ml); \
         A1 = _mm256_add_epi64(A1, _mm256_add_epi64(B1, ml)); \
         D1 = _mm256_xor_si256(D1, A1); \
-        D1 = rotr16(D1); \
+        D1 = rotr16_avx2(D1); \
         \
         ml = _mm256_mul_epu32(C1, D1); \
         ml = _mm256_add_epi64(ml, ml); \
         C1 = _mm256_add_epi64(C1, _mm256_add_epi64(D1, ml)); \
         B1 = _mm256_xor_si256(B1, C1); \
-        B1 = rotr63(B1); \
+        B1 = rotr63_avx2(B1); \
     } while((void)0, 0);
 
 #define DIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \
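
For reference, here is a scalar sketch of what the renamed AVX2 macros compute per 64-bit lane: each rotr*_avx2 is an ordinary 64-bit right rotation (rotr32/rotr24/rotr16 are done with lane shuffles, rotr63 with a shift plus self-add since x + x equals x << 1), and the mul/add/add sequence in G1_AVX2 and G2_AVX2 is the BlaMka step a + b + 2 * lo32(a) * lo32(b) used by Argon2's G function. The helper names rotr64 and blamka below are illustrative only and do not appear in this patch.

/* Scalar sketch of the per-lane semantics; rotr64 and blamka are
 * hypothetical names chosen for illustration, not part of the patch. */
#include <stdint.h>
#include <stdio.h>

static uint64_t rotr64(uint64_t x, unsigned n) {
    /* 64-bit right rotation; called here only with 1 <= n <= 63 */
    return (x >> n) | (x << (64 - n));
}

/* BlaMka mixing: a + b + 2 * lo32(a) * lo32(b). This is what the
 * _mm256_mul_epu32 / _mm256_add_epi64 / _mm256_add_epi64 sequence
 * computes in each 64-bit lane of the macros above. */
static uint64_t blamka(uint64_t a, uint64_t b) {
    uint64_t ml = (uint64_t)(uint32_t)a * (uint64_t)(uint32_t)b;
    return a + b + 2 * ml;
}

int main(void) {
    uint64_t x = 0x0123456789abcdefULL, y = 0xfedcba9876543210ULL;
    /* rotr63_avx2 XORs srli(x, 63) with (x + x); since x + x == x << 1,
     * the result equals a right rotation by 63. */
    printf("rotr63: %016llx\n", (unsigned long long)rotr64(x, 63));
    printf("blamka: %016llx\n", (unsigned long long)blamka(x, y));
    return 0;
}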