@@ -8,6 +8,7 @@ use core::arch::x86::*;
88#[ cfg( target_arch = "x86_64" ) ]
99use core:: arch:: x86_64:: * ;
1010
11+ use super :: ParBlocks ;
1112use crate :: { Block , Key } ;
1213
1314const fn set02 ( x3 : u8 , x2 : u8 , x1 : u8 , x0 : u8 ) -> i32 {
@@ -964,25 +965,44 @@ impl Aligned4x130 {
964965 /// Panics if `src.len() < 64`.
965966 #[ target_feature( enable = "avx2" ) ]
966967 pub ( super ) unsafe fn from_blocks ( src : & [ Block ; 4 ] ) -> Self {
968+ let ( lo, hi) = src. split_at ( 2 ) ;
969+ let blocks_23 = _mm256_loadu_si256 ( hi. as_ptr ( ) as * const _ ) ;
970+ let blocks_01 = _mm256_loadu_si256 ( lo. as_ptr ( ) as * const _ ) ;
971+
972+ Self :: from_loaded_blocks ( blocks_01, blocks_23)
973+ }
974+
975+ /// Aligns four 16-byte Poly1305 blocks at 26-bit boundaries within 32-bit words, and
976+ /// sets the high bit for each block.
977+ #[ target_feature( enable = "avx2" ) ]
978+ pub ( super ) unsafe fn from_par_blocks ( src : & ParBlocks ) -> Self {
979+ let ( lo, hi) = src. split_at ( 2 ) ;
980+ let blocks_23 = _mm256_loadu_si256 ( hi. as_ptr ( ) as * const _ ) ;
981+ let blocks_01 = _mm256_loadu_si256 ( lo. as_ptr ( ) as * const _ ) ;
982+
983+ Self :: from_loaded_blocks ( blocks_01, blocks_23)
984+ }
985+
986+ /// Aligns four 16-byte Poly1305 blocks at 26-bit boundaries within 32-bit words, and
987+ /// sets the high bit for each block.
988+ ///
989+ /// The four blocks must be in the following 32-bit word layout:
990+ /// [b33, b32, b31, b30, b23, b22, b21, b20]
991+ /// [b13, b12, b11, b10, b03, b02, b01, b00]
992+ #[ target_feature( enable = "avx2" ) ]
993+ unsafe fn from_loaded_blocks ( blocks_01 : __m256i , blocks_23 : __m256i ) -> Self {
967994 // 26-bit mask on each 32-bit word.
968995 let mask_26 = _mm256_set1_epi32 ( 0x3ffffff ) ;
969996 // Sets bit 24 of each 32-bit word.
970997 let set_hibit = _mm256_set1_epi32 ( 1 << 24 ) ;
971998
972- // - Load the four blocks into the following 32-bit word layout:
973- // [b33, b32, b31, b30, b23, b22, b21, b20]
974- // [b13, b12, b11, b10, b03, b02, b01, b00]
975- //
976999 // - Unpack the upper and lower 64 bits:
9771000 // [b33, b32, b13, b12, b23, b22, b03, b02]
9781001 // [b31, b30, b11, b10, b21, b20, b01, b00]
9791002 //
9801003 // - Swap the middle two 64-bit words:
9811004 // a0 = [b33, b32, b23, b22, b13, b12, b03, b02]
9821005 // a1 = [b31, b30, b21, b20, b11, b10, b01, b00]
983- let ( lo, hi) = src. split_at ( 2 ) ;
984- let blocks_23 = _mm256_loadu_si256 ( hi. as_ptr ( ) as * const _ ) ;
985- let blocks_01 = _mm256_loadu_si256 ( lo. as_ptr ( ) as * const _ ) ;
9861006 let a0 = _mm256_permute4x64_epi64 (
9871007 _mm256_unpackhi_epi64 ( blocks_01, blocks_23) ,
9881008 set02 ( 3 , 1 , 2 , 0 ) ,
0 commit comments