@@ -128,7 +128,7 @@ pub fn avx_transpose128x128(in_out: &mut [__m256i; 64]) {
128128 }
129129 } ) ;
130130
131- // Phase 6: swap 64x64 bit-matrices therfore completing the 128x128 bit
131+ // Phase 6: swap 64x64 bit-matrices therefore completing the 128x128 bit
132132 // transpose
133133 const SHIFT_6 : usize = 6 ;
134134 const OFFSET_6 : usize = 1 << ( SHIFT_6 - 1 ) ; // 32
@@ -160,21 +160,27 @@ const fn mask(pattern: u64, pattern_len: u32) -> u64 {
160160///
161161/// This implementation is specifically tuned for transposing `128 x l` matrices
162162/// as done in OT protocols. Performance might be better if `input` is 16-byte
163- /// aligned and the number of columns is divisable by 512 on systems with
163+ /// aligned and the number of columns is divisible by 512 on systems with
164164/// 64-byte cache lines.
165165///
166166/// # Panics
167167/// If `input.len() != output.len()`
168168/// If the number of rows is less than 128.
169- /// If the number of rows is not divisable by 128.
170- /// If the number of columns (= input.len() * 8 / rows) is not divisable by 8.
169+ /// If `input.len()` is not divisible by rows.
170+ /// If the number of rows is not divisible by 128.
171+ /// If the number of columns (= input.len() * 8 / rows) is not divisible by 8.
171172///
172173/// # Safety
173174/// AVX2 instruction set must be available.
174175#[ target_feature( enable = "avx2" ) ]
175176pub fn transpose_bitmatrix ( input : & [ u8 ] , output : & mut [ u8 ] , rows : usize ) {
176177 assert_eq ! ( input. len( ) , output. len( ) ) ;
177178 assert ! ( rows >= 128 , "Number of rows must be >= 128." ) ;
179+ assert_eq ! (
180+ 0 ,
181+ input. len( ) % rows,
182+ "input.len(), must be divisble by rows"
183+ ) ;
178184 assert_eq ! ( 0 , rows % 128 , "Number of rows must be a multiple of 128." ) ;
179185 let cols = input. len ( ) * 8 / rows;
180186 assert_eq ! ( 0 , cols % 8 , "Number of columns must be a multiple of 8." ) ;
@@ -280,7 +286,15 @@ pub fn transpose_bitmatrix(input: &[u8], output: &mut [u8], rows: usize) {
280286 }
281287}
282288
283- // Inline never to reduce code size of main method.
289+ // Inline never to reduce code size of the `transpose_bitmatrix` method. This
290+ // method is only called once per row block if the columns are not divisible by 128.
291+ // Since this is only rarely executed, as opposed to the core loop of
292+ // `transpose_bitmatrix` we annotate it with inline(never) to ensure the
293+ // optimizer doesn't inline it which could negatively impact performance
294+ // due to larger code size and potentially more instruction cache misses. This
295+ // is an assumption and not verified by a benchmark, but even if it were wrong,
296+ // it shouldn't negatively impact runtime because this method is called rarely
297+ // in our use cases where we have 128 rows and many columns.
284298#[ inline( never) ]
285299#[ target_feature( enable = "avx2" ) ]
286300#[ allow( clippy:: too_many_arguments) ]
@@ -335,7 +349,7 @@ mod tests {
335349 let mut v = [ _mm256_setzero_si256 ( ) ; 64 ] ;
336350 StdRng :: seed_from_u64 ( 42 ) . fill_bytes ( bytemuck:: cast_slice_mut ( & mut v) ) ;
337351
338- let orig = v. clone ( ) ;
352+ let orig = v;
339353 avx_transpose128x128 ( & mut v) ;
340354 avx_transpose128x128 ( & mut v) ;
341355 let mut failed = false ;
@@ -398,7 +412,7 @@ mod tests {
398412 }
399413
400414 #[ test]
401- fn test_avx_transpose_larger_cols_divisable_by_4_times_128 ( ) {
415+ fn test_avx_transpose_larger_cols_divisible_by_4_times_128 ( ) {
402416 let rows = 128 ;
403417 let cols = 128 * 8 ;
404418 let mut v = vec ! [ 0_u8 ; rows * cols / 8 ] ;
@@ -415,7 +429,7 @@ mod tests {
415429 }
416430
417431 #[ test]
418- fn test_avx_transpose_larger_cols_divisable_by_8 ( ) {
432+ fn test_avx_transpose_larger_cols_divisible_by_8 ( ) {
419433 let rows = 128 ;
420434 let cols = 128 + 32 ;
421435 let mut v = vec ! [ 0_u8 ; rows * cols / 8 ] ;
0 commit comments