Skip to content

Commit 7e0c73f

Browse files
committed
core: avx2 transpose spelling mistakes and docs
1 parent dccedb4 commit 7e0c73f

File tree

1 file changed

+22
-8
lines changed

1 file changed

+22
-8
lines changed

cryprot-core/src/transpose/avx2.rs

Lines changed: 22 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -128,7 +128,7 @@ pub fn avx_transpose128x128(in_out: &mut [__m256i; 64]) {
128128
}
129129
});
130130

131-
// Phase 6: swap 64x64 bit-matrices therfore completing the 128x128 bit
131+
// Phase 6: swap 64x64 bit-matrices therefore completing the 128x128 bit
132132
// transpose
133133
const SHIFT_6: usize = 6;
134134
const OFFSET_6: usize = 1 << (SHIFT_6 - 1); // 32
@@ -160,21 +160,27 @@ const fn mask(pattern: u64, pattern_len: u32) -> u64 {
160160
///
161161
/// This implementation is specifically tuned for transposing `128 x l` matrices
162162
/// as done in OT protocols. Performance might be better if `input` is 16-byte
163-
/// aligned and the number of columns is divisable by 512 on systems with
163+
/// aligned and the number of columns is divisible by 512 on systems with
164164
/// 64-byte cache lines.
165165
///
166166
/// # Panics
167167
/// If `input.len() != output.len()`
168168
/// If the number of rows is less than 128.
169-
/// If the number of rows is not divisable by 128.
170-
/// If the number of columns (= input.len() * 8 / rows) is not divisable by 8.
169+
/// If `input.len()` is not divisible by rows.
170+
/// If the number of rows is not divisible by 128.
171+
/// If the number of columns (= input.len() * 8 / rows) is not divisible by 8.
171172
///
172173
/// # Safety
173174
/// AVX2 instruction set must be available.
174175
#[target_feature(enable = "avx2")]
175176
pub fn transpose_bitmatrix(input: &[u8], output: &mut [u8], rows: usize) {
176177
assert_eq!(input.len(), output.len());
177178
assert!(rows >= 128, "Number of rows must be >= 128.");
179+
assert_eq!(
180+
0,
181+
input.len() % rows,
182+
"input.len() must be divisible by rows"
183+
);
178184
assert_eq!(0, rows % 128, "Number of rows must be a multiple of 128.");
179185
let cols = input.len() * 8 / rows;
180186
assert_eq!(0, cols % 8, "Number of columns must be a multiple of 8.");
@@ -280,7 +286,15 @@ pub fn transpose_bitmatrix(input: &[u8], output: &mut [u8], rows: usize) {
280286
}
281287
}
282288

283-
// Inline never to reduce code size of main method.
289+
// Inline never to reduce code size of `transpose_bitmatrix` method. This
290+
// method is only called once per row block if the columns are not divisible by 128.
291+
// Since this is only rarely executed, as opposed to the core loop of
292+
// `transpose_bitmatrix` we annotate it with inline(never) to ensure the
293+
// optimizer doesn't inline it which could negatively impact performance
294+
// due to larger code size and potentially more instruction cache misses. This
295+
// is an assumption and not verified by a benchmark, but even if it were wrong,
296+
// it shouldn't negatively impact runtime because this method is called rarely
297+
// in our use cases where we have 128 rows and many columns.
284298
#[inline(never)]
285299
#[target_feature(enable = "avx2")]
286300
#[allow(clippy::too_many_arguments)]
@@ -335,7 +349,7 @@ mod tests {
335349
let mut v = [_mm256_setzero_si256(); 64];
336350
StdRng::seed_from_u64(42).fill_bytes(bytemuck::cast_slice_mut(&mut v));
337351

338-
let orig = v.clone();
352+
let orig = v;
339353
avx_transpose128x128(&mut v);
340354
avx_transpose128x128(&mut v);
341355
let mut failed = false;
@@ -398,7 +412,7 @@ mod tests {
398412
}
399413

400414
#[test]
401-
fn test_avx_transpose_larger_cols_divisable_by_4_times_128() {
415+
fn test_avx_transpose_larger_cols_divisible_by_4_times_128() {
402416
let rows = 128;
403417
let cols = 128 * 8;
404418
let mut v = vec![0_u8; rows * cols / 8];
@@ -415,7 +429,7 @@ mod tests {
415429
}
416430

417431
#[test]
418-
fn test_avx_transpose_larger_cols_divisable_by_8() {
432+
fn test_avx_transpose_larger_cols_divisible_by_8() {
419433
let rows = 128;
420434
let cols = 128 + 32;
421435
let mut v = vec![0_u8; rows * cols / 8];

0 commit comments

Comments
 (0)