Skip to content

Commit e62c241

Browse files
committed
poly1305: Expose 4-block processing on the AVX2 backend
1 parent eb30dc0 commit e62c241

File tree

2 files changed

+53
-12
lines changed

2 files changed

+53
-12
lines changed

poly1305/src/backend/avx2.rs

Lines changed: 26 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
// length to be known, which is incompatible with the streaming API of UniversalHash.
1717

1818
use universal_hash::{
19-
consts::{U1, U16},
19+
consts::{U16, U4},
2020
crypto_common::{BlockSizeUser, ParBlocksSizeUser},
2121
generic_array::GenericArray,
2222
UhfBackend,
@@ -27,6 +27,9 @@ use crate::{Block, Key, Tag};
2727
mod helpers;
2828
use self::helpers::*;
2929

30+
/// Four Poly1305 blocks (64-bytes)
31+
type ParBlocks = universal_hash::ParBlocks<State>;
32+
3033
#[derive(Copy, Clone)]
3134
struct Initialized {
3235
p: Aligned4x130,
@@ -65,6 +68,15 @@ impl State {
6568
}
6669
}
6770

71+
/// Process four Poly1305 blocks at once.
72+
#[target_feature(enable = "avx2")]
73+
pub(crate) unsafe fn compute_par_blocks(&mut self, blocks: &ParBlocks) {
74+
assert!(self.partial_block.is_none());
75+
assert_eq!(self.num_cached_blocks, 0);
76+
77+
self.process_blocks(Aligned4x130::from_par_blocks(blocks));
78+
}
79+
6880
/// Compute a Poly1305 block
6981
#[target_feature(enable = "avx2")]
7082
pub(crate) unsafe fn compute_block(&mut self, block: &Block, partial: bool) {
@@ -83,13 +95,18 @@ impl State {
8395
self.num_cached_blocks = 0;
8496
}
8597

98+
self.process_blocks(Aligned4x130::from_blocks(&self.cached_blocks));
99+
}
100+
101+
/// Compute a Poly1305 block
102+
#[target_feature(enable = "avx2")]
103+
unsafe fn process_blocks(&mut self, blocks: Aligned4x130) {
86104
if let Some(inner) = &mut self.initialized {
87105
// P <-- R^4 * P + blocks
88-
inner.p =
89-
(&inner.p * inner.r4).reduce() + Aligned4x130::from_blocks(&self.cached_blocks);
106+
inner.p = (&inner.p * inner.r4).reduce() + blocks;
90107
} else {
91108
// Initialize the polynomial.
92-
let p = Aligned4x130::from_blocks(&self.cached_blocks);
109+
let p = blocks;
93110

94111
// Initialize the multiplier (used to merge down the polynomial during
95112
// finalization).
@@ -160,11 +177,15 @@ impl BlockSizeUser for State {
160177
}
161178

162179
impl ParBlocksSizeUser for State {
163-
type ParBlocksSize = U1;
180+
type ParBlocksSize = U4;
164181
}
165182

166183
impl UhfBackend for State {
167184
fn proc_block(&mut self, block: &Block) {
168185
unsafe { self.compute_block(block, false) };
169186
}
187+
188+
fn proc_par_blocks(&mut self, blocks: &ParBlocks) {
189+
unsafe { self.compute_par_blocks(blocks) };
190+
}
170191
}

poly1305/src/backend/avx2/helpers.rs

Lines changed: 27 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ use core::arch::x86::*;
88
#[cfg(target_arch = "x86_64")]
99
use core::arch::x86_64::*;
1010

11+
use super::ParBlocks;
1112
use crate::{Block, Key};
1213

1314
const fn set02(x3: u8, x2: u8, x1: u8, x0: u8) -> i32 {
@@ -964,25 +965,44 @@ impl Aligned4x130 {
964965
/// Panics if `src.len() < 64`.
965966
#[target_feature(enable = "avx2")]
966967
pub(super) unsafe fn from_blocks(src: &[Block; 4]) -> Self {
968+
let (lo, hi) = src.split_at(2);
969+
let blocks_23 = _mm256_loadu_si256(hi.as_ptr() as *const _);
970+
let blocks_01 = _mm256_loadu_si256(lo.as_ptr() as *const _);
971+
972+
Self::from_loaded_blocks(blocks_01, blocks_23)
973+
}
974+
975+
/// Aligns four 16-byte Poly1305 blocks at 26-bit boundaries within 32-bit words, and
976+
/// sets the high bit for each block.
977+
#[target_feature(enable = "avx2")]
978+
pub(super) unsafe fn from_par_blocks(src: &ParBlocks) -> Self {
979+
let (lo, hi) = src.split_at(2);
980+
let blocks_23 = _mm256_loadu_si256(hi.as_ptr() as *const _);
981+
let blocks_01 = _mm256_loadu_si256(lo.as_ptr() as *const _);
982+
983+
Self::from_loaded_blocks(blocks_01, blocks_23)
984+
}
985+
986+
/// Aligns four 16-byte Poly1305 blocks at 26-bit boundaries within 32-bit words, and
987+
/// sets the high bit for each block.
988+
///
989+
/// The four blocks must be in the following 32-bit word layout:
990+
/// [b33, b32, b31, b30, b23, b22, b21, b20]
991+
/// [b13, b12, b11, b10, b03, b02, b01, b00]
992+
#[target_feature(enable = "avx2")]
993+
unsafe fn from_loaded_blocks(blocks_01: __m256i, blocks_23: __m256i) -> Self {
967994
// 26-bit mask on each 32-bit word.
968995
let mask_26 = _mm256_set1_epi32(0x3ffffff);
969996
// Sets bit 24 of each 32-bit word.
970997
let set_hibit = _mm256_set1_epi32(1 << 24);
971998

972-
// - Load the four blocks into the following 32-bit word layout:
973-
// [b33, b32, b31, b30, b23, b22, b21, b20]
974-
// [b13, b12, b11, b10, b03, b02, b01, b00]
975-
//
976999
// - Unpack the upper and lower 64 bits:
9771000
// [b33, b32, b13, b12, b23, b22, b03, b02]
9781001
// [b31, b30, b11, b10, b21, b20, b01, b00]
9791002
//
9801003
// - Swap the middle two 64-bit words:
9811004
// a0 = [b33, b32, b23, b22, b13, b12, b03, b02]
9821005
// a1 = [b31, b30, b21, b20, b11, b10, b01, b00]
983-
let (lo, hi) = src.split_at(2);
984-
let blocks_23 = _mm256_loadu_si256(hi.as_ptr() as *const _);
985-
let blocks_01 = _mm256_loadu_si256(lo.as_ptr() as *const _);
9861006
let a0 = _mm256_permute4x64_epi64(
9871007
_mm256_unpackhi_epi64(blocks_01, blocks_23),
9881008
set02(3, 1, 2, 0),

0 commit comments

Comments
 (0)