Skip to content

Commit 2ba387d

Browse files
authored
polyval: ARMv8 PMULL backend (#126)
The ARMv8 Cryptography Extensions provide a CLMUL-like instruction called PMULL which can be used for computing GHASH/POLYVAL. Additional background: https://eprint.iacr.org/2015/688.pdf This commit adds a nightly-only PMULL accelerated backend for POLYVAL based on this public domain C intrinsics implementation: https://github.com/noloader/AES-Intrinsics/blob/master/clmul-arm.c Adapting it to POLYVAL required changes similar to the CLMUL backend, namely adapting the mask to use POLYVAL's polynomial (which is the reverse of GHASH), and some additional work in the reduction to make it "desrever" (as Shay Gueron likes to say). Performance seems suboptimal, but still significantly better than the software implementation by an order of magnitude. It seems that ARMv8 CPUs support a number of instruction fusions with PMULL, e.g. `fuse-crypto-eor`, and we should investigate those. Additionally it seems like we could better schedule operations on multiple blocks in parallel.
1 parent 46dd3db commit 2ba387d

File tree

9 files changed

+237
-56
lines changed

9 files changed

+237
-56
lines changed

.github/workflows/polyval.yml

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -180,4 +180,27 @@ jobs:
180180
- run: cross test --target ${{ matrix.target }} --release --features force-soft
181181
- run: cross test --target ${{ matrix.target }} --release --features std
182182
- run: cross test --target ${{ matrix.target }} --release --features zeroize
183-
- run: cross test --target ${{ matrix.target }} --release --all-features
183+
184+
# ARMv8 cross-compiled tests for PMULL intrinsics (nightly-only)
185+
armv8:
186+
strategy:
187+
matrix:
188+
include:
189+
- target: aarch64-unknown-linux-gnu
190+
rust: nightly
191+
runs-on: ubuntu-latest
192+
steps:
193+
- uses: actions/checkout@v1
194+
- run: ${{ matrix.deps }}
195+
- uses: actions-rs/toolchain@v1
196+
with:
197+
toolchain: ${{ matrix.rust }}
198+
target: ${{ matrix.target }}
199+
profile: minimal
200+
override: true
201+
- run: cargo install cross
202+
- run: cross test --release --target ${{ matrix.target }} --features armv8
203+
- run: cross test --release --target ${{ matrix.target }} --features armv8,force-soft
204+
- run: cross test --release --target ${{ matrix.target }} --features armv8,std
205+
- run: cross test --release --target ${{ matrix.target }} --features armv8,zeroize
206+
- run: cross test --release --target ${{ matrix.target }} --all-features

Cargo.lock

Lines changed: 7 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

ghash/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,4 +24,5 @@ hex-literal = "0.2"
2424

2525
[features]
2626
std = ["polyval/std"]
27+
armv8 = ["polyval/armv8"]
2728
force-soft = ["polyval/force-soft"]

polyval/Cargo.toml

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,19 +15,21 @@ categories = ["cryptography", "no-std"]
1515
edition = "2018"
1616

1717
[dependencies]
18+
cfg-if = "1"
1819
opaque-debug = "0.3"
1920
universal-hash = { version = "0.4", default-features = false }
2021
zeroize = { version = "1.3", optional = true, default-features = false }
2122

22-
[target.'cfg(any(target_arch = "x86_64", target_arch = "x86"))'.dependencies]
23-
cpufeatures = "0.1"
23+
[target.'cfg(any(target_arch = "aarch64", target_arch = "x86_64", target_arch = "x86"))'.dependencies]
24+
cpufeatures = "0.1.4"
2425

2526
[dev-dependencies]
2627
hex-literal = "0.2"
2728

2829
[features]
29-
force-soft = [] # Disable support for hardware intrinsics (CLMUL)
3030
std = ["universal-hash/std"]
31+
armv8 = [] # Enable nightly-only ARMv8 intrinsics support
32+
force-soft = [] # Disable support for hardware intrinsics
3133

3234
[package.metadata.docs.rs]
3335
all-features = true

polyval/src/backend.rs

Lines changed: 18 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -1,29 +1,24 @@
11
//! POLYVAL backends
22
3-
#[cfg(all(
4-
any(target_arch = "x86", target_arch = "x86_64"),
5-
not(feature = "force-soft")
6-
))]
7-
pub(crate) mod autodetect;
8-
9-
#[cfg(all(
10-
any(target_arch = "x86", target_arch = "x86_64"),
11-
not(feature = "force-soft")
12-
))]
13-
pub(crate) mod clmul;
14-
153
#[cfg_attr(not(target_pointer_width = "64"), path = "backend/soft32.rs")]
164
#[cfg_attr(target_pointer_width = "64", path = "backend/soft64.rs")]
17-
pub(crate) mod soft;
5+
mod soft;
186

19-
#[cfg(all(
20-
any(target_arch = "x86", target_arch = "x86_64"),
21-
not(feature = "force-soft")
22-
))]
23-
pub use crate::backend::autodetect::Polyval;
7+
use cfg_if::cfg_if;
248

25-
#[cfg(not(all(
26-
any(target_arch = "x86", target_arch = "x86_64"),
27-
not(feature = "force-soft")
28-
)))]
29-
pub use crate::backend::soft::Polyval;
9+
cfg_if! {
10+
if #[cfg(all(target_arch = "aarch64", feature = "armv8", not(feature = "force-soft")))] {
11+
mod autodetect;
12+
mod pmull;
13+
pub use crate::backend::autodetect::Polyval;
14+
} else if #[cfg(all(
15+
any(target_arch = "x86_64", target_arch = "x86"),
16+
not(feature = "force-soft")
17+
))] {
18+
mod autodetect;
19+
mod clmul;
20+
pub use crate::backend::autodetect::Polyval;
21+
} else {
22+
pub use crate::backend::soft::Polyval;
23+
}
24+
}

polyval/src/backend/autodetect.rs

Lines changed: 25 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,37 +1,47 @@
1-
//! Autodetection for (P)CLMUL(QDQ) CPU intrinsics on x86 CPUs, with fallback
2-
//! to the "soft" backend when it's unavailable.
1+
//! Autodetection for CPU intrinsics, with fallback to the "soft" backend when
2+
//! they are unavailable.
33
4-
use crate::{backend, Block, Key};
4+
use crate::{backend::soft, Block, Key};
55
use core::mem::ManuallyDrop;
66
use universal_hash::{consts::U16, NewUniversalHash, Output, UniversalHash};
77

8-
cpufeatures::new!(clmul_cpuid, "pclmulqdq", "sse4.1");
8+
#[cfg(all(target_arch = "aarch64", feature = "armv8"))]
9+
use super::pmull as intrinsics;
10+
11+
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
12+
use super::clmul as intrinsics;
13+
14+
#[cfg(all(target_arch = "aarch64", feature = "armv8"))]
15+
cpufeatures::new!(mul_intrinsics, "aes"); // `aes` implies PMULL
16+
17+
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
18+
cpufeatures::new!(mul_intrinsics, "pclmulqdq", "sse4.1");
919

1020
/// **POLYVAL**: GHASH-like universal hash over GF(2^128).
1121
pub struct Polyval {
1222
inner: Inner,
13-
token: clmul_cpuid::InitToken,
23+
token: mul_intrinsics::InitToken,
1424
}
1525

1626
union Inner {
17-
clmul: ManuallyDrop<backend::clmul::Polyval>,
18-
soft: ManuallyDrop<backend::soft::Polyval>,
27+
intrinsics: ManuallyDrop<intrinsics::Polyval>,
28+
soft: ManuallyDrop<soft::Polyval>,
1929
}
2030

2131
impl NewUniversalHash for Polyval {
2232
type KeySize = U16;
2333

2434
/// Initialize POLYVAL with the given `H` field element
2535
fn new(h: &Key) -> Self {
26-
let (token, clmul_present) = clmul_cpuid::init_get();
36+
let (token, has_intrinsics) = mul_intrinsics::init_get();
2737

28-
let inner = if clmul_present {
38+
let inner = if has_intrinsics {
2939
Inner {
30-
clmul: ManuallyDrop::new(backend::clmul::Polyval::new(h)),
40+
intrinsics: ManuallyDrop::new(intrinsics::Polyval::new(h)),
3141
}
3242
} else {
3343
Inner {
34-
soft: ManuallyDrop::new(backend::soft::Polyval::new(h)),
44+
soft: ManuallyDrop::new(soft::Polyval::new(h)),
3545
}
3646
};
3747

@@ -46,7 +56,7 @@ impl UniversalHash for Polyval {
4656
#[inline]
4757
fn update(&mut self, x: &Block) {
4858
if self.token.get() {
49-
unsafe { (*self.inner.clmul).update(x) }
59+
unsafe { (*self.inner.intrinsics).update(x) }
5060
} else {
5161
unsafe { (*self.inner.soft).update(x) }
5262
}
@@ -55,7 +65,7 @@ impl UniversalHash for Polyval {
5565
/// Reset internal state
5666
fn reset(&mut self) {
5767
if self.token.get() {
58-
unsafe { (*self.inner.clmul).reset() }
68+
unsafe { (*self.inner.intrinsics).reset() }
5969
} else {
6070
unsafe { (*self.inner.soft).reset() }
6171
}
@@ -65,7 +75,7 @@ impl UniversalHash for Polyval {
6575
fn finalize(self) -> Output<Self> {
6676
let output_bytes = if self.token.get() {
6777
unsafe {
68-
ManuallyDrop::into_inner(self.inner.clmul)
78+
ManuallyDrop::into_inner(self.inner.intrinsics)
6979
.finalize()
7080
.into_bytes()
7181
}
@@ -85,7 +95,7 @@ impl Clone for Polyval {
8595
fn clone(&self) -> Self {
8696
let inner = if self.token.get() {
8797
Inner {
88-
clmul: ManuallyDrop::new(unsafe { (*self.inner.clmul).clone() }),
98+
intrinsics: ManuallyDrop::new(unsafe { (*self.inner.intrinsics).clone() }),
8999
}
90100
} else {
91101
Inner {

polyval/src/backend/clmul.rs

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,4 @@
1-
//! **POLYVAL**: GHASH-like universal hash over GF(2^128).
2-
//!
3-
//! CLMUL-accelerated implementation for modern x86/x86_64 CPUs
1+
//! Intel `CLMUL`-accelerated implementation for modern x86/x86_64 CPUs
42
//! (i.e. Intel Sandy Bridge-compatible or newer)
53
64
use crate::{Block, Key};
@@ -26,9 +24,10 @@ impl NewUniversalHash for Polyval {
2624
unsafe {
2725
// `_mm_loadu_si128` performs an unaligned load
2826
#[allow(clippy::cast_ptr_alignment)]
29-
let h = _mm_loadu_si128(h.as_ptr() as *const __m128i);
30-
let y = _mm_setzero_si128();
31-
Self { h, y }
27+
Self {
28+
h: _mm_loadu_si128(h.as_ptr() as *const __m128i),
29+
y: _mm_setzero_si128(),
30+
}
3231
}
3332
}
3433
}

polyval/src/backend/pmull.rs

Lines changed: 117 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,117 @@
1+
//! ARMv8 `PMULL`-accelerated implementation of POLYVAL.
2+
//!
3+
//! Based on this C intrinsics implementation:
4+
//! <https://github.com/noloader/AES-Intrinsics/blob/master/clmul-arm.c>
5+
//!
6+
//! Original C written and placed in public domain by Jeffrey Walton.
7+
//! Based on code from ARM, and by Johannes Schneiders, Skip Hovsmith and
8+
//! Barry O'Rourke for the mbedTLS project.
9+
//!
10+
//! For more information about PMULL, see:
11+
//! - <https://developer.arm.com/documentation/100069/0608/A64-SIMD-Vector-Instructions/PMULL--PMULL2--vector->
12+
//! - <https://eprint.iacr.org/2015/688.pdf>
13+
14+
use crate::{Block, Key};
15+
use core::{arch::aarch64::*, mem};
16+
use universal_hash::{consts::U16, NewUniversalHash, Output, UniversalHash};
17+
18+
/// **POLYVAL**: GHASH-like universal hash over GF(2^128).
19+
#[derive(Clone)]
20+
pub struct Polyval {
21+
h: uint8x16_t,
22+
y: uint8x16_t,
23+
}
24+
25+
impl NewUniversalHash for Polyval {
26+
type KeySize = U16;
27+
28+
/// Initialize POLYVAL with the given `H` field element
29+
fn new(h: &Key) -> Self {
30+
unsafe {
31+
Self {
32+
h: vld1q_u8(h.as_ptr()),
33+
y: vdupq_n_u8(0), // all zeroes
34+
}
35+
}
36+
}
37+
}
38+
39+
impl UniversalHash for Polyval {
40+
type BlockSize = U16;
41+
42+
#[inline]
43+
fn update(&mut self, x: &Block) {
44+
unsafe {
45+
self.mul(x);
46+
}
47+
}
48+
49+
/// Reset internal state
50+
fn reset(&mut self) {
51+
unsafe {
52+
self.y = vdupq_n_u8(0);
53+
}
54+
}
55+
56+
/// Get GHASH output
57+
fn finalize(self) -> Output<Self> {
58+
unsafe { mem::transmute(self.y) }
59+
}
60+
}
61+
62+
impl Polyval {
63+
/// Mask value used when performing reduction.
64+
/// This corresponds to POLYVAL's polynomial with the highest bit unset.
65+
const MASK: u128 = 1 << 127 | 1 << 126 | 1 << 121 | 1;
66+
67+
/// POLYVAL carryless multiplication.
68+
// TODO(tarcieri): investigate ordering optimizations and fusions e.g.`fuse-crypto-eor`
69+
#[inline]
70+
#[target_feature(enable = "neon")]
71+
#[target_feature(enable = "crypto")]
72+
unsafe fn mul(&mut self, x: &Block) {
73+
let h = self.h;
74+
let y = veorq_u8(self.y, vld1q_u8(x.as_ptr()));
75+
76+
// polynomial multiply
77+
let z = vdupq_n_u8(0);
78+
let r0 = pmull::<0, 0>(h, y);
79+
let r1 = pmull::<1, 1>(h, y);
80+
let t0 = pmull::<0, 1>(h, y);
81+
let t1 = pmull::<1, 0>(h, y);
82+
let t0 = veorq_u8(t0, t1);
83+
let t1 = vextq_u8(z, t0, 8);
84+
let r0 = veorq_u8(r0, t1);
85+
let t1 = vextq_u8(t0, z, 8);
86+
let r1 = veorq_u8(r1, t1);
87+
88+
// polynomial reduction
89+
let p = mem::transmute(Self::MASK);
90+
let t0 = pmull::<0, 1>(r0, p);
91+
let t1 = vextq_u8(t0, t0, 8);
92+
let r0 = veorq_u8(r0, t1);
93+
let t1 = pmull::<1, 1>(r0, p);
94+
let r0 = veorq_u8(r0, t1);
95+
96+
self.y = veorq_u8(r0, r1);
97+
}
98+
}
99+
100+
/// Wrapper for the ARM64 `PMULL` instruction.
101+
#[inline(always)]
102+
unsafe fn pmull<const A_LANE: i32, const B_LANE: i32>(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t {
103+
mem::transmute(vmull_p64(
104+
vgetq_lane_u64(vreinterpretq_u64_u8(a), A_LANE),
105+
vgetq_lane_u64(vreinterpretq_u64_u8(b), B_LANE),
106+
))
107+
}
108+
109+
// TODO(tarcieri): zeroize support
110+
// #[cfg(feature = "zeroize")]
111+
// impl Drop for Polyval {
112+
// fn drop(&mut self) {
113+
// use zeroize::Zeroize;
114+
// self.h.zeroize();
115+
// self.y.zeroize();
116+
// }
117+
// }

0 commit comments

Comments
 (0)