polyval: ARMv8 PMULL backend (#126)

tarcieri · web-flow · commit 2ba387d54de5 · 2021-05-31T12:36:18.000-07:00
The ARMv8 Cryptography Extensions provide a CLMUL-like instruction called PMULL which can be used for computing GHASH/POLYVAL. Additional background: <https://eprint.iacr.org/2015/688.pdf>. This commit adds a nightly-only PMULL-accelerated backend for POLYVAL based on this public domain C intrinsics implementation: <https://github.com/noloader/AES-Intrinsics/blob/master/clmul-arm.c>. Adapting it to POLYVAL required changes similar to the CLMUL backend, namely adapting the mask to use POLYVAL's polynomial (which is the reverse of GHASH's), and some additional work in the reduction to make it "desrever" (as Shay Gueron likes to say). Performance seems suboptimal, but it is still roughly an order of magnitude better than the software implementation. It seems that ARMv8 CPUs support a number of instruction fusions with PMULL, e.g. `fuse-crypto-eor`, and we should investigate those. Additionally, it seems like we could better schedule operations on multiple blocks in parallel.
diff --git a/.github/workflows/polyval.yml b/.github/workflows/polyval.yml
@@ -180,4 +180,27 @@ jobs:
       - run: cross test --target ${{ matrix.target }} --release --features force-soft
       - run: cross test --target ${{ matrix.target }} --release --features std
       - run: cross test --target ${{ matrix.target }} --release --features zeroize
-      - run: cross test --target ${{ matrix.target }} --release --all-features
+
+  # ARMv8 cross-compiled tests for PMULL intrinsics (nightly-only)
+  armv8:
+    strategy:
+      matrix:
+        include:
+          - target: aarch64-unknown-linux-gnu
+            rust: nightly
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v1
+      - run: ${{ matrix.deps }}
+      - uses: actions-rs/toolchain@v1
+        with:
+          toolchain: ${{ matrix.rust }}
+          target: ${{ matrix.target }}
+          profile: minimal
+          override: true
+      - run: cargo install cross
+      - run: cross test --release --target ${{ matrix.target }} --features armv8
+      - run: cross test --release --target ${{ matrix.target }} --features armv8,force-soft
+      - run: cross test --release --target ${{ matrix.target }} --features armv8,std
+      - run: cross test --release --target ${{ matrix.target }} --features armv8,zeroize
+      - run: cross test --release --target ${{ matrix.target }} --all-features
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/ghash/Cargo.toml b/ghash/Cargo.toml
@@ -24,4 +24,5 @@ hex-literal = "0.2"
 
 [features]
 std = ["polyval/std"]
+armv8 = ["polyval/armv8"]
 force-soft = ["polyval/force-soft"]
diff --git a/polyval/Cargo.toml b/polyval/Cargo.toml
@@ -15,19 +15,21 @@ categories = ["cryptography", "no-std"]
 edition = "2018"
 
 [dependencies]
+cfg-if = "1"
 opaque-debug = "0.3"
 universal-hash = { version = "0.4", default-features = false }
 zeroize = { version = "1.3", optional = true, default-features = false }
 
-[target.'cfg(any(target_arch = "x86_64", target_arch = "x86"))'.dependencies]
-cpufeatures = "0.1"
+[target.'cfg(any(target_arch = "aarch64", target_arch = "x86_64", target_arch = "x86"))'.dependencies]
+cpufeatures = "0.1.4"
 
 [dev-dependencies]
 hex-literal = "0.2"
 
 [features]
-force-soft = [] # Disable support for hardware intrinsics (CLMUL)
 std = ["universal-hash/std"]
+armv8 = [] # Enable nightly-only ARMv8 intrinsics support
+force-soft = [] # Disable support for hardware intrinsics
 
 [package.metadata.docs.rs]
 all-features = true
diff --git a/polyval/src/backend.rs b/polyval/src/backend.rs
@@ -1,29 +1,24 @@
 //! POLYVAL backends
 
-#[cfg(all(
-    any(target_arch = "x86", target_arch = "x86_64"),
-    not(feature = "force-soft")
-))]
-pub(crate) mod autodetect;
-
-#[cfg(all(
-    any(target_arch = "x86", target_arch = "x86_64"),
-    not(feature = "force-soft")
-))]
-pub(crate) mod clmul;
-
 #[cfg_attr(not(target_pointer_width = "64"), path = "backend/soft32.rs")]
 #[cfg_attr(target_pointer_width = "64", path = "backend/soft64.rs")]
-pub(crate) mod soft;
+mod soft;
 
-#[cfg(all(
-    any(target_arch = "x86", target_arch = "x86_64"),
-    not(feature = "force-soft")
-))]
-pub use crate::backend::autodetect::Polyval;
+use cfg_if::cfg_if;
 
-#[cfg(not(all(
-    any(target_arch = "x86", target_arch = "x86_64"),
-    not(feature = "force-soft")
-)))]
-pub use crate::backend::soft::Polyval;
+cfg_if! {
+    if #[cfg(all(target_arch = "aarch64", feature = "armv8", not(feature = "force-soft")))] {
+        mod autodetect;
+        mod pmull;
+        pub use crate::backend::autodetect::Polyval;
+    } else if #[cfg(all(
+        any(target_arch = "x86_64", target_arch = "x86"),
+        not(feature = "force-soft")
+    ))] {
+        mod autodetect;
+        mod clmul;
+        pub use crate::backend::autodetect::Polyval;
+    } else {
+        pub use crate::backend::soft::Polyval;
+    }
+}
diff --git a/polyval/src/backend/autodetect.rs b/polyval/src/backend/autodetect.rs
@@ -1,37 +1,47 @@
-//! Autodetection for (P)CLMUL(QDQ) CPU intrinsics on x86 CPUs, with fallback
-//! to the "soft" backend when it's unavailable.
+//! Autodetection for CPU intrinsics, with fallback to the "soft" backend when
+//! they are unavailable.
 
-use crate::{backend, Block, Key};
+use crate::{backend::soft, Block, Key};
 use core::mem::ManuallyDrop;
 use universal_hash::{consts::U16, NewUniversalHash, Output, UniversalHash};
 
-cpufeatures::new!(clmul_cpuid, "pclmulqdq", "sse4.1");
+#[cfg(all(target_arch = "aarch64", feature = "armv8"))]
+use super::pmull as intrinsics;
+
+#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
+use super::clmul as intrinsics;
+
+#[cfg(all(target_arch = "aarch64", feature = "armv8"))]
+cpufeatures::new!(mul_intrinsics, "aes"); // `aes` implies PMULL
+
+#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
+cpufeatures::new!(mul_intrinsics, "pclmulqdq", "sse4.1");
 
 /// **POLYVAL**: GHASH-like universal hash over GF(2^128).
 pub struct Polyval {
     inner: Inner,
-    token: clmul_cpuid::InitToken,
+    token: mul_intrinsics::InitToken,
 }
 
 union Inner {
-    clmul: ManuallyDrop<backend::clmul::Polyval>,
-    soft: ManuallyDrop<backend::soft::Polyval>,
+    intrinsics: ManuallyDrop<intrinsics::Polyval>,
+    soft: ManuallyDrop<soft::Polyval>,
 }
 
 impl NewUniversalHash for Polyval {
     type KeySize = U16;
 
     /// Initialize POLYVAL with the given `H` field element
     fn new(h: &Key) -> Self {
-        let (token, clmul_present) = clmul_cpuid::init_get();
+        let (token, has_intrinsics) = mul_intrinsics::init_get();
 
-        let inner = if clmul_present {
+        let inner = if has_intrinsics {
             Inner {
-                clmul: ManuallyDrop::new(backend::clmul::Polyval::new(h)),
+                intrinsics: ManuallyDrop::new(intrinsics::Polyval::new(h)),
             }
         } else {
             Inner {
-                soft: ManuallyDrop::new(backend::soft::Polyval::new(h)),
+                soft: ManuallyDrop::new(soft::Polyval::new(h)),
             }
         };
 
@@ -46,7 +56,7 @@ impl UniversalHash for Polyval {
     #[inline]
     fn update(&mut self, x: &Block) {
         if self.token.get() {
-            unsafe { (*self.inner.clmul).update(x) }
+            unsafe { (*self.inner.intrinsics).update(x) }
         } else {
             unsafe { (*self.inner.soft).update(x) }
         }
@@ -55,7 +65,7 @@ impl UniversalHash for Polyval {
     /// Reset internal state
     fn reset(&mut self) {
         if self.token.get() {
-            unsafe { (*self.inner.clmul).reset() }
+            unsafe { (*self.inner.intrinsics).reset() }
         } else {
             unsafe { (*self.inner.soft).reset() }
         }
@@ -65,7 +75,7 @@ impl UniversalHash for Polyval {
     fn finalize(self) -> Output<Self> {
         let output_bytes = if self.token.get() {
             unsafe {
-                ManuallyDrop::into_inner(self.inner.clmul)
+                ManuallyDrop::into_inner(self.inner.intrinsics)
                     .finalize()
                     .into_bytes()
             }
@@ -85,7 +95,7 @@ impl Clone for Polyval {
     fn clone(&self) -> Self {
         let inner = if self.token.get() {
             Inner {
-                clmul: ManuallyDrop::new(unsafe { (*self.inner.clmul).clone() }),
+                intrinsics: ManuallyDrop::new(unsafe { (*self.inner.intrinsics).clone() }),
             }
         } else {
             Inner {
diff --git a/polyval/src/backend/clmul.rs b/polyval/src/backend/clmul.rs
@@ -1,6 +1,4 @@
-//! **POLYVAL**: GHASH-like universal hash over GF(2^128).
-//!
-//! CLMUL-accelerated implementation for modern x86/x86_64 CPUs
+//! Intel `CLMUL`-accelerated implementation for modern x86/x86_64 CPUs
 //! (i.e. Intel Sandy Bridge-compatible or newer)
 
 use crate::{Block, Key};
@@ -26,9 +24,10 @@ impl NewUniversalHash for Polyval {
         unsafe {
             // `_mm_loadu_si128` performs an unaligned load
             #[allow(clippy::cast_ptr_alignment)]
-            let h = _mm_loadu_si128(h.as_ptr() as *const __m128i);
-            let y = _mm_setzero_si128();
-            Self { h, y }
+            Self {
+                h: _mm_loadu_si128(h.as_ptr() as *const __m128i),
+                y: _mm_setzero_si128(),
+            }
         }
     }
 }
diff --git a/polyval/src/backend/pmull.rs b/polyval/src/backend/pmull.rs
@@ -0,0 +1,117 @@
+//! ARMv8 `PMULL`-accelerated implementation of POLYVAL.
+//!
+//! Based on this C intrinsics implementation:
+//! <https://github.com/noloader/AES-Intrinsics/blob/master/clmul-arm.c>
+//!
+//! Original C written and placed in public domain by Jeffrey Walton.
+//! Based on code from ARM, and by Johannes Schneiders, Skip Hovsmith and
+//! Barry O'Rourke for the mbedTLS project.
+//!
+//! For more information about PMULL, see:
+//! - <https://developer.arm.com/documentation/100069/0608/A64-SIMD-Vector-Instructions/PMULL--PMULL2--vector->
+//! - <https://eprint.iacr.org/2015/688.pdf>
+
+use crate::{Block, Key};
+use core::{arch::aarch64::*, mem};
+use universal_hash::{consts::U16, NewUniversalHash, Output, UniversalHash};
+
+/// **POLYVAL**: GHASH-like universal hash over GF(2^128).
+#[derive(Clone)]
+pub struct Polyval {
+    h: uint8x16_t,
+    y: uint8x16_t,
+}
+
+impl NewUniversalHash for Polyval {
+    type KeySize = U16;
+
+    /// Initialize POLYVAL with the given `H` field element
+    fn new(h: &Key) -> Self {
+        unsafe {
+            Self {
+                h: vld1q_u8(h.as_ptr()),
+                y: vdupq_n_u8(0), // all zeroes
+            }
+        }
+    }
+}
+
+impl UniversalHash for Polyval {
+    type BlockSize = U16;
+
+    #[inline]
+    fn update(&mut self, x: &Block) {
+        unsafe {
+            self.mul(x);
+        }
+    }
+
+    /// Reset internal state
+    fn reset(&mut self) {
+        unsafe {
+            self.y = vdupq_n_u8(0);
+        }
+    }
+
+    /// Get POLYVAL output
+    fn finalize(self) -> Output<Self> {
+        unsafe { mem::transmute(self.y) }
+    }
+}
+
+impl Polyval {
+    /// Mask value used when performing reduction.
+    /// This corresponds to POLYVAL's polynomial with the highest bit unset.
+    const MASK: u128 = 1 << 127 | 1 << 126 | 1 << 121 | 1;
+
+    /// POLYVAL carryless multiplication.
+    // TODO(tarcieri): investigate ordering optimizations and fusions e.g. `fuse-crypto-eor`
+    #[inline]
+    #[target_feature(enable = "neon")]
+    #[target_feature(enable = "crypto")]
+    unsafe fn mul(&mut self, x: &Block) {
+        let h = self.h;
+        let y = veorq_u8(self.y, vld1q_u8(x.as_ptr()));
+
+        // polynomial multiply
+        let z = vdupq_n_u8(0);
+        let r0 = pmull::<0, 0>(h, y);
+        let r1 = pmull::<1, 1>(h, y);
+        let t0 = pmull::<0, 1>(h, y);
+        let t1 = pmull::<1, 0>(h, y);
+        let t0 = veorq_u8(t0, t1);
+        let t1 = vextq_u8(z, t0, 8);
+        let r0 = veorq_u8(r0, t1);
+        let t1 = vextq_u8(t0, z, 8);
+        let r1 = veorq_u8(r1, t1);
+
+        // polynomial reduction
+        let p = mem::transmute(Self::MASK);
+        let t0 = pmull::<0, 1>(r0, p);
+        let t1 = vextq_u8(t0, t0, 8);
+        let r0 = veorq_u8(r0, t1);
+        let t1 = pmull::<1, 1>(r0, p);
+        let r0 = veorq_u8(r0, t1);
+
+        self.y = veorq_u8(r0, r1);
+    }
+}
+
+/// Wrapper for the ARM64 `PMULL` instruction.
+#[inline(always)]
+unsafe fn pmull<const A_LANE: i32, const B_LANE: i32>(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t {
+    mem::transmute(vmull_p64(
+        vgetq_lane_u64(vreinterpretq_u64_u8(a), A_LANE),
+        vgetq_lane_u64(vreinterpretq_u64_u8(b), B_LANE),
+    ))
+}
+
+// TODO(tarcieri): zeroize support
+// #[cfg(feature = "zeroize")]
+// impl Drop for Polyval {
+//     fn drop(&mut self) {
+//         use zeroize::Zeroize;
+//         self.h.zeroize();
+//         self.y.zeroize();
+//     }
+// }
diff --git a/polyval/src/lib.rs b/polyval/src/lib.rs