
Commit c98293b

Rewrite AVX2Encoder to not use generics
1 parent c8ff7da · commit c98293b

File tree

1 file changed: +95 −116 lines changed

src/engine/avx2/mod.rs

Lines changed: 95 additions & 116 deletions
@@ -3,9 +3,6 @@ use crate::engine::Config;
 use crate::engine::DecodeEstimate;
 use crate::{DecodeError, PAD_BYTE};
 
-use core::marker::PhantomData;
-
-
 #[cfg(target_arch = "x86")]
 use core::arch::x86::*;
 #[cfg(target_arch = "x86_64")]
@@ -28,41 +25,75 @@ const DECODED_CHUNK_LEN: usize = 24;
 /// - It has to use unsafe code because intrinsics are always unsafe in Rust.
 /// - The algorithm in use makes specific assumptions about the alphabet, so it's only implemented
 /// for the STANDARD and URL_SAFE Alphabet
-pub struct AVX2Encoder<A> {
+pub struct AVX2Encoder {
     config: AVX2Config,
-    alp: PhantomData<A>,
-}
 
-impl<A> AVX2Encoder<A> {
-    /// Create an AVX2Encoder from a given config.
-    ///
-    /// You can either select the Alphabet by defining the type specifically:
-    /// ```rust
-    /// let engine: AVX2Encoder<Standard> = AVX2Encoder::from(AVX2Config::default());
-    /// ```
-    /// or by calling one of the associated functions [`from_standard`] and [`from_urlsafe`].
-    pub const fn from(config: AVX2Config) -> Self {
-        Self {
-            config,
-            alp: PhantomData,
-        }
-    }
+    // Alphabet LUT for serial steps
+    encode_table: [u8; 64],
+    decode_table: [u8; 256],
+
+    // Alphabet LUT for vectorized steps
+    encode_offsets: __m256i,
+    decode_offsets: __m256i,
 }
-impl AVX2Encoder<Standard> {
-    /// Create an AVX2Encoder for the STANDARD alphabet with the given config.
-    pub const fn from_standard(config: AVX2Config) -> Self {
+
+impl AVX2Encoder {
+    /// Create an AVX2Encoder for the standard Alphabet from a given config.
+    /// You can create one for urlsafe with the associated function [`from_urlsafe`].
+    pub fn from_standard(config: AVX2Config) -> Self {
+        let encode_offsets = unsafe {
+            _mm256_setr_epi8(
+                71, -4, -4, -4, -4, -4, -4, -4,
+                -4, -4, -4,-19,-16, 65, 0, 0,
+                71, -4, -4, -4, -4, -4, -4, -4,
+                -4, -4, -4,-19,-16, 65, 0, 0,
+            )
+        };
+
+        let decode_offsets = unsafe {
+            _mm256_setr_epi8(
+                0, 16, 19, 4, -65, -65, -71, -71, 0, 0, 0, 0, 0, 0, 0, 0,
+                0, 16, 19, 4, -65, -65, -71, -71, 0, 0, 0, 0, 0, 0, 0, 0
+            )
+        };
+
         Self {
             config,
-            alp: PhantomData,
+
+            encode_table: ENCODE_TABLE,
+            decode_table: DECODE_TABLE,
+
+            encode_offsets,
+            decode_offsets,
         }
     }
-}
-impl AVX2Encoder<Urlsafe> {
-    /// Create an AVX2Encoder for the STANDARD alphabet with the given config.
-    pub const fn from_url_safe(config: AVX2Config) -> Self {
+
+    /// Create an AVX2Encoder for the urlsafe alphabet with the given config.
+    /// You can create one for standard with the associated function [`from_standard`].
+    pub fn from_url_safe(config: AVX2Config) -> Self {
+        let encode_offsets = unsafe {
+            _mm256_setr_epi8(
+                71, -4, -4, -4, -4, -4, -4, -4,
+                -4, -4, -4,-17, 32, 65, 0, 0,
+                71, -4, -4, -4, -4, -4, -4, -4,
+                -4, -4, -4,-17, 32, 65, 0, 0,
+            )
+        };
+
+        let decode_offsets = unsafe {
+            _mm256_setr_epi8(
+                0, -32, 17, 4, -65, -65, -71, -71, 0, 0, 0, 0, 0, 0, 0, 0,
+                0, -32, 17, 4, -65, -65, -71, -71, 0, 0, 0, 0, 0, 0, 0, 0
+            )
+        };
+
         Self {
             config,
-            alp: PhantomData,
+
+            encode_table: URL_ENCODE_TABLE,
+            decode_table: URL_DECODE_TABLE,
+
+            encode_offsets,
+            decode_offsets,
         }
     }
 }
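
For reference, the call-site change this hunk implies — a brief sketch only, since it needs the surrounding crate to compile; the commented-out line is quoted from the removed doc comment, the two live lines use the constructors added here:

```rust
// Before: the alphabet was selected through a type parameter (from the removed docs).
// let engine: AVX2Encoder<Standard> = AVX2Encoder::from(AVX2Config::default());

// After: the alphabet is selected by the constructor, which stores the scalar
// lookup tables and the AVX2 offset vectors directly in the struct.
let standard = AVX2Encoder::from_standard(AVX2Config::default());
let url_safe = AVX2Encoder::from_url_safe(AVX2Config::default());
```

This trades a little per-instance storage (two tables plus two `__m256i` values) for a non-generic public type.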
@@ -95,8 +126,9 @@ impl DecodeEstimate for AVX2Estimate {
 }
 
 
-#[inline]
+#[inline(always)]
 unsafe fn load_block(input: __m256i) -> __m256i {
+    // TODO: Explain this load shuffle
     let i: __m256i = _mm256_shuffle_epi8(input, _mm256_set_epi8(
         10, 11, 9, 10, 7, 8, 6, 7, 4, 5, 3, 4, 1, 2, 0, 1,
         14, 15, 13, 14, 11, 12, 10, 11, 8, 9, 7, 8, 5, 6, 4, 5
@@ -117,6 +149,7 @@ unsafe fn decode(
     mask_2f: __m256i,
     block: __m256i
 ) -> __m256i {
+    // TODO: Explain this decode step
     let hi_nibbles = _mm256_srli_epi32(block, 4);
     let lo_nibbles = _mm256_and_si256(block, mask_2f);
     let lo = _mm256_shuffle_epi8(lut_lo, lo_nibbles);
@@ -146,6 +179,10 @@ unsafe fn decode(
 }
 
 #[inline(always)]
+/// decode_masked is a version of decode specialized for partial input.
+/// The only difference between it and the unmasked version is that the test that checks for
+/// invalid bytes (which is `a AND b` over a,b := 256-bit vector) gets the same input mask applied,
+/// since `0` bytes would in fact be an invalid input.
 unsafe fn decode_masked(
     invalid: &mut bool,
     lut_lo: __m256i,
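
A scalar analogue of the masking rule described in the new doc comment, as a sketch only (the helper name and shapes are illustrative, not from this crate): lanes dropped by the masked load read as `0`, and `0x00` is not in any base64 alphabet, so the validity test must only consider lanes selected by the input mask.

```rust
// Illustrative scalar stand-in for the vectorized test: `classified[i]` plays the
// role of the per-byte AND of the two nibble LUT lookups (non-zero means invalid).
fn chunk_has_invalid_byte(classified: &[u8; 32], lane_selected: &[bool; 32]) -> bool {
    classified
        .iter()
        .zip(lane_selected.iter())
        // Without the mask, the zeroed tail lanes would classify as invalid and
        // produce false errors on perfectly valid partial input.
        .any(|(&c, &selected)| selected && c != 0)
}
```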
@@ -188,65 +225,14 @@ unsafe fn decode_masked(
     ))
 }
 
-#[doc(hidden)]
-pub trait AvxAlp: Send + Sync {
-    unsafe fn encode(input: __m256i) -> __m256i;
-    fn encode_table() -> &'static [u8; 64];
-    fn decode_table() -> &'static [u8; 256];
-}
-
-#[doc(hidden)]
-pub struct Standard;
-impl AvxAlp for Standard {
-    #[inline]
-    unsafe fn encode(input: __m256i) -> __m256i {
-        let mut result: __m256i = _mm256_subs_epu8(input, _mm256_set1_epi8(51));
-        let less: __m256i = _mm256_cmpgt_epi8(_mm256_set1_epi8(26), input);
-        result = _mm256_or_si256(result, _mm256_and_si256(less, _mm256_set1_epi8(13)));
-        let offsets: __m256i = _mm256_setr_epi8(
-            71, -4, -4, -4, -4, -4, -4, -4,
-            -4, -4, -4,-19,-16, 65, 0, 0,
-            71, -4, -4, -4, -4, -4, -4, -4,
-            -4, -4, -4,-19,-16, 65, 0, 0,
-        );
-        result = _mm256_shuffle_epi8(offsets, result);
-        return _mm256_add_epi8(result, input);
-    }
-
-    fn encode_table() -> &'static [u8; 64] {
-        &ENCODE_TABLE
-    }
-
-    fn decode_table() -> &'static [u8; 256] {
-        &DECODE_TABLE
-    }
-}
-
-#[doc(hidden)]
-pub struct Urlsafe;
-impl AvxAlp for Urlsafe {
-    #[inline]
-    unsafe fn encode(input: __m256i) -> __m256i {
-        let mut result: __m256i = _mm256_subs_epu8(input, _mm256_set1_epi8(51));
-        let less: __m256i = _mm256_cmpgt_epi8(_mm256_set1_epi8(26), input);
-        result = _mm256_or_si256(result, _mm256_and_si256(less, _mm256_set1_epi8(13)));
-        let offsets: __m256i = _mm256_setr_epi8(
-            71, -4, -4, -4, -4, -4, -4, -4,
-            -4, -4, -4,-17, 32, 65, 0, 0,
-            71, -4, -4, -4, -4, -4, -4, -4,
-            -4, -4, -4,-17, 32, 65, 0, 0,
-        );
-        result = _mm256_shuffle_epi8(offsets, result);
-        return _mm256_add_epi8(result, input);
-    }
-
-    fn encode_table() -> &'static [u8; 64] {
-        &URL_ENCODE_TABLE
-    }
 
-    fn decode_table() -> &'static [u8; 256] {
-        &URL_DECODE_TABLE
-    }
+#[inline]
+unsafe fn encode(offsets: __m256i, input: __m256i) -> __m256i {
+    let mut result: __m256i = _mm256_subs_epu8(input, _mm256_set1_epi8(51));
+    let less: __m256i = _mm256_cmpgt_epi8(_mm256_set1_epi8(26), input);
+    result = _mm256_or_si256(result, _mm256_and_si256(less, _mm256_set1_epi8(13)));
+    result = _mm256_shuffle_epi8(offsets, result);
+    return _mm256_add_epi8(result, input);
 }
 
 const ENCODE_TABLE: [u8; 64] =
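
The offset vector now passed into `encode` is the usual per-class ASCII-offset trick: `_mm256_subs_epu8`/`_mm256_cmpgt_epi8` turn each 6-bit value into a small class index and `_mm256_shuffle_epi8` looks up the offset to add. A scalar sketch of that mapping for the standard alphabet (illustrative only; the constants are the ones visible in `encode_offsets` above):

```rust
// Scalar view of the per-class offsets (65, 71, -4, -19, -16) used for the
// standard alphabet: add the class offset to the 6-bit value to get its ASCII code.
fn encode_sixbit_standard(v: u8) -> u8 {
    assert!(v < 64);
    let offset: i16 = match v {
        0..=25 => 65,  // 'A'..='Z':  0 + 65 = b'A'
        26..=51 => 71, // 'a'..='z': 26 + 71 = b'a'
        52..=61 => -4, // '0'..='9': 52 -  4 = b'0'
        62 => -19,     // '+':       62 - 19 = b'+'
        _ => -16,      // '/':       63 - 16 = b'/'
    };
    (v as i16 + offset) as u8
}
```

The url-safe variant only swaps the last two offsets (-17 and 32), mapping 62 to '-' and 63 to '_'.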
@@ -260,7 +246,7 @@ const URL_DECODE_TABLE: [u8; 256] =
 
 const MASKLOAD: [i32; 16] = [-1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0];
 
-impl<A: AvxAlp> super::Engine for AVX2Encoder<A> {
+impl super::Engine for AVX2Encoder {
     type Config = AVX2Config;
     type DecodeEstimate = AVX2Estimate;
 
@@ -306,7 +292,7 @@ impl<A: AvxAlp> super::Engine for AVX2Encoder<A> {
                 _mm256_set_epi32(SKIP,LOAD,LOAD,LOAD,LOAD,LOAD,LOAD,SKIP));
 
             let expanded: __m256i = load_block(block);
-            let outblock: __m256i = A::encode(expanded);
+            let outblock: __m256i = encode(self.encode_offsets, expanded);
             _mm256_storeu_si256(output_chunk.as_mut_ptr().cast(), outblock);
 
             output_index += 32;
@@ -328,7 +314,7 @@ impl<A: AvxAlp> super::Engine for AVX2Encoder<A> {
             // First step: Expand the 24 input bytes into 32 bytes ready for encoding.
             let expanded: __m256i = load_block(block);
             // Second step: Do the actual conversion
-            let outblock: __m256i = A::encode(expanded);
+            let outblock: __m256i = encode(self.encode_offsets, expanded);
             // Third step: Write the data into the output
             _mm256_storeu_si256(output_chunk.as_mut_ptr().cast(), outblock);
 
@@ -356,18 +342,16 @@ impl<A: AvxAlp> super::Engine for AVX2Encoder<A> {
 
         const LOW_SIX_BITS_U8: u8 = 0b111111;
 
-        let encode_table = A::encode_table();
-
         while input_index < start_of_rem {
             let input_chunk = &input[input_index..(input_index + 3)];
             let output_chunk = &mut output[output_index..(output_index + 4)];
 
-            output_chunk[0] = encode_table[(input_chunk[0] >> 2) as usize];
-            output_chunk[1] = encode_table
+            output_chunk[0] = self.encode_table[(input_chunk[0] >> 2) as usize];
+            output_chunk[1] = self.encode_table
                 [((input_chunk[0] << 4 | input_chunk[1] >> 4) & LOW_SIX_BITS_U8) as usize];
-            output_chunk[2] = encode_table
+            output_chunk[2] = self.encode_table
                 [((input_chunk[1] << 2 | input_chunk[2] >> 6) & LOW_SIX_BITS_U8) as usize];
-            output_chunk[3] = encode_table[(input_chunk[2] & LOW_SIX_BITS_U8) as usize];
+            output_chunk[3] = self.encode_table[(input_chunk[2] & LOW_SIX_BITS_U8) as usize];
 
             input_index += 3;
             output_index += 4;
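
The serial remainder loop above (now indexing `self.encode_table`) splits every 3-byte group into four 6-bit table indices. A compact, self-contained sketch of just that bit-twiddling, using the same shifts and mask as the diff (the function name is illustrative):

```rust
// Each 3-byte input chunk yields four 6-bit indices into the 64-entry encode table.
fn six_bit_indices(chunk: [u8; 3]) -> [u8; 4] {
    const LOW_SIX_BITS_U8: u8 = 0b111111;
    [
        chunk[0] >> 2,                                     // top 6 bits of byte 0
        (chunk[0] << 4 | chunk[1] >> 4) & LOW_SIX_BITS_U8, // low 2 of byte 0, high 4 of byte 1
        (chunk[1] << 2 | chunk[2] >> 6) & LOW_SIX_BITS_U8, // low 4 of byte 1, high 2 of byte 2
        chunk[2] & LOW_SIX_BITS_U8,                        // low 6 bits of byte 2
    ]
}
```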
@@ -377,18 +361,18 @@ impl<A: AvxAlp> super::Engine for AVX2Encoder<A> {
 
         if rem == 2 {
             let final_input = input.len()-2;
-            output[output_index] = encode_table[(input[final_input] >> 2) as usize];
+            output[output_index] = self.encode_table[(input[final_input] >> 2) as usize];
             output[output_index + 1] =
-                encode_table[((input[final_input] << 4 | input[final_input + 1] >> 4)
+                self.encode_table[((input[final_input] << 4 | input[final_input + 1] >> 4)
                     & LOW_SIX_BITS_U8) as usize];
             output[output_index + 2] =
-                encode_table[((input[final_input + 1] << 2) & LOW_SIX_BITS_U8) as usize];
+                self.encode_table[((input[final_input + 1] << 2) & LOW_SIX_BITS_U8) as usize];
             output_index += 3;
         } else if rem == 1 {
             let final_input = input.len()-1;
-            output[output_index] = encode_table[(input[final_input] >> 2) as usize];
+            output[output_index] = self.encode_table[(input[final_input] >> 2) as usize];
             output[output_index + 1] =
-                encode_table[((input[final_input] << 4) & LOW_SIX_BITS_U8) as usize];
+                self.encode_table[((input[final_input] << 4) & LOW_SIX_BITS_U8) as usize];
             output_index += 2;
         }
 
@@ -405,14 +389,13 @@ impl<A: AvxAlp> super::Engine for AVX2Encoder<A> {
         output: &mut [u8],
         _estimate: Self::DecodeEstimate,
     ) -> Result<usize, DecodeError> {
-        let decode_table = A::decode_table();
         // TODO: Check if LLVM optimizes this modulo into an &
         let skip_stage_2 = match input.len() % 4 {
             1 => {
                 // trailing whitespace is so common that it's worth it to check the last byte to
                 // possibly return a better error message
                 if let Some(b) = input.last() {
-                    if *b != PAD_BYTE && decode_table[*b as usize] == INVALID_VALUE {
+                    if *b != PAD_BYTE && self.decode_table[*b as usize] == INVALID_VALUE {
                         return Err(DecodeError::InvalidByte(input.len() - 1, *b));
                     }
                 }
@@ -487,10 +470,6 @@ impl<A: AvxAlp> super::Engine for AVX2Encoder<A> {
             0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08,
             0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10
         )};
-        let lut_roll = unsafe {_mm256_setr_epi8(
-            0, 16, 19, 4, -65, -65, -71, -71, 0, 0, 0, 0, 0, 0, 0, 0,
-            0, 16, 19, 4, -65, -65, -71, -71, 0, 0, 0, 0, 0, 0, 0, 0
-        )};
         let mask_2f = unsafe { _mm256_set1_epi8(0x2F) };
 
         // This will only evaluate to true if we have an input of 33 bytes or more;
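
The `lut_roll` vector deleted here is the same data the constructors now store as `self.decode_offsets`. Its values (0, 16, 19, 4, -65, -65, -71, -71) are per-class offsets that roll an ASCII character back to its 6-bit value; a scalar sketch for the standard alphabet (illustrative, not crate code):

```rust
// Scalar view of the decode offsets: add one class offset to the ASCII code to
// recover the 6-bit value, or reject the byte as invalid.
fn decode_ascii_standard(c: u8) -> Option<u8> {
    let offset: i16 = match c {
        b'A'..=b'Z' => -65, // 'A' -> 0
        b'a'..=b'z' => -71, // 'a' -> 26
        b'0'..=b'9' => 4,   // '0' -> 52
        b'+' => 19,         // '+' -> 62
        b'/' => 16,         // '/' -> 63
        _ => return None,   // anything else is not in the alphabet
    };
    Some((c as i16 + offset) as u8)
}
```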
@@ -514,10 +493,10 @@ impl<A: AvxAlp> super::Engine for AVX2Encoder<A> {
 
             unsafe {
                 block = _mm256_loadu_si256(input_chunk.as_ptr().cast());
-                block = decode(&mut invalid, lut_lo, lut_hi, lut_roll, mask_2f, block);
+                block = decode(&mut invalid, lut_lo, lut_hi, self.decode_offsets, mask_2f, block);
 
                 if invalid {
-                    return Err(find_invalid_input(input_index, input_chunk, decode_table));
+                    return Err(find_invalid_input(input_index, input_chunk, &self.decode_table));
                 }
 
                 _mm256_storeu_si256(output_chunk.as_mut_ptr().cast(), block);
@@ -551,12 +530,12 @@ impl<A: AvxAlp> super::Engine for AVX2Encoder<A> {
                 let mask_output = _mm256_loadu_si256(MASKLOAD[2..10].as_ptr().cast());
 
                 block = _mm256_loadu_si256(input_chunk.as_ptr().cast());
-                block = decode(&mut invalid, lut_lo, lut_hi, lut_roll, mask_2f, block);
+                block = decode(&mut invalid, lut_lo, lut_hi, self.decode_offsets, mask_2f, block);
 
                 _mm256_maskstore_epi32(output_chunk.as_mut_ptr().cast(), mask_output, block);
             }
             if invalid {
-                return Err(find_invalid_input(input_index, input_chunk, decode_table));
+                return Err(find_invalid_input(input_index, input_chunk, &self.decode_table));
             }
 
             input_index += 32;
@@ -604,10 +583,10 @@ impl<A: AvxAlp> super::Engine for AVX2Encoder<A> {
             block = _mm256_maskload_epi32(input_chunk.as_ptr().cast(), mask_input);
             let outblock
                 = decode_masked(&mut invalid,
-                    lut_lo, lut_hi, lut_roll, mask_2f, mask_input, block);
+                    lut_lo, lut_hi, self.decode_offsets, mask_2f, mask_input, block);
 
             if invalid {
-                return Err(find_invalid_input(input_index, input_chunk, decode_table));
+                return Err(find_invalid_input(input_index, input_chunk, &self.decode_table));
             }
 
             _mm256_maskstore_epi32(output_chunk.as_mut_ptr().cast(), mask_output, outblock);
@@ -677,7 +656,7 @@ impl<A: AvxAlp> super::Engine for AVX2Encoder<A> {
             // can use up to 8 * 6 = 48 bits of the u64, if last chunk has no padding.
             // Pack the leftovers from left to right.
             let shift = 64 - (morsels_in_leftover + 1) * 6;
-            let morsel = decode_table[*b as usize];
+            let morsel = self.decode_table[*b as usize];
             if morsel == INVALID_VALUE {
                 return Err(DecodeError::InvalidByte(start_of_leftovers + i, *b));
             }
@@ -729,7 +708,7 @@ impl<A: AvxAlp> super::Engine for AVX2Encoder<A> {
     }
 }
 
-fn find_invalid_input(input_index: usize, input: &[u8], decode_table: &'static [u8; 256]) -> DecodeError {
+fn find_invalid_input(input_index: usize, input: &[u8], decode_table: &[u8; 256]) -> DecodeError {
     // Figure out which byte was invalid exactly.
     for i in 0..input.len() {
         let byte = input[i];
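
The hunk above ends mid-function, but the visible change is the point: dropping the `&'static` bound lets callers pass `&self.decode_table`. For orientation, a hedged sketch of what such a serial rescan typically looks like once the SIMD path has flagged a chunk — the body below and the sentinel value are assumptions, not lines from this commit:

```rust
// Hypothetical shape of the scan (the real function returns a DecodeError):
// walk the flagged chunk and report the first byte the decode table rejects.
const INVALID_VALUE: u8 = 255; // assumed sentinel, common in base64 decode tables

fn find_invalid_input_sketch(
    input_index: usize,
    input: &[u8],
    decode_table: &[u8; 256],
) -> Option<(usize, u8)> {
    for (i, &byte) in input.iter().enumerate() {
        if decode_table[byte as usize] == INVALID_VALUE {
            return Some((input_index + i, byte)); // absolute offset + offending byte
        }
    }
    None
}
```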
