Start explainering the algorithm

dequbed · dequbed · commit 1046070c3efe · 2021-09-16T18:31:05.000+02:00
diff --git a/src/engine/avx2/mod.rs b/src/engine/avx2/mod.rs
@@ -35,6 +35,12 @@ pub struct AVX2Encoder {
     // Alphabet LUT for vectorized steps
     encode_offsets: __m256i,
     decode_offsets: __m256i,
+
+    // The algorithm in use needs to be able to distinguish between the two singletons outside the
+    // [A-Za-z0-9] ranges.
+    // For STANDARD these are '+' and '/' and the engine matches against '/' i.e. 0x2F
+    // For URL_SAFE these are '-' and '_' and the engine matches against '_' i.e. 0x5F
+    singleton_mask: __m256i,
 }
 
 impl AVX2Encoder {
@@ -43,20 +49,22 @@ impl AVX2Encoder {
     pub fn from_standard(config: AVX2Config) -> Self {
         let encode_offsets = unsafe {
             _mm256_setr_epi8(
-                71, -4, -4, -4, -4, -4, -4, -4,
-                -4, -4, -4,-19,-16, 65,  0,  0,
-                71, -4, -4, -4, -4, -4, -4, -4,
-                -4, -4, -4,-19,-16, 65,  0,  0,
+            //  00  01  02  03  04  05  06  07  08  09  10  11  12  13  14  15
+                71, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4,-19,-16, 65,  0,  0,
+                71, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4,-19,-16, 65,  0,  0,
             )
         };
 
         let decode_offsets = unsafe {
             _mm256_setr_epi8(
+            //  00  01  02  03  04  05  06  07  08  09  10  11  12  13  14  15
                 0, 16, 19, 4, -65, -65, -71, -71, 0, 0, 0, 0, 0, 0, 0, 0,
                 0, 16, 19, 4, -65, -65, -71, -71, 0, 0, 0, 0, 0, 0, 0, 0
             )
         };
 
+        let singleton_mask = unsafe { _mm256_set1_epi8(0x2F) };
+
         Self {
             config,
 
@@ -65,27 +73,30 @@ impl AVX2Encoder {
 
             encode_offsets,
             decode_offsets,
+            singleton_mask,
         }
     }
     /// Create an AVX2Encoder for the urlsafe alphabet with the given config.
     /// You can create one for standard with the associated function [`from_standard`].
     pub fn from_url_safe(config: AVX2Config) -> Self {
         let encode_offsets = unsafe {
             _mm256_setr_epi8(
-                71, -4, -4, -4, -4, -4, -4, -4,
-                -4, -4, -4,-17, 32, 65,  0,  0,
-                71, -4, -4, -4, -4, -4, -4, -4,
-                -4, -4, -4,-17, 32, 65,  0,  0,
+            //  00  01  02  03  04  05  06  07  08  09  10  11  12  13  14  15
+                71, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4,-17, 32, 65,  0,  0,
+                71, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4,-17, 32, 65,  0,  0,
             )
         };
 
         let decode_offsets = unsafe {
             _mm256_setr_epi8(
-                0, -32, 17, 4, -65, -65, -71, -71, 0, 0, 0, 0, 0, 0, 0, 0,
-                0, -32, 17, 4, -65, -65, -71, -71, 0, 0, 0, 0, 0, 0, 0, 0
+            // 00   01  02  03   04   05   06   07  08  09  10  11  12  13  14  15
+                0, -32, 17,  4, -65, -65, -71, -71,  0,  0,  0,  0,  0,  0,  0,  0,
+                0, -32, 17,  4, -65, -65, -71, -71,  0,  0,  0,  0,  0,  0,  0,  0
             )
         };
 
+        let singleton_mask = unsafe { _mm256_set1_epi8(0x2B) };
+
         Self {
             config,
 
@@ -94,6 +105,7 @@ impl AVX2Encoder {
 
             encode_offsets,
             decode_offsets,
+            singleton_mask,
         }
     }
 }
@@ -143,23 +155,67 @@ unsafe fn load_block(input: __m256i) -> __m256i {
 #[inline(always)]
 unsafe fn decode(
     invalid: &mut bool,
-    lut_lo: __m256i,
-    lut_hi: __m256i,
-    lut_roll: __m256i,
-    mask_2f: __m256i,
+    lo_witness_lut: __m256i,
+    hi_witness_lut: __m256i,
+    offsets: __m256i,
+    mask_singleton: __m256i,
     block: __m256i
 ) -> __m256i {
-    // TODO: Explain this decode step
-    let hi_nibbles = _mm256_srli_epi32(block, 4);
-    let lo_nibbles = _mm256_and_si256(block, mask_2f);
-    let lo = _mm256_shuffle_epi8(lut_lo, lo_nibbles);
-    let eq_2f = _mm256_cmpeq_epi8(block, mask_2f);
-    let hi_nibbles = _mm256_and_si256(hi_nibbles, mask_2f);
-    let hi = _mm256_shuffle_epi8(lut_hi, hi_nibbles);
-    let roll = _mm256_shuffle_epi8(lut_roll, _mm256_add_epi8(eq_2f, hi_nibbles));
-    if _mm256_testz_si256(lo, hi) == 0 {
+    // The most relevant information to understand this algorithm is this tidbit:
+    // AVX2 byte shuffles conveniently work like table lookups; c = _mm256_shuffle_epi8(a,b) behaves* like
+    // for i in 0..16 {
+    //     c[i] = a[b[i] & 0xF];
+    //     c[i+16] = a[16 + (b[i+16] & 0xF)];
+    // }
+    // This is the reason why lo_witness_lut, hi_witness_lut, encode_offsets and decode_offsets all have the exact
+    // same values set for each 16-byte half; they are used as Look-Up tables in shuffles.
+    // (* it additionally sets c[i] to 0 if b[i] has its top bit set (>= 128) but that is not used here)
+    //
+    // As a first step, since the indexes available in shuffles are only 0..16 or in other words one
+    // nibble's worth, split each input byte into high and low nibble.
+    // The high nibbles are retrieved by shifting the input by 4 bits and then applying a mask of
+    // 0b1111 to it. The low nibbles are retrieved by not shifting and applying the very same mask.
+    // The "standard" algorithm happens to look for 0x2F ('/') which *also* just happens to have the
+    // lowest 4 bits set to 1, so it could reuse that vector as the nibble mask. The urlsafe one can't.
+    let mask_nib = _mm256_set1_epi8(0b00001111);
+    let block_shifted = _mm256_srli_epi32(block, 4);
+    let hi_nibbles = _mm256_and_si256(block_shifted, mask_nib);
+    let lo_nibbles = _mm256_and_si256(block, mask_nib);
+
+    // This algorithm uses offsets for decoding. E.g. in the standard and url-safe alphabets the
+    // ASCII letter 'A' encodes 0b000000, the letter 'B' 0b000001, and so on. The ASCII value of
+    // 'A' is 65. So to get from a capital letter in the input to the value it encodes you have to
+    // subtract 65.  Similarly, the letter 'a' encodes 0b011010, or 26 in decimal. 'b' encodes 27
+    // and so on. But the ASCII value of 'a' is 97, so to get from a minuscule to its value you
+    // don't subtract 65 but 71 instead.
+    // The main optimization this algorithm makes and the source for its assumptions is that it
+    // relies on the fact that the alphabet used has contiguous ordered ranges of inputs that thus
+    // share an offset, and that these ranges are distinguishable by their upper nibble.
+    // In other words, for [A-Z] subtracting 65 gets you to the correct value and for [a-z]
+    // subtracting 71 does as well. While decoding we just have to figure out which range an input
+    // belongs to and directly know what offset to apply.
+    // However, we need to check for invalid inputs. The algorithm again optimizes that by using
+    // the fact that valid input is in one of the ranges or one of two special bytes ('+' and '/'
+    // or '-' and '_' specifically).
+    // [A-Z] for example is the range of 0b100_0001 to 0b101_1010, so the high nibbles 0b100 (4)
+    // and 0b101 (5). But not every input with these high nibbles is valid, e.g. the character '@'
+    // encoded as 0b100_0000 or the character '[', i.e. 0b101_1011.  So we need to check if the low
+    // nibble is valid for a given high nibble. AVX2 has instructions for bitwise comparing two
+    // vectors which are exposed as `test` intrinsics; each variant reports a different CPU flag
+    // for use in conditionals.
+    // _mm256_testz_si256 used here bitwise ANDs both input vectors and returns 1 if the result is
+    // zero and 0 if the result has any bit set.
+    // So we need to now generate a `witness` for the high and low nibble each so that
+    // `witness_hi & witness_lo == 0` iff the input is valid.
+    let witness_lo = _mm256_shuffle_epi8(lo_witness_lut, lo_nibbles);
+    let witness_hi = _mm256_shuffle_epi8(hi_witness_lut, hi_nibbles);
+    if _mm256_testz_si256(witness_lo, witness_hi) == 0 {
         *invalid = true;
+        return _mm256_and_si256(witness_hi, witness_lo);
     }
+
+    let eq_singleton = _mm256_cmpeq_epi8(block, mask_singleton);
+    let roll = _mm256_shuffle_epi8(offsets, _mm256_add_epi8(eq_singleton, hi_nibbles));
     let shuffeled = _mm256_add_epi8(block, roll);
 
     let merge_ab_and_bc = _mm256_maddubs_epi16(shuffeled, 
@@ -185,28 +241,28 @@ unsafe fn decode(
 /// since `0` bytes would in fact be an invalid input.
 unsafe fn decode_masked(
     invalid: &mut bool,
-    lut_lo: __m256i,
-    lut_hi: __m256i,
+    lo_witness_lut: __m256i,
+    hi_witness_lut: __m256i,
     lut_roll: __m256i,
-    mask_2f: __m256i,
+    mask_singleton: __m256i,
     mask_input: __m256i,
     block: __m256i
 ) -> __m256i {
     let hi_nibbles = _mm256_srli_epi32(block, 4);
-    let lo_nibbles = _mm256_and_si256(block, mask_2f);
-    let eq_2f = _mm256_cmpeq_epi8(block, mask_2f);
-    let hi_nibbles = _mm256_and_si256(hi_nibbles, mask_2f);
+    let lo_nibbles = _mm256_and_si256(block, mask_singleton);
+    let eq_singleton = _mm256_cmpeq_epi8(block, mask_singleton);
+    let hi_nibbles = _mm256_and_si256(hi_nibbles, mask_singleton);
 
-    let lo = _mm256_shuffle_epi8(lut_lo, lo_nibbles);
-    let hi = _mm256_shuffle_epi8(lut_hi, hi_nibbles);
+    let lo = _mm256_shuffle_epi8(lo_witness_lut, lo_nibbles);
+    let hi = _mm256_shuffle_epi8(hi_witness_lut, hi_nibbles);
     // Special case: If we have a masked input we need to forward this mask here to not
     // trip the test below
     let hi = _mm256_and_si256(hi, mask_input);
     if _mm256_testz_si256(lo, hi) == 0 {
         *invalid = true;
     }
 
-    let roll = _mm256_shuffle_epi8(lut_roll, _mm256_add_epi8(eq_2f, hi_nibbles));
+    let roll = _mm256_shuffle_epi8(lut_roll, _mm256_add_epi8(eq_singleton, hi_nibbles));
     let shuffeled = _mm256_add_epi8(block, roll);
 
     let merge_ab_and_bc = _mm256_maddubs_epi16(shuffeled, 
@@ -457,20 +513,69 @@ impl super::Engine for AVX2Encoder {
         let mut block: __m256i;
         let mut invalid: bool = false;
 
-        // Initialize the four required vectors for all avx decoding operations
-        let lut_lo = unsafe { _mm256_setr_epi8(
+        // Witnesses for the high nibbles:
+        // 0x0 and 0x1 are never valid, no matter what the low nibble is.
+        // 0x2 is valid for the characters '+' (0x2B), '/' (0x2F) and '-' (0x2D), depending on the
+        // alphabet.
+        // 0x3 contains numerals but the only valid inputs are 0x30 to 0x39, so we need to make
+        // sure that every low nibble from 0xA to 0xF is rejected.
+        // 0x4 and 0x5 contain [A-Z] and also the special character '_' (0x5F) from the urlsafe
+        // alphabet.
+        // 0x6 and 0x7 contain [a-z].
+        // 0x8 to 0xF are never valid; 0x8 or higher especially means invalid ASCII.
+        //
+        // We use -0x1 as "always invalid" value so that the low witness has to only return
+        // something != 0 for the invalid test to trip.
+        let hi_witness_lut = unsafe { _mm256_setr_epi8(
+                // 0     1     2     3     4     5     6     7
+                -0x1, -0x1, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08,
+                // 8     9    10    11    12    13    14    15
+                -0x1, -0x1, -0x1, -0x1, -0x1, -0x1, -0x1, -0x1,
+                // 0     1     2     3     4     5     6     7
+                -0x1, -0x1, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08,
+                // 8     9    10    11    12    13    14    15
+                -0x1, -0x1, -0x1, -0x1, -0x1, -0x1, -0x1, -0x1
+        )};
+        // Witnesses for the low nibbles. The requirements for the given hi witnesses are then:
+        // // Be invalid if hi is.
+        // - lo[..] & -0x1 != 0
+        // // Numerals
+        // - lo[0..9] & 0x2 == 0
+        // - lo[10..15] & 0x2 != 0
+        // // Capitals
+        // - lo[0] & 0x4 != 0
+        // - lo[1..] & 0x4 == 0
+        // - lo[0..10] & 0x8 == 0
+        // - lo[11..15] & 0x8 != 0
+        // // Minuscules
+        // - lo[0] & 0x4 != 0
+        // - lo[1..] & 0x4 == 0
+        // - lo[0..10] & 0x8 == 0
+        // - lo[11..15] & 0x8 != 0
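+        // // Special, depending on the alphabet
+        // // standard: '+' (0x2B) and '/' (0x2F) must pass
+        // - lo[15] & 0x1 == 0
+        // - lo[11] & 0x1 == 0
+        // // urlsafe: '-' (0x2D) and '_' (0x5F) must pass (NOTE: the table below does not satisfy these)
+        // - lo[13] & 0x1 == 0
+        // - lo[15] & 0x8 == 0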
+        // ASCII has the advantage that A-Z and a-z are 0x20 away from each other so you can use
+        // the same lo witnesses.
+        // The easiest way to create these witness tables and what is done here is to use the hi
+        // witness to select a bit to probe and set that bit in the low witness for *invalid* nibbles
+        // in that range. E.g. the hi witness sets 0x01 for high nibble 0x2 and 0x04 for 0x4 and
+        // 0x6, and the lo witness clears 0x01 only for the valid low nibbles of 0x2_ (0xB and 0xF,
+        // i.e. 0x2B and 0x2F) and sets 0x04 only for low nibble 0x0 (i.e. '@' and '`').
+        let lo_witness_lut = unsafe { _mm256_setr_epi8(
+                // 0     1     2     3     4     5     6     7
                 0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11,
+                // 8     9    10    11    12    13    14    15
                 0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A,
+                // 0     1     2     3     4     5     6     7
                 0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11,
+                // 8     9    10    11    12    13    14    15
                 0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A
         )};
-        let lut_hi = unsafe { _mm256_setr_epi8(
-                0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08,
-                0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
-                0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08,
-                0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10
-        )};
-        let mask_2f = unsafe { _mm256_set1_epi8(0x2F) };
 
         // This will only evaluate to true if we have an input of 33 bytes or more;
         // skip_final_bytes is at least input.len() otherwise.
@@ -493,7 +598,12 @@ impl super::Engine for AVX2Encoder {
 
                 unsafe {
                     block = _mm256_loadu_si256(input_chunk.as_ptr().cast());
-                    block = decode(&mut invalid, lut_lo, lut_hi, self.decode_offsets, mask_2f, block);
+                    block = decode(&mut invalid, 
+                        lo_witness_lut, 
+                        hi_witness_lut, 
+                        self.decode_offsets, 
+                        self.singleton_mask, 
+                        block);
 
                     if invalid {
                         return Err(find_invalid_input(input_index, input_chunk, &self.decode_table));
@@ -530,7 +640,7 @@ impl super::Engine for AVX2Encoder {
                 let mask_output = _mm256_loadu_si256(MASKLOAD[2..10].as_ptr().cast());
 
                 block = _mm256_loadu_si256(input_chunk.as_ptr().cast());
-                block = decode(&mut invalid, lut_lo, lut_hi, self.decode_offsets, mask_2f, block);
+                block = decode(&mut invalid, lo_witness_lut, hi_witness_lut, self.decode_offsets, self.singleton_mask, block);
 
                 _mm256_maskstore_epi32(output_chunk.as_mut_ptr().cast(), mask_output, block);
             }
@@ -583,7 +693,7 @@ impl super::Engine for AVX2Encoder {
                 block = _mm256_maskload_epi32(input_chunk.as_ptr().cast(), mask_input);
                 let outblock 
                     = decode_masked(&mut invalid,
-                        lut_lo, lut_hi, self.decode_offsets, mask_2f, mask_input, block);
+                        lo_witness_lut, hi_witness_lut, self.decode_offsets, self.singleton_mask, mask_input, block);
 
                 if invalid {
                     return Err(find_invalid_input(input_index, input_chunk, &self.decode_table));
@@ -717,7 +827,7 @@ fn find_invalid_input(input_index: usize, input: &[u8], decode_table: &[u8; 256]
         }
     }
 
-    unreachable!("Called find_invalid_input on valid input! {}, {:?}", input_index, input);
+    unreachable!("find_invalid_input was given valid input {:?}, global index {}", input, input_index);
 }