Various cleanups

marshallpierce · marshallpierce · commit 3155ca72305e · 2021-08-18T16:53:58.000-06:00
- Improve engine tests
- Improve comments
- Remove dead code
- Improve error message byte formatting
diff --git a/.gitignore b/.gitignore
@@ -10,5 +10,5 @@ main.rs
 *.iml
 
 # `perf record` files
-perf.data*
+/*perf.data*
 /tmp
diff --git a/Cargo.toml b/Cargo.toml
@@ -31,3 +31,7 @@ std = []
 [profile.bench]
 # Useful for better disassembly when using `perf record` and `perf annotate`
 debug = true
+
+[profile.test]
+# Faster tests save much more than the increase in compilation time
+opt-level = 3
diff --git a/RELEASE-NOTES.md b/RELEASE-NOTES.md
@@ -5,9 +5,9 @@
   - This opens the door to a portable constant-time implementation ([#153](https://github.com/marshallpierce/rust-base64/pull/153), presumably `ConstantTimePortable`?) for security-sensitive applications that need side-channel resistance, and CPU-specific SIMD implementations for  more speed.
   - Standard base64 per the RFC is available via `DEFAULT_ENGINE`. To use different alphabets or other settings (padding, etc), create your own engine instance.
 - `CharacterSet` is now `Alphabet` (per the RFC), and allows creating custom alphabets. The corresponding tables that were previously code-generated are now built dynamically.
-- Since there are already multiple breaking changes, various functions are renamed to be more consistent and discoverable
-- MSRV is now 1.47.0
-- DecoderReader now owns its inner reader, and can expose it via `into_inner()`. For symmetry, `EncoderWriter` can do the same with its writer.
+- Since there are already multiple breaking changes, various functions are renamed to be more consistent and discoverable.
+- MSRV is now 1.47.0 to allow various things to use `const fn`.
+- `DecoderReader` now owns its inner reader, and can expose it via `into_inner()`. For symmetry, `EncoderWriter` can do the same with its writer.
 
 # 0.13.0
 
diff --git a/src/alphabet.rs b/src/alphabet.rs
@@ -1,5 +1,6 @@
 //! Provides [Alphabet] and constants for alphabets commonly used in the wild.
 
+use crate::PAD_BYTE;
 #[cfg(any(feature = "std", test))]
 use std::{convert, error, fmt};
 
@@ -17,7 +18,7 @@ const ALPHABET_SIZE: usize = 64;
 ///     &custom,
 ///     base64::engine::fast_portable::PAD);
 /// ```
-#[derive(Clone, Copy, Debug, Eq, PartialEq)]
+#[derive(Clone, Debug, Eq, PartialEq)]
 pub struct Alphabet {
     pub(crate) symbols: [u8; ALPHABET_SIZE],
 }
@@ -39,7 +40,7 @@ impl Alphabet {
         Alphabet { symbols }
     }
 
-    /// Create a `CharacterSet` from a string of 64 unique printable ASCII bytes.
+    /// Create an `Alphabet` from a string of 64 unique printable ASCII bytes.
     ///
     /// The `=` byte is not allowed as it is used for padding.
     ///
@@ -62,7 +63,7 @@ impl Alphabet {
                     return Err(ParseAlphabetError::UnprintableByte(byte));
                 }
                 // = is assumed to be padding, so cannot be used as a symbol
-                if b'=' == byte {
+                if byte == PAD_BYTE {
                     return Err(ParseAlphabetError::ReservedByte(byte));
                 }
 
@@ -121,9 +122,9 @@ impl fmt::Display for ParseAlphabetError {
     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
         match self {
             ParseAlphabetError::InvalidLength => write!(f, "Invalid length - must be 64 bytes"),
-            ParseAlphabetError::DuplicatedByte(b) => write!(f, "Duplicated byte: {}", b),
-            ParseAlphabetError::UnprintableByte(b) => write!(f, "Unprintable byte: {}", b),
-            ParseAlphabetError::ReservedByte(b) => write!(f, "Reserved byte: {}", b),
+            ParseAlphabetError::DuplicatedByte(b) => write!(f, "Duplicated byte: {:#04x}", b),
+            ParseAlphabetError::UnprintableByte(b) => write!(f, "Unprintable byte: {:#04x}", b),
+            ParseAlphabetError::ReservedByte(b) => write!(f, "Reserved byte: {:#04x}", b),
         }
     }
 }
diff --git a/src/decode.rs b/src/decode.rs
@@ -9,8 +9,6 @@ use core::fmt;
 #[cfg(any(feature = "std", test))]
 use std::error;
 
-// TODO how to handle InvalidLastSymbol and InvalidLength behavior across engines?
-
 /// Errors that can occur while decoding.
 #[derive(Clone, Debug, PartialEq, Eq)]
 pub enum DecodeError {
@@ -58,7 +56,7 @@ impl error::Error for DecodeError {
     }
 }
 
-///Decode base64 using the [default engine](DEFAULT_ENGINE), alphabet, and config.
+///Decode base64 using the [default engine](DEFAULT_ENGINE).
 ///Returns a `Result` containing a `Vec<u8>`.
 ///
 ///# Example
@@ -93,10 +91,10 @@ pub fn decode<T: AsRef<[u8]>>(input: T) -> Result<Vec<u8>, DecodeError> {
 ///
 ///    // custom engine setup
 ///    let bytes_url = base64::decode_engine(
-///        "aGVsbG8gaW50ZXJuZXR-Cg==",
+///        "aGVsbG8gaW50ZXJuZXR-Cg",
 ///        &base64::engine::fast_portable::FastPortable::from(
 ///            &base64::alphabet::URL_SAFE,
-///            base64::engine::fast_portable::PAD),
+///            base64::engine::fast_portable::NO_PAD),
 ///
 ///    ).unwrap();
 ///    println!("{:?}", bytes_url);
diff --git a/src/encode.rs b/src/encode.rs
@@ -7,7 +7,7 @@ use crate::PAD_BYTE;
 #[cfg(any(feature = "alloc", feature = "std", test))]
 use alloc::{string::String, vec};
 
-///Encode arbitrary octets as base64 using the [default engine](DEFAULT_ENGINE), alphabet, and config.
+///Encode arbitrary octets as base64 using the [default engine](DEFAULT_ENGINE).
 ///Returns a `String`.
 ///
 ///# Example
diff --git a/src/engine/fast_portable/mod.rs b/src/engine/fast_portable/mod.rs
@@ -216,23 +216,6 @@ pub(crate) const fn decode_table(alphabet: &Alphabet) -> [u8; 256] {
     return decode_table;
 }
 
-// fn decode_aligned(symbol: u8, decode_table: &[u8; 256]) -> u8 {
-//     let mut result: u8 = 0x00;
-//     // If `symbol` is inside the printable range, one of these two derived indices will be equal to
-//     // the original index, and the decoded byte will end up in `result`. If `symbol` is not
-//     // printable, neither will equal the original symbol, and so both decoded bytes will have 0x00
-//     // as a mask.
-//     // TODO invalid bytes decoded to 0x00 instead of 0xFF?
-//     let idx: [u8; 2] = [symbol % 64, symbol % 64 + 64];
-//     for i in 0..2 {
-//         let symbol_eq_mod = idx[i] == symbol;
-//         // if symbol equals its mod flavor, 0xFF, else 0x00
-//         let mask = ((symbol_eq_mod) as i8 - 1) as u8;
-//         result = result | (decode_table[idx[i] as usize] & mask);
-//     }
-//     result
-// }
-
 #[inline]
 fn read_u64(s: &[u8]) -> u64 {
     u64::from_be_bytes(s[..8].try_into().unwrap())
@@ -315,6 +298,9 @@ impl Config for FastPortableConfig {
 }
 
 /// Include padding bytes when encoding.
+///
+/// This is the standard per the base64 RFC, but consider using [NO_PAD] instead as padding serves
+/// little purpose in practice.
 pub const PAD: FastPortableConfig = FastPortableConfig::new();
 
 /// Don't add padding when encoding.
diff --git a/src/engine/mod.rs b/src/engine/mod.rs
@@ -10,10 +10,10 @@ mod naive;
 #[cfg(test)]
 mod tests;
 
-/// An `Engine` provides low-level encoding and decoding operations that all other higher-level parts of the API use.
+/// An `Engine` provides low-level encoding and decoding operations that all other higher-level parts of the API use. Users of the library will generally not need to implement this.
 ///
 /// Different implementations offer different characteristics. The library currently ships with
-/// a general-purpose [FastPortable] that offers good speed and works on any CPU, with more choices
+/// a general-purpose [FastPortable] impl that offers good speed and works on any CPU, with more choices
 /// coming later, like a constant-time one when side channel resistance is called for, and vendor-specific vectorized ones for more speed.
 ///
 /// See [DEFAULT_ENGINE] if you just want standard base64. Otherwise, when possible, it's
@@ -40,10 +40,9 @@ pub trait Engine: Send + Sync {
     /// Must not write any bytes into the output slice other than the encoded data.
     fn encode(&self, input: &[u8], output: &mut [u8]) -> usize;
 
-    /// As an optimization, it is sometimes helpful to have a conservative estimate of the decoded
-    /// size before doing the decoding.
-    ///
-    /// The result of this must be passed to [Engine::decode()].
+    /// It is sometimes helpful to have a conservative estimate of the decoded size before
+    /// decoding. To keep the decoded length from being calculated twice, that estimate is
+    /// computed separately here and then passed to [Engine::decode()] as needed.
     fn decoded_length_estimate(&self, input_len: usize) -> Self::DecodeEstimate;
 
     /// Decode `input` base64 bytes into the `output` buffer.
@@ -87,14 +86,14 @@ pub trait Config {
 /// The decode estimate used by an engine implementation. Users do not need to interact with this;
 /// it is only for engine implementors.
 ///
-/// Implementors may want to store relevant calculations when constructing this to avoid having
-/// to calculate them again during actual decoding.
+/// Implementors may store relevant data here when constructing this to avoid having to calculate
+/// it again during actual decoding.
 pub trait DecodeEstimate {
     /// Returns a conservative (err on the side of too big) estimate of the decoded length to use
     /// for pre-allocating buffers, etc.
     fn decoded_length_estimate(&self) -> usize;
 }
 
-/// An engine that will work on all CPUs using the standard base64 alphabet and config.
+/// A [FastPortable] engine using the [crate::alphabet::STANDARD] base64 alphabet and [crate::engine::fast_portable::PAD] config.
 pub const DEFAULT_ENGINE: FastPortable =
     FastPortable::from(&alphabet::STANDARD, fast_portable::PAD);
diff --git a/src/engine/tests.rs b/src/engine/tests.rs
@@ -6,9 +6,10 @@ use rstest::rstest;
 use rstest_reuse::{apply, template};
 use std::iter;
 
+use crate::tests::assert_encode_sanity;
 use crate::{
     alphabet::{Alphabet, STANDARD},
-    encode,
+    decode_engine, encode,
     engine::{fast_portable, naive, Engine},
     tests::random_alphabet,
     DecodeError, PAD_BYTE,
@@ -122,7 +123,9 @@ fn encode_doesnt_write_extra_bytes<E: EngineWrapper>(engine_wrapper: E) {
     let mut encode_buf = Vec::<u8>::new();
     let mut encode_buf_backup = Vec::<u8>::new();
 
-    let len_range = Uniform::new(1, 1_000);
+    let input_len_range = Uniform::new(0, 5);
+    let prefix_len_range = Uniform::new(0, 5);
+    let suffix_len_range = Uniform::new(0, 5);
 
     for _ in 0..10_000 {
         let engine = E::random(&mut rng);
@@ -131,23 +134,37 @@ fn encode_doesnt_write_extra_bytes<E: EngineWrapper>(engine_wrapper: E) {
         encode_buf.clear();
         encode_buf_backup.clear();
 
-        let orig_len = fill_rand(&mut orig_data, &mut rng, &len_range);
-        let expected_encode_len = engine_encoded_len(orig_len);
-        encode_buf.resize(expected_encode_len, 0);
+        let orig_len = fill_rand(&mut orig_data, &mut rng, &input_len_range);
+
+        // write a random prefix
+        let prefix_len = fill_rand(&mut encode_buf, &mut rng, &prefix_len_range);
+        let expected_encode_len_no_pad = engine_encoded_len(orig_len);
+        // leave space for encoded data
+        encode_buf.resize(expected_encode_len_no_pad + prefix_len, 0);
+        // and a random suffix
+        let suffix_len = fill_rand(&mut encode_buf, &mut rng, &suffix_len_range);
 
-        // oversize encode buffer so we can easily tell if it writes anything more than
-        // just the encoded data
-        fill_rand_len(&mut encode_buf, &mut rng, (expected_encode_len + 100) * 2);
         encode_buf_backup.extend_from_slice(&encode_buf[..]);
 
-        let encoded_len = engine.encode(&orig_data[..], &mut encode_buf[..]);
-        assert_eq!(expected_encode_len, encoded_len);
+        let encoded_len_no_pad = engine.encode(&orig_data[..], &mut encode_buf[prefix_len..]);
+        assert_eq!(expected_encode_len_no_pad, encoded_len_no_pad);
 
         // no writes past what it claimed to write
+        assert_eq!(&encode_buf_backup[..prefix_len], &encode_buf[..prefix_len]);
         assert_eq!(
-            &encode_buf_backup[encoded_len..],
-            &encode_buf[encoded_len..]
-        )
+            &encode_buf_backup[(prefix_len + encoded_len_no_pad)..],
+            &encode_buf[(prefix_len + encoded_len_no_pad)..]
+        );
+
+        let encoded_data = &encode_buf[prefix_len..(prefix_len + encoded_len_no_pad)];
+        assert_encode_sanity(
+            std::str::from_utf8(encoded_data).unwrap(),
+            // engines don't pad
+            false,
+            orig_len,
+        );
+
+        assert_eq!(orig_data, decode_engine(encoded_data, &engine).unwrap());
     }
 }
 
@@ -273,6 +290,37 @@ fn decode_detect_invalid_last_symbol_two_bytes<E: EngineWrapper>(engine_wrapper:
     }
 }
 
+#[apply(all_engines)]
+fn decode_detect_invalid_last_symbol_when_length_is_also_invalid<E: EngineWrapper>(
+    engine_wrapper: E,
+) {
+    let mut rng = rand::rngs::SmallRng::from_entropy();
+
+    // check across enough lengths that it likely covers any implementation's internal division
+    // between small and large inputs
+    for len in 0_usize..1000 {
+        if len % 4 != 1 {
+            continue;
+        }
+
+        let engine = E::random_alphabet(&mut rng, &STANDARD);
+
+        let mut input = vec![b'A'; len];
+
+        // with a valid last char, it's InvalidLength
+        assert_eq!(
+            Err(DecodeError::InvalidLength),
+            decode_engine(&input, &engine)
+        );
+        // after mangling the last char, it's InvalidByte
+        input[len - 1] = b'*';
+        assert_eq!(
+            Err(DecodeError::InvalidByte(len - 1, b'*')),
+            decode_engine(&input, &engine)
+        );
+    }
+}
+
 #[apply(all_engines)]
 fn decode_detect_invalid_last_symbol_every_possible_two_symbols<E: EngineWrapper>(
     engine_wrapper: E,

Original file line number	Diff line number	Diff line change
`@@ -1,5 +1,6 @@`
`1`	`1`	`//! Provides [Alphabet] and constants for alphabets commonly used in the wild.`
`2`	`2`
	`3`	`+use crate::PAD_BYTE;`
`3`	`4`	`#[cfg(any(feature = "std", test))]`
`4`	`5`	`use std::{convert, error, fmt};`
`5`	`6`
`@@ -17,7 +18,7 @@ const ALPHABET_SIZE: usize = 64;`
`17`	`18`	`/// &custom,`
`18`	`19`	`/// base64::engine::fast_portable::PAD);`
`19`	`20`	/// ```
`20`		`-#[derive(Clone, Copy, Debug, Eq, PartialEq)]`
	`21`	`+#[derive(Clone, Debug, Eq, PartialEq)]`
`21`	`22`	`pub struct Alphabet {`
`22`	`23`	`pub(crate) symbols: [u8; ALPHABET_SIZE],`
`23`	`24`	`}`
`@@ -39,7 +40,7 @@ impl Alphabet {`
`39`	`40`	`Alphabet { symbols }`
`40`	`41`	`}`
`41`	`42`
`42`		- /// Create a `CharacterSet` from a string of 64 unique printable ASCII bytes.
	`43`	+ /// Create an `Alphabet` from a string of 64 unique printable ASCII bytes.
`43`	`44`	`///`
`44`	`45`	/// The `=` byte is not allowed as it is used for padding.
`45`	`46`	`///`
`@@ -62,7 +63,7 @@ impl Alphabet {`
`62`	`63`	`return Err(ParseAlphabetError::UnprintableByte(byte));`
`63`	`64`	`}`
`64`	`65`	`// = is assumed to be padding, so cannot be used as a symbol`
`65`		`- if b'=' == byte {`
	`66`	`+ if byte == PAD_BYTE {`
`66`	`67`	`return Err(ParseAlphabetError::ReservedByte(byte));`
`67`	`68`	`}`
`68`	`69`
`@@ -121,9 +122,9 @@ impl fmt::Display for ParseAlphabetError {`
`121`	`122`	`fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {`
`122`	`123`	`match self {`
`123`	`124`	`ParseAlphabetError::InvalidLength => write!(f, "Invalid length - must be 64 bytes"),`
`124`		`- ParseAlphabetError::DuplicatedByte(b) => write!(f, "Duplicated byte: {}", b),`
`125`		`- ParseAlphabetError::UnprintableByte(b) => write!(f, "Unprintable byte: {}", b),`
`126`		`- ParseAlphabetError::ReservedByte(b) => write!(f, "Reserved byte: {}", b),`
	`125`	`+ ParseAlphabetError::DuplicatedByte(b) => write!(f, "Duplicated byte: {:#04x}", b),`
	`126`	`+ ParseAlphabetError::UnprintableByte(b) => write!(f, "Unprintable byte: {:#04x}", b),`
	`127`	`+ ParseAlphabetError::ReservedByte(b) => write!(f, "Reserved byte: {:#04x}", b),`
`127`	`128`	`}`
`128`	`129`	`}`
`129`	`130`	`}`