|
2 | 2 |
|
3 | 3 | #![forbid(unsafe_code)]
|
4 | 4 |
|
5 |
| -pub(crate) mod helpers; |
6 | 5 | pub(crate) mod simd;
|
7 | 6 |
|
8 | 7 | #[inline]
|
@@ -43,5 +42,44 @@ pub(crate) fn validate_utf8_basic_fallback(input: &[u8]) -> Result<(), crate::ba
|
43 | 42 |
|
44 | 43 | #[inline]
|
45 | 44 | pub(crate) fn validate_utf8_compat_fallback(input: &[u8]) -> Result<(), crate::compat::Utf8Error> {
|
46 |
| - helpers::validate_utf8_at_offset(input, 0) |
| 45 | + validate_utf8_at_offset(input, 0) |
| 46 | +} |
| 47 | + |
| 48 | +type Utf8ErrorCompat = crate::compat::Utf8Error; |
| 49 | + |
| 50 | +#[inline] |
| 51 | +#[expect(clippy::cast_possible_truncation)] |
| 52 | +pub(crate) fn validate_utf8_at_offset(input: &[u8], offset: usize) -> Result<(), Utf8ErrorCompat> { |
| 53 | + match core::str::from_utf8(&input[offset..]) { |
| 54 | + Ok(_) => Ok(()), |
| 55 | + Err(err) => Err(Utf8ErrorCompat { |
| 56 | + valid_up_to: err.valid_up_to() + offset, |
| 57 | + error_len: err.error_len().map(|len| { |
| 58 | + // never truncates since std::str::err::Utf8Error::error_len() never returns value larger than 4 |
| 59 | + len as u8 |
| 60 | + }), |
| 61 | + }), |
| 62 | + } |
| 63 | +} |
| 64 | + |
| 65 | +#[cold] |
| 66 | +#[expect(clippy::unwrap_used)] |
| 67 | +#[allow(dead_code)] // only used if there is a SIMD implementation |
| 68 | +pub(crate) fn get_compat_error(input: &[u8], failing_block_pos: usize) -> Utf8ErrorCompat { |
| 69 | + let offset = if failing_block_pos == 0 { |
| 70 | + // Error must be in this block since it is the first. |
| 71 | + 0 |
| 72 | + } else { |
| 73 | + // The previous block is OK except for a possible continuation over the block boundary. |
| 74 | + // We go backwards over the last three bytes of the previous block and find the |
| 75 | + // last non-continuation byte as a starting point for an std validation. If the last |
| 76 | + // three bytes are all continuation bytes then the previous block ends with a four byte |
| 77 | + // UTF-8 codepoint, is thus complete and valid UTF-8. We start the check with the |
| 78 | + // current block in that case. |
| 79 | + (1..=3) |
| 80 | + .find(|i| input[failing_block_pos - i] >> 6 != 0b10) |
| 81 | + .map_or(failing_block_pos, |i| failing_block_pos - i) |
| 82 | + }; |
| 83 | + // UNWRAP: safe because the SIMD UTF-8 validation found an error |
| 84 | + validate_utf8_at_offset(input, offset).unwrap_err() |
47 | 85 | }
|
0 commit comments