Skip to content

Commit a7d93ca

Browse files
committed
wip
1 parent 5d45066 commit a7d93ca

File tree

4 files changed

+41
-43
lines changed

4 files changed

+41
-43
lines changed

portable/Cargo.toml

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,3 @@
1-
#cargo-features = ["edition2024"] # TODO
2-
31
[package]
42
name = "simdutf8-portable"
53
version = "0.1.0"

portable/src/implementation/helpers.rs

Lines changed: 0 additions & 38 deletions
This file was deleted.

portable/src/implementation/mod.rs

Lines changed: 40 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@
22
33
#![forbid(unsafe_code)]
44

5-
pub(crate) mod helpers;
65
pub(crate) mod simd;
76

87
#[inline]
@@ -43,5 +42,44 @@ pub(crate) fn validate_utf8_basic_fallback(input: &[u8]) -> Result<(), crate::ba
4342

4443
#[inline]
4544
pub(crate) fn validate_utf8_compat_fallback(input: &[u8]) -> Result<(), crate::compat::Utf8Error> {
46-
helpers::validate_utf8_at_offset(input, 0)
45+
validate_utf8_at_offset(input, 0)
46+
}
47+
48+
type Utf8ErrorCompat = crate::compat::Utf8Error;
49+
50+
#[inline]
51+
#[expect(clippy::cast_possible_truncation)]
52+
pub(crate) fn validate_utf8_at_offset(input: &[u8], offset: usize) -> Result<(), Utf8ErrorCompat> {
53+
match core::str::from_utf8(&input[offset..]) {
54+
Ok(_) => Ok(()),
55+
Err(err) => Err(Utf8ErrorCompat {
56+
valid_up_to: err.valid_up_to() + offset,
57+
error_len: err.error_len().map(|len| {
58+
// never truncates since core::str::Utf8Error::error_len() never returns a value larger than 4
59+
len as u8
60+
}),
61+
}),
62+
}
63+
}
64+
65+
#[cold]
66+
#[expect(clippy::unwrap_used)]
67+
#[allow(dead_code)] // only used if there is a SIMD implementation
68+
pub(crate) fn get_compat_error(input: &[u8], failing_block_pos: usize) -> Utf8ErrorCompat {
69+
let offset = if failing_block_pos == 0 {
70+
// Error must be in this block since it is the first.
71+
0
72+
} else {
73+
// The previous block is OK except for a possible continuation over the block boundary.
74+
// We go backwards over the last three bytes of the previous block and find the
75+
// last non-continuation byte as a starting point for an std validation. If the last
76+
// three bytes are all continuation bytes then the previous block ends with a four-byte
77+
// UTF-8 codepoint, is thus complete and valid UTF-8. We start the check with the
78+
// current block in that case.
79+
(1..=3)
80+
.find(|i| input[failing_block_pos - i] >> 6 != 0b10)
81+
.map_or(failing_block_pos, |i| failing_block_pos - i)
82+
};
83+
// UNWRAP: safe because the SIMD UTF-8 validation found an error
84+
validate_utf8_at_offset(input, offset).unwrap_err()
4785
}

portable/src/implementation/simd.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -670,7 +670,7 @@ pub fn validate_utf8_basic(input: &[u8]) -> core::result::Result<(), basic::Utf8
670670
#[inline]
671671
pub fn validate_utf8_compat(input: &[u8]) -> core::result::Result<(), compat::Utf8Error> {
672672
Utf8CheckAlgorithm::<16, 4>::validate_utf8_compat_simd0(input)
673-
.map_err(|idx| crate::implementation::helpers::get_compat_error(input, idx))
673+
.map_err(|idx| super::get_compat_error(input, idx))
674674
}
675675

676676
/// Low-level implementation of the [`basic::imp::Utf8Validator`] trait.

0 commit comments

Comments
 (0)