Skip to content

Commit d560520

Browse files
authored
performance: faster utf8 validation by using simd instructions (#4347)
Signed-off-by: Robert Kruszewski <[email protected]>
1 parent 9458e81 commit d560520

File tree

14 files changed: 51 additions and 26 deletions.

Cargo.lock

Lines changed: 2 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -167,6 +167,7 @@ rustversion = "1.0"
167167
serde = "1.0.203"
168168
serde_json = "1.0.116"
169169
serde_test = "1.0.176"
170+
simdutf8 = "0.1.5"
170171
similar = "2.7.0"
171172
simplelog = "0.12"
172173
sketches-ddsketch = "0.3.0"

java/testfiles/Cargo.lock

Lines changed: 8 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

vortex-array/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -51,6 +51,7 @@ rstest = { workspace = true, optional = true }
5151
rstest_reuse = { workspace = true, optional = true }
5252
rustc-hash = { workspace = true }
5353
serde = { workspace = true, features = ["derive"] }
54+
simdutf8 = { workspace = true }
5455
static_assertions = { workspace = true }
5556
tabled = { workspace = true, optional = true, default-features = false, features = [
5657
"std",

vortex-array/src/arrays/varbinview/mod.rs

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -398,7 +398,7 @@ impl VarBinViewArray {
398398

399399
match dtype {
400400
DType::Utf8(_) => Self::validate_views(views, buffers, validity, |string| {
401-
std::str::from_utf8(string).is_ok()
401+
simdutf8::basic::from_utf8(string).is_ok()
402402
})?,
403403
DType::Binary(_) => Self::validate_views(views, buffers, validity, |_| true)?,
404404
_ => vortex_bail!("invalid DType {dtype} for `VarBinViewArray`"),

vortex-buffer/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -27,6 +27,7 @@ itertools = { workspace = true }
2727
log = { workspace = true, optional = true }
2828
memmap2 = { workspace = true, optional = true }
2929
num-traits = { workspace = true }
30+
simdutf8 = { workspace = true }
3031
vortex-error = { workspace = true }
3132

3233
[lints]

vortex-buffer/src/string.rs

Lines changed: 9 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -3,7 +3,8 @@
33

44
use std::fmt::{Debug, Formatter};
55
use std::ops::Deref;
6-
use std::str::Utf8Error;
6+
7+
use vortex_error::{VortexError, vortex_err};
78

89
use crate::ByteBuffer;
910

@@ -64,10 +65,15 @@ impl From<&str> for BufferString {
6465
}
6566

6667
impl TryFrom<ByteBuffer> for BufferString {
67-
type Error = Utf8Error;
68+
type Error = VortexError;
6869

6970
fn try_from(value: ByteBuffer) -> Result<Self, Self::Error> {
70-
let _ = std::str::from_utf8(value.as_ref())?;
71+
let _ = simdutf8::basic::from_utf8(value.as_ref()).map_err(|_| {
72+
#[allow(clippy::unwrap_used)]
73+
// re-run validation with the `compat` module to get a more detailed error message
74+
let err = simdutf8::compat::from_utf8(value.as_ref()).unwrap_err();
75+
vortex_err!("invalid utf-8: {err}")
76+
})?;
7177
Ok(Self(value))
7278
}
7379
}

vortex-dtype/src/datetime/temporal.rs

Lines changed: 5 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -157,14 +157,16 @@ fn decode_timestamp_metadata(ext_meta: &ExtMetadata) -> VortexResult<TemporalMet
157157
let tag = ext_meta.as_ref()[0];
158158
let time_unit = TimeUnit::try_from(tag)?;
159159
let tz_len_bytes = &ext_meta.as_ref()[1..3];
160-
let tz_len = u16::from_le_bytes(tz_len_bytes.try_into()?);
160+
let tz_len = u16::from_le_bytes(tz_len_bytes.try_into()?) as usize;
161161
if tz_len == 0 {
162162
return Ok(TemporalMetadata::Timestamp(time_unit, None));
163163
}
164164

165165
// Attempt to load from len-prefixed bytes
166-
let tz_bytes = &ext_meta.as_ref()[3..(3 + (tz_len as usize))];
167-
let tz = String::from_utf8_lossy(tz_bytes).to_string();
166+
let tz_bytes = &ext_meta.as_ref()[3..][..tz_len];
167+
let tz = str::from_utf8(tz_bytes)
168+
.map_err(|e| vortex_err!("timezone is not valid utf8 string: {e}"))?
169+
.to_string();
168170
Ok(TemporalMetadata::Timestamp(time_unit, Some(tz)))
169171
}
170172

vortex-duckdb/src/convert/expr.rs

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -33,7 +33,10 @@ pub fn try_from_bound_expression(value: &Expression) -> VortexResult<Option<Expr
3333
vortex_bail!("no expression class id {:?}", value.as_class_id())
3434
};
3535
Ok(Some(match value {
36-
ExpressionClass::BoundColumnRef(col_ref) => col(col_ref.name.to_str()?),
36+
ExpressionClass::BoundColumnRef(col_ref) => col(col_ref
37+
.name
38+
.to_str()
39+
.map_err(|e| vortex_err!("invalid utf-8: {e}"))?),
3740
ExpressionClass::BoundConstant(const_) => lit(Scalar::try_from(const_.value)?),
3841
ExpressionClass::BoundComparison(compare) => {
3942
let operator: Operator = compare.op.try_into()?;

vortex-duckdb/src/duckdb/scalar_function.rs

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,7 @@
11
// SPDX-License-Identifier: Apache-2.0
22
// SPDX-FileCopyrightText: Copyright the Vortex contributors
33

4-
use vortex::error::VortexExpect;
4+
use vortex::error::{VortexUnwrap, vortex_err};
55

66
use crate::duckdb::LogicalType;
77
use crate::{cpp, wrapper};
@@ -14,7 +14,8 @@ impl ScalarFunction {
1414
let name_ptr = cpp::duckdb_vx_sfunc_name(self.as_ptr());
1515
std::ffi::CStr::from_ptr(name_ptr)
1616
.to_str()
17-
.vortex_expect("invalid utf-8")
17+
.map_err(|e| vortex_err!("invalid utf-8: {e}"))
18+
.vortex_unwrap()
1819
}
1920
}
2021

0 commit comments

Comments
 (0)