Skip to content

Commit 9a9ff8d

Browse files
perf: Improve performance of hex encoding in spark functions (#19586)
## Which issue does this PR close?

- Part of #19569

## Rationale for this change

Completes the hex encoding optimization work from #19568 by replacing `write!` format strings with lookup tables in the remaining instances (the `hex` and `sha1` functions in the spark module).

## What changes are included in this PR?

Avoid using `write!` with a format string; instead, index a 16-byte lookup table with each byte's high and low nibble and push the two resulting hex characters onto the output string.

## Are these changes tested?

Yes

## Are there any user-facing changes?

No.
1 parent 195d3d6 commit 9a9ff8d

File tree

2 files changed

+21
-16
lines changed

2 files changed

+21
-16
lines changed

datafusion/spark/src/function/hash/sha1.rs

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,6 @@
1616
// under the License.
1717

1818
use std::any::Any;
19-
use std::fmt::Write;
2019
use std::sync::Arc;
2120

2221
use arrow::array::{ArrayRef, StringArray};
@@ -95,11 +94,16 @@ impl ScalarUDFImpl for SparkSha1 {
9594
}
9695
}
9796

97+
/// Hex encoding lookup table for fast byte-to-hex conversion
98+
const HEX_CHARS_LOWER: &[u8; 16] = b"0123456789abcdef";
99+
100+
#[inline]
98101
fn spark_sha1_digest(value: &[u8]) -> String {
99102
let result = Sha1::digest(value);
100103
let mut s = String::with_capacity(result.len() * 2);
101-
for b in result.as_slice() {
102-
write!(&mut s, "{b:02x}").unwrap();
104+
for &b in result.as_slice() {
105+
s.push(HEX_CHARS_LOWER[(b >> 4) as usize] as char);
106+
s.push(HEX_CHARS_LOWER[(b & 0x0f) as usize] as char);
103107
}
104108
s
105109
}

datafusion/spark/src/function/math/hex.rs

Lines changed: 14 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -37,8 +37,6 @@ use datafusion_expr::{
3737
Coercion, ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl, Signature, TypeSignature,
3838
TypeSignatureClass, Volatility,
3939
};
40-
use std::fmt::Write;
41-
4240
/// <https://spark.apache.org/docs/latest/api/sql/index.html#hex>
4341
#[derive(Debug, PartialEq, Eq, Hash)]
4442
pub struct SparkHex {
@@ -116,19 +114,22 @@ fn hex_int64(num: i64) -> String {
116114
format!("{num:X}")
117115
}
118116

119-
#[inline(always)]
117+
/// Hex encoding lookup tables for fast byte-to-hex conversion
118+
const HEX_CHARS_LOWER: &[u8; 16] = b"0123456789abcdef";
119+
const HEX_CHARS_UPPER: &[u8; 16] = b"0123456789ABCDEF";
120+
121+
#[inline]
120122
fn hex_encode<T: AsRef<[u8]>>(data: T, lower_case: bool) -> String {
121-
let mut s = String::with_capacity(data.as_ref().len() * 2);
122-
if lower_case {
123-
for b in data.as_ref() {
124-
// Writing to a string never errors, so we can unwrap here.
125-
write!(&mut s, "{b:02x}").unwrap();
126-
}
123+
let bytes = data.as_ref();
124+
let mut s = String::with_capacity(bytes.len() * 2);
125+
let hex_chars = if lower_case {
126+
HEX_CHARS_LOWER
127127
} else {
128-
for b in data.as_ref() {
129-
// Writing to a string never errors, so we can unwrap here.
130-
write!(&mut s, "{b:02X}").unwrap();
131-
}
128+
HEX_CHARS_UPPER
129+
};
130+
for &b in bytes {
131+
s.push(hex_chars[(b >> 4) as usize] as char);
132+
s.push(hex_chars[(b & 0x0f) as usize] as char);
132133
}
133134
s
134135
}

0 commit comments

Comments
 (0)