Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions datafusion/functions/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -264,3 +264,8 @@ required-features = ["string_expressions"]
harness = false
name = "ends_with"
required-features = ["string_expressions"]

# Benchmark target for the `translate` function (benches/translate.rs).
# `harness = false` turns off the default libtest harness so the benchmark
# file can supply its own entry point; the target is only built when the
# `unicode_expressions` feature is enabled.
[[bench]]
harness = false
name = "translate"
required-features = ["unicode_expressions"]
90 changes: 90 additions & 0 deletions datafusion/functions/benches/translate.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

extern crate criterion;

use arrow::array::OffsetSizeTrait;
use arrow::datatypes::{DataType, Field};
use arrow::util::bench_util::create_string_array_with_len;
use criterion::{Criterion, SamplingMode, criterion_group, criterion_main};
use datafusion_common::DataFusionError;
use datafusion_common::config::ConfigOptions;
use datafusion_expr::{ColumnarValue, ScalarFunctionArgs};
use datafusion_functions::unicode;
use std::hint::black_box;
use std::sync::Arc;
use std::time::Duration;

/// Builds the three array arguments passed to `translate`, in call order:
/// the input strings, the `from` strings and the `to` strings.
///
/// Every array has `size` rows and is generated with a null density of `0.1`.
/// The input strings are created with length `str_len`; the `from` and `to`
/// arrays use fixed lengths of 3 and 2 respectively.
fn create_args<O: OffsetSizeTrait>(size: usize, str_len: usize) -> Vec<ColumnarValue> {
    // Per-argument string length, in the order: string, from, to.
    [str_len, 3, 2]
        .into_iter()
        .map(|len| {
            ColumnarValue::Array(Arc::new(create_string_array_with_len::<O>(
                size, 0.1, len,
            )))
        })
        .collect()
}

/// Invokes the `translate` scalar function on `args`.
///
/// A nullable field named `arg_<index>` is derived from each argument's data
/// type, the return field is declared as nullable `Utf8`, and a default
/// `ConfigOptions` is supplied. `number_rows` is forwarded unchanged.
///
/// Returns whatever `invoke_with_args` returns, including any error.
fn invoke_translate_with_args(
    args: Vec<ColumnarValue>,
    number_rows: usize,
) -> Result<ColumnarValue, DataFusionError> {
    // One nullable field per argument, named after its position.
    let mut arg_fields = Vec::with_capacity(args.len());
    for (position, value) in args.iter().enumerate() {
        let field = Field::new(format!("arg_{position}"), value.data_type(), true);
        arg_fields.push(field.into());
    }

    let return_field = Field::new("f", DataType::Utf8, true).into();

    let call = ScalarFunctionArgs {
        args,
        arg_fields,
        number_rows,
        return_field,
        config_options: Arc::new(ConfigOptions::default()),
    };
    unicode::translate().invoke_with_args(call)
}

/// Benchmarks `translate` over every combination of row count (1024, 4096)
/// and input string length (8, 32), using `i32`-offset string arrays.
///
/// Each row count gets its own benchmark group, configured with flat
/// sampling, 10 samples and a 10-second measurement time.
fn criterion_benchmark(c: &mut Criterion) {
    const ROW_COUNTS: [usize; 2] = [1024, 4096];
    const STRING_LENGTHS: [usize; 2] = [8, 32];

    for size in ROW_COUNTS {
        let mut group = c.benchmark_group(format!("translate size={size}"));
        group
            .sampling_mode(SamplingMode::Flat)
            .sample_size(10)
            .measurement_time(Duration::from_secs(10));

        for str_len in STRING_LENGTHS {
            // Inputs are generated once per case, outside the timed closure.
            let args = create_args::<i32>(size, str_len);
            let bench_name =
                format!("translate_string [size={size}, str_len={str_len}]");

            group.bench_function(bench_name, |b| {
                // The invocation consumes its arguments, so each iteration
                // hands it a fresh clone of the argument vector.
                b.iter(|| black_box(invoke_translate_with_args(args.clone(), size)))
            });
        }

        group.finish();
    }
}

// Register `criterion_benchmark` under the group name `benches`, then let
// Criterion generate the benchmark entry point that runs that group.
criterion_group!(benches, criterion_benchmark);
criterion_main!(benches);
56 changes: 35 additions & 21 deletions datafusion/functions/src/unicode/translate.rs
Original file line number Diff line number Diff line change
Expand Up @@ -148,34 +148,48 @@ where
let from_array_iter = ArrayIter::new(from_array);
let to_array_iter = ArrayIter::new(to_array);

// Reusable buffers to avoid allocating for each row
let mut from_map: HashMap<&str, usize> = HashMap::new();
let mut from_graphemes: Vec<&str> = Vec::new();
let mut to_graphemes: Vec<&str> = Vec::new();
let mut string_graphemes: Vec<&str> = Vec::new();
let mut result_graphemes: Vec<&str> = Vec::new();

let result = string_array_iter
.zip(from_array_iter)
.zip(to_array_iter)
.map(|((string, from), to)| match (string, from, to) {
(Some(string), Some(from), Some(to)) => {
// create a hashmap of [char, index] to change from O(n) to O(1) for from list
let from_map: HashMap<&str, usize> = from
.graphemes(true)
.collect::<Vec<&str>>()
.iter()
.enumerate()
.map(|(index, c)| (c.to_owned(), index))
.collect();
// Clear and reuse buffers
from_map.clear();
from_graphemes.clear();
to_graphemes.clear();
string_graphemes.clear();
result_graphemes.clear();

// Build from_map using reusable buffer
from_graphemes.extend(from.graphemes(true));
for (index, c) in from_graphemes.iter().enumerate() {
from_map.insert(*c, index);
}

// Build to_graphemes
to_graphemes.extend(to.graphemes(true));

let to = to.graphemes(true).collect::<Vec<&str>>();
// Process string and build result
string_graphemes.extend(string.graphemes(true));
for c in &string_graphemes {
match from_map.get(*c) {
Some(n) => {
if let Some(replacement) = to_graphemes.get(*n) {
result_graphemes.push(*replacement);
}
}
None => result_graphemes.push(*c),
}
}

Some(
string
.graphemes(true)
.collect::<Vec<&str>>()
.iter()
.flat_map(|c| match from_map.get(*c) {
Some(n) => to.get(*n).copied(),
None => Some(*c),
})
.collect::<Vec<&str>>()
.concat(),
)
Some(result_graphemes.concat())
}
_ => None,
})
Expand Down