Skip to content

Commit a992d72

Browse files
authored
Merge pull request #2880 from mabel-dev/clickbench-performance-regression-investigation-1
group by performance
2 parents 2ea26aa + 082e25b commit a992d72

File tree

7 files changed

+147
-11
lines changed

7 files changed

+147
-11
lines changed

Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ crate-type = ["cdylib"]
1111
[dependencies]
1212
pythonize = "0.26"
1313
serde = "1.0.171"
14+
regex = "1.10"
1415

1516
[dependencies.pyo3]
1617
version = "0.26"

opteryx/__version__.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
# THIS FILE IS AUTOMATICALLY UPDATED DURING THE BUILD PROCESS
22
# DO NOT EDIT THIS FILE DIRECTLY
33

4-
__build__ = 1710
4+
__build__ = 1712
55
__author__ = "@joocer"
6-
__version__ = "0.26.0-beta.1710"
6+
__version__ = "0.26.0-beta.1712"
77

88
# Store the version here so:
99
# 1) we don't load dependencies by storing it in __init__.py

opteryx/connectors/disk_connector.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -114,8 +114,8 @@ def read_blob(
114114
If an I/O error occurs while reading the file.
115115
"""
116116
from opteryx.compiled.io.disk_reader import read_file_mmap
117-
#from opteryx.compiled.io.disk_reader import unmap_memory
118117

118+
# from opteryx.compiled.io.disk_reader import unmap_memory
119119
# Read using mmap for maximum speed
120120
mmap_obj = read_file_mmap(blob_name)
121121

opteryx/functions/string_functions.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -289,4 +289,15 @@ def match_against(arr, val):
289289

290290

291291
def regex_replace(array, _pattern, _replacement):
292+
"""
293+
Regex replacement using PyArrow's optimized C++ implementation.
294+
295+
PyArrow's replace_substring_regex is already highly optimized and works
296+
directly with Arrow buffers without Python object conversion overhead.
297+
298+
Note: A Rust implementation was attempted but the overhead of converting
299+
PyArrow arrays to Python lists (990x slower than direct buffer access)
300+
made it significantly slower than PyArrow's native implementation.
301+
"""
302+
# Use PyArrow's optimized C++ implementation
292303
return compute.replace_substring_regex(array, _pattern[0], _replacement[0])

opteryx/operators/aggregate_and_group_node.py

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -92,10 +92,6 @@ def execute(self, morsel: pyarrow.Table, **kwargs):
9292
self.buffer,
9393
promote_options="permissive",
9494
)
95-
# Only combine chunks if we haven't done partial aggregation yet
96-
# combine_chunks can fail after partial aggregation due to buffer structure
97-
if not self._partial_aggregated:
98-
table = table.combine_chunks()
9995

10096
# If we've done partial aggregations, the aggregate functions need adjusting
10197
# because columns like "*" have been renamed to "*_count"
@@ -230,8 +226,7 @@ def execute(self, morsel: pyarrow.Table, **kwargs):
230226
self.buffer,
231227
promote_options="permissive",
232228
)
233-
# Only combine chunks once before aggregation
234-
table = table.combine_chunks()
229+
235230
groups = table.group_by(self.group_by_columns)
236231
groups = groups.aggregate(self.aggregate_functions)
237232
self.buffer = [groups] # Replace buffer with partial result

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[project]
22
name = "opteryx"
3-
version = "0.26.0-beta.1710"
3+
version = "0.26.0-beta.1712"
44
description = "Query your data, where it lives"
55
requires-python = '>=3.11'
66
readme = {file = "README.md", content-type = "text/markdown"}

src/lib.rs

Lines changed: 130 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,12 +2,43 @@ use pythonize::pythonize;
22

33
use pyo3::exceptions::PyValueError;
44
use pyo3::prelude::*;
5+
use pyo3::types::PyBytes;
56

67
use sqlparser::parser::Parser;
8+
use regex::bytes::Regex as BytesRegex;
9+
use regex::Regex;
710

811
mod opteryx_dialect;
912
pub use opteryx_dialect::OpteryxDialect;
1013

14+
/// Convert Python-style backreferences (\1, \2, etc.) to Rust-style ($1, $2, etc.)
15+
fn convert_python_to_rust_backrefs(replacement: &str) -> String {
16+
let mut result = String::new();
17+
let mut chars = replacement.chars().peekable();
18+
19+
while let Some(ch) = chars.next() {
20+
if ch == '\\' {
21+
if let Some(&next_ch) = chars.peek() {
22+
if next_ch.is_ascii_digit() {
23+
// This is a backreference like \1
24+
result.push('$');
25+
// Don't consume the next char, just peek
26+
} else {
27+
// Regular escape sequence, keep the backslash
28+
result.push(ch);
29+
}
30+
} else {
31+
// Backslash at end of string
32+
result.push(ch);
33+
}
34+
} else {
35+
result.push(ch);
36+
}
37+
}
38+
39+
result
40+
}
41+
1142
/// Function to parse SQL statements from a string. Returns a list with
1243
/// one item per query statement.
1344
///
@@ -36,9 +67,107 @@ fn parse_sql(py: Python, sql: String, _dialect: String) -> PyResult<Py<PyAny>> {
3667
Ok(output.into())
3768
}
3869

70+
/// Fast regex replacement using Rust's regex crate.
71+
///
72+
/// This function performs regex replacement on arrays of strings or bytes,
73+
/// compiling the pattern once and applying it to all items efficiently.
74+
///
75+
/// Arguments:
76+
/// - data: List of strings or bytes to process
77+
/// - pattern: Regex pattern (string or bytes)
78+
/// - replacement: Replacement string (string or bytes)
79+
///
80+
/// Returns:
81+
/// - List of strings or bytes with replacements applied
82+
#[pyfunction]
83+
#[pyo3(text_signature = "(data, pattern, replacement)")]
84+
fn regex_replace_rust(
85+
py: Python,
86+
data: Vec<Option<Py<PyAny>>>,
87+
pattern: Py<PyAny>,
88+
replacement: Py<PyAny>,
89+
) -> PyResult<Vec<Option<Py<PyAny>>>> {
90+
// Check if we're working with bytes or strings
91+
let is_bytes = pattern.bind(py).is_instance_of::<PyBytes>();
92+
93+
if is_bytes {
94+
// Bytes mode - use bytes regex
95+
let pattern_bytes: &[u8] = pattern.extract(py)?;
96+
97+
// Replacement can be either bytes or string - try both
98+
let replacement_str = if let Ok(bytes) = replacement.extract::<&[u8]>(py) {
99+
std::str::from_utf8(bytes).map_err(|e| {
100+
PyValueError::new_err(format!("Invalid UTF-8 in replacement: {}", e))
101+
})?.to_string()
102+
} else if let Ok(s) = replacement.extract::<String>(py) {
103+
s
104+
} else {
105+
return Err(PyValueError::new_err("Replacement must be bytes or string"));
106+
};
107+
108+
// Convert Python-style backreferences (\1, \2, etc.) to Rust-style ($1, $2, etc.)
109+
let rust_replacement = convert_python_to_rust_backrefs(&replacement_str);
110+
111+
// Compile regex once
112+
let re = BytesRegex::new(std::str::from_utf8(pattern_bytes).map_err(|e| {
113+
PyValueError::new_err(format!("Invalid UTF-8 in pattern: {}", e))
114+
})?)
115+
.map_err(|e| PyValueError::new_err(format!("Invalid regex pattern: {}", e)))?;
116+
117+
// Process each item
118+
let mut result = Vec::with_capacity(data.len());
119+
for item_opt in data {
120+
match item_opt {
121+
None => result.push(None),
122+
Some(item) => {
123+
let item_bytes: &[u8] = item.extract(py)?;
124+
let replaced = re.replace_all(item_bytes, rust_replacement.as_bytes());
125+
result.push(Some(PyBytes::new(py, &replaced).into()));
126+
}
127+
}
128+
}
129+
Ok(result)
130+
} else {
131+
// String mode - use string regex
132+
let pattern_str: String = pattern.extract(py)?;
133+
let replacement_str: String = replacement.extract(py)?;
134+
135+
// Convert Python-style backreferences to Rust-style
136+
let rust_replacement = convert_python_to_rust_backrefs(&replacement_str);
137+
138+
// Compile regex once
139+
let re = Regex::new(&pattern_str)
140+
.map_err(|e| PyValueError::new_err(format!("Invalid regex pattern: {}", e)))?;
141+
142+
// Process each item
143+
let mut result = Vec::with_capacity(data.len());
144+
for item_opt in data {
145+
match item_opt {
146+
None => result.push(None),
147+
Some(item) => {
148+
if let Ok(item_bytes) = item.extract::<&[u8]>(py) {
149+
// Item is bytes, convert to string, replace, convert back
150+
let item_str = std::str::from_utf8(item_bytes)
151+
.map_err(|e| PyValueError::new_err(format!("Invalid UTF-8: {}", e)))?;
152+
let replaced = re.replace_all(item_str, &rust_replacement);
153+
result.push(Some(PyBytes::new(py, replaced.as_bytes()).into()));
154+
} else {
155+
// Item is string
156+
let item_str: String = item.extract(py)?;
157+
let replaced = re.replace_all(&item_str, &rust_replacement);
158+
result.push(Some(PyBytes::new(py, replaced.as_bytes()).into()));
159+
}
160+
}
161+
}
162+
}
163+
Ok(result)
164+
}
165+
}
166+
39167

40168
#[pymodule]
41169
fn compute(_py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> {
42170
m.add_function(wrap_pyfunction!(parse_sql, m)?)?;
171+
m.add_function(wrap_pyfunction!(regex_replace_rust, m)?)?;
43172
Ok(())
44-
}
173+
}

0 commit comments

Comments
 (0)