@@ -2,12 +2,43 @@ use pythonize::pythonize;
22
33use pyo3:: exceptions:: PyValueError ;
44use pyo3:: prelude:: * ;
5+ use pyo3:: types:: PyBytes ;
56
67use sqlparser:: parser:: Parser ;
8+ use regex:: bytes:: Regex as BytesRegex ;
9+ use regex:: Regex ;
710
811mod opteryx_dialect;
912pub use opteryx_dialect:: OpteryxDialect ;
1013
/// Convert Python-style numeric backreferences (`\1`, `\2`, ...) in a
/// replacement template into the braced form understood by the `regex`
/// crate (`${1}`, `${2}`, ...).
///
/// Rules applied while scanning `replacement` left to right:
///
/// - A backslash followed by one or more ASCII digits becomes `${digits}`.
///   The braces matter: the `regex` crate reads the longest run of
///   `[_0-9A-Za-z]` after a bare `$` as the group name, so an unbraced
///   `$1abc` would refer to a (non-existent) group called `1abc` rather
///   than group 1 followed by the text `abc`.
/// - A pair of backslashes (`\\`) is copied through unchanged and consumed
///   as a unit, so the second backslash can never start a backreference
///   (`\\1` stays `\\1`).
/// - A backslash followed by anything else, or a trailing backslash, is
///   copied through unchanged.
/// - Every other character is copied through unchanged.
///
/// NOTE(review): a literal `$` in the input is not escaped, so the `regex`
/// crate will still interpret `$name` sequences in the result. Confirm
/// whether callers rely on that before changing it.
///
/// Returns the converted template as a new `String`.
fn convert_python_to_rust_backrefs(replacement: &str) -> String {
    // Most templates grow by only a few bytes; the input length is a good
    // starting capacity.
    let mut result = String::with_capacity(replacement.len());
    let mut chars = replacement.chars().peekable();

    while let Some(ch) = chars.next() {
        if ch != '\\' {
            result.push(ch);
            continue;
        }

        match chars.peek().copied() {
            Some(next_ch) if next_ch.is_ascii_digit() => {
                // Backreference: take the whole digit run so the group
                // number is unambiguous inside the braces.
                result.push_str("${");
                while let Some(digit) = chars.next_if(|c| c.is_ascii_digit()) {
                    result.push(digit);
                }
                result.push('}');
            }
            Some('\\') => {
                // Escaped backslash: consume both so the second one is not
                // re-examined as the start of a backreference.
                chars.next();
                result.push_str("\\\\");
            }
            // Some other escape, or a backslash at the end of the string:
            // keep the backslash and let the main loop copy what follows.
            _ => result.push(ch),
        }
    }

    result
}
41+
1142/// Function to parse SQL statements from a string. Returns a list with
1243/// one item per query statement.
1344///
@@ -36,9 +67,107 @@ fn parse_sql(py: Python, sql: String, _dialect: String) -> PyResult<Py<PyAny>> {
3667 Ok ( output. into ( ) )
3768}
3869
70+ /// Fast regex replacement using Rust's regex crate.
71+ ///
72+ /// This function performs regex replacement on arrays of strings or bytes,
73+ /// compiling the pattern once and applying it to all items efficiently.
74+ ///
75+ /// Arguments:
76+ /// - data: List of strings or bytes to process
77+ /// - pattern: Regex pattern (string or bytes)
78+ /// - replacement: Replacement string (string or bytes)
79+ ///
80+ /// Returns:
81+ /// - List of strings or bytes with replacements applied
82+ #[ pyfunction]
83+ #[ pyo3( text_signature = "(data, pattern, replacement)" ) ]
84+ fn regex_replace_rust (
85+ py : Python ,
86+ data : Vec < Option < Py < PyAny > > > ,
87+ pattern : Py < PyAny > ,
88+ replacement : Py < PyAny > ,
89+ ) -> PyResult < Vec < Option < Py < PyAny > > > > {
90+ // Check if we're working with bytes or strings
91+ let is_bytes = pattern. bind ( py) . is_instance_of :: < PyBytes > ( ) ;
92+
93+ if is_bytes {
94+ // Bytes mode - use bytes regex
95+ let pattern_bytes: & [ u8 ] = pattern. extract ( py) ?;
96+
97+ // Replacement can be either bytes or string - try both
98+ let replacement_str = if let Ok ( bytes) = replacement. extract :: < & [ u8 ] > ( py) {
99+ std:: str:: from_utf8 ( bytes) . map_err ( |e| {
100+ PyValueError :: new_err ( format ! ( "Invalid UTF-8 in replacement: {}" , e) )
101+ } ) ?. to_string ( )
102+ } else if let Ok ( s) = replacement. extract :: < String > ( py) {
103+ s
104+ } else {
105+ return Err ( PyValueError :: new_err ( "Replacement must be bytes or string" ) ) ;
106+ } ;
107+
108+ // Convert Python-style backreferences (\1, \2, etc.) to Rust-style ($1, $2, etc.)
109+ let rust_replacement = convert_python_to_rust_backrefs ( & replacement_str) ;
110+
111+ // Compile regex once
112+ let re = BytesRegex :: new ( std:: str:: from_utf8 ( pattern_bytes) . map_err ( |e| {
113+ PyValueError :: new_err ( format ! ( "Invalid UTF-8 in pattern: {}" , e) )
114+ } ) ?)
115+ . map_err ( |e| PyValueError :: new_err ( format ! ( "Invalid regex pattern: {}" , e) ) ) ?;
116+
117+ // Process each item
118+ let mut result = Vec :: with_capacity ( data. len ( ) ) ;
119+ for item_opt in data {
120+ match item_opt {
121+ None => result. push ( None ) ,
122+ Some ( item) => {
123+ let item_bytes: & [ u8 ] = item. extract ( py) ?;
124+ let replaced = re. replace_all ( item_bytes, rust_replacement. as_bytes ( ) ) ;
125+ result. push ( Some ( PyBytes :: new ( py, & replaced) . into ( ) ) ) ;
126+ }
127+ }
128+ }
129+ Ok ( result)
130+ } else {
131+ // String mode - use string regex
132+ let pattern_str: String = pattern. extract ( py) ?;
133+ let replacement_str: String = replacement. extract ( py) ?;
134+
135+ // Convert Python-style backreferences to Rust-style
136+ let rust_replacement = convert_python_to_rust_backrefs ( & replacement_str) ;
137+
138+ // Compile regex once
139+ let re = Regex :: new ( & pattern_str)
140+ . map_err ( |e| PyValueError :: new_err ( format ! ( "Invalid regex pattern: {}" , e) ) ) ?;
141+
142+ // Process each item
143+ let mut result = Vec :: with_capacity ( data. len ( ) ) ;
144+ for item_opt in data {
145+ match item_opt {
146+ None => result. push ( None ) ,
147+ Some ( item) => {
148+ if let Ok ( item_bytes) = item. extract :: < & [ u8 ] > ( py) {
149+ // Item is bytes, convert to string, replace, convert back
150+ let item_str = std:: str:: from_utf8 ( item_bytes)
151+ . map_err ( |e| PyValueError :: new_err ( format ! ( "Invalid UTF-8: {}" , e) ) ) ?;
152+ let replaced = re. replace_all ( item_str, & rust_replacement) ;
153+ result. push ( Some ( PyBytes :: new ( py, replaced. as_bytes ( ) ) . into ( ) ) ) ;
154+ } else {
155+ // Item is string
156+ let item_str: String = item. extract ( py) ?;
157+ let replaced = re. replace_all ( & item_str, & rust_replacement) ;
158+ result. push ( Some ( PyBytes :: new ( py, replaced. as_bytes ( ) ) . into ( ) ) ) ;
159+ }
160+ }
161+ }
162+ }
163+ Ok ( result)
164+ }
165+ }
166+
39167
40168#[ pymodule]
41169fn compute ( _py : Python , m : & Bound < ' _ , PyModule > ) -> PyResult < ( ) > {
42170 m. add_function ( wrap_pyfunction ! ( parse_sql, m) ?) ?;
171+ m. add_function ( wrap_pyfunction ! ( regex_replace_rust, m) ?) ?;
43172 Ok ( ( ) )
44- }
173+ }
0 commit comments