@@ -2,12 +2,43 @@ use pythonize::pythonize;
22
33use  pyo3:: exceptions:: PyValueError ; 
44use  pyo3:: prelude:: * ; 
5+ use  pyo3:: types:: PyBytes ; 
56
67use  sqlparser:: parser:: Parser ; 
8+ use  regex:: bytes:: Regex  as  BytesRegex ; 
9+ use  regex:: Regex ; 
710
811mod  opteryx_dialect; 
912pub  use  opteryx_dialect:: OpteryxDialect ; 
1013
/// Convert Python-style numeric backreferences (`\1`, `\2`, ...) into the
/// brace-delimited form used by the `regex` crate (`${1}`, `${2}`, ...).
///
/// Rules applied while scanning `replacement` left to right:
/// - A backslash followed by one or more ASCII digits becomes `${digits}`.
///   The braces matter: the `regex` crate reads `$1abc` as a reference to a
///   group *named* `1abc`, so an unbraced `$1` would swallow any letters,
///   digits or underscores that follow it.
/// - A pair of backslashes (`\\`) is copied through unchanged and consumed as
///   a unit, so a digit following an escaped backslash is not mistaken for a
///   backreference.
/// - A backslash followed by anything else, or at the end of the input, is
///   copied through unchanged.
/// - Every other character, including `$`, is copied through unchanged.
///
/// Returns the converted replacement template as a new `String`.
fn convert_python_to_rust_backrefs(replacement: &str) -> String {
    // Each backreference grows by two bytes (`\1` -> `${1}`); reserve a little slack.
    let mut result = String::with_capacity(replacement.len() + 8);
    let mut chars = replacement.chars().peekable();

    while let Some(ch) = chars.next() {
        if ch != '\\' {
            result.push(ch);
            continue;
        }

        match chars.peek().copied() {
            Some(next) if next.is_ascii_digit() => {
                // Backreference: take the whole digit run so `\12` means group 12,
                // and wrap it in braces so trailing word characters stay literal.
                result.push_str("${");
                while let Some(digit) = chars.next_if(char::is_ascii_digit) {
                    result.push(digit);
                }
                result.push('}');
            }
            Some('\\') => {
                // Escaped backslash: consume both halves together so the second
                // one cannot start a backreference.
                chars.next();
                result.push_str("\\\\");
            }
            // Any other escape sequence, or a trailing backslash: keep the backslash.
            _ => result.push(ch),
        }
    }

    result
}
41+ 
1142/// Function to parse SQL statements from a string. Returns a list with 
1243/// one item per query statement. 
1344/// 
@@ -36,9 +67,107 @@ fn parse_sql(py: Python, sql: String, _dialect: String) -> PyResult<Py<PyAny>> {
3667    Ok ( output. into ( ) ) 
3768} 
3869
70+ /// Fast regex replacement using Rust's regex crate. 
71+ ///  
72+ /// This function performs regex replacement on arrays of strings or bytes, 
73+ /// compiling the pattern once and applying it to all items efficiently. 
74+ ///  
75+ /// Arguments: 
76+ /// - data: List of strings or bytes to process 
77+ /// - pattern: Regex pattern (string or bytes) 
78+ /// - replacement: Replacement string (string or bytes) 
79+ ///  
80+ /// Returns: 
81+ /// - List of strings or bytes with replacements applied 
82+ #[ pyfunction]  
83+ #[ pyo3( text_signature = "(data, pattern, replacement)" ) ]  
84+ fn  regex_replace_rust ( 
85+     py :  Python , 
86+     data :  Vec < Option < Py < PyAny > > > , 
87+     pattern :  Py < PyAny > , 
88+     replacement :  Py < PyAny > , 
89+ )  -> PyResult < Vec < Option < Py < PyAny > > > >  { 
90+     // Check if we're working with bytes or strings 
91+     let  is_bytes = pattern. bind ( py) . is_instance_of :: < PyBytes > ( ) ; 
92+     
93+     if  is_bytes { 
94+         // Bytes mode - use bytes regex 
95+         let  pattern_bytes:  & [ u8 ]  = pattern. extract ( py) ?; 
96+         
97+         // Replacement can be either bytes or string - try both 
98+         let  replacement_str = if  let  Ok ( bytes)  = replacement. extract :: < & [ u8 ] > ( py)  { 
99+             std:: str:: from_utf8 ( bytes) . map_err ( |e| { 
100+                 PyValueError :: new_err ( format ! ( "Invalid UTF-8 in replacement: {}" ,  e) ) 
101+             } ) ?. to_string ( ) 
102+         }  else  if  let  Ok ( s)  = replacement. extract :: < String > ( py)  { 
103+             s
104+         }  else  { 
105+             return  Err ( PyValueError :: new_err ( "Replacement must be bytes or string" ) ) ; 
106+         } ; 
107+         
108+         // Convert Python-style backreferences (\1, \2, etc.) to Rust-style ($1, $2, etc.) 
109+         let  rust_replacement = convert_python_to_rust_backrefs ( & replacement_str) ; 
110+         
111+         // Compile regex once 
112+         let  re = BytesRegex :: new ( std:: str:: from_utf8 ( pattern_bytes) . map_err ( |e| { 
113+             PyValueError :: new_err ( format ! ( "Invalid UTF-8 in pattern: {}" ,  e) ) 
114+         } ) ?) 
115+         . map_err ( |e| PyValueError :: new_err ( format ! ( "Invalid regex pattern: {}" ,  e) ) ) ?; 
116+         
117+         // Process each item 
118+         let  mut  result = Vec :: with_capacity ( data. len ( ) ) ; 
119+         for  item_opt in  data { 
120+             match  item_opt { 
121+                 None  => result. push ( None ) , 
122+                 Some ( item)  => { 
123+                     let  item_bytes:  & [ u8 ]  = item. extract ( py) ?; 
124+                     let  replaced = re. replace_all ( item_bytes,  rust_replacement. as_bytes ( ) ) ; 
125+                     result. push ( Some ( PyBytes :: new ( py,  & replaced) . into ( ) ) ) ; 
126+                 } 
127+             } 
128+         } 
129+         Ok ( result) 
130+     }  else  { 
131+         // String mode - use string regex 
132+         let  pattern_str:  String  = pattern. extract ( py) ?; 
133+         let  replacement_str:  String  = replacement. extract ( py) ?; 
134+         
135+         // Convert Python-style backreferences to Rust-style 
136+         let  rust_replacement = convert_python_to_rust_backrefs ( & replacement_str) ; 
137+         
138+         // Compile regex once 
139+         let  re = Regex :: new ( & pattern_str) 
140+             . map_err ( |e| PyValueError :: new_err ( format ! ( "Invalid regex pattern: {}" ,  e) ) ) ?; 
141+         
142+         // Process each item 
143+         let  mut  result = Vec :: with_capacity ( data. len ( ) ) ; 
144+         for  item_opt in  data { 
145+             match  item_opt { 
146+                 None  => result. push ( None ) , 
147+                 Some ( item)  => { 
148+                     if  let  Ok ( item_bytes)  = item. extract :: < & [ u8 ] > ( py)  { 
149+                         // Item is bytes, convert to string, replace, convert back 
150+                         let  item_str = std:: str:: from_utf8 ( item_bytes) 
151+                             . map_err ( |e| PyValueError :: new_err ( format ! ( "Invalid UTF-8: {}" ,  e) ) ) ?; 
152+                         let  replaced = re. replace_all ( item_str,  & rust_replacement) ; 
153+                         result. push ( Some ( PyBytes :: new ( py,  replaced. as_bytes ( ) ) . into ( ) ) ) ; 
154+                     }  else  { 
155+                         // Item is string 
156+                         let  item_str:  String  = item. extract ( py) ?; 
157+                         let  replaced = re. replace_all ( & item_str,  & rust_replacement) ; 
158+                         result. push ( Some ( PyBytes :: new ( py,  replaced. as_bytes ( ) ) . into ( ) ) ) ; 
159+                     } 
160+                 } 
161+             } 
162+         } 
163+         Ok ( result) 
164+     } 
165+ } 
166+ 
39167
40168#[ pymodule]  
41169fn  compute ( _py :  Python ,  m :  & Bound < ' _ ,  PyModule > )  -> PyResult < ( ) >  { 
42170    m. add_function ( wrap_pyfunction ! ( parse_sql,  m) ?) ?; 
171+     m. add_function ( wrap_pyfunction ! ( regex_replace_rust,  m) ?) ?; 
43172    Ok ( ( ) ) 
44- } 
173+ } 
0 commit comments