22import logging
33import os
44import re
5- from typing import Dict , List , Optional , Tuple , Union
5+ from typing import Dict , List , Optional , Tuple , Union , cast
66
77from ply import lex , yacc
88
1919IN_COM = "--"
2020MYSQL_COM = "#"
2121
22+ LF_IN_QUOTE = r"\N"
23+
2224
2325def set_logging_config (
2426 log_level : Union [str , int ], log_file : Optional [str ] = None
@@ -173,7 +175,7 @@ def process_regex_input(self, data):
173175 return data
174176
175177 def pre_process_data (self , data ):
176- data = data .decode ("utf-8" )
178+ data = cast ( str , data .decode ("utf-8" ) )
177179 # todo: not sure how to workaround ',' normal way
178180 if "input.regex" in data :
179181 data = self .process_regex_input (data )
@@ -182,37 +184,44 @@ def pre_process_data(self, data):
182184 result = []
183185 in_quote = False
184186 i = 0
185- symbol_spacing_map = {
186- "," : " , " ,
187- "(" : " ( " ,
188- ")" : " ) " ,
189- }
187+ symbol_spacing_map = {"," , "(" , ")" }
188+
189+ # Special handling for odd number of single quotes
190+ if data .count ("'" ) % 2 != 0 :
191+ data = data .replace ("\\ '" , "pars_m_single" )
192+
190193 while i < len (data ):
191194 char = data [i ]
195+ startswith = data [i :].startswith
192196
193197 # Handle quote start/end
194- if char == "'" and ( i == 0 or data [ i - 1 ] != " \\ " ) :
198+ if char == "'" :
195199 in_quote = not in_quote
196200 result .append (char )
201+
202+ # Handle line feeds in quotes
203+ elif in_quote and startswith ("\\ n" ):
204+ result .append (LF_IN_QUOTE )
205+ i += 1
206+
207+ # Handle special unicode quotes
208+ elif not in_quote and (startswith (r"\u2018" ) or startswith (r"\u2019" )):
209+ result .append ("'" )
210+ i += 5
211+
197212 # Handle symbols that need spacing
198213 elif not in_quote and char in symbol_spacing_map :
199- result .append (symbol_spacing_map [char ])
214+ result .append (f" { char } " )
215+
200216 # Keep all other characters as-is
201217 else :
202218 result .append (char )
203219
204220 i += 1
205221
206222 data = "" .join (result )
207-
208- if data .count ("'" ) % 2 != 0 :
209- data = data .replace ("\\ '" , "pars_m_single" )
210223 data = (
211224 data .replace ("\\ x" , "\\ 0" )
212- .replace ("‘" , "'" )
213- .replace ("’" , "'" )
214- .replace ("\\ u2018" , "'" )
215- .replace ("\\ u2019" , "'" )
216225 .replace ("'\\ t'" , "'pars_m_t'" )
217226 .replace ("\\ t" , " " )
218227 )
@@ -296,7 +305,13 @@ def process_line(
296305 ) -> Tuple [Optional [str ], bool ]:
297306 self .pre_process_line ()
298307
299- self .line = self .line .strip ().replace ("\n " , "" ).replace ("\t " , "" )
308+ # Remove whitespace, while preserving newlines in quotes
309+ self .line = (
310+ self .line .strip ()
311+ .replace ("\n " , "" )
312+ .replace ("\t " , "" )
313+ .replace (LF_IN_QUOTE , "\\ n" )
314+ )
300315 self .skip = self .check_line_on_skip_words ()
301316
302317 self .parse_set_statement ()
0 commit comments