File tree Expand file tree Collapse file tree 1 file changed +11
-4
lines changed
Expand file tree Collapse file tree 1 file changed +11
-4
lines changed Original file line number Diff line number Diff line change 77 NAN_LIKE_ARTIFACTS ,
88 UNICODE_NAN_LIKE_ARTIFACTS ,
99)
10+ from ugly_csv_generator .utils .add_random_spaces import (
11+ SPACES ,
12+ UNICODE_SPACES
13+ )
1014from csv_trimming .logger import logger
1115
1216NAN_LIKE = NAN_LIKE_ARTIFACTS + UNICODE_NAN_LIKE_ARTIFACTS
17+ SPACE_LIKE = sorted (SPACES + UNICODE_SPACES , key = lambda x : - len (x ))
1318
1419
1520def is_nan (candidate : Any ) -> bool :
@@ -36,8 +41,6 @@ def is_nan(candidate: Any) -> bool:
3641class CSVTrimmer :
3742 """Class handling the cleaning up of malformed CSVs using heuristics."""
3843
39- SPACES = ("\n \r " , "\n " , " " )
40-
4144 def __init__ (
4245 self ,
4346 correlation_callback : Optional [
@@ -194,8 +197,12 @@ def _deep_strip(self, string: str):
194197 ----------------------------
195198 String without duplicated spaces.
196199 """
197- for char in CSVTrimmer .SPACES :
198- string = " " .join ([e for e in string .split (char ) if e ])
200+ old_string = None
201+ while old_string != string :
202+ old_string = string
203+ for char in SPACE_LIKE :
204+ if char in string :
205+ string = " " .join (e for e in string .split (char ) if e )
199206 return string .strip ()
200207
201208 def trim_spaces (self , csv : pd .DataFrame ) -> pd .DataFrame :
You can’t perform that action at this time.
0 commit comments