Skip to content

Commit a61da54

Browse files
Adding support for UTF space-like characters from ugly-csv generator
1 parent 3560d25 commit a61da54

File tree

1 file changed

+11
-4
lines changed

1 file changed

+11
-4
lines changed

csv_trimming/trim.py

Lines changed: 11 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -7,9 +7,14 @@
77
NAN_LIKE_ARTIFACTS,
88
UNICODE_NAN_LIKE_ARTIFACTS,
99
)
10+
from ugly_csv_generator.utils.add_random_spaces import (
11+
SPACES,
12+
UNICODE_SPACES
13+
)
1014
from csv_trimming.logger import logger
1115

1216
NAN_LIKE = NAN_LIKE_ARTIFACTS + UNICODE_NAN_LIKE_ARTIFACTS
17+
SPACE_LIKE = sorted(SPACES + UNICODE_SPACES, key=lambda x: -len(x))
1318

1419

1520
def is_nan(candidate: Any) -> bool:
@@ -36,8 +41,6 @@ def is_nan(candidate: Any) -> bool:
3641
class CSVTrimmer:
3742
"""Class handling the cleaning up of malformed CSVs using heuristics."""
3843

39-
SPACES = ("\n\r", "\n", " ")
40-
4144
def __init__(
4245
self,
4346
correlation_callback: Optional[
@@ -194,8 +197,12 @@ def _deep_strip(self, string: str):
194197
----------------------------
195198
String without duplicated spaces.
196199
"""
197-
for char in CSVTrimmer.SPACES:
198-
string = " ".join([e for e in string.split(char) if e])
200+
old_string = None
201+
while old_string != string:
202+
old_string = string
203+
for char in SPACE_LIKE:
204+
if char in string:
205+
string = " ".join(e for e in string.split(char) if e)
199206
return string.strip()
200207

201208
def trim_spaces(self, csv: pd.DataFrame) -> pd.DataFrame:

0 commit comments

Comments (0)