Skip to content

Commit aad3d87

Browse files
authored
Bugfix/support comma in csv header (#40)
* Consistently map keys to CSV header strings
* Reset global state from extract_features_to_csv to support notebooks also calling extract_features directly
1 parent 07a5dd6 commit aad3d87

File tree

4 files changed

+36
-21
lines changed

4 files changed

+36
-21
lines changed

sherlock/features/helpers.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
11
import string
2+
import csv
3+
import io
24

35

46
# https://stackoverflow.com/questions/10593387/when-do-i-need-to-escape-characters-within-a-regex-character-set-within
@@ -87,3 +89,17 @@ def literal_eval_as_str(value, none_value=None):
8789
strings.append(s)
8890

8991
return strings
92+
93+
94+
def keys_to_csv(keys):
95+
"""
96+
Encode a list of strings into an Excel CSV compatible header.
97+
98+
Wraps all items with double quotes to prevent legitimate values containing a comma from being interpreted as a
99+
separator, and encodes existing double quotes with two double quotes.
100+
"""
101+
with io.StringIO() as output:
102+
writer = csv.writer(output, quoting=csv.QUOTE_NONNUMERIC)
103+
writer.writerow(keys)
104+
105+
return output.getvalue()

sherlock/features/preprocessing.py

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
import os
33

44
from collections import OrderedDict
5-
from typing import Union
5+
from typing import Union, Tuple
66

77
import pandas as pd
88

@@ -16,7 +16,7 @@
1616
from sherlock.features.word_embeddings import extract_word_embeddings_features
1717
from sherlock.features.paragraph_vectors import infer_paragraph_embeddings_features
1818
from sherlock.global_state import set_first, reset_first
19-
from sherlock.features.helpers import literal_eval_as_str
19+
from sherlock.features.helpers import literal_eval_as_str, keys_to_csv
2020

2121

2222
def prepare_feature_extraction():
@@ -79,7 +79,7 @@ def convert_string_lists_to_lists(
7979
labels: Union[pd.DataFrame, pd.Series],
8080
data_column_name: str = None,
8181
labels_column_name: str = None,
82-
) -> pd.Series:
82+
) -> Tuple[list, list]:
8383
"""Convert strings of arrays with values to arrays of strings of values.
8484
Each row in the dataframe or series corresponds to a column, represented by a string of a list.
8585
Each string-list will be converted to a list with string values.
@@ -123,7 +123,9 @@ def convert_string_lists_to_lists(
123123
converted_labels = labels.to_list()
124124
else:
125125
raise TypeError("Unexpected data type of labels.")
126-
126+
print("types")
127+
print(type(converted_data))
128+
print(type(converted_labels))
127129
return converted_data, converted_labels
128130

129131

@@ -136,15 +138,15 @@ def load_parquet_values(path):
136138

137139
def extract_features(
138140
output_filename, data: Union[pd.DataFrame, pd.Series]
139-
) -> pd.DataFrame:
141+
):
140142
"""Extract features from raw data.
141143
142144
Parameters
143145
----------
144146
output_filename
145147
filename to output featurized column samples
146148
data
147-
A pandas DataFrame or Series with each row a list of string values.
149+
A pandas DataFrame or Series with each row as a list of string values.
148150
"""
149151
vec_dim = 400
150152
reuse_model = True
@@ -178,10 +180,10 @@ def extract_features(
178180

179181
if first_keys is None:
180182
first_keys = features.keys()
181-
first_keys_str = ",".join(features.keys())
182-
183183
print(f"Exporting {len(first_keys)} column features")
184184

185+
first_keys_str = keys_to_csv(features.keys())
186+
185187
outfile.write(first_keys_str + "\n")
186188

187189
set_first()

sherlock/functional.py

Lines changed: 4 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,3 @@
1-
import csv
2-
import io
31
import multiprocessing
42
import os
53
import random
@@ -16,8 +14,8 @@
1614
from sherlock.features.bag_of_words import extract_bag_of_words_features
1715
from sherlock.features.word_embeddings import extract_word_embeddings_features
1816
from sherlock.features.paragraph_vectors import infer_paragraph_embeddings_features
19-
from sherlock.features.helpers import literal_eval_as_str
20-
from sherlock.global_state import is_first, set_first
17+
from sherlock.features.helpers import literal_eval_as_str, keys_to_csv
18+
from sherlock.global_state import is_first, set_first, reset_first
2119

2220

2321
def as_py_str(x):
@@ -95,14 +93,6 @@ def black_hole(od: OrderedDict):
9593
return None
9694

9795

98-
def keys_to_csv(keys):
99-
with io.StringIO() as output:
100-
writer = csv.writer(output, quoting=csv.QUOTE_NONNUMERIC)
101-
writer.writerow(keys)
102-
103-
return output.getvalue()
104-
105-
10696
def ensure_path_exists(output_path):
10797
path = os.path.dirname(output_path)
10898

@@ -117,6 +107,8 @@ def extract_features_to_csv(output_path, parquet_values):
117107
key_count = 0
118108
core_count = multiprocessing.cpu_count()
119109

110+
reset_first()
111+
120112
# retrieve keys for every row only if verify_keys=True
121113
drop_keys = partial(keys_on_first, first_keys_only=(not verify_keys))
122114

tests/test_helpers.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
from unittest import TestCase
2-
from sherlock.features.helpers import literal_eval_as_str
2+
from sherlock.features.helpers import literal_eval_as_str, keys_to_csv
33

44

55
class Test(TestCase):
@@ -32,3 +32,8 @@ def test_literal_eval_as_str_multiple_commas_in_string(self):
3232

3333
assert result == ['I have, multiple commas, in string which should, be preserved, ',
3434
', another']
35+
36+
def test_keys_to_csv(self):
37+
result = keys_to_csv(['n_[0]-agg-any', 'n_[,]-agg-any', 'n_["]-agg-any'])
38+
39+
assert result == '"n_[0]-agg-any","n_[,]-agg-any","n_[""]-agg-any"\r\n'

0 commit comments

Comments
 (0)