Bugfix/support comma in csv header (#40)

lowecg · web-flow · commit aad3d8794cfb · 2022-02-20T13:25:56.000+01:00
* consistently map keys to CSV header strings

* reset global state from extract_features_to_csv to support notebooks also calling extract_features directly
diff --git a/sherlock/features/helpers.py b/sherlock/features/helpers.py
@@ -1,4 +1,6 @@
 import string
+import csv
+import io
 
 
 # https://stackoverflow.com/questions/10593387/when-do-i-need-to-escape-characters-within-a-regex-character-set-within
@@ -87,3 +89,17 @@ def literal_eval_as_str(value, none_value=None):
             strings.append(s)
 
     return strings
+
+
+def keys_to_csv(keys):
+    """
+    Encode an iterable of strings as one Excel-compatible CSV header row.
+
+    Each string is wrapped in double quotes so a comma inside a key is not read as a separator, and an embedded double
+    quote is doubled. The result ends with the writer's line terminator, so appending a newline adds an empty line.
+    """
+    with io.StringIO() as output:
+        writer = csv.writer(output, quoting=csv.QUOTE_NONNUMERIC)  # quote every non-numeric field
+        writer.writerow(keys)
+
+        return output.getvalue()  # read the buffer before the with-block closes it
diff --git a/sherlock/features/preprocessing.py b/sherlock/features/preprocessing.py
@@ -2,7 +2,7 @@
 import os
 
 from collections import OrderedDict
-from typing import Union
+from typing import Union, Tuple
 
 import pandas as pd
 
@@ -16,7 +16,7 @@
 from sherlock.features.word_embeddings import extract_word_embeddings_features
 from sherlock.features.paragraph_vectors import infer_paragraph_embeddings_features
 from sherlock.global_state import set_first, reset_first
-from sherlock.features.helpers import literal_eval_as_str
+from sherlock.features.helpers import literal_eval_as_str, keys_to_csv
 
 
 def prepare_feature_extraction():
@@ -79,7 +79,7 @@ def convert_string_lists_to_lists(
     labels: Union[pd.DataFrame, pd.Series],
     data_column_name: str = None,
     labels_column_name: str = None,
-) -> pd.Series:
+) -> Tuple[list, list]:
     """Convert strings of arrays with values to arrays of strings of values.
     Each row in de dataframe or series corresponds to a column, represented by a string of a list.
     Each string-list will be converted to a list with string values.
@@ -123,7 +123,9 @@ def convert_string_lists_to_lists(
         converted_labels = labels.to_list()
     else:
         raise TypeError("Unexpected data type of labels.")
-
+    print("types")
+    print(type(converted_data))
+    print(type(converted_labels))
     return converted_data, converted_labels
 
 
@@ -136,15 +138,15 @@ def load_parquet_values(path):
 
 def extract_features(
     output_filename, data: Union[pd.DataFrame, pd.Series]
-) -> pd.DataFrame:
+):
     """Extract features from raw data.
 
     Parameters
     ----------
     output_filename
         filename to output featurized column samples
     data
-        A pandas DataFrame or Series with each row a list of string values.
+        A pandas DataFrame or Series with each row as a list of string values.
     """
     vec_dim = 400
     reuse_model = True
@@ -178,10 +180,10 @@ def extract_features(
 
             if first_keys is None:
                 first_keys = features.keys()
-                first_keys_str = ",".join(features.keys())
-
                 print(f"Exporting {len(first_keys)} column features")
 
+                first_keys_str = keys_to_csv(features.keys())
+
                 outfile.write(first_keys_str + "\n")
 
                 set_first()
diff --git a/sherlock/functional.py b/sherlock/functional.py
@@ -1,5 +1,3 @@
-import csv
-import io
 import multiprocessing
 import os
 import random
@@ -16,8 +14,8 @@
 from sherlock.features.bag_of_words import extract_bag_of_words_features
 from sherlock.features.word_embeddings import extract_word_embeddings_features
 from sherlock.features.paragraph_vectors import infer_paragraph_embeddings_features
-from sherlock.features.helpers import literal_eval_as_str
-from sherlock.global_state import is_first, set_first
+from sherlock.features.helpers import literal_eval_as_str, keys_to_csv
+from sherlock.global_state import is_first, set_first, reset_first
 
 
 def as_py_str(x):
@@ -95,14 +93,6 @@ def black_hole(od: OrderedDict):
     return None
 
 
-def keys_to_csv(keys):
-    with io.StringIO() as output:
-        writer = csv.writer(output, quoting=csv.QUOTE_NONNUMERIC)
-        writer.writerow(keys)
-
-        return output.getvalue()
-
-
 def ensure_path_exists(output_path):
     path = os.path.dirname(output_path)
 
@@ -117,6 +107,8 @@ def extract_features_to_csv(output_path, parquet_values):
     key_count = 0
     core_count = multiprocessing.cpu_count()
 
+    reset_first()
+
     # retrieve keys for every row only if verify_keys=True
     drop_keys = partial(keys_on_first, first_keys_only=(not verify_keys))
 
diff --git a/tests/test_helpers.py b/tests/test_helpers.py
@@ -1,5 +1,5 @@
 from unittest import TestCase
-from sherlock.features.helpers import literal_eval_as_str
+from sherlock.features.helpers import literal_eval_as_str, keys_to_csv
 
 
 class Test(TestCase):
@@ -32,3 +32,8 @@ def test_literal_eval_as_str_multiple_commas_in_string(self):
 
         assert result == ['I have, multiple commas, in string which should, be preserved, ',
                           ', another']
+
+    def test_keys_to_csv(self):
+        result = keys_to_csv(['n_[0]-agg-any', 'n_[,]-agg-any', 'n_["]-agg-any'])
+        # every key is quoted, the embedded double quote is doubled, and the row ends with csv.writer's CRLF terminator
+        assert result == '"n_[0]-agg-any","n_[,]-agg-any","n_[""]-agg-any"\r\n'