Adds replace or null special char in record data upload (#146)

SimonDegrafKern · lumburovskalina · JWittmeyer · web-flow · commit 10faea8584cb · 2023-08-16T18:42:36.000+02:00
* Adds replace or null special char in record data upload * Adds functionality to display nested json as string * Azure platform on embeddings (#145) * Additional data field on the embeddings table * Added azure as embedding platform * Import/export additional data * Submodule dev change --------- Co-authored-by: JWittmeyer <jens.wittmeyer@kern.ai> * updates submodule * Additional check if there are less than 10 records * Restructures function body for pick sample * Fixes function call to pick sample * Adds env variable for lifo queue (#147) * Resolves PR comments * Updates submodule * changed type hints --------- Co-authored-by: lumburovskalina <104008550+lumburovskalina@users.noreply.github.com> Co-authored-by: JWittmeyer <jens.wittmeyer@kern.ai> Co-authored-by: Lina <lina.lumburovska@kern.ai> Co-authored-by: JWittmeyer <91723236+JWittmeyer@users.noreply.github.com>
diff --git a/controller/transfer/util.py b/controller/transfer/util.py
@@ -1,5 +1,6 @@
 import datetime
-from typing import List, Dict, Tuple, Union, Optional
+import json
+from typing import Any, List, Dict, Tuple, Union, Optional
 
 from submodules.model import enums
 from .checks import check_argument_allowed, run_checks, run_limit_checks
@@ -83,6 +84,7 @@ def convert_to_record_dict(
             )
             raise Exception("Upload conversion error", "Upload ran into errors")
         # ensure useable columns dont break the import
+        df = df.replace("\u0000", " ", regex=True)
         df.fillna(" ", inplace=True)
     except Exception as e:
         logger.error(traceback.format_exc())
@@ -100,6 +102,8 @@ def convert_to_record_dict(
     run_limit_checks(df, project_id, user_id)
     run_checks(df, project_id, user_id)
     check_and_convert_category_for_unknown(df, project_id, user_id)
+
+    df = covert_nested_attributes_to_text(df)
     added_col = add_running_id_if_not_present(df, project_id)
     return df.to_dict("records"), added_col
 
@@ -141,6 +145,29 @@ def check_and_convert_category_for_unknown(
         )
 
 
+def covert_nested_attributes_to_text(df: pd.DataFrame) -> pd.DataFrame:
+    """JSON-encode (in place) all values of each column whose sample has a dict."""
+    for column in df.columns:
+        if check_sample_has_dict_values(pick_sample(df, column)):
+            df[column] = df[column].apply(json.dumps)
+    return df
+
+
+def check_sample_has_dict_values(sample: List[Any]) -> bool:
+    """Report whether at least one value in `sample` is a dict."""
+    # isinstance also matches dict subclasses; any() stops at the first hit,
+    # and an empty sample yields False.
+    return any(isinstance(value, dict) for value in sample)
+
+
+def pick_sample(df: pd.DataFrame, key: str, sample_size: int = 10) -> pd.Series:
+    """Return a random sample of at most `sample_size` values from column `key`."""
+    column = df[key]
+    # Series.sample raises when asked for more rows than exist (replace=False),
+    # so the requested size is capped at the column length.
+    return column.sample(min(len(column), sample_size))
+
+
 def string_to_import_option_dict(
     import_string: str, user_id: str, project_id: str
 ) -> Dict[str, Union[str, int]]: