Skip to content

Commit 10faea8

Browse files
SimonDegrafKernlumburovskalinaJWittmeyer
authored
Adds replace or null special char in record data upload (#146)
* Adds replace or null special char in record data upload
* Adds functionality to display nested json as string
* Azure platform on embeddings (#145)
* Additional data field on the embeddings table
* Added azure as embedding platform
* Import/export additional data
* Submodule dev change

---------

Co-authored-by: JWittmeyer <[email protected]>

* updates submodule
* Additional check if there are less than 10 records
* Restructures function body for pick sample
* Fixes function call to pick sample
* Adds env variable for lifo queue (#147)
* Resolves PR comments
* Updates submodule
* changed type hints

---------

Co-authored-by: lumburovskalina <[email protected]>
Co-authored-by: JWittmeyer <[email protected]>
Co-authored-by: Lina <[email protected]>
Co-authored-by: JWittmeyer <[email protected]>
1 parent 65fe93c commit 10faea8

File tree

1 file changed

+28
-1
lines changed

1 file changed

+28
-1
lines changed

controller/transfer/util.py

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import datetime
2-
from typing import List, Dict, Tuple, Union, Optional
2+
import json
3+
from typing import Any, List, Dict, Tuple, Union, Optional
34

45
from submodules.model import enums
56
from .checks import check_argument_allowed, run_checks, run_limit_checks
@@ -83,6 +84,7 @@ def convert_to_record_dict(
8384
)
8485
raise Exception("Upload conversion error", "Upload ran into errors")
8586
# ensure useable columns dont break the import
87+
df = df.replace("\u0000", " ", regex=True)
8688
df.fillna(" ", inplace=True)
8789
except Exception as e:
8890
logger.error(traceback.format_exc())
@@ -100,6 +102,8 @@ def convert_to_record_dict(
100102
run_limit_checks(df, project_id, user_id)
101103
run_checks(df, project_id, user_id)
102104
check_and_convert_category_for_unknown(df, project_id, user_id)
105+
106+
df = covert_nested_attributes_to_text(df)
103107
added_col = add_running_id_if_not_present(df, project_id)
104108
return df.to_dict("records"), added_col
105109

@@ -141,6 +145,29 @@ def check_and_convert_category_for_unknown(
141145
)
142146

143147

148+
def covert_nested_attributes_to_text(df: pd.DataFrame) -> pd.DataFrame:
    """Turn columns that appear to hold nested (dict) values into JSON text.

    For each column a sample is drawn with ``pick_sample``. When
    ``check_sample_has_dict_values`` finds at least one dict in that sample,
    every value of the column is replaced by its ``json.dumps`` output.

    The passed frame is modified through column assignment and is also
    returned.
    """
    for column_name in df.columns:
        drawn_values = pick_sample(df, column_name)
        if not check_sample_has_dict_values(drawn_values):
            continue
        # The whole column is serialized, not only the entries that are dicts.
        df[column_name] = df[column_name].apply(json.dumps)
    return df
154+
155+
156+
def check_sample_has_dict_values(sample: List[Any]) -> bool:
    """Return True if at least one element of ``sample`` is a ``dict``."""
    return any(isinstance(entry, dict) for entry in sample)
161+
162+
163+
def pick_sample(df: pd.DataFrame, key: str, sample_size: int = 10) -> pd.Series:
    """Draw a random sample of values from column ``key`` of ``df``.

    At most ``sample_size`` entries are drawn. If the column holds no more
    than ``sample_size`` entries, all of them are sampled.
    """
    column = df[key]
    # Cap the request at the column length so short columns are sampled whole.
    draw_count = min(len(column), sample_size)
    return column.sample(draw_count)
169+
170+
144171
def string_to_import_option_dict(
145172
import_string: str, user_id: str, project_id: str
146173
) -> Dict[str, Union[str, int]]:

0 commit comments

Comments
 (0)