Skip to content

Commit cc10831

Browse files
authored
Adds max size to integer columns & new category strategy (#178)
* Adds max size to integer columns & new category strategy * Adds comment
1 parent bd28090 commit cc10831

File tree

1 file changed

+11
-5
lines changed

1 file changed

+11
-5
lines changed

util/category.py

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -14,18 +14,24 @@ def infer_category(file_name: str) -> str:
1414
def infer_category_enum(df: pd.DataFrame, df_col: str) -> str:
    """Infer the ``enums.DataTypes`` value that best describes a column.

    The decision is driven by the pandas dtype name of ``df[df_col]``:

    * ``int64``   -> INTEGER, or TEXT when any value falls outside the signed
      32-bit range and therefore cannot be stored in a database INTEGER.
    * ``float64`` -> FLOAT
    * ``bool``    -> BOOLEAN
    * ``object``  -> CATEGORY when the number of distinct non-null values is
      at most 20% of the non-null row count and every value is shorter than
      50 characters; TEXT otherwise.
    * anything else -> UNKNOWN

    Args:
        df: Data frame that contains the column.
        df_col: Name of the column to inspect.

    Returns:
        The ``.value`` of the matching ``enums.DataTypes`` member.
    """
    # Bounds of a signed 32-bit integer. The upper bound is the one the
    # original check used; the lower bound is its counterpart.
    # NOTE(review): assumes the target INTEGER column is signed 32-bit - confirm.
    int32_min, int32_max = -2_147_483_648, 2_147_483_647
    max_category_ratio = 0.2
    max_category_length = 50

    column = df[df_col]
    type_name = column.dtype.name

    if type_name == "int64":
        # Check every value rather than a sample: the comparison is
        # vectorised, so scanning a plain integer column is cheap.
        if ((column > int32_max) | (column < int32_min)).any():
            # Doesn't fit in the database INTEGER type.
            return enums.DataTypes.TEXT.value
        return enums.DataTypes.INTEGER.value

    if type_name == "float64":
        return enums.DataTypes.FLOAT.value

    if type_name == "bool":
        return enums.DataTypes.BOOLEAN.value

    if type_name == "object":
        values = column.dropna()
        if values.empty:
            # Nothing to measure, so there is no evidence for a category.
            return enums.DataTypes.TEXT.value

        # Measure the string form of every value. The ``.str`` accessor
        # raises AttributeError on object columns that hold only
        # non-string values (ints, dates, ...).
        longest = values.map(lambda value: len(str(value))).max()
        few_distinct = values.nunique() <= len(values) * max_category_ratio
        if few_distinct and longest < max_category_length:
            return enums.DataTypes.CATEGORY.value
        return enums.DataTypes.TEXT.value

    return enums.DataTypes.UNKNOWN.value
3137

0 commit comments

Comments
 (0)