@@ -14,18 +14,24 @@ def infer_category(file_name: str) -> str:
def infer_category_enum(df: pd.DataFrame, df_col: str) -> str:
    """Infer the ``enums.DataTypes`` value that best describes one column.

    The decision is driven by the pandas dtype name of ``df[df_col]``:

    * ``int64``   -> INTEGER, unless any value lies outside the 32-bit signed
      range, in which case TEXT is returned so the value is not truncated
      when stored in a database INTEGER column.
    * ``float64`` -> FLOAT
    * ``bool``    -> BOOLEAN
    * ``object``  -> CATEGORY when the number of distinct values is at most
      20% of the non-null row count and every value is shorter than 50
      characters; otherwise TEXT.
    * anything else -> UNKNOWN

    Args:
        df: Data frame containing the column.
        df_col: Name of the column to inspect.

    Returns:
        The ``.value`` of the matching ``enums.DataTypes`` member.
    """
    # Bounds of a 32-bit signed integer (the database INTEGER type).
    int32_min = -2_147_483_648
    int32_max = 2_147_483_647

    column = df[df_col]
    type_name = column.dtype.name

    if type_name == "int64":
        # Check all values instead of a sample: a vectorized comparison on a
        # plain integer column is cheap. Both ends of the range are checked,
        # since large negative values overflow INTEGER just like large
        # positive ones.
        if ((column > int32_max) | (column < int32_min)).any():
            return enums.DataTypes.TEXT.value
        return enums.DataTypes.INTEGER.value
    elif type_name == "float64":
        return enums.DataTypes.FLOAT.value
    elif type_name == "bool":
        return enums.DataTypes.BOOLEAN.value
    elif type_name == "object":
        # Measure the string form of the non-null values. Using the ``.str``
        # accessor directly on the raw column raises AttributeError when the
        # object column holds only non-string values (e.g. Python ints).
        lengths = column.dropna().astype(str).str.len()
        # For an empty / all-null column ``lengths.max()`` is NaN, the
        # comparison is False, and the column falls through to TEXT.
        if (
            column.nunique() <= column.count() * 0.2
            and lengths.max() < 50
        ):
            return enums.DataTypes.CATEGORY.value
        return enums.DataTypes.TEXT.value
    else:
        return enums.DataTypes.UNKNOWN.value
3137
0 commit comments