@@ -1046,72 +1046,15 @@ def _write_uniques(dfs, base_path, col_selector: ColumnSelector, options: FitOptions):
         # Use ignore_index=True to avoid allocating memory for
         # an index we don't even need
         df = df.sort_values(col_selector.names, na_position="first", ignore_index=True)
-        new_cols = {}
-        nulls_missing = False
-        for col in col_selector.names:
-            name_size = col + "_size"
-            null_size = 0
-            # Set null size if first element in `col` is
-            # null, and the `size` aggregation is known
-            if name_size in df and df[col].iloc[:1].isnull().any():
-                null_size = df[name_size].iloc[0]
-            if options.max_size:
-                max_emb_size = options.max_size
-                if isinstance(options.max_size, dict):
-                    max_emb_size = max_emb_size[col]
-                if options.num_buckets:
-                    if isinstance(options.num_buckets, int):
-                        nlargest = max_emb_size - options.num_buckets - 1
-                    else:
-                        nlargest = max_emb_size - options.num_buckets[col] - 1
-                else:
-                    nlargest = max_emb_size - 1
-
-                if nlargest <= 0:
-                    raise ValueError("`nlargest` cannot be 0 or negative")
-
-                if nlargest < len(df) and name_size in df:
-                    # remove NAs from column, we have na count from above.
-                    df = df.dropna()
-                    # sort based on count (name_size column)
-                    df = df.nlargest(n=nlargest, columns=name_size)
-                    new_cols[col] = _concat(
-                        [nullable_series([None], df, df[col].dtype), df[col]],
-                        ignore_index=True,
-                    )
-                    new_cols[name_size] = _concat(
-                        [nullable_series([null_size], df, df[name_size].dtype), df[name_size]],
-                        ignore_index=True,
-                    )
-                    # recreate newly "count" ordered df
-                    df = type(df)(new_cols)
-            if not dispatch.series_has_nulls(df[col]):
-                if name_size in df:
-                    df = df.sort_values(name_size, ascending=False, ignore_index=True)

-                nulls_missing = True
-                new_cols[col] = _concat(
-                    [nullable_series([None], df, df[col].dtype), df[col]],
-                    ignore_index=True,
-                )
-                if name_size in df:
-                    new_cols[name_size] = _concat(
-                        [nullable_series([null_size], df, df[name_size].dtype), df[name_size]],
-                        ignore_index=True,
-                    )
+        name_size_multi = "_".join(col_selector.names + ["size"])
+        if len(col_selector.names) > 1 and name_size_multi in df:
+            # Using "combo" encoding
+            df = _combo_encode(df, name_size_multi, col_selector, options)
+        else:
+            # Using (default) "joint" encoding
+            df = _joint_encode(df, col_selector, options)

-            else:
-                # ensure None aka "unknown" stays at index 0
-                if name_size in df:
-                    df_0 = df.iloc[0:1]
-                    df_1 = df.iloc[1:].sort_values(name_size, ascending=False, ignore_index=True)
-                    df = _concat([df_0, df_1])
-                new_cols[col] = df[col].copy(deep=False)
-
-                if name_size in df:
-                    new_cols[name_size] = df[name_size].copy(deep=False)
-        if nulls_missing:
-            df = type(df)(new_cols)
         df.to_parquet(path, index=False, compression=None)
     else:
         df_null = type(df)({c: [None] for c in col_selector.names})
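Aside: the dispatch added above keys off a naming convention. When the uniques for a multi-column group were counted together, the size column is the column names fused with "size", and the "combo" path applies; otherwise each column carries its own "<col>_size" and the default "joint" path runs. A minimal pandas-only sketch of that check (the toy dataframe and column names are hypothetical, not from the library):

# Hypothetical pandas-only sketch of the combo/joint dispatch above.
import pandas as pd

df = pd.DataFrame(
    {"user": [None, "a", "b"], "item": [None, "x", "y"], "user_item_size": [0, 5, 3]}
)
names = ["user", "item"]
name_size_multi = "_".join(names + ["size"])  # -> "user_item_size"

# A multi-column selector plus the fused size column means the uniques were
# counted per combination, so _write_uniques would take the "combo" path.
print(len(names) > 1 and name_size_multi in df)  # True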
@@ -1123,6 +1066,119 @@ def _write_uniques(dfs, base_path, col_selector: ColumnSelector, options: FitOptions):
     return path


+@annotate("_combo_encode", color="green", domain="nvt_python")
+def _combo_encode(df, name_size_multi: str, col_selector: ColumnSelector, options: FitOptions):
+    # Combo-encoding utility (used by _write_uniques)
+
+    # Account for max_size and num_buckets
+    if options.max_size:
+        max_emb_size = options.max_size
+        if isinstance(options.max_size, dict):
+            raise NotImplementedError(
+                "Cannot specify max_size as a dictionary for 'combo' encoding."
+            )
+        if options.num_buckets:
+            if isinstance(options.num_buckets, dict):
+                raise NotImplementedError(
+                    "Cannot specify num_buckets as a dictionary for 'combo' encoding."
+                )
+            nlargest = max_emb_size - options.num_buckets - 1
+        else:
+            nlargest = max_emb_size - 1
+
+        if nlargest <= 0:
+            raise ValueError("`nlargest` cannot be 0 or negative")
+
+        if nlargest < len(df):
+            # sort based on count (name_size_multi column)
+            df = df.nlargest(n=nlargest, columns=name_size_multi)
+
+    # Deal with nulls
+    has_nans = df[col_selector.names].iloc[0].transpose().isnull().all()
+    if hasattr(has_nans, "iloc"):
+        has_nans = has_nans[0]
+    if not has_nans:
+        null_data = {col: nullable_series([None], df, df[col].dtype) for col in col_selector.names}
+        null_data[name_size_multi] = [0]
+        null_df = type(df)(null_data)
+        df = _concat([null_df, df], ignore_index=True)
+
+    return df
+
+
+@annotate("_joint_encode", color="green", domain="nvt_python")
+def _joint_encode(df, col_selector: ColumnSelector, options: FitOptions):
+    # Joint-encoding utility (used by _write_uniques)
+
+    new_cols = {}
+    nulls_missing = False
+    for col in col_selector.names:
+        name_size = col + "_size"
+        null_size = 0
+        # Set null size if first element in `col` is
+        # null, and the `size` aggregation is known
+        if name_size in df and df[col].iloc[:1].isnull().any():
+            null_size = df[name_size].iloc[0]
+        if options.max_size:
+            max_emb_size = options.max_size
+            if isinstance(options.max_size, dict):
+                max_emb_size = max_emb_size[col]
+            if options.num_buckets:
+                if isinstance(options.num_buckets, int):
+                    nlargest = max_emb_size - options.num_buckets - 1
+                else:
+                    nlargest = max_emb_size - options.num_buckets[col] - 1
+            else:
+                nlargest = max_emb_size - 1
+
+            if nlargest <= 0:
+                raise ValueError("`nlargest` cannot be 0 or negative")
+
+            if nlargest < len(df) and name_size in df:
+                # remove NAs from column, we have na count from above.
+                df = df.dropna()  # TODO: This seems dangerous - Check this
+                # sort based on count (name_size column)
+                df = df.nlargest(n=nlargest, columns=name_size)
+                new_cols[col] = _concat(
+                    [nullable_series([None], df, df[col].dtype), df[col]],
+                    ignore_index=True,
+                )
+                new_cols[name_size] = _concat(
+                    [nullable_series([null_size], df, df[name_size].dtype), df[name_size]],
+                    ignore_index=True,
+                )
+                # recreate newly "count" ordered df
+                df = type(df)(new_cols)
+        if not dispatch.series_has_nulls(df[col]):
+            if name_size in df:
+                df = df.sort_values(name_size, ascending=False, ignore_index=True)
+
+            nulls_missing = True
+            new_cols[col] = _concat(
+                [nullable_series([None], df, df[col].dtype), df[col]],
+                ignore_index=True,
+            )
+            if name_size in df:
+                new_cols[name_size] = _concat(
+                    [nullable_series([null_size], df, df[name_size].dtype), df[name_size]],
+                    ignore_index=True,
+                )
+
+        else:
+            # ensure None aka "unknown" stays at index 0
+            if name_size in df:
+                df_0 = df.iloc[0:1]
+                df_1 = df.iloc[1:].sort_values(name_size, ascending=False, ignore_index=True)
+                df = _concat([df_0, df_1])
+            new_cols[col] = df[col].copy(deep=False)
+
+            if name_size in df:
+                new_cols[name_size] = df[name_size].copy(deep=False)
+    if nulls_missing:
+        return type(df)(new_cols)
+    return df
+
+
 def _finish_labels(paths, cols):
     return {col: paths[i] for i, col in enumerate(cols)}

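Note on the two helpers: both enforce the same layout for the uniques parquet written by _write_uniques, namely that the null / "unknown" entry occupies index 0 with its observed count and the remaining uniques follow in descending count order; when max_size is set, only the top max_size - num_buckets - 1 uniques survive, with one slot reserved for that null entry. A hedged pandas-only sketch of the invariant (toy data; plain pandas stands in for the internal _concat and nullable_series helpers):

# Pandas-only sketch of the ordering invariant both encoders enforce.
import pandas as pd

uniques = pd.DataFrame({"col": [None, "b", "a"], "col_size": [2, 3, 5]})

# Nulls sort first upstream, so a leading null row carries the null count.
null_size = uniques["col_size"].iloc[0] if uniques["col"].iloc[0] is None else 0

observed = uniques.dropna().sort_values("col_size", ascending=False)
encoded = pd.concat(
    [pd.DataFrame({"col": [None], "col_size": [null_size]}), observed],
    ignore_index=True,
)
print(encoded)
#     col  col_size
# 0  None         2
# 1     a         5
# 2     b         3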