Commit b2144b4

rjzamora, rnyak, and karlhigley authored

Fix Categorify bug for combo encoding with null values (#1652)

* handle combo categorify with nulls
* formatting
* include test coverage
* split encoding logic into distinct functions to improve readability

Co-authored-by: rnyak <[email protected]>
Co-authored-by: Karl Higley <[email protected]>
1 parent b3e683f commit b2144b4
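For context, here is a minimal sketch of the scenario this commit fixes: combo-encoding a multi-column group whose data contains nulls, via the public Categorify API. The input data mirrors the new test case below; exact output ids depend on the NVTabular version.

import numpy as np
import pandas as pd
import nvtabular as nvt

# Data mirroring the new test case: the "Author" column contains a null
df = pd.DataFrame(
    {
        "Author": [np.nan, "User_E", "User_B", "User_A"],
        "Engaging User": ["User_C", "User_B", "User_A", "User_D"],
        "Post": [1, 2, 3, 4],
    }
)

# encode_type="combo" encodes each unique (Author, Engaging User) pair
# as a single categorical id; this path previously failed on null values
cats = [["Author", "Engaging User"]] >> nvt.ops.Categorify(encode_type="combo")

workflow = nvt.Workflow(cats)
out = workflow.fit_transform(nvt.Dataset(df)).to_ddf().compute()
print(out)  # one combined "Author_Engaging User" column of pair ids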

File tree

2 files changed: +131 -64 lines changed

nvtabular/ops/categorify.py

Lines changed: 120 additions & 64 deletions
@@ -1046,72 +1046,15 @@ def _write_uniques(dfs, base_path, col_selector: ColumnSelector, options: FitOptions):
         # Use ignore_index=True to avoid allocating memory for
         # an index we don't even need
         df = df.sort_values(col_selector.names, na_position="first", ignore_index=True)
-        new_cols = {}
-        nulls_missing = False
-        for col in col_selector.names:
-            name_size = col + "_size"
-            null_size = 0
-            # Set null size if first element in `col` is
-            # null, and the `size` aggregation is known
-            if name_size in df and df[col].iloc[:1].isnull().any():
-                null_size = df[name_size].iloc[0]
-            if options.max_size:
-                max_emb_size = options.max_size
-                if isinstance(options.max_size, dict):
-                    max_emb_size = max_emb_size[col]
-                if options.num_buckets:
-                    if isinstance(options.num_buckets, int):
-                        nlargest = max_emb_size - options.num_buckets - 1
-                    else:
-                        nlargest = max_emb_size - options.num_buckets[col] - 1
-                else:
-                    nlargest = max_emb_size - 1
-
-                if nlargest <= 0:
-                    raise ValueError("`nlargest` cannot be 0 or negative")
-
-                if nlargest < len(df) and name_size in df:
-                    # remove NAs from column, we have na count from above.
-                    df = df.dropna()
-                    # sort based on count (name_size column)
-                    df = df.nlargest(n=nlargest, columns=name_size)
-                    new_cols[col] = _concat(
-                        [nullable_series([None], df, df[col].dtype), df[col]],
-                        ignore_index=True,
-                    )
-                    new_cols[name_size] = _concat(
-                        [nullable_series([null_size], df, df[name_size].dtype), df[name_size]],
-                        ignore_index=True,
-                    )
-                    # recreate newly "count" ordered df
-                    df = type(df)(new_cols)
-            if not dispatch.series_has_nulls(df[col]):
-                if name_size in df:
-                    df = df.sort_values(name_size, ascending=False, ignore_index=True)
-
-                nulls_missing = True
-                new_cols[col] = _concat(
-                    [nullable_series([None], df, df[col].dtype), df[col]],
-                    ignore_index=True,
-                )
-                if name_size in df:
-                    new_cols[name_size] = _concat(
-                        [nullable_series([null_size], df, df[name_size].dtype), df[name_size]],
-                        ignore_index=True,
-                    )
-
-            else:
-                # ensure None aka "unknown" stays at index 0
-                if name_size in df:
-                    df_0 = df.iloc[0:1]
-                    df_1 = df.iloc[1:].sort_values(name_size, ascending=False, ignore_index=True)
-                    df = _concat([df_0, df_1])
-                new_cols[col] = df[col].copy(deep=False)
-
-                if name_size in df:
-                    new_cols[name_size] = df[name_size].copy(deep=False)
-        if nulls_missing:
-            df = type(df)(new_cols)
+        name_size_multi = "_".join(col_selector.names + ["size"])
+        if len(col_selector.names) > 1 and name_size_multi in df:
+            # Using "combo" encoding
+            df = _combo_encode(df, name_size_multi, col_selector, options)
+        else:
+            # Using (default) "joint" encoding
+            df = _joint_encode(df, col_selector, options)
 
         df.to_parquet(path, index=False, compression=None)
     else:
         df_null = type(df)({c: [None] for c in col_selector.names})
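The dispatch above keys on a combined size column. A hypothetical illustration of how that key is built (name_size_multi is the real variable; the literal column names are just examples):

# Illustration only: how the combo dispatch key is derived
names = ["Author", "Engaging User"]           # col_selector.names
name_size_multi = "_".join(names + ["size"])  # "Author_Engaging User_size"
# combo path is taken only when len(names) > 1 and this column exists in df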
@@ -1123,6 +1066,119 @@ def _write_uniques(dfs, base_path, col_selector: ColumnSelector, options: FitOptions):
     return path
 
 
+@annotate("_combo_encode", color="green", domain="nvt_python")
+def _combo_encode(df, name_size_multi: str, col_selector: ColumnSelector, options: FitOptions):
+    # Combo-encoding utility (used by _write_uniques)
+
+    # Account for max_size and num_buckets
+    if options.max_size:
+        max_emb_size = options.max_size
+        if isinstance(options.max_size, dict):
+            raise NotImplementedError(
+                "Cannot specify max_size as a dictionary for 'combo' encoding."
+            )
+        if options.num_buckets:
+            if isinstance(options.num_buckets, dict):
+                raise NotImplementedError(
+                    "Cannot specify num_buckets as a dictionary for 'combo' encoding."
+                )
+            nlargest = max_emb_size - options.num_buckets - 1
+        else:
+            nlargest = max_emb_size - 1
+
+        if nlargest <= 0:
+            raise ValueError("`nlargest` cannot be 0 or negative")
+
+        if nlargest < len(df):
+            # sort based on count (name_size_multi column)
+            df = df.nlargest(n=nlargest, columns=name_size_multi)
+
+    # Deal with nulls
+    has_nans = df[col_selector.names].iloc[0].transpose().isnull().all()
+    if hasattr(has_nans, "iloc"):
+        has_nans = has_nans[0]
+    if not has_nans:
+        null_data = {col: nullable_series([None], df, df[col].dtype) for col in col_selector.names}
+        null_data[name_size_multi] = [0]
+        null_df = type(df)(null_data)
+        df = _concat([null_df, df], ignore_index=True)
+
+    return df
+
+
+@annotate("_joint_encode", color="green", domain="nvt_python")
+def _joint_encode(df, col_selector: ColumnSelector, options: FitOptions):
+    # Joint-encoding utility (used by _write_uniques)
+
+    new_cols = {}
+    nulls_missing = False
+    for col in col_selector.names:
+        name_size = col + "_size"
+        null_size = 0
+        # Set null size if first element in `col` is
+        # null, and the `size` aggregation is known
+        if name_size in df and df[col].iloc[:1].isnull().any():
+            null_size = df[name_size].iloc[0]
+        if options.max_size:
+            max_emb_size = options.max_size
+            if isinstance(options.max_size, dict):
+                max_emb_size = max_emb_size[col]
+            if options.num_buckets:
+                if isinstance(options.num_buckets, int):
+                    nlargest = max_emb_size - options.num_buckets - 1
+                else:
+                    nlargest = max_emb_size - options.num_buckets[col] - 1
+            else:
+                nlargest = max_emb_size - 1
+
+            if nlargest <= 0:
+                raise ValueError("`nlargest` cannot be 0 or negative")
+
+            if nlargest < len(df) and name_size in df:
+                # remove NAs from column, we have na count from above.
+                df = df.dropna()  # TODO: This seems dangerous - Check this
+                # sort based on count (name_size column)
+                df = df.nlargest(n=nlargest, columns=name_size)
+                new_cols[col] = _concat(
+                    [nullable_series([None], df, df[col].dtype), df[col]],
+                    ignore_index=True,
+                )
+                new_cols[name_size] = _concat(
+                    [nullable_series([null_size], df, df[name_size].dtype), df[name_size]],
+                    ignore_index=True,
+                )
+                # recreate newly "count" ordered df
+                df = type(df)(new_cols)
+        if not dispatch.series_has_nulls(df[col]):
+            if name_size in df:
+                df = df.sort_values(name_size, ascending=False, ignore_index=True)
+
+            nulls_missing = True
+            new_cols[col] = _concat(
+                [nullable_series([None], df, df[col].dtype), df[col]],
+                ignore_index=True,
+            )
+            if name_size in df:
+                new_cols[name_size] = _concat(
+                    [nullable_series([null_size], df, df[name_size].dtype), df[name_size]],
+                    ignore_index=True,
+                )
+
+        else:
+            # ensure None aka "unknown" stays at index 0
+            if name_size in df:
+                df_0 = df.iloc[0:1]
+                df_1 = df.iloc[1:].sort_values(name_size, ascending=False, ignore_index=True)
+                df = _concat([df_0, df_1])
+            new_cols[col] = df[col].copy(deep=False)
+
+            if name_size in df:
+                new_cols[name_size] = df[name_size].copy(deep=False)
+    if nulls_missing:
+        return type(df)(new_cols)
+    return df
+
+
 def _finish_labels(paths, cols):
     return {col: paths[i] for i, col in enumerate(cols)}
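To see what the null-handling branch of _combo_encode does in isolation, here is a minimal pandas-only sketch, assuming a pandas DataFrame (the real helper also supports cuDF via nullable_series and _concat, and the column names are taken from the test data):

import pandas as pd

names = ["Author", "Engaging User"]
size_col = "Author_Engaging User_size"

# Uniques table sorted nulls-first, as _write_uniques produces it
df = pd.DataFrame(
    {
        "Author": [None, "User_A", "User_B", "User_E"],
        "Engaging User": ["User_C", "User_D", "User_A", "User_B"],
        size_col: [1, 1, 1, 1],
    }
)

# The "unknown" row is only prepended when row 0 is not already all-null;
# a partial null (like row 0 here) does not count as the reserved null pair
has_nans = df[names].iloc[0].isnull().all()
if not has_nans:
    null_df = pd.DataFrame({**{c: [None] for c in names}, size_col: [0]})
    df = pd.concat([null_df, df], ignore_index=True)

print(df)  # index 0 now holds the all-null pair with size 0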

tests/unit/ops/test_categorify.py

Lines changed: 11 additions & 0 deletions
@@ -329,6 +329,17 @@ def test_categorify_multi(tmpdir, cat_names, kind, cpu):
             "expected_e": [3, 2, 1, 4],
             "expected_ae": [3, 4, 2, 1],
         },
+        # Include null value
+        {
+            "df_data": {
+                "Author": [np.nan, "User_E", "User_B", "User_A"],
+                "Engaging User": ["User_C", "User_B", "User_A", "User_D"],
+                "Post": [1, 2, 3, 4],
+            },
+            "expected_a": [0, 3, 2, 1],
+            "expected_e": [3, 2, 1, 4],
+            "expected_ae": [1, 4, 3, 2],
+        },
     ],
 )
 def test_categorify_multi_combo(tmpdir, input_with_output, cat_names, cpu):
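A rough reading of the new expected ids, assuming no max_size is set so _combo_encode only prepends the null row: the uniques are sorted nulls-first, an all-null "unknown" pair is reserved at id 0, and the four observed (Author, Engaging User) pairs become (null, User_C) -> 1, (User_A, User_D) -> 2, (User_B, User_A) -> 3, (User_E, User_B) -> 4, which yields expected_ae = [1, 4, 3, 2] for the four input rows.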
