Bug description
The default embedding dimensions that NVTabular computes for categorical features often cause embedding collapse; I detailed this issue here. Modifying the embedding dimensions directly in the schema does not change the architecture of a model subsequently built from that schema. Moreover, the model's schema reflects the modified embedding dimensions, even though the model's parameter count does not change.
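For context, this is the failing pattern: the dimension is edited in the schema's `embedding_sizes` property, but a model built afterwards ignores it. A minimal, self-contained sketch (the column name, cardinality, and dimension values here are hypothetical; the full reproduction follows below):

```python
import numpy as np
from merlin.schema.schema import ColumnSchema, Schema
from merlin.schema.tags import Tags

# Hypothetical single-column schema carrying NVTabular-style 'embedding_sizes' properties.
column = ColumnSchema(
    'item_id',
    tags=[Tags.CATEGORICAL, Tags.ITEM, Tags.ITEM_ID],
    properties={'embedding_sizes': {'cardinality': 20_000, 'dimension': 512}},
    dtype=np.int32,
)
schema = Schema([column])

# Editing the dimension in place updates the schema object...
schema['item_id'].properties['embedding_sizes']['dimension'] = 16
print(schema['item_id'].properties['embedding_sizes'])

# ...but a model built from this schema afterwards still allocates the original
# 512-dimensional table, while model.schema reports dimension 16 (see the script below).
```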
Steps/Code to reproduce bug
```python
import nvtabular as nvt
from nvtabular import ops
from nvtabular.loader.tensorflow import KerasSequenceLoader
from merlin.core.dispatch import get_lib
from merlin.schema.tags import Tags
from merlin.schema.schema import Schema
import merlin.models.tf as mm
from merlin.dataloader.ops.embeddings import EmbeddingOperator
from merlin.models.utils.dataset import unique_rows_by_features
import tensorflow as tf
import numpy as np
import pandas as pd


def generate_categorical_data(cardinalities, num_rows=1_000_000, seed=2025):
    np.random.seed(seed)
    data = {}
    for i, cardinality in enumerate(cardinalities, 1):
        feature_name = f'feature_{i}'
        data[feature_name] = np.random.randint(0, cardinality, size=num_rows)
    return pd.DataFrame(data)

def map_cardinality_to_embedding_dimension(cardinality: int) -> int:
    """Map a feature's cardinality to a smaller embedding dimension."""
    if cardinality < 32:
        emb_dim = 2
    elif cardinality < 128:
        emb_dim = 4
    elif cardinality < 1024:
        emb_dim = 8
    elif cardinality < 32768:
        emb_dim = 16
    else:
        emb_dim = 32
    return emb_dim

def print_schema(schema: Schema):
    for col in schema.column_names:
        card = schema[col].properties['embedding_sizes']['cardinality']
        dim = schema[col].properties['embedding_sizes']['dimension']
        print(f'{col} cardinality: {card} dimension: {dim}')

def build_model(schema: Schema, set_embedding_dims: bool = False) -> mm.TwoTowerModelV2:
    dimensions = [2048, 256, 16]
    activation = 'relu'
    normalization = 'batch_norm'
    dropout = 0.5
    logits_temperature = 1.0
    user_emb_dims, item_emb_dims = {}, {}
    user_schema = schema.select_by_tag(Tags.USER)
    item_schema = schema.select_by_tag(Tags.ITEM)
    if set_embedding_dims:
        for col in user_schema.column_names:
            user_emb_dims[col] = schema[col].properties['embedding_sizes']['dimension']
        for col in item_schema.column_names:
            item_emb_dims[col] = schema[col].properties['embedding_sizes']['dimension']
    user_embeddings_block = mm.Embeddings(
        user_schema.select_by_tag(Tags.CATEGORICAL),
        infer_embedding_sizes=False,
        dim=user_emb_dims
    )
    user_inputs = mm.InputBlockV2(
        user_schema, categorical=user_embeddings_block
    )
    query_tower = mm.Encoder(
        user_inputs,
        mm.MLPBlock(
            dimensions=dimensions,
            no_activation_last_layer=True,
            activation=activation,
            dropout=dropout,
            normalization=normalization
        )
    )
    item_embeddings_block = mm.Embeddings(
        item_schema.select_by_tag(Tags.CATEGORICAL),
        infer_embedding_sizes=False,
        dim=item_emb_dims
    )
    item_inputs = mm.InputBlockV2(
        item_schema, categorical=item_embeddings_block
    )
    candidate_tower = mm.Encoder(
        item_inputs,
        mm.MLPBlock(
            dimensions=dimensions,
            no_activation_last_layer=True,
            activation=activation,
            dropout=dropout,
            normalization=normalization
        )
    )
    model = mm.TwoTowerModelV2(
        query_tower,
        candidate_tower,
        logits_temperature=logits_temperature,
        negative_samplers=[mm.InBatchSamplerV2()]
    )
    model.compile()
    return model

# Generate random data
col_user_id = 'user_id'
col_item_id = 'item_id'
cardinalities = [3, 5, 12, 20, 29, 50, 80, 230, 760, 1100, 4679, 8900]
df = generate_categorical_data(cardinalities)
df[col_user_id] = np.random.randint(0, 10_000, len(df))
df[col_item_id] = np.random.randint(0, 20_000, len(df))

# Create workflow, dataset, data loader
cat_columns = [col for col in df.columns if 'feature_' in col]
user_id = nvt.ColumnSelector(col_user_id) >> ops.Categorify(freq_threshold=1, dtype='int32') >> ops.AddTags(tags=[Tags.USER_ID, Tags.USER])
item_id = nvt.ColumnSelector(col_item_id) >> ops.Categorify(freq_threshold=1, dtype='int32') >> ops.AddTags(tags=[Tags.ITEM_ID, Tags.ITEM])
user_cat_features = nvt.ColumnSelector(cat_columns) >> ops.Categorify(freq_threshold=1, dtype='int32') >> ops.AddTags(tags=[Tags.USER])
features = user_id + user_cat_features + item_id
workflow = nvt.Workflow(features)
ds = nvt.Dataset(df)
ds_tr = workflow.fit_transform(ds)
ds_loader = KerasSequenceLoader(ds_tr, batch_size=32)

# Print cardinalities and dimensions
schema = ds_tr.schema
print_schema(schema)
model = build_model(schema, set_embedding_dims=False)
_ = model(next(iter(ds_loader))[0])
print(f'{model.count_params():,}')

# Modify embedding dimensions in the schema
for col in schema.column_names:
    card = schema[col].properties['embedding_sizes']['cardinality']
    dim = map_cardinality_to_embedding_dimension(card)
    schema[col].properties['embedding_sizes']['dimension'] = dim
print_schema(schema)

# Results in the same number of parameters, even though the schema is different...
model = build_model(schema, set_embedding_dims=False)
_ = model(next(iter(ds_loader))[0])
print(f'{model.count_params():,}')

# Verify that the model's schema also reflects the changes...
print_schema(model.schema)

# Set the embedding dimensions directly in the Embeddings block
model = build_model(schema, set_embedding_dims=True)
_ = model(next(iter(ds_loader))[0])
print(f'{model.count_params():,}')
```
Expected behavior
- Building the first model with the default dimensions produces a model with 2,552,849 parameters.
- Building the second model from the modified schema, with the modified embedding dimensions, produces the same parameter count of 2,552,849.
- Moreover, the model's schema reflects the changes from the schema with the modified embedding dimensions.
- Building the third model with the embedding dimensions set directly in the Embeddings block produces a different model with 2,041,385 parameters.
Consequence: modifying the embedding dimensions in the schema has no effect on the subsequent model, and the model's schema reports the wrong embedding dimensions.
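As the third build shows, the reduced dimensions only take effect when they are handed to the Embeddings block explicitly. A minimal sketch of that workaround, assuming `schema` is the (edited) schema produced by the reproduction script above:

```python
import merlin.models.tf as mm
from merlin.schema.tags import Tags

# Build a per-column dimension mapping from the edited schema and pass it to
# mm.Embeddings via `dim`; editing the schema alone is not picked up.
cat_schema = schema.select_by_tag(Tags.CATEGORICAL)
embedding_dims = {
    col: schema[col].properties['embedding_sizes']['dimension']
    for col in cat_schema.column_names
}
embeddings = mm.Embeddings(cat_schema, dim=embedding_dims)
inputs = mm.InputBlockV2(cat_schema, categorical=embeddings)
```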
Environment details
https://catalog.ngc.nvidia.com/orgs/nvidia/teams/merlin/containers/merlin-tensorflow-training