Skip to content

[BUG] Getting error from model.item_embeddings(...) #231

@rnyak

Description

@rnyak

Bug description

I am getting the following error when I try to generate item embeddings with `model.item_embeddings(...)`.

schema = TensorflowMetadata.from_proto_text_file('./train/').to_merlin_schema()
schema = schema.select_by_name(['user_id', 'user_gender', 'user_age', 'user_geography', 'user_profile', 'user_group','item_id', 'item_category', 'item_brand', 'item_shop'])

model = mm.TwoTowerModel(schema, query_tower= mm.MLPBlock([32, 64]))
model.compile(optimizer="adam", run_eagerly=False)
train_loader = get_dataloader(nvt.Dataset(train_path), shuffle=True)
losses = model.fit(train_loader, epochs=1)

item_features = cudf.read_parquet('item_features.parquet')
model.item_embeddings(nvt.Dataset(item_features, schema=schema), batch_size=4096)
WARNING:absl:Found untraced functions such as block_context_layer_call_fn, block_context_layer_call_and_return_conditional_losses, block_context_layer_call_fn, block_context_layer_call_and_return_conditional_losses, block_context_layer_call_and_return_conditional_losses while saving (showing 5 of 55). These functions will not be directly callable after loading.
INFO:tensorflow:Assets written to: /tmp/tmppt5w0fbu/assets
INFO:tensorflow:Assets written to: /tmp/tmppt5w0fbu/assets
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
Input In [15], in <cell line: 1>()
----> 1 model.item_embeddings(nvt.Dataset(item_features, schema=schema), batch_size=4096)

File /models/merlin/models/tf/core.py:2659, in RetrievalModel.item_embeddings(self, dataset, item_tag, item_id_tag, batch_size)
   2640 """Export item embeddings from the model.
   2641 
   2642 Parameters
   (...)
   2655 merlin.io.Dataset
   2656 """
   2657 from merlin.models.tf.utils.batch_utils import ItemEmbeddings
-> 2659 get_item_emb = ItemEmbeddings(self, batch_size=batch_size)
   2661 dataset = self._ensure_unique(dataset, item_tag, item_id_tag)
   2662 embeddings = dataset.map_partitions(get_item_emb)

File /models/merlin/models/tf/utils/batch_utils.py:114, in ItemEmbeddings.__init__(self, model, batch_size, save_path)
    111 item_block = model.block.first.item_block()
    112 schema = item_block.schema
--> 114 super().__init__(
    115     item_block,
    116     save_path=save_path,
    117     batch_size=batch_size,
    118     schema=schema,
    119     output_concat_func=np.concatenate,
    120 )

File /models/merlin/models/tf/utils/batch_utils.py:77, in TFModelEncode.__init__(self, model, output_names, batch_size, save_path, block_load_func, schema, output_concat_func)
     66 def __init__(
     67     self,
     68     model: tp.Union[Model, tf.keras.Model],
   (...)
     74     output_concat_func=None,
     75 ):
     76     save_path = save_path or tempfile.mkdtemp()
---> 77     model.save(save_path)
     79     model_load_func = block_load_func if block_load_func else tf.keras.models.load_model
     80     if not output_names:

File /usr/local/lib/python3.8/dist-packages/keras/utils/traceback_utils.py:67, in filter_traceback.<locals>.error_handler(*args, **kwargs)
     65 except Exception as e:  # pylint: disable=broad-except
     66   filtered_tb = _process_traceback_frames(e.__traceback__)
---> 67   raise e.with_traceback(filtered_tb) from None
     68 finally:
     69   del filtered_tb

File /models/merlin/models/tf/core.py:2274, in ModelBlock.get_config(self)
   2273 def get_config(self):
-> 2274     return {"block": tf.keras.utils.serialize_keras_object(self.block)}

File /models/merlin/models/tf/core.py:789, in SequentialBlock.get_config(self)
    787 config = {}
    788 for i, layer in enumerate(self.layers):
--> 789     config[i] = tf.keras.utils.serialize_keras_object(layer)
    791 return config

File /models/merlin/models/tf/core.py:1553, in ParallelBlock.get_config(self)
   1551 def get_config(self):
   1552     return maybe_serialize_keras_objects(
-> 1553         self, super(ParallelBlock, self).get_config(), ["parallel_layers"]
   1554     )

File /models/merlin/models/tf/core.py:1174, in TabularBlock.get_config(self)
   1171 config = maybe_serialize_keras_objects(self, config, ["pre", "post", "aggregation"])
   1173 if self.schema:
-> 1174     config["schema"] = schema_to_tensorflow_metadata_json(self.schema)
   1176 return config

File /models/merlin/models/utils/schema.py:38, in schema_to_tensorflow_metadata_json(schema, path)
     37 def schema_to_tensorflow_metadata_json(schema, path=None):
---> 38     json = TensorflowMetadata.from_merlin_schema(schema).to_json()
     39     if path:
     40         with open(path, "w") as o:

File /core/merlin/schema/io/tensorflow_metadata.py:97, in TensorflowMetadata.to_json(self)
     96 def to_json(self):
---> 97     return self.proto_schema.to_json()

File /usr/local/lib/python3.8/dist-packages/betterproto/__init__.py:909, in Message.to_json(self, indent)
    907 def to_json(self, indent: Union[None, int, str] = None) -> str:
    908     """Returns the encoded JSON representation of this message instance."""
--> 909     return json.dumps(self.to_dict(), indent=indent)

File /usr/lib/python3.8/json/__init__.py:231, in dumps(obj, skipkeys, ensure_ascii, check_circular, allow_nan, cls, indent, separators, default, sort_keys, **kw)
    226 # cached encoder
    227 if (not skipkeys and ensure_ascii and
    228     check_circular and allow_nan and
    229     cls is None and indent is None and separators is None and
    230     default is None and not sort_keys and not kw):
--> 231     return _default_encoder.encode(obj)
    232 if cls is None:
    233     cls = JSONEncoder

File /usr/lib/python3.8/json/encoder.py:199, in JSONEncoder.encode(self, o)
    195         return encode_basestring(o)
    196 # This doesn't pass the iterator directly to ''.join() because the
    197 # exceptions aren't as detailed.  The list call should be roughly
    198 # equivalent to the PySequence_Fast that ''.join() would do.
--> 199 chunks = self.iterencode(o, _one_shot=True)
    200 if not isinstance(chunks, (list, tuple)):
    201     chunks = list(chunks)

File /usr/lib/python3.8/json/encoder.py:257, in JSONEncoder.iterencode(self, o, _one_shot)
    252 else:
    253     _iterencode = _make_iterencode(
    254         markers, self.default, _encoder, self.indent, floatstr,
    255         self.key_separator, self.item_separator, self.sort_keys,
    256         self.skipkeys, _one_shot)
--> 257 return _iterencode(o, 0)

File /usr/lib/python3.8/json/encoder.py:179, in JSONEncoder.default(self, o)
    160 def default(self, o):
    161     """Implement this method in a subclass such that it returns
    162     a serializable object for ``o``, or calls the base implementation
    163     (to raise a ``TypeError``).
   (...)
    177 
    178     """
--> 179     raise TypeError(f'Object of type {o.__class__.__name__} '
    180                     f'is not JSON serializable')

TypeError: Object of type Struct is not JSON serializable

Steps/Code to reproduce bug

Expected behavior

Environment details

  • Merlin version: merlin-tensorflow-training:22.03 container
  • Platform: Docker image
  • Python version:
  • PyTorch version (GPU?):
  • Tensorflow version (GPU?): Tensorflow

Metadata

Metadata

Labels

bug — Something isn't working

Type

No type

Projects

No projects

Milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions