
Commit 53e3533

Consolidate Embedding Operator and add offsets to embeddings (#138)
* consolidate embedding operator
* add offsets to embeddings
* fix schema shape calculation and clean up tests
1 parent ef229a4 commit 53e3533
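
For orientation, a minimal sketch of the consolidated API this change produces, using only the calls visible in the diffs below; the table shape, the random data, and the "embeddings.npy" path are illustrative, not from this commit:

    import numpy as np

    from merlin.dataloader.ops.embeddings import EmbeddingOperator

    # In-memory table (previously NumpyEmbeddingOperator): one row of
    # embedding values per id, looked up via the "id" column by default.
    embeddings = np.random.rand(1000, 16).astype(np.float32)
    in_memory_op = EmbeddingOperator(embeddings, lookup_key="id", embedding_name="embeddings")

    # Memory-mapped table (previously MmapNumpyEmbedding): pass a .npy path
    # with mmap=True so tables larger than host memory stay on disk.
    mmap_op = EmbeddingOperator("embeddings.npy", mmap=True)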

File tree

2 files changed: +37 -209 lines changed

merlin/dataloader/ops/embeddings.py
tests/unit/dataloader/test_embeddings.py

merlin/dataloader/ops/embeddings.py

Lines changed: 11 additions & 130 deletions
@@ -13,7 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-
+from typing import Optional, Union

 import numpy as np

@@ -43,92 +43,15 @@ class EmbeddingOperator(BaseOperator):

     def __init__(
         self,
-        embeddings: np.ndarray,
+        embeddings: Union[np.ndarray, str],
         lookup_key: str = "id",
         embedding_name: str = "embeddings",
-        id_lookup_table=None,
-    ):
-        self.embeddings = embeddings
-        self.lookup_key = lookup_key
-        self.embedding_name = embedding_name
-        self.id_lookup_table = id_lookup_table
-
-    def transform(
-        self, col_selector: ColumnSelector, transformable: Transformable
-    ) -> Transformable:
-        keys = transformable[self.lookup_key]
-        indices = keys.cpu().values
-        if self.id_lookup_table is not None:
-            indices = np.nonzero(np.in1d(self.id_lookup_table, indices))
-        embeddings = self.embeddings[indices]
-        embeddings_col = TensorColumn(embeddings)
-        transformable[self.embedding_name] = (
-            embeddings_col.gpu() if keys.device == Device.GPU else embeddings_col
-        )
-        return transformable
-
-    def compute_output_schema(
-        self,
-        input_schema: Schema,
-        col_selector: ColumnSelector,
-        prev_output_schema: Schema = None,
-    ) -> Schema:
-        """Creates the output schema for this operator.
-
-        Parameters
-        ----------
-        input_schema : Schema
-            schema coming from ancestor nodes
-        col_selector : ColumnSelector
-            subselection of columns to apply to this operator
-        prev_output_schema : Schema, optional
-            the output schema of the previously executed operators, by default None
-
-        Returns
-        -------
-        Schema
-            Schema representing the correct output for this operator.
-        """
-        col_schemas = []
-        for _, col_schema in input_schema.column_schemas.items():
-            col_schemas.append(col_schema)
-        col_schemas.append(
-            ColumnSchema(
-                name=self.embedding_name,
-                tags=[Tags.CONTINUOUS, Tags.EMBEDDING],
-                dtype=self.embeddings.dtype,
-                is_list=True,
-                is_ragged=False,
-            )
-        )
-
-        return Schema(col_schemas)
-
-
-class NumpyEmbeddingOperator(BaseOperator):
-    """Create an embedding table from supplied embeddings to add embedding entry
-    to records based on supplied indices. Support for indices lookup table is available.
-    Embedding table is stored in host memory.
-
-    Parameters
-    ----------
-    embeddings : np.ndarray
-        numpy ndarray representing embedding values
-    lookup_key : str, optional
-        the name of the column that will be used as indices, by default "id"
-    embedding_name : str, optional
-        name of new column of embeddings, added to output, by default "embeddings"
-    id_lookup_table : np.array, optional
-        numpy array of values that represent embedding indices, by default None
-    """
-
-    def __init__(
-        self,
-        embeddings: np.ndarray,
-        lookup_key: str = "id",
-        embedding_name: str = "embeddings",
-        id_lookup_table=None,
+        id_lookup_table: Optional[Union[np.ndarray, str]] = None,
+        mmap=False,
     ):
+        if mmap:
+            embeddings = np.load(embeddings, mmap_mode="r")
+            id_lookup_table = np.load(id_lookup_table) if id_lookup_table else None
         self.embeddings = embeddings
         self.lookup_key = lookup_key
         self.embedding_name = embedding_name
@@ -142,16 +65,12 @@ def transform(
         if self.id_lookup_table is not None:
             indices = np.in1d(self.id_lookup_table, indices)
         embeddings = self.embeddings[indices]
-        # numpy_to_tensor
-        embeddings_col = TensorColumn(embeddings)
+        embeddings_col = TensorColumn(embeddings, offsets=keys.cpu().offsets)
         transformable[self.embedding_name] = (
             embeddings_col.gpu() if keys.device == Device.GPU else embeddings_col
         )
         return transformable

-    def _format_embeddings(self, embeddings, keys):
-        raise NotImplementedError("No logic to format embeddings.")
-
     def compute_output_schema(
         self,
         input_schema: Schema,
@@ -177,53 +96,15 @@ def compute_output_schema(
         col_schemas = []
         for _, col_schema in input_schema.column_schemas.items():
             col_schemas.append(col_schema)
+        id_schema = input_schema.column_schemas[self.lookup_key]
         embedding_dim = self.embeddings.shape[1]
         col_schemas.append(
             ColumnSchema(
                 name=self.embedding_name,
-                tags=[Tags.CONTINUOUS, Tags.EMBEDDING],
+                tags=[Tags.EMBEDDING],
                 dtype=self.embeddings.dtype,
-                is_list=True,
-                is_ragged=False,
-                properties={"value_count": {"min": embedding_dim, "max": embedding_dim}},
+                dims=id_schema.shape.as_tuple + (embedding_dim,),
             )
         )

         return Schema(col_schemas)
-
-
-class MmapNumpyEmbedding(NumpyEmbeddingOperator):
-    """Operator loads numpy embedding table from file using memory map to be used to create
-    torch embedding representations. This allows for larger than host memory embedding
-    tables to be used for embedding lookups. The only limit to the size is what fits in
-    storage, preferred storage device is SSD for faster lookups.
-
-    Parameters
-    ----------
-    embedding_npz : numpy ndarray file
-        file holding numpy ndarray representing embedding table
-    ids_lookup_npz : numpy array file, optional
-        file holding numpy array of values that represent embedding indices, by default None
-    lookup_key : str, optional
-        the name of the column that will be used as indices, by default "id"
-    embedding_name : str, optional
-        name of new column of embeddings, added to output, by default "embeddings"
-    transform_function : _type_, optional
-        function that will transform embedding from numpy to torch, by default None
-    """
-
-    def __init__(
-        self,
-        embedding_npz,
-        ids_lookup_npz=None,
-        lookup_key="id",
-        embedding_name="embeddings",
-    ):
-        embeddings = np.load(embedding_npz, mmap_mode="r")
-        id_lookup = np.load(ids_lookup_npz) if ids_lookup_npz else None
-        super().__init__(
-            embeddings,
-            lookup_key=lookup_key,
-            embedding_name=embedding_name,
-            id_lookup_table=id_lookup,
-        )
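
As a plain-numpy sketch of the lookup path kept in transform above: when an id_lookup_table is given, np.in1d builds a boolean mask over the table's ids and the matching embedding rows are gathered in table order. The values here are made up:

    import numpy as np

    # Hypothetical 5-row table whose ids are stored in reverse order,
    # mirroring the rev_embedding_ids fixture used by the tests.
    embeddings = np.arange(15, dtype=np.float32).reshape(5, 3)
    id_lookup_table = np.array([50, 40, 30, 20, 10])

    batch_ids = np.array([40, 20])
    mask = np.in1d(id_lookup_table, batch_ids)  # [False, True, False, True, False]
    print(embeddings[mask])                     # rows 1 and 3 of the table

The reworked compute_output_schema follows the same consolidation: instead of a fixed value_count property, the embedding column's dims are now derived from the lookup column's shape plus embedding_dim, which is what "fix schema shape calculation" in the commit message refers to.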

tests/unit/dataloader/test_embeddings.py

Lines changed: 26 additions & 79 deletions
@@ -20,13 +20,10 @@

 from merlin.core.dispatch import HAS_GPU
 from merlin.dataloader.loader_base import LoaderBase as Loader  # noqa
-from merlin.dataloader.ops.embeddings import (  # noqa
-    EmbeddingOperator,
-    MmapNumpyEmbedding,
-    NumpyEmbeddingOperator,
-)
+from merlin.dataloader.ops.embeddings import EmbeddingOperator
 from merlin.io import Dataset
 from merlin.schema import Tags
+from merlin.table import TensorColumn, TensorTable


 @pytest.mark.parametrize("cpu", [None, "cpu"] if HAS_GPU else ["cpu"])
@@ -40,17 +37,13 @@ def test_embedding_np_mmap_dl_no_lookup(tmpdir, embedding_ids, np_embeddings_fro
     dataset = Dataset(str(pq_path))
     dataset = dataset.repartition(10)
     schema = dataset.schema
-    for col_name in cat_names:
-        schema[col_name] = schema[col_name].with_tags([Tags.CATEGORICAL, Tags.EMBEDDING])
-    dataset.schema = schema
-
     for col_name in cat_names:
         schema[col_name] = schema[col_name].with_tags([Tags.CATEGORICAL, Tags.EMBEDDING])
     dataset.schema = schema
     data_loader = Loader(
         dataset,
         batch_size=batch_size,
-        transforms=[MmapNumpyEmbedding(embeddings_file)],
+        transforms=[EmbeddingOperator(embeddings_file, mmap=True)],
         shuffle=False,
         device=cpu,
     )
@@ -90,13 +83,10 @@ def test_embedding_np_mmap_dl_with_lookup(tmpdir, rev_embedding_ids, np_embeddin
         schema[col_name] = schema[col_name].with_tags([Tags.CATEGORICAL, Tags.EMBEDDING])
     dataset.schema = schema

-    for col_name in cat_names:
-        schema[col_name] = schema[col_name].with_tags([Tags.CATEGORICAL, Tags.EMBEDDING])
-    dataset.schema = schema
     data_loader = Loader(
         dataset,
         batch_size=batch_size,
-        transforms=[MmapNumpyEmbedding(embeddings_file, ids_lookup_npz=id_lookup_file)],
+        transforms=[EmbeddingOperator(embeddings_file, id_lookup_table=id_lookup_file, mmap=True)],
         shuffle=False,
         device=cpu,
     )
@@ -121,10 +111,6 @@ def test_embedding_np_dl_no_lookup(tmpdir, embedding_ids, embeddings_from_datafr
     dataset = Dataset(str(pq_path))
     dataset = dataset.repartition(10)
     schema = dataset.schema
-    for col_name in cat_names:
-        schema[col_name] = schema[col_name].with_tags([Tags.CATEGORICAL, Tags.EMBEDDING])
-    dataset.schema = schema
-
     for col_name in cat_names:
         schema[col_name] = schema[col_name].with_tags([Tags.CATEGORICAL, Tags.EMBEDDING])
     dataset.schema = schema
@@ -134,7 +120,7 @@ def test_embedding_np_dl_no_lookup(tmpdir, embedding_ids, embeddings_from_datafr
     data_loader = Loader(
         dataset,
         batch_size=batch_size,
-        transforms=[NumpyEmbeddingOperator(embeddings_np)],
+        transforms=[EmbeddingOperator(embeddings_np)],
         shuffle=False,
         device=cpu,
     )
@@ -160,10 +146,6 @@ def test_embedding_np_dl_with_lookup(tmpdir, rev_embedding_ids, embeddings_from_
     dataset = Dataset(str(pq_path))
     dataset = dataset.repartition(10)
     schema = dataset.schema
-    for col_name in cat_names:
-        schema[col_name] = schema[col_name].with_tags([Tags.CATEGORICAL, Tags.EMBEDDING])
-    dataset.schema = schema
-
     for col_name in cat_names:
         schema[col_name] = schema[col_name].with_tags([Tags.CATEGORICAL, Tags.EMBEDDING])
     dataset.schema = schema
@@ -173,9 +155,7 @@ def test_embedding_np_dl_with_lookup(tmpdir, rev_embedding_ids, embeddings_from_
     data_loader = Loader(
         dataset,
         batch_size=batch_size,
-        transforms=[
-            NumpyEmbeddingOperator(embeddings_np, id_lookup_table=embedding_ids.to_numpy())
-        ],
+        transforms=[EmbeddingOperator(embeddings_np, id_lookup_table=embedding_ids.to_numpy())],
         shuffle=False,
         device=cpu,
     )
@@ -192,77 +172,44 @@ def test_embedding_np_dl_with_lookup(tmpdir, rev_embedding_ids, embeddings_from_


 @pytest.mark.parametrize("cpu", [None, "cpu"] if HAS_GPU else ["cpu"])
-def test_embedding_dl_no_lookup(tmpdir, embedding_ids, embeddings_from_dataframe, cpu):
+def test_embedding_np_dl_with_lookup_ragged(
+    tmpdir, rev_embedding_ids, embeddings_from_dataframe, cpu
+):
     cat_names = ["id"]
-    batch_size = 10000
+    batch_size = 5
     pq_path = tmpdir / "id.parquet"
-    embedding_ids.to_parquet(pq_path)
-    dataset = Dataset(str(pq_path))
+    embedding_ids = rev_embedding_ids["id"][:100].to_numpy()
+    offsets = np.array([0, 10, 15, 20, 30, 40, 45, 55, 65, 75, 80, 90, 100])
+    tensor_df = TensorTable({"id": TensorColumn(embedding_ids, offsets=offsets)}).to_df()
+    tensor_df.to_parquet(pq_path)
+    dataset = Dataset(str(pq_path), cpu=bool(cpu))
     dataset = dataset.repartition(10)
     schema = dataset.schema
-    for col_name in cat_names:
-        schema[col_name] = schema[col_name].with_tags([Tags.CATEGORICAL, Tags.EMBEDDING])
-    dataset.schema = schema
-
     for col_name in cat_names:
         schema[col_name] = schema[col_name].with_tags([Tags.CATEGORICAL, Tags.EMBEDDING])
     dataset.schema = schema
     paths = sorted(glob.glob(f"{embeddings_from_dataframe}/*"))
     embeddings_ds = Dataset(paths)
-    np_tensor = embeddings_ds.to_ddf().compute().to_numpy()[:, 1:]
+    embeddings_np = embeddings_ds.to_ddf().compute().to_numpy()[:100, 1:]
     data_loader = Loader(
         dataset,
         batch_size=batch_size,
-        transforms=[EmbeddingOperator(np_tensor)],
+        transforms=[EmbeddingOperator(embeddings_np, id_lookup_table=embedding_ids)],
         shuffle=False,
         device=cpu,
     )
     full_len = 0
+    old_end = 0
     for idx, batch in enumerate(data_loader):
         assert "embeddings" in batch[0]
         assert "id" in batch[0]
-        start = idx * batch_size
-        end = start + int(batch[0]["id"].shape[0])
+        start = old_end
+        end = start + int(batch[0]["id"].cpu().values.shape[0])
+        old_end = end
+        id_offsets = batch[0]["id"].cpu().offsets
         embeddings_vals = batch[0]["embeddings"].cpu().values
-        assert (embeddings_vals == np_tensor[start:end]).all()
-        full_len += int(batch[0]["embeddings"].shape[0])
-    assert full_len == embedding_ids.shape[0]
-
-
-@pytest.mark.parametrize("cpu", [None, "cpu"] if HAS_GPU else ["cpu"])
-def test_embedding_dl_with_lookup(tmpdir, rev_embedding_ids, embeddings_from_dataframe, cpu):
-    cat_names = ["id"]
-    batch_size = 10000
-    pq_path = tmpdir / "id.parquet"
-    embedding_ids = rev_embedding_ids
-    embedding_ids.to_parquet(pq_path)
-    dataset = Dataset(str(pq_path))
-    dataset = dataset.repartition(10)
-    schema = dataset.schema
-    for col_name in cat_names:
-        schema[col_name] = schema[col_name].with_tags([Tags.CATEGORICAL, Tags.EMBEDDING])
-    dataset.schema = schema
-
-    for col_name in cat_names:
-        schema[col_name] = schema[col_name].with_tags([Tags.CATEGORICAL, Tags.EMBEDDING])
-    dataset.schema = schema
-    paths = sorted(glob.glob(f"{embeddings_from_dataframe}/*"))
-    embeddings_ds = Dataset(paths)
-    np_tensor = embeddings_ds.to_ddf().compute().to_numpy()[:, 1:]
-    data_loader = Loader(
-        dataset,
-        batch_size=batch_size,
-        transforms=[EmbeddingOperator(np_tensor, id_lookup_table=embedding_ids.to_numpy())],
-        shuffle=False,
-        device=cpu,
-    )
-    full_len = 0
-    for idx, batch in enumerate(data_loader):
-        assert "embeddings" in batch[0]
-        assert "id" in batch[0]
-        start = idx * batch_size
-        end = start + int(batch[0]["id"].shape[0])
-        embeddings_vals = batch[0]["embeddings"].cpu().values
-        assert (embeddings_vals == np_tensor[start:end]).all()
+        embeddings_offs = batch[0]["embeddings"].cpu().offsets
+        assert (embeddings_vals == embeddings_np[start:end]).all()
+        assert (embeddings_offs == id_offsets).all()
         full_len += int(batch[0]["embeddings"].shape[0])
-    assert full_len == embedding_ids.shape[0]
+    assert full_len == offsets.shape[0] - 1
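
The offsets convention the new ragged test asserts, sketched with plain numpy (arbitrary values): offsets delimit each row's slice of the flat values array, so a column with N + 1 offsets holds N rows. That is why the test checks full_len == offsets.shape[0] - 1 and that the embeddings column carries the same offsets as the id column:

    import numpy as np

    values = np.arange(10)             # flat ids across all rows
    offsets = np.array([0, 3, 5, 10])  # row i is values[offsets[i]:offsets[i + 1]]

    num_rows = offsets.shape[0] - 1    # 3 ragged rows
    rows = [values[offsets[i]:offsets[i + 1]] for i in range(num_rows)]
    # rows -> [array([0, 1, 2]), array([3, 4]), array([5, 6, 7, 8, 9])]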
