2020
2121from merlin .core .dispatch import HAS_GPU
2222from merlin .dataloader .loader_base import LoaderBase as Loader # noqa
23- from merlin .dataloader .ops .embeddings import ( # noqa
24- EmbeddingOperator ,
25- MmapNumpyEmbedding ,
26- NumpyEmbeddingOperator ,
27- )
23+ from merlin .dataloader .ops .embeddings import EmbeddingOperator
2824from merlin .io import Dataset
2925from merlin .schema import Tags
26+ from merlin .table import TensorColumn , TensorTable
3027
3128
3229@pytest .mark .parametrize ("cpu" , [None , "cpu" ] if HAS_GPU else ["cpu" ])
@@ -40,17 +37,13 @@ def test_embedding_np_mmap_dl_no_lookup(tmpdir, embedding_ids, np_embeddings_fro
4037 dataset = Dataset (str (pq_path ))
4138 dataset = dataset .repartition (10 )
4239 schema = dataset .schema
43- for col_name in cat_names :
44- schema [col_name ] = schema [col_name ].with_tags ([Tags .CATEGORICAL , Tags .EMBEDDING ])
45- dataset .schema = schema
46-
4740 for col_name in cat_names :
4841 schema [col_name ] = schema [col_name ].with_tags ([Tags .CATEGORICAL , Tags .EMBEDDING ])
4942 dataset .schema = schema
5043 data_loader = Loader (
5144 dataset ,
5245 batch_size = batch_size ,
53- transforms = [MmapNumpyEmbedding (embeddings_file )],
46+ transforms = [EmbeddingOperator (embeddings_file , mmap = True )],
5447 shuffle = False ,
5548 device = cpu ,
5649 )
@@ -90,13 +83,10 @@ def test_embedding_np_mmap_dl_with_lookup(tmpdir, rev_embedding_ids, np_embeddin
9083 schema [col_name ] = schema [col_name ].with_tags ([Tags .CATEGORICAL , Tags .EMBEDDING ])
9184 dataset .schema = schema
9285
93- for col_name in cat_names :
94- schema [col_name ] = schema [col_name ].with_tags ([Tags .CATEGORICAL , Tags .EMBEDDING ])
95- dataset .schema = schema
9686 data_loader = Loader (
9787 dataset ,
9888 batch_size = batch_size ,
99- transforms = [MmapNumpyEmbedding (embeddings_file , ids_lookup_npz = id_lookup_file )],
89+ transforms = [EmbeddingOperator (embeddings_file , id_lookup_table = id_lookup_file , mmap = True )],
10090 shuffle = False ,
10191 device = cpu ,
10292 )
@@ -121,10 +111,6 @@ def test_embedding_np_dl_no_lookup(tmpdir, embedding_ids, embeddings_from_datafr
121111 dataset = Dataset (str (pq_path ))
122112 dataset = dataset .repartition (10 )
123113 schema = dataset .schema
124- for col_name in cat_names :
125- schema [col_name ] = schema [col_name ].with_tags ([Tags .CATEGORICAL , Tags .EMBEDDING ])
126- dataset .schema = schema
127-
128114 for col_name in cat_names :
129115 schema [col_name ] = schema [col_name ].with_tags ([Tags .CATEGORICAL , Tags .EMBEDDING ])
130116 dataset .schema = schema
@@ -134,7 +120,7 @@ def test_embedding_np_dl_no_lookup(tmpdir, embedding_ids, embeddings_from_datafr
134120 data_loader = Loader (
135121 dataset ,
136122 batch_size = batch_size ,
137- transforms = [NumpyEmbeddingOperator (embeddings_np )],
123+ transforms = [EmbeddingOperator (embeddings_np )],
138124 shuffle = False ,
139125 device = cpu ,
140126 )
@@ -160,10 +146,6 @@ def test_embedding_np_dl_with_lookup(tmpdir, rev_embedding_ids, embeddings_from_
160146 dataset = Dataset (str (pq_path ))
161147 dataset = dataset .repartition (10 )
162148 schema = dataset .schema
163- for col_name in cat_names :
164- schema [col_name ] = schema [col_name ].with_tags ([Tags .CATEGORICAL , Tags .EMBEDDING ])
165- dataset .schema = schema
166-
167149 for col_name in cat_names :
168150 schema [col_name ] = schema [col_name ].with_tags ([Tags .CATEGORICAL , Tags .EMBEDDING ])
169151 dataset .schema = schema
@@ -173,9 +155,7 @@ def test_embedding_np_dl_with_lookup(tmpdir, rev_embedding_ids, embeddings_from_
173155 data_loader = Loader (
174156 dataset ,
175157 batch_size = batch_size ,
176- transforms = [
177- NumpyEmbeddingOperator (embeddings_np , id_lookup_table = embedding_ids .to_numpy ())
178- ],
158+ transforms = [EmbeddingOperator (embeddings_np , id_lookup_table = embedding_ids .to_numpy ())],
179159 shuffle = False ,
180160 device = cpu ,
181161 )
@@ -192,77 +172,44 @@ def test_embedding_np_dl_with_lookup(tmpdir, rev_embedding_ids, embeddings_from_
192172
193173
194174@pytest .mark .parametrize ("cpu" , [None , "cpu" ] if HAS_GPU else ["cpu" ])
195- def test_embedding_dl_no_lookup (tmpdir , embedding_ids , embeddings_from_dataframe , cpu ):
175+ def test_embedding_np_dl_with_lookup_ragged (
176+ tmpdir , rev_embedding_ids , embeddings_from_dataframe , cpu
177+ ):
196178 cat_names = ["id" ]
197- batch_size = 10000
179+ batch_size = 5
198180 pq_path = tmpdir / "id.parquet"
199- embedding_ids .to_parquet (pq_path )
200- dataset = Dataset (str (pq_path ))
181+ embedding_ids = rev_embedding_ids ["id" ][:100 ].to_numpy ()
182+ offsets = np .array ([0 , 10 , 15 , 20 , 30 , 40 , 45 , 55 , 65 , 75 , 80 , 90 , 100 ])
183+ tensor_df = TensorTable ({"id" : TensorColumn (embedding_ids , offsets = offsets )}).to_df ()
184+ tensor_df .to_parquet (pq_path )
185+ dataset = Dataset (str (pq_path ), cpu = bool (cpu ))
201186 dataset = dataset .repartition (10 )
202187 schema = dataset .schema
203- for col_name in cat_names :
204- schema [col_name ] = schema [col_name ].with_tags ([Tags .CATEGORICAL , Tags .EMBEDDING ])
205- dataset .schema = schema
206-
207188 for col_name in cat_names :
208189 schema [col_name ] = schema [col_name ].with_tags ([Tags .CATEGORICAL , Tags .EMBEDDING ])
209190 dataset .schema = schema
210191 paths = sorted (glob .glob (f"{ embeddings_from_dataframe } /*" ))
211192 embeddings_ds = Dataset (paths )
212- np_tensor = embeddings_ds .to_ddf ().compute ().to_numpy ()[:, 1 :]
193+ embeddings_np = embeddings_ds .to_ddf ().compute ().to_numpy ()[:100 , 1 :]
213194 data_loader = Loader (
214195 dataset ,
215196 batch_size = batch_size ,
216- transforms = [EmbeddingOperator (np_tensor )],
197+ transforms = [EmbeddingOperator (embeddings_np , id_lookup_table = embedding_ids )],
217198 shuffle = False ,
218199 device = cpu ,
219200 )
220201 full_len = 0
202+ old_end = 0
221203 for idx , batch in enumerate (data_loader ):
222204 assert "embeddings" in batch [0 ]
223205 assert "id" in batch [0 ]
224- start = idx * batch_size
225- end = start + int (batch [0 ]["id" ].shape [0 ])
206+ start = old_end
207+ end = start + int (batch [0 ]["id" ].cpu ().values .shape [0 ])
208+ old_end = end
209+ id_offsets = batch [0 ]["id" ].cpu ().offsets
226210 embeddings_vals = batch [0 ]["embeddings" ].cpu ().values
227- assert (embeddings_vals == np_tensor [start :end ]).all ()
228- full_len += int (batch [0 ]["embeddings" ].shape [0 ])
229- assert full_len == embedding_ids .shape [0 ]
230-
231-
232- @pytest .mark .parametrize ("cpu" , [None , "cpu" ] if HAS_GPU else ["cpu" ])
233- def test_embedding_dl_with_lookup (tmpdir , rev_embedding_ids , embeddings_from_dataframe , cpu ):
234- cat_names = ["id" ]
235- batch_size = 10000
236- pq_path = tmpdir / "id.parquet"
237- embedding_ids = rev_embedding_ids
238- embedding_ids .to_parquet (pq_path )
239- dataset = Dataset (str (pq_path ))
240- dataset = dataset .repartition (10 )
241- schema = dataset .schema
242- for col_name in cat_names :
243- schema [col_name ] = schema [col_name ].with_tags ([Tags .CATEGORICAL , Tags .EMBEDDING ])
244- dataset .schema = schema
245-
246- for col_name in cat_names :
247- schema [col_name ] = schema [col_name ].with_tags ([Tags .CATEGORICAL , Tags .EMBEDDING ])
248- dataset .schema = schema
249- paths = sorted (glob .glob (f"{ embeddings_from_dataframe } /*" ))
250- embeddings_ds = Dataset (paths )
251- np_tensor = embeddings_ds .to_ddf ().compute ().to_numpy ()[:, 1 :]
252- data_loader = Loader (
253- dataset ,
254- batch_size = batch_size ,
255- transforms = [EmbeddingOperator (np_tensor , id_lookup_table = embedding_ids .to_numpy ())],
256- shuffle = False ,
257- device = cpu ,
258- )
259- full_len = 0
260- for idx , batch in enumerate (data_loader ):
261- assert "embeddings" in batch [0 ]
262- assert "id" in batch [0 ]
263- start = idx * batch_size
264- end = start + int (batch [0 ]["id" ].shape [0 ])
265- embeddings_vals = batch [0 ]["embeddings" ].cpu ().values
266- assert (embeddings_vals == np_tensor [start :end ]).all ()
211+ embeddings_offs = batch [0 ]["embeddings" ].cpu ().offsets
212+ assert (embeddings_vals == embeddings_np [start :end ]).all ()
213+ assert (embeddings_offs == id_offsets ).all ()
267214 full_len += int (batch [0 ]["embeddings" ].shape [0 ])
268- assert full_len == embedding_ids .shape [0 ]
215+ assert full_len == offsets .shape [0 ] - 1
0 commit comments