Skip to content

Commit a099c59

Browse files
Authored and committed by The TensorFlow Datasets Authors
Leverage mlcroissant's filters in TFDS CroissantBuilder's _generate_example.
PiperOrigin-RevId: 655566285
1 parent d6002dd commit a099c59

File tree

1 file changed

+3
-8
lines changed

1 file changed

+3
-8
lines changed

tensorflow_datasets/core/dataset_builders/croissant_builder.py

Lines changed: 3 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -260,7 +260,7 @@ def _split_generators(
260260
split['name']: self._generate_examples(
261261
filters={
262262
**self._filters,
263-
split_reference.reference_field.id: split['name'].encode(),
263+
split_reference.reference_field.id: split['name'],
264264
}
265265
)
266266
for split in split_reference.split_record_set.data
@@ -285,15 +285,10 @@ def _generate_examples(
285285
record_set = croissant_utils.get_record_set(
286286
self.builder_config.name, metadata=self.metadata
287287
)
288-
records = self.dataset.records(record_set.id)
288+
records = self.dataset.records(record_set.id, filters=filters)
289289
for i, record in enumerate(records):
290290
# Some samples might not be TFDS-compatible as-is, e.g. from croissant
291291
# describing HuggingFace datasets, so we convert them here. This shouldn't
292292
# impact datasets which are already TFDS-compatible.
293293
record = conversion_utils.to_tfds_value(record, self.info.features)
294-
# After partition implementation, the filters will be applied from
295-
# mlcroissant `dataset.records` directly.
296-
# `records = records.filter(f == v for f, v in filters.items())``
297-
# For now, we apply them in TFDS.
298-
if all(record[filter] == value for filter, value in filters.items()):
299-
yield i, record
294+
yield i, record

0 commit comments

Comments
 (0)