Skip to content

Commit 477aa74

Browse files
fineguy authored and
The TensorFlow Datasets Authors committed
Fix conversion of HF datasets.
PiperOrigin-RevId: 623162414
1 parent 1c56be4 commit 477aa74

File tree

2 files changed

+26
-4
lines changed

2 files changed

+26
-4
lines changed

tensorflow_datasets/core/dataset_builders/huggingface_dataset_builder.py

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -116,7 +116,8 @@ def get_serialized_examples_iter():
116116
split=shard_spec.shard_split, run_post_process=False
117117
):
118118
example = huggingface_utils.convert_hf_value(hf_value, features)
119-
serialized_example = serializer.serialize_example(example)
119+
encoded_example = features.encode_example(example)
120+
serialized_example = serializer.serialize_example(encoded_example)
120121
num_bytes += len(serialized_example)
121122
yield serialized_example
122123

tensorflow_datasets/core/dataset_builders/huggingface_dataset_builder_test.py

Lines changed: 24 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,9 +16,13 @@
1616
from unittest import mock
1717

1818
import datasets as hf_datasets
19+
import numpy as np
1920
import pytest
21+
from tensorflow_datasets.core import lazy_imports_lib
2022
from tensorflow_datasets.core.dataset_builders import huggingface_dataset_builder
2123

24+
PIL_Image = lazy_imports_lib.lazy_imports.PIL_Image
25+
2226

2327
class DummyHuggingfaceBuilder(hf_datasets.GeneratorBasedBuilder):
2428

@@ -35,7 +39,11 @@ def _split_generators(self, dl_manager):
3539

3640
def _generate_examples(self):
3741
for i in range(2):
38-
yield i, {'feature': i}
42+
yield i, {
43+
'number': i,
44+
'text': f'{i}',
45+
'image': PIL_Image.new(mode='L', size=(4, 4)),
46+
}
3947

4048
def download_and_prepare(self, *args, **kwargs):
4149
# Disable downloads from GCS
@@ -87,8 +95,17 @@ def mock_huggingface_dataset_builder(
8795
def test_download_and_prepare(builder):
8896
builder.download_and_prepare()
8997
ds = builder.as_data_source()
98+
expected_image = PIL_Image.new(mode='RGB', size=(4, 4))
9099
# Split names are sanitized, eg train.clean -> train_clean
91-
assert list(ds['train_clean']) == [{'feature': 0}, {'feature': 1}]
100+
for element, expected in zip(
101+
ds['train_clean'],
102+
[
103+
{'number': 0, 'text': b'0', 'image': expected_image},
104+
{'number': 1, 'text': b'1', 'image': expected_image},
105+
],
106+
):
107+
for feature in ['number', 'text', 'image']:
108+
assert np.array_equal(element[feature], expected[feature])
92109

93110

94111
def test_all_parameters_are_passed_down_to_hf(builder):
@@ -98,4 +115,8 @@ def test_all_parameters_are_passed_down_to_hf(builder):
98115

99116

100117
def test_hf_features(builder):
101-
assert builder._hf_features() == {'feature': hf_datasets.Value('int64')}
118+
assert builder._hf_features() == {
119+
'number': hf_datasets.Value('int64'),
120+
'text': hf_datasets.Value('string'),
121+
'image': hf_datasets.Image(),
122+
}

0 commit comments

Comments
 (0)