Skip to content

Commit 7689579

Browse files
fineguy and The TensorFlow Datasets Authors
authored and committed
Add a context manager to create a dummy Croissant file.
PiperOrigin-RevId: 640891862
1 parent 7eeb200 commit 7689579

File tree

4 files changed

+110
-87
lines changed

4 files changed

+110
-87
lines changed

tensorflow_datasets/core/dataset_builders/croissant_builder_test.py

Lines changed: 11 additions & 85 deletions
Original file line numberDiff line numberDiff line change
@@ -15,10 +15,6 @@
1515

1616
"""Tests for croissant_builder."""
1717

18-
import json
19-
import tempfile
20-
21-
from etils import epath
2218
import numpy as np
2319
import pytest
2420
from tensorflow_datasets import testing
@@ -28,61 +24,6 @@
2824
from tensorflow_datasets.core.features import text_feature
2925
from tensorflow_datasets.core.utils.lazy_imports_utils import mlcroissant as mlc
3026

31-
DUMMY_ENTRIES = [{"index": i, "text": f"Dummy example {i}"} for i in range(2)]
32-
33-
34-
def get_dummy_metadata():
35-
distribution = [
36-
mlc.FileObject(
37-
id="raw_data",
38-
description="File with the data.",
39-
encoding_format="application/jsonlines",
40-
content_url="data/raw_data.jsonl",
41-
sha256=(
42-
"ec6a2e5865be2c3ea2bf41817bf9ca78cbfcdd60bce0282721da8625a28fd10d"
43-
),
44-
),
45-
]
46-
record_sets = [
47-
mlc.RecordSet(
48-
id="jsonl",
49-
description="Dummy record set.",
50-
fields=[
51-
mlc.Field(
52-
name="index",
53-
description="The sample index.",
54-
data_types=mlc.DataType.INTEGER,
55-
source=mlc.Source(
56-
file_object="raw_data",
57-
extract=mlc.Extract(column="index"),
58-
),
59-
),
60-
mlc.Field(
61-
name="text",
62-
description="The dummy sample text.",
63-
data_types=mlc.DataType.TEXT,
64-
source=mlc.Source(
65-
file_object="raw_data",
66-
extract=mlc.Extract(column="text"),
67-
),
68-
),
69-
],
70-
)
71-
]
72-
dummy_metadata = mlc.Metadata(
73-
name="DummyDataset",
74-
description="Dummy description.",
75-
cite_as=(
76-
"@article{dummyarticle, title={title}, author={author}, year={2020}}"
77-
),
78-
url="https://dummy_url",
79-
distribution=distribution,
80-
record_sets=record_sets,
81-
version="1.2.0",
82-
license="Public",
83-
)
84-
return dummy_metadata
85-
8627

8728
@pytest.mark.parametrize(
8829
["field", "feature_type", "int_dtype", "float_dtype"],
@@ -169,29 +110,15 @@ class CroissantBuilderTest(testing.TestCase):
169110
def setUpClass(cls):
  """Builds and prepares a dummy Croissant dataset shared by all tests.

  Post-commit version: the hand-rolled JSONL/JSON-LD setup was replaced by
  the `testing.dummy_croissant_file` context manager added in this commit.
  NOTE(review): reconstructed from a diff view with line-number artifacts;
  the runtime tokens match the "+" side of the hunk exactly.
  """
  super(CroissantBuilderTest, cls).setUpClass()
  with testing.dummy_croissant_file() as croissant_file:
    cls._tfds_tmp_dir = testing.make_tmp_dir()
    cls.builder = croissant_builder.CroissantBuilder(
        jsonld=croissant_file,
        file_format=FileFormat.ARRAY_RECORD,
        disable_shuffling=True,
        data_dir=cls._tfds_tmp_dir,
    )
    # Must run inside the context manager: the temporary Croissant file and
    # its raw data are deleted when the context exits.
    cls.builder.download_and_prepare()
195122

196123
def test_dataset_info(self):
197124
assert self.builder.name == "dummydataset"
@@ -211,10 +138,9 @@ def test_dataset_info(self):
211138
)
212139

213140
def test_generated_samples(self):
  """Checks both splits yield exactly the two expected dummy examples.

  Post-commit version: `download_and_prepare()` moved to `setUpClass`, and
  expectations are inlined (the module-level DUMMY_ENTRIES constant was
  removed by this commit).
  """
  for split_name in ["all", "default"]:
    data_source = self.builder.as_data_source(split=split_name)
    assert len(data_source) == 2
    for i in range(2):
      # Entries were written as {"index": i, "text": f"Dummy example {i}"};
      # text round-trips as bytes, hence the decode().
      assert data_source[i]["index"] == i
      assert data_source[i]["text"].decode() == f"Dummy example {i}"

tensorflow_datasets/testing/__init__.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@
3939
from tensorflow_datasets.testing.test_case_in_context import TestCaseInContext
4040
from tensorflow_datasets.testing.test_utils import assert_features_equal
4141
from tensorflow_datasets.testing.test_utils import disable_gcs_access
42+
from tensorflow_datasets.testing.test_utils import dummy_croissant_file
4243
from tensorflow_datasets.testing.test_utils import DummyBeamDataset
4344
from tensorflow_datasets.testing.test_utils import DummyDataset
4445
from tensorflow_datasets.testing.test_utils import DummyDatasetCollection
@@ -53,9 +54,9 @@
5354
from tensorflow_datasets.testing.test_utils import MockFs
5455
from tensorflow_datasets.testing.test_utils import rm_tmp_dir
5556
from tensorflow_datasets.testing.test_utils import run_in_graph_and_eager_modes
57+
from tensorflow_datasets.testing.test_utils import set_current_datetime
5658
from tensorflow_datasets.testing.test_utils import test_main
5759
from tensorflow_datasets.testing.test_utils import tmp_dir
58-
from tensorflow_datasets.testing.test_utils import set_current_datetime
5960
# LINT.ThenChange(:deps)
6061
# pylint: enable=g-import-not-at-top,g-importing-member
6162

@@ -66,6 +67,7 @@
6667
"tensorflow_datasets.testing.dataset_builder_testing"
6768
),
6869
"disable_gcs_access": "tensorflow_datasets.testing.test_utils",
70+
"dummy_croissant_file": "tensorflow_datasets.testing.test_utils",
6971
"DummyBeamDataset": "tensorflow_datasets.testing.test_utils",
7072
"DummyDataset": "tensorflow_datasets.testing.test_utils",
7173
"DummyDatasetCollection": "tensorflow_datasets.testing.test_utils",
@@ -90,14 +92,14 @@
9092
# TODO(afrozm): rm from here and add as methods to TestCase
9193
"rm_tmp_dir": "tensorflow_datasets.testing.test_utils",
9294
"run_in_graph_and_eager_modes": "tensorflow_datasets.testing.test_utils",
95+
"set_current_datetime": "tensorflow_datasets.testing.test_utils",
9396
"SubTestCase": "tensorflow_datasets.testing.feature_test_case",
9497
"test_main": "tensorflow_datasets.testing.test_utils",
9598
"TestCase": "tensorflow_datasets.testing.test_case",
9699
"TestCaseInContext": "tensorflow_datasets.testing.test_case_in_context",
97100
"TestValue": "tensorflow_datasets.testing.feature_test_case",
98101
# TODO(afrozm): rm from here and add as methods to TestCase
99102
"tmp_dir": "tensorflow_datasets.testing.test_utils",
100-
"set_current_datetime": "tensorflow_datasets.testing.test_utils",
101103
# LINT.ThenChange(:pydeps)
102104
}
103105

tensorflow_datasets/testing/test_utils.py

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
import dataclasses
2222
import datetime
2323
import functools
24+
import json
2425
import os
2526
import pathlib
2627
import subprocess
@@ -39,6 +40,7 @@
3940
from tensorflow_datasets.core import lazy_imports_lib
4041
from tensorflow_datasets.core import naming
4142
from tensorflow_datasets.core import utils
43+
from tensorflow_datasets.core.utils.lazy_imports_utils import mlcroissant as mlc
4244
from tensorflow_datasets.core.utils.lazy_imports_utils import tensorflow as tf
4345

4446

@@ -706,3 +708,77 @@ def now(cls, tz=None) -> datetime.datetime:
706708

707709
with mock.patch.object(datetime, 'datetime', new=MockDatetime):
708710
yield
711+
712+
713+
@contextlib.contextmanager
def dummy_croissant_file() -> Iterator[epath.Path]:
  """Yields temporary path to a dummy Croissant file.

  The function creates a temporary directory that stores raw data files and
  the Croissant JSON-LD. Everything is deleted when the context exits, so
  callers must consume the file (e.g. build a dataset) inside the `with`.
  """
  # Two fixed examples; the sha256 below is the digest of the JSONL file
  # these entries produce, so any change here must update it too.
  entries = [{'index': i, 'text': f'Dummy example {i}'} for i in range(2)]
  distribution = [
      mlc.FileObject(
          id='raw_data',
          description='File with the data.',
          encoding_format='application/jsonlines',
          content_url='data/raw_data.jsonl',
          sha256=(
              'b13bbcd65bb5ec7c0c64cbceb635de3eadda17f3311c5982dc2d5a342ed97690'
          ),
      ),
  ]
  record_sets = [
      mlc.RecordSet(
          id='jsonl',
          description='Dummy record set.',
          fields=[
              mlc.Field(
                  name='index',
                  description='The sample index.',
                  data_types=mlc.DataType.INTEGER,
                  source=mlc.Source(
                      file_object='raw_data',
                      extract=mlc.Extract(column='index'),
                  ),
              ),
              mlc.Field(
                  name='text',
                  description='The dummy sample text.',
                  data_types=mlc.DataType.TEXT,
                  source=mlc.Source(
                      file_object='raw_data',
                      extract=mlc.Extract(column='text'),
                  ),
              ),
          ],
      )
  ]
  dummy_metadata = mlc.Metadata(
      name='DummyDataset',
      description='Dummy description.',
      cite_as=(
          '@article{dummyarticle, title={title}, author={author}, year={2020}}'
      ),
      url='https://dummy_url',
      distribution=distribution,
      record_sets=record_sets,
      version='1.2.0',
      license='Public',
  )

  with tempfile.TemporaryDirectory() as tempdir:
    tempdir = epath.Path(tempdir)

    # Write raw examples to tempdir/data, matching the FileObject's
    # relative content_url ("data/raw_data.jsonl").
    raw_data_dir = tempdir / 'data'
    raw_data_dir.mkdir()
    raw_data_file = raw_data_dir / 'raw_data.jsonl'
    raw_data_file.write_text('\n'.join(map(json.dumps, entries)))

    # Write Croissant JSON-LD to tempdir.
    croissant_file = tempdir / 'croissant.json'
    croissant_file.write_text(json.dumps(dummy_metadata.to_json(), indent=2))

    yield croissant_file

tensorflow_datasets/testing/test_utils_test.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717

1818
import pathlib
1919

20+
import mlcroissant as mlc
2021
import pytest
2122
import tensorflow as tf
2223
from tensorflow_datasets.testing import test_case
@@ -226,3 +227,21 @@ def is_lambda(fn):
226227
assert not is_lambda(gcs_utils.gcs_dataset_info_files)
227228
assert is_lambda(gcs_utils.gcs_dataset_info_files)
228229
assert not is_lambda(gcs_utils.gcs_dataset_info_files)
230+
231+
232+
def test_dummy_croissant_file():
  """Smoke-tests the dummy Croissant file via the mlcroissant loader."""
  # NOTE(review): the scraped diff drops indentation; assertions are placed
  # inside the `with` because the temporary file (and its raw data) is
  # deleted on exit, so `dataset.records(...)` must run before then.
  with test_utils.dummy_croissant_file() as croissant_file:
    dataset = mlc.Dataset(jsonld=croissant_file)

    assert dataset.jsonld == croissant_file
    assert dataset.mapping is None
    assert dataset.metadata.description == 'Dummy description.'
    assert [record_set.id for record_set in dataset.metadata.record_sets] == [
        'jsonl'
    ]
    assert [record for record in dataset.records('jsonl')] == [
        {'text': b'Dummy example 0', 'index': 0},
        {'text': b'Dummy example 1', 'index': 1},
    ]
    assert dataset.metadata.url == 'https://dummy_url'
    assert dataset.metadata.version == '1.2.0'

0 commit comments

Comments
 (0)