
Commit c74b436

Add more tests to test_dataset and test_io (#594)
* Add more tests to test_dataset
* Add more read_custom tests
* ruff

Signed-off-by: Sarah Yurick <sarahyurick@gmail.com>
Signed-off-by: Sarah Yurick <53962159+sarahyurick@users.noreply.github.com>
1 parent a4ce2de commit c74b436

File tree

3 files changed: +119 -1 lines changed

nemo_curator/datasets/doc_dataset.py

Lines changed: 4 additions & 1 deletion
@@ -187,7 +187,7 @@ def read_custom(  # noqa: PLR0913
                 and read all files under the directory.
                 If input_file is a list of strings, we assume each string is a file path.
             file_type: The type of the file to read.
-            read_func_single_partition: A function that reads a single file or a list of files in an single dask partition.
+            read_func_single_partition: A function that reads a single file or a list of files in a single Dask partition.
                 The function should take the following arguments:
                 - files: A list of file paths.
                 - file_type: The type of the file to read (in case you want to handle different file types differently).
@@ -204,6 +204,7 @@ def read_custom(  # noqa: PLR0913
             input_meta: A dictionary or a string formatted as a dictionary, which outlines
                 the field names and their respective data types within the JSONL input file.
         """
+
         if isinstance(input_files, str):
             if input_files.endswith(file_type):
                 files = [input_files]
@@ -218,9 +219,11 @@ def read_custom(  # noqa: PLR0913
         else:
             msg = "input_files must be a string or list"
             raise TypeError(msg)
+
         return cls(
             read_data(
                 input_files=files,
+                file_type=file_type,
                 backend=backend,
                 files_per_partition=files_per_partition,
                 blocksize=None,
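For orientation, here is a minimal sketch of the call path this patch fixes: with file_type now forwarded to read_data, a custom per-partition reader receives it alongside the file list. The read_jsonl helper mirrors the one added in tests/test_io.py below; the input paths are hypothetical.

import pandas as pd

from nemo_curator.datasets import DocumentDataset


def read_jsonl(files: list[str], **kwargs) -> pd.DataFrame:  # noqa: ARG001
    # Read every file handed to this Dask partition and stack the rows.
    return pd.concat([pd.read_json(f, lines=True) for f in files], ignore_index=True)


# Hypothetical paths; any list of .jsonl files works the same way.
dataset = DocumentDataset.read_custom(
    input_files=["data/part_1.jsonl", "data/part_2.jsonl"],
    file_type="jsonl",
    read_func_single_partition=read_jsonl,
    files_per_partition=1,
)
print(dataset.df.compute())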

tests/test_dataset.py

Lines changed: 75 additions & 0 deletions
@@ -12,13 +12,88 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+import os
+from pathlib import Path
+
 import pandas as pd
+import pytest

 from nemo_curator.datasets import DocumentDataset
+from nemo_curator.datasets.doc_dataset import _read_json_or_parquet


 def test_to_from_pandas() -> None:
     original_df = pd.DataFrame({"first_col": [1, 2, 3], "second_col": ["a", "b", "c"]})
     dataset = DocumentDataset.from_pandas(original_df)
     converted_df = dataset.to_pandas()
     pd.testing.assert_frame_equal(original_df, converted_df)
+
+
+def test_persist() -> None:
+    original_df = pd.DataFrame({"first_col": [1, 2, 3], "second_col": ["a", "b", "c"]})
+    dataset = DocumentDataset.from_pandas(original_df)
+    dataset.persist()
+
+
+def test_repartition() -> None:
+    original_df = pd.DataFrame({"first_col": [1, 2, 3], "second_col": ["a", "b", "c"]})
+    dataset = DocumentDataset.from_pandas(original_df)
+    dataset = dataset.repartition(npartitions=3)
+    assert dataset.df.npartitions == 3  # noqa: PLR2004
+
+
+def test_head() -> None:
+    original_df = pd.DataFrame({"first_col": [1, 2, 3], "second_col": ["a", "b", "c"]})
+    dataset = DocumentDataset.from_pandas(original_df)
+    expected_df = pd.DataFrame({"first_col": [1, 2], "second_col": ["a", "b"]})
+    pd.testing.assert_frame_equal(expected_df, dataset.head(2))
+
+
+def test_read_pickle(tmpdir: Path) -> None:
+    original_df = pd.DataFrame({"first_col": [1, 2, 3], "second_col": ["a", "b", "c"]})
+    output_file = str(tmpdir / "output.pkl")
+    original_df.to_pickle(output_file)
+    dataset = DocumentDataset.read_pickle(output_file)
+    pd.testing.assert_frame_equal(original_df, dataset.df.compute())
+
+
+def test_to_pickle(tmpdir: Path) -> None:
+    original_df = pd.DataFrame({"first_col": [1, 2, 3], "second_col": ["a", "b", "c"]})
+    dataset = DocumentDataset.from_pandas(original_df)
+
+    output_file = str(tmpdir / "output.pkl")
+    with pytest.raises(NotImplementedError):
+        dataset.to_pickle(output_file)
+
+
+def test_read_json_or_parquet(tmpdir: Path) -> None:
+    original_df = pd.DataFrame({"first_col": [1, 2, 3], "second_col": ["a", "b", "c"]})
+
+    directory_1 = str(tmpdir / "directory_1")
+    directory_2 = str(tmpdir / "directory_2")
+    os.makedirs(directory_1, exist_ok=True)
+    os.makedirs(directory_2, exist_ok=True)
+
+    file_1 = directory_1 + "/file_1.jsonl"
+    file_2 = directory_2 + "/file_2.jsonl"
+    original_df.to_json(file_1, orient="records", lines=True)
+    original_df.to_json(file_2, orient="records", lines=True)
+
+    # List of directories
+    data = _read_json_or_parquet(
+        input_files=[directory_1, directory_2],
+        file_type="jsonl",
+        backend="pandas",
+        files_per_partition=1,
+    )
+    assert len(data) == 6  # noqa: PLR2004
+
+    file_series = pd.Series([file_1, file_2])
+    # Non string or list input
+    with pytest.raises(TypeError):
+        data = _read_json_or_parquet(
+            input_files=file_series,
+            file_type="jsonl",
+            backend="pandas",
+            files_per_partition=1,
+        )
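The len(data) == 6 assertion follows from directory expansion: each directory input is scanned for files of the given file_type, so two three-row JSONL files yield six rows. The public readers go through the same helper; a minimal sketch of the equivalent call via DocumentDataset.read_json (assuming its current signature; the directory names are hypothetical):

from nemo_curator.datasets import DocumentDataset

# read_json delegates to _read_json_or_parquet, so directories are
# expanded to the .jsonl files beneath them.
dataset = DocumentDataset.read_json(
    input_files=["directory_1", "directory_2"],
    backend="pandas",
    files_per_partition=1,
)
assert len(dataset.df) == 6  # two files x three rows each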

tests/test_io.py

Lines changed: 40 additions & 0 deletions
@@ -157,6 +157,7 @@ def read_npy_file(files: list[str], backend: Literal["cudf", "pandas"], **kwargs
             [{**json.loads(pickle.load(open(file, "rb")))} for file in files],  # noqa: S301
         )

+        # Directory
         dataset = DocumentDataset.read_custom(
             input_files=tmp_dir,
             file_type="pkl",
@@ -172,6 +173,45 @@ def read_npy_file(files: list[str], backend: Literal["cudf", "pandas"], **kwargs
             ),  # because we sort columns by name
         )

+    def test_read_custom_input_files(self, tmp_path: Path) -> None:
+        # Prepare files
+        df = pd.DataFrame({"id": [1, 2, 3], "text": ["a", "b", "c"]})
+        file_1 = str(tmp_path / "test_file_1.jsonl")
+        file_2 = str(tmp_path / "test_file_2.jsonl")
+        df.to_json(file_1, orient="records", lines=True)
+        df.to_json(file_2, orient="records", lines=True)
+
+        def read_jsonl(files: list[str], **kwargs) -> pd.DataFrame:  # noqa: ARG001
+            return pd.concat([pd.read_json(f, lines=True) for f in files], ignore_index=True)
+
+        # Single file
+        dataset = DocumentDataset.read_custom(
+            input_files=file_1,
+            file_type="jsonl",
+            read_func_single_partition=read_jsonl,
+            files_per_partition=1,
+        )
+        assert dataset.df.compute().equals(df)
+
+        # List of files
+        dataset = DocumentDataset.read_custom(
+            input_files=[file_1, file_2],
+            file_type="jsonl",
+            read_func_single_partition=read_jsonl,
+            files_per_partition=1,
+        )
+        assert len(dataset.df) == 6  # noqa: PLR2004
+
+        file_series = pd.Series([file_1, file_2])
+        # Non string or list input
+        with pytest.raises(TypeError):
+            dataset = DocumentDataset.read_custom(
+                input_files=file_series,
+                file_type="jsonl",
+                read_func_single_partition=read_jsonl,
+                files_per_partition=1,
+            )
+

 class TestWriteWithFilename:
     @pytest.mark.parametrize("keep_filename_column", [True, False])
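Because file_type is now forwarded by read_custom (the doc_dataset.py fix above), a single read_func_single_partition can dispatch on it instead of being format-specific. A minimal sketch under that assumption; read_any is a hypothetical helper, not part of the test suite:

import pandas as pd


def read_any(files: list[str], file_type: str = "jsonl", **kwargs) -> pd.DataFrame:  # noqa: ARG001
    # file_type arrives via the arguments read_data forwards to the reader,
    # so one callable can cover several formats.
    if file_type == "jsonl":
        frames = [pd.read_json(f, lines=True) for f in files]
    elif file_type == "parquet":
        frames = [pd.read_parquet(f) for f in files]
    else:
        msg = f"Unsupported file_type: {file_type}"
        raise ValueError(msg)
    return pd.concat(frames, ignore_index=True)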
