Commit 8392acd

Add tests for cloud I/O changes (#1257)
1 parent d377992 · commit 8392acd

6 files changed, +230 -57 lines changed


pyproject.toml

Lines changed: 1 addition & 0 deletions
@@ -167,6 +167,7 @@ test = [
     "pytest-cov",
     "pytest-loguru",
     "scikit-learn",
+    "s3fs",  # added for testing cloud fs
 ]
 
 [tool.uv]

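A note on why this one-line dependency is enough: s3fs is the fsspec implementation for S3, so once it is installed, the same fsspec machinery the readers and writers already use for local paths can also resolve s3:// URLs. A minimal sketch of that resolution; the bucket and key below are placeholders, not paths referenced by these tests.

import fsspec
from fsspec.core import url_to_fs

# With s3fs installed, fsspec can hand back an S3 filesystem object; without it,
# this raises ImportError. No network call is made just to construct the object.
fs = fsspec.filesystem("s3", anon=True)
print(type(fs).__name__)  # "S3FileSystem"

# url_to_fs splits a URL into (filesystem, protocol-stripped path).
fs, path = url_to_fs("s3://example-bucket/prefix/data.parquet")
print(path)  # "example-bucket/prefix/data.parquet"
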
tests/stages/deduplication/semantic/test_pairwise_io.py

Lines changed: 29 additions & 0 deletions
@@ -14,6 +14,7 @@
 
 # ruff: noqa: E402
 from pathlib import Path
+from unittest.mock import Mock
 
 import pytest
 
@@ -28,6 +29,21 @@
 class TestClusterWiseFilePartitioningStage:
     """Test cases for ClusterWiseFilePartitioningStage."""
 
+    def test_setup(self):
+        # Test fs and path_normalizer are set correctly
+        stage = ClusterWiseFilePartitioningStage("s3://test-bucket/test-path")
+        stage.setup()
+        assert stage.fs is not None
+        assert stage.path_normalizer is not None
+        assert stage.path_normalizer("test-bucket/test-path") == "s3://test-bucket/test-path"
+
+        # Test for local filesystem
+        stage = ClusterWiseFilePartitioningStage("/test/path")
+        stage.setup()
+        assert stage.fs is not None
+        assert stage.path_normalizer is not None
+        assert stage.path_normalizer("/test/path") == "/test/path"
+
     def test_process_finds_all_centroid_files(self, tmp_path: Path):
         """Test that process method finds all files in centroid directories."""
 
@@ -58,9 +74,22 @@ def test_process_finds_all_centroid_files(self, tmp_path: Path):
         stage = ClusterWiseFilePartitioningStage(str(tmp_path))
         stage.setup()
 
+        # Mock path_normalizer to track calls and verify it's used correctly
+        # For local filesystem, path_normalizer is lambda x: x, so mock should return input
+        mock_path_normalizer = Mock(side_effect=lambda x: x)
+        stage.path_normalizer = mock_path_normalizer
+
         empty_task = _EmptyTask(task_id="test", dataset_name="test", data=None)
         result = stage.process(empty_task)
 
+        # Verify path_normalizer was called exactly 3 times (once per centroid directory)
+        assert mock_path_normalizer.call_count == 3
+
+        # Verify it was called with centroid directory paths
+        # fs.ls() returns entries that contain "centroid="
+        call_args = [call[0][0] for call in mock_path_normalizer.call_args_list]
+        assert all("centroid=" in str(arg) for arg in call_args)
+
         # Should create 3 FileGroupTasks for 3 centroids
         assert len(result) == 3
         assert all(isinstance(task, FileGroupTask) for task in result)

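The test_setup assertions pin down the contract the rest of the commit relies on: fs.ls() on an S3 filesystem returns protocol-less keys such as "test-bucket/test-path", so the stage's path_normalizer has to re-attach the scheme before paths are handed downstream, while local paths pass through unchanged. Below is a hedged sketch of a normalizer with exactly that behaviour, built on fsspec; the real stage may construct it differently, and make_path_normalizer is an illustrative helper, not part of the codebase.

from fsspec.core import url_to_fs


def make_path_normalizer(root: str):
    """Return a callable matching the behaviour asserted in test_setup (illustrative only)."""
    fs, _ = url_to_fs(root)
    protocols = fs.protocol if isinstance(fs.protocol, tuple) else (fs.protocol,)
    if "file" in protocols:
        return lambda p: p  # local filesystem: identity, e.g. "/test/path" -> "/test/path"
    return fs.unstrip_protocol  # remote: "test-bucket/test-path" -> "s3://test-bucket/test-path"


normalize = make_path_normalizer("s3://test-bucket/test-path")
assert normalize("test-bucket/test-path") == "s3://test-bucket/test-path"

normalize = make_path_normalizer("/test/path")
assert normalize("/test/path") == "/test/path"
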
tests/stages/text/io/reader/test_parquet.py

Lines changed: 46 additions & 11 deletions
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 from pathlib import Path
+from unittest.mock import patch
 
 import pandas as pd
 import pyarrow as pa
@@ -77,13 +78,29 @@ def test_parquet_reader_stage_pandas_reads_and_concatenates(sample_parquet_files
     task = _make_file_group_task(sample_parquet_files[:2])
     stage = ParquetReaderStage(fields=None)
 
-    out = stage.process(task)
-    assert isinstance(out, DocumentBatch)
+    # Track calls to pd.read_parquet and pd.concat using mock.patch with wraps
+    with (
+        patch(
+            "nemo_curator.stages.text.io.reader.parquet.pd.read_parquet", wraps=pd.read_parquet
+        ) as mock_read_parquet,
+        patch("nemo_curator.stages.text.io.reader.parquet.pd.concat", wraps=pd.concat) as mock_concat,
+    ):
+        out = stage.process(task)
+        assert isinstance(out, DocumentBatch)
+
+        df = out.to_pandas()
+        assert isinstance(df, pd.DataFrame)
+        assert len(df) == 4  # 2 files * 2 records each = 4 records
+        assert {"text", "category", "score"}.issubset(set(df.columns))
+
+        # Verify pd.read_parquet was called once per file
+        assert mock_read_parquet.call_count == 2
+        assert mock_read_parquet.call_args_list[0][0][0] == sample_parquet_files[0]
+        assert mock_read_parquet.call_args_list[1][0][0] == sample_parquet_files[1]
 
-    df = out.to_pandas()
-    assert isinstance(df, pd.DataFrame)
-    assert len(df) == 4  # 2 files * 2 records each = 4 records
-    assert {"text", "category", "score"}.issubset(set(df.columns))
+    # Verify pd.concat was called once with ignore_index=True
+    assert mock_concat.call_count == 1
+    assert mock_concat.call_args[1].get("ignore_index") is True
 
 
 class TestParquetReaderStorageOptionsAndColumns:
@@ -150,11 +167,29 @@ def test_parquet_reader_stage_pyarrow_reads_and_concatenates(tmp_path: Path):
     task = _make_file_group_task([str(f1), str(f2)])
     stage = ParquetReaderStage(read_kwargs={"engine": "pyarrow"}, fields=None)
 
-    out = stage.process(task)
-    table = out.to_pyarrow()
-    assert isinstance(table, pa.Table)
-    assert table.num_rows == 3
-    assert {"text", "category", "score"}.issubset(set(table.column_names))
+    # Track calls to pd.read_parquet and pd.concat using mock.patch with wraps
+    with (
+        patch(
+            "nemo_curator.stages.text.io.reader.parquet.pd.read_parquet", wraps=pd.read_parquet
+        ) as mock_read_parquet,
+        patch("nemo_curator.stages.text.io.reader.parquet.pd.concat", wraps=pd.concat) as mock_concat,
+    ):
+        out = stage.process(task)
+        table = out.to_pyarrow()
+        assert isinstance(table, pa.Table)
+        assert table.num_rows == 3
+        assert {"text", "category", "score"}.issubset(set(table.column_names))
+
+        # Verify pd.read_parquet was called once per file
+        assert mock_read_parquet.call_count == 2
+        assert mock_read_parquet.call_args_list[0][0][0] == str(f1)
+        assert mock_read_parquet.call_args_list[1][0][0] == str(f2)
+        # Verify engine was passed correctly
+        assert mock_read_parquet.call_args_list[0][1].get("engine") == "pyarrow"
+
+        # Verify pd.concat was called once with ignore_index=True
+        assert mock_concat.call_count == 1
+        assert mock_concat.call_args[1].get("ignore_index") is True
 
 
 def test_parquet_reader_stage_pyarrow_errors_when_some_columns_missing(tmp_path: Path):

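The pattern both reader tests lean on is patch(..., wraps=...): the real pandas function still runs, so the existing behavioural assertions are unchanged, while the surrounding Mock records how it was called. A self-contained illustration against pandas itself (nothing here touches the curator code; the DataFrames are throwaway):

from unittest.mock import patch

import pandas as pd

# wraps=pd.concat keeps the real implementation; the Mock only observes calls.
with patch("pandas.concat", wraps=pd.concat) as mock_concat:
    combined = pd.concat(
        [pd.DataFrame({"a": [1]}), pd.DataFrame({"a": [2]})], ignore_index=True
    )

assert len(combined) == 2                                    # real concat did the work
assert mock_concat.call_count == 1                           # the wrapper saw one call
assert mock_concat.call_args[1].get("ignore_index") is True  # and its keyword arguments
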
tests/stages/text/io/writer/test_jsonl.py

Lines changed: 24 additions & 0 deletions
@@ -22,6 +22,7 @@
 
 import nemo_curator.stages.text.io.writer.utils as writer_utils
 from nemo_curator.stages.text.io.writer import JsonlWriter
+from nemo_curator.stages.text.io.writer import base as writer_base
 from nemo_curator.tasks import DocumentBatch
 
 
@@ -266,3 +267,26 @@ def test_jsonl_writer_overwrites_existing_file(
         pd.testing.assert_frame_equal(
             pd.read_json(result1.data[0], lines=True), pd.read_json(result2.data[0], lines=True)
         )
+
+    @pytest.mark.parametrize(
+        "path",
+        [
+            "s3://test-bucket/output",
+            "/local/path",
+        ],
+    )
+    def test_jsonl_writer_write_data_path_protocol_handling(self, pandas_document_batch: DocumentBatch, path: str):
+        """Test that write_data is called with correct protocol handling for cloud and local paths."""
+        with mock.patch.object(writer_base, "check_output_mode", return_value=None):
+            writer = JsonlWriter(path=path)
+            writer.setup()
+
+        with (
+            mock.patch.object(writer.fs, "exists", return_value=False),
+            mock.patch.object(writer, "write_data") as mock_write_data,
+        ):
+            writer.process(pandas_document_batch)
+
+        mock_write_data.assert_called_once()
+        file_path = mock_write_data.call_args[0][1]
+        assert file_path.startswith(path), f"Path should start with {path}"

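Two design points in this test are worth noting. Stubbing check_output_mode during setup() lets the writer be constructed against a bucket that does not exist, and replacing writer.fs.exists and writer.write_data means process() never touches S3 or the local disk; only the output path the writer builds is inspected. The mock.patch.object variant used here patches an attribute on an object you already hold a reference to and restores it when the context exits. A tiny standalone illustration (Greeter is a made-up class, not from the codebase):

from unittest import mock


class Greeter:
    def greet(self) -> str:
        return "hello"


g = Greeter()

# Replace the attribute for the duration of the context; the Mock records the call.
with mock.patch.object(g, "greet", return_value="stubbed") as mock_greet:
    assert g.greet() == "stubbed"
    mock_greet.assert_called_once()

# The original attribute is restored once the context exits.
assert g.greet() == "hello"
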
tests/stages/text/io/writer/test_parquet.py

Lines changed: 24 additions & 0 deletions
@@ -21,6 +21,7 @@
 import pytest
 
 from nemo_curator.stages.text.io.writer import ParquetWriter
+from nemo_curator.stages.text.io.writer import base as writer_base
 from nemo_curator.stages.text.io.writer import utils as writer_utils
 from nemo_curator.tasks import DocumentBatch
 
@@ -251,3 +252,26 @@ def test_jsonl_writer_overwrites_existing_file(
         )
 
         pd.testing.assert_frame_equal(pd.read_parquet(result1.data[0]), pd.read_parquet(result2.data[0]))
+
+    @pytest.mark.parametrize(
+        "path",
+        [
+            "s3://test-bucket/output",
+            "/local/path",
+        ],
+    )
+    def test_parquet_writer_write_data_path_protocol_handling(self, pandas_document_batch: DocumentBatch, path: str):
+        """Test that write_data is called with correct protocol handling for cloud and local paths."""
+        with mock.patch.object(writer_base, "check_output_mode", return_value=None):
+            writer = ParquetWriter(path=path)
+            writer.setup()
+
+        with (
+            mock.patch.object(writer.fs, "exists", return_value=False),
+            mock.patch.object(writer, "write_data") as mock_write_data,
+        ):
+            writer.process(pandas_document_batch)
+
+        mock_write_data.assert_called_once()
+        file_path = mock_write_data.call_args[0][1]
+        assert file_path.startswith(path), f"Path should start with {path}"

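What "correct protocol handling" means for both writer tests: the path handed to write_data must keep the caller's prefix, whether that is s3://test-bucket/output or a plain local directory, rather than the protocol-stripped form fsspec filesystems use internally. Here is a hedged sketch of path construction with that property, assuming fsspec underneath; build_output_path is an illustrative helper, not the writers' actual code.

from fsspec.core import url_to_fs


def build_output_path(base: str, filename: str) -> str:
    """Join filename onto base and keep the original protocol prefix (illustrative only)."""
    fs, root = url_to_fs(base)
    full = f"{root.rstrip('/')}/{filename}"
    protocols = fs.protocol if isinstance(fs.protocol, tuple) else (fs.protocol,)
    # Local paths stay as-is; remote paths get their scheme re-attached.
    return full if "file" in protocols else fs.unstrip_protocol(full)


assert build_output_path("s3://test-bucket/output", "part-0.parquet").startswith("s3://test-bucket/output")
assert build_output_path("/local/path", "part-0.jsonl").startswith("/local/path")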