Commit 5a0d2ef

map operator: Add support for non absolute input_dir and output_dir (#19378)
1 parent 34a34a0 commit 5a0d2ef

File tree (8 files changed: +91, -48 lines)

.github/workflows/ci-tests-data.yml
src/lightning/data/streaming/constants.py
src/lightning/data/streaming/data_processor.py
src/lightning/data/streaming/functions.py
src/lightning/data/streaming/writer.py
src/lightning/pytorch/core/module.py
tests/tests_data/streaming/test_data_processor.py
tests/tests_data/streaming/test_writer.py
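In short: this commit lets the `map` operator accept relative `input_dir`/`output_dir` values. Candidate paths are resolved with `pathlib.Path.resolve()` before matching, the default cache root falls back to the system temp directory when not running inside a Lightning Studio (detected via the new `_IS_IN_STUDIO` constant), files are moved rather than copied into local output directories, and the dataset is only registered on the platform when the output points at remote storage. A minimal sketch of the newly supported call pattern, adapted from the test added in tests/tests_data/streaming/test_data_processor.py below (the `from lightning.data import map` import path is this sketch's assumption):

import os

from lightning.data import map  # assumed public import path for the map operator


def process(path: str, output_dir: str) -> None:
    # By the time a worker runs, `path` has been resolved to an absolute
    # path even though the caller passed a relative one.
    with open(os.path.join(output_dir, os.path.basename(path)), "w") as f:
        f.write("Hello World")


if __name__ == "__main__":
    for i in range(5):
        with open(f"{i}.txt", "w") as f:
            f.write("Hello World")

    # Relative inputs and a relative output_dir are now both accepted.
    map(process, [f"{i}.txt" for i in range(5)], output_dir="./output_dir", num_workers=1)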

.github/workflows/ci-tests-data.yml

Lines changed: 2 additions & 2 deletions

@@ -87,15 +87,15 @@ jobs:
         # ls -lh $PYPI_CACHE_DIR

       - name: Install package & dependencies
-        timeout-minutes: 30
+        timeout-minutes: 5
         run: |
           pip install -e ".[data-dev]" -U --prefer-binary -f ${TORCH_URL}
           pip list

       - name: Testing Data
         working-directory: tests/tests_data
         # NOTE: do not include coverage report here, see: https://github.com/nedbat/coveragepy/issues/1003
-        timeout-minutes: 10
+        timeout-minutes: 25
         run: |
           python -m coverage run --source lightning \
             -m pytest -v --timeout=60 --durations=60

src/lightning/data/streaming/constants.py

Lines changed: 1 addition & 0 deletions

@@ -57,3 +57,4 @@
 _NUMPY_DTYPES_MAPPING = {i: np.dtype(v) for i, v in enumerate(_NUMPY_SCTYPES)}

 _TIME_FORMAT = "%Y-%m-%d_%H-%M-%S.%fZ"
+_IS_IN_STUDIO = bool(os.getenv("LIGHTNING_CLOUD_PROJECT_ID", None)) and bool(os.getenv("LIGHTNING_CLUSTER_ID", None))

src/lightning/data/streaming/data_processor.py

Lines changed: 32 additions & 15 deletions
@@ -10,6 +10,7 @@
 from abc import abstractmethod
 from dataclasses import dataclass
 from multiprocessing import Process, Queue
+from pathlib import Path
 from queue import Empty
 from time import sleep, time
 from typing import Any, Dict, List, Optional, Tuple, TypeVar, Union
@@ -25,6 +26,7 @@
     _BOTO3_AVAILABLE,
     _DEFAULT_FAST_DEV_RUN_ITEMS,
     _INDEX_FILENAME,
+    _IS_IN_STUDIO,
     _LIGHTNING_CLOUD_LATEST,
     _TORCH_GREATER_EQUAL_2_1_0,
 )
@@ -66,17 +68,21 @@ def _get_home_folder() -> str:
     return os.getenv("DATA_OPTIMIZER_HOME_FOLDER", os.path.expanduser("~"))


+def _get_default_cache() -> str:
+    return "/cache" if _IS_IN_STUDIO else tempfile.gettempdir()
+
+
 def _get_cache_dir(name: Optional[str] = None) -> str:
     """Returns the cache directory used by the Cache to store the chunks."""
-    cache_dir = os.getenv("DATA_OPTIMIZER_CACHE_FOLDER", "/cache/chunks")
+    cache_dir = os.getenv("DATA_OPTIMIZER_CACHE_FOLDER", f"{_get_default_cache()}/chunks")
     if name is None:
         return cache_dir
     return os.path.join(cache_dir, name.lstrip("/"))


 def _get_cache_data_dir(name: Optional[str] = None) -> str:
     """Returns the cache data directory used by the DataProcessor workers to download the files."""
-    cache_dir = os.getenv("DATA_OPTIMIZER_DATA_CACHE_FOLDER", "/cache/data")
+    cache_dir = os.getenv("DATA_OPTIMIZER_DATA_CACHE_FOLDER", f"{_get_default_cache()}/data")
     if name is None:
         return os.path.join(cache_dir)
     return os.path.join(cache_dir, name.lstrip("/"))
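Taken together with the new `_IS_IN_STUDIO` constant, the default cache location now degrades gracefully outside a Studio. A standalone sketch of the fallback chain (mirrors the two functions above; `/tmp` is just a typical Linux temp dir):

import os
import tempfile

# Studio detection, as in constants.py: both env vars must be set and non-empty.
_IS_IN_STUDIO = bool(os.getenv("LIGHTNING_CLOUD_PROJECT_ID")) and bool(os.getenv("LIGHTNING_CLUSTER_ID"))


def _get_default_cache() -> str:
    return "/cache" if _IS_IN_STUDIO else tempfile.gettempdir()


def _get_cache_dir(name=None):
    # DATA_OPTIMIZER_CACHE_FOLDER still overrides the default entirely.
    cache_dir = os.getenv("DATA_OPTIMIZER_CACHE_FOLDER", f"{_get_default_cache()}/chunks")
    return cache_dir if name is None else os.path.join(cache_dir, name.lstrip("/"))


print(_get_cache_dir())         # e.g. /tmp/chunks outside a Studio
print(_get_cache_dir("my-ds"))  # e.g. /tmp/chunks/my-ds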
@@ -222,18 +228,20 @@ def _upload_fn(upload_queue: Queue, remove_queue: Queue, cache_dir: str, output_
                 )
             except Exception as e:
                 print(e)
-        elif output_dir.path and os.path.isdir(output_dir.path):
+
+        elif output_dir.path:
             if tmpdir is None:
-                shutil.copyfile(local_filepath, os.path.join(output_dir.path, os.path.basename(local_filepath)))
+                output_filepath = os.path.join(output_dir.path, os.path.basename(local_filepath))
             else:
                 output_filepath = os.path.join(output_dir.path, local_filepath.replace(tmpdir, "")[1:])
-                os.makedirs(os.path.dirname(output_filepath), exist_ok=True)
-                shutil.copyfile(local_filepath, output_filepath)
+
+            os.makedirs(os.path.dirname(output_filepath), exist_ok=True)
+            shutil.move(local_filepath, output_filepath)
         else:
             raise ValueError(f"The provided {output_dir.path} isn't supported.")

         # Inform the remover to delete the file
-        if remove_queue:
+        if remove_queue and os.path.exists(local_filepath):
             remove_queue.put([local_filepath])
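Two behavior changes in the hunk above: the destination directory tree is always created, and the file is moved (`shutil.move`) instead of copied, so the new `os.path.exists` guard keeps the remover from being handed a path the move already consumed. A condensed sketch of the local-output branch (`place_output` is an invented helper name for illustration):

import os
import shutil
from typing import Optional


def place_output(local_filepath: str, output_dir: str, tmpdir: Optional[str] = None) -> str:
    """Move a worker-produced file into output_dir, preserving any tmpdir-relative subpath."""
    if tmpdir is None:
        output_filepath = os.path.join(output_dir, os.path.basename(local_filepath))
    else:
        # Keep the path relative to tmpdir, e.g. /tmp/x/0/0.txt -> output_dir/0/0.txt
        output_filepath = os.path.join(output_dir, local_filepath.replace(tmpdir, "")[1:])

    os.makedirs(os.path.dirname(output_filepath), exist_ok=True)
    shutil.move(local_filepath, output_filepath)  # the source no longer exists afterwards
    return output_filepath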

@@ -290,7 +298,10 @@ def _get_num_bytes(item: Any, base_path: str) -> int:

     num_bytes = 0
     for element in flattened_item:
-        if isinstance(element, str) and element.startswith(base_path) and os.path.exists(element):
+        if isinstance(element, str):
+            element = Path(element).resolve()
+            if not element.exists():
+                continue
             file_bytes = os.path.getsize(element)
             if file_bytes == 0:
                 raise RuntimeError(f"The file {element} has 0 bytes!")
@@ -475,16 +486,22 @@ def _collect_paths(self) -> None:
         for item in self.items:
             flattened_item, spec = tree_flatten(item)

+            def is_path(element: Any) -> bool:
+                if not isinstance(element, str):
+                    return False
+
+                element: str = str(Path(element).resolve())
+                return (
+                    element.startswith(self.input_dir.path)
+                    if self.input_dir.path is not None
+                    else os.path.exists(element)
+                )
+
             # For speed reasons, we assume starting with `self.input_dir` is enough to be a real file.
             # Other alternative would be too slow.
             # TODO: Try using dictionary for higher accurary.
             indexed_paths = {
-                index: element
-                for index, element in enumerate(flattened_item)
-                if isinstance(element, str)
-                and (
-                    element.startswith(self.input_dir.path) if self.input_dir is not None else os.path.exists(element)
-                )  # For speed reasons
+                index: str(Path(element).resolve()) for index, element in enumerate(flattened_item) if is_path(element)
             }

             if len(indexed_paths) == 0:
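The new `is_path` predicate is the heart of the relative-path support: every candidate is resolved before being matched against `input_dir` (or, failing that, checked for existence). A standalone version of the same logic (written as a free function here; in the diff it is a closure over `self.input_dir`):

import os
from pathlib import Path
from typing import Any, Optional


def is_path(element: Any, input_dir: Optional[str]) -> bool:
    if not isinstance(element, str):
        return False
    # Resolve first, so a relative "0.txt" becomes "/current/dir/0.txt"
    # before the prefix match or existence check.
    resolved = str(Path(element).resolve())
    return resolved.startswith(input_dir) if input_dir is not None else os.path.exists(resolved)


assert is_path(__file__, None)  # an existing path, no input_dir configured
assert not is_path(42, None)    # non-strings are never treated as paths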
@@ -947,7 +964,7 @@ def run(self, data_recipe: DataRecipe) -> None:
         print("Workers are finished.")
         result = data_recipe._done(len(user_items), self.delete_cached_files, self.output_dir)

-        if num_nodes == node_rank + 1:
+        if num_nodes == node_rank + 1 and self.output_dir.url:
             _create_dataset(
                 input_dir=self.input_dir.path,
                 storage_dir=self.output_dir.path,

src/lightning/data/streaming/functions.py

Lines changed: 0 additions & 3 deletions

@@ -65,9 +65,6 @@ def _get_input_dir(inputs: Sequence[Any]) -> Optional[str]:
     if "/.project" in absolute_path:
         return "/" + os.path.join(*str(list(indexed_paths.values())[0]).split("/")[:4])

-    if indexed_paths[0] != absolute_path:
-        raise ValueError(f"The provided path should be absolute. Found {indexed_paths[0]} instead of {absolute_path}.")
-
     return "/" + os.path.join(*str(absolute_path).split("/")[:4])
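Since inputs are now resolved before they reach `_get_input_dir`, the strict absolute-path check became dead weight and is removed; what remains is the root derivation, sketched here (`input_root` is a hypothetical name for the inlined expression):

import os
from pathlib import Path


def input_root(path: str) -> str:
    # Resolve, then keep the first three directory components
    # (split("/")[:4] includes the empty string before the leading "/").
    absolute_path = str(Path(path).resolve())
    return "/" + os.path.join(*absolute_path.split("/")[:4])


print(input_root("data/file.txt"))  # e.g. /home/user/project, depending on the cwd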

src/lightning/data/streaming/writer.py

Lines changed: 1 addition & 0 deletions

@@ -81,6 +81,7 @@ def __init__(
         if self._compression:
             if len(_COMPRESSORS) == 0:
                 raise ValueError("No compresion algorithms are installed.")
+
             if self._compression not in _COMPRESSORS:
                 raise ValueError(
                     f"The provided compression {self._compression} isn't available in {sorted(_COMPRESSORS)}"

src/lightning/pytorch/core/module.py

Lines changed: 5 additions & 9 deletions

@@ -750,13 +750,11 @@ def validation_step(self, *args: Any, **kwargs: Any) -> STEP_OUTPUT:
     .. code-block:: python

         # if you have one val dataloader:
-        def validation_step(self, batch, batch_idx):
-            ...
+        def validation_step(self, batch, batch_idx): ...


         # if you have multiple val dataloaders:
-        def validation_step(self, batch, batch_idx, dataloader_idx=0):
-            ...
+        def validation_step(self, batch, batch_idx, dataloader_idx=0): ...

     Examples::

@@ -819,13 +817,11 @@ def test_step(self, *args: Any, **kwargs: Any) -> STEP_OUTPUT:
     .. code-block:: python

         # if you have one test dataloader:
-        def test_step(self, batch, batch_idx):
-            ...
+        def test_step(self, batch, batch_idx): ...


         # if you have multiple test dataloaders:
-        def test_step(self, batch, batch_idx, dataloader_idx=0):
-            ...
+        def test_step(self, batch, batch_idx, dataloader_idx=0): ...

     Examples::

@@ -989,7 +985,7 @@ def configure_optimizers(self):
                 "lr_scheduler": {
                     "scheduler": ReduceLROnPlateau(optimizer, ...),
                     "monitor": "metric_to_track",
-                    "frequency": "indicates how often the metric is updated"
+                    "frequency": "indicates how often the metric is updated",
                     # If "monitor" references validation metrics, then "frequency" should be set to a
                     # multiple of "trainer.check_val_every_n_epoch".
                 },
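The trailing comma makes the docstring's `lr_scheduler` dict a valid Python literal. For context, a runnable version of the documented pattern (the metric name and hyperparameters are placeholders):

import lightning as L
from torch import nn
from torch.optim import Adam
from torch.optim.lr_scheduler import ReduceLROnPlateau


class LitModel(L.LightningModule):
    def __init__(self):
        super().__init__()
        self.layer = nn.Linear(8, 1)

    def configure_optimizers(self):
        optimizer = Adam(self.parameters(), lr=1e-3)
        return {
            "optimizer": optimizer,
            "lr_scheduler": {
                "scheduler": ReduceLROnPlateau(optimizer, mode="min"),
                "monitor": "val_loss",  # a metric this module logs
                "frequency": 1,  # how often the monitored metric is re-checked
            },
        }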

tests/tests_data/streaming/test_data_processor.py

Lines changed: 35 additions & 17 deletions

@@ -2,6 +2,7 @@
 import random
 import sys
 from functools import partial
+from pathlib import Path
 from typing import Any, List
 from unittest import mock

@@ -502,7 +503,7 @@ def test_data_processsor_distributed(fast_dev_run, delete_cached_files, tmpdir,
         "chunk-1-3.bin",
     ]

-    assert sorted(os.listdir(cache_dir)) == fast_dev_run_disabled_chunks_0
+    assert sorted(os.listdir(remote_output_dir)) == fast_dev_run_disabled_chunks_0

     cache_dir = os.path.join(tmpdir, "cache_2")
     monkeypatch.setenv("DATA_OPTIMIZER_CACHE_FOLDER", cache_dir)

@@ -531,26 +532,11 @@ def test_data_processsor_distributed(fast_dev_run, delete_cached_files, tmpdir,
         "index.json",
     ]

-    assert sorted(os.listdir(cache_dir)) == fast_dev_run_disabled_chunks_1
-
     expected = sorted(fast_dev_run_disabled_chunks_0 + fast_dev_run_disabled_chunks_1 + ["1-index.json"])

     assert sorted(os.listdir(remote_output_dir)) == expected

-    _create_dataset_mock.assert_called()
-
-    assert _create_dataset_mock._mock_mock_calls[0].kwargs == {
-        "input_dir": str(input_dir),
-        "storage_dir": str(remote_output_dir),
-        "dataset_type": "CHUNKED",
-        "empty": False,
-        "size": 30,
-        "num_bytes": 26657,
-        "data_format": "jpeg",
-        "compression": None,
-        "num_chunks": 16,
-        "num_bytes_per_chunk": [2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 2],
-    }
+    _create_dataset_mock.assert_not_called()


 class TextTokenizeRecipe(DataChunkRecipe):

@@ -951,6 +937,36 @@ def test_data_processing_map_without_input_dir_and_folder(monkeypatch, tmpdir):
     assert os.path.exists(os.path.join(output_dir, "0", "0.JPEG"))


+def map_fn_map_non_absolute(path, output_dir):
+    absolute_path = str(Path(path).absolute())
+    assert absolute_path == path, (absolute_path, path)
+
+    with open(os.path.join(output_dir, os.path.basename(path)), "w") as f:
+        f.write("Hello World")
+
+
+@pytest.mark.skipif(sys.platform == "win32", reason="not supported on windows")
+def test_data_processing_map_non_absolute_path(monkeypatch, tmpdir):
+    monkeypatch.chdir(str(tmpdir))
+
+    for i in range(5):
+        with open(f"./{i}.txt", "w") as f:
+            f.write("Hello World")
+
+    assert sorted(os.listdir(tmpdir)) == ["0.txt", "1.txt", "2.txt", "3.txt", "4.txt"]
+
+    map(
+        map_fn_map_non_absolute,
+        [f"{i}.txt" for i in range(5)],
+        output_dir="./output_dir",
+        num_workers=1,
+        reorder_files=True,
+    )
+
+    assert sorted(os.listdir(tmpdir)) == ["0.txt", "1.txt", "2.txt", "3.txt", "4.txt", "output_dir"]
+    assert sorted(os.listdir(os.path.join(tmpdir, "output_dir"))) == ["0.txt", "1.txt", "2.txt", "3.txt", "4.txt"]
+
+
 @pytest.mark.skipif(condition=sys.platform == "win32", reason="Not supported on windows")
 def test_map_error_when_not_empty(monkeypatch, tmpdir):
     boto3 = mock.MagicMock()

@@ -967,6 +983,8 @@ def test_map_error_when_not_empty(monkeypatch, tmpdir):
         error_when_not_empty=True,
     )

+    monkeypatch.setattr(data_processor_module, "_IS_IN_STUDIO", True)
+
     with pytest.raises(OSError, match="cache"):
         map(
             map_fn,

tests/tests_data/streaming/test_writer.py

Lines changed: 15 additions & 2 deletions

@@ -18,6 +18,7 @@
 import numpy as np
 import pytest
 from lightning import seed_everything
+from lightning.data.streaming.compression import _ZSTD_AVAILABLE
 from lightning.data.streaming.reader import BinaryReader
 from lightning.data.streaming.sampler import ChunkedIndex
 from lightning.data.streaming.writer import BinaryWriter

@@ -31,7 +32,13 @@ def test_binary_writer_with_ints_and_chunk_bytes(tmpdir):
     with pytest.raises(FileNotFoundError, match="The provided cache directory `dontexists` doesn't exist."):
         BinaryWriter("dontexists", {})

-    with pytest.raises(ValueError, match="No compresion algorithms are installed."):
+    match = (
+        "The provided compression something_else isn't available"
+        if _ZSTD_AVAILABLE
+        else "No compresion algorithms are installed."
+    )
+
+    with pytest.raises(ValueError, match=match):
         BinaryWriter(tmpdir, {"i": "int"}, compression="something_else")

     binary_writer = BinaryWriter(tmpdir, chunk_bytes=90)

@@ -69,7 +76,13 @@ def test_binary_writer_with_ints_and_chunk_size(tmpdir):
     with pytest.raises(FileNotFoundError, match="The provided cache directory `dontexists` doesn't exist."):
         BinaryWriter("dontexists", {})

-    with pytest.raises(ValueError, match="No compresion algorithms are installed."):
+    match = (
+        "The provided compression something_else isn't available"
+        if _ZSTD_AVAILABLE
+        else "No compresion algorithms are installed."
+    )
+
+    with pytest.raises(ValueError, match=match):
         BinaryWriter(tmpdir, {"i": "int"}, compression="something_else")

     binary_writer = BinaryWriter(tmpdir, chunk_size=25)
