
Commit cb9e5d9

tchaton, thomas, and awaelchli authored and committed
Improve s3 client support (#18920)
* update (repeated)
* Update src/lightning/data/streaming/client.py
* update (repeated)

---------

Co-authored-by: thomas <[email protected]>
Co-authored-by: Adrian Wälchli <[email protected]>

(cherry picked from commit 6a0f992)
1 parent 372fec8 commit cb9e5d9

File tree

9 files changed: +151 −89 lines changed


.github/CODEOWNERS

Lines changed: 2 additions & 2 deletions
@@ -43,8 +43,8 @@

 # Data Utilities
 /examples/data/          @tchaton @nohalon @justusschock @lantiga
-/src/lightning/data/     @tchaton @nohalon @justusschock @lantiga
-/tests/tests_data        @tchaton @nohalon @justusschock @lantiga
+/src/lightning/data/     @tchaton
+/tests/tests_data        @tchaton

 # Lightning Fabric
 /src/lightning/fabric    @awaelchli @carmocca @justusschock

index_1.txt

Whitespace-only changes.
src/lightning/data/streaming/client.py

Lines changed: 45 additions & 0 deletions

@@ -0,0 +1,45 @@
import os
from time import time
from typing import Any, Optional

from lightning.data.streaming.constants import _BOTO3_AVAILABLE

if _BOTO3_AVAILABLE:
    import boto3
    import botocore
    from botocore.credentials import InstanceMetadataProvider
    from botocore.utils import InstanceMetadataFetcher


class S3Client:
    # TODO: Generalize to support more cloud providers.

    def __init__(self, refetch_interval: int = 3300) -> None:
        self._refetch_interval = refetch_interval
        self._last_time: Optional[float] = None
        self._has_cloud_space_id: bool = "LIGHTNING_CLOUD_SPACE_ID" in os.environ
        self._client: Optional[Any] = None

    @property
    def client(self) -> Any:
        if not self._has_cloud_space_id:
            if self._client is None:
                self._client = boto3.client(
                    "s3", config=botocore.config.Config(retries={"max_attempts": 1000, "mode": "adaptive"})
                )
            return self._client

        # Re-generate credentials for EC2
        if self._last_time is None or (time() - self._last_time) > self._refetch_interval:
            provider = InstanceMetadataProvider(iam_role_fetcher=InstanceMetadataFetcher(timeout=3600, num_attempts=5))
            credentials = provider.load()
            self._client = boto3.client(
                "s3",
                aws_access_key_id=credentials.access_key,
                aws_secret_access_key=credentials.secret_key,
                aws_session_token=credentials.token,
                config=botocore.config.Config(retries={"max_attempts": 1000, "mode": "adaptive"}),
            )
            self._last_time = time()

        return self._client
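
For orientation, a minimal usage sketch of the new S3Client (the bucket name and object key are hypothetical; the default refetch_interval of 3300 s corresponds to 55 minutes):

from lightning.data.streaming.client import S3Client

# The boto3 client is created lazily on first access of `.client`. When
# LIGHTNING_CLOUD_SPACE_ID is set in the environment, credentials are re-fetched
# from the EC2 instance metadata once the refetch interval has elapsed; otherwise
# a single cached client with adaptive retries is reused for the whole process.
s3 = S3Client(refetch_interval=3300)
s3.client.download_file("my-bucket", "chunks/chunk-0.bin", "/tmp/chunk-0.bin")  # hypothetical bucket/key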

src/lightning/data/streaming/data_processor.py

Lines changed: 28 additions & 18 deletions
@@ -18,6 +18,7 @@

 from lightning import seed_everything
 from lightning.data.streaming import Cache
+from lightning.data.streaming.client import S3Client
 from lightning.data.streaming.constants import (
     _BOTO3_AVAILABLE,
     _DEFAULT_FAST_DEV_RUN_ITEMS,

@@ -40,7 +41,6 @@
 from lightning_cloud.resolver import _LightningSrcResolver, _LightningTargetResolver

 if _BOTO3_AVAILABLE:
-    import boto3
     import botocore

 logger = logging.Logger(__name__)

@@ -74,8 +74,8 @@ def _get_home_folder() -> str:
 def _get_cache_dir(name: Optional[str]) -> str:
     """Returns the cache directory used by the Cache to store the chunks."""
     if name is None:
-        return _get_cache_folder()
-    return os.path.join(_get_cache_folder(), name)
+        return os.path.join(_get_cache_folder(), "chunks")
+    return os.path.join(_get_cache_folder(), "chunks", name)


 def _get_cache_data_dir(name: Optional[str]) -> str:

@@ -85,10 +85,6 @@ def _get_cache_data_dir(name: Optional[str]) -> str:
     return os.path.join(_get_cache_folder(), "data", name)


-def _get_s3_client() -> Any:
-    return boto3.client("s3", config=botocore.config.Config(retries={"max_attempts": 1000, "mode": "standard"}))
-
-
 def _wait_for_file_to_exist(s3: Any, obj: parse.ParseResult, sleep_time: int = 2) -> Any:
     """This function check."""
     while True:

@@ -105,7 +101,7 @@ def _download_data_target(
     input_dir: str, remote_input_dir: str, cache_dir: str, queue_in: Queue, queue_out: Queue
 ) -> None:
     """This function is used to download data from a remote directory to a cache directory to optimise reading."""
-    s3 = _get_s3_client()
+    s3 = S3Client()

     while True:
         # 2. Fetch from the queue

@@ -137,7 +133,7 @@
         os.makedirs(dirpath, exist_ok=True)

         with open(local_path, "wb") as f:
-            s3.download_fileobj(obj.netloc, obj.path.lstrip("/"), f)
+            s3.client.download_fileobj(obj.netloc, obj.path.lstrip("/"), f)

     elif os.path.isfile(remote_path):
         copyfile(remote_path, local_path)

@@ -176,7 +172,7 @@ def _upload_fn(upload_queue: Queue, remove_queue: Queue, cache_dir: str, remote_
     obj = parse.urlparse(remote_output_dir)

     if obj.scheme == "s3":
-        s3 = _get_s3_client()
+        s3 = S3Client()

     while True:
         local_filepath: Optional[str] = upload_queue.get()

@@ -190,10 +186,14 @@
         local_filepath = os.path.join(cache_dir, local_filepath)

         if obj.scheme == "s3":
-            s3.upload_file(
-                local_filepath, obj.netloc, os.path.join(obj.path.lstrip("/"), os.path.basename(local_filepath))
-            )
-        elif os.path.isdir(remote_output_dir):
+            try:
+                s3.client.upload_file(
+                    local_filepath, obj.netloc, os.path.join(obj.path.lstrip("/"), os.path.basename(local_filepath))
+                )
+            except Exception as e:
+                print(e)
+                return
+        if os.path.isdir(remote_output_dir):
             copyfile(local_filepath, os.path.join(remote_output_dir, os.path.basename(local_filepath)))
         else:
             raise ValueError(f"The provided {remote_output_dir} isn't supported.")

@@ -611,8 +611,8 @@ def _upload_index(self, remote_output_dir: str, cache_dir: str, num_nodes: int,
         local_filepath = os.path.join(cache_dir, _INDEX_FILENAME)

         if obj.scheme == "s3":
-            s3 = _get_s3_client()
-            s3.upload_file(
+            s3 = S3Client()
+            s3.client.upload_file(
                 local_filepath, obj.netloc, os.path.join(obj.path.lstrip("/"), os.path.basename(local_filepath))
             )
         elif os.path.isdir(remote_output_dir):

@@ -775,6 +775,7 @@ def run(self, data_recipe: DataRecipe) -> None:
         print("Workers are ready ! Starting data processing...")

         current_total = 0
+        has_failed = False
         with tqdm(total=num_items, smoothing=0, position=-1, mininterval=1) as pbar:
             while True:
                 try:

@@ -788,15 +789,20 @@
                     continue
                 self.workers_tracker[index] = counter
                 new_total = sum(self.workers_tracker.values())
+
                 pbar.update(new_total - current_total)
                 current_total = new_total
                 if current_total == num_items:
                     break

-        num_nodes = _get_num_nodes()
+                # Exit early if all the workers are done.
+                # This means there were some kind of errors.
+                if all(not w.is_alive() for w in self.workers):
+                    has_failed = True
+                    break

         # TODO: Understand why it hangs.
-        if num_nodes == 1:
+        if _get_num_nodes() == 1:
             for w in self.workers:
                 w.join(0)

@@ -806,6 +812,10 @@
         data_recipe._done(self.delete_cached_files, self.remote_output_dir)
         print("Finished data processing!")

+        # TODO: Understand why it is required to avoid long shutdown.
+        if _get_num_nodes() > 1:
+            os._exit(int(has_failed))
+
     def _exit_on_error(self, error: str) -> None:
         for w in self.workers:
             w.join(0)
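
A small sketch of the exit-code convention behind the new os._exit call at the end of run() (num_nodes stands in for whatever _get_num_nodes() returns; this is an illustration, not the DataProcessor code itself):

import os

num_nodes = 2        # stand-in for _get_num_nodes()
has_failed = False   # flipped to True when every worker died before all items were processed

# os._exit terminates the process immediately, skipping the interpreter's normal
# shutdown (atexit handlers, joining child processes), which is presumably why it
# avoids the long shutdown mentioned in the TODO. The exit status encodes the
# outcome: int(False) == 0 signals success, int(True) == 1 signals failure.
if num_nodes > 1:
    os._exit(int(has_failed))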

src/lightning/data/streaming/downloader.py

Lines changed: 8 additions & 11 deletions
@@ -16,6 +16,8 @@
 from typing import Any, Dict, List, Type
 from urllib import parse

+from lightning.data.streaming.client import S3Client
+

 class Downloader(ABC):
     def __init__(self, remote_dir: str, cache_dir: str, chunks: List[Dict[str, Any]]):

@@ -37,25 +39,20 @@ def download_file(self, remote_chunkpath: str, local_chunkpath: str) -> None:
 class S3Downloader(Downloader):
     @classmethod
     def download_file(cls, remote_filepath: str, local_filepath: str) -> None:
-        import boto3
-        from boto3.s3.transfer import TransferConfig
-        from botocore.config import Config
-
         obj = parse.urlparse(remote_filepath)

         if obj.scheme != "s3":
             raise ValueError(f"Expected obj.scheme to be `s3`, instead, got {obj.scheme} for remote={remote_filepath}")

+        # TODO: Add caching to avoid re-creating it
+        s3 = S3Client()
+
+        from boto3.s3.transfer import TransferConfig
+
         extra_args: Dict[str, Any] = {}

-        # Create a new session per thread
-        session = boto3.session.Session()
-        # Create a resource client using a thread's session object
-        s3 = session.client("s3", config=Config(read_timeout=None))
-        # Threads calling S3 operations return RuntimeError (cannot schedule new futures after
-        # interpreter shutdown). Temporary solution is to have `use_threads` as `False`.
         # Issue: https://github.com/boto/boto3/issues/3113
-        s3.download_file(
+        s3.client.download_file(
             obj.netloc,
             obj.path.lstrip("/"),
             local_filepath,
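
A hedged sketch of how the refactored S3Downloader path is exercised (the s3:// URL and local path are hypothetical):

from lightning.data.streaming.downloader import S3Downloader

# download_file is a classmethod: it parses the s3:// URL, validates the scheme,
# and now delegates to the shared S3Client instead of building a fresh boto3
# session on every call.
S3Downloader.download_file("s3://my-bucket/chunks/chunk-0.bin", "/tmp/chunk-0.bin")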

src/lightning/data/streaming/functions.py

Lines changed: 6 additions & 0 deletions
@@ -92,6 +92,7 @@ def map(
     num_nodes: Optional[int] = None,
     machine: Optional[str] = None,
     input_dir: Optional[str] = None,
+    num_downloaders: int = 1,
 ) -> None:
     """This function map a callbable over a collection of files possibly in a distributed way.


@@ -104,6 +105,7 @@
         fast_dev_run: Whether to use process only a sub part of the inputs
         num_nodes: When doing remote execution, the number of nodes to use.
         machine: When doing remote execution, the machine to use.
+        num_downloaders: The number of downloaders per worker.

     """
     if not isinstance(inputs, Sequence):

@@ -127,6 +129,7 @@
             fast_dev_run=fast_dev_run,
             version=None,
             input_dir=input_dir or _get_input_dir(inputs),
+            num_downloaders=num_downloaders,
         )
         return data_processor.run(LambdaDataTransformRecipe(fn, inputs))
     return _execute(

@@ -149,6 +152,7 @@ def optimize(
     num_nodes: Optional[int] = None,
     machine: Optional[str] = None,
     input_dir: Optional[str] = None,
+    num_downloaders: int = 1,
 ) -> None:
     """This function converts a dataset into chunks possibly in a distributed way.


@@ -164,6 +168,7 @@
         fast_dev_run: Whether to use process only a sub part of the inputs
         num_nodes: When doing remote execution, the number of nodes to use.
         machine: When doing remote execution, the machine to use.
+        num_downloaders: The number of downloaders per worker.

     """
     if not isinstance(inputs, Sequence):

@@ -190,6 +195,7 @@
             remote_output_dir=PrettyDirectory(output_dir, remote_output_dir),
             fast_dev_run=fast_dev_run,
             input_dir=input_dir or _get_input_dir(inputs),
+            num_downloaders=num_downloaders,
         )
         return data_processor.run(
             LambdaDataChunkRecipe(
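
A usage sketch of the new num_downloaders argument (the transform function, input paths, and output_dir below are hypothetical; the remaining keyword arguments follow the signatures shown in this diff):

from lightning.data.streaming.functions import map

def resize(filepath):
    ...  # hypothetical per-file transform

map(
    fn=resize,
    inputs=["s3://my-bucket/images/0.png", "s3://my-bucket/images/1.png"],
    output_dir="/cache/resized",
    num_downloaders=2,  # new in this commit: number of download processes per worker
)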

src/lightning/data/streaming/map.py

Lines changed: 0 additions & 50 deletions
This file was deleted.
Lines changed: 50 additions & 0 deletions (new file; the tests below exercise lightning.data.streaming.client)

@@ -0,0 +1,50 @@
from time import sleep, time
from unittest import mock

from lightning.data.streaming import client


def test_s3_client_without_cloud_space_id(monkeypatch):
    boto3 = mock.MagicMock()
    monkeypatch.setattr(client, "boto3", boto3)

    botocore = mock.MagicMock()
    monkeypatch.setattr(client, "botocore", botocore)

    s3 = client.S3Client(1)
    assert s3.client
    assert s3.client
    assert s3.client
    assert s3.client
    assert s3.client

    boto3.client.assert_called_once()


def test_s3_client_with_cloud_space_id(monkeypatch):
    boto3 = mock.MagicMock()
    monkeypatch.setattr(client, "boto3", boto3)

    botocore = mock.MagicMock()
    monkeypatch.setattr(client, "botocore", botocore)

    instance_metadata_provider = mock.MagicMock()
    monkeypatch.setattr(client, "InstanceMetadataProvider", instance_metadata_provider)

    instance_metadata_fetcher = mock.MagicMock()
    monkeypatch.setattr(client, "InstanceMetadataFetcher", instance_metadata_fetcher)

    monkeypatch.setenv("LIGHTNING_CLOUD_SPACE_ID", "dummy")

    s3 = client.S3Client(1)
    assert s3.client
    assert s3.client
    boto3.client.assert_called_once()
    sleep(1 - (time() - s3._last_time))
    assert s3.client
    assert s3.client
    assert len(boto3.client._mock_mock_calls) == 6
    sleep(1 - (time() - s3._last_time))
    assert s3.client
    assert s3.client
    assert len(boto3.client._mock_mock_calls) == 9
