Commit 97d71ab

Data Processor: Resolve several bugs found while publishing a Studio (#19309)
1 parent 93c1ab0 commit 97d71ab

14 files changed: +853 additions, -89 deletions


requirements/app/app.txt

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
-lightning-cloud == 0.5.59 # Must be pinned to ensure compatibility
+lightning-cloud == 0.5.61 # Must be pinned to ensure compatibility
 packaging
 typing-extensions >=4.4.0, <4.8.0
 deepdiff >=5.7.0, <6.6.0

src/lightning/data/__init__.py

Lines changed: 4 additions & 0 deletions
@@ -1,9 +1,13 @@
+from lightning.data.streaming.combined import CombinedStreamingDataset
+from lightning.data.streaming.dataloader import StreamingDataLoader
 from lightning.data.streaming.dataset import StreamingDataset
 from lightning.data.streaming.functions import map, optimize
 
 __all__ = [
     "LightningDataset",
     "StreamingDataset",
+    "CombinedStreamingDataset",
+    "StreamingDataLoader",
     "LightningIterableDataset",
     "map",
     "optimize",

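With these exports, the combined dataset and the streaming dataloader become importable directly from `lightning.data`. A minimal usage sketch, assuming hypothetical S3 dataset locations and a `CombinedStreamingDataset` constructor that takes a list of datasets (neither is confirmed by this diff):

```python
# Hypothetical usage; the s3:// URIs are placeholders, not part of this commit.
from lightning.data import CombinedStreamingDataset, StreamingDataLoader, StreamingDataset

datasets = [
    StreamingDataset("s3://my-bucket/optimized-dataset-a"),
    StreamingDataset("s3://my-bucket/optimized-dataset-b"),
]
combined = CombinedStreamingDataset(datasets)

dataloader = StreamingDataLoader(combined, batch_size=8)
for batch in dataloader:
    ...  # training step
```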
src/lightning/data/streaming/cache.py

Lines changed: 5 additions & 12 deletions
@@ -13,7 +13,6 @@
 
 import logging
 import os
-from dataclasses import dataclass
 from typing import Any, Dict, List, Optional, Tuple, Union
 
 from lightning.data.streaming.constants import (
@@ -23,6 +22,7 @@
 )
 from lightning.data.streaming.item_loader import BaseItemLoader
 from lightning.data.streaming.reader import BinaryReader
+from lightning.data.streaming.resolver import Dir, _resolve_dir
 from lightning.data.streaming.sampler import ChunkedIndex
 from lightning.data.streaming.serializers import Serializer
 from lightning.data.streaming.writer import BinaryWriter
@@ -31,17 +31,6 @@
 
 logger = logging.Logger(__name__)
 
-if _LIGHTNING_CLOUD_LATEST:
-    from lightning_cloud.resolver import _resolve_dir
-
-
-@dataclass
-class Dir:
-    """Holds a directory path and possibly its associated remote URL."""
-
-    path: str
-    url: Optional[str] = None
-
 
 class Cache:
     def __init__(
@@ -76,6 +65,7 @@ def __init__(
 
         input_dir = _resolve_dir(input_dir)
         self._cache_dir = input_dir.path
+        assert self._cache_dir
         self._writer = BinaryWriter(
             self._cache_dir,
             chunk_size=chunk_size,
@@ -108,15 +98,18 @@ def filled(self) -> bool:
         """Returns whether the caching phase is done."""
         if self._is_done:
             return True
+        assert self._cache_dir
         self._is_done = os.path.exists(os.path.join(self._cache_dir, _INDEX_FILENAME))
         return self._is_done
 
     @property
     def cache_dir(self) -> str:
+        assert self._cache_dir
         return self._cache_dir
 
     @property
     def checkpoint_dir(self) -> str:
+        assert self._cache_dir
         checkpoint_dir = os.path.join(self._cache_dir, "checkpoints")
         return self._try_create(checkpoint_dir)
 

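The `Dir` dataclass and `_resolve_dir` are now imported from `lightning.data.streaming.resolver` rather than being defined locally or pulled from `lightning_cloud`. A rough sketch of the shape this implies, assuming the resolver's `Dir` keeps the fields removed above but with an optional `path` (the new `assert self._cache_dir` checks suggest `path` can now be `None`); this is an assumption, not a copy of the resolver module:

```python
# Assumed shape of lightning.data.streaming.resolver.Dir; not taken from the resolver module itself.
from dataclasses import dataclass
from typing import Optional


@dataclass
class Dir:
    """Holds a directory path and possibly its associated remote URL."""

    path: Optional[str] = None
    url: Optional[str] = None
```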
src/lightning/data/streaming/combined.py

Lines changed: 4 additions & 0 deletions
@@ -44,6 +44,10 @@ def __init__(
 
         self._iterator: Optional[_CombinedDatasetIterator] = None
 
+    def __len__(self) -> int:
+        assert self._weights
+        return int(sum(w * len(d) for w, d in zip(self._weights, self._datasets)))
+
     def __iter__(self) -> Iterator[Any]:
         assert self._weights
         self._iterator = _CombinedDatasetIterator(self._datasets, self._seed, self._weights)

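The new `__len__` reports a weighted length: each wrapped dataset contributes its length scaled by its sampling weight, and the sum is truncated to an int. A small numeric illustration of that formula (the lengths and weights below are made up):

```python
# Illustration of the weighted-length formula only; not library code.
weights = [0.5, 0.5]
dataset_lengths = [100, 50]

combined_length = int(sum(w * n for w, n in zip(weights, dataset_lengths)))
assert combined_length == 75  # 0.5 * 100 + 0.5 * 50
```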
src/lightning/data/streaming/constants.py

Lines changed: 1 addition & 1 deletion
@@ -26,7 +26,7 @@
 # This is required for full pytree serialization / deserialization support
 _TORCH_GREATER_EQUAL_2_1_0 = RequirementCache("torch>=2.1.0")
 _VIZ_TRACKER_AVAILABLE = RequirementCache("viztracer")
-_LIGHTNING_CLOUD_LATEST = RequirementCache("lightning-cloud>=0.5.59")
+_LIGHTNING_CLOUD_LATEST = RequirementCache("lightning-cloud>=0.5.61")
 _BOTO3_AVAILABLE = RequirementCache("boto3")
 
 # DON'T CHANGE ORDER

src/lightning/data/streaming/data_processor.py

Lines changed: 42 additions & 26 deletions
@@ -1,3 +1,4 @@
+import concurrent
 import json
 import logging
 import os
@@ -27,6 +28,7 @@
     _LIGHTNING_CLOUD_LATEST,
     _TORCH_GREATER_EQUAL_2_1_0,
 )
+from lightning.data.streaming.resolver import _resolve_dir
 from lightning.data.utilities.broadcast import broadcast_object
 from lightning.data.utilities.packing import _pack_greedily
 
@@ -35,7 +37,6 @@
 
 if _LIGHTNING_CLOUD_LATEST:
     from lightning_cloud.openapi import V1DatasetType
-    from lightning_cloud.resolver import _resolve_dir
     from lightning_cloud.utils.dataset import _create_dataset
 
 
@@ -120,7 +121,9 @@ def _download_data_target(input_dir: Dir, cache_dir: str, queue_in: Queue, queue
         index, paths = r
 
         # 5. Check whether all the files are already downloaded
-        if all(os.path.exists(p.replace(input_dir.path, cache_dir) if input_dir else p) for p in paths):
+        if input_dir.path and all(
+            os.path.exists(p.replace(input_dir.path, cache_dir) if input_dir else p) for p in paths
+        ):
             queue_out.put(index)
             continue
 
@@ -131,9 +134,10 @@ def _download_data_target(input_dir: Dir, cache_dir: str, queue_in: Queue, queue
 
         # 7. Download all the required paths to unblock the current index
         for path in paths:
-            local_path = path.replace(input_dir.path, cache_dir)
+            if input_dir.path:
+                local_path = path.replace(input_dir.path, cache_dir)
 
-            if input_dir.url:
+            if input_dir.url and input_dir.path:
                 path = path.replace(input_dir.path, input_dir.url)
 
             obj = parse.urlparse(path)
@@ -168,7 +172,7 @@ def _remove_target(input_dir: Dir, cache_dir: str, queue_in: Queue) -> None:
         # 3. Iterate through the paths and delete them sequentially.
         for path in paths:
             if input_dir:
-                if not path.startswith(cache_dir):
+                if not path.startswith(cache_dir) and input_dir.path is not None:
                     path = path.replace(input_dir.path, cache_dir)
 
                 if os.path.exists(path):
@@ -199,11 +203,13 @@ def _upload_fn(upload_queue: Queue, remove_queue: Queue, cache_dir: str, output_
         if obj.scheme == "s3":
             try:
                 s3.client.upload_file(
-                    local_filepath, obj.netloc, os.path.join(obj.path.lstrip("/"), os.path.basename(local_filepath))
+                    local_filepath,
+                    obj.netloc,
+                    os.path.join(str(obj.path).lstrip("/"), os.path.basename(local_filepath)),
                 )
             except Exception as e:
                 print(e)
-        elif os.path.isdir(output_dir.path):
+        elif output_dir.path and os.path.isdir(output_dir.path):
             shutil.copyfile(local_filepath, os.path.join(output_dir.path, os.path.basename(local_filepath)))
         else:
             raise ValueError(f"The provided {output_dir.path} isn't supported.")
@@ -254,20 +260,30 @@ def _map_items_to_workers_weighted(
     return [worker_items[worker_id] for worker_id in worker_ids_this_node]
 
 
+def _get_num_bytes(item: Any, base_path: str) -> int:
+    flattened_item, _ = tree_flatten(item)
+
+    num_bytes = 0
+    for element in flattened_item:
+        if isinstance(element, str) and element.startswith(base_path) and os.path.exists(element):
+            file_bytes = os.path.getsize(element)
+            if file_bytes == 0:
+                raise RuntimeError(f"The file {element} has 0 bytes!")
+            num_bytes += file_bytes
+    return num_bytes
+
+
 def _get_item_filesizes(items: List[Any], base_path: str = "") -> List[int]:
     """Computes the total size in bytes of all file paths for every datastructure in the given list."""
     item_sizes = []
-    for item in items:
-        flattened_item, _ = tree_flatten(item)
-
-        num_bytes = 0
-        for element in flattened_item:
-            if isinstance(element, str) and element.startswith(base_path) and os.path.exists(element):
-                file_bytes = os.path.getsize(element)
-                if file_bytes == 0:
-                    raise RuntimeError(f"The file {element} has 0 bytes!")
-                num_bytes += file_bytes
-        item_sizes.append(num_bytes)
+
+    cpu_count = os.cpu_count() or 1
+
+    # Parallelize to accelerate retrieving the number of file bytes to read for each item
+    with concurrent.futures.ThreadPoolExecutor(max_workers=cpu_count * 2 if cpu_count > 4 else cpu_count) as executor:
+        futures = [executor.submit(_get_num_bytes, item, base_path) for item in items]
+        for future in futures:
+            item_sizes.append(future.result())
     return item_sizes
 
 
@@ -358,7 +374,7 @@ def _loop(self) -> None:
         for uploader in self.uploaders:
             uploader.join()
 
-        if self.remove and self.input_dir.path is not None:
+        if self.remove:
             assert self.remover
             self.remove_queue.put(None)
             self.remover.join()
@@ -487,7 +503,7 @@ def _start_downloaders(self) -> None:
             self.to_download_queues[downloader_index].put(None)
 
     def _start_remover(self) -> None:
-        if not self.remove or self.input_dir.path is None:
+        if not self.remove:
             return
 
         self.remover = Process(
@@ -696,9 +712,9 @@ def _upload_index(self, output_dir: Dir, cache_dir: str, num_nodes: int, node_ra
         if obj.scheme == "s3":
             s3 = S3Client()
             s3.client.upload_file(
-                local_filepath, obj.netloc, os.path.join(obj.path.lstrip("/"), os.path.basename(local_filepath))
+                local_filepath, obj.netloc, os.path.join(str(obj.path).lstrip("/"), os.path.basename(local_filepath))
             )
-        elif os.path.isdir(output_dir.path):
+        elif output_dir.path and os.path.isdir(output_dir.path):
             shutil.copyfile(local_filepath, os.path.join(output_dir.path, os.path.basename(local_filepath)))
 
         if num_nodes == 1 or node_rank is None:
@@ -710,16 +726,16 @@ def _upload_index(self, output_dir: Dir, cache_dir: str, num_nodes: int, node_ra
         if num_nodes == node_rank + 1:
             # Get the index file locally
             for node_rank in range(num_nodes - 1):
-                remote_filepath = os.path.join(
-                    output_dir.url if output_dir.url else output_dir.path, f"{node_rank}-{_INDEX_FILENAME}"
-                )
+                output_dir_path = output_dir.url if output_dir.url else output_dir.path
+                assert output_dir_path
+                remote_filepath = os.path.join(output_dir_path, f"{node_rank}-{_INDEX_FILENAME}")
                 node_index_filepath = os.path.join(cache_dir, os.path.basename(remote_filepath))
                 if obj.scheme == "s3":
                     obj = parse.urlparse(remote_filepath)
                     _wait_for_file_to_exist(s3, obj)
                     with open(node_index_filepath, "wb") as f:
                         s3.client.download_fileobj(obj.netloc, obj.path.lstrip("/"), f)
-                elif os.path.isdir(output_dir.path):
+                elif output_dir.path and os.path.isdir(output_dir.path):
                     shutil.copyfile(remote_filepath, node_index_filepath)
 
             merge_cache = Cache(cache_dir, chunk_bytes=1)

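The file-size pass is now parallelized: `_get_num_bytes` handles a single item, and `_get_item_filesizes` fans the items out over a thread pool sized from `os.cpu_count()`, oversubscribing threads on larger machines since the work is I/O bound. A self-contained sketch of the same pattern; `total_file_bytes` and `file_sizes` are illustrative stand-ins, not the library functions:

```python
import concurrent.futures
import os
from typing import List


def total_file_bytes(path: str) -> int:
    # Stand-in for _get_num_bytes: size of a single file, rejecting empty files.
    size = os.path.getsize(path)
    if size == 0:
        raise RuntimeError(f"The file {path} has 0 bytes!")
    return size


def file_sizes(paths: List[str]) -> List[int]:
    cpu_count = os.cpu_count() or 1
    # Mirror the commit's heuristic: double the threads when more than 4 cores are available.
    max_workers = cpu_count * 2 if cpu_count > 4 else cpu_count
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(total_file_bytes, p) for p in paths]
        return [future.result() for future in futures]
```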
src/lightning/data/streaming/dataset.py

Lines changed: 6 additions & 20 deletions
@@ -13,7 +13,6 @@
 
 import hashlib
 import os
-from dataclasses import dataclass
 from time import time
 from typing import Any, Dict, List, Optional, Tuple, Union
 
@@ -24,24 +23,21 @@
 from lightning.data.streaming.constants import (
     _DEFAULT_CACHE_DIR,
     _INDEX_FILENAME,
-    _LIGHTNING_CLOUD_LATEST,
 )
 from lightning.data.streaming.item_loader import BaseItemLoader
+from lightning.data.streaming.resolver import Dir, _resolve_dir
 from lightning.data.streaming.sampler import ChunkedIndex
 from lightning.data.streaming.serializers import Serializer
 from lightning.data.streaming.shuffle import FullShuffle, NoShuffle, Shuffle
 from lightning.data.utilities.env import Environment, _DistributedEnv, _WorkerEnv
 
-if _LIGHTNING_CLOUD_LATEST:
-    from lightning_cloud.resolver import Dir, _resolve_dir
-
 
 class StreamingDataset(IterableDataset):
     """The streaming dataset can be used once your data have been optimised using the DatasetOptimiser class."""
 
     def __init__(
         self,
-        input_dir: Union[str, "RemoteDir"],
+        input_dir: Union[str, "Dir"],
         item_loader: Optional[BaseItemLoader] = None,
         shuffle: bool = False,
         drop_last: bool = False,
@@ -66,12 +62,10 @@ def __init__(
         if not isinstance(shuffle, bool):
             raise ValueError(f"Shuffle should be a boolean. Found {shuffle}")
 
-        if isinstance(input_dir, RemoteDir):
-            input_dir = Dir(path=input_dir.cache_dir, url=input_dir.remote)
-
         input_dir = _resolve_dir(input_dir)
 
         self.input_dir = input_dir
+
         self.item_loader = item_loader
         self.shuffle: bool = shuffle
         self.drop_last = drop_last
@@ -368,8 +362,8 @@ def _validate_state_dict(self) -> None:
         )
 
 
-def _try_create_cache_dir(input_dir: str, shard_rank: int = 0) -> Optional[str]:
-    hash_object = hashlib.md5(input_dir.encode())
+def _try_create_cache_dir(input_dir: Optional[str], shard_rank: int = 0) -> Optional[str]:
+    hash_object = hashlib.md5((input_dir or "").encode())
     if "LIGHTNING_CLUSTER_ID" not in os.environ or "LIGHTNING_CLOUD_PROJECT_ID" not in os.environ:
         cache_dir = os.path.join(_DEFAULT_CACHE_DIR, hash_object.hexdigest(), str(shard_rank))
         os.makedirs(cache_dir, exist_ok=True)
@@ -379,7 +373,7 @@ def _try_create_cache_dir(input_dir: str, shard_rank: int = 0) -> Optional[str]:
     return cache_dir
 
 
-def _should_replace_path(path: str) -> bool:
+def _should_replace_path(path: Optional[str]) -> bool:
    """Whether the input path is a special path to be replaced."""
     if path is None or path == "":
         return True
@@ -391,14 +385,6 @@ def _is_in_dataloader_worker() -> bool:
     return get_worker_info() is not None
 
 
-@dataclass
-class RemoteDir:
-    """Holds a remote URL to a directory and a cache directory where the data will be downloaded."""
-
-    cache_dir: str
-    remote: str
-
-
 def is_integer(value: str) -> bool:
     try:
         int(value)

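`_try_create_cache_dir` now accepts an optional input directory and hashes the empty string when it is `None`. A sketch of the local-path derivation it performs outside the Lightning cluster environment; the `_DEFAULT_CACHE_DIR` value below is a placeholder, not the constant from the library:

```python
# Hedged sketch of the cache-dir derivation; _DEFAULT_CACHE_DIR here is a placeholder value.
import hashlib
import os

_DEFAULT_CACHE_DIR = os.path.join(os.path.expanduser("~"), ".cache", "lightning-data")


def local_cache_dir(input_dir, shard_rank: int = 0) -> str:
    # None is hashed as the empty string, matching the new Optional[str] handling.
    hash_object = hashlib.md5((input_dir or "").encode())
    cache_dir = os.path.join(_DEFAULT_CACHE_DIR, hash_object.hexdigest(), str(shard_rank))
    os.makedirs(cache_dir, exist_ok=True)
    return cache_dir
```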