Skip to content

Commit 0c8a1f5

Browse files
tchaton, thomas, and pre-commit-ci[bot]
authored and committed
Add name and version (#18796)
Co-authored-by: thomas <[email protected]> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> (cherry picked from commit 3f86ad7)
1 parent 23260a6 commit 0c8a1f5

20 files changed

+160
-52
lines changed

requirements/app/app.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
lightning-cloud ==0.5.41 # Must be pinned to ensure compatibility
1+
lightning-cloud ==0.5.42 # Must be pinned to ensure compatibility
22
packaging
33
typing-extensions >=4.0.0, <4.8.0
44
deepdiff >=5.7.0, <6.6.0

src/lightning/data/__init__.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
11
from lightning.data.datasets import LightningDataset, LightningIterableDataset
2+
from lightning.data.streaming.dataloader import StreamingDataLoader
3+
from lightning.data.streaming.dataset import StreamingDataset
24

3-
__all__ = ["LightningDataset", "LightningIterableDataset"]
5+
__all__ = ["LightningDataset", "StreamingDataset", "StreamingDataLoader", "LightningIterableDataset"]

src/lightning/data/cache/__init__.py renamed to src/lightning/data/streaming/__init__.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,8 @@
1111
# See the License for the specific language governing permissions and
1212
# limitations under the License.
1313

14-
from lightning.data.cache.cache import Cache
15-
from lightning.data.cache.dataloader import LightningDataLoader
16-
from lightning.data.cache.dataset_optimizer import DatasetOptimizer
14+
from lightning.data.streaming.cache import Cache
15+
from lightning.data.streaming.dataloader import StreamingDataLoader
16+
from lightning.data.streaming.dataset_optimizer import DatasetOptimizer
1717

18-
__all__ = ["Cache", "DatasetOptimizer", "LightningDataLoader"]
18+
__all__ = ["Cache", "DatasetOptimizer", "StreamingDataLoader"]

src/lightning/data/cache/cache.py renamed to src/lightning/data/streaming/cache.py

Lines changed: 28 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -13,22 +13,31 @@
1313

1414
import logging
1515
import os
16-
from typing import Any, Dict, List, Optional, Tuple, Union
16+
from typing import Any, Dict, List, Literal, Optional, Tuple, Union
1717

18-
from lightning.data.cache.constants import _INDEX_FILENAME, _TORCH_GREATER_EQUAL_2_1_0
19-
from lightning.data.cache.reader import BinaryReader
20-
from lightning.data.cache.sampler import ChunkedIndex
21-
from lightning.data.cache.writer import BinaryWriter
2218
from lightning.data.datasets.env import _DistributedEnv
19+
from lightning.data.streaming.constants import (
20+
_INDEX_FILENAME,
21+
_LIGHTNING_CLOUD_GREATER_EQUAL_0_5_42,
22+
_TORCH_GREATER_EQUAL_2_1_0,
23+
)
24+
from lightning.data.streaming.reader import BinaryReader
25+
from lightning.data.streaming.sampler import ChunkedIndex
26+
from lightning.data.streaming.writer import BinaryWriter
27+
28+
if _LIGHTNING_CLOUD_GREATER_EQUAL_0_5_42:
29+
from lightning_cloud.resolver import _find_remote_dir, _try_create_cache_dir
2330

2431
logger = logging.Logger(__name__)
2532

2633

2734
class Cache:
2835
def __init__(
2936
self,
30-
cache_dir: str,
37+
cache_dir: Optional[str] = None,
3138
remote_dir: Optional[str] = None,
39+
name: Optional[str] = None,
40+
version: Optional[Union[int, Literal["latest"]]] = "latest",
3241
compression: Optional[str] = None,
3342
chunk_size: Optional[int] = None,
3443
chunk_bytes: Optional[int] = None,
@@ -40,6 +49,8 @@ def __init__(
4049
cache_dir: The path to where the chunks will be stored.
4150
remote_dir: The path to a remote folder where the data are located.
4251
The scheme needs to be added to the path.
52+
name: The name of dataset in the cloud.
53+
version: The version of the dataset in the cloud to use. By default, we will use the latest.
4354
compression: The name of the algorithm to reduce the size of the chunks.
4455
chunk_bytes: The maximum number of bytes within a chunk.
4556
chunk_size: The maximum number of items within a chunk.
@@ -48,10 +59,20 @@ def __init__(
4859
super().__init__()
4960
if not _TORCH_GREATER_EQUAL_2_1_0:
5061
raise ModuleNotFoundError("PyTorch version 2.1 or higher is required to use the cache.")
62+
63+
cache_dir = cache_dir if cache_dir else _try_create_cache_dir(name)
64+
if not remote_dir:
65+
remote_dir, has_index_file = _find_remote_dir(name, version)
66+
67+
# When the index exists, we don't care about the chunk_size anymore.
68+
if has_index_file and (chunk_size is None and chunk_bytes is None):
69+
chunk_size = 2
5170
self._writer = BinaryWriter(
5271
str(cache_dir), chunk_size=chunk_size, chunk_bytes=chunk_bytes, compression=compression
5372
)
54-
self._reader = BinaryReader(str(cache_dir), remote_dir=remote_dir, compression=compression)
73+
self._reader = BinaryReader(
74+
str(cache_dir), remote_dir=remote_dir, compression=compression, name=name, version=version
75+
)
5576
self._cache_dir = str(cache_dir)
5677
self._is_done = False
5778
self._distributed_env = _DistributedEnv.detect()

src/lightning/data/cache/config.py renamed to src/lightning/data/streaming/config.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,9 +15,9 @@
1515
import os
1616
from typing import Any, Dict, List, Optional, Tuple
1717

18-
from lightning.data.cache.constants import _INDEX_FILENAME, _TORCH_GREATER_EQUAL_2_1_0
19-
from lightning.data.cache.downloader import get_downloader_cls
20-
from lightning.data.cache.sampler import ChunkedIndex
18+
from lightning.data.streaming.constants import _INDEX_FILENAME, _TORCH_GREATER_EQUAL_2_1_0
19+
from lightning.data.streaming.downloader import get_downloader_cls
20+
from lightning.data.streaming.sampler import ChunkedIndex
2121

2222
if _TORCH_GREATER_EQUAL_2_1_0:
2323
from torch.utils._pytree import treespec_loads

src/lightning/data/cache/constants.py renamed to src/lightning/data/streaming/constants.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,5 +20,5 @@
2020
# This is required for full pytree serialization / deserialization support
2121
_TORCH_GREATER_EQUAL_2_1_0 = RequirementCache("torch>=2.1.0")
2222
_VIZ_TRACKER_AVAILABLE = RequirementCache("viztracer")
23-
_LIGHTNING_CLOUD_GREATER_EQUAL_0_5_41 = RequirementCache("lightning-cloud>=0.5.41")
23+
_LIGHTNING_CLOUD_GREATER_EQUAL_0_5_42 = RequirementCache("lightning-cloud>=0.5.42")
2424
_BOTO3_AVAILABLE = RequirementCache("boto3")

src/lightning/data/cache/dataloader.py renamed to src/lightning/data/streaming/dataloader.py

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -31,10 +31,10 @@
3131
)
3232
from torch.utils.data.sampler import BatchSampler, Sampler
3333

34-
from lightning.data.cache import Cache
35-
from lightning.data.cache.constants import _DEFAULT_CHUNK_BYTES, _TORCH_GREATER_EQUAL_2_1_0, _VIZ_TRACKER_AVAILABLE
36-
from lightning.data.cache.sampler import CacheBatchSampler
3734
from lightning.data.datasets.env import _DistributedEnv
35+
from lightning.data.streaming import Cache
36+
from lightning.data.streaming.constants import _DEFAULT_CHUNK_BYTES, _TORCH_GREATER_EQUAL_2_1_0, _VIZ_TRACKER_AVAILABLE
37+
from lightning.data.streaming.sampler import CacheBatchSampler
3838

3939
if _TORCH_GREATER_EQUAL_2_1_0:
4040
from torch.utils._pytree import tree_flatten
@@ -172,7 +172,7 @@ def __call__(
172172
) -> None:
173173
from torch.utils.data._utils import worker
174174

175-
from lightning.data.cache.cache import Cache
175+
from lightning.data.streaming.cache import Cache
176176

177177
enable_profiling = self._global_rank == 0 and worker_id == 0 and _VIZ_TRACKER_AVAILABLE and self._profile
178178

@@ -248,7 +248,7 @@ def _next_data(self) -> Any:
248248
raise e
249249

250250

251-
class LightningDataLoader(DataLoader):
251+
class StreamingDataLoader(DataLoader):
252252
__doc__ = DataLoader.__doc__
253253

254254
def __init__(
@@ -271,16 +271,16 @@ def __init__(
271271
) -> None:
272272
if sampler:
273273
raise ValueError(
274-
"The LightningDataLoader relies on its own internal sampler. Passing a sampler isn't supported."
274+
"The StreamingDataLoader relies on its own internal sampler. Passing a sampler isn't supported."
275275
)
276276

277277
if batch_sampler:
278278
raise ValueError(
279-
"The LightningDataLoader relies on its own internal sampler. Passing a batch_sampler isn't supported."
279+
"The StreamingDataLoader relies on its own internal sampler. Passing a batch_sampler isn't supported."
280280
)
281281

282282
if isinstance(dataset, IterableDataset):
283-
raise ValueError("Only map-based dataset are supported by the LightningDataLoader for now.")
283+
raise ValueError("Only map-based dataset are supported by the StreamingDataLoader for now.")
284284

285285
if profile and not _VIZ_TRACKER_AVAILABLE:
286286
raise ModuleNotFoundError("To enable DataLoader profiling, run `pip install viztracer`.")
@@ -294,7 +294,7 @@ def __init__(
294294

295295
if len(cache_list) == 0:
296296
if cache_dir is None:
297-
raise ValueError("You should provide a `cache_dir` filepath to the LightningDataLoader.")
297+
raise ValueError("You should provide a `cache_dir` filepath to the StreamingDataLoader.")
298298

299299
dataset = CacheDataset(dataset, cache_dir, chunk_bytes, batch_size, compression)
300300
cache = dataset._cache
src/lightning/data/streaming/dataset.py

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
# Copyright The Lightning AI team.
2+
# Licensed under the Apache License, Version 2.0 (the "License");
3+
# you may not use this file except in compliance with the License.
4+
# You may obtain a copy of the License at
5+
#
6+
# http://www.apache.org/licenses/LICENSE-2.0
7+
#
8+
# Unless required by applicable law or agreed to in writing, software
9+
# distributed under the License is distributed on an "AS IS" BASIS,
10+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11+
# See the License for the specific language governing permissions and
12+
# limitations under the License.
13+
14+
from typing import Any, Literal, Optional, Union
15+
16+
from torch.utils.data import Dataset
17+
18+
from lightning.data.streaming import Cache
19+
20+
21+
class StreamingDataset(Dataset):
    """A map-style dataset backed by a ``Cache`` of optimized chunks.

    Intended for data that has already been prepared with the ``DatasetOptimizer`` class.
    """

    def __init__(
        self, name: str, version: Optional[Union[int, Literal["latest"]]] = "latest", cache_dir: Optional[str] = None
    ) -> None:
        """Instantiate the dataset and the cache that serves its samples.

        Arguments:
            name: The name of the optimized dataset in the cloud.
            version: The version of the dataset to use; defaults to the latest.
            cache_dir: The local directory where the cached data is stored.

        """
        super().__init__()
        # The Cache resolves the remote location from (name, version) and streams chunks locally.
        self.cache = Cache(name=name, version=version, cache_dir=cache_dir)

    def __len__(self) -> int:
        """Return the number of items available in the backing cache."""
        return len(self.cache)

    def __getitem__(self, idx: int) -> Any:
        """Fetch the raw cache entry at ``idx`` and pass it through :meth:`getitem`."""
        raw_item = self.cache[idx]
        return self.getitem(raw_item)

    def getitem(self, obj: Any) -> Any:
        """Subclass hook: override with your own logic to transform the raw cache object."""
        return obj

src/lightning/data/cache/dataset_optimizer.py renamed to src/lightning/data/streaming/dataset_optimizer.py

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -16,19 +16,19 @@
1616
from tqdm import tqdm
1717

1818
from lightning import seed_everything
19-
from lightning.data.cache import Cache
20-
from lightning.data.cache.constants import (
19+
from lightning.data.streaming import Cache
20+
from lightning.data.streaming.constants import (
2121
_BOTO3_AVAILABLE,
2222
_DEFAULT_FAST_DEV_RUN_ITEMS,
2323
_INDEX_FILENAME,
24-
_LIGHTNING_CLOUD_GREATER_EQUAL_0_5_41,
24+
_LIGHTNING_CLOUD_GREATER_EQUAL_0_5_42,
2525
_TORCH_GREATER_EQUAL_2_1_0,
2626
)
2727

2828
if _TORCH_GREATER_EQUAL_2_1_0:
2929
from torch.utils._pytree import tree_flatten, tree_unflatten
3030

31-
if _LIGHTNING_CLOUD_GREATER_EQUAL_0_5_41:
31+
if _LIGHTNING_CLOUD_GREATER_EQUAL_0_5_42:
3232
from lightning_cloud.resolver import _LightningSrcResolver, _LightningTargetResolver
3333

3434
if _BOTO3_AVAILABLE:
@@ -441,6 +441,13 @@ def prepare_dataset_structure(self, src_dir, filepaths)
441441
# [('file_1.JPEG', 'file_1.mask'), ... ('file_N.JPEG', 'file_N.mask')]
442442
return [(x[i], x[i+1]) for i in range(len(filepaths) -1)]
443443
444+
def prepare_item(self, obj):
445+
image_filepath, mask_filepath = obj
446+
447+
image = load_and_resize(image_filepath)
448+
mask = load_and_resize(mask_filepath)
449+
return (image, mask)
450+
444451
"""
445452
pass
446453

0 commit comments

Comments
 (0)