Skip to content

Commit 1fe9f14

Browse files
tchaton, pre-commit-ci[bot], and thomas
authored and committed
Improve map, optimize and StreamingDataset (#18912)
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: thomas <[email protected]>
(cherry picked from commit f5f4d0a)
1 parent 89f4d18 commit 1fe9f14

File tree

26 files changed

+401
-345
lines changed

26 files changed

+401
-345
lines changed

docs/source-app/conf.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -445,7 +445,7 @@ def find_source():
445445
linkcheck_anchors = False
446446

447447
# A timeout value, in seconds, for the linkcheck builder.
448-
linkcheck_timeout = 10
448+
linkcheck_timeout = 60
449449

450450
# ignore all links in any CHANGELOG file
451451
linkcheck_exclude_documents = [r"^(.*\/)*CHANGELOG.*$"]

docs/source-fabric/conf.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -408,7 +408,7 @@ def find_source():
408408
linkcheck_anchors = False
409409

410410
# A timeout value, in seconds, for the linkcheck builder.
411-
linkcheck_timeout = 10
411+
linkcheck_timeout = 60
412412

413413
# ignore all links in any CHANGELOG file
414414
linkcheck_exclude_documents = [r"^(.*\/)*CHANGELOG.*$"]

docs/source-pytorch/conf.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -599,7 +599,7 @@ def package_list_from_file(file):
599599
linkcheck_anchors = False
600600

601601
# A timeout value, in seconds, for the linkcheck builder.
602-
linkcheck_timeout = 10
602+
linkcheck_timeout = 60
603603

604604
# ignore all links in any CHANGELOG file
605605
linkcheck_exclude_documents = [r"^(.*\/)*CHANGELOG.*$"]

index_1.txt

Whitespace-only changes.

requirements/app/app.txt

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,4 @@
1-
lightning-cloud ==0.5.46 # Must be pinned to ensure compatibility
1+
lightning-cloud == 0.5.48 # Must be pinned to ensure compatibility
22
packaging
33
typing-extensions >=4.0.0, <4.8.0
44
deepdiff >=5.7.0, <6.6.0

requirements/app/test.txt

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -15,3 +15,4 @@ pympler
1515
psutil <5.10.0
1616
setuptools <68.3.0
1717
requests-mock ==1.11.0
18+
pandas

requirements/app/ui.txt

Lines changed: 0 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,2 +0,0 @@
1-
streamlit >=1.13.0, <1.27.0
2-
panel >=1.0.0, <1.3.0

src/lightning/data/streaming/cache.py

Lines changed: 30 additions & 31 deletions
Original file line number | Diff line number | Diff line change
@@ -13,32 +13,38 @@
1313

1414
import logging
1515
import os
16-
from typing import Any, Dict, List, Literal, Optional, Tuple, Union
16+
from dataclasses import dataclass
17+
from typing import Any, Dict, List, Optional, Tuple, Union
1718

1819
from lightning.data.datasets.env import _DistributedEnv
1920
from lightning.data.streaming.constants import (
2021
_INDEX_FILENAME,
21-
_LIGHTNING_CLOUD_GREATER_EQUAL_0_5_46,
22+
_LIGHTNING_CLOUD_GREATER_EQUAL_0_5_48,
2223
_TORCH_GREATER_EQUAL_2_1_0,
2324
)
2425
from lightning.data.streaming.item_loader import BaseItemLoader
2526
from lightning.data.streaming.reader import BinaryReader
2627
from lightning.data.streaming.sampler import ChunkedIndex
2728
from lightning.data.streaming.writer import BinaryWriter
2829

29-
if _LIGHTNING_CLOUD_GREATER_EQUAL_0_5_46:
30-
from lightning_cloud.resolver import _find_remote_dir, _try_create_cache_dir
31-
3230
logger = logging.Logger(__name__)
3331

32+
if _LIGHTNING_CLOUD_GREATER_EQUAL_0_5_48:
33+
from lightning_cloud.resolver import _resolve_dir
34+
35+
36+
@dataclass
37+
class Dir:
38+
"""Holds a directory path and possibly its associated remote URL."""
39+
40+
path: str
41+
url: Optional[str] = None
42+
3443

3544
class Cache:
3645
def __init__(
3746
self,
38-
cache_dir: Optional[str] = None,
39-
remote_dir: Optional[str] = None,
40-
name: Optional[str] = None,
41-
version: Optional[Union[int, Literal["latest"]]] = "latest",
47+
input_dir: Optional[Union[str, Dir]],
4248
compression: Optional[str] = None,
4349
chunk_size: Optional[int] = None,
4450
chunk_bytes: Optional[Union[int, str]] = None,
@@ -48,9 +54,7 @@ def __init__(
4854
together in order to accelerate fetching.
4955
5056
Arguments:
51-
cache_dir: The path to where the chunks will be stored.
52-
remote_dir: The path to a remote folder where the data are located.
53-
The scheme needs to be added to the path.
57+
input_dir: The path to where the chunks will be or are stored.
5458
name: The name of dataset in the cloud.
5559
version: The version of the dataset in the cloud to use. By default, we will use the latest.
5660
compression: The name of the algorithm to reduce the size of the chunks.
@@ -63,25 +67,20 @@ def __init__(
6367
if not _TORCH_GREATER_EQUAL_2_1_0:
6468
raise ModuleNotFoundError("PyTorch version 2.1 or higher is required to use the cache.")
6569

66-
self._cache_dir = cache_dir = str(cache_dir) if cache_dir else _try_create_cache_dir(name)
67-
if not remote_dir:
68-
remote_dir, has_index_file = _find_remote_dir(name, version)
69-
70-
# When the index exists, we don't care about the chunk_size anymore.
71-
if has_index_file and (chunk_size is None and chunk_bytes is None):
72-
chunk_size = 2
73-
74-
# Add the version to the cache_dir to avoid collisions.
75-
if remote_dir and os.path.basename(remote_dir).startswith("version_"):
76-
cache_dir = os.path.join(cache_dir, os.path.basename(remote_dir))
77-
78-
if cache_dir:
79-
os.makedirs(cache_dir, exist_ok=True)
80-
81-
self._cache_dir = cache_dir
82-
83-
self._writer = BinaryWriter(cache_dir, chunk_size=chunk_size, chunk_bytes=chunk_bytes, compression=compression)
84-
self._reader = BinaryReader(cache_dir, remote_dir=remote_dir, compression=compression, item_loader=item_loader)
70+
if not _LIGHTNING_CLOUD_GREATER_EQUAL_0_5_48:
71+
raise ModuleNotFoundError("Lightning Cloud 0.5.48 or higher is required to use the cache.")
72+
73+
input_dir = _resolve_dir(input_dir)
74+
self._cache_dir = input_dir.path
75+
self._writer = BinaryWriter(
76+
self._cache_dir, chunk_size=chunk_size, chunk_bytes=chunk_bytes, compression=compression
77+
)
78+
self._reader = BinaryReader(
79+
self._cache_dir,
80+
remote_input_dir=input_dir.url,
81+
compression=compression,
82+
item_loader=item_loader,
83+
)
8584
self._is_done = False
8685
self._distributed_env = _DistributedEnv.detect()
8786

src/lightning/data/streaming/constants.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -21,7 +21,7 @@
2121
# This is required for full pytree serialization / deserialization support
2222
_TORCH_GREATER_EQUAL_2_1_0 = RequirementCache("torch>=2.1.0")
2323
_VIZ_TRACKER_AVAILABLE = RequirementCache("viztracer")
24-
_LIGHTNING_CLOUD_GREATER_EQUAL_0_5_46 = RequirementCache("lightning-cloud>=0.5.46")
24+
_LIGHTNING_CLOUD_GREATER_EQUAL_0_5_48 = RequirementCache("lightning-cloud>=0.5.48")
2525
_BOTO3_AVAILABLE = RequirementCache("boto3")
2626

2727
# DON'T CHANGE ORDER

0 commit comments

Comments
 (0)