Lightning-AI
diff --git a/src/litdata/constants.py b/src/litdata/constants.py
Lines changed: 1 addition & 0 deletions
diff --git a/src/litdata/processing/readers.py b/src/litdata/processing/readers.py
Lines changed: 1 addition & 4 deletions
diff --git a/src/litdata/streaming/dataset.py b/src/litdata/streaming/dataset.py
Lines changed: 17 additions & 5 deletions
diff --git a/src/litdata/streaming/downloader.py b/src/litdata/streaming/downloader.py
Lines changed: 35 additions & 24 deletions
diff --git a/src/litdata/streaming/item_loader.py b/src/litdata/streaming/item_loader.py
Lines changed: 125 additions & 14 deletions
@@ -39,6 +39,7 @@
 _TQDM_AVAILABLE = RequirementCache("tqdm")
 _LIGHTNING_SDK_AVAILABLE = RequirementCache("lightning_sdk")
 _HF_HUB_AVAILABLE = RequirementCache("huggingface_hub")
+_PYARROW_AVAILABLE = RequirementCache("pyarrow")
 _POLARS_AVAILABLE = RequirementCache("polars>1.0.0")
 _DEBUG = bool(int(os.getenv("DEBUG_LITDATA", "0")))
 
 
@@ -16,13 +16,10 @@
 from abc import ABC, abstractmethod
 from typing import Any, List
 
-from lightning_utilities.core.imports import RequirementCache
-
+from litdata.constants import _PYARROW_AVAILABLE
 from litdata.streaming.dataloader import StreamingDataLoader
 from litdata.utilities.format import _get_tqdm_iterator_if_available
 
-_PYARROW_AVAILABLE = RequirementCache("pyarrow")
-
 
 class BaseReader(ABC):
     """The `BaseReader` interface defines how to read and preprocess data
 
@@ -100,11 +100,25 @@ def __init__(
 
         if input_dir.url is not None and input_dir.url.startswith("hf://"):
             if index_path is None:
-                # no index path provide, load from cache, or try indexing on the go.
+                # No index_path was provided. Attempt to load it from cache or generate it dynamically on the fly.
                 index_path = index_hf_dataset(input_dir.url)
                 cache_dir.path = index_path
                 input_dir.path = index_path
-            item_loader = ParquetLoader()
+
+            if item_loader is not None and not isinstance(item_loader, ParquetLoader):
+                raise ValueError(
+                    "Invalid item_loader for hf://datasets. "
+                    "The item_loader must be an instance of ParquetLoader. "
+                    "Please provide a valid ParquetLoader instance."
+                )
+
+            if item_loader is not None and item_loader._low_memory and shuffle:
+                raise ValueError(
+                    "You have enabled shuffling when using low memory with ParquetLoader. "
+                    "This configuration may lead to performance issues during the training process. "
+                    "Consider disabling shuffling or using a ParquetLoader without low memory mode."
+                )
+            item_loader = item_loader or ParquetLoader()
 
         self.input_dir = input_dir
         self.cache_dir = cache_dir
@@ -548,9 +562,7 @@ def _validate_state_dict(self) -> None:
                     "The provided `item_loader` state doesn't match the current one. "
                     f"Found `{self.item_loader.state_dict()}` instead of `{state['item_loader']}`."
                 )
-            logger.warning(
-                f"Overriding state item_loader {state['item_loader']} " f"to {self.item_loader.state_dict()}."
-            )
+            logger.warning(f"Overriding state item_loader {state['item_loader']} to {self.item_loader.state_dict()}.")
             state["item_loader"] = self.item_loader.state_dict()
 
         if state["drop_last"] != self.drop_last:
 
@@ -15,6 +15,7 @@
 import os
 import shutil
 import subprocess
+import tempfile
 from abc import ABC
 from contextlib import suppress
 from typing import Any, Dict, List, Optional, Type
@@ -58,9 +59,9 @@ def download_chunk_from_index(self, chunk_index: int) -> None:
         local_chunkpath = os.path.join(self._cache_dir, chunk_filename)
         remote_chunkpath = os.path.join(self._remote_dir, chunk_filename)
 
-        self.download_file(remote_chunkpath, local_chunkpath, chunk_filename)
+        self.download_file(remote_chunkpath, local_chunkpath)
 
-    def download_file(self, remote_chunkpath: str, local_chunkpath: str, remote_chunk_filename: str = "") -> None:
+    def download_file(self, remote_chunkpath: str, local_chunkpath: str) -> None:
         pass
 
 
@@ -74,7 +75,7 @@ def __init__(
         if not self._s5cmd_available or _DISABLE_S5CMD:
             self._client = S3Client(storage_options=self._storage_options)
 
-    def download_file(self, remote_filepath: str, local_filepath: str, remote_chunk_filename: str = "") -> None:
+    def download_file(self, remote_filepath: str, local_filepath: str) -> None:
         obj = parse.urlparse(remote_filepath)
 
         if obj.scheme != "s3":
@@ -158,7 +159,7 @@ def __init__(
 
         super().__init__(remote_dir, cache_dir, chunks, storage_options)
 
-    def download_file(self, remote_filepath: str, local_filepath: str, remote_chunk_filename: str = "") -> None:
+    def download_file(self, remote_filepath: str, local_filepath: str) -> None:
         from google.cloud import storage
 
         obj = parse.urlparse(remote_filepath)
@@ -193,7 +194,7 @@ def __init__(
 
         super().__init__(remote_dir, cache_dir, chunks, storage_options)
 
-    def download_file(self, remote_filepath: str, local_filepath: str, remote_chunk_filename: str = "") -> None:
+    def download_file(self, remote_filepath: str, local_filepath: str) -> None:
         from azure.storage.blob import BlobServiceClient
 
         obj = parse.urlparse(remote_filepath)
@@ -220,7 +221,7 @@ def download_file(self, remote_filepath: str, local_filepath: str, remote_chunk_
 
 
 class LocalDownloader(Downloader):
-    def download_file(self, remote_filepath: str, local_filepath: str, remote_chunk_filename: str = "") -> None:
+    def download_file(self, remote_filepath: str, local_filepath: str) -> None:
         if not os.path.exists(remote_filepath):
             raise FileNotFoundError(f"The provided remote_path doesn't exist: {remote_filepath}")
 
@@ -248,32 +249,42 @@ def __init__(
             )
 
         super().__init__(remote_dir, cache_dir, chunks, storage_options)
-        from huggingface_hub import HfFileSystem
 
-        self.fs = HfFileSystem()
+    def download_file(self, remote_filepath: str, local_filepath: str) -> None:
+        """Download a file from the Hugging Face Hub.
+        The remote_filepath should be in the format `hf://<repo_type>/<repo_org>/<repo_name>/path`. For more
+        information, see
+        https://huggingface.co/docs/huggingface_hub/en/guides/hf_file_system#integrations.
+        """
+        from huggingface_hub import hf_hub_download
 
-    def download_file(self, remote_filepath: str, local_filepath: str, remote_chunk_filename: str = "") -> None:
-        # for HF dataset downloading, we don't need remote_filepath, but remote_chunk_filename
-        with suppress(Timeout), FileLock(local_filepath + ".lock", timeout=0):
-            temp_path = local_filepath + ".tmp"  # Avoid partial writes
-            try:
-                with self.fs.open(remote_chunk_filename, "rb") as cloud_file, open(temp_path, "wb") as local_file:
-                    for chunk in iter(lambda: cloud_file.read(4096), b""):  # Stream in 4KB chunks local_file.
-                        local_file.write(chunk)
+        obj = parse.urlparse(remote_filepath)
+
+        if obj.scheme != "hf":
+            raise ValueError(f"Expected obj.scheme to be `hf`, instead, got {obj.scheme} for remote={remote_filepath}")
 
-                os.rename(temp_path, local_filepath)  # Atomic move after successful write
+        if os.path.exists(local_filepath):
+            return
 
-            except Exception as e:
-                print(f"Error processing {remote_chunk_filename}: {e}")
+        with suppress(Timeout), FileLock(local_filepath + ".lock", timeout=0), tempfile.TemporaryDirectory() as tmpdir:
+            _, _, _, repo_org, repo_name, path = remote_filepath.split("/", 5)
+            repo_id = f"{repo_org}/{repo_name}"
 
-            finally:
-                # Ensure cleanup of temp file if an error occurs
-                if os.path.exists(temp_path):
-                    os.remove(temp_path)
+            downloaded_path = hf_hub_download(
+                repo_id,
+                path,
+                cache_dir=tmpdir,
+                repo_type="dataset",
+                **self._storage_options,
+            )
+            if downloaded_path != local_filepath and os.path.exists(downloaded_path):
+                temp_file_path = local_filepath + ".tmp"
+                shutil.copyfile(downloaded_path, temp_file_path)
+                os.rename(temp_file_path, local_filepath)
 
 
 class LocalDownloaderWithCache(LocalDownloader):
-    def download_file(self, remote_filepath: str, local_filepath: str, remote_chunk_filename: str = "") -> None:
+    def download_file(self, remote_filepath: str, local_filepath: str) -> None:
         remote_filepath = remote_filepath.replace("local:", "")
         super().download_file(remote_filepath, local_filepath)
 
 
@@ -10,8 +10,8 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 import functools
+import logging
 import os
 from abc import ABC, abstractmethod
 from collections import defaultdict, namedtuple
@@ -29,6 +29,7 @@
     _MAX_WAIT_TIME,
     _NUMPY_DTYPES_MAPPING,
     _POLARS_AVAILABLE,
+    _PYARROW_AVAILABLE,
     _TORCH_DTYPES_MAPPING,
 )
 from litdata.streaming.serializers import Serializer
@@ -37,6 +38,8 @@
 
 Interval = namedtuple("Interval", ["chunk_start", "roi_start_idx", "roi_end_idx", "chunk_end"])
 
+logger = logging.getLogger(__name__)
+
 
 class BaseItemLoader(ABC):
     """The base item loader is responsible to decide how the items within a chunk are loaded."""
@@ -527,13 +530,25 @@ def encode_data(cls, data: List[bytes], _: List[int], flattened: List[Any]) -> T
 
 
 class ParquetLoader(BaseItemLoader):
-    def __init__(self) -> None:
+    def __init__(self, pre_load_chunk: bool = False, low_memory: bool = True) -> None:
         if not _POLARS_AVAILABLE:
             raise ModuleNotFoundError(
                 "You are using the Parquet item loader, which depends on `Polars > 1.0.0`.",
                 "Please, run: `pip install polars>1.0.0`",
             )
+        if not _PYARROW_AVAILABLE:
+            raise ModuleNotFoundError("Please, run: `pip install pyarrow`")
+
         self._chunk_filepaths: Dict[str, bool] = {}
+        self._pre_load_chunk = pre_load_chunk
+        self._low_memory = low_memory
+
+        if not self._low_memory:
+            logger.warning(
+                "You have set low_memory=False in ParquetLoader. "
+                "This may result in high memory usage when processing large Parquet chunk files. "
+                "Consider setting low_memory=True to reduce memory consumption."
+            )
 
     def setup(
         self,
@@ -548,7 +563,9 @@ def setup(
         self._data_format = self._config["data_format"]
         self._shift_idx = len(self._data_format) * 4
         self.region_of_interest = region_of_interest
-        self._df: Dict[str, Any] = {}
+        self._df: Dict[int, Any] = {}
+        self._chunk_row_groups: Dict[int, Any] = {}
+        self._chunk_row_group_item_read_count: Dict[int, Any] = {}
 
     def generate_intervals(self) -> List[Interval]:
         intervals = []
@@ -566,11 +583,14 @@ def generate_intervals(self) -> List[Interval]:
         return intervals
 
     def pre_load_chunk(self, chunk_index: int, chunk_filepath: str) -> None:
-        """Logic to load the chunk in background to gain some time."""
+        """Preload the chunk in the background to gain some time."""
+        if not self._pre_load_chunk or self._low_memory:
+            return
+
         import polars as pl
 
-        if chunk_filepath not in self._df:
-            self._df[chunk_filepath] = pl.scan_parquet(chunk_filepath).collect()
+        if chunk_index not in self._df and os.path.exists(chunk_filepath):
+            self._df[chunk_index] = pl.scan_parquet(chunk_filepath, low_memory=True).collect()
 
     def load_item_from_chunk(
         self,
@@ -580,7 +600,7 @@ def load_item_from_chunk(
         begin: int,
         filesize_bytes: int,
     ) -> Any:
-        """Returns an item loaded from a chunk."""
+        """Returns an item loaded from a parquet chunk."""
         if chunk_filepath in self._chunk_filepaths and not os.path.isfile(chunk_filepath):
             del self._chunk_filepaths[chunk_filepath]
 
@@ -593,21 +613,112 @@ def load_item_from_chunk(
 
             self._chunk_filepaths[chunk_filepath] = True
 
-        return self.get_df(chunk_filepath).row(index - begin)
+        # relative index of the desired row within the chunk.
+        relative_index = index - begin
+        if self._low_memory:
+            return self._get_item_with_low_memory(chunk_index, chunk_filepath, relative_index)
 
-    def get_df(self, chunk_filepath: str) -> Any:
+        return self._get_item(chunk_index, chunk_filepath, relative_index)
+
+    def _get_item_with_low_memory(self, chunk_index: int, chunk_filepath: str, row_index: int) -> Any:
+        """Retrieve a dataframe row from a parquet chunk in low memory mode.
+
+        Only the row group that contains the requested row is read (with PyArrow) and converted to a
+        Polars DataFrame, which helps in reducing memory usage.
+
+        The row group is located through cumulative row offsets built from the file metadata instead of
+        assuming that every row group holds as many rows as the first one: Parquet allows row groups of
+        different sizes, and the last row group of a file is usually shorter than the others.
+
+        Args:
+            chunk_index (int): The index of the chunk to be accessed.
+            chunk_filepath (str): The file path of the parquet chunk.
+            row_index (int): The relative row index within the chunk.
+
+        Returns:
+            Any: The dataframe row corresponding to the specified index.
+
+        Raises:
+            IndexError: If `row_index` does not address a row of the chunk.
+        """
+        import bisect
+        from itertools import accumulate
+
+        import polars as pl
+        import pyarrow.parquet as pq
+
+        # Per-chunk cumulative row offsets. Created lazily so this method does not depend on any other
+        # method knowing about this extra cache.
+        offsets_by_chunk = getattr(self, "_chunk_row_group_offsets", None)
+        if offsets_by_chunk is None:
+            offsets_by_chunk = self._chunk_row_group_offsets = {}
+
+        # Load the Parquet file metadata if not already loaded
+        if chunk_index not in self._df:
+            self._df[chunk_index] = pq.ParquetFile(chunk_filepath)
+            # The file was (re)opened: never trust offsets computed for a previous handle.
+            offsets_by_chunk.pop(chunk_index, None)
+        parquet_file = self._df[chunk_index]
+
+        # offsets[i] is the chunk-relative index of the first row of row group i;
+        # offsets[-1] is the total number of rows in the chunk.
+        offsets = offsets_by_chunk.get(chunk_index)
+        if offsets is None:
+            metadata = parquet_file.metadata
+            offsets = [0, *accumulate(metadata.row_group(i).num_rows for i in range(metadata.num_row_groups))]
+            offsets_by_chunk[chunk_index] = offsets
+
+        total_rows = offsets[-1]
+        if not 0 <= row_index < total_rows:
+            raise IndexError(
+                f"Row index {row_index} is out of range for chunk {chunk_index} containing {total_rows} rows."
+            )
+
+        # Determine the row group and the row index within the row group.
+        # `bisect_right` also skips over empty row groups (repeated offsets).
+        row_group_index = bisect.bisect_right(offsets, row_index) - 1
+        row_group_start = offsets[row_group_index]
+        row_index_within_group = row_index - row_group_start
+        num_rows_in_row_group = offsets[row_group_index + 1] - row_group_start
+
+        cached_row_groups = self._chunk_row_groups.setdefault(chunk_index, {})
+        read_counts = self._chunk_row_group_item_read_count.setdefault(chunk_index, {})
+
+        # Use the cached row group when available, otherwise load it and convert it to a Polars DataFrame
+        row_group_df = cached_row_groups.get(row_group_index)
+        if row_group_df is None:
+            row_group_df = pl.from_arrow(parquet_file.read_row_group(row_group_index))
+            cached_row_groups[row_group_index] = row_group_df
+            read_counts[row_group_index] = 0
+        read_counts[row_group_index] = read_counts.get(row_group_index, 0) + 1
+
+        # Release memory once as many items as this row group holds have been read from it.
+        # The threshold is the size of *this* row group, so a shorter last row group is released too.
+        if read_counts[row_group_index] >= num_rows_in_row_group:
+            del cached_row_groups[row_group_index]
+            del read_counts[row_group_index]
+
+        # Return the specific row from the dataframe
+        return row_group_df.row(row_index_within_group)  # type: ignore
+
+    def _get_item(self, chunk_index: int, chunk_filepath: str, index: int) -> Any:
+        """Return one row of a parquet chunk, keeping the entire chunk loaded in memory.
+
+        On the first access to a chunk, the complete parquet file is read with Polars and the resulting
+        dataframe is stored under `chunk_index`; subsequent accesses reuse the stored dataframe.
+
+        Note:
+            The whole file is materialised, so large chunk files may significantly increase memory usage.
+
+        Args:
+            chunk_index (int): The index of the chunk to be accessed.
+            chunk_filepath (str): The file path of the parquet chunk.
+            index (int): The relative row index within the loaded chunk.
+
+        Returns:
+            Any: The dataframe row corresponding to the specified index.
+        """
+        import polars as pl
+
+        # Fast path: the chunk has already been read.
+        if chunk_index in self._df:
+            return self._df[chunk_index].row(index)
+
+        chunk_df = pl.scan_parquet(chunk_filepath, low_memory=True).collect()
+        self._df[chunk_index] = chunk_df
+        return chunk_df.row(index)
 
     def delete(self, chunk_index: int, chunk_filepath: str) -> None:
         """Delete a chunk from the local filesystem."""
+        if chunk_index in self._df:
+            del self._df[chunk_index]
+        if chunk_index in self._chunk_row_groups:
+            del self._chunk_row_groups[chunk_index]
+
+        if chunk_index in self._chunk_row_group_item_read_count:
+            del self._chunk_row_group_item_read_count[chunk_index]
         if os.path.exists(chunk_filepath):
             os.remove(chunk_filepath)
-        if chunk_filepath in self._df:
-            del self._df[chunk_filepath]
+
+    def close(self, chunk_index: int) -> None:
+        """Drop everything this loader holds in memory for the given chunk index.
+
+        Removes the chunk's entry from `_df`, its cached row groups and the matching per-row-group
+        read counters. Missing entries are ignored.
+        """
+        per_chunk_caches = (
+            self._df,
+            self._chunk_row_groups,
+            self._chunk_row_group_item_read_count,
+        )
+        for cache in per_chunk_caches:
+            cache.pop(chunk_index, None)
 
     def encode_data(self, data: List[bytes], sizes: List[int], flattened: List[Any]) -> Any:
         pass