Skip to content
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
69 changes: 69 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -1144,8 +1144,77 @@ class CustomEncryption(Encryption):
This allows the data to remain secure while maintaining flexibility in the encryption method.
</details>

<details>
<summary> ✅ Debug & Profile LitData with logs & Litracer</summary>

&nbsp;

LitData comes with built-in logging and profiling capabilities to help you debug and profile your data streaming workloads.

<img width="1439" alt="431247797-0e955e71-2f9a-4aad-b7c1-a8218fed2e2e" src="https://github.com/user-attachments/assets/4e40676c-ba0b-49af-acac-975977173669" />

- e.g., with LitData Streaming

```python
import litdata as ld
from litdata.debugger import enable_tracer

# WARNING: Remove existing trace `litdata_trace.json` file if it exists before re-tracing
# Enable LitData's debug logger (log file: `litdata_debug.log`) before any streaming starts.
enable_tracer()

if __name__ == "__main__":
    dataset = ld.StreamingDataset("s3://my-bucket/my-data", shuffle=True)
    dataloader = ld.StreamingDataLoader(dataset, batch_size=64)

    for batch in dataloader:
        print(batch)  # Replace with your data processing logic
```

1. Generate Debug Log:

- Run your Python program; it creates a log file (`litdata_debug.log` by default) containing detailed debug information.

```bash
python main.py
```

2. Install [Litracer](https://github.com/deependujha/litracer/):

- Option 1: Using Go (recommended)
- Install Go on your system.
- Run the following command to install Litracer:

```bash
go install github.com/deependujha/litracer@latest
```

- Option 2: Download Binary
- Visit the [Litracer GitHub Releases](https://github.com/deependujha/litracer/releases) page.
- Download the appropriate binary for your operating system and follow the installation instructions.

3. Convert Debug Log to trace JSON:

- Use litracer to convert the generated log file into a trace JSON file. This command uses 100 workers for conversion:

```bash
litracer litdata_debug.log -o litdata_trace.json -w 100
```

4. Visualize the trace:

- Use either `chrome://tracing` in the Chrome browser or `ui.perfetto.dev` to view the `litdata_trace.json` file for in-depth performance insights. You can also use `SQL queries` to analyze the logs.
- `Perfetto` is recommended over `chrome://tracing` for visualization and analysis.

- Key Points:

- For very large trace.json files (`> 2GB`), refer to the [Perfetto documentation](https://perfetto.dev/docs/visualization/large-traces) for using native accelerators.
- If you are trying to connect Perfetto to the RPC server, it is recommended to use Chrome over Brave, as it has been observed that Perfetto in Brave does not autodetect the RPC server.

</details>

&nbsp;


## Features for transforming datasets

<details>
Expand Down
6 changes: 3 additions & 3 deletions src/litdata/loggers.py → src/litdata/debugger.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
from functools import lru_cache
from typing import Tuple

from litdata.constants import _DEBUG, _PRINT_DEBUG_LOGS
from litdata.constants import _PRINT_DEBUG_LOGS
from litdata.utilities.env import _DistributedEnv, _WorkerEnv

# Create the root logger for the library
Expand All @@ -41,7 +41,7 @@ def __init__(self, name: str):
@staticmethod
def get_log_file_and_level() -> Tuple[str, int]:
log_file = os.getenv("LITDATA_LOG_FILE", "litdata_debug.log")
log_lvl = os.getenv("LITDATA_LOG_LEVEL", "INFO" if not _DEBUG else "DEBUG")
log_lvl = os.getenv("LITDATA_LOG_LEVEL", "DEBUG")

log_lvl = get_logger_level(log_lvl)

Expand Down Expand Up @@ -76,7 +76,7 @@ def setup_logger(self) -> None:
self.logger.addHandler(file_handler)


def configure_logger() -> None:
def enable_tracer() -> None:
os.environ["LITDATA_LOG_FILE"] = "litdata_debug.log"
LitDataLogger("litdata")

Expand Down
2 changes: 1 addition & 1 deletion src/litdata/streaming/combined.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@

from torch.utils.data import IterableDataset

from litdata.loggers import ChromeTraceColors, _get_log_msg
from litdata.debugger import ChromeTraceColors, _get_log_msg
from litdata.streaming.dataset import StreamingDataset
from litdata.utilities.env import _WorkerEnv

Expand Down
2 changes: 1 addition & 1 deletion src/litdata/streaming/compression.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
from typing import Dict, TypeVar

from litdata.constants import _ZSTD_AVAILABLE
from litdata.loggers import ChromeTraceColors, _get_log_msg
from litdata.debugger import ChromeTraceColors, _get_log_msg

TCompressor = TypeVar("TCompressor", bound="Compressor")

Expand Down
2 changes: 1 addition & 1 deletion src/litdata/streaming/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
from typing import Any, Dict, List, Optional, Tuple

from litdata.constants import _INDEX_FILENAME
from litdata.loggers import ChromeTraceColors, _get_log_msg
from litdata.debugger import ChromeTraceColors, _get_log_msg
from litdata.streaming.compression import _COMPRESSORS, Compressor
from litdata.streaming.downloader import get_downloader
from litdata.streaming.item_loader import BaseItemLoader, Interval, PyTreeLoader, TokensLoader
Expand Down
2 changes: 1 addition & 1 deletion src/litdata/streaming/dataloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@
from torch.utils.data.sampler import BatchSampler, Sampler

from litdata.constants import _DEFAULT_CHUNK_BYTES, _VIZ_TRACKER_AVAILABLE
from litdata.loggers import _get_log_msg
from litdata.debugger import _get_log_msg
from litdata.streaming import Cache
from litdata.streaming.combined import (
__NUM_SAMPLES_YIELDED_KEY__,
Expand Down
2 changes: 1 addition & 1 deletion src/litdata/streaming/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,8 @@

from litdata import __version__
from litdata.constants import _INDEX_FILENAME
from litdata.debugger import _get_log_msg
from litdata.helpers import _check_version_and_prompt_upgrade
from litdata.loggers import _get_log_msg
from litdata.streaming import Cache
from litdata.streaming.item_loader import BaseItemLoader, ParquetLoader
from litdata.streaming.resolver import Dir, _resolve_dir
Expand Down
2 changes: 1 addition & 1 deletion src/litdata/streaming/downloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@
_HF_HUB_AVAILABLE,
_INDEX_FILENAME,
)
from litdata.loggers import _get_log_msg
from litdata.debugger import _get_log_msg
from litdata.streaming.client import S3Client

logger = logging.getLogger("litdata.streaming.downloader")
Expand Down
2 changes: 1 addition & 1 deletion src/litdata/streaming/item_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@
_PYARROW_AVAILABLE,
_TORCH_DTYPES_MAPPING,
)
from litdata.loggers import ChromeTraceColors, _get_log_msg
from litdata.debugger import ChromeTraceColors, _get_log_msg
from litdata.streaming.serializers import Serializer
from litdata.utilities._pytree import PyTree, tree_unflatten
from litdata.utilities.encryption import Encryption, EncryptionLevel
Expand Down
2 changes: 1 addition & 1 deletion src/litdata/streaming/reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
from filelock import FileLock, Timeout

from litdata.constants import _DEBUG
from litdata.loggers import _get_log_msg
from litdata.debugger import _get_log_msg
from litdata.streaming.config import ChunksConfig, Interval
from litdata.streaming.item_loader import BaseItemLoader, ParquetLoader, PyTreeLoader, TokensLoader
from litdata.streaming.sampler import ChunkedIndex
Expand Down
2 changes: 1 addition & 1 deletion tests/test_loggers.py → tests/test_debugger.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@


def test_get_logger_level():
from litdata.loggers import get_logger_level
from litdata.debugger import get_logger_level

assert get_logger_level("DEBUG") == logging.DEBUG
assert get_logger_level("INFO") == logging.INFO
Expand Down
Loading