Skip to content

Commit baebcfb

Browse files
committed
Add progress_format support to datasets.utils.tqdm
Similar to huggingface/tokenizers#1921, adds machine-readable JSON progress output. - Add set_progress_format() and get_progress_format() functions - Support 'tqdm' (default), 'json', and 'silent' formats - Emit JSON progress every 5% when format='json' - Export new functions from datasets.utils Cross-reference: huggingface/tokenizers#1921
1 parent 0feb65d commit baebcfb

File tree

2 files changed

+87
-2
lines changed

2 files changed

+87
-2
lines changed

src/datasets/utils/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,8 @@
2020
are_progress_bars_disabled,
2121
disable_progress_bars,
2222
enable_progress_bars,
23+
get_progress_format,
24+
set_progress_format,
2325
tqdm,
2426
)
2527
from .version import Version

src/datasets/utils/tqdm.py

Lines changed: 85 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
environment variable `HF_DATASETS_DISABLE_PROGRESS_BARS` to 1.
77
3. To re-enable progress bars, use `enable_progress_bars()`.
88
4. To check whether progress bars are disabled, use `are_progress_bars_disabled()`.
9+
5. To emit machine-readable JSON progress, use `set_progress_format("json")`.
910
1011
NOTE: Environment variable `HF_DATASETS_DISABLE_PROGRESS_BARS` has the priority.
1112
@@ -15,6 +16,8 @@
1516
are_progress_bars_disabled,
1617
disable_progress_bars,
1718
enable_progress_bars,
19+
set_progress_format,
20+
get_progress_format,
1821
tqdm,
1922
)
2023
@@ -37,9 +40,17 @@
3740
# Progress bar will be shown !
3841
for _ in tqdm(range(5)):
3942
do_something()
43+
44+
# Emit JSON progress (machine-readable)
45+
set_progress_format("json")
46+
for i in tqdm(range(100)):
47+
do_something()
48+
# Emits JSON lines on stderr at every 5% step, e.g.: {"stage":"","current":50,"total":100,"percent":50.0}
4049
```
4150
"""
4251

52+
import json
53+
import sys
4354
import warnings
4455

4556
from tqdm.auto import tqdm as old_tqdm
@@ -56,6 +67,10 @@
5667
# By default, progress bars are enabled.
5768
_hf_datasets_progress_bars_disabled: bool = HF_DATASETS_DISABLE_PROGRESS_BARS or False
5869

70+
# Progress format: "tqdm" (default), "json", or "silent"
71+
# Similar to huggingface/tokenizers#1921
72+
_hf_datasets_progress_format: str = "tqdm"
73+
5974

6075
def disable_progress_bars() -> None:
6176
"""
@@ -101,18 +116,86 @@ def are_progress_bars_disabled() -> bool:
101116
return _hf_datasets_progress_bars_disabled
102117

103118

119+
def set_progress_format(format: str) -> None:
    """
    Set the global progress format for `datasets`.

    Similar to the `progress_format` option of huggingface/tokenizers#1921.

    Args:
        format: One of "tqdm" (default interactive bars), "json" (machine-readable
            JSON lines), or "silent" (no output).

    Raises:
        ValueError: If `format` is not one of the supported values. The module-level
            setting is left untouched in that case.

    Example:
    ```py
    from datasets.utils import set_progress_format, tqdm

    # Enable JSON output for programmatic consumption
    set_progress_format("json")

    for i in tqdm(range(100), desc="Processing"):
        do_something()
    # Outputs: {"stage":"Processing","current":50,"total":100,"percent":50.0}
    ```
    """
    global _hf_datasets_progress_format
    if format not in ("tqdm", "json", "silent"):
        # `!r` keeps empty strings, stray whitespace and non-string values visible in the message.
        raise ValueError(f"Invalid progress format: {format!r}. Must be 'tqdm', 'json', or 'silent'.")
    _hf_datasets_progress_format = format
144+
145+
146+
def get_progress_format() -> str:
    """
    Get the current global progress format.

    Returns:
        Current progress format ("tqdm", "json", or "silent").
    """
    # Reading a module-level name needs no `global` declaration.
    return _hf_datasets_progress_format
155+
156+
104157
class tqdm(old_tqdm):
105158
"""
106-
Class to override `disable` argument in case progress bars are globally disabled.
159+
Class to override `disable` argument in case progress bars are globally disabled
160+
and to emit JSON progress when format is "json".
107161
108-
Taken from https://github.com/tqdm/tqdm/issues/619#issuecomment-619639324.
162+
Taken from https://github.com/tqdm/tqdm/issues/619#issuecomment-619639324
163+
and enhanced with progress_format support (similar to huggingface/tokenizers#1921).
109164
"""
110165

111166
def __init__(self, *args, **kwargs):
    """
    Create the progress bar, silencing tqdm's own rendering when required.

    The visual bar is disabled when progress bars are globally disabled or when
    the global progress format is "silent" or "json". A bar created while the
    format is "json" reports its progress as JSON lines on stderr instead.

    Args:
        *args: Positional arguments forwarded unchanged to the parent tqdm class.
        **kwargs: Keyword arguments forwarded to the parent tqdm class; `disable`
            is forced to `True` in the cases listed above.
    """
    json_mode = get_progress_format() == "json"
    if are_progress_bars_disabled() or json_mode or get_progress_format() == "silent":
        kwargs["disable"] = True

    super().__init__(*args, **kwargs)

    # `desc` is tqdm's second positional parameter, so look there as well.
    stage = kwargs.get("desc")
    if stage is None and len(args) > 1:
        stage = args[1]
    self._json_stage = stage or ""
    self._json_enabled = json_mode
    # A disabled tqdm bar does not advance `self.n`, so JSON reporting keeps its own counter.
    self._json_current = self.n
    # Index of the last 5% step already reported (0 means nothing reported yet).
    self._json_last_step = 0

def __iter__(self):
    """
    Iterate over the wrapped iterable, counting each item for JSON progress.

    A disabled bar's parent `__iter__` yields items without calling `update()`,
    so items are counted here to make `for x in tqdm(...)` report JSON progress.
    """
    for item in super().__iter__():
        yield item
        # Count after the consumer has finished with the item, as tqdm itself does.
        self._report_json_progress(1)

def update(self, n=1):
    """
    Advance the bar by `n` and emit JSON progress when format is 'json'.

    Args:
        n: Increment to add to the progress counter.

    Returns:
        Whatever the parent `update()` returns.
    """
    displayed = super().update(n)
    self._report_json_progress(n)
    return displayed

def _report_json_progress(self, n):
    """Advance the JSON counter by `n` and print one JSON line per new 5% step."""
    # `getattr` guards against calls made before `__init__` has set the flag.
    if not getattr(self, "_json_enabled", False) or get_progress_format() != "json":
        return

    self._json_current += n
    total = self.total
    if not total or total <= 0:
        # Without a known positive total there is no percentage to report.
        return

    # Multiply before dividing so that exact 5% boundaries stay exact for integer counts.
    percent = self._json_current * 100 / total
    step = int(percent // 5)
    if step <= self._json_last_step:
        return
    self._json_last_step = step

    progress_data = {
        "stage": self._json_stage,
        "current": self._json_current,
        "total": total,
        "percent": round(percent, 1),
    }
    # Compact separators give the documented form: {"stage":"","current":50,...}
    print(
        json.dumps(progress_data, ensure_ascii=False, separators=(",", ":")),
        file=sys.stderr,
        flush=True,
    )
198+
116199
def __delattr__(self, attr: str) -> None:
117200
"""Fix for https://github.com/huggingface/datasets/issues/6066"""
118201
try:

0 commit comments

Comments
 (0)