|
6 | 6 | environment variable `HF_DATASETS_DISABLE_PROGRESS_BARS` to 1. |
7 | 7 | 3. To re-enable progress bars, use `enable_progress_bars()`. |
8 | 8 | 4. To check whether progress bars are disabled, use `are_progress_bars_disabled()`. |
| 9 | + 5. To emit machine-readable JSON progress, use `set_progress_format("json")`. |
9 | 10 |
|
10 | 11 | NOTE: Environment variable `HF_DATASETS_DISABLE_PROGRESS_BARS` takes priority. |
11 | 12 |
|
|
15 | 16 | are_progress_bars_disabled, |
16 | 17 | disable_progress_bars, |
17 | 18 | enable_progress_bars, |
| 19 | + set_progress_format, |
| 20 | + get_progress_format, |
18 | 21 | tqdm, |
19 | 22 | ) |
20 | 23 |
|
|
37 | 40 | # Progress bar will be shown ! |
38 | 41 | for _ in tqdm(range(5)): |
39 | 42 | do_something() |
| 43 | +
|
| 44 | + # Emit JSON progress (machine-readable) |
| 45 | + set_progress_format("json") |
| 46 | + for i in tqdm(range(100)): |
| 47 | + do_something() |
| 48 | + # Emits JSON lines to stderr (about every 5% and at completion), e.g. {"stage": "", "current": 100, "total": 100, "percent": 100.0} |
40 | 49 | ``` |
41 | 50 | """ |
42 | 51 |
|
| 52 | +import json |
| 53 | +import sys |
43 | 54 | import warnings |
44 | 55 |
|
45 | 56 | from tqdm.auto import tqdm as old_tqdm |
|
56 | 67 | # By default, progress bars are enabled. |
57 | 68 | _hf_datasets_progress_bars_disabled: bool = HF_DATASETS_DISABLE_PROGRESS_BARS or False |
58 | 69 |
|
| 70 | +# Progress format: "tqdm" (default), "json", or "silent" |
| 71 | +# Similar to huggingface/tokenizers#1921 |
| 72 | +_hf_datasets_progress_format: str = "tqdm" |
| 73 | + |
59 | 74 |
|
60 | 75 | def disable_progress_bars() -> None: |
61 | 76 | """ |
@@ -101,18 +116,86 @@ def are_progress_bars_disabled() -> bool: |
101 | 116 | return _hf_datasets_progress_bars_disabled |
102 | 117 |
|
103 | 118 |
|
| 119 | +def set_progress_format(format: str) -> None: |
| 120 | + """ |
| 121 | + Set the global progress format for `datasets`. |
| 122 | +
|
| 123 | + Similar to huggingface/tokenizers#1921 progress_format option. |
| 124 | +
|
| 125 | + Args: |
| 126 | + format: One of "tqdm" (default interactive bars), "json" (machine-readable JSON lines), or "silent" (no output). |
| 127 | +
|
| 128 | + Example: |
| 129 | + ```py |
| 130 | + from datasets.utils import set_progress_format, tqdm |
| 131 | +
|
| 132 | + # Enable JSON output for programmatic consumption |
| 133 | + set_progress_format("json") |
| 134 | +
|
| 135 | + for i in tqdm(range(100), desc="Processing"): |
| 136 | + do_something() |
| 137 | + # Emits JSON lines to stderr (about every 5% and at completion), e.g. {"stage": "Processing", "current": 100, "total": 100, "percent": 100.0} |
| 138 | + ``` |
| 139 | + """ |
| 140 | + if format not in ("tqdm", "json", "silent"): |
| 141 | + raise ValueError(f"Invalid progress format: {format}. Must be 'tqdm', 'json', or 'silent'.") |
| 142 | + global _hf_datasets_progress_format |
| 143 | + _hf_datasets_progress_format = format |
| 144 | + |
| 145 | + |
| 146 | +def get_progress_format() -> str: |
| 147 | + """ |
| 148 | + Get the current global progress format. |
| 149 | +
|
| 150 | + Returns: |
| 151 | + Current progress format ("tqdm", "json", or "silent"). |
| 152 | + """ |
| 153 | + global _hf_datasets_progress_format |
| 154 | + return _hf_datasets_progress_format |
| 155 | + |
| 156 | + |
104 | 157 | class tqdm(old_tqdm): |
105 | 158 | """ |
106 | | - Class to override `disable` argument in case progress bars are globally disabled. |
| 159 | + Class to override `disable` argument in case progress bars are globally disabled |
| 160 | + and to emit JSON progress when format is "json". |
107 | 161 |
|
108 | | - Taken from https://github.com/tqdm/tqdm/issues/619#issuecomment-619639324. |
| 162 | + Taken from https://github.com/tqdm/tqdm/issues/619#issuecomment-619639324 |
| 163 | + and enhanced with progress_format support (similar to huggingface/tokenizers#1921). |
109 | 164 | """ |
110 | 165 |
|
111 | 166 | def __init__(self, *args, **kwargs): |
112 | 167 | if are_progress_bars_disabled(): |
113 | 168 | kwargs["disable"] = True |
| 169 | + elif get_progress_format() == "silent": |
| 170 | + kwargs["disable"] = True |
| 171 | + elif get_progress_format() == "json": |
| 172 | + # Disable tqdm's visual output; JSON is emitted from update() instead. NOTE(review): confirm update() still advances self.n when disable=True. |
| 173 | + kwargs["disable"] = True |
| 174 | + |
114 | 175 | super().__init__(*args, **kwargs) |
115 | 176 |
|
| 177 | + # Store description for JSON output |
| 178 | + self._json_stage = kwargs.get("desc", "") |
| 179 | + self._last_json_percent = -1 |
| 180 | + |
| 181 | + def update(self, n=1): |
| 182 | + """Override update to emit JSON progress when format is 'json'.""" |
| 183 | + super().update(n) |
| 184 | + |
| 185 | + if get_progress_format() == "json" and self.total: |
| 186 | + current_percent = round((self.n / self.total) * 100, 1) if self.total > 0 else 0 |
| 187 | + |
| 188 | + # Emit JSON every 5% or at completion |
| 189 | + if current_percent - self._last_json_percent >= 5.0 or self.n == self.total: |
| 190 | + progress_data = { |
| 191 | + "stage": self._json_stage, |
| 192 | + "current": self.n, |
| 193 | + "total": self.total, |
| 194 | + "percent": current_percent |
| 195 | + } |
| 196 | + print(json.dumps(progress_data, ensure_ascii=False), file=sys.stderr, flush=True) |
| 197 | + self._last_json_percent = current_percent |
| 198 | + |
116 | 199 | def __delattr__(self, attr: str) -> None: |
117 | 200 | """Fix for https://github.com/huggingface/datasets/issues/6066""" |
118 | 201 | try: |
|
0 commit comments