 ```
 """

+import argparse
 import json
 import multiprocessing
 import sys
 from pathlib import Path

+import requests
+from datasets import load_dataset
 from megatron.core.datasets import indexed_dataset
 from transformers import AutoTokenizer

@@ -198,3 +201,92 @@ def megatron_preprocess_data(
         final_enc_len += num_tokens

     print(f">>> Total number of tokens: {final_enc_len}")
+
+
+def main():
+    """Sample main function to process large data for pretraining.
+
+    Example usage:
+
+    >>> python megatron_preprocess_data.py \
+        --dataset "nvidia/Nemotron-Pretraining-Dataset-sample" \
+        --tokenizer "meta-llama/Llama-3.2-1B-Instruct" \
+        --output_dir "./processed_data"
+    """
+    parser = argparse.ArgumentParser(prog="megatron_preprocess_data")
+    parser.add_argument("--input_path", type=str, default=None, help="Input path.")
+    parser.add_argument(
+        "--dataset",
+        type=str,
+        default="nvidia/Nemotron-Pretraining-Dataset-sample",
+        help="Hugging Face Hub dataset name or path",
+    )
+    parser.add_argument("--subset", type=str, default=None, help="Hugging Face Hub dataset subset")
+    parser.add_argument("--split", type=str, default="train", help="Hugging Face Hub dataset split")
+    parser.add_argument(
+        "--output_dir", type=str, default="./processed_data", help="Output directory"
+    )
+    parser.add_argument("--tokenizer", type=str, required=True, help="Tokenizer name or path")
+    parser.add_argument("--json_keys", nargs="+", default=["text"], help="JSON keys to tokenize")
+    parser.add_argument("--append_eod", action="store_true", help="Append <eod> token")
+    parser.add_argument(
+        "--max_sequence_length", type=int, default=None, help="Maximum sequence length"
+    )
+    parser.add_argument("--workers", type=int, default=8, help="Number of worker processes")
+    parser.add_argument("--log_interval", type=int, default=1000, help="Log interval")
+    args = parser.parse_args()
+
+    if args.input_path is None:
+        args.input_path = []
+
+        try:
+            response = requests.get(
+                f"https://datasets-server.huggingface.co/splits?dataset={args.dataset}",
+                timeout=10,
+            )
+            response.raise_for_status()
+        except requests.RequestException as e:
+            print(f"Failed to fetch dataset splits for {args.dataset}: {e}")
+            return
+
+        for entry in response.json()["splits"]:
+            skip_processing = False
+            name = entry["dataset"]
+            subset = entry.get("config", None)
+            split = entry["split"]
+
+            if args.subset is not None and args.subset != subset:
+                skip_processing = True
+            if args.split is not None and args.split != split:
+                skip_processing = True
+
+            print(f"Loading dataset {name} with subset {subset} and split {split}")
+            dataset = load_dataset(name, subset, split=split)
+
+            for key in args.json_keys:
+                if key not in dataset.features:
+                    print(f"Key {key} not found in dataset features. Skipping...")
+                    skip_processing = True
+                    break
+
+            if skip_processing:
+                continue
+
+            json_file_path = args.output_dir + "/" + name + "_" + subset + "_" + split + ".jsonl"
+            dataset.to_json(json_file_path)
+            args.input_path += [json_file_path]
+
+    megatron_preprocess_data(
+        input_path=args.input_path,
+        output_dir=args.output_dir,
+        tokenizer_name_or_path=args.tokenizer,
+        json_keys=args.json_keys,
+        append_eod=args.append_eod,
+        max_sequence_length=args.max_sequence_length,
+        workers=args.workers,
+        log_interval=args.log_interval,
+    )
+
+
+if __name__ == "__main__":
+    main()
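
For context on the loop in `main()`: it assumes the Hugging Face datasets-server `/splits` endpoint returns a JSON object with a top-level `"splits"` list whose entries carry `dataset`, `config`, and `split` fields. A minimal standalone sketch of that lookup, using the script's default dataset name purely for illustration:

```python
import requests

# Ask the datasets-server which configs/splits a Hub dataset exposes.
# The dataset name below is the script's default and is used only as an example.
dataset = "nvidia/Nemotron-Pretraining-Dataset-sample"
response = requests.get(
    f"https://datasets-server.huggingface.co/splits?dataset={dataset}", timeout=10
)
response.raise_for_status()

# Expected shape (abridged):
# {"splits": [{"dataset": "...", "config": "...", "split": "train"}, ...], ...}
for entry in response.json()["splits"]:
    print(entry["dataset"], entry.get("config"), entry["split"])
```

Roughly, each `(config, split)` pair that survives the `--subset`/`--split` filters becomes one `.jsonl` file exported via `dataset.to_json()` and then passed to `megatron_preprocess_data()`.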