@@ -210,13 +210,16 @@ def main():
 
     >>> python megatron_preprocess_data.py \
         --dataset "nvidia/Nemotron-Pretraining-Dataset-sample" \
-        --tokenizer "nvidia/Nemotron-Pretraining-Tokenizer" \
+        --tokenizer "meta-llama/Llama-3.2-1B-Instruct" \
         --output_dir "./processed_data"
     """
     parser = argparse.ArgumentParser(prog="megatron_preprocess_data")
     parser.add_argument("--input_path", type=str, default=None, help="Input path.")
     parser.add_argument(
-        "--dataset", type=str, default=None, help="Hugging Face Hub dataset name or path"
+        "--dataset",
+        type=str,
+        default="nvidia/Nemotron-Pretraining-Dataset-sample",
+        help="Hugging Face Hub dataset name or path",
     )
     parser.add_argument("--subset", type=str, default=None, help="Hugging Face Hub dataset subset")
     parser.add_argument("--split", type=str, default="train", help="Hugging Face Hub dataset split")
@@ -225,7 +228,7 @@ def main():
     )
     parser.add_argument("--tokenizer", type=str, required=True, help="Tokenizer name or path")
     parser.add_argument("--json_keys", nargs="+", default=["text"], help="JSON keys to tokenize")
-    parser.add_argument("--append_eod", type=bool, default=False, help="Append <eod> token")
+    parser.add_argument("--append_eod", action="store_true", help="Append <eod> token")
     parser.add_argument(
         "--max_sequence_length", type=int, default=None, help="Maximum sequence length"
     )
@@ -235,8 +238,6 @@ def main():
 
     if args.input_path is None:
         args.input_path = []
-        if args.dataset is None:
-            args.dataset = "nvidia/Nemotron-Pretraining-Dataset-sample"
 
         response = requests.get(
             "https://datasets-server.huggingface.co/splits?dataset={}".format(args.dataset),
@@ -250,9 +251,9 @@ def main():
             split = entry["split"]
 
             if args.subset is not None and args.subset != subset:
-                continue
+                skip_processing = True
             if args.split is not None and args.split != split:
-                continue
+                skip_processing = True
 
             print(f"Loading dataset {name} with subset {subset} and split {split}")
             dataset = load_dataset(name, subset, split=split)