@@ -210,13 +210,16 @@ def main():
 
     >>> python megatron_preprocess_data.py \
             --dataset "nvidia/Nemotron-Pretraining-Dataset-sample" \
-            --tokenizer "nvidia/Nemotron-Pretraining-Tokenizer" \
+            --tokenizer "meta-llama/Llama-3.2-1B-Instruct" \
             --output_dir "./processed_data"
     """
     parser = argparse.ArgumentParser(prog="megatron_preprocess_data")
     parser.add_argument("--input_path", type=str, default=None, help="Input path.")
     parser.add_argument(
-        "--dataset", type=str, default=None, help="Hugging Face Hub dataset name or path"
+        "--dataset",
+        type=str,
+        default="nvidia/Nemotron-Pretraining-Dataset-sample",
+        help="Hugging Face Hub dataset name or path",
     )
     parser.add_argument("--subset", type=str, default=None, help="Hugging Face Hub dataset subset")
     parser.add_argument("--split", type=str, default="train", help="Hugging Face Hub dataset split")
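With this change --dataset gains a working default (the Nemotron sample set), so a minimal invocation only needs the required --tokenizer flag. A sketch; --output_dir is passed explicitly here since its default is not visible in this diff:

    python megatron_preprocess_data.py \
        --tokenizer "meta-llama/Llama-3.2-1B-Instruct" \
        --output_dir "./processed_data"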
@@ -225,7 +228,7 @@ def main():
     )
     parser.add_argument("--tokenizer", type=str, required=True, help="Tokenizer name or path")
     parser.add_argument("--json_keys", nargs="+", default=["text"], help="JSON keys to tokenize")
-    parser.add_argument("--append_eod", type=bool, default=False, help="Append <eod> token")
+    parser.add_argument("--append_eod", action="store_true", help="Append <eod> token")
     parser.add_argument(
         "--max_sequence_length", type=int, default=None, help="Maximum sequence length"
     )
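The append_eod change fixes a classic argparse pitfall rather than just restyling the flag: with type=bool, argparse feeds the raw command-line string through bool(), and any non-empty string is truthy, so --append_eod False would silently enable the option. action="store_true" turns it into a proper presence flag. A self-contained repro of the difference:

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--buggy", type=bool, default=False)  # old pattern
    parser.add_argument("--fixed", action="store_true")       # new pattern

    # bool("False") is True because the string is non-empty.
    print(parser.parse_args(["--buggy", "False"]).buggy)  # True (!)
    print(parser.parse_args([]).fixed)                    # False
    print(parser.parse_args(["--fixed"]).fixed)           # True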
@@ -235,8 +238,6 @@ def main():
 
     if args.input_path is None:
         args.input_path = []
-    if args.dataset is None:
-        args.dataset = "nvidia/Nemotron-Pretraining-Dataset-sample"
 
     response = requests.get(
         "https://datasets-server.huggingface.co/splits?dataset={}".format(args.dataset),
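For context on what the loop below iterates over: the datasets-server /splits endpoint responds with a JSON object whose "splits" list carries one entry per (dataset, config, split) combination. A sketch of the expected shape, with illustrative values:

    {
      "splits": [
        {"dataset": "nvidia/Nemotron-Pretraining-Dataset-sample",
         "config": "default",
         "split": "train"}
      ]
    }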
@@ -250,9 +251,9 @@ def main():
 
         split = entry["split"]
 
         if args.subset is not None and args.subset != subset:
-            continue
+            skip_processing = True
         if args.split is not None and args.split != split:
-            continue
+            skip_processing = True
 
         print(f"Loading dataset {name} with subset {subset} and split {split}")
         dataset = load_dataset(name, subset, split=split)
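One caveat with this hunk in isolation: skip_processing is assigned but never initialized or consulted here, so as shown the loop still falls through to load_dataset for filtered-out entries. Unless the flag is reset and checked elsewhere in the file (not visible in this diff), preserving the old continue semantics needs something along these lines. A sketch; entry["dataset"] and entry["config"] are assumed field names, since only entry["split"] appears above:

    for entry in response.json()["splits"]:
        name = entry["dataset"]   # assumed; not visible in this hunk
        subset = entry["config"]  # assumed; not visible in this hunk
        split = entry["split"]
        skip_processing = False   # reset per entry so a stale True cannot leak

        if args.subset is not None and args.subset != subset:
            skip_processing = True
        if args.split is not None and args.split != split:
            skip_processing = True
        if skip_processing:
            continue  # matches the behavior of the removed early continue

        print(f"Loading dataset {name} with subset {subset} and split {split}")
        dataset = load_dataset(name, subset, split=split)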