@@ -541,51 +541,3 @@ def make_retrieval_dataset(
541541 logging .info (f"Created { data_type } dataset with { len (dataset )} examples" )
542542
543543 return dataset
544-
545-
if __name__ == "__main__":
    # Command-line entry point: build a retrieval dataset from the given JSON
    # file(s) via make_retrieval_dataset() and print a short summary together
    # with a truncated preview of the first example.
    import argparse

    parser = argparse.ArgumentParser(description="Load and transform dataset to retrieval format")
    parser.add_argument(
        "--data_dir_list", type=str, nargs="+", required=True, help="Path(s) to JSON file(s) containing training data"
    )
    parser.add_argument(
        "--data_type", type=str, default="train", choices=["train", "eval"], help="Type of data (train or eval)"
    )
    parser.add_argument(
        "--train_n_passages", type=int, default=5, help="Number of passages for training (1 positive + n-1 negatives)"
    )
    parser.add_argument(
        "--eval_negative_size", type=int, default=10, help="Number of negative documents for evaluation"
    )
    parser.add_argument("--seed", type=int, default=42, help="Random seed for reproducibility")
    parser.add_argument("--do_shuffle", action="store_true", help="Whether to shuffle the dataset")
    parser.add_argument("--max_train_samples", type=int, default=None, help="Maximum number of training samples")

    args = parser.parse_args()

    # Configure logging before building the dataset so that INFO-level
    # messages emitted while loading are visible on the console.
    logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

    dataset = make_retrieval_dataset(
        data_dir_list=args.data_dir_list,
        data_type=args.data_type,
        train_n_passages=args.train_n_passages,
        eval_negative_size=args.eval_negative_size,
        seed=args.seed,
        do_shuffle=args.do_shuffle,
        max_train_samples=args.max_train_samples,
    )

    separator = "=" * 60
    print(f"\n{separator}")
    print(f"Dataset loading completed successfully! (mode: {args.data_type})")
    print(separator)
    print(f"Dataset size: {len(dataset)}")

    if len(dataset) == 0:
        # Indexing dataset[0] on an empty dataset would raise IndexError, so
        # report the situation instead of crashing after a "success" banner.
        print("\nDataset is empty; no sample example to display.")
    else:
        print("\nSample example:")
        example = dataset[0]
        # NOTE(review): the labels below assume doc_text[0] is the positive
        # passage and the rest are negatives -- confirm against
        # make_retrieval_dataset().
        documents = example["doc_text"]
        print(f"Question: {example['question'][:100]}...")
        print(f"Num documents: {len(documents)}")
        # Guard each positional access: an example may carry no documents.
        if documents:
            print(f"Positive doc: {documents[0][:100] if documents[0] else '(empty)'}...")
        if len(documents) > 1:
            print(f"First negative: {documents[1][:100] if documents[1] else '(empty)'}...")
    print(f"{separator}\n")
0 commit comments