Skip to content

Commit d18d802

Browse files
committed
remove scripty code
Signed-off-by: Alexandros Koumparoulis <akoumparouli@nvidia.com>
1 parent a9c7d52 commit d18d802

File tree

1 file changed

+0
-48
lines changed

1 file changed

+0
-48
lines changed

nemo_automodel/components/datasets/llm/retrieval_dataset_inline.py

Lines changed: 0 additions & 48 deletions
Original file line number | Diff line number | Diff line change
@@ -541,51 +541,3 @@ def make_retrieval_dataset(
541 541
logging.info(f"Created {data_type} dataset with {len(dataset)} examples")
542 542

543 543
return dataset
544-
545-
546-
if __name__ == "__main__":
547-
import argparse
548-
549-
parser = argparse.ArgumentParser(description="Load and transform dataset to retrieval format")
550-
parser.add_argument(
551-
"--data_dir_list", type=str, nargs="+", required=True, help="Path(s) to JSON file(s) containing training data"
552-
)
553-
parser.add_argument(
554-
"--data_type", type=str, default="train", choices=["train", "eval"], help="Type of data (train or eval)"
555-
)
556-
parser.add_argument(
557-
"--train_n_passages", type=int, default=5, help="Number of passages for training (1 positive + n-1 negatives)"
558-
)
559-
parser.add_argument(
560-
"--eval_negative_size", type=int, default=10, help="Number of negative documents for evaluation"
561-
)
562-
parser.add_argument("--seed", type=int, default=42, help="Random seed for reproducibility")
563-
parser.add_argument("--do_shuffle", action="store_true", help="Whether to shuffle the dataset")
564-
parser.add_argument("--max_train_samples", type=int, default=None, help="Maximum number of training samples")
565-
566-
args = parser.parse_args()
567-
568-
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
569-
570-
dataset = make_retrieval_dataset(
571-
data_dir_list=args.data_dir_list,
572-
data_type=args.data_type,
573-
train_n_passages=args.train_n_passages,
574-
eval_negative_size=args.eval_negative_size,
575-
seed=args.seed,
576-
do_shuffle=args.do_shuffle,
577-
max_train_samples=args.max_train_samples,
578-
)
579-
580-
print(f"\n{'=' * 60}")
581-
print(f"Dataset loading completed successfully! (mode: {args.data_type})")
582-
print(f"{'=' * 60}")
583-
print(f"Dataset size: {len(dataset)}")
584-
print("\nSample example:")
585-
example = dataset[0]
586-
print(f"Question: {example['question'][:100]}...")
587-
print(f"Num documents: {len(example['doc_text'])}")
588-
print(f"Positive doc: {example['doc_text'][0][:100] if example['doc_text'][0] else '(empty)'}...")
589-
if len(example["doc_text"]) > 1:
590-
print(f"First negative: {example['doc_text'][1][:100] if example['doc_text'][1] else '(empty)'}...")
591-
print(f"{'=' * 60}\n")

0 commit comments

Comments (0)