#!/usr/bin/env python3
"""
RAGQA - Main Evaluation Script

This script provides the main entry point for RAG system evaluation.
"""

import argparse
import json
import sys
from pathlib import Path

# Add the project root to Python path
sys.path.insert(0, str(Path(__file__).parent))

from src import SUPPORTED_DATASET, get_eval
from src.data import EvalResult
from src.logger import get_logger, set_verbose
from src.report import Runner

logger = get_logger()
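

# argparse's `type=bool` treats any non-empty string (including "False") as
# True, so the boolean flags below ("--shuffle", "--inference-mode") parse
# their values with this helper instead. A minimal sketch; the accepted
# spellings are an assumption and can be adjusted.
def str2bool(value):
    """Parse a boolean command-line value such as "True" or "false"."""
    if isinstance(value, bool):
        return value
    if value.lower() in ("true", "1", "yes", "y"):
        return True
    if value.lower() in ("false", "0", "no", "n"):
        return False
    raise argparse.ArgumentTypeError(f"Boolean value expected, got {value!r}")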


def main():
    """Main function for RAG evaluation."""
    parser = argparse.ArgumentParser(
        description="PRGB - RAG System Evaluation Tool",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Basic evaluation with Qwen3 model
  python eval.py --model-name "Qwen3" --model-path "/path/to/model" --data-path "data/test.jsonl"

  # Evaluation with custom noise configuration
  python eval.py --model-name "Qwen3" --noise-config '{"noise_doc_level1":4,"noise_doc_level2":4,"noise_doc_level3":1}'

  # Batch evaluation with specific parameters
  python eval.py --model-name "Qwen3" --batch-size 32 --temperature 0.8 --shuffle True
""",
    )

    # Model configuration
    parser.add_argument("--api-key", type=str, default=None, help="API key for the ChatGPT API")
    parser.add_argument(
        "--total_doc_number",
        type=int,
        default=30,
        help="Total number of documents to use during inference",
    )
    parser.add_argument(
        "--model-name", type=str, default="Qwen3", help="Name of the model to evaluate"
    )
    parser.add_argument(
        "--inference-mode",
        type=str2bool,
        default=False,
        help="Whether to run the model in inference mode",
    )
    parser.add_argument(
        "--model-path", type=str, required=True, help="Path to the model or API URL"
    )

    # Data configuration
    parser.add_argument(
        "--eval-dataset",
        nargs="+",
        choices=SUPPORTED_DATASET,
        default=SUPPORTED_DATASET,
        help="Which datasets to evaluate; one or more of the supported names (e.g., --eval-dataset a c)",
    )

    # Output configuration
    parser.add_argument(
        "--output-path",
        type=str,
        default="./results",
        help="Output directory for results",
    )

    # Evaluation parameters
    parser.add_argument(
        "--shuffle", type=str2bool, default=True, help="Whether to shuffle the data"
    )
    parser.add_argument(
        "--batch-size", type=int, default=5, help="Batch size for evaluation"
    )
    parser.add_argument(
        "--temperature", type=float, default=0.7, help="Temperature for text generation"
    )
    parser.add_argument(
        "--custom_config",
        type=str,
        default=None,
        help="Path to a custom prompt config file",
    )

    # Additional options
    parser.add_argument("--verbose", action="store_true", help="Enable verbose logging")

    args = parser.parse_args()

    # Set logging level
    if args.verbose:
        set_verbose(True)

    # Create output directory if it doesn't exist
    output_path = Path(args.output_path)
    output_path.mkdir(parents=True, exist_ok=True)

    logger.info(f"Starting evaluation with model: {args.model_name}")
    logger.info(f"Eval dataset: {args.eval_dataset}")
    logger.info(f"Output path: {args.output_path}")

    print("🚀 RAGQA Evaluation System")
    print("=" * 50)

    # Create runner, writing reports into the requested output directory
    runner = Runner(output_dir=args.output_path)

    # Create sample results
    # sample_results = create_sample_results()
    results = get_eval(args)

    print("\n📊 Running evaluation for all datasets...")
    # Run evaluation for all datasets
    runner.run_all(results)

    # Print summary
    runner.print_summary()

    print("\n📝 Generating HTML report...")
    # Generate HTML report
    html_path = runner.generate_html_report()
    print(f"HTML report generated: {html_path}")

    print("\n💾 Saving JSON results...")
    # Save JSON results
    json_path = runner.save_json_results()
    print(f"JSON results saved: {json_path}")

    print("\n✅ Evaluation completed successfully!")
    print(f"📁 Check the '{args.output_path}' directory for output files")

    # try:
    #     # Run evaluation
    #     get_eval(args)
    #     logger.info("Evaluation completed successfully!")
    # except Exception as e:
    #     logger.error(f"Evaluation failed: {e}")
    #     sys.exit(1)


if __name__ == "__main__":
    main()