-
Notifications
You must be signed in to change notification settings - Fork 4
Expand file tree
/
Copy patheval.py
More file actions
153 lines (131 loc) · 4.06 KB
/
eval.py
File metadata and controls
153 lines (131 loc) · 4.06 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
#!/usr/bin/env python3
"""
PRGB - Main Evaluation Script
This script provides the main entry point for RAG system evaluation.
"""
import argparse
import json
import sys
from pathlib import Path
# Add the project root (this file's directory) to Python path so the local
# ``core`` package imports below resolve regardless of the current working
# directory the script is launched from.
sys.path.insert(0, str(Path(__file__).parent))
# Project-local imports — must come after the sys.path insertion above.
from core import get_eval
from core.logger import get_logger, set_verbose
# Module-level logger shared by main(); verbosity is raised via --verbose.
logger = get_logger()
def main():
"""Main function for RAG evaluation."""
parser = argparse.ArgumentParser(
description="PRGB - RAG System Evaluation Tool",
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog="""
Examples:
# Basic evaluation with Qwen3 model
python eval.py --model-name "Qwen3" --model-path "/path/to/model" --data-path "data/test.jsonl"
# Evaluation with custom noise configuration
python eval.py --model-name "Qwen3" --noise-config '{"noise_doc_level1":4,"noise_doc_level2":4,"noise_doc_level3":1}'
# Batch evaluation with specific parameters
python eval.py --model-name "Qwen3" --batch-size 32 --temperature 0.8 --shuffle True
"""
)
# Model configuration
parser.add_argument(
"--api-key", type=str, default=None, help="api key of chatgpt"
)
parser.add_argument(
"--model-name",
type=str,
default="Qwen3",
help="Name of the model to evaluate"
)
parser.add_argument(
"--inference-mode", type=bool, default=False, help="whether inference model or not"
)
parser.add_argument(
"--model-path",
type=str,
required=True,
help="Path to the model or API key"
)
# Data configuration
parser.add_argument(
"--data-path",
type=str,
default="tests/test.jsonl",
help="Path to the evaluation dataset"
)
parser.add_argument(
"--num-iterations",
type=int,
default=3,
help="Number of evaluation iterations. For each query, randomly select n different placeholders to run evaluation. Each placeholder represents a different version of the same query with different variable substitutions."
)
# Output configuration
parser.add_argument(
"--output-path",
type=str,
default="./results",
help="Output directory for results"
)
# Evaluation parameters
parser.add_argument(
"--noise-config",
type=str,
default='{"noise_doc_level1":4,"noise_doc_level2":4,"noise_doc_level3":1}',
help="Noise configuration as JSON string"
)
parser.add_argument(
"--shuffle",
type=bool,
default=True,
help="Whether to shuffle the data"
)
parser.add_argument(
"--batch-size",
type=int,
default=16,
help="Batch size for evaluation"
)
parser.add_argument(
"--temperature",
type=float,
default=0.7,
help="Temperature for text generation"
)
parser.add_argument(
"--custom_config",
type=str,
default=None,
help="custom prompt config path",
)
# Additional options
parser.add_argument(
"--verbose",
action="store_true",
help="Enable verbose logging"
)
args = parser.parse_args()
# Set logging level
if args.verbose:
set_verbose(True)
# Validate arguments
try:
noise_config = json.loads(args.noise_config)
except json.JSONDecodeError:
logger.error("Invalid noise_config JSON format")
sys.exit(1)
# Create output directory if it doesn't exist
output_path = Path(args.output_path)
output_path.mkdir(parents=True, exist_ok=True)
logger.info(f"Starting evaluation with model: {args.model_name}")
logger.info(f"Data path: {args.data_path}")
logger.info(f"Output path: {args.output_path}")
logger.info(f"Noise config: {noise_config}")
try:
# Run evaluation
get_eval(args)
logger.info("Evaluation completed successfully!")
except Exception as e:
logger.error(f"Evaluation failed: {e}")
sys.exit(1)
if __name__ == "__main__":
main()