-
Notifications
You must be signed in to change notification settings - Fork 4
Expand file tree
/
Copy patheval.py
More file actions
153 lines (131 loc) · 4.06 KB
/
eval.py
File metadata and controls
153 lines (131 loc) · 4.06 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
#!/usr/bin/env python3
"""
PRGB - Main Evaluation Script
This script provides the main entry point for RAG system evaluation.
"""
import argparse
import json
import sys
from pathlib import Path
# Add the project root (this file's directory) to Python path so the local
# ``core`` package imports below resolve regardless of the current working
# directory the script is launched from.
sys.path.insert(0, str(Path(__file__).parent))
# Project-local imports — must come after the sys.path insertion above.
from core import get_eval
from core.logger import get_logger, set_verbose
# Module-level logger shared by main(); verbosity is raised via --verbose.
logger = get_logger()
def main():
"""Main function for RAG evaluation."""
parser = argparse.ArgumentParser(
description="PRGB - RAG System Evaluation Tool",
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog="""
Examples:
# Basic evaluation with Qwen3 model
python eval.py --model-name "Qwen3" --model-path "/path/to/model" --data-path "data/test.jsonl"
# Evaluation with custom noise configuration
python eval.py --model-name "Qwen3" --noise-config '{"noise_doc_level1":4,"noise_doc_level2":4,"noise_doc_level3":1}'
# Batch evaluation with specific parameters
python eval.py --model-name "Qwen3" --batch-size 32 --temperature 0.8 --shuffle True
"""
)
# Model configuration
parser.add_argument(
"--api-key", type=str, default=None, help="api key of chatgpt"
)
parser.add_argument(
"--model-name",
type=str,
default="Qwen3",
help="Name of the model to evaluate"
)
parser.add_argument(
"--inference-mode", type=bool, default=False, help="whether inference model or not"
)
parser.add_argument(
"--model-path",
type=str,
required=True,
help="Path to the model or API key"
)
# Data configuration
parser.add_argument(
"--data-path",
type=str,
default="tests/test.jsonl",
help="Path to the evaluation dataset"
)
parser.add_argument(
"--num-iterations",
type=int,
default=3,
help="Number of evaluation iterations. For each query, randomly select n different placeholders to run evaluation. Each placeholder represents a different version of the same query with different variable substitutions."
)
# Output configuration
parser.add_argument(
"--output-path",
type=str,
default="./results",
help="Output directory for results"
)
# Evaluation parameters
parser.add_argument(
"--noise-config",
type=str,
default='{"noise_doc_level1":4,"noise_doc_level2":4,"noise_doc_level3":1}',
help="Noise configuration as JSON string"
)
parser.add_argument(
"--shuffle",
type=bool,
default=True,
help="Whether to shuffle the data"
)
parser.add_argument(
"--batch-size",
type=int,
default=16,
help="Batch size for evaluation"
)
parser.add_argument(
"--temperature",
type=float,
default=0.7,
help="Temperature for text generation"
)
parser.add_argument(
"--custom_config",
type=str,
default=None,
help="custom prompt config path",
)
# Additional options
parser.add_argument(
"--verbose",
action="store_true",
help="Enable verbose logging"
)
args = parser.parse_args()
# Set logging level
if args.verbose:
set_verbose(True)
# Validate arguments
try:
noise_config = json.loads(args.noise_config)
except json.JSONDecodeError:
logger.error("Invalid noise_config JSON format")
sys.exit(1)
# Create output directory if it doesn't exist
output_path = Path(args.output_path)
output_path.mkdir(parents=True, exist_ok=True)
logger.info(f"Starting evaluation with model: {args.model_name}")
logger.info(f"Data path: {args.data_path}")
logger.info(f"Output path: {args.output_path}")
logger.info(f"Noise config: {noise_config}")
try:
# Run evaluation
get_eval(args)
logger.info("Evaluation completed successfully!")
except Exception as e:
logger.error(f"Evaluation failed: {e}")
sys.exit(1)
if __name__ == "__main__":
main()