-
Notifications
You must be signed in to change notification settings - Fork 362
Expand file tree
/
Copy pathinspect_evaluators.py
More file actions
142 lines (126 loc) · 5.45 KB
/
inspect_evaluators.py
File metadata and controls
142 lines (126 loc) · 5.45 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
import logging
from pathlib import Path
import chz
import tinker
from inspect_ai import Tasks, eval_async
from inspect_ai.model import GenerateConfig as InspectAIGenerateConfig
from inspect_ai.model import Model as InspectAIModel
from tinker_cookbook.eval.evaluators import SamplingClientEvaluator
from tinker_cookbook.eval.inspect_utils import InspectAPIFromTinkerSampling
from tinker_cookbook.exceptions import ConfigurationError
# Set up logger
logger = logging.getLogger(__name__)
@chz.chz
class InspectEvaluatorBuilder:
    """
    Configuration for inspect evaluation.

    This class provides a structured way to configure inspect evaluation
    parameters that can be used both in training configs and evaluator builders.

    Calling an instance builds an ``InspectEvaluator``; ``model_name`` and
    ``renderer_name`` are validated at evaluation time, not at construction.
    """

    # Required parameters
    tasks: Tasks
    # Renderer identifier passed to InspectAPIFromTinkerSampling; must be set
    # before the evaluator is run (validated in InspectEvaluator.__call__).
    renderer_name: str | None = None
    # TODO: remove model_name once the SDK adds a get_tokenizer method to sampling client
    model_name: str | None = None
    # Random seed for sampling. If None, sampling is non-deterministic.
    seed: int | None = None
    # If True, logs prompts and responses to the console (useful for debugging).
    verbose: bool = False
    # When True, model reasoning/thinking is preserved in inspect output as ContentReasoning.
    # When False (default), reasoning is stripped and only text content is returned.
    include_reasoning: bool = False
    # Generation parameters
    temperature: float = 1.0
    max_tokens: int = 1000
    top_p: float = 1.0
    # Top-k sampling. -1 disables top-k filtering (uses all tokens).
    top_k: int = -1
    # Number of independent responses to generate per prompt. Used for majority
    # voting or best-of-n evaluation strategies.
    num_choices: int = 1
    # Evaluation parameters
    # Maximum number of samples to evaluate. If None, evaluates all samples.
    limit: int | None = None
    # If True, inspect surfaces errors eagerly for debugging rather than
    # burying them in the run log.
    debug_errors: bool = True
    # Directory for inspect logs; defaults to ~/inspect-logs when None.
    log_dir: str | None = None
    # Maximum concurrent sampling requests to Tinker.
    max_connections: int = 512
    log_level: str = "INFO"
    # Metadata to associate with this evaluation run (visible in inspect logs)
    metadata: dict[str, str] | None = None

    def __call__(self) -> SamplingClientEvaluator:
        """Build the evaluator from this configuration."""
        return InspectEvaluator(self)
class InspectEvaluator(SamplingClientEvaluator):
    """
    A SamplingClientEvaluator that runs inspect tasks and returns their metrics.
    """

    def __init__(self, config: InspectEvaluatorBuilder):
        """
        Initialize the InspectEvaluator.

        Args:
            config: Configuration object containing all evaluation parameters
        """
        self.config = config

    async def __call__(self, sampling_client: tinker.SamplingClient) -> dict[str, float]:
        """
        Run inspect evaluation on the given sampling client and return metrics.

        Args:
            sampling_client: The sampling client to evaluate

        Returns:
            Dictionary of metrics from inspect evaluation, keyed as
            "<dataset_name>/<metric_name>". Only the first scorer's metrics
            of each task are included.

        Raises:
            ConfigurationError: If model_name or renderer_name is unset.
        """
        if self.config.model_name is None:
            raise ConfigurationError("model_name must be set before running evaluation")
        if self.config.renderer_name is None:
            raise ConfigurationError("renderer_name must be set before running evaluation")
        # Create the inspect API wrapper around the Tinker sampling client
        api = InspectAPIFromTinkerSampling(
            renderer_name=self.config.renderer_name,
            model_name=self.config.model_name,
            sampling_client=sampling_client,
            verbose=self.config.verbose,
            include_reasoning=self.config.include_reasoning,
        )
        # Create the inspect model with the configured generation parameters
        model = InspectAIModel(
            api=api,
            config=InspectAIGenerateConfig(
                temperature=self.config.temperature,
                max_tokens=self.config.max_tokens,
                top_p=self.config.top_p,
                top_k=self.config.top_k,
                seed=self.config.seed,
                num_choices=self.config.num_choices,
            ),
        )
        # Run evaluation
        results = await eval_async(
            tasks=self.config.tasks,
            model=[model],
            limit=self.config.limit,
            debug_errors=self.config.debug_errors,
            # Never retry - the tinker SDK is doing this for us already
            retry_on_error=0,
            # Although Tinker sampling tries very hard to only throw unrecoverable failures,
            # the inspect evaluation can still fail if e.g. the parser returns an error for
            # a given sample.
            fail_on_error=False,
            log_dir=self.config.log_dir or str(Path("~/inspect-logs").expanduser()),
            max_connections=self.config.max_connections,
            log_level=self.config.log_level,
            log_realtime=False,
            log_buffer=1000,
            metadata=self.config.metadata,
        )
        # Extract metrics from results. Guard against an empty scores list
        # (previously only None was checked, so scores[0] could raise).
        metrics: dict[str, float] = {}
        for task_result in results:
            if task_result.results is None or not task_result.results.scores:
                continue
            # Dataset name is loop-invariant per task: compute it once, and
            # fall back to "unknown" when the dataset (or its name) is absent,
            # instead of concatenating None into the metric key.
            dataset = task_result.eval.dataset
            dataset_name = (dataset.name if dataset is not None else None) or "unknown"
            for metric_name, score in task_result.results.scores[0].metrics.items():
                metrics[f"{dataset_name}/{metric_name}"] = score.value
        logger.info(f"Inspect evaluation completed. Metrics: {metrics}")
        return metrics