Skip to content

Commit 2e34944

Browse files
Add support for evals API (#339)
* Add support for evals API * add csv files
1 parent 70a7215 commit 2e34944

File tree

15 files changed

+1882
-77
lines changed

15 files changed

+1882
-77
lines changed

src/together/cli/api/evaluation.py

Lines changed: 379 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,379 @@
1+
import json
2+
from typing import Optional, Dict, Union, Any
3+
4+
import click
5+
from tabulate import tabulate
6+
7+
from together import Together
8+
from together.utils import convert_unix_timestamp
9+
10+
11+
@click.group()
@click.pass_context
def evaluation(ctx: click.Context) -> None:
    """Evaluation API commands"""
16+
17+
18+
def _build_model_config(
    field: Optional[str],
    config: Dict[str, Any],
    field_option: str,
    config_description: str,
) -> Union[Dict[str, Any], None, str]:
    """Resolve one model specification for an evaluation request.

    A model may be given either as *field* (the name of an input-file column
    holding pre-generated responses) or via detailed generation *config*
    parameters -- never both.

    Args:
        field: Value of the ``--*-field`` option, or None when not given.
        config: Mapping of config keys to the corresponding option values.
        field_option: Option name (e.g. ``--model-a-field``) for error text.
        config_description: Description of the conflicting config options.

    Returns:
        The field name (simple mode), the config dict (detailed mode), or
        None when neither was provided.

    Raises:
        click.BadParameter: If both the field and config parameters are set.
    """
    # Test with `is not None` (not truthiness) so explicit zero values such
    # as --model-*-temperature 0.0 or --model-*-max-tokens 0 still count as
    # "config provided".
    config_provided = any(value is not None for value in config.values())

    if field is not None:
        if config_provided:
            raise click.BadParameter(
                f"Cannot specify both {field_option} and {config_description}. "
                f"Use either {field_option} alone if your input file has pre-generated responses, "
                "or config parameters if you want to generate it on our end"
            )
        return field
    if config_provided:
        return config
    return None


@evaluation.command()
@click.pass_context
@click.option(
    "--type",
    type=click.Choice(["classify", "score", "compare"]),
    required=True,
    help="Type of evaluation to create.",
)
@click.option(
    "--judge-model-name",
    type=str,
    required=True,
    help="Name of the judge model to use for evaluation.",
)
@click.option(
    "--judge-system-template",
    type=str,
    required=True,
    help="System template for the judge model.",
)
@click.option(
    "--input-data-file-path",
    type=str,
    required=True,
    help="Path to the input data file.",
)
@click.option(
    "--model-field",
    type=str,
    help="Name of the field in the input file containing text generated by the model."
    "Can not be used when model-to-evaluate-name and other model config parameters are specified",
)
@click.option(
    "--model-to-evaluate-name",
    type=str,
    help="Model name when using the detailed config",
)
@click.option(
    "--model-to-evaluate-max-tokens",
    type=int,
    help="Max tokens for model-to-evaluate",
)
@click.option(
    "--model-to-evaluate-temperature",
    type=float,
    help="Temperature for model-to-evaluate",
)
@click.option(
    "--model-to-evaluate-system-template",
    type=str,
    help="System template for model-to-evaluate",
)
@click.option(
    "--model-to-evaluate-input-template",
    type=str,
    help="Input template for model-to-evaluate",
)
@click.option(
    "--labels",
    type=str,
    help="Classification labels - comma-separated list",
)
@click.option(
    "--pass-labels",
    type=str,
    help="Labels considered as passing (required for classify type). A comma-separated list.",
)
@click.option(
    "--min-score",
    type=float,
    help="Minimum score value (required for score type).",
)
@click.option(
    "--max-score",
    type=float,
    help="Maximum score value (required for score type).",
)
@click.option(
    "--pass-threshold",
    type=float,
    help="Threshold score for passing (required for score type).",
)
@click.option(
    "--model-a-field",
    type=str,
    help="Name of the field in the input file containing text generated by Model A. \
        Can not be used when model-a-name and other model config parameters are specified",
)
@click.option(
    "--model-a-name",
    type=str,
    help="Model name for model A when using detailed config.",
)
@click.option(
    "--model-a-max-tokens",
    type=int,
    help="Max tokens for model A.",
)
@click.option(
    "--model-a-temperature",
    type=float,
    help="Temperature for model A.",
)
@click.option(
    "--model-a-system-template",
    type=str,
    help="System template for model A.",
)
@click.option(
    "--model-a-input-template",
    type=str,
    help="Input template for model A.",
)
@click.option(
    "--model-b-field",
    type=str,
    help="Name of the field in the input file containing text generated by Model B.\
        Can not be used when model-b-name and other model config parameters are specified",
)
@click.option(
    "--model-b-name",
    type=str,
    help="Model name for model B when using detailed config.",
)
@click.option(
    "--model-b-max-tokens",
    type=int,
    help="Max tokens for model B.",
)
@click.option(
    "--model-b-temperature",
    type=float,
    help="Temperature for model B.",
)
@click.option(
    "--model-b-system-template",
    type=str,
    help="System template for model B.",
)
@click.option(
    "--model-b-input-template",
    type=str,
    help="Input template for model B.",
)
def create(
    ctx: click.Context,
    type: str,
    judge_model_name: str,
    judge_system_template: str,
    input_data_file_path: str,
    model_field: Optional[str],
    model_to_evaluate_name: Optional[str],
    model_to_evaluate_max_tokens: Optional[int],
    model_to_evaluate_temperature: Optional[float],
    model_to_evaluate_system_template: Optional[str],
    model_to_evaluate_input_template: Optional[str],
    labels: Optional[str],
    pass_labels: Optional[str],
    min_score: Optional[float],
    max_score: Optional[float],
    pass_threshold: Optional[float],
    model_a_field: Optional[str],
    model_a_name: Optional[str],
    model_a_max_tokens: Optional[int],
    model_a_temperature: Optional[float],
    model_a_system_template: Optional[str],
    model_a_input_template: Optional[str],
    model_b_field: Optional[str],
    model_b_name: Optional[str],
    model_b_max_tokens: Optional[int],
    model_b_temperature: Optional[float],
    model_b_system_template: Optional[str],
    model_b_input_template: Optional[str],
) -> None:
    """Create a new evaluation job"""
    client: Together = ctx.obj

    # Convert comma-separated option strings to lists for the API.
    labels_list = labels.split(",") if labels else None
    pass_labels_list = pass_labels.split(",") if pass_labels else None

    # `model_field or None` preserves the historical behaviour of treating an
    # explicitly-empty --model-field value as "not provided".
    model_to_evaluate_final = _build_model_config(
        model_field or None,
        {
            "model_name": model_to_evaluate_name,
            "max_tokens": model_to_evaluate_max_tokens,
            "temperature": model_to_evaluate_temperature,
            "system_template": model_to_evaluate_system_template,
            "input_template": model_to_evaluate_input_template,
        },
        "--model-field",
        "--model-to-evaluate-* parameters",
    )

    model_a_final = _build_model_config(
        model_a_field,
        {
            "model_name": model_a_name,
            "max_tokens": model_a_max_tokens,
            "temperature": model_a_temperature,
            "system_template": model_a_system_template,
            "input_template": model_a_input_template,
        },
        "--model-a-field",
        "config parameters (--model-a-name, etc.)",
    )

    model_b_final = _build_model_config(
        model_b_field,
        {
            "model_name": model_b_name,
            "max_tokens": model_b_max_tokens,
            "temperature": model_b_temperature,
            "system_template": model_b_system_template,
            "input_template": model_b_input_template,
        },
        "--model-b-field",
        "config parameters (--model-b-name, etc.)",
    )

    try:
        response = client.evaluation.create(
            type=type,
            judge_model_name=judge_model_name,
            judge_system_template=judge_system_template,
            input_data_file_path=input_data_file_path,
            model_to_evaluate=model_to_evaluate_final,
            labels=labels_list,
            pass_labels=pass_labels_list,
            min_score=min_score,
            max_score=max_score,
            pass_threshold=pass_threshold,
            model_a=model_a_final,
            model_b=model_b_final,
        )
    except ValueError as e:
        # Surface SDK-side validation failures as CLI usage errors.
        raise click.BadParameter(str(e))

    click.echo(json.dumps(response.model_dump(exclude_none=True), indent=4))
310+
311+
312+
@evaluation.command()
@click.pass_context
@click.option(
    "--status",
    type=str,
    help="Filter by job status.",
)
@click.option(
    "--limit",
    type=int,
    help="Limit number of results (max 100).",
)
def list(ctx: click.Context, status: Optional[str], limit: Optional[int]) -> None:
    """List evaluation jobs"""
    client: Together = ctx.obj

    response = client.evaluation.list(status=status, limit=limit)

    display_list = []
    for job in response:
        # Default all three model columns so jobs without parameters render
        # as blanks. (Previously only `model` was set in the else-branch,
        # leaving model_a/model_b unbound and raising NameError below.)
        model = model_a = model_b = ""
        if job.parameters:
            model = job.parameters.get("model_to_evaluate", "")
            model_a = job.parameters.get("model_a", "")
            model_b = job.parameters.get("model_b", "")

        display_list.append(
            {
                "Workflow ID": job.workflow_id or "",
                "Type": job.type,
                "Status": job.status,
                "Created At": job.created_at or 0,
                "Model": model,
                "Model A": model_a,
                "Model B": model_b,
            }
        )

    table = tabulate(display_list, headers="keys", tablefmt="grid", showindex=True)
    click.echo(table)
354+
355+
356+
@evaluation.command()
@click.pass_context
@click.argument("evaluation_id", type=str, required=True)
def retrieve(ctx: click.Context, evaluation_id: str) -> None:
    """Get details of a specific evaluation job"""
    client: Together = ctx.obj
    job = client.evaluation.retrieve(evaluation_id=evaluation_id)
    # Pretty-print the job as JSON, omitting unset fields.
    click.echo(json.dumps(job.model_dump(exclude_none=True), indent=4))
367+
368+
369+
@evaluation.command()
@click.pass_context
@click.argument("evaluation_id", type=str, required=True)
def status(ctx: click.Context, evaluation_id: str) -> None:
    """Get the status and results of a specific evaluation job"""
    client: Together = ctx.obj
    result = client.evaluation.status(evaluation_id=evaluation_id)
    # Pretty-print the status payload as JSON, omitting unset fields.
    click.echo(json.dumps(result.model_dump(exclude_none=True), indent=4))

src/together/cli/cli.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
from together.cli.api.chat import chat, interactive
1010
from together.cli.api.completions import completions
1111
from together.cli.api.endpoints import endpoints
12+
from together.cli.api.evaluation import evaluation
1213
from together.cli.api.files import files
1314
from together.cli.api.finetune import fine_tuning
1415
from together.cli.api.images import images
@@ -74,6 +75,7 @@ def main(
7475
main.add_command(fine_tuning)
7576
main.add_command(models)
7677
main.add_command(endpoints)
78+
main.add_command(evaluation)
7779

7880
if __name__ == "__main__":
7981
main()

0 commit comments

Comments
 (0)