379 changes: 379 additions & 0 deletions src/together/cli/api/evaluation.py
@@ -0,0 +1,379 @@
import json
from typing import Optional, Dict, Union, Any

import click
from tabulate import tabulate

from together import Together
from together.utils import convert_unix_timestamp


@click.group()
@click.pass_context
def evaluation(ctx: click.Context) -> None:
"""Evaluation API commands"""
pass


@evaluation.command()
@click.pass_context
@click.option(
"--type",
type=click.Choice(["classify", "score", "compare"]),
required=True,
help="Type of evaluation to create.",
)
@click.option(
"--judge-model-name",
type=str,
required=True,
help="Name of the judge model to use for evaluation.",
)
@click.option(
"--judge-system-template",
type=str,
required=True,
help="System template for the judge model.",
)
@click.option(
"--input-data-file-path",
type=str,
required=True,
help="Path to the input data file.",
)
@click.option(
"--model-field",
type=str,
help="Name of the field in the input file contaning text generated by the model."
"Can not be used when model-a-name and other model config parameters are specified",
)
@click.option(
"--model-to-evaluate-name",
type=str,
help="Model name when using the detailed config",
)
@click.option(
"--model-to-evaluate-max-tokens",
type=int,
help="Max tokens for model-to-evaluate",
)
@click.option(
"--model-to-evaluate-temperature",
type=float,
help="Temperature for model-to-evaluate",
)
@click.option(
"--model-to-evaluate-system-template",
type=str,
help="System template for model-to-evaluate",
)
@click.option(
"--model-to-evaluate-input-template",
type=str,
help="Input template for model-to-evaluate",
)
@click.option(
"--labels",
type=str,
help="Classification labels - comma-separated list",
)
@click.option(
"--pass-labels",
type=str,
help="Labels considered as passing (required for classify type). A comma-separated list.",
)
@click.option(
"--min-score",
type=float,
help="Minimum score value (required for score type).",
)
@click.option(
"--max-score",
type=float,
help="Maximum score value (required for score type).",
)
@click.option(
"--pass-threshold",
type=float,
help="Threshold score for passing (required for score type).",
)
@click.option(
"--model-a-field",
type=str,
help="Name of the field in the input file containing text generated by Model A. \
Can not be used when model-a-name and other model config parameters are specified",
)
@click.option(
"--model-a-name",
type=str,
help="Model name for model A when using detailed config.",
)
@click.option(
"--model-a-max-tokens",
type=int,
help="Max tokens for model A.",
)
@click.option(
"--model-a-temperature",
type=float,
help="Temperature for model A.",
)
@click.option(
"--model-a-system-template",
type=str,
help="System template for model A.",
)
@click.option(
"--model-a-input-template",
type=str,
help="Input template for model A.",
)
@click.option(
"--model-b-field",
type=str,
help="Name of the field in the input file containing text generated by Model B.\
Can not be used when model-b-name and other model config parameters are specified",
)
@click.option(
"--model-b-name",
type=str,
help="Model name for model B when using detailed config.",
)
@click.option(
"--model-b-max-tokens",
type=int,
help="Max tokens for model B.",
)
@click.option(
"--model-b-temperature",
type=float,
help="Temperature for model B.",
)
@click.option(
"--model-b-system-template",
type=str,
help="System template for model B.",
)
@click.option(
"--model-b-input-template",
type=str,
help="Input template for model B.",
)
def create(
ctx: click.Context,
type: str,
judge_model_name: str,
judge_system_template: str,
input_data_file_path: str,
model_field: Optional[str],
model_to_evaluate_name: Optional[str],
model_to_evaluate_max_tokens: Optional[int],
model_to_evaluate_temperature: Optional[float],
model_to_evaluate_system_template: Optional[str],
model_to_evaluate_input_template: Optional[str],
    labels: Optional[str],
    pass_labels: Optional[str],
min_score: Optional[float],
max_score: Optional[float],
pass_threshold: Optional[float],
model_a_field: Optional[str],
model_a_name: Optional[str],
model_a_max_tokens: Optional[int],
model_a_temperature: Optional[float],
model_a_system_template: Optional[str],
model_a_input_template: Optional[str],
model_b_field: Optional[str],
model_b_name: Optional[str],
model_b_max_tokens: Optional[int],
model_b_temperature: Optional[float],
model_b_system_template: Optional[str],
model_b_input_template: Optional[str],
) -> None:
"""Create a new evaluation job"""

client: Together = ctx.obj

# Convert strings to lists for labels
labels_list = labels.split(",") if labels else None
pass_labels_list = pass_labels.split(",") if pass_labels else None

# Build model configurations
model_to_evaluate_final: Union[Dict[str, Any], None, str] = None

# Check if any config parameters are provided
config_params_provided = any(
[
model_to_evaluate_name,
model_to_evaluate_max_tokens,
model_to_evaluate_temperature,
model_to_evaluate_system_template,
model_to_evaluate_input_template,
]
)

if model_field:
# Simple mode: model_field is provided
if config_params_provided:
raise click.BadParameter(
"Cannot specify both --model-field and --model-to-evaluate-* parameters. "
"Use either --model-field alone if your input file has pre-generated responses, "
"or config parameters if you want to generate it on our end"
)
model_to_evaluate_final = model_field
elif config_params_provided:
# Config mode: config parameters are provided
model_to_evaluate_final = {
"model_name": model_to_evaluate_name,
"max_tokens": model_to_evaluate_max_tokens,
"temperature": model_to_evaluate_temperature,
"system_template": model_to_evaluate_system_template,
"input_template": model_to_evaluate_input_template,
}

# Build model-a configuration
model_a_final: Union[Dict[str, Any], None, str] = None
model_a_config_params = [
model_a_name,
model_a_max_tokens,
model_a_temperature,
model_a_system_template,
model_a_input_template,
]

if model_a_field is not None:
# Simple mode: model_a_field is provided
if any(model_a_config_params):
raise click.BadParameter(
"Cannot specify both --model-a-field and config parameters (--model-a-name, etc.). "
"Use either --model-a-field alone if your input file has pre-generated responses, "
"or config parameters if you want to generate it on our end"
)
model_a_final = model_a_field
elif any(model_a_config_params):
# Config mode: config parameters are provided
model_a_final = {
"model_name": model_a_name,
"max_tokens": model_a_max_tokens,
"temperature": model_a_temperature,
"system_template": model_a_system_template,
"input_template": model_a_input_template,
}

# Build model-b configuration
model_b_final: Union[Dict[str, Any], None, str] = None
model_b_config_params = [
model_b_name,
model_b_max_tokens,
model_b_temperature,
model_b_system_template,
model_b_input_template,
]

if model_b_field is not None:
# Simple mode: model_b_field is provided
if any(model_b_config_params):
raise click.BadParameter(
"Cannot specify both --model-b-field and config parameters (--model-b-name, etc.). "
"Use either --model-b-field alone if your input file has pre-generated responses, "
"or config parameters if you want to generate it on our end"
)
model_b_final = model_b_field
elif any(model_b_config_params):
# Config mode: config parameters are provided
model_b_final = {
"model_name": model_b_name,
"max_tokens": model_b_max_tokens,
"temperature": model_b_temperature,
"system_template": model_b_system_template,
"input_template": model_b_input_template,
}

try:
response = client.evaluation.create(
type=type,
judge_model_name=judge_model_name,
judge_system_template=judge_system_template,
input_data_file_path=input_data_file_path,
model_to_evaluate=model_to_evaluate_final,
labels=labels_list,
pass_labels=pass_labels_list,
min_score=min_score,
max_score=max_score,
pass_threshold=pass_threshold,
model_a=model_a_final,
model_b=model_b_final,
)
except ValueError as e:
raise click.BadParameter(str(e))

click.echo(json.dumps(response.model_dump(exclude_none=True), indent=4))


@evaluation.command()
@click.pass_context
@click.option(
"--status",
type=str,
help="Filter by job status.",
)
@click.option(
"--limit",
type=int,
help="Limit number of results (max 100).",
)
def list(ctx: click.Context, status: Optional[str], limit: Optional[int]) -> None:
"""List evaluation jobs"""

client: Together = ctx.obj

response = client.evaluation.list(status=status, limit=limit)

display_list = []
for job in response:
if job.parameters:
model = job.parameters.get("model_to_evaluate", "")
model_a = job.parameters.get("model_a", "")
model_b = job.parameters.get("model_b", "")
        else:
            model = ""
            model_a = ""
            model_b = ""

display_list.append(
{
"Workflow ID": job.workflow_id or "",
"Type": job.type,
"Status": job.status,
"Created At": job.created_at or 0,
"Model": model,
"Model A": model_a,
"Model B": model_b,
}
)

table = tabulate(display_list, headers="keys", tablefmt="grid", showindex=True)
click.echo(table)


@evaluation.command()
@click.pass_context
@click.argument("evaluation_id", type=str, required=True)
def retrieve(ctx: click.Context, evaluation_id: str) -> None:
"""Get details of a specific evaluation job"""

client: Together = ctx.obj

response = client.evaluation.retrieve(evaluation_id=evaluation_id)

click.echo(json.dumps(response.model_dump(exclude_none=True), indent=4))


@evaluation.command()
@click.pass_context
@click.argument("evaluation_id", type=str, required=True)
def status(ctx: click.Context, evaluation_id: str) -> None:
"""Get the status and results of a specific evaluation job"""

client: Together = ctx.obj

response = client.evaluation.status(evaluation_id=evaluation_id)

click.echo(json.dumps(response.model_dump(exclude_none=True), indent=4))
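For context, here is a minimal sketch (not part of the PR) of the classify-type call that the new create command assembles and forwards to client.evaluation.create. The keyword arguments mirror the call in the command above; the judge model, templates, field name, and file path are illustrative placeholders, not values from this change.

import json

from together import Together

client = Together()

# Simple mode: "model_output" is a placeholder for the column in the uploaded
# input file that already contains pre-generated model responses.
response = client.evaluation.create(
    type="classify",
    judge_model_name="meta-llama/Llama-3.3-70B-Instruct-Turbo",  # placeholder judge model
    judge_system_template="Label the response as helpful or unhelpful.",  # placeholder template
    input_data_file_path="file-abc123",  # placeholder file ID/path
    model_to_evaluate="model_output",
    labels=["helpful", "unhelpful"],
    pass_labels=["helpful"],
)
print(json.dumps(response.model_dump(exclude_none=True), indent=4))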
2 changes: 2 additions & 0 deletions src/together/cli/cli.py

@@ -9,6 +9,7 @@
from together.cli.api.chat import chat, interactive
from together.cli.api.completions import completions
from together.cli.api.endpoints import endpoints
from together.cli.api.evaluation import evaluation
from together.cli.api.files import files
from together.cli.api.finetune import fine_tuning
from together.cli.api.images import images
@@ -74,6 +75,7 @@ def main(
main.add_command(fine_tuning)
main.add_command(models)
main.add_command(endpoints)
main.add_command(evaluation)

if __name__ == "__main__":
main()
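A quick way to confirm the wiring in cli.py is a help-text smoke test with Click's CliRunner. This is a sketch, not part of the PR: it assumes the main group only needs an API key to construct the client and makes no network call when printing help, and the dummy key is a placeholder.

from click.testing import CliRunner

from together.cli.cli import main

runner = CliRunner()
result = runner.invoke(
    main,
    ["evaluation", "--help"],
    env={"TOGETHER_API_KEY": "dummy"},  # placeholder key; assumed sufficient for --help
)
assert result.exit_code == 0
# The registered subcommands should be listed in the group help output.
assert "create" in result.output
assert "retrieve" in result.output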