96 changes: 81 additions & 15 deletions src/together/cli/api/evaluation.py
@@ -24,11 +24,23 @@ def evaluation(ctx: click.Context) -> None:
help="Type of evaluation to create.",
)
@click.option(
"--judge-model-name",
"--judge-model",
type=str,
required=True,
help="Name of the judge model to use for evaluation.",
)
@click.option(
"--judge-model-source",
type=click.Choice(["serverless", "dedicated", "external"]),
required=True,
help="Source of the judge model.",
)
@click.option(
"--judge-external-api-token",
type=str,
required=False,
help="Optional external API token for the judge model.",
)
@click.option(
"--judge-system-template",
type=str,
@@ -48,10 +60,20 @@ def evaluation(ctx: click.Context) -> None:
"Can not be used when model-a-name and other model config parameters are specified",
)
@click.option(
"--model-to-evaluate-name",
"--model-to-evaluate",
type=str,
help="Model name when using the detailed config",
Contributor review comment (suggested change):
- help="Model name when using the detailed config",
+ help="Model name or URL when using the detailed config",
)
@click.option(
"--model-to-evaluate-source",
type=click.Choice(["serverless", "dedicated", "external"]),
help="Source of the model to evaluate.",
)
@click.option(
"--model-to-evaluate-external-api-token",
type=str,
help="Optional external API token for the model to evaluate.",
)
@click.option(
"--model-to-evaluate-max-tokens",
type=int,
@@ -104,10 +126,20 @@ def evaluation(ctx: click.Context) -> None:
Can not be used when model-a-name and other model config parameters are specified",
)
@click.option(
"--model-a-name",
"--model-a",
type=str,
help="Model name for model A when using detailed config.",
)
@click.option(
"--model-a-source",
type=click.Choice(["serverless", "dedicated", "external"]),
help="Source of model A.",
)
@click.option(
"--model-a-external-api-token",
type=str,
help="Optional external API token for model A.",
)
@click.option(
"--model-a-max-tokens",
type=int,
@@ -135,10 +167,20 @@ def evaluation(ctx: click.Context) -> None:
Can not be used when model-b-name and other model config parameters are specified",
)
@click.option(
"--model-b-name",
"--model-b",
type=str,
help="Model name for model B when using detailed config.",
)
@click.option(
"--model-b-source",
type=click.Choice(["serverless", "dedicated", "external"]),
help="Source of model B.",
)
@click.option(
"--model-b-external-api-token",
type=str,
help="Optional external API token for model B.",
)
@click.option(
"--model-b-max-tokens",
type=int,
@@ -162,11 +204,15 @@ def evaluation(ctx: click.Context) -> None:
def create(
ctx: click.Context,
type: str,
judge_model_name: str,
judge_model: str,
judge_model_source: str,
judge_system_template: str,
judge_external_api_token: Optional[str],
input_data_file_path: str,
model_field: Optional[str],
model_to_evaluate_name: Optional[str],
model_to_evaluate: Optional[str],
model_to_evaluate_source: Optional[str],
model_to_evaluate_external_api_token: Optional[str],
model_to_evaluate_max_tokens: Optional[int],
model_to_evaluate_temperature: Optional[float],
model_to_evaluate_system_template: Optional[str],
@@ -177,13 +223,17 @@ def create(
max_score: Optional[float],
pass_threshold: Optional[float],
model_a_field: Optional[str],
model_a_name: Optional[str],
model_a: Optional[str],
model_a_source: Optional[str],
model_a_external_api_token: Optional[str],
model_a_max_tokens: Optional[int],
model_a_temperature: Optional[float],
model_a_system_template: Optional[str],
model_a_input_template: Optional[str],
model_b_field: Optional[str],
model_b_name: Optional[str],
model_b: Optional[str],
model_b_source: Optional[str],
model_b_external_api_token: Optional[str],
model_b_max_tokens: Optional[int],
model_b_temperature: Optional[float],
model_b_system_template: Optional[str],
@@ -203,7 +253,8 @@ def create(
# Check if any config parameters are provided
config_params_provided = any(
[
model_to_evaluate_name,
model_to_evaluate,
model_to_evaluate_source,
model_to_evaluate_max_tokens,
model_to_evaluate_temperature,
model_to_evaluate_system_template,
@@ -223,17 +274,23 @@ def create(
elif config_params_provided:
# Config mode: config parameters are provided
model_to_evaluate_final = {
"model_name": model_to_evaluate_name,
"model": model_to_evaluate,
"model_source": model_to_evaluate_source,
"max_tokens": model_to_evaluate_max_tokens,
"temperature": model_to_evaluate_temperature,
"system_template": model_to_evaluate_system_template,
"input_template": model_to_evaluate_input_template,
}
if model_to_evaluate_external_api_token:
model_to_evaluate_final["external_api_token"] = (
model_to_evaluate_external_api_token
)

# Build model-a configuration
model_a_final: Union[Dict[str, Any], None, str] = None
model_a_config_params = [
model_a_name,
model_a,
model_a_source,
model_a_max_tokens,
model_a_temperature,
model_a_system_template,
@@ -252,17 +309,21 @@ def create(
elif any(model_a_config_params):
# Config mode: config parameters are provided
model_a_final = {
"model_name": model_a_name,
"model": model_a,
"model_source": model_a_source,
"max_tokens": model_a_max_tokens,
"temperature": model_a_temperature,
"system_template": model_a_system_template,
"input_template": model_a_input_template,
}
if model_a_external_api_token:
model_a_final["external_api_token"] = model_a_external_api_token

# Build model-b configuration
model_b_final: Union[Dict[str, Any], None, str] = None
model_b_config_params = [
model_b_name,
model_b,
model_b_source,
model_b_max_tokens,
model_b_temperature,
model_b_system_template,
@@ -281,18 +342,23 @@ def create(
elif any(model_b_config_params):
# Config mode: config parameters are provided
model_b_final = {
"model_name": model_b_name,
"model": model_b,
"model_source": model_b_source,
"max_tokens": model_b_max_tokens,
"temperature": model_b_temperature,
"system_template": model_b_system_template,
"input_template": model_b_input_template,
}
if model_b_external_api_token:
model_b_final["external_api_token"] = model_b_external_api_token

try:
response = client.evaluation.create(
type=type,
judge_model_name=judge_model_name,
judge_model=judge_model,
judge_model_source=judge_model_source,
judge_system_template=judge_system_template,
judge_external_api_token=judge_external_api_token,
input_data_file_path=input_data_file_path,
model_to_evaluate=model_to_evaluate_final,
labels=labels_list,
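
For context on how the flags above are meant to combine, here is a self-contained sketch of the field-mode vs. config-mode selection the CLI performs; the helper name, the error message, and the placeholder values are illustrative and not part of this PR:

from typing import Any, Dict, Optional, Union


def build_model_config(
    model_field: Optional[str],
    model: Optional[str],
    model_source: Optional[str],
    max_tokens: Optional[int],
    temperature: Optional[float],
    system_template: Optional[str],
    input_template: Optional[str],
    external_api_token: Optional[str],
) -> Union[str, Dict[str, Any], None]:
    # Sketch of the CLI's selection logic: either a dataset field name is given,
    # or a detailed model config is assembled from the individual flags.
    config_params = [model, model_source, max_tokens, temperature,
                     system_template, input_template]
    if model_field and any(config_params):
        # The real command disallows mixing the two modes, per the help text above.
        raise ValueError("Use either a data field or a model config, not both.")
    if model_field:
        return model_field  # field mode: column in the input data file
    if any(config_params):
        config: Dict[str, Any] = {
            "model": model,
            "model_source": model_source,
            "max_tokens": max_tokens,
            "temperature": temperature,
            "system_template": system_template,
            "input_template": input_template,
        }
        if external_api_token:
            # Token is attached only when supplied, mirroring the diff above.
            config["external_api_token"] = external_api_token
        return config
    return None


# Illustrative call with placeholder values:
print(build_model_config(None, "my-model", "external", 256, 0.2,
                         "You are helpful.", "{prompt}", "sk-placeholder"))
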
36 changes: 26 additions & 10 deletions src/together/resources/evaluation.py
@@ -27,9 +27,11 @@ def __init__(self, client: TogetherClient) -> None:
def create(
self,
type: str,
judge_model_name: str,
judge_model: str,
judge_model_source: str,
judge_system_template: str,
input_data_file_path: str,
judge_external_api_token: Optional[str] = None,
# Classify-specific parameters
labels: Optional[List[str]] = None,
pass_labels: Optional[List[str]] = None,
@@ -48,9 +50,11 @@ def create(

Args:
type: The type of evaluation ("classify", "score", or "compare")
judge_model_name: Name of the judge model
judge_model: Name of the judge model
judge_model_source: Source of the judge model ("serverless", "dedicated", or "external")
judge_system_template: System template for the judge
input_data_file_path: Path to input data file
judge_external_api_token: Optional external API token for the judge model
labels: List of classification labels (required for classify)
pass_labels: List of labels considered as passing (required for classify)
min_score: Minimum score value (required for score)
@@ -69,8 +73,10 @@ def create(

# Build judge config
judge_config = JudgeModelConfig(
model_name=judge_model_name,
model=judge_model,
model_source=judge_model_source,
system_template=judge_system_template,
external_api_token=judge_external_api_token,
)
parameters: Union[ClassifyParameters, ScoreParameters, CompareParameters]
# Build parameters based on type
@@ -112,7 +118,8 @@ def create(
elif isinstance(model_to_evaluate, dict):
# Validate that all required fields are present for model config
required_fields = [
"model_name",
"model",
"model_source",
"max_tokens",
"temperature",
"system_template",
@@ -163,7 +170,8 @@ def create(
elif isinstance(model_to_evaluate, dict):
# Validate that all required fields are present for model config
required_fields = [
"model_name",
"model",
"model_source",
"max_tokens",
"temperature",
"system_template",
@@ -379,9 +387,11 @@ def __init__(self, client: TogetherClient) -> None:
async def create(
self,
type: str,
judge_model_name: str,
judge_model: str,
judge_model_source: str,
judge_system_template: str,
input_data_file_path: str,
judge_external_api_token: Optional[str] = None,
# Classify-specific parameters
labels: Optional[List[str]] = None,
pass_labels: Optional[List[str]] = None,
@@ -400,9 +410,11 @@ async def create(

Args:
type: The type of evaluation ("classify", "score", or "compare")
judge_model_name: Name of the judge model
judge_model: Name of the judge model
judge_model_source: Source of the judge model ("serverless", "dedicated", or "external")
judge_system_template: System template for the judge
input_data_file_path: Path to input data file
judge_external_api_token: Optional external API token for the judge model
labels: List of classification labels (required for classify)
pass_labels: List of labels considered as passing (required for classify)
min_score: Minimum score value (required for score)
@@ -421,8 +433,10 @@ async def create(

# Build judge config
judge_config = JudgeModelConfig(
model_name=judge_model_name,
model=judge_model,
model_source=judge_model_source,
system_template=judge_system_template,
external_api_token=judge_external_api_token,
)
parameters: Union[ClassifyParameters, ScoreParameters, CompareParameters]
# Build parameters based on type
@@ -464,7 +478,8 @@ async def create(
elif isinstance(model_to_evaluate, dict):
# Validate that all required fields are present for model config
required_fields = [
"model_name",
"model",
"model_source",
"max_tokens",
"temperature",
"system_template",
@@ -515,7 +530,8 @@ async def create(
elif isinstance(model_to_evaluate, dict):
# Validate that all required fields are present for model config
required_fields = [
"model_name",
"model",
"model_source",
"max_tokens",
"temperature",
"system_template",
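
For reference, a hedged sketch of calling the updated create signature from the Python SDK; the model identifiers, file id, labels, templates, and token are placeholders, and the client entry point is inferred from the CLI's call to client.evaluation.create above:

# Sketch only: requires a valid TOGETHER_API_KEY for the request to succeed;
# all identifiers below are placeholders.
from together import Together

client = Together()

response = client.evaluation.create(
    type="classify",
    judge_model="meta-llama/Llama-3.3-70B-Instruct-Turbo",   # placeholder judge
    judge_model_source="serverless",
    judge_system_template="Label the answer as correct or incorrect.",
    input_data_file_path="file-abc123",                       # placeholder file id
    labels=["correct", "incorrect"],
    pass_labels=["correct"],
    model_to_evaluate={
        # All six keys below are required when passing a dict config.
        "model": "https://my-endpoint.example.com/v1",         # placeholder external URL
        "model_source": "external",
        "max_tokens": 512,
        "temperature": 0.2,
        "system_template": "You are a helpful assistant.",
        "input_template": "{prompt}",
        "external_api_token": "sk-placeholder",                # optional, external sources only
    },
)
print(response)

For compare evaluations, the same dict shape would apply to the model_a and model_b arguments assembled by the CLI.
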
10 changes: 7 additions & 3 deletions src/together/types/evaluation.py
@@ -2,7 +2,7 @@

from datetime import datetime
from enum import Enum
from typing import Any, Dict, List, Optional, Union
from typing import Any, Dict, List, Literal, Optional, Union

from pydantic import BaseModel, Field

@@ -23,16 +23,20 @@ class EvaluationStatus(str, Enum):


class JudgeModelConfig(BaseModel):
model_name: str
model: str
model_source: Literal["serverless", "dedicated", "external"]
system_template: str
external_api_token: Optional[str] = None


class ModelRequest(BaseModel):
model_name: str
model: str
model_source: Literal["serverless", "dedicated", "external"]
max_tokens: int
temperature: float
system_template: str
input_template: str
external_api_token: Optional[str] = None


class ClassifyParameters(BaseModel):
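
A small usage sketch for the reshaped pydantic models; the import path is assumed from this file's location and every value is a placeholder:

# Sketch only: module path assumed from src/together/types/evaluation.py.
from together.types.evaluation import JudgeModelConfig, ModelRequest

judge = JudgeModelConfig(
    model="meta-llama/Llama-3.3-70B-Instruct-Turbo",   # placeholder
    model_source="serverless",
    system_template="Grade the response from 1 to 10.",
    # external_api_token defaults to None for non-external sources
)

candidate = ModelRequest(
    model="https://my-endpoint.example.com/v1",         # placeholder external URL
    model_source="external",
    max_tokens=512,
    temperature=0.2,
    system_template="You are a helpful assistant.",
    input_template="{prompt}",
    external_api_token="sk-placeholder",
)

# model_source is a Literal, so an unsupported value such as "local"
# would fail pydantic validation.
print(judge)
print(candidate)
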