diff --git a/src/together/cli/api/evaluation.py b/src/together/cli/api/evaluation.py new file mode 100644 index 00000000..6aff353a --- /dev/null +++ b/src/together/cli/api/evaluation.py @@ -0,0 +1,379 @@ +import json +from typing import Optional, Dict, Union, Any + +import click +from tabulate import tabulate + +from together import Together +from together.utils import convert_unix_timestamp + + +@click.group() +@click.pass_context +def evaluation(ctx: click.Context) -> None: + """Evaluation API commands""" + pass + + +@evaluation.command() +@click.pass_context +@click.option( + "--type", + type=click.Choice(["classify", "score", "compare"]), + required=True, + help="Type of evaluation to create.", +) +@click.option( + "--judge-model-name", + type=str, + required=True, + help="Name of the judge model to use for evaluation.", +) +@click.option( + "--judge-system-template", + type=str, + required=True, + help="System template for the judge model.", +) +@click.option( + "--input-data-file-path", + type=str, + required=True, + help="Path to the input data file.", +) +@click.option( + "--model-field", + type=str, + help="Name of the field in the input file contaning text generated by the model." + "Can not be used when model-a-name and other model config parameters are specified", +) +@click.option( + "--model-to-evaluate-name", + type=str, + help="Model name when using the detailed config", +) +@click.option( + "--model-to-evaluate-max-tokens", + type=int, + help="Max tokens for model-to-evaluate", +) +@click.option( + "--model-to-evaluate-temperature", + type=float, + help="Temperature for model-to-evaluate", +) +@click.option( + "--model-to-evaluate-system-template", + type=str, + help="System template for model-to-evaluate", +) +@click.option( + "--model-to-evaluate-input-template", + type=str, + help="Input template for model-to-evaluate", +) +@click.option( + "--labels", + type=str, + help="Classification labels - comma-separated list", +) +@click.option( + "--pass-labels", + type=str, + help="Labels considered as passing (required for classify type). A comma-separated list.", +) +@click.option( + "--min-score", + type=float, + help="Minimum score value (required for score type).", +) +@click.option( + "--max-score", + type=float, + help="Maximum score value (required for score type).", +) +@click.option( + "--pass-threshold", + type=float, + help="Threshold score for passing (required for score type).", +) +@click.option( + "--model-a-field", + type=str, + help="Name of the field in the input file containing text generated by Model A. 
\ + Can not be used when model-a-name and other model config parameters are specified", +) +@click.option( + "--model-a-name", + type=str, + help="Model name for model A when using detailed config.", +) +@click.option( + "--model-a-max-tokens", + type=int, + help="Max tokens for model A.", +) +@click.option( + "--model-a-temperature", + type=float, + help="Temperature for model A.", +) +@click.option( + "--model-a-system-template", + type=str, + help="System template for model A.", +) +@click.option( + "--model-a-input-template", + type=str, + help="Input template for model A.", +) +@click.option( + "--model-b-field", + type=str, + help="Name of the field in the input file containing text generated by Model B.\ + Can not be used when model-b-name and other model config parameters are specified", +) +@click.option( + "--model-b-name", + type=str, + help="Model name for model B when using detailed config.", +) +@click.option( + "--model-b-max-tokens", + type=int, + help="Max tokens for model B.", +) +@click.option( + "--model-b-temperature", + type=float, + help="Temperature for model B.", +) +@click.option( + "--model-b-system-template", + type=str, + help="System template for model B.", +) +@click.option( + "--model-b-input-template", + type=str, + help="Input template for model B.", +) +def create( + ctx: click.Context, + type: str, + judge_model_name: str, + judge_system_template: str, + input_data_file_path: str, + model_field: Optional[str], + model_to_evaluate_name: Optional[str], + model_to_evaluate_max_tokens: Optional[int], + model_to_evaluate_temperature: Optional[float], + model_to_evaluate_system_template: Optional[str], + model_to_evaluate_input_template: Optional[str], + labels: str, + pass_labels: str, + min_score: Optional[float], + max_score: Optional[float], + pass_threshold: Optional[float], + model_a_field: Optional[str], + model_a_name: Optional[str], + model_a_max_tokens: Optional[int], + model_a_temperature: Optional[float], + model_a_system_template: Optional[str], + model_a_input_template: Optional[str], + model_b_field: Optional[str], + model_b_name: Optional[str], + model_b_max_tokens: Optional[int], + model_b_temperature: Optional[float], + model_b_system_template: Optional[str], + model_b_input_template: Optional[str], +) -> None: + """Create a new evaluation job""" + + client: Together = ctx.obj + + # Convert strings to lists for labels + labels_list = labels.split(",") if labels else None + pass_labels_list = pass_labels.split(",") if pass_labels else None + + # Build model configurations + model_to_evaluate_final: Union[Dict[str, Any], None, str] = None + + # Check if any config parameters are provided + config_params_provided = any( + [ + model_to_evaluate_name, + model_to_evaluate_max_tokens, + model_to_evaluate_temperature, + model_to_evaluate_system_template, + model_to_evaluate_input_template, + ] + ) + + if model_field: + # Simple mode: model_field is provided + if config_params_provided: + raise click.BadParameter( + "Cannot specify both --model-field and --model-to-evaluate-* parameters. 
" + "Use either --model-field alone if your input file has pre-generated responses, " + "or config parameters if you want to generate it on our end" + ) + model_to_evaluate_final = model_field + elif config_params_provided: + # Config mode: config parameters are provided + model_to_evaluate_final = { + "model_name": model_to_evaluate_name, + "max_tokens": model_to_evaluate_max_tokens, + "temperature": model_to_evaluate_temperature, + "system_template": model_to_evaluate_system_template, + "input_template": model_to_evaluate_input_template, + } + + # Build model-a configuration + model_a_final: Union[Dict[str, Any], None, str] = None + model_a_config_params = [ + model_a_name, + model_a_max_tokens, + model_a_temperature, + model_a_system_template, + model_a_input_template, + ] + + if model_a_field is not None: + # Simple mode: model_a_field is provided + if any(model_a_config_params): + raise click.BadParameter( + "Cannot specify both --model-a-field and config parameters (--model-a-name, etc.). " + "Use either --model-a-field alone if your input file has pre-generated responses, " + "or config parameters if you want to generate it on our end" + ) + model_a_final = model_a_field + elif any(model_a_config_params): + # Config mode: config parameters are provided + model_a_final = { + "model_name": model_a_name, + "max_tokens": model_a_max_tokens, + "temperature": model_a_temperature, + "system_template": model_a_system_template, + "input_template": model_a_input_template, + } + + # Build model-b configuration + model_b_final: Union[Dict[str, Any], None, str] = None + model_b_config_params = [ + model_b_name, + model_b_max_tokens, + model_b_temperature, + model_b_system_template, + model_b_input_template, + ] + + if model_b_field is not None: + # Simple mode: model_b_field is provided + if any(model_b_config_params): + raise click.BadParameter( + "Cannot specify both --model-b-field and config parameters (--model-b-name, etc.). 
" + "Use either --model-b-field alone if your input file has pre-generated responses, " + "or config parameters if you want to generate it on our end" + ) + model_b_final = model_b_field + elif any(model_b_config_params): + # Config mode: config parameters are provided + model_b_final = { + "model_name": model_b_name, + "max_tokens": model_b_max_tokens, + "temperature": model_b_temperature, + "system_template": model_b_system_template, + "input_template": model_b_input_template, + } + + try: + response = client.evaluation.create( + type=type, + judge_model_name=judge_model_name, + judge_system_template=judge_system_template, + input_data_file_path=input_data_file_path, + model_to_evaluate=model_to_evaluate_final, + labels=labels_list, + pass_labels=pass_labels_list, + min_score=min_score, + max_score=max_score, + pass_threshold=pass_threshold, + model_a=model_a_final, + model_b=model_b_final, + ) + except ValueError as e: + raise click.BadParameter(str(e)) + + click.echo(json.dumps(response.model_dump(exclude_none=True), indent=4)) + + +@evaluation.command() +@click.pass_context +@click.option( + "--status", + type=str, + help="Filter by job status.", +) +@click.option( + "--limit", + type=int, + help="Limit number of results (max 100).", +) +def list(ctx: click.Context, status: Optional[str], limit: Optional[int]) -> None: + """List evaluation jobs""" + + client: Together = ctx.obj + + response = client.evaluation.list(status=status, limit=limit) + + display_list = [] + for job in response: + if job.parameters: + model = job.parameters.get("model_to_evaluate", "") + model_a = job.parameters.get("model_a", "") + model_b = job.parameters.get("model_b", "") + else: + model = "" + + display_list.append( + { + "Workflow ID": job.workflow_id or "", + "Type": job.type, + "Status": job.status, + "Created At": job.created_at or 0, + "Model": model, + "Model A": model_a, + "Model B": model_b, + } + ) + + table = tabulate(display_list, headers="keys", tablefmt="grid", showindex=True) + click.echo(table) + + +@evaluation.command() +@click.pass_context +@click.argument("evaluation_id", type=str, required=True) +def retrieve(ctx: click.Context, evaluation_id: str) -> None: + """Get details of a specific evaluation job""" + + client: Together = ctx.obj + + response = client.evaluation.retrieve(evaluation_id=evaluation_id) + + click.echo(json.dumps(response.model_dump(exclude_none=True), indent=4)) + + +@evaluation.command() +@click.pass_context +@click.argument("evaluation_id", type=str, required=True) +def status(ctx: click.Context, evaluation_id: str) -> None: + """Get the status and results of a specific evaluation job""" + + client: Together = ctx.obj + + response = client.evaluation.status(evaluation_id=evaluation_id) + + click.echo(json.dumps(response.model_dump(exclude_none=True), indent=4)) diff --git a/src/together/cli/cli.py b/src/together/cli/cli.py index 7ae35121..34c05988 100644 --- a/src/together/cli/cli.py +++ b/src/together/cli/cli.py @@ -9,6 +9,7 @@ from together.cli.api.chat import chat, interactive from together.cli.api.completions import completions from together.cli.api.endpoints import endpoints +from together.cli.api.evaluation import evaluation from together.cli.api.files import files from together.cli.api.finetune import fine_tuning from together.cli.api.images import images @@ -74,6 +75,7 @@ def main( main.add_command(fine_tuning) main.add_command(models) main.add_command(endpoints) +main.add_command(evaluation) if __name__ == "__main__": main() diff --git 
a/src/together/client.py b/src/together/client.py index 47a43c24..d84e10d1 100644 --- a/src/together/client.py +++ b/src/together/client.py @@ -25,6 +25,7 @@ class Together: audio: resources.Audio batches: resources.Batches code_interpreter: CodeInterpreter + evaluation: resources.Evaluation # client options client: TogetherClient @@ -92,6 +93,7 @@ def __init__( self.endpoints = resources.Endpoints(self.client) self.code_interpreter = CodeInterpreter(self.client) self.batches = resources.Batches(self.client) + self.evaluation = resources.Evaluation(self.client) class AsyncTogether: @@ -106,6 +108,7 @@ class AsyncTogether: audio: resources.AsyncAudio code_interpreter: CodeInterpreter batches: resources.AsyncBatches + evaluation: resources.AsyncEvaluation # client options client: TogetherClient @@ -171,6 +174,7 @@ def __init__( self.audio = resources.AsyncAudio(self.client) self.code_interpreter = CodeInterpreter(self.client) self.batches = resources.AsyncBatches(self.client) + self.evaluation = resources.AsyncEvaluation(self.client) Client = Together diff --git a/src/together/filemanager.py b/src/together/filemanager.py index b58d873e..48f6955e 100644 --- a/src/together/filemanager.py +++ b/src/together/filemanager.py @@ -334,6 +334,8 @@ def upload( filetype = FileType.jsonl elif file.suffix == ".parquet": filetype = FileType.parquet + elif file.suffix == ".csv": + filetype = FileType.csv else: raise FileTypeError( f"Unknown extension of file {file}. " diff --git a/src/together/resources/__init__.py b/src/together/resources/__init__.py index a6b1935f..53c52e00 100644 --- a/src/together/resources/__init__.py +++ b/src/together/resources/__init__.py @@ -9,6 +9,7 @@ from together.resources.models import AsyncModels, Models from together.resources.rerank import AsyncRerank, Rerank from together.resources.batch import Batches, AsyncBatches +from together.resources.evaluation import Evaluation, AsyncEvaluation __all__ = [ @@ -34,4 +35,6 @@ "Endpoints", "Batches", "AsyncBatches", + "Evaluation", + "AsyncEvaluation", ] diff --git a/src/together/resources/batch.py b/src/together/resources/batch.py index 9602af2b..e9f30653 100644 --- a/src/together/resources/batch.py +++ b/src/together/resources/batch.py @@ -16,7 +16,6 @@ def __init__(self, client: TogetherClient) -> None: self._client = client def create_batch(self, file_id: str, endpoint: str) -> BatchJob: - requestor = api_requestor.APIRequestor( client=self._client, ) diff --git a/src/together/resources/evaluation.py b/src/together/resources/evaluation.py new file mode 100644 index 00000000..10345159 --- /dev/null +++ b/src/together/resources/evaluation.py @@ -0,0 +1,724 @@ +from __future__ import annotations + +from typing import Any, Dict, List, Optional, Union + +from together.abstract import api_requestor +from together.together_response import TogetherResponse +from together.types import ( + TogetherClient, + TogetherRequest, +) +from together.types.evaluation import ( + ClassifyParameters, + CompareParameters, + EvaluationCreateResponse, + EvaluationJob, + EvaluationStatusResponse, + JudgeModelConfig, + ModelRequest, + ScoreParameters, +) + + +class Evaluation: + def __init__(self, client: TogetherClient) -> None: + self._client = client + + def create( + self, + type: str, + judge_model_name: str, + judge_system_template: str, + input_data_file_path: str, + # Classify-specific parameters + labels: Optional[List[str]] = None, + pass_labels: Optional[List[str]] = None, + # Score-specific parameters + min_score: Optional[float] = None, + 
max_score: Optional[float] = None, + pass_threshold: Optional[float] = None, + # Compare-specific parameters (model_a and model_b handled below) + # Common optional parameters + model_a: Optional[Union[str, Dict[str, Any]]] = None, + model_b: Optional[Union[str, Dict[str, Any]]] = None, + model_to_evaluate: Optional[Union[str, Dict[str, Any]]] = None, + ) -> EvaluationCreateResponse: + """ + Create a new evaluation job. + + Args: + type: The type of evaluation ("classify", "score", or "compare") + judge_model_name: Name of the judge model + judge_system_template: System template for the judge + input_data_file_path: Path to input data file + labels: List of classification labels (required for classify) + pass_labels: List of labels considered as passing (required for classify) + min_score: Minimum score value (required for score) + max_score: Maximum score value (required for score) + pass_threshold: Threshold score for passing (required for score) + model_to_evaluate: Model to evaluate for classify/score types + model_a: Model A for compare type + model_b: Model B for compare type + + Returns: + EvaluationCreateResponse with workflow_id and status + """ + requestor = api_requestor.APIRequestor( + client=self._client, + ) + + # Build judge config + judge_config = JudgeModelConfig( + model_name=judge_model_name, + system_template=judge_system_template, + ) + parameters: Union[ClassifyParameters, ScoreParameters, CompareParameters] + # Build parameters based on type + if type == "classify": + if labels is None or pass_labels is None: + raise ValueError( + "labels and pass_labels are required for classify evaluation" + ) + + # Validate that no score-specific parameters are provided + if any( + [ + min_score is not None, + max_score is not None, + pass_threshold is not None, + ] + ): + raise ValueError( + "min_score, max_score, and pass_threshold parameters are exclusive to the score mode" + ) + + # Validate that no compare-specific parameters are provided + if any([model_a is not None, model_b is not None]): + raise ValueError( + "model_a and model_b parameters are exclusive to the compare mode" + ) + + parameters = ClassifyParameters( + judge=judge_config, + labels=labels, + pass_labels=pass_labels, + input_data_file_path=input_data_file_path, + ) + + # Handle model_to_evaluate + if model_to_evaluate is not None: + if isinstance(model_to_evaluate, str): + parameters.model_to_evaluate = model_to_evaluate + elif isinstance(model_to_evaluate, dict): + # Validate that all required fields are present for model config + required_fields = [ + "model_name", + "max_tokens", + "temperature", + "system_template", + "input_template", + ] + missing_fields = [ + field + for field in required_fields + if field not in model_to_evaluate + ] + if missing_fields: + raise ValueError( + f"All model config parameters are required when using detailed configuration. 
" + f"Missing: {', '.join(missing_fields)}" + ) + parameters.model_to_evaluate = ModelRequest(**model_to_evaluate) + + elif type == "score": + if min_score is None or max_score is None or pass_threshold is None: + raise ValueError( + "min_score, max_score, and pass_threshold are required for score evaluation" + ) + + # Validate that no classify-specific parameters are provided + if any([labels is not None, pass_labels is not None]): + raise ValueError( + "labels and pass_labels parameters are exclusive to the classify mode" + ) + + # Validate that no compare-specific parameters are provided + if any([model_a is not None, model_b is not None]): + raise ValueError( + "model_a and model_b parameters are exclusive to the compare mode" + ) + + parameters = ScoreParameters( + judge=judge_config, + min_score=min_score, + max_score=max_score, + pass_threshold=pass_threshold, + input_data_file_path=input_data_file_path, + ) + + # Handle model_to_evaluate + if model_to_evaluate is not None: + if isinstance(model_to_evaluate, str): + parameters.model_to_evaluate = model_to_evaluate + elif isinstance(model_to_evaluate, dict): + # Validate that all required fields are present for model config + required_fields = [ + "model_name", + "max_tokens", + "temperature", + "system_template", + "input_template", + ] + missing_fields = [ + field + for field in required_fields + if field not in model_to_evaluate + ] + if missing_fields: + raise ValueError( + f"All model config parameters are required when using detailed configuration. " + f"Missing: {', '.join(missing_fields)}" + ) + parameters.model_to_evaluate = ModelRequest(**model_to_evaluate) + + elif type == "compare": + # Validate that model_a and model_b are provided + if model_a is None or model_b is None: + raise ValueError( + "model_a and model_b parameters are required for compare evaluation" + ) + + # Validate that no classify-specific parameters are provided + if any([labels is not None, pass_labels is not None]): + raise ValueError( + "labels and pass_labels parameters are exclusive to the classify mode" + ) + + # Validate that no score-specific parameters are provided + if any( + [ + min_score is not None, + max_score is not None, + pass_threshold is not None, + ] + ): + raise ValueError( + "min_score, max_score, and pass_threshold parameters are exclusive to the score mode" + ) + + # Validate that model_to_evaluate is not provided + if model_to_evaluate is not None: + raise ValueError( + "model_to_evaluate parameter is exclusive to classify and score modes" + ) + + parameters = CompareParameters( + judge=judge_config, + input_data_file_path=input_data_file_path, + ) + + # Handle model_a + if isinstance(model_a, str): + parameters.model_a = model_a + elif isinstance(model_a, dict): + # Validate that all required fields are present for model config + required_fields = [ + "model_name", + "max_tokens", + "temperature", + "system_template", + "input_template", + ] + missing_fields = [ + field for field in required_fields if field not in model_a + ] + if missing_fields: + raise ValueError( + f"All model config parameters are required for model_a when using detailed configuration. 
" + f"Missing: {', '.join(missing_fields)}" + ) + parameters.model_a = ModelRequest(**model_a) + + # Handle model_b + if isinstance(model_b, str): + parameters.model_b = model_b + elif isinstance(model_b, dict): + # Validate that all required fields are present for model config + required_fields = [ + "model_name", + "max_tokens", + "temperature", + "system_template", + "input_template", + ] + missing_fields = [ + field for field in required_fields if field not in model_b + ] + if missing_fields: + raise ValueError( + f"All model config parameters are required for model_b when using detailed configuration. " + f"Missing: {', '.join(missing_fields)}" + ) + parameters.model_b = ModelRequest(**model_b) + + else: + raise ValueError( + f"Invalid evaluation type: {type}. Must be 'classify', 'score', or 'compare'" + ) + + payload = { + "type": type, + "parameters": parameters.model_dump(), + } + + response, _, _ = requestor.request( + options=TogetherRequest( + method="POST", + url="evaluation", + params=payload, + ), + stream=False, + ) + + assert isinstance(response, TogetherResponse) + return EvaluationCreateResponse(**response.data) + + def list( + self, + status: Optional[str] = None, + limit: Optional[int] = None, + ) -> List[EvaluationJob]: + """ + List evaluation jobs. + + Args: + status: Optional filter by job status + limit: Optional limit on number of results (max 100) + + Returns: + List of EvaluationJob objects + """ + requestor = api_requestor.APIRequestor( + client=self._client, + ) + + params: Dict[str, Any] = {} + if status is not None: + params["status"] = status + if limit is not None: + params["limit"] = limit + + response, _, _ = requestor.request( + options=TogetherRequest( + method="GET", + url="evaluations", + params=params if params else None, + ), + stream=False, + ) + + assert isinstance(response, TogetherResponse) + jobs = response.data or [] + return [EvaluationJob(**job) for job in jobs] + + def retrieve(self, evaluation_id: str) -> EvaluationJob: + """ + Get details of a specific evaluation job. + + Args: + evaluation_id: The workflow ID of the evaluation job + + Returns: + EvaluationJob object with full details + """ + requestor = api_requestor.APIRequestor( + client=self._client, + ) + + response, _, _ = requestor.request( + options=TogetherRequest( + method="GET", + url=f"evaluation/{evaluation_id}", + ), + stream=False, + ) + + assert isinstance(response, TogetherResponse) + return EvaluationJob(**response.data) + + def status(self, evaluation_id: str) -> EvaluationStatusResponse: + """ + Get the status and results of a specific evaluation job. 
+ + Args: + evaluation_id: The workflow ID of the evaluation job + + Returns: + EvaluationStatusResponse with status and results + """ + requestor = api_requestor.APIRequestor( + client=self._client, + ) + + response, _, _ = requestor.request( + options=TogetherRequest( + method="GET", + url=f"evaluation/{evaluation_id}/status", + ), + stream=False, + ) + + assert isinstance(response, TogetherResponse) + return EvaluationStatusResponse(**response.data) + + +class AsyncEvaluation: + def __init__(self, client: TogetherClient) -> None: + self._client = client + + async def create( + self, + type: str, + judge_model_name: str, + judge_system_template: str, + input_data_file_path: str, + # Classify-specific parameters + labels: Optional[List[str]] = None, + pass_labels: Optional[List[str]] = None, + # Score-specific parameters + min_score: Optional[float] = None, + max_score: Optional[float] = None, + pass_threshold: Optional[float] = None, + # Compare-specific parameters (model_a and model_b handled below) + # Common optional parameters + model_to_evaluate: Optional[Union[str, Dict[str, Any]]] = None, + model_a: Optional[Union[str, Dict[str, Any]]] = None, + model_b: Optional[Union[str, Dict[str, Any]]] = None, + ) -> EvaluationCreateResponse: + """ + Create a new evaluation job. + + Args: + type: The type of evaluation ("classify", "score", or "compare") + judge_model_name: Name of the judge model + judge_system_template: System template for the judge + input_data_file_path: Path to input data file + labels: List of classification labels (required for classify) + pass_labels: List of labels considered as passing (required for classify) + min_score: Minimum score value (required for score) + max_score: Maximum score value (required for score) + pass_threshold: Threshold score for passing (required for score) + model_to_evaluate: Model to evaluate for classify/score types + model_a: Model A for compare type + model_b: Model B for compare type + + Returns: + EvaluationCreateResponse with workflow_id and status + """ + requestor = api_requestor.APIRequestor( + client=self._client, + ) + + # Build judge config + judge_config = JudgeModelConfig( + model_name=judge_model_name, + system_template=judge_system_template, + ) + parameters: Union[ClassifyParameters, ScoreParameters, CompareParameters] + # Build parameters based on type + if type == "classify": + if labels is None or pass_labels is None: + raise ValueError( + "labels and pass_labels are required for classify evaluation" + ) + + # Validate that no score-specific parameters are provided + if any( + [ + min_score is not None, + max_score is not None, + pass_threshold is not None, + ] + ): + raise ValueError( + "min_score, max_score, and pass_threshold parameters are exclusive to the score mode" + ) + + # Validate that no compare-specific parameters are provided + if any([model_a is not None, model_b is not None]): + raise ValueError( + "model_a and model_b parameters are exclusive to the compare mode" + ) + + parameters = ClassifyParameters( + judge=judge_config, + labels=labels, + pass_labels=pass_labels, + input_data_file_path=input_data_file_path, + ) + + # Handle model_to_evaluate + if model_to_evaluate is not None: + if isinstance(model_to_evaluate, str): + parameters.model_to_evaluate = model_to_evaluate + elif isinstance(model_to_evaluate, dict): + # Validate that all required fields are present for model config + required_fields = [ + "model_name", + "max_tokens", + "temperature", + "system_template", + "input_template", + ] + 
missing_fields = [ + field + for field in required_fields + if field not in model_to_evaluate + ] + if missing_fields: + raise ValueError( + f"All model config parameters are required when using detailed configuration. " + f"Missing: {', '.join(missing_fields)}" + ) + parameters.model_to_evaluate = ModelRequest(**model_to_evaluate) + + elif type == "score": + if min_score is None or max_score is None or pass_threshold is None: + raise ValueError( + "min_score, max_score, and pass_threshold are required for score evaluation" + ) + + # Validate that no classify-specific parameters are provided + if any([labels is not None, pass_labels is not None]): + raise ValueError( + "labels and pass_labels parameters are exclusive to the classify mode" + ) + + # Validate that no compare-specific parameters are provided + if any([model_a is not None, model_b is not None]): + raise ValueError( + "model_a and model_b parameters are exclusive to the compare mode" + ) + + parameters = ScoreParameters( + judge=judge_config, + min_score=min_score, + max_score=max_score, + pass_threshold=pass_threshold, + input_data_file_path=input_data_file_path, + ) + + # Handle model_to_evaluate + if model_to_evaluate is not None: + if isinstance(model_to_evaluate, str): + parameters.model_to_evaluate = model_to_evaluate + elif isinstance(model_to_evaluate, dict): + # Validate that all required fields are present for model config + required_fields = [ + "model_name", + "max_tokens", + "temperature", + "system_template", + "input_template", + ] + missing_fields = [ + field + for field in required_fields + if field not in model_to_evaluate + ] + if missing_fields: + raise ValueError( + f"All model config parameters are required when using detailed configuration. " + f"Missing: {', '.join(missing_fields)}" + ) + parameters.model_to_evaluate = ModelRequest(**model_to_evaluate) + + elif type == "compare": + parameters = CompareParameters( + judge=judge_config, + input_data_file_path=input_data_file_path, + ) + + # Validate that model_a and model_b are provided + if model_a is None or model_b is None: + raise ValueError( + "model_a and model_b parameters are required for compare evaluation" + ) + + # Validate that no classify-specific parameters are provided + if any([labels is not None, pass_labels is not None]): + raise ValueError( + "labels and pass_labels parameters are exclusive to the classify mode" + ) + + # Validate that no score-specific parameters are provided + if any( + [ + min_score is not None, + max_score is not None, + pass_threshold is not None, + ] + ): + raise ValueError( + "min_score, max_score, and pass_threshold parameters are exclusive to the score mode" + ) + + # Validate that model_to_evaluate is not provided + if model_to_evaluate is not None: + raise ValueError( + "model_to_evaluate parameter is exclusive to classify and score modes" + ) + + # Handle model_a + if isinstance(model_a, str): + parameters.model_a = model_a + elif isinstance(model_a, dict): + # Validate that all required fields are present for model config + required_fields = [ + "model_name", + "max_tokens", + "temperature", + "system_template", + "input_template", + ] + missing_fields = [ + field for field in required_fields if field not in model_a + ] + if missing_fields: + raise ValueError( + f"All model config parameters are required for model_a when using detailed configuration. 
" + f"Missing: {', '.join(missing_fields)}" + ) + parameters.model_a = ModelRequest(**model_a) + + # Handle model_b + if isinstance(model_b, str): + parameters.model_b = model_b + elif isinstance(model_b, dict): + # Validate that all required fields are present for model config + required_fields = [ + "model_name", + "max_tokens", + "temperature", + "system_template", + "input_template", + ] + missing_fields = [ + field for field in required_fields if field not in model_b + ] + if missing_fields: + raise ValueError( + f"All model config parameters are required for model_b when using detailed configuration. " + f"Missing: {', '.join(missing_fields)}" + ) + parameters.model_b = ModelRequest(**model_b) + + else: + raise ValueError( + f"Invalid evaluation type: {type}. Must be 'classify', 'score', or 'compare'" + ) + + payload = { + "type": type, + "parameters": parameters.model_dump(), + } + + response, _, _ = await requestor.arequest( + options=TogetherRequest( + method="POST", + url="evaluation", + params=payload, + ), + stream=False, + ) + + assert isinstance(response, TogetherResponse) + return EvaluationCreateResponse(**response.data) + + async def list( + self, + status: Optional[str] = None, + limit: Optional[int] = None, + ) -> List[EvaluationJob]: + """ + List evaluation jobs. + + Args: + status: Optional filter by job status + limit: Optional limit on number of results (max 100) + + Returns: + List of EvaluationJob objects + """ + requestor = api_requestor.APIRequestor( + client=self._client, + ) + + params: Dict[str, Any] = {} + if status is not None: + params["status"] = status + if limit is not None: + params["limit"] = limit + + response, _, _ = await requestor.arequest( + options=TogetherRequest( + method="GET", + url="evaluations", + params=params if params else None, + ), + stream=False, + ) + + assert isinstance(response, TogetherResponse) + jobs = response.data or [] + return [EvaluationJob(**job) for job in jobs] + + async def retrieve(self, evaluation_id: str) -> EvaluationJob: + """ + Get details of a specific evaluation job. + + Args: + evaluation_id: The workflow ID of the evaluation job + + Returns: + EvaluationJob object with full details + """ + requestor = api_requestor.APIRequestor( + client=self._client, + ) + + response, _, _ = await requestor.arequest( + options=TogetherRequest( + method="GET", + url=f"evaluation/{evaluation_id}", + ), + stream=False, + ) + + assert isinstance(response, TogetherResponse) + return EvaluationJob(**response.data) + + async def status(self, evaluation_id: str) -> EvaluationStatusResponse: + """ + Get the status and results of a specific evaluation job. 
+ + Args: + evaluation_id: The workflow ID of the evaluation job + + Returns: + EvaluationStatusResponse with status and results + """ + requestor = api_requestor.APIRequestor( + client=self._client, + ) + + response, _, _ = await requestor.arequest( + options=TogetherRequest( + method="GET", + url=f"evaluation/{evaluation_id}/status", + ), + stream=False, + ) + + assert isinstance(response, TogetherResponse) + return EvaluationStatusResponse(**response.data) diff --git a/src/together/types/__init__.py b/src/together/types/__init__.py index 6e89baff..325e15a7 100644 --- a/src/together/types/__init__.py +++ b/src/together/types/__init__.py @@ -61,6 +61,19 @@ from together.types.models import ModelObject from together.types.rerank import RerankRequest, RerankResponse from together.types.batch import BatchJob, BatchJobStatus, BatchEndpoint +from together.types.evaluation import ( + EvaluationType, + EvaluationStatus, + JudgeModelConfig, + ModelRequest, + ClassifyParameters, + ScoreParameters, + CompareParameters, + EvaluationRequest, + EvaluationCreateResponse, + EvaluationJob, + EvaluationStatusResponse, +) __all__ = [ @@ -124,4 +137,15 @@ "BatchJob", "BatchJobStatus", "BatchEndpoint", + "EvaluationType", + "EvaluationStatus", + "JudgeModelConfig", + "ModelRequest", + "ClassifyParameters", + "ScoreParameters", + "CompareParameters", + "EvaluationRequest", + "EvaluationCreateResponse", + "EvaluationJob", + "EvaluationStatusResponse", ] diff --git a/src/together/types/evaluation.py b/src/together/types/evaluation.py new file mode 100644 index 00000000..2c30ed58 --- /dev/null +++ b/src/together/types/evaluation.py @@ -0,0 +1,87 @@ +from __future__ import annotations + +from datetime import datetime +from enum import Enum +from typing import Any, Dict, List, Optional, Union + +from pydantic import BaseModel, Field + + +class EvaluationType(str, Enum): + CLASSIFY = "classify" + SCORE = "score" + COMPARE = "compare" + + +class EvaluationStatus(str, Enum): + PENDING = "pending" + QUEUED = "queued" + RUNNING = "running" + COMPLETED = "completed" + ERROR = "error" + USER_ERROR = "user_error" + + +class JudgeModelConfig(BaseModel): + model_name: str + system_template: str + + +class ModelRequest(BaseModel): + model_name: str + max_tokens: int + temperature: float + system_template: str + input_template: str + + +class ClassifyParameters(BaseModel): + judge: JudgeModelConfig + labels: List[str] + pass_labels: List[str] + model_to_evaluate: Optional[Union[str, ModelRequest]] = None + input_data_file_path: str + + +class ScoreParameters(BaseModel): + judge: JudgeModelConfig + min_score: float + max_score: float + pass_threshold: float + model_to_evaluate: Optional[Union[str, ModelRequest]] = None + input_data_file_path: str + + +class CompareParameters(BaseModel): + judge: JudgeModelConfig + model_a: Optional[Union[str, ModelRequest]] = None + model_b: Optional[Union[str, ModelRequest]] = None + input_data_file_path: str + + +class EvaluationRequest(BaseModel): + type: EvaluationType + parameters: Union[ClassifyParameters, ScoreParameters, CompareParameters] + + +class EvaluationCreateResponse(BaseModel): + workflow_id: str + status: EvaluationStatus + + +class EvaluationJob(BaseModel): + workflow_id: str = Field(alias="workflow_id") + type: Optional[EvaluationType] = None + status: EvaluationStatus + results: Optional[Dict[str, Any]] = None + parameters: Optional[Dict[str, Any]] = None + created_at: Optional[datetime] = None + updated_at: Optional[datetime] = None + + class Config: + populate_by_name = 
True + + +class EvaluationStatusResponse(BaseModel): + status: EvaluationStatus + results: Optional[Dict[str, Any]] = None diff --git a/src/together/types/files.py b/src/together/types/files.py index 59acd8c6..6e3b36be 100644 --- a/src/together/types/files.py +++ b/src/together/types/files.py @@ -14,11 +14,13 @@ class FilePurpose(str, Enum): FineTune = "fine-tune" BatchAPI = "batch-api" + Eval = "eval" class FileType(str, Enum): jsonl = "jsonl" parquet = "parquet" + csv = "csv" class FileRequest(BaseModel): diff --git a/src/together/utils/files.py b/src/together/utils/files.py index f3cc690d..781dd518 100644 --- a/src/together/utils/files.py +++ b/src/together/utils/files.py @@ -2,6 +2,7 @@ import json import os +import csv from pathlib import Path from traceback import format_exc from typing import Any, Dict, List @@ -17,6 +18,7 @@ POSSIBLE_ROLES_CONVERSATION, DatasetFormat, ) +from together.types import FilePurpose class InvalidFileFormatError(ValueError): @@ -36,6 +38,7 @@ def __init__( def check_file( file: Path | str, + purpose: FilePurpose | str = FilePurpose.FineTune, ) -> Dict[str, Any]: if not isinstance(file, Path): file = Path(file) @@ -52,6 +55,7 @@ def check_file( "has_min_samples": None, "num_samples": None, "load_json": None, + "load_csv": None, } if not file.is_file(): @@ -79,10 +83,13 @@ def check_file( data_report_dict = {} if file.suffix == ".jsonl": report_dict["filetype"] = "jsonl" - data_report_dict = _check_jsonl(file) + data_report_dict = _check_jsonl(file, purpose) elif file.suffix == ".parquet": report_dict["filetype"] = "parquet" - data_report_dict = _check_parquet(file) + data_report_dict = _check_parquet(file, purpose) + elif file.suffix == ".csv": + report_dict["filetype"] = "csv" + data_report_dict = _check_csv(file, purpose) else: report_dict["filetype"] = ( f"Unknown extension of file {file}. " @@ -229,9 +236,15 @@ def validate_preference_openai(example: Dict[str, Any], idx: int = 0) -> None: validate_messages(example["non_preferred_output"], idx) -def _check_jsonl(file: Path) -> Dict[str, Any]: +def _check_utf8(file: Path) -> Dict[str, Any]: + """Check if the file is UTF-8 encoded. + + Args: + file (Path): Path to the file to check. + Returns: + Dict[str, Any]: A dictionary with the results of the check. + """ report_dict: Dict[str, Any] = {} - # Check that the file is UTF-8 encoded. If not report where the error occurs. try: with file.open(encoding="utf-8") as f: f.read() @@ -240,6 +253,99 @@ def _check_jsonl(file: Path) -> Dict[str, Any]: report_dict["utf8"] = False report_dict["message"] = f"File is not UTF-8 encoded. Error raised: {e}." report_dict["is_check_passed"] = False + return report_dict + + +def _check_samples_count( + file: Path, report_dict: Dict[str, Any], idx: int +) -> Dict[str, Any]: + if idx + 1 < MIN_SAMPLES: + report_dict["has_min_samples"] = False + report_dict["message"] = ( + f"Processing {file} resulted in only {idx + 1} samples. " + f"Our minimum is {MIN_SAMPLES} samples. " + ) + report_dict["is_check_passed"] = False + else: + report_dict["num_samples"] = idx + 1 + report_dict["has_min_samples"] = True + + return report_dict + + +def _check_csv(file: Path, purpose: FilePurpose | str) -> Dict[str, Any]: + """Check if the file is a valid CSV file. + + Args: + file (Path): Path to the file to check. + purpose (FilePurpose | str): Purpose of the file, used to determine if the file should be checked for specific columns. + + Returns: + Dict[str, Any]: A dictionary with the results of the check. 
+ """ + report_dict: Dict[str, Any] = {} + if purpose != FilePurpose.Eval: + report_dict["is_check_passed"] = False + report_dict["message"] = ( + f"CSV files are not supported for {purpose}. " + "Only JSONL and Parquet files are supported." + ) + return report_dict + + report_dict.update(_check_utf8(file)) + + if not report_dict["utf8"]: + return report_dict + + with file.open() as f: + reader = csv.DictReader(f) + if not reader.fieldnames: + report_dict["message"] = "CSV file is empty or has no header." + report_dict["is_check_passed"] = False + return report_dict + idx = -1 + + try: + # for loop to iterate through the CSV rows + for idx, item in enumerate(reader): + if None in item.keys() or None in item.values(): + raise InvalidFileFormatError( + message=f"CSV file is malformed or the number of columns found on line {idx + 1} is inconsistent with the header", + line_number=idx + 1, + error_source="format", + ) + + report_dict.update(_check_samples_count(file, report_dict, idx)) + report_dict["load_csv"] = True + + except InvalidFileFormatError as e: + report_dict["load_csv"] = False + report_dict["is_check_passed"] = False + report_dict["message"] = e.message + if e.line_number is not None: + report_dict["line_number"] = e.line_number + if e.error_source is not None: + report_dict[e.error_source] = False + except ValueError: + report_dict["load_csv"] = False + if idx < 0: + report_dict["message"] = ( + "Unable to decode file. " + "File may be empty or in an unsupported format. " + ) + else: + report_dict["message"] = ( + f"Error parsing the CSV file. Unexpected format on line {idx + 1}." + ) + report_dict["is_check_passed"] = False + + return report_dict + + +def _check_jsonl(file: Path, purpose: FilePurpose | str) -> Dict[str, Any]: + report_dict: Dict[str, Any] = {} + report_dict.update(_check_utf8(file)) + if not report_dict["utf8"]: return report_dict dataset_format = None @@ -259,84 +365,75 @@ def _check_jsonl(file: Path) -> Dict[str, Any]: line_number=idx + 1, error_source="line_type", ) - - current_format = None - for possible_format in JSONL_REQUIRED_COLUMNS_MAP: - if all( - column in json_line - for column in JSONL_REQUIRED_COLUMNS_MAP[possible_format] - ): - if current_format is None: - current_format = possible_format - elif current_format != possible_format: - raise InvalidFileFormatError( - message="Found multiple dataset formats in the input file. " - f"Got {current_format} and {possible_format} on line {idx + 1}.", - line_number=idx + 1, - error_source="format", - ) - - # Check that there are no extra columns - for column in json_line: - if ( - column - not in JSONL_REQUIRED_COLUMNS_MAP[possible_format] - ): + # In evals, we don't check the format of the dataset. + if purpose != FilePurpose.Eval: + current_format = None + for possible_format in JSONL_REQUIRED_COLUMNS_MAP: + if all( + column in json_line + for column in JSONL_REQUIRED_COLUMNS_MAP[possible_format] + ): + if current_format is None: + current_format = possible_format + elif current_format != possible_format: raise InvalidFileFormatError( - message=f'Found extra column "{column}" in the line {idx + 1}.', + message="Found multiple dataset formats in the input file. " + f"Got {current_format} and {possible_format} on line {idx + 1}.", line_number=idx + 1, error_source="format", ) - if current_format is None: - raise InvalidFileFormatError( - message=( - f"Error parsing file. 
Could not detect a format for the line {idx + 1} with the columns:\n" - f"{json_line.keys()}" - ), - line_number=idx + 1, - error_source="format", - ) - if current_format == DatasetFormat.PREFERENCE_OPENAI: - validate_preference_openai(json_line, idx) - elif current_format == DatasetFormat.CONVERSATION: - message_column = JSONL_REQUIRED_COLUMNS_MAP[ - DatasetFormat.CONVERSATION - ][0] - validate_messages(json_line[message_column], idx) - else: - for column in JSONL_REQUIRED_COLUMNS_MAP[current_format]: - if not isinstance(json_line[column], str): - raise InvalidFileFormatError( - message=f'Invalid value type for "{column}" key on line {idx + 1}. ' - f"Expected string. Found {type(json_line[column])}.", - line_number=idx + 1, - error_source="key_value", - ) - - if dataset_format is None: - dataset_format = current_format - elif current_format is not None: - if current_format != dataset_format: + # Check that there are no extra columns + for column in json_line: + if ( + column + not in JSONL_REQUIRED_COLUMNS_MAP[possible_format] + ): + raise InvalidFileFormatError( + message=f'Found extra column "{column}" in the line {idx + 1}.', + line_number=idx + 1, + error_source="format", + ) + + if current_format is None: raise InvalidFileFormatError( - message="All samples in the dataset must have the same dataset format. " - f"Got {dataset_format} for the first line and {current_format} " - f"for the line {idx + 1}.", + message=( + f"Error parsing file. Could not detect a format for the line {idx + 1} with the columns:\n" + f"{json_line.keys()}" + ), line_number=idx + 1, error_source="format", ) + if current_format == DatasetFormat.PREFERENCE_OPENAI: + validate_preference_openai(json_line, idx) + elif current_format == DatasetFormat.CONVERSATION: + message_column = JSONL_REQUIRED_COLUMNS_MAP[ + DatasetFormat.CONVERSATION + ][0] + validate_messages(json_line[message_column], idx) + else: + for column in JSONL_REQUIRED_COLUMNS_MAP[current_format]: + if not isinstance(json_line[column], str): + raise InvalidFileFormatError( + message=f'Invalid value type for "{column}" key on line {idx + 1}. ' + f"Expected string. Found {type(json_line[column])}.", + line_number=idx + 1, + error_source="key_value", + ) - if idx + 1 < MIN_SAMPLES: - report_dict["has_min_samples"] = False - report_dict["message"] = ( - f"Processing {file} resulted in only {idx + 1} samples. " - f"Our minimum is {MIN_SAMPLES} samples. " - ) - report_dict["is_check_passed"] = False - else: - report_dict["num_samples"] = idx + 1 - report_dict["has_min_samples"] = True - report_dict["is_check_passed"] = True + if dataset_format is None: + dataset_format = current_format + elif current_format is not None: + if current_format != dataset_format: + raise InvalidFileFormatError( + message="All samples in the dataset must have the same dataset format. " + f"Got {dataset_format} for the first line and {current_format} " + f"for the line {idx + 1}.", + line_number=idx + 1, + error_source="format", + ) + + report_dict.update(_check_samples_count(file, report_dict, idx)) report_dict["load_json"] = True @@ -370,7 +467,7 @@ def _check_jsonl(file: Path) -> Dict[str, Any]: return report_dict -def _check_parquet(file: Path) -> Dict[str, Any]: +def _check_parquet(file: Path, purpose: FilePurpose | str) -> Dict[str, Any]: try: # Pyarrow is optional as it's large (~80MB) and isn't compatible with older systems. 
from pyarrow import ArrowInvalid, parquet @@ -380,6 +477,13 @@ def _check_parquet(file: Path) -> Dict[str, Any]: ) report_dict: Dict[str, Any] = {} + if purpose == FilePurpose.Eval: + report_dict["is_check_passed"] = False + report_dict["message"] = ( + f"Parquet files are not supported for {purpose}. " + "Only JSONL and CSV files are supported." + ) + return report_dict try: table = parquet.read_table(str(file), memory_map=True) @@ -399,6 +503,7 @@ def _check_parquet(file: Path) -> Dict[str, Any]: report_dict["is_check_passed"] = False return report_dict + # Don't check for eval for column_name in column_names: if column_name not in PARQUET_EXPECTED_COLUMNS: report_dict["load_parquet"] = ( diff --git a/tests/unit/test_code_interpreter.py b/tests/unit/test_code_interpreter.py index 19a1c48c..bf1120cd 100644 --- a/tests/unit/test_code_interpreter.py +++ b/tests/unit/test_code_interpreter.py @@ -331,7 +331,6 @@ def test_code_interpreter_session_management(mocker): def test_code_interpreter_run_with_files(mocker): - mock_requestor = mocker.MagicMock() response_data = { "data": { diff --git a/tests/unit/test_evaluation_resources.py b/tests/unit/test_evaluation_resources.py new file mode 100644 index 00000000..e0936284 --- /dev/null +++ b/tests/unit/test_evaluation_resources.py @@ -0,0 +1,414 @@ +from __future__ import annotations + +from unittest.mock import patch + +import pytest + +from together import Together, AsyncTogether +from together.together_response import TogetherResponse +from together.types.evaluation import ( + EvaluationCreateResponse, + EvaluationJob, + EvaluationStatus, + EvaluationStatusResponse, +) + + +class TestEvaluation: + @pytest.fixture + def sync_together_instance(self) -> Together: + with patch.dict("os.environ", {"TOGETHER_API_KEY": "fake_api_key"}, clear=True): + return Together() + + @pytest.fixture + def async_together_instance(self) -> AsyncTogether: + with patch.dict("os.environ", {"TOGETHER_API_KEY": "fake_api_key"}, clear=True): + return AsyncTogether() + + def test_create_classify_evaluation(self, mocker, sync_together_instance): + # Mock the API requestor + mock_requestor = mocker.MagicMock() + response_data = {"workflow_id": "eval_123456", "status": "pending"} + mock_headers = {"x-together-request-id": "req_123"} + mock_response = TogetherResponse(data=response_data, headers=mock_headers) + mock_requestor.request.return_value = (mock_response, None, None) + mocker.patch( + "together.abstract.api_requestor.APIRequestor", return_value=mock_requestor + ) + + # Test classify evaluation creation + result = sync_together_instance.evaluation.create( + type="classify", + judge_model_name="meta-llama/Llama-3.2-11B-Vision-Instruct-Turbo", + judge_system_template="You are a helpful assistant", + input_data_file_path="file_123", + labels=["accurate", "inaccurate"], + pass_labels=["accurate"], + model_to_evaluate="meta-llama/Llama-3.2-90B-Vision-Instruct-Turbo", + ) + + # Verify the request + mock_requestor.request.assert_called_once() + call_args = mock_requestor.request.call_args[1]["options"] + assert call_args.method == "POST" + assert call_args.url == "evaluation" + assert call_args.params["type"] == "classify" + + # Verify parameters structure + params = call_args.params["parameters"] + assert ( + params["judge"]["model_name"] + == "meta-llama/Llama-3.2-11B-Vision-Instruct-Turbo" + ) + assert params["judge"]["system_template"] == "You are a helpful assistant" + assert params["labels"] == ["accurate", "inaccurate"] + assert params["pass_labels"] == ["accurate"] + 
assert params["input_data_file_path"] == "file_123" + assert ( + params["model_to_evaluate"] + == "meta-llama/Llama-3.2-90B-Vision-Instruct-Turbo" + ) + + # Verify response + assert isinstance(result, EvaluationCreateResponse) + assert result.workflow_id == "eval_123456" + assert result.status == EvaluationStatus.PENDING + + def test_create_score_evaluation(self, mocker, sync_together_instance): + # Mock the API requestor + mock_requestor = mocker.MagicMock() + response_data = {"workflow_id": "eval_789012", "status": "queued"} + mock_headers = {"x-together-request-id": "req_456"} + mock_response = TogetherResponse(data=response_data, headers=mock_headers) + mock_requestor.request.return_value = (mock_response, None, None) + mocker.patch( + "together.abstract.api_requestor.APIRequestor", return_value=mock_requestor + ) + + # Test score evaluation creation with ModelRequest + model_request = { + "model_name": "meta-llama/Llama-3.2-90B-Vision-Instruct-Turbo", + "max_tokens": 512, + "temperature": 0.7, + "system_template": "You are an assistant", + "input_template": "Question: {input}", + } + + result = sync_together_instance.evaluation.create( + type="score", + judge_model_name="meta-llama/Llama-3.2-11B-Vision-Instruct-Turbo", + judge_system_template="Rate the response", + input_data_file_path="file_456", + min_score=0.0, + max_score=10.0, + pass_threshold=7.0, + model_to_evaluate=model_request, + ) + + # Verify the request + mock_requestor.request.assert_called_once() + call_args = mock_requestor.request.call_args[1]["options"] + params = call_args.params["parameters"] + + assert params["min_score"] == 0.0 + assert params["max_score"] == 10.0 + assert params["pass_threshold"] == 7.0 + assert ( + params["model_to_evaluate"]["model_name"] + == "meta-llama/Llama-3.2-90B-Vision-Instruct-Turbo" + ) + assert params["model_to_evaluate"]["max_tokens"] == 512 + + # Verify response + assert result.workflow_id == "eval_789012" + assert result.status == EvaluationStatus.QUEUED + + def test_create_compare_evaluation(self, mocker, sync_together_instance): + # Mock the API requestor + mock_requestor = mocker.MagicMock() + response_data = {"workflow_id": "eval_345678", "status": "running"} + mock_headers = {"x-together-request-id": "req_789"} + mock_response = TogetherResponse(data=response_data, headers=mock_headers) + mock_requestor.request.return_value = (mock_response, None, None) + mocker.patch( + "together.abstract.api_requestor.APIRequestor", return_value=mock_requestor + ) + + # Test compare evaluation creation + result = sync_together_instance.evaluation.create( + type="compare", + judge_model_name="meta-llama/Llama-3.2-11B-Vision-Instruct-Turbo", + judge_system_template="Compare the two responses", + input_data_file_path="file_789", + model_a="model-a-name", + model_b="model-b-name", + ) + + # Verify the request + call_args = mock_requestor.request.call_args[1]["options"] + params = call_args.params["parameters"] + print(params) + assert params["model_a"] == "model-a-name" + assert params["model_b"] == "model-b-name" + + # Verify response + assert result.workflow_id == "eval_345678" + assert result.status == EvaluationStatus.RUNNING + + def test_create_evaluation_missing_required_params(self, sync_together_instance): + # Test missing labels for classify + with pytest.raises( + ValueError, + match="labels are required for classify evaluation", + ): + sync_together_instance.evaluation.create( + type="classify", + judge_model_name="judge-model", + judge_system_template="template", + 
input_data_file_path="file_123", + model_to_evaluate="asdfg", + ) + + # Test missing score params + with pytest.raises( + ValueError, + match="min_score, max_score, and pass_threshold are required for score evaluation", + ): + sync_together_instance.evaluation.create( + type="score", + judge_model_name="judge-model", + judge_system_template="template", + input_data_file_path="file_123", + model_to_evaluate="asdfg", + ) + + # Test invalid type + with pytest.raises(ValueError, match="Invalid evaluation type"): + sync_together_instance.evaluation.create( + type="invalid_type", + judge_model_name="judge-model", + judge_system_template="template", + input_data_file_path="file_123", + model_to_evaluate="asdfg", + ) + + def test_list_evaluations(self, mocker, sync_together_instance): + # Mock the API requestor + mock_requestor = mocker.MagicMock() + response_data = [ + { + "workflow_id": "eval_1", + "type": "classify", + "status": "completed", + "results": {"accuracy": 0.95}, + "created_at": "2024-01-01T00:00:00Z", + }, + { + "workflow_id": "eval_2", + "type": "score", + "status": "running", + "created_at": "2024-01-02T00:00:00Z", + }, + ] + mock_headers = {"x-together-request-id": "req_list"} + mock_response = TogetherResponse(data=response_data, headers=mock_headers) + mock_requestor.request.return_value = (mock_response, None, None) + mocker.patch( + "together.abstract.api_requestor.APIRequestor", return_value=mock_requestor + ) + + # Test list without filters + result = sync_together_instance.evaluation.list() + + # Verify the request + call_args = mock_requestor.request.call_args[1]["options"] + assert call_args.method == "GET" + assert call_args.url == "evaluations" + assert call_args.params is None + + # Verify response + assert len(result) == 2 + assert all(isinstance(job, EvaluationJob) for job in result) + assert result[0].workflow_id == "eval_1" + assert result[1].workflow_id == "eval_2" + + def test_list_evaluations_with_filters(self, mocker, sync_together_instance): + # Mock the API requestor + mock_requestor = mocker.MagicMock() + response_data = [] + mock_headers = {"x-together-request-id": "req_list_filtered"} + mock_response = TogetherResponse(data=response_data, headers=mock_headers) + mock_requestor.request.return_value = (mock_response, None, None) + mocker.patch( + "together.abstract.api_requestor.APIRequestor", return_value=mock_requestor + ) + + # Test list with filters + result = sync_together_instance.evaluation.list(status="completed", limit=50) + + # Verify the request + call_args = mock_requestor.request.call_args[1]["options"] + assert call_args.params == {"status": "completed", "limit": 50} + + # Verify empty response + assert result == [] + + def test_retrieve_evaluation(self, mocker, sync_together_instance): + # Mock the API requestor + mock_requestor = mocker.MagicMock() + response_data = { + "workflow_id": "eval_123", + "type": "classify", + "status": "completed", + "results": {"accuracy": 0.92, "pass_rate": 0.88}, + "parameters": { + "judge": {"model_name": "judge-model", "system_template": "template"} + }, + "created_at": "2024-01-01T00:00:00Z", + "updated_at": "2024-01-01T01:00:00Z", + } + mock_headers = {"x-together-request-id": "req_retrieve"} + mock_response = TogetherResponse(data=response_data, headers=mock_headers) + mock_requestor.request.return_value = (mock_response, None, None) + mocker.patch( + "together.abstract.api_requestor.APIRequestor", return_value=mock_requestor + ) + + # Test retrieve + result = 
sync_together_instance.evaluation.retrieve("eval_123") + + # Verify the request + call_args = mock_requestor.request.call_args[1]["options"] + assert call_args.method == "GET" + assert call_args.url == "evaluation/eval_123" + + # Verify response + assert isinstance(result, EvaluationJob) + assert result.workflow_id == "eval_123" + assert result.status == EvaluationStatus.COMPLETED + assert result.results["accuracy"] == 0.92 + + def test_status_evaluation(self, mocker, sync_together_instance): + # Mock the API requestor + mock_requestor = mocker.MagicMock() + response_data = { + "status": "completed", + "results": {"total_samples": 100, "accuracy": 0.95, "pass_rate": 0.90}, + } + mock_headers = {"x-together-request-id": "req_status"} + mock_response = TogetherResponse(data=response_data, headers=mock_headers) + mock_requestor.request.return_value = (mock_response, None, None) + mocker.patch( + "together.abstract.api_requestor.APIRequestor", return_value=mock_requestor + ) + + # Test status + result = sync_together_instance.evaluation.status("eval_456") + + # Verify the request + call_args = mock_requestor.request.call_args[1]["options"] + assert call_args.method == "GET" + assert call_args.url == "evaluation/eval_456/status" + + # Verify response + assert isinstance(result, EvaluationStatusResponse) + assert result.status == EvaluationStatus.COMPLETED + assert result.results["total_samples"] == 100 + assert result.results["accuracy"] == 0.95 + + @pytest.mark.asyncio + async def test_async_create_evaluation(self, mocker, async_together_instance): + # Mock the async API requestor + mock_requestor = mocker.AsyncMock() + response_data = {"workflow_id": "async_eval_123", "status": "pending"} + mock_headers = {"x-together-request-id": "async_req_123"} + mock_response = TogetherResponse(data=response_data, headers=mock_headers) + mock_requestor.arequest.return_value = (mock_response, None, None) + mocker.patch( + "together.abstract.api_requestor.APIRequestor", return_value=mock_requestor + ) + + # Test async classify evaluation creation + result = await async_together_instance.evaluation.create( + type="classify", + judge_model_name="judge-model", + judge_system_template="template", + input_data_file_path="file_async", + labels=["good", "bad"], + pass_labels=["good"], + ) + + # Verify the request was made with arequest + mock_requestor.arequest.assert_called_once() + + # Verify response + assert result.workflow_id == "async_eval_123" + assert result.status == EvaluationStatus.PENDING + + @pytest.mark.asyncio + async def test_async_list_evaluations(self, mocker, async_together_instance): + # Mock the async API requestor + mock_requestor = mocker.AsyncMock() + response_data = [{"workflow_id": "async_eval_1", "status": "completed"}] + mock_headers = {"x-together-request-id": "async_req_list"} + mock_response = TogetherResponse(data=response_data, headers=mock_headers) + mock_requestor.arequest.return_value = (mock_response, None, None) + mocker.patch( + "together.abstract.api_requestor.APIRequestor", return_value=mock_requestor + ) + + # Test async list + result = await async_together_instance.evaluation.list() + + # Verify arequest was called + mock_requestor.arequest.assert_called_once() + + # Verify response + assert len(result) == 1 + assert result[0].workflow_id == "async_eval_1" + + @pytest.mark.asyncio + async def test_async_retrieve_evaluation(self, mocker, async_together_instance): + # Mock the async API requestor + mock_requestor = mocker.AsyncMock() + response_data = {"workflow_id": 
"async_eval_retrieve", "status": "running"} + mock_headers = {"x-together-request-id": "async_req_retrieve"} + mock_response = TogetherResponse(data=response_data, headers=mock_headers) + mock_requestor.arequest.return_value = (mock_response, None, None) + mocker.patch( + "together.abstract.api_requestor.APIRequestor", return_value=mock_requestor + ) + + # Test async retrieve + result = await async_together_instance.evaluation.retrieve( + "async_eval_retrieve" + ) + + # Verify arequest was called + mock_requestor.arequest.assert_called_once() + + # Verify response + assert result.workflow_id == "async_eval_retrieve" + assert result.status == EvaluationStatus.RUNNING + + @pytest.mark.asyncio + async def test_async_status_evaluation(self, mocker, async_together_instance): + # Mock the async API requestor + mock_requestor = mocker.AsyncMock() + response_data = {"status": "completed", "results": {"success": True}} + mock_headers = {"x-together-request-id": "async_req_status"} + mock_response = TogetherResponse(data=response_data, headers=mock_headers) + mock_requestor.arequest.return_value = (mock_response, None, None) + mocker.patch( + "together.abstract.api_requestor.APIRequestor", return_value=mock_requestor + ) + + # Test async status + result = await async_together_instance.evaluation.status("async_eval_status") + + # Verify arequest was called + mock_requestor.arequest.assert_called_once() + + # Verify response + assert result.status == EvaluationStatus.COMPLETED + assert result.results["success"] is True diff --git a/tests/unit/test_files_checks.py b/tests/unit/test_files_checks.py index 68af1f6c..4888718e 100644 --- a/tests/unit/test_files_checks.py +++ b/tests/unit/test_files_checks.py @@ -1,9 +1,10 @@ import json import pytest +import csv from pathlib import Path from together.constants import MIN_SAMPLES -from together.utils.files import check_file +from together.utils.files import check_file, FilePurpose def test_check_jsonl_valid_general(tmp_path: Path): @@ -392,3 +393,64 @@ def test_check_jsonl_invalid_weight(tmp_path: Path): report = check_file(file) assert not report["is_check_passed"] assert "Weight must be either 0 or 1" in report["message"] + + +def test_check_csv_valid_general(tmp_path: Path): + # Create a valid CSV file + file = tmp_path / "valid.csv" + with open(file, "w") as f: + writer = csv.DictWriter(f, fieldnames=["text"]) + writer.writeheader() + writer.writerow({"text": "Hello, world!"}) + writer.writerow({"text": "How are you?"}) + + report = check_file(file, purpose=FilePurpose.Eval) + assert report["is_check_passed"] + assert report["utf8"] + assert report["num_samples"] == 2 + assert report["has_min_samples"] + + +def test_check_csv_empty_file(tmp_path: Path): + # Create an empty CSV file + file = tmp_path / "empty.csv" + file.touch() + + report = check_file(file, purpose=FilePurpose.Eval) + + assert not report["is_check_passed"] + assert report["message"] == "File is empty" + assert report["file_size"] == 0 + + +def test_check_csv_valid_completion(tmp_path: Path): + # Create a valid CSV file with conversational format + file = tmp_path / "valid_completion.csv" + + with open(file, "w") as f: + writer = csv.DictWriter(f, fieldnames=["prompt", "completion"]) + writer.writeheader() + writer.writerow( + { + "prompt": "Translate the following sentence.", + "completion": "Hello, world!", + } + ) + + report = check_file(file, purpose=FilePurpose.Eval) + assert report["is_check_passed"] + assert report["utf8"] + assert report["num_samples"] == 1 + assert 
report["has_min_samples"] + + +def test_check_csv_invalid_column(tmp_path: Path): + # Create a CSV file with an invalid column + file = tmp_path / "invalid_column.csv" + with open(file, "w") as f: + writer = csv.DictWriter(f, fieldnames=["asfg"]) + writer.writeheader() + + report = check_file(file) + + assert not report["is_check_passed"] diff --git a/tests/unit/test_finetune_resources.py b/tests/unit/test_finetune_resources.py index 5ddbe0ab..46e2f2f8 100644 --- a/tests/unit/test_finetune_resources.py +++ b/tests/unit/test_finetune_resources.py @@ -87,7 +87,6 @@ def test_lora_request(): @pytest.mark.parametrize("lora_dropout", [-1, 0, 0.5, 1.0, 10.0]) def test_lora_request_with_lora_dropout(lora_dropout: float): - if 0 <= lora_dropout < 1: request = create_finetune_request( model_limits=_MODEL_LIMITS,