diff --git a/src/together/cli/api/finetune.py b/src/together/cli/api/finetune.py
index ad81339d..b413f323 100644
--- a/src/together/cli/api/finetune.py
+++ b/src/together/cli/api/finetune.py
@@ -110,6 +110,18 @@ def fine_tuning(ctx: click.Context) -> None:
     default="all-linear",
     help="Trainable modules for LoRA adapters. For example, 'all-linear', 'q_proj,v_proj'",
 )
+@click.option(
+    "--training-method",
+    type=click.Choice(["sft", "dpo"]),
+    default="sft",
+    help="Training method to use. Options: sft (supervised fine-tuning), dpo (Direct Preference Optimization)",
+)
+@click.option(
+    "--dpo-beta",
+    type=float,
+    default=0.1,
+    help="Beta parameter for DPO training (only used when '--training-method' is 'dpo')",
+)
 @click.option(
     "--suffix", type=str, default=None, help="Suffix for the fine-tuned model name"
 )
@@ -166,6 +178,8 @@ def create(
     wandb_name: str,
     confirm: bool,
     train_on_inputs: bool | Literal["auto"],
+    training_method: str,
+    dpo_beta: float,
     from_checkpoint: str,
 ) -> None:
     """Start fine-tuning"""
@@ -195,6 +209,8 @@ def create(
         wandb_project_name=wandb_project_name,
         wandb_name=wandb_name,
         train_on_inputs=train_on_inputs,
+        training_method=training_method,
+        dpo_beta=dpo_beta,
         from_checkpoint=from_checkpoint,
     )
 
diff --git a/src/together/constants.py b/src/together/constants.py
index c64af326..99e27a4a 100644
--- a/src/together/constants.py
+++ b/src/together/constants.py
@@ -39,12 +39,18 @@ class DatasetFormat(enum.Enum):
     GENERAL = "general"
    CONVERSATION = "conversation"
     INSTRUCTION = "instruction"
+    PREFERENCE_OPENAI = "preference_openai"
 
 
 JSONL_REQUIRED_COLUMNS_MAP = {
     DatasetFormat.GENERAL: ["text"],
     DatasetFormat.CONVERSATION: ["messages"],
     DatasetFormat.INSTRUCTION: ["prompt", "completion"],
+    DatasetFormat.PREFERENCE_OPENAI: [
+        "input",
+        "preferred_output",
+        "non_preferred_output",
+    ],
 }
 REQUIRED_COLUMNS_MESSAGE = ["role", "content"]
 POSSIBLE_ROLES_CONVERSATION = ["system", "user", "assistant"]
diff --git a/src/together/resources/finetune.py b/src/together/resources/finetune.py
index 11d445db..8cc48a17 100644
--- a/src/together/resources/finetune.py
+++ b/src/together/resources/finetune.py
@@ -23,6 +23,8 @@
     TrainingType,
     FinetuneLRScheduler,
     FinetuneLinearLRSchedulerArgs,
+    TrainingMethodDPO,
+    TrainingMethodSFT,
     FinetuneCheckpoint,
 )
 from together.types.finetune import (
@@ -39,6 +41,12 @@
 
 _FT_JOB_WITH_STEP_REGEX = r"^ft-[\dabcdef-]+:\d+$"
 
 
+AVAILABLE_TRAINING_METHODS = {
+    TrainingMethodSFT().method,
+    TrainingMethodDPO().method,
+}
+
+
 def createFinetuneRequest(
     model_limits: FinetuneTrainingLimits,
     training_file: str,
@@ -64,8 +72,11 @@ def createFinetuneRequest(
     wandb_project_name: str | None = None,
     wandb_name: str | None = None,
     train_on_inputs: bool | Literal["auto"] = "auto",
+    training_method: str = "sft",
+    dpo_beta: float | None = None,
     from_checkpoint: str | None = None,
 ) -> FinetuneRequest:
+
     if batch_size == "max":
         log_warn_once(
             "Starting from together>=1.3.0, "
@@ -113,11 +124,20 @@ def createFinetuneRequest(
     if weight_decay is not None and (weight_decay < 0):
         raise ValueError("Weight decay should be non-negative")
 
+    if training_method not in AVAILABLE_TRAINING_METHODS:
+        raise ValueError(
+            f"training_method must be one of {', '.join(AVAILABLE_TRAINING_METHODS)}"
+        )
+
     lrScheduler = FinetuneLRScheduler(
         lr_scheduler_type="linear",
         lr_scheduler_args=FinetuneLinearLRSchedulerArgs(min_lr_ratio=min_lr_ratio),
     )
 
+    training_method_cls: TrainingMethodSFT | TrainingMethodDPO = TrainingMethodSFT()
+    if training_method == "dpo":
+        training_method_cls = TrainingMethodDPO(dpo_beta=dpo_beta)
+
     finetune_request = FinetuneRequest(
         model=model,
         training_file=training_file,
@@ -138,6 +158,7 @@ def createFinetuneRequest(
         wandb_project_name=wandb_project_name,
         wandb_name=wandb_name,
         train_on_inputs=train_on_inputs,
+        training_method=training_method_cls,
         from_checkpoint=from_checkpoint,
     )
 
@@ -240,6 +261,8 @@ def create(
         verbose: bool = False,
         model_limits: FinetuneTrainingLimits | None = None,
         train_on_inputs: bool | Literal["auto"] = "auto",
+        training_method: str = "sft",
+        dpo_beta: float | None = None,
         from_checkpoint: str | None = None,
     ) -> FinetuneResponse:
         """
@@ -286,6 +309,9 @@ def create(
                 For datasets with the "messages" field (conversational format) or "prompt" and "completion" fields
                 (Instruction format), inputs will be masked.
                 Defaults to "auto".
+            training_method (str, optional): Training method. Defaults to "sft".
+                Supported methods: "sft", "dpo".
+            dpo_beta (float, optional): Beta parameter for DPO training (used only when training_method is "dpo"). Defaults to None.
             from_checkpoint (str, optional): The checkpoint identifier to continue training from a previous fine-tuning job.
                 The format: {$JOB_ID/$OUTPUT_MODEL_NAME}:{$STEP}.
                 The step value is optional, without it the final checkpoint will be used.
@@ -300,7 +326,6 @@ def create(
 
         if model_limits is None:
             model_limits = self.get_model_limits(model=model)
-
         finetune_request = createFinetuneRequest(
             model_limits=model_limits,
             training_file=training_file,
@@ -326,6 +351,8 @@ def create(
             wandb_project_name=wandb_project_name,
             wandb_name=wandb_name,
             train_on_inputs=train_on_inputs,
+            training_method=training_method,
+            dpo_beta=dpo_beta,
             from_checkpoint=from_checkpoint,
         )
 
@@ -344,7 +371,6 @@ def create(
             ),
             stream=False,
         )
-
         assert isinstance(response, TogetherResponse)
 
         return FinetuneResponse(**response.data)
@@ -608,6 +634,8 @@ async def create(
         verbose: bool = False,
         model_limits: FinetuneTrainingLimits | None = None,
         train_on_inputs: bool | Literal["auto"] = "auto",
+        training_method: str = "sft",
+        dpo_beta: float | None = None,
         from_checkpoint: str | None = None,
     ) -> FinetuneResponse:
         """
@@ -654,6 +682,9 @@ async def create(
                 For datasets with the "messages" field (conversational format) or "prompt" and "completion" fields
                 (Instruction format), inputs will be masked.
                 Defaults to "auto".
+            training_method (str, optional): Training method. Defaults to "sft".
+                Supported methods: "sft", "dpo".
+            dpo_beta (float, optional): Beta parameter for DPO training (used only when training_method is "dpo"). Defaults to None.
             from_checkpoint (str, optional): The checkpoint identifier to continue training from a previous fine-tuning job.
                 The format: {$JOB_ID/$OUTPUT_MODEL_NAME}:{$STEP}.
                 The step value is optional, without it the final checkpoint will be used.
@@ -694,6 +725,8 @@ async def create(
             wandb_project_name=wandb_project_name,
             wandb_name=wandb_name,
             train_on_inputs=train_on_inputs,
+            training_method=training_method,
+            dpo_beta=dpo_beta,
             from_checkpoint=from_checkpoint,
         )
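
Example usage of the new parameters (an illustrative sketch, not part of the diff; the file ID and model name below are placeholders):

    # Launch a DPO fine-tuning job with the options added above.
    # "file-xxxx" and the model name are placeholders.
    from together import Together

    client = Together()
    job = client.fine_tuning.create(
        training_file="file-xxxx",  # JSONL in the preference_openai format
        model="example-org/base-model",
        training_method="dpo",      # validated against AVAILABLE_TRAINING_METHODS
        dpo_beta=0.1,               # forwarded as TrainingMethodDPO.dpo_beta
    )

The same two knobs are exposed on the CLI as --training-method and --dpo-beta.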
diff --git a/src/together/types/__init__.py b/src/together/types/__init__.py
index 1a7419a5..47fed22b 100644
--- a/src/together/types/__init__.py
+++ b/src/together/types/__init__.py
@@ -31,6 +31,8 @@
     FileType,
 )
 from together.types.finetune import (
+    TrainingMethodDPO,
+    TrainingMethodSFT,
     FinetuneCheckpoint,
     FinetuneDownloadResult,
     FinetuneLinearLRSchedulerArgs,
@@ -81,6 +83,8 @@
     "TrainingType",
     "FullTrainingType",
     "LoRATrainingType",
+    "TrainingMethodDPO",
+    "TrainingMethodSFT",
     "RerankRequest",
     "RerankResponse",
     "FinetuneTrainingLimits",
diff --git a/src/together/types/finetune.py b/src/together/types/finetune.py
index e3811292..c41cbce2 100644
--- a/src/together/types/finetune.py
+++ b/src/together/types/finetune.py
@@ -135,6 +135,31 @@ class LoRATrainingType(TrainingType):
     type: str = "Lora"
 
 
+class TrainingMethod(BaseModel):
+    """
+    Training method type
+    """
+
+    method: str
+
+
+class TrainingMethodSFT(TrainingMethod):
+    """
+    Training method type for SFT training
+    """
+
+    method: Literal["sft"] = "sft"
+
+
+class TrainingMethodDPO(TrainingMethod):
+    """
+    Training method type for DPO training
+    """
+
+    method: Literal["dpo"] = "dpo"
+    dpo_beta: float | None = None
+
+
 class FinetuneRequest(BaseModel):
     """
     Fine-tune request type
@@ -178,6 +203,10 @@ class FinetuneRequest(BaseModel):
     training_type: FullTrainingType | LoRATrainingType | None = None
     # train on inputs
     train_on_inputs: StrictBool | Literal["auto"] = "auto"
+    # training method
+    training_method: TrainingMethodSFT | TrainingMethodDPO = Field(
+        default_factory=TrainingMethodSFT
+    )
     # from step
     from_checkpoint: str
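
For reference, a sketch of what the new training_method field contributes to the request payload (illustrative only; assumes these types are pydantic v2 BaseModels, so model_dump() is available):

    # How the discriminated training-method models serialize.
    from together.types import TrainingMethodDPO, TrainingMethodSFT

    TrainingMethodSFT().model_dump()              # -> {'method': 'sft'}
    TrainingMethodDPO(dpo_beta=0.1).model_dump()  # -> {'method': 'dpo', 'dpo_beta': 0.1}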
diff --git a/src/together/utils/files.py b/src/together/utils/files.py
index cc39fca0..e1e1d4ed 100644
--- a/src/together/utils/files.py
+++ b/src/together/utils/files.py
@@ -4,7 +4,7 @@
 import os
 from pathlib import Path
 from traceback import format_exc
-from typing import Any, Dict
+from typing import Any, Dict, List
 
 from pyarrow import ArrowInvalid, parquet
 
@@ -96,6 +96,140 @@ def check_file(
     return report_dict
 
 
+def validate_messages(messages: List[Dict[str, Any]], idx: int) -> None:
+    """Validate the messages column."""
+    if not isinstance(messages, list):
+        raise InvalidFileFormatError(
+            message=f"Invalid format on line {idx + 1} of the input file. "
+            f"Expected a list of messages. Found {type(messages)}",
+            line_number=idx + 1,
+            error_source="key_value",
+        )
+    if not messages:
+        raise InvalidFileFormatError(
+            message=f"Invalid format on line {idx + 1} of the input file. "
+            f"Expected a non-empty list of messages. Found empty list",
+            line_number=idx + 1,
+            error_source="key_value",
+        )
+
+    has_weights = any(isinstance(m, dict) and "weight" in m for m in messages)
+
+    previous_role = None
+    for message in messages:
+        if not isinstance(message, dict):
+            raise InvalidFileFormatError(
+                message=f"Invalid format on line {idx + 1} of the input file. "
+                f"Expected a dictionary in the messages list. Found {type(message)}",
+                line_number=idx + 1,
+                error_source="key_value",
+            )
+        for column in REQUIRED_COLUMNS_MESSAGE:
+            if column not in message:
+                raise InvalidFileFormatError(
+                    message=f"Field `{column}` is missing for a turn `{message}` on line {idx + 1} "
+                    "of the input file.",
+                    line_number=idx + 1,
+                    error_source="key_value",
+                )
+            else:
+                if not isinstance(message[column], str):
+                    raise InvalidFileFormatError(
+                        message=f"Invalid format on line {idx + 1} in the column {column} for turn `{message}` "
+                        f"of the input file. Expected string. Found {type(message[column])}",
+                        line_number=idx + 1,
+                        error_source="text_field",
+                    )
+
+        if has_weights and "weight" in message:
+            weight = message["weight"]
+            if not isinstance(weight, int):
+                raise InvalidFileFormatError(
+                    message="Weight must be an integer",
+                    line_number=idx + 1,
+                    error_source="key_value",
+                )
+            if weight not in {0, 1}:
+                raise InvalidFileFormatError(
+                    message="Weight must be either 0 or 1",
+                    line_number=idx + 1,
+                    error_source="key_value",
+                )
+        if message["role"] not in POSSIBLE_ROLES_CONVERSATION:
+            raise InvalidFileFormatError(
+                message=f"Found invalid role `{message['role']}` in the messages on line {idx + 1}. "
+                f"Possible roles in the conversation are: {POSSIBLE_ROLES_CONVERSATION}",
+                line_number=idx + 1,
+                error_source="key_value",
+            )
+
+        if previous_role == message["role"]:
+            raise InvalidFileFormatError(
+                message=f"Invalid role turns on line {idx + 1} of the input file. "
+                "`user` and `assistant` roles must alternate user/assistant/user/assistant/...",
+                line_number=idx + 1,
+                error_source="key_value",
+            )
+        previous_role = message["role"]
+
+
+def validate_preference_openai(example: Dict[str, Any], idx: int = 0) -> None:
+    """Validate the OpenAI preference dataset format.
+
+    Args:
+        example (dict): Input entry to be checked.
+        idx (int): Line number in the file.
+
+    Raises:
+        InvalidFileFormatError: If the dataset format is invalid.
+    """
+    if not isinstance(example["input"], dict):
+        raise InvalidFileFormatError(
+            message="The dataset is malformed, the `input` field must be a dictionary.",
+            line_number=idx + 1,
+            error_source="key_value",
+        )
+
+    if "messages" not in example["input"]:
+        raise InvalidFileFormatError(
+            message="The dataset is malformed, the `input` dictionary must contain a `messages` field.",
+            line_number=idx + 1,
+            error_source="key_value",
+        )
+
+    validate_messages(example["input"]["messages"], idx)
+
+    for output_field in ["preferred_output", "non_preferred_output"]:
+        if not isinstance(example[output_field], list):
+            raise InvalidFileFormatError(
+                message=f"The dataset is malformed, the `{output_field}` field must be a list.",
+                line_number=idx + 1,
+                error_source="key_value",
+            )
+
+        if len(example[output_field]) != 1:
+            raise InvalidFileFormatError(
+                message=f"The dataset is malformed, the `{output_field}` list must contain exactly one message.",
+                line_number=idx + 1,
+                error_source="key_value",
+            )
+        if "role" not in example[output_field][0]:
+            raise InvalidFileFormatError(
+                message=f"The dataset is malformed, the `{output_field}` message is missing the `role` field.",
+                line_number=idx + 1,
+                error_source="key_value",
+            )
+        elif example[output_field][0]["role"] != "assistant":
+            raise InvalidFileFormatError(
+                message=f"The dataset is malformed, the `{output_field}` must contain an assistant message.",
+                line_number=idx + 1,
+                error_source="key_value",
+            )
+
+    validate_messages(example["preferred_output"], idx)
+    validate_messages(example["non_preferred_output"], idx)
+
+
 def _check_jsonl(file: Path) -> Dict[str, Any]:
     report_dict: Dict[str, Any] = {}
     # Check that the file is UTF-8 encoded. If not report where the error occurs.
@@ -164,74 +298,13 @@ def _check_jsonl(file: Path) -> Dict[str, Any]:
                     line_number=idx + 1,
                     error_source="format",
                 )
-
-            if current_format == DatasetFormat.CONVERSATION:
+            if current_format == DatasetFormat.PREFERENCE_OPENAI:
+                validate_preference_openai(json_line, idx)
+            elif current_format == DatasetFormat.CONVERSATION:
                 message_column = JSONL_REQUIRED_COLUMNS_MAP[
                     DatasetFormat.CONVERSATION
                 ][0]
-
-                if not isinstance(json_line[message_column], list):
-                    raise InvalidFileFormatError(
-                        message=f"Invalid format on line {idx + 1} of the input file. "
-                        f"Expected a list of messages. Found {type(json_line[message_column])}",
-                        line_number=idx + 1,
-                        error_source="key_value",
-                    )
-
-                if len(json_line[message_column]) == 0:
-                    raise InvalidFileFormatError(
-                        message=f"Invalid format on line {idx + 1} of the input file. "
-                        f"Expected a non-empty list of messages. Found empty list",
-                        line_number=idx + 1,
-                        error_source="key_value",
-                    )
-
-                for turn_id, turn in enumerate(json_line[message_column]):
-                    if not isinstance(turn, dict):
-                        raise InvalidFileFormatError(
-                            message=f"Invalid format on line {idx + 1} of the input file. "
-                            f"Expected a dictionary in the {turn_id + 1} turn. Found {type(turn)}",
-                            line_number=idx + 1,
-                            error_source="key_value",
-                        )
-
-                previous_role = None
-                for turn in json_line[message_column]:
-                    for column in REQUIRED_COLUMNS_MESSAGE:
-                        if column not in turn:
-                            raise InvalidFileFormatError(
-                                message=f"Field `{column}` is missing for a turn `{turn}` on line {idx + 1} "
-                                "of the the input file.",
-                                line_number=idx + 1,
-                                error_source="key_value",
-                            )
-                        else:
-                            if not isinstance(turn[column], str):
-                                raise InvalidFileFormatError(
-                                    message=f"Invalid format on line {idx + 1} in the column {column} for turn `{turn}` "
-                                    f"of the input file. Expected string. Found {type(turn[column])}",
-                                    line_number=idx + 1,
-                                    error_source="text_field",
-                                )
-
-                    role = turn["role"]
-
-                    if role not in POSSIBLE_ROLES_CONVERSATION:
-                        raise InvalidFileFormatError(
-                            message=f"Found invalid role `{role}` in the messages on the line {idx + 1}. "
-                            f"Possible roles in the conversation are: {POSSIBLE_ROLES_CONVERSATION}",
-                            line_number=idx + 1,
-                            error_source="key_value",
-                        )
-
-                    if previous_role == role:
-                        raise InvalidFileFormatError(
-                            message=f"Invalid role turns on line {idx + 1} of the input file. "
-                            "`user` and `assistant` roles must alternate user/assistant/user/assistant/...",
-                            line_number=idx + 1,
-                            error_source="key_value",
-                        )
-
-                    previous_role = role
-
+                validate_messages(json_line[message_column], idx)
             else:
                 for column in JSONL_REQUIRED_COLUMNS_MAP[current_format]:
                     if not isinstance(json_line[column], str):
diff --git a/tests/unit/test_preference_openai.py b/tests/unit/test_preference_openai.py
new file mode 100644
index 00000000..3781c830
--- /dev/null
+++ b/tests/unit/test_preference_openai.py
@@ -0,0 +1,312 @@
+import json
+import pytest
+from pathlib import Path
+
+from together.constants import MIN_SAMPLES
+from together.utils.files import check_file
+
+
+_TEST_PREFERENCE_OPENAI_CONTENT = [
+    {
+        "input": {
+            "messages": [
+                {"role": "user", "content": "Hi there, I have a question."},
+                {"role": "assistant", "content": "Hello, how is your day going?"},
+                {
+                    "role": "user",
+                    "content": "Hello, can you tell me how cold San Francisco is today?",
+                },
+            ],
+        },
+        "preferred_output": [
+            {
+                "role": "assistant",
+                "content": "Today in San Francisco, it is not quite as cold as expected. Morning clouds will give way "
+                "to sunshine, with a high near 68°F (20°C) and a low around 57°F (14°C).",
+            }
+        ],
+        "non_preferred_output": [
+            {
+                "role": "assistant",
+                "content": "It is not particularly cold in San Francisco today.",
+            }
+        ],
+    },
+    {
+        "input": {
+            "messages": [
+                {
+                    "role": "user",
+                    "content": "What's the best way to learn programming?",
+                },
+            ],
+        },
+        "preferred_output": [
+            {
+                "role": "assistant",
+                "content": "The best way to learn programming is through consistent practice, working on real projects, "
+                "and breaking down complex problems into smaller parts. Start with a beginner-friendly language like Python.",
+            }
+        ],
+        "non_preferred_output": [
+            {"role": "assistant", "content": "Just read some books and you'll be fine."}
+        ],
+    },
+]
+
+
+def test_check_jsonl_valid_preference_openai(tmp_path: Path):
+    """Test valid preference OpenAI format."""
+    file = tmp_path / "valid_preference_openai.jsonl"
+    content = _TEST_PREFERENCE_OPENAI_CONTENT
+    with file.open("w") as f:
+        f.write("\n".join(json.dumps(item) for item in content))
+
+    report = check_file(file)
+
+    assert report["is_check_passed"]
+    assert report["utf8"]
+    assert report["num_samples"] == len(content)
+    assert report["has_min_samples"]
+
+
+MISSING_FIELDS_TEST_CASES = [
+    pytest.param("input", "Missing input field", id="missing_input"),
+    pytest.param(
+        "preferred_output",
+        "Missing preferred_output field",
+        id="missing_preferred_output",
+    ),
+    pytest.param(
+        "non_preferred_output",
+        "Missing non_preferred_output field",
+        id="missing_non_preferred_output",
+    ),
+]
+
+
+@pytest.mark.parametrize("field_to_remove, description", MISSING_FIELDS_TEST_CASES)
+def test_check_jsonl_invalid_preference_openai_missing_fields(
+    tmp_path: Path, field_to_remove, description
+):
+    """Test missing required fields in OpenAI preference format."""
+    file = tmp_path / f"invalid_preference_openai_missing_{field_to_remove}.jsonl"
+    content = [item.copy() for item in _TEST_PREFERENCE_OPENAI_CONTENT]
+
+    # Remove the specified field from the first item
+    del content[0][field_to_remove]
+
+    with file.open("w") as f:
+        f.write("\n".join(json.dumps(item) for item in content))
+
+    report = check_file(file)
+
+    assert not report["is_check_passed"], f"Test should fail when {description}"
+
+
+STRUCTURAL_ISSUE_TEST_CASES = [
+    pytest.param(
+        "empty_messages",
+        lambda item: item.update({"input": {"messages": []}}),
+        "Empty messages array",
+        id="empty_messages",
+    ),
+    pytest.param(
+        "missing_role_preferred",
+        lambda item: item.update(
+            {"preferred_output": [{"content": "Missing role field"}]}
+        ),
+        "Missing role in preferred_output",
+        id="missing_role_preferred",
+    ),
+    pytest.param(
+        "missing_role_non_preferred",
+        lambda item: item.update(
+            {"non_preferred_output": [{"content": "Missing role field"}]}
+        ),
+        "Missing role in non_preferred_output",
+        id="missing_role_non_preferred",
+    ),
+    pytest.param(
+        "missing_content_preferred",
+        lambda item: item.update({"preferred_output": [{"role": "assistant"}]}),
+        "Missing content in preferred_output",
+        id="missing_content_preferred",
+    ),
+    pytest.param(
+        "missing_content_non_preferred",
+        lambda item: item.update({"non_preferred_output": [{"role": "assistant"}]}),
+        "Missing content in non_preferred_output",
+        id="missing_content_non_preferred",
+    ),
+    pytest.param(
+        "wrong_output_format_preferred",
+        lambda item: item.update({"preferred_output": "Not an array but a string"}),
+        "Wrong format for preferred_output",
+        id="wrong_output_format_preferred",
+    ),
+    pytest.param(
+        "wrong_output_format_non_preferred",
+        lambda item: item.update({"non_preferred_output": "Not an array but a string"}),
+        "Wrong format for non_preferred_output",
+        id="wrong_output_format_non_preferred",
+    ),
+    pytest.param(
+        "missing_content",
+        lambda item: item.update({"input": {"messages": [{"role": "user"}]}}),
+        "Missing content in messages",
+        id="missing_content",
+    ),
+    pytest.param(
+        "multiple_preferred_outputs",
+        lambda item: item.update(
+            {
+                "preferred_output": [
+                    {"role": "assistant", "content": "First response"},
+                    {"role": "assistant", "content": "Second response"},
+                ]
+            }
+        ),
+        "Multiple messages in preferred_output",
+        id="multiple_preferred_outputs",
+    ),
+    pytest.param(
+        "multiple_non_preferred_outputs",
+        lambda item: item.update(
+            {
+                "non_preferred_output": [
+                    {"role": "assistant", "content": "First response"},
+                    {"role": "assistant", "content": "Second response"},
+                ]
+            }
+        ),
+        "Multiple messages in non_preferred_output",
+        id="multiple_non_preferred_outputs",
+    ),
+    pytest.param(
+        "empty_preferred_output",
+        lambda item: item.update({"preferred_output": []}),
+        "Empty preferred_output array",
+        id="empty_preferred_output",
+    ),
+    pytest.param(
+        "empty_non_preferred_output",
+        lambda item: item.update({"non_preferred_output": []}),
+        "Empty non_preferred_output array",
+        id="empty_non_preferred_output",
+    ),
+    pytest.param(
+        "non_string_content_in_messages",
+        lambda item: item.update(
+            {"input": {"messages": [{"role": "user", "content": 123}]}}
+        ),
+        "Non-string content in messages",
+        id="non_string_content_in_messages",
+    ),
+    pytest.param(
+        "invalid_role_in_messages",
+        lambda item: item.update(
+            {"input": {"messages": [{"role": "invalid_role", "content": "Hello"}]}}
+        ),
+        "Invalid role in messages",
+        id="invalid_role_in_messages",
+    ),
+    pytest.param(
+        "non_alternating_roles",
+        lambda item: item.update(
+            {
+                "input": {
+                    "messages": [
+                        {"role": "user", "content": "Hello"},
+                        {"role": "user", "content": "How are you?"},
+                    ]
+                }
+            }
+        ),
+        "Non-alternating roles in messages",
+        id="non_alternating_roles",
+    ),
+    pytest.param(
+        "invalid_weight_type",
+        lambda item: item.update(
+            {
+                "input": {
+                    "messages": [
+                        {"role": "user", "content": "Hello", "weight": "not_an_integer"}
+                    ]
+                }
+            }
+        ),
+        "Invalid weight type",
+        id="invalid_weight_type",
+    ),
+    pytest.param(
+        "invalid_weight_value",
+        lambda item: item.update(
+            {"input": {"messages": [{"role": "user", "content": "Hello", "weight": 2}]}}
+        ),
+        "Invalid weight value",
+        id="invalid_weight_value",
+    ),
+    pytest.param(
+        "non_dict_message",
+        lambda item: item.update({"input": {"messages": ["Not a dictionary"]}}),
+        "Non-dictionary message",
+        id="non_dict_message",
+    ),
+    pytest.param(
+        "non_dict_input",
+        lambda item: item.update({"input": "Not a dictionary"}),
+        "Non-dictionary input",
+        id="non_dict_input",
+    ),
+    pytest.param(
+        "missing_messages_in_input",
+        lambda item: item.update({"input": {}}),
+        "Missing messages in input",
+        id="missing_messages_in_input",
+    ),
+    pytest.param(
+        "non_assistant_role_in_preferred",
+        lambda item: item.update(
+            {
+                "preferred_output": [
+                    {"role": "user", "content": "This should be assistant"}
+                ]
+            }
+        ),
+        "Non-assistant role in preferred output",
+        id="non_assistant_role_in_preferred",
+    ),
+    pytest.param(
+        "non_assistant_role_in_non_preferred",
+        lambda item: item.update(
+            {
+                "non_preferred_output": [
+                    {"role": "user", "content": "This should be assistant"}
+                ]
+            }
+        ),
+        "Non-assistant role in non-preferred output",
+        id="non_assistant_role_in_non_preferred",
+    ),
+]
+
+
+@pytest.mark.parametrize("name, modifier, description", STRUCTURAL_ISSUE_TEST_CASES)
+def test_check_jsonl_invalid_preference_openai_structural_issues(
+    tmp_path: Path, name, modifier, description
+):
+    """Test various structural issues in OpenAI preference format."""
+    file = tmp_path / f"invalid_preference_openai_{name}.jsonl"
+    content = [item.copy() for item in _TEST_PREFERENCE_OPENAI_CONTENT]
+
+    # Apply the modification to the first item
+    modifier(content[0])
+
+    with file.open("w") as f:
+        f.write("\n".join(json.dumps(item) for item in content))
+
+    report = check_file(file)
+
+    assert not report["is_check_passed"], f"Test should fail with {description}"
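
To see the new validation path end to end, a minimal sketch using the helpers added above (the record is trimmed from the test fixture; the file path is arbitrary):

    # Write two minimal preference_openai records and validate the file.
    import json
    from pathlib import Path

    from together.utils.files import check_file

    record = {
        "input": {"messages": [{"role": "user", "content": "How cold is SF today?"}]},
        "preferred_output": [{"role": "assistant", "content": "Mild, around 57-68°F."}],
        "non_preferred_output": [{"role": "assistant", "content": "Not very."}],
    }
    path = Path("preference_sample.jsonl")
    path.write_text("\n".join(json.dumps(record) for _ in range(2)))
    assert check_file(path)["is_check_passed"]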