diff --git a/pyproject.toml b/pyproject.toml index 3911eb95..53d2739b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -12,7 +12,7 @@ build-backend = "poetry.masonry.api" [tool.poetry] name = "together" -version = "1.3.6" +version = "1.3.7" authors = [ "Together AI " ] diff --git a/src/together/utils/files.py b/src/together/utils/files.py index 7267ccbd..6c5892f1 100644 --- a/src/together/utils/files.py +++ b/src/together/utils/files.py @@ -120,7 +120,8 @@ def _check_jsonl(file: Path) -> Dict[str, Any]: raise InvalidFileFormatError( message=( f"Error parsing file. Invalid format on line {idx + 1} of the input file. " - 'Example of valid json: {"text": "my sample string"}. ' + "Datasets must follow text, conversational, or instruction format. For more " + "information, see https://docs.together.ai/docs/fine-tuning-data-preparation" ), line_number=idx + 1, error_source="line_type", @@ -142,6 +143,18 @@ def _check_jsonl(file: Path) -> Dict[str, Any]: error_source="format", ) + # Check that there are no extra columns + for column in json_line: + if ( + column + not in JSONL_REQUIRED_COLUMNS_MAP[possible_format] + ): + raise InvalidFileFormatError( + message=f'Found extra column "{column}" in the line {idx + 1}.', + line_number=idx + 1, + error_source="format", + ) + if current_format is None: raise InvalidFileFormatError( message=( diff --git a/tests/unit/test_files_checks.py b/tests/unit/test_files_checks.py index 65f59f61..7abae4ad 100644 --- a/tests/unit/test_files_checks.py +++ b/tests/unit/test_files_checks.py @@ -279,3 +279,14 @@ def test_check_jsonl_wrong_turn_type(tmp_path: Path): "Invalid format on line 1 of the input file. 
Expected a dictionary" in report["message"] ) + + +def test_check_jsonl_extra_column(tmp_path: Path): + file = tmp_path / "extra_column.jsonl" + content = [{"text": "Hello, world!", "extra_column": "extra"}] + with file.open("w") as f: + f.write("\n".join(json.dumps(item) for item in content)) + + report = check_file(file) + assert not report["is_check_passed"] + assert "Found extra column" in report["message"]