Commit f818188

Update tool handling to support JSON string schemas in trainers (#5118)

1 parent 9fc9a7d

File tree

8 files changed: +113 −43 lines

docs/source/dataset_formats.md

Lines changed: 8 additions & 18 deletions

````diff
@@ -159,6 +159,7 @@ When preparing datasets for Supervised Fine-Tuning (SFT) with tool calling, it i
 The tools must be specified in a codified JSON schema format. You can automatically generate this schema from Python function signatures using the [`~transformers.utils.get_json_schema`] utility:

 ```python
+import json
 from transformers.utils import get_json_schema

 def control_light(room: str, state: str) -> str:
@@ -175,38 +176,27 @@ def control_light(room: str, state: str) -> str:
     return f"The lights in {room} are now {state}."

 # Generate JSON schema
-json_schema = get_json_schema(control_light)
+json_schema = json.dumps([get_json_schema(control_light)])
 ```

 The generated schema would look like:

 ```python
-{
-    "type": "function",
-    "function": {
-        "name": "control_light",
-        "description": "Controls the lights in a room.",
-        "parameters": {
-            "type": "object",
-            "properties": {
-                "room": {"type": "string", "description": "The name of the room."},
-                "state": {"type": "string", "description": 'The desired state of the light ("on" or "off").'},
-            },
-            "required": ["room", "state"],
-        },
-        "return": {"type": "string", "description": "str: A message indicating the new state of the lights."},
-    },
-}
+'[{"type": "function", "function": {"name": "control_light", "description": "Controls the lights in a room.", "parameters": {"type": "object", "properties": {"room": {"type": "string", "description": "The name of the room."}, "state": {"type": "string", "description": "The desired state of the light (\\"on\\" or \\"off\\")."}}, "required": ["room", "state"]}, "return": {"type": "string", "description": "str: A message indicating the new state of the lights."}}}]'
 ```

 A complete dataset entry for SFT might look like:

 ```python
-{"messages": messages, "tools": [json_schema]}
+{"messages": messages, "tools": json_schema}
 ```

 For more detailed information on tool calling, refer to the [Tool Calling section in the `transformers` documentation](https://huggingface.co/docs/transformers/chat_extras#tools-and-rag) and the blog post [Tool Use, Unified](https://huggingface.co/blog/unified-tool-use).

+> [!NOTE]
+> TRL also accepts `tools` as a Python `list[dict]` (for backward compatibility).
+> This is a legacy format and is **not recommended** for new datasets. Prefer storing `tools` as a JSON `str` (with `json.dumps([...])`).
+
 ### Harmony

 The [Harmony response format](https://cookbook.openai.com/articles/openai-harmony) was introduced with the [OpenAI GPT OSS models](https://huggingface.co/collections/openai/gpt-oss-68911959590a1634ba11c7a4). It extends the conversational format by adding richer structure for reasoning, function calls, and metadata about the model’s behavior. Key features include:
````
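To make the recommended storage format concrete, here is a minimal, self-contained sketch of the round-trip the docs above describe. It builds the `control_light` schema dict by hand (copied from the documentation) instead of calling `get_json_schema`, so it runs without `transformers` installed:

```python
import json

# Hand-written equivalent of get_json_schema(control_light), copied from the docs above.
control_light_schema = {
    "type": "function",
    "function": {
        "name": "control_light",
        "description": "Controls the lights in a room.",
        "parameters": {
            "type": "object",
            "properties": {
                "room": {"type": "string", "description": "The name of the room."},
                "state": {"type": "string", "description": 'The desired state of the light ("on" or "off").'},
            },
            "required": ["room", "state"],
        },
    },
}

# Recommended storage format: one JSON string per example, wrapping a list of schemas.
tools_column = json.dumps([control_light_schema])
assert isinstance(tools_column, str)

# Consumers can recover the original list of dicts losslessly.
assert json.loads(tools_column) == [control_light_schema]
```

Because the whole list is one opaque string, every row of the `tools` column has the same type regardless of which tools it contains.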

docs/source/reward_trainer.md

Lines changed: 1 addition & 1 deletion

```diff
@@ -218,7 +218,7 @@ trainer.train()
 The [`RewardTrainer`] fully supports fine-tuning models with _tool calling_ capabilities. In this case, each dataset example should include:

 * The conversation messages, including any tool calls (`tool_calls`) and tool responses (`tool` role messages)
-* The list of available tools in the `tools` column, typically provided as JSON schemas
+* The list of available tools in the `tools` column, typically provided as JSON `str` schemas

 For details on the expected dataset structure, see the [Dataset Format — Tool Calling](dataset_formats#tool-calling) section.
```

docs/source/sft_trainer.md

Lines changed: 1 addition & 1 deletion

```diff
@@ -289,7 +289,7 @@ Alternatively, use the structured conversation format (recommended):
 The [`SFTTrainer`] fully supports fine-tuning models with _tool calling_ capabilities. In this case, each dataset example should include:

 * The conversation messages, including any tool calls (`tool_calls`) and tool responses (`tool` role messages)
-* The list of available tools in the `tools` column, typically provided as JSON schemas
+* The list of available tools in the `tools` column, typically provided as JSON `str` schemas

 For details on the expected dataset structure, see the [Dataset Format — Tool Calling](dataset_formats#tool-calling) section.
```

scripts/generate_toolcall_dataset.py

Lines changed: 17 additions & 16 deletions

```diff
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+import json
 from dataclasses import dataclass, field

 from datasets import Dataset
@@ -212,14 +213,14 @@ def get_wind_conditions(city: str, unit: str) -> tuple[int, str]:
         ]
     ],
     "tools": [
-        [start_timer, create_reminder],
-        [get_current_time],
-        [get_air_quality_index, get_weather_forecast, get_wind_conditions],
-        [play_music, control_light],
-        [get_weather_forecast, get_wind_conditions],
-        [control_light],
-        [start_timer, create_reminder],
-        [get_weather_forecast, get_wind_conditions],
+        json.dumps([start_timer, create_reminder]),
+        json.dumps([get_current_time]),
+        json.dumps([get_air_quality_index, get_weather_forecast, get_wind_conditions]),
+        json.dumps([play_music, control_light]),
+        json.dumps([get_weather_forecast, get_wind_conditions]),
+        json.dumps([control_light]),
+        json.dumps([start_timer, create_reminder]),
+        json.dumps([get_weather_forecast, get_wind_conditions]),
     ]
 })
 language_modeling_dataset = language_modeling_dataset.train_test_split(test_size=test_size, shuffle=False)
@@ -318,14 +319,14 @@ def get_wind_conditions(city: str, unit: str) -> tuple[int, str]:
         ],
     ],
     "tools": [
-        [start_timer],
-        [get_current_time],
-        [get_air_quality_index],
-        [play_music],
-        [get_weather_forecast],
-        [control_light],
-        [create_reminder],
-        [get_wind_conditions],
+        json.dumps([start_timer]),
+        json.dumps([get_current_time]),
+        json.dumps([get_air_quality_index]),
+        json.dumps([play_music]),
+        json.dumps([get_weather_forecast]),
+        json.dumps([control_light]),
+        json.dumps([create_reminder]),
+        json.dumps([get_wind_conditions]),
     ],
 })
 preference_dataset = preference_dataset.train_test_split(test_size=test_size, shuffle=False)
```
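The point of serializing each row's tool list independently, as the script above now does, is that rows no longer need to share one nested column schema; the schemas here are hypothetical minimal stand-ins for the script's real tool definitions. A sketch of why the string form sidesteps the padding problem that tabular backends otherwise introduce:

```python
import json

# Hypothetical minimal schemas for two tools with different parameter keys.
start_timer = {"type": "function", "function": {"name": "start_timer", "parameters": {"duration": {"type": "integer"}}}}
control_light = {"type": "function", "function": {"name": "control_light", "parameters": {"room": {"type": "string"}}}}

# Each row's tool list becomes one opaque string, so rows never have to agree on a
# common nested layout (which is what leads Arrow/Parquet to pad missing keys with None).
tools_rows = [
    json.dumps([start_timer, control_light]),
    json.dumps([control_light]),
]

# Round-tripping a row recovers exactly the dicts that went in, with no injected None values.
assert json.loads(tools_rows[0]) == [start_timer, control_light]
assert json.loads(tools_rows[1]) == [control_light]
```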

tests/test_reward_trainer.py

Lines changed: 36 additions & 1 deletion

```diff
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+import json
 import pathlib

 import pytest
@@ -666,7 +667,41 @@ def test_train_with_set_chat_template_from_path(self, lazy_shared_datadir):

     def test_train_toolcall_data(self):
         # Get the dataset
-        dataset = load_dataset("trl-internal-testing/toolcall", "preference", split="train")
+        dataset = load_dataset("trl-internal-testing/toolcall", "preference", split="train", revision="refs/pr/3")
+
+        # Initialize the trainer
+        training_args = RewardConfig(output_dir=self.tmp_dir, report_to="none")
+        trainer = RewardTrainer(
+            model="trl-internal-testing/tiny-Qwen2ForCausalLM-2.5",
+            args=training_args,
+            train_dataset=dataset,
+        )
+
+        # Save the initial parameters to compare them later
+        previous_trainable_params = {n: param.clone() for n, param in trainer.model.named_parameters()}
+
+        # Train the model
+        trainer.train()
+
+        # Check that the training loss is not None
+        assert trainer.state.log_history[-1]["train_loss"] is not None
+
+        # Check the params have changed
+        for n, param in previous_trainable_params.items():
+            new_param = trainer.model.get_parameter(n)
+            assert not torch.allclose(param, new_param), f"Parameter {n} has not changed"
+
+    def test_train_toolcall_data_as_json(self):
+        # Tabular backends (Arrow/Parquet) can insert `None` for missing keys in nested structures.
+        # If `tools` is stored as a list of dicts and examples use different dict schemas, nulls may
+        # be introduced and break tool processing. This test ensures we also support `tools` provided
+        # as a list of dicts.
+        dataset = load_dataset("trl-internal-testing/toolcall", "preference", split="train", revision="refs/pr/3")
+
+        def convert_to_json(example):
+            return {"tools": json.loads(example["tools"])}
+
+        dataset = dataset.map(convert_to_json)

         # Initialize the trainer
         training_args = RewardConfig(output_dir=self.tmp_dir, report_to="none")
```

tests/test_sft_trainer.py

Lines changed: 39 additions & 1 deletion

```diff
@@ -13,6 +13,7 @@
 # limitations under the License.

 import gc
+import json
 import pathlib
 from unittest.mock import MagicMock, patch

@@ -1312,7 +1313,44 @@ def test_train_with_set_chat_template_from_path(self, lazy_shared_datadir):

     def test_train_toolcall_data(self):
         # Get the dataset
-        dataset = load_dataset("trl-internal-testing/toolcall", "language_modeling", split="train")
+        dataset = load_dataset(
+            "trl-internal-testing/toolcall", "language_modeling", split="train", revision="refs/pr/2"
+        )
+
+        # Initialize the trainer
+        training_args = SFTConfig(output_dir=self.tmp_dir, report_to="none")
+        trainer = SFTTrainer(
+            model="trl-internal-testing/tiny-Qwen2ForCausalLM-2.5", args=training_args, train_dataset=dataset
+        )
+
+        # Save the initial parameters to compare them later
+        previous_trainable_params = {n: param.clone() for n, param in trainer.model.named_parameters()}
+
+        # Train the model
+        trainer.train()
+
+        # Check that the training loss is not None
+        assert trainer.state.log_history[-1]["train_loss"] is not None
+
+        # Check the params have changed
+        for n, param in previous_trainable_params.items():
+            new_param = trainer.model.get_parameter(n)
+            assert not torch.allclose(param, new_param), f"Parameter {n} has not changed"
+
+    def test_train_toolcall_data_as_json(self):
+        # Tabular backends (Arrow/Parquet) can insert `None` for missing keys in nested structures.
+        # If `tools` is stored as a list of dicts and examples use different dict schemas, nulls may
+        # be introduced and break tool processing. This test ensures we also support `tools` provided
+        # as a list of dicts.
+        # Get the dataset
+        dataset = load_dataset(
+            "trl-internal-testing/toolcall", "language_modeling", split="train", revision="refs/pr/2"
+        )
+
+        def convert_to_json(example):
+            return {"tools": json.loads(example["tools"])}
+
+        dataset = dataset.map(convert_to_json)

         # Initialize the trainer
         training_args = SFTConfig(output_dir=self.tmp_dir, report_to="none")
```

trl/trainer/reward_trainer.py

Lines changed: 5 additions & 2 deletions

```diff
@@ -13,6 +13,7 @@
 # limitations under the License.

 import contextlib
+import json
 import logging
 import os
 import re
@@ -565,20 +566,22 @@ def add_eos(example, eos_token):
     map_kwargs["desc"] = f"Tokenizing {dataset_name} dataset"

     def tokenize_fn(example, processing_class):
+        tools = example.get("tools")
+        tools = json.loads(tools) if isinstance(tools, str) else tools
         if "prompt" in example:  # explicit prompt case
             example["chosen"] = example["prompt"] + example["chosen"]
             example["rejected"] = example["prompt"] + example["rejected"]

         if is_conversational(example):
             chosen_input_ids = processing_class.apply_chat_template(
                 example["chosen"],
-                tools=example.get("tools"),
+                tools=tools,
                 return_dict=True,
                 **example.get("chat_template_kwargs", {}),
             )["input_ids"]
             rejected_input_ids = processing_class.apply_chat_template(
                 example["rejected"],
-                tools=example.get("tools"),
+                tools=tools,
                 return_dict=True,
                 **example.get("chat_template_kwargs", {}),
             )["input_ids"]
```
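The two lines added at the top of `tokenize_fn` are the whole compatibility shim: decode `tools` when it arrives as a JSON string, pass it through otherwise. A standalone sketch of that normalization (the helper name is illustrative, not part of the TRL API):

```python
import json

def normalize_tools(tools):
    """Accept `tools` as a JSON string (recommended), a legacy list of dicts, or None."""
    return json.loads(tools) if isinstance(tools, str) else tools

# A JSON-string column value is decoded into the list of schema dicts...
assert normalize_tools('[{"type": "function"}]') == [{"type": "function"}]

# ...while legacy list[dict] values and missing (None) values pass through unchanged.
assert normalize_tools([{"type": "function"}]) == [{"type": "function"}]
assert normalize_tools(None) is None
```

Either way, `apply_chat_template` always receives a `list[dict]` (or `None`), so the chat template itself never sees the string form.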

trl/trainer/sft_trainer.py

Lines changed: 6 additions & 3 deletions

```diff
@@ -13,6 +13,7 @@
 # limitations under the License.

 import contextlib
+import json
 import os
 import warnings
 from collections import defaultdict
@@ -1016,6 +1017,8 @@ def add_eos(example, eos_token):
     map_kwargs["desc"] = f"Tokenizing {dataset_name} dataset"

     def tokenize_fn(example, processing_class, dataset_text_field, assistant_only_loss):
+        tools = example.get("tools")
+        tools = json.loads(tools) if isinstance(tools, str) else tools
         if "prompt" in example:  # prompt-completion case
             output = {}
             if is_conversational(example):
@@ -1027,7 +1030,7 @@ def tokenize_fn(example, processing_class, dataset_text_field, assistant_only_lo
                 completion = example["completion"]
                 prompt_ids = processing_class.apply_chat_template(
                     prompt,
-                    tools=example.get("tools"),
+                    tools=tools,
                     add_generation_prompt=True,
                     tokenize=True,
                     return_dict=False,
@@ -1038,7 +1041,7 @@ def tokenize_fn(example, processing_class, dataset_text_field, assistant_only_lo
                 prompt_ids = prompt_ids[0] if isinstance(prompt_ids[0], list) else prompt_ids
                 prompt_completion_processed = processing_class.apply_chat_template(
                     prompt + completion,
-                    tools=example.get("tools"),
+                    tools=tools,
                     tokenize=True,
                     return_dict=True,
                     return_assistant_tokens_mask=assistant_only_loss,
@@ -1088,7 +1091,7 @@ def tokenize_fn(example, processing_class, dataset_text_field, assistant_only_lo
             messages = example["messages"]
             processed = processing_class.apply_chat_template(
                 messages,
-                tools=example.get("tools"),
+                tools=tools,
                 tokenize=True,
                 return_dict=True,
                 return_assistant_tokens_mask=assistant_only_loss,
```

0 commit comments