meta-pytorch · krammnic · Jun 6, 2025 · Jun 6, 2025 · Jun 17, 2025 · Jun 17, 2025
diff --git a/torchtune/data/_messages.py b/torchtune/data/_messages.py
@@ -52,6 +52,7 @@ class Message:
         masked (bool): whether the message is masked in the sample. If True, do not use
             in loss calculation. Default: False
         ipython (bool): whether the message is a tool call. Default: False
+        tool_calls (Optional[list]): list of tool calls related to this message. Default: None
         eot (bool): whether the message corresponds to the end of a turn, where control is handed over
             to the assistant from the user or the user from the assistant. Default: True. Should be true
             in most cases except for:
@@ -71,12 +72,14 @@ def __init__(
         content: Union[str, list[dict[str, Any]]],
         masked: bool = False,
         ipython: bool = False,
+        tool_calls: Optional[list] = None,
         eot: bool = True,
     ):
         self.role = role
         self.content = self._convert_to_list_of_dict(content)
         self.masked = masked
         self.ipython = ipython
+        self.tool_calls = tool_calls
         self.eot = eot
 
         self._validate_message()

diff --git a/torchtune/modules/transforms/tokenizers/_hf_tokenizer.py b/torchtune/modules/transforms/tokenizers/_hf_tokenizer.py
@@ -6,7 +6,7 @@
 
 import json
 
-from typing import Any, Optional
+from typing import Any, Optional, Mapping
 
 import jinja2
 from jinja2 import StrictUndefined
@@ -90,8 +90,11 @@ def _infer_bos_eos_tokens(self):
         self.eos_token = "<eos>"
 
         if self.config:
-            self.bos_token = self._get_token_from_config(self.config, "bos_token")
-            self.eos_token = self._get_token_from_config(self.config, "eos_token")
+            try:
+                self.bos_token = self._get_token_from_config(self.config, "bos_token")
+                self.eos_token = self._get_token_from_config(self.config, "eos_token")
+            except ValueError:
+                pass
             if self.bos_token is not None:
                 self.bos_id = self.tokenizer.token_to_id(self.bos_token)
             if self.eos_token is not None:
@@ -103,9 +106,6 @@ def _infer_bos_eos_tokens(self):
             if self.eos_id is None:
                 self.eos_id = self.generation_config.get("eos_token_id")
 
-        if self.bos_id is None or self.eos_id is None:
-            raise ValueError("Could not infer BOS and EOS token IDs from config")
-
     def _infer_should_add_bos_eos(self):
         """
         Hugging Face tokenizers sometimes add BOS by default. We should infer this to determine
@@ -136,9 +136,16 @@ def encode(
             list[int]: The list of token ids.
         """
         token_ids = self.tokenizer.encode(text).ids
-        if add_bos and not self.hf_adds_bos and self.bos_token not in text:
+
+        # Both bos_id and eos_id might be None (null). Therefore, we need an additional check.
+        if (
+            add_bos
+            and not self.hf_adds_bos
+            and self.bos_token not in text
+            and self.bos_id
+        ):
             token_ids.insert(0, self.bos_id)
-        if add_eos and not self.hf_adds_eos:
+        if add_eos and not self.hf_adds_eos and self.eos_id:
             token_ids.append(self.eos_id)
         return token_ids
 
@@ -221,13 +228,15 @@ def __init__(
         *,
         tokenizer_config_json_path: Optional[str] = None,
         generation_config_path: Optional[str] = None,
+        max_seq_len: Optional[int] = None,
         truncation_type: str = "right",
     ):
         self.base_tokenizer = HuggingFaceBaseTokenizer(
             tokenizer_json_path=tokenizer_json_path,
             tokenizer_config_json_path=tokenizer_config_json_path,
             generation_config_path=generation_config_path,
         )
+        self.max_seq_len = max_seq_len
 
         # Contents of the tokenizer_config.json
         config = self.base_tokenizer.config
@@ -272,15 +281,18 @@ def tokenize_messages(
         self,
         messages: list[Message],
         add_eos: bool = True,
-        max_seq_len: Optional[int] = None,
     ) -> tuple[list[int], list[bool]]:
         tokenized_messages = []
         mask = []
         previous_tokens = []
 
         for i, message in enumerate(messages):
             current_messages = [
-                {"role": m.role, "content": m.content[0]["content"]}
+                {
+                    "role": m.role,
+                    "content": m.content[0]["content"],
+                    "tool_calls": m.tool_calls,
+                }
                 for m in messages[: i + 1]
             ]
 
@@ -310,16 +322,26 @@ def tokenize_messages(
         # Finally, truncate if necessary
         tokenized_messages = truncate(
             tokens=tokenized_messages,
-            max_seq_len=max_seq_len,
+            max_seq_len=self.max_seq_len,
             eos_id=self.base_tokenizer.eos_id,
             truncation_type=self.truncation_type,
         )
 
         mask = truncate(
             tokens=mask,
-            max_seq_len=max_seq_len,
+            max_seq_len=self.max_seq_len,
             eos_id=True if add_eos else None,
             truncation_type=self.truncation_type,
         )
 
         return tokenized_messages, mask
+
+    def __call__(self, sample: Mapping[str, Any], inference: bool = False) -> Mapping[str, Any]:
+        """
+        Replace "messages" in ``sample`` with "tokens" and "mask"; mutates and returns ``sample``.
+        """
+        conversation = sample.pop("messages")
+        sample["tokens"], sample["mask"] = self.tokenize_messages(
+            conversation, add_eos=not inference
+        )
+        return sample