
Commit ed87efa

feat(data): add --train-only-last-turn option for thinking models (#419)
Add a new CLI argument to train_eagle3.py that enables training only on the last assistant turn in each conversation. This is useful for 'thinking' models (like DeepSeek-R1) or distilled datasets where the conversation history lacks the thought process present in the current generation.

Changes:
- Add train_only_last_turn parameter to GeneralParser, HarmonyParser, ThinkingParser
- Add train_only_last_turn parameter to preprocess_conversations and build_eagle3_dataset
- Add --train-only-last-turn CLI argument to train_eagle3.py

Co-authored-by: yiliu <123>
1 parent c183a3a commit ed87efa

File tree

3 files changed: 41 additions, 5 deletions

- scripts/train_eagle3.py
- specforge/data/parse.py
- specforge/data/preprocessing.py

scripts/train_eagle3.py

Lines changed: 8 additions & 0 deletions
```diff
@@ -109,6 +109,12 @@ def parse_args() -> Tuple[ArgumentParser, Namespace]:
         action="store_true",
         help="Whether the input data is preformatted text with the chat template already applied to the conversation messages.",
     )
+    dataset_group.add_argument(
+        "--train-only-last-turn",
+        action="store_true",
+        help="If set, only the last assistant turn in each conversation contributes to the loss. "
+        "Useful for thinking models where conversation history may lack thought processes.",
+    )
     dataset_group.add_argument("--build-dataset-num-proc", type=int, default=8)
     dataset_group.add_argument(
         "--dataloader-num-workers",
@@ -422,6 +428,7 @@ def build_dataloaders(
             is_preformatted=args.is_preformatted,
             processor=processor,
             num_proc=args.build_dataset_num_proc,
+            train_only_last_turn=args.train_only_last_turn,
         )
         vocab_mapping_path = generate_vocab_mapping_file(
             dataset=train_eagle3_dataset,
@@ -462,6 +469,7 @@ def build_dataloaders(
             processor=processor,
             num_proc=args.build_dataset_num_proc,
             is_preformatted=args.is_preformatted,
+            train_only_last_turn=args.train_only_last_turn,
         )
     elif args.eval_hidden_states_path is not None:
        eval_eagle3_dataset = build_offline_eagle3_dataset(
```
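
As a quick illustration of the new flag's default behavior, a standalone argparse sketch (not the project's actual parser, which defines many more options): because of `action="store_true"`, the option defaults to False, so existing training runs are unaffected unless the flag is passed.

```python
from argparse import ArgumentParser

# Standalone sketch mirroring the new flag added in this commit.
parser = ArgumentParser()
parser.add_argument("--train-only-last-turn", action="store_true")

print(parser.parse_args([]).train_only_last_turn)                           # False
print(parser.parse_args(["--train-only-last-turn"]).train_only_last_turn)   # True
```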

specforge/data/parse.py

Lines changed: 21 additions & 4 deletions
```diff
@@ -54,6 +54,7 @@ def parse(
         conversation: "Conversation",
         max_length: int,
         preformatted: bool = False,
+        train_only_last_turn: bool = False,
         **kwargs,
     ) -> Dict[str, List[torch.Tensor]]:
         if not preformatted:
@@ -138,7 +139,12 @@ def parse(
         )
         input_ids = encoding.input_ids[0]
         loss_mask = torch.zeros(len(input_ids), dtype=torch.long)
-        for match in re.finditer(assistant_pattern, conversation, re.DOTALL):
+
+        matches = list(re.finditer(assistant_pattern, conversation, re.DOTALL))
+        if train_only_last_turn and matches:
+            matches = [matches[-1]]  # Only keep the last match
+
+        for match in matches:
             content_start_char = match.start(1)
             content_end_char = match.end(1)
 
@@ -200,7 +206,11 @@ def build_single_turn_prompt(
         return prompt_text
 
     def parse(
-        self, conversation: "Conversation", max_length: int, preformatted: bool = False
+        self,
+        conversation: "Conversation",
+        max_length: int,
+        preformatted: bool = False,
+        train_only_last_turn: bool = False,
     ) -> List[torch.Tensor]:
         # conversation = process_harmony_conversations(conversation)
         if not preformatted:
@@ -243,7 +253,11 @@ def parse(
         )
 
         # Find all matching segments
-        for match in pattern.finditer(conversation):
+        matches = list(pattern.finditer(conversation))
+        if train_only_last_turn and matches:
+            matches = [matches[-1]]  # Only keep the last match
+
+        for match in matches:
             # match.start(0) is the start index of the full match (including `<|start|>assistant`)
             # match.start(1) is the start index of the first capture group (excluding `<|start|>assistant`)
             # match.end(1) is the end index of the content
@@ -288,10 +302,13 @@ def parse(
         conversation: "Conversation",
         max_length: int,
         preformatted: bool = False,
+        train_only_last_turn: bool = False,
         **kwargs,
     ) -> Dict[str, List[torch.Tensor]]:
         if self.chat_template.enable_thinking:
             kwargs["enable_thinking"] = True
         else:
             pass
-        return super().parse(conversation, max_length, preformatted, **kwargs)
+        return super().parse(
+            conversation, max_length, preformatted, train_only_last_turn, **kwargs
+        )
```
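
To make the selection logic above concrete, a toy sketch: the real assistant_pattern is derived from the chat template inside the parser, so the markup and pattern here are illustrative assumptions, not the project's actual template.

```python
import re

# Toy pattern standing in for the template-derived assistant_pattern.
assistant_pattern = r"<assistant>(.*?)</assistant>"
conversation = (
    "<user>hi</user><assistant>history reply, no thoughts</assistant>"
    "<user>solve it</user><assistant>final reply with thinking</assistant>"
)

train_only_last_turn = True
matches = list(re.finditer(assistant_pattern, conversation, re.DOTALL))
if train_only_last_turn and matches:
    matches = [matches[-1]]  # keep only the last assistant turn

for m in matches:
    print(m.group(1))  # -> "final reply with thinking"
```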

specforge/data/preprocessing.py

Lines changed: 12 additions & 1 deletion
```diff
@@ -117,6 +117,7 @@ def preprocess_conversations(
     chat_template: ChatTemplate,
     max_length: int = 2048,
     is_preformatted: bool = False,
+    train_only_last_turn: bool = False,
     **kwargs,
 ) -> Dict[str, List[torch.Tensor]]:
     """
@@ -129,6 +130,7 @@ def preprocess_conversations(
         chat_template: The chat template to use for formatting/identifying spans.
         max_length: The maximum length of the tokenized input.
         is_preformatted: Whether the input is already formatted text strings.
+        train_only_last_turn: If True, only the last assistant turn contributes to the loss.
 
     Returns:
         A dictionary containing:
@@ -158,7 +160,11 @@ def preprocess_conversations(
             # if the source is None, skip it
             continue
         input_ids, loss_mask = parser.parse(
-            source, max_length, preformatted=is_preformatted, **kwargs_item
+            source,
+            max_length,
+            preformatted=is_preformatted,
+            train_only_last_turn=train_only_last_turn,
+            **kwargs_item,
         )
         results["input_ids"].append(input_ids[None, :])
         results["loss_mask"].append(loss_mask[None, :])
@@ -294,6 +300,7 @@ def build_eagle3_dataset(
     is_vlm: Optional[bool] = False,
     processor: Optional[ImageProcessingMixin] = None,
     is_preformatted: Optional[bool] = False,
+    train_only_last_turn: Optional[bool] = False,
 ) -> HFDataset:
     """
     build eagle3 dataset
@@ -319,6 +326,8 @@ def build_eagle3_dataset(
             the assistant spans for loss mask generation.
             If True, expects "text" column with ready-to-train text.
             If False, expects "conversations" column with ShareGPT format.
+        train_only_last_turn: If True, only the last assistant turn contributes to the loss.
+            Useful for thinking models where history may not contain thoughts.
 
     Returns:
         The processed HF dataset.
@@ -360,6 +369,7 @@ def preprocess_function(examples):
                 template,
                 max_length,
                 is_preformatted=True,
+                train_only_last_turn=train_only_last_turn,
             )
         else:
             # Handle ShareGPT conversations
@@ -376,6 +386,7 @@ def preprocess_function(examples):
                 template,
                 max_length,
                 is_preformatted=False,
+                train_only_last_turn=train_only_last_turn,
                 **examples,
             )
 
```
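
The net effect is on the loss mask that preprocess_conversations returns: with the flag set, only the token span of the final assistant turn stays unmasked. A self-contained sketch with made-up token spans:

```python
import torch

# Two assistant turns at hypothetical token ranges [5, 12) and [20, 40).
input_len = 40
assistant_spans = [(5, 12), (20, 40)]

def make_mask(spans, train_only_last_turn):
    if train_only_last_turn and spans:
        spans = [spans[-1]]  # same selection rule as the parsers above
    mask = torch.zeros(input_len, dtype=torch.long)
    for start, end in spans:
        mask[start:end] = 1
    return mask

print(make_mask(assistant_spans, False).sum())  # tensor(27): both turns trained
print(make_mask(assistant_spans, True).sum())   # tensor(20): only the last turn
```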
