
Commit 32e4a08

add no template (#2676)
1 parent 1934086 commit 32e4a08

File tree

examples/run_finetune.py
paddleformers/datasets/finetuning.py
paddleformers/transformers/tokenizer_utils.py
paddleformers/trl/sftdata_config.py
tests/dataset/test_ernie_datasets.py

5 files changed: +101 additions, -24 deletions

examples/run_finetune.py

Lines changed: 1 addition & 0 deletions
@@ -204,6 +204,7 @@ def neft_post_hook(module, input, output):
         "packing": data_args.packing,
         "mix_strategy": data_args.mix_strategy,
         "encode_one_turn": data_args.encode_one_turn,
+        "use_template": data_args.use_template,
     }
 
     train_dataset = create_dataset_sft(
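For reference, the dataset factory now expects this extra key. Below is a minimal sketch of the dict that ends up feeding create_dataset_sft; every value except the new key is a placeholder rather than the example script's real default, and the exact call signature is not shown in this diff:

dataset_config = {
    "packing": False,            # placeholder value
    "mix_strategy": "random",    # placeholder value
    "encode_one_turn": True,     # placeholder value
    "use_template": True,        # new key from this commit; set False to skip chat-template rendering
}
train_dataset = create_dataset_sft(**dataset_config)  # keyword expansion assumed, mirroring create_dataset(**dataset_config)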

paddleformers/datasets/finetuning.py

Lines changed: 44 additions & 24 deletions
@@ -83,6 +83,7 @@ def create_dataset(**dataset_config):
         packing=dataset_config["packing"],
         mix_strategy=dataset_config["mix_strategy"],
         encode_one_turn=dataset_config["encode_one_turn"],
+        use_template=dataset_config["use_template"],
     )
     return sequence_dataset
 
@@ -289,6 +290,7 @@ def __init__(
         packing: bool = False,
         mix_strategy: str = "random",
         encode_one_turn: bool = True,
+        use_template: bool = True,
     ):
         """Initialize SequenceDataset.
 
@@ -319,6 +321,7 @@ def __init__(
         self.packing = packing
         self.mix_strategy = mix_strategy
         self.encode_one_turn = encode_one_turn
+        self.use_template = use_template
         self.num_samples_each_epoch = num_samples_each_epoch
         self.reverse = True
 
@@ -536,12 +539,19 @@ def _postprocess_sequence(self, example, actual_example_num):
         Returns:
             Sequence: Processed sequence or None if invalid.
         """
-        if not self.tokenizer.chat_template:
-            self.tokenizer.chat_template = NONE_CHAT_TEMPLATE
-        if example.is_function_call:
-            encoded_messages = self._postprocess_fc_sequence(example)
+        if self.use_template:
+            if not self.tokenizer.chat_template:
+                self.tokenizer.chat_template = NONE_CHAT_TEMPLATE
+            if example.is_function_call:
+                encoded_messages = self._postprocess_fc_sequence(example)
+            else:
+                encoded_messages = self.tokenizer.encode_chat_inputs(
+                    example.request, encode_one_turn=self.encode_one_turn
+                )
         else:
-            encoded_messages = self.tokenizer.encode_chat_inputs(example.request, encode_one_turn=self.encode_one_turn)
+            encoded_messages = self.tokenizer.encode_chat_inputs_with_no_template(
+                example.request, encode_one_turn=self.encode_one_turn
+            )
 
         num_reserved_tokens_for_each_dialog = 1  # only break_turn_token or end_token
         num_reserved_tokens_for_each_turn = 8
@@ -585,26 +595,36 @@ def _postprocess_sequence(self, example, actual_example_num):
 
             return None
 
-        if self.begin_token_id is not None and self.end_of_response_id is not None:
-            # Maybe left truncated, so need to add begin_token
-            if tokens[0] != self.begin_token_id:
-                tokens = [self.begin_token_id] + tokens
-                loss_mask = [0] + loss_mask
-
-            if len(tokens) > self.max_seq_len:
-                raise RuntimeError(f"token_ids is too long: {len(tokens)}")
-
-            # Add EOS token at the end
-            del tokens[-1]
-            del loss_mask[-1]
-            labels = tokens[1:] + [self.tokenizer.eos_token_id]
-
-            # end_of_response is a special token that indicates the end of the turn.
-            # end_token is a special token that indicates the end of the answer.
-            labels = [label if label != self.end_of_response_id else self.tokenizer.eos_token_id for label in labels]
+        if self.use_template:
+            if self.begin_token_id is not None and self.end_of_response_id is not None:
+                # Maybe left truncated, so need to add begin_token
+                if tokens[0] != self.begin_token_id:
+                    tokens = [self.begin_token_id] + tokens
+                    loss_mask = [0] + loss_mask
+
+                if len(tokens) > self.max_seq_len:
+                    raise RuntimeError(f"token_ids is too long: {len(tokens)}")
+
+                # Add EOS token at the end
+                del tokens[-1]
+                del loss_mask[-1]
+                labels = tokens[1:] + [self.tokenizer.eos_token_id]
+
+                # end_of_response is a special token that indicates the end of the turn.
+                # end_token is a special token that indicates the end of the answer.
+                labels = [
+                    label if label != self.end_of_response_id else self.tokenizer.eos_token_id for label in labels
+                ]
+            else:
+                tokens = tokens[:-1] + [self.tokenizer.eos_token_id]
+                labels = tokens[1:] + [-100]
+                if len(tokens) > self.max_seq_len:
+                    raise RuntimeError(f"token_ids is too long: {len(tokens)}")
         else:
-            tokens = tokens[:-1] + [self.tokenizer.eos_token_id]
-            labels = tokens[1:] + [-100]
+            oral_tokens = tokens
+            tokens = oral_tokens[:-1]
+            labels = oral_tokens[1:]
+            loss_mask = loss_mask[1:]
         if len(tokens) > self.max_seq_len:
             raise RuntimeError(f"token_ids is too long: {len(tokens)}")
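In the new no-template branch the training pair is a plain one-token shift of the raw encoded sequence, with the loss mask realigned the same way. A toy walk-through of those four lines (the token ids below are invented, not real vocabulary entries):

oral_tokens = [11, 22, 33, 44]   # full encoded sequence
tokens = oral_tokens[:-1]        # [11, 22, 33] -> model inputs
labels = oral_tokens[1:]         # [22, 33, 44] -> next-token targets
loss_mask = [0, 0, 1, 1][1:]     # [0, 1, 1]    -> realigned with the shifted labels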

paddleformers/transformers/tokenizer_utils.py

Lines changed: 48 additions & 0 deletions
@@ -515,6 +515,54 @@ def encode_chat_inputs(
             query = self._encode_chat_inputs_openai_format(conversations)
         return query
 
+    def encode_chat_inputs_with_no_template(
+        self, conversations: List[List[str, str]] | Dict[str, Any], context_data: Dict[str, Any] = {}, **kwargs
+    ):
+        """
+        Args:
+            conversation (List[List[str, str]]): the conversation of data
+            context_data (Dict[str, Any]): the context data of conversation
+
+        Returns:
+            List[list[int], list[int]]: the pair of input_ids and target_ids
+        """
+        assert isinstance(conversations, dict)
+
+        conversation_dict = {} if "tools" not in conversations else {"tools": conversations["tools"]}
+        conversation_dict["messages"] = (
+            [conversations["messages"][0]] if conversations["messages"][0]["role"] == "system" else []
+        )
+
+        if conversations["messages"][0]["role"] == "system":
+            conversations["messages"] = conversations["messages"][1:]
+
+        cur_str = ""
+        conversation_ids = []
+        for idx in range(0, len(conversations["messages"]), 2):
+            conversation_id = []
+            conversation_dict["messages"].append(conversations["messages"][idx])
+            round_str = conversation_dict["messages"]
+            # fake template
+            tokenize_input = "".join(item["content"] for item in round_str)
+            tokenize_input = tokenize_input[len(cur_str) :]
+            input_ids = self.convert_tokens_to_ids(self.tokenize(tokenize_input))
+            conversation_id.append(input_ids)
+            cur_str = tokenize_input
+
+            if idx + 1 < len(conversations["messages"]):
+                conversation_dict["messages"].append(conversations["messages"][idx + 1])
+                round_str = conversation_dict["messages"]
+                # fake template
+                tokenize_input = "".join(item["content"] for item in round_str)
+                tokenize_input = tokenize_input[len(cur_str) :]
+                output_ids = self.convert_tokens_to_ids(self.tokenize(tokenize_input))
+                conversation_id.append(output_ids)
+
+            conversation_ids.append(conversation_id)
+            conversation_dict["messages"] = []
+            cur_str = ""
+        return conversation_ids
+
     def decode_token(
         self,
         all_input_ids: List[int],
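A hypothetical call of the new method; the tokenizer instance and the message contents below are made up, and only the method name and the OpenAI-style dict shape come from this diff. Each user/assistant round yields one [input_ids, output_ids] pair tokenized from the raw concatenated content with no chat template applied, and a leading system message is folded into the first round's input:

conversations = {
    "messages": [
        {"role": "system", "content": "You are a helpful assistant. "},
        {"role": "user", "content": "Hello. "},
        {"role": "assistant", "content": "Hi, how can I help? "},
    ]
}
# `tokenizer` stands for any tokenizer that carries this mixin (assumption).
conversation_ids = tokenizer.encode_chat_inputs_with_no_template(conversations)
# -> [[ids("You are a helpful assistant. Hello. "), ids("Hi, how can I help? ")]]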

paddleformers/trl/sftdata_config.py

Lines changed: 4 additions & 0 deletions
@@ -57,6 +57,10 @@ class DataConfig:
             "help": "Strategy to use in dataset mixing (random/concat/interleave) (undersampling/oversampling)."
         },
     )
+    use_template: bool = field(
+        default=True,
+        metadata={"help": "Whether to use template in data processing."},
+    )
     encode_one_turn: bool = field(
         default=True,
         metadata={"help": "Whether encode each round independently in a multi-round dialogue."},
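A minimal sketch of overriding the new field programmatically; it assumes the remaining DataConfig fields all carry defaults (not shown in this diff), and the import path simply mirrors the file path above:

from paddleformers.trl.sftdata_config import DataConfig  # module path taken from the diff

data_args = DataConfig(use_template=False)  # keep the default True to preserve current behavior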

tests/dataset/test_ernie_datasets.py

Lines changed: 4 additions & 0 deletions
@@ -42,6 +42,7 @@ def test_random_dataset_len(self):
             "packing": False,
             "mix_strategy": "random",
             "encode_one_turn": True,
+            "use_template": True,
         }
 
         train_dataset = create_dataset_sft(
@@ -71,6 +72,7 @@ def test_concat_dataset_len(self):
             "packing": False,
             "mix_strategy": "concat",
             "encode_one_turn": True,
+            "use_template": True,
         }
 
         train_dataset = create_dataset_sft(
@@ -100,6 +102,7 @@ def test_interleave_under_dataset_len(self):
             "packing": False,
             "mix_strategy": "interleave_under",
             "encode_one_turn": True,
+            "use_template": True,
         }
 
         train_dataset = create_dataset_sft(
@@ -129,6 +132,7 @@ def test_interleave_over_dataset_len(self):
             "packing": False,
             "mix_strategy": "interleave_over",
             "encode_one_turn": True,
+            "use_template": True,
         }
 
         train_dataset = create_dataset_sft(
