add thinking dataset support + bugfix (#2672)

Jonathans575 · web-flow · commit a103c704fb92 · 2025-09-24T19:33:02.000+08:00
diff --git a/paddleformers/datasets/base.py b/paddleformers/datasets/base.py
@@ -161,6 +161,7 @@ def __init__(
         sub_dataset_type=["erniekit"],
         random_seed=11,
         process_fn=None,
+        process_fn_fc=None,
         shuffle_file=False,
         shuffle_files=False,
     ):
@@ -226,7 +227,7 @@ def __init__(
                 task["dataset"] = FileDataset(
                     task["filepath"],
                     process_fn=(
-                        partial(process_fn, task_name=task["task_name"]) if "task_name" in task else process_fn
+                        partial(process_fn_fc, task_name=task["task_name"]) if "task_name" in task else process_fn_fc
                     ),
                     shuffle_file=shuffle_file,
                 )
diff --git a/paddleformers/datasets/finetuning.py b/paddleformers/datasets/finetuning.py
@@ -69,7 +69,8 @@ def create_dataset(**dataset_config):
         task_dataset_path=task_dataset_path,
         task_dataset_prob=task_dataset_prob,
         sub_dataset_type=sub_dataset_type,
-        process_fn=(process_fc if dataset_config["sub_dataset_type"] == "chatml" else process_example),
+        process_fn=process_example,
+        process_fn_fc=process_fc,
     )
     sequence_dataset = SequenceDataset(
         dataset=example_dataset,
@@ -174,7 +175,7 @@ def collate_fn(batch: List[List[Sequence]], tokenizer, model_args, max_seq_len:
 
 def process_fc(data, input_file):
     multi_turns_messages = data["messages"]
-    tools_list = data["tools"]
+    tools_list = data["tools"] if "tools" in data else None
     label = data["label"] if "label" in data else None
 
     system = ""
@@ -507,17 +508,26 @@ def __iter__(self):
 
     def function_call_chat_template(self, messages, tools):
         history = messages[:-1]
+        input_dict = dict()
+        input_dict["messages"] = history
+        if tools is not None:
+            input_dict["tools"] = tools
         history_str = self.tokenizer.apply_chat_template(
-            {"messages": history, "tools": tools},
+            input_dict,
             add_generation_prompt=True,
             tokenize=False,
         )
         history_len = len(history_str)
+        input_dict["messages"] = messages
         all_str = self.tokenizer.apply_chat_template(
-            {"messages": messages, "tools": tools},
+            input_dict,
             add_generation_prompt=False,
             tokenize=False,
         )
+        # (21b think model) remove generation content
+        s = "<|im_end|>\n\n<|im_start|>assistant\n<think>\n"
+        if all_str.endswith(s):
+            all_str = all_str[: -len(s)]
         response_str = all_str[history_len:]
         history_id = self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(history_str))
         response_id = self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(response_str))
@@ -591,7 +601,7 @@ def _postprocess_sequence(self, example, actual_example_num):
                 if LOGGER_COUNT <= 5:
                     logger.warning(f"even one turn, example_output:'{{'src':[{sub_src}, ……],'tgt':[……{sub_tgt}]}}'")
             except Exception:
-                logger.warning(f"[SKIP] wrong example: {example}")
+                logger.warning("[SKIP] wrong example")
 
             return None
 
diff --git a/paddleformers/datasets/hf/__init__.py b/paddleformers/datasets/hf/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/setup.py b/setup.py
@@ -186,7 +186,9 @@ def get_package_data_files(package, data, package_dir=None):
             where=".",
             exclude=("examples*", "tests*", "applications*", "fast_generation*", "model_zoo*"),
         ),
-        package_data={},
+        package_data={
+            "paddleformers": ["datasets/hf/data_info.json"],
+        },
         setup_requires=["cython", "numpy"],
         install_requires=REQUIRED_PACKAGES,
         entry_points={"console_scripts": ["paddleformers = paddleformers.cli:main"]},