Skip to content

Commit 1982091

Browse files
authored
[Improvement] support system prompt in training dataset (#7667)
* update system prompt * support system prompt * add context system prompt docs * update chat-template readme
1 parent 6ddb4b1 commit 1982091

File tree

4 files changed

+40
-7
lines changed

4 files changed

+40
-7
lines changed

llm/data.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,7 @@ def tokenize_rounds_example(tokenizer, example, data_args):
105105
"""
106106

107107
# 0. prepare data
108+
context_data = example.get("context", {})
108109
example["src"] = example["src"] if isinstance(example["src"], list) else [example["src"]]
109110
example["tgt"] = example["tgt"] if isinstance(example["tgt"], list) else [example["tgt"]]
110111

@@ -113,7 +114,9 @@ def tokenize_rounds_example(tokenizer, example, data_args):
113114
conversations = [[src, tgt] for src, tgt in zip(example["src"], example["tgt"])]
114115

115116
# 1. only tokenize input_ids
116-
conversation_result: list[tuple[list[int], list[int]]] = tokenizer.encode_chat_inputs(conversations)
117+
conversation_result: list[tuple[list[int], list[int]]] = tokenizer.encode_chat_inputs(
118+
conversations, context_data=context_data
119+
)
117120
system_ids = conversation_result.pop("system", []) or []
118121

119122
# 2. truncate conversations based on conversation unit

llm/docs/chat_template.md

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,3 +57,29 @@ python finetune_generation.py ... --chat_template ./qwen_14b_chat_template.json
5757
1. `chat_template` 参数和 `model_name_or_path` 参数一致时,将默认使用模型自带的 `chat_template.json` 文件。
5858
1. `chat_template` 参数为文件路径时,将使用该文件中的 `chat_template` 配置。
5959
1. `chat_template` 参数为空时,不使用 `chat_template` 配置进行训练。
60+
61+
#### 如何自定义system prompt
62+
63+
如果想要在训练或者推理的过程中动态调整 system prompt,需要进行以下调整:
64+
65+
1. 需要保证 `chat_template.json` 文件中的 system 配置包含 jinja2 中的变量占位符(比如:`<|im_start|>user\n{{user}}<|im_end|>` 中的 `{{user}}` 就是一个变量占位符),同时尽量让其保留默认值,比如上述配置可调整成:
66+
67+
> 需要开发者手动修改 `chat_template.json` 来实现动态调整 system prompt。
68+
69+
```diff
70+
{
71+
- "system": "You are a helpful assistant.",
72+
+ "system": "{{system | 'You are a helpful assistant.'}}",
73+
"conversation": ["\n<|im_start|>user\n{{user}}<|im_end|>\n<|im_start|>assistant\n", "{{bot}}<|im_end|>"],
74+
"query": "\n<|im_start|>user\n{{query}}<|im_end|>\n<|im_start|>assistant\n",
75+
}
76+
```
77+
78+
2. 训练文本数据中需要配置 `context` 字段,通过它将 `system` 字段传递进去,示例数据为:
79+
80+
```json
81+
{"src": ["user-1", "user-2", ..., "user-n"], "tgt": ["bot-1", "bot-2", ..., "bot-n"], "context": {"system": "你是一个擅长做任务的人工智能助手"}}
82+
...
83+
```
84+
85+
在渲染 chat_template 的时候,将以上数据中的 `context` 作为 jinja2 的上下文数据,这样就可以为训练数据集中的每条训练数据定制 system prompt。

paddlenlp/transformers/chatglm_v2/tokenizer.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
from __future__ import annotations
1515

1616
import os
17-
from typing import Dict, List, Optional, Union
17+
from typing import Any, Dict, List, Optional, Union
1818

1919
import numpy as np
2020
from sentencepiece import SentencePieceProcessor
@@ -280,9 +280,9 @@ def _pad(
280280

281281
return encoded_inputs
282282

283-
def encode_chat_inputs(self, conversations: List[List[str, str]]):
283+
def encode_chat_inputs(self, conversations: List[List[str, str]], context_data: Dict[str, Any] = {}):
284284
# encode system
285-
result = super().encode_chat_inputs(conversations)
285+
result = super().encode_chat_inputs(conversations, context_data=context_data)
286286
if "system" in result:
287287
result["system"] = self.get_prefix_tokens() + result["system"]
288288
else:

paddlenlp/transformers/tokenizer_utils.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -647,26 +647,30 @@ def apply_chat_template(
647647
tokenizer_kwargs["add_special_tokens"] = False
648648
return self(query, **tokenizer_kwargs)
649649

650-
def encode_chat_inputs(self, conversations: List[List[str, str]]):
650+
def encode_chat_inputs(self, conversations: List[List[str, str]], context_data: Dict[str, Any] = {}):
651651
"""Encodes conversation to pairs of token ids.
652652
Turn 0: bos + system + sep + user bot + eos
653653
Turn t: sep + bot + query bot + eos
654654
655655
Args:
656656
conversation (List[List[str, str]]): the conversation of data
657+
context_data (Dict[str, Any]): the context data of conversation
657658
658659
Returns:
659660
List[list[int], list[int]]: the pair of input_ids and target_ids
660661
"""
661662
# encode system
662663
result = {}
663664
if self.chat_template.system:
664-
result["system"] = self.encode(self.chat_template.system, add_special_tokens=False)["input_ids"]
665+
system = self.chat_template.render_system(context_data)
666+
result["system"] = self.encode(system, add_special_tokens=False)["input_ids"]
665667

666668
# encode conversation
667669
conversation_ids = []
668670
for index, conversation in enumerate(conversations):
669-
user_input, bot_output = self.chat_template.render_conversation(conversation, index=index)
671+
user_input, bot_output = self.chat_template.render_conversation(
672+
conversation, index=index, context_data=context_data
673+
)
670674
user_ids = self.encode(user_input, add_special_tokens=False)["input_ids"]
671675
bot_ids = self.encode(bot_output, add_special_tokens=False)["input_ids"]
672676
conversation_ids.append([user_ids, bot_ids])

0 commit comments

Comments
 (0)