Commit b50db1c

[Bug Fixes] update chatglm1 tokenizer (PaddlePaddle#7870)
* update chatglm1 tokenizer
* update additional_special_token
* add is_training tag
* fix linting
1 parent: 2a8d138

File tree: 4 files changed, +14 -2 lines


llm/data.py

2 additions, 0 deletions

@@ -106,6 +106,8 @@ def tokenize_rounds_example(tokenizer, example, data_args):
 
     # 0. prepare data
     context_data = example.get("context", {})
+    context_data["is_training"] = True
+
     example["src"] = example["src"] if isinstance(example["src"], list) else [example["src"]]
     example["tgt"] = example["tgt"] if isinstance(example["tgt"], list) else [example["tgt"]]
 
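Why this change: tokenize_rounds_example builds training examples, so the commit pins is_training to True in context_data before the chat template is rendered. Since context_data keys become Jinja variables at render time (see render_query in tokenizer_utils.py below), a template can branch on the flag. A minimal sketch; the template string is made up for illustration and is not ChatGLM's real template:

# Sketch: context_data keys are exposed to the Jinja chat template.
# The template below is hypothetical, only to show an is_training branch.
from jinja2 import Template

tmpl = Template("Q: {{ query }}\nA:{% if not is_training %} <generate here>{% endif %}")

print(tmpl.render(query="What is 2 + 2?", is_training=True))   # training render
print(tmpl.render(query="What is 2 + 2?", is_training=False))  # inference render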

llm/predictor.py

3 additions, 2 deletions

@@ -49,6 +49,7 @@
     AutoModelForCausalLM,
     AutoTokenizer,
     ChatGLMv2Tokenizer,
+    ChatGLMTokenizer,
     LlamaTokenizer,
     PretrainedModel,
     PretrainedTokenizer,
@@ -240,7 +241,7 @@ def _preprocess(self, source):
             padding=True,
             # when use chat_template, it should not add special tokens
             # chatglm2 prefix-tokens can not be tokenized into ids
-            add_special_tokens=self.tokenizer.chat_template is None or isinstance(self.tokenizer, ChatGLMv2Tokenizer),
+            add_special_tokens=self.tokenizer.chat_template is None or isinstance(self.tokenizer, (ChatGLMv2Tokenizer, ChatGLMTokenizer)),
         )
         return tokenized_source
 
@@ -924,7 +925,7 @@ def _preprocess(self, source):
             max_length=self.config.src_length,
             # if use chat_template, it will not add special_tokens
             add_special_tokens=self.tokenizer.chat_template is None
-            or isinstance(self.tokenizer, ChatGLMv2Tokenizer),
+            or isinstance(self.tokenizer, (ChatGLMv2Tokenizer, ChatGLMTokenizer)),
         )
         input_ids = tokens["input_ids"][0]
         length = len(input_ids)
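The predictor-side effect: when a chat template is set, add_special_tokens is normally disabled, but ChatGLM v1 (like v2) relies on the tokenizer itself to append prefix tokens such as gMASK, which cannot be recovered from the rendered template text. A standalone sketch of the same rule, assuming any PaddleNLP tokenizer instance is passed in:

# Sketch of the add_special_tokens rule now used in llm/predictor.py.
from paddlenlp.transformers import ChatGLMTokenizer, ChatGLMv2Tokenizer

def should_add_special_tokens(tokenizer) -> bool:
    # No chat template: let the tokenizer add special tokens as usual.
    # ChatGLM v1/v2: always add them, since their prefix tokens cannot be
    # tokenized into ids from template text alone.
    return tokenizer.chat_template is None or isinstance(
        tokenizer, (ChatGLMv2Tokenizer, ChatGLMTokenizer)
    )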

paddlenlp/transformers/chatglm/tokenizer.py

1 addition, 0 deletions

@@ -57,6 +57,7 @@ def __init__(
         num_image_tokens=20000,
         **kwargs
     ) -> None:
+        kwargs["additional_special_tokens"] = kwargs.pop("additional_special_tokens", []) + [gmask_token]
         super().__init__(
             pad_token=pad_token,
             unk_token=unk_token,
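Appending gmask_token to additional_special_tokens registers it with the base tokenizer, so it survives tokenization as a single unit; the kwargs.pop(...) pattern preserves any tokens the caller already passed. A quick check, assuming the default gmask_token value "[gMASK]" and a locally available ChatGLM checkpoint:

# Sketch: confirm gMASK is now a registered special token (the model name
# and the "[gMASK]" literal are assumptions based on ChatGLM defaults).
from paddlenlp.transformers import ChatGLMTokenizer

tokenizer = ChatGLMTokenizer.from_pretrained("THUDM/chatglm-6b")
print(tokenizer.additional_special_tokens)         # expected to include "[gMASK]"
print(tokenizer.convert_tokens_to_ids("[gMASK]"))  # one id, not subword pieces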

paddlenlp/transformers/tokenizer_utils.py

8 additions, 0 deletions

@@ -562,6 +562,11 @@ def render_query(self, query: str, index: int = 0, context_data: Dict[str, Union
         template = self._compile_jinja_template(self.query)
         return template.render(query=query, index=index, **context_data)
 
+    def _init_context_data(self, context_data: Dict[str, Union[int, str]] = {}) -> Dict[str, Union[int, str]]:
+        """init the context data for chat-template"""
+        context_data["is_training"] = context_data.get("is_training", False)
+        return context_data
+
     def render_system(self, context_data: Dict[str, Union[int, str]] = {}) -> str:
         if self.system is None:
             return ""
@@ -633,6 +638,8 @@ def apply_chat_template(
         Returns:
             str | dict[str, numpy.ndarray | paddle.Tensor]: return the result of applied data
         """
+        context_data = self.chat_template._init_context_data(context_data)
+
         if isinstance(conversation, str):
             conversation = [[conversation]]
         elif isinstance(conversation, list) and isinstance(conversation[0], str):
@@ -661,6 +668,7 @@ def encode_chat_inputs(self, conversations: List[List[str, str]], context_data:
         Returns:
             List[list[int], list[int]]: the pair of input_ids and target_ids
         """
+        context_data = self.chat_template._init_context_data(context_data)
         # encode system
         result = {}
         if self.chat_template.system:
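Both entry points now normalize context_data up front, so every chat template can rely on is_training being defined: True when llm/data.py injects it on the training path, False by default at inference. A minimal usage sketch; the model name is an assumption and the conversation string is arbitrary:

# Sketch: is_training defaults to False unless the caller sets it, as
# llm/data.py now does during training-data tokenization.
from paddlenlp.transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm-6b")

inference_out = tokenizer.apply_chat_template("Hello!")  # is_training -> False
training_out = tokenizer.apply_chat_template("Hello!", context_data={"is_training": True})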
