
Commit 178033d

fix chatglm3 template bug (#298)
1 parent fa5b3b1 · commit 178033d

7 files changed: +23 −14 lines changed


swift/llm/sft.py

Lines changed: 4 additions & 1 deletion
@@ -181,6 +181,8 @@ def llm_sft(args: SftArguments) -> Dict[str, Union[str, Any]]:
         greater_is_better=args.predict_with_generate,
         sortish_sampler=True,
         optim=args.optim,
+        adam_beta1=args.adam_beta1,
+        adam_beta2=args.adam_beta2,
         hub_model_id=args.hub_model_id,
         hub_private_repo=args.hub_private_repo,
         push_hub_strategy=args.push_hub_strategy,
@@ -200,7 +202,8 @@ def llm_sft(args: SftArguments) -> Dict[str, Union[str, Any]]:
         disable_tqdm=args.disable_tqdm,
         save_on_each_node=args.save_on_each_node,
         acc_strategy=args.acc_strategy,
-        save_safetensors=args.save_safetensors)
+        save_safetensors=args.save_safetensors,
+        logging_first_step=True)
 
     if args.gradient_checkpointing:
         model.config.use_cache = False  # fix transformers==4.36
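
Note: adam_beta1, adam_beta2, and logging_first_step are standard transformers.TrainingArguments fields, so the new SftArguments values are forwarded to the trainer configuration as plain keyword arguments. A minimal standalone sketch of the same pass-through (not the project's code, which uses its own Seq2SeqTrainingArguments subclass):

# Minimal sketch, independent of swift: the forwarded keywords are ordinary
# transformers.TrainingArguments fields.
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir='output',
    optim='adamw_torch',
    adam_beta1=0.9,
    adam_beta2=0.95,          # a lower beta2, as exercised in tests/llm/test_run.py
    logging_first_step=True,  # also log metrics at the very first step
)
print(training_args.adam_beta1, training_args.adam_beta2)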

swift/llm/utils/argument.py

Lines changed: 2 additions & 0 deletions
@@ -104,6 +104,8 @@ class SftArguments:
     # if max_steps >= 0, override num_train_epochs
     max_steps: int = -1
     optim: str = 'adamw_torch'
+    adam_beta1: float = 0.9
+    adam_beta2: float = 0.999
     learning_rate: Optional[float] = None
     weight_decay: float = 0.01
     gradient_accumulation_steps: Optional[int] = None
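
The two new dataclass fields follow the usual argument-dataclass pattern, so they can be overridden the same way as the existing optim field. A minimal sketch with a hypothetical trimmed-down stand-in class parsed via HfArgumentParser (not swift's own argument parsing):

# Hypothetical stand-in for SftArguments, only to show how new float fields
# surface as command-line flags when a dataclass is parsed this way.
from dataclasses import dataclass
from transformers import HfArgumentParser


@dataclass
class MiniSftArguments:
    optim: str = 'adamw_torch'
    adam_beta1: float = 0.9
    adam_beta2: float = 0.999


args, = HfArgumentParser(MiniSftArguments).parse_args_into_dataclasses(
    ['--adam_beta2', '0.95'])
print(args.adam_beta2)  # 0.95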

swift/llm/utils/dataset.py

Lines changed: 9 additions & 6 deletions
@@ -409,7 +409,7 @@ def _repair_agent_conversations(conversations: str,
 
 advertise_gen_prompt = """Task: Generating advertisements based on keywords.
 Keywords: {query}
-Advertisements: """
+Advertisements:"""
 register_dataset(
     DatasetName.advertise_gen_zh,
     'lvjianjin/AdvertiseGen', ['train'], ['validation'],
@@ -513,7 +513,7 @@ def _preprocess_dureader_robust(dataset: HfDataset) -> HfDataset:
     prompt = """Task: Question Generation
 Context: {context}
 Answer: {answer}
-Question: """
+Question:"""
     query = []
     response = []
     for d in dataset:
@@ -850,7 +850,7 @@ def _preprocess_hc3(dataset: HfDataset) -> HfDataset:
 Question: {question}
 Answer: {answer}
 Category: Human, ChatGPT
-Output: """
+Output:"""
     query = []
     response = []
     for d in dataset:
@@ -978,6 +978,9 @@ def add_self_cognition_dataset(
     return concatenate_datasets([train_dataset, dataset])
 
 
+NoneType = type(None)
+
+
 def _check_dataset(
     dataset: Optional[None],
     check_dataset_strategy: Literal['none', 'discard', 'error', 'warning']
@@ -1003,7 +1006,7 @@ def _check_dataset(
                 continue
             else:
                 raise ValueError(f"d['response']: {d['response']}, i: {i}")
-        if has_query and not isinstance(d['response'], str):
+        if has_query and not isinstance(d['query'], (str, NoneType)):
             is_modified = True
             if check_dataset_strategy == 'discard':
                 continue
@@ -1012,7 +1015,7 @@ def _check_dataset(
                 continue
             else:
                 raise ValueError(f"d['query']: {d['query']}, i: {i}")
-        if has_history and not isinstance(d['history'], (list, type(None))):
+        if has_history and not isinstance(d['history'], (list, NoneType)):
             is_modified = True
             if check_dataset_strategy == 'discard':
                 continue
@@ -1021,7 +1024,7 @@ def _check_dataset(
                 continue
            else:
                 raise ValueError(f"d['history']: {d['history']}, i: {i}")
-        if has_system and not isinstance(d['system'], str):
+        if has_system and not isinstance(d['system'], (str, NoneType)):
             is_modified = True
             if check_dataset_strategy == 'discard':
                 continue
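
The _check_dataset change relaxes the per-row validation: query and system may now be None or a string, history may be None or a list, and the previous check also inspected d['response'] in the query branch, which is corrected here. A standalone sketch of the relaxed check (not the project's code):

# Standalone sketch mirroring the (str, NoneType) / (list, NoneType)
# isinstance pattern used in _check_dataset above.
NoneType = type(None)


def row_ok(d: dict) -> bool:
    return (isinstance(d.get('response'), str)
            and isinstance(d.get('query'), (str, NoneType))
            and isinstance(d.get('history'), (list, NoneType))
            and isinstance(d.get('system'), (str, NoneType)))


print(row_ok({'response': 'hi', 'query': None, 'history': None, 'system': None}))  # True
print(row_ok({'response': 'hi', 'query': 123, 'history': None, 'system': None}))   # False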

swift/llm/utils/preprocess.py

Lines changed: 1 addition & 1 deletion
@@ -232,7 +232,7 @@ def __init__(self, labels: List[str], task_name: str,
         self.prompt = f"""Task: {task_name}
 {inputs}
 Category: {category}
-Output: """
+Output:"""
         self.task_name = task_name
         self.is_pair_seq = is_pair_seq
 

swift/llm/utils/template.py

Lines changed: 3 additions & 3 deletions
@@ -618,13 +618,13 @@ def register_template(template_type: str,
 
 register_template(
     TemplateType.chatglm3,
-    Template([[64790, 64792]], [[64795], '\n {{QUERY}}', [64796], '\n '], [],
+    Template([[64790, 64792]], [[64795], '\n {{QUERY}}', [64796], '\n'], [],
              [['eos_token_id']], None,
              [[64790, 64792, 64794], '\n {{SYSTEM}}']))
 
 register_template(
     TemplateType.deepseek,
-    Template([['bos_token_id']], ['User: {{QUERY}}\n\nAssistant: '],
+    Template([['bos_token_id']], ['User: {{QUERY}}\n\nAssistant:'],
              [['eos_token_id']], [['eos_token_id']], None,
              [['bos_token_id'], '{{SYSTEM}}\n\n']))
 
@@ -660,7 +660,7 @@ def register_template(template_type: str,
 )
 register_template(
     TemplateType.openbuddy,
-    Template([['bos_token_id']], ['User: {{QUERY}}\nAssistant: '], ['\n'],
+    Template([['bos_token_id']], ['User: {{QUERY}}\nAssistant:'], ['\n'],
              [['eos_token_id']], OPENBUDDY_DEFAULT_SYSTEM,
              [['bos_token_id'], '{{SYSTEM}}\n\n']))
 
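
This is the fix named in the commit title: the trailing space in the chatglm3 suffix ('\n ' instead of '\n'), and in the deepseek/openbuddy 'Assistant: ' prompts, was encoded as an extra token, so swift's prompt ended one token longer than the official chat prompt. With the space removed the encodings match exactly, which is why the [:-1] workarounds in the tests below can be dropped. A small illustration built from the expected ids in tests/llm/test_template.py:

# Token-level view of the chatglm3 fix, using the expected ids from
# tests/llm/test_template.py: the old suffix '\n ' carried a trailing 30910.
old_tail = [30910, 13, 30910]  # tokens after [64796] for '\n ' (trailing space)
new_tail = [30910, 13]         # tokens after [64796] for '\n'
assert old_tail[:len(new_tail)] == new_tail
assert old_tail != new_tail    # the extra 30910 broke parity with the official prompt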

tests/llm/test_run.py

Lines changed: 1 addition & 0 deletions
@@ -42,6 +42,7 @@ def test_basic(self):
             quantization_bit=quantization_bit,
             batch_size=2,
             eval_steps=5,
+            adam_beta2=0.95,
             check_dataset_strategy='warning',
             train_dataset_sample=200,
             predict_with_generate=predict_with_generate,

tests/llm/test_template.py

Lines changed: 3 additions & 3 deletions
@@ -85,7 +85,7 @@ def test_chatglm3_template(self):
             64790, 64792, 64794, 30910, 13, 344, 383, 260, 6483, 9319, 30992,
             64795, 30910, 13, 30910, 30939, 30943, 30966, 30972, 30970, 31011,
             30943, 30966, 30972, 30980, 31514, 64796
-        ] + [30910, 13, 30910]
+        ] + [30910, 13]
         input_ids_swift = template.encode({
             'query': query,
             'system': system
@@ -439,7 +439,7 @@ def test_openbuddy_template(self):
         #
         input_ids_official = inputs[0].tolist()
         input_ids_swift = template.encode({'query': query})['input_ids']
-        self.assertTrue(input_ids_swift[:-1] == input_ids_official)
+        self.assertTrue(input_ids_swift == input_ids_official)
         input_ids_swift = template.encode({
             'query': query,
             'history': [['1234', 'avdc']]
@@ -577,7 +577,7 @@ def test_deepseek_template(self):
         response = tokenizer.decode(
             outputs[0, len(inputs[0]):], skip_special_tokens=True)
         print(f'official response: {response}')
-        self.assertTrue(input_ids_swift[:-1] == input_ids_official)
+        self.assertTrue(input_ids_swift == input_ids_official)
 
     @unittest.skipIf(
         SKPT_TEST,
