[dataset] fix self-cognition & load_from_cache_file (#4426)

Jintao-Huang · web-flow · commit 09255ce66e2a · 2025-05-30T15:15:19.000+08:00
diff --git a/swift/llm/argument/base_args/data_args.py b/swift/llm/argument/base_args/data_args.py
@@ -22,9 +22,8 @@ class DataArguments:
         streaming (bool): Flag to enable streaming of datasets. Default is False.
         download_mode (Literal): Mode for downloading datasets. Default is 'reuse_dataset_if_exists'.
         columns: Used for manual column mapping of datasets.
-        model_name (List[str]): List containing Chinese and English names of the model. Default is [None, None].
-        model_author (List[str]): List containing Chinese and English names of the model author.
-            Default is [None, None].
+        model_name (Optional[List[str]]): Chinese and English names of the model. Default is None.
+        model_author (Optional[List[str]]): Chinese and English names of the model author. Default is None.
         custom_dataset_info (Optional[str]): Path to custom dataset_info.json file. Default is None.
     """
     # dataset_id or dataset_dir or dataset_path
@@ -49,9 +48,8 @@ class DataArguments:
     strict: bool = False
     remove_unused_columns: bool = True
     # Chinese name and English name
-    model_name: List[str] = field(default_factory=lambda: [None, None], metadata={'help': "e.g. ['小黄', 'Xiao Huang']"})
-    model_author: List[str] = field(
-        default_factory=lambda: [None, None], metadata={'help': "e.g. ['魔搭', 'ModelScope']"})
+    model_name: Optional[List[str]] = field(default=None, metadata={'help': "e.g. ['小黄', 'Xiao Huang']"})
+    model_author: Optional[List[str]] = field(default=None, metadata={'help': "e.g. ['魔搭', 'ModelScope']"})
 
     custom_dataset_info: List[str] = field(default_factory=list)  # .json
 
diff --git a/swift/llm/dataset/dataset/llm.py b/swift/llm/dataset/dataset/llm.py
@@ -825,14 +825,18 @@ def preprocess(self, row: Dict[str, Any]) -> Optional[Dict[str, Any]]:
 
 
 class SelfCognitionPreprocessor(ResponsePreprocessor):
-    name: Optional[Tuple[str, str]] = None
-    author: Optional[Tuple[str, str]] = None
 
     def __init__(self, *args, query_suffix: str = '', response_prefix: str = '', **kwargs):
         self.query_suffix = query_suffix
         self.response_prefix = response_prefix
+        self.name: Optional[Tuple[str, str]] = None
+        self.author: Optional[Tuple[str, str]] = None
         super().__init__(*args, **kwargs)
 
+    def set_name_author(self, name, author):
+        self.name = name
+        self.author = author
+
     def preprocess(self, row: Dict[str, Any]) -> Dict[str, Any]:
         for key in ['name', 'author']:
             val = getattr(self, key)
@@ -863,4 +867,5 @@ def preprocess(self, row: Dict[str, Any]) -> Dict[str, Any]:
             SubsetDataset(
                 'empty_think', preprocess_func=SelfCognitionPreprocessor(response_prefix='<think>\n\n</think>\n\n')),
         ],
+        dataset_name='self-cognition',
         tags=['chat', 'self-cognition', '🔥']))
diff --git a/swift/llm/dataset/loader.py b/swift/llm/dataset/loader.py
@@ -422,18 +422,30 @@ def load(
 
 
 def init_self_cognition_preprocessor(
+    dataset_meta: Optional[DatasetMeta],
     model_name: Union[Tuple[str, str], List[str], None] = None,
     model_author: Union[Tuple[str, str], List[str], None] = None,
 ) -> None:
-    from .dataset.llm import SelfCognitionPreprocessor
+    if dataset_meta is None or model_name is None and model_author is None:
+        return
+    kwargs = {}
     # zh, en
-    for key in ['model_name', 'model_author']:
-        val = locals()[key]
+    for key in ['name', 'author']:
+        val = locals()[f'model_{key}']
         if isinstance(val, str):
             val = [val]
         if val is not None and val[0] is not None and (len(val) == 1 or val[1] is None):
             val = (val[0], val[0])
-        setattr(SelfCognitionPreprocessor, key[len('model_'):], val)
+        kwargs[key] = val
+
+    from .dataset.llm import SelfCognitionPreprocessor
+    preprocess_funcs = [dataset_meta.preprocess_func]
+    preprocess_funcs += [subset.preprocess_func for subset in dataset_meta.subsets if isinstance(subset, SubsetDataset)]
+    for preprocess_func in preprocess_funcs:
+        if isinstance(preprocess_func, SelfCognitionPreprocessor):
+            preprocess_func.set_name_author(**kwargs)
+    logger.info_once(f"SelfCognitionPreprocessor has been successfully configured with name: {kwargs['name']}, "
+                     f"author: {kwargs['author']}.")
 
 
 def load_dataset(
@@ -479,7 +491,7 @@ def load_dataset(
     Returns:
         The train dataset and val dataset
     """
-    init_self_cognition_preprocessor(model_name, model_author)
+    init_self_cognition_preprocessor(DATASET_MAPPING.get('self-cognition'), model_name, model_author)
     if isinstance(datasets, str):
         datasets = [datasets]
     if not isinstance(seed, np.random.RandomState):