Skip to content

Commit eb0b03a

Browse files
committed
[bugfix] fix streaming & compatibility with transformers 4.54 (#5381)
1 parent 430071c commit eb0b03a

File tree

3 files changed

+14
-6
lines changed

3 files changed

+14
-6
lines changed

swift/llm/dataset/utils.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -274,14 +274,14 @@ def __iter__(self):
274274

275275
class EncodePreprocessor(RowPreprocessor):
276276

277-
def __init__(self, template: 'Template'):
277+
def __init__(self, template: 'Template', pre_tokenize: bool = False):
278278
super().__init__()
279279
self.template = template
280-
self.is_multimodal = template.model_meta.is_multimodal
280+
self.pre_tokenize = pre_tokenize
281281

282282
def preprocess(self, row: Dict[str, Any]) -> Optional[Dict[str, Any]]:
283283
encoded = self.template.encode(row, return_length=True)
284-
if self.is_multimodal:
284+
if self.pre_tokenize:
285285
row['length'] = encoded['length']
286286
encoded = row
287287
return encoded

swift/llm/train/sft.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -127,7 +127,14 @@ def _prepare_dataset(self):
127127
if i == 1 and predict_with_generate:
128128
# val_dataset
129129
continue
130-
if (args.model_meta.is_multimodal or args.lazy_tokenize) and not args.streaming:
130+
if args.streaming:
131+
preprocessor = EncodePreprocessor(template=template)
132+
dataset = preprocessor(
133+
dataset,
134+
num_proc=args.dataset_num_proc,
135+
load_from_cache_file=args.load_from_cache_file,
136+
strict=args.strict)
137+
elif (args.model_meta.is_multimodal or args.lazy_tokenize):
131138
dataset = LazyLLMDataset(dataset, template.encode, strict=args.strict, random_state=args.data_seed)
132139
if args.packing:
133140
packing_dataset_cls = IterablePackingDataset if args.streaming else PackingDataset
@@ -299,7 +306,7 @@ def _encode_dataset(self, train_dataset, val_dataset):
299306
# val_dataset
300307
continue
301308
if not args.lazy_tokenize and not args.streaming:
302-
preprocessor = EncodePreprocessor(template=template)
309+
preprocessor = EncodePreprocessor(template=template, pre_tokenize=args.model_meta.is_multimodal)
303310
batch_size = 100 if args.model_meta.is_multimodal else 1000
304311
dataset = preprocessor(
305312
dataset,

swift/trainers/trainers.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -399,7 +399,8 @@ def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=N
399399
from swift.trainers.sequence_parallel import sequence_parallel
400400
loss = sequence_parallel.reduce_outputs(loss, labels)
401401

402-
if getattr(self.args, 'average_tokens_across_devices', False) and self.model_accepts_loss_kwargs:
402+
if getattr(self.args, 'average_tokens_across_devices',
403+
False) and self.model_accepts_loss_kwargs and num_items_in_batch is not None:
403404
loss *= self.accelerator.num_processes
404405

405406
if (outputs.logits is not None and labels is not None and not return_outputs

0 commit comments

Comments (0)