Commit 5b1da6b

[megatron] write use last rank (#5324)
1 parent: fc2bc3d

File tree: 6 files changed (+35, -16 lines)

swift/llm/template/template/utils.py

Lines changed: 1 addition & 1 deletion
@@ -40,7 +40,7 @@ def _swift_prepare_inputs(self, inputs):
     messages = inputs.messages
     # Only during inference or training, and only if the loss_scale is set to 'last_round',
     # will the previous 'think' entries be deleted.
-    if not self.is_training or self.loss_scale.name == 'last_round':
+    if not self.is_training or self.loss_scale.name in {'last_round', 'last_round_with_ignore_empty_think'}:
         for i, message in enumerate(messages):
            # Delete the content before '</think>' in all assistant turns except the last round.
            if message['role'] == 'assistant' and isinstance(message['content'], str) and i != len(messages) - 1:
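
For context, a minimal runnable sketch of the behavior this branch gates, with a hypothetical helper name and made-up messages (the real logic lives inside _swift_prepare_inputs): every assistant turn except the last has its content up to and including '</think>' removed, and the new loss-scale name 'last_round_with_ignore_empty_think' now triggers the same cleanup.

def strip_earlier_think(messages):
    # Hypothetical stand-in for the gated block above.
    # Assumption: messages is a list of {'role': str, 'content': str} dicts.
    for i, message in enumerate(messages):
        if message['role'] == 'assistant' and isinstance(message['content'], str) and i != len(messages) - 1:
            # Keep only the text after '</think>' in non-final assistant turns.
            message['content'] = message['content'].split('</think>', 1)[-1]
    return messages

messages = [
    {'role': 'user', 'content': 'hi'},
    {'role': 'assistant', 'content': '<think>step 1</think>hello'},
    {'role': 'user', 'content': 'and again?'},
    {'role': 'assistant', 'content': '<think>step 2</think>hello again'},
]
strip_earlier_think(messages)
# The first assistant turn becomes 'hello'; the final turn keeps its think block.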

swift/megatron/init.py

Lines changed: 8 additions & 6 deletions
@@ -8,7 +8,6 @@
 from datetime import datetime
 from typing import List, Optional, Tuple

-import numpy as np
 import peft
 import torch
 import torch.nn as nn

@@ -17,8 +16,8 @@
 from tqdm import tqdm

 from swift.llm import git_clone_github
-from swift.utils import (JsonlWriter, format_time, get_logger, is_flash_attn_3_available, is_master,
-                         is_megatron_available, safe_ddp_context, split_list, subprocess_run)
+from swift.utils import (JsonlWriter, format_time, get_logger, is_flash_attn_3_available, is_megatron_available,
+                         safe_ddp_context, split_list, subprocess_run)

 logger = get_logger()

@@ -75,10 +74,10 @@ def training_log(loss_dict, total_loss_dict, learning_rate, decoupled_learning_r
     """Log training information such as losses, timing, ...."""
     nonlocal jsonl_writer
     args = get_args()
-    if is_master() and jsonl_writer is None:
+    if jsonl_writer is None:
         logging_path = os.path.join(args.save, 'logging.jsonl')
         logger.info(f'logging_path: {logging_path}')
-        jsonl_writer = JsonlWriter(logging_path, enable_async=True)
+        jsonl_writer = JsonlWriter(logging_path, enable_async=True, write_on_rank='last')
     timers = get_timers()
     writer = get_tensorboard_writer()
     wandb_writer = get_wandb_writer()

@@ -300,7 +299,7 @@ def training_log(loss_dict, total_loss_dict, learning_rate, decoupled_learning_r
     report_memory_flag = False
     timers.log(timers_to_log, normalizer=args.log_interval)

-    if is_master():
+    if is_last_rank():
         logs = {}
         for key in origin_total_loss_dict:
             if key not in [advanced_iters_key, skipped_iters_key, nan_iters_key]:

@@ -819,6 +818,9 @@ def _patch_megatron():
     except Exception:
         pass

+    import megatron.core
+    logger.info(f'megatron.core.__version__: {megatron.core.__version__}')
+

 def init_megatron_env() -> None:
     if 'MEGATRON_LM_PATH' not in os.environ:
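
The move from is_master() to is_last_rank() follows Megatron-LM's own logging convention: under pipeline parallelism the loss is computed on the last pipeline stage, so rank world_size - 1 is the rank holding finished metrics. A rough sketch of the resulting pattern, with a hypothetical function name and args standing in for Megatron's parsed arguments (JsonlWriter is the class extended below in swift/utils/io_utils.py):

import os

from swift.utils import JsonlWriter

jsonl_writer = None

def log_metrics(args, logs):
    # Every rank may construct the writer; with write_on_rank='last'
    # only rank world_size - 1 resolves a path and touches disk.
    global jsonl_writer
    if jsonl_writer is None:  # the old is_master() gate is gone
        jsonl_writer = JsonlWriter(
            os.path.join(args.save, 'logging.jsonl'),
            enable_async=True,
            write_on_rank='last')
    jsonl_writer.append(logs)  # effectively a no-op on non-write ranks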

swift/megatron/trainers/base.py

Lines changed: 3 additions & 3 deletions
@@ -20,7 +20,7 @@
 from megatron.training.checkpointing import load_checkpoint
 from packaging import version

-from swift.utils import JsonlWriter, deep_getattr, get_logger, is_master
+from swift.utils import JsonlWriter, deep_getattr, get_logger
 from ..utils import adapter_state_dict_context, copy_original_module_weight, prepare_mcore_model
 from .utils import get_swift_datasets_provider

@@ -34,7 +34,7 @@ def __init__(self, args):
         self.stimer = StragglerDetector()
         logging_path = os.path.join(args.save, 'logging.jsonl')
         logger.info(f'logging_path: {logging_path}')
-        self.jsonl_writer = JsonlWriter(logging_path, enable_async=True)
+        self.jsonl_writer = JsonlWriter(logging_path, enable_async=True, write_on_rank='last')  # for evaluate
         self._patch_megatron()

     @contextmanager

@@ -372,7 +372,7 @@ def evaluate(self,
         timers.log(['evaluate'])

         rerun_state_machine.set_mode(rerun_mode)
-        if is_master():
+        if is_last_rank():
             logs = {}
             for key, val in total_loss_dict.items():
                 logs[f'eval_{key}'] = round(val.item(), 8)
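
On the evaluation side the same gate applies. A tiny sketch of what the gated block builds, with illustrative keys and values:

import torch

# Illustrative stand-in for Megatron's reduced eval losses.
total_loss_dict = {'lm loss': torch.tensor(2.3456789123, dtype=torch.float64)}
logs = {}
for key, val in total_loss_dict.items():
    logs[f'eval_{key}'] = round(val.item(), 8)
# logs == {'eval_lm loss': 2.34567891}; it is appended via self.jsonl_writer,
# which now writes only on the last rank.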

swift/utils/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -1,7 +1,7 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.

 from .env import (get_dist_setting, get_hf_endpoint, get_node_setting, get_pai_tensorboard_dir, is_deepspeed_enabled,
-                  is_dist, is_local_master, is_master, is_mp, is_mp_ddp, is_pai_training_job, use_hf_hub)
+                  is_dist, is_last_rank, is_local_master, is_master, is_mp, is_mp_ddp, is_pai_training_job, use_hf_hub)
 from .import_utils import (is_flash_attn_2_available, is_flash_attn_3_available, is_liger_available,
                            is_lmdeploy_available, is_megatron_available, is_swanlab_available, is_trl_available,
                            is_unsloth_available, is_vllm_ascend_available, is_vllm_available, is_wandb_available)

swift/utils/env.py

Lines changed: 5 additions & 0 deletions
@@ -52,6 +52,11 @@ def is_master():
     return rank in {-1, 0}


+def is_last_rank():
+    rank, _, world_size, _ = get_dist_setting()
+    return rank in {-1, world_size - 1}
+
+
 def is_dist():
     """Determine if the training is distributed"""
     rank, local_rank, _, _ = get_dist_setting()
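
A small sketch of how the new helper behaves, under the assumption that get_dist_setting() reads the standard torchrun environment variables and returns (rank, local_rank, world_size, local_world_size) with rank -1 when no launcher is in use (the stand-in below is not swift's actual implementation):

import os

def get_dist_setting_sketch():
    # Assumed shape of swift.utils.env.get_dist_setting.
    rank = int(os.getenv('RANK', -1))
    local_rank = int(os.getenv('LOCAL_RANK', -1))
    world_size = int(os.getenv('WORLD_SIZE', 1))
    local_world_size = int(os.getenv('LOCAL_WORLD_SIZE', 1))
    return rank, local_rank, world_size, local_world_size

def is_last_rank_sketch():
    rank, _, world_size, _ = get_dist_setting_sketch()
    return rank in {-1, world_size - 1}

# Single-process run (RANK unset): rank == -1, so the helper returns True,
# mirroring is_master(). Under `torchrun --nproc_per_node 4`, only the
# process with RANK=3 returns True.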

swift/utils/io_utils.py

Lines changed: 17 additions & 5 deletions
@@ -11,7 +11,7 @@
 from modelscope.hub.api import ModelScopeConfig
 from tqdm import tqdm

-from .env import is_master
+from .env import is_last_rank, is_master
 from .logger import get_logger
 from .utils import check_json_format

@@ -46,8 +46,20 @@ def write_to_jsonl(fpath: str, obj_list: List[Any], encoding: str = 'utf-8') ->

 class JsonlWriter:

-    def __init__(self, fpath: str, *, encoding: str = 'utf-8', strict: bool = True, enable_async: bool = False):
-        self.fpath = os.path.abspath(os.path.expanduser(fpath)) if is_master() else None
+    def __init__(self,
+                 fpath: str,
+                 *,
+                 encoding: str = 'utf-8',
+                 strict: bool = True,
+                 enable_async: bool = False,
+                 write_on_rank: Literal['master', 'last'] = 'master'):
+        if write_on_rank == 'master':
+            self.is_write_rank = is_master()
+        elif write_on_rank == 'last':
+            self.is_write_rank = is_last_rank()
+        else:
+            raise ValueError(f"Invalid `write_on_rank`: {write_on_rank}, should be 'master' or 'last'")
+        self.fpath = os.path.abspath(os.path.expanduser(fpath)) if self.is_write_rank else None
         self.encoding = encoding
         self.strict = strict
         self.enable_async = enable_async

@@ -66,7 +78,7 @@ def _append(self, obj: Union[Dict, List[Dict]], gather_obj: bool = False):
             obj_list = [obj]
         if gather_obj and dist.is_initialized():
             obj_list = gather_object(obj_list)
-        if not is_master():
+        if not self.is_write_rank:
             return
         obj_list = check_json_format(obj_list)
         for i, _obj in enumerate(obj_list):

@@ -85,7 +97,7 @@ def append(self, obj: Union[Dict, List[Dict]], gather_obj: bool = False):
     def _write_buffer(self, text: str):
         if not text:
             return
-        assert is_master(), f'is_master(): {is_master()}'
+        assert self.is_write_rank, f'self.is_write_rank: {self.is_write_rank}'
         try:
             os.makedirs(os.path.dirname(self.fpath), exist_ok=True)
             with open(self.fpath, 'a', encoding=self.encoding) as f:
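
A short usage sketch of the extended constructor (the path and payloads are illustrative): with write_on_rank='last', only rank world_size - 1 resolves fpath and writes; on every other rank append() returns early at the is_write_rank check above.

from swift.utils import JsonlWriter

writer = JsonlWriter('output/logging.jsonl', enable_async=True, write_on_rank='last')
writer.append({'iteration': 100, 'loss': 0.42})  # written only on the last rank
# gather_obj=True first collects entries from all ranks, then the write
# rank persists the gathered list.
writer.append({'iteration': 200, 'loss': 0.40}, gather_obj=True)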
