Skip to content

Commit d936f06

Browse files
yangbofun authored and yangbo1 committed
add megatron log
1 parent 90c98a6 commit d936f06

File tree

4 files changed

+49
-1
lines changed

4 files changed

+49
-1
lines changed

swift/megatron/arguments/megatron_args.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -453,6 +453,8 @@ class MegatronArguments(RLHFMegatronArgumentsMixin, MegatronTunerMixin):
453453
overlap_grad_reduce: bool = False
454454
overlap_param_gather: bool = False
455455
overlap_param_gather_with_optimizer_step: bool = False
456+
"""DDP bucket size for grad reduce. Default: max(40M, 1M*dp_size). Only used when overlap_grad_reduce=True."""
457+
bucket_size: Optional[int] = None
456458
align_grad_reduce: bool = True
457459
virtual_pipeline_model_parallel_size: Optional[int] = None
458460
microbatch_group_size_per_vp_stage: Optional[int] = None

swift/megatron/trainers/base.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,8 @@
3838
from swift.trainers import dynamic_gradient_checkpointing
3939
from swift.trainers.utils import patch_modelscope_hub_timeout
4040
from swift.utils import deep_getattr, get_last_valid_indices, get_logger, is_last_rank, is_master, ms_logger_context
41+
from swift.utils import setup_megatron_logging
42+
from swift.utils.logger import add_file_handler_if_needed
4143
from .batch_sampler import MegatronPretrainingRandomSampler, MegatronPretrainingSampler
4244
from .utils import (TrainerState, build_streaming_dataloader, get_batch_on_this_cp_rank, get_batch_on_this_pp_rank,
4345
get_packed_seq_params)
@@ -97,6 +99,10 @@ def __init__(self, args, template: Template):
9799
if args.async_save and args.use_persistent_ckpt_worker:
98100
init_persistent_async_worker()
99101

102+
# Configure Megatron logging after main logger is ready so Megatron logs
103+
# (e.g. DDP bucket info) appear in both console and log file.
104+
setup_megatron_logging()
105+
100106
def _load_checkpoint(self):
101107
args = self.args
102108
if not args.finetune:

swift/utils/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
is_lmdeploy_available, is_megatron_available, is_swanlab_available, is_trl_available,
1010
is_unsloth_available, is_vllm_ascend_available, is_vllm_available, is_wandb_available)
1111
from .io_utils import JsonlWriter, append_to_jsonl, get_file_mm_type, read_from_jsonl, write_to_jsonl
12-
from .logger import get_logger, ms_logger_context
12+
from .logger import get_logger, ms_logger_context, setup_megatron_logging
1313
from .np_utils import get_seed, stat_array, transform_jsonl_to_df
1414
from .processor_utils import Processor, ProcessorMixin
1515
from .safetensors import LazyTensor, SafetensorLazyLoader, StreamingSafetensorSaver

swift/utils/logger.py

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -154,3 +154,43 @@ def add_file_handler_if_needed(logger, log_file, file_mode, log_level):
154154
file_handler.setFormatter(logger_format)
155155
file_handler.setLevel(log_level)
156156
logger.addHandler(file_handler)
157+
158+
159+
def setup_megatron_logging(log_file: Optional[str] = None):
    """Route ``megatron.core`` log records through the ``swift`` logger's handlers.

    Every handler currently attached to the ``swift`` logger is also attached
    to the ``megatron.core`` logger, so Megatron messages are emitted through
    the very same handler objects instead of through independent handlers
    that write to the same destinations.  Call this once the ``swift`` logger
    has its handlers installed; handlers added to ``swift`` afterwards are not
    picked up unless this function is called again.

    The function is idempotent: calling it repeatedly never attaches the same
    handler twice and never opens a second ``FileHandler`` for the same file.

    When ``torch`` is importable, only the process whose ``LOCAL_RANK``
    environment variable is unset or ``0`` is configured; other ranks return
    immediately and leave the ``megatron.core`` logger untouched.

    Args:
        log_file: Optional path of an additional log file.  If ``None``, only
            the handlers already attached to the ``swift`` logger are reused.
            Otherwise a ``FileHandler`` in append mode is added for this path
            unless one of the attached handlers already writes to it.
    """
    # Same rank gate as before: configure only local rank 0 (or a run without
    # LOCAL_RANK set, which maps to -1).
    if importlib.util.find_spec('torch') is not None:
        is_worker0 = int(os.getenv('LOCAL_RANK', -1)) in {-1, 0}
    else:
        is_worker0 = True
    if not is_worker0:
        return

    level_name = os.getenv('LOG_LEVEL', 'INFO').upper()
    log_level = getattr(logging, level_name, logging.INFO)
    if not isinstance(log_level, int):
        # LOG_LEVEL matched a non-level attribute of the logging module
        # (e.g. BASIC_FORMAT); fall back to INFO instead of passing it on.
        log_level = logging.INFO

    megatron_logger = logging.getLogger('megatron.core')
    megatron_logger.setLevel(log_level)
    # Records are handled by the handlers attached below; do not hand them to
    # the root logger as well.
    megatron_logger.propagate = False

    # Share the handler objects themselves rather than creating new ones for
    # the same destinations, so both loggers write through one handler.
    for handler in logging.getLogger('swift').handlers:
        if handler not in megatron_logger.handlers:
            megatron_logger.addHandler(handler)

    if log_file is None:
        return

    # FileHandler.baseFilename is always stored as an absolute path, so the
    # requested path must be normalized the same way before comparing;
    # otherwise a relative ``log_file`` never matches and every call would add
    # another handler (duplicating each record in the file).
    target_path = os.path.abspath(log_file)
    already_attached = any(
        isinstance(h, logging.FileHandler) and getattr(h, 'baseFilename', None) == target_path
        for h in megatron_logger.handlers)
    if already_attached:
        return

    file_handler = logging.FileHandler(log_file, 'a')
    file_handler.setFormatter(logger_format)
    file_handler.setLevel(log_level)
    megatron_logger.addHandler(file_handler)

0 commit comments

Comments
 (0)