Skip to content

Commit ce8787e

Browse files
author
yangbo1
committed
add megatron log
1 parent 90c98a6 commit ce8787e

File tree

3 files changed

+43
-1
lines changed

3 files changed

+43
-1
lines changed

swift/megatron/arguments/megatron_args.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -453,6 +453,8 @@ class MegatronArguments(RLHFMegatronArgumentsMixin, MegatronTunerMixin):
453453
overlap_grad_reduce: bool = False
454454
overlap_param_gather: bool = False
455455
overlap_param_gather_with_optimizer_step: bool = False
456+
"""DDP bucket size for grad reduce. Default: max(40M, 1M*dp_size). Only used when overlap_grad_reduce=True."""
457+
bucket_size: Optional[int] = None
456458
align_grad_reduce: bool = True
457459
virtual_pipeline_model_parallel_size: Optional[int] = None
458460
microbatch_group_size_per_vp_stage: Optional[int] = None

swift/utils/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
is_lmdeploy_available, is_megatron_available, is_swanlab_available, is_trl_available,
1010
is_unsloth_available, is_vllm_ascend_available, is_vllm_available, is_wandb_available)
1111
from .io_utils import JsonlWriter, append_to_jsonl, get_file_mm_type, read_from_jsonl, write_to_jsonl
12-
from .logger import get_logger, ms_logger_context
12+
from .logger import get_logger, ms_logger_context, setup_megatron_logging
1313
from .np_utils import get_seed, stat_array, transform_jsonl_to_df
1414
from .processor_utils import Processor, ProcessorMixin
1515
from .safetensors import LazyTensor, SafetensorLazyLoader, StreamingSafetensorSaver

swift/utils/logger.py

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -154,3 +154,43 @@ def add_file_handler_if_needed(logger, log_file, file_mode, log_level):
154154
file_handler.setFormatter(logger_format)
155155
file_handler.setLevel(log_level)
156156
logger.addHandler(file_handler)
157+
158+
159+
def setup_megatron_logging(log_file: Optional[str] = None):
    """Route ``megatron.core`` log records to the same sinks as the ``swift`` logger.

    Intended to be called after ``init_megatron_env()`` so that Megatron messages
    (e.g. DDP bucket info, param_and_grad_buffer) show up alongside ms-swift logs.

    The handlers currently attached to the ``swift`` logger are attached to the
    ``megatron.core`` logger as well. Sharing the handler objects (rather than
    creating new ones on the same stream/file) means both loggers go through the
    same per-handler lock, so concurrent writes are not interleaved.

    The function is idempotent: calling it repeatedly does not attach the same
    handler, or a second ``FileHandler`` for the same file, more than once.

    Args:
        log_file: Optional path of a log file. If given and no ``FileHandler``
            for that file is attached yet, one is created in append mode.
            If None, only the handlers reused from the ``swift`` logger are used.
            When output is captured externally (e.g. ``2>&1 | tee node_rank_0.log``)
            the reused stream handler is sufficient.
    """
    # Only local rank 0 (or a run where LOCAL_RANK is unset) configures logging;
    # other local ranks return without touching the Megatron logger.
    if importlib.util.find_spec('torch') is not None:
        is_worker0 = int(os.getenv('LOCAL_RANK', -1)) in {-1, 0}
    else:
        is_worker0 = True
    if not is_worker0:
        return

    # Resolve LOG_LEVEL to a numeric level. ``getattr`` may hit a non-level
    # attribute of the logging module (e.g. a function or format string), so
    # anything that is not an int falls back to INFO.
    log_level = getattr(logging, os.getenv('LOG_LEVEL', 'INFO').upper(), logging.INFO)
    if not isinstance(log_level, int):
        log_level = logging.INFO

    megatron_logger = logging.getLogger('megatron.core')
    megatron_logger.setLevel(log_level)
    # Do not hand records to ancestor loggers; the handlers attached below are
    # the only sinks, which also prevents duplicate output via the root logger.
    megatron_logger.propagate = False

    # Reuse Swift logger's handlers to avoid interleaved/corrupted output.
    swift_logger = logging.getLogger('swift')
    for handler in swift_logger.handlers:
        if handler not in megatron_logger.handlers:
            megatron_logger.addHandler(handler)

    if log_file is None:
        return

    # FileHandler.baseFilename is always an absolute path, so normalise the
    # requested path before comparing; otherwise a relative ``log_file`` never
    # matches and every call would attach another handler to the same file.
    target_path = os.path.abspath(log_file)
    already_attached = any(
        isinstance(h, logging.FileHandler) and getattr(h, 'baseFilename', '') == target_path
        for h in megatron_logger.handlers)
    if already_attached:
        return

    file_handler = logging.FileHandler(log_file, 'a')
    file_handler.setFormatter(logger_format)
    file_handler.setLevel(log_level)
    megatron_logger.addHandler(file_handler)

0 commit comments

Comments
 (0)