
Commit e158486

aschhabra authored and pytorchmergebot committed
[1/n]adding torch.distributed.run option to provide destination for event logging (pytorch#154644) (pytorch#155268)
Summary:

**Problem Statement**

Currently, torch distributed elastic does not offer an option to specify the destination for event logging from torch.distributed.run.

*Recording events to the default destination:* https://fburl.com/code/7f9b0993

The default destination is "null".

**Solution**

Add an option to torch.distributed.run to specify the event logging destination. The default value stays "null", which is the current default, so existing users are unaffected unless they pass a different handler on the command line.

Test Plan: https://www.internalfb.com/mlhub/pipelines/runs/mast/f738408681-TrainingApplication_torch_distributed_run_3?job_attempt=0&version=0&tab=execution_details&env=PRODUCTION

Rollback Plan:

Reviewed By: kiukchung

Differential Revision: D75183591

Pull Request resolved: pytorch#155268
Approved by: https://github.com/d4l3k
1 parent 9968c85 commit e158486
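As a quick illustration of the new flag (not part of this commit), a single-node launch could look like the line below. trainer.py is a placeholder script and "console" is assumed to be one of the handler names registered in elastic/events/handlers.py; omitting the flag keeps the default "null" handler.

python -m torch.distributed.run --nnodes=1 --nproc-per-node=2 --event-log-handler=console trainer.py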

File tree

3 files changed: 31 additions & 13 deletions


torch/distributed/elastic/agent/server/api.py

Lines changed: 3 additions & 1 deletion
@@ -71,7 +71,8 @@ class WorkerSpec:
         tee: tees the specified std stream(s) to console + file,
              selectively tee for a particular local rank by passing a map,
              takes precedence over ``redirects`` settings.
-
+        event_log_handler: name of the event logging handler as registered in
+            `elastic/events/handlers.py <https://docs.pytorch.org/docs/stable/elastic/events.html>`_.
     """
 
     role: str
@@ -86,6 +87,7 @@ class WorkerSpec:
     master_port: Optional[int] = None
     master_addr: Optional[str] = None
     local_addr: Optional[str] = None
+    event_log_handler: str = "null"
 
     def __post_init__(self):
         assert self.local_world_size > 0
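For orientation, here is a minimal sketch of how the name stored in WorkerSpec.event_log_handler is expected to resolve to a logging.Handler. It assumes the get_logging_handler helper in the elastic/events/handlers.py module that the docstring points to; the concrete handler type returned is an expectation, not a guarantee.

import logging

# Assumption: handler names such as "null" key into the registry in
# torch/distributed/elastic/events/handlers.py and are fetched via
# get_logging_handler().
from torch.distributed.elastic.events.handlers import get_logging_handler

handler: logging.Handler = get_logging_handler("null")  # "null" is the documented default
print(type(handler).__name__)  # expected to be a no-op style handler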

torch/distributed/launcher/api.py

Lines changed: 19 additions & 12 deletions
@@ -64,6 +64,9 @@ class LaunchConfig:
         local_addr: address of the local node if any. If not set, a lookup on the local
             machine's FQDN will be performed.
         local_ranks_filter: ranks for which to show logs in console. If not set, show from all.
+        event_log_handler: name of the event logging handler as registered in
+            `elastic/events/handlers.py <https://docs.pytorch.org/docs/stable/elastic/events.html>`_.
+
 
     .. note::
         `rdzv_timeout` is a legacy argument that will be removed in future.
@@ -87,6 +90,7 @@ class LaunchConfig:
     log_line_prefix_template: Optional[str] = None
     metrics_cfg: dict[str, str] = field(default_factory=dict)
     local_addr: Optional[str] = None
+    event_log_handler: str = "null"
 
     def __post_init__(self):
         default_timeout = 900
@@ -194,18 +198,19 @@ def launch_agent(
 
     logger.info(
         "Starting elastic_operator with launch configs:\n"
-        "  entrypoint       : %(entrypoint)s\n"
-        "  min_nodes        : %(min_nodes)s\n"
-        "  max_nodes        : %(max_nodes)s\n"
-        "  nproc_per_node   : %(nproc_per_node)s\n"
-        "  run_id           : %(run_id)s\n"
-        "  rdzv_backend     : %(rdzv_backend)s\n"
-        "  rdzv_endpoint    : %(rdzv_endpoint)s\n"
-        "  rdzv_configs     : %(rdzv_configs)s\n"
-        "  max_restarts     : %(max_restarts)s\n"
-        "  monitor_interval : %(monitor_interval)s\n"
-        "  log_dir          : %(log_dir)s\n"
-        "  metrics_cfg      : %(metrics_cfg)s\n",
+        "  entrypoint        : %(entrypoint)s\n"
+        "  min_nodes         : %(min_nodes)s\n"
+        "  max_nodes         : %(max_nodes)s\n"
+        "  nproc_per_node    : %(nproc_per_node)s\n"
+        "  run_id            : %(run_id)s\n"
+        "  rdzv_backend      : %(rdzv_backend)s\n"
+        "  rdzv_endpoint     : %(rdzv_endpoint)s\n"
+        "  rdzv_configs      : %(rdzv_configs)s\n"
+        "  max_restarts      : %(max_restarts)s\n"
+        "  monitor_interval  : %(monitor_interval)s\n"
+        "  log_dir           : %(log_dir)s\n"
+        "  metrics_cfg       : %(metrics_cfg)s\n"
+        "  event_log_handler : %(event_log_handler)s\n",
         {
             "entrypoint": entrypoint_name,
             "min_nodes": config.min_nodes,
@@ -219,6 +224,7 @@ def launch_agent(
             "monitor_interval": config.monitor_interval,
             "log_dir": config.logs_specs.root_log_dir,  # type: ignore[union-attr]
             "metrics_cfg": config.metrics_cfg,
+            "event_log_handler": config.event_log_handler,
         },
     )
 
@@ -245,6 +251,7 @@ def launch_agent(
         master_addr=master_addr,
         master_port=master_port,
         local_addr=config.local_addr,
+        event_log_handler=config.event_log_handler,
     )
 
     agent = LocalElasticAgent(
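The same field can also be set on the programmatic launch path that this file implements. The sketch below is illustrative only: the trainer function, rendezvous settings, and the "console" handler name are assumptions, while event_log_handler itself is the field added above.

from torch.distributed.launcher.api import LaunchConfig, elastic_launch


def trainer(step: int) -> int:
    # placeholder per-worker entrypoint
    return step


config = LaunchConfig(
    min_nodes=1,
    max_nodes=1,
    nproc_per_node=2,
    run_id="event-log-demo",          # illustrative run id
    rdzv_backend="c10d",              # illustrative rendezvous settings
    rdzv_endpoint="localhost:29500",
    event_log_handler="console",      # new field; "console" assumed to be registered
)

if __name__ == "__main__":
    # events emitted by the elastic agent should go to the chosen handler
    elastic_launch(config, trainer)(0)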

torch/distributed/run.py

Lines changed: 9 additions & 0 deletions
@@ -486,6 +486,14 @@ def get_args_parser() -> ArgumentParser:
         choices=["spawn", "fork", "forkserver"],
         help="Multiprocessing start method to use when creating workers.",
     )
+    parser.add_argument(
+        "--event-log-handler",
+        "--event_log_handler",
+        action=env,
+        type=str,
+        default="null",
+        help="name of a registered event logging handler (see: https://docs.pytorch.org/docs/stable/elastic/events.html)",
+    )
     parser.add_argument(
         "--role",
         action=env,
@@ -817,6 +825,7 @@ def config_from_args(args) -> tuple[LaunchConfig, Union[Callable, str], list[str
         log_line_prefix_template=log_line_prefix_template,
         local_addr=args.local_addr,
         logs_specs=logs_specs,
+        event_log_handler=args.event_log_handler,
     )
 
     with_python = not args.no_python
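Either spelling of the flag is accepted by the parser above (--event-log-handler or --event_log_handler), and because the argument is declared with action=env, the value is assumed to be settable through the environment variable that the env action maps this argument to. A hypothetical invocation via the torchrun console script, which is equivalent to python -m torch.distributed.run (script name and values are placeholders):

torchrun --nproc-per-node=4 --event_log_handler=null trainer.py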
