Skip to content

Commit 431c0e2

Browse files
Kiuk Chungfacebook-github-bot
authored andcommitted
(torchx/runner) Implement scheduler runopt dumping and loading to/from .INI file (#249)
Summary: Pull Request resolved: #249 Implements `dumps` and `loads` functions that dump and load an INI file containing the runopts for all registered schedulers. Here's a sample config file dumped by calling `torchx.runner.config.dumps(sys.stdout)` ``` [default.local] log_dir = None prepend_cwd = False image_type = dir root_dir = /var/cache/fbpkg [default.local_cwd] log_dir = None prepend_cwd = False [default.local_par] log_dir = None prepend_cwd = False [default.local_fbpkg] log_dir = None prepend_cwd = False [default.mast] hpcClusterUuid = TSCTestCluster hpcEntitlementName = None runningTimeoutSec = None hpcIdentity = <FIXME_WITH_A_str_VALUE> hpcJobOncall = pyper_training useStrictName = False mounts = None localityConstraints = None enableGracefulPreemption = False [default.flow] secure_group = <FIXME_WITH_A_str_VALUE> entitlement = default proxy_workflow_image = None [default.quickflow] entitlement = default runningTimeoutSec = None identity = <FIXME_WITH_A_str_VALUE> oncall = None useStrictName = False mounts = None localityConstraints = None enableGracefulPreemption = False perpetualRun = False [default.slurm] partition = None time = None [default.kubernetes] namespace = default queue = <FIXME_WITH_A_str_VALUE> ``` Added a `//torchx/fb/example/.torchxconfig`. Next steps: document all this (will do this in a separate diff since it's somewhat urgent we ship the functionality first) Reviewed By: aivanou Differential Revision: D31601700 fbshipit-source-id: ec1a9809be5b9b367e77c5e8b1cd05789aa91af9
1 parent ffa8c48 commit 431c0e2

File tree

9 files changed

+593
-5
lines changed

9 files changed

+593
-5
lines changed

torchx/cli/cmd_configure.py

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
#!/usr/bin/env python3
2+
# Copyright (c) Facebook, Inc. and its affiliates.
3+
# All rights reserved.
4+
#
5+
# This source code is licensed under the BSD-style license found in the
6+
# LICENSE file in the root directory of this source tree.
7+
8+
import argparse
9+
import logging
10+
import sys
11+
12+
from torchx.cli.cmd_base import SubCommand
13+
from torchx.runner.config import dump
14+
from torchx.schedulers import get_schedulers
15+
16+
17+
logger: logging.Logger = logging.getLogger(__name__)
18+
19+
20+
class CmdConfigure(SubCommand):
    """
    ``configure`` sub-command: dumps a ``.torchxconfig`` template containing
    the runopts of the selected schedulers, either to stdout (``--print``)
    or to ``.torchxconfig`` in the current working directory.
    """

    def add_arguments(self, subparser: argparse.ArgumentParser) -> None:
        """Registers the ``--schedulers``, ``--print`` and ``--all`` flags."""
        subparser.add_argument(
            "-s",
            "--schedulers",
            type=str,
            help="comma delimited list of schedulers to dump runopts for, if not specified, dumps for all schedulers",
        )
        subparser.add_argument(
            "--print",
            action="store_true",
            help="if specified, prints the config file to stdout instead of saving it to a file",
        )
        subparser.add_argument(
            "-a",
            "--all",
            action="store_true",
            help="if specified, includes required and optional runopts (default only dumps required)",
        )

    def run(self, args: argparse.Namespace) -> None:
        """
        Dumps the config template for the requested schedulers.

        Scheduler names given via ``--schedulers`` are stripped of surrounding
        whitespace and empty entries (e.g. from a trailing comma) are dropped.
        If no names remain, all registered schedulers are dumped.

        When writing to ``.torchxconfig`` the template is rendered in memory
        first, so that an error raised by ``dump`` (e.g. an unknown scheduler
        name) does not truncate a config file that already exists.
        """
        # function-scope import keeps this block self-contained
        import io

        schedulers = []
        if args.schedulers:
            schedulers = [
                name.strip() for name in args.schedulers.split(",") if name.strip()
            ]
        if not schedulers:
            schedulers = list(get_schedulers(session_name="_").keys())

        required_only = not args.all

        if args.print:
            dump(f=sys.stdout, schedulers=schedulers, required_only=required_only)
        else:
            # render fully before touching the file: opening with "w" truncates
            # immediately, which would wipe an existing config if dump() raised
            buffer = io.StringIO()
            dump(f=buffer, schedulers=schedulers, required_only=required_only)
            with open(".torchxconfig", "w") as f:
                f.write(buffer.getvalue())

torchx/cli/cmd_run.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,12 +79,14 @@ def add_arguments(self, subparser: argparse.ArgumentParser) -> None:
7979
scheduler_names = get_scheduler_factories().keys()
8080
self._subparser = subparser
8181
subparser.add_argument(
82+
"-s",
8283
"--scheduler",
8384
type=str,
8485
help=f"Name of the scheduler to use. One of: [{','.join(scheduler_names)}]",
8586
default=get_default_scheduler_name(),
8687
)
8788
subparser.add_argument(
89+
"-a",
8890
"--scheduler_args",
8991
type=str,
9092
help="Arguments to pass to the scheduler (Ex:`cluster=foo,user=bar`)."

torchx/cli/main.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,17 +7,19 @@
77
import logging
88
import sys
99
from argparse import ArgumentParser
10-
from typing import List, Dict
10+
from typing import Dict, List
1111

1212
from torchx.cli.cmd_base import SubCommand
13+
from torchx.cli.cmd_configure import CmdConfigure
1314
from torchx.cli.cmd_describe import CmdDescribe
1415
from torchx.cli.cmd_log import CmdLog
1516
from torchx.cli.cmd_run import CmdBuiltins, CmdRun
1617
from torchx.cli.cmd_runopts import CmdRunopts
1718
from torchx.cli.cmd_status import CmdStatus
18-
from torchx.cli.colors import ORANGE, GRAY, ENDC
19+
from torchx.cli.colors import ENDC, GRAY, ORANGE
1920
from torchx.util.entrypoints import load_group
2021

22+
2123
sub_parser_description = """Use the following commands to run operations, e.g.:
2224
torchx run ${JOB_NAME}
2325
"""
@@ -31,6 +33,7 @@ def get_default_sub_cmds() -> Dict[str, SubCommand]:
3133
"builtins": CmdBuiltins(),
3234
"runopts": CmdRunopts(),
3335
"status": CmdStatus(),
36+
"configure": CmdConfigure(),
3437
}
3538

3639

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
#!/usr/bin/env python3
2+
# Copyright (c) Facebook, Inc. and its affiliates.
3+
# All rights reserved.
4+
#
5+
# This source code is licensed under the BSD-style license found in the
6+
# LICENSE file in the root directory of this source tree.
7+
8+
import argparse
9+
import os
10+
import shutil
11+
import tempfile
12+
import unittest
13+
from pathlib import Path
14+
from typing import List
15+
16+
from torchx.cli.cmd_configure import CmdConfigure
17+
18+
19+
class CmdConfigureTest(unittest.TestCase):
    """
    Runs ``CmdConfigure`` from inside a throw-away working directory and
    checks that a ``.torchxconfig`` file is produced where expected.
    """

    def setUp(self) -> None:
        self.parser = argparse.ArgumentParser()
        self.cmd_configure = CmdConfigure()
        self.cmd_configure.add_arguments(self.parser)

        # the command writes relative to CWD, so run each test from a temp dir
        self.test_dir = tempfile.mkdtemp(prefix="torchx_cmd_configure_test")
        self._old_cwd = os.getcwd()
        os.chdir(self.test_dir)

    def tearDown(self) -> None:
        # leave the temp dir before deleting it
        os.chdir(self._old_cwd)
        shutil.rmtree(self.test_dir)

    def _args(self, sys_args: List[str]) -> argparse.Namespace:
        return self.parser.parse_args(sys_args)

    def _assert_config_written(self) -> None:
        config_path = Path(self.test_dir).joinpath(".torchxconfig")
        self.assertTrue(config_path.exists())

    def test_configure_print(self) -> None:
        # nothing to assert, just make sure the cmd runs
        for argv in (["--print"], ["--print", "--all"]):
            self.cmd_configure.run(self._args(argv))

    def test_configure(self) -> None:
        os.chdir(self.test_dir)
        self.cmd_configure.run(self._args([]))

        self._assert_config_written()

    def test_configure_all(self) -> None:
        self.cmd_configure.run(self._args(["--all"]))
        self._assert_config_written()

    def test_configure_local_cwd(self) -> None:
        self.cmd_configure.run(self._args(["--schedulers", "local_cwd"]))
        self._assert_config_written()

torchx/runner/api.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
from typing import Any, Dict, Iterable, List, Optional, Tuple, Type, Union
1414

1515
from pyre_extensions import none_throws
16+
from torchx.runner import config
1617
from torchx.runner.events import log_event
1718
from torchx.schedulers import get_schedulers
1819
from torchx.schedulers.api import Scheduler
@@ -259,9 +260,14 @@ def dryrun(
259260
f"Non-positive replicas for role: {role.name}."
260261
f" Did you forget to set role.num_replicas?"
261262
)
263+
264+
cfg = cfg or RunConfig()
265+
# TODO enable profiles - https://github.com/pytorch/torchx/issues/248
266+
config.apply(profile="default", scheduler=scheduler, runcfg=cfg)
267+
262268
sched = self._scheduler(scheduler)
263269
sched._validate(app, scheduler)
264-
dryrun_info = sched.submit_dryrun(app, cfg or RunConfig())
270+
dryrun_info = sched.submit_dryrun(app, cfg)
265271
dryrun_info._scheduler = scheduler
266272
return dryrun_info
267273

torchx/runner/config.py

Lines changed: 169 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,169 @@
1+
#!/usr/bin/env python3
2+
# Copyright (c) Facebook, Inc. and its affiliates.
3+
# All rights reserved.
4+
#
5+
# This source code is licensed under the BSD-style license found in the
6+
# LICENSE file in the root directory of this source tree.
7+
8+
import configparser as configparser
9+
import logging
10+
from pathlib import Path
11+
from typing import List, Optional, TextIO
12+
13+
from torchx.schedulers import Scheduler, get_schedulers
14+
from torchx.specs import RunConfig, get_type_name
15+
16+
17+
_NONE = "None"
18+
19+
log: logging.Logger = logging.getLogger(__name__)
20+
21+
22+
def _configparser() -> configparser.ConfigParser:
    """
    Builds the ``ConfigParser`` used for both writing and reading config
    files. Using the same parser setup on both sides keeps serialization
    and deserialization compatible.
    """

    parser = configparser.ConfigParser()
    # By default configparser lowercases option keys (INI files on Windows
    # are expected to be parsed case-insensitively). torchx runopts are
    # case-sensitive, so replace the key transform with one that returns
    # the key unchanged.
    # see: https://stackoverflow.com/questions/19359556/configparser-reads-capital-keys-and-make-them-lower-case
    # pyre-ignore[8]
    parser.optionxform = str

    return parser
38+
39+
40+
def _get_scheduler(name: str) -> Scheduler:
    """
    Returns the scheduler registered under ``name``.

    Raises a ``ValueError`` if no scheduler is registered under that name.
    """
    registered = get_schedulers(session_name="_")
    found = registered.get(name)
    if found:
        return found

    raise ValueError(
        f"`{name}` is not a registered scheduler. Valid scheduler names: {registered.keys()}"
    )
48+
49+
50+
def dump(
    f: TextIO, schedulers: Optional[List[str]] = None, required_only: bool = False
) -> None:
    """
    Dumps a default INI-style config template containing the runopts for the
    given scheduler names into ``f``. If no ``schedulers`` are specified
    dumps all known registered schedulers.

    Optional runopts are pre-filled with their default values.
    Required runopts are set with a ``<FIXME_...>`` placeholder.
    Each scheduler's runopts are written in the section called
    ``[default.scheduler_args.{scheduler_name}]`` (e.g. ``[default.scheduler_args.kubernetes]``)

    To only dump required runopts pass ``required_only=True``.

    Scheduler names that appear more than once in ``schedulers`` are only
    dumped once (first occurrence wins the ordering).

    Nothing is written to ``f`` until every section has been built, so ``f``
    is left untouched if an error is raised.

    Raises a ``ValueError`` if given a scheduler name that is not known
    """

    if schedulers:
        # de-duplicate (order preserving): adding the same section twice
        # would make configparser raise DuplicateSectionError
        scheds = list(dict.fromkeys(schedulers))
    else:
        scheds = get_schedulers(session_name="_").keys()

    config = _configparser()
    for sched_name in scheds:
        sched = _get_scheduler(sched_name)

        section = f"default.scheduler_args.{sched_name}"
        config.add_section(section)

        for opt_name, opt in sched.run_opts():
            if opt.is_required:
                val = f"<FIXME_WITH_A_{get_type_name(opt.opt_type)}_VALUE>"
            else:  # not required runopts MUST have a default
                if required_only:
                    continue

                # serialize list elements with `;` delimiter (consistent with torchx cli)
                if opt.opt_type == List[str]:
                    # deal with empty or None default lists
                    if opt.default:
                        # pyre-ignore[6] opt.default type checked already as List[str]
                        val = ";".join(opt.default)
                    else:
                        val = _NONE
                else:
                    val = f"{opt.default}"

            config.set(section, opt_name, val)

    config.write(f, space_around_delimiters=True)
101+
102+
103+
def apply(profile: str, scheduler: str, runcfg: RunConfig) -> None:
    """
    Looks for ``.torchxconfig`` files in a fixed set of directories and
    applies each one found to the given ``runcfg``. The resulting
    precedence (highest first) is:

    #. ``runcfg`` given to this function
    #. configs loaded from ``$HOME/.torchxconfig``
    #. configs loaded from ``$CWD/.torchxconfig``

    Loading only ever adds configs: values already present in ``runcfg``
    (whether passed in or loaded from an earlier file) are never overridden.
    """
    candidates = [base / ".torchxconfig" for base in (Path.home(), Path.cwd())]

    for path in candidates:
        if not path.exists():
            continue

        log.info(f"loading configs from {path}")
        with open(str(path), "r") as fp:
            load(profile, scheduler, fp, runcfg)
125+
126+
127+
def load(profile: str, scheduler: str, f: TextIO, runcfg: RunConfig) -> None:
    """
    Reads the ``[{profile}.scheduler_args.{scheduler}]`` section of the
    .INI formatted config file ``f`` and copies its entries into ``runcfg``.
    Entries whose name is already set in ``runcfg`` are skipped, so existing
    values are never overridden. Does nothing if the section is absent.
    """

    parser = _configparser()
    parser.read_file(f)

    known_opts = _get_scheduler(scheduler).run_opts()

    section = f"{profile}.scheduler_args.{scheduler}"
    if not parser.has_section(section):
        return

    for name, value in parser.items(section):
        if name in runcfg.cfgs:
            # DO NOT OVERRIDE existing configs
            continue

        if value == _NONE:
            # should map to None (not str 'None')
            # this also handles empty or None lists
            runcfg.set(name, None)
            continue

        runopt = known_opts.get(name)
        if runopt is None:
            log.warning(
                f"`{name} = {value}` was declared in the [{section}] section "
                f" of the config file but is not a runopt of `{scheduler}` scheduler."
                f" Remove the entry from the config file to no longer see this warning"
            )
            continue

        opt_type = runopt.opt_type
        if opt_type is bool:
            # need to handle bool specially since str -> bool is based on
            # str emptiness not value (e.g. bool("False") == True)
            parsed = parser.getboolean(section, name)
        elif opt_type is List[str]:
            parsed = value.split(";")
        else:
            # pyre-ignore[29]
            parsed = opt_type(value)

        runcfg.set(name, parsed)

0 commit comments

Comments
 (0)