Commit c8fb44e

Authored by germa89 (German Martinez Ayuso), german, pre-commit-ci[bot], and MaxJPRey
Supporting SLURM env vars for launching MAPDL configuration (#2754)
* Adapting launcher to run on slurm
* Working on nodes parser
* Implemented machine argument
* Update
* Adding more debugging. Adding memory option
* Merge branch 'main' into feat/supporting-slurm-manager
* Avoiding checking number of processors in slrum
* Cleaning empty args
* renaming argument
* Update env vars to check to decide if ON_SLURM or not.
* [pre-commit.ci] auto fixes from pre-commit.com hooks (for more information, see https://pre-commit.ci)
* Removing unneeded
* Adding comment
* removing redundancies
* Removing self
* Removing unused arg
* Update src/ansys/mapdl/core/launcher.py (co-authored-by: Maxime Rey <[email protected]>)
* fixing memory units
* [pre-commit.ci] auto fixes from pre-commit.com hooks (for more information, see https://pre-commit.ci)
* Update doc/styles/Vocab/ANSYS/accept.txt
* Apply suggestions from code review (co-authored-by: Kathy Pippert <[email protected]>)
* Apply suggestions from code review (co-authored-by: Kathy Pippert <[email protected]>)
* Adding tests
* Moving fixtures to main file
* testing exec_file
* fix tests
* Adding env var documentation
* Small refactoring regarding env var processing. Adding typing
* Better env var order
* chore: adding changelog file 2754.documentation.md
* Apply suggestions from code review (co-authored-by: Kathy Pippert <[email protected]>)
* fix: table format

Co-authored-by: German Martinez Ayuso <[email protected]>
Co-authored-by: german <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Maxime Rey <[email protected]>
Co-authored-by: Kathy Pippert <[email protected]>
Co-authored-by: pyansys-ci-bot <[email protected]>
1 parent ba66041 commit c8fb44e
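In practice, the change means that a plain launch inside a SLURM allocation now picks up the scheduler's settings automatically. A minimal usage sketch, not part of the commit itself; the import path and launch_mapdl call are the existing PyMAPDL API, everything else is illustrative:

from ansys.mapdl.core import launch_mapdl

# Inside an sbatch/srun job, SLURM_JOB_NAME and SLURM_JOB_ID are defined, so
# PyMAPDL derives the number of cores and the memory from the SLURM_* variables.
mapdl = launch_mapdl()

# The detection can be skipped per call with the new keyword argument:
#     mapdl = launch_mapdl(detect_slurm_config=False)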

File tree

6 files changed (+417, -34 lines)
Lines changed: 1 addition & 0 deletions

@@ -0,0 +1 @@
+feat: Supporting SLURM env vars for launching MAPDL configuration

doc/source/user_guide/mapdl.rst

Lines changed: 3 additions & 0 deletions

@@ -1167,6 +1167,9 @@ These are described in the following table:
 |                                       |    export PYMAPDL_MAPDL_VERSION=22.2                                |
 |                                       |                                                                     |
 +---------------------------------------+---------------------------------------------------------------------+
+| :envvar:`PYMAPDL_ON_SLURM`            | With this environment variable set to ``FALSE``, you can avoid      |
+|                                       | PyMAPDL from detecting that it is running on a SLURM HPC cluster.   |
++---------------------------------------+---------------------------------------------------------------------+
 | :envvar:`PYMAPDL_MAX_MESSAGE_LENGTH`  | Maximum gRPC message length. If your                                |
 |                                       | connection terminates when running                                  |
 |                                       | PRNSOL or NLIST, raise this. In bytes,                              |
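The new table row documents the opt-out switch. A short, hypothetical session showing it in use; per the launcher code added further down, any value other than "false" (case-insensitive) leaves the detection enabled:

import os

# Must be set before launching.
os.environ["PYMAPDL_ON_SLURM"] = "false"

from ansys.mapdl.core import launch_mapdl

mapdl = launch_mapdl()  # behaves as on a regular workstation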

doc/styles/config/vocabularies/ANSYS/accept.txt

Lines changed: 0 additions & 1 deletion

@@ -99,7 +99,6 @@ Linder
 Linux
 MacOS
 mapdl
-mapdl
 MAPDL
 mater
 MATLAB

src/ansys/mapdl/core/launcher.py

Lines changed: 238 additions & 33 deletions

@@ -32,7 +32,7 @@
 import tempfile
 import threading
 import time
-from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union
+from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union
 import warnings

 import psutil
@@ -309,11 +309,12 @@ def launch_grpc(
         Number of processors. Defaults to 2.

     ram : float, optional
-        Fixed amount of memory to request for MAPDL. If ``None``,
-        then MAPDL will use as much as available on the host machine.
+        Total size in megabytes of the workspace (memory) used for the initial allocation.
+        The default is ``None``, in which case 2 GB (2048 MB) is used. To force a fixed size
+        throughout the run, specify a negative number.

     run_location : str, optional
-        MAPDL working directory. Defaults to a temporary working
+        MAPDL working directory. The default is the temporary working
         directory.

     port : int
@@ -525,6 +526,9 @@ def launch_grpc(

     pymapdl._LOCAL_PORTS.append(port)

+    if not nproc:
+        nproc = 2
+
     cpu_sw = "-np %d" % nproc

     if ram:
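The new default in launch_grpc simply guarantees a processor count before the command-line switch is built. A tiny standalone illustration of the effect (the variable names follow the diff; this is not library code):

nproc = None  # nothing requested by the caller

if not nproc:
    nproc = 2  # default added by this commit

cpu_sw = "-np %d" % nproc
print(cpu_sw)  # prints: -np 2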
@@ -576,22 +580,22 @@
             port_sw,
             grpc_sw,
         ]
-        command = " ".join(command_parm)

     else:  # linux
-        command_parm = []
-        command_parm.extend(
-            [
-                '"%s"' % exec_file,
-                job_sw,
-                cpu_sw,
-                ram_sw,
-                additional_switches,
-                port_sw,
-                grpc_sw,
-            ]
-        )
-        command = " ".join(command_parm)
+        command_parm = [
+            '"%s"' % exec_file,
+            job_sw,
+            cpu_sw,
+            ram_sw,
+            additional_switches,
+            port_sw,
+            grpc_sw,
+        ]
+
+    command_parm = [
+        each for each in command_parm if command_parm
+    ]  # cleaning empty args.
+    command = " ".join(command_parm)

     LOG.debug(f"Starting MAPDL with command: {command}")
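For context, a standalone sketch of what the Linux command assembly produces with made-up switch values. The variable names mirror the diff, the executable path and port are hypothetical, and the filter below keys on each element, which is what the "cleaning empty args" comment describes:

exec_file = "/ansys_inc/v241/ansys/bin/ansys241"  # hypothetical install path
job_sw = "-j file"
cpu_sw = "-np 2"
ram_sw = ""                      # empty when no RAM was requested
additional_switches = "-dis"
port_sw = "-port 50052"
grpc_sw = "-grpc"

command_parm = [
    '"%s"' % exec_file,
    job_sw,
    cpu_sw,
    ram_sw,
    additional_switches,
    port_sw,
    grpc_sw,
]

# Drop empty entries before joining.
command_parm = [each for each in command_parm if each]
command = " ".join(command_parm)
print(command)
# "/ansys_inc/v241/ansys/bin/ansys241" -j file -np 2 -dis -port 50052 -grpc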

@@ -1085,7 +1089,8 @@ def launch_mapdl(
     add_env_vars: Optional[Dict[str, str]] = None,
     replace_env_vars: Optional[Dict[str, str]] = None,
     version: Optional[Union[int, str]] = None,
-    **kwargs,
+    detect_slurm_config: bool = True,
+    **kwargs: Dict[str, Any],
 ) -> Union[MapdlGrpc, "MapdlConsole"]:
     """Start MAPDL locally.

@@ -1116,8 +1121,9 @@ def launch_mapdl(
         Number of processors. Defaults to 2.

     ram : float, optional
-        Fixed amount of memory to request for MAPDL. If ``None``,
-        then MAPDL will use as much as available on the host machine.
+        Total size in megabytes of the workspace (memory) used for the initial allocation.
+        The default is ``None``, in which case 2 GB (2048 MB) is used. To force a fixed size
+        throughout the run, specify a negative number.

     mode : str, optional
         Mode to launch MAPDL. Must be one of the following:
@@ -1441,9 +1447,43 @@
         "ANSYSLMD_LICENSE_FILE":"1055@MYSERVER"}
     >>> mapdl = launch_mapdl(replace_env_vars=my_env_vars)
     """
+    # By default
+    ON_SLURM = os.environ.get("PYMAPDL_ON_SLURM", None)
+    if ON_SLURM is None:
+        ON_SLURM = True
+    else:
+        # Unless the env var is false, it will be true.
+        ON_SLURM = not (ON_SLURM.lower() == "false")
+
+    # Let's require the following env vars to exist to go into slurm mode.
+    ON_SLURM = (
+        ON_SLURM
+        and bool(os.environ.get("SLURM_JOB_NAME", ""))
+        and bool(os.environ.get("SLURM_JOB_ID", ""))
+    )
+
+    if detect_slurm_config and ON_SLURM:
+        LOG.info("On Slurm mode.")
+
+        # extracting parameters
+        exec_file, jobname, nproc, ram, additional_switches = _parse_slurm_options(
+            exec_file,
+            jobname,
+            nproc,
+            ram,
+            additional_switches,
+            **kwargs,
+        )
+        # To avoid timeouts
+        license_server_check = False
+        start_timeout = 2 * start_timeout
+        ON_SLURM = True  # Using this as main variable
+    else:
+        ON_SLURM = False
+
     if remove_temp_files is not None:
         warnings.warn(
-            "The option ``remove_temp_files`` is being deprecated and it will be removed by PyMAPDL version 0.66.0.\n"
+            "The ``remove_temp_files`` option is being deprecated. It is to be removed in PyMAPDL version 0.66.0.\n"
             "Please use ``remove_temp_dir_on_exit`` instead.",
             DeprecationWarning,
             stacklevel=2,
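Taken together, the detection added above reduces to a small predicate plus two side effects (the license server check is skipped and the start timeout is doubled). A hedged, standalone restatement for readability; the helper name detect_on_slurm is mine, not the library's:

import os

def detect_on_slurm() -> bool:
    # Opt-out variable first; anything other than "false" keeps detection on.
    flag = os.environ.get("PYMAPDL_ON_SLURM")
    enabled = True if flag is None else flag.lower() != "false"
    # Then both job-identifying SLURM variables must actually be present.
    return (
        enabled
        and bool(os.environ.get("SLURM_JOB_NAME", ""))
        and bool(os.environ.get("SLURM_JOB_ID", ""))
    )

# SLURM_JOB_NAME=case SLURM_JOB_ID=1234                       -> True
# SLURM_JOB_NAME=case SLURM_JOB_ID=1234 PYMAPDL_ON_SLURM=off  -> True ("off" is not "false")
# PYMAPDL_ON_SLURM=false (any case)                            -> False
# neither SLURM variable set                                   -> False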
@@ -1637,7 +1677,7 @@ def launch_mapdl(
         start_parm,
         start_instance,
         version,
-    )
+    )  # type: ignore

     mapdl = MapdlGrpc(
         ip=ip,
@@ -1727,16 +1767,20 @@ def launch_mapdl(
     additional_switches = _check_license_argument(license_type, additional_switches)
     LOG.debug(f"Using additional switches {additional_switches}.")

-    # Setting number of processors
-    machine_cores = psutil.cpu_count(logical=False)
-    if not nproc:
-        if machine_cores < 2:  # default required cores
-            nproc = machine_cores  # to avoid starting issues
+    # Bypassing number of processors checks because VDI/VNC might have
+    # different number of processors than the cluster compute nodes.
+    if not ON_SLURM:
+        # Setting number of processors
+        machine_cores = psutil.cpu_count(logical=False)
+
+        if not nproc:
+            # Some machines only have 1 core
+            nproc = machine_cores if machine_cores < 2 else 2
         else:
-            nproc = 2
-    else:
-        if machine_cores < int(nproc):
-            raise NotEnoughResources
+            if machine_cores < int(nproc):
+                raise NotEnoughResources(
+                    f"The machine has {machine_cores} cores. PyMAPDL is asking for {nproc} cores."
+                )

     start_parm.update(
         {
@@ -1791,7 +1835,7 @@ def launch_mapdl(
         start_parm,
         start_instance,
         version,
-    )
+    )  # type: ignore

     port, actual_run_location, process = launch_grpc(
         port=port,
@@ -2078,6 +2122,167 @@ def _parse_ip_route(output):
     return match[0]


+def _parse_slurm_options(
+    exec_file: Optional[str],
+    jobname: str,
+    nproc: Optional[int],
+    ram: Optional[Union[str, int]],
+    additional_switches: str,
+    **kwargs: Dict[str, Any],
+):
+    def get_value(
+        variable: str,
+        kwargs: Dict[str, Any],
+        default: Optional[Union[str, int, float]] = 1,
+        astype: Optional[Callable[[Any], Any]] = int,
+    ):
+        value_from_env_vars = os.environ.get(variable, None)
+        value_from_kwargs = kwargs.pop(variable, None)
+        value = value_from_kwargs or value_from_env_vars or default
+        if astype and value:
+            return astype(value)
+        else:
+            return value
+
+    ## Getting env vars
+    SLURM_NNODES = get_value("SLURM_NNODES", kwargs)
+    LOG.info(f"SLURM_NNODES: {SLURM_NNODES}")
+    # ntasks is for mpi
+    SLURM_NTASKS = get_value("SLURM_NTASKS", kwargs)
+    LOG.info(f"SLURM_NTASKS: {SLURM_NTASKS}")
+    # Sharing tasks acrros multiple nodes (DMP)
+    # the format of this envvar is a bit tricky. Avoiding it for the moment.
+    # SLURM_TASKS_PER_NODE = int(
+    #     kwargs.pop(
+    #         "SLURM_TASKS_PER_NODE", os.environ.get("SLURM_TASKS_PER_NODE", 1)
+    #     )
+    # )
+
+    # cpus-per-task is for multithreading,
+    # sharing tasks across multiple CPUs in same node (SMP)
+    SLURM_CPUS_PER_TASK = get_value("SLURM_CPUS_PER_TASK", kwargs)
+    LOG.info(f"SLURM_CPUS_PER_TASK: {SLURM_CPUS_PER_TASK}")
+
+    # Set to value of the --ntasks option, if specified. See SLURM_NTASKS. Included for backwards compatibility.
+    SLURM_NPROCS = get_value("SLURM_NPROCS", kwargs)
+    LOG.info(f"SLURM_NPROCS: {SLURM_NPROCS}")
+
+    # Number of CPUs allocated to the batch step.
+    SLURM_CPUS_ON_NODE = get_value("SLURM_CPUS_ON_NODE", kwargs)
+    LOG.info(f"SLURM_CPUS_ON_NODE: {SLURM_CPUS_ON_NODE}")
+
+    SLURM_MEM_PER_NODE = get_value(
+        "SLURM_MEM_PER_NODE", kwargs, default=None, astype=None
+    )
+    LOG.info(f"SLURM_MEM_PER_NODE: {SLURM_MEM_PER_NODE}")
+
+    SLURM_NODELIST = get_value(
+        "SLURM_NODELIST", kwargs, default="", astype=None
+    ).lower()
+    LOG.info(f"SLURM_NODELIST: {SLURM_NODELIST}")
+
+    if not exec_file:
+        exec_file = os.environ.get("PYMAPDL_MAPDL_EXEC", None)
+
+    if not exec_file:
+        # We should probably make a way to find it.
+        # We will use the module thing
+        pass
+    LOG.info(f"Using MAPDL executable in: {exec_file}")
+
+    if not jobname:
+        jobname = os.environ.get("SLURM_JOB_NAME", "file")
+    LOG.info(f"Using jobname: {jobname}")
+
+    # Checking specific env var
+    if not nproc:
+        nproc = os.environ.get("PYMAPDL_NPROC", None)
+        if nproc:
+            nproc = int(nproc)
+
+    if not nproc:
+        ## Attempt to calculate the appropriate number of cores:
+        # Reference: https://stackoverflow.com/a/51141287/6650211
+        # I'm assuming the env var makes sense.
+        #
+        # - SLURM_CPUS_ON_NODE is a property of the cluster, not of the job.
+        #
+        options = [
+            # 4,  # Fall back option
+            SLURM_CPUS_PER_TASK * SLURM_NTASKS,  # (CPUs)
+            SLURM_NPROCS,  # (CPUs)
+            # SLURM_NTASKS,  # (tasks) Not necessary the number of CPUs,
+            # SLURM_NNODES * SLURM_TASKS_PER_NODE * SLURM_CPUS_PER_TASK,  # (CPUs)
+            SLURM_CPUS_ON_NODE * SLURM_NNODES,  # (cpus)
+        ]
+        LOG.info(f"On SLURM number of processors options {options}")
+        nproc = max(options)
+
+    LOG.info(f"Setting number of CPUs to: {nproc}")
+
+    if not ram:
+        if SLURM_MEM_PER_NODE:
+            # RAM argument is in MB, so we need to convert
+
+            if SLURM_MEM_PER_NODE[-1] == "T":  # tera
+                ram = int(SLURM_MEM_PER_NODE[:-1]) * (2**10) ** 2
+            elif SLURM_MEM_PER_NODE[-1] == "G":  # giga
+                ram = int(SLURM_MEM_PER_NODE[:-1]) * (2**10) ** 1
+            elif SLURM_MEM_PER_NODE[-1].upper() == "k":  # kilo
+                ram = int(SLURM_MEM_PER_NODE[:-1]) * (2**10) ** (-1)
+            else:  # Mega
+                ram = int(SLURM_MEM_PER_NODE)
+
+    LOG.info(f"Setting RAM to: {ram}")
+
+    # We use "-dis " (with space) to avoid collision with user variables such
+    # as `-distro` or so
+    if "-dis " not in additional_switches and not additional_switches.endswith("-dis"):
+        additional_switches += " -dis"
+
+    ## Getting the node list
+    machines = ""
+    # parsing nodes to list
+    if SLURM_NODELIST:
+        try:
+            p = subprocess.Popen(
+                ["scontrol", "show", "hostnames", f"{SLURM_NODELIST}"],
+                stderr=subprocess.PIPE,
+                stdout=subprocess.PIPE,
+            )
+            stderr = p.stderr.read().decode()
+            stdout = p.stdout.read().decode()
+
+            if "Invalid hostlist" in stderr:
+                raise ValueError(
+                    "The node list is invalid, or it could not be parsed.\n",
+                    "Are you passing the nodes correctly?\n",
+                    f"Nodes list: {SLURM_NODELIST}",
+                )
+            if stderr:
+                raise RuntimeError(stderr)
+            nodes = stdout.strip().splitlines()
+
+            machines = ":".join([f"{each_node}" for each_node in nodes])
+
+            # The following code creates the cmd line bit for MAPDL. It seems it
+            # is not needed in slurm.
+            # machines = " -machines " + ":".join([
+            #     f"{each_node}:{SLURM_CPUS_ON_NODE}" for each_node in nodes
+            # ])
+
+            # We do not need to inject the machines in MAPDL command line.
+            # additional_switches += machines
+            LOG.info(f"Using nodes configuration: {machines}")
+
+        except Exception as e:
+            LOG.info(
+                f"The machines list could not be obtained.\nThis error occurred:\n{str(e)}"
+            )
+
+    return exec_file, jobname, nproc, ram, additional_switches
+
+
 def pack_parameters(
     port,
     ip,
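To make the selection logic in _parse_slurm_options concrete, here is a hedged worked example with made-up scheduler values (plain arithmetic, not library code). Assume SLURM_NNODES=2, SLURM_NTASKS=8, SLURM_CPUS_PER_TASK=2, SLURM_NPROCS=8, SLURM_CPUS_ON_NODE=8, and SLURM_MEM_PER_NODE="16G":

# Candidate core counts, mirroring the `options` list above:
#   SLURM_CPUS_PER_TASK * SLURM_NTASKS  -> 2 * 8 = 16
#   SLURM_NPROCS                        -> 8
#   SLURM_CPUS_ON_NODE * SLURM_NNODES   -> 8 * 2 = 16
# nproc = max(options) -> 16

# Memory string: "16G" ends in "G", so it is converted to megabytes.
SLURM_MEM_PER_NODE = "16G"
if SLURM_MEM_PER_NODE[-1] == "G":
    ram = int(SLURM_MEM_PER_NODE[:-1]) * (2**10) ** 1
print(ram)  # 16384

Note that values passed as keyword arguments take precedence over the environment (get_value checks kwargs first), and a PYMAPDL_NPROC environment variable overrides the computed core count entirely.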
