|
32 | 32 | import tempfile |
33 | 33 | import threading |
34 | 34 | import time |
35 | | -from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union |
| 35 | +from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union |
36 | 36 | import warnings |
37 | 37 |
|
38 | 38 | import psutil |
@@ -309,11 +309,12 @@ def launch_grpc( |
309 | 309 | Number of processors. Defaults to 2. |
310 | 310 |
|
311 | 311 | ram : float, optional |
312 | | - Fixed amount of memory to request for MAPDL. If ``None``, |
313 | | - then MAPDL will use as much as available on the host machine. |
| 312 | + Total size in megabytes of the workspace (memory) used for the initial allocation. |
| 313 | + The default is ``None``, in which case 2 GB (2048 MB) is used. To force a fixed size |
| 314 | + throughout the run, specify a negative number. |
314 | 315 |
|
315 | 316 | run_location : str, optional |
316 | | - MAPDL working directory. Defaults to a temporary working |
| 317 | + MAPDL working directory. The default is the temporary working |
317 | 318 | directory. |
318 | 319 |
|
319 | 320 | port : int |
@@ -525,6 +526,9 @@ def launch_grpc( |
525 | 526 |
|
526 | 527 | pymapdl._LOCAL_PORTS.append(port) |
527 | 528 |
|
| 529 | + if not nproc: |
| 530 | + nproc = 2 |
| 531 | + |
528 | 532 | cpu_sw = "-np %d" % nproc |
529 | 533 |
|
530 | 534 | if ram: |
@@ -576,22 +580,22 @@ def launch_grpc( |
576 | 580 | port_sw, |
577 | 581 | grpc_sw, |
578 | 582 | ] |
579 | | - command = " ".join(command_parm) |
580 | 583 |
|
581 | 584 | else: # linux |
582 | | - command_parm = [] |
583 | | - command_parm.extend( |
584 | | - [ |
585 | | - '"%s"' % exec_file, |
586 | | - job_sw, |
587 | | - cpu_sw, |
588 | | - ram_sw, |
589 | | - additional_switches, |
590 | | - port_sw, |
591 | | - grpc_sw, |
592 | | - ] |
593 | | - ) |
594 | | - command = " ".join(command_parm) |
| 585 | + command_parm = [ |
| 586 | + '"%s"' % exec_file, |
| 587 | + job_sw, |
| 588 | + cpu_sw, |
| 589 | + ram_sw, |
| 590 | + additional_switches, |
| 591 | + port_sw, |
| 592 | + grpc_sw, |
| 593 | + ] |
| 594 | + |
| 595 | + command_parm = [ |
| 596 | + each for each in command_parm if each |
| 597 | + ] # remove empty arguments. |
| 598 | + command = " ".join(command_parm) |
595 | 599 |
|
596 | 600 | LOG.debug(f"Starting MAPDL with command: {command}") |
597 | 601 |
|
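A quick standalone check of the empty-argument filtering built above (illustrative values only; the executable path and port are made up, not taken from this commit):

    # Hypothetical argument list; the empty string stands for an unused switch.
    command_parm = ['"/usr/ansys/bin/mapdl"', "-j file", "-np 2", "", "-port 50052", "-grpc"]
    command_parm = [each for each in command_parm if each]  # drop empty arguments
    command = " ".join(command_parm)
    # '"/usr/ansys/bin/mapdl" -j file -np 2 -port 50052 -grpc'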
@@ -1085,7 +1089,8 @@ def launch_mapdl( |
1085 | 1089 | add_env_vars: Optional[Dict[str, str]] = None, |
1086 | 1090 | replace_env_vars: Optional[Dict[str, str]] = None, |
1087 | 1091 | version: Optional[Union[int, str]] = None, |
1088 | | - **kwargs, |
| 1092 | + detect_slurm_config: bool = True, |
| 1093 | + **kwargs: Dict[str, Any], |
1089 | 1094 | ) -> Union[MapdlGrpc, "MapdlConsole"]: |
1090 | 1095 | """Start MAPDL locally. |
1091 | 1096 |
|
@@ -1116,8 +1121,9 @@ def launch_mapdl( |
1116 | 1121 | Number of processors. Defaults to 2. |
1117 | 1122 |
|
1118 | 1123 | ram : float, optional |
1119 | | - Fixed amount of memory to request for MAPDL. If ``None``, |
1120 | | - then MAPDL will use as much as available on the host machine. |
| 1124 | + Total size in megabytes of the workspace (memory) used for the initial allocation. |
| 1125 | + The default is ``None``, in which case 2 GB (2048 MB) is used. To force a fixed size |
| 1126 | + throughout the run, specify a negative number. |
1121 | 1127 |
|
1122 | 1128 | mode : str, optional |
1123 | 1129 | Mode to launch MAPDL. Must be one of the following: |
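Given the ``ram`` behavior documented above (megabytes, 2048 MB default, negative value for a fixed workspace), a typical call might look like the following sketch; the values are illustrative, not from this commit:

    from ansys.mapdl.core import launch_mapdl

    # Request a 4096 MB initial workspace that MAPDL may grow as needed.
    mapdl = launch_mapdl(nproc=4, ram=4096)

    # Force a fixed 4096 MB workspace for the whole run.
    mapdl = launch_mapdl(nproc=4, ram=-4096)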
@@ -1441,9 +1447,43 @@ def launch_mapdl( |
1441 | 1447 | "ANSYSLMD_LICENSE_FILE":"1055@MYSERVER"} |
1442 | 1448 | >>> mapdl = launch_mapdl(replace_env_vars=my_env_vars) |
1443 | 1449 | """ |
| 1450 | + # Slurm detection is enabled by default. |
| 1451 | + ON_SLURM = os.environ.get("PYMAPDL_ON_SLURM", None) |
| 1452 | + if ON_SLURM is None: |
| 1453 | + ON_SLURM = True |
| 1454 | + else: |
| 1455 | + # Unless the env var is false, it will be true. |
| 1456 | + ON_SLURM = not (ON_SLURM.lower() == "false") |
| 1457 | + |
| 1458 | + # Let's require the following env vars to exist to go into slurm mode. |
| 1459 | + ON_SLURM = ( |
| 1460 | + ON_SLURM |
| 1461 | + and bool(os.environ.get("SLURM_JOB_NAME", "")) |
| 1462 | + and bool(os.environ.get("SLURM_JOB_ID", "")) |
| 1463 | + ) |
| 1464 | + |
| 1465 | + if detect_slurm_config and ON_SLURM: |
| 1466 | + LOG.info("On Slurm mode.") |
| 1467 | + |
| 1468 | + # extracting parameters |
| 1469 | + exec_file, jobname, nproc, ram, additional_switches = _parse_slurm_options( |
| 1470 | + exec_file, |
| 1471 | + jobname, |
| 1472 | + nproc, |
| 1473 | + ram, |
| 1474 | + additional_switches, |
| 1475 | + **kwargs, |
| 1476 | + ) |
| 1477 | + # To avoid timeouts |
| 1478 | + license_server_check = False |
| 1479 | + start_timeout = 2 * start_timeout |
| 1480 | + ON_SLURM = True # Use this as the main flag from here on. |
| 1481 | + else: |
| 1482 | + ON_SLURM = False |
| 1483 | + |
1444 | 1484 | if remove_temp_files is not None: |
1445 | 1485 | warnings.warn( |
1446 | | - "The option ``remove_temp_files`` is being deprecated and it will be removed by PyMAPDL version 0.66.0.\n" |
| 1486 | + "The ``remove_temp_files`` option is being deprecated. It is to be removed in PyMAPDL version 0.66.0.\n" |
1447 | 1487 | "Please use ``remove_temp_dir_on_exit`` instead.", |
1448 | 1488 | DeprecationWarning, |
1449 | 1489 | stacklevel=2, |
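The detection added at the top of ``launch_mapdl`` boils down to: Slurm mode is assumed unless ``PYMAPDL_ON_SLURM`` is literally ``"false"`` (case-insensitive), and both ``SLURM_JOB_NAME`` and ``SLURM_JOB_ID`` must be set. A standalone sketch of the same logic:

    import os

    flag = os.environ.get("PYMAPDL_ON_SLURM")
    on_slurm = True if flag is None else flag.lower() != "false"
    on_slurm = (
        on_slurm
        and bool(os.environ.get("SLURM_JOB_NAME", ""))
        and bool(os.environ.get("SLURM_JOB_ID", ""))
    )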
@@ -1637,7 +1677,7 @@ def launch_mapdl( |
1637 | 1677 | start_parm, |
1638 | 1678 | start_instance, |
1639 | 1679 | version, |
1640 | | - ) |
| 1680 | + ) # type: ignore |
1641 | 1681 |
|
1642 | 1682 | mapdl = MapdlGrpc( |
1643 | 1683 | ip=ip, |
@@ -1727,16 +1767,20 @@ def launch_mapdl( |
1727 | 1767 | additional_switches = _check_license_argument(license_type, additional_switches) |
1728 | 1768 | LOG.debug(f"Using additional switches {additional_switches}.") |
1729 | 1769 |
|
1730 | | - # Setting number of processors |
1731 | | - machine_cores = psutil.cpu_count(logical=False) |
1732 | | - if not nproc: |
1733 | | - if machine_cores < 2: # default required cores |
1734 | | - nproc = machine_cores # to avoid starting issues |
| 1770 | + # Bypassing the number-of-processors check because the VDI/VNC node might |
| 1771 | + # have a different number of processors than the cluster compute nodes. |
| 1772 | + if not ON_SLURM: |
| 1773 | + # Setting number of processors |
| 1774 | + machine_cores = psutil.cpu_count(logical=False) |
| 1775 | + |
| 1776 | + if not nproc: |
| 1777 | + # Some machines only have 1 core |
| 1778 | + nproc = machine_cores if machine_cores < 2 else 2 |
1735 | 1779 | else: |
1736 | | - nproc = 2 |
1737 | | - else: |
1738 | | - if machine_cores < int(nproc): |
1739 | | - raise NotEnoughResources |
| 1780 | + if machine_cores < int(nproc): |
| 1781 | + raise NotEnoughResources( |
| 1782 | + f"The machine has {machine_cores} cores. PyMAPDL is asking for {nproc} cores." |
| 1783 | + ) |
1740 | 1784 |
|
1741 | 1785 | start_parm.update( |
1742 | 1786 | { |
@@ -1791,7 +1835,7 @@ def launch_mapdl( |
1791 | 1835 | start_parm, |
1792 | 1836 | start_instance, |
1793 | 1837 | version, |
1794 | | - ) |
| 1838 | + ) # type: ignore |
1795 | 1839 |
|
1796 | 1840 | port, actual_run_location, process = launch_grpc( |
1797 | 1841 | port=port, |
@@ -2078,6 +2122,167 @@ def _parse_ip_route(output): |
2078 | 2122 | return match[0] |
2079 | 2123 |
|
2080 | 2124 |
|
| 2125 | +def _parse_slurm_options( |
| 2126 | + exec_file: Optional[str], |
| 2127 | + jobname: str, |
| 2128 | + nproc: Optional[int], |
| 2129 | + ram: Optional[Union[str, int]], |
| 2130 | + additional_switches: str, |
| 2131 | + **kwargs: Dict[str, Any], |
| 2132 | +): |
| 2133 | + def get_value( |
| 2134 | + variable: str, |
| 2135 | + kwargs: Dict[str, Any], |
| 2136 | + default: Optional[Union[str, int, float]] = 1, |
| 2137 | + astype: Optional[Callable[[Any], Any]] = int, |
| 2138 | + ): |
| 2139 | + value_from_env_vars = os.environ.get(variable, None) |
| 2140 | + value_from_kwargs = kwargs.pop(variable, None) |
| 2141 | + value = value_from_kwargs or value_from_env_vars or default |
| 2142 | + if astype and value: |
| 2143 | + return astype(value) |
| 2144 | + else: |
| 2145 | + return value |
| 2146 | + |
| 2147 | + ## Getting env vars |
| 2148 | + SLURM_NNODES = get_value("SLURM_NNODES", kwargs) |
| 2149 | + LOG.info(f"SLURM_NNODES: {SLURM_NNODES}") |
| 2150 | + # ntasks is for mpi |
| 2151 | + SLURM_NTASKS = get_value("SLURM_NTASKS", kwargs) |
| 2152 | + LOG.info(f"SLURM_NTASKS: {SLURM_NTASKS}") |
| 2153 | + # Sharing tasks across multiple nodes (DMP) |
| 2154 | + # the format of this envvar is a bit tricky. Avoiding it for the moment. |
| 2155 | + # SLURM_TASKS_PER_NODE = int( |
| 2156 | + # kwargs.pop( |
| 2157 | + # "SLURM_TASKS_PER_NODE", os.environ.get("SLURM_TASKS_PER_NODE", 1) |
| 2158 | + # ) |
| 2159 | + # ) |
| 2160 | + |
| 2161 | + # cpus-per-task is for multithreading, |
| 2162 | + # sharing tasks across multiple CPUs in same node (SMP) |
| 2163 | + SLURM_CPUS_PER_TASK = get_value("SLURM_CPUS_PER_TASK", kwargs) |
| 2164 | + LOG.info(f"SLURM_CPUS_PER_TASK: {SLURM_CPUS_PER_TASK}") |
| 2165 | + |
| 2166 | + # Set to value of the --ntasks option, if specified. See SLURM_NTASKS. Included for backwards compatibility. |
| 2167 | + SLURM_NPROCS = get_value("SLURM_NPROCS", kwargs) |
| 2168 | + LOG.info(f"SLURM_NPROCS: {SLURM_NPROCS}") |
| 2169 | + |
| 2170 | + # Number of CPUs allocated to the batch step. |
| 2171 | + SLURM_CPUS_ON_NODE = get_value("SLURM_CPUS_ON_NODE", kwargs) |
| 2172 | + LOG.info(f"SLURM_CPUS_ON_NODE: {SLURM_CPUS_ON_NODE}") |
| 2173 | + |
| 2174 | + SLURM_MEM_PER_NODE = get_value( |
| 2175 | + "SLURM_MEM_PER_NODE", kwargs, default=None, astype=None |
| 2176 | + ) |
| 2177 | + LOG.info(f"SLURM_MEM_PER_NODE: {SLURM_MEM_PER_NODE}") |
| 2178 | + |
| 2179 | + SLURM_NODELIST = get_value( |
| 2180 | + "SLURM_NODELIST", kwargs, default="", astype=None |
| 2181 | + ).lower() |
| 2182 | + LOG.info(f"SLURM_NODELIST: {SLURM_NODELIST}") |
| 2183 | + |
| 2184 | + if not exec_file: |
| 2185 | + exec_file = os.environ.get("PYMAPDL_MAPDL_EXEC", None) |
| 2186 | + |
| 2187 | + if not exec_file: |
| 2188 | + # We should probably add a way to find it automatically, |
| 2189 | + # for example through environment modules. |
| 2190 | + pass |
| 2191 | + LOG.info(f"Using MAPDL executable in: {exec_file}") |
| 2192 | + |
| 2193 | + if not jobname: |
| 2194 | + jobname = os.environ.get("SLURM_JOB_NAME", "file") |
| 2195 | + LOG.info(f"Using jobname: {jobname}") |
| 2196 | + |
| 2197 | + # Checking specific env var |
| 2198 | + if not nproc: |
| 2199 | + nproc = os.environ.get("PYMAPDL_NPROC", None) |
| 2200 | + if nproc: |
| 2201 | + nproc = int(nproc) |
| 2202 | + |
| 2203 | + if not nproc: |
| 2204 | + ## Attempt to calculate the appropriate number of cores: |
| 2205 | + # Reference: https://stackoverflow.com/a/51141287/6650211 |
| 2206 | + # I'm assuming the env var makes sense. |
| 2207 | + # |
| 2208 | + # - SLURM_CPUS_ON_NODE is a property of the cluster, not of the job. |
| 2209 | + # |
| 2210 | + options = [ |
| 2211 | + # 4, # Fall back option |
| 2212 | + SLURM_CPUS_PER_TASK * SLURM_NTASKS, # (CPUs) |
| 2213 | + SLURM_NPROCS, # (CPUs) |
| 2214 | + # SLURM_NTASKS, # (tasks) Not necessary the number of CPUs, |
| 2215 | + # SLURM_NNODES * SLURM_TASKS_PER_NODE * SLURM_CPUS_PER_TASK, # (CPUs) |
| 2216 | + SLURM_CPUS_ON_NODE * SLURM_NNODES, # (cpus) |
| 2217 | + ] |
| 2218 | + LOG.info(f"On SLURM number of processors options {options}") |
| 2219 | + nproc = max(options) |
| 2220 | + |
| 2221 | + LOG.info(f"Setting number of CPUs to: {nproc}") |
| 2222 | + |
| 2223 | + if not ram: |
| 2224 | + if SLURM_MEM_PER_NODE: |
| 2225 | + # RAM argument is in MB, so we need to convert |
| 2226 | + |
| 2227 | + if SLURM_MEM_PER_NODE[-1] == "T": # tera |
| 2228 | + ram = int(SLURM_MEM_PER_NODE[:-1]) * (2**10) ** 2 |
| 2229 | + elif SLURM_MEM_PER_NODE[-1] == "G": # giga |
| 2230 | + ram = int(SLURM_MEM_PER_NODE[:-1]) * (2**10) ** 1 |
| 2231 | + elif SLURM_MEM_PER_NODE[-1].upper() == "K": # kilo |
| 2232 | + ram = int(SLURM_MEM_PER_NODE[:-1]) * (2**10) ** (-1) |
| 2233 | + else: # Mega |
| 2234 | + ram = int(SLURM_MEM_PER_NODE) |
| 2235 | + |
| 2236 | + LOG.info(f"Setting RAM to: {ram}") |
| 2237 | + |
| 2238 | + # We use "-dis " (with a trailing space) to avoid collisions with |
| 2239 | + # user-provided switches such as `-distro`. |
| 2240 | + if "-dis " not in additional_switches and not additional_switches.endswith("-dis"): |
| 2241 | + additional_switches += " -dis" |
| 2242 | + |
| 2243 | + ## Getting the node list |
| 2244 | + machines = "" |
| 2245 | + # parsing nodes to list |
| 2246 | + if SLURM_NODELIST: |
| 2247 | + try: |
| 2248 | + p = subprocess.Popen( |
| 2249 | + ["scontrol", "show", "hostnames", f"{SLURM_NODELIST}"], |
| 2250 | + stderr=subprocess.PIPE, |
| 2251 | + stdout=subprocess.PIPE, |
| 2252 | + ) |
| 2253 | + stderr = p.stderr.read().decode() |
| 2254 | + stdout = p.stdout.read().decode() |
| 2255 | + |
| 2256 | + if "Invalid hostlist" in stderr: |
| 2257 | + raise ValueError( |
| 2258 | + "The node list is invalid, or it could not be parsed.\n", |
| 2259 | + "Are you passing the nodes correctly?\n", |
| 2260 | + f"Nodes list: {SLURM_NODELIST}", |
| 2261 | + ) |
| 2262 | + if stderr: |
| 2263 | + raise RuntimeError(stderr) |
| 2264 | + nodes = stdout.strip().splitlines() |
| 2265 | + |
| 2266 | + machines = ":".join([f"{each_node}" for each_node in nodes]) |
| 2267 | + |
| 2268 | + # The following code creates the cmd line bit for MAPDL. It seems it |
| 2269 | + # is not needed in slurm. |
| 2270 | + # machines = " -machines " + ":".join([ |
| 2271 | + # f"{each_node}:{SLURM_CPUS_ON_NODE}" for each_node in nodes |
| 2272 | + # ]) |
| 2273 | + |
| 2274 | + # We do not need to inject the machines in MAPDL command line. |
| 2275 | + # additional_switches += machines |
| 2276 | + LOG.info(f"Using nodes configuration: {machines}") |
| 2277 | + |
| 2278 | + except Exception as e: |
| 2279 | + LOG.info( |
| 2280 | + f"The machines list could not be obtained.\nThis error occurred:\n{str(e)}" |
| 2281 | + ) |
| 2282 | + |
| 2283 | + return exec_file, jobname, nproc, ram, additional_switches |
| 2284 | + |
| 2285 | + |
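A standalone check of the ``SLURM_MEM_PER_NODE`` conversion in ``_parse_slurm_options`` (the helper name and sample values are only for illustration):

    def slurm_mem_to_mb(mem: str) -> float:
        """Convert a Slurm memory string (K/G/T suffix, megabytes by default) to MB."""
        suffix = mem[-1].upper()
        if suffix == "T":
            return int(mem[:-1]) * (2**10) ** 2
        if suffix == "G":
            return int(mem[:-1]) * (2**10) ** 1
        if suffix == "K":
            return int(mem[:-1]) * (2**10) ** (-1)
        return int(mem)  # plain number means megabytes

    assert slurm_mem_to_mb("2T") == 2 * 1024 * 1024
    assert slurm_mem_to_mb("16G") == 16384
    assert slurm_mem_to_mb("2048") == 2048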
2081 | 2286 | def pack_parameters( |
2082 | 2287 | port, |
2083 | 2288 | ip, |
|