diff --git a/common/library/module_utils/input_validation/common_utils/slurm_conf_utils.py b/common/library/module_utils/input_validation/common_utils/slurm_conf_utils.py index 4c58407896..0e59272815 100644 --- a/common/library/module_utils/input_validation/common_utils/slurm_conf_utils.py +++ b/common/library/module_utils/input_validation/common_utils/slurm_conf_utils.py @@ -27,7 +27,7 @@ class SlurmParserEnum(str, Enum): S_P_UINT32 = "int" # unsigned int mapped to int S_P_UINT64 = "int" # unsigned int mapped to int S_P_POINTER = "object" # generic object / pointer - S_P_ARRAY = "list" # array-like -> list + S_P_ARRAY = "array" # list of dict S_P_BOOLEAN = "bool" # boolean S_P_LINE = "str" # line of text S_P_EXPLINE = "str" # expanded line of text @@ -35,6 +35,8 @@ class SlurmParserEnum(str, Enum): S_P_FLOAT = "float" # floating point S_P_DOUBLE = "float" # Python float is double precision S_P_LONG_DOUBLE = "float" # approximate with float + S_P_CSV = "csv" # comma separated values + S_P_LIST = "list" # list of strings # Convenience aliases (if other modules refer to S_P_* directly) @@ -53,6 +55,8 @@ class SlurmParserEnum(str, Enum): S_P_FLOAT = SlurmParserEnum.S_P_FLOAT S_P_DOUBLE = SlurmParserEnum.S_P_DOUBLE S_P_LONG_DOUBLE = SlurmParserEnum.S_P_LONG_DOUBLE +S_P_CSV = SlurmParserEnum.S_P_CSV +S_P_LIST = SlurmParserEnum.S_P_LIST downnodes_options = { @@ -67,11 +71,11 @@ class SlurmParserEnum(str, Enum): "CoreSpecCount": S_P_UINT16, "CoresPerSocket": S_P_UINT16, "CPUs": S_P_UINT16, - "CPUSpecList": S_P_STRING, + "CPUSpecList": S_P_CSV, "CpuBind": S_P_STRING, "Feature": S_P_STRING, - "Features": S_P_STRING, - "Gres": S_P_STRING, + "Features": S_P_CSV, + "Gres": S_P_CSV, "GresConf": S_P_STRING, "MemSpecLimit": S_P_UINT64, "NodeAddr": S_P_STRING, @@ -87,7 +91,7 @@ class SlurmParserEnum(str, Enum): "State": S_P_STRING, "ThreadsPerCore": S_P_UINT16, "TmpDisk": S_P_UINT32, - "Topology": S_P_STRING, + "Topology": S_P_CSV, "TRESWeights": S_P_STRING, "Weight": S_P_UINT32, 
} @@ -95,15 +99,15 @@ class SlurmParserEnum(str, Enum): nodeset_options = { "Feature": S_P_STRING, - "Nodes": S_P_STRING, + "Nodes": S_P_STRING } partition_options = { - "AllocNodes": S_P_STRING, - "AllowAccounts": S_P_STRING, - "AllowGroups": S_P_STRING, - "AllowQos": S_P_STRING, + "AllocNodes": S_P_CSV, + "AllowAccounts": S_P_CSV, + "AllowGroups": S_P_CSV, + "AllowQos": S_P_CSV, "Alternate": S_P_STRING, "CpuBind": S_P_STRING, "DefCPUPerGPU": S_P_UINT64, @@ -112,8 +116,8 @@ class SlurmParserEnum(str, Enum): "DefMemPerNode": S_P_UINT64, "Default": S_P_BOOLEAN, "DefaultTime": S_P_STRING, - "DenyAccounts": S_P_STRING, - "DenyQos": S_P_STRING, + "DenyAccounts": S_P_CSV, + "DenyQos": S_P_CSV, "DisableRootJobs": S_P_BOOLEAN, "ExclusiveUser": S_P_BOOLEAN, "ExclusiveTopo": S_P_BOOLEAN, @@ -127,7 +131,7 @@ class SlurmParserEnum(str, Enum): "MaxTime": S_P_STRING, "MaxNodes": S_P_UINT32, "MinNodes": S_P_UINT32, - "Nodes": S_P_STRING, + "Nodes": S_P_CSV, "OverSubscribe": S_P_STRING, "OverTimeLimit": S_P_STRING, "PowerDownOnIdle": S_P_BOOLEAN, @@ -145,22 +149,22 @@ class SlurmParserEnum(str, Enum): "SuspendTime": S_P_STRING, "SuspendTimeout": S_P_UINT16, "Topology": S_P_STRING, - "TRESBillingWeights": S_P_STRING, + "TRESBillingWeights": S_P_CSV } # From https://github.com/SchedMD/slurm/blob/slurm-/src/common/read_config.c slurm_options = { "AccountingStorageBackupHost": S_P_STRING, - "AccountingStorageEnforce": S_P_STRING, - "AccountingStorageExternalHost": S_P_STRING, + "AccountingStorageEnforce": S_P_CSV, + "AccountingStorageExternalHost": S_P_CSV, "AccountingStorageHost": S_P_STRING, - "AccountingStorageParameters": S_P_STRING, + "AccountingStorageParameters": S_P_CSV, "AccountingStoragePass": S_P_STRING, "AccountingStoragePort": S_P_UINT16, - "AccountingStorageTRES": S_P_STRING, + "AccountingStorageTRES": S_P_CSV, "AccountingStorageType": S_P_STRING, # {"AccountingStorageUser": S_P_STRING, _defunct_option, - "AccountingStoreFlags": S_P_STRING, + "AccountingStoreFlags": 
S_P_CSV, "AccountingStoreJobComment": S_P_BOOLEAN, "AcctGatherEnergyType": S_P_STRING, "AcctGatherFilesystemType": S_P_STRING, @@ -169,25 +173,25 @@ class SlurmParserEnum(str, Enum): "AcctGatherNodeFreq": S_P_UINT16, "AcctGatherProfileType": S_P_STRING, "AllowSpecResourcesUsage": S_P_BOOLEAN, - "AuthAltParameters": S_P_STRING, - "AuthAltTypes": S_P_STRING, - "AuthInfo": S_P_STRING, + "AuthAltParameters": S_P_CSV, + "AuthAltTypes": S_P_CSV, + "AuthInfo": S_P_CSV, "AuthType": S_P_STRING, "BackupAddr": S_P_STRING, "BackupController": S_P_STRING, "BatchStartTimeout": S_P_UINT16, - "BcastExclude": S_P_STRING, - "BcastParameters": S_P_STRING, + "BcastExclude": S_P_CSV, + "BcastParameters": S_P_CSV, "BurstBufferParameters": S_P_STRING, "BurstBufferType": S_P_STRING, "CertgenType": S_P_STRING, - "CertgenParameters": S_P_STRING, + "CertgenParameters": S_P_CSV, "CertmgrType": S_P_STRING, "CertmgrParameters": S_P_STRING, - "CliFilterParameters": S_P_STRING, - "CliFilterPlugins": S_P_STRING, + "CliFilterParameters": S_P_CSV, + "CliFilterPlugins": S_P_CSV, "ClusterName": S_P_STRING, - "CommunicationParameters": S_P_STRING, + "CommunicationParameters": S_P_CSV, "CompleteWait": S_P_UINT16, "ControlAddr": S_P_STRING, "ControlMachine": S_P_STRING, @@ -197,33 +201,33 @@ class SlurmParserEnum(str, Enum): "CredType": S_P_STRING, "CryptoType": S_P_STRING, "DataParserParameters": S_P_STRING, - "DebugFlags": S_P_STRING, + "DebugFlags": S_P_CSV, "DefCPUPerGPU": S_P_UINT64, "DefMemPerCPU": S_P_UINT64, "DefMemPerGPU": S_P_UINT64, "DefMemPerNode": S_P_UINT64, - "DependencyParameters": S_P_STRING, + "DependencyParameters": S_P_CSV, "DisableRootJobs": S_P_BOOLEAN, "EioTimeout": S_P_UINT16, "EnforcePartLimits": S_P_STRING, - "Epilog": S_P_ARRAY, + "Epilog": S_P_LIST, "EpilogMsgTime": S_P_UINT32, - "EpilogSlurmctld": S_P_ARRAY, + "EpilogSlurmctld": S_P_LIST, "EpilogTimeout": S_P_UINT16, # {"ExtSensorsFreq": S_P_UINT16, _defunct_option, # {"ExtSensorsType": S_P_STRING, _defunct_option, 
"FairShareDampeningFactor": S_P_UINT16, "FastSchedule": S_P_UINT16, - "FederationParameters": S_P_STRING, + "FederationParameters": S_P_CSV, "FirstJobId": S_P_UINT32, # {"GetEnvTimeout": S_P_UINT16, _defunct_option, "GpuFreqDef": S_P_STRING, - "GresTypes": S_P_STRING, + "GresTypes": S_P_CSV, "GroupUpdateForce": S_P_UINT16, "GroupUpdateTime": S_P_UINT16, "HashPlugin": S_P_STRING, "HealthCheckInterval": S_P_UINT16, - "HealthCheckNodeState": S_P_STRING, + "HealthCheckNodeState": S_P_CSV, "HealthCheckProgram": S_P_STRING, "HttpParserType": S_P_STRING, "InactiveLimit": S_P_UINT16, @@ -233,7 +237,7 @@ class SlurmParserEnum(str, Enum): "JobAcctGatherType": S_P_STRING, "JobCompHost": S_P_STRING, "JobCompLoc": S_P_STRING, - "JobCompParams": S_P_STRING, + "JobCompParams": S_P_CSV, "JobCompPass": S_P_STRING, "JobCompPassScript": S_P_STRING, "JobCompPort": S_P_UINT32, @@ -244,13 +248,13 @@ class SlurmParserEnum(str, Enum): # {"JobCredentialPublicCertificate": S_P_STRING, _defunct_option, "JobFileAppend": S_P_UINT16, "JobRequeue": S_P_UINT16, - "JobSubmitPlugins": S_P_STRING, + "JobSubmitPlugins": S_P_CSV, "KeepAliveTime": S_P_UINT32, "KillOnBadExit": S_P_UINT16, "KillWait": S_P_UINT16, "LaunchParameters": S_P_STRING, "LaunchType": S_P_STRING, - "Licenses": S_P_STRING, + "Licenses": S_P_CSV, "LogTimeFormat": S_P_STRING, "MailDomain": S_P_STRING, "MailProg": S_P_STRING, @@ -270,7 +274,7 @@ class SlurmParserEnum(str, Enum): "MetricsType": S_P_STRING, "MinJobAge": S_P_UINT32, "MpiDefault": S_P_STRING, - "MpiParams": S_P_STRING, + "MpiParams": S_P_CSV, "NamespaceType": S_P_STRING, "NodeFeaturesPlugins": S_P_STRING, "OverTimeLimit": S_P_UINT16, @@ -279,11 +283,11 @@ class SlurmParserEnum(str, Enum): # {"PowerParameters": S_P_STRING, _defunct_option, # {"PowerPlugin": S_P_STRING, _defunct_option, "PreemptExemptTime": S_P_STRING, - "PreemptMode": S_P_STRING, - "PreemptParameters": S_P_STRING, + "PreemptMode": S_P_CSV, + "PreemptParameters": S_P_CSV, "PreemptType": S_P_STRING, 
"PrEpParameters": S_P_STRING, - "PrEpPlugins": S_P_STRING, + "PrEpPlugins": S_P_CSV, "PriorityCalcPeriod": S_P_STRING, "PriorityDecayHalfLife": S_P_STRING, "PriorityFavorSmall": S_P_BOOLEAN, @@ -300,21 +304,21 @@ class SlurmParserEnum(str, Enum): "PriorityWeightJobSize": S_P_UINT32, "PriorityWeightPartition": S_P_UINT32, "PriorityWeightQOS": S_P_UINT32, - "PriorityWeightTRES": S_P_STRING, - "PrivateData": S_P_STRING, + "PriorityWeightTRES": S_P_CSV, + "PrivateData": S_P_CSV, "ProctrackType": S_P_STRING, - "Prolog": S_P_ARRAY, + "Prolog": S_P_LIST, "PrologEpilogTimeout": S_P_UINT16, - "PrologFlags": S_P_STRING, - "PrologSlurmctld": S_P_ARRAY, + "PrologFlags": S_P_CSV, + "PrologSlurmctld": S_P_LIST, "PrologTimeout": S_P_UINT16, "PropagatePrioProcess": S_P_UINT16, - "PropagateResourceLimits": S_P_STRING, - "PropagateResourceLimitsExcept": S_P_STRING, + "PropagateResourceLimits": S_P_CSV, + "PropagateResourceLimitsExcept": S_P_CSV, "RebootProgram": S_P_STRING, "ReconfigFlags": S_P_STRING, - "RequeueExit": S_P_STRING, - "RequeueExitHold": S_P_STRING, + "RequeueExit": S_P_CSV, + "RequeueExitHold": S_P_CSV, "ResumeFailProgram": S_P_STRING, "ResumeProgram": S_P_STRING, "ResumeRate": S_P_UINT16, @@ -326,16 +330,16 @@ class SlurmParserEnum(str, Enum): "RoutePlugin": S_P_STRING, "SallocDefaultCommand": S_P_STRING, "SbcastParameters": S_P_STRING, - "SchedulerParameters": S_P_STRING, + "SchedulerParameters": S_P_CSV, "SchedulerTimeSlice": S_P_UINT16, "SchedulerType": S_P_STRING, - "ScronParameters": S_P_STRING, + "ScronParameters": S_P_CSV, "SelectType": S_P_STRING, "SelectTypeParameters": S_P_STRING, "SlurmctldAddr": S_P_STRING, "SlurmctldDebug": S_P_STRING, "SlurmctldLogFile": S_P_STRING, - "SlurmctldParameters": S_P_STRING, + "SlurmctldParameters": S_P_CSV, "SlurmctldPidFile": S_P_STRING, "SlurmctldPort": S_P_STRING, "SlurmctldPrimaryOffProg": S_P_STRING, @@ -344,7 +348,7 @@ class SlurmParserEnum(str, Enum): "SlurmctldTimeout": S_P_UINT16, "SlurmdDebug": S_P_STRING, 
"SlurmdLogFile": S_P_STRING, - "SlurmdParameters": S_P_STRING, + "SlurmdParameters": S_P_CSV, "SlurmdPidFile": S_P_STRING, "SlurmdPort": S_P_UINT32, "SlurmdSpoolDir": S_P_STRING, @@ -358,24 +362,24 @@ class SlurmParserEnum(str, Enum): "SrunPortRange": S_P_STRING, "SrunProlog": S_P_STRING, "StateSaveLocation": S_P_STRING, - "SuspendExcNodes": S_P_STRING, - "SuspendExcParts": S_P_STRING, + "SuspendExcNodes": S_P_CSV, + "SuspendExcParts": S_P_CSV, "SuspendExcStates": S_P_STRING, "SuspendProgram": S_P_STRING, "SuspendRate": S_P_UINT16, "SuspendTime": S_P_STRING, "SuspendTimeout": S_P_UINT16, - "SwitchParameters": S_P_STRING, + "SwitchParameters": S_P_CSV, "SwitchType": S_P_STRING, "TaskEpilog": S_P_STRING, - "TaskPlugin": S_P_STRING, - "TaskPluginParam": S_P_STRING, + "TaskPlugin": S_P_CSV, + "TaskPluginParam": S_P_CSV, "TaskProlog": S_P_STRING, "TCPTimeout": S_P_UINT16, - "TLSParameters": S_P_STRING, + "TLSParameters": S_P_CSV, "TLSType": S_P_STRING, "TmpFS": S_P_STRING, - "TopologyParam": S_P_STRING, + "TopologyParam": S_P_CSV, "TopologyPlugin": S_P_STRING, "TrackWCKey": S_P_BOOLEAN, "TreeWidth": S_P_UINT16, @@ -390,7 +394,7 @@ class SlurmParserEnum(str, Enum): "NodeName": S_P_ARRAY, "NodeSet": S_P_ARRAY, "PartitionName": S_P_ARRAY, - "SlurmctldHost": S_P_ARRAY, + "SlurmctldHost": S_P_LIST } # From https://github.com/SchedMD/slurm/blob/slurm-/src/slurmdbd/read_config.c @@ -406,12 +410,12 @@ class SlurmParserEnum(str, Enum): "ArchiveSuspend": S_P_BOOLEAN, "ArchiveTXN": S_P_BOOLEAN, "ArchiveUsage": S_P_BOOLEAN, - "AuthAltTypes": S_P_STRING, - "AuthAltParameters": S_P_STRING, - "AuthInfo": S_P_STRING, + "AuthAltTypes": S_P_CSV, + "AuthAltParameters": S_P_CSV, + "AuthInfo": S_P_CSV, "AuthType": S_P_STRING, "CommitDelay": S_P_UINT16, - "CommunicationParameters": S_P_STRING, + "CommunicationParameters": S_P_CSV, "DbdAddr": S_P_STRING, "DbdBackupHost": S_P_STRING, "DbdHost": S_P_STRING, @@ -429,10 +433,10 @@ class SlurmParserEnum(str, Enum): "MaxPurgeLimit": S_P_UINT32, 
"MaxQueryTimeRange": S_P_STRING, "MessageTimeout": S_P_UINT16, - "Parameters": S_P_STRING, + "Parameters": S_P_CSV, "PidFile": S_P_STRING, "PluginDir": S_P_STRING, - "PrivateData": S_P_STRING, + "PrivateData": S_P_CSV, "PurgeEventAfter": S_P_STRING, "PurgeJobAfter": S_P_STRING, "PurgeResvAfter": S_P_STRING, @@ -451,14 +455,14 @@ class SlurmParserEnum(str, Enum): "StorageBackupHost": S_P_STRING, "StorageHost": S_P_STRING, "StorageLoc": S_P_STRING, - "StorageParameters": S_P_STRING, + "StorageParameters": S_P_CSV, "StoragePass": S_P_STRING, "StoragePassScript": S_P_STRING, "StoragePort": S_P_UINT16, "StorageType": S_P_STRING, "StorageUser": S_P_STRING, "TCPTimeout": S_P_UINT16, - "TLSParameters": S_P_STRING, + "TLSParameters": S_P_CSV, "TLSType": S_P_STRING, "TrackWCKey": S_P_BOOLEAN, "TrackSlurmctldDown": S_P_BOOLEAN @@ -505,23 +509,23 @@ class SlurmParserEnum(str, Enum): "PMIxNetDevicesUCX": S_P_STRING, "PMIxShareServerTopology": S_P_BOOLEAN, "PMIxTimeout": S_P_UINT32, - "PMIxTlsUCX": S_P_STRING + "PMIxTlsUCX": S_P_CSV } # From https://github.com/SchedMD/slurm/blob/slurm-s/src/interfaces/gres.c#L101C40-L116C2 gres_options = { "AutoDetect": S_P_STRING, - "Count": S_P_STRING, # Number of Gres available */ + "Count": S_P_STRING, # Number of Gres available "CPUs": S_P_STRING, # CPUs to bind to Gres resource - "Cores": S_P_STRING, # Cores to bind to Gres resource */ - "File": S_P_STRING, # Path to Gres device */ - "Files": S_P_STRING, # Path to Gres device */ - "Flags": S_P_STRING, # GRES Flags */ - "Link": S_P_STRING, # Communication link IDs */ - "Links": S_P_STRING, # Communication link IDs */ - "MultipleFiles": S_P_STRING, # list of GRES device files */ - "Name": S_P_STRING, # Gres name */ - "Type": S_P_STRING # Gres type (e.g. 
model name) */ + "Cores": S_P_CSV, # Cores to bind to Gres resource + "File": S_P_STRING, # Path to Gres device + "Files": S_P_STRING, # Path to Gres device + "Flags": S_P_STRING, # GRES Flags + "Link": S_P_STRING, # Communication link IDs + "Links": S_P_CSV, # Communication link IDs + "MultipleFiles": S_P_CSV, # list of GRES device files + "Name": S_P_STRING, # Gres name + "Type": S_P_STRING # Gres type (e.g. model name) } all_confs = { @@ -529,8 +533,8 @@ class SlurmParserEnum(str, Enum): "slurmdbd": slurmdbd_options, "cgroup": cgroup_options, "mpi": mpi_options, - # "gres": gres_options, - # GRES can have different combinations, hence excluded + "gres": gres_options, + # TODO: GRES can have different combinations, NodeName and Name # https://slurm.schedmd.com/gres.conf.html#SECTION_EXAMPLES "PartitionName": partition_options, "NodeName": nodename_options, diff --git a/common/library/modules/slurm_conf.py b/common/library/modules/slurm_conf.py index a35586e89c..3d2b14b24d 100644 --- a/common/library/modules/slurm_conf.py +++ b/common/library/modules/slurm_conf.py @@ -19,28 +19,28 @@ version_added: "1.0.0" description: - This module provides utilities for working with Slurm configuration files. - - It can parse a Slurm conf file into a dictionary (f2d). - - It can convert a dictionary back to Slurm conf INI format (d2f). + - It can parse a Slurm conf file into a dictionary (parse). + - It can convert a dictionary back to Slurm conf INI format (render). - It can merge multiple configuration sources (files and/or dicts) into one (merge). options: op: description: - The operation to perform. - - C(f2d) - File to dict. Parse a Slurm conf file and return as dictionary. - - C(d2f) - Dict to file. Convert a dictionary to Slurm conf INI lines. + - C(parse) - File to dict. Parse a Slurm conf file and return as dictionary. + - C(render) - Dict to file. Convert a dictionary to Slurm conf INI lines. - C(merge) - Merge multiple configuration sources into one. 
required: true type: str - choices: ['f2d', 'd2f', 'merge'] + choices: ['parse', 'render', 'merge'] path: description: - Path to the Slurm configuration file. - - Required when I(op=f2d). + - Required when I(op=parse). type: str conf_map: description: - Dictionary of configuration key-value pairs. - - Required when I(op=d2f). + - Required when I(op=render). type: dict default: {} conf_sources: @@ -58,19 +58,15 @@ - Used for validation of configuration keys. type: str default: slurm - choices: ['slurm', 'cgroup', 'gres', 'mpi', 'slurmdbd'] author: - - Jagadeesh N V (jagadeesh.n.v@dell.com) -notes: - - Requires Python 3.7+ for ordered dict behavior. - - Array-type parameters (NodeName, PartitionName, SlurmctldHost, etc.) are handled specially. + - Jagadeesh N V (@jagadeeshnv) ''' EXAMPLES = r''' # Parse a slurm.conf file into a dictionary - name: Read slurm.conf slurm_conf: - op: f2d + op: parse path: /etc/slurm/slurm.conf conf_name: slurm register: slurm_config @@ -78,13 +74,12 @@ # Convert a dictionary to slurm.conf INI lines - name: Generate slurm.conf lines slurm_conf: - op: d2f + op: render conf_map: ClusterName: mycluster SlurmctldPort: 6817 SlurmctldHost: - - SlurmctldHost: controller1 - - SlurmctldHost: controller2 + - controller2 NodeName: - NodeName: node[1-10] CPUs: 16 @@ -118,47 +113,45 @@ ''' RETURN = r''' -slurm_dict: - description: Parsed configuration as a dictionary (when op=f2d). - type: dict - returned: when op=f2d - sample: {"ClusterName": "mycluster", "SlurmctldPort": "6817"} -slurm_conf: - description: Configuration as INI-format lines (when op=d2f). - type: list - returned: when op=d2f - sample: ["ClusterName=mycluster", "SlurmctldPort=6817"] conf_dict: - description: Merged configuration as a dictionary (when op=merge). + description: Merged configuration as a dictionary (when op=merge or op=parse). 
type: dict - returned: when op=merge + returned: when op=merge or op=parse sample: {"ClusterName": "mycluster", "SlurmctldTimeout": 120} ini_lines: - description: Merged configuration as INI-format lines (when op=merge). + description: Merged configuration as INI-format lines (when op=merge or op=render). type: list - returned: when op=merge + returned: when op=merge or op=render sample: ["ClusterName=mycluster", "SlurmctldTimeout=120"] ''' +# TODO: +# - Module is not case sensitive for conf keys +# - Support for validation of S_P_ types +# - Validation for choices for each type +# - Choices types for each type +# - Merge of sub options +# - Hostlist expressions, split and merge computations + + from collections import OrderedDict from ansible.module_utils.basic import AnsibleModule from ansible.module_utils.input_validation.common_utils.slurm_conf_utils import SlurmParserEnum, all_confs import os -# NOTE: depends on python3.7+ where dict order is maintained - def read_dict2ini(conf_dict): + """Convert a configuration dictionary to INI-style lines for slurm.conf.""" data = [] for k, v in conf_dict.items(): if isinstance(v, list): for dct_item in v: if isinstance(dct_item, dict): # TODO: Ordered dict, move the key to the top - # od = OrderedDict([('a', 1), ('b', 2), ('c', 3)]) - # od.move_to_end('c', last=False) # Move 'c' to the beginning + od = OrderedDict(dct_item) + od.move_to_end(k, last=False) # Move k to the beginning data.append( - " ".join(f"{key}={value}" for key, value in dct_item.items())) + " ".join(f"{key}={value}" for key, value in od.items())) else: data.append(f"{k}={dct_item}") else: @@ -166,77 +159,71 @@ def read_dict2ini(conf_dict): return data -def parse_slurm_conf(file_path, module): +def parse_slurm_conf(file_path, conf_name, validate): """Parses the slurm.conf file and returns it as a dictionary.""" - # slurm_dict = {"NodeName": [], "PartitionName": []} - conf_name = module.params['conf_name'] - current_conf = all_confs.get(conf_name) + 
current_conf = all_confs.get(conf_name, {}) slurm_dict = OrderedDict() if not os.path.exists(file_path): raise FileNotFoundError(f"{file_path} not found.") - with open(file_path, 'r') as f: + with open(file_path, 'r', encoding='utf-8') as f: for line in f: # handles any comment after the data line = line.split('#')[0].strip() - - # Skip comments and empty lines if not line: continue # Split the line by one or more spaces items = line.split() tmp_dict = OrderedDict() for item in items: - # module.warn(f"Item: {item}") # Split only on the first '=' to allow '=' inside the value key, value = item.split('=', 1) tmp_dict[key.strip()] = value.strip() skey = list(tmp_dict.keys())[0] - if skey not in current_conf: - raise Exception(f"Invalid key while parsing {file_path}: {skey}") - # if current_conf[skey] == SlurmParserEnum.S_P_ARRAY or len(tmp_dict) > 1: - if current_conf[skey] == SlurmParserEnum.S_P_ARRAY: - # TODO hostlist expressions and multiple DEFAULT entries handling - # if len(tmp_dict) == 1: - # first_key = list(tmp_dict.keys())[0] - # first_value = list(tmp_dict.values())[0] - # slurm_dict[first_key] = list( - # slurm_dict.get(first_key, [])) + [first_value] - # else: + if validate and skey not in current_conf: + raise ValueError(f"Invalid key while parsing {file_path}: {skey}") + if current_conf.get(skey) == SlurmParserEnum.S_P_ARRAY: slurm_dict[list(tmp_dict.keys())[0]] = list( slurm_dict.get(list(tmp_dict.keys())[0], [])) + [tmp_dict] + elif current_conf.get(skey) == SlurmParserEnum.S_P_CSV: + existing_values = [v.strip() for v in slurm_dict.get(skey, "").split(',') if v.strip()] + new_values = [v.strip() for v in tmp_dict[skey].split(',') if v.strip()] + slurm_dict[skey] = ",".join(list(dict.fromkeys(existing_values + new_values))) + elif current_conf.get(skey) == SlurmParserEnum.S_P_LIST: + slurm_dict[skey] = list(slurm_dict.get(skey, [])) + list(tmp_dict.values()) else: - # TODO handle csv values, currently no definite data type for csv values 
slurm_dict.update(tmp_dict) return slurm_dict -def slurm_conf_dict_merge(conf_dict_list, module): +def slurm_conf_dict_merge(conf_dict_list, conf_name): + """Merge multiple Slurm configuration dictionaries into a single dictionary.""" merged_dict = OrderedDict() + current_conf = all_confs.get(conf_name, {}) for conf_dict in conf_dict_list: for ky, vl in conf_dict.items(): - if isinstance(vl, list): + if current_conf.get(ky) == SlurmParserEnum.S_P_ARRAY: for item in vl: if isinstance(item, dict): - # module.warn(f"DICT Key: {ky}, Value: {vl}") existing_dict = merged_dict.get(ky, {}) inner_dict = existing_dict.get(item.get(ky), {}) inner_dict.update(item) # TODO Partition node combiner logic existing_dict[item.get(ky)] = inner_dict merged_dict[ky] = existing_dict - else: - # module.warn(f"LIST Key: {ky}, Value: {vl}") - existing_list = merged_dict.get(ky, []) - # module.warn(f"Existing list: {existing_list}") - # module.warn(f"Item: {item}") - if item not in existing_list: - # existing_list.append(item) - existing_list.update(item) - # module.warn(f"Updated list: {existing_list}") - merged_dict[ky] = existing_list + elif current_conf.get(ky) == SlurmParserEnum.S_P_LIST: + existing_list = merged_dict.get(ky, []) + if isinstance(vl, list): + new_items = vl + else: + new_items = [vl] + merged_dict[ky] = list(dict.fromkeys(existing_list + new_items)) + elif current_conf.get(ky) == SlurmParserEnum.S_P_CSV: + existing_values = [v.strip() for v in merged_dict.get(ky, "").split(',') if v.strip()] + new_values = [v.strip() for v in vl.split(',') if v.strip()] + merged_dict[ky] = ",".join(list(dict.fromkeys(existing_values + new_values))) else: merged_dict[ky] = vl # flatten the dict @@ -248,50 +235,51 @@ def slurm_conf_dict_merge(conf_dict_list, module): def run_module(): + """Entry point for the Ansible module handling slurm.conf operations.""" module_args = { "path": {'type': 'str'}, - "op": {'type': 'str', 'required': True, 'choices': ['f2d', 'd2f', 'merge']}, + "op": 
{'type': 'str', 'required': True, 'choices': ['parse', 'render', 'merge']}, "conf_map": {'type': 'dict', 'default': {}}, "conf_sources": {'type': 'list', 'elements': 'raw', 'default': []}, - "conf_name": {'type': 'str', 'default': 'slurm'} + "conf_name": {'type': 'str', 'default': 'slurm'}, + "validate": {'type': 'bool', 'default': False} } - result = {"changed": False, "slurm_dict": {}, "failed": False} + result = {"changed": False, "failed": False} # Create the AnsibleModule object module = AnsibleModule(argument_spec=module_args, required_if=[ - ('op', 'd2f', ('conf_map',)), + ('op', 'render', ('conf_map',)), ('op', 'merge', ('conf_sources',)) ], supports_check_mode=True) try: + conf_name = module.params['conf_name'] + validate = module.params['validate'] # Parse the slurm.conf file - if module.params['op'] == 'f2d': - s_dict = parse_slurm_conf(module.params['path'], module) - result['slurm_dict'] = s_dict - elif module.params['op'] == 'd2f': + if module.params['op'] == 'parse': + s_dict = parse_slurm_conf(module.params['path'], conf_name, validate) + result['conf_dict'] = s_dict + elif module.params['op'] == 'render': s_list = read_dict2ini(module.params['conf_map']) - result['slurm_conf'] = s_list + result['ini_lines'] = s_list elif module.params['op'] == 'merge': conf_dict_list = [] for conf_source in module.params['conf_sources']: if isinstance(conf_source, dict): - conf_dict_list.append(conf_source) + conf_dict_list.append(OrderedDict(conf_source)) elif isinstance(conf_source, str): if not os.path.exists(conf_source): - raise Exception(f"File {conf_source} does not exist") - s_dict = parse_slurm_conf(conf_source, module) - # module.warn(f"Conf dict: {s_dict}") - conf_dict_list.append(s_dict) - # module.warn("After append") + raise FileNotFoundError(f"File {conf_source} does not exist") + s_dict = parse_slurm_conf(conf_source, conf_name, validate) + conf_dict_list.append(OrderedDict(s_dict)) else: - raise Exception(f"Invalid type for conf_source: 
{type(conf_source)}") - # module.exit_json(changed=False, conf_dict=conf_dict_list) - merged_dict = slurm_conf_dict_merge(conf_dict_list, module) + raise TypeError(f"Invalid type for conf_source: {type(conf_source)}") + merged_dict = slurm_conf_dict_merge(conf_dict_list, conf_name) result['conf_dict'] = merged_dict result['ini_lines'] = read_dict2ini(merged_dict) - except Exception as e: + except (FileNotFoundError, ValueError, TypeError, AttributeError) as e: result['failed'] = True result['msg'] = str(e) module.fail_json(msg=str(e)) diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 index 5051e16177..82646da1c6 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 @@ -126,9 +126,8 @@ - mkdir -p /var/log/slurm /var/run/slurm /var/spool /var/lib/slurm /etc/slurm/epilog.d /etc/munge /cert /var/log/track /var/lib/packages /hpc_tools/container_images /hpc_tools/scripts - echo "{{ cloud_init_nfs_path }}/cert /cert nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/log/slurm /var/log/slurm nfs defaults,_netdev 0 0" >> /etc/fstab - - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/spool /var/spool nfs defaults,_netdev 0 0" >> /etc/fstab + - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/spool/slurmd /var/spool/slurmd nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/etc/slurm/epilog.d /etc/slurm/epilog.d nfs defaults,_netdev 0 0" >> /etc/fstab - - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/spool /var/spool nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/etc/munge /etc/munge nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ trackfile_nfs_path }} 
/var/log/track nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path}}/hpc_tools/container_images /hpc_tools/container_images nfs defaults,_netdev 0 0" >> /etc/fstab @@ -153,6 +152,7 @@ - chmod {{ file_mode_755 }} /var/log/slurm /var/run/slurm /var/spool /var/lib/slurm - chmod {{ file_mode_400 }} /etc/munge/munge.key - chmod {{ file_mode_755 }} /etc/slurm/epilog.d/ + - chmod {{ file_mode_755 }} /etc/slurm/epilog.d/logout_user.sh - mkdir -p /var/spool/slurmd - chmod {{ file_mode_755 }} /var/spool/slurmd - chown -R {{ slurm_user }}:{{ slurm_user }} /var/spool/slurmd diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2 index a384d64674..2f2721d7eb 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2 @@ -307,10 +307,10 @@ content: | SELECT VERSION(); SHOW DATABASES; - - CREATE DATABASE slurm_acct_db; - CREATE USER 'slurm'@'%' IDENTIFIED BY '{{ hostvars['localhost']['slurm_db_password'] }}'; - GRANT ALL PRIVILEGES ON slurm_acct_db.* TO 'slurm'@'%'; + CREATE DATABASE IF NOT EXISTS {{ apply_config['slurmdbd']['StorageLoc'] }}; + CREATE USER IF NOT EXISTS '{{ apply_config['slurmdbd']['SlurmUser'] }}'@'%' IDENTIFIED BY '{{ hostvars['localhost']['slurm_db_password'] }}'; + ALTER USER '{{ apply_config['slurmdbd']['SlurmUser'] }}'@'%' IDENTIFIED BY '{{ hostvars['localhost']['slurm_db_password'] }}'; + GRANT ALL PRIVILEGES ON {{ apply_config['slurmdbd']['StorageLoc'] }}.* TO '{{ apply_config['slurmdbd']['SlurmUser'] }}'@'%'; FLUSH PRIVILEGES; - path: /root/omnia_slurm_scripts/00_munge_setup.sh @@ -405,7 +405,7 @@ echo "${value:-$default}" } #dir StateSaveLocation - StateSaveLocation=$(get_value_slurm_conf "StateSaveLocation" "/var/spool") + 
StateSaveLocation=$(get_value_slurm_conf "StateSaveLocation" "/var/spool/slurmctld") mkdir -pv $StateSaveLocation chown -v "$SLURM_USER:$SLURM_GROUP" $StateSaveLocation chmod -v 0744 $StateSaveLocation @@ -477,7 +477,7 @@ - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/log/slurm /var/log/slurm nfs defaults,_netdev 0 0" >> /etc/fstab {% if powervault_config is not defined %} - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/lib/mysql /var/lib/mysql nfs defaults,_netdev 0 0" >> /etc/fstab - - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/spool /var/spool nfs defaults,_netdev 0 0" >> /etc/fstab + - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/spool/slurmctld /var/spool/slurmctld nfs defaults,_netdev 0 0" >> /etc/fstab {% endif %} - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/etc/munge /etc/munge nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ trackfile_nfs_path }} /var/log/track nfs defaults,_netdev 0 0" >> /etc/fstab diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 index 80347f6854..5128aee1d1 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 @@ -261,7 +261,7 @@ echo "[INFO] Updating /etc/fstab with NFS entries for Pulp cert, Slurm and Munge paths" echo "{{ cloud_init_nfs_path }}/cert /cert nfs defaults,_netdev 0 0" >> /etc/fstab echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/log/slurm /var/log/slurm nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/spool /var/spool nfs defaults,_netdev 0 0" >> /etc/fstab + echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/spool/slurmd /var/spool/slurmd nfs defaults,_netdev 0 0" >> /etc/fstab echo "{{ cloud_init_nfs_path }}/$(hostname -s)/etc/slurm/epilog.d /etc/slurm/epilog.d 
nfs defaults,_netdev 0 0" >> /etc/fstab echo "{{ cloud_init_nfs_path }}/$(hostname -s)/etc/munge /etc/munge nfs defaults,_netdev 0 0" >> /etc/fstab echo "{{ trackfile_nfs_path }} /var/log/track nfs defaults,_netdev 0 0" >> /etc/fstab diff --git a/discovery/roles/slurm_config/defaults/main.yml b/discovery/roles/slurm_config/defaults/main.yml index 47a8f3dd64..03ea48760c 100644 --- a/discovery/roles/slurm_config/defaults/main.yml +++ b/discovery/roles/slurm_config/defaults/main.yml @@ -263,8 +263,9 @@ __default_config: SlurmctldPort: 6817 SlurmdPort: 6818 SrunPortRange: "60001-63000" - StateSaveLocation: "/var/spool/state" + StateSaveLocation: "/var/spool/slurmctld" SlurmdSpoolDir: "/var/spool/slurmd" + SlurmctldParameters: "{{ slurm_ctld_parameters | join(',') }}" ReturnToService: 2 SchedulerType: sched/backfill MpiDefault: none @@ -283,6 +284,16 @@ __default_config: SlurmctldTimeout: 120 SlurmdTimeout: 300 Epilog: "/etc/slurm/epilog.d/logout_user.sh" + PluginDir: "{{ plugin_slurm_dir }}" + NodeName: + - NodeName: DEFAULT + State: UNKNOWN + PartitionName: + - PartitionName: DEFAULT + Nodes: ALL + Default: true + MaxTime: INFINITE + State: UP # S_P_ARRAY type paramater to be provided this way # Epilog: # - Epilog: "/etc/slurm/epilog.d/logout_user.sh" @@ -293,5 +304,10 @@ __default_config: SlurmUser: "{{ slurm_user }}" StorageType: accounting_storage/mysql StorageLoc: slurm_acct_db + StoragePort: "{{ slurm_db_port }}" + StorageUser: "{{ slurm_dbd_db_username }}" + StoragePass: "{{ slurm_db_password }}" + PluginDir: "{{ plugin_slurm_dir }}" + DbdPort: "{{ slurm_dbd_port }}" gres: AutoDetect: nvml diff --git a/discovery/roles/slurm_config/tasks/build_slurm_conf.yml b/discovery/roles/slurm_config/tasks/build_slurm_conf.yml new file mode 100644 index 0000000000..cd72cf33f0 --- /dev/null +++ b/discovery/roles/slurm_config/tasks/build_slurm_conf.yml @@ -0,0 +1,53 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- +- name: Append node_params list into NodeName list + ansible.builtin.set_fact: + apply_config: "{{ apply_config | default({}) + | combine({'slurm': (apply_config['slurm'] + | combine({'NodeName': (apply_config['slurm'].NodeName | default([])) + (node_params | default([]))}))}) }}" + when: node_params is defined and node_params + +- name: Append login nodes to NodeName list + ansible.builtin.set_fact: + apply_config: "{{ apply_config | default({}) + | combine({'slurm': (apply_config['slurm'] + | combine({'NodeName': (apply_config['slurm'].NodeName | default([])) + [{'NodeName': item}]}))}) }}" + loop: "{{ login_list }}" + when: login_list is defined and login_list + +- name: Append compiler login nodes to NodeName list + ansible.builtin.set_fact: + apply_config: "{{ apply_config | default({}) + | combine({'slurm': (apply_config['slurm'] + | combine({'NodeName': (apply_config['slurm'].NodeName | default([])) + [{'NodeName': item}]}))}) }}" + loop: "{{ compiler_login_list }}" + when: compiler_login_list is defined and compiler_login_list + +- name: Append Partition + ansible.builtin.set_fact: + apply_config: "{{ apply_config | default({}) + | combine({'slurm': (apply_config['slurm'] + | combine({'PartitionName': (apply_config['slurm'].PartitionName | default([])) + [partition_params]}))}) }}" + when: node_params is defined and node_params + +- name: Add gpu parameters to slurm conf + ansible.builtin.set_fact: + 
apply_config: "{{ apply_config | default({}) | combine({'slurm': (apply_config['slurm'] | combine(gpu_slurm_conf))}) }}" + when: gpu_params is defined and gpu_params + +- name: Add dbd parameters to slurm conf + ansible.builtin.set_fact: + apply_config: "{{ apply_config | default({}) | combine({'slurm': (apply_config['slurm'] | combine(dbd_slurm_conf))}) }}" + when: dbd_list is defined and dbd_list diff --git a/discovery/roles/slurm_config/tasks/check_ctld_running.yml b/discovery/roles/slurm_config/tasks/check_ctld_running.yml new file mode 100644 index 0000000000..ee57c1603f --- /dev/null +++ b/discovery/roles/slurm_config/tasks/check_ctld_running.yml @@ -0,0 +1,53 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- +- name: Initialize ctld_state dict + ansible.builtin.set_fact: + ctld_state: "{{ ctld_state | default({}) | combine({item: false}) }}" + +- name: Check if remote host is reachable via SSH + ansible.builtin.wait_for: + host: "{{ item }}" + port: 22 # TODO: make it configurable + timeout: 10 + state: started + delegate_to: localhost + register: ssh_check + ignore_errors: true + # debugger: always + +- name: Check if slurmctld is running on remote host + ansible.builtin.service_facts: + delegate_to: "{{ item }}" + register: service_facts + when: ssh_check is success + ignore_errors: true + +- name: Update ctld_state if slurmctld is running + ansible.builtin.set_fact: + ctld_state: "{{ ctld_state | combine({item: true}) }}" + when: + - ssh_check is success + - service_facts is success + - ansible_facts.services['slurmctld.service'] is defined + - ansible_facts.services['slurmctld.service'].state == 'running' + +- name: Trigger the scontrol reconfigure + ansible.builtin.command: scontrol reconfigure + changed_when: scontrol_reconfig.rc == 0 + failed_when: false + register: scontrol_reconfig + delegate_to: "{{ item }}" + when: ctld_state[item] is true + # debugger: always diff --git a/discovery/roles/slurm_config/tasks/conf_merge.yml b/discovery/roles/slurm_config/tasks/conf_merge.yml deleted file mode 100644 index 4677113cb2..0000000000 --- a/discovery/roles/slurm_config/tasks/conf_merge.yml +++ /dev/null @@ -1,30 +0,0 @@ -# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. ---- -- name: Merge the default and custom slurm conf - slurm_conf: - op: merge - conf_sources: - - "{{ slurm_config_path }}/{{ item.0 }}/etc/slurm/{{ item.1 }}.conf" - - "{{ configs_input.get(item.1, {}) }}" # Either map or file path - conf_name: "{{ item.1 }}" - register: merged_conf - # debugger: always - -- name: Write merged .conf - ansible.builtin.copy: - content: "{{ merged_conf.ini_lines | join('\n') }}\n" - dest: "{{ slurm_config_path }}/{{ item.0 }}/etc/slurm/{{ item.1 }}.conf" - mode: "{{ conf_file_mode }}" - remote_src: "{{ copy_from_oim }}" diff --git a/discovery/roles/slurm_config/tasks/confs.yml b/discovery/roles/slurm_config/tasks/confs.yml index 8150214bfd..4d1008d813 100644 --- a/discovery/roles/slurm_config/tasks/confs.yml +++ b/discovery/roles/slurm_config/tasks/confs.yml @@ -21,28 +21,86 @@ when: cmpt_list loop: "{{ cmpt_list }}" -- name: Add gpu parameters to slurm conf - ansible.builtin.set_fact: - apply_config: "{{ apply_config | default({}) | combine({'slurm': (apply_config['slurm'] | combine(gpu_slurm_conf))}) }}" - when: gpu_params is defined and gpu_params +- name: Build slurm.conf + ansible.builtin.include_tasks: build_slurm_conf.yml -# TODO: Move to input validation with all confs -- name: Slurm dict ops +- name: Slurm dbd opts ansible.builtin.set_fact: - slurm_conf_dict: "{{ apply_config['slurm'] }}" - -- name: Create all .conf for ctld only Write all files default config - ansible.builtin.template: - src: "{{ item.1 }}.conf.j2" - dest: "{{ slurm_config_path }}/{{ item.0 }}/etc/slurm/{{ item.1 }}.conf" - owner: "{{ root_user }}" - group: "{{ root_group }}" - mode: "{{ conf_file_mode }}" + apply_config: "{{ apply_config | default({}) + | combine({'slurmdbd': (apply_config['slurmdbd'] | combine({'DbdHost': ctld_list[0], 'StorageHost': ctld_list[0]}))}) }}" when: ctld_list - loop: "{{ ctld_list | product(conf_files | default([])) }}" 
-- name: Conf merge and write using slurm_conf module - ansible.builtin.include_tasks: conf_merge.yml +- name: Check .conf files existence + ansible.builtin.stat: + path: "{{ slurm_config_path }}/{{ item.0 }}/etc/slurm/{{ item.1 }}.conf" when: ctld_list - loop: "{{ ctld_list | product(configs_input.keys() | default([])) }}" -# TODO: To apply the cluster_name after merge + loop: "{{ ctld_list | product(conf_files | default([])) }}" + register: ctld_conf_files + +- name: Create lists for conf_merge + ansible.builtin.set_fact: + conf_merge_dict: "{{ + conf_merge_dict | default({}) + | combine({ + conf_set.item.1: ( + [apply_config[conf_set.item.1]] + + ([conf_set.stat.path] if conf_set.stat.exists else []) + + ([configs_input.get(conf_set.item.1)] if configs_input.get(conf_set.item.1) else []) + ) + }) + }}" + loop: "{{ ctld_conf_files.results }}" + loop_control: + loop_var: conf_set + register: prepared_conf_lists + +- name: Prepend ClusterName and SlurmctldHost to slurm conf sources + ansible.builtin.set_fact: # TODO: Change order if needed + conf_merge_dict: "{{ conf_merge_dict + | combine({'slurm': [{'ClusterName': cluster_name, 'SlurmctldHost': ctld_list}] + conf_merge_dict['slurm']}) }}" + when: "'slurm' in conf_merge_dict" + +- name: Merge the confs + slurm_conf: + op: merge + conf_sources: "{{ item.value }}" + conf_name: "{{ item.key }}" + loop: "{{ conf_merge_dict | dict2items }}" + register: merged_conf + +- name: Update slurm_conf_dict with merged configuration for cloud_init read. 
# TODO: Remove cloud init dependency + ansible.builtin.set_fact: + slurm_conf_dict: "{{ (merged_conf.results | selectattr('item.key', 'equalto', 'slurm') | first).conf_dict }}" + when: "'slurm' in conf_merge_dict" + +- name: Create directories from conf values + ansible.builtin.include_tasks: exist_dir.yml + loop: + - "{{ ctld_list + | product([slurm_conf_dict.get('StateSaveLocation', '/var/spool/slurmctld'), + (slurm_conf_dict.get('SlurmctldLogFile', '/var/log/slurmctld.log') | dirname), + (slurm_conf_dict.get('SlurmctldPidFile', '/var/run/slurmctld.pid') | dirname)]) }}" + - "{{ (cmpt_list + login_list + compiler_login_list) + | product([slurm_conf_dict.get('SlurmdSpoolDir', '/var/spool/slurmd'), + (slurm_conf_dict.get('SlurmdLogFile', '/var/log/slurmd.log') | dirname), + (slurm_conf_dict.get('SlurmdPidFile', '/var/run/slurmd.pid') | dirname)]) }}" + loop_control: + loop_var: product + +- name: Write merged .conf + ansible.builtin.copy: + content: "{{ item.ini_lines | join('\n') }}\n" + dest: "{{ slurm_config_path }}/{{ ctld_list[0] }}/etc/slurm/{{ item.item.key }}.conf" + mode: "{{ conf_file_mode }}" + owner: "{{ slurm_user }}" + group: "{{ slurm_user_group }}" + remote_src: "{{ copy_from_oim }}" + loop: "{{ merged_conf.results }}" + register: ctld_conf_files + +- name: Check if cluster running + ansible.builtin.include_tasks: check_ctld_running.yml + when: + - ctld_list + - ctld_conf_files is changed + loop: "{{ ctld_list }}" diff --git a/discovery/roles/slurm_config/tasks/create_slurm_dir.yml b/discovery/roles/slurm_config/tasks/create_slurm_dir.yml index 662802274b..9ce43dcd6a 100644 --- a/discovery/roles/slurm_config/tasks/create_slurm_dir.yml +++ b/discovery/roles/slurm_config/tasks/create_slurm_dir.yml @@ -67,6 +67,18 @@ ansible.builtin.include_tasks: openldap_config.yml when: hostvars['localhost']['openldap_support'] +- name: Create slurm group + ansible.builtin.group: + name: "{{ slurm_user_group }}" + gid: "{{ slurm_uid }}" + +- name: Create slurm User 
+ ansible.builtin.user: + name: "{{ slurm_user }}" + uid: "{{ slurm_uid }}" + group: "{{ slurm_user_group }}" + create_home: false + - name: Set facts for slurm ansible.builtin.set_fact: share_prefix: "{{ slurm_config_path }}" @@ -96,25 +108,15 @@ group: root mode: "{{ common_mode }}" -- name: Create the slurm ctld directory on share - ansible.builtin.file: - path: "{{ slurm_config_path }}/{{ item[0] }}{{ item[1] }}" - state: directory - owner: root - group: root - mode: "{{ common_mode }}" - when: ctld_list - loop: "{{ ctld_list | product(ctld_dir) }}" - -- name: Create the slurm cmpt directory on share - ansible.builtin.file: - path: "{{ slurm_config_path }}/{{ item[0] }}{{ item[1] }}" - state: directory - owner: root - group: root - mode: "{{ common_mode }}" - when: cmpt_list or login_list or compiler_login_list - loop: "{{ (cmpt_list + login_list + compiler_login_list) | product(cmpt_dir) }}" +- name: Create all common directories + ansible.builtin.include_tasks: exist_dir.yml + loop: + - "{{ (ctld_list + cmpt_list + login_list + compiler_login_list) | product(common_dir) }}" + - "{{ ctld_list | product(ctld_dir) }}" + - "{{ dbd_list | product(db_dir) }}" + - "{{ cmpt_list | product(cmpt_dir) }}" + loop_control: + loop_var: product - name: Create the cert directory on share ansible.builtin.file: @@ -179,11 +181,11 @@ dest: "{{ slurm_config_path }}/{{ item.0 }}/etc/slurm/epilog.d/{{ item.1 }}" owner: "{{ root_user }}" group: "{{ root_group }}" - mode: "{{ conf_file_mode }}" + mode: "{{ common_mode }}" when: cmpt_list loop: "{{ cmpt_list | product(['logout_user.sh', 'slurmd.service']) }}" -- name: Create slurmd.service in login and login_compiler +- name: Create logout_user.sh and slurmd.service in login and login_compiler ansible.builtin.template: src: "{{ item.1 }}.j2" dest: "{{ slurm_config_path }}/{{ item.0 }}/etc/slurm/epilog.d/{{ item.1 }}" diff --git a/discovery/roles/slurm_config/tasks/exist_dir.yml b/discovery/roles/slurm_config/tasks/exist_dir.yml new 
file mode 100644 index 0000000000..92e88a47ae --- /dev/null +++ b/discovery/roles/slurm_config/tasks/exist_dir.yml @@ -0,0 +1,27 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- +# Create directories if not exist +- name: Check if directories exist + ansible.builtin.stat: + path: "{{ slurm_config_path }}/{{ item[0] }}{{ item[1] }}" + loop: "{{ product }}" + register: existing_dir + +- name: Create directories if not exist + ansible.builtin.file: # noqa: risky-file-permissions + path: "{{ slurm_config_path }}/{{ item.item.0 }}{{ item.item.1 }}" + state: directory + loop: "{{ existing_dir.results }}" + when: not item.stat.exists diff --git a/discovery/roles/slurm_config/tasks/read_node_idrac.yml b/discovery/roles/slurm_config/tasks/read_node_idrac.yml index 5b8b29f571..8424f69603 100644 --- a/discovery/roles/slurm_config/tasks/read_node_idrac.yml +++ b/discovery/roles/slurm_config/tasks/read_node_idrac.yml @@ -72,7 +72,7 @@ - name: Calculate proc facts ansible.builtin.set_fact: - proc_params: "{{ {} | combine({'Sockets': (1 if (cpus | length == 0) else (cpus | length))}) + proc_params: "{{ {'NodeName': item} | combine({'Sockets': (1 if (cpus | length == 0) else (cpus | length))}) | combine({'CoresPerSocket': (cpus[0].TotalEnabledCores | default(default_corespersocket))}) | combine({'ThreadsPerCore': ((cpus[0].TotalThreads | default(default_threadspercore)) // (cpus[0].TotalCores | default(1)))}) 
| combine({'RealMemory': real_memory | default(default_real_memory) }) @@ -84,5 +84,5 @@ - name: Add to Nodeparam dict ansible.builtin.set_fact: - node_params: "{{ node_params | default({}) | combine({item: proc_params}) }}" + node_params: "{{ (node_params | default([])) + [proc_params] }}" gpu_params: "{{ gpu_params | default({}) | combine({item: gpus} if gpus else {}) }}" diff --git a/discovery/roles/slurm_config/templates/cgroup.conf.j2 b/discovery/roles/slurm_config/templates/cgroup.conf.j2 deleted file mode 100644 index d10c75ac57..0000000000 --- a/discovery/roles/slurm_config/templates/cgroup.conf.j2 +++ /dev/null @@ -1,6 +0,0 @@ -{% for key in apply_config['cgroup'] | sort %} -{% set val = apply_config['cgroup'][key] %} -{% if val is not none and val != omit %} -{{ key }}={{ 'yes' if val is sameas true else ('no' if val is sameas false else val) }} -{% endif %} -{% endfor %} \ No newline at end of file diff --git a/discovery/roles/slurm_config/templates/gres.conf.j2 b/discovery/roles/slurm_config/templates/gres.conf.j2 deleted file mode 100644 index 805fc86e66..0000000000 --- a/discovery/roles/slurm_config/templates/gres.conf.j2 +++ /dev/null @@ -1,6 +0,0 @@ -{% for key in apply_config['gres'] | sort %} -{% set val = apply_config['gres'][key] %} -{% if val is not none and val != omit %} -{{ key }}={{ 'yes' if val is sameas true else ('no' if val is sameas false else val) }} -{% endif %} -{% endfor %} \ No newline at end of file diff --git a/discovery/roles/slurm_config/templates/slurm.conf.j2 b/discovery/roles/slurm_config/templates/slurm.conf.j2 deleted file mode 100644 index a720637b38..0000000000 --- a/discovery/roles/slurm_config/templates/slurm.conf.j2 +++ /dev/null @@ -1,47 +0,0 @@ -ClusterName={{ cluster_name }} -{% for ctld in ctld_list %} -SlurmctldHost={{ ctld }} -{% endfor %} -{% if slurm_installation_type == configless_slurm %} -{% set slurm_ctld_parameters = (slurm_ctld_parameters | default([]) ) + ['enable_configless'] %} -{% endif %} -{% if 
slurm_ctld_parameters | length > 0 %} -SlurmctldParameters={{ slurm_ctld_parameters | join(',') }} -{% endif %} -PluginDir={{ installroot }}{{ plugin_slurm_dir }} -{% for key in apply_config['slurm'] | sort %} -{% set val = apply_config['slurm'][key] %} -{% if val is not none and val != omit %} -{{ key }}={{ 'yes' if val is sameas true else ('no' if val is sameas false else val) }} -{% endif %} -{% endfor %} - -# SLURM DBD -{% if dbd_list %} -AccountingStorageHost={{ dbd_list[0] }} -AccountingStoragePort={{ slurm_dbd_port }} -AccountingStorageType=accounting_storage/slurmdbd -{% endif %} - -# COMPUTE NODES -NodeName=DEFAULT State=UNKNOWN -# TODO: Default case no compute nodes to be handled -{% for cmpt in (cmpt_list | default([])) %} -NodeName={{ cmpt }}{% for k in node_params[cmpt] %} {{ k }}={{ node_params[cmpt][k] }}{% endfor %} - -{% else %} -NodeName=localhost State=UNKNOWN -{% endfor %} -{% set all_login_nodes = (login_list | default([])) + (compiler_login_list | default([])) %} -{% for login_node in all_login_nodes %} -NodeName={{ login_node }} -{% endfor %} - -# PARTITION INFO -PartitionName=DEFAULT Nodes=ALL Default=YES MaxTime=INFINITE State=UP -{% if cmpt_list %} -PartitionName={{ slurm_partition_name }} Nodes={{ cmpt_list | join(',') }} MaxTime=INFINITE State=UP -{% endif %} -{% for i in partitions %} -PartitionName={{ i.PartitionName }}{% for k in i | sort if k != 'PartitionName' %} {{ k }}={{ 'YES' if i[k] is sameas true else ('NO' if i[k] is sameas false else i[k]) }}{% endfor %} -{% endfor %} diff --git a/discovery/roles/slurm_config/templates/slurmdbd.conf.j2 b/discovery/roles/slurm_config/templates/slurmdbd.conf.j2 deleted file mode 100644 index 3978aced17..0000000000 --- a/discovery/roles/slurm_config/templates/slurmdbd.conf.j2 +++ /dev/null @@ -1,18 +0,0 @@ -# ADD DEFAULTS -{% for key in apply_config['slurmdbd'] | sort %} -{% set val = apply_config['slurmdbd'][key] %} -{% if val is not none and val != omit %} -{{ key }}={{ 'yes' if val is sameas 
true else ('no' if val is sameas false else val) }} -{% endif %} -{% endfor %} - -DbdHost={{ ctld_list[0] }} -DbdPort={{ slurm_dbd_port }} -PluginDir={{ installroot }}{{ plugin_slurm_dir }} -# DATABASE INFO -StorageHost={{ ctld_list[0] }} -{% if slurm_db_port is not none and slurm_db_port != omit %} -StoragePort={{ slurm_db_port }} -{% endif %} -StorageUser={{ slurm_dbd_db_username }} -StoragePass={{ slurm_db_password }} \ No newline at end of file diff --git a/discovery/roles/slurm_config/vars/main.yml b/discovery/roles/slurm_config/vars/main.yml index 63bb52fb41..9c547e34ab 100644 --- a/discovery/roles/slurm_config/vars/main.yml +++ b/discovery/roles/slurm_config/vars/main.yml @@ -22,24 +22,17 @@ conf_files: # Must match this MASTER list - cgroup - gres copy_from_oim: false +common_dir: + - /etc/munge ctld_dir: - /etc/slurm +db_dir: - /etc/my.cnf.d - /var/lib/mysql - /var/log/mariadb - - /var/log/slurm - - /var/spool - - /etc/munge - cmpt_dir: - - /var/log/slurm - - /var/spool - - /var/lib/slurm - /etc/slurm/epilog.d - - /etc/munge -login_dir: - - /etc/munge gpu_slurm_conf: GresTypes: gpu SelectType: select/cons_tres @@ -82,9 +75,9 @@ mariadb: mariadb mysql: mysql root_user: root root_group: root -plugin_slurm_dir: "usr/lib64/slurm" +plugin_slurm_dir: "/usr/lib64/slurm" munge_key_cmd: "dd if=/dev/urandom bs=1 count=1024" -slurm_ctld_parameters: [] +slurm_ctld_parameters: ['enable_configless'] partitions: {} _clean_before_install: false _force_install_nfs: true @@ -96,6 +89,15 @@ munge_dir_mode: "0700" common_mode: "0755" slurm_dbd_mode: "0600" slurm_db_cnf_mode: "0600" +dbd_slurm_conf: + AccountingStorageHost: "{{ dbd_list[0] }}" + AccountingStoragePort: "{{ slurm_dbd_port }}" + AccountingStorageType: accounting_storage/slurmdbd +partition_params: + PartitionName: "{{ slurm_partition_name }}" + Nodes: "{{ cmpt_list | join(',') }}" + MaxTime: "INFINITE" + State: "UP" openldap_dir_name: "openldap/" software_config_file: "{{ input_project_dir 
}}/software_config.json" omnia_run_tags: "{{ hostvars['localhost']['omnia_run_tags'] }}"