Skip to content

Commit 272e2a6

Browse files
committed
Fix regex matching rules // add logs if spmix_appdir is not mounted
1 parent c90ec2f commit 272e2a6

File tree

2 files changed

+26
-246
lines changed

2 files changed

+26
-246
lines changed

src/pmix/PMIxHook.cpp

Lines changed: 24 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -160,8 +160,8 @@ void PMIxHook::derivePathsFromScontrol() {
160160
auto ss = std::stringstream{output};
161161
auto line = std::string{};
162162
auto matches = boost::smatch{};
163-
auto reSlurmdSpoolDir = boost::regex{"^SlurmdSpoolDir *= (.*)$"};
164-
auto reTmpFS = boost::regex{"^TmpFS *= (.*)$"};
163+
auto reSlurmdSpoolDir = boost::regex{"^SlurmdSpoolDir *= *([^ ]*).*"};
164+
auto reTmpFS = boost::regex{"^TmpFS *= *([^ ]*).*"};
165165

166166
while (std::getline(ss, line)) {
167167
if (boost::regex_match(line, matches, reSlurmdSpoolDir)) {
@@ -178,10 +178,15 @@ void PMIxHook::derivePathsFromScontrol() {
178178
// Check the sanity of the derived paths.
179179
boost::system::error_code ec;
180180

181-
if (!boost::filesystem::is_directory(pathSlurmdSpoolDir, ec))
182-
SARUS_THROW_ERROR("SlurmdSpoolDir is not a directory");
183-
if (!boost::filesystem::is_directory(pathTmpFS, ec))
184-
SARUS_THROW_ERROR("TmpFS is not a directory");
181+
if (!boost::filesystem::is_directory(pathSlurmdSpoolDir, ec)) {
182+
auto msg = boost::format("SlurmdSpoolDir is not a directory (%s)") % pathSlurmdSpoolDir;
183+
SARUS_THROW_ERROR(msg.str());
184+
}
185+
186+
if (!boost::filesystem::is_directory(pathTmpFS, ec)) {
187+
auto msg = boost::format("TmpFS is not a directory (%s)") % pathTmpFS;
188+
SARUS_THROW_ERROR(msg.str());
189+
}
185190

186191
log("Derived paths from 'scontrol'", libsarus::LogLevel::INFO);
187192
}
@@ -201,16 +206,22 @@ void PMIxHook::mountPMIxDirectories() {
201206
pathAppdirJobStep += ("_" + envSlurmJobID);
202207
pathAppdirJobStep += ("." + envSlurmStepID);
203208

204-
try {
205-
if (!envSlurmJobUID.empty() && boost::filesystem::is_directory(pathAppdirUIDJobStep)) {
209+
if (!envSlurmJobUID.empty() && boost::filesystem::is_directory(pathAppdirUIDJobStep)) {
210+
try {
206211
libsarus::mount::validatedBindMount(pathAppdirUIDJobStep, pathAppdirUIDJobStep, userIdentity, pathRootFS, mount_flags);
207-
log(boost::format("Mounted: %s") % pathAppdirUIDJobStep, libsarus::LogLevel::INFO);
208-
} else {
212+
log(boost::format("Mounted spmix_appdir: %s") % pathAppdirUIDJobStep, libsarus::LogLevel::INFO);
213+
} catch (...) {
214+
// Respectfully ignore. ("nofail")
215+
log(boost::format("Cannot mount spmix_appdir: %s") % pathAppdirUIDJobStep, libsarus::LogLevel::INFO);
216+
}
217+
} else {
218+
try {
209219
libsarus::mount::validatedBindMount(pathAppdirJobStep, pathAppdirJobStep, userIdentity, pathRootFS, mount_flags);
210-
log(boost::format("Mounted: %s") % pathAppdirJobStep, libsarus::LogLevel::INFO);
220+
log(boost::format("Mounted spmix_appdir: %s") % pathAppdirJobStep, libsarus::LogLevel::INFO);
221+
} catch (...) {
222+
// Respectfully ignore. ("nofail")
223+
log(boost::format("Cannot mount spmix_appdir: %s") % pathAppdirJobStep, libsarus::LogLevel::INFO);
211224
}
212-
} catch (...) {
213-
// Respecfully ignore. ("nofail")
214225
}
215226

216227
// Mount "pmix".

tests/assets/scontrol-mock

Lines changed: 2 additions & 233 deletions
Original file line numberDiff line numberDiff line change
@@ -3,239 +3,8 @@
33
if [[ "$1" == "show" ]] && [[ "$2" == "config" ]]; then
44
cat <<EOF
55
Configuration data as of 2025-09-16T10:59:04
6-
AccountingStorageBackupHost = (null)
7-
AccountingStorageEnforce = associations,limits,qos
8-
AccountingStorageHost = zinal-slurmdbd.tds.cscs.ch
9-
AccountingStorageExternalHost = (null)
10-
AccountingStorageParameters = (null)
11-
AccountingStoragePort = 6819
12-
AccountingStorageTRES = cpu,mem,energy,node,billing,fs/disk,vmem,pages,gres/gpu,gres/gpumem,gres/gpuutil
13-
AccountingStorageType = accounting_storage/slurmdbd
14-
AccountingStorageUser = N/A
15-
AccountingStoreFlags = (null)
16-
AcctGatherEnergyType = acct_gather_energy/pm_counters
17-
AcctGatherFilesystemType = (null)
18-
AcctGatherInterconnectType = (null)
19-
AcctGatherNodeFreq = 0 sec
20-
AcctGatherProfileType = (null)
21-
AllowSpecResourcesUsage = No
22-
AuthAltTypes = auth/jwt
23-
AuthAltParameters = jwks=/etc/slurm_extra/jwks.json,userclaimfield=preferred_username
24-
AuthInfo = (null)
25-
AuthType = auth/munge
26-
BatchStartTimeout = 10 sec
27-
BcastExclude = /lib,/usr/lib,/lib64,/usr/lib64
28-
BcastParameters = (null)
29-
BOOT_TIME = 2025-09-01T14:17:43
30-
BurstBufferType = (null)
31-
CliFilterPlugins = (null)
32-
ClusterName = zinal
33-
CommunicationParameters = (null)
34-
CompleteWait = 0 sec
35-
CpuFreqDef = Unknown
36-
CpuFreqGovernors = OnDemand,Performance,UserSpace
37-
CredType = cred/munge
38-
DebugFlags = (null)
39-
DefMemPerNode = UNLIMITED
40-
DependencyParameters = (null)
41-
DisableRootJobs = No
42-
EioTimeout = 60
43-
EnforcePartLimits = NO
44-
EpilogMsgTime = 2000 usec
45-
FairShareDampeningFactor = 1
46-
FederationParameters = (null)
47-
FirstJobId = 1
48-
GetEnvTimeout = 2 sec
49-
GresTypes = (null)
50-
GpuFreqDef = (null)
51-
GroupUpdateForce = 1
52-
GroupUpdateTime = 600 sec
53-
HASH_VAL = Different Ours=0xe7c32fa3 Slurmctld=0xf73d113
54-
HashPlugin = hash/k12
55-
HealthCheckInterval = 0 sec
56-
HealthCheckNodeState = ANY
57-
HealthCheckProgram = (null)
58-
InactiveLimit = 0 sec
59-
InteractiveStepOptions = --interactive --preserve-env --pty $SHELL
60-
JobAcctGatherFrequency = task=30,energy=30,filesystem=30
61-
JobAcctGatherType = jobacct_gather/linux
62-
JobAcctGatherParams = (null)
63-
JobCompHost = localhost
64-
JobCompLoc = /etc/slurm/rdkafka.conf
65-
JobCompParams = flush_timeout=200,poll_interval=3,requeue_on_msg_timeout,topic=raw.slurm.jobcomp.zinal
66-
JobCompPort = 0
67-
JobCompType = jobcomp/kafka
68-
JobCompUser = root
69-
JobContainerType = (null)
70-
JobDefaults = (null)
71-
JobFileAppend = 0
72-
JobRequeue = 1
73-
JobSubmitPlugins = (null)
74-
KillOnBadExit = 0
75-
KillWait = 30 sec
76-
LaunchParameters = (null)
77-
Licenses = (null)
78-
LogTimeFormat = iso8601_ms
79-
MailDomain = (null)
80-
MailProg = /bin/mail
81-
MaxArraySize = 1001
82-
MaxBatchRequeue = 5
83-
MaxDBDMsgs = 20096
84-
MaxJobCount = 10000
85-
MaxJobId = 67043328
86-
MaxMemPerNode = UNLIMITED
87-
MaxNodeCount = 24
88-
MaxStepCount = 40000
89-
MaxTasksPerNode = 512
90-
MCSPlugin = (null)
91-
MCSParameters = (null)
92-
MessageTimeout = 30 sec
93-
MinJobAge = 300 sec
94-
MpiDefault = (null)
95-
MpiParams = (null)
96-
NEXT_JOB_ID = 243
97-
NodeFeaturesPlugins = (null)
98-
OverTimeLimit = 0 min
99-
PluginDir = /usr/lib64/slurm
100-
PlugStackConfig = (null)
101-
PreemptMode = OFF
102-
PreemptParameters = (null)
103-
PreemptType = (null)
104-
PreemptExemptTime = 00:00:00
105-
PrEpParameters = (null)
106-
PrEpPlugins = prep/script
107-
PriorityParameters = (null)
108-
PrioritySiteFactorParameters = (null)
109-
PrioritySiteFactorPlugin = (null)
110-
PriorityDecayHalfLife = 40-00:00:00
111-
PriorityCalcPeriod = 00:05:00
112-
PriorityFavorSmall = No
113-
PriorityFlags = NO_NORMAL_ALL
114-
PriorityMaxAge = 14-00:00:00
115-
PriorityUsageResetPeriod = QUARTERLY
116-
PriorityType = priority/multifactor
117-
PriorityWeightAge = 172800
118-
PriorityWeightAssoc = 100000
119-
PriorityWeightFairShare = 259200
120-
PriorityWeightJobSize = 0
121-
PriorityWeightPartition = 172800
122-
PriorityWeightQOS = 500000
123-
PriorityWeightTRES = (null)
124-
PrivateData = none
125-
ProctrackType = proctrack/cgroup
126-
Prolog[0] = /etc/slurm/node_prolog.d/*
127-
PrologEpilogTimeout = 65534
128-
PrologFlags = Alloc,Contain,X11
129-
PropagatePrioProcess = 0
130-
PropagateResourceLimits = ALL
131-
PropagateResourceLimitsExcept = (null)
132-
RebootProgram = (null)
133-
ReconfigFlags = (null)
134-
RequeueExit = (null)
135-
RequeueExitHold = (null)
136-
ResumeFailProgram = (null)
137-
ResumeProgram = (null)
138-
ResumeRate = 300 nodes/min
139-
ResumeTimeout = 60 sec
140-
ResvEpilog = (null)
141-
ResvOverRun = 0 min
142-
ResvProlog = (null)
143-
ReturnToService = 1
144-
SchedulerParameters = (null)
145-
SchedulerTimeSlice = 30 sec
146-
SchedulerType = sched/backfill
147-
ScronParameters = (null)
148-
SelectType = select/cons_tres
149-
SelectTypeParameters = CR_CORE_MEMORY
150-
SlurmUser = root(0)
151-
SlurmctldAddr = (null)
152-
SlurmctldDebug = verbose
153-
SlurmctldHost[0] = zinal-slurmctl(zinal-slurmctl.tds.cscs.ch)
154-
SlurmctldLogFile = /var/log/slurmctld.log
155-
SlurmctldPort = 6817
156-
SlurmctldSyslogDebug = (null)
157-
SlurmctldPrimaryOffProg = (null)
158-
SlurmctldPrimaryOnProg = (null)
159-
SlurmctldTimeout = 120 sec
160-
SlurmctldParameters = enable_configless,enable_stepmgr
161-
SlurmdDebug = debug
162-
SlurmdLogFile = /var/log/slurmd.log
163-
SlurmdParameters = (null)
164-
SlurmdPidFile = /var/run/slurmd.pid
165-
SlurmdPort = 6818
166-
SlurmdSpoolDir = /tmp/spool/slurmd
167-
SlurmdSyslogDebug = (null)
168-
SlurmdTimeout = 300 sec
169-
SlurmdUser = root(0)
170-
SlurmSchedLogFile = (null)
171-
SlurmSchedLogLevel = 0
172-
SlurmctldPidFile = /var/run/slurmctld.pid
173-
SLURM_CONF = /etc/slurm/slurm.conf
174-
SLURM_VERSION = 24.05.8
175-
SrunEpilog = (null)
176-
SrunPortRange = 0-0
177-
SrunProlog = (null)
178-
StateSaveLocation = /var/spool/slurmctld
179-
SuspendExcNodes = (null)
180-
SuspendExcParts = (null)
181-
SuspendExcStates = (null)
182-
SuspendProgram = (null)
183-
SuspendRate = 60 nodes/min
184-
SuspendTime = INFINITE
185-
SuspendTimeout = 30 sec
186-
SwitchParameters = (null)
187-
SwitchType = (null)
188-
TaskEpilog = (null)
189-
TaskPlugin = task/affinity
190-
TaskPluginParam = (null type)
191-
TaskProlog = (null)
192-
TCPTimeout = 2 sec
193-
TLSParameters = (null)
194-
TLSType = tls/none
195-
TmpFS = /tmp
196-
TopologyParam = (null)
197-
TopologyPlugin = topology/default
198-
TrackWCKey = No
199-
TreeWidth = 65533
200-
UsePam = No
201-
UnkillableStepProgram = (null)
202-
UnkillableStepTimeout = 150 sec
203-
VSizeFactor = 0 percent
204-
WaitTime = 0 sec
205-
X11Parameters = (null)
206-
207-
Cgroup Support Configuration:
208-
AllowedRAMSpace = 100.0%
209-
AllowedSwapSpace = 0.0%
210-
CgroupMountpoint = /sys/fs/cgroup
211-
CgroupPlugin = autodetect
212-
ConstrainCores = yes
213-
ConstrainDevices = no
214-
ConstrainRAMSpace = yes
215-
ConstrainSwapSpace = no
216-
EnableControllers = no
217-
IgnoreSystemd = no
218-
IgnoreSystemdOnFailure = no
219-
MaxRAMPercent = 100.0%
220-
MaxSwapPercent = 100.0%
221-
MemorySwappiness = (null)
222-
MinRAMSpace = 30MB
223-
SystemdTimeout = 1000 ms
224-
225-
MPI Plugins Configuration:
226-
PMIxCliTmpDirBase = (null)
227-
PMIxCollFence = (null)
228-
PMIxDebug = 0
229-
PMIxDirectConn = yes
230-
PMIxDirectConnEarly = no
231-
PMIxDirectConnUCX = no
232-
PMIxDirectSameArch = no
233-
PMIxEnv = (null)
234-
PMIxFenceBarrier = no
235-
PMIxNetDevicesUCX = (null)
236-
PMIxTimeout = 300
237-
PMIxTlsUCX = (null)
238-
6+
SlurmdSpoolDir = /tmp/spool/slurmd (default)
7+
TmpFS = /tmp (default)
2398
Slurmctld(primary) at zinal-slurmctl is UP
2409
EOF
24110
fi

0 commit comments

Comments
 (0)