Skip to content

Commit cc67709

Browse files
committed
style: better formatting and docstrings
1 parent a066a22 commit cc67709

File tree

4 files changed

+68
-60
lines changed

4 files changed

+68
-60
lines changed

src/DIRAC/WorkloadManagementSystem/Agent/PushJobAgent.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,6 @@
2525
from DIRAC.WorkloadManagementSystem.Service.WMSUtilities import getGridEnv
2626
from DIRAC.WorkloadManagementSystem.Agent.JobAgent import JobAgent
2727
from DIRAC.WorkloadManagementSystem.private.ConfigHelper import findGenericPilotCredentials
28-
from DIRAC.WorkloadManagementSystem.Client import JobStatus
2928

3029
MAX_JOBS_MANAGED = 100
3130

src/DIRAC/WorkloadManagementSystem/Agent/StalledJobAgent.py

Lines changed: 62 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,12 @@
1-
""" The StalledJobAgent hunts for stalled jobs in the Job database. Jobs in "running"
2-
state not receiving a heart beat signal for more than stalledTime
3-
seconds will be assigned the "Stalled" state.
4-
1+
"""The StalledJobAgent hunts for stalled jobs in the Job database. Jobs in
2+
"running" state not receiving a heart beat signal for more than stalledTime
3+
seconds will be assigned the "Stalled" state.
54
65
.. literalinclude:: ../ConfigTemplate.cfg
76
:start-after: ##BEGIN StalledJobAgent
87
:end-before: ##END
98
:dedent: 2
109
:caption: StalledJobAgent options
11-
1210
"""
1311
import concurrent.futures
1412
import datetime
@@ -32,23 +30,28 @@
3230

3331

3432
class StalledJobAgent(AgentModule):
35-
"""Agent for setting Running jobs Stalled, and Stalled jobs Failed. And a few more."""
33+
"""Agent for setting Running jobs Stalled, and Stalled jobs Failed.
34+
35+
And a few more.
36+
"""
3637

3738
def __init__(self, *args, **kwargs):
38-
"""c'tor"""
39+
"""c'tor."""
3940
super().__init__(*args, **kwargs)
4041

4142
self.jobDB = None
4243
self.logDB = None
4344
self.matchedTime = 7200
4445
self.rescheduledTime = 600
4546
self.submittingTime = 300
47+
self.stalledJobsToleranceTime = 0
4648
self.stalledJobsTolerantSites = []
4749
self.stalledJobsToRescheduleSites = []
50+
self.threadPoolExecutor = None
4851

4952
#############################################################################
5053
def initialize(self):
51-
"""Sets default parameters"""
54+
"""Sets default parameters."""
5255
self.jobDB = JobDB()
5356
self.logDB = JobLoggingDB()
5457

@@ -57,19 +60,21 @@ def initialize(self):
5760
if not self.am_getOption("Enable", True):
5861
self.log.info("Stalled Job Agent running in disabled mode")
5962

60-
wms_instance = getSystemInstance("WorkloadManagement")
61-
if not wms_instance:
63+
wmsInstance = getSystemInstance("WorkloadManagement")
64+
if not wmsInstance:
6265
return S_ERROR("Can not get the WorkloadManagement system instance")
63-
self.stalledJobsTolerantSites = self.am_getOption("StalledJobsTolerantSites", [])
64-
self.stalledJobsToleranceTime = self.am_getOption("StalledJobsToleranceTime", 0)
66+
self.stalledJobsTolerantSites = self.am_getOption("StalledJobsTolerantSites", self.stalledJobsTolerantSites)
67+
self.stalledJobsToleranceTime = self.am_getOption("StalledJobsToleranceTime", self.stalledJobsToleranceTime)
6568

66-
self.stalledJobsToRescheduleSites = self.am_getOption("StalledJobsToRescheduleSites", [])
69+
self.stalledJobsToRescheduleSites = self.am_getOption(
70+
"StalledJobsToRescheduleSites", self.stalledJobsToRescheduleSites
71+
)
6772

6873
self.submittingTime = self.am_getOption("SubmittingTime", self.submittingTime)
6974
self.matchedTime = self.am_getOption("MatchedTime", self.matchedTime)
7075
self.rescheduledTime = self.am_getOption("RescheduledTime", self.rescheduledTime)
7176

72-
wrapperSection = cfgPath("Systems", "WorkloadManagement", wms_instance, "JobWrapper")
77+
wrapperSection = cfgPath("Systems", "WorkloadManagement", wmsInstance, "JobWrapper")
7378

7479
failedTime = self.am_getOption("FailedTimeHours", 6)
7580
watchdogCycle = gConfig.getValue(cfgPath(wrapperSection, "CheckingTime"), 30 * 60)
@@ -89,14 +94,14 @@ def initialize(self):
8994

9095
# setting up the threading
9196
maxNumberOfThreads = self.am_getOption("MaxNumberOfThreads", 15)
92-
self.log.verbose("Multithreaded with %d threads" % maxNumberOfThreads)
97+
self.log.verbose(f"Multithreaded with {maxNumberOfThreads} threads")
9398
self.threadPoolExecutor = concurrent.futures.ThreadPoolExecutor(max_workers=maxNumberOfThreads)
9499

95100
return S_OK()
96101

97102
#############################################################################
98103
def execute(self):
99-
"""The main agent execution method"""
104+
"""The main agent execution method."""
100105
# Now we are getting what's going to be checked
101106
futures = []
102107

@@ -162,22 +167,23 @@ def execute(self):
162167
return S_OK()
163168

164169
def finalize(self):
165-
"""graceful finalization"""
170+
"""Graceful finalization."""
166171

167172
self.log.info("Wait for threads to get empty before terminating the agent")
168173
self.threadPoolExecutor.shutdown()
169174
self.log.info("Threads are empty, terminating the agent...")
170175
return S_OK()
171176

172177
def _execute(self, job_Op):
173-
"""
174-
Doing the actual job. This is run inside the threads
178+
"""Doing the actual job.
179+
180+
This is run inside the threads
175181
"""
176182
jobID, jobOp = job_Op.split(":")
177183
jobID = int(jobID)
178184
res = getattr(self, f"{jobOp}")(jobID)
179185
if not res["OK"]:
180-
self.log.error(f"Failure executing {jobOp}", "on %d: %s" % (jobID, res["Message"]))
186+
self.log.error(f"Failure executing {jobOp}", f"on {jobID}: {res['Message']}")
181187

182188
#############################################################################
183189
def _markStalledJobs(self, jobID):
@@ -205,8 +211,8 @@ def _markStalledJobs(self, jobID):
205211

206212
#############################################################################
207213
def _failStalledJobs(self, jobID):
208-
"""
209-
Changes the Stalled status to Failed for jobs long in the Stalled status.
214+
"""Changes the Stalled status to Failed for jobs long in the Stalled
215+
status.
210216
211217
Run inside thread.
212218
"""
@@ -215,7 +221,7 @@ def _failStalledJobs(self, jobID):
215221
# Check if the job pilot is lost
216222
result = self._getJobPilotStatus(jobID)
217223
if not result["OK"]:
218-
self.log.error("Failed to get pilot status", "for job %d: %s" % (jobID, result["Message"]))
224+
self.log.error("Failed to get pilot status", f"for job {jobID}: {result['Message']}")
219225
return result
220226
pilotStatus = result["Value"]
221227
if pilotStatus != "Running":
@@ -224,7 +230,7 @@ def _failStalledJobs(self, jobID):
224230
# Verify that there was no sign of life for long enough
225231
result = self._getLatestUpdateTime(jobID)
226232
if not result["OK"]:
227-
self.log.error("Failed to get job update time", "for job %d: %s" % (jobID, result["Message"]))
233+
self.log.error("Failed to get job update time", f"for job {jobID}: {result['Message']}")
228234
return result
229235
elapsedTime = toEpoch() - result["Value"]
230236
if elapsedTime > self.failedTime:
@@ -233,7 +239,9 @@ def _failStalledJobs(self, jobID):
233239
# Set the jobs Failed, send them a kill signal in case they are not really dead
234240
# and send accounting info
235241
if setFailed:
236-
self._sendKillCommand(jobID) # always returns None
242+
res = self._sendKillCommand(jobID)
243+
if not res["OK"]:
244+
self.log.error("Failed to kill job", jobID)
237245

238246
# For some sites we might want to reschedule rather than fail the jobs
239247
if self.stalledJobsToRescheduleSites:
@@ -249,7 +257,7 @@ def _failStalledJobs(self, jobID):
249257
return S_OK()
250258

251259
def _getJobPilotStatus(self, jobID):
252-
"""Get the job pilot status"""
260+
"""Get the job pilot status."""
253261
result = JobMonitoringClient().getJobParameter(jobID, "Pilot_Reference")
254262
if not result["OK"]:
255263
return result
@@ -261,9 +269,9 @@ def _getJobPilotStatus(self, jobID):
261269
result = PilotManagerClient().getPilotInfo(pilotReference)
262270
if not result["OK"]:
263271
if DErrno.cmpError(result, DErrno.EWMSNOPILOT):
264-
self.log.warn("No pilot found", "for job %d: %s" % (jobID, result["Message"]))
272+
self.log.warn("No pilot found", f"for job {jobID}: {result['Message']}")
265273
return S_OK("NoPilot")
266-
self.log.error("Failed to get pilot information", "for job %d: %s" % (jobID, result["Message"]))
274+
self.log.error("Failed to get pilot information", f"for job {jobID}: {result['Message']}")
267275
return result
268276
pilotStatus = result["Value"][pilotReference]["Status"]
269277

@@ -272,8 +280,7 @@ def _getJobPilotStatus(self, jobID):
272280
#############################################################################
273281
def _checkJobStalled(self, job, stalledTime):
274282
"""Compares the most recent of LastUpdateTime and HeartBeatTime against
275-
the stalledTime limit.
276-
"""
283+
the stalledTime limit."""
277284
result = self._getLatestUpdateTime(job)
278285
if not result["OK"]:
279286
return result
@@ -290,7 +297,7 @@ def _checkJobStalled(self, job, stalledTime):
290297

291298
#############################################################################
292299
def _getLatestUpdateTime(self, job):
293-
"""Returns the most recent of HeartBeatTime and LastUpdateTime"""
300+
"""Returns the most recent of HeartBeatTime and LastUpdateTime."""
294301
result = self.jobDB.getJobAttributes(job, ["HeartBeatTime", "LastUpdateTime"])
295302
if not result["OK"] or not result["Value"]:
296303
self.log.error(
@@ -318,7 +325,7 @@ def _getLatestUpdateTime(self, job):
318325

319326
#############################################################################
320327
def _updateJobStatus(self, job, status, minorStatus=None, force=False):
321-
"""This method updates the job status in the JobDB"""
328+
"""This method updates the job status in the JobDB."""
322329

323330
if not self.am_getOption("Enable", True):
324331
return S_OK("Disabled")
@@ -328,23 +335,21 @@ def _updateJobStatus(self, job, status, minorStatus=None, force=False):
328335
self.log.debug(f"self.jobDB.setJobAttribute({job},'Status','{status}',update=True)")
329336
result = self.jobDB.setJobAttribute(job, "Status", status, update=True, force=force)
330337
if not result["OK"]:
331-
self.log.error("Failed setting Status", "%s for job %d: %s" % (status, job, result["Message"]))
338+
self.log.error("Failed setting Status", f"{status} for job {job}: {result['Message']}")
332339
toRet = result
333340
if minorStatus:
334341
self.log.debug(f"self.jobDB.setJobAttribute({job},'MinorStatus','{minorStatus}',update=True)")
335342
result = self.jobDB.setJobAttribute(job, "MinorStatus", minorStatus, update=True)
336343
if not result["OK"]:
337-
self.log.error(
338-
"Failed setting MinorStatus", "%s for job %d: %s" % (minorStatus, job, result["Message"])
339-
)
344+
self.log.error("Failed setting MinorStatus", f"{minorStatus} for job {job}: {result['Message']}")
340345
toRet = result
341346

342347
if not minorStatus: # Retain last minor status for stalled jobs
343348
result = self.jobDB.getJobAttributes(job, ["MinorStatus"])
344349
if result["OK"]:
345350
minorStatus = result["Value"]["MinorStatus"]
346351
else:
347-
self.log.error("Failed getting MinorStatus", "for job %d: %s" % (job, result["Message"]))
352+
self.log.error("Failed getting MinorStatus", f"for job {job}: {result['Message']}")
348353
minorStatus = "idem"
349354
toRet = result
350355

@@ -356,7 +361,8 @@ def _updateJobStatus(self, job, status, minorStatus=None, force=False):
356361
return toRet
357362

358363
def _getProcessingType(self, jobID):
359-
"""Get the Processing Type from the JDL, until it is promoted to a real Attribute"""
364+
"""Get the Processing Type from the JDL, until it is promoted to a real
365+
Attribute."""
360366
processingType = "unknown"
361367
result = self.jobDB.getJobJDL(jobID, original=True)
362368
if not result["OK"]:
@@ -367,8 +373,7 @@ def _getProcessingType(self, jobID):
367373
return processingType
368374

369375
def _sendAccounting(self, jobID):
370-
"""
371-
Send WMS accounting data for the given job.
376+
"""Send WMS accounting data for the given job.
372377
373378
Run inside thread.
374379
"""
@@ -448,11 +453,11 @@ def _sendAccounting(self, jobID):
448453
if result["OK"]:
449454
self.jobDB.setJobAttribute(jobID, "AccountedFlag", "True")
450455
else:
451-
self.log.error("Failed to send accounting report", "Job: %d, Error: %s" % (int(jobID), result["Message"]))
456+
self.log.error("Failed to send accounting report", f"for job {jobID}: {result['Message']}")
452457
return result
453458

454459
def _checkHeartBeat(self, jobID, jobDict):
455-
"""Get info from HeartBeat"""
460+
"""Get info from HeartBeat."""
456461
result = self.jobDB.getHeartBeatData(jobID)
457462
lastCPUTime = 0
458463
lastWallTime = 0
@@ -482,7 +487,7 @@ def _checkHeartBeat(self, jobID, jobDict):
482487
return lastCPUTime, lastWallTime, lastHeartBeatTime
483488

484489
def _checkLoggingInfo(self, jobID, jobDict):
485-
"""Get info from JobLogging"""
490+
"""Get info from JobLogging."""
486491
logList = []
487492
result = self.logDB.getJobLoggingInfo(jobID)
488493
if result["OK"]:
@@ -516,7 +521,8 @@ def _checkLoggingInfo(self, jobID, jobDict):
516521
return startTime, endTime
517522

518523
def _kickStuckJobs(self):
519-
"""Reschedule jobs stuck in initialization status Rescheduled, Matched"""
524+
"""Reschedule jobs stuck in initialization status Rescheduled,
525+
Matched."""
520526

521527
message = ""
522528

@@ -563,6 +569,7 @@ def _kickStuckJobs(self):
563569

564570
def _failSubmittingJobs(self):
565571
"""Failed Jobs stuck in Submitting Status for a long time.
572+
566573
They are due to a failed bulk submission transaction.
567574
"""
568575

@@ -587,11 +594,17 @@ def _sendKillCommand(self, job):
587594
:param int job: ID of job to send kill command
588595
"""
589596

590-
owner = self.jobDB.getJobAttribute(job, "Owner")["Value"]
591-
ownerGroup = self.jobDB.getJobAttribute(job, "OwnerGroup")["Value"]
597+
res = self.jobDB.getJobAttribute(job, "Owner")
598+
if not res["OK"]:
599+
return res
600+
owner = res["Value"]
601+
602+
res = self.jobDB.getJobAttribute(job, "OwnerGroup")
603+
if not res["OK"]:
604+
return res
605+
ownerGroup = res["Value"]
606+
592607
wmsClient = WMSClient(
593608
useCertificates=True, delegatedDN=getDNForUsername(owner)["Value"][0], delegatedGroup=ownerGroup
594609
)
595-
resKill = wmsClient.killJob(job)
596-
if not resKill["OK"]:
597-
self.log.error("Failed to send kill command to job", f"{job}: {resKill['Message']}")
610+
return wmsClient.killJob(job)

src/DIRAC/WorkloadManagementSystem/Service/SandboxStoreHandler.py

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -7,24 +7,23 @@
77
:dedent: 2
88
:caption: SandboxStore options
99
"""
10+
import hashlib
1011
import os
11-
import time
12-
import threading
1312
import tempfile
14-
import hashlib
13+
import threading
14+
import time
1515

16-
from DIRAC import gLogger, S_OK, S_ERROR
17-
from DIRAC.ConfigurationSystem.Client.Helpers.Registry import getDNForUsername
16+
from DIRAC import S_ERROR, S_OK, gLogger
1817
from DIRAC.Core.DISET.RequestHandler import RequestHandler
1918
from DIRAC.Core.Security import Locations, Properties, X509Certificate
2019
from DIRAC.Core.Utilities.File import mkDir
2120
from DIRAC.Core.Utilities.ObjectLoader import ObjectLoader
2221
from DIRAC.DataManagementSystem.Client.DataManager import DataManager
2322
from DIRAC.DataManagementSystem.Service.StorageElementHandler import getDiskSpace
23+
from DIRAC.RequestManagementSystem.Client.File import File
24+
from DIRAC.RequestManagementSystem.Client.Operation import Operation
2425
from DIRAC.RequestManagementSystem.Client.ReqClient import ReqClient
2526
from DIRAC.RequestManagementSystem.Client.Request import Request
26-
from DIRAC.RequestManagementSystem.Client.Operation import Operation
27-
from DIRAC.RequestManagementSystem.Client.File import File
2827
from DIRAC.Resources.Storage.StorageElement import StorageElement
2928
from DIRAC.Core.Utilities.File import getGlobbedTotalSize
3029

tests/Integration/WorkloadManagementSystem/Test_Client_WMS.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -182,7 +182,6 @@ def test_submitJob(jobType, inputData, expectedSite):
182182
# # Check that the JDL contains some fields
183183
# assert jobDescription.lookupAttribute("Owner") is True
184184
# assert jobDescription.lookupAttribute("OwnerGroup") is True
185-
# assert jobDescription.lookupAttribute("OwnerDN") is True
186185
# assert jobDescription.lookupAttribute("CPUTime") is True
187186
# assert jobDescription.lookupAttribute("Priority") is True
188187
# assert jobDescription.lookupAttribute("JobID") is True
@@ -193,7 +192,6 @@ def test_submitJob(jobType, inputData, expectedSite):
193192

194193
# resourceDescription = {
195194
# "OwnerGroup": jobDescription.getAttributeString("OwnerGroup"),
196-
# "OwnerDN": jobDescription.getAttributeString("OwnerDN"),
197195
# "VirtualOrganization": jobDescription.getAttributeString("VirtualOrganization"),
198196
# "CPUTime": jobDescription.getAttributeInt("CPUTime"),
199197
# "DIRACVersion": "pippo",
@@ -324,7 +322,6 @@ def test_WMSClient_rescheduleJob():
324322

325323
# resourceDescription = {
326324
# "OwnerGroup": jobDescription.getAttributeString("OwnerGroup"),
327-
# "OwnerDN": jobDescription.getAttributeString("OwnerDN"),
328325
# "VirtualOrganization": jobDescription.getAttributeString("VirtualOrganization"),
329326
# "CPUTime": jobDescription.getAttributeInt("CPUTime"),
330327
# "DIRACVersion": "pippo",

0 commit comments

Comments
 (0)