@@ -88,8 +88,12 @@ def __init__(self, agentName, loadName, baseAgentName=False, properties=None):
88
88
self .timeLeftUtil = None
89
89
self .pilotInfoReportedFlag = False
90
90
91
- # Submission results
92
- self .submissionDict = {}
91
+ # Attributes related to the processed jobs, it should take the following form:
92
+ # {"<jobID>": {"JobReport": JobReport(), "TaskID": "<taskID>"}}
93
+ # where TaskID is the ID of the job as seen by the CE
94
+ # and JobReport is the JobReport instance for the job
95
+ # (one instance per job to avoid any discrepancy when communicating with the WMS)
96
+ self .jobs = {}
93
97
94
98
#############################################################################
95
99
def initialize (self ):
@@ -135,7 +139,6 @@ def initialize(self):
135
139
136
140
# Utilities
137
141
self .timeLeftUtil = TimeLeft ()
138
- self .jobReport = JobReport (0 , f"{ self .__class__ .__name__ } @{ self .siteName } " )
139
142
return S_OK ()
140
143
141
144
def _initializeComputingElement (self , localCE ):
@@ -211,8 +214,11 @@ def execute(self):
211
214
matcherParams = ["JDL" , "DN" , "Group" ]
212
215
matcherInfo = jobRequest ["Value" ]
213
216
jobID = matcherInfo ["JobID" ]
214
- self .jobReport .setJob (jobID )
215
- result = self ._checkMatcherInfo (matcherInfo , matcherParams )
217
+
218
+ self .jobs [jobID ] = {}
219
+ self .jobs [jobID ]["JobReport" ] = JobReport (jobID , f"{ self .__class__ .__name__ } @{ self .siteName } " )
220
+
221
+ result = self ._checkMatcherInfo (jobID , matcherInfo , matcherParams )
216
222
if not result ["OK" ]:
217
223
return self ._finish (result ["Message" ])
218
224
@@ -235,30 +241,35 @@ def execute(self):
235
241
# Get JDL parameters
236
242
parameters = self ._getJDLParameters (jobJDL )
237
243
if not parameters ["OK" ]:
238
- self .jobReport .setJobStatus (status = JobStatus .FAILED , minorStatus = "Could Not Extract JDL Parameters" )
244
+ self .jobs [jobID ]["JobReport" ].setJobStatus (
245
+ status = JobStatus .FAILED , minorStatus = "Could Not Extract JDL Parameters"
246
+ )
239
247
self .log .warn ("Could Not Extract JDL Parameters" , parameters ["Message" ])
240
- return self ._finish ("JDL Problem" )
248
+ return self ._finish ("JDL Problem" , self . stopOnApplicationFailure )
241
249
242
250
params = parameters ["Value" ]
243
251
result = self ._extractValuesFromJobParams (params )
244
252
if not result ["OK" ]:
245
- return self ._finish (result ["Value" ])
253
+ self .jobs [jobID ]["JobReport" ].setJobStatus (status = JobStatus .FAILED , minorStatus = result ["Message" ])
254
+ return self ._finish (result ["Value" ], self .stopOnApplicationFailure )
246
255
submissionParams = result ["Value" ]
247
256
jobID = submissionParams ["jobID" ]
248
257
jobType = submissionParams ["jobType" ]
249
258
250
259
self .log .verbose ("Job request successful: \n " , jobRequest ["Value" ])
251
260
self .log .info ("Received" , f"JobID={ jobID } , JobType={ jobType } , OwnerDN={ ownerDN } , JobGroup={ jobGroup } " )
252
261
self .jobCount += 1
253
- self .jobReport .setJobParameter (par_name = "MatcherServiceTime" , par_value = str (matchTime ), sendFlag = False )
262
+ self .jobs [jobID ]["JobReport" ].setJobParameter (
263
+ par_name = "MatcherServiceTime" , par_value = str (matchTime ), sendFlag = False
264
+ )
254
265
if "BOINC_JOB_ID" in os .environ :
255
266
# Report BOINC environment
256
267
for thisp in ("BoincUserID" , "BoincHostID" , "BoincHostPlatform" , "BoincHostName" ):
257
- self .jobReport .setJobParameter (
268
+ self .jobs [ jobID ][ "JobReport" ] .setJobParameter (
258
269
par_name = thisp , par_value = gConfig .getValue (f"/LocalSite/{ thisp } " , "Unknown" ), sendFlag = False
259
270
)
260
271
261
- self .jobReport .setJobStatus (minorStatus = "Job Received by Agent" , sendFlag = False )
272
+ self .jobs [ jobID ][ "JobReport" ] .setJobStatus (minorStatus = "Job Received by Agent" , sendFlag = False )
262
273
result_setupProxy = self ._setupProxy (ownerDN , jobGroup )
263
274
if not result_setupProxy ["OK" ]:
264
275
result = self ._rescheduleFailedJob (jobID , result_setupProxy ["Message" ])
@@ -269,7 +280,8 @@ def execute(self):
269
280
self ._saveJobJDLRequest (jobID , jobJDL )
270
281
271
282
# Check software and install them if required
272
- software = self ._checkInstallSoftware (jobID , params , ceDict )
283
+ self .jobs [jobID ]["JobReport" ].setJobStatus (minorStatus = "Installing Software" , sendFlag = False )
284
+ software = self ._checkInstallSoftware (params , ceDict )
273
285
if not software ["OK" ]:
274
286
self .log .error ("Failed to install software for job" , f"{ jobID } " )
275
287
errorMsg = software ["Message" ]
@@ -280,14 +292,14 @@ def execute(self):
280
292
281
293
gridCE = gConfig .getValue ("/LocalSite/GridCE" , "" )
282
294
if gridCE :
283
- self .jobReport .setJobParameter (par_name = "GridCE" , par_value = gridCE , sendFlag = False )
295
+ self .jobs [ jobID ][ "JobReport" ] .setJobParameter (par_name = "GridCE" , par_value = gridCE , sendFlag = False )
284
296
285
297
queue = gConfig .getValue ("/LocalSite/CEQueue" , "" )
286
298
if queue :
287
- self .jobReport .setJobParameter (par_name = "CEQueue" , par_value = queue , sendFlag = False )
299
+ self .jobs [ jobID ][ "JobReport" ] .setJobParameter (par_name = "CEQueue" , par_value = queue , sendFlag = False )
288
300
289
301
if batchSystem := gConfig .getValue ("/LocalSite/BatchSystem/Type" , "" ):
290
- self .jobReport .setJobParameter (par_name = "BatchSystem" , par_value = batchSystem , sendFlag = False )
302
+ self .jobs [ jobID ][ "JobReport" ] .setJobParameter (par_name = "BatchSystem" , par_value = batchSystem , sendFlag = False )
291
303
292
304
self .log .debug (f"Before self._submitJob() ({ self .ceName } CE)" )
293
305
result = self ._submitJob (
@@ -307,32 +319,18 @@ def execute(self):
307
319
return self ._finish (result ["Message" ])
308
320
self .log .debug (f"After { self .ceName } CE submitJob()" )
309
321
310
- # Committing the JobReport before evaluating the result of job submission
311
- res = self .jobReport .commit ()
312
- if not res ["OK" ]:
313
- resFD = self .jobReport .generateForwardDISET ()
314
- if not resFD ["OK" ]:
315
- self .log .error ("Error generating ForwardDISET operation" , resFD ["Message" ])
316
- elif resFD ["Value" ]:
317
- # Here we create the Request.
318
- op = resFD ["Value" ]
319
- request = Request ()
320
- requestName = f"jobAgent_{ jobID } "
321
- request .RequestName = requestName .replace ('"' , "" )
322
- request .JobID = jobID
323
- request .SourceComponent = f"JobAgent_{ jobID } "
324
- request .addOperation (op )
325
- # This might fail, but only a message would be printed.
326
- self ._sendFailoverRequest (request )
327
-
328
322
# Checking errors that could have occurred during the job submission and/or execution
329
323
result = self ._checkSubmittedJobs ()
330
324
if not result ["OK" ]:
331
325
return result
326
+
332
327
submissionErrors = result ["Value" ][0 ]
333
328
payloadErrors = result ["Value" ][1 ]
334
329
if submissionErrors :
335
- return self ._finish ("Error during the submission process" )
330
+ # Stop the JobAgent if too many CE errors occurred
331
+ return self ._finish (
332
+ "Error during the submission process" , self .hostFailureCount > self .stopAfterHostFailures
333
+ )
336
334
if payloadErrors :
337
335
return self ._finish ("Error during a payload execution" , self .stopOnApplicationFailure )
338
336
@@ -525,7 +523,7 @@ def _requestProxyFromProxyManager(self, ownerDN, ownerGroup):
525
523
return S_OK (chain )
526
524
527
525
#############################################################################
528
- def _checkInstallSoftware (self , jobID , jobParams , resourceParams ):
526
+ def _checkInstallSoftware (self , jobParams , resourceParams ):
529
527
"""Checks software requirement of job and whether this is already present
530
528
before installing software locally.
531
529
"""
@@ -534,7 +532,6 @@ def _checkInstallSoftware(self, jobID, jobParams, resourceParams):
534
532
self .log .verbose (msg )
535
533
return S_OK (msg )
536
534
537
- self .jobReport .setJobStatus (minorStatus = "Installing Software" , sendFlag = False )
538
535
softwareDist = jobParams ["SoftwareDistModule" ]
539
536
self .log .verbose ("Found VO Software Distribution module" , f": { softwareDist } " )
540
537
argumentsDict = {"Job" : jobParams , "CE" : resourceParams }
@@ -586,15 +583,19 @@ def _checkMatchingIssues(self, jobRequest):
586
583
return self ._finish ("Nothing to do for more than %d cycles" % self .stopAfterFailedMatches )
587
584
return S_OK ()
588
585
589
- def _checkMatcherInfo (self , matcherInfo , matcherParams ):
586
+ def _checkMatcherInfo (self , jobID , matcherInfo , matcherParams ):
590
587
"""Check that all relevant information about the job is available"""
591
588
for param in matcherParams :
592
589
if param not in matcherInfo :
593
- self .jobReport .setJobStatus (status = JobStatus .FAILED , minorStatus = f"Matcher did not return { param } " )
590
+ self .jobs [jobID ]["JobReport" ].setJobStatus (
591
+ status = JobStatus .FAILED , minorStatus = f"Matcher did not return { param } "
592
+ )
594
593
return S_ERROR ("Matcher Failed" )
595
594
596
595
if not matcherInfo [param ]:
597
- self .jobReport .setJobStatus (status = JobStatus .FAILED , minorStatus = f"Matcher returned null { param } " )
596
+ self .jobs [jobID ]["JobReport" ].setJobStatus (
597
+ status = JobStatus .FAILED , minorStatus = f"Matcher returned null { param } "
598
+ )
598
599
return S_ERROR ("Matcher Failed" )
599
600
600
601
self .log .verbose ("Matcher returned" , f"{ param } = { matcherInfo [param ]} " )
@@ -636,7 +637,7 @@ def _submitJob(
636
637
637
638
wrapperFile = result ["Value" ][0 ]
638
639
inputs = list (result ["Value" ][1 :])
639
- self .jobReport .setJobStatus (minorStatus = "Submitting To CE" )
640
+ self .jobs [ jobID ][ "JobReport" ] .setJobStatus (minorStatus = "Submitting To CE" )
640
641
641
642
self .log .info ("Submitting JobWrapper" , f"{ os .path .basename (wrapperFile )} to { self .ceName } CE" )
642
643
@@ -666,7 +667,7 @@ def _submitJob(
666
667
taskID = 0
667
668
# We create a S_ERROR from the exception to compute it as a normal error
668
669
self .computingElement .taskResults [taskID ] = S_ERROR (unexpectedSubmitException )
669
- self .submissionDict [jobID ] = taskID
670
+ self .jobs [jobID ][ "TaskID" ] = taskID
670
671
return S_OK ()
671
672
672
673
# Submission results are processed in _checkSubmittedJobs
@@ -684,7 +685,7 @@ def _submitJob(
684
685
685
686
self .log .info ("Job being submitted" , f"(DIRAC JobID: { jobID } ; Task ID: { taskID } )" )
686
687
687
- self .submissionDict [jobID ] = taskID
688
+ self .jobs [jobID ][ "TaskID" ] = taskID
688
689
time .sleep (self .jobSubmissionDelay )
689
690
return S_OK ()
690
691
@@ -693,31 +694,26 @@ def _checkSubmittedJobs(self):
693
694
# We expect the computingElement to have a taskResult dictionary.
694
695
submissionErrors = []
695
696
payloadErrors = []
696
- originalJobID = self .jobReport .jobID
697
697
# Loop over the jobIDs submitted to the CE
698
698
# Here we iterate over a copy of the keys because we are modifying the dictionary within the loop
699
- for jobID in list (self .submissionDict .keys ()):
700
- taskID = self .submissionDict [jobID ]
701
- if taskID not in self .computingElement .taskResults :
699
+ for jobID in list (self .jobs .keys ()):
700
+ taskID = self .jobs [jobID ]. get ( "TaskID" )
701
+ if taskID is None or taskID not in self .computingElement .taskResults :
702
702
continue
703
703
704
704
result = self .computingElement .taskResults [taskID ]
705
- # jobReport will handle different jobIDs
706
- # setJobParameter() and setJobStatus() should send status immediately (sendFlag=True by default)
707
- self .jobReport .setJob (jobID )
708
705
709
706
# The submission process failed
710
707
if not result ["OK" ]:
711
708
self .log .error ("Job submission failed" , jobID )
712
- self .jobReport .setJobParameter (par_name = "ErrorMessage" , par_value = f"{ self .ceName } CE Submission Error" )
709
+ self .jobs [jobID ]["JobReport" ].setJobParameter (
710
+ par_name = "ErrorMessage" , par_value = f"{ self .ceName } CE Submission Error" , sendFlag = False
711
+ )
713
712
714
713
self .log .error ("Error in DIRAC JobWrapper or inner CE execution:" , result ["Message" ])
715
714
submissionErrors .append (result ["Message" ])
716
715
self ._rescheduleFailedJob (jobID , result ["Message" ])
717
- # Stop the JobAgent if too many CE errors
718
716
self .hostFailureCount += 1
719
- if self .hostFailureCount > self .stopAfterHostFailures :
720
- return self ._finish (result ["Message" ], self .stopAfterHostFailures )
721
717
722
718
# The payload failed (if result["Value"] is not 0)
723
719
elif result ["Value" ]:
@@ -726,19 +722,38 @@ def _checkSubmittedJobs(self):
726
722
if not res ["OK" ]:
727
723
return res
728
724
if res ["Value" ][int (jobID )]["Status" ] == JobStatus .RUNNING :
729
- self .jobReport .setJobStatus (status = JobStatus .FAILED , minorStatus = "Payload failed" )
725
+ self .jobs [jobID ]["JobReport" ].setJobStatus (
726
+ status = JobStatus .FAILED , minorStatus = "Payload failed" , sendFlag = False
727
+ )
730
728
731
729
# Do not keep running and do not overwrite the Payload error
732
730
message = f"Payload execution failed with error code { result ['Value' ]} "
733
731
payloadErrors .append (message )
734
732
self .log .info (message )
735
733
734
+ # The job has been treated, we can commit the JobReport
735
+ res = self .jobs [jobID ]["JobReport" ].commit ()
736
+ if not res ["OK" ]:
737
+ resFD = self .jobs [jobID ]["JobReport" ].generateForwardDISET ()
738
+ if not resFD ["OK" ]:
739
+ self .log .error ("Error generating ForwardDISET operation" , resFD ["Message" ])
740
+ elif resFD ["Value" ]:
741
+ # Here we create the Request.
742
+ op = resFD ["Value" ]
743
+ request = Request ()
744
+ requestName = f"jobAgent_{ jobID } "
745
+ request .RequestName = requestName .replace ('"' , "" )
746
+ request .JobID = jobID
747
+ request .SourceComponent = f"JobAgent_{ jobID } "
748
+ request .addOperation (op )
749
+ # This might fail, but only a message would be printed.
750
+ self ._sendFailoverRequest (request )
751
+
736
752
# Remove taskID from computingElement.taskResults as it has been treated
737
- # Remove jobID from submissionDict as it has been treated
753
+ # Remove jobID from jobs as it has been treated
738
754
del self .computingElement .taskResults [taskID ]
739
- del self .submissionDict [jobID ]
755
+ del self .jobs [jobID ]
740
756
741
- self .jobReport .setJob (originalJobID )
742
757
return S_OK ((submissionErrors , payloadErrors ))
743
758
744
759
#############################################################################
@@ -777,9 +792,8 @@ def _extractValuesFromJobParams(self, params):
777
792
submissionDict ["jobID" ] = params .get ("JobID" )
778
793
if not submissionDict ["jobID" ]:
779
794
msg = "Job has not JobID defined in JDL parameters"
780
- self .jobReport .setJobStatus (status = JobStatus .FAILED , minorStatus = msg )
781
795
self .log .warn (msg )
782
- return S_ERROR ("JDL Problem" )
796
+ return S_ERROR (msg )
783
797
784
798
submissionDict ["jobType" ] = params .get ("JobType" , "Unknown" )
785
799
if submissionDict ["jobType" ] == "Unknown" :
@@ -816,25 +830,19 @@ def _finish(self, message, stop=True):
816
830
return S_OK (message )
817
831
818
832
#############################################################################
819
- def _rescheduleFailedJob (self , jobID , message , direct = False ):
833
+ def _rescheduleFailedJob (self , jobID , message ):
820
834
"""
821
835
Set Job Status to "Rescheduled" and issue a reschedule command to the Job Manager
822
836
"""
823
837
824
838
self .log .warn ("Failure ==> rescheduling" , f"(during { message } )" )
825
839
826
- if direct :
827
- JobStateUpdateClient ().setJobStatus (
828
- int (jobID ), status = JobStatus .RESCHEDULED , applicationStatus = message , source = "JobAgent@%s" , force = True
829
- )
830
- else :
831
- originalJobID = self .jobReport .jobID
832
- self .jobReport .setJob (jobID )
833
- # Setting a job parameter does not help since the job will be rescheduled,
834
- # instead set the status with the cause and then another status showing the
835
- # reschedule operation.
836
- self .jobReport .setJobStatus (status = JobStatus .RESCHEDULED , applicationStatus = message , sendFlag = True )
837
- self .jobReport .setJob (originalJobID )
840
+ # Setting a job parameter does not help since the job will be rescheduled,
841
+ # instead set the status with the cause and then another status showing the
842
+ # reschedule operation.
843
+ self .jobs [jobID ]["JobReport" ].setJobStatus (
844
+ status = JobStatus .RESCHEDULED , applicationStatus = message , sendFlag = True
845
+ )
838
846
839
847
self .log .info ("Job will be rescheduled" )
840
848
result = JobManagerClient ().rescheduleJob (jobID )
@@ -882,11 +890,15 @@ def finalize(self):
882
890
if not res ["OK" ]:
883
891
self .log .error ("CE could not be properly shut down" , res ["Message" ])
884
892
885
- # Check the submitted jobs a last time
886
- result = self ._checkSubmittedJobs ()
887
- if not result ["OK" ]:
888
- self .log .error ("Problem while trying to get status of the last submitted jobs" )
893
+ # Check the latest submitted jobs
894
+ while self .jobs :
895
+ result = self ._checkSubmittedJobs ()
896
+ if not result ["OK" ]:
897
+ self .log .error ("Problem while trying to get status of the last submitted jobs" )
898
+ break
899
+ time .sleep (int (self .am_getOption ("PollingTime" )))
889
900
901
+ # Set the pilot status to Done
890
902
gridCE = gConfig .getValue ("/LocalSite/GridCE" , "" )
891
903
queue = gConfig .getValue ("/LocalSite/CEQueue" , "" )
892
904
result = PilotManagerClient ().setPilotStatus (
0 commit comments