Skip to content

Commit 70538bb

Browse files
authored
Merge pull request #7302 from fstagni/80_fixes72
[8.0] fix: set the job status to Failed (minor status "Payload failed") only if the job was running
2 parents ea17c79 + dab5430 commit 70538bb

File tree

2 files changed: +10 additions, -2 deletions

src/DIRAC/WorkloadManagementSystem/Agent/JobAgent.py

Lines changed: 8 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -32,6 +32,7 @@
3232
from DIRAC.WorkloadManagementSystem.Client.MatcherClient import MatcherClient
3333
from DIRAC.WorkloadManagementSystem.Client.PilotManagerClient import PilotManagerClient
3434
from DIRAC.WorkloadManagementSystem.Client.JobManagerClient import JobManagerClient
35+
from DIRAC.WorkloadManagementSystem.Client.JobMonitoringClient import JobMonitoringClient
3536
from DIRAC.WorkloadManagementSystem.Client.JobStateUpdateClient import JobStateUpdateClient
3637
from DIRAC.WorkloadManagementSystem.Client.JobReport import JobReport
3738
from DIRAC.WorkloadManagementSystem.Client import JobStatus
@@ -691,7 +692,7 @@ def _checkSubmittedJobs(self):
691692
payloadErrors = []
692693
originalJobID = self.jobReport.jobID
693694
for jobID, taskID in self.submissionDict.items():
694-
if not taskID in self.computingElement.taskResults:
695+
if taskID not in self.computingElement.taskResults:
695696
continue
696697

697698
result = self.computingElement.taskResults[taskID]
@@ -714,7 +715,12 @@ def _checkSubmittedJobs(self):
714715

715716
# The payload failed (if result["Value"] is not 0)
716717
elif result["Value"]:
717-
self.jobReport.setJobStatus(status=JobStatus.FAILED, minorStatus="Payload failed")
718+
# In order to avoid overriding perfectly valid states, the status is updated iff the job was running
719+
res = JobMonitoringClient().getJobsStatus(jobID)
720+
if not res["OK"]:
721+
return res
722+
if res["Value"][jobID]["Status"] == JobStatus.RUNNING:
723+
self.jobReport.setJobStatus(status=JobStatus.FAILED, minorStatus="Payload failed")
718724

719725
# Do not keep running and do not overwrite the Payload error
720726
message = f"Payload execution failed with error code {result['Value']}"

src/DIRAC/WorkloadManagementSystem/Agent/test/Test_Agent_JobAgent.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import os
44
import pytest
55
import time
6+
from unittest.mock import MagicMock
67

78
from DIRAC import gLogger, S_OK, S_ERROR
89
from DIRAC.Core.Security.X509Chain import X509Chain # pylint: disable=import-error
@@ -498,6 +499,7 @@ def test_submitAndCheckJob(mocker, localCE, job, expectedResult1, expectedResult
498499

499500
mocker.patch("DIRAC.WorkloadManagementSystem.Agent.JobAgent.AgentModule.__init__")
500501
mocker.patch("DIRAC.WorkloadManagementSystem.Agent.JobAgent.JobAgent.am_stopExecution")
502+
mocker.patch("DIRAC.WorkloadManagementSystem.Agent.JobAgent.JobMonitoringClient", return_value=MagicMock())
501503
mocker.patch("DIRAC.WorkloadManagementSystem.Agent.JobAgent.createJobWrapper", return_value=S_OK([jobName]))
502504
mocker.patch("DIRAC.Core.Security.X509Chain.X509Chain.dumpAllToString", return_value=S_OK())
503505

0 commit comments

Comments
 (0)