fix (FTS3Agent): add a monitoring delay, and avoid monitorig multiple

chaen · chaen · commit 2247c6f4b01c · 2023-02-24T13:29:01.000+01:00
times per loop
diff --git a/src/DIRAC/DataManagementSystem/Agent/FTS3Agent.py b/src/DIRAC/DataManagementSystem/Agent/FTS3Agent.py
@@ -12,6 +12,7 @@
   :caption: FTS3Agent options
 
 """
+import datetime
 import errno
 import os
 from urllib import parse
@@ -54,6 +55,10 @@
 # when running multiple agents, we rather do it in steps
 JOB_MONITORING_BATCH_SIZE = 20
 
+# We do not monitor a job more often
+# than MONITORING_DELAY in minutes
+MONITORING_DELAY = 10
+
 
 class FTS3Agent(AgentModule):
     """
@@ -299,18 +304,29 @@ def monitorJobsLoop(self):
         if mod:
             nbOfLoops += 1
 
+        # Not only is it pointless to monitor right after submission
+        # but also we would end up fetching multiple time the same job otherwise
+        # as we call getActiveJobs by batch
+        lastMonitor = datetime.datetime.utcnow() - datetime.timedelta(minutes=MONITORING_DELAY)
+
         log.debug("Getting active jobs")
 
         for loopId in range(nbOfLoops):
             log.info("Getting next batch of jobs to monitor", f"{loopId}/{nbOfLoops}")
             # get jobs from DB
-            res = self.fts3db.getActiveJobs(limit=JOB_MONITORING_BATCH_SIZE, jobAssignmentTag=self.assignmentTag)
+            res = self.fts3db.getActiveJobs(
+                limit=JOB_MONITORING_BATCH_SIZE, lastMonitor=lastMonitor, jobAssignmentTag=self.assignmentTag
+            )
 
             if not res["OK"]:
                 log.error("Could not retrieve ftsJobs from the DB", res)
                 return res
 
             activeJobs = res["Value"]
+            if not activeJobs:
+                log.info("No more jobs to monitor")
+                break
+
             log.info("Jobs queued for monitoring", len(activeJobs))
 
             # We store here the AsyncResult object on which we are going to wait