1
- ########################################################################
2
- # File : Watchdog.py
3
- # Author: Stuart Paterson
4
- ########################################################################
5
-
6
1
""" The Watchdog class is used by the Job Wrapper to resolve and monitor
7
2
the system resource consumption. The Watchdog can determine if
8
3
a running job is stalled and indicate this to the Job Wrapper.
22
17
import math
23
18
import os
24
19
import re
25
- import resource
26
20
import socket
27
21
import time
28
22
from pathlib import Path
@@ -287,12 +281,14 @@ def _performChecks(self):
287
281
self .parameters ["LoadAverage" ] = []
288
282
self .parameters ["LoadAverage" ].append (loadAvg )
289
283
# Sample the job's memory usage through the profiler (withChildren=True is
# passed so the query covers child processes) and record it in the heartbeat
# message, the heartbeat dictionary and the per-job parameter history.
# NOTE(review): the calibration step logs this same profiler value as
# "RSS(mb)", while the message below labels it "kb" -- confirm which unit the
# heartbeat consumers expect before changing either the label or the value.
result = self.profiler.memoryUsage(withChildren=True)
if not result["OK"]:
    # A failed result is only known to provide "Message"; the other profiler
    # calls in this file read "Value" exclusively on success, so nothing is
    # recorded for this cycle instead of failing on a missing key.
    self.log.warn("Could not get rss info from profiler", result["Message"])
else:
    memoryUsed = result["Value"]
    msg += f"MemUsed: {memoryUsed:.1f} kb "
    heartBeatDict["MemoryUsed"] = memoryUsed
    # Create the history list on first use, then append the new sample.
    self.parameters.setdefault("MemoryUsed", []).append(memoryUsed)
296
292
297
293
result = self .profiler .vSizeUsage (withChildren = True )
298
294
if not result ["OK" ]:
@@ -304,16 +300,6 @@ def _performChecks(self):
304
300
self .parameters ["Vsize" ].append (vsize )
305
301
msg += f"Job Vsize: { vsize :.1f} kb "
306
302
307
- result = self .profiler .memoryUsage (withChildren = True )
308
- if not result ["OK" ]:
309
- self .log .warn ("Could not get rss info from profiler" , result ["Message" ])
310
- else :
311
- rss = result ["Value" ] * 1024.0
312
- heartBeatDict ["RSS" ] = rss
313
- self .parameters .setdefault ("RSS" , [])
314
- self .parameters ["RSS" ].append (rss )
315
- msg += f"Job RSS: { rss :.1f} kb "
316
-
317
303
if "DiskSpace" not in self .parameters :
318
304
self .parameters ["DiskSpace" ] = []
319
305
@@ -744,11 +730,6 @@ def calibrate(self):
744
730
self .initialValues ["LoadAverage" ] = float (os .getloadavg ()[0 ])
745
731
self .parameters ["LoadAverage" ] = []
746
732
747
- memUsed = self .getMemoryUsed ()
748
-
749
- self .initialValues ["MemoryUsed" ] = memUsed
750
- self .parameters ["MemoryUsed" ] = []
751
-
752
733
result = self .profiler .vSizeUsage (withChildren = True )
753
734
if not result ["OK" ]:
754
735
self .log .warn ("Could not get vSize info from profiler" , result ["Message" ])
@@ -762,9 +743,8 @@ def calibrate(self):
762
743
if not result ["OK" ]:
763
744
self .log .warn ("Could not get rss info from profiler" , result ["Message" ])
764
745
else :
765
- rss = result ["Value" ] * 1024.0
766
- self .initialValues ["RSS" ] = rss
767
- self .log .verbose ("RSS(kb)" , f"{ rss :.1f} " )
746
+ self .initialValues ["RSS" ] = result ["Value" ]
747
+ self .log .verbose ("RSS(mb)" , f"{ result ['Value' ]:.1f} " )
768
748
self .parameters ["RSS" ] = []
769
749
770
750
# We exclude fuse so that mountpoints can be cleaned up by automount after a period unused
@@ -968,14 +948,6 @@ def getNodeInformation(self):
968
948
969
949
return result
970
950
971
- #############################################################################
972
- def getMemoryUsed (self ):
973
- """Obtains the memory used."""
974
- mem = (
975
- resource .getrusage (resource .RUSAGE_SELF ).ru_maxrss + resource .getrusage (resource .RUSAGE_CHILDREN ).ru_maxrss
976
- )
977
- return float (mem )
978
-
979
951
#############################################################################
980
952
def getDiskSpace (self , exclude = None ):
981
953
"""Obtains the available disk space."""
0 commit comments