1
- ########################################################################
2
- # File : Watchdog.py
3
- # Author: Stuart Paterson
4
- ########################################################################
5
-
6
1
""" The Watchdog class is used by the Job Wrapper to resolve and monitor
7
2
the system resource consumption. The Watchdog can determine if
8
3
a running job is stalled and indicate this to the Job Wrapper.
22
17
import math
23
18
import os
24
19
import re
25
- import resource
26
20
import socket
27
21
import time
28
22
from pathlib import Path
@@ -287,12 +281,15 @@ def _performChecks(self):
287
281
self .parameters ["LoadAverage" ] = []
288
282
self .parameters ["LoadAverage" ].append (loadAvg )
289
283
290
- memoryUsed = self .getMemoryUsed ()
291
- msg += f"MemUsed: { memoryUsed :.1f} kb "
292
- heartBeatDict ["MemoryUsed" ] = memoryUsed
293
- if "MemoryUsed" not in self .parameters :
294
- self .parameters ["MemoryUsed" ] = []
295
- self .parameters ["MemoryUsed" ].append (memoryUsed )
284
+ result = self .profiler .memoryUsage (withChildren = True )
285
+ if not result ["OK" ]:
286
+ self .log .warn ("Could not get rss info from profiler" , result ["Message" ])
287
+ else :
288
+ msg += f"MemUsed: { result ['Value' ]:.1f} MB "
289
+ heartBeatDict ["MemoryUsed" ] = result ["Value" ]
290
+ if "MemoryUsed" not in self .parameters :
291
+ self .parameters ["MemoryUsed" ] = []
292
+ self .parameters ["MemoryUsed" ].append (result ["Value" ])
296
293
297
294
result = self .profiler .vSizeUsage (withChildren = True )
298
295
if not result ["OK" ]:
@@ -302,17 +299,7 @@ def _performChecks(self):
302
299
heartBeatDict ["Vsize" ] = vsize
303
300
self .parameters .setdefault ("Vsize" , [])
304
301
self .parameters ["Vsize" ].append (vsize )
305
- msg += f"Job Vsize: { vsize :.1f} kb "
306
-
307
- result = self .profiler .memoryUsage (withChildren = True )
308
- if not result ["OK" ]:
309
- self .log .warn ("Could not get rss info from profiler" , result ["Message" ])
310
- else :
311
- rss = result ["Value" ] * 1024.0
312
- heartBeatDict ["RSS" ] = rss
313
- self .parameters .setdefault ("RSS" , [])
314
- self .parameters ["RSS" ].append (rss )
315
- msg += f"Job RSS: { rss :.1f} kb "
302
+ msg += f"Job Vsize: { vsize :.1f} MB "
316
303
317
304
if "DiskSpace" not in self .parameters :
318
305
self .parameters ["DiskSpace" ] = []
@@ -666,7 +653,7 @@ def __checkMemoryLimit(self):
666
653
if vsize and self .memoryLimit :
667
654
if vsize > self .memoryLimit :
668
655
# Just a warning for the moment
669
- self .log .warn (f"Job has consumed { vsize :f} .2 KB of memory with the limit of { self .memoryLimit :f} .2 KB " )
656
+ self.log.warn(f"Job has consumed {vsize:.2f} MB of memory with the limit of {self.memoryLimit:.2f} MB ")  # ":.2f" prints two decimals; ":f" followed by ".2" would print six decimals and a literal ".2"
670
657
671
658
return S_OK ()
672
659
@@ -744,27 +731,21 @@ def calibrate(self):
744
731
self .initialValues ["LoadAverage" ] = float (os .getloadavg ()[0 ])
745
732
self .parameters ["LoadAverage" ] = []
746
733
747
- memUsed = self .getMemoryUsed ()
748
-
749
- self .initialValues ["MemoryUsed" ] = memUsed
750
- self .parameters ["MemoryUsed" ] = []
751
-
752
734
result = self .profiler .vSizeUsage (withChildren = True )
753
735
if not result ["OK" ]:
754
736
self .log .warn ("Could not get vSize info from profiler" , result ["Message" ])
755
737
else :
756
738
vsize = result ["Value" ] * 1024.0
757
739
self .initialValues ["Vsize" ] = vsize
758
- self .log .verbose ("Vsize(kb )" , f"{ vsize :.1f} " )
740
+ self .log .verbose ("Vsize(MB )" , f"{ vsize :.1f} " )
759
741
self .parameters ["Vsize" ] = []
760
742
761
743
result = self .profiler .memoryUsage (withChildren = True )
762
744
if not result ["OK" ]:
763
745
self .log .warn ("Could not get rss info from profiler" , result ["Message" ])
764
746
else :
765
- rss = result ["Value" ] * 1024.0
766
- self .initialValues ["RSS" ] = rss
767
- self .log .verbose ("RSS(kb)" , f"{ rss :.1f} " )
747
+ self .initialValues ["RSS" ] = result ["Value" ]
748
+ self .log .verbose ("RSS(MB)" , f"{ result ['Value' ]:.1f} " )
768
749
self .parameters ["RSS" ] = []
769
750
770
751
# We exclude fuse so that mountpoints can be cleaned up by automount after a period unused
@@ -847,9 +828,9 @@ def __getUsageSummary(self):
847
828
if "MemoryUsed" in self .parameters :
848
829
memory = self .parameters ["MemoryUsed" ]
849
830
if memory :
850
- summary ["MemoryUsed(kb )" ] = abs (float (memory [- 1 ]) - float (self .initialValues ["MemoryUsed" ]))
831
+ summary ["MemoryUsed(MB )" ] = abs (float (memory [- 1 ]) - float (self .initialValues ["MemoryUsed" ]))
851
832
else :
852
- summary ["MemoryUsed(kb )" ] = math .nan
833
+ summary ["MemoryUsed(MB )" ] = math .nan
853
834
# LoadAverage
854
835
if "LoadAverage" in self .parameters :
855
836
laList = self .parameters ["LoadAverage" ]
@@ -956,7 +937,7 @@ def getNodeInformation(self):
956
937
"""Retrieves all static system information"""
957
938
result = {}
958
939
result ["HostName" ] = socket .gethostname ()
959
- result ["Memory(kB )" ] = int (psutil .virtual_memory ()[1 ] / 1024 )
940
+ result ["Memory(MB )" ] = int (psutil .virtual_memory ()[1 ] / 1024 / 1024 )
960
941
result ["LocalAccount" ] = getpass .getuser ()
961
942
962
943
path = Path ("/proc/cpuinfo" )
@@ -968,14 +949,6 @@ def getNodeInformation(self):
968
949
969
950
return result
970
951
971
- #############################################################################
972
- def getMemoryUsed (self ):
973
- """Obtains the memory used."""
974
- mem = (
975
- resource .getrusage (resource .RUSAGE_SELF ).ru_maxrss + resource .getrusage (resource .RUSAGE_CHILDREN ).ru_maxrss
976
- )
977
- return float (mem )
978
-
979
952
#############################################################################
980
953
def getDiskSpace (self , exclude = None ):
981
954
"""Obtains the available disk space."""
0 commit comments