Skip to content

Commit 108861c

Browse files
authored
Merge pull request #7518 from fstagni/fixMemoryReporting
[8.0] Fix memory reporting
2 parents b8e8dfa + 27f2f18 commit 108861c

File tree

1 file changed

+17
-44
lines changed
  • src/DIRAC/WorkloadManagementSystem/JobWrapper

1 file changed

+17
-44
lines changed

src/DIRAC/WorkloadManagementSystem/JobWrapper/Watchdog.py

Lines changed: 17 additions & 44 deletions
Original file line number | Diff line number | Diff line change
@@ -1,8 +1,3 @@
1-
########################################################################
2-
# File : Watchdog.py
3-
# Author: Stuart Paterson
4-
########################################################################
5-
61
""" The Watchdog class is used by the Job Wrapper to resolve and monitor
72
the system resource consumption. The Watchdog can determine if
83
a running job is stalled and indicate this to the Job Wrapper.
@@ -22,7 +17,6 @@
2217
import math
2318
import os
2419
import re
25-
import resource
2620
import socket
2721
import time
2822
from pathlib import Path
@@ -287,12 +281,15 @@ def _performChecks(self):
287281
self.parameters["LoadAverage"] = []
288282
self.parameters["LoadAverage"].append(loadAvg)
289283

290-
memoryUsed = self.getMemoryUsed()
291-
msg += f"MemUsed: {memoryUsed:.1f} kb "
292-
heartBeatDict["MemoryUsed"] = memoryUsed
293-
if "MemoryUsed" not in self.parameters:
294-
self.parameters["MemoryUsed"] = []
295-
self.parameters["MemoryUsed"].append(memoryUsed)
284+
result = self.profiler.memoryUsage(withChildren=True)
285+
if not result["OK"]:
286+
self.log.warn("Could not get rss info from profiler", result["Message"])
287+
else:
288+
msg += f"MemUsed: {result['Value']:.1f} MB "
289+
heartBeatDict["MemoryUsed"] = result["Value"]
290+
if "MemoryUsed" not in self.parameters:
291+
self.parameters["MemoryUsed"] = []
292+
self.parameters["MemoryUsed"].append(result["Value"])
296293

297294
result = self.profiler.vSizeUsage(withChildren=True)
298295
if not result["OK"]:
@@ -302,17 +299,7 @@ def _performChecks(self):
302299
heartBeatDict["Vsize"] = vsize
303300
self.parameters.setdefault("Vsize", [])
304301
self.parameters["Vsize"].append(vsize)
305-
msg += f"Job Vsize: {vsize:.1f} kb "
306-
307-
result = self.profiler.memoryUsage(withChildren=True)
308-
if not result["OK"]:
309-
self.log.warn("Could not get rss info from profiler", result["Message"])
310-
else:
311-
rss = result["Value"] * 1024.0
312-
heartBeatDict["RSS"] = rss
313-
self.parameters.setdefault("RSS", [])
314-
self.parameters["RSS"].append(rss)
315-
msg += f"Job RSS: {rss:.1f} kb "
302+
msg += f"Job Vsize: {vsize:.1f} MB "
316303

317304
if "DiskSpace" not in self.parameters:
318305
self.parameters["DiskSpace"] = []
@@ -666,7 +653,7 @@ def __checkMemoryLimit(self):
666653
if vsize and self.memoryLimit:
667654
if vsize > self.memoryLimit:
668655
# Just a warning for the moment
669-
self.log.warn(f"Job has consumed {vsize:f}.2 KB of memory with the limit of {self.memoryLimit:f}.2 KB")
656+
self.log.warn(f"Job has consumed {vsize:f}.2 MB of memory with the limit of {self.memoryLimit:f}.2 MB")
670657

671658
return S_OK()
672659

@@ -744,27 +731,21 @@ def calibrate(self):
744731
self.initialValues["LoadAverage"] = float(os.getloadavg()[0])
745732
self.parameters["LoadAverage"] = []
746733

747-
memUsed = self.getMemoryUsed()
748-
749-
self.initialValues["MemoryUsed"] = memUsed
750-
self.parameters["MemoryUsed"] = []
751-
752734
result = self.profiler.vSizeUsage(withChildren=True)
753735
if not result["OK"]:
754736
self.log.warn("Could not get vSize info from profiler", result["Message"])
755737
else:
756738
vsize = result["Value"] * 1024.0
757739
self.initialValues["Vsize"] = vsize
758-
self.log.verbose("Vsize(kb)", f"{vsize:.1f}")
740+
self.log.verbose("Vsize(MB)", f"{vsize:.1f}")
759741
self.parameters["Vsize"] = []
760742

761743
result = self.profiler.memoryUsage(withChildren=True)
762744
if not result["OK"]:
763745
self.log.warn("Could not get rss info from profiler", result["Message"])
764746
else:
765-
rss = result["Value"] * 1024.0
766-
self.initialValues["RSS"] = rss
767-
self.log.verbose("RSS(kb)", f"{rss:.1f}")
747+
self.initialValues["RSS"] = result["Value"]
748+
self.log.verbose("RSS(MB)", f"{result['Value']:.1f}")
768749
self.parameters["RSS"] = []
769750

770751
# We exclude fuse so that mountpoints can be cleaned up by automount after a period unused
@@ -847,9 +828,9 @@ def __getUsageSummary(self):
847828
if "MemoryUsed" in self.parameters:
848829
memory = self.parameters["MemoryUsed"]
849830
if memory:
850-
summary["MemoryUsed(kb)"] = abs(float(memory[-1]) - float(self.initialValues["MemoryUsed"]))
831+
summary["MemoryUsed(MB)"] = abs(float(memory[-1]) - float(self.initialValues["MemoryUsed"]))
851832
else:
852-
summary["MemoryUsed(kb)"] = math.nan
833+
summary["MemoryUsed(MB)"] = math.nan
853834
# LoadAverage
854835
if "LoadAverage" in self.parameters:
855836
laList = self.parameters["LoadAverage"]
@@ -956,7 +937,7 @@ def getNodeInformation(self):
956937
"""Retrieves all static system information"""
957938
result = {}
958939
result["HostName"] = socket.gethostname()
959-
result["Memory(kB)"] = int(psutil.virtual_memory()[1] / 1024)
940+
result["Memory(MB)"] = int(psutil.virtual_memory()[1] / 1024 / 1024)
960941
result["LocalAccount"] = getpass.getuser()
961942

962943
path = Path("/proc/cpuinfo")
@@ -968,14 +949,6 @@ def getNodeInformation(self):
968949

969950
return result
970951

971-
#############################################################################
972-
def getMemoryUsed(self):
973-
"""Obtains the memory used."""
974-
mem = (
975-
resource.getrusage(resource.RUSAGE_SELF).ru_maxrss + resource.getrusage(resource.RUSAGE_CHILDREN).ru_maxrss
976-
)
977-
return float(mem)
978-
979952
#############################################################################
980953
def getDiskSpace(self, exclude=None):
981954
"""Obtains the available disk space."""

0 commit comments

Comments
 (0)