Skip to content

Commit b8b3a90

Browse files
committed
Improve MemEff calculation: can now be calculated directly from AllocTRES and TRESUsageInTot
1 parent b535e58 commit b8b3a90

File tree

2 files changed

+38
-15
lines changed

2 files changed

+38
-15
lines changed

slurm2sql.py

Lines changed: 33 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -467,6 +467,23 @@ def calc(row):
467467
raise ValueError('unknown memory type: %s'%reqmem_type)
468468
return mem_max / nodemem
469469

470+
RE_TRES_MEM = re.compile(rf'\bmem=([^,]*)\b')
471+
class slurmMemEff2(linefunc):
472+
"""Slurm memory efficiency (using AllocTRES and TRESUsageInTot columns).
473+
474+
This *does* work in new enough Slurm.
475+
"""
476+
# https://github.com/SchedMD/slurm/blob/master/contribs/seff/seff
477+
type = 'real'
478+
@staticmethod
479+
def calc(row):
480+
m_used = RE_TRES_MEM.search(row['TRESUsageInTot'])
481+
m_alloc = RE_TRES_MEM.search(row['AllocTRES'])
482+
if m_alloc and m_used:
483+
return float_bytes(m_used.group(1)) / float_bytes(m_alloc.group(1))
484+
return None
485+
486+
470487
class slurmCPUEff(linefunc):
471488
# This matches the seff tool currently:
472489
# https://github.com/SchedMD/slurm/blob/master/contribs/seff/seff
@@ -589,6 +606,9 @@ def calc(row):
589606
'MinCPUTask': nullstr,
590607

591608
# Memory related
609+
'_TotalMem': ExtractField('TotalMem', 'TRESUsageInTot', 'mem', float_bytes),
610+
'_AllocMem': ExtractField('AllocMem', 'AllocTRES', 'mem', float_bytes),
611+
'_MemEff': slurmMemEff2, # Calculated from AllocTRES and TRESUsageInTot
592612
'ReqMem': float_bytes, # Requested mem, value from slurm. Sum across all nodes
593613
'_ReqMemNode': slurmMemNode, # Mem per node, computed
594614
'_ReqMemCPU': slurmMemCPU, # Mem per cpu, computed
@@ -598,7 +618,6 @@ def calc(row):
598618
'MaxRSSTask': nullstr,
599619
'MaxPages': int_metric,
600620
'MaxVMSize': slurmmem,
601-
#'_MemEff': slurmMemEff, # Slurm memory efficiency - see above for why this doesn't work
602621

603622
# Disk related
604623
'AveDiskRead': int_bytes,
@@ -882,9 +901,11 @@ def infer_type(cd):
882901
'max(cputime) AS cpu_s_reserved, '
883902
'max(totalcpu) AS cpu_s_used, '
884903
'max(ReqMemNode) AS MemReq, '
885-
'max(ReqMemNode*Elapsed) AS mem_s_reserved, ' # highest of any job
904+
'max(AllocMem) AS AllocMem, '
905+
'max(TotalMem) AS TotalMem, '
886906
'max(MaxRSS) AS MaxRSS, '
887-
'max(MaxRSS) / max(ReqMemNode) AS MemEff, '
907+
'max(MemEff) AS MemEff, '
908+
'max(AllocMem*Elapsed) AS mem_s_reserved, ' # highest of any job
888909
'max(NGpus) AS NGpus, '
889910
'max(NGpus)*max(Elapsed) AS gpu_s_reserved, '
890911
'max(NGpus)*max(Elapsed)*max(GPUutil) AS gpu_s_used, '
@@ -1011,7 +1032,8 @@ def compact_table():
10111032
)
10121033

10131034

1014-
SACCT_DEFAULT_FIELDS = 'JobID,User,State,Start,End,Partition,ExitCodeRaw,NodeList,NCPUS,CPUtime,CPUEff,ReqMem,MaxRSS,ReqGPUS,GPUUtil,TotDiskRead,TotDiskWrite,ReqTRES,AllocTRES,TRESUsageInTot,TRESUsageOutTot'
1035+
SACCT_DEFAULT_FIELDS = 'JobID,User,State,Start,End,Partition,ExitCodeRaw,NodeList,NCPUS,CPUtime,CPUEff,AllocMem,TotalMem,MemEff,ReqGPUS,GPUUtil,TotDiskRead,TotDiskWrite,ReqTRES,AllocTRES,TRESUsageInTot,TRESUsageOutTot'
1036+
SACCT_DEFAULT_FIELDS_LONG = 'JobID,User,State,Start,End,Elapsed,Partition,ExitCodeRaw,NodeList,NCPUS,CPUtime,CPUEff,AllocMem,TotalMem,MemEff,ReqMem,MaxRSS,ReqGPUS,GPUUtil,TotDiskRead,TotDiskWrite,ReqTRES,AllocTRES,TRESUsageInTot,TRESUsageOutTot'
10151037
COMPLETED_STATES = 'CA,CD,DL,F,NF,OOM,PR,RV,TO'
10161038
def sacct_cli(argv=sys.argv[1:], csv_input=None):
10171039
"""A command line that uses slurm2sql to give an sacct-like interface."""
@@ -1026,7 +1048,7 @@ def sacct_cli(argv=sys.argv[1:], csv_input=None):
10261048
parser.add_argument('--db',
10271049
help="Read from this DB. Don't import new data.")
10281050
parser.add_argument('--output', '-o', default=SACCT_DEFAULT_FIELDS,
1029-
help="Fields to output (comma separated list, use '*' for all fields). NOT safe from SQL injection")
1051+
help="Fields to output (comma separated list, use '*' for all fields). NOT safe from SQL injection. If 'long' then some longer default list")
10301052
parser.add_argument('--format', '-f', default=compact_table(),
10311053
help="Output format (see tabulate formats: https://pypi.org/project/tabulate/ (default simple)")
10321054
parser.add_argument('--order',
@@ -1048,6 +1070,8 @@ def sacct_cli(argv=sys.argv[1:], csv_input=None):
10481070
if args.quiet:
10491071
logging.lastResort.setLevel(logging.WARN)
10501072
LOG.debug(args)
1073+
if args.output == 'long':
1074+
args.output = SACCT_DEFAULT_FIELDS_LONG
10511075

10521076
sacct_filter = process_sacct_filter(args, sacct_filter)
10531077

@@ -1079,8 +1103,6 @@ def seff_cli(argv=sys.argv[1:], csv_input=None):
10791103
jobs, use "--completed -S now-1week" (a start time must be
10801104
given with --completed because of how sacct works).
10811105
1082-
MemReqGiB is amount requested per node (to compare with MaxRSSGiB).
1083-
10841106
This only queries jobs with an End time (unlike most other commands).
10851107
10861108
If a single argument is given, and it
@@ -1140,8 +1162,8 @@ def seff_cli(argv=sys.argv[1:], csv_input=None):
11401162
round(sum(Elapsed*NCPUS)/86400,1) AS cpu_day,
11411163
printf("%2.0f%%", 100*sum(Elapsed*NCPUS*CPUEff)/sum(Elapsed*NCPUS)) AS CPUEff,
11421164
1143-
round(sum(Elapsed*MemReq)/1073741824/86400,1) AS mem_GiB_day,
1144-
printf("%2.0f%%", 100*sum(Elapsed*MemReq*MemEff)/sum(Elapsed*MemReq)) AS MemEff,
1165+
round(sum(Elapsed*AllocMem)/1073741824/86400,1) AS mem_GiB_day,
1166+
printf("%2.0f%%", 100*sum(Elapsed*AllocMem*MemEff)/sum(Elapsed*AllocMem)) AS MemEff,
11451167
11461168
round(sum(Elapsed*NGPUs)/86400,1) AS gpu_day,
11471169
iif(sum(NGpus), printf("%2.0f%%", 100*sum(Elapsed*NGPUs*GPUeff)/sum(Elapsed*NGPUs)), NULL) AS GPUEff,
@@ -1169,8 +1191,8 @@ def seff_cli(argv=sys.argv[1:], csv_input=None):
11691191
NCPUS,
11701192
printf("%3.0f%%",round(CPUeff, 2)*100) AS "CPUeff",
11711193
1172-
round(MemReq/1073741824,2) AS MemReqGiB,
1173-
round(MaxRSS/1073741824,2) AS MaxRSSGiB,
1194+
round(AllocMem/1073741824,2) AS MemAllocGiB,
1195+
round(TotalMem/1073741824,2) AS MemTotGiB,
11741196
printf("%3.0f%%",round(MemEff,2)*100) AS MemEff,
11751197
11761198
NGpus,

test.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -230,14 +230,15 @@ def test_seff(db, capsys):
230230

231231
def test_seff_mem(db, capsys):
232232
data = """
233-
JobID,End,NNodes,NCPUS,ReqMem,MaxRSS
234-
111,1970-01-01T00:00:00,1,1,10G,
235-
111.2,,1,1,,8G
233+
JobID,End,NNodes,NCPUS,ReqMem,MaxRSS,AllocTRES,TRESUsageInTot
234+
111,1970-01-01T00:00:00,1,1,10G,,mem=10G,
235+
111.2,,1,1,,8G,mem=10G,mem=6G
236236
"""
237+
# Changed 2025-04-23: no longer uses ReqMem and MaxRSS but AllocTRES and TRESUsageInTot
237238
slurm2sql.seff_cli(argv=[], csv_input=csvdata(data))
238239
captured = capsys.readouterr()
239240
assert '111' in captured.out
240-
assert '80%' in captured.out
241+
assert '60%' in captured.out
241242

242243
def test_seff_gpu(db, capsys):
243244
data = """

0 commit comments

Comments
 (0)