-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy patholcf-s3m-compute.py
More file actions
executable file
·130 lines (114 loc) · 4.91 KB
/
olcf-s3m-compute.py
File metadata and controls
executable file
·130 lines (114 loc) · 4.91 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
#!/usr/bin/env python3
import argparse
import json
import sys
from os import getenv
from pathlib import Path

from olcf_s3m_api.client import OLCFAPIClient
from olcf_s3m_api.compute import ComputeService
def list_jobs(service: ComputeService):
    """Print every compute job reported by the service as indented JSON.

    Args:
        service: Compute service whose ``list_jobs()`` returns a
            ``(success, payload)`` pair. On success the payload is iterated
            and each item is rendered with ``json.dumps(..., indent=4)``.

    A failed request is reported on stderr rather than dropped silently, so
    "no jobs" can be told apart from "the request failed".
    """
    print('++++ OLCF S3M - Compute Job Orchestration ++++ Listing Jobs')
    success, msg = service.list_jobs()
    if success:
        for job in msg:
            print(json.dumps(job, indent=4))
            # Blank separator between consecutive job records.
            print('\n')
    else:
        # On failure the payload is whatever the service handed back with the
        # failure flag; show it verbatim instead of discarding it.
        print(f'Failed to list jobs: {msg}', file=sys.stderr)
    print('\n\n')
def list_queues(service: ComputeService):
    """Print the compute queues reported by the service.

    Args:
        service: Compute service whose ``list_queues()`` returns a
            ``(success, payload)`` pair. On success the payload is printed
            as-is.

    A failed request is reported on stderr rather than dropped silently.
    """
    print('++++ OLCF S3M - Compute Job Orchestration ++++ Listing Queues')
    success, msg = service.list_queues()
    if success:
        print(msg)
    else:
        # Surface the failure payload instead of printing nothing at all.
        print(f'Failed to list queues: {msg}', file=sys.stderr)
    print('\n\n')
def submit(service: ComputeService,
           job_script: str,
           queue: str,
           project: str,
           workdir: str,
           nodes: int,
           walltime: int):
    """Submit the contents of a local job script as a compute job.

    Args:
        service: Compute service whose ``submit_job(...)`` returns a
            ``(success, payload)`` pair.
        job_script: Path to the job script file. Its stem becomes the job
            name and its text becomes the script contents.
        queue: Name of the queue to submit to.
        project: Project identifier passed through to the service.
        workdir: Working directory passed through to the service.
        nodes: Number of nodes requested.
        walltime: Wall time limit in minutes.

    A missing script file or a failed submission is reported on stderr
    rather than ignored silently.
    """
    print(f'++++ OLCF S3M - Compute Job Orchestration ++++ Submitting a {nodes}-node Job')
    jobfile = Path(job_script)
    # is_file() is already False for a path that does not exist, so a
    # separate exists() check is unnecessary.
    if jobfile.is_file():
        # Example environment variables forwarded with every submission.
        jobenv = ["EXAMPLE_VARIABLE_1=test1", "EXAMPLE_VARIABLE_2=/some/interesting/path"]
        success, msg = service.submit_job(project=project,
                                          workdir=workdir,
                                          job_name=jobfile.stem,
                                          job_queue=queue,
                                          script_contents=jobfile.read_text(),
                                          time_minutes=walltime,
                                          node_count=nodes,
                                          env_vars=jobenv)
        if success:
            print(f'{msg}\n\n')
        else:
            print(f'Job submission failed: {msg}', file=sys.stderr)
    else:
        # Nothing was submitted; say so instead of returning without a trace.
        print(f'Job script not found or not a regular file: {job_script}',
              file=sys.stderr)
    print('\n\n')
def cancel(service: ComputeService, job_id: str):
    """Request cancellation of a job and print the service's response.

    Args:
        service: Compute service whose ``cancel_job(jobid=...)`` returns a
            ``(success, payload)`` pair.
        job_id: Identifier of the job to cancel.

    A failed request is reported on stderr rather than dropped silently.
    """
    print('++++ OLCF S3M - Compute Job Orchestration ++++ Cancelling Job')
    success, msg = service.cancel_job(jobid=job_id)
    if success:
        print(msg)
    else:
        # Make it visible that the job was NOT cancelled.
        print(f'Failed to cancel job {job_id}: {msg}', file=sys.stderr)
    print('\n\n')
def job_info(service: ComputeService, job_id: str):
    """Print the service's information about one job as indented JSON.

    Args:
        service: Compute service whose ``get_job_info(jobid=...)`` returns a
            ``(success, payload)`` pair.
        job_id: Identifier of the job to look up.

    A failed request is reported on stderr rather than dropped silently.
    """
    print('++++ OLCF S3M - Compute Job Orchestration ++++ Getting Job Information')
    success, msg = service.get_job_info(jobid=job_id)
    if success:
        # Serialize only after success is known: a failure payload is not
        # guaranteed to be JSON-serializable and must not raise here.
        print(json.dumps(msg, indent=4))
    else:
        print(f'Failed to get information for job {job_id}: {msg}', file=sys.stderr)
    print('\n\n')
def status(service: ComputeService, queue: str = None):
    """Print the status of one queue, or of the whole compute system.

    Args:
        service: Compute service providing ``get_queue_status(queue)`` and
            ``get_system_status()``, each returning a ``(success, result)``
            pair.
        queue: Queue to query. When falsy (``None`` or an empty string) the
            system-wide status is requested instead.

    A queue result is rendered as indented JSON; a system result is rendered
    through its ``msg()`` method. A failed request is reported on stderr
    rather than dropped silently.
    """
    if queue:
        print('++++ OLCF S3M - Compute Job Orchestration ++++ Getting Compute Queue Status')
        success, result = service.get_queue_status(queue)
        if success:
            result = json.dumps(result, indent=4)
    else:
        print('++++ OLCF S3M - Compute Job Orchestration ++++ Getting Compute System Status')
        success, result = service.get_system_status()
        if success:
            # Only a successful result is known to expose msg(); calling it
            # on a failure payload could raise before anything is reported.
            result = result.msg()
    if success:
        print(result)
    else:
        print(f'Failed to get compute status: {result}', file=sys.stderr)
    print('\n\n')
def main(args):
    """Run the single compute operation selected by the parsed arguments.

    Args:
        args: Parsed command-line namespace providing ``system``, ``queue``,
            ``job``, ``nodecount``, ``walltime`` and the boolean switches
            ``cancel``, ``joblist``, ``queuelist`` and ``submit``.

    The API token, project and working directory are read from the
    ``olcf_s3m_api_TOKEN``, ``olcf_s3m_api_PROJECT`` and
    ``olcf_s3m_api_WORKDIR`` environment variables, with placeholder values
    (or the current directory) used when they are unset.

    Switches are honoured in the order cancel, joblist, queuelist, submit.
    With no switch set, a job argument other than the ``"invalid-job"``
    placeholder selects a job lookup, otherwise a queue argument other than
    ``"invalid-queue"`` selects a queue status query, otherwise the system
    status is shown.
    """
    system_name, queue_name, job_ref = args.system, args.queue, args.job

    client = OLCFAPIClient(api_token=getenv("olcf_s3m_api_TOKEN", "InvalidToken"))
    project = getenv("olcf_s3m_api_PROJECT", "InvalidProject")
    workdir = getenv("olcf_s3m_api_WORKDIR", str(Path.cwd()))
    compute = ComputeService(cluster_name=system_name, api_client=client)

    # Explicit action switches: the first one set wins.
    if args.cancel:
        cancel(compute, job_ref)
        return
    if args.joblist:
        list_jobs(compute)
        return
    if args.queuelist:
        list_queues(compute)
        return
    if args.submit:
        submit(service=compute,
               job_script=job_ref,
               queue=queue_name,
               project=project,
               workdir=workdir,
               nodes=args.nodecount,
               walltime=args.walltime)
        return

    # No switch given: choose the query from which positionals were supplied.
    if job_ref != "invalid-job":
        job_info(compute, job_ref)
    elif queue_name != "invalid-queue":
        status(compute, queue=queue_name)
    else:
        status(compute)
if __name__ == '__main__':
    # Script entry point: build the command-line parser, parse, hand to main().
    parser = argparse.ArgumentParser('olcf-s3m-compute')

    # Boolean action switches (short flag, long flag, help text).
    for short_flag, long_flag, description in (
        ('-c', '--cancel', 'cancel a specific job'),
        ('-j', '--joblist', 'list compute jobs for system'),
        ('-q', '--queuelist', 'list compute queues for system'),
        ('-s', '--submit', 'submit a job to the system'),
    ):
        parser.add_argument(short_flag, long_flag, help=description, action='store_true')

    # Optional positionals, consumed in this order; each falls back to its
    # default when omitted.
    for name, description, extra in (
        ('system', 'name of the target HPC system', {'default': 'invalid-system'}),
        ('queue', 'job queue to use on the target system', {'default': 'invalid-queue'}),
        ('job', 'job script file to submit or existing job id', {'default': 'invalid-job'}),
        ('nodecount', 'number of compute nodes for job submission', {'type': int, 'default': 1}),
        ('walltime', 'wall time in minutes for job submission', {'type': int, 'default': 5}),
    ):
        parser.add_argument(name, nargs='?', help=description, **extra)

    args = parser.parse_args()
    main(args)