Skip to content

Commit bf0242c

Browse files
committed
kill: Handle supervisor procs when killing runs
This is a follow-up to ff615aa, which only handled killing individual jobs. Since we're using the results server for all run and job metadata, we can drop all mentions of the archive. This change is necessary since we've restricted access to the archive from the teuthology machine for normal users, to avoid resource contention.

Signed-off-by: Zack Cerza <zack@cerza.org>
1 parent 0d6e6fd commit bf0242c

2 files changed

Lines changed: 28 additions & 68 deletions

File tree

scripts/kill.py

Lines changed: 5 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -5,10 +5,10 @@
55

66
doc = """
77
usage: teuthology-kill -h
8-
teuthology-kill [-a ARCHIVE] [-p] -r RUN
9-
teuthology-kill [-a ARCHIVE] [-p] -m MACHINE_TYPE -r RUN
10-
teuthology-kill [-a ARCHIVE] [-o OWNER] -r RUN -j JOB ...
11-
teuthology-kill [-a ARCHIVE] [-o OWNER] -J JOBSPEC
8+
teuthology-kill [-p] -r RUN
9+
teuthology-kill [-p] -m MACHINE_TYPE -r RUN
10+
teuthology-kill [-o OWNER] -r RUN -j JOB ...
11+
teuthology-kill [-o OWNER] -J JOBSPEC
1212
teuthology-kill [-p] -o OWNER -m MACHINE_TYPE -r RUN
1313
1414
Kill running teuthology jobs:
@@ -21,9 +21,6 @@
2121
2222
optional arguments:
2323
-h, --help show this help message and exit
24-
-a ARCHIVE, --archive ARCHIVE
25-
The base archive directory
26-
[default: {archive_base}]
2724
-p, --preserve-queue Preserve the queue - do not delete queued jobs
2825
-r, --run RUN The name(s) of the run(s) to kill
2926
-j, --job JOB The job_id of the job to kill
@@ -36,7 +33,7 @@
3633
The type of machine the job(s) are running on.
3734
This is required if killing a job that is still
3835
entirely in the queue.
39-
""".format(archive_base=teuthology.config.config.archive_base)
36+
"""
4037

4138

4239
def main():

teuthology/kill.py

Lines changed: 23 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
#!/usr/bin/python
2-
import os
32
import sys
43
import yaml
54
import psutil
@@ -23,7 +22,6 @@ def main(args):
2322
run_name = args['--run']
2423
job = args['--job']
2524
jobspec = args['--jobspec']
26-
archive_base = args['--archive']
2725
owner = args['--owner']
2826
machine_type = args['--machine-type']
2927
preserve_queue = args['--preserve-queue']
@@ -35,42 +33,35 @@ def main(args):
3533

3634
if job:
3735
for job_id in job:
38-
kill_job(run_name, job_id, archive_base, owner)
36+
kill_job(
37+
run_name,
38+
job_id,
39+
owner
40+
)
3941
else:
40-
kill_run(run_name, archive_base, owner, machine_type,
41-
preserve_queue=preserve_queue)
42+
kill_run(
43+
run_name,
44+
owner,
45+
machine_type,
46+
preserve_queue=preserve_queue,
47+
)
4248

4349

44-
def kill_run(run_name, archive_base=None, owner=None, machine_type=None,
50+
def kill_run(run_name, owner=None, machine_type=None,
4551
preserve_queue=False):
46-
run_info = {}
47-
serializer = report.ResultsSerializer(archive_base)
48-
if archive_base:
49-
run_archive_dir = os.path.join(archive_base, run_name)
50-
if os.path.isdir(run_archive_dir):
51-
run_info = find_run_info(serializer, run_name)
52-
if 'machine_type' in run_info:
53-
machine_type = run_info['machine_type']
54-
owner = run_info['owner']
55-
else:
56-
log.warning("The run info does not have machine type: %s" % run_info)
57-
log.warning("Run archive used: %s" % run_archive_dir)
58-
log.info("Using machine type '%s' and owner '%s'" % (machine_type, owner))
59-
elif machine_type is None:
60-
# no jobs found in archive and no machine type specified,
61-
# so we try paddles to see if there is anything scheduled
62-
run_info = report.ResultsReporter().get_run(run_name)
63-
machine_type = run_info.get('machine_type', None)
64-
if machine_type:
65-
log.info(f"Using machine type '{machine_type}' received from paddles.")
66-
else:
67-
raise RuntimeError(f"Cannot find machine type for the run {run_name}; " +
68-
"you must also pass --machine-type")
52+
run_info = report.ResultsReporter().get_run(run_name)
53+
# run: machine_type, owner
54+
# job: pid, id
55+
machine_type = run_info.get('machine_type', None)
6956

7057
if not preserve_queue:
7158
remove_beanstalk_jobs(run_name, machine_type)
7259
remove_paddles_jobs(run_name)
73-
if kill_processes(run_name, run_info.get('pids')):
60+
pids = []
61+
for job in run_info['jobs']:
62+
if pid := job.get('pid'):
63+
pids.append(int(pid))
64+
if kill_processes(run_name, pids):
7465
return
7566
if owner is not None:
7667
targets = find_targets(run_name)
@@ -79,7 +70,7 @@ def kill_run(run_name, archive_base=None, owner=None, machine_type=None,
7970
report.try_mark_run_dead(run_name)
8071

8172

82-
def kill_job(run_name, job_id, archive_base=None, owner=None, skip_unlock=False):
73+
def kill_job(run_name, job_id, owner=None, skip_unlock=False):
8374
job_info = report.ResultsReporter().get_jobs(run_name, job_id)
8475
if not owner:
8576
if 'owner' not in job_info:
@@ -111,34 +102,6 @@ def kill_job(run_name, job_id, archive_base=None, owner=None, skip_unlock=False)
111102
lock_ops.unlock_safe(list(targets.keys()), owner, run_name, job_id)
112103

113104

114-
def find_run_info(serializer, run_name):
115-
log.info("Assembling run information...")
116-
run_info_fields = [
117-
'machine_type',
118-
'owner',
119-
]
120-
121-
pids = []
122-
run_info = {}
123-
job_info = {}
124-
job_num = 0
125-
jobs = serializer.jobs_for_run(run_name)
126-
job_total = len(jobs)
127-
for (job_id, job_dir) in jobs.items():
128-
if not os.path.isdir(job_dir):
129-
continue
130-
job_num += 1
131-
beanstalk.print_progress(job_num, job_total, 'Reading Job: ')
132-
job_info = serializer.job_info(run_name, job_id, simple=True)
133-
for key in job_info.keys():
134-
if key in run_info_fields and key not in run_info:
135-
run_info[key] = job_info[key]
136-
if 'pid' in job_info:
137-
pids.append(job_info['pid'])
138-
run_info['pids'] = pids
139-
return run_info
140-
141-
142105
def remove_paddles_jobs(run_name):
143106
jobs = report.ResultsReporter().get_jobs(run_name, fields=['status'])
144107
job_ids = [job['job_id'] for job in jobs if job['status'] == 'queued']
@@ -229,7 +192,7 @@ def kill_processes(run_name, pids=None, job_id=None):
229192
def process_matches_run(pid, run_name):
230193
try:
231194
p = psutil.Process(pid)
232-
cmd = p.cmdline()
195+
cmd = ' '.join(p.cmdline())
233196
if run_name in cmd and sys.argv[0] not in cmd:
234197
return True
235198
except psutil.NoSuchProcess:

0 commit comments

Comments
 (0)