Skip to content

Commit 6821cc8

Browse files
authored
Merge pull request #2130 from ceph/kill-multi-supervisor
kill: Handle supervisor procs when killing runs
2 parents 9d51f37 + bf0242c commit 6821cc8

2 files changed

Lines changed: 28 additions & 68 deletions

File tree

scripts/kill.py

Lines changed: 5 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -5,10 +5,10 @@
55

66
doc = """
77
usage: teuthology-kill -h
8-
teuthology-kill [-a ARCHIVE] [-p] -r RUN
9-
teuthology-kill [-a ARCHIVE] [-p] -m MACHINE_TYPE -r RUN
10-
teuthology-kill [-a ARCHIVE] [-o OWNER] -r RUN -j JOB ...
11-
teuthology-kill [-a ARCHIVE] [-o OWNER] -J JOBSPEC
8+
teuthology-kill [-p] -r RUN
9+
teuthology-kill [-p] -m MACHINE_TYPE -r RUN
10+
teuthology-kill [-o OWNER] -r RUN -j JOB ...
11+
teuthology-kill [-o OWNER] -J JOBSPEC
1212
teuthology-kill [-p] -o OWNER -m MACHINE_TYPE -r RUN
1313
1414
Kill running teuthology jobs:
@@ -21,9 +21,6 @@
2121
2222
optional arguments:
2323
-h, --help show this help message and exit
24-
-a ARCHIVE, --archive ARCHIVE
25-
The base archive directory
26-
[default: {archive_base}]
2724
-p, --preserve-queue Preserve the queue - do not delete queued jobs
2825
-r, --run RUN The name(s) of the run(s) to kill
2926
-j, --job JOB The job_id of the job to kill
@@ -36,7 +33,7 @@
3633
The type of machine the job(s) are running on.
3734
This is required if killing a job that is still
3835
entirely in the queue.
39-
""".format(archive_base=teuthology.config.config.archive_base)
36+
"""
4037

4138

4239
def main():

teuthology/kill.py

Lines changed: 23 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
#!/usr/bin/python
2-
import os
32
import sys
43
import yaml
54
import psutil
@@ -23,7 +22,6 @@ def main(args):
2322
run_name = args['--run']
2423
job = args['--job']
2524
jobspec = args['--jobspec']
26-
archive_base = args['--archive']
2725
owner = args['--owner']
2826
machine_type = args['--machine-type']
2927
preserve_queue = args['--preserve-queue']
@@ -35,42 +33,35 @@ def main(args):
3533

3634
if job:
3735
for job_id in job:
38-
kill_job(run_name, job_id, archive_base, owner)
36+
kill_job(
37+
run_name,
38+
job_id,
39+
owner
40+
)
3941
else:
40-
kill_run(run_name, archive_base, owner, machine_type,
41-
preserve_queue=preserve_queue)
42+
kill_run(
43+
run_name,
44+
owner,
45+
machine_type,
46+
preserve_queue=preserve_queue,
47+
)
4248

4349

44-
def kill_run(run_name, archive_base=None, owner=None, machine_type=None,
50+
def kill_run(run_name, owner=None, machine_type=None,
4551
preserve_queue=False):
46-
run_info = {}
47-
serializer = report.ResultsSerializer(archive_base)
48-
if archive_base:
49-
run_archive_dir = os.path.join(archive_base, run_name)
50-
if os.path.isdir(run_archive_dir):
51-
run_info = find_run_info(serializer, run_name)
52-
if 'machine_type' in run_info:
53-
machine_type = run_info['machine_type']
54-
owner = run_info['owner']
55-
else:
56-
log.warning("The run info does not have machine type: %s" % run_info)
57-
log.warning("Run archive used: %s" % run_archive_dir)
58-
log.info("Using machine type '%s' and owner '%s'" % (machine_type, owner))
59-
elif machine_type is None:
60-
# no jobs found in archive and no machine type specified,
61-
# so we try paddles to see if there is anything scheduled
62-
run_info = report.ResultsReporter().get_run(run_name)
63-
machine_type = run_info.get('machine_type', None)
64-
if machine_type:
65-
log.info(f"Using machine type '{machine_type}' received from paddles.")
66-
else:
67-
raise RuntimeError(f"Cannot find machine type for the run {run_name}; " +
68-
"you must also pass --machine-type")
52+
run_info = report.ResultsReporter().get_run(run_name)
53+
# run: machine_type, owner
54+
# job: pid, id
55+
machine_type = run_info.get('machine_type', None)
6956

7057
if not preserve_queue:
7158
remove_beanstalk_jobs(run_name, machine_type)
7259
remove_paddles_jobs(run_name)
73-
if kill_processes(run_name, run_info.get('pids')):
60+
pids = []
61+
for job in run_info['jobs']:
62+
if pid := job.get('pid'):
63+
pids.append(int(pid))
64+
if kill_processes(run_name, pids):
7465
return
7566
if owner is not None:
7667
targets = find_targets(run_name)
@@ -79,7 +70,7 @@ def kill_run(run_name, archive_base=None, owner=None, machine_type=None,
7970
report.try_mark_run_dead(run_name)
8071

8172

82-
def kill_job(run_name, job_id, archive_base=None, owner=None, skip_unlock=False):
73+
def kill_job(run_name, job_id, owner=None, skip_unlock=False):
8374
job_info = report.ResultsReporter().get_jobs(run_name, job_id)
8475
if not owner:
8576
if 'owner' not in job_info:
@@ -111,34 +102,6 @@ def kill_job(run_name, job_id, archive_base=None, owner=None, skip_unlock=False)
111102
lock_ops.unlock_safe(list(targets.keys()), owner, run_name, job_id)
112103

113104

114-
def find_run_info(serializer, run_name):
115-
log.info("Assembling run information...")
116-
run_info_fields = [
117-
'machine_type',
118-
'owner',
119-
]
120-
121-
pids = []
122-
run_info = {}
123-
job_info = {}
124-
job_num = 0
125-
jobs = serializer.jobs_for_run(run_name)
126-
job_total = len(jobs)
127-
for (job_id, job_dir) in jobs.items():
128-
if not os.path.isdir(job_dir):
129-
continue
130-
job_num += 1
131-
beanstalk.print_progress(job_num, job_total, 'Reading Job: ')
132-
job_info = serializer.job_info(run_name, job_id, simple=True)
133-
for key in job_info.keys():
134-
if key in run_info_fields and key not in run_info:
135-
run_info[key] = job_info[key]
136-
if 'pid' in job_info:
137-
pids.append(job_info['pid'])
138-
run_info['pids'] = pids
139-
return run_info
140-
141-
142105
def remove_paddles_jobs(run_name):
143106
jobs = report.ResultsReporter().get_jobs(run_name, fields=['status'])
144107
job_ids = [job['job_id'] for job in jobs if job['status'] == 'queued']
@@ -229,7 +192,7 @@ def kill_processes(run_name, pids=None, job_id=None):
229192
def process_matches_run(pid, run_name):
230193
try:
231194
p = psutil.Process(pid)
232-
cmd = p.cmdline()
195+
cmd = ' '.join(p.cmdline())
233196
if run_name in cmd and sys.argv[0] not in cmd:
234197
return True
235198
except psutil.NoSuchProcess:

0 commit comments

Comments
 (0)