Skip to content

Commit 355d399

Browse files
authored
Merge pull request #2125 from ceph/kill-supervisor
kill: Look for, and kill, the supervisor process
2 parents f3d1e60 + ff615aa commit 355d399

File tree

2 files changed

+36
-11
lines changed

2 files changed

+36
-11
lines changed

teuthology/dispatcher/supervisor.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,12 @@ def main(args):
4242
except SkipJob:
4343
return 0
4444

45+
report.try_push_job_info({
46+
'name': job_config['name'],
47+
'job_id': job_config['job_id'],
48+
'pid': os.getpid(),
49+
})
50+
4551
# reimage target machines before running the job
4652
if 'targets' in job_config:
4753
node_count = len(job_config["targets"])

teuthology/kill.py

Lines changed: 30 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -80,20 +80,25 @@ def kill_run(run_name, archive_base=None, owner=None, machine_type=None,
8080

8181

8282
def kill_job(run_name, job_id, archive_base=None, owner=None, skip_unlock=False):
83-
serializer = report.ResultsSerializer(archive_base)
84-
job_info = serializer.job_info(run_name, job_id)
85-
# If we can't read the filesystem, job_info will be nearly empty. Ask paddles:
86-
if 'name' not in job_info:
87-
job_info = report.ResultsReporter().get_jobs(run_name, job_id)
83+
job_info = report.ResultsReporter().get_jobs(run_name, job_id)
8884
if not owner:
8985
if 'owner' not in job_info:
9086
raise RuntimeError(
9187
"I could not figure out the owner of the requested job. "
9288
"Please pass --owner <owner>.")
9389
owner = job_info['owner']
94-
if kill_processes(run_name, [job_info.get('pid')]):
90+
if kill_processes(run_name, [int(job_info.get('pid'))], job_info.get('job_id')):
9591
return
96-
report.try_push_job_info(job_info, dict(status="dead"))
92+
report.try_push_job_info(
93+
{
94+
'name': run_name,
95+
'job_id': job_id,
96+
},
97+
{
98+
'status': 'dead',
99+
'failure_reason': 'killed',
100+
}
101+
)
97102
if 'machine_type' in job_info:
98103
teuthology.exporter.JobResults().record(
99104
machine_type=job_info["machine_type"],
@@ -177,17 +182,23 @@ def remove_beanstalk_jobs(run_name, tube_name):
177182
beanstalk_conn.close()
178183

179184

180-
def kill_processes(run_name, pids=None):
185+
def kill_processes(run_name, pids=None, job_id=None):
181186
if pids:
182187
to_kill = set(pids).intersection(psutil.pids())
183188
else:
184189
to_kill = find_pids(run_name)
185190

186191
pids_need_sudo = set()
187192
for pid in set(to_kill):
188-
if not process_matches_run(pid, run_name):
189-
to_kill.remove(pid)
190-
elif psutil.Process(int(pid)).username() != getpass.getuser():
193+
if job_id:
194+
if not process_matches_job(pid, run_name, job_id):
195+
to_kill.remove(pid)
196+
continue
197+
else:
198+
if not process_matches_run(pid, run_name):
199+
to_kill.remove(pid)
200+
continue
201+
if psutil.Process(int(pid)).username() != getpass.getuser():
191202
pids_need_sudo.add(pid)
192203

193204
survivors = []
@@ -227,6 +238,14 @@ def process_matches_run(pid, run_name):
227238
pass
228239
return False
229240

241+
def process_matches_job(pid, run_name, job_id):
    """Return True if process ``pid`` appears to belong to the given job.

    The process's command line (arguments joined with single spaces) is
    searched for ``<run_name>/<job_id>``. The job id must not be immediately
    followed by another digit, so that job ``12`` does not match a process
    whose command line mentions job ``123`` of the same run.

    :param pid:      PID of the process to inspect (passed to ``psutil.Process``).
    :param run_name: Name of the run the job belongs to.
    :param job_id:   ID of the job; it is interpolated with ``str()`` semantics.
    :returns:        True if the command line matches; False if it does not,
                     or if the process no longer exists or cannot be inspected.
    """
    import re

    # Escape the literal text and forbid a trailing digit to avoid
    # prefix collisions between job ids (e.g. 12 vs. 123).
    job_pattern = re.compile(re.escape(f"{run_name}/{job_id}") + r"(?!\d)")
    try:
        cmdline = ' '.join(psutil.Process(pid).cmdline())
    except (psutil.NoSuchProcess, psutil.AccessDenied):
        # The process vanished or is not ours to read: treat as "no match".
        return False
    return job_pattern.search(cmdline) is not None
230249

231250
def find_pids(run_name):
232251
run_pids = []

0 commit comments

Comments
 (0)