|
3 | 3 | import subprocess |
4 | 4 | import json |
5 | 5 | import time |
| 6 | +import traceback |
6 | 7 |
|
7 | 8 | from gpu_tasker.settings import RUNNING_LOG_DIR |
8 | 9 | from .models import GPUTask, GPUTaskRunningLog |
@@ -58,41 +59,67 @@ def run_task(task, available_server): |
58 | 59 | RUNNING_LOG_DIR, |
59 | 60 | '{:d}_{:s}_{:s}_{:d}_{:d}.log'.format(task.id, task.name, server.ip, index, int(time.time())) |
60 | 61 | ) |
61 | | - process = RemoteGPUProcess( |
62 | | - task.user.config.server_username, |
63 | | - server.ip, |
64 | | - gpus, |
65 | | - task.cmd, |
66 | | - task.workspace, |
67 | | - task.user.config.server_private_key_path, |
68 | | - log_file_path |
69 | | - ) |
70 | | - pid = process.pid() |
71 | | - print('Task {:d}-{:s} is running, pid: {:d}'.format(task.id, task.name, pid)) |
72 | | - server.set_gpus_busy(gpus) |
73 | | - server.save() |
| 62 | + # create running_log |
74 | 63 | running_log = GPUTaskRunningLog( |
75 | 64 | index=index, |
76 | 65 | task=task, |
77 | 66 | server=server, |
78 | | - pid=pid, |
| 67 | + pid=-1, |
79 | 68 | gpus=','.join(map(str, gpus)), |
80 | 69 | log_file_path=log_file_path, |
81 | 70 | status=1 |
82 | 71 | ) |
83 | 72 | running_log.save() |
84 | | - task.status = 1 |
85 | | - task.save() |
86 | | - send_task_start_email(running_log) |
87 | | - return_code = process.get_return_code() |
88 | | - print('Task {:d}-{:s} stopped, return_code: {:d}'.format(task.id, task.name, return_code)) |
89 | | - if return_code == 0: |
90 | | - send_task_finish_email(running_log) |
91 | | - else: |
92 | | - send_task_fail_email(running_log) |
93 | | - server.set_gpus_free(gpus) |
94 | | - server.save() |
95 | | - running_log.status = 2 if return_code == 0 else -1 |
96 | | - running_log.save() |
97 | | - task.status = 2 if return_code == 0 else -1 |
98 | | - task.save() |
| 73 | + try: |
| 74 | + # run process |
| 75 | + process = RemoteGPUProcess( |
| 76 | + task.user.config.server_username, |
| 77 | + server.ip, |
| 78 | + gpus, |
| 79 | + task.cmd, |
| 80 | + task.workspace, |
| 81 | + task.user.config.server_private_key_path, |
| 82 | + log_file_path |
| 83 | + ) |
| 84 | + pid = process.pid() |
| 85 | + print('Task {:d}-{:s} is running, pid: {:d}'.format(task.id, task.name, pid)) |
| 86 | + |
| 87 | + # save process status |
| 88 | + running_log.pid = pid |
| 89 | + running_log.save() |
| 90 | + server.set_gpus_busy(gpus) |
| 91 | + server.save() |
| 92 | + task.status = 1 |
| 93 | + task.save() |
| 94 | + |
| 95 | + # send email |
| 96 | + send_task_start_email(running_log) |
| 97 | + |
| 98 | + # wait for return |
| 99 | + return_code = process.get_return_code() |
| 100 | + print('Task {:d}-{:s} stopped, return_code: {:d}'.format(task.id, task.name, return_code)) |
| 101 | + |
| 102 | + # save process status |
| 103 | + running_log.status = 2 if return_code == 0 else -1 |
| 104 | + running_log.save() |
| 105 | + task.status = 2 if return_code == 0 else -1 |
| 106 | + task.save() |
| 107 | + |
| 108 | + # send email |
| 109 | + if return_code == 0: |
| 110 | + send_task_finish_email(running_log) |
| 111 | + else: |
| 112 | + send_task_fail_email(running_log) |
| 113 | + except Exception: |
| 114 | + es = traceback.format_exc() |
| 115 | + print(es) |
| 116 | + running_log.status = -1 |
| 117 | + running_log.save() |
| 118 | + task.status = -1 |
| 119 | + task.save() |
| 120 | + with open(log_file_path, 'a') as f: |
| 121 | + f.write('\n') |
| 122 | + f.write(es) |
| 123 | + finally: |
| 124 | + server.set_gpus_free(gpus) |
| 125 | + server.save() |
0 commit comments