Skip to content

Commit b269279

Browse files
committed
fix: database status error that occurred due to a running error
1 parent 7115df1 commit b269279

File tree

2 files changed

+65
-31
lines changed

2 files changed

+65
-31
lines changed

notification/email_notification.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
import traceback
2+
13
from django.core.mail import send_mail
24

35
from gpu_tasker.settings import EMAIL_NOTIFICATION
@@ -56,8 +58,13 @@
5658

5759
def send_email(address, title, content):
5860
if EMAIL_NOTIFICATION:
59-
from gpu_tasker.settings import DEFAULT_FROM_EMAIL
60-
send_mail(title, content, DEFAULT_FROM_EMAIL, [address], fail_silently=False)
61+
try:
62+
from gpu_tasker.settings import DEFAULT_FROM_EMAIL
63+
send_mail(title, content, DEFAULT_FROM_EMAIL, [address], fail_silently=False)
64+
except Exception:
65+
es = traceback.format_exc()
66+
print('Send email fail')
67+
print(es)
6168

6269

6370
def check_email_config(func):

task/utils.py

Lines changed: 56 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import subprocess
44
import json
55
import time
6+
import traceback
67

78
from gpu_tasker.settings import RUNNING_LOG_DIR
89
from .models import GPUTask, GPUTaskRunningLog
@@ -58,41 +59,67 @@ def run_task(task, available_server):
5859
RUNNING_LOG_DIR,
5960
'{:d}_{:s}_{:s}_{:d}_{:d}.log'.format(task.id, task.name, server.ip, index, int(time.time()))
6061
)
61-
process = RemoteGPUProcess(
62-
task.user.config.server_username,
63-
server.ip,
64-
gpus,
65-
task.cmd,
66-
task.workspace,
67-
task.user.config.server_private_key_path,
68-
log_file_path
69-
)
70-
pid = process.pid()
71-
print('Task {:d}-{:s} is running, pid: {:d}'.format(task.id, task.name, pid))
72-
server.set_gpus_busy(gpus)
73-
server.save()
62+
# create running_log
7463
running_log = GPUTaskRunningLog(
7564
index=index,
7665
task=task,
7766
server=server,
78-
pid=pid,
67+
pid=-1,
7968
gpus=','.join(map(str, gpus)),
8069
log_file_path=log_file_path,
8170
status=1
8271
)
8372
running_log.save()
84-
task.status = 1
85-
task.save()
86-
send_task_start_email(running_log)
87-
return_code = process.get_return_code()
88-
print('Task {:d}-{:s} stopped, return_code: {:d}'.format(task.id, task.name, return_code))
89-
if return_code == 0:
90-
send_task_finish_email(running_log)
91-
else:
92-
send_task_fail_email(running_log)
93-
server.set_gpus_free(gpus)
94-
server.save()
95-
running_log.status = 2 if return_code == 0 else -1
96-
running_log.save()
97-
task.status = 2 if return_code == 0 else -1
98-
task.save()
73+
try:
74+
# run process
75+
process = RemoteGPUProcess(
76+
task.user.config.server_username,
77+
server.ip,
78+
gpus,
79+
task.cmd,
80+
task.workspace,
81+
task.user.config.server_private_key_path,
82+
log_file_path
83+
)
84+
pid = process.pid()
85+
print('Task {:d}-{:s} is running, pid: {:d}'.format(task.id, task.name, pid))
86+
87+
# save process status
88+
running_log.pid = pid
89+
running_log.save()
90+
server.set_gpus_busy(gpus)
91+
server.save()
92+
task.status = 1
93+
task.save()
94+
95+
# send email
96+
send_task_start_email(running_log)
97+
98+
# wait for return
99+
return_code = process.get_return_code()
100+
print('Task {:d}-{:s} stopped, return_code: {:d}'.format(task.id, task.name, return_code))
101+
102+
# save process status
103+
running_log.status = 2 if return_code == 0 else -1
104+
running_log.save()
105+
task.status = 2 if return_code == 0 else -1
106+
task.save()
107+
108+
# send email
109+
if return_code == 0:
110+
send_task_finish_email(running_log)
111+
else:
112+
send_task_fail_email(running_log)
113+
except Exception:
114+
es = traceback.format_exc()
115+
print(es)
116+
running_log.status = -1
117+
running_log.save()
118+
task.status = -1
119+
task.save()
120+
with open(log_file_path, 'a') as f:
121+
f.write('\n')
122+
f.write(es)
123+
finally:
124+
server.set_gpus_free(gpus)
125+
server.save()

0 commit comments

Comments
 (0)