Skip to content

Commit d75a3f6

Browse files
author
yyhuni
committed
fix(task_distributor): adjust high load wait parameters and improve timeout handling
- Increase high load wait interval from 60 to 120 seconds (2 minutes)
- Increase max retries from 10 to 60 to support up to 2 hours total wait time
- Improve timeout message to show actual wait duration in minutes
- Remove duplicate return statement in worker selection logic
- Update notification message to reflect new wait parameters (2 minutes check interval, 2 hours max wait)
- Clean up trailing whitespace in task_distributor.py
- Remove redundant error message from install.sh about missing/incorrect image versions
- Better handling of high load scenarios with clearer logging and user communication
1 parent 59e48e5 commit d75a3f6

File tree

3 files changed

+17
-18
lines changed

3 files changed

+17
-18
lines changed

backend/apps/engine/services/task_distributor.py

Lines changed: 16 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -156,10 +156,10 @@ def select_best_worker(self) -> Optional[WorkerNode]:
156156
# 降级策略:如果没有正常负载的,循环等待后重新检测
157157
if not scored_workers:
158158
if high_load_workers:
159-
# 高负载等待参数(默认每 60 秒检测一次,最多 10 次
160-
high_load_wait = getattr(settings, 'HIGH_LOAD_WAIT_SECONDS', 60)
161-
high_load_max_retries = getattr(settings, 'HIGH_LOAD_MAX_RETRIES', 10)
162-
159+
# 高负载等待参数(每 2 分钟检测一次,最多等待 2 小时
160+
high_load_wait = getattr(settings, 'HIGH_LOAD_WAIT_SECONDS', 120)
161+
high_load_max_retries = getattr(settings, 'HIGH_LOAD_MAX_RETRIES', 60)
162+
163163
# 开始等待前发送高负载通知
164164
high_load_workers.sort(key=lambda x: x[1])
165165
_, _, first_cpu, first_mem = high_load_workers[0]
@@ -170,51 +170,51 @@ def select_best_worker(self) -> Optional[WorkerNode]:
170170
cpu=first_cpu,
171171
mem=first_mem
172172
)
173-
173+
174174
for retry in range(high_load_max_retries):
175175
logger.warning(
176176
"所有 Worker 高负载,等待 %d 秒后重试... (%d/%d)",
177177
high_load_wait, retry + 1, high_load_max_retries
178178
)
179179
time.sleep(high_load_wait)
180-
180+
181181
# 重新获取负载数据
182182
loads = worker_load_service.get_all_loads(worker_ids)
183-
183+
184184
# 重新评估
185185
scored_workers = []
186186
high_load_workers = []
187-
187+
188188
for worker in workers:
189189
load = loads.get(worker.id)
190190
if not load:
191191
continue
192-
192+
193193
cpu = load.get('cpu', 0)
194194
mem = load.get('mem', 0)
195195
score = cpu * 0.7 + mem * 0.3
196-
196+
197197
if cpu > 85 or mem > 85:
198198
high_load_workers.append((worker, score, cpu, mem))
199199
else:
200200
scored_workers.append((worker, score, cpu, mem))
201-
201+
202202
# 如果有正常负载的 Worker,跳出循环
203203
if scored_workers:
204204
logger.info("检测到正常负载 Worker,结束等待")
205205
break
206-
207-
# 超时或仍然高负载,选择负载最低的
206+
207+
# 超时后强制派发到负载最低的 Worker
208208
if not scored_workers and high_load_workers:
209209
high_load_workers.sort(key=lambda x: x[1])
210210
best_worker, _, cpu, mem = high_load_workers[0]
211-
211+
212212
logger.warning(
213-
"等待超时,强制分发到高负载 Worker: %s (CPU: %.1f%%, MEM: %.1f%%)",
213+
"等待 %d 分钟后仍高负载,强制分发到 Worker: %s (CPU: %.1f%%, MEM: %.1f%%)",
214+
(high_load_wait * high_load_max_retries) // 60,
214215
best_worker.name, cpu, mem
215216
)
216217
return best_worker
217-
return best_worker
218218
else:
219219
logger.warning("没有可用的 Worker")
220220
return None

backend/apps/scan/notifications/receivers.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,7 @@ def on_all_workers_high_load(sender, worker_name, cpu, mem, **kwargs):
8787
"""所有 Worker 高负载时的通知处理"""
8888
create_notification(
8989
title="系统负载较高",
90-
message=f"所有节点负载较高(最低负载节点 CPU: {cpu:.1f}%, 内存: {mem:.1f}%),系统将等待最多 10 分钟后分发任务,扫描速度可能受影响",
90+
message=f"所有节点负载较高(最低负载节点 CPU: {cpu:.1f}%, 内存: {mem:.1f}%),系统将每 2 分钟检测一次,最多等待 2 小时后分发任务",
9191
level=NotificationLevel.MEDIUM,
9292
category=NotificationCategory.SYSTEM
9393
)

install.sh

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -703,7 +703,6 @@ else
703703
warn "镜像加速已配置,但拉取仍然失败,可能原因:"
704704
echo -e " 1. 镜像源暂时不可用,请稍后重试"
705705
echo -e " 2. 网络连接问题"
706-
echo -e " 3. 镜像不存在或版本错误"
707706
fi
708707
echo
709708
exit 1

0 commit comments

Comments
 (0)