Skip to content

Commit 9a20e23

Browse files
committed
fix - lightweight fix for EPLB causing a rapid increase in memory
1 parent 9d8a767 commit 9a20e23

File tree

1 file changed

+7
-3
lines changed

1 file changed

+7
-3
lines changed

rtp_llm/eplb/ep_balancer.py

Lines changed: 7 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,5 @@
11
import datetime
2+
import gc
23
import json
34
import logging
45
import random
@@ -19,8 +20,8 @@
1920
ModelDeployWeightInfo,
2021
ModelWeightInfo,
2122
)
22-
from rtp_llm.utils.database import BaseDatabase
2323
from rtp_llm.model_loader.tensor_source import DatabaseTensorSource
24+
from rtp_llm.utils.database import BaseDatabase
2425
from rtp_llm.utils.model_weight import W
2526

2627

@@ -189,7 +190,7 @@ def create_balance_plan(self, log_stats: torch.Tensor, gpu_loads: torch.Tensor):
189190
log2phy_pad[:, :k] = log2phy[0]
190191

191192
logging.info(f"[EPLB_py PLAN] phy2log for layer {layer_id}: {phy2log[0]}")
192-
193+
gc.collect()
193194
dtype = torch.int32
194195
return (
195196
torch.tensor([layer_id], dtype=torch.int32).contiguous(),
@@ -218,7 +219,9 @@ def load_moe_weight(
218219
f"[EPLB_py][RANK {self._load_config.ep_rank}] Load MOE weight layer {layer_id} for {choose_expert_id}"
219220
)
220221
try:
221-
res = moe_weight.load(DatabaseTensorSource(self.database), layer_id, "cpu", self._load_config)
222+
res = moe_weight.load(
223+
DatabaseTensorSource(self.database), layer_id, "cpu", self._load_config
224+
)
222225
except:
223226
logging.error(
224227
f"[EPLB_py][RANK {self._load_config.ep_rank}] Load MOE weight layer failed: 完整堆栈:\n{traceback.format_exc()}"
@@ -227,6 +230,7 @@ def load_moe_weight(
227230
logging.info(
228231
f"[EPLB_py][RANK {self._load_config.ep_rank}] Load MOE weight layer {layer_id} done"
229232
)
233+
gc.collect()
230234
return (
231235
layer_id,
232236
res.get(W.moe_w1),

0 commit comments

Comments
 (0)