diff --git a/lmdeploy/pytorch/backends/cuda/graph_runner.py b/lmdeploy/pytorch/backends/cuda/graph_runner.py
index 6e815e6e73..59396b7b1e 100644
--- a/lmdeploy/pytorch/backends/cuda/graph_runner.py
+++ b/lmdeploy/pytorch/backends/cuda/graph_runner.py
@@ -272,6 +272,11 @@ def prepare_inputs_for_generation(
     def reset(self):
         """Remove all graphs to prevent hanging on exit."""
         self._runner_map.clear()
+        # destroy deepep buffer
+        if get_moe_backend().use_deepep_moe_backend():
+            from dlblas.layers.moe.token_dispatcher import DeepEPBuffer
+            if hasattr(DeepEPBuffer, 'destroy'):
+                DeepEPBuffer.destroy()
 
     def update_inputs(self, inputs):
         """Update inputs."""
diff --git a/lmdeploy/pytorch/backends/cuda/moe/blocked_fp8.py b/lmdeploy/pytorch/backends/cuda/moe/blocked_fp8.py
index 00f1036b0d..8810f57a7e 100644
--- a/lmdeploy/pytorch/backends/cuda/moe/blocked_fp8.py
+++ b/lmdeploy/pytorch/backends/cuda/moe/blocked_fp8.py
@@ -117,6 +117,8 @@ def __init__(self,
         try:
             from dlblas.layers.moe.token_dispatcher import DeepEPBuffer, DeepEPMode, use_deepep  # noqa: F401
             get_moe_backend().set_deepep_moe_backend()
+            if hasattr(DeepEPBuffer, 'set_explicitly_destroy'):
+                DeepEPBuffer.set_explicitly_destroy()
         except ImportError:
             logger.warning('For higher performance, please install DeepEP https://github.com/deepseek-ai/DeepEP')
 
diff --git a/lmdeploy/pytorch/backends/cuda/moe/default.py b/lmdeploy/pytorch/backends/cuda/moe/default.py
index fd3971e14b..ade7a1d2b5 100644
--- a/lmdeploy/pytorch/backends/cuda/moe/default.py
+++ b/lmdeploy/pytorch/backends/cuda/moe/default.py
@@ -392,6 +392,8 @@ def __init__(
         try:
             from dlblas.layers.moe.token_dispatcher import DeepEPBuffer, DeepEPMode, use_deepep  # noqa: F401
             get_moe_backend().set_deepep_moe_backend()
+            if hasattr(DeepEPBuffer, 'set_explicitly_destroy'):
+                DeepEPBuffer.set_explicitly_destroy()
         except ImportError:
             logger.warning('For higher performance, please install DeepEP https://github.com/deepseek-ai/DeepEP')
 
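
Note: for context, here is a minimal self-contained sketch of the teardown contract these hunks rely on. The DeepEPBuffer class below is a hypothetical stand-in for dlblas.layers.moe.token_dispatcher.DeepEPBuffer; only the two method names come from the diff, and everything inside them is an illustrative assumption. Older dlblas releases may not expose these methods at all, which is why both call sites in the patch are guarded with hasattr().

    # Hypothetical stand-in for dlblas's DeepEPBuffer; the method names match
    # the diff, the bodies are assumptions made for illustration only.
    class DeepEPBuffer:
        _explicitly_destroy = False
        _buffer = None

        @classmethod
        def set_explicitly_destroy(cls):
            # Called once at MoE-impl init: opt out of implicit teardown,
            # promising that the caller (GraphRunner.reset) will destroy
            # the buffer itself.
            cls._explicitly_destroy = True

        @classmethod
        def destroy(cls):
            # Called from GraphRunner.reset(): release buffer/communicator
            # state eagerly so the process does not hang on exit.
            cls._buffer = None

    # Mirror of the patch's guarded calls: both degrade to no-ops on
    # dlblas versions that predate these methods.
    if hasattr(DeepEPBuffer, 'set_explicitly_destroy'):
        DeepEPBuffer.set_explicitly_destroy()

    if hasattr(DeepEPBuffer, 'destroy'):
        DeepEPBuffer.destroy()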