From 5db90c7f9b4f6a67a9b1c349dda372b15baeec17 Mon Sep 17 00:00:00 2001 From: Jiani Wang Date: Fri, 15 Aug 2025 17:35:44 -0700 Subject: [PATCH 1/2] free memory --- torchtitan/models/deepseek_v3/model/state_dict_adapter.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/torchtitan/models/deepseek_v3/model/state_dict_adapter.py b/torchtitan/models/deepseek_v3/model/state_dict_adapter.py index 5a676b5a0..787bfe4f6 100644 --- a/torchtitan/models/deepseek_v3/model/state_dict_adapter.py +++ b/torchtitan/models/deepseek_v3/model/state_dict_adapter.py @@ -158,6 +158,9 @@ def to_hf(self, state_dict: dict[str, Any]) -> dict[str, Any]: new_key = new_abstract_key.format(layer_num, expert_num) hf_state_dict[new_key] = split_values[expert_num].squeeze() + # Remove the GroupedExperts' weight from the state_dict to free memory + del value + elif "layers" in key: abstract_key = re.sub(r"(\d+)", "{}", key, count=1) layer_num = re.search(r"\d+", key).group(0) From 672f73e1ed585f4e1e4af95e7948865c13299317 Mon Sep 17 00:00:00 2001 From: Jiani Wang Date: Sat, 16 Aug 2025 15:40:34 -0700 Subject: [PATCH 2/2] add README --- torchtitan/models/deepseek_v3/README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/torchtitan/models/deepseek_v3/README.md b/torchtitan/models/deepseek_v3/README.md index 085403d47..92dd7bab7 100644 --- a/torchtitan/models/deepseek_v3/README.md +++ b/torchtitan/models/deepseek_v3/README.md @@ -61,6 +61,7 @@ python scripts/checkpoint_conversion/convert_from_hf.py