|
| 1 | +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. |
| 2 | +# |
| 3 | +# Licensed under the Apache License, Version 2.0 (the "License"); |
| 4 | +# you may not use this file except in compliance with the License. |
| 5 | +# You may obtain a copy of the License at |
| 6 | +# |
| 7 | +# http://www.apache.org/licenses/LICENSE-2.0 |
| 8 | +# |
| 9 | +# Unless required by applicable law or agreed to in writing, software |
| 10 | +# distributed under the License is distributed on an "AS IS" BASIS, |
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
| 12 | +# See the License for the specific language governing permissions and |
| 13 | +# limitations under the License. |
| 14 | +"""ESM2 server module: loads checkpoint state_dict for NVFlare FedAvg (no Megatron/Lightning init).""" |
| 15 | + |
| 16 | +import os |
| 17 | +import warnings |
| 18 | +from collections import OrderedDict |
| 19 | +from typing import List, NamedTuple, Optional |
| 20 | + |
| 21 | + |
class _IncompatibleKeys(NamedTuple):
    """Compatible with PyTorch's load_state_dict return type (missing_keys, unexpected_keys)."""

    # Keys the module expected but that were absent from the provided state_dict.
    missing_keys: List[str]
    # Keys present in the provided state_dict that the module did not use.
    unexpected_keys: List[str]
| 27 | + |
| 28 | + |
| 29 | +import torch |
| 30 | + |
| 31 | +from nvflare.fuel.utils.network_utils import get_open_ports |
| 32 | + |
| 33 | + |
| 34 | +def _checkpoint_key_to_client(k: str) -> str: |
| 35 | + for old, new in ( |
| 36 | + ("encoder.layers.self_attention.", "encoder.layers.0.self_attention."), |
| 37 | + ("encoder.layers.mlp.", "encoder.layers.0.mlp."), |
| 38 | + ): |
| 39 | + if old in k: |
| 40 | + k = k.replace(old, new, 1) |
| 41 | + return k |
| 42 | + |
| 43 | + |
| 44 | +def _expand_checkpoint_state_dict(sd: OrderedDict) -> OrderedDict: |
| 45 | + """Split layer-stacked tensors [n, ...] into per-layer keys (layers.0.*, layers.1.*, ...).""" |
| 46 | + out = OrderedDict() |
| 47 | + for k, v in sd.items(): |
| 48 | + if not isinstance(v, torch.Tensor): |
| 49 | + out[k] = v |
| 50 | + continue |
| 51 | + # Keys that are layer-stacked: encoder.layers.self_attention.* or encoder.layers.mlp.* |
| 52 | + if "encoder.layers.self_attention." not in k and "encoder.layers.mlp." not in k: |
| 53 | + out[_checkpoint_key_to_client(k)] = v |
| 54 | + continue |
| 55 | + if v.ndim < 1: |
| 56 | + out[_checkpoint_key_to_client(k)] = v |
| 57 | + continue |
| 58 | + num_layers = v.shape[0] |
| 59 | + # Split into per-layer keys |
| 60 | + if "encoder.layers.self_attention." in k: |
| 61 | + base = k.replace("encoder.layers.self_attention.", "encoder.layers.{}.self_attention.", 1) |
| 62 | + else: |
| 63 | + base = k.replace("encoder.layers.mlp.", "encoder.layers.{}.mlp.", 1) |
| 64 | + for i in range(num_layers): |
| 65 | + out[base.format(i)] = v[i].clone() |
| 66 | + return out |
| 67 | + |
| 68 | + |
class ESM2ModuleForServer(torch.nn.Module):
    """Holds state_dict loaded from checkpoint; BioNeMoParamsFilter adds prefix when sending to client."""

    def __init__(self, checkpoint_path: str, **kwargs):
        """Load and expand the checkpoint at *checkpoint_path*.

        Extra keyword arguments are accepted (and ignored) for configuration
        compatibility. Raises FileNotFoundError for a missing path and
        ValueError when the checkpoint cannot be read.
        """
        super().__init__()
        resolved = os.path.abspath(checkpoint_path)
        if not (os.path.isfile(resolved) or os.path.isdir(resolved)):
            raise FileNotFoundError(f"Checkpoint path does not exist or is not a file/directory: {checkpoint_path!r}")
        sd = load_state_dict_from_checkpoint_path(checkpoint_path)
        if sd is None:
            raise ValueError(
                f"Checkpoint is missing or invalid (could not load state dict from {checkpoint_path!r}). "
                "Ensure the path points to a valid NeMo or PyTorch checkpoint."
            )
        # Stacked checkpoint tensors are split into per-layer client keys.
        self._state_dict = _expand_checkpoint_state_dict(sd)

    @staticmethod
    def _stored_key(k: str) -> str:
        """Collapse a leading double ``module.module.`` prefix to ``module.``."""
        doubled = "module.module."
        if k.startswith(doubled):
            return k[len("module."):]
        return k

    def state_dict(self, *args, **kwargs):
        """Return a shallow copy of the held weights (args/kwargs accepted for API compat)."""
        snapshot = OrderedDict()
        snapshot.update(self._state_dict)
        return snapshot

    def load_state_dict(self, state_dict, strict: bool = True):
        """Replace the held weights with *state_dict*, normalizing key prefixes.

        ``strict`` is accepted for signature compatibility; nothing is ever
        reported missing or unexpected.
        """
        incoming = OrderedDict()
        for key, value in state_dict.items():
            incoming[self._stored_key(key)] = value
        self._state_dict = incoming
        return _IncompatibleKeys(missing_keys=[], unexpected_keys=[])
| 97 | + |
| 98 | + |
| 99 | +def _flatten_state_dict(d: dict, prefix: str = "") -> OrderedDict: |
| 100 | + out = OrderedDict() |
| 101 | + for k, v in d.items(): |
| 102 | + key = f"{prefix}.{k}" if prefix else k |
| 103 | + if isinstance(v, torch.Tensor): |
| 104 | + out[key] = v |
| 105 | + elif isinstance(v, (dict, OrderedDict)): |
| 106 | + out.update(_flatten_state_dict(v, key)) |
| 107 | + return out |
| 108 | + |
| 109 | + |
| 110 | +def _extract_state_dict(loaded: dict) -> Optional[OrderedDict]: |
| 111 | + d = loaded |
| 112 | + for key in ("model", "state_dict", "weights", "checkpoint"): |
| 113 | + if key in d and isinstance(d[key], (dict, OrderedDict)): |
| 114 | + d = d[key] |
| 115 | + break |
| 116 | + if d is None or not d: |
| 117 | + return None |
| 118 | + if all(isinstance(v, torch.Tensor) for v in d.values()): |
| 119 | + return OrderedDict(d) |
| 120 | + flat = _flatten_state_dict(d) |
| 121 | + if flat is None or not flat: |
| 122 | + return None |
| 123 | + if all(isinstance(v, torch.Tensor) for v in flat.values()): |
| 124 | + return flat |
| 125 | + return None |
| 126 | + |
| 127 | + |
| 128 | +def _load_nemo_distributed_checkpoint(path: str) -> Optional[OrderedDict]: |
| 129 | + weights_dir = os.path.join(path, "weights") |
| 130 | + if not os.path.isdir(weights_dir): |
| 131 | + return None |
| 132 | + files = os.listdir(weights_dir) |
| 133 | + if "metadata.json" not in files or not any(f.endswith(".distcp") for f in files): |
| 134 | + return None |
| 135 | + try: |
| 136 | + from megatron.core.dist_checkpointing.serialization import load_plain_tensors |
| 137 | + except ImportError: |
| 138 | + try: |
| 139 | + from megatron.core import dist_checkpointing as dist_ckpt |
| 140 | + |
| 141 | + load_plain_tensors = getattr(dist_ckpt, "load_plain_tensors", None) |
| 142 | + except ImportError: |
| 143 | + load_plain_tensors = None |
| 144 | + if load_plain_tensors is None: |
| 145 | + return None |
| 146 | + we_initialized = not torch.distributed.is_initialized() |
| 147 | + if we_initialized: |
| 148 | + os.environ.setdefault("MASTER_ADDR", "localhost") |
| 149 | + os.environ.setdefault("MASTER_PORT", str(get_open_ports(1)[0])) |
| 150 | + torch.distributed.init_process_group(backend="gloo", rank=0, world_size=1) |
| 151 | + try: |
| 152 | + ckpt_dir = os.path.abspath(weights_dir) |
| 153 | + loaded_sd = load_plain_tensors(ckpt_dir) |
| 154 | + if not isinstance(loaded_sd, dict): |
| 155 | + return None |
| 156 | + out = OrderedDict((k, v.cpu() if v.is_cuda else v) for k, v in loaded_sd.items() if isinstance(v, torch.Tensor)) |
| 157 | + return out if out else None |
| 158 | + except Exception as e: |
| 159 | + warnings.warn(f"NeMo distributed checkpoint load failed: {e}", UserWarning, stacklevel=2) |
| 160 | + return None |
| 161 | + finally: |
| 162 | + if we_initialized and torch.distributed.is_initialized(): |
| 163 | + torch.distributed.destroy_process_group() |
| 164 | + |
| 165 | + |
def load_state_dict_from_checkpoint_path(checkpoint_path: str) -> Optional[OrderedDict]:
    """Load a state dict from a NeMo/Megatron checkpoint file or directory.

    Handles single-file checkpoints and NeMo distributed checkpoint
    directories, falling back to ``weights/common.pt`` inside a directory.

    .. note::
        Loading uses ``torch.load(..., weights_only=False)``, i.e. Python's
        pickle module, which can execute arbitrary code during
        deserialization. Only load checkpoints from trusted sources.

    Returns:
        An OrderedDict of tensors, or None when nothing loadable is found.
    """
    resolved = os.path.abspath(checkpoint_path)
    loaded = None
    if os.path.isfile(resolved):
        try:
            loaded = torch.load(resolved, map_location="cpu", weights_only=False)
        except Exception:
            return None
    elif os.path.isdir(resolved):
        # Prefer the distributed-checkpoint layout when present.
        dist_result = _load_nemo_distributed_checkpoint(resolved)
        if dist_result is not None:
            return dist_result
        common_pt = os.path.join(resolved, "weights", "common.pt")
        if os.path.isfile(common_pt):
            try:
                loaded = torch.load(common_pt, map_location="cpu", weights_only=False)
            except Exception:
                pass  # fall through: loaded stays None and we return None below
    # Covers both "nothing loaded" and "loaded object is not a dict".
    if not isinstance(loaded, dict):
        return None
    return _extract_state_dict(loaded)