Skip to content

Commit 2dc5edf

Browse files
Merge branch 'main' into cherry-pick-4150
2 parents cf0a5a0 + 3879ce6 commit 2dc5edf

File tree

14 files changed

+923
-52
lines changed

14 files changed

+923
-52
lines changed

AGENTS.md

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
# NVFlare Agent Notes
2+
3+
- To trigger CI/CD from a PR review thread, post a single-line comment exactly: `/build`.
4+
- Prefer `rg` and `rg --files` for fast codebase search.
5+
- Keep edits scoped to the task; do not modify unrelated files in a dirty worktree.
6+
- Start with targeted tests for changed files, then run broader checks as needed.
7+
8+
## Fast Commands
9+
10+
- `./runtest.sh` runs license/style/tests with coverage.
11+
- `./runtest.sh -s` runs style checks (flake8, black, isort).
12+
- `./runtest.sh -f` auto-fixes style where possible.
13+
- `./runtest.sh -u` runs unit tests.
14+
- `python3 -m pytest tests/unit_test/path/to/test_file.py -v` runs one test file.
15+
- `python3 -m pytest --numprocesses=8 -v tests/unit_test` runs unit tests in parallel.
16+
- `./build_doc.sh --html` builds docs.
17+
- `./build_doc.sh --clean` cleans docs build artifacts.
18+
19+
## Style and Testing Conventions
20+
21+
- Format/lint stack: black (line length 120), flake8, isort (black profile).
22+
- Python support targets: 3.9, 3.10, 3.11, 3.12.
23+
- Add the standard NVIDIA Apache-2.0 license header to new Python source files.
24+
- Unit tests live in `tests/unit_test/`; integration tests live in `tests/integration_test/`.
25+
- Test file names follow `[module_name]_test.py`.
26+
27+
## Quick Package Map
28+
29+
- `nvflare/apis/`: core interfaces (Controller, Executor, Task, Shareable, FLContext).
30+
- `nvflare/app_common/`: common algorithms and utilities.
31+
- `nvflare/app_opt/`: optional integrations/dependencies.
32+
- `nvflare/client/`: client-side APIs.
33+
- `nvflare/job_config/`: FedJob/job configuration.
34+
- `nvflare/private/`: internal implementations.
35+
- `nvflare/fuel/`: shared infrastructure utilities.

examples/advanced/bionemo/downstream/client.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,17 +22,23 @@
2222
from pathlib import Path
2323
from typing import Optional
2424

25+
from nvflare.fuel.utils.network_utils import get_open_ports
26+
2527
# Set NumExpr thread limits before importing numerical libraries to avoid thread conflicts
2628
os.environ.setdefault("NUMEXPR_MAX_THREADS", "64")
2729
os.environ.setdefault("NUMEXPR_NUM_THREADS", "8")
2830

31+
# Use an available port for PyTorch distributed to avoid EADDRINUSE and PID collisions.
32+
if "MASTER_PORT" not in os.environ:
33+
os.environ["MASTER_PORT"] = str(get_open_ports(1)[0])
34+
if "MASTER_ADDR" not in os.environ:
35+
os.environ.setdefault("MASTER_ADDR", "localhost")
36+
2937
from bionemo.core.utils.dtypes import PrecisionTypes, get_autocast_dtype
3038
from bionemo.esm2.data.tokenizer import get_tokenizer
3139
from bionemo.esm2.model.finetune.datamodule import ESM2FineTuneDataModule
3240
from bionemo.esm2.model.finetune.dataset import InMemoryProteinDataset, InMemorySingleValueDataset
3341
from bionemo.esm2.model.finetune.sequence_model import ESM2FineTuneSeqConfig
34-
35-
# Reuse parser and config constants from bionemo
3642
from bionemo.esm2.scripts.finetune_esm2 import get_parser
3743
from bionemo.llm.model.biobert.lightning import biobert_lightning_module
3844
from bionemo.llm.model.biobert.model import BioBertConfig
@@ -413,6 +419,8 @@ def train_model(
413419
)
414420

415421
# perform local training starting with the received global model
422+
# Set MASTER_PORT so the training subprocess (spawned by Lightning) inherits an available port.
423+
os.environ["MASTER_PORT"] = str(get_open_ports(1)[0])
416424
llm.train(
417425
model=module,
418426
data=data_module,

examples/advanced/bionemo/downstream/downstream_nvflare.ipynb

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,22 @@
5858
"warnings.simplefilter(\"ignore\")"
5959
]
6060
},
61+
{
62+
"cell_type": "markdown",
63+
"metadata": {},
64+
"source": [
65+
"Copy the model.py file to each task's subfolder for execution."
66+
]
67+
},
68+
{
69+
"cell_type": "code",
70+
"execution_count": null,
71+
"metadata": {},
72+
"outputs": [],
73+
"source": [
74+
"! for d in tap sabdab scl; do cp model.py \"$d/\"; done"
75+
]
76+
},
6177
{
6278
"cell_type": "markdown",
6379
"metadata": {},
Lines changed: 197 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,197 @@
1+
# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
"""ESM2 server module: loads checkpoint state_dict for NVFlare FedAvg (no Megatron/Lightning init)."""
15+
16+
import os
17+
import warnings
18+
from collections import OrderedDict
19+
from typing import List, NamedTuple, Optional
20+
21+
22+
class _IncompatibleKeys(NamedTuple):
23+
"""Compatible with PyTorch's load_state_dict return type (missing_keys, unexpected_keys)."""
24+
25+
missing_keys: List[str]
26+
unexpected_keys: List[str]
27+
28+
29+
import torch
30+
31+
from nvflare.fuel.utils.network_utils import get_open_ports
32+
33+
34+
def _checkpoint_key_to_client(k: str) -> str:
35+
for old, new in (
36+
("encoder.layers.self_attention.", "encoder.layers.0.self_attention."),
37+
("encoder.layers.mlp.", "encoder.layers.0.mlp."),
38+
):
39+
if old in k:
40+
k = k.replace(old, new, 1)
41+
return k
42+
43+
44+
def _expand_checkpoint_state_dict(sd: OrderedDict) -> OrderedDict:
45+
"""Split layer-stacked tensors [n, ...] into per-layer keys (layers.0.*, layers.1.*, ...)."""
46+
out = OrderedDict()
47+
for k, v in sd.items():
48+
if not isinstance(v, torch.Tensor):
49+
out[k] = v
50+
continue
51+
# Keys that are layer-stacked: encoder.layers.self_attention.* or encoder.layers.mlp.*
52+
if "encoder.layers.self_attention." not in k and "encoder.layers.mlp." not in k:
53+
out[_checkpoint_key_to_client(k)] = v
54+
continue
55+
if v.ndim < 1:
56+
out[_checkpoint_key_to_client(k)] = v
57+
continue
58+
num_layers = v.shape[0]
59+
# Split into per-layer keys
60+
if "encoder.layers.self_attention." in k:
61+
base = k.replace("encoder.layers.self_attention.", "encoder.layers.{}.self_attention.", 1)
62+
else:
63+
base = k.replace("encoder.layers.mlp.", "encoder.layers.{}.mlp.", 1)
64+
for i in range(num_layers):
65+
out[base.format(i)] = v[i].clone()
66+
return out
67+
68+
69+
class ESM2ModuleForServer(torch.nn.Module):
70+
"""Holds state_dict loaded from checkpoint; BioNeMoParamsFilter adds prefix when sending to client."""
71+
72+
def __init__(self, checkpoint_path: str, **kwargs):
73+
super().__init__()
74+
path = os.path.abspath(checkpoint_path)
75+
if not os.path.isfile(path) and not os.path.isdir(path):
76+
raise FileNotFoundError(f"Checkpoint path does not exist or is not a file/directory: {checkpoint_path!r}")
77+
sd = load_state_dict_from_checkpoint_path(checkpoint_path)
78+
if sd is None:
79+
raise ValueError(
80+
f"Checkpoint is missing or invalid (could not load state dict from {checkpoint_path!r}). "
81+
"Ensure the path points to a valid NeMo or PyTorch checkpoint."
82+
)
83+
self._state_dict = _expand_checkpoint_state_dict(sd)
84+
85+
@staticmethod
86+
def _stored_key(k: str) -> str:
87+
if k.startswith("module.module."):
88+
return k[len("module.") :]
89+
return k
90+
91+
def state_dict(self, *args, **kwargs):
92+
return OrderedDict(self._state_dict)
93+
94+
def load_state_dict(self, state_dict, strict: bool = True):
95+
self._state_dict = OrderedDict((self._stored_key(k), v) for k, v in state_dict.items())
96+
return _IncompatibleKeys(missing_keys=[], unexpected_keys=[])
97+
98+
99+
def _flatten_state_dict(d: dict, prefix: str = "") -> OrderedDict:
100+
out = OrderedDict()
101+
for k, v in d.items():
102+
key = f"{prefix}.{k}" if prefix else k
103+
if isinstance(v, torch.Tensor):
104+
out[key] = v
105+
elif isinstance(v, (dict, OrderedDict)):
106+
out.update(_flatten_state_dict(v, key))
107+
return out
108+
109+
110+
def _extract_state_dict(loaded: dict) -> Optional[OrderedDict]:
111+
d = loaded
112+
for key in ("model", "state_dict", "weights", "checkpoint"):
113+
if key in d and isinstance(d[key], (dict, OrderedDict)):
114+
d = d[key]
115+
break
116+
if d is None or not d:
117+
return None
118+
if all(isinstance(v, torch.Tensor) for v in d.values()):
119+
return OrderedDict(d)
120+
flat = _flatten_state_dict(d)
121+
if flat is None or not flat:
122+
return None
123+
if all(isinstance(v, torch.Tensor) for v in flat.values()):
124+
return flat
125+
return None
126+
127+
128+
def _load_nemo_distributed_checkpoint(path: str) -> Optional[OrderedDict]:
129+
weights_dir = os.path.join(path, "weights")
130+
if not os.path.isdir(weights_dir):
131+
return None
132+
files = os.listdir(weights_dir)
133+
if "metadata.json" not in files or not any(f.endswith(".distcp") for f in files):
134+
return None
135+
try:
136+
from megatron.core.dist_checkpointing.serialization import load_plain_tensors
137+
except ImportError:
138+
try:
139+
from megatron.core import dist_checkpointing as dist_ckpt
140+
141+
load_plain_tensors = getattr(dist_ckpt, "load_plain_tensors", None)
142+
except ImportError:
143+
load_plain_tensors = None
144+
if load_plain_tensors is None:
145+
return None
146+
we_initialized = not torch.distributed.is_initialized()
147+
if we_initialized:
148+
os.environ.setdefault("MASTER_ADDR", "localhost")
149+
os.environ.setdefault("MASTER_PORT", str(get_open_ports(1)[0]))
150+
torch.distributed.init_process_group(backend="gloo", rank=0, world_size=1)
151+
try:
152+
ckpt_dir = os.path.abspath(weights_dir)
153+
loaded_sd = load_plain_tensors(ckpt_dir)
154+
if not isinstance(loaded_sd, dict):
155+
return None
156+
out = OrderedDict((k, v.cpu() if v.is_cuda else v) for k, v in loaded_sd.items() if isinstance(v, torch.Tensor))
157+
return out if out else None
158+
except Exception as e:
159+
warnings.warn(f"NeMo distributed checkpoint load failed: {e}", UserWarning, stacklevel=2)
160+
return None
161+
finally:
162+
if we_initialized and torch.distributed.is_initialized():
163+
torch.distributed.destroy_process_group()
164+
165+
166+
def load_state_dict_from_checkpoint_path(checkpoint_path: str) -> Optional[OrderedDict]:
167+
"""Load a state dict from a NeMo/Megatron checkpoint file or directory.
168+
169+
Supports single-file checkpoints and NeMo distributed checkpoint directories.
170+
Uses ``torch.load(..., weights_only=False)`` so that non-tensor objects in
171+
NeMo/Megatron checkpoints are restored correctly.
172+
173+
.. note::
174+
``weights_only=False`` uses Python's pickle module, which can execute
175+
arbitrary code during deserialization. Only load checkpoints from
176+
trusted sources.
177+
"""
178+
path = os.path.abspath(checkpoint_path)
179+
loaded = None
180+
if os.path.isfile(path):
181+
try:
182+
loaded = torch.load(path, map_location="cpu", weights_only=False)
183+
except Exception:
184+
return None
185+
elif os.path.isdir(path):
186+
result = _load_nemo_distributed_checkpoint(path)
187+
if result is not None:
188+
return result
189+
candidate = os.path.join(path, "weights", "common.pt")
190+
if os.path.isfile(candidate):
191+
try:
192+
loaded = torch.load(candidate, map_location="cpu", weights_only=False)
193+
except Exception:
194+
pass
195+
if loaded is None or not isinstance(loaded, dict):
196+
return None
197+
return _extract_state_dict(loaded)

examples/advanced/bionemo/downstream/sabdab/job.py

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -18,17 +18,19 @@
1818

1919
from bionemo.core.data.load import load
2020

21-
from nvflare.app_common.widgets.decomposer_reg import DecomposerRegister
2221
from nvflare.app_opt.pt.recipes.fedavg import FedAvgRecipe
2322
from nvflare.recipe import SimEnv
2423

2524
# BioNeMo requires heavy imports (PyTorch, NeMo, Megatron) which can take longer than
2625
# the default 300s timeout on systems with slow I/O or resource contention
2726
BIONEMO_EXTERNAL_PRE_INIT_TIMEOUT = 900.0 # 15 minutes
2827

28+
# isort: off
2929
sys.path.append(os.path.join(os.getcwd(), "..")) # include parent folder in path
3030
from bionemo_filters import BioNeMoParamsFilter, BioNeMoStateDictFilter
3131

32+
# isort: on
33+
3234

3335
def main(args):
3436
checkpoint_path = load(f"esm2/{args.model}:2.0")
@@ -54,10 +56,17 @@ def main(args):
5456
script_args = f"--restore-from-checkpoint-path {checkpoint_path} --train-data-path /tmp/placeholder --valid-data-path /tmp/placeholder --config-class ESM2FineTuneSeqConfig --dataset-class InMemorySingleValueDataset --task-type classification --mlp-ft-dropout 0.1 --mlp-hidden-size 256 --mlp-target-size 2 --experiment-name sabdab_esm2_{args.model} --num-steps {args.local_steps} --num-gpus 1 --val-check-interval {val_check_interval} --log-every-n-steps 10 --lr 1e-4 --lr-multiplier 5 --scale-lr-layer classification_head --result-dir bionemo --micro-batch-size 64 --precision {precision} --save-top-k 1 --limit-val-batches 1.0 --classes {classes} --dataset-name sabdab --exp-name {args.exp_name}"
5557
print(f"Running {args.train_script} with base args (data paths will be resolved per-client)")
5658

59+
# Use dict config of the model so we only instantiate the model on the server.
60+
model = {
61+
"class_path": "model.ESM2ModuleForServer",
62+
"args": {"checkpoint_path": str(checkpoint_path)},
63+
}
64+
5765
# Create FedAvgRecipe
5866
job_name = f"{args.exp_name}_sabdab_esm2_{args.model}"
5967
recipe = FedAvgRecipe(
6068
name=job_name,
69+
model=model,
6170
min_clients=args.num_clients,
6271
num_rounds=args.num_rounds,
6372
train_script=f"../{args.train_script}",
@@ -73,10 +82,6 @@ def main(args):
7382
recipe.add_client_input_filter(BioNeMoParamsFilter(precision), tasks=["train", "validate"])
7483
recipe.add_client_output_filter(BioNeMoStateDictFilter(), tasks=["train", "validate"])
7584

76-
# Add decomposer register to server and clients
77-
recipe.job.to_server(DecomposerRegister(["nvflare.app_opt.pt.decomposers.TensorDecomposer"]))
78-
recipe.job.to_clients(DecomposerRegister(["nvflare.app_opt.pt.decomposers.TensorDecomposer"]))
79-
8085
# Add BioNeMo-specific timeout configuration to client config to override its default timeout
8186
recipe.add_client_config({"EXTERNAL_PRE_INIT_TIMEOUT": BIONEMO_EXTERNAL_PRE_INIT_TIMEOUT})
8287

examples/advanced/bionemo/downstream/scl/job.py

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -18,17 +18,19 @@
1818

1919
from bionemo.core.data.load import load
2020

21-
from nvflare.app_common.widgets.decomposer_reg import DecomposerRegister
2221
from nvflare.app_opt.pt.recipes.fedavg import FedAvgRecipe
2322
from nvflare.recipe import SimEnv
2423

2524
# BioNeMo requires heavy imports (PyTorch, NeMo, Megatron) which can take longer than
2625
# the default 300s timeout on systems with slow I/O or resource contention
2726
BIONEMO_EXTERNAL_PRE_INIT_TIMEOUT = 900.0 # 15 minutes
2827

28+
# isort: off
2929
sys.path.append(os.path.join(os.getcwd(), "..")) # include parent folder in path
3030
from bionemo_filters import BioNeMoParamsFilter, BioNeMoStateDictFilter
3131

32+
# isort: on
33+
3234

3335
def main(args):
3436
checkpoint_path = load(f"esm2/{args.model}:2.0")
@@ -54,10 +56,16 @@ def main(args):
5456
script_args = f"--restore-from-checkpoint-path {checkpoint_path} --train-data-path /tmp/placeholder --valid-data-path /tmp/placeholder --config-class ESM2FineTuneSeqConfig --dataset-class InMemorySingleValueDataset --task-type classification --mlp-ft-dropout 0.1 --mlp-hidden-size 256 --mlp-target-size 10 --experiment-name scl_esm2_{args.model} --num-steps {args.local_steps} --num-gpus 1 --val-check-interval {val_check_interval} --log-every-n-steps 10 --lr 5e-4 --result-dir bionemo --micro-batch-size 64 --precision {precision} --save-top-k 1 --encoder-frozen --limit-val-batches 1.0 --classes {classes} --dataset-name scl --exp-name {args.exp_name}"
5557
print(f"Running {args.train_script} with base args (data paths will be resolved per-client)")
5658

57-
# Create FedAvgRecipe
59+
# Use dict config of the model so we only instantiate the model on the server.
60+
model = {
61+
"class_path": "model.ESM2ModuleForServer",
62+
"args": {"checkpoint_path": str(checkpoint_path)},
63+
}
64+
5865
job_name = f"{args.exp_name}_scl_esm2_{args.model}"
5966
recipe = FedAvgRecipe(
6067
name=job_name,
68+
model=model,
6169
min_clients=args.num_clients,
6270
num_rounds=args.num_rounds,
6371
train_script=f"../{args.train_script}",
@@ -73,10 +81,6 @@ def main(args):
7381
recipe.add_client_input_filter(BioNeMoParamsFilter(precision), tasks=["train", "validate"])
7482
recipe.add_client_output_filter(BioNeMoStateDictFilter(), tasks=["train", "validate"])
7583

76-
# Add decomposer register to server and clients
77-
recipe.job.to_server(DecomposerRegister(["nvflare.app_opt.pt.decomposers.TensorDecomposer"]))
78-
recipe.job.to_clients(DecomposerRegister(["nvflare.app_opt.pt.decomposers.TensorDecomposer"]))
79-
8084
# Add BioNeMo-specific timeout configuration to client config to override its default timeout
8185
recipe.add_client_config({"EXTERNAL_PRE_INIT_TIMEOUT": BIONEMO_EXTERNAL_PRE_INIT_TIMEOUT})
8286

0 commit comments

Comments
 (0)