From 84ecfeaa831a5eae2b4fcc7260e2ab63f0f2080a Mon Sep 17 00:00:00 2001
From: Jason Mancuso <7891333+jvmncs@users.noreply.github.com>
Date: Mon, 23 Mar 2026 20:12:45 -0400
Subject: [PATCH 1/5] init miles example + lora recipes

---
 miles/README.md                               | 101 ++++
 miles/modal_train.py                          | 531 ++++++++++++++++++
 miles/recipes/glm4-7-flash-lora.args          | 120 ++++
 .../recipes/glm5-744b-a40b-20layer-lora.args  | 119 ++++
 miles/recipes/glm5-744b-a40b-4layer-lora.args | 119 ++++
 miles/recipes/glm5-744b-a40b-lora.args        | 119 ++++
 miles/recipes/qwen25-0p5b-lora.args           |  93 +++
 7 files changed, 1202 insertions(+)
 create mode 100644 miles/README.md
 create mode 100644 miles/modal_train.py
 create mode 100644 miles/recipes/glm4-7-flash-lora.args
 create mode 100644 miles/recipes/glm5-744b-a40b-20layer-lora.args
 create mode 100644 miles/recipes/glm5-744b-a40b-4layer-lora.args
 create mode 100644 miles/recipes/glm5-744b-a40b-lora.args
 create mode 100644 miles/recipes/qwen25-0p5b-lora.args

diff --git a/miles/README.md b/miles/README.md
new file mode 100644
index 0000000..26541f5
--- /dev/null
+++ b/miles/README.md
@@ -0,0 +1,101 @@
+# Miles example
+
+Multi-node Miles RL training on Modal using the same Ray bootstrap pattern as
+[`ray/modal_train.py`](../ray/modal_train.py), but with Miles recipes stored as
+native CLI flag files instead of Python config classes.
+
+## Prerequisites
+
+- A Modal account with multi-node access.
+- A `huggingface-secret` Modal secret containing `HF_TOKEN` for gated model
+  downloads.
+- Optionally export `WANDB_API_KEY` in your local shell before `modal run` to
+  auto-enable Weights & Biases logging for training runs.
+
+## Recipes
+
+List the built-in recipes:
+
+```bash
+modal run miles/modal_train.py --list-recipes
+```
+
+Current recipes:
+
+- `qwen25-0p5b-lora`: single-node smoke test adapted from the upstream Miles
+  LoRA example.
+- `glm4-7-flash-lora`: first real GLM MoE validation recipe.
+- `glm5-744b-a40b-4layer-lora`: GLM-5 testing recipe using the 4-layer script
+  shape from upstream Miles.
+- `glm5-744b-a40b-20layer-lora`: larger GLM-5 testing recipe using the 20-layer
+  script shape from upstream Miles.
+- `glm5-744b-a40b-lora`: full GLM-5 starter recipe.
+
+## Prepare assets
+
+Prepare a small GSM8K dataset in the shared volume:
+
+```bash
+modal run miles/modal_train.py::prepare_dataset
+```
+
+Download a recipe's base model into the shared Hugging Face cache:
+
+```bash
+modal run miles/modal_train.py::download_model --recipe glm4-7-flash-lora
+```
+
+## Train
+
+The cluster size is chosen at import time by `MILES_N_NODES`, so set it in the
+same shell invocation as `modal run`.
+
+Single-node smoke test:
+
+```bash
+MILES_N_NODES=1 modal run miles/modal_train.py --recipe qwen25-0p5b-lora
+```
+
+GLM-4.7-Flash multi-node validation:
+
+```bash
+MILES_N_NODES=4 modal run miles/modal_train.py --recipe glm4-7-flash-lora
+```
+
+GLM-5 4-layer testing recipe:
+
+```bash
+MILES_N_NODES=1 modal run miles/modal_train.py --recipe glm5-744b-a40b-4layer-lora --gpu H200:8
+```
+
+GLM-5 20-layer testing recipe:
+
+```bash
+MILES_N_NODES=2 modal run miles/modal_train.py --recipe glm5-744b-a40b-20layer-lora --gpu H200:8
+```
+
+Full GLM-5 starter recipe:
+
+```bash
+MILES_N_NODES=8 modal run miles/modal_train.py --recipe glm5-744b-a40b-lora --gpu H200:8
+```
+
+Useful options:
+
+- `--dry-run`: print the assembled Miles command with a `$MODEL_PATH`
+  placeholder without launching the cluster.
+- `--extra-args "...flags..."`: append ad hoc Miles CLI overrides.
+- `--extra-args-file path/to/file.args`: append overrides from a local text
+  file.
+- `--custom-config path/to/overrides.yaml`: pass a flat YAML override map to
+  Miles via `--custom-config-path`.
+- `--allow-cluster-mismatch`: bypass recipe/node-count validation if you are
+  intentionally adapting a canned recipe.
+- `USE_LOCAL_MILES=/path/to/miles`: overlay a local Miles checkout on top of
+  the pinned container image.
+- `MILES_IMAGE=radixark/miles:...`: override the pinned image tag. The current
+  default is `radixark/miles:dev-202603231227`.
+
+The wrapper intentionally owns only Modal/Ray plumbing plus a small set of
+cluster-critical flags. All model and training settings live in
+[`miles/recipes/`](./recipes/).
diff --git a/miles/modal_train.py b/miles/modal_train.py
new file mode 100644
index 0000000..35b075e
--- /dev/null
+++ b/miles/modal_train.py
@@ -0,0 +1,531 @@
+"""
+Thin Modal launcher for multi-node Miles training.
+
+Design:
+- Reload volumes in modal.enter(); start Ray lazily, once, on the first method call of a clustered modal.Cls.
+- Submit the actual Miles job from a modal.method() on rank 0.
+- Keep Miles recipes as native CLI flag files under miles/recipes/.
+- Own only infrastructure-critical flags in Python:
+  cluster size, GPUs per node, model path resolution, checkpoint path, and
+  optional YAML override transport.
+"""
+
+import datetime as dt
+import os
+import pathlib
+import shlex
+import subprocess
+import time
+from dataclasses import dataclass
+from typing import Optional
+
+import modal
+import modal.experimental
+
+
+here = pathlib.Path(__file__).parent.resolve()
+
+APP_NAME = os.environ.get("MILES_APP_NAME", "miles-modal")
+MILES_IMAGE = os.environ.get("MILES_IMAGE", "radixark/miles:dev-202603231227")
+CLUSTER_NODES = int(os.environ.get("MILES_N_NODES", "1"))  # read once at import; sizes the cluster
+DEFAULT_GPU = os.environ.get("MILES_GPU", "H100:8")
+LOCAL_MILES_PATH = os.environ.get("USE_LOCAL_MILES", "")  # non-empty: overlay this checkout on the image
+
+HF_CACHE_PATH = pathlib.Path("/root/.cache/huggingface")
+DATA_PATH = pathlib.Path("/data")
+CHECKPOINTS_PATH = pathlib.Path("/checkpoints")
+REMOTE_RECIPES_DIR = pathlib.Path("/root/miles-recipes")  # where the local recipes/ dir is copied in the image
+REMOTE_MILES_DIR = pathlib.Path("/root/miles")
+REMOTE_TRAIN_SCRIPT = REMOTE_MILES_DIR / "train.py"
+
+RAY_PORT = 6379  # port workers use to reach the Ray head
+RAY_DASHBOARD_PORT = 8265  # port used for job submission and the forwarded dashboard
+
+hf_cache_volume = modal.Volume.from_name("huggingface-cache", create_if_missing=True)
+data_volume = modal.Volume.from_name("miles-example-data", create_if_missing=True)
+checkpoints_volume = modal.Volume.from_name(
+    "miles-example-checkpoints", create_if_missing=True
+)
+
+
+@dataclass(frozen=True)
+class Recipe:  # immutable description of one built-in training recipe
+    name: str  # recipe identifier; also used as a checkpoint sub-directory name
+    description: str  # human-readable summary printed by the recipe listing
+    model_id: str  # Hugging Face repo id handed to snapshot_download
+    args_file: str  # file name resolved inside the recipes directory
+    recommended_nodes: int  # main() compares this against MILES_N_NODES
+    gpu: str  # default GPU spec such as "H100:8", used when --gpu is not given
+
+
+RECIPES = {  # built-in recipes; each key repeats the Recipe.name it maps to
+    "qwen25-0p5b-lora": Recipe(
+        name="qwen25-0p5b-lora",
+        description="Single-node smoke test adapted from the upstream Miles LoRA example.",
+        model_id="Qwen/Qwen2.5-0.5B-Instruct",
+        args_file="qwen25-0p5b-lora.args",
+        recommended_nodes=1,
+        gpu="H100:8",
+    ),
+    "glm4-7-flash-lora": Recipe(
+        name="glm4-7-flash-lora",
+        description="First real GLM MoE validation recipe on multiple nodes.",
+        model_id="zai-org/GLM-4.7-Flash",
+        args_file="glm4-7-flash-lora.args",
+        recommended_nodes=4,
+        gpu="H100:8",
+    ),
+    "glm5-744b-a40b-4layer-lora": Recipe(
+        name="glm5-744b-a40b-4layer-lora",
+        description="GLM-5 testing recipe using the upstream 4-layer model script shape.",
+        model_id="zai-org/GLM-5",
+        args_file="glm5-744b-a40b-4layer-lora.args",
+        recommended_nodes=1,
+        gpu="H200:8",
+    ),
+    "glm5-744b-a40b-20layer-lora": Recipe(
+        name="glm5-744b-a40b-20layer-lora",
+        description="GLM-5 testing recipe using the upstream 20-layer model script shape.",
+        model_id="zai-org/GLM-5",
+        args_file="glm5-744b-a40b-20layer-lora.args",
+        recommended_nodes=2,
+        gpu="H200:8",
+    ),
+    "glm5-744b-a40b-lora": Recipe(
+        name="glm5-744b-a40b-lora",
+        description="Full GLM-5 starter recipe for LoRA RLVR experiments.",
+        model_id="zai-org/GLM-5",
+        args_file="glm5-744b-a40b-lora.args",
+        recommended_nodes=8,
+        gpu="H200:8",
+    ),
+}
+
+
+def _get_recipe(name: str) -> Recipe:
+    """Look up a built-in recipe by name; raise ValueError listing the valid names if unknown."""
+    if name in RECIPES:
+        return RECIPES[name]
+    raise ValueError(f"Unknown recipe: {name}. Available recipes: {', '.join(sorted(RECIPES))}")
+
+
+def _parse_gpus_per_node(gpu: str) -> int:
+    """Return the integer after the last ':' of a spec like 'H100:8'; raise ValueError otherwise."""
+    pieces = gpu.rsplit(":", 1)
+    try:
+        return int(pieces[1])
+    except (IndexError, ValueError) as exc:
+        raise ValueError(f"GPU spec must include a per-node count like 'H100:8'; got {gpu!r}") from exc
+
+
+def _clean_arg_text(arg_text: str) -> str:
+    """Drop '#' comments and blank lines, returning the remaining lines joined by newlines.
+
+    Everything after the first '#' on a line is removed, including a '#' inside quotes.
+    """
+    stripped = (raw.split("#", 1)[0].strip() for raw in arg_text.splitlines())
+    return "\n".join(piece for piece in stripped if piece)
+
+
+def _parse_arg_text(arg_text: str) -> list[str]:
+    """Tokenize arg text shell-style; shlex drops '#' comments itself, so a quoted '#' survives."""
+    return shlex.split(arg_text, comments=True)
+
+
+def _load_recipe_text(recipe: Recipe, remote: bool = False) -> str:
+    """Read the recipe's args file from the in-image copy (remote) or the local recipes directory."""
+    return ((REMOTE_RECIPES_DIR if remote else here / "recipes") / recipe.args_file).read_text()
+
+
+def _build_enforced_args(
+    *,
+    model_path: str,
+    cluster_nodes: int,
+    gpus_per_node: int,
+    checkpoint_dir: pathlib.Path,
+    custom_config_path: Optional[str],
+    wandb_key: Optional[str],
+) -> list[str]:
+    """Build the flags this launcher always sets itself.
+
+    The result fixes the backend, model/reference paths, save directory and
+    node/GPU counts, and always includes --colocate. A custom config path
+    and W&B flags are appended only when the corresponding value is truthy.
+    """
+    nodes = str(cluster_nodes)
+    gpus = str(gpus_per_node)
+    # _build_miles_argv places these after the recipe args and the extra args.
+    enforced = ["--train-backend", "megatron"]
+    enforced += ["--hf-checkpoint", model_path]
+    enforced += ["--ref-load", model_path]
+    enforced += ["--save", checkpoint_dir.as_posix()]
+    enforced += ["--actor-num-nodes", nodes]
+    enforced += ["--actor-num-gpus-per-node", gpus]
+    enforced += ["--num-gpus-per-node", gpus]
+    enforced.append("--colocate")
+    if custom_config_path:
+        enforced += ["--custom-config-path", custom_config_path]
+    if wandb_key:
+        enforced += ["--use-wandb", "--wandb-key", wandb_key]
+    return enforced
+
+
+def _build_miles_argv(
+    recipe: Recipe,
+    *,
+    model_path: str,
+    cluster_nodes: int,
+    gpus_per_node: int,
+    checkpoint_dir: pathlib.Path,
+    extra_args_text: str,
+    custom_config_path: Optional[str],
+    wandb_key: Optional[str],
+    remote_recipe: bool,
+) -> list[str]:
+    """Assemble the training argv: python3 + train script, recipe args, extra args, enforced args."""
+    argv = ["python3", REMOTE_TRAIN_SCRIPT.as_posix()]
+    argv += _parse_arg_text(_load_recipe_text(recipe, remote=remote_recipe))
+    argv += _parse_arg_text(extra_args_text)
+    # Enforced args go last, after the recipe args and the extra args.
+    argv += _build_enforced_args(
+        model_path=model_path, cluster_nodes=cluster_nodes,
+        gpus_per_node=gpus_per_node, checkpoint_dir=checkpoint_dir,
+        custom_config_path=custom_config_path, wandb_key=wandb_key,
+    )
+    return argv
+
+
+def _read_optional_file(path_str: str) -> str:
+    """Return the text of the file at path_str, or "" when path_str is empty."""
+    # main() defaults these options to "", so an empty path means nothing to read.
+    return pathlib.Path(path_str).read_text() if path_str else ""
+
+
+def _build_runtime_env(master_addr: str, wandb_key: Optional[str]) -> dict:
+    """Return the runtime_env mapping for the Ray job; WANDB_API_KEY is set only for a truthy key."""
+    optional = {"WANDB_API_KEY": wandb_key} if wandb_key else {}
+    return {
+        "env_vars": {
+            "MASTER_ADDR": master_addr, "no_proxy": master_addr,
+            "PYTHONPATH": "/root/Megatron-LM",
+            "CUDA_DEVICE_MAX_CONNECTIONS": "1", "NCCL_ALGO": "Ring",
+            "NVTE_ALLOW_NONDETERMINISTIC_ALGO": "0",
+            "CUBLAS_WORKSPACE_CONFIG": ":4096:8",
+            **optional,
+        }
+    }
+
+
+def _print_recipe_table():
+    """Print every built-in recipe, sorted by name, one bullet line per recipe."""
+    print("Available recipes:")
+    ordered = sorted(RECIPES.values(), key=lambda entry: entry.name)
+    for r in ordered:
+        details = f"(model={r.model_id}, nodes={r.recommended_nodes}, gpu={r.gpu})"
+        print(f"  - {r.name}: {r.description} {details}")
+
+
+image = (
+    modal.Image.from_registry(MILES_IMAGE)
+    .entrypoint([])
+    .add_local_dir(here / "recipes", remote_path=REMOTE_RECIPES_DIR.as_posix(), copy=True)
+)
+
+if LOCAL_MILES_PATH:  # USE_LOCAL_MILES set: copy that checkout over /root/miles and reinstall it
+    image = image.add_local_dir(
+        LOCAL_MILES_PATH,
+        remote_path=REMOTE_MILES_DIR.as_posix(),
+        copy=True,
+        ignore=["**/__pycache__", "**/*.pyc", "**/.git", "**/.venv"],
+    ).run_commands(f"pip install -e {REMOTE_MILES_DIR} --no-deps")
+
+
+with image.imports():  # names below are used by MilesCluster and download_model
+    import ray
+    from huggingface_hub import snapshot_download
+    from ray.job_submission import JobSubmissionClient
+
+
+app = modal.App(APP_NAME)
+
+
+@app.cls(
+    image=image,
+    gpu=DEFAULT_GPU,
+    volumes={
+        HF_CACHE_PATH.as_posix(): hf_cache_volume,
+        DATA_PATH.as_posix(): data_volume,
+        CHECKPOINTS_PATH.as_posix(): checkpoints_volume,
+    },
+    timeout=24 * 60 * 60,
+    scaledown_window=60 * 60,
+    retries=2,
+    experimental_options={"efa_enabled": True},
+)
+@modal.experimental.clustered(size=CLUSTER_NODES, rdma=True)
+class MilesCluster:
+    """Clustered Modal class that starts Ray on first use and submits the Miles job from rank 0."""
+
+    @modal.enter()
+    def bootstrap_ray(self):
+        """Reload the mounted volumes and reset per-container state; Ray itself is started lazily."""
+        hf_cache_volume.reload()
+        data_volume.reload()
+        checkpoints_volume.reload()
+        self.rank = None
+        self.node_ips = []
+        self.main_addr = None
+        self.node_addr = None
+        self.client = None
+        self._ray_ready = False
+
+    def _ensure_ray_started(self):
+        """Start the Ray head (rank 0) or a Ray worker (other ranks); later calls are no-ops."""
+        if self._ray_ready:
+            return
+
+        cluster_info = modal.experimental.get_cluster_info()
+        self.rank = cluster_info.rank
+        if cluster_info.container_ipv4_ips:
+            self.node_ips = cluster_info.container_ipv4_ips
+        elif CLUSTER_NODES == 1:
+            # Modal may omit container IPv4s for size-1 clustered functions.
+            self.node_ips = ["127.0.0.1"]
+        else:
+            raise RuntimeError("Modal did not provide container IPv4s for a multi-node cluster.")
+
+        self.main_addr = self.node_ips[0]
+        self.node_addr = self.node_ips[min(self.rank, len(self.node_ips) - 1)]
+        node_ip_flag = f"--node-ip-address={self.node_addr}"
+
+        if self.rank == 0:
+            print(f"Starting Ray head at {self.node_addr}")
+            subprocess.Popen(
+                ["ray", "start", "--head", node_ip_flag,
+                 "--dashboard-host=0.0.0.0", "--disable-usage-stats"]
+            )
+
+            # Retry up to 30 times, one second apart, until the head accepts a connection.
+            for _ in range(30):
+                try:
+                    ray.init(address="auto")
+                    break
+                except Exception:
+                    time.sleep(1)
+            else:
+                raise RuntimeError("Failed to connect to the Ray head node")
+
+            # Check up to 60 times, one second apart, until every expected node is alive.
+            for _ in range(60):
+                alive_nodes = [node for node in ray.nodes() if node["Alive"]]
+                print(f"Alive nodes: {len(alive_nodes)}/{len(self.node_ips)}")
+                if len(alive_nodes) == len(self.node_ips):
+                    break
+                time.sleep(1)
+            else:
+                raise RuntimeError("Not all Ray worker nodes connected")
+
+            self.client = JobSubmissionClient(f"http://127.0.0.1:{RAY_DASHBOARD_PORT}")
+            print("Ray cluster is ready.")
+        else:
+            print(f"Starting Ray worker at {self.node_addr}, head={self.main_addr}")
+            subprocess.Popen(
+                ["ray", "start", node_ip_flag, "--address",
+                 f"{self.main_addr}:{RAY_PORT}", "--disable-usage-stats"]
+            )
+        self._ray_ready = True
+
+    @modal.method()
+    async def submit_training(
+        self,
+        recipe_name: str,
+        *,
+        gpus_per_node: int,
+        extra_args_text: str = "",
+        custom_config_yaml: str = "",
+        wandb_key: str = "",
+    ) -> dict:
+        """Submit the recipe as a Ray job from rank 0, stream its logs, and return a summary dict.
+
+        Raises RuntimeError when the model snapshot cannot be resolved from the local HF cache.
+        """
+        self._ensure_ray_started()
+
+        if self.rank != 0:
+            while True:  # non-zero ranks never return from this call; they only host a Ray worker
+                time.sleep(10)
+
+        recipe = _get_recipe(recipe_name)
+
+        try:
+            model_path = snapshot_download(repo_id=recipe.model_id, local_files_only=True)
+        except Exception as exc:
+            raise RuntimeError(
+                f"Model {recipe.model_id} is not present in the shared HF cache. "
+                f"Run `modal run miles/modal_train.py::download_model --recipe {recipe.name}` first."
+            ) from exc
+
+        # datetime.utcnow() is deprecated since Python 3.12; this formats to the same UTC text.
+        run_id = dt.datetime.now(dt.timezone.utc).strftime("%Y%m%d-%H%M%S")
+        checkpoint_dir = CHECKPOINTS_PATH / recipe.name / run_id
+        custom_config_path = None
+        if custom_config_yaml:
+            custom_config_path = f"/tmp/{recipe.name}-{run_id}-overrides.yaml"
+            pathlib.Path(custom_config_path).write_text(custom_config_yaml)
+
+        argv = _build_miles_argv(
+            recipe,
+            model_path=model_path,
+            cluster_nodes=CLUSTER_NODES,
+            gpus_per_node=gpus_per_node,
+            checkpoint_dir=checkpoint_dir,
+            extra_args_text=extra_args_text,
+            custom_config_path=custom_config_path,
+            wandb_key=wandb_key or None,
+            remote_recipe=True,
+        )
+        entrypoint = shlex.join(argv)
+        runtime_env = _build_runtime_env(self.main_addr, wandb_key or None)
+        # The W&B key is part of argv, so mask it before echoing the command into the logs.
+        shown = shlex.join("***" if wandb_key and arg == wandb_key else arg for arg in argv)
+
+        print(f"Recipe: {recipe.name}")
+        print(f"Model: {recipe.model_id}")
+        print(f"Nodes: {CLUSTER_NODES}")
+        print(f"GPUs per node: {gpus_per_node}")
+        print(f"Checkpoint dir: {checkpoint_dir}")
+        print(f"Entrypoint: {shown}")
+
+        with modal.forward(RAY_DASHBOARD_PORT) as tunnel:
+            print(f"Dashboard URL: {tunnel.url}")
+            job_id = self.client.submit_job(entrypoint=entrypoint, runtime_env=runtime_env)
+            print(f"Submitted Ray job: {job_id}")
+
+            async for line in self.client.tail_job_logs(job_id):
+                print(line, end="", flush=True)
+
+        status = self.client.get_job_status(job_id).value
+        checkpoints_volume.commit()
+        print(f"\nFinal status: {status}")
+        return {
+            "job_id": job_id,
+            "status": status,
+            "recipe": recipe.name,
+            "checkpoint_dir": checkpoint_dir.as_posix(),
+        }
+
+
+@app.function(
+    image=image,
+    volumes={HF_CACHE_PATH.as_posix(): hf_cache_volume},
+    secrets=[modal.Secret.from_name("huggingface-secret")],
+    timeout=24 * 60 * 60,
+)
+def download_model(
+    recipe: str = "glm4-7-flash-lora",
+    revision: Optional[str] = None,
+    model_id: Optional[str] = None,
+):
+    """Download a model snapshot into the HF cache volume and commit the volume.
+
+    An explicit model_id wins; otherwise the recipe's model_id is used.
+    """
+    from huggingface_hub import snapshot_download
+
+    repo = model_id or _get_recipe(recipe).model_id
+    hf_cache_volume.reload()
+    local_dir = snapshot_download(repo_id=repo, revision=revision, token=os.environ.get("HF_TOKEN"))
+    print(f"Downloaded {repo} to {local_dir}")
+    hf_cache_volume.commit()
+
+
+@app.function(
+    image=image,
+    volumes={DATA_PATH.as_posix(): data_volume},
+    timeout=24 * 60 * 60,
+)
+def prepare_dataset(
+    hf_dataset: str = "zhuzilin/gsm8k",
+    data_folder: str = "gsm8k",
+):
+    """Write the dataset's train and test splits as parquet files on the data volume, then commit."""
+    from datasets import load_dataset
+    data_volume.reload()
+    splits = load_dataset(hf_dataset)
+    target = DATA_PATH / data_folder
+    target.mkdir(parents=True, exist_ok=True)
+    for split in ("train", "test"):
+        splits[split].to_parquet((target / f"{split}.parquet").as_posix())
+    data_volume.commit()
+    print(f"Prepared dataset {hf_dataset} under {target}")
+
+
+@app.local_entrypoint()
+def main(
+    recipe: str = "qwen25-0p5b-lora",
+    gpu: str = "",
+    extra_args: str = "",
+    extra_args_file: str = "",
+    custom_config: str = "",
+    list_recipes: bool = False,
+    dry_run: bool = False,
+    allow_cluster_mismatch: bool = False,
+):
+    """CLI entry point: list recipes, print a dry-run command, or launch training on the cluster."""
+    if list_recipes:
+        _print_recipe_table()
+        return
+
+    selected_recipe = _get_recipe(recipe)
+    selected_gpu = gpu if gpu else selected_recipe.gpu
+    gpus_per_node = _parse_gpus_per_node(selected_gpu)
+
+    # Refuse a node count that differs from the recipe's unless explicitly allowed.
+    mismatch = CLUSTER_NODES != selected_recipe.recommended_nodes
+    if mismatch and not allow_cluster_mismatch:
+        raise ValueError(
+            f"Recipe {selected_recipe.name} expects MILES_N_NODES={selected_recipe.recommended_nodes}, "
+            f"but this process was started with MILES_N_NODES={CLUSTER_NODES}. "
+            f"Rerun with the recommended value or pass --allow-cluster-mismatch."
+        )
+
+    # Inline --extra-args come first, then the contents of --extra-args-file.
+    extra_parts = [extra_args, _read_optional_file(extra_args_file)]
+    merged_extra_args = "\n".join(filter(None, extra_parts))
+    custom_config_yaml = _read_optional_file(custom_config)
+    wandb_key = os.environ.get("WANDB_API_KEY", "")
+    summary = (
+        f"Recipe: {selected_recipe.name}\n"
+        f"Model: {selected_recipe.model_id}\n"
+        f"Cluster nodes: {CLUSTER_NODES}\n"
+        f"GPU: {selected_gpu}"
+    )
+
+    if dry_run:
+        # Placeholders keep the real model path and W&B key out of the printed command.
+        argv = _build_miles_argv(
+            selected_recipe,
+            model_path="$MODEL_PATH",
+            cluster_nodes=CLUSTER_NODES,
+            gpus_per_node=gpus_per_node,
+            checkpoint_dir=CHECKPOINTS_PATH / selected_recipe.name / "DRY_RUN",
+            extra_args_text=merged_extra_args,
+            custom_config_path="/tmp/custom-config.yaml" if custom_config_yaml else None,
+            wandb_key="$WANDB_API_KEY" if wandb_key else None,
+            remote_recipe=False,
+        )
+        print(summary)
+        print(shlex.join(argv))
+        return
+
+    print(summary)
+
+    cluster = MilesCluster.with_options(gpu=selected_gpu)()
+    result = cluster.submit_training.remote(
+        recipe_name=selected_recipe.name,
+        gpus_per_node=gpus_per_node,
+        extra_args_text=merged_extra_args,
+        custom_config_yaml=custom_config_yaml,
+        wandb_key=wandb_key,
+    )
+    print(result)
diff --git a/miles/recipes/glm4-7-flash-lora.args b/miles/recipes/glm4-7-flash-lora.args
new file mode 100644
index 0000000..0cf819f
--- /dev/null
+++ b/miles/recipes/glm4-7-flash-lora.args
@@ -0,0 +1,120 @@
+# GLM-4.7-Flash LoRA validation recipe.
+
+# Model architecture from the upstream Miles model script.
+--moe-layer-freq "[0]*1+[1]*46"
+--num-experts 64
+--moe-shared-expert-intermediate-size 1536
+--moe-router-topk 4
+--moe-grouped-gemm
+--moe-permute-fusion
+--moe-ffn-hidden-size 1536
+--moe-router-score-function sigmoid
+--moe-router-pre-softmax
+--moe-router-enable-expert-bias
+--moe-router-bias-update-rate 0
+--moe-router-load-balancing-type seq_aux_loss
+--moe-router-topk-scaling-factor 1.8
+--moe-aux-loss-coeff 0
+--moe-router-dtype fp32
+--make-vocab-size-divisible-by 64
+--num-layers 47
+--hidden-size 2048
+--ffn-hidden-size 10240
+--num-attention-heads 20
+--disable-bias-linear
+--add-qkv-bias
+--swiglu
+--untie-embeddings-and-output-weights
+--position-embedding-type rope
+--no-position-embedding
+--normalization RMSNorm
+--qk-layernorm
+--multi-latent-attention
+--q-lora-rank 768
+--kv-lora-rank 512
+--qk-head-dim 192
+--v-head-dim 256
+--kv-channels 192
+--qk-pos-emb-head-dim 64
+--vocab-size 154880
+--rotary-base 1000000
+--no-rope-fusion
+--mtp-num-layers 1
+
+# Checkpoint conversion
+--megatron-to-hf-mode bridge
+
+# LoRA
+--lora-rank 32
+--lora-alpha 32
+--lora-dropout 0.0
+--target-modules all-linear
+
+# Data and rollout
+--prompt-data /data/gsm8k/train.parquet
+--input-key messages
+--label-key label
+--apply-chat-template
+--rollout-shuffle
+--rm-type math
+--num-rollout 100
+--rollout-batch-size 8
+--n-samples-per-prompt 8
+--rollout-max-response-len 2048
+--rollout-temperature 1
+--global-batch-size 64
+
+# Evaluation
+--eval-interval 10
+--eval-prompt-data gsm8k /data/gsm8k/test.parquet
+--n-samples-per-eval-prompt 2
+--eval-max-response-len 4096
+--eval-top-k 1
+
+# Parallelism and performance
+--tensor-model-parallel-size 2
+--sequence-parallel
+--pipeline-model-parallel-size 4
+--context-parallel-size 1
+--expert-model-parallel-size 4
+--expert-tensor-parallel-size 1
+--recompute-granularity full
+--recompute-method uniform
+--recompute-num-layers 1
+--use-dynamic-batch-size
+--max-tokens-per-gpu 4096
+
+# GRPO
+--advantage-estimator grpo
+--kl-loss-coef 0.0
+--kl-loss-type low_var_kl
+--kl-coef 0.0
+--entropy-coef 0.0
+--eps-clip 0.2
+--eps-clip-high 0.28
+
+# Optimizer
+--optimizer adam
+--lr 1e-5
+--lr-decay-style constant
+--weight-decay 0.1
+--adam-beta1 0.9
+--adam-beta2 0.98
+
+# Rollout serving
+--rollout-num-gpus-per-engine 1
+--sglang-mem-fraction-static 0.55
+
+# Training runtime
+--attention-dropout 0.0
+--hidden-dropout 0.0
+--accumulate-allreduce-grads-in-fp32
+--attention-softmax-in-fp32
+--calculate-per-token-loss
+--use-miles-router
+--save-interval 25
+
+# Logging defaults
+--wandb-project miles-modal
+--wandb-group glm4-7-flash-lora
+--disable-wandb-random-suffix
diff --git a/miles/recipes/glm5-744b-a40b-20layer-lora.args b/miles/recipes/glm5-744b-a40b-20layer-lora.args
new file mode 100644
index 0000000..5a2e760
--- /dev/null
+++ b/miles/recipes/glm5-744b-a40b-20layer-lora.args
@@ -0,0 +1,119 @@
+# GLM-5 20-layer testing recipe.
+
+# Model architecture from the upstream Miles GLM-5 model script with the 20-layer override.
+--spec miles_plugins.models.glm5.glm5 get_glm5_spec
+--moe-layer-freq "[0]*3+[1]*17"
+--num-experts 256
+--moe-shared-expert-intermediate-size 2048
+--moe-router-topk 8
+--moe-grouped-gemm
+--moe-permute-fusion
+--moe-ffn-hidden-size 2048
+--moe-router-score-function sigmoid
+--moe-router-pre-softmax
+--moe-router-enable-expert-bias
+--moe-router-bias-update-rate 0
+--moe-router-load-balancing-type seq_aux_loss
+--moe-router-topk-scaling-factor 2.5
+--moe-aux-loss-coeff 0
+--moe-router-dtype fp32
+--make-vocab-size-divisible-by 16
+--num-layers 20
+--hidden-size 6144
+--ffn-hidden-size 12288
+--num-attention-heads 64
+--disable-bias-linear
+--swiglu
+--untie-embeddings-and-output-weights
+--position-embedding-type rope
+--no-position-embedding
+--normalization RMSNorm
+--qk-layernorm
+--multi-latent-attention
+--q-lora-rank 2048
+--kv-lora-rank 512
+--qk-head-dim 192
+--v-head-dim 256
+--kv-channels 192
+--qk-pos-emb-head-dim 64
+--vocab-size 154880
+--rotary-base 1000000
+--enable-experimental
+
+# Checkpoint conversion
+--megatron-to-hf-mode bridge
+
+# LoRA
+--lora-rank 32
+--lora-alpha 32
+--lora-dropout 0.0
+--target-modules all-linear
+
+# Data and rollout
+--prompt-data /data/gsm8k/train.parquet
+--input-key messages
+--label-key label
+--apply-chat-template
+--rollout-shuffle
+--rm-type math
+--num-rollout 100
+--rollout-batch-size 4
+--n-samples-per-prompt 8
+--rollout-max-response-len 1536
+--rollout-temperature 1
+--global-batch-size 32
+
+# Evaluation
+--eval-interval 10
+--eval-prompt-data gsm8k /data/gsm8k/test.parquet
+--n-samples-per-eval-prompt 2
+--eval-max-response-len 3072
+--eval-top-k 1
+
+# Parallelism and performance
+--tensor-model-parallel-size 2
+--sequence-parallel
+--pipeline-model-parallel-size 2
+--context-parallel-size 1
+--expert-model-parallel-size 4
+--expert-tensor-parallel-size 1
+--recompute-granularity full
+--recompute-method uniform
+--recompute-num-layers 1
+--use-dynamic-batch-size
+--max-tokens-per-gpu 3072
+
+# GRPO
+--advantage-estimator grpo
+--kl-loss-coef 0.0
+--kl-loss-type low_var_kl
+--kl-coef 0.0
+--entropy-coef 0.0
+--eps-clip 0.2
+--eps-clip-high 0.28
+
+# Optimizer
+--optimizer adam
+--lr 5e-6
+--lr-decay-style constant
+--weight-decay 0.1
+--adam-beta1 0.9
+--adam-beta2 0.98
+
+# Rollout serving
+--rollout-num-gpus-per-engine 1
+--sglang-mem-fraction-static 0.5
+
+# Training runtime
+--attention-dropout 0.0
+--hidden-dropout 0.0
+--accumulate-allreduce-grads-in-fp32
+--attention-softmax-in-fp32
+--calculate-per-token-loss
+--use-miles-router
+--save-interval 25
+
+# Logging defaults
+--wandb-project miles-modal
+--wandb-group glm5-744b-a40b-20layer-lora
+--disable-wandb-random-suffix
diff --git a/miles/recipes/glm5-744b-a40b-4layer-lora.args b/miles/recipes/glm5-744b-a40b-4layer-lora.args
new file mode 100644
index 0000000..64ac703
--- /dev/null
+++ b/miles/recipes/glm5-744b-a40b-4layer-lora.args
@@ -0,0 +1,119 @@
+# GLM-5 4-layer testing recipe.
+
+# Model architecture from the upstream Miles GLM-5 model script with the 4-layer override.
+--spec miles_plugins.models.glm5.glm5 get_glm5_spec
+--moe-layer-freq "[0]*3+[1]*1"
+--num-experts 256
+--moe-shared-expert-intermediate-size 2048
+--moe-router-topk 8
+--moe-grouped-gemm
+--moe-permute-fusion
+--moe-ffn-hidden-size 2048
+--moe-router-score-function sigmoid
+--moe-router-pre-softmax
+--moe-router-enable-expert-bias
+--moe-router-bias-update-rate 0
+--moe-router-load-balancing-type seq_aux_loss
+--moe-router-topk-scaling-factor 2.5
+--moe-aux-loss-coeff 0
+--moe-router-dtype fp32
+--make-vocab-size-divisible-by 16
+--num-layers 4
+--hidden-size 6144
+--ffn-hidden-size 12288
+--num-attention-heads 64
+--disable-bias-linear
+--swiglu
+--untie-embeddings-and-output-weights
+--position-embedding-type rope
+--no-position-embedding
+--normalization RMSNorm
+--qk-layernorm
+--multi-latent-attention
+--q-lora-rank 2048
+--kv-lora-rank 512
+--qk-head-dim 192
+--v-head-dim 256
+--kv-channels 192
+--qk-pos-emb-head-dim 64
+--vocab-size 154880
+--rotary-base 1000000
+--enable-experimental
+
+# Checkpoint conversion
+--megatron-to-hf-mode bridge
+
+# LoRA
+--lora-rank 32
+--lora-alpha 32
+--lora-dropout 0.0
+--target-modules all-linear
+
+# Data and rollout
+--prompt-data /data/gsm8k/train.parquet
+--input-key messages
+--label-key label
+--apply-chat-template
+--rollout-shuffle
+--rm-type math
+--num-rollout 100
+--rollout-batch-size 4
+--n-samples-per-prompt 8
+--rollout-max-response-len 1024
+--rollout-temperature 1
+--global-batch-size 32
+
+# Evaluation
+--eval-interval 10
+--eval-prompt-data gsm8k /data/gsm8k/test.parquet
+--n-samples-per-eval-prompt 2
+--eval-max-response-len 2048
+--eval-top-k 1
+
+# Parallelism and performance
+--tensor-model-parallel-size 2
+--sequence-parallel
+--pipeline-model-parallel-size 1
+--context-parallel-size 1
+--expert-model-parallel-size 4
+--expert-tensor-parallel-size 1
+--recompute-granularity full
+--recompute-method uniform
+--recompute-num-layers 1
+--use-dynamic-batch-size
+--max-tokens-per-gpu 3072
+
+# GRPO
+--advantage-estimator grpo
+--kl-loss-coef 0.0
+--kl-loss-type low_var_kl
+--kl-coef 0.0
+--entropy-coef 0.0
+--eps-clip 0.2
+--eps-clip-high 0.28
+
+# Optimizer
+--optimizer adam
+--lr 5e-6
+--lr-decay-style constant
+--weight-decay 0.1
+--adam-beta1 0.9
+--adam-beta2 0.98
+
+# Rollout serving
+--rollout-num-gpus-per-engine 1
+--sglang-mem-fraction-static 0.5
+
+# Training runtime
+--attention-dropout 0.0
+--hidden-dropout 0.0
+--accumulate-allreduce-grads-in-fp32
+--attention-softmax-in-fp32
+--calculate-per-token-loss
+--use-miles-router
+--save-interval 25
+
+# Logging defaults
+--wandb-project miles-modal
+--wandb-group glm5-744b-a40b-4layer-lora
+--disable-wandb-random-suffix
diff --git a/miles/recipes/glm5-744b-a40b-lora.args b/miles/recipes/glm5-744b-a40b-lora.args
new file mode 100644
index 0000000..81ebca5
--- /dev/null
+++ b/miles/recipes/glm5-744b-a40b-lora.args
@@ -0,0 +1,119 @@
+# Full GLM-5 starter recipe.
+
+# Model architecture from the upstream Miles GLM-5 model script.
+--spec miles_plugins.models.glm5.glm5 get_glm5_spec
+--moe-layer-freq "[0]*3+[1]*75"
+--num-experts 256
+--moe-shared-expert-intermediate-size 2048
+--moe-router-topk 8
+--moe-grouped-gemm
+--moe-permute-fusion
+--moe-ffn-hidden-size 2048
+--moe-router-score-function sigmoid
+--moe-router-pre-softmax
+--moe-router-enable-expert-bias
+--moe-router-bias-update-rate 0
+--moe-router-load-balancing-type seq_aux_loss
+--moe-router-topk-scaling-factor 2.5
+--moe-aux-loss-coeff 0
+--moe-router-dtype fp32
+--make-vocab-size-divisible-by 16
+--num-layers 78
+--hidden-size 6144
+--ffn-hidden-size 12288
+--num-attention-heads 64
+--disable-bias-linear
+--swiglu
+--untie-embeddings-and-output-weights
+--position-embedding-type rope
+--no-position-embedding
+--normalization RMSNorm
+--qk-layernorm
+--multi-latent-attention
+--q-lora-rank 2048
+--kv-lora-rank 512
+--qk-head-dim 192
+--v-head-dim 256
+--kv-channels 192
+--qk-pos-emb-head-dim 64
+--vocab-size 154880
+--rotary-base 1000000
+--enable-experimental
+
+# Checkpoint conversion
+--megatron-to-hf-mode bridge
+
+# LoRA
+--lora-rank 32
+--lora-alpha 32
+--lora-dropout 0.0
+--target-modules all-linear
+
+# Data and rollout
+--prompt-data /data/gsm8k/train.parquet
+--input-key messages
+--label-key label
+--apply-chat-template
+--rollout-shuffle
+--rm-type math
+--num-rollout 100
+--rollout-batch-size 2
+--n-samples-per-prompt 8
+--rollout-max-response-len 2048
+--rollout-temperature 1
+--global-batch-size 16
+
+# Evaluation
+--eval-interval 10
+--eval-prompt-data gsm8k /data/gsm8k/test.parquet
+--n-samples-per-eval-prompt 2
+--eval-max-response-len 4096
+--eval-top-k 1
+
+# Parallelism and performance
+--tensor-model-parallel-size 2
+--sequence-parallel
+--pipeline-model-parallel-size 4
+--context-parallel-size 1
+--expert-model-parallel-size 8
+--expert-tensor-parallel-size 1
+--recompute-granularity full
+--recompute-method uniform
+--recompute-num-layers 1
+--use-dynamic-batch-size
+--max-tokens-per-gpu 2048
+
+# GRPO
+--advantage-estimator grpo
+--kl-loss-coef 0.0
+--kl-loss-type low_var_kl
+--kl-coef 0.0
+--entropy-coef 0.0
+--eps-clip 0.2
+--eps-clip-high 0.28
+
+# Optimizer
+--optimizer adam
+--lr 5e-6
+--lr-decay-style constant
+--weight-decay 0.1
+--adam-beta1 0.9
+--adam-beta2 0.98
+
+# Rollout serving
+--rollout-num-gpus-per-engine 1
+--sglang-mem-fraction-static 0.45
+
+# Training runtime
+--attention-dropout 0.0
+--hidden-dropout 0.0
+--accumulate-allreduce-grads-in-fp32
+--attention-softmax-in-fp32
+--calculate-per-token-loss
+--use-miles-router
+--save-interval 25
+
+# Logging defaults
+--wandb-project miles-modal
+--wandb-group glm5-744b-a40b-lora
+--disable-wandb-random-suffix
diff --git a/miles/recipes/qwen25-0p5b-lora.args b/miles/recipes/qwen25-0p5b-lora.args
new file mode 100644
index 0000000..a3cffc7
--- /dev/null
+++ b/miles/recipes/qwen25-0p5b-lora.args
@@ -0,0 +1,93 @@
+# Qwen2.5-0.5B single-node smoke test adapted from the upstream Miles LoRA demo.
+
+# Model architecture
+--swiglu
+--num-layers 24
+--hidden-size 896
+--ffn-hidden-size 4864
+--num-attention-heads 14
+--use-rotary-position-embeddings
+--disable-bias-linear
+--add-qkv-bias
+--normalization RMSNorm
+--norm-epsilon 1e-6
+--rotary-base 1000000
+--group-query-attention
+--num-query-groups 2
+--vocab-size 151936
+
+# Checkpoint conversion
+--megatron-to-hf-mode bridge
+
+# LoRA
+--lora-rank 32
+--lora-alpha 32
+--lora-dropout 0.0
+--target-modules all-linear
+
+# Data and rollout
+--prompt-data /data/gsm8k/train.parquet
+--input-key messages
+--label-key label
+--apply-chat-template
+--rollout-shuffle
+--rm-type math
+--num-rollout 100
+--rollout-batch-size 32
+--n-samples-per-prompt 8
+--rollout-max-response-len 1024
+--rollout-temperature 1
+--global-batch-size 256
+
+# Evaluation
+--eval-interval 10
+--eval-prompt-data gsm8k /data/gsm8k/test.parquet
+--n-samples-per-eval-prompt 1
+--eval-max-response-len 1024
+--eval-top-k 1
+
+# Parallelism and performance
+--tensor-model-parallel-size 1
+--sequence-parallel
+--pipeline-model-parallel-size 1
+--context-parallel-size 1
+--expert-model-parallel-size 1
+--expert-tensor-parallel-size 1
+--use-dynamic-batch-size
+--max-tokens-per-gpu 9216
+
+# GRPO
+--advantage-estimator grpo
+--kl-loss-coef 0.0
+--kl-loss-type low_var_kl
+--kl-coef 0.0
+--entropy-coef 0.0
+--eps-clip 0.2
+--eps-clip-high 0.28
+
+# Optimizer
+--optimizer adam
+--lr 1e-5
+--lr-decay-style constant
+--weight-decay 0.1
+--adam-beta1 0.9
+--adam-beta2 0.98
+
+# Rollout serving
+--rollout-num-gpus-per-engine 1
+--sglang-mem-fraction-static 0.4
+
+# Training runtime
+--attention-dropout 0.0
+--hidden-dropout 0.0
+--accumulate-allreduce-grads-in-fp32
+--attention-softmax-in-fp32
+--attention-backend flash
+--calculate-per-token-loss
+--use-miles-router
+--save-interval 25
+
+# Logging defaults
+--wandb-project miles-modal
+--wandb-group qwen25-0p5b-lora
+--disable-wandb-random-suffix

From 8d9664f26bf2af9c53b540d93fdd92e80418d255 Mon Sep 17 00:00:00 2001
From: Jason Mancuso <7891333+jvmncs@users.noreply.github.com>
Date: Thu, 26 Mar 2026 19:04:32 -0400
Subject: [PATCH 2/5] Validate Qwen3-30B-A3B LoRA on Modal and move test
 recipes

---
 miles/README.md                               |  98 ++++-
 miles/modal_patches/sitecustomize.py          | 415 ++++++++++++++++++
 miles/modal_train.py                          |  74 ++--
 miles/recipes/glm4-7-flash-lora.args          | 120 -----
 .../recipes/glm5-744b-a40b-20layer-lora.args  | 119 -----
 ...a.args => qwen3-30b-a3b-experts-lora.args} |  94 ++--
 ...ayer-lora.args => qwen3-30b-a3b-lora.args} |  95 ++--
 .../recipes/{ => tests}/qwen25-0p5b-lora.args |   0
 .../tests/qwen3-30b-a3b-experts-fewstep.args  | 110 +++++
 .../tests/qwen3-30b-a3b-lora-fewstep.args     | 110 +++++
 .../qwen3-30b-a3b-lora-greedy-debug.args      | 111 +++++
 11 files changed, 964 insertions(+), 382 deletions(-)
 create mode 100644 miles/modal_patches/sitecustomize.py
 delete mode 100644 miles/recipes/glm4-7-flash-lora.args
 delete mode 100644 miles/recipes/glm5-744b-a40b-20layer-lora.args
 rename miles/recipes/{glm5-744b-a40b-lora.args => qwen3-30b-a3b-experts-lora.args} (56%)
 rename miles/recipes/{glm5-744b-a40b-4layer-lora.args => qwen3-30b-a3b-lora.args} (56%)
 rename miles/recipes/{ => tests}/qwen25-0p5b-lora.args (100%)
 create mode 100644 miles/recipes/tests/qwen3-30b-a3b-experts-fewstep.args
 create mode 100644 miles/recipes/tests/qwen3-30b-a3b-lora-fewstep.args
 create mode 100644 miles/recipes/tests/qwen3-30b-a3b-lora-greedy-debug.args

diff --git a/miles/README.md b/miles/README.md
index 26541f5..9d7c67c 100644
--- a/miles/README.md
+++ b/miles/README.md
@@ -24,12 +24,16 @@ Current recipes:
 
 - `qwen25-0p5b-lora`: single-node smoke test adapted from the upstream Miles
   LoRA example.
-- `glm4-7-flash-lora`: first real GLM MoE validation recipe.
-- `glm5-744b-a40b-4layer-lora`: GLM-5 testing recipe using the 4-layer script
-  shape from upstream Miles.
-- `glm5-744b-a40b-20layer-lora`: larger GLM-5 testing recipe using the 20-layer
-  script shape from upstream Miles.
-- `glm5-744b-a40b-lora`: full GLM-5 starter recipe.
+- `qwen3-30b-a3b-lora`: first-pass Qwen3-30B-A3B bridge-mode LoRA validation
+  recipe, restricted to attention targets (`linear_qkv`, `linear_proj`).
+- `qwen3-30b-a3b-lora-fewstep`: trimmed attention-only recipe that is intended
+  to prove a few full RL updates on Modal.
+- `qwen3-30b-a3b-experts-lora`: second-pass Qwen3-30B-A3B recipe widened to
+  expert `linear_fc1` and `linear_fc2` targets after the baseline path works.
+- `qwen3-30b-a3b-experts-fewstep`: trimmed expert-target recipe built from the
+  working few-step shape.
+
+Testing/debug recipe files live under [`recipes/tests/`](./recipes/tests).
 
 ## Prepare assets
 
@@ -42,7 +46,7 @@ modal run miles/modal_train.py::prepare_dataset
 Download a recipe's base model into the shared Hugging Face cache:
 
 ```bash
-modal run miles/modal_train.py::download_model --recipe glm4-7-flash-lora
+modal run miles/modal_train.py::download_model --recipe qwen3-30b-a3b-lora
 ```
 
 ## Train
@@ -56,30 +60,94 @@ Single-node smoke test:
 MILES_N_NODES=1 modal run miles/modal_train.py --recipe qwen25-0p5b-lora
 ```
 
-GLM-4.7-Flash multi-node validation:
+Qwen3-30B-A3B baseline LoRA validation:
 
 ```bash
-MILES_N_NODES=4 modal run miles/modal_train.py --recipe glm4-7-flash-lora
+MILES_N_NODES=1 modal run miles/modal_train.py --recipe qwen3-30b-a3b-lora
 ```
 
-GLM-5 4-layer testing recipe:
+Qwen3-30B-A3B few-step attention-only validation:
 
 ```bash
-MILES_N_NODES=1 modal run miles/modal_train.py --recipe glm5-744b-a40b-4layer-lora --gpu H200:8
+MILES_N_NODES=1 modal run miles/modal_train.py --recipe qwen3-30b-a3b-lora-fewstep
 ```
 
-GLM-5 20-layer testing recipe:
+Qwen3-30B-A3B expert-target LoRA follow-up:
 
 ```bash
-MILES_N_NODES=2 modal run miles/modal_train.py --recipe glm5-744b-a40b-20layer-lora --gpu H200:8
+MILES_N_NODES=1 modal run miles/modal_train.py --recipe qwen3-30b-a3b-experts-lora
 ```
 
-Full GLM-5 starter recipe:
+Qwen3-30B-A3B expert-target few-step validation:
 
 ```bash
-MILES_N_NODES=8 modal run miles/modal_train.py --recipe glm5-744b-a40b-lora --gpu H200:8
+MILES_N_NODES=1 modal run miles/modal_train.py --recipe qwen3-30b-a3b-experts-fewstep
 ```
 
+## Qwen3 Notes
+
+- Start with standard LoRA, not DoRA. Miles' current rollout sync and adapter
+  filtering are LoRA-specific and keyed off `lora_A` / `lora_B` names, so DoRA
+  is not the first validation target.
+- The baked Qwen3 recipes are single-node `H100:8` shapes. They are intended to
+  validate end-to-end bridge-mode LoRA with colocated rollout first, not to
+  exhaustively cover every parallelism combination.
+- Source inspection suggests the training path should handle TP / PP / EP / CP
+  because the bridge setup forwards all of those settings into Megatron-Bridge,
+  and Megatron-Bridge's PEFT tests cover pipeline-style model chunk lists. That
+  is still weaker than an actual Miles e2e validation for each shape.
+- Miles currently supports LoRA weight sync only for colocated rollout engines.
+  Distributed non-colocated rollout sync is not yet implemented for LoRA.
+- The baseline Qwen3 recipe stays close to the upstream Miles single-node
+  Qwen3-30B-A3B shape. The expert-target recipe is a follow-on experiment, not
+  the initial correctness target.
+
+## Observed On Modal
+
+The current wrapper includes runtime patches in
+[`modal_patches/sitecustomize.py`](./modal_patches/sitecustomize.py) that:
+
+- register Megatron-Bridge's `LinearCrossEntropyModule` as column-parallel
+  before Hugging Face weights are loaded, which fixes bridge-mode Qwen3 load on
+  `output_layer.weight`;
+- serialize colocated LoRA weight buckets in a builtins-only format and
+  rehydrate them inside SGLang, which fixes the Modal colocated LoRA sync path;
+- replace non-finite or positive SGLang logprob values with 0.0 before JSON serialization;
+- sanitize invalid SGLang sampling probability rows before `torch.multinomial`.
+
+What the Modal runs have validated so far on `modal-labs`:
+
+- `qwen3-30b-a3b-lora` gets through bridge-mode LoRA creation and attention
+  module injection (`linear_qkv`, `linear_proj`), and it can start loading the
+  Hugging Face checkpoint into Megatron.
+- `qwen3-30b-a3b-lora-fewstep` now gets through full RL training on Modal. In
+  recent runs it passed rollout, weight sync, and actor training repeatedly and
+  reached at least `train/step` 6 on a single-node `H100:8` shape.
+- `qwen3-30b-a3b-experts-lora` goes further: it creates LoRA with
+  `linear_qkv`, `linear_proj`, `linear_fc1`, and `linear_fc2`, injects those
+  expert targets under `decoder.layers.*.mlp.experts.*`, loads weights, pushes
+  the adapter into SGLang, and starts `Eval gsm8k`.
+- `qwen3-30b-a3b-experts-fewstep` has validated the widened target surface on
+  Modal: Miles creates LoRA with `linear_fc1` / `linear_fc2`, injects those
+  expert modules, completes weight sync, and reaches rollout collection plus
+  actor training. A detached confirmation of a full expert-target train step is
+  still in progress.
+- The remaining instability has been in the colocated SGLang rollout path, not
+  in LoRA target discovery. The main concrete runtime failures we hit were:
+  non-finite logprobs breaking HTTP JSON serialization, and invalid sampling
+  probability tensors breaking `torch.multinomial`.
+
+Current interpretation:
+
+- Qwen3-30B-A3B MoE LoRA support in Miles is real enough to instantiate,
+  target, load, and export adapters for both attention and expert MLP layers.
+- Attention-only Qwen3-30B-A3B LoRA is now runtime-validated for repeated RL
+  updates on `modal-labs`.
+- The remaining risk is concentrated in the colocated SGLang rollout lifecycle,
+  which is coupled to `offload_rollout` / `enable_memory_saver=True` in the
+  current Miles SGLang engine setup, especially once expert-target LoRA is
+  enabled.
+
 Useful options:
 
 - `--dry-run`: print the assembled Miles command with a `$MODEL_PATH`
diff --git a/miles/modal_patches/sitecustomize.py b/miles/modal_patches/sitecustomize.py
new file mode 100644
index 0000000..0662644
--- /dev/null
+++ b/miles/modal_patches/sitecustomize.py
@@ -0,0 +1,415 @@
+"""Modal runtime patches loaded automatically via PYTHONPATH."""
+
+
+def _log(message: str) -> None:  # prints one status line for the patches in this module
+    print(message, flush=True)  # flush=True pushes the line out immediately instead of leaving it buffered
+
+
+def _register_linear_cross_entropy_module() -> None:  # best effort: import/registration errors are logged, not raised
+    try:
+        from megatron.bridge.models.conversion.param_mapping import AutoMapping
+    except Exception as exc:  # Megatron-Bridge is missing or fails to import: report it and skip the patch
+        _log(
+            "[miles-modal] bridge patch unavailable for LinearCrossEntropyModule: "
+            f"{type(exc).__name__}: {exc}"
+        )
+        return
+
+    try:
+        AutoMapping.register_module_type("LinearCrossEntropyModule", "column")
+    except Exception as exc:
+        message = str(exc).lower()  # lower-cased so the keyword match below is case-insensitive
+        if any(token in message for token in ("already", "exists", "duplicate")):  # reads like a duplicate registration
+            _log(
+                "[miles-modal] bridge patch already present for "
+                "LinearCrossEntropyModule"
+            )
+            return
+        _log(
+            "[miles-modal] bridge patch failed for LinearCrossEntropyModule: "
+            f"{type(exc).__name__}: {exc}"
+        )
+        return
+
+    _log("[miles-modal] registered LinearCrossEntropyModule as column parallel")
+
+
+def _patch_lora_cpu_serialization() -> None:  # replaces Miles' _send_to_colocated_engine with the variant defined below
+    try:
+        import base64
+        import io
+        import torch
+        from miles.backends.megatron_utils.update_weight import (
+            update_weight_from_tensor as update_weight_mod,
+        )
+    except Exception as exc:  # a dependency is missing or fails to import: log and leave Miles unpatched
+        _log(
+            "[miles-modal] LoRA CPU serialization patch unavailable: "
+            f"{type(exc).__name__}: {exc}"
+        )
+        return
+
+    original = getattr(update_weight_mod, "_send_to_colocated_engine", None)
+    if original is None:
+        _log("[miles-modal] LoRA CPU serialization patch missing target function")
+        return
+
+    if getattr(original, "__module__", "") == __name__:  # target was defined in this module, i.e. already replaced
+        _log("[miles-modal] LoRA CPU serialization patch already present")
+        return
+
+    dist = update_weight_mod.dist  # reuse the objects the Miles module itself has in scope
+    ray = update_weight_mod.ray
+    FlattenedTensorBucket = update_weight_mod.FlattenedTensorBucket
+    MultiprocessingSerializer = update_weight_mod.MultiprocessingSerializer
+
+    def _send_to_colocated_engine(
+        hf_named_tensors,
+        *,
+        ipc_engine,
+        ipc_gather_src,
+        ipc_gather_group,
+        weight_version=None,
+        lora_config=None,
+        lora_name=None,
+        lora_loaded=False,
+    ):
+        # Placeholder ranks (GPU slots reserved but no engine) have no gather group.
+        # gather_object is only collective among group members, so we skip entirely.
+        if ipc_gather_group is None:
+            return [], None
+
+        is_lora = lora_config is not None  # a LoRA config being passed is what selects the LoRA path
+        long_live_tensors = []  # payload dicts are collected here and returned together with the refs
+
+        if getattr(FlattenedTensorBucket, "supports_multi_dtypes", False):  # one bucket holds every tensor
+            converted_named_tensors_by_dtypes = {"dtype": hf_named_tensors}  # "dtype" is a placeholder key; keys are not read below
+        else:  # otherwise build one bucket per tensor dtype
+            converted_named_tensors_by_dtypes = {}
+            for name, tensor in hf_named_tensors:
+                dtype = tensor.dtype
+                if dtype not in converted_named_tensors_by_dtypes:
+                    converted_named_tensors_by_dtypes[dtype] = []
+                converted_named_tensors_by_dtypes[dtype].append((name, tensor))
+
+        serialized_tensors = []  # one serialized payload per bucket, in bucket order
+        for _dtype, named_tensors in converted_named_tensors_by_dtypes.items():
+            flattened_tensor_bucket = FlattenedTensorBucket(named_tensors=named_tensors)
+            flattened_tensor = flattened_tensor_bucket.get_flattened_tensor()
+
+            # Modal's colocated LoRA sync can fail on CUDA IPC, and CPU torch.Tensor
+            # pickling still goes through multiprocessing resource_sharer. Serialize
+            # LoRA flattened buckets into a builtins-only payload so SGLang's safe
+            # unpickler can accept it without touching multiprocessing shims.
+            if is_lora and isinstance(flattened_tensor, torch.Tensor) and flattened_tensor.is_cuda:
+                flattened_tensor = flattened_tensor.detach().cpu()
+
+            if is_lora:
+                if not isinstance(flattened_tensor, torch.Tensor):
+                    raise TypeError(
+                        "Expected LoRA flattened tensor to be a torch.Tensor, got "
+                        f"{type(flattened_tensor).__name__}"
+                    )
+                buffer = io.BytesIO()
+                torch.save(flattened_tensor.contiguous(), buffer)  # tensor bytes go into the in-memory buffer
+                flattened_tensor_data = {  # only str, int, list and dict values end up in this payload
+                    "_miles_modal_format": "torch_save_flattened_lora_v2",
+                    "flattened_tensor_torch_save_b64": base64.b64encode(buffer.getvalue()).decode("ascii"),
+                    "metadata": [
+                        {
+                            "name": meta.name,
+                            "shape": list(meta.shape),
+                            "dtype": str(meta.dtype).removeprefix("torch."),  # dtype's string form without the "torch." prefix
+                            "start_idx": meta.start_idx,
+                            "end_idx": meta.end_idx,
+                            "numel": meta.numel,
+                        }
+                        for meta in flattened_tensor_bucket.get_metadata()
+                    ],
+                }
+            else:  # non-LoRA payloads keep the tensor object and the bucket metadata as they are
+                flattened_tensor_data = {
+                    "flattened_tensor": flattened_tensor,
+                    "metadata": flattened_tensor_bucket.get_metadata(),
+                }
+            long_live_tensors.append(flattened_tensor_data)
+            serialized_tensors.append(
+                MultiprocessingSerializer.serialize(
+                    flattened_tensor_data,
+                    output_str=True,
+                )
+            )
+
+        serialized_named_tensors = (  # only the gather destination allocates a receive list
+            [None] * dist.get_world_size(ipc_gather_group)
+            if ipc_gather_src == dist.get_rank()
+            else None
+        )
+        dist.gather_object(
+            serialized_tensors,
+            object_gather_list=serialized_named_tensors,
+            dst=ipc_gather_src,
+            group=ipc_gather_group,
+        )
+
+        refs = []
+        if dist.get_rank() == ipc_gather_src:  # only the gather destination talks to the engine
+            if is_lora:
+                if lora_loaded:  # an adapter is already loaded: wait for its unload before loading the new one
+                    ray.get(ipc_engine.unload_lora_adapter.remote(lora_name=lora_name))
+
+                refs.append(
+                    ipc_engine.load_lora_adapter_from_tensors.remote(
+                        lora_name=lora_name,
+                        config_dict=lora_config,
+                        serialized_tensors=serialized_named_tensors[0][0],  # NOTE(review): only the first gathered entry's first bucket is sent — confirm intended
+                        load_format="flattened_bucket",
+                    )
+                )
+            else:
+                num_dtypes = len(serialized_named_tensors[0])  # bucket count is taken from the first gathered entry
+                for i in range(num_dtypes):
+                    kwargs = {
+                        "serialized_named_tensors": [tensors[i] for tensors in serialized_named_tensors],
+                        "load_format": "flattened_bucket",
+                        "weight_version": str(weight_version),
+                    }
+                    refs.append(ipc_engine.update_weights_from_tensor.remote(**kwargs))
+
+        return refs, long_live_tensors  # refs stays empty on every rank other than ipc_gather_src
+
+    update_weight_mod._send_to_colocated_engine = _send_to_colocated_engine
+    _log("[miles-modal] patched colocated LoRA sync to builtins-only flattened buckets")
+
+
+def _patch_sglang_lora_numpy_rehydration() -> None:
+    """Patch SGLang's TpModelWorker.load_lora_adapter_from_tensors to accept `_miles_modal_format` payloads."""
+    # Best effort: logs and returns without patching when a target is missing or already patched.
+    try:
+        import base64
+        import io
+        import torch
+        from sglang.srt.managers import tp_worker as tp_worker_mod
+        from sglang.srt.weight_sync.tensor_bucket import FlattenedTensorMetadata
+    except Exception as exc:
+        _log(
+            "[miles-modal] SGLang LoRA rehydration patch unavailable: "
+            f"{type(exc).__name__}: {exc}"
+        )
+        return
+
+    TpModelWorker = getattr(tp_worker_mod, "TpModelWorker", None)
+    if TpModelWorker is None:
+        _log("[miles-modal] SGLang LoRA rehydration patch missing TpModelWorker")
+        return
+
+    original = getattr(TpModelWorker, "load_lora_adapter_from_tensors", None)
+    if original is None:
+        _log("[miles-modal] SGLang LoRA rehydration patch missing target method")
+        return
+
+    if getattr(original, "__module__", "") == __name__:
+        _log("[miles-modal] SGLang LoRA rehydration patch already present")
+        return
+
+    MultiprocessingSerializer = tp_worker_mod.MultiprocessingSerializer
+    FlattenedTensorBucket = tp_worker_mod.FlattenedTensorBucket
+
+    def _torch_dtype_from_name(dtype_name: str):
+        return getattr(torch, dtype_name.removeprefix("torch."))
+
+    def _rebuild_metadata(raw_metadata):
+        # Turn the plain-dict metadata entries back into FlattenedTensorMetadata objects.
+        return [
+            FlattenedTensorMetadata(
+                name=meta["name"],
+                shape=torch.Size(meta["shape"]),
+                dtype=_torch_dtype_from_name(meta["dtype"]),
+                start_idx=meta["start_idx"],
+                end_idx=meta["end_idx"],
+                numel=meta["numel"],
+            )
+            for meta in raw_metadata
+        ]
+
+    def load_lora_adapter_from_tensors(self, recv_req):
+        # The LoRA code handles TP sharding internally using slice_lora_a_weights
+        # and slice_lora_b_weights methods (see lora/layers.py:46-49, mem_pool.py:437-440).
+        if recv_req.load_format == "flattened_bucket":
+            flattened_data = MultiprocessingSerializer.deserialize(
+                recv_req.serialized_tensors
+            )
+            payload_format = flattened_data.get("_miles_modal_format")
+            if payload_format == "torch_save_flattened_lora_v2":
+                raw_bytes = base64.b64decode(flattened_data["flattened_tensor_torch_save_b64"])
+                # weights_only=True keeps torch.load on its restricted unpickler (no arbitrary pickle code).
+                flattened_tensor = torch.load(
+                    io.BytesIO(raw_bytes),
+                    map_location="cpu",
+                    weights_only=True,
+                )
+                metadata = _rebuild_metadata(flattened_data["metadata"])
+            elif payload_format == "raw_flattened_lora_v1":
+                # v1 layout: base64 of raw bytes, exposed as a uint8 tensor and copied out of the buffer.
+                raw_bytes = base64.b64decode(flattened_data["flattened_tensor_b64"])
+                flattened_tensor = torch.frombuffer(
+                    memoryview(raw_bytes),
+                    dtype=torch.uint8,
+                ).clone()
+                metadata = _rebuild_metadata(flattened_data["metadata"])
+            else:
+                # No format marker: the payload already carries the tensor and metadata objects.
+                flattened_tensor = flattened_data["flattened_tensor"]
+                metadata = flattened_data["metadata"]
+            bucket = FlattenedTensorBucket(
+                flattened_tensor=flattened_tensor,
+                metadata=metadata,
+            )
+            tensors = dict(bucket.reconstruct_tensors())
+        else:
+            tensors = MultiprocessingSerializer.deserialize(recv_req.serialized_tensors)
+        return self.model_runner.load_lora_adapter_from_tensors(
+            recv_req.to_ref(),
+            tensors,
+            recv_req.config_dict,
+            recv_req.added_tokens_config,
+        )
+
+    TpModelWorker.load_lora_adapter_from_tensors = load_lora_adapter_from_tensors
+    _log("[miles-modal] patched SGLang LoRA load path to rehydrate builtins-only flattened buckets")
+
+
+def _patch_sglang_logprob_sanitization() -> None:  # wraps TokenizerManager.detokenize_logprob_tokens with a value sanitizer
+    try:
+        import math
+        from sglang.srt.managers import tokenizer_manager as tokenizer_manager_mod
+    except Exception as exc:  # SGLang is missing or fails to import: log and skip the patch
+        _log(
+            "[miles-modal] SGLang logprob sanitization patch unavailable: "
+            f"{type(exc).__name__}: {exc}"
+        )
+        return
+
+    TokenizerManager = getattr(tokenizer_manager_mod, "TokenizerManager", None)
+    if TokenizerManager is None:
+        _log("[miles-modal] SGLang logprob sanitization patch missing TokenizerManager")
+        return
+
+    original = getattr(TokenizerManager, "detokenize_logprob_tokens", None)
+    if original is None:
+        _log("[miles-modal] SGLang logprob sanitization patch missing target method")
+        return
+
+    if getattr(original, "__module__", "") == __name__:  # method was defined in this module, i.e. already wrapped
+        _log("[miles-modal] SGLang logprob sanitization patch already present")
+        return
+
+    sanitize_state = {"count": 0}  # running number of replaced values; a dict so the nested function can update it
+
+    def _sanitize_logprob(value):
+        try:
+            numeric = float(value)
+        except Exception:  # not convertible to float: hand the value back unchanged
+            return value
+
+        if math.isnan(numeric) or math.isinf(numeric):
+            sanitized = 0.0
+        elif numeric > 0.0:  # positive values are replaced with 0.0 as well
+            sanitized = 0.0
+        else:
+            sanitized = numeric
+
+        if sanitized != numeric:  # also true for NaN, which never compares equal to anything
+            sanitize_state["count"] += 1
+            if sanitize_state["count"] <= 8:  # only the first 8 replacements are logged
+                _log(
+                    "[miles-modal] sanitized SGLang logprob "
+                    f"{numeric!r} -> {sanitized!r}"
+                )
+        return sanitized
+
+    def detokenize_logprob_tokens(self, token_logprobs_val, token_logprobs_idx, decode_to_text):
+        sanitized_vals = [_sanitize_logprob(value) for value in token_logprobs_val]  # only the values are touched
+        return original(self, sanitized_vals, token_logprobs_idx, decode_to_text)
+
+    TokenizerManager.detokenize_logprob_tokens = detokenize_logprob_tokens
+    _log("[miles-modal] patched SGLang logprob detokenization to sanitize non-finite values")
+
+
+def _patch_sglang_sampling_probability_sanitization() -> None:  # wraps sampler.sampling_from_probs_torch with a probability sanitizer
+    try:
+        import torch
+        from sglang.srt.layers import sampler as sampler_mod
+    except Exception as exc:  # torch or SGLang is missing or fails to import: log and skip the patch
+        _log(
+            "[miles-modal] SGLang sampling probability patch unavailable: "
+            f"{type(exc).__name__}: {exc}"
+        )
+        return
+
+    original = getattr(sampler_mod, "sampling_from_probs_torch", None)
+    if original is None:
+        _log("[miles-modal] SGLang sampling probability patch missing target function")
+        return
+
+    if getattr(original, "__module__", "") == __name__:  # function was defined in this module, i.e. already wrapped
+        _log("[miles-modal] SGLang sampling probability patch already present")
+        return
+
+    sanitize_state = {"count": 0}  # running number of sanitized calls; a dict so the nested function can update it
+
+    def _sanitize_probs(probs: torch.Tensor) -> torch.Tensor:  # result is float32, renormalized along the last dim
+        probs_fp32 = probs.float()
+        valid_mask = torch.isfinite(probs_fp32) & (probs_fp32 >= 0)  # True for finite, non-negative entries
+        safe_probs = torch.where(valid_mask, probs_fp32, torch.zeros_like(probs_fp32))  # invalid entries become 0
+        row_sums = safe_probs.sum(dim=-1, keepdim=True)
+        zero_rows = row_sums <= 0  # rows left with no probability mass
+
+        has_invalid = bool((~valid_mask).any().item())  # .item() pulls the flag back into Python on every call
+        has_zero_rows = bool(zero_rows.any().item())
+
+        if has_zero_rows:  # give each empty row a one-hot distribution instead
+            fallback_scores = torch.nan_to_num(  # NaN and both infinities are ranked lowest for the argmax below
+                probs_fp32,
+                nan=float("-inf"),
+                posinf=float("-inf"),
+                neginf=float("-inf"),
+            )
+            fallback_indices = fallback_scores.argmax(dim=-1, keepdim=True)
+            fallback_probs = torch.zeros_like(safe_probs)
+            fallback_probs.scatter_(-1, fallback_indices, 1.0)  # 1.0 at each row's argmax position
+            safe_probs = torch.where(zero_rows, fallback_probs, safe_probs)  # rows that still have mass are kept
+            row_sums = safe_probs.sum(dim=-1, keepdim=True)
+
+        if has_invalid or has_zero_rows:
+            sanitize_state["count"] += 1
+            if sanitize_state["count"] <= 8:  # only the first 8 sanitized calls are logged
+                _log(
+                    "[miles-modal] sanitized SGLang sampling probs "
+                    f"(invalid_entries={int((~valid_mask).sum().item())}, "
+                    f"zero_rows={int(zero_rows.sum().item())})"
+                )
+
+        return safe_probs / row_sums.clamp_min(1e-12)  # the clamp keeps the division away from a zero row sum
+
+    def sampling_from_probs_torch(
+        probs: torch.Tensor,
+        sampling_seed=None,
+        positions=None,
+    ):
+        safe_probs = _sanitize_probs(probs)  # sanitize first, then defer to the original sampler
+        return original(
+            safe_probs,
+            sampling_seed=sampling_seed,
+            positions=positions,
+        )
+
+    sampler_mod.sampling_from_probs_torch = sampling_from_probs_torch
+    _log("[miles-modal] patched SGLang sampling to sanitize invalid probability rows")
+
+
+_register_linear_cross_entropy_module()  # module-level calls: the patches are applied, in this order, when this module is imported
+_patch_lora_cpu_serialization()
+_patch_sglang_lora_numpy_rehydration()
+_patch_sglang_logprob_sanitization()
+_patch_sglang_sampling_probability_sanitization()
diff --git a/miles/modal_train.py b/miles/modal_train.py
index 35b075e..5477a33 100644
--- a/miles/modal_train.py
+++ b/miles/modal_train.py
@@ -35,6 +35,7 @@
 DATA_PATH = pathlib.Path("/data")
 CHECKPOINTS_PATH = pathlib.Path("/checkpoints")
 REMOTE_RECIPES_DIR = pathlib.Path("/root/miles-recipes")
+REMOTE_PATCH_DIR = pathlib.Path("/root/miles-modal-patches")
 REMOTE_MILES_DIR = pathlib.Path("/root/miles")
 REMOTE_TRAIN_SCRIPT = REMOTE_MILES_DIR / "train.py"
 
@@ -63,41 +64,49 @@ class Recipe:
         name="qwen25-0p5b-lora",
         description="Single-node smoke test adapted from the upstream Miles LoRA example.",
         model_id="Qwen/Qwen2.5-0.5B-Instruct",
-        args_file="qwen25-0p5b-lora.args",
+        args_file="tests/qwen25-0p5b-lora.args",
         recommended_nodes=1,
         gpu="H100:8",
     ),
-    "glm4-7-flash-lora": Recipe(
-        name="glm4-7-flash-lora",
-        description="First real GLM MoE validation recipe on multiple nodes.",
-        model_id="zai-org/GLM-4.7-Flash",
-        args_file="glm4-7-flash-lora.args",
-        recommended_nodes=4,
+    "qwen3-30b-a3b-lora": Recipe(
+        name="qwen3-30b-a3b-lora",
+        description="Single-node Qwen3-30B-A3B bridge-mode LoRA validation recipe.",
+        model_id="Qwen/Qwen3-30B-A3B",
+        args_file="qwen3-30b-a3b-lora.args",
+        recommended_nodes=1,
         gpu="H100:8",
     ),
-    "glm5-744b-a40b-4layer-lora": Recipe(
-        name="glm5-744b-a40b-4layer-lora",
-        description="GLM-5 testing recipe using the upstream 4-layer model script shape.",
-        model_id="zai-org/GLM-5",
-        args_file="glm5-744b-a40b-4layer-lora.args",
+    "qwen3-30b-a3b-lora-fewstep": Recipe(
+        name="qwen3-30b-a3b-lora-fewstep",
+        description="Single-node Qwen3-30B-A3B attention-only LoRA recipe trimmed to chase a few full RL steps.",
+        model_id="Qwen/Qwen3-30B-A3B",
+        args_file="tests/qwen3-30b-a3b-lora-fewstep.args",
         recommended_nodes=1,
-        gpu="H200:8",
+        gpu="H100:8",
     ),
-    "glm5-744b-a40b-20layer-lora": Recipe(
-        name="glm5-744b-a40b-20layer-lora",
-        description="GLM-5 testing recipe using the upstream 20-layer model script shape.",
-        model_id="zai-org/GLM-5",
-        args_file="glm5-744b-a40b-20layer-lora.args",
-        recommended_nodes=2,
-        gpu="H200:8",
+    "qwen3-30b-a3b-lora-greedy-debug": Recipe(
+        name="qwen3-30b-a3b-lora-greedy-debug",
+        description="Single-node Qwen3-30B-A3B attention-only LoRA debug recipe with greedy rollout to validate LoRA sync.",
+        model_id="Qwen/Qwen3-30B-A3B",
+        args_file="tests/qwen3-30b-a3b-lora-greedy-debug.args",
+        recommended_nodes=1,
+        gpu="H100:8",
     ),
-    "glm5-744b-a40b-lora": Recipe(
-        name="glm5-744b-a40b-lora",
-        description="Full GLM-5 starter recipe for LoRA RLVR experiments.",
-        model_id="zai-org/GLM-5",
-        args_file="glm5-744b-a40b-lora.args",
-        recommended_nodes=8,
-        gpu="H200:8",
+    "qwen3-30b-a3b-experts-lora": Recipe(
+        name="qwen3-30b-a3b-experts-lora",
+        description="Second-phase Qwen3-30B-A3B recipe widened to expert linear_fc1/fc2 targets.",
+        model_id="Qwen/Qwen3-30B-A3B",
+        args_file="qwen3-30b-a3b-experts-lora.args",
+        recommended_nodes=1,
+        gpu="H100:8",
+    ),
+    "qwen3-30b-a3b-experts-fewstep": Recipe(
+        name="qwen3-30b-a3b-experts-fewstep",
+        description="Single-node Qwen3-30B-A3B expert-target LoRA recipe trimmed to chase a few RL steps.",
+        model_id="Qwen/Qwen3-30B-A3B",
+        args_file="tests/qwen3-30b-a3b-experts-fewstep.args",
+        recommended_nodes=1,
+        gpu="H100:8",
     ),
 }
 
@@ -205,7 +214,7 @@ def _build_runtime_env(master_addr: str, wandb_key: Optional[str]) -> dict:
     env_vars = {
         "MASTER_ADDR": master_addr,
         "no_proxy": master_addr,
-        "PYTHONPATH": "/root/Megatron-LM",
+        "PYTHONPATH": f"{REMOTE_PATCH_DIR.as_posix()}:/root/Megatron-LM",
         "CUDA_DEVICE_MAX_CONNECTIONS": "1",
         "NCCL_ALGO": "Ring",
         "NVTE_ALLOW_NONDETERMINISTIC_ALGO": "0",
@@ -229,6 +238,11 @@ def _print_recipe_table():
     modal.Image.from_registry(MILES_IMAGE)
     .entrypoint([])
     .add_local_dir(here / "recipes", remote_path=REMOTE_RECIPES_DIR.as_posix(), copy=True)
+    .add_local_dir(
+        here / "modal_patches",
+        remote_path=REMOTE_PATCH_DIR.as_posix(),
+        copy=True,
+    )
 )
 
 if LOCAL_MILES_PATH:
@@ -422,7 +436,7 @@ async def submit_training(
     timeout=24 * 60 * 60,
 )
 def download_model(
-    recipe: str = "glm4-7-flash-lora",
+    recipe: str = "qwen3-30b-a3b-lora",
     revision: Optional[str] = None,
     model_id: Optional[str] = None,
 ):
@@ -462,7 +476,7 @@ def prepare_dataset(
 
 @app.local_entrypoint()
 def main(
-    recipe: str = "qwen25-0p5b-lora",
+    recipe: str = "qwen3-30b-a3b-lora",
     gpu: str = "",
     extra_args: str = "",
     extra_args_file: str = "",
diff --git a/miles/recipes/glm4-7-flash-lora.args b/miles/recipes/glm4-7-flash-lora.args
deleted file mode 100644
index 0cf819f..0000000
--- a/miles/recipes/glm4-7-flash-lora.args
+++ /dev/null
@@ -1,120 +0,0 @@
-# GLM-4.7-Flash LoRA validation recipe.
-
-# Model architecture from the upstream Miles model script.
---moe-layer-freq "[0]*1+[1]*46"
---num-experts 64
---moe-shared-expert-intermediate-size 1536
---moe-router-topk 4
---moe-grouped-gemm
---moe-permute-fusion
---moe-ffn-hidden-size 1536
---moe-router-score-function sigmoid
---moe-router-pre-softmax
---moe-router-enable-expert-bias
---moe-router-bias-update-rate 0
---moe-router-load-balancing-type seq_aux_loss
---moe-router-topk-scaling-factor 1.8
---moe-aux-loss-coeff 0
---moe-router-dtype fp32
---make-vocab-size-divisible-by 64
---num-layers 47
---hidden-size 2048
---ffn-hidden-size 10240
---num-attention-heads 20
---disable-bias-linear
---add-qkv-bias
---swiglu
---untie-embeddings-and-output-weights
---position-embedding-type rope
---no-position-embedding
---normalization RMSNorm
---qk-layernorm
---multi-latent-attention
---q-lora-rank 768
---kv-lora-rank 512
---qk-head-dim 192
---v-head-dim 256
---kv-channels 192
---qk-pos-emb-head-dim 64
---vocab-size 154880
---rotary-base 1000000
---no-rope-fusion
---mtp-num-layers 1
-
-# Checkpoint conversion
---megatron-to-hf-mode bridge
-
-# LoRA
---lora-rank 32
---lora-alpha 32
---lora-dropout 0.0
---target-modules all-linear
-
-# Data and rollout
---prompt-data /data/gsm8k/train.parquet
---input-key messages
---label-key label
---apply-chat-template
---rollout-shuffle
---rm-type math
---num-rollout 100
---rollout-batch-size 8
---n-samples-per-prompt 8
---rollout-max-response-len 2048
---rollout-temperature 1
---global-batch-size 64
-
-# Evaluation
---eval-interval 10
---eval-prompt-data gsm8k /data/gsm8k/test.parquet
---n-samples-per-eval-prompt 2
---eval-max-response-len 4096
---eval-top-k 1
-
-# Parallelism and performance
---tensor-model-parallel-size 2
---sequence-parallel
---pipeline-model-parallel-size 4
---context-parallel-size 1
---expert-model-parallel-size 4
---expert-tensor-parallel-size 1
---recompute-granularity full
---recompute-method uniform
---recompute-num-layers 1
---use-dynamic-batch-size
---max-tokens-per-gpu 4096
-
-# GRPO
---advantage-estimator grpo
---kl-loss-coef 0.0
---kl-loss-type low_var_kl
---kl-coef 0.0
---entropy-coef 0.0
---eps-clip 0.2
---eps-clip-high 0.28
-
-# Optimizer
---optimizer adam
---lr 1e-5
---lr-decay-style constant
---weight-decay 0.1
---adam-beta1 0.9
---adam-beta2 0.98
-
-# Rollout serving
---rollout-num-gpus-per-engine 1
---sglang-mem-fraction-static 0.55
-
-# Training runtime
---attention-dropout 0.0
---hidden-dropout 0.0
---accumulate-allreduce-grads-in-fp32
---attention-softmax-in-fp32
---calculate-per-token-loss
---use-miles-router
---save-interval 25
-
-# Logging defaults
---wandb-project miles-modal
---wandb-group glm4-7-flash-lora
---disable-wandb-random-suffix
diff --git a/miles/recipes/glm5-744b-a40b-20layer-lora.args b/miles/recipes/glm5-744b-a40b-20layer-lora.args
deleted file mode 100644
index 5a2e760..0000000
--- a/miles/recipes/glm5-744b-a40b-20layer-lora.args
+++ /dev/null
@@ -1,119 +0,0 @@
-# GLM-5 20-layer testing recipe.
-
-# Model architecture from the upstream Miles GLM-5 model script with the 20-layer override.
---spec miles_plugins.models.glm5.glm5 get_glm5_spec
---moe-layer-freq "[0]*3+[1]*17"
---num-experts 256
---moe-shared-expert-intermediate-size 2048
---moe-router-topk 8
---moe-grouped-gemm
---moe-permute-fusion
---moe-ffn-hidden-size 2048
---moe-router-score-function sigmoid
---moe-router-pre-softmax
---moe-router-enable-expert-bias
---moe-router-bias-update-rate 0
---moe-router-load-balancing-type seq_aux_loss
---moe-router-topk-scaling-factor 2.5
---moe-aux-loss-coeff 0
---moe-router-dtype fp32
---make-vocab-size-divisible-by 16
---num-layers 20
---hidden-size 6144
---ffn-hidden-size 12288
---num-attention-heads 64
---disable-bias-linear
---swiglu
---untie-embeddings-and-output-weights
---position-embedding-type rope
---no-position-embedding
---normalization RMSNorm
---qk-layernorm
---multi-latent-attention
---q-lora-rank 2048
---kv-lora-rank 512
---qk-head-dim 192
---v-head-dim 256
---kv-channels 192
---qk-pos-emb-head-dim 64
---vocab-size 154880
---rotary-base 1000000
---enable-experimental
-
-# Checkpoint conversion
---megatron-to-hf-mode bridge
-
-# LoRA
---lora-rank 32
---lora-alpha 32
---lora-dropout 0.0
---target-modules all-linear
-
-# Data and rollout
---prompt-data /data/gsm8k/train.parquet
---input-key messages
---label-key label
---apply-chat-template
---rollout-shuffle
---rm-type math
---num-rollout 100
---rollout-batch-size 4
---n-samples-per-prompt 8
---rollout-max-response-len 1536
---rollout-temperature 1
---global-batch-size 32
-
-# Evaluation
---eval-interval 10
---eval-prompt-data gsm8k /data/gsm8k/test.parquet
---n-samples-per-eval-prompt 2
---eval-max-response-len 3072
---eval-top-k 1
-
-# Parallelism and performance
---tensor-model-parallel-size 2
---sequence-parallel
---pipeline-model-parallel-size 2
---context-parallel-size 1
---expert-model-parallel-size 4
---expert-tensor-parallel-size 1
---recompute-granularity full
---recompute-method uniform
---recompute-num-layers 1
---use-dynamic-batch-size
---max-tokens-per-gpu 3072
-
-# GRPO
---advantage-estimator grpo
---kl-loss-coef 0.0
---kl-loss-type low_var_kl
---kl-coef 0.0
---entropy-coef 0.0
---eps-clip 0.2
---eps-clip-high 0.28
-
-# Optimizer
---optimizer adam
---lr 5e-6
---lr-decay-style constant
---weight-decay 0.1
---adam-beta1 0.9
---adam-beta2 0.98
-
-# Rollout serving
---rollout-num-gpus-per-engine 1
---sglang-mem-fraction-static 0.5
-
-# Training runtime
---attention-dropout 0.0
---hidden-dropout 0.0
---accumulate-allreduce-grads-in-fp32
---attention-softmax-in-fp32
---calculate-per-token-loss
---use-miles-router
---save-interval 25
-
-# Logging defaults
---wandb-project miles-modal
---wandb-group glm5-744b-a40b-20layer-lora
---disable-wandb-random-suffix
diff --git a/miles/recipes/glm5-744b-a40b-lora.args b/miles/recipes/qwen3-30b-a3b-experts-lora.args
similarity index 56%
rename from miles/recipes/glm5-744b-a40b-lora.args
rename to miles/recipes/qwen3-30b-a3b-experts-lora.args
index 81ebca5..39a25ec 100644
--- a/miles/recipes/glm5-744b-a40b-lora.args
+++ b/miles/recipes/qwen3-30b-a3b-experts-lora.args
@@ -1,44 +1,35 @@
-# Full GLM-5 starter recipe.
+# Qwen3-30B-A3B bridge-mode LoRA validation recipe.
+# Phase 2: widen the working baseline to include expert linear_fc1 / linear_fc2.
 
-# Model architecture from the upstream Miles GLM-5 model script.
---spec miles_plugins.models.glm5.glm5 get_glm5_spec
---moe-layer-freq "[0]*3+[1]*75"
---num-experts 256
---moe-shared-expert-intermediate-size 2048
---moe-router-topk 8
---moe-grouped-gemm
---moe-permute-fusion
---moe-ffn-hidden-size 2048
---moe-router-score-function sigmoid
---moe-router-pre-softmax
---moe-router-enable-expert-bias
---moe-router-bias-update-rate 0
---moe-router-load-balancing-type seq_aux_loss
---moe-router-topk-scaling-factor 2.5
---moe-aux-loss-coeff 0
---moe-router-dtype fp32
---make-vocab-size-divisible-by 16
---num-layers 78
---hidden-size 6144
---ffn-hidden-size 12288
---num-attention-heads 64
+# Model architecture from the upstream Miles Qwen3-30B-A3B model script.
 --disable-bias-linear
+--qk-layernorm
+--group-query-attention
+--num-attention-heads 32
+--num-query-groups 4
+--kv-channels 128
+--num-layers 48
+--hidden-size 2048
+--ffn-hidden-size 6144
+--normalization RMSNorm
+--position-embedding-type rope
+--norm-epsilon 1e-6
+--rotary-percent 1.0
 --swiglu
 --untie-embeddings-and-output-weights
---position-embedding-type rope
---no-position-embedding
---normalization RMSNorm
---qk-layernorm
---multi-latent-attention
---q-lora-rank 2048
---kv-lora-rank 512
---qk-head-dim 192
---v-head-dim 256
---kv-channels 192
---qk-pos-emb-head-dim 64
---vocab-size 154880
+--vocab-size 151936
 --rotary-base 1000000
---enable-experimental
+--moe-ffn-hidden-size 768
+--moe-router-score-function softmax
+--moe-token-dispatcher-type alltoall
+--moe-router-topk 8
+--moe-layer-freq "[1]*48"
+--num-experts 128
+--moe-grouped-gemm
+--moe-token-drop-policy probs
+--moe-router-dtype fp32
+--moe-permute-fusion
+--moe-aux-loss-coeff 0
 
 # Checkpoint conversion
 --megatron-to-hf-mode bridge
@@ -47,7 +38,7 @@
 --lora-rank 32
 --lora-alpha 32
 --lora-dropout 0.0
---target-modules all-linear
+--target-modules linear_qkv,linear_proj,linear_fc1,linear_fc2
 
 # Data and rollout
 --prompt-data /data/gsm8k/train.parquet
@@ -56,24 +47,24 @@
 --apply-chat-template
 --rollout-shuffle
 --rm-type math
---num-rollout 100
---rollout-batch-size 2
---n-samples-per-prompt 8
---rollout-max-response-len 2048
+--num-rollout 40
+--rollout-batch-size 4
+--n-samples-per-prompt 4
+--rollout-max-response-len 1024
 --rollout-temperature 1
 --global-batch-size 16
 
 # Evaluation
 --eval-interval 10
 --eval-prompt-data gsm8k /data/gsm8k/test.parquet
---n-samples-per-eval-prompt 2
---eval-max-response-len 4096
+--n-samples-per-eval-prompt 1
+--eval-max-response-len 1024
 --eval-top-k 1
 
 # Parallelism and performance
---tensor-model-parallel-size 2
+--tensor-model-parallel-size 4
 --sequence-parallel
---pipeline-model-parallel-size 4
+--pipeline-model-parallel-size 1
 --context-parallel-size 1
 --expert-model-parallel-size 8
 --expert-tensor-parallel-size 1
@@ -94,26 +85,31 @@
 
 # Optimizer
 --optimizer adam
---lr 5e-6
+--lr 1e-5
 --lr-decay-style constant
 --weight-decay 0.1
 --adam-beta1 0.9
 --adam-beta2 0.98
+--optimizer-cpu-offload
+--overlap-cpu-optimizer-d2h-h2d
+--use-precision-aware-optimizer
 
 # Rollout serving
---rollout-num-gpus-per-engine 1
---sglang-mem-fraction-static 0.45
+--rollout-num-gpus-per-engine 8
+--sglang-mem-fraction-static 0.7
+--sglang-cuda-graph-max-bs 256
 
 # Training runtime
 --attention-dropout 0.0
 --hidden-dropout 0.0
 --accumulate-allreduce-grads-in-fp32
 --attention-softmax-in-fp32
+--attention-backend flash
 --calculate-per-token-loss
 --use-miles-router
 --save-interval 25
 
 # Logging defaults
 --wandb-project miles-modal
---wandb-group glm5-744b-a40b-lora
+--wandb-group qwen3-30b-a3b-experts-lora
 --disable-wandb-random-suffix
diff --git a/miles/recipes/glm5-744b-a40b-4layer-lora.args b/miles/recipes/qwen3-30b-a3b-lora.args
similarity index 56%
rename from miles/recipes/glm5-744b-a40b-4layer-lora.args
rename to miles/recipes/qwen3-30b-a3b-lora.args
index 64ac703..d80d9b3 100644
--- a/miles/recipes/glm5-744b-a40b-4layer-lora.args
+++ b/miles/recipes/qwen3-30b-a3b-lora.args
@@ -1,44 +1,36 @@
-# GLM-5 4-layer testing recipe.
+# Qwen3-30B-A3B bridge-mode LoRA validation recipe.
+# Phase 1: confirm end-to-end Miles + Megatron-Bridge + SGLang support with
+# attention-only LoRA targets before widening into expert MLP modules.
 
-# Model architecture from the upstream Miles GLM-5 model script with the 4-layer override.
---spec miles_plugins.models.glm5.glm5 get_glm5_spec
---moe-layer-freq "[0]*3+[1]*1"
---num-experts 256
---moe-shared-expert-intermediate-size 2048
---moe-router-topk 8
---moe-grouped-gemm
---moe-permute-fusion
---moe-ffn-hidden-size 2048
---moe-router-score-function sigmoid
---moe-router-pre-softmax
---moe-router-enable-expert-bias
---moe-router-bias-update-rate 0
---moe-router-load-balancing-type seq_aux_loss
---moe-router-topk-scaling-factor 2.5
---moe-aux-loss-coeff 0
---moe-router-dtype fp32
---make-vocab-size-divisible-by 16
---num-layers 4
---hidden-size 6144
---ffn-hidden-size 12288
---num-attention-heads 64
+# Model architecture from the upstream Miles Qwen3-30B-A3B model script.
 --disable-bias-linear
+--qk-layernorm
+--group-query-attention
+--num-attention-heads 32
+--num-query-groups 4
+--kv-channels 128
+--num-layers 48
+--hidden-size 2048
+--ffn-hidden-size 6144
+--normalization RMSNorm
+--position-embedding-type rope
+--norm-epsilon 1e-6
+--rotary-percent 1.0
 --swiglu
 --untie-embeddings-and-output-weights
---position-embedding-type rope
---no-position-embedding
---normalization RMSNorm
---qk-layernorm
---multi-latent-attention
---q-lora-rank 2048
---kv-lora-rank 512
---qk-head-dim 192
---v-head-dim 256
---kv-channels 192
---qk-pos-emb-head-dim 64
---vocab-size 154880
+--vocab-size 151936
 --rotary-base 1000000
---enable-experimental
+--moe-ffn-hidden-size 768
+--moe-router-score-function softmax
+--moe-token-dispatcher-type alltoall
+--moe-router-topk 8
+--moe-layer-freq "[1]*48"
+--num-experts 128
+--moe-grouped-gemm
+--moe-token-drop-policy probs
+--moe-router-dtype fp32
+--moe-permute-fusion
+--moe-aux-loss-coeff 0
 
 # Checkpoint conversion
 --megatron-to-hf-mode bridge
@@ -47,7 +39,7 @@
 --lora-rank 32
 --lora-alpha 32
 --lora-dropout 0.0
---target-modules all-linear
+--target-modules linear_qkv,linear_proj
 
 # Data and rollout
 --prompt-data /data/gsm8k/train.parquet
@@ -56,32 +48,32 @@
 --apply-chat-template
 --rollout-shuffle
 --rm-type math
---num-rollout 100
+--num-rollout 40
 --rollout-batch-size 4
---n-samples-per-prompt 8
+--n-samples-per-prompt 4
 --rollout-max-response-len 1024
 --rollout-temperature 1
---global-batch-size 32
+--global-batch-size 16
 
 # Evaluation
 --eval-interval 10
 --eval-prompt-data gsm8k /data/gsm8k/test.parquet
---n-samples-per-eval-prompt 2
---eval-max-response-len 2048
+--n-samples-per-eval-prompt 1
+--eval-max-response-len 1024
 --eval-top-k 1
 
 # Parallelism and performance
---tensor-model-parallel-size 2
+--tensor-model-parallel-size 4
 --sequence-parallel
 --pipeline-model-parallel-size 1
 --context-parallel-size 1
---expert-model-parallel-size 4
+--expert-model-parallel-size 8
 --expert-tensor-parallel-size 1
 --recompute-granularity full
 --recompute-method uniform
 --recompute-num-layers 1
 --use-dynamic-batch-size
---max-tokens-per-gpu 3072
+--max-tokens-per-gpu 2048
 
 # GRPO
 --advantage-estimator grpo
@@ -94,26 +86,31 @@
 
 # Optimizer
 --optimizer adam
---lr 5e-6
+--lr 1e-5
 --lr-decay-style constant
 --weight-decay 0.1
 --adam-beta1 0.9
 --adam-beta2 0.98
+--optimizer-cpu-offload
+--overlap-cpu-optimizer-d2h-h2d
+--use-precision-aware-optimizer
 
 # Rollout serving
---rollout-num-gpus-per-engine 1
---sglang-mem-fraction-static 0.5
+--rollout-num-gpus-per-engine 8
+--sglang-mem-fraction-static 0.7
+--sglang-cuda-graph-max-bs 256
 
 # Training runtime
 --attention-dropout 0.0
 --hidden-dropout 0.0
 --accumulate-allreduce-grads-in-fp32
 --attention-softmax-in-fp32
+--attention-backend flash
 --calculate-per-token-loss
 --use-miles-router
 --save-interval 25
 
 # Logging defaults
 --wandb-project miles-modal
---wandb-group glm5-744b-a40b-4layer-lora
+--wandb-group qwen3-30b-a3b-lora
 --disable-wandb-random-suffix
diff --git a/miles/recipes/qwen25-0p5b-lora.args b/miles/recipes/tests/qwen25-0p5b-lora.args
similarity index 100%
rename from miles/recipes/qwen25-0p5b-lora.args
rename to miles/recipes/tests/qwen25-0p5b-lora.args
diff --git a/miles/recipes/tests/qwen3-30b-a3b-experts-fewstep.args b/miles/recipes/tests/qwen3-30b-a3b-experts-fewstep.args
new file mode 100644
index 0000000..d479fcb
--- /dev/null
+++ b/miles/recipes/tests/qwen3-30b-a3b-experts-fewstep.args
@@ -0,0 +1,110 @@
+# Qwen3-30B-A3B bridge-mode LoRA recipe tuned to get through a few RL steps.
+# This widens the working attention-only few-step shape to expert linear_fc1
+# and linear_fc2 targets while keeping rollout pressure trimmed.
+
+# Model architecture from the upstream Miles Qwen3-30B-A3B model script.
+--disable-bias-linear
+--qk-layernorm
+--group-query-attention
+--num-attention-heads 32
+--num-query-groups 4
+--kv-channels 128
+--num-layers 48
+--hidden-size 2048
+--ffn-hidden-size 6144
+--normalization RMSNorm
+--position-embedding-type rope
+--norm-epsilon 1e-6
+--rotary-percent 1.0
+--swiglu
+--untie-embeddings-and-output-weights
+--vocab-size 151936
+--rotary-base 1000000
+--moe-ffn-hidden-size 768
+--moe-router-score-function softmax
+--moe-token-dispatcher-type alltoall
+--moe-router-topk 8
+--moe-layer-freq "[1]*48"
+--num-experts 128
+--moe-grouped-gemm
+--moe-token-drop-policy probs
+--moe-router-dtype fp32
+--moe-permute-fusion
+--moe-aux-loss-coeff 0
+
+# Checkpoint conversion
+--megatron-to-hf-mode bridge
+
+# LoRA
+--lora-rank 32
+--lora-alpha 32
+--lora-dropout 0.0
+--target-modules linear_qkv,linear_proj,linear_fc1,linear_fc2
+
+# Data and rollout
+--prompt-data /data/gsm8k/train.parquet
+--input-key messages
+--label-key label
+--apply-chat-template
+--rollout-shuffle
+--rm-type math
+--num-rollout 8
+--rollout-batch-size 2
+--n-samples-per-prompt 2
+--rollout-max-response-len 512
+--rollout-temperature 1
+--global-batch-size 4
+
+# Parallelism and performance
+--tensor-model-parallel-size 4
+--sequence-parallel
+--pipeline-model-parallel-size 1
+--context-parallel-size 1
+--expert-model-parallel-size 8
+--expert-tensor-parallel-size 1
+--recompute-granularity full
+--recompute-method uniform
+--recompute-num-layers 1
+--use-dynamic-batch-size
+--max-tokens-per-gpu 1024
+
+# GRPO
+--advantage-estimator grpo
+--kl-loss-coef 0.0
+--kl-loss-type low_var_kl
+--kl-coef 0.0
+--entropy-coef 0.0
+--eps-clip 0.2
+--eps-clip-high 0.28
+
+# Optimizer
+--optimizer adam
+--lr 1e-5
+--lr-decay-style constant
+--weight-decay 0.1
+--adam-beta1 0.9
+--adam-beta2 0.98
+--optimizer-cpu-offload
+--overlap-cpu-optimizer-d2h-h2d
+--use-precision-aware-optimizer
+
+# Rollout serving
+--rollout-num-gpus-per-engine 8
+--sglang-mem-fraction-static 0.7
+--sglang-cuda-graph-max-bs 64
+--sglang-disable-cuda-graph
+
+# Training runtime
+--attention-dropout 0.0
+--hidden-dropout 0.0
+--accumulate-allreduce-grads-in-fp32
+--attention-softmax-in-fp32
+--attention-backend flash
+--calculate-per-token-loss
+--use-miles-router
+--save-interval 1000
+
+# Logging defaults
+--wandb-project miles-modal
+--wandb-group qwen3-30b-a3b-experts-fewstep
+--disable-wandb-random-suffix
diff --git a/miles/recipes/tests/qwen3-30b-a3b-lora-fewstep.args b/miles/recipes/tests/qwen3-30b-a3b-lora-fewstep.args
new file mode 100644
index 0000000..dda4790
--- /dev/null
+++ b/miles/recipes/tests/qwen3-30b-a3b-lora-fewstep.args
@@ -0,0 +1,110 @@
+# Qwen3-30B-A3B bridge-mode LoRA recipe tuned to get through a few RL steps.
+# This keeps attention-only LoRA targets but trims rollout pressure and disables
+# eval so the run can reach training sooner.
+
+# Model architecture from the upstream Miles Qwen3-30B-A3B model script.
+--disable-bias-linear
+--qk-layernorm
+--group-query-attention
+--num-attention-heads 32
+--num-query-groups 4
+--kv-channels 128
+--num-layers 48
+--hidden-size 2048
+--ffn-hidden-size 6144
+--normalization RMSNorm
+--position-embedding-type rope
+--norm-epsilon 1e-6
+--rotary-percent 1.0
+--swiglu
+--untie-embeddings-and-output-weights
+--vocab-size 151936
+--rotary-base 1000000
+--moe-ffn-hidden-size 768
+--moe-router-score-function softmax
+--moe-token-dispatcher-type alltoall
+--moe-router-topk 8
+--moe-layer-freq "[1]*48"
+--num-experts 128
+--moe-grouped-gemm
+--moe-token-drop-policy probs
+--moe-router-dtype fp32
+--moe-permute-fusion
+--moe-aux-loss-coeff 0
+
+# Checkpoint conversion
+--megatron-to-hf-mode bridge
+
+# LoRA
+--lora-rank 32
+--lora-alpha 32
+--lora-dropout 0.0
+--target-modules linear_qkv,linear_proj
+
+# Data and rollout
+--prompt-data /data/gsm8k/train.parquet
+--input-key messages
+--label-key label
+--apply-chat-template
+--rollout-shuffle
+--rm-type math
+--num-rollout 8
+--rollout-batch-size 2
+--n-samples-per-prompt 2
+--rollout-max-response-len 512
+--rollout-temperature 1
+--global-batch-size 4
+
+# Parallelism and performance
+--tensor-model-parallel-size 4
+--sequence-parallel
+--pipeline-model-parallel-size 1
+--context-parallel-size 1
+--expert-model-parallel-size 8
+--expert-tensor-parallel-size 1
+--recompute-granularity full
+--recompute-method uniform
+--recompute-num-layers 1
+--use-dynamic-batch-size
+--max-tokens-per-gpu 1024
+
+# GRPO
+--advantage-estimator grpo
+--kl-loss-coef 0.0
+--kl-loss-type low_var_kl
+--kl-coef 0.0
+--entropy-coef 0.0
+--eps-clip 0.2
+--eps-clip-high 0.28
+
+# Optimizer
+--optimizer adam
+--lr 1e-5
+--lr-decay-style constant
+--weight-decay 0.1
+--adam-beta1 0.9
+--adam-beta2 0.98
+--optimizer-cpu-offload
+--overlap-cpu-optimizer-d2h-h2d
+--use-precision-aware-optimizer
+
+# Rollout serving
+--rollout-num-gpus-per-engine 8
+--sglang-mem-fraction-static 0.7
+--sglang-cuda-graph-max-bs 64
+--sglang-disable-cuda-graph
+
+# Training runtime
+--attention-dropout 0.0
+--hidden-dropout 0.0
+--accumulate-allreduce-grads-in-fp32
+--attention-softmax-in-fp32
+--attention-backend flash
+--calculate-per-token-loss
+--use-miles-router
+--save-interval 1000
+
+# Logging defaults
+--wandb-project miles-modal
+--wandb-group qwen3-30b-a3b-lora-fewstep
+--disable-wandb-random-suffix
diff --git a/miles/recipes/tests/qwen3-30b-a3b-lora-greedy-debug.args b/miles/recipes/tests/qwen3-30b-a3b-lora-greedy-debug.args
new file mode 100644
index 0000000..351f4ed
--- /dev/null
+++ b/miles/recipes/tests/qwen3-30b-a3b-lora-greedy-debug.args
@@ -0,0 +1,111 @@
+# Qwen3-30B-A3B bridge-mode LoRA recipe narrowed for faster Modal debugging.
+# This keeps attention-only LoRA targets, trims rollout pressure, and uses
+# deterministic top-k=1 decoding so the next run quickly isolates rollout bugs.
+
+# Model architecture from the upstream Miles Qwen3-30B-A3B model script.
+--disable-bias-linear
+--qk-layernorm
+--group-query-attention
+--num-attention-heads 32
+--num-query-groups 4
+--kv-channels 128
+--num-layers 48
+--hidden-size 2048
+--ffn-hidden-size 6144
+--normalization RMSNorm
+--position-embedding-type rope
+--norm-epsilon 1e-6
+--rotary-percent 1.0
+--swiglu
+--untie-embeddings-and-output-weights
+--vocab-size 151936
+--rotary-base 1000000
+--moe-ffn-hidden-size 768
+--moe-router-score-function softmax
+--moe-token-dispatcher-type alltoall
+--moe-router-topk 8
+--moe-layer-freq "[1]*48"
+--num-experts 128
+--moe-grouped-gemm
+--moe-token-drop-policy probs
+--moe-router-dtype fp32
+--moe-permute-fusion
+--moe-aux-loss-coeff 0
+
+# Checkpoint conversion
+--megatron-to-hf-mode bridge
+
+# LoRA
+--lora-rank 32
+--lora-alpha 32
+--lora-dropout 0.0
+--target-modules linear_qkv,linear_proj
+
+# Data and rollout
+--prompt-data /data/gsm8k/train.parquet
+--input-key messages
+--label-key label
+--apply-chat-template
+--rollout-shuffle
+--rm-type math
+--num-rollout 2
+--rollout-batch-size 1
+--n-samples-per-prompt 2
+--rollout-max-response-len 128
+--rollout-temperature 1
+--rollout-top-k 1
+--global-batch-size 2
+
+# Parallelism and performance
+--tensor-model-parallel-size 4
+--sequence-parallel
+--pipeline-model-parallel-size 1
+--context-parallel-size 1
+--expert-model-parallel-size 8
+--expert-tensor-parallel-size 1
+--recompute-granularity full
+--recompute-method uniform
+--recompute-num-layers 1
+--use-dynamic-batch-size
+--max-tokens-per-gpu 768
+
+# GRPO
+--advantage-estimator grpo
+--kl-loss-coef 0.0
+--kl-loss-type low_var_kl
+--kl-coef 0.0
+--entropy-coef 0.0
+--eps-clip 0.2
+--eps-clip-high 0.28
+
+# Optimizer
+--optimizer adam
+--lr 1e-5
+--lr-decay-style constant
+--weight-decay 0.1
+--adam-beta1 0.9
+--adam-beta2 0.98
+--optimizer-cpu-offload
+--overlap-cpu-optimizer-d2h-h2d
+--use-precision-aware-optimizer
+
+# Rollout serving
+--rollout-num-gpus-per-engine 8
+--sglang-mem-fraction-static 0.6
+--sglang-cuda-graph-max-bs 16
+--sglang-disable-cuda-graph
+
+# Training runtime
+--attention-dropout 0.0
+--hidden-dropout 0.0
+--accumulate-allreduce-grads-in-fp32
+--attention-softmax-in-fp32
+--attention-backend flash
+--calculate-per-token-loss
+--use-miles-router
+--save-interval 1000
+
+# Logging defaults
+--wandb-project miles-modal
+--wandb-group qwen3-30b-a3b-lora-greedy-debug
+--disable-wandb-random-suffix

From 4ad8c8f68c371cad51bfcde83fe447994e7694c0 Mon Sep 17 00:00:00 2001
From: Jason Mancuso <7891333+jvmncs@users.noreply.github.com>
Date: Thu, 26 Mar 2026 23:18:00 -0400
Subject: [PATCH 3/5] Align Qwen3 LoRA defaults with thinky method

---
 miles/README.md                               | 63 ++++++++++---------
 miles/modal_train.py                          | 10 +--
 miles/recipes/qwen3-30b-a3b-lora.args         |  8 +--
 .../tests/qwen3-30b-a3b-lora-fewstep.args     |  6 +-
 .../qwen3-30b-a3b-lora-greedy-debug.args      |  4 +-
 5 files changed, 49 insertions(+), 42 deletions(-)

diff --git a/miles/README.md b/miles/README.md
index 9d7c67c..4902222 100644
--- a/miles/README.md
+++ b/miles/README.md
@@ -24,16 +24,20 @@ Current recipes:
 
 - `qwen25-0p5b-lora`: single-node smoke test adapted from the upstream Miles
   LoRA example.
-- `qwen3-30b-a3b-lora`: first-pass Qwen3-30B-A3B bridge-mode LoRA validation
-  recipe, restricted to attention targets (`linear_qkv`, `linear_proj`).
-- `qwen3-30b-a3b-lora-fewstep`: trimmed attention-only recipe that is intended
-  to prove a few full RL updates on Modal.
-- `qwen3-30b-a3b-experts-lora`: second-pass Qwen3-30B-A3B recipe widened to
-  expert `linear_fc1` and `linear_fc2` targets after the baseline path works.
-- `qwen3-30b-a3b-experts-fewstep`: trimmed expert-target recipe built from the
-  working few-step shape.
+- `qwen3-30b-a3b-lora`: default Qwen3-30B-A3B all-layer recipe, targeting
+  attention plus MLP/MoE layers (`linear_qkv`, `linear_proj`, `linear_fc1`,
+  `linear_fc2`).
+- `qwen3-30b-a3b-lora-fewstep`: trimmed all-layer recipe that is intended to
+  prove a few full RL updates on Modal.
+- `qwen3-30b-a3b-experts-lora`: explicit all-layer alias that makes the expert
+  `linear_fc1` / `linear_fc2` targeting obvious in the name.
+- `qwen3-30b-a3b-experts-fewstep`: trimmed explicit all-layer alias built from
+  the working few-step shape.
 
 Testing/debug recipe files live under [`recipes/tests/`](./recipes/tests).
+The only remaining attention-only recipe is
+[`qwen3-30b-a3b-lora-greedy-debug`](./recipes/tests/qwen3-30b-a3b-lora-greedy-debug.args),
+kept as a diagnostic control rather than a recommended training setup.
 
 ## Prepare assets
 
@@ -66,7 +70,7 @@ Qwen3-30B-A3B baseline LoRA validation:
 MILES_N_NODES=1 modal run miles/modal_train.py --recipe qwen3-30b-a3b-lora
 ```
 
-Qwen3-30B-A3B few-step attention-only validation:
+Qwen3-30B-A3B few-step all-layer validation:
 
 ```bash
 MILES_N_NODES=1 modal run miles/modal_train.py --recipe qwen3-30b-a3b-lora-fewstep
@@ -89,6 +93,16 @@ MILES_N_NODES=1 modal run miles/modal_train.py --recipe qwen3-30b-a3b-experts-fe
 - Start with standard LoRA, not DoRA. Miles' current rollout sync and adapter
   filtering are LoRA-specific and keyed off `lora_A` / `lora_B` names, so DoRA
   is not the first validation target.
+- The default Qwen3 recipes now follow the recommendations from Thinking
+  Machines' “LoRA Without Regret”: keep the standard `alpha=32` with `1/r`
+  scaling, use a LoRA learning rate around 10x the full fine-tuning (FullFT)
+  baseline (`1e-5` here vs. the upstream Qwen3-30B-A3B FullFT `1e-6`), and
+  include the MLP/MoE layers rather than using attention-only LoRA.
+- One MoE-specific nuance from the article is not exposed cleanly by the current
+  Miles recipe surface: their Qwen3 MoE experiments divide the per-expert LoRA
+  rank by the number of active experts. Our recipes currently use a uniform
+  `--lora-rank 32` across all targeted modules because Miles exposes one global
+  LoRA rank, not per-module or per-expert ranks.
 - The baked Qwen3 recipes are single-node `H100:8` shapes. They are intended to
   validate end-to-end bridge-mode LoRA with colocated rollout first, not to
   exhaustively cover every parallelism combination.
@@ -99,8 +113,8 @@ MILES_N_NODES=1 modal run miles/modal_train.py --recipe qwen3-30b-a3b-experts-fe
 - Miles currently supports LoRA weight sync only for colocated rollout engines.
   Distributed non-colocated rollout sync is not yet implemented for LoRA.
 - The baseline Qwen3 recipe stays close to the upstream Miles single-node
-  Qwen3-30B-A3B shape. The expert-target recipe is a follow-on experiment, not
-  the initial correctness target.
+  Qwen3-30B-A3B shape while using all-layer LoRA. The explicit expert-target
+  recipe names are kept mainly for clarity and backwards compatibility.
 
 ## Observed On Modal
 
@@ -117,21 +131,14 @@ The current wrapper includes runtime patches in
 
 What the Modal runs have validated so far on `modal-labs`:
 
-- `qwen3-30b-a3b-lora` gets through bridge-mode LoRA creation and attention
-  module injection (`linear_qkv`, `linear_proj`), and it can start loading the
-  Hugging Face checkpoint into Megatron.
-- `qwen3-30b-a3b-lora-fewstep` now gets through full RL training on Modal. In
-  recent runs it passed rollout, weight sync, and actor training repeatedly and
-  reached at least `train/step` 6 on a single-node `H100:8` shape.
-- `qwen3-30b-a3b-experts-lora` goes further: it creates LoRA with
-  `linear_qkv`, `linear_proj`, `linear_fc1`, and `linear_fc2`, injects those
-  expert targets under `decoder.layers.*.mlp.experts.*`, loads weights, pushes
-  the adapter into SGLang, and starts `Eval gsm8k`.
-- `qwen3-30b-a3b-experts-fewstep` has validated the widened target surface on
-  Modal: Miles creates LoRA with `linear_fc1` / `linear_fc2`, injects those
-  expert modules, completes weight sync, and reaches rollout collection plus
-  actor training. A detached confirmation of a full expert-target train step is
-  still in progress.
+- The all-layer Qwen3-30B-A3B LoRA shape now has runtime validation on Modal.
+  In recent detached runs of the few-step recipe shape, Miles created LoRA with
+  `linear_qkv`, `linear_proj`, `linear_fc1`, and `linear_fc2`, injected expert
+  modules under `decoder.layers.*.mlp.experts.*`, completed rollout collection,
+  and reached at least `train/step` 1 on a single-node `H100:8` shape.
+- The attention-only debug/control recipe also works, but it is no longer the
+  recommended configuration after comparing against the Thinking Machines
+  guidance.
 - The remaining instability has been in the colocated SGLang rollout path, not
   in LoRA target discovery. The main concrete runtime failures we hit were:
   non-finite logprobs breaking HTTP JSON serialization, and invalid sampling
@@ -141,8 +148,8 @@ Current interpretation:
 
 - Qwen3-30B-A3B MoE LoRA support in Miles is real enough to instantiate,
   target, load, and export adapters for both attention and expert MLP layers.
-- Attention-only Qwen3-30B-A3B LoRA is now runtime-validated for repeated RL
-  updates on `modal-labs`.
+- Attention-only Qwen3-30B-A3B LoRA is still runtime-validated as a debug
+  control on `modal-labs`, but it is no longer the recommended default.
 - The remaining risk is concentrated in the colocated SGLang rollout lifecycle,
   which is coupled to `offload_rollout` / `enable_memory_saver=True` in the
   current Miles SGLang engine setup, especially once expert-target LoRA is
diff --git a/miles/modal_train.py b/miles/modal_train.py
index 5477a33..9384fbb 100644
--- a/miles/modal_train.py
+++ b/miles/modal_train.py
@@ -70,7 +70,7 @@ class Recipe:
     ),
     "qwen3-30b-a3b-lora": Recipe(
         name="qwen3-30b-a3b-lora",
-        description="Single-node Qwen3-30B-A3B bridge-mode LoRA validation recipe.",
+        description="Single-node Qwen3-30B-A3B all-layer bridge-mode LoRA recipe aligned with current best practices.",
         model_id="Qwen/Qwen3-30B-A3B",
         args_file="qwen3-30b-a3b-lora.args",
         recommended_nodes=1,
@@ -78,7 +78,7 @@ class Recipe:
     ),
     "qwen3-30b-a3b-lora-fewstep": Recipe(
         name="qwen3-30b-a3b-lora-fewstep",
-        description="Single-node Qwen3-30B-A3B attention-only LoRA recipe trimmed to chase a few full RL steps.",
+        description="Single-node Qwen3-30B-A3B all-layer LoRA recipe trimmed to chase a few full RL steps.",
         model_id="Qwen/Qwen3-30B-A3B",
         args_file="tests/qwen3-30b-a3b-lora-fewstep.args",
         recommended_nodes=1,
@@ -86,7 +86,7 @@ class Recipe:
     ),
     "qwen3-30b-a3b-lora-greedy-debug": Recipe(
         name="qwen3-30b-a3b-lora-greedy-debug",
-        description="Single-node Qwen3-30B-A3B attention-only LoRA debug recipe with greedy rollout to validate LoRA sync.",
+        description="Single-node Qwen3-30B-A3B attention-only debug/control recipe with greedy rollout.",
         model_id="Qwen/Qwen3-30B-A3B",
         args_file="tests/qwen3-30b-a3b-lora-greedy-debug.args",
         recommended_nodes=1,
@@ -94,7 +94,7 @@ class Recipe:
     ),
     "qwen3-30b-a3b-experts-lora": Recipe(
         name="qwen3-30b-a3b-experts-lora",
-        description="Second-phase Qwen3-30B-A3B recipe widened to expert linear_fc1/fc2 targets.",
+        description="Explicit all-layer Qwen3-30B-A3B LoRA recipe including expert linear_fc1/fc2 targets.",
         model_id="Qwen/Qwen3-30B-A3B",
         args_file="qwen3-30b-a3b-experts-lora.args",
         recommended_nodes=1,
@@ -102,7 +102,7 @@ class Recipe:
     ),
     "qwen3-30b-a3b-experts-fewstep": Recipe(
         name="qwen3-30b-a3b-experts-fewstep",
-        description="Single-node Qwen3-30B-A3B expert-target LoRA recipe trimmed to chase a few RL steps.",
+        description="Explicit all-layer Qwen3-30B-A3B few-step recipe including expert linear_fc1/fc2 targets.",
         model_id="Qwen/Qwen3-30B-A3B",
         args_file="tests/qwen3-30b-a3b-experts-fewstep.args",
         recommended_nodes=1,
diff --git a/miles/recipes/qwen3-30b-a3b-lora.args b/miles/recipes/qwen3-30b-a3b-lora.args
index d80d9b3..a208cee 100644
--- a/miles/recipes/qwen3-30b-a3b-lora.args
+++ b/miles/recipes/qwen3-30b-a3b-lora.args
@@ -1,6 +1,6 @@
-# Qwen3-30B-A3B bridge-mode LoRA validation recipe.
-# Phase 1: confirm end-to-end Miles + Megatron-Bridge + SGLang support with
-# attention-only LoRA targets before widening into expert MLP modules.
+# Qwen3-30B-A3B bridge-mode LoRA recipe aligned with the
+# "LoRA Without Regret" guidance from Thinking Machines:
+# apply LoRA to the attention and MLP/MoE linear layers, not attention only.
 
 # Model architecture from the upstream Miles Qwen3-30B-A3B model script.
 --disable-bias-linear
@@ -39,7 +39,7 @@
 --lora-rank 32
 --lora-alpha 32
 --lora-dropout 0.0
---target-modules linear_qkv,linear_proj
+--target-modules linear_qkv,linear_proj,linear_fc1,linear_fc2
 
 # Data and rollout
 --prompt-data /data/gsm8k/train.parquet
diff --git a/miles/recipes/tests/qwen3-30b-a3b-lora-fewstep.args b/miles/recipes/tests/qwen3-30b-a3b-lora-fewstep.args
index dda4790..75d749b 100644
--- a/miles/recipes/tests/qwen3-30b-a3b-lora-fewstep.args
+++ b/miles/recipes/tests/qwen3-30b-a3b-lora-fewstep.args
@@ -1,6 +1,6 @@
 # Qwen3-30B-A3B bridge-mode LoRA recipe tuned to get through a few RL steps.
-# This keeps attention-only LoRA targets but trims rollout pressure and disables
-# eval so the run can reach training sooner.
+# This keeps the smaller rollout shape from validation, but uses the
+# article-aligned all-layer LoRA target set rather than attention only.
 
 # Model architecture from the upstream Miles Qwen3-30B-A3B model script.
 --disable-bias-linear
@@ -39,7 +39,7 @@
 --lora-rank 32
 --lora-alpha 32
 --lora-dropout 0.0
---target-modules linear_qkv,linear_proj
+--target-modules linear_qkv,linear_proj,linear_fc1,linear_fc2
 
 # Data and rollout
 --prompt-data /data/gsm8k/train.parquet
diff --git a/miles/recipes/tests/qwen3-30b-a3b-lora-greedy-debug.args b/miles/recipes/tests/qwen3-30b-a3b-lora-greedy-debug.args
index 351f4ed..36972be 100644
--- a/miles/recipes/tests/qwen3-30b-a3b-lora-greedy-debug.args
+++ b/miles/recipes/tests/qwen3-30b-a3b-lora-greedy-debug.args
@@ -1,6 +1,6 @@
 # Qwen3-30B-A3B bridge-mode LoRA recipe narrowed for faster Modal debugging.
-# This keeps attention-only LoRA targets, trims rollout pressure, and uses
-# deterministic top-k=1 decoding so the next run quickly isolates rollout bugs.
+# This intentionally keeps attention-only LoRA as a diagnostic control, trims
+# rollout pressure, and uses deterministic top-k=1 decoding to isolate bugs.
 
 # Model architecture from the upstream Miles Qwen3-30B-A3B model script.
 --disable-bias-linear

From 491525918f4782dd92423a7d182fd2abb255ad65 Mon Sep 17 00:00:00 2001
From: Jason Mancuso <7891333+jvmncs@users.noreply.github.com>
Date: Fri, 27 Mar 2026 12:44:10 -0400
Subject: [PATCH 4/5] Validate non-colocated Qwen3 LoRA on Modal

---
 miles/README.md                      |  15 ++
 miles/modal_patches/sitecustomize.py | 208 +++++++++++++++++++++++++++
 miles/modal_train.py                 | 102 ++++++++++++-
 3 files changed, 322 insertions(+), 3 deletions(-)

diff --git a/miles/README.md b/miles/README.md
index 4902222..26bd650 100644
--- a/miles/README.md
+++ b/miles/README.md
@@ -76,6 +76,12 @@ Qwen3-30B-A3B few-step all-layer validation:
 MILES_N_NODES=1 modal run miles/modal_train.py --recipe qwen3-30b-a3b-lora-fewstep
 ```
 
+Qwen3-30B-A3B few-step all-layer validation in non-colocated mode:
+
+```bash
+MILES_N_NODES=2 modal run miles/modal_train.py --recipe qwen3-30b-a3b-lora-fewstep --no-colocate --actor-nodes 1 --allow-cluster-mismatch
+```
+
 Qwen3-30B-A3B expert-target LoRA follow-up:
 
 ```bash
@@ -106,6 +112,10 @@ MILES_N_NODES=1 modal run miles/modal_train.py --recipe qwen3-30b-a3b-experts-fe
 - The baked Qwen3 recipes are single-node `H100:8` shapes. They are intended to
   validate end-to-end bridge-mode LoRA with colocated rollout first, not to
   exhaustively cover every parallelism combination.
+- The Modal wrapper now also supports a non-colocated split for experimentation:
+  total cluster size still comes from `MILES_N_NODES`, while `--actor-nodes`
+  controls how many of those nodes are reserved for Megatron training and the
+  remaining GPUs default to rollout.
 - Source inspection suggests the training path should handle TP / PP / EP / CP
   because the bridge setup forwards all of those settings into Megatron-Bridge,
   and Megatron-Bridge's PEFT tests cover pipeline-style model chunk lists. That
@@ -136,6 +146,11 @@ What the Modal runs have validated so far on `modal-labs`:
   `linear_qkv`, `linear_proj`, `linear_fc1`, and `linear_fc2`, injected expert
   modules under `decoder.layers.*.mlp.experts.*`, completed rollout collection,
   and reached at least `train/step` 1 on a single-node `H100:8` shape.
+- The same all-layer few-step recipe now also has non-colocated runtime
+  validation on `modal-labs` with a 2-node split: 1 node for actor training and
+  1 node for rollout (`--no-colocate --actor-nodes 1`). After forcing the Miles
+  router to advertise the cluster head IP and patching distributed LoRA sync to
+  export bridge adapters directly, the run reached at least `train/step` 2.
 - The attention-only debug/control recipe also works, but it is no longer the
   recommended configuration after comparing against the Thinking Machines
   guidance.
diff --git a/miles/modal_patches/sitecustomize.py b/miles/modal_patches/sitecustomize.py
index 0662644..0b47f4b 100644
--- a/miles/modal_patches/sitecustomize.py
+++ b/miles/modal_patches/sitecustomize.py
@@ -408,8 +408,216 @@ def sampling_from_probs_torch(
     _log("[miles-modal] patched SGLang sampling to sanitize invalid probability rows")
 
 
+def _patch_distributed_lora_sync() -> None:  # Monkey-patches UpdateWeightFromDistributed.__init__/update_weights for LoRA sync.
+    try:
+        import base64
+        import io
+        import torch
+        from miles.backends.megatron_utils.lora_utils import (
+            LORA_ADAPTER_NAME,
+            build_lora_sync_config,
+            is_lora_weight_name,
+        )
+        from miles.backends.megatron_utils.update_weight import (
+            update_weight_from_distributed as distributed_mod,
+            update_weight_from_tensor as tensor_mod,
+        )
+        from miles.backends.megatron_utils.update_weight.hf_weight_iterator_base import (
+            HfWeightIteratorBase,
+        )
+    except Exception as exc:  # best-effort: any import failure is logged and the patch is skipped
+        _log(
+            "[miles-modal] distributed LoRA sync patch unavailable: "
+            f"{type(exc).__name__}: {exc}"
+        )
+        return
+
+    UpdateWeightFromDistributed = getattr(
+        distributed_mod, "UpdateWeightFromDistributed", None
+    )
+    if UpdateWeightFromDistributed is None:
+        _log("[miles-modal] distributed LoRA sync patch missing updater class")
+        return
+
+    original_init = getattr(UpdateWeightFromDistributed, "__init__", None)
+    original_update_weights = getattr(UpdateWeightFromDistributed, "update_weights", None)
+    if original_init is None or original_update_weights is None:
+        _log("[miles-modal] distributed LoRA sync patch missing target methods")
+        return
+
+    if getattr(original_update_weights, "__module__", "") == __name__:  # replacement lives in this module => already installed
+        _log("[miles-modal] distributed LoRA sync patch already present")
+        return
+
+    ray = distributed_mod.ray  # borrow helpers from the upstream modules' own namespaces
+    dist = distributed_mod.dist
+    get_gloo_group = distributed_mod.get_gloo_group
+    post_process_weights = distributed_mod.post_process_weights
+    FlattenedTensorBucket = tensor_mod.FlattenedTensorBucket
+    MultiprocessingSerializer = tensor_mod.MultiprocessingSerializer
+    check_results = tensor_mod._check_weight_sync_results
+
+    def _serialize_lora_named_tensors(named_tensors):  # -> serialized dict: base64 torch.save blob + per-tensor metadata
+        flattened_tensor_bucket = FlattenedTensorBucket(named_tensors=named_tensors)
+        flattened_tensor = flattened_tensor_bucket.get_flattened_tensor()
+        if not isinstance(flattened_tensor, torch.Tensor):
+            raise TypeError(
+                "Expected LoRA flattened tensor to be a torch.Tensor, got "
+                f"{type(flattened_tensor).__name__}"
+            )
+        if flattened_tensor.is_cuda:  # move to host memory before torch.save
+            flattened_tensor = flattened_tensor.detach().cpu()
+        buffer = io.BytesIO()
+        torch.save(flattened_tensor.contiguous(), buffer)
+        flattened_tensor_data = {
+            "_miles_modal_format": "torch_save_flattened_lora_v2",
+            "flattened_tensor_torch_save_b64": base64.b64encode(buffer.getvalue()).decode(
+                "ascii"
+            ),
+            "metadata": [
+                {
+                    "name": meta.name,
+                    "shape": list(meta.shape),
+                    "dtype": str(meta.dtype).removeprefix("torch."),
+                    "start_idx": meta.start_idx,
+                    "end_idx": meta.end_idx,
+                    "numel": meta.numel,
+                }
+                for meta in flattened_tensor_bucket.get_metadata()
+            ],
+        }
+        return MultiprocessingSerializer.serialize(flattened_tensor_data, output_str=True)
+
+    def __init__(  # delegates to original_init, then stores the extra state update_weights reads
+        self,
+        args,
+        model,
+        weights_getter,
+        *,
+        model_name,
+        quantization_config,
+        is_lora=False,
+    ):
+        original_init(
+            self,
+            args,
+            model,
+            weights_getter,
+            model_name=model_name,
+            quantization_config=quantization_config,
+            is_lora=is_lora,
+        )
+        self.weights_getter = weights_getter  # kept on self: update_weights below reads both attributes
+        self.is_lora = is_lora
+        self._lora_loaded = False  # flips to True after the first successful adapter load
+        if self.is_lora:
+            self._hf_weight_iterator = HfWeightIteratorBase.create(
+                args=args,
+                model=model,
+                model_name=model_name,
+                quantization_config=quantization_config,
+                is_lora=True,
+            )
+            self._lora_config = build_lora_sync_config(args)
+        else:
+            self._hf_weight_iterator = None
+            self._lora_config = None
+
+    @torch.no_grad()
+    def update_weights(self):  # LoRA path: pause engines, export adapter tensors, reload them on each rollout engine
+        if not getattr(self, "is_lora", False):  # non-LoRA updaters fall through to the upstream implementation
+            return original_update_weights(self)
+
+        self.weight_version += 1
+        rank = dist.get_rank()
+
+        if rank == 0:  # only rank 0 drives pause/flush on the rollout engines
+            ray.get([engine.pause_generation.remote() for engine in self.rollout_engines])
+            ray.get([engine.flush_cache.remote() for engine in self.rollout_engines])
+            if self.quantization_config and self.quantization_config["quant_method"] in [
+                "compressed-tensors"
+            ]:
+                post_process_weights(
+                    restore_weights_before_load=True,
+                    post_process_quantization=False,
+                    rollout_engines=self.rollout_engines,
+                )
+        dist.barrier(group=get_gloo_group())  # hold the other ranks until rank 0 has finished the pause step
+
+        megatron_local_weights = self.weights_getter() if self.weights_getter else {}
+        all_lora_named_tensors = []
+        sync_chunk_count = 0
+
+        for hf_named_tensors in self._hf_weight_iterator.get_hf_weight_chunks(
+            megatron_local_weights
+        ):
+            lora_named_tensors = [
+                (name, tensor)
+                for name, tensor in hf_named_tensors
+                if is_lora_weight_name(name)
+            ]
+            if not lora_named_tensors:  # chunk carried no LoRA weights
+                continue
+
+            sync_chunk_count += 1
+            if self._is_pp_src_rank:  # only this rank keeps CPU copies; other ranks just drive the iterator
+                all_lora_named_tensors.extend(
+                    (name, tensor.detach().cpu())
+                    for name, tensor in lora_named_tensors
+                )
+
+        if self._is_pp_src_rank:
+            if sync_chunk_count == 0:  # NOTE(review): raises before the barrier below; confirm other ranks cannot hang on it
+                raise RuntimeError(
+                    "Distributed LoRA weight sync failed: bridge export produced zero "
+                    "LoRA chunks."
+                )
+
+            serialized_tensors = _serialize_lora_named_tensors(all_lora_named_tensors)
+            if self._lora_loaded:  # an earlier sync loaded the adapter: unload it before loading the new one
+                ray.get(
+                    [
+                        engine.unload_lora_adapter.remote(lora_name=LORA_ADAPTER_NAME)
+                        for engine in self.rollout_engines
+                    ]
+                )
+
+            results = ray.get(
+                [
+                    engine.load_lora_adapter_from_tensors.remote(
+                        lora_name=LORA_ADAPTER_NAME,
+                        config_dict=self._lora_config,
+                        serialized_tensors=serialized_tensors,
+                        load_format="flattened_bucket",
+                    )
+                    for engine in self.rollout_engines
+                ]
+            )
+            check_results(results, is_lora=True)
+            self._lora_loaded = True
+
+        dist.barrier(group=get_gloo_group())  # adapter load is done before rank 0 resumes generation
+        if rank == 0:
+            if self.quantization_config and self.quantization_config["quant_method"] in [
+                "compressed-tensors",
+                "mxfp8",  # NOTE(review): absent from the pre-load list above; confirm the asymmetry is intended
+            ]:
+                post_process_weights(
+                    restore_weights_before_load=False,
+                    post_process_quantization=True,
+                    rollout_engines=self.rollout_engines,
+                )
+            ray.get([engine.continue_generation.remote() for engine in self.rollout_engines])
+        dist.barrier(group=get_gloo_group())
+
+    UpdateWeightFromDistributed.__init__ = __init__
+    UpdateWeightFromDistributed.update_weights = update_weights
+    _log("[miles-modal] patched distributed LoRA sync to export bridge adapters directly")
+
+
 _register_linear_cross_entropy_module()
 _patch_lora_cpu_serialization()
 _patch_sglang_lora_numpy_rehydration()
 _patch_sglang_logprob_sanitization()
 _patch_sglang_sampling_probability_sanitization()
+_patch_distributed_lora_sync()
diff --git a/miles/modal_train.py b/miles/modal_train.py
index 9384fbb..70b35b6 100644
--- a/miles/modal_train.py
+++ b/miles/modal_train.py
@@ -150,11 +150,17 @@ def _build_enforced_args(
     *,
     model_path: str,
     cluster_nodes: int,
+    actor_nodes: int,
     gpus_per_node: int,
     checkpoint_dir: pathlib.Path,
     custom_config_path: Optional[str],
     wandb_key: Optional[str],
+    colocate: bool,
+    rollout_num_gpus: Optional[int],
 ) -> list[str]:
+    if actor_nodes < 1:
+        raise ValueError(f"actor_nodes must be >= 1, got {actor_nodes}")
+
     args = [
         "--train-backend",
         "megatron",
@@ -165,13 +171,20 @@ def _build_enforced_args(
         "--save",
         checkpoint_dir.as_posix(),
         "--actor-num-nodes",
-        str(cluster_nodes),
+        str(actor_nodes),
         "--actor-num-gpus-per-node",
         str(gpus_per_node),
         "--num-gpus-per-node",
         str(gpus_per_node),
-        "--colocate",
     ]
+    if colocate:
+        args.append("--colocate")
+    else:
+        if rollout_num_gpus is None or rollout_num_gpus < 1:
+            raise ValueError(
+                "rollout_num_gpus must be >= 1 when launching non-colocated rollout."
+            )
+        args.extend(["--rollout-num-gpus", str(rollout_num_gpus)])
     if custom_config_path:
         args.extend(["--custom-config-path", custom_config_path])
     if wandb_key:
@@ -184,26 +197,67 @@ def _build_miles_argv(
     *,
     model_path: str,
     cluster_nodes: int,
+    actor_nodes: int,
     gpus_per_node: int,
     checkpoint_dir: pathlib.Path,
     extra_args_text: str,
     custom_config_path: Optional[str],
     wandb_key: Optional[str],
     remote_recipe: bool,
+    colocate: bool,
+    rollout_num_gpus: Optional[int],
 ) -> list[str]:
     recipe_args = _parse_arg_text(_load_recipe_text(recipe, remote=remote_recipe))
     extra_args = _parse_arg_text(extra_args_text)
     enforced_args = _build_enforced_args(
         model_path=model_path,
         cluster_nodes=cluster_nodes,
+        actor_nodes=actor_nodes,
         gpus_per_node=gpus_per_node,
         checkpoint_dir=checkpoint_dir,
         custom_config_path=custom_config_path,
         wandb_key=wandb_key,
+        colocate=colocate,
+        rollout_num_gpus=rollout_num_gpus,
     )
     return ["python3", REMOTE_TRAIN_SCRIPT.as_posix(), *recipe_args, *extra_args, *enforced_args]
 
 
+def _resolve_actor_nodes(cluster_nodes: int, *, colocate: bool, actor_nodes: int) -> int:
+    if colocate:
+        return cluster_nodes
+    if actor_nodes > 0:
+        return actor_nodes
+    if cluster_nodes < 2:
+        raise ValueError(
+            "Non-colocated rollout needs spare cluster capacity. "
+            "Set MILES_N_NODES>=2 or pass --colocate."
+        )
+    return cluster_nodes - 1
+
+
+def _resolve_rollout_num_gpus(
+    cluster_nodes: int,
+    *,
+    actor_nodes: int,
+    gpus_per_node: int,
+    colocate: bool,
+    rollout_num_gpus: int,
+) -> Optional[int]:
+    if colocate:
+        return None
+    if rollout_num_gpus > 0:
+        return rollout_num_gpus
+    spare_gpus = (cluster_nodes - actor_nodes) * gpus_per_node
+    if spare_gpus < 1:
+        raise ValueError(
+            "Non-colocated rollout needs spare GPUs after reserving actor nodes. "
+            f"cluster_nodes={cluster_nodes}, actor_nodes={actor_nodes}, "
+            f"gpus_per_node={gpus_per_node}"
+        )
+    return spare_gpus
+
+
 def _read_optional_file(path_str: str) -> str:
     if not path_str:
         return ""
@@ -213,6 +267,7 @@ def _read_optional_file(path_str: str) -> str:
 def _build_runtime_env(master_addr: str, wandb_key: Optional[str]) -> dict:
     env_vars = {
         "MASTER_ADDR": master_addr,
+        "MILES_HOST_IP": master_addr,
         "no_proxy": master_addr,
         "PYTHONPATH": f"{REMOTE_PATCH_DIR.as_posix()}:/root/Megatron-LM",
         "CUDA_DEVICE_MAX_CONNECTIONS": "1",
@@ -365,6 +420,9 @@ async def submit_training(
         extra_args_text: str = "",
         custom_config_yaml: str = "",
         wandb_key: str = "",
+        actor_nodes: int | None = None,
+        colocate: bool = True,
+        rollout_num_gpus: int | None = None,
     ) -> dict:
         self._ensure_ray_started()
 
@@ -389,23 +447,32 @@ async def submit_training(
             custom_config_path = f"/tmp/{recipe.name}-{run_id}-overrides.yaml"
             pathlib.Path(custom_config_path).write_text(custom_config_yaml)
 
+        resolved_actor_nodes = actor_nodes if actor_nodes is not None else CLUSTER_NODES
+        resolved_rollout_num_gpus = rollout_num_gpus
         argv = _build_miles_argv(
             recipe,
             model_path=model_path,
             cluster_nodes=CLUSTER_NODES,
+            actor_nodes=resolved_actor_nodes,
             gpus_per_node=gpus_per_node,
             checkpoint_dir=checkpoint_dir,
             extra_args_text=extra_args_text,
             custom_config_path=custom_config_path,
             wandb_key=wandb_key or None,
             remote_recipe=True,
+            colocate=colocate,
+            rollout_num_gpus=resolved_rollout_num_gpus,
         )
         entrypoint = shlex.join(argv)
         runtime_env = _build_runtime_env(self.main_addr, wandb_key or None)
 
         print(f"Recipe: {recipe.name}")
         print(f"Model: {recipe.model_id}")
-        print(f"Nodes: {CLUSTER_NODES}")
+        print(f"Cluster nodes: {CLUSTER_NODES}")
+        print(f"Actor nodes: {resolved_actor_nodes}")
+        print(f"Colocate: {colocate}")
+        if not colocate:
+            print(f"Rollout GPUs: {resolved_rollout_num_gpus}")
         print(f"GPUs per node: {gpus_per_node}")
         print(f"Checkpoint dir: {checkpoint_dir}")
         print(f"Entrypoint: {entrypoint}")
@@ -484,6 +551,9 @@ def main(
     list_recipes: bool = False,
     dry_run: bool = False,
     allow_cluster_mismatch: bool = False,
+    colocate: bool = True,
+    actor_nodes: int = 0,
+    rollout_num_gpus: int = 0,
 ):
     if list_recipes:
         _print_recipe_table()
@@ -492,6 +562,18 @@ def main(
     selected_recipe = _get_recipe(recipe)
     selected_gpu = gpu or selected_recipe.gpu
     gpus_per_node = _parse_gpus_per_node(selected_gpu)
+    resolved_actor_nodes = _resolve_actor_nodes(
+        CLUSTER_NODES,
+        colocate=colocate,
+        actor_nodes=actor_nodes,
+    )
+    resolved_rollout_num_gpus = _resolve_rollout_num_gpus(
+        CLUSTER_NODES,
+        actor_nodes=resolved_actor_nodes,
+        gpus_per_node=gpus_per_node,
+        colocate=colocate,
+        rollout_num_gpus=rollout_num_gpus,
+    )
 
     if (
         not allow_cluster_mismatch
@@ -515,16 +597,23 @@ def main(
             selected_recipe,
             model_path="$MODEL_PATH",
             cluster_nodes=CLUSTER_NODES,
+            actor_nodes=resolved_actor_nodes,
             gpus_per_node=gpus_per_node,
             checkpoint_dir=checkpoint_dir,
             extra_args_text=merged_extra_args,
             custom_config_path="/tmp/custom-config.yaml" if custom_config_yaml else None,
             wandb_key="$WANDB_API_KEY" if wandb_key else None,
             remote_recipe=False,
+            colocate=colocate,
+            rollout_num_gpus=resolved_rollout_num_gpus,
         )
         print(f"Recipe: {selected_recipe.name}")
         print(f"Model: {selected_recipe.model_id}")
         print(f"Cluster nodes: {CLUSTER_NODES}")
+        print(f"Actor nodes: {resolved_actor_nodes}")
+        print(f"Colocate: {colocate}")
+        if not colocate:
+            print(f"Rollout GPUs: {resolved_rollout_num_gpus}")
         print(f"GPU: {selected_gpu}")
         print(shlex.join(argv))
         return
@@ -532,6 +621,10 @@ def main(
     print(f"Recipe: {selected_recipe.name}")
     print(f"Model: {selected_recipe.model_id}")
     print(f"Cluster nodes: {CLUSTER_NODES}")
+    print(f"Actor nodes: {resolved_actor_nodes}")
+    print(f"Colocate: {colocate}")
+    if not colocate:
+        print(f"Rollout GPUs: {resolved_rollout_num_gpus}")
     print(f"GPU: {selected_gpu}")
 
     cluster = MilesCluster.with_options(gpu=selected_gpu)()
@@ -541,5 +634,8 @@ def main(
         extra_args_text=merged_extra_args,
         custom_config_yaml=custom_config_yaml,
         wandb_key=wandb_key,
+        actor_nodes=resolved_actor_nodes,
+        colocate=colocate,
+        rollout_num_gpus=resolved_rollout_num_gpus,
     )
     print(result)

From bada71606c9158fcdb6eeb4f38f738225e7f3c37 Mon Sep 17 00:00:00 2001
From: Jason Mancuso <7891333+jvmncs@users.noreply.github.com>
Date: Mon, 30 Mar 2026 15:25:19 -0400
Subject: [PATCH 5/5] Refactor Miles Modal launcher and recipe argument
 plumbing

---
 miles/README.md           | 214 ++++++++---------------
 miles/modal_train.py      | 345 +++++++++++++++++++++-----------------
 miles/recipes/__init__.py |  29 ++++
 miles/recipes/util.py     | 122 ++++++++++++++
 4 files changed, 416 insertions(+), 294 deletions(-)
 create mode 100644 miles/recipes/__init__.py
 create mode 100644 miles/recipes/util.py

diff --git a/miles/README.md b/miles/README.md
index 26bd650..f01f58b 100644
--- a/miles/README.md
+++ b/miles/README.md
@@ -1,191 +1,121 @@
-# Miles example
+# Miles on Modal
 
-Multi-node Miles RL training on Modal using the same Ray bootstrap pattern as
-[`ray/modal_train.py`](../ray/modal_train.py), but with Miles recipes stored as
-native CLI flag files instead of Python config classes.
+Run Miles RL training on Modal with recipe files stored under [`recipes/`](./recipes/).
+The wrapper handles Modal and Ray orchestration; model and training flags stay in
+recipe arg files.
 
 ## Prerequisites
 
 - A Modal account with multi-node access.
-- A `huggingface-secret` Modal secret containing `HF_TOKEN` for gated model
-  downloads.
-- Optionally export `WANDB_API_KEY` in your local shell before `modal run` to
-  auto-enable Weights & Biases logging for training runs.
+- A `huggingface-secret` Modal secret containing `HF_TOKEN`.
+- Optional: `WANDB_API_KEY` in your local shell for Weights & Biases logging.
+- Optional: `modal deploy miles/modal_train.py`. The local entrypoint will try
+  the deployed `MilesCluster` first and fall back to an ephemeral app if it is
+  not deployed.
 
-## Recipes
+## Prepare Shared Assets
 
-List the built-in recipes:
+Prepare the default GSM8K dataset:
 
 ```bash
-modal run miles/modal_train.py --list-recipes
+modal run miles/modal_train.py::prepare_dataset
 ```
 
-Current recipes:
-
-- `qwen25-0p5b-lora`: single-node smoke test adapted from the upstream Miles
-  LoRA example.
-- `qwen3-30b-a3b-lora`: default Qwen3-30B-A3B all-layer recipe, targeting
-  attention plus MLP/MoE layers (`linear_qkv`, `linear_proj`, `linear_fc1`,
-  `linear_fc2`).
-- `qwen3-30b-a3b-lora-fewstep`: trimmed all-layer recipe that is intended to
-  prove a few full RL updates on Modal.
-- `qwen3-30b-a3b-experts-lora`: explicit all-layer alias that makes the expert
-  `linear_fc1` / `linear_fc2` targeting obvious in the name.
-- `qwen3-30b-a3b-experts-fewstep`: trimmed explicit all-layer alias built from
-  the working few-step shape.
+Download a model for a built-in recipe:
 
-Testing/debug recipe files live under [`recipes/tests/`](./recipes/tests).
-The attention-only recipe is kept only as
-[`qwen3-30b-a3b-lora-greedy-debug`](./recipes/tests/qwen3-30b-a3b-lora-greedy-debug.args),
-as a diagnostic control rather than a recommended training setup.
-
-## Prepare assets
+```bash
+modal run miles/modal_train.py::download_model --recipe qwen3-30b-a3b-lora
+```
 
-Prepare a small GSM8K dataset in the shared volume:
+Or download any model directly:
 
 ```bash
-modal run miles/modal_train.py::prepare_dataset
+modal run miles/modal_train.py::download_model --model-id Qwen/Qwen3-30B-A3B
 ```
 
-Download a recipe's base model into the shared Hugging Face cache:
+## Recipes
+
+List the available recipes:
 
 ```bash
-modal run miles/modal_train.py::download_model --recipe qwen3-30b-a3b-lora
+modal run miles/modal_train.py --list-recipes
 ```
 
+Recommended starting points:
+
+- `qwen3-30b-a3b-lora`: default Qwen3 recipe.
+- `qwen3-30b-a3b-lora-fewstep`: smallest end-to-end Qwen3 validation recipe.
+- `qwen3-30b-a3b-experts-lora`: explicit expert-target variant.
+- `qwen3-30b-a3b-experts-fewstep`: trimmed expert-target validation recipe.
+- `qwen25-0p5b-lora`: small smoke test.
+
+Testing and debug recipes live under [`recipes/tests/`](./recipes/tests).
+
 ## Train
 
-The cluster size is chosen at import time by `MILES_N_NODES`, so set it in the
-same shell invocation as `modal run`.
+Set `MILES_N_NODES` in the same shell invocation as `modal run`; the cluster size is read at import time.
 
-Single-node smoke test:
+Single-node Qwen3 few-step validation:
 
 ```bash
-MILES_N_NODES=1 modal run miles/modal_train.py --recipe qwen25-0p5b-lora
+MILES_N_NODES=1 modal run miles/modal_train.py --recipe qwen3-30b-a3b-lora-fewstep
 ```
 
-Qwen3-30B-A3B baseline LoRA validation:
+Single-node Qwen3 default recipe:
 
 ```bash
 MILES_N_NODES=1 modal run miles/modal_train.py --recipe qwen3-30b-a3b-lora
 ```
 
-Qwen3-30B-A3B few-step all-layer validation:
+Single-node expert-target follow-up:
 
 ```bash
-MILES_N_NODES=1 modal run miles/modal_train.py --recipe qwen3-30b-a3b-lora-fewstep
+MILES_N_NODES=1 modal run miles/modal_train.py --recipe qwen3-30b-a3b-experts-fewstep
 ```
 
-Qwen3-30B-A3B few-step all-layer validation in non-colocated mode:
+Non-colocated Qwen3 validation:
 
 ```bash
-MILES_N_NODES=2 modal run miles/modal_train.py --recipe qwen3-30b-a3b-lora-fewstep --no-colocate --actor-nodes 1 --allow-cluster-mismatch
+MILES_N_NODES=2 modal run miles/modal_train.py \
+  --recipe qwen3-30b-a3b-lora-fewstep \
+  --no-colocate \
+  --actor-nodes 1 \
+  --allow-cluster-mismatch
 ```
 
-Qwen3-30B-A3B expert-target LoRA follow-up:
+Small smoke test:
 
 ```bash
-MILES_N_NODES=1 modal run miles/modal_train.py --recipe qwen3-30b-a3b-experts-lora
+MILES_N_NODES=1 modal run miles/modal_train.py --recipe qwen25-0p5b-lora
 ```
 
-Qwen3-30B-A3B expert-target few-step validation:
+## Ad Hoc Runs
+
+You can launch without a predefined recipe by passing args directly:
 
 ```bash
-MILES_N_NODES=1 modal run miles/modal_train.py --recipe qwen3-30b-a3b-experts-fewstep
+MILES_N_NODES=1 modal run miles/modal_train.py \
+  --model-id Qwen/Qwen3-30B-A3B \
+  --args-file miles/recipes/qwen3-30b-a3b-lora.args \
+  --extra-args "--train-samples 8 --eval-interval 1" \
+  --run-name qwen3-adhoc
 ```
 
-## Qwen3 Notes
-
-- Start with standard LoRA, not DoRA. Miles' current rollout sync and adapter
-  filtering are LoRA-specific and keyed off `lora_A` / `lora_B` names, so DoRA
-  is not the first validation target.
-- The default Qwen3 recipes now follow the recommendations from Thinking
-  Machines' “LoRA Without Regret”: keep the standard `alpha=32` / `1/r`
-  parameterization, use a LoRA LR around 10x the FullFT baseline (`1e-5` here
-  vs. the upstream Qwen3-30B-A3B FullFT `1e-6`), and include the MLP/MoE layers
-  rather than using attention-only LoRA.
-- One MoE-specific nuance from the article is not exposed cleanly by the current
-  Miles recipe surface: their Qwen3 MoE experiments scale per-expert LoRA rank
-  by the number of active experts. Our recipes currently use a uniform
-  `--lora-rank 32` across all targeted modules because Miles exposes one global
-  LoRA rank, not per-module or per-expert ranks.
-- The baked Qwen3 recipes are single-node `H100:8` shapes. They are intended to
-  validate end-to-end bridge-mode LoRA with colocated rollout first, not to
-  exhaustively cover every parallelism combination.
-- The Modal wrapper now also supports a non-colocated split for experimentation:
-  total cluster size still comes from `MILES_N_NODES`, while `--actor-nodes`
-  controls how many of those nodes are reserved for Megatron training and the
-  remaining GPUs default to rollout.
-- Source inspection suggests the training path should handle TP / PP / EP / CP
-  because the bridge setup forwards all of those settings into Megatron-Bridge,
-  and Megatron-Bridge's PEFT tests cover pipeline-style model chunk lists. That
-  is still weaker than an actual Miles e2e validation for each shape.
-- Miles currently supports LoRA weight sync only for colocated rollout engines.
-  Distributed non-colocated rollout sync is not yet implemented for LoRA.
-- The baseline Qwen3 recipe stays close to the upstream Miles single-node
-  Qwen3-30B-A3B shape while using all-layer LoRA. The explicit expert-target
-  recipe names are kept mainly for clarity and backwards compatibility.
-
-## Observed On Modal
-
-The current wrapper includes runtime patches in
-[`modal_patches/sitecustomize.py`](./modal_patches/sitecustomize.py) that:
-
-- register Megatron-Bridge's `LinearCrossEntropyModule` as column-parallel
-  before Hugging Face weights are loaded, which fixes bridge-mode Qwen3 load on
-  `output_layer.weight`;
-- serialize colocated LoRA weight buckets in a builtins-only format and
-  rehydrate them inside SGLang, which fixes the Modal colocated LoRA sync path;
-- sanitize non-finite SGLang logprob values before JSON serialization;
-- sanitize invalid SGLang sampling probability rows before `torch.multinomial`.
-
-What the Modal runs have validated so far on `modal-labs`:
-
-- The all-layer Qwen3-30B-A3B LoRA shape now has runtime validation on Modal.
-  In recent detached runs of the few-step recipe shape, Miles created LoRA with
-  `linear_qkv`, `linear_proj`, `linear_fc1`, and `linear_fc2`, injected expert
-  modules under `decoder.layers.*.mlp.experts.*`, completed rollout collection,
-  and reached at least `train/step` 1 on a single-node `H100:8` shape.
-- The same all-layer few-step recipe now also has non-colocated runtime
-  validation on `modal-labs` with a 2-node split: 1 node for actor training and
-  1 node for rollout (`--no-colocate --actor-nodes 1`). After forcing the Miles
-  router to advertise the cluster head IP and patching distributed LoRA sync to
-  export bridge adapters directly, the run reached at least `train/step` 2.
-- The attention-only debug/control recipe also works, but it is no longer the
-  recommended configuration after comparing against the Thinking Machines
-  guidance.
-- The remaining instability has been in the colocated SGLang rollout path, not
-  in LoRA target discovery. The main concrete runtime failures we hit were:
-  non-finite logprobs breaking HTTP JSON serialization, and invalid sampling
-  probability tensors breaking `torch.multinomial`.
-
-Current interpretation:
-
-- Qwen3-30B-A3B MoE LoRA support in Miles is real enough to instantiate,
-  target, load, and export adapters for both attention and expert MLP layers.
-- Attention-only Qwen3-30B-A3B LoRA is still runtime-validated as a debug
-  control on `modal-labs`, but it is no longer the recommended default.
-- The remaining risk is concentrated in the colocated SGLang rollout lifecycle,
-  which is coupled to `offload_rollout` / `enable_memory_saver=True` in the
-  current Miles SGLang engine setup, especially once expert-target LoRA is
-  enabled.
-
-Useful options:
-
-- `--dry-run`: print the assembled Miles command with a `$MODEL_PATH`
-  placeholder without launching the cluster.
-- `--extra-args "...flags..."`: append ad hoc Miles CLI overrides.
-- `--extra-args-file path/to/file.args`: append overrides from a local text
-  file.
-- `--custom-config path/to/overrides.yaml`: pass a flat YAML override map to
-  Miles via `--custom-config-path`.
-- `--allow-cluster-mismatch`: bypass recipe/node-count validation if you are
-  intentionally adapting a canned recipe.
-- `USE_LOCAL_MILES=/path/to/miles`: overlay a local Miles checkout on top of
-  the pinned container image.
-- `MILES_IMAGE=radixark/miles:...`: override the pinned image tag. The current
-  default is `radixark/miles:dev-202603231227`.
-
-The wrapper intentionally owns only Modal/Ray plumbing plus a small set of
-cluster-critical flags. All model and training settings live in
-[`miles/recipes/`](./recipes/).
+## Useful Options
+
+- `--dry-run`: print the assembled Miles command (with a `$MODEL_PATH` placeholder) without launching a job.
+- `--args` / `--args-file`: provide the base Miles CLI args.
+- `--extra-args` / `--extra-args-file`: append overrides to a recipe or ad hoc run.
+- `--custom-config`: pass a YAML override file through to Miles.
+- `--run-name`: override the checkpoint subdirectory name.
+- `--allow-cluster-mismatch`: bypass recipe node-count checks.
+- `USE_LOCAL_MILES=/path/to/miles`: overlay a local Miles checkout.
+- `MILES_IMAGE=radixark/miles:...`: override the pinned container image.
+
+## Notes
+
+- The default Qwen3 recipes use standard all-layer LoRA over
+  `linear_qkv`, `linear_proj`, `linear_fc1`, and `linear_fc2`.
+- Start with the few-step recipes when validating a new environment.
+- Modal-specific runtime compatibility patches live in
+  [`modal_patches/sitecustomize.py`](./modal_patches/sitecustomize.py).
diff --git a/miles/modal_train.py b/miles/modal_train.py
index 70b35b6..97fb3e7 100644
--- a/miles/modal_train.py
+++ b/miles/modal_train.py
@@ -1,27 +1,38 @@
-"""
-Thin Modal launcher for multi-node Miles training.
-
-Design:
-- Bootstrap the Ray cluster once in modal.enter() inside a clustered modal.Cls.
-- Submit the actual Miles job from a modal.method() on rank 0.
-- Keep Miles recipes as native CLI flag files under miles/recipes/.
-- Own only infrastructure-critical flags in Python:
-  cluster size, GPUs per node, model path resolution, checkpoint path, and
-  optional YAML override transport.
+"""Launch Miles training jobs on Modal.
+
+This module defines the clustered `MilesCluster` launcher, helper functions for
+model download and dataset preparation, and a local CLI entrypoint for
+submitting runs.
+
+It supports two execution modes:
+  - deployed: submit runs to a deployed `MilesCluster` with a fixed compute shape
+  - ephemeral: launch a one-off app directly from the local entrypoint
+
+The Python wrapper owns only Modal and Ray orchestration plus a small set of
+infrastructure-critical flags. Model and training arguments live in `recipes/`.
 """
 
 import datetime as dt
 import os
 import pathlib
+import re
 import shlex
 import subprocess
 import time
-from dataclasses import dataclass
 from typing import Optional
 
 import modal
 import modal.experimental
 
+from recipes import (
+    Recipe,
+    format_recipe_table,
+    get_optional_recipe,
+    load_recipe_text,
+    merge_arg_texts,
+    parse_arg_text,
+    read_arg_file,
+)
 
 here = pathlib.Path(__file__).parent.resolve()
 
@@ -34,7 +45,7 @@
 HF_CACHE_PATH = pathlib.Path("/root/.cache/huggingface")
 DATA_PATH = pathlib.Path("/data")
 CHECKPOINTS_PATH = pathlib.Path("/checkpoints")
-REMOTE_RECIPES_DIR = pathlib.Path("/root/miles-recipes")
+REMOTE_RECIPES_DIR = pathlib.Path("/root/recipes")
 REMOTE_PATCH_DIR = pathlib.Path("/root/miles-modal-patches")
 REMOTE_MILES_DIR = pathlib.Path("/root/miles")
 REMOTE_TRAIN_SCRIPT = REMOTE_MILES_DIR / "train.py"
@@ -49,75 +60,6 @@
 )
 
 
-@dataclass(frozen=True)
-class Recipe:
-    name: str
-    description: str
-    model_id: str
-    args_file: str
-    recommended_nodes: int
-    gpu: str
-
-
-RECIPES = {
-    "qwen25-0p5b-lora": Recipe(
-        name="qwen25-0p5b-lora",
-        description="Single-node smoke test adapted from the upstream Miles LoRA example.",
-        model_id="Qwen/Qwen2.5-0.5B-Instruct",
-        args_file="tests/qwen25-0p5b-lora.args",
-        recommended_nodes=1,
-        gpu="H100:8",
-    ),
-    "qwen3-30b-a3b-lora": Recipe(
-        name="qwen3-30b-a3b-lora",
-        description="Single-node Qwen3-30B-A3B all-layer bridge-mode LoRA recipe aligned with current best practices.",
-        model_id="Qwen/Qwen3-30B-A3B",
-        args_file="qwen3-30b-a3b-lora.args",
-        recommended_nodes=1,
-        gpu="H100:8",
-    ),
-    "qwen3-30b-a3b-lora-fewstep": Recipe(
-        name="qwen3-30b-a3b-lora-fewstep",
-        description="Single-node Qwen3-30B-A3B all-layer LoRA recipe trimmed to chase a few full RL steps.",
-        model_id="Qwen/Qwen3-30B-A3B",
-        args_file="tests/qwen3-30b-a3b-lora-fewstep.args",
-        recommended_nodes=1,
-        gpu="H100:8",
-    ),
-    "qwen3-30b-a3b-lora-greedy-debug": Recipe(
-        name="qwen3-30b-a3b-lora-greedy-debug",
-        description="Single-node Qwen3-30B-A3B attention-only debug/control recipe with greedy rollout.",
-        model_id="Qwen/Qwen3-30B-A3B",
-        args_file="tests/qwen3-30b-a3b-lora-greedy-debug.args",
-        recommended_nodes=1,
-        gpu="H100:8",
-    ),
-    "qwen3-30b-a3b-experts-lora": Recipe(
-        name="qwen3-30b-a3b-experts-lora",
-        description="Explicit all-layer Qwen3-30B-A3B LoRA recipe including expert linear_fc1/fc2 targets.",
-        model_id="Qwen/Qwen3-30B-A3B",
-        args_file="qwen3-30b-a3b-experts-lora.args",
-        recommended_nodes=1,
-        gpu="H100:8",
-    ),
-    "qwen3-30b-a3b-experts-fewstep": Recipe(
-        name="qwen3-30b-a3b-experts-fewstep",
-        description="Explicit all-layer Qwen3-30B-A3B few-step recipe including expert linear_fc1/fc2 targets.",
-        model_id="Qwen/Qwen3-30B-A3B",
-        args_file="tests/qwen3-30b-a3b-experts-fewstep.args",
-        recommended_nodes=1,
-        gpu="H100:8",
-    ),
-}
-
-
-def _get_recipe(name: str) -> Recipe:
-    if name not in RECIPES:
-        available = ", ".join(sorted(RECIPES))
-        raise ValueError(f"Unknown recipe: {name}. Available recipes: {available}")
-    return RECIPES[name]
-
-
 def _parse_gpus_per_node(gpu: str) -> int:
     try:
         return int(gpu.rsplit(":", 1)[1])
@@ -127,29 +69,44 @@ def _parse_gpus_per_node(gpu: str) -> int:
         ) from exc
 
 
-def _clean_arg_text(arg_text: str) -> str:
-    lines: list[str] = []
-    for raw_line in arg_text.splitlines():
-        line = raw_line.split("#", 1)[0].strip()
-        if line:
-            lines.append(line)
-    return "\n".join(lines)
+def _resolve_model_id(recipe: Recipe | None, model_id: str) -> str:
+    if model_id:
+        return model_id
+    if recipe:
+        return recipe.model_id
+    raise ValueError("Pass --recipe or --model-id.")
+
+
+def _sanitize_path_component(value: str) -> str:
+    sanitized = re.sub(r"[^A-Za-z0-9._-]+", "-", value).strip("-")
+    return sanitized or "run"
 
 
-def _parse_arg_text(arg_text: str) -> list[str]:
-    cleaned = _clean_arg_text(arg_text)
-    return shlex.split(cleaned) if cleaned else []
+def _resolve_run_label(recipe: Recipe | None, *, model_id: str, run_name: str) -> str:
+    if run_name:
+        return _sanitize_path_component(run_name)
+    if recipe:
+        return recipe.name
+    return _sanitize_path_component(model_id)
 
 
-def _load_recipe_text(recipe: Recipe, remote: bool = False) -> str:
-    base_dir = REMOTE_RECIPES_DIR if remote else here / "recipes"
-    return (base_dir / recipe.args_file).read_text()
+def _resolve_base_args_text(
+    recipe: Recipe | None,
+    *,
+    args_text: str,
+    recipes_dir: pathlib.Path,
+) -> str:
+    parts: list[str] = []
+    if recipe:
+        parts.append(load_recipe_text(recipe, base_dir=recipes_dir))
+    if args_text:
+        parts.append(args_text)
+    return merge_arg_texts(*parts)
 
 
 def _build_enforced_args(
     *,
     model_path: str,
-    cluster_nodes: int,
     actor_nodes: int,
     gpus_per_node: int,
     checkpoint_dir: pathlib.Path,
@@ -193,25 +150,22 @@ def _build_enforced_args(
 
 
 def _build_miles_argv(
-    recipe: Recipe,
     *,
+    base_args_text: str,
     model_path: str,
-    cluster_nodes: int,
     actor_nodes: int,
     gpus_per_node: int,
     checkpoint_dir: pathlib.Path,
     extra_args_text: str,
     custom_config_path: Optional[str],
     wandb_key: Optional[str],
-    remote_recipe: bool,
     colocate: bool,
     rollout_num_gpus: Optional[int],
 ) -> list[str]:
-    recipe_args = _parse_arg_text(_load_recipe_text(recipe, remote=remote_recipe))
-    extra_args = _parse_arg_text(extra_args_text)
+    base_args = parse_arg_text(base_args_text)
+    extra_args = parse_arg_text(extra_args_text)
     enforced_args = _build_enforced_args(
         model_path=model_path,
-        cluster_nodes=cluster_nodes,
         actor_nodes=actor_nodes,
         gpus_per_node=gpus_per_node,
         checkpoint_dir=checkpoint_dir,
@@ -220,10 +174,18 @@ def _build_miles_argv(
         colocate=colocate,
         rollout_num_gpus=rollout_num_gpus,
     )
-    return ["python3", REMOTE_TRAIN_SCRIPT.as_posix(), *recipe_args, *extra_args, *enforced_args]
+    return [
+        "python3",
+        REMOTE_TRAIN_SCRIPT.as_posix(),
+        *base_args,
+        *extra_args,
+        *enforced_args,
+    ]
 
 
-def _resolve_actor_nodes(cluster_nodes: int, *, colocate: bool, actor_nodes: int) -> int:
+def _resolve_actor_nodes(
+    cluster_nodes: int, *, colocate: bool, actor_nodes: int
+) -> int:
     if colocate:
         return cluster_nodes
     if actor_nodes > 0:
@@ -258,12 +220,6 @@ def _resolve_rollout_num_gpus(
     return spare_gpus
 
 
-def _read_optional_file(path_str: str) -> str:
-    if not path_str:
-        return ""
-    return pathlib.Path(path_str).read_text()
-
-
 def _build_runtime_env(master_addr: str, wandb_key: Optional[str]) -> dict:
     env_vars = {
         "MASTER_ADDR": master_addr,
@@ -282,21 +238,17 @@ def _build_runtime_env(master_addr: str, wandb_key: Optional[str]) -> dict:
 
 def _print_recipe_table():
     print("Available recipes:")
-    for recipe in sorted(RECIPES.values(), key=lambda item: item.name):
-        print(
-            f"  - {recipe.name}: {recipe.description} "
-            f"(model={recipe.model_id}, nodes={recipe.recommended_nodes}, gpu={recipe.gpu})"
-        )
+    for line in format_recipe_table():
+        print(line)
 
 
 image = (
     modal.Image.from_registry(MILES_IMAGE)
     .entrypoint([])
-    .add_local_dir(here / "recipes", remote_path=REMOTE_RECIPES_DIR.as_posix(), copy=True)
+    .add_local_dir(here / "recipes", remote_path=REMOTE_RECIPES_DIR.as_posix())
     .add_local_dir(
         here / "modal_patches",
         remote_path=REMOTE_PATCH_DIR.as_posix(),
-        copy=True,
     )
 )
 
@@ -318,6 +270,9 @@ def _print_recipe_table():
 app = modal.App(APP_NAME)
 
 
+# ---- Training Cluster Cls ---- #
+
+
 @app.cls(
     image=image,
     gpu=DEFAULT_GPU,
@@ -414,12 +369,15 @@ def _ensure_ray_started(self):
     @modal.method()
     async def submit_training(
         self,
-        recipe_name: str,
+        recipe_name: str = "",
         *,
+        model_id: str = "",
+        base_args_text: str = "",
         gpus_per_node: int,
         extra_args_text: str = "",
         custom_config_yaml: str = "",
         wandb_key: str = "",
+        run_name: str = "",
         actor_nodes: int | None = None,
         colocate: bool = True,
         rollout_num_gpus: int | None = None,
@@ -430,44 +388,65 @@ async def submit_training(
             while True:
                 time.sleep(10)
 
-        recipe = _get_recipe(recipe_name)
+        recipe = get_optional_recipe(recipe_name)
+        resolved_model_id = _resolve_model_id(recipe, model_id)
+        resolved_base_args_text = _resolve_base_args_text(
+            recipe,
+            args_text=base_args_text,
+            recipes_dir=REMOTE_RECIPES_DIR,
+        )
+        if not resolved_base_args_text:
+            raise ValueError(
+                "No training args were provided. Choose a recipe or pass --args/--args-file."
+            )
+        run_label = _resolve_run_label(
+            recipe,
+            model_id=resolved_model_id,
+            run_name=run_name,
+        )
 
         try:
-            model_path = snapshot_download(repo_id=recipe.model_id, local_files_only=True)
+            model_path = snapshot_download(
+                repo_id=resolved_model_id, local_files_only=True
+            )
         except Exception as exc:
-            raise RuntimeError(
-                f"Model {recipe.model_id} is not present in the shared HF cache. "
+            recipe_hint = (
                 f"Run `modal run miles/modal_train.py::download_model --recipe {recipe.name}` first."
+                if recipe
+                else f"Run `modal run miles/modal_train.py::download_model --model-id {resolved_model_id}` first."
+            )
+            raise RuntimeError(
+                f"Model {resolved_model_id} is not present in the shared HF cache. "
+                f"{recipe_hint}"
             ) from exc
 
         run_id = dt.datetime.utcnow().strftime("%Y%m%d-%H%M%S")
-        checkpoint_dir = CHECKPOINTS_PATH / recipe.name / run_id
+        checkpoint_dir = CHECKPOINTS_PATH / run_label / run_id
         custom_config_path = None
         if custom_config_yaml:
-            custom_config_path = f"/tmp/{recipe.name}-{run_id}-overrides.yaml"
+            custom_config_path = f"/tmp/{run_label}-{run_id}-overrides.yaml"
             pathlib.Path(custom_config_path).write_text(custom_config_yaml)
 
         resolved_actor_nodes = actor_nodes if actor_nodes is not None else CLUSTER_NODES
         resolved_rollout_num_gpus = rollout_num_gpus
         argv = _build_miles_argv(
-            recipe,
+            base_args_text=resolved_base_args_text,
             model_path=model_path,
-            cluster_nodes=CLUSTER_NODES,
             actor_nodes=resolved_actor_nodes,
             gpus_per_node=gpus_per_node,
             checkpoint_dir=checkpoint_dir,
             extra_args_text=extra_args_text,
             custom_config_path=custom_config_path,
             wandb_key=wandb_key or None,
-            remote_recipe=True,
             colocate=colocate,
             rollout_num_gpus=resolved_rollout_num_gpus,
         )
         entrypoint = shlex.join(argv)
         runtime_env = _build_runtime_env(self.main_addr, wandb_key or None)
 
-        print(f"Recipe: {recipe.name}")
-        print(f"Model: {recipe.model_id}")
+        print(f"Recipe: {recipe.name if recipe else '<none>'}")
+        print(f"Model: {resolved_model_id}")
+        print(f"Run label: {run_label}")
         print(f"Cluster nodes: {CLUSTER_NODES}")
         print(f"Actor nodes: {resolved_actor_nodes}")
         print(f"Colocate: {colocate}")
@@ -479,7 +458,9 @@ async def submit_training(
 
         with modal.forward(RAY_DASHBOARD_PORT) as tunnel:
             print(f"Dashboard URL: {tunnel.url}")
-            job_id = self.client.submit_job(entrypoint=entrypoint, runtime_env=runtime_env)
+            job_id = self.client.submit_job(
+                entrypoint=entrypoint, runtime_env=runtime_env
+            )
             print(f"Submitted Ray job: {job_id}")
 
             async for line in self.client.tail_job_logs(job_id):
@@ -491,11 +472,16 @@ async def submit_training(
         return {
             "job_id": job_id,
             "status": status,
-            "recipe": recipe.name,
+            "recipe": recipe.name if recipe else None,
+            "model_id": resolved_model_id,
+            "run_name": run_label,
             "checkpoint_dir": checkpoint_dir.as_posix(),
         }
 
 
+# ---- Model Download Utility ---- #
+
+
 @app.function(
     image=image,
     volumes={HF_CACHE_PATH.as_posix(): hf_cache_volume},
@@ -509,7 +495,12 @@ def download_model(
 ):
     from huggingface_hub import snapshot_download
 
-    resolved_model_id = model_id or _get_recipe(recipe).model_id
+    selected_recipe = get_optional_recipe(recipe)
+    resolved_model_id = model_id or (
+        selected_recipe.model_id if selected_recipe else ""
+    )
+    if not resolved_model_id:
+        raise ValueError("Pass --recipe or --model-id.")
     hf_cache_volume.reload()
     path = snapshot_download(
         repo_id=resolved_model_id,
@@ -520,6 +511,9 @@ def download_model(
     hf_cache_volume.commit()
 
 
+# ---- Dataset Processing Utility ---- #
+
+
 @app.function(
     image=image,
     volumes={DATA_PATH.as_posix(): data_volume},
@@ -528,29 +522,47 @@ def download_model(
 def prepare_dataset(
     hf_dataset: str = "zhuzilin/gsm8k",
     data_folder: str = "gsm8k",
+    train_limit: int = 0,
+    test_limit: int = 0,
 ):
     from datasets import load_dataset
 
     data_volume.reload()
     dataset = load_dataset(hf_dataset)
+    train_split = dataset["train"]
+    test_split = dataset["test"]
+    if train_limit > 0:
+        train_split = train_split.select(range(min(train_limit, len(train_split))))
+    if test_limit > 0:
+        test_split = test_split.select(range(min(test_limit, len(test_split))))
     output_dir = DATA_PATH / data_folder
     output_dir.mkdir(parents=True, exist_ok=True)
-    dataset["train"].to_parquet((output_dir / "train.parquet").as_posix())
-    dataset["test"].to_parquet((output_dir / "test.parquet").as_posix())
+    train_split.to_parquet((output_dir / "train.parquet").as_posix())
+    test_split.to_parquet((output_dir / "test.parquet").as_posix())
     data_volume.commit()
-    print(f"Prepared dataset {hf_dataset} under {output_dir}")
+    print(
+        f"Prepared dataset {hf_dataset} under {output_dir} "
+        f"(train_rows={len(train_split)}, test_rows={len(test_split)})"
+    )
+
+
+# ---- Local Entrypoint ---- #
 
 
 @app.local_entrypoint()
 def main(
-    recipe: str = "qwen3-30b-a3b-lora",
+    recipe: str = "",
+    model_id: str = "",
     gpu: str = "",
+    args: str = "",
+    args_file: str = "",
     extra_args: str = "",
     extra_args_file: str = "",
     custom_config: str = "",
     list_recipes: bool = False,
     dry_run: bool = False,
     allow_cluster_mismatch: bool = False,
+    run_name: str = "",
     colocate: bool = True,
     actor_nodes: int = 0,
     rollout_num_gpus: int = 0,
@@ -559,8 +571,9 @@ def main(
         _print_recipe_table()
         return
 
-    selected_recipe = _get_recipe(recipe)
-    selected_gpu = gpu or selected_recipe.gpu
+    selected_recipe = get_optional_recipe(recipe)
+    resolved_model_id = _resolve_model_id(selected_recipe, model_id)
+    selected_gpu = gpu or (selected_recipe.gpu if selected_recipe else DEFAULT_GPU)
     gpus_per_node = _parse_gpus_per_node(selected_gpu)
     resolved_actor_nodes = _resolve_actor_nodes(
         CLUSTER_NODES,
@@ -576,7 +589,8 @@ def main(
     )
 
     if (
-        not allow_cluster_mismatch
+        selected_recipe
+        and not allow_cluster_mismatch
         and CLUSTER_NODES != selected_recipe.recommended_nodes
     ):
         raise ValueError(
@@ -585,30 +599,44 @@ def main(
             f"Rerun with the recommended value or pass --allow-cluster-mismatch."
         )
 
-    merged_extra_args = "\n".join(
-        part for part in [extra_args, _read_optional_file(extra_args_file)] if part
+    base_args_text = merge_arg_texts(args, read_arg_file(args_file))
+    resolved_base_args_text = _resolve_base_args_text(
+        selected_recipe,
+        args_text=base_args_text,
+        recipes_dir=here / "recipes",
     )
-    custom_config_yaml = _read_optional_file(custom_config)
+    if not resolved_base_args_text:
+        raise ValueError(
+            "No training args were provided. Choose a recipe or pass --args/--args-file."
+        )
+    merged_extra_args = merge_arg_texts(extra_args, read_arg_file(extra_args_file))
+    custom_config_yaml = read_arg_file(custom_config)
     wandb_key = os.environ.get("WANDB_API_KEY", "")
-    checkpoint_dir = CHECKPOINTS_PATH / selected_recipe.name / "DRY_RUN"
+    resolved_run_label = _resolve_run_label(
+        selected_recipe,
+        model_id=resolved_model_id,
+        run_name=run_name,
+    )
+    checkpoint_dir = CHECKPOINTS_PATH / resolved_run_label / "DRY_RUN"
 
     if dry_run:
         argv = _build_miles_argv(
-            selected_recipe,
+            base_args_text=resolved_base_args_text,
             model_path="$MODEL_PATH",
-            cluster_nodes=CLUSTER_NODES,
             actor_nodes=resolved_actor_nodes,
             gpus_per_node=gpus_per_node,
             checkpoint_dir=checkpoint_dir,
             extra_args_text=merged_extra_args,
-            custom_config_path="/tmp/custom-config.yaml" if custom_config_yaml else None,
+            custom_config_path="/tmp/custom-config.yaml"
+            if custom_config_yaml
+            else None,
             wandb_key="$WANDB_API_KEY" if wandb_key else None,
-            remote_recipe=False,
             colocate=colocate,
             rollout_num_gpus=resolved_rollout_num_gpus,
         )
-        print(f"Recipe: {selected_recipe.name}")
-        print(f"Model: {selected_recipe.model_id}")
+        print(f"Recipe: {selected_recipe.name if selected_recipe else '<none>'}")
+        print(f"Model: {resolved_model_id}")
+        print(f"Run label: {resolved_run_label}")
         print(f"Cluster nodes: {CLUSTER_NODES}")
         print(f"Actor nodes: {resolved_actor_nodes}")
         print(f"Colocate: {colocate}")
@@ -618,8 +646,9 @@ def main(
         print(shlex.join(argv))
         return
 
-    print(f"Recipe: {selected_recipe.name}")
-    print(f"Model: {selected_recipe.model_id}")
+    print(f"Recipe: {selected_recipe.name if selected_recipe else '<none>'}")
+    print(f"Model: {resolved_model_id}")
+    print(f"Run label: {resolved_run_label}")
     print(f"Cluster nodes: {CLUSTER_NODES}")
     print(f"Actor nodes: {resolved_actor_nodes}")
     print(f"Colocate: {colocate}")
@@ -627,13 +656,25 @@ def main(
         print(f"Rollout GPUs: {resolved_rollout_num_gpus}")
     print(f"GPU: {selected_gpu}")
 
-    cluster = MilesCluster.with_options(gpu=selected_gpu)()
+    try:
+        cluster_cls = modal.Cls.from_name(APP_NAME, "MilesCluster")
+        print(f"Using deployed Modal class: {APP_NAME}.MilesCluster")
+    except modal.exception.NotFoundError:
+        cluster_cls = MilesCluster
+        print(
+            f"No deployed Modal class found for {APP_NAME}.MilesCluster; using an ephemeral app."
+        )
+
+    cluster = cluster_cls.with_options(gpu=selected_gpu)()
     result = cluster.submit_training.remote(
-        recipe_name=selected_recipe.name,
+        recipe_name=selected_recipe.name if selected_recipe else "",
+        model_id=resolved_model_id,
+        base_args_text=base_args_text,
         gpus_per_node=gpus_per_node,
         extra_args_text=merged_extra_args,
         custom_config_yaml=custom_config_yaml,
         wandb_key=wandb_key,
+        run_name=resolved_run_label,
         actor_nodes=resolved_actor_nodes,
         colocate=colocate,
         rollout_num_gpus=resolved_rollout_num_gpus,
diff --git a/miles/recipes/__init__.py b/miles/recipes/__init__.py
new file mode 100644
index 0000000..288aa81
--- /dev/null
+++ b/miles/recipes/__init__.py
@@ -0,0 +1,29 @@
+from .util import (
+    RECIPES,
+    RECIPES_DIR,
+    Recipe,
+    clean_arg_text,
+    format_recipe_table,
+    get_optional_recipe,
+    get_recipe,
+    iter_recipes,
+    load_recipe_text,
+    merge_arg_texts,
+    parse_arg_text,
+    read_arg_file,
+)
+
+__all__ = [  # public API: the same names imported from .util at the top of this module
+    "RECIPES",
+    "RECIPES_DIR",
+    "Recipe",
+    "clean_arg_text",
+    "format_recipe_table",
+    "get_optional_recipe",
+    "get_recipe",
+    "iter_recipes",
+    "load_recipe_text",
+    "merge_arg_texts",
+    "parse_arg_text",
+    "read_arg_file",
+]
diff --git a/miles/recipes/util.py b/miles/recipes/util.py
new file mode 100644
index 0000000..ec79479
--- /dev/null
+++ b/miles/recipes/util.py
@@ -0,0 +1,122 @@
+import pathlib
+import shlex
+from dataclasses import dataclass
+
+
+RECIPES_DIR = pathlib.Path(__file__).parent.resolve()  # this module's directory; default root in load_recipe_text
+
+
+@dataclass(frozen=True)
+class Recipe:  # immutable (frozen) description of one built-in recipe
+    name: str  # recipe name; every RECIPES entry is keyed by this same string
+    description: str  # one-line summary included in format_recipe_table rows
+    model_id: str  # model identifier (presumably a Hugging Face repo id — confirm)
+    args_file: str  # args file path; load_recipe_text joins it onto the recipes directory
+    recommended_nodes: int  # shown as nodes=... in format_recipe_table rows
+    gpu: str  # shown as gpu=... in format_recipe_table rows; built-in values look like "H100:8"
+
+
+RECIPES = {recipe.name: recipe for recipe in (  # registry keyed by Recipe.name, declaration order kept
+    Recipe(
+        name="qwen25-0p5b-lora",
+        description="Single-node smoke test adapted from the upstream Miles LoRA example.",
+        model_id="Qwen/Qwen2.5-0.5B-Instruct",
+        args_file="tests/qwen25-0p5b-lora.args",
+        recommended_nodes=1,
+        gpu="H100:8",
+    ),
+    Recipe(
+        name="qwen3-30b-a3b-lora",
+        description="Single-node Qwen3-30B-A3B all-layer bridge-mode LoRA recipe aligned with current best practices.",
+        model_id="Qwen/Qwen3-30B-A3B",
+        args_file="qwen3-30b-a3b-lora.args",
+        recommended_nodes=1,
+        gpu="H100:8",
+    ),
+    Recipe(
+        name="qwen3-30b-a3b-lora-fewstep",
+        description="Single-node Qwen3-30B-A3B all-layer LoRA recipe trimmed to chase a few full RL steps.",
+        model_id="Qwen/Qwen3-30B-A3B",
+        args_file="tests/qwen3-30b-a3b-lora-fewstep.args",
+        recommended_nodes=1,
+        gpu="H100:8",
+    ),
+    Recipe(
+        name="qwen3-30b-a3b-lora-greedy-debug",
+        description="Single-node Qwen3-30B-A3B attention-only debug/control recipe with greedy rollout.",
+        model_id="Qwen/Qwen3-30B-A3B",
+        args_file="tests/qwen3-30b-a3b-lora-greedy-debug.args",
+        recommended_nodes=1,
+        gpu="H100:8",
+    ),
+    Recipe(
+        name="qwen3-30b-a3b-experts-lora",
+        description="Explicit all-layer Qwen3-30B-A3B LoRA recipe including expert linear_fc1/fc2 targets.",
+        model_id="Qwen/Qwen3-30B-A3B",
+        args_file="qwen3-30b-a3b-experts-lora.args",
+        recommended_nodes=1,
+        gpu="H100:8",
+    ),
+    Recipe(
+        name="qwen3-30b-a3b-experts-fewstep",
+        description="Explicit all-layer Qwen3-30B-A3B few-step recipe including expert linear_fc1/fc2 targets.",
+        model_id="Qwen/Qwen3-30B-A3B",
+        args_file="tests/qwen3-30b-a3b-experts-fewstep.args",
+        recommended_nodes=1,
+        gpu="H100:8",
+    ),
+)}
+
+
+def get_recipe(name: str) -> Recipe:
+    """Return the recipe registered under *name*; raise ValueError listing the known names."""
+    if (found := RECIPES.get(name)) is None:
+        raise ValueError(f"Unknown recipe: {name}. Available recipes: {', '.join(sorted(RECIPES))}")
+    return found
+
+
+def get_optional_recipe(name: str) -> Recipe | None:
+    """Return None for an empty *name*; otherwise defer to get_recipe, which may raise."""
+    # get_recipe is evaluated only when a name was actually supplied.
+    return get_recipe(name) if name else None
+
+
+def iter_recipes() -> list[Recipe]:
+    return sorted(RECIPES.values(), key=lambda recipe: recipe.name)  # new list, ordered by recipe name
+
+
+def clean_arg_text(arg_text: str) -> str:
+    """Return *arg_text* without blank lines or ``#`` comments, one stripped entry per line."""
+    import re  # function-scope import: `re` is not among this module's top-level imports
+    # Prefix of a line before its comment; a `#` inside quotes or after a backslash is data.
+    code = re.compile(r"""(?:\\.|'[^']*'|"(?:\\.|[^"\\])*"|[^#])*""")
+    kept = (code.match(raw).group().strip() for raw in arg_text.splitlines())
+    return "\n".join(entry for entry in kept if entry)
+
+
+def parse_arg_text(arg_text: str) -> list[str]:
+    """Split *arg_text* into argv-style tokens after clean_arg_text drops comments and blanks."""
+    return shlex.split(clean_arg_text(arg_text))  # shlex.split("") is already []
+
+
+def read_arg_file(path_str: str) -> str:
+    """Return the UTF-8 text of the file at *path_str*, or "" when no path is given."""
+    # Explicit encoding: the read_text default follows the host locale and varies by machine.
+    return pathlib.Path(path_str).read_text(encoding="utf-8") if path_str else ""
+
+
+def merge_arg_texts(*parts: str) -> str:
+    return "\n".join(filter(lambda chunk: chunk and chunk.strip(), parts))  # skip empty or blank parts
+
+
+def load_recipe_text(recipe: Recipe, base_dir: pathlib.Path | None = None) -> str:
+    """Return the UTF-8 text of *recipe*'s args file under *base_dir* (default: RECIPES_DIR)."""
+    return ((RECIPES_DIR if base_dir is None else base_dir) / recipe.args_file).read_text(encoding="utf-8")
+
+
+def format_recipe_table() -> list[str]:
+    rows: list[str] = []  # one bullet line per recipe, in iter_recipes (name) order
+    for item in iter_recipes():
+        detail = f"(model={item.model_id}, nodes={item.recommended_nodes}, gpu={item.gpu})"
+        rows.append(f"  - {item.name}: {item.description} " + detail)
+    return rows