From 84ecfeaa831a5eae2b4fcc7260e2ab63f0f2080a Mon Sep 17 00:00:00 2001 From: Jason Mancuso <7891333+jvmncs@users.noreply.github.com> Date: Mon, 23 Mar 2026 20:12:45 -0400 Subject: [PATCH 1/5] init miles example + lora recipes --- miles/README.md | 101 ++++ miles/modal_train.py | 531 ++++++++++++++++++ miles/recipes/glm4-7-flash-lora.args | 120 ++++ .../recipes/glm5-744b-a40b-20layer-lora.args | 119 ++++ miles/recipes/glm5-744b-a40b-4layer-lora.args | 119 ++++ miles/recipes/glm5-744b-a40b-lora.args | 119 ++++ miles/recipes/qwen25-0p5b-lora.args | 93 +++ 7 files changed, 1202 insertions(+) create mode 100644 miles/README.md create mode 100644 miles/modal_train.py create mode 100644 miles/recipes/glm4-7-flash-lora.args create mode 100644 miles/recipes/glm5-744b-a40b-20layer-lora.args create mode 100644 miles/recipes/glm5-744b-a40b-4layer-lora.args create mode 100644 miles/recipes/glm5-744b-a40b-lora.args create mode 100644 miles/recipes/qwen25-0p5b-lora.args diff --git a/miles/README.md b/miles/README.md new file mode 100644 index 0000000..26541f5 --- /dev/null +++ b/miles/README.md @@ -0,0 +1,101 @@ +# Miles example + +Multi-node Miles RL training on Modal using the same Ray bootstrap pattern as +[`ray/modal_train.py`](../ray/modal_train.py), but with Miles recipes stored as +native CLI flag files instead of Python config classes. + +## Prerequisites + +- A Modal account with multi-node access. +- A `huggingface-secret` Modal secret containing `HF_TOKEN` for gated model + downloads. +- Optionally export `WANDB_API_KEY` in your local shell before `modal run` to + auto-enable Weights & Biases logging for training runs. + +## Recipes + +List the built-in recipes: + +```bash +modal run miles/modal_train.py --list-recipes +``` + +Current recipes: + +- `qwen25-0p5b-lora`: single-node smoke test adapted from the upstream Miles + LoRA example. +- `glm4-7-flash-lora`: first real GLM MoE validation recipe. 
+- `glm5-744b-a40b-4layer-lora`: GLM-5 testing recipe using the 4-layer script + shape from upstream Miles. +- `glm5-744b-a40b-20layer-lora`: larger GLM-5 testing recipe using the 20-layer + script shape from upstream Miles. +- `glm5-744b-a40b-lora`: full GLM-5 starter recipe. + +## Prepare assets + +Prepare a small GSM8K dataset in the shared volume: + +```bash +modal run miles/modal_train.py::prepare_dataset +``` + +Download a recipe's base model into the shared Hugging Face cache: + +```bash +modal run miles/modal_train.py::download_model --recipe glm4-7-flash-lora +``` + +## Train + +The cluster size is chosen at import time by `MILES_N_NODES`, so set it in the +same shell invocation as `modal run`. + +Single-node smoke test: + +```bash +MILES_N_NODES=1 modal run miles/modal_train.py --recipe qwen25-0p5b-lora +``` + +GLM-4.7-Flash multi-node validation: + +```bash +MILES_N_NODES=4 modal run miles/modal_train.py --recipe glm4-7-flash-lora +``` + +GLM-5 4-layer testing recipe: + +```bash +MILES_N_NODES=1 modal run miles/modal_train.py --recipe glm5-744b-a40b-4layer-lora --gpu H200:8 +``` + +GLM-5 20-layer testing recipe: + +```bash +MILES_N_NODES=2 modal run miles/modal_train.py --recipe glm5-744b-a40b-20layer-lora --gpu H200:8 +``` + +Full GLM-5 starter recipe: + +```bash +MILES_N_NODES=8 modal run miles/modal_train.py --recipe glm5-744b-a40b-lora --gpu H200:8 +``` + +Useful options: + +- `--dry-run`: print the assembled Miles command with a `$MODEL_PATH` + placeholder without launching the cluster. +- `--extra-args "...flags..."`: append ad hoc Miles CLI overrides. +- `--extra-args-file path/to/file.args`: append overrides from a local text + file. +- `--custom-config path/to/overrides.yaml`: pass a flat YAML override map to + Miles via `--custom-config-path`. +- `--allow-cluster-mismatch`: bypass recipe/node-count validation if you are + intentionally adapting a canned recipe. 
+- `USE_LOCAL_MILES=/path/to/miles`: overlay a local Miles checkout on top of + the pinned container image. +- `MILES_IMAGE=radixark/miles:...`: override the pinned image tag. The current + default is `radixark/miles:dev-202603231227`. + +The wrapper intentionally owns only Modal/Ray plumbing plus a small set of +cluster-critical flags. All model and training settings live in +[`miles/recipes/`](./recipes/). diff --git a/miles/modal_train.py b/miles/modal_train.py new file mode 100644 index 0000000..35b075e --- /dev/null +++ b/miles/modal_train.py @@ -0,0 +1,531 @@ +""" +Thin Modal launcher for multi-node Miles training. + +Design: +- Bootstrap the Ray cluster once in modal.enter() inside a clustered modal.Cls. +- Submit the actual Miles job from a modal.method() on rank 0. +- Keep Miles recipes as native CLI flag files under miles/recipes/. +- Own only infrastructure-critical flags in Python: + cluster size, GPUs per node, model path resolution, checkpoint path, and + optional YAML override transport. 
+""" + +import datetime as dt +import os +import pathlib +import shlex +import subprocess +import time +from dataclasses import dataclass +from typing import Optional + +import modal +import modal.experimental + + +here = pathlib.Path(__file__).parent.resolve() + +APP_NAME = os.environ.get("MILES_APP_NAME", "miles-modal") +MILES_IMAGE = os.environ.get("MILES_IMAGE", "radixark/miles:dev-202603231227") +CLUSTER_NODES = int(os.environ.get("MILES_N_NODES", "1")) +DEFAULT_GPU = os.environ.get("MILES_GPU", "H100:8") +LOCAL_MILES_PATH = os.environ.get("USE_LOCAL_MILES", "") + +HF_CACHE_PATH = pathlib.Path("/root/.cache/huggingface") +DATA_PATH = pathlib.Path("/data") +CHECKPOINTS_PATH = pathlib.Path("/checkpoints") +REMOTE_RECIPES_DIR = pathlib.Path("/root/miles-recipes") +REMOTE_MILES_DIR = pathlib.Path("/root/miles") +REMOTE_TRAIN_SCRIPT = REMOTE_MILES_DIR / "train.py" + +RAY_PORT = 6379 +RAY_DASHBOARD_PORT = 8265 + +hf_cache_volume = modal.Volume.from_name("huggingface-cache", create_if_missing=True) +data_volume = modal.Volume.from_name("miles-example-data", create_if_missing=True) +checkpoints_volume = modal.Volume.from_name( + "miles-example-checkpoints", create_if_missing=True +) + + +@dataclass(frozen=True) +class Recipe: + name: str + description: str + model_id: str + args_file: str + recommended_nodes: int + gpu: str + + +RECIPES = { + "qwen25-0p5b-lora": Recipe( + name="qwen25-0p5b-lora", + description="Single-node smoke test adapted from the upstream Miles LoRA example.", + model_id="Qwen/Qwen2.5-0.5B-Instruct", + args_file="qwen25-0p5b-lora.args", + recommended_nodes=1, + gpu="H100:8", + ), + "glm4-7-flash-lora": Recipe( + name="glm4-7-flash-lora", + description="First real GLM MoE validation recipe on multiple nodes.", + model_id="zai-org/GLM-4.7-Flash", + args_file="glm4-7-flash-lora.args", + recommended_nodes=4, + gpu="H100:8", + ), + "glm5-744b-a40b-4layer-lora": Recipe( + name="glm5-744b-a40b-4layer-lora", + description="GLM-5 testing recipe using the 
upstream 4-layer model script shape.", + model_id="zai-org/GLM-5", + args_file="glm5-744b-a40b-4layer-lora.args", + recommended_nodes=1, + gpu="H200:8", + ), + "glm5-744b-a40b-20layer-lora": Recipe( + name="glm5-744b-a40b-20layer-lora", + description="GLM-5 testing recipe using the upstream 20-layer model script shape.", + model_id="zai-org/GLM-5", + args_file="glm5-744b-a40b-20layer-lora.args", + recommended_nodes=2, + gpu="H200:8", + ), + "glm5-744b-a40b-lora": Recipe( + name="glm5-744b-a40b-lora", + description="Full GLM-5 starter recipe for LoRA RLVR experiments.", + model_id="zai-org/GLM-5", + args_file="glm5-744b-a40b-lora.args", + recommended_nodes=8, + gpu="H200:8", + ), +} + + +def _get_recipe(name: str) -> Recipe: + if name not in RECIPES: + available = ", ".join(sorted(RECIPES)) + raise ValueError(f"Unknown recipe: {name}. Available recipes: {available}") + return RECIPES[name] + + +def _parse_gpus_per_node(gpu: str) -> int: + try: + return int(gpu.rsplit(":", 1)[1]) + except (IndexError, ValueError) as exc: + raise ValueError( + f"GPU spec must include a per-node count like 'H100:8'; got {gpu!r}" + ) from exc + + +def _clean_arg_text(arg_text: str) -> str: + lines: list[str] = [] + for raw_line in arg_text.splitlines(): + line = raw_line.split("#", 1)[0].strip() + if line: + lines.append(line) + return "\n".join(lines) + + +def _parse_arg_text(arg_text: str) -> list[str]: + cleaned = _clean_arg_text(arg_text) + return shlex.split(cleaned) if cleaned else [] + + +def _load_recipe_text(recipe: Recipe, remote: bool = False) -> str: + base_dir = REMOTE_RECIPES_DIR if remote else here / "recipes" + return (base_dir / recipe.args_file).read_text() + + +def _build_enforced_args( + *, + model_path: str, + cluster_nodes: int, + gpus_per_node: int, + checkpoint_dir: pathlib.Path, + custom_config_path: Optional[str], + wandb_key: Optional[str], +) -> list[str]: + args = [ + "--train-backend", + "megatron", + "--hf-checkpoint", + model_path, + "--ref-load", + 
model_path, + "--save", + checkpoint_dir.as_posix(), + "--actor-num-nodes", + str(cluster_nodes), + "--actor-num-gpus-per-node", + str(gpus_per_node), + "--num-gpus-per-node", + str(gpus_per_node), + "--colocate", + ] + if custom_config_path: + args.extend(["--custom-config-path", custom_config_path]) + if wandb_key: + args.extend(["--use-wandb", "--wandb-key", wandb_key]) + return args + + +def _build_miles_argv( + recipe: Recipe, + *, + model_path: str, + cluster_nodes: int, + gpus_per_node: int, + checkpoint_dir: pathlib.Path, + extra_args_text: str, + custom_config_path: Optional[str], + wandb_key: Optional[str], + remote_recipe: bool, +) -> list[str]: + recipe_args = _parse_arg_text(_load_recipe_text(recipe, remote=remote_recipe)) + extra_args = _parse_arg_text(extra_args_text) + enforced_args = _build_enforced_args( + model_path=model_path, + cluster_nodes=cluster_nodes, + gpus_per_node=gpus_per_node, + checkpoint_dir=checkpoint_dir, + custom_config_path=custom_config_path, + wandb_key=wandb_key, + ) + return ["python3", REMOTE_TRAIN_SCRIPT.as_posix(), *recipe_args, *extra_args, *enforced_args] + + +def _read_optional_file(path_str: str) -> str: + if not path_str: + return "" + return pathlib.Path(path_str).read_text() + + +def _build_runtime_env(master_addr: str, wandb_key: Optional[str]) -> dict: + env_vars = { + "MASTER_ADDR": master_addr, + "no_proxy": master_addr, + "PYTHONPATH": "/root/Megatron-LM", + "CUDA_DEVICE_MAX_CONNECTIONS": "1", + "NCCL_ALGO": "Ring", + "NVTE_ALLOW_NONDETERMINISTIC_ALGO": "0", + "CUBLAS_WORKSPACE_CONFIG": ":4096:8", + } + if wandb_key: + env_vars["WANDB_API_KEY"] = wandb_key + return {"env_vars": env_vars} + + +def _print_recipe_table(): + print("Available recipes:") + for recipe in sorted(RECIPES.values(), key=lambda item: item.name): + print( + f" - {recipe.name}: {recipe.description} " + f"(model={recipe.model_id}, nodes={recipe.recommended_nodes}, gpu={recipe.gpu})" + ) + + +image = ( + modal.Image.from_registry(MILES_IMAGE) 
+ .entrypoint([]) + .add_local_dir(here / "recipes", remote_path=REMOTE_RECIPES_DIR.as_posix(), copy=True) +) + +if LOCAL_MILES_PATH: + image = image.add_local_dir( + LOCAL_MILES_PATH, + remote_path=REMOTE_MILES_DIR.as_posix(), + copy=True, + ignore=["**/__pycache__", "**/*.pyc", "**/.git", "**/.venv"], + ).run_commands(f"pip install -e {REMOTE_MILES_DIR} --no-deps") + + +with image.imports(): + import ray + from huggingface_hub import snapshot_download + from ray.job_submission import JobSubmissionClient + + +app = modal.App(APP_NAME) + + +@app.cls( + image=image, + gpu=DEFAULT_GPU, + volumes={ + HF_CACHE_PATH.as_posix(): hf_cache_volume, + DATA_PATH.as_posix(): data_volume, + CHECKPOINTS_PATH.as_posix(): checkpoints_volume, + }, + timeout=24 * 60 * 60, + scaledown_window=60 * 60, + retries=2, + experimental_options={"efa_enabled": True}, +) +@modal.experimental.clustered(size=CLUSTER_NODES, rdma=True) +class MilesCluster: + @modal.enter() + def bootstrap_ray(self): + hf_cache_volume.reload() + data_volume.reload() + checkpoints_volume.reload() + self.rank = None + self.node_ips = [] + self.main_addr = None + self.node_addr = None + self.client = None + self._ray_ready = False + + def _ensure_ray_started(self): + if self._ray_ready: + return + + cluster_info = modal.experimental.get_cluster_info() + self.rank = cluster_info.rank + if cluster_info.container_ipv4_ips: + self.node_ips = cluster_info.container_ipv4_ips + elif CLUSTER_NODES == 1: + # Modal may omit container IPv4s for size-1 clustered functions. + self.node_ips = ["127.0.0.1"] + else: + raise RuntimeError( + "Modal did not provide container IPv4s for a multi-node cluster." 
+ ) + + self.main_addr = self.node_ips[0] + self.node_addr = self.node_ips[min(self.rank, len(self.node_ips) - 1)] + + if self.rank == 0: + print(f"Starting Ray head at {self.node_addr}") + subprocess.Popen( + [ + "ray", + "start", + "--head", + f"--node-ip-address={self.node_addr}", + "--dashboard-host=0.0.0.0", + "--disable-usage-stats", + ] + ) + + for _ in range(30): + try: + ray.init(address="auto") + break + except Exception: + time.sleep(1) + else: + raise RuntimeError("Failed to connect to the Ray head node") + + for _ in range(60): + alive_nodes = [node for node in ray.nodes() if node["Alive"]] + print(f"Alive nodes: {len(alive_nodes)}/{len(self.node_ips)}") + if len(alive_nodes) == len(self.node_ips): + break + time.sleep(1) + else: + raise RuntimeError("Not all Ray worker nodes connected") + + self.client = JobSubmissionClient(f"http://127.0.0.1:{RAY_DASHBOARD_PORT}") + print("Ray cluster is ready.") + else: + print(f"Starting Ray worker at {self.node_addr}, head={self.main_addr}") + subprocess.Popen( + [ + "ray", + "start", + f"--node-ip-address={self.node_addr}", + "--address", + f"{self.main_addr}:{RAY_PORT}", + "--disable-usage-stats", + ] + ) + self._ray_ready = True + + @modal.method() + async def submit_training( + self, + recipe_name: str, + *, + gpus_per_node: int, + extra_args_text: str = "", + custom_config_yaml: str = "", + wandb_key: str = "", + ) -> dict: + self._ensure_ray_started() + + if self.rank != 0: + while True: + time.sleep(10) + + recipe = _get_recipe(recipe_name) + + try: + model_path = snapshot_download(repo_id=recipe.model_id, local_files_only=True) + except Exception as exc: + raise RuntimeError( + f"Model {recipe.model_id} is not present in the shared HF cache. " + f"Run `modal run miles/modal_train.py::download_model --recipe {recipe.name}` first." 
+ ) from exc + + run_id = dt.datetime.utcnow().strftime("%Y%m%d-%H%M%S") + checkpoint_dir = CHECKPOINTS_PATH / recipe.name / run_id + custom_config_path = None + if custom_config_yaml: + custom_config_path = f"/tmp/{recipe.name}-{run_id}-overrides.yaml" + pathlib.Path(custom_config_path).write_text(custom_config_yaml) + + argv = _build_miles_argv( + recipe, + model_path=model_path, + cluster_nodes=CLUSTER_NODES, + gpus_per_node=gpus_per_node, + checkpoint_dir=checkpoint_dir, + extra_args_text=extra_args_text, + custom_config_path=custom_config_path, + wandb_key=wandb_key or None, + remote_recipe=True, + ) + entrypoint = shlex.join(argv) + runtime_env = _build_runtime_env(self.main_addr, wandb_key or None) + + print(f"Recipe: {recipe.name}") + print(f"Model: {recipe.model_id}") + print(f"Nodes: {CLUSTER_NODES}") + print(f"GPUs per node: {gpus_per_node}") + print(f"Checkpoint dir: {checkpoint_dir}") + print(f"Entrypoint: {entrypoint}") + + with modal.forward(RAY_DASHBOARD_PORT) as tunnel: + print(f"Dashboard URL: {tunnel.url}") + job_id = self.client.submit_job(entrypoint=entrypoint, runtime_env=runtime_env) + print(f"Submitted Ray job: {job_id}") + + async for line in self.client.tail_job_logs(job_id): + print(line, end="", flush=True) + + status = self.client.get_job_status(job_id).value + checkpoints_volume.commit() + print(f"\nFinal status: {status}") + return { + "job_id": job_id, + "status": status, + "recipe": recipe.name, + "checkpoint_dir": checkpoint_dir.as_posix(), + } + + +@app.function( + image=image, + volumes={HF_CACHE_PATH.as_posix(): hf_cache_volume}, + secrets=[modal.Secret.from_name("huggingface-secret")], + timeout=24 * 60 * 60, +) +def download_model( + recipe: str = "glm4-7-flash-lora", + revision: Optional[str] = None, + model_id: Optional[str] = None, +): + from huggingface_hub import snapshot_download + + resolved_model_id = model_id or _get_recipe(recipe).model_id + hf_cache_volume.reload() + path = snapshot_download( + 
repo_id=resolved_model_id, + revision=revision, + token=os.environ.get("HF_TOKEN"), + ) + print(f"Downloaded {resolved_model_id} to {path}") + hf_cache_volume.commit() + + +@app.function( + image=image, + volumes={DATA_PATH.as_posix(): data_volume}, + timeout=24 * 60 * 60, +) +def prepare_dataset( + hf_dataset: str = "zhuzilin/gsm8k", + data_folder: str = "gsm8k", +): + from datasets import load_dataset + + data_volume.reload() + dataset = load_dataset(hf_dataset) + output_dir = DATA_PATH / data_folder + output_dir.mkdir(parents=True, exist_ok=True) + dataset["train"].to_parquet((output_dir / "train.parquet").as_posix()) + dataset["test"].to_parquet((output_dir / "test.parquet").as_posix()) + data_volume.commit() + print(f"Prepared dataset {hf_dataset} under {output_dir}") + + +@app.local_entrypoint() +def main( + recipe: str = "qwen25-0p5b-lora", + gpu: str = "", + extra_args: str = "", + extra_args_file: str = "", + custom_config: str = "", + list_recipes: bool = False, + dry_run: bool = False, + allow_cluster_mismatch: bool = False, +): + if list_recipes: + _print_recipe_table() + return + + selected_recipe = _get_recipe(recipe) + selected_gpu = gpu or selected_recipe.gpu + gpus_per_node = _parse_gpus_per_node(selected_gpu) + + if ( + not allow_cluster_mismatch + and CLUSTER_NODES != selected_recipe.recommended_nodes + ): + raise ValueError( + f"Recipe {selected_recipe.name} expects MILES_N_NODES={selected_recipe.recommended_nodes}, " + f"but this process was started with MILES_N_NODES={CLUSTER_NODES}. " + f"Rerun with the recommended value or pass --allow-cluster-mismatch." 
+ ) + + merged_extra_args = "\n".join( + part for part in [extra_args, _read_optional_file(extra_args_file)] if part + ) + custom_config_yaml = _read_optional_file(custom_config) + wandb_key = os.environ.get("WANDB_API_KEY", "") + checkpoint_dir = CHECKPOINTS_PATH / selected_recipe.name / "DRY_RUN" + + if dry_run: + argv = _build_miles_argv( + selected_recipe, + model_path="$MODEL_PATH", + cluster_nodes=CLUSTER_NODES, + gpus_per_node=gpus_per_node, + checkpoint_dir=checkpoint_dir, + extra_args_text=merged_extra_args, + custom_config_path="/tmp/custom-config.yaml" if custom_config_yaml else None, + wandb_key="$WANDB_API_KEY" if wandb_key else None, + remote_recipe=False, + ) + print(f"Recipe: {selected_recipe.name}") + print(f"Model: {selected_recipe.model_id}") + print(f"Cluster nodes: {CLUSTER_NODES}") + print(f"GPU: {selected_gpu}") + print(shlex.join(argv)) + return + + print(f"Recipe: {selected_recipe.name}") + print(f"Model: {selected_recipe.model_id}") + print(f"Cluster nodes: {CLUSTER_NODES}") + print(f"GPU: {selected_gpu}") + + cluster = MilesCluster.with_options(gpu=selected_gpu)() + result = cluster.submit_training.remote( + recipe_name=selected_recipe.name, + gpus_per_node=gpus_per_node, + extra_args_text=merged_extra_args, + custom_config_yaml=custom_config_yaml, + wandb_key=wandb_key, + ) + print(result) diff --git a/miles/recipes/glm4-7-flash-lora.args b/miles/recipes/glm4-7-flash-lora.args new file mode 100644 index 0000000..0cf819f --- /dev/null +++ b/miles/recipes/glm4-7-flash-lora.args @@ -0,0 +1,120 @@ +# GLM-4.7-Flash LoRA validation recipe. + +# Model architecture from the upstream Miles model script. 
+--moe-layer-freq "[0]*1+[1]*46" +--num-experts 64 +--moe-shared-expert-intermediate-size 1536 +--moe-router-topk 4 +--moe-grouped-gemm +--moe-permute-fusion +--moe-ffn-hidden-size 1536 +--moe-router-score-function sigmoid +--moe-router-pre-softmax +--moe-router-enable-expert-bias +--moe-router-bias-update-rate 0 +--moe-router-load-balancing-type seq_aux_loss +--moe-router-topk-scaling-factor 1.8 +--moe-aux-loss-coeff 0 +--moe-router-dtype fp32 +--make-vocab-size-divisible-by 64 +--num-layers 47 +--hidden-size 2048 +--ffn-hidden-size 10240 +--num-attention-heads 20 +--disable-bias-linear +--add-qkv-bias +--swiglu +--untie-embeddings-and-output-weights +--position-embedding-type rope +--no-position-embedding +--normalization RMSNorm +--qk-layernorm +--multi-latent-attention +--q-lora-rank 768 +--kv-lora-rank 512 +--qk-head-dim 192 +--v-head-dim 256 +--kv-channels 192 +--qk-pos-emb-head-dim 64 +--vocab-size 154880 +--rotary-base 1000000 +--no-rope-fusion +--mtp-num-layers 1 + +# Checkpoint conversion +--megatron-to-hf-mode bridge + +# LoRA +--lora-rank 32 +--lora-alpha 32 +--lora-dropout 0.0 +--target-modules all-linear + +# Data and rollout +--prompt-data /data/gsm8k/train.parquet +--input-key messages +--label-key label +--apply-chat-template +--rollout-shuffle +--rm-type math +--num-rollout 100 +--rollout-batch-size 8 +--n-samples-per-prompt 8 +--rollout-max-response-len 2048 +--rollout-temperature 1 +--global-batch-size 64 + +# Evaluation +--eval-interval 10 +--eval-prompt-data gsm8k /data/gsm8k/test.parquet +--n-samples-per-eval-prompt 2 +--eval-max-response-len 4096 +--eval-top-k 1 + +# Parallelism and performance +--tensor-model-parallel-size 2 +--sequence-parallel +--pipeline-model-parallel-size 4 +--context-parallel-size 1 +--expert-model-parallel-size 4 +--expert-tensor-parallel-size 1 +--recompute-granularity full +--recompute-method uniform +--recompute-num-layers 1 +--use-dynamic-batch-size +--max-tokens-per-gpu 4096 + +# GRPO +--advantage-estimator grpo 
+--kl-loss-coef 0.0 +--kl-loss-type low_var_kl +--kl-coef 0.0 +--entropy-coef 0.0 +--eps-clip 0.2 +--eps-clip-high 0.28 + +# Optimizer +--optimizer adam +--lr 1e-5 +--lr-decay-style constant +--weight-decay 0.1 +--adam-beta1 0.9 +--adam-beta2 0.98 + +# Rollout serving +--rollout-num-gpus-per-engine 1 +--sglang-mem-fraction-static 0.55 + +# Training runtime +--attention-dropout 0.0 +--hidden-dropout 0.0 +--accumulate-allreduce-grads-in-fp32 +--attention-softmax-in-fp32 +--calculate-per-token-loss +--use-miles-router +--save-interval 25 + +# Logging defaults +--wandb-project miles-modal +--wandb-group glm4-7-flash-lora +--disable-wandb-random-suffix diff --git a/miles/recipes/glm5-744b-a40b-20layer-lora.args b/miles/recipes/glm5-744b-a40b-20layer-lora.args new file mode 100644 index 0000000..5a2e760 --- /dev/null +++ b/miles/recipes/glm5-744b-a40b-20layer-lora.args @@ -0,0 +1,119 @@ +# GLM-5 20-layer testing recipe. + +# Model architecture from the upstream Miles GLM-5 model script with the 20-layer override. 
+--spec miles_plugins.models.glm5.glm5 get_glm5_spec +--moe-layer-freq "[0]*3+[1]*17" +--num-experts 256 +--moe-shared-expert-intermediate-size 2048 +--moe-router-topk 8 +--moe-grouped-gemm +--moe-permute-fusion +--moe-ffn-hidden-size 2048 +--moe-router-score-function sigmoid +--moe-router-pre-softmax +--moe-router-enable-expert-bias +--moe-router-bias-update-rate 0 +--moe-router-load-balancing-type seq_aux_loss +--moe-router-topk-scaling-factor 2.5 +--moe-aux-loss-coeff 0 +--moe-router-dtype fp32 +--make-vocab-size-divisible-by 16 +--num-layers 20 +--hidden-size 6144 +--ffn-hidden-size 12288 +--num-attention-heads 64 +--disable-bias-linear +--swiglu +--untie-embeddings-and-output-weights +--position-embedding-type rope +--no-position-embedding +--normalization RMSNorm +--qk-layernorm +--multi-latent-attention +--q-lora-rank 2048 +--kv-lora-rank 512 +--qk-head-dim 192 +--v-head-dim 256 +--kv-channels 192 +--qk-pos-emb-head-dim 64 +--vocab-size 154880 +--rotary-base 1000000 +--enable-experimental + +# Checkpoint conversion +--megatron-to-hf-mode bridge + +# LoRA +--lora-rank 32 +--lora-alpha 32 +--lora-dropout 0.0 +--target-modules all-linear + +# Data and rollout +--prompt-data /data/gsm8k/train.parquet +--input-key messages +--label-key label +--apply-chat-template +--rollout-shuffle +--rm-type math +--num-rollout 100 +--rollout-batch-size 4 +--n-samples-per-prompt 8 +--rollout-max-response-len 1536 +--rollout-temperature 1 +--global-batch-size 32 + +# Evaluation +--eval-interval 10 +--eval-prompt-data gsm8k /data/gsm8k/test.parquet +--n-samples-per-eval-prompt 2 +--eval-max-response-len 3072 +--eval-top-k 1 + +# Parallelism and performance +--tensor-model-parallel-size 2 +--sequence-parallel +--pipeline-model-parallel-size 2 +--context-parallel-size 1 +--expert-model-parallel-size 4 +--expert-tensor-parallel-size 1 +--recompute-granularity full +--recompute-method uniform +--recompute-num-layers 1 +--use-dynamic-batch-size +--max-tokens-per-gpu 3072 + +# GRPO 
+--advantage-estimator grpo +--kl-loss-coef 0.0 +--kl-loss-type low_var_kl +--kl-coef 0.0 +--entropy-coef 0.0 +--eps-clip 0.2 +--eps-clip-high 0.28 + +# Optimizer +--optimizer adam +--lr 5e-6 +--lr-decay-style constant +--weight-decay 0.1 +--adam-beta1 0.9 +--adam-beta2 0.98 + +# Rollout serving +--rollout-num-gpus-per-engine 1 +--sglang-mem-fraction-static 0.5 + +# Training runtime +--attention-dropout 0.0 +--hidden-dropout 0.0 +--accumulate-allreduce-grads-in-fp32 +--attention-softmax-in-fp32 +--calculate-per-token-loss +--use-miles-router +--save-interval 25 + +# Logging defaults +--wandb-project miles-modal +--wandb-group glm5-744b-a40b-20layer-lora +--disable-wandb-random-suffix diff --git a/miles/recipes/glm5-744b-a40b-4layer-lora.args b/miles/recipes/glm5-744b-a40b-4layer-lora.args new file mode 100644 index 0000000..64ac703 --- /dev/null +++ b/miles/recipes/glm5-744b-a40b-4layer-lora.args @@ -0,0 +1,119 @@ +# GLM-5 4-layer testing recipe. + +# Model architecture from the upstream Miles GLM-5 model script with the 4-layer override. 
+--spec miles_plugins.models.glm5.glm5 get_glm5_spec +--moe-layer-freq "[0]*3+[1]*1" +--num-experts 256 +--moe-shared-expert-intermediate-size 2048 +--moe-router-topk 8 +--moe-grouped-gemm +--moe-permute-fusion +--moe-ffn-hidden-size 2048 +--moe-router-score-function sigmoid +--moe-router-pre-softmax +--moe-router-enable-expert-bias +--moe-router-bias-update-rate 0 +--moe-router-load-balancing-type seq_aux_loss +--moe-router-topk-scaling-factor 2.5 +--moe-aux-loss-coeff 0 +--moe-router-dtype fp32 +--make-vocab-size-divisible-by 16 +--num-layers 4 +--hidden-size 6144 +--ffn-hidden-size 12288 +--num-attention-heads 64 +--disable-bias-linear +--swiglu +--untie-embeddings-and-output-weights +--position-embedding-type rope +--no-position-embedding +--normalization RMSNorm +--qk-layernorm +--multi-latent-attention +--q-lora-rank 2048 +--kv-lora-rank 512 +--qk-head-dim 192 +--v-head-dim 256 +--kv-channels 192 +--qk-pos-emb-head-dim 64 +--vocab-size 154880 +--rotary-base 1000000 +--enable-experimental + +# Checkpoint conversion +--megatron-to-hf-mode bridge + +# LoRA +--lora-rank 32 +--lora-alpha 32 +--lora-dropout 0.0 +--target-modules all-linear + +# Data and rollout +--prompt-data /data/gsm8k/train.parquet +--input-key messages +--label-key label +--apply-chat-template +--rollout-shuffle +--rm-type math +--num-rollout 100 +--rollout-batch-size 4 +--n-samples-per-prompt 8 +--rollout-max-response-len 1024 +--rollout-temperature 1 +--global-batch-size 32 + +# Evaluation +--eval-interval 10 +--eval-prompt-data gsm8k /data/gsm8k/test.parquet +--n-samples-per-eval-prompt 2 +--eval-max-response-len 2048 +--eval-top-k 1 + +# Parallelism and performance +--tensor-model-parallel-size 2 +--sequence-parallel +--pipeline-model-parallel-size 1 +--context-parallel-size 1 +--expert-model-parallel-size 4 +--expert-tensor-parallel-size 1 +--recompute-granularity full +--recompute-method uniform +--recompute-num-layers 1 +--use-dynamic-batch-size +--max-tokens-per-gpu 3072 + +# GRPO 
+--advantage-estimator grpo +--kl-loss-coef 0.0 +--kl-loss-type low_var_kl +--kl-coef 0.0 +--entropy-coef 0.0 +--eps-clip 0.2 +--eps-clip-high 0.28 + +# Optimizer +--optimizer adam +--lr 5e-6 +--lr-decay-style constant +--weight-decay 0.1 +--adam-beta1 0.9 +--adam-beta2 0.98 + +# Rollout serving +--rollout-num-gpus-per-engine 1 +--sglang-mem-fraction-static 0.5 + +# Training runtime +--attention-dropout 0.0 +--hidden-dropout 0.0 +--accumulate-allreduce-grads-in-fp32 +--attention-softmax-in-fp32 +--calculate-per-token-loss +--use-miles-router +--save-interval 25 + +# Logging defaults +--wandb-project miles-modal +--wandb-group glm5-744b-a40b-4layer-lora +--disable-wandb-random-suffix diff --git a/miles/recipes/glm5-744b-a40b-lora.args b/miles/recipes/glm5-744b-a40b-lora.args new file mode 100644 index 0000000..81ebca5 --- /dev/null +++ b/miles/recipes/glm5-744b-a40b-lora.args @@ -0,0 +1,119 @@ +# Full GLM-5 starter recipe. + +# Model architecture from the upstream Miles GLM-5 model script. 
+--spec miles_plugins.models.glm5.glm5 get_glm5_spec +--moe-layer-freq "[0]*3+[1]*75" +--num-experts 256 +--moe-shared-expert-intermediate-size 2048 +--moe-router-topk 8 +--moe-grouped-gemm +--moe-permute-fusion +--moe-ffn-hidden-size 2048 +--moe-router-score-function sigmoid +--moe-router-pre-softmax +--moe-router-enable-expert-bias +--moe-router-bias-update-rate 0 +--moe-router-load-balancing-type seq_aux_loss +--moe-router-topk-scaling-factor 2.5 +--moe-aux-loss-coeff 0 +--moe-router-dtype fp32 +--make-vocab-size-divisible-by 16 +--num-layers 78 +--hidden-size 6144 +--ffn-hidden-size 12288 +--num-attention-heads 64 +--disable-bias-linear +--swiglu +--untie-embeddings-and-output-weights +--position-embedding-type rope +--no-position-embedding +--normalization RMSNorm +--qk-layernorm +--multi-latent-attention +--q-lora-rank 2048 +--kv-lora-rank 512 +--qk-head-dim 192 +--v-head-dim 256 +--kv-channels 192 +--qk-pos-emb-head-dim 64 +--vocab-size 154880 +--rotary-base 1000000 +--enable-experimental + +# Checkpoint conversion +--megatron-to-hf-mode bridge + +# LoRA +--lora-rank 32 +--lora-alpha 32 +--lora-dropout 0.0 +--target-modules all-linear + +# Data and rollout +--prompt-data /data/gsm8k/train.parquet +--input-key messages +--label-key label +--apply-chat-template +--rollout-shuffle +--rm-type math +--num-rollout 100 +--rollout-batch-size 2 +--n-samples-per-prompt 8 +--rollout-max-response-len 2048 +--rollout-temperature 1 +--global-batch-size 16 + +# Evaluation +--eval-interval 10 +--eval-prompt-data gsm8k /data/gsm8k/test.parquet +--n-samples-per-eval-prompt 2 +--eval-max-response-len 4096 +--eval-top-k 1 + +# Parallelism and performance +--tensor-model-parallel-size 2 +--sequence-parallel +--pipeline-model-parallel-size 4 +--context-parallel-size 1 +--expert-model-parallel-size 8 +--expert-tensor-parallel-size 1 +--recompute-granularity full +--recompute-method uniform +--recompute-num-layers 1 +--use-dynamic-batch-size +--max-tokens-per-gpu 2048 + +# GRPO 
+--advantage-estimator grpo +--kl-loss-coef 0.0 +--kl-loss-type low_var_kl +--kl-coef 0.0 +--entropy-coef 0.0 +--eps-clip 0.2 +--eps-clip-high 0.28 + +# Optimizer +--optimizer adam +--lr 5e-6 +--lr-decay-style constant +--weight-decay 0.1 +--adam-beta1 0.9 +--adam-beta2 0.98 + +# Rollout serving +--rollout-num-gpus-per-engine 1 +--sglang-mem-fraction-static 0.45 + +# Training runtime +--attention-dropout 0.0 +--hidden-dropout 0.0 +--accumulate-allreduce-grads-in-fp32 +--attention-softmax-in-fp32 +--calculate-per-token-loss +--use-miles-router +--save-interval 25 + +# Logging defaults +--wandb-project miles-modal +--wandb-group glm5-744b-a40b-lora +--disable-wandb-random-suffix diff --git a/miles/recipes/qwen25-0p5b-lora.args b/miles/recipes/qwen25-0p5b-lora.args new file mode 100644 index 0000000..a3cffc7 --- /dev/null +++ b/miles/recipes/qwen25-0p5b-lora.args @@ -0,0 +1,93 @@ +# Qwen2.5-0.5B single-node smoke test adapted from the upstream Miles LoRA demo. + +# Model architecture +--swiglu +--num-layers 24 +--hidden-size 896 +--ffn-hidden-size 4864 +--num-attention-heads 14 +--use-rotary-position-embeddings +--disable-bias-linear +--add-qkv-bias +--normalization RMSNorm +--norm-epsilon 1e-6 +--rotary-base 1000000 +--group-query-attention +--num-query-groups 2 +--vocab-size 151936 + +# Checkpoint conversion +--megatron-to-hf-mode bridge + +# LoRA +--lora-rank 32 +--lora-alpha 32 +--lora-dropout 0.0 +--target-modules all-linear + +# Data and rollout +--prompt-data /data/gsm8k/train.parquet +--input-key messages +--label-key label +--apply-chat-template +--rollout-shuffle +--rm-type math +--num-rollout 100 +--rollout-batch-size 32 +--n-samples-per-prompt 8 +--rollout-max-response-len 1024 +--rollout-temperature 1 +--global-batch-size 256 + +# Evaluation +--eval-interval 10 +--eval-prompt-data gsm8k /data/gsm8k/test.parquet +--n-samples-per-eval-prompt 1 +--eval-max-response-len 1024 +--eval-top-k 1 + +# Parallelism and performance +--tensor-model-parallel-size 1 
+--sequence-parallel +--pipeline-model-parallel-size 1 +--context-parallel-size 1 +--expert-model-parallel-size 1 +--expert-tensor-parallel-size 1 +--use-dynamic-batch-size +--max-tokens-per-gpu 9216 + +# GRPO +--advantage-estimator grpo +--kl-loss-coef 0.0 +--kl-loss-type low_var_kl +--kl-coef 0.0 +--entropy-coef 0.0 +--eps-clip 0.2 +--eps-clip-high 0.28 + +# Optimizer +--optimizer adam +--lr 1e-5 +--lr-decay-style constant +--weight-decay 0.1 +--adam-beta1 0.9 +--adam-beta2 0.98 + +# Rollout serving +--rollout-num-gpus-per-engine 1 +--sglang-mem-fraction-static 0.4 + +# Training runtime +--attention-dropout 0.0 +--hidden-dropout 0.0 +--accumulate-allreduce-grads-in-fp32 +--attention-softmax-in-fp32 +--attention-backend flash +--calculate-per-token-loss +--use-miles-router +--save-interval 25 + +# Logging defaults +--wandb-project miles-modal +--wandb-group qwen25-0p5b-lora +--disable-wandb-random-suffix From 8d9664f26bf2af9c53b540d93fdd92e80418d255 Mon Sep 17 00:00:00 2001 From: Jason Mancuso <7891333+jvmncs@users.noreply.github.com> Date: Thu, 26 Mar 2026 19:04:32 -0400 Subject: [PATCH 2/5] Validate Qwen3-30B-A3B LoRA on Modal and move test recipes --- miles/README.md | 98 ++++- miles/modal_patches/sitecustomize.py | 415 ++++++++++++++++++ miles/modal_train.py | 74 ++-- miles/recipes/glm4-7-flash-lora.args | 120 ----- .../recipes/glm5-744b-a40b-20layer-lora.args | 119 ----- ...a.args => qwen3-30b-a3b-experts-lora.args} | 94 ++-- ...ayer-lora.args => qwen3-30b-a3b-lora.args} | 95 ++-- .../recipes/{ => tests}/qwen25-0p5b-lora.args | 0 .../tests/qwen3-30b-a3b-experts-fewstep.args | 110 +++++ .../tests/qwen3-30b-a3b-lora-fewstep.args | 110 +++++ .../qwen3-30b-a3b-lora-greedy-debug.args | 111 +++++ 11 files changed, 964 insertions(+), 382 deletions(-) create mode 100644 miles/modal_patches/sitecustomize.py delete mode 100644 miles/recipes/glm4-7-flash-lora.args delete mode 100644 miles/recipes/glm5-744b-a40b-20layer-lora.args rename 
miles/recipes/{glm5-744b-a40b-lora.args => qwen3-30b-a3b-experts-lora.args} (56%) rename miles/recipes/{glm5-744b-a40b-4layer-lora.args => qwen3-30b-a3b-lora.args} (56%) rename miles/recipes/{ => tests}/qwen25-0p5b-lora.args (100%) create mode 100644 miles/recipes/tests/qwen3-30b-a3b-experts-fewstep.args create mode 100644 miles/recipes/tests/qwen3-30b-a3b-lora-fewstep.args create mode 100644 miles/recipes/tests/qwen3-30b-a3b-lora-greedy-debug.args diff --git a/miles/README.md b/miles/README.md index 26541f5..9d7c67c 100644 --- a/miles/README.md +++ b/miles/README.md @@ -24,12 +24,16 @@ Current recipes: - `qwen25-0p5b-lora`: single-node smoke test adapted from the upstream Miles LoRA example. -- `glm4-7-flash-lora`: first real GLM MoE validation recipe. -- `glm5-744b-a40b-4layer-lora`: GLM-5 testing recipe using the 4-layer script - shape from upstream Miles. -- `glm5-744b-a40b-20layer-lora`: larger GLM-5 testing recipe using the 20-layer - script shape from upstream Miles. -- `glm5-744b-a40b-lora`: full GLM-5 starter recipe. +- `qwen3-30b-a3b-lora`: first-pass Qwen3-30B-A3B bridge-mode LoRA validation + recipe, restricted to attention targets (`linear_qkv`, `linear_proj`). +- `qwen3-30b-a3b-lora-fewstep`: trimmed attention-only recipe that is intended + to prove a few full RL updates on Modal. +- `qwen3-30b-a3b-experts-lora`: second-pass Qwen3-30B-A3B recipe widened to + expert `linear_fc1` and `linear_fc2` targets after the baseline path works. +- `qwen3-30b-a3b-experts-fewstep`: trimmed expert-target recipe built from the + working few-step shape. + +Testing/debug recipe files live under [`recipes/tests/`](./recipes/tests). 
## Prepare assets @@ -42,7 +46,7 @@ modal run miles/modal_train.py::prepare_dataset Download a recipe's base model into the shared Hugging Face cache: ```bash -modal run miles/modal_train.py::download_model --recipe glm4-7-flash-lora +modal run miles/modal_train.py::download_model --recipe qwen3-30b-a3b-lora ``` ## Train @@ -56,30 +60,94 @@ Single-node smoke test: MILES_N_NODES=1 modal run miles/modal_train.py --recipe qwen25-0p5b-lora ``` -GLM-4.7-Flash multi-node validation: +Qwen3-30B-A3B baseline LoRA validation: ```bash -MILES_N_NODES=4 modal run miles/modal_train.py --recipe glm4-7-flash-lora +MILES_N_NODES=1 modal run miles/modal_train.py --recipe qwen3-30b-a3b-lora ``` -GLM-5 4-layer testing recipe: +Qwen3-30B-A3B few-step attention-only validation: ```bash -MILES_N_NODES=1 modal run miles/modal_train.py --recipe glm5-744b-a40b-4layer-lora --gpu H200:8 +MILES_N_NODES=1 modal run miles/modal_train.py --recipe qwen3-30b-a3b-lora-fewstep ``` -GLM-5 20-layer testing recipe: +Qwen3-30B-A3B expert-target LoRA follow-up: ```bash -MILES_N_NODES=2 modal run miles/modal_train.py --recipe glm5-744b-a40b-20layer-lora --gpu H200:8 +MILES_N_NODES=1 modal run miles/modal_train.py --recipe qwen3-30b-a3b-experts-lora ``` -Full GLM-5 starter recipe: +Qwen3-30B-A3B expert-target few-step validation: ```bash -MILES_N_NODES=8 modal run miles/modal_train.py --recipe glm5-744b-a40b-lora --gpu H200:8 +MILES_N_NODES=1 modal run miles/modal_train.py --recipe qwen3-30b-a3b-experts-fewstep ``` +## Qwen3 Notes + +- Start with standard LoRA, not DoRA. Miles' current rollout sync and adapter + filtering are LoRA-specific and keyed off `lora_A` / `lora_B` names, so DoRA + is not the first validation target. +- The baked Qwen3 recipes are single-node `H100:8` shapes. They are intended to + validate end-to-end bridge-mode LoRA with colocated rollout first, not to + exhaustively cover every parallelism combination. 
+- Source inspection suggests the training path should handle TP / PP / EP / CP + because the bridge setup forwards all of those settings into Megatron-Bridge, + and Megatron-Bridge's PEFT tests cover pipeline-style model chunk lists. That + is still weaker than an actual Miles e2e validation for each shape. +- Miles currently supports LoRA weight sync only for colocated rollout engines. + Distributed non-colocated rollout sync is not yet implemented for LoRA. +- The baseline Qwen3 recipe stays close to the upstream Miles single-node + Qwen3-30B-A3B shape. The expert-target recipe is a follow-on experiment, not + the initial correctness target. + +## Observed On Modal + +The current wrapper includes runtime patches in +[`modal_patches/sitecustomize.py`](./modal_patches/sitecustomize.py) that: + +- register Megatron-Bridge's `LinearCrossEntropyModule` as column-parallel + before Hugging Face weights are loaded, which fixes bridge-mode Qwen3 load on + `output_layer.weight`; +- serialize colocated LoRA weight buckets in a builtins-only format and + rehydrate them inside SGLang, which fixes the Modal colocated LoRA sync path; +- sanitize non-finite SGLang logprob values before JSON serialization; +- sanitize invalid SGLang sampling probability rows before `torch.multinomial`. + +What the Modal runs have validated so far on `modal-labs`: + +- `qwen3-30b-a3b-lora` gets through bridge-mode LoRA creation and attention + module injection (`linear_qkv`, `linear_proj`), and it can start loading the + Hugging Face checkpoint into Megatron. +- `qwen3-30b-a3b-lora-fewstep` now gets through full RL training on Modal. In + recent runs it passed rollout, weight sync, and actor training repeatedly and + reached at least `train/step` 6 on a single-node `H100:8` shape. 
+- `qwen3-30b-a3b-experts-lora` goes further: it creates LoRA with + `linear_qkv`, `linear_proj`, `linear_fc1`, and `linear_fc2`, injects those + expert targets under `decoder.layers.*.mlp.experts.*`, loads weights, pushes + the adapter into SGLang, and starts `Eval gsm8k`. +- `qwen3-30b-a3b-experts-fewstep` has validated the widened target surface on + Modal: Miles creates LoRA with `linear_fc1` / `linear_fc2`, injects those + expert modules, completes weight sync, and reaches rollout collection plus + actor training. A detached confirmation of a full expert-target train step is + still in progress. +- The remaining instability has been in the colocated SGLang rollout path, not + in LoRA target discovery. The main concrete runtime failures we hit were: + non-finite logprobs breaking HTTP JSON serialization, and invalid sampling + probability tensors breaking `torch.multinomial`. + +Current interpretation: + +- Qwen3-30B-A3B MoE LoRA support in Miles is real enough to instantiate, + target, load, and export adapters for both attention and expert MLP layers. +- Attention-only Qwen3-30B-A3B LoRA is now runtime-validated for repeated RL + updates on `modal-labs`. +- The remaining risk is concentrated in the colocated SGLang rollout lifecycle, + which is coupled to `offload_rollout` / `enable_memory_saver=True` in the + current Miles SGLang engine setup, especially once expert-target LoRA is + enabled. 
"""Modal runtime patches loaded automatically via PYTHONPATH.

The Modal wrapper puts this directory on ``PYTHONPATH`` so the interpreter
imports this ``sitecustomize`` module at startup.  Importing it applies, in
order, a set of best-effort monkeypatches:

* register Megatron-Bridge's ``LinearCrossEntropyModule`` as column-parallel;
* serialize colocated LoRA weight buckets into a builtins-only payload;
* rehydrate that payload inside SGLang's ``TpModelWorker``;
* sanitize non-finite SGLang logprobs before they are returned;
* sanitize invalid SGLang sampling probability rows before sampling.

Every patch is optional: when its target package is not importable (for
example in processes that never touch SGLang) the patch logs one line and is
skipped.  Patches are isolated from each other so that one failing patch can
never prevent the following ones from being applied.
"""

import base64
import io
import math

# Keys/tags of the builtins-only LoRA payload exchanged between the trainer
# (sender) and SGLang (receiver).  Both sides live in this module, so the tags
# are defined once here.
_FORMAT_KEY = "_miles_modal_format"
_FORMAT_TORCH_SAVE_V2 = "torch_save_flattened_lora_v2"
_FORMAT_RAW_V1 = "raw_flattened_lora_v1"

# Only the first few sanitization events are logged to keep hot paths quiet.
_MAX_SANITIZE_LOGS = 8


def _log(message: str) -> None:
    """Print *message* immediately (unbuffered) so it shows up in Modal logs."""
    print(message, flush=True)


def _describe(exc: BaseException) -> str:
    """Return ``"ExcType: message"`` for log lines."""
    return f"{type(exc).__name__}: {exc}"


def _already_patched(func) -> bool:
    """Return True when *func* was defined by this module (i.e. is our wrapper)."""
    return getattr(func, "__module__", "") == __name__


def _clamp_logprob(value):
    """Clamp one logprob to a finite, non-positive float.

    Returns ``(sanitized, changed)``.  Values that cannot be converted with
    ``float()`` are returned untouched with ``changed=False``; NaN, +/-inf and
    positive values become ``0.0`` with ``changed=True``; everything else is
    returned as a float with ``changed=False``.
    """
    try:
        numeric = float(value)
    except Exception:
        # Deliberately broad: this runs on a serving hot path and must never
        # turn an odd placeholder (None, multi-element tensor, ...) into a crash.
        return value, False

    if math.isfinite(numeric) and numeric <= 0.0:
        return numeric, False
    # NOTE(review): -inf (probability 0) is mapped to 0.0 (probability 1) just
    # like NaN/+inf.  That keeps the value JSON-serializable, but confirm that
    # downstream consumers of rollout logprobs tolerate it.
    return 0.0, True


def _encode_lora_bucket(torch, flattened_tensor, metadata) -> dict:
    """Encode one flattened LoRA bucket as a builtins-only dict.

    Args:
        torch: the already-imported ``torch`` module (kept lazy so that
            importing this file never requires torch).
        flattened_tensor: tensor returned by the bucket's
            ``get_flattened_tensor()``.
        metadata: iterable of per-tensor metadata objects exposing ``name``,
            ``shape``, ``dtype``, ``start_idx``, ``end_idx`` and ``numel``.

    Raises:
        TypeError: if *flattened_tensor* is not a ``torch.Tensor``.
    """
    if not isinstance(flattened_tensor, torch.Tensor):
        raise TypeError(
            "Expected LoRA flattened tensor to be a torch.Tensor, got "
            f"{type(flattened_tensor).__name__}"
        )
    if flattened_tensor.is_cuda:
        flattened_tensor = flattened_tensor.detach().cpu()

    buffer = io.BytesIO()
    torch.save(flattened_tensor.contiguous(), buffer)
    return {
        _FORMAT_KEY: _FORMAT_TORCH_SAVE_V2,
        "flattened_tensor_torch_save_b64": base64.b64encode(buffer.getvalue()).decode("ascii"),
        "metadata": [
            {
                "name": meta.name,
                "shape": list(meta.shape),
                "dtype": str(meta.dtype).removeprefix("torch."),
                "start_idx": meta.start_idx,
                "end_idx": meta.end_idx,
                "numel": meta.numel,
            }
            for meta in metadata
        ],
    }


def _decode_lora_bucket(torch, metadata_cls, payload):
    """Inverse of :func:`_encode_lora_bucket`.

    Returns ``(flattened_tensor, metadata)`` ready to be handed to SGLang's
    ``FlattenedTensorBucket``.  Payloads without a format tag are assumed to be
    the stock SGLang layout and are passed through unchanged.
    """
    format_tag = payload.get(_FORMAT_KEY)
    if format_tag == _FORMAT_TORCH_SAVE_V2:
        raw_bytes = base64.b64decode(payload["flattened_tensor_torch_save_b64"])
        # The payload is a single plain tensor, so restrict unpickling to
        # tensor/weight types instead of allowing arbitrary pickled objects.
        flattened_tensor = torch.load(
            io.BytesIO(raw_bytes),
            map_location="cpu",
            weights_only=True,
        )
    elif format_tag == _FORMAT_RAW_V1:
        # NOTE(review): nothing in this module emits the v1 format any more and
        # the uint8 buffer is forwarded as-is; confirm whether this legacy
        # branch is still needed.
        raw_bytes = base64.b64decode(payload["flattened_tensor_b64"])
        flattened_tensor = torch.frombuffer(
            memoryview(raw_bytes),
            dtype=torch.uint8,
        ).clone()
    else:
        return payload["flattened_tensor"], payload["metadata"]

    metadata = [
        metadata_cls(
            name=meta["name"],
            shape=torch.Size(meta["shape"]),
            dtype=getattr(torch, meta["dtype"].removeprefix("torch.")),
            start_idx=meta["start_idx"],
            end_idx=meta["end_idx"],
            numel=meta["numel"],
        )
        for meta in payload["metadata"]
    ]
    return flattened_tensor, metadata


def _register_linear_cross_entropy_module() -> None:
    """Register ``LinearCrossEntropyModule`` as column-parallel in Megatron-Bridge."""
    try:
        from megatron.bridge.models.conversion.param_mapping import AutoMapping
    except Exception as exc:
        _log(
            "[miles-modal] bridge patch unavailable for LinearCrossEntropyModule: "
            f"{_describe(exc)}"
        )
        return

    try:
        AutoMapping.register_module_type("LinearCrossEntropyModule", "column")
    except Exception as exc:
        # A second registration is harmless; recognise it from the message.
        message = str(exc).lower()
        if any(token in message for token in ("already", "exists", "duplicate")):
            _log(
                "[miles-modal] bridge patch already present for "
                "LinearCrossEntropyModule"
            )
            return
        _log(
            "[miles-modal] bridge patch failed for LinearCrossEntropyModule: "
            f"{_describe(exc)}"
        )
        return

    _log("[miles-modal] registered LinearCrossEntropyModule as column parallel")


def _patch_lora_cpu_serialization() -> None:
    """Replace Miles' ``_send_to_colocated_engine`` with a LoRA-safe variant.

    The replacement keeps the non-LoRA path unchanged and, for LoRA, ships
    each flattened bucket as a builtins-only payload (see
    :func:`_encode_lora_bucket`).
    """
    try:
        import torch
        from miles.backends.megatron_utils.update_weight import (
            update_weight_from_tensor as update_weight_mod,
        )
    except Exception as exc:
        _log(
            "[miles-modal] LoRA CPU serialization patch unavailable: "
            f"{_describe(exc)}"
        )
        return

    original = getattr(update_weight_mod, "_send_to_colocated_engine", None)
    if original is None:
        _log("[miles-modal] LoRA CPU serialization patch missing target function")
        return

    if _already_patched(original):
        _log("[miles-modal] LoRA CPU serialization patch already present")
        return

    try:
        dist = update_weight_mod.dist
        ray = update_weight_mod.ray
        FlattenedTensorBucket = update_weight_mod.FlattenedTensorBucket
        MultiprocessingSerializer = update_weight_mod.MultiprocessingSerializer
    except AttributeError as exc:
        # Upstream moved/renamed a helper: skip this patch, keep the others.
        _log(
            "[miles-modal] LoRA CPU serialization patch unavailable: "
            f"{_describe(exc)}"
        )
        return

    def _send_to_colocated_engine(
        hf_named_tensors,
        *,
        ipc_engine,
        ipc_gather_src,
        ipc_gather_group,
        weight_version=None,
        lora_config=None,
        lora_name=None,
        lora_loaded=False,
    ):
        """Serialize, gather and dispatch one batch of named tensors.

        Returns ``(refs, long_live_tensors)``: the Ray object refs of the
        remote load calls (only non-empty on the gather destination rank) and
        the payloads that must stay alive until those calls complete.
        """
        # Placeholder ranks (GPU slots reserved but no engine) have no gather group.
        # gather_object is only collective among group members, so we skip entirely.
        if ipc_gather_group is None:
            return [], None

        is_lora = lora_config is not None
        long_live_tensors = []

        if getattr(FlattenedTensorBucket, "supports_multi_dtypes", False):
            buckets_by_dtype = {"dtype": hf_named_tensors}
        else:
            buckets_by_dtype = {}
            for name, tensor in hf_named_tensors:
                buckets_by_dtype.setdefault(tensor.dtype, []).append((name, tensor))

        serialized_tensors = []
        for named_tensors in buckets_by_dtype.values():
            bucket = FlattenedTensorBucket(named_tensors=named_tensors)
            flattened_tensor = bucket.get_flattened_tensor()

            if is_lora:
                # Modal's colocated LoRA sync can fail on CUDA IPC, and CPU torch.Tensor
                # pickling still goes through multiprocessing resource_sharer. Serialize
                # LoRA flattened buckets into a builtins-only payload so SGLang's safe
                # unpickler can accept it without touching multiprocessing shims.
                payload = _encode_lora_bucket(torch, flattened_tensor, bucket.get_metadata())
            else:
                payload = {
                    "flattened_tensor": flattened_tensor,
                    "metadata": bucket.get_metadata(),
                }
            long_live_tensors.append(payload)
            serialized_tensors.append(
                MultiprocessingSerializer.serialize(payload, output_str=True)
            )

        is_gather_dst = dist.get_rank() == ipc_gather_src
        gathered = (
            [None] * dist.get_world_size(ipc_gather_group) if is_gather_dst else None
        )
        dist.gather_object(
            serialized_tensors,
            object_gather_list=gathered,
            dst=ipc_gather_src,
            group=ipc_gather_group,
        )

        refs = []
        if not is_gather_dst:
            return refs, long_live_tensors

        if is_lora:
            if lora_loaded:
                ray.get(ipc_engine.unload_lora_adapter.remote(lora_name=lora_name))

            # NOTE(review): only the first bucket of the first gathered rank is
            # forwarded for LoRA; this presumably relies on every rank holding
            # the same single-dtype adapter -- confirm against upstream Miles.
            refs.append(
                ipc_engine.load_lora_adapter_from_tensors.remote(
                    lora_name=lora_name,
                    config_dict=lora_config,
                    serialized_tensors=gathered[0][0],
                    load_format="flattened_bucket",
                )
            )
        else:
            num_dtypes = len(gathered[0])
            for i in range(num_dtypes):
                refs.append(
                    ipc_engine.update_weights_from_tensor.remote(
                        serialized_named_tensors=[tensors[i] for tensors in gathered],
                        load_format="flattened_bucket",
                        weight_version=str(weight_version),
                    )
                )

        return refs, long_live_tensors

    update_weight_mod._send_to_colocated_engine = _send_to_colocated_engine
    _log("[miles-modal] patched colocated LoRA sync to builtins-only flattened buckets")


def _patch_sglang_lora_numpy_rehydration() -> None:
    """Teach SGLang's ``TpModelWorker`` to load the builtins-only LoRA payload.

    (The name is historical: the payload is decoded with ``torch.load``, not
    numpy.)
    """
    try:
        import torch
        from sglang.srt.managers import tp_worker as tp_worker_mod
        from sglang.srt.weight_sync.tensor_bucket import FlattenedTensorMetadata
    except Exception as exc:
        _log(
            "[miles-modal] SGLang LoRA rehydration patch unavailable: "
            f"{_describe(exc)}"
        )
        return

    TpModelWorker = getattr(tp_worker_mod, "TpModelWorker", None)
    if TpModelWorker is None:
        _log("[miles-modal] SGLang LoRA rehydration patch missing TpModelWorker")
        return

    original = getattr(TpModelWorker, "load_lora_adapter_from_tensors", None)
    if original is None:
        _log("[miles-modal] SGLang LoRA rehydration patch missing target method")
        return

    if _already_patched(original):
        _log("[miles-modal] SGLang LoRA rehydration patch already present")
        return

    try:
        MultiprocessingSerializer = tp_worker_mod.MultiprocessingSerializer
        FlattenedTensorBucket = tp_worker_mod.FlattenedTensorBucket
    except AttributeError as exc:
        # Upstream moved/renamed a helper: skip this patch, keep the others.
        _log(
            "[miles-modal] SGLang LoRA rehydration patch unavailable: "
            f"{_describe(exc)}"
        )
        return

    def load_lora_adapter_from_tensors(self, recv_req):
        """Deserialize the adapter tensors and forward them to the model runner."""
        # The LoRA code handles TP sharding internally using slice_lora_a_weights
        # and slice_lora_b_weights methods (see lora/layers.py:46-49, mem_pool.py:437-440).
        if recv_req.load_format == "flattened_bucket":
            flattened_data = MultiprocessingSerializer.deserialize(
                recv_req.serialized_tensors
            )
            flattened_tensor, metadata = _decode_lora_bucket(
                torch, FlattenedTensorMetadata, flattened_data
            )
            bucket = FlattenedTensorBucket(
                flattened_tensor=flattened_tensor,
                metadata=metadata,
            )
            tensors = dict(bucket.reconstruct_tensors())
        else:
            tensors = MultiprocessingSerializer.deserialize(recv_req.serialized_tensors)
        return self.model_runner.load_lora_adapter_from_tensors(
            recv_req.to_ref(),
            tensors,
            recv_req.config_dict,
            recv_req.added_tokens_config,
        )

    TpModelWorker.load_lora_adapter_from_tensors = load_lora_adapter_from_tensors
    _log("[miles-modal] patched SGLang LoRA load path to rehydrate builtins-only flattened buckets")


def _patch_sglang_logprob_sanitization() -> None:
    """Wrap ``TokenizerManager.detokenize_logprob_tokens`` to clamp bad logprobs."""
    try:
        from sglang.srt.managers import tokenizer_manager as tokenizer_manager_mod
    except Exception as exc:
        _log(
            "[miles-modal] SGLang logprob sanitization patch unavailable: "
            f"{_describe(exc)}"
        )
        return

    TokenizerManager = getattr(tokenizer_manager_mod, "TokenizerManager", None)
    if TokenizerManager is None:
        _log("[miles-modal] SGLang logprob sanitization patch missing TokenizerManager")
        return

    original = getattr(TokenizerManager, "detokenize_logprob_tokens", None)
    if original is None:
        _log("[miles-modal] SGLang logprob sanitization patch missing target method")
        return

    if _already_patched(original):
        _log("[miles-modal] SGLang logprob sanitization patch already present")
        return

    sanitized_count = 0

    def _sanitize_logprob(value):
        """Clamp *value* and log the first few replacements."""
        nonlocal sanitized_count
        sanitized, changed = _clamp_logprob(value)
        if changed:
            sanitized_count += 1
            if sanitized_count <= _MAX_SANITIZE_LOGS:
                _log(
                    "[miles-modal] sanitized SGLang logprob "
                    f"{float(value)!r} -> {sanitized!r}"
                )
        return sanitized

    def detokenize_logprob_tokens(self, token_logprobs_val, *args, **kwargs):
        """Sanitize the logprob values, then defer to the original method.

        Remaining arguments are passed through untouched so the wrapper keeps
        working if upstream extends the signature.
        """
        sanitized_vals = [_sanitize_logprob(value) for value in token_logprobs_val]
        return original(self, sanitized_vals, *args, **kwargs)

    TokenizerManager.detokenize_logprob_tokens = detokenize_logprob_tokens
    _log("[miles-modal] patched SGLang logprob detokenization to sanitize non-finite values")


def _patch_sglang_sampling_probability_sanitization() -> None:
    """Wrap SGLang's ``sampling_from_probs_torch`` to repair invalid rows."""
    try:
        import torch
        from sglang.srt.layers import sampler as sampler_mod
    except Exception as exc:
        _log(
            "[miles-modal] SGLang sampling probability patch unavailable: "
            f"{_describe(exc)}"
        )
        return

    original = getattr(sampler_mod, "sampling_from_probs_torch", None)
    if original is None:
        _log("[miles-modal] SGLang sampling probability patch missing target function")
        return

    if _already_patched(original):
        _log("[miles-modal] SGLang sampling probability patch already present")
        return

    sanitized_count = 0

    def _sanitize_probs(probs: torch.Tensor) -> torch.Tensor:
        """Return a float32, row-normalized copy of *probs* with bad entries removed.

        Non-finite or negative entries are zeroed.  Rows left with no mass
        fall back to a one-hot row on the arg-max of the original scores
        (NaN/inf treated as -inf).
        """
        nonlocal sanitized_count
        probs_fp32 = probs.float()
        valid_mask = torch.isfinite(probs_fp32) & (probs_fp32 >= 0)
        safe_probs = torch.where(valid_mask, probs_fp32, torch.zeros_like(probs_fp32))
        row_sums = safe_probs.sum(dim=-1, keepdim=True)
        zero_rows = row_sums <= 0

        # Fetch both flags with a single device-to-host transfer; this runs on
        # every sampling call.
        has_invalid, has_zero_rows = torch.stack(
            ((~valid_mask).any(), zero_rows.any())
        ).tolist()

        if has_zero_rows:
            fallback_scores = torch.nan_to_num(
                probs_fp32,
                nan=float("-inf"),
                posinf=float("-inf"),
                neginf=float("-inf"),
            )
            fallback_indices = fallback_scores.argmax(dim=-1, keepdim=True)
            fallback_probs = torch.zeros_like(safe_probs)
            fallback_probs.scatter_(-1, fallback_indices, 1.0)
            safe_probs = torch.where(zero_rows, fallback_probs, safe_probs)
            row_sums = safe_probs.sum(dim=-1, keepdim=True)

        if has_invalid or has_zero_rows:
            sanitized_count += 1
            if sanitized_count <= _MAX_SANITIZE_LOGS:
                _log(
                    "[miles-modal] sanitized SGLang sampling probs "
                    f"(invalid_entries={int((~valid_mask).sum().item())}, "
                    f"zero_rows={int(zero_rows.sum().item())})"
                )

        return safe_probs / row_sums.clamp_min(1e-12)

    def sampling_from_probs_torch(probs, *args, **kwargs):
        """Sanitize *probs*, then defer to the original sampler.

        Remaining arguments (e.g. ``sampling_seed``, ``positions``) are passed
        through untouched so the wrapper does not pin upstream's signature.
        """
        return original(_sanitize_probs(probs), *args, **kwargs)

    sampler_mod.sampling_from_probs_torch = sampling_from_probs_torch
    _log("[miles-modal] patched SGLang sampling to sanitize invalid probability rows")


# Order matters only for readability of the startup log; patches are independent.
_PATCHES = (
    _register_linear_cross_entropy_module,
    _patch_lora_cpu_serialization,
    _patch_sglang_lora_numpy_rehydration,
    _patch_sglang_logprob_sanitization,
    _patch_sglang_sampling_probability_sanitization,
)


def _apply_patches(patches=_PATCHES) -> None:
    """Run every patch, isolating failures.

    An exception escaping this module would abort the remaining patches (and
    is easy to miss at interpreter startup), so each patch is guarded and a
    failure is logged instead of propagated.
    """
    for patch in patches:
        try:
            patch()
        except Exception as exc:
            name = getattr(patch, "__name__", repr(patch))
            _log(f"[miles-modal] patch {name} aborted: {_describe(exc)}")


_apply_patches()
model script shape.", - model_id="zai-org/GLM-5", - args_file="glm5-744b-a40b-20layer-lora.args", - recommended_nodes=2, - gpu="H200:8", + "qwen3-30b-a3b-lora-greedy-debug": Recipe( + name="qwen3-30b-a3b-lora-greedy-debug", + description="Single-node Qwen3-30B-A3B attention-only LoRA debug recipe with greedy rollout to validate LoRA sync.", + model_id="Qwen/Qwen3-30B-A3B", + args_file="tests/qwen3-30b-a3b-lora-greedy-debug.args", + recommended_nodes=1, + gpu="H100:8", ), - "glm5-744b-a40b-lora": Recipe( - name="glm5-744b-a40b-lora", - description="Full GLM-5 starter recipe for LoRA RLVR experiments.", - model_id="zai-org/GLM-5", - args_file="glm5-744b-a40b-lora.args", - recommended_nodes=8, - gpu="H200:8", + "qwen3-30b-a3b-experts-lora": Recipe( + name="qwen3-30b-a3b-experts-lora", + description="Second-phase Qwen3-30B-A3B recipe widened to expert linear_fc1/fc2 targets.", + model_id="Qwen/Qwen3-30B-A3B", + args_file="qwen3-30b-a3b-experts-lora.args", + recommended_nodes=1, + gpu="H100:8", + ), + "qwen3-30b-a3b-experts-fewstep": Recipe( + name="qwen3-30b-a3b-experts-fewstep", + description="Single-node Qwen3-30B-A3B expert-target LoRA recipe trimmed to chase a few RL steps.", + model_id="Qwen/Qwen3-30B-A3B", + args_file="tests/qwen3-30b-a3b-experts-fewstep.args", + recommended_nodes=1, + gpu="H100:8", ), } @@ -205,7 +214,7 @@ def _build_runtime_env(master_addr: str, wandb_key: Optional[str]) -> dict: env_vars = { "MASTER_ADDR": master_addr, "no_proxy": master_addr, - "PYTHONPATH": "/root/Megatron-LM", + "PYTHONPATH": f"{REMOTE_PATCH_DIR.as_posix()}:/root/Megatron-LM", "CUDA_DEVICE_MAX_CONNECTIONS": "1", "NCCL_ALGO": "Ring", "NVTE_ALLOW_NONDETERMINISTIC_ALGO": "0", @@ -229,6 +238,11 @@ def _print_recipe_table(): modal.Image.from_registry(MILES_IMAGE) .entrypoint([]) .add_local_dir(here / "recipes", remote_path=REMOTE_RECIPES_DIR.as_posix(), copy=True) + .add_local_dir( + here / "modal_patches", + remote_path=REMOTE_PATCH_DIR.as_posix(), + copy=True, + ) ) if 
LOCAL_MILES_PATH: @@ -422,7 +436,7 @@ async def submit_training( timeout=24 * 60 * 60, ) def download_model( - recipe: str = "glm4-7-flash-lora", + recipe: str = "qwen3-30b-a3b-lora", revision: Optional[str] = None, model_id: Optional[str] = None, ): @@ -462,7 +476,7 @@ def prepare_dataset( @app.local_entrypoint() def main( - recipe: str = "qwen25-0p5b-lora", + recipe: str = "qwen3-30b-a3b-lora", gpu: str = "", extra_args: str = "", extra_args_file: str = "", diff --git a/miles/recipes/glm4-7-flash-lora.args b/miles/recipes/glm4-7-flash-lora.args deleted file mode 100644 index 0cf819f..0000000 --- a/miles/recipes/glm4-7-flash-lora.args +++ /dev/null @@ -1,120 +0,0 @@ -# GLM-4.7-Flash LoRA validation recipe. - -# Model architecture from the upstream Miles model script. ---moe-layer-freq "[0]*1+[1]*46" ---num-experts 64 ---moe-shared-expert-intermediate-size 1536 ---moe-router-topk 4 ---moe-grouped-gemm ---moe-permute-fusion ---moe-ffn-hidden-size 1536 ---moe-router-score-function sigmoid ---moe-router-pre-softmax ---moe-router-enable-expert-bias ---moe-router-bias-update-rate 0 ---moe-router-load-balancing-type seq_aux_loss ---moe-router-topk-scaling-factor 1.8 ---moe-aux-loss-coeff 0 ---moe-router-dtype fp32 ---make-vocab-size-divisible-by 64 ---num-layers 47 ---hidden-size 2048 ---ffn-hidden-size 10240 ---num-attention-heads 20 ---disable-bias-linear ---add-qkv-bias ---swiglu ---untie-embeddings-and-output-weights ---position-embedding-type rope ---no-position-embedding ---normalization RMSNorm ---qk-layernorm ---multi-latent-attention ---q-lora-rank 768 ---kv-lora-rank 512 ---qk-head-dim 192 ---v-head-dim 256 ---kv-channels 192 ---qk-pos-emb-head-dim 64 ---vocab-size 154880 ---rotary-base 1000000 ---no-rope-fusion ---mtp-num-layers 1 - -# Checkpoint conversion ---megatron-to-hf-mode bridge - -# LoRA ---lora-rank 32 ---lora-alpha 32 ---lora-dropout 0.0 ---target-modules all-linear - -# Data and rollout ---prompt-data /data/gsm8k/train.parquet ---input-key messages 
---label-key label ---apply-chat-template ---rollout-shuffle ---rm-type math ---num-rollout 100 ---rollout-batch-size 8 ---n-samples-per-prompt 8 ---rollout-max-response-len 2048 ---rollout-temperature 1 ---global-batch-size 64 - -# Evaluation ---eval-interval 10 ---eval-prompt-data gsm8k /data/gsm8k/test.parquet ---n-samples-per-eval-prompt 2 ---eval-max-response-len 4096 ---eval-top-k 1 - -# Parallelism and performance ---tensor-model-parallel-size 2 ---sequence-parallel ---pipeline-model-parallel-size 4 ---context-parallel-size 1 ---expert-model-parallel-size 4 ---expert-tensor-parallel-size 1 ---recompute-granularity full ---recompute-method uniform ---recompute-num-layers 1 ---use-dynamic-batch-size ---max-tokens-per-gpu 4096 - -# GRPO ---advantage-estimator grpo ---kl-loss-coef 0.0 ---kl-loss-type low_var_kl ---kl-coef 0.0 ---entropy-coef 0.0 ---eps-clip 0.2 ---eps-clip-high 0.28 - -# Optimizer ---optimizer adam ---lr 1e-5 ---lr-decay-style constant ---weight-decay 0.1 ---adam-beta1 0.9 ---adam-beta2 0.98 - -# Rollout serving ---rollout-num-gpus-per-engine 1 ---sglang-mem-fraction-static 0.55 - -# Training runtime ---attention-dropout 0.0 ---hidden-dropout 0.0 ---accumulate-allreduce-grads-in-fp32 ---attention-softmax-in-fp32 ---calculate-per-token-loss ---use-miles-router ---save-interval 25 - -# Logging defaults ---wandb-project miles-modal ---wandb-group glm4-7-flash-lora ---disable-wandb-random-suffix diff --git a/miles/recipes/glm5-744b-a40b-20layer-lora.args b/miles/recipes/glm5-744b-a40b-20layer-lora.args deleted file mode 100644 index 5a2e760..0000000 --- a/miles/recipes/glm5-744b-a40b-20layer-lora.args +++ /dev/null @@ -1,119 +0,0 @@ -# GLM-5 20-layer testing recipe. - -# Model architecture from the upstream Miles GLM-5 model script with the 20-layer override. 
---spec miles_plugins.models.glm5.glm5 get_glm5_spec ---moe-layer-freq "[0]*3+[1]*17" ---num-experts 256 ---moe-shared-expert-intermediate-size 2048 ---moe-router-topk 8 ---moe-grouped-gemm ---moe-permute-fusion ---moe-ffn-hidden-size 2048 ---moe-router-score-function sigmoid ---moe-router-pre-softmax ---moe-router-enable-expert-bias ---moe-router-bias-update-rate 0 ---moe-router-load-balancing-type seq_aux_loss ---moe-router-topk-scaling-factor 2.5 ---moe-aux-loss-coeff 0 ---moe-router-dtype fp32 ---make-vocab-size-divisible-by 16 ---num-layers 20 ---hidden-size 6144 ---ffn-hidden-size 12288 ---num-attention-heads 64 ---disable-bias-linear ---swiglu ---untie-embeddings-and-output-weights ---position-embedding-type rope ---no-position-embedding ---normalization RMSNorm ---qk-layernorm ---multi-latent-attention ---q-lora-rank 2048 ---kv-lora-rank 512 ---qk-head-dim 192 ---v-head-dim 256 ---kv-channels 192 ---qk-pos-emb-head-dim 64 ---vocab-size 154880 ---rotary-base 1000000 ---enable-experimental - -# Checkpoint conversion ---megatron-to-hf-mode bridge - -# LoRA ---lora-rank 32 ---lora-alpha 32 ---lora-dropout 0.0 ---target-modules all-linear - -# Data and rollout ---prompt-data /data/gsm8k/train.parquet ---input-key messages ---label-key label ---apply-chat-template ---rollout-shuffle ---rm-type math ---num-rollout 100 ---rollout-batch-size 4 ---n-samples-per-prompt 8 ---rollout-max-response-len 1536 ---rollout-temperature 1 ---global-batch-size 32 - -# Evaluation ---eval-interval 10 ---eval-prompt-data gsm8k /data/gsm8k/test.parquet ---n-samples-per-eval-prompt 2 ---eval-max-response-len 3072 ---eval-top-k 1 - -# Parallelism and performance ---tensor-model-parallel-size 2 ---sequence-parallel ---pipeline-model-parallel-size 2 ---context-parallel-size 1 ---expert-model-parallel-size 4 ---expert-tensor-parallel-size 1 ---recompute-granularity full ---recompute-method uniform ---recompute-num-layers 1 ---use-dynamic-batch-size ---max-tokens-per-gpu 3072 - -# GRPO 
---advantage-estimator grpo ---kl-loss-coef 0.0 ---kl-loss-type low_var_kl ---kl-coef 0.0 ---entropy-coef 0.0 ---eps-clip 0.2 ---eps-clip-high 0.28 - -# Optimizer ---optimizer adam ---lr 5e-6 ---lr-decay-style constant ---weight-decay 0.1 ---adam-beta1 0.9 ---adam-beta2 0.98 - -# Rollout serving ---rollout-num-gpus-per-engine 1 ---sglang-mem-fraction-static 0.5 - -# Training runtime ---attention-dropout 0.0 ---hidden-dropout 0.0 ---accumulate-allreduce-grads-in-fp32 ---attention-softmax-in-fp32 ---calculate-per-token-loss ---use-miles-router ---save-interval 25 - -# Logging defaults ---wandb-project miles-modal ---wandb-group glm5-744b-a40b-20layer-lora ---disable-wandb-random-suffix diff --git a/miles/recipes/glm5-744b-a40b-lora.args b/miles/recipes/qwen3-30b-a3b-experts-lora.args similarity index 56% rename from miles/recipes/glm5-744b-a40b-lora.args rename to miles/recipes/qwen3-30b-a3b-experts-lora.args index 81ebca5..39a25ec 100644 --- a/miles/recipes/glm5-744b-a40b-lora.args +++ b/miles/recipes/qwen3-30b-a3b-experts-lora.args @@ -1,44 +1,35 @@ -# Full GLM-5 starter recipe. +# Qwen3-30B-A3B bridge-mode LoRA validation recipe. +# Phase 2: widen the working baseline to include expert linear_fc1 / linear_fc2. -# Model architecture from the upstream Miles GLM-5 model script. ---spec miles_plugins.models.glm5.glm5 get_glm5_spec ---moe-layer-freq "[0]*3+[1]*75" ---num-experts 256 ---moe-shared-expert-intermediate-size 2048 ---moe-router-topk 8 ---moe-grouped-gemm ---moe-permute-fusion ---moe-ffn-hidden-size 2048 ---moe-router-score-function sigmoid ---moe-router-pre-softmax ---moe-router-enable-expert-bias ---moe-router-bias-update-rate 0 ---moe-router-load-balancing-type seq_aux_loss ---moe-router-topk-scaling-factor 2.5 ---moe-aux-loss-coeff 0 ---moe-router-dtype fp32 ---make-vocab-size-divisible-by 16 ---num-layers 78 ---hidden-size 6144 ---ffn-hidden-size 12288 ---num-attention-heads 64 +# Model architecture from the upstream Miles Qwen3-30B-A3B model script. 
--disable-bias-linear +--qk-layernorm +--group-query-attention +--num-attention-heads 32 +--num-query-groups 4 +--kv-channels 128 +--num-layers 48 +--hidden-size 2048 +--ffn-hidden-size 6144 +--normalization RMSNorm +--position-embedding-type rope +--norm-epsilon 1e-6 +--rotary-percent 1.0 --swiglu --untie-embeddings-and-output-weights ---position-embedding-type rope ---no-position-embedding ---normalization RMSNorm ---qk-layernorm ---multi-latent-attention ---q-lora-rank 2048 ---kv-lora-rank 512 ---qk-head-dim 192 ---v-head-dim 256 ---kv-channels 192 ---qk-pos-emb-head-dim 64 ---vocab-size 154880 +--vocab-size 151936 --rotary-base 1000000 ---enable-experimental +--moe-ffn-hidden-size 768 +--moe-router-score-function softmax +--moe-token-dispatcher-type alltoall +--moe-router-topk 8 +--moe-layer-freq "[1]*48" +--num-experts 128 +--moe-grouped-gemm +--moe-token-drop-policy probs +--moe-router-dtype fp32 +--moe-permute-fusion +--moe-aux-loss-coeff 0 # Checkpoint conversion --megatron-to-hf-mode bridge @@ -47,7 +38,7 @@ --lora-rank 32 --lora-alpha 32 --lora-dropout 0.0 ---target-modules all-linear +--target-modules linear_qkv,linear_proj,linear_fc1,linear_fc2 # Data and rollout --prompt-data /data/gsm8k/train.parquet @@ -56,24 +47,24 @@ --apply-chat-template --rollout-shuffle --rm-type math ---num-rollout 100 ---rollout-batch-size 2 ---n-samples-per-prompt 8 ---rollout-max-response-len 2048 +--num-rollout 40 +--rollout-batch-size 4 +--n-samples-per-prompt 4 +--rollout-max-response-len 1024 --rollout-temperature 1 --global-batch-size 16 # Evaluation --eval-interval 10 --eval-prompt-data gsm8k /data/gsm8k/test.parquet ---n-samples-per-eval-prompt 2 ---eval-max-response-len 4096 +--n-samples-per-eval-prompt 1 +--eval-max-response-len 1024 --eval-top-k 1 # Parallelism and performance ---tensor-model-parallel-size 2 +--tensor-model-parallel-size 4 --sequence-parallel ---pipeline-model-parallel-size 4 +--pipeline-model-parallel-size 1 --context-parallel-size 1 
--expert-model-parallel-size 8 --expert-tensor-parallel-size 1 @@ -94,26 +85,31 @@ # Optimizer --optimizer adam ---lr 5e-6 +--lr 1e-5 --lr-decay-style constant --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.98 +--optimizer-cpu-offload +--overlap-cpu-optimizer-d2h-h2d +--use-precision-aware-optimizer # Rollout serving ---rollout-num-gpus-per-engine 1 ---sglang-mem-fraction-static 0.45 +--rollout-num-gpus-per-engine 8 +--sglang-mem-fraction-static 0.7 +--sglang-cuda-graph-max-bs 256 # Training runtime --attention-dropout 0.0 --hidden-dropout 0.0 --accumulate-allreduce-grads-in-fp32 --attention-softmax-in-fp32 +--attention-backend flash --calculate-per-token-loss --use-miles-router --save-interval 25 # Logging defaults --wandb-project miles-modal ---wandb-group glm5-744b-a40b-lora +--wandb-group qwen3-30b-a3b-experts-lora --disable-wandb-random-suffix diff --git a/miles/recipes/glm5-744b-a40b-4layer-lora.args b/miles/recipes/qwen3-30b-a3b-lora.args similarity index 56% rename from miles/recipes/glm5-744b-a40b-4layer-lora.args rename to miles/recipes/qwen3-30b-a3b-lora.args index 64ac703..d80d9b3 100644 --- a/miles/recipes/glm5-744b-a40b-4layer-lora.args +++ b/miles/recipes/qwen3-30b-a3b-lora.args @@ -1,44 +1,36 @@ -# GLM-5 4-layer testing recipe. +# Qwen3-30B-A3B bridge-mode LoRA validation recipe. +# Phase 1: confirm end-to-end Miles + Megatron-Bridge + SGLang support with +# attention-only LoRA targets before widening into expert MLP modules. -# Model architecture from the upstream Miles GLM-5 model script with the 4-layer override. 
---spec miles_plugins.models.glm5.glm5 get_glm5_spec ---moe-layer-freq "[0]*3+[1]*1" ---num-experts 256 ---moe-shared-expert-intermediate-size 2048 ---moe-router-topk 8 ---moe-grouped-gemm ---moe-permute-fusion ---moe-ffn-hidden-size 2048 ---moe-router-score-function sigmoid ---moe-router-pre-softmax ---moe-router-enable-expert-bias ---moe-router-bias-update-rate 0 ---moe-router-load-balancing-type seq_aux_loss ---moe-router-topk-scaling-factor 2.5 ---moe-aux-loss-coeff 0 ---moe-router-dtype fp32 ---make-vocab-size-divisible-by 16 ---num-layers 4 ---hidden-size 6144 ---ffn-hidden-size 12288 ---num-attention-heads 64 +# Model architecture from the upstream Miles Qwen3-30B-A3B model script. --disable-bias-linear +--qk-layernorm +--group-query-attention +--num-attention-heads 32 +--num-query-groups 4 +--kv-channels 128 +--num-layers 48 +--hidden-size 2048 +--ffn-hidden-size 6144 +--normalization RMSNorm +--position-embedding-type rope +--norm-epsilon 1e-6 +--rotary-percent 1.0 --swiglu --untie-embeddings-and-output-weights ---position-embedding-type rope ---no-position-embedding ---normalization RMSNorm ---qk-layernorm ---multi-latent-attention ---q-lora-rank 2048 ---kv-lora-rank 512 ---qk-head-dim 192 ---v-head-dim 256 ---kv-channels 192 ---qk-pos-emb-head-dim 64 ---vocab-size 154880 +--vocab-size 151936 --rotary-base 1000000 ---enable-experimental +--moe-ffn-hidden-size 768 +--moe-router-score-function softmax +--moe-token-dispatcher-type alltoall +--moe-router-topk 8 +--moe-layer-freq "[1]*48" +--num-experts 128 +--moe-grouped-gemm +--moe-token-drop-policy probs +--moe-router-dtype fp32 +--moe-permute-fusion +--moe-aux-loss-coeff 0 # Checkpoint conversion --megatron-to-hf-mode bridge @@ -47,7 +39,7 @@ --lora-rank 32 --lora-alpha 32 --lora-dropout 0.0 ---target-modules all-linear +--target-modules linear_qkv,linear_proj # Data and rollout --prompt-data /data/gsm8k/train.parquet @@ -56,32 +48,32 @@ --apply-chat-template --rollout-shuffle --rm-type math ---num-rollout 
100 +--num-rollout 40 --rollout-batch-size 4 ---n-samples-per-prompt 8 +--n-samples-per-prompt 4 --rollout-max-response-len 1024 --rollout-temperature 1 ---global-batch-size 32 +--global-batch-size 16 # Evaluation --eval-interval 10 --eval-prompt-data gsm8k /data/gsm8k/test.parquet ---n-samples-per-eval-prompt 2 ---eval-max-response-len 2048 +--n-samples-per-eval-prompt 1 +--eval-max-response-len 1024 --eval-top-k 1 # Parallelism and performance ---tensor-model-parallel-size 2 +--tensor-model-parallel-size 4 --sequence-parallel --pipeline-model-parallel-size 1 --context-parallel-size 1 ---expert-model-parallel-size 4 +--expert-model-parallel-size 8 --expert-tensor-parallel-size 1 --recompute-granularity full --recompute-method uniform --recompute-num-layers 1 --use-dynamic-batch-size ---max-tokens-per-gpu 3072 +--max-tokens-per-gpu 2048 # GRPO --advantage-estimator grpo @@ -94,26 +86,31 @@ # Optimizer --optimizer adam ---lr 5e-6 +--lr 1e-5 --lr-decay-style constant --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.98 +--optimizer-cpu-offload +--overlap-cpu-optimizer-d2h-h2d +--use-precision-aware-optimizer # Rollout serving ---rollout-num-gpus-per-engine 1 ---sglang-mem-fraction-static 0.5 +--rollout-num-gpus-per-engine 8 +--sglang-mem-fraction-static 0.7 +--sglang-cuda-graph-max-bs 256 # Training runtime --attention-dropout 0.0 --hidden-dropout 0.0 --accumulate-allreduce-grads-in-fp32 --attention-softmax-in-fp32 +--attention-backend flash --calculate-per-token-loss --use-miles-router --save-interval 25 # Logging defaults --wandb-project miles-modal ---wandb-group glm5-744b-a40b-4layer-lora +--wandb-group qwen3-30b-a3b-lora --disable-wandb-random-suffix diff --git a/miles/recipes/qwen25-0p5b-lora.args b/miles/recipes/tests/qwen25-0p5b-lora.args similarity index 100% rename from miles/recipes/qwen25-0p5b-lora.args rename to miles/recipes/tests/qwen25-0p5b-lora.args diff --git a/miles/recipes/tests/qwen3-30b-a3b-experts-fewstep.args 
b/miles/recipes/tests/qwen3-30b-a3b-experts-fewstep.args new file mode 100644 index 0000000..d479fcb --- /dev/null +++ b/miles/recipes/tests/qwen3-30b-a3b-experts-fewstep.args @@ -0,0 +1,110 @@ +# Qwen3-30B-A3B bridge-mode LoRA recipe tuned to get through a few RL steps. +# This widens the working attention-only few-step shape to expert linear_fc1 +# and linear_fc2 targets while keeping rollout pressure trimmed. + +# Model architecture from the upstream Miles Qwen3-30B-A3B model script. +--disable-bias-linear +--qk-layernorm +--group-query-attention +--num-attention-heads 32 +--num-query-groups 4 +--kv-channels 128 +--num-layers 48 +--hidden-size 2048 +--ffn-hidden-size 6144 +--normalization RMSNorm +--position-embedding-type rope +--norm-epsilon 1e-6 +--rotary-percent 1.0 +--swiglu +--untie-embeddings-and-output-weights +--vocab-size 151936 +--rotary-base 1000000 +--moe-ffn-hidden-size 768 +--moe-router-score-function softmax +--moe-token-dispatcher-type alltoall +--moe-router-topk 8 +--moe-layer-freq "[1]*48" +--num-experts 128 +--moe-grouped-gemm +--moe-token-drop-policy probs +--moe-router-dtype fp32 +--moe-permute-fusion +--moe-aux-loss-coeff 0 + +# Checkpoint conversion +--megatron-to-hf-mode bridge + +# LoRA +--lora-rank 32 +--lora-alpha 32 +--lora-dropout 0.0 +--target-modules linear_qkv,linear_proj,linear_fc1,linear_fc2 + +# Data and rollout +--prompt-data /data/gsm8k/train.parquet +--input-key messages +--label-key label +--apply-chat-template +--rollout-shuffle +--rm-type math +--num-rollout 8 +--rollout-batch-size 2 +--n-samples-per-prompt 2 +--rollout-max-response-len 512 +--rollout-temperature 1 +--global-batch-size 4 + +# Parallelism and performance +--tensor-model-parallel-size 4 +--sequence-parallel +--pipeline-model-parallel-size 1 +--context-parallel-size 1 +--expert-model-parallel-size 8 +--expert-tensor-parallel-size 1 +--recompute-granularity full +--recompute-method uniform +--recompute-num-layers 1 +--use-dynamic-batch-size 
+--max-tokens-per-gpu 1024 + +# GRPO +--advantage-estimator grpo +--kl-loss-coef 0.0 +--kl-loss-type low_var_kl +--kl-coef 0.0 +--entropy-coef 0.0 +--eps-clip 0.2 +--eps-clip-high 0.28 + +# Optimizer +--optimizer adam +--lr 1e-5 +--lr-decay-style constant +--weight-decay 0.1 +--adam-beta1 0.9 +--adam-beta2 0.98 +--optimizer-cpu-offload +--overlap-cpu-optimizer-d2h-h2d +--use-precision-aware-optimizer + +# Rollout serving +--rollout-num-gpus-per-engine 8 +--sglang-mem-fraction-static 0.7 +--sglang-cuda-graph-max-bs 64 +--sglang-disable-cuda-graph + +# Training runtime +--attention-dropout 0.0 +--hidden-dropout 0.0 +--accumulate-allreduce-grads-in-fp32 +--attention-softmax-in-fp32 +--attention-backend flash +--calculate-per-token-loss +--use-miles-router +--save-interval 1000 + +# Logging defaults +--wandb-project miles-modal +--wandb-group qwen3-30b-a3b-experts-fewstep +--disable-wandb-random-suffix diff --git a/miles/recipes/tests/qwen3-30b-a3b-lora-fewstep.args b/miles/recipes/tests/qwen3-30b-a3b-lora-fewstep.args new file mode 100644 index 0000000..dda4790 --- /dev/null +++ b/miles/recipes/tests/qwen3-30b-a3b-lora-fewstep.args @@ -0,0 +1,110 @@ +# Qwen3-30B-A3B bridge-mode LoRA recipe tuned to get through a few RL steps. +# This keeps attention-only LoRA targets but trims rollout pressure and disables +# eval so the run can reach training sooner. + +# Model architecture from the upstream Miles Qwen3-30B-A3B model script. 
+--disable-bias-linear +--qk-layernorm +--group-query-attention +--num-attention-heads 32 +--num-query-groups 4 +--kv-channels 128 +--num-layers 48 +--hidden-size 2048 +--ffn-hidden-size 6144 +--normalization RMSNorm +--position-embedding-type rope +--norm-epsilon 1e-6 +--rotary-percent 1.0 +--swiglu +--untie-embeddings-and-output-weights +--vocab-size 151936 +--rotary-base 1000000 +--moe-ffn-hidden-size 768 +--moe-router-score-function softmax +--moe-token-dispatcher-type alltoall +--moe-router-topk 8 +--moe-layer-freq "[1]*48" +--num-experts 128 +--moe-grouped-gemm +--moe-token-drop-policy probs +--moe-router-dtype fp32 +--moe-permute-fusion +--moe-aux-loss-coeff 0 + +# Checkpoint conversion +--megatron-to-hf-mode bridge + +# LoRA +--lora-rank 32 +--lora-alpha 32 +--lora-dropout 0.0 +--target-modules linear_qkv,linear_proj + +# Data and rollout +--prompt-data /data/gsm8k/train.parquet +--input-key messages +--label-key label +--apply-chat-template +--rollout-shuffle +--rm-type math +--num-rollout 8 +--rollout-batch-size 2 +--n-samples-per-prompt 2 +--rollout-max-response-len 512 +--rollout-temperature 1 +--global-batch-size 4 + +# Parallelism and performance +--tensor-model-parallel-size 4 +--sequence-parallel +--pipeline-model-parallel-size 1 +--context-parallel-size 1 +--expert-model-parallel-size 8 +--expert-tensor-parallel-size 1 +--recompute-granularity full +--recompute-method uniform +--recompute-num-layers 1 +--use-dynamic-batch-size +--max-tokens-per-gpu 1024 + +# GRPO +--advantage-estimator grpo +--kl-loss-coef 0.0 +--kl-loss-type low_var_kl +--kl-coef 0.0 +--entropy-coef 0.0 +--eps-clip 0.2 +--eps-clip-high 0.28 + +# Optimizer +--optimizer adam +--lr 1e-5 +--lr-decay-style constant +--weight-decay 0.1 +--adam-beta1 0.9 +--adam-beta2 0.98 +--optimizer-cpu-offload +--overlap-cpu-optimizer-d2h-h2d +--use-precision-aware-optimizer + +# Rollout serving +--rollout-num-gpus-per-engine 8 +--sglang-mem-fraction-static 0.7 +--sglang-cuda-graph-max-bs 64 
+--sglang-disable-cuda-graph + +# Training runtime +--attention-dropout 0.0 +--hidden-dropout 0.0 +--accumulate-allreduce-grads-in-fp32 +--attention-softmax-in-fp32 +--attention-backend flash +--calculate-per-token-loss +--use-miles-router +--save-interval 1000 + +# Logging defaults +--wandb-project miles-modal +--wandb-group qwen3-30b-a3b-lora-fewstep +--disable-wandb-random-suffix diff --git a/miles/recipes/tests/qwen3-30b-a3b-lora-greedy-debug.args b/miles/recipes/tests/qwen3-30b-a3b-lora-greedy-debug.args new file mode 100644 index 0000000..351f4ed --- /dev/null +++ b/miles/recipes/tests/qwen3-30b-a3b-lora-greedy-debug.args @@ -0,0 +1,111 @@ +# Qwen3-30B-A3B bridge-mode LoRA recipe narrowed for faster Modal debugging. +# This keeps attention-only LoRA targets, trims rollout pressure, and uses +# deterministic top-k=1 decoding so the next run quickly isolates rollout bugs. + +# Model architecture from the upstream Miles Qwen3-30B-A3B model script. +--disable-bias-linear +--qk-layernorm +--group-query-attention +--num-attention-heads 32 +--num-query-groups 4 +--kv-channels 128 +--num-layers 48 +--hidden-size 2048 +--ffn-hidden-size 6144 +--normalization RMSNorm +--position-embedding-type rope +--norm-epsilon 1e-6 +--rotary-percent 1.0 +--swiglu +--untie-embeddings-and-output-weights +--vocab-size 151936 +--rotary-base 1000000 +--moe-ffn-hidden-size 768 +--moe-router-score-function softmax +--moe-token-dispatcher-type alltoall +--moe-router-topk 8 +--moe-layer-freq "[1]*48" +--num-experts 128 +--moe-grouped-gemm +--moe-token-drop-policy probs +--moe-router-dtype fp32 +--moe-permute-fusion +--moe-aux-loss-coeff 0 + +# Checkpoint conversion +--megatron-to-hf-mode bridge + +# LoRA +--lora-rank 32 +--lora-alpha 32 +--lora-dropout 0.0 +--target-modules linear_qkv,linear_proj + +# Data and rollout +--prompt-data /data/gsm8k/train.parquet +--input-key messages +--label-key label +--apply-chat-template +--rollout-shuffle +--rm-type math +--num-rollout 2 
+--rollout-batch-size 1 +--n-samples-per-prompt 2 +--rollout-max-response-len 128 +--rollout-temperature 1 +--rollout-top-k 1 +--global-batch-size 2 + +# Parallelism and performance +--tensor-model-parallel-size 4 +--sequence-parallel +--pipeline-model-parallel-size 1 +--context-parallel-size 1 +--expert-model-parallel-size 8 +--expert-tensor-parallel-size 1 +--recompute-granularity full +--recompute-method uniform +--recompute-num-layers 1 +--use-dynamic-batch-size +--max-tokens-per-gpu 768 + +# GRPO +--advantage-estimator grpo +--kl-loss-coef 0.0 +--kl-loss-type low_var_kl +--kl-coef 0.0 +--entropy-coef 0.0 +--eps-clip 0.2 +--eps-clip-high 0.28 + +# Optimizer +--optimizer adam +--lr 1e-5 +--lr-decay-style constant +--weight-decay 0.1 +--adam-beta1 0.9 +--adam-beta2 0.98 +--optimizer-cpu-offload +--overlap-cpu-optimizer-d2h-h2d +--use-precision-aware-optimizer + +# Rollout serving +--rollout-num-gpus-per-engine 8 +--sglang-mem-fraction-static 0.6 +--sglang-cuda-graph-max-bs 16 +--sglang-disable-cuda-graph + +# Training runtime +--attention-dropout 0.0 +--hidden-dropout 0.0 +--accumulate-allreduce-grads-in-fp32 +--attention-softmax-in-fp32 +--attention-backend flash +--calculate-per-token-loss +--use-miles-router +--save-interval 1000 + +# Logging defaults +--wandb-project miles-modal +--wandb-group qwen3-30b-a3b-lora-greedy-debug +--disable-wandb-random-suffix From 4ad8c8f68c371cad51bfcde83fe447994e7694c0 Mon Sep 17 00:00:00 2001 From: Jason Mancuso <7891333+jvmncs@users.noreply.github.com> Date: Thu, 26 Mar 2026 23:18:00 -0400 Subject: [PATCH 3/5] Align Qwen3 LoRA defaults with thinky method --- miles/README.md | 63 ++++++++++--------- miles/modal_train.py | 10 +-- miles/recipes/qwen3-30b-a3b-lora.args | 8 +-- .../tests/qwen3-30b-a3b-lora-fewstep.args | 6 +- .../qwen3-30b-a3b-lora-greedy-debug.args | 4 +- 5 files changed, 49 insertions(+), 42 deletions(-) diff --git a/miles/README.md b/miles/README.md index 9d7c67c..4902222 100644 --- a/miles/README.md +++ 
b/miles/README.md @@ -24,16 +24,20 @@ Current recipes: - `qwen25-0p5b-lora`: single-node smoke test adapted from the upstream Miles LoRA example. -- `qwen3-30b-a3b-lora`: first-pass Qwen3-30B-A3B bridge-mode LoRA validation - recipe, restricted to attention targets (`linear_qkv`, `linear_proj`). -- `qwen3-30b-a3b-lora-fewstep`: trimmed attention-only recipe that is intended - to prove a few full RL updates on Modal. -- `qwen3-30b-a3b-experts-lora`: second-pass Qwen3-30B-A3B recipe widened to - expert `linear_fc1` and `linear_fc2` targets after the baseline path works. -- `qwen3-30b-a3b-experts-fewstep`: trimmed expert-target recipe built from the - working few-step shape. +- `qwen3-30b-a3b-lora`: default Qwen3-30B-A3B all-layer recipe, targeting + attention plus MLP/MoE layers (`linear_qkv`, `linear_proj`, `linear_fc1`, + `linear_fc2`). +- `qwen3-30b-a3b-lora-fewstep`: trimmed all-layer recipe that is intended to + prove a few full RL updates on Modal. +- `qwen3-30b-a3b-experts-lora`: explicit all-layer alias that makes the expert + `linear_fc1` / `linear_fc2` targeting obvious in the name. +- `qwen3-30b-a3b-experts-fewstep`: trimmed explicit all-layer alias built from + the working few-step shape. Testing/debug recipe files live under [`recipes/tests/`](./recipes/tests). +The attention-only recipe is kept only as +[`qwen3-30b-a3b-lora-greedy-debug`](./recipes/tests/qwen3-30b-a3b-lora-greedy-debug.args), +as a diagnostic control rather than a recommended training setup. ## Prepare assets @@ -66,7 +70,7 @@ Qwen3-30B-A3B baseline LoRA validation: MILES_N_NODES=1 modal run miles/modal_train.py --recipe qwen3-30b-a3b-lora ``` -Qwen3-30B-A3B few-step attention-only validation: +Qwen3-30B-A3B few-step all-layer validation: ```bash MILES_N_NODES=1 modal run miles/modal_train.py --recipe qwen3-30b-a3b-lora-fewstep @@ -89,6 +93,16 @@ MILES_N_NODES=1 modal run miles/modal_train.py --recipe qwen3-30b-a3b-experts-fe - Start with standard LoRA, not DoRA. 
Miles' current rollout sync and adapter filtering are LoRA-specific and keyed off `lora_A` / `lora_B` names, so DoRA is not the first validation target. +- The default Qwen3 recipes now follow the recommendations from Thinking + Machines' “LoRA Without Regret”: keep the standard `alpha=32` / `1/r` + parameterization, use a LoRA LR around 10x the FullFT baseline (`1e-5` here + vs. the upstream Qwen3-30B-A3B FullFT `1e-6`), and include the MLP/MoE layers + rather than using attention-only LoRA. +- One MoE-specific nuance from the article is not exposed cleanly by the current + Miles recipe surface: their Qwen3 MoE experiments scale per-expert LoRA rank + by the number of active experts. Our recipes currently use a uniform + `--lora-rank 32` across all targeted modules because Miles exposes one global + LoRA rank, not per-module or per-expert ranks. - The baked Qwen3 recipes are single-node `H100:8` shapes. They are intended to validate end-to-end bridge-mode LoRA with colocated rollout first, not to exhaustively cover every parallelism combination. @@ -99,8 +113,8 @@ MILES_N_NODES=1 modal run miles/modal_train.py --recipe qwen3-30b-a3b-experts-fe - Miles currently supports LoRA weight sync only for colocated rollout engines. Distributed non-colocated rollout sync is not yet implemented for LoRA. - The baseline Qwen3 recipe stays close to the upstream Miles single-node - Qwen3-30B-A3B shape. The expert-target recipe is a follow-on experiment, not - the initial correctness target. + Qwen3-30B-A3B shape while using all-layer LoRA. The explicit expert-target + recipe names are kept mainly for clarity and backwards compatibility. ## Observed On Modal @@ -117,21 +131,14 @@ The current wrapper includes runtime patches in What the Modal runs have validated so far on `modal-labs`: -- `qwen3-30b-a3b-lora` gets through bridge-mode LoRA creation and attention - module injection (`linear_qkv`, `linear_proj`), and it can start loading the - Hugging Face checkpoint into Megatron. 
-- `qwen3-30b-a3b-lora-fewstep` now gets through full RL training on Modal. In - recent runs it passed rollout, weight sync, and actor training repeatedly and - reached at least `train/step` 6 on a single-node `H100:8` shape. -- `qwen3-30b-a3b-experts-lora` goes further: it creates LoRA with - `linear_qkv`, `linear_proj`, `linear_fc1`, and `linear_fc2`, injects those - expert targets under `decoder.layers.*.mlp.experts.*`, loads weights, pushes - the adapter into SGLang, and starts `Eval gsm8k`. -- `qwen3-30b-a3b-experts-fewstep` has validated the widened target surface on - Modal: Miles creates LoRA with `linear_fc1` / `linear_fc2`, injects those - expert modules, completes weight sync, and reaches rollout collection plus - actor training. A detached confirmation of a full expert-target train step is - still in progress. +- The all-layer Qwen3-30B-A3B LoRA shape now has runtime validation on Modal. + In recent detached runs of the few-step recipe shape, Miles created LoRA with + `linear_qkv`, `linear_proj`, `linear_fc1`, and `linear_fc2`, injected expert + modules under `decoder.layers.*.mlp.experts.*`, completed rollout collection, + and reached at least `train/step` 1 on a single-node `H100:8` shape. +- The attention-only debug/control recipe also works, but it is no longer the + recommended configuration after comparing against the Thinking Machines + guidance. - The remaining instability has been in the colocated SGLang rollout path, not in LoRA target discovery. The main concrete runtime failures we hit were: non-finite logprobs breaking HTTP JSON serialization, and invalid sampling @@ -141,8 +148,8 @@ Current interpretation: - Qwen3-30B-A3B MoE LoRA support in Miles is real enough to instantiate, target, load, and export adapters for both attention and expert MLP layers. -- Attention-only Qwen3-30B-A3B LoRA is now runtime-validated for repeated RL - updates on `modal-labs`. 
+- Attention-only Qwen3-30B-A3B LoRA is still runtime-validated as a debug + control on `modal-labs`, but it is no longer the recommended default. - The remaining risk is concentrated in the colocated SGLang rollout lifecycle, which is coupled to `offload_rollout` / `enable_memory_saver=True` in the current Miles SGLang engine setup, especially once expert-target LoRA is diff --git a/miles/modal_train.py b/miles/modal_train.py index 5477a33..9384fbb 100644 --- a/miles/modal_train.py +++ b/miles/modal_train.py @@ -70,7 +70,7 @@ class Recipe: ), "qwen3-30b-a3b-lora": Recipe( name="qwen3-30b-a3b-lora", - description="Single-node Qwen3-30B-A3B bridge-mode LoRA validation recipe.", + description="Single-node Qwen3-30B-A3B all-layer bridge-mode LoRA recipe aligned with current best practices.", model_id="Qwen/Qwen3-30B-A3B", args_file="qwen3-30b-a3b-lora.args", recommended_nodes=1, @@ -78,7 +78,7 @@ class Recipe: ), "qwen3-30b-a3b-lora-fewstep": Recipe( name="qwen3-30b-a3b-lora-fewstep", - description="Single-node Qwen3-30B-A3B attention-only LoRA recipe trimmed to chase a few full RL steps.", + description="Single-node Qwen3-30B-A3B all-layer LoRA recipe trimmed to chase a few full RL steps.", model_id="Qwen/Qwen3-30B-A3B", args_file="tests/qwen3-30b-a3b-lora-fewstep.args", recommended_nodes=1, @@ -86,7 +86,7 @@ class Recipe: ), "qwen3-30b-a3b-lora-greedy-debug": Recipe( name="qwen3-30b-a3b-lora-greedy-debug", - description="Single-node Qwen3-30B-A3B attention-only LoRA debug recipe with greedy rollout to validate LoRA sync.", + description="Single-node Qwen3-30B-A3B attention-only debug/control recipe with greedy rollout.", model_id="Qwen/Qwen3-30B-A3B", args_file="tests/qwen3-30b-a3b-lora-greedy-debug.args", recommended_nodes=1, @@ -94,7 +94,7 @@ class Recipe: ), "qwen3-30b-a3b-experts-lora": Recipe( name="qwen3-30b-a3b-experts-lora", - description="Second-phase Qwen3-30B-A3B recipe widened to expert linear_fc1/fc2 targets.", + description="Explicit all-layer 
Qwen3-30B-A3B LoRA recipe including expert linear_fc1/fc2 targets.", model_id="Qwen/Qwen3-30B-A3B", args_file="qwen3-30b-a3b-experts-lora.args", recommended_nodes=1, @@ -102,7 +102,7 @@ class Recipe: ), "qwen3-30b-a3b-experts-fewstep": Recipe( name="qwen3-30b-a3b-experts-fewstep", - description="Single-node Qwen3-30B-A3B expert-target LoRA recipe trimmed to chase a few RL steps.", + description="Explicit all-layer Qwen3-30B-A3B few-step recipe including expert linear_fc1/fc2 targets.", model_id="Qwen/Qwen3-30B-A3B", args_file="tests/qwen3-30b-a3b-experts-fewstep.args", recommended_nodes=1, diff --git a/miles/recipes/qwen3-30b-a3b-lora.args b/miles/recipes/qwen3-30b-a3b-lora.args index d80d9b3..a208cee 100644 --- a/miles/recipes/qwen3-30b-a3b-lora.args +++ b/miles/recipes/qwen3-30b-a3b-lora.args @@ -1,6 +1,6 @@ -# Qwen3-30B-A3B bridge-mode LoRA validation recipe. -# Phase 1: confirm end-to-end Miles + Megatron-Bridge + SGLang support with -# attention-only LoRA targets before widening into expert MLP modules. +# Qwen3-30B-A3B bridge-mode LoRA recipe aligned with the +# "LoRA Without Regret" guidance from Thinking Machines: +# apply LoRA to the attention and MLP/MoE linear layers, not attention only. # Model architecture from the upstream Miles Qwen3-30B-A3B model script. --disable-bias-linear @@ -39,7 +39,7 @@ --lora-rank 32 --lora-alpha 32 --lora-dropout 0.0 ---target-modules linear_qkv,linear_proj +--target-modules linear_qkv,linear_proj,linear_fc1,linear_fc2 # Data and rollout --prompt-data /data/gsm8k/train.parquet diff --git a/miles/recipes/tests/qwen3-30b-a3b-lora-fewstep.args b/miles/recipes/tests/qwen3-30b-a3b-lora-fewstep.args index dda4790..75d749b 100644 --- a/miles/recipes/tests/qwen3-30b-a3b-lora-fewstep.args +++ b/miles/recipes/tests/qwen3-30b-a3b-lora-fewstep.args @@ -1,6 +1,6 @@ # Qwen3-30B-A3B bridge-mode LoRA recipe tuned to get through a few RL steps. 
-# This keeps attention-only LoRA targets but trims rollout pressure and disables -# eval so the run can reach training sooner. +# This keeps the smaller rollout shape from validation, but uses the +# article-aligned all-layer LoRA target set rather than attention only. # Model architecture from the upstream Miles Qwen3-30B-A3B model script. --disable-bias-linear @@ -39,7 +39,7 @@ --lora-rank 32 --lora-alpha 32 --lora-dropout 0.0 ---target-modules linear_qkv,linear_proj +--target-modules linear_qkv,linear_proj,linear_fc1,linear_fc2 # Data and rollout --prompt-data /data/gsm8k/train.parquet diff --git a/miles/recipes/tests/qwen3-30b-a3b-lora-greedy-debug.args b/miles/recipes/tests/qwen3-30b-a3b-lora-greedy-debug.args index 351f4ed..36972be 100644 --- a/miles/recipes/tests/qwen3-30b-a3b-lora-greedy-debug.args +++ b/miles/recipes/tests/qwen3-30b-a3b-lora-greedy-debug.args @@ -1,6 +1,6 @@ # Qwen3-30B-A3B bridge-mode LoRA recipe narrowed for faster Modal debugging. -# This keeps attention-only LoRA targets, trims rollout pressure, and uses -# deterministic top-k=1 decoding so the next run quickly isolates rollout bugs. +# This intentionally keeps attention-only LoRA as a diagnostic control, trims +# rollout pressure, and uses deterministic top-k=1 decoding to isolate bugs. # Model architecture from the upstream Miles Qwen3-30B-A3B model script. 
--disable-bias-linear From 491525918f4782dd92423a7d182fd2abb255ad65 Mon Sep 17 00:00:00 2001 From: Jason Mancuso <7891333+jvmncs@users.noreply.github.com> Date: Fri, 27 Mar 2026 12:44:10 -0400 Subject: [PATCH 4/5] Validate non-colocated Qwen3 LoRA on Modal --- miles/README.md | 15 ++ miles/modal_patches/sitecustomize.py | 208 +++++++++++++++++++++++++++ miles/modal_train.py | 102 ++++++++++++- 3 files changed, 322 insertions(+), 3 deletions(-) diff --git a/miles/README.md b/miles/README.md index 4902222..26bd650 100644 --- a/miles/README.md +++ b/miles/README.md @@ -76,6 +76,12 @@ Qwen3-30B-A3B few-step all-layer validation: MILES_N_NODES=1 modal run miles/modal_train.py --recipe qwen3-30b-a3b-lora-fewstep ``` +Qwen3-30B-A3B few-step all-layer validation in non-colocated mode: + +```bash +MILES_N_NODES=2 modal run miles/modal_train.py --recipe qwen3-30b-a3b-lora-fewstep --no-colocate --actor-nodes 1 --allow-cluster-mismatch +``` + Qwen3-30B-A3B expert-target LoRA follow-up: ```bash @@ -106,6 +112,10 @@ MILES_N_NODES=1 modal run miles/modal_train.py --recipe qwen3-30b-a3b-experts-fe - The baked Qwen3 recipes are single-node `H100:8` shapes. They are intended to validate end-to-end bridge-mode LoRA with colocated rollout first, not to exhaustively cover every parallelism combination. +- The Modal wrapper now also supports a non-colocated split for experimentation: + total cluster size still comes from `MILES_N_NODES`, while `--actor-nodes` + controls how many of those nodes are reserved for Megatron training and the + remaining GPUs default to rollout. - Source inspection suggests the training path should handle TP / PP / EP / CP because the bridge setup forwards all of those settings into Megatron-Bridge, and Megatron-Bridge's PEFT tests cover pipeline-style model chunk lists. 
That @@ -136,6 +146,11 @@ What the Modal runs have validated so far on `modal-labs`: `linear_qkv`, `linear_proj`, `linear_fc1`, and `linear_fc2`, injected expert modules under `decoder.layers.*.mlp.experts.*`, completed rollout collection, and reached at least `train/step` 1 on a single-node `H100:8` shape. +- The same all-layer few-step recipe now also has non-colocated runtime + validation on `modal-labs` with a 2-node split: 1 node for actor training and + 1 node for rollout (`--no-colocate --actor-nodes 1`). After forcing the Miles + router to advertise the cluster head IP and patching distributed LoRA sync to + export bridge adapters directly, the run reached at least `train/step` 2. - The attention-only debug/control recipe also works, but it is no longer the recommended configuration after comparing against the Thinking Machines guidance. diff --git a/miles/modal_patches/sitecustomize.py b/miles/modal_patches/sitecustomize.py index 0662644..0b47f4b 100644 --- a/miles/modal_patches/sitecustomize.py +++ b/miles/modal_patches/sitecustomize.py @@ -408,8 +408,216 @@ def sampling_from_probs_torch( _log("[miles-modal] patched SGLang sampling to sanitize invalid probability rows") +def _patch_distributed_lora_sync() -> None: + try: + import base64 + import io + import torch + from miles.backends.megatron_utils.lora_utils import ( + LORA_ADAPTER_NAME, + build_lora_sync_config, + is_lora_weight_name, + ) + from miles.backends.megatron_utils.update_weight import ( + update_weight_from_distributed as distributed_mod, + update_weight_from_tensor as tensor_mod, + ) + from miles.backends.megatron_utils.update_weight.hf_weight_iterator_base import ( + HfWeightIteratorBase, + ) + except Exception as exc: + _log( + "[miles-modal] distributed LoRA sync patch unavailable: " + f"{type(exc).__name__}: {exc}" + ) + return + + UpdateWeightFromDistributed = getattr( + distributed_mod, "UpdateWeightFromDistributed", None + ) + if UpdateWeightFromDistributed is None: + 
_log("[miles-modal] distributed LoRA sync patch missing updater class") + return + + original_init = getattr(UpdateWeightFromDistributed, "__init__", None) + original_update_weights = getattr(UpdateWeightFromDistributed, "update_weights", None) + if original_init is None or original_update_weights is None: + _log("[miles-modal] distributed LoRA sync patch missing target methods") + return + + if getattr(original_update_weights, "__module__", "") == __name__: + _log("[miles-modal] distributed LoRA sync patch already present") + return + + ray = distributed_mod.ray + dist = distributed_mod.dist + get_gloo_group = distributed_mod.get_gloo_group + post_process_weights = distributed_mod.post_process_weights + FlattenedTensorBucket = tensor_mod.FlattenedTensorBucket + MultiprocessingSerializer = tensor_mod.MultiprocessingSerializer + check_results = tensor_mod._check_weight_sync_results + + def _serialize_lora_named_tensors(named_tensors): + flattened_tensor_bucket = FlattenedTensorBucket(named_tensors=named_tensors) + flattened_tensor = flattened_tensor_bucket.get_flattened_tensor() + if not isinstance(flattened_tensor, torch.Tensor): + raise TypeError( + "Expected LoRA flattened tensor to be a torch.Tensor, got " + f"{type(flattened_tensor).__name__}" + ) + if flattened_tensor.is_cuda: + flattened_tensor = flattened_tensor.detach().cpu() + buffer = io.BytesIO() + torch.save(flattened_tensor.contiguous(), buffer) + flattened_tensor_data = { + "_miles_modal_format": "torch_save_flattened_lora_v2", + "flattened_tensor_torch_save_b64": base64.b64encode(buffer.getvalue()).decode( + "ascii" + ), + "metadata": [ + { + "name": meta.name, + "shape": list(meta.shape), + "dtype": str(meta.dtype).removeprefix("torch."), + "start_idx": meta.start_idx, + "end_idx": meta.end_idx, + "numel": meta.numel, + } + for meta in flattened_tensor_bucket.get_metadata() + ], + } + return MultiprocessingSerializer.serialize(flattened_tensor_data, output_str=True) + + def __init__( + self, + args, 
+ model, + weights_getter, + *, + model_name, + quantization_config, + is_lora=False, + ): + original_init( + self, + args, + model, + weights_getter, + model_name=model_name, + quantization_config=quantization_config, + is_lora=is_lora, + ) + self.weights_getter = weights_getter + self.is_lora = is_lora + self._lora_loaded = False + if self.is_lora: + self._hf_weight_iterator = HfWeightIteratorBase.create( + args=args, + model=model, + model_name=model_name, + quantization_config=quantization_config, + is_lora=True, + ) + self._lora_config = build_lora_sync_config(args) + else: + self._hf_weight_iterator = None + self._lora_config = None + + @torch.no_grad() + def update_weights(self): + if not getattr(self, "is_lora", False): + return original_update_weights(self) + + self.weight_version += 1 + rank = dist.get_rank() + + if rank == 0: + ray.get([engine.pause_generation.remote() for engine in self.rollout_engines]) + ray.get([engine.flush_cache.remote() for engine in self.rollout_engines]) + if self.quantization_config and self.quantization_config["quant_method"] in [ + "compressed-tensors" + ]: + post_process_weights( + restore_weights_before_load=True, + post_process_quantization=False, + rollout_engines=self.rollout_engines, + ) + dist.barrier(group=get_gloo_group()) + + megatron_local_weights = self.weights_getter() if self.weights_getter else {} + all_lora_named_tensors = [] + sync_chunk_count = 0 + + for hf_named_tensors in self._hf_weight_iterator.get_hf_weight_chunks( + megatron_local_weights + ): + lora_named_tensors = [ + (name, tensor) + for name, tensor in hf_named_tensors + if is_lora_weight_name(name) + ] + if not lora_named_tensors: + continue + + sync_chunk_count += 1 + if self._is_pp_src_rank: + all_lora_named_tensors.extend( + (name, tensor.detach().cpu()) + for name, tensor in lora_named_tensors + ) + + if self._is_pp_src_rank: + if sync_chunk_count == 0: + raise RuntimeError( + "Distributed LoRA weight sync failed: bridge export produced zero " 
+ "LoRA chunks." + ) + + serialized_tensors = _serialize_lora_named_tensors(all_lora_named_tensors) + if self._lora_loaded: + ray.get( + [ + engine.unload_lora_adapter.remote(lora_name=LORA_ADAPTER_NAME) + for engine in self.rollout_engines + ] + ) + + results = ray.get( + [ + engine.load_lora_adapter_from_tensors.remote( + lora_name=LORA_ADAPTER_NAME, + config_dict=self._lora_config, + serialized_tensors=serialized_tensors, + load_format="flattened_bucket", + ) + for engine in self.rollout_engines + ] + ) + check_results(results, is_lora=True) + self._lora_loaded = True + + dist.barrier(group=get_gloo_group()) + if rank == 0: + if self.quantization_config and self.quantization_config["quant_method"] in [ + "compressed-tensors", + "mxfp8", + ]: + post_process_weights( + restore_weights_before_load=False, + post_process_quantization=True, + rollout_engines=self.rollout_engines, + ) + ray.get([engine.continue_generation.remote() for engine in self.rollout_engines]) + dist.barrier(group=get_gloo_group()) + + UpdateWeightFromDistributed.__init__ = __init__ + UpdateWeightFromDistributed.update_weights = update_weights + _log("[miles-modal] patched distributed LoRA sync to export bridge adapters directly") + + _register_linear_cross_entropy_module() _patch_lora_cpu_serialization() _patch_sglang_lora_numpy_rehydration() _patch_sglang_logprob_sanitization() _patch_sglang_sampling_probability_sanitization() +_patch_distributed_lora_sync() diff --git a/miles/modal_train.py b/miles/modal_train.py index 9384fbb..70b35b6 100644 --- a/miles/modal_train.py +++ b/miles/modal_train.py @@ -150,11 +150,17 @@ def _build_enforced_args( *, model_path: str, cluster_nodes: int, + actor_nodes: int, gpus_per_node: int, checkpoint_dir: pathlib.Path, custom_config_path: Optional[str], wandb_key: Optional[str], + colocate: bool, + rollout_num_gpus: Optional[int], ) -> list[str]: + if actor_nodes < 1: + raise ValueError(f"actor_nodes must be >= 1, got {actor_nodes}") + args = [ 
"--train-backend", "megatron", @@ -165,13 +171,20 @@ def _build_enforced_args( "--save", checkpoint_dir.as_posix(), "--actor-num-nodes", - str(cluster_nodes), + str(actor_nodes), "--actor-num-gpus-per-node", str(gpus_per_node), "--num-gpus-per-node", str(gpus_per_node), - "--colocate", ] + if colocate: + args.append("--colocate") + else: + if rollout_num_gpus is None or rollout_num_gpus < 1: + raise ValueError( + "rollout_num_gpus must be >= 1 when launching non-colocated rollout." + ) + args.extend(["--rollout-num-gpus", str(rollout_num_gpus)]) if custom_config_path: args.extend(["--custom-config-path", custom_config_path]) if wandb_key: @@ -184,26 +197,67 @@ def _build_miles_argv( *, model_path: str, cluster_nodes: int, + actor_nodes: int, gpus_per_node: int, checkpoint_dir: pathlib.Path, extra_args_text: str, custom_config_path: Optional[str], wandb_key: Optional[str], remote_recipe: bool, + colocate: bool, + rollout_num_gpus: Optional[int], ) -> list[str]: recipe_args = _parse_arg_text(_load_recipe_text(recipe, remote=remote_recipe)) extra_args = _parse_arg_text(extra_args_text) enforced_args = _build_enforced_args( model_path=model_path, cluster_nodes=cluster_nodes, + actor_nodes=actor_nodes, gpus_per_node=gpus_per_node, checkpoint_dir=checkpoint_dir, custom_config_path=custom_config_path, wandb_key=wandb_key, + colocate=colocate, + rollout_num_gpus=rollout_num_gpus, ) return ["python3", REMOTE_TRAIN_SCRIPT.as_posix(), *recipe_args, *extra_args, *enforced_args] +def _resolve_actor_nodes(cluster_nodes: int, *, colocate: bool, actor_nodes: int) -> int: + if colocate: + return cluster_nodes + if actor_nodes > 0: + return actor_nodes + if cluster_nodes < 2: + raise ValueError( + "Non-colocated rollout needs spare cluster capacity. " + "Set MILES_N_NODES>=2 or pass --colocate." 
+ ) + return cluster_nodes - 1 + + +def _resolve_rollout_num_gpus( + cluster_nodes: int, + *, + actor_nodes: int, + gpus_per_node: int, + colocate: bool, + rollout_num_gpus: int, +) -> Optional[int]: + if colocate: + return None + if rollout_num_gpus > 0: + return rollout_num_gpus + spare_gpus = (cluster_nodes - actor_nodes) * gpus_per_node + if spare_gpus < 1: + raise ValueError( + "Non-colocated rollout needs spare GPUs after reserving actor nodes. " + f"cluster_nodes={cluster_nodes}, actor_nodes={actor_nodes}, " + f"gpus_per_node={gpus_per_node}" + ) + return spare_gpus + + def _read_optional_file(path_str: str) -> str: if not path_str: return "" @@ -213,6 +267,7 @@ def _read_optional_file(path_str: str) -> str: def _build_runtime_env(master_addr: str, wandb_key: Optional[str]) -> dict: env_vars = { "MASTER_ADDR": master_addr, + "MILES_HOST_IP": master_addr, "no_proxy": master_addr, "PYTHONPATH": f"{REMOTE_PATCH_DIR.as_posix()}:/root/Megatron-LM", "CUDA_DEVICE_MAX_CONNECTIONS": "1", @@ -365,6 +420,9 @@ async def submit_training( extra_args_text: str = "", custom_config_yaml: str = "", wandb_key: str = "", + actor_nodes: int | None = None, + colocate: bool = True, + rollout_num_gpus: int | None = None, ) -> dict: self._ensure_ray_started() @@ -389,23 +447,32 @@ async def submit_training( custom_config_path = f"/tmp/{recipe.name}-{run_id}-overrides.yaml" pathlib.Path(custom_config_path).write_text(custom_config_yaml) + resolved_actor_nodes = actor_nodes if actor_nodes is not None else CLUSTER_NODES + resolved_rollout_num_gpus = rollout_num_gpus argv = _build_miles_argv( recipe, model_path=model_path, cluster_nodes=CLUSTER_NODES, + actor_nodes=resolved_actor_nodes, gpus_per_node=gpus_per_node, checkpoint_dir=checkpoint_dir, extra_args_text=extra_args_text, custom_config_path=custom_config_path, wandb_key=wandb_key or None, remote_recipe=True, + colocate=colocate, + rollout_num_gpus=resolved_rollout_num_gpus, ) entrypoint = shlex.join(argv) runtime_env = 
_build_runtime_env(self.main_addr, wandb_key or None) print(f"Recipe: {recipe.name}") print(f"Model: {recipe.model_id}") - print(f"Nodes: {CLUSTER_NODES}") + print(f"Cluster nodes: {CLUSTER_NODES}") + print(f"Actor nodes: {resolved_actor_nodes}") + print(f"Colocate: {colocate}") + if not colocate: + print(f"Rollout GPUs: {resolved_rollout_num_gpus}") print(f"GPUs per node: {gpus_per_node}") print(f"Checkpoint dir: {checkpoint_dir}") print(f"Entrypoint: {entrypoint}") @@ -484,6 +551,9 @@ def main( list_recipes: bool = False, dry_run: bool = False, allow_cluster_mismatch: bool = False, + colocate: bool = True, + actor_nodes: int = 0, + rollout_num_gpus: int = 0, ): if list_recipes: _print_recipe_table() @@ -492,6 +562,18 @@ def main( selected_recipe = _get_recipe(recipe) selected_gpu = gpu or selected_recipe.gpu gpus_per_node = _parse_gpus_per_node(selected_gpu) + resolved_actor_nodes = _resolve_actor_nodes( + CLUSTER_NODES, + colocate=colocate, + actor_nodes=actor_nodes, + ) + resolved_rollout_num_gpus = _resolve_rollout_num_gpus( + CLUSTER_NODES, + actor_nodes=resolved_actor_nodes, + gpus_per_node=gpus_per_node, + colocate=colocate, + rollout_num_gpus=rollout_num_gpus, + ) if ( not allow_cluster_mismatch @@ -515,16 +597,23 @@ def main( selected_recipe, model_path="$MODEL_PATH", cluster_nodes=CLUSTER_NODES, + actor_nodes=resolved_actor_nodes, gpus_per_node=gpus_per_node, checkpoint_dir=checkpoint_dir, extra_args_text=merged_extra_args, custom_config_path="/tmp/custom-config.yaml" if custom_config_yaml else None, wandb_key="$WANDB_API_KEY" if wandb_key else None, remote_recipe=False, + colocate=colocate, + rollout_num_gpus=resolved_rollout_num_gpus, ) print(f"Recipe: {selected_recipe.name}") print(f"Model: {selected_recipe.model_id}") print(f"Cluster nodes: {CLUSTER_NODES}") + print(f"Actor nodes: {resolved_actor_nodes}") + print(f"Colocate: {colocate}") + if not colocate: + print(f"Rollout GPUs: {resolved_rollout_num_gpus}") print(f"GPU: {selected_gpu}") 
print(shlex.join(argv)) return @@ -532,6 +621,10 @@ def main( print(f"Recipe: {selected_recipe.name}") print(f"Model: {selected_recipe.model_id}") print(f"Cluster nodes: {CLUSTER_NODES}") + print(f"Actor nodes: {resolved_actor_nodes}") + print(f"Colocate: {colocate}") + if not colocate: + print(f"Rollout GPUs: {resolved_rollout_num_gpus}") print(f"GPU: {selected_gpu}") cluster = MilesCluster.with_options(gpu=selected_gpu)() @@ -541,5 +634,8 @@ def main( extra_args_text=merged_extra_args, custom_config_yaml=custom_config_yaml, wandb_key=wandb_key, + actor_nodes=resolved_actor_nodes, + colocate=colocate, + rollout_num_gpus=resolved_rollout_num_gpus, ) print(result) From bada71606c9158fcdb6eeb4f38f738225e7f3c37 Mon Sep 17 00:00:00 2001 From: Jason Mancuso <7891333+jvmncs@users.noreply.github.com> Date: Mon, 30 Mar 2026 15:25:19 -0400 Subject: [PATCH 5/5] Refactor Miles Modal launcher and recipe argument plumbing --- miles/README.md | 214 ++++++++--------------- miles/modal_train.py | 345 +++++++++++++++++++++----------------- miles/recipes/__init__.py | 29 ++++ miles/recipes/util.py | 122 ++++++++++++++ 4 files changed, 416 insertions(+), 294 deletions(-) create mode 100644 miles/recipes/__init__.py create mode 100644 miles/recipes/util.py diff --git a/miles/README.md b/miles/README.md index 26bd650..f01f58b 100644 --- a/miles/README.md +++ b/miles/README.md @@ -1,191 +1,121 @@ -# Miles example +# Miles on Modal -Multi-node Miles RL training on Modal using the same Ray bootstrap pattern as -[`ray/modal_train.py`](../ray/modal_train.py), but with Miles recipes stored as -native CLI flag files instead of Python config classes. +Run Miles RL training on Modal with recipe files stored under [`recipes/`](./recipes/). +The wrapper handles Modal and Ray orchestration; model and training flags stay in +recipe arg files. ## Prerequisites - A Modal account with multi-node access. -- A `huggingface-secret` Modal secret containing `HF_TOKEN` for gated model - downloads. 
-- Optionally export `WANDB_API_KEY` in your local shell before `modal run` to - auto-enable Weights & Biases logging for training runs. +- A `huggingface-secret` Modal secret containing `HF_TOKEN`. +- Optional: `WANDB_API_KEY` in your local shell for Weights & Biases logging. +- Optional: `modal deploy miles/modal_train.py`. The local entrypoint will try + the deployed `MilesCluster` first and fall back to an ephemeral app if it is + not deployed. -## Recipes +## Prepare Shared Assets -List the built-in recipes: +Prepare the default GSM8K dataset: ```bash -modal run miles/modal_train.py --list-recipes +modal run miles/modal_train.py::prepare_dataset ``` -Current recipes: - -- `qwen25-0p5b-lora`: single-node smoke test adapted from the upstream Miles - LoRA example. -- `qwen3-30b-a3b-lora`: default Qwen3-30B-A3B all-layer recipe, targeting - attention plus MLP/MoE layers (`linear_qkv`, `linear_proj`, `linear_fc1`, - `linear_fc2`). -- `qwen3-30b-a3b-lora-fewstep`: trimmed all-layer recipe that is intended to - prove a few full RL updates on Modal. -- `qwen3-30b-a3b-experts-lora`: explicit all-layer alias that makes the expert - `linear_fc1` / `linear_fc2` targeting obvious in the name. -- `qwen3-30b-a3b-experts-fewstep`: trimmed explicit all-layer alias built from - the working few-step shape. +Download a model for a built-in recipe: -Testing/debug recipe files live under [`recipes/tests/`](./recipes/tests). -The attention-only recipe is kept only as -[`qwen3-30b-a3b-lora-greedy-debug`](./recipes/tests/qwen3-30b-a3b-lora-greedy-debug.args), -as a diagnostic control rather than a recommended training setup. 
- -## Prepare assets +```bash +modal run miles/modal_train.py::download_model --recipe qwen3-30b-a3b-lora +``` -Prepare a small GSM8K dataset in the shared volume: +Or download any model directly: ```bash -modal run miles/modal_train.py::prepare_dataset +modal run miles/modal_train.py::download_model --model-id Qwen/Qwen3-30B-A3B ``` -Download a recipe's base model into the shared Hugging Face cache: +## Recipes + +List the available recipes: ```bash -modal run miles/modal_train.py::download_model --recipe qwen3-30b-a3b-lora +modal run miles/modal_train.py --list-recipes ``` +Recommended starting points: + +- `qwen3-30b-a3b-lora`: default Qwen3 recipe. +- `qwen3-30b-a3b-lora-fewstep`: smallest end-to-end Qwen3 validation recipe. +- `qwen3-30b-a3b-experts-lora`: explicit expert-target variant. +- `qwen3-30b-a3b-experts-fewstep`: trimmed expert-target validation recipe. +- `qwen25-0p5b-lora`: small smoke test. + +Testing and debug recipes live under [`recipes/tests/`](./recipes/tests). + ## Train -The cluster size is chosen at import time by `MILES_N_NODES`, so set it in the -same shell invocation as `modal run`. +Set `MILES_N_NODES` in the same shell invocation as `modal run`. 
-Single-node smoke test: +Single-node Qwen3 few-step validation: ```bash -MILES_N_NODES=1 modal run miles/modal_train.py --recipe qwen25-0p5b-lora +MILES_N_NODES=1 modal run miles/modal_train.py --recipe qwen3-30b-a3b-lora-fewstep ``` -Qwen3-30B-A3B baseline LoRA validation: +Single-node Qwen3 default recipe: ```bash MILES_N_NODES=1 modal run miles/modal_train.py --recipe qwen3-30b-a3b-lora ``` -Qwen3-30B-A3B few-step all-layer validation: +Single-node expert-target follow-up: ```bash -MILES_N_NODES=1 modal run miles/modal_train.py --recipe qwen3-30b-a3b-lora-fewstep +MILES_N_NODES=1 modal run miles/modal_train.py --recipe qwen3-30b-a3b-experts-fewstep ``` -Qwen3-30B-A3B few-step all-layer validation in non-colocated mode: +Non-colocated Qwen3 validation: ```bash -MILES_N_NODES=2 modal run miles/modal_train.py --recipe qwen3-30b-a3b-lora-fewstep --no-colocate --actor-nodes 1 --allow-cluster-mismatch +MILES_N_NODES=2 modal run miles/modal_train.py \ + --recipe qwen3-30b-a3b-lora-fewstep \ + --no-colocate \ + --actor-nodes 1 \ + --allow-cluster-mismatch ``` -Qwen3-30B-A3B expert-target LoRA follow-up: +Small smoke test: ```bash -MILES_N_NODES=1 modal run miles/modal_train.py --recipe qwen3-30b-a3b-experts-lora +MILES_N_NODES=1 modal run miles/modal_train.py --recipe qwen25-0p5b-lora ``` -Qwen3-30B-A3B expert-target few-step validation: +## Ad Hoc Runs + +You can launch without a predefined recipe by passing args directly: ```bash -MILES_N_NODES=1 modal run miles/modal_train.py --recipe qwen3-30b-a3b-experts-fewstep +MILES_N_NODES=1 modal run miles/modal_train.py \ + --model-id Qwen/Qwen3-30B-A3B \ + --args-file miles/recipes/qwen3-30b-a3b-lora.args \ + --extra-args "--train-samples 8 --eval-interval 1" \ + --run-name qwen3-adhoc ``` -## Qwen3 Notes - -- Start with standard LoRA, not DoRA. Miles' current rollout sync and adapter - filtering are LoRA-specific and keyed off `lora_A` / `lora_B` names, so DoRA - is not the first validation target. 
-- The default Qwen3 recipes now follow the recommendations from Thinking - Machines' “LoRA Without Regret”: keep the standard `alpha=32` / `1/r` - parameterization, use a LoRA LR around 10x the FullFT baseline (`1e-5` here - vs. the upstream Qwen3-30B-A3B FullFT `1e-6`), and include the MLP/MoE layers - rather than using attention-only LoRA. -- One MoE-specific nuance from the article is not exposed cleanly by the current - Miles recipe surface: their Qwen3 MoE experiments scale per-expert LoRA rank - by the number of active experts. Our recipes currently use a uniform - `--lora-rank 32` across all targeted modules because Miles exposes one global - LoRA rank, not per-module or per-expert ranks. -- The baked Qwen3 recipes are single-node `H100:8` shapes. They are intended to - validate end-to-end bridge-mode LoRA with colocated rollout first, not to - exhaustively cover every parallelism combination. -- The Modal wrapper now also supports a non-colocated split for experimentation: - total cluster size still comes from `MILES_N_NODES`, while `--actor-nodes` - controls how many of those nodes are reserved for Megatron training and the - remaining GPUs default to rollout. -- Source inspection suggests the training path should handle TP / PP / EP / CP - because the bridge setup forwards all of those settings into Megatron-Bridge, - and Megatron-Bridge's PEFT tests cover pipeline-style model chunk lists. That - is still weaker than an actual Miles e2e validation for each shape. -- Miles currently supports LoRA weight sync only for colocated rollout engines. - Distributed non-colocated rollout sync is not yet implemented for LoRA. -- The baseline Qwen3 recipe stays close to the upstream Miles single-node - Qwen3-30B-A3B shape while using all-layer LoRA. The explicit expert-target - recipe names are kept mainly for clarity and backwards compatibility. 
- -## Observed On Modal - -The current wrapper includes runtime patches in -[`modal_patches/sitecustomize.py`](./modal_patches/sitecustomize.py) that: - -- register Megatron-Bridge's `LinearCrossEntropyModule` as column-parallel - before Hugging Face weights are loaded, which fixes bridge-mode Qwen3 load on - `output_layer.weight`; -- serialize colocated LoRA weight buckets in a builtins-only format and - rehydrate them inside SGLang, which fixes the Modal colocated LoRA sync path; -- sanitize non-finite SGLang logprob values before JSON serialization; -- sanitize invalid SGLang sampling probability rows before `torch.multinomial`. - -What the Modal runs have validated so far on `modal-labs`: - -- The all-layer Qwen3-30B-A3B LoRA shape now has runtime validation on Modal. - In recent detached runs of the few-step recipe shape, Miles created LoRA with - `linear_qkv`, `linear_proj`, `linear_fc1`, and `linear_fc2`, injected expert - modules under `decoder.layers.*.mlp.experts.*`, completed rollout collection, - and reached at least `train/step` 1 on a single-node `H100:8` shape. -- The same all-layer few-step recipe now also has non-colocated runtime - validation on `modal-labs` with a 2-node split: 1 node for actor training and - 1 node for rollout (`--no-colocate --actor-nodes 1`). After forcing the Miles - router to advertise the cluster head IP and patching distributed LoRA sync to - export bridge adapters directly, the run reached at least `train/step` 2. -- The attention-only debug/control recipe also works, but it is no longer the - recommended configuration after comparing against the Thinking Machines - guidance. -- The remaining instability has been in the colocated SGLang rollout path, not - in LoRA target discovery. The main concrete runtime failures we hit were: - non-finite logprobs breaking HTTP JSON serialization, and invalid sampling - probability tensors breaking `torch.multinomial`. 
- -Current interpretation: - -- Qwen3-30B-A3B MoE LoRA support in Miles is real enough to instantiate, - target, load, and export adapters for both attention and expert MLP layers. -- Attention-only Qwen3-30B-A3B LoRA is still runtime-validated as a debug - control on `modal-labs`, but it is no longer the recommended default. -- The remaining risk is concentrated in the colocated SGLang rollout lifecycle, - which is coupled to `offload_rollout` / `enable_memory_saver=True` in the - current Miles SGLang engine setup, especially once expert-target LoRA is - enabled. - -Useful options: - -- `--dry-run`: print the assembled Miles command with a `$MODEL_PATH` - placeholder without launching the cluster. -- `--extra-args "...flags..."`: append ad hoc Miles CLI overrides. -- `--extra-args-file path/to/file.args`: append overrides from a local text - file. -- `--custom-config path/to/overrides.yaml`: pass a flat YAML override map to - Miles via `--custom-config-path`. -- `--allow-cluster-mismatch`: bypass recipe/node-count validation if you are - intentionally adapting a canned recipe. -- `USE_LOCAL_MILES=/path/to/miles`: overlay a local Miles checkout on top of - the pinned container image. -- `MILES_IMAGE=radixark/miles:...`: override the pinned image tag. The current - default is `radixark/miles:dev-202603231227`. - -The wrapper intentionally owns only Modal/Ray plumbing plus a small set of -cluster-critical flags. All model and training settings live in -[`miles/recipes/`](./recipes/). +## Useful Options + +- `--dry-run`: print the assembled Miles command without launching a job. +- `--args` / `--args-file`: provide the base Miles CLI args. +- `--extra-args` / `--extra-args-file`: append overrides to a recipe or ad hoc run. +- `--custom-config`: pass a YAML override file through to Miles. +- `--run-name`: override the checkpoint subdirectory name. +- `--allow-cluster-mismatch`: bypass recipe node-count checks. 
+- `USE_LOCAL_MILES=/path/to/miles`: overlay a local Miles checkout. +- `MILES_IMAGE=radixark/miles:...`: override the pinned container image. + +## Notes + +- The default Qwen3 recipes use standard all-layer LoRA over + `linear_qkv`, `linear_proj`, `linear_fc1`, and `linear_fc2`. +- Start with the few-step recipes when validating a new environment. +- Modal-specific runtime compatibility patches live in + [`modal_patches/sitecustomize.py`](./modal_patches/sitecustomize.py). diff --git a/miles/modal_train.py b/miles/modal_train.py index 70b35b6..97fb3e7 100644 --- a/miles/modal_train.py +++ b/miles/modal_train.py @@ -1,27 +1,38 @@ -""" -Thin Modal launcher for multi-node Miles training. - -Design: -- Bootstrap the Ray cluster once in modal.enter() inside a clustered modal.Cls. -- Submit the actual Miles job from a modal.method() on rank 0. -- Keep Miles recipes as native CLI flag files under miles/recipes/. -- Own only infrastructure-critical flags in Python: - cluster size, GPUs per node, model path resolution, checkpoint path, and - optional YAML override transport. +"""Launch Miles training jobs on Modal. + +This module defines the clustered `MilesCluster` launcher, helper functions for +model download and dataset preparation, and a local CLI entrypoint for +submitting runs. + +It supports two execution modes: + - deployed: submit runs to a deployed `MilesCluster` with a fixed compute shape + - ephemeral: launch a one-off app directly from the local entrypoint + +The Python wrapper owns only Modal and Ray orchestration plus a small set of +infrastructure-critical flags. Model and training arguments live in `recipes/`. 
""" import datetime as dt import os import pathlib +import re import shlex import subprocess import time -from dataclasses import dataclass from typing import Optional import modal import modal.experimental +from recipes import ( + Recipe, + format_recipe_table, + get_optional_recipe, + load_recipe_text, + merge_arg_texts, + parse_arg_text, + read_arg_file, +) here = pathlib.Path(__file__).parent.resolve() @@ -34,7 +45,7 @@ HF_CACHE_PATH = pathlib.Path("/root/.cache/huggingface") DATA_PATH = pathlib.Path("/data") CHECKPOINTS_PATH = pathlib.Path("/checkpoints") -REMOTE_RECIPES_DIR = pathlib.Path("/root/miles-recipes") +REMOTE_RECIPES_DIR = pathlib.Path("/root/recipes") REMOTE_PATCH_DIR = pathlib.Path("/root/miles-modal-patches") REMOTE_MILES_DIR = pathlib.Path("/root/miles") REMOTE_TRAIN_SCRIPT = REMOTE_MILES_DIR / "train.py" @@ -49,75 +60,6 @@ ) -@dataclass(frozen=True) -class Recipe: - name: str - description: str - model_id: str - args_file: str - recommended_nodes: int - gpu: str - - -RECIPES = { - "qwen25-0p5b-lora": Recipe( - name="qwen25-0p5b-lora", - description="Single-node smoke test adapted from the upstream Miles LoRA example.", - model_id="Qwen/Qwen2.5-0.5B-Instruct", - args_file="tests/qwen25-0p5b-lora.args", - recommended_nodes=1, - gpu="H100:8", - ), - "qwen3-30b-a3b-lora": Recipe( - name="qwen3-30b-a3b-lora", - description="Single-node Qwen3-30B-A3B all-layer bridge-mode LoRA recipe aligned with current best practices.", - model_id="Qwen/Qwen3-30B-A3B", - args_file="qwen3-30b-a3b-lora.args", - recommended_nodes=1, - gpu="H100:8", - ), - "qwen3-30b-a3b-lora-fewstep": Recipe( - name="qwen3-30b-a3b-lora-fewstep", - description="Single-node Qwen3-30B-A3B all-layer LoRA recipe trimmed to chase a few full RL steps.", - model_id="Qwen/Qwen3-30B-A3B", - args_file="tests/qwen3-30b-a3b-lora-fewstep.args", - recommended_nodes=1, - gpu="H100:8", - ), - "qwen3-30b-a3b-lora-greedy-debug": Recipe( - name="qwen3-30b-a3b-lora-greedy-debug", - 
description="Single-node Qwen3-30B-A3B attention-only debug/control recipe with greedy rollout.", - model_id="Qwen/Qwen3-30B-A3B", - args_file="tests/qwen3-30b-a3b-lora-greedy-debug.args", - recommended_nodes=1, - gpu="H100:8", - ), - "qwen3-30b-a3b-experts-lora": Recipe( - name="qwen3-30b-a3b-experts-lora", - description="Explicit all-layer Qwen3-30B-A3B LoRA recipe including expert linear_fc1/fc2 targets.", - model_id="Qwen/Qwen3-30B-A3B", - args_file="qwen3-30b-a3b-experts-lora.args", - recommended_nodes=1, - gpu="H100:8", - ), - "qwen3-30b-a3b-experts-fewstep": Recipe( - name="qwen3-30b-a3b-experts-fewstep", - description="Explicit all-layer Qwen3-30B-A3B few-step recipe including expert linear_fc1/fc2 targets.", - model_id="Qwen/Qwen3-30B-A3B", - args_file="tests/qwen3-30b-a3b-experts-fewstep.args", - recommended_nodes=1, - gpu="H100:8", - ), -} - - -def _get_recipe(name: str) -> Recipe: - if name not in RECIPES: - available = ", ".join(sorted(RECIPES)) - raise ValueError(f"Unknown recipe: {name}. 
Available recipes: {available}") - return RECIPES[name] - - def _parse_gpus_per_node(gpu: str) -> int: try: return int(gpu.rsplit(":", 1)[1]) @@ -127,29 +69,44 @@ def _parse_gpus_per_node(gpu: str) -> int: ) from exc -def _clean_arg_text(arg_text: str) -> str: - lines: list[str] = [] - for raw_line in arg_text.splitlines(): - line = raw_line.split("#", 1)[0].strip() - if line: - lines.append(line) - return "\n".join(lines) +def _resolve_model_id(recipe: Recipe | None, model_id: str) -> str: + if model_id: + return model_id + if recipe: + return recipe.model_id + raise ValueError("Pass --recipe or --model-id.") + + +def _sanitize_path_component(value: str) -> str: + sanitized = re.sub(r"[^A-Za-z0-9._-]+", "-", value).strip("-") + return sanitized or "run" -def _parse_arg_text(arg_text: str) -> list[str]: - cleaned = _clean_arg_text(arg_text) - return shlex.split(cleaned) if cleaned else [] +def _resolve_run_label(recipe: Recipe | None, *, model_id: str, run_name: str) -> str: + if run_name: + return _sanitize_path_component(run_name) + if recipe: + return recipe.name + return _sanitize_path_component(model_id) -def _load_recipe_text(recipe: Recipe, remote: bool = False) -> str: - base_dir = REMOTE_RECIPES_DIR if remote else here / "recipes" - return (base_dir / recipe.args_file).read_text() +def _resolve_base_args_text( + recipe: Recipe | None, + *, + args_text: str, + recipes_dir: pathlib.Path, +) -> str: + parts: list[str] = [] + if recipe: + parts.append(load_recipe_text(recipe, base_dir=recipes_dir)) + if args_text: + parts.append(args_text) + return merge_arg_texts(*parts) def _build_enforced_args( *, model_path: str, - cluster_nodes: int, actor_nodes: int, gpus_per_node: int, checkpoint_dir: pathlib.Path, @@ -193,25 +150,22 @@ def _build_enforced_args( def _build_miles_argv( - recipe: Recipe, *, + base_args_text: str, model_path: str, - cluster_nodes: int, actor_nodes: int, gpus_per_node: int, checkpoint_dir: pathlib.Path, extra_args_text: str, 
custom_config_path: Optional[str], wandb_key: Optional[str], - remote_recipe: bool, colocate: bool, rollout_num_gpus: Optional[int], ) -> list[str]: - recipe_args = _parse_arg_text(_load_recipe_text(recipe, remote=remote_recipe)) - extra_args = _parse_arg_text(extra_args_text) + base_args = parse_arg_text(base_args_text) + extra_args = parse_arg_text(extra_args_text) enforced_args = _build_enforced_args( model_path=model_path, - cluster_nodes=cluster_nodes, actor_nodes=actor_nodes, gpus_per_node=gpus_per_node, checkpoint_dir=checkpoint_dir, @@ -220,10 +174,18 @@ def _build_miles_argv( colocate=colocate, rollout_num_gpus=rollout_num_gpus, ) - return ["python3", REMOTE_TRAIN_SCRIPT.as_posix(), *recipe_args, *extra_args, *enforced_args] + return [ + "python3", + REMOTE_TRAIN_SCRIPT.as_posix(), + *base_args, + *extra_args, + *enforced_args, + ] -def _resolve_actor_nodes(cluster_nodes: int, *, colocate: bool, actor_nodes: int) -> int: +def _resolve_actor_nodes( + cluster_nodes: int, *, colocate: bool, actor_nodes: int +) -> int: if colocate: return cluster_nodes if actor_nodes > 0: @@ -258,12 +220,6 @@ def _resolve_rollout_num_gpus( return spare_gpus -def _read_optional_file(path_str: str) -> str: - if not path_str: - return "" - return pathlib.Path(path_str).read_text() - - def _build_runtime_env(master_addr: str, wandb_key: Optional[str]) -> dict: env_vars = { "MASTER_ADDR": master_addr, @@ -282,21 +238,17 @@ def _build_runtime_env(master_addr: str, wandb_key: Optional[str]) -> dict: def _print_recipe_table(): print("Available recipes:") - for recipe in sorted(RECIPES.values(), key=lambda item: item.name): - print( - f" - {recipe.name}: {recipe.description} " - f"(model={recipe.model_id}, nodes={recipe.recommended_nodes}, gpu={recipe.gpu})" - ) + for line in format_recipe_table(): + print(line) image = ( modal.Image.from_registry(MILES_IMAGE) .entrypoint([]) - .add_local_dir(here / "recipes", remote_path=REMOTE_RECIPES_DIR.as_posix(), copy=True) + .add_local_dir(here 
/ "recipes", remote_path=REMOTE_RECIPES_DIR.as_posix()) .add_local_dir( here / "modal_patches", remote_path=REMOTE_PATCH_DIR.as_posix(), - copy=True, ) ) @@ -318,6 +270,9 @@ def _print_recipe_table(): app = modal.App(APP_NAME) +# ---- Training Cluster Cls ---- # + + @app.cls( image=image, gpu=DEFAULT_GPU, @@ -414,12 +369,15 @@ def _ensure_ray_started(self): @modal.method() async def submit_training( self, - recipe_name: str, + recipe_name: str = "", *, + model_id: str = "", + base_args_text: str = "", gpus_per_node: int, extra_args_text: str = "", custom_config_yaml: str = "", wandb_key: str = "", + run_name: str = "", actor_nodes: int | None = None, colocate: bool = True, rollout_num_gpus: int | None = None, @@ -430,44 +388,65 @@ async def submit_training( while True: time.sleep(10) - recipe = _get_recipe(recipe_name) + recipe = get_optional_recipe(recipe_name) + resolved_model_id = _resolve_model_id(recipe, model_id) + resolved_base_args_text = _resolve_base_args_text( + recipe, + args_text=base_args_text, + recipes_dir=REMOTE_RECIPES_DIR, + ) + if not resolved_base_args_text: + raise ValueError( + "No training args were provided. Choose a recipe or pass --args/--args-file." + ) + run_label = _resolve_run_label( + recipe, + model_id=resolved_model_id, + run_name=run_name, + ) try: - model_path = snapshot_download(repo_id=recipe.model_id, local_files_only=True) + model_path = snapshot_download( + repo_id=resolved_model_id, local_files_only=True + ) except Exception as exc: - raise RuntimeError( - f"Model {recipe.model_id} is not present in the shared HF cache. " + recipe_hint = ( f"Run `modal run miles/modal_train.py::download_model --recipe {recipe.name}` first." + if recipe + else f"Run `modal run miles/modal_train.py::download_model --model-id {resolved_model_id}` first." + ) + raise RuntimeError( + f"Model {resolved_model_id} is not present in the shared HF cache. 
" + f"{recipe_hint}" ) from exc run_id = dt.datetime.utcnow().strftime("%Y%m%d-%H%M%S") - checkpoint_dir = CHECKPOINTS_PATH / recipe.name / run_id + checkpoint_dir = CHECKPOINTS_PATH / run_label / run_id custom_config_path = None if custom_config_yaml: - custom_config_path = f"/tmp/{recipe.name}-{run_id}-overrides.yaml" + custom_config_path = f"/tmp/{run_label}-{run_id}-overrides.yaml" pathlib.Path(custom_config_path).write_text(custom_config_yaml) resolved_actor_nodes = actor_nodes if actor_nodes is not None else CLUSTER_NODES resolved_rollout_num_gpus = rollout_num_gpus argv = _build_miles_argv( - recipe, + base_args_text=resolved_base_args_text, model_path=model_path, - cluster_nodes=CLUSTER_NODES, actor_nodes=resolved_actor_nodes, gpus_per_node=gpus_per_node, checkpoint_dir=checkpoint_dir, extra_args_text=extra_args_text, custom_config_path=custom_config_path, wandb_key=wandb_key or None, - remote_recipe=True, colocate=colocate, rollout_num_gpus=resolved_rollout_num_gpus, ) entrypoint = shlex.join(argv) runtime_env = _build_runtime_env(self.main_addr, wandb_key or None) - print(f"Recipe: {recipe.name}") - print(f"Model: {recipe.model_id}") + print(f"Recipe: {recipe.name if recipe else ''}") + print(f"Model: {resolved_model_id}") + print(f"Run label: {run_label}") print(f"Cluster nodes: {CLUSTER_NODES}") print(f"Actor nodes: {resolved_actor_nodes}") print(f"Colocate: {colocate}") @@ -479,7 +458,9 @@ async def submit_training( with modal.forward(RAY_DASHBOARD_PORT) as tunnel: print(f"Dashboard URL: {tunnel.url}") - job_id = self.client.submit_job(entrypoint=entrypoint, runtime_env=runtime_env) + job_id = self.client.submit_job( + entrypoint=entrypoint, runtime_env=runtime_env + ) print(f"Submitted Ray job: {job_id}") async for line in self.client.tail_job_logs(job_id): @@ -491,11 +472,16 @@ async def submit_training( return { "job_id": job_id, "status": status, - "recipe": recipe.name, + "recipe": recipe.name if recipe else None, + "model_id": resolved_model_id, 
+ "run_name": run_label, "checkpoint_dir": checkpoint_dir.as_posix(), } +# ---- Model Download Utility ---- # + + @app.function( image=image, volumes={HF_CACHE_PATH.as_posix(): hf_cache_volume}, @@ -509,7 +495,12 @@ def download_model( ): from huggingface_hub import snapshot_download - resolved_model_id = model_id or _get_recipe(recipe).model_id + selected_recipe = get_optional_recipe(recipe) + resolved_model_id = model_id or ( + selected_recipe.model_id if selected_recipe else "" + ) + if not resolved_model_id: + raise ValueError("Pass --recipe or --model-id.") hf_cache_volume.reload() path = snapshot_download( repo_id=resolved_model_id, @@ -520,6 +511,9 @@ def download_model( hf_cache_volume.commit() +# ---- Dataset Processing Utility ---- # + + @app.function( image=image, volumes={DATA_PATH.as_posix(): data_volume}, @@ -528,29 +522,47 @@ def download_model( def prepare_dataset( hf_dataset: str = "zhuzilin/gsm8k", data_folder: str = "gsm8k", + train_limit: int = 0, + test_limit: int = 0, ): from datasets import load_dataset data_volume.reload() dataset = load_dataset(hf_dataset) + train_split = dataset["train"] + test_split = dataset["test"] + if train_limit > 0: + train_split = train_split.select(range(min(train_limit, len(train_split)))) + if test_limit > 0: + test_split = test_split.select(range(min(test_limit, len(test_split)))) output_dir = DATA_PATH / data_folder output_dir.mkdir(parents=True, exist_ok=True) - dataset["train"].to_parquet((output_dir / "train.parquet").as_posix()) - dataset["test"].to_parquet((output_dir / "test.parquet").as_posix()) + train_split.to_parquet((output_dir / "train.parquet").as_posix()) + test_split.to_parquet((output_dir / "test.parquet").as_posix()) data_volume.commit() - print(f"Prepared dataset {hf_dataset} under {output_dir}") + print( + f"Prepared dataset {hf_dataset} under {output_dir} " + f"(train_rows={len(train_split)}, test_rows={len(test_split)})" + ) + + +# ---- Local Entrypoint ---- # @app.local_entrypoint() def 
main( - recipe: str = "qwen3-30b-a3b-lora", + recipe: str = "", + model_id: str = "", gpu: str = "", + args: str = "", + args_file: str = "", extra_args: str = "", extra_args_file: str = "", custom_config: str = "", list_recipes: bool = False, dry_run: bool = False, allow_cluster_mismatch: bool = False, + run_name: str = "", colocate: bool = True, actor_nodes: int = 0, rollout_num_gpus: int = 0, @@ -559,8 +571,9 @@ def main( _print_recipe_table() return - selected_recipe = _get_recipe(recipe) - selected_gpu = gpu or selected_recipe.gpu + selected_recipe = get_optional_recipe(recipe) + resolved_model_id = _resolve_model_id(selected_recipe, model_id) + selected_gpu = gpu or (selected_recipe.gpu if selected_recipe else DEFAULT_GPU) gpus_per_node = _parse_gpus_per_node(selected_gpu) resolved_actor_nodes = _resolve_actor_nodes( CLUSTER_NODES, @@ -576,7 +589,8 @@ def main( ) if ( - not allow_cluster_mismatch + selected_recipe + and not allow_cluster_mismatch and CLUSTER_NODES != selected_recipe.recommended_nodes ): raise ValueError( @@ -585,30 +599,44 @@ def main( f"Rerun with the recommended value or pass --allow-cluster-mismatch." ) - merged_extra_args = "\n".join( - part for part in [extra_args, _read_optional_file(extra_args_file)] if part + base_args_text = merge_arg_texts(args, read_arg_file(args_file)) + resolved_base_args_text = _resolve_base_args_text( + selected_recipe, + args_text=base_args_text, + recipes_dir=here / "recipes", ) - custom_config_yaml = _read_optional_file(custom_config) + if not resolved_base_args_text: + raise ValueError( + "No training args were provided. Choose a recipe or pass --args/--args-file." 
+ ) + merged_extra_args = merge_arg_texts(extra_args, read_arg_file(extra_args_file)) + custom_config_yaml = read_arg_file(custom_config) wandb_key = os.environ.get("WANDB_API_KEY", "") - checkpoint_dir = CHECKPOINTS_PATH / selected_recipe.name / "DRY_RUN" + resolved_run_label = _resolve_run_label( + selected_recipe, + model_id=resolved_model_id, + run_name=run_name, + ) + checkpoint_dir = CHECKPOINTS_PATH / resolved_run_label / "DRY_RUN" if dry_run: argv = _build_miles_argv( - selected_recipe, + base_args_text=resolved_base_args_text, model_path="$MODEL_PATH", - cluster_nodes=CLUSTER_NODES, actor_nodes=resolved_actor_nodes, gpus_per_node=gpus_per_node, checkpoint_dir=checkpoint_dir, extra_args_text=merged_extra_args, - custom_config_path="/tmp/custom-config.yaml" if custom_config_yaml else None, + custom_config_path="/tmp/custom-config.yaml" + if custom_config_yaml + else None, wandb_key="$WANDB_API_KEY" if wandb_key else None, - remote_recipe=False, colocate=colocate, rollout_num_gpus=resolved_rollout_num_gpus, ) - print(f"Recipe: {selected_recipe.name}") - print(f"Model: {selected_recipe.model_id}") + print(f"Recipe: {selected_recipe.name if selected_recipe else ''}") + print(f"Model: {resolved_model_id}") + print(f"Run label: {resolved_run_label}") print(f"Cluster nodes: {CLUSTER_NODES}") print(f"Actor nodes: {resolved_actor_nodes}") print(f"Colocate: {colocate}") @@ -618,8 +646,9 @@ def main( print(shlex.join(argv)) return - print(f"Recipe: {selected_recipe.name}") - print(f"Model: {selected_recipe.model_id}") + print(f"Recipe: {selected_recipe.name if selected_recipe else ''}") + print(f"Model: {resolved_model_id}") + print(f"Run label: {resolved_run_label}") print(f"Cluster nodes: {CLUSTER_NODES}") print(f"Actor nodes: {resolved_actor_nodes}") print(f"Colocate: {colocate}") @@ -627,13 +656,25 @@ def main( print(f"Rollout GPUs: {resolved_rollout_num_gpus}") print(f"GPU: {selected_gpu}") - cluster = MilesCluster.with_options(gpu=selected_gpu)() + try: + 
cluster_cls = modal.Cls.from_name(APP_NAME, "MilesCluster") + print(f"Using deployed Modal class: {APP_NAME}.MilesCluster") + except modal.exception.NotFoundError: + cluster_cls = MilesCluster + print( + f"No deployed Modal class found for {APP_NAME}.MilesCluster; using an ephemeral app." + ) + + cluster = cluster_cls.with_options(gpu=selected_gpu)() result = cluster.submit_training.remote( - recipe_name=selected_recipe.name, + recipe_name=selected_recipe.name if selected_recipe else "", + model_id=resolved_model_id, + base_args_text=base_args_text, gpus_per_node=gpus_per_node, extra_args_text=merged_extra_args, custom_config_yaml=custom_config_yaml, wandb_key=wandb_key, + run_name=resolved_run_label, actor_nodes=resolved_actor_nodes, colocate=colocate, rollout_num_gpus=resolved_rollout_num_gpus, diff --git a/miles/recipes/__init__.py b/miles/recipes/__init__.py new file mode 100644 index 0000000..288aa81 --- /dev/null +++ b/miles/recipes/__init__.py @@ -0,0 +1,29 @@ +from .util import ( + RECIPES, + RECIPES_DIR, + Recipe, + clean_arg_text, + format_recipe_table, + get_optional_recipe, + get_recipe, + iter_recipes, + load_recipe_text, + merge_arg_texts, + parse_arg_text, + read_arg_file, +) + +__all__ = [ + "RECIPES", + "RECIPES_DIR", + "Recipe", + "clean_arg_text", + "format_recipe_table", + "get_optional_recipe", + "get_recipe", + "iter_recipes", + "load_recipe_text", + "merge_arg_texts", + "parse_arg_text", + "read_arg_file", +] diff --git a/miles/recipes/util.py b/miles/recipes/util.py new file mode 100644 index 0000000..ec79479 --- /dev/null +++ b/miles/recipes/util.py @@ -0,0 +1,122 @@ +import pathlib +import shlex +from dataclasses import dataclass + + +RECIPES_DIR = pathlib.Path(__file__).parent.resolve() + + +@dataclass(frozen=True) +class Recipe: + name: str + description: str + model_id: str + args_file: str + recommended_nodes: int + gpu: str + + +RECIPES = { + "qwen25-0p5b-lora": Recipe( + name="qwen25-0p5b-lora", + description="Single-node smoke test 
adapted from the upstream Miles LoRA example.", + model_id="Qwen/Qwen2.5-0.5B-Instruct", + args_file="tests/qwen25-0p5b-lora.args", + recommended_nodes=1, + gpu="H100:8", + ), + "qwen3-30b-a3b-lora": Recipe( + name="qwen3-30b-a3b-lora", + description="Single-node Qwen3-30B-A3B all-layer bridge-mode LoRA recipe aligned with current best practices.", + model_id="Qwen/Qwen3-30B-A3B", + args_file="qwen3-30b-a3b-lora.args", + recommended_nodes=1, + gpu="H100:8", + ), + "qwen3-30b-a3b-lora-fewstep": Recipe( + name="qwen3-30b-a3b-lora-fewstep", + description="Single-node Qwen3-30B-A3B all-layer LoRA recipe trimmed to chase a few full RL steps.", + model_id="Qwen/Qwen3-30B-A3B", + args_file="tests/qwen3-30b-a3b-lora-fewstep.args", + recommended_nodes=1, + gpu="H100:8", + ), + "qwen3-30b-a3b-lora-greedy-debug": Recipe( + name="qwen3-30b-a3b-lora-greedy-debug", + description="Single-node Qwen3-30B-A3B attention-only debug/control recipe with greedy rollout.", + model_id="Qwen/Qwen3-30B-A3B", + args_file="tests/qwen3-30b-a3b-lora-greedy-debug.args", + recommended_nodes=1, + gpu="H100:8", + ), + "qwen3-30b-a3b-experts-lora": Recipe( + name="qwen3-30b-a3b-experts-lora", + description="Explicit all-layer Qwen3-30B-A3B LoRA recipe including expert linear_fc1/fc2 targets.", + model_id="Qwen/Qwen3-30B-A3B", + args_file="qwen3-30b-a3b-experts-lora.args", + recommended_nodes=1, + gpu="H100:8", + ), + "qwen3-30b-a3b-experts-fewstep": Recipe( + name="qwen3-30b-a3b-experts-fewstep", + description="Explicit all-layer Qwen3-30B-A3B few-step recipe including expert linear_fc1/fc2 targets.", + model_id="Qwen/Qwen3-30B-A3B", + args_file="tests/qwen3-30b-a3b-experts-fewstep.args", + recommended_nodes=1, + gpu="H100:8", + ), +} + + +def get_recipe(name: str) -> Recipe: + if name not in RECIPES: + available = ", ".join(sorted(RECIPES)) + raise ValueError(f"Unknown recipe: {name}. 
Available recipes: {available}") + return RECIPES[name] + + +def get_optional_recipe(name: str) -> Recipe | None: + if not name: + return None + return get_recipe(name) + + +def iter_recipes() -> list[Recipe]: + return sorted(RECIPES.values(), key=lambda item: item.name) + + +def clean_arg_text(arg_text: str) -> str: + lines: list[str] = [] + for raw_line in arg_text.splitlines(): + line = raw_line.split("#", 1)[0].strip() + if line: + lines.append(line) + return "\n".join(lines) + + +def parse_arg_text(arg_text: str) -> list[str]: + cleaned = clean_arg_text(arg_text) + return shlex.split(cleaned) if cleaned else [] + + +def read_arg_file(path_str: str) -> str: + if not path_str: + return "" + return pathlib.Path(path_str).read_text() + + +def merge_arg_texts(*parts: str) -> str: + return "\n".join(part for part in parts if part and part.strip()) + + +def load_recipe_text(recipe: Recipe, base_dir: pathlib.Path | None = None) -> str: + recipe_dir = base_dir if base_dir is not None else RECIPES_DIR + return (recipe_dir / recipe.args_file).read_text() + + +def format_recipe_table() -> list[str]: + return [ + f" - {recipe.name}: {recipe.description} " + f"(model={recipe.model_id}, nodes={recipe.recommended_nodes}, gpu={recipe.gpu})" + for recipe in iter_recipes() + ]