Skip to content

Commit c36eca6

Browse files
committed
🎉 LLM as a judge to write haikus
1 parent c623a76 commit c36eca6

File tree

12 files changed

+1300
-181
lines changed

12 files changed

+1300
-181
lines changed

README.md

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,19 @@
1414
Well documented examples of running distributed training jobs on [Modal](https://modal.com).
1515
Use this repository to learn how to build distributed training jobs on Modal.
1616

17+
## Example of Async RL using slime on Modal
18+
19+
```
20+
modal profile activate modal-labs
21+
modal config set-environment clairez-dev
22+
modal deploy slime/tests/modal_train.py # once
23+
modal run slime/tests/modal_train.py::prepare # once
24+
modal run slime/tests/modal_train.py::execute
25+
```
26+
<!-- prepare_dataset
27+
download_model
28+
train -->
29+
1730
# Examples
1831

1932
- [**`benchmark/`**](/benchmark/) contains performance and reliability testing, using AWS EFA by default.
@@ -39,3 +52,5 @@ Other relevant documentation in our guide:
3952
## License
4053

4154
The [MIT license](LICENSE).
55+
56+

haiku/haiku_config.py

Lines changed: 153 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,153 @@
1+
"""Configuration for Qwen3-4B GRPO training on Haiku dataset."""
2+
3+
from dataclasses import dataclass, field
4+
from pathlib import Path
5+
6+
7+
@dataclass
class RLConfig:
    """Training config that passes raw CLI args directly to slime.

    Holds Modal deployment settings plus a raw argument string that is
    cleaned (comments stripped, whitespace collapsed) and forwarded
    verbatim to the slime training entry point.
    """

    # Directory name of the model under the models volume.
    model_name: str
    # Hugging Face model id, e.g. "Qwen/Qwen3-4B".
    model_id: str

    # Modal settings
    app_name: str = "slime-grpo"
    n_nodes: int = 4
    gpu: str = "H100:8"

    # Training mode: True -> synchronous trainer, False -> async trainer.
    sync: bool = True

    # Wandb
    wandb_project: str = "slime-grpo"
    wandb_run_name_prefix: str = ""

    # Raw CLI args passed directly to slime. May contain "#" comments and
    # {data_path} / {models_path} placeholders, substituted at build time.
    slime_args: str = ""

    # Extra args that get appended (for easy overrides).
    extra_args: list[str] = field(default_factory=list)

    @property
    def train_script(self) -> str:
        """Path of the slime entry point for the selected training mode."""
        return "slime/train.py" if self.sync else "slime/train_async.py"

    def _clean_args(self, args: str) -> str:
        """Remove comments and normalize whitespace.

        Everything from the first "#" on each line is treated as a comment;
        blank lines are dropped and the rest is collapsed onto one line.
        """
        lines = []
        for line in args.split("\n"):
            if "#" in line:
                line = line[: line.index("#")]
            line = line.strip()
            if line:
                lines.append(line)
        return " ".join(lines)

    def generate_train_args(self, models_path: Path, data_path: Path, is_infinite_run: bool) -> str:
        """Build the full CLI argument string for slime.

        Args:
            models_path: Volume path containing model checkpoints.
            data_path: Volume path containing datasets.
            is_infinite_run: Currently unused; reserved for long-running jobs.

        Returns:
            A single space-separated argument string with placeholder
            substitution applied.
        """
        base_args = f"--hf-checkpoint {models_path}/{self.model_name} --ref-load {models_path}/{self.model_name}"

        cleaned_slime_args = self._clean_args(self.slime_args)
        cleaned_slime_args = cleaned_slime_args.replace("{data_path}", str(data_path))
        cleaned_slime_args = cleaned_slime_args.replace("{models_path}", str(models_path))

        extra = " ".join(self.extra_args) if self.extra_args else ""

        # Join only the non-empty segments so we never emit internal double
        # spaces (e.g. empty slime_args with non-empty extra_args) or a
        # trailing space.
        return " ".join(part for part in (base_args, cleaned_slime_args, extra) if part)
58+
59+
# ── Model architecture constants ──

# Megatron architecture flags — presumably mirroring the Qwen3-4B HF config
# (36 layers, GQA with 8 KV groups, RMSNorm + SwiGLU, RoPE base 1e6);
# verify against the upstream model card if the model is swapped.
QWEN3_4B_MODEL_ARGS = """
--num-layers 36 --hidden-size 2560 --ffn-hidden-size 9728
--num-attention-heads 32 --group-query-attention --num-query-groups 8
--kv-channels 128 --vocab-size 151936
--normalization RMSNorm --norm-epsilon 1e-6 --swiglu
--disable-bias-linear --qk-layernorm
--use-rotary-position-embeddings --rotary-base 1000000
"""

# Parallelism / memory settings: TP=2 with sequence parallel, full uniform
# activation recompute, and dynamic batching capped at 9216 tokens per GPU.
DEFAULT_TRAINING_ARGS = """
--tensor-model-parallel-size 2 --sequence-parallel
--recompute-granularity full --recompute-method uniform --recompute-num-layers 1
--use-dynamic-batch-size --max-tokens-per-gpu 9216
--megatron-to-hf-mode bridge
--attention-dropout 0.0 --hidden-dropout 0.0
--accumulate-allreduce-grads-in-fp32 --attention-softmax-in-fp32
"""

# Adam with a constant 1e-6 learning rate (no decay schedule).
DEFAULT_OPTIMIZER_ARGS = """
--optimizer adam
--lr 1e-6 --lr-decay-style constant
--weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.98
"""

# GRPO settings: KL and entropy terms present but zero-weighted (coef 0.00),
# asymmetric clipping (0.2 low / 0.28 high).
DEFAULT_GRPO_ARGS = """
--advantage-estimator grpo
--use-kl-loss --kl-loss-coef 0.00 --kl-loss-type low_var_kl
--entropy-coef 0.00
--eps-clip 0.2 --eps-clip-high 0.28
"""
92+
93+
# ── Config factory ──

def get_config(run_name: str = "qwen3-4b-haiku") -> RLConfig:
    """Build the single-node Qwen3-4B GRPO config for the haiku task.

    Args:
        run_name: Used as the wandb run-name prefix.

    Returns:
        A fully-populated RLConfig. The "#" comments inside ``slime_args``
        are stripped by ``RLConfig._clean_args``; the ``{data_path}`` /
        ``{models_path}`` placeholders (escaped here as ``{{...}}``) are
        substituted later by ``generate_train_args``.
    """
    return RLConfig(
        model_name="Qwen3-4B",
        model_id="Qwen/Qwen3-4B",
        n_nodes=1,
        gpu="H200:8",
        app_name="slime-qwen3-4b-haiku",
        sync=True,
        wandb_project="slime-grpo-haiku",
        wandb_run_name_prefix=run_name,
        slime_args=f"""
# Model architecture
{QWEN3_4B_MODEL_ARGS}

# Training parallelism and optimization
{DEFAULT_TRAINING_ARGS}

# Optimizer
{DEFAULT_OPTIMIZER_ARGS}

# GRPO algorithm
{DEFAULT_GRPO_ARGS}

# Data
--input-key messages --label-key label
--apply-chat-template --rollout-shuffle
--prompt-data {{data_path}}/haiku/train.parquet

# Custom reward model
--rm-type remote_rm
--rm-url https://modal-labs-joy-dev--llm-judge-reward-model-llmjudgeflash.us-east.modal.direct/score

--num-rollout 50
--rollout-batch-size 128
--n-samples-per-prompt 8
--global-batch-size 64

# SGLang
--rollout-num-gpus-per-engine 2
--sglang-mem-fraction-static 0.7

--rollout-max-response-len 300

--rollout-temperature 1
--rollout-skip-special-tokens

# Orchestration
--actor-num-nodes 1
--actor-num-gpus-per-node 8
--colocate

# Eval
--eval-prompt-data haiku {{data_path}}/haiku/test.parquet
--eval-interval 20
--n-samples-per-eval-prompt 8
--eval-max-response-len 300
--eval-top-p 1
""",
    )

haiku/llm_judges/__init__.py

Whitespace-only changes.

haiku/llm_judges/base.py

Lines changed: 155 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,155 @@
1+
"""
2+
Haiku LLM judge — scores structure (syllable counting) and style (LLM evaluation).
3+
4+
Structure scoring uses CMUdict for syllable counting.
5+
Style scoring uses a local vLLM instance to evaluate relevance, poetic quality, etc.
6+
"""
7+
8+
import re
9+
10+
import aiohttp
11+
12+
from llm_judges.deploy import VLLM_PORT
13+
from llm_judges.nlp import score_haiku_structure
14+
15+
16+
# Modal-specific vocabulary the judge rewards when a haiku uses these words
# coherently (see the "Uses Modal vocabulary" criterion in the judge prompt).
MODAL_VOCABS = [
    "modal",
    "volume",
    "function",
    "sandbox",
    "flash",
    "inference",
    "train",
]
25+
26+
27+
def _build_judge_prompt(prompt: str, response: str, label: str = "") -> tuple[str, int]:
28+
"""Build the LLM judge prompt. Returns (prompt_text, max_score)."""
29+
modal_vocab_str = ", ".join(MODAL_VOCABS)
30+
31+
max_score = 15 # relevance(5) + poetic(5) + modal vocab(5)
32+
33+
text = f"""You are evaluating a haiku poem.
34+
35+
Score the response based on the following criteria:
36+
37+
Relevance (5 points total)
38+
- 5 points: if the central theme and punchline of the haiku is "{prompt}"
39+
- 3 points: if the response directly discusses "{prompt}" but it is not the central theme
40+
- 2 points: if the response is relevant to the topic "{prompt}" but very plain
41+
- 0 points: if the response is not relevant to the topic "{prompt}"
42+
43+
Poetic quality (5 points total)
44+
- 5 points: if the response makes sense, can be considered a poetic haiku, with a clear theme and punchline
45+
- 3 point: if the response makes sense, but is not very poetic
46+
- 1 point: if the response doesn't make sense
47+
- 0 points: if the response is not poetic and incoherent
48+
"""
49+
50+
if label:
51+
max_score = 20
52+
text += f"""
53+
Better than the existing poem (5 points total):
54+
Given the existing poem, score the response by comparing its quality to the existing poem:
55+
{label}
56+
- 5 points: if the response is better than the poem "{label}".
57+
- 3 points: if the response is equal in quality to the poem "{label}".
58+
- 0 points: if the response is worse than the poem "{label}".
59+
"""
60+
61+
prereq_score = max_score - 5
62+
text += f"""
63+
Uses Modal vocabulary (5 points total): (modal vocab: {modal_vocab_str})
64+
- 5 points: if the response uses the above words in a way that is coherent and relevant to the topic "{prompt}"
65+
- 3 points: if the response uses the above words in a way that is not relevant to the topic "{prompt}"
66+
- 0 points: if the response does not use the above words
67+
DO NOT GIVE ANY POINTS TO USE MODAL VOCABULARY IF THE POEM ITSELF DOES NOT ALREADY ACHIEVE A SCORE OF {prereq_score} OR HIGHER
68+
69+
Add up the scores from the above criteria to get the total score.
70+
71+
--
72+
**Topic:** {prompt}
73+
74+
**Response to evaluate:**
75+
{response}
76+
---
77+
78+
Output ONLY a single number (0-{max_score}), nothing else."""
79+
80+
return text, max_score
81+
82+
83+
class HaikuJudge:
    """Scores haikus on structure (syllable counting) and style (LLM evaluation).

    Args:
        gate_style_on_structure: If True, only evaluate style when structure
            score is perfect (1.0). If False, always evaluate style.
    """

    def __init__(self, gate_style_on_structure: bool = True):
        # Whether style scoring is skipped for structurally imperfect haikus.
        self.gate_style_on_structure = gate_style_on_structure

    async def score_style(
        self,
        model_name: str,
        session: aiohttp.ClientSession,
        prompt: str,
        response: str,
        label: str = "",
        vllm_base_url: str = f"http://localhost:{VLLM_PORT}",
    ) -> float:
        """Score haiku style via LLM judge, normalized to [0, 1].

        Posts one chat-completion request to the local vLLM server and parses
        the first number out of the judge's reply. Any HTTP error, transport
        failure, or unparseable reply is logged and scored as 0 — best-effort
        by design, so a judge outage does not crash the rollout.
        """
        judge_prompt, max_score = _build_judge_prompt(prompt, response, label)

        try:
            async with session.post(
                f"{vllm_base_url}/v1/chat/completions",
                headers={"content-type": "application/json"},
                json={
                    "model": model_name,
                    "messages": [{"role": "user", "content": judge_prompt}],
                    "max_tokens": 100,
                },
            ) as resp:
                if resp.status != 200:
                    error_text = await resp.text()
                    print(f"vLLM error: {resp.status} - {error_text}")
                    return 0

                data = await resp.json()
                score_text = data["choices"][0]["message"]["content"].strip()
                print(f"Scored {response} with score {score_text}")

                # The prompt asks for a bare number, but models sometimes add
                # extra text — take the first integer/float found in the reply.
                match = re.search(r"(\d+(?:\.\d+)?)", score_text)
                if match:
                    score = float(match.group(1))
                    # Clamp to [0, max_score], then normalize to [0, 1].
                    return min(max(score, 0), max_score) / max_score
                return 0
        except Exception as e:
            # Broad catch is deliberate: reward scoring must not raise.
            print(f"Error scoring response: {e}")
            return 0

    async def score_single(
        self,
        model_name: str,
        session: aiohttp.ClientSession,
        prompt: str,
        response: str,
        cmudict: dict,
        label: str = "",
    ) -> float:
        """Score a single haiku. Returns a score in [0, 2].

        Total = structure score (CMUdict syllable check — presumably in
        [0, 1], as the gate below treats 1.0 as perfect; confirm against
        score_haiku_structure) + style score (LLM judge, [0, 1]).
        """
        structure_score = score_haiku_structure(response, cmudict)

        style_score = 0.0
        # Optionally gate the (expensive) LLM call on a perfect structure score.
        if not self.gate_style_on_structure or structure_score >= 1.0:
            style_score = await self.score_style(
                model_name, session, prompt, response, label
            )
            style_score = max(style_score, 0.0)

        total = structure_score + style_score
        print(f"[HaikuJudge] structure={structure_score}, style={style_score}, gated={self.gate_style_on_structure}")
        return total

0 commit comments

Comments
 (0)