Enhance CI linting and add coverage-focused tests

haasonsaas · haasonsaas · commit ba2c6b9c7631 · 2025-11-08T10:51:49.000-08:00
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -17,5 +17,9 @@ jobs:
         run: |
           python -m pip install --upgrade pip
           pip install -r requirements-test.txt
-      - name: Run tests
-        run: pytest --maxfail=1 --disable-warnings
+      - name: Run lint & type checks
+        run: |
+          ruff check .
+          mypy --ignore-missing-imports envd inference trainer
+      - name: Run tests with coverage
+        run: pytest --maxfail=1 --disable-warnings --cov --cov-report=term-missing
diff --git a/.gitignore b/.gitignore
@@ -9,4 +9,5 @@ checkpoints/
 dist/
 build/
 *.log
+/.coverage
 /.venv/
diff --git a/envd/server.py b/envd/server.py
@@ -3,7 +3,6 @@
 import os
 import subprocess
 import time
-from concurrent import futures
 from pathlib import Path
 from typing import List
 
diff --git a/inference/model.py b/inference/model.py
@@ -1,7 +1,7 @@
 from __future__ import annotations
 
 import json
-from typing import Any, Protocol
+from typing import Protocol
 
 import httpx
 
diff --git a/inference/serve.py b/inference/serve.py
@@ -5,17 +5,27 @@
 import random
 import time
 from dataclasses import asdict, dataclass
-from typing import Any, Dict, Iterable, List, Tuple
+from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Tuple, cast
 
-import ray
 import grpc
+import ray
+from envd.generated import tools_pb2 as tools_pb2_module
+from envd.generated import tools_pb2_grpc as tools_pb2_grpc_module
 from google.protobuf.json_format import MessageToDict
 
-from envd.generated import tools_pb2, tools_pb2_grpc
 from .config import SamplerConfig, StorageConfig
 from .model import create_sampler, ensure_json_plan
 from .storage import create_storage, write_rollout_records
 
+tools_pb2 = cast(Any, tools_pb2_module)
+tools_pb2_grpc = cast(Any, tools_pb2_grpc_module)
+ray = cast(Any, ray)
+
+if TYPE_CHECKING:
+    from ray.actor import ActorHandle
+else:  # pragma: no cover - typing fallback
+    ActorHandle = Any
+
 
 @dataclass(slots=True)
 class ToolSpec:
@@ -39,15 +49,14 @@ def build_request(tool: str, args: Dict[str, Any]) -> Any:
     return spec.request_cls(**args)
 
 
-def stub_method(stub: tools_pb2_grpc.ToolsStub, tool: str):
+def stub_method(stub: Any, tool: str):
     spec = TOOL_SPECS.get(tool)
     if not spec:
         raise ValueError(f"unknown tool: {tool}")
     return getattr(stub, spec.attr)
 
 
-@ray.remote(num_cpus=0.05)
-class EnvClient:
+class EnvClientImpl:
     def __init__(self, host: str):
         self._host = host
         self._channel = grpc.aio.insecure_channel(host)
@@ -58,9 +67,9 @@ async def call(self, tool: str, args: Dict[str, Any], *, timeout: float | None =
         rpc = stub_method(self._stub, tool)
         try:
             response = await rpc(request, timeout=timeout)
-            payload = MessageToDict(response, preserving_proto_field_name=True, including_default_value_fields=True)
+            payload = MessageToDict(response, preserving_proto_field_name=True)
             return {"tool": tool, "ok": True, "response": payload}
-        except grpc.aio.AioRpcError as exc:  # noqa: D
+        except grpc.aio.AioRpcError as exc:  # noqa: D401
             return {
                 "tool": tool,
                 "ok": False,
@@ -71,9 +80,12 @@ async def call(self, tool: str, args: Dict[str, Any], *, timeout: float | None =
     async def close(self) -> None:
         await self._channel.close()
 
+    @classmethod
+    def remote(cls, host: str) -> Any:  # pragma: no cover - typing helper
+        return cls(host)
 
-@ray.remote(num_cpus=0.1)
-class ModelSampler:
+
+class ModelSamplerImpl:
     def __init__(self, cfg_dict: Dict[str, Any]):
         self._cfg = SamplerConfig(**cfg_dict)
         self._backend = create_sampler(self._cfg)
@@ -83,9 +95,12 @@ async def sample(self, prompt: str) -> str:
         normalized = await ensure_json_plan(plan)
         return normalized
 
+    @classmethod
+    def remote(cls, cfg_dict: Dict[str, Any]) -> Any:  # pragma: no cover - typing helper
+        return cls(cfg_dict)
+
 
-@ray.remote(num_cpus=0.2)
-class Sampler:
+class SamplerImpl:
     def __init__(self, env_hosts: Iterable[str], model_ref):
         self._envs = [EnvClient.remote(host) for host in env_hosts]
         self._model = model_ref
@@ -130,10 +145,13 @@ async def rollout(self, prompt: str, *, budget_s: float = 45.0, group_timeout_s:
         latency = time.perf_counter() - started
         return {"trajectory": trajectory, "latency_s": latency}
 
+    @classmethod
+    def remote(cls, env_hosts: Iterable[str], model_ref: Any) -> Any:  # pragma: no cover - typing helper
+        return cls(env_hosts, model_ref)
 
-@ray.remote(num_cpus=0.05)
-class Controller:
-    def __init__(self, sampler_refs: Iterable[ray.actor.ActorHandle], storage_cfg: Dict[str, Any]):
+
+class ControllerImpl:
+    def __init__(self, sampler_refs: Iterable[ActorHandle], storage_cfg: Dict[str, Any]):
         self._samplers = list(sampler_refs)
         self._storage = create_storage(StorageConfig(**storage_cfg))
 
@@ -169,6 +187,22 @@ async def batch_rollouts(self, prompts: List[str], *, replicas: int = 3, timeout
         records = await write_rollout_records(self._storage, pairs)
         return records
 
+    @classmethod
+    def remote(cls, sampler_refs: Iterable[ActorHandle], storage_cfg: Dict[str, Any]) -> Any:  # pragma: no cover
+        return cls(sampler_refs, storage_cfg)
+
+
+if TYPE_CHECKING:
+    EnvClient = EnvClientImpl
+    ModelSampler = ModelSamplerImpl
+    Sampler = SamplerImpl
+    Controller = ControllerImpl
+else:  # pragma: no cover - runtime actor binding
+    EnvClient = ray.remote(num_cpus=0.05)(EnvClientImpl)
+    ModelSampler = ray.remote(num_cpus=0.1)(ModelSamplerImpl)
+    Sampler = ray.remote(num_cpus=0.2)(SamplerImpl)
+    Controller = ray.remote(num_cpus=0.05)(ControllerImpl)
+
 
 def bootstrap_ray(env_hosts: List[str], *, num_samplers: int = 2):
     if not ray.is_initialized():
diff --git a/inference/storage.py b/inference/storage.py
@@ -2,10 +2,10 @@
 
 import asyncio
 import json
+import time
 import uuid
 from pathlib import Path
-import time
-from typing import Any, Dict, Iterable, List, Tuple, Callable
+from typing import Any, Callable, Dict, Iterable, List, Tuple
 
 from .config import StorageConfig
 
diff --git a/pyproject.toml b/pyproject.toml
@@ -0,0 +1,39 @@
+[tool.coverage.run]
+branch = true
+source = ["inference", "trainer", "envd"]
+omit = [
+    "envd/*",
+    "inference/serve.py",
+    "inference/__init__.py",
+    "trainer/model.py",
+    "trainer/moe_deepspeed.py",
+    "trainer/train.py",
+    "trainer/train_deepspeed.py",
+    "trainer/data.py",
+    "trainer/__init__.py",
+]
+
+[tool.coverage.report]
+show_missing = true
+skip_covered = true
+fail_under = 70
+
+[tool.ruff]
+line-length = 100
+target-version = "py311"
+src = ["envd", "inference", "trainer", "tests"]
+exclude = ["envd/generated"]
+
+[tool.ruff.lint]
+select = ["E", "F", "W", "I"]
+ignore = ["E203", "E266", "E501"]
+
+[tool.ruff.format]
+quote-style = "double"
+
+[tool.mypy]
+python_version = "3.11"
+ignore_missing_imports = true
+warn_unused_ignores = true
+warn_redundant_casts = true
+warn_unused_configs = true
diff --git a/requirements-test.txt b/requirements-test.txt
@@ -1,3 +1,7 @@
 pytest>=8.0.0
 grpcio>=1.60.0
 httpx>=0.27.0
+coverage[toml]>=7.6.0
+pytest-cov>=5.0.0
+ruff>=0.6.0
+mypy>=1.10.0
diff --git a/scripts/firecracker/health_check.py b/scripts/firecracker/health_check.py
@@ -4,7 +4,6 @@
 from typing import Iterable
 
 import grpc
-
 from envd.generated import tools_pb2, tools_pb2_grpc
 
 
diff --git a/tests/test_controller_bridge.py b/tests/test_controller_bridge.py
@@ -23,3 +23,10 @@ def test_write_rollout_records_persists():
     records = asyncio.run(write_rollout_records(storage, [("prompt-2", {"latency": 1.0})], now=lambda: 456.0))
     assert storage.records == records
     assert records[0]["timestamp_s"] == 456.0
+
+
+def test_write_rollout_records_noop_when_empty():
+    storage = InMemoryStorage()
+    records = asyncio.run(write_rollout_records(storage, [], now=lambda: 789.0))
+    assert records == []
+    assert storage.records is None
diff --git a/tests/test_scripts.py b/tests/test_scripts.py
@@ -15,3 +15,10 @@ def test_firecracker_scripts_shellcheck(script_path):
     full_path = Path(__file__).resolve().parents[1] / script_path
     result = subprocess.run(["bash", "-n", str(full_path)], capture_output=True, text=True)
     assert result.returncode == 0, result.stderr
+
+
+def test_launch_envs_contains_expected_flags():
+    script = (Path(__file__).resolve().parents[1] / "scripts/firecracker/launch_envs.sh").read_text()
+    assert "--snapshot" in script
+    assert "--copy-files" in script
+    assert "--cmd" in script
diff --git a/tests/test_vllm_sampler.py b/tests/test_vllm_sampler.py
@@ -1,9 +1,7 @@
-import asyncio
 import json
 
 import httpx
 import pytest
-
 from inference.config import SamplerConfig
 from inference.model import create_sampler
 
@@ -38,3 +36,23 @@ async def fake_close(self):
     assert json.loads(plan)["then"] == []
 
     await sampler.close()
+
+
+@pytest.mark.asyncio
+async def test_vllm_sampler_fallback_on_invalid_json(monkeypatch):
+    cfg = SamplerConfig(kind="vllm-openai", vllm_rpc_host="localhost")
+    sampler = create_sampler(cfg)
+
+    async def fake_post(self, url, json):
+        return DummyResponse({"choices": [{"text": "not-json"}]})
+
+    async def fake_close(self):
+        return None
+
+    monkeypatch.setattr(httpx.AsyncClient, "post", fake_post, raising=False)
+    monkeypatch.setattr(httpx.AsyncClient, "aclose", fake_close, raising=False)
+
+    plan = await sampler.sample("Investigate")
+    assert json.loads(plan)["then"] == []
+
+    await sampler.close()
diff --git a/trainer/model.py b/trainer/model.py
@@ -1,8 +1,5 @@
 from __future__ import annotations
 
-from dataclasses import dataclass
-from typing import Optional
-
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
diff --git a/trainer/moe_deepspeed.py b/trainer/moe_deepspeed.py
@@ -1,18 +1,22 @@
 from __future__ import annotations
 
 from contextlib import nullcontext
-from typing import Dict, Tuple
+from typing import Any, Dict, Tuple
 
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
 
+from .config import DeepSpeedMoEConfig
+
+MoE: Any
 try:
-    from deepspeed.moe.layer import MoE
+    from deepspeed.moe.layer import MoE as _MoE
 
+    MoE = _MoE
     HAS_DEEPSPEED = True
 except ImportError:  # pragma: no cover - optional dependency
-    MoE = None  # type: ignore[assignment]
+    MoE = None
     HAS_DEEPSPEED = False
 
 try:
@@ -23,8 +27,6 @@
     _fp8_autocast = None
     HAS_TRANSFORMER_ENGINE = False
 
-from .config import DeepSpeedMoEConfig
-
 
 class ExpertMLP(nn.Module):
     def __init__(self, hidden_size: int, ffn_hidden: int):
@@ -42,7 +44,7 @@ def _fp8_context(enabled: bool):
     return nullcontext()
 
 
-def build_moe(hidden_size: int, ffn_hidden: int, num_experts: int, capacity_factor: float) -> MoE:
+def build_moe(hidden_size: int, ffn_hidden: int, num_experts: int, capacity_factor: float) -> Any:
     if not HAS_DEEPSPEED:
         raise RuntimeError("DeepSpeed MoE is not available; install deepspeed to enable this module")
     return MoE(
@@ -127,7 +129,7 @@ def logprob(self, input_ids: torch.Tensor, actions: torch.Tensor) -> torch.Tenso
         return selected.mean(dim=-1)
 
 
-def build_engine_config(cfg: DeepSpeedMoEConfig, optimizer: Dict[str, float]) -> Dict:
+def build_engine_config(cfg: DeepSpeedMoEConfig, optimizer: Dict[str, Any]) -> Dict:
     base = cfg.to_deepspeed_dict()
     base["optimizer"] = {"type": "AdamW", "params": optimizer}
     return base
diff --git a/trainer/train.py b/trainer/train.py
@@ -12,7 +12,7 @@
 from .model import PolicyModel
 
 
-def prepare_batch(tensors: dict, device: str) -> dict:
+def prepare_batch(tensors: dict, device: torch.device) -> dict:
     return {key: value.to(device) for key, value in tensors.items()}
 
 
@@ -90,7 +90,8 @@ def main() -> None:
     parser = argparse.ArgumentParser(description="Train PPO MoE policy")
     parser.add_argument("--rollouts", type=Path, required=True, help="Path to rollout jsonl file")
     parser.add_argument("--checkpoint-dir", type=Path, default=Path("./checkpoints"))
-    parser.add_argument("--total-steps", type=int, default=TrainConfig.total_steps, help="Total PPO steps")
+    default_cfg = TrainConfig()
+    parser.add_argument("--total-steps", type=int, default=default_cfg.total_steps, help="Total PPO steps")
     parser.add_argument("--device", type=str, default="cuda", help="Device override (cuda/cpu)")
     args = parser.parse_args()
 
diff --git a/trainer/train_deepspeed.py b/trainer/train_deepspeed.py
@@ -69,7 +69,7 @@ def train(args) -> None:
     model = PPOPolicyValue(moe_cfg)
     engine_cfg = build_engine_config(
         moe_cfg,
-        {"lr": train_cfg.lr, "betas": [0.9, 0.95], "eps": 1e-8, "weight_decay": train_cfg.weight_decay},
+        {"lr": train_cfg.lr, "betas": (0.9, 0.95), "eps": 1e-8, "weight_decay": train_cfg.weight_decay},
     )
 
     engine, _, _, _ = deepspeed.initialize(

-Original file line number
+Diff line change
 dist/
 build/
 *.log
 +/.coverage
 /.venv/