From 63045fdca9c7c771b6dd0d23b683026c7ca4782c Mon Sep 17 00:00:00 2001 From: Sanyam Bhutani Date: Thu, 2 Oct 2025 18:50:19 -0700 Subject: [PATCH 01/28] Create ReadMe.MD --- docs/Tutorials/ReadMe.MD | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 docs/Tutorials/ReadMe.MD diff --git a/docs/Tutorials/ReadMe.MD b/docs/Tutorials/ReadMe.MD new file mode 100644 index 000000000..6294c8ec8 --- /dev/null +++ b/docs/Tutorials/ReadMe.MD @@ -0,0 +1,11 @@ +Welcome to the Tutorials section! This section is inspired by the A-Z PyTorch tutorial, shoutout to our friends that remember! + +This section currently is structured in 3 detailed parts: + +1. []() +2. []() +3. []() + +Each part builds upon the next and the entire section can be consumed in roughly an hour-Grab a Chai and Enjoy! + +If you're eager, please checkout our SFT Tutorial too (Coming soon!) as well as [App Examples](../../apps/). \ No newline at end of file From 7dbf75f03739fdf3b4433fd68f73a789972091b0 Mon Sep 17 00:00:00 2001 From: Sanyam Bhutani Date: Thu, 2 Oct 2025 19:02:51 -0700 Subject: [PATCH 02/28] add part 1 --- docs/Tutorials/1_RL_and_Forge_Fundamentals.MD | 385 ++++++++++++++++++ docs/Tutorials/2_.MD | 0 docs/Tutorials/3_.MD | 0 docs/Tutorials/ReadMe.MD | 12 +- 4 files changed, 395 insertions(+), 2 deletions(-) create mode 100644 docs/Tutorials/1_RL_and_Forge_Fundamentals.MD create mode 100644 docs/Tutorials/2_.MD create mode 100644 docs/Tutorials/3_.MD diff --git a/docs/Tutorials/1_RL_and_Forge_Fundamentals.MD b/docs/Tutorials/1_RL_and_Forge_Fundamentals.MD new file mode 100644 index 000000000..96710b57a --- /dev/null +++ b/docs/Tutorials/1_RL_and_Forge_Fundamentals.MD @@ -0,0 +1,385 @@ +# Part 1: RL Fundamentals - Using Forge Terminology + +## Core RL Components in Forge + +Let's start with a simple math tutoring example to understand RL concepts with the exact names Forge uses: + +### The Toy Example: Teaching Math + +```mermaid +graph TD + subgraph Example["Math Tutoring RL Example"] + Dataset["Dataset
math problems
'What is 2+2?'"] + Policy["Policy
student AI
generates: 'The answer is 4'"] + Reward["Reward Model
Evaluation Exam
scores: 0.95 (excellent)"] + Reference["Reference Model
original student
baseline comparison"] + ReplayBuffer["Replay Buffer
notebook
stores experiences"] + Trainer["Trainer
tutor
improves student"] + end + + Dataset --> Policy + Policy --> Reward + Policy --> Reference + Reward --> ReplayBuffer + Reference --> ReplayBuffer + ReplayBuffer --> Trainer + Trainer --> Policy + + style Policy fill:#99ff99 + style Reward fill:#ffcc99 + style Trainer fill:#ff99cc +``` + +### RL Components Defined (Forge Names) + +1. **Dataset**: Provides questions/prompts (like "What is 2+2?") +2. **Policy**: The AI being trained (generates answers like "The answer is 4") +3. **Reward Model**: Evaluates answer quality (gives scores like 0.95) +4. **Reference Model**: Original policy copy (prevents drift from baseline) +5. **Replay Buffer**: Stores experiences (question + answer + score) +6. **Trainer**: Updates the policy weights based on experiences + +### The RL Learning Flow + +```python +# CONCEPTUAL EXAMPLE - see apps/grpo/main.py for GRPO Code + +def conceptual_rl_step(): + # 1. Get a math problem + question = dataset.sample() # "What is 2+2?" + + # 2. Student generates answer + answer = policy.generate(question) # "The answer is 4" + + # 3. Teacher grades it + score = reward_model.evaluate(question, answer) # 0.95 + + # 4. Compare to original student + baseline = reference_model.compute_logprobs(question, answer) + + # 5. Store the experience + experience = Episode(question, answer, score, baseline) + replay_buffer.add(experience) + + # 6. When enough experiences collected, improve student + batch = replay_buffer.sample(curr_policy_version=0) + if batch is not None: + trainer.train_step(batch) # Student gets better! + +# 🔄 See complete working example below with actual Forge service calls +``` + +## From Concepts to Forge Services + +Here's the key insight: **Each RL component becomes a Forge service**. The toy example above maps directly to Forge: + +```mermaid +graph LR + subgraph Concepts["RL Concepts"] + C1["Dataset"] + C2["Policy"] + C3["Reward Model"] + C4["Reference Model"] + C5["Replay Buffer"] + C6["Trainer"] + end + + subgraph Services["Forge Services (Real Classes)"] + S1["DatasetActor"] + S2["Policy"] + S3["RewardActor"] + S4["ReferenceModel"] + S5["ReplayBuffer"] + S6["RLTrainer"] + end + + C1 --> S1 + C2 --> S2 + C3 --> S3 + C4 --> S4 + C5 --> S5 + C6 --> S6 + + style C2 fill:#99ff99 + style S2 fill:#99ff99 + style C3 fill:#ffcc99 + style S3 fill:#ffcc99 +``` + +### RL Step with Forge Services + +```python +# Conceptual Example + +async def conceptual_forge_rl_step(services, step): + # 1. Get a math problem - CONCEPTUAL API + sample = await services['dataloader'].get_sample() + question, target = sample["question"], sample["answer"] + + # 2. Student generates answer - CONCEPTUAL API + # Actual method names vary by implementation + responses = await services['policy'].generate(prompt=question) + answer = responses[0].text + + # 3. Teacher grades it - CONCEPTUAL API + # Actual reward evaluation varies by implementation + score = await services['reward_actor'].evaluate( + prompt=question, response=answer, target=target + ) + + # 4. Compare to baseline - CONCEPTUAL API + ref_logprobs = await services['ref_model'].compute_baseline(responses[0].token_ids) + + # 5. Store experience - CONCEPTUAL Episode structure + # Real Episode structure in src/forge/data_models/episode.py + episode = create_episode(responses[0], score, ref_logprobs, step) + await services['replay_buffer'].store(episode) + + # 6. 
Improve student - CONCEPTUAL API + batch = await services['replay_buffer'].get_batch(policy_version=step) + if batch is not None: + loss = await services['trainer'].update_policy(batch) + return loss +``` + +**Key difference**: Same RL logic, but each component is now a distributed, fault-tolerant, auto-scaling service. + + +## Why This Matters: Traditional ML Infrastructure Fails + +### The Infrastructure Challenge + +Our simple RL loop above has complex requirements: + +#### Problem 1: Different Resource Needs + +```mermaid +graph TD + subgraph Components["Each Component Needs Different Resources"] + Policy["Policy (Student AI)
Generates: 'The answer is 4'
Needs: Large GPU memory
Scaling: Multiple replicas for speed"] + + Reward["Reward Model (Teacher)
Scores answers: 0.95
Needs: Moderate compute
Scaling: CPU or small GPU"] + + Trainer["Trainer (Tutor)
Improves student weights
Needs: Massive GPU compute
Scaling: Distributed training"] + + Dataset["Dataset (Question Bank)
Provides: 'What is 2+2?'
Needs: CPU intensive I/O
Scaling: High memory bandwidth"] + end + + style Policy fill:#99ff99 + style Reward fill:#ffcc99 + style Trainer fill:#ff99cc + style Dataset fill:#ccccff +``` + +### Problem 2: Complex Interdependencies + +```mermaid +graph LR + A["Policy: Student AI
'What is 2+2?' → 'The answer is 4'"] + B["Reward: Teacher
Scores answer: 0.95"] + C["Reference: Original Student
Provides baseline comparison"] + D["Replay Buffer: Notebook
Stores: question + answer + score"] + E["Trainer: Tutor
Improves student using experiences"] + + A --> B + A --> C + B --> D + C --> D + D --> E + E --> A + + style A fill:#99ff99 + style B fill:#ffcc99 + style C fill:#99ccff + style D fill:#ccff99 + style E fill:#ff99cc +``` + +Each step has different: +- **Latency requirements**: Policy inference needs low latency, training can batch +- **Scaling patterns**: Reward evaluation scales with response count, training with model size +- **Failure modes**: Policy failure stops generation, reward failure affects learning quality +- **Resource utilization**: GPUs for inference/training, CPUs for data processing + +### Problem 3: The Coordination Challenge + +Unlike supervised learning where you process independent batches, RL requires coordination: + +```python +# This won't work - creates bottlenecks and resource waste +def naive_rl_step(): + # Policy waits idle while reward model works + response = policy_model.generate(prompt) # GPU busy + reward = reward_model.evaluate(prompt, response) # Policy GPU idle + + # Training waits for single episode + loss = compute_loss(response, reward) # Batch size = 1, inefficient + + # Everything stops if any component fails + if policy_fails or reward_fails or trainer_fails: + entire_system_stops() +``` + +## Enter Forge: RL-Native Architecture + +Forge solves these problems by treating each RL component as an **independent, scalable service** + +Let's see how core RL concepts map to Forge services: + +```python +async def real_rl_training_step(services, step): + """Single RL step using verified Forge APIs""" + + # 1. Environment interaction + sample = await services['dataloader'].__next__.call_one() + prompt, target = sample["question"], sample["answer"] + + responses = await services['policy'].generate.route(prompt=prompt) + + # 2. Reward computation + score = await services['reward_actor'].evaluate_response.route( + prompt=prompt, response=responses[0].text, target=target + ) + + # 3. Get reference logprobs + ref_logprobs = await services['ref_model'].forward.route(responses[0].token_ids) + + # 4. Experience storage - Episode creation pattern + # Note: Actual Episode structure requires token tensors, not text + episode = create_episode_from_response(responses[0], score, ref_logprobs, step) + await services['replay_buffer'].add.call_one(episode) + + # 5. Learning - trainer endpoint + batch = await services['replay_buffer'].sample.call_one( + curr_policy_version=step + ) + if batch is not None: + loss = await services['trainer'].train_step.call_one(batch) + + # 6. Policy synchronization - weight update pattern + await services['trainer'].push_weights.call_one(step + 1) + await services['policy'].update_weights.fanout(step + 1) + + return loss +``` + +**Key insight**: Each line of RL pseudocode becomes a service call. The complexity of distribution, scaling, and fault tolerance is hidden behind these simple interfaces. 
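Because each of these calls is an ordinary awaitable, independent steps can also be overlapped with plain `asyncio`. Here is a minimal sketch (not a Forge API - it reuses the service handles and endpoint names from the loop above; everything else is illustrative) that scores a whole group of completions concurrently instead of one at a time:

```python
import asyncio

async def score_group(services, prompt, target, completions):
    """Illustrative helper: evaluate every completion in a group in parallel."""
    # asyncio.gather runs all reward calls concurrently and returns scores in order
    scores = await asyncio.gather(*(
        services['reward_actor'].evaluate_response.route(
            prompt=prompt, response=completion.text, target=target
        )
        for completion in completions
    ))
    return scores
```

Overlapping independent service calls like this is exactly the kind of concurrency the naive loop above could not express.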
+ +## What Makes This Powerful + +### Automatic Resource Management +```python +responses = await policy.generate.route(prompt=question) +answer = responses[0].text # responses is list[Completion] + +# Forge handles behind the scenes: +# - Routing to least loaded replica +# - GPU memory management +# - Batch optimization +# - Failure recovery +# - Auto-scaling based on demand +``` + +### Independent Scaling +```python + +from forge.actors.policy import Policy, PolicyConfig, SamplingOverrides, WorkerConfig +from forge.actors.replay_buffer import ReplayBuffer +from forge.controller.service import shutdown_service +from apps.grpo.main import Trainer, RewardActor, ComputeAdvantages, RefModel, DatasetActor +from forge.data.rewards import MathReward, ThinkingReward +import asyncio + +model = "Qwen/Qwen3-1.7B" +group_size = 1 + +( + dataloader, + policy, + trainer, + replay_buffer, + compute_advantages, + ref_model, + reward_actor, +) = await asyncio.gather( + # Dataset service + spawn_service( + ServiceConfig(procs_per_replica=1, num_replicas=1), + DatasetActor, + path="openai/gsm8k", + config_name="main", + split="train", + streaming=True, + ), + # Policy service with GPU + spawn_service( + ServiceConfig(procs_per_replica=1, with_gpus=True, num_replicas=1), + Policy, + config=PolicyConfig( + worker_params=WorkerConfig(model=model), + sampling_params=SamplingOverrides( + num_samples=group_size, max_tokens=16 + ), + ), + ), + # Trainer service with GPU + spawn_service( + ServiceConfig(procs_per_replica=1, with_gpus=True, num_replicas=1), + Trainer, + learning_rate=1e-5, + beta=0.1, + model_name=model, + ), + # Replay buffer (CPU) + spawn_service( + ServiceConfig(procs_per_replica=1, num_replicas=1), + ReplayBuffer, + batch_size=2, + max_policy_age=1, + ), + # Advantage computation (CPU) + spawn_service( + ServiceConfig(procs_per_replica=1, num_replicas=1), + ComputeAdvantages, + gamma=0.99, + lambda_=0.95, + ), + # Reference model with GPU + spawn_service( + ServiceConfig(procs_per_replica=1, num_replicas=1, with_gpus=True), + RefModel, + model_name=model, + ), + # Reward actor (CPU) + spawn_service( + ServiceConfig(procs_per_replica=1, num_replicas=1), + RewardActor, + reward_functions=[MathReward(), ThinkingReward()], + ) + ) + +# Production scaling - multiply num_replicas: +# Policy: num_replicas=8 for high inference demand +# RewardActor: num_replicas=16 for parallel evaluation +# Trainer: num_replicas=4 for distributed training +``` + +### Fault Tolerance +```python +# If a policy replica fails: +responses = await policy.generate.route(prompt=question) +answer = responses[0].text +# -> Forge automatically routes to healthy replica +# -> Failed replica respawns in background +# -> No impact on training loop + +# If reward service fails: +score = await reward_actor.evaluate_response.route( + prompt=question, response=answer, target=target +) +# -> Retries on different replica automatically +# -> Graceful degradation if all replicas fail +# -> System continues (may need application-level handling) +``` + +This is fundamentally different from monolithic RL implementations where any component failure stops everything. 
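The "application-level handling" mentioned above can be as simple as a retry wrapper around a service call. Here is a minimal sketch using plain `asyncio` - nothing below is a built-in Forge API, and the usage comment assumes the `reward_actor` handle from the example above:

```python
import asyncio

async def call_with_retries(endpoint_call, *, attempts=3, timeout_s=30.0, **kwargs):
    """Illustrative retry/timeout wrapper for any awaitable service endpoint call."""
    last_exc = None
    for attempt in range(attempts):
        try:
            # Bound each attempt with a timeout so a stuck replica can't stall the loop
            return await asyncio.wait_for(endpoint_call(**kwargs), timeout=timeout_s)
        except Exception as exc:  # narrow the exception types in real code
            last_exc = exc
            await asyncio.sleep(2 ** attempt)  # simple exponential backoff
    raise last_exc

# Hypothetical usage with the handles from the example above:
# score = await call_with_retries(
#     reward_actor.evaluate_response.route,
#     prompt=question, response=answer, target=target,
# )
```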
diff --git a/docs/Tutorials/2_.MD b/docs/Tutorials/2_.MD new file mode 100644 index 000000000..e69de29bb diff --git a/docs/Tutorials/3_.MD b/docs/Tutorials/3_.MD new file mode 100644 index 000000000..e69de29bb diff --git a/docs/Tutorials/ReadMe.MD b/docs/Tutorials/ReadMe.MD index 6294c8ec8..01d750d06 100644 --- a/docs/Tutorials/ReadMe.MD +++ b/docs/Tutorials/ReadMe.MD @@ -1,8 +1,16 @@ -Welcome to the Tutorials section! This section is inspired by the A-Z PyTorch tutorial, shoutout to our friends that remember! +## Zero to Forge: From RL Theory to Production-Scale Implementation + +A comprehensive guide for ML Engineers building distributed RL systems for language models. + +Some of the examples mentioned below will be conceptual in nature for understanding. Please refer to API Docs (Coming Soon!) for more details + +Welcome to the Tutorials section! This section is inspired by the A-Z PyTorch tutorial, shoutout to our PyTorch friends that remember! + +### This section currently is structured in 3 detailed parts: -1. []() +1. [RL Fundamentals and Understanding Forge Terminology](./1_RL_and_Forge_Fundamentals.MD): This gives a quick refresher of Reinforcement Learning and teaches you Forge Fundamentals 2. []() 3. []() From 8abcadbae7997252a800db6f57aa8263bb3f7088 Mon Sep 17 00:00:00 2001 From: Sanyam Bhutani Date: Thu, 2 Oct 2025 19:06:35 -0700 Subject: [PATCH 03/28] Update 1_RL_and_Forge_Fundamentals.MD --- docs/Tutorials/1_RL_and_Forge_Fundamentals.MD | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/Tutorials/1_RL_and_Forge_Fundamentals.MD b/docs/Tutorials/1_RL_and_Forge_Fundamentals.MD index 96710b57a..bcffc733c 100644 --- a/docs/Tutorials/1_RL_and_Forge_Fundamentals.MD +++ b/docs/Tutorials/1_RL_and_Forge_Fundamentals.MD @@ -85,6 +85,7 @@ graph LR end subgraph Services["Forge Services (Real Classes)"] + S1["DatasetActor"] S2["Policy"] S3["RewardActor"] From c0c09cb43a588dc9bbd99dc18fc7fa65149d4f11 Mon Sep 17 00:00:00 2001 From: Sanyam Bhutani Date: Thu, 2 Oct 2025 19:08:03 -0700 Subject: [PATCH 04/28] Update 1_RL_and_Forge_Fundamentals.MD --- docs/Tutorials/1_RL_and_Forge_Fundamentals.MD | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/docs/Tutorials/1_RL_and_Forge_Fundamentals.MD b/docs/Tutorials/1_RL_and_Forge_Fundamentals.MD index bcffc733c..223a6e152 100644 --- a/docs/Tutorials/1_RL_and_Forge_Fundamentals.MD +++ b/docs/Tutorials/1_RL_and_Forge_Fundamentals.MD @@ -85,7 +85,6 @@ graph LR end subgraph Services["Forge Services (Real Classes)"] - S1["DatasetActor"] S2["Policy"] S3["RewardActor"] @@ -109,6 +108,8 @@ graph LR ### RL Step with Forge Services +Let's look at the example from above again, but this time we would use the names from Forge: + ```python # Conceptual Example @@ -145,6 +146,8 @@ async def conceptual_forge_rl_step(services, step): **Key difference**: Same RL logic, but each component is now a distributed, fault-tolerant, auto-scaling service. +Did you realise-we are not worrying about any Infra code here! Forge Automagically handles the details behind the scenes and you can focus on writing your RL Algorthms! 
+ ## Why This Matters: Traditional ML Infrastructure Fails From 0a77675f94258aa63ef8038211a7581b12e3a4ea Mon Sep 17 00:00:00 2001 From: Sanyam Bhutani Date: Thu, 2 Oct 2025 19:12:43 -0700 Subject: [PATCH 05/28] part 2 --- docs/Tutorials/1_RL_and_Forge_Fundamentals.MD | 36 +- docs/Tutorials/2_Forge_Internals.MD | 665 ++++++++++++++++++ docs/Tutorials/3_.MD | 0 docs/Tutorials/{2_.MD => 3_Monarch_101.MD} | 0 4 files changed, 685 insertions(+), 16 deletions(-) create mode 100644 docs/Tutorials/2_Forge_Internals.MD delete mode 100644 docs/Tutorials/3_.MD rename docs/Tutorials/{2_.MD => 3_Monarch_101.MD} (100%) diff --git a/docs/Tutorials/1_RL_and_Forge_Fundamentals.MD b/docs/Tutorials/1_RL_and_Forge_Fundamentals.MD index 223a6e152..810ef373f 100644 --- a/docs/Tutorials/1_RL_and_Forge_Fundamentals.MD +++ b/docs/Tutorials/1_RL_and_Forge_Fundamentals.MD @@ -275,15 +275,15 @@ async def real_rl_training_step(services, step): ```python responses = await policy.generate.route(prompt=question) answer = responses[0].text # responses is list[Completion] - -# Forge handles behind the scenes: -# - Routing to least loaded replica -# - GPU memory management -# - Batch optimization -# - Failure recovery -# - Auto-scaling based on demand ``` +Forge handles behind the scenes: +- Routing to least loaded replica +- GPU memory management +- Batch optimization +- Failure recovery +- Auto-scaling based on demand + ### Independent Scaling ```python @@ -361,13 +361,14 @@ group_size = 1 reward_functions=[MathReward(), ThinkingReward()], ) ) - -# Production scaling - multiply num_replicas: -# Policy: num_replicas=8 for high inference demand -# RewardActor: num_replicas=16 for parallel evaluation -# Trainer: num_replicas=4 for distributed training ``` +Production scaling - multiply num_replicas: +- Policy: num_replicas=8 for high inference demand +- RewardActor: num_replicas=16 for parallel evaluation +- Trainer: num_replicas=4 for distributed training + + ### Fault Tolerance ```python # If a policy replica fails: @@ -381,9 +382,12 @@ answer = responses[0].text score = await reward_actor.evaluate_response.route( prompt=question, response=answer, target=target ) -# -> Retries on different replica automatically -# -> Graceful degradation if all replicas fail -# -> System continues (may need application-level handling) ``` -This is fundamentally different from monolithic RL implementations where any component failure stops everything. +- Retries on different replica automatically +- Graceful degradation if all replicas fail +- System continues (may need application-level handling) + +This is fundamentally different from monolithic RL implementations where any component failure stops everything! + +In the next Section, we will go a layer deeper and learn how ForgeServices work. Continue to [Part 2 here](./2_Forge_Internals.MD) \ No newline at end of file diff --git a/docs/Tutorials/2_Forge_Internals.MD b/docs/Tutorials/2_Forge_Internals.MD new file mode 100644 index 000000000..d55eda51a --- /dev/null +++ b/docs/Tutorials/2_Forge_Internals.MD @@ -0,0 +1,665 @@ +# Part 2: Peeling Back the Abstraction - What Are Services? + +We highly recommend reading [Part 1](./1_RL_and_Forge_Fundamentals.MD) before this, it explains RL Concepts and how they land in Forge. + +Now that you see the power of the service abstraction, let's understand what's actually happening under the hood, Grab your chai! 
+ +## Service Anatomy: Beyond the Interface + +When you call `await policy_service.generate(question)`, here's what actually happens: + +```mermaid +graph TD + Call["Your Code:
await policy_service.generate"] + + subgraph ServiceLayer["Service Layer"] + Proxy["Service Proxy
Load balancing
Health checking
Request routing"] + LB["Load Balancer
Replica selection
Circuit breaker
Retry logic"] + end + + subgraph Replicas["Replica Management"] + R1["Replica 1
GPU 0
Healthy"] + R2["Replica 2
GPU 1
Overloaded"] + R3["Replica 3
GPU 2
Failed"] + R4["Replica 4
GPU 3
Healthy"] + end + + subgraph Compute["Actual Computation"] + Actor["Policy Actor
vLLM engine
Model weights
KV cache"] + end + + Call --> Proxy + Proxy --> LB + LB --> R1 + LB -.-> R2 + LB -.-> R3 + LB --> R4 + R1 --> Actor + R4 --> Actor + + style Call fill:#99ff99 + style LB fill:#ffcc99 + style R3 fill:#ff9999 + style Actor fill:#cc99ff +``` + +## Service Components Deep Dive + +### 1. Real Service Configuration + +Here's the actual ServiceConfig from Forge source code: + +```python +# Configuration pattern from apps/grpo/main.py: +Policy.options( + procs=1, # Processes per replica + num_replicas=4, # Number of replicas + with_gpus=True # Allocate GPUs + # Other available options: + # hosts=None +) + +# This is the ACTUAL way services are configured in Forge +``` + +### 2. Real Service Creation + +Services are created using the `spawn_service` function: + +```python +# This is what ACTUALLY works - copied directly from the notebook + +from forge.controller.service import ServiceConfig, spawn_service +from forge.actors.policy import Policy, PolicyConfig, SamplingOverrides, WorkerConfig + +model = "Qwen/Qwen3-1.7B" + +policy = await spawn_service( + ServiceConfig(procs_per_replica=1, with_gpus=True, num_replicas=1), + Policy, + config=PolicyConfig( + worker_params=WorkerConfig(model=model), + sampling_params=SamplingOverrides( + num_samples=1, max_tokens=16 + ), + ), +) + +prompt = "What is 3 + 5?" +responses = await policy.generate.choose(prompt=prompt) +print(f"Response: {responses[0].text}") + +# The spawn_service() function automatically handles: +# - Spawning actor replicas across processes/GPUs +# - Load balancing with .choose() method +# - Health monitoring and failure recovery +# - Message routing and serialization + +# Cleanup when done +await shutdown_service(policy) +``` + +### 3. How Services Actually Work + +Forge services are implemented as ServiceActors that manage collections of your ForgeActor replicas: + +```python +# Forge internals - What happens behind the scenes: +# 1. .as_service() creates a ServiceInterface +# 2. ServiceInterface manages N replicas of your ForgeActor class +# 3. ServiceInterface handles routing between replicas +# 4. You get methods like .route(), .fanout(), etc. + +# Your code sees this: +responses = await policy.generate.route(prompt=prompt) + +# But behind the scenes: +# - ServiceInterface selects healthy replica +# - Routes message to that replica's Policy.generate() endpoint +# - Handles failures and retries automatically +# - Returns list[Completion] from the selected replica +``` + +### 3. Different Service Types and Their Characteristics + +```mermaid +graph TD + subgraph GPU["GPU-Intensive Services"] + PolicySvc["Policy Service
Large model inference
High GPU memory
Batch optimization"] + TrainerSvc["Trainer Service
Distributed training
Gradient sync
Massive compute"] + RefSvc["Reference Service
Frozen model
Baseline computation
Read-only ops"] + end + + subgraph CPU["CPU-Intensive Services"] + RewardSvc["Reward Service
Evaluation logic
Rule-based scoring
High throughput"] + DataSvc["Data Service
Dataset streaming
Preprocessing
I/O optimization"] + end + + subgraph Memory["Memory-Intensive Services"] + BufferSvc["Buffer Service
Experience storage
Efficient sampling
Persistence"] + MetricsSvc["Metrics Service
Logging aggregation
Performance tracking
Analytics"] + end + + style PolicySvc fill:#ff9999 + style TrainerSvc fill:#ff9999 + style RewardSvc fill:#99ff99 + style BufferSvc fill:#9999ff +``` + +## Deep Dive: Service Communication Patterns + +These communication patterns (\"adverbs\") determine how your service calls are routed to replicas. Understanding when to use each pattern is key to effective Forge usage. + +### 1. `.route()` - Load Balanced Single Replica + +**When to use**: Normal request routing where any replica can handle the request. + +```python +responses = await policy.generate.route(prompt=question) +answer = responses[0].text # Extract text from Completion object + +# Behind the scenes: +# 1. Health check eliminates failed replicas +# 2. Load balancer picks least loaded healthy replica +# 3. Request routes to that specific replica +# 4. Automatic retry on different replica if failure +``` + +**Performance characteristics**: +- **Latency**: Lowest (single network hop) +- **Throughput**: Limited by single replica capacity +- **Fault tolerance**: Automatic failover to other replicas + +**Critical insight**: `.route()` is your default choice for stateless operations in Forge services. + +### 2. `.fanout()` - Broadcast with Results Collection + +**When to use**: You need responses from ALL replicas. + +```python +# Get version from all policy replicas +current_versions = await policy.get_version.fanout() +# Returns: [version_replica_1, version_replica_2, ...] + +# Update weights on all replicas +await policy.update_weights.fanout(new_policy_version) +# Broadcasts to all replicas simultaneously +``` + +**Performance characteristics**: +- **Latency**: Slowest replica determines total latency +- **Throughput**: Network bandwidth × number of replicas +- **Fault tolerance**: Fails if ANY replica fails (unless configured otherwise) + +**Critical gotcha**: Don't use `.fanout()` for high-frequency operations - it contacts all replicas. + +### 3. Streaming Operations - Custom Implementation Pattern + +**When to use**: You want to process results as they arrive, not wait for all. + +```python +# 📝 CONCEPTUAL - Streaming requires custom implementation in your training loop +# The basic ReplayBuffer doesn't have built-in streaming methods +# Pattern from apps/grpo/main.py continuous training: + +while training: + # This is the real API call pattern + batch = await replay_buffer.sample.call_one(curr_policy_version=step) + if batch is not None: + # Process batch immediately + loss = await trainer.train_step.call_one(batch) + print(f"Training loss: {loss}") + else: + await asyncio.sleep(0.1) # Wait for more data +``` + +**Performance characteristics**: +- **Latency**: Process first result immediately +- **Throughput**: Pipeline parallelism (much higher than sequential) +- **Fault tolerance**: Continues if some replicas fail + +**Critical insight**: This is essential for high-throughput RL where you can't wait for batches. + +### 4. Fire-and-Forget Operations + +**When to use**: Side effects that don't need responses (notifications, cache updates). 
+ +```python +# 📝 CONCEPTUAL - Fire-and-forget requires custom @endpoint implementations +# The basic services don't have broadcast methods built-in +# You would implement custom endpoints in your ForgeActor: + +class CustomPolicy(Policy): + @endpoint + async def clear_cache(self) -> None: + """Custom endpoint for cache clearing""" + self.policy_worker.clear_kv_cache() + +# Then use it (hypothetical): +# await custom_policy.clear_cache.fanout() # Clear all replica caches +# Note: Actual cache clearing would use existing Policy methods +``` + +**Performance characteristics**: +- **Latency**: Immediately returns (doesn't wait for completion) +- **Throughput**: Network limited, but non-blocking +- **Fault tolerance**: Fire-and-forget (you don't know if it worked) + +**Critical warning**: Only use for non-critical operations - you get no confirmation. + +### 5. Service Sessions for Stateful Operations + +**When to use**: When you need multiple calls to hit the same replica (like KV cache preservation). + +```python +# This Counter example demonstrates the session pattern + +from forge.controller import ForgeActor +from forge.controller.service import ServiceConfig, spawn_service, shutdown_service +from monarch.actor import endpoint + +class ForgeCounter(ForgeActor): + def __init__(self, initial_value: int): + self.value = initial_value + + @endpoint + def increment(self) -> int: + self.value += 1 + return self.value + + @endpoint + def get_value(self) -> int: + return self.value + + @endpoint + async def reset(self): + self.value = 0 + +counter_service = await spawn_service( + ServiceConfig(procs_per_replica=1, num_replicas=4), + ForgeCounter, + initial_value=0 +) + +# Test basic operations +await counter_service.increment.choose() +results = await counter_service.increment.call() +print(f"All replica values: {results}") + +# STICKY SESSIONS +print("\nUsing sticky sessions:") +async with counter_service.session(): + await counter_service.reset.choose() + print(await counter_service.increment.choose()) # 1 + print(await counter_service.increment.choose()) # 2 + print(await counter_service.increment.choose()) # 3 + + final_value = await counter_service.get_value.choose() + print(f"Final value on this replica: {final_value}") # 3 + +# Same pattern works with Policy for multi-turn conversations: +# async with policy.session(): +# response1 = await policy.generate.choose(prompt=turn1) +# full_prompt = turn1 + response1[0].text + turn2 +# response2 = await policy.generate.choose(prompt=full_prompt) +# # Both calls hit same replica, preserving KV cache + +# Cleanup +await shutdown_service(counter_service) +``` + +**Performance impact**: Critical for maintaining KV cache in multi-turn conversations. + +## Deep Dive: State Management Reality + +The most complex challenge in distributed RL is maintaining state consistency while maximizing performance. + +### The KV Cache Problem + +**The challenge**: Policy inference is much faster with KV cache, but cache is tied to specific conversation history. + +```python +# This breaks KV cache optimization: +async def naive_multi_turn(): + # Each call might go to different replica = cache miss + response1 = await policy_service.generate.choose(question1) + response2 = await policy_service.generate.choose(question1 + response1) # Cache miss! + response3 = await policy_service.generate.choose(conversation_so_far) # Cache miss! +``` + +**The solution**: Sticky sessions ensure all calls go to same replica. 
+ +```python +async def optimized_multi_turn(): + async with policy.session(): + # All calls guaranteed to hit same replica = cache hits + response1 = await policy.generate.route(prompt=question1) + full_prompt = question1 + response1[0].text + response2 = await policy.generate.route(prompt=full_prompt) # Cache hit! + conversation = full_prompt + response2[0].text + response3 = await policy.generate.route(prompt=conversation) # Cache hit! + + # Session ends, replica can be garbage collected or reused +``` + +**Performance impact**: Maintaining KV cache across turns avoids recomputing previous tokens. + +### Replay Buffer Consistency + +**The challenge**: Multiple trainers and experience collectors reading/writing concurrently. + +**Real Forge approach**: The ReplayBuffer actor handles concurrency internally: + +```python +# Forge ReplayBuffer endpoints (verified from source code) +# Add episodes (thread-safe by actor model) +await replay_buffer.add.call_one(episode) # Note: .call_one() not .choose() + +# Sample batches for training +batch = await replay_buffer.sample.call_one( + curr_policy_version=step_number, + batch_size=None # Optional parameter, uses default from config +) + +# Additional methods available: +# await replay_buffer.clear.call_one() # Clear buffer +# await replay_buffer.evict.call_one(curr_policy_version) # Remove old episodes +# state = await replay_buffer.state_dict.call_one() # Get state for checkpointing +``` + +**Critical insight**: The actor model provides natural thread safety - each actor processes messages sequentially. + +### Weight Synchronization Strategy + +**The challenge**: Trainer updates policy weights, but policy service needs those weights. + +```python +# Forge weight synchronization pattern from apps/grpo/main.py +async def real_weight_sync(trainer, policy, step): + # Trainer pushes weights to TorchStore with version number + await trainer.push_weights.call_one(policy_version=step + 1) + + # Policy service updates to new version from TorchStore + # Use .fanout() to update ALL policy replicas + await policy.update_weights.fanout(policy_version=step + 1) + +# Check current policy version +current_version = await policy.get_version.route() +print(f"Current policy version: {current_version}") +``` + +## Deep Dive: Asynchronous Coordination Patterns + +**The real challenge**: Different services run at different speeds, but Forge's service abstraction handles the coordination complexity. 
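To make that speed mismatch concrete, picture generation and training as two loops that only meet at the replay buffer. The sketch below reuses the endpoint names from the earlier snippets (`add.call_one`, `sample.call_one`, `train_step.call_one`); `collect_one_episode` is an illustrative placeholder, not a Forge API:

```python
import asyncio

async def rollout_loop(services, state):
    """Produce episodes as fast as generation + scoring allow."""
    while not state["done"]:
        # Illustrative helper: generate, score, and package one episode
        episode = await collect_one_episode(services, state["version"])
        await services['replay_buffer'].add.call_one(episode)

async def training_loop(services, state):
    """Consume batches whenever enough fresh data has accumulated."""
    while not state["done"]:
        batch = await services['replay_buffer'].sample.call_one(
            curr_policy_version=state["version"]
        )
        if batch is None:
            await asyncio.sleep(0.1)  # buffer not ready yet - natural rate limiting
            continue
        await services['trainer'].train_step.call_one(batch)
        state["version"] += 1

# Run both concurrently; the buffer absorbs the speed difference:
# await asyncio.gather(rollout_loop(services, state), training_loop(services, state))
```

The replay buffer mediates between the two loops, which is the pattern the rest of this section builds on.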
+ +### The Forge Approach: Let Services Handle Coordination + +Instead of manual coordination, Forge services handle speed mismatches automatically: + +```python + +from apps.grpo.main import Episode, Group + +async def simple_rl_step(): + + # ===== Generate a rollout ===== + sample = await dataloader.__next__.choose() + prompt, target = sample["question"], sample["answer"] + + print(f"Prompt: {prompt}") + print(f"Target: {target}") + + actions = await policy.generate.choose(prompt=prompt) + print(f"Policy response: {actions[0].text}") + + ref_logprobs = await ref_model.forward.choose(actions[0].token_ids) + reward = await reward_actor.evaluate_response.choose( + prompt=prompt, + response=actions[0].text, + target=target + ) + print(f"Reward: {reward}") + + episode = Episode( + episode_id=0, + prompt=prompt, + target=target, + policy_version=0, + ) + + episode.add_group(Group( + response=actions[0].text, + ref_logprobs=ref_logprobs, + reward=reward, + )) + + advantages = await compute_advantages.__call__.choose(episode.groups) + episode.groups[0].advantage = advantages[0] + print(f"Advantage: {advantages[0]}") + await replay_buffer.add.choose(episode) + print("Episode stored in replay buffer") + + # ===== Train on the batch ===== + batch = await replay_buffer.sample.choose(curr_policy_version=0) + if batch is not None: + print("Training on batch...") + training_result = await trainer.train_step.choose(batch) + loss = training_result.get("loss", 0.0) + print(f"Training loss: {loss}") + return loss + else: + print("Not enough data in buffer yet") + return None + +for step in range(10): + print(f"\n--- RL Step {step + 1} ---") + loss = await simple_rl_step() + if loss: + print(f"Step {step + 1} complete, loss: {loss:.4f}") + else: + print(f"Step {step + 1} complete, building buffer...") +``` + +### Handling Speed Mismatches with Service Scaling + +**The insight**: Scale services independently based on their bottlenecks. + +```python +# Scale fast services with more replicas +policy = await Policy.options( + procs=1, num_replicas=8, with_gpus=True # Many replicas for high throughput +).as_service( + engine_config=EngineConfig(model=model_name) +) + +# Reward evaluation might be CPU-bound +reward_actor = await RewardActor.options( + procs=1, num_replicas=16, with_gpus=False # More CPU replicas +).as_service( + reward_functions=[MathReward()] +) + +# Training needs fewer but more powerful replicas +trainer = await RLTrainer.options( + procs=1, num_replicas=2, with_gpus=True # Fewer but GPU-heavy +).as_actor( # Trainer typically uses .as_actor() not .as_service() + optimizer=Optimizer(lr=1e-5) +) +``` + +### Natural Backpressure Through Service APIs + +```python +# backpressure pattern - The replay buffer naturally provides backpressure +batch = await replay_buffer.sample.call_one(curr_policy_version=step) +if batch is None: + # Not enough data yet - natural rate limiting + print("Buffer not ready, collecting more experiences...") + continue +else: + # Proceed with training + loss = await trainer.train_step.call_one(batch) + print(f"Training loss: {loss}") +``` + +These patterns address the core technical challenges in distributed RL. The key insight: **Forge services handle coordination complexity automatically, letting you focus on RL algorithm logic**. 
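Much of that algorithm logic lives in the reward functions themselves. The `RewardActor` shown in the next section simply calls each entry in `reward_functions` as `reward_fn(prompt, response, target)`, so a custom reward is just a callable with that signature. A hedged sketch (`MathReward` and `ThinkingReward` are the real imports from `forge.data.rewards`; the class below is purely illustrative):

```python
class ExactMatchReward:
    """Illustrative reward: full score when the target string appears in the response."""

    def __init__(self, weight: float = 1.0):
        self.weight = weight

    def __call__(self, prompt: str, response: str, target: str) -> float:
        return self.weight if target.strip() in response else 0.0

# Could be mixed with the built-in rewards when constructing the actor:
# RewardActor(reward_functions=[MathReward(), ExactMatchReward(weight=0.5)])
```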
+ +## Service Implementation Example + +Let's see how a reward service is actually implemented: + +```python +# ✅ COMPLETE WORKING EXAMPLE - Exact RewardActor from apps/grpo/main.py + +from forge.controller import ForgeActor +from monarch.actor import endpoint +from forge.data.rewards import MathReward, ThinkingReward +from forge.controller.service import ServiceConfig, spawn_service + +# EXACT class definition from apps/grpo/main.py lines 68-83 +class RewardActor(ForgeActor): + def __init__(self, reward_functions: list): + self.reward_functions = reward_functions + + @endpoint + async def evaluate_response(self, prompt: str, response: str, target: str) -> float: + """Evaluate response quality using multiple reward functions""" + total_reward = 0.0 + + for reward_fn in self.reward_functions: + # Each reward function contributes to total score + reward = reward_fn(prompt, response, target) + total_reward += reward + + # Return average reward across all functions + return total_reward / len(self.reward_functions) if self.reward_functions else 0.0 + +reward_actor = await spawn_service( + ServiceConfig(procs_per_replica=1, num_replicas=1), + RewardActor, + reward_functions=[MathReward(), ThinkingReward()] +) + +prompt = "What is 15% of 240?" +response = "15% of 240 is 36" +target = "36" + +score = await reward_actor.evaluate_response.choose( + prompt=prompt, + response=response, + target=target +) +print(f"Reward score: {score}") # Usually around 1.0 for correct math answers + +# For production scaling - increase num_replicas for parallel evaluation: +# ServiceConfig(procs_per_replica=1, num_replicas=16) # 16 parallel evaluators + +# Cleanup when done +await shutdown_service(reward_actor) +``` + +## Service Orchestration: The Training Loop + +Now let's see how services coordinate in a real training loop: + +```python +# This is the REAL way production RL systems are built with Forge + +import asyncio +from forge.actors.policy import Policy +from forge.actors.reference_model import ReferenceModel +from forge.actors.replay_buffer import ReplayBuffer +from forge.actors.trainer import RLTrainer +from forge.controller.actor import ForgeActor +from forge.data.rewards import MathReward, ThinkingReward +from monarch.actor import endpoint +from omegaconf import DictConfig + +# EXACT service creation from apps/grpo/main.py lines 322-344 +print("Initializing all services...") +( + dataloader, + policy, + trainer, + replay_buffer, + compute_advantages, + ref_model, + reward_actor, +) = await asyncio.gather( + DatasetActor.options(**cfg.actors.dataset).as_actor(**cfg.dataset), + Policy.options(**cfg.services.policy).as_service(**cfg.policy), + RLTrainer.options(**cfg.actors.trainer).as_actor( + **cfg.trainer, loss=simple_grpo_loss + ), + ReplayBuffer.options(**cfg.actors.replay_buffer).as_actor( + **cfg.replay_buffer, collate=collate + ), + ComputeAdvantages.options(**cfg.actors.compute_advantages).as_actor(), + ReferenceModel.options(**cfg.services.ref_model).as_service(**cfg.ref_model), + RewardActor.options(**cfg.services.reward_actor).as_service( + reward_functions=[MathReward(), ThinkingReward()] + ), +) + +print("All services initialized successfully!") + +# EXACT usage patterns from apps/grpo/main.py continuous training loop +async def production_training_loop(): + """Real training loop pattern from apps/grpo/main.py""" + step = 0 + + while True: + # Data generation + sample = await dataloader.sample.call_one() + + # Policy generation service call + responses = await 
policy.generate.route(prompt=sample["question"]) + + # Reference computation service call + ref_logprobs = await ref_model.forward.route(responses[0].token_ids) + + # Reward evaluation service call + reward = await reward_actor.evaluate_response.route( + prompt=sample["question"], + response=responses[0].text, + target=sample["answer"] + ) + + # Experience storage (simplified structure for illustration) + episode = create_episode(sample, responses[0], reward, ref_logprobs, step) + await replay_buffer.add.call_one(episode) + + # Training when ready endpoints + batch = await replay_buffer.sample.call_one(curr_policy_version=step) + if batch is not None: + loss = await trainer.train_step.call_one(batch) + + # Weight synchronization pattern + await trainer.push_weights.call_one(step + 1) + await policy.update_weights.route(step + 1) + + print(f"Step {step}, Loss: {loss:.4f}") + step += 1 + +# EXACT cleanup pattern from apps/grpo/main.py lines 493-504 +print("Shutting down services...") +await asyncio.gather( + DatasetActor.shutdown(dataloader), + policy.shutdown(), + RLTrainer.shutdown(trainer), + ReplayBuffer.shutdown(replay_buffer), + ComputeAdvantages.shutdown(compute_advantages), + ref_model.shutdown(), + reward_actor.shutdown(), +) +print("All services shut down successfully!") +``` + +**Key observations:** +1. **Parallelism**: Independent operations run concurrently +2. **Load balancing**: Each `choose()` call automatically selects optimal replica +3. **Fault tolerance**: Failures automatically retry on different replicas +4. **Resource efficiency**: CPU and GPU services scale independently +5. **Coordination**: Services coordinate through shared state (replay buffer, weight versions) + +This is the power of the service abstraction - complex distributed coordination looks like simple async Python code. diff --git a/docs/Tutorials/3_.MD b/docs/Tutorials/3_.MD deleted file mode 100644 index e69de29bb..000000000 diff --git a/docs/Tutorials/2_.MD b/docs/Tutorials/3_Monarch_101.MD similarity index 100% rename from docs/Tutorials/2_.MD rename to docs/Tutorials/3_Monarch_101.MD From f3710077ec97e3289b5a3aa3882fee48572bb223 Mon Sep 17 00:00:00 2001 From: Sanyam Bhutani Date: Thu, 2 Oct 2025 19:15:10 -0700 Subject: [PATCH 06/28] add part 3 --- docs/Tutorials/3_Monarch_101.MD | 437 ++++++++++++++++++++++++++++++++ docs/Tutorials/ReadMe.MD | 4 +- 2 files changed, 439 insertions(+), 2 deletions(-) diff --git a/docs/Tutorials/3_Monarch_101.MD b/docs/Tutorials/3_Monarch_101.MD index e69de29bb..9369be13a 100644 --- a/docs/Tutorials/3_Monarch_101.MD +++ b/docs/Tutorials/3_Monarch_101.MD @@ -0,0 +1,437 @@ +# Part 3: The Forge-Monarch Connection + +Now let's peel back the layers. Forge services are built on top of **Monarch**, PyTorch's distributed actor framework. Understanding this connection is crucial for optimization and debugging. + +## The Complete Hierarchy: Service to Silicon + +```mermaid +graph TD + subgraph YourCode["1. Your RL Code"] + Call["await policy_service.generate.choose('What is 2+2?')"] + end + + subgraph ForgeServices["2. Forge Service Layer"] + ServiceInterface["ServiceInterface
• Routes .choose() to replica
• Handles load balancing
• Manages health checks"] + ServiceActor["ServiceActor
• Manages replica lifecycle
• Monitors health
• Coordinates failures"] + end + + subgraph MonarchLayer["3. Monarch Actor Layer"] + ActorMesh["ActorMesh[PolicyActor]
• 4 PolicyActor instances
• Each on different GPU
• Message passing interface"] + ProcMesh["ProcMesh
• 4 processes
• GPU topology: [0,1,2,3]
• Network interconnect"] + end + + subgraph Hardware["4. Physical Hardware"] + GPU0["GPU 0
PolicyActor #1
vLLM Engine
Model Weights"] + GPU1["GPU 1
PolicyActor #2
vLLM Engine
Model Weights"] + GPU2["GPU 2
PolicyActor #3
vLLM Engine
Model Weights"] + GPU3["GPU 3
PolicyActor #4
vLLM Engine
Model Weights"] + end + + Call --> ServiceInterface + ServiceInterface --> ServiceActor + ServiceActor --> ActorMesh + ActorMesh --> ProcMesh + ProcMesh --> GPU0 + ProcMesh --> GPU1 + ProcMesh --> GPU2 + ProcMesh --> GPU3 + + style Call fill:#99ff99 + style ServiceActor fill:#ffcc99 + style ActorMesh fill:#cc99ff + style ProcMesh fill:#ccccff +``` + +## Deep Dive: ProcMesh - The Foundation + +**ProcMesh** is Monarch's core abstraction for organizing processes across hardware. Think of it as a multi-dimensional grid that maps directly to your cluster topology. + +### Single Host ProcMesh + +```mermaid +graph TD + subgraph Host["Single Host (8 GPUs)"] + subgraph ProcMesh["ProcMesh: per_host={'gpus': 8}"] + P0["Process 0
GPU 0"] + P1["Process 1
GPU 1"] + P2["Process 2
GPU 2"] + P3["Process 3
GPU 3"] + P4["Process 4
GPU 4"] + P5["Process 5
GPU 5"] + P6["Process 6
GPU 6"] + P7["Process 7
GPU 7"] + end + + P0 -.->|"Network"| P1 + P1 -.->|"Network"| P2 + P2 -.->|"Network"| P3 + P3 -.->|"Network"| P4 + P4 -.->|"Network"| P5 + P5 -.->|"Network"| P6 + P6 -.->|"Network"| P7 + P7 -.->|"Network"| P0 + end + + style P0 fill:#ff9999 + style P1 fill:#ff9999 + style P2 fill:#ff9999 + style P3 fill:#ff9999 + style P4 fill:#ff9999 + style P5 fill:#ff9999 + style P6 fill:#ff9999 + style P7 fill:#ff9999 +``` + +### Multi-Host ProcMesh + +```mermaid +graph TD + subgraph Cluster["Multi-Host Cluster"] + subgraph Host1["Host 1"] + subgraph PM1["ProcMesh Segment 1"] + H1P0["Process 0
GPU 0"] + H1P1["Process 1
GPU 1"] + H1P2["Process 2
GPU 2"] + H1P3["Process 3
GPU 3"] + end + end + + subgraph Host2["Host 2"] + subgraph PM2["ProcMesh Segment 2"] + H2P0["Process 4
GPU 0"] + H2P1["Process 5
GPU 1"] + H2P2["Process 6
GPU 2"] + H2P3["Process 7
GPU 3"] + end + end + + subgraph Host3["Host 3"] + subgraph PM3["ProcMesh Segment 3"] + H3P0["Process 8
GPU 0"] + H3P1["Process 9
GPU 1"] + H3P2["Process 10
GPU 2"] + H3P3["Process 11
GPU 3"] + end + end + end + + H1P0 -.->|"InfiniBand"| H2P0 + H1P1 -.->|"InfiniBand"| H2P1 + H2P0 -.->|"InfiniBand"| H3P0 + H2P1 -.->|"InfiniBand"| H3P1 + + style PM1 fill:#ff9999 + style PM2 fill:#99ff99 + style PM3 fill:#99ccff +``` + +```python +# This shows the underlying actor system that powers Forge services + +from monarch.actor import Actor, endpoint, this_proc, Future +from monarch.actor import ProcMesh, this_host +import asyncio + +# STEP 1: Define a basic actor +class Counter(Actor): + def __init__(self, initial_value: int): + self.value = initial_value + + @endpoint + def increment(self) -> None: + self.value += 1 + + @endpoint + def get_value(self) -> int: + return self.value + +# STEP 2: Single actor in local process +counter: Counter = this_proc().spawn("counter", Counter, initial_value=0) + +# STEP 3: Send messages +fut: Future[int] = counter.get_value.call_one() +value = await fut +print(f"Counter value: {value}") # 0 + +# STEP 4: Multiple actors across processes +procs: ProcMesh = this_host().spawn_procs(per_host={"gpus": 8}) +counters: Counter = procs.spawn("counters", Counter, 0) + +# STEP 5: Broadcast to all actors +await counters.increment.call() + +# STEP 6: Different message patterns +# call_one() - single actor +value = await counters.get_value.call_one() +print(f"One counter: {value}") + +# choose() - random single actor +value = await counters.get_value.choose() +print(f"Random counter: {value}") + +# call() - all actors, collect results +values = await counters.get_value.call() +print(f"All counters: {values}") + +# broadcast() - fire and forget +await counters.increment.broadcast() + +# Cleanup +await procs.stop() +``` + +## Actor Meshes: Your Code Running Distributed + +**ActorMesh** is created when you spawn actors across a ProcMesh. Each process in the ProcMesh gets one instance of your actor. + +```mermaid +graph TD + subgraph Creation["Actor Creation Process"] + Code["mesh.spawn('policy', PolicyActor, model='Qwen/Qwen3-7B')"] + + subgraph ProcMesh["ProcMesh (4 processes)"] + P0["Process 0
GPU 0"] + P1["Process 1
GPU 1"] + P2["Process 2
GPU 2"] + P3["Process 3
GPU 3"] + end + + subgraph ActorMesh["ActorMesh[PolicyActor]"] + A0["PolicyActor
Instance #0
model=Qwen/Qwen3-7B
generation_count=0"] + A1["PolicyActor
Instance #1
model=Qwen/Qwen3-7B
generation_count=0"] + A2["PolicyActor
Instance #2
model=Qwen/Qwen3-7B
generation_count=0"] + A3["PolicyActor
Instance #3
model=Qwen/Qwen3-7B
generation_count=0"] + end + + Code --> ProcMesh + P0 --> A0 + P1 --> A1 + P2 --> A2 + P3 --> A3 + end + + style A0 fill:#99ff99 + style A1 fill:#99ff99 + style A2 fill:#99ff99 + style A3 fill:#99ff99 +``` + +### Message Routing Through ActorMesh + +```mermaid +graph TD + subgraph MessageFlow["Message Flow Patterns"] + Client["await policy_actors.generate.METHOD(prompt)"] + + subgraph Methods["Different Adverbs Route Differently"] + Choose["choose()
→ Routes to ONE actor
→ Load balanced"] + Call["call()
→ Routes to ALL actors
→ Collects all results"] + Broadcast["broadcast()
→ Routes to ALL actors
→ Fire and forget"] + Stream["stream()
→ Routes to ALL actors
→ Iterator of results"] + end + + subgraph ActorInstances["PolicyActor Instances"] + A0["Actor 0
GPU 0
generates response"] + A1["Actor 1
GPU 1
generates response"] + A2["Actor 2
GPU 2
generates response"] + A3["Actor 3
GPU 3
generates response"] + end + + Client --> Choose + Client --> Call + Client --> Broadcast + Client --> Stream + + Choose -.->|"Load balanced"| A1 + Call --> A0 + Call --> A1 + Call --> A2 + Call --> A3 + Broadcast --> A0 + Broadcast --> A1 + Broadcast --> A2 + Broadcast --> A3 + Stream --> A0 + Stream --> A1 + Stream --> A2 + Stream --> A3 + end + + style Choose fill:#99ff99 + style Call fill:#ffcc99 + style Broadcast fill:#ff99cc + style Stream fill:#cc99ff +``` + +## How Forge Services Use Monarch + +Now the key insight: **Forge services are ServiceActors that manage ActorMeshes of your ForgeActor replicas**. + +### The Service Creation Process + +```mermaid +graph TD + subgraph ServiceCreation["spawn_service() Process"] + Call["await spawn_service(ServiceConfig(num_replicas=4), PolicyActor, model='Qwen')"] + + ServiceActor["ServiceActor
• Manages 4 replicas
• Handles health checks
• Routes service calls"] + + subgraph Replicas["4 Independent Replicas"] + subgraph R0["Replica 0"] + PM0["ProcMesh
1 process
GPU 0"] + AM0["ActorMesh
1 PolicyActor"] + end + + subgraph R1["Replica 1"] + PM1["ProcMesh
1 process
GPU 1"] + AM1["ActorMesh
1 PolicyActor"] + end + + subgraph R2["Replica 2"] + PM2["ProcMesh
1 process
GPU 2"] + AM2["ActorMesh
1 PolicyActor"] + end + + subgraph R3["Replica 3"] + PM3["ProcMesh
1 process
GPU 3"] + AM3["ActorMesh
1 PolicyActor"] + end + end + + Call --> ServiceActor + ServiceActor --> R0 + ServiceActor --> R1 + ServiceActor --> R2 + ServiceActor --> R3 + PM0 --> AM0 + PM1 --> AM1 + PM2 --> AM2 + PM3 --> AM3 + end + + style ServiceActor fill:#ffcc99 + style AM0 fill:#99ff99 + style AM1 fill:#99ff99 + style AM2 fill:#99ff99 + style AM3 fill:#99ff99 +``` + +### Service Call to Actor Execution + +```mermaid +graph TD + subgraph CallFlow["Complete Call Flow"] + UserCall["await policy_service.generate.choose('What is 2+2?')"] + + ServiceInterface["ServiceInterface
• Receives .choose() call
• Routes to ServiceActor"] + + ServiceActor["ServiceActor
• Selects healthy replica
• Load balancing logic
• Failure handling"] + + SelectedReplica["Selected Replica #2
• ProcMesh with 1 process
• ActorMesh with 1 PolicyActor"] + + PolicyActor["PolicyActor Instance
• Loads model
• Runs vLLM inference
• Returns 'The answer is 4'"] + + GPU["GPU 2
• vLLM engine
• Model weights
• KV cache
• CUDA kernels"] + + UserCall --> ServiceInterface + ServiceInterface --> ServiceActor + ServiceActor --> SelectedReplica + SelectedReplica --> PolicyActor + PolicyActor --> GPU + + GPU -.->|"Response"| PolicyActor + PolicyActor -.->|"Response"| SelectedReplica + SelectedReplica -.->|"Response"| ServiceActor + ServiceActor -.->|"Response"| ServiceInterface + ServiceInterface -.->|"'The answer is 4'"| UserCall + end + + style UserCall fill:#99ff99 + style ServiceActor fill:#ffcc99 + style PolicyActor fill:#cc99ff + style GPU fill:#ffcccc +``` + +## Multiple Services Sharing Infrastructure + +In real RL systems, you have multiple services that can share or use separate ProcMeshes: + +```mermaid +graph TD + subgraph Cluster["RL Training Cluster"] + subgraph Services["Forge Services"] + PS["Policy Service
4 GPU replicas"] + TS["Trainer Service
2 GPU replicas"] + RS["Reward Service
4 CPU replicas"] + BS["Buffer Service
1 CPU replica"] + end + + subgraph MonarchInfra["Monarch Infrastructure"] + subgraph GPUMesh["GPU ProcMesh (6 processes)"] + G0["Process 0
GPU 0"] + G1["Process 1
GPU 1"] + G2["Process 2
GPU 2"] + G3["Process 3
GPU 3"] + G4["Process 4
GPU 4"] + G5["Process 5
GPU 5"] + end + + subgraph CPUMesh["CPU ProcMesh (5 processes)"] + C0["Process 0
CPU"] + C1["Process 1
CPU"] + C2["Process 2
CPU"] + C3["Process 3
CPU"] + C4["Process 4
CPU"] + end + end + + PS --> G0 + PS --> G1 + PS --> G2 + PS --> G3 + TS --> G4 + TS --> G5 + RS --> C0 + RS --> C1 + RS --> C2 + RS --> C3 + BS --> C4 + end + + style PS fill:#99ff99 + style TS fill:#ff99cc + style RS fill:#ffcc99 + style BS fill:#cc99ff + style GPUMesh fill:#ffe6e6 + style CPUMesh fill:#e6f3ff +``` + +## Key Insights: Why This Architecture Matters + +1. **Process Isolation**: Each actor runs in its own process - failures don't cascade +2. **Location Transparency**: Actors can be local or remote with identical APIs +3. **Structured Distribution**: ProcMesh maps directly to hardware topology +4. **Message Passing**: No shared memory means no race conditions or locks +5. **Service Abstraction**: Forge hides Monarch complexity while preserving power + +Understanding this hierarchy helps you: +- **Debug performance issues**: Is the bottleneck at service, actor, or hardware level? +- **Optimize resource usage**: How many replicas per service? GPU vs CPU processes? +- **Handle failures gracefully**: Which layer failed and how to recover? +- **Scale effectively**: Where to add resources for maximum impact? + +# Conclusion + +## What You've Learned + +1. **RL Fundamentals**: How RL concepts map to Forge services with REAL, working examples +2. **Service Abstraction**: How to use Forge services effectively with verified communication patterns +3. **Monarch Foundation**: How Forge services connect to distributed actors and hardware + +## Key Takeaways + +- **Services hide complexity**: Your RL code looks like simple async functions, but runs on distributed clusters +- **Communication patterns matter**: `.route()`, `.fanout()`, sessions, and `.call_one()` each serve specific purposes +- **Architecture understanding helps**: Knowing the Service → Actor → Process → Hardware hierarchy helps you debug, optimize, and scale +- **Always verify APIs**: This guide is verified, but cross-check with source code for latest changes +- **Real API patterns**: Use `.options().as_service()` not `spawn_service()`, use `.route()` not `.choose()`, etc. diff --git a/docs/Tutorials/ReadMe.MD b/docs/Tutorials/ReadMe.MD index 01d750d06..7798b147d 100644 --- a/docs/Tutorials/ReadMe.MD +++ b/docs/Tutorials/ReadMe.MD @@ -11,8 +11,8 @@ Welcome to the Tutorials section! This section is inspired by the A-Z PyTorch tu This section currently is structured in 3 detailed parts: 1. [RL Fundamentals and Understanding Forge Terminology](./1_RL_and_Forge_Fundamentals.MD): This gives a quick refresher of Reinforcement Learning and teaches you Forge Fundamentals -2. []() -3. []() +2. [Forge Internals](./2_Forge_Internals.MD): Goes a layer deeper and explains the internals of Forge +3. [Monarch 101](./3_Monarch_101.MD): It's a 101 to Monarch and how Forge Talks to Monarch Each part builds upon the next and the entire section can be consumed in roughly an hour-Grab a Chai and Enjoy! 
From 2c39598383a6d4efe4183edf9e9d3c703ea78056 Mon Sep 17 00:00:00 2001 From: Sanyam Bhutani Date: Thu, 2 Oct 2025 19:26:45 -0700 Subject: [PATCH 07/28] Update 2_Forge_Internals.MD --- docs/Tutorials/2_Forge_Internals.MD | 42 ++++++++++++++--------------- 1 file changed, 20 insertions(+), 22 deletions(-) diff --git a/docs/Tutorials/2_Forge_Internals.MD b/docs/Tutorials/2_Forge_Internals.MD index d55eda51a..0c810a08e 100644 --- a/docs/Tutorials/2_Forge_Internals.MD +++ b/docs/Tutorials/2_Forge_Internals.MD @@ -8,6 +8,8 @@ Now that you see the power of the service abstraction, let's understand what's a When you call `await policy_service.generate(question)`, here's what actually happens: +(Don't worry, we will understand Services right in the next section!) + ```mermaid graph TD Call["Your Code:
await policy_service.generate"] @@ -58,17 +60,19 @@ Policy.options( # Other available options: # hosts=None ) - -# This is the ACTUAL way services are configured in Forge ``` ### 2. Real Service Creation Services are created using the `spawn_service` function: -```python -# This is what ACTUALLY works - copied directly from the notebook +The spawn_service() function automatically handles: +- Spawning actor replicas across processes/GPUs +- Load balancing with .choose() method +- Health monitoring and failure recovery +- Message routing and serialization +```python from forge.controller.service import ServiceConfig, spawn_service from forge.actors.policy import Policy, PolicyConfig, SamplingOverrides, WorkerConfig @@ -89,12 +93,6 @@ prompt = "What is 3 + 5?" responses = await policy.generate.choose(prompt=prompt) print(f"Response: {responses[0].text}") -# The spawn_service() function automatically handles: -# - Spawning actor replicas across processes/GPUs -# - Load balancing with .choose() method -# - Health monitoring and failure recovery -# - Message routing and serialization - # Cleanup when done await shutdown_service(policy) ``` @@ -103,23 +101,23 @@ await shutdown_service(policy) Forge services are implemented as ServiceActors that manage collections of your ForgeActor replicas: -```python -# Forge internals - What happens behind the scenes: -# 1. .as_service() creates a ServiceInterface -# 2. ServiceInterface manages N replicas of your ForgeActor class -# 3. ServiceInterface handles routing between replicas -# 4. You get methods like .route(), .fanout(), etc. +Forge internals - What happens behind the scenes: +1. `.as_service()` creates a `ServiceInterface` +2. `ServiceInterface` manages N replicas of your `ForgeActor` class +3. `ServiceInterface` handles routing between replicas +4. You get methods like `.route()`, `.fanout()`, etc. +```python # Your code sees this: responses = await policy.generate.route(prompt=prompt) - -# But behind the scenes: -# - ServiceInterface selects healthy replica -# - Routes message to that replica's Policy.generate() endpoint -# - Handles failures and retries automatically -# - Returns list[Completion] from the selected replica ``` +But behind the scenes: +- `ServiceInterface` selects healthy replica +- Routes message to that replica's `Policy.generate()` endpoint +- Handles failures and retries automatically +- Returns list[Completion] from the selected replica + ### 3. Different Service Types and Their Characteristics ```mermaid From f086c60cf59ad2b16f55b6aebdff6142e4ff608c Mon Sep 17 00:00:00 2001 From: Sanyam Bhutani Date: Thu, 2 Oct 2025 19:34:03 -0700 Subject: [PATCH 08/28] add --- docs/Tutorials/2_Forge_Internals.MD | 43 +++++++++-------------------- docs/Tutorials/3_Monarch_101.MD | 2 ++ 2 files changed, 15 insertions(+), 30 deletions(-) diff --git a/docs/Tutorials/2_Forge_Internals.MD b/docs/Tutorials/2_Forge_Internals.MD index 0c810a08e..9018afe3d 100644 --- a/docs/Tutorials/2_Forge_Internals.MD +++ b/docs/Tutorials/2_Forge_Internals.MD @@ -155,14 +155,14 @@ These communication patterns (\"adverbs\") determine how your service calls are ```python responses = await policy.generate.route(prompt=question) answer = responses[0].text # Extract text from Completion object - -# Behind the scenes: -# 1. Health check eliminates failed replicas -# 2. Load balancer picks least loaded healthy replica -# 3. Request routes to that specific replica -# 4. Automatic retry on different replica if failure ``` +Behind the scenes: +1. 
Health check eliminates failed replicas +2. Load balancer picks least loaded healthy replica +3. Request routes to that specific replica +4. Automatic retry on different replica if failure + **Performance characteristics**: - **Latency**: Lowest (single network hop) - **Throughput**: Limited by single replica capacity @@ -196,7 +196,7 @@ await policy.update_weights.fanout(new_policy_version) **When to use**: You want to process results as they arrive, not wait for all. ```python -# 📝 CONCEPTUAL - Streaming requires custom implementation in your training loop +# CONCEPTUAL - Streaming requires custom implementation in your training loop # The basic ReplayBuffer doesn't have built-in streaming methods # Pattern from apps/grpo/main.py continuous training: @@ -223,7 +223,7 @@ while training: **When to use**: Side effects that don't need responses (notifications, cache updates). ```python -# 📝 CONCEPTUAL - Fire-and-forget requires custom @endpoint implementations +# CONCEPTUAL - Fire-and-forget requires custom @endpoint implementations # The basic services don't have broadcast methods built-in # You would implement custom endpoints in your ForgeActor: @@ -485,36 +485,19 @@ trainer = await RLTrainer.options( ) ``` -### Natural Backpressure Through Service APIs - -```python -# backpressure pattern - The replay buffer naturally provides backpressure -batch = await replay_buffer.sample.call_one(curr_policy_version=step) -if batch is None: - # Not enough data yet - natural rate limiting - print("Buffer not ready, collecting more experiences...") - continue -else: - # Proceed with training - loss = await trainer.train_step.call_one(batch) - print(f"Training loss: {loss}") -``` - -These patterns address the core technical challenges in distributed RL. The key insight: **Forge services handle coordination complexity automatically, letting you focus on RL algorithm logic**. - ## Service Implementation Example Let's see how a reward service is actually implemented: ```python -# ✅ COMPLETE WORKING EXAMPLE - Exact RewardActor from apps/grpo/main.py +# Exact RewardActor from apps/grpo/main.py from forge.controller import ForgeActor from monarch.actor import endpoint from forge.data.rewards import MathReward, ThinkingReward from forge.controller.service import ServiceConfig, spawn_service -# EXACT class definition from apps/grpo/main.py lines 68-83 +# class definition from apps/grpo/main.py class RewardActor(ForgeActor): def __init__(self, reward_functions: list): self.reward_functions = reward_functions @@ -573,7 +556,7 @@ from forge.data.rewards import MathReward, ThinkingReward from monarch.actor import endpoint from omegaconf import DictConfig -# EXACT service creation from apps/grpo/main.py lines 322-344 +# Service creation from apps/grpo/main.py lines 322-344 print("Initializing all services...") ( dataloader, @@ -601,7 +584,6 @@ print("Initializing all services...") print("All services initialized successfully!") -# EXACT usage patterns from apps/grpo/main.py continuous training loop async def production_training_loop(): """Real training loop pattern from apps/grpo/main.py""" step = 0 @@ -639,7 +621,6 @@ async def production_training_loop(): print(f"Step {step}, Loss: {loss:.4f}") step += 1 -# EXACT cleanup pattern from apps/grpo/main.py lines 493-504 print("Shutting down services...") await asyncio.gather( DatasetActor.shutdown(dataloader), @@ -661,3 +642,5 @@ print("All services shut down successfully!") 5. 
**Coordination**: Services coordinate through shared state (replay buffer, weight versions)
 
 This is the power of the service abstraction - complex distributed coordination looks like simple async Python code.
+
+In the next part we will learn about [Monarch internals](./3_Monarch_101.MD).
\ No newline at end of file
diff --git a/docs/Tutorials/3_Monarch_101.MD b/docs/Tutorials/3_Monarch_101.MD
index 9369be13a..94c02c37e 100644
--- a/docs/Tutorials/3_Monarch_101.MD
+++ b/docs/Tutorials/3_Monarch_101.MD
@@ -1,5 +1,7 @@
 # Part 3: The Forge-Monarch Connection
 
+This is part 3 of our series, in the previous sections: we learned [RL Concepts and how they map to Forge](./1_RL_and_Forge_Fundamentals.MD), [Forge Internals](./2_Forge_Internals.MD).
+
 Now let's peel back the layers. Forge services are built on top of **Monarch**, PyTorch's distributed actor framework. Understanding this connection is crucial for optimization and debugging.
 
 ## The Complete Hierarchy: Service to Silicon

From 56f6a5c8b4b82ef88a331a7a1745d855b22f12f8 Mon Sep 17 00:00:00 2001
From: Sanyam Bhutani
Date: Thu, 2 Oct 2025 19:38:48 -0700
Subject: [PATCH 09/28] Update 3_Monarch_101.MD

---
 docs/Tutorials/3_Monarch_101.MD | 124 ++++++++++++++++----------------
 1 file changed, 62 insertions(+), 62 deletions(-)

diff --git a/docs/Tutorials/3_Monarch_101.MD b/docs/Tutorials/3_Monarch_101.MD
index 94c02c37e..7b3f6d310 100644
--- a/docs/Tutorials/3_Monarch_101.MD
+++ b/docs/Tutorials/3_Monarch_101.MD
@@ -11,24 +11,24 @@ graph TD
     subgraph YourCode["1. Your RL Code"]
         Call["await policy_service.generate.choose('What is 2+2?')"]
     end
-    
+
     subgraph ForgeServices["2. Forge Service Layer"]
         ServiceInterface["ServiceInterface
• Routes .choose() to replica
• Handles load balancing
• Manages health checks"] ServiceActor["ServiceActor
• Manages replica lifecycle
• Monitors health
• Coordinates failures"] end - - subgraph MonarchLayer["3. Monarch Actor Layer"] - ActorMesh["ActorMesh[PolicyActor]
• 4 PolicyActor instances
• Each on different GPU
• Message passing interface"] - ProcMesh["ProcMesh
• 4 processes
• GPU topology: [0,1,2,3]
• Network interconnect"] + + subgraph MonarchLayer["3. Monarch Actor Layer"] + ActorMesh["ActorMesh PolicyActor
• 4 PolicyActor instances
• Each on different GPU
• Message passing interface"] + ProcMesh["ProcMesh
• 4 processes
• GPU topology: 0,1,2,3
• Network interconnect"] end - + subgraph Hardware["4. Physical Hardware"] GPU0["GPU 0
PolicyActor #1
vLLM Engine
Model Weights"] - GPU1["GPU 1
PolicyActor #2
vLLM Engine
Model Weights"] + GPU1["GPU 1
PolicyActor #2
vLLM Engine
Model Weights"] GPU2["GPU 2
PolicyActor #3
vLLM Engine
Model Weights"] GPU3["GPU 3
PolicyActor #4
vLLM Engine
Model Weights"] end - + Call --> ServiceInterface ServiceInterface --> ServiceActor ServiceActor --> ActorMesh @@ -37,7 +37,7 @@ graph TD ProcMesh --> GPU1 ProcMesh --> GPU2 ProcMesh --> GPU3 - + style Call fill:#99ff99 style ServiceActor fill:#ffcc99 style ActorMesh fill:#cc99ff @@ -55,17 +55,17 @@ graph TD subgraph Host["Single Host (8 GPUs)"] subgraph ProcMesh["ProcMesh: per_host={'gpus': 8}"] P0["Process 0
GPU 0"] - P1["Process 1
GPU 1"] + P1["Process 1
GPU 1"] P2["Process 2
GPU 2"] P3["Process 3
GPU 3"] P4["Process 4
GPU 4"] P5["Process 5
GPU 5"] - P6["Process 6
GPU 6"] + P6["Process 6
GPU 6"] P7["Process 7
GPU 7"] end - + P0 -.->|"Network"| P1 - P1 -.->|"Network"| P2 + P1 -.->|"Network"| P2 P2 -.->|"Network"| P3 P3 -.->|"Network"| P4 P4 -.->|"Network"| P5 @@ -73,7 +73,7 @@ graph TD P6 -.->|"Network"| P7 P7 -.->|"Network"| P0 end - + style P0 fill:#ff9999 style P1 fill:#ff9999 style P2 fill:#ff9999 @@ -97,8 +97,8 @@ graph TD H1P3["Process 3
GPU 3"] end end - - subgraph Host2["Host 2"] + + subgraph Host2["Host 2"] subgraph PM2["ProcMesh Segment 2"] H2P0["Process 4
GPU 0"] H2P1["Process 5
GPU 1"] @@ -106,22 +106,22 @@ graph TD H2P3["Process 7
GPU 3"] end end - + subgraph Host3["Host 3"] subgraph PM3["ProcMesh Segment 3"] H3P0["Process 8
GPU 0"] H3P1["Process 9
GPU 1"] - H3P2["Process 10
GPU 2"] + H3P2["Process 10
GPU 2"] H3P3["Process 11
GPU 3"] end end end - + H1P0 -.->|"InfiniBand"| H2P0 H1P1 -.->|"InfiniBand"| H2P1 H2P0 -.->|"InfiniBand"| H3P0 H2P1 -.->|"InfiniBand"| H3P1 - + style PM1 fill:#ff9999 style PM2 fill:#99ff99 style PM3 fill:#99ccff @@ -167,7 +167,7 @@ await counters.increment.call() value = await counters.get_value.call_one() print(f"One counter: {value}") -# choose() - random single actor +# choose() - random single actor value = await counters.get_value.choose() print(f"Random counter: {value}") @@ -190,28 +190,28 @@ await procs.stop() graph TD subgraph Creation["Actor Creation Process"] Code["mesh.spawn('policy', PolicyActor, model='Qwen/Qwen3-7B')"] - + subgraph ProcMesh["ProcMesh (4 processes)"] - P0["Process 0
GPU 0"] + P0["Process 0
GPU 0"] P1["Process 1
GPU 1"] P2["Process 2
GPU 2"] P3["Process 3
GPU 3"] end - + subgraph ActorMesh["ActorMesh[PolicyActor]"] A0["PolicyActor
Instance #0
model=Qwen/Qwen3-7B
generation_count=0"] A1["PolicyActor
Instance #1
model=Qwen/Qwen3-7B
generation_count=0"] A2["PolicyActor
Instance #2
model=Qwen/Qwen3-7B
generation_count=0"] A3["PolicyActor
Instance #3
model=Qwen/Qwen3-7B
generation_count=0"] end - + Code --> ProcMesh P0 --> A0 P1 --> A1 P2 --> A2 P3 --> A3 end - + style A0 fill:#99ff99 style A1 fill:#99ff99 style A2 fill:#99ff99 @@ -224,29 +224,29 @@ graph TD graph TD subgraph MessageFlow["Message Flow Patterns"] Client["await policy_actors.generate.METHOD(prompt)"] - + subgraph Methods["Different Adverbs Route Differently"] Choose["choose()
→ Routes to ONE actor
→ Load balanced"] - Call["call()
→ Routes to ALL actors
→ Collects all results"] + Call["call()
→ Routes to ALL actors
→ Collects all results"] Broadcast["broadcast()
→ Routes to ALL actors
→ Fire and forget"] Stream["stream()
→ Routes to ALL actors
→ Iterator of results"] end - + subgraph ActorInstances["PolicyActor Instances"] A0["Actor 0
GPU 0
generates response"] - A1["Actor 1
GPU 1
generates response"] + A1["Actor 1
GPU 1
generates response"] A2["Actor 2
GPU 2
generates response"] A3["Actor 3
GPU 3
generates response"] end - + Client --> Choose Client --> Call Client --> Broadcast Client --> Stream - + Choose -.->|"Load balanced"| A1 Call --> A0 - Call --> A1 + Call --> A1 Call --> A2 Call --> A3 Broadcast --> A0 @@ -258,7 +258,7 @@ graph TD Stream --> A2 Stream --> A3 end - + style Choose fill:#99ff99 style Call fill:#ffcc99 style Broadcast fill:#ff99cc @@ -275,31 +275,31 @@ Now the key insight: **Forge services are ServiceActors that manage ActorMeshes graph TD subgraph ServiceCreation["spawn_service() Process"] Call["await spawn_service(ServiceConfig(num_replicas=4), PolicyActor, model='Qwen')"] - + ServiceActor["ServiceActor
• Manages 4 replicas
• Handles health checks
• Routes service calls"] - - subgraph Replicas["4 Independent Replicas"] + + subgraph Replicas["4 Independent Replicas"] subgraph R0["Replica 0"] PM0["ProcMesh
1 process
GPU 0"] AM0["ActorMesh
1 PolicyActor"] end - + subgraph R1["Replica 1"] - PM1["ProcMesh
1 process
GPU 1"] + PM1["ProcMesh
1 process
GPU 1"] AM1["ActorMesh
1 PolicyActor"] end - + subgraph R2["Replica 2"] PM2["ProcMesh
1 process
GPU 2"] AM2["ActorMesh
1 PolicyActor"] end - + subgraph R3["Replica 3"] PM3["ProcMesh
1 process
GPU 3"] AM3["ActorMesh
1 PolicyActor"] end end - + Call --> ServiceActor ServiceActor --> R0 ServiceActor --> R1 @@ -310,7 +310,7 @@ graph TD PM2 --> AM2 PM3 --> AM3 end - + style ServiceActor fill:#ffcc99 style AM0 fill:#99ff99 style AM1 fill:#99ff99 @@ -324,30 +324,30 @@ graph TD graph TD subgraph CallFlow["Complete Call Flow"] UserCall["await policy_service.generate.choose('What is 2+2?')"] - + ServiceInterface["ServiceInterface
• Receives .choose() call
• Routes to ServiceActor"] - + ServiceActor["ServiceActor
• Selects healthy replica
• Load balancing logic
• Failure handling"] - + SelectedReplica["Selected Replica #2
• ProcMesh with 1 process
• ActorMesh with 1 PolicyActor"] - + PolicyActor["PolicyActor Instance
• Loads model
• Runs vLLM inference
• Returns 'The answer is 4'"] - + GPU["GPU 2
• vLLM engine
• Model weights
• KV cache
• CUDA kernels"] - + UserCall --> ServiceInterface ServiceInterface --> ServiceActor ServiceActor --> SelectedReplica SelectedReplica --> PolicyActor PolicyActor --> GPU - + GPU -.->|"Response"| PolicyActor PolicyActor -.->|"Response"| SelectedReplica SelectedReplica -.->|"Response"| ServiceActor ServiceActor -.->|"Response"| ServiceInterface ServiceInterface -.->|"'The answer is 4'"| UserCall end - + style UserCall fill:#99ff99 style ServiceActor fill:#ffcc99 style PolicyActor fill:#cc99ff @@ -361,32 +361,32 @@ In real RL systems, you have multiple services that can share or use separate Pr ```mermaid graph TD subgraph Cluster["RL Training Cluster"] - subgraph Services["Forge Services"] + subgraph Services["Forge Services"] PS["Policy Service
4 GPU replicas"] - TS["Trainer Service
2 GPU replicas"] + TS["Trainer Service
2 GPU replicas"] RS["Reward Service
4 CPU replicas"] BS["Buffer Service
1 CPU replica"] end - + subgraph MonarchInfra["Monarch Infrastructure"] subgraph GPUMesh["GPU ProcMesh (6 processes)"] G0["Process 0
GPU 0"] G1["Process 1
GPU 1"] - G2["Process 2
GPU 2"] + G2["Process 2
GPU 2"] G3["Process 3
GPU 3"] G4["Process 4
GPU 4"] G5["Process 5
GPU 5"] end - + subgraph CPUMesh["CPU ProcMesh (5 processes)"] C0["Process 0
CPU"] - C1["Process 1
CPU"] + C1["Process 1
CPU"] C2["Process 2
CPU"] C3["Process 3
CPU"] C4["Process 4
CPU"] end end - + PS --> G0 PS --> G1 PS --> G2 @@ -399,7 +399,7 @@ graph TD RS --> C3 BS --> C4 end - + style PS fill:#99ff99 style TS fill:#ff99cc style RS fill:#ffcc99 @@ -411,7 +411,7 @@ graph TD ## Key Insights: Why This Architecture Matters 1. **Process Isolation**: Each actor runs in its own process - failures don't cascade -2. **Location Transparency**: Actors can be local or remote with identical APIs +2. **Location Transparency**: Actors can be local or remote with identical APIs 3. **Structured Distribution**: ProcMesh maps directly to hardware topology 4. **Message Passing**: No shared memory means no race conditions or locks 5. **Service Abstraction**: Forge hides Monarch complexity while preserving power @@ -427,13 +427,13 @@ Understanding this hierarchy helps you: ## What You've Learned 1. **RL Fundamentals**: How RL concepts map to Forge services with REAL, working examples -2. **Service Abstraction**: How to use Forge services effectively with verified communication patterns +2. **Service Abstraction**: How to use Forge services effectively with verified communication patterns 3. **Monarch Foundation**: How Forge services connect to distributed actors and hardware ## Key Takeaways - **Services hide complexity**: Your RL code looks like simple async functions, but runs on distributed clusters -- **Communication patterns matter**: `.route()`, `.fanout()`, sessions, and `.call_one()` each serve specific purposes +- **Communication patterns matter**: `.route()`, `.fanout()`, sessions, and `.call_one()` each serve specific purposes - **Architecture understanding helps**: Knowing the Service → Actor → Process → Hardware hierarchy helps you debug, optimize, and scale - **Always verify APIs**: This guide is verified, but cross-check with source code for latest changes - **Real API patterns**: Use `.options().as_service()` not `spawn_service()`, use `.route()` not `.choose()`, etc. From 8f501d362dc3d7cf53ec7ce315e66216787d49dc Mon Sep 17 00:00:00 2001 From: Sanyam Bhutani Date: Thu, 2 Oct 2025 19:40:02 -0700 Subject: [PATCH 10/28] Update 3_Monarch_101.MD --- docs/Tutorials/3_Monarch_101.MD | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/Tutorials/3_Monarch_101.MD b/docs/Tutorials/3_Monarch_101.MD index 7b3f6d310..0b1b4bd79 100644 --- a/docs/Tutorials/3_Monarch_101.MD +++ b/docs/Tutorials/3_Monarch_101.MD @@ -198,7 +198,7 @@ graph TD P3["Process 3
GPU 3"] end - subgraph ActorMesh["ActorMesh[PolicyActor]"] + subgraph ActorMesh["ActorMesh PolicyActor"] A0["PolicyActor
Instance #0
model=Qwen/Qwen3-7B
generation_count=0"] A1["PolicyActor
Instance #1
model=Qwen/Qwen3-7B
generation_count=0"] A2["PolicyActor
Instance #2
model=Qwen/Qwen3-7B
generation_count=0"] From 7e47e025977718217b1d000abadfe9bdbffa461c Mon Sep 17 00:00:00 2001 From: Sanyam Bhutani Date: Thu, 2 Oct 2025 19:40:40 -0700 Subject: [PATCH 11/28] Update 3_Monarch_101.MD --- docs/Tutorials/3_Monarch_101.MD | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/Tutorials/3_Monarch_101.MD b/docs/Tutorials/3_Monarch_101.MD index 0b1b4bd79..52a058dcc 100644 --- a/docs/Tutorials/3_Monarch_101.MD +++ b/docs/Tutorials/3_Monarch_101.MD @@ -1,6 +1,6 @@ # Part 3: The Forge-Monarch Connection -This is part 3 of our series, in the previous sections: we learned [RL Concepts and how they map to Forge](./1_RL_and_Forge_Fundamentals.MD), [Forge Internals](./2_Forge_Internals.MD). +This is part 3 of our series, in the previous sections: we learned Part 1: [RL Concepts and how they map to Forge](./1_RL_and_Forge_Fundamentals.MD), Part 2: [Forge Internals](./2_Forge_Internals.MD). Now let's peel back the layers. Forge services are built on top of **Monarch**, PyTorch's distributed actor framework. Understanding this connection is crucial for optimization and debugging. From 1c8d8c1244a1c86b4011edc8321940b1f433707d Mon Sep 17 00:00:00 2001 From: Sanyam Bhutani Date: Fri, 3 Oct 2025 00:22:10 -0700 Subject: [PATCH 12/28] fix funcs --- docs/Tutorials/1_RL_and_Forge_Fundamentals.MD | 152 ++++++------- docs/Tutorials/2_Forge_Internals.MD | 200 ++++++++++-------- docs/Tutorials/3_Monarch_101.MD | 14 +- 3 files changed, 199 insertions(+), 167 deletions(-) diff --git a/docs/Tutorials/1_RL_and_Forge_Fundamentals.MD b/docs/Tutorials/1_RL_and_Forge_Fundamentals.MD index 810ef373f..c34ae6639 100644 --- a/docs/Tutorials/1_RL_and_Forge_Fundamentals.MD +++ b/docs/Tutorials/1_RL_and_Forge_Fundamentals.MD @@ -114,33 +114,36 @@ Let's look at the example from above again, but this time we would use the names # Conceptual Example async def conceptual_forge_rl_step(services, step): - # 1. Get a math problem - CONCEPTUAL API - sample = await services['dataloader'].get_sample() - question, target = sample["question"], sample["answer"] + # 1. Get a math problem - Using actual DatasetActor API + sample = await services['dataloader'].sample.call_one() + question, target = sample["request"], sample["target"] - # 2. Student generates answer - CONCEPTUAL API - # Actual method names vary by implementation - responses = await services['policy'].generate(prompt=question) + # 2. Student generates answer - Using actual Policy API + responses = await services['policy'].generate.route(prompt=question) answer = responses[0].text - # 3. Teacher grades it - CONCEPTUAL API - # Actual reward evaluation varies by implementation - score = await services['reward_actor'].evaluate( + # 3. Teacher grades it - Using actual RewardActor API + score = await services['reward_actor'].evaluate_response.route( prompt=question, response=answer, target=target ) - # 4. Compare to baseline - CONCEPTUAL API - ref_logprobs = await services['ref_model'].compute_baseline(responses[0].token_ids) + # 4. Compare to baseline - Using actual ReferenceModel API + # Note: ReferenceModel.forward requires input_ids, max_req_tokens, return_logprobs + ref_logprobs = await services['ref_model'].forward.route( + input_ids, max_req_tokens, return_logprobs=True + ) - # 5. Store experience - CONCEPTUAL Episode structure - # Real Episode structure in src/forge/data_models/episode.py - episode = create_episode(responses[0], score, ref_logprobs, step) - await services['replay_buffer'].store(episode) + # 5. 
Store experience - Using actual Episode structure from apps/grpo/main.py + episode = create_episode_from_response(responses[0], score, ref_logprobs, step) + await services['replay_buffer'].add.call_one(episode) - # 6. Improve student - CONCEPTUAL API - batch = await services['replay_buffer'].get_batch(policy_version=step) + # 6. Improve student - Using actual training pattern + batch = await services['replay_buffer'].sample.call_one( + curr_policy_version=step + ) if batch is not None: - loss = await services['trainer'].update_policy(batch) + inputs, targets = batch + loss = await services['trainer'].train_step.call(inputs, targets) return loss ``` @@ -234,34 +237,38 @@ Let's see how core RL concepts map to Forge services: async def real_rl_training_step(services, step): """Single RL step using verified Forge APIs""" - # 1. Environment interaction - sample = await services['dataloader'].__next__.call_one() - prompt, target = sample["question"], sample["answer"] + # 1. Environment interaction - Using actual DatasetActor API + sample = await services['dataloader'].sample.call_one() + prompt, target = sample["request"], sample["target"] - responses = await services['policy'].generate.route(prompt=prompt) + responses = await services['policy'].generate.route(prompt) - # 2. Reward computation + # 2. Reward computation - Using actual RewardActor API score = await services['reward_actor'].evaluate_response.route( prompt=prompt, response=responses[0].text, target=target ) - # 3. Get reference logprobs - ref_logprobs = await services['ref_model'].forward.route(responses[0].token_ids) + # 3. Get reference logprobs - Using actual ReferenceModel API + # Note: ReferenceModel requires full input_ids tensor, not just tokens + input_ids = torch.cat([responses[0].prompt_ids, responses[0].token_ids]) + ref_logprobs = await services['ref_model'].forward.route( + input_ids.unsqueeze(0), max_req_tokens=512, return_logprobs=True + ) - # 4. Experience storage - Episode creation pattern - # Note: Actual Episode structure requires token tensors, not text + # 4. Experience storage - Using actual Episode pattern from GRPO episode = create_episode_from_response(responses[0], score, ref_logprobs, step) await services['replay_buffer'].add.call_one(episode) - # 5. Learning - trainer endpoint + # 5. Learning - Using actual trainer pattern batch = await services['replay_buffer'].sample.call_one( curr_policy_version=step ) if batch is not None: - loss = await services['trainer'].train_step.call_one(batch) + inputs, targets = batch # GRPO returns (inputs, targets) tuple + loss = await services['trainer'].train_step.call(inputs, targets) - # 6. Policy synchronization - weight update pattern - await services['trainer'].push_weights.call_one(step + 1) + # 6. 
Policy synchronization - Using actual weight update pattern + await services['trainer'].push_weights.call(step + 1) await services['policy'].update_weights.fanout(step + 1) return loss @@ -287,12 +294,14 @@ Forge handles behind the scenes: ### Independent Scaling ```python -from forge.actors.policy import Policy, PolicyConfig, SamplingOverrides, WorkerConfig +from forge.actors.policy import Policy from forge.actors.replay_buffer import ReplayBuffer -from forge.controller.service import shutdown_service -from apps.grpo.main import Trainer, RewardActor, ComputeAdvantages, RefModel, DatasetActor +from forge.actors.reference_model import ReferenceModel +from forge.actors.trainer import RLTrainer +from apps.grpo.main import DatasetActor, RewardActor, ComputeAdvantages from forge.data.rewards import MathReward, ThinkingReward import asyncio +import torch model = "Qwen/Qwen3-1.7B" group_size = 1 @@ -306,67 +315,60 @@ group_size = 1 ref_model, reward_actor, ) = await asyncio.gather( - # Dataset service - spawn_service( - ServiceConfig(procs_per_replica=1, num_replicas=1), - DatasetActor, + # Dataset actor (CPU) + DatasetActor.options(procs=1).as_actor( path="openai/gsm8k", - config_name="main", - split="train", + revision="main", + data_split="train", streaming=True, + model=model, ), # Policy service with GPU - spawn_service( - ServiceConfig(procs_per_replica=1, with_gpus=True, num_replicas=1), - Policy, - config=PolicyConfig( - worker_params=WorkerConfig(model=model), - sampling_params=SamplingOverrides( - num_samples=group_size, max_tokens=16 - ), - ), + Policy.options(procs=1, with_gpus=True, num_replicas=1).as_service( + engine_config={ + "model": model, + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "enforce_eager": False + }, + sampling_config={ + "n": group_size, + "max_tokens": 16, + "temperature": 1.0, + "top_p": 1.0 + } ), - # Trainer service with GPU - spawn_service( - ServiceConfig(procs_per_replica=1, with_gpus=True, num_replicas=1), - Trainer, - learning_rate=1e-5, - beta=0.1, - model_name=model, + # Trainer actor with GPU + RLTrainer.options(procs=1, with_gpus=True).as_actor( + # Trainer config would come from YAML in real usage + model={"name": "qwen3", "flavor": "1.7B", "hf_assets_path": f"hf://{model}"}, + optimizer={"name": "AdamW", "lr": 1e-5}, + training={"local_batch_size": 2, "seq_len": 2048} ), # Replay buffer (CPU) - spawn_service( - ServiceConfig(procs_per_replica=1, num_replicas=1), - ReplayBuffer, + ReplayBuffer.options(procs=1).as_actor( batch_size=2, max_policy_age=1, + dp_size=1 ), # Advantage computation (CPU) - spawn_service( - ServiceConfig(procs_per_replica=1, num_replicas=1), - ComputeAdvantages, - gamma=0.99, - lambda_=0.95, - ), + ComputeAdvantages.options(procs=1).as_actor(), # Reference model with GPU - spawn_service( - ServiceConfig(procs_per_replica=1, num_replicas=1, with_gpus=True), - RefModel, - model_name=model, + ReferenceModel.options(procs=1, with_gpus=True).as_actor( + model={"name": "qwen3", "flavor": "1.7B", "hf_assets_path": f"hf://{model}"}, + training={"dtype": "bfloat16"} ), # Reward actor (CPU) - spawn_service( - ServiceConfig(procs_per_replica=1, num_replicas=1), - RewardActor, - reward_functions=[MathReward(), ThinkingReward()], + RewardActor.options(procs=1, num_replicas=1).as_service( + reward_functions=[MathReward(), ThinkingReward()] ) ) ``` -Production scaling - multiply num_replicas: +Production scaling - multiply num_replicas for services or spawn multiple actors: - Policy: num_replicas=8 for high inference demand - 
RewardActor: num_replicas=16 for parallel evaluation -- Trainer: num_replicas=4 for distributed training +- Trainer: Multiple actors for distributed training (RLTrainer handles this internally) ### Fault Tolerance diff --git a/docs/Tutorials/2_Forge_Internals.MD b/docs/Tutorials/2_Forge_Internals.MD index 9018afe3d..634f04f85 100644 --- a/docs/Tutorials/2_Forge_Internals.MD +++ b/docs/Tutorials/2_Forge_Internals.MD @@ -65,36 +65,44 @@ Policy.options( ### 2. Real Service Creation Services are created using the `spawn_service` function: +Services are created using the `.options().as_service()` pattern from the actual GRPO implementation: -The spawn_service() function automatically handles: +The service creation automatically handles: - Spawning actor replicas across processes/GPUs -- Load balancing with .choose() method +- Load balancing with .route() method for services - Health monitoring and failure recovery - Message routing and serialization ```python -from forge.controller.service import ServiceConfig, spawn_service -from forge.actors.policy import Policy, PolicyConfig, SamplingOverrides, WorkerConfig +from forge.actors.policy import Policy model = "Qwen/Qwen3-1.7B" -policy = await spawn_service( - ServiceConfig(procs_per_replica=1, with_gpus=True, num_replicas=1), - Policy, - config=PolicyConfig( - worker_params=WorkerConfig(model=model), - sampling_params=SamplingOverrides( - num_samples=1, max_tokens=16 - ), - ), +policy = await Policy.options( + procs=1, + with_gpus=True, + num_replicas=1 +).as_service( + engine_config={ + "model": model, + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "enforce_eager": False + }, + sampling_config={ + "n": 1, + "max_tokens": 16, + "temperature": 1.0, + "top_p": 1.0 + } ) prompt = "What is 3 + 5?" -responses = await policy.generate.choose(prompt=prompt) +responses = await policy.generate.route(prompt) print(f"Response: {responses[0].text}") # Cleanup when done -await shutdown_service(policy) +await policy.shutdown() ``` ### 3. 
How Services Actually Work @@ -253,7 +261,6 @@ class CustomPolicy(Policy): # This Counter example demonstrates the session pattern from forge.controller import ForgeActor -from forge.controller.service import ServiceConfig, spawn_service, shutdown_service from monarch.actor import endpoint class ForgeCounter(ForgeActor): @@ -273,37 +280,35 @@ class ForgeCounter(ForgeActor): async def reset(self): self.value = 0 -counter_service = await spawn_service( - ServiceConfig(procs_per_replica=1, num_replicas=4), - ForgeCounter, - initial_value=0 -) +counter_service = await ForgeCounter.options( + procs=1, num_replicas=4 +).as_service(initial_value=0) # Test basic operations -await counter_service.increment.choose() -results = await counter_service.increment.call() +await counter_service.increment.route() +results = await counter_service.increment.fanout() # Get from all replicas print(f"All replica values: {results}") # STICKY SESSIONS print("\nUsing sticky sessions:") async with counter_service.session(): - await counter_service.reset.choose() - print(await counter_service.increment.choose()) # 1 - print(await counter_service.increment.choose()) # 2 - print(await counter_service.increment.choose()) # 3 + await counter_service.reset.route() # Uses .route() within session + print(await counter_service.increment.route()) # 1 + print(await counter_service.increment.route()) # 2 + print(await counter_service.increment.route()) # 3 - final_value = await counter_service.get_value.choose() + final_value = await counter_service.get_value.route() print(f"Final value on this replica: {final_value}") # 3 # Same pattern works with Policy for multi-turn conversations: # async with policy.session(): -# response1 = await policy.generate.choose(prompt=turn1) +# response1 = await policy.generate.route(turn1) # full_prompt = turn1 + response1[0].text + turn2 -# response2 = await policy.generate.choose(prompt=full_prompt) +# response2 = await policy.generate.route(full_prompt) # # Both calls hit same replica, preserving KV cache # Cleanup -await shutdown_service(counter_service) +await counter_service.shutdown() ``` **Performance impact**: Critical for maintaining KV cache in multi-turn conversations. 
@@ -395,60 +400,72 @@ print(f"Current policy version: {current_version}") Instead of manual coordination, Forge services handle speed mismatches automatically: ```python - from apps.grpo.main import Episode, Group async def simple_rl_step(): # ===== Generate a rollout ===== - sample = await dataloader.__next__.choose() - prompt, target = sample["question"], sample["answer"] + sample = await dataloader.sample.call_one() # DatasetActor is an actor, not service + prompt, target = sample["request"], sample["target"] # Correct field names print(f"Prompt: {prompt}") print(f"Target: {target}") - actions = await policy.generate.choose(prompt=prompt) + actions = await policy.generate.route(prompt=prompt) # Policy is a service print(f"Policy response: {actions[0].text}") - ref_logprobs = await ref_model.forward.choose(actions[0].token_ids) - reward = await reward_actor.evaluate_response.choose( + # Create input tensor for reference model (requires full context) + input_ids = torch.cat([actions[0].prompt_ids, actions[0].token_ids]) + ref_logprobs = await ref_model.forward.route( + input_ids.unsqueeze(0), max_req_tokens=512, return_logprobs=True + ) + reward = await reward_actor.evaluate_response.route( # RewardActor is a service prompt=prompt, response=actions[0].text, target=target ) print(f"Reward: {reward}") + # Create episode using actual GRPO Episode structure episode = Episode( - episode_id=0, - prompt=prompt, - target=target, + episode_id="0", + request=prompt, policy_version=0, + pad_id=tokenizer.pad_token_id, + request_len=512, + response_len=512, + target=target ) - episode.add_group(Group( - response=actions[0].text, - ref_logprobs=ref_logprobs, - reward=reward, - )) + # Add response data + episode.response = actions[0].text + episode.request_tokens = actions[0].prompt_ids.tolist() + episode.response_tokens = actions[0].token_ids.tolist() + episode.ref_logprobs = ref_logprobs[0] # Extract from batch dimension + episode.reward = reward - advantages = await compute_advantages.__call__.choose(episode.groups) - episode.groups[0].advantage = advantages[0] + # Compute advantages using actual ComputeAdvantages actor + group = Group.new_group(0, 1, prompt, 0, tokenizer.pad_token_id, 512, 512, target) + group.episodes[0] = episode + advantages = await compute_advantages.compute.call_one(group) # ComputeAdvantages is an actor + episode.advantage = advantages[0] print(f"Advantage: {advantages[0]}") - await replay_buffer.add.choose(episode) + await replay_buffer.add.call_one(episode) # ReplayBuffer is an actor print("Episode stored in replay buffer") # ===== Train on the batch ===== - batch = await replay_buffer.sample.choose(curr_policy_version=0) + batch = await replay_buffer.sample.call_one(curr_policy_version=0) if batch is not None: print("Training on batch...") - training_result = await trainer.train_step.choose(batch) - loss = training_result.get("loss", 0.0) + inputs, targets = batch # GRPO returns (inputs, targets) tuple + loss = await trainer.train_step.call(inputs, targets) # RLTrainer is an actor print(f"Training loss: {loss}") return loss else: print("Not enough data in buffer yet") return None +# Note: This simplified example assumes tokenizer and services are already initialized for step in range(10): print(f"\n--- RL Step {step + 1} ---") loss = await simple_rl_step() @@ -467,7 +484,7 @@ for step in range(10): policy = await Policy.options( procs=1, num_replicas=8, with_gpus=True # Many replicas for high throughput ).as_service( - engine_config=EngineConfig(model=model_name) + 
engine_config={"model": model_name, "tensor_parallel_size": 1} ) # Reward evaluation might be CPU-bound @@ -479,9 +496,10 @@ reward_actor = await RewardActor.options( # Training needs fewer but more powerful replicas trainer = await RLTrainer.options( - procs=1, num_replicas=2, with_gpus=True # Fewer but GPU-heavy + procs=1, with_gpus=True # Fewer but GPU-heavy ).as_actor( # Trainer typically uses .as_actor() not .as_service() - optimizer=Optimizer(lr=1e-5) + model={"name": "qwen3", "flavor": "1.7B"}, + optimizer={"name": "AdamW", "lr": 1e-5} ) ``` @@ -495,7 +513,6 @@ Let's see how a reward service is actually implemented: from forge.controller import ForgeActor from monarch.actor import endpoint from forge.data.rewards import MathReward, ThinkingReward -from forge.controller.service import ServiceConfig, spawn_service # class definition from apps/grpo/main.py class RewardActor(ForgeActor): @@ -515,9 +532,9 @@ class RewardActor(ForgeActor): # Return average reward across all functions return total_reward / len(self.reward_functions) if self.reward_functions else 0.0 -reward_actor = await spawn_service( - ServiceConfig(procs_per_replica=1, num_replicas=1), - RewardActor, +reward_actor = await RewardActor.options( + procs=1, num_replicas=1 +).as_service( reward_functions=[MathReward(), ThinkingReward()] ) @@ -525,7 +542,7 @@ prompt = "What is 15% of 240?" response = "15% of 240 is 36" target = "36" -score = await reward_actor.evaluate_response.choose( +score = await reward_actor.evaluate_response.route( prompt=prompt, response=response, target=target @@ -533,10 +550,10 @@ score = await reward_actor.evaluate_response.choose( print(f"Reward score: {score}") # Usually around 1.0 for correct math answers # For production scaling - increase num_replicas for parallel evaluation: -# ServiceConfig(procs_per_replica=1, num_replicas=16) # 16 parallel evaluators +# RewardActor.options(procs=1, num_replicas=16) # 16 parallel evaluators # Cleanup when done -await shutdown_service(reward_actor) +await reward_actor.shutdown() ``` ## Service Orchestration: The Training Loop @@ -547,16 +564,15 @@ Now let's see how services coordinate in a real training loop: # This is the REAL way production RL systems are built with Forge import asyncio +import torch from forge.actors.policy import Policy from forge.actors.reference_model import ReferenceModel from forge.actors.replay_buffer import ReplayBuffer from forge.actors.trainer import RLTrainer -from forge.controller.actor import ForgeActor +from apps.grpo.main import DatasetActor, RewardActor, ComputeAdvantages from forge.data.rewards import MathReward, ThinkingReward -from monarch.actor import endpoint -from omegaconf import DictConfig -# Service creation from apps/grpo/main.py lines 322-344 +# Service creation pattern from apps/grpo/main.py lines 322-344 print("Initializing all services...") ( dataloader, @@ -567,17 +583,27 @@ print("Initializing all services...") ref_model, reward_actor, ) = await asyncio.gather( - DatasetActor.options(**cfg.actors.dataset).as_actor(**cfg.dataset), - Policy.options(**cfg.services.policy).as_service(**cfg.policy), - RLTrainer.options(**cfg.actors.trainer).as_actor( - **cfg.trainer, loss=simple_grpo_loss + DatasetActor.options(procs=1).as_actor( + path="openai/gsm8k", revision="main", data_split="train", + streaming=True, model="Qwen/Qwen3-1.7B" + ), + Policy.options(procs=1, with_gpus=True, num_replicas=1).as_service( + engine_config={"model": "Qwen/Qwen3-1.7B", "tensor_parallel_size": 1}, + sampling_config={"n": 1, 
"max_tokens": 512} ), - ReplayBuffer.options(**cfg.actors.replay_buffer).as_actor( - **cfg.replay_buffer, collate=collate + RLTrainer.options(procs=1, with_gpus=True).as_actor( + model={"name": "qwen3", "flavor": "1.7B", "hf_assets_path": "hf://Qwen/Qwen3-1.7B"}, + optimizer={"name": "AdamW", "lr": 1e-5}, + training={"local_batch_size": 2, "seq_len": 2048} ), - ComputeAdvantages.options(**cfg.actors.compute_advantages).as_actor(), - ReferenceModel.options(**cfg.services.ref_model).as_service(**cfg.ref_model), - RewardActor.options(**cfg.services.reward_actor).as_service( + ReplayBuffer.options(procs=1).as_actor( + batch_size=2, max_policy_age=1, dp_size=1 + ), + ComputeAdvantages.options(procs=1).as_actor(), + ReferenceModel.options(procs=1, with_gpus=True).as_actor( + model={"name": "qwen3", "flavor": "1.7B", "hf_assets_path": "hf://Qwen/Qwen3-1.7B"} + ), + RewardActor.options(procs=1, num_replicas=1).as_service( reward_functions=[MathReward(), ThinkingReward()] ), ) @@ -593,10 +619,13 @@ async def production_training_loop(): sample = await dataloader.sample.call_one() # Policy generation service call - responses = await policy.generate.route(prompt=sample["question"]) + responses = await policy.generate.route(sample["request"]) # Correct field name - # Reference computation service call - ref_logprobs = await ref_model.forward.route(responses[0].token_ids) + # Reference computation service call (requires full input tensor) + input_ids = torch.cat([responses[0].prompt_ids, responses[0].token_ids]) + ref_logprobs = await ref_model.forward.route( + input_ids.unsqueeze(0), max_req_tokens=512, return_logprobs=True + ) # Reward evaluation service call reward = await reward_actor.evaluate_response.route( @@ -605,18 +634,19 @@ async def production_training_loop(): target=sample["answer"] ) - # Experience storage (simplified structure for illustration) - episode = create_episode(sample, responses[0], reward, ref_logprobs, step) + # Experience storage (using actual Episode structure) + episode = create_episode_from_grpo_data(sample, responses[0], reward, ref_logprobs[0], step) await replay_buffer.add.call_one(episode) - # Training when ready endpoints + # Training when ready batch = await replay_buffer.sample.call_one(curr_policy_version=step) if batch is not None: - loss = await trainer.train_step.call_one(batch) + inputs, targets = batch # GRPO returns (inputs, targets) tuple + loss = await trainer.train_step.call(inputs, targets) # Weight synchronization pattern - await trainer.push_weights.call_one(step + 1) - await policy.update_weights.route(step + 1) + await trainer.push_weights.call(step + 1) + await policy.update_weights.fanout(step + 1) # Fanout to all replicas print(f"Step {step}, Loss: {loss:.4f}") step += 1 @@ -628,7 +658,7 @@ await asyncio.gather( RLTrainer.shutdown(trainer), ReplayBuffer.shutdown(replay_buffer), ComputeAdvantages.shutdown(compute_advantages), - ref_model.shutdown(), + ReferenceModel.shutdown(ref_model), reward_actor.shutdown(), ) print("All services shut down successfully!") @@ -636,7 +666,7 @@ print("All services shut down successfully!") **Key observations:** 1. **Parallelism**: Independent operations run concurrently -2. **Load balancing**: Each `choose()` call automatically selects optimal replica +2. **Load balancing**: Each `.route()` call automatically selects optimal replica 3. **Fault tolerance**: Failures automatically retry on different replicas 4. **Resource efficiency**: CPU and GPU services scale independently 5. 
**Coordination**: Services coordinate through shared state (replay buffer, weight versions) diff --git a/docs/Tutorials/3_Monarch_101.MD b/docs/Tutorials/3_Monarch_101.MD index 52a058dcc..0cbdcbd88 100644 --- a/docs/Tutorials/3_Monarch_101.MD +++ b/docs/Tutorials/3_Monarch_101.MD @@ -9,11 +9,11 @@ Now let's peel back the layers. Forge services are built on top of **Monarch**, ```mermaid graph TD subgraph YourCode["1. Your RL Code"] - Call["await policy_service.generate.choose('What is 2+2?')"] + Call["await policy_service.generate.route('What is 2+2?')"] end subgraph ForgeServices["2. Forge Service Layer"] - ServiceInterface["ServiceInterface
• Routes .choose() to replica
• Handles load balancing
• Manages health checks"] + ServiceInterface["ServiceInterface
• Routes .route() to replica
• Handles load balancing
• Manages health checks"] ServiceActor["ServiceActor
• Manages replica lifecycle
• Monitors health
• Coordinates failures"] end @@ -167,7 +167,7 @@ await counters.increment.call() value = await counters.get_value.call_one() print(f"One counter: {value}") -# choose() - random single actor +# choose() - random single actor (actors only, not services) value = await counters.get_value.choose() print(f"Random counter: {value}") @@ -273,8 +273,8 @@ Now the key insight: **Forge services are ServiceActors that manage ActorMeshes ```mermaid graph TD - subgraph ServiceCreation["spawn_service() Process"] - Call["await spawn_service(ServiceConfig(num_replicas=4), PolicyActor, model='Qwen')"] + subgraph ServiceCreation["Service Creation Process"] + Call["await PolicyActor.options(num_replicas=4, procs=1).as_service(model='Qwen')"] ServiceActor["ServiceActor
• Manages 4 replicas
• Handles health checks
• Routes service calls"] @@ -323,9 +323,9 @@ graph TD ```mermaid graph TD subgraph CallFlow["Complete Call Flow"] - UserCall["await policy_service.generate.choose('What is 2+2?')"] + UserCall["await policy_service.generate.route('What is 2+2?')"] - ServiceInterface["ServiceInterface
• Receives .choose() call
• Routes to ServiceActor"] + ServiceInterface["ServiceInterface
• Receives .route() call
• Routes to ServiceActor"] ServiceActor["ServiceActor
• Selects healthy replica
• Load balancing logic
• Failure handling"] From aca4c961590a1a5081235d088a4f3383bc38932c Mon Sep 17 00:00:00 2001 From: Sanyam Bhutani Date: Fri, 3 Oct 2025 13:50:49 -0700 Subject: [PATCH 13/28] Update docs/Tutorials/2_Forge_Internals.MD Co-authored-by: Allen Wang <9057208+allenwang28@users.noreply.github.com> --- docs/Tutorials/2_Forge_Internals.MD | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/Tutorials/2_Forge_Internals.MD b/docs/Tutorials/2_Forge_Internals.MD index 634f04f85..09c39fb7e 100644 --- a/docs/Tutorials/2_Forge_Internals.MD +++ b/docs/Tutorials/2_Forge_Internals.MD @@ -58,7 +58,7 @@ Policy.options( num_replicas=4, # Number of replicas with_gpus=True # Allocate GPUs # Other available options: - # hosts=None + # hosts=None # the number of remote hosts used per replica ) ``` From 10863aaf13a3580bf1eb9e0c04a635357f60fff7 Mon Sep 17 00:00:00 2001 From: Sanyam Bhutani Date: Fri, 10 Oct 2025 14:10:38 -0700 Subject: [PATCH 14/28] update part 1 and 2 --- docs/Tutorials/1_RL_and_Forge_Fundamentals.MD | 4 +- docs/Tutorials/2_Forge_Internals.MD | 56 +------------------ 2 files changed, 3 insertions(+), 57 deletions(-) diff --git a/docs/Tutorials/1_RL_and_Forge_Fundamentals.MD b/docs/Tutorials/1_RL_and_Forge_Fundamentals.MD index c34ae6639..32ada41cb 100644 --- a/docs/Tutorials/1_RL_and_Forge_Fundamentals.MD +++ b/docs/Tutorials/1_RL_and_Forge_Fundamentals.MD @@ -213,7 +213,7 @@ Each step has different: Unlike supervised learning where you process independent batches, RL requires coordination: ```python -# This won't work - creates bottlenecks and resource waste +# While this does work, it creates bottlenecks and resource waste def naive_rl_step(): # Policy waits idle while reward model works response = policy_model.generate(prompt) # GPU busy @@ -368,7 +368,7 @@ group_size = 1 Production scaling - multiply num_replicas for services or spawn multiple actors: - Policy: num_replicas=8 for high inference demand - RewardActor: num_replicas=16 for parallel evaluation -- Trainer: Multiple actors for distributed training (RLTrainer handles this internally) +- Trainer: Multiple processes for distributed training (RLTrainer handles this internally) ### Fault Tolerance diff --git a/docs/Tutorials/2_Forge_Internals.MD b/docs/Tutorials/2_Forge_Internals.MD index 09c39fb7e..c21485bb0 100644 --- a/docs/Tutorials/2_Forge_Internals.MD +++ b/docs/Tutorials/2_Forge_Internals.MD @@ -64,7 +64,6 @@ Policy.options( ### 2. Real Service Creation -Services are created using the `spawn_service` function: Services are created using the `.options().as_service()` pattern from the actual GRPO implementation: The service creation automatically handles: @@ -126,32 +125,6 @@ But behind the scenes: - Handles failures and retries automatically - Returns list[Completion] from the selected replica -### 3. Different Service Types and Their Characteristics - -```mermaid -graph TD - subgraph GPU["GPU-Intensive Services"] - PolicySvc["Policy Service
Large model inference
High GPU memory
Batch optimization"] - TrainerSvc["Trainer Service
Distributed training
Gradient sync
Massive compute"] - RefSvc["Reference Service
Frozen model
Baseline computation
Read-only ops"] - end - - subgraph CPU["CPU-Intensive Services"] - RewardSvc["Reward Service
Evaluation logic
Rule-based scoring
High throughput"] - DataSvc["Data Service
Dataset streaming
Preprocessing
I/O optimization"] - end - - subgraph Memory["Memory-Intensive Services"] - BufferSvc["Buffer Service
Experience storage
Efficient sampling
Persistence"] - MetricsSvc["Metrics Service
Logging aggregation
Performance tracking
Analytics"] - end - - style PolicySvc fill:#ff9999 - style TrainerSvc fill:#ff9999 - style RewardSvc fill:#99ff99 - style BufferSvc fill:#9999ff -``` - ## Deep Dive: Service Communication Patterns These communication patterns (\"adverbs\") determine how your service calls are routed to replicas. Understanding when to use each pattern is key to effective Forge usage. @@ -226,34 +199,7 @@ while training: **Critical insight**: This is essential for high-throughput RL where you can't wait for batches. -### 4. Fire-and-Forget Operations - -**When to use**: Side effects that don't need responses (notifications, cache updates). - -```python -# CONCEPTUAL - Fire-and-forget requires custom @endpoint implementations -# The basic services don't have broadcast methods built-in -# You would implement custom endpoints in your ForgeActor: - -class CustomPolicy(Policy): - @endpoint - async def clear_cache(self) -> None: - """Custom endpoint for cache clearing""" - self.policy_worker.clear_kv_cache() - -# Then use it (hypothetical): -# await custom_policy.clear_cache.fanout() # Clear all replica caches -# Note: Actual cache clearing would use existing Policy methods -``` - -**Performance characteristics**: -- **Latency**: Immediately returns (doesn't wait for completion) -- **Throughput**: Network limited, but non-blocking -- **Fault tolerance**: Fire-and-forget (you don't know if it worked) - -**Critical warning**: Only use for non-critical operations - you get no confirmation. - -### 5. Service Sessions for Stateful Operations +### 3. Service Sessions for Stateful Operations **When to use**: When you need multiple calls to hit the same replica (like KV cache preservation). From 67a0a9852bd76e775db082bf940ffe97f48fe0a1 Mon Sep 17 00:00:00 2001 From: Sanyam Bhutani Date: Sun, 12 Oct 2025 11:43:57 -0700 Subject: [PATCH 15/28] address more comments --- docs/Tutorials/1_RL_and_Forge_Fundamentals.MD | 8 ++++---- docs/Tutorials/2_Forge_Internals.MD | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/Tutorials/1_RL_and_Forge_Fundamentals.MD b/docs/Tutorials/1_RL_and_Forge_Fundamentals.MD index 32ada41cb..66b32a2b3 100644 --- a/docs/Tutorials/1_RL_and_Forge_Fundamentals.MD +++ b/docs/Tutorials/1_RL_and_Forge_Fundamentals.MD @@ -204,8 +204,8 @@ graph LR Each step has different: - **Latency requirements**: Policy inference needs low latency, training can batch -- **Scaling patterns**: Reward evaluation scales with response count, training with model size -- **Failure modes**: Policy failure stops generation, reward failure affects learning quality +- **Scaling patterns**: Need N policy replicas to keep trainer busy, plus different sharding strategies (tensor parallel for training vs replicated inference) +- **Failure modes**: Any component failure cascades to halt the entire pipeline (Forge prevents this with automatic failover) - **Resource utilization**: GPUs for inference/training, CPUs for data processing ### Problem 3: The Coordination Challenge @@ -229,9 +229,9 @@ def naive_rl_step(): ## Enter Forge: RL-Native Architecture -Forge solves these problems by treating each RL component as an **independent, scalable service** +Forge solves these problems by treating each RL component as an **independent, distributed unit** - some as fault-tolerant services (like Policy inference where failures are easy to handle), others as actors (like Trainers where recovery semantics differ) -Let's see how core RL concepts map to Forge services: +Let's see how core RL concepts map to Forge components 
(you'll notice a mix of `.route()` for services and `.call_one()` for actors - we cover when to use each in Part 2): ```python async def real_rl_training_step(services, step): diff --git a/docs/Tutorials/2_Forge_Internals.MD b/docs/Tutorials/2_Forge_Internals.MD index c21485bb0..2ed3301e5 100644 --- a/docs/Tutorials/2_Forge_Internals.MD +++ b/docs/Tutorials/2_Forge_Internals.MD @@ -140,7 +140,7 @@ answer = responses[0].text # Extract text from Completion object Behind the scenes: 1. Health check eliminates failed replicas -2. Load balancer picks least loaded healthy replica +2. Load balancer picks replica (currently round robin, configurable balancers coming soon) 3. Request routes to that specific replica 4. Automatic retry on different replica if failure @@ -302,7 +302,7 @@ async def optimized_multi_turn(): ```python # Forge ReplayBuffer endpoints (verified from source code) # Add episodes (thread-safe by actor model) -await replay_buffer.add.call_one(episode) # Note: .call_one() not .choose() +await replay_buffer.add.call_one(episode) # .choose() would work too, but .call_one() clarifies it's a singleton actor not ActorMesh # Sample batches for training batch = await replay_buffer.sample.call_one( From 27a48a829e07bea0ede28a2580ac61d3b68f6afc Mon Sep 17 00:00:00 2001 From: Sanyam Bhutani Date: Sun, 12 Oct 2025 11:46:05 -0700 Subject: [PATCH 16/28] fix multi line issue --- docs/Tutorials/1_RL_and_Forge_Fundamentals.MD | 20 +++---- docs/Tutorials/2_Forge_Internals.MD | 14 ++--- docs/Tutorials/3_Monarch_101.MD | 60 +++++++++---------- 3 files changed, 47 insertions(+), 47 deletions(-) diff --git a/docs/Tutorials/1_RL_and_Forge_Fundamentals.MD b/docs/Tutorials/1_RL_and_Forge_Fundamentals.MD index 66b32a2b3..26f90092c 100644 --- a/docs/Tutorials/1_RL_and_Forge_Fundamentals.MD +++ b/docs/Tutorials/1_RL_and_Forge_Fundamentals.MD @@ -9,12 +9,12 @@ Let's start with a simple math tutoring example to understand RL concepts with t ```mermaid graph TD subgraph Example["Math Tutoring RL Example"] - Dataset["Dataset
math problems
'What is 2+2?'"] - Policy["Policy
student AI
generates: 'The answer is 4'"] - Reward["Reward Model
Evaluation Exam
scores: 0.95 (excellent)"] - Reference["Reference Model
original student
baseline comparison"] - ReplayBuffer["Replay Buffer
notebook
stores experiences"] - Trainer["Trainer
tutor
improves student"] + Dataset["Dataset: math problems"] + Policy["Policy: student AI"] + Reward["Reward Model: scores answers"] + Reference["Reference Model: baseline"] + ReplayBuffer["Replay Buffer: stores experiences"] + Trainer["Trainer: improves student"] end Dataset --> Policy @@ -163,13 +163,13 @@ Our simple RL loop above has complex requirements: ```mermaid graph TD subgraph Components["Each Component Needs Different Resources"] - Policy["Policy (Student AI)
Generates: 'The answer is 4'
Needs: Large GPU memory
Scaling: Multiple replicas for speed"] + Policy["Policy (Student AI): Large GPU memory, Multiple replicas"] - Reward["Reward Model (Teacher)
Scores answers: 0.95
Needs: Moderate compute
Scaling: CPU or small GPU"] + Reward["Reward Model (Teacher): Moderate compute, CPU/small GPU"] - Trainer["Trainer (Tutor)
Improves student weights
Needs: Massive GPU compute
Scaling: Distributed training"] + Trainer["Trainer (Tutor): Massive GPU compute, Distributed training"] - Dataset["Dataset (Question Bank)
Provides: 'What is 2+2?'
Needs: CPU intensive I/O
Scaling: High memory bandwidth"] + Dataset["Dataset (Question Bank): CPU intensive I/O, High memory bandwidth"] end style Policy fill:#99ff99 diff --git a/docs/Tutorials/2_Forge_Internals.MD b/docs/Tutorials/2_Forge_Internals.MD index 2ed3301e5..ef53ddfe5 100644 --- a/docs/Tutorials/2_Forge_Internals.MD +++ b/docs/Tutorials/2_Forge_Internals.MD @@ -15,19 +15,19 @@ graph TD Call["Your Code:
await policy_service.generate"] subgraph ServiceLayer["Service Layer"] - Proxy["Service Proxy
Load balancing
Health checking
Request routing"] - LB["Load Balancer
Replica selection
Circuit breaker
Retry logic"] + Proxy["Service Proxy: Load balancing, Health checking"] + LB["Load Balancer: Replica selection, Circuit breaker"] end subgraph Replicas["Replica Management"] - R1["Replica 1
GPU 0
Healthy"] - R2["Replica 2
GPU 1
Overloaded"] - R3["Replica 3
GPU 2
Failed"] - R4["Replica 4
GPU 3
Healthy"] + R1["Replica 1: GPU 0, Healthy"] + R2["Replica 2: GPU 1, Overloaded"] + R3["Replica 3: GPU 2, Failed"] + R4["Replica 4: GPU 3, Healthy"] end subgraph Compute["Actual Computation"] - Actor["Policy Actor
vLLM engine
Model weights
KV cache"] + Actor["Policy Actor: vLLM engine, Model weights, KV cache"] end Call --> Proxy diff --git a/docs/Tutorials/3_Monarch_101.MD b/docs/Tutorials/3_Monarch_101.MD index 0cbdcbd88..502d8a34d 100644 --- a/docs/Tutorials/3_Monarch_101.MD +++ b/docs/Tutorials/3_Monarch_101.MD @@ -13,20 +13,20 @@ graph TD end subgraph ForgeServices["2. Forge Service Layer"] - ServiceInterface["ServiceInterface
• Routes .route() to replica
• Handles load balancing
• Manages health checks"] - ServiceActor["ServiceActor
• Manages replica lifecycle
• Monitors health
• Coordinates failures"] + ServiceInterface["ServiceInterface: Routes requests, Load balancing, Health checks"] + ServiceActor["ServiceActor: Manages replicas, Monitors health, Coordinates failures"] end subgraph MonarchLayer["3. Monarch Actor Layer"] - ActorMesh["ActorMesh PolicyActor
• 4 PolicyActor instances
• Each on different GPU
• Message passing interface"] - ProcMesh["ProcMesh
• 4 processes
• GPU topology: 0,1,2,3
• Network interconnect"] + ActorMesh["ActorMesh PolicyActor: 4 instances, Different GPUs, Message passing"] + ProcMesh["ProcMesh: 4 processes, GPU topology 0,1,2,3, Network interconnect"] end subgraph Hardware["4. Physical Hardware"] - GPU0["GPU 0
PolicyActor #1
vLLM Engine
Model Weights"] - GPU1["GPU 1
PolicyActor #2
vLLM Engine
Model Weights"] - GPU2["GPU 2
PolicyActor #3
vLLM Engine
Model Weights"] - GPU3["GPU 3
PolicyActor #4
vLLM Engine
Model Weights"] + GPU0["GPU 0: PolicyActor #1, vLLM Engine, Model Weights"] + GPU1["GPU 1: PolicyActor #2, vLLM Engine, Model Weights"] + GPU2["GPU 2: PolicyActor #3, vLLM Engine, Model Weights"] + GPU3["GPU 3: PolicyActor #4, vLLM Engine, Model Weights"] end Call --> ServiceInterface @@ -199,10 +199,10 @@ graph TD end subgraph ActorMesh["ActorMesh PolicyActor"] - A0["PolicyActor
Instance #0
model=Qwen/Qwen3-7B
generation_count=0"] - A1["PolicyActor
Instance #1
model=Qwen/Qwen3-7B
generation_count=0"] - A2["PolicyActor
Instance #2
model=Qwen/Qwen3-7B
generation_count=0"] - A3["PolicyActor
Instance #3
model=Qwen/Qwen3-7B
generation_count=0"] + A0["PolicyActor Instance #0: model=Qwen/Qwen3-7B"] + A1["PolicyActor Instance #1: model=Qwen/Qwen3-7B"] + A2["PolicyActor Instance #2: model=Qwen/Qwen3-7B"] + A3["PolicyActor Instance #3: model=Qwen/Qwen3-7B"] end Code --> ProcMesh @@ -226,17 +226,17 @@ graph TD Client["await policy_actors.generate.METHOD(prompt)"] subgraph Methods["Different Adverbs Route Differently"] - Choose["choose()
→ Routes to ONE actor
→ Load balanced"] - Call["call()
→ Routes to ALL actors
→ Collects all results"] - Broadcast["broadcast()
→ Routes to ALL actors
→ Fire and forget"] - Stream["stream()
→ Routes to ALL actors
→ Iterator of results"] + Choose["choose(): Routes to ONE actor, Load balanced"] + Call["call(): Routes to ALL actors, Collects results"] + Broadcast["broadcast(): Routes to ALL actors, Fire and forget"] + Stream["stream(): Routes to ALL actors, Iterator of results"] end subgraph ActorInstances["PolicyActor Instances"] - A0["Actor 0
GPU 0
generates response"] - A1["Actor 1
GPU 1
generates response"] - A2["Actor 2
GPU 2
generates response"] - A3["Actor 3
GPU 3
generates response"] + A0["Actor 0: GPU 0, generates response"] + A1["Actor 1: GPU 1, generates response"] + A2["Actor 2: GPU 2, generates response"] + A3["Actor 3: GPU 3, generates response"] end Client --> Choose @@ -276,26 +276,26 @@ graph TD subgraph ServiceCreation["Service Creation Process"] Call["await PolicyActor.options(num_replicas=4, procs=1).as_service(model='Qwen')"] - ServiceActor["ServiceActor
• Manages 4 replicas
• Handles health checks
• Routes service calls"] + ServiceActor["ServiceActor: Manages 4 replicas, Health checks, Routes calls"] subgraph Replicas["4 Independent Replicas"] subgraph R0["Replica 0"] - PM0["ProcMesh
1 process
GPU 0"] + PM0["ProcMesh: 1 process, GPU 0"] AM0["ActorMesh
1 PolicyActor"] end subgraph R1["Replica 1"] - PM1["ProcMesh
1 process
GPU 1"] + PM1["ProcMesh: 1 process, GPU 1"] AM1["ActorMesh
1 PolicyActor"] end subgraph R2["Replica 2"] - PM2["ProcMesh
1 process
GPU 2"] + PM2["ProcMesh: 1 process, GPU 2"] AM2["ActorMesh
1 PolicyActor"] end subgraph R3["Replica 3"] - PM3["ProcMesh
1 process
GPU 3"] + PM3["ProcMesh: 1 process, GPU 3"] AM3["ActorMesh
1 PolicyActor"] end end @@ -325,15 +325,15 @@ graph TD subgraph CallFlow["Complete Call Flow"] UserCall["await policy_service.generate.route('What is 2+2?')"] - ServiceInterface["ServiceInterface
• Receives .route() call
• Routes to ServiceActor"] + ServiceInterface["ServiceInterface: Receives .route() call, Routes to ServiceActor"] - ServiceActor["ServiceActor
• Selects healthy replica
• Load balancing logic
• Failure handling"] + ServiceActor["ServiceActor: Selects healthy replica, Load balancing, Failure handling"] - SelectedReplica["Selected Replica #2
• ProcMesh with 1 process
• ActorMesh with 1 PolicyActor"] + SelectedReplica["Selected Replica #2: ProcMesh 1 process, ActorMesh 1 PolicyActor"] - PolicyActor["PolicyActor Instance
• Loads model
• Runs vLLM inference
• Returns 'The answer is 4'"] + PolicyActor["PolicyActor Instance: Loads model, Runs vLLM inference"] - GPU["GPU 2
• vLLM engine
• Model weights
• KV cache
• CUDA kernels"] + GPU["GPU 2: vLLM engine, Model weights, KV cache, CUDA kernels"] UserCall --> ServiceInterface ServiceInterface --> ServiceActor From 858c28b8dc6f3e07b989b40112dc9301ecf861f2 Mon Sep 17 00:00:00 2001 From: Sanyam Bhutani Date: Sun, 12 Oct 2025 11:48:16 -0700 Subject: [PATCH 17/28] fix colours --- docs/Tutorials/1_RL_and_Forge_Fundamentals.MD | 32 ++++---- docs/Tutorials/2_Forge_Internals.MD | 8 +- docs/Tutorials/3_Monarch_101.MD | 76 +++++++++---------- 3 files changed, 58 insertions(+), 58 deletions(-) diff --git a/docs/Tutorials/1_RL_and_Forge_Fundamentals.MD b/docs/Tutorials/1_RL_and_Forge_Fundamentals.MD index 26f90092c..2565d626e 100644 --- a/docs/Tutorials/1_RL_and_Forge_Fundamentals.MD +++ b/docs/Tutorials/1_RL_and_Forge_Fundamentals.MD @@ -25,9 +25,9 @@ graph TD ReplayBuffer --> Trainer Trainer --> Policy - style Policy fill:#99ff99 - style Reward fill:#ffcc99 - style Trainer fill:#ff99cc + style Policy fill:#4CAF50 + style Reward fill:#FF9800 + style Trainer fill:#E91E63 ``` ### RL Components Defined (Forge Names) @@ -100,10 +100,10 @@ graph LR C5 --> S5 C6 --> S6 - style C2 fill:#99ff99 - style S2 fill:#99ff99 - style C3 fill:#ffcc99 - style S3 fill:#ffcc99 + style C2 fill:#4CAF50 + style S2 fill:#4CAF50 + style C3 fill:#FF9800 + style S3 fill:#FF9800 ``` ### RL Step with Forge Services @@ -172,10 +172,10 @@ graph TD Dataset["Dataset (Question Bank): CPU intensive I/O, High memory bandwidth"] end - style Policy fill:#99ff99 - style Reward fill:#ffcc99 - style Trainer fill:#ff99cc - style Dataset fill:#ccccff + style Policy fill:#4CAF50 + style Reward fill:#FF9800 + style Trainer fill:#E91E63 + style Dataset fill:#2196F3 ``` ### Problem 2: Complex Interdependencies @@ -195,11 +195,11 @@ graph LR D --> E E --> A - style A fill:#99ff99 - style B fill:#ffcc99 - style C fill:#99ccff - style D fill:#ccff99 - style E fill:#ff99cc + style A fill:#4CAF50 + style B fill:#FF9800 + style C fill:#2196F3 + style D fill:#8BC34A + style E fill:#E91E63 ``` Each step has different: diff --git a/docs/Tutorials/2_Forge_Internals.MD b/docs/Tutorials/2_Forge_Internals.MD index ef53ddfe5..05a40e4a5 100644 --- a/docs/Tutorials/2_Forge_Internals.MD +++ b/docs/Tutorials/2_Forge_Internals.MD @@ -39,10 +39,10 @@ graph TD R1 --> Actor R4 --> Actor - style Call fill:#99ff99 - style LB fill:#ffcc99 - style R3 fill:#ff9999 - style Actor fill:#cc99ff + style Call fill:#4CAF50 + style LB fill:#FF9800 + style R3 fill:#F44336 + style Actor fill:#9C27B0 ``` ## Service Components Deep Dive diff --git a/docs/Tutorials/3_Monarch_101.MD b/docs/Tutorials/3_Monarch_101.MD index 502d8a34d..52bdb17d0 100644 --- a/docs/Tutorials/3_Monarch_101.MD +++ b/docs/Tutorials/3_Monarch_101.MD @@ -38,10 +38,10 @@ graph TD ProcMesh --> GPU2 ProcMesh --> GPU3 - style Call fill:#99ff99 - style ServiceActor fill:#ffcc99 - style ActorMesh fill:#cc99ff - style ProcMesh fill:#ccccff + style Call fill:#4CAF50 + style ServiceActor fill:#FF9800 + style ActorMesh fill:#9C27B0 + style ProcMesh fill:#2196F3 ``` ## Deep Dive: ProcMesh - The Foundation @@ -74,14 +74,14 @@ graph TD P7 -.->|"Network"| P0 end - style P0 fill:#ff9999 - style P1 fill:#ff9999 - style P2 fill:#ff9999 - style P3 fill:#ff9999 - style P4 fill:#ff9999 - style P5 fill:#ff9999 - style P6 fill:#ff9999 - style P7 fill:#ff9999 + style P0 fill:#F44336 + style P1 fill:#F44336 + style P2 fill:#F44336 + style P3 fill:#F44336 + style P4 fill:#F44336 + style P5 fill:#F44336 + style P6 fill:#F44336 + style P7 fill:#F44336 ``` ### Multi-Host ProcMesh @@ -122,9 +122,9 @@ 
graph TD H2P0 -.->|"InfiniBand"| H3P0 H2P1 -.->|"InfiniBand"| H3P1 - style PM1 fill:#ff9999 - style PM2 fill:#99ff99 - style PM3 fill:#99ccff + style PM1 fill:#F44336 + style PM2 fill:#4CAF50 + style PM3 fill:#2196F3 ``` ```python @@ -212,10 +212,10 @@ graph TD P3 --> A3 end - style A0 fill:#99ff99 - style A1 fill:#99ff99 - style A2 fill:#99ff99 - style A3 fill:#99ff99 + style A0 fill:#4CAF50 + style A1 fill:#4CAF50 + style A2 fill:#4CAF50 + style A3 fill:#4CAF50 ``` ### Message Routing Through ActorMesh @@ -259,10 +259,10 @@ graph TD Stream --> A3 end - style Choose fill:#99ff99 - style Call fill:#ffcc99 - style Broadcast fill:#ff99cc - style Stream fill:#cc99ff + style Choose fill:#4CAF50 + style Call fill:#FF9800 + style Broadcast fill:#E91E63 + style Stream fill:#9C27B0 ``` ## How Forge Services Use Monarch @@ -311,11 +311,11 @@ graph TD PM3 --> AM3 end - style ServiceActor fill:#ffcc99 - style AM0 fill:#99ff99 - style AM1 fill:#99ff99 - style AM2 fill:#99ff99 - style AM3 fill:#99ff99 + style ServiceActor fill:#FF9800 + style AM0 fill:#4CAF50 + style AM1 fill:#4CAF50 + style AM2 fill:#4CAF50 + style AM3 fill:#4CAF50 ``` ### Service Call to Actor Execution @@ -348,10 +348,10 @@ graph TD ServiceInterface -.->|"'The answer is 4'"| UserCall end - style UserCall fill:#99ff99 - style ServiceActor fill:#ffcc99 - style PolicyActor fill:#cc99ff - style GPU fill:#ffcccc + style UserCall fill:#4CAF50 + style ServiceActor fill:#FF9800 + style PolicyActor fill:#9C27B0 + style GPU fill:#FF5722 ``` ## Multiple Services Sharing Infrastructure @@ -400,12 +400,12 @@ graph TD BS --> C4 end - style PS fill:#99ff99 - style TS fill:#ff99cc - style RS fill:#ffcc99 - style BS fill:#cc99ff - style GPUMesh fill:#ffe6e6 - style CPUMesh fill:#e6f3ff + style PS fill:#4CAF50 + style TS fill:#E91E63 + style RS fill:#FF9800 + style BS fill:#9C27B0 + style GPUMesh fill:#FFEBEE + style CPUMesh fill:#E3F2FD ``` ## Key Insights: Why This Architecture Matters From 3862ce9d82fac3f5348d777ad892c3160e40dc84 Mon Sep 17 00:00:00 2001 From: Sanyam Bhutani Date: Sun, 12 Oct 2025 12:03:09 -0700 Subject: [PATCH 18/28] fix linter and ohter comments --- docs/Tutorials/1_RL_and_Forge_Fundamentals.MD | 114 +++++++++--------- docs/Tutorials/2_Forge_Internals.MD | 98 +++++++-------- docs/Tutorials/ReadMe.MD | 6 +- 3 files changed, 109 insertions(+), 109 deletions(-) diff --git a/docs/Tutorials/1_RL_and_Forge_Fundamentals.MD b/docs/Tutorials/1_RL_and_Forge_Fundamentals.MD index 2565d626e..39b6d62aa 100644 --- a/docs/Tutorials/1_RL_and_Forge_Fundamentals.MD +++ b/docs/Tutorials/1_RL_and_Forge_Fundamentals.MD @@ -16,7 +16,7 @@ graph TD ReplayBuffer["Replay Buffer: stores experiences"] Trainer["Trainer: improves student"] end - + Dataset --> Policy Policy --> Reward Policy --> Reference @@ -24,7 +24,7 @@ graph TD Reference --> ReplayBuffer ReplayBuffer --> Trainer Trainer --> Policy - + style Policy fill:#4CAF50 style Reward fill:#FF9800 style Trainer fill:#E91E63 @@ -47,25 +47,25 @@ graph TD def conceptual_rl_step(): # 1. Get a math problem question = dataset.sample() # "What is 2+2?" - - # 2. Student generates answer + + # 2. Student generates answer answer = policy.generate(question) # "The answer is 4" - + # 3. Teacher grades it score = reward_model.evaluate(question, answer) # 0.95 - + # 4. Compare to original student baseline = reference_model.compute_logprobs(question, answer) - + # 5. Store the experience experience = Episode(question, answer, score, baseline) replay_buffer.add(experience) - + # 6. 
When enough experiences collected, improve student batch = replay_buffer.sample(curr_policy_version=0) if batch is not None: trainer.train_step(batch) # Student gets better! - + # 🔄 See complete working example below with actual Forge service calls ``` @@ -83,7 +83,7 @@ graph LR C5["Replay Buffer"] C6["Trainer"] end - + subgraph Services["Forge Services (Real Classes)"] S1["DatasetActor"] S2["Policy"] @@ -92,14 +92,14 @@ graph LR S5["ReplayBuffer"] S6["RLTrainer"] end - + C1 --> S1 C2 --> S2 C3 --> S3 C4 --> S4 C5 --> S5 C6 --> S6 - + style C2 fill:#4CAF50 style S2 fill:#4CAF50 style C3 fill:#FF9800 @@ -117,26 +117,26 @@ async def conceptual_forge_rl_step(services, step): # 1. Get a math problem - Using actual DatasetActor API sample = await services['dataloader'].sample.call_one() question, target = sample["request"], sample["target"] - + # 2. Student generates answer - Using actual Policy API responses = await services['policy'].generate.route(prompt=question) - answer = responses[0].text - + answer = responses[0].text + # 3. Teacher grades it - Using actual RewardActor API score = await services['reward_actor'].evaluate_response.route( prompt=question, response=answer, target=target ) - + # 4. Compare to baseline - Using actual ReferenceModel API # Note: ReferenceModel.forward requires input_ids, max_req_tokens, return_logprobs ref_logprobs = await services['ref_model'].forward.route( input_ids, max_req_tokens, return_logprobs=True ) - + # 5. Store experience - Using actual Episode structure from apps/grpo/main.py episode = create_episode_from_response(responses[0], score, ref_logprobs, step) await services['replay_buffer'].add.call_one(episode) - + # 6. Improve student - Using actual training pattern batch = await services['replay_buffer'].sample.call_one( curr_policy_version=step @@ -160,23 +160,12 @@ Our simple RL loop above has complex requirements: #### Problem 1: Different Resource Needs -```mermaid -graph TD - subgraph Components["Each Component Needs Different Resources"] - Policy["Policy (Student AI): Large GPU memory, Multiple replicas"] - - Reward["Reward Model (Teacher): Moderate compute, CPU/small GPU"] - - Trainer["Trainer (Tutor): Massive GPU compute, Distributed training"] - - Dataset["Dataset (Question Bank): CPU intensive I/O, High memory bandwidth"] - end - - style Policy fill:#4CAF50 - style Reward fill:#FF9800 - style Trainer fill:#E91E63 - style Dataset fill:#2196F3 -``` +| Component | Resource Needs | Scaling Strategy | +|-----------|----------------|------------------| +| **Policy** (Student AI) | Large GPU memory | Multiple replicas for throughput | +| **Reward Heuristic** (Teacher) | Small compute | CPU or small GPU | +| **Trainer** (Tutor) | Massive GPU compute | Distributed training | +| **Dataset** (Question Bank) | CPU intensive I/O | High memory bandwidth | ### Problem 2: Complex Interdependencies @@ -187,14 +176,14 @@ graph LR C["Reference: Original Student
Provides baseline comparison"] D["Replay Buffer: Notebook
Stores: question + answer + score"] E["Trainer: Tutor
Improves student using experiences"] - + A --> B A --> C B --> D C --> D D --> E E --> A - + style A fill:#4CAF50 style B fill:#FF9800 style C fill:#2196F3 @@ -203,7 +192,7 @@ graph LR ``` Each step has different: -- **Latency requirements**: Policy inference needs low latency, training can batch +- **Latency requirements**: Policy inference needs low latency (each episode waits), training can batch multiple episodes together - **Scaling patterns**: Need N policy replicas to keep trainer busy, plus different sharding strategies (tensor parallel for training vs replicated inference) - **Failure modes**: Any component failure cascades to halt the entire pipeline (Forge prevents this with automatic failover) - **Resource utilization**: GPUs for inference/training, CPUs for data processing @@ -218,10 +207,10 @@ def naive_rl_step(): # Policy waits idle while reward model works response = policy_model.generate(prompt) # GPU busy reward = reward_model.evaluate(prompt, response) # Policy GPU idle - - # Training waits for single episode + + # Training waits for single episode loss = compute_loss(response, reward) # Batch size = 1, inefficient - + # Everything stops if any component fails if policy_fails or reward_fails or trainer_fails: entire_system_stops() @@ -233,32 +222,37 @@ Forge solves these problems by treating each RL component as an **independent, d Let's see how core RL concepts map to Forge components (you'll notice a mix of `.route()` for services and `.call_one()` for actors - we cover when to use each in Part 2): +**Quick API Reference:** (covered in detail in Part 2: Service Communication Patterns) +- `.route()` - Send request to any healthy replica in a service (load balanced) +- `.call_one()` - Send request to a single actor instance +- `.fanout()` - Send request to ALL replicas in a service + ```python async def real_rl_training_step(services, step): """Single RL step using verified Forge APIs""" - + # 1. Environment interaction - Using actual DatasetActor API sample = await services['dataloader'].sample.call_one() prompt, target = sample["request"], sample["target"] - + responses = await services['policy'].generate.route(prompt) - + # 2. Reward computation - Using actual RewardActor API score = await services['reward_actor'].evaluate_response.route( prompt=prompt, response=responses[0].text, target=target ) - + # 3. Get reference logprobs - Using actual ReferenceModel API # Note: ReferenceModel requires full input_ids tensor, not just tokens input_ids = torch.cat([responses[0].prompt_ids, responses[0].token_ids]) ref_logprobs = await services['ref_model'].forward.route( input_ids.unsqueeze(0), max_req_tokens=512, return_logprobs=True ) - + # 4. Experience storage - Using actual Episode pattern from GRPO episode = create_episode_from_response(responses[0], score, ref_logprobs, step) await services['replay_buffer'].add.call_one(episode) - + # 5. Learning - Using actual trainer pattern batch = await services['replay_buffer'].sample.call_one( curr_policy_version=step @@ -266,11 +260,11 @@ async def real_rl_training_step(services, step): if batch is not None: inputs, targets = batch # GRPO returns (inputs, targets) tuple loss = await services['trainer'].train_step.call(inputs, targets) - + # 6. 
Policy synchronization - Using actual weight update pattern await services['trainer'].push_weights.call(step + 1) await services['policy'].update_weights.fanout(step + 1) - + return loss ``` @@ -286,7 +280,7 @@ answer = responses[0].text # responses is list[Completion] Forge handles behind the scenes: - Routing to least loaded replica -- GPU memory management +- GPU memory management - Batch optimization - Failure recovery - Auto-scaling based on demand @@ -365,10 +359,16 @@ group_size = 1 ) ``` -Production scaling - multiply num_replicas for services or spawn multiple actors: -- Policy: num_replicas=8 for high inference demand -- RewardActor: num_replicas=16 for parallel evaluation -- Trainer: Multiple processes for distributed training (RLTrainer handles this internally) +**Forge Components: Services vs Actors** + +Forge has two types of distributed components: +- **Services**: Multiple replicas with automatic load balancing (like Policy, RewardActor) +- **Actors**: Single instances that handle their own internal distribution (like RLTrainer, ReplayBuffer) + +We cover this distinction in detail in Part 2, but for now this explains the scaling patterns: +- Policy service: num_replicas=8 for high inference demand +- RewardActor service: num_replicas=16 for parallel evaluation +- RLTrainer actor: Single instance with internal distributed training ### Fault Tolerance @@ -377,13 +377,13 @@ Production scaling - multiply num_replicas for services or spawn multiple actors responses = await policy.generate.route(prompt=question) answer = responses[0].text # -> Forge automatically routes to healthy replica -# -> Failed replica respawns in background +# -> Failed replica respawns in background # -> No impact on training loop # If reward service fails: score = await reward_actor.evaluate_response.route( prompt=question, response=answer, target=target -) +) ``` - Retries on different replica automatically @@ -392,4 +392,4 @@ score = await reward_actor.evaluate_response.route( This is fundamentally different from monolithic RL implementations where any component failure stops everything! -In the next Section, we will go a layer deeper and learn how ForgeServices work. Continue to [Part 2 here](./2_Forge_Internals.MD) \ No newline at end of file +In the next Section, we will go a layer deeper and learn how ForgeServices work. Continue to [Part 2 here](./2_Forge_Internals.MD) diff --git a/docs/Tutorials/2_Forge_Internals.MD b/docs/Tutorials/2_Forge_Internals.MD index 05a40e4a5..e1af9cde3 100644 --- a/docs/Tutorials/2_Forge_Internals.MD +++ b/docs/Tutorials/2_Forge_Internals.MD @@ -13,23 +13,23 @@ When you call `await policy_service.generate(question)`, here's what actually ha ```mermaid graph TD Call["Your Code:
await policy_service.generate"] - + subgraph ServiceLayer["Service Layer"] Proxy["Service Proxy: Load balancing, Health checking"] LB["Load Balancer: Replica selection, Circuit breaker"] end - + subgraph Replicas["Replica Management"] R1["Replica 1: GPU 0, Healthy"] R2["Replica 2: GPU 1, Overloaded"] R3["Replica 3: GPU 2, Failed"] R4["Replica 4: GPU 3, Healthy"] end - + subgraph Compute["Actual Computation"] Actor["Policy Actor: vLLM engine, Model weights, KV cache"] end - + Call --> Proxy Proxy --> LB LB --> R1 @@ -38,7 +38,7 @@ graph TD LB --> R4 R1 --> Actor R4 --> Actor - + style Call fill:#4CAF50 style LB fill:#FF9800 style R3 fill:#F44336 @@ -55,7 +55,7 @@ Here's the actual ServiceConfig from Forge source code: # Configuration pattern from apps/grpo/main.py: Policy.options( procs=1, # Processes per replica - num_replicas=4, # Number of replicas + num_replicas=4, # Number of replicas with_gpus=True # Allocate GPUs # Other available options: # hosts=None # the number of remote hosts used per replica @@ -69,7 +69,7 @@ Services are created using the `.options().as_service()` pattern from the actual The service creation automatically handles: - Spawning actor replicas across processes/GPUs - Load balancing with .route() method for services -- Health monitoring and failure recovery +- Health monitoring and failure recovery - Message routing and serialization ```python @@ -78,8 +78,8 @@ from forge.actors.policy import Policy model = "Qwen/Qwen3-1.7B" policy = await Policy.options( - procs=1, - with_gpus=True, + procs=1, + with_gpus=True, num_replicas=1 ).as_service( engine_config={ @@ -158,7 +158,7 @@ Behind the scenes: ```python # Get version from all policy replicas current_versions = await policy.get_version.fanout() -# Returns: [version_replica_1, version_replica_2, ...] +# Returns: [version_replica_1, version_replica_2, ...] # Update weights on all replicas await policy.update_weights.fanout(new_policy_version) @@ -193,8 +193,8 @@ while training: ``` **Performance characteristics**: -- **Latency**: Process first result immediately -- **Throughput**: Pipeline parallelism (much higher than sequential) +- **Latency**: Process first result immediately +- **Throughput**: Non-blocking async operations (much higher than waiting for full batches) - **Fault tolerance**: Continues if some replicas fail **Critical insight**: This is essential for high-throughput RL where you can't wait for batches. @@ -242,7 +242,7 @@ async with counter_service.session(): print(await counter_service.increment.route()) # 1 print(await counter_service.increment.route()) # 2 print(await counter_service.increment.route()) # 3 - + final_value = await counter_service.get_value.route() print(f"Final value on this replica: {final_value}") # 3 @@ -263,7 +263,7 @@ await counter_service.shutdown() The most complex challenge in distributed RL is maintaining state consistency while maximizing performance. -### The KV Cache Problem +### The KV Cache Problem **The challenge**: Policy inference is much faster with KV cache, but cache is tied to specific conversation history. @@ -278,16 +278,16 @@ async def naive_multi_turn(): **The solution**: Sticky sessions ensure all calls go to same replica. 
-```python +```python async def optimized_multi_turn(): async with policy.session(): # All calls guaranteed to hit same replica = cache hits response1 = await policy.generate.route(prompt=question1) - full_prompt = question1 + response1[0].text + full_prompt = question1 + response1[0].text response2 = await policy.generate.route(prompt=full_prompt) # Cache hit! conversation = full_prompt + response2[0].text response3 = await policy.generate.route(prompt=conversation) # Cache hit! - + # Session ends, replica can be garbage collected or reused ``` @@ -327,11 +327,11 @@ batch = await replay_buffer.sample.call_one( async def real_weight_sync(trainer, policy, step): # Trainer pushes weights to TorchStore with version number await trainer.push_weights.call_one(policy_version=step + 1) - - # Policy service updates to new version from TorchStore + + # Policy service updates to new version from TorchStore # Use .fanout() to update ALL policy replicas await policy.update_weights.fanout(policy_version=step + 1) - + # Check current policy version current_version = await policy.get_version.route() print(f"Current policy version: {current_version}") @@ -349,29 +349,29 @@ Instead of manual coordination, Forge services handle speed mismatches automatic from apps.grpo.main import Episode, Group async def simple_rl_step(): - + # ===== Generate a rollout ===== sample = await dataloader.sample.call_one() # DatasetActor is an actor, not service prompt, target = sample["request"], sample["target"] # Correct field names - + print(f"Prompt: {prompt}") print(f"Target: {target}") - + actions = await policy.generate.route(prompt=prompt) # Policy is a service print(f"Policy response: {actions[0].text}") - + # Create input tensor for reference model (requires full context) input_ids = torch.cat([actions[0].prompt_ids, actions[0].token_ids]) ref_logprobs = await ref_model.forward.route( input_ids.unsqueeze(0), max_req_tokens=512, return_logprobs=True - ) + ) reward = await reward_actor.evaluate_response.route( # RewardActor is a service - prompt=prompt, - response=actions[0].text, + prompt=prompt, + response=actions[0].text, target=target ) print(f"Reward: {reward}") - + # Create episode using actual GRPO Episode structure episode = Episode( episode_id="0", @@ -382,24 +382,24 @@ async def simple_rl_step(): response_len=512, target=target ) - + # Add response data episode.response = actions[0].text episode.request_tokens = actions[0].prompt_ids.tolist() episode.response_tokens = actions[0].token_ids.tolist() episode.ref_logprobs = ref_logprobs[0] # Extract from batch dimension episode.reward = reward - + # Compute advantages using actual ComputeAdvantages actor group = Group.new_group(0, 1, prompt, 0, tokenizer.pad_token_id, 512, 512, target) group.episodes[0] = episode advantages = await compute_advantages.compute.call_one(group) # ComputeAdvantages is an actor episode.advantage = advantages[0] - print(f"Advantage: {advantages[0]}") + print(f"Advantage: {advantages[0]}") await replay_buffer.add.call_one(episode) # ReplayBuffer is an actor print("Episode stored in replay buffer") - - # ===== Train on the batch ===== + + # ===== Train on the batch ===== batch = await replay_buffer.sample.call_one(curr_policy_version=0) if batch is not None: print("Training on batch...") @@ -469,12 +469,12 @@ class RewardActor(ForgeActor): async def evaluate_response(self, prompt: str, response: str, target: str) -> float: """Evaluate response quality using multiple reward functions""" total_reward = 0.0 - + for reward_fn in 
self.reward_functions: # Each reward function contributes to total score reward = reward_fn(prompt, response, target) total_reward += reward - + # Return average reward across all functions return total_reward / len(self.reward_functions) if self.reward_functions else 0.0 @@ -490,7 +490,7 @@ target = "36" score = await reward_actor.evaluate_response.route( prompt=prompt, - response=response, + response=response, target=target ) print(f"Reward score: {score}") # Usually around 1.0 for correct math answers @@ -530,7 +530,7 @@ print("Initializing all services...") reward_actor, ) = await asyncio.gather( DatasetActor.options(procs=1).as_actor( - path="openai/gsm8k", revision="main", data_split="train", + path="openai/gsm8k", revision="main", data_split="train", streaming=True, model="Qwen/Qwen3-1.7B" ), Policy.options(procs=1, with_gpus=True, num_replicas=1).as_service( @@ -559,41 +559,41 @@ print("All services initialized successfully!") async def production_training_loop(): """Real training loop pattern from apps/grpo/main.py""" step = 0 - + while True: - # Data generation + # Data generation sample = await dataloader.sample.call_one() - + # Policy generation service call responses = await policy.generate.route(sample["request"]) # Correct field name - + # Reference computation service call (requires full input tensor) input_ids = torch.cat([responses[0].prompt_ids, responses[0].token_ids]) ref_logprobs = await ref_model.forward.route( input_ids.unsqueeze(0), max_req_tokens=512, return_logprobs=True ) - - # Reward evaluation service call + + # Reward evaluation service call reward = await reward_actor.evaluate_response.route( prompt=sample["question"], response=responses[0].text, target=sample["answer"] ) - + # Experience storage (using actual Episode structure) episode = create_episode_from_grpo_data(sample, responses[0], reward, ref_logprobs[0], step) await replay_buffer.add.call_one(episode) - + # Training when ready batch = await replay_buffer.sample.call_one(curr_policy_version=step) if batch is not None: inputs, targets = batch # GRPO returns (inputs, targets) tuple loss = await trainer.train_step.call(inputs, targets) - + # Weight synchronization pattern await trainer.push_weights.call(step + 1) await policy.update_weights.fanout(step + 1) # Fanout to all replicas - + print(f"Step {step}, Loss: {loss:.4f}") step += 1 @@ -612,11 +612,11 @@ print("All services shut down successfully!") **Key observations:** 1. **Parallelism**: Independent operations run concurrently -2. **Load balancing**: Each `.route()` call automatically selects optimal replica +2. **Load balancing**: Each `.route()` call automatically selects optimal replica 3. **Fault tolerance**: Failures automatically retry on different replicas 4. **Resource efficiency**: CPU and GPU services scale independently 5. **Coordination**: Services coordinate through shared state (replay buffer, weight versions) This is the power of the service abstraction - complex distributed coordination looks like simple async Python code. 
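Before moving on, the parallelism in the observations above is worth seeing in code. The sketch below is not from apps/grpo/main.py; the `gather_rollouts` helper and the `samples` list are illustrative, but the `.route()` calls are the same ones used throughout this part. Independent rollouts are overlapped with `asyncio.gather`, while each call is still load-balanced (and retried) by the service layer:

```python
import asyncio

async def gather_rollouts(policy, reward_actor, samples):
    """Illustrative helper: overlap independent rollouts across service replicas."""

    async def one_rollout(sample):
        # Each .route() call is independently load-balanced to a healthy replica
        responses = await policy.generate.route(prompt=sample["request"])
        reward = await reward_actor.evaluate_response.route(
            prompt=sample["request"],
            response=responses[0].text,
            target=sample["target"],
        )
        return responses[0], reward

    # Wall-clock time is roughly the slowest rollout, not the sum of all rollouts
    return await asyncio.gather(*(one_rollout(s) for s in samples))
```

This is the property the first key observation points to: the service layer never forces independent calls to wait on each other.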
-In the next part we will learn about [Monarch internals](./3_Monarch_101.MD) \ No newline at end of file +In the next part we will learn about [Monarch internals](./3_Monarch_101.MD) diff --git a/docs/Tutorials/ReadMe.MD b/docs/Tutorials/ReadMe.MD index 7798b147d..084710853 100644 --- a/docs/Tutorials/ReadMe.MD +++ b/docs/Tutorials/ReadMe.MD @@ -4,7 +4,7 @@ A comprehensive guide for ML Engineers building distributed RL systems for langu Some of the examples mentioned below will be conceptual in nature for understanding. Please refer to API Docs (Coming Soon!) for more details -Welcome to the Tutorials section! This section is inspired by the A-Z PyTorch tutorial, shoutout to our PyTorch friends that remember! +Welcome to the Tutorials section! This section is inspired by the A-Z PyTorch tutorial, shoutout to our PyTorch friends that remember! ### @@ -14,6 +14,6 @@ This section currently is structured in 3 detailed parts: 2. [Forge Internals](./2_Forge_Internals.MD): Goes a layer deeper and explains the internals of Forge 3. [Monarch 101](./3_Monarch_101.MD): It's a 101 to Monarch and how Forge Talks to Monarch -Each part builds upon the next and the entire section can be consumed in roughly an hour-Grab a Chai and Enjoy! +Each part builds upon the next and the entire section can be consumed in roughly an hour - Grab a Chai and Enjoy! -If you're eager, please checkout our SFT Tutorial too (Coming soon!) as well as [App Examples](../../apps/). \ No newline at end of file +If you're eager, please check out our SFT Tutorial too (Coming soon!) as well as [App Examples](../../apps/). From c8c26ab21346e924db5d05c540b22524df6d0035 Mon Sep 17 00:00:00 2001 From: Sanyam Bhutani Date: Mon, 13 Oct 2025 15:04:16 -0700 Subject: [PATCH 19/28] address felipe's comments, add image and fix sticky session examples --- docs/Tutorials/2_Forge_Internals.MD | 83 +++++++++++++++++++++++------ 1 file changed, 66 insertions(+), 17 deletions(-) diff --git a/docs/Tutorials/2_Forge_Internals.MD b/docs/Tutorials/2_Forge_Internals.MD index e1af9cde3..8189cf8a5 100644 --- a/docs/Tutorials/2_Forge_Internals.MD +++ b/docs/Tutorials/2_Forge_Internals.MD @@ -108,22 +108,54 @@ Forge services are implemented as ServiceActors that manage collections of your -Forge internals - What happens behind the scenes: -1. `.as_service()` creates a `ServiceInterface` -2. `ServiceInterface` manages N replicas of your `ForgeActor` class -3. `ServiceInterface` handles routing between replicas -4. You get methods like `.route()`, `.fanout()`, etc. +When you call `.as_service()`, Forge creates a `ServiceInterface` that manages N replicas of your `ForgeActor` class and gives you methods like `.route()`, `.fanout()`, etc.
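To make that lifecycle concrete, here is a minimal sketch built around a toy actor. The `EchoActor` name and its `prefix` argument are hypothetical; the pattern (a `ForgeActor` subclass with `@endpoint` methods, spawned via `.options().as_service()`) is the same one the `ForgeCounter` example uses later in this part:

```python
from forge.controller import ForgeActor
from monarch.actor import endpoint

class EchoActor(ForgeActor):
    """Toy actor used only to illustrate the service lifecycle."""

    def __init__(self, prefix: str):
        self.prefix = prefix

    @endpoint
    async def echo(self, text: str) -> str:
        return f"{self.prefix}{text}"

# .options() declares the replica layout; .as_service() spawns the replicas
# and returns the ServiceInterface that routes your calls.
echo = await EchoActor.options(procs=1, num_replicas=2).as_service(prefix="> ")

print(await echo.echo.route("hello"))   # load-balanced to one healthy replica
print(await echo.echo.fanout("hello"))  # one result per replica, returned as a list

await echo.shutdown()
```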
```python -# Your code sees this: +# Your code sees this simple interface: responses = await policy.generate.route(prompt=prompt) +# But Forge handles all the complexity of replica management, load balancing, and fault tolerance ``` -But behind the scenes: -- `ServiceInterface` selects healthy replica -- Routes message to that replica's `Policy.generate()` endpoint -- Handles failures and retries automatically -- Returns list[Completion] from the selected replica +## Communication Patterns: Quick Reference + +**API Summary:** +- `.route()` - Send request to any healthy replica in a service (load balanced) +- `.call_one()` - Send request to a single actor instance +- `.fanout()` - Send request to ALL replicas in a service + +```mermaid +graph LR + subgraph Request["Your Request"] + Code["await service.method.ADVERB()"] + end + + subgraph Patterns["Communication Patterns"] + Route[".route()
→ One healthy replica"] + CallOne[".call_one()
→ Single actor"] + Fanout[".fanout()
→ ALL replicas"] + end + + subgraph Replicas["Replicas/Actors"] + R1["Replica 1"] + R2["Replica 2"] + R3["Replica 3"] + A1["Actor"] + end + + Code --> Route + Code --> CallOne + Code --> Fanout + + Route --> R2 + CallOne --> A1 + Fanout --> R1 + Fanout --> R2 + Fanout --> R3 + + style Route fill:#4CAF50 + style CallOne fill:#FF9800 + style Fanout fill:#9C27B0 +``` ## Deep Dive: Service Communication Patterns @@ -203,8 +235,10 @@ while training: **When to use**: When you need multiple calls to hit the same replica (like KV cache preservation). +**What are sticky sessions?** A session ensures all your service calls within the `async with` block go to the same replica, instead of being load-balanced across different replicas. + ```python -# This Counter example demonstrates the session pattern +# This Counter example demonstrates the difference between regular routing and sessions from forge.controller import ForgeActor from monarch.actor import endpoint @@ -230,22 +264,37 @@ counter_service = await ForgeCounter.options( procs=1, num_replicas=4 ).as_service(initial_value=0) -# Test basic operations -await counter_service.increment.route() +# WITHOUT SESSIONS: Each .route() call goes to a different replica +await counter_service.increment.route() # Might go to replica 2 +await counter_service.increment.route() # Might go to replica 1 +await counter_service.increment.route() # Might go to replica 3 + results = await counter_service.increment.fanout() # Get from all replicas print(f"All replica values: {results}") +# Output: All replica values: [1, 2, 1, 1] - Each replica has different state! +``` -# STICKY SESSIONS +The problem: each `.route()` call can go to different replicas, creating inconsistent state. + +```python +# WITH SESSIONS: All calls go to the SAME replica print("\nUsing sticky sessions:") -async with counter_service.session(): +async with counter_service.session(): # Creates a session that picks one replica await counter_service.reset.route() # Uses .route() within session print(await counter_service.increment.route()) # 1 - print(await counter_service.increment.route()) # 2 + print(await counter_service.increment.route()) # 2 print(await counter_service.increment.route()) # 3 final_value = await counter_service.get_value.route() print(f"Final value on this replica: {final_value}") # 3 +# Output: +# Using sticky sessions: +# 1 +# 2 +# 3 +# Final value on this replica: 3 + # Same pattern works with Policy for multi-turn conversations: # async with policy.session(): # response1 = await policy.generate.route(turn1) From 78539f0e04749dcd8084ddf411744b2c667a499c Mon Sep 17 00:00:00 2001 From: Sanyam Bhutani Date: Mon, 13 Oct 2025 15:07:27 -0700 Subject: [PATCH 20/28] fix PR tests --- docs/Tutorials/2_Forge_Internals.MD | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/docs/Tutorials/2_Forge_Internals.MD b/docs/Tutorials/2_Forge_Internals.MD index 8189cf8a5..1a9421a96 100644 --- a/docs/Tutorials/2_Forge_Internals.MD +++ b/docs/Tutorials/2_Forge_Internals.MD @@ -120,7 +120,7 @@ responses = await policy.generate.route(prompt=prompt) **API Summary:** - `.route()` - Send request to any healthy replica in a service (load balanced) -- `.call_one()` - Send request to a single actor instance +- `.call_one()` - Send request to a single actor instance - `.fanout()` - Send request to ALL replicas in a service ```mermaid @@ -128,30 +128,30 @@ graph LR subgraph Request["Your Request"] Code["await service.method.ADVERB()"] end - + subgraph Patterns["Communication 
Patterns"] Route[".route()
→ One healthy replica"] CallOne[".call_one()
→ Single actor"] Fanout[".fanout()
→ ALL replicas"] end - + subgraph Replicas["Replicas/Actors"] R1["Replica 1"] R2["Replica 2"] R3["Replica 3"] A1["Actor"] end - + Code --> Route Code --> CallOne Code --> Fanout - + Route --> R2 CallOne --> A1 Fanout --> R1 Fanout --> R2 Fanout --> R3 - + style Route fill:#4CAF50 style CallOne fill:#FF9800 style Fanout fill:#9C27B0 @@ -266,7 +266,7 @@ counter_service = await ForgeCounter.options( # WITHOUT SESSIONS: Each .route() call goes to a different replica await counter_service.increment.route() # Might go to replica 2 -await counter_service.increment.route() # Might go to replica 1 +await counter_service.increment.route() # Might go to replica 1 await counter_service.increment.route() # Might go to replica 3 results = await counter_service.increment.fanout() # Get from all replicas @@ -282,7 +282,7 @@ print("\nUsing sticky sessions:") async with counter_service.session(): # Creates a session that picks one replica await counter_service.reset.route() # Uses .route() within session print(await counter_service.increment.route()) # 1 - print(await counter_service.increment.route()) # 2 + print(await counter_service.increment.route()) # 2 print(await counter_service.increment.route()) # 3 final_value = await counter_service.get_value.route() From aa3d85c97481334baceb86de98227c30b2c6ff82 Mon Sep 17 00:00:00 2001 From: Svetlana Karslioglu Date: Tue, 14 Oct 2025 11:57:42 -0700 Subject: [PATCH 21/28] Convert tutorials to .py --- docs/Tutorials/1_RL_and_Forge_Fundamentals.MD | 395 --------- docs/Tutorials/2_Forge_Internals.MD | 671 --------------- docs/Tutorials/3_Monarch_101.MD | 439 ---------- docs/source/conf.py | 12 +- docs/source/tutorial_sources/README.txt | 5 - .../1_RL_and_Forge_Fundamentals.py | 503 ++++++++++++ .../zero-to-forge/2_Forge_Internals.py | 767 ++++++++++++++++++ .../zero-to-forge/3_Monarch_101.py | 572 +++++++++++++ .../tutorial_sources/zero-to-forge/README.md} | 21 +- docs/source/tutorials.md | 5 +- docs/source/zero-to-forge-intro.md | 28 + 11 files changed, 1897 insertions(+), 1521 deletions(-) delete mode 100644 docs/Tutorials/1_RL_and_Forge_Fundamentals.MD delete mode 100644 docs/Tutorials/2_Forge_Internals.MD delete mode 100644 docs/Tutorials/3_Monarch_101.MD create mode 100644 docs/source/tutorial_sources/zero-to-forge/1_RL_and_Forge_Fundamentals.py create mode 100644 docs/source/tutorial_sources/zero-to-forge/2_Forge_Internals.py create mode 100644 docs/source/tutorial_sources/zero-to-forge/3_Monarch_101.py rename docs/{Tutorials/ReadMe.MD => source/tutorial_sources/zero-to-forge/README.md} (57%) create mode 100644 docs/source/zero-to-forge-intro.md diff --git a/docs/Tutorials/1_RL_and_Forge_Fundamentals.MD b/docs/Tutorials/1_RL_and_Forge_Fundamentals.MD deleted file mode 100644 index 39b6d62aa..000000000 --- a/docs/Tutorials/1_RL_and_Forge_Fundamentals.MD +++ /dev/null @@ -1,395 +0,0 @@ -# Part 1: RL Fundamentals - Using Forge Terminology - -## Core RL Components in Forge - -Let's start with a simple math tutoring example to understand RL concepts with the exact names Forge uses: - -### The Toy Example: Teaching Math - -```mermaid -graph TD - subgraph Example["Math Tutoring RL Example"] - Dataset["Dataset: math problems"] - Policy["Policy: student AI"] - Reward["Reward Model: scores answers"] - Reference["Reference Model: baseline"] - ReplayBuffer["Replay Buffer: stores experiences"] - Trainer["Trainer: improves student"] - end - - Dataset --> Policy - Policy --> Reward - Policy --> Reference - Reward --> ReplayBuffer - Reference --> ReplayBuffer - ReplayBuffer 
--> Trainer - Trainer --> Policy - - style Policy fill:#4CAF50 - style Reward fill:#FF9800 - style Trainer fill:#E91E63 -``` - -### RL Components Defined (Forge Names) - -1. **Dataset**: Provides questions/prompts (like "What is 2+2?") -2. **Policy**: The AI being trained (generates answers like "The answer is 4") -3. **Reward Model**: Evaluates answer quality (gives scores like 0.95) -4. **Reference Model**: Original policy copy (prevents drift from baseline) -5. **Replay Buffer**: Stores experiences (question + answer + score) -6. **Trainer**: Updates the policy weights based on experiences - -### The RL Learning Flow - -```python -# CONCEPTUAL EXAMPLE - see apps/grpo/main.py for GRPO Code - -def conceptual_rl_step(): - # 1. Get a math problem - question = dataset.sample() # "What is 2+2?" - - # 2. Student generates answer - answer = policy.generate(question) # "The answer is 4" - - # 3. Teacher grades it - score = reward_model.evaluate(question, answer) # 0.95 - - # 4. Compare to original student - baseline = reference_model.compute_logprobs(question, answer) - - # 5. Store the experience - experience = Episode(question, answer, score, baseline) - replay_buffer.add(experience) - - # 6. When enough experiences collected, improve student - batch = replay_buffer.sample(curr_policy_version=0) - if batch is not None: - trainer.train_step(batch) # Student gets better! - -# 🔄 See complete working example below with actual Forge service calls -``` - -## From Concepts to Forge Services - -Here's the key insight: **Each RL component becomes a Forge service**. The toy example above maps directly to Forge: - -```mermaid -graph LR - subgraph Concepts["RL Concepts"] - C1["Dataset"] - C2["Policy"] - C3["Reward Model"] - C4["Reference Model"] - C5["Replay Buffer"] - C6["Trainer"] - end - - subgraph Services["Forge Services (Real Classes)"] - S1["DatasetActor"] - S2["Policy"] - S3["RewardActor"] - S4["ReferenceModel"] - S5["ReplayBuffer"] - S6["RLTrainer"] - end - - C1 --> S1 - C2 --> S2 - C3 --> S3 - C4 --> S4 - C5 --> S5 - C6 --> S6 - - style C2 fill:#4CAF50 - style S2 fill:#4CAF50 - style C3 fill:#FF9800 - style S3 fill:#FF9800 -``` - -### RL Step with Forge Services - -Let's look at the example from above again, but this time we would use the names from Forge: - -```python -# Conceptual Example - -async def conceptual_forge_rl_step(services, step): - # 1. Get a math problem - Using actual DatasetActor API - sample = await services['dataloader'].sample.call_one() - question, target = sample["request"], sample["target"] - - # 2. Student generates answer - Using actual Policy API - responses = await services['policy'].generate.route(prompt=question) - answer = responses[0].text - - # 3. Teacher grades it - Using actual RewardActor API - score = await services['reward_actor'].evaluate_response.route( - prompt=question, response=answer, target=target - ) - - # 4. Compare to baseline - Using actual ReferenceModel API - # Note: ReferenceModel.forward requires input_ids, max_req_tokens, return_logprobs - ref_logprobs = await services['ref_model'].forward.route( - input_ids, max_req_tokens, return_logprobs=True - ) - - # 5. Store experience - Using actual Episode structure from apps/grpo/main.py - episode = create_episode_from_response(responses[0], score, ref_logprobs, step) - await services['replay_buffer'].add.call_one(episode) - - # 6. 
Improve student - Using actual training pattern - batch = await services['replay_buffer'].sample.call_one( - curr_policy_version=step - ) - if batch is not None: - inputs, targets = batch - loss = await services['trainer'].train_step.call(inputs, targets) - return loss -``` - -**Key difference**: Same RL logic, but each component is now a distributed, fault-tolerant, auto-scaling service. - -Did you realise-we are not worrying about any Infra code here! Forge Automagically handles the details behind the scenes and you can focus on writing your RL Algorthms! - - -## Why This Matters: Traditional ML Infrastructure Fails - -### The Infrastructure Challenge - -Our simple RL loop above has complex requirements: - -#### Problem 1: Different Resource Needs - -| Component | Resource Needs | Scaling Strategy | -|-----------|----------------|------------------| -| **Policy** (Student AI) | Large GPU memory | Multiple replicas for throughput | -| **Reward Heuristic** (Teacher) | Small compute | CPU or small GPU | -| **Trainer** (Tutor) | Massive GPU compute | Distributed training | -| **Dataset** (Question Bank) | CPU intensive I/O | High memory bandwidth | - -### Problem 2: Complex Interdependencies - -```mermaid -graph LR - A["Policy: Student AI
'What is 2+2?' → 'The answer is 4'"] - B["Reward: Teacher
Scores answer: 0.95"] - C["Reference: Original Student
Provides baseline comparison"] - D["Replay Buffer: Notebook
Stores: question + answer + score"] - E["Trainer: Tutor
Improves student using experiences"] - - A --> B - A --> C - B --> D - C --> D - D --> E - E --> A - - style A fill:#4CAF50 - style B fill:#FF9800 - style C fill:#2196F3 - style D fill:#8BC34A - style E fill:#E91E63 -``` - -Each step has different: -- **Latency requirements**: Policy inference needs low latency (each episode waits), training can batch multiple episodes together -- **Scaling patterns**: Need N policy replicas to keep trainer busy, plus different sharding strategies (tensor parallel for training vs replicated inference) -- **Failure modes**: Any component failure cascades to halt the entire pipeline (Forge prevents this with automatic failover) -- **Resource utilization**: GPUs for inference/training, CPUs for data processing - -### Problem 3: The Coordination Challenge - -Unlike supervised learning where you process independent batches, RL requires coordination: - -```python -# While this does work, it creates bottlenecks and resource waste -def naive_rl_step(): - # Policy waits idle while reward model works - response = policy_model.generate(prompt) # GPU busy - reward = reward_model.evaluate(prompt, response) # Policy GPU idle - - # Training waits for single episode - loss = compute_loss(response, reward) # Batch size = 1, inefficient - - # Everything stops if any component fails - if policy_fails or reward_fails or trainer_fails: - entire_system_stops() -``` - -## Enter Forge: RL-Native Architecture - -Forge solves these problems by treating each RL component as an **independent, distributed unit** - some as fault-tolerant services (like Policy inference where failures are easy to handle), others as actors (like Trainers where recovery semantics differ) - -Let's see how core RL concepts map to Forge components (you'll notice a mix of `.route()` for services and `.call_one()` for actors - we cover when to use each in Part 2): - -**Quick API Reference:** (covered in detail in Part 2: Service Communication Patterns) -- `.route()` - Send request to any healthy replica in a service (load balanced) -- `.call_one()` - Send request to a single actor instance -- `.fanout()` - Send request to ALL replicas in a service - -```python -async def real_rl_training_step(services, step): - """Single RL step using verified Forge APIs""" - - # 1. Environment interaction - Using actual DatasetActor API - sample = await services['dataloader'].sample.call_one() - prompt, target = sample["request"], sample["target"] - - responses = await services['policy'].generate.route(prompt) - - # 2. Reward computation - Using actual RewardActor API - score = await services['reward_actor'].evaluate_response.route( - prompt=prompt, response=responses[0].text, target=target - ) - - # 3. Get reference logprobs - Using actual ReferenceModel API - # Note: ReferenceModel requires full input_ids tensor, not just tokens - input_ids = torch.cat([responses[0].prompt_ids, responses[0].token_ids]) - ref_logprobs = await services['ref_model'].forward.route( - input_ids.unsqueeze(0), max_req_tokens=512, return_logprobs=True - ) - - # 4. Experience storage - Using actual Episode pattern from GRPO - episode = create_episode_from_response(responses[0], score, ref_logprobs, step) - await services['replay_buffer'].add.call_one(episode) - - # 5. 
Learning - Using actual trainer pattern - batch = await services['replay_buffer'].sample.call_one( - curr_policy_version=step - ) - if batch is not None: - inputs, targets = batch # GRPO returns (inputs, targets) tuple - loss = await services['trainer'].train_step.call(inputs, targets) - - # 6. Policy synchronization - Using actual weight update pattern - await services['trainer'].push_weights.call(step + 1) - await services['policy'].update_weights.fanout(step + 1) - - return loss -``` - -**Key insight**: Each line of RL pseudocode becomes a service call. The complexity of distribution, scaling, and fault tolerance is hidden behind these simple interfaces. - -## What Makes This Powerful - -### Automatic Resource Management -```python -responses = await policy.generate.route(prompt=question) -answer = responses[0].text # responses is list[Completion] -``` - -Forge handles behind the scenes: -- Routing to least loaded replica -- GPU memory management -- Batch optimization -- Failure recovery -- Auto-scaling based on demand - -### Independent Scaling -```python - -from forge.actors.policy import Policy -from forge.actors.replay_buffer import ReplayBuffer -from forge.actors.reference_model import ReferenceModel -from forge.actors.trainer import RLTrainer -from apps.grpo.main import DatasetActor, RewardActor, ComputeAdvantages -from forge.data.rewards import MathReward, ThinkingReward -import asyncio -import torch - -model = "Qwen/Qwen3-1.7B" -group_size = 1 - -( - dataloader, - policy, - trainer, - replay_buffer, - compute_advantages, - ref_model, - reward_actor, -) = await asyncio.gather( - # Dataset actor (CPU) - DatasetActor.options(procs=1).as_actor( - path="openai/gsm8k", - revision="main", - data_split="train", - streaming=True, - model=model, - ), - # Policy service with GPU - Policy.options(procs=1, with_gpus=True, num_replicas=1).as_service( - engine_config={ - "model": model, - "tensor_parallel_size": 1, - "pipeline_parallel_size": 1, - "enforce_eager": False - }, - sampling_config={ - "n": group_size, - "max_tokens": 16, - "temperature": 1.0, - "top_p": 1.0 - } - ), - # Trainer actor with GPU - RLTrainer.options(procs=1, with_gpus=True).as_actor( - # Trainer config would come from YAML in real usage - model={"name": "qwen3", "flavor": "1.7B", "hf_assets_path": f"hf://{model}"}, - optimizer={"name": "AdamW", "lr": 1e-5}, - training={"local_batch_size": 2, "seq_len": 2048} - ), - # Replay buffer (CPU) - ReplayBuffer.options(procs=1).as_actor( - batch_size=2, - max_policy_age=1, - dp_size=1 - ), - # Advantage computation (CPU) - ComputeAdvantages.options(procs=1).as_actor(), - # Reference model with GPU - ReferenceModel.options(procs=1, with_gpus=True).as_actor( - model={"name": "qwen3", "flavor": "1.7B", "hf_assets_path": f"hf://{model}"}, - training={"dtype": "bfloat16"} - ), - # Reward actor (CPU) - RewardActor.options(procs=1, num_replicas=1).as_service( - reward_functions=[MathReward(), ThinkingReward()] - ) - ) -``` - -**Forge Components: Services vs Actors** - -Forge has two types of distributed components: -- **Services**: Multiple replicas with automatic load balancing (like Policy, RewardActor) -- **Actors**: Single instances that handle their own internal distribution (like RLTrainer, ReplayBuffer) - -We cover this distinction in detail in Part 2, but for now this explains the scaling patterns: -- Policy service: num_replicas=8 for high inference demand -- RewardActor service: num_replicas=16 for parallel evaluation -- RLTrainer actor: Single instance with internal 
distributed training - - -### Fault Tolerance -```python -# If a policy replica fails: -responses = await policy.generate.route(prompt=question) -answer = responses[0].text -# -> Forge automatically routes to healthy replica -# -> Failed replica respawns in background -# -> No impact on training loop - -# If reward service fails: -score = await reward_actor.evaluate_response.route( - prompt=question, response=answer, target=target -) -``` - -- Retries on different replica automatically -- Graceful degradation if all replicas fail -- System continues (may need application-level handling) - -This is fundamentally different from monolithic RL implementations where any component failure stops everything! - -In the next Section, we will go a layer deeper and learn how ForgeServices work. Continue to [Part 2 here](./2_Forge_Internals.MD) diff --git a/docs/Tutorials/2_Forge_Internals.MD b/docs/Tutorials/2_Forge_Internals.MD deleted file mode 100644 index 1a9421a96..000000000 --- a/docs/Tutorials/2_Forge_Internals.MD +++ /dev/null @@ -1,671 +0,0 @@ -# Part 2: Peeling Back the Abstraction - What Are Services? - -We highly recommend reading [Part 1](./1_RL_and_Forge_Fundamentals.MD) before this, it explains RL Concepts and how they land in Forge. - -Now that you see the power of the service abstraction, let's understand what's actually happening under the hood, Grab your chai! - -## Service Anatomy: Beyond the Interface - -When you call `await policy_service.generate(question)`, here's what actually happens: - -(Don't worry, we will understand Services right in the next section!) - -```mermaid -graph TD - Call["Your Code:
await policy_service.generate"] - - subgraph ServiceLayer["Service Layer"] - Proxy["Service Proxy: Load balancing, Health checking"] - LB["Load Balancer: Replica selection, Circuit breaker"] - end - - subgraph Replicas["Replica Management"] - R1["Replica 1: GPU 0, Healthy"] - R2["Replica 2: GPU 1, Overloaded"] - R3["Replica 3: GPU 2, Failed"] - R4["Replica 4: GPU 3, Healthy"] - end - - subgraph Compute["Actual Computation"] - Actor["Policy Actor: vLLM engine, Model weights, KV cache"] - end - - Call --> Proxy - Proxy --> LB - LB --> R1 - LB -.-> R2 - LB -.-> R3 - LB --> R4 - R1 --> Actor - R4 --> Actor - - style Call fill:#4CAF50 - style LB fill:#FF9800 - style R3 fill:#F44336 - style Actor fill:#9C27B0 -``` - -## Service Components Deep Dive - -### 1. Real Service Configuration - -Here's the actual ServiceConfig from Forge source code: - -```python -# Configuration pattern from apps/grpo/main.py: -Policy.options( - procs=1, # Processes per replica - num_replicas=4, # Number of replicas - with_gpus=True # Allocate GPUs - # Other available options: - # hosts=None # the number of remote hosts used per replica -) -``` - -### 2. Real Service Creation - -Services are created using the `.options().as_service()` pattern from the actual GRPO implementation: - -The service creation automatically handles: -- Spawning actor replicas across processes/GPUs -- Load balancing with .route() method for services -- Health monitoring and failure recovery -- Message routing and serialization - -```python -from forge.actors.policy import Policy - -model = "Qwen/Qwen3-1.7B" - -policy = await Policy.options( - procs=1, - with_gpus=True, - num_replicas=1 -).as_service( - engine_config={ - "model": model, - "tensor_parallel_size": 1, - "pipeline_parallel_size": 1, - "enforce_eager": False - }, - sampling_config={ - "n": 1, - "max_tokens": 16, - "temperature": 1.0, - "top_p": 1.0 - } -) - -prompt = "What is 3 + 5?" -responses = await policy.generate.route(prompt) -print(f"Response: {responses[0].text}") - -# Cleanup when done -await policy.shutdown() -``` - -### 3. How Services Actually Work - -Forge services are implemented as ServiceActors that manage collections of your ForgeActor replicas: - -When you call `.as_service()`, Forge creates a `ServiceInterface` that manages N replicas of your `ForgeActor` class and gives you methods like `.route()`, `.fanout()`, etc. - -```python -# Your code sees this simple interface: -responses = await policy.generate.route(prompt=prompt) -# But Forge handles all the complexity of replica management, load balancing, and fault tolerance -``` - -## Communication Patterns: Quick Reference - -**API Summary:** -- `.route()` - Send request to any healthy replica in a service (load balanced) -- `.call_one()` - Send request to a single actor instance -- `.fanout()` - Send request to ALL replicas in a service - -```mermaid -graph LR - subgraph Request["Your Request"] - Code["await service.method.ADVERB()"] - end - - subgraph Patterns["Communication Patterns"] - Route[".route()
→ One healthy replica"] - CallOne[".call_one()
→ Single actor"] - Fanout[".fanout()
→ ALL replicas"] - end - - subgraph Replicas["Replicas/Actors"] - R1["Replica 1"] - R2["Replica 2"] - R3["Replica 3"] - A1["Actor"] - end - - Code --> Route - Code --> CallOne - Code --> Fanout - - Route --> R2 - CallOne --> A1 - Fanout --> R1 - Fanout --> R2 - Fanout --> R3 - - style Route fill:#4CAF50 - style CallOne fill:#FF9800 - style Fanout fill:#9C27B0 -``` - -## Deep Dive: Service Communication Patterns - -These communication patterns (\"adverbs\") determine how your service calls are routed to replicas. Understanding when to use each pattern is key to effective Forge usage. - -### 1. `.route()` - Load Balanced Single Replica - -**When to use**: Normal request routing where any replica can handle the request. - -```python -responses = await policy.generate.route(prompt=question) -answer = responses[0].text # Extract text from Completion object -``` - -Behind the scenes: -1. Health check eliminates failed replicas -2. Load balancer picks replica (currently round robin, configurable balancers coming soon) -3. Request routes to that specific replica -4. Automatic retry on different replica if failure - -**Performance characteristics**: -- **Latency**: Lowest (single network hop) -- **Throughput**: Limited by single replica capacity -- **Fault tolerance**: Automatic failover to other replicas - -**Critical insight**: `.route()` is your default choice for stateless operations in Forge services. - -### 2. `.fanout()` - Broadcast with Results Collection - -**When to use**: You need responses from ALL replicas. - -```python -# Get version from all policy replicas -current_versions = await policy.get_version.fanout() -# Returns: [version_replica_1, version_replica_2, ...] - -# Update weights on all replicas -await policy.update_weights.fanout(new_policy_version) -# Broadcasts to all replicas simultaneously -``` - -**Performance characteristics**: -- **Latency**: Slowest replica determines total latency -- **Throughput**: Network bandwidth × number of replicas -- **Fault tolerance**: Fails if ANY replica fails (unless configured otherwise) - -**Critical gotcha**: Don't use `.fanout()` for high-frequency operations - it contacts all replicas. - -### 3. Streaming Operations - Custom Implementation Pattern - -**When to use**: You want to process results as they arrive, not wait for all. - -```python -# CONCEPTUAL - Streaming requires custom implementation in your training loop -# The basic ReplayBuffer doesn't have built-in streaming methods -# Pattern from apps/grpo/main.py continuous training: - -while training: - # This is the real API call pattern - batch = await replay_buffer.sample.call_one(curr_policy_version=step) - if batch is not None: - # Process batch immediately - loss = await trainer.train_step.call_one(batch) - print(f"Training loss: {loss}") - else: - await asyncio.sleep(0.1) # Wait for more data -``` - -**Performance characteristics**: -- **Latency**: Process first result immediately -- **Throughput**: Non-blocking async operations (much higher than waiting for full batches) -- **Fault tolerance**: Continues if some replicas fail - -**Critical insight**: This is essential for high-throughput RL where you can't wait for batches. - -### 3. Service Sessions for Stateful Operations - -**When to use**: When you need multiple calls to hit the same replica (like KV cache preservation). - -**What are sticky sessions?** A session ensures all your service calls within the `async with` block go to the same replica, instead of being load-balanced across different replicas. 
- -```python -# This Counter example demonstrates the difference between regular routing and sessions - -from forge.controller import ForgeActor -from monarch.actor import endpoint - -class ForgeCounter(ForgeActor): - def __init__(self, initial_value: int): - self.value = initial_value - - @endpoint - def increment(self) -> int: - self.value += 1 - return self.value - - @endpoint - def get_value(self) -> int: - return self.value - - @endpoint - async def reset(self): - self.value = 0 - -counter_service = await ForgeCounter.options( - procs=1, num_replicas=4 -).as_service(initial_value=0) - -# WITHOUT SESSIONS: Each .route() call goes to a different replica -await counter_service.increment.route() # Might go to replica 2 -await counter_service.increment.route() # Might go to replica 1 -await counter_service.increment.route() # Might go to replica 3 - -results = await counter_service.increment.fanout() # Get from all replicas -print(f"All replica values: {results}") -# Output: All replica values: [1, 2, 1, 1] - Each replica has different state! -``` - -The problem: each `.route()` call can go to different replicas, creating inconsistent state. - -```python -# WITH SESSIONS: All calls go to the SAME replica -print("\nUsing sticky sessions:") -async with counter_service.session(): # Creates a session that picks one replica - await counter_service.reset.route() # Uses .route() within session - print(await counter_service.increment.route()) # 1 - print(await counter_service.increment.route()) # 2 - print(await counter_service.increment.route()) # 3 - - final_value = await counter_service.get_value.route() - print(f"Final value on this replica: {final_value}") # 3 - -# Output: -# Using sticky sessions: -# 1 -# 2 -# 3 -# Final value on this replica: 3 - -# Same pattern works with Policy for multi-turn conversations: -# async with policy.session(): -# response1 = await policy.generate.route(turn1) -# full_prompt = turn1 + response1[0].text + turn2 -# response2 = await policy.generate.route(full_prompt) -# # Both calls hit same replica, preserving KV cache - -# Cleanup -await counter_service.shutdown() -``` - -**Performance impact**: Critical for maintaining KV cache in multi-turn conversations. - -## Deep Dive: State Management Reality - -The most complex challenge in distributed RL is maintaining state consistency while maximizing performance. - -### The KV Cache Problem - -**The challenge**: Policy inference is much faster with KV cache, but cache is tied to specific conversation history. - -```python -# This breaks KV cache optimization: -async def naive_multi_turn(): - # Each call might go to different replica = cache miss - response1 = await policy_service.generate.choose(question1) - response2 = await policy_service.generate.choose(question1 + response1) # Cache miss! - response3 = await policy_service.generate.choose(conversation_so_far) # Cache miss! -``` - -**The solution**: Sticky sessions ensure all calls go to same replica. - -```python -async def optimized_multi_turn(): - async with policy.session(): - # All calls guaranteed to hit same replica = cache hits - response1 = await policy.generate.route(prompt=question1) - full_prompt = question1 + response1[0].text - response2 = await policy.generate.route(prompt=full_prompt) # Cache hit! - conversation = full_prompt + response2[0].text - response3 = await policy.generate.route(prompt=conversation) # Cache hit! 
- - # Session ends, replica can be garbage collected or reused -``` - -**Performance impact**: Maintaining KV cache across turns avoids recomputing previous tokens. - -### Replay Buffer Consistency - -**The challenge**: Multiple trainers and experience collectors reading/writing concurrently. - -**Real Forge approach**: The ReplayBuffer actor handles concurrency internally: - -```python -# Forge ReplayBuffer endpoints (verified from source code) -# Add episodes (thread-safe by actor model) -await replay_buffer.add.call_one(episode) # .choose() would work too, but .call_one() clarifies it's a singleton actor not ActorMesh - -# Sample batches for training -batch = await replay_buffer.sample.call_one( - curr_policy_version=step_number, - batch_size=None # Optional parameter, uses default from config -) - -# Additional methods available: -# await replay_buffer.clear.call_one() # Clear buffer -# await replay_buffer.evict.call_one(curr_policy_version) # Remove old episodes -# state = await replay_buffer.state_dict.call_one() # Get state for checkpointing -``` - -**Critical insight**: The actor model provides natural thread safety - each actor processes messages sequentially. - -### Weight Synchronization Strategy - -**The challenge**: Trainer updates policy weights, but policy service needs those weights. - -```python -# Forge weight synchronization pattern from apps/grpo/main.py -async def real_weight_sync(trainer, policy, step): - # Trainer pushes weights to TorchStore with version number - await trainer.push_weights.call_one(policy_version=step + 1) - - # Policy service updates to new version from TorchStore - # Use .fanout() to update ALL policy replicas - await policy.update_weights.fanout(policy_version=step + 1) - -# Check current policy version -current_version = await policy.get_version.route() -print(f"Current policy version: {current_version}") -``` - -## Deep Dive: Asynchronous Coordination Patterns - -**The real challenge**: Different services run at different speeds, but Forge's service abstraction handles the coordination complexity. 
- -### The Forge Approach: Let Services Handle Coordination - -Instead of manual coordination, Forge services handle speed mismatches automatically: - -```python -from apps.grpo.main import Episode, Group - -async def simple_rl_step(): - - # ===== Generate a rollout ===== - sample = await dataloader.sample.call_one() # DatasetActor is an actor, not service - prompt, target = sample["request"], sample["target"] # Correct field names - - print(f"Prompt: {prompt}") - print(f"Target: {target}") - - actions = await policy.generate.route(prompt=prompt) # Policy is a service - print(f"Policy response: {actions[0].text}") - - # Create input tensor for reference model (requires full context) - input_ids = torch.cat([actions[0].prompt_ids, actions[0].token_ids]) - ref_logprobs = await ref_model.forward.route( - input_ids.unsqueeze(0), max_req_tokens=512, return_logprobs=True - ) - reward = await reward_actor.evaluate_response.route( # RewardActor is a service - prompt=prompt, - response=actions[0].text, - target=target - ) - print(f"Reward: {reward}") - - # Create episode using actual GRPO Episode structure - episode = Episode( - episode_id="0", - request=prompt, - policy_version=0, - pad_id=tokenizer.pad_token_id, - request_len=512, - response_len=512, - target=target - ) - - # Add response data - episode.response = actions[0].text - episode.request_tokens = actions[0].prompt_ids.tolist() - episode.response_tokens = actions[0].token_ids.tolist() - episode.ref_logprobs = ref_logprobs[0] # Extract from batch dimension - episode.reward = reward - - # Compute advantages using actual ComputeAdvantages actor - group = Group.new_group(0, 1, prompt, 0, tokenizer.pad_token_id, 512, 512, target) - group.episodes[0] = episode - advantages = await compute_advantages.compute.call_one(group) # ComputeAdvantages is an actor - episode.advantage = advantages[0] - print(f"Advantage: {advantages[0]}") - await replay_buffer.add.call_one(episode) # ReplayBuffer is an actor - print("Episode stored in replay buffer") - - # ===== Train on the batch ===== - batch = await replay_buffer.sample.call_one(curr_policy_version=0) - if batch is not None: - print("Training on batch...") - inputs, targets = batch # GRPO returns (inputs, targets) tuple - loss = await trainer.train_step.call(inputs, targets) # RLTrainer is an actor - print(f"Training loss: {loss}") - return loss - else: - print("Not enough data in buffer yet") - return None - -# Note: This simplified example assumes tokenizer and services are already initialized -for step in range(10): - print(f"\n--- RL Step {step + 1} ---") - loss = await simple_rl_step() - if loss: - print(f"Step {step + 1} complete, loss: {loss:.4f}") - else: - print(f"Step {step + 1} complete, building buffer...") -``` - -### Handling Speed Mismatches with Service Scaling - -**The insight**: Scale services independently based on their bottlenecks. 
- -```python -# Scale fast services with more replicas -policy = await Policy.options( - procs=1, num_replicas=8, with_gpus=True # Many replicas for high throughput -).as_service( - engine_config={"model": model_name, "tensor_parallel_size": 1} -) - -# Reward evaluation might be CPU-bound -reward_actor = await RewardActor.options( - procs=1, num_replicas=16, with_gpus=False # More CPU replicas -).as_service( - reward_functions=[MathReward()] -) - -# Training needs fewer but more powerful replicas -trainer = await RLTrainer.options( - procs=1, with_gpus=True # Fewer but GPU-heavy -).as_actor( # Trainer typically uses .as_actor() not .as_service() - model={"name": "qwen3", "flavor": "1.7B"}, - optimizer={"name": "AdamW", "lr": 1e-5} -) -``` - -## Service Implementation Example - -Let's see how a reward service is actually implemented: - -```python -# Exact RewardActor from apps/grpo/main.py - -from forge.controller import ForgeActor -from monarch.actor import endpoint -from forge.data.rewards import MathReward, ThinkingReward - -# class definition from apps/grpo/main.py -class RewardActor(ForgeActor): - def __init__(self, reward_functions: list): - self.reward_functions = reward_functions - - @endpoint - async def evaluate_response(self, prompt: str, response: str, target: str) -> float: - """Evaluate response quality using multiple reward functions""" - total_reward = 0.0 - - for reward_fn in self.reward_functions: - # Each reward function contributes to total score - reward = reward_fn(prompt, response, target) - total_reward += reward - - # Return average reward across all functions - return total_reward / len(self.reward_functions) if self.reward_functions else 0.0 - -reward_actor = await RewardActor.options( - procs=1, num_replicas=1 -).as_service( - reward_functions=[MathReward(), ThinkingReward()] -) - -prompt = "What is 15% of 240?" 
-response = "15% of 240 is 36" -target = "36" - -score = await reward_actor.evaluate_response.route( - prompt=prompt, - response=response, - target=target -) -print(f"Reward score: {score}") # Usually around 1.0 for correct math answers - -# For production scaling - increase num_replicas for parallel evaluation: -# RewardActor.options(procs=1, num_replicas=16) # 16 parallel evaluators - -# Cleanup when done -await reward_actor.shutdown() -``` - -## Service Orchestration: The Training Loop - -Now let's see how services coordinate in a real training loop: - -```python -# This is the REAL way production RL systems are built with Forge - -import asyncio -import torch -from forge.actors.policy import Policy -from forge.actors.reference_model import ReferenceModel -from forge.actors.replay_buffer import ReplayBuffer -from forge.actors.trainer import RLTrainer -from apps.grpo.main import DatasetActor, RewardActor, ComputeAdvantages -from forge.data.rewards import MathReward, ThinkingReward - -# Service creation pattern from apps/grpo/main.py lines 322-344 -print("Initializing all services...") -( - dataloader, - policy, - trainer, - replay_buffer, - compute_advantages, - ref_model, - reward_actor, -) = await asyncio.gather( - DatasetActor.options(procs=1).as_actor( - path="openai/gsm8k", revision="main", data_split="train", - streaming=True, model="Qwen/Qwen3-1.7B" - ), - Policy.options(procs=1, with_gpus=True, num_replicas=1).as_service( - engine_config={"model": "Qwen/Qwen3-1.7B", "tensor_parallel_size": 1}, - sampling_config={"n": 1, "max_tokens": 512} - ), - RLTrainer.options(procs=1, with_gpus=True).as_actor( - model={"name": "qwen3", "flavor": "1.7B", "hf_assets_path": "hf://Qwen/Qwen3-1.7B"}, - optimizer={"name": "AdamW", "lr": 1e-5}, - training={"local_batch_size": 2, "seq_len": 2048} - ), - ReplayBuffer.options(procs=1).as_actor( - batch_size=2, max_policy_age=1, dp_size=1 - ), - ComputeAdvantages.options(procs=1).as_actor(), - ReferenceModel.options(procs=1, with_gpus=True).as_actor( - model={"name": "qwen3", "flavor": "1.7B", "hf_assets_path": "hf://Qwen/Qwen3-1.7B"} - ), - RewardActor.options(procs=1, num_replicas=1).as_service( - reward_functions=[MathReward(), ThinkingReward()] - ), -) - -print("All services initialized successfully!") - -async def production_training_loop(): - """Real training loop pattern from apps/grpo/main.py""" - step = 0 - - while True: - # Data generation - sample = await dataloader.sample.call_one() - - # Policy generation service call - responses = await policy.generate.route(sample["request"]) # Correct field name - - # Reference computation service call (requires full input tensor) - input_ids = torch.cat([responses[0].prompt_ids, responses[0].token_ids]) - ref_logprobs = await ref_model.forward.route( - input_ids.unsqueeze(0), max_req_tokens=512, return_logprobs=True - ) - - # Reward evaluation service call - reward = await reward_actor.evaluate_response.route( - prompt=sample["question"], - response=responses[0].text, - target=sample["answer"] - ) - - # Experience storage (using actual Episode structure) - episode = create_episode_from_grpo_data(sample, responses[0], reward, ref_logprobs[0], step) - await replay_buffer.add.call_one(episode) - - # Training when ready - batch = await replay_buffer.sample.call_one(curr_policy_version=step) - if batch is not None: - inputs, targets = batch # GRPO returns (inputs, targets) tuple - loss = await trainer.train_step.call(inputs, targets) - - # Weight synchronization pattern - await trainer.push_weights.call(step 
+ 1) - await policy.update_weights.fanout(step + 1) # Fanout to all replicas - - print(f"Step {step}, Loss: {loss:.4f}") - step += 1 - -print("Shutting down services...") -await asyncio.gather( - DatasetActor.shutdown(dataloader), - policy.shutdown(), - RLTrainer.shutdown(trainer), - ReplayBuffer.shutdown(replay_buffer), - ComputeAdvantages.shutdown(compute_advantages), - ReferenceModel.shutdown(ref_model), - reward_actor.shutdown(), -) -print("All services shut down successfully!") -``` - -**Key observations:** -1. **Parallelism**: Independent operations run concurrently -2. **Load balancing**: Each `.route()` call automatically selects optimal replica -3. **Fault tolerance**: Failures automatically retry on different replicas -4. **Resource efficiency**: CPU and GPU services scale independently -5. **Coordination**: Services coordinate through shared state (replay buffer, weight versions) - -This is the power of the service abstraction - complex distributed coordination looks like simple async Python code. - -In the next part we will learn about [Monarch internals](./3_Monarch_101.MD) diff --git a/docs/Tutorials/3_Monarch_101.MD b/docs/Tutorials/3_Monarch_101.MD deleted file mode 100644 index 52bdb17d0..000000000 --- a/docs/Tutorials/3_Monarch_101.MD +++ /dev/null @@ -1,439 +0,0 @@ -# Part 3: The Forge-Monarch Connection - -This is part 3 of our series, in the previous sections: we learned Part 1: [RL Concepts and how they map to Forge](./1_RL_and_Forge_Fundamentals.MD), Part 2: [Forge Internals](./2_Forge_Internals.MD). - -Now let's peel back the layers. Forge services are built on top of **Monarch**, PyTorch's distributed actor framework. Understanding this connection is crucial for optimization and debugging. - -## The Complete Hierarchy: Service to Silicon - -```mermaid -graph TD - subgraph YourCode["1. Your RL Code"] - Call["await policy_service.generate.route('What is 2+2?')"] - end - - subgraph ForgeServices["2. Forge Service Layer"] - ServiceInterface["ServiceInterface: Routes requests, Load balancing, Health checks"] - ServiceActor["ServiceActor: Manages replicas, Monitors health, Coordinates failures"] - end - - subgraph MonarchLayer["3. Monarch Actor Layer"] - ActorMesh["ActorMesh PolicyActor: 4 instances, Different GPUs, Message passing"] - ProcMesh["ProcMesh: 4 processes, GPU topology 0,1,2,3, Network interconnect"] - end - - subgraph Hardware["4. Physical Hardware"] - GPU0["GPU 0: PolicyActor #1, vLLM Engine, Model Weights"] - GPU1["GPU 1: PolicyActor #2, vLLM Engine, Model Weights"] - GPU2["GPU 2: PolicyActor #3, vLLM Engine, Model Weights"] - GPU3["GPU 3: PolicyActor #4, vLLM Engine, Model Weights"] - end - - Call --> ServiceInterface - ServiceInterface --> ServiceActor - ServiceActor --> ActorMesh - ActorMesh --> ProcMesh - ProcMesh --> GPU0 - ProcMesh --> GPU1 - ProcMesh --> GPU2 - ProcMesh --> GPU3 - - style Call fill:#4CAF50 - style ServiceActor fill:#FF9800 - style ActorMesh fill:#9C27B0 - style ProcMesh fill:#2196F3 -``` - -## Deep Dive: ProcMesh - The Foundation - -**ProcMesh** is Monarch's core abstraction for organizing processes across hardware. Think of it as a multi-dimensional grid that maps directly to your cluster topology. - -### Single Host ProcMesh - -```mermaid -graph TD - subgraph Host["Single Host (8 GPUs)"] - subgraph ProcMesh["ProcMesh: per_host={'gpus': 8}"] - P0["Process 0
GPU 0"] - P1["Process 1
GPU 1"] - P2["Process 2
GPU 2"] - P3["Process 3
GPU 3"] - P4["Process 4
GPU 4"] - P5["Process 5
GPU 5"] - P6["Process 6
GPU 6"] - P7["Process 7
GPU 7"] - end - - P0 -.->|"Network"| P1 - P1 -.->|"Network"| P2 - P2 -.->|"Network"| P3 - P3 -.->|"Network"| P4 - P4 -.->|"Network"| P5 - P5 -.->|"Network"| P6 - P6 -.->|"Network"| P7 - P7 -.->|"Network"| P0 - end - - style P0 fill:#F44336 - style P1 fill:#F44336 - style P2 fill:#F44336 - style P3 fill:#F44336 - style P4 fill:#F44336 - style P5 fill:#F44336 - style P6 fill:#F44336 - style P7 fill:#F44336 -``` - -### Multi-Host ProcMesh - -```mermaid -graph TD - subgraph Cluster["Multi-Host Cluster"] - subgraph Host1["Host 1"] - subgraph PM1["ProcMesh Segment 1"] - H1P0["Process 0
GPU 0"] - H1P1["Process 1
GPU 1"] - H1P2["Process 2
GPU 2"] - H1P3["Process 3
GPU 3"] - end - end - - subgraph Host2["Host 2"] - subgraph PM2["ProcMesh Segment 2"] - H2P0["Process 4
GPU 0"] - H2P1["Process 5
GPU 1"] - H2P2["Process 6
GPU 2"] - H2P3["Process 7
GPU 3"] - end - end - - subgraph Host3["Host 3"] - subgraph PM3["ProcMesh Segment 3"] - H3P0["Process 8
GPU 0"] - H3P1["Process 9
GPU 1"] - H3P2["Process 10
GPU 2"] - H3P3["Process 11
GPU 3"] - end - end - end - - H1P0 -.->|"InfiniBand"| H2P0 - H1P1 -.->|"InfiniBand"| H2P1 - H2P0 -.->|"InfiniBand"| H3P0 - H2P1 -.->|"InfiniBand"| H3P1 - - style PM1 fill:#F44336 - style PM2 fill:#4CAF50 - style PM3 fill:#2196F3 -``` - -```python -# This shows the underlying actor system that powers Forge services - -from monarch.actor import Actor, endpoint, this_proc, Future -from monarch.actor import ProcMesh, this_host -import asyncio - -# STEP 1: Define a basic actor -class Counter(Actor): - def __init__(self, initial_value: int): - self.value = initial_value - - @endpoint - def increment(self) -> None: - self.value += 1 - - @endpoint - def get_value(self) -> int: - return self.value - -# STEP 2: Single actor in local process -counter: Counter = this_proc().spawn("counter", Counter, initial_value=0) - -# STEP 3: Send messages -fut: Future[int] = counter.get_value.call_one() -value = await fut -print(f"Counter value: {value}") # 0 - -# STEP 4: Multiple actors across processes -procs: ProcMesh = this_host().spawn_procs(per_host={"gpus": 8}) -counters: Counter = procs.spawn("counters", Counter, 0) - -# STEP 5: Broadcast to all actors -await counters.increment.call() - -# STEP 6: Different message patterns -# call_one() - single actor -value = await counters.get_value.call_one() -print(f"One counter: {value}") - -# choose() - random single actor (actors only, not services) -value = await counters.get_value.choose() -print(f"Random counter: {value}") - -# call() - all actors, collect results -values = await counters.get_value.call() -print(f"All counters: {values}") - -# broadcast() - fire and forget -await counters.increment.broadcast() - -# Cleanup -await procs.stop() -``` - -## Actor Meshes: Your Code Running Distributed - -**ActorMesh** is created when you spawn actors across a ProcMesh. Each process in the ProcMesh gets one instance of your actor. - -```mermaid -graph TD - subgraph Creation["Actor Creation Process"] - Code["mesh.spawn('policy', PolicyActor, model='Qwen/Qwen3-7B')"] - - subgraph ProcMesh["ProcMesh (4 processes)"] - P0["Process 0
GPU 0"] - P1["Process 1
GPU 1"] - P2["Process 2
GPU 2"] - P3["Process 3
GPU 3"] - end - - subgraph ActorMesh["ActorMesh PolicyActor"] - A0["PolicyActor Instance #0: model=Qwen/Qwen3-7B"] - A1["PolicyActor Instance #1: model=Qwen/Qwen3-7B"] - A2["PolicyActor Instance #2: model=Qwen/Qwen3-7B"] - A3["PolicyActor Instance #3: model=Qwen/Qwen3-7B"] - end - - Code --> ProcMesh - P0 --> A0 - P1 --> A1 - P2 --> A2 - P3 --> A3 - end - - style A0 fill:#4CAF50 - style A1 fill:#4CAF50 - style A2 fill:#4CAF50 - style A3 fill:#4CAF50 -``` - -### Message Routing Through ActorMesh - -```mermaid -graph TD - subgraph MessageFlow["Message Flow Patterns"] - Client["await policy_actors.generate.METHOD(prompt)"] - - subgraph Methods["Different Adverbs Route Differently"] - Choose["choose(): Routes to ONE actor, Load balanced"] - Call["call(): Routes to ALL actors, Collects results"] - Broadcast["broadcast(): Routes to ALL actors, Fire and forget"] - Stream["stream(): Routes to ALL actors, Iterator of results"] - end - - subgraph ActorInstances["PolicyActor Instances"] - A0["Actor 0: GPU 0, generates response"] - A1["Actor 1: GPU 1, generates response"] - A2["Actor 2: GPU 2, generates response"] - A3["Actor 3: GPU 3, generates response"] - end - - Client --> Choose - Client --> Call - Client --> Broadcast - Client --> Stream - - Choose -.->|"Load balanced"| A1 - Call --> A0 - Call --> A1 - Call --> A2 - Call --> A3 - Broadcast --> A0 - Broadcast --> A1 - Broadcast --> A2 - Broadcast --> A3 - Stream --> A0 - Stream --> A1 - Stream --> A2 - Stream --> A3 - end - - style Choose fill:#4CAF50 - style Call fill:#FF9800 - style Broadcast fill:#E91E63 - style Stream fill:#9C27B0 -``` - -## How Forge Services Use Monarch - -Now the key insight: **Forge services are ServiceActors that manage ActorMeshes of your ForgeActor replicas**. - -### The Service Creation Process - -```mermaid -graph TD - subgraph ServiceCreation["Service Creation Process"] - Call["await PolicyActor.options(num_replicas=4, procs=1).as_service(model='Qwen')"] - - ServiceActor["ServiceActor: Manages 4 replicas, Health checks, Routes calls"] - - subgraph Replicas["4 Independent Replicas"] - subgraph R0["Replica 0"] - PM0["ProcMesh: 1 process, GPU 0"] - AM0["ActorMesh
1 PolicyActor"] - end - - subgraph R1["Replica 1"] - PM1["ProcMesh: 1 process, GPU 1"] - AM1["ActorMesh
1 PolicyActor"] - end - - subgraph R2["Replica 2"] - PM2["ProcMesh: 1 process, GPU 2"] - AM2["ActorMesh
1 PolicyActor"] - end - - subgraph R3["Replica 3"] - PM3["ProcMesh: 1 process, GPU 3"] - AM3["ActorMesh
1 PolicyActor"] - end - end - - Call --> ServiceActor - ServiceActor --> R0 - ServiceActor --> R1 - ServiceActor --> R2 - ServiceActor --> R3 - PM0 --> AM0 - PM1 --> AM1 - PM2 --> AM2 - PM3 --> AM3 - end - - style ServiceActor fill:#FF9800 - style AM0 fill:#4CAF50 - style AM1 fill:#4CAF50 - style AM2 fill:#4CAF50 - style AM3 fill:#4CAF50 -``` - -### Service Call to Actor Execution - -```mermaid -graph TD - subgraph CallFlow["Complete Call Flow"] - UserCall["await policy_service.generate.route('What is 2+2?')"] - - ServiceInterface["ServiceInterface: Receives .route() call, Routes to ServiceActor"] - - ServiceActor["ServiceActor: Selects healthy replica, Load balancing, Failure handling"] - - SelectedReplica["Selected Replica #2: ProcMesh 1 process, ActorMesh 1 PolicyActor"] - - PolicyActor["PolicyActor Instance: Loads model, Runs vLLM inference"] - - GPU["GPU 2: vLLM engine, Model weights, KV cache, CUDA kernels"] - - UserCall --> ServiceInterface - ServiceInterface --> ServiceActor - ServiceActor --> SelectedReplica - SelectedReplica --> PolicyActor - PolicyActor --> GPU - - GPU -.->|"Response"| PolicyActor - PolicyActor -.->|"Response"| SelectedReplica - SelectedReplica -.->|"Response"| ServiceActor - ServiceActor -.->|"Response"| ServiceInterface - ServiceInterface -.->|"'The answer is 4'"| UserCall - end - - style UserCall fill:#4CAF50 - style ServiceActor fill:#FF9800 - style PolicyActor fill:#9C27B0 - style GPU fill:#FF5722 -``` - -## Multiple Services Sharing Infrastructure - -In real RL systems, you have multiple services that can share or use separate ProcMeshes: - -```mermaid -graph TD - subgraph Cluster["RL Training Cluster"] - subgraph Services["Forge Services"] - PS["Policy Service
4 GPU replicas"] - TS["Trainer Service
2 GPU replicas"] - RS["Reward Service
4 CPU replicas"] - BS["Buffer Service
1 CPU replica"] - end - - subgraph MonarchInfra["Monarch Infrastructure"] - subgraph GPUMesh["GPU ProcMesh (6 processes)"] - G0["Process 0
GPU 0"] - G1["Process 1
GPU 1"] - G2["Process 2
GPU 2"] - G3["Process 3
GPU 3"] - G4["Process 4
GPU 4"] - G5["Process 5
GPU 5"] - end - - subgraph CPUMesh["CPU ProcMesh (5 processes)"] - C0["Process 0
CPU"] - C1["Process 1
CPU"] - C2["Process 2
CPU"] - C3["Process 3
CPU"] - C4["Process 4
CPU"] - end - end - - PS --> G0 - PS --> G1 - PS --> G2 - PS --> G3 - TS --> G4 - TS --> G5 - RS --> C0 - RS --> C1 - RS --> C2 - RS --> C3 - BS --> C4 - end - - style PS fill:#4CAF50 - style TS fill:#E91E63 - style RS fill:#FF9800 - style BS fill:#9C27B0 - style GPUMesh fill:#FFEBEE - style CPUMesh fill:#E3F2FD -``` - -## Key Insights: Why This Architecture Matters - -1. **Process Isolation**: Each actor runs in its own process - failures don't cascade -2. **Location Transparency**: Actors can be local or remote with identical APIs -3. **Structured Distribution**: ProcMesh maps directly to hardware topology -4. **Message Passing**: No shared memory means no race conditions or locks -5. **Service Abstraction**: Forge hides Monarch complexity while preserving power - -Understanding this hierarchy helps you: -- **Debug performance issues**: Is the bottleneck at service, actor, or hardware level? -- **Optimize resource usage**: How many replicas per service? GPU vs CPU processes? -- **Handle failures gracefully**: Which layer failed and how to recover? -- **Scale effectively**: Where to add resources for maximum impact? - -# Conclusion - -## What You've Learned - -1. **RL Fundamentals**: How RL concepts map to Forge services with REAL, working examples -2. **Service Abstraction**: How to use Forge services effectively with verified communication patterns -3. **Monarch Foundation**: How Forge services connect to distributed actors and hardware - -## Key Takeaways - -- **Services hide complexity**: Your RL code looks like simple async functions, but runs on distributed clusters -- **Communication patterns matter**: `.route()`, `.fanout()`, sessions, and `.call_one()` each serve specific purposes -- **Architecture understanding helps**: Knowing the Service → Actor → Process → Hardware hierarchy helps you debug, optimize, and scale -- **Always verify APIs**: This guide is verified, but cross-check with source code for latest changes -- **Real API patterns**: Use `.options().as_service()` not `spawn_service()`, use `.route()` not `.choose()`, etc. 
diff --git a/docs/source/conf.py b/docs/source/conf.py index 4e3cec1fa..179a32437 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -15,6 +15,7 @@ import sys import pytorch_sphinx_theme2 +from sphinx_gallery.sorting import ExplicitOrder, FileNameSortKey # Add the source directory to Python path so modules can be imported sys.path.insert(0, os.path.abspath("../../src/forge")) @@ -82,7 +83,12 @@ def get_version_path(): "_templates", os.path.join(os.path.dirname(pytorch_sphinx_theme2.__file__), "templates"), ] -exclude_patterns = ["tutorials/index.rst", "tutorials/template_tutorial.rst"] +exclude_patterns = [ + "tutorials/index.rst", + "tutorials/template_tutorial.rst", + "tutorials/**/index.rst", + "tutorial_sources/**/README.md", +] html_static_path = ["_static"] html_css_files = ["custom.css"] @@ -204,7 +210,7 @@ def get_version_path(): sphinx_gallery_conf = { "examples_dirs": "tutorial_sources", # Path to examples directory "gallery_dirs": "tutorials", # Path to generate gallery - "filename_pattern": ".*", # Include all files + "filename_pattern": ".*", # Match all Python files "download_all_examples": False, "first_notebook_cell": "%matplotlib inline", "plot_gallery": "True", @@ -212,6 +218,8 @@ def get_version_path(): "backreferences_dir": None, "show_signature": False, "write_computation_times": False, + "subsection_order": ExplicitOrder(["tutorial_sources/zero-to-forge"]), + "within_subsection_order": FileNameSortKey, } diff --git a/docs/source/tutorial_sources/README.txt b/docs/source/tutorial_sources/README.txt index 1fadb0a08..e69de29bb 100644 --- a/docs/source/tutorial_sources/README.txt +++ b/docs/source/tutorial_sources/README.txt @@ -1,5 +0,0 @@ -Tutorials -========= - -This gallery contains tutorials and examples to help you get started with Forge. -Each tutorial demonstrates specific features and use cases with practical examples. diff --git a/docs/source/tutorial_sources/zero-to-forge/1_RL_and_Forge_Fundamentals.py b/docs/source/tutorial_sources/zero-to-forge/1_RL_and_Forge_Fundamentals.py new file mode 100644 index 000000000..08f7193c0 --- /dev/null +++ b/docs/source/tutorial_sources/zero-to-forge/1_RL_and_Forge_Fundamentals.py @@ -0,0 +1,503 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +""" +Part 1: RL Fundamentals - Using Forge Terminology +================================================== + +**Author:** `Sanyam Bhutani `_ + +.. grid:: 2 + + .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn + :class-card: card-prerequisites + + * Core RL components in Forge + * How RL concepts map to Forge services + * The RL training loop with Forge APIs + * Forge's distributed architecture benefits + + .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites + :class-card: card-prerequisites + + * Understanding of basic RL concepts + * Familiarity with Python async/await + * PyTorch experience recommended +""" + +######################################################################### +# Core RL Components in Forge +# ---------------------------- +# +# Let's start with a simple math tutoring example to understand RL concepts +# with the exact names Forge uses: +# +# The Toy Example: Teaching Math +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# .. 
mermaid:: +# +# graph TD +# subgraph Example["Math Tutoring RL Example"] +# Dataset["Dataset: math problems"] +# Policy["Policy: student AI"] +# Reward["Reward Model: scores answers"] +# Reference["Reference Model: baseline"] +# ReplayBuffer["Replay Buffer: stores experiences"] +# Trainer["Trainer: improves student"] +# end +# +# Dataset --> Policy +# Policy --> Reward +# Policy --> Reference +# Reward --> ReplayBuffer +# Reference --> ReplayBuffer +# ReplayBuffer --> Trainer +# Trainer --> Policy +# +# style Policy fill:#4CAF50 +# style Reward fill:#FF9800 +# style Trainer fill:#E91E63 +# +# RL Components Defined (Forge Names) +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# 1. **Dataset**: Provides questions/prompts (like "What is 2+2?") +# 2. **Policy**: The AI being trained (generates answers like "The answer is 4") +# 3. **Reward Model**: Evaluates answer quality (gives scores like 0.95) +# 4. **Reference Model**: Original policy copy (prevents drift from baseline) +# 5. **Replay Buffer**: Stores experiences (question + answer + score) +# 6. **Trainer**: Updates the policy weights based on experiences + +###################################################################### +# The RL Learning Flow +# -------------------- +# +# Here's a conceptual example of how an RL step works. +# This is CONCEPTUAL - see apps/grpo/main.py for actual GRPO implementation. + + +def conceptual_rl_step(): + """Conceptual RL training step showing the flow.""" + # 1. Get a math problem + question = dataset.sample() # "What is 2+2?" + + # 2. Student generates answer + answer = policy.generate(question) # "The answer is 4" + + # 3. Teacher grades it + score = reward_model.evaluate(question, answer) # 0.95 + + # 4. Compare to original student + baseline = reference_model.compute_logprobs(question, answer) + + # 5. Store the experience + experience = Episode(question, answer, score, baseline) + replay_buffer.add(experience) + + # 6. When enough experiences collected, improve student + batch = replay_buffer.sample(curr_policy_version=0) + if batch is not None: + trainer.train_step(batch) # Student gets better! + + +###################################################################### +# From Concepts to Forge Services +# -------------------------------- +# +# Here's the key insight: **Each RL component becomes a Forge service**. +# The toy example above maps directly to Forge: +# +# .. mermaid:: +# +# graph LR +# subgraph Concepts["RL Concepts"] +# C1["Dataset"] +# C2["Policy"] +# C3["Reward Model"] +# C4["Reference Model"] +# C5["Replay Buffer"] +# C6["Trainer"] +# end +# +# subgraph Services["Forge Services (Real Classes)"] +# S1["DatasetActor"] +# S2["Policy"] +# S3["RewardActor"] +# S4["ReferenceModel"] +# S5["ReplayBuffer"] +# S6["RLTrainer"] +# end +# +# C1 --> S1 +# C2 --> S2 +# C3 --> S3 +# C4 --> S4 +# C5 --> S5 +# C6 --> S6 +# +# style C2 fill:#4CAF50 +# style S2 fill:#4CAF50 +# style C3 fill:#FF9800 +# style S3 fill:#FF9800 + +###################################################################### +# RL Step with Forge Services +# ---------------------------- +# +# Let's look at the example from above again, but this time we use the +# actual Forge API names: + +import asyncio + + +async def conceptual_forge_rl_step(services, step): + """Single RL step using verified Forge APIs.""" + # 1. Get a math problem - Using actual DatasetActor API + sample = await services["dataloader"].sample.call_one() + question, target = sample["request"], sample["target"] + + # 2. 
Student generates answer - Using actual Policy API + responses = await services["policy"].generate.route(prompt=question) + answer = responses[0].text + + # 3. Teacher grades it - Using actual RewardActor API + score = await services["reward_actor"].evaluate_response.route( + prompt=question, response=answer, target=target + ) + + # 4. Compare to baseline - Using actual ReferenceModel API + # Note: ReferenceModel.forward requires input_ids, max_req_tokens, return_logprobs + # ref_logprobs = await services['ref_model'].forward.route( + # input_ids, max_req_tokens, return_logprobs=True + # ) + + # 5. Store experience - Using actual Episode structure from apps/grpo/main.py + # episode = create_episode_from_response(responses[0], score, ref_logprobs, step) + # await services['replay_buffer'].add.call_one(episode) + + # 6. Improve student - Using actual training pattern + batch = await services["replay_buffer"].sample.call_one(curr_policy_version=step) + if batch is not None: + inputs, targets = batch # GRPO returns (inputs, targets) tuple + loss = await services["trainer"].train_step.call(inputs, targets) + + # 7. Policy synchronization - Using actual weight update pattern + await services["trainer"].push_weights.call(step + 1) + await services["policy"].update_weights.fanout(step + 1) + + return loss + + +###################################################################### +# Why This Matters: Traditional ML Infrastructure Fails +# ----------------------------------------------------- +# +# Our simple RL loop above has complex requirements: +# +# Problem 1: Different Resource Needs +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# +------------------+-------------------------+---------------------------+ +# | Component | Resource Needs | Scaling Strategy | +# +==================+=========================+===========================+ +# | **Policy** | Large GPU memory | Multiple replicas | +# +------------------+-------------------------+---------------------------+ +# | **Reward** | Small compute | CPU or small GPU | +# +------------------+-------------------------+---------------------------+ +# | **Trainer** | Massive GPU compute | Distributed training | +# +------------------+-------------------------+---------------------------+ +# | **Dataset** | CPU intensive I/O | High memory bandwidth | +# +------------------+-------------------------+---------------------------+ +# +# Problem 2: Complex Interdependencies +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# .. mermaid:: +# +# graph LR +# A["Policy: Student AI
'What is 2+2?' → 'The answer is 4'"] +# B["Reward: Teacher
Scores answer: 0.95"] +# C["Reference: Original Student
Provides baseline comparison"] +# D["Replay Buffer: Notebook
Stores: question + answer + score"] +# E["Trainer: Tutor
Improves student using experiences"] +# +# A --> B +# A --> C +# B --> D +# C --> D +# D --> E +# E --> A +# +# style A fill:#4CAF50 +# style B fill:#FF9800 +# style C fill:#2196F3 +# style D fill:#8BC34A +# style E fill:#E91E63 +# +# Each step has different: +# +# * **Latency requirements**: Policy inference needs low latency +# * **Scaling patterns**: Need N policy replicas to keep trainer busy +# * **Failure modes**: Any component failure cascades to halt pipeline +# * **Resource utilization**: GPUs for inference/training, CPUs for data + +###################################################################### +# Enter Forge: RL-Native Architecture +# ------------------------------------ +# +# Forge solves these problems by treating each RL component as an +# **independent, distributed unit**. +# +# Quick API Reference (covered in detail in Part 2): +# +# * ``.route()`` - Send request to any healthy replica (load balanced) +# * ``.call_one()`` - Send request to a single actor instance +# * ``.fanout()`` - Send request to ALL replicas in a service + + +async def real_rl_training_step(services, step): + """Single RL step using verified Forge APIs.""" + # 1. Environment interaction - Using actual DatasetActor API + sample = await services["dataloader"].sample.call_one() + prompt, target = sample["request"], sample["target"] + + responses = await services["policy"].generate.route(prompt) + + # 2. Reward computation - Using actual RewardActor API + score = await services["reward_actor"].evaluate_response.route( + prompt=prompt, response=responses[0].text, target=target + ) + + # 3. Get reference logprobs - Using actual ReferenceModel API + import torch + + input_ids = torch.cat([responses[0].prompt_ids, responses[0].token_ids]) + ref_logprobs = await services["ref_model"].forward.route( + input_ids.unsqueeze(0), max_req_tokens=512, return_logprobs=True + ) + + # 4. Experience storage - Using actual Episode pattern from GRPO + # episode = create_episode_from_response(responses[0], score, ref_logprobs, step) + # await services['replay_buffer'].add.call_one(episode) + + # 5. Learning - Using actual trainer pattern + batch = await services["replay_buffer"].sample.call_one(curr_policy_version=step) + if batch is not None: + inputs, targets = batch + loss = await services["trainer"].train_step.call(inputs, targets) + + # 6. 
Policy synchronization + await services["trainer"].push_weights.call(step + 1) + await services["policy"].update_weights.fanout(step + 1) + + return loss + + +###################################################################### +# What Makes This Powerful +# ------------------------- +# +# Automatic Resource Management +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + +async def example_automatic_management(policy): + """Forge handles routing, GPU memory, batching, and scaling.""" + responses = await policy.generate.route(prompt="What is 2+2?") + answer = responses[0].text + return answer + + +###################################################################### +# Independent Scaling +# ~~~~~~~~~~~~~~~~~~~ +# +# Here's how you configure different components with different resources: + +# Note: This is example code showing the Forge API +# For actual imports, see apps/grpo/main.py +try: + from forge.actors.policy import Policy + from forge.actors.replay_buffer import ReplayBuffer + from forge.actors.reference_model import ReferenceModel + from forge.actors.trainer import RLTrainer + from forge.data.rewards import MathReward, ThinkingReward + + # Mock classes for the example + class DatasetActor: + pass + + class RewardActor: + pass + + class ComputeAdvantages: + pass + +except ImportError: + # Provide mock classes if imports fail during doc build + class Policy: + pass + + class ReplayBuffer: + pass + + class ReferenceModel: + pass + + class RLTrainer: + pass + + class DatasetActor: + pass + + class RewardActor: + pass + + class ComputeAdvantages: + pass + + class MathReward: + pass + + class ThinkingReward: + pass + + +async def setup_forge_services(): + """Configure Forge services with independent scaling.""" + model = "Qwen/Qwen3-1.7B" + group_size = 1 + + ( + dataloader, + policy, + trainer, + replay_buffer, + compute_advantages, + ref_model, + reward_actor, + ) = await asyncio.gather( + # Dataset actor (CPU) + DatasetActor.options(procs=1).as_actor( + path="openai/gsm8k", + revision="main", + data_split="train", + streaming=True, + model=model, + ), + # Policy service with GPU and multiple replicas + Policy.options(procs=1, with_gpus=True, num_replicas=1).as_service( + engine_config={ + "model": model, + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "enforce_eager": False, + }, + sampling_config={ + "n": group_size, + "max_tokens": 16, + "temperature": 1.0, + "top_p": 1.0, + }, + ), + # Trainer actor with GPU + RLTrainer.options(procs=1, with_gpus=True).as_actor( + model={"name": "qwen3", "flavor": "1.7B", "hf_assets_path": f"hf://{model}"}, + optimizer={"name": "AdamW", "lr": 1e-5}, + training={"local_batch_size": 2, "seq_len": 2048}, + ), + # Replay buffer (CPU) + ReplayBuffer.options(procs=1).as_actor( + batch_size=2, max_policy_age=1, dp_size=1 + ), + # Advantage computation (CPU) + ComputeAdvantages.options(procs=1).as_actor(), + # Reference model with GPU + ReferenceModel.options(procs=1, with_gpus=True).as_actor( + model={"name": "qwen3", "flavor": "1.7B", "hf_assets_path": f"hf://{model}"}, + training={"dtype": "bfloat16"}, + ), + # Reward actor (CPU) + RewardActor.options(procs=1, num_replicas=1).as_service( + reward_functions=[MathReward(), ThinkingReward()] + ), + ) + + return { + "dataloader": dataloader, + "policy": policy, + "trainer": trainer, + "replay_buffer": replay_buffer, + "compute_advantages": compute_advantages, + "ref_model": ref_model, + "reward_actor": reward_actor, + } + + +###################################################################### +# Forge 
Components: Services vs Actors +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# Forge has two types of distributed components: +# +# * **Services**: Multiple replicas with automatic load balancing +# (like Policy, RewardActor) +# * **Actors**: Single instances that handle their own internal +# distribution (like RLTrainer, ReplayBuffer) +# +# We cover this distinction in detail in Part 2, but for now this +# explains the scaling patterns: +# +# * Policy service: ``num_replicas=8`` for high inference demand +# * RewardActor service: ``num_replicas=16`` for parallel evaluation +# * RLTrainer actor: Single instance with internal distributed training + +###################################################################### +# Fault Tolerance +# ~~~~~~~~~~~~~~~ +# +# Forge provides automatic fault tolerance: + + +async def example_fault_tolerance(policy, reward_actor): + """If a replica fails, Forge automatically handles it.""" + # If a policy replica fails: + responses = await policy.generate.route(prompt="What is 2+2?") + answer = responses[0].text + # -> Forge automatically routes to healthy replica + # -> Failed replica respawns in background + # -> No impact on training loop + + # If reward service fails: + score = await reward_actor.evaluate_response.route( + prompt="question", response=answer, target="target" + ) + # -> Retries on different replica automatically + # -> Graceful degradation if all replicas fail + # -> System continues (may need application-level handling) + + +###################################################################### +# Conclusion +# ---------- +# +# This tutorial covered: +# +# * How RL concepts map to Forge components +# * The challenges of traditional RL infrastructure +# * How Forge's architecture solves these challenges +# * Basic Forge API patterns (route, call_one, fanout) +# +# In the next section, we will go a layer deeper and learn how Forge +# services work internally. +# +# Further Reading +# --------------- +# +# * Continue to :doc:`2_Forge_Internals` +# * Check out the full `GRPO implementation `_ +# * Read about the :doc:`../../api_actors` documentation diff --git a/docs/source/tutorial_sources/zero-to-forge/2_Forge_Internals.py b/docs/source/tutorial_sources/zero-to-forge/2_Forge_Internals.py new file mode 100644 index 000000000..efecfdc72 --- /dev/null +++ b/docs/source/tutorial_sources/zero-to-forge/2_Forge_Internals.py @@ -0,0 +1,767 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +""" +Part 2: Peeling Back the Abstraction - What Are Services? +========================================================== + +**Author:** `Sanyam Bhutani `_ + +.. grid:: 2 + + .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn + :class-card: card-prerequisites + + * How Forge services work under the hood + * Service communication patterns (route, fanout, call_one) + * State management in distributed systems + * Real-world service orchestration patterns + + .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites + :class-card: card-prerequisites + + * Complete :doc:`1_RL_and_Forge_Fundamentals` + * Understanding of Python async/await + * Basic distributed systems knowledge + +We highly recommend reading Part 1 before this - it explains RL Concepts +and how they land in Forge. 
+ +Now that you see the power of the service abstraction, let's understand +what's actually happening under the hood. Grab your chai! +""" + +###################################################################### +# Service Anatomy: Beyond the Interface +# -------------------------------------- +# +# When you call ``await policy_service.generate(question)``, here's what +# actually happens: +# +# (Don't worry, we will understand Services right in the next section!) +# +# .. mermaid:: +# +# graph TD +# Call["Your Code:
await policy_service.generate"] +# +# subgraph ServiceLayer["Service Layer"] +# Proxy["Service Proxy: Load balancing, Health checking"] +# LB["Load Balancer: Replica selection, Circuit breaker"] +# end +# +# subgraph Replicas["Replica Management"] +# R1["Replica 1: GPU 0, Healthy"] +# R2["Replica 2: GPU 1, Overloaded"] +# R3["Replica 3: GPU 2, Failed"] +# R4["Replica 4: GPU 3, Healthy"] +# end +# +# subgraph Compute["Actual Computation"] +# Actor["Policy Actor: vLLM engine, Model weights, KV cache"] +# end +# +# Call --> Proxy +# Proxy --> LB +# LB --> R1 +# LB -.-> R2 +# LB -.-> R3 +# LB --> R4 +# R1 --> Actor +# R4 --> Actor +# +# style Call fill:#4CAF50 +# style LB fill:#FF9800 +# style R3 fill:#F44336 +# style Actor fill:#9C27B0 + +###################################################################### +# Service Components Deep Dive +# ----------------------------- +# +# 1. Real Service Configuration +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# Here's the actual ServiceConfig from Forge source code: + +# Configuration pattern from apps/grpo/main.py: +# Policy.options( +# procs=1, # Processes per replica +# num_replicas=4, # Number of replicas +# with_gpus=True # Allocate GPUs +# # Other available options: +# # hosts=None # the number of remote hosts used per replica +# ) + +###################################################################### +# 2. Real Service Creation +# ~~~~~~~~~~~~~~~~~~~~~~~~ +# +# Services are created using the ``.options().as_service()`` pattern +# from the actual GRPO implementation. +# +# The service creation automatically handles: +# +# * Spawning actor replicas across processes/GPUs +# * Load balancing with .route() method for services +# * Health monitoring and failure recovery +# * Message routing and serialization + +import asyncio + +# Mock imports for documentation build +try: + from forge.actors.policy import Policy +except ImportError: + + class Policy: + pass + + +async def example_service_creation(): + """Example of creating a Policy service.""" + model = "Qwen/Qwen3-1.7B" + + policy = await Policy.options(procs=1, with_gpus=True, num_replicas=1).as_service( + engine_config={ + "model": model, + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "enforce_eager": False, + }, + sampling_config={ + "n": 1, + "max_tokens": 16, + "temperature": 1.0, + "top_p": 1.0, + }, + ) + + prompt = "What is 3 + 5?" + responses = await policy.generate.route(prompt) + print(f"Response: {responses[0].text}") + + # Cleanup when done + await policy.shutdown() + + +###################################################################### +# 3. How Services Actually Work +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# Forge services are implemented as ServiceActors that manage +# collections of your ForgeActor replicas. +# +# When you call ``.as_service()``, Forge creates a ``ServiceInterface`` +# that manages N replicas of your ``ForgeActor`` class and gives you +# methods like ``.route()``, ``.fanout()``, etc. 
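+#
+# To make this concrete, below is a minimal sketch of a user-defined
+# ``ForgeActor`` exposed as a service. ``EchoActor`` is a hypothetical class
+# written purely for illustration (the ``ForgeCounter`` example later in this
+# tutorial follows the same pattern), so treat the exact options and return
+# values as assumptions rather than a reference implementation.
+
+try:
+    from forge.controller import ForgeActor
+    from monarch.actor import endpoint
+except ImportError:
+    # Mock fallbacks so this file can be built without Forge installed
+    class ForgeActor:
+        pass
+
+    def endpoint(func):
+        return func
+
+
+class EchoActor(ForgeActor):
+    """Hypothetical actor used only to illustrate the ServiceInterface."""
+
+    def __init__(self, prefix: str):
+        self.prefix = prefix
+
+    @endpoint
+    async def echo(self, prompt: str) -> str:
+        return f"{self.prefix}: {prompt}"
+
+
+async def echo_service_sketch():
+    """Sketch: N EchoActor replicas behind one ServiceInterface."""
+    echo = await EchoActor.options(procs=1, num_replicas=2).as_service(prefix="echo")
+
+    # The interface exposes each @endpoint method with routing adverbs
+    one = await echo.echo.route(prompt="hi")  # one healthy replica
+    everyone = await echo.echo.fanout(prompt="hi")  # every replica
+    print(one, everyone)
+
+    await echo.shutdown()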
+ + +async def service_interface_example(policy): + """Your code sees this simple interface.""" + # Simple call - but Forge handles all complexity + responses = await policy.generate.route(prompt="What is 2+2?") + # Forge handles: replica management, load balancing, fault tolerance + + +###################################################################### +# Communication Patterns: Quick Reference +# ---------------------------------------- +# +# **API Summary:** +# +# * ``.route()`` - Send request to any healthy replica in a service (load balanced) +# * ``.call_one()`` - Send request to a single actor instance +# * ``.fanout()`` - Send request to ALL replicas in a service +# +# .. mermaid:: +# +# graph LR +# subgraph Request["Your Request"] +# Code["await service.method.ADVERB()"] +# end +# +# subgraph Patterns["Communication Patterns"] +# Route[".route()
→ One healthy replica"] +# CallOne[".call_one()
→ Single actor"] +# Fanout[".fanout()
→ ALL replicas"] +# end +# +# subgraph Replicas["Replicas/Actors"] +# R1["Replica 1"] +# R2["Replica 2"] +# R3["Replica 3"] +# A1["Actor"] +# end +# +# Code --> Route +# Code --> CallOne +# Code --> Fanout +# +# Route --> R2 +# CallOne --> A1 +# Fanout --> R1 +# Fanout --> R2 +# Fanout --> R3 +# +# style Route fill:#4CAF50 +# style CallOne fill:#FF9800 +# style Fanout fill:#9C27B0 + +###################################################################### +# Deep Dive: Service Communication Patterns +# ------------------------------------------ +# +# These communication patterns ("adverbs") determine how your service +# calls are routed to replicas. Understanding when to use each pattern +# is key to effective Forge usage. +# +# 1. ``.route()`` - Load Balanced Single Replica +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# **When to use**: Normal request routing where any replica can handle +# the request. + + +async def route_example(policy): + """Using .route() for load-balanced requests.""" + question = "What is 2+2?" + responses = await policy.generate.route(prompt=question) + answer = responses[0].text # Extract text from Completion object + return answer + + +# Behind the scenes: +# 1. Health check eliminates failed replicas +# 2. Load balancer picks replica (currently round robin) +# 3. Request routes to that specific replica +# 4. Automatic retry on different replica if failure +# +# **Performance characteristics**: +# +# * **Latency**: Lowest (single network hop) +# * **Throughput**: Limited by single replica capacity +# * **Fault tolerance**: Automatic failover to other replicas +# +# **Critical insight**: ``.route()`` is your default choice for +# stateless operations in Forge services. + +###################################################################### +# 2. ``.fanout()`` - Broadcast with Results Collection +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# **When to use**: You need responses from ALL replicas. + + +async def fanout_example(policy): + """Using .fanout() to broadcast to all replicas.""" + # Get version from all policy replicas + current_versions = await policy.get_version.fanout() + # Returns: [version_replica_1, version_replica_2, ...] + + # Update weights on all replicas + await policy.update_weights.fanout(new_policy_version=1) + # Broadcasts to all replicas simultaneously + + +# **Performance characteristics**: +# +# * **Latency**: Slowest replica determines total latency +# * **Throughput**: Network bandwidth × number of replicas +# * **Fault tolerance**: Fails if ANY replica fails (unless configured) +# +# **Critical gotcha**: Don't use ``.fanout()`` for high-frequency +# operations - it contacts all replicas. + +###################################################################### +# 3. Streaming Operations - Custom Implementation Pattern +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# **When to use**: You want to process results as they arrive. 
+ + +async def streaming_pattern_example(replay_buffer, trainer, step): + """Streaming pattern for continuous training.""" + # CONCEPTUAL - Streaming requires custom implementation + # Pattern from apps/grpo/main.py continuous training: + + while True: + # This is the real API call pattern + batch = await replay_buffer.sample.call_one(curr_policy_version=step) + if batch is not None: + # Process batch immediately + loss = await trainer.train_step.call_one(batch) + print(f"Training loss: {loss}") + else: + await asyncio.sleep(0.1) # Wait for more data + break # Just for example + + +# **Performance characteristics**: +# +# * **Latency**: Process first result immediately +# * **Throughput**: Non-blocking async operations +# * **Fault tolerance**: Continues if some replicas fail +# +# **Critical insight**: Essential for high-throughput RL where +# you can't wait for batches. + +###################################################################### +# Service Sessions for Stateful Operations +# ----------------------------------------- +# +# **When to use**: When you need multiple calls to hit the same replica +# (like KV cache preservation). +# +# **What are sticky sessions?** A session ensures all your service calls +# within the ``async with`` block go to the same replica, instead of +# being load-balanced across different replicas. + +# Mock classes for example +try: + from forge.controller import ForgeActor + from monarch.actor import endpoint +except ImportError: + + class ForgeActor: + pass + + def endpoint(func): + return func + + +class ForgeCounter(ForgeActor): + """Example counter to demonstrate sessions.""" + + def __init__(self, initial_value: int): + self.value = initial_value + + @endpoint + def increment(self) -> int: + self.value += 1 + return self.value + + @endpoint + def get_value(self) -> int: + return self.value + + @endpoint + async def reset(self): + self.value = 0 + + +async def without_sessions_example(): + """WITHOUT SESSIONS: Each .route() goes to different replica.""" + counter_service = await ForgeCounter.options(procs=1, num_replicas=4).as_service( + initial_value=0 + ) + + # Each call might go to different replica + await counter_service.increment.route() # Might go to replica 2 + await counter_service.increment.route() # Might go to replica 1 + await counter_service.increment.route() # Might go to replica 3 + + results = await counter_service.increment.fanout() + print(f"All replica values: {results}") + # Output: All replica values: [1, 2, 1, 1] + # Each replica has different state! + + await counter_service.shutdown() + + +async def with_sessions_example(): + """WITH SESSIONS: All calls go to the SAME replica.""" + counter_service = await ForgeCounter.options(procs=1, num_replicas=4).as_service( + initial_value=0 + ) + + print("\nUsing sticky sessions:") + async with counter_service.session(): + await counter_service.reset.route() + print(await counter_service.increment.route()) # 1 + print(await counter_service.increment.route()) # 2 + print(await counter_service.increment.route()) # 3 + + final_value = await counter_service.get_value.route() + print(f"Final value on this replica: {final_value}") # 3 + + # Cleanup + await counter_service.shutdown() + + +###################################################################### +# Deep Dive: State Management Reality +# ------------------------------------ +# +# The most complex challenge in distributed RL is maintaining state +# consistency while maximizing performance. 
+# +# The KV Cache Problem +# ~~~~~~~~~~~~~~~~~~~~ +# +# **The challenge**: Policy inference is much faster with KV cache, +# but cache is tied to specific conversation history. + + +async def naive_multi_turn(policy_service): + """This breaks KV cache optimization.""" + question1 = "What is 2+2?" + + # Each call might go to different replica = cache miss + response1 = await policy_service.generate.route(prompt=question1) + full_prompt = question1 + response1[0].text + response2 = await policy_service.generate.route(prompt=full_prompt) # Cache miss! + conversation = full_prompt + response2[0].text + response3 = await policy_service.generate.route( + prompt=conversation + ) # Cache miss! + + +async def optimized_multi_turn(policy): + """The solution: Sticky sessions ensure same replica.""" + async with policy.session(): + # All calls guaranteed to hit same replica = cache hits + question1 = "What is 2+2?" + response1 = await policy.generate.route(prompt=question1) + full_prompt = question1 + response1[0].text + response2 = await policy.generate.route(prompt=full_prompt) # Cache hit! + conversation = full_prompt + response2[0].text + response3 = await policy.generate.route(prompt=conversation) # Cache hit! + + # Session ends, replica can be garbage collected or reused + + +# **Performance impact**: Maintaining KV cache across turns avoids +# recomputing previous tokens. + +###################################################################### +# Replay Buffer Consistency +# ~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# **The challenge**: Multiple trainers and experience collectors +# reading/writing concurrently. +# +# **Real Forge approach**: The ReplayBuffer actor handles concurrency +# internally: + + +async def replay_buffer_example(replay_buffer): + """ReplayBuffer provides thread-safe operations.""" + # Add episodes (thread-safe by actor model) + episode = {} # Mock episode + await replay_buffer.add.call_one(episode) + + # Sample batches for training + batch = await replay_buffer.sample.call_one( + curr_policy_version=0, + batch_size=None, # Optional, uses default from config + ) + + # Additional methods available: + # await replay_buffer.clear.call_one() # Clear buffer + # await replay_buffer.evict.call_one(curr_policy_version) # Remove old + # state = await replay_buffer.state_dict.call_one() # Checkpoint + + +# **Critical insight**: The actor model provides natural thread safety - +# each actor processes messages sequentially. + +###################################################################### +# Weight Synchronization Strategy +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# **The challenge**: Trainer updates policy weights, but policy service +# needs those weights. 
+ +import torch + + +async def real_weight_sync(trainer, policy, step): + """Forge weight synchronization pattern from apps/grpo/main.py.""" + # Trainer pushes weights to TorchStore with version number + await trainer.push_weights.call_one(policy_version=step + 1) + + # Policy service updates to new version from TorchStore + # Use .fanout() to update ALL policy replicas + await policy.update_weights.fanout(policy_version=step + 1) + + # Check current policy version + current_version = await policy.get_version.route() + print(f"Current policy version: {current_version}") + + +###################################################################### +# Deep Dive: Asynchronous Coordination Patterns +# ---------------------------------------------- +# +# **The real challenge**: Different services run at different speeds, +# but Forge's service abstraction handles the coordination complexity. +# +# The Forge Approach: Let Services Handle Coordination +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# Instead of manual coordination, Forge services handle speed mismatches +# automatically: + + +async def simple_rl_step( + dataloader, policy, ref_model, reward_actor, replay_buffer, compute_advantages, trainer +): + """Simple RL step showing service coordination.""" + # ===== Generate a rollout ===== + sample = await dataloader.sample.call_one() + prompt, target = sample["request"], sample["target"] + + print(f"Prompt: {prompt}") + print(f"Target: {target}") + + actions = await policy.generate.route(prompt=prompt) + print(f"Policy response: {actions[0].text}") + + # Create input tensor for reference model (requires full context) + input_ids = torch.cat([actions[0].prompt_ids, actions[0].token_ids]) + ref_logprobs = await ref_model.forward.route( + input_ids.unsqueeze(0), max_req_tokens=512, return_logprobs=True + ) + + reward = await reward_actor.evaluate_response.route( + prompt=prompt, response=actions[0].text, target=target + ) + print(f"Reward: {reward}") + + # Create episode (simplified for example) + episode = { + "episode_id": "0", + "request": prompt, + "response": actions[0].text, + "reward": reward, + "ref_logprobs": ref_logprobs[0], + } + + await replay_buffer.add.call_one(episode) + print("Episode stored in replay buffer") + + # ===== Train on the batch ===== + batch = await replay_buffer.sample.call_one(curr_policy_version=0) + if batch is not None: + print("Training on batch...") + inputs, targets = batch + loss = await trainer.train_step.call(inputs, targets) + print(f"Training loss: {loss}") + return loss + else: + print("Not enough data in buffer yet") + return None + + +###################################################################### +# Handling Speed Mismatches with Service Scaling +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# **The insight**: Scale services independently based on their +# bottlenecks. 
+ +# Mock imports for example +try: + from forge.actors.policy import Policy + from forge.actors.trainer import RLTrainer +except ImportError: + + class Policy: + pass + + class RLTrainer: + pass + + class RewardActor: + pass + + +async def scaling_example(): + """Scale services independently based on bottlenecks.""" + model_name = "Qwen/Qwen3-1.7B" + + # Scale fast services with more replicas + policy = await Policy.options( + procs=1, num_replicas=8, with_gpus=True # Many replicas for throughput + ).as_service(engine_config={"model": model_name, "tensor_parallel_size": 1}) + + # Reward evaluation might be CPU-bound + reward_actor = await RewardActor.options( + procs=1, num_replicas=16, with_gpus=False # More CPU replicas + ).as_service(reward_functions=[]) + + # Training needs fewer but more powerful replicas + trainer = await RLTrainer.options( + procs=1, with_gpus=True # Fewer but GPU-heavy + ).as_actor( # Trainer typically uses .as_actor() not .as_service() + model={"name": "qwen3", "flavor": "1.7B"}, optimizer={"name": "AdamW", "lr": 1e-5} + ) + + +###################################################################### +# Service Implementation Example +# ------------------------------- +# +# Let's see how a reward service is actually implemented: + +# Mock imports +try: + from forge.controller import ForgeActor + from monarch.actor import endpoint + from forge.data.rewards import MathReward, ThinkingReward +except ImportError: + + class ForgeActor: + pass + + def endpoint(func): + return func + + class MathReward: + def __call__(self, prompt, response, target): + return 1.0 + + class ThinkingReward: + def __call__(self, prompt, response, target): + return 1.0 + + +class RewardActor(ForgeActor): + """Exact RewardActor from apps/grpo/main.py.""" + + def __init__(self, reward_functions: list): + self.reward_functions = reward_functions + + @endpoint + async def evaluate_response(self, prompt: str, response: str, target: str) -> float: + """Evaluate response quality using multiple reward functions.""" + total_reward = 0.0 + + for reward_fn in self.reward_functions: + # Each reward function contributes to total score + reward = reward_fn(prompt, response, target) + total_reward += reward + + # Return average reward across all functions + return total_reward / len(self.reward_functions) if self.reward_functions else 0.0 + + +async def reward_service_example(): + """Create and use a reward service.""" + reward_actor = await RewardActor.options(procs=1, num_replicas=1).as_service( + reward_functions=[MathReward(), ThinkingReward()] + ) + + prompt = "What is 15% of 240?" 
+ response = "15% of 240 is 36" + target = "36" + + score = await reward_actor.evaluate_response.route( + prompt=prompt, response=response, target=target + ) + print(f"Reward score: {score}") # Usually around 1.0 for correct answers + + # For production scaling - increase num_replicas: + # RewardActor.options(procs=1, num_replicas=16) # 16 parallel evaluators + + # Cleanup when done + await reward_actor.shutdown() + + +###################################################################### +# Service Orchestration: The Training Loop +# ----------------------------------------- +# +# Now let's see how services coordinate in a real training loop: + + +async def production_training_loop(): + """Real training loop pattern from apps/grpo/main.py.""" + # Service creation pattern (abbreviated) + print("Initializing all services...") + + # (Services initialization code here - see Part 1) + + step = 0 + + while True: + # Data generation + sample = await dataloader.sample.call_one() + + # Policy generation service call + responses = await policy.generate.route(sample["request"]) + + # Reference computation service call + input_ids = torch.cat([responses[0].prompt_ids, responses[0].token_ids]) + ref_logprobs = await ref_model.forward.route( + input_ids.unsqueeze(0), max_req_tokens=512, return_logprobs=True + ) + + # Reward evaluation service call + reward = await reward_actor.evaluate_response.route( + prompt=sample["question"], response=responses[0].text, target=sample["answer"] + ) + + # Experience storage + await replay_buffer.add.call_one(episode) + + # Training when ready + batch = await replay_buffer.sample.call_one(curr_policy_version=step) + if batch is not None: + inputs, targets = batch + loss = await trainer.train_step.call(inputs, targets) + + # Weight synchronization pattern + await trainer.push_weights.call(step + 1) + await policy.update_weights.fanout(step + 1) # Fanout to all replicas + + print(f"Step {step}, Loss: {loss:.4f}") + step += 1 + + if step >= 100: + break + + +# **Key observations:** +# +# 1. **Parallelism**: Independent operations run concurrently +# 2. **Load balancing**: Each ``.route()`` call automatically selects optimal replica +# 3. **Fault tolerance**: Failures automatically retry on different replicas +# 4. **Resource efficiency**: CPU and GPU services scale independently +# 5. **Coordination**: Services coordinate through shared state (replay buffer, weight versions) +# +# This is the power of the service abstraction - complex distributed +# coordination looks like simple async Python code. + +###################################################################### +# Conclusion +# ---------- +# +# This tutorial covered: +# +# * How Forge services work under the hood +# * Communication patterns: ``.route()``, ``.fanout()``, ``.call_one()`` +# * State management with sessions and actors +# * Service scaling and orchestration patterns +# +# **Key takeaways:** +# +# * Use ``.route()`` for stateless load-balanced operations +# * Use ``.fanout()`` for coordinated updates across all replicas +# * Use sessions for stateful operations like multi-turn conversations +# * Scale services independently based on bottlenecks +# * Let Forge handle coordination complexity +# +# In the next part we will learn about Monarch internals. 
+# +# Further Reading +# --------------- +# +# * Continue to :doc:`3_Monarch_101` (coming soon) +# * Check the `Forge source code `_ +# * Review the :doc:`../../api_actors` documentation +# * Explore the `GRPO application `_ diff --git a/docs/source/tutorial_sources/zero-to-forge/3_Monarch_101.py b/docs/source/tutorial_sources/zero-to-forge/3_Monarch_101.py new file mode 100644 index 000000000..d7c8f86e8 --- /dev/null +++ b/docs/source/tutorial_sources/zero-to-forge/3_Monarch_101.py @@ -0,0 +1,572 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +""" +Part 3: The Forge-Monarch Connection +===================================== + +**Author:** `Sanyam Bhutani `_ + +.. grid:: 2 + + .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn + :class-card: card-prerequisites + + * How Forge services are built on Monarch + * Understanding ProcMesh and ActorMesh + * The complete hierarchy from service to silicon + * Message routing patterns in distributed actors + + .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites + :class-card: card-prerequisites + + * Complete :doc:`1_RL_and_Forge_Fundamentals` + * Complete :doc:`2_Forge_Internals` + * Understanding of distributed systems + +This is part 3 of our series. In the previous sections we learned: + +* Part 1: [RL Concepts and how they map to Forge](1_RL_and_Forge_Fundamentals) +* Part 2: [Forge Internals](2_Forge_Internals) + +Now let's peel back the layers. Forge services are built on top of +**Monarch**, PyTorch's distributed actor framework. Understanding this +connection is crucial for optimization and debugging. +""" + +###################################################################### +# The Complete Hierarchy: Service to Silicon +# ------------------------------------------- +# +# .. mermaid:: +# +# graph TD +# subgraph YourCode["1. Your RL Code"] +# Call["await policy_service.generate.route('What is 2+2?')"] +# end +# +# subgraph ForgeServices["2. Forge Service Layer"] +# ServiceInterface["ServiceInterface: Routes requests, Load balancing, Health checks"] +# ServiceActor["ServiceActor: Manages replicas, Monitors health, Coordinates failures"] +# end +# +# subgraph MonarchLayer["3. Monarch Actor Layer"] +# ActorMesh["ActorMesh PolicyActor: 4 instances, Different GPUs, Message passing"] +# ProcMesh["ProcMesh: 4 processes, GPU topology 0,1,2,3, Network interconnect"] +# end +# +# subgraph Hardware["4. Physical Hardware"] +# GPU0["GPU 0: PolicyActor #1, vLLM Engine, Model Weights"] +# GPU1["GPU 1: PolicyActor #2, vLLM Engine, Model Weights"] +# GPU2["GPU 2: PolicyActor #3, vLLM Engine, Model Weights"] +# GPU3["GPU 3: PolicyActor #4, vLLM Engine, Model Weights"] +# end +# +# Call --> ServiceInterface +# ServiceInterface --> ServiceActor +# ServiceActor --> ActorMesh +# ActorMesh --> ProcMesh +# ProcMesh --> GPU0 +# ProcMesh --> GPU1 +# ProcMesh --> GPU2 +# ProcMesh --> GPU3 +# +# style Call fill:#4CAF50 +# style ServiceActor fill:#FF9800 +# style ActorMesh fill:#9C27B0 +# style ProcMesh fill:#2196F3 + +###################################################################### +# Deep Dive: ProcMesh - The Foundation +# ------------------------------------- +# +# **ProcMesh** is Monarch's core abstraction for organizing processes +# across hardware. Think of it as a multi-dimensional grid that maps +# directly to your cluster topology. 
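#
# A minimal sketch of what creating such a grid looks like, assuming the
# ``this_host`` import shown in the "Monarch Actor System Basics" section later
# in this file. The function name is illustrative, and the mesh is torn down
# immediately after creation.


async def procmesh_sketch():
    """Hypothetical example: one process per GPU on the local host."""
    # Creates an 8-process mesh that mirrors a single 8-GPU host.
    procs = this_host().spawn_procs(per_host={"gpus": 8})

    # Actors spawned onto this mesh get one instance per process
    # (see the Counter examples later in this file).

    # Tear the mesh down when finished.
    await procs.stop()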
+# +# Single Host ProcMesh +# ~~~~~~~~~~~~~~~~~~~~ +# +# .. mermaid:: +# +# graph TD +# subgraph Host["Single Host (8 GPUs)"] +# subgraph ProcMesh["ProcMesh: per_host={'gpus': 8}"] +# P0["Process 0
GPU 0"] +# P1["Process 1
GPU 1"] +# P2["Process 2
GPU 2"] +# P3["Process 3
GPU 3"] +# P4["Process 4
GPU 4"] +# P5["Process 5
GPU 5"] +# P6["Process 6
GPU 6"] +# P7["Process 7
GPU 7"] +# end +# +# P0 -.->|"Network"| P1 +# P1 -.->|"Network"| P2 +# P2 -.->|"Network"| P3 +# P3 -.->|"Network"| P4 +# P4 -.->|"Network"| P5 +# P5 -.->|"Network"| P6 +# P6 -.->|"Network"| P7 +# P7 -.->|"Network"| P0 +# end +# +# style P0 fill:#F44336 +# style P1 fill:#F44336 +# style P2 fill:#F44336 +# style P3 fill:#F44336 +# style P4 fill:#F44336 +# style P5 fill:#F44336 +# style P6 fill:#F44336 +# style P7 fill:#F44336 + +###################################################################### +# Multi-Host ProcMesh +# ~~~~~~~~~~~~~~~~~~~ +# +# .. mermaid:: +# +# graph TD +# subgraph Cluster["Multi-Host Cluster"] +# subgraph Host1["Host 1"] +# subgraph PM1["ProcMesh Segment 1"] +# H1P0["Process 0
GPU 0"] +# H1P1["Process 1
GPU 1"] +# H1P2["Process 2
GPU 2"] +# H1P3["Process 3
GPU 3"] +# end +# end +# +# subgraph Host2["Host 2"] +# subgraph PM2["ProcMesh Segment 2"] +# H2P0["Process 4
GPU 0"] +# H2P1["Process 5
GPU 1"] +# H2P2["Process 6
GPU 2"] +# H2P3["Process 7
GPU 3"] +# end +# end +# +# subgraph Host3["Host 3"] +# subgraph PM3["ProcMesh Segment 3"] +# H3P0["Process 8
GPU 0"] +# H3P1["Process 9
GPU 1"] +# H3P2["Process 10
GPU 2"] +# H3P3["Process 11
GPU 3"] +# end +# end +# end +# +# H1P0 -.->|"InfiniBand"| H2P0 +# H1P1 -.->|"InfiniBand"| H2P1 +# H2P0 -.->|"InfiniBand"| H3P0 +# H2P1 -.->|"InfiniBand"| H3P1 +# +# style PM1 fill:#F44336 +# style PM2 fill:#4CAF50 +# style PM3 fill:#2196F3 + +###################################################################### +# Monarch Actor System Basics +# ---------------------------- +# +# This shows the underlying actor system that powers Forge services. + +import asyncio + +# Mock imports for documentation build +try: + from monarch.actor import Actor, endpoint, this_proc, Future + from monarch.actor import ProcMesh, this_host +except ImportError: + + class Actor: + pass + + def endpoint(func): + return func + + class Future: + pass + + class ProcMesh: + pass + + def this_proc(): + return None + + def this_host(): + return None + + +# STEP 1: Define a basic actor +class Counter(Actor): + """Basic counter actor example.""" + + def __init__(self, initial_value: int): + self.value = initial_value + + @endpoint + def increment(self) -> None: + """Increment the counter.""" + self.value += 1 + + @endpoint + def get_value(self) -> int: + """Get current counter value.""" + return self.value + + +async def basic_actor_example(): + """Example of using Monarch actors.""" + # STEP 2: Single actor in local process + counter = this_proc().spawn("counter", Counter, initial_value=0) + + # STEP 3: Send messages + fut = counter.get_value.call_one() + value = await fut + print(f"Counter value: {value}") # 0 + + +async def distributed_actors_example(): + """Example of actors across multiple processes.""" + # STEP 4: Multiple actors across processes + procs = this_host().spawn_procs(per_host={"gpus": 8}) + counters = procs.spawn("counters", Counter, 0) + + # STEP 5: Broadcast to all actors + await counters.increment.call() + + # STEP 6: Different message patterns + # call_one() - single actor + value = await counters.get_value.call_one() + print(f"One counter: {value}") + + # choose() - random single actor (actors only, not services) + value = await counters.get_value.choose() + print(f"Random counter: {value}") + + # call() - all actors, collect results + values = await counters.get_value.call() + print(f"All counters: {values}") + + # broadcast() - fire and forget + await counters.increment.broadcast() + + # Cleanup + await procs.stop() + + +###################################################################### +# Actor Meshes: Your Code Running Distributed +# -------------------------------------------- +# +# **ActorMesh** is created when you spawn actors across a ProcMesh. +# Each process in the ProcMesh gets one instance of your actor. +# +# .. mermaid:: +# +# graph TD +# subgraph Creation["Actor Creation Process"] +# Code["mesh.spawn('policy', PolicyActor, model='Qwen/Qwen3-7B')"] +# +# subgraph ProcMesh["ProcMesh (4 processes)"] +# P0["Process 0
GPU 0"] +# P1["Process 1
GPU 1"] +# P2["Process 2
GPU 2"] +# P3["Process 3
GPU 3"] +# end +# +# subgraph ActorMesh["ActorMesh PolicyActor"] +# A0["PolicyActor Instance #0: model=Qwen/Qwen3-7B"] +# A1["PolicyActor Instance #1: model=Qwen/Qwen3-7B"] +# A2["PolicyActor Instance #2: model=Qwen/Qwen3-7B"] +# A3["PolicyActor Instance #3: model=Qwen/Qwen3-7B"] +# end +# +# Code --> ProcMesh +# P0 --> A0 +# P1 --> A1 +# P2 --> A2 +# P3 --> A3 +# end +# +# style A0 fill:#4CAF50 +# style A1 fill:#4CAF50 +# style A2 fill:#4CAF50 +# style A3 fill:#4CAF50 + +###################################################################### +# Message Routing Through ActorMesh +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# .. mermaid:: +# +# graph TD +# subgraph MessageFlow["Message Flow Patterns"] +# Client["await policy_actors.generate.METHOD(prompt)"] +# +# subgraph Methods["Different Adverbs Route Differently"] +# Choose["choose(): Routes to ONE actor, Load balanced"] +# Call["call(): Routes to ALL actors, Collects results"] +# Broadcast["broadcast(): Routes to ALL actors, Fire and forget"] +# Stream["stream(): Routes to ALL actors, Iterator of results"] +# end +# +# subgraph ActorInstances["PolicyActor Instances"] +# A0["Actor 0: GPU 0, generates response"] +# A1["Actor 1: GPU 1, generates response"] +# A2["Actor 2: GPU 2, generates response"] +# A3["Actor 3: GPU 3, generates response"] +# end +# +# Client --> Choose +# Client --> Call +# Client --> Broadcast +# Client --> Stream +# +# Choose -.->|"Load balanced"| A1 +# Call --> A0 +# Call --> A1 +# Call --> A2 +# Call --> A3 +# Broadcast --> A0 +# Broadcast --> A1 +# Broadcast --> A2 +# Broadcast --> A3 +# Stream --> A0 +# Stream --> A1 +# Stream --> A2 +# Stream --> A3 +# end +# +# style Choose fill:#4CAF50 +# style Call fill:#FF9800 +# style Broadcast fill:#E91E63 +# style Stream fill:#9C27B0 + +###################################################################### +# How Forge Services Use Monarch +# ------------------------------- +# +# Now the key insight: **Forge services are ServiceActors that manage +# ActorMeshes of your ForgeActor replicas**. +# +# The Service Creation Process +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# .. mermaid:: +# +# graph TD +# subgraph ServiceCreation["Service Creation Process"] +# Call["await PolicyActor.options(num_replicas=4, procs=1).as_service(model='Qwen')"] +# +# ServiceActor["ServiceActor: Manages 4 replicas, Health checks, Routes calls"] +# +# subgraph Replicas["4 Independent Replicas"] +# subgraph R0["Replica 0"] +# PM0["ProcMesh: 1 process, GPU 0"] +# AM0["ActorMesh
1 PolicyActor"] +# end +# +# subgraph R1["Replica 1"] +# PM1["ProcMesh: 1 process, GPU 1"] +# AM1["ActorMesh
1 PolicyActor"] +# end +# +# subgraph R2["Replica 2"] +# PM2["ProcMesh: 1 process, GPU 2"] +# AM2["ActorMesh
1 PolicyActor"] +# end +# +# subgraph R3["Replica 3"] +# PM3["ProcMesh: 1 process, GPU 3"] +# AM3["ActorMesh
1 PolicyActor"] +# end +# end +# +# Call --> ServiceActor +# ServiceActor --> R0 +# ServiceActor --> R1 +# ServiceActor --> R2 +# ServiceActor --> R3 +# PM0 --> AM0 +# PM1 --> AM1 +# PM2 --> AM2 +# PM3 --> AM3 +# end +# +# style ServiceActor fill:#FF9800 +# style AM0 fill:#4CAF50 +# style AM1 fill:#4CAF50 +# style AM2 fill:#4CAF50 +# style AM3 fill:#4CAF50 + +###################################################################### +# Service Call to Actor Execution +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# .. mermaid:: +# +# graph TD +# subgraph CallFlow["Complete Call Flow"] +# UserCall["await policy_service.generate.route('What is 2+2?')"] +# +# ServiceInterface["ServiceInterface: Receives .route() call, Routes to ServiceActor"] +# +# ServiceActor["ServiceActor: Selects healthy replica, Load balancing, Failure handling"] +# +# SelectedReplica["Selected Replica #2: ProcMesh 1 process, ActorMesh 1 PolicyActor"] +# +# PolicyActor["PolicyActor Instance: Loads model, Runs vLLM inference"] +# +# GPU["GPU 2: vLLM engine, Model weights, KV cache, CUDA kernels"] +# +# UserCall --> ServiceInterface +# ServiceInterface --> ServiceActor +# ServiceActor --> SelectedReplica +# SelectedReplica --> PolicyActor +# PolicyActor --> GPU +# +# GPU -.->|"Response"| PolicyActor +# PolicyActor -.->|"Response"| SelectedReplica +# SelectedReplica -.->|"Response"| ServiceActor +# ServiceActor -.->|"Response"| ServiceInterface +# ServiceInterface -.->|"'The answer is 4'"| UserCall +# end +# +# style UserCall fill:#4CAF50 +# style ServiceActor fill:#FF9800 +# style PolicyActor fill:#9C27B0 +# style GPU fill:#FF5722 + +###################################################################### +# Multiple Services Sharing Infrastructure +# ----------------------------------------- +# +# In real RL systems, you have multiple services that can share or use +# separate ProcMeshes: +# +# .. mermaid:: +# +# graph TD +# subgraph Cluster["RL Training Cluster"] +# subgraph Services["Forge Services"] +# PS["Policy Service
4 GPU replicas"] +# TS["Trainer Service
2 GPU replicas"] +# RS["Reward Service
4 CPU replicas"] +# BS["Buffer Service
1 CPU replica"] +# end +# +# subgraph MonarchInfra["Monarch Infrastructure"] +# subgraph GPUMesh["GPU ProcMesh (6 processes)"] +# G0["Process 0
GPU 0"] +# G1["Process 1
GPU 1"] +# G2["Process 2
GPU 2"] +# G3["Process 3
GPU 3"] +# G4["Process 4
GPU 4"] +# G5["Process 5
GPU 5"] +# end +# +# subgraph CPUMesh["CPU ProcMesh (5 processes)"] +# C0["Process 0
CPU"] +# C1["Process 1
CPU"] +# C2["Process 2
CPU"] +# C3["Process 3
CPU"] +# C4["Process 4
CPU"] +# end +# end +# +# PS --> G0 +# PS --> G1 +# PS --> G2 +# PS --> G3 +# TS --> G4 +# TS --> G5 +# RS --> C0 +# RS --> C1 +# RS --> C2 +# RS --> C3 +# BS --> C4 +# end +# +# style PS fill:#4CAF50 +# style TS fill:#E91E63 +# style RS fill:#FF9800 +# style BS fill:#9C27B0 +# style GPUMesh fill:#FFEBEE +# style CPUMesh fill:#E3F2FD + +###################################################################### +# Key Insights: Why This Architecture Matters +# -------------------------------------------- +# +# 1. **Process Isolation**: Each actor runs in its own process - failures don't cascade +# 2. **Location Transparency**: Actors can be local or remote with identical APIs +# 3. **Structured Distribution**: ProcMesh maps directly to hardware topology +# 4. **Message Passing**: No shared memory means no race conditions or locks +# 5. **Service Abstraction**: Forge hides Monarch complexity while preserving power +# +# Understanding this hierarchy helps you: +# +# * **Debug performance issues**: Is the bottleneck at service, actor, or hardware level? +# * **Optimize resource usage**: How many replicas per service? GPU vs CPU processes? +# * **Handle failures gracefully**: Which layer failed and how to recover? +# * **Scale effectively**: Where to add resources for maximum impact? + + +def demonstrate_architecture_benefits(): + """Example showing why the architecture matters.""" + # Process Isolation: Failures don't cascade + # If one PolicyActor crashes, others continue serving + + # Location Transparency: Same API whether local or remote + # await policy.generate.route(prompt) # Works same everywhere + + # Structured Distribution: ProcMesh maps to hardware + # per_host={"gpus": 8} creates 8 processes, 1 per GPU + + # Message Passing: No locks needed + # Each actor processes messages sequentially, naturally thread-safe + + # Service Abstraction: Simple interface, powerful backend + # await service.method.route() hides all distribution complexity + pass + + +###################################################################### +# Conclusion +# ---------- +# +# What You've Learned +# ~~~~~~~~~~~~~~~~~~~ +# +# 1. **RL Fundamentals**: How RL concepts map to Forge services with real examples +# 2. **Service Abstraction**: How to use Forge services effectively +# 3. **Monarch Foundation**: How Forge services connect to distributed actors and hardware +# +# Key Takeaways +# ~~~~~~~~~~~~~ +# +# * **Services hide complexity**: Your RL code looks like simple async functions, +# but runs on distributed clusters +# * **Communication patterns matter**: ``.route()``, ``.fanout()``, sessions, +# and ``.call_one()`` each serve specific purposes +# * **Architecture understanding helps**: Knowing the Service → Actor → Process → +# Hardware hierarchy helps you debug, optimize, and scale +# * **Always verify APIs**: This guide is verified, but cross-check with source +# code for latest changes +# * **Real API patterns**: Use ``.options().as_service()`` not ``spawn_service()``, +# use ``.route()`` not ``.choose()``, etc. 
+# +# Further Reading +# --------------- +# +# * Review :doc:`1_RL_and_Forge_Fundamentals` for RL concepts +# * Review :doc:`2_Forge_Internals` for service patterns +# * Check the `Forge source code `_ +# * Explore the `GRPO application `_ +# * Read about `Monarch `_ for deeper understanding diff --git a/docs/Tutorials/ReadMe.MD b/docs/source/tutorial_sources/zero-to-forge/README.md similarity index 57% rename from docs/Tutorials/ReadMe.MD rename to docs/source/tutorial_sources/zero-to-forge/README.md index 084710853..9be7e3b6a 100644 --- a/docs/Tutorials/ReadMe.MD +++ b/docs/source/tutorial_sources/zero-to-forge/README.md @@ -1,4 +1,4 @@ -## Zero to Forge: From RL Theory to Production-Scale Implementation +# Zero to Forge: From RL Theory to Production-Scale Implementation A comprehensive guide for ML Engineers building distributed RL systems for language models. @@ -6,14 +6,21 @@ Some of the examples mentioned below will be conceptual in nature for understand Welcome to the Tutorials section! This section is inspired by the A-Z PyTorch tutorial, shoutout to our PyTorch friends that remember! -### +## Tutorial Structure This section currently is structured in 3 detailed parts: -1. [RL Fundamentals and Understanding Forge Terminology](./1_RL_and_Forge_Fundamentals.MD): This gives a quick refresher of Reinforcement Learning and teaches you Forge Fundamentals -2. [Forge Internals](./2_Forge_Internals.MD): Goes a layer deeper and explains the internals of Forge -3. [Monarch 101](./3_Monarch_101.MD): It's a 101 to Monarch and how Forge Talks to Monarch +1. [RL Fundamentals and Understanding Forge Terminology](1_RL_and_Forge_Fundamentals.html): This gives a quick refresher of Reinforcement Learning and teaches you Forge Fundamentals +2. [Forge Internals](2_Forge_Internals.html): Goes a layer deeper and explains the internals of Forge +3. [Monarch 101](3_Monarch_101.html): It's a 101 to Monarch and how Forge Talks to Monarch -Each part builds upon the next and the entire section can be consumed in roughly an hour-Grab a Chai and Enjoy! +Each part builds upon the next and the entire section can be consumed in roughly an hour - Grab a Chai and Enjoy! -If you're eager, please checkout our SFT Tutorial too (Coming soon!) as well as [App Examples](../../apps/). +If you're eager, please checkout our SFT Tutorial too (Coming soon!)! + +.. toctree:: + :maxdepth: 1 + + 1_RL_and_Forge_Fundamentals + 2_Forge_Internals + 3_Monarch_101 diff --git a/docs/source/tutorials.md b/docs/source/tutorials.md index 6e06c636a..42339dfcb 100644 --- a/docs/source/tutorials.md +++ b/docs/source/tutorials.md @@ -1,9 +1,10 @@ # Tutorials - This section provides step-by-step guides to help you master TorchForge's capabilities, - from basic model fine-tuning to advanced distributed training scenarios. +This section provides step-by-step guides to help you master TorchForge's capabilities, +from basic model fine-tuning to advanced distributed training scenarios. ```{toctree} :maxdepth: 1 +zero-to-forge-intro ``` diff --git a/docs/source/zero-to-forge-intro.md b/docs/source/zero-to-forge-intro.md new file mode 100644 index 000000000..e56edc663 --- /dev/null +++ b/docs/source/zero-to-forge-intro.md @@ -0,0 +1,28 @@ +# Zero to Forge: From RL Theory to Production-Scale Implementation + +A comprehensive guide for ML Engineers building distributed RL systems for language models. + +Some of the examples mentioned below will be conceptual in nature for understanding. Please refer to API Docs (Coming Soon!) 
for more details + +Welcome to the Tutorials section! This section is inspired by the A-Z PyTorch tutorial, shoutout to our PyTorch friends that remember! + +## Tutorial Structure + +This section currently is structured in 3 detailed parts: + +1. [RL Fundamentals and Understanding Forge Terminology](tutorials/zero-to-forge/1_RL_and_Forge_Fundamentals.html): This gives a quick refresher of Reinforcement Learning and teaches you Forge Fundamentals +2. [Forge Internals](tutorials/zero-to-forge/2_Forge_Internals.html): Goes a layer deeper and explains the internals of Forge +3. [Monarch 101](tutorials/zero-to-forge/3_Monarch_101.html): It's a 101 to Monarch and how Forge Talks to Monarch + +Each part builds upon the next and the entire section can be consumed in roughly an hour - Grab a Chai and Enjoy! + +If you're eager, please checkout our SFT Tutorial too (Coming soon!)! + +```{toctree} +:maxdepth: 1 +:hidden: + +tutorials/zero-to-forge/1_RL_and_Forge_Fundamentals +tutorials/zero-to-forge/2_Forge_Internals +tutorials/zero-to-forge/3_Monarch_101 +``` From 6d2cb580bda321935a6947605f2f4fdb8b8e0d55 Mon Sep 17 00:00:00 2001 From: Svetlana Karslioglu Date: Tue, 14 Oct 2025 12:11:38 -0700 Subject: [PATCH 22/28] Update --- docs/source/tutorial_sources/zero-to-forge/README.md | 6 +++--- docs/source/zero-to-forge-intro.md | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/source/tutorial_sources/zero-to-forge/README.md b/docs/source/tutorial_sources/zero-to-forge/README.md index 9be7e3b6a..f32b01c20 100644 --- a/docs/source/tutorial_sources/zero-to-forge/README.md +++ b/docs/source/tutorial_sources/zero-to-forge/README.md @@ -10,9 +10,9 @@ Welcome to the Tutorials section! This section is inspired by the A-Z PyTorch tu This section currently is structured in 3 detailed parts: -1. [RL Fundamentals and Understanding Forge Terminology](1_RL_and_Forge_Fundamentals.html): This gives a quick refresher of Reinforcement Learning and teaches you Forge Fundamentals -2. [Forge Internals](2_Forge_Internals.html): Goes a layer deeper and explains the internals of Forge -3. [Monarch 101](3_Monarch_101.html): It's a 101 to Monarch and how Forge Talks to Monarch +1. [RL Fundamentals and Understanding Forge Terminology](1_RL_and_Forge_Fundamentals): This gives a quick refresher of Reinforcement Learning and teaches you Forge Fundamentals +2. [Forge Internals](2_Forge_Internals): Goes a layer deeper and explains the internals of Forge +3. [Monarch 101](3_Monarch_101): It's a 101 to Monarch and how Forge Talks to Monarch Each part builds upon the next and the entire section can be consumed in roughly an hour - Grab a Chai and Enjoy! diff --git a/docs/source/zero-to-forge-intro.md b/docs/source/zero-to-forge-intro.md index e56edc663..c7c31fdf1 100644 --- a/docs/source/zero-to-forge-intro.md +++ b/docs/source/zero-to-forge-intro.md @@ -10,9 +10,9 @@ Welcome to the Tutorials section! This section is inspired by the A-Z PyTorch tu This section currently is structured in 3 detailed parts: -1. [RL Fundamentals and Understanding Forge Terminology](tutorials/zero-to-forge/1_RL_and_Forge_Fundamentals.html): This gives a quick refresher of Reinforcement Learning and teaches you Forge Fundamentals -2. [Forge Internals](tutorials/zero-to-forge/2_Forge_Internals.html): Goes a layer deeper and explains the internals of Forge -3. [Monarch 101](tutorials/zero-to-forge/3_Monarch_101.html): It's a 101 to Monarch and how Forge Talks to Monarch +1. 
[RL Fundamentals and Understanding Forge Terminology](tutorials/zero-to-forge/1_RL_and_Forge_Fundamentals): This gives a quick refresher of Reinforcement Learning and teaches you Forge Fundamentals +2. [Forge Internals](tutorials/zero-to-forge/2_Forge_Internals): Goes a layer deeper and explains the internals of Forge +3. [Monarch 101](tutorials/zero-to-forge/3_Monarch_101): It's a 101 to Monarch and how Forge Talks to Monarch Each part builds upon the next and the entire section can be consumed in roughly an hour - Grab a Chai and Enjoy! From 42aef41e2623c3c155db9e8980ee061b4eea84ca Mon Sep 17 00:00:00 2001 From: Svetlana Karslioglu Date: Tue, 14 Oct 2025 12:31:37 -0700 Subject: [PATCH 23/28] Precommit --- .../1_RL_and_Forge_Fundamentals.py | 28 +++--- .../zero-to-forge/2_Forge_Internals.py | 86 ++++++++++--------- .../zero-to-forge/3_Monarch_101.py | 29 +++---- 3 files changed, 75 insertions(+), 68 deletions(-) diff --git a/docs/source/tutorial_sources/zero-to-forge/1_RL_and_Forge_Fundamentals.py b/docs/source/tutorial_sources/zero-to-forge/1_RL_and_Forge_Fundamentals.py index 08f7193c0..08c0f3335 100644 --- a/docs/source/tutorial_sources/zero-to-forge/1_RL_and_Forge_Fundamentals.py +++ b/docs/source/tutorial_sources/zero-to-forge/1_RL_and_Forge_Fundamentals.py @@ -32,7 +32,7 @@ # Core RL Components in Forge # ---------------------------- # -# Let's start with a simple math tutoring example to understand RL concepts +# Let's start with a simple math tutoring example to understand RL concepts # with the exact names Forge uses: # # The Toy Example: Teaching Math @@ -108,7 +108,7 @@ def conceptual_rl_step(): # From Concepts to Forge Services # -------------------------------- # -# Here's the key insight: **Each RL component becomes a Forge service**. +# Here's the key insight: **Each RL component becomes a Forge service**. # The toy example above maps directly to Forge: # # .. mermaid:: @@ -148,7 +148,7 @@ def conceptual_rl_step(): # RL Step with Forge Services # ---------------------------- # -# Let's look at the example from above again, but this time we use the +# Let's look at the example from above again, but this time we use the # actual Forge API names: import asyncio @@ -249,7 +249,7 @@ async def conceptual_forge_rl_step(services, step): # Enter Forge: RL-Native Architecture # ------------------------------------ # -# Forge solves these problems by treating each RL component as an +# Forge solves these problems by treating each RL component as an # **independent, distributed unit**. 
# # Quick API Reference (covered in detail in Part 2): @@ -322,8 +322,8 @@ async def example_automatic_management(policy): # For actual imports, see apps/grpo/main.py try: from forge.actors.policy import Policy - from forge.actors.replay_buffer import ReplayBuffer from forge.actors.reference_model import ReferenceModel + from forge.actors.replay_buffer import ReplayBuffer from forge.actors.trainer import RLTrainer from forge.data.rewards import MathReward, ThinkingReward @@ -406,7 +406,11 @@ async def setup_forge_services(): ), # Trainer actor with GPU RLTrainer.options(procs=1, with_gpus=True).as_actor( - model={"name": "qwen3", "flavor": "1.7B", "hf_assets_path": f"hf://{model}"}, + model={ + "name": "qwen3", + "flavor": "1.7B", + "hf_assets_path": f"hf://{model}", + }, optimizer={"name": "AdamW", "lr": 1e-5}, training={"local_batch_size": 2, "seq_len": 2048}, ), @@ -418,7 +422,11 @@ async def setup_forge_services(): ComputeAdvantages.options(procs=1).as_actor(), # Reference model with GPU ReferenceModel.options(procs=1, with_gpus=True).as_actor( - model={"name": "qwen3", "flavor": "1.7B", "hf_assets_path": f"hf://{model}"}, + model={ + "name": "qwen3", + "flavor": "1.7B", + "hf_assets_path": f"hf://{model}", + }, training={"dtype": "bfloat16"}, ), # Reward actor (CPU) @@ -444,12 +452,12 @@ async def setup_forge_services(): # # Forge has two types of distributed components: # -# * **Services**: Multiple replicas with automatic load balancing +# * **Services**: Multiple replicas with automatic load balancing # (like Policy, RewardActor) -# * **Actors**: Single instances that handle their own internal +# * **Actors**: Single instances that handle their own internal # distribution (like RLTrainer, ReplayBuffer) # -# We cover this distinction in detail in Part 2, but for now this +# We cover this distinction in detail in Part 2, but for now this # explains the scaling patterns: # # * Policy service: ``num_replicas=8`` for high inference demand diff --git a/docs/source/tutorial_sources/zero-to-forge/2_Forge_Internals.py b/docs/source/tutorial_sources/zero-to-forge/2_Forge_Internals.py index efecfdc72..6c4be76f6 100644 --- a/docs/source/tutorial_sources/zero-to-forge/2_Forge_Internals.py +++ b/docs/source/tutorial_sources/zero-to-forge/2_Forge_Internals.py @@ -27,10 +27,10 @@ * Understanding of Python async/await * Basic distributed systems knowledge -We highly recommend reading Part 1 before this - it explains RL Concepts +We highly recommend reading Part 1 before this - it explains RL Concepts and how they land in Forge. -Now that you see the power of the service abstraction, let's understand +Now that you see the power of the service abstraction, let's understand what's actually happening under the hood. Grab your chai! """ @@ -38,7 +38,7 @@ # Service Anatomy: Beyond the Interface # -------------------------------------- # -# When you call ``await policy_service.generate(question)``, here's what +# When you call ``await policy_service.generate(question)``, here's what # actually happens: # # (Don't worry, we will understand Services right in the next section!) @@ -100,7 +100,7 @@ # 2. Real Service Creation # ~~~~~~~~~~~~~~~~~~~~~~~~ # -# Services are created using the ``.options().as_service()`` pattern +# Services are created using the ``.options().as_service()`` pattern # from the actual GRPO implementation. # # The service creation automatically handles: @@ -152,11 +152,11 @@ async def example_service_creation(): # 3. 
How Services Actually Work # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # -# Forge services are implemented as ServiceActors that manage +# Forge services are implemented as ServiceActors that manage # collections of your ForgeActor replicas. # -# When you call ``.as_service()``, Forge creates a ``ServiceInterface`` -# that manages N replicas of your ``ForgeActor`` class and gives you +# When you call ``.as_service()``, Forge creates a ``ServiceInterface`` +# that manages N replicas of your ``ForgeActor`` class and gives you # methods like ``.route()``, ``.fanout()``, etc. @@ -215,14 +215,14 @@ async def service_interface_example(policy): # Deep Dive: Service Communication Patterns # ------------------------------------------ # -# These communication patterns ("adverbs") determine how your service -# calls are routed to replicas. Understanding when to use each pattern +# These communication patterns ("adverbs") determine how your service +# calls are routed to replicas. Understanding when to use each pattern # is key to effective Forge usage. # # 1. ``.route()`` - Load Balanced Single Replica # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # -# **When to use**: Normal request routing where any replica can handle +# **When to use**: Normal request routing where any replica can handle # the request. @@ -246,7 +246,7 @@ async def route_example(policy): # * **Throughput**: Limited by single replica capacity # * **Fault tolerance**: Automatic failover to other replicas # -# **Critical insight**: ``.route()`` is your default choice for +# **Critical insight**: ``.route()`` is your default choice for # stateless operations in Forge services. ###################################################################### @@ -273,7 +273,7 @@ async def fanout_example(policy): # * **Throughput**: Network bandwidth × number of replicas # * **Fault tolerance**: Fails if ANY replica fails (unless configured) # -# **Critical gotcha**: Don't use ``.fanout()`` for high-frequency +# **Critical gotcha**: Don't use ``.fanout()`` for high-frequency # operations - it contacts all replicas. ###################################################################### @@ -306,18 +306,18 @@ async def streaming_pattern_example(replay_buffer, trainer, step): # * **Throughput**: Non-blocking async operations # * **Fault tolerance**: Continues if some replicas fail # -# **Critical insight**: Essential for high-throughput RL where +# **Critical insight**: Essential for high-throughput RL where # you can't wait for batches. ###################################################################### # Service Sessions for Stateful Operations # ----------------------------------------- # -# **When to use**: When you need multiple calls to hit the same replica +# **When to use**: When you need multiple calls to hit the same replica # (like KV cache preservation). # -# **What are sticky sessions?** A session ensures all your service calls -# within the ``async with`` block go to the same replica, instead of +# **What are sticky sessions?** A session ensures all your service calls +# within the ``async with`` block go to the same replica, instead of # being load-balanced across different replicas. # Mock classes for example @@ -396,13 +396,13 @@ async def with_sessions_example(): # Deep Dive: State Management Reality # ------------------------------------ # -# The most complex challenge in distributed RL is maintaining state +# The most complex challenge in distributed RL is maintaining state # consistency while maximizing performance. 
# # The KV Cache Problem # ~~~~~~~~~~~~~~~~~~~~ # -# **The challenge**: Policy inference is much faster with KV cache, +# **The challenge**: Policy inference is much faster with KV cache, # but cache is tied to specific conversation history. @@ -415,9 +415,7 @@ async def naive_multi_turn(policy_service): full_prompt = question1 + response1[0].text response2 = await policy_service.generate.route(prompt=full_prompt) # Cache miss! conversation = full_prompt + response2[0].text - response3 = await policy_service.generate.route( - prompt=conversation - ) # Cache miss! + response3 = await policy_service.generate.route(prompt=conversation) # Cache miss! async def optimized_multi_turn(policy): @@ -434,17 +432,17 @@ async def optimized_multi_turn(policy): # Session ends, replica can be garbage collected or reused -# **Performance impact**: Maintaining KV cache across turns avoids +# **Performance impact**: Maintaining KV cache across turns avoids # recomputing previous tokens. ###################################################################### # Replay Buffer Consistency # ~~~~~~~~~~~~~~~~~~~~~~~~~~ # -# **The challenge**: Multiple trainers and experience collectors +# **The challenge**: Multiple trainers and experience collectors # reading/writing concurrently. # -# **Real Forge approach**: The ReplayBuffer actor handles concurrency +# **Real Forge approach**: The ReplayBuffer actor handles concurrency # internally: @@ -466,14 +464,14 @@ async def replay_buffer_example(replay_buffer): # state = await replay_buffer.state_dict.call_one() # Checkpoint -# **Critical insight**: The actor model provides natural thread safety - +# **Critical insight**: The actor model provides natural thread safety - # each actor processes messages sequentially. ###################################################################### # Weight Synchronization Strategy # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # -# **The challenge**: Trainer updates policy weights, but policy service +# **The challenge**: Trainer updates policy weights, but policy service # needs those weights. import torch @@ -497,18 +495,24 @@ async def real_weight_sync(trainer, policy, step): # Deep Dive: Asynchronous Coordination Patterns # ---------------------------------------------- # -# **The real challenge**: Different services run at different speeds, +# **The real challenge**: Different services run at different speeds, # but Forge's service abstraction handles the coordination complexity. # # The Forge Approach: Let Services Handle Coordination # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # -# Instead of manual coordination, Forge services handle speed mismatches +# Instead of manual coordination, Forge services handle speed mismatches # automatically: async def simple_rl_step( - dataloader, policy, ref_model, reward_actor, replay_buffer, compute_advantages, trainer + dataloader, + policy, + ref_model, + reward_actor, + replay_buffer, + compute_advantages, + trainer, ): """Simple RL step showing service coordination.""" # ===== Generate a rollout ===== @@ -561,24 +565,17 @@ async def simple_rl_step( # Handling Speed Mismatches with Service Scaling # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # -# **The insight**: Scale services independently based on their +# **The insight**: Scale services independently based on their # bottlenecks. 
# Mock imports for example try: - from forge.actors.policy import Policy from forge.actors.trainer import RLTrainer except ImportError: - class Policy: - pass - class RLTrainer: pass - class RewardActor: - pass - async def scaling_example(): """Scale services independently based on bottlenecks.""" @@ -598,7 +595,8 @@ async def scaling_example(): trainer = await RLTrainer.options( procs=1, with_gpus=True # Fewer but GPU-heavy ).as_actor( # Trainer typically uses .as_actor() not .as_service() - model={"name": "qwen3", "flavor": "1.7B"}, optimizer={"name": "AdamW", "lr": 1e-5} + model={"name": "qwen3", "flavor": "1.7B"}, + optimizer={"name": "AdamW", "lr": 1e-5}, ) @@ -611,8 +609,8 @@ async def scaling_example(): # Mock imports try: from forge.controller import ForgeActor - from monarch.actor import endpoint from forge.data.rewards import MathReward, ThinkingReward + from monarch.actor import endpoint except ImportError: class ForgeActor: @@ -647,7 +645,9 @@ async def evaluate_response(self, prompt: str, response: str, target: str) -> fl total_reward += reward # Return average reward across all functions - return total_reward / len(self.reward_functions) if self.reward_functions else 0.0 + return ( + total_reward / len(self.reward_functions) if self.reward_functions else 0.0 + ) async def reward_service_example(): @@ -703,7 +703,9 @@ async def production_training_loop(): # Reward evaluation service call reward = await reward_actor.evaluate_response.route( - prompt=sample["question"], response=responses[0].text, target=sample["answer"] + prompt=sample["question"], + response=responses[0].text, + target=sample["answer"], ) # Experience storage @@ -734,7 +736,7 @@ async def production_training_loop(): # 4. **Resource efficiency**: CPU and GPU services scale independently # 5. **Coordination**: Services coordinate through shared state (replay buffer, weight versions) # -# This is the power of the service abstraction - complex distributed +# This is the power of the service abstraction - complex distributed # coordination looks like simple async Python code. ###################################################################### diff --git a/docs/source/tutorial_sources/zero-to-forge/3_Monarch_101.py b/docs/source/tutorial_sources/zero-to-forge/3_Monarch_101.py index d7c8f86e8..09e61e154 100644 --- a/docs/source/tutorial_sources/zero-to-forge/3_Monarch_101.py +++ b/docs/source/tutorial_sources/zero-to-forge/3_Monarch_101.py @@ -32,8 +32,8 @@ * Part 1: [RL Concepts and how they map to Forge](1_RL_and_Forge_Fundamentals) * Part 2: [Forge Internals](2_Forge_Internals) -Now let's peel back the layers. Forge services are built on top of -**Monarch**, PyTorch's distributed actor framework. Understanding this +Now let's peel back the layers. Forge services are built on top of +**Monarch**, PyTorch's distributed actor framework. Understanding this connection is crucial for optimization and debugging. """ @@ -83,8 +83,8 @@ # Deep Dive: ProcMesh - The Foundation # ------------------------------------- # -# **ProcMesh** is Monarch's core abstraction for organizing processes -# across hardware. Think of it as a multi-dimensional grid that maps +# **ProcMesh** is Monarch's core abstraction for organizing processes +# across hardware. Think of it as a multi-dimensional grid that maps # directly to your cluster topology. # # Single Host ProcMesh @@ -175,12 +175,9 @@ # # This shows the underlying actor system that powers Forge services. 
-import asyncio - # Mock imports for documentation build try: - from monarch.actor import Actor, endpoint, this_proc, Future - from monarch.actor import ProcMesh, this_host + from monarch.actor import Actor, endpoint, Future, ProcMesh, this_host, this_proc except ImportError: class Actor: @@ -264,7 +261,7 @@ async def distributed_actors_example(): # Actor Meshes: Your Code Running Distributed # -------------------------------------------- # -# **ActorMesh** is created when you spawn actors across a ProcMesh. +# **ActorMesh** is created when you spawn actors across a ProcMesh. # Each process in the ProcMesh gets one instance of your actor. # # .. mermaid:: @@ -352,7 +349,7 @@ async def distributed_actors_example(): # How Forge Services Use Monarch # ------------------------------- # -# Now the key insight: **Forge services are ServiceActors that manage +# Now the key insight: **Forge services are ServiceActors that manage # ActorMeshes of your ForgeActor replicas**. # # The Service Creation Process @@ -447,7 +444,7 @@ async def distributed_actors_example(): # Multiple Services Sharing Infrastructure # ----------------------------------------- # -# In real RL systems, you have multiple services that can share or use +# In real RL systems, you have multiple services that can share or use # separate ProcMeshes: # # .. mermaid:: @@ -551,15 +548,15 @@ def demonstrate_architecture_benefits(): # Key Takeaways # ~~~~~~~~~~~~~ # -# * **Services hide complexity**: Your RL code looks like simple async functions, +# * **Services hide complexity**: Your RL code looks like simple async functions, # but runs on distributed clusters -# * **Communication patterns matter**: ``.route()``, ``.fanout()``, sessions, +# * **Communication patterns matter**: ``.route()``, ``.fanout()``, sessions, # and ``.call_one()`` each serve specific purposes -# * **Architecture understanding helps**: Knowing the Service → Actor → Process → +# * **Architecture understanding helps**: Knowing the Service → Actor → Process → # Hardware hierarchy helps you debug, optimize, and scale -# * **Always verify APIs**: This guide is verified, but cross-check with source +# * **Always verify APIs**: This guide is verified, but cross-check with source # code for latest changes -# * **Real API patterns**: Use ``.options().as_service()`` not ``spawn_service()``, +# * **Real API patterns**: Use ``.options().as_service()`` not ``spawn_service()``, # use ``.route()`` not ``.choose()``, etc. # # Further Reading From 0296c34bda43fdb0d681922059bdddb974503776 Mon Sep 17 00:00:00 2001 From: Svetlana Karslioglu Date: Tue, 14 Oct 2025 14:09:02 -0700 Subject: [PATCH 24/28] Update --- .../1_RL_and_Forge_Fundamentals.py | 145 ++--- .../zero-to-forge/2_Forge_Internals.py | 563 +++++++++--------- 2 files changed, 343 insertions(+), 365 deletions(-) diff --git a/docs/source/tutorial_sources/zero-to-forge/1_RL_and_Forge_Fundamentals.py b/docs/source/tutorial_sources/zero-to-forge/1_RL_and_Forge_Fundamentals.py index 08c0f3335..cbdb1fe5f 100644 --- a/docs/source/tutorial_sources/zero-to-forge/1_RL_and_Forge_Fundamentals.py +++ b/docs/source/tutorial_sources/zero-to-forge/1_RL_and_Forge_Fundamentals.py @@ -191,6 +191,15 @@ async def conceptual_forge_rl_step(services, step): return loss +###################################################################### +# **Key difference**: Same RL logic, but each component is now a distributed, +# # fault-tolerant, auto-scaling service. + +# Did you realise-we are not worrying about any Infra code here! 
Forge +# # Automagically handles the details behind the scenes and you can focus on +# # writing your RL Algorthms! + + ###################################################################### # Why This Matters: Traditional ML Infrastructure Fails @@ -296,6 +305,10 @@ async def real_rl_training_step(services, step): return loss +##################################################################### +# **Key insight**: Each line of RL pseudocode becomes a service call. +# The complexity of distribution, scaling, and fault tolerance is hidden +# behind these simple interfaces. ###################################################################### # What Makes This Powerful @@ -311,6 +324,15 @@ async def example_automatic_management(policy): answer = responses[0].text return answer +###################################################################### +# Forge handles behind the scenes: +# +# - Routing to least loaded replica +# - GPU memory management +# - Batch optimization +# - Failure recovery +# - Auto-scaling based on demand + ###################################################################### # Independent Scaling @@ -320,67 +342,27 @@ async def example_automatic_management(policy): # Note: This is example code showing the Forge API # For actual imports, see apps/grpo/main.py -try: - from forge.actors.policy import Policy - from forge.actors.reference_model import ReferenceModel - from forge.actors.replay_buffer import ReplayBuffer - from forge.actors.trainer import RLTrainer - from forge.data.rewards import MathReward, ThinkingReward - - # Mock classes for the example - class DatasetActor: - pass - - class RewardActor: - pass - - class ComputeAdvantages: - pass - -except ImportError: - # Provide mock classes if imports fail during doc build - class Policy: - pass - - class ReplayBuffer: - pass - - class ReferenceModel: - pass - - class RLTrainer: - pass - - class DatasetActor: - pass - - class RewardActor: - pass - - class ComputeAdvantages: - pass - - class MathReward: - pass - - class ThinkingReward: - pass - - -async def setup_forge_services(): - """Configure Forge services with independent scaling.""" - model = "Qwen/Qwen3-1.7B" - group_size = 1 - - ( - dataloader, - policy, - trainer, - replay_buffer, - compute_advantages, - ref_model, - reward_actor, - ) = await asyncio.gather( +from forge.actors.policy import Policy +from forge.actors.replay_buffer import ReplayBuffer +from forge.actors.reference_model import ReferenceModel +from forge.actors.trainer import RLTrainer +from apps.grpo.main import DatasetActor, RewardActor, ComputeAdvantages +from forge.data.rewards import MathReward, ThinkingReward +import asyncio +import torch + +model = "Qwen/Qwen3-1.7B" +group_size = 1 + +( + dataloader, + policy, + trainer, + replay_buffer, + compute_advantages, + ref_model, + reward_actor, +) = await asyncio.gather( # Dataset actor (CPU) DatasetActor.options(procs=1).as_actor( path="openai/gsm8k", @@ -389,62 +371,47 @@ async def setup_forge_services(): streaming=True, model=model, ), - # Policy service with GPU and multiple replicas + # Policy service with GPU Policy.options(procs=1, with_gpus=True, num_replicas=1).as_service( engine_config={ "model": model, "tensor_parallel_size": 1, "pipeline_parallel_size": 1, - "enforce_eager": False, + "enforce_eager": False }, sampling_config={ "n": group_size, "max_tokens": 16, "temperature": 1.0, - "top_p": 1.0, - }, + "top_p": 1.0 + } ), # Trainer actor with GPU RLTrainer.options(procs=1, with_gpus=True).as_actor( - model={ - "name": "qwen3", - 
"flavor": "1.7B", - "hf_assets_path": f"hf://{model}", - }, + # Trainer config would come from YAML in real usage + model={"name": "qwen3", "flavor": "1.7B", "hf_assets_path": f"hf://{model}"}, optimizer={"name": "AdamW", "lr": 1e-5}, - training={"local_batch_size": 2, "seq_len": 2048}, + training={"local_batch_size": 2, "seq_len": 2048} ), # Replay buffer (CPU) ReplayBuffer.options(procs=1).as_actor( - batch_size=2, max_policy_age=1, dp_size=1 + batch_size=2, + max_policy_age=1, + dp_size=1 ), # Advantage computation (CPU) ComputeAdvantages.options(procs=1).as_actor(), # Reference model with GPU ReferenceModel.options(procs=1, with_gpus=True).as_actor( - model={ - "name": "qwen3", - "flavor": "1.7B", - "hf_assets_path": f"hf://{model}", - }, - training={"dtype": "bfloat16"}, + model={"name": "qwen3", "flavor": "1.7B", "hf_assets_path": f"hf://{model}"}, + training={"dtype": "bfloat16"} ), # Reward actor (CPU) RewardActor.options(procs=1, num_replicas=1).as_service( reward_functions=[MathReward(), ThinkingReward()] - ), + ) ) - return { - "dataloader": dataloader, - "policy": policy, - "trainer": trainer, - "replay_buffer": replay_buffer, - "compute_advantages": compute_advantages, - "ref_model": ref_model, - "reward_actor": reward_actor, - } - ###################################################################### # Forge Components: Services vs Actors diff --git a/docs/source/tutorial_sources/zero-to-forge/2_Forge_Internals.py b/docs/source/tutorial_sources/zero-to-forge/2_Forge_Internals.py index 6c4be76f6..0a1e57c91 100644 --- a/docs/source/tutorial_sources/zero-to-forge/2_Forge_Internals.py +++ b/docs/source/tutorial_sources/zero-to-forge/2_Forge_Internals.py @@ -27,8 +27,8 @@ * Understanding of Python async/await * Basic distributed systems knowledge -We highly recommend reading Part 1 before this - it explains RL Concepts -and how they land in Forge. +We highly recommend completing Part 1 before starting this tutorial. +Part 1 explains RL Concepts and how they land in Forge. Now that you see the power of the service abstraction, let's understand what's actually happening under the hood. Grab your chai! @@ -85,16 +85,19 @@ # 1. Real Service Configuration # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # -# Here's the actual ServiceConfig from Forge source code: +# Here's the actual ``ServiceConfig`` from Forge source code: # Configuration pattern from apps/grpo/main.py: -# Policy.options( -# procs=1, # Processes per replica -# num_replicas=4, # Number of replicas -# with_gpus=True # Allocate GPUs -# # Other available options: -# # hosts=None # the number of remote hosts used per replica -# ) +# +# .. code-block:: python +# +# Policy.options( +# procs=1, # Processes per replica +# num_replicas=4, # Number of replicas +# with_gpus=True # Allocate GPUs +# # Other available options: +# # hosts=None # the number of remote hosts used per replica +# ) ###################################################################### # 2. 
Real Service Creation @@ -106,65 +109,56 @@ # The service creation automatically handles: # # * Spawning actor replicas across processes/GPUs -# * Load balancing with .route() method for services +# * Load balancing with ``.route()`` method for services # * Health monitoring and failure recovery # * Message routing and serialization -import asyncio - -# Mock imports for documentation build -try: - from forge.actors.policy import Policy -except ImportError: - - class Policy: - pass - - -async def example_service_creation(): - """Example of creating a Policy service.""" - model = "Qwen/Qwen3-1.7B" - - policy = await Policy.options(procs=1, with_gpus=True, num_replicas=1).as_service( - engine_config={ - "model": model, - "tensor_parallel_size": 1, - "pipeline_parallel_size": 1, - "enforce_eager": False, - }, - sampling_config={ - "n": 1, - "max_tokens": 16, - "temperature": 1.0, - "top_p": 1.0, - }, - ) +from forge.actors.policy import Policy + +model = "Qwen/Qwen3-1.7B" + +policy = await Policy.options( + procs=1, + with_gpus=True, + num_replicas=1 +).as_service( + engine_config={ + "model": model, + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "enforce_eager": False + }, + sampling_config={ + "n": 1, + "max_tokens": 16, + "temperature": 1.0, + "top_p": 1.0 + } +) - prompt = "What is 3 + 5?" - responses = await policy.generate.route(prompt) - print(f"Response: {responses[0].text}") +prompt = "What is 3 + 5?" +responses = await policy.generate.route(prompt) +print(f"Response: {responses[0].text}") - # Cleanup when done - await policy.shutdown() +# Cleanup when done +await policy.shutdown() ###################################################################### # 3. How Services Actually Work # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # -# Forge services are implemented as ServiceActors that manage -# collections of your ForgeActor replicas. +# Forge services are implemented as ``ServiceActors`` that manage +# collections of your ``ForgeActor`` replicas. # # When you call ``.as_service()``, Forge creates a ``ServiceInterface`` # that manages N replicas of your ``ForgeActor`` class and gives you # methods like ``.route()``, ``.fanout()``, etc. -async def service_interface_example(policy): - """Your code sees this simple interface.""" - # Simple call - but Forge handles all complexity - responses = await policy.generate.route(prompt="What is 2+2?") - # Forge handles: replica management, load balancing, fault tolerance +# Your code sees this simple interface: +responses = await policy.generate.route(prompt=prompt) +# But Forge handles all the complexity of replica management, load balancing, and fault tolerance ###################################################################### @@ -225,16 +219,12 @@ async def service_interface_example(policy): # **When to use**: Normal request routing where any replica can handle # the request. +responses = await policy.generate.route(prompt=question) +answer = responses[0].text # Extract text from Completion object -async def route_example(policy): - """Using .route() for load-balanced requests.""" - question = "What is 2+2?" - responses = await policy.generate.route(prompt=question) - answer = responses[0].text # Extract text from Completion object - return answer - - +###################################################################### # Behind the scenes: +# # 1. Health check eliminates failed replicas # 2. Load balancer picks replica (currently round robin) # 3. 
Request routes to that specific replica @@ -256,15 +246,13 @@ async def route_example(policy): # **When to use**: You need responses from ALL replicas. -async def fanout_example(policy): - """Using .fanout() to broadcast to all replicas.""" - # Get version from all policy replicas - current_versions = await policy.get_version.fanout() - # Returns: [version_replica_1, version_replica_2, ...] +# Get version from all policy replicas +current_versions = await policy.get_version.fanout() +# Returns: [version_replica_1, version_replica_2, ...] - # Update weights on all replicas - await policy.update_weights.fanout(new_policy_version=1) - # Broadcasts to all replicas simultaneously +# Update weights on all replicas +await policy.update_weights.fanout(new_policy_version) +# Broadcasts to all replicas simultaneously # **Performance characteristics**: @@ -281,25 +269,22 @@ async def fanout_example(policy): # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # # **When to use**: You want to process results as they arrive. +# +# Streaming requires custom implementation in your training loop. +# The basic ``ReplayBuffer`` doesn't have built-in streaming methods. +# Pattern from apps/grpo/main.py continuous training: +while training: + # This is the real API call pattern + batch = await replay_buffer.sample.call_one(curr_policy_version=step) + if batch is not None: + # Process batch immediately + loss = await trainer.train_step.call_one(batch) + print(f"Training loss: {loss}") + else: + await asyncio.sleep(0.1) # Wait for more data -async def streaming_pattern_example(replay_buffer, trainer, step): - """Streaming pattern for continuous training.""" - # CONCEPTUAL - Streaming requires custom implementation - # Pattern from apps/grpo/main.py continuous training: - - while True: - # This is the real API call pattern - batch = await replay_buffer.sample.call_one(curr_policy_version=step) - if batch is not None: - # Process batch immediately - loss = await trainer.train_step.call_one(batch) - print(f"Training loss: {loss}") - else: - await asyncio.sleep(0.1) # Wait for more data - break # Just for example - - +###################################################################### # **Performance characteristics**: # # * **Latency**: Process first result immediately @@ -319,23 +304,12 @@ async def streaming_pattern_example(replay_buffer, trainer, step): # **What are sticky sessions?** A session ensures all your service calls # within the ``async with`` block go to the same replica, instead of # being load-balanced across different replicas. 
+# This Counter example demonstrates the difference between regular routing and sessions: -# Mock classes for example -try: - from forge.controller import ForgeActor - from monarch.actor import endpoint -except ImportError: - - class ForgeActor: - pass - - def endpoint(func): - return func - +from forge.controller import ForgeActor +from monarch.actor import endpoint class ForgeCounter(ForgeActor): - """Example counter to demonstrate sessions.""" - def __init__(self, initial_value: int): self.value = initial_value @@ -352,45 +326,47 @@ def get_value(self) -> int: async def reset(self): self.value = 0 +counter_service = await ForgeCounter.options( + procs=1, num_replicas=4 +).as_service(initial_value=0) -async def without_sessions_example(): - """WITHOUT SESSIONS: Each .route() goes to different replica.""" - counter_service = await ForgeCounter.options(procs=1, num_replicas=4).as_service( - initial_value=0 - ) - - # Each call might go to different replica - await counter_service.increment.route() # Might go to replica 2 - await counter_service.increment.route() # Might go to replica 1 - await counter_service.increment.route() # Might go to replica 3 +# WITHOUT SESSIONS: Each .route() call goes to a different replica +await counter_service.increment.route() # Might go to replica 2 +await counter_service.increment.route() # Might go to replica 1 +await counter_service.increment.route() # Might go to replica 3 - results = await counter_service.increment.fanout() - print(f"All replica values: {results}") - # Output: All replica values: [1, 2, 1, 1] - # Each replica has different state! +results = await counter_service.increment.fanout() # Get from all replicas +print(f"All replica values: {results}") +# Output: All replica values: [1, 2, 1, 1] - Each replica has different state! - await counter_service.shutdown() +###################################################################### +# The problem: each `.route()` call can go to different replicas, creating inconsistent state. 
+# WITH SESSIONS: All calls go to the SAME replica +print("\nUsing sticky sessions:") +async with counter_service.session(): # Creates a session that picks one replica + await counter_service.reset.route() # Uses .route() within session + print(await counter_service.increment.route()) # 1 + print(await counter_service.increment.route()) # 2 + print(await counter_service.increment.route()) # 3 -async def with_sessions_example(): - """WITH SESSIONS: All calls go to the SAME replica.""" - counter_service = await ForgeCounter.options(procs=1, num_replicas=4).as_service( - initial_value=0 - ) + final_value = await counter_service.get_value.route() + print(f"Final value on this replica: {final_value}") # 3 - print("\nUsing sticky sessions:") - async with counter_service.session(): - await counter_service.reset.route() - print(await counter_service.increment.route()) # 1 - print(await counter_service.increment.route()) # 2 - print(await counter_service.increment.route()) # 3 +###################################################################### +# Same pattern works with Policy for multi-turn conversations: - final_value = await counter_service.get_value.route() - print(f"Final value on this replica: {final_value}") # 3 +async with policy.session(): + response1 = await policy.generate.route(turn1) + full_prompt = turn1 + response1[0].text + turn2 + response2 = await policy.generate.route(full_prompt) + # Both calls hit same replica, preserving KV cache - # Cleanup - await counter_service.shutdown() +# Cleanup +await counter_service.shutdown() +###################################################################### +# **Performance impact**: Critical for maintaining KV cache in multi-turn conversations. ###################################################################### # Deep Dive: State Management Reality @@ -405,17 +381,15 @@ async def with_sessions_example(): # **The challenge**: Policy inference is much faster with KV cache, # but cache is tied to specific conversation history. - -async def naive_multi_turn(policy_service): - """This breaks KV cache optimization.""" - question1 = "What is 2+2?" - +# This breaks KV cache optimization: +async def naive_multi_turn(): # Each call might go to different replica = cache miss - response1 = await policy_service.generate.route(prompt=question1) - full_prompt = question1 + response1[0].text - response2 = await policy_service.generate.route(prompt=full_prompt) # Cache miss! - conversation = full_prompt + response2[0].text - response3 = await policy_service.generate.route(prompt=conversation) # Cache miss! + response1 = await policy_service.generate.choose(question1) + response2 = await policy_service.generate.choose(question1 + response1) # Cache miss! + response3 = await policy_service.generate.choose(conversation_so_far) # Cache miss! + +###################################################################### +# **The solution**: Sticky sessions ensure all calls go to same replica. async def optimized_multi_turn(policy): @@ -431,7 +405,7 @@ async def optimized_multi_turn(policy): # Session ends, replica can be garbage collected or reused - +###################################################################### # **Performance impact**: Maintaining KV cache across turns avoids # recomputing previous tokens. 
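+
+######################################################################
+# To make that claim concrete, here is a minimal back-of-envelope
+# sketch. ``estimate_prefill_tokens`` and the per-turn token counts are
+# illustrative assumptions, not measured Forge numbers; the point is
+# only that a cold cache re-prefills the whole history every turn.
+
+
+def estimate_prefill_tokens(tokens_added_per_turn, cache_reused: bool) -> int:
+    """Total prompt tokens the policy must prefill over a conversation."""
+    total, history = 0, 0
+    for new_tokens in tokens_added_per_turn:
+        # Warm KV cache (same replica): only the new tokens are prefilled.
+        # Cold cache (different replica): the full history is recomputed.
+        total += new_tokens if cache_reused else history + new_tokens
+        history += new_tokens
+    return total
+
+
+turns = [120, 80, 80, 60]  # hypothetical prompt growth per turn
+print(estimate_prefill_tokens(turns, cache_reused=True))  # 340 tokens
+print(estimate_prefill_tokens(turns, cache_reused=False))  # 940 tokens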
@@ -445,23 +419,20 @@ async def optimized_multi_turn(policy): # **Real Forge approach**: The ReplayBuffer actor handles concurrency # internally: +# Forge ReplayBuffer endpoints (verified from source code) +# Add episodes (thread-safe by actor model) +await replay_buffer.add.call_one(episode) # .choose() would work too, but .call_one() clarifies it's a singleton actor not ActorMesh -async def replay_buffer_example(replay_buffer): - """ReplayBuffer provides thread-safe operations.""" - # Add episodes (thread-safe by actor model) - episode = {} # Mock episode - await replay_buffer.add.call_one(episode) - - # Sample batches for training - batch = await replay_buffer.sample.call_one( - curr_policy_version=0, - batch_size=None, # Optional, uses default from config - ) +# Sample batches for training +batch = await replay_buffer.sample.call_one( + curr_policy_version=step_number, + batch_size=None # Optional parameter, uses default from config +) - # Additional methods available: - # await replay_buffer.clear.call_one() # Clear buffer - # await replay_buffer.evict.call_one(curr_policy_version) # Remove old - # state = await replay_buffer.state_dict.call_one() # Checkpoint +# Additional methods available: +# await replay_buffer.clear.call_one() # Clear buffer +# await replay_buffer.evict.call_one(curr_policy_version) # Remove old episodes +# state = await replay_buffer.state_dict.call_one() # Get state for checkpointing # **Critical insight**: The actor model provides natural thread safety - @@ -474,11 +445,8 @@ async def replay_buffer_example(replay_buffer): # **The challenge**: Trainer updates policy weights, but policy service # needs those weights. -import torch - - +# Forge weight synchronization pattern from apps/grpo/main.py async def real_weight_sync(trainer, policy, step): - """Forge weight synchronization pattern from apps/grpo/main.py.""" # Trainer pushes weights to TorchStore with version number await trainer.push_weights.call_one(policy_version=step + 1) @@ -486,10 +454,9 @@ async def real_weight_sync(trainer, policy, step): # Use .fanout() to update ALL policy replicas await policy.update_weights.fanout(policy_version=step + 1) - # Check current policy version - current_version = await policy.get_version.route() - print(f"Current policy version: {current_version}") - +# Check current policy version +current_version = await policy.get_version.route() +print(f"Current policy version: {current_version}") ###################################################################### # Deep Dive: Asynchronous Coordination Patterns @@ -505,24 +472,18 @@ async def real_weight_sync(trainer, policy, step): # automatically: -async def simple_rl_step( - dataloader, - policy, - ref_model, - reward_actor, - replay_buffer, - compute_advantages, - trainer, -): - """Simple RL step showing service coordination.""" +from apps.grpo.main import Episode, Group + +async def simple_rl_step(): + # ===== Generate a rollout ===== - sample = await dataloader.sample.call_one() - prompt, target = sample["request"], sample["target"] + sample = await dataloader.sample.call_one() # DatasetActor is an actor, not service + prompt, target = sample["request"], sample["target"] # Correct field names print(f"Prompt: {prompt}") print(f"Target: {target}") - actions = await policy.generate.route(prompt=prompt) + actions = await policy.generate.route(prompt=prompt) # Policy is a service print(f"Policy response: {actions[0].text}") # Create input tensor for reference model (requires full context) @@ -530,36 +491,60 @@ async def 
simple_rl_step( ref_logprobs = await ref_model.forward.route( input_ids.unsqueeze(0), max_req_tokens=512, return_logprobs=True ) - - reward = await reward_actor.evaluate_response.route( - prompt=prompt, response=actions[0].text, target=target + reward = await reward_actor.evaluate_response.route( # RewardActor is a service + prompt=prompt, + response=actions[0].text, + target=target ) print(f"Reward: {reward}") - # Create episode (simplified for example) - episode = { - "episode_id": "0", - "request": prompt, - "response": actions[0].text, - "reward": reward, - "ref_logprobs": ref_logprobs[0], - } + # Create episode using actual GRPO Episode structure + episode = Episode( + episode_id="0", + request=prompt, + policy_version=0, + pad_id=tokenizer.pad_token_id, + request_len=512, + response_len=512, + target=target + ) - await replay_buffer.add.call_one(episode) + # Add response data + episode.response = actions[0].text + episode.request_tokens = actions[0].prompt_ids.tolist() + episode.response_tokens = actions[0].token_ids.tolist() + episode.ref_logprobs = ref_logprobs[0] # Extract from batch dimension + episode.reward = reward + + # Compute advantages using actual ComputeAdvantages actor + group = Group.new_group(0, 1, prompt, 0, tokenizer.pad_token_id, 512, 512, target) + group.episodes[0] = episode + advantages = await compute_advantages.compute.call_one(group) # ComputeAdvantages is an actor + episode.advantage = advantages[0] + print(f"Advantage: {advantages[0]}") + await replay_buffer.add.call_one(episode) # ReplayBuffer is an actor print("Episode stored in replay buffer") # ===== Train on the batch ===== batch = await replay_buffer.sample.call_one(curr_policy_version=0) if batch is not None: print("Training on batch...") - inputs, targets = batch - loss = await trainer.train_step.call(inputs, targets) + inputs, targets = batch # GRPO returns (inputs, targets) tuple + loss = await trainer.train_step.call(inputs, targets) # RLTrainer is an actor print(f"Training loss: {loss}") return loss else: print("Not enough data in buffer yet") return None +# Note: This simplified example assumes tokenizer and services are already initialized +for step in range(10): + print(f"\n--- RL Step {step + 1} ---") + loss = await simple_rl_step() + if loss: + print(f"Step {step + 1} complete, loss: {loss:.4f}") + else: + print(f"Step {step + 1} complete, building buffer...") ###################################################################### # Handling Speed Mismatches with Service Scaling @@ -568,36 +553,27 @@ async def simple_rl_step( # **The insight**: Scale services independently based on their # bottlenecks. 
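+
+######################################################################
+# Before touching configs, a rough capacity estimate is usually enough
+# to pick replica counts. This is a minimal sketch: ``replicas_needed``,
+# the 32 episodes/sec target, and the per-replica rates are illustrative
+# assumptions, not Forge APIs or measurements.
+
+import math
+
+
+def replicas_needed(target_rps: float, per_replica_rps: float) -> int:
+    """Round up so a stage keeps pace with the target episode rate."""
+    return math.ceil(target_rps / per_replica_rps)
+
+
+print(replicas_needed(32, per_replica_rps=4.0))  # generation-bound policy -> 8
+print(replicas_needed(32, per_replica_rps=2.0))  # CPU-bound reward scoring -> 16
+
+# Those counts mirror the num_replicas=8 / num_replicas=16 choices below.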
-# Mock imports for example -try: - from forge.actors.trainer import RLTrainer -except ImportError: - - class RLTrainer: - pass - - -async def scaling_example(): - """Scale services independently based on bottlenecks.""" - model_name = "Qwen/Qwen3-1.7B" - - # Scale fast services with more replicas - policy = await Policy.options( - procs=1, num_replicas=8, with_gpus=True # Many replicas for throughput - ).as_service(engine_config={"model": model_name, "tensor_parallel_size": 1}) - - # Reward evaluation might be CPU-bound - reward_actor = await RewardActor.options( - procs=1, num_replicas=16, with_gpus=False # More CPU replicas - ).as_service(reward_functions=[]) - - # Training needs fewer but more powerful replicas - trainer = await RLTrainer.options( - procs=1, with_gpus=True # Fewer but GPU-heavy - ).as_actor( # Trainer typically uses .as_actor() not .as_service() - model={"name": "qwen3", "flavor": "1.7B"}, - optimizer={"name": "AdamW", "lr": 1e-5}, - ) +# Scale fast services with more replicas +policy = await Policy.options( + procs=1, num_replicas=8, with_gpus=True # Many replicas for high throughput +).as_service( + engine_config={"model": model_name, "tensor_parallel_size": 1} +) + +# Reward evaluation might be CPU-bound +reward_actor = await RewardActor.options( + procs=1, num_replicas=16, with_gpus=False # More CPU replicas +).as_service( + reward_functions=[MathReward()] +) + +# Training needs fewer but more powerful replicas +trainer = await RLTrainer.options( + procs=1, with_gpus=True # Fewer but GPU-heavy +).as_actor( # Trainer typically uses .as_actor() not .as_service() + model={"name": "qwen3", "flavor": "1.7B"}, + optimizer={"name": "AdamW", "lr": 1e-5} +) ###################################################################### @@ -606,37 +582,20 @@ async def scaling_example(): # # Let's see how a reward service is actually implemented: -# Mock imports -try: - from forge.controller import ForgeActor - from forge.data.rewards import MathReward, ThinkingReward - from monarch.actor import endpoint -except ImportError: - - class ForgeActor: - pass - - def endpoint(func): - return func - - class MathReward: - def __call__(self, prompt, response, target): - return 1.0 - - class ThinkingReward: - def __call__(self, prompt, response, target): - return 1.0 +# Exact RewardActor from apps/grpo/main.py +from forge.controller import ForgeActor +from monarch.actor import endpoint +from forge.data.rewards import MathReward, ThinkingReward +# class definition from apps/grpo/main.py class RewardActor(ForgeActor): - """Exact RewardActor from apps/grpo/main.py.""" - def __init__(self, reward_functions: list): self.reward_functions = reward_functions @endpoint async def evaluate_response(self, prompt: str, response: str, target: str) -> float: - """Evaluate response quality using multiple reward functions.""" + """Evaluate response quality using multiple reward functions""" total_reward = 0.0 for reward_fn in self.reward_functions: @@ -645,31 +604,29 @@ async def evaluate_response(self, prompt: str, response: str, target: str) -> fl total_reward += reward # Return average reward across all functions - return ( - total_reward / len(self.reward_functions) if self.reward_functions else 0.0 - ) - + return total_reward / len(self.reward_functions) if self.reward_functions else 0.0 -async def reward_service_example(): - """Create and use a reward service.""" - reward_actor = await RewardActor.options(procs=1, num_replicas=1).as_service( - reward_functions=[MathReward(), ThinkingReward()] - ) 
+reward_actor = await RewardActor.options( + procs=1, num_replicas=1 +).as_service( + reward_functions=[MathReward(), ThinkingReward()] +) - prompt = "What is 15% of 240?" - response = "15% of 240 is 36" - target = "36" +prompt = "What is 15% of 240?" +response = "15% of 240 is 36" +target = "36" - score = await reward_actor.evaluate_response.route( - prompt=prompt, response=response, target=target - ) - print(f"Reward score: {score}") # Usually around 1.0 for correct answers +score = await reward_actor.evaluate_response.route( + prompt=prompt, + response=response, + target=target +) +print(f"Reward score: {score}") # Usually around 1.0 for correct math answers +# For production scaling - increase num_replicas for parallel evaluation: +# RewardActor.options(procs=1, num_replicas=16) # 16 parallel evaluators - # For production scaling - increase num_replicas: - # RewardActor.options(procs=1, num_replicas=16) # 16 parallel evaluators - - # Cleanup when done - await reward_actor.shutdown() +# Cleanup when done +await reward_actor.shutdown() ###################################################################### @@ -679,13 +636,57 @@ async def reward_service_example(): # Now let's see how services coordinate in a real training loop: -async def production_training_loop(): - """Real training loop pattern from apps/grpo/main.py.""" - # Service creation pattern (abbreviated) - print("Initializing all services...") +# This is the REAL way production RL systems are built with Forge - # (Services initialization code here - see Part 1) +import asyncio +import torch +from forge.actors.policy import Policy +from forge.actors.reference_model import ReferenceModel +from forge.actors.replay_buffer import ReplayBuffer +from forge.actors.trainer import RLTrainer +from apps.grpo.main import DatasetActor, RewardActor, ComputeAdvantages +from forge.data.rewards import MathReward, ThinkingReward + +# Service creation pattern from apps/grpo/main.py lines 322-344 +print("Initializing all services...") +( + dataloader, + policy, + trainer, + replay_buffer, + compute_advantages, + ref_model, + reward_actor, +) = await asyncio.gather( + DatasetActor.options(procs=1).as_actor( + path="openai/gsm8k", revision="main", data_split="train", + streaming=True, model="Qwen/Qwen3-1.7B" + ), + Policy.options(procs=1, with_gpus=True, num_replicas=1).as_service( + engine_config={"model": "Qwen/Qwen3-1.7B", "tensor_parallel_size": 1}, + sampling_config={"n": 1, "max_tokens": 512} + ), + RLTrainer.options(procs=1, with_gpus=True).as_actor( + model={"name": "qwen3", "flavor": "1.7B", "hf_assets_path": "hf://Qwen/Qwen3-1.7B"}, + optimizer={"name": "AdamW", "lr": 1e-5}, + training={"local_batch_size": 2, "seq_len": 2048} + ), + ReplayBuffer.options(procs=1).as_actor( + batch_size=2, max_policy_age=1, dp_size=1 + ), + ComputeAdvantages.options(procs=1).as_actor(), + ReferenceModel.options(procs=1, with_gpus=True).as_actor( + model={"name": "qwen3", "flavor": "1.7B", "hf_assets_path": "hf://Qwen/Qwen3-1.7B"} + ), + RewardActor.options(procs=1, num_replicas=1).as_service( + reward_functions=[MathReward(), ThinkingReward()] + ), +) +print("All services initialized successfully!") + +async def production_training_loop(): + """Real training loop pattern from apps/grpo/main.py""" step = 0 while True: @@ -693,9 +694,9 @@ async def production_training_loop(): sample = await dataloader.sample.call_one() # Policy generation service call - responses = await policy.generate.route(sample["request"]) + responses = await 
policy.generate.route(sample["request"]) # Correct field name - # Reference computation service call + # Reference computation service call (requires full input tensor) input_ids = torch.cat([responses[0].prompt_ids, responses[0].token_ids]) ref_logprobs = await ref_model.forward.route( input_ids.unsqueeze(0), max_req_tokens=512, return_logprobs=True @@ -705,16 +706,17 @@ async def production_training_loop(): reward = await reward_actor.evaluate_response.route( prompt=sample["question"], response=responses[0].text, - target=sample["answer"], + target=sample["answer"] ) - # Experience storage + # Experience storage (using actual Episode structure) + episode = create_episode_from_grpo_data(sample, responses[0], reward, ref_logprobs[0], step) await replay_buffer.add.call_one(episode) # Training when ready batch = await replay_buffer.sample.call_one(curr_policy_version=step) if batch is not None: - inputs, targets = batch + inputs, targets = batch # GRPO returns (inputs, targets) tuple loss = await trainer.train_step.call(inputs, targets) # Weight synchronization pattern @@ -724,8 +726,17 @@ async def production_training_loop(): print(f"Step {step}, Loss: {loss:.4f}") step += 1 - if step >= 100: - break +print("Shutting down services...") +await asyncio.gather( + DatasetActor.shutdown(dataloader), + policy.shutdown(), + RLTrainer.shutdown(trainer), + ReplayBuffer.shutdown(replay_buffer), + ComputeAdvantages.shutdown(compute_advantages), + ReferenceModel.shutdown(ref_model), + reward_actor.shutdown(), +) +print("All services shut down successfully!") # **Key observations:** From 6629c0f3dff18362ab4aa48fad304908eab48867 Mon Sep 17 00:00:00 2001 From: Svetlana Karslioglu Date: Tue, 14 Oct 2025 14:28:13 -0700 Subject: [PATCH 25/28] Update --- .../zero-to-forge/3_Monarch_101.py | 242 ++++++------------ 1 file changed, 84 insertions(+), 158 deletions(-) diff --git a/docs/source/tutorial_sources/zero-to-forge/3_Monarch_101.py b/docs/source/tutorial_sources/zero-to-forge/3_Monarch_101.py index 09e61e154..e6f071af9 100644 --- a/docs/source/tutorial_sources/zero-to-forge/3_Monarch_101.py +++ b/docs/source/tutorial_sources/zero-to-forge/3_Monarch_101.py @@ -90,6 +90,23 @@ # Single Host ProcMesh # ~~~~~~~~~~~~~~~~~~~~ # +# **Key insight**: ProcMesh creates one process per GPU, automatically handling the process-to-hardware mapping. +# + +# This simple call: +procs = this_host().spawn_procs(per_host={"gpus": 8}) + +# Creates: +# Process 0 → GPU 0 +# Process 1 → GPU 1 +# Process 2 → GPU 2 +# Process 3 → GPU 3 +# Process 4 → GPU 4 +# Process 5 → GPU 5 +# Process 6 → GPU 6 +# Process 7 → GPU 7 + +###################################################################### # .. mermaid:: # # graph TD @@ -124,10 +141,16 @@ # style P6 fill:#F44336 # style P7 fill:#F44336 +###################################################################### +# The beauty: you don't manage individual processes or GPU assignments - +# ProcMesh handles the topology for you. + ###################################################################### # Multi-Host ProcMesh # ~~~~~~~~~~~~~~~~~~~ # +# **Key insight**: ProcMesh seamlessly scales across multiple hosts with continuous process numbering. +# # .. mermaid:: # # graph TD @@ -169,92 +192,81 @@ # style PM2 fill:#4CAF50 # style PM3 fill:#2196F3 -###################################################################### -# Monarch Actor System Basics -# ---------------------------- -# -# This shows the underlying actor system that powers Forge services. 
- -# Mock imports for documentation build -try: - from monarch.actor import Actor, endpoint, Future, ProcMesh, this_host, this_proc -except ImportError: - - class Actor: - pass - - def endpoint(func): - return func +# Same simple API works across hosts: +cluster_procs = spawn_cluster_procs( + hosts=["host1", "host2", "host3"], + per_host={"gpus": 4} +) - class Future: - pass +# Automatically creates: +# Host 1: Processes 0-3 → GPUs 0-3 +# Host 2: Processes 4-7 → GPUs 0-3 +# Host 3: Processes 8-11 → GPUs 0-3 - class ProcMesh: - pass +# Your code stays the same whether it's 1 host or 100 hosts +actors = cluster_procs.spawn("my_actor", MyActor) - def this_proc(): - return None +###################################################################### +# **The power**: Scale from single host to cluster without changing your +# actor code - ProcMesh handles all the complexity. - def this_host(): - return None +# This shows the underlying actor system that powers Forge services +# NOTE: This is for educational purposes - use ForgeActor and .as_service() in real Forge apps! +from monarch.actor import Actor, endpoint, this_proc, Future +from monarch.actor import ProcMesh, this_host +import asyncio # STEP 1: Define a basic actor class Counter(Actor): - """Basic counter actor example.""" - def __init__(self, initial_value: int): self.value = initial_value @endpoint def increment(self) -> None: - """Increment the counter.""" self.value += 1 @endpoint def get_value(self) -> int: - """Get current counter value.""" return self.value +# STEP 2: Single actor in local process +counter: Counter = this_proc().spawn("counter", Counter, initial_value=0) -async def basic_actor_example(): - """Example of using Monarch actors.""" - # STEP 2: Single actor in local process - counter = this_proc().spawn("counter", Counter, initial_value=0) - - # STEP 3: Send messages - fut = counter.get_value.call_one() - value = await fut - print(f"Counter value: {value}") # 0 +# STEP 3: Send messages +fut: Future[int] = counter.get_value.call_one() +value = await fut +print(f"Counter value: {value}") # 0 +# STEP 4: Multiple actors across processes +procs: ProcMesh = this_host().spawn_procs(per_host={"gpus": 8}) +counters: Counter = procs.spawn("counters", Counter, 0) -async def distributed_actors_example(): - """Example of actors across multiple processes.""" - # STEP 4: Multiple actors across processes - procs = this_host().spawn_procs(per_host={"gpus": 8}) - counters = procs.spawn("counters", Counter, 0) +# STEP 5: Broadcast to all actors +await counters.increment.call() - # STEP 5: Broadcast to all actors - await counters.increment.call() +# STEP 6: Different message patterns +# call_one() - single actor +value = await counters.get_value.call_one() +print(f"One counter: {value}") # Output: One counter: 1 - # STEP 6: Different message patterns - # call_one() - single actor - value = await counters.get_value.call_one() - print(f"One counter: {value}") +# choose() - random single actor (actors only, not services) +value = await counters.get_value.choose() +print(f"Random counter: {value}") # Output: Random counter: 1 - # choose() - random single actor (actors only, not services) - value = await counters.get_value.choose() - print(f"Random counter: {value}") +# call() - all actors, collect results +values = await counters.get_value.call() +print(f"All counters: {values}") # Output: All counters: [1, 1, 1, 1, 1, 1, 1, 1] - # call() - all actors, collect results - values = await counters.get_value.call() - print(f"All counters: 
{values}") +# broadcast() - fire and forget +await counters.increment.broadcast() # No return value - just sends to all actors - # broadcast() - fire and forget - await counters.increment.broadcast() +# Cleanup +await procs.stop() - # Cleanup - await procs.stop() +###################################################################### +# Remember: This raw Monarch code is for understanding how Forge works internally. +# In your Forge applications, use ForgeActor, .as_service(), .as_actor() instead! ###################################################################### @@ -264,93 +276,25 @@ async def distributed_actors_example(): # **ActorMesh** is created when you spawn actors across a ProcMesh. # Each process in the ProcMesh gets one instance of your actor. # -# .. mermaid:: -# -# graph TD -# subgraph Creation["Actor Creation Process"] -# Code["mesh.spawn('policy', PolicyActor, model='Qwen/Qwen3-7B')"] -# -# subgraph ProcMesh["ProcMesh (4 processes)"] -# P0["Process 0
GPU 0"] -# P1["Process 1
GPU 1"] -# P2["Process 2
GPU 2"] -# P3["Process 3
GPU 3"] -# end -# -# subgraph ActorMesh["ActorMesh PolicyActor"] -# A0["PolicyActor Instance #0: model=Qwen/Qwen3-7B"] -# A1["PolicyActor Instance #1: model=Qwen/Qwen3-7B"] -# A2["PolicyActor Instance #2: model=Qwen/Qwen3-7B"] -# A3["PolicyActor Instance #3: model=Qwen/Qwen3-7B"] -# end -# -# Code --> ProcMesh -# P0 --> A0 -# P1 --> A1 -# P2 --> A2 -# P3 --> A3 -# end -# -# style A0 fill:#4CAF50 -# style A1 fill:#4CAF50 -# style A2 fill:#4CAF50 -# style A3 fill:#4CAF50 +# - **One actor instance per process**: `mesh.spawn("policy", PolicyActor)` creates one PolicyActor in each process +# - **Same constructor arguments**: All instances get the same initialization parameters +# - **Independent state**: Each actor instance maintains its own state and memory +# - **Message routing**: You can send messages to one actor or all actors using different methods + +# Simple example: +procs = spawn_procs(per_host={"gpus": 4}) # 4 processes +policy_actors = procs.spawn("policy", PolicyActor, model="Qwen/Qwen3-7B") + +# Now you have 4 PolicyActor instances, one per GPU +# All initialized with the same model parameter -###################################################################### -# Message Routing Through ActorMesh -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# .. mermaid:: -# -# graph TD -# subgraph MessageFlow["Message Flow Patterns"] -# Client["await policy_actors.generate.METHOD(prompt)"] -# -# subgraph Methods["Different Adverbs Route Differently"] -# Choose["choose(): Routes to ONE actor, Load balanced"] -# Call["call(): Routes to ALL actors, Collects results"] -# Broadcast["broadcast(): Routes to ALL actors, Fire and forget"] -# Stream["stream(): Routes to ALL actors, Iterator of results"] -# end -# -# subgraph ActorInstances["PolicyActor Instances"] -# A0["Actor 0: GPU 0, generates response"] -# A1["Actor 1: GPU 1, generates response"] -# A2["Actor 2: GPU 2, generates response"] -# A3["Actor 3: GPU 3, generates response"] -# end -# -# Client --> Choose -# Client --> Call -# Client --> Broadcast -# Client --> Stream -# -# Choose -.->|"Load balanced"| A1 -# Call --> A0 -# Call --> A1 -# Call --> A2 -# Call --> A3 -# Broadcast --> A0 -# Broadcast --> A1 -# Broadcast --> A2 -# Broadcast --> A3 -# Stream --> A0 -# Stream --> A1 -# Stream --> A2 -# Stream --> A3 -# end -# -# style Choose fill:#4CAF50 -# style Call fill:#FF9800 -# style Broadcast fill:#E91E63 -# style Stream fill:#9C27B0 ###################################################################### # How Forge Services Use Monarch # ------------------------------- # -# Now the key insight: **Forge services are ServiceActors that manage -# ActorMeshes of your ForgeActor replicas**. +# Now the key insight: **Forge services are ``ServiceActors`` that manage +# ``ActorMeshes`` of your ``ForgeActor`` replicas**. # # The Service Creation Process # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -445,7 +389,7 @@ async def distributed_actors_example(): # ----------------------------------------- # # In real RL systems, you have multiple services that can share or use -# separate ProcMeshes: +# separate ``ProcMeshes``: # # .. mermaid:: # @@ -515,24 +459,6 @@ async def distributed_actors_example(): # * **Scale effectively**: Where to add resources for maximum impact? 
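+
+######################################################################
+# A small illustration of the questions above about where time goes and
+# where to add resources, reusing the Counter actor and ProcMesh APIs
+# shown earlier in this file. ``time_mesh_calls`` and the GPU count are
+# illustrative; the timings are whatever your hardware produces, not a
+# Forge benchmark.
+
+import time
+
+
+async def time_mesh_calls():
+    procs = this_host().spawn_procs(per_host={"gpus": 4})
+    counters = procs.spawn("counters", Counter, 0)
+
+    start = time.perf_counter()
+    await counters.get_value.call_one()  # one actor in one process
+    single = time.perf_counter() - start
+
+    start = time.perf_counter()
+    await counters.get_value.call()  # every actor in the mesh
+    everyone = time.perf_counter() - start
+
+    print(f"call_one: {single:.4f}s, call on all replicas: {everyone:.4f}s")
+    await procs.stop()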
-def demonstrate_architecture_benefits(): - """Example showing why the architecture matters.""" - # Process Isolation: Failures don't cascade - # If one PolicyActor crashes, others continue serving - - # Location Transparency: Same API whether local or remote - # await policy.generate.route(prompt) # Works same everywhere - - # Structured Distribution: ProcMesh maps to hardware - # per_host={"gpus": 8} creates 8 processes, 1 per GPU - - # Message Passing: No locks needed - # Each actor processes messages sequentially, naturally thread-safe - - # Service Abstraction: Simple interface, powerful backend - # await service.method.route() hides all distribution complexity - pass - ###################################################################### # Conclusion From 19db1e81f12e6c957380634085274d715cccec3b Mon Sep 17 00:00:00 2001 From: Svetlana Karslioglu Date: Tue, 14 Oct 2025 14:35:48 -0700 Subject: [PATCH 26/28] Update --- .../1_RL_and_Forge_Fundamentals.py | 125 +++++++++--------- .../zero-to-forge/2_Forge_Internals.py | 55 ++++---- 2 files changed, 97 insertions(+), 83 deletions(-) diff --git a/docs/source/tutorial_sources/zero-to-forge/1_RL_and_Forge_Fundamentals.py b/docs/source/tutorial_sources/zero-to-forge/1_RL_and_Forge_Fundamentals.py index cbdb1fe5f..d5d50f4b6 100644 --- a/docs/source/tutorial_sources/zero-to-forge/1_RL_and_Forge_Fundamentals.py +++ b/docs/source/tutorial_sources/zero-to-forge/1_RL_and_Forge_Fundamentals.py @@ -351,66 +351,73 @@ async def example_automatic_management(policy): import asyncio import torch -model = "Qwen/Qwen3-1.7B" -group_size = 1 - -( - dataloader, - policy, - trainer, - replay_buffer, - compute_advantages, - ref_model, - reward_actor, -) = await asyncio.gather( - # Dataset actor (CPU) - DatasetActor.options(procs=1).as_actor( - path="openai/gsm8k", - revision="main", - data_split="train", - streaming=True, - model=model, - ), - # Policy service with GPU - Policy.options(procs=1, with_gpus=True, num_replicas=1).as_service( - engine_config={ - "model": model, - "tensor_parallel_size": 1, - "pipeline_parallel_size": 1, - "enforce_eager": False - }, - sampling_config={ - "n": group_size, - "max_tokens": 16, - "temperature": 1.0, - "top_p": 1.0 - } - ), - # Trainer actor with GPU - RLTrainer.options(procs=1, with_gpus=True).as_actor( - # Trainer config would come from YAML in real usage - model={"name": "qwen3", "flavor": "1.7B", "hf_assets_path": f"hf://{model}"}, - optimizer={"name": "AdamW", "lr": 1e-5}, - training={"local_batch_size": 2, "seq_len": 2048} - ), - # Replay buffer (CPU) - ReplayBuffer.options(procs=1).as_actor( - batch_size=2, - max_policy_age=1, - dp_size=1 - ), - # Advantage computation (CPU) - ComputeAdvantages.options(procs=1).as_actor(), - # Reference model with GPU - ReferenceModel.options(procs=1, with_gpus=True).as_actor( - model={"name": "qwen3", "flavor": "1.7B", "hf_assets_path": f"hf://{model}"}, - training={"dtype": "bfloat16"} - ), - # Reward actor (CPU) - RewardActor.options(procs=1, num_replicas=1).as_service( - reward_functions=[MathReward(), ThinkingReward()] +async def example_forge_service_initialization(): + """Example of initializing Forge services for RL training.""" + model = "Qwen/Qwen3-1.7B" + group_size = 1 + + ( + dataloader, + policy, + trainer, + replay_buffer, + compute_advantages, + ref_model, + reward_actor, + ) = await asyncio.gather( + # Dataset actor (CPU) + DatasetActor.options(procs=1).as_actor( + path="openai/gsm8k", + revision="main", + data_split="train", + streaming=True, + model=model, + ), + 
# Policy service with GPU + Policy.options(procs=1, with_gpus=True, num_replicas=1).as_service( + engine_config={ + "model": model, + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "enforce_eager": False + }, + sampling_config={ + "n": group_size, + "max_tokens": 16, + "temperature": 1.0, + "top_p": 1.0 + } + ), + # Trainer actor with GPU + RLTrainer.options(procs=1, with_gpus=True).as_actor( + # Trainer config would come from YAML in real usage + model={"name": "qwen3", "flavor": "1.7B", "hf_assets_path": f"hf://{model}"}, + optimizer={"name": "AdamW", "lr": 1e-5}, + training={"local_batch_size": 2, "seq_len": 2048} + ), + # Replay buffer (CPU) + ReplayBuffer.options(procs=1).as_actor( + batch_size=2, + max_policy_age=1, + dp_size=1 + ), + # Advantage computation (CPU) + ComputeAdvantages.options(procs=1).as_actor(), + # Reference model with GPU + ReferenceModel.options(procs=1, with_gpus=True).as_actor( + model={"name": "qwen3", "flavor": "1.7B", "hf_assets_path": f"hf://{model}"}, + training={"dtype": "bfloat16"} + ), + # Reward actor (CPU) + RewardActor.options(procs=1, num_replicas=1).as_service( + reward_functions=[MathReward(), ThinkingReward()] + ) ) - ) + + return dataloader, policy, trainer, replay_buffer, compute_advantages, ref_model, reward_actor + +# Run the example (commented out to avoid execution during doc build) +# asyncio.run(example_forge_service_initialization()) ###################################################################### diff --git a/docs/source/tutorial_sources/zero-to-forge/2_Forge_Internals.py b/docs/source/tutorial_sources/zero-to-forge/2_Forge_Internals.py index 0a1e57c91..d946cc50d 100644 --- a/docs/source/tutorial_sources/zero-to-forge/2_Forge_Internals.py +++ b/docs/source/tutorial_sources/zero-to-forge/2_Forge_Internals.py @@ -113,35 +113,42 @@ # * Health monitoring and failure recovery # * Message routing and serialization +import asyncio from forge.actors.policy import Policy -model = "Qwen/Qwen3-1.7B" +async def example_service_creation(): + """Example of creating and using a policy service.""" + model = "Qwen/Qwen3-1.7B" + + policy = await Policy.options( + procs=1, + with_gpus=True, + num_replicas=1 + ).as_service( + engine_config={ + "model": model, + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "enforce_eager": False + }, + sampling_config={ + "n": 1, + "max_tokens": 16, + "temperature": 1.0, + "top_p": 1.0 + } + ) -policy = await Policy.options( - procs=1, - with_gpus=True, - num_replicas=1 -).as_service( - engine_config={ - "model": model, - "tensor_parallel_size": 1, - "pipeline_parallel_size": 1, - "enforce_eager": False - }, - sampling_config={ - "n": 1, - "max_tokens": 16, - "temperature": 1.0, - "top_p": 1.0 - } -) + prompt = "What is 3 + 5?" + responses = await policy.generate.route(prompt) + print(f"Response: {responses[0].text}") -prompt = "What is 3 + 5?" 
-responses = await policy.generate.route(prompt) -print(f"Response: {responses[0].text}") + # Cleanup when done + await policy.shutdown() + return policy -# Cleanup when done -await policy.shutdown() +# Run the example +asyncio.run(example_service_creation()) ###################################################################### From 7c9e61f73fccb77dd3a2603572855df46bfc66bc Mon Sep 17 00:00:00 2001 From: Svetlana Karslioglu Date: Tue, 14 Oct 2025 14:43:09 -0700 Subject: [PATCH 27/28] Precomit --- .../1_RL_and_Forge_Fundamentals.py | 130 +++--- .../zero-to-forge/2_Forge_Internals.py | 428 ++++++++++-------- .../zero-to-forge/3_Monarch_101.py | 63 +-- 3 files changed, 345 insertions(+), 276 deletions(-) diff --git a/docs/source/tutorial_sources/zero-to-forge/1_RL_and_Forge_Fundamentals.py b/docs/source/tutorial_sources/zero-to-forge/1_RL_and_Forge_Fundamentals.py index d5d50f4b6..d97d715a3 100644 --- a/docs/source/tutorial_sources/zero-to-forge/1_RL_and_Forge_Fundamentals.py +++ b/docs/source/tutorial_sources/zero-to-forge/1_RL_and_Forge_Fundamentals.py @@ -191,6 +191,7 @@ async def conceptual_forge_rl_step(services, step): return loss + ###################################################################### # **Key difference**: Same RL logic, but each component is now a distributed, # # fault-tolerant, auto-scaling service. @@ -200,7 +201,6 @@ async def conceptual_forge_rl_step(services, step): # # writing your RL Algorthms! - ###################################################################### # Why This Matters: Traditional ML Infrastructure Fails # ----------------------------------------------------- @@ -282,8 +282,6 @@ async def real_rl_training_step(services, step): ) # 3. Get reference logprobs - Using actual ReferenceModel API - import torch - input_ids = torch.cat([responses[0].prompt_ids, responses[0].token_ids]) ref_logprobs = await services["ref_model"].forward.route( input_ids.unsqueeze(0), max_req_tokens=512, return_logprobs=True @@ -305,6 +303,7 @@ async def real_rl_training_step(services, step): return loss + ##################################################################### # **Key insight**: Each line of RL pseudocode becomes a service call. 
# The complexity of distribution, scaling, and fault tolerance is hidden @@ -324,6 +323,10 @@ async def example_automatic_management(policy): answer = responses[0].text return answer + +import torch +from apps.grpo.main import ComputeAdvantages, DatasetActor, RewardActor + ###################################################################### # Forge handles behind the scenes: # @@ -343,13 +346,11 @@ async def example_automatic_management(policy): # Note: This is example code showing the Forge API # For actual imports, see apps/grpo/main.py from forge.actors.policy import Policy -from forge.actors.replay_buffer import ReplayBuffer from forge.actors.reference_model import ReferenceModel +from forge.actors.replay_buffer import ReplayBuffer from forge.actors.trainer import RLTrainer -from apps.grpo.main import DatasetActor, RewardActor, ComputeAdvantages from forge.data.rewards import MathReward, ThinkingReward -import asyncio -import torch + async def example_forge_service_initialization(): """Example of initializing Forge services for RL training.""" @@ -365,56 +366,71 @@ async def example_forge_service_initialization(): ref_model, reward_actor, ) = await asyncio.gather( - # Dataset actor (CPU) - DatasetActor.options(procs=1).as_actor( - path="openai/gsm8k", - revision="main", - data_split="train", - streaming=True, - model=model, - ), - # Policy service with GPU - Policy.options(procs=1, with_gpus=True, num_replicas=1).as_service( - engine_config={ - "model": model, - "tensor_parallel_size": 1, - "pipeline_parallel_size": 1, - "enforce_eager": False - }, - sampling_config={ - "n": group_size, - "max_tokens": 16, - "temperature": 1.0, - "top_p": 1.0 - } - ), - # Trainer actor with GPU - RLTrainer.options(procs=1, with_gpus=True).as_actor( - # Trainer config would come from YAML in real usage - model={"name": "qwen3", "flavor": "1.7B", "hf_assets_path": f"hf://{model}"}, - optimizer={"name": "AdamW", "lr": 1e-5}, - training={"local_batch_size": 2, "seq_len": 2048} - ), - # Replay buffer (CPU) - ReplayBuffer.options(procs=1).as_actor( - batch_size=2, - max_policy_age=1, - dp_size=1 - ), - # Advantage computation (CPU) - ComputeAdvantages.options(procs=1).as_actor(), - # Reference model with GPU - ReferenceModel.options(procs=1, with_gpus=True).as_actor( - model={"name": "qwen3", "flavor": "1.7B", "hf_assets_path": f"hf://{model}"}, - training={"dtype": "bfloat16"} - ), - # Reward actor (CPU) - RewardActor.options(procs=1, num_replicas=1).as_service( - reward_functions=[MathReward(), ThinkingReward()] - ) - ) - - return dataloader, policy, trainer, replay_buffer, compute_advantages, ref_model, reward_actor + # Dataset actor (CPU) + DatasetActor.options(procs=1).as_actor( + path="openai/gsm8k", + revision="main", + data_split="train", + streaming=True, + model=model, + ), + # Policy service with GPU + Policy.options(procs=1, with_gpus=True, num_replicas=1).as_service( + engine_config={ + "model": model, + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "enforce_eager": False, + }, + sampling_config={ + "n": group_size, + "max_tokens": 16, + "temperature": 1.0, + "top_p": 1.0, + }, + ), + # Trainer actor with GPU + RLTrainer.options(procs=1, with_gpus=True).as_actor( + # Trainer config would come from YAML in real usage + model={ + "name": "qwen3", + "flavor": "1.7B", + "hf_assets_path": f"hf://{model}", + }, + optimizer={"name": "AdamW", "lr": 1e-5}, + training={"local_batch_size": 2, "seq_len": 2048}, + ), + # Replay buffer (CPU) + ReplayBuffer.options(procs=1).as_actor( + 
batch_size=2, max_policy_age=1, dp_size=1 + ), + # Advantage computation (CPU) + ComputeAdvantages.options(procs=1).as_actor(), + # Reference model with GPU + ReferenceModel.options(procs=1, with_gpus=True).as_actor( + model={ + "name": "qwen3", + "flavor": "1.7B", + "hf_assets_path": f"hf://{model}", + }, + training={"dtype": "bfloat16"}, + ), + # Reward actor (CPU) + RewardActor.options(procs=1, num_replicas=1).as_service( + reward_functions=[MathReward(), ThinkingReward()] + ), + ) + + return ( + dataloader, + policy, + trainer, + replay_buffer, + compute_advantages, + ref_model, + reward_actor, + ) + # Run the example (commented out to avoid execution during doc build) # asyncio.run(example_forge_service_initialization()) diff --git a/docs/source/tutorial_sources/zero-to-forge/2_Forge_Internals.py b/docs/source/tutorial_sources/zero-to-forge/2_Forge_Internals.py index d946cc50d..72b63d728 100644 --- a/docs/source/tutorial_sources/zero-to-forge/2_Forge_Internals.py +++ b/docs/source/tutorial_sources/zero-to-forge/2_Forge_Internals.py @@ -27,7 +27,7 @@ * Understanding of Python async/await * Basic distributed systems knowledge -We highly recommend completing Part 1 before starting this tutorial. +We highly recommend completing Part 1 before starting this tutorial. Part 1 explains RL Concepts and how they land in Forge. Now that you see the power of the service abstraction, let's understand @@ -88,9 +88,9 @@ # Here's the actual ``ServiceConfig`` from Forge source code: # Configuration pattern from apps/grpo/main.py: -# +# # .. code-block:: python -# +# # Policy.options( # procs=1, # Processes per replica # num_replicas=4, # Number of replicas @@ -114,29 +114,22 @@ # * Message routing and serialization import asyncio + from forge.actors.policy import Policy + async def example_service_creation(): """Example of creating and using a policy service.""" model = "Qwen/Qwen3-1.7B" - policy = await Policy.options( - procs=1, - with_gpus=True, - num_replicas=1 - ).as_service( + policy = await Policy.options(procs=1, with_gpus=True, num_replicas=1).as_service( engine_config={ "model": model, "tensor_parallel_size": 1, "pipeline_parallel_size": 1, - "enforce_eager": False + "enforce_eager": False, }, - sampling_config={ - "n": 1, - "max_tokens": 16, - "temperature": 1.0, - "top_p": 1.0 - } + sampling_config={"n": 1, "max_tokens": 16, "temperature": 1.0, "top_p": 1.0}, ) prompt = "What is 3 + 5?" @@ -147,6 +140,7 @@ async def example_service_creation(): await policy.shutdown() return policy + # Run the example asyncio.run(example_service_creation()) @@ -164,7 +158,7 @@ async def example_service_creation(): # Your code sees this simple interface: -responses = await policy.generate.route(prompt=prompt) +# responses = await policy.generate.route(prompt=prompt) # But Forge handles all the complexity of replica management, load balancing, and fault tolerance @@ -226,8 +220,13 @@ async def example_service_creation(): # **When to use**: Normal request routing where any replica can handle # the request. 
-responses = await policy.generate.route(prompt=question) -answer = responses[0].text # Extract text from Completion object + +def example_route_pattern(): + """Example showing route pattern for load balanced requests.""" + # responses = await policy.generate.route(prompt=question) + # answer = responses[0].text # Extract text from Completion object + pass + ###################################################################### # Behind the scenes: @@ -253,13 +252,16 @@ async def example_service_creation(): # **When to use**: You need responses from ALL replicas. -# Get version from all policy replicas -current_versions = await policy.get_version.fanout() -# Returns: [version_replica_1, version_replica_2, ...] +async def example_fanout_pattern(): + """Example showing fanout pattern for broadcast operations.""" + # Get version from all policy replicas + # current_versions = await policy.get_version.fanout() + # Returns: [version_replica_1, version_replica_2, ...] -# Update weights on all replicas -await policy.update_weights.fanout(new_policy_version) -# Broadcasts to all replicas simultaneously + # Update weights on all replicas + # await policy.update_weights.fanout(new_policy_version) + # Broadcasts to all replicas simultaneously + pass # **Performance characteristics**: @@ -280,16 +282,20 @@ async def example_service_creation(): # Streaming requires custom implementation in your training loop. # The basic ``ReplayBuffer`` doesn't have built-in streaming methods. -# Pattern from apps/grpo/main.py continuous training: -while training: + +async def example_streaming_pattern(): + """Pattern from apps/grpo/main.py continuous training.""" # This is the real API call pattern - batch = await replay_buffer.sample.call_one(curr_policy_version=step) - if batch is not None: - # Process batch immediately - loss = await trainer.train_step.call_one(batch) - print(f"Training loss: {loss}") - else: - await asyncio.sleep(0.1) # Wait for more data + # while training: + # batch = await replay_buffer.sample.call_one(curr_policy_version=step) + # if batch is not None: + # # Process batch immediately + # loss = await trainer.train_step.call_one(batch) + # print(f"Training loss: {loss}") + # else: + # await asyncio.sleep(0.1) # Wait for more data + pass + ###################################################################### # **Performance characteristics**: @@ -316,6 +322,7 @@ async def example_service_creation(): from forge.controller import ForgeActor from monarch.actor import endpoint + class ForgeCounter(ForgeActor): def __init__(self, initial_value: int): self.value = initial_value @@ -333,44 +340,46 @@ def get_value(self) -> int: async def reset(self): self.value = 0 -counter_service = await ForgeCounter.options( - procs=1, num_replicas=4 -).as_service(initial_value=0) -# WITHOUT SESSIONS: Each .route() call goes to a different replica -await counter_service.increment.route() # Might go to replica 2 -await counter_service.increment.route() # Might go to replica 1 -await counter_service.increment.route() # Might go to replica 3 +async def example_session_comparison(): + """Demonstrate the difference between sessions and normal routing.""" + counter_service = await ForgeCounter.options(procs=1, num_replicas=4).as_service( + initial_value=0 + ) -results = await counter_service.increment.fanout() # Get from all replicas -print(f"All replica values: {results}") -# Output: All replica values: [1, 2, 1, 1] - Each replica has different state! 
+ # WITHOUT SESSIONS: Each .route() call goes to a different replica + await counter_service.increment.route() # Might go to replica 2 + await counter_service.increment.route() # Might go to replica 1 + await counter_service.increment.route() # Might go to replica 3 -###################################################################### -# The problem: each `.route()` call can go to different replicas, creating inconsistent state. + results = await counter_service.increment.fanout() # Get from all replicas + print(f"All replica values: {results}") + # Output: All replica values: [1, 2, 1, 1] - Each replica has different state! -# WITH SESSIONS: All calls go to the SAME replica -print("\nUsing sticky sessions:") -async with counter_service.session(): # Creates a session that picks one replica - await counter_service.reset.route() # Uses .route() within session - print(await counter_service.increment.route()) # 1 - print(await counter_service.increment.route()) # 2 - print(await counter_service.increment.route()) # 3 + # WITH SESSIONS: All calls go to the SAME replica + print("\nUsing sticky sessions:") + async with counter_service.session(): # Creates a session that picks one replica + await counter_service.reset.route() # Uses .route() within session + print(await counter_service.increment.route()) # 1 + print(await counter_service.increment.route()) # 2 + print(await counter_service.increment.route()) # 3 - final_value = await counter_service.get_value.route() - print(f"Final value on this replica: {final_value}") # 3 + final_value = await counter_service.get_value.route() + print(f"Final value on this replica: {final_value}") # 3 -###################################################################### -# Same pattern works with Policy for multi-turn conversations: + # Cleanup + await counter_service.shutdown() -async with policy.session(): - response1 = await policy.generate.route(turn1) - full_prompt = turn1 + response1[0].text + turn2 - response2 = await policy.generate.route(full_prompt) - # Both calls hit same replica, preserving KV cache -# Cleanup -await counter_service.shutdown() +async def example_multi_turn_conversation(policy, turn1, turn2): + """Same pattern works with Policy for multi-turn conversations.""" + async with policy.session(): + response1 = await policy.generate.route(turn1) + full_prompt = turn1 + response1[0].text + turn2 + response2 = await policy.generate.route(full_prompt) + # Both calls hit same replica, preserving KV cache + return response2 + ###################################################################### # **Performance impact**: Critical for maintaining KV cache in multi-turn conversations. @@ -392,8 +401,11 @@ async def reset(self): async def naive_multi_turn(): # Each call might go to different replica = cache miss response1 = await policy_service.generate.choose(question1) - response2 = await policy_service.generate.choose(question1 + response1) # Cache miss! - response3 = await policy_service.generate.choose(conversation_so_far) # Cache miss! + response2 = await policy_service.generate.choose( + question1 + response1 + ) # Cache miss! + response3 = await policy_service.generate.choose(conversation_so_far) # Cache miss! + ###################################################################### # **The solution**: Sticky sessions ensure all calls go to same replica. 
@@ -412,6 +424,7 @@ async def optimized_multi_turn(policy): # Session ends, replica can be garbage collected or reused + ###################################################################### # **Performance impact**: Maintaining KV cache across turns avoids # recomputing previous tokens. @@ -426,20 +439,23 @@ async def optimized_multi_turn(policy): # **Real Forge approach**: The ReplayBuffer actor handles concurrency # internally: -# Forge ReplayBuffer endpoints (verified from source code) -# Add episodes (thread-safe by actor model) -await replay_buffer.add.call_one(episode) # .choose() would work too, but .call_one() clarifies it's a singleton actor not ActorMesh -# Sample batches for training -batch = await replay_buffer.sample.call_one( - curr_policy_version=step_number, - batch_size=None # Optional parameter, uses default from config -) +async def example_replay_buffer_usage(): + """Forge ReplayBuffer endpoints (verified from source code).""" + # Add episodes (thread-safe by actor model) + # await replay_buffer.add.call_one(episode) -# Additional methods available: -# await replay_buffer.clear.call_one() # Clear buffer -# await replay_buffer.evict.call_one(curr_policy_version) # Remove old episodes -# state = await replay_buffer.state_dict.call_one() # Get state for checkpointing + # Sample batches for training + # batch = await replay_buffer.sample.call_one( + # curr_policy_version=step_number, + # batch_size=None, # Optional parameter, uses default from config + # ) + + # Additional methods available: + # await replay_buffer.clear.call_one() # Clear buffer + # await replay_buffer.evict.call_one(curr_policy_version) # Remove old episodes + # state = await replay_buffer.state_dict.call_one() # Get state + pass # **Critical insight**: The actor model provides natural thread safety - @@ -461,9 +477,13 @@ async def real_weight_sync(trainer, policy, step): # Use .fanout() to update ALL policy replicas await policy.update_weights.fanout(policy_version=step + 1) -# Check current policy version -current_version = await policy.get_version.route() -print(f"Current policy version: {current_version}") + +async def check_policy_version(policy): + """Check current policy version.""" + current_version = await policy.get_version.route() + print(f"Current policy version: {current_version}") + return current_version + ###################################################################### # Deep Dive: Asynchronous Coordination Patterns @@ -481,6 +501,7 @@ async def real_weight_sync(trainer, policy, step): from apps.grpo.main import Episode, Group + async def simple_rl_step(): # ===== Generate a rollout ===== @@ -499,9 +520,7 @@ async def simple_rl_step(): input_ids.unsqueeze(0), max_req_tokens=512, return_logprobs=True ) reward = await reward_actor.evaluate_response.route( # RewardActor is a service - prompt=prompt, - response=actions[0].text, - target=target + prompt=prompt, response=actions[0].text, target=target ) print(f"Reward: {reward}") @@ -513,7 +532,7 @@ async def simple_rl_step(): pad_id=tokenizer.pad_token_id, request_len=512, response_len=512, - target=target + target=target, ) # Add response data @@ -526,7 +545,9 @@ async def simple_rl_step(): # Compute advantages using actual ComputeAdvantages actor group = Group.new_group(0, 1, prompt, 0, tokenizer.pad_token_id, 512, 512, target) group.episodes[0] = episode - advantages = await compute_advantages.compute.call_one(group) # ComputeAdvantages is an actor + advantages = await compute_advantages.compute.call_one( + group + ) # 
ComputeAdvantages is an actor episode.advantage = advantages[0] print(f"Advantage: {advantages[0]}") await replay_buffer.add.call_one(episode) # ReplayBuffer is an actor @@ -544,14 +565,18 @@ async def simple_rl_step(): print("Not enough data in buffer yet") return None -# Note: This simplified example assumes tokenizer and services are already initialized -for step in range(10): - print(f"\n--- RL Step {step + 1} ---") - loss = await simple_rl_step() - if loss: - print(f"Step {step + 1} complete, loss: {loss:.4f}") - else: - print(f"Step {step + 1} complete, building buffer...") + +async def run_training_steps(): + """Run multiple RL training steps.""" + # Note: This simplified example assumes tokenizer and services are already initialized + for step in range(10): + print(f"\n--- RL Step {step + 1} ---") + loss = await simple_rl_step() + if loss: + print(f"Step {step + 1} complete, loss: {loss:.4f}") + else: + print(f"Step {step + 1} complete, building buffer...") + ###################################################################### # Handling Speed Mismatches with Service Scaling @@ -560,27 +585,27 @@ async def simple_rl_step(): # **The insight**: Scale services independently based on their # bottlenecks. -# Scale fast services with more replicas -policy = await Policy.options( - procs=1, num_replicas=8, with_gpus=True # Many replicas for high throughput -).as_service( - engine_config={"model": model_name, "tensor_parallel_size": 1} -) - -# Reward evaluation might be CPU-bound -reward_actor = await RewardActor.options( - procs=1, num_replicas=16, with_gpus=False # More CPU replicas -).as_service( - reward_functions=[MathReward()] -) - -# Training needs fewer but more powerful replicas -trainer = await RLTrainer.options( - procs=1, with_gpus=True # Fewer but GPU-heavy -).as_actor( # Trainer typically uses .as_actor() not .as_service() - model={"name": "qwen3", "flavor": "1.7B"}, - optimizer={"name": "AdamW", "lr": 1e-5} -) + +async def example_service_scaling(): + """Example showing how to scale services independently.""" + # Scale fast services with more replicas + policy = await Policy.options( + procs=1, num_replicas=8, with_gpus=True # Many replicas for high throughput + ).as_service(engine_config={"model": "model_name", "tensor_parallel_size": 1}) + + # Reward evaluation might be CPU-bound + reward_actor = await RewardActor.options( + procs=1, num_replicas=16, with_gpus=False # More CPU replicas + ).as_service(reward_functions=[MathReward()]) + + # Training needs fewer but more powerful replicas + trainer = await RLTrainer.options( + procs=1, with_gpus=True # Fewer but GPU-heavy + ).as_actor( # Trainer typically uses .as_actor() not .as_service() + model={"name": "qwen3", "flavor": "1.7B"}, + optimizer={"name": "AdamW", "lr": 1e-5}, + ) + return policy, reward_actor, trainer ###################################################################### @@ -592,8 +617,8 @@ async def simple_rl_step(): # Exact RewardActor from apps/grpo/main.py from forge.controller import ForgeActor -from monarch.actor import endpoint from forge.data.rewards import MathReward, ThinkingReward +from monarch.actor import endpoint # class definition from apps/grpo/main.py class RewardActor(ForgeActor): @@ -611,29 +636,30 @@ async def evaluate_response(self, prompt: str, response: str, target: str) -> fl total_reward += reward # Return average reward across all functions - return total_reward / len(self.reward_functions) if self.reward_functions else 0.0 + return ( + total_reward / len(self.reward_functions) if 
self.reward_functions else 0.0 + ) -reward_actor = await RewardActor.options( - procs=1, num_replicas=1 -).as_service( - reward_functions=[MathReward(), ThinkingReward()] -) -prompt = "What is 15% of 240?" -response = "15% of 240 is 36" -target = "36" +async def example_reward_actor_usage(): + """Example of using the RewardActor service.""" + reward_actor = await RewardActor.options(procs=1, num_replicas=1).as_service( + reward_functions=[MathReward(), ThinkingReward()] + ) -score = await reward_actor.evaluate_response.route( - prompt=prompt, - response=response, - target=target -) -print(f"Reward score: {score}") # Usually around 1.0 for correct math answers -# For production scaling - increase num_replicas for parallel evaluation: -# RewardActor.options(procs=1, num_replicas=16) # 16 parallel evaluators + prompt = "What is 15% of 240?" + response = "15% of 240 is 36" + target = "36" -# Cleanup when done -await reward_actor.shutdown() + score = await reward_actor.evaluate_response.route( + prompt=prompt, response=response, target=target + ) + print(f"Reward score: {score}") # Usually around 1.0 for correct math answers + # For production scaling - increase num_replicas for parallel evaluation: + # RewardActor.options(procs=1, num_replicas=16) # 16 parallel evaluators + + # Cleanup when done + await reward_actor.shutdown() ###################################################################### @@ -645,18 +671,82 @@ async def evaluate_response(self, prompt: str, response: str, target: str) -> fl # This is the REAL way production RL systems are built with Forge -import asyncio -import torch -from forge.actors.policy import Policy -from forge.actors.reference_model import ReferenceModel -from forge.actors.replay_buffer import ReplayBuffer -from forge.actors.trainer import RLTrainer -from apps.grpo.main import DatasetActor, RewardActor, ComputeAdvantages -from forge.data.rewards import MathReward, ThinkingReward -# Service creation pattern from apps/grpo/main.py lines 322-344 -print("Initializing all services...") -( +async def example_full_service_orchestration(): + """Service creation pattern from apps/grpo/main.py lines 322-344.""" + print("Initializing all services...") + ( + dataloader, + policy, + trainer, + replay_buffer, + compute_advantages, + ref_model, + reward_actor, + ) = await asyncio.gather( + DatasetActor.options(procs=1).as_actor( + path="openai/gsm8k", + revision="main", + data_split="train", + streaming=True, + model="Qwen/Qwen3-1.7B", + ), + Policy.options(procs=1, with_gpus=True, num_replicas=1).as_service( + engine_config={"model": "Qwen/Qwen3-1.7B", "tensor_parallel_size": 1}, + sampling_config={"n": 1, "max_tokens": 512}, + ), + RLTrainer.options(procs=1, with_gpus=True).as_actor( + model={ + "name": "qwen3", + "flavor": "1.7B", + "hf_assets_path": "hf://Qwen/Qwen3-1.7B", + }, + optimizer={"name": "AdamW", "lr": 1e-5}, + training={"local_batch_size": 2, "seq_len": 2048}, + ), + ReplayBuffer.options(procs=1).as_actor( + batch_size=2, max_policy_age=1, dp_size=1 + ), + ComputeAdvantages.options(procs=1).as_actor(), + ReferenceModel.options(procs=1, with_gpus=True).as_actor( + model={ + "name": "qwen3", + "flavor": "1.7B", + "hf_assets_path": "hf://Qwen/Qwen3-1.7B", + } + ), + RewardActor.options(procs=1, num_replicas=1).as_service( + reward_functions=[MathReward(), ThinkingReward()] + ), + ) + + print("All services initialized successfully!") + + # Run training loop + await production_training_loop( + dataloader, + policy, + trainer, + replay_buffer, + compute_advantages, 
+ ref_model, + reward_actor, + ) + + print("Shutting down services...") + await asyncio.gather( + DatasetActor.shutdown(dataloader), + policy.shutdown(), + RLTrainer.shutdown(trainer), + ReplayBuffer.shutdown(replay_buffer), + ComputeAdvantages.shutdown(compute_advantages), + ReferenceModel.shutdown(ref_model), + reward_actor.shutdown(), + ) + print("All services shut down successfully!") + + +async def production_training_loop( dataloader, policy, trainer, @@ -664,35 +754,7 @@ async def evaluate_response(self, prompt: str, response: str, target: str) -> fl compute_advantages, ref_model, reward_actor, -) = await asyncio.gather( - DatasetActor.options(procs=1).as_actor( - path="openai/gsm8k", revision="main", data_split="train", - streaming=True, model="Qwen/Qwen3-1.7B" - ), - Policy.options(procs=1, with_gpus=True, num_replicas=1).as_service( - engine_config={"model": "Qwen/Qwen3-1.7B", "tensor_parallel_size": 1}, - sampling_config={"n": 1, "max_tokens": 512} - ), - RLTrainer.options(procs=1, with_gpus=True).as_actor( - model={"name": "qwen3", "flavor": "1.7B", "hf_assets_path": "hf://Qwen/Qwen3-1.7B"}, - optimizer={"name": "AdamW", "lr": 1e-5}, - training={"local_batch_size": 2, "seq_len": 2048} - ), - ReplayBuffer.options(procs=1).as_actor( - batch_size=2, max_policy_age=1, dp_size=1 - ), - ComputeAdvantages.options(procs=1).as_actor(), - ReferenceModel.options(procs=1, with_gpus=True).as_actor( - model={"name": "qwen3", "flavor": "1.7B", "hf_assets_path": "hf://Qwen/Qwen3-1.7B"} - ), - RewardActor.options(procs=1, num_replicas=1).as_service( - reward_functions=[MathReward(), ThinkingReward()] - ), -) - -print("All services initialized successfully!") - -async def production_training_loop(): +): """Real training loop pattern from apps/grpo/main.py""" step = 0 @@ -713,11 +775,13 @@ async def production_training_loop(): reward = await reward_actor.evaluate_response.route( prompt=sample["question"], response=responses[0].text, - target=sample["answer"] + target=sample["answer"], ) # Experience storage (using actual Episode structure) - episode = create_episode_from_grpo_data(sample, responses[0], reward, ref_logprobs[0], step) + episode = create_episode_from_grpo_data( + sample, responses[0], reward, ref_logprobs[0], step + ) await replay_buffer.add.call_one(episode) # Training when ready @@ -733,18 +797,6 @@ async def production_training_loop(): print(f"Step {step}, Loss: {loss:.4f}") step += 1 -print("Shutting down services...") -await asyncio.gather( - DatasetActor.shutdown(dataloader), - policy.shutdown(), - RLTrainer.shutdown(trainer), - ReplayBuffer.shutdown(replay_buffer), - ComputeAdvantages.shutdown(compute_advantages), - ReferenceModel.shutdown(ref_model), - reward_actor.shutdown(), -) -print("All services shut down successfully!") - # **Key observations:** # diff --git a/docs/source/tutorial_sources/zero-to-forge/3_Monarch_101.py b/docs/source/tutorial_sources/zero-to-forge/3_Monarch_101.py index e6f071af9..c989724a0 100644 --- a/docs/source/tutorial_sources/zero-to-forge/3_Monarch_101.py +++ b/docs/source/tutorial_sources/zero-to-forge/3_Monarch_101.py @@ -194,8 +194,7 @@ # Same simple API works across hosts: cluster_procs = spawn_cluster_procs( - hosts=["host1", "host2", "host3"], - per_host={"gpus": 4} + hosts=["host1", "host2", "host3"], per_host={"gpus": 4} ) # Automatically creates: @@ -213,9 +212,8 @@ # This shows the underlying actor system that powers Forge services # NOTE: This is for educational purposes - use ForgeActor and .as_service() in real Forge apps! 
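+# For orientation: the raw Counter below is roughly the Monarch-level
+# counterpart of the ForgeCounter service from Part 2. It exposes similar
+# endpoints, but is spawned directly on a ProcMesh rather than through
+# ForgeActor.options(...).as_service().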
-from monarch.actor import Actor, endpoint, this_proc, Future -from monarch.actor import ProcMesh, this_host -import asyncio +from monarch.actor import Actor, endpoint, Future, ProcMesh, this_host, this_proc + # STEP 1: Define a basic actor class Counter(Actor): @@ -230,39 +228,43 @@ def increment(self) -> None: def get_value(self) -> int: return self.value -# STEP 2: Single actor in local process -counter: Counter = this_proc().spawn("counter", Counter, initial_value=0) -# STEP 3: Send messages -fut: Future[int] = counter.get_value.call_one() -value = await fut -print(f"Counter value: {value}") # 0 +async def example_monarch_counter_usage(): + """Example showing basic Monarch actor usage.""" + # STEP 2: Single actor in local process + counter: Counter = this_proc().spawn("counter", Counter, initial_value=0) + + # STEP 3: Send messages + fut: Future[int] = counter.get_value.call_one() + value = await fut + print(f"Counter value: {value}") # 0 -# STEP 4: Multiple actors across processes -procs: ProcMesh = this_host().spawn_procs(per_host={"gpus": 8}) -counters: Counter = procs.spawn("counters", Counter, 0) + # STEP 4: Multiple actors across processes + procs: ProcMesh = this_host().spawn_procs(per_host={"gpus": 8}) + counters: Counter = procs.spawn("counters", Counter, 0) -# STEP 5: Broadcast to all actors -await counters.increment.call() + # STEP 5: Broadcast to all actors + await counters.increment.call() -# STEP 6: Different message patterns -# call_one() - single actor -value = await counters.get_value.call_one() -print(f"One counter: {value}") # Output: One counter: 1 + # STEP 6: Different message patterns + # call_one() - single actor + value = await counters.get_value.call_one() + print(f"One counter: {value}") # Output: One counter: 1 -# choose() - random single actor (actors only, not services) -value = await counters.get_value.choose() -print(f"Random counter: {value}") # Output: Random counter: 1 + # choose() - random single actor (actors only, not services) + value = await counters.get_value.choose() + print(f"Random counter: {value}") # Output: Random counter: 1 -# call() - all actors, collect results -values = await counters.get_value.call() -print(f"All counters: {values}") # Output: All counters: [1, 1, 1, 1, 1, 1, 1, 1] + # call() - all actors, collect results + values = await counters.get_value.call() + print(f"All counters: {values}") # Output: All counters: [1, 1, 1, 1, 1, 1, 1, 1] -# broadcast() - fire and forget -await counters.increment.broadcast() # No return value - just sends to all actors + # broadcast() - fire and forget + await counters.increment.broadcast() # No return value - just sends to all actors + + # Cleanup + await procs.stop() -# Cleanup -await procs.stop() ###################################################################### # Remember: This raw Monarch code is for understanding how Forge works internally. @@ -459,7 +461,6 @@ def get_value(self) -> int: # * **Scale effectively**: Where to add resources for maximum impact? 
- ###################################################################### # Conclusion # ---------- From b680e7e241f24efa35f5fbdd518943887132e53d Mon Sep 17 00:00:00 2001 From: Svetlana Karslioglu Date: Tue, 14 Oct 2025 14:56:50 -0700 Subject: [PATCH 28/28] Update --- .../zero-to-forge/1_RL_and_Forge_Fundamentals.py | 7 ++++++- .../tutorial_sources/zero-to-forge/2_Forge_Internals.py | 4 ++-- .../source/tutorial_sources/zero-to-forge/3_Monarch_101.py | 6 +++--- 3 files changed, 11 insertions(+), 6 deletions(-) diff --git a/docs/source/tutorial_sources/zero-to-forge/1_RL_and_Forge_Fundamentals.py b/docs/source/tutorial_sources/zero-to-forge/1_RL_and_Forge_Fundamentals.py index d97d715a3..e408cffae 100644 --- a/docs/source/tutorial_sources/zero-to-forge/1_RL_and_Forge_Fundamentals.py +++ b/docs/source/tutorial_sources/zero-to-forge/1_RL_and_Forge_Fundamentals.py @@ -325,7 +325,12 @@ async def example_automatic_management(policy): import torch -from apps.grpo.main import ComputeAdvantages, DatasetActor, RewardActor + +try: + from apps.grpo.main import ComputeAdvantages, DatasetActor, RewardActor +except ImportError: + # Module not available during doc build + ComputeAdvantages = DatasetActor = RewardActor = None ###################################################################### # Forge handles behind the scenes: diff --git a/docs/source/tutorial_sources/zero-to-forge/2_Forge_Internals.py b/docs/source/tutorial_sources/zero-to-forge/2_Forge_Internals.py index 72b63d728..001d3c02d 100644 --- a/docs/source/tutorial_sources/zero-to-forge/2_Forge_Internals.py +++ b/docs/source/tutorial_sources/zero-to-forge/2_Forge_Internals.py @@ -141,8 +141,8 @@ async def example_service_creation(): return policy -# Run the example -asyncio.run(example_service_creation()) +# Run the example (commented out to avoid execution during doc build) +# asyncio.run(example_service_creation()) ###################################################################### diff --git a/docs/source/tutorial_sources/zero-to-forge/3_Monarch_101.py b/docs/source/tutorial_sources/zero-to-forge/3_Monarch_101.py index c989724a0..c0c3c1411 100644 --- a/docs/source/tutorial_sources/zero-to-forge/3_Monarch_101.py +++ b/docs/source/tutorial_sources/zero-to-forge/3_Monarch_101.py @@ -93,10 +93,10 @@ # **Key insight**: ProcMesh creates one process per GPU, automatically handling the process-to-hardware mapping. # -# This simple call: -procs = this_host().spawn_procs(per_host={"gpus": 8}) +# Example call (commented out since this_host is not defined at module level): +# procs = this_host().spawn_procs(per_host={"gpus": 8}) -# Creates: +# This creates: # Process 0 → GPU 0 # Process 1 → GPU 1 # Process 2 → GPU 2