From 63045fdca9c7c771b6dd0d23b683026c7ca4782c Mon Sep 17 00:00:00 2001 From: Sanyam Bhutani Date: Thu, 2 Oct 2025 18:50:19 -0700 Subject: [PATCH 01/28] Create ReadMe.MD --- docs/Tutorials/ReadMe.MD | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 docs/Tutorials/ReadMe.MD diff --git a/docs/Tutorials/ReadMe.MD b/docs/Tutorials/ReadMe.MD new file mode 100644 index 000000000..6294c8ec8 --- /dev/null +++ b/docs/Tutorials/ReadMe.MD @@ -0,0 +1,11 @@ +Welcome to the Tutorials section! This section is inspired by the A-Z PyTorch tutorial, shoutout to our friends that remember! + +This section currently is structured in 3 detailed parts: + +1. []() +2. []() +3. []() + +Each part builds upon the next and the entire section can be consumed in roughly an hour-Grab a Chai and Enjoy! + +If you're eager, please checkout our SFT Tutorial too (Coming soon!) as well as [App Examples](../../apps/). \ No newline at end of file From 7dbf75f03739fdf3b4433fd68f73a789972091b0 Mon Sep 17 00:00:00 2001 From: Sanyam Bhutani Date: Thu, 2 Oct 2025 19:02:51 -0700 Subject: [PATCH 02/28] add part 1 --- docs/Tutorials/1_RL_and_Forge_Fundamentals.MD | 385 ++++++++++++++++++ docs/Tutorials/2_.MD | 0 docs/Tutorials/3_.MD | 0 docs/Tutorials/ReadMe.MD | 12 +- 4 files changed, 395 insertions(+), 2 deletions(-) create mode 100644 docs/Tutorials/1_RL_and_Forge_Fundamentals.MD create mode 100644 docs/Tutorials/2_.MD create mode 100644 docs/Tutorials/3_.MD diff --git a/docs/Tutorials/1_RL_and_Forge_Fundamentals.MD b/docs/Tutorials/1_RL_and_Forge_Fundamentals.MD new file mode 100644 index 000000000..96710b57a --- /dev/null +++ b/docs/Tutorials/1_RL_and_Forge_Fundamentals.MD @@ -0,0 +1,385 @@ +# Part 1: RL Fundamentals - Using Forge Terminology + +## Core RL Components in Forge + +Let's start with a simple math tutoring example to understand RL concepts with the exact names Forge uses: + +### The Toy Example: Teaching Math + +```mermaid +graph TD + subgraph Example["Math Tutoring RL Example"] + Dataset["Dataset
math problems
'What is 2+2?'"] + Policy["Policy
student AI
generates: 'The answer is 4'"] + Reward["Reward Model
Evaluation Exam
scores: 0.95 (excellent)"] + Reference["Reference Model
original student
baseline comparison"] + ReplayBuffer["Replay Buffer
notebook
stores experiences"] + Trainer["Trainer
tutor
improves student"] + end + + Dataset --> Policy + Policy --> Reward + Policy --> Reference + Reward --> ReplayBuffer + Reference --> ReplayBuffer + ReplayBuffer --> Trainer + Trainer --> Policy + + style Policy fill:#99ff99 + style Reward fill:#ffcc99 + style Trainer fill:#ff99cc +``` + +### RL Components Defined (Forge Names) + +1. **Dataset**: Provides questions/prompts (like "What is 2+2?") +2. **Policy**: The AI being trained (generates answers like "The answer is 4") +3. **Reward Model**: Evaluates answer quality (gives scores like 0.95) +4. **Reference Model**: Original policy copy (prevents drift from baseline) +5. **Replay Buffer**: Stores experiences (question + answer + score) +6. **Trainer**: Updates the policy weights based on experiences + +### The RL Learning Flow + +```python +# CONCEPTUAL EXAMPLE - see apps/grpo/main.py for GRPO Code + +def conceptual_rl_step(): + # 1. Get a math problem + question = dataset.sample() # "What is 2+2?" + + # 2. Student generates answer + answer = policy.generate(question) # "The answer is 4" + + # 3. Teacher grades it + score = reward_model.evaluate(question, answer) # 0.95 + + # 4. Compare to original student + baseline = reference_model.compute_logprobs(question, answer) + + # 5. Store the experience + experience = Episode(question, answer, score, baseline) + replay_buffer.add(experience) + + # 6. When enough experiences collected, improve student + batch = replay_buffer.sample(curr_policy_version=0) + if batch is not None: + trainer.train_step(batch) # Student gets better! + +# 🔄 See complete working example below with actual Forge service calls +``` + +## From Concepts to Forge Services + +Here's the key insight: **Each RL component becomes a Forge service**. The toy example above maps directly to Forge: + +```mermaid +graph LR + subgraph Concepts["RL Concepts"] + C1["Dataset"] + C2["Policy"] + C3["Reward Model"] + C4["Reference Model"] + C5["Replay Buffer"] + C6["Trainer"] + end + + subgraph Services["Forge Services (Real Classes)"] + S1["DatasetActor"] + S2["Policy"] + S3["RewardActor"] + S4["ReferenceModel"] + S5["ReplayBuffer"] + S6["RLTrainer"] + end + + C1 --> S1 + C2 --> S2 + C3 --> S3 + C4 --> S4 + C5 --> S5 + C6 --> S6 + + style C2 fill:#99ff99 + style S2 fill:#99ff99 + style C3 fill:#ffcc99 + style S3 fill:#ffcc99 +``` + +### RL Step with Forge Services + +```python +# Conceptual Example + +async def conceptual_forge_rl_step(services, step): + # 1. Get a math problem - CONCEPTUAL API + sample = await services['dataloader'].get_sample() + question, target = sample["question"], sample["answer"] + + # 2. Student generates answer - CONCEPTUAL API + # Actual method names vary by implementation + responses = await services['policy'].generate(prompt=question) + answer = responses[0].text + + # 3. Teacher grades it - CONCEPTUAL API + # Actual reward evaluation varies by implementation + score = await services['reward_actor'].evaluate( + prompt=question, response=answer, target=target + ) + + # 4. Compare to baseline - CONCEPTUAL API + ref_logprobs = await services['ref_model'].compute_baseline(responses[0].token_ids) + + # 5. Store experience - CONCEPTUAL Episode structure + # Real Episode structure in src/forge/data_models/episode.py + episode = create_episode(responses[0], score, ref_logprobs, step) + await services['replay_buffer'].store(episode) + + # 6. 
Improve student - CONCEPTUAL API + batch = await services['replay_buffer'].get_batch(policy_version=step) + if batch is not None: + loss = await services['trainer'].update_policy(batch) + return loss +``` + +**Key difference**: Same RL logic, but each component is now a distributed, fault-tolerant, auto-scaling service. + + +## Why This Matters: Traditional ML Infrastructure Fails + +### The Infrastructure Challenge + +Our simple RL loop above has complex requirements: + +#### Problem 1: Different Resource Needs + +```mermaid +graph TD + subgraph Components["Each Component Needs Different Resources"] + Policy["Policy (Student AI)
Generates: 'The answer is 4'
Needs: Large GPU memory
Scaling: Multiple replicas for speed"] + + Reward["Reward Model (Teacher)
Scores answers: 0.95
Needs: Moderate compute
Scaling: CPU or small GPU"] + + Trainer["Trainer (Tutor)
Improves student weights
Needs: Massive GPU compute
Scaling: Distributed training"] + + Dataset["Dataset (Question Bank)
Provides: 'What is 2+2?'
Needs: CPU intensive I/O
Scaling: High memory bandwidth"] + end + + style Policy fill:#99ff99 + style Reward fill:#ffcc99 + style Trainer fill:#ff99cc + style Dataset fill:#ccccff +``` + +### Problem 2: Complex Interdependencies + +```mermaid +graph LR + A["Policy: Student AI
'What is 2+2?' → 'The answer is 4'"] + B["Reward: Teacher
Scores answer: 0.95"] + C["Reference: Original Student
Provides baseline comparison"] + D["Replay Buffer: Notebook
Stores: question + answer + score"] + E["Trainer: Tutor
Improves student using experiences"] + + A --> B + A --> C + B --> D + C --> D + D --> E + E --> A + + style A fill:#99ff99 + style B fill:#ffcc99 + style C fill:#99ccff + style D fill:#ccff99 + style E fill:#ff99cc +``` + +Each step has different: +- **Latency requirements**: Policy inference needs low latency, training can batch +- **Scaling patterns**: Reward evaluation scales with response count, training with model size +- **Failure modes**: Policy failure stops generation, reward failure affects learning quality +- **Resource utilization**: GPUs for inference/training, CPUs for data processing + +### Problem 3: The Coordination Challenge + +Unlike supervised learning where you process independent batches, RL requires coordination: + +```python +# This won't work - creates bottlenecks and resource waste +def naive_rl_step(): + # Policy waits idle while reward model works + response = policy_model.generate(prompt) # GPU busy + reward = reward_model.evaluate(prompt, response) # Policy GPU idle + + # Training waits for single episode + loss = compute_loss(response, reward) # Batch size = 1, inefficient + + # Everything stops if any component fails + if policy_fails or reward_fails or trainer_fails: + entire_system_stops() +``` + +## Enter Forge: RL-Native Architecture + +Forge solves these problems by treating each RL component as an **independent, scalable service** + +Let's see how core RL concepts map to Forge services: + +```python +async def real_rl_training_step(services, step): + """Single RL step using verified Forge APIs""" + + # 1. Environment interaction + sample = await services['dataloader'].__next__.call_one() + prompt, target = sample["question"], sample["answer"] + + responses = await services['policy'].generate.route(prompt=prompt) + + # 2. Reward computation + score = await services['reward_actor'].evaluate_response.route( + prompt=prompt, response=responses[0].text, target=target + ) + + # 3. Get reference logprobs + ref_logprobs = await services['ref_model'].forward.route(responses[0].token_ids) + + # 4. Experience storage - Episode creation pattern + # Note: Actual Episode structure requires token tensors, not text + episode = create_episode_from_response(responses[0], score, ref_logprobs, step) + await services['replay_buffer'].add.call_one(episode) + + # 5. Learning - trainer endpoint + batch = await services['replay_buffer'].sample.call_one( + curr_policy_version=step + ) + if batch is not None: + loss = await services['trainer'].train_step.call_one(batch) + + # 6. Policy synchronization - weight update pattern + await services['trainer'].push_weights.call_one(step + 1) + await services['policy'].update_weights.fanout(step + 1) + + return loss +``` + +**Key insight**: Each line of RL pseudocode becomes a service call. The complexity of distribution, scaling, and fault tolerance is hidden behind these simple interfaces. 
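Because each of these calls is an ordinary awaitable, independent steps can also be overlapped with plain `asyncio`. Here is a minimal sketch (not a Forge API - it reuses the service handles and endpoint names from the loop above; everything else is illustrative) that scores a whole group of completions concurrently instead of one at a time:

```python
import asyncio

async def score_group(services, prompt, target, completions):
    """Illustrative helper: evaluate every completion in a group in parallel."""
    # asyncio.gather runs all reward calls concurrently and returns scores in order
    scores = await asyncio.gather(*(
        services['reward_actor'].evaluate_response.route(
            prompt=prompt, response=completion.text, target=target
        )
        for completion in completions
    ))
    return scores
```

Overlapping independent service calls like this is exactly the kind of concurrency the naive loop above could not express.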
+ +## What Makes This Powerful + +### Automatic Resource Management +```python +responses = await policy.generate.route(prompt=question) +answer = responses[0].text # responses is list[Completion] + +# Forge handles behind the scenes: +# - Routing to least loaded replica +# - GPU memory management +# - Batch optimization +# - Failure recovery +# - Auto-scaling based on demand +``` + +### Independent Scaling +```python + +from forge.actors.policy import Policy, PolicyConfig, SamplingOverrides, WorkerConfig +from forge.actors.replay_buffer import ReplayBuffer +from forge.controller.service import shutdown_service +from apps.grpo.main import Trainer, RewardActor, ComputeAdvantages, RefModel, DatasetActor +from forge.data.rewards import MathReward, ThinkingReward +import asyncio + +model = "Qwen/Qwen3-1.7B" +group_size = 1 + +( + dataloader, + policy, + trainer, + replay_buffer, + compute_advantages, + ref_model, + reward_actor, +) = await asyncio.gather( + # Dataset service + spawn_service( + ServiceConfig(procs_per_replica=1, num_replicas=1), + DatasetActor, + path="openai/gsm8k", + config_name="main", + split="train", + streaming=True, + ), + # Policy service with GPU + spawn_service( + ServiceConfig(procs_per_replica=1, with_gpus=True, num_replicas=1), + Policy, + config=PolicyConfig( + worker_params=WorkerConfig(model=model), + sampling_params=SamplingOverrides( + num_samples=group_size, max_tokens=16 + ), + ), + ), + # Trainer service with GPU + spawn_service( + ServiceConfig(procs_per_replica=1, with_gpus=True, num_replicas=1), + Trainer, + learning_rate=1e-5, + beta=0.1, + model_name=model, + ), + # Replay buffer (CPU) + spawn_service( + ServiceConfig(procs_per_replica=1, num_replicas=1), + ReplayBuffer, + batch_size=2, + max_policy_age=1, + ), + # Advantage computation (CPU) + spawn_service( + ServiceConfig(procs_per_replica=1, num_replicas=1), + ComputeAdvantages, + gamma=0.99, + lambda_=0.95, + ), + # Reference model with GPU + spawn_service( + ServiceConfig(procs_per_replica=1, num_replicas=1, with_gpus=True), + RefModel, + model_name=model, + ), + # Reward actor (CPU) + spawn_service( + ServiceConfig(procs_per_replica=1, num_replicas=1), + RewardActor, + reward_functions=[MathReward(), ThinkingReward()], + ) + ) + +# Production scaling - multiply num_replicas: +# Policy: num_replicas=8 for high inference demand +# RewardActor: num_replicas=16 for parallel evaluation +# Trainer: num_replicas=4 for distributed training +``` + +### Fault Tolerance +```python +# If a policy replica fails: +responses = await policy.generate.route(prompt=question) +answer = responses[0].text +# -> Forge automatically routes to healthy replica +# -> Failed replica respawns in background +# -> No impact on training loop + +# If reward service fails: +score = await reward_actor.evaluate_response.route( + prompt=question, response=answer, target=target +) +# -> Retries on different replica automatically +# -> Graceful degradation if all replicas fail +# -> System continues (may need application-level handling) +``` + +This is fundamentally different from monolithic RL implementations where any component failure stops everything. 
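The "application-level handling" mentioned above can be as simple as a retry wrapper around a service call. Here is a minimal sketch using plain `asyncio` - nothing below is a built-in Forge API, and the usage comment assumes the `reward_actor` handle from the example above:

```python
import asyncio

async def call_with_retries(endpoint_call, *, attempts=3, timeout_s=30.0, **kwargs):
    """Illustrative retry/timeout wrapper for any awaitable service endpoint call."""
    last_exc = None
    for attempt in range(attempts):
        try:
            # Bound each attempt with a timeout so a stuck replica can't stall the loop
            return await asyncio.wait_for(endpoint_call(**kwargs), timeout=timeout_s)
        except Exception as exc:  # narrow the exception types in real code
            last_exc = exc
            await asyncio.sleep(2 ** attempt)  # simple exponential backoff
    raise last_exc

# Hypothetical usage with the handles from the example above:
# score = await call_with_retries(
#     reward_actor.evaluate_response.route,
#     prompt=question, response=answer, target=target,
# )
```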
diff --git a/docs/Tutorials/2_.MD b/docs/Tutorials/2_.MD new file mode 100644 index 000000000..e69de29bb diff --git a/docs/Tutorials/3_.MD b/docs/Tutorials/3_.MD new file mode 100644 index 000000000..e69de29bb diff --git a/docs/Tutorials/ReadMe.MD b/docs/Tutorials/ReadMe.MD index 6294c8ec8..01d750d06 100644 --- a/docs/Tutorials/ReadMe.MD +++ b/docs/Tutorials/ReadMe.MD @@ -1,8 +1,16 @@ -Welcome to the Tutorials section! This section is inspired by the A-Z PyTorch tutorial, shoutout to our friends that remember! +## Zero to Forge: From RL Theory to Production-Scale Implementation + +A comprehensive guide for ML Engineers building distributed RL systems for language models. + +Some of the examples mentioned below will be conceptual in nature for understanding. Please refer to API Docs (Coming Soon!) for more details + +Welcome to the Tutorials section! This section is inspired by the A-Z PyTorch tutorial, shoutout to our PyTorch friends that remember! + +### This section currently is structured in 3 detailed parts: -1. []() +1. [RL Fundamentals and Understanding Forge Terminology](./1_RL_and_Forge_Fundamentals.MD): This gives a quick refresher of Reinforcement Learning and teaches you Forge Fundamentals 2. []() 3. []() From 8abcadbae7997252a800db6f57aa8263bb3f7088 Mon Sep 17 00:00:00 2001 From: Sanyam Bhutani Date: Thu, 2 Oct 2025 19:06:35 -0700 Subject: [PATCH 03/28] Update 1_RL_and_Forge_Fundamentals.MD --- docs/Tutorials/1_RL_and_Forge_Fundamentals.MD | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/Tutorials/1_RL_and_Forge_Fundamentals.MD b/docs/Tutorials/1_RL_and_Forge_Fundamentals.MD index 96710b57a..bcffc733c 100644 --- a/docs/Tutorials/1_RL_and_Forge_Fundamentals.MD +++ b/docs/Tutorials/1_RL_and_Forge_Fundamentals.MD @@ -85,6 +85,7 @@ graph LR end subgraph Services["Forge Services (Real Classes)"] + S1["DatasetActor"] S2["Policy"] S3["RewardActor"] From c0c09cb43a588dc9bbd99dc18fc7fa65149d4f11 Mon Sep 17 00:00:00 2001 From: Sanyam Bhutani Date: Thu, 2 Oct 2025 19:08:03 -0700 Subject: [PATCH 04/28] Update 1_RL_and_Forge_Fundamentals.MD --- docs/Tutorials/1_RL_and_Forge_Fundamentals.MD | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/docs/Tutorials/1_RL_and_Forge_Fundamentals.MD b/docs/Tutorials/1_RL_and_Forge_Fundamentals.MD index bcffc733c..223a6e152 100644 --- a/docs/Tutorials/1_RL_and_Forge_Fundamentals.MD +++ b/docs/Tutorials/1_RL_and_Forge_Fundamentals.MD @@ -85,7 +85,6 @@ graph LR end subgraph Services["Forge Services (Real Classes)"] - S1["DatasetActor"] S2["Policy"] S3["RewardActor"] @@ -109,6 +108,8 @@ graph LR ### RL Step with Forge Services +Let's look at the example from above again, but this time we would use the names from Forge: + ```python # Conceptual Example @@ -145,6 +146,8 @@ async def conceptual_forge_rl_step(services, step): **Key difference**: Same RL logic, but each component is now a distributed, fault-tolerant, auto-scaling service. +Did you realise-we are not worrying about any Infra code here! Forge Automagically handles the details behind the scenes and you can focus on writing your RL Algorthms! 
+ ## Why This Matters: Traditional ML Infrastructure Fails From 0a77675f94258aa63ef8038211a7581b12e3a4ea Mon Sep 17 00:00:00 2001 From: Sanyam Bhutani Date: Thu, 2 Oct 2025 19:12:43 -0700 Subject: [PATCH 05/28] part 2 --- docs/Tutorials/1_RL_and_Forge_Fundamentals.MD | 36 +- docs/Tutorials/2_Forge_Internals.MD | 665 ++++++++++++++++++ docs/Tutorials/3_.MD | 0 docs/Tutorials/{2_.MD => 3_Monarch_101.MD} | 0 4 files changed, 685 insertions(+), 16 deletions(-) create mode 100644 docs/Tutorials/2_Forge_Internals.MD delete mode 100644 docs/Tutorials/3_.MD rename docs/Tutorials/{2_.MD => 3_Monarch_101.MD} (100%) diff --git a/docs/Tutorials/1_RL_and_Forge_Fundamentals.MD b/docs/Tutorials/1_RL_and_Forge_Fundamentals.MD index 223a6e152..810ef373f 100644 --- a/docs/Tutorials/1_RL_and_Forge_Fundamentals.MD +++ b/docs/Tutorials/1_RL_and_Forge_Fundamentals.MD @@ -275,15 +275,15 @@ async def real_rl_training_step(services, step): ```python responses = await policy.generate.route(prompt=question) answer = responses[0].text # responses is list[Completion] - -# Forge handles behind the scenes: -# - Routing to least loaded replica -# - GPU memory management -# - Batch optimization -# - Failure recovery -# - Auto-scaling based on demand ``` +Forge handles behind the scenes: +- Routing to least loaded replica +- GPU memory management +- Batch optimization +- Failure recovery +- Auto-scaling based on demand + ### Independent Scaling ```python @@ -361,13 +361,14 @@ group_size = 1 reward_functions=[MathReward(), ThinkingReward()], ) ) - -# Production scaling - multiply num_replicas: -# Policy: num_replicas=8 for high inference demand -# RewardActor: num_replicas=16 for parallel evaluation -# Trainer: num_replicas=4 for distributed training ``` +Production scaling - multiply num_replicas: +- Policy: num_replicas=8 for high inference demand +- RewardActor: num_replicas=16 for parallel evaluation +- Trainer: num_replicas=4 for distributed training + + ### Fault Tolerance ```python # If a policy replica fails: @@ -381,9 +382,12 @@ answer = responses[0].text score = await reward_actor.evaluate_response.route( prompt=question, response=answer, target=target ) -# -> Retries on different replica automatically -# -> Graceful degradation if all replicas fail -# -> System continues (may need application-level handling) ``` -This is fundamentally different from monolithic RL implementations where any component failure stops everything. +- Retries on different replica automatically +- Graceful degradation if all replicas fail +- System continues (may need application-level handling) + +This is fundamentally different from monolithic RL implementations where any component failure stops everything! + +In the next Section, we will go a layer deeper and learn how ForgeServices work. Continue to [Part 2 here](./2_Forge_Internals.MD) \ No newline at end of file diff --git a/docs/Tutorials/2_Forge_Internals.MD b/docs/Tutorials/2_Forge_Internals.MD new file mode 100644 index 000000000..d55eda51a --- /dev/null +++ b/docs/Tutorials/2_Forge_Internals.MD @@ -0,0 +1,665 @@ +# Part 2: Peeling Back the Abstraction - What Are Services? + +We highly recommend reading [Part 1](./1_RL_and_Forge_Fundamentals.MD) before this, it explains RL Concepts and how they land in Forge. + +Now that you see the power of the service abstraction, let's understand what's actually happening under the hood, Grab your chai! 
+ +## Service Anatomy: Beyond the Interface + +When you call `await policy_service.generate(question)`, here's what actually happens: + +```mermaid +graph TD + Call["Your Code:
await policy_service.generate"] + + subgraph ServiceLayer["Service Layer"] + Proxy["Service Proxy
Load balancing
Health checking
Request routing"] + LB["Load Balancer
Replica selection
Circuit breaker
Retry logic"] + end + + subgraph Replicas["Replica Management"] + R1["Replica 1
GPU 0
Healthy"] + R2["Replica 2
GPU 1
Overloaded"] + R3["Replica 3
GPU 2
Failed"] + R4["Replica 4
GPU 3
Healthy"] + end + + subgraph Compute["Actual Computation"] + Actor["Policy Actor
vLLM engine
Model weights
KV cache"] + end + + Call --> Proxy + Proxy --> LB + LB --> R1 + LB -.-> R2 + LB -.-> R3 + LB --> R4 + R1 --> Actor + R4 --> Actor + + style Call fill:#99ff99 + style LB fill:#ffcc99 + style R3 fill:#ff9999 + style Actor fill:#cc99ff +``` + +## Service Components Deep Dive + +### 1. Real Service Configuration + +Here's the actual ServiceConfig from Forge source code: + +```python +# Configuration pattern from apps/grpo/main.py: +Policy.options( + procs=1, # Processes per replica + num_replicas=4, # Number of replicas + with_gpus=True # Allocate GPUs + # Other available options: + # hosts=None +) + +# This is the ACTUAL way services are configured in Forge +``` + +### 2. Real Service Creation + +Services are created using the `spawn_service` function: + +```python +# This is what ACTUALLY works - copied directly from the notebook + +from forge.controller.service import ServiceConfig, spawn_service +from forge.actors.policy import Policy, PolicyConfig, SamplingOverrides, WorkerConfig + +model = "Qwen/Qwen3-1.7B" + +policy = await spawn_service( + ServiceConfig(procs_per_replica=1, with_gpus=True, num_replicas=1), + Policy, + config=PolicyConfig( + worker_params=WorkerConfig(model=model), + sampling_params=SamplingOverrides( + num_samples=1, max_tokens=16 + ), + ), +) + +prompt = "What is 3 + 5?" +responses = await policy.generate.choose(prompt=prompt) +print(f"Response: {responses[0].text}") + +# The spawn_service() function automatically handles: +# - Spawning actor replicas across processes/GPUs +# - Load balancing with .choose() method +# - Health monitoring and failure recovery +# - Message routing and serialization + +# Cleanup when done +await shutdown_service(policy) +``` + +### 3. How Services Actually Work + +Forge services are implemented as ServiceActors that manage collections of your ForgeActor replicas: + +```python +# Forge internals - What happens behind the scenes: +# 1. .as_service() creates a ServiceInterface +# 2. ServiceInterface manages N replicas of your ForgeActor class +# 3. ServiceInterface handles routing between replicas +# 4. You get methods like .route(), .fanout(), etc. + +# Your code sees this: +responses = await policy.generate.route(prompt=prompt) + +# But behind the scenes: +# - ServiceInterface selects healthy replica +# - Routes message to that replica's Policy.generate() endpoint +# - Handles failures and retries automatically +# - Returns list[Completion] from the selected replica +``` + +### 3. Different Service Types and Their Characteristics + +```mermaid +graph TD + subgraph GPU["GPU-Intensive Services"] + PolicySvc["Policy Service
Large model inference
High GPU memory
Batch optimization"] + TrainerSvc["Trainer Service
Distributed training
Gradient sync
Massive compute"] + RefSvc["Reference Service
Frozen model
Baseline computation
Read-only ops"] + end + + subgraph CPU["CPU-Intensive Services"] + RewardSvc["Reward Service
Evaluation logic
Rule-based scoring
High throughput"] + DataSvc["Data Service
Dataset streaming
Preprocessing
I/O optimization"] + end + + subgraph Memory["Memory-Intensive Services"] + BufferSvc["Buffer Service
Experience storage
Efficient sampling
Persistence"] + MetricsSvc["Metrics Service
Logging aggregation
Performance tracking
Analytics"] + end + + style PolicySvc fill:#ff9999 + style TrainerSvc fill:#ff9999 + style RewardSvc fill:#99ff99 + style BufferSvc fill:#9999ff +``` + +## Deep Dive: Service Communication Patterns + +These communication patterns (\"adverbs\") determine how your service calls are routed to replicas. Understanding when to use each pattern is key to effective Forge usage. + +### 1. `.route()` - Load Balanced Single Replica + +**When to use**: Normal request routing where any replica can handle the request. + +```python +responses = await policy.generate.route(prompt=question) +answer = responses[0].text # Extract text from Completion object + +# Behind the scenes: +# 1. Health check eliminates failed replicas +# 2. Load balancer picks least loaded healthy replica +# 3. Request routes to that specific replica +# 4. Automatic retry on different replica if failure +``` + +**Performance characteristics**: +- **Latency**: Lowest (single network hop) +- **Throughput**: Limited by single replica capacity +- **Fault tolerance**: Automatic failover to other replicas + +**Critical insight**: `.route()` is your default choice for stateless operations in Forge services. + +### 2. `.fanout()` - Broadcast with Results Collection + +**When to use**: You need responses from ALL replicas. + +```python +# Get version from all policy replicas +current_versions = await policy.get_version.fanout() +# Returns: [version_replica_1, version_replica_2, ...] + +# Update weights on all replicas +await policy.update_weights.fanout(new_policy_version) +# Broadcasts to all replicas simultaneously +``` + +**Performance characteristics**: +- **Latency**: Slowest replica determines total latency +- **Throughput**: Network bandwidth × number of replicas +- **Fault tolerance**: Fails if ANY replica fails (unless configured otherwise) + +**Critical gotcha**: Don't use `.fanout()` for high-frequency operations - it contacts all replicas. + +### 3. Streaming Operations - Custom Implementation Pattern + +**When to use**: You want to process results as they arrive, not wait for all. + +```python +# 📝 CONCEPTUAL - Streaming requires custom implementation in your training loop +# The basic ReplayBuffer doesn't have built-in streaming methods +# Pattern from apps/grpo/main.py continuous training: + +while training: + # This is the real API call pattern + batch = await replay_buffer.sample.call_one(curr_policy_version=step) + if batch is not None: + # Process batch immediately + loss = await trainer.train_step.call_one(batch) + print(f"Training loss: {loss}") + else: + await asyncio.sleep(0.1) # Wait for more data +``` + +**Performance characteristics**: +- **Latency**: Process first result immediately +- **Throughput**: Pipeline parallelism (much higher than sequential) +- **Fault tolerance**: Continues if some replicas fail + +**Critical insight**: This is essential for high-throughput RL where you can't wait for batches. + +### 4. Fire-and-Forget Operations + +**When to use**: Side effects that don't need responses (notifications, cache updates). 
+ +```python +# 📝 CONCEPTUAL - Fire-and-forget requires custom @endpoint implementations +# The basic services don't have broadcast methods built-in +# You would implement custom endpoints in your ForgeActor: + +class CustomPolicy(Policy): + @endpoint + async def clear_cache(self) -> None: + """Custom endpoint for cache clearing""" + self.policy_worker.clear_kv_cache() + +# Then use it (hypothetical): +# await custom_policy.clear_cache.fanout() # Clear all replica caches +# Note: Actual cache clearing would use existing Policy methods +``` + +**Performance characteristics**: +- **Latency**: Immediately returns (doesn't wait for completion) +- **Throughput**: Network limited, but non-blocking +- **Fault tolerance**: Fire-and-forget (you don't know if it worked) + +**Critical warning**: Only use for non-critical operations - you get no confirmation. + +### 5. Service Sessions for Stateful Operations + +**When to use**: When you need multiple calls to hit the same replica (like KV cache preservation). + +```python +# This Counter example demonstrates the session pattern + +from forge.controller import ForgeActor +from forge.controller.service import ServiceConfig, spawn_service, shutdown_service +from monarch.actor import endpoint + +class ForgeCounter(ForgeActor): + def __init__(self, initial_value: int): + self.value = initial_value + + @endpoint + def increment(self) -> int: + self.value += 1 + return self.value + + @endpoint + def get_value(self) -> int: + return self.value + + @endpoint + async def reset(self): + self.value = 0 + +counter_service = await spawn_service( + ServiceConfig(procs_per_replica=1, num_replicas=4), + ForgeCounter, + initial_value=0 +) + +# Test basic operations +await counter_service.increment.choose() +results = await counter_service.increment.call() +print(f"All replica values: {results}") + +# STICKY SESSIONS +print("\nUsing sticky sessions:") +async with counter_service.session(): + await counter_service.reset.choose() + print(await counter_service.increment.choose()) # 1 + print(await counter_service.increment.choose()) # 2 + print(await counter_service.increment.choose()) # 3 + + final_value = await counter_service.get_value.choose() + print(f"Final value on this replica: {final_value}") # 3 + +# Same pattern works with Policy for multi-turn conversations: +# async with policy.session(): +# response1 = await policy.generate.choose(prompt=turn1) +# full_prompt = turn1 + response1[0].text + turn2 +# response2 = await policy.generate.choose(prompt=full_prompt) +# # Both calls hit same replica, preserving KV cache + +# Cleanup +await shutdown_service(counter_service) +``` + +**Performance impact**: Critical for maintaining KV cache in multi-turn conversations. + +## Deep Dive: State Management Reality + +The most complex challenge in distributed RL is maintaining state consistency while maximizing performance. + +### The KV Cache Problem + +**The challenge**: Policy inference is much faster with KV cache, but cache is tied to specific conversation history. + +```python +# This breaks KV cache optimization: +async def naive_multi_turn(): + # Each call might go to different replica = cache miss + response1 = await policy_service.generate.choose(question1) + response2 = await policy_service.generate.choose(question1 + response1) # Cache miss! + response3 = await policy_service.generate.choose(conversation_so_far) # Cache miss! +``` + +**The solution**: Sticky sessions ensure all calls go to same replica. 
+ +```python +async def optimized_multi_turn(): + async with policy.session(): + # All calls guaranteed to hit same replica = cache hits + response1 = await policy.generate.route(prompt=question1) + full_prompt = question1 + response1[0].text + response2 = await policy.generate.route(prompt=full_prompt) # Cache hit! + conversation = full_prompt + response2[0].text + response3 = await policy.generate.route(prompt=conversation) # Cache hit! + + # Session ends, replica can be garbage collected or reused +``` + +**Performance impact**: Maintaining KV cache across turns avoids recomputing previous tokens. + +### Replay Buffer Consistency + +**The challenge**: Multiple trainers and experience collectors reading/writing concurrently. + +**Real Forge approach**: The ReplayBuffer actor handles concurrency internally: + +```python +# Forge ReplayBuffer endpoints (verified from source code) +# Add episodes (thread-safe by actor model) +await replay_buffer.add.call_one(episode) # Note: .call_one() not .choose() + +# Sample batches for training +batch = await replay_buffer.sample.call_one( + curr_policy_version=step_number, + batch_size=None # Optional parameter, uses default from config +) + +# Additional methods available: +# await replay_buffer.clear.call_one() # Clear buffer +# await replay_buffer.evict.call_one(curr_policy_version) # Remove old episodes +# state = await replay_buffer.state_dict.call_one() # Get state for checkpointing +``` + +**Critical insight**: The actor model provides natural thread safety - each actor processes messages sequentially. + +### Weight Synchronization Strategy + +**The challenge**: Trainer updates policy weights, but policy service needs those weights. + +```python +# Forge weight synchronization pattern from apps/grpo/main.py +async def real_weight_sync(trainer, policy, step): + # Trainer pushes weights to TorchStore with version number + await trainer.push_weights.call_one(policy_version=step + 1) + + # Policy service updates to new version from TorchStore + # Use .fanout() to update ALL policy replicas + await policy.update_weights.fanout(policy_version=step + 1) + +# Check current policy version +current_version = await policy.get_version.route() +print(f"Current policy version: {current_version}") +``` + +## Deep Dive: Asynchronous Coordination Patterns + +**The real challenge**: Different services run at different speeds, but Forge's service abstraction handles the coordination complexity. 
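To make that speed mismatch concrete, picture generation and training as two loops that only meet at the replay buffer. The sketch below reuses the endpoint names from the earlier snippets (`add.call_one`, `sample.call_one`, `train_step.call_one`); `collect_one_episode` is an illustrative placeholder, not a Forge API:

```python
import asyncio

async def rollout_loop(services, state):
    """Produce episodes as fast as generation + scoring allow."""
    while not state["done"]:
        # Illustrative helper: generate, score, and package one episode
        episode = await collect_one_episode(services, state["version"])
        await services['replay_buffer'].add.call_one(episode)

async def training_loop(services, state):
    """Consume batches whenever enough fresh data has accumulated."""
    while not state["done"]:
        batch = await services['replay_buffer'].sample.call_one(
            curr_policy_version=state["version"]
        )
        if batch is None:
            await asyncio.sleep(0.1)  # buffer not ready yet - natural rate limiting
            continue
        await services['trainer'].train_step.call_one(batch)
        state["version"] += 1

# Run both concurrently; the buffer absorbs the speed difference:
# await asyncio.gather(rollout_loop(services, state), training_loop(services, state))
```

The replay buffer mediates between the two loops, which is the pattern the rest of this section builds on.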
+ +### The Forge Approach: Let Services Handle Coordination + +Instead of manual coordination, Forge services handle speed mismatches automatically: + +```python + +from apps.grpo.main import Episode, Group + +async def simple_rl_step(): + + # ===== Generate a rollout ===== + sample = await dataloader.__next__.choose() + prompt, target = sample["question"], sample["answer"] + + print(f"Prompt: {prompt}") + print(f"Target: {target}") + + actions = await policy.generate.choose(prompt=prompt) + print(f"Policy response: {actions[0].text}") + + ref_logprobs = await ref_model.forward.choose(actions[0].token_ids) + reward = await reward_actor.evaluate_response.choose( + prompt=prompt, + response=actions[0].text, + target=target + ) + print(f"Reward: {reward}") + + episode = Episode( + episode_id=0, + prompt=prompt, + target=target, + policy_version=0, + ) + + episode.add_group(Group( + response=actions[0].text, + ref_logprobs=ref_logprobs, + reward=reward, + )) + + advantages = await compute_advantages.__call__.choose(episode.groups) + episode.groups[0].advantage = advantages[0] + print(f"Advantage: {advantages[0]}") + await replay_buffer.add.choose(episode) + print("Episode stored in replay buffer") + + # ===== Train on the batch ===== + batch = await replay_buffer.sample.choose(curr_policy_version=0) + if batch is not None: + print("Training on batch...") + training_result = await trainer.train_step.choose(batch) + loss = training_result.get("loss", 0.0) + print(f"Training loss: {loss}") + return loss + else: + print("Not enough data in buffer yet") + return None + +for step in range(10): + print(f"\n--- RL Step {step + 1} ---") + loss = await simple_rl_step() + if loss: + print(f"Step {step + 1} complete, loss: {loss:.4f}") + else: + print(f"Step {step + 1} complete, building buffer...") +``` + +### Handling Speed Mismatches with Service Scaling + +**The insight**: Scale services independently based on their bottlenecks. + +```python +# Scale fast services with more replicas +policy = await Policy.options( + procs=1, num_replicas=8, with_gpus=True # Many replicas for high throughput +).as_service( + engine_config=EngineConfig(model=model_name) +) + +# Reward evaluation might be CPU-bound +reward_actor = await RewardActor.options( + procs=1, num_replicas=16, with_gpus=False # More CPU replicas +).as_service( + reward_functions=[MathReward()] +) + +# Training needs fewer but more powerful replicas +trainer = await RLTrainer.options( + procs=1, num_replicas=2, with_gpus=True # Fewer but GPU-heavy +).as_actor( # Trainer typically uses .as_actor() not .as_service() + optimizer=Optimizer(lr=1e-5) +) +``` + +### Natural Backpressure Through Service APIs + +```python +# backpressure pattern - The replay buffer naturally provides backpressure +batch = await replay_buffer.sample.call_one(curr_policy_version=step) +if batch is None: + # Not enough data yet - natural rate limiting + print("Buffer not ready, collecting more experiences...") + continue +else: + # Proceed with training + loss = await trainer.train_step.call_one(batch) + print(f"Training loss: {loss}") +``` + +These patterns address the core technical challenges in distributed RL. The key insight: **Forge services handle coordination complexity automatically, letting you focus on RL algorithm logic**. 
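Much of that algorithm logic lives in the reward functions themselves. The `RewardActor` shown in the next section simply calls each entry in `reward_functions` as `reward_fn(prompt, response, target)`, so a custom reward is just a callable with that signature. A hedged sketch (`MathReward` and `ThinkingReward` are the real imports from `forge.data.rewards`; the class below is purely illustrative):

```python
class ExactMatchReward:
    """Illustrative reward: full score when the target string appears in the response."""

    def __init__(self, weight: float = 1.0):
        self.weight = weight

    def __call__(self, prompt: str, response: str, target: str) -> float:
        return self.weight if target.strip() in response else 0.0

# Could be mixed with the built-in rewards when constructing the actor:
# RewardActor(reward_functions=[MathReward(), ExactMatchReward(weight=0.5)])
```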
+ +## Service Implementation Example + +Let's see how a reward service is actually implemented: + +```python +# ✅ COMPLETE WORKING EXAMPLE - Exact RewardActor from apps/grpo/main.py + +from forge.controller import ForgeActor +from monarch.actor import endpoint +from forge.data.rewards import MathReward, ThinkingReward +from forge.controller.service import ServiceConfig, spawn_service + +# EXACT class definition from apps/grpo/main.py lines 68-83 +class RewardActor(ForgeActor): + def __init__(self, reward_functions: list): + self.reward_functions = reward_functions + + @endpoint + async def evaluate_response(self, prompt: str, response: str, target: str) -> float: + """Evaluate response quality using multiple reward functions""" + total_reward = 0.0 + + for reward_fn in self.reward_functions: + # Each reward function contributes to total score + reward = reward_fn(prompt, response, target) + total_reward += reward + + # Return average reward across all functions + return total_reward / len(self.reward_functions) if self.reward_functions else 0.0 + +reward_actor = await spawn_service( + ServiceConfig(procs_per_replica=1, num_replicas=1), + RewardActor, + reward_functions=[MathReward(), ThinkingReward()] +) + +prompt = "What is 15% of 240?" +response = "15% of 240 is 36" +target = "36" + +score = await reward_actor.evaluate_response.choose( + prompt=prompt, + response=response, + target=target +) +print(f"Reward score: {score}") # Usually around 1.0 for correct math answers + +# For production scaling - increase num_replicas for parallel evaluation: +# ServiceConfig(procs_per_replica=1, num_replicas=16) # 16 parallel evaluators + +# Cleanup when done +await shutdown_service(reward_actor) +``` + +## Service Orchestration: The Training Loop + +Now let's see how services coordinate in a real training loop: + +```python +# This is the REAL way production RL systems are built with Forge + +import asyncio +from forge.actors.policy import Policy +from forge.actors.reference_model import ReferenceModel +from forge.actors.replay_buffer import ReplayBuffer +from forge.actors.trainer import RLTrainer +from forge.controller.actor import ForgeActor +from forge.data.rewards import MathReward, ThinkingReward +from monarch.actor import endpoint +from omegaconf import DictConfig + +# EXACT service creation from apps/grpo/main.py lines 322-344 +print("Initializing all services...") +( + dataloader, + policy, + trainer, + replay_buffer, + compute_advantages, + ref_model, + reward_actor, +) = await asyncio.gather( + DatasetActor.options(**cfg.actors.dataset).as_actor(**cfg.dataset), + Policy.options(**cfg.services.policy).as_service(**cfg.policy), + RLTrainer.options(**cfg.actors.trainer).as_actor( + **cfg.trainer, loss=simple_grpo_loss + ), + ReplayBuffer.options(**cfg.actors.replay_buffer).as_actor( + **cfg.replay_buffer, collate=collate + ), + ComputeAdvantages.options(**cfg.actors.compute_advantages).as_actor(), + ReferenceModel.options(**cfg.services.ref_model).as_service(**cfg.ref_model), + RewardActor.options(**cfg.services.reward_actor).as_service( + reward_functions=[MathReward(), ThinkingReward()] + ), +) + +print("All services initialized successfully!") + +# EXACT usage patterns from apps/grpo/main.py continuous training loop +async def production_training_loop(): + """Real training loop pattern from apps/grpo/main.py""" + step = 0 + + while True: + # Data generation + sample = await dataloader.sample.call_one() + + # Policy generation service call + responses = await 
policy.generate.route(prompt=sample["question"]) + + # Reference computation service call + ref_logprobs = await ref_model.forward.route(responses[0].token_ids) + + # Reward evaluation service call + reward = await reward_actor.evaluate_response.route( + prompt=sample["question"], + response=responses[0].text, + target=sample["answer"] + ) + + # Experience storage (simplified structure for illustration) + episode = create_episode(sample, responses[0], reward, ref_logprobs, step) + await replay_buffer.add.call_one(episode) + + # Training when ready endpoints + batch = await replay_buffer.sample.call_one(curr_policy_version=step) + if batch is not None: + loss = await trainer.train_step.call_one(batch) + + # Weight synchronization pattern + await trainer.push_weights.call_one(step + 1) + await policy.update_weights.route(step + 1) + + print(f"Step {step}, Loss: {loss:.4f}") + step += 1 + +# EXACT cleanup pattern from apps/grpo/main.py lines 493-504 +print("Shutting down services...") +await asyncio.gather( + DatasetActor.shutdown(dataloader), + policy.shutdown(), + RLTrainer.shutdown(trainer), + ReplayBuffer.shutdown(replay_buffer), + ComputeAdvantages.shutdown(compute_advantages), + ref_model.shutdown(), + reward_actor.shutdown(), +) +print("All services shut down successfully!") +``` + +**Key observations:** +1. **Parallelism**: Independent operations run concurrently +2. **Load balancing**: Each `choose()` call automatically selects optimal replica +3. **Fault tolerance**: Failures automatically retry on different replicas +4. **Resource efficiency**: CPU and GPU services scale independently +5. **Coordination**: Services coordinate through shared state (replay buffer, weight versions) + +This is the power of the service abstraction - complex distributed coordination looks like simple async Python code. diff --git a/docs/Tutorials/3_.MD b/docs/Tutorials/3_.MD deleted file mode 100644 index e69de29bb..000000000 diff --git a/docs/Tutorials/2_.MD b/docs/Tutorials/3_Monarch_101.MD similarity index 100% rename from docs/Tutorials/2_.MD rename to docs/Tutorials/3_Monarch_101.MD From f3710077ec97e3289b5a3aa3882fee48572bb223 Mon Sep 17 00:00:00 2001 From: Sanyam Bhutani Date: Thu, 2 Oct 2025 19:15:10 -0700 Subject: [PATCH 06/28] add part 3 --- docs/Tutorials/3_Monarch_101.MD | 437 ++++++++++++++++++++++++++++++++ docs/Tutorials/ReadMe.MD | 4 +- 2 files changed, 439 insertions(+), 2 deletions(-) diff --git a/docs/Tutorials/3_Monarch_101.MD b/docs/Tutorials/3_Monarch_101.MD index e69de29bb..9369be13a 100644 --- a/docs/Tutorials/3_Monarch_101.MD +++ b/docs/Tutorials/3_Monarch_101.MD @@ -0,0 +1,437 @@ +# Part 3: The Forge-Monarch Connection + +Now let's peel back the layers. Forge services are built on top of **Monarch**, PyTorch's distributed actor framework. Understanding this connection is crucial for optimization and debugging. + +## The Complete Hierarchy: Service to Silicon + +```mermaid +graph TD + subgraph YourCode["1. Your RL Code"] + Call["await policy_service.generate.choose('What is 2+2?')"] + end + + subgraph ForgeServices["2. Forge Service Layer"] + ServiceInterface["ServiceInterface
• Routes .choose() to replica
• Handles load balancing
• Manages health checks"] + ServiceActor["ServiceActor
• Manages replica lifecycle
• Monitors health
• Coordinates failures"] + end + + subgraph MonarchLayer["3. Monarch Actor Layer"] + ActorMesh["ActorMesh[PolicyActor]
• 4 PolicyActor instances
• Each on different GPU
• Message passing interface"] + ProcMesh["ProcMesh
• 4 processes
• GPU topology: [0,1,2,3]
• Network interconnect"] + end + + subgraph Hardware["4. Physical Hardware"] + GPU0["GPU 0
PolicyActor #1
vLLM Engine
Model Weights"] + GPU1["GPU 1
PolicyActor #2
vLLM Engine
Model Weights"] + GPU2["GPU 2
PolicyActor #3
vLLM Engine
Model Weights"] + GPU3["GPU 3
PolicyActor #4
vLLM Engine
Model Weights"] + end + + Call --> ServiceInterface + ServiceInterface --> ServiceActor + ServiceActor --> ActorMesh + ActorMesh --> ProcMesh + ProcMesh --> GPU0 + ProcMesh --> GPU1 + ProcMesh --> GPU2 + ProcMesh --> GPU3 + + style Call fill:#99ff99 + style ServiceActor fill:#ffcc99 + style ActorMesh fill:#cc99ff + style ProcMesh fill:#ccccff +``` + +## Deep Dive: ProcMesh - The Foundation + +**ProcMesh** is Monarch's core abstraction for organizing processes across hardware. Think of it as a multi-dimensional grid that maps directly to your cluster topology. + +### Single Host ProcMesh + +```mermaid +graph TD + subgraph Host["Single Host (8 GPUs)"] + subgraph ProcMesh["ProcMesh: per_host={'gpus': 8}"] + P0["Process 0
GPU 0"] + P1["Process 1
GPU 1"] + P2["Process 2
GPU 2"] + P3["Process 3
GPU 3"] + P4["Process 4
GPU 4"] + P5["Process 5
GPU 5"] + P6["Process 6
GPU 6"] + P7["Process 7
GPU 7"] + end + + P0 -.->|"Network"| P1 + P1 -.->|"Network"| P2 + P2 -.->|"Network"| P3 + P3 -.->|"Network"| P4 + P4 -.->|"Network"| P5 + P5 -.->|"Network"| P6 + P6 -.->|"Network"| P7 + P7 -.->|"Network"| P0 + end + + style P0 fill:#ff9999 + style P1 fill:#ff9999 + style P2 fill:#ff9999 + style P3 fill:#ff9999 + style P4 fill:#ff9999 + style P5 fill:#ff9999 + style P6 fill:#ff9999 + style P7 fill:#ff9999 +``` + +### Multi-Host ProcMesh + +```mermaid +graph TD + subgraph Cluster["Multi-Host Cluster"] + subgraph Host1["Host 1"] + subgraph PM1["ProcMesh Segment 1"] + H1P0["Process 0
GPU 0"] + H1P1["Process 1
GPU 1"] + H1P2["Process 2
GPU 2"] + H1P3["Process 3
GPU 3"] + end + end + + subgraph Host2["Host 2"] + subgraph PM2["ProcMesh Segment 2"] + H2P0["Process 4
GPU 0"] + H2P1["Process 5
GPU 1"] + H2P2["Process 6
GPU 2"] + H2P3["Process 7
GPU 3"] + end + end + + subgraph Host3["Host 3"] + subgraph PM3["ProcMesh Segment 3"] + H3P0["Process 8
GPU 0"] + H3P1["Process 9
GPU 1"] + H3P2["Process 10
GPU 2"] + H3P3["Process 11
GPU 3"] + end + end + end + + H1P0 -.->|"InfiniBand"| H2P0 + H1P1 -.->|"InfiniBand"| H2P1 + H2P0 -.->|"InfiniBand"| H3P0 + H2P1 -.->|"InfiniBand"| H3P1 + + style PM1 fill:#ff9999 + style PM2 fill:#99ff99 + style PM3 fill:#99ccff +``` + +```python +# This shows the underlying actor system that powers Forge services + +from monarch.actor import Actor, endpoint, this_proc, Future +from monarch.actor import ProcMesh, this_host +import asyncio + +# STEP 1: Define a basic actor +class Counter(Actor): + def __init__(self, initial_value: int): + self.value = initial_value + + @endpoint + def increment(self) -> None: + self.value += 1 + + @endpoint + def get_value(self) -> int: + return self.value + +# STEP 2: Single actor in local process +counter: Counter = this_proc().spawn("counter", Counter, initial_value=0) + +# STEP 3: Send messages +fut: Future[int] = counter.get_value.call_one() +value = await fut +print(f"Counter value: {value}") # 0 + +# STEP 4: Multiple actors across processes +procs: ProcMesh = this_host().spawn_procs(per_host={"gpus": 8}) +counters: Counter = procs.spawn("counters", Counter, 0) + +# STEP 5: Broadcast to all actors +await counters.increment.call() + +# STEP 6: Different message patterns +# call_one() - single actor +value = await counters.get_value.call_one() +print(f"One counter: {value}") + +# choose() - random single actor +value = await counters.get_value.choose() +print(f"Random counter: {value}") + +# call() - all actors, collect results +values = await counters.get_value.call() +print(f"All counters: {values}") + +# broadcast() - fire and forget +await counters.increment.broadcast() + +# Cleanup +await procs.stop() +``` + +## Actor Meshes: Your Code Running Distributed + +**ActorMesh** is created when you spawn actors across a ProcMesh. Each process in the ProcMesh gets one instance of your actor. + +```mermaid +graph TD + subgraph Creation["Actor Creation Process"] + Code["mesh.spawn('policy', PolicyActor, model='Qwen/Qwen3-7B')"] + + subgraph ProcMesh["ProcMesh (4 processes)"] + P0["Process 0
GPU 0"] + P1["Process 1
GPU 1"] + P2["Process 2
GPU 2"] + P3["Process 3
GPU 3"] + end + + subgraph ActorMesh["ActorMesh[PolicyActor]"] + A0["PolicyActor
Instance #0
model=Qwen/Qwen3-7B
generation_count=0"] + A1["PolicyActor
Instance #1
model=Qwen/Qwen3-7B
generation_count=0"] + A2["PolicyActor
Instance #2
model=Qwen/Qwen3-7B
generation_count=0"] + A3["PolicyActor
Instance #3
model=Qwen/Qwen3-7B
generation_count=0"] + end + + Code --> ProcMesh + P0 --> A0 + P1 --> A1 + P2 --> A2 + P3 --> A3 + end + + style A0 fill:#99ff99 + style A1 fill:#99ff99 + style A2 fill:#99ff99 + style A3 fill:#99ff99 +``` + +### Message Routing Through ActorMesh + +```mermaid +graph TD + subgraph MessageFlow["Message Flow Patterns"] + Client["await policy_actors.generate.METHOD(prompt)"] + + subgraph Methods["Different Adverbs Route Differently"] + Choose["choose()
→ Routes to ONE actor
→ Load balanced"] + Call["call()
→ Routes to ALL actors
→ Collects all results"] + Broadcast["broadcast()
→ Routes to ALL actors
→ Fire and forget"] + Stream["stream()
→ Routes to ALL actors
→ Iterator of results"] + end + + subgraph ActorInstances["PolicyActor Instances"] + A0["Actor 0
GPU 0
generates response"] + A1["Actor 1
GPU 1
generates response"] + A2["Actor 2
GPU 2
generates response"] + A3["Actor 3
GPU 3
generates response"] + end + + Client --> Choose + Client --> Call + Client --> Broadcast + Client --> Stream + + Choose -.->|"Load balanced"| A1 + Call --> A0 + Call --> A1 + Call --> A2 + Call --> A3 + Broadcast --> A0 + Broadcast --> A1 + Broadcast --> A2 + Broadcast --> A3 + Stream --> A0 + Stream --> A1 + Stream --> A2 + Stream --> A3 + end + + style Choose fill:#99ff99 + style Call fill:#ffcc99 + style Broadcast fill:#ff99cc + style Stream fill:#cc99ff +``` + +## How Forge Services Use Monarch + +Now the key insight: **Forge services are ServiceActors that manage ActorMeshes of your ForgeActor replicas**. + +### The Service Creation Process + +```mermaid +graph TD + subgraph ServiceCreation["spawn_service() Process"] + Call["await spawn_service(ServiceConfig(num_replicas=4), PolicyActor, model='Qwen')"] + + ServiceActor["ServiceActor
• Manages 4 replicas
• Handles health checks
• Routes service calls"] + + subgraph Replicas["4 Independent Replicas"] + subgraph R0["Replica 0"] + PM0["ProcMesh
1 process
GPU 0"] + AM0["ActorMesh
1 PolicyActor"] + end + + subgraph R1["Replica 1"] + PM1["ProcMesh
1 process
GPU 1"] + AM1["ActorMesh
1 PolicyActor"] + end + + subgraph R2["Replica 2"] + PM2["ProcMesh
1 process
GPU 2"] + AM2["ActorMesh
1 PolicyActor"] + end + + subgraph R3["Replica 3"] + PM3["ProcMesh
1 process
GPU 3"] + AM3["ActorMesh
1 PolicyActor"] + end + end + + Call --> ServiceActor + ServiceActor --> R0 + ServiceActor --> R1 + ServiceActor --> R2 + ServiceActor --> R3 + PM0 --> AM0 + PM1 --> AM1 + PM2 --> AM2 + PM3 --> AM3 + end + + style ServiceActor fill:#ffcc99 + style AM0 fill:#99ff99 + style AM1 fill:#99ff99 + style AM2 fill:#99ff99 + style AM3 fill:#99ff99 +``` + +### Service Call to Actor Execution + +```mermaid +graph TD + subgraph CallFlow["Complete Call Flow"] + UserCall["await policy_service.generate.choose('What is 2+2?')"] + + ServiceInterface["ServiceInterface
• Receives .choose() call
• Routes to ServiceActor"] + + ServiceActor["ServiceActor
• Selects healthy replica
• Load balancing logic
• Failure handling"] + + SelectedReplica["Selected Replica #2
• ProcMesh with 1 process
• ActorMesh with 1 PolicyActor"] + + PolicyActor["PolicyActor Instance
• Loads model
• Runs vLLM inference
• Returns 'The answer is 4'"] + + GPU["GPU 2
• vLLM engine
• Model weights
• KV cache
• CUDA kernels"] + + UserCall --> ServiceInterface + ServiceInterface --> ServiceActor + ServiceActor --> SelectedReplica + SelectedReplica --> PolicyActor + PolicyActor --> GPU + + GPU -.->|"Response"| PolicyActor + PolicyActor -.->|"Response"| SelectedReplica + SelectedReplica -.->|"Response"| ServiceActor + ServiceActor -.->|"Response"| ServiceInterface + ServiceInterface -.->|"'The answer is 4'"| UserCall + end + + style UserCall fill:#99ff99 + style ServiceActor fill:#ffcc99 + style PolicyActor fill:#cc99ff + style GPU fill:#ffcccc +``` + +## Multiple Services Sharing Infrastructure + +In real RL systems, you have multiple services that can share or use separate ProcMeshes: + +```mermaid +graph TD + subgraph Cluster["RL Training Cluster"] + subgraph Services["Forge Services"] + PS["Policy Service
4 GPU replicas"] + TS["Trainer Service
2 GPU replicas"] + RS["Reward Service
4 CPU replicas"] + BS["Buffer Service
1 CPU replica"] + end + + subgraph MonarchInfra["Monarch Infrastructure"] + subgraph GPUMesh["GPU ProcMesh (6 processes)"] + G0["Process 0
GPU 0"] + G1["Process 1
GPU 1"] + G2["Process 2
GPU 2"] + G3["Process 3
GPU 3"] + G4["Process 4
GPU 4"] + G5["Process 5
GPU 5"] + end + + subgraph CPUMesh["CPU ProcMesh (5 processes)"] + C0["Process 0
CPU"] + C1["Process 1
CPU"] + C2["Process 2
CPU"] + C3["Process 3
CPU"] + C4["Process 4
CPU"] + end + end + + PS --> G0 + PS --> G1 + PS --> G2 + PS --> G3 + TS --> G4 + TS --> G5 + RS --> C0 + RS --> C1 + RS --> C2 + RS --> C3 + BS --> C4 + end + + style PS fill:#99ff99 + style TS fill:#ff99cc + style RS fill:#ffcc99 + style BS fill:#cc99ff + style GPUMesh fill:#ffe6e6 + style CPUMesh fill:#e6f3ff +``` + +## Key Insights: Why This Architecture Matters + +1. **Process Isolation**: Each actor runs in its own process - failures don't cascade +2. **Location Transparency**: Actors can be local or remote with identical APIs +3. **Structured Distribution**: ProcMesh maps directly to hardware topology +4. **Message Passing**: No shared memory means no race conditions or locks +5. **Service Abstraction**: Forge hides Monarch complexity while preserving power + +Understanding this hierarchy helps you: +- **Debug performance issues**: Is the bottleneck at service, actor, or hardware level? +- **Optimize resource usage**: How many replicas per service? GPU vs CPU processes? +- **Handle failures gracefully**: Which layer failed and how to recover? +- **Scale effectively**: Where to add resources for maximum impact? + +# Conclusion + +## What You've Learned + +1. **RL Fundamentals**: How RL concepts map to Forge services with REAL, working examples +2. **Service Abstraction**: How to use Forge services effectively with verified communication patterns +3. **Monarch Foundation**: How Forge services connect to distributed actors and hardware + +## Key Takeaways + +- **Services hide complexity**: Your RL code looks like simple async functions, but runs on distributed clusters +- **Communication patterns matter**: `.route()`, `.fanout()`, sessions, and `.call_one()` each serve specific purposes +- **Architecture understanding helps**: Knowing the Service → Actor → Process → Hardware hierarchy helps you debug, optimize, and scale +- **Always verify APIs**: This guide is verified, but cross-check with source code for latest changes +- **Real API patterns**: Use `.options().as_service()` not `spawn_service()`, use `.route()` not `.choose()`, etc. diff --git a/docs/Tutorials/ReadMe.MD b/docs/Tutorials/ReadMe.MD index 01d750d06..7798b147d 100644 --- a/docs/Tutorials/ReadMe.MD +++ b/docs/Tutorials/ReadMe.MD @@ -11,8 +11,8 @@ Welcome to the Tutorials section! This section is inspired by the A-Z PyTorch tu This section currently is structured in 3 detailed parts: 1. [RL Fundamentals and Understanding Forge Terminology](./1_RL_and_Forge_Fundamentals.MD): This gives a quick refresher of Reinforcement Learning and teaches you Forge Fundamentals -2. []() -3. []() +2. [Forge Internals](./2_Forge_Internals.MD): Goes a layer deeper and explains the internals of Forge +3. [Monarch 101](./3_Monarch_101.MD): It's a 101 to Monarch and how Forge Talks to Monarch Each part builds upon the next and the entire section can be consumed in roughly an hour-Grab a Chai and Enjoy! 
From 2c39598383a6d4efe4183edf9e9d3c703ea78056 Mon Sep 17 00:00:00 2001 From: Sanyam Bhutani Date: Thu, 2 Oct 2025 19:26:45 -0700 Subject: [PATCH 07/28] Update 2_Forge_Internals.MD --- docs/Tutorials/2_Forge_Internals.MD | 42 ++++++++++++++--------------- 1 file changed, 20 insertions(+), 22 deletions(-) diff --git a/docs/Tutorials/2_Forge_Internals.MD b/docs/Tutorials/2_Forge_Internals.MD index d55eda51a..0c810a08e 100644 --- a/docs/Tutorials/2_Forge_Internals.MD +++ b/docs/Tutorials/2_Forge_Internals.MD @@ -8,6 +8,8 @@ Now that you see the power of the service abstraction, let's understand what's a When you call `await policy_service.generate(question)`, here's what actually happens: +(Don't worry, we will understand Services right in the next section!) + ```mermaid graph TD Call["Your Code:
await policy_service.generate"] @@ -58,17 +60,19 @@ Policy.options( # Other available options: # hosts=None ) - -# This is the ACTUAL way services are configured in Forge ``` ### 2. Real Service Creation Services are created using the `spawn_service` function: -```python -# This is what ACTUALLY works - copied directly from the notebook +The spawn_service() function automatically handles: +- Spawning actor replicas across processes/GPUs +- Load balancing with .choose() method +- Health monitoring and failure recovery +- Message routing and serialization +```python from forge.controller.service import ServiceConfig, spawn_service from forge.actors.policy import Policy, PolicyConfig, SamplingOverrides, WorkerConfig @@ -89,12 +93,6 @@ prompt = "What is 3 + 5?" responses = await policy.generate.choose(prompt=prompt) print(f"Response: {responses[0].text}") -# The spawn_service() function automatically handles: -# - Spawning actor replicas across processes/GPUs -# - Load balancing with .choose() method -# - Health monitoring and failure recovery -# - Message routing and serialization - # Cleanup when done await shutdown_service(policy) ``` @@ -103,23 +101,23 @@ await shutdown_service(policy) Forge services are implemented as ServiceActors that manage collections of your ForgeActor replicas: -```python -# Forge internals - What happens behind the scenes: -# 1. .as_service() creates a ServiceInterface -# 2. ServiceInterface manages N replicas of your ForgeActor class -# 3. ServiceInterface handles routing between replicas -# 4. You get methods like .route(), .fanout(), etc. +Forge internals - What happens behind the scenes: +1. `.as_service()` creates a `ServiceInterface` +2. `ServiceInterface` manages N replicas of your `ForgeActor` class +3. `ServiceInterface` handles routing between replicas +4. You get methods like `.route()`, `.fanout()`, etc. +```python # Your code sees this: responses = await policy.generate.route(prompt=prompt) - -# But behind the scenes: -# - ServiceInterface selects healthy replica -# - Routes message to that replica's Policy.generate() endpoint -# - Handles failures and retries automatically -# - Returns list[Completion] from the selected replica ``` +But behind the scenes: +- `ServiceInterface` selects healthy replica +- Routes message to that replica's `Policy.generate()` endpoint +- Handles failures and retries automatically +- Returns list[Completion] from the selected replica + ### 3. Different Service Types and Their Characteristics ```mermaid From f086c60cf59ad2b16f55b6aebdff6142e4ff608c Mon Sep 17 00:00:00 2001 From: Sanyam Bhutani Date: Thu, 2 Oct 2025 19:34:03 -0700 Subject: [PATCH 08/28] add --- docs/Tutorials/2_Forge_Internals.MD | 43 +++++++++-------------------- docs/Tutorials/3_Monarch_101.MD | 2 ++ 2 files changed, 15 insertions(+), 30 deletions(-) diff --git a/docs/Tutorials/2_Forge_Internals.MD b/docs/Tutorials/2_Forge_Internals.MD index 0c810a08e..9018afe3d 100644 --- a/docs/Tutorials/2_Forge_Internals.MD +++ b/docs/Tutorials/2_Forge_Internals.MD @@ -155,14 +155,14 @@ These communication patterns (\"adverbs\") determine how your service calls are ```python responses = await policy.generate.route(prompt=question) answer = responses[0].text # Extract text from Completion object - -# Behind the scenes: -# 1. Health check eliminates failed replicas -# 2. Load balancer picks least loaded healthy replica -# 3. Request routes to that specific replica -# 4. Automatic retry on different replica if failure ``` +Behind the scenes: +1. 
Health check eliminates failed replicas +2. Load balancer picks least loaded healthy replica +3. Request routes to that specific replica +4. Automatic retry on different replica if failure + **Performance characteristics**: - **Latency**: Lowest (single network hop) - **Throughput**: Limited by single replica capacity @@ -196,7 +196,7 @@ await policy.update_weights.fanout(new_policy_version) **When to use**: You want to process results as they arrive, not wait for all. ```python -# 📝 CONCEPTUAL - Streaming requires custom implementation in your training loop +# CONCEPTUAL - Streaming requires custom implementation in your training loop # The basic ReplayBuffer doesn't have built-in streaming methods # Pattern from apps/grpo/main.py continuous training: @@ -223,7 +223,7 @@ while training: **When to use**: Side effects that don't need responses (notifications, cache updates). ```python -# 📝 CONCEPTUAL - Fire-and-forget requires custom @endpoint implementations +# CONCEPTUAL - Fire-and-forget requires custom @endpoint implementations # The basic services don't have broadcast methods built-in # You would implement custom endpoints in your ForgeActor: @@ -485,36 +485,19 @@ trainer = await RLTrainer.options( ) ``` -### Natural Backpressure Through Service APIs - -```python -# backpressure pattern - The replay buffer naturally provides backpressure -batch = await replay_buffer.sample.call_one(curr_policy_version=step) -if batch is None: - # Not enough data yet - natural rate limiting - print("Buffer not ready, collecting more experiences...") - continue -else: - # Proceed with training - loss = await trainer.train_step.call_one(batch) - print(f"Training loss: {loss}") -``` - -These patterns address the core technical challenges in distributed RL. The key insight: **Forge services handle coordination complexity automatically, letting you focus on RL algorithm logic**. - ## Service Implementation Example Let's see how a reward service is actually implemented: ```python -# ✅ COMPLETE WORKING EXAMPLE - Exact RewardActor from apps/grpo/main.py +# Exact RewardActor from apps/grpo/main.py from forge.controller import ForgeActor from monarch.actor import endpoint from forge.data.rewards import MathReward, ThinkingReward from forge.controller.service import ServiceConfig, spawn_service -# EXACT class definition from apps/grpo/main.py lines 68-83 +# class definition from apps/grpo/main.py class RewardActor(ForgeActor): def __init__(self, reward_functions: list): self.reward_functions = reward_functions @@ -573,7 +556,7 @@ from forge.data.rewards import MathReward, ThinkingReward from monarch.actor import endpoint from omegaconf import DictConfig -# EXACT service creation from apps/grpo/main.py lines 322-344 +# Service creation from apps/grpo/main.py lines 322-344 print("Initializing all services...") ( dataloader, @@ -601,7 +584,6 @@ print("Initializing all services...") print("All services initialized successfully!") -# EXACT usage patterns from apps/grpo/main.py continuous training loop async def production_training_loop(): """Real training loop pattern from apps/grpo/main.py""" step = 0 @@ -639,7 +621,6 @@ async def production_training_loop(): print(f"Step {step}, Loss: {loss:.4f}") step += 1 -# EXACT cleanup pattern from apps/grpo/main.py lines 493-504 print("Shutting down services...") await asyncio.gather( DatasetActor.shutdown(dataloader), @@ -661,3 +642,5 @@ print("All services shut down successfully!") 5. 
**Coordination**: Services coordinate through shared state (replay buffer, weight versions)
 
 This is the power of the service abstraction - complex distributed coordination looks like simple async Python code.
+
+In the next part we will learn about [Monarch internals](./3_Monarch_101.MD).
\ No newline at end of file
diff --git a/docs/Tutorials/3_Monarch_101.MD b/docs/Tutorials/3_Monarch_101.MD
index 9369be13a..94c02c37e 100644
--- a/docs/Tutorials/3_Monarch_101.MD
+++ b/docs/Tutorials/3_Monarch_101.MD
@@ -1,5 +1,7 @@
 # Part 3: The Forge-Monarch Connection
 
+This is part 3 of our series, in the previous sections: we learned [RL Concepts and how they map to Forge](./1_RL_and_Forge_Fundamentals.MD), [Forge Internals](./2_Forge_Internals.MD).
+
 Now let's peel back the layers. Forge services are built on top of **Monarch**, PyTorch's distributed actor framework. Understanding this connection is crucial for optimization and debugging.
 
 ## The Complete Hierarchy: Service to Silicon

From 56f6a5c8b4b82ef88a331a7a1745d855b22f12f8 Mon Sep 17 00:00:00 2001
From: Sanyam Bhutani
Date: Thu, 2 Oct 2025 19:38:48 -0700
Subject: [PATCH 09/28] Update 3_Monarch_101.MD

---
 docs/Tutorials/3_Monarch_101.MD | 124 ++++++++++++++++----------------
 1 file changed, 62 insertions(+), 62 deletions(-)

diff --git a/docs/Tutorials/3_Monarch_101.MD b/docs/Tutorials/3_Monarch_101.MD
index 94c02c37e..7b3f6d310 100644
--- a/docs/Tutorials/3_Monarch_101.MD
+++ b/docs/Tutorials/3_Monarch_101.MD
@@ -11,24 +11,24 @@ graph TD
     subgraph YourCode["1. Your RL Code"]
         Call["await policy_service.generate.choose('What is 2+2?')"]
     end
-    
+
     subgraph ForgeServices["2. Forge Service Layer"]
         ServiceInterface["ServiceInterface
• Routes .choose() to replica
• Handles load balancing
• Manages health checks"] ServiceActor["ServiceActor
• Manages replica lifecycle
• Monitors health
• Coordinates failures"] end - - subgraph MonarchLayer["3. Monarch Actor Layer"] - ActorMesh["ActorMesh[PolicyActor]
• 4 PolicyActor instances
• Each on different GPU
• Message passing interface"] - ProcMesh["ProcMesh
• 4 processes
• GPU topology: [0,1,2,3]
• Network interconnect"] + + subgraph MonarchLayer["3. Monarch Actor Layer"] + ActorMesh["ActorMesh PolicyActor
• 4 PolicyActor instances
• Each on different GPU
• Message passing interface"] + ProcMesh["ProcMesh
• 4 processes
• GPU topology: 0,1,2,3
• Network interconnect"] end - + subgraph Hardware["4. Physical Hardware"] GPU0["GPU 0
PolicyActor #1
vLLM Engine
Model Weights"] - GPU1["GPU 1
PolicyActor #2
vLLM Engine
Model Weights"] + GPU1["GPU 1
PolicyActor #2
vLLM Engine
Model Weights"] GPU2["GPU 2
PolicyActor #3
vLLM Engine
Model Weights"] GPU3["GPU 3
PolicyActor #4
vLLM Engine
Model Weights"] end - + Call --> ServiceInterface ServiceInterface --> ServiceActor ServiceActor --> ActorMesh @@ -37,7 +37,7 @@ graph TD ProcMesh --> GPU1 ProcMesh --> GPU2 ProcMesh --> GPU3 - + style Call fill:#99ff99 style ServiceActor fill:#ffcc99 style ActorMesh fill:#cc99ff @@ -55,17 +55,17 @@ graph TD subgraph Host["Single Host (8 GPUs)"] subgraph ProcMesh["ProcMesh: per_host={'gpus': 8}"] P0["Process 0
GPU 0"] - P1["Process 1
GPU 1"] + P1["Process 1
GPU 1"] P2["Process 2
GPU 2"] P3["Process 3
GPU 3"] P4["Process 4
GPU 4"] P5["Process 5
GPU 5"] - P6["Process 6
GPU 6"] + P6["Process 6
GPU 6"] P7["Process 7
GPU 7"] end - + P0 -.->|"Network"| P1 - P1 -.->|"Network"| P2 + P1 -.->|"Network"| P2 P2 -.->|"Network"| P3 P3 -.->|"Network"| P4 P4 -.->|"Network"| P5 @@ -73,7 +73,7 @@ graph TD P6 -.->|"Network"| P7 P7 -.->|"Network"| P0 end - + style P0 fill:#ff9999 style P1 fill:#ff9999 style P2 fill:#ff9999 @@ -97,8 +97,8 @@ graph TD H1P3["Process 3
GPU 3"] end end - - subgraph Host2["Host 2"] + + subgraph Host2["Host 2"] subgraph PM2["ProcMesh Segment 2"] H2P0["Process 4
GPU 0"] H2P1["Process 5
GPU 1"] @@ -106,22 +106,22 @@ graph TD H2P3["Process 7
GPU 3"] end end - + subgraph Host3["Host 3"] subgraph PM3["ProcMesh Segment 3"] H3P0["Process 8
GPU 0"] H3P1["Process 9
GPU 1"] - H3P2["Process 10
GPU 2"] + H3P2["Process 10
GPU 2"] H3P3["Process 11
GPU 3"] end end end - + H1P0 -.->|"InfiniBand"| H2P0 H1P1 -.->|"InfiniBand"| H2P1 H2P0 -.->|"InfiniBand"| H3P0 H2P1 -.->|"InfiniBand"| H3P1 - + style PM1 fill:#ff9999 style PM2 fill:#99ff99 style PM3 fill:#99ccff @@ -167,7 +167,7 @@ await counters.increment.call() value = await counters.get_value.call_one() print(f"One counter: {value}") -# choose() - random single actor +# choose() - random single actor value = await counters.get_value.choose() print(f"Random counter: {value}") @@ -190,28 +190,28 @@ await procs.stop() graph TD subgraph Creation["Actor Creation Process"] Code["mesh.spawn('policy', PolicyActor, model='Qwen/Qwen3-7B')"] - + subgraph ProcMesh["ProcMesh (4 processes)"] - P0["Process 0
GPU 0"] + P0["Process 0
GPU 0"] P1["Process 1
GPU 1"] P2["Process 2
GPU 2"] P3["Process 3
GPU 3"] end - + subgraph ActorMesh["ActorMesh[PolicyActor]"] A0["PolicyActor
Instance #0
model=Qwen/Qwen3-7B
generation_count=0"] A1["PolicyActor
Instance #1
model=Qwen/Qwen3-7B
generation_count=0"] A2["PolicyActor
Instance #2
model=Qwen/Qwen3-7B
generation_count=0"] A3["PolicyActor
Instance #3
model=Qwen/Qwen3-7B
generation_count=0"] end - + Code --> ProcMesh P0 --> A0 P1 --> A1 P2 --> A2 P3 --> A3 end - + style A0 fill:#99ff99 style A1 fill:#99ff99 style A2 fill:#99ff99 @@ -224,29 +224,29 @@ graph TD graph TD subgraph MessageFlow["Message Flow Patterns"] Client["await policy_actors.generate.METHOD(prompt)"] - + subgraph Methods["Different Adverbs Route Differently"] Choose["choose()
→ Routes to ONE actor
→ Load balanced"] - Call["call()
→ Routes to ALL actors
→ Collects all results"] + Call["call()
→ Routes to ALL actors
→ Collects all results"] Broadcast["broadcast()
→ Routes to ALL actors
→ Fire and forget"] Stream["stream()
→ Routes to ALL actors
→ Iterator of results"] end - + subgraph ActorInstances["PolicyActor Instances"] A0["Actor 0
GPU 0
generates response"] - A1["Actor 1
GPU 1
generates response"] + A1["Actor 1
GPU 1
generates response"] A2["Actor 2
GPU 2
generates response"] A3["Actor 3
GPU 3
generates response"] end - + Client --> Choose Client --> Call Client --> Broadcast Client --> Stream - + Choose -.->|"Load balanced"| A1 Call --> A0 - Call --> A1 + Call --> A1 Call --> A2 Call --> A3 Broadcast --> A0 @@ -258,7 +258,7 @@ graph TD Stream --> A2 Stream --> A3 end - + style Choose fill:#99ff99 style Call fill:#ffcc99 style Broadcast fill:#ff99cc @@ -275,31 +275,31 @@ Now the key insight: **Forge services are ServiceActors that manage ActorMeshes graph TD subgraph ServiceCreation["spawn_service() Process"] Call["await spawn_service(ServiceConfig(num_replicas=4), PolicyActor, model='Qwen')"] - + ServiceActor["ServiceActor
• Manages 4 replicas
• Handles health checks
• Routes service calls"] - - subgraph Replicas["4 Independent Replicas"] + + subgraph Replicas["4 Independent Replicas"] subgraph R0["Replica 0"] PM0["ProcMesh
1 process
GPU 0"] AM0["ActorMesh
1 PolicyActor"] end - + subgraph R1["Replica 1"] - PM1["ProcMesh
1 process
GPU 1"] + PM1["ProcMesh
1 process
GPU 1"] AM1["ActorMesh
1 PolicyActor"] end - + subgraph R2["Replica 2"] PM2["ProcMesh
1 process
GPU 2"] AM2["ActorMesh
1 PolicyActor"] end - + subgraph R3["Replica 3"] PM3["ProcMesh
1 process
GPU 3"] AM3["ActorMesh
1 PolicyActor"] end end - + Call --> ServiceActor ServiceActor --> R0 ServiceActor --> R1 @@ -310,7 +310,7 @@ graph TD PM2 --> AM2 PM3 --> AM3 end - + style ServiceActor fill:#ffcc99 style AM0 fill:#99ff99 style AM1 fill:#99ff99 @@ -324,30 +324,30 @@ graph TD graph TD subgraph CallFlow["Complete Call Flow"] UserCall["await policy_service.generate.choose('What is 2+2?')"] - + ServiceInterface["ServiceInterface
• Receives .choose() call
• Routes to ServiceActor"] - + ServiceActor["ServiceActor
• Selects healthy replica
• Load balancing logic
• Failure handling"] - + SelectedReplica["Selected Replica #2
• ProcMesh with 1 process
• ActorMesh with 1 PolicyActor"] - + PolicyActor["PolicyActor Instance
• Loads model
• Runs vLLM inference
• Returns 'The answer is 4'"] - + GPU["GPU 2
• vLLM engine
• Model weights
• KV cache
• CUDA kernels"] - + UserCall --> ServiceInterface ServiceInterface --> ServiceActor ServiceActor --> SelectedReplica SelectedReplica --> PolicyActor PolicyActor --> GPU - + GPU -.->|"Response"| PolicyActor PolicyActor -.->|"Response"| SelectedReplica SelectedReplica -.->|"Response"| ServiceActor ServiceActor -.->|"Response"| ServiceInterface ServiceInterface -.->|"'The answer is 4'"| UserCall end - + style UserCall fill:#99ff99 style ServiceActor fill:#ffcc99 style PolicyActor fill:#cc99ff @@ -361,32 +361,32 @@ In real RL systems, you have multiple services that can share or use separate Pr ```mermaid graph TD subgraph Cluster["RL Training Cluster"] - subgraph Services["Forge Services"] + subgraph Services["Forge Services"] PS["Policy Service
4 GPU replicas"] - TS["Trainer Service
2 GPU replicas"] + TS["Trainer Service
2 GPU replicas"] RS["Reward Service
4 CPU replicas"] BS["Buffer Service
1 CPU replica"] end - + subgraph MonarchInfra["Monarch Infrastructure"] subgraph GPUMesh["GPU ProcMesh (6 processes)"] G0["Process 0
GPU 0"] G1["Process 1
GPU 1"] - G2["Process 2
GPU 2"] + G2["Process 2
GPU 2"] G3["Process 3
GPU 3"] G4["Process 4
GPU 4"] G5["Process 5
GPU 5"] end - + subgraph CPUMesh["CPU ProcMesh (5 processes)"] C0["Process 0
CPU"] - C1["Process 1
CPU"] + C1["Process 1
CPU"] C2["Process 2
CPU"] C3["Process 3
CPU"] C4["Process 4
CPU"] end end - + PS --> G0 PS --> G1 PS --> G2 @@ -399,7 +399,7 @@ graph TD RS --> C3 BS --> C4 end - + style PS fill:#99ff99 style TS fill:#ff99cc style RS fill:#ffcc99 @@ -411,7 +411,7 @@ graph TD ## Key Insights: Why This Architecture Matters 1. **Process Isolation**: Each actor runs in its own process - failures don't cascade -2. **Location Transparency**: Actors can be local or remote with identical APIs +2. **Location Transparency**: Actors can be local or remote with identical APIs 3. **Structured Distribution**: ProcMesh maps directly to hardware topology 4. **Message Passing**: No shared memory means no race conditions or locks 5. **Service Abstraction**: Forge hides Monarch complexity while preserving power @@ -427,13 +427,13 @@ Understanding this hierarchy helps you: ## What You've Learned 1. **RL Fundamentals**: How RL concepts map to Forge services with REAL, working examples -2. **Service Abstraction**: How to use Forge services effectively with verified communication patterns +2. **Service Abstraction**: How to use Forge services effectively with verified communication patterns 3. **Monarch Foundation**: How Forge services connect to distributed actors and hardware ## Key Takeaways - **Services hide complexity**: Your RL code looks like simple async functions, but runs on distributed clusters -- **Communication patterns matter**: `.route()`, `.fanout()`, sessions, and `.call_one()` each serve specific purposes +- **Communication patterns matter**: `.route()`, `.fanout()`, sessions, and `.call_one()` each serve specific purposes - **Architecture understanding helps**: Knowing the Service → Actor → Process → Hardware hierarchy helps you debug, optimize, and scale - **Always verify APIs**: This guide is verified, but cross-check with source code for latest changes - **Real API patterns**: Use `.options().as_service()` not `spawn_service()`, use `.route()` not `.choose()`, etc. From 8f501d362dc3d7cf53ec7ce315e66216787d49dc Mon Sep 17 00:00:00 2001 From: Sanyam Bhutani Date: Thu, 2 Oct 2025 19:40:02 -0700 Subject: [PATCH 10/28] Update 3_Monarch_101.MD --- docs/Tutorials/3_Monarch_101.MD | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/Tutorials/3_Monarch_101.MD b/docs/Tutorials/3_Monarch_101.MD index 7b3f6d310..0b1b4bd79 100644 --- a/docs/Tutorials/3_Monarch_101.MD +++ b/docs/Tutorials/3_Monarch_101.MD @@ -198,7 +198,7 @@ graph TD P3["Process 3
GPU 3"] end - subgraph ActorMesh["ActorMesh[PolicyActor]"] + subgraph ActorMesh["ActorMesh PolicyActor"] A0["PolicyActor
Instance #0
model=Qwen/Qwen3-7B
generation_count=0"] A1["PolicyActor
Instance #1
model=Qwen/Qwen3-7B
generation_count=0"] A2["PolicyActor
Instance #2
model=Qwen/Qwen3-7B
generation_count=0"] From 7e47e025977718217b1d000abadfe9bdbffa461c Mon Sep 17 00:00:00 2001 From: Sanyam Bhutani Date: Thu, 2 Oct 2025 19:40:40 -0700 Subject: [PATCH 11/28] Update 3_Monarch_101.MD --- docs/Tutorials/3_Monarch_101.MD | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/Tutorials/3_Monarch_101.MD b/docs/Tutorials/3_Monarch_101.MD index 0b1b4bd79..52a058dcc 100644 --- a/docs/Tutorials/3_Monarch_101.MD +++ b/docs/Tutorials/3_Monarch_101.MD @@ -1,6 +1,6 @@ # Part 3: The Forge-Monarch Connection -This is part 3 of our series, in the previous sections: we learned [RL Concepts and how they map to Forge](./1_RL_and_Forge_Fundamentals.MD), [Forge Internals](./2_Forge_Internals.MD). +This is part 3 of our series, in the previous sections: we learned Part 1: [RL Concepts and how they map to Forge](./1_RL_and_Forge_Fundamentals.MD), Part 2: [Forge Internals](./2_Forge_Internals.MD). Now let's peel back the layers. Forge services are built on top of **Monarch**, PyTorch's distributed actor framework. Understanding this connection is crucial for optimization and debugging. From 1c8d8c1244a1c86b4011edc8321940b1f433707d Mon Sep 17 00:00:00 2001 From: Sanyam Bhutani Date: Fri, 3 Oct 2025 00:22:10 -0700 Subject: [PATCH 12/28] fix funcs --- docs/Tutorials/1_RL_and_Forge_Fundamentals.MD | 152 ++++++------- docs/Tutorials/2_Forge_Internals.MD | 200 ++++++++++-------- docs/Tutorials/3_Monarch_101.MD | 14 +- 3 files changed, 199 insertions(+), 167 deletions(-) diff --git a/docs/Tutorials/1_RL_and_Forge_Fundamentals.MD b/docs/Tutorials/1_RL_and_Forge_Fundamentals.MD index 810ef373f..c34ae6639 100644 --- a/docs/Tutorials/1_RL_and_Forge_Fundamentals.MD +++ b/docs/Tutorials/1_RL_and_Forge_Fundamentals.MD @@ -114,33 +114,36 @@ Let's look at the example from above again, but this time we would use the names # Conceptual Example async def conceptual_forge_rl_step(services, step): - # 1. Get a math problem - CONCEPTUAL API - sample = await services['dataloader'].get_sample() - question, target = sample["question"], sample["answer"] + # 1. Get a math problem - Using actual DatasetActor API + sample = await services['dataloader'].sample.call_one() + question, target = sample["request"], sample["target"] - # 2. Student generates answer - CONCEPTUAL API - # Actual method names vary by implementation - responses = await services['policy'].generate(prompt=question) + # 2. Student generates answer - Using actual Policy API + responses = await services['policy'].generate.route(prompt=question) answer = responses[0].text - # 3. Teacher grades it - CONCEPTUAL API - # Actual reward evaluation varies by implementation - score = await services['reward_actor'].evaluate( + # 3. Teacher grades it - Using actual RewardActor API + score = await services['reward_actor'].evaluate_response.route( prompt=question, response=answer, target=target ) - # 4. Compare to baseline - CONCEPTUAL API - ref_logprobs = await services['ref_model'].compute_baseline(responses[0].token_ids) + # 4. Compare to baseline - Using actual ReferenceModel API + # Note: ReferenceModel.forward requires input_ids, max_req_tokens, return_logprobs + ref_logprobs = await services['ref_model'].forward.route( + input_ids, max_req_tokens, return_logprobs=True + ) - # 5. Store experience - CONCEPTUAL Episode structure - # Real Episode structure in src/forge/data_models/episode.py - episode = create_episode(responses[0], score, ref_logprobs, step) - await services['replay_buffer'].store(episode) + # 5. 
Store experience - Using actual Episode structure from apps/grpo/main.py + episode = create_episode_from_response(responses[0], score, ref_logprobs, step) + await services['replay_buffer'].add.call_one(episode) - # 6. Improve student - CONCEPTUAL API - batch = await services['replay_buffer'].get_batch(policy_version=step) + # 6. Improve student - Using actual training pattern + batch = await services['replay_buffer'].sample.call_one( + curr_policy_version=step + ) if batch is not None: - loss = await services['trainer'].update_policy(batch) + inputs, targets = batch + loss = await services['trainer'].train_step.call(inputs, targets) return loss ``` @@ -234,34 +237,38 @@ Let's see how core RL concepts map to Forge services: async def real_rl_training_step(services, step): """Single RL step using verified Forge APIs""" - # 1. Environment interaction - sample = await services['dataloader'].__next__.call_one() - prompt, target = sample["question"], sample["answer"] + # 1. Environment interaction - Using actual DatasetActor API + sample = await services['dataloader'].sample.call_one() + prompt, target = sample["request"], sample["target"] - responses = await services['policy'].generate.route(prompt=prompt) + responses = await services['policy'].generate.route(prompt) - # 2. Reward computation + # 2. Reward computation - Using actual RewardActor API score = await services['reward_actor'].evaluate_response.route( prompt=prompt, response=responses[0].text, target=target ) - # 3. Get reference logprobs - ref_logprobs = await services['ref_model'].forward.route(responses[0].token_ids) + # 3. Get reference logprobs - Using actual ReferenceModel API + # Note: ReferenceModel requires full input_ids tensor, not just tokens + input_ids = torch.cat([responses[0].prompt_ids, responses[0].token_ids]) + ref_logprobs = await services['ref_model'].forward.route( + input_ids.unsqueeze(0), max_req_tokens=512, return_logprobs=True + ) - # 4. Experience storage - Episode creation pattern - # Note: Actual Episode structure requires token tensors, not text + # 4. Experience storage - Using actual Episode pattern from GRPO episode = create_episode_from_response(responses[0], score, ref_logprobs, step) await services['replay_buffer'].add.call_one(episode) - # 5. Learning - trainer endpoint + # 5. Learning - Using actual trainer pattern batch = await services['replay_buffer'].sample.call_one( curr_policy_version=step ) if batch is not None: - loss = await services['trainer'].train_step.call_one(batch) + inputs, targets = batch # GRPO returns (inputs, targets) tuple + loss = await services['trainer'].train_step.call(inputs, targets) - # 6. Policy synchronization - weight update pattern - await services['trainer'].push_weights.call_one(step + 1) + # 6. 
Policy synchronization - Using actual weight update pattern + await services['trainer'].push_weights.call(step + 1) await services['policy'].update_weights.fanout(step + 1) return loss @@ -287,12 +294,14 @@ Forge handles behind the scenes: ### Independent Scaling ```python -from forge.actors.policy import Policy, PolicyConfig, SamplingOverrides, WorkerConfig +from forge.actors.policy import Policy from forge.actors.replay_buffer import ReplayBuffer -from forge.controller.service import shutdown_service -from apps.grpo.main import Trainer, RewardActor, ComputeAdvantages, RefModel, DatasetActor +from forge.actors.reference_model import ReferenceModel +from forge.actors.trainer import RLTrainer +from apps.grpo.main import DatasetActor, RewardActor, ComputeAdvantages from forge.data.rewards import MathReward, ThinkingReward import asyncio +import torch model = "Qwen/Qwen3-1.7B" group_size = 1 @@ -306,67 +315,60 @@ group_size = 1 ref_model, reward_actor, ) = await asyncio.gather( - # Dataset service - spawn_service( - ServiceConfig(procs_per_replica=1, num_replicas=1), - DatasetActor, + # Dataset actor (CPU) + DatasetActor.options(procs=1).as_actor( path="openai/gsm8k", - config_name="main", - split="train", + revision="main", + data_split="train", streaming=True, + model=model, ), # Policy service with GPU - spawn_service( - ServiceConfig(procs_per_replica=1, with_gpus=True, num_replicas=1), - Policy, - config=PolicyConfig( - worker_params=WorkerConfig(model=model), - sampling_params=SamplingOverrides( - num_samples=group_size, max_tokens=16 - ), - ), + Policy.options(procs=1, with_gpus=True, num_replicas=1).as_service( + engine_config={ + "model": model, + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "enforce_eager": False + }, + sampling_config={ + "n": group_size, + "max_tokens": 16, + "temperature": 1.0, + "top_p": 1.0 + } ), - # Trainer service with GPU - spawn_service( - ServiceConfig(procs_per_replica=1, with_gpus=True, num_replicas=1), - Trainer, - learning_rate=1e-5, - beta=0.1, - model_name=model, + # Trainer actor with GPU + RLTrainer.options(procs=1, with_gpus=True).as_actor( + # Trainer config would come from YAML in real usage + model={"name": "qwen3", "flavor": "1.7B", "hf_assets_path": f"hf://{model}"}, + optimizer={"name": "AdamW", "lr": 1e-5}, + training={"local_batch_size": 2, "seq_len": 2048} ), # Replay buffer (CPU) - spawn_service( - ServiceConfig(procs_per_replica=1, num_replicas=1), - ReplayBuffer, + ReplayBuffer.options(procs=1).as_actor( batch_size=2, max_policy_age=1, + dp_size=1 ), # Advantage computation (CPU) - spawn_service( - ServiceConfig(procs_per_replica=1, num_replicas=1), - ComputeAdvantages, - gamma=0.99, - lambda_=0.95, - ), + ComputeAdvantages.options(procs=1).as_actor(), # Reference model with GPU - spawn_service( - ServiceConfig(procs_per_replica=1, num_replicas=1, with_gpus=True), - RefModel, - model_name=model, + ReferenceModel.options(procs=1, with_gpus=True).as_actor( + model={"name": "qwen3", "flavor": "1.7B", "hf_assets_path": f"hf://{model}"}, + training={"dtype": "bfloat16"} ), # Reward actor (CPU) - spawn_service( - ServiceConfig(procs_per_replica=1, num_replicas=1), - RewardActor, - reward_functions=[MathReward(), ThinkingReward()], + RewardActor.options(procs=1, num_replicas=1).as_service( + reward_functions=[MathReward(), ThinkingReward()] ) ) ``` -Production scaling - multiply num_replicas: +Production scaling - multiply num_replicas for services or spawn multiple actors: - Policy: num_replicas=8 for high inference demand - 
RewardActor: num_replicas=16 for parallel evaluation -- Trainer: num_replicas=4 for distributed training +- Trainer: Multiple actors for distributed training (RLTrainer handles this internally) ### Fault Tolerance diff --git a/docs/Tutorials/2_Forge_Internals.MD b/docs/Tutorials/2_Forge_Internals.MD index 9018afe3d..634f04f85 100644 --- a/docs/Tutorials/2_Forge_Internals.MD +++ b/docs/Tutorials/2_Forge_Internals.MD @@ -65,36 +65,44 @@ Policy.options( ### 2. Real Service Creation Services are created using the `spawn_service` function: +Services are created using the `.options().as_service()` pattern from the actual GRPO implementation: -The spawn_service() function automatically handles: +The service creation automatically handles: - Spawning actor replicas across processes/GPUs -- Load balancing with .choose() method +- Load balancing with .route() method for services - Health monitoring and failure recovery - Message routing and serialization ```python -from forge.controller.service import ServiceConfig, spawn_service -from forge.actors.policy import Policy, PolicyConfig, SamplingOverrides, WorkerConfig +from forge.actors.policy import Policy model = "Qwen/Qwen3-1.7B" -policy = await spawn_service( - ServiceConfig(procs_per_replica=1, with_gpus=True, num_replicas=1), - Policy, - config=PolicyConfig( - worker_params=WorkerConfig(model=model), - sampling_params=SamplingOverrides( - num_samples=1, max_tokens=16 - ), - ), +policy = await Policy.options( + procs=1, + with_gpus=True, + num_replicas=1 +).as_service( + engine_config={ + "model": model, + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "enforce_eager": False + }, + sampling_config={ + "n": 1, + "max_tokens": 16, + "temperature": 1.0, + "top_p": 1.0 + } ) prompt = "What is 3 + 5?" -responses = await policy.generate.choose(prompt=prompt) +responses = await policy.generate.route(prompt) print(f"Response: {responses[0].text}") # Cleanup when done -await shutdown_service(policy) +await policy.shutdown() ``` ### 3. 
How Services Actually Work @@ -253,7 +261,6 @@ class CustomPolicy(Policy): # This Counter example demonstrates the session pattern from forge.controller import ForgeActor -from forge.controller.service import ServiceConfig, spawn_service, shutdown_service from monarch.actor import endpoint class ForgeCounter(ForgeActor): @@ -273,37 +280,35 @@ class ForgeCounter(ForgeActor): async def reset(self): self.value = 0 -counter_service = await spawn_service( - ServiceConfig(procs_per_replica=1, num_replicas=4), - ForgeCounter, - initial_value=0 -) +counter_service = await ForgeCounter.options( + procs=1, num_replicas=4 +).as_service(initial_value=0) # Test basic operations -await counter_service.increment.choose() -results = await counter_service.increment.call() +await counter_service.increment.route() +results = await counter_service.increment.fanout() # Get from all replicas print(f"All replica values: {results}") # STICKY SESSIONS print("\nUsing sticky sessions:") async with counter_service.session(): - await counter_service.reset.choose() - print(await counter_service.increment.choose()) # 1 - print(await counter_service.increment.choose()) # 2 - print(await counter_service.increment.choose()) # 3 + await counter_service.reset.route() # Uses .route() within session + print(await counter_service.increment.route()) # 1 + print(await counter_service.increment.route()) # 2 + print(await counter_service.increment.route()) # 3 - final_value = await counter_service.get_value.choose() + final_value = await counter_service.get_value.route() print(f"Final value on this replica: {final_value}") # 3 # Same pattern works with Policy for multi-turn conversations: # async with policy.session(): -# response1 = await policy.generate.choose(prompt=turn1) +# response1 = await policy.generate.route(turn1) # full_prompt = turn1 + response1[0].text + turn2 -# response2 = await policy.generate.choose(prompt=full_prompt) +# response2 = await policy.generate.route(full_prompt) # # Both calls hit same replica, preserving KV cache # Cleanup -await shutdown_service(counter_service) +await counter_service.shutdown() ``` **Performance impact**: Critical for maintaining KV cache in multi-turn conversations. 
@@ -395,60 +400,72 @@ print(f"Current policy version: {current_version}") Instead of manual coordination, Forge services handle speed mismatches automatically: ```python - from apps.grpo.main import Episode, Group async def simple_rl_step(): # ===== Generate a rollout ===== - sample = await dataloader.__next__.choose() - prompt, target = sample["question"], sample["answer"] + sample = await dataloader.sample.call_one() # DatasetActor is an actor, not service + prompt, target = sample["request"], sample["target"] # Correct field names print(f"Prompt: {prompt}") print(f"Target: {target}") - actions = await policy.generate.choose(prompt=prompt) + actions = await policy.generate.route(prompt=prompt) # Policy is a service print(f"Policy response: {actions[0].text}") - ref_logprobs = await ref_model.forward.choose(actions[0].token_ids) - reward = await reward_actor.evaluate_response.choose( + # Create input tensor for reference model (requires full context) + input_ids = torch.cat([actions[0].prompt_ids, actions[0].token_ids]) + ref_logprobs = await ref_model.forward.route( + input_ids.unsqueeze(0), max_req_tokens=512, return_logprobs=True + ) + reward = await reward_actor.evaluate_response.route( # RewardActor is a service prompt=prompt, response=actions[0].text, target=target ) print(f"Reward: {reward}") + # Create episode using actual GRPO Episode structure episode = Episode( - episode_id=0, - prompt=prompt, - target=target, + episode_id="0", + request=prompt, policy_version=0, + pad_id=tokenizer.pad_token_id, + request_len=512, + response_len=512, + target=target ) - episode.add_group(Group( - response=actions[0].text, - ref_logprobs=ref_logprobs, - reward=reward, - )) + # Add response data + episode.response = actions[0].text + episode.request_tokens = actions[0].prompt_ids.tolist() + episode.response_tokens = actions[0].token_ids.tolist() + episode.ref_logprobs = ref_logprobs[0] # Extract from batch dimension + episode.reward = reward - advantages = await compute_advantages.__call__.choose(episode.groups) - episode.groups[0].advantage = advantages[0] + # Compute advantages using actual ComputeAdvantages actor + group = Group.new_group(0, 1, prompt, 0, tokenizer.pad_token_id, 512, 512, target) + group.episodes[0] = episode + advantages = await compute_advantages.compute.call_one(group) # ComputeAdvantages is an actor + episode.advantage = advantages[0] print(f"Advantage: {advantages[0]}") - await replay_buffer.add.choose(episode) + await replay_buffer.add.call_one(episode) # ReplayBuffer is an actor print("Episode stored in replay buffer") # ===== Train on the batch ===== - batch = await replay_buffer.sample.choose(curr_policy_version=0) + batch = await replay_buffer.sample.call_one(curr_policy_version=0) if batch is not None: print("Training on batch...") - training_result = await trainer.train_step.choose(batch) - loss = training_result.get("loss", 0.0) + inputs, targets = batch # GRPO returns (inputs, targets) tuple + loss = await trainer.train_step.call(inputs, targets) # RLTrainer is an actor print(f"Training loss: {loss}") return loss else: print("Not enough data in buffer yet") return None +# Note: This simplified example assumes tokenizer and services are already initialized for step in range(10): print(f"\n--- RL Step {step + 1} ---") loss = await simple_rl_step() @@ -467,7 +484,7 @@ for step in range(10): policy = await Policy.options( procs=1, num_replicas=8, with_gpus=True # Many replicas for high throughput ).as_service( - engine_config=EngineConfig(model=model_name) + 
engine_config={"model": model_name, "tensor_parallel_size": 1} ) # Reward evaluation might be CPU-bound @@ -479,9 +496,10 @@ reward_actor = await RewardActor.options( # Training needs fewer but more powerful replicas trainer = await RLTrainer.options( - procs=1, num_replicas=2, with_gpus=True # Fewer but GPU-heavy + procs=1, with_gpus=True # Fewer but GPU-heavy ).as_actor( # Trainer typically uses .as_actor() not .as_service() - optimizer=Optimizer(lr=1e-5) + model={"name": "qwen3", "flavor": "1.7B"}, + optimizer={"name": "AdamW", "lr": 1e-5} ) ``` @@ -495,7 +513,6 @@ Let's see how a reward service is actually implemented: from forge.controller import ForgeActor from monarch.actor import endpoint from forge.data.rewards import MathReward, ThinkingReward -from forge.controller.service import ServiceConfig, spawn_service # class definition from apps/grpo/main.py class RewardActor(ForgeActor): @@ -515,9 +532,9 @@ class RewardActor(ForgeActor): # Return average reward across all functions return total_reward / len(self.reward_functions) if self.reward_functions else 0.0 -reward_actor = await spawn_service( - ServiceConfig(procs_per_replica=1, num_replicas=1), - RewardActor, +reward_actor = await RewardActor.options( + procs=1, num_replicas=1 +).as_service( reward_functions=[MathReward(), ThinkingReward()] ) @@ -525,7 +542,7 @@ prompt = "What is 15% of 240?" response = "15% of 240 is 36" target = "36" -score = await reward_actor.evaluate_response.choose( +score = await reward_actor.evaluate_response.route( prompt=prompt, response=response, target=target @@ -533,10 +550,10 @@ score = await reward_actor.evaluate_response.choose( print(f"Reward score: {score}") # Usually around 1.0 for correct math answers # For production scaling - increase num_replicas for parallel evaluation: -# ServiceConfig(procs_per_replica=1, num_replicas=16) # 16 parallel evaluators +# RewardActor.options(procs=1, num_replicas=16) # 16 parallel evaluators # Cleanup when done -await shutdown_service(reward_actor) +await reward_actor.shutdown() ``` ## Service Orchestration: The Training Loop @@ -547,16 +564,15 @@ Now let's see how services coordinate in a real training loop: # This is the REAL way production RL systems are built with Forge import asyncio +import torch from forge.actors.policy import Policy from forge.actors.reference_model import ReferenceModel from forge.actors.replay_buffer import ReplayBuffer from forge.actors.trainer import RLTrainer -from forge.controller.actor import ForgeActor +from apps.grpo.main import DatasetActor, RewardActor, ComputeAdvantages from forge.data.rewards import MathReward, ThinkingReward -from monarch.actor import endpoint -from omegaconf import DictConfig -# Service creation from apps/grpo/main.py lines 322-344 +# Service creation pattern from apps/grpo/main.py lines 322-344 print("Initializing all services...") ( dataloader, @@ -567,17 +583,27 @@ print("Initializing all services...") ref_model, reward_actor, ) = await asyncio.gather( - DatasetActor.options(**cfg.actors.dataset).as_actor(**cfg.dataset), - Policy.options(**cfg.services.policy).as_service(**cfg.policy), - RLTrainer.options(**cfg.actors.trainer).as_actor( - **cfg.trainer, loss=simple_grpo_loss + DatasetActor.options(procs=1).as_actor( + path="openai/gsm8k", revision="main", data_split="train", + streaming=True, model="Qwen/Qwen3-1.7B" + ), + Policy.options(procs=1, with_gpus=True, num_replicas=1).as_service( + engine_config={"model": "Qwen/Qwen3-1.7B", "tensor_parallel_size": 1}, + sampling_config={"n": 1, 
"max_tokens": 512} ), - ReplayBuffer.options(**cfg.actors.replay_buffer).as_actor( - **cfg.replay_buffer, collate=collate + RLTrainer.options(procs=1, with_gpus=True).as_actor( + model={"name": "qwen3", "flavor": "1.7B", "hf_assets_path": "hf://Qwen/Qwen3-1.7B"}, + optimizer={"name": "AdamW", "lr": 1e-5}, + training={"local_batch_size": 2, "seq_len": 2048} ), - ComputeAdvantages.options(**cfg.actors.compute_advantages).as_actor(), - ReferenceModel.options(**cfg.services.ref_model).as_service(**cfg.ref_model), - RewardActor.options(**cfg.services.reward_actor).as_service( + ReplayBuffer.options(procs=1).as_actor( + batch_size=2, max_policy_age=1, dp_size=1 + ), + ComputeAdvantages.options(procs=1).as_actor(), + ReferenceModel.options(procs=1, with_gpus=True).as_actor( + model={"name": "qwen3", "flavor": "1.7B", "hf_assets_path": "hf://Qwen/Qwen3-1.7B"} + ), + RewardActor.options(procs=1, num_replicas=1).as_service( reward_functions=[MathReward(), ThinkingReward()] ), ) @@ -593,10 +619,13 @@ async def production_training_loop(): sample = await dataloader.sample.call_one() # Policy generation service call - responses = await policy.generate.route(prompt=sample["question"]) + responses = await policy.generate.route(sample["request"]) # Correct field name - # Reference computation service call - ref_logprobs = await ref_model.forward.route(responses[0].token_ids) + # Reference computation service call (requires full input tensor) + input_ids = torch.cat([responses[0].prompt_ids, responses[0].token_ids]) + ref_logprobs = await ref_model.forward.route( + input_ids.unsqueeze(0), max_req_tokens=512, return_logprobs=True + ) # Reward evaluation service call reward = await reward_actor.evaluate_response.route( @@ -605,18 +634,19 @@ async def production_training_loop(): target=sample["answer"] ) - # Experience storage (simplified structure for illustration) - episode = create_episode(sample, responses[0], reward, ref_logprobs, step) + # Experience storage (using actual Episode structure) + episode = create_episode_from_grpo_data(sample, responses[0], reward, ref_logprobs[0], step) await replay_buffer.add.call_one(episode) - # Training when ready endpoints + # Training when ready batch = await replay_buffer.sample.call_one(curr_policy_version=step) if batch is not None: - loss = await trainer.train_step.call_one(batch) + inputs, targets = batch # GRPO returns (inputs, targets) tuple + loss = await trainer.train_step.call(inputs, targets) # Weight synchronization pattern - await trainer.push_weights.call_one(step + 1) - await policy.update_weights.route(step + 1) + await trainer.push_weights.call(step + 1) + await policy.update_weights.fanout(step + 1) # Fanout to all replicas print(f"Step {step}, Loss: {loss:.4f}") step += 1 @@ -628,7 +658,7 @@ await asyncio.gather( RLTrainer.shutdown(trainer), ReplayBuffer.shutdown(replay_buffer), ComputeAdvantages.shutdown(compute_advantages), - ref_model.shutdown(), + ReferenceModel.shutdown(ref_model), reward_actor.shutdown(), ) print("All services shut down successfully!") @@ -636,7 +666,7 @@ print("All services shut down successfully!") **Key observations:** 1. **Parallelism**: Independent operations run concurrently -2. **Load balancing**: Each `choose()` call automatically selects optimal replica +2. **Load balancing**: Each `.route()` call automatically selects optimal replica 3. **Fault tolerance**: Failures automatically retry on different replicas 4. **Resource efficiency**: CPU and GPU services scale independently 5. 
**Coordination**: Services coordinate through shared state (replay buffer, weight versions) diff --git a/docs/Tutorials/3_Monarch_101.MD b/docs/Tutorials/3_Monarch_101.MD index 52a058dcc..0cbdcbd88 100644 --- a/docs/Tutorials/3_Monarch_101.MD +++ b/docs/Tutorials/3_Monarch_101.MD @@ -9,11 +9,11 @@ Now let's peel back the layers. Forge services are built on top of **Monarch**, ```mermaid graph TD subgraph YourCode["1. Your RL Code"] - Call["await policy_service.generate.choose('What is 2+2?')"] + Call["await policy_service.generate.route('What is 2+2?')"] end subgraph ForgeServices["2. Forge Service Layer"] - ServiceInterface["ServiceInterface
• Routes .choose() to replica
• Handles load balancing
• Manages health checks"] + ServiceInterface["ServiceInterface
• Routes .route() to replica
• Handles load balancing
• Manages health checks"] ServiceActor["ServiceActor
• Manages replica lifecycle
• Monitors health
• Coordinates failures"] end @@ -167,7 +167,7 @@ await counters.increment.call() value = await counters.get_value.call_one() print(f"One counter: {value}") -# choose() - random single actor +# choose() - random single actor (actors only, not services) value = await counters.get_value.choose() print(f"Random counter: {value}") @@ -273,8 +273,8 @@ Now the key insight: **Forge services are ServiceActors that manage ActorMeshes ```mermaid graph TD - subgraph ServiceCreation["spawn_service() Process"] - Call["await spawn_service(ServiceConfig(num_replicas=4), PolicyActor, model='Qwen')"] + subgraph ServiceCreation["Service Creation Process"] + Call["await PolicyActor.options(num_replicas=4, procs=1).as_service(model='Qwen')"] ServiceActor["ServiceActor
• Manages 4 replicas
• Handles health checks
• Routes service calls"] @@ -323,9 +323,9 @@ graph TD ```mermaid graph TD subgraph CallFlow["Complete Call Flow"] - UserCall["await policy_service.generate.choose('What is 2+2?')"] + UserCall["await policy_service.generate.route('What is 2+2?')"] - ServiceInterface["ServiceInterface
• Receives .choose() call
• Routes to ServiceActor"] + ServiceInterface["ServiceInterface
• Receives .route() call
• Routes to ServiceActor"] ServiceActor["ServiceActor
• Selects healthy replica
• Load balancing logic
• Failure handling"] From aca4c961590a1a5081235d088a4f3383bc38932c Mon Sep 17 00:00:00 2001 From: Sanyam Bhutani Date: Fri, 3 Oct 2025 13:50:49 -0700 Subject: [PATCH 13/28] Update docs/Tutorials/2_Forge_Internals.MD Co-authored-by: Allen Wang <9057208+allenwang28@users.noreply.github.com> --- docs/Tutorials/2_Forge_Internals.MD | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/Tutorials/2_Forge_Internals.MD b/docs/Tutorials/2_Forge_Internals.MD index 634f04f85..09c39fb7e 100644 --- a/docs/Tutorials/2_Forge_Internals.MD +++ b/docs/Tutorials/2_Forge_Internals.MD @@ -58,7 +58,7 @@ Policy.options( num_replicas=4, # Number of replicas with_gpus=True # Allocate GPUs # Other available options: - # hosts=None + # hosts=None # the number of remote hosts used per replica ) ``` From 10863aaf13a3580bf1eb9e0c04a635357f60fff7 Mon Sep 17 00:00:00 2001 From: Sanyam Bhutani Date: Fri, 10 Oct 2025 14:10:38 -0700 Subject: [PATCH 14/28] update part 1 and 2 --- docs/Tutorials/1_RL_and_Forge_Fundamentals.MD | 4 +- docs/Tutorials/2_Forge_Internals.MD | 56 +------------------ 2 files changed, 3 insertions(+), 57 deletions(-) diff --git a/docs/Tutorials/1_RL_and_Forge_Fundamentals.MD b/docs/Tutorials/1_RL_and_Forge_Fundamentals.MD index c34ae6639..32ada41cb 100644 --- a/docs/Tutorials/1_RL_and_Forge_Fundamentals.MD +++ b/docs/Tutorials/1_RL_and_Forge_Fundamentals.MD @@ -213,7 +213,7 @@ Each step has different: Unlike supervised learning where you process independent batches, RL requires coordination: ```python -# This won't work - creates bottlenecks and resource waste +# While this does work, it creates bottlenecks and resource waste def naive_rl_step(): # Policy waits idle while reward model works response = policy_model.generate(prompt) # GPU busy @@ -368,7 +368,7 @@ group_size = 1 Production scaling - multiply num_replicas for services or spawn multiple actors: - Policy: num_replicas=8 for high inference demand - RewardActor: num_replicas=16 for parallel evaluation -- Trainer: Multiple actors for distributed training (RLTrainer handles this internally) +- Trainer: Multiple processes for distributed training (RLTrainer handles this internally) ### Fault Tolerance diff --git a/docs/Tutorials/2_Forge_Internals.MD b/docs/Tutorials/2_Forge_Internals.MD index 09c39fb7e..c21485bb0 100644 --- a/docs/Tutorials/2_Forge_Internals.MD +++ b/docs/Tutorials/2_Forge_Internals.MD @@ -64,7 +64,6 @@ Policy.options( ### 2. Real Service Creation -Services are created using the `spawn_service` function: Services are created using the `.options().as_service()` pattern from the actual GRPO implementation: The service creation automatically handles: @@ -126,32 +125,6 @@ But behind the scenes: - Handles failures and retries automatically - Returns list[Completion] from the selected replica -### 3. Different Service Types and Their Characteristics - -```mermaid -graph TD - subgraph GPU["GPU-Intensive Services"] - PolicySvc["Policy Service
Large model inference
High GPU memory
Batch optimization"] - TrainerSvc["Trainer Service
Distributed training
Gradient sync
Massive compute"] - RefSvc["Reference Service
Frozen model
Baseline computation
Read-only ops"] - end - - subgraph CPU["CPU-Intensive Services"] - RewardSvc["Reward Service
Evaluation logic
Rule-based scoring
High throughput"] - DataSvc["Data Service
Dataset streaming
Preprocessing
I/O optimization"] - end - - subgraph Memory["Memory-Intensive Services"] - BufferSvc["Buffer Service
Experience storage
Efficient sampling
Persistence"] - MetricsSvc["Metrics Service
Logging aggregation
Performance tracking
Analytics"] - end - - style PolicySvc fill:#ff9999 - style TrainerSvc fill:#ff9999 - style RewardSvc fill:#99ff99 - style BufferSvc fill:#9999ff -``` - ## Deep Dive: Service Communication Patterns These communication patterns (\"adverbs\") determine how your service calls are routed to replicas. Understanding when to use each pattern is key to effective Forge usage. @@ -226,34 +199,7 @@ while training: **Critical insight**: This is essential for high-throughput RL where you can't wait for batches. -### 4. Fire-and-Forget Operations - -**When to use**: Side effects that don't need responses (notifications, cache updates). - -```python -# CONCEPTUAL - Fire-and-forget requires custom @endpoint implementations -# The basic services don't have broadcast methods built-in -# You would implement custom endpoints in your ForgeActor: - -class CustomPolicy(Policy): - @endpoint - async def clear_cache(self) -> None: - """Custom endpoint for cache clearing""" - self.policy_worker.clear_kv_cache() - -# Then use it (hypothetical): -# await custom_policy.clear_cache.fanout() # Clear all replica caches -# Note: Actual cache clearing would use existing Policy methods -``` - -**Performance characteristics**: -- **Latency**: Immediately returns (doesn't wait for completion) -- **Throughput**: Network limited, but non-blocking -- **Fault tolerance**: Fire-and-forget (you don't know if it worked) - -**Critical warning**: Only use for non-critical operations - you get no confirmation. - -### 5. Service Sessions for Stateful Operations +### 3. Service Sessions for Stateful Operations **When to use**: When you need multiple calls to hit the same replica (like KV cache preservation). From 67a0a9852bd76e775db082bf940ffe97f48fe0a1 Mon Sep 17 00:00:00 2001 From: Sanyam Bhutani Date: Sun, 12 Oct 2025 11:43:57 -0700 Subject: [PATCH 15/28] address more comments --- docs/Tutorials/1_RL_and_Forge_Fundamentals.MD | 8 ++++---- docs/Tutorials/2_Forge_Internals.MD | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/Tutorials/1_RL_and_Forge_Fundamentals.MD b/docs/Tutorials/1_RL_and_Forge_Fundamentals.MD index 32ada41cb..66b32a2b3 100644 --- a/docs/Tutorials/1_RL_and_Forge_Fundamentals.MD +++ b/docs/Tutorials/1_RL_and_Forge_Fundamentals.MD @@ -204,8 +204,8 @@ graph LR Each step has different: - **Latency requirements**: Policy inference needs low latency, training can batch -- **Scaling patterns**: Reward evaluation scales with response count, training with model size -- **Failure modes**: Policy failure stops generation, reward failure affects learning quality +- **Scaling patterns**: Need N policy replicas to keep trainer busy, plus different sharding strategies (tensor parallel for training vs replicated inference) +- **Failure modes**: Any component failure cascades to halt the entire pipeline (Forge prevents this with automatic failover) - **Resource utilization**: GPUs for inference/training, CPUs for data processing ### Problem 3: The Coordination Challenge @@ -229,9 +229,9 @@ def naive_rl_step(): ## Enter Forge: RL-Native Architecture -Forge solves these problems by treating each RL component as an **independent, scalable service** +Forge solves these problems by treating each RL component as an **independent, distributed unit** - some as fault-tolerant services (like Policy inference where failures are easy to handle), others as actors (like Trainers where recovery semantics differ) -Let's see how core RL concepts map to Forge services: +Let's see how core RL concepts map to Forge components 
(you'll notice a mix of `.route()` for services and `.call_one()` for actors - we cover when to use each in Part 2): ```python async def real_rl_training_step(services, step): diff --git a/docs/Tutorials/2_Forge_Internals.MD b/docs/Tutorials/2_Forge_Internals.MD index c21485bb0..2ed3301e5 100644 --- a/docs/Tutorials/2_Forge_Internals.MD +++ b/docs/Tutorials/2_Forge_Internals.MD @@ -140,7 +140,7 @@ answer = responses[0].text # Extract text from Completion object Behind the scenes: 1. Health check eliminates failed replicas -2. Load balancer picks least loaded healthy replica +2. Load balancer picks replica (currently round robin, configurable balancers coming soon) 3. Request routes to that specific replica 4. Automatic retry on different replica if failure @@ -302,7 +302,7 @@ async def optimized_multi_turn(): ```python # Forge ReplayBuffer endpoints (verified from source code) # Add episodes (thread-safe by actor model) -await replay_buffer.add.call_one(episode) # Note: .call_one() not .choose() +await replay_buffer.add.call_one(episode) # .choose() would work too, but .call_one() clarifies it's a singleton actor not ActorMesh # Sample batches for training batch = await replay_buffer.sample.call_one( From 27a48a829e07bea0ede28a2580ac61d3b68f6afc Mon Sep 17 00:00:00 2001 From: Sanyam Bhutani Date: Sun, 12 Oct 2025 11:46:05 -0700 Subject: [PATCH 16/28] fix multi line issue --- docs/Tutorials/1_RL_and_Forge_Fundamentals.MD | 20 +++---- docs/Tutorials/2_Forge_Internals.MD | 14 ++--- docs/Tutorials/3_Monarch_101.MD | 60 +++++++++---------- 3 files changed, 47 insertions(+), 47 deletions(-) diff --git a/docs/Tutorials/1_RL_and_Forge_Fundamentals.MD b/docs/Tutorials/1_RL_and_Forge_Fundamentals.MD index 66b32a2b3..26f90092c 100644 --- a/docs/Tutorials/1_RL_and_Forge_Fundamentals.MD +++ b/docs/Tutorials/1_RL_and_Forge_Fundamentals.MD @@ -9,12 +9,12 @@ Let's start with a simple math tutoring example to understand RL concepts with t ```mermaid graph TD subgraph Example["Math Tutoring RL Example"] - Dataset["Dataset
math problems
'What is 2+2?'"] - Policy["Policy
student AI
generates: 'The answer is 4'"] - Reward["Reward Model
Evaluation Exam
scores: 0.95 (excellent)"] - Reference["Reference Model
original student
baseline comparison"] - ReplayBuffer["Replay Buffer
notebook
stores experiences"] - Trainer["Trainer
tutor
improves student"] + Dataset["Dataset: math problems"] + Policy["Policy: student AI"] + Reward["Reward Model: scores answers"] + Reference["Reference Model: baseline"] + ReplayBuffer["Replay Buffer: stores experiences"] + Trainer["Trainer: improves student"] end Dataset --> Policy @@ -163,13 +163,13 @@ Our simple RL loop above has complex requirements: ```mermaid graph TD subgraph Components["Each Component Needs Different Resources"] - Policy["Policy (Student AI)
Generates: 'The answer is 4'
Needs: Large GPU memory
Scaling: Multiple replicas for speed"] + Policy["Policy (Student AI): Large GPU memory, Multiple replicas"] - Reward["Reward Model (Teacher)
Scores answers: 0.95
Needs: Moderate compute
Scaling: CPU or small GPU"] + Reward["Reward Model (Teacher): Moderate compute, CPU/small GPU"] - Trainer["Trainer (Tutor)
Improves student weights
Needs: Massive GPU compute
Scaling: Distributed training"] + Trainer["Trainer (Tutor): Massive GPU compute, Distributed training"] - Dataset["Dataset (Question Bank)
Provides: 'What is 2+2?'
Needs: CPU intensive I/O
Scaling: High memory bandwidth"] + Dataset["Dataset (Question Bank): CPU intensive I/O, High memory bandwidth"] end style Policy fill:#99ff99 diff --git a/docs/Tutorials/2_Forge_Internals.MD b/docs/Tutorials/2_Forge_Internals.MD index 2ed3301e5..ef53ddfe5 100644 --- a/docs/Tutorials/2_Forge_Internals.MD +++ b/docs/Tutorials/2_Forge_Internals.MD @@ -15,19 +15,19 @@ graph TD Call["Your Code:
await policy_service.generate"] subgraph ServiceLayer["Service Layer"] - Proxy["Service Proxy
Load balancing
Health checking
Request routing"] - LB["Load Balancer
Replica selection
Circuit breaker
Retry logic"] + Proxy["Service Proxy: Load balancing, Health checking"] + LB["Load Balancer: Replica selection, Circuit breaker"] end subgraph Replicas["Replica Management"] - R1["Replica 1
GPU 0
Healthy"] - R2["Replica 2
GPU 1
Overloaded"] - R3["Replica 3
GPU 2
Failed"] - R4["Replica 4
GPU 3
Healthy"] + R1["Replica 1: GPU 0, Healthy"] + R2["Replica 2: GPU 1, Overloaded"] + R3["Replica 3: GPU 2, Failed"] + R4["Replica 4: GPU 3, Healthy"] end subgraph Compute["Actual Computation"] - Actor["Policy Actor
vLLM engine
Model weights
KV cache"] + Actor["Policy Actor: vLLM engine, Model weights, KV cache"] end Call --> Proxy diff --git a/docs/Tutorials/3_Monarch_101.MD b/docs/Tutorials/3_Monarch_101.MD index 0cbdcbd88..502d8a34d 100644 --- a/docs/Tutorials/3_Monarch_101.MD +++ b/docs/Tutorials/3_Monarch_101.MD @@ -13,20 +13,20 @@ graph TD end subgraph ForgeServices["2. Forge Service Layer"] - ServiceInterface["ServiceInterface
• Routes .route() to replica
• Handles load balancing
• Manages health checks"] - ServiceActor["ServiceActor
• Manages replica lifecycle
• Monitors health
• Coordinates failures"] + ServiceInterface["ServiceInterface: Routes requests, Load balancing, Health checks"] + ServiceActor["ServiceActor: Manages replicas, Monitors health, Coordinates failures"] end subgraph MonarchLayer["3. Monarch Actor Layer"] - ActorMesh["ActorMesh PolicyActor
• 4 PolicyActor instances
• Each on different GPU
• Message passing interface"] - ProcMesh["ProcMesh
• 4 processes
• GPU topology: 0,1,2,3
• Network interconnect"] + ActorMesh["ActorMesh PolicyActor: 4 instances, Different GPUs, Message passing"] + ProcMesh["ProcMesh: 4 processes, GPU topology 0,1,2,3, Network interconnect"] end subgraph Hardware["4. Physical Hardware"] - GPU0["GPU 0
PolicyActor #1
vLLM Engine
Model Weights"] - GPU1["GPU 1
PolicyActor #2
vLLM Engine
Model Weights"] - GPU2["GPU 2
PolicyActor #3
vLLM Engine
Model Weights"] - GPU3["GPU 3
PolicyActor #4
vLLM Engine
Model Weights"] + GPU0["GPU 0: PolicyActor #1, vLLM Engine, Model Weights"] + GPU1["GPU 1: PolicyActor #2, vLLM Engine, Model Weights"] + GPU2["GPU 2: PolicyActor #3, vLLM Engine, Model Weights"] + GPU3["GPU 3: PolicyActor #4, vLLM Engine, Model Weights"] end Call --> ServiceInterface @@ -199,10 +199,10 @@ graph TD end subgraph ActorMesh["ActorMesh PolicyActor"] - A0["PolicyActor
Instance #0
model=Qwen/Qwen3-7B
generation_count=0"] - A1["PolicyActor
Instance #1
model=Qwen/Qwen3-7B
generation_count=0"] - A2["PolicyActor
Instance #2
model=Qwen/Qwen3-7B
generation_count=0"] - A3["PolicyActor
Instance #3
model=Qwen/Qwen3-7B
generation_count=0"] + A0["PolicyActor Instance #0: model=Qwen/Qwen3-7B"] + A1["PolicyActor Instance #1: model=Qwen/Qwen3-7B"] + A2["PolicyActor Instance #2: model=Qwen/Qwen3-7B"] + A3["PolicyActor Instance #3: model=Qwen/Qwen3-7B"] end Code --> ProcMesh @@ -226,17 +226,17 @@ graph TD Client["await policy_actors.generate.METHOD(prompt)"] subgraph Methods["Different Adverbs Route Differently"] - Choose["choose()
→ Routes to ONE actor
→ Load balanced"] - Call["call()
→ Routes to ALL actors
→ Collects all results"] - Broadcast["broadcast()
→ Routes to ALL actors
→ Fire and forget"] - Stream["stream()
→ Routes to ALL actors
→ Iterator of results"] + Choose["choose(): Routes to ONE actor, Load balanced"] + Call["call(): Routes to ALL actors, Collects results"] + Broadcast["broadcast(): Routes to ALL actors, Fire and forget"] + Stream["stream(): Routes to ALL actors, Iterator of results"] end subgraph ActorInstances["PolicyActor Instances"] - A0["Actor 0
GPU 0
generates response"] - A1["Actor 1
GPU 1
generates response"] - A2["Actor 2
GPU 2
generates response"] - A3["Actor 3
GPU 3
generates response"] + A0["Actor 0: GPU 0, generates response"] + A1["Actor 1: GPU 1, generates response"] + A2["Actor 2: GPU 2, generates response"] + A3["Actor 3: GPU 3, generates response"] end Client --> Choose @@ -276,26 +276,26 @@ graph TD subgraph ServiceCreation["Service Creation Process"] Call["await PolicyActor.options(num_replicas=4, procs=1).as_service(model='Qwen')"] - ServiceActor["ServiceActor
• Manages 4 replicas
• Handles health checks
• Routes service calls"] + ServiceActor["ServiceActor: Manages 4 replicas, Health checks, Routes calls"] subgraph Replicas["4 Independent Replicas"] subgraph R0["Replica 0"] - PM0["ProcMesh
1 process
GPU 0"] + PM0["ProcMesh: 1 process, GPU 0"] AM0["ActorMesh
1 PolicyActor"] end subgraph R1["Replica 1"] - PM1["ProcMesh
1 process
GPU 1"] + PM1["ProcMesh: 1 process, GPU 1"] AM1["ActorMesh
1 PolicyActor"] end subgraph R2["Replica 2"] - PM2["ProcMesh
1 process
GPU 2"] + PM2["ProcMesh: 1 process, GPU 2"] AM2["ActorMesh
1 PolicyActor"] end subgraph R3["Replica 3"] - PM3["ProcMesh
1 process
GPU 3"] + PM3["ProcMesh: 1 process, GPU 3"] AM3["ActorMesh
1 PolicyActor"] end end @@ -325,15 +325,15 @@ graph TD subgraph CallFlow["Complete Call Flow"] UserCall["await policy_service.generate.route('What is 2+2?')"] - ServiceInterface["ServiceInterface
• Receives .route() call
• Routes to ServiceActor"] + ServiceInterface["ServiceInterface: Receives .route() call, Routes to ServiceActor"] - ServiceActor["ServiceActor
• Selects healthy replica
• Load balancing logic
• Failure handling"] + ServiceActor["ServiceActor: Selects healthy replica, Load balancing, Failure handling"] - SelectedReplica["Selected Replica #2
• ProcMesh with 1 process
• ActorMesh with 1 PolicyActor"] + SelectedReplica["Selected Replica #2: ProcMesh 1 process, ActorMesh 1 PolicyActor"] - PolicyActor["PolicyActor Instance
• Loads model
• Runs vLLM inference
• Returns 'The answer is 4'"] + PolicyActor["PolicyActor Instance: Loads model, Runs vLLM inference"] - GPU["GPU 2
• vLLM engine
• Model weights
• KV cache
• CUDA kernels"] + GPU["GPU 2: vLLM engine, Model weights, KV cache, CUDA kernels"] UserCall --> ServiceInterface ServiceInterface --> ServiceActor From 858c28b8dc6f3e07b989b40112dc9301ecf861f2 Mon Sep 17 00:00:00 2001 From: Sanyam Bhutani Date: Sun, 12 Oct 2025 11:48:16 -0700 Subject: [PATCH 17/28] fix colours --- docs/Tutorials/1_RL_and_Forge_Fundamentals.MD | 32 ++++---- docs/Tutorials/2_Forge_Internals.MD | 8 +- docs/Tutorials/3_Monarch_101.MD | 76 +++++++++---------- 3 files changed, 58 insertions(+), 58 deletions(-) diff --git a/docs/Tutorials/1_RL_and_Forge_Fundamentals.MD b/docs/Tutorials/1_RL_and_Forge_Fundamentals.MD index 26f90092c..2565d626e 100644 --- a/docs/Tutorials/1_RL_and_Forge_Fundamentals.MD +++ b/docs/Tutorials/1_RL_and_Forge_Fundamentals.MD @@ -25,9 +25,9 @@ graph TD ReplayBuffer --> Trainer Trainer --> Policy - style Policy fill:#99ff99 - style Reward fill:#ffcc99 - style Trainer fill:#ff99cc + style Policy fill:#4CAF50 + style Reward fill:#FF9800 + style Trainer fill:#E91E63 ``` ### RL Components Defined (Forge Names) @@ -100,10 +100,10 @@ graph LR C5 --> S5 C6 --> S6 - style C2 fill:#99ff99 - style S2 fill:#99ff99 - style C3 fill:#ffcc99 - style S3 fill:#ffcc99 + style C2 fill:#4CAF50 + style S2 fill:#4CAF50 + style C3 fill:#FF9800 + style S3 fill:#FF9800 ``` ### RL Step with Forge Services @@ -172,10 +172,10 @@ graph TD Dataset["Dataset (Question Bank): CPU intensive I/O, High memory bandwidth"] end - style Policy fill:#99ff99 - style Reward fill:#ffcc99 - style Trainer fill:#ff99cc - style Dataset fill:#ccccff + style Policy fill:#4CAF50 + style Reward fill:#FF9800 + style Trainer fill:#E91E63 + style Dataset fill:#2196F3 ``` ### Problem 2: Complex Interdependencies @@ -195,11 +195,11 @@ graph LR D --> E E --> A - style A fill:#99ff99 - style B fill:#ffcc99 - style C fill:#99ccff - style D fill:#ccff99 - style E fill:#ff99cc + style A fill:#4CAF50 + style B fill:#FF9800 + style C fill:#2196F3 + style D fill:#8BC34A + style E fill:#E91E63 ``` Each step has different: diff --git a/docs/Tutorials/2_Forge_Internals.MD b/docs/Tutorials/2_Forge_Internals.MD index ef53ddfe5..05a40e4a5 100644 --- a/docs/Tutorials/2_Forge_Internals.MD +++ b/docs/Tutorials/2_Forge_Internals.MD @@ -39,10 +39,10 @@ graph TD R1 --> Actor R4 --> Actor - style Call fill:#99ff99 - style LB fill:#ffcc99 - style R3 fill:#ff9999 - style Actor fill:#cc99ff + style Call fill:#4CAF50 + style LB fill:#FF9800 + style R3 fill:#F44336 + style Actor fill:#9C27B0 ``` ## Service Components Deep Dive diff --git a/docs/Tutorials/3_Monarch_101.MD b/docs/Tutorials/3_Monarch_101.MD index 502d8a34d..52bdb17d0 100644 --- a/docs/Tutorials/3_Monarch_101.MD +++ b/docs/Tutorials/3_Monarch_101.MD @@ -38,10 +38,10 @@ graph TD ProcMesh --> GPU2 ProcMesh --> GPU3 - style Call fill:#99ff99 - style ServiceActor fill:#ffcc99 - style ActorMesh fill:#cc99ff - style ProcMesh fill:#ccccff + style Call fill:#4CAF50 + style ServiceActor fill:#FF9800 + style ActorMesh fill:#9C27B0 + style ProcMesh fill:#2196F3 ``` ## Deep Dive: ProcMesh - The Foundation @@ -74,14 +74,14 @@ graph TD P7 -.->|"Network"| P0 end - style P0 fill:#ff9999 - style P1 fill:#ff9999 - style P2 fill:#ff9999 - style P3 fill:#ff9999 - style P4 fill:#ff9999 - style P5 fill:#ff9999 - style P6 fill:#ff9999 - style P7 fill:#ff9999 + style P0 fill:#F44336 + style P1 fill:#F44336 + style P2 fill:#F44336 + style P3 fill:#F44336 + style P4 fill:#F44336 + style P5 fill:#F44336 + style P6 fill:#F44336 + style P7 fill:#F44336 ``` ### Multi-Host ProcMesh @@ -122,9 +122,9 @@ 
graph TD H2P0 -.->|"InfiniBand"| H3P0 H2P1 -.->|"InfiniBand"| H3P1 - style PM1 fill:#ff9999 - style PM2 fill:#99ff99 - style PM3 fill:#99ccff + style PM1 fill:#F44336 + style PM2 fill:#4CAF50 + style PM3 fill:#2196F3 ``` ```python @@ -212,10 +212,10 @@ graph TD P3 --> A3 end - style A0 fill:#99ff99 - style A1 fill:#99ff99 - style A2 fill:#99ff99 - style A3 fill:#99ff99 + style A0 fill:#4CAF50 + style A1 fill:#4CAF50 + style A2 fill:#4CAF50 + style A3 fill:#4CAF50 ``` ### Message Routing Through ActorMesh @@ -259,10 +259,10 @@ graph TD Stream --> A3 end - style Choose fill:#99ff99 - style Call fill:#ffcc99 - style Broadcast fill:#ff99cc - style Stream fill:#cc99ff + style Choose fill:#4CAF50 + style Call fill:#FF9800 + style Broadcast fill:#E91E63 + style Stream fill:#9C27B0 ``` ## How Forge Services Use Monarch @@ -311,11 +311,11 @@ graph TD PM3 --> AM3 end - style ServiceActor fill:#ffcc99 - style AM0 fill:#99ff99 - style AM1 fill:#99ff99 - style AM2 fill:#99ff99 - style AM3 fill:#99ff99 + style ServiceActor fill:#FF9800 + style AM0 fill:#4CAF50 + style AM1 fill:#4CAF50 + style AM2 fill:#4CAF50 + style AM3 fill:#4CAF50 ``` ### Service Call to Actor Execution @@ -348,10 +348,10 @@ graph TD ServiceInterface -.->|"'The answer is 4'"| UserCall end - style UserCall fill:#99ff99 - style ServiceActor fill:#ffcc99 - style PolicyActor fill:#cc99ff - style GPU fill:#ffcccc + style UserCall fill:#4CAF50 + style ServiceActor fill:#FF9800 + style PolicyActor fill:#9C27B0 + style GPU fill:#FF5722 ``` ## Multiple Services Sharing Infrastructure @@ -400,12 +400,12 @@ graph TD BS --> C4 end - style PS fill:#99ff99 - style TS fill:#ff99cc - style RS fill:#ffcc99 - style BS fill:#cc99ff - style GPUMesh fill:#ffe6e6 - style CPUMesh fill:#e6f3ff + style PS fill:#4CAF50 + style TS fill:#E91E63 + style RS fill:#FF9800 + style BS fill:#9C27B0 + style GPUMesh fill:#FFEBEE + style CPUMesh fill:#E3F2FD ``` ## Key Insights: Why This Architecture Matters From 3862ce9d82fac3f5348d777ad892c3160e40dc84 Mon Sep 17 00:00:00 2001 From: Sanyam Bhutani Date: Sun, 12 Oct 2025 12:03:09 -0700 Subject: [PATCH 18/28] fix linter and ohter comments --- docs/Tutorials/1_RL_and_Forge_Fundamentals.MD | 114 +++++++++--------- docs/Tutorials/2_Forge_Internals.MD | 98 +++++++-------- docs/Tutorials/ReadMe.MD | 6 +- 3 files changed, 109 insertions(+), 109 deletions(-) diff --git a/docs/Tutorials/1_RL_and_Forge_Fundamentals.MD b/docs/Tutorials/1_RL_and_Forge_Fundamentals.MD index 2565d626e..39b6d62aa 100644 --- a/docs/Tutorials/1_RL_and_Forge_Fundamentals.MD +++ b/docs/Tutorials/1_RL_and_Forge_Fundamentals.MD @@ -16,7 +16,7 @@ graph TD ReplayBuffer["Replay Buffer: stores experiences"] Trainer["Trainer: improves student"] end - + Dataset --> Policy Policy --> Reward Policy --> Reference @@ -24,7 +24,7 @@ graph TD Reference --> ReplayBuffer ReplayBuffer --> Trainer Trainer --> Policy - + style Policy fill:#4CAF50 style Reward fill:#FF9800 style Trainer fill:#E91E63 @@ -47,25 +47,25 @@ graph TD def conceptual_rl_step(): # 1. Get a math problem question = dataset.sample() # "What is 2+2?" - - # 2. Student generates answer + + # 2. Student generates answer answer = policy.generate(question) # "The answer is 4" - + # 3. Teacher grades it score = reward_model.evaluate(question, answer) # 0.95 - + # 4. Compare to original student baseline = reference_model.compute_logprobs(question, answer) - + # 5. Store the experience experience = Episode(question, answer, score, baseline) replay_buffer.add(experience) - + # 6. 
When enough experiences collected, improve student batch = replay_buffer.sample(curr_policy_version=0) if batch is not None: trainer.train_step(batch) # Student gets better! - + # 🔄 See complete working example below with actual Forge service calls ``` @@ -83,7 +83,7 @@ graph LR C5["Replay Buffer"] C6["Trainer"] end - + subgraph Services["Forge Services (Real Classes)"] S1["DatasetActor"] S2["Policy"] @@ -92,14 +92,14 @@ graph LR S5["ReplayBuffer"] S6["RLTrainer"] end - + C1 --> S1 C2 --> S2 C3 --> S3 C4 --> S4 C5 --> S5 C6 --> S6 - + style C2 fill:#4CAF50 style S2 fill:#4CAF50 style C3 fill:#FF9800 @@ -117,26 +117,26 @@ async def conceptual_forge_rl_step(services, step): # 1. Get a math problem - Using actual DatasetActor API sample = await services['dataloader'].sample.call_one() question, target = sample["request"], sample["target"] - + # 2. Student generates answer - Using actual Policy API responses = await services['policy'].generate.route(prompt=question) - answer = responses[0].text - + answer = responses[0].text + # 3. Teacher grades it - Using actual RewardActor API score = await services['reward_actor'].evaluate_response.route( prompt=question, response=answer, target=target ) - + # 4. Compare to baseline - Using actual ReferenceModel API # Note: ReferenceModel.forward requires input_ids, max_req_tokens, return_logprobs ref_logprobs = await services['ref_model'].forward.route( input_ids, max_req_tokens, return_logprobs=True ) - + # 5. Store experience - Using actual Episode structure from apps/grpo/main.py episode = create_episode_from_response(responses[0], score, ref_logprobs, step) await services['replay_buffer'].add.call_one(episode) - + # 6. Improve student - Using actual training pattern batch = await services['replay_buffer'].sample.call_one( curr_policy_version=step @@ -160,23 +160,12 @@ Our simple RL loop above has complex requirements: #### Problem 1: Different Resource Needs -```mermaid -graph TD - subgraph Components["Each Component Needs Different Resources"] - Policy["Policy (Student AI): Large GPU memory, Multiple replicas"] - - Reward["Reward Model (Teacher): Moderate compute, CPU/small GPU"] - - Trainer["Trainer (Tutor): Massive GPU compute, Distributed training"] - - Dataset["Dataset (Question Bank): CPU intensive I/O, High memory bandwidth"] - end - - style Policy fill:#4CAF50 - style Reward fill:#FF9800 - style Trainer fill:#E91E63 - style Dataset fill:#2196F3 -``` +| Component | Resource Needs | Scaling Strategy | +|-----------|----------------|------------------| +| **Policy** (Student AI) | Large GPU memory | Multiple replicas for throughput | +| **Reward Heuristic** (Teacher) | Small compute | CPU or small GPU | +| **Trainer** (Tutor) | Massive GPU compute | Distributed training | +| **Dataset** (Question Bank) | CPU intensive I/O | High memory bandwidth | ### Problem 2: Complex Interdependencies @@ -187,14 +176,14 @@ graph LR C["Reference: Original Student
Provides baseline comparison"] D["Replay Buffer: Notebook
Stores: question + answer + score"] E["Trainer: Tutor
Improves student using experiences"] - + A --> B A --> C B --> D C --> D D --> E E --> A - + style A fill:#4CAF50 style B fill:#FF9800 style C fill:#2196F3 @@ -203,7 +192,7 @@ graph LR ``` Each step has different: -- **Latency requirements**: Policy inference needs low latency, training can batch +- **Latency requirements**: Policy inference needs low latency (each episode waits), training can batch multiple episodes together - **Scaling patterns**: Need N policy replicas to keep trainer busy, plus different sharding strategies (tensor parallel for training vs replicated inference) - **Failure modes**: Any component failure cascades to halt the entire pipeline (Forge prevents this with automatic failover) - **Resource utilization**: GPUs for inference/training, CPUs for data processing @@ -218,10 +207,10 @@ def naive_rl_step(): # Policy waits idle while reward model works response = policy_model.generate(prompt) # GPU busy reward = reward_model.evaluate(prompt, response) # Policy GPU idle - - # Training waits for single episode + + # Training waits for single episode loss = compute_loss(response, reward) # Batch size = 1, inefficient - + # Everything stops if any component fails if policy_fails or reward_fails or trainer_fails: entire_system_stops() @@ -233,32 +222,37 @@ Forge solves these problems by treating each RL component as an **independent, d Let's see how core RL concepts map to Forge components (you'll notice a mix of `.route()` for services and `.call_one()` for actors - we cover when to use each in Part 2): +**Quick API Reference:** (covered in detail in Part 2: Service Communication Patterns) +- `.route()` - Send request to any healthy replica in a service (load balanced) +- `.call_one()` - Send request to a single actor instance +- `.fanout()` - Send request to ALL replicas in a service + ```python async def real_rl_training_step(services, step): """Single RL step using verified Forge APIs""" - + # 1. Environment interaction - Using actual DatasetActor API sample = await services['dataloader'].sample.call_one() prompt, target = sample["request"], sample["target"] - + responses = await services['policy'].generate.route(prompt) - + # 2. Reward computation - Using actual RewardActor API score = await services['reward_actor'].evaluate_response.route( prompt=prompt, response=responses[0].text, target=target ) - + # 3. Get reference logprobs - Using actual ReferenceModel API # Note: ReferenceModel requires full input_ids tensor, not just tokens input_ids = torch.cat([responses[0].prompt_ids, responses[0].token_ids]) ref_logprobs = await services['ref_model'].forward.route( input_ids.unsqueeze(0), max_req_tokens=512, return_logprobs=True ) - + # 4. Experience storage - Using actual Episode pattern from GRPO episode = create_episode_from_response(responses[0], score, ref_logprobs, step) await services['replay_buffer'].add.call_one(episode) - + # 5. Learning - Using actual trainer pattern batch = await services['replay_buffer'].sample.call_one( curr_policy_version=step @@ -266,11 +260,11 @@ async def real_rl_training_step(services, step): if batch is not None: inputs, targets = batch # GRPO returns (inputs, targets) tuple loss = await services['trainer'].train_step.call(inputs, targets) - + # 6. 
Policy synchronization - Using actual weight update pattern await services['trainer'].push_weights.call(step + 1) await services['policy'].update_weights.fanout(step + 1) - + return loss ``` @@ -286,7 +280,7 @@ answer = responses[0].text # responses is list[Completion] Forge handles behind the scenes: - Routing to least loaded replica -- GPU memory management +- GPU memory management - Batch optimization - Failure recovery - Auto-scaling based on demand @@ -365,10 +359,16 @@ group_size = 1 ) ``` -Production scaling - multiply num_replicas for services or spawn multiple actors: -- Policy: num_replicas=8 for high inference demand -- RewardActor: num_replicas=16 for parallel evaluation -- Trainer: Multiple processes for distributed training (RLTrainer handles this internally) +**Forge Components: Services vs Actors** + +Forge has two types of distributed components: +- **Services**: Multiple replicas with automatic load balancing (like Policy, RewardActor) +- **Actors**: Single instances that handle their own internal distribution (like RLTrainer, ReplayBuffer) + +We cover this distinction in detail in Part 2, but for now this explains the scaling patterns: +- Policy service: num_replicas=8 for high inference demand +- RewardActor service: num_replicas=16 for parallel evaluation +- RLTrainer actor: Single instance with internal distributed training ### Fault Tolerance @@ -377,13 +377,13 @@ Production scaling - multiply num_replicas for services or spawn multiple actors responses = await policy.generate.route(prompt=question) answer = responses[0].text # -> Forge automatically routes to healthy replica -# -> Failed replica respawns in background +# -> Failed replica respawns in background # -> No impact on training loop # If reward service fails: score = await reward_actor.evaluate_response.route( prompt=question, response=answer, target=target -) +) ``` - Retries on different replica automatically @@ -392,4 +392,4 @@ score = await reward_actor.evaluate_response.route( This is fundamentally different from monolithic RL implementations where any component failure stops everything! -In the next Section, we will go a layer deeper and learn how ForgeServices work. Continue to [Part 2 here](./2_Forge_Internals.MD) \ No newline at end of file +In the next Section, we will go a layer deeper and learn how ForgeServices work. Continue to [Part 2 here](./2_Forge_Internals.MD) diff --git a/docs/Tutorials/2_Forge_Internals.MD b/docs/Tutorials/2_Forge_Internals.MD index 05a40e4a5..e1af9cde3 100644 --- a/docs/Tutorials/2_Forge_Internals.MD +++ b/docs/Tutorials/2_Forge_Internals.MD @@ -13,23 +13,23 @@ When you call `await policy_service.generate(question)`, here's what actually ha ```mermaid graph TD Call["Your Code:
await policy_service.generate"] - + subgraph ServiceLayer["Service Layer"] Proxy["Service Proxy: Load balancing, Health checking"] LB["Load Balancer: Replica selection, Circuit breaker"] end - + subgraph Replicas["Replica Management"] R1["Replica 1: GPU 0, Healthy"] R2["Replica 2: GPU 1, Overloaded"] R3["Replica 3: GPU 2, Failed"] R4["Replica 4: GPU 3, Healthy"] end - + subgraph Compute["Actual Computation"] Actor["Policy Actor: vLLM engine, Model weights, KV cache"] end - + Call --> Proxy Proxy --> LB LB --> R1 @@ -38,7 +38,7 @@ graph TD LB --> R4 R1 --> Actor R4 --> Actor - + style Call fill:#4CAF50 style LB fill:#FF9800 style R3 fill:#F44336 @@ -55,7 +55,7 @@ Here's the actual ServiceConfig from Forge source code: # Configuration pattern from apps/grpo/main.py: Policy.options( procs=1, # Processes per replica - num_replicas=4, # Number of replicas + num_replicas=4, # Number of replicas with_gpus=True # Allocate GPUs # Other available options: # hosts=None # the number of remote hosts used per replica @@ -69,7 +69,7 @@ Services are created using the `.options().as_service()` pattern from the actual The service creation automatically handles: - Spawning actor replicas across processes/GPUs - Load balancing with .route() method for services -- Health monitoring and failure recovery +- Health monitoring and failure recovery - Message routing and serialization ```python @@ -78,8 +78,8 @@ from forge.actors.policy import Policy model = "Qwen/Qwen3-1.7B" policy = await Policy.options( - procs=1, - with_gpus=True, + procs=1, + with_gpus=True, num_replicas=1 ).as_service( engine_config={ @@ -158,7 +158,7 @@ Behind the scenes: ```python # Get version from all policy replicas current_versions = await policy.get_version.fanout() -# Returns: [version_replica_1, version_replica_2, ...] +# Returns: [version_replica_1, version_replica_2, ...] # Update weights on all replicas await policy.update_weights.fanout(new_policy_version) @@ -193,8 +193,8 @@ while training: ``` **Performance characteristics**: -- **Latency**: Process first result immediately -- **Throughput**: Pipeline parallelism (much higher than sequential) +- **Latency**: Process first result immediately +- **Throughput**: Non-blocking async operations (much higher than waiting for full batches) - **Fault tolerance**: Continues if some replicas fail **Critical insight**: This is essential for high-throughput RL where you can't wait for batches. @@ -242,7 +242,7 @@ async with counter_service.session(): print(await counter_service.increment.route()) # 1 print(await counter_service.increment.route()) # 2 print(await counter_service.increment.route()) # 3 - + final_value = await counter_service.get_value.route() print(f"Final value on this replica: {final_value}") # 3 @@ -263,7 +263,7 @@ await counter_service.shutdown() The most complex challenge in distributed RL is maintaining state consistency while maximizing performance. -### The KV Cache Problem +### The KV Cache Problem **The challenge**: Policy inference is much faster with KV cache, but cache is tied to specific conversation history. @@ -278,16 +278,16 @@ async def naive_multi_turn(): **The solution**: Sticky sessions ensure all calls go to same replica. 
-```python +```python async def optimized_multi_turn(): async with policy.session(): # All calls guaranteed to hit same replica = cache hits response1 = await policy.generate.route(prompt=question1) - full_prompt = question1 + response1[0].text + full_prompt = question1 + response1[0].text response2 = await policy.generate.route(prompt=full_prompt) # Cache hit! conversation = full_prompt + response2[0].text response3 = await policy.generate.route(prompt=conversation) # Cache hit! - + # Session ends, replica can be garbage collected or reused ``` @@ -327,11 +327,11 @@ batch = await replay_buffer.sample.call_one( async def real_weight_sync(trainer, policy, step): # Trainer pushes weights to TorchStore with version number await trainer.push_weights.call_one(policy_version=step + 1) - - # Policy service updates to new version from TorchStore + + # Policy service updates to new version from TorchStore # Use .fanout() to update ALL policy replicas await policy.update_weights.fanout(policy_version=step + 1) - + # Check current policy version current_version = await policy.get_version.route() print(f"Current policy version: {current_version}") @@ -349,29 +349,29 @@ Instead of manual coordination, Forge services handle speed mismatches automatic from apps.grpo.main import Episode, Group async def simple_rl_step(): - + # ===== Generate a rollout ===== sample = await dataloader.sample.call_one() # DatasetActor is an actor, not service prompt, target = sample["request"], sample["target"] # Correct field names - + print(f"Prompt: {prompt}") print(f"Target: {target}") - + actions = await policy.generate.route(prompt=prompt) # Policy is a service print(f"Policy response: {actions[0].text}") - + # Create input tensor for reference model (requires full context) input_ids = torch.cat([actions[0].prompt_ids, actions[0].token_ids]) ref_logprobs = await ref_model.forward.route( input_ids.unsqueeze(0), max_req_tokens=512, return_logprobs=True - ) + ) reward = await reward_actor.evaluate_response.route( # RewardActor is a service - prompt=prompt, - response=actions[0].text, + prompt=prompt, + response=actions[0].text, target=target ) print(f"Reward: {reward}") - + # Create episode using actual GRPO Episode structure episode = Episode( episode_id="0", @@ -382,24 +382,24 @@ async def simple_rl_step(): response_len=512, target=target ) - + # Add response data episode.response = actions[0].text episode.request_tokens = actions[0].prompt_ids.tolist() episode.response_tokens = actions[0].token_ids.tolist() episode.ref_logprobs = ref_logprobs[0] # Extract from batch dimension episode.reward = reward - + # Compute advantages using actual ComputeAdvantages actor group = Group.new_group(0, 1, prompt, 0, tokenizer.pad_token_id, 512, 512, target) group.episodes[0] = episode advantages = await compute_advantages.compute.call_one(group) # ComputeAdvantages is an actor episode.advantage = advantages[0] - print(f"Advantage: {advantages[0]}") + print(f"Advantage: {advantages[0]}") await replay_buffer.add.call_one(episode) # ReplayBuffer is an actor print("Episode stored in replay buffer") - - # ===== Train on the batch ===== + + # ===== Train on the batch ===== batch = await replay_buffer.sample.call_one(curr_policy_version=0) if batch is not None: print("Training on batch...") @@ -469,12 +469,12 @@ class RewardActor(ForgeActor): async def evaluate_response(self, prompt: str, response: str, target: str) -> float: """Evaluate response quality using multiple reward functions""" total_reward = 0.0 - + for reward_fn in 
self.reward_functions: # Each reward function contributes to total score reward = reward_fn(prompt, response, target) total_reward += reward - + # Return average reward across all functions return total_reward / len(self.reward_functions) if self.reward_functions else 0.0 @@ -490,7 +490,7 @@ target = "36" score = await reward_actor.evaluate_response.route( prompt=prompt, - response=response, + response=response, target=target ) print(f"Reward score: {score}") # Usually around 1.0 for correct math answers @@ -530,7 +530,7 @@ print("Initializing all services...") reward_actor, ) = await asyncio.gather( DatasetActor.options(procs=1).as_actor( - path="openai/gsm8k", revision="main", data_split="train", + path="openai/gsm8k", revision="main", data_split="train", streaming=True, model="Qwen/Qwen3-1.7B" ), Policy.options(procs=1, with_gpus=True, num_replicas=1).as_service( @@ -559,41 +559,41 @@ print("All services initialized successfully!") async def production_training_loop(): """Real training loop pattern from apps/grpo/main.py""" step = 0 - + while True: - # Data generation + # Data generation sample = await dataloader.sample.call_one() - + # Policy generation service call responses = await policy.generate.route(sample["request"]) # Correct field name - + # Reference computation service call (requires full input tensor) input_ids = torch.cat([responses[0].prompt_ids, responses[0].token_ids]) ref_logprobs = await ref_model.forward.route( input_ids.unsqueeze(0), max_req_tokens=512, return_logprobs=True ) - - # Reward evaluation service call + + # Reward evaluation service call reward = await reward_actor.evaluate_response.route( prompt=sample["question"], response=responses[0].text, target=sample["answer"] ) - + # Experience storage (using actual Episode structure) episode = create_episode_from_grpo_data(sample, responses[0], reward, ref_logprobs[0], step) await replay_buffer.add.call_one(episode) - + # Training when ready batch = await replay_buffer.sample.call_one(curr_policy_version=step) if batch is not None: inputs, targets = batch # GRPO returns (inputs, targets) tuple loss = await trainer.train_step.call(inputs, targets) - + # Weight synchronization pattern await trainer.push_weights.call(step + 1) await policy.update_weights.fanout(step + 1) # Fanout to all replicas - + print(f"Step {step}, Loss: {loss:.4f}") step += 1 @@ -612,11 +612,11 @@ print("All services shut down successfully!") **Key observations:** 1. **Parallelism**: Independent operations run concurrently -2. **Load balancing**: Each `.route()` call automatically selects optimal replica +2. **Load balancing**: Each `.route()` call automatically selects optimal replica 3. **Fault tolerance**: Failures automatically retry on different replicas 4. **Resource efficiency**: CPU and GPU services scale independently 5. **Coordination**: Services coordinate through shared state (replay buffer, weight versions) This is the power of the service abstraction - complex distributed coordination looks like simple async Python code. 
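Before moving on, the parallelism in the observations above is worth seeing in code. The sketch below is not from apps/grpo/main.py; the `gather_rollouts` helper and the `samples` list are illustrative, but the `.route()` calls are the same ones used throughout this part. Independent rollouts are overlapped with `asyncio.gather`, while each call is still load-balanced (and retried) by the service layer:

```python
import asyncio

async def gather_rollouts(policy, reward_actor, samples):
    """Illustrative helper: overlap independent rollouts across service replicas."""

    async def one_rollout(sample):
        # Each .route() call is independently load-balanced to a healthy replica
        responses = await policy.generate.route(prompt=sample["request"])
        reward = await reward_actor.evaluate_response.route(
            prompt=sample["request"],
            response=responses[0].text,
            target=sample["target"],
        )
        return responses[0], reward

    # Wall-clock time is roughly the slowest rollout, not the sum of all rollouts
    return await asyncio.gather(*(one_rollout(s) for s in samples))
```

This is the property the first key observation points to: the service layer never forces independent calls to wait on each other.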
-In the next part we will learn about [Monarch internals](./3_Monarch_101.MD) \ No newline at end of file +In the next part we will learn about [Monarch internals](./3_Monarch_101.MD) diff --git a/docs/Tutorials/ReadMe.MD b/docs/Tutorials/ReadMe.MD index 7798b147d..084710853 100644 --- a/docs/Tutorials/ReadMe.MD +++ b/docs/Tutorials/ReadMe.MD @@ -4,7 +4,7 @@ A comprehensive guide for ML Engineers building distributed RL systems for langu Some of the examples mentioned below will be conceptual in nature for understanding. Please refer to API Docs (Coming Soon!) for more details -Welcome to the Tutorials section! This section is inspired by the A-Z PyTorch tutorial, shoutout to our PyTorch friends that remember! +Welcome to the Tutorials section! This section is inspired by the A-Z PyTorch tutorial, shoutout to our PyTorch friends that remember! ### @@ -14,6 +14,6 @@ This section currently is structured in 3 detailed parts: 2. [Forge Internals](./2_Forge_Internals.MD): Goes a layer deeper and explains the internals of Forge 3. [Monarch 101](./3_Monarch_101.MD): It's a 101 to Monarch and how Forge Talks to Monarch -Each part builds upon the next and the entire section can be consumed in roughly an hour-Grab a Chai and Enjoy! +Each part builds upon the next and the entire section can be consumed in roughly an hour - Grab a Chai and Enjoy! -If you're eager, please checkout our SFT Tutorial too (Coming soon!) as well as [App Examples](../../apps/). \ No newline at end of file +If you're eager, please check out our SFT Tutorial too (Coming soon!) as well as [App Examples](../../apps/). From c8c26ab21346e924db5d05c540b22524df6d0035 Mon Sep 17 00:00:00 2001 From: Sanyam Bhutani Date: Mon, 13 Oct 2025 15:04:16 -0700 Subject: [PATCH 19/28] address felipe's comments, add image and fix sticky session examples --- docs/Tutorials/2_Forge_Internals.MD | 83 +++++++++++++++++++++++------ 1 file changed, 66 insertions(+), 17 deletions(-) diff --git a/docs/Tutorials/2_Forge_Internals.MD b/docs/Tutorials/2_Forge_Internals.MD index e1af9cde3..8189cf8a5 100644 --- a/docs/Tutorials/2_Forge_Internals.MD +++ b/docs/Tutorials/2_Forge_Internals.MD @@ -108,22 +108,54 @@ Forge services are implemented as ServiceActors that manage collections of your -Forge internals - What happens behind the scenes: -1. `.as_service()` creates a `ServiceInterface` -2. `ServiceInterface` manages N replicas of your `ForgeActor` class -3. `ServiceInterface` handles routing between replicas -4. You get methods like `.route()`, `.fanout()`, etc. +When you call `.as_service()`, Forge creates a `ServiceInterface` that manages N replicas of your `ForgeActor` class and gives you methods like `.route()`, `.fanout()`, etc.
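To make that lifecycle concrete, here is a minimal sketch built around a toy actor. The `EchoActor` name and its `prefix` argument are hypothetical; the pattern (a `ForgeActor` subclass with `@endpoint` methods, spawned via `.options().as_service()`) is the same one the `ForgeCounter` example uses later in this part:

```python
from forge.controller import ForgeActor
from monarch.actor import endpoint

class EchoActor(ForgeActor):
    """Toy actor used only to illustrate the service lifecycle."""

    def __init__(self, prefix: str):
        self.prefix = prefix

    @endpoint
    async def echo(self, text: str) -> str:
        return f"{self.prefix}{text}"

# .options() declares the replica layout; .as_service() spawns the replicas
# and returns the ServiceInterface that routes your calls.
echo = await EchoActor.options(procs=1, num_replicas=2).as_service(prefix="> ")

print(await echo.echo.route("hello"))   # load-balanced to one healthy replica
print(await echo.echo.fanout("hello"))  # one result per replica, returned as a list

await echo.shutdown()
```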
```python -# Your code sees this: +# Your code sees this simple interface: responses = await policy.generate.route(prompt=prompt) +# But Forge handles all the complexity of replica management, load balancing, and fault tolerance ``` -But behind the scenes: -- `ServiceInterface` selects healthy replica -- Routes message to that replica's `Policy.generate()` endpoint -- Handles failures and retries automatically -- Returns list[Completion] from the selected replica +## Communication Patterns: Quick Reference + +**API Summary:** +- `.route()` - Send request to any healthy replica in a service (load balanced) +- `.call_one()` - Send request to a single actor instance +- `.fanout()` - Send request to ALL replicas in a service + +```mermaid +graph LR + subgraph Request["Your Request"] + Code["await service.method.ADVERB()"] + end + + subgraph Patterns["Communication Patterns"] + Route[".route()
→ One healthy replica"] + CallOne[".call_one()
→ Single actor"] + Fanout[".fanout()
→ ALL replicas"] + end + + subgraph Replicas["Replicas/Actors"] + R1["Replica 1"] + R2["Replica 2"] + R3["Replica 3"] + A1["Actor"] + end + + Code --> Route + Code --> CallOne + Code --> Fanout + + Route --> R2 + CallOne --> A1 + Fanout --> R1 + Fanout --> R2 + Fanout --> R3 + + style Route fill:#4CAF50 + style CallOne fill:#FF9800 + style Fanout fill:#9C27B0 +``` ## Deep Dive: Service Communication Patterns @@ -203,8 +235,10 @@ while training: **When to use**: When you need multiple calls to hit the same replica (like KV cache preservation). +**What are sticky sessions?** A session ensures all your service calls within the `async with` block go to the same replica, instead of being load-balanced across different replicas. + ```python -# This Counter example demonstrates the session pattern +# This Counter example demonstrates the difference between regular routing and sessions from forge.controller import ForgeActor from monarch.actor import endpoint @@ -230,22 +264,37 @@ counter_service = await ForgeCounter.options( procs=1, num_replicas=4 ).as_service(initial_value=0) -# Test basic operations -await counter_service.increment.route() +# WITHOUT SESSIONS: Each .route() call goes to a different replica +await counter_service.increment.route() # Might go to replica 2 +await counter_service.increment.route() # Might go to replica 1 +await counter_service.increment.route() # Might go to replica 3 + results = await counter_service.increment.fanout() # Get from all replicas print(f"All replica values: {results}") +# Output: All replica values: [1, 2, 1, 1] - Each replica has different state! +``` -# STICKY SESSIONS +The problem: each `.route()` call can go to different replicas, creating inconsistent state. + +```python +# WITH SESSIONS: All calls go to the SAME replica print("\nUsing sticky sessions:") -async with counter_service.session(): +async with counter_service.session(): # Creates a session that picks one replica await counter_service.reset.route() # Uses .route() within session print(await counter_service.increment.route()) # 1 - print(await counter_service.increment.route()) # 2 + print(await counter_service.increment.route()) # 2 print(await counter_service.increment.route()) # 3 final_value = await counter_service.get_value.route() print(f"Final value on this replica: {final_value}") # 3 +# Output: +# Using sticky sessions: +# 1 +# 2 +# 3 +# Final value on this replica: 3 + # Same pattern works with Policy for multi-turn conversations: # async with policy.session(): # response1 = await policy.generate.route(turn1) From 78539f0e04749dcd8084ddf411744b2c667a499c Mon Sep 17 00:00:00 2001 From: Sanyam Bhutani Date: Mon, 13 Oct 2025 15:07:27 -0700 Subject: [PATCH 20/28] fix PR tests --- docs/Tutorials/2_Forge_Internals.MD | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/docs/Tutorials/2_Forge_Internals.MD b/docs/Tutorials/2_Forge_Internals.MD index 8189cf8a5..1a9421a96 100644 --- a/docs/Tutorials/2_Forge_Internals.MD +++ b/docs/Tutorials/2_Forge_Internals.MD @@ -120,7 +120,7 @@ responses = await policy.generate.route(prompt=prompt) **API Summary:** - `.route()` - Send request to any healthy replica in a service (load balanced) -- `.call_one()` - Send request to a single actor instance +- `.call_one()` - Send request to a single actor instance - `.fanout()` - Send request to ALL replicas in a service ```mermaid @@ -128,30 +128,30 @@ graph LR subgraph Request["Your Request"] Code["await service.method.ADVERB()"] end - + subgraph Patterns["Communication 
Patterns"] Route[".route()
→ One healthy replica"] CallOne[".call_one()
→ Single actor"] Fanout[".fanout()
→ ALL replicas"] end - + subgraph Replicas["Replicas/Actors"] R1["Replica 1"] R2["Replica 2"] R3["Replica 3"] A1["Actor"] end - + Code --> Route Code --> CallOne Code --> Fanout - + Route --> R2 CallOne --> A1 Fanout --> R1 Fanout --> R2 Fanout --> R3 - + style Route fill:#4CAF50 style CallOne fill:#FF9800 style Fanout fill:#9C27B0 @@ -266,7 +266,7 @@ counter_service = await ForgeCounter.options( # WITHOUT SESSIONS: Each .route() call goes to a different replica await counter_service.increment.route() # Might go to replica 2 -await counter_service.increment.route() # Might go to replica 1 +await counter_service.increment.route() # Might go to replica 1 await counter_service.increment.route() # Might go to replica 3 results = await counter_service.increment.fanout() # Get from all replicas @@ -282,7 +282,7 @@ print("\nUsing sticky sessions:") async with counter_service.session(): # Creates a session that picks one replica await counter_service.reset.route() # Uses .route() within session print(await counter_service.increment.route()) # 1 - print(await counter_service.increment.route()) # 2 + print(await counter_service.increment.route()) # 2 print(await counter_service.increment.route()) # 3 final_value = await counter_service.get_value.route() From aa3d85c97481334baceb86de98227c30b2c6ff82 Mon Sep 17 00:00:00 2001 From: Svetlana Karslioglu Date: Tue, 14 Oct 2025 11:57:42 -0700 Subject: [PATCH 21/28] Convert tutorials to .py --- docs/Tutorials/1_RL_and_Forge_Fundamentals.MD | 395 --------- docs/Tutorials/2_Forge_Internals.MD | 671 --------------- docs/Tutorials/3_Monarch_101.MD | 439 ---------- docs/source/conf.py | 12 +- docs/source/tutorial_sources/README.txt | 5 - .../1_RL_and_Forge_Fundamentals.py | 503 ++++++++++++ .../zero-to-forge/2_Forge_Internals.py | 767 ++++++++++++++++++ .../zero-to-forge/3_Monarch_101.py | 572 +++++++++++++ .../tutorial_sources/zero-to-forge/README.md} | 21 +- docs/source/tutorials.md | 5 +- docs/source/zero-to-forge-intro.md | 28 + 11 files changed, 1897 insertions(+), 1521 deletions(-) delete mode 100644 docs/Tutorials/1_RL_and_Forge_Fundamentals.MD delete mode 100644 docs/Tutorials/2_Forge_Internals.MD delete mode 100644 docs/Tutorials/3_Monarch_101.MD create mode 100644 docs/source/tutorial_sources/zero-to-forge/1_RL_and_Forge_Fundamentals.py create mode 100644 docs/source/tutorial_sources/zero-to-forge/2_Forge_Internals.py create mode 100644 docs/source/tutorial_sources/zero-to-forge/3_Monarch_101.py rename docs/{Tutorials/ReadMe.MD => source/tutorial_sources/zero-to-forge/README.md} (57%) create mode 100644 docs/source/zero-to-forge-intro.md diff --git a/docs/Tutorials/1_RL_and_Forge_Fundamentals.MD b/docs/Tutorials/1_RL_and_Forge_Fundamentals.MD deleted file mode 100644 index 39b6d62aa..000000000 --- a/docs/Tutorials/1_RL_and_Forge_Fundamentals.MD +++ /dev/null @@ -1,395 +0,0 @@ -# Part 1: RL Fundamentals - Using Forge Terminology - -## Core RL Components in Forge - -Let's start with a simple math tutoring example to understand RL concepts with the exact names Forge uses: - -### The Toy Example: Teaching Math - -```mermaid -graph TD - subgraph Example["Math Tutoring RL Example"] - Dataset["Dataset: math problems"] - Policy["Policy: student AI"] - Reward["Reward Model: scores answers"] - Reference["Reference Model: baseline"] - ReplayBuffer["Replay Buffer: stores experiences"] - Trainer["Trainer: improves student"] - end - - Dataset --> Policy - Policy --> Reward - Policy --> Reference - Reward --> ReplayBuffer - Reference --> ReplayBuffer - ReplayBuffer 
--> Trainer - Trainer --> Policy - - style Policy fill:#4CAF50 - style Reward fill:#FF9800 - style Trainer fill:#E91E63 -``` - -### RL Components Defined (Forge Names) - -1. **Dataset**: Provides questions/prompts (like "What is 2+2?") -2. **Policy**: The AI being trained (generates answers like "The answer is 4") -3. **Reward Model**: Evaluates answer quality (gives scores like 0.95) -4. **Reference Model**: Original policy copy (prevents drift from baseline) -5. **Replay Buffer**: Stores experiences (question + answer + score) -6. **Trainer**: Updates the policy weights based on experiences - -### The RL Learning Flow - -```python -# CONCEPTUAL EXAMPLE - see apps/grpo/main.py for GRPO Code - -def conceptual_rl_step(): - # 1. Get a math problem - question = dataset.sample() # "What is 2+2?" - - # 2. Student generates answer - answer = policy.generate(question) # "The answer is 4" - - # 3. Teacher grades it - score = reward_model.evaluate(question, answer) # 0.95 - - # 4. Compare to original student - baseline = reference_model.compute_logprobs(question, answer) - - # 5. Store the experience - experience = Episode(question, answer, score, baseline) - replay_buffer.add(experience) - - # 6. When enough experiences collected, improve student - batch = replay_buffer.sample(curr_policy_version=0) - if batch is not None: - trainer.train_step(batch) # Student gets better! - -# 🔄 See complete working example below with actual Forge service calls -``` - -## From Concepts to Forge Services - -Here's the key insight: **Each RL component becomes a Forge service**. The toy example above maps directly to Forge: - -```mermaid -graph LR - subgraph Concepts["RL Concepts"] - C1["Dataset"] - C2["Policy"] - C3["Reward Model"] - C4["Reference Model"] - C5["Replay Buffer"] - C6["Trainer"] - end - - subgraph Services["Forge Services (Real Classes)"] - S1["DatasetActor"] - S2["Policy"] - S3["RewardActor"] - S4["ReferenceModel"] - S5["ReplayBuffer"] - S6["RLTrainer"] - end - - C1 --> S1 - C2 --> S2 - C3 --> S3 - C4 --> S4 - C5 --> S5 - C6 --> S6 - - style C2 fill:#4CAF50 - style S2 fill:#4CAF50 - style C3 fill:#FF9800 - style S3 fill:#FF9800 -``` - -### RL Step with Forge Services - -Let's look at the example from above again, but this time we would use the names from Forge: - -```python -# Conceptual Example - -async def conceptual_forge_rl_step(services, step): - # 1. Get a math problem - Using actual DatasetActor API - sample = await services['dataloader'].sample.call_one() - question, target = sample["request"], sample["target"] - - # 2. Student generates answer - Using actual Policy API - responses = await services['policy'].generate.route(prompt=question) - answer = responses[0].text - - # 3. Teacher grades it - Using actual RewardActor API - score = await services['reward_actor'].evaluate_response.route( - prompt=question, response=answer, target=target - ) - - # 4. Compare to baseline - Using actual ReferenceModel API - # Note: ReferenceModel.forward requires input_ids, max_req_tokens, return_logprobs - ref_logprobs = await services['ref_model'].forward.route( - input_ids, max_req_tokens, return_logprobs=True - ) - - # 5. Store experience - Using actual Episode structure from apps/grpo/main.py - episode = create_episode_from_response(responses[0], score, ref_logprobs, step) - await services['replay_buffer'].add.call_one(episode) - - # 6. 
Improve student - Using actual training pattern - batch = await services['replay_buffer'].sample.call_one( - curr_policy_version=step - ) - if batch is not None: - inputs, targets = batch - loss = await services['trainer'].train_step.call(inputs, targets) - return loss -``` - -**Key difference**: Same RL logic, but each component is now a distributed, fault-tolerant, auto-scaling service. - -Did you realise-we are not worrying about any Infra code here! Forge Automagically handles the details behind the scenes and you can focus on writing your RL Algorthms! - - -## Why This Matters: Traditional ML Infrastructure Fails - -### The Infrastructure Challenge - -Our simple RL loop above has complex requirements: - -#### Problem 1: Different Resource Needs - -| Component | Resource Needs | Scaling Strategy | -|-----------|----------------|------------------| -| **Policy** (Student AI) | Large GPU memory | Multiple replicas for throughput | -| **Reward Heuristic** (Teacher) | Small compute | CPU or small GPU | -| **Trainer** (Tutor) | Massive GPU compute | Distributed training | -| **Dataset** (Question Bank) | CPU intensive I/O | High memory bandwidth | - -### Problem 2: Complex Interdependencies - -```mermaid -graph LR - A["Policy: Student AI
'What is 2+2?' → 'The answer is 4'"] - B["Reward: Teacher
Scores answer: 0.95"] - C["Reference: Original Student
Provides baseline comparison"] - D["Replay Buffer: Notebook
Stores: question + answer + score"] - E["Trainer: Tutor
Improves student using experiences"] - - A --> B - A --> C - B --> D - C --> D - D --> E - E --> A - - style A fill:#4CAF50 - style B fill:#FF9800 - style C fill:#2196F3 - style D fill:#8BC34A - style E fill:#E91E63 -``` - -Each step has different: -- **Latency requirements**: Policy inference needs low latency (each episode waits), training can batch multiple episodes together -- **Scaling patterns**: Need N policy replicas to keep trainer busy, plus different sharding strategies (tensor parallel for training vs replicated inference) -- **Failure modes**: Any component failure cascades to halt the entire pipeline (Forge prevents this with automatic failover) -- **Resource utilization**: GPUs for inference/training, CPUs for data processing - -### Problem 3: The Coordination Challenge - -Unlike supervised learning where you process independent batches, RL requires coordination: - -```python -# While this does work, it creates bottlenecks and resource waste -def naive_rl_step(): - # Policy waits idle while reward model works - response = policy_model.generate(prompt) # GPU busy - reward = reward_model.evaluate(prompt, response) # Policy GPU idle - - # Training waits for single episode - loss = compute_loss(response, reward) # Batch size = 1, inefficient - - # Everything stops if any component fails - if policy_fails or reward_fails or trainer_fails: - entire_system_stops() -``` - -## Enter Forge: RL-Native Architecture - -Forge solves these problems by treating each RL component as an **independent, distributed unit** - some as fault-tolerant services (like Policy inference where failures are easy to handle), others as actors (like Trainers where recovery semantics differ) - -Let's see how core RL concepts map to Forge components (you'll notice a mix of `.route()` for services and `.call_one()` for actors - we cover when to use each in Part 2): - -**Quick API Reference:** (covered in detail in Part 2: Service Communication Patterns) -- `.route()` - Send request to any healthy replica in a service (load balanced) -- `.call_one()` - Send request to a single actor instance -- `.fanout()` - Send request to ALL replicas in a service - -```python -async def real_rl_training_step(services, step): - """Single RL step using verified Forge APIs""" - - # 1. Environment interaction - Using actual DatasetActor API - sample = await services['dataloader'].sample.call_one() - prompt, target = sample["request"], sample["target"] - - responses = await services['policy'].generate.route(prompt) - - # 2. Reward computation - Using actual RewardActor API - score = await services['reward_actor'].evaluate_response.route( - prompt=prompt, response=responses[0].text, target=target - ) - - # 3. Get reference logprobs - Using actual ReferenceModel API - # Note: ReferenceModel requires full input_ids tensor, not just tokens - input_ids = torch.cat([responses[0].prompt_ids, responses[0].token_ids]) - ref_logprobs = await services['ref_model'].forward.route( - input_ids.unsqueeze(0), max_req_tokens=512, return_logprobs=True - ) - - # 4. Experience storage - Using actual Episode pattern from GRPO - episode = create_episode_from_response(responses[0], score, ref_logprobs, step) - await services['replay_buffer'].add.call_one(episode) - - # 5. 
Learning - Using actual trainer pattern - batch = await services['replay_buffer'].sample.call_one( - curr_policy_version=step - ) - if batch is not None: - inputs, targets = batch # GRPO returns (inputs, targets) tuple - loss = await services['trainer'].train_step.call(inputs, targets) - - # 6. Policy synchronization - Using actual weight update pattern - await services['trainer'].push_weights.call(step + 1) - await services['policy'].update_weights.fanout(step + 1) - - return loss -``` - -**Key insight**: Each line of RL pseudocode becomes a service call. The complexity of distribution, scaling, and fault tolerance is hidden behind these simple interfaces. - -## What Makes This Powerful - -### Automatic Resource Management -```python -responses = await policy.generate.route(prompt=question) -answer = responses[0].text # responses is list[Completion] -``` - -Forge handles behind the scenes: -- Routing to least loaded replica -- GPU memory management -- Batch optimization -- Failure recovery -- Auto-scaling based on demand - -### Independent Scaling -```python - -from forge.actors.policy import Policy -from forge.actors.replay_buffer import ReplayBuffer -from forge.actors.reference_model import ReferenceModel -from forge.actors.trainer import RLTrainer -from apps.grpo.main import DatasetActor, RewardActor, ComputeAdvantages -from forge.data.rewards import MathReward, ThinkingReward -import asyncio -import torch - -model = "Qwen/Qwen3-1.7B" -group_size = 1 - -( - dataloader, - policy, - trainer, - replay_buffer, - compute_advantages, - ref_model, - reward_actor, -) = await asyncio.gather( - # Dataset actor (CPU) - DatasetActor.options(procs=1).as_actor( - path="openai/gsm8k", - revision="main", - data_split="train", - streaming=True, - model=model, - ), - # Policy service with GPU - Policy.options(procs=1, with_gpus=True, num_replicas=1).as_service( - engine_config={ - "model": model, - "tensor_parallel_size": 1, - "pipeline_parallel_size": 1, - "enforce_eager": False - }, - sampling_config={ - "n": group_size, - "max_tokens": 16, - "temperature": 1.0, - "top_p": 1.0 - } - ), - # Trainer actor with GPU - RLTrainer.options(procs=1, with_gpus=True).as_actor( - # Trainer config would come from YAML in real usage - model={"name": "qwen3", "flavor": "1.7B", "hf_assets_path": f"hf://{model}"}, - optimizer={"name": "AdamW", "lr": 1e-5}, - training={"local_batch_size": 2, "seq_len": 2048} - ), - # Replay buffer (CPU) - ReplayBuffer.options(procs=1).as_actor( - batch_size=2, - max_policy_age=1, - dp_size=1 - ), - # Advantage computation (CPU) - ComputeAdvantages.options(procs=1).as_actor(), - # Reference model with GPU - ReferenceModel.options(procs=1, with_gpus=True).as_actor( - model={"name": "qwen3", "flavor": "1.7B", "hf_assets_path": f"hf://{model}"}, - training={"dtype": "bfloat16"} - ), - # Reward actor (CPU) - RewardActor.options(procs=1, num_replicas=1).as_service( - reward_functions=[MathReward(), ThinkingReward()] - ) - ) -``` - -**Forge Components: Services vs Actors** - -Forge has two types of distributed components: -- **Services**: Multiple replicas with automatic load balancing (like Policy, RewardActor) -- **Actors**: Single instances that handle their own internal distribution (like RLTrainer, ReplayBuffer) - -We cover this distinction in detail in Part 2, but for now this explains the scaling patterns: -- Policy service: num_replicas=8 for high inference demand -- RewardActor service: num_replicas=16 for parallel evaluation -- RLTrainer actor: Single instance with internal 
distributed training - - -### Fault Tolerance -```python -# If a policy replica fails: -responses = await policy.generate.route(prompt=question) -answer = responses[0].text -# -> Forge automatically routes to healthy replica -# -> Failed replica respawns in background -# -> No impact on training loop - -# If reward service fails: -score = await reward_actor.evaluate_response.route( - prompt=question, response=answer, target=target -) -``` - -- Retries on different replica automatically -- Graceful degradation if all replicas fail -- System continues (may need application-level handling) - -This is fundamentally different from monolithic RL implementations where any component failure stops everything! - -In the next Section, we will go a layer deeper and learn how ForgeServices work. Continue to [Part 2 here](./2_Forge_Internals.MD) diff --git a/docs/Tutorials/2_Forge_Internals.MD b/docs/Tutorials/2_Forge_Internals.MD deleted file mode 100644 index 1a9421a96..000000000 --- a/docs/Tutorials/2_Forge_Internals.MD +++ /dev/null @@ -1,671 +0,0 @@ -# Part 2: Peeling Back the Abstraction - What Are Services? - -We highly recommend reading [Part 1](./1_RL_and_Forge_Fundamentals.MD) before this, it explains RL Concepts and how they land in Forge. - -Now that you see the power of the service abstraction, let's understand what's actually happening under the hood, Grab your chai! - -## Service Anatomy: Beyond the Interface - -When you call `await policy_service.generate(question)`, here's what actually happens: - -(Don't worry, we will understand Services right in the next section!) - -```mermaid -graph TD - Call["Your Code:
await policy_service.generate"] - - subgraph ServiceLayer["Service Layer"] - Proxy["Service Proxy: Load balancing, Health checking"] - LB["Load Balancer: Replica selection, Circuit breaker"] - end - - subgraph Replicas["Replica Management"] - R1["Replica 1: GPU 0, Healthy"] - R2["Replica 2: GPU 1, Overloaded"] - R3["Replica 3: GPU 2, Failed"] - R4["Replica 4: GPU 3, Healthy"] - end - - subgraph Compute["Actual Computation"] - Actor["Policy Actor: vLLM engine, Model weights, KV cache"] - end - - Call --> Proxy - Proxy --> LB - LB --> R1 - LB -.-> R2 - LB -.-> R3 - LB --> R4 - R1 --> Actor - R4 --> Actor - - style Call fill:#4CAF50 - style LB fill:#FF9800 - style R3 fill:#F44336 - style Actor fill:#9C27B0 -``` - -## Service Components Deep Dive - -### 1. Real Service Configuration - -Here's the actual ServiceConfig from Forge source code: - -```python -# Configuration pattern from apps/grpo/main.py: -Policy.options( - procs=1, # Processes per replica - num_replicas=4, # Number of replicas - with_gpus=True # Allocate GPUs - # Other available options: - # hosts=None # the number of remote hosts used per replica -) -``` - -### 2. Real Service Creation - -Services are created using the `.options().as_service()` pattern from the actual GRPO implementation: - -The service creation automatically handles: -- Spawning actor replicas across processes/GPUs -- Load balancing with .route() method for services -- Health monitoring and failure recovery -- Message routing and serialization - -```python -from forge.actors.policy import Policy - -model = "Qwen/Qwen3-1.7B" - -policy = await Policy.options( - procs=1, - with_gpus=True, - num_replicas=1 -).as_service( - engine_config={ - "model": model, - "tensor_parallel_size": 1, - "pipeline_parallel_size": 1, - "enforce_eager": False - }, - sampling_config={ - "n": 1, - "max_tokens": 16, - "temperature": 1.0, - "top_p": 1.0 - } -) - -prompt = "What is 3 + 5?" -responses = await policy.generate.route(prompt) -print(f"Response: {responses[0].text}") - -# Cleanup when done -await policy.shutdown() -``` - -### 3. How Services Actually Work - -Forge services are implemented as ServiceActors that manage collections of your ForgeActor replicas: - -When you call `.as_service()`, Forge creates a `ServiceInterface` that manages N replicas of your `ForgeActor` class and gives you methods like `.route()`, `.fanout()`, etc. - -```python -# Your code sees this simple interface: -responses = await policy.generate.route(prompt=prompt) -# But Forge handles all the complexity of replica management, load balancing, and fault tolerance -``` - -## Communication Patterns: Quick Reference - -**API Summary:** -- `.route()` - Send request to any healthy replica in a service (load balanced) -- `.call_one()` - Send request to a single actor instance -- `.fanout()` - Send request to ALL replicas in a service - -```mermaid -graph LR - subgraph Request["Your Request"] - Code["await service.method.ADVERB()"] - end - - subgraph Patterns["Communication Patterns"] - Route[".route()
→ One healthy replica"] - CallOne[".call_one()
→ Single actor"] - Fanout[".fanout()
→ ALL replicas"] - end - - subgraph Replicas["Replicas/Actors"] - R1["Replica 1"] - R2["Replica 2"] - R3["Replica 3"] - A1["Actor"] - end - - Code --> Route - Code --> CallOne - Code --> Fanout - - Route --> R2 - CallOne --> A1 - Fanout --> R1 - Fanout --> R2 - Fanout --> R3 - - style Route fill:#4CAF50 - style CallOne fill:#FF9800 - style Fanout fill:#9C27B0 -``` - -## Deep Dive: Service Communication Patterns - -These communication patterns (\"adverbs\") determine how your service calls are routed to replicas. Understanding when to use each pattern is key to effective Forge usage. - -### 1. `.route()` - Load Balanced Single Replica - -**When to use**: Normal request routing where any replica can handle the request. - -```python -responses = await policy.generate.route(prompt=question) -answer = responses[0].text # Extract text from Completion object -``` - -Behind the scenes: -1. Health check eliminates failed replicas -2. Load balancer picks replica (currently round robin, configurable balancers coming soon) -3. Request routes to that specific replica -4. Automatic retry on different replica if failure - -**Performance characteristics**: -- **Latency**: Lowest (single network hop) -- **Throughput**: Limited by single replica capacity -- **Fault tolerance**: Automatic failover to other replicas - -**Critical insight**: `.route()` is your default choice for stateless operations in Forge services. - -### 2. `.fanout()` - Broadcast with Results Collection - -**When to use**: You need responses from ALL replicas. - -```python -# Get version from all policy replicas -current_versions = await policy.get_version.fanout() -# Returns: [version_replica_1, version_replica_2, ...] - -# Update weights on all replicas -await policy.update_weights.fanout(new_policy_version) -# Broadcasts to all replicas simultaneously -``` - -**Performance characteristics**: -- **Latency**: Slowest replica determines total latency -- **Throughput**: Network bandwidth × number of replicas -- **Fault tolerance**: Fails if ANY replica fails (unless configured otherwise) - -**Critical gotcha**: Don't use `.fanout()` for high-frequency operations - it contacts all replicas. - -### 3. Streaming Operations - Custom Implementation Pattern - -**When to use**: You want to process results as they arrive, not wait for all. - -```python -# CONCEPTUAL - Streaming requires custom implementation in your training loop -# The basic ReplayBuffer doesn't have built-in streaming methods -# Pattern from apps/grpo/main.py continuous training: - -while training: - # This is the real API call pattern - batch = await replay_buffer.sample.call_one(curr_policy_version=step) - if batch is not None: - # Process batch immediately - loss = await trainer.train_step.call_one(batch) - print(f"Training loss: {loss}") - else: - await asyncio.sleep(0.1) # Wait for more data -``` - -**Performance characteristics**: -- **Latency**: Process first result immediately -- **Throughput**: Non-blocking async operations (much higher than waiting for full batches) -- **Fault tolerance**: Continues if some replicas fail - -**Critical insight**: This is essential for high-throughput RL where you can't wait for batches. - -### 3. Service Sessions for Stateful Operations - -**When to use**: When you need multiple calls to hit the same replica (like KV cache preservation). - -**What are sticky sessions?** A session ensures all your service calls within the `async with` block go to the same replica, instead of being load-balanced across different replicas. 
- -```python -# This Counter example demonstrates the difference between regular routing and sessions - -from forge.controller import ForgeActor -from monarch.actor import endpoint - -class ForgeCounter(ForgeActor): - def __init__(self, initial_value: int): - self.value = initial_value - - @endpoint - def increment(self) -> int: - self.value += 1 - return self.value - - @endpoint - def get_value(self) -> int: - return self.value - - @endpoint - async def reset(self): - self.value = 0 - -counter_service = await ForgeCounter.options( - procs=1, num_replicas=4 -).as_service(initial_value=0) - -# WITHOUT SESSIONS: Each .route() call goes to a different replica -await counter_service.increment.route() # Might go to replica 2 -await counter_service.increment.route() # Might go to replica 1 -await counter_service.increment.route() # Might go to replica 3 - -results = await counter_service.increment.fanout() # Get from all replicas -print(f"All replica values: {results}") -# Output: All replica values: [1, 2, 1, 1] - Each replica has different state! -``` - -The problem: each `.route()` call can go to different replicas, creating inconsistent state. - -```python -# WITH SESSIONS: All calls go to the SAME replica -print("\nUsing sticky sessions:") -async with counter_service.session(): # Creates a session that picks one replica - await counter_service.reset.route() # Uses .route() within session - print(await counter_service.increment.route()) # 1 - print(await counter_service.increment.route()) # 2 - print(await counter_service.increment.route()) # 3 - - final_value = await counter_service.get_value.route() - print(f"Final value on this replica: {final_value}") # 3 - -# Output: -# Using sticky sessions: -# 1 -# 2 -# 3 -# Final value on this replica: 3 - -# Same pattern works with Policy for multi-turn conversations: -# async with policy.session(): -# response1 = await policy.generate.route(turn1) -# full_prompt = turn1 + response1[0].text + turn2 -# response2 = await policy.generate.route(full_prompt) -# # Both calls hit same replica, preserving KV cache - -# Cleanup -await counter_service.shutdown() -``` - -**Performance impact**: Critical for maintaining KV cache in multi-turn conversations. - -## Deep Dive: State Management Reality - -The most complex challenge in distributed RL is maintaining state consistency while maximizing performance. - -### The KV Cache Problem - -**The challenge**: Policy inference is much faster with KV cache, but cache is tied to specific conversation history. - -```python -# This breaks KV cache optimization: -async def naive_multi_turn(): - # Each call might go to different replica = cache miss - response1 = await policy_service.generate.choose(question1) - response2 = await policy_service.generate.choose(question1 + response1) # Cache miss! - response3 = await policy_service.generate.choose(conversation_so_far) # Cache miss! -``` - -**The solution**: Sticky sessions ensure all calls go to same replica. - -```python -async def optimized_multi_turn(): - async with policy.session(): - # All calls guaranteed to hit same replica = cache hits - response1 = await policy.generate.route(prompt=question1) - full_prompt = question1 + response1[0].text - response2 = await policy.generate.route(prompt=full_prompt) # Cache hit! - conversation = full_prompt + response2[0].text - response3 = await policy.generate.route(prompt=conversation) # Cache hit! 
- - # Session ends, replica can be garbage collected or reused -``` - -**Performance impact**: Maintaining KV cache across turns avoids recomputing previous tokens. - -### Replay Buffer Consistency - -**The challenge**: Multiple trainers and experience collectors reading/writing concurrently. - -**Real Forge approach**: The ReplayBuffer actor handles concurrency internally: - -```python -# Forge ReplayBuffer endpoints (verified from source code) -# Add episodes (thread-safe by actor model) -await replay_buffer.add.call_one(episode) # .choose() would work too, but .call_one() clarifies it's a singleton actor not ActorMesh - -# Sample batches for training -batch = await replay_buffer.sample.call_one( - curr_policy_version=step_number, - batch_size=None # Optional parameter, uses default from config -) - -# Additional methods available: -# await replay_buffer.clear.call_one() # Clear buffer -# await replay_buffer.evict.call_one(curr_policy_version) # Remove old episodes -# state = await replay_buffer.state_dict.call_one() # Get state for checkpointing -``` - -**Critical insight**: The actor model provides natural thread safety - each actor processes messages sequentially. - -### Weight Synchronization Strategy - -**The challenge**: Trainer updates policy weights, but policy service needs those weights. - -```python -# Forge weight synchronization pattern from apps/grpo/main.py -async def real_weight_sync(trainer, policy, step): - # Trainer pushes weights to TorchStore with version number - await trainer.push_weights.call_one(policy_version=step + 1) - - # Policy service updates to new version from TorchStore - # Use .fanout() to update ALL policy replicas - await policy.update_weights.fanout(policy_version=step + 1) - -# Check current policy version -current_version = await policy.get_version.route() -print(f"Current policy version: {current_version}") -``` - -## Deep Dive: Asynchronous Coordination Patterns - -**The real challenge**: Different services run at different speeds, but Forge's service abstraction handles the coordination complexity. 
- -### The Forge Approach: Let Services Handle Coordination - -Instead of manual coordination, Forge services handle speed mismatches automatically: - -```python -from apps.grpo.main import Episode, Group - -async def simple_rl_step(): - - # ===== Generate a rollout ===== - sample = await dataloader.sample.call_one() # DatasetActor is an actor, not service - prompt, target = sample["request"], sample["target"] # Correct field names - - print(f"Prompt: {prompt}") - print(f"Target: {target}") - - actions = await policy.generate.route(prompt=prompt) # Policy is a service - print(f"Policy response: {actions[0].text}") - - # Create input tensor for reference model (requires full context) - input_ids = torch.cat([actions[0].prompt_ids, actions[0].token_ids]) - ref_logprobs = await ref_model.forward.route( - input_ids.unsqueeze(0), max_req_tokens=512, return_logprobs=True - ) - reward = await reward_actor.evaluate_response.route( # RewardActor is a service - prompt=prompt, - response=actions[0].text, - target=target - ) - print(f"Reward: {reward}") - - # Create episode using actual GRPO Episode structure - episode = Episode( - episode_id="0", - request=prompt, - policy_version=0, - pad_id=tokenizer.pad_token_id, - request_len=512, - response_len=512, - target=target - ) - - # Add response data - episode.response = actions[0].text - episode.request_tokens = actions[0].prompt_ids.tolist() - episode.response_tokens = actions[0].token_ids.tolist() - episode.ref_logprobs = ref_logprobs[0] # Extract from batch dimension - episode.reward = reward - - # Compute advantages using actual ComputeAdvantages actor - group = Group.new_group(0, 1, prompt, 0, tokenizer.pad_token_id, 512, 512, target) - group.episodes[0] = episode - advantages = await compute_advantages.compute.call_one(group) # ComputeAdvantages is an actor - episode.advantage = advantages[0] - print(f"Advantage: {advantages[0]}") - await replay_buffer.add.call_one(episode) # ReplayBuffer is an actor - print("Episode stored in replay buffer") - - # ===== Train on the batch ===== - batch = await replay_buffer.sample.call_one(curr_policy_version=0) - if batch is not None: - print("Training on batch...") - inputs, targets = batch # GRPO returns (inputs, targets) tuple - loss = await trainer.train_step.call(inputs, targets) # RLTrainer is an actor - print(f"Training loss: {loss}") - return loss - else: - print("Not enough data in buffer yet") - return None - -# Note: This simplified example assumes tokenizer and services are already initialized -for step in range(10): - print(f"\n--- RL Step {step + 1} ---") - loss = await simple_rl_step() - if loss: - print(f"Step {step + 1} complete, loss: {loss:.4f}") - else: - print(f"Step {step + 1} complete, building buffer...") -``` - -### Handling Speed Mismatches with Service Scaling - -**The insight**: Scale services independently based on their bottlenecks. 
- -```python -# Scale fast services with more replicas -policy = await Policy.options( - procs=1, num_replicas=8, with_gpus=True # Many replicas for high throughput -).as_service( - engine_config={"model": model_name, "tensor_parallel_size": 1} -) - -# Reward evaluation might be CPU-bound -reward_actor = await RewardActor.options( - procs=1, num_replicas=16, with_gpus=False # More CPU replicas -).as_service( - reward_functions=[MathReward()] -) - -# Training needs fewer but more powerful replicas -trainer = await RLTrainer.options( - procs=1, with_gpus=True # Fewer but GPU-heavy -).as_actor( # Trainer typically uses .as_actor() not .as_service() - model={"name": "qwen3", "flavor": "1.7B"}, - optimizer={"name": "AdamW", "lr": 1e-5} -) -``` - -## Service Implementation Example - -Let's see how a reward service is actually implemented: - -```python -# Exact RewardActor from apps/grpo/main.py - -from forge.controller import ForgeActor -from monarch.actor import endpoint -from forge.data.rewards import MathReward, ThinkingReward - -# class definition from apps/grpo/main.py -class RewardActor(ForgeActor): - def __init__(self, reward_functions: list): - self.reward_functions = reward_functions - - @endpoint - async def evaluate_response(self, prompt: str, response: str, target: str) -> float: - """Evaluate response quality using multiple reward functions""" - total_reward = 0.0 - - for reward_fn in self.reward_functions: - # Each reward function contributes to total score - reward = reward_fn(prompt, response, target) - total_reward += reward - - # Return average reward across all functions - return total_reward / len(self.reward_functions) if self.reward_functions else 0.0 - -reward_actor = await RewardActor.options( - procs=1, num_replicas=1 -).as_service( - reward_functions=[MathReward(), ThinkingReward()] -) - -prompt = "What is 15% of 240?" 
-response = "15% of 240 is 36" -target = "36" - -score = await reward_actor.evaluate_response.route( - prompt=prompt, - response=response, - target=target -) -print(f"Reward score: {score}") # Usually around 1.0 for correct math answers - -# For production scaling - increase num_replicas for parallel evaluation: -# RewardActor.options(procs=1, num_replicas=16) # 16 parallel evaluators - -# Cleanup when done -await reward_actor.shutdown() -``` - -## Service Orchestration: The Training Loop - -Now let's see how services coordinate in a real training loop: - -```python -# This is the REAL way production RL systems are built with Forge - -import asyncio -import torch -from forge.actors.policy import Policy -from forge.actors.reference_model import ReferenceModel -from forge.actors.replay_buffer import ReplayBuffer -from forge.actors.trainer import RLTrainer -from apps.grpo.main import DatasetActor, RewardActor, ComputeAdvantages -from forge.data.rewards import MathReward, ThinkingReward - -# Service creation pattern from apps/grpo/main.py lines 322-344 -print("Initializing all services...") -( - dataloader, - policy, - trainer, - replay_buffer, - compute_advantages, - ref_model, - reward_actor, -) = await asyncio.gather( - DatasetActor.options(procs=1).as_actor( - path="openai/gsm8k", revision="main", data_split="train", - streaming=True, model="Qwen/Qwen3-1.7B" - ), - Policy.options(procs=1, with_gpus=True, num_replicas=1).as_service( - engine_config={"model": "Qwen/Qwen3-1.7B", "tensor_parallel_size": 1}, - sampling_config={"n": 1, "max_tokens": 512} - ), - RLTrainer.options(procs=1, with_gpus=True).as_actor( - model={"name": "qwen3", "flavor": "1.7B", "hf_assets_path": "hf://Qwen/Qwen3-1.7B"}, - optimizer={"name": "AdamW", "lr": 1e-5}, - training={"local_batch_size": 2, "seq_len": 2048} - ), - ReplayBuffer.options(procs=1).as_actor( - batch_size=2, max_policy_age=1, dp_size=1 - ), - ComputeAdvantages.options(procs=1).as_actor(), - ReferenceModel.options(procs=1, with_gpus=True).as_actor( - model={"name": "qwen3", "flavor": "1.7B", "hf_assets_path": "hf://Qwen/Qwen3-1.7B"} - ), - RewardActor.options(procs=1, num_replicas=1).as_service( - reward_functions=[MathReward(), ThinkingReward()] - ), -) - -print("All services initialized successfully!") - -async def production_training_loop(): - """Real training loop pattern from apps/grpo/main.py""" - step = 0 - - while True: - # Data generation - sample = await dataloader.sample.call_one() - - # Policy generation service call - responses = await policy.generate.route(sample["request"]) # Correct field name - - # Reference computation service call (requires full input tensor) - input_ids = torch.cat([responses[0].prompt_ids, responses[0].token_ids]) - ref_logprobs = await ref_model.forward.route( - input_ids.unsqueeze(0), max_req_tokens=512, return_logprobs=True - ) - - # Reward evaluation service call - reward = await reward_actor.evaluate_response.route( - prompt=sample["question"], - response=responses[0].text, - target=sample["answer"] - ) - - # Experience storage (using actual Episode structure) - episode = create_episode_from_grpo_data(sample, responses[0], reward, ref_logprobs[0], step) - await replay_buffer.add.call_one(episode) - - # Training when ready - batch = await replay_buffer.sample.call_one(curr_policy_version=step) - if batch is not None: - inputs, targets = batch # GRPO returns (inputs, targets) tuple - loss = await trainer.train_step.call(inputs, targets) - - # Weight synchronization pattern - await trainer.push_weights.call(step 
+ 1) - await policy.update_weights.fanout(step + 1) # Fanout to all replicas - - print(f"Step {step}, Loss: {loss:.4f}") - step += 1 - -print("Shutting down services...") -await asyncio.gather( - DatasetActor.shutdown(dataloader), - policy.shutdown(), - RLTrainer.shutdown(trainer), - ReplayBuffer.shutdown(replay_buffer), - ComputeAdvantages.shutdown(compute_advantages), - ReferenceModel.shutdown(ref_model), - reward_actor.shutdown(), -) -print("All services shut down successfully!") -``` - -**Key observations:** -1. **Parallelism**: Independent operations run concurrently -2. **Load balancing**: Each `.route()` call automatically selects optimal replica -3. **Fault tolerance**: Failures automatically retry on different replicas -4. **Resource efficiency**: CPU and GPU services scale independently -5. **Coordination**: Services coordinate through shared state (replay buffer, weight versions) - -This is the power of the service abstraction - complex distributed coordination looks like simple async Python code. - -In the next part we will learn about [Monarch internals](./3_Monarch_101.MD) diff --git a/docs/Tutorials/3_Monarch_101.MD b/docs/Tutorials/3_Monarch_101.MD deleted file mode 100644 index 52bdb17d0..000000000 --- a/docs/Tutorials/3_Monarch_101.MD +++ /dev/null @@ -1,439 +0,0 @@ -# Part 3: The Forge-Monarch Connection - -This is part 3 of our series, in the previous sections: we learned Part 1: [RL Concepts and how they map to Forge](./1_RL_and_Forge_Fundamentals.MD), Part 2: [Forge Internals](./2_Forge_Internals.MD). - -Now let's peel back the layers. Forge services are built on top of **Monarch**, PyTorch's distributed actor framework. Understanding this connection is crucial for optimization and debugging. - -## The Complete Hierarchy: Service to Silicon - -```mermaid -graph TD - subgraph YourCode["1. Your RL Code"] - Call["await policy_service.generate.route('What is 2+2?')"] - end - - subgraph ForgeServices["2. Forge Service Layer"] - ServiceInterface["ServiceInterface: Routes requests, Load balancing, Health checks"] - ServiceActor["ServiceActor: Manages replicas, Monitors health, Coordinates failures"] - end - - subgraph MonarchLayer["3. Monarch Actor Layer"] - ActorMesh["ActorMesh PolicyActor: 4 instances, Different GPUs, Message passing"] - ProcMesh["ProcMesh: 4 processes, GPU topology 0,1,2,3, Network interconnect"] - end - - subgraph Hardware["4. Physical Hardware"] - GPU0["GPU 0: PolicyActor #1, vLLM Engine, Model Weights"] - GPU1["GPU 1: PolicyActor #2, vLLM Engine, Model Weights"] - GPU2["GPU 2: PolicyActor #3, vLLM Engine, Model Weights"] - GPU3["GPU 3: PolicyActor #4, vLLM Engine, Model Weights"] - end - - Call --> ServiceInterface - ServiceInterface --> ServiceActor - ServiceActor --> ActorMesh - ActorMesh --> ProcMesh - ProcMesh --> GPU0 - ProcMesh --> GPU1 - ProcMesh --> GPU2 - ProcMesh --> GPU3 - - style Call fill:#4CAF50 - style ServiceActor fill:#FF9800 - style ActorMesh fill:#9C27B0 - style ProcMesh fill:#2196F3 -``` - -## Deep Dive: ProcMesh - The Foundation - -**ProcMesh** is Monarch's core abstraction for organizing processes across hardware. Think of it as a multi-dimensional grid that maps directly to your cluster topology. - -### Single Host ProcMesh - -```mermaid -graph TD - subgraph Host["Single Host (8 GPUs)"] - subgraph ProcMesh["ProcMesh: per_host={'gpus': 8}"] - P0["Process 0
GPU 0"] - P1["Process 1
GPU 1"] - P2["Process 2
GPU 2"] - P3["Process 3
GPU 3"] - P4["Process 4
GPU 4"] - P5["Process 5
GPU 5"] - P6["Process 6
GPU 6"] - P7["Process 7
GPU 7"] - end - - P0 -.->|"Network"| P1 - P1 -.->|"Network"| P2 - P2 -.->|"Network"| P3 - P3 -.->|"Network"| P4 - P4 -.->|"Network"| P5 - P5 -.->|"Network"| P6 - P6 -.->|"Network"| P7 - P7 -.->|"Network"| P0 - end - - style P0 fill:#F44336 - style P1 fill:#F44336 - style P2 fill:#F44336 - style P3 fill:#F44336 - style P4 fill:#F44336 - style P5 fill:#F44336 - style P6 fill:#F44336 - style P7 fill:#F44336 -``` - -### Multi-Host ProcMesh - -```mermaid -graph TD - subgraph Cluster["Multi-Host Cluster"] - subgraph Host1["Host 1"] - subgraph PM1["ProcMesh Segment 1"] - H1P0["Process 0
GPU 0"] - H1P1["Process 1
GPU 1"] - H1P2["Process 2
GPU 2"] - H1P3["Process 3
GPU 3"] - end - end - - subgraph Host2["Host 2"] - subgraph PM2["ProcMesh Segment 2"] - H2P0["Process 4
GPU 0"] - H2P1["Process 5
GPU 1"] - H2P2["Process 6
GPU 2"] - H2P3["Process 7
GPU 3"] - end - end - - subgraph Host3["Host 3"] - subgraph PM3["ProcMesh Segment 3"] - H3P0["Process 8
GPU 0"] - H3P1["Process 9
GPU 1"] - H3P2["Process 10
GPU 2"] - H3P3["Process 11
GPU 3"] - end - end - end - - H1P0 -.->|"InfiniBand"| H2P0 - H1P1 -.->|"InfiniBand"| H2P1 - H2P0 -.->|"InfiniBand"| H3P0 - H2P1 -.->|"InfiniBand"| H3P1 - - style PM1 fill:#F44336 - style PM2 fill:#4CAF50 - style PM3 fill:#2196F3 -``` - -```python -# This shows the underlying actor system that powers Forge services - -from monarch.actor import Actor, endpoint, this_proc, Future -from monarch.actor import ProcMesh, this_host -import asyncio - -# STEP 1: Define a basic actor -class Counter(Actor): - def __init__(self, initial_value: int): - self.value = initial_value - - @endpoint - def increment(self) -> None: - self.value += 1 - - @endpoint - def get_value(self) -> int: - return self.value - -# STEP 2: Single actor in local process -counter: Counter = this_proc().spawn("counter", Counter, initial_value=0) - -# STEP 3: Send messages -fut: Future[int] = counter.get_value.call_one() -value = await fut -print(f"Counter value: {value}") # 0 - -# STEP 4: Multiple actors across processes -procs: ProcMesh = this_host().spawn_procs(per_host={"gpus": 8}) -counters: Counter = procs.spawn("counters", Counter, 0) - -# STEP 5: Broadcast to all actors -await counters.increment.call() - -# STEP 6: Different message patterns -# call_one() - single actor -value = await counters.get_value.call_one() -print(f"One counter: {value}") - -# choose() - random single actor (actors only, not services) -value = await counters.get_value.choose() -print(f"Random counter: {value}") - -# call() - all actors, collect results -values = await counters.get_value.call() -print(f"All counters: {values}") - -# broadcast() - fire and forget -await counters.increment.broadcast() - -# Cleanup -await procs.stop() -``` - -## Actor Meshes: Your Code Running Distributed - -**ActorMesh** is created when you spawn actors across a ProcMesh. Each process in the ProcMesh gets one instance of your actor. - -```mermaid -graph TD - subgraph Creation["Actor Creation Process"] - Code["mesh.spawn('policy', PolicyActor, model='Qwen/Qwen3-7B')"] - - subgraph ProcMesh["ProcMesh (4 processes)"] - P0["Process 0
GPU 0"] - P1["Process 1
GPU 1"] - P2["Process 2
GPU 2"] - P3["Process 3
GPU 3"] - end - - subgraph ActorMesh["ActorMesh PolicyActor"] - A0["PolicyActor Instance #0: model=Qwen/Qwen3-7B"] - A1["PolicyActor Instance #1: model=Qwen/Qwen3-7B"] - A2["PolicyActor Instance #2: model=Qwen/Qwen3-7B"] - A3["PolicyActor Instance #3: model=Qwen/Qwen3-7B"] - end - - Code --> ProcMesh - P0 --> A0 - P1 --> A1 - P2 --> A2 - P3 --> A3 - end - - style A0 fill:#4CAF50 - style A1 fill:#4CAF50 - style A2 fill:#4CAF50 - style A3 fill:#4CAF50 -``` - -### Message Routing Through ActorMesh - -```mermaid -graph TD - subgraph MessageFlow["Message Flow Patterns"] - Client["await policy_actors.generate.METHOD(prompt)"] - - subgraph Methods["Different Adverbs Route Differently"] - Choose["choose(): Routes to ONE actor, Load balanced"] - Call["call(): Routes to ALL actors, Collects results"] - Broadcast["broadcast(): Routes to ALL actors, Fire and forget"] - Stream["stream(): Routes to ALL actors, Iterator of results"] - end - - subgraph ActorInstances["PolicyActor Instances"] - A0["Actor 0: GPU 0, generates response"] - A1["Actor 1: GPU 1, generates response"] - A2["Actor 2: GPU 2, generates response"] - A3["Actor 3: GPU 3, generates response"] - end - - Client --> Choose - Client --> Call - Client --> Broadcast - Client --> Stream - - Choose -.->|"Load balanced"| A1 - Call --> A0 - Call --> A1 - Call --> A2 - Call --> A3 - Broadcast --> A0 - Broadcast --> A1 - Broadcast --> A2 - Broadcast --> A3 - Stream --> A0 - Stream --> A1 - Stream --> A2 - Stream --> A3 - end - - style Choose fill:#4CAF50 - style Call fill:#FF9800 - style Broadcast fill:#E91E63 - style Stream fill:#9C27B0 -``` - -## How Forge Services Use Monarch - -Now the key insight: **Forge services are ServiceActors that manage ActorMeshes of your ForgeActor replicas**. - -### The Service Creation Process - -```mermaid -graph TD - subgraph ServiceCreation["Service Creation Process"] - Call["await PolicyActor.options(num_replicas=4, procs=1).as_service(model='Qwen')"] - - ServiceActor["ServiceActor: Manages 4 replicas, Health checks, Routes calls"] - - subgraph Replicas["4 Independent Replicas"] - subgraph R0["Replica 0"] - PM0["ProcMesh: 1 process, GPU 0"] - AM0["ActorMesh
1 PolicyActor"] - end - - subgraph R1["Replica 1"] - PM1["ProcMesh: 1 process, GPU 1"] - AM1["ActorMesh
1 PolicyActor"] - end - - subgraph R2["Replica 2"] - PM2["ProcMesh: 1 process, GPU 2"] - AM2["ActorMesh
1 PolicyActor"] - end - - subgraph R3["Replica 3"] - PM3["ProcMesh: 1 process, GPU 3"] - AM3["ActorMesh
1 PolicyActor"] - end - end - - Call --> ServiceActor - ServiceActor --> R0 - ServiceActor --> R1 - ServiceActor --> R2 - ServiceActor --> R3 - PM0 --> AM0 - PM1 --> AM1 - PM2 --> AM2 - PM3 --> AM3 - end - - style ServiceActor fill:#FF9800 - style AM0 fill:#4CAF50 - style AM1 fill:#4CAF50 - style AM2 fill:#4CAF50 - style AM3 fill:#4CAF50 -``` - -### Service Call to Actor Execution - -```mermaid -graph TD - subgraph CallFlow["Complete Call Flow"] - UserCall["await policy_service.generate.route('What is 2+2?')"] - - ServiceInterface["ServiceInterface: Receives .route() call, Routes to ServiceActor"] - - ServiceActor["ServiceActor: Selects healthy replica, Load balancing, Failure handling"] - - SelectedReplica["Selected Replica #2: ProcMesh 1 process, ActorMesh 1 PolicyActor"] - - PolicyActor["PolicyActor Instance: Loads model, Runs vLLM inference"] - - GPU["GPU 2: vLLM engine, Model weights, KV cache, CUDA kernels"] - - UserCall --> ServiceInterface - ServiceInterface --> ServiceActor - ServiceActor --> SelectedReplica - SelectedReplica --> PolicyActor - PolicyActor --> GPU - - GPU -.->|"Response"| PolicyActor - PolicyActor -.->|"Response"| SelectedReplica - SelectedReplica -.->|"Response"| ServiceActor - ServiceActor -.->|"Response"| ServiceInterface - ServiceInterface -.->|"'The answer is 4'"| UserCall - end - - style UserCall fill:#4CAF50 - style ServiceActor fill:#FF9800 - style PolicyActor fill:#9C27B0 - style GPU fill:#FF5722 -``` - -## Multiple Services Sharing Infrastructure - -In real RL systems, you have multiple services that can share or use separate ProcMeshes: - -```mermaid -graph TD - subgraph Cluster["RL Training Cluster"] - subgraph Services["Forge Services"] - PS["Policy Service
4 GPU replicas"] - TS["Trainer Service
2 GPU replicas"] - RS["Reward Service
4 CPU replicas"] - BS["Buffer Service
1 CPU replica"] - end - - subgraph MonarchInfra["Monarch Infrastructure"] - subgraph GPUMesh["GPU ProcMesh (6 processes)"] - G0["Process 0
GPU 0"] - G1["Process 1
GPU 1"] - G2["Process 2
GPU 2"] - G3["Process 3
GPU 3"] - G4["Process 4
GPU 4"] - G5["Process 5
GPU 5"] - end - - subgraph CPUMesh["CPU ProcMesh (5 processes)"] - C0["Process 0
CPU"] - C1["Process 1
CPU"] - C2["Process 2
CPU"] - C3["Process 3
CPU"] - C4["Process 4
CPU"] - end - end - - PS --> G0 - PS --> G1 - PS --> G2 - PS --> G3 - TS --> G4 - TS --> G5 - RS --> C0 - RS --> C1 - RS --> C2 - RS --> C3 - BS --> C4 - end - - style PS fill:#4CAF50 - style TS fill:#E91E63 - style RS fill:#FF9800 - style BS fill:#9C27B0 - style GPUMesh fill:#FFEBEE - style CPUMesh fill:#E3F2FD -``` - -## Key Insights: Why This Architecture Matters - -1. **Process Isolation**: Each actor runs in its own process - failures don't cascade -2. **Location Transparency**: Actors can be local or remote with identical APIs -3. **Structured Distribution**: ProcMesh maps directly to hardware topology -4. **Message Passing**: No shared memory means no race conditions or locks -5. **Service Abstraction**: Forge hides Monarch complexity while preserving power - -Understanding this hierarchy helps you: -- **Debug performance issues**: Is the bottleneck at service, actor, or hardware level? -- **Optimize resource usage**: How many replicas per service? GPU vs CPU processes? -- **Handle failures gracefully**: Which layer failed and how to recover? -- **Scale effectively**: Where to add resources for maximum impact? - -# Conclusion - -## What You've Learned - -1. **RL Fundamentals**: How RL concepts map to Forge services with REAL, working examples -2. **Service Abstraction**: How to use Forge services effectively with verified communication patterns -3. **Monarch Foundation**: How Forge services connect to distributed actors and hardware - -## Key Takeaways - -- **Services hide complexity**: Your RL code looks like simple async functions, but runs on distributed clusters -- **Communication patterns matter**: `.route()`, `.fanout()`, sessions, and `.call_one()` each serve specific purposes -- **Architecture understanding helps**: Knowing the Service → Actor → Process → Hardware hierarchy helps you debug, optimize, and scale -- **Always verify APIs**: This guide is verified, but cross-check with source code for latest changes -- **Real API patterns**: Use `.options().as_service()` not `spawn_service()`, use `.route()` not `.choose()`, etc. 
diff --git a/docs/source/conf.py b/docs/source/conf.py index 4e3cec1fa..179a32437 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -15,6 +15,7 @@ import sys import pytorch_sphinx_theme2 +from sphinx_gallery.sorting import ExplicitOrder, FileNameSortKey # Add the source directory to Python path so modules can be imported sys.path.insert(0, os.path.abspath("../../src/forge")) @@ -82,7 +83,12 @@ def get_version_path(): "_templates", os.path.join(os.path.dirname(pytorch_sphinx_theme2.__file__), "templates"), ] -exclude_patterns = ["tutorials/index.rst", "tutorials/template_tutorial.rst"] +exclude_patterns = [ + "tutorials/index.rst", + "tutorials/template_tutorial.rst", + "tutorials/**/index.rst", + "tutorial_sources/**/README.md", +] html_static_path = ["_static"] html_css_files = ["custom.css"] @@ -204,7 +210,7 @@ def get_version_path(): sphinx_gallery_conf = { "examples_dirs": "tutorial_sources", # Path to examples directory "gallery_dirs": "tutorials", # Path to generate gallery - "filename_pattern": ".*", # Include all files + "filename_pattern": ".*", # Match all Python files "download_all_examples": False, "first_notebook_cell": "%matplotlib inline", "plot_gallery": "True", @@ -212,6 +218,8 @@ def get_version_path(): "backreferences_dir": None, "show_signature": False, "write_computation_times": False, + "subsection_order": ExplicitOrder(["tutorial_sources/zero-to-forge"]), + "within_subsection_order": FileNameSortKey, } diff --git a/docs/source/tutorial_sources/README.txt b/docs/source/tutorial_sources/README.txt index 1fadb0a08..e69de29bb 100644 --- a/docs/source/tutorial_sources/README.txt +++ b/docs/source/tutorial_sources/README.txt @@ -1,5 +0,0 @@ -Tutorials -========= - -This gallery contains tutorials and examples to help you get started with Forge. -Each tutorial demonstrates specific features and use cases with practical examples. diff --git a/docs/source/tutorial_sources/zero-to-forge/1_RL_and_Forge_Fundamentals.py b/docs/source/tutorial_sources/zero-to-forge/1_RL_and_Forge_Fundamentals.py new file mode 100644 index 000000000..08f7193c0 --- /dev/null +++ b/docs/source/tutorial_sources/zero-to-forge/1_RL_and_Forge_Fundamentals.py @@ -0,0 +1,503 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +""" +Part 1: RL Fundamentals - Using Forge Terminology +================================================== + +**Author:** `Sanyam Bhutani `_ + +.. grid:: 2 + + .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn + :class-card: card-prerequisites + + * Core RL components in Forge + * How RL concepts map to Forge services + * The RL training loop with Forge APIs + * Forge's distributed architecture benefits + + .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites + :class-card: card-prerequisites + + * Understanding of basic RL concepts + * Familiarity with Python async/await + * PyTorch experience recommended +""" + +######################################################################### +# Core RL Components in Forge +# ---------------------------- +# +# Let's start with a simple math tutoring example to understand RL concepts +# with the exact names Forge uses: +# +# The Toy Example: Teaching Math +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# .. 
mermaid:: +# +# graph TD +# subgraph Example["Math Tutoring RL Example"] +# Dataset["Dataset: math problems"] +# Policy["Policy: student AI"] +# Reward["Reward Model: scores answers"] +# Reference["Reference Model: baseline"] +# ReplayBuffer["Replay Buffer: stores experiences"] +# Trainer["Trainer: improves student"] +# end +# +# Dataset --> Policy +# Policy --> Reward +# Policy --> Reference +# Reward --> ReplayBuffer +# Reference --> ReplayBuffer +# ReplayBuffer --> Trainer +# Trainer --> Policy +# +# style Policy fill:#4CAF50 +# style Reward fill:#FF9800 +# style Trainer fill:#E91E63 +# +# RL Components Defined (Forge Names) +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# 1. **Dataset**: Provides questions/prompts (like "What is 2+2?") +# 2. **Policy**: The AI being trained (generates answers like "The answer is 4") +# 3. **Reward Model**: Evaluates answer quality (gives scores like 0.95) +# 4. **Reference Model**: Original policy copy (prevents drift from baseline) +# 5. **Replay Buffer**: Stores experiences (question + answer + score) +# 6. **Trainer**: Updates the policy weights based on experiences + +###################################################################### +# The RL Learning Flow +# -------------------- +# +# Here's a conceptual example of how an RL step works. +# This is CONCEPTUAL - see apps/grpo/main.py for actual GRPO implementation. + + +def conceptual_rl_step(): + """Conceptual RL training step showing the flow.""" + # 1. Get a math problem + question = dataset.sample() # "What is 2+2?" + + # 2. Student generates answer + answer = policy.generate(question) # "The answer is 4" + + # 3. Teacher grades it + score = reward_model.evaluate(question, answer) # 0.95 + + # 4. Compare to original student + baseline = reference_model.compute_logprobs(question, answer) + + # 5. Store the experience + experience = Episode(question, answer, score, baseline) + replay_buffer.add(experience) + + # 6. When enough experiences collected, improve student + batch = replay_buffer.sample(curr_policy_version=0) + if batch is not None: + trainer.train_step(batch) # Student gets better! + + +###################################################################### +# From Concepts to Forge Services +# -------------------------------- +# +# Here's the key insight: **Each RL component becomes a Forge service**. +# The toy example above maps directly to Forge: +# +# .. mermaid:: +# +# graph LR +# subgraph Concepts["RL Concepts"] +# C1["Dataset"] +# C2["Policy"] +# C3["Reward Model"] +# C4["Reference Model"] +# C5["Replay Buffer"] +# C6["Trainer"] +# end +# +# subgraph Services["Forge Services (Real Classes)"] +# S1["DatasetActor"] +# S2["Policy"] +# S3["RewardActor"] +# S4["ReferenceModel"] +# S5["ReplayBuffer"] +# S6["RLTrainer"] +# end +# +# C1 --> S1 +# C2 --> S2 +# C3 --> S3 +# C4 --> S4 +# C5 --> S5 +# C6 --> S6 +# +# style C2 fill:#4CAF50 +# style S2 fill:#4CAF50 +# style C3 fill:#FF9800 +# style S3 fill:#FF9800 + +###################################################################### +# RL Step with Forge Services +# ---------------------------- +# +# Let's look at the example from above again, but this time we use the +# actual Forge API names: + +import asyncio + + +async def conceptual_forge_rl_step(services, step): + """Single RL step using verified Forge APIs.""" + # 1. Get a math problem - Using actual DatasetActor API + sample = await services["dataloader"].sample.call_one() + question, target = sample["request"], sample["target"] + + # 2. 
Student generates answer - Using actual Policy API + responses = await services["policy"].generate.route(prompt=question) + answer = responses[0].text + + # 3. Teacher grades it - Using actual RewardActor API + score = await services["reward_actor"].evaluate_response.route( + prompt=question, response=answer, target=target + ) + + # 4. Compare to baseline - Using actual ReferenceModel API + # Note: ReferenceModel.forward requires input_ids, max_req_tokens, return_logprobs + # ref_logprobs = await services['ref_model'].forward.route( + # input_ids, max_req_tokens, return_logprobs=True + # ) + + # 5. Store experience - Using actual Episode structure from apps/grpo/main.py + # episode = create_episode_from_response(responses[0], score, ref_logprobs, step) + # await services['replay_buffer'].add.call_one(episode) + + # 6. Improve student - Using actual training pattern + batch = await services["replay_buffer"].sample.call_one(curr_policy_version=step) + if batch is not None: + inputs, targets = batch # GRPO returns (inputs, targets) tuple + loss = await services["trainer"].train_step.call(inputs, targets) + + # 7. Policy synchronization - Using actual weight update pattern + await services["trainer"].push_weights.call(step + 1) + await services["policy"].update_weights.fanout(step + 1) + + return loss + + +###################################################################### +# Why This Matters: Traditional ML Infrastructure Fails +# ----------------------------------------------------- +# +# Our simple RL loop above has complex requirements: +# +# Problem 1: Different Resource Needs +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# +------------------+-------------------------+---------------------------+ +# | Component | Resource Needs | Scaling Strategy | +# +==================+=========================+===========================+ +# | **Policy** | Large GPU memory | Multiple replicas | +# +------------------+-------------------------+---------------------------+ +# | **Reward** | Small compute | CPU or small GPU | +# +------------------+-------------------------+---------------------------+ +# | **Trainer** | Massive GPU compute | Distributed training | +# +------------------+-------------------------+---------------------------+ +# | **Dataset** | CPU intensive I/O | High memory bandwidth | +# +------------------+-------------------------+---------------------------+ +# +# Problem 2: Complex Interdependencies +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# .. mermaid:: +# +# graph LR +# A["Policy: Student AI
'What is 2+2?' → 'The answer is 4'"] +# B["Reward: Teacher
Scores answer: 0.95"] +# C["Reference: Original Student
Provides baseline comparison"] +# D["Replay Buffer: Notebook
Stores: question + answer + score"] +# E["Trainer: Tutor
Improves student using experiences"] +# +# A --> B +# A --> C +# B --> D +# C --> D +# D --> E +# E --> A +# +# style A fill:#4CAF50 +# style B fill:#FF9800 +# style C fill:#2196F3 +# style D fill:#8BC34A +# style E fill:#E91E63 +# +# Each step has different: +# +# * **Latency requirements**: Policy inference needs low latency +# * **Scaling patterns**: Need N policy replicas to keep trainer busy +# * **Failure modes**: Any component failure cascades to halt pipeline +# * **Resource utilization**: GPUs for inference/training, CPUs for data + +###################################################################### +# Enter Forge: RL-Native Architecture +# ------------------------------------ +# +# Forge solves these problems by treating each RL component as an +# **independent, distributed unit**. +# +# Quick API Reference (covered in detail in Part 2): +# +# * ``.route()`` - Send request to any healthy replica (load balanced) +# * ``.call_one()`` - Send request to a single actor instance +# * ``.fanout()`` - Send request to ALL replicas in a service + + +async def real_rl_training_step(services, step): + """Single RL step using verified Forge APIs.""" + # 1. Environment interaction - Using actual DatasetActor API + sample = await services["dataloader"].sample.call_one() + prompt, target = sample["request"], sample["target"] + + responses = await services["policy"].generate.route(prompt) + + # 2. Reward computation - Using actual RewardActor API + score = await services["reward_actor"].evaluate_response.route( + prompt=prompt, response=responses[0].text, target=target + ) + + # 3. Get reference logprobs - Using actual ReferenceModel API + import torch + + input_ids = torch.cat([responses[0].prompt_ids, responses[0].token_ids]) + ref_logprobs = await services["ref_model"].forward.route( + input_ids.unsqueeze(0), max_req_tokens=512, return_logprobs=True + ) + + # 4. Experience storage - Using actual Episode pattern from GRPO + # episode = create_episode_from_response(responses[0], score, ref_logprobs, step) + # await services['replay_buffer'].add.call_one(episode) + + # 5. Learning - Using actual trainer pattern + batch = await services["replay_buffer"].sample.call_one(curr_policy_version=step) + if batch is not None: + inputs, targets = batch + loss = await services["trainer"].train_step.call(inputs, targets) + + # 6. 
Policy synchronization + await services["trainer"].push_weights.call(step + 1) + await services["policy"].update_weights.fanout(step + 1) + + return loss + + +###################################################################### +# What Makes This Powerful +# ------------------------- +# +# Automatic Resource Management +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + +async def example_automatic_management(policy): + """Forge handles routing, GPU memory, batching, and scaling.""" + responses = await policy.generate.route(prompt="What is 2+2?") + answer = responses[0].text + return answer + + +###################################################################### +# Independent Scaling +# ~~~~~~~~~~~~~~~~~~~ +# +# Here's how you configure different components with different resources: + +# Note: This is example code showing the Forge API +# For actual imports, see apps/grpo/main.py +try: + from forge.actors.policy import Policy + from forge.actors.replay_buffer import ReplayBuffer + from forge.actors.reference_model import ReferenceModel + from forge.actors.trainer import RLTrainer + from forge.data.rewards import MathReward, ThinkingReward + + # Mock classes for the example + class DatasetActor: + pass + + class RewardActor: + pass + + class ComputeAdvantages: + pass + +except ImportError: + # Provide mock classes if imports fail during doc build + class Policy: + pass + + class ReplayBuffer: + pass + + class ReferenceModel: + pass + + class RLTrainer: + pass + + class DatasetActor: + pass + + class RewardActor: + pass + + class ComputeAdvantages: + pass + + class MathReward: + pass + + class ThinkingReward: + pass + + +async def setup_forge_services(): + """Configure Forge services with independent scaling.""" + model = "Qwen/Qwen3-1.7B" + group_size = 1 + + ( + dataloader, + policy, + trainer, + replay_buffer, + compute_advantages, + ref_model, + reward_actor, + ) = await asyncio.gather( + # Dataset actor (CPU) + DatasetActor.options(procs=1).as_actor( + path="openai/gsm8k", + revision="main", + data_split="train", + streaming=True, + model=model, + ), + # Policy service with GPU and multiple replicas + Policy.options(procs=1, with_gpus=True, num_replicas=1).as_service( + engine_config={ + "model": model, + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "enforce_eager": False, + }, + sampling_config={ + "n": group_size, + "max_tokens": 16, + "temperature": 1.0, + "top_p": 1.0, + }, + ), + # Trainer actor with GPU + RLTrainer.options(procs=1, with_gpus=True).as_actor( + model={"name": "qwen3", "flavor": "1.7B", "hf_assets_path": f"hf://{model}"}, + optimizer={"name": "AdamW", "lr": 1e-5}, + training={"local_batch_size": 2, "seq_len": 2048}, + ), + # Replay buffer (CPU) + ReplayBuffer.options(procs=1).as_actor( + batch_size=2, max_policy_age=1, dp_size=1 + ), + # Advantage computation (CPU) + ComputeAdvantages.options(procs=1).as_actor(), + # Reference model with GPU + ReferenceModel.options(procs=1, with_gpus=True).as_actor( + model={"name": "qwen3", "flavor": "1.7B", "hf_assets_path": f"hf://{model}"}, + training={"dtype": "bfloat16"}, + ), + # Reward actor (CPU) + RewardActor.options(procs=1, num_replicas=1).as_service( + reward_functions=[MathReward(), ThinkingReward()] + ), + ) + + return { + "dataloader": dataloader, + "policy": policy, + "trainer": trainer, + "replay_buffer": replay_buffer, + "compute_advantages": compute_advantages, + "ref_model": ref_model, + "reward_actor": reward_actor, + } + + +###################################################################### +# Forge 
Components: Services vs Actors +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# Forge has two types of distributed components: +# +# * **Services**: Multiple replicas with automatic load balancing +# (like Policy, RewardActor) +# * **Actors**: Single instances that handle their own internal +# distribution (like RLTrainer, ReplayBuffer) +# +# We cover this distinction in detail in Part 2, but for now this +# explains the scaling patterns: +# +# * Policy service: ``num_replicas=8`` for high inference demand +# * RewardActor service: ``num_replicas=16`` for parallel evaluation +# * RLTrainer actor: Single instance with internal distributed training + +###################################################################### +# Fault Tolerance +# ~~~~~~~~~~~~~~~ +# +# Forge provides automatic fault tolerance: + + +async def example_fault_tolerance(policy, reward_actor): + """If a replica fails, Forge automatically handles it.""" + # If a policy replica fails: + responses = await policy.generate.route(prompt="What is 2+2?") + answer = responses[0].text + # -> Forge automatically routes to healthy replica + # -> Failed replica respawns in background + # -> No impact on training loop + + # If reward service fails: + score = await reward_actor.evaluate_response.route( + prompt="question", response=answer, target="target" + ) + # -> Retries on different replica automatically + # -> Graceful degradation if all replicas fail + # -> System continues (may need application-level handling) + + +###################################################################### +# Conclusion +# ---------- +# +# This tutorial covered: +# +# * How RL concepts map to Forge components +# * The challenges of traditional RL infrastructure +# * How Forge's architecture solves these challenges +# * Basic Forge API patterns (route, call_one, fanout) +# +# In the next section, we will go a layer deeper and learn how Forge +# services work internally. +# +# Further Reading +# --------------- +# +# * Continue to :doc:`2_Forge_Internals` +# * Check out the full `GRPO implementation `_ +# * Read about the :doc:`../../api_actors` documentation diff --git a/docs/source/tutorial_sources/zero-to-forge/2_Forge_Internals.py b/docs/source/tutorial_sources/zero-to-forge/2_Forge_Internals.py new file mode 100644 index 000000000..efecfdc72 --- /dev/null +++ b/docs/source/tutorial_sources/zero-to-forge/2_Forge_Internals.py @@ -0,0 +1,767 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +""" +Part 2: Peeling Back the Abstraction - What Are Services? +========================================================== + +**Author:** `Sanyam Bhutani `_ + +.. grid:: 2 + + .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn + :class-card: card-prerequisites + + * How Forge services work under the hood + * Service communication patterns (route, fanout, call_one) + * State management in distributed systems + * Real-world service orchestration patterns + + .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites + :class-card: card-prerequisites + + * Complete :doc:`1_RL_and_Forge_Fundamentals` + * Understanding of Python async/await + * Basic distributed systems knowledge + +We highly recommend reading Part 1 before this - it explains RL Concepts +and how they land in Forge. 
+ +Now that you see the power of the service abstraction, let's understand +what's actually happening under the hood. Grab your chai! +""" + +###################################################################### +# Service Anatomy: Beyond the Interface +# -------------------------------------- +# +# When you call ``await policy_service.generate(question)``, here's what +# actually happens: +# +# (Don't worry, we will understand Services right in the next section!) +# +# .. mermaid:: +# +# graph TD +# Call["Your Code:
await policy_service.generate"] +# +# subgraph ServiceLayer["Service Layer"] +# Proxy["Service Proxy: Load balancing, Health checking"] +# LB["Load Balancer: Replica selection, Circuit breaker"] +# end +# +# subgraph Replicas["Replica Management"] +# R1["Replica 1: GPU 0, Healthy"] +# R2["Replica 2: GPU 1, Overloaded"] +# R3["Replica 3: GPU 2, Failed"] +# R4["Replica 4: GPU 3, Healthy"] +# end +# +# subgraph Compute["Actual Computation"] +# Actor["Policy Actor: vLLM engine, Model weights, KV cache"] +# end +# +# Call --> Proxy +# Proxy --> LB +# LB --> R1 +# LB -.-> R2 +# LB -.-> R3 +# LB --> R4 +# R1 --> Actor +# R4 --> Actor +# +# style Call fill:#4CAF50 +# style LB fill:#FF9800 +# style R3 fill:#F44336 +# style Actor fill:#9C27B0 + +###################################################################### +# Service Components Deep Dive +# ----------------------------- +# +# 1. Real Service Configuration +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# Here's the actual ServiceConfig from Forge source code: + +# Configuration pattern from apps/grpo/main.py: +# Policy.options( +# procs=1, # Processes per replica +# num_replicas=4, # Number of replicas +# with_gpus=True # Allocate GPUs +# # Other available options: +# # hosts=None # the number of remote hosts used per replica +# ) + +###################################################################### +# 2. Real Service Creation +# ~~~~~~~~~~~~~~~~~~~~~~~~ +# +# Services are created using the ``.options().as_service()`` pattern +# from the actual GRPO implementation. +# +# The service creation automatically handles: +# +# * Spawning actor replicas across processes/GPUs +# * Load balancing with .route() method for services +# * Health monitoring and failure recovery +# * Message routing and serialization + +import asyncio + +# Mock imports for documentation build +try: + from forge.actors.policy import Policy +except ImportError: + + class Policy: + pass + + +async def example_service_creation(): + """Example of creating a Policy service.""" + model = "Qwen/Qwen3-1.7B" + + policy = await Policy.options(procs=1, with_gpus=True, num_replicas=1).as_service( + engine_config={ + "model": model, + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "enforce_eager": False, + }, + sampling_config={ + "n": 1, + "max_tokens": 16, + "temperature": 1.0, + "top_p": 1.0, + }, + ) + + prompt = "What is 3 + 5?" + responses = await policy.generate.route(prompt) + print(f"Response: {responses[0].text}") + + # Cleanup when done + await policy.shutdown() + + +###################################################################### +# 3. How Services Actually Work +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# Forge services are implemented as ServiceActors that manage +# collections of your ForgeActor replicas. +# +# When you call ``.as_service()``, Forge creates a ``ServiceInterface`` +# that manages N replicas of your ``ForgeActor`` class and gives you +# methods like ``.route()``, ``.fanout()``, etc. 
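+#
+# To make this concrete, below is a minimal sketch of a user-defined
+# ``ForgeActor`` exposed as a service. ``EchoActor`` is a hypothetical class
+# written purely for illustration (the ``ForgeCounter`` example later in this
+# tutorial follows the same pattern), so treat the exact options and return
+# values as assumptions rather than a reference implementation.
+
+try:
+    from forge.controller import ForgeActor
+    from monarch.actor import endpoint
+except ImportError:
+    # Mock fallbacks so this file can be built without Forge installed
+    class ForgeActor:
+        pass
+
+    def endpoint(func):
+        return func
+
+
+class EchoActor(ForgeActor):
+    """Hypothetical actor used only to illustrate the ServiceInterface."""
+
+    def __init__(self, prefix: str):
+        self.prefix = prefix
+
+    @endpoint
+    async def echo(self, prompt: str) -> str:
+        return f"{self.prefix}: {prompt}"
+
+
+async def echo_service_sketch():
+    """Sketch: N EchoActor replicas behind one ServiceInterface."""
+    echo = await EchoActor.options(procs=1, num_replicas=2).as_service(prefix="echo")
+
+    # The interface exposes each @endpoint method with routing adverbs
+    one = await echo.echo.route(prompt="hi")  # one healthy replica
+    everyone = await echo.echo.fanout(prompt="hi")  # every replica
+    print(one, everyone)
+
+    await echo.shutdown()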
+ + +async def service_interface_example(policy): + """Your code sees this simple interface.""" + # Simple call - but Forge handles all complexity + responses = await policy.generate.route(prompt="What is 2+2?") + # Forge handles: replica management, load balancing, fault tolerance + + +###################################################################### +# Communication Patterns: Quick Reference +# ---------------------------------------- +# +# **API Summary:** +# +# * ``.route()`` - Send request to any healthy replica in a service (load balanced) +# * ``.call_one()`` - Send request to a single actor instance +# * ``.fanout()`` - Send request to ALL replicas in a service +# +# .. mermaid:: +# +# graph LR +# subgraph Request["Your Request"] +# Code["await service.method.ADVERB()"] +# end +# +# subgraph Patterns["Communication Patterns"] +# Route[".route()
→ One healthy replica"] +# CallOne[".call_one()
→ Single actor"] +# Fanout[".fanout()
→ ALL replicas"] +# end +# +# subgraph Replicas["Replicas/Actors"] +# R1["Replica 1"] +# R2["Replica 2"] +# R3["Replica 3"] +# A1["Actor"] +# end +# +# Code --> Route +# Code --> CallOne +# Code --> Fanout +# +# Route --> R2 +# CallOne --> A1 +# Fanout --> R1 +# Fanout --> R2 +# Fanout --> R3 +# +# style Route fill:#4CAF50 +# style CallOne fill:#FF9800 +# style Fanout fill:#9C27B0 + +###################################################################### +# Deep Dive: Service Communication Patterns +# ------------------------------------------ +# +# These communication patterns ("adverbs") determine how your service +# calls are routed to replicas. Understanding when to use each pattern +# is key to effective Forge usage. +# +# 1. ``.route()`` - Load Balanced Single Replica +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# **When to use**: Normal request routing where any replica can handle +# the request. + + +async def route_example(policy): + """Using .route() for load-balanced requests.""" + question = "What is 2+2?" + responses = await policy.generate.route(prompt=question) + answer = responses[0].text # Extract text from Completion object + return answer + + +# Behind the scenes: +# 1. Health check eliminates failed replicas +# 2. Load balancer picks replica (currently round robin) +# 3. Request routes to that specific replica +# 4. Automatic retry on different replica if failure +# +# **Performance characteristics**: +# +# * **Latency**: Lowest (single network hop) +# * **Throughput**: Limited by single replica capacity +# * **Fault tolerance**: Automatic failover to other replicas +# +# **Critical insight**: ``.route()`` is your default choice for +# stateless operations in Forge services. + +###################################################################### +# 2. ``.fanout()`` - Broadcast with Results Collection +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# **When to use**: You need responses from ALL replicas. + + +async def fanout_example(policy): + """Using .fanout() to broadcast to all replicas.""" + # Get version from all policy replicas + current_versions = await policy.get_version.fanout() + # Returns: [version_replica_1, version_replica_2, ...] + + # Update weights on all replicas + await policy.update_weights.fanout(new_policy_version=1) + # Broadcasts to all replicas simultaneously + + +# **Performance characteristics**: +# +# * **Latency**: Slowest replica determines total latency +# * **Throughput**: Network bandwidth × number of replicas +# * **Fault tolerance**: Fails if ANY replica fails (unless configured) +# +# **Critical gotcha**: Don't use ``.fanout()`` for high-frequency +# operations - it contacts all replicas. + +###################################################################### +# 3. Streaming Operations - Custom Implementation Pattern +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# **When to use**: You want to process results as they arrive. 
+ + +async def streaming_pattern_example(replay_buffer, trainer, step): + """Streaming pattern for continuous training.""" + # CONCEPTUAL - Streaming requires custom implementation + # Pattern from apps/grpo/main.py continuous training: + + while True: + # This is the real API call pattern + batch = await replay_buffer.sample.call_one(curr_policy_version=step) + if batch is not None: + # Process batch immediately + loss = await trainer.train_step.call_one(batch) + print(f"Training loss: {loss}") + else: + await asyncio.sleep(0.1) # Wait for more data + break # Just for example + + +# **Performance characteristics**: +# +# * **Latency**: Process first result immediately +# * **Throughput**: Non-blocking async operations +# * **Fault tolerance**: Continues if some replicas fail +# +# **Critical insight**: Essential for high-throughput RL where +# you can't wait for batches. + +###################################################################### +# Service Sessions for Stateful Operations +# ----------------------------------------- +# +# **When to use**: When you need multiple calls to hit the same replica +# (like KV cache preservation). +# +# **What are sticky sessions?** A session ensures all your service calls +# within the ``async with`` block go to the same replica, instead of +# being load-balanced across different replicas. + +# Mock classes for example +try: + from forge.controller import ForgeActor + from monarch.actor import endpoint +except ImportError: + + class ForgeActor: + pass + + def endpoint(func): + return func + + +class ForgeCounter(ForgeActor): + """Example counter to demonstrate sessions.""" + + def __init__(self, initial_value: int): + self.value = initial_value + + @endpoint + def increment(self) -> int: + self.value += 1 + return self.value + + @endpoint + def get_value(self) -> int: + return self.value + + @endpoint + async def reset(self): + self.value = 0 + + +async def without_sessions_example(): + """WITHOUT SESSIONS: Each .route() goes to different replica.""" + counter_service = await ForgeCounter.options(procs=1, num_replicas=4).as_service( + initial_value=0 + ) + + # Each call might go to different replica + await counter_service.increment.route() # Might go to replica 2 + await counter_service.increment.route() # Might go to replica 1 + await counter_service.increment.route() # Might go to replica 3 + + results = await counter_service.increment.fanout() + print(f"All replica values: {results}") + # Output: All replica values: [1, 2, 1, 1] + # Each replica has different state! + + await counter_service.shutdown() + + +async def with_sessions_example(): + """WITH SESSIONS: All calls go to the SAME replica.""" + counter_service = await ForgeCounter.options(procs=1, num_replicas=4).as_service( + initial_value=0 + ) + + print("\nUsing sticky sessions:") + async with counter_service.session(): + await counter_service.reset.route() + print(await counter_service.increment.route()) # 1 + print(await counter_service.increment.route()) # 2 + print(await counter_service.increment.route()) # 3 + + final_value = await counter_service.get_value.route() + print(f"Final value on this replica: {final_value}") # 3 + + # Cleanup + await counter_service.shutdown() + + +###################################################################### +# Deep Dive: State Management Reality +# ------------------------------------ +# +# The most complex challenge in distributed RL is maintaining state +# consistency while maximizing performance. 
+# +# The KV Cache Problem +# ~~~~~~~~~~~~~~~~~~~~ +# +# **The challenge**: Policy inference is much faster with KV cache, +# but cache is tied to specific conversation history. + + +async def naive_multi_turn(policy_service): + """This breaks KV cache optimization.""" + question1 = "What is 2+2?" + + # Each call might go to different replica = cache miss + response1 = await policy_service.generate.route(prompt=question1) + full_prompt = question1 + response1[0].text + response2 = await policy_service.generate.route(prompt=full_prompt) # Cache miss! + conversation = full_prompt + response2[0].text + response3 = await policy_service.generate.route( + prompt=conversation + ) # Cache miss! + + +async def optimized_multi_turn(policy): + """The solution: Sticky sessions ensure same replica.""" + async with policy.session(): + # All calls guaranteed to hit same replica = cache hits + question1 = "What is 2+2?" + response1 = await policy.generate.route(prompt=question1) + full_prompt = question1 + response1[0].text + response2 = await policy.generate.route(prompt=full_prompt) # Cache hit! + conversation = full_prompt + response2[0].text + response3 = await policy.generate.route(prompt=conversation) # Cache hit! + + # Session ends, replica can be garbage collected or reused + + +# **Performance impact**: Maintaining KV cache across turns avoids +# recomputing previous tokens. + +###################################################################### +# Replay Buffer Consistency +# ~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# **The challenge**: Multiple trainers and experience collectors +# reading/writing concurrently. +# +# **Real Forge approach**: The ReplayBuffer actor handles concurrency +# internally: + + +async def replay_buffer_example(replay_buffer): + """ReplayBuffer provides thread-safe operations.""" + # Add episodes (thread-safe by actor model) + episode = {} # Mock episode + await replay_buffer.add.call_one(episode) + + # Sample batches for training + batch = await replay_buffer.sample.call_one( + curr_policy_version=0, + batch_size=None, # Optional, uses default from config + ) + + # Additional methods available: + # await replay_buffer.clear.call_one() # Clear buffer + # await replay_buffer.evict.call_one(curr_policy_version) # Remove old + # state = await replay_buffer.state_dict.call_one() # Checkpoint + + +# **Critical insight**: The actor model provides natural thread safety - +# each actor processes messages sequentially. + +###################################################################### +# Weight Synchronization Strategy +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# **The challenge**: Trainer updates policy weights, but policy service +# needs those weights. 
+ +import torch + + +async def real_weight_sync(trainer, policy, step): + """Forge weight synchronization pattern from apps/grpo/main.py.""" + # Trainer pushes weights to TorchStore with version number + await trainer.push_weights.call_one(policy_version=step + 1) + + # Policy service updates to new version from TorchStore + # Use .fanout() to update ALL policy replicas + await policy.update_weights.fanout(policy_version=step + 1) + + # Check current policy version + current_version = await policy.get_version.route() + print(f"Current policy version: {current_version}") + + +###################################################################### +# Deep Dive: Asynchronous Coordination Patterns +# ---------------------------------------------- +# +# **The real challenge**: Different services run at different speeds, +# but Forge's service abstraction handles the coordination complexity. +# +# The Forge Approach: Let Services Handle Coordination +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# Instead of manual coordination, Forge services handle speed mismatches +# automatically: + + +async def simple_rl_step( + dataloader, policy, ref_model, reward_actor, replay_buffer, compute_advantages, trainer +): + """Simple RL step showing service coordination.""" + # ===== Generate a rollout ===== + sample = await dataloader.sample.call_one() + prompt, target = sample["request"], sample["target"] + + print(f"Prompt: {prompt}") + print(f"Target: {target}") + + actions = await policy.generate.route(prompt=prompt) + print(f"Policy response: {actions[0].text}") + + # Create input tensor for reference model (requires full context) + input_ids = torch.cat([actions[0].prompt_ids, actions[0].token_ids]) + ref_logprobs = await ref_model.forward.route( + input_ids.unsqueeze(0), max_req_tokens=512, return_logprobs=True + ) + + reward = await reward_actor.evaluate_response.route( + prompt=prompt, response=actions[0].text, target=target + ) + print(f"Reward: {reward}") + + # Create episode (simplified for example) + episode = { + "episode_id": "0", + "request": prompt, + "response": actions[0].text, + "reward": reward, + "ref_logprobs": ref_logprobs[0], + } + + await replay_buffer.add.call_one(episode) + print("Episode stored in replay buffer") + + # ===== Train on the batch ===== + batch = await replay_buffer.sample.call_one(curr_policy_version=0) + if batch is not None: + print("Training on batch...") + inputs, targets = batch + loss = await trainer.train_step.call(inputs, targets) + print(f"Training loss: {loss}") + return loss + else: + print("Not enough data in buffer yet") + return None + + +###################################################################### +# Handling Speed Mismatches with Service Scaling +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# **The insight**: Scale services independently based on their +# bottlenecks. 
+ +# Mock imports for example +try: + from forge.actors.policy import Policy + from forge.actors.trainer import RLTrainer +except ImportError: + + class Policy: + pass + + class RLTrainer: + pass + + class RewardActor: + pass + + +async def scaling_example(): + """Scale services independently based on bottlenecks.""" + model_name = "Qwen/Qwen3-1.7B" + + # Scale fast services with more replicas + policy = await Policy.options( + procs=1, num_replicas=8, with_gpus=True # Many replicas for throughput + ).as_service(engine_config={"model": model_name, "tensor_parallel_size": 1}) + + # Reward evaluation might be CPU-bound + reward_actor = await RewardActor.options( + procs=1, num_replicas=16, with_gpus=False # More CPU replicas + ).as_service(reward_functions=[]) + + # Training needs fewer but more powerful replicas + trainer = await RLTrainer.options( + procs=1, with_gpus=True # Fewer but GPU-heavy + ).as_actor( # Trainer typically uses .as_actor() not .as_service() + model={"name": "qwen3", "flavor": "1.7B"}, optimizer={"name": "AdamW", "lr": 1e-5} + ) + + +###################################################################### +# Service Implementation Example +# ------------------------------- +# +# Let's see how a reward service is actually implemented: + +# Mock imports +try: + from forge.controller import ForgeActor + from monarch.actor import endpoint + from forge.data.rewards import MathReward, ThinkingReward +except ImportError: + + class ForgeActor: + pass + + def endpoint(func): + return func + + class MathReward: + def __call__(self, prompt, response, target): + return 1.0 + + class ThinkingReward: + def __call__(self, prompt, response, target): + return 1.0 + + +class RewardActor(ForgeActor): + """Exact RewardActor from apps/grpo/main.py.""" + + def __init__(self, reward_functions: list): + self.reward_functions = reward_functions + + @endpoint + async def evaluate_response(self, prompt: str, response: str, target: str) -> float: + """Evaluate response quality using multiple reward functions.""" + total_reward = 0.0 + + for reward_fn in self.reward_functions: + # Each reward function contributes to total score + reward = reward_fn(prompt, response, target) + total_reward += reward + + # Return average reward across all functions + return total_reward / len(self.reward_functions) if self.reward_functions else 0.0 + + +async def reward_service_example(): + """Create and use a reward service.""" + reward_actor = await RewardActor.options(procs=1, num_replicas=1).as_service( + reward_functions=[MathReward(), ThinkingReward()] + ) + + prompt = "What is 15% of 240?" 
+ response = "15% of 240 is 36" + target = "36" + + score = await reward_actor.evaluate_response.route( + prompt=prompt, response=response, target=target + ) + print(f"Reward score: {score}") # Usually around 1.0 for correct answers + + # For production scaling - increase num_replicas: + # RewardActor.options(procs=1, num_replicas=16) # 16 parallel evaluators + + # Cleanup when done + await reward_actor.shutdown() + + +###################################################################### +# Service Orchestration: The Training Loop +# ----------------------------------------- +# +# Now let's see how services coordinate in a real training loop: + + +async def production_training_loop(): + """Real training loop pattern from apps/grpo/main.py.""" + # Service creation pattern (abbreviated) + print("Initializing all services...") + + # (Services initialization code here - see Part 1) + + step = 0 + + while True: + # Data generation + sample = await dataloader.sample.call_one() + + # Policy generation service call + responses = await policy.generate.route(sample["request"]) + + # Reference computation service call + input_ids = torch.cat([responses[0].prompt_ids, responses[0].token_ids]) + ref_logprobs = await ref_model.forward.route( + input_ids.unsqueeze(0), max_req_tokens=512, return_logprobs=True + ) + + # Reward evaluation service call + reward = await reward_actor.evaluate_response.route( + prompt=sample["question"], response=responses[0].text, target=sample["answer"] + ) + + # Experience storage + await replay_buffer.add.call_one(episode) + + # Training when ready + batch = await replay_buffer.sample.call_one(curr_policy_version=step) + if batch is not None: + inputs, targets = batch + loss = await trainer.train_step.call(inputs, targets) + + # Weight synchronization pattern + await trainer.push_weights.call(step + 1) + await policy.update_weights.fanout(step + 1) # Fanout to all replicas + + print(f"Step {step}, Loss: {loss:.4f}") + step += 1 + + if step >= 100: + break + + +# **Key observations:** +# +# 1. **Parallelism**: Independent operations run concurrently +# 2. **Load balancing**: Each ``.route()`` call automatically selects optimal replica +# 3. **Fault tolerance**: Failures automatically retry on different replicas +# 4. **Resource efficiency**: CPU and GPU services scale independently +# 5. **Coordination**: Services coordinate through shared state (replay buffer, weight versions) +# +# This is the power of the service abstraction - complex distributed +# coordination looks like simple async Python code. + +###################################################################### +# Conclusion +# ---------- +# +# This tutorial covered: +# +# * How Forge services work under the hood +# * Communication patterns: ``.route()``, ``.fanout()``, ``.call_one()`` +# * State management with sessions and actors +# * Service scaling and orchestration patterns +# +# **Key takeaways:** +# +# * Use ``.route()`` for stateless load-balanced operations +# * Use ``.fanout()`` for coordinated updates across all replicas +# * Use sessions for stateful operations like multi-turn conversations +# * Scale services independently based on bottlenecks +# * Let Forge handle coordination complexity +# +# In the next part we will learn about Monarch internals. 
+# +# Further Reading +# --------------- +# +# * Continue to :doc:`3_Monarch_101` (coming soon) +# * Check the `Forge source code `_ +# * Review the :doc:`../../api_actors` documentation +# * Explore the `GRPO application `_ diff --git a/docs/source/tutorial_sources/zero-to-forge/3_Monarch_101.py b/docs/source/tutorial_sources/zero-to-forge/3_Monarch_101.py new file mode 100644 index 000000000..d7c8f86e8 --- /dev/null +++ b/docs/source/tutorial_sources/zero-to-forge/3_Monarch_101.py @@ -0,0 +1,572 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +""" +Part 3: The Forge-Monarch Connection +===================================== + +**Author:** `Sanyam Bhutani `_ + +.. grid:: 2 + + .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn + :class-card: card-prerequisites + + * How Forge services are built on Monarch + * Understanding ProcMesh and ActorMesh + * The complete hierarchy from service to silicon + * Message routing patterns in distributed actors + + .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites + :class-card: card-prerequisites + + * Complete :doc:`1_RL_and_Forge_Fundamentals` + * Complete :doc:`2_Forge_Internals` + * Understanding of distributed systems + +This is part 3 of our series. In the previous sections we learned: + +* Part 1: [RL Concepts and how they map to Forge](1_RL_and_Forge_Fundamentals) +* Part 2: [Forge Internals](2_Forge_Internals) + +Now let's peel back the layers. Forge services are built on top of +**Monarch**, PyTorch's distributed actor framework. Understanding this +connection is crucial for optimization and debugging. +""" + +###################################################################### +# The Complete Hierarchy: Service to Silicon +# ------------------------------------------- +# +# .. mermaid:: +# +# graph TD +# subgraph YourCode["1. Your RL Code"] +# Call["await policy_service.generate.route('What is 2+2?')"] +# end +# +# subgraph ForgeServices["2. Forge Service Layer"] +# ServiceInterface["ServiceInterface: Routes requests, Load balancing, Health checks"] +# ServiceActor["ServiceActor: Manages replicas, Monitors health, Coordinates failures"] +# end +# +# subgraph MonarchLayer["3. Monarch Actor Layer"] +# ActorMesh["ActorMesh PolicyActor: 4 instances, Different GPUs, Message passing"] +# ProcMesh["ProcMesh: 4 processes, GPU topology 0,1,2,3, Network interconnect"] +# end +# +# subgraph Hardware["4. Physical Hardware"] +# GPU0["GPU 0: PolicyActor #1, vLLM Engine, Model Weights"] +# GPU1["GPU 1: PolicyActor #2, vLLM Engine, Model Weights"] +# GPU2["GPU 2: PolicyActor #3, vLLM Engine, Model Weights"] +# GPU3["GPU 3: PolicyActor #4, vLLM Engine, Model Weights"] +# end +# +# Call --> ServiceInterface +# ServiceInterface --> ServiceActor +# ServiceActor --> ActorMesh +# ActorMesh --> ProcMesh +# ProcMesh --> GPU0 +# ProcMesh --> GPU1 +# ProcMesh --> GPU2 +# ProcMesh --> GPU3 +# +# style Call fill:#4CAF50 +# style ServiceActor fill:#FF9800 +# style ActorMesh fill:#9C27B0 +# style ProcMesh fill:#2196F3 + +###################################################################### +# Deep Dive: ProcMesh - The Foundation +# ------------------------------------- +# +# **ProcMesh** is Monarch's core abstraction for organizing processes +# across hardware. Think of it as a multi-dimensional grid that maps +# directly to your cluster topology. 
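#
# A minimal sketch of what creating such a grid looks like, assuming the
# ``this_host`` import shown in the "Monarch Actor System Basics" section later
# in this file. The function name is illustrative, and the mesh is torn down
# immediately after creation.


async def procmesh_sketch():
    """Hypothetical example: one process per GPU on the local host."""
    # Creates an 8-process mesh that mirrors a single 8-GPU host.
    procs = this_host().spawn_procs(per_host={"gpus": 8})

    # Actors spawned onto this mesh get one instance per process
    # (see the Counter examples later in this file).

    # Tear the mesh down when finished.
    await procs.stop()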
+# +# Single Host ProcMesh +# ~~~~~~~~~~~~~~~~~~~~ +# +# .. mermaid:: +# +# graph TD +# subgraph Host["Single Host (8 GPUs)"] +# subgraph ProcMesh["ProcMesh: per_host={'gpus': 8}"] +# P0["Process 0
GPU 0"] +# P1["Process 1
GPU 1"] +# P2["Process 2
GPU 2"] +# P3["Process 3
GPU 3"] +# P4["Process 4
GPU 4"] +# P5["Process 5
GPU 5"] +# P6["Process 6
GPU 6"] +# P7["Process 7
GPU 7"] +# end +# +# P0 -.->|"Network"| P1 +# P1 -.->|"Network"| P2 +# P2 -.->|"Network"| P3 +# P3 -.->|"Network"| P4 +# P4 -.->|"Network"| P5 +# P5 -.->|"Network"| P6 +# P6 -.->|"Network"| P7 +# P7 -.->|"Network"| P0 +# end +# +# style P0 fill:#F44336 +# style P1 fill:#F44336 +# style P2 fill:#F44336 +# style P3 fill:#F44336 +# style P4 fill:#F44336 +# style P5 fill:#F44336 +# style P6 fill:#F44336 +# style P7 fill:#F44336 + +###################################################################### +# Multi-Host ProcMesh +# ~~~~~~~~~~~~~~~~~~~ +# +# .. mermaid:: +# +# graph TD +# subgraph Cluster["Multi-Host Cluster"] +# subgraph Host1["Host 1"] +# subgraph PM1["ProcMesh Segment 1"] +# H1P0["Process 0
GPU 0"] +# H1P1["Process 1
GPU 1"] +# H1P2["Process 2
GPU 2"] +# H1P3["Process 3
GPU 3"] +# end +# end +# +# subgraph Host2["Host 2"] +# subgraph PM2["ProcMesh Segment 2"] +# H2P0["Process 4
GPU 0"] +# H2P1["Process 5
GPU 1"] +# H2P2["Process 6
GPU 2"] +# H2P3["Process 7
GPU 3"] +# end +# end +# +# subgraph Host3["Host 3"] +# subgraph PM3["ProcMesh Segment 3"] +# H3P0["Process 8
GPU 0"] +# H3P1["Process 9
GPU 1"] +# H3P2["Process 10
GPU 2"] +# H3P3["Process 11
GPU 3"] +# end +# end +# end +# +# H1P0 -.->|"InfiniBand"| H2P0 +# H1P1 -.->|"InfiniBand"| H2P1 +# H2P0 -.->|"InfiniBand"| H3P0 +# H2P1 -.->|"InfiniBand"| H3P1 +# +# style PM1 fill:#F44336 +# style PM2 fill:#4CAF50 +# style PM3 fill:#2196F3 + +###################################################################### +# Monarch Actor System Basics +# ---------------------------- +# +# This shows the underlying actor system that powers Forge services. + +import asyncio + +# Mock imports for documentation build +try: + from monarch.actor import Actor, endpoint, this_proc, Future + from monarch.actor import ProcMesh, this_host +except ImportError: + + class Actor: + pass + + def endpoint(func): + return func + + class Future: + pass + + class ProcMesh: + pass + + def this_proc(): + return None + + def this_host(): + return None + + +# STEP 1: Define a basic actor +class Counter(Actor): + """Basic counter actor example.""" + + def __init__(self, initial_value: int): + self.value = initial_value + + @endpoint + def increment(self) -> None: + """Increment the counter.""" + self.value += 1 + + @endpoint + def get_value(self) -> int: + """Get current counter value.""" + return self.value + + +async def basic_actor_example(): + """Example of using Monarch actors.""" + # STEP 2: Single actor in local process + counter = this_proc().spawn("counter", Counter, initial_value=0) + + # STEP 3: Send messages + fut = counter.get_value.call_one() + value = await fut + print(f"Counter value: {value}") # 0 + + +async def distributed_actors_example(): + """Example of actors across multiple processes.""" + # STEP 4: Multiple actors across processes + procs = this_host().spawn_procs(per_host={"gpus": 8}) + counters = procs.spawn("counters", Counter, 0) + + # STEP 5: Broadcast to all actors + await counters.increment.call() + + # STEP 6: Different message patterns + # call_one() - single actor + value = await counters.get_value.call_one() + print(f"One counter: {value}") + + # choose() - random single actor (actors only, not services) + value = await counters.get_value.choose() + print(f"Random counter: {value}") + + # call() - all actors, collect results + values = await counters.get_value.call() + print(f"All counters: {values}") + + # broadcast() - fire and forget + await counters.increment.broadcast() + + # Cleanup + await procs.stop() + + +###################################################################### +# Actor Meshes: Your Code Running Distributed +# -------------------------------------------- +# +# **ActorMesh** is created when you spawn actors across a ProcMesh. +# Each process in the ProcMesh gets one instance of your actor. +# +# .. mermaid:: +# +# graph TD +# subgraph Creation["Actor Creation Process"] +# Code["mesh.spawn('policy', PolicyActor, model='Qwen/Qwen3-7B')"] +# +# subgraph ProcMesh["ProcMesh (4 processes)"] +# P0["Process 0
GPU 0"] +# P1["Process 1
GPU 1"] +# P2["Process 2
GPU 2"] +# P3["Process 3
GPU 3"] +# end +# +# subgraph ActorMesh["ActorMesh PolicyActor"] +# A0["PolicyActor Instance #0: model=Qwen/Qwen3-7B"] +# A1["PolicyActor Instance #1: model=Qwen/Qwen3-7B"] +# A2["PolicyActor Instance #2: model=Qwen/Qwen3-7B"] +# A3["PolicyActor Instance #3: model=Qwen/Qwen3-7B"] +# end +# +# Code --> ProcMesh +# P0 --> A0 +# P1 --> A1 +# P2 --> A2 +# P3 --> A3 +# end +# +# style A0 fill:#4CAF50 +# style A1 fill:#4CAF50 +# style A2 fill:#4CAF50 +# style A3 fill:#4CAF50 + +###################################################################### +# Message Routing Through ActorMesh +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# .. mermaid:: +# +# graph TD +# subgraph MessageFlow["Message Flow Patterns"] +# Client["await policy_actors.generate.METHOD(prompt)"] +# +# subgraph Methods["Different Adverbs Route Differently"] +# Choose["choose(): Routes to ONE actor, Load balanced"] +# Call["call(): Routes to ALL actors, Collects results"] +# Broadcast["broadcast(): Routes to ALL actors, Fire and forget"] +# Stream["stream(): Routes to ALL actors, Iterator of results"] +# end +# +# subgraph ActorInstances["PolicyActor Instances"] +# A0["Actor 0: GPU 0, generates response"] +# A1["Actor 1: GPU 1, generates response"] +# A2["Actor 2: GPU 2, generates response"] +# A3["Actor 3: GPU 3, generates response"] +# end +# +# Client --> Choose +# Client --> Call +# Client --> Broadcast +# Client --> Stream +# +# Choose -.->|"Load balanced"| A1 +# Call --> A0 +# Call --> A1 +# Call --> A2 +# Call --> A3 +# Broadcast --> A0 +# Broadcast --> A1 +# Broadcast --> A2 +# Broadcast --> A3 +# Stream --> A0 +# Stream --> A1 +# Stream --> A2 +# Stream --> A3 +# end +# +# style Choose fill:#4CAF50 +# style Call fill:#FF9800 +# style Broadcast fill:#E91E63 +# style Stream fill:#9C27B0 + +###################################################################### +# How Forge Services Use Monarch +# ------------------------------- +# +# Now the key insight: **Forge services are ServiceActors that manage +# ActorMeshes of your ForgeActor replicas**. +# +# The Service Creation Process +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# .. mermaid:: +# +# graph TD +# subgraph ServiceCreation["Service Creation Process"] +# Call["await PolicyActor.options(num_replicas=4, procs=1).as_service(model='Qwen')"] +# +# ServiceActor["ServiceActor: Manages 4 replicas, Health checks, Routes calls"] +# +# subgraph Replicas["4 Independent Replicas"] +# subgraph R0["Replica 0"] +# PM0["ProcMesh: 1 process, GPU 0"] +# AM0["ActorMesh
1 PolicyActor"] +# end +# +# subgraph R1["Replica 1"] +# PM1["ProcMesh: 1 process, GPU 1"] +# AM1["ActorMesh
1 PolicyActor"] +# end +# +# subgraph R2["Replica 2"] +# PM2["ProcMesh: 1 process, GPU 2"] +# AM2["ActorMesh
1 PolicyActor"] +# end +# +# subgraph R3["Replica 3"] +# PM3["ProcMesh: 1 process, GPU 3"] +# AM3["ActorMesh
1 PolicyActor"] +# end +# end +# +# Call --> ServiceActor +# ServiceActor --> R0 +# ServiceActor --> R1 +# ServiceActor --> R2 +# ServiceActor --> R3 +# PM0 --> AM0 +# PM1 --> AM1 +# PM2 --> AM2 +# PM3 --> AM3 +# end +# +# style ServiceActor fill:#FF9800 +# style AM0 fill:#4CAF50 +# style AM1 fill:#4CAF50 +# style AM2 fill:#4CAF50 +# style AM3 fill:#4CAF50 + +###################################################################### +# Service Call to Actor Execution +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# .. mermaid:: +# +# graph TD +# subgraph CallFlow["Complete Call Flow"] +# UserCall["await policy_service.generate.route('What is 2+2?')"] +# +# ServiceInterface["ServiceInterface: Receives .route() call, Routes to ServiceActor"] +# +# ServiceActor["ServiceActor: Selects healthy replica, Load balancing, Failure handling"] +# +# SelectedReplica["Selected Replica #2: ProcMesh 1 process, ActorMesh 1 PolicyActor"] +# +# PolicyActor["PolicyActor Instance: Loads model, Runs vLLM inference"] +# +# GPU["GPU 2: vLLM engine, Model weights, KV cache, CUDA kernels"] +# +# UserCall --> ServiceInterface +# ServiceInterface --> ServiceActor +# ServiceActor --> SelectedReplica +# SelectedReplica --> PolicyActor +# PolicyActor --> GPU +# +# GPU -.->|"Response"| PolicyActor +# PolicyActor -.->|"Response"| SelectedReplica +# SelectedReplica -.->|"Response"| ServiceActor +# ServiceActor -.->|"Response"| ServiceInterface +# ServiceInterface -.->|"'The answer is 4'"| UserCall +# end +# +# style UserCall fill:#4CAF50 +# style ServiceActor fill:#FF9800 +# style PolicyActor fill:#9C27B0 +# style GPU fill:#FF5722 + +###################################################################### +# Multiple Services Sharing Infrastructure +# ----------------------------------------- +# +# In real RL systems, you have multiple services that can share or use +# separate ProcMeshes: +# +# .. mermaid:: +# +# graph TD +# subgraph Cluster["RL Training Cluster"] +# subgraph Services["Forge Services"] +# PS["Policy Service
4 GPU replicas"] +# TS["Trainer Service
2 GPU replicas"] +# RS["Reward Service
4 CPU replicas"] +# BS["Buffer Service
1 CPU replica"] +# end +# +# subgraph MonarchInfra["Monarch Infrastructure"] +# subgraph GPUMesh["GPU ProcMesh (6 processes)"] +# G0["Process 0
GPU 0"] +# G1["Process 1
GPU 1"] +# G2["Process 2
GPU 2"] +# G3["Process 3
GPU 3"] +# G4["Process 4
GPU 4"] +# G5["Process 5
GPU 5"] +# end +# +# subgraph CPUMesh["CPU ProcMesh (5 processes)"] +# C0["Process 0
CPU"] +# C1["Process 1
CPU"] +# C2["Process 2
CPU"] +# C3["Process 3
CPU"] +# C4["Process 4
CPU"] +# end +# end +# +# PS --> G0 +# PS --> G1 +# PS --> G2 +# PS --> G3 +# TS --> G4 +# TS --> G5 +# RS --> C0 +# RS --> C1 +# RS --> C2 +# RS --> C3 +# BS --> C4 +# end +# +# style PS fill:#4CAF50 +# style TS fill:#E91E63 +# style RS fill:#FF9800 +# style BS fill:#9C27B0 +# style GPUMesh fill:#FFEBEE +# style CPUMesh fill:#E3F2FD + +###################################################################### +# Key Insights: Why This Architecture Matters +# -------------------------------------------- +# +# 1. **Process Isolation**: Each actor runs in its own process - failures don't cascade +# 2. **Location Transparency**: Actors can be local or remote with identical APIs +# 3. **Structured Distribution**: ProcMesh maps directly to hardware topology +# 4. **Message Passing**: No shared memory means no race conditions or locks +# 5. **Service Abstraction**: Forge hides Monarch complexity while preserving power +# +# Understanding this hierarchy helps you: +# +# * **Debug performance issues**: Is the bottleneck at service, actor, or hardware level? +# * **Optimize resource usage**: How many replicas per service? GPU vs CPU processes? +# * **Handle failures gracefully**: Which layer failed and how to recover? +# * **Scale effectively**: Where to add resources for maximum impact? + + +def demonstrate_architecture_benefits(): + """Example showing why the architecture matters.""" + # Process Isolation: Failures don't cascade + # If one PolicyActor crashes, others continue serving + + # Location Transparency: Same API whether local or remote + # await policy.generate.route(prompt) # Works same everywhere + + # Structured Distribution: ProcMesh maps to hardware + # per_host={"gpus": 8} creates 8 processes, 1 per GPU + + # Message Passing: No locks needed + # Each actor processes messages sequentially, naturally thread-safe + + # Service Abstraction: Simple interface, powerful backend + # await service.method.route() hides all distribution complexity + pass + + +###################################################################### +# Conclusion +# ---------- +# +# What You've Learned +# ~~~~~~~~~~~~~~~~~~~ +# +# 1. **RL Fundamentals**: How RL concepts map to Forge services with real examples +# 2. **Service Abstraction**: How to use Forge services effectively +# 3. **Monarch Foundation**: How Forge services connect to distributed actors and hardware +# +# Key Takeaways +# ~~~~~~~~~~~~~ +# +# * **Services hide complexity**: Your RL code looks like simple async functions, +# but runs on distributed clusters +# * **Communication patterns matter**: ``.route()``, ``.fanout()``, sessions, +# and ``.call_one()`` each serve specific purposes +# * **Architecture understanding helps**: Knowing the Service → Actor → Process → +# Hardware hierarchy helps you debug, optimize, and scale +# * **Always verify APIs**: This guide is verified, but cross-check with source +# code for latest changes +# * **Real API patterns**: Use ``.options().as_service()`` not ``spawn_service()``, +# use ``.route()`` not ``.choose()``, etc. 
+# +# Further Reading +# --------------- +# +# * Review :doc:`1_RL_and_Forge_Fundamentals` for RL concepts +# * Review :doc:`2_Forge_Internals` for service patterns +# * Check the `Forge source code `_ +# * Explore the `GRPO application `_ +# * Read about `Monarch `_ for deeper understanding diff --git a/docs/Tutorials/ReadMe.MD b/docs/source/tutorial_sources/zero-to-forge/README.md similarity index 57% rename from docs/Tutorials/ReadMe.MD rename to docs/source/tutorial_sources/zero-to-forge/README.md index 084710853..9be7e3b6a 100644 --- a/docs/Tutorials/ReadMe.MD +++ b/docs/source/tutorial_sources/zero-to-forge/README.md @@ -1,4 +1,4 @@ -## Zero to Forge: From RL Theory to Production-Scale Implementation +# Zero to Forge: From RL Theory to Production-Scale Implementation A comprehensive guide for ML Engineers building distributed RL systems for language models. @@ -6,14 +6,21 @@ Some of the examples mentioned below will be conceptual in nature for understand Welcome to the Tutorials section! This section is inspired by the A-Z PyTorch tutorial, shoutout to our PyTorch friends that remember! -### +## Tutorial Structure This section currently is structured in 3 detailed parts: -1. [RL Fundamentals and Understanding Forge Terminology](./1_RL_and_Forge_Fundamentals.MD): This gives a quick refresher of Reinforcement Learning and teaches you Forge Fundamentals -2. [Forge Internals](./2_Forge_Internals.MD): Goes a layer deeper and explains the internals of Forge -3. [Monarch 101](./3_Monarch_101.MD): It's a 101 to Monarch and how Forge Talks to Monarch +1. [RL Fundamentals and Understanding Forge Terminology](1_RL_and_Forge_Fundamentals.html): This gives a quick refresher of Reinforcement Learning and teaches you Forge Fundamentals +2. [Forge Internals](2_Forge_Internals.html): Goes a layer deeper and explains the internals of Forge +3. [Monarch 101](3_Monarch_101.html): It's a 101 to Monarch and how Forge Talks to Monarch -Each part builds upon the next and the entire section can be consumed in roughly an hour-Grab a Chai and Enjoy! +Each part builds upon the next and the entire section can be consumed in roughly an hour - Grab a Chai and Enjoy! -If you're eager, please checkout our SFT Tutorial too (Coming soon!) as well as [App Examples](../../apps/). +If you're eager, please checkout our SFT Tutorial too (Coming soon!)! + +.. toctree:: + :maxdepth: 1 + + 1_RL_and_Forge_Fundamentals + 2_Forge_Internals + 3_Monarch_101 diff --git a/docs/source/tutorials.md b/docs/source/tutorials.md index 6e06c636a..42339dfcb 100644 --- a/docs/source/tutorials.md +++ b/docs/source/tutorials.md @@ -1,9 +1,10 @@ # Tutorials - This section provides step-by-step guides to help you master TorchForge's capabilities, - from basic model fine-tuning to advanced distributed training scenarios. +This section provides step-by-step guides to help you master TorchForge's capabilities, +from basic model fine-tuning to advanced distributed training scenarios. ```{toctree} :maxdepth: 1 +zero-to-forge-intro ``` diff --git a/docs/source/zero-to-forge-intro.md b/docs/source/zero-to-forge-intro.md new file mode 100644 index 000000000..e56edc663 --- /dev/null +++ b/docs/source/zero-to-forge-intro.md @@ -0,0 +1,28 @@ +# Zero to Forge: From RL Theory to Production-Scale Implementation + +A comprehensive guide for ML Engineers building distributed RL systems for language models. + +Some of the examples mentioned below will be conceptual in nature for understanding. Please refer to API Docs (Coming Soon!) 
for more details + +Welcome to the Tutorials section! This section is inspired by the A-Z PyTorch tutorial, shoutout to our PyTorch friends that remember! + +## Tutorial Structure + +This section currently is structured in 3 detailed parts: + +1. [RL Fundamentals and Understanding Forge Terminology](tutorials/zero-to-forge/1_RL_and_Forge_Fundamentals.html): This gives a quick refresher of Reinforcement Learning and teaches you Forge Fundamentals +2. [Forge Internals](tutorials/zero-to-forge/2_Forge_Internals.html): Goes a layer deeper and explains the internals of Forge +3. [Monarch 101](tutorials/zero-to-forge/3_Monarch_101.html): It's a 101 to Monarch and how Forge Talks to Monarch + +Each part builds upon the next and the entire section can be consumed in roughly an hour - Grab a Chai and Enjoy! + +If you're eager, please checkout our SFT Tutorial too (Coming soon!)! + +```{toctree} +:maxdepth: 1 +:hidden: + +tutorials/zero-to-forge/1_RL_and_Forge_Fundamentals +tutorials/zero-to-forge/2_Forge_Internals +tutorials/zero-to-forge/3_Monarch_101 +``` From 6d2cb580bda321935a6947605f2f4fdb8b8e0d55 Mon Sep 17 00:00:00 2001 From: Svetlana Karslioglu Date: Tue, 14 Oct 2025 12:11:38 -0700 Subject: [PATCH 22/28] Update --- docs/source/tutorial_sources/zero-to-forge/README.md | 6 +++--- docs/source/zero-to-forge-intro.md | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/source/tutorial_sources/zero-to-forge/README.md b/docs/source/tutorial_sources/zero-to-forge/README.md index 9be7e3b6a..f32b01c20 100644 --- a/docs/source/tutorial_sources/zero-to-forge/README.md +++ b/docs/source/tutorial_sources/zero-to-forge/README.md @@ -10,9 +10,9 @@ Welcome to the Tutorials section! This section is inspired by the A-Z PyTorch tu This section currently is structured in 3 detailed parts: -1. [RL Fundamentals and Understanding Forge Terminology](1_RL_and_Forge_Fundamentals.html): This gives a quick refresher of Reinforcement Learning and teaches you Forge Fundamentals -2. [Forge Internals](2_Forge_Internals.html): Goes a layer deeper and explains the internals of Forge -3. [Monarch 101](3_Monarch_101.html): It's a 101 to Monarch and how Forge Talks to Monarch +1. [RL Fundamentals and Understanding Forge Terminology](1_RL_and_Forge_Fundamentals): This gives a quick refresher of Reinforcement Learning and teaches you Forge Fundamentals +2. [Forge Internals](2_Forge_Internals): Goes a layer deeper and explains the internals of Forge +3. [Monarch 101](3_Monarch_101): It's a 101 to Monarch and how Forge Talks to Monarch Each part builds upon the next and the entire section can be consumed in roughly an hour - Grab a Chai and Enjoy! diff --git a/docs/source/zero-to-forge-intro.md b/docs/source/zero-to-forge-intro.md index e56edc663..c7c31fdf1 100644 --- a/docs/source/zero-to-forge-intro.md +++ b/docs/source/zero-to-forge-intro.md @@ -10,9 +10,9 @@ Welcome to the Tutorials section! This section is inspired by the A-Z PyTorch tu This section currently is structured in 3 detailed parts: -1. [RL Fundamentals and Understanding Forge Terminology](tutorials/zero-to-forge/1_RL_and_Forge_Fundamentals.html): This gives a quick refresher of Reinforcement Learning and teaches you Forge Fundamentals -2. [Forge Internals](tutorials/zero-to-forge/2_Forge_Internals.html): Goes a layer deeper and explains the internals of Forge -3. [Monarch 101](tutorials/zero-to-forge/3_Monarch_101.html): It's a 101 to Monarch and how Forge Talks to Monarch +1. 
[RL Fundamentals and Understanding Forge Terminology](tutorials/zero-to-forge/1_RL_and_Forge_Fundamentals): This gives a quick refresher of Reinforcement Learning and teaches you Forge Fundamentals +2. [Forge Internals](tutorials/zero-to-forge/2_Forge_Internals): Goes a layer deeper and explains the internals of Forge +3. [Monarch 101](tutorials/zero-to-forge/3_Monarch_101): It's a 101 to Monarch and how Forge Talks to Monarch Each part builds upon the next and the entire section can be consumed in roughly an hour - Grab a Chai and Enjoy! From 42aef41e2623c3c155db9e8980ee061b4eea84ca Mon Sep 17 00:00:00 2001 From: Svetlana Karslioglu Date: Tue, 14 Oct 2025 12:31:37 -0700 Subject: [PATCH 23/28] Precommit --- .../1_RL_and_Forge_Fundamentals.py | 28 +++--- .../zero-to-forge/2_Forge_Internals.py | 86 ++++++++++--------- .../zero-to-forge/3_Monarch_101.py | 29 +++---- 3 files changed, 75 insertions(+), 68 deletions(-) diff --git a/docs/source/tutorial_sources/zero-to-forge/1_RL_and_Forge_Fundamentals.py b/docs/source/tutorial_sources/zero-to-forge/1_RL_and_Forge_Fundamentals.py index 08f7193c0..08c0f3335 100644 --- a/docs/source/tutorial_sources/zero-to-forge/1_RL_and_Forge_Fundamentals.py +++ b/docs/source/tutorial_sources/zero-to-forge/1_RL_and_Forge_Fundamentals.py @@ -32,7 +32,7 @@ # Core RL Components in Forge # ---------------------------- # -# Let's start with a simple math tutoring example to understand RL concepts +# Let's start with a simple math tutoring example to understand RL concepts # with the exact names Forge uses: # # The Toy Example: Teaching Math @@ -108,7 +108,7 @@ def conceptual_rl_step(): # From Concepts to Forge Services # -------------------------------- # -# Here's the key insight: **Each RL component becomes a Forge service**. +# Here's the key insight: **Each RL component becomes a Forge service**. # The toy example above maps directly to Forge: # # .. mermaid:: @@ -148,7 +148,7 @@ def conceptual_rl_step(): # RL Step with Forge Services # ---------------------------- # -# Let's look at the example from above again, but this time we use the +# Let's look at the example from above again, but this time we use the # actual Forge API names: import asyncio @@ -249,7 +249,7 @@ async def conceptual_forge_rl_step(services, step): # Enter Forge: RL-Native Architecture # ------------------------------------ # -# Forge solves these problems by treating each RL component as an +# Forge solves these problems by treating each RL component as an # **independent, distributed unit**. 
# # Quick API Reference (covered in detail in Part 2): @@ -322,8 +322,8 @@ async def example_automatic_management(policy): # For actual imports, see apps/grpo/main.py try: from forge.actors.policy import Policy - from forge.actors.replay_buffer import ReplayBuffer from forge.actors.reference_model import ReferenceModel + from forge.actors.replay_buffer import ReplayBuffer from forge.actors.trainer import RLTrainer from forge.data.rewards import MathReward, ThinkingReward @@ -406,7 +406,11 @@ async def setup_forge_services(): ), # Trainer actor with GPU RLTrainer.options(procs=1, with_gpus=True).as_actor( - model={"name": "qwen3", "flavor": "1.7B", "hf_assets_path": f"hf://{model}"}, + model={ + "name": "qwen3", + "flavor": "1.7B", + "hf_assets_path": f"hf://{model}", + }, optimizer={"name": "AdamW", "lr": 1e-5}, training={"local_batch_size": 2, "seq_len": 2048}, ), @@ -418,7 +422,11 @@ async def setup_forge_services(): ComputeAdvantages.options(procs=1).as_actor(), # Reference model with GPU ReferenceModel.options(procs=1, with_gpus=True).as_actor( - model={"name": "qwen3", "flavor": "1.7B", "hf_assets_path": f"hf://{model}"}, + model={ + "name": "qwen3", + "flavor": "1.7B", + "hf_assets_path": f"hf://{model}", + }, training={"dtype": "bfloat16"}, ), # Reward actor (CPU) @@ -444,12 +452,12 @@ async def setup_forge_services(): # # Forge has two types of distributed components: # -# * **Services**: Multiple replicas with automatic load balancing +# * **Services**: Multiple replicas with automatic load balancing # (like Policy, RewardActor) -# * **Actors**: Single instances that handle their own internal +# * **Actors**: Single instances that handle their own internal # distribution (like RLTrainer, ReplayBuffer) # -# We cover this distinction in detail in Part 2, but for now this +# We cover this distinction in detail in Part 2, but for now this # explains the scaling patterns: # # * Policy service: ``num_replicas=8`` for high inference demand diff --git a/docs/source/tutorial_sources/zero-to-forge/2_Forge_Internals.py b/docs/source/tutorial_sources/zero-to-forge/2_Forge_Internals.py index efecfdc72..6c4be76f6 100644 --- a/docs/source/tutorial_sources/zero-to-forge/2_Forge_Internals.py +++ b/docs/source/tutorial_sources/zero-to-forge/2_Forge_Internals.py @@ -27,10 +27,10 @@ * Understanding of Python async/await * Basic distributed systems knowledge -We highly recommend reading Part 1 before this - it explains RL Concepts +We highly recommend reading Part 1 before this - it explains RL Concepts and how they land in Forge. -Now that you see the power of the service abstraction, let's understand +Now that you see the power of the service abstraction, let's understand what's actually happening under the hood. Grab your chai! """ @@ -38,7 +38,7 @@ # Service Anatomy: Beyond the Interface # -------------------------------------- # -# When you call ``await policy_service.generate(question)``, here's what +# When you call ``await policy_service.generate(question)``, here's what # actually happens: # # (Don't worry, we will understand Services right in the next section!) @@ -100,7 +100,7 @@ # 2. Real Service Creation # ~~~~~~~~~~~~~~~~~~~~~~~~ # -# Services are created using the ``.options().as_service()`` pattern +# Services are created using the ``.options().as_service()`` pattern # from the actual GRPO implementation. # # The service creation automatically handles: @@ -152,11 +152,11 @@ async def example_service_creation(): # 3. 
How Services Actually Work # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # -# Forge services are implemented as ServiceActors that manage +# Forge services are implemented as ServiceActors that manage # collections of your ForgeActor replicas. # -# When you call ``.as_service()``, Forge creates a ``ServiceInterface`` -# that manages N replicas of your ``ForgeActor`` class and gives you +# When you call ``.as_service()``, Forge creates a ``ServiceInterface`` +# that manages N replicas of your ``ForgeActor`` class and gives you # methods like ``.route()``, ``.fanout()``, etc. @@ -215,14 +215,14 @@ async def service_interface_example(policy): # Deep Dive: Service Communication Patterns # ------------------------------------------ # -# These communication patterns ("adverbs") determine how your service -# calls are routed to replicas. Understanding when to use each pattern +# These communication patterns ("adverbs") determine how your service +# calls are routed to replicas. Understanding when to use each pattern # is key to effective Forge usage. # # 1. ``.route()`` - Load Balanced Single Replica # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # -# **When to use**: Normal request routing where any replica can handle +# **When to use**: Normal request routing where any replica can handle # the request. @@ -246,7 +246,7 @@ async def route_example(policy): # * **Throughput**: Limited by single replica capacity # * **Fault tolerance**: Automatic failover to other replicas # -# **Critical insight**: ``.route()`` is your default choice for +# **Critical insight**: ``.route()`` is your default choice for # stateless operations in Forge services. ###################################################################### @@ -273,7 +273,7 @@ async def fanout_example(policy): # * **Throughput**: Network bandwidth × number of replicas # * **Fault tolerance**: Fails if ANY replica fails (unless configured) # -# **Critical gotcha**: Don't use ``.fanout()`` for high-frequency +# **Critical gotcha**: Don't use ``.fanout()`` for high-frequency # operations - it contacts all replicas. ###################################################################### @@ -306,18 +306,18 @@ async def streaming_pattern_example(replay_buffer, trainer, step): # * **Throughput**: Non-blocking async operations # * **Fault tolerance**: Continues if some replicas fail # -# **Critical insight**: Essential for high-throughput RL where +# **Critical insight**: Essential for high-throughput RL where # you can't wait for batches. ###################################################################### # Service Sessions for Stateful Operations # ----------------------------------------- # -# **When to use**: When you need multiple calls to hit the same replica +# **When to use**: When you need multiple calls to hit the same replica # (like KV cache preservation). # -# **What are sticky sessions?** A session ensures all your service calls -# within the ``async with`` block go to the same replica, instead of +# **What are sticky sessions?** A session ensures all your service calls +# within the ``async with`` block go to the same replica, instead of # being load-balanced across different replicas. # Mock classes for example @@ -396,13 +396,13 @@ async def with_sessions_example(): # Deep Dive: State Management Reality # ------------------------------------ # -# The most complex challenge in distributed RL is maintaining state +# The most complex challenge in distributed RL is maintaining state # consistency while maximizing performance. 
# # The KV Cache Problem # ~~~~~~~~~~~~~~~~~~~~ # -# **The challenge**: Policy inference is much faster with KV cache, +# **The challenge**: Policy inference is much faster with KV cache, # but cache is tied to specific conversation history. @@ -415,9 +415,7 @@ async def naive_multi_turn(policy_service): full_prompt = question1 + response1[0].text response2 = await policy_service.generate.route(prompt=full_prompt) # Cache miss! conversation = full_prompt + response2[0].text - response3 = await policy_service.generate.route( - prompt=conversation - ) # Cache miss! + response3 = await policy_service.generate.route(prompt=conversation) # Cache miss! async def optimized_multi_turn(policy): @@ -434,17 +432,17 @@ async def optimized_multi_turn(policy): # Session ends, replica can be garbage collected or reused -# **Performance impact**: Maintaining KV cache across turns avoids +# **Performance impact**: Maintaining KV cache across turns avoids # recomputing previous tokens. ###################################################################### # Replay Buffer Consistency # ~~~~~~~~~~~~~~~~~~~~~~~~~~ # -# **The challenge**: Multiple trainers and experience collectors +# **The challenge**: Multiple trainers and experience collectors # reading/writing concurrently. # -# **Real Forge approach**: The ReplayBuffer actor handles concurrency +# **Real Forge approach**: The ReplayBuffer actor handles concurrency # internally: @@ -466,14 +464,14 @@ async def replay_buffer_example(replay_buffer): # state = await replay_buffer.state_dict.call_one() # Checkpoint -# **Critical insight**: The actor model provides natural thread safety - +# **Critical insight**: The actor model provides natural thread safety - # each actor processes messages sequentially. ###################################################################### # Weight Synchronization Strategy # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # -# **The challenge**: Trainer updates policy weights, but policy service +# **The challenge**: Trainer updates policy weights, but policy service # needs those weights. import torch @@ -497,18 +495,24 @@ async def real_weight_sync(trainer, policy, step): # Deep Dive: Asynchronous Coordination Patterns # ---------------------------------------------- # -# **The real challenge**: Different services run at different speeds, +# **The real challenge**: Different services run at different speeds, # but Forge's service abstraction handles the coordination complexity. # # The Forge Approach: Let Services Handle Coordination # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # -# Instead of manual coordination, Forge services handle speed mismatches +# Instead of manual coordination, Forge services handle speed mismatches # automatically: async def simple_rl_step( - dataloader, policy, ref_model, reward_actor, replay_buffer, compute_advantages, trainer + dataloader, + policy, + ref_model, + reward_actor, + replay_buffer, + compute_advantages, + trainer, ): """Simple RL step showing service coordination.""" # ===== Generate a rollout ===== @@ -561,24 +565,17 @@ async def simple_rl_step( # Handling Speed Mismatches with Service Scaling # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # -# **The insight**: Scale services independently based on their +# **The insight**: Scale services independently based on their # bottlenecks. 
# Mock imports for example try: - from forge.actors.policy import Policy from forge.actors.trainer import RLTrainer except ImportError: - class Policy: - pass - class RLTrainer: pass - class RewardActor: - pass - async def scaling_example(): """Scale services independently based on bottlenecks.""" @@ -598,7 +595,8 @@ async def scaling_example(): trainer = await RLTrainer.options( procs=1, with_gpus=True # Fewer but GPU-heavy ).as_actor( # Trainer typically uses .as_actor() not .as_service() - model={"name": "qwen3", "flavor": "1.7B"}, optimizer={"name": "AdamW", "lr": 1e-5} + model={"name": "qwen3", "flavor": "1.7B"}, + optimizer={"name": "AdamW", "lr": 1e-5}, ) @@ -611,8 +609,8 @@ async def scaling_example(): # Mock imports try: from forge.controller import ForgeActor - from monarch.actor import endpoint from forge.data.rewards import MathReward, ThinkingReward + from monarch.actor import endpoint except ImportError: class ForgeActor: @@ -647,7 +645,9 @@ async def evaluate_response(self, prompt: str, response: str, target: str) -> fl total_reward += reward # Return average reward across all functions - return total_reward / len(self.reward_functions) if self.reward_functions else 0.0 + return ( + total_reward / len(self.reward_functions) if self.reward_functions else 0.0 + ) async def reward_service_example(): @@ -703,7 +703,9 @@ async def production_training_loop(): # Reward evaluation service call reward = await reward_actor.evaluate_response.route( - prompt=sample["question"], response=responses[0].text, target=sample["answer"] + prompt=sample["question"], + response=responses[0].text, + target=sample["answer"], ) # Experience storage @@ -734,7 +736,7 @@ async def production_training_loop(): # 4. **Resource efficiency**: CPU and GPU services scale independently # 5. **Coordination**: Services coordinate through shared state (replay buffer, weight versions) # -# This is the power of the service abstraction - complex distributed +# This is the power of the service abstraction - complex distributed # coordination looks like simple async Python code. ###################################################################### diff --git a/docs/source/tutorial_sources/zero-to-forge/3_Monarch_101.py b/docs/source/tutorial_sources/zero-to-forge/3_Monarch_101.py index d7c8f86e8..09e61e154 100644 --- a/docs/source/tutorial_sources/zero-to-forge/3_Monarch_101.py +++ b/docs/source/tutorial_sources/zero-to-forge/3_Monarch_101.py @@ -32,8 +32,8 @@ * Part 1: [RL Concepts and how they map to Forge](1_RL_and_Forge_Fundamentals) * Part 2: [Forge Internals](2_Forge_Internals) -Now let's peel back the layers. Forge services are built on top of -**Monarch**, PyTorch's distributed actor framework. Understanding this +Now let's peel back the layers. Forge services are built on top of +**Monarch**, PyTorch's distributed actor framework. Understanding this connection is crucial for optimization and debugging. """ @@ -83,8 +83,8 @@ # Deep Dive: ProcMesh - The Foundation # ------------------------------------- # -# **ProcMesh** is Monarch's core abstraction for organizing processes -# across hardware. Think of it as a multi-dimensional grid that maps +# **ProcMesh** is Monarch's core abstraction for organizing processes +# across hardware. Think of it as a multi-dimensional grid that maps # directly to your cluster topology. # # Single Host ProcMesh @@ -175,12 +175,9 @@ # # This shows the underlying actor system that powers Forge services. 
-import asyncio - # Mock imports for documentation build try: - from monarch.actor import Actor, endpoint, this_proc, Future - from monarch.actor import ProcMesh, this_host + from monarch.actor import Actor, endpoint, Future, ProcMesh, this_host, this_proc except ImportError: class Actor: @@ -264,7 +261,7 @@ async def distributed_actors_example(): # Actor Meshes: Your Code Running Distributed # -------------------------------------------- # -# **ActorMesh** is created when you spawn actors across a ProcMesh. +# **ActorMesh** is created when you spawn actors across a ProcMesh. # Each process in the ProcMesh gets one instance of your actor. # # .. mermaid:: @@ -352,7 +349,7 @@ async def distributed_actors_example(): # How Forge Services Use Monarch # ------------------------------- # -# Now the key insight: **Forge services are ServiceActors that manage +# Now the key insight: **Forge services are ServiceActors that manage # ActorMeshes of your ForgeActor replicas**. # # The Service Creation Process @@ -447,7 +444,7 @@ async def distributed_actors_example(): # Multiple Services Sharing Infrastructure # ----------------------------------------- # -# In real RL systems, you have multiple services that can share or use +# In real RL systems, you have multiple services that can share or use # separate ProcMeshes: # # .. mermaid:: @@ -551,15 +548,15 @@ def demonstrate_architecture_benefits(): # Key Takeaways # ~~~~~~~~~~~~~ # -# * **Services hide complexity**: Your RL code looks like simple async functions, +# * **Services hide complexity**: Your RL code looks like simple async functions, # but runs on distributed clusters -# * **Communication patterns matter**: ``.route()``, ``.fanout()``, sessions, +# * **Communication patterns matter**: ``.route()``, ``.fanout()``, sessions, # and ``.call_one()`` each serve specific purposes -# * **Architecture understanding helps**: Knowing the Service → Actor → Process → +# * **Architecture understanding helps**: Knowing the Service → Actor → Process → # Hardware hierarchy helps you debug, optimize, and scale -# * **Always verify APIs**: This guide is verified, but cross-check with source +# * **Always verify APIs**: This guide is verified, but cross-check with source # code for latest changes -# * **Real API patterns**: Use ``.options().as_service()`` not ``spawn_service()``, +# * **Real API patterns**: Use ``.options().as_service()`` not ``spawn_service()``, # use ``.route()`` not ``.choose()``, etc. # # Further Reading From 0296c34bda43fdb0d681922059bdddb974503776 Mon Sep 17 00:00:00 2001 From: Svetlana Karslioglu Date: Tue, 14 Oct 2025 14:09:02 -0700 Subject: [PATCH 24/28] Update --- .../1_RL_and_Forge_Fundamentals.py | 145 ++--- .../zero-to-forge/2_Forge_Internals.py | 563 +++++++++--------- 2 files changed, 343 insertions(+), 365 deletions(-) diff --git a/docs/source/tutorial_sources/zero-to-forge/1_RL_and_Forge_Fundamentals.py b/docs/source/tutorial_sources/zero-to-forge/1_RL_and_Forge_Fundamentals.py index 08c0f3335..cbdb1fe5f 100644 --- a/docs/source/tutorial_sources/zero-to-forge/1_RL_and_Forge_Fundamentals.py +++ b/docs/source/tutorial_sources/zero-to-forge/1_RL_and_Forge_Fundamentals.py @@ -191,6 +191,15 @@ async def conceptual_forge_rl_step(services, step): return loss +###################################################################### +# **Key difference**: Same RL logic, but each component is now a distributed, +# # fault-tolerant, auto-scaling service. + +# Did you realise-we are not worrying about any Infra code here! 
Forge +# # Automagically handles the details behind the scenes and you can focus on +# # writing your RL Algorthms! + + ###################################################################### # Why This Matters: Traditional ML Infrastructure Fails @@ -296,6 +305,10 @@ async def real_rl_training_step(services, step): return loss +##################################################################### +# **Key insight**: Each line of RL pseudocode becomes a service call. +# The complexity of distribution, scaling, and fault tolerance is hidden +# behind these simple interfaces. ###################################################################### # What Makes This Powerful @@ -311,6 +324,15 @@ async def example_automatic_management(policy): answer = responses[0].text return answer +###################################################################### +# Forge handles behind the scenes: +# +# - Routing to least loaded replica +# - GPU memory management +# - Batch optimization +# - Failure recovery +# - Auto-scaling based on demand + ###################################################################### # Independent Scaling @@ -320,67 +342,27 @@ async def example_automatic_management(policy): # Note: This is example code showing the Forge API # For actual imports, see apps/grpo/main.py -try: - from forge.actors.policy import Policy - from forge.actors.reference_model import ReferenceModel - from forge.actors.replay_buffer import ReplayBuffer - from forge.actors.trainer import RLTrainer - from forge.data.rewards import MathReward, ThinkingReward - - # Mock classes for the example - class DatasetActor: - pass - - class RewardActor: - pass - - class ComputeAdvantages: - pass - -except ImportError: - # Provide mock classes if imports fail during doc build - class Policy: - pass - - class ReplayBuffer: - pass - - class ReferenceModel: - pass - - class RLTrainer: - pass - - class DatasetActor: - pass - - class RewardActor: - pass - - class ComputeAdvantages: - pass - - class MathReward: - pass - - class ThinkingReward: - pass - - -async def setup_forge_services(): - """Configure Forge services with independent scaling.""" - model = "Qwen/Qwen3-1.7B" - group_size = 1 - - ( - dataloader, - policy, - trainer, - replay_buffer, - compute_advantages, - ref_model, - reward_actor, - ) = await asyncio.gather( +from forge.actors.policy import Policy +from forge.actors.replay_buffer import ReplayBuffer +from forge.actors.reference_model import ReferenceModel +from forge.actors.trainer import RLTrainer +from apps.grpo.main import DatasetActor, RewardActor, ComputeAdvantages +from forge.data.rewards import MathReward, ThinkingReward +import asyncio +import torch + +model = "Qwen/Qwen3-1.7B" +group_size = 1 + +( + dataloader, + policy, + trainer, + replay_buffer, + compute_advantages, + ref_model, + reward_actor, +) = await asyncio.gather( # Dataset actor (CPU) DatasetActor.options(procs=1).as_actor( path="openai/gsm8k", @@ -389,62 +371,47 @@ async def setup_forge_services(): streaming=True, model=model, ), - # Policy service with GPU and multiple replicas + # Policy service with GPU Policy.options(procs=1, with_gpus=True, num_replicas=1).as_service( engine_config={ "model": model, "tensor_parallel_size": 1, "pipeline_parallel_size": 1, - "enforce_eager": False, + "enforce_eager": False }, sampling_config={ "n": group_size, "max_tokens": 16, "temperature": 1.0, - "top_p": 1.0, - }, + "top_p": 1.0 + } ), # Trainer actor with GPU RLTrainer.options(procs=1, with_gpus=True).as_actor( - model={ - "name": "qwen3", - 
"flavor": "1.7B", - "hf_assets_path": f"hf://{model}", - }, + # Trainer config would come from YAML in real usage + model={"name": "qwen3", "flavor": "1.7B", "hf_assets_path": f"hf://{model}"}, optimizer={"name": "AdamW", "lr": 1e-5}, - training={"local_batch_size": 2, "seq_len": 2048}, + training={"local_batch_size": 2, "seq_len": 2048} ), # Replay buffer (CPU) ReplayBuffer.options(procs=1).as_actor( - batch_size=2, max_policy_age=1, dp_size=1 + batch_size=2, + max_policy_age=1, + dp_size=1 ), # Advantage computation (CPU) ComputeAdvantages.options(procs=1).as_actor(), # Reference model with GPU ReferenceModel.options(procs=1, with_gpus=True).as_actor( - model={ - "name": "qwen3", - "flavor": "1.7B", - "hf_assets_path": f"hf://{model}", - }, - training={"dtype": "bfloat16"}, + model={"name": "qwen3", "flavor": "1.7B", "hf_assets_path": f"hf://{model}"}, + training={"dtype": "bfloat16"} ), # Reward actor (CPU) RewardActor.options(procs=1, num_replicas=1).as_service( reward_functions=[MathReward(), ThinkingReward()] - ), + ) ) - return { - "dataloader": dataloader, - "policy": policy, - "trainer": trainer, - "replay_buffer": replay_buffer, - "compute_advantages": compute_advantages, - "ref_model": ref_model, - "reward_actor": reward_actor, - } - ###################################################################### # Forge Components: Services vs Actors diff --git a/docs/source/tutorial_sources/zero-to-forge/2_Forge_Internals.py b/docs/source/tutorial_sources/zero-to-forge/2_Forge_Internals.py index 6c4be76f6..0a1e57c91 100644 --- a/docs/source/tutorial_sources/zero-to-forge/2_Forge_Internals.py +++ b/docs/source/tutorial_sources/zero-to-forge/2_Forge_Internals.py @@ -27,8 +27,8 @@ * Understanding of Python async/await * Basic distributed systems knowledge -We highly recommend reading Part 1 before this - it explains RL Concepts -and how they land in Forge. +We highly recommend completing Part 1 before starting this tutorial. +Part 1 explains RL Concepts and how they land in Forge. Now that you see the power of the service abstraction, let's understand what's actually happening under the hood. Grab your chai! @@ -85,16 +85,19 @@ # 1. Real Service Configuration # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # -# Here's the actual ServiceConfig from Forge source code: +# Here's the actual ``ServiceConfig`` from Forge source code: # Configuration pattern from apps/grpo/main.py: -# Policy.options( -# procs=1, # Processes per replica -# num_replicas=4, # Number of replicas -# with_gpus=True # Allocate GPUs -# # Other available options: -# # hosts=None # the number of remote hosts used per replica -# ) +# +# .. code-block:: python +# +# Policy.options( +# procs=1, # Processes per replica +# num_replicas=4, # Number of replicas +# with_gpus=True # Allocate GPUs +# # Other available options: +# # hosts=None # the number of remote hosts used per replica +# ) ###################################################################### # 2. 
Real Service Creation @@ -106,65 +109,56 @@ # The service creation automatically handles: # # * Spawning actor replicas across processes/GPUs -# * Load balancing with .route() method for services +# * Load balancing with ``.route()`` method for services # * Health monitoring and failure recovery # * Message routing and serialization -import asyncio - -# Mock imports for documentation build -try: - from forge.actors.policy import Policy -except ImportError: - - class Policy: - pass - - -async def example_service_creation(): - """Example of creating a Policy service.""" - model = "Qwen/Qwen3-1.7B" - - policy = await Policy.options(procs=1, with_gpus=True, num_replicas=1).as_service( - engine_config={ - "model": model, - "tensor_parallel_size": 1, - "pipeline_parallel_size": 1, - "enforce_eager": False, - }, - sampling_config={ - "n": 1, - "max_tokens": 16, - "temperature": 1.0, - "top_p": 1.0, - }, - ) +from forge.actors.policy import Policy + +model = "Qwen/Qwen3-1.7B" + +policy = await Policy.options( + procs=1, + with_gpus=True, + num_replicas=1 +).as_service( + engine_config={ + "model": model, + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "enforce_eager": False + }, + sampling_config={ + "n": 1, + "max_tokens": 16, + "temperature": 1.0, + "top_p": 1.0 + } +) - prompt = "What is 3 + 5?" - responses = await policy.generate.route(prompt) - print(f"Response: {responses[0].text}") +prompt = "What is 3 + 5?" +responses = await policy.generate.route(prompt) +print(f"Response: {responses[0].text}") - # Cleanup when done - await policy.shutdown() +# Cleanup when done +await policy.shutdown() ###################################################################### # 3. How Services Actually Work # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # -# Forge services are implemented as ServiceActors that manage -# collections of your ForgeActor replicas. +# Forge services are implemented as ``ServiceActors`` that manage +# collections of your ``ForgeActor`` replicas. # # When you call ``.as_service()``, Forge creates a ``ServiceInterface`` # that manages N replicas of your ``ForgeActor`` class and gives you # methods like ``.route()``, ``.fanout()``, etc. -async def service_interface_example(policy): - """Your code sees this simple interface.""" - # Simple call - but Forge handles all complexity - responses = await policy.generate.route(prompt="What is 2+2?") - # Forge handles: replica management, load balancing, fault tolerance +# Your code sees this simple interface: +responses = await policy.generate.route(prompt=prompt) +# But Forge handles all the complexity of replica management, load balancing, and fault tolerance ###################################################################### @@ -225,16 +219,12 @@ async def service_interface_example(policy): # **When to use**: Normal request routing where any replica can handle # the request. +responses = await policy.generate.route(prompt=question) +answer = responses[0].text # Extract text from Completion object -async def route_example(policy): - """Using .route() for load-balanced requests.""" - question = "What is 2+2?" - responses = await policy.generate.route(prompt=question) - answer = responses[0].text # Extract text from Completion object - return answer - - +###################################################################### # Behind the scenes: +# # 1. Health check eliminates failed replicas # 2. Load balancer picks replica (currently round robin) # 3. 
Request routes to that specific replica @@ -256,15 +246,13 @@ async def route_example(policy): # **When to use**: You need responses from ALL replicas. -async def fanout_example(policy): - """Using .fanout() to broadcast to all replicas.""" - # Get version from all policy replicas - current_versions = await policy.get_version.fanout() - # Returns: [version_replica_1, version_replica_2, ...] +# Get version from all policy replicas +current_versions = await policy.get_version.fanout() +# Returns: [version_replica_1, version_replica_2, ...] - # Update weights on all replicas - await policy.update_weights.fanout(new_policy_version=1) - # Broadcasts to all replicas simultaneously +# Update weights on all replicas +await policy.update_weights.fanout(new_policy_version) +# Broadcasts to all replicas simultaneously # **Performance characteristics**: @@ -281,25 +269,22 @@ async def fanout_example(policy): # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # # **When to use**: You want to process results as they arrive. +# +# Streaming requires custom implementation in your training loop. +# The basic ``ReplayBuffer`` doesn't have built-in streaming methods. +# Pattern from apps/grpo/main.py continuous training: +while training: + # This is the real API call pattern + batch = await replay_buffer.sample.call_one(curr_policy_version=step) + if batch is not None: + # Process batch immediately + loss = await trainer.train_step.call_one(batch) + print(f"Training loss: {loss}") + else: + await asyncio.sleep(0.1) # Wait for more data -async def streaming_pattern_example(replay_buffer, trainer, step): - """Streaming pattern for continuous training.""" - # CONCEPTUAL - Streaming requires custom implementation - # Pattern from apps/grpo/main.py continuous training: - - while True: - # This is the real API call pattern - batch = await replay_buffer.sample.call_one(curr_policy_version=step) - if batch is not None: - # Process batch immediately - loss = await trainer.train_step.call_one(batch) - print(f"Training loss: {loss}") - else: - await asyncio.sleep(0.1) # Wait for more data - break # Just for example - - +###################################################################### # **Performance characteristics**: # # * **Latency**: Process first result immediately @@ -319,23 +304,12 @@ async def streaming_pattern_example(replay_buffer, trainer, step): # **What are sticky sessions?** A session ensures all your service calls # within the ``async with`` block go to the same replica, instead of # being load-balanced across different replicas. 
+# This Counter example demonstrates the difference between regular routing and sessions: -# Mock classes for example -try: - from forge.controller import ForgeActor - from monarch.actor import endpoint -except ImportError: - - class ForgeActor: - pass - - def endpoint(func): - return func - +from forge.controller import ForgeActor +from monarch.actor import endpoint class ForgeCounter(ForgeActor): - """Example counter to demonstrate sessions.""" - def __init__(self, initial_value: int): self.value = initial_value @@ -352,45 +326,47 @@ def get_value(self) -> int: async def reset(self): self.value = 0 +counter_service = await ForgeCounter.options( + procs=1, num_replicas=4 +).as_service(initial_value=0) -async def without_sessions_example(): - """WITHOUT SESSIONS: Each .route() goes to different replica.""" - counter_service = await ForgeCounter.options(procs=1, num_replicas=4).as_service( - initial_value=0 - ) - - # Each call might go to different replica - await counter_service.increment.route() # Might go to replica 2 - await counter_service.increment.route() # Might go to replica 1 - await counter_service.increment.route() # Might go to replica 3 +# WITHOUT SESSIONS: Each .route() call goes to a different replica +await counter_service.increment.route() # Might go to replica 2 +await counter_service.increment.route() # Might go to replica 1 +await counter_service.increment.route() # Might go to replica 3 - results = await counter_service.increment.fanout() - print(f"All replica values: {results}") - # Output: All replica values: [1, 2, 1, 1] - # Each replica has different state! +results = await counter_service.increment.fanout() # Get from all replicas +print(f"All replica values: {results}") +# Output: All replica values: [1, 2, 1, 1] - Each replica has different state! - await counter_service.shutdown() +###################################################################### +# The problem: each `.route()` call can go to different replicas, creating inconsistent state. 
+# WITH SESSIONS: All calls go to the SAME replica +print("\nUsing sticky sessions:") +async with counter_service.session(): # Creates a session that picks one replica + await counter_service.reset.route() # Uses .route() within session + print(await counter_service.increment.route()) # 1 + print(await counter_service.increment.route()) # 2 + print(await counter_service.increment.route()) # 3 -async def with_sessions_example(): - """WITH SESSIONS: All calls go to the SAME replica.""" - counter_service = await ForgeCounter.options(procs=1, num_replicas=4).as_service( - initial_value=0 - ) + final_value = await counter_service.get_value.route() + print(f"Final value on this replica: {final_value}") # 3 - print("\nUsing sticky sessions:") - async with counter_service.session(): - await counter_service.reset.route() - print(await counter_service.increment.route()) # 1 - print(await counter_service.increment.route()) # 2 - print(await counter_service.increment.route()) # 3 +###################################################################### +# Same pattern works with Policy for multi-turn conversations: - final_value = await counter_service.get_value.route() - print(f"Final value on this replica: {final_value}") # 3 +async with policy.session(): + response1 = await policy.generate.route(turn1) + full_prompt = turn1 + response1[0].text + turn2 + response2 = await policy.generate.route(full_prompt) + # Both calls hit same replica, preserving KV cache - # Cleanup - await counter_service.shutdown() +# Cleanup +await counter_service.shutdown() +###################################################################### +# **Performance impact**: Critical for maintaining KV cache in multi-turn conversations. ###################################################################### # Deep Dive: State Management Reality @@ -405,17 +381,15 @@ async def with_sessions_example(): # **The challenge**: Policy inference is much faster with KV cache, # but cache is tied to specific conversation history. - -async def naive_multi_turn(policy_service): - """This breaks KV cache optimization.""" - question1 = "What is 2+2?" - +# This breaks KV cache optimization: +async def naive_multi_turn(): # Each call might go to different replica = cache miss - response1 = await policy_service.generate.route(prompt=question1) - full_prompt = question1 + response1[0].text - response2 = await policy_service.generate.route(prompt=full_prompt) # Cache miss! - conversation = full_prompt + response2[0].text - response3 = await policy_service.generate.route(prompt=conversation) # Cache miss! + response1 = await policy_service.generate.choose(question1) + response2 = await policy_service.generate.choose(question1 + response1) # Cache miss! + response3 = await policy_service.generate.choose(conversation_so_far) # Cache miss! + +###################################################################### +# **The solution**: Sticky sessions ensure all calls go to same replica. async def optimized_multi_turn(policy): @@ -431,7 +405,7 @@ async def optimized_multi_turn(policy): # Session ends, replica can be garbage collected or reused - +###################################################################### # **Performance impact**: Maintaining KV cache across turns avoids # recomputing previous tokens. 
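+
+######################################################################
+# To make that claim concrete, here is a minimal back-of-envelope
+# sketch. ``estimate_prefill_tokens`` and the per-turn token counts are
+# illustrative assumptions, not measured Forge numbers; the point is
+# only that a cold cache re-prefills the whole history every turn.
+
+
+def estimate_prefill_tokens(tokens_added_per_turn, cache_reused: bool) -> int:
+    """Total prompt tokens the policy must prefill over a conversation."""
+    total, history = 0, 0
+    for new_tokens in tokens_added_per_turn:
+        # Warm KV cache (same replica): only the new tokens are prefilled.
+        # Cold cache (different replica): the full history is recomputed.
+        total += new_tokens if cache_reused else history + new_tokens
+        history += new_tokens
+    return total
+
+
+turns = [120, 80, 80, 60]  # hypothetical prompt growth per turn
+print(estimate_prefill_tokens(turns, cache_reused=True))  # 340 tokens
+print(estimate_prefill_tokens(turns, cache_reused=False))  # 940 tokens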
@@ -445,23 +419,20 @@ async def optimized_multi_turn(policy): # **Real Forge approach**: The ReplayBuffer actor handles concurrency # internally: +# Forge ReplayBuffer endpoints (verified from source code) +# Add episodes (thread-safe by actor model) +await replay_buffer.add.call_one(episode) # .choose() would work too, but .call_one() clarifies it's a singleton actor not ActorMesh -async def replay_buffer_example(replay_buffer): - """ReplayBuffer provides thread-safe operations.""" - # Add episodes (thread-safe by actor model) - episode = {} # Mock episode - await replay_buffer.add.call_one(episode) - - # Sample batches for training - batch = await replay_buffer.sample.call_one( - curr_policy_version=0, - batch_size=None, # Optional, uses default from config - ) +# Sample batches for training +batch = await replay_buffer.sample.call_one( + curr_policy_version=step_number, + batch_size=None # Optional parameter, uses default from config +) - # Additional methods available: - # await replay_buffer.clear.call_one() # Clear buffer - # await replay_buffer.evict.call_one(curr_policy_version) # Remove old - # state = await replay_buffer.state_dict.call_one() # Checkpoint +# Additional methods available: +# await replay_buffer.clear.call_one() # Clear buffer +# await replay_buffer.evict.call_one(curr_policy_version) # Remove old episodes +# state = await replay_buffer.state_dict.call_one() # Get state for checkpointing # **Critical insight**: The actor model provides natural thread safety - @@ -474,11 +445,8 @@ async def replay_buffer_example(replay_buffer): # **The challenge**: Trainer updates policy weights, but policy service # needs those weights. -import torch - - +# Forge weight synchronization pattern from apps/grpo/main.py async def real_weight_sync(trainer, policy, step): - """Forge weight synchronization pattern from apps/grpo/main.py.""" # Trainer pushes weights to TorchStore with version number await trainer.push_weights.call_one(policy_version=step + 1) @@ -486,10 +454,9 @@ async def real_weight_sync(trainer, policy, step): # Use .fanout() to update ALL policy replicas await policy.update_weights.fanout(policy_version=step + 1) - # Check current policy version - current_version = await policy.get_version.route() - print(f"Current policy version: {current_version}") - +# Check current policy version +current_version = await policy.get_version.route() +print(f"Current policy version: {current_version}") ###################################################################### # Deep Dive: Asynchronous Coordination Patterns @@ -505,24 +472,18 @@ async def real_weight_sync(trainer, policy, step): # automatically: -async def simple_rl_step( - dataloader, - policy, - ref_model, - reward_actor, - replay_buffer, - compute_advantages, - trainer, -): - """Simple RL step showing service coordination.""" +from apps.grpo.main import Episode, Group + +async def simple_rl_step(): + # ===== Generate a rollout ===== - sample = await dataloader.sample.call_one() - prompt, target = sample["request"], sample["target"] + sample = await dataloader.sample.call_one() # DatasetActor is an actor, not service + prompt, target = sample["request"], sample["target"] # Correct field names print(f"Prompt: {prompt}") print(f"Target: {target}") - actions = await policy.generate.route(prompt=prompt) + actions = await policy.generate.route(prompt=prompt) # Policy is a service print(f"Policy response: {actions[0].text}") # Create input tensor for reference model (requires full context) @@ -530,36 +491,60 @@ async def 
simple_rl_step( ref_logprobs = await ref_model.forward.route( input_ids.unsqueeze(0), max_req_tokens=512, return_logprobs=True ) - - reward = await reward_actor.evaluate_response.route( - prompt=prompt, response=actions[0].text, target=target + reward = await reward_actor.evaluate_response.route( # RewardActor is a service + prompt=prompt, + response=actions[0].text, + target=target ) print(f"Reward: {reward}") - # Create episode (simplified for example) - episode = { - "episode_id": "0", - "request": prompt, - "response": actions[0].text, - "reward": reward, - "ref_logprobs": ref_logprobs[0], - } + # Create episode using actual GRPO Episode structure + episode = Episode( + episode_id="0", + request=prompt, + policy_version=0, + pad_id=tokenizer.pad_token_id, + request_len=512, + response_len=512, + target=target + ) - await replay_buffer.add.call_one(episode) + # Add response data + episode.response = actions[0].text + episode.request_tokens = actions[0].prompt_ids.tolist() + episode.response_tokens = actions[0].token_ids.tolist() + episode.ref_logprobs = ref_logprobs[0] # Extract from batch dimension + episode.reward = reward + + # Compute advantages using actual ComputeAdvantages actor + group = Group.new_group(0, 1, prompt, 0, tokenizer.pad_token_id, 512, 512, target) + group.episodes[0] = episode + advantages = await compute_advantages.compute.call_one(group) # ComputeAdvantages is an actor + episode.advantage = advantages[0] + print(f"Advantage: {advantages[0]}") + await replay_buffer.add.call_one(episode) # ReplayBuffer is an actor print("Episode stored in replay buffer") # ===== Train on the batch ===== batch = await replay_buffer.sample.call_one(curr_policy_version=0) if batch is not None: print("Training on batch...") - inputs, targets = batch - loss = await trainer.train_step.call(inputs, targets) + inputs, targets = batch # GRPO returns (inputs, targets) tuple + loss = await trainer.train_step.call(inputs, targets) # RLTrainer is an actor print(f"Training loss: {loss}") return loss else: print("Not enough data in buffer yet") return None +# Note: This simplified example assumes tokenizer and services are already initialized +for step in range(10): + print(f"\n--- RL Step {step + 1} ---") + loss = await simple_rl_step() + if loss: + print(f"Step {step + 1} complete, loss: {loss:.4f}") + else: + print(f"Step {step + 1} complete, building buffer...") ###################################################################### # Handling Speed Mismatches with Service Scaling @@ -568,36 +553,27 @@ async def simple_rl_step( # **The insight**: Scale services independently based on their # bottlenecks. 
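+
+######################################################################
+# Before touching configs, a rough capacity estimate is usually enough
+# to pick replica counts. This is a minimal sketch: ``replicas_needed``,
+# the 32 episodes/sec target, and the per-replica rates are illustrative
+# assumptions, not Forge APIs or measurements.
+
+import math
+
+
+def replicas_needed(target_rps: float, per_replica_rps: float) -> int:
+    """Round up so a stage keeps pace with the target episode rate."""
+    return math.ceil(target_rps / per_replica_rps)
+
+
+print(replicas_needed(32, per_replica_rps=4.0))  # generation-bound policy -> 8
+print(replicas_needed(32, per_replica_rps=2.0))  # CPU-bound reward scoring -> 16
+
+# Those counts mirror the num_replicas=8 / num_replicas=16 choices below.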
-# Mock imports for example -try: - from forge.actors.trainer import RLTrainer -except ImportError: - - class RLTrainer: - pass - - -async def scaling_example(): - """Scale services independently based on bottlenecks.""" - model_name = "Qwen/Qwen3-1.7B" - - # Scale fast services with more replicas - policy = await Policy.options( - procs=1, num_replicas=8, with_gpus=True # Many replicas for throughput - ).as_service(engine_config={"model": model_name, "tensor_parallel_size": 1}) - - # Reward evaluation might be CPU-bound - reward_actor = await RewardActor.options( - procs=1, num_replicas=16, with_gpus=False # More CPU replicas - ).as_service(reward_functions=[]) - - # Training needs fewer but more powerful replicas - trainer = await RLTrainer.options( - procs=1, with_gpus=True # Fewer but GPU-heavy - ).as_actor( # Trainer typically uses .as_actor() not .as_service() - model={"name": "qwen3", "flavor": "1.7B"}, - optimizer={"name": "AdamW", "lr": 1e-5}, - ) +# Scale fast services with more replicas +policy = await Policy.options( + procs=1, num_replicas=8, with_gpus=True # Many replicas for high throughput +).as_service( + engine_config={"model": model_name, "tensor_parallel_size": 1} +) + +# Reward evaluation might be CPU-bound +reward_actor = await RewardActor.options( + procs=1, num_replicas=16, with_gpus=False # More CPU replicas +).as_service( + reward_functions=[MathReward()] +) + +# Training needs fewer but more powerful replicas +trainer = await RLTrainer.options( + procs=1, with_gpus=True # Fewer but GPU-heavy +).as_actor( # Trainer typically uses .as_actor() not .as_service() + model={"name": "qwen3", "flavor": "1.7B"}, + optimizer={"name": "AdamW", "lr": 1e-5} +) ###################################################################### @@ -606,37 +582,20 @@ async def scaling_example(): # # Let's see how a reward service is actually implemented: -# Mock imports -try: - from forge.controller import ForgeActor - from forge.data.rewards import MathReward, ThinkingReward - from monarch.actor import endpoint -except ImportError: - - class ForgeActor: - pass - - def endpoint(func): - return func - - class MathReward: - def __call__(self, prompt, response, target): - return 1.0 - - class ThinkingReward: - def __call__(self, prompt, response, target): - return 1.0 +# Exact RewardActor from apps/grpo/main.py +from forge.controller import ForgeActor +from monarch.actor import endpoint +from forge.data.rewards import MathReward, ThinkingReward +# class definition from apps/grpo/main.py class RewardActor(ForgeActor): - """Exact RewardActor from apps/grpo/main.py.""" - def __init__(self, reward_functions: list): self.reward_functions = reward_functions @endpoint async def evaluate_response(self, prompt: str, response: str, target: str) -> float: - """Evaluate response quality using multiple reward functions.""" + """Evaluate response quality using multiple reward functions""" total_reward = 0.0 for reward_fn in self.reward_functions: @@ -645,31 +604,29 @@ async def evaluate_response(self, prompt: str, response: str, target: str) -> fl total_reward += reward # Return average reward across all functions - return ( - total_reward / len(self.reward_functions) if self.reward_functions else 0.0 - ) - + return total_reward / len(self.reward_functions) if self.reward_functions else 0.0 -async def reward_service_example(): - """Create and use a reward service.""" - reward_actor = await RewardActor.options(procs=1, num_replicas=1).as_service( - reward_functions=[MathReward(), ThinkingReward()] - ) 
+reward_actor = await RewardActor.options( + procs=1, num_replicas=1 +).as_service( + reward_functions=[MathReward(), ThinkingReward()] +) - prompt = "What is 15% of 240?" - response = "15% of 240 is 36" - target = "36" +prompt = "What is 15% of 240?" +response = "15% of 240 is 36" +target = "36" - score = await reward_actor.evaluate_response.route( - prompt=prompt, response=response, target=target - ) - print(f"Reward score: {score}") # Usually around 1.0 for correct answers +score = await reward_actor.evaluate_response.route( + prompt=prompt, + response=response, + target=target +) +print(f"Reward score: {score}") # Usually around 1.0 for correct math answers +# For production scaling - increase num_replicas for parallel evaluation: +# RewardActor.options(procs=1, num_replicas=16) # 16 parallel evaluators - # For production scaling - increase num_replicas: - # RewardActor.options(procs=1, num_replicas=16) # 16 parallel evaluators - - # Cleanup when done - await reward_actor.shutdown() +# Cleanup when done +await reward_actor.shutdown() ###################################################################### @@ -679,13 +636,57 @@ async def reward_service_example(): # Now let's see how services coordinate in a real training loop: -async def production_training_loop(): - """Real training loop pattern from apps/grpo/main.py.""" - # Service creation pattern (abbreviated) - print("Initializing all services...") +# This is the REAL way production RL systems are built with Forge - # (Services initialization code here - see Part 1) +import asyncio +import torch +from forge.actors.policy import Policy +from forge.actors.reference_model import ReferenceModel +from forge.actors.replay_buffer import ReplayBuffer +from forge.actors.trainer import RLTrainer +from apps.grpo.main import DatasetActor, RewardActor, ComputeAdvantages +from forge.data.rewards import MathReward, ThinkingReward + +# Service creation pattern from apps/grpo/main.py lines 322-344 +print("Initializing all services...") +( + dataloader, + policy, + trainer, + replay_buffer, + compute_advantages, + ref_model, + reward_actor, +) = await asyncio.gather( + DatasetActor.options(procs=1).as_actor( + path="openai/gsm8k", revision="main", data_split="train", + streaming=True, model="Qwen/Qwen3-1.7B" + ), + Policy.options(procs=1, with_gpus=True, num_replicas=1).as_service( + engine_config={"model": "Qwen/Qwen3-1.7B", "tensor_parallel_size": 1}, + sampling_config={"n": 1, "max_tokens": 512} + ), + RLTrainer.options(procs=1, with_gpus=True).as_actor( + model={"name": "qwen3", "flavor": "1.7B", "hf_assets_path": "hf://Qwen/Qwen3-1.7B"}, + optimizer={"name": "AdamW", "lr": 1e-5}, + training={"local_batch_size": 2, "seq_len": 2048} + ), + ReplayBuffer.options(procs=1).as_actor( + batch_size=2, max_policy_age=1, dp_size=1 + ), + ComputeAdvantages.options(procs=1).as_actor(), + ReferenceModel.options(procs=1, with_gpus=True).as_actor( + model={"name": "qwen3", "flavor": "1.7B", "hf_assets_path": "hf://Qwen/Qwen3-1.7B"} + ), + RewardActor.options(procs=1, num_replicas=1).as_service( + reward_functions=[MathReward(), ThinkingReward()] + ), +) +print("All services initialized successfully!") + +async def production_training_loop(): + """Real training loop pattern from apps/grpo/main.py""" step = 0 while True: @@ -693,9 +694,9 @@ async def production_training_loop(): sample = await dataloader.sample.call_one() # Policy generation service call - responses = await policy.generate.route(sample["request"]) + responses = await 
policy.generate.route(sample["request"]) # Correct field name - # Reference computation service call + # Reference computation service call (requires full input tensor) input_ids = torch.cat([responses[0].prompt_ids, responses[0].token_ids]) ref_logprobs = await ref_model.forward.route( input_ids.unsqueeze(0), max_req_tokens=512, return_logprobs=True @@ -705,16 +706,17 @@ async def production_training_loop(): reward = await reward_actor.evaluate_response.route( prompt=sample["question"], response=responses[0].text, - target=sample["answer"], + target=sample["answer"] ) - # Experience storage + # Experience storage (using actual Episode structure) + episode = create_episode_from_grpo_data(sample, responses[0], reward, ref_logprobs[0], step) await replay_buffer.add.call_one(episode) # Training when ready batch = await replay_buffer.sample.call_one(curr_policy_version=step) if batch is not None: - inputs, targets = batch + inputs, targets = batch # GRPO returns (inputs, targets) tuple loss = await trainer.train_step.call(inputs, targets) # Weight synchronization pattern @@ -724,8 +726,17 @@ async def production_training_loop(): print(f"Step {step}, Loss: {loss:.4f}") step += 1 - if step >= 100: - break +print("Shutting down services...") +await asyncio.gather( + DatasetActor.shutdown(dataloader), + policy.shutdown(), + RLTrainer.shutdown(trainer), + ReplayBuffer.shutdown(replay_buffer), + ComputeAdvantages.shutdown(compute_advantages), + ReferenceModel.shutdown(ref_model), + reward_actor.shutdown(), +) +print("All services shut down successfully!") # **Key observations:** From 6629c0f3dff18362ab4aa48fad304908eab48867 Mon Sep 17 00:00:00 2001 From: Svetlana Karslioglu Date: Tue, 14 Oct 2025 14:28:13 -0700 Subject: [PATCH 25/28] Update --- .../zero-to-forge/3_Monarch_101.py | 242 ++++++------------ 1 file changed, 84 insertions(+), 158 deletions(-) diff --git a/docs/source/tutorial_sources/zero-to-forge/3_Monarch_101.py b/docs/source/tutorial_sources/zero-to-forge/3_Monarch_101.py index 09e61e154..e6f071af9 100644 --- a/docs/source/tutorial_sources/zero-to-forge/3_Monarch_101.py +++ b/docs/source/tutorial_sources/zero-to-forge/3_Monarch_101.py @@ -90,6 +90,23 @@ # Single Host ProcMesh # ~~~~~~~~~~~~~~~~~~~~ # +# **Key insight**: ProcMesh creates one process per GPU, automatically handling the process-to-hardware mapping. +# + +# This simple call: +procs = this_host().spawn_procs(per_host={"gpus": 8}) + +# Creates: +# Process 0 → GPU 0 +# Process 1 → GPU 1 +# Process 2 → GPU 2 +# Process 3 → GPU 3 +# Process 4 → GPU 4 +# Process 5 → GPU 5 +# Process 6 → GPU 6 +# Process 7 → GPU 7 + +###################################################################### # .. mermaid:: # # graph TD @@ -124,10 +141,16 @@ # style P6 fill:#F44336 # style P7 fill:#F44336 +###################################################################### +# The beauty: you don't manage individual processes or GPU assignments - +# ProcMesh handles the topology for you. + ###################################################################### # Multi-Host ProcMesh # ~~~~~~~~~~~~~~~~~~~ # +# **Key insight**: ProcMesh seamlessly scales across multiple hosts with continuous process numbering. +# # .. mermaid:: # # graph TD @@ -169,92 +192,81 @@ # style PM2 fill:#4CAF50 # style PM3 fill:#2196F3 -###################################################################### -# Monarch Actor System Basics -# ---------------------------- -# -# This shows the underlying actor system that powers Forge services. 
- -# Mock imports for documentation build -try: - from monarch.actor import Actor, endpoint, Future, ProcMesh, this_host, this_proc -except ImportError: - - class Actor: - pass - - def endpoint(func): - return func +# Same simple API works across hosts: +cluster_procs = spawn_cluster_procs( + hosts=["host1", "host2", "host3"], + per_host={"gpus": 4} +) - class Future: - pass +# Automatically creates: +# Host 1: Processes 0-3 → GPUs 0-3 +# Host 2: Processes 4-7 → GPUs 0-3 +# Host 3: Processes 8-11 → GPUs 0-3 - class ProcMesh: - pass +# Your code stays the same whether it's 1 host or 100 hosts +actors = cluster_procs.spawn("my_actor", MyActor) - def this_proc(): - return None +###################################################################### +# **The power**: Scale from single host to cluster without changing your +# actor code - ProcMesh handles all the complexity. - def this_host(): - return None +# This shows the underlying actor system that powers Forge services +# NOTE: This is for educational purposes - use ForgeActor and .as_service() in real Forge apps! +from monarch.actor import Actor, endpoint, this_proc, Future +from monarch.actor import ProcMesh, this_host +import asyncio # STEP 1: Define a basic actor class Counter(Actor): - """Basic counter actor example.""" - def __init__(self, initial_value: int): self.value = initial_value @endpoint def increment(self) -> None: - """Increment the counter.""" self.value += 1 @endpoint def get_value(self) -> int: - """Get current counter value.""" return self.value +# STEP 2: Single actor in local process +counter: Counter = this_proc().spawn("counter", Counter, initial_value=0) -async def basic_actor_example(): - """Example of using Monarch actors.""" - # STEP 2: Single actor in local process - counter = this_proc().spawn("counter", Counter, initial_value=0) - - # STEP 3: Send messages - fut = counter.get_value.call_one() - value = await fut - print(f"Counter value: {value}") # 0 +# STEP 3: Send messages +fut: Future[int] = counter.get_value.call_one() +value = await fut +print(f"Counter value: {value}") # 0 +# STEP 4: Multiple actors across processes +procs: ProcMesh = this_host().spawn_procs(per_host={"gpus": 8}) +counters: Counter = procs.spawn("counters", Counter, 0) -async def distributed_actors_example(): - """Example of actors across multiple processes.""" - # STEP 4: Multiple actors across processes - procs = this_host().spawn_procs(per_host={"gpus": 8}) - counters = procs.spawn("counters", Counter, 0) +# STEP 5: Broadcast to all actors +await counters.increment.call() - # STEP 5: Broadcast to all actors - await counters.increment.call() +# STEP 6: Different message patterns +# call_one() - single actor +value = await counters.get_value.call_one() +print(f"One counter: {value}") # Output: One counter: 1 - # STEP 6: Different message patterns - # call_one() - single actor - value = await counters.get_value.call_one() - print(f"One counter: {value}") +# choose() - random single actor (actors only, not services) +value = await counters.get_value.choose() +print(f"Random counter: {value}") # Output: Random counter: 1 - # choose() - random single actor (actors only, not services) - value = await counters.get_value.choose() - print(f"Random counter: {value}") +# call() - all actors, collect results +values = await counters.get_value.call() +print(f"All counters: {values}") # Output: All counters: [1, 1, 1, 1, 1, 1, 1, 1] - # call() - all actors, collect results - values = await counters.get_value.call() - print(f"All counters: 
{values}") +# broadcast() - fire and forget +await counters.increment.broadcast() # No return value - just sends to all actors - # broadcast() - fire and forget - await counters.increment.broadcast() +# Cleanup +await procs.stop() - # Cleanup - await procs.stop() +###################################################################### +# Remember: This raw Monarch code is for understanding how Forge works internally. +# In your Forge applications, use ForgeActor, .as_service(), .as_actor() instead! ###################################################################### @@ -264,93 +276,25 @@ async def distributed_actors_example(): # **ActorMesh** is created when you spawn actors across a ProcMesh. # Each process in the ProcMesh gets one instance of your actor. # -# .. mermaid:: -# -# graph TD -# subgraph Creation["Actor Creation Process"] -# Code["mesh.spawn('policy', PolicyActor, model='Qwen/Qwen3-7B')"] -# -# subgraph ProcMesh["ProcMesh (4 processes)"] -# P0["Process 0
GPU 0"] -# P1["Process 1
GPU 1"] -# P2["Process 2
GPU 2"] -# P3["Process 3
GPU 3"] -# end -# -# subgraph ActorMesh["ActorMesh PolicyActor"] -# A0["PolicyActor Instance #0: model=Qwen/Qwen3-7B"] -# A1["PolicyActor Instance #1: model=Qwen/Qwen3-7B"] -# A2["PolicyActor Instance #2: model=Qwen/Qwen3-7B"] -# A3["PolicyActor Instance #3: model=Qwen/Qwen3-7B"] -# end -# -# Code --> ProcMesh -# P0 --> A0 -# P1 --> A1 -# P2 --> A2 -# P3 --> A3 -# end -# -# style A0 fill:#4CAF50 -# style A1 fill:#4CAF50 -# style A2 fill:#4CAF50 -# style A3 fill:#4CAF50 +# - **One actor instance per process**: `mesh.spawn("policy", PolicyActor)` creates one PolicyActor in each process +# - **Same constructor arguments**: All instances get the same initialization parameters +# - **Independent state**: Each actor instance maintains its own state and memory +# - **Message routing**: You can send messages to one actor or all actors using different methods + +# Simple example: +procs = spawn_procs(per_host={"gpus": 4}) # 4 processes +policy_actors = procs.spawn("policy", PolicyActor, model="Qwen/Qwen3-7B") + +# Now you have 4 PolicyActor instances, one per GPU +# All initialized with the same model parameter -###################################################################### -# Message Routing Through ActorMesh -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# .. mermaid:: -# -# graph TD -# subgraph MessageFlow["Message Flow Patterns"] -# Client["await policy_actors.generate.METHOD(prompt)"] -# -# subgraph Methods["Different Adverbs Route Differently"] -# Choose["choose(): Routes to ONE actor, Load balanced"] -# Call["call(): Routes to ALL actors, Collects results"] -# Broadcast["broadcast(): Routes to ALL actors, Fire and forget"] -# Stream["stream(): Routes to ALL actors, Iterator of results"] -# end -# -# subgraph ActorInstances["PolicyActor Instances"] -# A0["Actor 0: GPU 0, generates response"] -# A1["Actor 1: GPU 1, generates response"] -# A2["Actor 2: GPU 2, generates response"] -# A3["Actor 3: GPU 3, generates response"] -# end -# -# Client --> Choose -# Client --> Call -# Client --> Broadcast -# Client --> Stream -# -# Choose -.->|"Load balanced"| A1 -# Call --> A0 -# Call --> A1 -# Call --> A2 -# Call --> A3 -# Broadcast --> A0 -# Broadcast --> A1 -# Broadcast --> A2 -# Broadcast --> A3 -# Stream --> A0 -# Stream --> A1 -# Stream --> A2 -# Stream --> A3 -# end -# -# style Choose fill:#4CAF50 -# style Call fill:#FF9800 -# style Broadcast fill:#E91E63 -# style Stream fill:#9C27B0 ###################################################################### # How Forge Services Use Monarch # ------------------------------- # -# Now the key insight: **Forge services are ServiceActors that manage -# ActorMeshes of your ForgeActor replicas**. +# Now the key insight: **Forge services are ``ServiceActors`` that manage +# ``ActorMeshes`` of your ``ForgeActor`` replicas**. # # The Service Creation Process # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -445,7 +389,7 @@ async def distributed_actors_example(): # ----------------------------------------- # # In real RL systems, you have multiple services that can share or use -# separate ProcMeshes: +# separate ``ProcMeshes``: # # .. mermaid:: # @@ -515,24 +459,6 @@ async def distributed_actors_example(): # * **Scale effectively**: Where to add resources for maximum impact? 
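+
+######################################################################
+# A small illustration of the questions above about where time goes and
+# where to add resources, reusing the Counter actor and ProcMesh APIs
+# shown earlier in this file. ``time_mesh_calls`` and the GPU count are
+# illustrative; the timings are whatever your hardware produces, not a
+# Forge benchmark.
+
+import time
+
+
+async def time_mesh_calls():
+    procs = this_host().spawn_procs(per_host={"gpus": 4})
+    counters = procs.spawn("counters", Counter, 0)
+
+    start = time.perf_counter()
+    await counters.get_value.call_one()  # one actor in one process
+    single = time.perf_counter() - start
+
+    start = time.perf_counter()
+    await counters.get_value.call()  # every actor in the mesh
+    everyone = time.perf_counter() - start
+
+    print(f"call_one: {single:.4f}s, call on all replicas: {everyone:.4f}s")
+    await procs.stop()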
-def demonstrate_architecture_benefits(): - """Example showing why the architecture matters.""" - # Process Isolation: Failures don't cascade - # If one PolicyActor crashes, others continue serving - - # Location Transparency: Same API whether local or remote - # await policy.generate.route(prompt) # Works same everywhere - - # Structured Distribution: ProcMesh maps to hardware - # per_host={"gpus": 8} creates 8 processes, 1 per GPU - - # Message Passing: No locks needed - # Each actor processes messages sequentially, naturally thread-safe - - # Service Abstraction: Simple interface, powerful backend - # await service.method.route() hides all distribution complexity - pass - ###################################################################### # Conclusion From 19db1e81f12e6c957380634085274d715cccec3b Mon Sep 17 00:00:00 2001 From: Svetlana Karslioglu Date: Tue, 14 Oct 2025 14:35:48 -0700 Subject: [PATCH 26/28] Update --- .../1_RL_and_Forge_Fundamentals.py | 125 +++++++++--------- .../zero-to-forge/2_Forge_Internals.py | 55 ++++---- 2 files changed, 97 insertions(+), 83 deletions(-) diff --git a/docs/source/tutorial_sources/zero-to-forge/1_RL_and_Forge_Fundamentals.py b/docs/source/tutorial_sources/zero-to-forge/1_RL_and_Forge_Fundamentals.py index cbdb1fe5f..d5d50f4b6 100644 --- a/docs/source/tutorial_sources/zero-to-forge/1_RL_and_Forge_Fundamentals.py +++ b/docs/source/tutorial_sources/zero-to-forge/1_RL_and_Forge_Fundamentals.py @@ -351,66 +351,73 @@ async def example_automatic_management(policy): import asyncio import torch -model = "Qwen/Qwen3-1.7B" -group_size = 1 - -( - dataloader, - policy, - trainer, - replay_buffer, - compute_advantages, - ref_model, - reward_actor, -) = await asyncio.gather( - # Dataset actor (CPU) - DatasetActor.options(procs=1).as_actor( - path="openai/gsm8k", - revision="main", - data_split="train", - streaming=True, - model=model, - ), - # Policy service with GPU - Policy.options(procs=1, with_gpus=True, num_replicas=1).as_service( - engine_config={ - "model": model, - "tensor_parallel_size": 1, - "pipeline_parallel_size": 1, - "enforce_eager": False - }, - sampling_config={ - "n": group_size, - "max_tokens": 16, - "temperature": 1.0, - "top_p": 1.0 - } - ), - # Trainer actor with GPU - RLTrainer.options(procs=1, with_gpus=True).as_actor( - # Trainer config would come from YAML in real usage - model={"name": "qwen3", "flavor": "1.7B", "hf_assets_path": f"hf://{model}"}, - optimizer={"name": "AdamW", "lr": 1e-5}, - training={"local_batch_size": 2, "seq_len": 2048} - ), - # Replay buffer (CPU) - ReplayBuffer.options(procs=1).as_actor( - batch_size=2, - max_policy_age=1, - dp_size=1 - ), - # Advantage computation (CPU) - ComputeAdvantages.options(procs=1).as_actor(), - # Reference model with GPU - ReferenceModel.options(procs=1, with_gpus=True).as_actor( - model={"name": "qwen3", "flavor": "1.7B", "hf_assets_path": f"hf://{model}"}, - training={"dtype": "bfloat16"} - ), - # Reward actor (CPU) - RewardActor.options(procs=1, num_replicas=1).as_service( - reward_functions=[MathReward(), ThinkingReward()] +async def example_forge_service_initialization(): + """Example of initializing Forge services for RL training.""" + model = "Qwen/Qwen3-1.7B" + group_size = 1 + + ( + dataloader, + policy, + trainer, + replay_buffer, + compute_advantages, + ref_model, + reward_actor, + ) = await asyncio.gather( + # Dataset actor (CPU) + DatasetActor.options(procs=1).as_actor( + path="openai/gsm8k", + revision="main", + data_split="train", + streaming=True, + model=model, + ), + 
# Policy service with GPU + Policy.options(procs=1, with_gpus=True, num_replicas=1).as_service( + engine_config={ + "model": model, + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "enforce_eager": False + }, + sampling_config={ + "n": group_size, + "max_tokens": 16, + "temperature": 1.0, + "top_p": 1.0 + } + ), + # Trainer actor with GPU + RLTrainer.options(procs=1, with_gpus=True).as_actor( + # Trainer config would come from YAML in real usage + model={"name": "qwen3", "flavor": "1.7B", "hf_assets_path": f"hf://{model}"}, + optimizer={"name": "AdamW", "lr": 1e-5}, + training={"local_batch_size": 2, "seq_len": 2048} + ), + # Replay buffer (CPU) + ReplayBuffer.options(procs=1).as_actor( + batch_size=2, + max_policy_age=1, + dp_size=1 + ), + # Advantage computation (CPU) + ComputeAdvantages.options(procs=1).as_actor(), + # Reference model with GPU + ReferenceModel.options(procs=1, with_gpus=True).as_actor( + model={"name": "qwen3", "flavor": "1.7B", "hf_assets_path": f"hf://{model}"}, + training={"dtype": "bfloat16"} + ), + # Reward actor (CPU) + RewardActor.options(procs=1, num_replicas=1).as_service( + reward_functions=[MathReward(), ThinkingReward()] + ) ) - ) + + return dataloader, policy, trainer, replay_buffer, compute_advantages, ref_model, reward_actor + +# Run the example (commented out to avoid execution during doc build) +# asyncio.run(example_forge_service_initialization()) ###################################################################### diff --git a/docs/source/tutorial_sources/zero-to-forge/2_Forge_Internals.py b/docs/source/tutorial_sources/zero-to-forge/2_Forge_Internals.py index 0a1e57c91..d946cc50d 100644 --- a/docs/source/tutorial_sources/zero-to-forge/2_Forge_Internals.py +++ b/docs/source/tutorial_sources/zero-to-forge/2_Forge_Internals.py @@ -113,35 +113,42 @@ # * Health monitoring and failure recovery # * Message routing and serialization +import asyncio from forge.actors.policy import Policy -model = "Qwen/Qwen3-1.7B" +async def example_service_creation(): + """Example of creating and using a policy service.""" + model = "Qwen/Qwen3-1.7B" + + policy = await Policy.options( + procs=1, + with_gpus=True, + num_replicas=1 + ).as_service( + engine_config={ + "model": model, + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "enforce_eager": False + }, + sampling_config={ + "n": 1, + "max_tokens": 16, + "temperature": 1.0, + "top_p": 1.0 + } + ) -policy = await Policy.options( - procs=1, - with_gpus=True, - num_replicas=1 -).as_service( - engine_config={ - "model": model, - "tensor_parallel_size": 1, - "pipeline_parallel_size": 1, - "enforce_eager": False - }, - sampling_config={ - "n": 1, - "max_tokens": 16, - "temperature": 1.0, - "top_p": 1.0 - } -) + prompt = "What is 3 + 5?" + responses = await policy.generate.route(prompt) + print(f"Response: {responses[0].text}") -prompt = "What is 3 + 5?" 
-responses = await policy.generate.route(prompt) -print(f"Response: {responses[0].text}") + # Cleanup when done + await policy.shutdown() + return policy -# Cleanup when done -await policy.shutdown() +# Run the example +asyncio.run(example_service_creation()) ###################################################################### From 7c9e61f73fccb77dd3a2603572855df46bfc66bc Mon Sep 17 00:00:00 2001 From: Svetlana Karslioglu Date: Tue, 14 Oct 2025 14:43:09 -0700 Subject: [PATCH 27/28] Precomit --- .../1_RL_and_Forge_Fundamentals.py | 130 +++--- .../zero-to-forge/2_Forge_Internals.py | 428 ++++++++++-------- .../zero-to-forge/3_Monarch_101.py | 63 +-- 3 files changed, 345 insertions(+), 276 deletions(-) diff --git a/docs/source/tutorial_sources/zero-to-forge/1_RL_and_Forge_Fundamentals.py b/docs/source/tutorial_sources/zero-to-forge/1_RL_and_Forge_Fundamentals.py index d5d50f4b6..d97d715a3 100644 --- a/docs/source/tutorial_sources/zero-to-forge/1_RL_and_Forge_Fundamentals.py +++ b/docs/source/tutorial_sources/zero-to-forge/1_RL_and_Forge_Fundamentals.py @@ -191,6 +191,7 @@ async def conceptual_forge_rl_step(services, step): return loss + ###################################################################### # **Key difference**: Same RL logic, but each component is now a distributed, # # fault-tolerant, auto-scaling service. @@ -200,7 +201,6 @@ async def conceptual_forge_rl_step(services, step): # # writing your RL Algorthms! - ###################################################################### # Why This Matters: Traditional ML Infrastructure Fails # ----------------------------------------------------- @@ -282,8 +282,6 @@ async def real_rl_training_step(services, step): ) # 3. Get reference logprobs - Using actual ReferenceModel API - import torch - input_ids = torch.cat([responses[0].prompt_ids, responses[0].token_ids]) ref_logprobs = await services["ref_model"].forward.route( input_ids.unsqueeze(0), max_req_tokens=512, return_logprobs=True @@ -305,6 +303,7 @@ async def real_rl_training_step(services, step): return loss + ##################################################################### # **Key insight**: Each line of RL pseudocode becomes a service call. 
# The complexity of distribution, scaling, and fault tolerance is hidden @@ -324,6 +323,10 @@ async def example_automatic_management(policy): answer = responses[0].text return answer + +import torch +from apps.grpo.main import ComputeAdvantages, DatasetActor, RewardActor + ###################################################################### # Forge handles behind the scenes: # @@ -343,13 +346,11 @@ async def example_automatic_management(policy): # Note: This is example code showing the Forge API # For actual imports, see apps/grpo/main.py from forge.actors.policy import Policy -from forge.actors.replay_buffer import ReplayBuffer from forge.actors.reference_model import ReferenceModel +from forge.actors.replay_buffer import ReplayBuffer from forge.actors.trainer import RLTrainer -from apps.grpo.main import DatasetActor, RewardActor, ComputeAdvantages from forge.data.rewards import MathReward, ThinkingReward -import asyncio -import torch + async def example_forge_service_initialization(): """Example of initializing Forge services for RL training.""" @@ -365,56 +366,71 @@ async def example_forge_service_initialization(): ref_model, reward_actor, ) = await asyncio.gather( - # Dataset actor (CPU) - DatasetActor.options(procs=1).as_actor( - path="openai/gsm8k", - revision="main", - data_split="train", - streaming=True, - model=model, - ), - # Policy service with GPU - Policy.options(procs=1, with_gpus=True, num_replicas=1).as_service( - engine_config={ - "model": model, - "tensor_parallel_size": 1, - "pipeline_parallel_size": 1, - "enforce_eager": False - }, - sampling_config={ - "n": group_size, - "max_tokens": 16, - "temperature": 1.0, - "top_p": 1.0 - } - ), - # Trainer actor with GPU - RLTrainer.options(procs=1, with_gpus=True).as_actor( - # Trainer config would come from YAML in real usage - model={"name": "qwen3", "flavor": "1.7B", "hf_assets_path": f"hf://{model}"}, - optimizer={"name": "AdamW", "lr": 1e-5}, - training={"local_batch_size": 2, "seq_len": 2048} - ), - # Replay buffer (CPU) - ReplayBuffer.options(procs=1).as_actor( - batch_size=2, - max_policy_age=1, - dp_size=1 - ), - # Advantage computation (CPU) - ComputeAdvantages.options(procs=1).as_actor(), - # Reference model with GPU - ReferenceModel.options(procs=1, with_gpus=True).as_actor( - model={"name": "qwen3", "flavor": "1.7B", "hf_assets_path": f"hf://{model}"}, - training={"dtype": "bfloat16"} - ), - # Reward actor (CPU) - RewardActor.options(procs=1, num_replicas=1).as_service( - reward_functions=[MathReward(), ThinkingReward()] - ) - ) - - return dataloader, policy, trainer, replay_buffer, compute_advantages, ref_model, reward_actor + # Dataset actor (CPU) + DatasetActor.options(procs=1).as_actor( + path="openai/gsm8k", + revision="main", + data_split="train", + streaming=True, + model=model, + ), + # Policy service with GPU + Policy.options(procs=1, with_gpus=True, num_replicas=1).as_service( + engine_config={ + "model": model, + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "enforce_eager": False, + }, + sampling_config={ + "n": group_size, + "max_tokens": 16, + "temperature": 1.0, + "top_p": 1.0, + }, + ), + # Trainer actor with GPU + RLTrainer.options(procs=1, with_gpus=True).as_actor( + # Trainer config would come from YAML in real usage + model={ + "name": "qwen3", + "flavor": "1.7B", + "hf_assets_path": f"hf://{model}", + }, + optimizer={"name": "AdamW", "lr": 1e-5}, + training={"local_batch_size": 2, "seq_len": 2048}, + ), + # Replay buffer (CPU) + ReplayBuffer.options(procs=1).as_actor( + 
batch_size=2, max_policy_age=1, dp_size=1 + ), + # Advantage computation (CPU) + ComputeAdvantages.options(procs=1).as_actor(), + # Reference model with GPU + ReferenceModel.options(procs=1, with_gpus=True).as_actor( + model={ + "name": "qwen3", + "flavor": "1.7B", + "hf_assets_path": f"hf://{model}", + }, + training={"dtype": "bfloat16"}, + ), + # Reward actor (CPU) + RewardActor.options(procs=1, num_replicas=1).as_service( + reward_functions=[MathReward(), ThinkingReward()] + ), + ) + + return ( + dataloader, + policy, + trainer, + replay_buffer, + compute_advantages, + ref_model, + reward_actor, + ) + # Run the example (commented out to avoid execution during doc build) # asyncio.run(example_forge_service_initialization()) diff --git a/docs/source/tutorial_sources/zero-to-forge/2_Forge_Internals.py b/docs/source/tutorial_sources/zero-to-forge/2_Forge_Internals.py index d946cc50d..72b63d728 100644 --- a/docs/source/tutorial_sources/zero-to-forge/2_Forge_Internals.py +++ b/docs/source/tutorial_sources/zero-to-forge/2_Forge_Internals.py @@ -27,7 +27,7 @@ * Understanding of Python async/await * Basic distributed systems knowledge -We highly recommend completing Part 1 before starting this tutorial. +We highly recommend completing Part 1 before starting this tutorial. Part 1 explains RL Concepts and how they land in Forge. Now that you see the power of the service abstraction, let's understand @@ -88,9 +88,9 @@ # Here's the actual ``ServiceConfig`` from Forge source code: # Configuration pattern from apps/grpo/main.py: -# +# # .. code-block:: python -# +# # Policy.options( # procs=1, # Processes per replica # num_replicas=4, # Number of replicas @@ -114,29 +114,22 @@ # * Message routing and serialization import asyncio + from forge.actors.policy import Policy + async def example_service_creation(): """Example of creating and using a policy service.""" model = "Qwen/Qwen3-1.7B" - policy = await Policy.options( - procs=1, - with_gpus=True, - num_replicas=1 - ).as_service( + policy = await Policy.options(procs=1, with_gpus=True, num_replicas=1).as_service( engine_config={ "model": model, "tensor_parallel_size": 1, "pipeline_parallel_size": 1, - "enforce_eager": False + "enforce_eager": False, }, - sampling_config={ - "n": 1, - "max_tokens": 16, - "temperature": 1.0, - "top_p": 1.0 - } + sampling_config={"n": 1, "max_tokens": 16, "temperature": 1.0, "top_p": 1.0}, ) prompt = "What is 3 + 5?" @@ -147,6 +140,7 @@ async def example_service_creation(): await policy.shutdown() return policy + # Run the example asyncio.run(example_service_creation()) @@ -164,7 +158,7 @@ async def example_service_creation(): # Your code sees this simple interface: -responses = await policy.generate.route(prompt=prompt) +# responses = await policy.generate.route(prompt=prompt) # But Forge handles all the complexity of replica management, load balancing, and fault tolerance @@ -226,8 +220,13 @@ async def example_service_creation(): # **When to use**: Normal request routing where any replica can handle # the request. 
-responses = await policy.generate.route(prompt=question) -answer = responses[0].text # Extract text from Completion object + +def example_route_pattern(): + """Example showing route pattern for load balanced requests.""" + # responses = await policy.generate.route(prompt=question) + # answer = responses[0].text # Extract text from Completion object + pass + ###################################################################### # Behind the scenes: @@ -253,13 +252,16 @@ async def example_service_creation(): # **When to use**: You need responses from ALL replicas. -# Get version from all policy replicas -current_versions = await policy.get_version.fanout() -# Returns: [version_replica_1, version_replica_2, ...] +async def example_fanout_pattern(): + """Example showing fanout pattern for broadcast operations.""" + # Get version from all policy replicas + # current_versions = await policy.get_version.fanout() + # Returns: [version_replica_1, version_replica_2, ...] -# Update weights on all replicas -await policy.update_weights.fanout(new_policy_version) -# Broadcasts to all replicas simultaneously + # Update weights on all replicas + # await policy.update_weights.fanout(new_policy_version) + # Broadcasts to all replicas simultaneously + pass # **Performance characteristics**: @@ -280,16 +282,20 @@ async def example_service_creation(): # Streaming requires custom implementation in your training loop. # The basic ``ReplayBuffer`` doesn't have built-in streaming methods. -# Pattern from apps/grpo/main.py continuous training: -while training: + +async def example_streaming_pattern(): + """Pattern from apps/grpo/main.py continuous training.""" # This is the real API call pattern - batch = await replay_buffer.sample.call_one(curr_policy_version=step) - if batch is not None: - # Process batch immediately - loss = await trainer.train_step.call_one(batch) - print(f"Training loss: {loss}") - else: - await asyncio.sleep(0.1) # Wait for more data + # while training: + # batch = await replay_buffer.sample.call_one(curr_policy_version=step) + # if batch is not None: + # # Process batch immediately + # loss = await trainer.train_step.call_one(batch) + # print(f"Training loss: {loss}") + # else: + # await asyncio.sleep(0.1) # Wait for more data + pass + ###################################################################### # **Performance characteristics**: @@ -316,6 +322,7 @@ async def example_service_creation(): from forge.controller import ForgeActor from monarch.actor import endpoint + class ForgeCounter(ForgeActor): def __init__(self, initial_value: int): self.value = initial_value @@ -333,44 +340,46 @@ def get_value(self) -> int: async def reset(self): self.value = 0 -counter_service = await ForgeCounter.options( - procs=1, num_replicas=4 -).as_service(initial_value=0) -# WITHOUT SESSIONS: Each .route() call goes to a different replica -await counter_service.increment.route() # Might go to replica 2 -await counter_service.increment.route() # Might go to replica 1 -await counter_service.increment.route() # Might go to replica 3 +async def example_session_comparison(): + """Demonstrate the difference between sessions and normal routing.""" + counter_service = await ForgeCounter.options(procs=1, num_replicas=4).as_service( + initial_value=0 + ) -results = await counter_service.increment.fanout() # Get from all replicas -print(f"All replica values: {results}") -# Output: All replica values: [1, 2, 1, 1] - Each replica has different state! 
+ # WITHOUT SESSIONS: Each .route() call goes to a different replica + await counter_service.increment.route() # Might go to replica 2 + await counter_service.increment.route() # Might go to replica 1 + await counter_service.increment.route() # Might go to replica 3 -###################################################################### -# The problem: each `.route()` call can go to different replicas, creating inconsistent state. + results = await counter_service.increment.fanout() # Get from all replicas + print(f"All replica values: {results}") + # Output: All replica values: [1, 2, 1, 1] - Each replica has different state! -# WITH SESSIONS: All calls go to the SAME replica -print("\nUsing sticky sessions:") -async with counter_service.session(): # Creates a session that picks one replica - await counter_service.reset.route() # Uses .route() within session - print(await counter_service.increment.route()) # 1 - print(await counter_service.increment.route()) # 2 - print(await counter_service.increment.route()) # 3 + # WITH SESSIONS: All calls go to the SAME replica + print("\nUsing sticky sessions:") + async with counter_service.session(): # Creates a session that picks one replica + await counter_service.reset.route() # Uses .route() within session + print(await counter_service.increment.route()) # 1 + print(await counter_service.increment.route()) # 2 + print(await counter_service.increment.route()) # 3 - final_value = await counter_service.get_value.route() - print(f"Final value on this replica: {final_value}") # 3 + final_value = await counter_service.get_value.route() + print(f"Final value on this replica: {final_value}") # 3 -###################################################################### -# Same pattern works with Policy for multi-turn conversations: + # Cleanup + await counter_service.shutdown() -async with policy.session(): - response1 = await policy.generate.route(turn1) - full_prompt = turn1 + response1[0].text + turn2 - response2 = await policy.generate.route(full_prompt) - # Both calls hit same replica, preserving KV cache -# Cleanup -await counter_service.shutdown() +async def example_multi_turn_conversation(policy, turn1, turn2): + """Same pattern works with Policy for multi-turn conversations.""" + async with policy.session(): + response1 = await policy.generate.route(turn1) + full_prompt = turn1 + response1[0].text + turn2 + response2 = await policy.generate.route(full_prompt) + # Both calls hit same replica, preserving KV cache + return response2 + ###################################################################### # **Performance impact**: Critical for maintaining KV cache in multi-turn conversations. @@ -392,8 +401,11 @@ async def reset(self): async def naive_multi_turn(): # Each call might go to different replica = cache miss response1 = await policy_service.generate.choose(question1) - response2 = await policy_service.generate.choose(question1 + response1) # Cache miss! - response3 = await policy_service.generate.choose(conversation_so_far) # Cache miss! + response2 = await policy_service.generate.choose( + question1 + response1 + ) # Cache miss! + response3 = await policy_service.generate.choose(conversation_so_far) # Cache miss! + ###################################################################### # **The solution**: Sticky sessions ensure all calls go to same replica. 
@@ -412,6 +424,7 @@ async def optimized_multi_turn(policy): # Session ends, replica can be garbage collected or reused + ###################################################################### # **Performance impact**: Maintaining KV cache across turns avoids # recomputing previous tokens. @@ -426,20 +439,23 @@ async def optimized_multi_turn(policy): # **Real Forge approach**: The ReplayBuffer actor handles concurrency # internally: -# Forge ReplayBuffer endpoints (verified from source code) -# Add episodes (thread-safe by actor model) -await replay_buffer.add.call_one(episode) # .choose() would work too, but .call_one() clarifies it's a singleton actor not ActorMesh -# Sample batches for training -batch = await replay_buffer.sample.call_one( - curr_policy_version=step_number, - batch_size=None # Optional parameter, uses default from config -) +async def example_replay_buffer_usage(): + """Forge ReplayBuffer endpoints (verified from source code).""" + # Add episodes (thread-safe by actor model) + # await replay_buffer.add.call_one(episode) -# Additional methods available: -# await replay_buffer.clear.call_one() # Clear buffer -# await replay_buffer.evict.call_one(curr_policy_version) # Remove old episodes -# state = await replay_buffer.state_dict.call_one() # Get state for checkpointing + # Sample batches for training + # batch = await replay_buffer.sample.call_one( + # curr_policy_version=step_number, + # batch_size=None, # Optional parameter, uses default from config + # ) + + # Additional methods available: + # await replay_buffer.clear.call_one() # Clear buffer + # await replay_buffer.evict.call_one(curr_policy_version) # Remove old episodes + # state = await replay_buffer.state_dict.call_one() # Get state + pass # **Critical insight**: The actor model provides natural thread safety - @@ -461,9 +477,13 @@ async def real_weight_sync(trainer, policy, step): # Use .fanout() to update ALL policy replicas await policy.update_weights.fanout(policy_version=step + 1) -# Check current policy version -current_version = await policy.get_version.route() -print(f"Current policy version: {current_version}") + +async def check_policy_version(policy): + """Check current policy version.""" + current_version = await policy.get_version.route() + print(f"Current policy version: {current_version}") + return current_version + ###################################################################### # Deep Dive: Asynchronous Coordination Patterns @@ -481,6 +501,7 @@ async def real_weight_sync(trainer, policy, step): from apps.grpo.main import Episode, Group + async def simple_rl_step(): # ===== Generate a rollout ===== @@ -499,9 +520,7 @@ async def simple_rl_step(): input_ids.unsqueeze(0), max_req_tokens=512, return_logprobs=True ) reward = await reward_actor.evaluate_response.route( # RewardActor is a service - prompt=prompt, - response=actions[0].text, - target=target + prompt=prompt, response=actions[0].text, target=target ) print(f"Reward: {reward}") @@ -513,7 +532,7 @@ async def simple_rl_step(): pad_id=tokenizer.pad_token_id, request_len=512, response_len=512, - target=target + target=target, ) # Add response data @@ -526,7 +545,9 @@ async def simple_rl_step(): # Compute advantages using actual ComputeAdvantages actor group = Group.new_group(0, 1, prompt, 0, tokenizer.pad_token_id, 512, 512, target) group.episodes[0] = episode - advantages = await compute_advantages.compute.call_one(group) # ComputeAdvantages is an actor + advantages = await compute_advantages.compute.call_one( + group + ) # 
ComputeAdvantages is an actor episode.advantage = advantages[0] print(f"Advantage: {advantages[0]}") await replay_buffer.add.call_one(episode) # ReplayBuffer is an actor @@ -544,14 +565,18 @@ async def simple_rl_step(): print("Not enough data in buffer yet") return None -# Note: This simplified example assumes tokenizer and services are already initialized -for step in range(10): - print(f"\n--- RL Step {step + 1} ---") - loss = await simple_rl_step() - if loss: - print(f"Step {step + 1} complete, loss: {loss:.4f}") - else: - print(f"Step {step + 1} complete, building buffer...") + +async def run_training_steps(): + """Run multiple RL training steps.""" + # Note: This simplified example assumes tokenizer and services are already initialized + for step in range(10): + print(f"\n--- RL Step {step + 1} ---") + loss = await simple_rl_step() + if loss: + print(f"Step {step + 1} complete, loss: {loss:.4f}") + else: + print(f"Step {step + 1} complete, building buffer...") + ###################################################################### # Handling Speed Mismatches with Service Scaling @@ -560,27 +585,27 @@ async def simple_rl_step(): # **The insight**: Scale services independently based on their # bottlenecks. -# Scale fast services with more replicas -policy = await Policy.options( - procs=1, num_replicas=8, with_gpus=True # Many replicas for high throughput -).as_service( - engine_config={"model": model_name, "tensor_parallel_size": 1} -) - -# Reward evaluation might be CPU-bound -reward_actor = await RewardActor.options( - procs=1, num_replicas=16, with_gpus=False # More CPU replicas -).as_service( - reward_functions=[MathReward()] -) - -# Training needs fewer but more powerful replicas -trainer = await RLTrainer.options( - procs=1, with_gpus=True # Fewer but GPU-heavy -).as_actor( # Trainer typically uses .as_actor() not .as_service() - model={"name": "qwen3", "flavor": "1.7B"}, - optimizer={"name": "AdamW", "lr": 1e-5} -) + +async def example_service_scaling(): + """Example showing how to scale services independently.""" + # Scale fast services with more replicas + policy = await Policy.options( + procs=1, num_replicas=8, with_gpus=True # Many replicas for high throughput + ).as_service(engine_config={"model": "model_name", "tensor_parallel_size": 1}) + + # Reward evaluation might be CPU-bound + reward_actor = await RewardActor.options( + procs=1, num_replicas=16, with_gpus=False # More CPU replicas + ).as_service(reward_functions=[MathReward()]) + + # Training needs fewer but more powerful replicas + trainer = await RLTrainer.options( + procs=1, with_gpus=True # Fewer but GPU-heavy + ).as_actor( # Trainer typically uses .as_actor() not .as_service() + model={"name": "qwen3", "flavor": "1.7B"}, + optimizer={"name": "AdamW", "lr": 1e-5}, + ) + return policy, reward_actor, trainer ###################################################################### @@ -592,8 +617,8 @@ async def simple_rl_step(): # Exact RewardActor from apps/grpo/main.py from forge.controller import ForgeActor -from monarch.actor import endpoint from forge.data.rewards import MathReward, ThinkingReward +from monarch.actor import endpoint # class definition from apps/grpo/main.py class RewardActor(ForgeActor): @@ -611,29 +636,30 @@ async def evaluate_response(self, prompt: str, response: str, target: str) -> fl total_reward += reward # Return average reward across all functions - return total_reward / len(self.reward_functions) if self.reward_functions else 0.0 + return ( + total_reward / len(self.reward_functions) if 
self.reward_functions else 0.0 + ) -reward_actor = await RewardActor.options( - procs=1, num_replicas=1 -).as_service( - reward_functions=[MathReward(), ThinkingReward()] -) -prompt = "What is 15% of 240?" -response = "15% of 240 is 36" -target = "36" +async def example_reward_actor_usage(): + """Example of using the RewardActor service.""" + reward_actor = await RewardActor.options(procs=1, num_replicas=1).as_service( + reward_functions=[MathReward(), ThinkingReward()] + ) -score = await reward_actor.evaluate_response.route( - prompt=prompt, - response=response, - target=target -) -print(f"Reward score: {score}") # Usually around 1.0 for correct math answers -# For production scaling - increase num_replicas for parallel evaluation: -# RewardActor.options(procs=1, num_replicas=16) # 16 parallel evaluators + prompt = "What is 15% of 240?" + response = "15% of 240 is 36" + target = "36" -# Cleanup when done -await reward_actor.shutdown() + score = await reward_actor.evaluate_response.route( + prompt=prompt, response=response, target=target + ) + print(f"Reward score: {score}") # Usually around 1.0 for correct math answers + # For production scaling - increase num_replicas for parallel evaluation: + # RewardActor.options(procs=1, num_replicas=16) # 16 parallel evaluators + + # Cleanup when done + await reward_actor.shutdown() ###################################################################### @@ -645,18 +671,82 @@ async def evaluate_response(self, prompt: str, response: str, target: str) -> fl # This is the REAL way production RL systems are built with Forge -import asyncio -import torch -from forge.actors.policy import Policy -from forge.actors.reference_model import ReferenceModel -from forge.actors.replay_buffer import ReplayBuffer -from forge.actors.trainer import RLTrainer -from apps.grpo.main import DatasetActor, RewardActor, ComputeAdvantages -from forge.data.rewards import MathReward, ThinkingReward -# Service creation pattern from apps/grpo/main.py lines 322-344 -print("Initializing all services...") -( +async def example_full_service_orchestration(): + """Service creation pattern from apps/grpo/main.py lines 322-344.""" + print("Initializing all services...") + ( + dataloader, + policy, + trainer, + replay_buffer, + compute_advantages, + ref_model, + reward_actor, + ) = await asyncio.gather( + DatasetActor.options(procs=1).as_actor( + path="openai/gsm8k", + revision="main", + data_split="train", + streaming=True, + model="Qwen/Qwen3-1.7B", + ), + Policy.options(procs=1, with_gpus=True, num_replicas=1).as_service( + engine_config={"model": "Qwen/Qwen3-1.7B", "tensor_parallel_size": 1}, + sampling_config={"n": 1, "max_tokens": 512}, + ), + RLTrainer.options(procs=1, with_gpus=True).as_actor( + model={ + "name": "qwen3", + "flavor": "1.7B", + "hf_assets_path": "hf://Qwen/Qwen3-1.7B", + }, + optimizer={"name": "AdamW", "lr": 1e-5}, + training={"local_batch_size": 2, "seq_len": 2048}, + ), + ReplayBuffer.options(procs=1).as_actor( + batch_size=2, max_policy_age=1, dp_size=1 + ), + ComputeAdvantages.options(procs=1).as_actor(), + ReferenceModel.options(procs=1, with_gpus=True).as_actor( + model={ + "name": "qwen3", + "flavor": "1.7B", + "hf_assets_path": "hf://Qwen/Qwen3-1.7B", + } + ), + RewardActor.options(procs=1, num_replicas=1).as_service( + reward_functions=[MathReward(), ThinkingReward()] + ), + ) + + print("All services initialized successfully!") + + # Run training loop + await production_training_loop( + dataloader, + policy, + trainer, + replay_buffer, + compute_advantages, 
+ ref_model, + reward_actor, + ) + + print("Shutting down services...") + await asyncio.gather( + DatasetActor.shutdown(dataloader), + policy.shutdown(), + RLTrainer.shutdown(trainer), + ReplayBuffer.shutdown(replay_buffer), + ComputeAdvantages.shutdown(compute_advantages), + ReferenceModel.shutdown(ref_model), + reward_actor.shutdown(), + ) + print("All services shut down successfully!") + + +async def production_training_loop( dataloader, policy, trainer, @@ -664,35 +754,7 @@ async def evaluate_response(self, prompt: str, response: str, target: str) -> fl compute_advantages, ref_model, reward_actor, -) = await asyncio.gather( - DatasetActor.options(procs=1).as_actor( - path="openai/gsm8k", revision="main", data_split="train", - streaming=True, model="Qwen/Qwen3-1.7B" - ), - Policy.options(procs=1, with_gpus=True, num_replicas=1).as_service( - engine_config={"model": "Qwen/Qwen3-1.7B", "tensor_parallel_size": 1}, - sampling_config={"n": 1, "max_tokens": 512} - ), - RLTrainer.options(procs=1, with_gpus=True).as_actor( - model={"name": "qwen3", "flavor": "1.7B", "hf_assets_path": "hf://Qwen/Qwen3-1.7B"}, - optimizer={"name": "AdamW", "lr": 1e-5}, - training={"local_batch_size": 2, "seq_len": 2048} - ), - ReplayBuffer.options(procs=1).as_actor( - batch_size=2, max_policy_age=1, dp_size=1 - ), - ComputeAdvantages.options(procs=1).as_actor(), - ReferenceModel.options(procs=1, with_gpus=True).as_actor( - model={"name": "qwen3", "flavor": "1.7B", "hf_assets_path": "hf://Qwen/Qwen3-1.7B"} - ), - RewardActor.options(procs=1, num_replicas=1).as_service( - reward_functions=[MathReward(), ThinkingReward()] - ), -) - -print("All services initialized successfully!") - -async def production_training_loop(): +): """Real training loop pattern from apps/grpo/main.py""" step = 0 @@ -713,11 +775,13 @@ async def production_training_loop(): reward = await reward_actor.evaluate_response.route( prompt=sample["question"], response=responses[0].text, - target=sample["answer"] + target=sample["answer"], ) # Experience storage (using actual Episode structure) - episode = create_episode_from_grpo_data(sample, responses[0], reward, ref_logprobs[0], step) + episode = create_episode_from_grpo_data( + sample, responses[0], reward, ref_logprobs[0], step + ) await replay_buffer.add.call_one(episode) # Training when ready @@ -733,18 +797,6 @@ async def production_training_loop(): print(f"Step {step}, Loss: {loss:.4f}") step += 1 -print("Shutting down services...") -await asyncio.gather( - DatasetActor.shutdown(dataloader), - policy.shutdown(), - RLTrainer.shutdown(trainer), - ReplayBuffer.shutdown(replay_buffer), - ComputeAdvantages.shutdown(compute_advantages), - ReferenceModel.shutdown(ref_model), - reward_actor.shutdown(), -) -print("All services shut down successfully!") - # **Key observations:** # diff --git a/docs/source/tutorial_sources/zero-to-forge/3_Monarch_101.py b/docs/source/tutorial_sources/zero-to-forge/3_Monarch_101.py index e6f071af9..c989724a0 100644 --- a/docs/source/tutorial_sources/zero-to-forge/3_Monarch_101.py +++ b/docs/source/tutorial_sources/zero-to-forge/3_Monarch_101.py @@ -194,8 +194,7 @@ # Same simple API works across hosts: cluster_procs = spawn_cluster_procs( - hosts=["host1", "host2", "host3"], - per_host={"gpus": 4} + hosts=["host1", "host2", "host3"], per_host={"gpus": 4} ) # Automatically creates: @@ -213,9 +212,8 @@ # This shows the underlying actor system that powers Forge services # NOTE: This is for educational purposes - use ForgeActor and .as_service() in real Forge apps! 
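+# For orientation: the raw Counter below is roughly the Monarch-level
+# counterpart of the ForgeCounter service from Part 2. It exposes similar
+# endpoints, but is spawned directly on a ProcMesh rather than through
+# ForgeActor.options(...).as_service().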
-from monarch.actor import Actor, endpoint, this_proc, Future -from monarch.actor import ProcMesh, this_host -import asyncio +from monarch.actor import Actor, endpoint, Future, ProcMesh, this_host, this_proc + # STEP 1: Define a basic actor class Counter(Actor): @@ -230,39 +228,43 @@ def increment(self) -> None: def get_value(self) -> int: return self.value -# STEP 2: Single actor in local process -counter: Counter = this_proc().spawn("counter", Counter, initial_value=0) -# STEP 3: Send messages -fut: Future[int] = counter.get_value.call_one() -value = await fut -print(f"Counter value: {value}") # 0 +async def example_monarch_counter_usage(): + """Example showing basic Monarch actor usage.""" + # STEP 2: Single actor in local process + counter: Counter = this_proc().spawn("counter", Counter, initial_value=0) + + # STEP 3: Send messages + fut: Future[int] = counter.get_value.call_one() + value = await fut + print(f"Counter value: {value}") # 0 -# STEP 4: Multiple actors across processes -procs: ProcMesh = this_host().spawn_procs(per_host={"gpus": 8}) -counters: Counter = procs.spawn("counters", Counter, 0) + # STEP 4: Multiple actors across processes + procs: ProcMesh = this_host().spawn_procs(per_host={"gpus": 8}) + counters: Counter = procs.spawn("counters", Counter, 0) -# STEP 5: Broadcast to all actors -await counters.increment.call() + # STEP 5: Broadcast to all actors + await counters.increment.call() -# STEP 6: Different message patterns -# call_one() - single actor -value = await counters.get_value.call_one() -print(f"One counter: {value}") # Output: One counter: 1 + # STEP 6: Different message patterns + # call_one() - single actor + value = await counters.get_value.call_one() + print(f"One counter: {value}") # Output: One counter: 1 -# choose() - random single actor (actors only, not services) -value = await counters.get_value.choose() -print(f"Random counter: {value}") # Output: Random counter: 1 + # choose() - random single actor (actors only, not services) + value = await counters.get_value.choose() + print(f"Random counter: {value}") # Output: Random counter: 1 -# call() - all actors, collect results -values = await counters.get_value.call() -print(f"All counters: {values}") # Output: All counters: [1, 1, 1, 1, 1, 1, 1, 1] + # call() - all actors, collect results + values = await counters.get_value.call() + print(f"All counters: {values}") # Output: All counters: [1, 1, 1, 1, 1, 1, 1, 1] -# broadcast() - fire and forget -await counters.increment.broadcast() # No return value - just sends to all actors + # broadcast() - fire and forget + await counters.increment.broadcast() # No return value - just sends to all actors + + # Cleanup + await procs.stop() -# Cleanup -await procs.stop() ###################################################################### # Remember: This raw Monarch code is for understanding how Forge works internally. @@ -459,7 +461,6 @@ def get_value(self) -> int: # * **Scale effectively**: Where to add resources for maximum impact? 
- ###################################################################### # Conclusion # ---------- From b680e7e241f24efa35f5fbdd518943887132e53d Mon Sep 17 00:00:00 2001 From: Svetlana Karslioglu Date: Tue, 14 Oct 2025 14:56:50 -0700 Subject: [PATCH 28/28] Update --- .../zero-to-forge/1_RL_and_Forge_Fundamentals.py | 7 ++++++- .../tutorial_sources/zero-to-forge/2_Forge_Internals.py | 4 ++-- .../source/tutorial_sources/zero-to-forge/3_Monarch_101.py | 6 +++--- 3 files changed, 11 insertions(+), 6 deletions(-) diff --git a/docs/source/tutorial_sources/zero-to-forge/1_RL_and_Forge_Fundamentals.py b/docs/source/tutorial_sources/zero-to-forge/1_RL_and_Forge_Fundamentals.py index d97d715a3..e408cffae 100644 --- a/docs/source/tutorial_sources/zero-to-forge/1_RL_and_Forge_Fundamentals.py +++ b/docs/source/tutorial_sources/zero-to-forge/1_RL_and_Forge_Fundamentals.py @@ -325,7 +325,12 @@ async def example_automatic_management(policy): import torch -from apps.grpo.main import ComputeAdvantages, DatasetActor, RewardActor + +try: + from apps.grpo.main import ComputeAdvantages, DatasetActor, RewardActor +except ImportError: + # Module not available during doc build + ComputeAdvantages = DatasetActor = RewardActor = None ###################################################################### # Forge handles behind the scenes: diff --git a/docs/source/tutorial_sources/zero-to-forge/2_Forge_Internals.py b/docs/source/tutorial_sources/zero-to-forge/2_Forge_Internals.py index 72b63d728..001d3c02d 100644 --- a/docs/source/tutorial_sources/zero-to-forge/2_Forge_Internals.py +++ b/docs/source/tutorial_sources/zero-to-forge/2_Forge_Internals.py @@ -141,8 +141,8 @@ async def example_service_creation(): return policy -# Run the example -asyncio.run(example_service_creation()) +# Run the example (commented out to avoid execution during doc build) +# asyncio.run(example_service_creation()) ###################################################################### diff --git a/docs/source/tutorial_sources/zero-to-forge/3_Monarch_101.py b/docs/source/tutorial_sources/zero-to-forge/3_Monarch_101.py index c989724a0..c0c3c1411 100644 --- a/docs/source/tutorial_sources/zero-to-forge/3_Monarch_101.py +++ b/docs/source/tutorial_sources/zero-to-forge/3_Monarch_101.py @@ -93,10 +93,10 @@ # **Key insight**: ProcMesh creates one process per GPU, automatically handling the process-to-hardware mapping. # -# This simple call: -procs = this_host().spawn_procs(per_host={"gpus": 8}) +# Example call (commented out since this_host is not defined at module level): +# procs = this_host().spawn_procs(per_host={"gpus": 8}) -# Creates: +# This creates: # Process 0 → GPU 0 # Process 1 → GPU 1 # Process 2 → GPU 2