diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml
index 78fd09634..9b16e58b1 100644
--- a/.github/workflows/docs.yml
+++ b/.github/workflows/docs.yml
@@ -39,7 +39,7 @@ jobs:
run: python -m pip install torch==2.9.0 --index-url https://download.pytorch.org/whl/test/cu130
- name: Install monarch
shell: bash -l {0}
- run: python -m pip install monarch-no-torch==0.1.0.dev20250826 --find-links assets/ci
+ run: python -m pip install torchmonarch
- name: Install torchforge
shell: bash -l {0}
env:
@@ -52,9 +52,35 @@ jobs:
shell: bash -l {0}
working-directory: docs
run: |
- set +e # Don't exit on error
- make html SPHINXOPTS="-WT --keep-going" || echo "Build completed with warnings/errors"
- set -e # Re-enable exit on error for subsequent commands
+ # Fail fast: a custom `shell:` does not get bash's -e flag from GitHub Actions.
+ set -e
+ # monarch and other native dependencies need libpython3.10.so.1.0 from the conda env.
+ export LD_LIBRARY_PATH="${CONDA_PREFIX}/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
+
+ # Also set CUDA paths if needed
+ if [ -d "/usr/local/cuda-12.9" ]; then
+   export LD_LIBRARY_PATH="/usr/local/cuda-12.9/compat:${LD_LIBRARY_PATH}"
+   export CUDA_HOME=/usr/local/cuda-12.9
+ fi
+
+ # Verify dependencies can be imported before building docs
+ echo "Verifying dependencies..."
+ python -c "import forge; print('✓ forge imported successfully')"
+ python -c "import monarch; print('✓ monarch imported successfully')"
+
+ # Build the docs (no -W: warnings alone do not fail the build). errexit is
+ # suspended around make so its exit code can be captured and reported.
+ set +e
+ make html SPHINXOPTS="--keep-going"
+ BUILD_EXIT_CODE=$?
+ set -e
+
+ if [ "${BUILD_EXIT_CODE}" -ne 0 ]; then
+   echo "❌ Documentation build failed (exit code: ${BUILD_EXIT_CODE})"
+   exit "${BUILD_EXIT_CODE}"
+ else
+   echo "✅ Documentation build completed successfully"
+ fi
- name: Upload docs artifact
uses: actions/upload-artifact@v4
with:
diff --git a/.gitignore b/.gitignore
index 413066489..c952405d6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -153,7 +153,7 @@ docs/source/generated_examples/
docs/source/gen_modules/
docs/source/generated/
docs/source/sg_execution_times.rst
-docs/source/tutorials
+docs/source/tutorials/*
# pytorch-sphinx-theme gets installed here
docs/src
diff --git a/docs/requirements.txt b/docs/requirements.txt
index 8846bc62e..525ca1e86 100644
--- a/docs/requirements.txt
+++ b/docs/requirements.txt
@@ -6,3 +6,4 @@ sphinxcontrib-mermaid==1.0.0
sphinx-gallery==0.19.0
myst-parser #==0.18.1 # if want to contribute in markdown
sphinx-sitemap==2.7.1
+sphinx-autodoc-typehints==1.25.3
diff --git a/docs/source/_static/custom.css b/docs/source/_static/custom.css
new file mode 100644
index 000000000..89854cc8b
--- /dev/null
+++ b/docs/source/_static/custom.css
@@ -0,0 +1,98 @@
+/* Custom CSS for collapsible parameter lists */
+
+/* Hide parameters in signatures */
+.sig-param-hidden {
+    display: none !important;
+}
+
+/* Inline toggle button for signatures */
+.params-toggle-btn-inline {
+    display: inline;
+    padding: 0.2rem 0.5rem;
+    margin: 0 0.25rem;
+    background-color: var(--pst-color-background);
+    border: 1px solid var(--pst-color-border);
+    border-radius: 3px;
+    cursor: pointer;
+    font-size: 0.85em;
+    font-family: var(--pst-font-family-base);
+    color: var(--pst-color-primary);
+    transition: all 0.2s ease;
+    vertical-align: middle;
+}
+
+.params-toggle-btn-inline:hover {
+    background-color: var(--pst-color-background);
+    border-color: var(--pst-color-primary); /* visible hover feedback */
+}
+
+.params-toggle-btn-inline:focus-visible {
+    outline: 2px solid var(--pst-color-primary); /* keep keyboard focus visible */
+}
+
+.toggle-icon {
+    display: inline-block;
+    font-size: 0.8em;
+    transition: transform 0.2s ease;
+}
+
+/* Wrapper for the button */
+.sig-params-wrapper {
+    display: inline;
+}
+
+/* Old styles for field-list collapsing (kept for backward compatibility) */
+.collapsible-params {
+    margin: 1rem 0;
+}
+
+.params-toggle-btn {
+    display: inline-block;
+    padding: 0.5rem 1rem;
+    margin-bottom: 0.5rem;
+    background-color: var(--pst-color-background);
+    border: 1px solid var(--pst-color-border);
+    border-radius: 4px;
+    cursor: pointer;
+    font-size: 0.9rem;
+    color: var(--pst-color-primary);
+    transition: all 0.3s ease;
+}
+
+.params-toggle-btn:hover {
+    background-color: var(--pst-color-background);
+    border-color: var(--pst-color-primary); /* visible hover feedback */
+}
+
+.params-content {
+    max-height: 10000px;
+    overflow: hidden;
+    transition: max-height 0.5s ease, opacity 0.3s ease;
+    opacity: 1;
+}
+
+.params-content.collapsed {
+    max-height: 0;
+    opacity: 0;
+}
+
+/* Ensure the collapsed parameters look good */
+.params-content dl.field-list {
+    margin-top: 0;
+}
+
+.params-content > dt {
+    margin-top: 0.5rem;
+}
+
+.params-content > dt:first-child {
+    margin-top: 0;
+}
+
+/* Responsive adjustments */
+@media (max-width: 768px) {
+    .params-toggle-btn {
+        width: 100%;
+        text-align: left;
+    }
+}
diff --git a/docs/source/_static/custom.js b/docs/source/_static/custom.js
new file mode 100644
index 000000000..415592d30
--- /dev/null
+++ b/docs/source/_static/custom.js
@@ -0,0 +1,93 @@
+// Collapse long parameter lists in Python API signatures behind a toggle.
+//
+// Any class/function/method signature with more than COLLAPSE_THRESHOLD
+// parameters keeps its first VISIBLE_COUNT parameters visible; the rest are
+// hidden with the `sig-param-hidden` class until the reader clicks the inline
+// "Show More" button inserted into the signature.
+document.addEventListener('DOMContentLoaded', function() {
+  // Signatures with more parameters than this become collapsible.
+  const COLLAPSE_THRESHOLD = 10;
+  // Number of leading parameters that always stay visible.
+  const VISIBLE_COUNT = 5;
+  // CSS class that hides a node.
+  const HIDDEN_CLASS = 'sig-param-hidden';
+
+  // Signature lines (<dt>) of documented classes, functions and methods.
+  const signatures = document.querySelectorAll(
+    'dl.py.class > dt, dl.py.function > dt, dl.py.method > dt'
+  );
+
+  signatures.forEach(function(signature) {
+    const params = signature.querySelectorAll('.sig-param');
+
+    // Short signatures are left untouched.
+    if (params.length <= COLLAPSE_THRESHOLD) {
+      return;
+    }
+
+    const hiddenCount = params.length - VISIBLE_COUNT;
+
+    // Inline wrapper that hosts the toggle button inside the signature.
+    const wrapper = document.createElement('span');
+    wrapper.className = 'sig-params-wrapper';
+    wrapper.style.display = 'inline';
+
+    const toggleBtn = document.createElement('button');
+    // Explicit type so the button never acts as a form submit control.
+    toggleBtn.type = 'button';
+    toggleBtn.className = 'params-toggle-btn-inline';
+
+    // Parameters, plus the separator text after them, that get toggled.
+    const nodesToHide = [];
+
+    // Single place that keeps node visibility, ARIA state, label and tooltip
+    // in sync for both the collapsed and the expanded state.
+    function setExpanded(expanded) {
+      nodesToHide.forEach(function(node) {
+        node.classList.toggle(HIDDEN_CLASS, !expanded);
+      });
+      toggleBtn.setAttribute('aria-expanded', String(expanded));
+      // textContent rather than innerHTML: the label is plain text.
+      toggleBtn.textContent = expanded ? ' Hide' : ' Show More';
+      toggleBtn.title = expanded
+        ? `Hide ${hiddenCount} parameters`
+        : `Show ${hiddenCount} more parameters`;
+    }
+
+    params.forEach(function(param, index) {
+      if (index < VISIBLE_COUNT) {
+        return;
+      }
+
+      // The toggle sits immediately before the first hidden parameter.
+      if (index === VISIBLE_COUNT) {
+        param.parentNode.insertBefore(wrapper, param);
+        wrapper.appendChild(toggleBtn);
+      }
+
+      nodesToHide.push(param);
+
+      // A bare text node cannot carry a class, so the separator text
+      // (comma/space) directly after this parameter is moved into a <span>
+      // that can be hidden together with it.
+      const nextNode = param.nextSibling;
+      if (nextNode && nextNode.nodeType === Node.TEXT_NODE) {
+        const textSpan = document.createElement('span');
+        textSpan.textContent = nextNode.textContent;
+        nextNode.parentNode.replaceChild(textSpan, nextNode);
+        nodesToHide.push(textSpan);
+      }
+    });
+
+    toggleBtn.addEventListener('click', function(e) {
+      // Stop the click from bubbling to ancestors of the signature markup.
+      e.preventDefault();
+      e.stopPropagation();
+
+      setExpanded(toggleBtn.getAttribute('aria-expanded') !== 'true');
+    });
+
+    // Every long signature starts out collapsed.
+    setExpanded(false);
+  });
+});
diff --git a/docs/source/api.md b/docs/source/api.md
index 5ed009c4c..1235f9d4e 100644
--- a/docs/source/api.md
+++ b/docs/source/api.md
@@ -1,35 +1,35 @@
# API Reference
-This section provides comprehensive API documentation for TorchForge modules and classes.
+This section provides comprehensive API documentation for TorchForge.
-TorchForge is organized into several key modules, each providing specialized functionality for post-training generative AI models:
+## Overview
-## Module Overview
+TorchForge is a PyTorch native platform for post-training generative AI models,
+designed to streamline reinforcement learning workflows for large language
+models. The platform leverages PyTorch's distributed computing capabilities
+and is built on top of [Monarch](https://meta-pytorch.org/monarch/),
+making extensive use of actors for distributed computation and fault tolerance.
-**Core Components**
-- [Interfaces & Types](api_core.md) - Core interfaces and type definitions
-- [Actors](api_actors.md) - Model training and inference components
-- [Controller](api_controller.md) - Distributed training orchestration and resource management
+Key Features of TorchForge include:
-**Data Management**
-- [Data](api_data.md) - Data handling utilities, datasets, and data models
+- **Actor-Based Architecture**: TorchForge uses an actor-based system for distributed training, providing excellent scalability and fault tolerance.
+- **PyTorch Native**: Built natively on PyTorch, ensuring seamless integration with existing PyTorch workflows.
+- **Post-Training Focus**: Specifically designed for post-training techniques like RLHF, SFT, and other alignment methods.
+- **Distributed by Design**: Supports multi-GPU and multi-node training out of the box.
-**Training Components**
-- [Losses](api_losses.md) - Loss functions for reinforcement learning and supervised fine-tuning
-- [Environments](api_envs.md) - Training and inference environments
-**Tools & Utilities**
-- [Utilities](api_util.md) - General utility functions and helpers
+For most use cases, you'll interact with the high-level service
+interfaces, which handle the complexity of actor coordination and
+distributed training automatically.
-```{toctree}
-:maxdepth: 2
-:hidden:
+For advanced users who need fine-grained control, the individual actor
+APIs provide direct access to the underlying distributed components.
-api_core
+```{toctree}
+:maxdepth: 1
api_actors
-api_data
-api_losses
-api_envs
-api_controller
-api_util
+api_service
+api_generator
+api_model
+api_trainer
```
diff --git a/docs/source/api_actors.md b/docs/source/api_actors.md
index 6ef5f1ff8..73eae1220 100644
--- a/docs/source/api_actors.md
+++ b/docs/source/api_actors.md
@@ -1,19 +1,20 @@
-# Actors
-
-The actors module contains the core components for model training and inference in TorchForge. This includes policy actors, reference models, replay buffers, and trainers.
-
-## Policy Actor
-
-The policy actor is responsible for model inference and policy interactions during training.
-
-## Reference Model
-
-The reference model provides baseline comparisons for reinforcement learning algorithms.
-
-## Replay Buffer
-
-The replay buffer manages storage and sampling of training experiences.
-
-## Trainer
-
-The trainer orchestrates the training process and implements training algorithms.
+# ForgeActor
+
+```{eval-rst}
+.. currentmodule:: forge.actors
+```
+
+The actors module contains the core components for model training
+and inference in TorchForge. These pre-built actors provide essential
+functionality for reinforcement learning workflows and can be used
+as building blocks for complex distributed training systems.
+
+```{eval-rst}
+.. currentmodule:: forge.controller.actor
+
+.. autoclass:: ForgeActor
+ :members:
+ :undoc-members:
+ :show-inheritance:
+ :exclude-members: logger, setup, set_env, __init__
+```
diff --git a/docs/source/api_controller.md b/docs/source/api_controller.md
deleted file mode 100644
index e9bedda74..000000000
--- a/docs/source/api_controller.md
+++ /dev/null
@@ -1,3 +0,0 @@
-# Controller
-
-Distributed training orchestration and resource management components for TorchForge.
diff --git a/docs/source/api_core.md b/docs/source/api_core.md
deleted file mode 100644
index 75b3e9ae5..000000000
--- a/docs/source/api_core.md
+++ /dev/null
@@ -1,3 +0,0 @@
-# Core Interfaces
-
-This section covers the fundamental interfaces and type definitions that form the foundation of TorchForge.
diff --git a/docs/source/api_data.md b/docs/source/api_data.md
deleted file mode 100644
index cbc1cfc53..000000000
--- a/docs/source/api_data.md
+++ /dev/null
@@ -1,16 +0,0 @@
-# Data Management
-
-Comprehensive data handling utilities for training and
-inference, including datasets, data models, and various
-data processing utilities.
-
-## Prompt
-
-Data model for input prompts and contexts.
-
-```{eval-rst}
-.. automodule:: forge.data_models.prompt
- :members:
- :undoc-members:
- :show-inheritance:
-```
diff --git a/docs/source/api_envs.md b/docs/source/api_envs.md
deleted file mode 100644
index 88e9d1cea..000000000
--- a/docs/source/api_envs.md
+++ /dev/null
@@ -1,8 +0,0 @@
-# Environments
-
-Training and inference environments for TorchForge models.
-
-
-## Chat Environment
-
-Chat-based environment for conversational AI training and inference.
diff --git a/docs/source/api_generator.md b/docs/source/api_generator.md
new file mode 100644
index 000000000..a0bb67f3d
--- /dev/null
+++ b/docs/source/api_generator.md
@@ -0,0 +1,46 @@
+# Generator
+
+```{eval-rst}
+.. currentmodule:: forge.actors.policy
+```
+
+The Generator (Policy) is the core inference engine in TorchForge,
+built on top of [vLLM](https://docs.vllm.ai/en/latest/).
+It manages model serving, text generation, and weight updates for reinforcement learning workflows.
+
+## Policy
+
+```{eval-rst}
+.. autoclass:: Policy
+ :members: launch, generate, update_weights, get_version, stop
+ :exclude-members: __init__
+ :no-inherited-members:
+```
+
+## Configuration
+
+### EngineConfig
+
+```{eval-rst}
+.. autoclass:: EngineConfig
+ :members:
+ :undoc-members:
+ :no-inherited-members:
+```
+
+### SamplingConfig
+
+```{eval-rst}
+.. autoclass:: SamplingConfig
+ :members:
+ :undoc-members:
+```
+
+## PolicyWorker
+
+```{eval-rst}
+.. autoclass:: PolicyWorker
+ :members: execute_model, update, setup_kv_cache
+ :show-inheritance:
+ :exclude-members: __init__
+```
diff --git a/docs/source/api_losses.md b/docs/source/api_losses.md
deleted file mode 100644
index 097b83394..000000000
--- a/docs/source/api_losses.md
+++ /dev/null
@@ -1,11 +0,0 @@
-# Losses
-
-Loss functions for reinforcement learning and supervised fine-tuning in TorchForge.
-
-## GRPO Loss
-
-Generalized Reward Policy Optimization (GRPO) loss implementation for reinforcement learning.
-
-## Reinforce Loss
-
-Reinforce algorithm loss implementation for policy gradient methods.
diff --git a/docs/source/api_model.md b/docs/source/api_model.md
new file mode 100644
index 000000000..94e51478e
--- /dev/null
+++ b/docs/source/api_model.md
@@ -0,0 +1,29 @@
+# Model
+
+```{eval-rst}
+.. currentmodule:: forge.actors.reference_model
+```
+
+The {class}`forge.actors.reference_model.ReferenceModel` provides a frozen
+copy of the policy model used for computing advantages in reinforcement
+learning. It performs inference on input sequences and returns logits or
+log probabilities for computing KL divergence and other RL metrics.
+
+## ReferenceModel
+
+```{eval-rst}
+.. autoclass:: forge.actors.reference_model.ReferenceModel
+ :members:
+ :undoc-members:
+ :show-inheritance:
+```
+
+The ReferenceModel uses a subset of TorchTitan's configuration system:
+
+- **model**: Model architecture settings (Model dataclass)
+- **parallelism**: Parallelism configuration for distributed inference (Parallelism dataclass)
+- **checkpoint**: Checkpoint loading settings (Checkpoint dataclass)
+- **compile**: Model compilation settings (Compile dataclass)
+- **training**: Training configuration for dtype and other settings (Training dataclass)
+
+For detailed configuration options, refer to the [TorchTitan documentation](https://github.com/pytorch/torchtitan).
diff --git a/docs/source/api_service.md b/docs/source/api_service.md
new file mode 100644
index 000000000..df2bf3dc8
--- /dev/null
+++ b/docs/source/api_service.md
@@ -0,0 +1,12 @@
+# Service
+
+```{eval-rst}
+.. currentmodule:: forge.controller.service.service
+```
+
+```{eval-rst}
+.. autoclass:: Service
+ :members: call_all, start_session, get_metrics, get_metrics_summary, terminate_session, stop
+ :show-inheritance:
+
+```
diff --git a/docs/source/api_trainer.md b/docs/source/api_trainer.md
new file mode 100644
index 000000000..75aba94f0
--- /dev/null
+++ b/docs/source/api_trainer.md
@@ -0,0 +1,68 @@
+# Trainer
+
+```{eval-rst}
+.. currentmodule:: forge.actors.trainer
+```
+
+The Trainer manages model training in TorchForge, built on top of TorchTitan.
+It handles forward/backward passes, weight updates, and checkpoint management for reinforcement learning workflows.
+
+## RLTrainer
+
+```{eval-rst}
+.. autoclass:: RLTrainer
+ :members: train_step, push_weights, cleanup
+ :exclude-members: __init__
+```
+
+## Configuration
+
+The RLTrainer uses TorchTitan's configuration system with the following components:
+
+### Job Configuration
+
+```{eval-rst}
+.. autoclass:: torchtitan.config.job_config.Job
+ :members:
+ :undoc-members:
+```
+
+### Model Configuration
+
+```{eval-rst}
+.. autoclass:: torchtitan.config.job_config.Model
+ :members:
+ :undoc-members:
+```
+
+### Optimizer Configuration
+
+```{eval-rst}
+.. autoclass:: torchtitan.config.job_config.Optimizer
+ :members:
+ :undoc-members:
+```
+
+### Training Configuration
+
+```{eval-rst}
+.. autoclass:: torchtitan.config.job_config.Training
+ :members:
+ :undoc-members:
+```
+
+### Parallelism Configuration
+
+```{eval-rst}
+.. autoclass:: torchtitan.config.job_config.Parallelism
+ :members:
+ :undoc-members:
+```
+
+### Checkpoint Configuration
+
+```{eval-rst}
+.. autoclass:: torchtitan.config.job_config.Checkpoint
+ :members:
+ :undoc-members:
+```
diff --git a/docs/source/api_util.md b/docs/source/api_util.md
deleted file mode 100644
index f15e03b76..000000000
--- a/docs/source/api_util.md
+++ /dev/null
@@ -1,25 +0,0 @@
-# Utilities
-
-General utility functions and helpers used throughout TorchForge.
-
-## Distributed Computing
-
-Utilities for distributed training and communication.
-
-```{eval-rst}
-.. automodule:: forge.util.distributed
- :members:
- :undoc-members:
- :show-inheritance:
-```
-
-## Logging
-
-Logging configuration and utilities.
-
-```{eval-rst}
-.. automodule:: forge.util.logging
- :members:
- :undoc-members:
- :show-inheritance:
-```
diff --git a/docs/source/conf.py b/docs/source/conf.py
index 760a8d714..ee9d62148 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -58,6 +58,7 @@ def get_version_path():
"myst_parser",
"sphinx.ext.autodoc",
"sphinx.ext.autosummary",
"sphinx.ext.napoleon",
+ "sphinx_autodoc_typehints",
"sphinx.ext.intersphinx",
"sphinx.ext.viewcode",
@@ -74,11 +75,18 @@ def get_version_path():
]
sitemap_url_scheme = "{link}"
+# Ensure static files use relative paths
+html_static_path = ["_static"]
+
templates_path = [
"_templates",
os.path.join(os.path.dirname(pytorch_sphinx_theme2.__file__), "templates"),
]
-exclude_patterns = ["tutorials/index.rst"]
+exclude_patterns = ["tutorials/index.rst", "tutorials/template_tutorial.rst"]
+
+# Custom assets from _static/ that implement the collapsible parameter lists
+html_css_files = ["custom.css"]
+html_js_files = ["custom.js"]
sys.path.insert(0, os.path.abspath("."))
sys.path.insert(0, os.path.abspath("../../src"))
@@ -124,6 +132,8 @@ def get_version_path():
"navbar_center": "navbar-nav",
"canonical_url": "https://meta-pytorch.org/forge/",
"header_links_before_dropdown": 7,
+ "show_nav_level": 2,
+ "show_toc_level": 2,
}
theme_variables = pytorch_sphinx_theme2.get_theme_variables()
@@ -160,11 +170,42 @@ def get_version_path():
autodoc_default_options = {
"members": True,
"member-order": "bysource",
- "special-members": "__init__",
- "undoc-members": True,
"exclude-members": "__weakref__",
+ "private-members": False,
}
+# Autodoc configuration for cleaner signatures
+autodoc_preserve_defaults = True # Preserves default values without expansion
+autodoc_typehints = "description" # Move type hints to description instead of signature
+autodoc_typehints_description_target = (
+ "documented" # Only add types to documented params
+)
+
+# Suppress warnings from third-party library docstrings
+suppress_warnings = [
+ "docutils", # Suppress docstring formatting issues from third-party libraries
+ "app.add_node", # Suppress node warnings
+ "app.add_directive", # Suppress directive warnings
+ "ref.class", # Suppress missing reference warnings
+ "ref.func", # Suppress missing function reference warnings
+ "ref.meth", # Suppress missing method reference warnings
+]
+
+# Keep warnings as "system message" paragraphs in the rendered pages; this does not affect the build's exit status
+keep_warnings = True
+
+# Leave nitpicky mode off: with it on, Sphinx warns about every reference whose
+# target cannot be found, which can be noisy when documenting classes from
+# third-party dependencies (like torchtitan)
+nitpicky = False # Don't be overly strict about references
+
+# Napoleon settings for Google-style docstrings (from torchtitan and other dependencies)
+napoleon_google_docstring = True
+napoleon_numpy_docstring = True
+napoleon_use_param = True
+napoleon_use_rtype = True
+napoleon_use_ivar = True
+
# -- Sphinx Gallery configuration -------------------------------------------
sphinx_gallery_conf = {
@@ -176,6 +217,6 @@ def get_version_path():
"plot_gallery": "True",
"promote_jupyter_magic": True,
"backreferences_dir": None,
- "write_computation_times": True,
"show_signature": False,
+ "write_computation_times": False,
}
diff --git a/src/forge/actors/reference_model.py b/src/forge/actors/reference_model.py
index cc57e5246..bfe9f9494 100644
--- a/src/forge/actors/reference_model.py
+++ b/src/forge/actors/reference_model.py
@@ -37,6 +37,10 @@
@dataclass
class ReferenceModel(ForgeActor):
+ """
+ Reference model implementation for the TorchForge service.
+ """
+
# Refer to titan JobConfig for enabling more ForgeEngine configuration
model: Model = field(default_factory=Model)
parallelism: Parallelism = field(default_factory=Parallelism)
diff --git a/src/forge/actors/trainer.py b/src/forge/actors/trainer.py
index 4ffc63001..486286680 100644
--- a/src/forge/actors/trainer.py
+++ b/src/forge/actors/trainer.py
@@ -95,6 +95,10 @@ def cleanup_old_weight_versions(
@dataclass
class RLTrainer(ForgeActor):
+ """
+ RL Trainer implementation for the TorchForge service.
+ """
+
job: Job = field(default_factory=Job)
model: Model = field(default_factory=Model)
optimizer: Optimizer = field(default_factory=Optimizer)
diff --git a/src/forge/controller/actor.py b/src/forge/controller/actor.py
index a899da6f0..4a5cbf173 100644
--- a/src/forge/controller/actor.py
+++ b/src/forge/controller/actor.py
@@ -22,11 +22,36 @@
class ForgeActor(Actor):
+ """
+ Base class for Forge actors with configurable resource attributes.
+
+ The initialization sets up logging configuration with rank/size information and
+ initializes the actor's process mesh reference. The rank and size are automatically
+ determined from the current execution context.
+
+ Args:
+ *args: Variable length argument list passed to the parent Actor class.
+ **kwargs: Arbitrary keyword arguments passed to the parent Actor class.
+ """
+
procs: int = 1
+ """Number of processes to use for this actor. Defaults to 1."""
+
hosts: int | None = None
+ """Number of hosts to distribute the actor across. If None, uses as many
+ hosts as needed to accommodate the requested processes. Defaults to None."""
+
with_gpus: bool = False
+ """Whether to allocate GPU resources for this actor. Defaults to False."""
+
num_replicas: int = 1
+ """Number of replicas to create when spawning as a service.
+ Only applies when using as_service(). Defaults to 1."""
+
mesh_name: str | None = None
+ """Optional name for the process mesh used by this actor.
+ If None, a default name will be generated. Defaults to None."""
+
_extra_config: dict[str, Any] = {}
def __init__(self, *args, **kwargs):
@@ -69,23 +94,35 @@ def options(
`.as_actor()` or `.as_service()`. Each call creates a separate subclass, so
multiple different configurations can coexist without interfering with each other.
- ---- Usage Examples ----
+ Examples:
+
+ * Pre-configure a service with multiple replicas:
+
+ .. code-block:: python
+
+ service = await MyForgeActor.options(num_replicas=2, procs=2).as_service(...)
+ await service.shutdown()
+
+ * Default usage without calling options:
+
+ .. code-block:: python
+
+ service = await MyForgeActor.as_service(...)
+ await service.shutdown()
+
+ * Pre-configure a single actor
+
+ .. code-block:: python
- # Pre-configure a service with multiple replicas
- service = await MyForgeActor.options(num_replicas=2, procs=2).as_service(...)
- await service.shutdown()
+ actor = await MyForgeActor.options(procs=1, hosts=1).as_actor(...)
+ await actor.shutdown()
- # Default usage without calling options
- service = await MyForgeActor.as_service(...)
- await service.shutdown()
+ * Default usage without calling options
- # Pre-configure a single actor
- actor = await MyForgeActor.options(procs=1, hosts=1).as_actor(...)
- await actor.shutdown()
+ .. code-block:: python
- # Default usage without calling options
- actor = await MyForgeActor.as_actor(...)
- await actor.shutdown()
+ actor = await MyForgeActor.as_actor(...)
+ await actor.shutdown()
"""
attrs = {
diff --git a/src/forge/controller/service/service.py b/src/forge/controller/service/service.py
index 0b655fb6a..1413cbba1 100644
--- a/src/forge/controller/service/service.py
+++ b/src/forge/controller/service/service.py
@@ -68,13 +68,6 @@ class Service:
actor_def: Actor class definition to instantiate on each replica
*actor_args: Positional arguments passed to actor constructor
**actor_kwargs: Keyword arguments passed to actor constructor
-
-
- Attributes:
- _cfg: Service configuration
- _replicas: List of managed replica instances
- _active_sessions: Currently active sessions
- _metrics: Aggregated service and replica metrics
"""
def __init__(
@@ -486,6 +479,10 @@ async def _get_replica(self, sess_id: str | None) -> "Replica":
)
async def stop(self):
+ """
+ Stops the service and all managed replicas.
+ This method should be called when the service is no longer needed.
+ """
logger.debug("Stopping service...")
# Signal shutdown to health loop
self._shutdown_requested = True
@@ -605,12 +602,6 @@ class ServiceActor(Actor):
actor_def: Actor class definition to instantiate on each replica
*actor_args: Positional arguments passed to actor constructor
**actor_kwargs: Keyword arguments passed to actor constructor
-
- Attributes:
- _cfg: Service configuration
- _replicas: List of managed replica instances
- _active_sessions: Currently active sessions
- _metrics: Aggregated service and replica metrics
"""
def __init__(self, cfg: ServiceConfig, actor_def, actor_kwargs: dict):