Merged

Commits (34)
da21e1d
Add reward interface, math reward, unit tests
DNXie Aug 21, 2025
5c72908
Merge branch 'meta-pytorch:main' into main
DNXie Aug 22, 2025
b4d7a61
Merge branch 'meta-pytorch:main' into main
DNXie Aug 25, 2025
02d77c6
Merge branch 'meta-pytorch:main' into main
DNXie Aug 27, 2025
fd1d38b
Merge branch 'meta-pytorch:main' into main
DNXie Aug 28, 2025
f79beee
Merge branch 'meta-pytorch:main' into main
DNXie Aug 28, 2025
d8d775a
Merge branch 'meta-pytorch:main' into main
DNXie Sep 2, 2025
e423c44
Merge branch 'meta-pytorch:main' into main
DNXie Sep 4, 2025
4815c05
Merge branch 'meta-pytorch:main' into main
DNXie Sep 8, 2025
77d41e4
Merge branch 'meta-pytorch:main' into main
DNXie Sep 9, 2025
a3feb1e
Merge branch 'meta-pytorch:main' into main
DNXie Sep 10, 2025
23d7e02
Merge branch 'meta-pytorch:main' into main
DNXie Sep 11, 2025
2ca881d
refactor, buggy
DNXie Sep 11, 2025
4df5d3a
some tweak
DNXie Sep 12, 2025
0b5c0db
add options under forgeactor, and tested it with vllm main
DNXie Sep 12, 2025
1cc5cf2
add as_service to forgeactor for default config
DNXie Sep 12, 2025
a92952a
enable passing serviceConfig obj to options
DNXie Sep 15, 2025
2ce61d1
update all the usages
DNXie Sep 15, 2025
f32fef7
Merge branch 'main' into add_options
DNXie Sep 15, 2025
f28824d
fix script error and CI broken test
DNXie Sep 15, 2025
549f43a
fix ci test
DNXie Sep 15, 2025
26a4207
remove redundant line
DNXie Sep 15, 2025
a311cbd
Update tests/unit_tests/test_service.py
DNXie Sep 15, 2025
1261568
Merge branch 'main' into add_options
DNXie Sep 15, 2025
c1854ec
add missing import back
DNXie Sep 15, 2025
e595fbd
fix ci
DNXie Sep 16, 2025
52cb676
Merge branch 'add_options' of github.com:DNXie/forge into DNXie-add_o…
DNXie Sep 16, 2025
fd38100
Merge branch 'DNXie-add_options'
DNXie Sep 16, 2025
9a80b16
Merge branch 'main' of github.com:DNXie/forge
DNXie Sep 16, 2025
09e7237
merge
DNXie Sep 16, 2025
0165027
make options return a forgeactor and as_service return a serviceinter…
DNXie Sep 16, 2025
721e32a
remove redundant test case
DNXie Sep 16, 2025
915baf1
fix lint
DNXie Sep 16, 2025
4171675
fix broken ci
DNXie Sep 16, 2025
56 changes: 18 additions & 38 deletions apps/grpo/main.py
@@ -21,7 +21,6 @@
 from forge.actors.trainer import _qwen3_hf_to_vllm
 from forge.cli.config import parse
 from forge.controller.actor import ForgeActor
-from forge.controller.service import ServiceConfig, shutdown_service, spawn_service
 from forge.data.rewards import MathReward, ThinkingReward
 from forge.util.metric_logging import get_metric_logger
 from monarch.actor import endpoint
@@ -351,39 +350,20 @@ async def main(cfg: DictConfig):
         ref_model,
         reward_actor,
     ) = await asyncio.gather(
-        spawn_service(
-            ServiceConfig(**cfg.dataset.service),
-            DatasetActor,
-            **exclude_service(cfg.dataset),
+        DatasetActor.options(**cfg.dataset.service).as_service(
+            **exclude_service(cfg.dataset)
         ),
-        spawn_service(
-            ServiceConfig(**cfg.policy.service),
-            Policy,
-            **exclude_service(cfg.policy),
+        Policy.options(**cfg.policy.service).as_service(**exclude_service(cfg.policy)),
+        Trainer.options(**cfg.trainer.service).as_service(
+            **exclude_service(cfg.trainer)
         ),
-        spawn_service(
-            ServiceConfig(**cfg.trainer.service),
-            Trainer,
-            **exclude_service(cfg.trainer),
+        ReplayBuffer.options(**cfg.replay_buffer.service).as_service(
+            **exclude_service(cfg.replay_buffer)
         ),
-        spawn_service(
-            ServiceConfig(**cfg.replay_buffer.service),
-            ReplayBuffer,
-            **exclude_service(cfg.replay_buffer),
-        ),
-        spawn_service(
-            ServiceConfig(**cfg.compute_advantages.service),
-            ComputeAdvantages,
-        ),
-        spawn_service(
-            ServiceConfig(**cfg.ref_model.service),
-            RefModel,
-            model_name=model,
-        ),
-        spawn_service(
-            ServiceConfig(**cfg.reward_actor.service),
-            RewardActor,
-            reward_functions=[MathReward(), ThinkingReward()],
+        ComputeAdvantages.options(**cfg.compute_advantages.service).as_service(),
+        RefModel.options(**cfg.ref_model.service).as_service(model_name=model),
+        RewardActor.options(**cfg.reward_actor.service).as_service(
+            reward_functions=[MathReward(), ThinkingReward()]
         ),
     )

@@ -468,13 +448,13 @@ async def continuous_training():
     finally:
         print("Shutting down...")
         await asyncio.gather(
-            shutdown_service(policy),
-            shutdown_service(trainer),
-            shutdown_service(replay_buffer),
-            shutdown_service(dataloader),
-            shutdown_service(compute_advantages),
-            shutdown_service(ref_model),
-            shutdown_service(reward_actor),
+            dataloader.shutdown(),
+            policy.shutdown(),
+            trainer.shutdown(),
+            replay_buffer.shutdown(),
+            compute_advantages.shutdown(),
+            ref_model.shutdown(),
+            reward_actor.shutdown(),
         )


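The exclude_service helper used at these call sites is imported from src.forge.data.utils and is not part of this diff. The following is a rough sketch of what it presumably does (an assumption, not the repository's actual implementation): strip the nested service block so the remaining keys can be forwarded as actor constructor kwargs.

# Hypothetical sketch -- the real helper lives in src/forge/data/utils and may differ.
from typing import Any, Mapping


def exclude_service(cfg: Mapping[str, Any]) -> dict[str, Any]:
    """Drop the nested 'service' block; everything else becomes actor constructor kwargs."""
    return {k: v for k, v in cfg.items() if k != "service"}

Under that assumption, cfg.dataset.service feeds ServiceConfig via DatasetActor.options(**cfg.dataset.service), while exclude_service(cfg.dataset) supplies the DatasetActor constructor arguments.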
25 changes: 10 additions & 15 deletions apps/rl/main.py
@@ -16,31 +16,26 @@
 
 from forge.actors import ReplayBuffer, RLTrainer
 from forge.cli.config import parse
-from forge.controller.service import ServiceConfig, shutdown_service, spawn_service
 
 from omegaconf import DictConfig
 
 logger = logging.getLogger(__name__)
 logger.setLevel(logging.INFO)
 
 
 async def run(cfg: DictConfig):
-    trainer, replay_buffer = await asyncio.gather(
-        spawn_service(
-            ServiceConfig(procs_per_replica=1, with_gpus=True, num_replicas=4),
-            RLTrainer,
-            **cfg.trainer,
-        ),
-        spawn_service(
-            ServiceConfig(procs_per_replica=1, num_replicas=1),
-            ReplayBuffer,
-            **cfg.replay_buffer,
-        ),
-    )
+    trainer = await RLTrainer.options(
+        procs_per_replica=1, with_gpus=True, num_replicas=4
+    ).as_service(**cfg.trainer)
+    replay_buffer = await ReplayBuffer.options(
+        procs_per_replica=1, num_replicas=1
+    ).as_service(**cfg.replay_buffer)
 
     print("Services initialized....")
 
     print("shutting down...")
-    await shutdown_service(trainer)
-    await shutdown_service(replay_buffer)
+    await trainer.shutdown()
+    await replay_buffer.shutdown()
 
 
 @parse
1 change: 0 additions & 1 deletion apps/vllm/llama3_8b.yaml
@@ -8,7 +8,6 @@ policy:
   n: 2
   guided_decoding: false
   max_tokens: 512
-  available_devices: null
   service:
     procs_per_replica: 2
     num_replicas: 1
9 changes: 3 additions & 6 deletions apps/vllm/main.py
@@ -13,7 +13,6 @@
 
 from forge.actors.policy import Policy
 from forge.cli.config import parse
-from forge.controller.service import ServiceConfig, shutdown_service, spawn_service
 
 from omegaconf import DictConfig
 from src.forge.data.utils import exclude_service
@@ -28,10 +27,8 @@ async def run(cfg: DictConfig):
 
     print("Spawning service...")
 
-    policy = await spawn_service(
-        ServiceConfig(**cfg.policy.service),
-        Policy,
-        **exclude_service(cfg.policy),
+    policy = await Policy.options(**cfg.policy.service).as_service(
+        **exclude_service(cfg.policy)
     )
 
     async with policy.session():
@@ -48,7 +45,7 @@
 
     print("\nShutting down...")
 
-    await shutdown_service(policy)
+    await policy.shutdown()
 
 
 @parse
15 changes: 4 additions & 11 deletions src/forge/actors/reference_actor.py
@@ -14,16 +14,11 @@
 from collections.abc import Mapping
 from dataclasses import dataclass, field, fields
 
-from typing import Any
-
-import torch
 from monarch.actor import current_rank, current_size, endpoint
 from omegaconf import DictConfig, OmegaConf
-from torch import nn
 
-from torchtitan.components.lr_scheduler import LRSchedulersContainer
-from torchtitan.config.job_config import Comm, Model, Parallelism
-from torchtitan.distributed import ParallelDims, utils as dist_utils
+from torchtitan.config.job_config import Model, Parallelism
+from torchtitan.distributed import utils as dist_utils
 from torchtitan.experiments.forge.engine import ForgeEngine
 from torchtitan.experiments.forge.job_config import ForgeJobConfig
 from transformers import AutoModelForCausalLM
@@ -334,10 +329,8 @@ async def setup(self):
         engine_config = {f.name: getattr(self, f.name) for f in fields(self)}
         self.engine = ForgeEngine(ForgeJobConfig(**engine_config))
 
-        # Spawn the RefModel
-        self.ref_model = await spawn_service(
-            default_service_cfg,
-            HuggingFaceRefModel,
+        # Spawn the RefModel with default service config
+        self.ref_model = HuggingFaceRefModel.as_service(
            model_name=self.model.name,
            device=self.device,
        )
114 changes: 113 additions & 1 deletion src/forge/controller/actor.py
@@ -8,11 +8,13 @@
 
 import math
 import sys
+from typing import Type
 
 from monarch.actor import Actor, current_rank, current_size, endpoint
 
 from forge.controller.proc_mesh import get_proc_mesh, stop_proc_mesh
-from forge.types import ProcessConfig
+
+from forge.types import ProcessConfig, ServiceConfig
 
 logger = logging.getLogger(__name__)
 logger.setLevel(logging.DEBUG)
@@ -41,6 +43,116 @@ def __init__(self, *args, **kwargs):
         self.logger.root.addHandler(stdout_handler)
         super().__init__(*args, **kwargs)
 
+    @classmethod
+    def options(
+        cls,
+        *,
+        service_config: ServiceConfig | None = None,
+        num_replicas: int | None = None,
+        procs_per_replica: int | None = None,
+        **service_kwargs,
+    ) -> Type["ConfiguredService"]:
+        """
+        Returns a ConfiguredService class that wraps this ForgeActor in a Service.
+
+        Usage (choose ONE of the following forms):
+            # Option A: construct ServiceConfig implicitly
+            service = await MyForgeActor.options(
+                num_replicas=1,
+                procs_per_replica=2,
+            ).as_service(...)
+            await service.shutdown()
+
+            # Option B: provide an explicit ServiceConfig
+            cfg = ServiceConfig(num_replicas=1, procs_per_replica=2, scheduling="round_robin")
+            service = await MyForgeActor.options(service_config=cfg).as_service(...)
+            await service.shutdown()
+
+        """
+        from forge.controller.service import Service, ServiceInterface
+
+        if service_config is not None:
+            # Use the provided config directly
+            cfg = service_config
+        else:
+            if num_replicas is None or procs_per_replica is None:
+                raise ValueError(
+                    "Must provide either `service_config` or (num_replicas + procs_per_replica)."
+                )
+            cfg = ServiceConfig(
+                num_replicas=num_replicas,
+                procs_per_replica=procs_per_replica,
+                **service_kwargs,
+            )
+
+        class ConfiguredService:

Review comment (Contributor):
Can we remove the ConfiguredService piece altogether? options() should return a type["ForgeActor"]


Reply from DNXie (author), Sep 15, 2025:
The main reason we have the ConfiguredService wrapper is to allow direct access to actor endpoints via instance attributes (e.g., service.value.choose()) while still carrying the ServiceConfig on the class.

If we remove the wrapper and make options() return a subclass of ForgeActor directly, the endpoints (like counter.value) are only accessible through the underlying ServiceInterface. So we need to change the statement

await service.value.choose()

to

await service._service_interface.value.choose()

I attempted to delegate EndpointProperty access via __getattr__ like this

def __getattr__(self, item):
    if self._service_interface is None:
        raise AttributeError(f"Service not started yet; cannot access '{item}'")
    
    attr = getattr(self._service_interface, item)
    from monarch._src.actor.endpoint import EndpointProperty
    if isinstance(attr, EndpointProperty):
        # Call the descriptor's __get__ to bind it
        return attr.__get__(self._service_interface, type(self._service_interface))

    return attr

However, it didn’t fully work: Python descriptors like EndpointProperty only bind correctly when accessed through the class or a properly initialized ServiceInterface instance. The __get__ call in __getattr__ does not fully replicate the descriptor binding behavior.

I’m still getting familiar with the internals of Monarch and some of the Python descriptor mechanics, so I'd love to hear any suggestions if there’s a cleaner way to handle this.


Reply from the reviewer (Contributor):
hmm, EndpointProperty binding is only needed after we do as_service() right?

this is how I imagine the options piece:

class ForgeActor(Actor):
    _service_config: ServiceConfig | None = None

    @classmethod
    def options(
        cls,
        *,
        service_config: ServiceConfig | None = None,
        num_replicas: int | None = None,
        procs_per_replica: int | None = None,
        **service_kwargs,
    ) -> Type["ForgeActor"]:
        if service_config:
            config = service_config
        else:
            config = ServiceConfig(num_replicas=num_replicas, procs_per_replica=procs_per_replica)  
        return type(
            f"{cls.__name__}",
            (cls,),
            {"_service_config": config}
        )```


Reply from DNXie (author), Sep 15, 2025:
> EndpointProperty binding is only needed after we do as_service() right?

Yes. It is needed whenever we call an endpoint function.

Your proposed implementation seems reasonable. However, it does not handle endpoint binding. After as_service(), you still need to go through service._service_interface to access endpoints.


Reply from DNXie (author):
As we discussed offline, if as_service returns a ServiceInterface directly, we cannot terminate the service with

service.shutdown()

because the returned object (service) is just a ServiceInterface. In that case, we’d have to fall back to

shutdown_service(service)

Personally, I prefer the service.shutdown() style since it feels more natural and object-oriented, but I’m okay with either one.


Reply from the reviewer (Contributor):
We can add def shutdown(self) directly to ServiceInterface


Reply from DNXie (author):
Done!
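
Following that resolution, shutdown lives on ServiceInterface itself. A minimal sketch of that method, assuming ServiceInterface keeps the underlying Service as self._service (as both the removed shutdown_service helper and the ConfiguredService wrapper below do); the real class also handles endpoint routing and sessions:

# Sketch only -- assumes ServiceInterface stores the Service as self._service.
class ServiceInterface:
    def __init__(self, service, actor_def):
        self._service = service
        self._actor_def = actor_def

    async def shutdown(self) -> None:
        # Same behavior as the removed shutdown_service() helper: stop the
        # Service directly, so actor subclasses never override stop() by accident.
        await self._service.stop()

With this in place, as_service() can return the ServiceInterface directly and callers still get the await service.shutdown() style preferred in the thread.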

"""
A wrapper around Service that binds a ForgeActor class.
Provides:
- as_service(): spawns the actor inside the service
- shutdown(): stops the service
"""

_actor_def = cls
_service_interface: ServiceInterface | None

def __init__(self) -> None:
self._service_interface = None

@classmethod
async def as_service(cls, **actor_kwargs) -> "ConfiguredService":
"""
Spawn the actor inside a Service with the given configuration.

Args:
**actor_kwargs: arguments to pass to the ForgeActor constructor

Returns:
self: so that methods like .shutdown() can be called
"""
self = cls()
logger.info("Spawning Service Actor for %s", self._actor_def.__name__)
service = Service(cfg, self._actor_def, actor_kwargs)
await service.__initialize__()
self._service_interface = ServiceInterface(service, self._actor_def)
return self

async def shutdown(self):
"""
Gracefully stops the service if it has been started.
"""
if self._service_interface is None:
raise RuntimeError("Service not started yet")
await self._service_interface._service.stop()
self._service_interface = None

def __getattr__(self, item):
"""
Delegate attribute access to the ServiceInterface instance.
This makes ConfiguredService behave like a ServiceInterface.
"""
if self._service_interface is None:
raise AttributeError(
f"Service not started yet; cannot access '{item}'"
)
return getattr(self._service_interface, item)

return ConfiguredService

@classmethod
async def as_service(cls, **actor_kwargs) -> "ConfiguredService":
"""
Spawn this ForgeActor inside a Service with default configuration.
Defaults: num_replicas=1, procs_per_replica=1

Usage:
service = await MyForgeActor.as_service(...)
await service.shutdown()
"""
return await cls.options(num_replicas=1, procs_per_replica=1).as_service(
**actor_kwargs
)

@endpoint
async def setup(self):
"""Sets up the actor.
Expand Down
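
To make the new surface concrete, here is a hedged end-to-end sketch of configuring and calling a ForgeActor subclass under this API. The Counter actor and its value endpoint are invented for illustration, and the service.value.choose() routing mirrors the usage discussed in the review thread above rather than a guarantee documented in this diff.

# Illustrative sketch only: Counter and its endpoint are made up for this example.
import asyncio

from monarch.actor import endpoint

from forge.controller.actor import ForgeActor


class Counter(ForgeActor):
    def __init__(self, start: int = 0):
        super().__init__()
        self._count = start

    @endpoint
    async def value(self) -> int:
        return self._count


async def demo():
    # Configure replicas/procs up front, then spawn the service.
    service = await Counter.options(num_replicas=2, procs_per_replica=1).as_service(
        start=5
    )
    try:
        # Endpoint access is routed through the service (per the thread above).
        current = await service.value.choose()
        print(current)
    finally:
        await service.shutdown()


if __name__ == "__main__":
    asyncio.run(demo())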
3 changes: 0 additions & 3 deletions src/forge/controller/service/__init__.py
@@ -8,7 +8,6 @@
 from .metrics import ServiceMetrics
 from .replica import Replica, ReplicaMetrics
 from .service import Service, ServiceActor, ServiceConfig
-from .spawn import shutdown_service, spawn_service
 
 __all__ = [
     "Replica",
@@ -20,6 +19,4 @@
     "Session",
     "SessionContext",
     "ServiceActor",
-    "spawn_service",
-    "shutdown_service",
 ]
40 changes: 2 additions & 38 deletions src/forge/controller/service/spawn.py
@@ -11,50 +11,14 @@
 from monarch.actor import proc_mesh
 
 from forge.controller import ForgeActor
-from forge.controller.service import Service, ServiceActor, ServiceConfig
+from forge.controller.service import ServiceActor, ServiceConfig
 
-from forge.controller.service.interface import ServiceInterface, ServiceInterfaceV2
+from forge.controller.service.interface import ServiceInterfaceV2
 
 logger = logging.getLogger(__name__)
 logger.setLevel(logging.INFO)
 
 
-async def spawn_service(
-    service_cfg: ServiceConfig, actor_def: Type[ForgeActor], **actor_kwargs
-) -> ServiceInterface:
-    """Spawns a service based on the actor class.
-
-    Args:
-        service_cfg: Service configuration
-        actor_def: Actor class definition
-        **actor_kwargs: Keyword arguments to pass to actor constructor
-
-    Returns:
-        A ServiceInterface that provides access to the Service Actor
-    """
-    # Assert that actor_def is a subclass of ForgeActor
-    if not issubclass(actor_def, ForgeActor):
-        raise TypeError(
-            f"actor_def must be a subclass of ForgeActor, got {type(actor_def).__name__}"
-        )
-
-    # Create a single-node proc_mesh and actor_mesh for the Service Actor
-    logger.info("Spawning Service Actor for %s", actor_def.__name__)
-    service = Service(service_cfg, actor_def, actor_kwargs)
-    await service.__initialize__()
-    # Return the ServiceInterface that wraps the proc_mesh, actor_mesh, and actor_def
-    return ServiceInterface(service, actor_def)
-
-
-async def shutdown_service(service: ServiceInterface) -> None:
-    """Shuts down the service.
-
-    Implemented in this way to avoid actors overriding stop() unintentionally.
-
-    """
-    await service._service.stop()
-
-
 async def spawn_service_v2(
     service_cfg: ServiceConfig, actor_def: Type[ForgeActor], **actor_kwargs
 ) -> ServiceInterfaceV2: