Skip to content
Merged
Show file tree
Hide file tree
Changes from 15 commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
da21e1d
Add reward interface, math reward, unit tests
DNXie Aug 21, 2025
5c72908
Merge branch 'meta-pytorch:main' into main
DNXie Aug 22, 2025
b4d7a61
Merge branch 'meta-pytorch:main' into main
DNXie Aug 25, 2025
02d77c6
Merge branch 'meta-pytorch:main' into main
DNXie Aug 27, 2025
fd1d38b
Merge branch 'meta-pytorch:main' into main
DNXie Aug 28, 2025
f79beee
Merge branch 'meta-pytorch:main' into main
DNXie Aug 28, 2025
d8d775a
Merge branch 'meta-pytorch:main' into main
DNXie Sep 2, 2025
e423c44
Merge branch 'meta-pytorch:main' into main
DNXie Sep 4, 2025
4815c05
Merge branch 'meta-pytorch:main' into main
DNXie Sep 8, 2025
77d41e4
Merge branch 'meta-pytorch:main' into main
DNXie Sep 9, 2025
a3feb1e
Merge branch 'meta-pytorch:main' into main
DNXie Sep 10, 2025
23d7e02
Merge branch 'meta-pytorch:main' into main
DNXie Sep 11, 2025
2ca881d
refactor, buggy
DNXie Sep 11, 2025
4df5d3a
some tweak
DNXie Sep 12, 2025
0b5c0db
add options under forgeactor, and tested it with vllm main
DNXie Sep 12, 2025
1cc5cf2
add as_service to forgeactor for default config
DNXie Sep 12, 2025
a92952a
enable passing serviceConfig obj to options
DNXie Sep 15, 2025
2ce61d1
update all the usages
DNXie Sep 15, 2025
f32fef7
Merge branch 'main' into add_options
DNXie Sep 15, 2025
f28824d
fix script error and CI broken test
DNXie Sep 15, 2025
549f43a
fix ci test
DNXie Sep 15, 2025
26a4207
remove redundant line
DNXie Sep 15, 2025
a311cbd
Update tests/unit_tests/test_service.py
DNXie Sep 15, 2025
1261568
Merge branch 'main' into add_options
DNXie Sep 15, 2025
c1854ec
add missing import back
DNXie Sep 15, 2025
e595fbd
fix ci
DNXie Sep 16, 2025
52cb676
Merge branch 'add_options' of github.com:DNXie/forge into DNXie-add_o…
DNXie Sep 16, 2025
fd38100
Merge branch 'DNXie-add_options'
DNXie Sep 16, 2025
9a80b16
Merge branch 'main' of github.com:DNXie/forge
DNXie Sep 16, 2025
09e7237
merge
DNXie Sep 16, 2025
0165027
make options return a forgeactor and as_service return a serviceinter…
DNXie Sep 16, 2025
721e32a
remove redundant test case
DNXie Sep 16, 2025
915baf1
fix lint
DNXie Sep 16, 2025
4171675
fix broken ci
DNXie Sep 16, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 3 additions & 6 deletions apps/vllm/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@

from forge.actors.policy import Policy
from forge.cli.config import parse
from forge.controller.service import ServiceConfig, shutdown_service, spawn_service

from omegaconf import DictConfig
from src.forge.data.utils import exclude_service
Expand All @@ -28,10 +27,8 @@ async def run(cfg: DictConfig):

print("Spawning service...")

policy = await spawn_service(
ServiceConfig(**cfg.policy.service),
Policy,
**exclude_service(cfg.policy),
policy = await Policy.options(**cfg.policy.service).as_service(
**exclude_service(cfg.policy)
)

async with policy.session():
Expand All @@ -48,7 +45,7 @@ async def run(cfg: DictConfig):

print("\nShutting down...")

await shutdown_service(policy)
await policy.shutdown()


@parse
Expand Down
77 changes: 76 additions & 1 deletion src/forge/controller/actor.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,13 @@

import math
import sys
from typing import Type

from monarch.actor import Actor, current_rank, current_size, endpoint

from forge.controller.proc_mesh import get_proc_mesh, stop_proc_mesh
from forge.types import ProcessConfig

from forge.types import ProcessConfig, ServiceConfig

logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
Expand Down Expand Up @@ -41,6 +43,79 @@ def __init__(self, *args, **kwargs):
self.logger.root.addHandler(stdout_handler)
super().__init__(*args, **kwargs)

@classmethod
def options(
cls, *, num_replicas: int, procs_per_replica: int, **service_kwargs
) -> Type["ConfiguredService"]:
"""
Returns a ConfiguredService class that wraps this ForgeActor in a Service.

Usage:
service = await MyForgeActor.options(num_replicas=1, procs_per_replica=2).as_service(...)
await service.shutdown()
"""
from forge.controller.service import Service, ServiceInterface

cfg = ServiceConfig(
num_replicas=num_replicas,
procs_per_replica=procs_per_replica,
**service_kwargs,
)

class ConfiguredService:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we remove the ConfiguredService piece altogether? options() should return a type["ForgeActor"]

Copy link
Member Author

@DNXie DNXie Sep 15, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The main reason we have the ConfiguredService wrapper is to allow direct access to actor endpoints via instance attributes (e.g., service.value.choose()) while still carrying the ServiceConfig on the class.

If we remove the wrapper and make options() return a subclass of ForgeActor directly, the endpoints (like counter.value) would only be accessible through the underlying ServiceInterface, so we would need to change the statement

await service.value.choose()

to

await service._service_interface.value.choose()

I attempted to delegate EndpointProperty access via __getattr__ like this

def __getattr__(self, item):
    if self._service_interface is None:
        raise AttributeError(f"Service not started yet; cannot access '{item}'")
    
    attr = getattr(self._service_interface, item)
    from monarch._src.actor.endpoint import EndpointProperty
    if isinstance(attr, EndpointProperty):
        # Call the descriptor's __get__ to bind it
        return attr.__get__(self._service_interface, type(self._service_interface))

    return attr

However, it didn’t fully work: Python descriptors like EndpointProperty only bind correctly when accessed through the class or a properly initialized ServiceInterface instance. The __get__ call in __getattr__ does not fully replicate the descriptor binding behavior.

I’m still getting familiar with the internals of Monarch and some of the Python descriptor mechanics, so I'd love to hear any suggestions if there’s a cleaner way to handle this.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm, EndpointProperty binding is only needed after we call as_service(), right?

this is how I imagine the options piece:

class ForgeActor(Actor):
    _service_config: ServiceConfig | None = None

    @classmethod
    def options(
        cls,
        *,
        service_config: ServiceConfig | None = None,
        num_replicas: int | None = None,
        procs_per_replica: int | None = None,
        **service_kwargs,
    ) -> Type["ForgeActor"]:
        if service_config:
            config = service_config
        else:
            config = ServiceConfig(num_replicas=num_replicas, procs_per_replica=procs_per_replica)  
        return type(
            f"{cls.__name__}",
            (cls,),
            {"_service_config": config}
        )

Copy link
Member Author

@DNXie DNXie Sep 15, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

EndpointProperty binding is only needed after we do as_service() right?

Yes. It is needed whenever we call an endpoint function.

Your proposed implementation seems reasonable. However, it does not handle endpoint binding. After as_service(), you still need to go through service._service_interface to access endpoints.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As we discussed offline, if as_service returns a ServiceInterface directly, we cannot terminate the service with

service.shutdown()

because the returned object (service) is just a ServiceInterface. In that case, we’d have to fall back to

shutdown_service(service)

Personally, I prefer the service.shutdown() style since it feels more natural and object-oriented, but I’m okay with either one.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We can add def shutdown(self) directly to ServiceInterface

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done!

"""
A wrapper around Service that binds a ForgeActor class.
Provides:
- as_service(): spawns the actor inside the service
- shutdown(): stops the service
"""

_actor_def = cls
_service_interface: ServiceInterface | None

def __init__(self) -> None:
# Starts with no interface attached; as_service() assigns a ServiceInterface
# here and shutdown() resets it back to None.
self._service_interface = None

@classmethod
async def as_service(cls, **actor_kwargs) -> "ConfiguredService":
"""
Spawn the actor inside a Service with the given configuration.

Builds a Service from the ``cfg`` created by the enclosing options()
call, the bound actor class (``_actor_def``) and ``actor_kwargs``,
awaits its ``__initialize__()``, and stores a ServiceInterface wrapping
that Service on a freshly created wrapper instance.

Args:
**actor_kwargs: arguments to pass to the ForgeActor constructor

Returns:
A new ConfiguredService instance (created via ``cls()``) holding the
ServiceInterface, so that methods like .shutdown() can be called.
"""
self = cls()
logger.info("Spawning Service Actor for %s", self._actor_def.__name__)
# actor_kwargs is handed to Service as a single dict, not unpacked.
service = Service(cfg, self._actor_def, actor_kwargs)
await service.__initialize__()
self._service_interface = ServiceInterface(service, self._actor_def)
return self

async def shutdown(self):
"""
Gracefully stops the service if it has been started.

Awaits ``stop()`` on the Service held by the stored ServiceInterface,
then resets the stored interface to None.

Raises:
RuntimeError: if no ServiceInterface is currently stored on this
instance (it is None after __init__ and after a completed shutdown()).
"""
if self._service_interface is None:
raise RuntimeError("Service not started yet")
# Reaches into the interface's private ``_service`` attribute to stop it.
await self._service_interface._service.stop()
self._service_interface = None

def __getattr__(self, item):
"""
Delegate attribute access to the ServiceInterface instance.
This makes ConfiguredService behave like a ServiceInterface.

Python only calls this hook when normal attribute lookup fails, so
attributes defined on the wrapper itself are not delegated.

Raises:
AttributeError: if no ServiceInterface is stored yet (value is None).
"""
# NOTE(review): if ``_service_interface`` was never assigned (e.g. an
# instance created without running __init__), this read re-enters
# __getattr__ and would recurse — confirm that cannot happen.
if self._service_interface is None:
raise AttributeError(
f"Service not started yet; cannot access '{item}'"
)
return getattr(self._service_interface, item)

return ConfiguredService

@endpoint
async def setup(self):
"""Sets up the actor.
Expand Down
Loading