Merged
34 commits
da21e1d  Add reward interface, math reward, unit tests (DNXie, Aug 21, 2025)
5c72908  Merge branch 'meta-pytorch:main' into main (DNXie, Aug 22, 2025)
b4d7a61  Merge branch 'meta-pytorch:main' into main (DNXie, Aug 25, 2025)
02d77c6  Merge branch 'meta-pytorch:main' into main (DNXie, Aug 27, 2025)
fd1d38b  Merge branch 'meta-pytorch:main' into main (DNXie, Aug 28, 2025)
f79beee  Merge branch 'meta-pytorch:main' into main (DNXie, Aug 28, 2025)
d8d775a  Merge branch 'meta-pytorch:main' into main (DNXie, Sep 2, 2025)
e423c44  Merge branch 'meta-pytorch:main' into main (DNXie, Sep 4, 2025)
4815c05  Merge branch 'meta-pytorch:main' into main (DNXie, Sep 8, 2025)
77d41e4  Merge branch 'meta-pytorch:main' into main (DNXie, Sep 9, 2025)
a3feb1e  Merge branch 'meta-pytorch:main' into main (DNXie, Sep 10, 2025)
23d7e02  Merge branch 'meta-pytorch:main' into main (DNXie, Sep 11, 2025)
2ca881d  refactor, buggy (DNXie, Sep 11, 2025)
4df5d3a  some tweak (DNXie, Sep 12, 2025)
0b5c0db  add options under forgeactor, and tested it with vllm main (DNXie, Sep 12, 2025)
1cc5cf2  add as_service to forgeactor for default config (DNXie, Sep 12, 2025)
a92952a  enable passing serviceConfig obj to options (DNXie, Sep 15, 2025)
2ce61d1  update all the usages (DNXie, Sep 15, 2025)
f32fef7  Merge branch 'main' into add_options (DNXie, Sep 15, 2025)
f28824d  fix script error and CI broken test (DNXie, Sep 15, 2025)
549f43a  fix ci test (DNXie, Sep 15, 2025)
26a4207  remove redundant line (DNXie, Sep 15, 2025)
a311cbd  Update tests/unit_tests/test_service.py (DNXie, Sep 15, 2025)
1261568  Merge branch 'main' into add_options (DNXie, Sep 15, 2025)
c1854ec  add missing import back (DNXie, Sep 15, 2025)
e595fbd  fix ci (DNXie, Sep 16, 2025)
52cb676  Merge branch 'add_options' of github.com:DNXie/forge into DNXie-add_o… (DNXie, Sep 16, 2025)
fd38100  Merge branch 'DNXie-add_options' (DNXie, Sep 16, 2025)
9a80b16  Merge branch 'main' of github.com:DNXie/forge (DNXie, Sep 16, 2025)
09e7237  merge (DNXie, Sep 16, 2025)
0165027  make options return a forgeactor and as_service return a serviceinter… (DNXie, Sep 16, 2025)
721e32a  remove redundant test case (DNXie, Sep 16, 2025)
915baf1  fix lint (DNXie, Sep 16, 2025)
4171675  fix broken ci (DNXie, Sep 16, 2025)
62 changes: 18 additions & 44 deletions apps/grpo/main.py
@@ -23,7 +23,6 @@
from forge.cli.config import parse
from forge.controller.actor import ForgeActor
from forge.controller.provisioner import shutdown
from forge.controller.service import ServiceConfig, shutdown_service, spawn_service
from forge.data.rewards import MathReward, ThinkingReward
from forge.data.utils import exclude_service
from forge.util.metric_logging import get_metric_logger
@@ -355,44 +354,20 @@ async def main(cfg: DictConfig):
ref_model,
reward_actor,
) = await asyncio.gather(
spawn_service(
ServiceConfig(**cfg.dataset.service),
DatasetActor,
**exclude_service(cfg.dataset),
DatasetActor.options(**cfg.dataset.service).as_service(
**exclude_service(cfg.dataset)
),
spawn_service(
ServiceConfig(**cfg.policy.service),
Policy,
**exclude_service(cfg.policy),
Policy.options(**cfg.policy.service).as_service(**exclude_service(cfg.policy)),
Trainer.options(**cfg.trainer.service).as_service(
**exclude_service(cfg.trainer)
),
spawn_service(
ServiceConfig(**cfg.trainer.service),
Trainer,
**exclude_service(cfg.trainer),
ReplayBuffer.options(**cfg.replay_buffer.service).as_service(
**exclude_service(cfg.replay_buffer)
),
spawn_service(
ServiceConfig(**cfg.replay_buffer.service),
ReplayBuffer,
**exclude_service(cfg.replay_buffer),
),
spawn_service(
ServiceConfig(**cfg.compute_advantages.service),
ComputeAdvantages,
),
spawn_service(
ServiceConfig(**cfg.ref_model.service),
RefModel,
model_name=model,
),
# spawn_service(
# ServiceConfig(procs_per_replica=1, num_replicas=1, with_gpus=True),
# ReferenceModel,
# model=titan_model,
# ),
spawn_service(
ServiceConfig(**cfg.reward_actor.service),
RewardActor,
reward_functions=[MathReward(), ThinkingReward()],
ComputeAdvantages.options(**cfg.compute_advantages.service).as_service(),
RefModel.options(**cfg.ref_model.service).as_service(model_name=model),
RewardActor.options(**cfg.reward_actor.service).as_service(
reward_functions=[MathReward(), ThinkingReward()]
),
)

@@ -477,14 +452,13 @@ async def continuous_training():
finally:
print("Shutting down...")
await asyncio.gather(
shutdown_service(policy),
shutdown_service(trainer),
shutdown_service(replay_buffer),
shutdown_service(dataloader),
shutdown_service(compute_advantages),
shutdown_service(ref_model),
shutdown_service(reward_actor),
return_exceptions=True,
dataloader.shutdown(),
policy.shutdown(),
trainer.shutdown(),
replay_buffer.shutdown(),
compute_advantages.shutdown(),
ref_model.shutdown(),
reward_actor.shutdown(),
)
# TODO - add a global shutdown that implicitly shuts down all services
# and remote allocations
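Because the hunks above interleave removed and added call sites, here is the same migration condensed for a single actor as a minimal sketch. Every name (Policy, exclude_service, cfg.policy.service, shutdown) is taken from the diff; the surrounding try/finally context is an assumption for illustration.

# Old pattern (removed in this PR): spawn via spawn_service/ServiceConfig, stop via shutdown_service.
#     policy = await spawn_service(
#         ServiceConfig(**cfg.policy.service), Policy, **exclude_service(cfg.policy)
#     )
#     ...
#     await shutdown_service(policy)

# New pattern (added in this PR): the ServiceConfig is bound on the actor class with
# .options(), the service is launched with .as_service(), and shutdown is a method on
# the returned ServiceInterface.
policy = await Policy.options(**cfg.policy.service).as_service(
    **exclude_service(cfg.policy)
)
try:
    ...  # training / generation loop, as in main() above
finally:
    await policy.shutdown()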
29 changes: 11 additions & 18 deletions apps/rl/main.py
@@ -20,7 +20,7 @@
import torch.nn.functional as F
from forge.actors import ReplayBuffer, RLTrainer
from forge.cli.config import parse
from forge.controller.service import ServiceConfig, shutdown_service, spawn_service

from omegaconf import DictConfig
from torch import Tensor

@@ -135,21 +135,14 @@ def simple_grpo_loss(


async def run(cfg: DictConfig):
trainer, replay_buffer = await asyncio.gather(
spawn_service(
ServiceConfig(procs_per_replica=4, with_gpus=True, num_replicas=1),
RLTrainer,
loss=simple_grpo_loss,
**cfg.trainer,
),
spawn_service(
ServiceConfig(procs_per_replica=1, num_replicas=1),
ReplayBuffer,
collate=collate,
**cfg.replay_buffer,
),
)
print("Services initialized...")
trainer = await RLTrainer.options(
procs_per_replica=1, with_gpus=True, num_replicas=4
).as_service(**cfg.trainer)
replay_buffer = await ReplayBuffer.options(
procs_per_replica=1, num_replicas=1
).as_service(**cfg.replay_buffer)

print("Services initialized....")

print("Collecting Data...")
g = torch.manual_seed(0)
@@ -176,8 +169,8 @@ async def run(cfg: DictConfig):
print("Loss: ", outputs["loss"])

print("Shutting down...")
await shutdown_service(trainer)
await shutdown_service(replay_buffer)
await trainer.shutdown()
await replay_buffer.shutdown()


@parse
1 change: 0 additions & 1 deletion apps/vllm/llama3_8b.yaml
@@ -8,7 +8,6 @@ policy:
n: 2
guided_decoding: false
max_tokens: 512
available_devices: null
service:
procs_per_replica: 2
num_replicas: 1
9 changes: 3 additions & 6 deletions apps/vllm/main.py
@@ -16,7 +16,6 @@
from forge.actors.policy import Policy
from forge.cli.config import parse
from forge.controller.provisioner import shutdown
from forge.controller.service import ServiceConfig, shutdown_service, spawn_service

from omegaconf import DictConfig
from src.forge.data.utils import exclude_service
@@ -33,10 +32,8 @@ async def run(cfg: DictConfig):
prompt = "What is 3+5?" if gd else "Tell me a joke"

print("Spawning service...")
policy = await spawn_service(
ServiceConfig(**cfg.policy.service),
Policy,
**exclude_service(cfg.policy),
policy = await Policy.options(**cfg.policy.service).as_service(
**exclude_service(cfg.policy)
)

try:
@@ -54,7 +51,7 @@

finally:
print("\nShutting down...")
await shutdown_service(policy)
await policy.shutdown()
await shutdown()


4 changes: 2 additions & 2 deletions src/forge/cli/download.py
@@ -13,11 +13,11 @@

from pathlib import Path

from forge.cli.subcommand import Subcommand

from huggingface_hub import snapshot_download
from huggingface_hub.utils import GatedRepoError, RepositoryNotFoundError

from forge.cli.subcommand import Subcommand

# TODO: update this
REPO_ID_FNAME = "original_repo_id"

6 changes: 3 additions & 3 deletions src/forge/cli/run.py
@@ -11,12 +11,12 @@

from pathlib import Path

import forge
from forge.cli.subcommand import Subcommand

from torch.distributed.elastic.multiprocessing.errors import record
from torch.distributed.run import get_args_parser as get_torchrun_args_parser, run

import forge
from forge.cli.subcommand import Subcommand

ROOT = Path(forge.__file__).parent.parent


77 changes: 76 additions & 1 deletion src/forge/controller/actor.py
@@ -8,14 +8,17 @@

import math
import sys
from typing import Type, TypeVar

from monarch.actor import Actor, current_rank, current_size, endpoint

from forge.controller.proc_mesh import get_proc_mesh, stop_proc_mesh
from forge.types import ProcessConfig

from forge.types import ProcessConfig, ServiceConfig

logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
T = TypeVar("T", bound="ForgeActor")


class ForgeActor(Actor):
@@ -41,6 +44,78 @@ def __init__(self, *args, **kwargs):
self.logger.root.addHandler(stdout_handler)
super().__init__(*args, **kwargs)

@classmethod
def options(
cls: Type[T],
*,
service_config: ServiceConfig | None = None,
num_replicas: int | None = None,
procs_per_replica: int | None = None,
**service_kwargs,
) -> Type[T]:
"""
Returns a subclass of this ForgeActor with a bound ServiceConfig.
The returned subclass can later be launched via `.as_service()`.

Usage (choose ONE of the following forms):
# Option A: construct ServiceConfig implicitly
service = await MyForgeActor.options(
num_replicas=1,
procs_per_replica=2,
).as_service(...)
await service.shutdown()

# Option B: provide an explicit ServiceConfig
cfg = ServiceConfig(num_replicas=1, procs_per_replica=2, ..)
service = await MyForgeActor.options(service_config=cfg).as_service(...)
await service.shutdown()

# Option C: skip options, use the default service config with num_replicas=1, procs_per_replica=1
service = await MyForgeActor.as_service(...)
await service.shutdown()
"""

if service_config is not None:
cfg = service_config
else:
if num_replicas is None or procs_per_replica is None:
raise ValueError(
"Must provide either `service_config` or (num_replicas + procs_per_replica)."
)
cfg = ServiceConfig(
num_replicas=num_replicas,
procs_per_replica=procs_per_replica,
**service_kwargs,
)

return type(
f"{cls.__name__}Configured",
(cls,),
{"_service_config": cfg},
)

@classmethod
async def as_service(cls: Type[T], **actor_kwargs) -> "ServiceInterface":
"""
Convenience method to spawn this actor as a Service using default configuration.
If `.options()` was called, it will use the bound ServiceConfig;
otherwise defaults to 1 replica, 1 proc.
"""
# Lazy import to avoid top-level dependency issues
Review comment (Contributor): this is fine, just curious if this is a circular dependency issue?
Reply (DNXie, Member, Author): Yes. There would be a circular dependency issue.
from forge.controller.service import Service, ServiceInterface

# Use _service_config if already set by options(), else default
cfg = getattr(cls, "_service_config", None)
if cfg is None:
cfg = ServiceConfig(num_replicas=1, procs_per_replica=1)
# dynamically create a configured subclass for consistency
cls = type(f"{cls.__name__}Configured", (cls,), {"_service_config": cfg})

logger.info(("Spawning Service Actor for %s", cls.__name__))
service = Service(cfg, cls, actor_kwargs)
await service.__initialize__()
return ServiceInterface(service, cls)

@endpoint
async def setup(self):
"""Sets up the actor.
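The mechanism worth noting in the hunk above: options() does not spawn anything; it returns a dynamically created subclass (named "<Actor>Configured") with the ServiceConfig stored in the _service_config class attribute, which as_service() later reads, falling back to one replica with one proc when options() was skipped. A minimal sketch of how the two compose; MyActor is a hypothetical stand-in and the asserts only illustrate the class-level binding.

from forge.controller.actor import ForgeActor


class MyActor(ForgeActor):  # hypothetical actor, used only for illustration
    pass


async def demo() -> None:
    # options() binds the config to a new subclass; nothing is spawned yet.
    Configured = MyActor.options(num_replicas=2, procs_per_replica=1)
    assert issubclass(Configured, MyActor)
    assert Configured._service_config.num_replicas == 2

    # as_service() spawns the Service from the bound config and returns a
    # ServiceInterface; calling MyActor.as_service() directly would instead
    # use the 1 replica / 1 proc default.
    service = await Configured.as_service()
    await service.shutdown()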
3 changes: 0 additions & 3 deletions src/forge/controller/service/__init__.py
@@ -8,7 +8,6 @@
from .metrics import ServiceMetrics
from .replica import Replica, ReplicaMetrics
from .service import Service, ServiceActor, ServiceConfig
from .spawn import shutdown_service, spawn_service

__all__ = [
"Replica",
@@ -20,6 +19,4 @@
"Session",
"SessionContext",
"ServiceActor",
"spawn_service",
"shutdown_service",
]
6 changes: 6 additions & 0 deletions src/forge/controller/service/interface.py
@@ -167,6 +167,12 @@ async def terminate_session(self, sess_id: str):
"""Terminates an active session and cleans up associated resources."""
return await self._service.terminate_session(sess_id)

async def shutdown(self) -> None:
"""
Shut down the underlying Service.
"""
await self._service.stop()

def session(self) -> "SessionContext":
"""Returns a context manager for session-based calls."""
return SessionContext(self)
40 changes: 2 additions & 38 deletions src/forge/controller/service/spawn.py
@@ -11,50 +11,14 @@
from monarch.actor import proc_mesh

from forge.controller import ForgeActor
from forge.controller.service import Service, ServiceActor, ServiceConfig
from forge.controller.service import ServiceActor, ServiceConfig

from forge.controller.service.interface import ServiceInterface, ServiceInterfaceV2
from forge.controller.service.interface import ServiceInterfaceV2

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)


async def spawn_service(
service_cfg: ServiceConfig, actor_def: Type[ForgeActor], **actor_kwargs
) -> ServiceInterface:
"""Spawns a service based on the actor class.

Args:
service_cfg: Service configuration
actor_def: Actor class definition
**actor_kwargs: Keyword arguments to pass to actor constructor

Returns:
A ServiceInterface that provides access to the Service Actor
"""
# Assert that actor_def is a subclass of ForgeActor
if not issubclass(actor_def, ForgeActor):
raise TypeError(
f"actor_def must be a subclass of ForgeActor, got {type(actor_def).__name__}"
)

# Create a single-node proc_mesh and actor_mesh for the Service Actor
logger.info("Spawning Service Actor for %s", actor_def.__name__)
service = Service(service_cfg, actor_def, actor_kwargs)
await service.__initialize__()
# Return the ServiceInterface that wraps the proc_mesh, actor_mesh, and actor_def
return ServiceInterface(service, actor_def)


async def shutdown_service(service: ServiceInterface) -> None:
"""Shuts down the service.

Implemented in this way to avoid actors overriding stop() unintentionally.

"""
await service._service.stop()


async def spawn_service_v2(
service_cfg: ServiceConfig, actor_def: Type[ForgeActor], **actor_kwargs
) -> ServiceInterfaceV2: