From 1a21945592984209b11237b8a6e8db3b892e11b4 Mon Sep 17 00:00:00 2001
From: Danielle Pintz
Date: Fri, 3 Oct 2025 13:20:23 -0700
Subject: [PATCH 1/2] fix sft v2

---
 apps/sft_v2/llama3_8b.yaml | 10 ++++++----
 apps/sft_v2/main.py | 18 ++++++------------
 2 files changed, 12 insertions(+), 16 deletions(-)

diff --git a/apps/sft_v2/llama3_8b.yaml b/apps/sft_v2/llama3_8b.yaml
index bd61abe82..675a8d9e8 100644
--- a/apps/sft_v2/llama3_8b.yaml
+++ b/apps/sft_v2/llama3_8b.yaml
@@ -1,9 +1,11 @@
+# >>> python -m apps.sft_v2.main --config apps/sft_v2/llama3_8b.yaml
+
 # Config for supervised full finetuning using a Llama3.1 8B Instruct model
 #
 # This config assumes that you've run the following command before launching
 # this run:
 #   export HF_HUB_DISABLE_XET=1
-#   uv run forge download meta-llama/Meta-Llama-3.1-8B-Instruct
+#   forge download meta-llama/Meta-Llama-3.1-8B-Instruct --output-dir /tmp/Meta-Llama-3.1-8B-Instruct
 
 
 # TODO: required by torchtitan
@@ -14,11 +16,11 @@ comm:
 
 model:
   name: llama3
   flavor: 8B
-  tokenizer_path: /tmp/Llama-3.1-8B-Instruct
+  hf_assets_path: /tmp/Meta-Llama-3.1-8B-Instruct
 
 processes:
-  scheduler: local # local | mast (not supported yet)
-  hosts: 1
+  # scheduler: local # local | mast (not supported yet)
+  # hosts: 1
   procs: 8
   with_gpus: true

diff --git a/apps/sft_v2/main.py b/apps/sft_v2/main.py
index a2f4f1479..8dc64c837 100644
--- a/apps/sft_v2/main.py
+++ b/apps/sft_v2/main.py
@@ -23,7 +23,7 @@
 import torchtitan.experiments.forge.train_spec as forge_train_spec
 
 from forge.cli.config import parse
-from forge.controller import ForgeActor, spawn_actors
+from forge.controller import ForgeActor
 from forge.data.collate import collate_packed
 from forge.data.datasets.packed import PackedDataset, TextPacker
 from forge.data.datasets.sft_dataset import AlpacaToMessages, sft_iterable_dataset
@@ -130,16 +130,16 @@ async def setup(self):
         # self.logger = self.setup_logger(self.train_config.logger_config)
 
     def setup_data(self):
-        print(os.path.join(self.job_config.model.tokenizer_path, "tokenizer.json"))
+        print(os.path.join(self.job_config.model.hf_assets_path, "tokenizer.json"))
         tokenizer = HuggingFaceModelTokenizer(
             tokenizer_json_path=os.path.join(
-                self.job_config.model.tokenizer_path, "tokenizer.json"
+                self.job_config.model.hf_assets_path, "tokenizer.json"
             ),
             tokenizer_config_json_path=os.path.join(
-                self.job_config.model.tokenizer_path, "tokenizer_config.json"
+                self.job_config.model.hf_assets_path, "tokenizer_config.json"
             ),
             generation_config_path=os.path.join(
-                self.job_config.model.tokenizer_path, "generation_config.json"
+                self.job_config.model.hf_assets_path, "generation_config.json"
             ),
         )
 
@@ -280,13 +280,7 @@ def __repr__(self) -> str:
 
 async def run(cfg: DictConfig) -> None:
     logging.info("Spawing recipe...")
     process_cfg = cfg.pop("processes")
-    recipe = await spawn_actors(
-        "sft",
-        ForgeSFTRecipe,
-        {"config": cfg},
-        process_cfg,
-        set_address=True,
-    )
+    recipe = await ForgeSFTRecipe.options(**process_cfg).as_service(cfg)
     logging.info("Created recipe, running setup.")
     await recipe.setup.fanout()

From e4a7d48d9cd7635b11fd4e9c85b581730389521b Mon Sep 17 00:00:00 2001
From: Danielle Pintz
Date: Fri, 3 Oct 2025 14:05:10 -0700
Subject: [PATCH 2/2] upd

---
 apps/sft_v2/llama3_8b.yaml | 2 --
 apps/sft_v2/main.py | 8 ++++----
 2 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/apps/sft_v2/llama3_8b.yaml b/apps/sft_v2/llama3_8b.yaml
index 675a8d9e8..86fd88ca5 100644
--- a/apps/sft_v2/llama3_8b.yaml
+++ b/apps/sft_v2/llama3_8b.yaml
@@ -19,8 +19,6 @@ model:
   hf_assets_path: /tmp/Meta-Llama-3.1-8B-Instruct
 
 processes:
-  # scheduler: local # local | mast (not supported yet)
-  # hosts: 1
   procs: 8
   with_gpus: true

diff --git a/apps/sft_v2/main.py b/apps/sft_v2/main.py
index 8dc64c837..61b27baa3 100644
--- a/apps/sft_v2/main.py
+++ b/apps/sft_v2/main.py
@@ -280,16 +280,16 @@ def __repr__(self) -> str:
 
 async def run(cfg: DictConfig) -> None:
     logging.info("Spawing recipe...")
     process_cfg = cfg.pop("processes")
-    recipe = await ForgeSFTRecipe.options(**process_cfg).as_service(cfg)
+    recipe = await ForgeSFTRecipe.options(**process_cfg).as_actor(cfg)
     logging.info("Created recipe, running setup.")
-    await recipe.setup.fanout()
+    await recipe.setup.call()
     logging.info("Recipe has been setup. Training now.")
-    await recipe.train.fanout()
+    await recipe.train.call()
     logging.info("Done training. Clean up")
-    await recipe.cleanup.fanout()
+    await recipe.cleanup.call()
 
     await recipe.mesh.stop()
     logging.info("All done!")