diff --git a/apps/sft_v2/llama3_8b.yaml b/apps/sft_v2/llama3_8b.yaml
index bd61abe82..86fd88ca5 100644
--- a/apps/sft_v2/llama3_8b.yaml
+++ b/apps/sft_v2/llama3_8b.yaml
@@ -1,9 +1,11 @@
+# >>> python -m apps.sft_v2.main --config apps/sft_v2/llama3_8b.yaml
+
 # Config for supervised full finetuning using a Llama3.1 8B Instruct model
 #
 # This config assumes that you've run the following command before launching
 # this run:
 #   export HF_HUB_DISABLE_XET=1
-#   uv run forge download meta-llama/Meta-Llama-3.1-8B-Instruct
+#   forge download meta-llama/Meta-Llama-3.1-8B-Instruct --output-dir /tmp/Meta-Llama-3.1-8B-Instruct


 # TODO: required by torchtitan
@@ -14,11 +16,9 @@ comm:
 model:
   name: llama3
   flavor: 8B
-  tokenizer_path: /tmp/Llama-3.1-8B-Instruct
+  hf_assets_path: /tmp/Meta-Llama-3.1-8B-Instruct

 processes:
-  scheduler: local # local | mast (not supported yet)
-  hosts: 1
   procs: 8
   with_gpus: true

diff --git a/apps/sft_v2/main.py b/apps/sft_v2/main.py
index a2f4f1479..61b27baa3 100644
--- a/apps/sft_v2/main.py
+++ b/apps/sft_v2/main.py
@@ -23,7 +23,7 @@
 import torchtitan.experiments.forge.train_spec as forge_train_spec

 from forge.cli.config import parse
-from forge.controller import ForgeActor, spawn_actors
+from forge.controller import ForgeActor
 from forge.data.collate import collate_packed
 from forge.data.datasets.packed import PackedDataset, TextPacker
 from forge.data.datasets.sft_dataset import AlpacaToMessages, sft_iterable_dataset
@@ -130,16 +130,16 @@ async def setup(self):
         # self.logger = self.setup_logger(self.train_config.logger_config)

     def setup_data(self):
-        print(os.path.join(self.job_config.model.tokenizer_path, "tokenizer.json"))
+        print(os.path.join(self.job_config.model.hf_assets_path, "tokenizer.json"))
         tokenizer = HuggingFaceModelTokenizer(
             tokenizer_json_path=os.path.join(
-                self.job_config.model.tokenizer_path, "tokenizer.json"
+                self.job_config.model.hf_assets_path, "tokenizer.json"
             ),
             tokenizer_config_json_path=os.path.join(
-                self.job_config.model.tokenizer_path, "tokenizer_config.json"
+                self.job_config.model.hf_assets_path, "tokenizer_config.json"
             ),
             generation_config_path=os.path.join(
-                self.job_config.model.tokenizer_path, "generation_config.json"
+                self.job_config.model.hf_assets_path, "generation_config.json"
             ),
         )

@@ -280,22 +280,16 @@ def __repr__(self) -> str:
 async def run(cfg: DictConfig) -> None:
     logging.info("Spawing recipe...")
     process_cfg = cfg.pop("processes")
-    recipe = await spawn_actors(
-        "sft",
-        ForgeSFTRecipe,
-        {"config": cfg},
-        process_cfg,
-        set_address=True,
-    )
+    recipe = await ForgeSFTRecipe.options(**process_cfg).as_actor(cfg)

     logging.info("Created recipe, running setup.")
-    await recipe.setup.fanout()
+    await recipe.setup.call()

     logging.info("Recipe has been setup. Training now.")
-    await recipe.train.fanout()
+    await recipe.train.call()

     logging.info("Done training. Clean up")
-    await recipe.cleanup.fanout()
+    await recipe.cleanup.call()

     await recipe.mesh.stop()
     logging.info("All done!")
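
For context, here is a condensed, annotated sketch of the launch flow this diff moves to. It restates only the API surface visible in the patch itself (a ForgeActor subclass spawned via .options(...).as_actor(...), endpoints invoked with .call(), mesh shutdown via recipe.mesh.stop()); the exact semantics of .call() and the surrounding definitions are assumptions, not confirmed by the patch.

    # Sketch of the new launch flow, not the full main.py.
    # Assumes ForgeSFTRecipe is the ForgeActor subclass defined in this file,
    # and that the YAML's `processes` block (procs, with_gpus) carries the
    # spawn options consumed by .options().
    import logging

    from omegaconf import DictConfig


    async def run(cfg: DictConfig) -> None:
        # The `processes` block of the config becomes actor spawn options;
        # the remainder of the config is passed to the recipe's constructor.
        process_cfg = cfg.pop("processes")
        recipe = await ForgeSFTRecipe.options(**process_cfg).as_actor(cfg)

        # Endpoints are invoked with .call(), replacing the old .fanout().
        await recipe.setup.call()
        await recipe.train.call()
        await recipe.cleanup.call()

        # Tear down the actor's process mesh once training completes.
        await recipe.mesh.stop()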