From 8b9d30c48c1b904c356c2988e10a8e2c06857984 Mon Sep 17 00:00:00 2001
From: DNXie
Date: Thu, 16 Oct 2025 15:57:42 -0700
Subject: [PATCH] Add checkpoint folder and document initial-load keys in GRPO/SFT configs

Summary:
Add an explicit `checkpoint.folder: ./checkpoint` entry to the three GRPO
configs and the two SFT configs, and add inline comments describing
`folder`, `initial_load_path` and `initial_load_in_hf`.

The interpolation used by `initial_load_path` is left untouched in every
file: the GRPO configs keep `${model}` and the SFT configs keep
`${model_name}`.

Test Plan:

Reviewers:

Subscribers:

Tasks:

Tags:
---
Review notes (this section is ignored by `git am`):
- Context-line indentation was reconstructed assuming 2-space YAML nesting
  (`checkpoint:` nested under `trainer:` in the GRPO configs, top-level in
  the SFT configs). Confirm against the target files before applying.
- The SFT hunks keep `${model_name}`; the post-image blob ids on their
  `index` lines were not recomputed for that correction.

 apps/grpo/qwen3_1_7b.yaml | 5 +++--
 apps/grpo/qwen3_32b.yaml  | 5 +++--
 apps/grpo/qwen3_8b.yaml   | 5 +++--
 apps/sft/llama3_8b.yaml   | 5 +++--
 apps/sft/qwen3_8b.yaml    | 5 +++--
 5 files changed, 15 insertions(+), 10 deletions(-)

diff --git a/apps/grpo/qwen3_1_7b.yaml b/apps/grpo/qwen3_1_7b.yaml
index 14e4871cf..057ee2bcf 100644
--- a/apps/grpo/qwen3_1_7b.yaml
+++ b/apps/grpo/qwen3_1_7b.yaml
@@ -74,8 +74,9 @@ trainer:
     disable_loss_parallel: true
   checkpoint:
     enable: true
-    initial_load_path: hf://${model}
-    initial_load_in_hf: true
+    folder: ./checkpoint  # The folder to save checkpoints to.
+    initial_load_path: hf://${model}  # The path to load the initial checkpoint from. Ignored if `folder` exists.
+    initial_load_in_hf: true  # If true, interpret initial_load_path as a HuggingFace model repo.
     last_save_in_hf: true
     interval: 500
     async_mode: "disabled"
diff --git a/apps/grpo/qwen3_32b.yaml b/apps/grpo/qwen3_32b.yaml
index e7a0cf509..c39260b45 100644
--- a/apps/grpo/qwen3_32b.yaml
+++ b/apps/grpo/qwen3_32b.yaml
@@ -77,8 +77,9 @@ trainer:
     disable_loss_parallel: true
   checkpoint:
     enable: true
-    initial_load_path: hf://${model}
-    initial_load_in_hf: true
+    folder: ./checkpoint  # The folder to save checkpoints to.
+    initial_load_path: hf://${model}  # The path to load the initial checkpoint from. Ignored if `folder` exists.
+    initial_load_in_hf: true  # If true, interpret initial_load_path as a HuggingFace model repo.
     last_save_in_hf: true
     interval: 500
     async_mode: "disabled"
diff --git a/apps/grpo/qwen3_8b.yaml b/apps/grpo/qwen3_8b.yaml
index 534e5b92a..b0b4bf96e 100644
--- a/apps/grpo/qwen3_8b.yaml
+++ b/apps/grpo/qwen3_8b.yaml
@@ -70,8 +70,9 @@ trainer:
     disable_loss_parallel: true
   checkpoint:
     enable: true
-    initial_load_path: hf://${model}
-    initial_load_in_hf: true
+    folder: ./checkpoint  # The folder to save checkpoints to.
+    initial_load_path: hf://${model}  # The path to load the initial checkpoint from. Ignored if `folder` exists.
+    initial_load_in_hf: true  # If true, interpret initial_load_path as a HuggingFace model repo.
     last_save_in_hf: true
     interval: 500
     async_mode: "disabled"
diff --git a/apps/sft/llama3_8b.yaml b/apps/sft/llama3_8b.yaml
index 43a690c1e..44e4485e4 100644
--- a/apps/sft/llama3_8b.yaml
+++ b/apps/sft/llama3_8b.yaml
@@ -45,8 +45,9 @@ parallelism:
 
 checkpoint:
   enable: true
-  initial_load_path: hf://${model_name}
-  initial_load_in_hf: true
+  folder: ./checkpoint  # The folder to save checkpoints to.
+  initial_load_path: hf://${model_name}  # The path to load the initial checkpoint from. Ignored if `folder` exists.
+  initial_load_in_hf: true  # If true, interpret initial_load_path as a HuggingFace model repo.
   last_save_in_hf: true
   interval: 500
   async_mode: "disabled"
diff --git a/apps/sft/qwen3_8b.yaml b/apps/sft/qwen3_8b.yaml
index 2ab88bbd3..1c0d5bc8b 100644
--- a/apps/sft/qwen3_8b.yaml
+++ b/apps/sft/qwen3_8b.yaml
@@ -44,8 +44,9 @@ parallelism:
 
 checkpoint:
   enable: true
-  initial_load_path: hf://${model_name}
-  initial_load_in_hf: true
+  folder: ./checkpoint  # The folder to save checkpoints to.
+  initial_load_path: hf://${model_name}  # The path to load the initial checkpoint from. Ignored if `folder` exists.
+  initial_load_in_hf: true  # If true, interpret initial_load_path as a HuggingFace model repo.
   last_save_in_hf: true
   interval: 500
   async_mode: "disabled"