Skip to content

Commit 3800885

Browse files
authored
fix checkpoint naming; add num_epoch parameter (#6277)
1 parent 26d859f commit 3800885

File tree

2 files changed: +3 additions, −2 deletions

applications/ColossalChat/coati/distributed/consumer.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -130,7 +130,7 @@ def loop(self) -> None:
                 if (step + 1) % self.save_interval == 0 or (step + 1) == self.num_update_per_episode:
                     if self.rank == 0:
                         print(f"Start saving policy model at step {step + 1}.")
-                    save_path = os.path.join(self.save_dir, f"modeling-step-{step + 1}")
+                    save_path = os.path.join(self.save_dir, f"modeling-episode-{episode}-step-{step + 1}")
                     self.booster.save_model(self.policy_model, save_path, shard=True)
                     if self.rank == 0:
                         print(f"Saved model checkpoint at step {step + 1} in folder {save_path}")

applications/ColossalChat/rl_example.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
     parser.add_argument("-m", "--model", type=str, default="Qwen/Qwen2.5-7B")
     parser.add_argument("-d", "--dataset", type=str, default="data.jsonl")
     parser.add_argument("-p", "--project", type=str, default="GRPO", help="Project name.")
+    parser.add_argument("-e", "--num-episodes", type=int, default=1, help="Number of episodes to train.")

     # Distributed training parameters
     parser.add_argument("-t", "--num-trainers", type=int, default=2)
@@ -192,7 +193,7 @@
         num_producers=args.num_inferencer,
         num_proc_per_producer=inference_model_config.get("tensor_parallel_size", 1),
         num_consumer_procs=args.num_trainers,
-        num_episodes=1,
+        num_episodes=args.num_episodes,
         inference_batch_size=args.inference_batch_size,
         inference_microbatch_size=args.inference_microbatch_size,
         train_batch_size=args.train_batch_size,

Comments (0)