Skip to content

Commit 5d0626f

Browse files
author
root
committed
try catch 重试
1 parent a5716c3 commit 5d0626f

File tree

3 files changed

+32
-28
lines changed

3 files changed

+32
-28
lines changed

agentlightning/runner.py

Lines changed: 29 additions & 25 deletions
Original file line number | Diff line number | Diff line change
@@ -225,35 +225,39 @@ async def run_async(self) -> bool:
225225
return False
226226

227227
rollout_obj = Rollout(rollout_id=task.rollout_id) # Default empty rollout
228-
229228
try:
230-
try:
231-
self.agent.on_rollout_start(task, self, self.tracer)
232-
except Exception:
233-
logger.exception(f"{self._log_prefix(rollout_id)} Exception during on_rollout_start hook.")
234-
235-
with self.tracer.trace_context(name=f"rollout_{rollout_id}"):
236-
start_time = time.time()
237-
rollout_method = (
238-
self.agent.training_rollout_async if task.mode == "train" else self.agent.validation_rollout_async
239-
)
240-
# Pass the task input, not the whole task object
241-
result = await rollout_method(task.input, task.rollout_id, resources_update.resources)
242-
rollout_obj = self._to_rollout_object(result, task.rollout_id)
243-
end_time = time.time()
244-
logger.info(
245-
f"{self._log_prefix(rollout_id)} Completed in "
246-
f"{end_time - start_time:.2f}s. Reward: {rollout_obj.final_reward}"
247-
)
229+
self.agent.on_rollout_start(task, self, self.tracer)
248230
except Exception:
249-
logger.exception(f"{self._log_prefix(rollout_id)} Exception during rollout.")
250-
finally:
231+
logger.exception(f"{self._log_prefix(rollout_id)} Exception during on_rollout_start hook.")
232+
MAX_TRY=3
233+
while MAX_TRY > 0:
251234
try:
252-
self.agent.on_rollout_end(task, rollout_obj, self, self.tracer)
235+
with self.tracer.trace_context(name=f"rollout_{rollout_id}"):
236+
start_time = time.time()
237+
rollout_method = (
238+
self.agent.training_rollout_async if task.mode == "train" else self.agent.validation_rollout_async
239+
)
240+
# Pass the task input, not the whole task object
241+
result = await rollout_method(task.input, task.rollout_id, resources_update.resources)
242+
rollout_obj = self._to_rollout_object(result, task.rollout_id)
243+
end_time = time.time()
244+
logger.info(
245+
f"{self._log_prefix(rollout_id)} Completed in "
246+
f"{end_time - start_time:.2f}s. Reward: {rollout_obj.final_reward}"
247+
)
248+
break
253249
except Exception:
254-
logger.exception(f"{self._log_prefix(rollout_id)} Exception during on_rollout_end hook.")
255-
await self.client.post_rollout_async(rollout_obj)
256-
250+
logger.exception(f"{self._log_prefix(rollout_id)} Exception during rollout.")
251+
MAX_TRY = MAX_TRY - 1
252+
finally:
253+
if rollout_obj.triplets:
254+
try:
255+
self.agent.on_rollout_end(task, rollout_obj, self, self.tracer)
256+
except Exception:
257+
logger.exception(f"{self._log_prefix(rollout_id)} Exception during on_rollout_end hook.")
258+
await self.client.post_rollout_async(rollout_obj)
259+
else:
260+
raise Exception("rollout_obj.triplets is EMPTY")
257261
return True
258262

259263
async def iter_async(self) -> int:

examples/werewolf/train.sh

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -24,7 +24,7 @@ python -m agentlightning.verl \
2424
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=1 \
2525
actor_rollout_ref.rollout.multi_turn.format=hermes \
2626
actor_rollout_ref.model.path=${BASE_MODEL} \
27-
data.max_prompt_length=12288 \
27+
data.max_prompt_length=11264 \
2828
data.max_response_length=1024 \
2929
data.truncation='error' \
3030
trainer.val_before_train=True \
@@ -47,10 +47,10 @@ python -m agentlightning.verl \
4747
trainer.rollout_data_dir='/root/dataDisk/rollout' \
4848
trainer.resume_mode=auto \
4949
trainer.critic_warmup=0 \
50-
trainer.logger=['console','wandb'] \
50+
trainer.logger=['console','tensorboard'] \
5151
trainer.project_name=${PROJECT_NAME} \
5252
trainer.experiment_name=${EXPERIMENT_NAME} \
5353
trainer.nnodes=1 \
54-
trainer.save_freq=5 \
54+
trainer.save_freq=1 \
5555
trainer.test_freq=0 \
5656
trainer.total_epochs=1 $@

0 commit comments

Comments
 (0)