Skip to content

Commit 68151ed

Browse files
committed
improve retry script
1 parent 9c7371b commit 68151ed

File tree

1 file changed

+5
-1
lines changed

1 file changed

+5
-1
lines changed

benchmarks/run_input_shard.sh

Lines changed: 5 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -9,9 +9,13 @@ OUTPUT_FILE="benchmarks_autotune_${TIMESTAMP}_input_shard_$((SHARD+1))_of_${WORL
9 9
# Retry until success
10 10
attempt=0
11 11
while true; do
12+
# while (( attempt < 10 )); do
12 13
attempt=$((attempt + 1))
13 14
echo "Attempt $attempt: Running benchmark for shard $((SHARD+1))/${WORLD_SIZE}..."
14 15

16+
# TIMESTAMP=$(date +%s)
17+
# OUTPUT_FILE="benchmarks_autotune_${TIMESTAMP}_input_shard_$((SHARD+1))_of_${WORLD_SIZE}.txt"
18+
15 19
CUDA_VISIBLE_DEVICES=$((RANK_OFFSET+SHARD)) python benchmarks/run.py --input-shard $((SHARD+1))/${WORLD_SIZE} --metrics tflops,gbps,speedup >"$OUTPUT_FILE" 2>&1
16 20

17 21
exit_code=$?
@@ -20,7 +24,7 @@ while true; do
20 24
break
21 25
else
22 26
echo "Failed with exit code $exit_code. Retrying..."
23-
sleep 5 # Optional: wait 5 seconds before retrying
27+
sleep 10 # wait a few seconds before retrying
24 28
fi
25 29
done
26 30

0 commit comments

Comments
 (0)