Skip to content

Commit c336f77

Browse files
committed
add retry loop to script
1 parent 7782164 commit c336f77

File tree

2 files changed

+22
-1
lines changed

2 files changed

+22
-1
lines changed

benchmarks/run_input_shard.sh

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,27 @@
11
# Abort early unless every environment variable the benchmark command reads
# is set to a non-empty value. Variables are checked in a fixed order and the
# script exits with status 1 on the first one that is missing.
for required_var in RANK_OFFSET SHARD WORLD_SIZE; do
  # ${!required_var} expands to the value of the variable named by required_var.
  if [[ -z "${!required_var}" ]]; then
    echo "Error: ${required_var} is not set"
    exit 1
  fi
done
4-
CUDA_VISIBLE_DEVICES=$((RANK_OFFSET+SHARD)) python benchmarks/run.py --input-shard $((SHARD+1))/${WORLD_SIZE} --metrics tflops,gbps,speedup >benchmarks_autotune_$(date +%s)_input_shard_$((SHARD+1))_of_${WORLD_SIZE}.txt 2>&1
4+
5+
# Run benchmarks/run.py for one input shard, retrying after every failure.
#
# Environment read by this section:
#   RANK_OFFSET, SHARD  - CUDA_VISIBLE_DEVICES is set to RANK_OFFSET + SHARD.
#   SHARD, WORLD_SIZE   - passed to run.py as --input-shard (SHARD+1)/WORLD_SIZE.
#   MAX_ATTEMPTS        - optional; give up after this many attempts.
#                         0 or unset (default) retries until success.
#   RETRY_DELAY         - optional; seconds to sleep between attempts (default 5).
#
# Output: combined stdout/stderr of the benchmark goes to OUTPUT_FILE. The log
# of the most recent failed attempt is kept as "${OUTPUT_FILE}.failed".
# Exit status: 0 on success; the benchmark's exit code if MAX_ATTEMPTS is hit.

# Capture the timestamp once so every attempt writes to the same file name.
TIMESTAMP=$(date +%s)
shard_index=$((SHARD + 1))
OUTPUT_FILE="benchmarks_autotune_${TIMESTAMP}_input_shard_${shard_index}_of_${WORLD_SIZE}.txt"

max_attempts=${MAX_ATTEMPTS:-0}
retry_delay=${RETRY_DELAY:-5}

attempt=0
while true; do
  attempt=$((attempt + 1))
  echo "Attempt $attempt: Running benchmark for shard ${shard_index}/${WORLD_SIZE}..."

  # "|| exit_code=$?" records the status without aborting the script if it is
  # ever run under "set -e".
  exit_code=0
  HELION_FORCE_DISK_CACHE=1 CUDA_VISIBLE_DEVICES=$((RANK_OFFSET + SHARD)) \
    python benchmarks/run.py \
    --input-shard "${shard_index}/${WORLD_SIZE}" \
    --metrics tflops,gbps,speedup \
    >"$OUTPUT_FILE" 2>&1 || exit_code=$?

  if ((exit_code == 0)); then
    echo "Success! Benchmark completed for shard ${shard_index}/${WORLD_SIZE}"
    break
  fi

  # Optional upper bound so a deterministic failure cannot loop forever.
  # The failing log is left in OUTPUT_FILE for inspection.
  if ((max_attempts > 0 && attempt >= max_attempts)); then
    echo "Failed with exit code $exit_code after $attempt attempt(s). Giving up; see $OUTPUT_FILE" >&2
    exit "$exit_code"
  fi

  # The next attempt truncates OUTPUT_FILE, so keep this attempt's log.
  # Best-effort: a failed rename must not stop the retry loop.
  mv -f -- "$OUTPUT_FILE" "${OUTPUT_FILE}.failed" || true

  echo "Failed with exit code $exit_code. Retrying..." >&2
  sleep "$retry_delay"
done
526

627
# SHARD=0 RANK_OFFSET=4 WORLD_SIZE=4 bash benchmarks/run_input_shard.sh

benchmarks/tunableop_results0.csv

Whitespace-only changes.

0 commit comments

Comments
 (0)