|
1 | 1 | #!/bin/bash |
2 | | -set -e |
3 | 2 |
|
4 | 3 | alias gcurl='curl -H "Authorization: Bearer $(gcloud auth print-access-token)" -H "Content-Type: application/json"' |
5 | 4 |
|
6 | | -readarray -t notebooks < .cloud-build/Notebooks.txt |
7 | | -NOTEBOOK_RUNTIME_TEMPLATE=$(<NOTEBOOK_RUNTIME_TEMPLATE) |
8 | | -OUTPUT_URI=$(<OUTPUT_URI) |
9 | | -SA=$(<SA) |
10 | | -PROJECT_ID=$(<PROJECT_ID) |
11 | | -REGION=$(<REGION) |
12 | | -PUBSUB_TOPIC=$(<PS_TOPIC) |
13 | | - |
14 | | -DATE=$(date +%Y-%m-%d) |
15 | | -TIME=$(date +%H-%M-%S) |
16 | | -TIMESTAMP=$(date "+%B %d %Y %H:%M:%S") |
17 | | - |
18 | | -declare -A operation_map |
19 | | -declare -a pending completed_success completed_failure |
20 | | - |
21 | | -launch_notebook() { |
22 | | - local path="$1" |
23 | | - local name="${path##*/}" |
24 | | - local display_name="${name%.ipynb}-$DATE-$TIME" |
25 | | - |
26 | | - echo "Launching: $path" |
27 | | - local operation_id=$(gcloud colab executions create \ |
28 | | - --display-name="$display_name" \ |
| 5 | +TARGET=$(cat .cloud-build/Notebooks.txt) |
| 6 | + |
| 7 | +current_date=$(date +%Y-%m-%d) |
| 8 | +current_time=$(date +%H-%M-%S) |
| 9 | +current_time_readable=$(date "+%B %d %Y %H:%M:%S") |
| 10 | + |
| 11 | +NOTEBOOK_RUNTIME_TEMPLATE=$(cat NOTEBOOK_RUNTIME_TEMPLATE) |
| 12 | +OUTPUT_URI=$(cat OUTPUT_URI) |
| 13 | +SA=$(cat SA) |
| 14 | +PROJECT_ID=$(cat PROJECT_ID) |
| 15 | +REGION=$(cat REGION) |
| 16 | +PUBSUB_TOPIC=$(cat PS_TOPIC) |
| 17 | + |
| 18 | +failed_count=0 |
| 19 | +successful_count=0 |
| 20 | +declare -a failed_notebooks |
| 21 | +declare -a successful_notebooks |
| 22 | + |
| 23 | +MAX_PARALLEL_JOBS=5 |
| 24 | +joblist=() |
| 25 | + |
| 26 | +# Function to run a single notebook |
| 27 | +run_notebook() { |
| 28 | + local x="$1" |
| 29 | + local current_date="$2" |
| 30 | + local current_time="$3" |
| 31 | + |
| 32 | + DISPLAY_NAME="${x##generative-ai/}" |
| 33 | + DISPLAY_NAME="${DISPLAY_NAME%.ipynb}-$current_date-$current_time" |
| 34 | + echo "Starting execution for ${x}" |
| 35 | + |
| 36 | + OPERATION_ID=$(gcloud colab executions create \ |
| 37 | + --display-name="$DISPLAY_NAME" \ |
29 | 38 | --notebook-runtime-template="$NOTEBOOK_RUNTIME_TEMPLATE" \ |
30 | | - --direct-content="$path" \ |
| 39 | + --direct-content="$x" \ |
31 | 40 | --gcs-output-uri="$OUTPUT_URI" \ |
32 | 41 | --project="$PROJECT_ID" \ |
33 | 42 | --region="$REGION" \ |
34 | 43 | --service-account="$SA" \ |
35 | 44 | --execution-timeout="1h30m" \ |
36 | 45 | --format="value(name)") |
37 | 46 |
|
38 | | - local id=$(basename "$operation_id") |
39 | | - operation_map["$path"]="$id" |
40 | | - pending+=("$path") |
41 | | -} |
42 | | - |
43 | | -monitor_executions() { |
44 | | - while [[ ${#pending[@]} -gt 0 ]]; do |
45 | | - echo "Waiting for ${#pending[@]} notebooks..." |
46 | | - |
47 | | - local still_pending=() |
48 | | - |
49 | | - for path in "${pending[@]}"; do |
50 | | - local id="${operation_map["$path"]}" |
51 | | - |
52 | | - local status=$(gcloud colab executions describe "$id" --region="$REGION" --format="value(jobState)" 2>/dev/null || echo "JOB_STATE_FAILED") |
53 | | - |
54 | | - case "$status" in |
55 | | - JOB_STATE_SUCCEEDED) |
56 | | - echo "Success: $path" |
57 | | - completed_success+=("$path") |
58 | | - ;; |
59 | | - JOB_STATE_FAILED | *_CANCELLED | *_UNSPECIFIED) |
60 | | - echo "Failure: $path ($status)" |
61 | | - completed_failure+=("$path") |
62 | | - ;; |
63 | | - *) |
64 | | - echo "Still running: $path ($status)" |
65 | | - still_pending+=("$path") |
66 | | - ;; |
67 | | - esac |
68 | | - done |
69 | | - |
70 | | - pending=("${still_pending[@]}") |
71 | | - [[ ${#pending[@]} -gt 0 ]] && sleep 60 |
72 | | - done |
73 | | -} |
| 47 | + TRUNCATED_OPERATION_ID=$(echo "$OPERATION_ID" | cut -c 67-85) |
74 | 48 |
|
75 | | -publish_results() { |
76 | | - local total=${#notebooks[@]} |
77 | | - local failed=${#completed_failure[@]} |
78 | | - local passed=${#completed_success[@]} |
| 49 | + if ! EXECUTION_DETAILS=$(gcloud colab executions describe "$TRUNCATED_OPERATION_ID" --region="$REGION"); then |
| 50 | + echo "Error describing execution for ${x}" >&2 |
| 51 | + echo "fail:$x" |
| 52 | + return |
| 53 | + fi |
79 | 54 |
|
80 | | - printf "%s\n" "${completed_failure[@]}" > /workspace/Failure.txt |
| 55 | + JOB_STATE=$(echo "$EXECUTION_DETAILS" | grep "jobState:" | awk '{print $2}') |
| 56 | + if [[ "$JOB_STATE" == "JOB_STATE_SUCCEEDED" ]]; then |
| 57 | + echo "success:$x" |
| 58 | + else |
| 59 | + echo "fail:$x" |
| 60 | + fi |
| 61 | +} |
81 | 62 |
|
82 | | - local fail_list=$(IFS=, ; echo "${completed_failure[*]}") |
83 | | - local pass_list=$(IFS=, ; echo "${completed_success[*]}") |
| 63 | +# Parallel runner |
| 64 | +for x in $TARGET; do |
| 65 | + run_notebook "$x" "$current_date" "$current_time" > "result_$total_count.txt" 2>&1 & |
84 | 66 |
|
85 | | - local message="{\"total_count\":$total,\"failed_count\":$failed,\"failed_notebooks\":\"$fail_list\",\"successful_notebooks\":\"$pass_list\",\"successful_count\":$passed,\"execution_date\":\"$TIMESTAMP\"}" |
| 67 | + joblist+=($!) |
| 68 | + total_count=$((total_count + 1)) |
86 | 69 |
|
87 | | - echo "$(date) - INFO - Publishing results..." |
88 | | - if ! gcloud pubsub topics publish "$PUBSUB_TOPIC" --message="$message" --project="$PROJECT_ID"; then |
89 | | - echo "$(date) - ERROR - Failed to publish to Pub/Sub topic $PUBSUB_TOPIC" |
| 70 | + # Control concurrency |
| 71 | + if [[ ${#joblist[@]} -ge $MAX_PARALLEL_JOBS ]]; then |
| 72 | + wait -n |
| 73 | + joblist=($(jobs -p)) # prune finished jobs |
90 | 74 | fi |
91 | | -} |
| 75 | +done |
92 | 76 |
|
93 | | -echo "--- Launching notebooks ---" |
94 | | -for nb in "${notebooks[@]}"; do |
95 | | - [[ -n "$nb" ]] && launch_notebook "$nb" |
| 77 | +# Wait for all remaining jobs |
| 78 | +wait |
| 79 | + |
| 80 | +# Collect results |
| 81 | +for result_file in result_*.txt; do |
| 82 | + if grep -q "^success:" "$result_file"; then |
| 83 | + notebook=$(grep "^success:" "$result_file" | cut -d':' -f2-) |
| 84 | + successful_notebooks+=("$notebook") |
| 85 | + successful_count=$((successful_count + 1)) |
| 86 | + elif grep -q "^fail:" "$result_file"; then |
| 87 | + notebook=$(grep "^fail:" "$result_file" | cut -d':' -f2-) |
| 88 | + failed_notebooks+=("$notebook") |
| 89 | + failed_count=$((failed_count + 1)) |
| 90 | + echo "- $notebook" | tee -a /workspace/Failure.txt |
| 91 | + fi |
| 92 | + rm "$result_file" |
96 | 93 | done |
97 | 94 |
|
98 | | -echo "--- Monitoring executions ---" |
99 | | -monitor_executions |
| 95 | +# Summary |
| 96 | +echo "Total successful notebook executions: $successful_count" |
| 97 | +echo "Total failed notebook executions: $failed_count" |
| 98 | + |
| 99 | +# Publish result to Pub/Sub |
| 100 | +failed_notebooks_str=$(IFS=', '; echo "${failed_notebooks[*]}") |
| 101 | +successful_notebooks_str=$(IFS=', '; echo "${successful_notebooks[*]}") |
| 102 | + |
| 103 | +message_data="{\"total_count\":$((total_count)),\"failed_count\":$((failed_count)),\"failed_notebooks\":\"${failed_notebooks_str}\",\"successful_notebooks\":\"${successful_notebooks_str}\",\"successful_count\":$((successful_count)),\"execution_date\":\"${current_time_readable}\"}" |
100 | 104 |
|
101 | | -echo "--- Publishing summary ---" |
102 | | -publish_results |
| 105 | +echo "$(date) - INFO - Publishing to Pub/Sub topic: $PUBSUB_TOPIC" |
| 106 | +gcloud pubsub topics publish "$PUBSUB_TOPIC" --message="$message_data" --project="$PROJECT_ID" || \ |
| 107 | + echo "$(date) - ERROR - Failed to publish to Pub/Sub" |
103 | 108 |
|
104 | | -echo "Done. Success: ${#completed_success[@]}, Failures: ${#completed_failure[@]}" |
| 109 | +echo "All notebook executions completed." |
0 commit comments