Skip to content

Commit 7b27e8a

Browse files
authored
Added run_rc_validation_python_yaml to validate the Beam YAML RC (apache#34670)
1 parent 1773bd6 commit 7b27e8a

File tree

2 files changed

+287
-0
lines changed

2 files changed

+287
-0
lines changed
Lines changed: 286 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,286 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

# Validates a Beam release candidate by running a streaming Beam YAML
# pipeline (PubSub -> GCS) on Dataflow using the RC's Python SDK, then
# checking that output files landed in GCS.
name: Run Python YAML RC Validation

on:
  workflow_dispatch:
    inputs:
      RELEASE_VER:
        description: 'Beam Release Version (e.g., 2.64.0)'
        required: true
        default: '2.64.0'
      RC_NUM:
        description: 'Release Candidate number (e.g., 1)'
        required: true
        default: '1'
      # APACHE_CONTENTS_REPO is not needed for Python-only YAML test
      # CLEANUP_BQ_RESOURCES is not needed as we use GCS

# This allows a subsequently queued workflow run to interrupt previous runs
concurrency:
  group: '${{ github.workflow }} @ ${{ github.event.inputs.RELEASE_VER }}-${{ github.event.inputs.RC_NUM }}'
  cancel-in-progress: true

# Setting explicit permissions for the action
permissions:
  actions: write
  pull-requests: write # Needed for setup-action potentially
  checks: write
  contents: read # Needs read to checkout the code
  deployments: read
  id-token: write # Required for GCP Workload Identity Federation
  issues: write
  discussions: read
  packages: read
  pages: read
  repository-projects: read
  security-events: read
  statuses: read

env: # Workflow level env vars
  GCP_PROJECT_ID: 'apache-beam-testing'

jobs:
  run_python_yaml_rc_validation:
    name: Run Python YAML RC Validation (${{ github.event.inputs.RELEASE_VER }} RC${{ github.event.inputs.RC_NUM }})
    runs-on: [self-hosted, ubuntu-20.04, main]
    timeout-minutes: 60 # Reduced timeout as the job runs for ~20 mins + setup/validation
    env: # Job-level env vars
      DEVELOCITY_ACCESS_KEY: ${{ secrets.DEVELOCITY_ACCESS_KEY }}
      GRADLE_ENTERPRISE_CACHE_USERNAME: ${{ secrets.GE_CACHE_USERNAME }}
      GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }}
      RUN_ID_SUFFIX: ${{ github.run_id }}_${{ github.run_attempt }}
      GCE_REGION: 'us-central1'
      RELEASE_VERSION: ${{ github.event.inputs.RELEASE_VER }}
      RC_NUM: ${{ github.event.inputs.RC_NUM }}
      # Define the base bucket and unique folder prefix directly here
      GCS_UNIQUE_FOLDER_PREFIX: gs://rc-validation-migration-tests/yaml_rc_validation/${{ github.event.inputs.RELEASE_VER }}_RC${{ github.event.inputs.RC_NUM }}_${{ github.run_id }}_${{ github.run_attempt }}
      # Temp, Staging, and Output locations will be constructed in the steps using the prefix above
      RC_TAG: "v${{github.event.inputs.RELEASE_VER}}-RC${{github.event.inputs.RC_NUM}}"
      PYTHON_VERSION: '3.12' # Or adjust if needed
      BEAM_PYTHON_SDK_TAR_GZ: apache_beam-${{ github.event.inputs.RELEASE_VER }}.tar.gz
      BEAM_SOURCE_ZIP: apache-beam-${{ github.event.inputs.RELEASE_VER }}-source-release.zip
      # Base URL deliberately stops at the release directory: the Python SDK
      # tarball is published under its /python subdirectory while the source
      # zip sits at the release root, so each download appends its own path.
      # (Previously the base ended in /python, which made the tarball URL
      # resolve to .../python/python/... and the source-zip URL point into
      # /python where the zip does not exist.)
      APACHE_DIST_URL_BASE: https://dist.apache.org/repos/dist/dev/beam/${{ github.event.inputs.RELEASE_VER }}
      YAML_PIPELINE_FILE: t1_2.yaml
      SUBMISSION_TIMEOUT_SECONDS: 120 # Timeout for the python submission script itself

    steps:
      - name: Checkout code at RC tag
        uses: actions/checkout@v4
        with:
          ref: ${{ env.RC_TAG }}

      - name: Setup environment
        uses: ./.github/actions/setup-environment-action
        with:
          java-version: 11 # Keep Java setup for now, might be needed by gcloud/Dataflow

      - name: Setup Python
        uses: actions/setup-python@v5
        with:
          python-version: ${{ env.PYTHON_VERSION }}

      - name: Install Dependencies
        run: |
          sudo apt-get update --yes
          sudo apt-get install -y wget unzip coreutils procps grep sed
        shell: bash

      - name: Set up Cloud SDK
        uses: google-github-actions/setup-gcloud@v2

      - name: Download RC Artifacts
        run: |
          echo "Downloading from ${{ env.APACHE_DIST_URL_BASE }}"
          # Python SDK tarball (and its checksum) live under the /python subdir.
          wget ${{ env.APACHE_DIST_URL_BASE }}/python/${{ env.BEAM_PYTHON_SDK_TAR_GZ }}
          wget ${{ env.APACHE_DIST_URL_BASE }}/python/${{ env.BEAM_PYTHON_SDK_TAR_GZ }}.sha512
          # Source zip not strictly needed if installing from tar.gz, but keeping for consistency/potential future use
          wget ${{ env.APACHE_DIST_URL_BASE }}/${{ env.BEAM_SOURCE_ZIP }}
          wget ${{ env.APACHE_DIST_URL_BASE }}/${{ env.BEAM_SOURCE_ZIP }}.sha512
        shell: bash

      - name: Verify Hashes
        run: |
          echo "Verifying sha512 checksums..."
          sha512sum -c ${{ env.BEAM_PYTHON_SDK_TAR_GZ }}.sha512
          sha512sum -c ${{ env.BEAM_SOURCE_ZIP }}.sha512
        shell: bash

      - name: Setup Python Virtual Environment
        run: |
          echo "Setting up Python virtual environment..."
          python -m venv beam_env
          source beam_env/bin/activate
          pip install --upgrade pip setuptools wheel
          echo "Virtual environment ready."
        shell: bash

      - name: Install Python SDK with [gcp, yaml] extras
        run: |
          echo "Installing Python SDK: ${{ env.BEAM_PYTHON_SDK_TAR_GZ }} with [gcp,yaml] extras"
          source beam_env/bin/activate
          # Install from the downloaded tar.gz
          pip install "${{ env.BEAM_PYTHON_SDK_TAR_GZ }}[gcp,yaml]"
          echo "SDK installed."
          pip freeze # Log installed packages
        shell: bash

      - name: Create YAML Pipeline File
        run: |
          echo "Creating YAML pipeline file: ${{ env.YAML_PIPELINE_FILE }}"
          cat <<EOF > ${{ env.YAML_PIPELINE_FILE }}
          pipeline:
            type: chain
            transforms:
              - type: ReadFromPubSub
                config:
                  topic: projects/pubsub-public-data/topics/taxirides-realtime
                  format: json
                  schema:
                    type: object
                    properties:
                      ride_id: {type: string}
              - type: WriteToJson
                config:
                  # Construct the output path directly here
                  path: "${{ env.GCS_UNIQUE_FOLDER_PREFIX }}/output/out.json"
                  num_shards: 100
                windowing:
                  type: fixed
                  size: 30s
          options:
            streaming: true
          EOF
          echo "YAML file created:"
          cat ${{ env.YAML_PIPELINE_FILE }}
        shell: bash

      - name: Run YAML Pipeline (Dataflow Runner), Wait, Extract ID, Cleanup Submitter
        id: submit_yaml_df
        run: |
          echo "Running YAML Pipeline with DataflowRunner in Background..."
          source beam_env/bin/activate
          python -m apache_beam.yaml.main \
            --yaml_pipeline_file=${{ env.YAML_PIPELINE_FILE }} \
            --runner DataflowRunner \
            --region=${{ env.GCE_REGION }} \
            --project=${{ env.GCP_PROJECT_ID }} \
            --temp_location ${{ env.GCS_UNIQUE_FOLDER_PREFIX }}/temp \
            --staging_location ${{ env.GCS_UNIQUE_FOLDER_PREFIX }}/staging \
            > yaml_dataflow_submit.log 2>&1 &

          YAML_DF_PID=$!
          echo "YAML Pipeline (Dataflow Runner) submission process started in background with PID: ${YAML_DF_PID}"
          echo ${YAML_DF_PID} > yaml_dataflow_submit.pid

          echo "Waiting up to ${{ env.SUBMISSION_TIMEOUT_SECONDS }} seconds for Dataflow job submission process (PID: ${YAML_DF_PID}) to potentially complete..."
          sleep ${{ env.SUBMISSION_TIMEOUT_SECONDS }}

          echo "Proceeding with Job ID extraction..."
          # Try extracting Job ID using common patterns from Dataflow submission logs.
          # The alternatives are grouped so that `head -n 1` applies to whichever
          # pattern matched, guaranteeing a single-line JOB_ID even when the
          # first or second grep produces multiple matches.
          JOB_ID=$( (grep -oP 'Dataflow Job ID: \K\S+' yaml_dataflow_submit.log || grep -oP "job_id='?\K[^' >]+" yaml_dataflow_submit.log || grep -oP "id: '?\"?\K[^'\" >]+" yaml_dataflow_submit.log) | head -n 1 )

          if [[ -n "$JOB_ID" ]]; then
            echo "Extracted YAML Dataflow Job ID: $JOB_ID"
            echo "$JOB_ID" > yaml_dataflow_jobid.txt
          else
            echo "ERROR: Could not extract YAML Dataflow Job ID after ${{ env.SUBMISSION_TIMEOUT_SECONDS }}s wait. Log content:"
            echo "--- YAML Dataflow submission log START ---"
            cat yaml_dataflow_submit.log || echo "Log file not found."
            echo "--- YAML Dataflow submission log END ---"
            # Exit the step with failure if job ID is crucial and not found
            exit 1
          fi

          # Check if the submission process is still running and kill it if necessary
          if [ -f yaml_dataflow_submit.pid ] && ps -p $YAML_DF_PID > /dev/null; then
            echo "Submission process (PID: $YAML_DF_PID) is still running after ${{ env.SUBMISSION_TIMEOUT_SECONDS }}s. Attempting to kill it."
            kill -9 $YAML_DF_PID || echo "Failed to kill process $YAML_DF_PID."
          else
            echo "Submission process (PID: $YAML_DF_PID) has already finished or PID file is missing."
          fi
          # Clean up PID file regardless
          if [ -f yaml_dataflow_submit.pid ]; then
            rm yaml_dataflow_submit.pid
          fi

          echo "YAML Pipeline (Dataflow Runner) submission step finished processing."
        shell: bash

      - name: Wait for Job to Run
        run: |
          if [ ! -f yaml_dataflow_jobid.txt ]; then
            echo "Skipping wait as Job ID was not extracted."
            exit 0 # Allow cleanup to proceed
          fi
          JOB_ID=$(cat yaml_dataflow_jobid.txt)
          echo "Waiting for 20 minutes for Dataflow job $JOB_ID to run..."
          sleep 1200 # 20 minutes = 1200 seconds
          echo "Wait finished."
        shell: bash

      - name: Cancel YAML Dataflow Job
        if: always() # Run even if wait failed or previous steps failed, to attempt cleanup
        run: |
          if [ -f yaml_dataflow_jobid.txt ]; then
            JOB_ID=$(cat yaml_dataflow_jobid.txt)
            if [[ -n "$JOB_ID" ]]; then
              echo "Attempting to cancel YAML Dataflow job: $JOB_ID in region ${{ env.GCE_REGION }}"
              gcloud dataflow jobs cancel "$JOB_ID" --region=${{ env.GCE_REGION }} --project=${{ env.GCP_PROJECT_ID }} || echo "Failed to cancel YAML Dataflow job $JOB_ID (maybe it finished or was already cancelled)."
            else
              echo "YAML Dataflow Job ID file exists but is empty."
            fi
            # Keep jobid file for validation step, remove in final cleanup
          else
            echo "yaml_dataflow_jobid.txt not found, cannot cancel job (it might have failed before ID extraction)."
          fi
        shell: bash

      - name: Validate GCS Output
        run: |
          if [ ! -f yaml_dataflow_jobid.txt ]; then
            echo "Skipping GCS validation as Job ID was not extracted (job likely failed early)."
            exit 0 # Allow cleanup to proceed
          fi
          # Construct the output path pattern directly here
          OUTPUT_PATTERN="${{ env.GCS_UNIQUE_FOLDER_PREFIX }}/output/out.json-*-of-*"
          echo "Validating GCS output files exist matching pattern: ${OUTPUT_PATTERN}"
          # Wait a bit for cancellation to finalize and files to potentially appear fully
          sleep 60
          # Check if any files matching the pattern exist within the unique output folder.
          echo "Checking for files matching pattern: ${OUTPUT_PATTERN}"
          if gsutil ls "${OUTPUT_PATTERN}" > /dev/null 2>&1; then
            echo "SUCCESS: Found output files matching pattern in GCS."
            gsutil ls "${OUTPUT_PATTERN}" # List found files
          else
            echo "ERROR: No output files found matching pattern '${OUTPUT_PATTERN}' in GCS bucket."
            exit 1
          fi
        shell: bash

      # ================== Cleanup ==================
      - name: Cleanup GCS Temp/Staging and Local Files
        if: always()
        run: |
          echo "Deleting unique run folder in GCS: ${{ env.GCS_UNIQUE_FOLDER_PREFIX }}"
          # Delete the entire unique folder for this run, including temp, staging, and output
          gsutil -m rm -r "${{ env.GCS_UNIQUE_FOLDER_PREFIX }}" || echo "Failed to delete unique run folder ${{ env.GCS_UNIQUE_FOLDER_PREFIX }} in GCS. Manual cleanup might be required."

          echo "Removing local log, yaml, and jobid files..."
          rm -f yaml_dataflow_submit.log ${{ env.YAML_PIPELINE_FILE }} yaml_dataflow_jobid.txt
        shell: bash

contributor-docs/release-guide.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -908,6 +908,7 @@ Wiki](https://cwiki.apache.org/confluence/display/BEAM/Python+Tips#PythonTips-In
908908
- [ ] Java Mobile Gaming RC Validation (~60min): https://github.com/apache/beam/actions/workflows/run_rc_validation_java_mobile_gaming.yml
909909
- [ ] Python Mobile Gaming RC Validation (~90min): https://github.com/apache/beam/actions/workflows/run_rc_validation_python_mobile_gaming.yml
910910
- [ ] Go SDK Release Candidate Validation: https://github.com/apache/beam/actions/workflows/run_rc_validation_go_wordcount.yml
911+
- [ ] YAML Release Candidate Validation: https://github.com/apache/beam/actions/workflows/run_rc_validation_python_yaml.yml
911912

912913
### Checklist to proceed to the next phase
913914

0 commit comments

Comments
 (0)