Skip to content

Commit a6ec655

Browse files
authored
Bigtable scenario (#111)
* Add Dataproc CloudSql test Signed-off-by: Dominik Dębowczyk <dominik.debowczyk@getindata.com> * Add Bigtable scenario Signed-off-by: Dominik Dębowczyk <dominik.debowczyk@getindata.com> --------- Signed-off-by: Dominik Dębowczyk <dominik.debowczyk@getindata.com>
1 parent 2f033d7 commit a6ec655

39 files changed

+1801
-8
lines changed

.github/workflows/main_new_release.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,8 @@ jobs:
9090
uses: ./.github/workflows/producer_spark_dataproc.yml
9191
secrets:
9292
gcpKey: ${{ secrets.GCP_SA_KEY }}
93+
postgresqlUser: ${{ secrets.POSTGRESQL_USER }}
94+
postgresqlPassword: ${{ secrets.POSTGRESQL_PASSWORD }}
9395
with:
9496
release: ${{ needs.initialize_workflow.outputs.openlineage_release }}
9597
get-latest-snapshots: 'false'

.github/workflows/main_pr.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,8 @@ jobs:
8282
uses: ./.github/workflows/producer_spark_dataproc.yml
8383
secrets:
8484
gcpKey: ${{ secrets.GCP_SA_KEY }}
85+
postgresqlUser: ${{ secrets.POSTGRESQL_USER }}
86+
postgresqlPassword: ${{ secrets.POSTGRESQL_PASSWORD }}
8587
with:
8688
release: ${{ needs.initialize_workflow.outputs.ol_release }}
8789
get-latest-snapshots: 'false'

.github/workflows/producer_spark_dataproc.yml

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,10 @@ on:
55
secrets:
66
gcpKey:
77
required: true
8+
postgresqlUser:
9+
required: true
10+
postgresqlPassword:
11+
required: true
812
inputs:
913
release:
1014
description: "release tag of OpenLineage to use"
@@ -80,6 +84,14 @@ jobs:
8084
local-file-path: producer/spark_dataproc/runner/get_openlineage_jar.sh
8185
gcs-path: "gs://open-lineage-e2e/scripts"
8286
credentials: ${{ steps.gcp-auth.outputs.credentials_file_path }}
87+
88+
- name: Upload CloudSQL init actions to GCS
89+
id: upload-cloud-sql-initialization-actions
90+
uses: ./.github/actions/upload_artifacts
91+
with:
92+
local-file-path: producer/spark_dataproc/runner/cloud_sql_proxy.sh
93+
gcs-path: "gs://open-lineage-e2e/scripts"
94+
credentials: ${{ steps.gcp-auth.outputs.credentials_file_path }}
8395

8496
- name: Set up Python 3.11
8597
uses: actions/setup-python@v3
@@ -100,8 +112,8 @@ jobs:
100112
--region us-west1 \
101113
--cluster-name dataproc-producer-test-${{ github.run_id }} \
102114
--credentials-file ${{ steps.gcp-auth.outputs.credentials_file_path }} \
103-
--metadata "SPARK_BQ_CONNECTOR_URL=gs://open-lineage-e2e/jars/spark-3.5-bigquery-0.0.1-SNAPSHOT.jar,OPENLINEAGE_SPARK_URL=${{ steps.upload-spark-integration.outputs.uploaded-file }},SPARK_SPANNER_CONNECTOR_URL=gs://open-lineage-e2e/jars/spark-3.1-spanner-1.1.0.jar" \
104-
--initialization-actions="${{ steps.upload-initialization-actions.outputs.uploaded-file }}"
115+
--metadata 'SPARK_BQ_CONNECTOR_URL=gs://open-lineage-e2e/jars/spark-3.5-bigquery-0.0.1-SNAPSHOT.jar,OPENLINEAGE_SPARK_URL=${{ steps.upload-spark-integration.outputs.uploaded-file }},SPARK_SPANNER_CONNECTOR_URL=gs://open-lineage-e2e/jars/spark-3.1-spanner-1.1.0.jar,SPARK_BIGTABLE_CONNECTOR_URL=gs://open-lineage-e2e/jars/spark-bigtable_2.12-0.3.0.jar,enable-cloud-sql-hive-metastore=false,additional-cloud-sql-instances=gcp-open-lineage-testing:us-central1:open-lineage-e2e=tcp:3307' \
116+
--initialization-actions="${{ steps.upload-initialization-actions.outputs.uploaded-file }},${{ steps.upload-cloud-sql-initialization-actions.outputs.uploaded-file }}"
105117
# --metadata "SPARK_BQ_CONNECTOR_URL=${{ steps.upload-spark-bq-connector.outputs.uploaded-file }},OPENLINEAGE_SPARK_URL=${{ steps.upload-spark-integration.outputs.uploaded-file }}" \
106118

107119
- name: Set producer output event dir
@@ -126,7 +138,7 @@ jobs:
126138
--gcs-bucket open-lineage-e2e \
127139
--python-job "$run_script" \
128140
--jars "${{ steps.upload-gcs-transport.outputs.uploaded-file }}" \
129-
--spark-properties "spark.extraListeners=io.openlineage.spark.agent.OpenLineageSparkListener,spark.sql.warehouse.dir=/tmp/warehouse,spark.openlineage.transport.type=gcs" \
141+
--spark-properties 'spark.extraListeners=io.openlineage.spark.agent.OpenLineageSparkListener,spark.sql.warehouse.dir=/tmp/warehouse,spark.openlineage.transport.type=gcs,spark.driver.POSTGRESQL_USER=${{ secrets.postgresqlUser }},spark.driver.POSTGRESQL_PASSWORD=${{ secrets.postgresqlPassword }}' \
130142
--output-directory "${{ steps.set-producer-output.outputs.event_dir }}/$scenario" \
131143
--credentials-file "${{ steps.gcp-auth.outputs.credentials_file_path }}" \
132144
--dataproc-image-version 2.2-ubuntu22 \

.gitignore

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,8 @@ ipython_config.py
8787
# intended to run in multiple environments; otherwise, check them in:
8888
# .python-version
8989

90+
.venv/
91+
9092
# pipenv
9193
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
9294
# However, in case of collaboration, if having platform-specific dependencies or dependencies
@@ -161,4 +163,4 @@ cython_debug/
161163
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
162164
#.idea/
163165

164-
ignored/
166+
ignored/

get_openlineage_jar.sh

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
#!/bin/bash
2+
3+
# Installs OpenLineage jar onto a Cloud Dataproc cluster.
4+
5+
set -euxo pipefail
6+
7+
readonly VM_SPARK_JARS_DIR=/usr/lib/spark/jars
8+
readonly SPARK_BQ_CONNECTOR_URL=$(/usr/share/google/get_metadata_value attributes/SPARK_BQ_CONNECTOR_URL || echo "")
9+
readonly OPENLINEAGE_SPARK_URL=$(/usr/share/google/get_metadata_value attributes/OPENLINEAGE_SPARK_URL || echo "")
10+
readonly SPARK_SPANNER_CONNECTOR_URL=$(/usr/share/google/get_metadata_value attributes/SPARK_SPANNER_CONNECTOR_URL || echo "")
11+
readonly SPARK_BIGTABLE_CONNECTOR_URL=$(/usr/share/google/get_metadata_value attributes/SPARK_BIGTABLE_CONNECTOR_URL || echo "")
12+
13+
14+
if [[ -n "${OPENLINEAGE_SPARK_URL}" ]]; then
15+
bq_url="${SPARK_BQ_CONNECTOR_URL}"
16+
ol_url="${OPENLINEAGE_SPARK_URL}"
17+
spanner_url="${SPARK_SPANNER_CONNECTOR_URL}"
18+
else
19+
bq_url="gs://open-lineage-e2e/jars/spark-3.5-bigquery-0.0.1-SNAPSHOT.jar"
20+
ol_url="gs://open-lineage-e2e/jars/openlineage-spark_2.12-1.29.0-SNAPSHOT.jar"
21+
spanner_url="gs://open-lineage-e2e/jars/spark-3.1-spanner-1.1.0.jar"
22+
bigtable_url="gs://open-lineage-e2e/jars/spark-bigtable_2.12-0.3.0.jar"
23+
fi
24+
25+
postgresql_url="gs://open-lineage-e2e/jars/postgresql-42.5.6.jar"
26+
27+
gsutil cp -P "${bq_url}" "${VM_SPARK_JARS_DIR}/"
28+
gsutil cp -P "${ol_url}" "${VM_SPARK_JARS_DIR}/"
29+
gsutil cp -P "${spanner_url}" "${VM_SPARK_JARS_DIR}/"
30+
gsutil cp -P "${postgresql_url}" "${VM_SPARK_JARS_DIR}/"
31+
gsutil cp -P "${bigtable_url}" "${VM_SPARK_JARS_DIR}/"

0 commit comments

Comments (0)