Skip to content

Commit a6ec655

Browse files
authored
Bigtable scenario (#111)
* Add Dataproc CloudSql test Signed-off-by: Dominik Dębowczyk <dominik.debowczyk@getindata.com> * Add Bigtable scenario Signed-off-by: Dominik Dębowczyk <dominik.debowczyk@getindata.com> --------- Signed-off-by: Dominik Dębowczyk <dominik.debowczyk@getindata.com>
1 parent 2f033d7 commit a6ec655

39 files changed

+1801
-8
lines changed

.github/workflows/main_new_release.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,8 @@ jobs:
9090
uses: ./.github/workflows/producer_spark_dataproc.yml
9191
secrets:
9292
gcpKey: ${{ secrets.GCP_SA_KEY }}
93+
postgresqlUser: ${{ secrets.POSTGRESQL_USER }}
94+
postgresqlPassword: ${{ secrets.POSTGRESQL_PASSWORD }}
9395
with:
9496
release: ${{ needs.initialize_workflow.outputs.openlineage_release }}
9597
get-latest-snapshots: 'false'

.github/workflows/main_pr.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,8 @@ jobs:
8282
uses: ./.github/workflows/producer_spark_dataproc.yml
8383
secrets:
8484
gcpKey: ${{ secrets.GCP_SA_KEY }}
85+
postgresqlUser: ${{ secrets.POSTGRESQL_USER }}
86+
postgresqlPassword: ${{ secrets.POSTGRESQL_PASSWORD }}
8587
with:
8688
release: ${{ needs.initialize_workflow.outputs.ol_release }}
8789
get-latest-snapshots: 'false'

.github/workflows/producer_spark_dataproc.yml

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,10 @@ on:
55
secrets:
66
gcpKey:
77
required: true
8+
postgresqlUser:
9+
required: true
10+
postgresqlPassword:
11+
required: true
812
inputs:
913
release:
1014
description: "release tag of OpenLineage to use"
@@ -80,6 +84,14 @@ jobs:
8084
local-file-path: producer/spark_dataproc/runner/get_openlineage_jar.sh
8185
gcs-path: "gs://open-lineage-e2e/scripts"
8286
credentials: ${{ steps.gcp-auth.outputs.credentials_file_path }}
87+
88+
- name: Upload CloudSQL init actions to GCS
89+
id: upload-cloud-sql-initialization-actions
90+
uses: ./.github/actions/upload_artifacts
91+
with:
92+
local-file-path: producer/spark_dataproc/runner/cloud_sql_proxy.sh
93+
gcs-path: "gs://open-lineage-e2e/scripts"
94+
credentials: ${{ steps.gcp-auth.outputs.credentials_file_path }}
8395

8496
- name: Set up Python 3.11
8597
uses: actions/setup-python@v3
@@ -100,8 +112,8 @@ jobs:
100112
--region us-west1 \
101113
--cluster-name dataproc-producer-test-${{ github.run_id }} \
102114
--credentials-file ${{ steps.gcp-auth.outputs.credentials_file_path }} \
103-
--metadata "SPARK_BQ_CONNECTOR_URL=gs://open-lineage-e2e/jars/spark-3.5-bigquery-0.0.1-SNAPSHOT.jar,OPENLINEAGE_SPARK_URL=${{ steps.upload-spark-integration.outputs.uploaded-file }},SPARK_SPANNER_CONNECTOR_URL=gs://open-lineage-e2e/jars/spark-3.1-spanner-1.1.0.jar" \
104-
--initialization-actions="${{ steps.upload-initialization-actions.outputs.uploaded-file }}"
115+
--metadata 'SPARK_BQ_CONNECTOR_URL=gs://open-lineage-e2e/jars/spark-3.5-bigquery-0.0.1-SNAPSHOT.jar,OPENLINEAGE_SPARK_URL=${{ steps.upload-spark-integration.outputs.uploaded-file }},SPARK_SPANNER_CONNECTOR_URL=gs://open-lineage-e2e/jars/spark-3.1-spanner-1.1.0.jar,SPARK_BIGTABLE_CONNECTOR_URL=gs://open-lineage-e2e/jars/spark-bigtable_2.12-0.3.0.jar,enable-cloud-sql-hive-metastore=false,additional-cloud-sql-instances=gcp-open-lineage-testing:us-central1:open-lineage-e2e=tcp:3307' \
116+
--initialization-actions="${{ steps.upload-initialization-actions.outputs.uploaded-file }},${{ steps.upload-cloud-sql-initialization-actions.outputs.uploaded-file }}"
105117
# --metadata "SPARK_BQ_CONNECTOR_URL=${{ steps.upload-spark-bq-connector.outputs.uploaded-file }},OPENLINEAGE_SPARK_URL=${{ steps.upload-spark-integration.outputs.uploaded-file }}" \
106118

107119
- name: Set producer output event dir
@@ -126,7 +138,7 @@ jobs:
126138
--gcs-bucket open-lineage-e2e \
127139
--python-job "$run_script" \
128140
--jars "${{ steps.upload-gcs-transport.outputs.uploaded-file }}" \
129-
--spark-properties "spark.extraListeners=io.openlineage.spark.agent.OpenLineageSparkListener,spark.sql.warehouse.dir=/tmp/warehouse,spark.openlineage.transport.type=gcs" \
141+
--spark-properties 'spark.extraListeners=io.openlineage.spark.agent.OpenLineageSparkListener,spark.sql.warehouse.dir=/tmp/warehouse,spark.openlineage.transport.type=gcs,spark.driver.POSTGRESQL_USER=${{ secrets.postgresqlUser }},spark.driver.POSTGRESQL_PASSWORD=${{ secrets.postgresqlPassword }}' \
130142
--output-directory "${{ steps.set-producer-output.outputs.event_dir }}/$scenario" \
131143
--credentials-file "${{ steps.gcp-auth.outputs.credentials_file_path }}" \
132144
--dataproc-image-version 2.2-ubuntu22 \

.gitignore

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,8 @@ ipython_config.py
8787
# intended to run in multiple environments; otherwise, check them in:
8888
# .python-version
8989

90+
.venv/
91+
9092
# pipenv
9193
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
9294
# However, in case of collaboration, if having platform-specific dependencies or dependencies
@@ -161,4 +163,4 @@ cython_debug/
161163
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
162164
#.idea/
163165

164-
ignored/
166+
ignored/

get_openlineage_jar.sh

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
#!/bin/bash
2+
3+
# Installs OpenLineage jar onto a Cloud Dataproc cluster.
4+
5+
set -euxo pipefail
6+
7+
readonly VM_SPARK_JARS_DIR=/usr/lib/spark/jars
8+
readonly SPARK_BQ_CONNECTOR_URL=$(/usr/share/google/get_metadata_value attributes/SPARK_BQ_CONNECTOR_URL || echo "")
9+
readonly OPENLINEAGE_SPARK_URL=$(/usr/share/google/get_metadata_value attributes/OPENLINEAGE_SPARK_URL || echo "")
10+
readonly SPARK_SPANNER_CONNECTOR_URL=$(/usr/share/google/get_metadata_value attributes/SPARK_SPANNER_CONNECTOR_URL || echo "")
11+
readonly SPARK_BIGTABLE_CONNECTOR_URL=$(/usr/share/google/get_metadata_value attributes/SPARK_BIGTABLE_CONNECTOR_URL || echo "")
12+
13+
14+
if [[ -n "${OPENLINEAGE_SPARK_URL}" ]]; then
15+
bq_url="${SPARK_BQ_CONNECTOR_URL}"
16+
ol_url="${OPENLINEAGE_SPARK_URL}"
17+
spanner_url="${SPARK_SPANNER_CONNECTOR_URL}"
18+
else
19+
bq_url="gs://open-lineage-e2e/jars/spark-3.5-bigquery-0.0.1-SNAPSHOT.jar"
20+
ol_url="gs://open-lineage-e2e/jars/openlineage-spark_2.12-1.29.0-SNAPSHOT.jar"
21+
spanner_url="gs://open-lineage-e2e/jars/spark-3.1-spanner-1.1.0.jar"
22+
bigtable_url="gs://open-lineage-e2e/jars/spark-bigtable_2.12-0.3.0.jar"
23+
fi
24+
25+
postgresql_url="gs://open-lineage-e2e/jars/postgresql-42.5.6.jar"
26+
27+
gsutil cp -P "${bq_url}" "${VM_SPARK_JARS_DIR}/"
28+
gsutil cp -P "${ol_url}" "${VM_SPARK_JARS_DIR}/"
29+
gsutil cp -P "${spanner_url}" "${VM_SPARK_JARS_DIR}/"
30+
gsutil cp -P "${postgresql_url}" "${VM_SPARK_JARS_DIR}/"
31+
gsutil cp -P "${bigtable_url}" "${VM_SPARK_JARS_DIR}/"

0 commit comments

Comments (0)