diff --git a/.github/actions/upload_artifacts/action.yml b/.github/actions/upload_artifacts/action.yml index b93a193b..246cc068 100644 --- a/.github/actions/upload_artifacts/action.yml +++ b/.github/actions/upload_artifacts/action.yml @@ -10,6 +10,7 @@ inputs: required: true credentials: description: "GCP credentials" + required: false outputs: uploaded-file: value: ${{ steps.upload-artifact.outputs.uploaded_file }} @@ -29,7 +30,7 @@ runs: id: upload-artifact shell: bash run: | - python scripts/upload_file_to_gcs.py ${{ inputs.local-file-path }} ${{ inputs.gcs-path }} --credentials ${{ inputs.credentials }} + python scripts/upload_file_to_gcs.py ${{ inputs.local-file-path }} ${{ inputs.gcs-path }} exit_code=$? if [ $exit_code -ne 0 ]; then diff --git a/.github/workflows/consumer_dataplex.yml b/.github/workflows/consumer_dataplex.yml index 551a7604..4c27f26c 100644 --- a/.github/workflows/consumer_dataplex.yml +++ b/.github/workflows/consumer_dataplex.yml @@ -4,11 +4,19 @@ on: workflow_call: secrets: gcpKey: - required: true + required: false inputs: release: description: "release tag of OpenLineage to use" type: string + workload_identity_provider: + description: "GCP Workload Identity Provider (if not using credentials_json)" + type: string + required: false + service_account: + description: "GCP Service Account email (if not using credentials_json)" + type: string + required: false permissions: contents: read @@ -25,11 +33,29 @@ jobs: uses: actions/setup-python@v5 with: python-version: "3.11" - - name: GCP authorization - id: gcp-auth + - name: GCP authorization (Workload Identity) + id: gcp-auth-wif + if: ${{ inputs.workload_identity_provider != '' }} + uses: 'google-github-actions/auth@v2' + with: + workload_identity_provider: '${{ inputs.workload_identity_provider }}' + service_account: '${{ inputs.service_account }}' + + - name: GCP authorization (JSON Key) + id: gcp-auth-json + if: ${{ inputs.workload_identity_provider == '' }} uses: 
'google-github-actions/auth@v2' with: credentials_json: '${{ secrets.gcpKey }}' + + - name: Set credentials file path + id: gcp-auth + run: | + if [ -n "${{ steps.gcp-auth-wif.outputs.credentials_file_path }}" ]; then + echo "credentials_file_path=${{ steps.gcp-auth-wif.outputs.credentials_file_path }}" >> $GITHUB_OUTPUT + else + echo "credentials_file_path=${{ steps.gcp-auth-json.outputs.credentials_file_path }}" >> $GITHUB_OUTPUT + fi - name: Install dependencies run: | python -m pip install --upgrade pip @@ -50,4 +76,3 @@ jobs: name: dataplex-report path: dataplex-report.json retention-days: 1 - diff --git a/.github/workflows/main_new_release.yml b/.github/workflows/main_new_release.yml index 1958c959..24be6ad7 100644 --- a/.github/workflows/main_new_release.yml +++ b/.github/workflows/main_new_release.yml @@ -97,10 +97,10 @@ jobs: needs: initialize_workflow if: ${{ needs.initialize_workflow.outputs.run_dataplex == 'true' }} uses: ./.github/workflows/consumer_dataplex.yml - secrets: - gcpKey: ${{ secrets.GCP_SA_KEY }} with: release: ${{ needs.initialize_workflow.outputs.openlineage_release }} + workload_identity_provider: ${{ vars.GCP_WORKLOAD_IDENTITY_PROVIDER }} + service_account: ${{ vars.GCP_SERVICE_ACCOUNT }} spark-dataproc: needs: initialize_workflow @@ -109,13 +109,14 @@ jobs: strategy: matrix: ${{ fromJson(needs.initialize_workflow.outputs.spark_matrix) }} secrets: - gcpKey: ${{ secrets.GCP_SA_KEY }} postgresqlUser: ${{ secrets.POSTGRESQL_USER }} postgresqlPassword: ${{ secrets.POSTGRESQL_PASSWORD }} with: ol_release: ${{ matrix.openlineage_versions }} spark_release: ${{ matrix.component_version }} get-latest-snapshots: 'false' + workload_identity_provider: ${{ vars.GCP_WORKLOAD_IDENTITY_PROVIDER }} + service_account: ${{ vars.GCP_SERVICE_ACCOUNT }} hive-dataproc: needs: initialize_workflow @@ -123,12 +124,12 @@ jobs: uses: ./.github/workflows/producer_hive_dataproc.yml strategy: matrix: ${{ fromJson(needs.initialize_workflow.outputs.hive_matrix) }} - 
secrets: - gcpKey: ${{ secrets.GCP_SA_KEY }} with: ol_release: ${{ matrix.openlineage_versions }} component_release: ${{ matrix.component_version }} get-latest-snapshots: 'false' + workload_identity_provider: ${{ vars.GCP_WORKLOAD_IDENTITY_PROVIDER }} + service_account: ${{ vars.GCP_SERVICE_ACCOUNT }} ######## COLLECTION OF REPORTS AND EXECUTE APPROPRIATE ACTIONS ######## diff --git a/.github/workflows/main_ol_spec_changes.yml b/.github/workflows/main_ol_spec_changes.yml index 69fab929..498f8d8c 100644 --- a/.github/workflows/main_ol_spec_changes.yml +++ b/.github/workflows/main_ol_spec_changes.yml @@ -132,13 +132,14 @@ jobs: strategy: matrix: ${{ fromJson(needs.initialize_workflow.outputs.spark_matrix) }} secrets: - gcpKey: ${{ secrets.GCP_SA_KEY }} postgresqlUser: ${{ secrets.POSTGRESQL_USER }} postgresqlPassword: ${{ secrets.POSTGRESQL_PASSWORD }} with: ol_release: ${{ matrix.openlineage_versions }} spark_release: ${{ matrix.component_version }} get-latest-snapshots: 'true' + workload_identity_provider: ${{ vars.GCP_WORKLOAD_IDENTITY_PROVIDER }} + service_account: ${{ vars.GCP_SERVICE_ACCOUNT }} hive-dataproc: needs: @@ -147,12 +148,12 @@ jobs: uses: ./.github/workflows/producer_hive_dataproc.yml strategy: matrix: ${{ fromJson(needs.initialize_workflow.outputs.hive_matrix) }} - secrets: - gcpKey: ${{ secrets.GCP_SA_KEY }} with: ol_release: ${{ matrix.openlineage_versions }} component_release: ${{ matrix.component_version }} get-latest-snapshots: 'true' + workload_identity_provider: ${{ vars.GCP_WORKLOAD_IDENTITY_PROVIDER }} + service_account: ${{ vars.GCP_SERVICE_ACCOUNT }} ######## COLLECTION OF REPORTS AND EXECUTE APPROPRIATE ACTIONS ######## diff --git a/.github/workflows/main_pr.yml b/.github/workflows/main_pr.yml index e52f36af..64cb1207 100644 --- a/.github/workflows/main_pr.yml +++ b/.github/workflows/main_pr.yml @@ -112,10 +112,10 @@ jobs: - scenarios if: ${{ !failure() && needs.initialize_workflow.outputs.run_dataplex == 'true' }} uses: 
./.github/workflows/consumer_dataplex.yml - secrets: - gcpKey: ${{ secrets.GCP_SA_KEY }} with: release: ${{ needs.initialize_workflow.outputs.ol_release }} + workload_identity_provider: 'projects/484892851355/locations/global/workloadIdentityPools/github-actions-pool/providers/github-oidc-provider' + service_account: ${{ vars.GCP_SERVICE_ACCOUNT }} spark_dataproc: needs: initialize_workflow @@ -124,13 +124,14 @@ jobs: strategy: matrix: ${{ fromJson(needs.initialize_workflow.outputs.spark_matrix) }} secrets: - gcpKey: ${{ secrets.GCP_SA_KEY }} postgresqlUser: ${{ secrets.POSTGRESQL_USER }} postgresqlPassword: ${{ secrets.POSTGRESQL_PASSWORD }} with: ol_release: ${{ matrix.openlineage_versions }} spark_release: ${{ matrix.component_version }} get-latest-snapshots: 'false' + workload_identity_provider: 'projects/484892851355/locations/global/workloadIdentityPools/github-actions-pool/providers/github-oidc-provider' + service_account: ${{ vars.GCP_SERVICE_ACCOUNT }} hive_dataproc: needs: initialize_workflow @@ -138,12 +139,12 @@ jobs: uses: ./.github/workflows/producer_hive_dataproc.yml strategy: matrix: ${{ fromJson(needs.initialize_workflow.outputs.hive_matrix) }} - secrets: - gcpKey: ${{ secrets.GCP_SA_KEY }} with: ol_release: ${{ matrix.openlineage_versions }} component_release: ${{ matrix.component_version }} get-latest-snapshots: 'false' + workload_identity_provider: 'projects/484892851355/locations/global/workloadIdentityPools/github-actions-pool/providers/github-oidc-provider' + service_account: ${{ vars.GCP_SERVICE_ACCOUNT }} ######## COLLECTION OF REPORTS AND EXECUTE APPROPRIATE ACTIONS ######## diff --git a/.github/workflows/producer_hive_dataproc.yml b/.github/workflows/producer_hive_dataproc.yml index 85991364..713f11cd 100644 --- a/.github/workflows/producer_hive_dataproc.yml +++ b/.github/workflows/producer_hive_dataproc.yml @@ -4,7 +4,7 @@ on: workflow_call: secrets: gcpKey: - required: true + required: false inputs: component_release: description: 
"release of hive dataproc to use" @@ -15,6 +15,14 @@ on: get-latest-snapshots: description: "Should the artifact be downloaded from maven repo or circleci" type: string + workload_identity_provider: + description: "GCP Workload Identity Provider (if not using credentials_json)" + type: string + required: false + service_account: + description: "GCP Service Account email (if not using credentials_json)" + type: string + required: false jobs: run-hive-tests: @@ -42,13 +50,31 @@ jobs: esac - - name: GCP authorization - id: gcp-auth - if: ${{ steps.init.outputs.scenarios }} + - name: GCP authorization (Workload Identity) + id: gcp-auth-wif + if: ${{ steps.init.outputs.scenarios && inputs.workload_identity_provider != '' }} + uses: 'google-github-actions/auth@v2' + with: + workload_identity_provider: '${{ inputs.workload_identity_provider }}' + service_account: '${{ inputs.service_account }}' + + - name: GCP authorization (JSON Key) + id: gcp-auth-json + if: ${{ steps.init.outputs.scenarios && inputs.workload_identity_provider == '' }} uses: 'google-github-actions/auth@v2' with: credentials_json: '${{ secrets.gcpKey }}' + - name: Set credentials file path + id: gcp-auth + if: ${{ steps.init.outputs.scenarios }} + run: | + if [ -n "${{ steps.gcp-auth-wif.outputs.credentials_file_path }}" ]; then + echo "credentials_file_path=${{ steps.gcp-auth-wif.outputs.credentials_file_path }}" >> $GITHUB_OUTPUT + else + echo "credentials_file_path=${{ steps.gcp-auth-json.outputs.credentials_file_path }}" >> $GITHUB_OUTPUT + fi + - name: Get OL artifacts id: get-ol-artifacts if: ${{ steps.init.outputs.scenarios }} diff --git a/.github/workflows/producer_spark_dataproc.yml b/.github/workflows/producer_spark_dataproc.yml index c399b3ab..cf3baa8b 100644 --- a/.github/workflows/producer_spark_dataproc.yml +++ b/.github/workflows/producer_spark_dataproc.yml @@ -4,7 +4,7 @@ on: workflow_call: secrets: gcpKey: - required: true + required: false postgresqlUser: required: true 
postgresqlPassword: @@ -19,6 +19,14 @@ on: get-latest-snapshots: description: "Should the artifact be downloaded from maven repo or circleci" type: string + workload_identity_provider: + description: "GCP Workload Identity Provider (if not using credentials_json)" + type: string + required: false + service_account: + description: "GCP Service Account email (if not using credentials_json)" + type: string + required: false jobs: run-spark-tests: @@ -55,11 +63,12 @@ jobs: - name: GCP authorization - id: gcp-auth - if: ${{ steps.init.outputs.scenarios }} + id: gcp-auth-wif uses: 'google-github-actions/auth@v2' with: - credentials_json: '${{ secrets.gcpKey }}' + workload_identity_provider: '${{ inputs.workload_identity_provider }}' + service_account: '${{ inputs.service_account }}' + project_id: 'gcp-open-lineage-testing' - name: Get OL artifacts id: get-ol-artifacts @@ -186,7 +195,6 @@ jobs: --region us-west1 \ --dataproc-image-version ${{ steps.init.outputs.dataproc_version }} \ --cluster-name "dataproc-producer-test-${{steps.init.outputs.component_cluster_suffix}}-${{ steps.init.outputs.openlineage_cluster_suffix }}-${{ github.run_id }}" \ - --credentials-file ${{ steps.gcp-auth.outputs.credentials_file_path }} \ --metadata "$metadata" \ --initialization-actions="${{ steps.upload-initialization-actions.outputs.uploaded-file }},${{ steps.upload-cloud-sql-initialization-actions.outputs.uploaded-file }}" @@ -229,7 +237,6 @@ jobs: --jars "${{ steps.upload-gcs-transport.outputs.uploaded-file }}" \ --spark-properties "$properties" \ --output-directory "${{ steps.set-producer-output.outputs.event_dir }}/$scenario" \ - --credentials-file "${{ steps.gcp-auth.outputs.credentials_file_path }}" \ --dataproc-image-version ${{ steps.init.outputs.dataproc_version }} then echo "Error: Spark job failed for scenario: $scenario" @@ -250,7 +257,6 @@ jobs: --project-id gcp-open-lineage-testing \ --region us-west1 \ --cluster-name 
"dataproc-producer-test-${{steps.init.outputs.component_cluster_suffix}}-${{steps.init.outputs.openlineage_cluster_suffix}}-${{ github.run_id }}" \ - --credentials-file ${{ steps.gcp-auth.outputs.credentials_file_path }} else echo "Cluster does not exist" fi diff --git a/producer/spark_dataproc/scenarios/hive/events_3.5.1/columnLineage_test.json b/producer/spark_dataproc/scenarios/hive/events_3.5.1/columnLineage_test.json index 8c9dc823..b4fc38fd 100644 --- a/producer/spark_dataproc/scenarios/hive/events_3.5.1/columnLineage_test.json +++ b/producer/spark_dataproc/scenarios/hive/events_3.5.1/columnLineage_test.json @@ -50,4 +50,5 @@ } } ] + } \ No newline at end of file