Skip to content

Commit 4599674

Browse files
pawelmarut and roller100 (BearingNode)
authored
Feature/dbt producer compatibility test (#211)
* feat: add dbt producer compatibility test framework - Add atomic test runner with CLI interface and validation - Add OpenLineage event generation and PIE framework integration - Add scenario-based testing structure for csv_to_duckdb_local - Include comprehensive documentation and maintainer info - Add gitignore exclusions for local artifacts and sensitive files This implements a complete dbt producer compatibility test that validates: - OpenLineage event generation from dbt runs - Event schema compliance using PIE framework validation - Column lineage, schema, and SQL facet extraction - Community-standard directory structure and documentation Signed-off-by: roller100 (BearingNode) <contact@bearingnode.com> * Dbt producer compatibility test Signed-off-by: Pawel Marut <pawel.marut@xebia.com> --------- Signed-off-by: roller100 (BearingNode) <contact@bearingnode.com> Signed-off-by: Pawel Marut <pawel.marut@xebia.com> Co-authored-by: roller100 (BearingNode) <contact@bearingnode.com>
1 parent e4fbaaf commit 4599674

Some content is hidden

Large commits have some content hidden by default. Use the search box below for content that may be hidden.

44 files changed

+1743
-28
lines changed

.github/actions/run_event_validation/action.yml

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -55,17 +55,22 @@ runs:
5555
run: |
5656
cd tmp
5757
IFS=',' read -ra TAGS <<< "${{ inputs.release_tags }}"
58+
git fetch --tags --quiet
5859
for TAG in "${TAGS[@]}"; do
5960
echo "Checking out tag: $TAG"
60-
git fetch --tags --quiet
6161
if git checkout --quiet "$TAG"; then
6262
DEST_DIR="../specs/$TAG"
63-
if [ -d "spec" ]; then
64-
mkdir -p "../specs/$TAG"
65-
find spec -path './website' -prune -o -type f \( -name '*Facet.json' -o -name 'OpenLineage.json' \) -exec cp {} "../specs/$TAG/" \;
63+
if [[ -d "spec" || -d "integration/common/openlineage" ]]; then
64+
mkdir -p "$DEST_DIR"
65+
if [ -d "spec" ]; then
66+
find spec -path './website' -prune -o -type f \( -name '*Facet.json' -o -name 'OpenLineage.json' \) -exec cp {} "$DEST_DIR" \;
67+
fi
68+
if [ -d "integration/common/src/openlineage" ]; then
69+
find integration/common/src/openlineage -type f -iname '*facet.json' -exec cp {} "$DEST_DIR" \;
70+
fi
6671
echo "success"
6772
else
68-
echo "Spec directory not found in $TAG"
73+
echo "Neither spec nor integration/common/src/openlineage directory found in $TAG"
6974
fi
7075
else
7176
echo "Tag $TAG not found!"

.github/workflows/main_new_release.yml

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,10 @@ on:
1717
description: 'Run Hive Dataproc tests'
1818
required: false
1919
default: 'true'
20+
run_dbt:
21+
description: 'Run DBT tests'
22+
required: false
23+
default: 'true'
2024
openlineage_release:
2125
description: 'Override OpenLineage release version'
2226
required: false
@@ -26,6 +30,9 @@ on:
2630
hive_matrix:
2731
description: 'Overwrite matrix for hive tests'
2832
required: false
33+
dbt_matrix:
34+
description: 'Overwrite matrix for dbt tests'
35+
required: false
2936

3037
permissions:
3138
id-token: write
@@ -40,9 +47,11 @@ jobs:
4047
run_dataplex: ${{ github.event.inputs.run_dataplex || 'true' }}
4148
run_spark_dataproc: ${{ github.event.inputs.run_spark_dataproc || 'true' }}
4249
run_hive_dataproc: ${{ github.event.inputs.run_hive_dataproc || 'true' }}
50+
run_dbt: ${{ github.event.inputs.run_dbt || 'true' }}
4351
openlineage_release: ${{ github.event.inputs.openlineage_release || steps.select-components.outputs.ol_release }}
4452
spark_matrix: ${{ github.event.inputs.spark_matrix || steps.set-matrix-values.outputs.spark_dataproc_matrix }}
4553
hive_matrix: ${{ github.event.inputs.hive_matrix || steps.set-matrix-values.outputs.hive_dataproc_matrix }}
54+
dbt_matrix: ${{ github.event.inputs.dbt_matrix || steps.set-matrix-values.outputs.dbt_matrix }}
4655
execution_time: ${{ steps.get-execution-time.outputs.execution_time }}
4756
steps:
4857
- name: Get execution time
@@ -90,6 +99,7 @@ jobs:
9099
91100
echo "spark_dataproc_matrix=$(get_matrix spark_dataproc)" >> $GITHUB_OUTPUT
92101
echo "hive_dataproc_matrix=$(get_matrix hive_dataproc)" >> $GITHUB_OUTPUT
102+
echo "dbt_matrix=$(get_matrix dbt)" >> $GITHUB_OUTPUT
93103
94104
######## COMPONENT VALIDATION ########
95105

@@ -130,6 +140,17 @@ jobs:
130140
component_release: ${{ matrix.component_version }}
131141
get-latest-snapshots: 'false'
132142

143+
dbt:
144+
needs: initialize_workflow
145+
if: ${{ needs.initialize_workflow.outputs.run_dbt == 'true' }}
146+
uses: ./.github/workflows/producer_dbt.yml
147+
strategy:
148+
matrix: ${{ fromJson(needs.initialize_workflow.outputs.dbt_matrix) }}
149+
with:
150+
ol_release: ${{ matrix.openlineage_versions }}
151+
dbt_release: ${{ matrix.component_version }}
152+
get-latest-snapshots: 'false'
153+
133154
######## COLLECTION OF REPORTS AND EXECUTE APPROPRIATE ACTIONS ########
134155

135156
collect-and-compare-reports:
@@ -138,6 +159,7 @@ jobs:
138159
- dataplex
139160
- spark-dataproc
140161
- hive-dataproc
162+
- dbt
141163
if: ${{ !failure() }}
142164
uses: ./.github/workflows/collect_and_compare_reports.yml
143165

.github/workflows/main_ol_spec_changes.yml

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,9 @@ on:
1919
hive_matrix:
2020
description: 'Overwrite matrix for hive tests'
2121
required: false
22+
dbt_matrix:
23+
description: 'Overwrite matrix for dbt tests'
24+
required: false
2225

2326

2427
permissions:
@@ -35,6 +38,7 @@ jobs:
3538
ol_release: ${{ github.event.inputs.openlineage_release || steps.get-release.outputs.openlineage_release }}
3639
spark_matrix: ${{ github.event.inputs.spark_matrix || steps.set-matrix-values.outputs.spark_dataproc_matrix }}
3740
hive_matrix: ${{ github.event.inputs.hive_matrix || steps.set-matrix-values.outputs.hive_dataproc_matrix }}
41+
dbt_matrix: ${{ github.event.inputs.dbt_matrix || steps.set-matrix-values.outputs.dbt_matrix }}
3842
execution_time: ${{ steps.get-execution-time.outputs.execution_time }}
3943
steps:
4044
- name: Get execution time
@@ -108,6 +112,7 @@ jobs:
108112
109113
echo "spark_dataproc_matrix=$(get_matrix spark_dataproc)" >> $GITHUB_OUTPUT
110114
echo "hive_dataproc_matrix=$(get_matrix hive_dataproc)" >> $GITHUB_OUTPUT
115+
echo "dbt_matrix=$(get_matrix dbt)" >> $GITHUB_OUTPUT
111116
112117
113118
######## COMPONENT VALIDATION ########
@@ -154,6 +159,18 @@ jobs:
154159
component_release: ${{ matrix.component_version }}
155160
get-latest-snapshots: 'true'
156161

162+
dbt:
163+
needs:
164+
- initialize_workflow
165+
if: ${{ success() && needs.initialize_workflow.outputs.changes_in_spec == 'true' }}
166+
uses: ./.github/workflows/producer_dbt.yml
167+
strategy:
168+
matrix: ${{ fromJson(needs.initialize_workflow.outputs.dbt_matrix) }}
169+
with:
170+
ol_release: ${{ matrix.openlineage_versions }}
171+
dbt_release: ${{ matrix.component_version }}
172+
get-latest-snapshots: 'false'
173+
157174
######## COLLECTION OF REPORTS AND EXECUTE APPROPRIATE ACTIONS ########
158175

159176
collect-and-compare-reports:
@@ -162,6 +179,7 @@ jobs:
162179
- scenarios_check
163180
- spark-dataproc
164181
- hive-dataproc
182+
- dbt
165183
uses: ./.github/workflows/collect_and_compare_reports.yml
166184
with:
167185
fail-for-new-failures: true

.github/workflows/main_pr.yml

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,10 +19,12 @@ jobs:
1919
run_scenarios: ${{ steps.get-changed.outputs.scenarios_changed }}
2020
run_spark_dataproc: ${{ steps.get-changed.outputs.spark_dataproc_changed }}
2121
run_hive_dataproc: ${{ steps.get-changed.outputs.hive_dataproc_changed }}
22+
run_dbt: ${{ steps.get-changed.outputs.dbt_changed }}
2223
ol_release: ${{ steps.get-release.outputs.openlineage_release }}
2324
any_run: ${{ steps.get-changed.outputs.any_changed }}
2425
spark_matrix: ${{ steps.set-matrix-values.outputs.spark_dataproc_matrix }}
2526
hive_matrix: ${{ steps.set-matrix-values.outputs.hive_dataproc_matrix }}
27+
dbt_matrix: ${{ steps.set-matrix-values.outputs.dbt_matrix }}
2628
steps:
2729
- name: Checkout code
2830
uses: actions/checkout@v4
@@ -55,8 +57,9 @@ jobs:
5557
dataplex=$(check_path "consumer/consumers/dataplex/" "dataplex_changed")
5658
spark_dataproc=$(check_path "producer/spark_dataproc/" "spark_dataproc_changed")
5759
hive_dataproc=$(check_path "producer/hive_dataproc/" "hive_dataproc_changed")
60+
dbt=$(check_path "producer/dbt/" "dbt_changed")
5861
59-
if [[ $scenarios || $dataplex || $spark_dataproc || $hive_dataproc ]]; then
62+
if [[ $scenarios || $dataplex || $spark_dataproc || $hive_dataproc || $dbt ]]; then
6063
echo "any_changed=true" >> $GITHUB_OUTPUT
6164
fi
6265
fi
@@ -94,6 +97,7 @@ jobs:
9497
9598
echo "spark_dataproc_matrix=$(get_matrix spark_dataproc)" >> $GITHUB_OUTPUT
9699
echo "hive_dataproc_matrix=$(get_matrix hive_dataproc)" >> $GITHUB_OUTPUT
100+
echo "dbt_matrix=$(get_matrix dbt)" >> $GITHUB_OUTPUT
97101
98102
99103
######## COMPONENT VALIDATION ########
@@ -145,6 +149,17 @@ jobs:
145149
component_release: ${{ matrix.component_version }}
146150
get-latest-snapshots: 'false'
147151

152+
dbt:
153+
needs: initialize_workflow
154+
if: ${{ needs.initialize_workflow.outputs.run_dbt == 'true' }}
155+
uses: ./.github/workflows/producer_dbt.yml
156+
strategy:
157+
matrix: ${{ fromJson(needs.initialize_workflow.outputs.dbt_matrix) }}
158+
with:
159+
dbt_release: ${{ matrix.component_version }}
160+
ol_release: ${{ matrix.openlineage_versions }}
161+
get-latest-snapshots: 'false'
162+
148163
######## COLLECTION OF REPORTS AND EXECUTE APPROPRIATE ACTIONS ########
149164

150165
collect-and-compare-reports:
@@ -154,6 +169,7 @@ jobs:
154169
- dataplex
155170
- spark_dataproc
156171
- hive_dataproc
172+
- dbt
157173
if: ${{ !failure() && needs.initialize_workflow.outputs.any_run == 'true'}}
158174
uses: ./.github/workflows/collect_and_compare_reports.yml
159175
with:

.github/workflows/producer_dbt.yml

Lines changed: 128 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,128 @@
1+
name: dbt Producer
2+
3+
on:
4+
workflow_call:
5+
inputs:
6+
dbt_release:
7+
description: "release of dbt-core to use"
8+
type: string
9+
ol_release:
10+
description: "release tag of OpenLineage to use"
11+
type: string
12+
get-latest-snapshots:
13+
description: "Should the artifact be downloaded from maven repo or circleci"
14+
type: string
15+
workflow_dispatch:
16+
inputs:
17+
dbt_release:
18+
description: "release of dbt-core to use"
19+
type: string
20+
default: "1.8.0"
21+
ol_release:
22+
description: "release tag of OpenLineage to use"
23+
type: string
24+
default: "1.23.0"
25+
get-latest-snapshots:
26+
description: "Should the artifact be downloaded from maven repo or circleci"
27+
type: string
28+
default: "false"
29+
30+
jobs:
31+
run-dbt-tests:
32+
runs-on: ubuntu-latest
33+
34+
services:
35+
postgres:
36+
image: postgres:15-alpine
37+
env:
38+
POSTGRES_USER: testuser
39+
POSTGRES_PASSWORD: testpass
40+
POSTGRES_DB: dbt_test
41+
ports:
42+
- 5432:5432
43+
options: >-
44+
--health-cmd "pg_isready -U testuser -d dbt_test"
45+
--health-interval 10s
46+
--health-timeout 5s
47+
--health-retries 5
48+
49+
steps:
50+
- name: Checkout code
51+
uses: actions/checkout@v4
52+
53+
- name: Initialize tests
54+
id: init
55+
run: |
56+
scenarios=$(./scripts/get_valid_test_scenarios.sh "producer/dbt/scenarios/" ${{ inputs.dbt_release }} ${{ inputs.ol_release }} )
57+
if [[ "$scenarios" != "" ]]; then
58+
echo "scenarios=$scenarios" >> $GITHUB_OUTPUT
59+
echo "Found scenarios: $scenarios"
60+
else
61+
echo "No valid scenarios found for dbt ${{ inputs.dbt_release }} and OL ${{ inputs.ol_release }}"
62+
fi
63+
64+
- name: Set up Python 3.12
65+
if: ${{ steps.init.outputs.scenarios }}
66+
uses: actions/setup-python@v5
67+
with:
68+
python-version: "3.12"
69+
70+
- name: Install dbt dependencies
71+
if: ${{ steps.init.outputs.scenarios }}
72+
run: |
73+
python -m pip install --upgrade pip
74+
pip install dbt-core==${{ inputs.dbt_release }}
75+
pip install dbt-postgres
76+
pip install openlineage-dbt==${{ inputs.ol_release }}
77+
78+
- name: Set producer output event dir
79+
if: ${{ steps.init.outputs.scenarios }}
80+
id: set-producer-output
81+
run: |
82+
echo "event_dir=/tmp/dbt-events-$(date +%s%3N)" >> $GITHUB_OUTPUT
83+
84+
- name: Run dbt scenarios and create OL events
85+
if: ${{ steps.init.outputs.scenarios }}
86+
id: run-producer
87+
continue-on-error: true
88+
run: |
89+
set -e
90+
IFS=';' read -ra scenarios <<< "${{ steps.init.outputs.scenarios }}"
91+
92+
for scenario in "${scenarios[@]}"
93+
do
94+
echo "Running dbt scenario: $scenario"
95+
96+
mkdir -p "${{ steps.set-producer-output.outputs.event_dir }}/$scenario"
97+
bash producer/dbt/scenarios/$scenario/test/run.sh "${{ steps.set-producer-output.outputs.event_dir }}/$scenario"
98+
99+
echo "Finished running scenario: $scenario"
100+
done
101+
102+
echo "Finished running all scenarios"
103+
104+
- uses: actions/upload-artifact@v4
105+
if: ${{ steps.init.outputs.scenarios }}
106+
with:
107+
name: dbt-${{inputs.dbt_release}}-${{inputs.ol_release}}-events
108+
path: ${{ steps.set-producer-output.outputs.event_dir }}
109+
retention-days: 1
110+
111+
- name: Validation
112+
if: ${{ steps.init.outputs.scenarios }}
113+
uses: ./.github/actions/run_event_validation
114+
with:
115+
component: 'dbt'
116+
producer-dir: 'producer'
117+
release_tags: ${{ inputs.get-latest-snapshots == 'true' && 'main' || inputs.ol_release }}
118+
ol_release: ${{ inputs.ol_release }}
119+
component_release: ${{ inputs.dbt_release }}
120+
event-directory: ${{ steps.set-producer-output.outputs.event_dir }}
121+
target-path: 'dbt-${{inputs.dbt_release}}-${{inputs.ol_release}}-report.json'
122+
123+
- uses: actions/upload-artifact@v4
124+
if: ${{ steps.init.outputs.scenarios }}
125+
with:
126+
name: dbt-${{inputs.dbt_release}}-${{inputs.ol_release}}-report
127+
path: dbt-${{inputs.dbt_release}}-${{inputs.ol_release}}-report.json
128+
retention-days: 1

.gitignore

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,9 @@ __pycache__/
66
# C extensions
77
*.so
88

9+
#Status files and documentation
10+
Status/
11+
912
# Distribution / packaging
1013
.Python
1114
build/
@@ -164,4 +167,10 @@ cython_debug/
164167
.idea/
165168

166169
ignored/
167-
bin/
170+
bin/
171+
172+
# OpenLineage event files generated during local testing
173+
**/specs/
174+
**/output/
175+
**/test/openlineage.yml
176+
dbt_producer_report.json

generated-files/releases.json

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,10 @@
77
"name": "spark_dataproc",
88
"latest_version": ""
99
},
10+
{
11+
"name": "dbt",
12+
"latest_version": "1.8.0"
13+
},
1014
{
1115
"name": "openlineage",
1216
"latest_version": "1.40.1"

0 commit comments

Comments
 (0)