From 26cf13c5c419e14ca07608c6bc71e9be3e9f12e6 Mon Sep 17 00:00:00 2001
From: Claude
Date: Wed, 5 Nov 2025 23:08:18 +0000
Subject: [PATCH 1/3] Add full_name column to stg_customers model

---
 models/staging/stg_customers.sql | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/models/staging/stg_customers.sql b/models/staging/stg_customers.sql
index cad047269..ea4e8a5a6 100644
--- a/models/staging/stg_customers.sql
+++ b/models/staging/stg_customers.sql
@@ -13,7 +13,8 @@ renamed as (
     select
         id as customer_id,
         first_name,
-        last_name
+        last_name,
+        first_name || ' ' || last_name as full_name
 
     from source
 
From 4bebfa285508eb947bbda6ed0107650b1aea7338 Mon Sep 17 00:00:00 2001
From: Claude
Date: Thu, 6 Nov 2025 01:24:55 +0000
Subject: [PATCH 2/3] Add comprehensive tests for stg_customers model

- Add not_null tests for first_name, last_name, and full_name columns
- Add descriptions for all columns
- Add model-level description
---
 models/staging/schema.yml | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/models/staging/schema.yml b/models/staging/schema.yml
index c207e4cf5..a51318b1c 100644
--- a/models/staging/schema.yml
+++ b/models/staging/schema.yml
@@ -2,12 +2,29 @@ version: 2
 
 models:
   - name: stg_customers
+    description: Staging layer for customer data with standardized column names
     columns:
       - name: customer_id
+        description: Unique identifier for each customer
         tests:
           - unique
           - not_null
 
+      - name: first_name
+        description: Customer's first name
+        tests:
+          - not_null
+
+      - name: last_name
+        description: Customer's last name
+        tests:
+          - not_null
+
+      - name: full_name
+        description: Customer's full name (first name + last name)
+        tests:
+          - not_null
+
   - name: stg_orders
     columns:
       - name: order_id
From aaa6d36fd43c783bab37ba0ea8976130f7bcd6f4 Mon Sep 17 00:00:00 2001
From: Claude
Date: Thu, 6 Nov 2025 01:27:40 +0000
Subject: [PATCH 3/3] Add Soda data quality checks for stg_customers model

- Create checks/stg_customers.yml with comprehensive Soda CL checks
- Convert dbt tests to Soda format (unique, not_null)
- Add additional data quality checks (row count, string length, schema validation)
- Add custom SQL check to verify full_name concatenation
- Create checks/configuration.yml for DuckDB connection
- Add checks/README.md with usage instructions and test mappings
---
 checks/README.md         | 76 ++++++++++++++++++++++++++++++++++++++++
 checks/configuration.yml | 16 +++++++++
 checks/stg_customers.yml | 55 +++++++++++++++++++++++++++++
 3 files changed, 147 insertions(+)
 create mode 100644 checks/README.md
 create mode 100644 checks/configuration.yml
 create mode 100644 checks/stg_customers.yml

diff --git a/checks/README.md b/checks/README.md
new file mode 100644
index 000000000..84d959324
--- /dev/null
+++ b/checks/README.md
@@ -0,0 +1,76 @@
+# Soda Data Quality Checks
+
+This directory contains Soda checks for data quality testing of dbt models.
+
+## Setup
+
+1. Install Soda Core for DuckDB:
+```bash
+pip install soda-core-duckdb
+```
+
+2. Build your dbt models first:
+```bash
+dbt run
+```
+
+## Running Checks
+
+### Run all checks for stg_customers:
+```bash
+soda scan -d jaffle_shop -c checks/configuration.yml checks/stg_customers.yml
+```
+
+### Run checks and save results:
+```bash
+soda scan -d jaffle_shop -c checks/configuration.yml --scan-results-file scan_results.json checks/stg_customers.yml
+```
+
+## Check Mappings
+
+The following dbt tests were converted to Soda checks:
+
+### stg_customers Model
+
+| dbt Test | Soda Check | Description |
+|----------|------------|-------------|
+| `customer_id: unique` | `duplicate_count(customer_id) = 0` | Ensures no duplicate customer IDs |
+| `customer_id: not_null` | `missing_count(customer_id) = 0` | Ensures customer_id is always present |
+| `first_name: not_null` | `missing_count(first_name) = 0` | Ensures first_name is always present |
+| `last_name: not_null` | `missing_count(last_name) = 0` | Ensures last_name is always present |
+| `full_name: not_null` | `missing_count(full_name) = 0` | Ensures full_name is always present |
+
+### Additional Checks
+
+The Soda configuration includes additional data quality checks beyond the original dbt tests:
+- **Row count validation**: Ensures table is not empty
+- **String length validation**: Ensures names are not empty strings
+- **Schema validation**: Confirms all expected columns exist
+- **Custom SQL check**: Verifies full_name is correctly concatenated from first_name and last_name
+
+## File Structure
+
+```
+checks/
+├── README.md              # This file
+├── configuration.yml      # Soda data source configuration
+└── stg_customers.yml      # Data quality checks for stg_customers model
+```
+
+## Integration with CI/CD
+
+You can integrate Soda checks into your CI/CD pipeline:
+
+```yaml
+# Example GitHub Actions workflow
+- name: Run Soda Checks
+  run: |
+    dbt run --select stg_customers
+    soda scan -d jaffle_shop -c checks/configuration.yml checks/stg_customers.yml
+```
+
+## Documentation
+
+- [Soda Documentation](https://docs.soda.io/)
+- [Soda CL Reference](https://docs.soda.io/soda-cl/soda-cl-overview.html)
+- [DuckDB Data Source](https://docs.soda.io/soda/connect-duckdb.html)
diff --git a/checks/configuration.yml b/checks/configuration.yml
new file mode 100644
index 000000000..93723cf09
--- /dev/null
+++ b/checks/configuration.yml
@@ -0,0 +1,16 @@
+# Soda configuration file for DuckDB connection
+# This file configures the data source connection for running Soda checks
+
+data_source jaffle_shop:
+  type: duckdb
+  # Update the path to your DuckDB database file. A relative path is presumably resolved
+  # from the directory where `soda scan` is invoked (the project root in the usage below).
+  path: target/jaffle_shop.duckdb
+
+  # Optional: specify schema if needed
+  # schema: main
+
+# Example usage:
+# 1. Install Soda Core for DuckDB: pip install soda-core-duckdb
+# 2. Run dbt to build models: dbt run
+# 3. Run Soda checks: soda scan -d jaffle_shop -c checks/configuration.yml checks/stg_customers.yml
diff --git a/checks/stg_customers.yml b/checks/stg_customers.yml
new file mode 100644
index 000000000..4dc791fb6
--- /dev/null
+++ b/checks/stg_customers.yml
@@ -0,0 +1,55 @@
+# Soda checks for stg_customers model
+# This file defines data quality checks using Soda CL (Checks Language)
+# Run with: soda scan -d jaffle_shop -c checks/configuration.yml checks/stg_customers.yml
+
+checks for stg_customers:
+  # Row count check - ensure table is not empty
+  - row_count > 0
+
+  # customer_id checks (unique, not_null)
+  - missing_count(customer_id) = 0:
+      name: customer_id should not have null values
+  - duplicate_count(customer_id) = 0:
+      name: customer_id should be unique
+  # Opt-in: enable only if customer_id is a text UUID key (TODO confirm column type).
+  # - invalid_count(customer_id) = 0:
+  #     valid format: uuid
+
+  # first_name checks (not_null)
+  - missing_count(first_name) = 0:
+      name: first_name should not have null values
+  - invalid_count(first_name) = 0:
+      valid min length: 1
+      name: first_name should not be empty string
+
+  # last_name checks (not_null)
+  - missing_count(last_name) = 0:
+      name: last_name should not have null values
+  - invalid_count(last_name) = 0:
+      valid min length: 1
+      name: last_name should not be empty string
+
+  # full_name checks (not_null)
+  - missing_count(full_name) = 0:
+      name: full_name should not have null values
+  - invalid_count(full_name) = 0:
+      valid min length: 3
+      name: full_name should contain at least first and last name with space
+
+  # Additional data quality checks
+  - schema:
+      name: Confirm expected schema
+      fail:
+        when required column missing:
+          - customer_id
+          - first_name
+          - last_name
+          - full_name
+
+  # Custom SQL check - NULL-safe comparison, so rows with a NULL name part are reported too
+  - failed rows:
+      name: full_name should match first_name + space + last_name
+      fail query: |
+        SELECT customer_id, first_name, last_name, full_name
+        FROM stg_customers
+        WHERE full_name IS DISTINCT FROM (first_name || ' ' || last_name)