adding soda data quality checks to pricing data pipeline

digitalghost-dev · digitalghost-dev · commit b603977159cf · 2025-11-24T16:16:53.000-08:00
diff --git a/card_data/pipelines/definitions.py b/card_data/pipelines/definitions.py
@@ -5,7 +5,7 @@
 import dagster as dg
 
 from .defs.extract.extract_pricing_data import build_dataframe
-from .defs.load.load_pricing_data import load_pricing_data
+from .defs.load.load_pricing_data import load_pricing_data, data_quality_checks_on_pricing
 
 
 @definitions
@@ -17,7 +17,7 @@ def defs():
 # Define the pricing pipeline job that materializes the assets and downstream dbt model
 pricing_pipeline_job = dg.define_asset_job(
     name="pricing_pipeline_job",
-    selection=dg.AssetSelection.assets(build_dataframe, load_pricing_data).downstream(include_self=True),
+    selection=dg.AssetSelection.assets(build_dataframe).downstream(include_self=True),
 )
 
 price_schedule = dg.ScheduleDefinition(
@@ -28,7 +28,7 @@ def defs():
 )
 
 defs_pricing = dg.Definitions(
-    assets=[build_dataframe, load_pricing_data],
+    assets=[build_dataframe, load_pricing_data, data_quality_checks_on_pricing],
     jobs=[pricing_pipeline_job],
     schedules=[price_schedule],
 )
diff --git a/card_data/pipelines/defs/load/load_pricing_data.py b/card_data/pipelines/defs/load/load_pricing_data.py
@@ -1,3 +1,6 @@
+import subprocess
+from pathlib import Path
+
 import dagster as dg
 import polars as pl
 from dagster import RetryPolicy, Backoff
@@ -23,3 +26,36 @@ def load_pricing_data(build_pricing_dataframe: pl.DataFrame) -> None:
     except OperationalError as e:
         print(colored(" ✖", "red"), "Connection error in load_pricing_data():", e)
         raise
+
+
+@dg.asset(
+    deps=[load_pricing_data],
+    kinds={"Soda"},
+    name="data_quality_checks_on_pricing",
+)
+def data_quality_checks_on_pricing() -> None:
+    """Run the Soda scan for the pricing data and fail the asset on check failures.
+
+    The asset is declared downstream of ``load_pricing_data`` and shells out to
+    ``soda scan`` against the ``supabase`` data source. The configuration and
+    checks files are given as paths relative to this file's directory, so the
+    subprocess is started with that directory as its working directory.
+
+    Captured stdout/stderr from the scan are echoed so they show up in the run
+    output.
+
+    Raises:
+        Exception: If the scan exits with a code other than 0 (all checks
+            passed) or 1 (warnings only).
+    """
+    current_file_dir = Path(__file__).parent
+    print(f"Setting cwd to: {current_file_dir}")
+
+    result = subprocess.run(
+        [
+            "soda",
+            "scan",
+            "-d",
+            "supabase",
+            "-c",
+            "../../soda/configuration.yml",
+            "../../soda/checks_pricing.yml",
+        ],
+        capture_output=True,
+        text=True,
+        cwd=current_file_dir,
+    )
+
+    if result.stdout:
+        print(result.stdout)
+    if result.stderr:
+        print(result.stderr)
+
+    # Per Soda's documented scan exit codes, 1 means "warnings only". Checks
+    # configured with `warn:` are advisory, so they must not block the
+    # downstream assets the way a failed check (2) or a runtime error (3+) does.
+    if result.returncode == 1:
+        print(f"Soda data quality checks completed with warnings (return code {result.returncode})")
+        return
+
+    if result.returncode != 0:
+        raise Exception(f"Soda data quality checks failed with return code {result.returncode}")
diff --git a/card_data/pipelines/defs/transformation/transform_data.py b/card_data/pipelines/defs/transformation/transform_data.py
@@ -16,7 +16,7 @@ def get_asset_key(self, dbt_resource_props):
                 "series": "quality_checks_series",
                 "sets": "load_set_data",
                 "cards": "load_card_data",
-                "pricing_data": "load_pricing_data",
+                "pricing_data": "data_quality_checks_on_pricing",
             }
             if name in source_mapping:
                 return dg.AssetKey([source_mapping[name]])
diff --git a/card_data/pipelines/soda/checks_pricing.yml b/card_data/pipelines/soda/checks_pricing.yml
@@ -0,0 +1,75 @@
+checks for pricing_data:
+  # Row count validation - currently have 4216 rows
+  # Expect more than 4000 cards
+  - row_count > 4000:
+      name: Minimum row count check
+
+  # Warn if row count drops below the currently expected level (~4200)
+  - row_count > 4200:
+      warn:
+        when fail
+      name: Row count sanity check (warn if below expected)
+
+  # Schema validation checks
+  - schema:
+      fail:
+        when required column missing: [product_id, name, card_number, market_price]
+        when wrong column type:
+          product_id: bigint
+          name: text
+          card_number: text
+          market_price: double precision
+
+  # Completeness checks - product_id, name, card_number should always be present
+  - missing_count(product_id) = 0:
+      name: Product ID completeness
+
+  - missing_count(name) = 0:
+      name: Card name completeness
+
+  - missing_count(card_number) = 0:
+      name: Card number completeness
+
+  # Data uniqueness checks
+  - duplicate_count(product_id) = 0:
+      name: Product ID uniqueness
+
+  # Data format validation
+  # Card numbers should be alphanumeric with slashes (e.g., "013/198", "4", "005/086")
+  - invalid_count(card_number) = 0:
+      valid regex: '^[A-Za-z0-9/]+$'
+      name: Card number format validation
+
+  # Card names should not be empty and should be reasonable length (at most 100 chars)
+  - invalid_count(name) = 0:
+      valid min length: 1
+      valid max length: 100
+      name: Card name length validation
+
+  # Data range validation
+  # Product IDs should be positive numbers with at least 6 digits (observed range: 475k-642k); the upper bound leaves headroom up to 9 digits
+  - invalid_count(product_id) = 0:
+      valid min: 100000
+      valid max: 999999999
+      name: Product ID range validation
+
+  # Market prices (when present) should be positive and reasonable
+  # Current range: $0.02 to $1119.08
+  - invalid_percent(market_price) < 1%:
+      valid min: 0.01
+      valid max: 10000
+      name: Market price range validation ($0.01-$10,000)
+
+  # Statistical validation - average price should be reasonable
+  # Current average is ~$6.01, allow range of $2-$20 for sanity
+  - avg(market_price):
+      warn:
+        when < 2
+        when > 20
+      name: Average market price sanity check
+
+  # Anomaly detection - check for extreme outliers
+  - max(market_price) < 5000:
+      warn:
+        when fail
+      name: Maximum price outlier detection (warn if >$5000)

Original file line number	Diff line number	Diff line change
`@@ -16,7 +16,7 @@ def get_asset_key(self, dbt_resource_props):`
`16`	`16`	`"series": "quality_checks_series",`
`17`	`17`	`"sets": "load_set_data",`
`18`	`18`	`"cards": "load_card_data",`
`19`		`- "pricing_data": "load_pricing_data",`
	`19`	`+ "pricing_data": "data_quality_checks_on_pricing",`
`20`	`20`	`}`
`21`	`21`	`if name in source_mapping:`
`22`	`22`	`return dg.AssetKey([source_mapping[name]])`