# Schema version of this checks file, kept as a quoted string.
11version : " 1"
# Top-level list of per-dataset rules: the "-" line is the removed key name, the "+" line its replacement.
2- validations :
2+ rules :
33 # https://clickhouse.com/docs/getting-started/example-datasets/nyc-taxi
# Rule for the ClickHouse example table nyc_taxi.trips_small.
# Diff view: "-" lines are the removed legacy checks (id / description / on_error),
# "+" lines are their replacements (check expression with desc / on_fail).
44 - dataset : ch@[nyc_taxi.trips_small]
5- # common pre-filter for every check, e.g. to run daily check only for yesterday
# Pre-filter for this dataset's checks (the removed comment above describes it as common to every check).
65 where : " pickup_datetime > '2014-01-01'"
76 checks :
8- - id : row_count > 0
9- description : " data should be present" # optional
10- on_error : alert # optional (ignore, alert), default "alert"
11-
12- - id : row_count between 100 and 30000
13- description : " expected rows count"
14- on_error : ignore
15-
16- - id : null_count(pickup_ntaname) == 0
17- description : " no nulls are allowed in column: pickup_ntaname"
18-
19- - id : min(pickup_datetime) < now() - interval 3 day
20- description : " min(pickup_datetime) should not be earlier than 3 days"
21-
22- - id : stddevPop(trip_distance) < 100_000
23- description : " check stddev value"
24-
25- - id : sum(fare_amount) <= 10_000_000
26- description : " sum of value"
27-
28- - id : countIf(trip_id == 1) == 1
29- description : " check trip id"
30-
31- - id : raw_query
32- description : " raw query quality test"
33- query : |
34- select countIf(trip_distance == 0) > 0 from {{table}} where 1=1
7+ # schema-level checks
8+ - schema_check :
9+ expect_columns_ordered :
10+ columns_order : [trip_id, pickup_datetime, dropoff_datetime, pickup_longitude, pickup_latitude,
11+ dropoff_longitude, dropoff_latitude, passenger_count, trip_distance, fare_amount, extra,
12+ tip_amount, tolls_amount, total_amount, payment_type, pickup_ntaname, dropoff_ntaname]
13+ desc : " Ensure table columns are in the expected order"
14+ on_fail : error
15+
16+ - schema_check :
17+ expect_columns :
18+ columns : [trip_id, fare_amount]
19+ desc : " Ensure required columns exist"
20+ on_fail : error
21+
22+ - schema_check :
23+ columns_not_present :
24+ columns : [credit_card_number, credit_card_cvv]
25+ pattern : " pii_*"
26+ desc : " Ensure PII and credit card info is not present in the table"
27+ on_fail : error
28+
29+ # table-level checks
30+ - row_count between 10000 and 3500000 :
31+ desc : " Dataset should contain a reasonable number of trips"
32+ on_fail : error
33+
34+ # column existence and nullability
35+ - not_null(trip_id) :
36+ desc : " Trip ID is mandatory"
37+ - not_null(pickup_datetime)
38+ - not_null(dropoff_datetime)
39+
40+ # data freshness
# NOTE(review): the pre-filter starts at 2014-01-01; if this example table is a static
# historical extract, a 7d freshness bound will presumably always trip (on_fail is warn) - confirm intent.
41+ - freshness(pickup_datetime) < 7d :
42+ desc : " Data should be no older than 7 days"
43+ on_fail : warn
44+
45+ # uniqueness constraints
46+ - uniqueness(trip_id) :
47+ desc : " Trip IDs must be unique"
48+ on_fail : error
49+
50+ # numeric validations
51+ - min(trip_distance) >= 0 :
52+ desc : " Trip distance cannot be negative"
53+ - max(trip_distance) < 1000 :
54+ desc : " Maximum trip distance seems unrealistic"
55+ on_fail : warn
56+ - avg(trip_distance) between 1.0 and 20.0 :
57+ desc : " Average trip distance should be reasonable"
# NOTE(review): the removed legacy check spelled this stddevPop for this backend;
# confirm that the generic stddev name is translated per database.
58+ - stddev(trip_distance) < 100 :
59+ desc : " Trip distance variation should be within normal range"
60+
61+ # fare validations
62+ - min(fare_amount) > 0 :
63+ desc : " Fare amount should be positive"
64+ - max(fare_amount) < 1000 :
65+ desc : " Maximum fare seems too high"
66+ - sum(fare_amount) between 10000 and 10000000 :
67+ desc : " Total fare amount should be within expected range"
68+
69+ # custom validation with raw query
# NOTE(review): the removed legacy raw query used the {{table}} placeholder and selected a
# boolean expression; this one uses {{dataset}} and selects a count - confirm the placeholder
# name and how a raw_query result is turned into pass/fail.
70+ - raw_query :
71+ desc : " Check for trips with zero distance but positive fare"
72+ query : " select count() from {{dataset}} where trip_distance = 0 and fare_amount > 0"
73+ on_fail : warn
3574
3675 # https://wiki.postgresql.org/wiki/Sample_Databases
# Rule for the PostgreSQL sample table public.land_registry_price_paid_uk.
# Diff view: "-" lines are the removed legacy checks (id / description / on_error),
# "+" lines are their replacements (check expression with desc / on_fail).
3776 - dataset : pg@[public.land_registry_price_paid_uk]
38- # exclude January for example
39- where : " transfer_date >= '2025-02-01 00:00:00.000000'"
# Pre-filter for this dataset; the replacement starts at 2025-01-01, so January is no longer excluded.
77+ where : " transfer_date >= '2025-01-01'"
4078 checks :
41- - id : row_count > 0
42- description : " data should be present"
43- on_error : alert
44-
45- - id : row_count between 200000 and 300000
46- description : " expected rows count"
47- on_error : ignore
48-
49- - id : min(price) > 0
50- description : " min(price) should be greater than zero"
51-
52- - id : max(price) < 100000000
53- description : " max(price) should be less than 100_000_000"
54-
55- - id : stddev_pop(price) < 500000
56- description : " price stddev"
57-
58- # https://github.com/datacharmer/test_db
79+ # schema validation
80+ - schema_check :
81+ expect_columns_ordered :
82+ columns_order : [transaction, price, transfer_date, postcode, property_type, newly_built, duration, paon, saon,
83+ street, locality, city, district, county, ppd_category_type, record_status]
84+ desc : " Validate expected column order for data consistency"
85+ on_fail : warn
86+
87+ - schema_check :
88+ expect_columns :
89+ columns : [transaction, price, property_type]
90+ desc : " Ensure critical columns exist"
91+ on_fail : error
92+
# Written as row_count (no parentheses) to match the row_count checks of the other datasets in this file.
93+ - row_count between 100 and 250000 :
94+ desc : " Recent property transactions should be within expected volume"
95+
96+ # price checks
97+ - not_null(price) :
98+ desc : " Property price is mandatory"
99+ - min(price) >= 100 :
100+ desc : " Minimum price should be realistic"
101+ - max(price) < 50000000 :
102+ desc : " Maximum price should be within UK market range"
103+ - avg(price) between 200000 and 800000 :
104+ desc : " Average property price should align with market data"
105+ - stddev(price) < 500000 :
106+ desc : " Price standard deviation should indicate reasonable market variation"
107+
108+ # property type validations
109+ - not_null(property_type)
110+ - uniqueness(transaction) :
111+ desc : " Each transaction must have a unique identifier"
112+ on_fail : error
113+
114+ # date validations
# NOTE(review): a 1d freshness bound only holds if this table is loaded daily; for a static
# sample import it will presumably always warn - confirm intent.
115+ - freshness(transfer_date) < 1d :
116+ desc : " Transfer date should be very recent"
117+ on_fail : warn
118+
119+
120+ # https://github.com/datacharmer/test_db
# Rule for the MySQL sample table employees.salaries (datacharmer test_db).
# Diff view: "-" lines are the removed legacy checks (id / description / on_error),
# "+" lines are their replacements (check expression with desc / on_fail).
# The "+" checks target the salaries columns (emp_no, salary, from_date, to_date); the numeric
# thresholds are carried over from the removed legacy checks, with stddev_pop written as stddev
# the same way the PostgreSQL rule replaces stddev_pop(price) with stddev(price).
59121 - dataset : mysql@[employees.salaries]
60122 checks :
61- - id : row_count > 0
62- description : " data should be present"
63- on_error : alert
64-
65- - id : min(salary) > 0
66- description : " min(salary) should be greater than zero"
67-
68- - id : max(salary) < 150000
69- description : " max(salary) should be less than 150000"
70-
71- - id : stddev_pop(salary) < 50000
72- description : " price stddev"
123+ # schema validation
124+ - schema_check :
125+ expect_columns_ordered :
126+ columns_order : [emp_no, salary, from_date, to_date]
127+ desc : " Ensure salaries table maintains expected column structure"
128+ on_fail : error
129+
130+ - schema_check :
131+ expect_columns :
132+ columns : [emp_no, salary]
133+ desc : " Ensure essential salary columns exist"
134+ on_fail : error
135+
136+ # row volume validation
137+ - row_count > 0 :
138+ desc : " data should be present"
139+ on_fail : warn
140+
141+ # employee reference integrity
142+ - not_null(emp_no) :
143+ desc : " Every salary record must reference an employee"
144+ on_fail : error
145+ - not_null(salary)
146+
147+ # salary value validations
148+ - min(salary) > 0 :
149+ desc : " min(salary) should be greater than zero"
150+ on_fail : error
151+ - max(salary) < 150000 :
152+ desc : " max(salary) should be less than 150000"
153+ on_fail : warn
154+ - stddev(salary) < 50000 :
155+ desc : " salary stddev should stay below 50000"
0 commit comments