Skip to content

Commit ca37751

Browse files
committed
dbqctl v0.5.0 - updated core, new checks format, checks output improvements
1 parent 1644c21 commit ca37751

File tree

7 files changed

+294
-130
lines changed

7 files changed

+294
-130
lines changed

checks.yaml

Lines changed: 144 additions & 61 deletions
Original file line numberDiff line numberDiff line change
@@ -1,72 +1,155 @@
11
version: "1"
2-
validations:
2+
rules:
33
# https://clickhouse.com/docs/getting-started/example-datasets/nyc-taxi
44
- dataset: ch@[nyc_taxi.trips_small]
5-
# common pre-filter for every check, e.g. to run daily check only for yesterday
65
where: "pickup_datetime > '2014-01-01'"
76
checks:
8-
- id: row_count > 0
9-
description: "data should be present" # optional
10-
on_error: alert # optional (ignore, alert), default "alert"
11-
12-
- id: row_count between 100 and 30000
13-
description: "expected rows count"
14-
on_error: ignore
15-
16-
- id: null_count(pickup_ntaname) == 0
17-
description: "no nulls are allowed in column: pickup_ntaname"
18-
19-
- id: min(pickup_datetime) < now() - interval 3 day
20-
description: "min(pickup_datetime) should not be earlier than 3 days"
21-
22-
- id: stddevPop(trip_distance) < 100_000
23-
description: "check stddev value"
24-
25-
- id: sum(fare_amount) <= 10_000_000
26-
description: "sum of value"
27-
28-
- id: countIf(trip_id == 1) == 1
29-
description: "check trip id"
30-
31-
- id: raw_query
32-
description: "raw query quality test"
33-
query: |
34-
select countIf(trip_distance == 0) > 0 from {{table}} where 1=1
7+
# schema-level checks
8+
- schema_check:
9+
expect_columns_ordered:
10+
columns_order: [trip_id, pickup_datetime, dropoff_datetime, pickup_longitude, pickup_latitude,
11+
dropoff_longitude, dropoff_latitude, passenger_count, trip_distance, fare_amount, extra,
12+
tip_amount, tolls_amount, total_amount, payment_type, pickup_ntaname, dropoff_ntaname]
13+
desc: "Ensure table columns are in the expected order"
14+
on_fail: error
15+
16+
- schema_check:
17+
expect_columns:
18+
columns: [trip_id, fare_amount]
19+
desc: "Ensure required columns exist"
20+
on_fail: error
21+
22+
- schema_check:
23+
columns_not_present:
24+
columns: [credit_card_number, credit_card_cvv]
25+
pattern: "pii_*"
26+
desc: "Ensure PII and credit card info is not present in the table"
27+
on_fail: error
28+
29+
# table-level checks
30+
- row_count between 10000 and 3500000:
31+
desc: "Dataset should contain a reasonable number of trips"
32+
on_fail: error
33+
34+
# column existence and nullability
35+
- not_null(trip_id):
36+
desc: "Trip ID is mandatory"
37+
- not_null(pickup_datetime)
38+
- not_null(dropoff_datetime)
39+
40+
# data freshness
41+
- freshness(pickup_datetime) < 7d:
42+
desc: "Data should be no older than 7 days"
43+
on_fail: warn
44+
45+
# uniqueness constraints
46+
- uniqueness(trip_id):
47+
desc: "Trip IDs must be unique"
48+
on_fail: error
49+
50+
# numeric validations
51+
- min(trip_distance) >= 0:
52+
desc: "Trip distance cannot be negative"
53+
- max(trip_distance) < 1000:
54+
desc: "Maximum trip distance seems unrealistic"
55+
on_fail: warn
56+
- avg(trip_distance) between 1.0 and 20.0:
57+
desc: "Average trip distance should be reasonable"
58+
- stddev(trip_distance) < 100:
59+
desc: "Trip distance variation should be within normal range"
60+
61+
# fare validations
62+
- min(fare_amount) > 0:
63+
desc: "Fare amount should be positive"
64+
- max(fare_amount) < 1000:
65+
desc: "Maximum fare seems too high"
66+
- sum(fare_amount) between 10000 and 10000000:
67+
desc: "Total fare amount should be within expected range"
68+
69+
# custom validation with raw query
70+
- raw_query:
71+
desc: "Check for trips with zero distance but positive fare"
72+
query: "select count() from {{dataset}} where trip_distance = 0 and fare_amount > 0"
73+
on_fail: warn
3574

3675
# https://wiki.postgresql.org/wiki/Sample_Databases
3776
- dataset: pg@[public.land_registry_price_paid_uk]
38-
# exclude January for example
39-
where: "transfer_date >= '2025-02-01 00:00:00.000000'"
77+
where: "transfer_date >= '2025-01-01'"
4078
checks:
41-
- id: row_count > 0
42-
description: "data should be present"
43-
on_error: alert
44-
45-
- id: row_count between 200000 and 300000
46-
description: "expected rows count"
47-
on_error: ignore
48-
49-
- id: min(price) > 0
50-
description: "min(price) should be greater than zero"
51-
52-
- id: max(price) < 100000000
53-
description: "max(price) should be less than 100_000_000"
54-
55-
- id: stddev_pop(price) < 500000
56-
description: "price stddev"
57-
58-
# https://github.com/datacharmer/test_db
79+
# schema validation
80+
- schema_check:
81+
expect_columns_ordered:
82+
columns_order: [transaction, price, transfer_date, postcode, property_type, newly_built, duration, paon, saon,
83+
street, locality, city, district, county, ppd_category_type, record_status]
84+
desc: "Validate expected column order for data consistency"
85+
on_fail: warn
86+
87+
- schema_check:
88+
expect_columns:
89+
columns: [transaction, price, property_type]
90+
desc: "Ensure critical columns exist"
91+
on_fail: error
92+
93+
- row_count() between 100 and 250000:
94+
desc: "Recent property transactions should be within expected volume"
95+
96+
# price checks
97+
- not_null(price):
98+
desc: "Property price is mandatory"
99+
- min(price) >= 100:
100+
desc: "Minimum price should be realistic"
101+
- max(price) < 50000000:
102+
desc: "Maximum price should be within UK market range"
103+
- avg(price) between 200000 and 800000:
104+
desc: "Average property price should align with market data"
105+
- stddev(price) < 500000:
106+
desc: "Price standard deviation should indicate reasonable market variation"
107+
108+
# property type validations
109+
- not_null(property_type)
110+
- uniqueness(transaction):
111+
desc: "Each transaction must have a unique identifier"
112+
on_fail: error
113+
114+
# date validations
115+
- freshness(transfer_date) < 1d:
116+
desc: "Transfer date should be very recent"
117+
on_fail: warn
118+
119+
120+
# https://github.com/datacharmer/test_db
59121
- dataset: mysql@[employees.salaries]
60122
checks:
61-
- id: row_count > 0
62-
description: "data should be present"
63-
on_error: alert
64-
65-
- id: min(salary) > 0
66-
description: "min(salary) should be greater than zero"
67-
68-
- id: max(salary) < 150000
69-
description: "max(salary) should be less than 150000"
70-
71-
- id: stddev_pop(salary) < 50000
72-
description: "salary stddev"
123+
# schema validation
124+
- schema_check:
125+
expect_columns_ordered:
126+
columns_order: [order_id, customer_id, order_status, total_amount, item_count, created_at, shipped_date]
127+
desc: "Ensure order table maintains expected column structure"
128+
on_fail: error
129+
130+
- schema_check:
131+
expect_columns:
132+
columns: [order_id, customer_id, order_status, total_amount]
133+
desc: "Ensure essential order columns exist"
134+
on_fail: error
135+
136+
# order volume validation
137+
- row_count between 100 and 10000:
138+
desc: "Monthly order volume should be within business expectations"
139+
on_fail: warn
140+
141+
# customer data integrity
142+
- not_null(customer_id):
143+
desc: "Every order must have a customer ID"
144+
on_fail: error
145+
- not_null(order_status)
146+
147+
# order value validations
148+
- min(total_amount) > 0:
149+
desc: "Order total must be positive"
150+
on_fail: error
151+
- max(total_amount) < 10000:
152+
desc: "Unusually high order amount detected"
153+
on_fail: warn
154+
- avg(total_amount) between 25.0 and 200.0:
155+
desc: "Average order value should align with business metrics"

cmd/check.go

Lines changed: 30 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -27,9 +27,10 @@ import (
2727
)
2828

2929
type FailedCheckDetails struct {
30-
ID string
31-
Dataset string
32-
Err error
30+
Expression string
31+
Dataset string
32+
ActualVal string
33+
Err string
3334
}
3435

3536
func NewCheckCommand(app internal.DbqCliApp) *cobra.Command {
@@ -56,7 +57,7 @@ By automating these checks, you can proactively identify and address data qualit
5657
passedCount := 0
5758
var failedChecks []FailedCheckDetails
5859

59-
for _, rule := range checksCfg.Validations {
60+
for _, rule := range checksCfg.Rules {
6061
dataSourceId, datasets, err := parseDatasetString(rule.Dataset)
6162
if err != nil {
6263
return fmt.Errorf("error while parsing dataset property: %w", err)
@@ -70,20 +71,26 @@ By automating these checks, you can proactively identify and address data qualit
7071
for _, dataset := range datasets {
7172
fmt.Printf("running %d quality checks for '%s'\n", len(rule.Checks), dataset)
7273
for _, check := range rule.Checks {
73-
pass, _, err := app.RunCheck(&check, dataSource, dataset, rule.Where)
74+
validationResult := app.RunCheck(&check, dataSource, dataset, rule.Where)
7475

75-
fmt.Printf(" check %s ... %s\n", check.Description, getCheckResultLabel(pass))
76-
if pass {
76+
checkLabel := check.Expression
77+
if check.Description != "" {
78+
checkLabel = check.Description
79+
}
80+
81+
fmt.Printf(" %s: %s \n", getCheckResultLabel(validationResult.Pass), checkLabel)
82+
if validationResult.Pass {
7783
passedCount += 1
7884
} else {
7985
failedChecks = append(failedChecks, FailedCheckDetails{
80-
ID: check.ID,
81-
Dataset: dataset,
82-
Err: err,
86+
Expression: check.Expression,
87+
Dataset: dataset,
88+
ActualVal: validationResult.QueryResultValue,
89+
Err: validationResult.Error,
8390
})
8491
}
8592

86-
if !pass && strGetOrDefault(string(check.OnError), string(dbqcore.OnErrorActionAlert)) == string(dbqcore.OnErrorActionAlert) {
93+
if !validationResult.Pass && strGetOrDefault(string(check.OnFail), string(dbqcore.OnFailActionError)) == string(dbqcore.OnFailActionError) {
8794
exitCode = 1
8895
}
8996
}
@@ -93,8 +100,18 @@ By automating these checks, you can proactively identify and address data qualit
93100
if len(failedChecks) != 0 {
94101
for _, result := range failedChecks {
95102
fmt.Println()
96-
fmt.Printf("--- %s ---\n", result.ID)
97-
fmt.Printf("error: %s\n", result.Err)
103+
fmt.Printf("--- %s : %s ---\n", result.Dataset, result.Expression)
104+
if result.ActualVal != "" {
105+
units := ""
106+
if strings.HasPrefix(result.Expression, "freshness") {
107+
units = " (diff in seconds)"
108+
}
109+
110+
fmt.Printf("actual value: %s%s\n", result.ActualVal, units)
111+
}
112+
if result.Err != "" {
113+
fmt.Printf("error: %s\n", result.Err)
114+
}
98115
}
99116
}
100117

cmd/version.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ import (
2222
)
2323

2424
const (
25-
DbqCtlVersion = "v0.4.0"
25+
DbqCtlVersion = "v0.5.0"
2626
)
2727

2828
func NewVersionCommand() *cobra.Command {

go.mod

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ module github.com/DataBridgeTech/dbqctl
33
go 1.24.5
44

55
require (
6-
github.com/DataBridgeTech/dbqcore v0.4.0
6+
github.com/DataBridgeTech/dbqcore v0.5.2
77
github.com/spf13/cobra v1.9.1
88
github.com/spf13/pflag v1.0.7
99
github.com/spf13/viper v1.20.1

go.sum

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,10 @@ github.com/ClickHouse/clickhouse-go/v2 v2.39.0/go.mod h1:m13KylpdcPzpIjznlfXp53I
77
github.com/DataBridgeTech/dbqcore v0.3.0/go.mod h1:frSX2soCK0jiZMu1rcPPbvHar2KMtKL0/Hh2fHiQJ6s=
88
github.com/DataBridgeTech/dbqcore v0.4.0 h1:GEoH9o6ByltB6VUil02Wu6/vA2qarVcFd7j18h5Eeiw=
99
github.com/DataBridgeTech/dbqcore v0.4.0/go.mod h1:frSX2soCK0jiZMu1rcPPbvHar2KMtKL0/Hh2fHiQJ6s=
10+
github.com/DataBridgeTech/dbqcore v0.5.1 h1:s/30Mqp3uhBUB5moLYLQ1gc2TRDgnHqihH1u2P8gm0A=
11+
github.com/DataBridgeTech/dbqcore v0.5.1/go.mod h1:/uaxo2GiU3UjeIQ1KC2XcIHXZkDbE5wLoab+eR1PIgg=
12+
github.com/DataBridgeTech/dbqcore v0.5.2 h1:3vQLhskFYfLrGngFHvc7LlJRs1+ZaswMaFZ51r5yeZw=
13+
github.com/DataBridgeTech/dbqcore v0.5.2/go.mod h1:/uaxo2GiU3UjeIQ1KC2XcIHXZkDbE5wLoab+eR1PIgg=
1014
github.com/andybalholm/brotli v1.2.0 h1:ukwgCxwYrmACq68yiUqwIWnGY0cTPox/M94sVwToPjQ=
1115
github.com/andybalholm/brotli v1.2.0/go.mod h1:rzTDkvFWvIrjDXZHkuS16NPggd91W3kUSvPlQ1pLaKY=
1216
github.com/cpuguy83/go-md2man/v2 v2.0.6/go.mod h1:oOW0eioCTA6cOiMLiUPZOpcVxMig6NIQQ7OS05n1F4g=

0 commit comments

Comments
 (0)