Skip to content

Commit c5ded72

Browse files
authored
Merge pull request #1967 from LBHackney-IT/di-471-improvements-to-dq-outputs
Improvements to data quality results output for improved analysis
2 parents 89d424d + d3e72e3 commit c5ded72

File tree

4 files changed

+137
-101
lines changed

4 files changed

+137
-101
lines changed

scripts/helpers/housing_gx_dq_inputs.py

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
sql_config = {'person_reshape': {
22
'sql': """SELECT *, substr(startdate, 1, 10) as startdate_parsed, substr(enddate, 1, 10) as enddate_parsed,
3-
substr(dateofbirth, 1, 10) as dateofbirth_parsed FROM "housing-refined-zone"."person_reshape" WHERE import_date = (SELECT max(import_date) FROM "housing-refined-zone"."person_reshape") AND enddate IS NULL AND type IN ('Secure', 'Introductory') and substr(dateofbirth, 1, 10) between '1850-01-01' and '2100-01-01' and substr(startdate, 1, 10) between '1900-01-01' and '2100-01-01'""",
3+
substr(dateofbirth, 1, 10) as dateofbirth_parsed FROM "housing-refined-zone"."person_reshape" WHERE import_date = (SELECT max(import_date) FROM "housing-refined-zone"."person_reshape") AND enddate IS NULL AND type IN ('Secure', 'Introductory')""",
44
'id_field': 'person_id'},
55
'tenure_reshape': {
66
'sql': """SELECT * FROM "housing-refined-zone"."tenure_reshape" where import_date>'20240412' and import_date=(select max(import_date) from "housing-refined-zone"."tenure_reshape" where import_date>'20240412') and isterminated=False and description in ('Secure', 'Introductory')""",
@@ -24,11 +24,18 @@
2424

2525
partition_keys = ['import_year', 'import_month', 'import_day', 'import_date']
2626

27-
dq_dimensions_map = {'expect_column_value_lengths_to_be_between': 'ACCURACY',
27+
dq_dimensions_map = {'expect_column_value_lengths_to_be_between': 'VALIDITY',
28+
'expect_first_name_column_value_lengths': 'VALIDITY',
29+
'expect_surname_column_value_lengths': 'VALIDITY',
2830
'expect_column_values_to_be_unique': 'UNIQUENESS',
2931
'expect_column_values_to_match_regex': 'VALIDITY',
32+
'expect_uprn_column_values_to_match_regex': 'VALIDITY',
3033
'expect_column_values_to_be_in_set': 'CONSISTENCY',
34+
'expect_person_type_values_to_be_in_set': 'CONSISTENCY',
35+
'expect_preferred_title_values_to_be_in_set': 'CONSISTENCY',
3136
'expect_select_column_values_to_be_unique_within_record': 'UNIQUENESS',
3237
'expect_column_values_to_not_be_null': 'COMPLETENESS',
33-
'expect_column_values_to_be_between': 'VALIDITY'
38+
'expect_uprn_not_to_be_null': 'COMPLETENESS',
39+
'expect_column_values_to_be_between': 'VALIDITY',
40+
'expect_date_of_birth_to_be_between': 'VALIDITY'
3441
}

scripts/jobs/housing/housing_apply_gx_dq_tests.py

Lines changed: 21 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -105,12 +105,8 @@ def main():
105105
)
106106

107107
checkpoint_result = checkpoint.run(batch_parameters=batch_parameters)
108-
results_dict = checkpoint_result.describe_dict()
109-
# Serialize the result to handle any datetime objects
110-
json_results = json.dumps(results_dict, default=json_serial)
111-
logger.info(f"json_results: {json_results}")
112-
results = json.loads(json_results)
113-
table_results_df = pd.json_normalize(results['validation_results'][0]['expectations'])
108+
results_dict = list(checkpoint_result.run_results.values())[0].to_json_dict()
109+
table_results_df = pd.json_normalize(results_dict['results'])
114110
table_results_df_list.append(table_results_df)
115111

116112
# generate id lists for each unexpected result set
@@ -125,33 +121,41 @@ def main():
125121
results_df = pd.concat(table_results_df_list)
126122

127123
# map DQ dimension type
128-
results_df['dq_dimension_type'] = results_df['expectation_type'].map(dq_dimensions_map)
124+
results_df['dq_dimension_type'] = results_df['expectation_config.type'].map(dq_dimensions_map)
129125

130126
# add clean dataset name
131-
results_df['table_name'] = results_df['kwargs.batch_id'].map(
127+
results_df['dataset_name'] = results_df['expectation_config.kwargs.batch_id'].map(
132128
lambda x: x.removeprefix('pandas-').removesuffix('_df_asset'))
133129

130+
# add composite key for each specific test (so it can be tracked over time)
131+
results_df.insert(loc=0, column='expectation_key',
132+
value=results_df.set_index(['expectation_config.type', 'dataset_name']).index.factorize()[0] + 1)
133+
results_df['expectation_id'] = results_df['expectation_config.type'] + "_" + results_df['dataset_name']
134+
134135
results_df['import_year'] = datetime.today().year
135136
results_df['import_month'] = datetime.today().month
136137
results_df['import_day'] = datetime.today().day
137138
results_df['import_date'] = datetime.today().strftime('%Y%m%d')
138139

139140
# set dtypes for Athena
140-
dtype_dict = {'expectation_type': 'string',
141-
'kwargs.batch_id': 'string',
142-
'kwargs.column': 'string',
143-
'kwargs.min_value': 'string',
144-
'kwargs.max_value': 'string',
141+
dtype_dict = {'expectation_config.type': 'string',
142+
'expectation_config.kwargs.batch_id': 'string',
143+
'expectation_config.kwargs.column': 'string',
144+
'expectation_config.kwargs.min_value': 'string',
145+
'expectation_config.kwargs.max_value': 'string',
145146
'result.element_count': 'bigint',
146147
'result.unexpected_count': 'bigint',
147148
'result.missing_count': 'bigint',
148149
'result.partial_unexpected_list': 'array<string>',
149150
'result.unexpected_list': 'array<string>',
150151
'result.unexpected_index_list': 'array<bigint>',
151152
'result.unexpected_index_query': 'string',
152-
'kwargs.regex': 'string',
153-
'kwargs.value_set': 'string',
154-
'kwargs.column_list': 'string',
153+
'expectation_config.kwargs.regex': 'string',
154+
'expectation_config.kwargs.value_set': 'string',
155+
'expectation_config.kwargs.column_list': 'string',
156+
'exception_info.raised_exception': 'string',
157+
'exception_info.exception_traceback': 'string',
158+
'exception_info.exception_message': 'string',
155159
'import_year': 'string',
156160
'import_month': 'string',
157161
'import_day': 'string',
@@ -164,7 +168,7 @@ def main():
164168
dataset=True,
165169
database=target_database,
166170
table=target_table,
167-
mode="overwrite_partitions",
171+
mode="overwrite",
168172
partition_cols=partition_keys,
169173
dtype=dtype_dict
170174
)

scripts/jobs/housing/housing_person_reshape_gx_suite.py

Lines changed: 105 additions & 80 deletions
Original file line numberDiff line numberDiff line change
@@ -10,89 +10,114 @@
1010
args = getResolvedOptions(sys.argv, arg_key)
1111
locals().update(args)
1212

13+
14+
class ExpectFirstNameColumnValueLength(gxe.ExpectColumnValueLengthsToBeBetween):
15+
column: str = "firstname"
16+
min_value: int = 1
17+
description: str = "Expect first name to be at least 1 character length"
18+
19+
20+
class ExpectSurnameColumnValueLength(gxe.ExpectColumnValueLengthsToBeBetween):
21+
column: str = "surname"
22+
min_value: int = 1
23+
description: str = "Expect surname to be at least 1 character length"
24+
25+
26+
class ExpectUPRNColumnValueLengthsBetween(gxe.ExpectColumnValueLengthsToBeBetween):
27+
column: str = "uprn"
28+
min_value: int = 11
29+
max_value: int = 12
30+
description: str = "Expect UPRN to be between 11 and 12 characters length inclusive"
31+
32+
33+
class ExpectUPRNColumnValuesToMatchRegex(gxe.ExpectColumnValuesToMatchRegex):
34+
column: str = "uprn"
35+
regex: str = r"^[1-9]\d{10,11}"
36+
description: str = "Expect UPRN to match regex ^[1-9]\d{10,11} (starting with digit 1-9, followed by 10 or 11 digits"
37+
38+
39+
class ExpectUPRNNotToBeNull(gxe.ExpectColumnValuesToNotBeNull):
40+
column: str = "uprn"
41+
description: str = "Expect UPRN column to be complete with no missing values"
42+
43+
44+
class ExpectPersonTypeValuesToBeInSet(gxe.ExpectColumnValuesToBeInSet):
45+
column: str = 'person_type'
46+
value_set: list = ['Tenant', 'HouseholdMember', 'Leaseholder', 'Freeholder', 'Occupant', 'HousingOfficer',
47+
'HousingAreaManager']
48+
description: str = "Expect person types values to contain one of Tenant, HouseholdMember, Leaseholder, Freeholder, Occupant HousingOfficer, HousingAreaManager"
49+
50+
51+
class ExpectPreferredTitleValuesToBeInSet(gxe.ExpectColumnValuesToBeInSet):
52+
column: str = 'preferredtitle'
53+
value_set: list = ['Dr', 'Master', 'Miss', 'Mr', 'Mrs', 'Ms', 'Rabbi', 'Reverend', 'Mx']
54+
description: str = "Expect preferred titles to be one of Dr, Master, Miss, Mr, Mrs, Ms, Mx, Rabbi, Reverend"
55+
56+
57+
class ExpectPersonIDColumnValuesToBeUnique(gxe.ExpectColumnValuesToBeUnique):
58+
column: str = 'person_id'
59+
description: str = "Expect Person ID to be unique within dataset"
60+
61+
62+
class ExpectPersonIDColumnValuesToNotBeNull(gxe.ExpectColumnValuesToNotBeNull):
63+
column: str = 'person_id'
64+
description: str = "Expect Person ID be complete with no missing values"
65+
66+
67+
class ExpectPersonIDAndPropertyReferenceColumnValuesToBeUniqueWithinRecord(
68+
gxe.ExpectSelectColumnValuesToBeUniqueWithinRecord):
69+
column_list: list = ['person_id', 'propertyreference']
70+
description: str = "Expect Person ID and Property Reference to be unique within dataset"
71+
72+
73+
class ExpectPropertyRefColumnValuesToNotBeNull(gxe.ExpectColumnValuesToNotBeNull):
74+
column: str = 'propertyreference'
75+
description: str = "Expect Property Reference be complete with no missing values"
76+
77+
78+
class ExpectPersonIDAndPaymentReferenceColumnValuesToBeUniqueWithinRecord(
79+
gxe.ExpectSelectColumnValuesToBeUniqueWithinRecord):
80+
column_list: list = ['person_id', 'paymentreference']
81+
description: str = "Expect Person ID and Payment Reference to be unique within dataset"
82+
83+
84+
class ExpectUPRNColumnValuesToNotBeNull(gxe.ExpectColumnValuesToNotBeNull):
85+
column: str = 'uprn'
86+
description: str = "Expect UPRN be complete with no missing values"
87+
88+
89+
class ExpectDateOfBirthColumnValuesToNotBeNull(gxe.ExpectColumnValuesToNotBeNull):
90+
column: str = 'dateofbirth_parsed'
91+
description: str = "Expect dateofbirth_parsed be complete with no missing values"
92+
93+
94+
class ExpectDateOfBirthToBeBetween(gxe.ExpectColumnValuesToBeBetween):
95+
column: str = 'dateofbirth_parsed'
96+
min_value: str = datetime(1900, 1, 1, 0, 0, 0).isoformat()
97+
max_value: str = datetime.today().isoformat()
98+
condition_parser: str = "pandas"
99+
row_condition: str = 'df["dateofbirth_parsed"].str[:10] >= "1850-01-01" and df["dateofbirth_parsed"].str[:10] < "2025-01-01" and df["startdate_parsed"].str[:10] > "1900-01-01" and df["startdate_parsed"].str[:10] < "2100-01-01"'
100+
description: str = "Expect dateofbirth_parsed be complete with no missing values"
101+
102+
13103
# add to GX context
14104
context = gx.get_context(mode="file", project_root_dir=s3_target_location)
15105

16106
suite = gx.ExpectationSuite(name='person_reshape_suite')
17-
suite.add_expectation(
18-
gxe.ExpectColumnValueLengthsToBeBetween(
19-
column="firstname",
20-
min_value=1)
21-
)
22-
suite.add_expectation(
23-
gxe.ExpectColumnValueLengthsToBeBetween(
24-
column="surname",
25-
min_value=1)
26-
)
27-
suite.add_expectation(
28-
gxe.ExpectColumnValueLengthsToBeBetween(
29-
column="uprn",
30-
min_value=11,
31-
max_value=12)
32-
)
33-
suite.add_expectation(
34-
gxe.ExpectColumnValuesToMatchRegex(
35-
column="uprn",
36-
regex=r"^[1-9]\d{10,11}")
37-
)
38-
suite.add_expectation(
39-
gxe.ExpectColumnValuesToNotBeNull(
40-
column='uprn')
41-
)
42-
suite.add_expectation(
43-
gxe.ExpectColumnValuesToBeInSet(
44-
column='type',
45-
value_set=['Asylum Seeker', 'Commercial Let', 'Temp Decant', 'Freehold', 'Freehold (Serv)', 'Introductory',
46-
'Leasehold (RTB)', 'Lse 100% Stair', 'License Temp Ac', 'Mesne Profit Ac', 'Non-Secure',
47-
'Private Garage', 'Registered Social Landlord', 'RenttoMortgage', 'Secure', 'Shared Owners',
48-
'Short Life Lse', 'Private Sale LH', 'Shared Equity', 'Tenant Acc Flat', 'Temp B&B', 'Tenant Garage',
49-
'Temp Hostel Lse', 'Temp Hostel', 'Temp Annex', 'Temp Private Lt', 'Temp Traveller'])
50-
)
51-
suite.add_expectation(
52-
gxe.ExpectColumnValuesToBeInSet(
53-
column='person_type',
54-
value_set=['Tenant', 'HouseholdMember', 'Leaseholder', 'Freeholder', 'Occupant', 'HousingOfficer',
55-
'HousingAreaManager'])
56-
)
57-
suite.add_expectation(
58-
gxe.ExpectColumnValuesToBeInSet(
59-
column='preferredtitle',
60-
value_set=['Dr', 'Master', 'Miss', 'Mr', 'Mrs', 'Ms', 'Other', 'Rabbi', 'Reverend'])
61-
)
62-
suite.add_expectation(
63-
gxe.ExpectColumnValuesToBeUnique(
64-
column='person_id')
65-
)
66-
suite.add_expectation(
67-
gxe.ExpectColumnValuesToNotBeNull(
68-
column='person_id')
69-
)
70-
suite.add_expectation(
71-
gxe.ExpectSelectColumnValuesToBeUniqueWithinRecord(
72-
column_list=['person_id', 'propertyreference'])
73-
)
74-
suite.add_expectation(
75-
gxe.ExpectColumnValuesToNotBeNull(
76-
column='propertyreference')
77-
)
78-
suite.add_expectation(
79-
gxe.ExpectSelectColumnValuesToBeUniqueWithinRecord(
80-
column_list=['person_id', 'paymentreference'])
81-
)
82-
suite.add_expectation(
83-
gxe.ExpectColumnValuesToNotBeNull(
84-
column='uprn')
85-
)
86-
suite.add_expectation(
87-
gxe.ExpectColumnValuesToNotBeNull(
88-
column='dateofbirth_parsed')
89-
)
90-
suite.add_expectation(
91-
gxe.ExpectColumnValuesToBeBetween(
92-
column='dateofbirth_parsed',
93-
min_value=datetime(1900, 1, 1, 0, 0, 0).isoformat(),
94-
max_value=datetime.today().isoformat()
95-
)
96-
)
107+
suite.add_expectation(ExpectFirstNameColumnValueLength())
108+
suite.add_expectation(ExpectSurnameColumnValueLength())
109+
suite.add_expectation(ExpectUPRNColumnValueLengthsBetween())
110+
suite.add_expectation(ExpectUPRNColumnValuesToMatchRegex())
111+
suite.add_expectation(ExpectUPRNNotToBeNull())
112+
suite.add_expectation(ExpectPersonTypeValuesToBeInSet())
113+
suite.add_expectation(ExpectPreferredTitleValuesToBeInSet())
114+
suite.add_expectation(ExpectPersonIDColumnValuesToBeUnique())
115+
suite.add_expectation(ExpectPersonIDColumnValuesToNotBeNull())
116+
suite.add_expectation(ExpectPersonIDAndPropertyReferenceColumnValuesToBeUniqueWithinRecord())
117+
suite.add_expectation(ExpectPropertyRefColumnValuesToNotBeNull())
118+
suite.add_expectation(ExpectPersonIDAndPaymentReferenceColumnValuesToBeUniqueWithinRecord())
119+
suite.add_expectation(ExpectUPRNColumnValuesToNotBeNull())
120+
suite.add_expectation(ExpectDateOfBirthColumnValuesToNotBeNull())
121+
suite.add_expectation(ExpectDateOfBirthToBeBetween())
97122

98123
suite = context.suites.add(suite)

terraform/etl/54-aws-glue-housing-apply-gx-dq-tests.tf

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ module "housing_apply_gx_dq_tests" {
2020
"--job-bookmark-option" = "job-bookmark-enable"
2121
"--enable-glue-datacatalog" = "true"
2222
"--enable-continuous-cloudwatch-log" = "true"
23-
"--additional-python-modules" = "great_expectations==1.2.0,PyAthena,awswrangler"
23+
"--additional-python-modules" = "great_expectations==1.2.1,PyAthena,numpy==1.26.1,awswrangler==3.10.0"
2424
"--region_name" = data.aws_region.current.name
2525
"--s3_endpoint" = "https://s3.${data.aws_region.current.name}.amazonaws.com"
2626
"--s3_target_location" = "s3://${module.raw_zone_data_source.bucket_id}/housing/data-quality-tests/"

0 commit comments

Comments
 (0)