Skip to content

Commit cdbbb9c

Browse files
authored
Merge pull request #1954 from LBHackney-IT/di-459-create-refined-gx-output-dataset
Add additional columns to housing_gx_data_quality_tests_complete and adjust configurations
2 parents: ae3c863 + 4110374 · commit cdbbb9c

File tree

3 files changed: 18 additions and 6 deletions (+18 / -6 lines changed)

scripts/helpers/housing_gx_dq_inputs.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -4,7 +4,7 @@
44
'id_field': 'person_id'},
55
'tenure_reshape': {
66
'sql': """SELECT * FROM "housing-refined-zone"."tenure_reshape" where import_date>'20240412' and import_date=(select max(import_date) from "housing-refined-zone"."tenure_reshape" where import_date>'20240412') and isterminated=False and description in ('Secure', 'Introductory')""",
7-
'id_field': 'tenure_id'},
7+
'id_field': 'tenancy_id'},
88
'contacts_reshape': {
99
'sql': """SELECT id, targetid, createdat, contacttype, subtype, value, lastmodified, targettype, isactive, person_id, import_date FROM "housing-refined-zone"."contacts_reshape" where import_date=(select max(import_date) from "housing-refined-zone"."contacts_reshape") and isactive=True""",
1010
'id_field': 'id'},

scripts/jobs/housing/housing_apply_gx_dq_tests.py

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
import great_expectations as gx
1111
import pandas as pd
1212
from pyathena import connect
13-
from scripts.helpers.housing_gx_dq_inputs import sql_config, table_list, partition_keys
13+
from scripts.helpers.housing_gx_dq_inputs import sql_config, table_list, partition_keys, dq_dimensions_map
1414
import scripts.jobs.housing.housing_person_reshape_gx_suite
1515
import scripts.jobs.housing.housing_tenure_reshape_gx_suite
1616
import scripts.jobs.housing.housing_contacts_reshape_gx_suite
@@ -98,7 +98,9 @@ def main():
9898
name=f'{table}_checkpoint',
9999
validation_definitions=[validation_definition],
100100
actions=actions,
101-
result_format={"result_format": "COMPLETE"}
101+
result_format={"result_format": "COMPLETE",
102+
"return_unexpected_index_query": False,
103+
"partial_unexpected_count": 0}
102104
)
103105
)
104106

@@ -111,8 +113,20 @@ def main():
111113
table_results_df = pd.json_normalize(results['validation_results'][0]['expectations'])
112114
table_results_df_list.append(table_results_df)
113115

116+
# generate id lists for each unexpected result set
117+
query_df = table_results_df.loc[(~table_results_df['result.unexpected_index_list'].isna()) & (
118+
table_results_df['result.unexpected_index_list'].values != '[]')]
119+
120+
table_results_df['unexpected_id_list'] = pd.Series(dtype='object')
121+
for i, row in query_df.iterrows():
122+
table_results_df.loc[i, 'unexpected_id_list'] = str(
123+
list(df[sql_config.get(table).get('id_field')].iloc[row['result.unexpected_index_list']]))
124+
114125
results_df = pd.concat(table_results_df_list)
115126

127+
# map DQ dimension type
128+
results_df['dq_dimension_type'] = results_df['expectation_type'].map(dq_dimensions_map)
129+
116130
results_df['import_year'] = datetime.today().year
117131
results_df['import_month'] = datetime.today().month
118132
results_df['import_day'] = datetime.today().day
@@ -128,8 +142,6 @@ def main():
128142
'result.unexpected_count': 'bigint',
129143
'result.missing_count': 'bigint',
130144
'result.partial_unexpected_list': 'array<string>',
131-
'result.partial_unexpected_counts': 'array<struct<count:bigint,value:string>>',
132-
'result.partial_unexpected_index_list': 'array<bigint>',
133145
'result.unexpected_list': 'array<string>',
134146
'result.unexpected_index_list': 'array<bigint>',
135147
'result.unexpected_index_query': 'string',

terraform/etl/54-aws-glue-housing-apply-gx-dq-tests.tf

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ module "housing_apply_gx_dq_tests" {
2020
"--job-bookmark-option" = "job-bookmark-enable"
2121
"--enable-glue-datacatalog" = "true"
2222
"--enable-continuous-cloudwatch-log" = "true"
23-
"--additional-python-modules" = "great_expectations==1.1.0,PyAthena,awswrangler"
23+
"--additional-python-modules" = "great_expectations==1.2.0,PyAthena,awswrangler"
2424
"--region_name" = data.aws_region.current.name
2525
"--s3_endpoint" = "https://s3.${data.aws_region.current.name}.amazonaws.com"
2626
"--s3_target_location" = "s3://${module.raw_zone_data_source.bucket_id}/housing/data-quality-tests/"

0 commit comments

Comments
 (0)