Skip to content

Commit 0a134e0

Browse files
committed
Add workaround for working with dates in GX. Add expectation to person_reshape suite. Amend sql query for person_reshape.
1 parent c44528a commit 0a134e0

File tree

3 files changed

+26
-5
lines changed

3 files changed

+26
-5
lines changed

scripts/helpers/housing_gx_dq_inputs.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,8 @@
11
sql_config = {'person_reshape': {
2-
'sql': """SELECT *, cast(date_parse(substr(startdate, 1, 10), '%Y-%m-%d') as date) as startdate_parsed, cast(date_parse(substr(enddate, 1, 10), '%Y-%m-%d') as date) as enddate_parsed, cast(date_parse(substr(dateofbirth, 1, 10), '%Y-%m-%d') as date) as dateofbirth_parsed FROM "housing-refined-zone"."person_reshape" where import_date=(select max(import_date) from "housing-refined-zone"."person_reshape") and enddate is NULL and type in ('Secure', 'Introductory')""",
2+
'sql': """SELECT *, substr(startdate, 1, 10) as startdate_parsed, substr(enddate, 1, 10) as enddate_parsed,
3+
substr(dateofbirth, 1, 10) as dateofbirth_parsed
4+
FROM "housing-refined-zone"."person_reshape" WHERE import_date = (SELECT max(import_date) FROM "housing-refined-zone"."person_reshape"
5+
) AND enddate IS NULL AND type IN ('Secure', 'Introductory') and substr(dateofbirth, 1, 10) between '1850-01-01' and '2100-01-01' and substr(startdate, 1, 10) between '1900-01-01' and '2100-01-01'""",
36
'id_field': 'person_id'},
47
'tenure_reshape': {
58
'sql': """SELECT * FROM "housing-refined-zone"."tenure_reshape" where import_date>'20240412' and import_date=(select max(import_date) from "housing-refined-zone"."tenure_reshape" where import_date>'20240412') and isterminated=False and description in ('Secure', 'Introductory')""",

scripts/jobs/housing/housing_apply_gx_dq_tests.py

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
# flake8: noqa: F821
22

33
import awswrangler as wr
4-
from datetime import datetime
4+
from datetime import datetime, date
55
import json
66
import logging
77
import sys
@@ -27,6 +27,13 @@
2727
locals().update(args)
2828

2929

30+
def json_serial(obj):
    """Convert date-like values to ISO 8601 text for use as a ``json.dumps`` hook.

    Meant to be passed as ``default=`` to ``json.dumps``. ``datetime`` and
    ``date`` instances are returned as the string produced by their
    ``isoformat()`` method; every other object is rejected.

    Args:
        obj: The value that the JSON encoder could not serialize natively.

    Returns:
        The ISO 8601 string for ``obj``.

    Raises:
        TypeError: If ``obj`` is neither a ``datetime`` nor a ``date``.
    """
    # Guard clause: anything that is not date-like is reported back to the
    # encoder as unsupported.
    if not isinstance(obj, (datetime, date)):
        raise TypeError(f"Type {type(obj)} not serializable")
    return obj.isoformat()
35+
36+
3037
def main():
3138
# add GX context
3239
context = gx.get_context(mode="file", project_root_dir=s3_target_location)
@@ -96,7 +103,11 @@ def main():
96103
)
97104

98105
checkpoint_result = checkpoint.run(batch_parameters=batch_parameters)
99-
results = json.loads(checkpoint_result.describe())
106+
results_dict = checkpoint_result.describe_dict()
107+
# Serialize the result to handle any datetime objects
108+
json_results = json.dumps(results_dict, default=json_serial)
109+
logger.info(f"json_results: {json_results}")
110+
results = json.loads(json_results)
100111
table_results_df = pd.json_normalize(results['validation_results'][0]['expectations'])
101112
table_results_df_list.append(table_results_df)
102113

scripts/jobs/housing/housing_person_reshape_gx_suite.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
# flake8: noqa: F821
2-
from datetime import datetime, date
2+
from datetime import datetime
33
import sys
44

55
from awsglue.utils import getResolvedOptions
@@ -85,7 +85,14 @@
8585
)
8686
suite.add_expectation(
8787
gxe.ExpectColumnValuesToNotBeNull(
88-
column='dateofbirth')
88+
column='dateofbirth_parsed')
89+
)
90+
suite.add_expectation(
91+
gxe.ExpectColumnValuesToBeBetween(
92+
column='dateofbirth_parsed',
93+
min_value=datetime(1900, 1, 1, 0, 0, 0).isoformat(),
94+
max_value=datetime.today().isoformat()
95+
)
8996
)
9097

9198
suite = context.suites.add(suite)

0 commit comments

Comments
 (0)