Skip to content

Commit c49e500

Browse files
committed
Adjust expectations to be custom classes, so they can be differentiated.
Remove expectations that are no longer relevant. Adjust sql query for person_reshape table.
1 parent 0b45ad5 commit c49e500

File tree

5 files changed

+191
-168
lines changed

5 files changed

+191
-168
lines changed

scripts/jobs/housing/housing_apply_gx_dq_tests.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -137,6 +137,8 @@ def main():
137137
results_df['import_day'] = datetime.today().day
138138
results_df['import_date'] = datetime.today().strftime('%Y%m%d')
139139

140+
results_df = results_df.drop(columns={'exception_info.exception_traceback'})
141+
140142
# set dtypes for Athena
141143
dtype_dict = {'expectation_config.type': 'string',
142144
'expectation_config.kwargs.batch_id': 'string',

scripts/jobs/housing/housing_assets_reshape_gx_suite.py

Lines changed: 35 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -10,45 +10,44 @@
1010
args = getResolvedOptions(sys.argv, arg_key)
1111
locals().update(args)
1212

13+
14+
# Custom subclass of the built-in value-length expectation with the UPRN
# settings supplied as field defaults, giving this check its own class name.
class ExpectUPRNColumnValueLengthsBetween(gxe.ExpectColumnValueLengthsToBeBetween):
    column: str = 'uprn'  # column under test
    min_value: int = 11  # lower length bound
    max_value: int = 12  # upper length bound
    description: str = 'Expect UPRN to be between 11 and 12 characters length inclusive'
19+
20+
21+
# Custom subclass of the built-in regex expectation with the UPRN column and
# pattern supplied as field defaults, giving this check its own class name.
class ExpectUPRNColumnValuesToMatchRegex(gxe.ExpectColumnValuesToMatchRegex):
    column: str = "uprn"
    # NOTE(review): the pattern has no trailing `$`; presumably values that only
    # *start* with a valid UPRN would also match — confirm whether an end
    # anchor is wanted.
    regex: str = r"^[1-9]\d{10,11}"
    # Raw literal: `\d` inside a plain string is an invalid escape sequence and
    # emits a warning on recent Python versions. The text itself is unchanged
    # apart from the previously missing closing parenthesis.
    description: str = (
        r"Expect UPRN to match regex ^[1-9]\d{10,11} "
        "(starting with digit 1-9, followed by 10 or 11 digits)"
    )
25+
26+
27+
# Not-null expectation bound to the `uprn` column via a field default.
class ExpectUPRNNotToBeNull(gxe.ExpectColumnValuesToNotBeNull):
    column: str = 'uprn'  # column under test
    description: str = 'Expect UPRN column to be complete with no missing values'
30+
31+
32+
# Not-null expectation bound to the `asset_id` column via a field default.
class ExpectAssetIDNotToBeNull(gxe.ExpectColumnValuesToNotBeNull):
    column: str = 'asset_id'  # column under test
    description: str = 'Expect Asset ID column to be complete with no missing values'
35+
36+
37+
# Not-null expectation bound to the `assettype` column via a field default.
class ExpectAssetTypeNotToBeNull(gxe.ExpectColumnValuesToNotBeNull):
    column: str = 'assettype'  # column under test
    description: str = 'Expect Asset Type column to be complete with no missing values'
40+
41+
1342
# add to GX context
1443
context = gx.get_context(mode="file", project_root_dir=s3_target_location)
1544

1645
suite = gx.ExpectationSuite(name='assets_reshape_suite')
17-
suite.add_expectation(
18-
gxe.ExpectColumnValueLengthsToBeBetween(
19-
column="uprn",
20-
min_value=11,
21-
max_value=12)
22-
)
23-
suite.add_expectation(
24-
gxe.ExpectColumnValuesToMatchRegex(
25-
column="uprn",
26-
regex=r"^[1-9]\d{10,11}")
27-
)
28-
suite.add_expectation(
29-
gxe.ExpectColumnValuesToBeInSet(
30-
column='type',
31-
value_set=['Dwelling'])
32-
)
33-
suite.add_expectation(
34-
gxe.ExpectColumnValuesToBeUnique(
35-
column='asset_id')
36-
)
37-
suite.add_expectation(
38-
gxe.ExpectColumnValuesToNotBeNull(
39-
column='asset_id')
40-
)
41-
suite.add_expectation(
42-
gxe.ExpectColumnValuesToNotBeNull(
43-
column='uprn')
44-
)
45-
suite.add_expectation(
46-
gxe.ExpectColumnValuesToNotBeNull(
47-
column='estate_name')
48-
)
49-
suite.add_expectation(
50-
gxe.ExpectColumnValuesToNotBeNull(
51-
column='type')
52-
)
46+
47+
# Register one default-constructed instance of each custom expectation on the
# suite, preserving the original registration order.
for expectation_cls in (
    ExpectUPRNColumnValueLengthsBetween,
    ExpectUPRNColumnValuesToMatchRegex,
    ExpectUPRNNotToBeNull,
    ExpectAssetIDNotToBeNull,
    ExpectAssetTypeNotToBeNull,
):
    suite.add_expectation(expectation_cls())
5352

5453
suite = context.suites.add(suite)

scripts/jobs/housing/housing_contacts_reshape_gx_suite.py

Lines changed: 61 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,57 @@
66
import great_expectations as gx
77
import great_expectations.expectations as gxe
88

9+
10+
# In-set expectation bound to the `contacttype` column, with the allowed
# values supplied as a field default.
class ExpectContactTypeColumnValuesToBeInSet(gxe.ExpectColumnValuesToBeInSet):
    column: str = "contacttype"  # column under test
    value_set: list = ["email", "address", "phone"]  # permitted values
    description: str = "Expect Contact Type to be one of email, address or phone"
14+
15+
16+
# In-set expectation bound to the `subtype` column, with the allowed values
# supplied as a field default.
class ExpectSubTypeColumnValuesToBeInSet(gxe.ExpectColumnValuesToBeInSet):
    column: str = "subtype"  # column under test
    # Permitted values, one per line, in the original order.
    value_set: list = [
        "mainNumber",
        "emergencyContact",
        "carer",
        "wife",
        "husband",
        "spouse",
        "child",
        "sibling",
        "relative",
        "neighbour",
        "doctor",
        "socialWorker",
        "other",
    ]
    description: str = "Expect Subtype values to be within set"
22+
23+
24+
# In-set expectation bound to the `targettype` column, with the allowed
# values supplied as a field default.
class ExpectTargetTypeColumnValuesToBeInSet(gxe.ExpectColumnValuesToBeInSet):
    column: str = "targettype"  # column under test
    value_set: list = ["person", "organisation"]  # permitted values
    description: str = "Expect Target Type values to be one of person or organisation"
28+
29+
30+
# Uniqueness expectation bound to the `value` column via a field default.
# NOTE(review): the description says "for a contact type", but only the single
# column `value` is configured here — confirm whether per-type uniqueness was
# intended.
class ExpectContactValueColumnValuesToBeUnique(gxe.ExpectColumnValuesToBeUnique):
    column: str = "value"  # column under test
    description: str = 'Expect Value field to be unique for a contact type'
33+
34+
35+
# Within-record uniqueness expectation over `target_id` and `value`, with the
# column list supplied as a field default.
# NOTE(review): the base expectation presumably compares the listed columns
# against each other inside each row; if uniqueness of the (target_id, value)
# pair across rows is the goal, verify this is the right base class.
class ExpectTargetIDAndValueColumnValuesToBeUniqueWithinRecord(gxe.ExpectSelectColumnValuesToBeUniqueWithinRecord):
    column_list: list = ["target_id", "value"]  # columns under test
    description: str = 'Expect Target ID and Value field to be unique for a record'
38+
39+
40+
# Not-null expectation bound to the `target_id` column via a field default.
class ExpectTargetIDColumnValuesToNotBeNull(gxe.ExpectColumnValuesToNotBeNull):
    # Annotated like the other custom expectations in this file: overriding an
    # inherited model field without an annotation relies on type inference and
    # may be rejected depending on the pydantic version in use.
    column: str = 'target_id'
    description: str = "Expect Target ID column to be complete with no nulls"
43+
44+
45+
# Not-null expectation bound to the `value` column via a field default.
class ExpectContactValueColumnValuesToNotBeNull(gxe.ExpectColumnValuesToNotBeNull):
    # Annotated like the other custom expectations in this file: overriding an
    # inherited model field without an annotation relies on type inference and
    # may be rejected depending on the pydantic version in use.
    column: str = 'value'
    description: str = "Expect Value column to be complete with no nulls"
48+
49+
50+
# Not-null expectation bound to the `contacttype` column via a field default.
class ExpectContactTypeColumnValuesToNotBeNull(gxe.ExpectColumnValuesToNotBeNull):
    # Annotated like the other custom expectations in this file: overriding an
    # inherited model field without an annotation relies on type inference and
    # may be rejected depending on the pydantic version in use.
    column: str = 'contacttype'
    description: str = "Expect Contact Type column to be complete with no nulls"
53+
54+
55+
# Not-null expectation bound to the `subtype` column via a field default.
class ExpectSubTypeColumnValuesToNotBeNull(gxe.ExpectColumnValuesToNotBeNull):
    # Annotated like the other custom expectations in this file: overriding an
    # inherited model field without an annotation relies on type inference and
    # may be rejected depending on the pydantic version in use.
    column: str = 'subtype'
    description: str = "Expect Subtype column to be complete with no nulls"
58+
59+
960
arg_key = ['s3_target_location']
1061
args = getResolvedOptions(sys.argv, arg_key)
1162
locals().update(args)
@@ -14,45 +65,15 @@
1465
context = gx.get_context(mode="file", project_root_dir=s3_target_location)
1566

1667
suite = gx.ExpectationSuite(name='contacts_reshape_suite')
17-
suite.add_expectation(
18-
gxe.ExpectColumnValuesToBeInSet(
19-
column='contacttype',
20-
value_set=['email', 'address', 'phone'])
21-
)
22-
suite.add_expectation(
23-
gxe.ExpectColumnValuesToBeInSet(
24-
column='subtype',
25-
value_set=['mainNumber', 'emergencyContact', 'carer', 'wife', 'husband', 'spouse', 'child', 'sibling',
26-
'relative', 'neighbour', 'doctor', 'socialWorker', 'other'])
27-
)
28-
suite.add_expectation(
29-
gxe.ExpectColumnValuesToBeInSet(
30-
column='targettype',
31-
value_set=['person', 'organisation'])
32-
)
33-
suite.add_expectation(
34-
gxe.ExpectColumnValuesToBeUnique(
35-
column='value')
36-
)
37-
suite.add_expectation(
38-
gxe.ExpectSelectColumnValuesToBeUniqueWithinRecord(
39-
column_list=['target_id', 'value'])
40-
)
41-
suite.add_expectation(
42-
gxe.ExpectColumnValuesToNotBeNull(
43-
column='target_id')
44-
)
45-
suite.add_expectation(
46-
gxe.ExpectColumnValuesToNotBeNull(
47-
column='value')
48-
)
49-
suite.add_expectation(
50-
gxe.ExpectColumnValuesToNotBeNull(
51-
column='contacttype')
52-
)
53-
suite.add_expectation(
54-
gxe.ExpectColumnValuesToNotBeNull(
55-
column='subtype')
56-
)
68+
69+
# Register one default-constructed instance of each custom expectation on the
# suite, preserving the original registration order.
for expectation_cls in (
    ExpectContactTypeColumnValuesToBeInSet,
    ExpectSubTypeColumnValuesToBeInSet,
    ExpectTargetTypeColumnValuesToBeInSet,
    ExpectContactValueColumnValuesToBeUnique,
    ExpectTargetIDAndValueColumnValuesToBeUniqueWithinRecord,
    ExpectTargetIDColumnValuesToNotBeNull,
    ExpectContactValueColumnValuesToNotBeNull,
    ExpectContactTypeColumnValuesToNotBeNull,
    ExpectSubTypeColumnValuesToNotBeNull,
):
    suite.add_expectation(expectation_cls())
5778

5879
suite = context.suites.add(suite)

scripts/jobs/housing/housing_person_reshape_gx_suite.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -95,8 +95,6 @@ class ExpectDateOfBirthToBeBetween(gxe.ExpectColumnValuesToBeBetween):
9595
column: str = 'dateofbirth_parsed'
9696
min_value: str = datetime(1900, 1, 1, 0, 0, 0).isoformat()
9797
max_value: str = datetime.today().isoformat()
98-
condition_parser: str = "pandas"
99-
row_condition: str = 'df["dateofbirth_parsed"].str[:10] >= "1850-01-01" and df["dateofbirth_parsed"].str[:10] < "2025-01-01" and df["startdate_parsed"].str[:10] > "1900-01-01" and df["startdate_parsed"].str[:10] < "2100-01-01"'
10098
description: str = "Expect dateofbirth_parsed be complete with no missing values"
10199

102100

0 commit comments

Comments
 (0)