Skip to content

Commit fdd14dd

Browse files
committed
VED-951 Implement DQ validation checks (#1074)
1 parent d68abad commit fdd14dd

File tree

11 files changed

+242
-40
lines changed

11 files changed

+242
-40
lines changed

lambdas/filenameprocessor/poetry.lock

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

lambdas/filenameprocessor/pyproject.toml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,6 @@ aws-lambda-typing = "~2.20.0"
1818
moto = "^4"
1919
requests = "~2.32.5"
2020
responses = "~0.25.8"
21-
pydantic = "~1.10.13"
2221
pyjwt = "~2.10.1"
2322
cryptography = "~46.0.0"
2423
cffi = "~1.17.1"

lambdas/shared/src/common/data_quality/checker.py

Lines changed: 22 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,9 @@
11
from dataclasses import dataclass
22

3+
from pydantic import ValidationError
4+
35
from common.data_quality.completeness import DataQualityCompletenessChecker, MissingFields
4-
from common.data_quality.validator import DataQualityValidator
6+
from common.data_quality.models.immunization_batch_row_model import ImmunizationBatchRowModel
57
from common.models.fhir_converter.converter import Converter
68

79

@@ -21,27 +23,38 @@ def __init__(
2123
is_batch_csv: bool,
2224
):
2325
self.completeness_checker = completeness_checker
26+
self.data_quality_model = ImmunizationBatchRowModel
2427
self.is_batch_csv = is_batch_csv
2528

2629
def run_checks(self, immunisation: dict) -> DataQualityOutput:
27-
data_quality_validator = DataQualityValidator()
28-
2930
if not self.is_batch_csv:
3031
immunisation = Converter(fhir_data=immunisation).run_conversion()
3132

3233
return DataQualityOutput(
3334
missing_fields=self._check_completeness(immunisation),
34-
invalid_fields=self._check_validity(immunisation, data_quality_validator),
35+
invalid_fields=self._check_validity(immunisation),
3536
timeliness=self._check_timeliness(immunisation),
3637
)
3738

3839
def _check_completeness(self, immunisation: dict) -> MissingFields:
3940
return self.completeness_checker.run_checks(immunisation)
4041

41-
@staticmethod
42-
def _check_validity(immunisation: dict, data_quality_validator: DataQualityValidator) -> list[str]:
43-
pass
42+
def _check_validity(self, immunisation: dict) -> list[str]:
    """Validate a flat batch CSV immunisation row against the data quality model.

    Returns the names of the fields that failed validation, in the order the errors were reported. An empty list
    means no validation error was raised.
    """
    try:
        self.data_quality_model.parse_obj(immunisation)
    except ValidationError as exc:
        error_paths = (error.get("loc", []) for error in exc.errors())
        # Model uses a flat structure, so the field name is always the first element of the error location
        return [path[0] for path in error_paths if len(path) > 0]

    return []
4458

45-
@staticmethod
46-
def _check_timeliness(immunisation: dict) -> dict[str, int]:
59+
def _check_timeliness(self, immunisation: dict) -> dict[str, int]:
4760
pass
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
import datetime
2+
import decimal
3+
4+
ALLOWED_DOSE_AMOUNTS = {
5+
# The below are all present in the existing data, but we should check which ones are acceptable:
6+
decimal.Decimal("0"),
7+
decimal.Decimal("0.1"),
8+
decimal.Decimal("0.2"),
9+
decimal.Decimal("0.3"),
10+
decimal.Decimal("0.4"),
11+
decimal.Decimal("0.5"),
12+
decimal.Decimal("0.7"),
13+
decimal.Decimal("1"),
14+
decimal.Decimal("2"),
15+
decimal.Decimal("10"),
16+
decimal.Decimal("11"),
17+
}
18+
19+
ALLOWED_DOSE_UNIT_CODES = {
20+
# The below are all present in the existing data, but we should check which ones are acceptable:
21+
"258773002", # ml
22+
"3317411000001100", # dose
23+
"3318611000001103", # pre-filled disposable injection
24+
"3319711000001103", # unit dose
25+
"408102007", # unit dose
26+
"413516001", # ampoule
27+
"415818006", # vial
28+
}
29+
30+
MIN_ACCEPTED_PAST_DATE = datetime.date(1900, 1, 1)
31+
MIN_ACCEPTED_EXPIRY_DATE = datetime.date(2020, 1, 1)

lambdas/shared/src/common/data_quality/models/__init__.py

Whitespace-only changes.
Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
"""Simple immunization model to support data quality validation. The team only requires simple validation on a subset
2+
of the immunization fields"""
3+
4+
import datetime
5+
import decimal
6+
7+
from dateutil.relativedelta import relativedelta
8+
from pydantic import BaseModel, condate, constr, validator
9+
10+
from common.data_quality.constants import (
11+
ALLOWED_DOSE_AMOUNTS,
12+
ALLOWED_DOSE_UNIT_CODES,
13+
MIN_ACCEPTED_EXPIRY_DATE,
14+
MIN_ACCEPTED_PAST_DATE,
15+
)
16+
from common.data_quality.timeliness import parse_csv_date, parse_csv_datetime
17+
18+
# Consider upgrading fhir.resources 7 -> 8 and pydantic 1 -> 2 to use the more powerful and readable Annotated types
19+
# and constraints
20+
NhsNumber = constr(min_length=10, max_length=10, regex=r"^\d{10}$")
21+
BatchCsvDate = condate(ge=MIN_ACCEPTED_PAST_DATE)
22+
PersonPostcode = constr(regex=r"^[A-Z]{1,2}\d[A-Z\d]? ?\d[A-Z]{2}$")
23+
ExpiryDate = condate(ge=MIN_ACCEPTED_EXPIRY_DATE)
24+
SnomedCode = constr(min_length=6, max_length=18, regex=r"^\d{6,18}$")
25+
26+
27+
class ImmunizationBatchRowModel(BaseModel):
    """Represents the Immunization as provided in batch CSV. Contains the subset of fields for DQ validation.

    Date fields arrive as CSV strings and are converted by the pre-validators below before the constrained date
    types are applied. Dose amount and dose unit code are checked against the allow-lists in
    common.data_quality.constants.
    """

    NHS_NUMBER: NhsNumber
    PERSON_DOB: BatchCsvDate
    # NOTE(review): declared as a date type although the pre-validator returns a datetime - presumably pydantic
    # reduces it to a date before ensure_past_date runs; confirm
    DATE_AND_TIME: BatchCsvDate
    PERSON_POSTCODE: PersonPostcode
    EXPIRY_DATE: ExpiryDate  # TODO - check with DQ team. Should these checks be relative to the occurrence datetime?
    DOSE_AMOUNT: decimal.Decimal  # TODO - check with DQ team. Actual values vary a lot from proposed enum.
    SITE_OF_VACCINATION_CODE: SnomedCode  # TODO - check with DQ team. Their reqs are quite rudimentary, could change?
    ROUTE_OF_VACCINATION_CODE: SnomedCode
    DOSE_UNIT_CODE: str  # TODO - check with DQ team. Their enum does not match up with what this data actually is.
    INDICATION_CODE: SnomedCode

    # pre=True: receives the raw CSV value so it can be converted before the date constraints are applied
    @validator("PERSON_DOB", "EXPIRY_DATE", pre=True)
    def parse_csv_date(cls, value: str) -> datetime.date:
        # The call below resolves to the module-level helper imported from common.data_quality.timeliness; this
        # method's own name lives in the class namespace and is not visible from inside the function body
        return parse_csv_date(value)

    @validator("PERSON_DOB", "DATE_AND_TIME")
    def ensure_past_date(cls, value: datetime.date) -> datetime.date:
        # Today itself is rejected: only dates strictly before today pass
        if value >= datetime.date.today():
            raise ValueError("Date must be in the past")

        return value

    # pre=True: receives the raw CSV value so it can be converted before the date constraints are applied
    @validator("DATE_AND_TIME", pre=True)
    def parse_csv_datetime(cls, value: str) -> datetime.datetime:
        # As above, this calls the imported module-level helper rather than recursing into this method
        return parse_csv_datetime(value)

    @validator("EXPIRY_DATE")
    def is_expiry_within_a_year(cls, value: datetime.date) -> datetime.date:
        # Rejects expiry dates more than one year after today; the lower bound comes from the ExpiryDate type
        if value > datetime.date.today() + relativedelta(years=1):
            raise ValueError("EXPIRY_DATE must be within a year from today")

        return value

    @validator("DOSE_AMOUNT")
    def is_valid_dose_amount(cls, value: decimal.Decimal) -> decimal.Decimal:
        # Membership check against the set of Decimal values in ALLOWED_DOSE_AMOUNTS
        if value not in ALLOWED_DOSE_AMOUNTS:
            raise ValueError("Invalid DOSE_AMOUNT provided")

        return value

    @validator("DOSE_UNIT_CODE")
    def is_valid_dose_unit_code(cls, value: str) -> str:
        # Membership check against the set of code strings in ALLOWED_DOSE_UNIT_CODES
        if value not in ALLOWED_DOSE_UNIT_CODES:
            raise ValueError("Invalid DOSE_UNIT_CODE provided")

        return value
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
"""Module containing functions to aid with timeliness checks"""
2+
3+
import datetime
4+
5+
6+
def parse_csv_date(d: str) -> datetime.date:
    """Parse a batch CSV date string in YYYYmmdd form into a date. Raises ValueError if it does not match."""
    parsed = datetime.datetime.strptime(d, "%Y%m%d")
    return parsed.date()
8+
9+
10+
def parse_csv_datetime(d: str) -> datetime.datetime:
    """Parse the custom NHS imms batch CSV datetime format into a timezone-aware datetime.

    Accepts either 15 characters (YYYYmmddTHHMMSS, treated as UTC) or 17 characters, where the final two characters
    are a timezone offset in whole hours. Raises ValueError for any other length or for a value that cannot be
    parsed.
    """
    csv_datetime_format = "%Y%m%dT%H%M%S"
    length = len(d)

    if length not in (15, 17):
        raise ValueError("Invalid datetime format provided")

    # The timestamp portion is always the first 15 characters, with or without a trailing offset
    timestamp = datetime.datetime.strptime(d[:15], csv_datetime_format)

    if length == 15:
        tz = datetime.timezone.utc
    else:
        tz = datetime.timezone(datetime.timedelta(hours=int(d[15:])))

    return timestamp.replace(tzinfo=tz)

lambdas/shared/src/common/data_quality/validator.py

Lines changed: 0 additions & 2 deletions
This file was deleted.

lambdas/shared/tests/test_common/data_quality/sample_values.py

Lines changed: 20 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -134,37 +134,37 @@
134134

135135
VALID_BATCH_IMMUNISATION = {
136136
"NHS_NUMBER": "9000000009",
137-
"PERSON_FORENAME": "Sam",
138-
"PERSON_SURNAME": "Trailor",
139-
"PERSON_DOB": "19650228",
140-
"PERSON_GENDER_CODE": "0",
141-
"PERSON_POSTCODE": "EC1A 1BB",
142-
"DATE_AND_TIME": "20210207T13281700",
143-
"SITE_CODE": "B0C4P",
137+
"PERSON_FORENAME": "JOHN",
138+
"PERSON_SURNAME": "DOE",
139+
"PERSON_DOB": "19801231",
140+
"PERSON_GENDER_CODE": "1",
141+
"PERSON_POSTCODE": "AB12 3CD",
142+
"DATE_AND_TIME": "20240511T120000",
143+
"SITE_CODE": "RJ1",
144144
"SITE_CODE_TYPE_URI": "https://fhir.nhs.uk/Id/ods-organization-code",
145145
"UNIQUE_ID": "ACME-vacc123456",
146146
"UNIQUE_ID_URI": "https://supplierABC/identifiers/vacc",
147147
"ACTION_FLAG": "UPDATE",
148-
"PERFORMING_PROFESSIONAL_FORENAME": "Florence",
149-
"PERFORMING_PROFESSIONAL_SURNAME": "Nightingale",
150-
"RECORDED_DATE": "20210207",
151-
"PRIMARY_SOURCE": "TRUE",
152-
"VACCINATION_PROCEDURE_CODE": "13246814444444",
153-
"VACCINATION_PROCEDURE_TERM": "Test Value string 123456 COVID vaccination",
148+
"PERFORMING_PROFESSIONAL_FORENAME": "ALICE",
149+
"PERFORMING_PROFESSIONAL_SURNAME": "SMITH",
150+
"RECORDED_DATE": "20250306",
151+
"PRIMARY_SOURCE": "True",
152+
"VACCINATION_PROCEDURE_CODE": "1324681000000101",
153+
"VACCINATION_PROCEDURE_TERM": "Procedure Term",
154154
"DOSE_SEQUENCE": "1",
155-
"VACCINE_PRODUCT_CODE": "39114911000001105",
156-
"VACCINE_PRODUCT_TERM": "COVID-19 Vaccine Vaxzevria (ChAdOx1 S [recombinant]) not less than 2.5x100,000,000 infectious units/0.5ml dose suspension for injection multidose vials (AstraZeneca UK Ltd) (product)",
157-
"VACCINE_MANUFACTURER": "AstraZeneca Ltd",
158-
"BATCH_NUMBER": "4120Z001",
159-
"EXPIRY_DATE": "20210702",
155+
"VACCINE_PRODUCT_CODE": "VACC123",
156+
"VACCINE_PRODUCT_TERM": "Vaccine Term",
157+
"VACCINE_MANUFACTURER": "Manufacturer XYZ",
158+
"BATCH_NUMBER": "BATCH001",
159+
"EXPIRY_DATE": "20241202",
160160
"SITE_OF_VACCINATION_CODE": "368208006",
161161
"SITE_OF_VACCINATION_TERM": "Left upper arm structure (body structure)",
162162
"ROUTE_OF_VACCINATION_CODE": "78421000",
163163
"ROUTE_OF_VACCINATION_TERM": "Intramuscular route (qualifier value)",
164164
"DOSE_AMOUNT": "0.5",
165-
"DOSE_UNIT_CODE": "ml",
165+
"DOSE_UNIT_CODE": "258773002",
166166
"DOSE_UNIT_TERM": "milliliter",
167167
"INDICATION_CODE": "443684005",
168-
"LOCATION_CODE": "EC1111",
168+
"LOCATION_CODE": "X99999",
169169
"LOCATION_CODE_TYPE_URI": "https://fhir.nhs.uk/Id/ods-organization-code",
170170
}
Lines changed: 73 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,78 @@
1+
import copy
2+
import datetime
13
import unittest
4+
from unittest.mock import patch
5+
6+
from common.data_quality.checker import DataQualityChecker
7+
from common.data_quality.completeness import DataQualityCompletenessChecker
8+
from test_common.data_quality.sample_values import VALID_BATCH_IMMUNISATION
29

310

411
class TestDataQualityChecker(unittest.TestCase):
    """Tests for the validity checks run by DataQualityChecker against batch CSV immunisation rows."""

    def setUp(self):
        # Fix date.today() for all validation tests
        date_today_patcher = patch("common.data_quality.models.immunization_batch_row_model.datetime", wraps=datetime)
        self.mock_date_today = date_today_patcher.start()
        # Stop the patcher after every test (even a failing one) so the mocked datetime module cannot leak into
        # other tests that run in the same process
        self.addCleanup(date_today_patcher.stop)
        self.mock_date_today.date.today.return_value = datetime.date(2024, 5, 20)

        completeness_checker = DataQualityCompletenessChecker()
        self.batch_dq_checker = DataQualityChecker(completeness_checker, is_batch_csv=True)
        self.fhir_json_dq_checker = DataQualityChecker(completeness_checker, is_batch_csv=False)

    def test_check_validity_returns_empty_list_when_data_is_valid(self):
        validation_result = self.batch_dq_checker._check_validity(VALID_BATCH_IMMUNISATION)

        self.assertEqual([], validation_result)

    def test_check_validity_returns_list_of_invalid_fields_when_invalid_data_provided(self):
        # Each case overrides a single field of the valid payload, so exactly that field should be reported
        test_cases = [
            ("NHS_NUMBER", "1234"),  # Failing min length
            ("NHS_NUMBER", "1234543543543543534"),  # Failing max length
            ("NHS_NUMBER", "900000AB09"),  # Failing digit only check
            ("PERSON_DOB", "18990101"),  # Prior to min accepted past date
            ("PERSON_DOB", "20240137"),  # Invalid date
            ("PERSON_DOB", "20240624"),  # Past dates only
            ("DATE_AND_TIME", "17000511T120000"),  # Prior to min accepted past date
            ("DATE_AND_TIME", "20241511T120000"),  # Invalid datetime
            ("DATE_AND_TIME", "20241511T120"),  # Invalid datetime
            ("DATE_AND_TIME", "20240520T120001"),  # Past dates only
            ("PERSON_POSTCODE", "AAA12 3B"),
            ("EXPIRY_DATE", "18990101"),  # Prior to min accepted expiry date
            ("EXPIRY_DATE", "20240137"),  # Invalid date
            ("EXPIRY_DATE", "20250521"),  # Expiry greater than a year away
            ("DOSE_AMOUNT", "abd"),  # Not a decimal value
            ("DOSE_AMOUNT", "5.67"),  # Decimal value but not in the permitted list of values
            ("SITE_OF_VACCINATION_CODE", "1254"),  # Fails snomed code min length
            ("SITE_OF_VACCINATION_CODE", "12321432543543543534"),  # Fails snomed code max length
            ("SITE_OF_VACCINATION_CODE", "18756hg098"),  # Fails regex
            ("ROUTE_OF_VACCINATION_CODE", "1254"),  # Fails snomed code min length
            ("ROUTE_OF_VACCINATION_CODE", "12321432543543543534"),  # Fails snomed code max length
            ("ROUTE_OF_VACCINATION_CODE", "18756hg098"),  # Fails regex
            ("DOSE_UNIT_CODE", "415818088"),  # Dose unit code not in the enums
            ("INDICATION_CODE", "1254"),  # Fails snomed code min length
            ("INDICATION_CODE", "12321432543543543534"),  # Fails snomed code max length
            ("INDICATION_CODE", "18756hg098"),  # Fails regex
        ]

        for field, failing_value in test_cases:
            with self.subTest(field=field, failing_value=failing_value):
                invalid_batch_imms_payload = copy.deepcopy(VALID_BATCH_IMMUNISATION)
                invalid_batch_imms_payload[field] = failing_value

                validation_result = self.batch_dq_checker._check_validity(invalid_batch_imms_payload)

                self.assertEqual([field], validation_result)

    def test_check_validity_returns_list_of_multiple_invalid_fields_for_multiple_failures(self):
        invalid_batch_imms_payload = copy.deepcopy(VALID_BATCH_IMMUNISATION)
        invalid_batch_imms_payload["NHS_NUMBER"] = "12345678901"
        invalid_batch_imms_payload["EXPIRY_DATE"] = "20240137"
        invalid_batch_imms_payload["PERSON_POSTCODE"] = "12 ACX"
        invalid_batch_imms_payload["DOSE_AMOUNT"] = "6.789"
        invalid_batch_imms_payload["INDICATION_CODE"] = "123"

        validation_result = self.batch_dq_checker._check_validity(invalid_batch_imms_payload)

        self.assertEqual(
            ["NHS_NUMBER", "PERSON_POSTCODE", "EXPIRY_DATE", "DOSE_AMOUNT", "INDICATION_CODE"], validation_result
        )

0 commit comments

Comments
 (0)