Skip to content

Commit 3a9a707

Browse files
committed
NRL-1509 add devOps script to identify pointers with malformed URLs for specified custodians
1 parent cda4df5 commit 3a9a707

File tree

1 file changed

+98
-0
lines changed

1 file changed

+98
-0
lines changed

reports/find_malformed_urls.py

Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,98 @@
1+
from datetime import datetime, timedelta, timezone
2+
from typing import Any
3+
4+
import boto3
5+
import fire
6+
7+
from nrlf.consumer.fhir.r4.model import DocumentReference
8+
from nrlf.core.logger import logger
9+
from nrlf.core.validators import DocumentReferenceValidator
10+
11+
# Module-level DynamoDB client and its "scan" paginator; the report function
# below iterates the pointers table through this paginator.
dynamodb = boto3.client("dynamodb")
paginator = dynamodb.get_paginator("scan")

# Set the NRLF logger threshold to ERROR.
# NOTE(review): presumably so validator logging does not drown out the
# progress output printed by this script - confirm.
logger.setLevel("ERROR")
15+
16+
17+
def _validate_url(document: str):
    """
    Check a pointer document for truncated retrieval URLs, then for validity.

    Per NRL-1509 improperly truncated pointer URLs end with '/Binary/[ODSCODE]/'
    Properly formed full pointer URLs from these custodians end with '/Binary/[ODSCODE]/[unique-id-string]'

    Only attachments whose content type is "application/pdf" have their URL
    checked. The URL check runs before the general validity check, so a
    malformed URL is reported in preference to other validation issues.

    Parameters:
    - document: The pointer's DocumentReference serialised as a JSON string.

    Raises:
    - RuntimeError: if a PDF attachment URL is missing, empty or ends with "/",
      or if the validator reports the document as invalid.
    - Any exception raised while parsing or validating the document is not
      handled here and propagates to the caller.
    """
    docref = DocumentReference.model_validate_json(document)

    validator = DocumentReferenceValidator()
    result = validator.validate(data=docref)

    # NOTE(review): this iterates `result.content` (the validator result), not
    # `docref.content` - confirm the validator result exposes `content`.
    for content in result.content:
        if content.attachment.contentType != "application/pdf":
            continue

        url = content.attachment.url
        # A missing or empty URL is just as unusable as a truncated one; report
        # it explicitly instead of failing on an index into an empty value.
        if not url or str(url).endswith("/"):
            raise RuntimeError("Malformed URL found: " + str(url))

    if not result.is_valid:
        raise RuntimeError("Failed to validate document: " + str(result.issues))
34+
35+
36+
def _find_malformed_urls(
    table_name: str, custodian_ods_codes: set[str]
) -> dict[str, float | int]:
    """
    Find pointers in the given table that have malformed URLs.
    This is required for NRL-1509 because some suppliers suspect their pointer retrieval URLs may have been improperly truncated.

    The whole table is scanned page by page. For every pointer whose id starts
    with one of the given ODS codes (the part before the first "-"), the stored
    document is checked with _validate_url. Every pointer for which that check
    raises is written, with the error, to ./malformed_pointers.txt.

    Parameters:
    - table_name: The name of the pointers table to use.
    - custodian_ods_codes: Set of ODS codes whose pointers may have malformed URLs.
      A single code passed as a plain string is treated as a one-element set.

    Returns:
    - A dict with the number of malformed pointers found ("malformed_pointers"),
      the number of items scanned ("scanned_count") and the elapsed time in
      seconds ("took-secs").
    """
    # When invoked from the command line a single code may arrive as a bare
    # string; `custodian in "ABC"` would then be a substring test, so normalise
    # the argument to a real set before using it for membership checks.
    if isinstance(custodian_ods_codes, str):
        ods_codes = {custodian_ods_codes}
    else:
        ods_codes = set(custodian_ods_codes)

    print(f"Finding invalid URLs in table {table_name}....")  # noqa

    params: dict[str, Any] = {
        "TableName": table_name,
        "PaginationConfig": {"PageSize": 50},
    }

    malformed_pointers = []
    total_scanned_count = 0

    # Progress cadence: a dot per 1,000 scanned items, a summary per 100,000.
    dot_every = 1000
    summary_every = 100000

    start_time = datetime.now(tz=timezone.utc)

    for page in paginator.paginate(**params):
        for item in page["Items"]:
            pointer_id = item.get("id", {}).get("S")
            if not pointer_id:
                # Without an id the item cannot be attributed to a custodian;
                # skip it rather than aborting the whole scan.
                continue

            # partition() tolerates ids that contain no "-" separator.
            custodian, _, _ = pointer_id.partition("-")

            # only need to check pointers created by the specified custodians
            if custodian not in ods_codes:
                continue

            document = item.get("document", {}).get("S", "")
            try:
                _validate_url(document)
            except Exception as exc:
                # Deliberately broad: any failure (unparseable document,
                # validation failure, malformed URL) is recorded against the
                # pointer so the scan can continue.
                malformed_pointers.append((pointer_id, exc))

        previous_count = total_scanned_count
        total_scanned_count += page["ScannedCount"]

        # Report when a threshold is crossed rather than when the running total
        # is an exact multiple: pages are not guaranteed to hold a full 50
        # items, so an exact-multiple test can silently never fire.
        if total_scanned_count // dot_every > previous_count // dot_every:
            print(".", end="", flush=True)  # noqa

        if total_scanned_count // summary_every > previous_count // summary_every:
            print(  # noqa
                f"scanned={total_scanned_count} invalid={len(malformed_pointers)}"
            )

    end_time = datetime.now(tz=timezone.utc)

    print(" Done")  # noqa

    print("Writing malformed_pointers to file ./malformed_pointers.txt ...")  # noqa
    with open("malformed_pointers.txt", "w", encoding="utf-8") as f:
        f.writelines(f"{_id}: {err}\n" for _id, err in malformed_pointers)

    return {
        "malformed_pointers": len(malformed_pointers),
        "scanned_count": total_scanned_count,
        "took-secs": (end_time - start_time).total_seconds(),
    }
95+
96+
97+
if __name__ == "__main__":
    # Run the report as a command-line tool: python-fire maps the command-line
    # arguments onto the parameters of _find_malformed_urls.
    fire.Fire(_find_malformed_urls)

0 commit comments

Comments
 (0)