Skip to content

Commit 3d8c4e1

Browse files
committed
NRL-1705 find and identify duplicates for a set of custodians
1 parent df3ca99 commit 3d8c4e1

File tree

1 file changed

+111
-0
lines changed

1 file changed

+111
-0
lines changed

reports/find_duplicate_pointers.py

Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,111 @@
1+
from datetime import datetime, timedelta, timezone
2+
from typing import Any
3+
4+
import boto3
5+
import fire
6+
7+
from nrlf.core.logger import logger
8+
9+
# Shared low-level DynamoDB client and the "scan" paginator derived from it;
# _get_duplicates walks the table through this paginator.
dynamodb = boto3.client("dynamodb")
paginator = dynamodb.get_paginator("scan")

# Keep the shared logger quiet: only errors are emitted while the report runs.
logger.setLevel("ERROR")
13+
14+
# Item attributes requested from the table for every pointer; the list is
# joined into the scan's ProjectionExpression.
REQUIRED_ATTRIBUTES = [
    "nhs_number",
    "custodian",
    "id",
    "master_identifier",
    "type_id",
    "created_on",
]
22+
23+
24+
def _normalise_custodians(custodians: str | tuple[str]) -> list[str]:
    """Return the requested custodian codes as a clean, de-duplicated list."""
    if isinstance(custodians, (list, tuple, set, frozenset)):
        raw_codes = [str(code) for code in custodians]
    else:
        # A single value (normally a comma-separated string). str() guards
        # against a CLI layer handing over a non-string scalar.
        raw_codes = str(custodians).split(",")

    # Strip stray whitespace ("A, B") and drop empty entries ("A,,B" or "").
    stripped_codes = (code.strip() for code in raw_codes)
    # dict.fromkeys de-duplicates while keeping the caller's order.
    return list(dict.fromkeys(code for code in stripped_codes if code))


def _build_scan_params(table_name: str, custodian_list: list[str]) -> dict[str, Any]:
    """Build the paginated scan arguments that filter on the given custodians."""
    # Placeholders are numbered instead of being derived from the custodian
    # value, so a code containing characters that are not legal in an
    # expression attribute value name cannot break the filter expression.
    placeholders = {
        f":custodian{index}": {"S": custodian}
        for index, custodian in enumerate(custodian_list)
    }
    return {
        "TableName": table_name,
        "PaginationConfig": {"PageSize": 50},
        "FilterExpression": f"custodian IN ({','.join(placeholders)})",
        "ExpressionAttributeValues": placeholders,
        "ProjectionExpression": ",".join(REQUIRED_ATTRIBUTES),
    }


def _string_attribute(item: dict[str, Any], name: str, default: str) -> str:
    """Read the string ("S") value of a DynamoDB item attribute, or a default."""
    return item.get(name, {}).get("S", default)


def _get_duplicates(table_name: str, custodians: str | tuple[str]) -> Any:
    """
    Find duplicate pointers in the given table for a list of custodians.

    The table is scanned with a filter on the custodian attribute. Matching
    pointers are grouped by patient (nhs_number), custodian and pointer type
    (type_id); every group holding more than one pointer is printed as a set
    of duplicates.

    Parameters:
    - table_name: The name of the pointers table to use.
    - custodians: The custodian codes to check, either as one comma-separated
      string or as a sequence of codes. Surrounding whitespace, empty entries
      and repeated codes are ignored.

    Returns:
    A dict with "duplicates-found" (number of pointers that landed in an
    already-populated group), "scanned-count" (sum of ScannedCount over all
    pages) and "took-secs" (wall-clock duration of the scan in seconds).

    Raises:
    - ValueError: if no usable custodian code was supplied.
    """
    custodian_list = _normalise_custodians(custodians)
    if not custodian_list:
        # An empty IN (...) list is not a valid filter; fail early and clearly.
        raise ValueError("At least one custodian must be provided")

    print(  # noqa
        f"Finding duplicate pointers for custodians {custodian_list} in table {table_name}...."
    )

    params = _build_scan_params(table_name, custodian_list)
    pointers_by_key: dict[str, list[dict[str, str]]] = {}
    total_scanned_count = 0
    duplicate_count = 0

    start_time = datetime.now(tz=timezone.utc)

    for page in paginator.paginate(**params):
        for item in page["Items"]:
            patient_id = _string_attribute(item, "nhs_number", "no-patient-id")
            custodian = _string_attribute(item, "custodian", "no-custodian")
            pointer_type = _string_attribute(item, "type_id", "no-type")

            pointer_data = {
                "id": _string_attribute(item, "id", "no-id"),
                "master_id": _string_attribute(
                    item, "master_identifier", "no-master-id"
                ),
                "datetime": _string_attribute(
                    item, "created_on", "no-creation-datetime"
                ),
            }

            px_type_ods_key = f"{patient_id}-{custodian}-{pointer_type}"

            group = pointers_by_key.setdefault(px_type_ods_key, [])
            if group:
                # Every pointer after the first one in a group is a duplicate.
                duplicate_count += 1
            group.append(pointer_data)

        previous_scanned_count = total_scanned_count
        total_scanned_count += page["ScannedCount"]

        # Report progress whenever a threshold is crossed. Pages are not
        # guaranteed to scan a fixed number of items, so testing for an exact
        # multiple could silently skip progress output.
        if total_scanned_count // 1000 > previous_scanned_count // 1000:
            print(".", end="", flush=True)  # noqa

        if total_scanned_count // 100000 > previous_scanned_count // 100000:
            print(  # noqa
                f"scanned={total_scanned_count} found={duplicate_count} potential duplicates "
            )

    end_time = datetime.now(tz=timezone.utc)

    print(" Table scan completed")  # noqa

    for key, pointers in pointers_by_key.items():
        if len(pointers) > 1:
            print(f"Duplicates for {key}:")  # noqa
            print(pointers)  # noqa

    return {
        "duplicates-found": duplicate_count,
        "scanned-count": total_scanned_count,
        "took-secs": (end_time - start_time).total_seconds(),
    }
108+
109+
110+
if __name__ == "__main__":
    # Run as a script: python-fire exposes _get_duplicates' parameters
    # (table_name, custodians) as command-line arguments.
    fire.Fire(_get_duplicates)

0 commit comments

Comments
 (0)