+from datetime import datetime, timezone
+from typing import Any
+
+import boto3
+import fire
+
+from nrlf.core.logger import logger
+
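+# Module-level DynamoDB client and a reusable paginator for the Scan operation;
+# credentials and region are assumed to come from the standard AWS environment.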
+dynamodb = boto3.client("dynamodb")
+paginator = dynamodb.get_paginator("scan")
+
+logger.setLevel("ERROR")
+
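+# Attributes projected from each scanned item: enough to build the duplicate key
+# (nhs_number + custodian + type_id) and to describe each pointer in the output.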
+REQUIRED_ATTRIBUTES = [
+    "nhs_number",
+    "custodian",
+    "id",
+    "master_identifier",
+    "type_id",
+    "created_on",
+]
+
+
+def _get_duplicates(table_name: str, custodians: str | tuple[str, ...]) -> dict[str, Any]:
+    """
+    Find duplicate pointers (and their master identifiers) in the given table
+    for a list of custodians.
+
+    Parameters:
+    - table_name: The name of the pointers table to scan.
+    - custodians: Custodian ODS codes, either comma-separated or as a tuple.
+    """
+    custodian_list = (
+        custodians.split(",") if isinstance(custodians, str) else list(custodians)
+    )
+
+    print(  # noqa
+        f"Finding duplicate pointers for custodians {custodian_list} in table {table_name}..."
+    )
+
+    # Build one expression value placeholder per custodian for the IN filter,
+    # e.g. ":paramABC" -> {"S": "ABC"}. Custodian codes are assumed to be
+    # alphanumeric so that they are valid in placeholder names.
+    value_placeholders_str = ",".join(
+        [f":param{custodian}" for custodian in custodian_list]
+    )
+    expression_values = {
+        f":param{custodian}": {"S": custodian} for custodian in custodian_list
+    }
+
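+    # Scan parameters: filter on the custodian list server-side and project only
+    # the required attributes, reading 50 items per page.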
+    params: dict[str, Any] = {
+        "TableName": table_name,
+        "PaginationConfig": {"PageSize": 50},
+        "FilterExpression": f"custodian IN ({value_placeholders_str})",
+        "ExpressionAttributeValues": expression_values,
+        "ProjectionExpression": ",".join(REQUIRED_ATTRIBUTES),
+    }
+    pointers_by_key = {}
+    total_scanned_count = 0
+    duplicate_count = 0
+    duplicates_set = set()
+
+    start_time = datetime.now(tz=timezone.utc)
+
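+    # Paginate through the full table scan, grouping pointers that share the same
+    # patient / custodian / pointer-type key; a key seen more than once is a duplicate.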
+    for page in paginator.paginate(**params):
+        for item in page["Items"]:
+            pointer_id = item.get("id", {}).get("S", "no-id")
+            pointer_type = item.get("type_id", {}).get("S", "no-type")
+            master_id = item.get("master_identifier", {}).get("S", "no-master-id")
+            custodian = item.get("custodian", {}).get("S", "no-custodian")
+            patient_id = item.get("nhs_number", {}).get("S", "no-patient-id")
+            created_on = item.get("created_on", {}).get("S", "no-creation-datetime")
+
+            pointer_data = {
+                "id": pointer_id,
+                "master_id": master_id,
+                "datetime": created_on,
+            }
+
+            px_type_ods_key = f"{patient_id}-{custodian}-{pointer_type}"
+
+            if px_type_ods_key not in pointers_by_key:
+                pointers_by_key[px_type_ods_key] = [pointer_data]
+            else:
+                pointers_by_key[px_type_ods_key].append(pointer_data)
+                duplicate_count += 1
+                duplicates_set.add(px_type_ods_key)
+
+        total_scanned_count += page["ScannedCount"]
+
+        if total_scanned_count % 1000 == 0:
+            print(".", end="", flush=True)  # noqa
+
+        if total_scanned_count % 100000 == 0:
+            print(  # noqa
+                f"scanned={total_scanned_count} found={duplicate_count} potential duplicates"
+            )
+
+    end_time = datetime.now(tz=timezone.utc)
+
+    print(" Table scan completed")  # noqa
+
+    for key in duplicates_set:
+        print(f"Duplicates for {key}:")  # noqa
+        print(pointers_by_key[key])  # noqa
+
+    return {
+        "duplicates-found": duplicate_count,
+        "scanned-count": total_scanned_count,
+        "took-secs": (end_time - start_time).total_seconds(),
+    }
+
+
+if __name__ == "__main__":
+    fire.Fire(_get_duplicates)
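+# Example invocation via the python-fire CLI (the script filename, table name and
+# ODS codes below are illustrative only):
+#   python find_duplicate_pointers.py --table_name=<pointers-table> --custodians="ABC123,DEF456"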