|
| 1 | +import json |
| 2 | +import os |
| 3 | +from datetime import datetime, timedelta, timezone |
| 4 | +from typing import Any |
| 5 | + |
| 6 | +import boto3 |
| 7 | +import fire |
| 8 | + |
# Patient identifiers are exported only when the INCLUDE_PATIENT_IDS
# environment variable is set to "true" (compared case-insensitively).
INCLUDE_PATIENT_IDS = os.getenv("INCLUDE_PATIENT_IDS", "false").lower() == "true"

# Module-level DynamoDB client and the Scan paginator obtained from it.
dynamodb = boto3.client("dynamodb")
paginator = dynamodb.get_paginator("scan")
| 13 | + |
| 14 | + |
| 15 | +def _get_masterids_for_custodians(table_name: str, custodians: str | tuple[str]) -> Any: |
| 16 | + """ |
| 17 | + Get masterids for pointers in the given table for a list of custodians. |
| 18 | + Parameters: |
| 19 | + - table_name: The name of the pointers table to use. |
| 20 | + """ |
| 21 | + custodian_list = ( |
| 22 | + custodians.split(",") if isinstance(custodians, str) else list(custodians) |
| 23 | + ) |
| 24 | + |
| 25 | + print( # noqa |
| 26 | + f"Getting masterids for custodians {custodian_list} in table {table_name}...." |
| 27 | + ) |
| 28 | + |
| 29 | + required_attributes = ["id", "type_id", "master_identifier", "custodian"] |
| 30 | + if INCLUDE_PATIENT_IDS: |
| 31 | + required_attributes.append("nhs_number") |
| 32 | + |
| 33 | + expression_names_str = ",".join( |
| 34 | + [f":param{custodian}" for custodian in custodian_list] |
| 35 | + ) |
| 36 | + expression_values_list = { |
| 37 | + f":param{custodian}": {"S": custodian} for custodian in custodian_list |
| 38 | + } |
| 39 | + |
| 40 | + params: dict[str, Any] = { |
| 41 | + "TableName": table_name, |
| 42 | + "PaginationConfig": {"PageSize": 50}, |
| 43 | + "FilterExpression": f"custodian IN ({expression_names_str})", |
| 44 | + "ExpressionAttributeValues": expression_values_list, |
| 45 | + "ProjectionExpression": ",".join(required_attributes), |
| 46 | + } |
| 47 | + |
| 48 | + pointers_info: list[dict[str, str]] = [] |
| 49 | + total_scanned_count = 0 |
| 50 | + |
| 51 | + start_time = datetime.now(tz=timezone.utc) |
| 52 | + |
| 53 | + for page in paginator.paginate(**params): |
| 54 | + for item in page["Items"]: |
| 55 | + pointer_id = item.get("id", {}).get("S", "no-id") |
| 56 | + pointer_type = item.get("type_id", {}).get("S", "no-type") |
| 57 | + master_id = item.get("master_identifier", {}).get("S", "no-master-id") |
| 58 | + custodian = item.get("custodian", {}).get("S", "no-custodian") |
| 59 | + patient_id = item.get("nhs_number", {}).get("S", "no-patient-id") |
| 60 | + |
| 61 | + pointers_info.append( |
| 62 | + { |
| 63 | + "nrl-id": pointer_id, |
| 64 | + "pointer-type": pointer_type, |
| 65 | + "master_identifier": master_id, |
| 66 | + "custodian": custodian, |
| 67 | + "patient_id": patient_id if INCLUDE_PATIENT_IDS else "not-included", |
| 68 | + } |
| 69 | + ) |
| 70 | + |
| 71 | + total_scanned_count += page["ScannedCount"] |
| 72 | + |
| 73 | + if total_scanned_count % 1000 == 0: |
| 74 | + print(".", end="", flush=True) # noqa |
| 75 | + |
| 76 | + if total_scanned_count % 100000 == 0: |
| 77 | + print(f"scanned={total_scanned_count} found={len(pointers_info)} ") # noqa |
| 78 | + |
| 79 | + end_time = datetime.now(tz=timezone.utc) |
| 80 | + |
| 81 | + print(" Done") # noqa |
| 82 | + |
| 83 | + print(f"Writing pointers to file ./pointer-masterids.txt ...") # noqa |
| 84 | + with open("pointer-masterids.txt", "w") as f: |
| 85 | + f.write(json.dumps(pointers_info, indent=2)) |
| 86 | + |
| 87 | + return { |
| 88 | + "output-file": "pointer-masterids.txt", |
| 89 | + "pointers-found": len(pointers_info), |
| 90 | + "scanned-count": total_scanned_count, |
| 91 | + "took-secs": timedelta.total_seconds(end_time - start_time), |
| 92 | + } |
| 93 | + |
| 94 | + |
# Script entry point: hand the function to python-fire, which builds the
# command-line interface from it.
if __name__ == "__main__":
    fire.Fire(component=_get_masterids_for_custodians)
0 commit comments