|
| 1 | +import json |
| 2 | +from datetime import datetime, timedelta, timezone |
| 3 | +from typing import Any |
| 4 | + |
| 5 | +import boto3 |
| 6 | +import fire |
| 7 | + |
# Module-level DynamoDB client and its Scan paginator. Both are created once
# at import time; the paginator is what the scan loop below iterates over.
dynamodb = boto3.client(service_name="dynamodb")
paginator = dynamodb.get_paginator(operation_name="scan")
| 10 | + |
| 11 | + |
def _get_masterids_for_custodians(
    table_name: str, custodians: "str | list[str] | tuple[str, ...]"
) -> dict[str, Any]:
    """
    Scan a pointers table and dump the id, type and master identifier of
    every pointer whose custodian is one of the given custodians.

    The matching pointers are written as a JSON array to
    ./pointer-masterids.txt in the current working directory.

    Parameters:
    - table_name: The name of the pointers table to use.
    - custodians: The custodian codes to match. Either a single string
      holding one code or several comma-separated codes ("X26" or
      "X26,Y05"), or a list/tuple of codes. Blank entries and duplicates
      are ignored.

    Returns:
    A summary dict with the keys "output-file", "pointers-found",
    "scanned-count" and "took-secs".

    Raises:
    - ValueError: if no custodian code is left after normalisation.
    """

    # A bare string must not be iterated directly: that would yield single
    # characters rather than custodian codes.
    if isinstance(custodians, str):
        raw_codes = custodians.split(",")
    else:
        # str() guards against command-line parsers that turn purely numeric
        # codes into ints.
        raw_codes = [str(code) for code in custodians]

    # dict.fromkeys de-duplicates while keeping the caller's order.
    custodian_codes = list(
        dict.fromkeys(code.strip() for code in raw_codes if code.strip())
    )
    if not custodian_codes:
        # An empty list would produce the invalid expression "custodian IN ()".
        raise ValueError("At least one custodian must be supplied")

    print(  # noqa
        f"Getting masterids for custodians {custodians} in table {table_name}...."
    )

    # Placeholder names are index-based so they stay valid whatever
    # characters the custodian codes contain.
    # NOTE(review): DynamoDB documents a cap on the number of operands of IN
    # (100 at the time of writing) - confirm before passing very long lists.
    expression_values = {
        f":custodian_{index}": {"S": code}
        for index, code in enumerate(custodian_codes)
    }

    params: dict[str, Any] = {
        "TableName": table_name,
        "PaginationConfig": {"PageSize": 50},
        "FilterExpression": f"custodian IN ({', '.join(expression_values)})",
        "ExpressionAttributeValues": expression_values,
        "ProjectionExpression": "id, type_id, master_identifier",
    }

    pointers_info: list[dict[str, str]] = []
    total_scanned_count = 0

    start_time = datetime.now(tz=timezone.utc)

    for page in paginator.paginate(**params):
        pointers_info.extend(
            {
                "nrl-id": item.get("id", {}).get("S", "no-id"),
                "pointer-type": item.get("type_id", {}).get("S", "no-type"),
                "master_identifier": item.get("master_identifier", {}).get(
                    "S", "no-master-id"
                ),
            }
            for item in page["Items"]
        )

        previous_scanned_count = total_scanned_count
        total_scanned_count += page["ScannedCount"]

        # Report progress whenever a 1,000 / 100,000 boundary is crossed.
        # Testing for an exact multiple would miss the boundary as soon as
        # one page scans fewer items than the page size.
        if total_scanned_count // 1000 > previous_scanned_count // 1000:
            print(".", end="", flush=True)  # noqa

        if total_scanned_count // 100000 > previous_scanned_count // 100000:
            print(f"scanned={total_scanned_count} found={len(pointers_info)} ")  # noqa

    end_time = datetime.now(tz=timezone.utc)

    print(" Done")  # noqa

    output_file = "pointer-masterids.txt"

    print(f"Writing pointers to file ./{output_file} ...")  # noqa
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(pointers_info, f, indent=2)

    return {
        "output-file": output_file,
        "pointers-found": len(pointers_info),
        "scanned-count": total_scanned_count,
        "took-secs": (end_time - start_time).total_seconds(),
    }
| 77 | + |
| 78 | + |
if __name__ == "__main__":
    # Expose the scan function as a command-line interface through
    # python-fire; its parameters are supplied as command-line arguments.
    fire.Fire(component=_get_masterids_for_custodians)
0 commit comments