|
| 1 | +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. |
| 2 | +# SPDX-License-Identifier: MIT-0 |
| 3 | + |
| 4 | +""" |
| 5 | +Robust list deletion utilities for DynamoDB document tracking. |
| 6 | +
|
| 7 | +This module provides robust deletion functions that handle timestamp mismatches |
| 8 | +between document records and list entries, preventing orphaned list entries. |
| 9 | +""" |
| 10 | + |
| 11 | +import logging |
| 12 | +from typing import Dict, Any, Optional, List |
| 13 | +from boto3.dynamodb.conditions import Key |
| 14 | + |
| 15 | +logger = logging.getLogger(__name__) |
| 16 | + |
| 17 | + |
| 18 | +def calculate_shard(timestamp: str) -> tuple[str, str]: |
| 19 | + """ |
| 20 | + Calculate shard information from timestamp. |
| 21 | + |
| 22 | + Args: |
| 23 | + timestamp: ISO timestamp string (e.g., "2025-09-10T12:03:27.256164+00:00") |
| 24 | + |
| 25 | + Returns: |
| 26 | + tuple: (date_part, shard_str) where shard_str is 2-digit padded |
| 27 | + |
| 28 | + Raises: |
| 29 | + ValueError: If timestamp format is invalid |
| 30 | + """ |
| 31 | + if not timestamp or not isinstance(timestamp, str): |
| 32 | + raise ValueError(f"Invalid timestamp: must be a non-empty string, got {type(timestamp)}") |
| 33 | + |
| 34 | + if 'T' not in timestamp: |
| 35 | + raise ValueError(f"Invalid timestamp format: missing 'T' separator, got {timestamp}") |
| 36 | + |
| 37 | + try: |
| 38 | + date_part = timestamp.split('T')[0] # e.g., 2025-09-10 |
| 39 | + time_part = timestamp.split('T')[1] |
| 40 | + |
| 41 | + if ':' not in time_part: |
| 42 | + raise ValueError(f"Invalid time format: missing ':' separator, got {time_part}") |
| 43 | + |
| 44 | + hour_part = int(time_part.split(':')[0]) # e.g., 12 |
| 45 | + |
| 46 | + # Validate hour range |
| 47 | + if not 0 <= hour_part <= 23: |
| 48 | + raise ValueError(f"Invalid hour: must be 0-23, got {hour_part}") |
| 49 | + |
| 50 | + # Calculate shard (6 shards per day = 4 hours each) |
| 51 | + hours_in_shard = 24 / 6 |
| 52 | + shard = int(hour_part / hours_in_shard) |
| 53 | + shard_str = f"{shard:02d}" # Format with leading zero |
| 54 | + |
| 55 | + return date_part, shard_str |
| 56 | + |
| 57 | + except (ValueError, IndexError) as e: |
| 58 | + if "Invalid" in str(e): |
| 59 | + raise # Re-raise our custom validation errors |
| 60 | + raise ValueError(f"Invalid timestamp format: {timestamp}, error: {str(e)}") |
| 61 | + |
| 62 | + |
| 63 | +def try_exact_list_deletion(tracking_table, list_pk: str, list_sk: str, object_key: str) -> bool: |
| 64 | + """ |
| 65 | + Attempt to delete list entry with exact timestamp match. |
| 66 | + |
| 67 | + Args: |
| 68 | + tracking_table: DynamoDB table resource |
| 69 | + list_pk: Primary key of list entry |
| 70 | + list_sk: Sort key of list entry |
| 71 | + object_key: Document object key for logging |
| 72 | + |
| 73 | + Returns: |
| 74 | + bool: True if successful, False if not found |
| 75 | + """ |
| 76 | + try: |
| 77 | + logger.info(f"Trying exact deletion - PK={list_pk}, SK={list_sk}") |
| 78 | + result = tracking_table.delete_item( |
| 79 | + Key={ |
| 80 | + 'PK': list_pk, |
| 81 | + 'SK': list_sk |
| 82 | + }, |
| 83 | + ReturnValues='ALL_OLD' |
| 84 | + ) |
| 85 | + |
| 86 | + if 'Attributes' in result: |
| 87 | + logger.info(f"Successfully deleted list entry with exact match: {result['Attributes']}") |
| 88 | + return True |
| 89 | + else: |
| 90 | + logger.warning(f"No list entry found with exact match: PK={list_pk}, SK={list_sk}") |
| 91 | + return False |
| 92 | + except Exception as e: |
| 93 | + logger.error(f"Error in exact list deletion for {object_key}: {str(e)}") |
| 94 | + return False |
| 95 | + |
| 96 | + |
| 97 | +def query_shard_for_object_key(tracking_table, list_pk: str, object_key: str) -> List[Dict[str, Any]]: |
| 98 | + """ |
| 99 | + Query a shard for any list entries containing the specified object key. |
| 100 | + Uses DynamoDB filter expressions for efficiency. |
| 101 | + |
| 102 | + Args: |
| 103 | + tracking_table: DynamoDB table resource |
| 104 | + list_pk: Primary key of the shard |
| 105 | + object_key: Document object key to search for |
| 106 | + |
| 107 | + Returns: |
| 108 | + List[Dict]: List of matching DynamoDB items |
| 109 | + """ |
| 110 | + try: |
| 111 | + logger.info(f"Querying shard {list_pk} for entries containing ObjectKey: {object_key}") |
| 112 | + |
| 113 | + # Use filter expression to efficiently query only matching items |
| 114 | + # This prevents loading all shard items and reduces pagination issues |
| 115 | + response = tracking_table.query( |
| 116 | + KeyConditionExpression=Key('PK').eq(list_pk), |
| 117 | + FilterExpression="ObjectKey = :obj_key OR contains(SK, :obj_id)", |
| 118 | + ExpressionAttributeValues={ |
| 119 | + ':obj_key': object_key, |
| 120 | + ':obj_id': f"#id#{object_key}" |
| 121 | + } |
| 122 | + ) |
| 123 | + |
| 124 | + matching_items = [] |
| 125 | + if 'Items' in response: |
| 126 | + for item in response['Items']: |
| 127 | + matching_items.append(item) |
| 128 | + logger.info(f"Found matching list entry: PK={item.get('PK')}, SK={item.get('SK')}") |
| 129 | + |
| 130 | + # Handle pagination if needed (though filter should make this rare) |
| 131 | + while 'LastEvaluatedKey' in response: |
| 132 | + logger.info(f"Handling pagination for shard query, continuing from: {response['LastEvaluatedKey']}") |
| 133 | + response = tracking_table.query( |
| 134 | + KeyConditionExpression=Key('PK').eq(list_pk), |
| 135 | + FilterExpression="ObjectKey = :obj_key OR contains(SK, :obj_id)", |
| 136 | + ExpressionAttributeValues={ |
| 137 | + ':obj_key': object_key, |
| 138 | + ':obj_id': f"#id#{object_key}" |
| 139 | + }, |
| 140 | + ExclusiveStartKey=response['LastEvaluatedKey'] |
| 141 | + ) |
| 142 | + |
| 143 | + if 'Items' in response: |
| 144 | + for item in response['Items']: |
| 145 | + matching_items.append(item) |
| 146 | + logger.info(f"Found matching list entry (paginated): PK={item.get('PK')}, SK={item.get('SK')}") |
| 147 | + |
| 148 | + logger.info(f"Found {len(matching_items)} matching entries in shard {list_pk}") |
| 149 | + return matching_items |
| 150 | + |
| 151 | + except Exception as e: |
| 152 | + logger.error(f"Error querying shard for object key {object_key}: {str(e)}") |
| 153 | + return [] |
| 154 | + |
| 155 | + |
| 156 | +def get_adjacent_shards(date_part: str, shard_str: str) -> List[str]: |
| 157 | + """ |
| 158 | + Get adjacent shard identifiers for edge case handling. |
| 159 | + |
| 160 | + Args: |
| 161 | + date_part: Date string (e.g., "2025-09-10") |
| 162 | + shard_str: Shard string (e.g., "03") |
| 163 | + |
| 164 | + Returns: |
| 165 | + List[str]: List of adjacent shard PKs |
| 166 | + """ |
| 167 | + try: |
| 168 | + current_shard = int(shard_str) |
| 169 | + adjacent_shards = [] |
| 170 | + |
| 171 | + # Previous shard |
| 172 | + if current_shard > 0: |
| 173 | + prev_shard = f"{current_shard - 1:02d}" |
| 174 | + adjacent_shards.append(f"list#{date_part}#s#{prev_shard}") |
| 175 | + |
| 176 | + # Next shard |
| 177 | + if current_shard < 5: # Max shard is 05 |
| 178 | + next_shard = f"{current_shard + 1:02d}" |
| 179 | + adjacent_shards.append(f"list#{date_part}#s#{next_shard}") |
| 180 | + |
| 181 | + logger.info(f"Adjacent shards for {date_part}#s#{shard_str}: {adjacent_shards}") |
| 182 | + return adjacent_shards |
| 183 | + |
| 184 | + except Exception as e: |
| 185 | + logger.error(f"Error calculating adjacent shards: {str(e)}") |
| 186 | + return [] |
| 187 | + |
| 188 | + |
| 189 | +def delete_list_entries_robust(tracking_table, object_key: str, document_metadata: Optional[Dict[str, Any]]) -> bool: |
| 190 | + """ |
| 191 | + Robustly delete list entries for the given object key. |
| 192 | + Uses multiple strategies: exact match, shard query, adjacent shard search. |
| 193 | + |
| 194 | + Args: |
| 195 | + tracking_table: DynamoDB table resource |
| 196 | + object_key: Document object key |
| 197 | + document_metadata: Optional document metadata containing timestamp info |
| 198 | + |
| 199 | + Returns: |
| 200 | + bool: True if any entries were deleted |
| 201 | + """ |
| 202 | + deleted_any = False |
| 203 | + |
| 204 | + # Strategy 1: Try exact timestamp match if we have document metadata |
| 205 | + if document_metadata: |
| 206 | + event_time = None |
| 207 | + if 'QueuedTime' in document_metadata and document_metadata['QueuedTime']: |
| 208 | + event_time = document_metadata['QueuedTime'] |
| 209 | + logger.info(f"Using QueuedTime for exact match: {event_time}") |
| 210 | + elif 'InitialEventTime' in document_metadata and document_metadata['InitialEventTime']: |
| 211 | + event_time = document_metadata['InitialEventTime'] |
| 212 | + logger.info(f"Using InitialEventTime for exact match: {event_time}") |
| 213 | + |
| 214 | + if event_time: |
| 215 | + try: |
| 216 | + date_part, shard_str = calculate_shard(event_time) |
| 217 | + list_pk = f"list#{date_part}#s#{shard_str}" |
| 218 | + list_sk = f"ts#{event_time}#id#{object_key}" |
| 219 | + |
| 220 | + if try_exact_list_deletion(tracking_table, list_pk, list_sk, object_key): |
| 221 | + deleted_any = True |
| 222 | + logger.info("Successfully deleted list entry with exact timestamp match") |
| 223 | + return deleted_any # Success, no need for fallback |
| 224 | + else: |
| 225 | + logger.warning("Exact timestamp match failed, proceeding with fallback strategies") |
| 226 | + except Exception as e: |
| 227 | + logger.error(f"Error in exact timestamp deletion: {str(e)}") |
| 228 | + |
| 229 | + # Strategy 2: Query calculated shard for any entries with matching ObjectKey |
| 230 | + if document_metadata: |
| 231 | + try: |
| 232 | + # Use the same timestamp for shard calculation |
| 233 | + event_time = document_metadata.get('QueuedTime') or document_metadata.get('InitialEventTime') |
| 234 | + if event_time: |
| 235 | + date_part, shard_str = calculate_shard(event_time) |
| 236 | + list_pk = f"list#{date_part}#s#{shard_str}" |
| 237 | + |
| 238 | + matching_entries = query_shard_for_object_key(tracking_table, list_pk, object_key) |
| 239 | + for entry in matching_entries: |
| 240 | + try: |
| 241 | + result = tracking_table.delete_item( |
| 242 | + Key={ |
| 243 | + 'PK': entry['PK'], |
| 244 | + 'SK': entry['SK'] |
| 245 | + }, |
| 246 | + ReturnValues='ALL_OLD' |
| 247 | + ) |
| 248 | + if 'Attributes' in result: |
| 249 | + logger.info(f"Successfully deleted list entry via shard query: {entry['SK']}") |
| 250 | + deleted_any = True |
| 251 | + except Exception as e: |
| 252 | + logger.error(f"Error deleting found list entry: {str(e)}") |
| 253 | + |
| 254 | + # Strategy 3: Check adjacent shards for edge cases |
| 255 | + if not deleted_any: |
| 256 | + logger.info("No entries found in calculated shard, checking adjacent shards") |
| 257 | + adjacent_shards = get_adjacent_shards(date_part, shard_str) |
| 258 | + |
| 259 | + for adj_pk in adjacent_shards: |
| 260 | + matching_entries = query_shard_for_object_key(tracking_table, adj_pk, object_key) |
| 261 | + for entry in matching_entries: |
| 262 | + try: |
| 263 | + result = tracking_table.delete_item( |
| 264 | + Key={ |
| 265 | + 'PK': entry['PK'], |
| 266 | + 'SK': entry['SK'] |
| 267 | + }, |
| 268 | + ReturnValues='ALL_OLD' |
| 269 | + ) |
| 270 | + if 'Attributes' in result: |
| 271 | + logger.info(f"Successfully deleted list entry via adjacent shard: {entry['SK']}") |
| 272 | + deleted_any = True |
| 273 | + except Exception as e: |
| 274 | + logger.error(f"Error deleting adjacent shard entry: {str(e)}") |
| 275 | + except Exception as e: |
| 276 | + logger.error(f"Error in shard query strategies: {str(e)}") |
| 277 | + |
| 278 | + # Strategy 4: Last resort - search recent dates if no metadata available |
| 279 | + if not deleted_any and not document_metadata: |
| 280 | + logger.warning(f"No document metadata available for {object_key}, attempting recent date search") |
| 281 | + # This could be expanded to search recent dates/shards as a last resort |
| 282 | + # For now, just log the limitation |
| 283 | + logger.warning("Cannot delete list entries without any timestamp information") |
| 284 | + |
| 285 | + if deleted_any: |
| 286 | + logger.info(f"Successfully deleted list entries for {object_key}") |
| 287 | + else: |
| 288 | + logger.warning(f"No list entries found or deleted for {object_key}") |
| 289 | + |
| 290 | + return deleted_any |
0 commit comments