22
33import itertools
44import logging
5+ import warnings
56from functools import wraps
67from typing import (
78 TYPE_CHECKING ,
@@ -164,7 +165,8 @@ def _convert_items(
164165 mapping = [
165166 {k : v .value if isinstance (v , Binary ) else v for k , v in d .items ()} # type: ignore[attr-defined]
166167 for d in items
167- ]
168+ ],
169+ schema = arrow_kwargs .pop ("schema" , None ),
168170 )
169171 ],
170172 arrow_kwargs ,
@@ -187,6 +189,7 @@ def _read_scan_chunked(
187189 dynamodb_client : Optional ["DynamoDBClient" ],
188190 as_dataframe : bool ,
189191 kwargs : Dict [str , Any ],
192+ schema : Optional [pa .Schema ] = None ,
190193 segment : Optional [int ] = None ,
191194) -> Union [Iterator [pa .Table ], Iterator [_ItemsListType ]]:
192195 # SEE: https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/Scan.html#Scan.ParallelScan
@@ -210,7 +213,7 @@ def _read_scan_chunked(
210213 for d in response .get ("Items" , [])
211214 ]
212215 total_items += len (items )
213- yield _utils .list_to_arrow_table (mapping = items ) if as_dataframe else items
216+ yield _utils .list_to_arrow_table (mapping = items , schema = schema ) if as_dataframe else items
214217
215218 if ("Limit" in kwargs ) and (total_items >= kwargs ["Limit" ]):
216219 break
@@ -229,13 +232,14 @@ def _read_scan(
229232 dynamodb_client : Optional ["DynamoDBClient" ],
230233 as_dataframe : bool ,
231234 kwargs : Dict [str , Any ],
235+ schema : Optional [pa .Schema ],
232236 segment : int ,
233237) -> Union [pa .Table , _ItemsListType ]:
234- items_iterator : Iterator [_ItemsListType ] = _read_scan_chunked (dynamodb_client , False , kwargs , segment )
238+ items_iterator : Iterator [_ItemsListType ] = _read_scan_chunked (dynamodb_client , False , kwargs , None , segment )
235239
236240 items = list (itertools .chain .from_iterable (items_iterator ))
237241
238- return _utils .list_to_arrow_table (mapping = items ) if as_dataframe else items
242+ return _utils .list_to_arrow_table (mapping = items , schema = schema ) if as_dataframe else items
239243
240244
241245def _read_query_chunked (
@@ -326,10 +330,11 @@ def _read_items_scan(
326330
327331 kwargs = _serialize_kwargs (kwargs )
328332 kwargs ["TableName" ] = table_name
333+ schema = arrow_kwargs .pop ("schema" , None )
329334
330335 if chunked :
331336 _logger .debug ("Scanning DynamoDB table %s and returning results in an iterator" , table_name )
332- scan_iterator = _read_scan_chunked (dynamodb_client , as_dataframe , kwargs )
337+ scan_iterator = _read_scan_chunked (dynamodb_client , as_dataframe , kwargs , schema )
333338 if as_dataframe :
334339 return (_utils .table_refs_to_df ([items ], arrow_kwargs ) for items in scan_iterator )
335340
@@ -347,6 +352,7 @@ def _read_items_scan(
347352 dynamodb_client ,
348353 itertools .repeat (as_dataframe ),
349354 itertools .repeat (kwargs ),
355+ itertools .repeat (schema ),
350356 range (total_segments ),
351357 )
352358
@@ -400,6 +406,10 @@ def _read_items(
400406 items = _read_query (table_name , chunked , boto3_session , ** kwargs )
401407 else :
402408 # Last resort use Scan
409+ warnings .warn (
410+ f"Attempting DynamoDB Scan operation with arguments:\n { kwargs } " ,
411+ UserWarning ,
412+ )
403413 return _read_items_scan (
404414 table_name = table_name ,
405415 as_dataframe = as_dataframe ,
@@ -450,6 +460,11 @@ def read_items( # pylint: disable=too-many-branches
450460 Under the hood, it wraps all the four available read actions: `get_item`, `batch_get_item`,
451461 `query` and `scan`.
452462
463+ Warning
464+ -------
465+ To avoid a potentially costly Scan operation, please make sure to pass arguments such as
466+ `partition_values` or `max_items_evaluated`. Note that `filter_expression` is applied only AFTER the Scan has read the items.
467+
453468 Note
454469 ----
455470 Number of Parallel Scan segments is based on the `use_threads` argument.
@@ -581,6 +596,7 @@ def read_items( # pylint: disable=too-many-branches
581596 ... )
582597
583598 Reading items matching a FilterExpression expressed with boto3.dynamodb.conditions.Attr
599+ Note that the FilterExpression is applied AFTER the Scan operation, so every item is still read.
584600
585601 >>> import awswrangler as wr
586602 >>> from boto3.dynamodb.conditions import Attr
0 commit comments