fix: support pyarrow schema in DynamoDB read_items #2399 (#2401)

jaidisido · web-flow · commit 31c4bd010248 · 2023-07-21T15:23:42.000+01:00
* fix: support pyarrow schema in DynamoDB read_items #2399 --------- Signed-off-by: Abdel Jaidi <jaidisido@gmail.com>
diff --git a/awswrangler/dynamodb/_read.py b/awswrangler/dynamodb/_read.py
@@ -2,6 +2,7 @@
 
 import itertools
 import logging
+import warnings
 from functools import wraps
 from typing import (
     TYPE_CHECKING,
@@ -164,7 +165,8 @@ def _convert_items(
                     mapping=[
                         {k: v.value if isinstance(v, Binary) else v for k, v in d.items()}  # type: ignore[attr-defined]
                         for d in items
-                    ]
+                    ],
+                    schema=arrow_kwargs.pop("schema", None),
                 )
             ],
             arrow_kwargs,
@@ -187,6 +189,7 @@ def _read_scan_chunked(
     dynamodb_client: Optional["DynamoDBClient"],
     as_dataframe: bool,
     kwargs: Dict[str, Any],
+    schema: Optional[pa.Schema] = None,
     segment: Optional[int] = None,
 ) -> Union[Iterator[pa.Table], Iterator[_ItemsListType]]:
     # SEE: https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/Scan.html#Scan.ParallelScan
@@ -210,7 +213,7 @@ def _read_scan_chunked(
             for d in response.get("Items", [])
         ]
         total_items += len(items)
-        yield _utils.list_to_arrow_table(mapping=items) if as_dataframe else items
+        yield _utils.list_to_arrow_table(mapping=items, schema=schema) if as_dataframe else items
 
         if ("Limit" in kwargs) and (total_items >= kwargs["Limit"]):
             break
@@ -229,13 +232,14 @@ def _read_scan(
     dynamodb_client: Optional["DynamoDBClient"],
     as_dataframe: bool,
     kwargs: Dict[str, Any],
+    schema: Optional[pa.Schema],
     segment: int,
 ) -> Union[pa.Table, _ItemsListType]:
-    items_iterator: Iterator[_ItemsListType] = _read_scan_chunked(dynamodb_client, False, kwargs, segment)
+    items_iterator: Iterator[_ItemsListType] = _read_scan_chunked(dynamodb_client, False, kwargs, None, segment)
 
     items = list(itertools.chain.from_iterable(items_iterator))
 
-    return _utils.list_to_arrow_table(mapping=items) if as_dataframe else items
+    return _utils.list_to_arrow_table(mapping=items, schema=schema) if as_dataframe else items
 
 
 def _read_query_chunked(
@@ -326,10 +330,11 @@ def _read_items_scan(
 
     kwargs = _serialize_kwargs(kwargs)
     kwargs["TableName"] = table_name
+    schema = arrow_kwargs.pop("schema", None)
 
     if chunked:
         _logger.debug("Scanning DynamoDB table %s and returning results in an iterator", table_name)
-        scan_iterator = _read_scan_chunked(dynamodb_client, as_dataframe, kwargs)
+        scan_iterator = _read_scan_chunked(dynamodb_client, as_dataframe, kwargs, schema)
         if as_dataframe:
             return (_utils.table_refs_to_df([items], arrow_kwargs) for items in scan_iterator)
 
@@ -347,6 +352,7 @@ def _read_items_scan(
         dynamodb_client,
         itertools.repeat(as_dataframe),
         itertools.repeat(kwargs),
+        itertools.repeat(schema),
         range(total_segments),
     )
 
@@ -400,6 +406,10 @@ def _read_items(
             items = _read_query(table_name, chunked, boto3_session, **kwargs)
         else:
             # Last resort use Scan
+            warnings.warn(
+                f"Attempting DynamoDB Scan operation with arguments:\n{kwargs}",
+                UserWarning,
+            )
             return _read_items_scan(
                 table_name=table_name,
                 as_dataframe=as_dataframe,
@@ -450,6 +460,11 @@ def read_items(  # pylint: disable=too-many-branches
     Under the hood, it wraps all the four available read actions: `get_item`, `batch_get_item`,
     `query` and `scan`.
 
+    Warning
+    -------
+    To avoid a potentially costly Scan operation, please make sure to pass arguments such as
+    `partition_values` or `max_items_evaluated`. Note that `filter_expression` is applied AFTER a Scan.
+
     Note
     ----
     Number of Parallel Scan segments is based on the `use_threads` argument.
@@ -581,6 +596,7 @@ def read_items(  # pylint: disable=too-many-branches
     ... )
 
     Reading items matching a FilterExpression expressed with boto3.dynamodb.conditions.Attr
+    Note that FilterExpression is applied AFTER a Scan operation.
 
     >>> import awswrangler as wr
     >>> from boto3.dynamodb.conditions import Attr
diff --git a/tests/unit/test_dynamodb.py b/tests/unit/test_dynamodb.py
@@ -4,13 +4,16 @@
 from decimal import Decimal
 from typing import Any, Dict
 
+import pyarrow as pa
 import pytest
 from boto3.dynamodb.conditions import Attr, Key
 from botocore.exceptions import ClientError
 
 import awswrangler as wr
 import awswrangler.pandas as pd
 
+from .._utils import is_ray_modin
+
 pytestmark = pytest.mark.distributed
 
 
@@ -500,3 +503,37 @@ def test_read_items_limited(
     if chunked:
         df3 = pd.concat(df3)
     assert df3.shape == (min(max_items_evaluated, len(df)), len(df.columns))
+
+
+@pytest.mark.parametrize(
+    "params",
+    [
+        {
+            "KeySchema": [{"AttributeName": "id", "KeyType": "HASH"}],
+            "AttributeDefinitions": [{"AttributeName": "id", "AttributeType": "N"}],
+        }
+    ],
+)
+@pytest.mark.parametrize("chunked", [False, True])
+def test_read_items_schema(params, dynamodb_table: str, chunked: bool):  # read_items with a pyarrow schema (#2399)
+    df = pd.DataFrame(
+        {  # the Decimal ids below differ in precision and scale
+            "id": [Decimal("123.4"), Decimal("226.49"), Decimal("320.320"), Decimal("425.0753")],
+            "word": ["this", "is", "a", "test"],
+            "char_count": [4, 2, 1, 4],
+        }
+    )
+    wr.dynamodb.put_df(df=df, table_name=dynamodb_table)
+
+    if not is_ray_modin:  # this negative check is skipped when is_ray_modin is truthy
+        with pytest.raises(pa.ArrowInvalid):  # no schema passed: the read is expected to fail
+            wr.dynamodb.read_items(table_name=dynamodb_table, allow_full_scan=True)
+
+    schema = pa.schema([("id", pa.decimal128(7, 4)), ("word", pa.string()), ("char_count", pa.int8())])
+    kwargs = {  # shared by both reads below
+        "table_name": dynamodb_table,
+        "chunked": chunked,
+        "pyarrow_additional_kwargs": {"schema": schema},
+    }
+    wr.dynamodb.read_items(allow_full_scan=True, **kwargs)  # full scan with the schema must not raise
+    wr.dynamodb.read_items(filter_expression=Attr("id").eq(1), **kwargs)  # no stored id equals 1