Skip to content

Commit 3507fda

Browse files
feat: Python 3.12 (#2559)
Signed-off-by: Anton Kukushkin <[email protected]>
Co-authored-by: Leon Luttenberger <[email protected]>
1 parent 8e2a793 commit 3507fda

35 files changed

+2755
-2639
lines changed

.github/workflows/minimal-tests.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ jobs:
1717
strategy:
1818
fail-fast: false
1919
matrix:
20-
python-version: [3.8]
20+
python-version: ["3.8", "3.11", "3.12"]
2121
platform: [ubuntu-latest, macos-latest, windows-latest]
2222

2323
env:

README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,8 @@ AWS SDK for pandas can also run your workflows at scale by leveraging [Modin](ht
101101

102102
The quickest way to get started is to use AWS Glue with Ray. Read our [docs](https://aws-sdk-pandas.readthedocs.io/en/3.5.0/scale.html), our blogs ([1](https://aws.amazon.com/blogs/big-data/scale-aws-sdk-for-pandas-workloads-with-aws-glue-for-ray/)/[2](https://aws.amazon.com/blogs/big-data/advanced-patterns-with-aws-sdk-for-pandas-on-aws-glue-for-ray/)), or head to our latest [tutorials](https://github.com/aws/aws-sdk-pandas/tree/main/tutorials) to discover even more features.
103103

104+
> ⚠️ **Ray is currently not available for Python 3.12. While AWS SDK for pandas supports Python 3.12, it cannot be used at scale.**
105+
104106
## [Read The Docs](https://aws-sdk-pandas.readthedocs.io/)
105107

106108
- [**What is AWS SDK for pandas?**](https://aws-sdk-pandas.readthedocs.io/en/3.5.0/about.html)

awswrangler/_config.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
_logger: logging.Logger = logging.getLogger(__name__)
1717

1818

19-
_ConfigValueType = Union[str, bool, int, float, botocore.config.Config, dict]
19+
_ConfigValueType = Union[str, bool, int, float, botocore.config.Config, Dict[Any, Any]]
2020

2121

2222
class _ConfigArg(NamedTuple):

awswrangler/_databases.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -160,7 +160,7 @@ def _records2df(
160160
for col_values, col_name in zip(tuple(zip(*records)), cols_names): # Transposing
161161
if (dtype is None) or (col_name not in dtype):
162162
if _oracledb_found:
163-
col_values = oracle.handle_oracle_objects(col_values, col_name) # ruff: noqa: PLW2901
163+
col_values = oracle.handle_oracle_objects(col_values, col_name) # type: ignore[arg-type,assignment] # noqa: PLW2901
164164
try:
165165
array: pa.Array = pa.array(obj=col_values, safe=safe) # Creating Arrow array
166166
except pa.ArrowInvalid as ex:
@@ -169,7 +169,7 @@ def _records2df(
169169
try:
170170
if _oracledb_found:
171171
if _should_handle_oracle_objects(dtype[col_name]):
172-
col_values = oracle.handle_oracle_objects(col_values, col_name, dtype)
172+
col_values = oracle.handle_oracle_objects(col_values, col_name, dtype) # type: ignore[arg-type,assignment] # noqa: PLW2901
173173
array = pa.array(obj=col_values, type=dtype[col_name], safe=safe) # Creating Arrow array with dtype
174174
except (pa.ArrowInvalid, pa.ArrowTypeError):
175175
array = pa.array(obj=col_values, safe=safe) # Creating Arrow array

awswrangler/_utils.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -165,7 +165,7 @@ def decorator(func: FunctionType) -> FunctionType:
165165

166166
@wraps(func)
167167
def inner(*args: Any, **kwargs: Any) -> Any:
168-
passed_unsupported_kwargs = set(unsupported_kwargs).intersection( # type: ignore
168+
passed_unsupported_kwargs = set(unsupported_kwargs).intersection(
169169
set([key for key, value in kwargs.items() if value is not None])
170170
)
171171

@@ -620,7 +620,7 @@ def ensure_cpu_count(use_threads: Union[bool, int] = True) -> int:
620620
1
621621
622622
"""
623-
if type(use_threads) == int: # pylint: disable=unidiomatic-typecheck
623+
if type(use_threads) == int: # pylint: disable=unidiomatic-typecheck # noqa: E721
624624
if use_threads < 1:
625625
return 1
626626
return use_threads
@@ -736,7 +736,7 @@ def get_credentials_from_session(
736736
) -> botocore.credentials.ReadOnlyCredentials:
737737
"""Get AWS credentials from boto3 session."""
738738
session: boto3.Session = ensure_session(session=boto3_session)
739-
credentials: botocore.credentials.Credentials = session.get_credentials()
739+
credentials: botocore.credentials.Credentials = session.get_credentials() # type: ignore[assignment]
740740
frozen_credentials: botocore.credentials.ReadOnlyCredentials = credentials.get_frozen_credentials()
741741
return frozen_credentials
742742

awswrangler/athena/_cache.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ def update_cache(self, items: List[Dict[str, Any]]) -> None:
5050
if oldest_item:
5151
items = list(
5252
filter(
53-
lambda x: x["Status"]["SubmissionDateTime"] > oldest_item["Status"]["SubmissionDateTime"], # type: ignore[arg-type]
53+
lambda x: x["Status"]["SubmissionDateTime"] > oldest_item["Status"]["SubmissionDateTime"],
5454
items,
5555
)
5656
)

awswrangler/athena/_read.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,7 @@ def _add_query_metadata_generator(
7474
) -> Iterator[pd.DataFrame]:
7575
"""Add Query Execution metadata to every DF in iterator."""
7676
for df in dfs:
77-
df = _apply_query_metadata(df=df, query_metadata=query_metadata) # ruff: noqa: PLW2901
77+
df = _apply_query_metadata(df=df, query_metadata=query_metadata) # noqa: PLW2901
7878
yield df
7979

8080

awswrangler/athena/_utils.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -197,7 +197,7 @@ def _parse_describe_table(df: pd.DataFrame) -> pd.DataFrame:
197197
origin_df_dict = df.to_dict()
198198
target_df_dict: Dict[str, List[Union[str, bool]]] = {"Column Name": [], "Type": [], "Partition": [], "Comment": []}
199199
for index, col_name in origin_df_dict["col_name"].items():
200-
col_name = col_name.strip() # ruff: noqa: PLW2901
200+
col_name = col_name.strip() # noqa: PLW2901
201201
if col_name.startswith("#") or not col_name:
202202
pass
203203
elif col_name in target_df_dict["Column Name"]:

awswrangler/cleanrooms/_utils.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ def wait_query(
2929
Protected query execution ID
3030
boto3_session : boto3.Session, optional
3131
Boto3 Session. If None, the default boto3 session is used
32+
3233
Returns
3334
-------
3435
Dict[str, Any]

awswrangler/data_api/rds.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -165,7 +165,7 @@ def _execute_statement(
165165
def function(sql: str) -> "ExecuteStatementResponseTypeDef":
166166
return self.client.execute_statement(
167167
resourceArn=self.resource_arn,
168-
database=database, # type: ignore[arg-type]
168+
database=database,
169169
sql=sql,
170170
secretArn=self.secret_arn,
171171
includeResultMetadata=True,
@@ -196,7 +196,7 @@ def _batch_execute_statement(
196196
def function(sql: str) -> "BatchExecuteStatementResponseTypeDef":
197197
return self.client.batch_execute_statement(
198198
resourceArn=self.resource_arn,
199-
database=database, # type: ignore[arg-type]
199+
database=database,
200200
sql=sql,
201201
secretArn=self.secret_arn,
202202
**additional_kwargs,
@@ -363,7 +363,7 @@ def _generate_parameters(columns: List[str], values: List[Any]) -> List[Dict[str
363363
parameter_list = []
364364

365365
for col, value in zip(columns, values):
366-
value, type_hint = _create_value_dict(value) # ruff: noqa: PLW2901
366+
value, type_hint = _create_value_dict(value) # noqa: PLW2901
367367

368368
parameter = {
369369
"name": col,

0 commit comments

Comments (0)