
Commit 28c1268

feat: Allow Ray 2.5 & downgrade tox (#2338)
* Allow Ray 2.5, upgrade modin & downgrade tox due to dependency version conflicts
* [skip ci] Remove generics
* Force ray 2.5
* Downgrade Modin
* [skip ci] Typing fixes
* Allow ray 2+
1 parent af2ed51 commit 28c1268


11 files changed: +2518 −2577 lines

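Most of the diff is mechanical: as of Ray 2.5 the Ray Data typing primitives (Block, BlockAccessor, Dataset, Reader) are no longer generic, so subscripted annotations such as Block[Any] are dropped, and a few # type: ignore[attr-defined] comments are added where mypy cannot resolve attributes from Ray's stubs. A minimal before/after sketch of the annotation pattern, assuming Ray 2.5 is installed; the helper function below is illustrative only and not part of awswrangler:

from ray.data.block import Block, BlockAccessor  # non-generic aliases as of Ray 2.5


# Hypothetical helper, shown only to illustrate the annotation change.
def rows_in_block(block: Block) -> int:  # previously annotated as Block[Any]
    # BlockAccessor wraps a concrete block (e.g. a pyarrow.Table or a pandas
    # DataFrame) and exposes format-agnostic helpers such as num_rows().
    accessor: BlockAccessor = BlockAccessor.for_block(block)  # previously BlockAccessor[Any]
    return accessor.num_rows()

The per-file hunks below apply exactly this pattern to the write paths of the Ray datasources.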

awswrangler/distributed/ray/_executor.py

Lines changed: 1 addition & 1 deletion
@@ -26,7 +26,7 @@ def map(self, func: Callable[..., MapOutputType], _: Optional["BaseClient"], *ar
         return list(func(*arg) for arg in zip(itertools.repeat(None), *args))


-@ray.remote
+@ray.remote  # type: ignore[attr-defined]
 class AsyncActor:
     async def run_concurrent(self, func: Callable[..., MapOutputType], *args: Any) -> MapOutputType:
         return func(*args)
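For context, @ray.remote turns AsyncActor into a Ray actor whose async methods are scheduled on the actor's asyncio event loop, so several calls can be in flight concurrently; the added # type: ignore[attr-defined] only silences mypy against Ray's type stubs. A minimal usage sketch, assuming a local ray.init() and the standard Ray actor API:

from typing import Any, Callable

import ray

ray.init(ignore_reinit_error=True)  # local cluster, for the sketch only


@ray.remote  # type: ignore[attr-defined]
class AsyncActor:
    async def run_concurrent(self, func: Callable[..., Any], *args: Any) -> Any:
        # Each invocation runs inside the actor's event loop.
        return func(*args)


actor = AsyncActor.remote()
refs = [actor.run_concurrent.remote(lambda x: x * x, i) for i in range(4)]
print(ray.get(refs))  # [0, 1, 4, 9]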

awswrangler/distributed/ray/_utils.py

Lines changed: 2 additions & 2 deletions
@@ -18,8 +18,8 @@ def _estimate_avail_cpus(cur_pg: Optional[PlacementGroup]) -> int:
     Args:
         cur_pg: The current placement group, if any.
     """
-    cluster_cpus = int(ray.cluster_resources().get("CPU", 1))
-    cluster_gpus = int(ray.cluster_resources().get("GPU", 0))
+    cluster_cpus = int(ray.cluster_resources().get("CPU", 1))  # type: ignore[attr-defined]
+    cluster_gpus = int(ray.cluster_resources().get("GPU", 0))  # type: ignore[attr-defined]

     # If we're in a placement group, we shouldn't assume the entire cluster's
     # resources are available for us to use. Estimate an upper bound on what's
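The same # type: ignore[attr-defined] treatment is used here because mypy cannot resolve ray.cluster_resources() from Ray's stubs; the call itself is the standard API for inspecting aggregate cluster capacity. A small standalone sketch, assuming a locally initialized Ray runtime:

import ray

ray.init(ignore_reinit_error=True)

# cluster_resources() reports aggregate capacity, e.g. {"CPU": 8.0, "memory": ...}.
resources = ray.cluster_resources()  # type: ignore[attr-defined]
cluster_cpus = int(resources.get("CPU", 1))
cluster_gpus = int(resources.get("GPU", 0))
print(f"CPUs={cluster_cpus} GPUs={cluster_gpus}")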

awswrangler/distributed/ray/datasources/arrow_csv_datasource.py

Lines changed: 1 addition & 1 deletion
@@ -59,7 +59,7 @@ def _read_stream(  # type: ignore[override]  # pylint: disable=arguments-differ
     def _write_block(  # type: ignore[override]  # pylint: disable=arguments-differ
         self,
         f: pa.NativeFile,
-        block: BlockAccessor[Any],
+        block: BlockAccessor,
         **writer_args: Any,
     ) -> None:
         write_options_dict = writer_args.get("write_options", {})

awswrangler/distributed/ray/datasources/arrow_orc_datasource.py

Lines changed: 1 addition & 1 deletion
@@ -44,7 +44,7 @@ def _open_input_source(
     def _write_block(  # type: ignore[override]
         self,
         f: pa.NativeFile,
-        block: BlockAccessor[Any],
+        block: BlockAccessor,
         pandas_kwargs: Optional[Dict[str, Any]],
         **writer_args: Any,
     ) -> None:

awswrangler/distributed/ray/datasources/arrow_parquet_base_datasource.py

Lines changed: 1 addition & 1 deletion
@@ -70,7 +70,7 @@ def _open_input_source(
     def _write_block(  # type: ignore[override]
         self,
         f: pa.NativeFile,
-        block: BlockAccessor[Any],
+        block: BlockAccessor,
         **writer_args: Any,
     ) -> None:
         schema: Optional[pa.schema] = writer_args.get("schema", None)

awswrangler/distributed/ray/datasources/arrow_parquet_datasource.py

Lines changed: 6 additions & 6 deletions
@@ -82,14 +82,14 @@ class ArrowParquetDatasource(ArrowParquetBaseDatasource):  # pylint: disable=abs
     relative to the root S3 prefix.
     """

-    def create_reader(self, **kwargs: Dict[str, Any]) -> Reader[Any]:
+    def create_reader(self, **kwargs: Dict[str, Any]) -> Reader:
         """Return a Reader for the given read arguments."""
         return _ArrowParquetDatasourceReader(**kwargs)  # type: ignore[arg-type]

     def _write_block(  # type: ignore[override]  # pylint: disable=arguments-differ, arguments-renamed, unused-argument
         self,
         f: "pyarrow.NativeFile",
-        block: BlockAccessor[Any],
+        block: BlockAccessor,
         pandas_kwargs: Optional[Dict[str, Any]],
         **writer_args: Any,
     ) -> None:

@@ -185,7 +185,7 @@ def _deserialize_pieces_with_retry(
     raise final_exception  # type: ignore[misc]


-class _ArrowParquetDatasourceReader(Reader[Any]):  # pylint: disable=too-many-instance-attributes
+class _ArrowParquetDatasourceReader(Reader):  # pylint: disable=too-many-instance-attributes
     def __init__(
         self,
         paths: Union[str, List[str]],

@@ -194,7 +194,7 @@ def __init__(
         columns: Optional[List[str]] = None,
         schema: Optional[Schema] = None,
         meta_provider: ParquetMetadataProvider = DefaultParquetMetadataProvider(),
-        _block_udf: Optional[Callable[[Block[Any]], Block[Any]]] = None,
+        _block_udf: Optional[Callable[[Block], Block]] = None,
         **reader_args: Any,
     ):
         import pyarrow as pa

@@ -209,7 +209,7 @@ def __init__(
         import ray
         from ray.util.scheduling_strategies import NodeAffinitySchedulingStrategy

-        self._local_scheduling = NodeAffinitySchedulingStrategy(ray.get_runtime_context().get_node_id(), soft=False)
+        self._local_scheduling = NodeAffinitySchedulingStrategy(ray.get_runtime_context().get_node_id(), soft=False)  # type: ignore[attr-defined]

         dataset_kwargs = reader_args.pop("dataset_kwargs", {})
         try:

@@ -361,7 +361,7 @@ def _estimate_files_encoding_ratio(self) -> float:
 # 1. Use _add_table_partitions to add partition columns. The behavior is controlled by Pandas SDK
 # native `dataset` parameter. The partitions are loaded relative to the `path_root` prefix.
 def _read_pieces(
-    block_udf: Optional[Callable[[Block[Any]], Block[Any]]],
+    block_udf: Optional[Callable[[Block], Block]],
     reader_args: Any,
     columns: Optional[List[str]],
     schema: Optional[Union[type, "pyarrow.lib.Schema"]],

awswrangler/distributed/ray/datasources/pandas_file_based_datasource.py

Lines changed: 7 additions & 7 deletions
@@ -35,7 +35,7 @@ def _get_write_path_for_block(
     *,
     filesystem: Optional["pyarrow.fs.FileSystem"] = None,
     dataset_uuid: Optional[str] = None,
-    block: Optional[Block[Any]] = None,
+    block: Optional[Block] = None,
     block_index: Optional[int] = None,
     file_format: Optional[str] = None,
 ) -> str:

@@ -64,7 +64,7 @@ def __init__(self) -> None:
     def _read_file(self, f: pyarrow.NativeFile, path: str, **reader_args: Any) -> pd.DataFrame:
         raise NotImplementedError()

-    def do_write(  # type: ignore[override]  # pylint: disable=arguments-differ
+    def do_write(  # pylint: disable=arguments-differ
         self,
         blocks: List[ObjectRef[pd.DataFrame]],
         metadata: List[BlockMetadata],

@@ -141,9 +141,9 @@ def write_block(write_path: str, block: pd.DataFrame) -> str:

         return write_tasks

-    def write(
+    def write(  # type: ignore[override]
         self,
-        blocks: Iterable[Union[Block[pd.DataFrame], ObjectRef[pd.DataFrame]]],
+        blocks: Iterable[Union[Block, ObjectRef[pd.DataFrame]]],
         ctx: TaskContext,
         path: str,
         dataset_uuid: str,

@@ -188,7 +188,7 @@ def write_block(write_path: str, block: pd.DataFrame) -> str:

         file_suffix = self._get_file_suffix(self._FILE_EXTENSION, compression)

-        builder = DelegatingBlockBuilder()  # type: ignore[no-untyped-call,var-annotated]
+        builder = DelegatingBlockBuilder()  # type: ignore[no-untyped-call]
         for block in blocks:
             # Dereference the block if ObjectRef is passed
             builder.add_block(ray_get(block) if isinstance(block, ray.ObjectRef) else block)  # type: ignore[arg-type]

@@ -198,7 +198,7 @@ def write_block(write_path: str, block: pd.DataFrame) -> str:
             path,
             filesystem=filesystem,
             dataset_uuid=dataset_uuid,
-            block=block,  # type: ignore[arg-type]
+            block=block,
             block_index=ctx.task_idx,
             file_format=file_suffix,
         )

@@ -211,7 +211,7 @@ def _get_file_suffix(self, file_format: str, compression: Optional[str]) -> str:
     def _write_block(
         self,
         f: "pyarrow.NativeFile",
-        block: BlockAccessor[Any],
+        block: BlockAccessor,
         writer_args_fn: Callable[[], Dict[str, Any]] = lambda: {},
         **writer_args: Any,
     ) -> None:

awswrangler/distributed/ray/datasources/pandas_text_datasource.py

Lines changed: 1 addition & 1 deletion
@@ -72,7 +72,7 @@ def _read_file(self, f: pyarrow.NativeFile, path: str, **reader_args: Any) -> pd
     def _write_block(  # type: ignore[override]  # pylint: disable=arguments-differ, arguments-renamed
         self,
         f: io.TextIOWrapper,
-        block: BlockAccessor[Any],
+        block: BlockAccessor,
         pandas_kwargs: Optional[Dict[str, Any]],
         **writer_args: Any,
     ) -> None:

awswrangler/distributed/ray/modin/_utils.py

Lines changed: 2 additions & 2 deletions
@@ -29,7 +29,7 @@ def _block_to_df(
     return _table_to_df(table=block._table, kwargs=to_pandas_kwargs)  # pylint: disable=protected-access


-def _ray_dataset_from_df(df: Union[pd.DataFrame, modin_pd.DataFrame]) -> Dataset[Any]:
+def _ray_dataset_from_df(df: Union[pd.DataFrame, modin_pd.DataFrame]) -> Dataset:
     """Create Ray dataset from supported types of data frames."""
     if isinstance(df, modin_pd.DataFrame):
         return from_modin(df)  # type: ignore[no-any-return]

@@ -39,7 +39,7 @@ def _ray_dataset_from_df(df: Union[pd.DataFrame, modin_pd.DataFrame]) -> Dataset


 def _to_modin(
-    dataset: Union[ray.data.Dataset[Any], ray.data.Dataset[pd.DataFrame]],
+    dataset: Dataset,
     to_pandas_kwargs: Optional[Dict[str, Any]] = None,
     ignore_index: Optional[bool] = True,
 ) -> modin_pd.DataFrame:
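Dropping the Dataset[Any] subscript follows the same Ray 2.5 change: ray.data.from_pandas and ray.data.from_modin now return the non-generic Dataset. A short sketch of the round-trip these private helpers wrap, using only public Ray Data APIs (the column name and values are made up for illustration):

import pandas as pd
import ray
from ray.data import Dataset, from_pandas

ray.init(ignore_reinit_error=True)

df = pd.DataFrame({"a": [1, 2, 3]})
ds: Dataset = from_pandas(df)  # previously annotated as Dataset[Any]
print(ds.count())              # 3
print(ds.to_pandas())          # back to a pandas DataFrame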
