
Commit 2865c85

(feat) add distributed s3 write parquet (#1526)
* Add distributed s3 write parquet
* Add type mappings to avoid inference
* Refactoring - separate distributed write_parquet implementation
* Replace group iteration with apply() optimized for distributed scenario
* Fix test regressions
* Linting/formatting/isort
* Add repartitioning & allow writing into a single key
* Minor - fix bucketing keys
* Minor - Increase S3 select test timeout
* Minor - read_parquet fix - replace pandas with modin in distributed mode
1 parent 938e83c commit 2865c85

10 files changed (+559 / -107 lines)


awswrangler/_data_types.py

Lines changed: 7 additions & 1 deletion
@@ -454,7 +454,7 @@ def pyarrow2pandas_extension(  # pylint: disable=too-many-branches,too-many-retu
     return None


-def pyarrow_types_from_pandas(  # pylint: disable=too-many-branches
+def pyarrow_types_from_pandas(  # pylint: disable=too-many-branches,too-many-statements
     df: pd.DataFrame, index: bool, ignore_cols: Optional[List[str]] = None, index_left: bool = False
 ) -> Dict[str, pa.DataType]:
     """Extract the related Pyarrow data types from any Pandas DataFrame."""
@@ -474,8 +474,14 @@ def pyarrow_types_from_pandas(  # pylint: disable=too-many-branches
             cols_dtypes[name] = pa.int32()
         elif dtype == "Int64":
             cols_dtypes[name] = pa.int64()
+        elif dtype == "float32":
+            cols_dtypes[name] = pa.float32()
+        elif dtype == "float64":
+            cols_dtypes[name] = pa.float64()
         elif dtype == "string":
             cols_dtypes[name] = pa.string()
+        elif dtype == "boolean":
+            cols_dtypes[name] = pa.bool_()
         else:
             cols.append(name)
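
As an aside, a minimal sketch of what the patched helper now produces for the newly mapped dtypes; the sample DataFrame and column names are made up, and the call targets an internal function purely for illustration.

# Illustration only: the explicit float32/float64/boolean branches above let
# these dtypes resolve from the mapping instead of falling through to inference.
import pandas as pd

from awswrangler._data_types import pyarrow_types_from_pandas

df = pd.DataFrame(
    {
        "ratio": pd.Series([0.1, 0.2], dtype="float32"),
        "total": pd.Series([1.0, 2.0], dtype="float64"),
        "flag": pd.Series([True, None], dtype="boolean"),
    }
)
print(pyarrow_types_from_pandas(df, index=False))
# Expected mapping: ratio -> float32, total -> float64, flag -> bool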

awswrangler/_databases.py

Lines changed: 1 addition & 1 deletion
@@ -149,7 +149,7 @@ def _records2df(
                 if dtype[col_name] == pa.string() or isinstance(dtype[col_name], pa.Decimal128Type):
                     col_values = oracle.handle_oracle_objects(col_values, col_name, dtype)
                 array = pa.array(obj=col_values, type=dtype[col_name], safe=safe)  # Creating Arrow array with dtype
-            except pa.ArrowInvalid:
+            except (pa.ArrowInvalid, pa.ArrowTypeError):
                 array = pa.array(obj=col_values, safe=safe)  # Creating Arrow array
                 array = array.cast(target_type=dtype[col_name], safe=safe)  # Casting
         arrays.append(array)
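
For reference, a self-contained sketch of the widened fallback in _records2df: build the Arrow array with the target type first, and on ArrowInvalid or ArrowTypeError fall back to type inference followed by an explicit cast. The helper name is hypothetical.

import pyarrow as pa


def _array_with_cast_fallback(values, target_type, safe=True):
    """Mirror of the try/except pattern above (illustrative helper)."""
    try:
        # Preferred path: build the array directly with the expected type.
        return pa.array(obj=values, type=target_type, safe=safe)
    except (pa.ArrowInvalid, pa.ArrowTypeError):
        # Fallback: let Arrow infer the type, then cast to the expected one.
        array = pa.array(obj=values, safe=safe)
        return array.cast(target_type=target_type, safe=safe)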
awswrangler/distributed/datasources/__init__.py

Lines changed: 5 additions & 1 deletion
@@ -1,7 +1,11 @@
 """Distributed Datasources Module."""
 
-from awswrangler.distributed.datasources.parquet_datasource import ParquetDatasource
+from awswrangler.distributed.datasources.parquet_datasource import (
+    ParquetDatasource,
+    UserProvidedKeyBlockWritePathProvider,
+)
 
 __all__ = [
     "ParquetDatasource",
+    "UserProvidedKeyBlockWritePathProvider",
 ]

awswrangler/distributed/datasources/parquet_datasource.py

Lines changed: 141 additions & 4 deletions
@@ -1,24 +1,32 @@
 """Distributed ParquetDatasource Module."""
 
 import logging
-from typing import Any, Callable, Iterator, List, Optional, Union
+from typing import Any, Callable, Dict, Iterator, List, Optional, Union
 
 import numpy as np
 import pyarrow as pa
 
 # fs required to implicitly trigger S3 subsystem initialization
 import pyarrow.fs  # noqa: F401 pylint: disable=unused-import
 import pyarrow.parquet as pq
-from ray import cloudpickle
+from ray import cloudpickle  # pylint: disable=wrong-import-order,ungrouped-imports
+from ray.data.block import Block, BlockAccessor, BlockMetadata
 from ray.data.context import DatasetContext
-from ray.data.datasource.datasource import ReadTask
-from ray.data.datasource.file_based_datasource import _resolve_paths_and_filesystem
+from ray.data.datasource import BlockWritePathProvider, DefaultBlockWritePathProvider
+from ray.data.datasource.datasource import ReadTask, WriteResult
+from ray.data.datasource.file_based_datasource import (
+    _resolve_paths_and_filesystem,
+    _S3FileSystemWrapper,
+    _wrap_s3_serialization_workaround,
+)
 from ray.data.datasource.file_meta_provider import DefaultParquetMetadataProvider, ParquetMetadataProvider
 from ray.data.datasource.parquet_datasource import (
     _deregister_parquet_file_fragment_serialization,
     _register_parquet_file_fragment_serialization,
 )
 from ray.data.impl.output_buffer import BlockOutputBuffer
+from ray.data.impl.remote_fn import cached_remote_fn
+from ray.types import ObjectRef
 
 from awswrangler._arrow import _add_table_partitions
 
@@ -29,9 +37,31 @@
 PARQUET_READER_ROW_BATCH_SIZE = 100000
 
 
+class UserProvidedKeyBlockWritePathProvider(BlockWritePathProvider):
+    """Block write path provider.
+
+    Used when writing single-block datasets into a user-provided S3 key.
+    """
+
+    def _get_write_path_for_block(
+        self,
+        base_path: str,
+        *,
+        filesystem: Optional["pyarrow.fs.FileSystem"] = None,
+        dataset_uuid: Optional[str] = None,
+        block: Optional[ObjectRef[Block[Any]]] = None,
+        block_index: Optional[int] = None,
+        file_format: Optional[str] = None,
+    ) -> str:
+        return base_path
+
+
 class ParquetDatasource:
     """Parquet datasource, for reading and writing Parquet files."""
 
+    def __init__(self) -> None:
+        self._write_paths: List[str] = []
+
     # Original: https://github.com/ray-project/ray/blob/releases/1.13.0/python/ray/data/datasource/parquet_datasource.py
     def prepare_read(
         self,
@@ -135,3 +165,110 @@ def read_pieces(serialized_pieces: str) -> Iterator[pa.Table]:
             _deregister_parquet_file_fragment_serialization()  # type: ignore
 
         return read_tasks
+
+    # Original implementation:
+    # https://github.com/ray-project/ray/blob/releases/1.13.0/python/ray/data/datasource/file_based_datasource.py
+    def do_write(
+        self,
+        blocks: List[ObjectRef[Block[Any]]],
+        _: List[BlockMetadata],
+        path: str,
+        dataset_uuid: str,
+        filesystem: Optional["pyarrow.fs.FileSystem"] = None,
+        try_create_dir: bool = True,
+        open_stream_args: Optional[Dict[str, Any]] = None,
+        block_path_provider: BlockWritePathProvider = DefaultBlockWritePathProvider(),
+        write_args_fn: Callable[[], Dict[str, Any]] = lambda: {},
+        _block_udf: Optional[Callable[[Block[Any]], Block[Any]]] = None,
+        ray_remote_args: Optional[Dict[str, Any]] = None,
+        **write_args: Any,
+    ) -> List[ObjectRef[WriteResult]]:
+        """Create write tasks for a parquet file datasource."""
+        paths, filesystem = _resolve_paths_and_filesystem(path, filesystem)
+        path = paths[0]
+        if try_create_dir:
+            filesystem.create_dir(path, recursive=True)
+        filesystem = _wrap_s3_serialization_workaround(filesystem)
+
+        _write_block_to_file = self._write_block
+
+        if open_stream_args is None:
+            open_stream_args = {}
+
+        if ray_remote_args is None:
+            ray_remote_args = {}
+
+        def write_block(write_path: str, block: Block[Any]) -> str:
+            _logger.debug("Writing %s file.", write_path)
+            fs: Optional["pyarrow.fs.FileSystem"] = filesystem
+            if isinstance(fs, _S3FileSystemWrapper):
+                fs = fs.unwrap()  # type: ignore
+            if _block_udf is not None:
+                block = _block_udf(block)
+
+            with fs.open_output_stream(write_path, **open_stream_args) as f:
+                _write_block_to_file(
+                    f,
+                    BlockAccessor.for_block(block),
+                    writer_args_fn=write_args_fn,
+                    **write_args,
+                )
+            # This is a change from original FileBasedDatasource.do_write that does not return paths
+            return write_path
+
+        write_block = cached_remote_fn(write_block).options(**ray_remote_args)
+
+        file_format = self._file_format()
+        write_tasks = []
+        for block_idx, block in enumerate(blocks):
+            write_path = block_path_provider(
+                path,
+                filesystem=filesystem,
+                dataset_uuid=dataset_uuid,
+                block=block,
+                block_index=block_idx,
+                file_format=file_format,
+            )
+            write_task = write_block.remote(write_path, block)  # type: ignore
+            write_tasks.append(write_task)
+
+        return write_tasks
+
+    def on_write_complete(self, write_results: List[Any], **_: Any) -> None:
+        """Execute callback on write complete."""
+        _logger.debug("Write complete %s.", write_results)
+        # Collect and return all write task paths
+        self._write_paths.extend(write_results)
+
+    def on_write_failed(self, write_results: List[ObjectRef[Any]], error: Exception, **_: Any) -> None:
+        """Execute callback on write failed."""
+        _logger.debug("Write failed %s.", write_results)
+        raise error
+
+    def get_write_paths(self) -> List[str]:
+        """Return S3 paths of where the results have been written."""
+        return self._write_paths
+
+    def _write_block(
+        self,
+        f: "pyarrow.NativeFile",
+        block: BlockAccessor[Any],
+        writer_args_fn: Callable[[], Dict[str, Any]] = lambda: {},
+        **writer_args: Any,
+    ) -> None:
+        """Write a block to S3."""
+        import pyarrow.parquet as pq  # pylint: disable=import-outside-toplevel,redefined-outer-name,reimported
+
+        writer_args = _resolve_kwargs(writer_args_fn, **writer_args)
+        pq.write_table(block.to_arrow(), f, **writer_args)
+
+    def _file_format(self) -> str:
+        """Return file format."""
+        return "parquet"
+
+
+def _resolve_kwargs(kwargs_fn: Callable[[], Dict[str, Any]], **kwargs: Any) -> Dict[str, Any]:
+    if kwargs_fn:
+        kwarg_overrides = kwargs_fn()
+        kwargs.update(kwarg_overrides)
+    return kwargs

awswrangler/s3/_read_parquet.py

Lines changed: 4 additions & 2 deletions
@@ -7,7 +7,6 @@
 from typing import Any, Callable, Dict, Iterable, Iterator, List, Optional, Tuple, Union
 
 import boto3
-import pandas as pd
 import pyarrow as pa
 import pyarrow.dataset
 import pyarrow.parquet
@@ -30,10 +29,13 @@
 )
 
 if config.distributed:
+    import modin.pandas as pd
     from ray.data import read_datasource
 
     from awswrangler.distributed._utils import _to_modin  # pylint: disable=ungrouped-imports
     from awswrangler.distributed.datasources import ParquetDatasource
+else:
+    import pandas as pd
 
 BATCH_READ_BLOCK_SIZE = 65_536
 CHUNKED_READ_S3_BLOCK_SIZE = 10_485_760  # 10 MB (20 * 2**20)
@@ -323,7 +325,7 @@ def _read_parquet(
 ) -> Union[pd.DataFrame, Iterator[pd.DataFrame]]:
     if config.distributed:
         dataset = read_datasource(
-            datasource=ParquetDatasource(),
+            datasource=ParquetDatasource(),  # type: ignore
             parallelism=parallelism,
             use_threads=use_threads,
             paths=paths,
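
A short, hypothetical usage note on the change above: with the distributed configuration enabled, pd is modin.pandas, so the frame returned by s3.read_parquet is a Modin DataFrame backed by the Ray dataset built from ParquetDatasource; the path is a placeholder.

import awswrangler as wr

# Placeholder prefix; assumes the Ray/Modin distributed config is active.
df = wr.s3.read_parquet(path="s3://my-bucket/dataset/")
print(type(df))  # modin.pandas.DataFrame when distributed, pandas.DataFrame otherwise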

awswrangler/s3/_write.py

Lines changed: 2 additions & 1 deletion
@@ -55,13 +55,14 @@ def _validate_args(
     description: Optional[str],
     parameters: Optional[Dict[str, str]],
     columns_comments: Optional[Dict[str, str]],
+    distributed: Optional[bool] = False,
 ) -> None:
     if df.empty is True:
         raise exceptions.EmptyDataFrame("DataFrame cannot be empty.")
     if dataset is False:
         if path is None:
             raise exceptions.InvalidArgumentValue("If dataset is False, the `path` argument must be passed.")
-        if path.endswith("/"):
+        if not distributed and path.endswith("/"):
             raise exceptions.InvalidArgumentValue(
                 "If <dataset=False>, the argument <path> should be a key, not a prefix."
             )
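
A hypothetical illustration of what the relaxed check permits: with the distributed flag passed through to _validate_args, a dataset=False write may target either an exact key (expected to be repartitioned into a single block and routed through UserProvidedKeyBlockWritePathProvider) or a trailing-slash prefix that receives one file per block; the non-distributed path still rejects the prefix form. Paths are placeholders.

import awswrangler as wr
import modin.pandas as pd

df = pd.DataFrame({"c0": [1, 2, 3]})

# Exact key: the whole frame is written to this one object.
wr.s3.to_parquet(df=df, path="s3://my-bucket/output/data.parquet")

# Prefix: one Parquet file per block; only accepted with dataset=False when
# the distributed flag reaches _validate_args as True.
wr.s3.to_parquet(df=df, path="s3://my-bucket/output/")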
