Commit 9d5c1a6

feat: Upgrade to Ray 2.9.0+ and refactor Ray datasources to the new API (#2570)
* chore: Bump Ray to 2.8.1
* [skip ci] upgrade modin/pandas
* [skip ci] fix imports
* [skip ci] checkpoint
* [skip ci] Write API changes: add ParquetDatasink
* [skip ci] Bump to 2.9.0
* [skip ci] Minor refactoring of ParquetDatasink
* [skip ci] Checkpoint - add CSV and JSON datasink
* [skip ci] Extend from _BlockFileDatasink
* [skip ci] Fix parquet params
* [skip ci] Fix text datasinks & add ArrowCSVDatasink
* [skip ci] Refactor text datasources to the new API
* [skip ci] Text datasource fixes
* [skip ci] Add ORC datasink; update ORC datasource to the new API; refactoring
* [skip ci] Minor fixes
* [skip ci] Checkpoint - adapt Parquet datasources
* [skip ci] Fix output metadata handling & minor fixes
* [skip ci] Mypy
* Tests - JSON 'index=True' is only valid when 'orient' is 'split', 'table', 'index' or 'columns'.
* [skip ci] Fix text datasink compression file extension
* [skip ci] Fix ORC test named index serialization
* [skip ci] Fix pyproject.toml & poetry.lock marking pandas as optional
* Inconsistent schema resolution on Modin dfs
* [skip ci] Miror refactoring - arrow parquet props
* [skip ci] Open s3 object props
* [skip ci] Move CSV write options
* Minor - index & dtype handling
* Fix poetry.lock

Signed-off-by: Anton Kukushkin <[email protected]>
1 parent df8b76a commit 9d5c1a6

28 files changed (+1218, -853 lines)
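The refactor follows the split introduced by the Ray 2.9 Data API: read paths subclass ray.data's FileBasedDatasource and bind their reader options at construction time, while write paths move out of the datasource into dedicated datasink classes with a per-block write hook. Below is a minimal, illustrative sketch of the read-side shape; the class name is hypothetical and not part of this commit, it only mirrors the pattern the diffs below follow.

from typing import Any, Iterator, List, Union

import pyarrow as pa
from pyarrow import csv
from ray.data.datasource.file_based_datasource import FileBasedDatasource


class ExampleCSVDatasource(FileBasedDatasource):  # hypothetical name, illustration only
    """Sketch of the Ray 2.9+ read pattern used by the datasources in this commit."""

    _FILE_EXTENSIONS = ["csv"]

    def __init__(self, paths: Union[str, List[str]], **file_based_datasource_kwargs: Any):
        # Reader options are bound once at construction time instead of being
        # threaded through **reader_args on every read call (the pre-2.9 pattern).
        super().__init__(paths, **file_based_datasource_kwargs)
        self.read_options = csv.ReadOptions(use_threads=False)

    def _read_stream(self, f: pa.NativeFile, path: str) -> Iterator[pa.Table]:
        # Stream record batches out of the open file handle as Arrow tables.
        reader = csv.open_csv(f, read_options=self.read_options)
        while True:
            try:
                yield pa.Table.from_batches([reader.read_next_batch()])
            except StopIteration:
                return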

awswrangler/distributed/ray/datasources/__init__.py

Lines changed: 12 additions & 1 deletion
@@ -1,11 +1,16 @@
 """Ray Datasources Module."""

+from awswrangler.distributed.ray.datasources.arrow_csv_datasink import ArrowCSVDatasink
 from awswrangler.distributed.ray.datasources.arrow_csv_datasource import ArrowCSVDatasource
 from awswrangler.distributed.ray.datasources.arrow_json_datasource import ArrowJSONDatasource
+from awswrangler.distributed.ray.datasources.arrow_orc_datasink import ArrowORCDatasink
 from awswrangler.distributed.ray.datasources.arrow_orc_datasource import ArrowORCDatasource
 from awswrangler.distributed.ray.datasources.arrow_parquet_base_datasource import ArrowParquetBaseDatasource
+from awswrangler.distributed.ray.datasources.arrow_parquet_datasink import ArrowParquetDatasink
 from awswrangler.distributed.ray.datasources.arrow_parquet_datasource import ArrowParquetDatasource
-from awswrangler.distributed.ray.datasources.pandas_file_based_datasource import UserProvidedKeyBlockWritePathProvider
+from awswrangler.distributed.ray.datasources.block_path_provider import UserProvidedKeyBlockWritePathProvider
+from awswrangler.distributed.ray.datasources.file_datasink import _BlockFileDatasink
+from awswrangler.distributed.ray.datasources.pandas_text_datasink import PandasCSVDatasink, PandasJSONDatasink
 from awswrangler.distributed.ray.datasources.pandas_text_datasource import (
     PandasCSVDataSource,
     PandasFWFDataSource,
@@ -14,6 +19,9 @@
 )

 __all__ = [
+    "ArrowCSVDatasink",
+    "ArrowORCDatasink",
+    "ArrowParquetDatasink",
     "ArrowCSVDatasource",
     "ArrowJSONDatasource",
     "ArrowORCDatasource",
@@ -24,4 +32,7 @@
     "PandasJSONDatasource",
     "PandasTextDatasource",
     "UserProvidedKeyBlockWritePathProvider",
+    "PandasCSVDatasink",
+    "PandasJSONDatasink",
+    "_BlockFileDatasink",
 ]
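With the exports above, the new sinks are importable from the same package as the existing datasources, for example (illustrative only; all names appear in __all__ above):

from awswrangler.distributed.ray.datasources import (
    ArrowCSVDatasource,
    ArrowParquetDatasink,
    PandasJSONDatasink,
)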

awswrangler/distributed/ray/datasources/arrow_csv_datasink.py

Lines changed: 51 additions & 0 deletions
@@ -0,0 +1,51 @@
+"""Ray ArrowCSVDatasink Module."""
+
+import io
+import logging
+from typing import Any, Dict, Optional
+
+from pyarrow import csv
+from ray.data.block import BlockAccessor
+from ray.data.datasource.block_path_provider import BlockWritePathProvider
+
+from awswrangler.distributed.ray.datasources.file_datasink import _BlockFileDatasink
+
+_logger: logging.Logger = logging.getLogger(__name__)
+
+
+class ArrowCSVDatasink(_BlockFileDatasink):
+    """A datasink that writes CSV files using Arrow."""
+
+    def __init__(
+        self,
+        path: str,
+        *,
+        block_path_provider: Optional[BlockWritePathProvider] = None,
+        dataset_uuid: Optional[str] = None,
+        open_s3_object_args: Optional[Dict[str, Any]] = None,
+        pandas_kwargs: Optional[Dict[str, Any]] = None,
+        write_options: Optional[Dict[str, Any]] = None,
+        **write_args: Any,
+    ):
+        super().__init__(
+            path,
+            file_format="csv",
+            block_path_provider=block_path_provider,
+            dataset_uuid=dataset_uuid,
+            open_s3_object_args=open_s3_object_args,
+            pandas_kwargs=pandas_kwargs,
+            **write_args,
+        )
+
+        self.write_options = write_options or {}
+
+    def write_block(self, file: io.TextIOWrapper, block: BlockAccessor) -> None:
+        """
+        Write a block of data to a file.
+
+        Parameters
+        ----------
+        block : BlockAccessor
+        file : io.TextIOWrapper
+        """
+        csv.write_csv(block.to_arrow(), file, csv.WriteOptions(**self.write_options))
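A sink like this is meant to be handed to Ray's write path (awswrangler's own write functions construct it internally). A hedged usage sketch, assuming Ray 2.9's Dataset.write_datasink() API; the bucket path and the sample rows below are placeholders:

import ray

from awswrangler.distributed.ray.datasources import ArrowCSVDatasink

ds = ray.data.from_items([{"a": 1, "b": "x"}, {"a": 2, "b": "y"}])
# write_options is forwarded to pyarrow.csv.WriteOptions inside write_block().
ds.write_datasink(
    ArrowCSVDatasink("s3://example-bucket/csv/", write_options={"include_header": True})
)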

awswrangler/distributed/ray/datasources/arrow_csv_datasource.py

Lines changed: 34 additions & 34 deletions
@@ -1,39 +1,50 @@
 """Ray ArrowCSVDatasource Module."""
-from typing import Any, Iterator
+from typing import Any, Dict, Iterator, List, Optional, Union

 import pyarrow as pa
 from pyarrow import csv
-from ray.data.block import BlockAccessor
+from ray.data.datasource.file_based_datasource import FileBasedDatasource

 from awswrangler._arrow import _add_table_partitions
-from awswrangler.distributed.ray.datasources.pandas_file_based_datasource import PandasFileBasedDatasource


-class ArrowCSVDatasource(PandasFileBasedDatasource):  # pylint: disable=abstract-method
-    """CSV datasource, for reading and writing CSV files using PyArrow."""
+class ArrowCSVDatasource(FileBasedDatasource):
+    """CSV datasource, for reading CSV files using PyArrow."""

-    _FILE_EXTENSION = "csv"
+    _FILE_EXTENSIONS = ["csv"]

-    def _read_stream(  # type: ignore[override]  # pylint: disable=arguments-differ
+    def __init__(
         self,
-        f: pa.NativeFile,
-        path: str,
-        path_root: str,
+        paths: Union[str, List[str]],
         dataset: bool,
-        **reader_args: Any,
-    ) -> Iterator[pa.Table]:
-        read_options = reader_args.get("read_options", csv.ReadOptions(use_threads=False))
-        parse_options = reader_args.get(
-            "parse_options",
-            csv.ParseOptions(),
-        )
-        convert_options = reader_args.get("convert_options", csv.ConvertOptions())
+        path_root: str,
+        version_ids: Optional[Dict[str, str]] = None,
+        s3_additional_kwargs: Optional[Dict[str, str]] = None,
+        pandas_kwargs: Optional[Dict[str, Any]] = None,
+        arrow_csv_args: Optional[Dict[str, Any]] = None,
+        **file_based_datasource_kwargs: Any,
+    ):
+        from pyarrow import csv
+
+        super().__init__(paths, **file_based_datasource_kwargs)
+
+        self.dataset = dataset
+        self.path_root = path_root

+        if arrow_csv_args is None:
+            arrow_csv_args = {}
+
+        self.read_options = arrow_csv_args.pop("read_options", csv.ReadOptions(use_threads=False))
+        self.parse_options = arrow_csv_args.pop("parse_options", csv.ParseOptions())
+        self.convert_options = arrow_csv_args.get("convert_options", csv.ConvertOptions())
+        self.arrow_csv_args = arrow_csv_args
+
+    def _read_stream(self, f: pa.NativeFile, path: str) -> Iterator[pa.Table]:
         reader = csv.open_csv(
             f,
-            read_options=read_options,
-            parse_options=parse_options,
-            convert_options=convert_options,
+            read_options=self.read_options,
+            parse_options=self.parse_options,
+            convert_options=self.convert_options,
         )

         schema = None
@@ -44,25 +55,14 @@ def _read_stream(  # type: ignore[override]  # pylint: disable=arguments-differ
                 if schema is None:
                     schema = table.schema

-                if dataset:
+                if self.dataset:
                     table = _add_table_partitions(
                         table=table,
                         path=f"s3://{path}",
-                        path_root=path_root,
+                        path_root=self.path_root,
                     )

                 yield table

             except StopIteration:
                 return
-
-    def _write_block(  # type: ignore[override]  # pylint: disable=arguments-differ
-        self,
-        f: pa.NativeFile,
-        block: BlockAccessor,
-        **writer_args: Any,
-    ) -> None:
-        write_options_dict = writer_args.get("write_options", {})
-        write_options = csv.WriteOptions(**write_options_dict)
-
-        csv.write_csv(block.to_arrow(), f, write_options)
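On the read side, the datasource is now fully configured up front and handed to ray.data.read_datasource(); in practice awswrangler's distributed read path constructs this object itself. A hedged, standalone sketch with placeholder paths:

import ray
from pyarrow import csv

from awswrangler.distributed.ray.datasources import ArrowCSVDatasource

datasource = ArrowCSVDatasource(
    paths=["example-bucket/data/part-0.csv"],  # placeholder key; _read_stream prefixes it with s3://
    dataset=True,
    path_root="example-bucket/data/",
    arrow_csv_args={"read_options": csv.ReadOptions(use_threads=False)},
)
ds = ray.data.read_datasource(datasource)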

awswrangler/distributed/ray/datasources/arrow_json_datasource.py

Lines changed: 29 additions & 19 deletions
@@ -1,39 +1,49 @@
 """Ray ArrowCSVDatasource Module."""
-from typing import Any
+from typing import Any, Dict, Iterator, List, Optional, Union

 import pyarrow as pa
 from pyarrow import json
+from ray.data.datasource.file_based_datasource import FileBasedDatasource

 from awswrangler._arrow import _add_table_partitions
-from awswrangler.distributed.ray.datasources.pandas_file_based_datasource import PandasFileBasedDatasource


-class ArrowJSONDatasource(PandasFileBasedDatasource):  # pylint: disable=abstract-method
-    """JSON datasource, for reading and writing JSON files using PyArrow."""
+class ArrowJSONDatasource(FileBasedDatasource):  # pylint: disable=abstract-method
+    """JSON datasource, for reading JSON files using PyArrow."""

-    _FILE_EXTENSION = "json"
+    _FILE_EXTENSIONS = ["json"]

-    def _read_file(  # type: ignore[override]  # pylint: disable=arguments-differ
+    def __init__(
         self,
-        f: pa.NativeFile,
-        path: str,
-        path_root: str,
+        paths: Union[str, List[str]],
         dataset: bool,
-        **reader_args: Any,
-    ) -> pa.Table:
-        read_options_dict = reader_args.get("read_options", dict(use_threads=False))
-        parse_options_dict = reader_args.get("parse_options", {})
+        path_root: str,
+        version_ids: Optional[Dict[str, str]] = None,
+        s3_additional_kwargs: Optional[Dict[str, str]] = None,
+        pandas_kwargs: Optional[Dict[str, Any]] = None,
+        arrow_json_args: Optional[Dict[str, Any]] = None,
+        **file_based_datasource_kwargs: Any,
+    ):
+        super().__init__(paths, **file_based_datasource_kwargs)
+
+        self.dataset = dataset
+        self.path_root = path_root
+
+        if arrow_json_args is None:
+            arrow_json_args = {}

-        read_options = json.ReadOptions(**read_options_dict)
-        parse_options = json.ParseOptions(**parse_options_dict)
+        self.read_options = json.ReadOptions(arrow_json_args.pop("read_options", dict(use_threads=False)))
+        self.parse_options = json.ParseOptions(arrow_json_args.pop("parse_options", {}))
+        self.arrow_json_args = arrow_json_args

-        table = json.read_json(f, read_options=read_options, parse_options=parse_options)
+    def _read_stream(self, f: pa.NativeFile, path: str) -> Iterator[pa.Table]:
+        table = json.read_json(f, read_options=self.read_options, parse_options=self.parse_options)

-        if dataset:
+        if self.dataset:
             table = _add_table_partitions(
                 table=table,
                 path=f"s3://{path}",
-                path_root=path_root,
+                path_root=self.path_root,
             )

-        return table
+        return [table]  # type: ignore[return-value]

awswrangler/distributed/ray/datasources/arrow_orc_datasink.py

Lines changed: 67 additions & 0 deletions
@@ -0,0 +1,67 @@
+"""Ray ArrowORCDatasink Module."""
+
+import io
+import logging
+from typing import Any, Dict, Optional
+
+import pyarrow as pa
+from ray.data.block import BlockAccessor
+from ray.data.datasource.block_path_provider import BlockWritePathProvider
+
+from awswrangler._arrow import _df_to_table
+from awswrangler.distributed.ray.datasources.file_datasink import _BlockFileDatasink
+
+_logger: logging.Logger = logging.getLogger(__name__)
+
+
+class ArrowORCDatasink(_BlockFileDatasink):
+    """A datasink that writes ORC files using Arrow."""
+
+    def __init__(
+        self,
+        path: str,
+        *,
+        block_path_provider: Optional[BlockWritePathProvider] = None,
+        dataset_uuid: Optional[str] = None,
+        open_s3_object_args: Optional[Dict[str, Any]] = None,
+        pandas_kwargs: Optional[Dict[str, Any]] = None,
+        schema: Optional[pa.Schema] = None,
+        index: bool = False,
+        dtype: Optional[Dict[str, str]] = None,
+        pyarrow_additional_kwargs: Optional[Dict[str, Any]] = None,
+        **write_args: Any,
+    ):
+        super().__init__(
+            path,
+            file_format="orc",
+            block_path_provider=block_path_provider,
+            dataset_uuid=dataset_uuid,
+            open_s3_object_args=open_s3_object_args,
+            pandas_kwargs=pandas_kwargs,
+            **write_args,
+        )
+
+        self.pyarrow_additional_kwargs = pyarrow_additional_kwargs or {}
+        self.schema = schema
+        self.index = index
+        self.dtype = dtype
+
+    def write_block(self, file: io.TextIOWrapper, block: BlockAccessor) -> None:
+        """
+        Write a block of data to a file.
+
+        Parameters
+        ----------
+        file : io.TextIOWrapper
+        block : BlockAccessor
+        """
+        from pyarrow import orc
+
+        compression: str = self.write_args.get("compression", None) or "UNCOMPRESSED"
+
+        orc.write_table(
+            _df_to_table(block.to_pandas(), schema=self.schema, index=self.index, dtype=self.dtype),
+            file,
+            compression=compression,
+            **self.pyarrow_additional_kwargs,
+        )
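As with the CSV sink, a hedged usage sketch with a placeholder destination and made-up rows; schema, index, and dtype mirror the constructor arguments above, and compression travels through **write_args to the orc.write_table() call in write_block(). It assumes Ray 2.9's Dataset.write_datasink() API:

import pyarrow as pa
import ray

from awswrangler.distributed.ray.datasources import ArrowORCDatasink

sink = ArrowORCDatasink(
    "s3://example-bucket/orc/",  # placeholder destination
    schema=pa.schema([("a", pa.int64())]),
    index=False,
    compression="snappy",  # picked up from write_args in write_block()
)
ray.data.from_items([{"a": 1}, {"a": 2}]).write_datasink(sink)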
