@@ -433,6 +433,7 @@ def to_csv(
             self,
             dataframe,
             path,
+            sep=",",
             database=None,
             table=None,
             partition_cols=None,
@@ -447,6 +448,7 @@ def to_csv(
 
         :param dataframe: Pandas DataFrame
         :param path: AWS S3 path (E.g. s3://bucket-name/folder_name/)
+        :param sep: CSV field delimiter (same as the "sep" argument of pandas.DataFrame.to_csv())
         :param database: AWS Glue Database name
         :param table: AWS Glue table name
         :param partition_cols: List of column names that will be partitions on S3
@@ -456,18 +458,18 @@ def to_csv(
         :param procs_io_bound: Number of cores used for I/O bound tasks
         :return: List of objects written on S3
         """
-        return self.to_s3(
-            dataframe=dataframe,
-            path=path,
-            file_format="csv",
-            database=database,
-            table=table,
-            partition_cols=partition_cols,
-            preserve_index=preserve_index,
-            mode=mode,
-            procs_cpu_bound=procs_cpu_bound,
-            procs_io_bound=procs_io_bound,
-        )
+        extra_args = {"sep": sep}
+        return self.to_s3(dataframe=dataframe,
+                          path=path,
+                          file_format="csv",
+                          database=database,
+                          table=table,
+                          partition_cols=partition_cols,
+                          preserve_index=preserve_index,
+                          mode=mode,
+                          procs_cpu_bound=procs_cpu_bound,
+                          procs_io_bound=procs_io_bound,
+                          extra_args=extra_args)
 
     def to_parquet(self,
                    dataframe,
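A minimal usage sketch of the new `sep` parameter, assuming the library's `Session().pandas` entry point, a placeholder bucket, and configured AWS credentials (none of which are part of this diff):

```python
import pandas as pd
import awswrangler

session = awswrangler.Session()  # assumes AWS credentials are configured
df = pd.DataFrame({"id": [1, 2, 3], "name": ["foo", "boo", "bar"]})

# sep is forwarded to pandas.DataFrame.to_csv through the new extra_args dict
session.pandas.to_csv(dataframe=df,
                      path="s3://bucket-name/folder_name/",  # placeholder
                      sep="|")
```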
@@ -519,7 +521,8 @@ def to_s3(self,
               mode="append",
               procs_cpu_bound=None,
               procs_io_bound=None,
-              cast_columns=None):
+              cast_columns=None,
+              extra_args=None):
         """
         Write a Pandas DataFrame to S3
         Optionally writes metadata on AWS Glue.
@@ -535,6 +538,7 @@ def to_s3(self,
         :param procs_cpu_bound: Number of cores used for CPU bound tasks
         :param procs_io_bound: Number of cores used for I/O bound tasks
         :param cast_columns: Dictionary of column indexes and the Arrow types to cast them to (E.g. {2: "int64", 5: "int32"}) (only for the "parquet" file_format)
+        :param extra_args: Extra arguments specific to each file format (E.g. "sep" for CSV)
         :return: List of objects written on S3
         """
         if dataframe.empty:
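Since `to_csv` now just packs `sep` into `extra_args`, the same option can be passed to `to_s3` directly. A sketch continuing the example above (argument values are illustrative):

```python
# Equivalent call one layer down: format-specific options travel in a
# single dict instead of one keyword argument per option.
session.pandas.to_s3(dataframe=df,
                     path="s3://bucket-name/folder_name/",  # placeholder
                     file_format="csv",
                     preserve_index=False,
                     mode="append",
                     extra_args={"sep": "|"})  # only "sep" is honored for CSV
```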
@@ -554,7 +558,8 @@ def to_s3(self,
             mode=mode,
             procs_cpu_bound=procs_cpu_bound,
             procs_io_bound=procs_io_bound,
-            cast_columns=cast_columns)
+            cast_columns=cast_columns,
+            extra_args=extra_args)
         if database:
             self._session.glue.metadata_to_glue(dataframe=dataframe,
                                                 path=path,
@@ -565,7 +570,8 @@ def to_s3(self,
                                                 preserve_index=preserve_index,
                                                 file_format=file_format,
                                                 mode=mode,
-                                                cast_columns=cast_columns)
+                                                cast_columns=cast_columns,
+                                                extra_args=extra_args)
         return objects_paths
 
     def data_to_s3(self,
@@ -577,7 +583,8 @@ def data_to_s3(self,
                    mode="append",
                    procs_cpu_bound=None,
                    procs_io_bound=None,
-                   cast_columns=None):
+                   cast_columns=None,
+                   extra_args=None):
         if not procs_cpu_bound:
             procs_cpu_bound = self._session.procs_cpu_bound
         if not procs_io_bound:
@@ -601,7 +608,8 @@ def data_to_s3(self,
                 target=self._data_to_s3_dataset_writer_remote,
                 args=(send_pipe, dataframe.iloc[bounder[0]:bounder[1], :],
                       path, partition_cols, preserve_index,
-                      self._session.primitives, file_format, cast_columns),
+                      self._session.primitives, file_format, cast_columns,
+                      extra_args),
             )
             proc.daemon = False
             proc.start()
@@ -619,7 +627,8 @@ def data_to_s3(self,
                 preserve_index=preserve_index,
                 session_primitives=self._session.primitives,
                 file_format=file_format,
-                cast_columns=cast_columns)
+                cast_columns=cast_columns,
+                extra_args=extra_args)
         if mode == "overwrite_partitions" and partition_cols:
             if procs_io_bound > procs_cpu_bound:
                 num_procs = floor(
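Note that `extra_args` is simply appended to the `args` tuple each worker process receives and flows back through the pipe untouched. A standalone sketch of that fan-out pattern, simplified from the library's code (the worker body and paths are invented):

```python
import multiprocessing as mp

def writer_remote(send_pipe, rows, extra_args):
    # Stand-in for _data_to_s3_dataset_writer_remote: "write" a slice,
    # then report the object paths back to the parent through the pipe.
    paths = ["s3://bucket/part-%d" % i for i in rows]  # pretend writes
    assert "sep" in extra_args                         # options arrived intact
    send_pipe.send(paths)
    send_pipe.close()

if __name__ == "__main__":
    receive_pipe, send_pipe = mp.Pipe()
    proc = mp.Process(target=writer_remote,
                      args=(send_pipe, range(3), {"sep": "\t"}))
    proc.start()
    print(receive_pipe.recv())  # collect results before join, as data_to_s3 does
    receive_pipe.close()
    proc.join()
```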
@@ -639,7 +648,8 @@ def _data_to_s3_dataset_writer(dataframe,
                                    preserve_index,
                                    session_primitives,
                                    file_format,
-                                   cast_columns=None):
+                                   cast_columns=None,
+                                   extra_args=None):
         objects_paths = []
         if not partition_cols:
             object_path = Pandas._data_to_s3_object_writer(
@@ -648,7 +658,8 @@ def _data_to_s3_dataset_writer(dataframe,
                 preserve_index=preserve_index,
                 session_primitives=session_primitives,
                 file_format=file_format,
-                cast_columns=cast_columns)
+                cast_columns=cast_columns,
+                extra_args=extra_args)
             objects_paths.append(object_path)
         else:
             for keys, subgroup in dataframe.groupby(partition_cols):
@@ -665,21 +676,21 @@ def _data_to_s3_dataset_writer(dataframe,
                     preserve_index=preserve_index,
                     session_primitives=session_primitives,
                     file_format=file_format,
-                    cast_columns=cast_columns)
+                    cast_columns=cast_columns,
+                    extra_args=extra_args)
                 objects_paths.append(object_path)
         return objects_paths
 
     @staticmethod
-    def _data_to_s3_dataset_writer_remote(
-            send_pipe,
-            dataframe,
-            path,
-            partition_cols,
-            preserve_index,
-            session_primitives,
-            file_format,
-            cast_columns=None,
-    ):
+    def _data_to_s3_dataset_writer_remote(send_pipe,
+                                          dataframe,
+                                          path,
+                                          partition_cols,
+                                          preserve_index,
+                                          session_primitives,
+                                          file_format,
+                                          cast_columns=None,
+                                          extra_args=None):
         send_pipe.send(
             Pandas._data_to_s3_dataset_writer(
                 dataframe=dataframe,
@@ -688,7 +699,8 @@ def _data_to_s3_dataset_writer_remote(
                 preserve_index=preserve_index,
                 session_primitives=session_primitives,
                 file_format=file_format,
-                cast_columns=cast_columns))
+                cast_columns=cast_columns,
+                extra_args=extra_args))
         send_pipe.close()
 
     @staticmethod
@@ -697,7 +709,8 @@ def _data_to_s3_object_writer(dataframe,
                                   preserve_index,
                                   session_primitives,
                                   file_format,
-                                  cast_columns=None):
+                                  cast_columns=None,
+                                  extra_args=None):
         fs = s3.get_fs(session_primitives=session_primitives)
         fs = pyarrow.filesystem._ensure_filesystem(fs)
         s3.mkdir_if_not_exists(fs, path)
@@ -713,27 +726,40 @@ def _data_to_s3_object_writer(dataframe,
                 path=object_path,
                 preserve_index=preserve_index,
                 fs=fs,
-                cast_columns=cast_columns)
+                cast_columns=cast_columns,
+                extra_args=extra_args)
         elif file_format == "csv":
-            Pandas.write_csv_dataframe(
-                dataframe=dataframe,
-                path=object_path,
-                preserve_index=preserve_index,
-                fs=fs,
-            )
+            Pandas.write_csv_dataframe(dataframe=dataframe,
+                                       path=object_path,
+                                       preserve_index=preserve_index,
+                                       fs=fs,
+                                       extra_args=extra_args)
         return object_path
 
     @staticmethod
-    def write_csv_dataframe(dataframe, path, preserve_index, fs):
+    def write_csv_dataframe(dataframe,
+                            path,
+                            preserve_index,
+                            fs,
+                            extra_args=None):
+        csv_extra_args = {}
+        if extra_args is not None and "sep" in extra_args:
+            csv_extra_args["sep"] = extra_args["sep"]
         csv_buffer = bytes(
-            dataframe.to_csv(None, header=False, index=preserve_index),
-            "utf-8")
+            dataframe.to_csv(None,
+                             header=False,
+                             index=preserve_index,
+                             **csv_extra_args), "utf-8")
         with fs.open(path, "wb") as f:
             f.write(csv_buffer)
 
     @staticmethod
-    def write_parquet_dataframe(dataframe, path, preserve_index, fs,
-                                cast_columns):
+    def write_parquet_dataframe(dataframe,
+                                path,
+                                preserve_index,
+                                fs,
+                                cast_columns,
+                                extra_args=None):
         if not cast_columns:
             cast_columns = {}
         casted_in_pandas = []
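The `csv_extra_args` filter above copies only the keys the CSV writer understands, so `extra_args` entries aimed at other formats can never leak into `pandas.DataFrame.to_csv`. A self-contained sketch of the same pattern (the function name and sample data are invented):

```python
import pandas as pd

def render_csv(dataframe, preserve_index=False, extra_args=None):
    extra_args = extra_args or {}   # guard against extra_args=None
    csv_extra_args = {}
    if "sep" in extra_args:         # allow-list: only "sep" passes through
        csv_extra_args["sep"] = extra_args["sep"]
    return bytes(
        dataframe.to_csv(None, header=False, index=preserve_index,
                         **csv_extra_args), "utf-8")

df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
# "compression" is ignored by the filter; only the delimiter changes.
print(render_csv(df, extra_args={"sep": "|", "compression": "snappy"}))
# b'1|3\n2|4\n'
```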