@@ -1647,7 +1647,7 @@ def read_sql_aurora(self,
16471647
16481648 def read_csv_list (
16491649 self ,
1650- paths ,
1650+ paths : List [ str ] ,
16511651 max_result_size = None ,
16521652 header : Optional [str ] = "infer" ,
16531653 names = None ,
@@ -1738,7 +1738,7 @@ def read_csv_list(
17381738
17391739 def _read_csv_list_iterator (
17401740 self ,
1741- paths ,
1741+ paths : List [ str ] ,
17421742 max_result_size = None ,
17431743 header = "infer" ,
17441744 names = None ,
@@ -1802,3 +1802,68 @@ def _read_csv_list_iterator(
18021802 infer_datetime_format = infer_datetime_format ,
18031803 encoding = encoding ,
18041804 converters = converters )
1805+
def read_csv_prefix(
        self,
        path_prefix: str,
        max_result_size=None,
        header: Optional[str] = "infer",
        names=None,
        usecols=None,
        dtype=None,
        sep=",",
        thousands=None,
        decimal=".",
        lineterminator="\n",
        quotechar='"',
        quoting=csv.QUOTE_MINIMAL,
        escapechar=None,
        parse_dates: Union[bool, Dict, List] = False,
        infer_datetime_format=False,
        encoding="utf-8",
        converters=None,
) -> Union[pd.DataFrame, Iterator[pd.DataFrame]]:
    """
    Read every CSV file found under an AWS S3 prefix.

    Lists the objects under ``path_prefix``, discards keys that end with "/"
    and hands the remaining paths to ``read_csv_list()``, forwarding every
    parsing option unchanged.
    Tries to mimic pandas.read_csv() as much as possible:
    https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html
    P.S. max_result_size != None tries to mimic the chunksize behaviour in pandas.read_sql()

    :param path_prefix: AWS S3 path prefix (E.g. S3://BUCKET_NAME/PREFIX)
    :param max_result_size: Max number of bytes on each request to S3
    :param header: Same as pandas.read_csv()
    :param names: Same as pandas.read_csv()
    :param usecols: Same as pandas.read_csv()
    :param dtype: Same as pandas.read_csv()
    :param sep: Same as pandas.read_csv()
    :param thousands: Same as pandas.read_csv()
    :param decimal: Same as pandas.read_csv()
    :param lineterminator: Same as pandas.read_csv() (a single character)
    :param quotechar: Same as pandas.read_csv()
    :param quoting: Same as pandas.read_csv()
    :param escapechar: Same as pandas.read_csv()
    :param parse_dates: Same as pandas.read_csv()
    :param infer_datetime_format: Same as pandas.read_csv()
    :param encoding: Same as pandas.read_csv()
    :param converters: Same as pandas.read_csv()
    :return: Pandas Dataframe or Iterator of Pandas Dataframes if max_result_size != None
    """
    listed: List[str] = self._session.s3.list_objects(path=path_prefix)
    # Keys ending in "/" are skipped (presumably folder placeholder objects
    # rather than readable files -- confirm against the S3 helper).
    paths: List[str] = [key for key in listed if not key.endswith("/")]
    return self.read_csv_list(
        paths=paths,
        max_result_size=max_result_size,
        header=header,
        names=names,
        usecols=usecols,
        dtype=dtype,
        sep=sep,
        thousands=thousands,
        decimal=decimal,
        lineterminator=lineterminator,
        quotechar=quotechar,
        quoting=quoting,
        escapechar=escapechar,
        parse_dates=parse_dates,
        infer_datetime_format=infer_datetime_format,
        encoding=encoding,
        converters=converters,
    )
0 commit comments