
Commit f95194b

Add Pandas.read_csv_prefix()
1 parent 5bff8c2 commit f95194b

File tree: 3 files changed (+98, -9 lines)

awswrangler/pandas.py

Lines changed: 67 additions & 2 deletions
@@ -1647,7 +1647,7 @@ def read_sql_aurora(self,
 
     def read_csv_list(
             self,
-            paths,
+            paths: List[str],
             max_result_size=None,
             header: Optional[str] = "infer",
             names=None,
@@ -1738,7 +1738,7 @@ def read_csv_list(
 
     def _read_csv_list_iterator(
             self,
-            paths,
+            paths: List[str],
             max_result_size=None,
             header="infer",
             names=None,
@@ -1802,3 +1802,68 @@ def _read_csv_list_iterator(
                 infer_datetime_format=infer_datetime_format,
                 encoding=encoding,
                 converters=converters)
+
+    def read_csv_prefix(
+            self,
+            path_prefix: str,
+            max_result_size=None,
+            header: Optional[str] = "infer",
+            names=None,
+            usecols=None,
+            dtype=None,
+            sep=",",
+            thousands=None,
+            decimal=".",
+            lineterminator="\n",
+            quotechar='"',
+            quoting=csv.QUOTE_MINIMAL,
+            escapechar=None,
+            parse_dates: Union[bool, Dict, List] = False,
+            infer_datetime_format=False,
+            encoding="utf-8",
+            converters=None,
+    ) -> Union[pd.DataFrame, Iterator[pd.DataFrame]]:
+        """
+        Read CSV files from an AWS S3 prefix using optimized strategies.
+        Tries to mimic pandas.read_csv() as closely as possible:
+        https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html
+        P.S. max_result_size != None mimics the chunksize behaviour of pandas.read_sql()
+
+        :param path_prefix: AWS S3 path prefix (e.g. s3://BUCKET_NAME/PREFIX)
+        :param max_result_size: Max number of bytes on each request to S3
+        :param header: Same as pandas.read_csv()
+        :param names: Same as pandas.read_csv()
+        :param usecols: Same as pandas.read_csv()
+        :param dtype: Same as pandas.read_csv()
+        :param sep: Same as pandas.read_csv()
+        :param thousands: Same as pandas.read_csv()
+        :param decimal: Same as pandas.read_csv()
+        :param lineterminator: Same as pandas.read_csv()
+        :param quotechar: Same as pandas.read_csv()
+        :param quoting: Same as pandas.read_csv()
+        :param escapechar: Same as pandas.read_csv()
+        :param parse_dates: Same as pandas.read_csv()
+        :param infer_datetime_format: Same as pandas.read_csv()
+        :param encoding: Same as pandas.read_csv()
+        :param converters: Same as pandas.read_csv()
+        :return: Pandas DataFrame, or an Iterator of Pandas DataFrames if max_result_size != None
+        """
+        paths: List[str] = self._session.s3.list_objects(path=path_prefix)
+        paths = [p for p in paths if not p.endswith("/")]
+        return self.read_csv_list(paths=paths,
+                                  max_result_size=max_result_size,
+                                  header=header,
+                                  names=names,
+                                  usecols=usecols,
+                                  dtype=dtype,
+                                  sep=sep,
+                                  thousands=thousands,
+                                  decimal=decimal,
+                                  lineterminator=lineterminator,
+                                  quotechar=quotechar,
+                                  quoting=quoting,
+                                  escapechar=escapechar,
+                                  parse_dates=parse_dates,
+                                  infer_datetime_format=infer_datetime_format,
+                                  encoding=encoding,
+                                  converters=converters)
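
For orientation, a minimal usage sketch of the new method (not part of the commit: the bucket name, prefix, and 128 MB chunk size are hypothetical; wr is the awswrangler top-level module, as in the tests below):

import awswrangler as wr

# Read every CSV object under the prefix into a single DataFrame.
df = wr.pandas.read_csv_prefix(path_prefix="s3://my-bucket/logs/")

# With max_result_size set, an iterator of DataFrames is returned instead,
# so large datasets can be processed chunk by chunk.
df_iter = wr.pandas.read_csv_prefix(path_prefix="s3://my-bucket/logs/",
                                    max_result_size=128 * 1024 * 1024)
for chunk in df_iter:
    print(len(chunk.index))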

awswrangler/s3.py

Lines changed: 8 additions & 7 deletions
@@ -227,14 +227,15 @@ def delete_objects_batch(session_primitives: "SessionPrimitives", bucket, batch)
         for bounder in bounders:
             client_s3.delete_objects(Bucket=bucket, Delete={"Objects": batch[bounder[0]:bounder[1]]})
 
-    def list_objects(self, path):
+    def list_objects(self, path: str) -> List[str]:
+        bucket: str
         bucket, path = self.parse_path(path=path)
-        args = {"Bucket": bucket, "MaxKeys": 1000, "Prefix": path}
-        next_continuation_token = True
-        keys = []
-        while next_continuation_token:
-            res = self._client_s3.list_objects_v2(**args)
-            if not res.get("Contents"):
+        args: Dict[str, Any] = {"Bucket": bucket, "MaxKeys": 1000, "Prefix": path}
+        next_continuation_token: str = ""
+        keys: List[str] = []
+        while next_continuation_token is not None:
+            res: Dict[str, Any] = self._client_s3.list_objects_v2(**args)
+            if res.get("Contents") is None:
                 break
             keys += [f"s3://{bucket}/{x.get('Key')}" for x in res.get("Contents")]
             next_continuation_token = res.get("NextContinuationToken")
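
The hunk ends at the line that reads NextContinuationToken, before the token is fed back into the request arguments (that happens below the visible context). For clarity, here is a self-contained sketch of the same list_objects_v2 pagination pattern in plain boto3; the helper name and standalone form are illustrative, not the library's API:

import boto3

def list_keys(bucket: str, prefix: str) -> list:
    # Collect every object key under a prefix, following S3's pagination.
    client = boto3.client("s3")
    args = {"Bucket": bucket, "MaxKeys": 1000, "Prefix": prefix}
    keys = []
    while True:
        res = client.list_objects_v2(**args)
        for obj in res.get("Contents", []):
            keys.append(f"s3://{bucket}/{obj['Key']}")
        token = res.get("NextContinuationToken")
        if token is None:
            break
        # Feed the token back so the next request resumes after this page.
        args["ContinuationToken"] = token
    return keys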

testing/test_awswrangler/test_pandas.py

Lines changed: 23 additions & 0 deletions
@@ -1749,6 +1749,7 @@ def test_aurora_mysql_unload_simple(bucket, mysql_parameters):
 
 @pytest.mark.parametrize("sample, row_num", [("data_samples/micro.csv", 30), ("data_samples/small.csv", 100)])
 def test_read_csv_list(bucket, sample, row_num):
+    wr.s3.delete_objects(path=f"s3://{bucket}/")
     n = 10
     paths = []
     for i in range(n):
@@ -1762,6 +1763,7 @@ def test_read_csv_list(bucket, sample, row_num):
 
 @pytest.mark.parametrize("sample, row_num", [("data_samples/micro.csv", 30), ("data_samples/small.csv", 100)])
 def test_read_csv_list_iterator(bucket, sample, row_num):
+    wr.s3.delete_objects(path=f"s3://{bucket}/")
     n = 10
     paths = []
     for i in range(n):
@@ -2001,3 +2003,24 @@ def test_aurora_mysql_load_special(bucket, mysql_parameters):
     assert rows[2][2] == "\\\\\\\\"
     assert rows[3][2] == "\"\"\"\""
     conn.close()
+
+
+@pytest.mark.parametrize("sample, row_num", [("data_samples/micro.csv", 30), ("data_samples/small.csv", 100)])
+def test_read_csv_prefix_iterator(bucket, sample, row_num):
+    wr.s3.delete_objects(path=f"s3://{bucket}/")
+    n = 10
+    paths = []
+    for i in range(n):
+        key = f"{sample}_{i}"
+        boto3.client("s3").upload_file(sample, bucket, key)
+        paths.append(f"s3://{bucket}/{key}")
+    sleep(15)
+
+    df_iter = wr.pandas.read_csv_prefix(path_prefix=f"s3://{bucket}/{sample}_", max_result_size=200)
+    total_count = 0
+    for df in df_iter:
+        count = len(df.index)
+        print(f"count: {count}")
+        total_count += count
+    wr.s3.delete_listed_objects(objects_paths=paths)
+    assert total_count == row_num * n
