Skip to content

Commit 537afe5

Browse files
committed
Splitting up and creating two functions, copy_objects() and merge_datasets()
1 parent e6d808d commit 537afe5

File tree

6 files changed

+129
-48
lines changed

6 files changed

+129
-48
lines changed

awswrangler/_utils.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,7 @@ def ensure_cpu_count(use_threads: bool = True) -> int:
6969
7070
Note
7171
----
72-
In case of `use_threads=True` the number of process that could be spawned will be get from os.cpu_count().
72+
In case of `use_threads=True` the number of threads that could be spawned will be obtained from os.cpu_count().
7373
7474
Parameters
7575
----------

awswrangler/athena.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -369,7 +369,7 @@ def read_sql_query( # pylint: disable=too-many-branches,too-many-locals
369369
370370
Note
371371
----
372-
In case of `use_threads=True` the number of process that will be spawned will be get from os.cpu_count().
372+
In case of `use_threads=True` the number of threads that will be spawned will be obtained from os.cpu_count().
373373
374374
Parameters
375375
----------
@@ -605,7 +605,7 @@ def read_sql_table(
605605
606606
Note
607607
----
608-
In case of `use_threads=True` the number of process that will be spawned will be get from os.cpu_count().
608+
In case of `use_threads=True` the number of threads that will be spawned will be obtained from os.cpu_count().
609609
610610
Parameters
611611
----------

awswrangler/db.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -438,7 +438,7 @@ def copy_to_redshift( # pylint: disable=too-many-arguments
438438
439439
Note
440440
----
441-
In case of `use_threads=True` the number of process that will be spawned will be get from os.cpu_count().
441+
In case of `use_threads=True` the number of threads that will be spawned will be obtained from os.cpu_count().
442442
443443
Parameters
444444
----------
@@ -576,7 +576,7 @@ def copy_files_to_redshift( # pylint: disable=too-many-locals,too-many-argument
576576
577577
Note
578578
----
579-
In case of `use_threads=True` the number of process that will be spawned will be get from os.cpu_count().
579+
In case of `use_threads=True` the number of threads that will be spawned will be obtained from os.cpu_count().
580580
581581
Parameters
582582
----------
@@ -798,7 +798,7 @@ def write_redshift_copy_manifest(
798798
799799
Note
800800
----
801-
In case of `use_threads=True` the number of process that will be spawned will be get from os.cpu_count().
801+
In case of `use_threads=True` the number of threads that will be spawned will be obtained from os.cpu_count().
802802
803803
Parameters
804804
----------
@@ -908,7 +908,7 @@ def unload_redshift(
908908
909909
Note
910910
----
911-
In case of `use_threads=True` the number of process that will be spawned will be get from os.cpu_count().
911+
In case of `use_threads=True` the number of threads that will be spawned will be obtained from os.cpu_count().
912912
913913
Parameters
914914
----------
@@ -1024,7 +1024,7 @@ def unload_redshift_to_files(
10241024
10251025
Note
10261026
----
1027-
In case of `use_threads=True` the number of process that will be spawned will be get from os.cpu_count().
1027+
In case of `use_threads=True` the number of threads that will be spawned will be obtained from os.cpu_count().
10281028
10291029
Parameters
10301030
----------

awswrangler/s3.py

Lines changed: 88 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -176,7 +176,7 @@ def delete_objects(
176176
177177
Note
178178
----
179-
In case of `use_threads=True` the number of process that will be spawned will be get from os.cpu_count().
179+
In case of `use_threads=True` the number of threads that will be spawned will be obtained from os.cpu_count().
180180
181181
Parameters
182182
----------
@@ -248,7 +248,7 @@ def describe_objects(
248248
249249
Note
250250
----
251-
In case of `use_threads=True` the number of process that will be spawned will be get from os.cpu_count().
251+
In case of `use_threads=True` the number of threads that will be spawned will be obtained from os.cpu_count().
252252
253253
Parameters
254254
----------
@@ -328,7 +328,7 @@ def size_objects(
328328
329329
Note
330330
----
331-
In case of `use_threads=True` the number of process that will be spawned will be get from os.cpu_count().
331+
In case of `use_threads=True` the number of threads that will be spawned will be obtained from os.cpu_count().
332332
333333
Parameters
334334
----------
@@ -396,7 +396,7 @@ def to_csv( # pylint: disable=too-many-arguments
396396
397397
Note
398398
----
399-
In case of `use_threads=True` the number of process that will be spawned will be get from os.cpu_count().
399+
In case of `use_threads=True` the number of threads that will be spawned will be obtained from os.cpu_count().
400400
401401
Parameters
402402
----------
@@ -426,9 +426,9 @@ def to_csv( # pylint: disable=too-many-arguments
426426
List of column names that will be used to create partitions. Only takes effect if dataset=True.
427427
mode: str, optional
428428
``append`` (Default), ``overwrite``, ``overwrite_partitions``. Only takes effect if dataset=True.
429-
database : str
429+
database : str, optional
430430
Glue/Athena catalog: Database name.
431-
table : str
431+
table : str, optional
432432
Glue/Athena catalog: Table name.
433433
dtype: Dict[str, str], optional
434434
Dictionary of columns names and Athena/Glue types to be casted.
@@ -808,7 +808,7 @@ def to_parquet( # pylint: disable=too-many-arguments
808808
809809
Note
810810
----
811-
In case of `use_threads=True` the number of process that will be spawned will be get from os.cpu_count().
811+
In case of `use_threads=True` the number of threads that will be spawned will be obtained from os.cpu_count().
812812
813813
Parameters
814814
----------
@@ -836,9 +836,9 @@ def to_parquet( # pylint: disable=too-many-arguments
836836
List of column names that will be used to create partitions. Only takes effect if dataset=True.
837837
mode: str, optional
838838
``append`` (Default), ``overwrite``, ``overwrite_partitions``. Only takes effect if dataset=True.
839-
database : str
839+
database : str, optional
840840
Glue/Athena catalog: Database name.
841-
table : str
841+
table : str, optional
842842
Glue/Athena catalog: Table name.
843843
dtype: Dict[str, str], optional
844844
Dictionary of columns names and Athena/Glue types to be casted.
@@ -1153,7 +1153,7 @@ def read_csv(
11531153
11541154
Note
11551155
----
1156-
In case of `use_threads=True` the number of process that will be spawned will be get from os.cpu_count().
1156+
In case of `use_threads=True` the number of threads that will be spawned will be obtained from os.cpu_count().
11571157
11581158
Parameters
11591159
----------
@@ -1236,7 +1236,7 @@ def read_fwf(
12361236
12371237
Note
12381238
----
1239-
In case of `use_threads=True` the number of process that will be spawned will be get from os.cpu_count().
1239+
In case of `use_threads=True` the number of threads that will be spawned will be obtained from os.cpu_count().
12401240
12411241
Parameters
12421242
----------
@@ -1319,7 +1319,7 @@ def read_json(
13191319
13201320
Note
13211321
----
1322-
In case of `use_threads=True` the number of process that will be spawned will be get from os.cpu_count().
1322+
In case of `use_threads=True` the number of threads that will be spawned will be obtained from os.cpu_count().
13231323
13241324
Parameters
13251325
----------
@@ -1524,7 +1524,7 @@ def read_parquet(
15241524
15251525
Note
15261526
----
1527-
In case of `use_threads=True` the number of process that will be spawned will be get from os.cpu_count().
1527+
In case of `use_threads=True` the number of threads that will be spawned will be obtained from os.cpu_count().
15281528
15291529
Parameters
15301530
----------
@@ -1671,7 +1671,7 @@ def read_parquet_metadata(
16711671
16721672
Note
16731673
----
1674-
In case of `use_threads=True` the number of process that will be spawned will be get from os.cpu_count().
1674+
In case of `use_threads=True` the number of threads that will be spawned will be obtained from os.cpu_count().
16751675
16761676
Parameters
16771677
----------
@@ -1743,7 +1743,7 @@ def store_parquet_metadata(
17431743
17441744
Note
17451745
----
1746-
In case of `use_threads=True` the number of process that will be spawned will be get from os.cpu_count().
1746+
In case of `use_threads=True` the number of threads that will be spawned will be obtained from os.cpu_count().
17471747
17481748
Parameters
17491749
----------
@@ -1843,7 +1843,7 @@ def wait_objects_exist(
18431843
18441844
Note
18451845
----
1846-
In case of `use_threads=True` the number of process that will be spawned will be get from os.cpu_count().
1846+
In case of `use_threads=True` the number of threads that will be spawned will be obtained from os.cpu_count().
18471847
18481848
Parameters
18491849
----------
@@ -1895,7 +1895,7 @@ def wait_objects_not_exist(
18951895
18961896
Note
18971897
----
1898-
In case of `use_threads=True` the number of process that will be spawned will be get from os.cpu_count().
1898+
In case of `use_threads=True` the number of threads that will be spawned will be obtained from os.cpu_count().
18991899
19001900
Parameters
19011901
----------
@@ -1981,7 +1981,7 @@ def read_parquet_table(
19811981
19821982
Note
19831983
----
1984-
In case of `use_threads=True` the number of process that will be spawned will be get from os.cpu_count().
1984+
In case of `use_threads=True` the number of threads that will be spawned will be obtained from os.cpu_count().
19851985
19861986
Parameters
19871987
----------
@@ -2054,24 +2054,27 @@ def read_parquet_table(
20542054
)
20552055

20562056

2057-
def copy_objects(
2058-
paths: List[str],
2057+
def merge_datasets(
20592058
source_path: str,
20602059
target_path: str,
20612060
mode: str = "append",
20622061
use_threads: bool = True,
20632062
boto3_session: Optional[boto3.Session] = None,
20642063
) -> List[str]:
2065-
"""Copy a list of S3 objects to another S3 directory.
2064+
"""Merge a source dataset into a target dataset.
2065+
2066+
Note
2067+
----
2068+
If you are merging tables (S3 datasets + Glue Catalog metadata),
2069+
remember that you will also need to update your partitions metadata in some cases.
2070+
(e.g. wr.athena.repair_table(table='...', database='...'))
20662071
20672072
Note
20682073
----
2069-
In case of `use_threads=True` the number of process that will be spawned will be get from os.cpu_count().
2074+
In case of `use_threads=True` the number of threads that will be spawned will be obtained from os.cpu_count().
20702075
20712076
Parameters
20722077
----------
2073-
paths : List[str]
2074-
List of S3 objects paths (e.g. [s3://bucket/dir0/key0, s3://bucket/dir0/key1]).
20752078
source_path : str,
20762079
S3 Path for the source directory.
20772080
target_path : str,
@@ -2092,21 +2095,23 @@ def copy_objects(
20922095
Examples
20932096
--------
20942097
>>> import awswrangler as wr
2095-
>>> wr.s3.copy_objects(
2096-
... paths=["s3://bucket0/dir0/key0", "s3://bucket0/dir0/key1"])
2098+
>>> wr.s3.merge_datasets(
20972099
... source_path="s3://bucket0/dir0/",
20982100
... target_path="s3://bucket1/dir1/",
20992101
... mode="append"
21002102
... )
21012103
["s3://bucket1/dir1/key0", "s3://bucket1/dir1/key1"]
21022104
21032105
"""
2104-
_logger.debug(f"len(paths): {len(paths)}")
2105-
if len(paths) < 1:
2106-
return []
21072106
source_path = source_path[:-1] if source_path[-1] == "/" else source_path
21082107
target_path = target_path[:-1] if target_path[-1] == "/" else target_path
21092108
session: boto3.Session = _utils.ensure_session(session=boto3_session)
2109+
2110+
paths: List[str] = list_objects(path=f"{source_path}/", boto3_session=session)
2111+
_logger.debug(f"len(paths): {len(paths)}")
2112+
if len(paths) < 1:
2113+
return []
2114+
21102115
if mode == "overwrite":
21112116
_logger.debug(f"Deleting to overwrite: {target_path}/")
21122117
delete_objects(path=f"{target_path}/", use_threads=use_threads, boto3_session=session)
@@ -2121,6 +2126,60 @@ def copy_objects(
21212126
elif mode != "append":
21222127
raise exceptions.InvalidArgumentValue(f"{mode} is a invalid mode option.")
21232128

2129+
new_objects: List[str] = copy_objects(paths=paths, source_path=source_path, target_path=target_path, use_threads=use_threads, boto3_session=session)
2130+
_logger.debug(f"len(new_objects): {len(new_objects)}")
2131+
return new_objects
2132+
2133+
2134+
def copy_objects(
2135+
paths: List[str],
2136+
source_path: str,
2137+
target_path: str,
2138+
use_threads: bool = True,
2139+
boto3_session: Optional[boto3.Session] = None,
2140+
) -> List[str]:
2141+
"""Copy a list of S3 objects to another S3 directory.
2142+
2143+
Note
2144+
----
2145+
In case of `use_threads=True` the number of threads that will be spawned will be obtained from os.cpu_count().
2146+
2147+
Parameters
2148+
----------
2149+
paths : List[str]
2150+
List of S3 objects paths (e.g. [s3://bucket/dir0/key0, s3://bucket/dir0/key1]).
2151+
source_path : str,
2152+
S3 Path for the source directory.
2153+
target_path : str,
2154+
S3 Path for the target directory.
2155+
use_threads : bool
2156+
True to enable concurrent requests, False to disable multiple threads.
2157+
If enabled os.cpu_count() will be used as the max number of threads.
2158+
boto3_session : boto3.Session(), optional
2159+
Boto3 Session. The default boto3 session will be used if boto3_session receive None.
2160+
2161+
Returns
2162+
-------
2163+
List[str]
2164+
List of new objects paths.
2165+
2166+
Examples
2167+
--------
2168+
>>> import awswrangler as wr
2169+
>>> wr.s3.copy_objects(
2170+
... paths=["s3://bucket0/dir0/key0", "s3://bucket0/dir0/key1"],
2171+
... source_path="s3://bucket0/dir0/",
2172+
... target_path="s3://bucket1/dir1/",
2173+
... )
2174+
["s3://bucket1/dir1/key0", "s3://bucket1/dir1/key1"]
2175+
2176+
"""
2177+
_logger.debug(f"len(paths): {len(paths)}")
2178+
if len(paths) < 1:
2179+
return []
2180+
source_path = source_path[:-1] if source_path[-1] == "/" else source_path
2181+
target_path = target_path[:-1] if target_path[-1] == "/" else target_path
2182+
session: boto3.Session = _utils.ensure_session(session=boto3_session)
21242183
batch: List[Tuple[str, str]] = []
21252184
new_objects: List[str] = []
21262185
for path in paths:

docs/source/api.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ Amazon S3
3030
wait_objects_exist
3131
wait_objects_not_exist
3232
copy_objects
33+
merge_datasets
3334

3435
AWS Glue Catalog
3536
----------------

0 commit comments

Comments
 (0)