Commit d3650ff

feat: Opensearch parallel bulk (#2310)
* feat: Opensearch parallel bulk
* Minor refactor
* Raise an error if using args not compatible with parallel bulk
* [skip-ci] Docstrings
1 parent 7fd7302 commit d3650ff
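
For context, a minimal usage sketch of what this commit enables (the connection details below are illustrative placeholders, not part of the commit). Per the new docstring, `use_threads=True` uses os.cpu_count() as the maximum number of threads, an integer uses that many threads, and False (the default) keeps the previous serial bulk behavior:

    import awswrangler as wr

    client = wr.opensearch.connect(host="my-opensearch-domain")  # illustrative endpoint

    # True -> os.cpu_count() threads; an int -> that many threads; False (default) -> serial bulk
    wr.opensearch.index_documents(
        client,
        documents=[{"_id": "1", "name": "John"}, {"_id": "2", "name": "George"}],
        index="sample-index1",
        use_threads=2,
    )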

2 files changed (+94, -20)

awswrangler/opensearch/_write.py (73 additions, 20 deletions)

@@ -260,6 +260,7 @@ def index_json(
     doc_type: Optional[str] = None,
     boto3_session: Optional[boto3.Session] = boto3.Session(),
     json_path: Optional[str] = None,
+    use_threads: Union[bool, int] = False,
     **kwargs: Any,
 ) -> Any:
     """Index all documents from JSON file to OpenSearch index.
@@ -284,6 +285,10 @@ def index_json(
     boto3_session : boto3.Session(), optional
         Boto3 Session to be used to access s3 if s3 path is provided.
         The default boto3 Session will be used if boto3_session receive None.
+    use_threads : bool, int
+        True to enable concurrent requests, False to disable multiple threads.
+        If enabled os.cpu_count() will be used as the max number of threads.
+        If integer is provided, specified number is used.
     **kwargs :
         KEYWORD arguments forwarded to :func:`~awswrangler.opensearch.index_documents`
         which is used to execute the operation
@@ -324,7 +329,9 @@ def index_json(
     documents = list(_file_line_generator(path, is_json=True))
     if json_path:
         documents = _get_documents_w_json_path(documents, json_path)
-    return index_documents(client=client, documents=documents, index=index, doc_type=doc_type, **kwargs)
+    return index_documents(
+        client=client, documents=documents, index=index, doc_type=doc_type, use_threads=use_threads, **kwargs
+    )


 @_utils.check_optional_dependency(opensearchpy, "opensearchpy")
@@ -334,6 +341,7 @@ def index_csv(
     index: str,
     doc_type: Optional[str] = None,
     pandas_kwargs: Optional[Dict[str, Any]] = None,
+    use_threads: Union[bool, int] = False,
     **kwargs: Any,
 ) -> Any:
     """Index all documents from a CSV file to OpenSearch index.
@@ -353,6 +361,10 @@ def index_csv(
         e.g. pandas_kwargs={'sep': '|', 'na_values': ['null', 'none']}
         https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html
         Note: these params values are enforced: `skip_blank_lines=True`
+    use_threads : bool, int
+        True to enable concurrent requests, False to disable multiple threads.
+        If enabled os.cpu_count() will be used as the max number of threads.
+        If integer is provided, specified number is used.
     **kwargs :
         KEYWORD arguments forwarded to :func:`~awswrangler.opensearch.index_documents`
         which is used to execute the operation
@@ -396,12 +408,17 @@ def index_csv(
     }
     pandas_kwargs.update(enforced_pandas_params)
     df = pd.read_csv(path, **pandas_kwargs)
-    return index_df(client, df=df, index=index, doc_type=doc_type, **kwargs)
+    return index_df(client, df=df, index=index, doc_type=doc_type, use_threads=use_threads, **kwargs)


 @_utils.check_optional_dependency(opensearchpy, "opensearchpy")
 def index_df(
-    client: "opensearchpy.OpenSearch", df: pd.DataFrame, index: str, doc_type: Optional[str] = None, **kwargs: Any
+    client: "opensearchpy.OpenSearch",
+    df: pd.DataFrame,
+    index: str,
+    doc_type: Optional[str] = None,
+    use_threads: Union[bool, int] = False,
+    **kwargs: Any,
 ) -> Any:
     """Index all documents from a DataFrame to OpenSearch index.

@@ -415,6 +432,10 @@ def index_df(
         Name of the index.
     doc_type : str, optional
         Name of the document type (for Elasticsearch versions 5.x and earlier).
+    use_threads : bool, int
+        True to enable concurrent requests, False to disable multiple threads.
+        If enabled os.cpu_count() will be used as the max number of threads.
+        If integer is provided, specified number is used.
     **kwargs :
         KEYWORD arguments forwarded to :func:`~awswrangler.opensearch.index_documents`
         which is used to execute the operation
@@ -438,7 +459,14 @@ def index_df(
     ...     index='sample-index1'
     ... )
     """
-    return index_documents(client=client, documents=_df_doc_generator(df), index=index, doc_type=doc_type, **kwargs)
+    return index_documents(
+        client=client,
+        documents=_df_doc_generator(df),
+        index=index,
+        doc_type=doc_type,
+        use_threads=use_threads,
+        **kwargs,
+    )


 @_utils.check_optional_dependency(opensearchpy, "opensearchpy")
@@ -453,13 +481,19 @@ def index_documents(
     bulk_size: int = 1000,
     chunk_size: Optional[int] = 500,
     max_chunk_bytes: Optional[int] = 100 * 1024 * 1024,
-    max_retries: Optional[int] = 5,
-    initial_backoff: Optional[int] = 2,
-    max_backoff: Optional[int] = 600,
+    max_retries: Optional[int] = None,
+    initial_backoff: Optional[int] = None,
+    max_backoff: Optional[int] = None,
+    use_threads: Union[bool, int] = False,
     **kwargs: Any,
 ) -> Dict[str, Any]:
     """Index all documents to OpenSearch index.

+    Note
+    ----
+    `max_retries`, `initial_backoff`, and `max_backoff` are not supported with parallel bulk
+    (when `use_threads` is set to True).
+
     Note
     ----
     Some of the args are referenced from opensearch-py client library (bulk helpers)
@@ -501,6 +535,10 @@ def index_documents(
         Any subsequent retries will be powers of ``initial_backoff*2**retry_number`` (default: 2)
     max_backoff: int, optional
         maximum number of seconds a retry will wait (default: 600)
+    use_threads : bool, int
+        True to enable concurrent requests, False to disable multiple threads.
+        If enabled os.cpu_count() will be used as the max number of threads.
+        If integer is provided, specified number is used.
     **kwargs :
         KEYWORD arguments forwarded to bulk operation
         elasticsearch >= 7.10.2 / opensearch: \
@@ -528,6 +566,11 @@ def index_documents(
     if "refresh" in kwargs and _is_serverless(client):
         raise exceptions.NotSupported("Refresh policy not supported in OpenSearch Serverless.")

+    if use_threads and any([max_retries, initial_backoff, max_backoff]):
+        raise exceptions.InvalidArgumentCombination(
+            "`max_retries`, `initial_backoff`, and `max_backoff` are not supported when `use_threads` is set to True"
+        )
+
     if not isinstance(documents, list):
         documents = list(documents)
     total_documents = len(documents)
@@ -556,20 +599,30 @@ def index_documents(
             refresh_interval = _get_refresh_interval(client, index)
             _disable_refresh_interval(client, index)
         _logger.debug("running bulk index of %s documents", len(bulk_chunk_documents))
-        _success, _errors = opensearchpy.helpers.bulk(
-            client=client,
-            actions=bulk_chunk_documents,
-            ignore_status=ignore_status,
-            chunk_size=chunk_size,
-            max_chunk_bytes=max_chunk_bytes,
-            max_retries=max_retries,
-            initial_backoff=initial_backoff,
-            max_backoff=max_backoff,
-            request_timeout=30,
+        bulk_kwargs = {
+            "ignore_status": ignore_status,
+            "chunk_size": chunk_size,
+            "max_chunk_bytes": max_chunk_bytes,
+            "request_timeout": 30,
             **kwargs,
-        )
-        success += _success
-        errors += _errors
+        }
+        _logger.debug("running bulk with kwargs: %s", bulk_kwargs)
+        if use_threads:
+            # Parallel bulk does not support max_retries, initial_backoff & max_backoff
+            for _success, _errors in opensearchpy.helpers.parallel_bulk(
+                client, bulk_chunk_documents, **bulk_kwargs
+            ):
+                success += _success
+                errors += _errors
+        else:
+            # Defaults
+            bulk_kwargs["max_retries"] = 5 if not max_retries else max_retries
+            bulk_kwargs["initial_backoff"] = 2 if not initial_backoff else initial_backoff
+            bulk_kwargs["max_backoff"] = 600 if not max_backoff else max_backoff

+            _success, _errors = opensearchpy.helpers.bulk(client, bulk_chunk_documents, **bulk_kwargs)
+            success += _success
+            errors += _errors
         _logger.debug("indexed %s documents (%s/%s)", _success, success, total_documents)
         if progressbar:
             progress_bar.update(success, force=True)
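
Taken together, the hunks above change the retry/backoff defaults to None so the serial path can restore the old defaults (5 / 2 / 600) while the parallel path rejects those arguments. A hedged sketch of the resulting call-level behavior (document and index names are placeholders, not part of the commit):

    docs = [{"_id": "1", "name": "John"}]

    # Parallel path: chunks go through opensearchpy.helpers.parallel_bulk; retries are not configurable
    wr.opensearch.index_documents(client, documents=docs, index="sample-index1", use_threads=True)

    # Serial path: opensearchpy.helpers.bulk with the previous defaults (max_retries=5, initial_backoff=2, max_backoff=600)
    wr.opensearch.index_documents(client, documents=docs, index="sample-index1", max_retries=10)

    # Mixing the two raises exceptions.InvalidArgumentCombination
    wr.opensearch.index_documents(client, documents=docs, index="sample-index1", use_threads=True, max_retries=10)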

tests/unit/test_opensearch.py (21 additions, 0 deletions)

@@ -312,6 +312,27 @@ def test_index_documents(client):
     wr.opensearch.delete_index(client, index)


+@pytest.mark.parametrize("use_threads", [False, True, 2])
+def test_index_documents_parallel(client, use_threads):
+    index = f"test_index_documents_{_get_unique_suffix()}"
+    # Pre-create index to avoid multiple threads creating conflicting mappings
+    wr.opensearch.create_index(
+        client=client,
+        index=index,
+        mappings={"properties": {"name": {"type": "text"}}},
+    )
+    try:
+        response = wr.opensearch.index_documents(
+            client,
+            documents=[{"_id": "1", "name": "John"}, {"_id": "2", "name": "George"}, {"_id": "3", "name": "Julia"}],
+            index=index,
+            use_threads=use_threads,
+        )
+        assert response.get("success", 0) == 3
+    finally:
+        wr.opensearch.delete_index(client, index)
+
+
 def test_index_documents_id_keys(client):
     index = f"test_index_documents_id_keys_{_get_unique_suffix()}"
     try:
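
Assuming a test environment where the `client` fixture can reach an OpenSearch domain, the new parametrized case can be run on its own with pytest's `-k` filter:

    pytest tests/unit/test_opensearch.py -k test_index_documents_parallel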
