feat: Support additional parameters for Neptune bulk load (#2297)

LeonLuttenberger · web-flow · commit 22d6d0892b35 · 2023-05-26T11:02:35.000-05:00
diff --git a/awswrangler/neptune/__init__.py b/awswrangler/neptune/__init__.py
@@ -1,4 +1,5 @@
 """Utilities Module for Amazon Neptune."""
+from awswrangler.neptune._client import BulkLoadParserConfiguration
 from awswrangler.neptune._gremlin_parser import GremlinParser
 from awswrangler.neptune._neptune import (
     bulk_load,
@@ -23,4 +24,5 @@
     "bulk_load_from_files",
     "GremlinParser",
     "flatten_nested_df",
+    "BulkLoadParserConfiguration",
 ]
diff --git a/awswrangler/neptune/_client.py b/awswrangler/neptune/_client.py
@@ -2,11 +2,12 @@
 """Amazon NeptuneClient Module."""
 
 import logging
-from typing import Any, Dict, List, Optional, Union, cast
+from typing import Any, Dict, List, Optional, TypedDict, Union, cast
 
 import boto3
 from botocore.auth import SigV4Auth
 from botocore.awsrequest import AWSPreparedRequest, AWSRequest
+from typing_extensions import Literal, NotRequired
 
 import awswrangler.neptune._gremlin_init as gremlin
 from awswrangler import _utils, exceptions
@@ -27,6 +28,28 @@
 WS_PROTOCOL = "wss"
 
 
+class BulkLoadParserConfiguration(TypedDict):
+    """Typed dictionary representing the additional parser configuration for the Neptune Bulk Loader."""
+
+    namedGraphUri: NotRequired[str]
+    """
+    The default graph for all RDF formats when no graph is specified
+    (for non-quads formats and NQUAD entries with no graph).
+    """
+    baseUri: NotRequired[str]
+    """The base URI for RDF/XML and Turtle formats."""
+    allowEmptyStrings: NotRequired[bool]
+    """
+    Gremlin users need to be able to pass empty string values (``""``) as node
+    and edge properties when loading CSV data.
+    If ``allowEmptyStrings`` is set to ``False`` (the default),
+    such empty strings are treated as nulls and are not loaded.
+
+    If ``allowEmptyStrings`` is set to ``True``, the loader treats empty strings
+    as valid property values and loads them accordingly.
+    """
+
+
 class NeptuneClient:
     """Class representing a Neptune cluster connection."""
 
@@ -280,7 +303,18 @@ def status(self) -> Any:
         res = self._http_session.send(req)
         return res.json()
 
-    def load(self, s3_path: str, role_arn: str, parallelism: str = "HIGH", format: str = "csv") -> str:
+    def load(
+        self,
+        s3_path: str,
+        role_arn: str,
+        parallelism: Literal["LOW", "MEDIUM", "HIGH", "OVERSUBSCRIBE"] = "HIGH",
+        mode: Literal["RESUME", "NEW", "AUTO"] = "AUTO",
+        format: str = "csv",
+        parser_configuration: Optional[BulkLoadParserConfiguration] = None,
+        update_single_cardinality_properties: Literal["TRUE", "FALSE"] = "FALSE",
+        queue_request: Literal["TRUE", "FALSE"] = "FALSE",
+        dependencies: Optional[List[str]] = None,
+    ) -> str:
         """
         Start the Neptune Loader command for loading CSV data from external files on S3 into a Neptune DB cluster.
 
@@ -295,24 +329,52 @@ def load(self, s3_path: str, role_arn: str, parallelism: str = "HIGH", format: s
             see `Prerequisites: IAM Role and Amazon S3 Access <https://docs.aws.amazon.com/neptune/latest/userguide/bulk-load-tutorial-IAM.html>`_.
         parallelism: str
             Specifies the number of threads used by the bulk load process.
+        mode: str
+            The load job mode.
+
+            In ``RESUME`` mode, the loader looks for a previous load from this source, and if it finds one, resumes that load job.
+            If no previous load job is found, the loader stops.
+
+            In ``NEW`` mode, the loader creates a new load request regardless of any previous loads.
+            You can use this mode to reload all the data from a source after dropping previously loaded data from your Neptune cluster, or to load new data available at the same source.
+
+            In ``AUTO`` mode, the loader looks for a previous load job from the same source, and if it finds one, resumes that job, just as in ``RESUME`` mode. If it does not find a previous load job from the same source, it loads all data from the source, just as in ``NEW`` mode.
         format: str
             The format of the data. For more information about data formats for the Neptune Loader command,
             see `Using the Amazon Neptune Bulk Loader to Ingest Data <https://docs.aws.amazon.com/neptune/latest/userguide/load-api-reference-load.html#:~:text=The%20format%20of%20the%20data.%20For%20more%20information%20about%20data%20formats%20for%20the%20Neptune%20Loader%20command%2C%20see%20Using%20the%20Amazon%20Neptune%20Bulk%20Loader%20to%20Ingest%20Data.>`_.
+        parser_configuration: dict[str, Any], optional
+            An optional object with additional parser configuration values.
+            Each of the child parameters is also optional: ``namedGraphUri``, ``baseUri`` and ``allowEmptyStrings``.
+        update_single_cardinality_properties: str
+            An optional parameter that controls how the bulk loader
+            treats a new value for single-cardinality vertex or edge properties.
+        queue_request: str
+            An optional flag parameter that indicates whether the load request can be queued up or not.
+
+            If omitted or set to ``"FALSE"``, the load request will fail if another load job is already running.
+        dependencies: list[str], optional
+            An optional parameter that can make a queued load request contingent on the successful completion of one or more previous jobs in the queue.
 
         Returns
         -------
         str
             ID of the load job
         """
-        data = {
+        data: Dict[str, Any] = {
             "source": s3_path,
             "format": format,
             "iamRoleArn": role_arn,
-            "mode": "AUTO",
+            "mode": mode,
             "region": self.region,
             "failOnError": "TRUE",
             "parallelism": parallelism,
+            "updateSingleCardinalityProperties": update_single_cardinality_properties,
+            "queueRequest": queue_request,
         }
+        if parser_configuration:
+            data["parserConfiguration"] = parser_configuration
+        if dependencies:
+            data["dependencies"] = dependencies
 
         url = f"https://{self.host}:{self.port}/loader"
 
diff --git a/awswrangler/neptune/_neptune.py b/awswrangler/neptune/_neptune.py
@@ -4,15 +4,15 @@
 import logging
 import re
 import time
-from typing import Any, Callable, Dict, Literal, Optional, TypeVar, Union
+from typing import Any, Callable, Dict, List, Literal, Optional, TypeVar, Union
 
 import boto3
 
 import awswrangler.neptune._gremlin_init as gremlin
 import awswrangler.pandas as pd
 from awswrangler import _utils, exceptions, s3
 from awswrangler._config import apply_configs
-from awswrangler.neptune._client import NeptuneClient
+from awswrangler.neptune._client import BulkLoadParserConfiguration, NeptuneClient
 
 gremlin_python = _utils.import_optional_dependency("gremlin_python")
 opencypher = _utils.import_optional_dependency("requests")
@@ -285,6 +285,10 @@ def bulk_load(
     iam_role: str,
     neptune_load_wait_polling_delay: float = 0.25,
     load_parallelism: Literal["LOW", "MEDIUM", "HIGH", "OVERSUBSCRIBE"] = "HIGH",
+    parser_configuration: Optional[BulkLoadParserConfiguration] = None,
+    update_single_cardinality_properties: Literal["TRUE", "FALSE"] = "FALSE",
+    queue_request: Literal["TRUE", "FALSE"] = "FALSE",
+    dependencies: Optional[List[str]] = None,
     keep_files: bool = False,
     use_threads: Union[bool, int] = True,
     boto3_session: Optional[boto3.Session] = None,
@@ -312,6 +316,18 @@ def bulk_load(
         Interval in seconds for how often the function will check if the Neptune bulk load has completed.
     load_parallelism: str
         Specifies the number of threads used by Neptune's bulk load process.
+    parser_configuration: dict[str, Any], optional
+        An optional object with additional parser configuration values.
+        Each of the child parameters is also optional: ``namedGraphUri``, ``baseUri`` and ``allowEmptyStrings``.
+    update_single_cardinality_properties: str
+        An optional parameter that controls how the bulk loader
+        treats a new value for single-cardinality vertex or edge properties.
+    queue_request: str
+        An optional flag parameter that indicates whether the load request can be queued up or not.
+
+        If omitted or set to ``"FALSE"``, the load request will fail if another load job is already running.
+    dependencies: list[str], optional
+        An optional parameter that can make a queued load request contingent on the successful completion of one or more previous jobs in the queue.
     keep_files: bool
         Whether to keep stage files or delete them. False by default.
     use_threads: bool | int
@@ -352,8 +368,13 @@ def bulk_load(
             client=client,
             path=path,
             iam_role=iam_role,
+            format="csv",
             neptune_load_wait_polling_delay=neptune_load_wait_polling_delay,
             load_parallelism=load_parallelism,
+            parser_configuration=parser_configuration,
+            update_single_cardinality_properties=update_single_cardinality_properties,
+            queue_request=queue_request,
+            dependencies=dependencies,
         )
     finally:
         if keep_files is False:
@@ -372,11 +393,16 @@ def bulk_load_from_files(
     client: NeptuneClient,
     path: str,
     iam_role: str,
+    format: Literal["csv", "opencypher", "ntriples", "nquads", "rdfxml", "turtle"] = "csv",
     neptune_load_wait_polling_delay: float = 0.25,
     load_parallelism: Literal["LOW", "MEDIUM", "HIGH", "OVERSUBSCRIBE"] = "HIGH",
+    parser_configuration: Optional[BulkLoadParserConfiguration] = None,
+    update_single_cardinality_properties: Literal["TRUE", "FALSE"] = "FALSE",
+    queue_request: Literal["TRUE", "FALSE"] = "FALSE",
+    dependencies: Optional[List[str]] = None,
 ) -> None:
     """
-    Load CSV files from S3 into Amazon Neptune using the Neptune Bulk Loader.
+    Load files from S3 into Amazon Neptune using the Neptune Bulk Loader.
 
     For more information about the Bulk Loader see
     `here <https://docs.aws.amazon.com/neptune/latest/userguide/bulk-load.html>`_.
@@ -391,10 +417,25 @@ def bulk_load_from_files(
         The Amazon Resource Name (ARN) for an IAM role to be assumed by the Neptune DB instance for access to the S3 bucket.
         For information about creating a role that has access to Amazon S3 and then associating it with a Neptune cluster,
         see `Prerequisites: IAM Role and Amazon S3 Access <https://docs.aws.amazon.com/neptune/latest/userguide/bulk-load-tutorial-IAM.html>`_.
+    format: str
+        The format of the data.
     neptune_load_wait_polling_delay: float
         Interval in seconds for how often the function will check if the Neptune bulk load has completed.
     load_parallelism: str
         Specifies the number of threads used by Neptune's bulk load process.
+    parser_configuration: dict[str, Any], optional
+        An optional object with additional parser configuration values.
+        Each of the child parameters is also optional: ``namedGraphUri``, ``baseUri`` and ``allowEmptyStrings``.
+    update_single_cardinality_properties: str
+        An optional parameter that controls how the bulk loader
+        treats a new value for single-cardinality vertex or edge properties.
+    queue_request: str
+        An optional flag parameter that indicates whether the load request can be queued up or not.
+
+        If omitted or set to ``"FALSE"``, the load request will fail if another load job is already running.
+    dependencies: list[str], optional
+        An optional parameter that can make a queued load request contingent on the successful completion of one or more previous jobs in the queue.
+
 
     Examples
     --------
@@ -403,15 +444,20 @@ def bulk_load_from_files(
     >>> wr.neptune.bulk_load_from_files(
     ...     client=client,
     ...     path="s3://my-bucket/stage-files/",
-    ...     iam_role="arn:aws:iam::XXX:role/XXX"
+    ...     iam_role="arn:aws:iam::XXX:role/XXX",
+    ...     format="csv",
     ... )
     """
     _logger.debug("Starting Neptune Bulk Load from %s", path)
     load_id = client.load(
         path,
         iam_role,
-        format="csv",
+        format=format,
         parallelism=load_parallelism,
+        parser_configuration=parser_configuration,
+        update_single_cardinality_properties=update_single_cardinality_properties,
+        queue_request=queue_request,
+        dependencies=dependencies,
     )
 
     while True:
@@ -426,7 +472,7 @@ def bulk_load_from_files(
 
         time.sleep(neptune_load_wait_polling_delay)
 
-    _logger.debug("Neptune load %s has succeeded in loading data from %s", load_id, path)
+    _logger.debug("Neptune load %s has succeeded in loading %s data from %s", load_id, format, path)
 
 
 def connect(host: str, port: int, iam_enabled: bool = False, **kwargs: Any) -> NeptuneClient:
diff --git a/awswrangler/neptune/_utils.py b/awswrangler/neptune/_utils.py
@@ -24,7 +24,7 @@ class WriteDFType(Enum):
     UPDATE = 3
 
 
-def write_gremlin_df(client: NeptuneClient, df: pd.DataFrame, mode: WriteDFType, batch_size: int) -> bool:
+def write_gremlin_df(client: "NeptuneClient", df: pd.DataFrame, mode: WriteDFType, batch_size: int) -> bool:
     """Write the provided DataFrame using Gremlin.
 
     Parameters
@@ -67,7 +67,7 @@ def write_gremlin_df(client: NeptuneClient, df: pd.DataFrame, mode: WriteDFType,
     return _run_gremlin_insert(client, g)
 
 
-def _run_gremlin_insert(client: NeptuneClient, g: GraphTraversalSource) -> bool:
+def _run_gremlin_insert(client: "NeptuneClient", g: GraphTraversalSource) -> bool:
     translator = Translator("g")
     s = translator.translate(g.bytecode)
     s = s.replace("Cardinality.", "")  # hack to fix parser error for set cardinality