This repository was archived by the owner on Aug 31, 2021. It is now read-only.

Commit 14fb596

Add support for DAX if new "daxEndpoint" config is provided
- Update to latest AWS SDK
- Make SBT ask for Java 1.8 because that's what Spark uses
1 parent 816c6e6 commit 14fb596

7 files changed: +74 −27 lines changed

README.md

Lines changed: 12 additions & 4 deletions
@@ -62,6 +62,7 @@ val avgWeightByColor = vegetableDs.agg($"color", avg($"weightKg")) // The column
 ```python
 # Load a DataFrame from a Dynamo table. Only incurs the cost of a single scan for schema inference.
 dynamoDf = spark.read.option("tableName", "SomeTableName") \
+                     .mode(SaveMode.Append) \
                      .format("dynamodb") \
                      .load() # <-- DataFrame of Row objects with inferred schema.

@@ -70,6 +71,7 @@ dynamoDf.show(100)

 # write to some other table overwriting existing item with same keys
 dynamoDf.write.option("tableName", "SomeOtherTable") \
+              .mode(SaveMode.Append) \
               .format("dynamodb") \
               .save()
 ```
@@ -83,23 +85,29 @@ pyspark --packages com.audienceproject:spark-dynamodb_<spark-scala-version>:<ver
 The following parameters can be set as options on the Spark reader and writer object before loading/saving.
 - `region` sets the region where the dynamodb table. Default is environment specific.
 - `roleArn` sets an IAM role to assume. This allows for access to a DynamoDB in a different account than the Spark cluster. Defaults to the standard role configuration.
+- `daxEndpoint` if not blank, reads and writes will interact with the provided DAX cluster endpoint instead of DynamoDB
+directly. Default is empty string.

 The following parameters can be set as options on the Spark reader object before loading.

 - `readPartitions` number of partitions to split the initial RDD when loading the data into Spark. Defaults to the size of the DynamoDB table divided into chunks of `maxPartitionBytes`
 - `maxPartitionBytes` the maximum size of a single input partition. Default 128 MB
 - `defaultParallelism` the number of input partitions that can be read from DynamoDB simultaneously. Defaults to `sparkContext.defaultParallelism`
-- `targetCapacity` fraction of provisioned read capacity on the table (or index) to consume for reading. Default 1 (i.e. 100% capacity).
+- `targetCapacity` fraction of provisioned read capacity on the table (or index) to consume for reading, enforced by
+a rate limiter. Default 1 (i.e. 100% capacity).
 - `stronglyConsistentReads` whether or not to use strongly consistent reads. Default false.
 - `bytesPerRCU` number of bytes that can be read per second with a single Read Capacity Unit. Default 4000 (4 KB). This value is multiplied by two when `stronglyConsistentReads=false`
 - `filterPushdown` whether or not to use filter pushdown to DynamoDB on scan requests. Default true.
 - `throughput` the desired read throughput to use. It overwrites any calculation used by the package. It is intended to be used with tables that are on-demand. Defaults to 100 for on-demand.

-The following parameters can be set as options on the Spark writer object before saving.
+The following parameters can be set as options on the Spark writer object before saving. By default, items will be
+written using PutItem grouped into BatchWriteItem operations.

 - `writeBatchSize` number of items to send per call to DynamoDB BatchWriteItem. Default 25.
-- `targetCapacity` fraction of provisioned write capacity on the table to consume for writing or updating. Default 1 (i.e. 100% capacity).
-- `update` if true items will be written using UpdateItem on keys rather than BatchWriteItem. Default false.
+- `targetCapacity` fraction of provisioned write capacity on the table to consume for writing or updating, enforced
+by a rate limiter. Default 1 (i.e. 100% capacity).
+- `update` if true, items will be written using UpdateItem on keys rather than BatchWriteItem. Default false.
+- `delete` if true, items will be deleted using BatchWriteItem. Default false.
 - `throughput` the desired write throughput to use. It overwrites any calculation used by the package. It is intended to be used with tables that are on-demand. Defaults to 100 for on-demand.
 - `inferSchema` if false will not automatically infer schema - this is useful when writing to a table with many columns

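For context, a minimal Scala sketch of how the new `daxEndpoint` option would be set on the reader and writer described above; the cluster endpoint and table names are placeholders, not values taken from this commit:

```scala
import org.apache.spark.sql.{SaveMode, SparkSession}

val spark = SparkSession.builder().appName("dax-example").getOrCreate()

// Hypothetical DAX cluster endpoint; any non-blank value routes reads and writes
// through DAX instead of talking to DynamoDB directly.
val daxEndpoint = "my-cluster.abc123.dax-clusters.us-east-1.amazonaws.com:8111"

val dynamoDf = spark.read
    .option("tableName", "SomeTableName")
    .option("daxEndpoint", daxEndpoint)
    .format("dynamodb")
    .load()

dynamoDf.write
    .option("tableName", "SomeOtherTable")
    .option("daxEndpoint", daxEndpoint)
    .mode(SaveMode.Append)
    .format("dynamodb")
    .save()
```

Leaving `daxEndpoint` unset (or blank) keeps the existing behaviour of talking to DynamoDB directly.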
build.sbt

Lines changed: 15 additions & 3 deletions
@@ -2,18 +2,30 @@ organization := "com.audienceproject"

 name := "spark-dynamodb"

-version := "1.1.3"
+version := "1.1.3-SNAPSHOT"

 description := "Plug-and-play implementation of an Apache Spark custom data source for AWS DynamoDB."

 scalaVersion := "2.12.12"

+javacOptions ++= Seq("-source", "1.8", "-target", "1.8")
+scalacOptions += "-target:jvm-1.8"
+
+initialize := {
+    val _ = initialize.value
+    val required = "1.8"
+    val current = sys.props("java.specification.version")
+    assert(current == required, s"Unsupported JDK: java.specification.version $current != $required")
+}
+
 compileOrder := CompileOrder.JavaThenScala

 resolvers += "DynamoDBLocal" at "https://s3-us-west-2.amazonaws.com/dynamodb-local/release"

-libraryDependencies += "com.amazonaws" % "aws-java-sdk-sts" % "1.11.678"
-libraryDependencies += "com.amazonaws" % "aws-java-sdk-dynamodb" % "1.11.678"
+libraryDependencies += "com.amazonaws" % "aws-java-sdk-sts" % "1.11.955"
+libraryDependencies += "com.amazonaws" % "aws-java-sdk-dynamodb" % "1.11.955"
+// This should be "optional" scope after updating SBT to a version that supports it
+libraryDependencies += "com.amazonaws" % "amazon-dax-client" % "1.0.208233.0"
 libraryDependencies += "com.amazonaws" % "DynamoDBLocal" % "[1.11,2.0)" % "test" exclude("com.google.guava", "guava")

 libraryDependencies += "org.apache.spark" %% "spark-sql" % "3.0.0" % "provided"

project/build.properties

Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-sbt.version = 1.2.6
+sbt.version = 1.2.8

src/main/scala/com/audienceproject/spark/dynamodb/connector/DynamoConnector.scala

Lines changed: 38 additions & 18 deletions
@@ -20,6 +20,9 @@
  */
 package com.audienceproject.spark.dynamodb.connector

+import java.net.URI
+
+import com.amazon.dax.client.dynamodbv2.{AmazonDaxAsyncClientBuilder, AmazonDaxClientBuilder}
 import com.amazonaws.auth.profile.ProfileCredentialsProvider
 import com.amazonaws.auth.{AWSCredentialsProvider, AWSStaticCredentialsProvider, BasicSessionCredentials, DefaultAWSCredentialsProviderChain}
 import com.amazonaws.client.builder.AwsClientBuilder.EndpointConfiguration
@@ -44,17 +47,25 @@ private[dynamodb] trait DynamoConnector {
         val chosenRegion = region.getOrElse(properties.getOrElse("aws.dynamodb.region", "us-east-1"))
         val credentials = getCredentials(chosenRegion, roleArn, providerClassName)

-        properties.get("aws.dynamodb.endpoint").map(endpoint => {
-            AmazonDynamoDBClientBuilder.standard()
-                .withCredentials(credentials)
-                .withEndpointConfiguration(new EndpointConfiguration(endpoint, chosenRegion))
-                .build()
-        }).getOrElse(
-            AmazonDynamoDBClientBuilder.standard()
+        if (daxEndpoint.isEmpty) {
+            properties.get("aws.dynamodb.endpoint").map(endpoint => {
+                AmazonDynamoDBClientBuilder.standard()
+                    .withCredentials(credentials)
+                    .withEndpointConfiguration(new EndpointConfiguration(endpoint, chosenRegion))
+                    .build()
+            }).getOrElse(
+                AmazonDynamoDBClientBuilder.standard()
+                    .withCredentials(credentials)
+                    .withRegion(chosenRegion)
+                    .build()
+            )
+        } else {
+            AmazonDaxClientBuilder.standard()
+                .withEndpointConfiguration(daxEndpoint)
                 .withCredentials(credentials)
-                .withRegion(chosenRegion)
                 .build()
-        )
+        }
+
     }

     def getDynamoDBAsyncClient(region: Option[String] = None,
@@ -63,17 +74,24 @@ private[dynamodb] trait DynamoConnector {
         val chosenRegion = region.getOrElse(properties.getOrElse("aws.dynamodb.region", "us-east-1"))
         val credentials = getCredentials(chosenRegion, roleArn, providerClassName)

-        properties.get("aws.dynamodb.endpoint").map(endpoint => {
-            AmazonDynamoDBAsyncClientBuilder.standard()
-                .withCredentials(credentials)
-                .withEndpointConfiguration(new EndpointConfiguration(endpoint, chosenRegion))
-                .build()
-        }).getOrElse(
-            AmazonDynamoDBAsyncClientBuilder.standard()
+        if (daxEndpoint.isEmpty) {
+            properties.get("aws.dynamodb.endpoint").map(endpoint => {
+                AmazonDynamoDBAsyncClientBuilder.standard()
+                    .withCredentials(credentials)
+                    .withEndpointConfiguration(new EndpointConfiguration(endpoint, chosenRegion))
+                    .build()
+            }).getOrElse(
+                AmazonDynamoDBAsyncClientBuilder.standard()
+                    .withCredentials(credentials)
+                    .withRegion(chosenRegion)
+                    .build()
+            )
+        } else {
+            AmazonDaxAsyncClientBuilder.standard()
+                .withEndpointConfiguration(daxEndpoint)
                 .withCredentials(credentials)
-                .withRegion(chosenRegion)
                 .build()
-        )
+        }
     }

     /**
@@ -126,6 +144,8 @@

     val filterPushdownEnabled: Boolean

+    val daxEndpoint: String
+
     def scan(segmentNum: Int, columns: Seq[String], filters: Seq[Filter]): ItemCollection[ScanOutcome]

     def isEmpty: Boolean = itemLimit == 0

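Read as a whole, the change above boils down to one decision per client: an empty `daxEndpoint` keeps the existing DynamoDB builder paths (including the `aws.dynamodb.endpoint` override), while a non-blank value builds a DAX client against the given cluster endpoint. The sketch below is a simplified, self-contained illustration of that pattern; the helper name `buildClient` is not part of the commit, and the local-endpoint override is omitted for brevity.

```scala
import com.amazon.dax.client.dynamodbv2.AmazonDaxClientBuilder
import com.amazonaws.auth.AWSCredentialsProvider
import com.amazonaws.services.dynamodbv2.{AmazonDynamoDB, AmazonDynamoDBClientBuilder}

// Illustrative helper: the DAX client implements the AmazonDynamoDB interface,
// so callers do not need to know which branch produced the client.
def buildClient(daxEndpoint: String,
                region: String,
                credentials: AWSCredentialsProvider): AmazonDynamoDB =
    if (daxEndpoint.isEmpty)
        AmazonDynamoDBClientBuilder.standard()
            .withCredentials(credentials)
            .withRegion(region)
            .build()
    else
        AmazonDaxClientBuilder.standard()
            .withEndpointConfiguration(daxEndpoint)
            .withCredentials(credentials)
            .build()
```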
src/main/scala/com/audienceproject/spark/dynamodb/connector/TableConnector.scala

Lines changed: 2 additions & 0 deletions
@@ -43,6 +43,8 @@ private[dynamodb] class TableConnector(tableName: String, parallelism: Int, para

     override val filterPushdownEnabled: Boolean = filterPushdown

+    override val daxEndpoint: String = parameters.getOrElse("daxEndpoint", "").trim
+
     override val (keySchema, readLimit, writeLimit, itemLimit, totalSegments) = {
         val table = getDynamoDB(region, roleArn, providerClassName).getTable(tableName)
         val desc = table.describe()

src/main/scala/com/audienceproject/spark/dynamodb/connector/TableIndexConnector.scala

Lines changed: 2 additions & 0 deletions
@@ -39,6 +39,8 @@ private[dynamodb] class TableIndexConnector(tableName: String, indexName: String

     override val filterPushdownEnabled: Boolean = filterPushdown

+    override val daxEndpoint: String = parameters.getOrElse("daxEndpoint", "").trim
+
     override val (keySchema, readLimit, itemLimit, totalSegments) = {
         val table = getDynamoDB(region, roleArn, providerClassName).getTable(tableName)
         val indexDesc = table.describe().getGlobalSecondaryIndexes.asScala.find(_.getIndexName == indexName).get

src/main/scala/com/audienceproject/spark/dynamodb/datasource/DynamoDataWriter.scala

Lines changed: 4 additions & 1 deletion
@@ -51,7 +51,10 @@ class DynamoDataWriter(batchSize: Int,

     override def abort(): Unit = {}

-    override def close(): Unit = client.shutdown()
+    override def close(): Unit = {
+        buffer.clear()
+        client.shutdown()
+    }

     protected def flush(): Unit = {
         if (buffer.nonEmpty) {
