Cleaned up updateItems function and changed write relation to always use defaultParallelism for numPartitions

jacobfi · jacobfi · commit 8a5e5c9b4a24 · 2019-02-05T13:07:01.000+01:00
diff --git a/README.md b/README.md
@@ -61,9 +61,9 @@ The following parameters can be set as options on the Spark reader object before
 
 The following parameters can be set as options on the Spark writer object before saving.
 
-- `writePartitions` number of partitions to split the given DataFrame into when writing to DynamoDB. Set to `skip` to avoid repartitioning the DataFrame before writing. Defaults to `sparkContext.defaultParallelism`
 - `writeBatchSize` number of items to send per call to DynamoDB BatchWriteItem. Default 25.
-- `update` if true writes will be using UpdateItem on keys rather than BatchWriteItem. Default false 
+- `targetCapacity` fraction of provisioned write capacity on the table to consume for writing or updating. Default 1 (i.e. 100% capacity).
+- `update` if true, items will be written using UpdateItem on keys rather than BatchWriteItem. Default false.
 
 ## Running Unit Tests
 The unit tests are dependent on the AWS DynamoDBLocal client, which in turn is dependent on [sqlite4java](https://bitbucket.org/almworks/sqlite4java/src/master/). I had some problems running this on OSX, so I had to put the library directly in the /lib folder, as graciously explained in [this Stack Overflow answer](https://stackoverflow.com/questions/34137043/amazon-dynamodb-local-unknown-error-exception-or-failure/35353377#35353377).
diff --git a/build.sbt b/build.sbt
@@ -2,7 +2,7 @@ organization := "com.audienceproject"
 
 name := "spark-dynamodb"
 
-version := "0.3.6"
+version := "0.4.0"
 
 description := "Plug-and-play implementation of an Apache Spark custom data source for AWS DynamoDB."
 
@@ -13,7 +13,7 @@ crossScalaVersions := Seq("2.11.12", "2.12.7")
 resolvers += "DynamoDBLocal" at "https://s3-us-west-2.amazonaws.com/dynamodb-local/release"
 
 libraryDependencies += "com.amazonaws" % "aws-java-sdk-dynamodb" % "1.11.466"
-libraryDependencies += "com.amazonaws" % "DynamoDBLocal" % "[1.11,2.0)" % "test"
+libraryDependencies += "com.amazonaws" % "DynamoDBLocal" % "[1.11,2.0)" % "test" exclude("com.google.guava", "guava")
 
 libraryDependencies += "org.apache.spark" %% "spark-sql" % "2.4.0" % "provided"
 libraryDependencies += "com.google.guava" % "guava" % "14.0.1" % "provided"
diff --git a/project/plugins.sbt b/project/plugins.sbt
@@ -2,3 +2,4 @@ logLevel := Level.Warn
 
 addSbtPlugin("com.jsuereth" % "sbt-pgp" % "1.1.1")
 addSbtPlugin("com.typesafe.sbteclipse" % "sbteclipse-plugin" % "5.2.4")
+addSbtPlugin("net.virtual-void" % "sbt-dependency-graph" % "0.9.2")
diff --git a/src/main/scala/com/audienceproject/spark/dynamodb/DefaultSource.scala b/src/main/scala/com/audienceproject/spark/dynamodb/DefaultSource.scala
@@ -31,9 +31,7 @@ import org.apache.spark.sql.sources._
 import org.apache.spark.sql.types.StructType
 import org.apache.spark.sql.{DataFrame, SQLContext, SaveMode}
 
-class DefaultSource extends RelationProvider
-    with SchemaRelationProvider
-    with CreatableRelationProvider {
+class DefaultSource extends RelationProvider with SchemaRelationProvider with CreatableRelationProvider {
 
     val logger: Logger = LoggerFactory.getLogger(this.getClass.getName)
 
@@ -50,19 +48,15 @@ class DefaultSource extends RelationProvider
     override def createRelation(sqlContext: SQLContext, mode: SaveMode, parameters: Map[String, String],
                                 data: DataFrame): BaseRelation = {
         logger.info(s"Using Guava version $getGuavaVersion")
-        val writeData =
-            if (parameters.get("writePartitions").contains("skip")) data
-            else data.repartition(parameters.get("writePartitions").map(_.toInt).getOrElse(sqlContext.sparkContext.defaultParallelism))
 
-        val writeRelation = new DynamoWriteRelation(writeData, parameters)(sqlContext)
-        if (parameters.getOrElse("update", "false").toBoolean) {
+        val writeRelation = new DynamoWriteRelation(data, parameters)(sqlContext)
+
+        if (parameters.getOrElse("update", "false").toBoolean)
             writeRelation.update()
-        } else {
+        else
             writeRelation.write()
 
-        }
         writeRelation
-
     }
 
     private def getGuavaVersion: String = try {
diff --git a/src/main/scala/com/audienceproject/spark/dynamodb/connector/DynamoConnector.scala b/src/main/scala/com/audienceproject/spark/dynamodb/connector/DynamoConnector.scala
@@ -23,18 +23,18 @@ package com.audienceproject.spark.dynamodb.connector
 import com.amazonaws.auth.DefaultAWSCredentialsProviderChain
 import com.amazonaws.auth.profile.ProfileCredentialsProvider
 import com.amazonaws.client.builder.AwsClientBuilder.EndpointConfiguration
-import com.amazonaws.services.dynamodbv2.{AmazonDynamoDB, AmazonDynamoDBClientBuilder}
 import com.amazonaws.services.dynamodbv2.document.{DynamoDB, ItemCollection, ScanOutcome}
+import com.amazonaws.services.dynamodbv2.{AmazonDynamoDB, AmazonDynamoDBClientBuilder}
 import org.apache.spark.sql.sources.Filter
 
 private[dynamodb] trait DynamoConnector {
 
-    def getDynamoDB(region:Option[String]=None): DynamoDB = {
+    def getDynamoDB(region: Option[String] = None): DynamoDB = {
         val client: AmazonDynamoDB = getDynamoDBClient(region)
         new DynamoDB(client)
     }
 
-    def getDynamoDBClient(region:Option[String]=None) = {
+    def getDynamoDBClient(region: Option[String] = None): AmazonDynamoDB = {
         val chosenRegion = region.getOrElse(sys.env.getOrElse("aws.dynamodb.region", "us-east-1"))
         Option(System.getProperty("aws.dynamodb.endpoint")).map(endpoint => {
             val credentials = Option(System.getProperty("aws.profile"))
diff --git a/src/main/scala/com/audienceproject/spark/dynamodb/connector/DynamoUpdatable.scala b/src/main/scala/com/audienceproject/spark/dynamodb/connector/DynamoUpdatable.scala
@@ -25,7 +25,6 @@ import org.apache.spark.sql.types.StructType
 
 trait DynamoUpdatable {
 
-
     def updateItems(schema: StructType)(items: Iterator[Row]): Unit
 
 }
diff --git a/src/main/scala/com/audienceproject/spark/dynamodb/connector/TableConnector.scala b/src/main/scala/com/audienceproject/spark/dynamodb/connector/TableConnector.scala
@@ -20,10 +20,13 @@
   */
 package com.audienceproject.spark.dynamodb.connector
 
+import java.util
+
 import com.amazonaws.services.dynamodbv2.document._
-import com.amazonaws.services.dynamodbv2.document.spec.{BatchWriteItemSpec, ScanSpec}
-import com.amazonaws.services.dynamodbv2.model.{AttributeValue, ReturnConsumedCapacity, UpdateItemRequest, UpdateItemResult}
+import com.amazonaws.services.dynamodbv2.document.spec.{BatchWriteItemSpec, ScanSpec, UpdateItemSpec}
+import com.amazonaws.services.dynamodbv2.model.ReturnConsumedCapacity
 import com.amazonaws.services.dynamodbv2.xspec.ExpressionSpecBuilder
+import com.amazonaws.services.dynamodbv2.xspec.ExpressionSpecBuilder.{BOOL => newBOOL, L => newL, M => newM, N => newN, S => newS}
 import com.google.common.util.concurrent.RateLimiter
 import org.apache.spark.sql.Row
 import org.apache.spark.sql.sources.Filter
@@ -94,45 +97,6 @@ private[dynamodb] class TableConnector(tableName: String, totalSegments: Int, pa
         getDynamoDB(region).getTable(tableName).scan(scanSpec)
     }
 
-    override def updateItems(schema: StructType)(items: Iterator[Row]): Unit = {
-        val columnNames = schema.map(_.name)
-        val hashKeyIndex = columnNames.indexOf(keySchema.hashKeyName)
-        val rangeKeyIndex = keySchema.rangeKeyName.map(columnNames.indexOf)
-        val columnIndices = columnNames.zipWithIndex.filterNot({
-            case (name, _) => keySchema match {
-                case KeySchema(hashKey, None) => name == hashKey
-                case KeySchema(hashKey, Some(rangeKey)) => name == hashKey || name == rangeKey
-            }
-        })
-
-        val rateLimiter = RateLimiter.create(writeLimit max 1)
-        val client = getDynamoDBClient(region)
-
-        // For each item.
-        items.foreach(row => {
-            val key: Map[String, AttributeValue] = keySchema match {
-                case KeySchema(hashKey, None) => Map(hashKey -> mapValueToAttributeValue(row(hashKeyIndex), schema(hashKey).dataType))
-                case KeySchema(hashKey, Some(rangeKey)) =>
-                    Map(hashKey -> mapValueToAttributeValue(row(hashKeyIndex), schema(hashKey).dataType),
-                        rangeKey -> mapValueToAttributeValue(row(rangeKeyIndex.get), schema(rangeKey).dataType))
-
-            }
-            val nonNullColumnIndices = columnIndices.filter(c => row(c._2) != null)
-            val updateExpression = s"SET ${nonNullColumnIndices.map(c => s"${c._1}=:${c._1}").mkString(", ")}"
-            val expressionAttributeValues = nonNullColumnIndices.map(c => s":${c._1}" -> mapValueToAttributeValue(row(c._2), schema(c._1).dataType)).toMap.asJava
-            val updateItemReq = new UpdateItemRequest()
-                .withReturnConsumedCapacity(ReturnConsumedCapacity.TOTAL)
-                .withTableName(tableName)
-                .withKey(key.asJava)
-                .withUpdateExpression(updateExpression)
-                .withExpressionAttributeValues(expressionAttributeValues)
-
-            val updateItemResult = client.updateItem(updateItemReq)
-
-            handleUpdateItemResult(rateLimiter)(updateItemResult)
-        })
-    }
-
     override def putItems(schema: StructType, batchSize: Int)(items: Iterator[Row]): Unit = {
         val columnNames = schema.map(_.name)
         val hashKeyIndex = columnNames.indexOf(keySchema.hashKeyName)
@@ -174,46 +138,85 @@ private[dynamodb] class TableConnector(tableName: String, totalSegments: Int, pa
             ))
 
             val response = client.batchWriteItem(batchWriteItemSpec)
-
             handleBatchWriteResponse(client, rateLimiter)(response)
         })
     }
 
+    override def updateItems(schema: StructType)(items: Iterator[Row]): Unit = {
+        val columnNames = schema.map(_.name)
+        val hashKeyIndex = columnNames.indexOf(keySchema.hashKeyName)
+        val rangeKeyIndex = keySchema.rangeKeyName.map(columnNames.indexOf)
+        val columnIndices = columnNames.zipWithIndex.filterNot({
+            case (name, _) => keySchema match {
+                case KeySchema(hashKey, None) => name == hashKey
+                case KeySchema(hashKey, Some(rangeKey)) => name == hashKey || name == rangeKey
+            }
+        })
+
+        val rateLimiter = RateLimiter.create(writeLimit max 1)
+        val client = getDynamoDB(region)
+
+        // For each item.
+        items.foreach(row => {
+            // Build update expression.
+            val xspec = new ExpressionSpecBuilder()
+            columnIndices.foreach({
+                case (name, index) if !row.isNullAt(index) =>
+                    val updateAction = schema(name).dataType match {
+                        case StringType => newS(name).set(row.getString(index))
+                        case BooleanType => newBOOL(name).set(row.getBoolean(index))
+                        case IntegerType => newN(name).set(row.getInt(index))
+                        case LongType => newN(name).set(row.getLong(index))
+                        case ShortType => newN(name).set(row.getShort(index))
+                        case FloatType => newN(name).set(row.getFloat(index))
+                        case DoubleType => newN(name).set(row.getDouble(index))
+                        case ArrayType(innerType, _) => newL(name).set(row.getSeq[Any](index).map(e => mapValue(e, innerType)).asJava)
+                        case MapType(keyType, valueType, _) =>
+                            if (keyType != StringType) throw new IllegalArgumentException(
+                                s"Invalid Map key type '${keyType.typeName}'. DynamoDB only supports String as Map key type.")
+                            newM(name).set(row.getMap[String, Any](index).mapValues(e => mapValue(e, valueType)).asJava)
+                        case StructType(fields) => newM(name).set(mapStruct(row.getStruct(index), fields))
+                    }
+                    xspec.addUpdate(updateAction)
+                case _ =>
+            })
+
+            val updateItemSpec = new UpdateItemSpec()
+                .withExpressionSpec(xspec.buildForUpdate())
+                .withReturnConsumedCapacity(ReturnConsumedCapacity.TOTAL)
+
+            // Map primary key.
+            keySchema match {
+                case KeySchema(hashKey, None) => updateItemSpec.withPrimaryKey(hashKey, row(hashKeyIndex))
+                case KeySchema(hashKey, Some(rangeKey)) =>
+                    updateItemSpec.withPrimaryKey(hashKey, row(hashKeyIndex), rangeKey, row(rangeKeyIndex.get))
+            }
+
+            if (updateItemSpec.getUpdateExpression.nonEmpty) {
+                val response = client.getTable(tableName).updateItem(updateItemSpec)
+                handleUpdateResponse(rateLimiter)(response)
+            }
+        })
+    }
+
     private def mapValue(element: Any, elementType: DataType): Any = {
         elementType match {
             case ArrayType(innerType, _) => element.asInstanceOf[Seq[_]].map(e => mapValue(e, innerType)).asJava
             case MapType(keyType, valueType, _) =>
                 if (keyType != StringType) throw new IllegalArgumentException(
                     s"Invalid Map key type '${keyType.typeName}'. DynamoDB only supports String as Map key type.")
-                element.asInstanceOf[Map[_, _]].mapValues(e => mapValue(e, valueType)).asJava
+                element.asInstanceOf[Map[String, _]].mapValues(e => mapValue(e, valueType)).asJava
             case StructType(fields) =>
                 val row = element.asInstanceOf[Row]
-                (fields.indices map { i =>
-                    fields(i).name -> mapValue(row(i), fields(i).dataType)
-                }).toMap.asJava
+                mapStruct(row, fields)
             case _ => element
         }
     }
 
-    private def mapValueToAttributeValue(element: Any, elementType: DataType): AttributeValue = {
-        elementType match {
-            case ArrayType(innerType, _) => new AttributeValue().withL(element.asInstanceOf[Seq[_]].map(e => mapValueToAttributeValue(e, innerType)): _*)
-            case MapType(keyType, valueType, _) =>
-                if (keyType != StringType) throw new IllegalArgumentException(
-                    s"Invalid Map key type '${keyType.typeName}'. DynamoDB only supports String as Map key type.")
-
-                new AttributeValue().withM(element.asInstanceOf[Map[String, _]].mapValues(e => mapValueToAttributeValue(e, valueType)).asJava)
-
-            case StructType(fields) =>
-                val row = element.asInstanceOf[Row]
-                new AttributeValue().withM((fields.indices map { i =>
-                    fields(i).name -> mapValueToAttributeValue(row(i), fields(i).dataType)
-                }).toMap.asJava)
-            case StringType => new AttributeValue().withS(element.asInstanceOf[String])
-            case LongType | IntegerType | DoubleType | FloatType => new AttributeValue().withN(element.toString)
-            case BooleanType => new AttributeValue().withBOOL(element.asInstanceOf[Boolean])
-        }
-    }
+    private def mapStruct(row: Row, fields: Seq[StructField]): util.Map[String, Any] =
+        (fields.indices map { i =>
+            fields(i).name -> mapValue(row(i), fields(i).dataType)
+        }).toMap.asJava
 
     @tailrec
     private def handleBatchWriteResponse(client: DynamoDB, rateLimiter: RateLimiter)
@@ -231,12 +234,12 @@ private[dynamodb] class TableConnector(tableName: String, totalSegments: Int, pa
         }
     }
 
-    private def handleUpdateItemResult(rateLimiter: RateLimiter)
-                                      (result: UpdateItemResult): Unit = {
+    private def handleUpdateResponse(rateLimiter: RateLimiter)
+                                    (response: UpdateItemOutcome): Unit = {
         // Rate limit on write capacity.
-        if (result.getConsumedCapacity != null) {
-            rateLimiter.acquire(result.getConsumedCapacity.getCapacityUnits.toInt)
-        }
+        Option(response.getUpdateItemResult.getConsumedCapacity)
+            .map(_.getCapacityUnits.toInt)
+            .foreach(rateLimiter.acquire)
     }
 
 }
diff --git a/src/main/scala/com/audienceproject/spark/dynamodb/rdd/DynamoWriteRelation.scala b/src/main/scala/com/audienceproject/spark/dynamodb/rdd/DynamoWriteRelation.scala
@@ -33,8 +33,7 @@ private[dynamodb] class DynamoWriteRelation(data: DataFrame, parameters: Map[Str
 
     private val tableName = parameters("tableName")
     private val batchSize = parameters.getOrElse("writeBatchSize", "25").toInt
-    private val numPartitions = data.rdd.getNumPartitions
-    private val connector = new TableConnector(tableName, numPartitions, parameters)
+    private val connector = new TableConnector(tableName, sqlContext.sparkContext.defaultParallelism, parameters)
 
     override val schema: StructType = data.schema
 
@@ -48,5 +47,4 @@ private[dynamodb] class DynamoWriteRelation(data: DataFrame, parameters: Map[Str
         data.foreachPartition(connector.updateItems(schema) _)
     }
 
-
 }
diff --git a/src/test/scala/com/audienceproject/spark/dynamodb/WriteRelationTest.scala b/src/test/scala/com/audienceproject/spark/dynamodb/WriteRelationTest.scala
@@ -51,6 +51,7 @@ class WriteRelationTest extends AbstractInMemoryTest {
         assert(validationDs.select("color").as[String].collect().forall(Seq("yellow", "orange", "red") contains _))
         assert(validationDs.select("weight").as[Double].collect().forall(Seq(0.1, 0.2, 0.2) contains _))
     }
+
     test("Updating from a local Dataset with new and only some previous columns") {
         val tablename = "UpdateTest1"
         dynamoDB.createTable(new CreateTableRequest()
@@ -65,23 +66,22 @@ class WriteRelationTest extends AbstractInMemoryTest {
             ("lemon", "yellow", 0.1),
             ("orange", "orange", 0.2),
             ("pomegranate", "red", 0.2)
-        ).toDF("name","color","weight")
+        ).toDF("name", "color", "weight")
         newItemsDs.write.dynamodb(tablename)
 
         newItemsDs
-            .withColumn("size",length($"color"))
+            .withColumn("size", length($"color"))
             .drop("color")
-            .withColumn("weight",$"weight"*2)
-            .write.option("update","true").dynamodb(tablename)
+            .withColumn("weight", $"weight" * 2)
+            .write.option("update", "true").dynamodb(tablename)
 
         val validationDs = spark.read.dynamodb(tablename)
         validationDs.show
         assert(validationDs.count() === 3)
         assert(validationDs.select("name").as[String].collect().forall(Seq("lemon", "orange", "pomegranate") contains _))
         assert(validationDs.select("color").as[String].collect().forall(Seq("yellow", "orange", "red") contains _))
         assert(validationDs.select("weight").as[Double].collect().forall(Seq(0.2, 0.4, 0.4) contains _))
-        assert(validationDs.select("size").as[Long].collect().forall(Seq(6,3) contains _))
-
+        assert(validationDs.select("size").as[Long].collect().forall(Seq(6, 3) contains _))
     }
 
     test("Updating from a local Dataset with null values") {
@@ -98,20 +98,20 @@ class WriteRelationTest extends AbstractInMemoryTest {
             ("lemon", "yellow", 0.1),
             ("orange", "orange", 0.2),
             ("pomegranate", "red", 0.2)
-        ).toDF("name","color","weight")
+        ).toDF("name", "color", "weight")
         newItemsDs.write.dynamodb(tablename)
 
         val alteredDs = newItemsDs
-            .withColumn("weight",when($"weight" < 0.2,$"weight").otherwise(lit(null)))
+            .withColumn("weight", when($"weight" < 0.2, $"weight").otherwise(lit(null)))
         alteredDs.show
-        alteredDs.write.option("update","true").dynamodb(tablename)
+        alteredDs.write.option("update", "true").dynamodb(tablename)
 
         val validationDs = spark.read.dynamodb(tablename)
         validationDs.show
         assert(validationDs.count() === 3)
         assert(validationDs.select("name").as[String].collect().forall(Seq("lemon", "orange", "pomegranate") contains _))
         assert(validationDs.select("color").as[String].collect().forall(Seq("yellow", "orange", "red") contains _))
         assert(validationDs.select("weight").as[Double].collect().forall(Seq(0.2, 0.1) contains _))
-
     }
+
 }

Original file line number	Diff line number	Diff line change
`@@ -2,3 +2,4 @@ logLevel := Level.Warn`
`2`	`2`
`3`	`3`	`addSbtPlugin("com.jsuereth" % "sbt-pgp" % "1.1.1")`
`4`	`4`	`addSbtPlugin("com.typesafe.sbteclipse" % "sbteclipse-plugin" % "5.2.4")`
	`5`	`+addSbtPlugin("net.virtual-void" % "sbt-dependency-graph" % "0.9.2")`
Original file line number	Diff line number	Diff line change
`@@ -25,7 +25,6 @@ import org.apache.spark.sql.types.StructType`
`25`	`25`
`26`	`26`	`trait DynamoUpdatable {`
`27`	`27`
`28`		`-`
`29`	`28`	`def updateItems(schema: StructType)(items: Iterator[Row]): Unit`
`30`	`29`
`31`	`30`	`}`
Original file line number	Diff line number	Diff line change
`@@ -33,8 +33,7 @@ private[dynamodb] class DynamoWriteRelation(data: DataFrame, parameters: Map[Str`
`33`	`33`
`34`	`34`	`private val tableName = parameters("tableName")`
`35`	`35`	`private val batchSize = parameters.getOrElse("writeBatchSize", "25").toInt`
`36`		`- private val numPartitions = data.rdd.getNumPartitions`
`37`		`- private val connector = new TableConnector(tableName, numPartitions, parameters)`
	`36`	`+ private val connector = new TableConnector(tableName, sqlContext.sparkContext.defaultParallelism, parameters)`
`38`	`37`
`39`	`38`	`override val schema: StructType = data.schema`
`40`	`39`
`@@ -48,5 +47,4 @@ private[dynamodb] class DynamoWriteRelation(data: DataFrame, parameters: Map[Str`
`48`	`47`	`data.foreachPartition(connector.updateItems(schema) _)`
`49`	`48`	`}`
`50`	`49`
`51`		`-`
`52`	`50`	`}`