This repository was archived by the owner on Aug 31, 2021. It is now read-only.

Commit c706ed7

Merge branch 'master' into tmwong2003/fix-readme-call-to-read
2 parents f813833 + ceb9cb1 commit c706ed7

13 files changed: 294 additions, 34 deletions

README.md

Lines changed: 12 additions & 1 deletion
@@ -23,15 +23,21 @@ val dynamoDf = spark.read.dynamodb("SomeTableName") // <-- DataFrame of Row obje
 // Scan the table for the first 100 items (the order is arbitrary) and print them.
 dynamoDf.show(100)
 
+// write to some other table overwriting existing item with same keys
+dynamoDf.write.dynamodb("SomeOtherTable")
+
 // Case class representing the items in our table.
 import com.audienceproject.spark.dynamodb.attribute
 case class Vegetable (name: String, color: String, @attribute("weight_kg") weightKg: Double)
 
 // Load a Dataset[Vegetable]. Notice the @attribute annotation on the case class - we imagine the weight attribute is named with an underscore in DynamoDB.
 import org.apache.spark.sql.functions._
 import spark.implicits._
-val vegetableDs = spark.dynamodbAs[Vegetable]("VegeTable")
+val vegetableDs = spark.read.dynamodbAs[Vegetable]("VegeTable")
 val avgWeightByColor = vegetableDs.agg($"color", avg($"weightKg")) // The column is called 'weightKg' in the Dataset.
+
+
+
 ```
 
 ## Getting The Dependency
@@ -41,6 +47,10 @@ The library is available from Maven Central. Add the dependency in SBT as ```"co
 Spark is used in the library as a "provided" dependency, which means Spark has to be installed separately on the container where the application is running, such as is the case on AWS EMR.
 
 ## Parameters
+The following parameters can be set as options on the Spark reader and writer objects before loading/saving.
+- `region` sets the region in which the DynamoDB table resides. The default is environment specific.
+
+
 The following parameters can be set as options on the Spark reader object before loading.
 
 - `readPartitions` number of partitions to split the initial RDD when loading the data into Spark. Corresponds 1-to-1 with total number of segments in the DynamoDB parallel scan used to load the data. Defaults to `sparkContext.defaultParallelism`
@@ -53,6 +63,7 @@ The following parameters can be set as options on the Spark writer object before
 
 - `writePartitions` number of partitions to split the given DataFrame into when writing to DynamoDB. Set to `skip` to avoid repartitioning the DataFrame before writing. Defaults to `sparkContext.defaultParallelism`
 - `writeBatchSize` number of items to send per call to DynamoDB BatchWriteItem. Default 25.
+- `update` if true, writes use UpdateItem on keys rather than BatchWriteItem. Default false.
 
 ## Running Unit Tests
 The unit tests are dependent on the AWS DynamoDBLocal client, which in turn is dependent on [sqlite4java](https://bitbucket.org/almworks/sqlite4java/src/master/). I had some problems running this on OSX, so I had to put the library directly in the /lib folder, as graciously explained in [this Stack Overflow answer](https://stackoverflow.com/questions/34137043/amazon-dynamodb-local-unknown-error-exception-or-failure/35353377#35353377).
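The two options added in this README change plug into the normal Spark option mechanism. A minimal usage sketch, assuming the library's implicits import and placeholder table/region names that are not part of this diff:

```scala
import com.audienceproject.spark.dynamodb.implicits._

// Read from a table in an explicitly chosen region (placeholder values).
val someDf = spark.read
    .option("region", "eu-west-1")
    .dynamodb("SomeTableName")

// Write back using UpdateItem semantics instead of BatchWriteItem.
someDf.write
    .option("region", "eu-west-1")
    .option("update", "true")
    .dynamodb("SomeOtherTable")
```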

build.sbt

Lines changed: 8 additions & 1 deletion
@@ -2,7 +2,7 @@ organization := "com.audienceproject"
 
 name := "spark-dynamodb"
 
-version := "0.3.1"
+version := "0.3.2"
 
 description := "Plug-and-play implementation of an Apache Spark custom data source for AWS DynamoDB."
 
@@ -63,4 +63,11 @@ pomExtra := <url>https://github.com/audienceproject/spark-dynamodb</url>
 <organization>AudienceProject</organization>
 <organizationUrl>https://www.audienceproject.com</organizationUrl>
 </developer>
+<developer>
+<id>johsbk</id>
+<name>Johs Kristoffersen</name>
+<email>johs.kristoffersen@audienceproject.com</email>
+<organization>AudienceProject</organization>
+<organizationUrl>https://www.audienceproject.com</organizationUrl>
+</developer>
 </developers>
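Given the organization, module name, and bumped version in this build.sbt, a consuming project would pull the library in roughly as sketched below; the Spark artifact and version are illustrative placeholders, kept `provided` as the README describes:

```scala
// Hypothetical consumer build.sbt snippet; the Spark version is a placeholder.
libraryDependencies ++= Seq(
    "com.audienceproject" %% "spark-dynamodb" % "0.3.2",
    "org.apache.spark"    %% "spark-sql"      % "2.3.1" % "provided"
)
```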

src/main/scala/com/audienceproject/spark/dynamodb/DefaultSource.scala

Lines changed: 12 additions & 10 deletions
@@ -54,23 +54,25 @@ class DefaultSource extends RelationProvider
         if (parameters.get("writePartitions").contains("skip")) data
         else data.repartition(parameters.get("writePartitions").map(_.toInt).getOrElse(sqlContext.sparkContext.defaultParallelism))
 
-        val writeRelation = new DynamoWriteRelation(writeData, parameters)(sqlContext)
+        val writeRelation= new DynamoWriteRelation(writeData, parameters)(sqlContext)
+        if (parameters.getOrElse("update","false").toBoolean) {
+            writeRelation.update()
+        } else {
+            writeRelation.write()
 
-        writeRelation.write()
+        }
         writeRelation
+
     }
 
     private def getGuavaVersion: String = try {
         val file = new File(classOf[Charsets].getProtectionDomain.getCodeSource.getLocation.toURI)
-        try {
-            val jar = new JarFile(file)
-            try
-                jar.getManifest.getMainAttributes.getValue("Bundle-Version")
-            finally if (jar != null) jar.close()
-        }
+        val jar = new JarFile(file)
+        try
+            jar.getManifest.getMainAttributes.getValue("Bundle-Version")
+        finally if (jar != null) jar.close()
     } catch {
-        case ex: Exception =>
-            throw new RuntimeException("Unable to get the version of Guava", ex)
+        case ex: Exception => throw new RuntimeException("Unable to get the version of Guava", ex)
     }
 
 }

src/main/scala/com/audienceproject/spark/dynamodb/connector/DynamoConnector.scala

Lines changed: 12 additions & 8 deletions
@@ -23,29 +23,33 @@ package com.audienceproject.spark.dynamodb.connector
 import com.amazonaws.auth.DefaultAWSCredentialsProviderChain
 import com.amazonaws.auth.profile.ProfileCredentialsProvider
 import com.amazonaws.client.builder.AwsClientBuilder.EndpointConfiguration
-import com.amazonaws.services.dynamodbv2.AmazonDynamoDBClientBuilder
+import com.amazonaws.services.dynamodbv2.{AmazonDynamoDB, AmazonDynamoDBClientBuilder}
 import com.amazonaws.services.dynamodbv2.document.{DynamoDB, ItemCollection, ScanOutcome}
 import org.apache.spark.sql.sources.Filter
 
 private[dynamodb] trait DynamoConnector {
 
-    def getClient: DynamoDB = {
-        val client = Option(System.getProperty("aws.dynamodb.endpoint")).map(endpoint => {
-            val region = sys.env.getOrElse("aws.dynamodb.region", "us-east-1")
+    def getDynamoDB(region: Option[String] = None): DynamoDB = {
+        val client: AmazonDynamoDB = getDynamoDBClient(region)
+        new DynamoDB(client)
+    }
+
+    def getDynamoDBClient(region: Option[String] = None) = {
+        val chosenRegion = region.getOrElse(sys.env.getOrElse("aws.dynamodb.region", "us-east-1"))
+        Option(System.getProperty("aws.dynamodb.endpoint")).map(endpoint => {
             val credentials = Option(System.getProperty("aws.profile"))
                 .map(new ProfileCredentialsProvider(_))
                 .getOrElse(new DefaultAWSCredentialsProviderChain)
             AmazonDynamoDBClientBuilder.standard()
                 .withCredentials(credentials)
-                .withEndpointConfiguration(new EndpointConfiguration(endpoint, region))
+                .withEndpointConfiguration(new EndpointConfiguration(endpoint, chosenRegion))
                 .build()
-        }).getOrElse(AmazonDynamoDBClientBuilder.defaultClient())
-        new DynamoDB(client)
+        }).getOrElse(AmazonDynamoDBClientBuilder.standard().withRegion(chosenRegion).build())
     }
 
     val keySchema: KeySchema
 
-    val readLimit: Int
+    val readLimit: Double
 
     val itemLimit: Int
 
src/main/scala/com/audienceproject/spark/dynamodb/connector/DynamoUpdatable.scala

Lines changed: 31 additions & 0 deletions
@@ -0,0 +1,31 @@
+/**
+  * Licensed to the Apache Software Foundation (ASF) under one
+  * or more contributor license agreements. See the NOTICE file
+  * distributed with this work for additional information
+  * regarding copyright ownership. The ASF licenses this file
+  * to you under the Apache License, Version 2.0 (the
+  * "License"); you may not use this file except in compliance
+  * with the License. You may obtain a copy of the License at
+  *
+  * http://www.apache.org/licenses/LICENSE-2.0
+  *
+  * Unless required by applicable law or agreed to in writing,
+  * software distributed under the License is distributed on an
+  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  * KIND, either express or implied. See the License for the
+  * specific language governing permissions and limitations
+  * under the License.
+  *
+  * Copyright © 2018 AudienceProject. All rights reserved.
+  */
+package com.audienceproject.spark.dynamodb.connector
+
+import org.apache.spark.sql.Row
+import org.apache.spark.sql.types.StructType
+
+trait DynamoUpdatable {
+
+
+    def updateItems(schema: StructType)(items: Iterator[Row]): Unit
+
+}

src/main/scala/com/audienceproject/spark/dynamodb/connector/DynamoWritable.scala

Lines changed: 1 addition & 1 deletion
@@ -25,7 +25,7 @@ import org.apache.spark.sql.types.StructType
 
 trait DynamoWritable {
 
-    val writeLimit: Int
+    val writeLimit: Double
 
     def putItems(schema: StructType, batchSize: Int)(items: Iterator[Row]): Unit
 
src/main/scala/com/audienceproject/spark/dynamodb/connector/TableConnector.scala

Lines changed: 77 additions & 8 deletions
@@ -22,7 +22,7 @@ package com.audienceproject.spark.dynamodb.connector
 
 import com.amazonaws.services.dynamodbv2.document._
 import com.amazonaws.services.dynamodbv2.document.spec.{BatchWriteItemSpec, ScanSpec}
-import com.amazonaws.services.dynamodbv2.model.ReturnConsumedCapacity
+import com.amazonaws.services.dynamodbv2.model.{AttributeValue, ReturnConsumedCapacity, UpdateItemRequest, UpdateItemResult}
 import com.amazonaws.services.dynamodbv2.xspec.ExpressionSpecBuilder
 import com.google.common.util.concurrent.RateLimiter
 import org.apache.spark.sql.Row
@@ -33,13 +33,14 @@ import scala.annotation.tailrec
 import scala.collection.JavaConverters._
 
 private[dynamodb] class TableConnector(tableName: String, totalSegments: Int, parameters: Map[String, String])
-    extends DynamoConnector with DynamoWritable with Serializable {
+    extends DynamoConnector with DynamoWritable with DynamoUpdatable with Serializable {
 
     private val consistentRead = parameters.getOrElse("stronglyConsistentReads", "false").toBoolean
     private val filterPushdown = parameters.getOrElse("filterPushdown", "true").toBoolean
+    private val region = parameters.get("region")
 
     override val (keySchema, readLimit, writeLimit, itemLimit, totalSizeInBytes) = {
-        val table = getClient.getTable(tableName)
+        val table = getDynamoDB(region).getTable(tableName)
         val desc = table.describe()
 
         // Key schema.
@@ -56,10 +57,10 @@ private[dynamodb] class TableConnector(tableName: String, totalSegments: Int, pa
         val readCapacity = desc.getProvisionedThroughput.getReadCapacityUnits * targetCapacity
         val writeCapacity = desc.getProvisionedThroughput.getWriteCapacityUnits * targetCapacity
 
-        val readLimit = (readCapacity / totalSegments).toInt max 1
-        val itemLimit = (bytesPerRCU / avgItemSize * readLimit).toInt * readFactor
+        val readLimit = readCapacity / totalSegments
+        val itemLimit = ((bytesPerRCU / avgItemSize * readLimit).toInt * readFactor) max 1
 
-        val writeLimit = (writeCapacity / totalSegments).toInt
+        val writeLimit = writeCapacity / totalSegments
 
         (keySchema, readLimit, writeLimit, itemLimit, tableSize.toLong)
     }
@@ -82,7 +83,48 @@ private[dynamodb] class TableConnector(tableName: String, totalSegments: Int, pa
             scanSpec.withExpressionSpec(xspec.buildForScan())
         }
 
-        getClient.getTable(tableName).scan(scanSpec)
+        getDynamoDB(region).getTable(tableName).scan(scanSpec)
+    }
+
+    override def updateItems(schema: StructType)(items: Iterator[Row]): Unit = {
+        val columnNames = schema.map(_.name)
+        val hashKeyIndex = columnNames.indexOf(keySchema.hashKeyName)
+        val rangeKeyIndex = keySchema.rangeKeyName.map(columnNames.indexOf)
+        val columnIndices = columnNames.zipWithIndex.filterNot({
+            case (name, _) => keySchema match {
+                case KeySchema(hashKey, None) => name == hashKey
+                case KeySchema(hashKey, Some(rangeKey)) => name == hashKey || name == rangeKey
+            }
+        })
+
+        val rateLimiter = RateLimiter.create(writeLimit max 1)
+        val client = getDynamoDBClient(region)
+
+
+
+        // For each item.
+        items.foreach(row => {
+            val key: Map[String, AttributeValue] = keySchema match {
+                case KeySchema(hashKey, None) => Map(hashKey -> mapValueToAttributeValue(row(hashKeyIndex), schema(hashKey).dataType))
+                case KeySchema(hashKey, Some(rangeKey)) =>
+                    Map(hashKey -> mapValueToAttributeValue(row(hashKeyIndex), schema(hashKey).dataType),
+                        rangeKey -> mapValueToAttributeValue(row(rangeKeyIndex.get), schema(rangeKey).dataType))
+
+            }
+            val nonNullColumnIndices = columnIndices.filter(c => row(c._2) != null)
+            val updateExpression = s"SET ${nonNullColumnIndices.map(c => s"${c._1}=:${c._1}").mkString(", ")}"
+            val expressionAttributeValues = nonNullColumnIndices.map(c => s":${c._1}" -> mapValueToAttributeValue(row(c._2), schema(c._1).dataType)).toMap.asJava
+            val updateItemReq = new UpdateItemRequest()
+                .withReturnConsumedCapacity(ReturnConsumedCapacity.TOTAL)
+                .withTableName(tableName)
+                .withKey(key.asJava)
+                .withUpdateExpression(updateExpression)
+                .withExpressionAttributeValues(expressionAttributeValues)
+
+            val updateItemResult = client.updateItem(updateItemReq)
+
+            handleUpdateItemResult(rateLimiter)(updateItemResult)
+        })
     }
 
     override def putItems(schema: StructType, batchSize: Int)(items: Iterator[Row]): Unit = {
@@ -97,7 +139,7 @@ private[dynamodb] class TableConnector(tableName: String, totalSegments: Int, pa
         })
 
         val rateLimiter = RateLimiter.create(writeLimit max 1)
-        val client = getClient
+        val client = getDynamoDB(region)
 
         // For each batch.
         items.grouped(batchSize).foreach(itemBatch => {
@@ -147,6 +189,26 @@ private[dynamodb] class TableConnector(tableName: String, totalSegments: Int, pa
         }
     }
 
+    private def mapValueToAttributeValue(element: Any, elementType: DataType): AttributeValue = {
+        elementType match {
+            case ArrayType(innerType, _) => new AttributeValue().withL(element.asInstanceOf[Seq[_]].map(e => mapValueToAttributeValue(e, innerType)): _*)
+            case MapType(keyType, valueType, _) =>
+                if (keyType != StringType) throw new IllegalArgumentException(
+                    s"Invalid Map key type '${keyType.typeName}'. DynamoDB only supports String as Map key type.")
+
+                new AttributeValue().withM(element.asInstanceOf[Map[String, _]].mapValues(e => mapValueToAttributeValue(e, valueType)).asJava)
+
+            case StructType(fields) =>
+                val row = element.asInstanceOf[Row]
+                new AttributeValue().withM((fields.indices map { i =>
+                    fields(i).name -> mapValueToAttributeValue(row(i), fields(i).dataType)
+                }).toMap.asJava)
+            case StringType => new AttributeValue().withS(element.asInstanceOf[String])
+            case LongType | IntegerType | DoubleType | FloatType => new AttributeValue().withN(element.toString)
+            case BooleanType => new AttributeValue().withBOOL(element.asInstanceOf[Boolean])
+        }
+    }
+
     @tailrec
     private def handleBatchWriteResponse(client: DynamoDB, rateLimiter: RateLimiter)
                                         (response: BatchWriteItemOutcome): Unit = {
@@ -162,5 +224,12 @@ private[dynamodb] class TableConnector(tableName: String, totalSegments: Int, pa
             handleBatchWriteResponse(client, rateLimiter)(newResponse)
         }
     }
+    private def handleUpdateItemResult(rateLimiter: RateLimiter)
+                                      (result: UpdateItemResult): Unit = {
+        // Rate limit on write capacity.
+        if (result.getConsumedCapacity != null) {
+            rateLimiter.acquire(result.getConsumedCapacity.getCapacityUnits.toInt)
+        }
+    }
 
 }
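To make the request shape built by `updateItems` concrete, here is a hedged sketch of the call produced for a single row, assuming a hypothetical table keyed on `name` with non-null columns `color` and `weight_kg`; the names and values are illustrative and not part of this commit:

```scala
import com.amazonaws.services.dynamodbv2.model.{AttributeValue, ReturnConsumedCapacity, UpdateItemRequest}
import scala.collection.JavaConverters._

// Illustration only: key and column names are assumptions, but the expression
// format mirrors the interpolation in updateItems above (null columns are skipped).
val exampleRequest = new UpdateItemRequest()
    .withReturnConsumedCapacity(ReturnConsumedCapacity.TOTAL)
    .withTableName("VegeTable")
    .withKey(Map("name" -> new AttributeValue().withS("carrot")).asJava)
    .withUpdateExpression("SET color=:color, weight_kg=:weight_kg")
    .withExpressionAttributeValues(Map(
        ":color" -> new AttributeValue().withS("orange"),
        ":weight_kg" -> new AttributeValue().withN("0.3")
    ).asJava)
```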

src/main/scala/com/audienceproject/spark/dynamodb/connector/TableIndexConnector.scala

Lines changed: 5 additions & 4 deletions
@@ -33,9 +33,10 @@ private[dynamodb] class TableIndexConnector(tableName: String, indexName: String
 
     private val consistentRead = parameters.getOrElse("stronglyConsistentReads", "false").toBoolean
     private val filterPushdown = parameters.getOrElse("filterPushdown", "true").toBoolean
+    private val region = parameters.get("region")
 
     override val (keySchema, readLimit, itemLimit, totalSizeInBytes) = {
-        val table = getClient.getTable(tableName)
+        val table = getDynamoDB(region).getTable(tableName)
         val indexDesc = table.describe().getGlobalSecondaryIndexes.asScala.find(_.getIndexName == indexName).get
 
         // Key schema.
@@ -51,8 +52,8 @@ private[dynamodb] class TableIndexConnector(tableName: String, indexName: String
         val avgItemSize = tableSize.toDouble / indexDesc.getItemCount
         val readCapacity = indexDesc.getProvisionedThroughput.getReadCapacityUnits * targetCapacity
 
-        val rateLimit = (readCapacity / totalSegments).toInt max 1
-        val itemLimit = (bytesPerRCU / avgItemSize * rateLimit).toInt * readFactor
+        val rateLimit = readCapacity / totalSegments
+        val itemLimit = ((bytesPerRCU / avgItemSize * rateLimit).toInt * readFactor) max 1
 
         (keySchema, rateLimit, itemLimit, tableSize.toLong)
     }
@@ -75,7 +76,7 @@ private[dynamodb] class TableIndexConnector(tableName: String, indexName: String
             scanSpec.withExpressionSpec(xspec.buildForScan())
         }
 
-        getClient.getTable(tableName).getIndex(indexName).scan(scanSpec)
+        getDynamoDB(region).getTable(tableName).getIndex(indexName).scan(scanSpec)
     }
 
 }

src/main/scala/com/audienceproject/spark/dynamodb/rdd/DynamoWriteRelation.scala

Lines changed: 5 additions & 0 deletions
@@ -44,4 +44,9 @@ private[dynamodb] class DynamoWriteRelation(data: DataFrame, parameters: Map[Str
         data.foreachPartition(connector.putItems(schema, batchSize) _)
     }
 
+    def update(): Unit = {
+        data.foreachPartition(connector.updateItems(schema) _)
+    }
+
+
 }

src/main/scala/com/audienceproject/spark/dynamodb/rdd/ScanPartition.scala

Lines changed: 1 addition & 1 deletion
@@ -46,7 +46,7 @@ private[dynamodb] class ScanPartition(schema: StructType,
 
         if (connector.isEmpty) return Iterator.empty
 
-        val rateLimiter = RateLimiter.create(connector.readLimit max 1)
+        val rateLimiter = RateLimiter.create(connector.readLimit)
 
         val scanResult = connector.scan(index, requiredColumns, filters)
 
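A note on the `RateLimiter.create(connector.readLimit)` change above: Guava's RateLimiter accepts fractional permits per second, so typing `readLimit` as Double lets a segment's share of read capacity drop below 1 RCU/s instead of being rounded up. A minimal sketch, where the 0.5 figure is an illustrative assumption:

```scala
import com.google.common.util.concurrent.RateLimiter

// e.g. 1 provisioned RCU split across 2 scan segments gives each segment 0.5 permits/second.
val limiter = RateLimiter.create(0.5)
limiter.acquire(1) // subsequent acquires block for roughly two seconds each
```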