This repository was archived by the owner on Aug 31, 2021. It is now read-only.

Commit 9ef5f45

Merge pull request #51 from emailage/feature/write-deletes
2 parents 2161e1e + 70ab0f8 commit 9ef5f45

File tree: 7 files changed, 154 insertions(+), 9 deletions(-)

src/main/scala/com/audienceproject/spark/dynamodb/connector/DynamoWritable.scala

Lines changed: 3 additions & 0 deletions
@@ -34,4 +34,7 @@ private[dynamodb] trait DynamoWritable {
     def updateItem(columnSchema: ColumnSchema, item: InternalRow)
                   (client: DynamoDB, rateLimiter: RateLimiter): Unit
 
+    def deleteItems(columnSchema: ColumnSchema, items: Seq[InternalRow])
+                   (client: DynamoDB, rateLimiter: RateLimiter): Unit
+
 }

src/main/scala/com/audienceproject/spark/dynamodb/connector/TableConnector.scala

Lines changed: 29 additions & 0 deletions
@@ -171,6 +171,35 @@ private[dynamodb] class TableConnector(tableName: String, parallelism: Int, para
             .foreach(cap => rateLimiter.acquire(cap.getCapacityUnits.toInt max 1))
     }
 
+    override def deleteItems(columnSchema: ColumnSchema, items: Seq[InternalRow])
+                            (client: DynamoDB, rateLimiter: RateLimiter): Unit = {
+        // Build a single BatchWriteItem request covering this batch of rows.
+        val batchWriteItemSpec = new BatchWriteItemSpec().withReturnConsumedCapacity(ReturnConsumedCapacity.TOTAL)
+
+        val tableWriteItems = new TableWriteItems(tableName)
+        val tableWriteItemsWithItems: TableWriteItems =
+            // Check whether the table has a hash key only, or a hash key and a range key.
+            columnSchema.keys() match {
+                case Left((hashKey, hashKeyIndex, hashKeyType)) =>
+                    val hashKeys = items.map(row =>
+                        JavaConverter.convertRowValue(row, hashKeyIndex, hashKeyType).asInstanceOf[AnyRef])
+                    tableWriteItems.withHashOnlyKeysToDelete(hashKey, hashKeys: _*)
+
+                case Right(((hashKey, hashKeyIndex, hashKeyType), (rangeKey, rangeKeyIndex, rangeKeyType))) =>
+                    val alternatingHashAndRangeKeys = items.flatMap { row =>
+                        val hashKeyValue = JavaConverter.convertRowValue(row, hashKeyIndex, hashKeyType)
+                        val rangeKeyValue = JavaConverter.convertRowValue(row, rangeKeyIndex, rangeKeyType)
+                        Seq(hashKeyValue.asInstanceOf[AnyRef], rangeKeyValue.asInstanceOf[AnyRef])
+                    }
+                    tableWriteItems.withHashAndRangeKeysToDelete(hashKey, rangeKey, alternatingHashAndRangeKeys: _*)
+            }
+
+        batchWriteItemSpec.withTableWriteItems(tableWriteItemsWithItems)
+
+        val response = client.batchWriteItem(batchWriteItemSpec)
+        handleBatchWriteResponse(client, rateLimiter)(response)
+    }
+
     @tailrec
     private def handleBatchWriteResponse(client: DynamoDB, rateLimiter: RateLimiter)
                                         (response: BatchWriteItemOutcome): Unit = {
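
Note on the range-key branch above: the Document API's withHashAndRangeKeysToDelete expects the key values interleaved as a flat varargs list (hashKey1, rangeKey1, hashKey2, rangeKey2, ...), which is why the rows are flattened into alternatingHashAndRangeKeys. A minimal standalone sketch of that convention against a hypothetical "Fruit" table with string hash key "name" and numeric range key "weight" (the table and key names are illustrative, not part of this change):

    import com.amazonaws.services.dynamodbv2.document.{DynamoDB, TableWriteItems}
    import com.amazonaws.services.dynamodbv2.document.spec.BatchWriteItemSpec

    // Deletes the items ("lemon", 0.1) and ("orange", 0.2); values alternate hash key, range key.
    def deleteTwoItems(dynamoDB: DynamoDB): Unit = {
        val writeItems = new TableWriteItems("Fruit")
            .withHashAndRangeKeysToDelete("name", "weight",
                "lemon", Double.box(0.1), "orange", Double.box(0.2))
        val spec = new BatchWriteItemSpec().withTableWriteItems(writeItems)
        // Any unprocessed items returned here would still need retrying, as handleBatchWriteResponse does above.
        dynamoDB.batchWriteItem(spec)
    }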
src/main/scala/com/audienceproject/spark/dynamodb/datasource/DynamoBatchDeleter.scala

Lines changed: 18 additions & 0 deletions
@@ -0,0 +1,18 @@
+package com.audienceproject.spark.dynamodb.datasource
+
+import com.amazonaws.services.dynamodbv2.document.DynamoDB
+import com.audienceproject.spark.dynamodb.connector.{ColumnSchema, TableConnector}
+
+class DynamoBatchDeleter(batchSize: Int,
+                         columnSchema: ColumnSchema,
+                         connector: TableConnector,
+                         client: DynamoDB)
+    extends DynamoBatchWriter(batchSize, columnSchema, connector, client) {
+
+    protected override def flush(): Unit = {
+        if (buffer.nonEmpty) {
+            connector.deleteItems(columnSchema, buffer)(client, rateLimiter)
+            buffer.clear()
+        }
+    }
+}

src/main/scala/com/audienceproject/spark/dynamodb/datasource/DynamoBatchWriter.scala

Lines changed: 5 additions & 4 deletions
@@ -31,11 +31,12 @@ import scala.collection.mutable.ArrayBuffer
 class DynamoBatchWriter(batchSize: Int,
                         columnSchema: ColumnSchema,
                         connector: TableConnector,
-                        client: DynamoDB)
+                        client: DynamoDB
+                       )
     extends DataWriter[InternalRow] {
 
-    private val buffer = new ArrayBuffer[InternalRow](batchSize)
-    private val rateLimiter = RateLimiter.create(connector.writeLimit)
+    protected val buffer = new ArrayBuffer[InternalRow](batchSize)
+    protected val rateLimiter = RateLimiter.create(connector.writeLimit)
 
     override def write(record: InternalRow): Unit = {
         buffer += record.copy()

@@ -51,7 +52,7 @@ class DynamoBatchWriter(batchSize: Int,
 
     override def abort(): Unit = {}
 
-    private def flush(): Unit = {
+    protected def flush(): Unit = {
         if (buffer.nonEmpty) {
             connector.putItems(columnSchema, buffer)(client, rateLimiter)
             buffer.clear()

src/main/scala/com/audienceproject/spark/dynamodb/datasource/DynamoWriterFactory.scala

Lines changed: 7 additions & 1 deletion
@@ -32,15 +32,21 @@ class DynamoWriterFactory(connector: TableConnector,
 
     private val batchSize = parameters.getOrElse("writebatchsize", "25").toInt
     private val update = parameters.getOrElse("update", "false").toBoolean
+    private val delete = parameters.getOrElse("delete", "false").toBoolean
 
     private val region = parameters.get("region")
     private val roleArn = parameters.get("rolearn")
 
     override def createDataWriter(partitionId: Int, taskId: Long, epochId: Long): DataWriter[InternalRow] = {
         val columnSchema = new ColumnSchema(connector.keySchema, schema)
         val client = connector.getDynamoDB(region, roleArn)
-        if (update)
+        if (update) {
+            assert(!delete, "Please provide exactly one of 'update' or 'delete' options.")
             new DynamoUpdateWriter(columnSchema, connector, client)
+        }
+        else if (delete) {
+            new DynamoBatchDeleter(batchSize, columnSchema, connector, client)
+        }
         else
             new DynamoBatchWriter(batchSize, columnSchema, connector, client)
     }
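
Taken together, the new option routes a DataFrame write through DynamoBatchDeleter. A minimal usage sketch, assuming a table named "Fruit" whose key columns are present in keysDf (the table name and the keysDf DataFrame are illustrative; the "delete" and "writebatchsize" option names come from this factory):

    import com.audienceproject.spark.dynamodb.implicits._

    // Only the key columns matter for a delete; keys that do not exist in the table are ignored by DynamoDB.
    keysDf.write
        .option("delete", "true")          // selects DynamoBatchDeleter instead of DynamoBatchWriter
        .option("writebatchsize", "25")    // optional; 25 is the default and also DynamoDB's per-request BatchWriteItem limit
        .dynamodb("Fruit")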

src/test/resources/log4j2.xml

Lines changed: 12 additions & 0 deletions
@@ -12,6 +12,18 @@
         <Root level="INFO">
             <AppenderRef ref="console" />
         </Root>
+        <logger name="org.apache.spark" level="WARN">
+            <AppenderRef ref="simple-console"/>
+        </logger>
+        <logger name="org.spark_project.jetty" level="WARN">
+            <AppenderRef ref="simple-console"/>
+        </logger>
+        <logger name="com.amazonaws.services.dynamodbv2.local" level="WARN">
+            <AppenderRef ref="simple-console"/>
+        </logger>
+        <logger name="com.amazonaws.auth.profile.internal.BasicProfileConfigLoader" level="ERROR">
+            <AppenderRef ref="simple-console"/>
+        </logger>
         <Logger name="MessageOnly" level="INFO" additivity="false">
             <AppenderRef ref="simple-console"/>
         </Logger>

src/test/scala/com/audienceproject/spark/dynamodb/WriteRelationTest.scala

Lines changed: 80 additions & 4 deletions
@@ -20,11 +20,15 @@
  */
 package com.audienceproject.spark.dynamodb
 
-import com.amazonaws.services.dynamodbv2.model.{AttributeDefinition, CreateTableRequest, KeySchemaElement, ProvisionedThroughput}
+import java.util
+
+import collection.JavaConverters._
+import com.amazonaws.services.dynamodbv2.model.{AttributeDefinition, CreateTableRequest, KeySchemaElement, KeyType, ProvisionedThroughput}
 import com.audienceproject.spark.dynamodb.implicits._
-import org.apache.spark.sql.functions.{length, lit, when}
+import org.apache.spark.sql.functions.{lit, when, length => sqlLength}
+import org.scalatest.Matchers
 
-class WriteRelationTest extends AbstractInMemoryTest {
+class WriteRelationTest extends AbstractInMemoryTest with Matchers {
 
     test("Inserting from a local Dataset") {
         dynamoDB.createTable(new CreateTableRequest()

@@ -52,6 +56,78 @@ class WriteRelationTest extends AbstractInMemoryTest {
         assert(validationDs.select("weight").as[Double].collect().forall(Seq(0.1, 0.2, 0.2) contains _))
     }
 
+    test("Deleting from a local Dataset with a HashKey only") {
+        val tablename = "DeleteTest1"
+        dynamoDB.createTable(new CreateTableRequest()
+            .withTableName(tablename)
+            .withAttributeDefinitions(new AttributeDefinition("name", "S"))
+            .withKeySchema(new KeySchemaElement("name", "HASH"))
+            .withProvisionedThroughput(new ProvisionedThroughput(5L, 5L)))
+
+        import spark.implicits._
+
+        val newItemsDs = Seq(
+            ("lemon", "yellow", 0.1),
+            ("orange", "orange", 0.2),
+            ("pomegranate", "red", 0.2)
+        ).toDF("name", "color", "weight")
+        newItemsDs.write.dynamodb(tablename)
+
+        val toDelete = Seq(
+            ("lemon", "yellow"),
+            ("orange", "blue"),
+            ("doesn't exist", "black")
+        ).toDF("name", "color")
+        toDelete.write.option("delete", "true").dynamodb(tablename)
+
+        val validationDs = spark.read.dynamodb(tablename)
+        validationDs.count() shouldEqual 1
+        val rec = validationDs.first
+        rec.getString(rec.fieldIndex("name")) shouldEqual "pomegranate"
+        rec.getString(rec.fieldIndex("color")) shouldEqual "red"
+        rec.getDouble(rec.fieldIndex("weight")) shouldEqual 0.2
+    }
+
+    test("Deleting from a local Dataset with a HashKey and RangeKey") {
+        val tablename = "DeleteTest2"
+
+        dynamoDB.createTable(new CreateTableRequest()
+            .withTableName(tablename)
+            .withAttributeDefinitions(Seq(
+                new AttributeDefinition("name", "S"),
+                new AttributeDefinition("weight", "N")
+            ).asJavaCollection)
+            .withKeySchema(Seq(
+                new KeySchemaElement("name", KeyType.HASH),
+                // also test that a non-string range key works
+                new KeySchemaElement("weight", KeyType.RANGE)
+            ).asJavaCollection)
+            .withProvisionedThroughput(new ProvisionedThroughput(5L, 5L)))
+
+        import spark.implicits._
+
+        val newItemsDs = Seq(
+            ("lemon", "yellow", 0.1),
+            ("lemon", "blue", 4.0),
+            ("orange", "orange", 0.2),
+            ("pomegranate", "red", 0.2)
+        ).toDF("name", "color", "weight")
+        newItemsDs.write.dynamodb(tablename)
+
+        val toDelete = Seq(
+            ("lemon", "yellow", 0.1),
+            ("orange", "orange", 0.2),
+            ("pomegranate", "shouldn'tdelete", 0.5)
+        ).toDF("name", "color", "weight")
+        toDelete.write.option("delete", "true").dynamodb(tablename)
+
+        val validationDs = spark.read.dynamodb(tablename)
+        validationDs.show
+        validationDs.count() shouldEqual 2
+        validationDs.select("name").as[String].collect should contain theSameElementsAs Seq("lemon", "pomegranate")
+        validationDs.select("color").as[String].collect should contain theSameElementsAs Seq("blue", "red")
+    }
+
     test("Updating from a local Dataset with new and only some previous columns") {
         val tablename = "UpdateTest1"
         dynamoDB.createTable(new CreateTableRequest()

@@ -70,7 +146,7 @@ class WriteRelationTest extends AbstractInMemoryTest {
         newItemsDs.write.dynamodb(tablename)
 
         newItemsDs
-            .withColumn("size", length($"color"))
+            .withColumn("size", sqlLength($"color"))
             .drop("color")
             .withColumn("weight", $"weight" * 2)
             .write.option("update", "true").dynamodb(tablename)
