This repository was archived by the owner on Aug 31, 2021. It is now read-only.

Commit 5c48427

Added delete option on write

Added delete tests. Decreased the amount of logging when running tests.

1 parent 2161e1e commit 5c48427

File tree

6 files changed: +139 −36 lines
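In user terms, the change lets a DataFrame write issue deletes instead of puts. A minimal sketch of the new option, assuming a SparkSession named `spark` and a table whose primary-key columns are present in the DataFrame (the table name and filter are illustrative; the `dynamodb` reader/writer comes from the project's implicits, and the `delete` option is what this commit adds):

    import com.audienceproject.spark.dynamodb.implicits._

    // Rows whose primary-key columns identify the items to remove ("SomeTable" is a placeholder).
    val toDelete = spark.read.dynamodb("SomeTable").filter("name in ('lemon', 'orange')")

    // With delete=true the batch writer fills the BatchWriteItem request with delete
    // requests (hash key, plus range key if the table has one) instead of puts.
    toDelete.write.option("delete", "true").dynamodb("SomeTable")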

src/main/scala/com/audienceproject/spark/dynamodb/connector/DynamoWritable.scala

Lines changed: 1 addition & 1 deletion
@@ -29,7 +29,7 @@ private[dynamodb] trait DynamoWritable {
     val writeLimit: Double
 
     def putItems(columnSchema: ColumnSchema, items: Seq[InternalRow])
-                (client: DynamoDB, rateLimiter: RateLimiter): Unit
+                (client: DynamoDB, rateLimiter: RateLimiter, delete: Boolean): Unit
 
     def updateItem(columnSchema: ColumnSchema, item: InternalRow)
                   (client: DynamoDB, rateLimiter: RateLimiter): Unit

src/main/scala/com/audienceproject/spark/dynamodb/connector/TableConnector.scala

Lines changed: 46 additions & 24 deletions
@@ -110,34 +110,56 @@ private[dynamodb] class TableConnector(tableName: String, parallelism: Int, para
     }
 
     override def putItems(columnSchema: ColumnSchema, items: Seq[InternalRow])
-                         (client: DynamoDB, rateLimiter: RateLimiter): Unit = {
+                         (client: DynamoDB, rateLimiter: RateLimiter, delete: Boolean): Unit = {
         // For each batch.
         val batchWriteItemSpec = new BatchWriteItemSpec().withReturnConsumedCapacity(ReturnConsumedCapacity.TOTAL)
-        batchWriteItemSpec.withTableWriteItems(new TableWriteItems(tableName).withItemsToPut(
-            // Map the items.
-            items.map(row => {
-                val item = new Item()
-
-                // Map primary key.
-                columnSchema.keys() match {
-                    case Left((hashKey, hashKeyIndex, hashKeyType)) =>
-                        item.withPrimaryKey(hashKey, JavaConverter.convertRowValue(row, hashKeyIndex, hashKeyType))
-                    case Right(((hashKey, hashKeyIndex, hashKeyType), (rangeKey, rangeKeyIndex, rangeKeyType))) =>
+
+        val tableWriteItems = new TableWriteItems(tableName)
+        val tableWriteItemsWithItems: TableWriteItems = if (delete) {
+            // Check if hash key only, or also range key.
+            columnSchema.keys() match {
+                case Left((hashKey, hashKeyIndex, hashKeyType)) =>
+                    val hashKeys = items.map(row =>
+                        JavaConverter.convertRowValue(row, hashKeyIndex, hashKeyType).asInstanceOf[AnyRef])
+                    tableWriteItems.withHashOnlyKeysToDelete(hashKey, hashKeys: _*)
+
+                case Right(((hashKey, hashKeyIndex, hashKeyType), (rangeKey, rangeKeyIndex, rangeKeyType))) =>
+                    val alternatingHashAndRangeKeys = items.flatMap { case row =>
                         val hashKeyValue = JavaConverter.convertRowValue(row, hashKeyIndex, hashKeyType)
                         val rangeKeyValue = JavaConverter.convertRowValue(row, rangeKeyIndex, rangeKeyType)
-                        item.withPrimaryKey(hashKey, hashKeyValue, rangeKey, rangeKeyValue)
-                }
-
-                // Map remaining columns.
-                columnSchema.attributes().foreach({
-                    case (name, index, dataType) if !row.isNullAt(index) =>
-                        item.`with`(name, JavaConverter.convertRowValue(row, index, dataType))
-                    case _ =>
-                })
-
-                item
-            }): _*
-        ))
+                        Seq(hashKeyValue.asInstanceOf[AnyRef], rangeKeyValue.asInstanceOf[AnyRef])
+                    }
+                    tableWriteItems.withHashAndRangeKeysToDelete(hashKey, rangeKey, alternatingHashAndRangeKeys: _*)
+            }
+        } else {
+            // Map the items.
+            tableWriteItems.withItemsToPut(
+                items.map(row => {
+                    val item = new Item()
+
+                    // Map primary key.
+                    columnSchema.keys() match {
+                        case Left((hashKey, hashKeyIndex, hashKeyType)) =>
+                            item.withPrimaryKey(hashKey, JavaConverter.convertRowValue(row, hashKeyIndex, hashKeyType))
+                        case Right(((hashKey, hashKeyIndex, hashKeyType), (rangeKey, rangeKeyIndex, rangeKeyType))) =>
+                            val hashKeyValue = JavaConverter.convertRowValue(row, hashKeyIndex, hashKeyType)
+                            val rangeKeyValue = JavaConverter.convertRowValue(row, rangeKeyIndex, rangeKeyType)
+                            item.withPrimaryKey(hashKey, hashKeyValue, rangeKey, rangeKeyValue)
+                    }
+
+                    // Map remaining columns.
+                    columnSchema.attributes().foreach({
+                        case (name, index, dataType) if !row.isNullAt(index) =>
+                            item.`with`(name, JavaConverter.convertRowValue(row, index, dataType))
+                        case _ =>
+                    })
+
+                    item
+                }): _*
+            )
+        }
+
+        batchWriteItemSpec.withTableWriteItems(tableWriteItemsWithItems)
 
         val response = client.batchWriteItem(batchWriteItemSpec)
         handleBatchWriteResponse(client, rateLimiter)(response)
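For reference, the delete branch leans on the DynamoDB Document API's batch-delete helpers, where hash-and-range key values are passed as a single flat varargs list that alternates hash value, range value, hash value, range value. A minimal standalone sketch against the AWS SDK (the table name "FruitTable" and the key names are made up for illustration):

    import com.amazonaws.services.dynamodbv2.document.{DynamoDB, TableWriteItems}
    import com.amazonaws.services.dynamodbv2.document.spec.BatchWriteItemSpec

    // Hypothetical table keyed by hash key "name" (S) and range key "weight" (N).
    def deleteTwoItems(client: DynamoDB): Unit = {
        val writeItems = new TableWriteItems("FruitTable")
            // Values alternate: hash1, range1, hash2, range2, ...
            .withHashAndRangeKeysToDelete("name", "weight",
                "lemon", Double.box(0.1),
                "orange", Double.box(0.2))
        client.batchWriteItem(new BatchWriteItemSpec().withTableWriteItems(writeItems))
    }

This is why the connector flattens each row into a two-element Seq and concatenates them with flatMap before the single varargs call.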

src/main/scala/com/audienceproject/spark/dynamodb/datasource/DynamoBatchWriter.scala

Lines changed: 4 additions & 2 deletions
@@ -31,7 +31,9 @@ import scala.collection.mutable.ArrayBuffer
 class DynamoBatchWriter(batchSize: Int,
                         columnSchema: ColumnSchema,
                         connector: TableConnector,
-                        client: DynamoDB)
+                        client: DynamoDB,
+                        delete: Boolean
+                       )
     extends DataWriter[InternalRow] {
 
     private val buffer = new ArrayBuffer[InternalRow](batchSize)
@@ -53,7 +55,7 @@ class DynamoBatchWriter(batchSize: Int,
 
     private def flush(): Unit = {
         if (buffer.nonEmpty) {
-            connector.putItems(columnSchema, buffer)(client, rateLimiter)
+            connector.putItems(columnSchema, buffer)(client, rateLimiter, delete)
             buffer.clear()
         }
     }

src/main/scala/com/audienceproject/spark/dynamodb/datasource/DynamoWriterFactory.scala

Lines changed: 5 additions & 2 deletions
@@ -32,17 +32,20 @@ class DynamoWriterFactory(connector: TableConnector,
 
     private val batchSize = parameters.getOrElse("writebatchsize", "25").toInt
     private val update = parameters.getOrElse("update", "false").toBoolean
+    private val delete = parameters.getOrElse("delete", "false").toBoolean
 
     private val region = parameters.get("region")
     private val roleArn = parameters.get("rolearn")
 
     override def createDataWriter(partitionId: Int, taskId: Long, epochId: Long): DataWriter[InternalRow] = {
         val columnSchema = new ColumnSchema(connector.keySchema, schema)
         val client = connector.getDynamoDB(region, roleArn)
-        if (update)
+        if (update) {
+            assert(!delete, "Please provide exactly one of 'update' or 'delete' options.")
             new DynamoUpdateWriter(columnSchema, connector, client)
+        }
         else
-            new DynamoBatchWriter(batchSize, columnSchema, connector, client)
+            new DynamoBatchWriter(batchSize, columnSchema, connector, client, delete)
     }
 
 }
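As the assertion above implies, the `update` and `delete` options are mutually exclusive and select the writer. A sketch of the three write modes from the user's side (a SparkSession named `spark` and the table name "SomeTable" are assumed for illustration):

    import com.audienceproject.spark.dynamodb.implicits._
    import spark.implicits._  // assumes a SparkSession named `spark`, as in the tests

    val df = Seq(("lemon", "yellow", 0.1)).toDF("name", "color", "weight")

    df.write.dynamodb("SomeTable")                            // default: put items
    df.write.option("update", "true").dynamodb("SomeTable")   // update existing items
    df.write.option("delete", "true").dynamodb("SomeTable")   // delete by primary key

    // Setting both options now fails fast in DynamoWriterFactory with:
    // "Please provide exactly one of 'update' or 'delete' options."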

src/test/resources/log4j2.xml

Lines changed: 12 additions & 0 deletions
@@ -12,6 +12,18 @@
         <Root level="INFO">
             <AppenderRef ref="console" />
         </Root>
+        <logger name="org.apache.spark" level="WARN">
+            <AppenderRef ref="simple-console"/>
+        </logger>
+        <logger name="org.spark_project.jetty" level="WARN">
+            <AppenderRef ref="simple-console"/>
+        </logger>
+        <logger name="com.amazonaws.services.dynamodbv2.local" level="WARN">
+            <AppenderRef ref="simple-console"/>
+        </logger>
+        <logger name="com.amazonaws.auth.profile.internal.BasicProfileConfigLoader" level="ERROR">
+            <AppenderRef ref="simple-console"/>
+        </logger>
         <Logger name="MessageOnly" level="INFO" additivity="false">
             <AppenderRef ref="simple-console"/>
         </Logger>

src/test/scala/com/audienceproject/spark/dynamodb/WriteRelationTest.scala

Lines changed: 71 additions & 7 deletions
@@ -20,11 +20,15 @@
  */
 package com.audienceproject.spark.dynamodb
 
-import com.amazonaws.services.dynamodbv2.model.{AttributeDefinition, CreateTableRequest, KeySchemaElement, ProvisionedThroughput}
+import java.util
+
+import collection.JavaConverters._
+import com.amazonaws.services.dynamodbv2.model.{AttributeDefinition, CreateTableRequest, KeySchemaElement, KeyType, ProvisionedThroughput}
 import com.audienceproject.spark.dynamodb.implicits._
-import org.apache.spark.sql.functions.{length, lit, when}
+import org.apache.spark.sql.functions.{lit, when, length => sqlLength}
+import org.scalatest.Matchers
 
-class WriteRelationTest extends AbstractInMemoryTest {
+class WriteRelationTest extends AbstractInMemoryTest with Matchers {
 
     test("Inserting from a local Dataset") {
         dynamoDB.createTable(new CreateTableRequest()
@@ -52,6 +56,69 @@ class WriteRelationTest extends AbstractInMemoryTest {
         assert(validationDs.select("weight").as[Double].collect().forall(Seq(0.1, 0.2, 0.2) contains _))
     }
 
+    test("Deleting from a local Dataset with a HashKey only") {
+        val tablename = "DeleteTest1"
+        dynamoDB.createTable(new CreateTableRequest()
+            .withTableName(tablename)
+            .withAttributeDefinitions(new AttributeDefinition("name", "S"))
+            .withKeySchema(new KeySchemaElement("name", "HASH"))
+            .withProvisionedThroughput(new ProvisionedThroughput(5L, 5L)))
+
+        import spark.implicits._
+
+        val newItemsDs = Seq(
+            ("lemon", "yellow", 0.1),
+            ("orange", "orange", 0.2),
+            ("pomegranate", "red", 0.2)
+        ).toDF("name", "color", "weight")
+        newItemsDs.write.dynamodb(tablename)
+
+        val toDelete = newItemsDs.filter("name in ('lemon','orange')")
+        toDelete.write.option("delete", "true").dynamodb(tablename)
+
+        val validationDs = spark.read.dynamodb(tablename)
+        validationDs.count() shouldEqual 1
+        val rec = validationDs.first
+        rec.getString(rec.fieldIndex("name")) shouldEqual "pomegranate"
+        rec.getString(rec.fieldIndex("color")) shouldEqual "red"
+        rec.getDouble(rec.fieldIndex("weight")) shouldEqual 0.2
+    }
+
+    test("Deleting from a local Dataset with a HashKey and RangeKey") {
+        val tablename = "DeleteTest2"
+
+        dynamoDB.createTable(new CreateTableRequest()
+            .withTableName(tablename)
+            .withAttributeDefinitions(Seq(
+                new AttributeDefinition("name", "S"),
+                new AttributeDefinition("weight", "N")
+            ).asJavaCollection)
+            .withKeySchema(Seq(
+                new KeySchemaElement("name", KeyType.HASH),
+                // also test that non-string key works
+                new KeySchemaElement("weight", KeyType.RANGE)
+            ).asJavaCollection)
+            .withProvisionedThroughput(new ProvisionedThroughput(5L, 5L)))
+
+        import spark.implicits._
+
+        val newItemsDs = Seq(
+            ("lemon", "yellow", 0.1),
+            ("lemon", "blue", 4.0),
+            ("orange", "orange", 0.2),
+            ("pomegranate", "red", 0.2)
+        ).toDF("name", "color", "weight")
+        newItemsDs.write.dynamodb(tablename)
+
+        val toDelete = newItemsDs.filter("color in ('yellow','orange')")
+        toDelete.write.option("delete", "true").dynamodb(tablename)
+
+        val validationDs = spark.read.dynamodb(tablename)
+        validationDs.count() shouldEqual 2
+        validationDs.select("name").as[String].collect should contain theSameElementsAs Seq("lemon", "pomegranate")
+        validationDs.select("color").as[String].collect should contain theSameElementsAs Seq("blue", "red")
+    }
+
     test("Updating from a local Dataset with new and only some previous columns") {
         val tablename = "UpdateTest1"
         dynamoDB.createTable(new CreateTableRequest()
@@ -70,13 +137,12 @@ class WriteRelationTest extends AbstractInMemoryTest {
         newItemsDs.write.dynamodb(tablename)
 
         newItemsDs
-            .withColumn("size", length($"color"))
+            .withColumn("size", sqlLength($"color"))
             .drop("color")
            .withColumn("weight", $"weight" * 2)
            .write.option("update", "true").dynamodb(tablename)
 
         val validationDs = spark.read.dynamodb(tablename)
-        validationDs.show
         assert(validationDs.count() === 3)
         assert(validationDs.select("name").as[String].collect().forall(Seq("lemon", "orange", "pomegranate") contains _))
         assert(validationDs.select("color").as[String].collect().forall(Seq("yellow", "orange", "red") contains _))
@@ -103,11 +169,9 @@ class WriteRelationTest extends AbstractInMemoryTest {
 
         val alteredDs = newItemsDs
             .withColumn("weight", when($"weight" < 0.2, $"weight").otherwise(lit(null)))
-        alteredDs.show
         alteredDs.write.option("update", "true").dynamodb(tablename)
 
         val validationDs = spark.read.dynamodb(tablename)
-        validationDs.show
         assert(validationDs.count() === 3)
         assert(validationDs.select("name").as[String].collect().forall(Seq("lemon", "orange", "pomegranate") contains _))
         assert(validationDs.select("color").as[String].collect().forall(Seq("yellow", "orange", "red") contains _))
