The library is available from [Maven Central](https://mvnrepository.com/artifact/com.audienceproject/spark-dynamodb). Add the dependency in SBT as ```"com.audienceproject" %% "spark-dynamodb" % "latest"```
Spark is used in the library as a "provided" dependency, which means Spark has to be installed separately on the container where the application is running, as is the case on AWS EMR.
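
As a minimal sketch, a `build.sbt` could look like the following; the project name and the Scala and Spark versions are assumptions and should be matched to whatever your cluster runs:

```scala
// build.sbt (sketch): versions are assumptions, align them with your cluster.
name := "spark-dynamodb-example" // hypothetical project name

scalaVersion := "2.12.12"

libraryDependencies ++= Seq(
  "com.audienceproject" %% "spark-dynamodb" % "latest",
  // Spark is "provided": needed at compile time, supplied by the runtime (e.g. EMR).
  "org.apache.spark" %% "spark-sql" % "2.4.7" % "provided"
)
```
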
## Parameters
The following parameters can be set as options on the Spark reader and writer object before loading/saving.

- `region` sets the region in which the DynamoDB table resides. Default is environment specific.
- `roleArn` sets an IAM role to assume. This allows for access to a DynamoDB table in a different account than the Spark cluster. Defaults to the standard role configuration.
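
As a sketch of how these common options are passed, assuming a `SparkSession` named `spark` is in scope (the table name and role ARN below are placeholders):

```scala
import com.audienceproject.spark.dynamodb.implicits._

// Read a table that lives in another region/account.
val dynamoDf = spark.read
  .option("region", "eu-west-1")
  .option("roleArn", "arn:aws:iam::123456789012:role/my-dynamodb-role")
  .dynamodb("SomeTableName")

// The same options can be set on the writer before saving.
dynamoDf.write
  .option("region", "eu-west-1")
  .dynamodb("SomeOtherTableName")
```
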
The following parameters can be set as options on the Spark reader object before loading.

- `readPartitions` number of partitions to split the initial RDD when loading the data into Spark. Defaults to the size of the DynamoDB table divided into chunks of `maxPartitionBytes`.
- `maxPartitionBytes` the maximum size of a single input partition. Default 128 MB.
- `targetCapacity` fraction of provisioned read capacity on the table (or index) to consume for reading. Default 1 (i.e. 100% capacity).
- `stronglyConsistentReads` whether or not to use strongly consistent reads. Default false.
- `bytesPerRCU` number of bytes that can be read per second with a single Read Capacity Unit. Default 4000 (4 KB). This value is multiplied by two when `stronglyConsistentReads=false`.
- `filterPushdown` whether or not to use filter pushdown to DynamoDB on scan requests. Default true.
- `throughput` the desired read throughput to use. It overrides any throughput calculation done by the package. It is intended to be used with tables that are on-demand. Defaults to 100 for on-demand.
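
For illustration, a read that tunes some of these knobs might look like the following; the option values and table name are placeholders, not recommendations:

```scala
import com.audienceproject.spark.dynamodb.implicits._

// Illustrative values only; size them to your table and its provisioned capacity.
val df = spark.read
  .option("readPartitions", "16")            // override the maxPartitionBytes-based split
  .option("targetCapacity", "0.5")           // consume at most 50% of provisioned read capacity
  .option("stronglyConsistentReads", "true") // costlier per RCU, see bytesPerRCU above
  .dynamodb("SomeTableName")
```
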
The following parameters can be set as options on the Spark writer object before saving.