
Commit 8570f78

Merge pull request #338 from ldbc/default-spark3.1
2 parents: b64516e + 57054b7

10 files changed: +63 −40 lines

.circleci/config.yml

Lines changed: 3 additions & 0 deletions
```diff
@@ -29,6 +29,9 @@ jobs:
   test:
     resource_class: xlarge
     executor: my-executor
+    environment:
+      PLATFORM_VERSION: 2.12_spark3.1
+      DATAGEN_VERSION: 0.4.0-SNAPSHOT
     steps:
       - checkout
       - run: |
```
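These variables mirror the local workflow described in the README. As a minimal sketch, a CI step consuming them might run the following (the actual `run` step is truncated above, so this is illustrative only):

```bash
# Illustrative CI step: build the default Spark 3.1 artifact and smoke-test it,
# using the PLATFORM_VERSION / DATAGEN_VERSION values exported by the job.
tools/build.sh
tools/run.py ./target/ldbc_snb_datagen_${PLATFORM_VERSION}-${DATAGEN_VERSION}.jar \
  --parallelism 1 -- --format csv --scale-factor 0.003 --mode interactive
```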

Dockerfile

Lines changed: 1 addition & 1 deletion
```diff
@@ -1,4 +1,4 @@
-FROM bde2020/spark-master:2.4.5-hadoop2.7
+FROM bde2020/spark-master:3.1.1-hadoop3.2
 
 ENV GOSU_VERSION 1.12
 
```
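The base image moves from Spark 2.4.5 / Hadoop 2.7 to Spark 3.1.1 / Hadoop 3.2. Assuming the image is tagged `ldbc/spark`, as `tools/docker-run.sh` expects, it can be rebuilt with:

```bash
# Rebuild the Datagen runtime image on the Spark 3.1.1 / Hadoop 3.2 base.
docker build -t ldbc/spark .
```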

README.md

Lines changed: 25 additions & 15 deletions
````diff
@@ -55,51 +55,61 @@ pip install ./tools
 
 The `tools/run.py` is intended for **local runs**. To use it, download and extract Spark as follows.
 
-#### Spark 2.4.x
+#### Spark 3.1.x
+
+Spark 3.1.x is the recommended runtime to use. The rest of the instructions are provided assuming Spark 3.1.x.
 
 ```bash
-curl https://archive.apache.org/dist/spark/spark-2.4.8/spark-2.4.8-bin-hadoop2.7.tgz | sudo tar -xz -C /opt/
-export SPARK_HOME="/opt/spark-2.4.8-bin-hadoop2.7"
+curl https://downloads.apache.org/spark/spark-3.1.2/spark-3.1.2-bin-hadoop3.2.tgz | sudo tar -xz -C /opt/
+export SPARK_HOME="/opt/spark-3.1.2-bin-hadoop3.2"
 export PATH="$SPARK_HOME/bin":"$PATH"
 ```
 
-Make sure you use Java 8.
+Both Java 8 and Java 11 work.
+
+To build, run
+
+```bash
+tools/build.sh
+```
 
 Run the script with:
 
 ```bash
-export PLATFORM_VERSION=2.11_spark2.4
+export PLATFORM_VERSION=2.12_spark3.1
 export DATAGEN_VERSION=0.4.0-SNAPSHOT
-
 tools/run.py ./target/ldbc_snb_datagen_${PLATFORM_VERSION}-${DATAGEN_VERSION}.jar <runtime configuration arguments> -- <generator configuration arguments>
 ```
 
-#### Spark 3.1.x
+#### Older Spark versions
+
+##### Spark 2.4.x
+
+Spark 2.4.x with Hadoop 2.7 (Scala 2.11 / JVM 8) is supported, but it is recommended to switch to Spark 3.
 
 ```bash
-curl https://downloads.apache.org/spark/spark-3.1.2/spark-3.1.2-bin-hadoop2.7.tgz | sudo tar -xz -C /opt/
-export SPARK_HOME="/opt/spark-3.1.2-bin-hadoop2.7"
+curl https://archive.apache.org/dist/spark/spark-2.4.8/spark-2.4.8-bin-hadoop2.7.tgz | sudo tar -xz -C /opt/
+export SPARK_HOME="/opt/spark-2.4.8-bin-hadoop2.7"
 export PATH="$SPARK_HOME/bin":"$PATH"
 ```
 
-Both Java 8 and Java 11 work.
+Make sure you use Java 8.
 
 To build, run
 
 ```bash
-tools/build.sh -Pspark3.1
+tools/build.sh -Pspark2.4
 ```
 
 Run the script with:
 
 ```bash
-export PLATFORM_VERSION=2.12_spark3.1
+export PLATFORM_VERSION=2.11_spark2.4
 export DATAGEN_VERSION=0.4.0-SNAPSHOT
+
 tools/run.py ./target/ldbc_snb_datagen_${PLATFORM_VERSION}-${DATAGEN_VERSION}.jar <runtime configuration arguments> -- <generator configuration arguments>
 ```
 
-The rest of the instructions are provided assuming Spark 2.4.x.
-
 #### Runtime configuration arguments
 
 The runtime configuration arguments determine the amount of memory, number of threads, degree of parallelism. For a list of arguments, see:
@@ -111,7 +121,7 @@ tools/run.py --help
 To generate a single `part-*.csv` file, reduce the parallelism (number of Spark partitions) to 1.
 
 ```bash
-./tools/run.py ./target/ldbc_snb_datagen_2.11_spark2.4-0.4.0-SNAPSHOT.jar --parallelism 1 -- --format csv --scale-factor 0.003 --mode interactive
+./tools/run.py ./target/ldbc_snb_datagen_${PLATFORM_VERSION}-${DATAGEN_VERSION}.jar --parallelism 1 -- --format csv --scale-factor 0.003 --mode interactive
 ```
 #### Generator configuration arguments
 
````
pom.xml

Lines changed: 15 additions & 8 deletions
```diff
@@ -14,10 +14,10 @@
         <java.version>1.8</java.version>
         <maven.compiler.source>${java.version}</maven.compiler.source>
         <maven.compiler.target>${java.version}</maven.compiler.target>
-        <scala.version>2.11.12</scala.version>
-        <scala.compat.version>2.11</scala.compat.version>
-        <spark.version>2.4.5</spark.version>
-        <spark.compat.version>2.4</spark.compat.version>
+        <scala.version>2.12.15</scala.version>
+        <scala.compat.version>2.12</scala.compat.version>
+        <spark.version>3.1.2</spark.version>
+        <spark.compat.version>3.1</spark.compat.version>
         <spec2.version>4.2.0</spec2.version>
     </properties>
 
@@ -296,10 +296,17 @@
         <profile>
             <id>spark3.1</id>
             <properties>
-                <scala.version>2.12.12</scala.version>
-                <scala.compat.version>2.12</scala.compat.version>
-                <spark.version>3.1.1</spark.version>
-                <spark.compat.version>3.1</spark.compat.version>
+                <!-- This is the default profile. -->
+            </properties>
+        </profile>
+        <profile>
+            <id>spark2.4</id>
+            <properties>
+                <java.version>1.8</java.version>
+                <scala.version>2.11.12</scala.version>
+                <scala.compat.version>2.11</scala.compat.version>
+                <spark.version>2.4.8</spark.version>
+                <spark.compat.version>2.4</spark.compat.version>
             </properties>
         </profile>
     </profiles>
```
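The Spark 3.1 properties are promoted to the build defaults, and the `spark2.4` profile restores the old toolchain. Per the README, profile selection goes through `tools/build.sh`:

```bash
# Default build: Scala 2.12 / Spark 3.1.2 (no profile flag needed).
tools/build.sh
# Legacy build: Scala 2.11 / Spark 2.4.8 on Java 8.
tools/build.sh -Pspark2.4
```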

src/main/java/ldbc/snb/datagen/dictionary/NamesDictionary.java

Lines changed: 2 additions & 2 deletions
The static constant `INVALID_LOCATION` is now referenced through its defining class, `PlaceDictionary`, rather than through the `locationDic` instance, fixing the "static member accessed via instance reference" compiler warning.

```diff
@@ -101,7 +101,7 @@ public void extractSurNames() {
                 String infos[] = line.split(",");
                 String locationName = infos[1];
                 int locationId = locationDic.getCountryId(locationName);
-                if (locationId != locationDic.INVALID_LOCATION) {
+                if (locationId != PlaceDictionary.INVALID_LOCATION) {
                     String surName = infos[2].trim();
                     surNamesByLocations.get(locationId).add(surName);
                     totalSurNames++;
@@ -127,7 +127,7 @@ public void extractGivenNames() {
                 int gender = Integer.parseInt(infos[2]);
                 int birthYearPeriod = Integer.parseInt(infos[3]);
                 int locationId = locationDic.getCountryId(locationName);
-                if (locationId != locationDic.INVALID_LOCATION) {
+                if (locationId != PlaceDictionary.INVALID_LOCATION) {
                     String givenName = infos[1].trim();
                     if (gender == 0) {
                         givenNamesByLocationsMale.get(birthYearPeriod).get(locationId).add(givenName);
```

src/main/scala/ldbc/snb/datagen/generation/generator/SparkKnowsGenerator.scala

Lines changed: 2 additions & 2 deletions
Both hunks replace APIs deprecated since Java 9, in line with the README's new claim that Java 11 works: the boxed `java.lang.Float` constructor gives way to `Float.box`, and `Class#newInstance` to `Class#getConstructor().newInstance()`.

```diff
@@ -27,7 +27,7 @@ object SparkKnowsGenerator {
     val indexed = ranker(persons)
       .map { case (k, v) => (k / blockSize, (k, v)) }
 
-    val percentagesJava = percentages.map(new java.lang.Float(_)).asJava
+    val percentagesJava = percentages.map(Float.box).asJava
 
     indexed
       // groupByKey wouldn't guarantee keeping the order inside groups
@@ -40,7 +40,7 @@
       .mapPartitions(groups => {
         DatagenContext.initialize(conf)
         val knowsGeneratorClass = Class.forName(knowsGeneratorClassName)
-        val knowsGenerator = knowsGeneratorClass.newInstance().asInstanceOf[KnowsGenerator]
+        val knowsGenerator = knowsGeneratorClass.getConstructor().newInstance().asInstanceOf[KnowsGenerator]
         knowsGenerator.initialize(conf)
         val personSimilarity = DatagenParams.getPersonSimularity
 
```

tools/datagen/lib.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -1,3 +1,3 @@
-platform_version = "2.11_spark2.4"
+platform_version = "2.12_spark3.1"
 version = "0.4.0-SNAPSHOT"
 main_class = 'ldbc.snb.datagen.spark.LdbcDatagen'
```

tools/docker-run.sh

Lines changed: 2 additions & 2 deletions
```diff
@@ -1,14 +1,14 @@
 #!/bin/bash
 
-[ ! -f target/ldbc_snb_datagen_2.11_spark2.4-0.4.0-SNAPSHOT-jar-with-dependencies.jar ] && echo "target/ldbc_snb_datagen_2.11_spark2.4-0.4.0-SNAPSHOT-jar-with-dependencies.jar does not exist, exiting" && exit 1
+[ ! -f target/ldbc_snb_datagen_${PLATFORM_VERSION}-${DATAGEN_VERSION}-jar-with-dependencies.jar ] && echo "target/ldbc_snb_datagen_${PLATFORM_VERSION}-${DATAGEN_VERSION}-jar-with-dependencies.jar does not exist, exiting" && exit 1
 
 # make sure that out directory exists and clean previously generated data
 mkdir -p out/
 rm -rf out/*
 docker run \
   --env uid=`id -u` \
   --volume `pwd`/out:/mnt/data \
-  --volume `pwd`/target/ldbc_snb_datagen_2.11_spark2.4-0.4.0-SNAPSHOT-jar-with-dependencies.jar:/mnt/datagen.jar \
+  --volume `pwd`/target/ldbc_snb_datagen_${PLATFORM_VERSION}-${DATAGEN_VERSION}-jar-with-dependencies.jar:/mnt/datagen.jar \
   ldbc/spark \
   --output-dir /mnt/data \
   ${@} # pass arguments of this script to the submit.sh script (Docker entrypoint)
```
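Since the script now derives the JAR path from environment variables, both must be exported before it is invoked. A sketch (the generator arguments are illustrative and are passed through to the container entrypoint):

```bash
export PLATFORM_VERSION=2.12_spark3.1
export DATAGEN_VERSION=0.4.0-SNAPSHOT
tools/build.sh   # assumed to produce the jar-with-dependencies artifact
tools/docker-run.sh --format csv --scale-factor 0.003 --mode interactive
```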

tools/emr/README.md

Lines changed: 9 additions & 6 deletions
````diff
@@ -42,7 +42,7 @@ pip install -e .
 1. Upload the JAR to S3. (We don't version the JARs yet, so you can only make sure that you run the intended code this way :( )
 
 ```bash
-PLATFORM_VERSION=2.11_spark2.4 # use 2.12_spark3.1 if you want to run on emr-6.3.0
+PLATFORM_VERSION=2.12_spark3.1
 VERSION=0.4.0-SNAPHOT
 aws s3 cp target/ldbc_snb_datagen_${PLATFORM_VERSION}-${VERSION}-jar-with-dependencies.jar s3://${BUCKET_NAME}/jars/ldbc_snb_datagen_${PLATFORM_VERSION}-${VERSION}-jar-with-dependencies.jar
 ```
@@ -65,14 +65,17 @@ To use spot instances, add the `--use-spot` argument:
 ./tools/emr/submit_datagen_job.py --use-spot --bucket ${BUCKET_NAME} ${JOB_NAME} ${SCALE_FACTOR} csv raw
 ```
 
-### Using a different EMR version
+### Using a different Spark / EMR version
+
 
-We use EMR 5.13.0 by default. You can try out `emr-6.3.0` by specifying it with the `--emr-version` option.
-Make sure you uploaded the right JAR first!
+
+We use EMR 6.3.0 by default, which contains Spark 3.1. You can use a different version by specifying it with the `--emr-version` option.
+EMR 5.33.0 is the recommended EMR version to be used with Spark 2.4.
+Make sure that you have uploaded the right JAR first!
 
 ```bash
-PLATFORM_VERSION=2.12_spark3.1
-./tools/emr/submit_datagen_job.py --bucket ${BUCKET_NAME} --platform-version ${PLATFORM_VERSION} --emr-release emr-6.3.0 ${JOB_NAME} ${SCALE_FACTOR} csv raw
+PLATFORM_VERSION=2.11_spark2.4
+./tools/emr/submit_datagen_job.py --bucket ${BUCKET_NAME} --platform-version ${PLATFORM_VERSION} --emr-release emr-5.33.0 ${JOB_NAME} ${SCALE_FACTOR} csv raw
 ```
 
 ### Using a parameter file
````
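With `emr-6.3.0` now the default release and `2.12_spark3.1` the default platform, a Spark 3.1 job needs no version flags. A sketch combining the upload and submit commands above:

```bash
# Upload the default (2.12_spark3.1) JAR, then submit with the new defaults.
PLATFORM_VERSION=2.12_spark3.1
VERSION=0.4.0-SNAPSHOT
aws s3 cp target/ldbc_snb_datagen_${PLATFORM_VERSION}-${VERSION}-jar-with-dependencies.jar \
  s3://${BUCKET_NAME}/jars/ldbc_snb_datagen_${PLATFORM_VERSION}-${VERSION}-jar-with-dependencies.jar
./tools/emr/submit_datagen_job.py --bucket ${BUCKET_NAME} ${JOB_NAME} ${SCALE_FACTOR} csv raw
```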

tools/emr/submit_datagen_job.py

Lines changed: 3 additions & 3 deletions
```diff
@@ -30,7 +30,7 @@
     'az': 'us-west-2c',
     'yes': False,
     'ec2_key': None,
-    'emr_release': 'emr-5.31.0'
+    'emr_release': 'emr-6.3.0'
 }
 
 pp = pprint.PrettyPrinter(indent=2)
@@ -238,13 +238,13 @@ def submit_datagen_job(name,
                         help='EC2 key name for SSH connection')
     parser.add_argument('--platform-version',
                         default=defaults['platform_version'],
-                        help='The spark platform the JAR is compiled for formatted like {scala.compat.version}_spark{spark.comapt.version}, e.g. 2.11_spark2.4, 2.12_spark3.1')
+                        help='The spark platform the JAR is compiled for formatted like {scala.compat.version}_spark{spark.compat.version}, e.g. 2.11_spark2.4, 2.12_spark3.1')
     parser.add_argument('--version',
                         default=defaults['version'],
                         help='LDBC SNB Datagen library version')
     parser.add_argument('--emr-release',
                         default=defaults['emr_release'],
-                        help='The EMR release to use. E.g emr-5.31.0, emr-6.1.0')
+                        help='The EMR release to use. E.g emr-5.33.0, emr-6.3.0')
     parser.add_argument('-y', '--yes',
                         default=defaults['yes'],
                         action='store_true',
```
