
Commit 7a2fe3e

Merge branch 'main' into spark3.2
2 parents: 4fcad8f + 95be89f


49 files changed: +1591 -714 lines

.circleci/config.yml

Lines changed: 11 additions & 4 deletions
@@ -42,15 +42,20 @@ jobs:
             sudo apt install -y openjdk-8-jdk zip
             sudo update-alternatives --install /usr/bin/java java /usr/lib/jvm/java-8-openjdk-amd64/bin/java 1
             sudo update-alternatives --set java /usr/lib/jvm/java-8-openjdk-amd64/bin/java
-            java -version
-      - run: docker build . -t ldbc/spark
+      - run:
+          name: Build Docker container
+          command: |
+            docker build . -t ldbc/spark
       - restore_cache:
           keys:
             - m2-dep-branch:{{ .Branch }}-pom:{{ checksum "pom.xml" }}-
             - m2-dep-branch:dev-pom:{{ checksum "pom.xml" }}-
             - m2-dep-branch:{{ .Branch }}-
             - m2-dep-branch:dev-
-      - run: mvn -ntp clean test-compile assembly:assembly
+      - run:
+          name: Build JAR file
+          command: |
+            mvn -ntp clean test-compile assembly:assembly
       - save_cache:
           key: m2-dep-branch:{{ .Branch }}-pom:{{ checksum "pom.xml" }}-{{ epoch }}
           paths:
@@ -74,6 +79,7 @@ jobs:
       - run:
           name: Generate SF0.003 / BI / composite-merged CSVs
           command: |
+            # we generate factors here but they are moved to a separate archive (social-network-sf0.003-bi-factors.zip)
             tools/docker-run.sh --mode bi --scale-factor 0.003 --generate-factors
             mv out/ social-network-sf0.003-bi-composite-merged-fk/
       - run:
@@ -118,7 +124,8 @@ jobs:
             # include the CircleCI configuration in the deployed package to provide the 'filters' instructions (and prevent failed builds on the gh-pages branch)
             mv .circleci dist/
             # move factors to a separate directory
-            mv social-network-sf0.003-bi-composite-merged-fk/factors social-network-sf0.003-bi-factors
+            mkdir social-network-sf0.003-bi-factors
+            mv social-network-sf0.003-bi-composite-merged-fk/factors social-network-sf0.003-bi-factors/factors
             # compress each directory
             for d in social-network-sf0.003*; do
               echo "Generated with <https://github.com/ldbc/ldbc_snb_datagen_spark/commit/${CIRCLE_SHA1}>" > $d/README.md

.gitignore

Lines changed: 4 additions & 1 deletion
@@ -44,7 +44,10 @@ local.properties
 
 /*.crc
 /*.csv
-out/
+/out/
+/out-*/
+/out.tar.zst
+/out-*.tar.zst
 datagen_output/
 
 /sf*/

NOTICE.txt

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
-Copyright [2020-]2021 Linked Data Benchmark Council
+Copyright [2020-]2022 Linked Data Benchmark Council
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.

README.md

Lines changed: 40 additions & 30 deletions
@@ -4,18 +4,16 @@
 
 [![Build Status](https://circleci.com/gh/ldbc/ldbc_snb_datagen_spark.svg?style=svg)](https://circleci.com/gh/ldbc/ldbc_snb_datagen_spark)
 
-Datagen is part of the [LDBC project](https://ldbcouncil.org/).
+The LDBC SNB Data Generator (Datagen) produces the datasets for the [LDBC Social Network Benchmark's workloads](https://ldbcouncil.org/benchmarks/snb/). The generator is designed to produce directed labelled graphs that mimic the characteristics of those graphs of real data. A detailed description of the schema produced by Datagen, as well as the format of the output files, can be found in the latest version of the official [LDBC SNB specification document](https://github.com/ldbc/ldbc_snb_docs).
 
 :scroll: If you wish to cite the LDBC SNB, please refer to the [documentation repository](https://github.com/ldbc/ldbc_snb_docs#how-to-cite-ldbc-benchmarks).
 
 :warning: There are two different versions of the Datagen:
 
-* The [Hadoop-based Datagen](https://github.com/ldbc/ldbc_snb_datagen_hadoop/) generates the Interactive SF1-1000 data sets
+* The [Hadoop-based Datagen](https://github.com/ldbc/ldbc_snb_datagen_hadoop/) generates the Interactive workload's SF1-1000 data sets.
 * For the BI workload, use the Spark-based Datagen (in this repository).
 * For the Interactive workload's larger data sets, there is no out-of-the-box solution (see [this issue](https://github.com/ldbc/ldbc_snb_interactive/issues/173)).
 
-The LDBC SNB Data Generator (Datagen) is responsible for providing the datasets used by all the LDBC benchmarks. This data generator is designed to produce directed labelled graphs that mimic the characteristics of those graphs of real data. A detailed description of the schema produced by Datagen, as well as the format of the output files, can be found in the latest version of official [LDBC SNB specification document](https://github.com/ldbc/ldbc_snb_docs).
-
 [Generated small data sets](https://ldbcouncil.org/ldbc_snb_datagen_spark/) are deployed by the CI.
 
 ## Quick start
@@ -27,7 +25,7 @@ You can build the JAR with both Maven and SBT.
 * To assemble the JAR file with Maven, run:
 
   ```bash
-  tools/build.sh
+  ./tools/build.sh
   ```
 
 * For faster builds during development, consider using SBT. To assemble the JAR file with SBT, run:
@@ -45,48 +43,52 @@ and install the dependencies.
 
 E.g. with [pyenv](https://github.com/pyenv/pyenv) and [pyenv-virtualenv](https://github.com/pyenv/pyenv-virtualenv):
 ```bash
-pyenv install 3.7.7
-pyenv virtualenv 3.7.7 ldbc_datagen_tools
+pyenv install 3.7.13
+pyenv virtualenv 3.7.13 ldbc_datagen_tools
 pyenv local ldbc_datagen_tools
 pip install -U pip
 pip install ./tools
 ```
 ### Running locally
 
-The `tools/run.py` is intended for **local runs**. To use it, download and extract Spark as follows.
+The `./tools/run.py` script is intended for **local runs**. To use it, download and extract Spark as follows.
 
 #### Spark 3.2.x
 
 Spark 3.2.x is the recommended runtime to use. The rest of the instructions are provided assuming Spark 3.2.x.
 
+To place Spark under `/opt/`:
+
 ```bash
 curl https://downloads.apache.org/spark/spark-3.2.0/spark-3.2.0-bin-hadoop3.2.tgz | sudo tar -xz -C /opt/
 export SPARK_HOME="/opt/spark-3.2.0-bin-hadoop3.2"
 export PATH="$SPARK_HOME/bin":"$PATH"
 ```
 
-Both Java 8 and Java 11 work.
-
-To build, run
+To place Spark under `~/`:
 
 ```bash
-tools/build.sh
+curl https://downloads.apache.org/spark/spark-3.2.0/spark-3.2.0-bin-hadoop3.2.tgz | tar -xz -C ~/
+export SPARK_HOME=~/spark-3.2.0-bin-hadoop3.2
+export PATH="$SPARK_HOME/bin":"$PATH"
 ```
 
-Run the script with:
+Both Java 8 and Java 11 are supported.
+
+Once you have Spark in place and built the JAR file, run the generator as follows:
 
 ```bash
 export PLATFORM_VERSION=2.12_spark3.2
 export DATAGEN_VERSION=0.5.0-SNAPSHOT
-tools/run.py ./target/ldbc_snb_datagen_${PLATFORM_VERSION}-${DATAGEN_VERSION}.jar <runtime configuration arguments> -- <generator configuration arguments>
+./tools/run.py ./target/ldbc_snb_datagen_${PLATFORM_VERSION}-${DATAGEN_VERSION}.jar <runtime configuration arguments> -- <generator configuration arguments>
 ```
 
 #### Runtime configuration arguments
 
 The runtime configuration arguments determine the amount of memory, the number of threads, and the degree of parallelism. For a list of arguments, see:
 
 ```bash
-tools/run.py --help
+./tools/run.py --help
 ```
 
 To generate a single `part-*.csv` file, reduce the parallelism (number of Spark partitions) to 1.
@@ -104,12 +106,6 @@ To get a complete list of the arguments, pass `--help` to the JAR file:
 ./tools/run.py ./target/ldbc_snb_datagen_${PLATFORM_VERSION}-${DATAGEN_VERSION}.jar -- --help
 ```
 
-* Passing `params.ini` files:
-
-  ```bash
-  ./tools/run.py ./target/ldbc_snb_datagen_${PLATFORM_VERSION}-${DATAGEN_VERSION}.jar -- --format csv --param-file params.ini
-  ```
-
 * Generating `CsvBasic` files in **Interactive mode**:
 
   ```bash
@@ -122,12 +118,24 @@ To get a complete list of the arguments, pass `--help` to the JAR file:
 ./tools/run.py ./target/ldbc_snb_datagen_${PLATFORM_VERSION}-${DATAGEN_VERSION}.jar -- --format csv --scale-factor 0.003 --mode bi --format-options compression=gzip
 ```
 
+* Generating `CsvCompositeMergeForeign` files in **BI mode** and generating factors:
+
+  ```bash
+  ./tools/run.py ./target/ldbc_snb_datagen_${PLATFORM_VERSION}-${DATAGEN_VERSION}.jar -- --format csv --scale-factor 0.003 --mode bi --generate-factors
+  ```
+
 * Generating CSVs in **raw mode**:
 
   ```bash
  ./tools/run.py ./target/ldbc_snb_datagen_${PLATFORM_VERSION}-${DATAGEN_VERSION}.jar -- --format csv --scale-factor 0.003 --mode raw --output-dir sf0.003-raw
  ```
 
+* Generating Parquet files:
+
+  ```bash
+  ./tools/run.py ./target/ldbc_snb_datagen_${PLATFORM_VERSION}-${DATAGEN_VERSION}.jar -- --format parquet --scale-factor 0.003 --mode bi
+  ```
+
 * For the `interactive` and `bi` formats, the `--format-options` argument allows passing formatting options such as timestamp/date formats, the presence/absence of headers (see the [Spark formatting options](https://spark.apache.org/docs/2.4.8/api/scala/index.html#org.apache.spark.sql.DataFrameWriter) for details), and whether quoting the fields in the CSV is required:
 
 ```bash
@@ -143,29 +151,31 @@ export SPARK_CONF_DIR=./conf
 ./tools/run.py ./target/ldbc_snb_datagen_${PLATFORM_VERSION}-${DATAGEN_VERSION}.jar --parallelism 4 --memory 8G -- --format csv --format-options timestampFormat=MM/dd/y\ HH:mm:ss,dateFormat=MM/dd/y --explode-edges --explode-attrs --mode interactive --scale-factor 0.003
 ```
 
+It is also possible to pass a parameter file:
+
+```bash
+./tools/run.py ./target/ldbc_snb_datagen_${PLATFORM_VERSION}-${DATAGEN_VERSION}.jar -- --format csv --param-file params.ini
+```
+
 ### Docker image
 
 <!-- SNB Datagen images are available via [Docker Hub](https://hub.docker.com/r/ldbc/datagen/) (currently outdated). -->
 
 The Docker image can be built with the provided Dockerfile. To build, execute the following command from the repository directory:
 
 ```bash
-tools/docker-build.sh
+./tools/docker-build.sh
 ```
 
-See [Build the JAR](#build-the-jar) to build the library. Then, run the following:
+See [Build the JAR](#build-the-jar) to build the library (e.g. by invoking `./tools/build.sh`). Then, run the following:
 
 ```bash
-tools/docker-run.sh
+./tools/docker-run.sh
 ```
 
 ### Elastic MapReduce
 
-We provide scripts to run Datagen on AWS EMR. See the README in the [`tools/emr`](tools/emr) directory for details.
-
-## Larger scale factors
-
-The scale factors SF3k+ are currently being fine-tuned, both regarding optimizing the generator and also for tuning the distributions.
+We provide scripts to run Datagen on AWS EMR. See the README in the [`./tools/emr`](tools/emr) directory for details.
 
 ## Graph schema
 
@@ -177,4 +187,4 @@ The graph schema is as follows:
 
 * When running the tests, they might throw a `java.net.UnknownHostException: your_hostname: your_hostname: Name or service not known` coming from `org.apache.hadoop.mapreduce.JobSubmitter.submitJobInternal`. The solution is to add an entry for your machine's hostname to the `/etc/hosts` file: `127.0.1.1 your_hostname`.
 * If you are using Docker and Spark runs out of space, make sure that Docker has enough space to store its containers. To move the location of the Docker containers to a larger disk, stop Docker, edit (or create) the `/etc/docker/daemon.json` file and add `{ "data-root": "/path/to/new/docker/data/dir" }`, then sync the old folder if needed, and restart Docker. (See [more detailed instructions](https://www.guguweb.com/2019/02/07/how-to-move-docker-data-directory-to-another-location-on-ubuntu/)).
-* If you are using a local Spark installation and run out of space in `/tmp`, set the `SPARK_LOCAL_DIRS` to point to a directory with enough free space.
+* If you are using a local Spark installation and run out of space in `/tmp` (`java.io.IOException: No space left on device`), set the `SPARK_LOCAL_DIRS` environment variable to point to a directory with enough free space.
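
For the `UnknownHostException` item above, the failing lookup can be reproduced (and the `/etc/hosts` fix verified) outside Hadoop with a few lines of plain Java, since `InetAddress.getLocalHost()` performs the same local-hostname resolution that the job submitter relies on. This is a minimal diagnostic sketch; the `HostnameLookupCheck` class name is ours, not part of the repository:

```java
import java.net.InetAddress;
import java.net.UnknownHostException;

public class HostnameLookupCheck {
    public static void main(String[] args) {
        try {
            // Hadoop's JobSubmitter performs a lookup like this during job submission.
            InetAddress local = InetAddress.getLocalHost();
            System.out.println("Resolved " + local.getHostName() + " -> " + local.getHostAddress());
        } catch (UnknownHostException e) {
            // The failure described above: add "127.0.1.1 your_hostname" to /etc/hosts.
            System.err.println("Hostname lookup failed: " + e.getMessage());
        }
    }
}
```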

code_of_conduct.md

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+For our code of conduct, see: https://github.com/ldbc/community/blob/main/code_of_conduct.md

contributing.md

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+For our contributor's guide, see: https://github.com/ldbc/community/blob/main/contributing.md

dist/README.md

Lines changed: 3 additions & 3 deletions
@@ -1,12 +1,12 @@
 # LDBC SNB Datagen (Spark variant) – Latest artefacts
 
-This README is deployed to <http://ldbcouncil.org/ldbc_snb_datagen_spark>.
+This README is deployed to <https://ldbcouncil.org/ldbc_snb_datagen_spark>.
 
 ## Generated data sets
 
-The following data sets are generated for the `dev` variant, to be used for the BI workload.
+The following data sets were generated for the **LDBC Social Network Benchmark's BI (Business Intelligence) workload** by the latest commit at <https://github.com/ldbc/ldbc_snb_datagen_spark>.
 
-If you are looking for data sets to implement the Interactive workload, please use the [Hadoop-based legacy Datagen](https://github.com/ldbc/ldbc_snb_datagen_hadoop) or reach out to us.
+If you are looking for data sets of the **SNB Interactive workload**, please use the [legacy Hadoop-based Datagen](https://github.com/ldbc/ldbc_snb_datagen_hadoop) or download them from the [SURF/CWI data repository](https://hdl.handle.net/11112/e6e00558-a2c3-9214-473e-04a16de09bf8).
 
 {% for file in site.static_files %}
 {% if file.extname == ".zip" -%}

src/main/java/ldbc/snb/datagen/entities/statictype/place/PopularPlace.java

Lines changed: 9 additions & 3 deletions
@@ -47,14 +47,20 @@ public PopularPlace(String name, double latitude, double longitude) {
         this.longitude = longitude;
     }
 
-    public String getName() { return name; }
+    public String getName() {
+        return name;
+    }
 
     public void setName(String name) {
         this.name = name;
     }
 
-    public double getLatitude() { return latitude; }
+    public double getLatitude() {
+        return latitude;
+    }
 
-    public double getLongitude() { return longitude; }
+    public double getLongitude() {
+        return longitude;
+    }
 
 }

src/main/java/ldbc/snb/datagen/generator/generators/PersonGenerator.java

Lines changed: 5 additions & 2 deletions
@@ -151,11 +151,14 @@ private Person generatePerson() {
         base = base.replaceAll("\\p{InCombiningDiacriticalMarks}+", "");
         base = base.replaceAll(" ", ".");
         base = base.replaceAll("[.]+", ".");
-        for (int i = 0; i < numEmails; i++) {
+        while (person.getEmails().size() < numEmails) {
             String email = base + "" + person.getAccountId() + "@" +
                     Dictionaries.emails.getRandomEmail(randomFarm.get(RandomGeneratorFarm.Aspect.TOP_EMAIL),
                             randomFarm.get(RandomGeneratorFarm.Aspect.EMAIL));
-            person.getEmails().add(email);
+            // avoid duplicates
+            if (!person.getEmails().contains(email)) {
+                person.getEmails().add(email);
+            }
         }
 
         // Set class year
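
The switch from a fixed-count `for` loop to a `while` loop over the collection size guarantees that each person ends up with exactly `numEmails` distinct addresses: under the old loop, a discarded duplicate draw would simply have left the person with fewer emails. Below is a minimal, self-contained sketch of the same pattern; the `EmailDedupSketch` class and its domain pool are illustrative stand-ins, not Datagen code.

```java
import java.util.ArrayList;
import java.util.List;
import java.util.Random;

public class EmailDedupSketch {
    // Illustrative stand-in for Dictionaries.emails.getRandomEmail(...)
    private static final String[] DOMAINS = {"example.com", "example.org", "example.net", "example.edu"};

    private static String randomDomain(Random random) {
        return DOMAINS[random.nextInt(DOMAINS.length)];
    }

    public static void main(String[] args) {
        Random random = new Random(42);
        long accountId = 1234L;
        String base = "jane.doe";
        int numEmails = 3; // must not exceed the number of distinct addresses the pool can yield

        List<String> emails = new ArrayList<>();
        // A fixed-count for-loop draws numEmails times, so a discarded duplicate
        // leaves fewer than numEmails addresses. Looping on the collection size
        // instead retries until numEmails distinct addresses are collected.
        while (emails.size() < numEmails) {
            String email = base + accountId + "@" + randomDomain(random);
            if (!emails.contains(email)) { // skip duplicate draws
                emails.add(email);
            }
        }
        System.out.println(emails);
    }
}
```

Note that such a loop terminates only if at least `numEmails` distinct addresses can be drawn, which the email dictionary presumably guarantees here.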
