
Commit 2b22c43

Merge pull request #408 from ldbc/docker-image
Improve container support
2 parents: bf02fde + 37f05df

File tree

10 files changed: +129 -101 lines changed

.circleci/config.yml

Lines changed: 22 additions & 39 deletions
@@ -22,7 +22,8 @@ workflows:
 executors:
   my-executor:
     machine:
-      image: ubuntu-2004:202008-01
+      image: ubuntu-2204:2022.04.1
+      docker_layer_caching: true
     working_directory: ~/ldbc/ldbc_snb_datagen

 jobs:
@@ -33,104 +34,86 @@ jobs:
       DATAGEN_VERSION: 0.5.0-SNAPSHOT
     steps:
       - checkout
-      - run:
-          name: Install dependencies
-          command: |
-            mkdir out/
-            DEBIAN_FRONTEND=noninteractive
-            sudo apt update
-            sudo apt install -y openjdk-8-jdk zip
-            sudo update-alternatives --install /usr/bin/java java /usr/lib/jvm/java-8-openjdk-amd64/bin/java 1
-            sudo update-alternatives --set java /usr/lib/jvm/java-8-openjdk-amd64/bin/java
       - run:
           name: Build Docker container
           command: |
-            docker build . -t ldbc/spark
-      - restore_cache:
-          keys:
-            - m2-dep-branch:{{ .Branch }}-pom:{{ checksum "pom.xml" }}-
-            - m2-dep-branch:dev-pom:{{ checksum "pom.xml" }}-
-            - m2-dep-branch:{{ .Branch }}-
-            - m2-dep-branch:dev-
-      - run:
-          name: Build JAR file
-          command: |
-            mvn -ntp clean test-compile assembly:assembly
-      - save_cache:
-          key: m2-dep-branch:{{ .Branch }}-pom:{{ checksum "pom.xml" }}-{{ epoch }}
-          paths:
-            - ~/.m2/repository # maven deps
+            docker build . --target=standalone -t ldbc/datagen-standalone:latest
       # BI
-      - run:
+      - run:
           name: Generate SF0.003 / BI / singular-projected CSVs
           command: |
-            tools/docker-run.sh --mode bi --scale-factor 0.003 --explode-edges --explode-attrs
+            tools/docker-run.sh -- --mode bi --scale-factor 0.003 --explode-edges --explode-attrs
             mv out/ social-network-sf0.003-bi-singular-projected-fk/
       - run:
           name: Generate SF0.003 / BI / singular-merged CSVs
           command: |
-            tools/docker-run.sh --mode bi --scale-factor 0.003 --explode-attrs
+            tools/docker-run.sh -- --mode bi --scale-factor 0.003 --explode-attrs
             mv out/ social-network-sf0.003-bi-singular-merged-fk/
       - run:
           name: Generate SF0.003 / BI / composite-projected CSVs
           command: |
-            tools/docker-run.sh --mode bi --scale-factor 0.003 --explode-edges
+            tools/docker-run.sh -- --mode bi --scale-factor 0.003 --explode-edges
             mv out/ social-network-sf0.003-bi-composite-projected-fk/
       - run:
           name: Generate SF0.003 / BI / composite-merged CSVs
           command: |
             # we generate factors here but they are moved to a separate archive (social-network-sf0.003-bi-factors.zip)
-            tools/docker-run.sh --mode bi --scale-factor 0.003 --generate-factors
+            tools/docker-run.sh -- --mode bi --scale-factor 0.003 --generate-factors
             mv out/ social-network-sf0.003-bi-composite-merged-fk/
       - run:
           name: Generate SF0.003 / BI / compressed composite-merged CSVs for Postgres
           command: |
-            tools/docker-run.sh --mode bi --scale-factor 0.003 --format-options compression=gzip
+            tools/docker-run.sh -- --mode bi --scale-factor 0.003 --format-options compression=gzip
             mv out/ social-network-sf0.003-bi-composite-merged-fk-postgres-compressed/
       - run:
           name: Generate SF0.003 / BI / composite-projected CSVs for Neo4j
           command: |
-            tools/docker-run.sh --mode bi --scale-factor 0.003 --explode-edges --format-options header=false,quoteAll=true
+            tools/docker-run.sh -- --mode bi --scale-factor 0.003 --explode-edges --format-options header=false,quoteAll=true
             mv out/ social-network-sf0.003-bi-composite-projected-fk-neo4j/
       - run:
           name: Generate SF0.003 / BI / compressed composite-projected CSVs for Neo4j
           command: |
-            tools/docker-run.sh --mode bi --scale-factor 0.003 --explode-edges --format-options header=false,quoteAll=true,compression=gzip
+            tools/docker-run.sh -- --mode bi --scale-factor 0.003 --explode-edges --format-options header=false,quoteAll=true,compression=gzip
             mv out/ social-network-sf0.003-bi-composite-projected-fk-neo4j-compressed/
       - run:
           name: Generate SF0.003 / BI / compressed composite-projected CSVs for Neo4j with epoch milli timestamps
           command: |
-            tools/docker-run.sh --mode bi --scale-factor 0.003 --explode-edges --epoch-millis --format-options header=false,quoteAll=true,compression=gzip
+            tools/docker-run.sh -- --mode bi --scale-factor 0.003 --explode-edges --epoch-millis --format-options header=false,quoteAll=true,compression=gzip
             mv out/ social-network-sf0.003-bi-composite-projected-fk-neo4j-compressed-epoch-millis/
       # Interactive
       - run:
           name: Generate SF0.003 / Interactive / singular-projected CSVs
           command: |
-            tools/docker-run.sh --mode interactive --scale-factor 0.003 --explode-edges --explode-attrs
+            tools/docker-run.sh -- --mode interactive --scale-factor 0.003 --explode-edges --explode-attrs
             mv out/ social-network-sf0.003-interactive-singular-projected-fk/
       - run:
           name: Generate SF0.003 / Interactive / singular-merged CSVs
           command: |
-            tools/docker-run.sh --mode interactive --scale-factor 0.003 --explode-attrs
+            tools/docker-run.sh -- --mode interactive --scale-factor 0.003 --explode-attrs
             mv out/ social-network-sf0.003-interactive-singular-merged-fk/
       - run:
           name: Generate SF0.003 / Interactive / composite-projected CSVs
           command: |
-            tools/docker-run.sh --mode interactive --scale-factor 0.003 --explode-edges
+            tools/docker-run.sh -- --mode interactive --scale-factor 0.003 --explode-edges
             mv out/ social-network-sf0.003-interactive-composite-projected-fk/
       - run:
           name: Generate SF0.003 / Interactive / composite-merged CSVs
           command: |
-            tools/docker-run.sh --mode interactive --scale-factor 0.003
+            tools/docker-run.sh -- --mode interactive --scale-factor 0.003
             mv out/ social-network-sf0.003-interactive-composite-merged-fk/
+      - run:
+          name: Generate SF1 / Interactive / composite-merged CSVs
+          command: |
+            tools/docker-run.sh --parallelism 4 -- --mode interactive --scale-factor 1
+            mv out/ social-network-sf1-interactive-composite-merged-fk/
       - run:
           name: Compress directories and prepare for deployment
           command: |
             # include the CircleCI configuration in the deployed package to provide the 'filters' instructions (and prevent failed builds on the gh-pages branch)
             mv .circleci dist/
             # move factors to a separate directory
             mkdir social-network-sf0.003-bi-factors
-            mv social-network-sf0.003-bi-composite-merged-fk/factors social-network-sf0.003-bi-factors/factors
+            cp -r social-network-sf0.003-bi-composite-merged-fk/factors social-network-sf0.003-bi-factors/factors
             # compress each directory
             for d in social-network-sf0.003*; do
               echo "Generated with <https://github.com/ldbc/ldbc_snb_datagen_spark/commit/${CIRCLE_SHA1}>" > $d/README.md

.dockerignore

Lines changed: 5 additions & 6 deletions
@@ -1,15 +1,14 @@
-tools/*
-
-target/*
+**/*.egg-info
+**/target/*

 *.swp
 *.crc
 *.log

-*.iml
-.travis.yml
+**/*.iml
 .idea/*
-.gitignore
+**/.gitignore
 .git/*
+.circleci

 Dockerfile

.editorconfig

Lines changed: 20 additions & 0 deletions
@@ -0,0 +1,20 @@
+root = true
+
+# Unix-style newlines with a newline ending every file
+[*]
+end_of_line = lf
+insert_final_newline = true
+indent_style = space
+indent_size = 2
+trim_trailing_whitespace = true
+charset = utf-8
+
+[.py]
+indent_size = 4
+
+
+[.java]
+indent_size = 4
+
+[Makefile,*.mk]
+indent_style = tab

Dockerfile

Lines changed: 30 additions & 23 deletions
@@ -1,27 +1,34 @@
-FROM bde2020/spark-master:3.2.1-hadoop3.2
+FROM eclipse-temurin:8 as build-jar
+ARG MAVEN_VERSION=3.8.6
+COPY pom.xml /build/pom.xml
+WORKDIR build
+RUN cd /opt && curl https://dlcdn.apache.org/maven/maven-3/${MAVEN_VERSION}/binaries/apache-maven-${MAVEN_VERSION}-bin.tar.gz | tar xvz
+ENV PATH=/opt/apache-maven-${MAVEN_VERSION}/bin:$PATH
+RUN mvn install
+COPY src /build/src
+RUN mvn assembly:assembly -DskipTests
+
+FROM scratch as jar
+COPY --from=build-jar /build/target/ldbc_snb_datagen_*-jar-with-dependencies.jar /jar
+
+FROM python:3.7-slim as build-tools
+RUN pip install --no-cache virtualenv && virtualenv -p python3.7 /tools
+COPY tools build
+WORKDIR build
+RUN . /tools/bin/activate && pip install .
+
+FROM python:3.7-slim as tools
+COPY --from=build-tools /tools /tools
+
+FROM bde2020/spark-master:3.2.1-hadoop3.2 as standalone
+COPY --from=jar /jar /jar
+COPY --from=tools /tools /tools
+RUN ln -sf /usr/bin/python3 /tools/bin/python

-ENV GOSU_VERSION 1.12
-
-RUN apk add --no-cache su-exec
-RUN apk add shadow
-RUN [ -d /var/mail ] || mkdir /var/mail
-
-VOLUME /mnt/datagen.jar /mnt/params.ini /mnt/data
-
-WORKDIR /mnt/data
-
-# adjust these environment variables
 ENV TEMP_DIR /tmp
-ENV EXECUTOR_MEMORY "1G"
-ENV DRIVER_MEMORY "5G"
-
-# the SPARK_* variables are used by submit.sh to configure the Spark job
 ENV SPARK_LOCAL_DIRS ${TEMP_DIR}
-ENV SPARK_SUBMIT_ARGS --executor-memory ${EXECUTOR_MEMORY} --driver-memory ${DRIVER_MEMORY}
-ENV SPARK_APPLICATION_MAIN_CLASS ldbc.snb.datagen.LdbcDatagen
-ENV SPARK_MASTER_URL local[*]
-ENV SPARK_APPLICATION_JAR_LOCATION /mnt/datagen.jar
-
-COPY submit.sh /
+ENV PATH=/tools/bin:/spark/bin:$PATH
+ENV LDBC_SNB_DATAGEN_JAR=/jar

-ENTRYPOINT ["/bin/bash", "/submit.sh"]
+WORKDIR /
+ENTRYPOINT ["run.py"]
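
The new Dockerfile is a multi-stage build: `build-jar` compiles the assembly JAR with Maven, `jar` is a scratch stage holding only the JAR, `build-tools`/`tools` package the Python helper scripts, and `standalone` layers both onto the Spark base image. As a hedged sketch (not part of this commit), BuildKit's local exporter can be used to copy the JAR out of the `jar` stage without starting a container:

```bash
# Requires BuildKit; exports the contents of the scratch-based 'jar' stage
# (a single file named 'jar') into ./build-out/ on the host.
DOCKER_BUILDKIT=1 docker build . --target=jar --output type=local,dest=./build-out
ls build-out/
```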

README.md

Lines changed: 41 additions & 18 deletions
@@ -80,7 +80,8 @@ Once you have Spark in place and built the JAR file, run the generator as follow
 ```bash
 export PLATFORM_VERSION=2.12_spark3.2
 export DATAGEN_VERSION=0.5.0-SNAPSHOT
-./tools/run.py ./target/ldbc_snb_datagen_${PLATFORM_VERSION}-${DATAGEN_VERSION}.jar <runtime configuration arguments> -- <generator configuration arguments>
+export LDBC_SNB_DATAGEN_JAR=./target/ldbc_snb_datagen_${PLATFORM_VERSION}-${DATAGEN_VERSION}.jar
+./tools/run.py <runtime configuration arguments> -- <generator configuration arguments>
 ```

 #### Runtime configuration arguments
@@ -94,7 +95,7 @@ The runtime configuration arguments determine the amount of memory, number of th
 To generate a single `part-*.csv` file, reduce the parallelism (number of Spark partitions) to 1.

 ```bash
-./tools/run.py ./target/ldbc_snb_datagen_${PLATFORM_VERSION}-${DATAGEN_VERSION}.jar --parallelism 1 -- --format csv --scale-factor 0.003 --mode interactive
+./tools/run.py --parallelism 1 -- --format csv --scale-factor 0.003 --mode interactive
 ```
 #### Generator configuration arguments

@@ -103,49 +104,49 @@ The generator configuration arguments allow the configuration of the output dire
 To get a complete list of the arguments, pass `--help` to the JAR file:

 ```bash
-./tools/run.py ./target/ldbc_snb_datagen_${PLATFORM_VERSION}-${DATAGEN_VERSION}.jar -- --help
+./tools/run.py -- --help
 ```

 * Generating `CsvBasic` files in **Interactive mode**:

 ```bash
-./tools/run.py ./target/ldbc_snb_datagen_${PLATFORM_VERSION}-${DATAGEN_VERSION}.jar -- --format csv --scale-factor 0.003 --explode-edges --explode-attrs --mode interactive
+./tools/run.py -- --format csv --scale-factor 0.003 --explode-edges --explode-attrs --mode interactive
 ```

 * Generating `CsvCompositeMergeForeign` files in **BI mode** resulting in compressed `.csv.gz` files:

 ```bash
-./tools/run.py ./target/ldbc_snb_datagen_${PLATFORM_VERSION}-${DATAGEN_VERSION}.jar -- --format csv --scale-factor 0.003 --mode bi --format-options compression=gzip
+./tools/run.py -- --format csv --scale-factor 0.003 --mode bi --format-options compression=gzip
 ```

 * Generating `CsvCompositeMergeForeign` files in **BI mode** and generating factors:

 ```bash
-./tools/run.py ./target/ldbc_snb_datagen_${PLATFORM_VERSION}-${DATAGEN_VERSION}.jar -- --format csv --scale-factor 0.003 --mode bi --generate-factors
+./tools/run.py -- --format csv --scale-factor 0.003 --mode bi --generate-factors
 ```

 * Generating CSVs in **raw mode**:

 ```bash
-./tools/run.py ./target/ldbc_snb_datagen_${PLATFORM_VERSION}-${DATAGEN_VERSION}.jar -- --format csv --scale-factor 0.003 --mode raw --output-dir sf0.003-raw
+./tools/run.py -- --format csv --scale-factor 0.003 --mode raw --output-dir sf0.003-raw
 ```

 * Generating Parquet files:

 ```bash
-./tools/run.py ./target/ldbc_snb_datagen_${PLATFORM_VERSION}-${DATAGEN_VERSION}.jar -- --format parquet --scale-factor 0.003 --mode bi
+./tools/run.py -- --format parquet --scale-factor 0.003 --mode bi
 ```

 * Use epoch milliseconds encoded as longs (née `LongDateFormatter`) for serializing date and datetime values:

 ```bash
-./tools/run.py ./target/ldbc_snb_datagen_${PLATFORM_VERSION}-${DATAGEN_VERSION}.jar -- --format csv --scale-factor 0.003 --mode bi --epoch-millis
+./tools/run.py -- --format csv --scale-factor 0.003 --mode bi --epoch-millis
 ```

 * For the `interactive` and `bi` formats, the `--format-options` argument allows passing formatting options such as timestamp/date formats, the presence/absence of headers (see the [Spark formatting options](https://spark.apache.org/docs/2.4.8/api/scala/index.html#org.apache.spark.sql.DataFrameWriter) for details), and whether quoting the fields in the CSV is required:

 ```bash
-./tools/run.py ./target/ldbc_snb_datagen_${PLATFORM_VERSION}-${DATAGEN_VERSION}.jar -- --format csv --scale-factor 0.003 --mode interactive --format-options timestampFormat=MM/dd/y\ HH:mm:ss,dateFormat=MM/dd/y,header=false,quoteAll=true
+./tools/run.py -- --format csv --scale-factor 0.003 --mode interactive --format-options timestampFormat=MM/dd/y\ HH:mm:ss,dateFormat=MM/dd/y,header=false,quoteAll=true
 ```

 To change the Spark configuration directory, adjust the `SPARK_CONF_DIR` environment variable.
@@ -154,31 +155,53 @@ A complex example:

 ```bash
 export SPARK_CONF_DIR=./conf
-./tools/run.py ./target/ldbc_snb_datagen_${PLATFORM_VERSION}-${DATAGEN_VERSION}.jar --parallelism 4 --memory 8G -- --format csv --format-options timestampFormat=MM/dd/y\ HH:mm:ss,dateFormat=MM/dd/y --explode-edges --explode-attrs --mode interactive --scale-factor 0.003
+./tools/run.py --parallelism 4 --memory 8G -- --format csv --format-options timestampFormat=MM/dd/y\ HH:mm:ss,dateFormat=MM/dd/y --explode-edges --explode-attrs --mode interactive --scale-factor 0.003
 ```

 It is also possible to pass a parameter file:

 ```bash
-./tools/run.py ./target/ldbc_snb_datagen_${PLATFORM_VERSION}-${DATAGEN_VERSION}.jar -- --format csv --param-file params.ini
+./tools/run.py -- --format csv --param-file params.ini
 ```

-### Docker image
+### Docker images
+SNB Datagen images are available via [Docker Hub](https://hub.docker.com/orgs/ldbc/repositories).
+The image tags follow the pattern `${DATAGEN_VERSION}-${PLATFORM_VERSION}`, e.g. `ldbc/datagen-standalone:0.5.0-2.12_spark3.1`.

-<!-- SNB Datagen images are available via [Docker Hub](https://hub.docker.com/r/ldbc/datagen/) (currently outdated). -->
+When building images, ensure that you [use BuildKit](https://docs.docker.com/develop/develop-images/build_enhancements/#to-enable-buildkit-builds).

-The Docker image can be built with the provided Dockerfile. To build, execute the following command from the repository directory:
+#### Standalone Docker image

+The standalone image bundles Spark with the JAR and Python helpers, so you can run a workload in a container similarly to a local run, as you can see in this example:
 ```bash
-./tools/docker-build.sh
+mkdir -p out_sf0.003_interactive # create output directory
+docker run \
+    --mount type=bind,source="$(pwd)"/out_sf0.003_interactive,target=/out \
+    --mount type=bind,source="$(pwd)"/conf,target=/conf,readonly \
+    -e SPARK_CONF_DIR=/conf \
+    ldbc/datagen-standalone:latest --parallelism 1 -- --format csv --scale-factor 0.003 --mode interactive
 ```

-See [Build the JAR](#build-the-jar) to build the library (e.g. by invoking `./tools/build.sh`). Then, run the following:
+The standalone Docker image can be built with the provided Dockerfile. To build, execute the following command from the repository directory:

 ```bash
-./tools/docker-run.sh
+docker build . --target=standalone -t ldbc/datagen-standalone:latest
 ```

+#### JAR-only image
+The `ldbc/datagen-jar` image contains the assembly JAR, so it can be bundled in your custom container:
+
+```docker
+FROM my-spark-image
+COPY --from=ldbc/datagen-jar:latest /jar /lib/ldbc-datagen.jar
+```
+
+The JAR-only Docker image can be built with the provided Dockerfile. To build, execute the following command from the repository directory:
+
+```bash
+docker build . --target=jar -t ldbc/datagen-jar:latest
+```
 ### Elastic MapReduce

 We provide scripts to run Datagen on AWS EMR. See the README in the [`./tools/emr`](tools/emr) directory for details.
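
The README now documents the `${DATAGEN_VERSION}-${PLATFORM_VERSION}` tag pattern. A hedged example of pinning a published tag instead of `latest`, using the tag quoted in the README text:

```bash
docker pull ldbc/datagen-standalone:0.5.0-2.12_spark3.1
docker run --volume "$(pwd)"/out:/out ldbc/datagen-standalone:0.5.0-2.12_spark3.1 \
    --parallelism 1 -- --format csv --scale-factor 0.003 --mode interactive
```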

tools/docker-build.sh

Lines changed: 0 additions & 3 deletions
This file was deleted.

tools/docker-run.sh

Lines changed: 1 addition & 9 deletions
@@ -1,14 +1,6 @@
 #!/bin/bash

-[ ! -f target/ldbc_snb_datagen_${PLATFORM_VERSION}-${DATAGEN_VERSION}-jar-with-dependencies.jar ] && echo "target/ldbc_snb_datagen_${PLATFORM_VERSION}-${DATAGEN_VERSION}-jar-with-dependencies.jar does not exist, exiting" && exit 1
-
 # make sure that out directory exists and clean previously generated data
 mkdir -p out/
 rm -rf out/*
-docker run \
-    --env uid=`id -u` \
-    --volume `pwd`/out:/mnt/data \
-    --volume `pwd`/target/ldbc_snb_datagen_${PLATFORM_VERSION}-${DATAGEN_VERSION}-jar-with-dependencies.jar:/mnt/datagen.jar \
-    ldbc/spark \
-    --output-dir /mnt/data \
-    ${@} # pass arguments of this script to the submit.sh script (Docker entrypoint)
+docker run --volume `pwd`/out:/out ldbc/datagen-standalone:latest ${@}
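
After the simplification, the script only recreates `./out`, bind-mounts it at `/out`, and forwards all of its arguments to the container's entrypoint (`run.py`). Its effect is roughly equivalent to the following sketch, with the flags taken from the script itself:

```bash
# Roughly what `tools/docker-run.sh --parallelism 1 -- --mode bi --scale-factor 0.003` expands to:
mkdir -p out && rm -rf out/*
docker run --volume "$(pwd)"/out:/out ldbc/datagen-standalone:latest \
    --parallelism 1 -- --mode bi --scale-factor 0.003
```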

tools/emr/__init__.py

Whitespace-only changes.
