@@ -46,7 +46,7 @@ E.g. with [pyenv](https://github.com/pyenv/pyenv) and [pyenv-virtualenv](https:/
pyenv install 3.7.13
pyenv virtualenv 3.7.13 ldbc_datagen_tools
pyenv local ldbc_datagen_tools
- pip install -U pip
+ pip install -U pip
pip install ./tools
```
### Running locally
@@ -80,7 +80,8 @@ Once you have Spark in place and built the JAR file, run the generator as follow
```bash
export PLATFORM_VERSION=2.12_spark3.1
export DATAGEN_VERSION=0.5.0-SNAPSHOT
- ./tools/run.py ./target/ldbc_snb_datagen_${PLATFORM_VERSION}-${DATAGEN_VERSION}.jar <runtime configuration arguments> -- <generator configuration arguments>
+ export LDBC_SNB_DATAGEN_JAR=./target/ldbc_snb_datagen_${PLATFORM_VERSION}-${DATAGEN_VERSION}.jar
+ ./tools/run.py <runtime configuration arguments> -- <generator configuration arguments>
```

#### Runtime configuration arguments
@@ -94,7 +95,7 @@ The runtime configuration arguments determine the amount of memory, number of th
To generate a single `part-*.csv` file, reduce the parallelism (number of Spark partitions) to 1.

```bash
- ./tools/run.py ./target/ldbc_snb_datagen_${PLATFORM_VERSION}-${DATAGEN_VERSION}.jar --parallelism 1 -- --format csv --scale-factor 0.003 --mode interactive
+ ./tools/run.py --parallelism 1 -- --format csv --scale-factor 0.003 --mode interactive
```

#### Generator configuration arguments
@@ -103,49 +104,49 @@ The generator configuration arguments allow the configuration of the output dire
To get a complete list of the arguments, pass `--help` to the JAR file:

```bash
- ./tools/run.py ./target/ldbc_snb_datagen_${PLATFORM_VERSION}-${DATAGEN_VERSION}.jar -- --help
+ ./tools/run.py -- --help
```

* Generating `CsvBasic` files in **Interactive mode**:

```bash
- ./tools/run.py ./target/ldbc_snb_datagen_${PLATFORM_VERSION}-${DATAGEN_VERSION}.jar -- --format csv --scale-factor 0.003 --explode-edges --explode-attrs --mode interactive
+ ./tools/run.py -- --format csv --scale-factor 0.003 --explode-edges --explode-attrs --mode interactive
```

* Generating `CsvCompositeMergeForeign` files in **BI mode** resulting in compressed `.csv.gz` files:

```bash
- ./tools/run.py ./target/ldbc_snb_datagen_${PLATFORM_VERSION}-${DATAGEN_VERSION}.jar -- --format csv --scale-factor 0.003 --mode bi --format-options compression=gzip
+ ./tools/run.py -- --format csv --scale-factor 0.003 --mode bi --format-options compression=gzip
```

* Generating `CsvCompositeMergeForeign` files in **BI mode** and generating factors:

```bash
- ./tools/run.py ./target/ldbc_snb_datagen_${PLATFORM_VERSION}-${DATAGEN_VERSION}.jar -- --format csv --scale-factor 0.003 --mode bi --generate-factors
+ ./tools/run.py -- --format csv --scale-factor 0.003 --mode bi --generate-factors
```

* Generating CSVs in **raw mode**:

```bash
- ./tools/run.py ./target/ldbc_snb_datagen_${PLATFORM_VERSION}-${DATAGEN_VERSION}.jar -- --format csv --scale-factor 0.003 --mode raw --output-dir sf0.003-raw
+ ./tools/run.py -- --format csv --scale-factor 0.003 --mode raw --output-dir sf0.003-raw
```

* Generating Parquet files:

```bash
- ./tools/run.py ./target/ldbc_snb_datagen_${PLATFORM_VERSION}-${DATAGEN_VERSION}.jar -- --format parquet --scale-factor 0.003 --mode bi
+ ./tools/run.py -- --format parquet --scale-factor 0.003 --mode bi
```

* Use epoch milliseconds encoded as longs (née `LongDateFormatter`) for serializing date and datetime values:

```bash
- ./tools/run.py ./target/ldbc_snb_datagen_${PLATFORM_VERSION}-${DATAGEN_VERSION}.jar -- --format csv --scale-factor 0.003 --mode bi --epoch-millis
+ ./tools/run.py -- --format csv --scale-factor 0.003 --mode bi --epoch-millis
```
* For the `interactive` and `bi` formats, the `--format-options` argument allows passing formatting options such as timestamp/date formats, the presence/absence of headers (see the [Spark formatting options](https://spark.apache.org/docs/2.4.8/api/scala/index.html#org.apache.spark.sql.DataFrameWriter) for details), and whether quoting of the fields in the CSV is required:

```bash
- ./tools/run.py ./target/ldbc_snb_datagen_${PLATFORM_VERSION}-${DATAGEN_VERSION}.jar -- --format csv --scale-factor 0.003 --mode interactive --format-options timestampFormat=MM/dd/y\ HH:mm:ss,dateFormat=MM/dd/y,header=false,quoteAll=true
+ ./tools/run.py -- --format csv --scale-factor 0.003 --mode interactive --format-options timestampFormat=MM/dd/y\ HH:mm:ss,dateFormat=MM/dd/y,header=false,quoteAll=true
```

To change the Spark configuration directory, adjust the `SPARK_CONF_DIR` environment variable.
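For reference, here is a minimal sketch of what such a configuration directory might hold; the property names below are standard Spark settings, while the layout and values are illustrative assumptions rather than anything prescribed by this repository:

```bash
# Illustrative only: create a Spark conf directory and point SPARK_CONF_DIR at it.
# spark.driver.memory and spark.local.dir are standard Spark properties; the values are examples.
mkdir -p ./conf
cat > ./conf/spark-defaults.conf <<'EOF'
spark.driver.memory  8g
spark.local.dir      /tmp/spark-scratch
EOF
export SPARK_CONF_DIR=./conf
```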
@@ -154,31 +155,53 @@ A complex example:
```bash
export SPARK_CONF_DIR=./conf
- ./tools/run.py ./target/ldbc_snb_datagen_${PLATFORM_VERSION}-${DATAGEN_VERSION}.jar --parallelism 4 --memory 8G -- --format csv --format-options timestampFormat=MM/dd/y\ HH:mm:ss,dateFormat=MM/dd/y --explode-edges --explode-attrs --mode interactive --scale-factor 0.003
+ ./tools/run.py --parallelism 4 --memory 8G -- --format csv --format-options timestampFormat=MM/dd/y\ HH:mm:ss,dateFormat=MM/dd/y --explode-edges --explode-attrs --mode interactive --scale-factor 0.003
```

It is also possible to pass a parameter file:

```bash
- ./tools/run.py ./target/ldbc_snb_datagen_${PLATFORM_VERSION}-${DATAGEN_VERSION}.jar -- --format csv --param-file params.ini
+ ./tools/run.py -- --format csv --param-file params.ini
```

- ### Docker image
+ ### Docker images

<!-- SNB Datagen images are available via [Docker Hub](https://hub.docker.com/r/ldbc/datagen/) (currently outdated). -->

- The Docker image can be built with the provided Dockerfile. To build, execute the following command from the repository directory:
+ The image tags follow the pattern `${DATAGEN_VERSION}-${PLATFORM_VERSION}`, e.g. `ldbc/datagen-standalone:0.5.0-2.12_spark3.1`.

+ #### Standalone Docker image
+
+ The standalone image bundles Spark with the JAR and the Python helpers, so you can run a workload in a container much like a local run, as shown
+ in this example:
```bash
- ./tools/docker-build.sh
+ mkdir -p out_sf0.003_interactive # create output directory
+ docker run \
+   --mount type=bind,source="$(pwd)"/out_sf0.003_interactive,target=/out \
+   --mount type=bind,source="$(pwd)"/conf,target=/conf,readonly \
+   -e SPARK_CONF_DIR=/conf \
+   ldbc/datagen-standalone:latest --parallelism 1 -- --format csv --scale-factor 0.003 --mode interactive
```

- See [Build the JAR](#build-the-jar) to build the library (e.g. by invoking `./tools/build.sh`). Then, run the following:
+ The standalone Docker image can be built with the provided Dockerfile. To build, execute the following command from the repository directory:

```bash
- ./tools/docker-run.sh
+ docker buildx build . --target=standalone -t ldbc/datagen-standalone:latest
+ ```
+
+ #### JAR-only image
+ The `ldbc/datagen-jar` image contains the assembly JAR, so it can be bundled in your custom container:
+
+ ```docker
+ FROM my-spark-image
+ COPY --from=ldbc/datagen-jar:latest /jar /lib/ldbc-datagen.jar
```

+ The JAR-only Docker image can be built with the provided Dockerfile. To build, execute the following command from the repository directory:
+
+ ```bash
+ docker buildx build . --target=jar -t ldbc/datagen-jar:latest
+ ```
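As a follow-up, here is a hedged sketch of invoking the bundled JAR from such a custom image. It assumes `spark-submit` is available in the base image and that the assembly's main class is `ldbc.snb.datagen.LdbcDatagen`; verify both against your image and the JAR manifest before relying on them:

```bash
# Sketch only: run the copied assembly JAR with spark-submit inside the custom image.
# The main class name is an assumption -- confirm it via the JAR manifest or the repository docs.
spark-submit \
  --class ldbc.snb.datagen.LdbcDatagen \
  /lib/ldbc-datagen.jar \
  --format csv --scale-factor 0.003 --mode bi
```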
### Elastic MapReduce

We provide scripts to run Datagen on AWS EMR. See the README in the [`./tools/emr`](tools/emr) directory for details.