
Commit 2b22c43

Merge pull request #408 from ldbc/docker-image
Improve container support
2 parents: bf02fde + 37f05df

File tree

10 files changed: +129 -101 lines changed

.circleci/config.yml

Lines changed: 22 additions & 39 deletions
@@ -22,7 +22,8 @@ workflows:
 executors:
   my-executor:
     machine:
-      image: ubuntu-2004:202008-01
+      image: ubuntu-2204:2022.04.1
+      docker_layer_caching: true
     working_directory: ~/ldbc/ldbc_snb_datagen

 jobs:
@@ -33,104 +34,86 @@ jobs:
       DATAGEN_VERSION: 0.5.0-SNAPSHOT
     steps:
       - checkout
-      - run:
-          name: Install dependencies
-          command: |
-            mkdir out/
-            DEBIAN_FRONTEND=noninteractive
-            sudo apt update
-            sudo apt install -y openjdk-8-jdk zip
-            sudo update-alternatives --install /usr/bin/java java /usr/lib/jvm/java-8-openjdk-amd64/bin/java 1
-            sudo update-alternatives --set java /usr/lib/jvm/java-8-openjdk-amd64/bin/java
       - run:
           name: Build Docker container
           command: |
-            docker build . -t ldbc/spark
-      - restore_cache:
-          keys:
-            - m2-dep-branch:{{ .Branch }}-pom:{{ checksum "pom.xml" }}-
-            - m2-dep-branch:dev-pom:{{ checksum "pom.xml" }}-
-            - m2-dep-branch:{{ .Branch }}-
-            - m2-dep-branch:dev-
-      - run:
-          name: Build JAR file
-          command: |
-            mvn -ntp clean test-compile assembly:assembly
-      - save_cache:
-          key: m2-dep-branch:{{ .Branch }}-pom:{{ checksum "pom.xml" }}-{{ epoch }}
-          paths:
-            - ~/.m2/repository # maven deps
+            docker build . --target=standalone -t ldbc/datagen-standalone:latest
       # BI
-      - run:
+      - run:
           name: Generate SF0.003 / BI / singular-projected CSVs
           command: |
-            tools/docker-run.sh --mode bi --scale-factor 0.003 --explode-edges --explode-attrs
+            tools/docker-run.sh -- --mode bi --scale-factor 0.003 --explode-edges --explode-attrs
             mv out/ social-network-sf0.003-bi-singular-projected-fk/
       - run:
           name: Generate SF0.003 / BI / singular-merged CSVs
           command: |
-            tools/docker-run.sh --mode bi --scale-factor 0.003 --explode-attrs
+            tools/docker-run.sh -- --mode bi --scale-factor 0.003 --explode-attrs
             mv out/ social-network-sf0.003-bi-singular-merged-fk/
       - run:
           name: Generate SF0.003 / BI / composite-projected CSVs
           command: |
-            tools/docker-run.sh --mode bi --scale-factor 0.003 --explode-edges
+            tools/docker-run.sh -- --mode bi --scale-factor 0.003 --explode-edges
             mv out/ social-network-sf0.003-bi-composite-projected-fk/
       - run:
           name: Generate SF0.003 / BI / composite-merged CSVs
           command: |
             # we generate factors here but they are moved to a separate archive (social-network-sf0.003-bi-factors.zip)
-            tools/docker-run.sh --mode bi --scale-factor 0.003 --generate-factors
+            tools/docker-run.sh -- --mode bi --scale-factor 0.003 --generate-factors
             mv out/ social-network-sf0.003-bi-composite-merged-fk/
       - run:
           name: Generate SF0.003 / BI / compressed composite-merged CSVs for Postgres
           command: |
-            tools/docker-run.sh --mode bi --scale-factor 0.003 --format-options compression=gzip
+            tools/docker-run.sh -- --mode bi --scale-factor 0.003 --format-options compression=gzip
             mv out/ social-network-sf0.003-bi-composite-merged-fk-postgres-compressed/
       - run:
           name: Generate SF0.003 / BI / composite-projected CSVs for Neo4j
           command: |
-            tools/docker-run.sh --mode bi --scale-factor 0.003 --explode-edges --format-options header=false,quoteAll=true
+            tools/docker-run.sh -- --mode bi --scale-factor 0.003 --explode-edges --format-options header=false,quoteAll=true
             mv out/ social-network-sf0.003-bi-composite-projected-fk-neo4j/
       - run:
           name: Generate SF0.003 / BI / compressed composite-projected CSVs for Neo4j
           command: |
-            tools/docker-run.sh --mode bi --scale-factor 0.003 --explode-edges --format-options header=false,quoteAll=true,compression=gzip
+            tools/docker-run.sh -- --mode bi --scale-factor 0.003 --explode-edges --format-options header=false,quoteAll=true,compression=gzip
             mv out/ social-network-sf0.003-bi-composite-projected-fk-neo4j-compressed/
       - run:
           name: Generate SF0.003 / BI / compressed composite-projected CSVs for Neo4j with epoch milli timestamps
           command: |
-            tools/docker-run.sh --mode bi --scale-factor 0.003 --explode-edges --epoch-millis --format-options header=false,quoteAll=true,compression=gzip
+            tools/docker-run.sh -- --mode bi --scale-factor 0.003 --explode-edges --epoch-millis --format-options header=false,quoteAll=true,compression=gzip
             mv out/ social-network-sf0.003-bi-composite-projected-fk-neo4j-compressed-epoch-millis/
       # Interactive
       - run:
           name: Generate SF0.003 / Interactive / singular-projected CSVs
           command: |
-            tools/docker-run.sh --mode interactive --scale-factor 0.003 --explode-edges --explode-attrs
+            tools/docker-run.sh -- --mode interactive --scale-factor 0.003 --explode-edges --explode-attrs
             mv out/ social-network-sf0.003-interactive-singular-projected-fk/
       - run:
           name: Generate SF0.003 / Interactive / singular-merged CSVs
           command: |
-            tools/docker-run.sh --mode interactive --scale-factor 0.003 --explode-attrs
+            tools/docker-run.sh -- --mode interactive --scale-factor 0.003 --explode-attrs
             mv out/ social-network-sf0.003-interactive-singular-merged-fk/
       - run:
           name: Generate SF0.003 / Interactive / composite-projected CSVs
           command: |
-            tools/docker-run.sh --mode interactive --scale-factor 0.003 --explode-edges
+            tools/docker-run.sh -- --mode interactive --scale-factor 0.003 --explode-edges
             mv out/ social-network-sf0.003-interactive-composite-projected-fk/
       - run:
           name: Generate SF0.003 / Interactive / composite-merged CSVs
           command: |
-            tools/docker-run.sh --mode interactive --scale-factor 0.003
+            tools/docker-run.sh -- --mode interactive --scale-factor 0.003
             mv out/ social-network-sf0.003-interactive-composite-merged-fk/
+      - run:
+          name: Generate SF1 / Interactive / composite-merged CSVs
+          command: |
+            tools/docker-run.sh --parallelism 4 -- --mode interactive --scale-factor 1
+            mv out/ social-network-sf1-interactive-composite-merged-fk/
       - run:
           name: Compress directories and prepare for deployment
           command: |
             # include the CircleCI configuration in the deployed package to provide the 'filters' instructions (and prevent failed builds on the gh-pages branch)
             mv .circleci dist/
             # move factors to a separate directory
             mkdir social-network-sf0.003-bi-factors
-            mv social-network-sf0.003-bi-composite-merged-fk/factors social-network-sf0.003-bi-factors/factors
+            cp -r social-network-sf0.003-bi-composite-merged-fk/factors social-network-sf0.003-bi-factors/factors
             # compress each directory
             for d in social-network-sf0.003*; do
               echo "Generated with <https://github.com/ldbc/ldbc_snb_datagen_spark/commit/${CIRCLE_SHA1}>" > $d/README.md

.dockerignore

Lines changed: 5 additions & 6 deletions
@@ -1,15 +1,14 @@
-tools/*
-
-target/*
+**/*.egg-info
+**/target/*

 *.swp
 *.crc
 *.log

-*.iml
-.travis.yml
+**/*.iml
 .idea/*
-.gitignore
+**/.gitignore
 .git/*
+.circleci

 Dockerfile

.editorconfig

Lines changed: 20 additions & 0 deletions
@@ -0,0 +1,20 @@
+root = true
+
+# Unix-style newlines with a newline ending every file
+[*]
+end_of_line = lf
+insert_final_newline = true
+indent_style = space
+indent_size = 2
+trim_trailing_whitespace = true
+charset = utf-8
+
+[.py]
+indent_size = 4
+
+
+[.java]
+indent_size = 4
+
+[Makefile,*.mk]
+indent_style = tab

Dockerfile

Lines changed: 30 additions & 23 deletions
@@ -1,27 +1,34 @@
-FROM bde2020/spark-master:3.2.1-hadoop3.2
+FROM eclipse-temurin:8 as build-jar
+ARG MAVEN_VERSION=3.8.6
+COPY pom.xml /build/pom.xml
+WORKDIR build
+RUN cd /opt && curl https://dlcdn.apache.org/maven/maven-3/${MAVEN_VERSION}/binaries/apache-maven-${MAVEN_VERSION}-bin.tar.gz | tar xvz
+ENV PATH=/opt/apache-maven-${MAVEN_VERSION}/bin:$PATH
+RUN mvn install
+COPY src /build/src
+RUN mvn assembly:assembly -DskipTests
+
+FROM scratch as jar
+COPY --from=build-jar /build/target/ldbc_snb_datagen_*-jar-with-dependencies.jar /jar
+
+FROM python:3.7-slim as build-tools
+RUN pip install --no-cache virtualenv && virtualenv -p python3.7 /tools
+COPY tools build
+WORKDIR build
+RUN . /tools/bin/activate && pip install .
+
+FROM python:3.7-slim as tools
+COPY --from=build-tools /tools /tools
+
+FROM bde2020/spark-master:3.2.1-hadoop3.2 as standalone
+COPY --from=jar /jar /jar
+COPY --from=tools /tools /tools
+RUN ln -sf /usr/bin/python3 /tools/bin/python

-ENV GOSU_VERSION 1.12
-
-RUN apk add --no-cache su-exec
-RUN apk add shadow
-RUN [ -d /var/mail ] || mkdir /var/mail
-
-VOLUME /mnt/datagen.jar /mnt/params.ini /mnt/data
-
-WORKDIR /mnt/data
-
-# adjust these environment variables
 ENV TEMP_DIR /tmp
-ENV EXECUTOR_MEMORY "1G"
-ENV DRIVER_MEMORY "5G"
-
-# the SPARK_* variables are used by submit.sh to configure the Spark job
 ENV SPARK_LOCAL_DIRS ${TEMP_DIR}
-ENV SPARK_SUBMIT_ARGS --executor-memory ${EXECUTOR_MEMORY} --driver-memory ${DRIVER_MEMORY}
-ENV SPARK_APPLICATION_MAIN_CLASS ldbc.snb.datagen.LdbcDatagen
-ENV SPARK_MASTER_URL local[*]
-ENV SPARK_APPLICATION_JAR_LOCATION /mnt/datagen.jar
-
-COPY submit.sh /
+ENV PATH=/tools/bin:/spark/bin:$PATH
+ENV LDBC_SNB_DATAGEN_JAR=/jar

-ENTRYPOINT ["/bin/bash", "/submit.sh"]
+WORKDIR /
+ENTRYPOINT ["run.py"]
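
The new Dockerfile is a multi-stage build: `build-jar` compiles the assembly JAR with Maven, `jar` is a scratch stage holding only the JAR, `build-tools`/`tools` package the Python helper scripts, and `standalone` layers both onto the Spark base image. As a hedged sketch (not part of this commit), BuildKit's local exporter can be used to copy the JAR out of the `jar` stage without starting a container:

```bash
# Requires BuildKit; exports the contents of the scratch-based 'jar' stage
# (a single file named 'jar') into ./build-out/ on the host.
DOCKER_BUILDKIT=1 docker build . --target=jar --output type=local,dest=./build-out
ls build-out/
```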

README.md

Lines changed: 41 additions & 18 deletions
@@ -80,7 +80,8 @@ Once you have Spark in place and built the JAR file, run the generator as follow
 ```bash
 export PLATFORM_VERSION=2.12_spark3.2
 export DATAGEN_VERSION=0.5.0-SNAPSHOT
-./tools/run.py ./target/ldbc_snb_datagen_${PLATFORM_VERSION}-${DATAGEN_VERSION}.jar <runtime configuration arguments> -- <generator configuration arguments>
+export LDBC_SNB_DATAGEN_JAR=./target/ldbc_snb_datagen_${PLATFORM_VERSION}-${DATAGEN_VERSION}.jar
+./tools/run.py <runtime configuration arguments> -- <generator configuration arguments>
 ```

 #### Runtime configuration arguments
@@ -94,7 +95,7 @@ The runtime configuration arguments determine the amount of memory, number of th
 To generate a single `part-*.csv` file, reduce the parallelism (number of Spark partitions) to 1.

 ```bash
-./tools/run.py ./target/ldbc_snb_datagen_${PLATFORM_VERSION}-${DATAGEN_VERSION}.jar --parallelism 1 -- --format csv --scale-factor 0.003 --mode interactive
+./tools/run.py --parallelism 1 -- --format csv --scale-factor 0.003 --mode interactive
 ```
 #### Generator configuration arguments

@@ -103,49 +104,49 @@ The generator configuration arguments allow the configuration of the output dire
 To get a complete list of the arguments, pass `--help` to the JAR file:

 ```bash
-./tools/run.py ./target/ldbc_snb_datagen_${PLATFORM_VERSION}-${DATAGEN_VERSION}.jar -- --help
+./tools/run.py -- --help
 ```

 * Generating `CsvBasic` files in **Interactive mode**:

 ```bash
-./tools/run.py ./target/ldbc_snb_datagen_${PLATFORM_VERSION}-${DATAGEN_VERSION}.jar -- --format csv --scale-factor 0.003 --explode-edges --explode-attrs --mode interactive
+./tools/run.py -- --format csv --scale-factor 0.003 --explode-edges --explode-attrs --mode interactive
 ```

 * Generating `CsvCompositeMergeForeign` files in **BI mode** resulting in compressed `.csv.gz` files:

 ```bash
-./tools/run.py ./target/ldbc_snb_datagen_${PLATFORM_VERSION}-${DATAGEN_VERSION}.jar -- --format csv --scale-factor 0.003 --mode bi --format-options compression=gzip
+./tools/run.py -- --format csv --scale-factor 0.003 --mode bi --format-options compression=gzip
 ```

 * Generating `CsvCompositeMergeForeign` files in **BI mode** and generating factors:

 ```bash
-./tools/run.py ./target/ldbc_snb_datagen_${PLATFORM_VERSION}-${DATAGEN_VERSION}.jar -- --format csv --scale-factor 0.003 --mode bi --generate-factors
+./tools/run.py -- --format csv --scale-factor 0.003 --mode bi --generate-factors
 ```

 * Generating CSVs in **raw mode**:

 ```bash
-./tools/run.py ./target/ldbc_snb_datagen_${PLATFORM_VERSION}-${DATAGEN_VERSION}.jar -- --format csv --scale-factor 0.003 --mode raw --output-dir sf0.003-raw
+./tools/run.py -- --format csv --scale-factor 0.003 --mode raw --output-dir sf0.003-raw
 ```

 * Generating Parquet files:

 ```bash
-./tools/run.py ./target/ldbc_snb_datagen_${PLATFORM_VERSION}-${DATAGEN_VERSION}.jar -- --format parquet --scale-factor 0.003 --mode bi
+./tools/run.py -- --format parquet --scale-factor 0.003 --mode bi
 ```

 * Use epoch milliseconds encoded as longs (née `LongDateFormatter`) for serializing date and datetime values:

 ```bash
-./tools/run.py ./target/ldbc_snb_datagen_${PLATFORM_VERSION}-${DATAGEN_VERSION}.jar -- --format csv --scale-factor 0.003 --mode bi --epoch-millis
+./tools/run.py -- --format csv --scale-factor 0.003 --mode bi --epoch-millis
 ```

 * For the `interactive` and `bi` formats, the `--format-options` argument allows passing formatting options such as timestamp/date formats, the presence/absence of headers (see the [Spark formatting options](https://spark.apache.org/docs/2.4.8/api/scala/index.html#org.apache.spark.sql.DataFrameWriter) for details), and whether quoting the fields in the CSV is required:

 ```bash
-./tools/run.py ./target/ldbc_snb_datagen_${PLATFORM_VERSION}-${DATAGEN_VERSION}.jar -- --format csv --scale-factor 0.003 --mode interactive --format-options timestampFormat=MM/dd/y\ HH:mm:ss,dateFormat=MM/dd/y,header=false,quoteAll=true
+./tools/run.py -- --format csv --scale-factor 0.003 --mode interactive --format-options timestampFormat=MM/dd/y\ HH:mm:ss,dateFormat=MM/dd/y,header=false,quoteAll=true
 ```

 To change the Spark configuration directory, adjust the `SPARK_CONF_DIR` environment variable.
@@ -154,31 +155,53 @@ A complex example:

 ```bash
 export SPARK_CONF_DIR=./conf
-./tools/run.py ./target/ldbc_snb_datagen_${PLATFORM_VERSION}-${DATAGEN_VERSION}.jar --parallelism 4 --memory 8G -- --format csv --format-options timestampFormat=MM/dd/y\ HH:mm:ss,dateFormat=MM/dd/y --explode-edges --explode-attrs --mode interactive --scale-factor 0.003
+./tools/run.py --parallelism 4 --memory 8G -- --format csv --format-options timestampFormat=MM/dd/y\ HH:mm:ss,dateFormat=MM/dd/y --explode-edges --explode-attrs --mode interactive --scale-factor 0.003
 ```

 It is also possible to pass a parameter file:

 ```bash
-./tools/run.py ./target/ldbc_snb_datagen_${PLATFORM_VERSION}-${DATAGEN_VERSION}.jar -- --format csv --param-file params.ini
+./tools/run.py -- --format csv --param-file params.ini
 ```

-### Docker image
+### Docker images
+SNB Datagen images are available via [Docker Hub](https://hub.docker.com/orgs/ldbc/repositories).
+The image tags follow the pattern `${DATAGEN_VERSION}-${PLATFORM_VERSION}`, e.g. `ldbc/datagen-standalone:0.5.0-2.12_spark3.1`.

-<!-- SNB Datagen images are available via [Docker Hub](https://hub.docker.com/r/ldbc/datagen/) (currently outdated). -->
+When building images, ensure that you [use BuildKit](https://docs.docker.com/develop/develop-images/build_enhancements/#to-enable-buildkit-builds).

-The Docker image can be built with the provided Dockerfile. To build, execute the following command from the repository directory:
+#### Standalone Docker image

+The standalone image bundles Spark with the JAR and Python helpers, so you can run a workload in a container similarly to a local run, as you can see in this example:
 ```bash
-./tools/docker-build.sh
+mkdir -p out_sf0.003_interactive # create output directory
+docker run \
+    --mount type=bind,source="$(pwd)"/out_sf0.003_interactive,target=/out \
+    --mount type=bind,source="$(pwd)"/conf,target=/conf,readonly \
+    -e SPARK_CONF_DIR=/conf \
+    ldbc/datagen-standalone:latest --parallelism 1 -- --format csv --scale-factor 0.003 --mode interactive
 ```

-See [Build the JAR](#build-the-jar) to build the library (e.g. by invoking `./tools/build.sh`). Then, run the following:
+The standalone Docker image can be built with the provided Dockerfile. To build, execute the following command from the repository directory:

 ```bash
-./tools/docker-run.sh
+docker build . --target=standalone -t ldbc/datagen-standalone:latest
 ```

+#### JAR-only image
+The `ldbc/datagen-jar` image contains the assembly JAR, so it can be bundled in your custom container:
+
+```docker
+FROM my-spark-image
+COPY --from=ldbc/datagen-jar:latest /jar /lib/ldbc-datagen.jar
+```
+
+The JAR-only Docker image can be built with the provided Dockerfile. To build, execute the following command from the repository directory:
+
+```bash
+docker build . --target=jar -t ldbc/datagen-jar:latest
+```
 ### Elastic MapReduce

 We provide scripts to run Datagen on AWS EMR. See the README in the [`./tools/emr`](tools/emr) directory for details.
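
The README now documents the `${DATAGEN_VERSION}-${PLATFORM_VERSION}` tag pattern. A hedged example of pinning a published tag instead of `latest`, using the tag quoted in the README text:

```bash
docker pull ldbc/datagen-standalone:0.5.0-2.12_spark3.1
docker run --volume "$(pwd)"/out:/out ldbc/datagen-standalone:0.5.0-2.12_spark3.1 \
    --parallelism 1 -- --format csv --scale-factor 0.003 --mode interactive
```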

tools/docker-build.sh

Lines changed: 0 additions & 3 deletions
This file was deleted.

tools/docker-run.sh

Lines changed: 1 addition & 9 deletions
@@ -1,14 +1,6 @@
 #!/bin/bash

-[ ! -f target/ldbc_snb_datagen_${PLATFORM_VERSION}-${DATAGEN_VERSION}-jar-with-dependencies.jar ] && echo "target/ldbc_snb_datagen_${PLATFORM_VERSION}-${DATAGEN_VERSION}-jar-with-dependencies.jar does not exist, exiting" && exit 1
-
 # make sure that out directory exists and clean previously generated data
 mkdir -p out/
 rm -rf out/*
-docker run \
-    --env uid=`id -u` \
-    --volume `pwd`/out:/mnt/data \
-    --volume `pwd`/target/ldbc_snb_datagen_${PLATFORM_VERSION}-${DATAGEN_VERSION}-jar-with-dependencies.jar:/mnt/datagen.jar \
-    ldbc/spark \
-    --output-dir /mnt/data \
-    ${@} # pass arguments of this script to the submit.sh script (Docker entrypoint)
+docker run --volume `pwd`/out:/out ldbc/datagen-standalone:latest ${@}
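
After the simplification, the script only recreates `./out`, bind-mounts it at `/out`, and forwards all of its arguments to the container's entrypoint (`run.py`). Its effect is roughly equivalent to the following sketch, with the flags taken from the script itself:

```bash
# Roughly what `tools/docker-run.sh --parallelism 1 -- --mode bi --scale-factor 0.003` expands to:
mkdir -p out && rm -rf out/*
docker run --volume "$(pwd)"/out:/out ldbc/datagen-standalone:latest \
    --parallelism 1 -- --mode bi --scale-factor 0.003
```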

tools/emr/__init__.py

Whitespace-only changes.
