
Commit b71278e
Merge branch 'main' into interactive-more-factor-tables
2 parents 64fe791 + d060baa

34 files changed: +518 −607 lines

.circleci/config.yml

Lines changed: 20 additions & 53 deletions
@@ -22,110 +22,77 @@ workflows:
 executors:
   my-executor:
     machine:
-      image: ubuntu-2004:202008-01
+      image: ubuntu-2204:2022.04.1
+      docker_layer_caching: true
     working_directory: ~/ldbc/ldbc_snb_datagen

 jobs:
   test:
     executor: my-executor
     environment:
-      PLATFORM_VERSION: 2.12_spark3.1
+      PLATFORM_VERSION: 2.12_spark3.2
       DATAGEN_VERSION: 0.5.0-SNAPSHOT
     steps:
       - checkout
-      - run:
-          name: Install dependencies
-          command: |
-            mkdir out/
-            DEBIAN_FRONTEND=noninteractive
-            sudo apt update
-            sudo apt install -y openjdk-8-jdk zip
-            sudo update-alternatives --install /usr/bin/java java /usr/lib/jvm/java-8-openjdk-amd64/bin/java 1
-            sudo update-alternatives --set java /usr/lib/jvm/java-8-openjdk-amd64/bin/java
       - run:
          name: Build Docker container
          command: |
-            docker build . -t ldbc/spark
-      - restore_cache:
-          keys:
-            - m2-dep-branch:{{ .Branch }}-pom:{{ checksum "pom.xml" }}-
-            - m2-dep-branch:dev-pom:{{ checksum "pom.xml" }}-
-            - m2-dep-branch:{{ .Branch }}-
-            - m2-dep-branch:dev-
-      - run:
-          name: Build JAR file
-          command: |
-            mvn -ntp clean test-compile assembly:assembly
-      - save_cache:
-          key: m2-dep-branch:{{ .Branch }}-pom:{{ checksum "pom.xml" }}-{{ epoch }}
-          paths:
-            - ~/.m2/repository # maven deps
+            docker build . --target=standalone --tag ldbc/datagen-standalone:latest
       # BI
-      - run:
+      - run:
          name: Generate SF0.003 / BI / singular-projected CSVs
          command: |
-            tools/docker-run.sh --mode bi --scale-factor 0.003 --explode-edges --explode-attrs
+            tools/docker-run.sh -- --mode bi --scale-factor 0.003 --explode-edges --explode-attrs
            mv out/ social-network-sf0.003-bi-singular-projected-fk/
       - run:
          name: Generate SF0.003 / BI / singular-merged CSVs
          command: |
-            tools/docker-run.sh --mode bi --scale-factor 0.003 --explode-attrs
+            tools/docker-run.sh -- --mode bi --scale-factor 0.003 --explode-attrs
            mv out/ social-network-sf0.003-bi-singular-merged-fk/
       - run:
          name: Generate SF0.003 / BI / composite-projected CSVs
          command: |
-            tools/docker-run.sh --mode bi --scale-factor 0.003 --explode-edges
+            tools/docker-run.sh -- --mode bi --scale-factor 0.003 --explode-edges
            mv out/ social-network-sf0.003-bi-composite-projected-fk/
       - run:
-          name: Generate SF0.003 / BI / composite-merged CSVs
+          name: Generate SF0.003 / BI / composite-merged CSVs, generate factors
          command: |
            # we generate factors here but they are moved to a separate archive (social-network-sf0.003-bi-factors.zip)
-            tools/docker-run.sh --mode bi --scale-factor 0.003 --generate-factors
+            tools/docker-run.sh -- --mode bi --scale-factor 0.003 --generate-factors
            mv out/ social-network-sf0.003-bi-composite-merged-fk/
       - run:
          name: Generate SF0.003 / BI / compressed composite-merged CSVs for Postgres
          command: |
-            tools/docker-run.sh --mode bi --scale-factor 0.003 --format-options compression=gzip
+            tools/docker-run.sh -- --mode bi --scale-factor 0.003 --format-options compression=gzip
            mv out/ social-network-sf0.003-bi-composite-merged-fk-postgres-compressed/
       - run:
          name: Generate SF0.003 / BI / composite-projected CSVs for Neo4j
          command: |
-            tools/docker-run.sh --mode bi --scale-factor 0.003 --explode-edges --format-options header=false,quoteAll=true
+            tools/docker-run.sh -- --mode bi --scale-factor 0.003 --explode-edges --format-options header=false,quoteAll=true
            mv out/ social-network-sf0.003-bi-composite-projected-fk-neo4j/
       - run:
          name: Generate SF0.003 / BI / compressed composite-projected CSVs for Neo4j
          command: |
-            tools/docker-run.sh --mode bi --scale-factor 0.003 --explode-edges --format-options header=false,quoteAll=true,compression=gzip
+            tools/docker-run.sh -- --mode bi --scale-factor 0.003 --explode-edges --format-options header=false,quoteAll=true,compression=gzip
            mv out/ social-network-sf0.003-bi-composite-projected-fk-neo4j-compressed/
-      # Interactive
-      - run:
-          name: Generate SF0.003 / Interactive / singular-projected CSVs
-          command: |
-            tools/docker-run.sh --mode interactive --scale-factor 0.003 --explode-edges --explode-attrs
-            mv out/ social-network-sf0.003-interactive-singular-projected-fk/
-      - run:
-          name: Generate SF0.003 / Interactive / singular-merged CSVs
-          command: |
-            tools/docker-run.sh --mode interactive --scale-factor 0.003 --explode-attrs
-            mv out/ social-network-sf0.003-interactive-singular-merged-fk/
       - run:
-          name: Generate SF0.003 / Interactive / composite-projected CSVs
+          name: Generate SF0.003 / BI / compressed composite-projected CSVs for Neo4j with epoch milli timestamps
          command: |
-            tools/docker-run.sh --mode interactive --scale-factor 0.003 --explode-edges
-            mv out/ social-network-sf0.003-interactive-composite-projected-fk/
+            tools/docker-run.sh -- --mode bi --scale-factor 0.003 --explode-edges --epoch-millis --format-options header=false,quoteAll=true,compression=gzip
+            mv out/ social-network-sf0.003-bi-composite-projected-fk-neo4j-compressed-epoch-millis/
       - run:
-          name: Generate SF0.003 / Interactive / composite-merged CSVs
+          name: Generate SF0.003 / BI / Parquet
          command: |
-            tools/docker-run.sh --mode interactive --scale-factor 0.003
-            mv out/ social-network-sf0.003-interactive-composite-merged-fk/
+            tools/docker-run.sh -- --mode bi --scale-factor 0.003 --format parquet
+            mv out/ social-network-sf0.003-bi-parquet
       - run:
          name: Compress directories and prepare for deployment
          command: |
            # include the CircleCI configuration in the deployed package to provide the 'filters' instructions (and prevent failed builds on the gh-pages branch)
            mv .circleci dist/
            # move factors to a separate directory
            mkdir social-network-sf0.003-bi-factors
-            mv social-network-sf0.003-bi-composite-merged-fk/factors social-network-sf0.003-bi-factors/factors
+            cp -r social-network-sf0.003-bi-composite-merged-fk/factors social-network-sf0.003-bi-factors/factors
            # compress each directory
            for d in social-network-sf0.003*; do
              echo "Generated with <https://github.com/ldbc/ldbc_snb_datagen_spark/commit/${CIRCLE_SHA1}>" > $d/README.md

.dockerignore

Lines changed: 6 additions & 8 deletions
@@ -1,15 +1,13 @@
-tools/*
-
-target/*
+**/*.egg-info
+**/target/*
 
 *.swp
 *.crc
 *.log
-
-*.iml
-.travis.yml
+**/*.iml
 .idea/*
-.gitignore
+**/.gitignore
 .git/*
-
+.circleci
+**/.bloop
 Dockerfile

.editorconfig

Lines changed: 20 additions & 0 deletions
@@ -0,0 +1,20 @@
+root = true
+
+# Unix-style newlines with a newline ending every file
+[*]
+end_of_line = lf
+insert_final_newline = true
+indent_style = space
+indent_size = 2
+trim_trailing_whitespace = true
+charset = utf-8
+
+[*.py]
+indent_size = 4
+
+
+[*.java]
+indent_size = 4
+
+[{Makefile,*.mk}]
+indent_style = tab

.gitignore

Lines changed: 1 addition & 1 deletion
@@ -18,7 +18,6 @@ target/
 test_data/
 substitution_parameters/
 *.pyc
-scripts/
 *.iml
 /hadoop/
 /social_network/
@@ -208,3 +207,4 @@ dmypy.json
 .prof
 
 # End of https://www.toptal.com/developers/gitignore/api/python
+.bsp/

Dockerfile

Lines changed: 35 additions & 23 deletions
@@ -1,27 +1,39 @@
-FROM bde2020/spark-master:3.1.1-hadoop3.2
+FROM eclipse-temurin:8 as build-jar
+
+RUN apt-get update
+RUN apt-get install -y curl
+
+ARG SBT_VERSION=1.5.2
+RUN cd /opt && curl -fSsL https://github.com/sbt/sbt/releases/download/v${SBT_VERSION}/sbt-${SBT_VERSION}.tgz | tar xvz
+ENV PATH=/opt/sbt/bin:$PATH
+WORKDIR build
+COPY build.sbt build.sbt
+COPY project project
+RUN sbt update
+COPY src src
+RUN sbt assembly
+
+FROM scratch as jar
+COPY --from=build-jar /build/target/ldbc_snb_datagen_*-jar-with-dependencies.jar /jar
+
+FROM python:3.7-slim as build-tools
+RUN pip install --no-cache virtualenv && virtualenv -p python3.7 /tools
+COPY tools build
+WORKDIR build
+RUN . /tools/bin/activate && pip install .
+
+FROM python:3.7-slim as tools
+COPY --from=build-tools /tools /tools
+
+FROM bde2020/spark-master:3.2.1-hadoop3.2 as standalone
+COPY --from=jar /jar /jar
+COPY --from=tools /tools /tools
+RUN ln -sf /usr/bin/python3 /tools/bin/python
 
-ENV GOSU_VERSION 1.12
-
-RUN apk add --no-cache su-exec
-RUN apk add shadow
-RUN [ -d /var/mail ] || mkdir /var/mail
-
-VOLUME /mnt/datagen.jar /mnt/params.ini /mnt/data
-
-WORKDIR /mnt/data
-
-# adjust these environment variables
 ENV TEMP_DIR /tmp
-ENV EXECUTOR_MEMORY "1G"
-ENV DRIVER_MEMORY "5G"
-
-# the SPARK_* variables are used by submit.sh to configure the Spark job
 ENV SPARK_LOCAL_DIRS ${TEMP_DIR}
-ENV SPARK_SUBMIT_ARGS --executor-memory ${EXECUTOR_MEMORY} --driver-memory ${DRIVER_MEMORY}
-ENV SPARK_APPLICATION_MAIN_CLASS ldbc.snb.datagen.LdbcDatagen
-ENV SPARK_MASTER_URL local[*]
-ENV SPARK_APPLICATION_JAR_LOCATION /mnt/datagen.jar
-
-COPY submit.sh /
+ENV PATH=/tools/bin:/spark/bin:$PATH
+ENV LDBC_SNB_DATAGEN_JAR=/jar
 
-ENTRYPOINT ["/bin/bash", "/submit.sh"]
+WORKDIR /
+ENTRYPOINT ["run.py"]
