Add more details/sample command to EMR script

szarnyasg · szarnyasg · commit dc3a5448ccd7 · 2022-03-15T23:19:24.000+01:00
diff --git a/tools/emr/README.md b/tools/emr/README.md
@@ -41,7 +41,13 @@ aws s3 cp target/ldbc_snb_datagen_${PLATFORM_VERSION}-${DATAGEN_VERSION}-jar-wit
 ```bash
 JOB_NAME=MyTest
 SCALE_FACTOR=10
-./tools/emr/submit_datagen_job.py --bucket ${BUCKET_NAME} ${JOB_NAME} ${SCALE_FACTOR} csv raw
+./tools/emr/submit_datagen_job.py \
+    --bucket \
+    ${BUCKET_NAME} \
+    ${JOB_NAME} \
+    ${SCALE_FACTOR} \
+    csv \
+    raw
 ```
 
 Note: scale factors below 1 are not supported.
@@ -51,7 +57,38 @@ Note: scale factors below 1 are not supported.
 To use spot instances, add the `--use-spot` argument:
 
 ```bash
-./tools/emr/submit_datagen_job.py --use-spot --bucket ${BUCKET_NAME} ${JOB_NAME} ${SCALE_FACTOR} csv raw
+./tools/emr/submit_datagen_job.py \
+    --use-spot \
+    --bucket \
+    ${BUCKET_NAME} \
+    ${JOB_NAME} \
+    ${SCALE_FACTOR} \
+    csv \
+    raw
+```
+
+### Sample command
+
+Generate the BI data set with the following configuration:
+
+* use spot instances,
+* write the output in the `csv-composite-projected-fk` format (`--explode-edges`),
+* compress CSVs with `gzip`, and
+* generate factors.
+
+```bash
+./tools/emr/submit_datagen_job.py \
+    --use-spot \
+    --bucket \
+    ${BUCKET_NAME} \
+    ${JOB_NAME} \
+    ${SCALE_FACTOR} \
+    csv \
+    bi \
+    -- \
+    --explode-edges \
+    --format-options compression=gzip \
+    --generate-factors
 ```
 
 ### Using a different Spark / EMR version
@@ -61,7 +98,14 @@ Make sure that you have uploaded the right JAR first.
 
 ```bash
 PLATFORM_VERSION=2.12_spark3.1
-./tools/emr/submit_datagen_job.py --bucket ${BUCKET_NAME} --platform-version ${PLATFORM_VERSION} --emr-release emr-6.2.0 ${JOB_NAME} ${SCALE_FACTOR} csv raw
+./tools/emr/submit_datagen_job.py \
+    --bucket ${BUCKET_NAME} \
+    --platform-version ${PLATFORM_VERSION} \
+    --emr-release emr-6.2.0 \
+    ${JOB_NAME} \
+    ${SCALE_FACTOR} \
+    csv \
+    raw
 ```
 
 ### Using a parameter file