Skip to content

Commit aa74250

Browse files
Refactor host graph Spark configuration for larger graphs
- Adapt the Spark configuration for building the host graphs to larger graphs (more edges), avoiding out-of-memory errors - Add separate configuration of executor cores to run step 2 (merge from-to host link pairs and construct numeric graph) - Assign less cores per executor running step 2 - Increase number of edge partitions
1 parent 02c1eb3 commit aa74250

File tree

2 files changed

+31
-10
lines changed

2 files changed

+31
-10
lines changed

src/script/hostgraph/build_hostgraph.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -365,7 +365,7 @@ if [ -n "MERGE_NAME" ]; then
365365
--conf spark.io.compression.codec=zstd \
366366
--conf spark.checkpoint.compress=true \
367367
--num-executors $NUM_EXECUTORS \
368-
--executor-cores $EXECUTOR_CORES \
368+
--executor-cores $EXECUTOR_CORES_STEP2 \
369369
--executor-memory $EXECUTOR_MEM \
370370
--conf spark.sql.warehouse.dir=$WAREHOUSE_DIR \
371371
--conf spark.sql.parquet.compression.codec=zstd \

src/script/hostgraph/hostgraph_config.sh

Lines changed: 30 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -84,69 +84,85 @@ export PYSPARK_PYTHON=python3
8484
NUM_EXECUTORS=${NUM_EXECUTORS:-16}
8585
EXECUTOR_CONFIG=${EXECUTOR_CONFIG:-"r5.xlarge"}
8686
# NOTE:
87-
# - step 1 (host link extraction) can be run on smaller instances
88-
# or "compute optimized" instance types
89-
# - webgraph construction (esp. for merged graphs including multiple monthly crawls)
90-
# needs instances with sufficient amount of RAM (32 GB or more)
91-
# - assigning IDs in multiple partitions
87+
# The hardware requirements depend on the size of the webgraph. In addition,
88+
# - The first step (host link extraction) can be run on smaller
89+
# or "compute optimized" instances. The link extraction is highly
90+
# parallizable and has a low memory footprint.
91+
# - The second step, the webgraph construction, that is enumeration
92+
# of vertices and mapping of host-host links to numerical arcs requires
93+
# instances with sufficient amount of RAM (usually, 32 GB or more).
94+
# Especially, for merged graphs spanning over multiple monthly crawls.
95+
# The configuration assigns a smaller number of cores per executor,
96+
# see EXECUTOR_CORES_STEP2.
97+
# - Assigning the vertex IDs in multiple partitions
9298
# (see hostlinks_to_graph.py --vertex_partitions)
93-
# reduces the memory requirements significantly
99+
# reduces the memory requirements significantly.
94100

95101

96102
case "$EXECUTOR_CONFIG" in
97103
c[5678]*.xlarge )
98104
EXECUTOR_CORES=3
105+
EXECUTOR_CORES_STEP2=2
99106
EXECUTOR_MEM=5g
100107
NODEMANAGER_MEM_MB=$((6*1024))
101108
;;
102109
c[5678]*.2xlarge )
103110
EXECUTOR_CORES=6
111+
EXECUTOR_CORES_STEP2=4
104112
EXECUTOR_MEM=10g
105113
NODEMANAGER_MEM_MB=$((11*1024))
106114
;;
107115
c[5678]*.4xlarge )
108116
EXECUTOR_CORES=12
117+
EXECUTOR_CORES_STEP2=8
109118
EXECUTOR_MEM=22g
110119
NODEMANAGER_MEM_MB=$((24*1024))
111120
;;
112121
r[5678]*.xlarge )
113122
EXECUTOR_CORES=4
123+
EXECUTOR_CORES_STEP2=3
114124
EXECUTOR_MEM=23g
115125
NODEMANAGER_MEM_MB=$((24*1024))
116126
;;
117127
r[5678]*.2xlarge )
118128
EXECUTOR_CORES=7
129+
EXECUTOR_CORES_STEP2=5
119130
EXECUTOR_MEM=46g
120131
NODEMANAGER_MEM_MB=$((48*1024))
121132
;;
122133
r[5678]*.4xlarge )
123134
EXECUTOR_CORES=15
135+
EXECUTOR_CORES_STEP2=10
124136
EXECUTOR_MEM=94g
125137
NODEMANAGER_MEM_MB=$((96*1024))
126138
;;
127139
r[5678]*.8xlarge )
128140
EXECUTOR_CORES=30
141+
EXECUTOR_CORES_STEP2=20
129142
EXECUTOR_MEM=190g
130143
NODEMANAGER_MEM_MB=$((192*1024))
131144
;;
132145
m[5678]*.2xlarge )
133146
EXECUTOR_CORES=8
147+
EXECUTOR_CORES_STEP2=6
134148
EXECUTOR_MEM=23g
135149
NODEMANAGER_MEM_MB=$((24*1024))
136150
;;
137151
m[5678]*.4xlarge )
138152
EXECUTOR_CORES=16
153+
EXECUTOR_CORES_STEP2=12
139154
EXECUTOR_MEM=46g
140155
NODEMANAGER_MEM_MB=$((48*1024))
141156
;;
142157
m[5678]*.8xlarge )
143158
EXECUTOR_CORES=32
159+
EXECUTOR_CORES_STEP2=24
144160
EXECUTOR_MEM=94g
145161
NODEMANAGER_MEM_MB=$((98*1024))
146162
;;
147163
"custom" )
148164
if [ -z "$EXECUTOR_CORES" ] || [ -z "$EXECUTOR_MEM" ]; then
149-
echo "No valid custom executor configuration: must specify EXECUTOR_CORES and EXECUTOR_MEM'" >&2
165+
echo "No valid custom executor configuration: must specify EXECUTOR_CORES and EXECUTOR_MEM, eventually also EXECUTOR_CORES_STEP2'" >&2
150166
exit 1
151167
fi
152168
;;
@@ -155,11 +171,16 @@ case "$EXECUTOR_CONFIG" in
155171
exit 1
156172
esac
157173

174+
if [ -z "$EXECUTOR_CORES_STEP2" ]; then
175+
# fall-back definition of EXECUTOR_CORES_STEP2 : ceil( $EXECUTOR_CORES * 2/3 )
176+
EXECUTOR_CORES_STEP2=$(((2*EXECUTOR_CORES+2)/3))
177+
fi
178+
158179
SPARK_EXTRA_OPTS="$SPARK_EXTRA_OPTS --conf spark.yarn.nodemanager.resource.memory-mb=$NODEMANAGER_MEM_MB"
159180

160181
OUTPUT_PARTITIONS=$((NUM_EXECUTORS*EXECUTOR_CORES/2))
161-
WEBGRAPH_EDGE_PARTITIONS=$((NUM_EXECUTORS*EXECUTOR_CORES/2))
182+
WEBGRAPH_EDGE_PARTITIONS=$((NUM_EXECUTORS*EXECUTOR_CORES_STEP2))
162183
WEBGRAPH_EDGE_PARTITIONS=$(((WEBGRAPH_EDGE_PARTITIONS<NUM_EXECUTORS)?NUM_EXECUTORS:WEBGRAPH_EDGE_PARTITIONS))
163-
WEBGRAPH_VERTEX_PARTITIONS=$((NUM_EXECUTORS*EXECUTOR_CORES/4))
184+
WEBGRAPH_VERTEX_PARTITIONS=$((NUM_EXECUTORS*EXECUTOR_CORES_STEP2/4))
164185
WEBGRAPH_VERTEX_PARTITIONS=$(((WEBGRAPH_VERTEX_PARTITIONS<NUM_EXECUTORS)?NUM_EXECUTORS:WEBGRAPH_VERTEX_PARTITIONS))
165186
DIVISOR_INPUT_PARTITIONS=5

0 commit comments

Comments
 (0)