@@ -84,69 +84,85 @@ export PYSPARK_PYTHON=python3
8484NUM_EXECUTORS=${NUM_EXECUTORS:- 16}
8585EXECUTOR_CONFIG=${EXECUTOR_CONFIG:- " r5.xlarge" }
8686# NOTE:
87- # - step 1 (host link extraction) can be run on smaller instances
88- # or "compute optimized" instance types
89- # - webgraph construction (esp. for merged graphs including multiple monthly crawls)
90- # needs instances with sufficient amount of RAM (32 GB or more)
91- # - assigning IDs in multiple partitions
87+ # The hardware requirements depend on the size of the webgraph. In addition,
88+ # - The first step (host link extraction) can be run on smaller
89+ # or "compute optimized" instances. The link extraction is highly
90+ # parallizable and has a low memory footprint.
91+ # - The second step, the webgraph construction, that is enumeration
92+ # of vertices and mapping of host-host links to numerical arcs requires
93+ # instances with sufficient amount of RAM (usually, 32 GB or more).
94+ # Especially, for merged graphs spanning over multiple monthly crawls.
95+ # The configuration assigns a smaller number of cores per executor,
96+ # see EXECUTOR_CORES_STEP2.
97+ # - Assigning the vertex IDs in multiple partitions
9298# (see hostlinks_to_graph.py --vertex_partitions)
93- # reduces the memory requirements significantly
99+ # reduces the memory requirements significantly.
94100
95101
96102case " $EXECUTOR_CONFIG " in
97103 c[5678]* .xlarge )
98104 EXECUTOR_CORES=3
105+ EXECUTOR_CORES_STEP2=2
99106 EXECUTOR_MEM=5g
100107 NODEMANAGER_MEM_MB=$(( 6 * 1024 ))
101108 ;;
102109 c[5678]* .2xlarge )
103110 EXECUTOR_CORES=6
111+ EXECUTOR_CORES_STEP2=4
104112 EXECUTOR_MEM=10g
105113 NODEMANAGER_MEM_MB=$(( 11 * 1024 ))
106114 ;;
107115 c[5678]* .4xlarge )
108116 EXECUTOR_CORES=12
117+ EXECUTOR_CORES_STEP2=8
109118 EXECUTOR_MEM=22g
110119 NODEMANAGER_MEM_MB=$(( 24 * 1024 ))
111120 ;;
112121 r[5678]* .xlarge )
113122 EXECUTOR_CORES=4
123+ EXECUTOR_CORES_STEP2=3
114124 EXECUTOR_MEM=23g
115125 NODEMANAGER_MEM_MB=$(( 24 * 1024 ))
116126 ;;
117127 r[5678]* .2xlarge )
118128 EXECUTOR_CORES=7
129+ EXECUTOR_CORES_STEP2=5
119130 EXECUTOR_MEM=46g
120131 NODEMANAGER_MEM_MB=$(( 48 * 1024 ))
121132 ;;
122133 r[5678]* .4xlarge )
123134 EXECUTOR_CORES=15
135+ EXECUTOR_CORES_STEP2=10
124136 EXECUTOR_MEM=94g
125137 NODEMANAGER_MEM_MB=$(( 96 * 1024 ))
126138 ;;
127139 r[5678]* .8xlarge )
128140 EXECUTOR_CORES=30
141+ EXECUTOR_CORES_STEP2=20
129142 EXECUTOR_MEM=190g
130143 NODEMANAGER_MEM_MB=$(( 192 * 1024 ))
131144 ;;
132145 m[5678]* .2xlarge )
133146 EXECUTOR_CORES=8
147+ EXECUTOR_CORES_STEP2=6
134148 EXECUTOR_MEM=23g
135149 NODEMANAGER_MEM_MB=$(( 24 * 1024 ))
136150 ;;
137151 m[5678]* .4xlarge )
138152 EXECUTOR_CORES=16
153+ EXECUTOR_CORES_STEP2=12
139154 EXECUTOR_MEM=46g
140155 NODEMANAGER_MEM_MB=$(( 48 * 1024 ))
141156 ;;
142157 m[5678]* .8xlarge )
143158 EXECUTOR_CORES=32
159+ EXECUTOR_CORES_STEP2=24
144160 EXECUTOR_MEM=94g
145161 NODEMANAGER_MEM_MB=$(( 98 * 1024 ))
146162 ;;
147163 " custom" )
148164 if [ -z " $EXECUTOR_CORES " ] || [ -z " $EXECUTOR_MEM " ]; then
149- echo " No valid custom executor configuration: must specify EXECUTOR_CORES and EXECUTOR_MEM'" >&2
165+ echo " No valid custom executor configuration: must specify EXECUTOR_CORES and EXECUTOR_MEM, eventually also EXECUTOR_CORES_STEP2 '" >&2
150166 exit 1
151167 fi
152168 ;;
@@ -155,11 +171,16 @@ case "$EXECUTOR_CONFIG" in
155171 exit 1
156172esac
157173
174+ if [ -z " $EXECUTOR_CORES_STEP2 " ]; then
175+ # fall-back definition of EXECUTOR_CORES_STEP2 : ceil( $EXECUTOR_CORES * 2/3 )
176+ EXECUTOR_CORES_STEP2=$(( (2 * EXECUTOR_CORES+ 2 )/ 3 ))
177+ fi
178+
158179SPARK_EXTRA_OPTS=" $SPARK_EXTRA_OPTS --conf spark.yarn.nodemanager.resource.memory-mb=$NODEMANAGER_MEM_MB "
159180
160181OUTPUT_PARTITIONS=$(( NUM_EXECUTORS* EXECUTOR_CORES/ 2 ))
161- WEBGRAPH_EDGE_PARTITIONS=$(( NUM_EXECUTORS* EXECUTOR_CORES / 2 ))
182+ WEBGRAPH_EDGE_PARTITIONS=$(( NUM_EXECUTORS* EXECUTOR_CORES_STEP 2 ))
162183WEBGRAPH_EDGE_PARTITIONS=$(( (WEBGRAPH_EDGE_PARTITIONS< NUM_EXECUTORS)? NUM_EXECUTORS: WEBGRAPH_EDGE_PARTITIONS))
163- WEBGRAPH_VERTEX_PARTITIONS=$(( NUM_EXECUTORS* EXECUTOR_CORES / 4 ))
184+ WEBGRAPH_VERTEX_PARTITIONS=$(( NUM_EXECUTORS* EXECUTOR_CORES_STEP 2 / 4 ))
164185WEBGRAPH_VERTEX_PARTITIONS=$(( (WEBGRAPH_VERTEX_PARTITIONS< NUM_EXECUTORS)? NUM_EXECUTORS: WEBGRAPH_VERTEX_PARTITIONS))
165186DIVISOR_INPUT_PARTITIONS=5
0 commit comments