import __main__

from math import ceil
-from botocore.credentials import subprocess
from datagen import lib, util
-import subprocess

import argparse

from datagen.util import KeyValue, split_passthrough_args

min_num_workers = 1
max_num_workers = 1000
+min_num_threads = 1

defaults = {
    'bucket': 'ldbc-snb-datagen-store',
    'use_spot': True,
    'master_instance_type': 'r6gd.2xlarge',
-    'instance_type': 'r6gd.4xlarge',
-    'sf_ratio': 100.0,  # ratio of SFs and machines. a ratio of 250.0 for SF1000 yields 4 machines
+    'instance_type': 'i3.4xlarge',
+    'sf_per_executor': 3e3,
+    'sf_per_partition': 10,
    'az': 'us-west-2c',
    'yes': False,
    'ec2_key': None,

ec2_instances = [dict(row) for row in reader]


-def calculate_cluster_config(scale_factor, sf_ratio, vcpu):
-    num_workers = max(min_num_workers, min(max_num_workers, ceil(scale_factor / sf_ratio)))
-    num_threads = ceil(num_workers * vcpu * 2)
-    return {
-        'num_workers': num_workers,
-        'num_threads': num_threads
-    }
-
-
def get_instance_info(instance_type):
    def parse_vcpu(col):
        return int(re.search(r'(\d+) .*', col).group(1))
@@ -76,7 +67,10 @@ def submit_datagen_job(name,
                       jar,
                       use_spot,
                       instance_type,
-                       sf_ratio,
+                       executors,
+                       sf_per_executor,
+                       partitions,
+                       sf_per_partition,
                       master_instance_type,
                       az,
                       emr_release,
@@ -97,10 +91,6 @@ def submit_datagen_job(name,
    else:
        copy_filter = f'.*{build_dir}/{copy_filter}'

-    exec_info = get_instance_info(instance_type)
-
-    cluster_config = calculate_cluster_config(sf, sf_ratio, exec_info['vcpu'])
-
    emr = boto3.client('emr')

    ts = datetime.utcnow()
@@ -115,8 +105,15 @@ def submit_datagen_job(name,
        'maximizeResourceAllocation': 'true'
    }

+    if executors is None:
+        executors = max(min_num_workers, min(max_num_workers, ceil(sf / sf_per_executor)))
+
+    if partitions is None:
+        partitions = max(min_num_threads, ceil(sf / sf_per_partition))
+
    spark_defaults_config = {
        'spark.serializer': 'org.apache.spark.serializer.KryoSerializer',
+        'spark.default.parallelism': str(partitions),
        **(dict(conf) if conf else {})
    }

@@ -157,7 +154,7 @@ def submit_datagen_job(name,
                    'Market': market,
                    'InstanceRole': 'CORE',
                    'InstanceType': instance_type,
-                    'InstanceCount': cluster_config['num_workers'],
+                    'InstanceCount': executors,
                }
            ],
            **ec2_key_dict,
@@ -178,7 +175,7 @@ def submit_datagen_job(name,
            'Args': ['spark-submit', '--class', lib.main_class, jar_url,
                     '--output-dir', build_dir,
                     '--scale-factor', str(sf),
-                     '--num-threads', str(cluster_config['num_threads']),
+                     '--num-threads', str(partitions),
                     '--mode', mode,
                     '--format', format,
                     *passthrough_args
@@ -263,6 +260,26 @@ def submit_datagen_job(name,
                        nargs='+',
                        action=KeyValue,
                        help="SparkConf as key=value pairs")
+    executor_args = parser.add_mutually_exclusive_group()
+    executor_args.add_argument("--executors",
+                               type=int,
+                               help="Total number of Spark executors."
+                               )
+    executor_args.add_argument("--sf-per-executor",
+                               type=float,
+                               default=defaults['sf_per_executor'],
+                               help=f"Number of scale factors per Spark executor. Default: {defaults['sf_per_executor']}"
+                               )
+    partitioning_args = parser.add_mutually_exclusive_group()
+    partitioning_args.add_argument("--partitions",
+                                   type=int,
+                                   help="Total number of Spark partitions to use when generating the dataset."
+                                   )
+    partitioning_args.add_argument("--sf-per-partition",
+                                   type=float,
+                                   default=defaults['sf_per_partition'],
+                                   help=f"Number of scale factors per Spark partition. Default: {defaults['sf_per_partition']}"
+                                   )

    parser.add_argument('--', nargs='*', help='Arguments passed to LDBC SNB Datagen', dest="arg")

@@ -271,6 +288,5 @@ def submit_datagen_job(name,
    args = parser.parse_args(self_args)

    submit_datagen_job(passthrough_args=passthrough_args,
-                      sf_ratio=defaults['sf_ratio'],
                       master_instance_type=defaults['master_instance_type'],
                       **args.__dict__)
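
The net effect of the patch is that cluster sizing no longer goes through calculate_cluster_config (scale factor, sf_ratio and the instance type's vCPU count): when --executors or --partitions is omitted, each falls back to the scale factor divided by sf_per_executor (3000) or sf_per_partition (10). Below is a minimal, self-contained sketch of that fallback; the constants mirror the diff, while the helper name default_sizing and the example scale factors are illustrative only, not part of the patch.

from math import ceil

# Bounds and ratios as used by this patch (illustrative standalone copy).
min_num_workers = 1
max_num_workers = 1000
min_num_threads = 1
sf_per_executor = 3e3
sf_per_partition = 10

def default_sizing(sf, executors=None, partitions=None):
    # Fallback applied when --executors / --partitions are not given on the CLI.
    if executors is None:
        executors = max(min_num_workers, min(max_num_workers, ceil(sf / sf_per_executor)))
    if partitions is None:
        partitions = max(min_num_threads, ceil(sf / sf_per_partition))
    return executors, partitions

print(default_sizing(10_000))  # (4, 1000): 4 CORE instances, --num-threads 1000
print(default_sizing(100))     # (1, 10)

The same partition count is passed both as datagen's --num-threads argument and as spark.default.parallelism, and the new CLI options come in mutually exclusive pairs (--executors vs. --sf-per-executor, --partitions vs. --sf-per-partition), so an explicit count always overrides the ratio-based default.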