@@ -13,7 +13,7 @@
 
 import argparse
 
-from datagen.util import split_passthrough_args
+from datagen.util import KeyValue, split_passthrough_args
 
 min_num_workers = 1
 max_num_workers = 1000
@@ -28,7 +28,7 @@
     'platform_version': lib.platform_version,
     'version': lib.version,
     'az': 'us-west-2c',
-    'is_interactive': False,
+    'yes': False,
     'ec2_key': None,
     'emr_release': 'emr-5.31.0'
 }
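
Note: KeyValue is imported from datagen.util, but its definition is not part of this diff. From how it is used below (type=KeyValue on --conf, and dict(conf) inside submit_datagen_job), it presumably parses one 'key=value' token into a (key, value) pair, taking over from the removed parse_var/parse_vars helpers. A minimal sketch under that assumption:

    # Hypothetical sketch of datagen.util.KeyValue; not the actual implementation.
    # Used as an argparse type=: converts a single "key=value" token into a
    # (key, value) tuple, so a list of them can be fed straight to dict().
    import argparse

    def KeyValue(raw):
        key, sep, value = raw.partition('=')
        if not sep:
            raise argparse.ArgumentTypeError(f'expected KEY=VALUE, got {raw!r}')
        return key.strip(), value
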
@@ -70,21 +70,35 @@ def parse_mem(col):
     return {'vcpu': vcpu, 'mem': mem}
 
 
-def submit_datagen_job(name, sf,
-                       bucket=defaults['bucket'],
-                       use_spot=defaults['use_spot'],
-                       instance_type=defaults['instance_type'],
-                       sf_ratio=defaults['sf_ratio'],
-                       master_instance_type=defaults['master_instance_type'],
-                       az=defaults['az'],
-                       emr_release=defaults['emr_release'],
-                       platform_version=defaults['platform_version'],
-                       version=defaults['version'],
-                       is_interactive=defaults['is_interactive'],
-                       ec2_key=defaults['ec2_key'],
-                       passthrough_args=None,
-                       conf=None
+def submit_datagen_job(name,
+                       sf,
+                       format,
+                       mode,
+                       bucket,
+                       use_spot,
+                       instance_type,
+                       sf_ratio,
+                       master_instance_type,
+                       az,
+                       emr_release,
+                       platform_version,
+                       version,
+                       yes,
+                       ec2_key,
+                       conf,
+                       copy_filter,
+                       copy_all,
+                       passthrough_args, **kwargs
                        ):
+
+    is_interactive = (not yes) and hasattr(__main__, '__file__')
+
+    build_dir = '/ldbc_snb_datagen/build'
+
+    if not copy_filter:
+        copy_filter = f'.*{build_dir}/graphs/{format}/{mode}/.*'
+    else:
+        copy_filter = f'.*{build_dir}/{copy_filter}'
 
     exec_info = get_instance_info(instance_type)
 
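
Two things worth noting in the new body: hasattr(__main__, '__file__') is a standard way to detect whether the code is running as a script (an interactive interpreter's __main__ module has no __file__ attribute), and the copy filter is anchored under the build dir either way. A quick sketch of the resulting patterns, with illustrative format/mode values:

    # Illustrative values only (format='csv', mode='raw' are examples).
    build_dir = '/ldbc_snb_datagen/build'

    # Default, when --copy-filter is not given:
    default_filter = f'.*{build_dir}/graphs/csv/raw/.*'
    # -> '.*/ldbc_snb_datagen/build/graphs/csv/raw/.*'

    # With a user-supplied --copy-filter such as 'graphs/csv/raw/static/.*':
    custom_filter = f'.*{build_dir}/graphs/csv/raw/static/.*'
    # -> '.*/ldbc_snb_datagen/build/graphs/csv/raw/static/.*'
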
@@ -103,13 +117,9 @@ def submit_datagen_job(name, sf,
     spark_config = {
         'maximizeResourceAllocation': 'true',
         'spark.serializer': 'org.apache.spark.serializer.KryoSerializer',
-        **(conf if conf else {})
+        **(dict(conf) if conf else {})
     }
 
-    hdfs_prefix = '/ldbc_snb_datagen'
-
-    build_dir = f'{hdfs_prefix}/build'
-
     market = 'SPOT' if use_spot else 'ON_DEMAND'
 
     ec2_key_dict = {'Ec2KeyName': ec2_key} if ec2_key is not None else {}
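
The dict(conf) wrapper is needed because --conf now arrives as a list of parsed key-value pairs (one per KEY=VALUE token, see the KeyValue note above) rather than the dict that parse_vars used to build. Assuming KeyValue returns tuples, the merge looks like:

    # Assuming KeyValue('a=b') == ('a', 'b'); with nargs='+' argparse yields a list:
    conf = [('spark.driver.memory', '8g'), ('spark.executor.cores', '4')]

    spark_config = {
        'maximizeResourceAllocation': 'true',
        **(dict(conf) if conf else {}),  # duplicate keys override the base entries
    }
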
@@ -165,6 +175,8 @@ def submit_datagen_job(name, sf,
                 '--output-dir', build_dir,
                 '--scale-factor', str(sf),
                 '--num-threads', str(cluster_config['num_threads']),
+                '--mode', mode,
+                '--format', format,
                 *passthrough_args
             ]
         }
@@ -178,7 +190,8 @@ def submit_datagen_job(name, sf,
             'Jar': 'command-runner.jar',
             'Args': ['s3-dist-cp',
                      '--src', f'hdfs://{build_dir}',
-                     '--dest', f'{run_url}/social_network'
+                     '--dest', f'{run_url}/social_network',
+                     *(['--srcPattern', copy_filter] if not copy_all else [])
                      ]
         }
     }]
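
The conditional splat keeps --srcPattern out of the s3-dist-cp invocation entirely when --copy-all is set, so the whole build dir is copied; otherwise only paths matching the filter are. A sketch of the two shapes (paths illustrative):

    # Illustrative only: the two argument lists s3-dist-cp can receive.
    def dist_cp_args(build_dir, run_url, copy_filter, copy_all):
        return ['s3-dist-cp',
                '--src', f'hdfs://{build_dir}',
                '--dest', f'{run_url}/social_network',
                *(['--srcPattern', copy_filter] if not copy_all else [])]

    # copy_all=True  -> no --srcPattern: everything under --src is copied
    # copy_all=False -> [..., '--srcPattern', '.*/ldbc_snb_datagen/build/graphs/csv/raw/.*']
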
@@ -191,23 +204,6 @@ def submit_datagen_job(name, sf,
 
     emr.run_job_flow(**job_flow_args)
 
-def parse_var(s):
-    items = s.split('=')
-    key = items[0].strip()  # we remove blanks around keys, as is logical
-    if len(items) > 1:
-        # rejoin the rest:
-        value = '='.join(items[1:])
-    return (key, value)
-
-
-def parse_vars(items):
-    d = {}
-    if items:
-        for item in items:
-            key, value = parse_var(item)
-            d[key] = value
-    return d
-
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description='Submit a Datagen job to EMR')
@@ -216,7 +212,10 @@ def parse_vars(items):
                         help='name')
     parser.add_argument('sf', type=int,
                         help='scale factor (used to calculate cluster size)')
+    parser.add_argument('format', type=str, help='the required output format')
+    parser.add_argument('mode', type=str, help='output mode')
     parser.add_argument('--use-spot',
+                        default=defaults['use_spot'],
                         action='store_true',
                         help='Use SPOT workers')
     parser.add_argument('--az',
@@ -240,33 +239,31 @@ def parse_vars(items):
     parser.add_argument('--emr-release',
                         default=defaults['emr_release'],
                         help='The EMR release to use. E.g emr-5.31.0, emr-6.1.0')
-    parser.add_argument('-y',
+    parser.add_argument('-y', '--yes',
+                        default=defaults['yes'],
                         action='store_true',
                         help='Assume \'yes\' for prompts')
+    copy_args = parser.add_mutually_exclusive_group()
+    copy_args.add_argument('--copy-filter',
+                           type=str,
+                           help='A regular expression specifying filtering paths to copy from the build dir to S3. '
+                                'By default it is \'graphs/{format}/{mode}/.*\'')
+    copy_args.add_argument('--copy-all',
+                           action='store_true',
+                           help='Copy the complete build dir to S3')
     parser.add_argument("--conf",
                         metavar="KEY=VALUE",
                         nargs='+',
+                        type=KeyValue,
                         help="SparkConf as key=value pairs")
 
     parser.add_argument('--', nargs='*', help='Arguments passed to LDBC SNB Datagen', dest="arg")
 
-
-    self_args, child_args = split_passthrough_args()
+    self_args, passthrough_args = split_passthrough_args()
 
     args = parser.parse_args(self_args)
 
-    conf = parse_vars(args.conf)
-
-    is_interactive = hasattr(__main__, '__file__')
-
-    submit_datagen_job(args.name, args.sf,
-                       bucket=args.bucket, use_spot=args.use_spot, az=args.az,
-                       is_interactive=is_interactive and not args.y,
-                       instance_type=args.instance_type,
-                       emr_release=args.emr_release,
-                       ec2_key=args.ec2_key,
-                       platform_version=args.platform_version,
-                       version=args.version,
-                       passthrough_args=child_args,
-                       conf=conf
-                       )
+    submit_datagen_job(passthrough_args=passthrough_args,
+                       sf_ratio=defaults['sf_ratio'],
+                       master_instance_type=defaults['master_instance_type'],
+                       **args.__dict__)
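
Forwarding the whole namespace with **args.__dict__ works because the parser's destinations now match submit_datagen_job's parameter names; the trailing **kwargs absorbs any leftover entries (such as the dest="arg" placeholder for --). Everything after -- on the command line is handed to Datagen by split_passthrough_args, whose definition is also not in this diff; a sketch, assuming it splits sys.argv at the first literal '--':

    # Hypothetical sketch of datagen.util.split_passthrough_args; not the actual code.
    import sys

    def split_passthrough_args():
        argv = sys.argv[1:]
        if '--' in argv:
            i = argv.index('--')
            return argv[:i], argv[i + 1:]  # (own args, args passed through to Datagen)
        return argv, []

    # Example run (the Datagen flag after '--' is illustrative):
    #   ./submit_datagen_job.py my-run 100 csv raw --use-spot --copy-all -- --explicit-deletes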