@@ -81,6 +81,29 @@ def post_month_params(sample, lower_bound, upper_bound):
81
81
results .append ([[start_day , end_day ], count_sum ])
82
82
return results
83
83
84
def enumerate_path_bounds(minLength, maxLength, minDifference):
    """Enumerate ``[lower, upper]`` bound pairs for path-length parameters.

    Args:
        minLength: smallest lower bound to emit (inclusive).
        maxLength: exclusive limit for both the lower and the upper bound;
            no emitted value is ever equal to ``maxLength``.
        minDifference: minimum gap between the two bounds, i.e. every pair
            satisfies ``upper >= lower + minDifference``.

    Returns:
        A list of two-element lists ``[lower, upper]``, ordered by ``lower``
        and then by ``upper``. The list is empty when no pair fits.
    """
    return [
        [lower, upper]
        for lower in range(minLength, maxLength)
        for upper in range(lower + minDifference, maxLength)
    ]
90
+
91
def prob_language_codes():
    """Return the fixed list of language-code sets, with repeats.

    Each entry is a list of language codes. Some sets appear several times
    (1x ``ar``, 2x ``tk``, 8x ``uz``, 2x ``uz`` + ``tk``), so a consumer that
    walks or samples the list sees them in those proportions.

    Returns:
        A new list of 13 independent lists; mutating one entry does not
        affect any other entry or a later call.
    """
    # (language set, number of times it is repeated in the result)
    repeated_sets = (
        (["ar"], 1),
        (["tk"], 2),
        (["uz"], 8),
        (["uz", "tk"], 2),
    )
    # list(codes) gives every repetition its own list object, exactly as the
    # separate list literals in an explicit append loop would.
    return [list(codes) for codes, repeats in repeated_sets for _ in range(repeats)]
101
+
102
def prob_post_lengths():
    """Return the fixed list of post-length values.

    Returns:
        A new list ``[20, 40, 113, 97, 240]`` on every call.
    """
    return [20, 40, 113, 97, 240]
105
+
106
+
84
107
# def post_three_month_params(sample, lower_bound, upper_bound):
85
108
# results = []
86
109
# for ix in range(0, len(sample)/12):
@@ -197,24 +220,25 @@ def serializes_q15(outdir, countries):
197
220
for country , count in countries :
198
221
writer .append ([country ])
199
222
200
def serializes_q16(outdir, persons, tagclasses, countries, path_bounds):
    """Write the q16 parameter rows.

    One row is appended for every (tag class, country, path bound)
    combination, each paired with a person drawn pseudo-randomly from
    ``persons``. The random generator is re-seeded with a fixed value first,
    so the sequence of drawn persons is the same on every run.

    Args:
        outdir: forwarded to ``ParamsWriter`` as its first argument.
        persons: indexable sequence to draw from; the drawn element is
            written via ``str()``. Must be non-empty whenever at least one
            row is produced.
        tagclasses: iterable of ``(tag, count)`` pairs; the count is unused.
        countries: iterable of ``(country, count)`` pairs; the count is
            unused. It is iterated once per tag class.
        path_bounds: iterable of ``(minDist, maxDist)`` pairs; it is iterated
            once per (tag class, country) combination.
    """
    writer = ParamsWriter(outdir, "q16", ["person", "tag", "country", "minPathDistance", "maxPathDistance"])
    random.seed(1988 + 2)
    # random.randint includes BOTH endpoints, so the upper bound has to be the
    # last valid index; using len(persons) could index one past the end.
    last_index = len(persons) - 1
    for tag, _tag_count in tagclasses:
        for country, _country_count in countries:
            for min_dist, max_dist in path_bounds:
                person = persons[random.randint(0, last_index)]
                writer.append([str(person), tag, country, str(min_dist), str(max_dist)])
207
230
208
231
def serializes_q17(outdir, countries):
    """Write the q17 parameter rows: one single-column row per country.

    Args:
        outdir: forwarded to ``ParamsWriter`` as its first argument.
        countries: iterable of ``(country, count)`` pairs; only the country
            is written, the count is ignored.
    """
    writer = ParamsWriter(outdir, "q17", ["country"])
    for country_name, _post_count in countries:
        row = [country_name]
        writer.append(row)
212
235
213
def serializes_q18(outdir, post_weeks, lengths, languages):
    """Write the q18 parameter rows.

    A row is appended for every (week, length, language set) combination.

    Args:
        outdir: forwarded to ``ParamsWriter`` as its first argument.
        post_weeks: iterable of ``(week, count)`` pairs; the week is written
            via ``str()``, the count is ignored.
        lengths: iterable of length thresholds, written via ``str()``; it is
            iterated once per week.
        languages: iterable of language-code collections; each one is written
            as its codes joined with ``";"``. It is iterated once per
            (week, length) combination.
    """
    writer = ParamsWriter(outdir, "q18", ["date", "lengthThreshold", "languages"])
    for week, _post_count in post_weeks:
        for threshold in lengths:
            for codes in languages:
                row = [str(week), str(threshold), ";".join(codes)]
                writer.append(row)
218
242
219
243
def serializes_q19 (outdir , tagclasses ):
220
244
PERS_DATE = datetime .strptime ("1989-1-1" , "%Y-%m-%d" )
@@ -251,9 +275,19 @@ def serializes_q24(outdir, tagclasses):
251
275
for tagclass , count in tagclasses :
252
276
writer .append ([tagclass ])
253
277
254
def serialize_q25(outdir, persons, post_month_ranges):
    """Write the q25 parameter rows: pairs of distinct persons per date range.

    For every date range, ``min(len(persons), 10)`` rows are appended. Each
    row holds two different persons drawn pseudo-randomly from ``persons``
    plus the start and end of the range.

    Args:
        outdir: forwarded to ``ParamsWriter`` as its first argument.
        persons: indexable sequence to draw from; drawn elements are written
            via ``str()``. If it holds fewer than two distinct values no row
            can be formed and nothing is appended.
        post_month_ranges: iterable of ``(day_range, count_post)`` pairs,
            where ``day_range[0]`` / ``day_range[1]`` are written as the
            start / end date via ``str()``.
    """
    # ParamsWriter takes the output directory first, as in every other
    # serializer of this file.
    writer = ParamsWriter(outdir, "q25", ["person1Id", "person2Id", "startDate", "endDate"])

    # The rejection loop below needs two different values to terminate;
    # without them it would spin forever, so stop here instead.
    if not persons or all(candidate == persons[0] for candidate in persons):
        return

    pairs_per_range = min(len(persons), 10)
    last_index = len(persons) - 1  # random.randint is inclusive on both ends
    for day_range, count_post in post_month_ranges:
        for _ in range(pairs_per_range):
            person1Id = persons[random.randint(0, last_index)]
            # Redraw until the second person differs from the first.
            while True:
                person2Id = persons[random.randint(0, last_index)]
                if person2Id != person1Id:
                    break
            # NOTE(review): every other serializer here calls append() with a
            # single list; confirm ParamsWriter.append accepts this second
            # (per-column count) argument.
            writer.append([str(person1Id), str(person2Id), str(day_range[0]), str(day_range[1])],
                          [0, 0, count_post, count_post])
290
+
257
291
258
292
def add_months (sourcedate ,months ):
259
293
month = sourcedate .month - 1 + months
@@ -343,13 +377,17 @@ def main(argv=None):
343
377
post_upper_threshold = (total_posts / (non_empty_weeks / 4 ))* 1.2
344
378
post_months = post_month_params (week_posts , post_lower_threshold , post_upper_threshold )
345
379
380
+ path_bounds = enumerate_path_bounds (3 , 9 , 2 )
381
+ language_codes = prob_language_codes ()
382
+ post_lengths = prob_post_lengths ()
383
+
346
384
serializes_q2 (outdir , key_params (country_sample , total_posts / 200 , total_posts / 100 ), post_day_ranges ) # TODO determine constants
347
385
serializes_q3 (outdir , post_months )
348
386
serializes_q14 (outdir , post_month_params (week_posts , post_lower_threshold * 2 , post_upper_threshold * 2 ))
349
387
350
388
serializes_q1 (outdir , post_date_right_open_range_params (week_posts , 0.3 * total_posts , 0.6 * total_posts ))
351
389
serializes_q12 (outdir , post_date_right_open_range_params (week_posts , 0.3 * total_posts , 0.6 * total_posts ))
352
- serializes_q18 (outdir , post_date_right_open_range_params (week_posts , 0.3 * total_posts , 0.6 * total_posts ))
390
+ serializes_q18 (outdir , post_date_right_open_range_params (week_posts , 0.3 * total_posts , 0.6 * total_posts ), post_lengths , language_codes )
353
391
serializes_q10 (outdir , key_params (tag_posts , total_posts / 900 , total_posts / 600 ), post_date_right_open_range_params (week_posts , 0.3 * total_posts , 0.6 * total_posts ))
354
392
355
393
serializes_q4 (outdir , key_params (tagclass_posts , total_posts / 20 , total_posts / 10 ), key_params (country_sample , total_posts / 120 , total_posts / 70 ))
@@ -360,20 +398,17 @@ def main(argv=None):
360
398
serializes_q9 (outdir , key_params (tagclass_posts , 6000 , 25000 ))
361
399
serializes_q13 (outdir , key_params (country_sample , total_posts / 200 , total_posts / 100 ))
362
400
serializes_q15 (outdir , key_params (country_sample , total_posts / 200 , total_posts / 100 ))
363
- serializes_q16 (outdir , persons , key_params (tagclass_posts , total_posts / 30 , total_posts / 10 ), key_params (country_sample , total_posts / 80 , total_posts / 20 ))
401
+ serializes_q16 (outdir , persons , key_params (tagclass_posts , total_posts / 30 , total_posts / 10 ), key_params (country_sample , total_posts / 80 , total_posts / 20 ), path_bounds )
364
402
serializes_q17 (outdir , key_params (country_sample , total_posts / 200 , total_posts / 100 ))
365
403
serializes_q19 (outdir , key_params (tagclass_posts , total_posts / 60 , total_posts / 10 ))
366
404
serializes_q21 (outdir , key_params (country_sample , total_posts / 200 , total_posts / 100 ))
367
405
serializes_q22 (outdir , key_params (country_sample , total_posts / 120 , total_posts / 40 ))
368
406
serializes_q23 (outdir , key_params (country_sample , total_posts / 200 , total_posts / 100 ))
369
- serializes_q24 (outdir , key_params (tagclass_posts , total_posts / 140 , total_posts / 5 ))
407
+ serializes_q24 (outdir , key_params (tagclass_posts , total_posts / 140 , total_posts / 5 )) serialize_q25 ( outdir , persons , post_months )
370
408
371
409
# TODO: Refine
372
410
serializes_q20 (outdir , key_params (tagclass_posts , total_posts / 20 , total_posts / 2 ))
373
411
serializes_q11 (outdir , key_params (country_sample , total_posts / 80 , total_posts / 20 ), bad_words )
374
412
375
- # TODO: implement
376
- #serializes_q25(outdir, ...)
377
-
378
413
# Script entry point: run main() and use its return value as the process
# exit status (sys.exit does exactly this by raising SystemExit).
if __name__ == "__main__":
    raise SystemExit(main())
0 commit comments