@@ -30,8 +30,8 @@ def format_date(date):
30
30
# self.files[i].write(param+"\n")
31
31
32
32
class ParamsWriter :
33
- def __init__ (self , outdir , name , param_names ):
34
- self .file = codecs .open (outdir + "/" + name + "_param.txt" , "w" ,encoding = "utf-8" )
33
+ def __init__ (self , outdir , number , param_names ):
34
+ self .file = codecs .open (outdir + "/bi_" + str ( number ) + "_param.txt" , "w" ,encoding = "utf-8" )
35
35
for i in range (0 ,len (param_names )):
36
36
if i > 0 :
37
37
self .file .write ("|" )
@@ -81,6 +81,29 @@ def post_month_params(sample, lower_bound, upper_bound):
81
81
results .append ([[start_day , end_day ], count_sum ])
82
82
return results
83
83
84
def enumerate_path_bounds(minLength, maxLength, minDifference):
    """Return every ``[lower, upper]`` pair of path-length bounds.

    ``lower`` runs over ``range(minLength, maxLength)`` and, for each
    ``lower``, ``upper`` runs over
    ``range(lower + minDifference, maxLength)``.  ``maxLength`` itself is
    therefore never produced as a bound.  Pairs are ordered by ``lower``
    first, then by ``upper``.
    """
    return [
        [lower, upper]
        for lower in range(minLength, maxLength)
        for upper in range(lower + minDifference, maxLength)
    ]
90
+
91
def prob_language_codes():
    """Return a fixed list of language-code sets with repeated entries.

    The list contains ``["ar"]`` once, ``["tk"]`` twice, ``["uz"]`` eight
    times and ``["uz", "tk"]`` twice, in that order.  Every element is a
    separate list object.
    """
    # (language set, number of repetitions) in output order.
    repetitions = (
        (("ar",), 1),
        (("tk",), 2),
        (("uz",), 8),
        (("uz", "tk"), 2),
    )
    return [
        list(codes)
        for codes, times in repetitions
        for _ in range(0, times)
    ]
101
+
102
def prob_post_lengths():
    """Return the fixed list of post-length values ``[20, 40, 113, 97, 240]``.

    A new list is built on every call, so callers may mutate the result.
    """
    return [20, 40, 113, 97, 240]
105
+
106
+
84
107
# def post_three_month_params(sample, lower_bound, upper_bound):
85
108
# results = []
86
109
# for ix in range(0, len(sample)/12):
@@ -101,66 +124,69 @@ def key_params(sample, lower_bound, upper_bound):
101
124
results .append ([key , count ])
102
125
return results
103
126
104
def serialize_q1(outdir, post_weeks):
    """Write the parameter file for query 1 (single ``date`` column).

    ``post_weeks`` is iterated as ``(week, count)`` pairs; one row holding
    ``str(week)`` is appended per pair and the count is ignored.
    """
    writer = ParamsWriter(outdir, 1, ["date"])
    for week_start, _post_count in post_weeks:
        writer.append([str(week_start)])
108
131
109
def serialize_q2(outdir, countries, post_day_ranges):
    """Write the parameter file for query 2.

    Columns are ``date1``, ``date2``, ``country1``, ``country2``.  For each
    ``(day_range, count)`` entry of ``post_day_ranges`` one row is appended
    for every unordered pair of countries (each country combined with the
    ones that follow it in ``countries``).  The counts carried in both
    inputs are ignored.
    """
    writer = ParamsWriter(outdir, 2, ["date1", "date2", "country1", "country2"])
    for span, _span_posts in post_day_ranges:
        for position, (first_country, _first_count) in enumerate(countries):
            # Only pair with later entries so each pair appears once.
            for second_country, _second_count in countries[position + 1:]:
                writer.append([str(span[0]), str(span[1]), first_country, second_country])
116
139
117
def serialize_q3(outdir, post_months):
    """Write the parameter file for query 3 (``year`` and ``month`` columns).

    For every entry of ``post_months`` the value at ``entry[0][0]`` is
    divided by 1000 and passed to ``time.gmtime``; the resulting year and
    month are written as one row.
    """
    writer = ParamsWriter(outdir, 3, ["year", "month"])
    for entry in post_months:
        # presumably entry[0][0] is the range start in epoch milliseconds
        # -- TODO confirm against post_month_params
        range_start = entry[0][0]
        moment = time.gmtime(range_start / 1000)
        writer.append([str(moment.tm_year), str(moment.tm_mon)])
120
145
121
def serialize_q4(outdir, tagclasses, countries):
    """Write the parameter file for query 4 (``tagClass``, ``country``).

    Appends one row for every combination of a tag class and a country.
    Both inputs are iterated as ``(key, count)`` pairs; counts are ignored.
    """
    writer = ParamsWriter(outdir, 4, ["tagClass", "country"])
    for tag_class, _tag_class_count in tagclasses:
        for country_name, _country_count in countries:
            writer.append([tag_class, country_name])
126
151
127
def serialize_q5(outdir, countries):
    """Write the parameter file for query 5 (single ``country`` column).

    One row is appended per ``(country, count)`` pair; the count is ignored.
    """
    writer = ParamsWriter(outdir, 5, ["country"])
    for country_name, _country_count in countries:
        writer.append([country_name])
131
156
132
157
133
def serialize_q6(outdir, tags):
    """Write the parameter file for query 6 (single ``tag`` column).

    One row is appended per ``(tag, count)`` pair; the count is ignored.
    """
    writer = ParamsWriter(outdir, 6, ["tag"])
    for tag_name, _tag_count in tags:
        writer.append([tag_name])
137
162
138
def serialize_q7(outdir, tags):
    """Write the parameter file for query 7 (single ``tag`` column).

    One row is appended per ``(tag, count)`` pair; the count is ignored.
    """
    writer = ParamsWriter(outdir, 7, ["tag"])
    for tag_name, _tag_count in tags:
        writer.append([tag_name])
142
167
143
def serialize_q8(outdir, tags):
    """Write the parameter file for query 8 (single ``tag`` column).

    One row is appended per ``(tag, count)`` pair; the count is ignored.
    """
    writer = ParamsWriter(outdir, 8, ["tag"])
    for tag_name, _tag_count in tags:
        writer.append([tag_name])
147
172
148
def serialize_q9(outdir, tagclasses):
    """Write the parameter file for query 9.

    Columns are ``tagClass1``, ``tagClass2``, ``threshold``.  One row is
    appended for every unordered pair of tag classes (each class combined
    with the ones after it in ``tagclasses``); the threshold column is the
    constant ``200`` in every row.
    """
    writer = ParamsWriter(outdir, 9, ["tagClass1", "tagClass2", "threshold"])
    threshold_text = str(200)
    for position, (first_class, _first_count) in enumerate(tagclasses):
        for second_class, _second_count in tagclasses[position + 1:]:
            writer.append([first_class, second_class, threshold_text])
154
179
155
def serialize_q10(outdir, tags, post_weeks):
    """Write the parameter file for query 10 (``tag``, ``date``).

    Appends one row for every combination of a tag and a week.  Both
    inputs are iterated as ``(key, count)`` pairs; counts are ignored.
    """
    writer = ParamsWriter(outdir, 10, ["tag", "date"])
    for tag_name, _tag_count in tags:
        for week_start, _week_count in post_weeks:
            writer.append([tag_name, str(week_start)])
160
185
161
- def serializes_q11 (outdir , countries , bad_words ):
162
- writer = ParamsWriter (outdir , "q11" , ["country" , "blacklist" ])
163
- random .seed (1988 + 2 )
186
+ def serialize_q11 (outdir , countries , bad_words ):
187
+ writer = ParamsWriter (outdir , 11 , ["country" , "blacklist" ])
188
+ random .seed (1988 + 1 )
189
+ # note: this approach keeps shuffling the bad_words list
164
190
for country , count in countries :
165
191
num_words = random .randint (1 ,min (len (bad_words ),4 ));
166
192
random .shuffle (bad_words )
@@ -177,83 +203,111 @@ def serializes_q11(outdir, countries, bad_words):
177
203
blacklist = bad_words [0 :num_words ]
178
204
writer .append ([country ,";" .join (blacklist )])
179
205
180
def serialize_q12(outdir, post_weeks):
    """Write the parameter file for query 12 (``date``, ``likeThreshold``).

    One row is appended per ``(week, count)`` pair of ``post_weeks``; the
    threshold column is the constant ``400`` in every row.
    """
    writer = ParamsWriter(outdir, 12, ["date", "likeThreshold"])
    like_threshold = str(400)
    for week_start, _post_count in post_weeks:
        writer.append([str(week_start), like_threshold])
184
210
185
def serialize_q13(outdir, countries):
    """Write the parameter file for query 13 (single ``country`` column).

    One row is appended per ``(country, count)`` pair; the count is ignored.
    """
    writer = ParamsWriter(outdir, 13, ["country"])
    for country_name, _country_count in countries:
        writer.append([country_name])
189
215
190
def serialize_q14(outdir, creationdates):
    """Write the parameter file for query 14 (``begin``, ``end``).

    ``creationdates`` is iterated as ``(range, count)`` pairs; the first
    two elements of each range are written as one row and the count is
    ignored.
    """
    writer = ParamsWriter(outdir, 14, ["begin", "end"])
    for date_range, _post_count in creationdates:
        begin_text = str(date_range[0])
        end_text = str(date_range[1])
        writer.append([begin_text, end_text])
194
220
195
def serialize_q15(outdir, countries):
    """Write the parameter file for query 15 (single ``country`` column).

    One row is appended per ``(country, count)`` pair; the count is ignored.
    """
    writer = ParamsWriter(outdir, 15, ["country"])
    for country_name, _country_count in countries:
        writer.append([country_name])
199
225
200
def serialize_q16(outdir, persons, tagclasses, countries, path_bounds):
    """Write the parameter file for query 16.

    Columns are person, country, tag class, ``minPathDistance`` and
    ``maxPathDistance``.  One row is appended for every combination of a
    country, a tag class and a ``(min, max)`` pair from ``path_bounds``;
    the person for each row is drawn at random from ``persons``.

    The random generator is re-seeded with a fixed value so repeated runs
    over the same inputs produce the same rows.  If ``persons`` is empty
    no rows can be produced and only the header is written.
    """
    # NOTE(review): "country " and "tagClass " carry a trailing space in
    # the header; kept byte-for-byte in case consumers depend on it.
    writer = ParamsWriter(outdir, 16, ["person", "country ", "tagClass ", "minPathDistance", "maxPathDistance"])
    random.seed(1988 + 2)
    if not persons:
        return
    # random.randint includes both end points, so the largest valid index
    # is len(persons) - 1; using len(persons) could index past the end.
    last_index = len(persons) - 1
    for country, count_b in countries:
        for tagClass, count_a in tagclasses:
            for minDist, maxDist in path_bounds:
                person = persons[random.randint(0, last_index)]
                writer.append([str(person), country, tagClass, str(minDist), str(maxDist)])
207
233
208
def serialize_q17(outdir, countries):
    """Write the parameter file for query 17 (single ``country`` column).

    One row is appended per ``(country, count)`` pair; the count is ignored.
    """
    writer = ParamsWriter(outdir, 17, ["country"])
    for country_name, _country_count in countries:
        writer.append([country_name])
212
238
213
def serialize_q18(outdir, post_weeks, lengths, languages):
    """Write the parameter file for query 18.

    Columns are ``date``, ``lengthThreshold`` and ``languages``.  One row
    is appended for every combination of a week (first element of each
    ``post_weeks`` pair), a length and a language set; each language set
    is written as its members joined with ``;``.
    """
    writer = ParamsWriter(outdir, 18, ["date", "lengthThreshold", "languages"])
    for week_start, _post_count in post_weeks:
        date_text = str(week_start)
        for threshold in lengths:
            threshold_text = str(threshold)
            for codes in languages:
                writer.append([date_text, threshold_text, ";".join(codes)])
218
245
219
def serialize_q19(outdir, tagclasses):
    """Write the parameter file for query 19.

    Columns are ``date``, ``tagClass1``, ``tagClass2``.  One row is
    appended for every unordered pair of tag classes (each class combined
    with the ones after it in ``tagclasses``).  The date column is always
    the fixed date 1989-01-01 passed through ``format_date``.
    """
    PERS_DATE = datetime.strptime("1989-1-1", "%Y-%m-%d")
    writer = ParamsWriter(outdir, 19, ["date", "tagClass1", "tagClass2"])
    for position, (first_class, _first_count) in enumerate(tagclasses):
        for second_class, _second_count in tagclasses[position + 1:]:
            date_text = str(format_date(PERS_DATE))
            writer.append([date_text, first_class, second_class])
226
253
227
def serialize_q20(outdir, tagclasses):
    """Write the parameter file for query 20 (single ``tagClasses`` column).

    Three rows are written.  Each row is a random selection of tag-class
    names joined with ``;``: the names are shuffled and the first ``k``
    are taken, where ``k`` is drawn uniformly from 1 up to a per-row cap
    (4, 10 and 7, in that order) or the number of names if that is
    smaller.

    The random generator is re-seeded with a fixed value first, so the
    output is reproducible for a given input.  ``tagclasses`` is iterated
    as ``(name, count)`` pairs; the caller's list is not modified.  If it
    is empty, only the header is written.
    """
    random.seed(1988 + 3)
    writer = ParamsWriter(outdir, 20, ["tagClasses"])

    # Work on a private list of names; it is reshuffled for every row.
    names = [tc[0] for tc in tagclasses]
    if not names:
        # random.randint(1, 0) would raise ValueError; nothing to sample.
        return

    # Upper limits on the number of tag classes in each of the three rows.
    for cap in (4, 10, 7):
        num_words = random.randint(1, min(len(names), cap))
        random.shuffle(names)
        writer.append([";".join(names[0:num_words])])
276
+
277
def serialize_q21(outdir, countries):
    """Write the parameter file for query 21 (``country``, ``endDate``).

    One row is appended per ``(country, count)`` pair; the end-date column
    is ``format_date(END_DATE)`` in every row and the count is ignored.
    """
    writer = ParamsWriter(outdir, 21, ["country", "endDate"])
    for country_name, _country_count in countries:
        end_date_text = str(format_date(END_DATE))
        writer.append([country_name, end_date_text])
236
281
237
def serialize_q22(outdir, countries):
    """Write the parameter file for query 22 (``country1``, ``country2``).

    One row is appended for every unordered pair of countries: each
    country is combined with the ones that follow it in ``countries``.
    The counts carried in the input pairs are ignored.
    """
    writer = ParamsWriter(outdir, 22, ["country1", "country2"])
    for position, (first_country, _first_count) in enumerate(countries):
        for second_country, _second_count in countries[position + 1:]:
            writer.append([first_country, second_country])
243
288
244
def serialize_q23(outdir, countries):
    """Write the parameter file for query 23 (single ``country`` column).

    One row is appended per ``(country, count)`` pair; the count is ignored.
    """
    writer = ParamsWriter(outdir, 23, ["country"])
    for country_name, _country_count in countries:
        writer.append([country_name])
248
293
249
def serialize_q24(outdir, tagclasses):
    """Write the parameter file for query 24 (single ``tagClass`` column).

    One row is appended per ``(tagclass, count)`` pair; the count is
    ignored.
    """
    writer = ParamsWriter(outdir, 24, ["tagClass"])
    for tag_class, _tag_class_count in tagclasses:
        writer.append([tag_class])
253
298
254
def serialize_q25(outdir, persons, post_month_ranges):
    """Write the parameter file for query 25.

    Columns are ``person1Id``, ``person2Id``, ``startDate``, ``endDate``.
    For every ``(day_range, count)`` entry of ``post_month_ranges`` up to
    ten rows are appended (fewer if ``persons`` has fewer than ten
    entries).  Each row pairs two randomly drawn, different persons with
    the first two elements of ``day_range``.

    This function does not seed the random generator itself, so the rows
    depend on the generator state left by earlier code.  If ``persons``
    does not contain at least two different values no pair can be formed
    and only the header is written.
    """
    writer = ParamsWriter(outdir, 25, ["person1Id", "person2Id", "startDate", "endDate"])

    # Without two distinct values the rejection loop below could never
    # find a second person and would spin forever.
    if not any(candidate != persons[0] for candidate in persons):
        return

    last_index = len(persons) - 1
    count = min(len(persons), 10)
    for day_range, count_post in post_month_ranges:
        for _ in range(0, count):
            person1Id = persons[random.randint(0, last_index)]
            # Redraw until the second person differs from the first.
            person2Id = persons[random.randint(0, last_index)]
            while not (person2Id != person1Id):
                person2Id = persons[random.randint(0, last_index)]
            writer.append([str(person1Id), str(person2Id), str(day_range[0]), str(day_range[1])])
310
+
257
311
258
312
def add_months (sourcedate ,months ):
259
313
month = sourcedate .month - 1 + months
@@ -343,37 +397,39 @@ def main(argv=None):
343
397
post_upper_threshold = (total_posts / (non_empty_weeks / 4 ))* 1.2
344
398
post_months = post_month_params (week_posts , post_lower_threshold , post_upper_threshold )
345
399
346
- serializes_q2 (outdir , key_params (country_sample , total_posts / 200 , total_posts / 100 ), post_day_ranges ) # TODO determine constants
347
- serializes_q3 (outdir , post_months )
348
- serializes_q14 (outdir , post_month_params (week_posts , post_lower_threshold * 2 , post_upper_threshold * 2 ))
349
-
350
- serializes_q1 (outdir , post_date_right_open_range_params (week_posts , 0.3 * total_posts , 0.6 * total_posts ))
351
- serializes_q12 (outdir , post_date_right_open_range_params (week_posts , 0.3 * total_posts , 0.6 * total_posts ))
352
- serializes_q18 (outdir , post_date_right_open_range_params (week_posts , 0.3 * total_posts , 0.6 * total_posts ))
353
- serializes_q10 (outdir , key_params (tag_posts , total_posts / 900 , total_posts / 600 ), post_date_right_open_range_params (week_posts , 0.3 * total_posts , 0.6 * total_posts ))
354
-
355
- serializes_q4 (outdir , key_params (tagclass_posts , total_posts / 20 , total_posts / 10 ), key_params (country_sample , total_posts / 120 , total_posts / 70 ))
356
- serializes_q5 (outdir , key_params (country_sample , total_posts / 200 , total_posts / 100 ))
357
- serializes_q6 (outdir , key_params (tag_posts , total_posts / 1300 , total_posts / 900 ))
358
- serializes_q7 (outdir , key_params (tag_posts , total_posts / 900 , total_posts / 600 ))
359
- serializes_q8 (outdir , key_params (tag_posts , total_posts / 600 , total_posts / 300 ))
360
- serializes_q9 (outdir , key_params (tagclass_posts , 6000 , 25000 ))
361
- serializes_q13 (outdir , key_params (country_sample , total_posts / 200 , total_posts / 100 ))
362
- serializes_q15 (outdir , key_params (country_sample , total_posts / 200 , total_posts / 100 ))
363
- serializes_q16 (outdir , persons , key_params (tagclass_posts , total_posts / 30 , total_posts / 10 ), key_params (country_sample , total_posts / 80 , total_posts / 20 ))
364
- serializes_q17 (outdir , key_params (country_sample , total_posts / 200 , total_posts / 100 ))
365
- serializes_q19 (outdir , key_params (tagclass_posts , total_posts / 60 , total_posts / 10 ))
366
- serializes_q21 (outdir , key_params (country_sample , total_posts / 200 , total_posts / 100 ))
367
- serializes_q22 (outdir , key_params (country_sample , total_posts / 120 , total_posts / 40 ))
368
- serializes_q23 (outdir , key_params (country_sample , total_posts / 200 , total_posts / 100 ))
369
- serializes_q24 (outdir , key_params (tagclass_posts , total_posts / 140 , total_posts / 5 ))
400
+ path_bounds = enumerate_path_bounds (3 , 9 , 2 )
401
+ language_codes = prob_language_codes ()
402
+ post_lengths = prob_post_lengths ()
403
+
404
+ serialize_q2 (outdir , key_params (country_sample , total_posts / 200 , total_posts / 100 ), post_day_ranges ) # TODO determine constants
405
+ serialize_q3 (outdir , post_months )
406
+ serialize_q14 (outdir , post_month_params (week_posts , post_lower_threshold * 2 , post_upper_threshold * 2 ))
407
+
408
+ serialize_q1 (outdir , post_date_right_open_range_params (week_posts , 0.3 * total_posts , 0.6 * total_posts ))
409
+ serialize_q12 (outdir , post_date_right_open_range_params (week_posts , 0.3 * total_posts , 0.6 * total_posts ))
410
+ serialize_q18 (outdir , post_date_right_open_range_params (week_posts , 0.3 * total_posts , 0.6 * total_posts ), post_lengths , language_codes )
411
+ serialize_q10 (outdir , key_params (tag_posts , total_posts / 900 , total_posts / 600 ), post_date_right_open_range_params (week_posts , 0.3 * total_posts , 0.6 * total_posts ))
412
+
413
+ serialize_q4 (outdir , key_params (tagclass_posts , total_posts / 20 , total_posts / 10 ), key_params (country_sample , total_posts / 120 , total_posts / 70 ))
414
+ serialize_q5 (outdir , key_params (country_sample , total_posts / 200 , total_posts / 100 ))
415
+ serialize_q6 (outdir , key_params (tag_posts , total_posts / 1300 , total_posts / 900 ))
416
+ serialize_q7 (outdir , key_params (tag_posts , total_posts / 900 , total_posts / 600 ))
417
+ serialize_q8 (outdir , key_params (tag_posts , total_posts / 600 , total_posts / 300 ))
418
+ serialize_q9 (outdir , key_params (tagclass_posts , 6000 , 25000 ))
419
+ serialize_q13 (outdir , key_params (country_sample , total_posts / 200 , total_posts / 100 ))
420
+ serialize_q15 (outdir , key_params (country_sample , total_posts / 200 , total_posts / 100 ))
421
+ serialize_q16 (outdir , persons , key_params (tagclass_posts , total_posts / 30 , total_posts / 10 ), key_params (country_sample , total_posts / 80 , total_posts / 20 ), path_bounds )
422
+ serialize_q17 (outdir , key_params (country_sample , total_posts / 200 , total_posts / 100 ))
423
+ serialize_q19 (outdir , key_params (tagclass_posts , total_posts / 60 , total_posts / 10 ))
424
+ serialize_q21 (outdir , key_params (country_sample , total_posts / 200 , total_posts / 100 ))
425
+ serialize_q22 (outdir , key_params (country_sample , total_posts / 120 , total_posts / 40 ))
426
+ serialize_q23 (outdir , key_params (country_sample , total_posts / 200 , total_posts / 100 ))
427
+ serialize_q24 (outdir , key_params (tagclass_posts , total_posts / 140 , total_posts / 5 ))
428
+ serialize_q25 (outdir , persons , post_months )
370
429
371
430
# TODO: Refine
372
- serializes_q20 (outdir , key_params (tagclass_posts , total_posts / 20 , total_posts / 2 ))
373
- serializes_q11 (outdir , key_params (country_sample , total_posts / 80 , total_posts / 20 ), bad_words )
374
-
375
- # TODO: implement
376
- #serializes_q25(outdir, ...)
431
+ serialize_q20 (outdir , key_params (tagclass_posts , total_posts / 20 , total_posts / 2 ))
432
+ serialize_q11 (outdir , key_params (country_sample , total_posts / 80 , total_posts / 20 ), bad_words )
377
433
378
434
if __name__ == "__main__" :
379
435
sys .exit (main ())
0 commit comments