@@ -25,7 +25,7 @@ def format_date(date):
25
25
# for i in range(0, num_params):
26
26
# self.files.append(codecs.open("params/"+name+"."+str(i+1)+".params", "w",encoding="utf-8"))
27
27
28
- # def append(self, params, counts ):
28
+ # def append(self, params):
29
29
# for i, param in enumerate(params):
30
30
# self.files[i].write(param+"\n")
31
31
@@ -38,35 +38,14 @@ def __init__(self, outdir, name, param_names):
38
38
self .file .write (param_names [i ])
39
39
self .file .write ("\n " )
40
40
41
- def append (self , params , counts ):
41
def append(self, params):
    """Write one parameter row to the output file.

    The values in ``params`` (an iterable of strings) are joined with ``|``
    and terminated by a single newline.  The terminator is exactly ``"\n"``:
    a trailing space after the newline would become a leading space on the
    next row.
    """
    self.file.write("|".join(params) + "\n")
47
47
48
48
49
- def country_sets_params (sample , lower_bound , upper_bound , max_depth , start = 0 ):
50
- if max_depth == 0 :
51
- return []
52
-
53
- results = []
54
- ix = start
55
- for country , count in sample [start :]:
56
- if count < (lower_bound / (max_depth + 1 )):
57
- continue
58
- if count < lower_bound :
59
- others = country_sets_params (sample , lower_bound - count , upper_bound - count , max_depth - 1 , ix + 1 )
60
- for other_countries , other_count in others :
61
- combined_count = count + other_count
62
- if combined_count > lower_bound and combined_count < upper_bound :
63
- other_countries .append (country )
64
- results .append ([other_countries , combined_count ])
65
- if count > lower_bound and count < upper_bound :
66
- results .append ([[country ], count ])
67
- ix = ix + 1
68
- return results
69
-
70
49
def post_date_right_open_range_params (sample , lower_bound , upper_bound ):
71
50
results = []
72
51
for ix in range (0 , len (sample )):
@@ -125,61 +104,59 @@ def key_params(sample, lower_bound, upper_bound):
125
104
def serializes_q1(outdir, post_weeks):
    """Write the q1 parameter file: one ``date`` row per entry of ``post_weeks``.

    ``post_weeks`` is iterated as ``(week, count)`` pairs; only the week is
    written (as ``str(week)``), the count is ignored.
    """
    q1_writer = ParamsWriter(outdir, "q1", ["date"])
    for week_start, _ in post_weeks:
        row = [str(week_start)]
        q1_writer.append(row)
129
108
130
- def serializes_q2 (outdir , country_sets , post_day_ranges ): # TODO country1, country2
131
- writer = ParamsWriter (outdir , "q2" , ["date1" , "date2" , "countries" , "endDate" , "messageThreshold" ])
132
- random .seed (1988 + 2 )
133
- for country_set , count_country in country_sets :
134
- for day_range , count_post in post_day_ranges :
135
- if random .randint (0 ,len (country_sets ) + len (post_day_ranges )) == 0 :
136
- writer .append ([str (day_range [0 ]), str (day_range [1 ]), ";" .join (country_set ), str (format_date (END_DATE )),str (20 )], [count_post ,count_post ,count_country ,333 ])
137
-
138
- def serializes_q3 (outdir , post_months ): # TODO year, month
139
- writer = ParamsWriter (outdir , "q3" , ["range1Start" , "range1End" , "range2Start" , "range2End" ])
140
- for ix in range (0 ,len (post_months )):
141
- week_range_a , count_a = post_months [ix ]
142
- for week_range_b , count_b in post_months [ix + 1 :]:
143
- writer .append ([str (week_range_a [0 ]),str (week_range_a [1 ]),str (week_range_b [0 ]),str (week_range_b [1 ])], [count_a ,count_b ])
109
def serializes_q2(outdir, countries, post_day_ranges):
    """Write the q2 parameter file.

    For every day range in ``post_day_ranges`` and every unordered pair of
    distinct positions in ``countries`` (earlier entry first), one row
    ``date1|date2|country1|country2`` is appended.  Both inputs are iterated
    as ``(value, count)`` pairs; the counts are not written.
    """
    q2_writer = ParamsWriter(outdir, "q2", ["date1", "date2", "country1", "country2"])
    for date_span, _ in post_day_ranges:
        for position, (first_country, _) in enumerate(countries):
            # Pair each country only with the ones that follow it.
            for second_country, _ in countries[position + 1:]:
                q2_writer.append([
                    str(date_span[0]),
                    str(date_span[1]),
                    first_country,
                    second_country,
                ])
116
+
117
def serializes_q3(outdir, post_months):
    """Create the q3 parameter writer with the names ``year`` and ``month``.

    No rows are appended yet and ``post_months`` is currently unused.
    """
    column_names = ["year", "month"]
    writer = ParamsWriter(outdir, "q3", column_names)
    # TODO year, month
144
120
145
121
def serializes_q4(outdir, tagclasses, countries):
    """Write the q4 parameter file: one ``tagClass|country`` row per combination.

    Both ``tagclasses`` and ``countries`` are iterated as ``(name, count)``
    pairs; tag classes vary in the outer loop, countries in the inner loop.
    """
    q4_writer = ParamsWriter(outdir, "q4", ["tagClass", "country"])
    for tag_class, _ in tagclasses:
        for country_name, _ in countries:
            q4_writer.append([tag_class, country_name])
150
126
151
127
def serializes_q5(outdir, countries):
    """Write the q5 parameter file: one ``country`` row per ``(country, count)`` pair."""
    q5_writer = ParamsWriter(outdir, "q5", ["country"])
    for country_name, _ in countries:
        row = [country_name]
        q5_writer.append(row)
155
131
156
132
157
133
def serializes_q6(outdir, tags):
    """Write the q6 parameter file: one ``tag`` row per ``(tag, count)`` pair."""
    q6_writer = ParamsWriter(outdir, "q6", ["tag"])
    for tag_name, _ in tags:
        row = [tag_name]
        q6_writer.append(row)
161
137
162
138
def serializes_q7(outdir, tags):
    """Write the q7 parameter file: one ``tag`` row per ``(tag, count)`` pair."""
    q7_writer = ParamsWriter(outdir, "q7", ["tag"])
    for tag_name, _ in tags:
        row = [tag_name]
        q7_writer.append(row)
166
142
167
143
def serializes_q8(outdir, tags):
    """Write the q8 parameter file: one ``tag`` row per ``(tag, count)`` pair."""
    q8_writer = ParamsWriter(outdir, "q8", ["tag"])
    for tag_name, _ in tags:
        row = [tag_name]
        q8_writer.append(row)
171
147
172
148
def serializes_q9(outdir, tagclasses):
    """Write the q9 parameter file.

    One ``tagClass1|tagClass2|threshold`` row is appended for every pair of
    entries of ``tagclasses`` taken in list order (earlier entry first).  The
    threshold column is always ``str(200)``; the per-class counts are ignored.
    """
    q9_writer = ParamsWriter(outdir, "q9", ["tagClass1", "tagClass2", "threshold"])
    for position, (first_class, _) in enumerate(tagclasses):
        # Only pair with later entries so each pair is written once.
        for second_class, _ in tagclasses[position + 1:]:
            q9_writer.append([first_class, second_class, str(200)])
178
154
179
- def serializes_q10 (outdir , tags ): # TODO date
180
- writer = ParamsWriter (outdir , "q10" , ["tag" ])
155
def serializes_q10(outdir, tags, post_weeks):
    """Write the q10 parameter file: one ``tag|date`` row per tag/week combination.

    ``tags`` and ``post_weeks`` are both iterated as ``(value, count)`` pairs;
    tags vary in the outer loop and weeks in the inner loop.  Counts are not
    written.
    """
    q10_writer = ParamsWriter(outdir, "q10", ["tag", "date"])
    for tag_name, _ in tags:
        for week_start, _ in post_weeks:
            q10_writer.append([tag_name, str(week_start)])
183
160
184
161
def serializes_q11 (outdir , countries , bad_words ):
185
162
writer = ParamsWriter (outdir , "q11" , ["country" , "blacklist" ])
@@ -188,89 +165,95 @@ def serializes_q11(outdir, countries, bad_words):
188
165
num_words = random .randint (1 ,min (len (bad_words ),4 ));
189
166
random .shuffle (bad_words )
190
167
blacklist = bad_words [0 :num_words ]
191
- writer .append ([country ,";" .join (blacklist )], [ count ] )
168
+ writer .append ([country ,";" .join (blacklist )])
192
169
193
170
num_words = random .randint (1 ,min (len (bad_words ),10 ));
194
171
random .shuffle (bad_words )
195
172
blacklist = bad_words [0 :num_words ]
196
- writer .append ([country ,";" .join (blacklist )], [ count ] )
173
+ writer .append ([country ,";" .join (blacklist )])
197
174
198
175
num_words = random .randint (1 ,min (len (bad_words ),7 ));
199
176
random .shuffle (bad_words )
200
177
blacklist = bad_words [0 :num_words ]
201
- writer .append ([country ,";" .join (blacklist )], [ count ] )
178
+ writer .append ([country ,";" .join (blacklist )])
202
179
203
180
def serializes_q12(outdir, post_weeks):
    """Write the q12 parameter file: one ``date|likeThreshold`` row per week.

    ``post_weeks`` is iterated as ``(week, count)`` pairs; the count is
    ignored and the like threshold is the constant ``400``.

    The first header name is ``"date"`` without trailing whitespace, matching
    the ``date`` column used by the other query serializers in this file.
    """
    writer = ParamsWriter(outdir, "q12", ["date", "likeThreshold"])
    like_threshold = str(400)
    for week, _ in post_weeks:
        writer.append([str(week), like_threshold])
207
184
208
185
def serializes_q13(outdir, countries):
    """Write the q13 parameter file: one ``country`` row per ``(country, count)`` pair."""
    q13_writer = ParamsWriter(outdir, "q13", ["country"])
    for country_name, _ in countries:
        row = [country_name]
        q13_writer.append(row)
212
189
213
190
def serializes_q14(outdir, creationdates):
    """Write the q14 parameter file: one ``begin|end`` row per date range.

    ``creationdates`` is iterated as ``(range, count)`` pairs; elements 0 and
    1 of each range are written via ``str``, the count is ignored.
    """
    q14_writer = ParamsWriter(outdir, "q14", ["begin", "end"])
    for date_range, _ in creationdates:
        range_begin = str(date_range[0])
        range_end = str(date_range[1])
        q14_writer.append([range_begin, range_end])
217
194
218
195
def serializes_q15(outdir, countries):
    """Write the q15 parameter file: one ``country`` row per ``(country, count)`` pair."""
    q15_writer = ParamsWriter(outdir, "q15", ["country"])
    for country_name, _ in countries:
        row = [country_name]
        q15_writer.append(row)
222
199
223
200
def serializes_q16(outdir, persons, tagclasses, countries):
    """Write the q16 parameter file: one ``person|tag|country`` row per combination.

    For every ``(tag, count)`` in ``tagclasses`` and every ``(country, count)``
    in ``countries`` a person is drawn at random from ``persons``.  The random
    generator is re-seeded with a fixed value so the output is reproducible.

    ``random.randint`` includes its upper bound, so the index is drawn from
    ``[0, len(persons) - 1]``; using ``len(persons)`` as the bound could index
    one past the end of the list.

    The header also declares ``minPathDistance`` and ``maxPathDistance`` (with
    no stray whitespace), but no values are written for them yet.
    """
    writer = ParamsWriter(
        outdir,
        "q16",
        ["person", "tag", "country", "minPathDistance", "maxPathDistance"],
    )
    random.seed(1988 + 2)
    last_index = len(persons) - 1
    for tag, _ in tagclasses:
        for country, _ in countries:
            person = persons[random.randint(0, last_index)]
            writer.append([str(person), tag, country])
            # TODO minPathDistance and maxPathDistance are missing
229
207
230
208
def serializes_q17(outdir, countries):
    """Write the q17 parameter file: one ``country`` row per ``(country, count)`` pair."""
    q17_writer = ParamsWriter(outdir, "q17", ["country"])
    for country_name, _ in countries:
        row = [country_name]
        q17_writer.append(row)
234
212
235
213
def serializes_q18(outdir, post_weeks):
    """Write the q18 parameter file: one ``date`` row per ``(week, count)`` pair.

    The header declares ``date``, ``lengthThreshold`` and ``languages`` (names
    without stray whitespace), but only the date value is written for each
    row so far.
    """
    writer = ParamsWriter(outdir, "q18", ["date", "lengthThreshold", "languages"])
    for week, _ in post_weeks:
        writer.append([str(week)])
        # TODO lengthThreshold and languages are missing
239
218
240
219
def serializes_q19(outdir, tagclasses):
    """Write the q19 parameter file.

    One ``date|tagClass1|tagClass2`` row is appended for every pair of entries
    of ``tagclasses`` taken in list order (earlier entry first).  The date
    column is always ``format_date`` applied to the fixed date 1989-01-01;
    the per-class counts are ignored.
    """
    PERS_DATE = datetime.strptime("1989-1-1", "%Y-%m-%d")
    q19_writer = ParamsWriter(outdir, "q19", ["date", "tagClass1", "tagClass2"])
    for position, (first_class, _) in enumerate(tagclasses):
        # Only pair with later entries so each pair is written once.
        for second_class, _ in tagclasses[position + 1:]:
            date_text = str(format_date(PERS_DATE))
            q19_writer.append([date_text, first_class, second_class])
247
226
248
227
def serializes_q20(outdir, tagclasses):
    """Write the q20 parameter file: one row per ``(tagclass, count)`` pair.

    The single header name is ``"tagClasses"`` without trailing whitespace.
    Each row currently holds exactly one tag class; the count is ignored.
    """
    writer = ParamsWriter(outdir, "q20", ["tagClasses"])  # TODO tagclasses
    for tagclass, _ in tagclasses:
        writer.append([tagclass])
252
231
253
232
def serializes_q21(outdir, countries):
    """Write the q21 parameter file: one ``country|endDate`` row per country.

    ``countries`` is iterated as ``(country, count)`` pairs; the end date is
    always ``str(format_date(END_DATE))`` and the count is ignored.
    """
    q21_writer = ParamsWriter(outdir, "q21", ["country", "endDate"])
    for country_name, _ in countries:
        end_date_text = str(format_date(END_DATE))
        q21_writer.append([country_name, end_date_text])
257
236
258
237
def serializes_q22(outdir, countries):
    """Write the q22 parameter file.

    One ``country1|country2`` row is appended for every pair of entries of
    ``countries`` taken in list order (earlier entry first).  The per-country
    counts are ignored.
    """
    q22_writer = ParamsWriter(outdir, "q22", ["country1", "country2"])
    for position, (first_country, _) in enumerate(countries):
        # Only pair with later entries so each pair is written once.
        for second_country, _ in countries[position + 1:]:
            q22_writer.append([first_country, second_country])
264
243
265
244
def serializes_q23(outdir, countries):
    """Write the q23 parameter file: one ``country`` row per ``(country, count)`` pair."""
    q23_writer = ParamsWriter(outdir, "q23", ["country"])
    for country_name, _ in countries:
        row = [country_name]
        q23_writer.append(row)
269
248
270
249
def serializes_q24(outdir, tagclasses):
    """Write the q24 parameter file: one ``tagClass`` row per ``(tagclass, count)`` pair."""
    q24_writer = ParamsWriter(outdir, "q24", ["tagClass"])
    for tag_class, _ in tagclasses:
        row = [tag_class]
        q24_writer.append(row)
253
+
254
def serializes_q25(outdir):
    """Create the q25 parameter writer with its four parameter names.

    No rows are appended yet; parameter generation for q25 is still to be
    implemented.
    """
    column_names = ["person1Id", "person2Id", "startDate", "endDate"]
    writer = ParamsWriter(outdir, "q25", column_names)
    # TODO
274
257
275
258
def add_months (sourcedate ,months ):
276
259
month = sourcedate .month - 1 + months
@@ -315,7 +298,8 @@ def main(argv=None):
315
298
friendsFiles .append (indir + file )
316
299
317
300
# read precomputed counts from files
318
- (personFactors , countryFactors , tagFactors , tagClassFactors , nameFactors , givenNames , ts , postsHisto ) = readfactors .load (personFactorFiles ,activityFactorFiles , friendsFiles )
301
+ (personFactors , countryFactors , tagFactors , tagClassFactors , nameFactors , givenNames , ts , postsHisto ) = \
302
+ readfactors .load (personFactorFiles ,activityFactorFiles , friendsFiles )
319
303
week_posts = convert_posts_histo (postsHisto )
320
304
321
305
persons = []
@@ -343,10 +327,6 @@ def main(argv=None):
343
327
for country , count in country_sample :
344
328
person_sum += count
345
329
346
- country_lower_threshold = 0.1 * total_posts * 0.9
347
- country_upper_threshold = 0.1 * total_posts * 1.1
348
- country_sets = country_sets_params (country_sample , country_lower_threshold , country_upper_threshold , 4 )
349
-
350
330
post_lower_threshold = 0.1 * total_posts * 0.9
351
331
post_upper_threshold = 0.1 * total_posts * 1.1
352
332
post_day_ranges = post_date_range_params (week_posts , post_lower_threshold , post_upper_threshold )
@@ -363,21 +343,21 @@ def main(argv=None):
363
343
post_upper_threshold = (total_posts / (non_empty_weeks / 4 ))* 1.2
364
344
post_months = post_month_params (week_posts , post_lower_threshold , post_upper_threshold )
365
345
366
- serializes_q2 (outdir , country_sets , post_day_ranges )
367
- serializes_q3 (outdir , post_months )
346
+ serializes_q2 (outdir , key_params ( country_sample , total_posts / 200 , total_posts / 100 ), post_day_ranges ) # TODO determine constants
347
+ serializes_q3 (outdir , post_months )
368
348
serializes_q14 (outdir , post_month_params (week_posts , post_lower_threshold * 2 , post_upper_threshold * 2 ))
369
349
370
- serializes_q1 (outdir , post_date_right_open_range_params (week_posts , 0.3 * total_posts , 0.6 * total_posts ))
350
+ serializes_q1 (outdir , post_date_right_open_range_params (week_posts , 0.3 * total_posts , 0.6 * total_posts ))
371
351
serializes_q12 (outdir , post_date_right_open_range_params (week_posts , 0.3 * total_posts , 0.6 * total_posts ))
372
352
serializes_q18 (outdir , post_date_right_open_range_params (week_posts , 0.3 * total_posts , 0.6 * total_posts ))
373
-
374
- serializes_q4 ( outdir , key_params ( tagclass_posts , total_posts / 20 , total_posts / 10 ), key_params ( country_sample , total_posts / 120 , total_posts / 70 ))
375
- serializes_q5 (outdir , key_params (country_sample , total_posts / 200 , total_posts / 100 ))
376
- serializes_q6 (outdir , key_params (tag_posts , total_posts / 1300 , total_posts / 900 ))
377
- serializes_q7 (outdir , key_params (tag_posts , total_posts / 900 , total_posts / 600 ))
378
- serializes_q8 (outdir , key_params (tag_posts , total_posts / 600 , total_posts / 300 ))
379
- serializes_q9 (outdir , key_params (tagclass_posts , 6000 , 25000 ))
380
- serializes_q10 (outdir , key_params (tag_posts , total_posts / 900 , total_posts / 600 ))
353
+ serializes_q10 ( outdir , key_params ( tag_posts , total_posts / 900 , total_posts / 600 ), post_date_right_open_range_params ( week_posts , 0.3 * total_posts , 0.6 * total_posts ))
354
+
355
+ serializes_q4 (outdir , key_params (tagclass_posts , total_posts / 20 , total_posts / 10 ), key_params ( country_sample , total_posts / 120 , total_posts / 70 ))
356
+ serializes_q5 (outdir , key_params (country_sample , total_posts / 200 , total_posts / 100 ))
357
+ serializes_q6 (outdir , key_params (tag_posts , total_posts / 1300 , total_posts / 900 ))
358
+ serializes_q7 (outdir , key_params (tag_posts , total_posts / 900 , total_posts / 600 ))
359
+ serializes_q8 (outdir , key_params (tag_posts , total_posts / 600 , total_posts / 300 ))
360
+ serializes_q9 (outdir , key_params (tagclass_posts , 6000 , 25000 ))
381
361
serializes_q13 (outdir , key_params (country_sample , total_posts / 200 , total_posts / 100 ))
382
362
serializes_q15 (outdir , key_params (country_sample , total_posts / 200 , total_posts / 100 ))
383
363
serializes_q16 (outdir , persons , key_params (tagclass_posts , total_posts / 30 , total_posts / 10 ), key_params (country_sample , total_posts / 80 , total_posts / 20 ))
@@ -392,5 +372,8 @@ def main(argv=None):
392
372
serializes_q20 (outdir , key_params (tagclass_posts , total_posts / 20 , total_posts / 2 ))
393
373
serializes_q11 (outdir , key_params (country_sample , total_posts / 80 , total_posts / 20 ), bad_words )
394
374
375
+ # TODO: implement
376
+ #serializes_q25(outdir, ...)
377
+
395
378
if __name__ == "__main__":
    # Run main() and hand its return value to the interpreter as the exit status.
    exit_status = main()
    sys.exit(exit_status)
0 commit comments