Skip to content

Commit 38045cb

Browse files
committed
Remove count parameter, remove country set generator, fix more queries #46
1 parent 84dcd9e commit 38045cb

File tree

1 file changed

+66
-83
lines changed

1 file changed

+66
-83
lines changed

paramgenerator/generateparamsbi.py

Lines changed: 66 additions & 83 deletions
Original file line number | Diff line number | Diff line change
@@ -25,7 +25,7 @@ def format_date(date):
2525
# for i in range(0, num_params):
2626
# self.files.append(codecs.open("params/"+name+"."+str(i+1)+".params", "w",encoding="utf-8"))
2727

28-
# def append(self, params, counts):
28+
# def append(self, params):
2929
# for i, param in enumerate(params):
3030
# self.files[i].write(param+"\n")
3131

@@ -38,35 +38,14 @@ def __init__(self, outdir, name, param_names):
3838
self.file.write(param_names[i])
3939
self.file.write("\n")
4040

41-
def append(self, params, counts):
41+
def append(self, params):
4242
for i, param in enumerate(params):
4343
if i>0:
4444
self.file.write("|")
4545
self.file.write(param)
4646
self.file.write("\n")
4747

4848

49-
def country_sets_params(sample, lower_bound, upper_bound, max_depth, start = 0):
50-
if max_depth == 0:
51-
return []
52-
53-
results = []
54-
ix = start
55-
for country, count in sample[start:]:
56-
if count < (lower_bound / (max_depth + 1)):
57-
continue
58-
if count < lower_bound:
59-
others = country_sets_params(sample, lower_bound-count, upper_bound-count, max_depth - 1, ix + 1)
60-
for other_countries, other_count in others:
61-
combined_count = count + other_count
62-
if combined_count > lower_bound and combined_count < upper_bound:
63-
other_countries.append(country)
64-
results.append([other_countries, combined_count])
65-
if count > lower_bound and count < upper_bound:
66-
results.append([[country], count])
67-
ix = ix + 1
68-
return results
69-
7049
def post_date_right_open_range_params(sample, lower_bound, upper_bound):
7150
results = []
7251
for ix in range(0, len(sample)):
@@ -125,61 +104,59 @@ def key_params(sample, lower_bound, upper_bound):
125104
def serializes_q1(outdir, post_weeks):
126105
writer = ParamsWriter(outdir, "q1", ["date"])
127106
for week, count in post_weeks:
128-
writer.append([str(week)], [count])
107+
writer.append([str(week)])
129108

130-
def serializes_q2(outdir, country_sets, post_day_ranges): # TODO country1, country2
131-
writer = ParamsWriter(outdir, "q2", ["date1", "date2", "countries", "endDate", "messageThreshold"])
132-
random.seed(1988+2)
133-
for country_set, count_country in country_sets:
134-
for day_range, count_post in post_day_ranges:
135-
if random.randint(0,len(country_sets) + len(post_day_ranges)) == 0:
136-
writer.append([str(day_range[0]), str(day_range[1]), ";".join(country_set), str(format_date(END_DATE)),str(20)], [count_post,count_post,count_country,333])
137-
138-
def serializes_q3(outdir, post_months): # TODO year, month
139-
writer = ParamsWriter(outdir, "q3", ["range1Start", "range1End", "range2Start", "range2End"])
140-
for ix in range(0,len(post_months)):
141-
week_range_a, count_a = post_months[ix]
142-
for week_range_b, count_b in post_months[ix+1:]:
143-
writer.append([str(week_range_a[0]),str(week_range_a[1]),str(week_range_b[0]),str(week_range_b[1])], [count_a,count_b])
109+
def serializes_q2(outdir, countries, post_day_ranges):
110+
writer = ParamsWriter(outdir, "q2", ["date1", "date2", "country1", "country2"])
111+
for day_range, count_post in post_day_ranges:
112+
for ix in range(0,len(countries)):
113+
country_1, count_1 = countries[ix]
114+
for country_2, count_2 in countries[ix+1:]:
115+
writer.append([str(day_range[0]),str(day_range[1]),country_1,country_2])
116+
117+
def serializes_q3(outdir, post_months):
118+
writer = ParamsWriter(outdir, "q3", ["year", "month"] )
119+
# TODO year, month
144120

145121
def serializes_q4(outdir, tagclasses, countries):
146122
writer = ParamsWriter(outdir, "q4", ["tagClass", "country"])
147123
for tag, count_a in tagclasses:
148124
for country, count_b in countries:
149-
writer.append([tag,country], [count_a,count_b])
125+
writer.append([tag,country])
150126

151127
def serializes_q5(outdir, countries):
152128
writer = ParamsWriter(outdir, "q5", ["country"])
153129
for country, count in countries:
154-
writer.append([country], [count])
130+
writer.append([country])
155131

156132

157133
def serializes_q6(outdir, tags):
158134
writer = ParamsWriter(outdir, "q6", ["tag"])
159135
for tag, count in tags:
160-
writer.append([tag], [count])
136+
writer.append([tag])
161137

162138
def serializes_q7(outdir, tags):
163139
writer = ParamsWriter(outdir, "q7", ["tag"])
164140
for tag, count in tags:
165-
writer.append([tag], [count])
141+
writer.append([tag])
166142

167143
def serializes_q8(outdir, tags):
168144
writer = ParamsWriter(outdir, "q8", ["tag"])
169145
for tag, count in tags:
170-
writer.append([tag], [count])
146+
writer.append([tag])
171147

172148
def serializes_q9(outdir, tagclasses):
173149
writer = ParamsWriter(outdir, "q9", ["tagClass1", "tagClass2", "threshold"])
174150
for ix in range(0,len(tagclasses)):
175151
tag_class_a, count_a = tagclasses[ix]
176152
for tag_class_b, count_b in tagclasses[ix+1:]:
177-
writer.append([tag_class_a, tag_class_b, str(200)], [count_a, count_b])
153+
writer.append([tag_class_a, tag_class_b, str(200)])
178154

179-
def serializes_q10(outdir, tags): # TODO date
180-
writer = ParamsWriter(outdir, "q10", ["tag"])
155+
def serializes_q10(outdir, tags, post_weeks):
156+
writer = ParamsWriter(outdir, "q10", ["tag", "date"])
181157
for tag, count in tags:
182-
writer.append([tag], [count])
158+
for week, count in post_weeks:
159+
writer.append([tag, str(week)])
183160

184161
def serializes_q11(outdir, countries, bad_words):
185162
writer = ParamsWriter(outdir, "q11", ["country", "blacklist"])
@@ -188,89 +165,95 @@ def serializes_q11(outdir, countries, bad_words):
188165
num_words = random.randint(1,min(len(bad_words),4));
189166
random.shuffle(bad_words)
190167
blacklist = bad_words[0:num_words]
191-
writer.append([country,";".join(blacklist)], [count])
168+
writer.append([country,";".join(blacklist)])
192169

193170
num_words = random.randint(1,min(len(bad_words),10));
194171
random.shuffle(bad_words)
195172
blacklist = bad_words[0:num_words]
196-
writer.append([country,";".join(blacklist)], [count])
173+
writer.append([country,";".join(blacklist)])
197174

198175
num_words = random.randint(1,min(len(bad_words),7));
199176
random.shuffle(bad_words)
200177
blacklist = bad_words[0:num_words]
201-
writer.append([country,";".join(blacklist)], [count])
178+
writer.append([country,";".join(blacklist)])
202179

203180
def serializes_q12(outdir, post_weeks):
204-
writer = ParamsWriter(outdir, "q12", ["creationDate", "likeThreshold"])
181+
writer = ParamsWriter(outdir, "q12", ["date", "likeThreshold"])
205182
for week, count in post_weeks:
206-
writer.append([str(week),str(400)], [count])
183+
writer.append([str(week),str(400)])
207184

208185
def serializes_q13(outdir, countries):
209186
writer = ParamsWriter(outdir, "q13", ["country"])
210187
for country, count in countries:
211-
writer.append([country], [count])
188+
writer.append([country])
212189

213190
def serializes_q14(outdir, creationdates):
214191
writer = ParamsWriter(outdir, "q14", ["begin", "end"])
215192
for creation, count in creationdates:
216-
writer.append([str(creation[0]),str(creation[1])], [count])
193+
writer.append([str(creation[0]),str(creation[1])])
217194

218195
def serializes_q15(outdir, countries):
219196
writer = ParamsWriter(outdir, "q15", ["country"])
220197
for country, count in countries:
221-
writer.append([country], [count])
198+
writer.append([country])
222199

223200
def serializes_q16(outdir, persons, tagclasses, countries):
224-
writer = ParamsWriter(outdir, "q16", ["person", "tag", "country"]) # TODO minPathDistance and maxPathDistance are missing
201+
writer = ParamsWriter(outdir, "q16", ["person", "tag", "country", "minPathDistance", "maxPathDistance"])
225202
random.seed(1988+2)
226203
for tag, count_a in tagclasses:
227204
for country, count_b in countries:
228-
writer.append([str(persons[random.randint(0,len(persons))]), tag, country], [0, count_a, count_b])
205+
writer.append([str(persons[random.randint(0,len(persons))]), tag, country])
206+
# TODO minPathDistance and maxPathDistance are missing
229207

230208
def serializes_q17(outdir, countries):
231209
writer = ParamsWriter(outdir, "q17", ["country"])
232210
for country, count in countries:
233-
writer.append([country], [count])
211+
writer.append([country])
234212

235213
def serializes_q18(outdir, post_weeks):
236-
writer = ParamsWriter(outdir, "q18", ["date"]) # TODO lengthThreshold and languages are missing
214+
writer = ParamsWriter(outdir, "q18", ["date", "lengthThreshold", "languages"])
237215
for week, count in post_weeks:
238-
writer.append([str(week)], [count])
216+
writer.append([str(week)])
217+
# TODO lengthThreshold and languages are missing
239218

240219
def serializes_q19(outdir, tagclasses):
241220
PERS_DATE=datetime.strptime("1989-1-1", "%Y-%m-%d")
242221
writer = ParamsWriter(outdir, "q19", ["date", "tagClass1", "tagClass2"])
243222
for ix in range(0,len(tagclasses)):
244223
tag_class_a, count_a = tagclasses[ix]
245224
for tag_class_b, count_b in tagclasses[ix+1:]:
246-
writer.append([str(format_date(PERS_DATE)),tag_class_a, tag_class_b], [count_a, count_b])
225+
writer.append([str(format_date(PERS_DATE)),tag_class_a, tag_class_b])
247226

248227
def serializes_q20(outdir, tagclasses):
249-
writer = ParamsWriter(outdir, "q20", ["tagclass"]) # TODO tagclasses
228+
writer = ParamsWriter(outdir, "q20", ["tagClasses"]) # TODO tagclasses
250229
for tagclass, count in tagclasses:
251-
writer.append([tagclass], [count])
230+
writer.append([tagclass])
252231

253232
def serializes_q21(outdir, countries):
254233
writer = ParamsWriter(outdir, "q21", ["country", "endDate"])
255234
for country, count in countries:
256-
writer.append([country,str(format_date(END_DATE))], [count])
235+
writer.append([country,str(format_date(END_DATE))])
257236

258237
def serializes_q22(outdir, countries):
259238
writer = ParamsWriter(outdir, "q22", ["country1", "country2"])
260239
for ix in range(0,len(countries)):
261240
country_a, count_a = countries[ix]
262241
for country_b, count_b in countries[ix+1:]:
263-
writer.append([country_a, country_b], [count_a, count_b])
242+
writer.append([country_a, country_b])
264243

265244
def serializes_q23(outdir, countries):
266245
writer = ParamsWriter(outdir, "q23", ["country"])
267246
for country, count in countries:
268-
writer.append([country], [count])
247+
writer.append([country])
269248

270249
def serializes_q24(outdir, tagclasses):
271250
writer = ParamsWriter(outdir, "q24", ["tagClass"])
272251
for tagclass, count in tagclasses:
273-
writer.append([tagclass], [count])
252+
writer.append([tagclass])
253+
254+
def serializes_q25(outdir):
255+
writer = ParamsWriter(outdir, "q25", ["person1Id", "person2Id", "startDate", "endDate"])
256+
# TODO
274257

275258
def add_months(sourcedate,months):
276259
month = sourcedate.month - 1 + months
@@ -315,7 +298,8 @@ def main(argv=None):
315298
friendsFiles.append(indir+file)
316299

317300
# read precomputed counts from files
318-
(personFactors, countryFactors, tagFactors, tagClassFactors, nameFactors, givenNames, ts, postsHisto) = readfactors.load(personFactorFiles,activityFactorFiles, friendsFiles)
301+
(personFactors, countryFactors, tagFactors, tagClassFactors, nameFactors, givenNames, ts, postsHisto) = \
302+
readfactors.load(personFactorFiles,activityFactorFiles, friendsFiles)
319303
week_posts = convert_posts_histo(postsHisto)
320304

321305
persons = []
@@ -343,10 +327,6 @@ def main(argv=None):
343327
for country, count in country_sample:
344328
person_sum += count
345329

346-
country_lower_threshold = 0.1*total_posts*0.9
347-
country_upper_threshold = 0.1*total_posts*1.1
348-
country_sets = country_sets_params(country_sample, country_lower_threshold, country_upper_threshold, 4)
349-
350330
post_lower_threshold = 0.1*total_posts*0.9
351331
post_upper_threshold = 0.1*total_posts*1.1
352332
post_day_ranges = post_date_range_params(week_posts, post_lower_threshold, post_upper_threshold)
@@ -363,21 +343,21 @@ def main(argv=None):
363343
post_upper_threshold = (total_posts/(non_empty_weeks/4))*1.2
364344
post_months = post_month_params(week_posts, post_lower_threshold, post_upper_threshold)
365345

366-
serializes_q2(outdir, country_sets, post_day_ranges)
367-
serializes_q3(outdir, post_months)
346+
serializes_q2 (outdir, key_params(country_sample, total_posts/200, total_posts/100), post_day_ranges) # TODO determine constants
347+
serializes_q3 (outdir, post_months)
368348
serializes_q14(outdir, post_month_params(week_posts, post_lower_threshold*2, post_upper_threshold*2))
369349

370-
serializes_q1(outdir, post_date_right_open_range_params(week_posts, 0.3*total_posts, 0.6*total_posts))
350+
serializes_q1 (outdir, post_date_right_open_range_params(week_posts, 0.3*total_posts, 0.6*total_posts))
371351
serializes_q12(outdir, post_date_right_open_range_params(week_posts, 0.3*total_posts, 0.6*total_posts))
372352
serializes_q18(outdir, post_date_right_open_range_params(week_posts, 0.3*total_posts, 0.6*total_posts))
373-
374-
serializes_q4(outdir, key_params(tagclass_posts, total_posts/20, total_posts/10), key_params(country_sample, total_posts/120, total_posts/70))
375-
serializes_q5(outdir, key_params(country_sample, total_posts/200, total_posts/100))
376-
serializes_q6(outdir, key_params(tag_posts, total_posts/1300, total_posts/900))
377-
serializes_q7(outdir, key_params(tag_posts, total_posts/900, total_posts/600))
378-
serializes_q8(outdir, key_params(tag_posts, total_posts/600, total_posts/300))
379-
serializes_q9(outdir, key_params(tagclass_posts, 6000, 25000))
380-
serializes_q10(outdir, key_params(tag_posts, total_posts/900, total_posts/600))
353+
serializes_q10(outdir, key_params(tag_posts, total_posts/900, total_posts/600), post_date_right_open_range_params(week_posts, 0.3*total_posts, 0.6*total_posts))
354+
355+
serializes_q4 (outdir, key_params(tagclass_posts, total_posts/20, total_posts/10), key_params(country_sample, total_posts/120, total_posts/70))
356+
serializes_q5 (outdir, key_params(country_sample, total_posts/200, total_posts/100))
357+
serializes_q6 (outdir, key_params(tag_posts, total_posts/1300, total_posts/900))
358+
serializes_q7 (outdir, key_params(tag_posts, total_posts/900, total_posts/600))
359+
serializes_q8 (outdir, key_params(tag_posts, total_posts/600, total_posts/300))
360+
serializes_q9 (outdir, key_params(tagclass_posts, 6000, 25000))
381361
serializes_q13(outdir, key_params(country_sample, total_posts/200, total_posts/100))
382362
serializes_q15(outdir, key_params(country_sample, total_posts/200, total_posts/100))
383363
serializes_q16(outdir, persons, key_params(tagclass_posts, total_posts/30, total_posts/10), key_params(country_sample, total_posts/80, total_posts/20))
@@ -392,5 +372,8 @@ def main(argv=None):
392372
serializes_q20(outdir, key_params(tagclass_posts, total_posts/20, total_posts/2))
393373
serializes_q11(outdir, key_params(country_sample, total_posts/80, total_posts/20), bad_words)
394374

375+
# TODO: implement
376+
#serializes_q25(outdir, ...)
377+
395378
if __name__ == "__main__":
396379
sys.exit(main())

0 commit comments

Comments
 (0)