Skip to content

Commit 3257193

Browse files
authored
Merge pull request #47 from ldbc/bi-moreparams-merge
Add implementations for 25, update for 16 and 18 (@mkaufmann)
2 parents 9a2e8e5 + d46e671 commit 3257193

File tree

3 files changed

+152
-94
lines changed

3 files changed

+152
-94
lines changed

paramgenerator/README.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,9 @@
22

33
The DATAGEN application also runs the parameters generators
44

5-
For standalone testing, provide the input and output directories as parameters, for example:
5+
For standalone testing, provide the input and output directories as parameters. The input directory should contain files defining a friend list, along with activity and person factors.
6+
7+
For example:
68

79
```bash
810
./generateparamsbi.py ../hadoop ../substitution_parameters

paramgenerator/generateparams.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -269,7 +269,7 @@ def main(argv=None):
269269
# all the queries have Person as parameter
270270
for i in range(1,15):
271271
csvWriter = CSVSerializer()
272-
csvWriter.setOutputFile(outdir+"query_%d_param.txt"%(i))
272+
csvWriter.setOutputFile(outdir+"interactive_%d_param.txt"%(i))
273273
if i != 13 and i != 14: # these two queries take two Persons as parameters
274274
csvWriter.registerHandler(handlePersonParam, selectedPersonParams[i], "Person")
275275
csvWriters[i] = csvWriter

paramgenerator/generateparamsbi.py

Lines changed: 148 additions & 92 deletions
Original file line numberDiff line numberDiff line change
@@ -30,8 +30,8 @@ def format_date(date):
3030
# self.files[i].write(param+"\n")
3131

3232
class ParamsWriter:
33-
def __init__(self, outdir, name, param_names):
34-
self.file = codecs.open(outdir+"/"+name+"_param.txt", "w",encoding="utf-8")
33+
def __init__(self, outdir, number, param_names):
34+
self.file = codecs.open(outdir+"/bi_"+str(number)+"_param.txt", "w",encoding="utf-8")
3535
for i in range(0,len(param_names)):
3636
if i>0:
3737
self.file.write("|")
@@ -81,6 +81,29 @@ def post_month_params(sample, lower_bound, upper_bound):
8181
results.append([[start_day, end_day], count_sum])
8282
return results
8383

84+
def enumerate_path_bounds(minLength,maxLength,minDifference):
85+
results = []
86+
for i in range(minLength, maxLength):
87+
for j in range(i+minDifference,maxLength):
88+
results.append([i,j])
89+
return results
90+
91+
def prob_language_codes():
92+
results = []
93+
results.append(["ar"])
94+
for i in range(0, 2):
95+
results.append(["tk"])
96+
for i in range(0, 8):
97+
results.append(["uz"])
98+
for i in range(0, 2):
99+
results.append(["uz","tk"])
100+
return results
101+
102+
def prob_post_lengths():
103+
results = [20,40,113,97,240]
104+
return results
105+
106+
84107
# def post_three_month_params(sample, lower_bound, upper_bound):
85108
# results = []
86109
# for ix in range(0, len(sample)/12):
@@ -101,66 +124,69 @@ def key_params(sample, lower_bound, upper_bound):
101124
results.append([key, count])
102125
return results
103126

104-
def serializes_q1(outdir, post_weeks):
105-
writer = ParamsWriter(outdir, "q1", ["date"])
127+
def serialize_q1(outdir, post_weeks):
128+
writer = ParamsWriter(outdir, 1, ["date"])
106129
for week, count in post_weeks:
107130
writer.append([str(week)])
108131

109-
def serializes_q2(outdir, countries, post_day_ranges):
110-
writer = ParamsWriter(outdir, "q2", ["date1", "date2", "country1", "country2"])
132+
def serialize_q2(outdir, countries, post_day_ranges):
133+
writer = ParamsWriter(outdir, 2, ["date1", "date2", "country1", "country2"])
111134
for day_range, count_post in post_day_ranges:
112135
for ix in range(0,len(countries)):
113136
country_1, count_1 = countries[ix]
114137
for country_2, count_2 in countries[ix+1:]:
115138
writer.append([str(day_range[0]),str(day_range[1]),country_1,country_2])
116139

117-
def serializes_q3(outdir, post_months):
118-
writer = ParamsWriter(outdir, "q3", ["year", "month"] )
119-
# TODO year, month
140+
def serialize_q3(outdir, post_months):
141+
writer = ParamsWriter(outdir, 3, ["year", "month"] )
142+
for post_month in post_months:
143+
t = time.gmtime(post_month[0][0]/1000)
144+
writer.append([str(t.tm_year), str(t.tm_mon)])
120145

121-
def serializes_q4(outdir, tagclasses, countries):
122-
writer = ParamsWriter(outdir, "q4", ["tagClass", "country"])
146+
def serialize_q4(outdir, tagclasses, countries):
147+
writer = ParamsWriter(outdir, 4, ["tagClass", "country"])
123148
for tag, count_a in tagclasses:
124149
for country, count_b in countries:
125150
writer.append([tag,country])
126151

127-
def serializes_q5(outdir, countries):
128-
writer = ParamsWriter(outdir, "q5", ["country"])
152+
def serialize_q5(outdir, countries):
153+
writer = ParamsWriter(outdir, 5, ["country"])
129154
for country, count in countries:
130155
writer.append([country])
131156

132157

133-
def serializes_q6(outdir, tags):
134-
writer = ParamsWriter(outdir, "q6", ["tag"])
158+
def serialize_q6(outdir, tags):
159+
writer = ParamsWriter(outdir, 6, ["tag"])
135160
for tag, count in tags:
136161
writer.append([tag])
137162

138-
def serializes_q7(outdir, tags):
139-
writer = ParamsWriter(outdir, "q7", ["tag"])
163+
def serialize_q7(outdir, tags):
164+
writer = ParamsWriter(outdir, 7, ["tag"])
140165
for tag, count in tags:
141166
writer.append([tag])
142167

143-
def serializes_q8(outdir, tags):
144-
writer = ParamsWriter(outdir, "q8", ["tag"])
168+
def serialize_q8(outdir, tags):
169+
writer = ParamsWriter(outdir, 8, ["tag"])
145170
for tag, count in tags:
146171
writer.append([tag])
147172

148-
def serializes_q9(outdir, tagclasses):
149-
writer = ParamsWriter(outdir, "q9", ["tagClass1", "tagClass2", "threshold"])
173+
def serialize_q9(outdir, tagclasses):
174+
writer = ParamsWriter(outdir, 9, ["tagClass1", "tagClass2", "threshold"])
150175
for ix in range(0,len(tagclasses)):
151176
tag_class_a, count_a = tagclasses[ix]
152177
for tag_class_b, count_b in tagclasses[ix+1:]:
153178
writer.append([tag_class_a, tag_class_b, str(200)])
154179

155-
def serializes_q10(outdir, tags, post_weeks):
156-
writer = ParamsWriter(outdir, "q10", ["tag", "date"])
180+
def serialize_q10(outdir, tags, post_weeks):
181+
writer = ParamsWriter(outdir, 10, ["tag", "date"])
157182
for tag, count in tags:
158183
for week, count in post_weeks:
159184
writer.append([tag, str(week)])
160185

161-
def serializes_q11(outdir, countries, bad_words):
162-
writer = ParamsWriter(outdir, "q11", ["country", "blacklist"])
163-
random.seed(1988+2)
186+
def serialize_q11(outdir, countries, bad_words):
187+
writer = ParamsWriter(outdir, 11, ["country", "blacklist"])
188+
random.seed(1988+1)
189+
# note: this approach keeps shuffling the bad_words list
164190
for country, count in countries:
165191
num_words = random.randint(1,min(len(bad_words),4));
166192
random.shuffle(bad_words)
@@ -177,83 +203,111 @@ def serializes_q11(outdir, countries, bad_words):
177203
blacklist = bad_words[0:num_words]
178204
writer.append([country,";".join(blacklist)])
179205

180-
def serializes_q12(outdir, post_weeks):
181-
writer = ParamsWriter(outdir, "q12", ["date", "likeThreshold"])
206+
def serialize_q12(outdir, post_weeks):
207+
writer = ParamsWriter(outdir, 12, ["date", "likeThreshold"])
182208
for week, count in post_weeks:
183209
writer.append([str(week),str(400)])
184210

185-
def serializes_q13(outdir, countries):
186-
writer = ParamsWriter(outdir, "q13", ["country"])
211+
def serialize_q13(outdir, countries):
212+
writer = ParamsWriter(outdir, 13, ["country"])
187213
for country, count in countries:
188214
writer.append([country])
189215

190-
def serializes_q14(outdir, creationdates):
191-
writer = ParamsWriter(outdir, "q14", ["begin", "end"])
216+
def serialize_q14(outdir, creationdates):
217+
writer = ParamsWriter(outdir, 14, ["begin", "end"])
192218
for creation, count in creationdates:
193219
writer.append([str(creation[0]),str(creation[1])])
194220

195-
def serializes_q15(outdir, countries):
196-
writer = ParamsWriter(outdir, "q15", ["country"])
221+
def serialize_q15(outdir, countries):
222+
writer = ParamsWriter(outdir, 15, ["country"])
197223
for country, count in countries:
198224
writer.append([country])
199225

200-
def serializes_q16(outdir, persons, tagclasses, countries):
201-
writer = ParamsWriter(outdir, "q16", ["person", "tag", "country", "minPathDistance", "maxPathDistance"])
226+
def serialize_q16(outdir, persons, tagclasses, countries, path_bounds):
227+
writer = ParamsWriter(outdir, 16, ["person", "country", "tagClass", "minPathDistance", "maxPathDistance"])
202228
random.seed(1988+2)
203-
for tag, count_a in tagclasses:
204-
for country, count_b in countries:
205-
writer.append([str(persons[random.randint(0,len(persons))]), tag, country])
206-
# TODO minPathDistance and maxPathDistance are missing
229+
for country, count_b in countries:
230+
for tagClass, count_a in tagclasses:
231+
for minDist, maxDist in path_bounds:
232+
writer.append([str(persons[random.randint(0, len(persons))]), country, tagClass, str(minDist), str(maxDist)])
207233

208-
def serializes_q17(outdir, countries):
209-
writer = ParamsWriter(outdir, "q17", ["country"])
234+
def serialize_q17(outdir, countries):
235+
writer = ParamsWriter(outdir, 17, ["country"])
210236
for country, count in countries:
211237
writer.append([country])
212238

213-
def serializes_q18(outdir, post_weeks):
214-
writer = ParamsWriter(outdir, "q18", ["date", "lengthThreshold", "languages"])
239+
def serialize_q18(outdir, post_weeks, lengths, languages):
240+
writer = ParamsWriter(outdir, 18, ["date", "lengthThreshold", "languages"])
215241
for week, count in post_weeks:
216-
writer.append([str(week)])
217-
# TODO lengthThreshold and languages are missing
242+
for length in lengths:
243+
for language_set in languages:
244+
writer.append([str(week), str(length), ";".join(language_set)])
218245

219-
def serializes_q19(outdir, tagclasses):
246+
def serialize_q19(outdir, tagclasses):
220247
PERS_DATE=datetime.strptime("1989-1-1", "%Y-%m-%d")
221-
writer = ParamsWriter(outdir, "q19", ["date", "tagClass1", "tagClass2"])
248+
writer = ParamsWriter(outdir, 19, ["date", "tagClass1", "tagClass2"])
222249
for ix in range(0,len(tagclasses)):
223250
tag_class_a, count_a = tagclasses[ix]
224251
for tag_class_b, count_b in tagclasses[ix+1:]:
225252
writer.append([str(format_date(PERS_DATE)),tag_class_a, tag_class_b])
226253

227-
def serializes_q20(outdir, tagclasses):
228-
writer = ParamsWriter(outdir, "q20", ["tagClasses"]) # TODO tagclasses
229-
for tagclass, count in tagclasses:
230-
writer.append([tagclass])
254+
def serialize_q20(outdir, tagclasses):
255+
random.seed(1988+3)
256+
writer = ParamsWriter(outdir, 20, ["tagClasses"])
257+
258+
tagclasses = [tc[0] for tc in tagclasses]
259+
260+
# I'm not sure this is the correct way to approach this problem,
261+
# but it should work reasonably well
262+
num_words = random.randint(1,min(len(tagclasses),4));
263+
random.shuffle(tagclasses)
264+
tcs = tagclasses[0:num_words]
265+
writer.append([";".join(tcs)])
266+
267+
num_words = random.randint(1,min(len(tagclasses),10));
268+
random.shuffle(tagclasses)
269+
tcs = tagclasses[0:num_words]
270+
writer.append([";".join(tcs)])
231271

232-
def serializes_q21(outdir, countries):
233-
writer = ParamsWriter(outdir, "q21", ["country", "endDate"])
272+
num_words = random.randint(1,min(len(tagclasses),7));
273+
random.shuffle(tagclasses)
274+
tcs = tagclasses[0:num_words]
275+
writer.append([";".join(tcs)])
276+
277+
def serialize_q21(outdir, countries):
278+
writer = ParamsWriter(outdir, 21, ["country", "endDate"])
234279
for country, count in countries:
235280
writer.append([country,str(format_date(END_DATE))])
236281

237-
def serializes_q22(outdir, countries):
238-
writer = ParamsWriter(outdir, "q22", ["country1", "country2"])
282+
def serialize_q22(outdir, countries):
283+
writer = ParamsWriter(outdir, 22, ["country1", "country2"])
239284
for ix in range(0,len(countries)):
240285
country_a, count_a = countries[ix]
241286
for country_b, count_b in countries[ix+1:]:
242287
writer.append([country_a, country_b])
243288

244-
def serializes_q23(outdir, countries):
245-
writer = ParamsWriter(outdir, "q23", ["country"])
289+
def serialize_q23(outdir, countries):
290+
writer = ParamsWriter(outdir, 23, ["country"])
246291
for country, count in countries:
247292
writer.append([country])
248293

249-
def serializes_q24(outdir, tagclasses):
250-
writer = ParamsWriter(outdir, "q24", ["tagClass"])
294+
def serialize_q24(outdir, tagclasses):
295+
writer = ParamsWriter(outdir, 24, ["tagClass"])
251296
for tagclass, count in tagclasses:
252297
writer.append([tagclass])
253298

254-
def serializes_q25(outdir):
255-
writer = ParamsWriter(outdir, "q25", ["person1Id", "person2Id", "startDate", "endDate"])
256-
# TODO
299+
def serialize_q25(outdir, persons, post_month_ranges):
300+
writer = ParamsWriter(outdir, 25, ["person1Id", "person2Id", "startDate", "endDate"])
301+
for day_range, count_post in post_month_ranges:
302+
count = min(len(persons), 10)
303+
for _ in range(0, count):
304+
person1Id = persons[random.randint(0, len(persons) - 1)]
305+
while True:
306+
person2Id = persons[random.randint(0, len(persons) - 1)]
307+
if person2Id != person1Id:
308+
writer.append([str(person1Id), str(person2Id), str(day_range[0]), str(day_range[1])])
309+
break
310+
257311

258312
def add_months(sourcedate,months):
259313
month = sourcedate.month - 1 + months
@@ -343,37 +397,39 @@ def main(argv=None):
343397
post_upper_threshold = (total_posts/(non_empty_weeks/4))*1.2
344398
post_months = post_month_params(week_posts, post_lower_threshold, post_upper_threshold)
345399

346-
serializes_q2 (outdir, key_params(country_sample, total_posts/200, total_posts/100), post_day_ranges) # TODO determine constants
347-
serializes_q3 (outdir, post_months)
348-
serializes_q14(outdir, post_month_params(week_posts, post_lower_threshold*2, post_upper_threshold*2))
349-
350-
serializes_q1 (outdir, post_date_right_open_range_params(week_posts, 0.3*total_posts, 0.6*total_posts))
351-
serializes_q12(outdir, post_date_right_open_range_params(week_posts, 0.3*total_posts, 0.6*total_posts))
352-
serializes_q18(outdir, post_date_right_open_range_params(week_posts, 0.3*total_posts, 0.6*total_posts))
353-
serializes_q10(outdir, key_params(tag_posts, total_posts/900, total_posts/600), post_date_right_open_range_params(week_posts, 0.3*total_posts, 0.6*total_posts))
354-
355-
serializes_q4 (outdir, key_params(tagclass_posts, total_posts/20, total_posts/10), key_params(country_sample, total_posts/120, total_posts/70))
356-
serializes_q5 (outdir, key_params(country_sample, total_posts/200, total_posts/100))
357-
serializes_q6 (outdir, key_params(tag_posts, total_posts/1300, total_posts/900))
358-
serializes_q7 (outdir, key_params(tag_posts, total_posts/900, total_posts/600))
359-
serializes_q8 (outdir, key_params(tag_posts, total_posts/600, total_posts/300))
360-
serializes_q9 (outdir, key_params(tagclass_posts, 6000, 25000))
361-
serializes_q13(outdir, key_params(country_sample, total_posts/200, total_posts/100))
362-
serializes_q15(outdir, key_params(country_sample, total_posts/200, total_posts/100))
363-
serializes_q16(outdir, persons, key_params(tagclass_posts, total_posts/30, total_posts/10), key_params(country_sample, total_posts/80, total_posts/20))
364-
serializes_q17(outdir, key_params(country_sample, total_posts/200, total_posts/100))
365-
serializes_q19(outdir, key_params(tagclass_posts, total_posts/60, total_posts/10))
366-
serializes_q21(outdir, key_params(country_sample, total_posts/200, total_posts/100))
367-
serializes_q22(outdir, key_params(country_sample, total_posts/120, total_posts/40))
368-
serializes_q23(outdir, key_params(country_sample, total_posts/200, total_posts/100))
369-
serializes_q24(outdir, key_params(tagclass_posts, total_posts/140, total_posts/5))
400+
path_bounds = enumerate_path_bounds(3, 9, 2)
401+
language_codes = prob_language_codes()
402+
post_lengths = prob_post_lengths()
403+
404+
serialize_q2 (outdir, key_params(country_sample, total_posts/200, total_posts/100), post_day_ranges) # TODO determine constants
405+
serialize_q3 (outdir, post_months)
406+
serialize_q14(outdir, post_month_params(week_posts, post_lower_threshold*2, post_upper_threshold*2))
407+
408+
serialize_q1 (outdir, post_date_right_open_range_params(week_posts, 0.3*total_posts, 0.6*total_posts))
409+
serialize_q12(outdir, post_date_right_open_range_params(week_posts, 0.3*total_posts, 0.6*total_posts))
410+
serialize_q18(outdir, post_date_right_open_range_params(week_posts, 0.3*total_posts, 0.6*total_posts), post_lengths, language_codes)
411+
serialize_q10(outdir, key_params(tag_posts, total_posts/900, total_posts/600), post_date_right_open_range_params(week_posts, 0.3*total_posts, 0.6*total_posts))
412+
413+
serialize_q4 (outdir, key_params(tagclass_posts, total_posts/20, total_posts/10), key_params(country_sample, total_posts/120, total_posts/70))
414+
serialize_q5 (outdir, key_params(country_sample, total_posts/200, total_posts/100))
415+
serialize_q6 (outdir, key_params(tag_posts, total_posts/1300, total_posts/900))
416+
serialize_q7 (outdir, key_params(tag_posts, total_posts/900, total_posts/600))
417+
serialize_q8 (outdir, key_params(tag_posts, total_posts/600, total_posts/300))
418+
serialize_q9 (outdir, key_params(tagclass_posts, 6000, 25000))
419+
serialize_q13(outdir, key_params(country_sample, total_posts/200, total_posts/100))
420+
serialize_q15(outdir, key_params(country_sample, total_posts/200, total_posts/100))
421+
serialize_q16(outdir, persons, key_params(tagclass_posts, total_posts/30, total_posts/10), key_params(country_sample, total_posts/80, total_posts/20), path_bounds)
422+
serialize_q17(outdir, key_params(country_sample, total_posts/200, total_posts/100))
423+
serialize_q19(outdir, key_params(tagclass_posts, total_posts/60, total_posts/10))
424+
serialize_q21(outdir, key_params(country_sample, total_posts/200, total_posts/100))
425+
serialize_q22(outdir, key_params(country_sample, total_posts/120, total_posts/40))
426+
serialize_q23(outdir, key_params(country_sample, total_posts/200, total_posts/100))
427+
serialize_q24(outdir, key_params(tagclass_posts, total_posts/140, total_posts/5))
428+
serialize_q25(outdir, persons, post_months)
370429

371430
# TODO: Refine
372-
serializes_q20(outdir, key_params(tagclass_posts, total_posts/20, total_posts/2))
373-
serializes_q11(outdir, key_params(country_sample, total_posts/80, total_posts/20), bad_words)
374-
375-
# TODO: implement
376-
#serializes_q25(outdir, ...)
431+
serialize_q20(outdir, key_params(tagclass_posts, total_posts/20, total_posts/2))
432+
serialize_q11(outdir, key_params(country_sample, total_posts/80, total_posts/20), bad_words)
377433

378434
if __name__ == "__main__":
379435
sys.exit(main())

0 commit comments

Comments
 (0)