Skip to content

Commit 8790428

Browse files
committed
Merge branch 'bi' of github.com:ldbc/ldbc_snb_datagen into bi
2 parents d3d5214 + 695ab73 commit 8790428

File tree

1 file changed

+36
-10
lines changed

1 file changed

+36
-10
lines changed

paramgenerator/generateparamsbi.py

Lines changed: 36 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -179,8 +179,24 @@ def serialize_q10(tags):
179179
for tag, count in tags:
180180
writer.append([tag], [count])
181181

182-
def serialize_q11(countries, bad_words):
    """Write the q11 parameter rows: a country plus a random word blacklist.

    For every (country, count) pair in ``countries`` three rows are written.
    Each row's blacklist is a random selection of between 1 and
    min(len(bad_words), cap) words, where cap is 4, 10 and 7 for the first,
    second and third row respectively.  The words are joined with ";".

    countries -- iterable of (country, count) pairs
    bad_words -- list of candidate blacklist words; it is NOT modified

    The random generator is re-seeded with a fixed value so the output is
    reproducible.  random.randint raises ValueError if ``bad_words`` is empty
    while ``countries`` is not.
    """
    writer = ParamsWriter("q11", ["country", "blacklist"])
    random.seed(1988+2)
    # Shuffle a private copy: random.shuffle works in place, and shuffling
    # the argument directly would silently reorder the caller's list.
    words = list(bad_words)
    for country, count in countries:
        # One row per cap; the randint-then-shuffle order is kept so the
        # sequence of random draws is the same as with three pasted stanzas.
        for cap in (4, 10, 7):
            num_words = random.randint(1, min(len(words), cap))
            random.shuffle(words)
            blacklist = words[0:num_words]
            writer.append([country, ";".join(blacklist)], [count])
184200

185201
def serialize_q12(post_weeks):
186202
writer = ParamsWriter("q12", ["creationDate", "likeCount"])
@@ -202,11 +218,12 @@ def serialize_q15(countries):
202218
for country, count in countries:
203219
writer.append([country], [count])
204220

205-
def serialize_q16(persons, tagclasses, countries):
    """Write the q16 parameter rows: a random person, a tag and a country.

    One row is written for every combination of an entry of ``tagclasses``
    and an entry of ``countries``; each row gets a person drawn at random
    from ``persons``.

    persons    -- non-empty sequence of person identifiers (str() is applied)
    tagclasses -- iterable of (tag, count) pairs
    countries  -- iterable of (country, count) pairs; it is iterated once per
                  tag, so it must be re-iterable (e.g. a list)

    The random generator is re-seeded with a fixed value so the output is
    reproducible.
    """
    writer = ParamsWriter("q16", ["person", "tag", "country"])
    random.seed(1988+2)
    for tag, count_a in tagclasses:
        for country, count_b in countries:
            # random.choice always yields a valid element.  The previous
            # random.randint(0, len(persons)) included the upper bound and
            # could index one past the end of the list.
            person = random.choice(persons)
            # The leading 0 is the value passed alongside the two counts for
            # the person column.
            writer.append([str(person), tag, country], [0, count_a, count_b])
210227

211228
def serialize_q17(countries):
212229
writer = ParamsWriter("q17", ["country"])
@@ -226,8 +243,10 @@ def serialize_q19(tagclasses):
226243
for tag_class_b, count_b in tagclasses[ix+1:]:
227244
writer.append([str(format_date(PERS_DATE)),tag_class_a, tag_class_b], [count_a, count_b])
228245

229-
def serialize_q20(tagclasses):
    """Write one q20 parameter row for each (tag class, count) pair given."""
    out = ParamsWriter("q20", ["tagclass"])
    for entry in tagclasses:
        # Each entry is a two-element pair: the tag class and its count.
        name, weight = entry
        out.append([name], [weight])
231250

232251
def serialize_q21(countries):
233252
writer = ParamsWriter("q21", ["country","endDate"])
@@ -294,6 +313,12 @@ def main(argv=None):
294313
(personFactors, countryFactors, tagFactors, tagClassFactors, nameFactors, givenNames, ts, postsHisto) = readfactors.load(factorFiles, friendsFiles)
295314
week_posts = convert_posts_histo(postsHisto)
296315

316+
persons = []
317+
for key, _ in personFactors.values.iteritems():
318+
persons.append(key)
319+
random.seed(1988)
320+
random.shuffle(persons)
321+
297322
country_sample = []
298323
for key, value in countryFactors.values.iteritems():
299324
country_sample.append([key, value.getValue("p")])
@@ -321,6 +346,7 @@ def main(argv=None):
321346
post_upper_threshold = 0.1*total_posts*1.1
322347
post_day_ranges = post_date_range_params(week_posts, post_lower_threshold, post_upper_threshold)
323348

349+
bad_words = ['Augustine','William','James','with','Henry','Robert','from','Pope','Hippo','album','David','has','one','also','Green','which','that']
324350
#post_lower_threshold = (total_posts/(week_posts[len(week_posts)-1][0]/7/4))*0.8
325351
#post_upper_threshold = (total_posts/(week_posts[len(week_posts)-1][0]/7/4))*1.2
326352
non_empty_weeks=len(week_posts)
@@ -348,18 +374,18 @@ def main(argv=None):
348374
serialize_q9(key_params(tagclass_posts, 6000, 25000))
349375
serialize_q10(key_params(tag_posts, total_posts/900, total_posts/600))
350376
serialize_q13(key_params(country_sample, total_posts/200, total_posts/100))
351-
# serialize_q14(post_month_params(week_posts, post_lower_threshold*2, post_upper_threshold*2))
352377
serialize_q15(key_params(country_sample, total_posts/200, total_posts/100))
353-
serialize_q16(key_params(tagclass_posts, total_posts/30, total_posts/10), key_params(country_sample, total_posts/110, total_posts/70))
378+
serialize_q16(persons, key_params(tagclass_posts, total_posts/30, total_posts/10), key_params(country_sample, total_posts/80, total_posts/20))
354379
serialize_q17(key_params(country_sample, total_posts/200, total_posts/100))
355380
serialize_q19(key_params(tagclass_posts, total_posts/60, total_posts/10))
356381
serialize_q21(key_params(country_sample, total_posts/200, total_posts/100))
357382
serialize_q22(key_params(country_sample, total_posts/120, total_posts/40))
358383
serialize_q23(key_params(country_sample, total_posts/200, total_posts/100))
359384
serialize_q24(key_params(tagclass_posts, total_posts/140, total_posts/5))
360385

361-
serialize_q11()
362-
serialize_q20()
386+
# TODO: Refine
387+
serialize_q20(key_params(tagclass_posts, total_posts/20, total_posts/2))
388+
serialize_q11(key_params(country_sample, total_posts/80, total_posts/20), bad_words)
363389

364390
if __name__ == "__main__":
365391
sys.exit(main())

0 commit comments

Comments
 (0)