Skip to content

Commit 7bb2a17

Browse files
committed
First version of BI parameter generation
1 parent ff136bd commit 7bb2a17

File tree

2 files changed

+319
-0
lines changed

2 files changed

+319
-0
lines changed

paramgenerator/generateparamsbi.py

Lines changed: 318 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,318 @@
1+
import sys
2+
import discoverparams
3+
import readfactors
4+
import random
5+
import json
6+
import os
7+
import codecs
8+
from datetime import date
9+
from timeparameters import *
10+
from calendar import timegm
11+
12+
13+
# class ParamsWriter:
14+
# def __init__(self, name, num_params):
15+
# self.files = []
16+
# for i in range(0, num_params):
17+
# self.files.append(codecs.open("params/"+name+"."+str(i+1)+".params", "w",encoding="utf-8"))
18+
19+
# def append(self, params, counts):
20+
# for i, param in enumerate(params):
21+
# self.files[i].write(param+"\n")
22+
23+
class ParamsWriter:
    """Writer for one pipe-separated substitution-parameter file.

    The file is created at ``substitution_parameters/<name>_param.txt``
    (UTF-8) and starts with a header row ``Param0|Param1|...``. Each call
    to :meth:`append` adds one row. The writer can be used as a context
    manager, or closed explicitly with :meth:`close`, so that rows are
    flushed deterministically instead of relying on garbage collection.
    """

    def __init__(self, name, num_params):
        """Open the output file and write the header row.

        name       -- query name used to build the file name.
        num_params -- number of columns announced in the header.
        """
        self.file = codecs.open("substitution_parameters/"+name+"_param.txt", "w",encoding="utf-8")
        header = "|".join("Param"+str(i) for i in range(0, num_params))
        self.file.write(header)
        self.file.write("\n")

    def append(self, params, counts):
        """Write one row made of the strings in ``params``.

        ``counts`` is accepted for interface compatibility with callers
        but is not written to the file.
        """
        self.file.write("|".join(params))
        self.file.write("\n")

    def close(self):
        """Flush and close the underlying file."""
        self.file.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        # Never swallow an exception raised inside the ``with`` body.
        return False
38+
39+
40+
def country_sets_params(sample, lower_bound, upper_bound, max_depth, start = 0):
    """Find sets of countries whose combined count lies in an open interval.

    sample      -- sequence of (country, count) pairs.
    lower_bound -- exclusive lower limit for the combined count.
    upper_bound -- exclusive upper limit for the combined count.
    max_depth   -- maximum number of countries in one set; 0 yields [].
    start       -- index in ``sample`` from which countries are considered.

    Returns a list of ``[countries, combined_count]`` entries, where
    ``countries`` is a list of country keys.
    """
    if max_depth == 0:
        return []

    results = []
    # Iterate by index so that the recursion always continues strictly
    # after the current country, even when earlier entries were skipped.
    # (A manually maintained counter fell behind on ``continue`` and could
    # pair a country with itself.)
    for ix in range(start, len(sample)):
        country, count = sample[ix]
        # Too small to reach lower_bound even with max_depth entries of
        # this size: not worth combining.
        if count < (lower_bound / (max_depth + 1)):
            continue
        if count < lower_bound:
            others = country_sets_params(sample, lower_bound-count, upper_bound-count, max_depth - 1, ix + 1)
            for other_countries, other_count in others:
                combined_count = count + other_count
                if combined_count > lower_bound and combined_count < upper_bound:
                    other_countries.append(country)
                    results.append([other_countries, combined_count])
        if count > lower_bound and count < upper_bound:
            results.append([[country], count])
    return results
60+
61+
def post_date_right_open_range_params(sample, lower_bound, upper_bound):
    """Collect start offsets whose running count falls inside the bounds.

    For every position in ``sample`` (a sequence of (offset, count) pairs)
    the counts are accumulated from that position onwards; each time the
    running total lies strictly between ``lower_bound`` and
    ``upper_bound`` an entry ``[start_offset, running_total]`` is emitted.
    """
    matches = []
    for first, (start_offset, _) in enumerate(sample):
        running_total = 0
        for _, count in sample[first:]:
            running_total += count
            if lower_bound < running_total < upper_bound:
                matches.append([start_offset, running_total])
    return matches
71+
72+
def post_date_range_params(sample, lower_bound, upper_bound):
    """Collect (start, end) offset ranges whose summed count is in bounds.

    ``sample`` is a sequence of (offset, count) pairs. For every start
    position the counts are accumulated entry by entry; whenever the
    running total lies strictly between ``lower_bound`` and
    ``upper_bound`` an entry ``[[start_offset, end_offset], total]`` is
    emitted, where ``end_offset`` is the offset of the last entry summed.
    """
    matches = []
    for first, (start_offset, _) in enumerate(sample):
        running_total = 0
        for end_offset, count in sample[first:]:
            running_total += count
            if lower_bound < running_total < upper_bound:
                matches.append([[start_offset, end_offset], running_total])
    return matches
82+
83+
def post_month_params(sample, lower_bound, upper_bound):
    """Group the sample into blocks of 4 entries and keep blocks in bounds.

    ``sample`` is a sequence of (offset, count) pairs. Only complete
    groups of 4 consecutive entries are considered. A group is kept when
    the sum of its counts lies strictly between ``lower_bound`` and
    ``upper_bound``.

    Returns a list of ``[[start_day, end_day], count_sum]`` entries.
    ``start_day`` is the offset of the group's first entry and ``end_day``
    the offset of the entry that follows the group. When the group is the
    tail of the sample there is no following entry, so the offset of the
    last entry is used instead of raising IndexError.
    """
    group_size = 4
    results = []
    last_ix = len(sample) - 1
    # Floor division keeps the group count an int on Python 2 and 3.
    for ix in range(0, len(sample) // group_size):
        start_ix = ix * group_size
        count_sum = 0
        for offset, count in sample[start_ix:start_ix + group_size]:
            count_sum += count
        if count_sum > lower_bound and count_sum < upper_bound:
            start_day = sample[start_ix][0]
            # Clamp so the final group does not index past the sample.
            end_day = sample[min(start_ix + group_size, last_ix)][0]
            results.append([[start_day, end_day], count_sum])
    return results
95+
96+
def post_three_month_params(sample, lower_bound, upper_bound):
    """Group the sample into blocks of 12 entries and keep blocks in bounds.

    ``sample`` is a sequence of (offset, count) pairs. Only complete
    groups of 12 consecutive entries are considered. A group is kept when
    the sum of its counts lies strictly between ``lower_bound`` and
    ``upper_bound``.

    Returns a list of ``[[start_day, end_day], count_sum]`` entries.
    ``start_day`` is the offset of the group's first entry and ``end_day``
    the offset of the entry that follows the group. When the group is the
    tail of the sample there is no following entry, so the offset of the
    last entry is used instead of raising IndexError.
    """
    group_size = 12
    results = []
    last_ix = len(sample) - 1
    # Floor division keeps the group count an int on Python 2 and 3.
    for ix in range(0, len(sample) // group_size):
        start_ix = ix * group_size
        count_sum = 0
        for offset, count in sample[start_ix:start_ix + group_size]:
            count_sum += count
        if count_sum > lower_bound and count_sum < upper_bound:
            start_day = sample[start_ix][0]
            # Clamp so the final group does not index past the sample.
            end_day = sample[min(start_ix + group_size, last_ix)][0]
            results.append([[start_day, end_day], count_sum])
    return results
108+
109+
110+
def key_params(sample, lower_bound, upper_bound):
    """Return ``[key, count]`` for every pair whose count is in bounds.

    ``sample`` is a sequence of (key, count) pairs; a pair is kept when
    ``lower_bound < count < upper_bound`` (both limits exclusive). Input
    order is preserved.
    """
    return [
        [key, count]
        for key, count in sample
        if lower_bound < count < upper_bound
    ]
116+
117+
def serialize_q1(post_weeks):
    """Write the Q1 parameter and count files.

    ``post_weeks`` is an iterable of (week, count) pairs. Each week is
    written on its own line to ``params/q1.1.params`` and the matching
    count to ``params/q1.counts.params``. Both files are closed on return.
    """
    with open('params/q1.1.params', 'w+') as f1, \
            open('params/q1.counts.params', 'w+') as fcounts:
        for week, count in post_weeks:
            f1.write(str(week)+"\n")
            fcounts.write(str(count)+"\n")
123+
124+
def serialize_q2(country_sets, post_day_ranges):
    """Write a random sample of Q2 parameter combinations.

    country_sets    -- iterable of (list of country names, count) pairs;
                       must support ``len``.
    post_day_ranges -- iterable of ((start, end), count) pairs; must
                       support ``len``.

    For each (country set, day range) combination that is sampled, the
    range start goes to ``params/q2.1.params``, the range end to
    ``params/q2.2.params``, a ``ctry_name = '...' or ...`` predicate to
    ``params/q2.3.params`` and ``<post count>|<country count>`` to
    ``params/q2.counts.params``. The global random generator is seeded
    with a fixed value so the sample is reproducible. All files are
    closed on return.
    """
    # Generate Q2 params
    with open('params/q2.1.params', 'w+') as f1, \
            open('params/q2.2.params', 'w+') as f2, \
            open('params/q2.3.params', 'w+') as f3, \
            open('params/q2.counts.params', 'w+') as fcounts:
        random.seed(1988+2)
        # Each combination is kept with probability 1 / (sample_span + 1).
        sample_span = len(country_sets) + len(post_day_ranges)
        for country_set, count_country in country_sets:
            for day_range, count_post in post_day_ranges:
                if random.randint(0, sample_span) == 0:
                    f1.write(str(day_range[0])+"\n")
                    f2.write(str(day_range[1])+"\n")
                    f3.write("ctry_name = '"+"' or ctry_name = '".join(country_set)+"'\n")
                    fcounts.write(str(count_post)+"|"+str(count_country)+"\n")
138+
139+
def serialize_q3(post_months):
    """Write Q3 parameters for every unordered pair of month ranges.

    ``post_months`` is a sequence of ((start, end), count) pairs. For each
    pair of entries (a, b) with a before b, the start of a is written to
    ``params/q3.1.params``, the start of b to ``params/q3.2.params`` and
    ``<count a>|<count b>`` to ``params/q3.counts.params``. All files are
    closed on return.
    """
    # Generate Q3 params
    with open('params/q3.1.params', 'w+') as f1, \
            open('params/q3.2.params', 'w+') as f2, \
            open('params/q3.counts.params', 'w+') as fcounts:
        for ix, (week_range_a, count_a) in enumerate(post_months):
            for week_range_b, count_b in post_months[ix+1:]:
                f1.write(str(week_range_a[0])+"\n")
                f2.write(str(week_range_b[0])+"\n")
                fcounts.write(str(count_a)+"|"+str(count_b)+"\n")
150+
151+
def serialize_q4(tagclasses, countries):
    """Write one Q4 row for every (tag class, country) combination.

    ``tagclasses`` and ``countries`` are iterables of (key, count) pairs.
    ``countries`` is iterated once per tag class, so it must be
    re-iterable (e.g. a list).
    """
    writer = ParamsWriter("q4", 2)
    try:
        for tag, count_a in tagclasses:
            for country, count_b in countries:
                writer.append([tag,country], [count_a,count_b])
    finally:
        # Close the writer's file so the rows are flushed deterministically.
        writer.file.close()
156+
157+
def serialize_q5(countries):
    """Write one Q5 row per country.

    ``countries`` is an iterable of (country, count) pairs.
    """
    writer = ParamsWriter("q5", 1)
    try:
        for country, count in countries:
            writer.append([country], [count])
    finally:
        # Close the writer's file so the rows are flushed deterministically.
        writer.file.close()
161+
162+
163+
def serialize_q6(tags):
    """Write one Q6 row per tag.

    ``tags`` is an iterable of (tag, count) pairs.
    """
    writer = ParamsWriter("q6", 1)
    try:
        for tag, count in tags:
            writer.append([tag], [count])
    finally:
        # Close the writer's file so the rows are flushed deterministically.
        writer.file.close()
167+
168+
def serialize_q7(tags):
    """Write one Q7 row per tag.

    ``tags`` is an iterable of (tag, count) pairs.
    """
    writer = ParamsWriter("q7", 1)
    try:
        for tag, count in tags:
            writer.append([tag], [count])
    finally:
        # Close the writer's file so the rows are flushed deterministically.
        writer.file.close()
172+
173+
def serialize_q8(tags):
    """Write one Q8 row per tag.

    ``tags`` is an iterable of (tag, count) pairs.
    """
    writer = ParamsWriter("q8", 1)
    try:
        for tag, count in tags:
            writer.append([tag], [count])
    finally:
        # Close the writer's file so the rows are flushed deterministically.
        writer.file.close()
177+
178+
def serialize_q9(tagclasses):
    """Write one Q9 row for every unordered pair of tag classes.

    ``tagclasses`` is a sequence of (tag class, count) pairs; each entry
    is paired with every entry that follows it.
    """
    writer = ParamsWriter("q9", 2)
    try:
        for ix, (tag_class_a, count_a) in enumerate(tagclasses):
            for tag_class_b, count_b in tagclasses[ix+1:]:
                writer.append([tag_class_a, tag_class_b], [count_a, count_b])
    finally:
        # Close the writer's file so the rows are flushed deterministically.
        writer.file.close()
184+
185+
def serialize_q10(tags):
    """Write one Q10 row per tag.

    ``tags`` is an iterable of (tag, count) pairs.
    """
    writer = ParamsWriter("q10", 1)
    try:
        for tag, count in tags:
            writer.append([tag], [count])
    finally:
        # Close the writer's file so the rows are flushed deterministically.
        writer.file.close()
189+
190+
def serialize_q12(post_weeks):
    """Write the Q12 parameter and count files.

    ``post_weeks`` is an iterable of (week, count) pairs. Each week is
    written on its own line to ``params/q12.1.params`` and the matching
    count to ``params/q12.counts.params``. Both files are closed on
    return.
    """
    with open('params/q12.1.params', 'w+') as f1, \
            open('params/q12.counts.params', 'w+') as fcounts:
        for week, count in post_weeks:
            f1.write(str(week)+"\n")
            fcounts.write(str(count)+"\n")
196+
197+
def serialize_q13(countries):
    """Write one Q13 row per country.

    ``countries`` is an iterable of (country, count) pairs.
    """
    writer = ParamsWriter("q13", 1)
    try:
        for country, count in countries:
            writer.append([country], [count])
    finally:
        # Close the writer's file so the rows are flushed deterministically.
        writer.file.close()
201+
202+
def serialize_q14(creationdates):
    """Write the Q14 parameter and count files.

    ``creationdates`` is an iterable of (range, count) pairs. The first
    element of each range is written to ``params/q14.1.params`` and the
    matching count to ``params/q14.counts.params``. Both files are closed
    on return.
    """
    with open('params/q14.1.params', 'w+') as f1, \
            open('params/q14.counts.params', 'w+') as fcounts:
        for creation, count in creationdates:
            f1.write(str(creation[0])+"\n")
            fcounts.write(str(count)+"\n")
208+
209+
def serialize_q15(countries):
    """Write one Q15 row per country.

    ``countries`` is an iterable of (country, count) pairs.
    """
    writer = ParamsWriter("q15", 1)
    try:
        for country, count in countries:
            writer.append([country], [count])
    finally:
        # Close the writer's file so the rows are flushed deterministically.
        writer.file.close()
213+
214+
def serialize_q16(tagclasses, countries):
    """Write one Q16 row for every (tag class, country) combination.

    ``tagclasses`` and ``countries`` are iterables of (key, count) pairs.
    ``countries`` is iterated once per tag class, so it must be
    re-iterable (e.g. a list).
    """
    writer = ParamsWriter("q16", 2)
    try:
        for tag, count_a in tagclasses:
            for country, count_b in countries:
                writer.append([tag, country], [count_a, count_b])
    finally:
        # Close the writer's file so the rows are flushed deterministically.
        writer.file.close()
219+
220+
def serialize_q17(countries):
    """Write one Q17 row per country.

    ``countries`` is an iterable of (country, count) pairs.
    """
    writer = ParamsWriter("q17", 1)
    try:
        for country, count in countries:
            writer.append([country], [count])
    finally:
        # Close the writer's file so the rows are flushed deterministically.
        writer.file.close()
224+
225+
def serialize_q18(post_weeks):
    """Write the Q18 parameter and count files.

    ``post_weeks`` is an iterable of (week, count) pairs. Each week is
    written on its own line to ``params/q18.1.params`` and the matching
    count to ``params/q18.counts.params``. Both files are closed on
    return.
    """
    with open('params/q18.1.params', 'w+') as f1, \
            open('params/q18.counts.params', 'w+') as fcounts:
        for week, count in post_weeks:
            f1.write(str(week)+"\n")
            # The counts file was previously created but left empty; write
            # the counts as the q1/q12 serializers do.
            fcounts.write(str(count)+"\n")
230+
231+
def serialize_q19(tagclasses):
    """Write one Q19 row for every unordered pair of tag classes.

    ``tagclasses`` is a sequence of (tag class, count) pairs; each entry
    is paired with every entry that follows it.
    """
    writer = ParamsWriter("q19", 2)
    try:
        for ix, (tag_class_a, count_a) in enumerate(tagclasses):
            for tag_class_b, count_b in tagclasses[ix+1:]:
                writer.append([tag_class_a, tag_class_b], [count_a, count_b])
    finally:
        # Close the writer's file so the rows are flushed deterministically.
        writer.file.close()
237+
238+
def serialize_q21(countries):
    """Write one Q21 row per country.

    ``countries`` is an iterable of (country, count) pairs.
    """
    writer = ParamsWriter("q21", 1)
    try:
        for country, count in countries:
            writer.append([country], [count])
    finally:
        # Close the writer's file so the rows are flushed deterministically.
        writer.file.close()
242+
243+
def serialize_q22(countries):
    """Write one Q22 row for every unordered pair of countries.

    ``countries`` is a sequence of (country, count) pairs; each entry is
    paired with every entry that follows it.
    """
    writer = ParamsWriter("q22", 2)
    try:
        for ix, (country_a, count_a) in enumerate(countries):
            for country_b, count_b in countries[ix+1:]:
                writer.append([country_a, country_b], [count_a, count_b])
    finally:
        # Close the writer's file so the rows are flushed deterministically.
        writer.file.close()
249+
250+
def serialize_q23(countries):
    """Write one Q23 row per country.

    ``countries`` is an iterable of (country, count) pairs.
    """
    writer = ParamsWriter("q23", 1)
    try:
        for country, count in countries:
            writer.append([country], [count])
    finally:
        # Close the writer's file so the rows are flushed deterministically.
        writer.file.close()
254+
255+
def serialize_q24(tagclasses):
    """Write one Q24 row per tag class.

    ``tagclasses`` is an iterable of (tag class, count) pairs.
    """
    writer = ParamsWriter("q24", 1)
    try:
        for tagclass, count in tagclasses:
            writer.append([tagclass], [count])
    finally:
        # Close the writer's file so the rows are flushed deterministically.
        writer.file.close()
259+
260+
def main(argv=None):
    """Generate the BI substitution-parameter files from factor files.

    argv -- argument vector; defaults to ``sys.argv``. ``argv[1]`` is the
            input directory that is scanned for ``*factors.txt`` and
            ``m0friendList*`` files. ``argv[2]`` must be present.

    Returns 1 when too few arguments are given, otherwise None.
    """
    if argv is None:
        argv = sys.argv

    if len(argv) < 3:
        # Parenthesised single-argument form prints the same text on
        # Python 2 and Python 3.
        print("arguments: <input dir> <output>")
        return 1

    indir = argv[1]+"/"
    factorFiles=[]
    friendsFiles = []
    # NOTE(review): argv[2] (the output directory) is required but not
    # used here; ParamsWriter writes to a fixed "substitution_parameters/"
    # directory.

    for entry in os.listdir(indir):
        if entry.endswith("factors.txt"):
            factorFiles.append(indir+entry)
        if entry.startswith("m0friendList"):
            friendsFiles.append(indir+entry)

    # read precomputed counts from files
    (personFactors, countryFactors, tagFactors, tagClassFactors, nameFactors, givenNames, ts) = readfactors.load(factorFiles, friendsFiles)

    # presumably countryFactors.values is a dict of per-country factor
    # objects; verify against readfactors.
    country_sample = []
    for key, value in countryFactors.values.items():
        country_sample.append([key, value.getValue("p")])
    country_sample.sort(key=lambda x: x[1], reverse=True)

    # Both lists are sorted in place, most frequent first.
    tagclass_posts = tagClassFactors
    tagclass_posts.sort(key=lambda x: x[1], reverse=True)

    tag_posts = tagFactors
    tag_posts.sort(key=lambda x: x[1], reverse=True)

    total_posts = 0
    for _, count in tag_posts:
        total_posts += count

    serialize_q4(key_params(tagclass_posts, total_posts/20, total_posts/10), key_params(country_sample, total_posts/120, total_posts/70))
    serialize_q5(key_params(country_sample, total_posts/200, total_posts/100))
    serialize_q6(key_params(tag_posts, total_posts/1300, total_posts/900))
    serialize_q7(key_params(tag_posts, total_posts/900, total_posts/600))
    serialize_q8(key_params(tag_posts, total_posts/600, total_posts/300))
    serialize_q9(key_params(tagclass_posts, 6000, 25000))
    serialize_q10(key_params(tag_posts, total_posts/900, total_posts/600))
    # Disabled: week_posts is not computed in this function.
    # serialize_q12(post_date_right_open_range_params(week_posts, 0.3*total_posts, 0.6*total_posts))
    serialize_q13(key_params(country_sample, total_posts/200, total_posts/100))
    # Disabled: week_posts and the post thresholds are not computed in this function.
    # serialize_q14(post_month_params(week_posts, post_lower_threshold*2, post_upper_threshold*2))
    serialize_q15(key_params(country_sample, total_posts/200, total_posts/100))
    serialize_q16(key_params(tagclass_posts, total_posts/30, total_posts/10), key_params(country_sample, total_posts/110, total_posts/70))
    serialize_q17(key_params(country_sample, total_posts/200, total_posts/100))
    # Disabled: week_posts is not computed in this function.
    # serialize_q18(post_date_right_open_range_params(week_posts, 0.3*total_posts, 0.6*total_posts))
    serialize_q19(key_params(tagclass_posts, total_posts/60, total_posts/10))
    serialize_q21(key_params(country_sample, total_posts/200, total_posts/100))
    serialize_q22(key_params(country_sample, total_posts/120, total_posts/40))
    serialize_q23(key_params(country_sample, total_posts/200, total_posts/100))
    serialize_q24(key_params(tagclass_posts, total_posts/140, total_posts/5))
316+
317+
# Script entry point: run main() and use its return value as the exit status.
if __name__ == "__main__":
    raise SystemExit(main())

run.sh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ if [ $PARAM_GENERATION -eq 1 ]
2626
then
2727
mkdir -p substitution_parameters
2828
python paramgenerator/generateparams.py $LDBC_SNB_DATAGEN_HOME substitution_parameters/
29+
python paramgenerator/generateparamsbi.py $LDBC_SNB_DATAGEN_HOME substitution_parameters/
2930
rm -f m*factors*
3031
rm -f .m*factors*
3132
rm -f m0friendList*

0 commit comments

Comments
 (0)