Skip to content

Commit c229d70

Browse files
committed
Extended BI parameter generation
1 parent 7bb2a17 commit c229d70

File tree

4 files changed

+70
-50
lines changed

4 files changed

+70
-50
lines changed

paramgenerator/generateparams.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -142,7 +142,7 @@ def main(argv=None):
142142
friendsFiles.append(indir+file)
143143

144144
# read precomputed counts from files
145-
(personFactors, countryFactors, tagFactors, tagClassFactors, nameFactors, givenNames, ts) = readfactors.load(factorFiles, friendsFiles)
145+
(personFactors, countryFactors, tagFactors, tagClassFactors, nameFactors, givenNames, ts, postHisto) = readfactors.load(factorFiles, friendsFiles)
146146

147147
# find person parameters
148148
print "find parameter bindings for Persons"

paramgenerator/generateparamsbi.py

Lines changed: 62 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,6 @@
99
from timeparameters import *
1010
from calendar import timegm
1111

12-
1312
# class ParamsWriter:
1413
# def __init__(self, name, num_params):
1514
# self.files = []
@@ -93,18 +92,18 @@ def post_month_params(sample, lower_bound, upper_bound):
9392
results.append([[start_day, end_day], count_sum])
9493
return results
9594

96-
def post_three_month_params(sample, lower_bound, upper_bound):
97-
results = []
98-
for ix in range(0, len(sample)/12):
99-
start_ix = ix*12
100-
count_sum = 0
101-
for offset, count in sample[start_ix:start_ix+12]:
102-
count_sum += count
103-
if count_sum > lower_bound and count_sum < upper_bound:
104-
start_day = sample[start_ix][0]
105-
end_day = sample[start_ix+12][0]
106-
results.append([[start_day, end_day], count_sum])
107-
return results
95+
# def post_three_month_params(sample, lower_bound, upper_bound):
96+
# results = []
97+
# for ix in range(0, len(sample)/12):
98+
# start_ix = ix*12
99+
# count_sum = 0
100+
# for offset, count in sample[start_ix:start_ix+12]:
101+
# count_sum += count
102+
# if count_sum > lower_bound and count_sum < upper_bound:
103+
# start_day = sample[start_ix][0]
104+
# end_day = sample[start_ix+12][0]
105+
# results.append([[start_day, end_day], count_sum])
106+
# return results
108107

109108

110109
def key_params(sample, lower_bound, upper_bound):
@@ -115,38 +114,24 @@ def key_params(sample, lower_bound, upper_bound):
115114
return results
116115

117116
def serialize_q1(post_weeks):
118-
f1 = open('params/q1.1.params', 'w+')
119-
fcounts = open('params/q1.counts.params', 'w+')
117+
writer = ParamsWriter("q1", 1)
120118
for week, count in post_weeks:
121-
f1.write(str(week)+"\n")
122-
fcounts.write(str(count)+"\n")
119+
writer.append([str(week)], [count])
123120

124121
def serialize_q2(country_sets, post_day_ranges):
125-
# Generate Q2 params
126-
f1 = open('params/q2.1.params', 'w+')
127-
f2 = open('params/q2.2.params', 'w+')
128-
f3 = open('params/q2.3.params', 'w+')
129-
fcounts = open('params/q2.counts.params', 'w+')
122+
writer = ParamsWriter("q2", 3)
130123
random.seed(1988+2)
131124
for country_set, count_country in country_sets:
132125
for day_range, count_post in post_day_ranges:
133126
if random.randint(0,len(country_sets) + len(post_day_ranges)) == 0:
134-
f1.write(str(day_range[0])+"\n")
135-
f2.write(str(day_range[1])+"\n")
136-
f3.write("ctry_name = '"+"' or ctry_name = '".join(country_set)+"'\n")
137-
fcounts.write(str(count_post)+"|"+str(count_country)+"\n")
127+
writer.append([str(day_range[0]), str(day_range[1]), ",".join(country_set)], [count_post,count_post,count_country])
138128

139129
def serialize_q3(post_months):
140-
# Generate Q2 params
141-
f1 = open('params/q3.1.params', 'w+')
142-
f2 = open('params/q3.2.params', 'w+')
143-
fcounts = open('params/q3.counts.params', 'w+')
130+
writer = ParamsWriter("q3", 2)
144131
for ix in range(0,len(post_months)):
145132
week_range_a, count_a = post_months[ix]
146133
for week_range_b, count_b in post_months[ix+1:]:
147-
f1.write(str(week_range_a[0])+"\n")
148-
f2.write(str(week_range_b[0])+"\n")
149-
fcounts.write(str(count_a)+"|"+str(count_b)+"\n")
134+
writer.append([str(week_range_a),str(week_range_b)], [count_a,count_b])
150135

151136
def serialize_q4(tagclasses, countries):
152137
writer = ParamsWriter("q4", 2)
@@ -188,23 +173,19 @@ def serialize_q10(tags):
188173
writer.append([tag], [count])
189174

190175
def serialize_q12(post_weeks):
191-
f1 = open('params/q12.1.params', 'w+')
192-
fcounts = open('params/q12.counts.params', 'w+')
176+
writer = ParamsWriter("q12", 1)
193177
for week, count in post_weeks:
194-
f1.write(str(week)+"\n")
195-
fcounts.write(str(count)+"\n")
178+
writer.append([str(week)], [count])
196179

197180
def serialize_q13(countries):
198181
writer = ParamsWriter("q13", 1)
199182
for country, count in countries:
200183
writer.append([country], [count])
201184

202185
def serialize_q14(creationdates):
203-
f1 = open('params/q14.1.params', 'w+')
204-
fcounts = open('params/q14.counts.params', 'w+')
186+
writer = ParamsWriter("q14", 1)
205187
for creation, count in creationdates:
206-
f1.write(str(creation[0])+"\n")
207-
fcounts.write(str(count)+"\n")
188+
writer.append([str(creation)], [count])
208189

209190
def serialize_q15(countries):
210191
writer = ParamsWriter("q15", 1)
@@ -223,10 +204,9 @@ def serialize_q17(countries):
223204
writer.append([country], [count])
224205

225206
def serialize_q18(post_weeks):
226-
f1 = open('params/q18.1.params', 'w+')
227-
fcounts = open('params/q18.counts.params', 'w+')
207+
writer = ParamsWriter("q18", 1)
228208
for week, count in post_weeks:
229-
f1.write(str(week)+"\n")
209+
writer.append([str(week)], [count])
230210

231211
def serialize_q19(tagclasses):
232212
writer = ParamsWriter("q19", 2)
@@ -257,6 +237,18 @@ def serialize_q24(tagclasses):
257237
for tagclass, count in tagclasses:
258238
writer.append([tagclass], [count])
259239

240+
def convert_posts_histo(histogram):
    """Expand a per-month post histogram into approximate weekly samples.

    Months are read from ``histogram`` starting at index 0 and continuing
    for as long as ``histogram.existParam(month)`` is true. For every month
    the value stored under key ``"p"`` is split into four equal weekly
    buckets.

    Args:
        histogram: object exposing ``existParam(month)`` and
            ``getValue(month, "p")``.
            NOTE(review): presumably the ``Factors`` instance returned by
            ``readfactors.load`` -- confirm against the caller.

    Returns:
        A list of ``[day_offset, count]`` pairs, four per month, at day
        offsets ``month*30 + 0``, ``+7``, ``+14`` and ``+21``. Each count is
        the month's value floor-divided by 4, so any remainder is dropped.
    """
    week_posts = []
    month = 0
    while histogram.existParam(month):
        month_total = histogram.getValue(month, "p")
        # Explicit floor division: same result as Python 2's int "/" but
        # stays an integer when true division is in effect.
        weekly_count = month_total // 4
        # Every month is approximated as 30 days long.
        first_day = month * 30
        for week_offset in (0, 7, 14, 21):
            week_posts.append([first_day + week_offset, weekly_count])
        month += 1
    return week_posts
251+
260252
def main(argv=None):
261253
if argv is None:
262254
argv = sys.argv
@@ -277,7 +269,8 @@ def main(argv=None):
277269
friendsFiles.append(indir+file)
278270

279271
# read precomputed counts from files
280-
(personFactors, countryFactors, tagFactors, tagClassFactors, nameFactors, givenNames, ts) = readfactors.load(factorFiles, friendsFiles)
272+
(personFactors, countryFactors, tagFactors, tagClassFactors, nameFactors, givenNames, ts, postsHisto) = readfactors.load(factorFiles, friendsFiles)
273+
week_posts = convert_posts_histo(postsHisto)
281274

282275
country_sample = []
283276
for key, value in countryFactors.values.iteritems():
@@ -293,21 +286,43 @@ def main(argv=None):
293286
total_posts = 0
294287
for day, count in tag_posts:
295288
total_posts += count
289+
290+
person_sum = 0
291+
for country, count in country_sample:
292+
person_sum += count
293+
294+
country_lower_threshold = 0.1*total_posts*0.9
295+
country_upper_threshold = 0.1*total_posts*1.1
296+
country_sets = country_sets_params(country_sample, country_lower_threshold, country_upper_threshold, 4)
297+
298+
post_lower_threshold = 0.1*total_posts*0.9
299+
post_upper_threshold = 0.1*total_posts*1.1
300+
post_day_ranges = post_date_range_params(week_posts, post_lower_threshold, post_upper_threshold)
296301

302+
post_lower_threshold = (total_posts/(week_posts[len(week_posts)-1][0]/7/4))*0.8
303+
post_upper_threshold = (total_posts/(week_posts[len(week_posts)-1][0]/7/4))*1.2
304+
post_months = post_month_params(week_posts, post_lower_threshold, post_upper_threshold)
305+
306+
serialize_q2(country_sets, post_day_ranges)
307+
serialize_q3(post_months)
308+
serialize_q14(post_month_params(week_posts, post_lower_threshold*2, post_upper_threshold*2))
309+
310+
serialize_q1(post_date_right_open_range_params(week_posts, 0.3*total_posts, 0.6*total_posts))
311+
serialize_q12(post_date_right_open_range_params(week_posts, 0.3*total_posts, 0.6*total_posts))
312+
serialize_q18(post_date_right_open_range_params(week_posts, 0.3*total_posts, 0.6*total_posts))
313+
297314
serialize_q4(key_params(tagclass_posts, total_posts/20, total_posts/10), key_params(country_sample, total_posts/120, total_posts/70))
298315
serialize_q5(key_params(country_sample, total_posts/200, total_posts/100))
299316
serialize_q6(key_params(tag_posts, total_posts/1300, total_posts/900))
300317
serialize_q7(key_params(tag_posts, total_posts/900, total_posts/600))
301318
serialize_q8(key_params(tag_posts, total_posts/600, total_posts/300))
302319
serialize_q9(key_params(tagclass_posts, 6000, 25000))
303320
serialize_q10(key_params(tag_posts, total_posts/900, total_posts/600))
304-
# serialize_q12(post_date_right_open_range_params(week_posts, 0.3*total_posts, 0.6*total_posts))
305321
serialize_q13(key_params(country_sample, total_posts/200, total_posts/100))
306322
# serialize_q14(post_month_params(week_posts, post_lower_threshold*2, post_upper_threshold*2))
307323
serialize_q15(key_params(country_sample, total_posts/200, total_posts/100))
308324
serialize_q16(key_params(tagclass_posts, total_posts/30, total_posts/10), key_params(country_sample, total_posts/110, total_posts/70))
309325
serialize_q17(key_params(country_sample, total_posts/200, total_posts/100))
310-
# serialize_q18(post_date_right_open_range_params(week_posts, 0.3*total_posts, 0.6*total_posts))
311326
serialize_q19(key_params(tagclass_posts, total_posts/60, total_posts/10))
312327
serialize_q21(key_params(country_sample, total_posts/200, total_posts/100))
313328
serialize_q22(key_params(country_sample, total_posts/120, total_posts/40))

paramgenerator/readfactors.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@ def load(factorFiles, friendFiles):
5757
print "loading input for parameter generation"
5858
results = Factors()
5959
countries = Factors()
60+
postsHisto = Factors()
6061
givenNames = NameParameter()
6162

6263
tagClasses = {}
@@ -82,6 +83,10 @@ def load(factorFiles, friendFiles):
8283
results.addValue(person, "g", int(line[6]))
8384
results.addValue(person, "w", int(line[7]))
8485
results.addValue(person, "pr", int(line[8]))
86+
for i in range((len(line)-9)/2):
87+
if not postsHisto.existParam(i):
88+
postsHisto.addNewParam(i)
89+
postsHisto.addValue(i, "p", int(line[9+i]))
8590

8691
countryCount = int(f.readline())
8792
for i in range(countryCount):
@@ -123,7 +128,7 @@ def load(factorFiles, friendFiles):
123128

124129
loadFriends(friendFiles, results)
125130

126-
return (results, countries, tags.items(), tagClasses.items(), names.items(), givenNames,timestamp)
131+
return (results, countries, tags.items(), tagClasses.items(), names.items(), givenNames,timestamp, postsHisto)
127132

128133
def loadFriends(friendFiles, factors):
129134

run.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ if [ $PARAM_GENERATION -eq 1 ]
2626
then
2727
mkdir -p substitution_parameters
2828
python paramgenerator/generateparams.py $LDBC_SNB_DATAGEN_HOME substitution_parameters/
29-
python paramgenerator/generateparamsbi.py $LDBC_SNB_DATAGEN_HOME substitution_parameters/
29+
python paramgenerator/generateparamsbi.py $LDBC_SNB_DATAGEN_HOME substitution_parameters/
3030
rm -f m*factors*
3131
rm -f .m*factors*
3232
rm -f m0friendList*

0 commit comments

Comments
 (0)