|
| 1 | +import sys |
| 2 | +import discoverparams |
| 3 | +import readfactors |
| 4 | +import random |
| 5 | +import json |
| 6 | +import os |
| 7 | +import codecs |
| 8 | +from datetime import date |
| 9 | +from timeparameters import * |
| 10 | +from calendar import timegm |
| 11 | + |
| 12 | + |
| 13 | +# class ParamsWriter: |
| 14 | +# def __init__(self, name, num_params): |
| 15 | +# self.files = [] |
| 16 | +# for i in range(0, num_params): |
| 17 | +# self.files.append(codecs.open("params/"+name+"."+str(i+1)+".params", "w",encoding="utf-8")) |
| 18 | + |
| 19 | +# def append(self, params, counts): |
| 20 | +# for i, param in enumerate(params): |
| 21 | +# self.files[i].write(param+"\n") |
| 22 | + |
class ParamsWriter(object):
    """Write pipe-separated substitution parameters for one query.

    The output file is ``substitution_parameters/<name>_param.txt``. Its first
    line is a header ``Param0|Param1|...`` with ``num_params`` columns; each
    call to ``append`` adds one data row.

    The writer can be used as a context manager (or closed explicitly with
    ``close``) so the file is flushed and released deterministically.
    """

    def __init__(self, name, num_params):
        """Open the output file for query ``name`` and write the header row.

        The ``substitution_parameters`` directory must already exist;
        ``codecs.open`` does not create it.
        """
        self.file = codecs.open(
            "substitution_parameters/" + name + "_param.txt",
            "w", encoding="utf-8")
        header = "|".join("Param" + str(i) for i in range(num_params))
        self.file.write(header + "\n")

    def append(self, params, counts):
        """Write one row consisting of the strings in ``params``.

        ``counts`` is accepted for interface compatibility but is not written.
        """
        self.file.write("|".join(params) + "\n")

    def close(self):
        """Close the underlying file."""
        self.file.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        # Never suppress an exception raised inside the ``with`` body.
        return False
| 38 | + |
| 39 | + |
def country_sets_params(sample, lower_bound, upper_bound, max_depth, start=0):
    """Find country combinations whose summed count lies strictly in range.

    ``sample`` is a sequence of ``[country, count]`` pairs. Only entries at
    index ``start`` or later are considered, and at most ``max_depth``
    countries are combined.

    Returns a list of ``[countries, total]`` pairs with
    ``lower_bound < total < upper_bound``. In a combined entry the current
    country is appended after the countries found by the recursive call.
    """
    if max_depth == 0:
        return []

    results = []
    # Entries below this threshold are skipped outright (loop-invariant).
    min_count = lower_bound / (max_depth + 1)
    # enumerate keeps ix equal to the entry's real index even when an entry
    # is skipped; a hand-maintained counter fell behind on `continue`, which
    # let the recursion start too early and pair a country with itself.
    for ix, (country, count) in enumerate(sample[start:], start):
        if count < min_count:
            continue
        if count < lower_bound:
            # Too small on its own: look for later entries that fill the gap.
            others = country_sets_params(
                sample, lower_bound - count, upper_bound - count,
                max_depth - 1, ix + 1)
            for other_countries, other_count in others:
                combined_count = count + other_count
                if lower_bound < combined_count < upper_bound:
                    other_countries.append(country)
                    results.append([other_countries, combined_count])
        if lower_bound < count < upper_bound:
            results.append([[country], count])
    return results
| 60 | + |
def post_date_right_open_range_params(sample, lower_bound, upper_bound):
    """Pick start offsets whose open-ended range holds an in-range count.

    ``sample`` is a sequence of ``[offset, count]`` pairs. For every entry,
    the counts from that entry to the end of ``sample`` are summed; if the
    sum lies strictly between the bounds, ``[offset, sum]`` is returned.
    """
    # NOTE(review): indentation was lost in the reviewed copy; this assumes
    # the bounds check runs once per start, after the whole tail is summed
    # (the result carries no end offset) -- confirm against the repository.
    matches = []
    for first, entry in enumerate(sample):
        tail_total = sum(amount for _, amount in sample[first:])
        if lower_bound < tail_total < upper_bound:
            matches.append([entry[0], tail_total])
    return matches
| 71 | + |
def post_date_range_params(sample, lower_bound, upper_bound):
    """Enumerate closed offset ranges whose summed count is in range.

    ``sample`` is a sequence of ``[offset, count]`` pairs. For every start
    entry a running sum is extended one entry at a time; each time the sum
    lies strictly between the bounds, ``[[start_offset, end_offset], sum]``
    is recorded.
    """
    matches = []
    for first in range(len(sample)):
        begin = sample[first][0]
        running = 0
        for end, amount in sample[first:]:
            running += amount
            if lower_bound < running < upper_bound:
                matches.append([[begin, end], running])
    return matches
| 82 | + |
def _windowed_post_params(sample, lower_bound, upper_bound, window):
    """Sum counts over consecutive, non-overlapping windows of ``sample``.

    ``sample`` is a sequence of ``[offset, count]`` pairs. A window covers
    ``window`` consecutive entries; its end offset is the offset of the entry
    that follows it. Windows whose sum lies strictly between the bounds are
    returned as ``[[start_offset, end_offset], sum]``.
    """
    results = []
    # Stop early enough that sample[start_ix + window] always exists. A
    # trailing window with no following entry has no known end offset and is
    # skipped (indexing past the end used to raise IndexError there).
    for start_ix in range(0, len(sample) - window, window):
        count_sum = sum(
            count for _, count in sample[start_ix:start_ix + window])
        if lower_bound < count_sum < upper_bound:
            start_day = sample[start_ix][0]
            end_day = sample[start_ix + window][0]
            results.append([[start_day, end_day], count_sum])
    return results


def post_month_params(sample, lower_bound, upper_bound):
    """Return in-range windows of 4 consecutive entries of ``sample``.

    Presumably the entries are weekly buckets, making a window roughly one
    month -- verify against the caller.
    """
    return _windowed_post_params(sample, lower_bound, upper_bound, 4)


def post_three_month_params(sample, lower_bound, upper_bound):
    """Return in-range windows of 12 consecutive entries of ``sample``.

    Presumably the entries are weekly buckets, making a window roughly three
    months -- verify against the caller.
    """
    return _windowed_post_params(sample, lower_bound, upper_bound, 12)
| 108 | + |
| 109 | + |
def key_params(sample, lower_bound, upper_bound):
    """Keep the ``[key, count]`` pairs whose count is strictly in range.

    ``sample`` is an iterable of ``(key, count)`` pairs; the result holds a
    new ``[key, count]`` list for every pair with
    ``lower_bound < count < upper_bound``, in the original order.
    """
    return [
        [name, amount]
        for name, amount in sample
        if lower_bound < amount < upper_bound
    ]
| 116 | + |
def serialize_q1(post_weeks):
    """Write q1 parameters from ``(week, count)`` pairs.

    Each week goes on its own line of ``params/q1.1.params`` and the matching
    count on the same line of ``params/q1.counts.params``. The ``params``
    directory must already exist.
    """
    # `with` guarantees both files are flushed and closed, even on error.
    with open('params/q1.1.params', 'w+') as f1, \
            open('params/q1.counts.params', 'w+') as fcounts:
        for week, count in post_weeks:
            f1.write(str(week) + "\n")
            fcounts.write(str(count) + "\n")
| 123 | + |
def serialize_q2(country_sets, post_day_ranges):
    """Write a random subset of (country set, day range) pairs for q2.

    ``country_sets`` holds ``(countries, count)`` pairs and
    ``post_day_ranges`` holds ``([start, end], count)`` pairs. For each
    selected combination the range start, the range end and a ``ctry_name``
    filter expression are written to ``params/q2.1.params``,
    ``params/q2.2.params`` and ``params/q2.3.params``; both counts go to
    ``params/q2.counts.params``. The ``params`` directory must already exist.
    """
    # Generate Q2 params
    with open('params/q2.1.params', 'w+') as f1, \
            open('params/q2.2.params', 'w+') as f2, \
            open('params/q2.3.params', 'w+') as f3, \
            open('params/q2.counts.params', 'w+') as fcounts:
        # Fixed seed: the same inputs always select the same combinations.
        random.seed(1988+2)
        # randint is inclusive at both ends, so each combination is kept
        # with probability 1 / (sample_span + 1).
        sample_span = len(country_sets) + len(post_day_ranges)
        for country_set, count_country in country_sets:
            for day_range, count_post in post_day_ranges:
                if random.randint(0, sample_span) == 0:
                    f1.write(str(day_range[0])+"\n")
                    f2.write(str(day_range[1])+"\n")
                    f3.write("ctry_name = '"+"' or ctry_name = '".join(country_set)+"'\n")
                    fcounts.write(str(count_post)+"|"+str(count_country)+"\n")
| 138 | + |
def serialize_q3(post_months):
    """Write q3 parameters for every unordered pair of month ranges.

    ``post_months`` holds ``([start, end], count)`` pairs. For each pair of
    entries (earlier, later) the start of the earlier range is written to
    ``params/q3.1.params``, the start of the later range to
    ``params/q3.2.params`` and ``count_a|count_b`` to
    ``params/q3.counts.params``. The ``params`` directory must already exist.
    """
    # Generate Q3 params
    with open('params/q3.1.params', 'w+') as f1, \
            open('params/q3.2.params', 'w+') as f2, \
            open('params/q3.counts.params', 'w+') as fcounts:
        for ix, (week_range_a, count_a) in enumerate(post_months):
            # Pair each entry only with later ones so no pair repeats.
            for week_range_b, count_b in post_months[ix + 1:]:
                f1.write(str(week_range_a[0])+"\n")
                f2.write(str(week_range_b[0])+"\n")
                fcounts.write(str(count_a)+"|"+str(count_b)+"\n")
| 150 | + |
def serialize_q4(tagclasses, countries):
    """Write one q4 row for every (tag class, country) combination.

    Both arguments hold ``(name, count)`` pairs; the names form the row and
    the counts are forwarded to ``ParamsWriter.append``.
    """
    out = ParamsWriter("q4", 2)
    for tag_name, tag_total in tagclasses:
        for country_name, country_total in countries:
            out.append([tag_name, country_name], [tag_total, country_total])
| 156 | + |
def serialize_q5(countries):
    """Write one q5 row per ``(country, count)`` pair in ``countries``."""
    out = ParamsWriter("q5", 1)
    for country_name, country_total in countries:
        out.append([country_name], [country_total])
| 161 | + |
| 162 | + |
def serialize_q6(tags):
    """Write one q6 row per ``(tag, count)`` pair in ``tags``."""
    out = ParamsWriter("q6", 1)
    for tag_name, tag_total in tags:
        out.append([tag_name], [tag_total])
| 167 | + |
def serialize_q7(tags):
    """Write one q7 row per ``(tag, count)`` pair in ``tags``."""
    out = ParamsWriter("q7", 1)
    for tag_name, tag_total in tags:
        out.append([tag_name], [tag_total])
| 172 | + |
def serialize_q8(tags):
    """Write one q8 row per ``(tag, count)`` pair in ``tags``."""
    out = ParamsWriter("q8", 1)
    for tag_name, tag_total in tags:
        out.append([tag_name], [tag_total])
| 177 | + |
def serialize_q9(tagclasses):
    """Write one q9 row for every unordered pair of tag classes.

    ``tagclasses`` holds ``(tag class, count)`` pairs; each entry is paired
    with every entry that comes after it.
    """
    out = ParamsWriter("q9", 2)
    for pos, (first_class, first_total) in enumerate(tagclasses):
        for second_class, second_total in tagclasses[pos + 1:]:
            out.append([first_class, second_class], [first_total, second_total])
| 184 | + |
def serialize_q10(tags):
    """Write one q10 row per ``(tag, count)`` pair in ``tags``."""
    out = ParamsWriter("q10", 1)
    for tag_name, tag_total in tags:
        out.append([tag_name], [tag_total])
| 189 | + |
def serialize_q12(post_weeks):
    """Write q12 parameters from ``(week, count)`` pairs.

    Each week goes on its own line of ``params/q12.1.params`` and the
    matching count on the same line of ``params/q12.counts.params``. The
    ``params`` directory must already exist.
    """
    # `with` guarantees both files are flushed and closed, even on error.
    with open('params/q12.1.params', 'w+') as f1, \
            open('params/q12.counts.params', 'w+') as fcounts:
        for week, count in post_weeks:
            f1.write(str(week) + "\n")
            fcounts.write(str(count) + "\n")
| 196 | + |
def serialize_q13(countries):
    """Write one q13 row per ``(country, count)`` pair in ``countries``."""
    out = ParamsWriter("q13", 1)
    for country_name, country_total in countries:
        out.append([country_name], [country_total])
| 201 | + |
def serialize_q14(creationdates):
    """Write q14 parameters from ``(creation, count)`` pairs.

    ``creation`` is indexable; only its first element is written, one per
    line, to ``params/q14.1.params``. The matching count goes on the same
    line of ``params/q14.counts.params``. The ``params`` directory must
    already exist.
    """
    # `with` guarantees both files are flushed and closed, even on error.
    with open('params/q14.1.params', 'w+') as f1, \
            open('params/q14.counts.params', 'w+') as fcounts:
        for creation, count in creationdates:
            f1.write(str(creation[0]) + "\n")
            fcounts.write(str(count) + "\n")
| 208 | + |
def serialize_q15(countries):
    """Write one q15 row per ``(country, count)`` pair in ``countries``."""
    out = ParamsWriter("q15", 1)
    for country_name, country_total in countries:
        out.append([country_name], [country_total])
| 213 | + |
def serialize_q16(tagclasses, countries):
    """Write one q16 row for every (tag class, country) combination.

    Both arguments hold ``(name, count)`` pairs; the names form the row and
    the counts are forwarded to ``ParamsWriter.append``.
    """
    out = ParamsWriter("q16", 2)
    for tag_name, tag_total in tagclasses:
        for country_name, country_total in countries:
            out.append([tag_name, country_name], [tag_total, country_total])
| 219 | + |
def serialize_q17(countries):
    """Write one q17 row per ``(country, count)`` pair in ``countries``."""
    out = ParamsWriter("q17", 1)
    for country_name, country_total in countries:
        out.append([country_name], [country_total])
| 224 | + |
def serialize_q18(post_weeks):
    """Write q18 parameters from ``(week, count)`` pairs.

    Each week goes on its own line of ``params/q18.1.params`` and the
    matching count on the same line of ``params/q18.counts.params``. The
    ``params`` directory must already exist.
    """
    # `with` guarantees both files are flushed and closed, even on error.
    with open('params/q18.1.params', 'w+') as f1, \
            open('params/q18.counts.params', 'w+') as fcounts:
        for week, count in post_weeks:
            f1.write(str(week) + "\n")
            # The counts file was opened but left empty; fill it the same
            # way serialize_q1 and serialize_q12 do.
            fcounts.write(str(count) + "\n")
| 230 | + |
def serialize_q19(tagclasses):
    """Write one q19 row for every unordered pair of tag classes.

    ``tagclasses`` holds ``(tag class, count)`` pairs; each entry is paired
    with every entry that comes after it.
    """
    out = ParamsWriter("q19", 2)
    for pos, (first_class, first_total) in enumerate(tagclasses):
        for second_class, second_total in tagclasses[pos + 1:]:
            out.append([first_class, second_class], [first_total, second_total])
| 237 | + |
def serialize_q21(countries):
    """Write one q21 row per ``(country, count)`` pair in ``countries``."""
    out = ParamsWriter("q21", 1)
    for country_name, country_total in countries:
        out.append([country_name], [country_total])
| 242 | + |
def serialize_q22(countries):
    """Write one q22 row for every unordered pair of countries.

    ``countries`` holds ``(country, count)`` pairs; each entry is paired with
    every entry that comes after it.
    """
    out = ParamsWriter("q22", 2)
    for pos, (first_country, first_total) in enumerate(countries):
        for second_country, second_total in countries[pos + 1:]:
            out.append([first_country, second_country], [first_total, second_total])
| 249 | + |
def serialize_q23(countries):
    """Write one q23 row per ``(country, count)`` pair in ``countries``."""
    out = ParamsWriter("q23", 1)
    for country_name, country_total in countries:
        out.append([country_name], [country_total])
| 254 | + |
def serialize_q24(tagclasses):
    """Write one q24 row per ``(tag class, count)`` pair in ``tagclasses``."""
    out = ParamsWriter("q24", 1)
    for class_name, class_total in tagclasses:
        out.append([class_name], [class_total])
| 259 | + |
def main(argv=None):
    """Generate query substitution-parameter files from precomputed factors.

    ``argv`` defaults to ``sys.argv`` and must hold the program name, an
    input directory and an output argument. Files in the input directory
    whose names end in ``factors.txt`` or start with ``m0friendList`` are
    passed to ``readfactors.load``; the resulting country, tag and tag-class
    counts are filtered by count range and written by the ``serialize_q*``
    helpers.

    Returns 1 when too few arguments are given, 0 otherwise.
    """
    if argv is None:
        argv = sys.argv

    if len(argv) < 3:
        # Call form prints the same text under Python 2 and Python 3.
        print("arguments: <input dir> <output>")
        return 1

    indir = argv[1] + "/"
    # NOTE(review): argv[2] (<output>) is required but never used; the
    # ParamsWriter-based helpers write to the fixed relative directory
    # "substitution_parameters/".

    factor_files = []
    friends_files = []
    for entry in os.listdir(indir):
        if entry.endswith("factors.txt"):
            factor_files.append(indir + entry)
        if entry.startswith("m0friendList"):
            friends_files.append(indir + entry)

    # read precomputed counts from files
    (personFactors, countryFactors, tagFactors, tagClassFactors, nameFactors,
     givenNames, ts) = readfactors.load(factor_files, friends_files)

    # [country, value of factor "p"] pairs, largest first.
    country_sample = [[key, value.getValue("p")]
                      for key, value in countryFactors.values.iteritems()]
    country_sample.sort(key=lambda x: x[1], reverse=True)

    # Both lists are sorted in place, largest count first.
    tagclass_posts = tagClassFactors
    tagclass_posts.sort(key=lambda x: x[1], reverse=True)

    tag_posts = tagFactors
    tag_posts.sort(key=lambda x: x[1], reverse=True)

    # Most thresholds below are fractions of the summed tag counts.
    total_posts = sum(count for _, count in tag_posts)

    serialize_q4(key_params(tagclass_posts, total_posts/20, total_posts/10), key_params(country_sample, total_posts/120, total_posts/70))
    serialize_q5(key_params(country_sample, total_posts/200, total_posts/100))
    serialize_q6(key_params(tag_posts, total_posts/1300, total_posts/900))
    serialize_q7(key_params(tag_posts, total_posts/900, total_posts/600))
    serialize_q8(key_params(tag_posts, total_posts/600, total_posts/300))
    serialize_q9(key_params(tagclass_posts, 6000, 25000))
    serialize_q10(key_params(tag_posts, total_posts/900, total_posts/600))
    # q12, q14 and q18 are disabled: they need per-week post counts
    # (week_posts), which this function does not compute.
    # serialize_q12(post_date_right_open_range_params(week_posts, 0.3*total_posts, 0.6*total_posts))
    serialize_q13(key_params(country_sample, total_posts/200, total_posts/100))
    # serialize_q14(post_month_params(week_posts, post_lower_threshold*2, post_upper_threshold*2))
    serialize_q15(key_params(country_sample, total_posts/200, total_posts/100))
    serialize_q16(key_params(tagclass_posts, total_posts/30, total_posts/10), key_params(country_sample, total_posts/110, total_posts/70))
    serialize_q17(key_params(country_sample, total_posts/200, total_posts/100))
    # serialize_q18(post_date_right_open_range_params(week_posts, 0.3*total_posts, 0.6*total_posts))
    serialize_q19(key_params(tagclass_posts, total_posts/60, total_posts/10))
    serialize_q21(key_params(country_sample, total_posts/200, total_posts/100))
    serialize_q22(key_params(country_sample, total_posts/120, total_posts/40))
    serialize_q23(key_params(country_sample, total_posts/200, total_posts/100))
    serialize_q24(key_params(tagclass_posts, total_posts/140, total_posts/5))
    return 0


if __name__ == "__main__":
    # Propagate main()'s return value as the process exit status.
    sys.exit(main())
0 commit comments