Skip to content

Commit de2547a

Browse files
committed
.
1 parent a4f1327 commit de2547a

File tree

2 files changed

+263
-0
lines changed

2 files changed

+263
-0
lines changed

paramgenerator/timeparameters.py

Lines changed: 263 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,263 @@
1+
import sys
2+
import getopt
3+
import math
4+
from readfactors import FactorCount
5+
from operator import itemgetter
6+
import itertools
7+
8+
# Number of trailing months considered for date parameters used in filters of
# the form "timestamp <= Date0".
LAST_MONTHS = 3

# Default start year; meant to be over-written with the value that comes from
# the data generator output.
START_YEAR = 2010
10+
11+
class MonthYearCount:
    """A count attached to a (month, year) pair."""

    def __init__(self, month, year, count):
        # Stored verbatim; no validation or conversion is performed.
        self.month, self.year, self.count = month, year, count
16+
17+
18+
class TimeParameter:
    """A date (year, month, day) paired with a duration value.

    The selection functions in this file pass ``None`` as the duration when
    only the date matters.
    """

    def __init__(self, year, month, day, duration):
        # Stored verbatim; no validation or conversion is performed.
        self.year, self.month, self.day = year, month, day
        self.duration = duration
24+
25+
def findTimeParameters(persons, factors, procedure, timestampSelection):
    """Compute the medians for *factors* and hand both to *timestampSelection*.

    persons            -- accepted for the caller's convenience; not used here.
    factors            -- list of per-person lists of MonthYearCount objects.
    procedure          -- procedure tag; "w" widens the "last months" window
                          to 12 months, anything else uses the default window.
    timestampSelection -- callable invoked as
                          ``timestampSelection(factors, medians)``.

    Returns whatever *timestampSelection* returns.
    """
    medians = (computeTimeMedians(factors, lastmonthcount=12)
               if procedure == "w"
               else computeTimeMedians(factors))
    return timestampSelection(factors, medians)
34+
35+
36+
def getMedian(data, sort_key, getEntireTuple = False):
    """Return the median of *data* ordered by *sort_key*.

    data           -- list of items (MonthYearCount objects or plain numbers).
    sort_key       -- callable mapping an item to the value it is ranked by.
    getEntireTuple -- when true, return the median *item* instead of its key
                      value; for an even number of items this is the upper
                      of the two middle items.

    Returns 0 (or ``MonthYearCount(0, 0, 0)`` when *getEntireTuple* is set)
    for empty input.  Otherwise returns the key value of the middle item, or
    the mean (as a float) of the two middle key values when the number of
    items is even.
    """
    if len(data) == 0:
        if getEntireTuple:
            return MonthYearCount(0, 0, 0)
        return 0

    srtd = sorted(data, key=sort_key)
    # Floor division keeps the index an integer on Python 3 as well.
    mid = len(srtd) // 2

    # The requested item is the same for odd and even lengths.
    if getEntireTuple:
        return srtd[mid]

    if len(srtd) % 2 == 0:
        return (sort_key(srtd[mid - 1]) + sort_key(srtd[mid])) / 2.0

    # Also covers a single element: always go through sort_key rather than
    # assuming the item has a ``count`` attribute (plain numbers do not).
    return sort_key(srtd[mid])
58+
59+
60+
def MonthYearToDate(myc, day):
    """Render ``myc.year``, ``myc.month`` and *day* as "year-month-day".

    The numbers are formatted with ``%d`` and are not zero-padded.
    """
    dateParts = (myc.year, myc.month, day)
    return "%d-%d-%d" % dateParts
62+
63+
64+
def getTimeParamsWithMedian(factors, medians):
    """Pick, per person, the median month and a duration scaled to the global median.

    Strategy: find the median of the given distribution, then increase the
    time interval until it matches the given parameter.

    factors -- list of per-person lists of MonthYearCount objects.
    medians -- 3-tuple ``(medianFirstMonth, medianLastMonth, median)``; only
               ``median`` is used here.

    Returns one TimeParameter per entry of *factors*.  The date is day 1 of
    the person's median month (by count) and the duration is 28 times the
    ratio between the larger and the smaller of the two medians.  When either
    median is zero, or the person's list is empty, the default
    ``TimeParameter(START_YEAR, 1, 1, 0)`` is used.
    """
    # Tuple parameters in the signature are Python 2-only syntax (PEP 3113);
    # unpacking here keeps positional callers working on both versions.
    (medianFirstMonth, medianLastMonth, median) = medians

    res = []
    for values in factors:
        currentMedian = getMedian(values, lambda myc: myc.count, True)
        if int(median) == 0 or int(currentMedian.count) == 0 or int(currentMedian.year) == 0:
            res.append(TimeParameter(START_YEAR, 1, 1, 0))
            continue

        # Always divide the larger count by the smaller one.
        if currentMedian.count > median:
            duration = int(28*currentMedian.count/median)
        else:
            duration = int(28*median/currentMedian.count)
        res.append(TimeParameter(currentMedian.year, currentMedian.month, 1, duration))
    return res
80+
81+
def getTimeParamsBeforeMedian(factors, medians):
    """Pick, per person, the date at which the running count reaches medianFirstMonth.

    Strategy: find the interval [0: median] with the sum of counts as close
    as possible to medianFirstMonth.

    factors -- list of per-person lists of MonthYearCount objects.
    medians -- 3-tuple ``(medianFirstMonth, medianLastMonth, median)``; only
               ``medianFirstMonth`` is used here.

    Returns exactly one TimeParameter (duration ``None``) per entry of
    *factors*.  Months are walked in chronological order; the month in which
    the running sum reaches the target is returned with a day interpolated
    inside a 28-day month (at least 1).  If the target is never reached, day
    28 of the last month is returned (month and year 0 for an empty list).
    """
    # Tuple parameters in the signature are Python 2-only syntax (PEP 3113);
    # unpacking here keeps positional callers working on both versions.
    (medianFirstMonth, medianLastMonth, median) = medians

    res = []
    for values in factors:
        ordered = sorted(values, key=lambda myc: (myc.year, myc.month))
        localsum = 0
        best = MonthYearCount(0, 0, 0)
        for myc in ordered:
            localsum += myc.count
            if localsum >= medianFirstMonth:
                # The overshoot, as a share of this month's count, is taken
                # off the end of the month.
                day = max(28 - 28*(localsum - medianFirstMonth)/myc.count, 1)
                res.append(TimeParameter(myc.year, myc.month, day, None))
                break
            best = myc
        else:
            # Target never reached.  Using for/else (instead of re-testing the
            # sum) also emits an entry for an empty list when the target is 0,
            # so the result stays aligned with *factors*.
            res.append(TimeParameter(best.year, best.month, 28, None))

    return res
102+
103+
def getTimeParamsAfterMedian(factors, medians):
    """Pick, per person, the date after which the remaining count is about medianLastMonth.

    Strategy: find the interval [median: end] with the sum of counts as close
    as possible to medianLastMonth.

    factors -- list of per-person lists of MonthYearCount objects.
    medians -- 3-tuple ``(medianFirstMonth, medianLastMonth, median)``; only
               ``medianLastMonth`` is used here.

    Returns exactly one TimeParameter (duration ``None``) per entry of
    *factors*.  Months are walked from the most recent backwards; the month in
    which the running sum reaches the target is returned with a day
    interpolated inside a 28-day month (at least 1).  If the target is never
    reached, day 1 of the earliest month is returned (month and year 0 for an
    empty list).
    """
    # Tuple parameters in the signature are Python 2-only syntax (PEP 3113);
    # unpacking here keeps positional callers working on both versions.
    (medianFirstMonth, medianLastMonth, median) = medians

    res = []
    for values in factors:
        newestFirst = sorted(values, key=lambda myc: (-myc.year, -myc.month))
        localsum = 0
        best = MonthYearCount(0, 0, 0)
        for myc in newestFirst:
            localsum += myc.count
            if localsum >= medianLastMonth:
                # The overshoot, as a share of this month's count, is skipped
                # from the start of the month.
                day = max(28 * (localsum - medianLastMonth)/myc.count, 1)
                res.append(TimeParameter(myc.year, myc.month, day, None))
                break
            best = myc
        else:
            # Target never reached.  Using for/else (instead of re-testing the
            # sum) also emits an entry for an empty list when the target is 0,
            # so the result stays aligned with *factors*.
            res.append(TimeParameter(best.year, best.month, 1, None))
    return res
122+
123+
def computeTimeMedians(factors, lastmonthcount = LAST_MONTHS):
    """Compute the three medians used by the timestamp selection functions.

    factors        -- list of per-person lists of MonthYearCount objects.
                      Each inner list is sorted IN PLACE by (year, month).
    lastmonthcount -- number of trailing entries that make up the "last
                      months" window.

    For every person three numbers are collected: the sum of counts in the
    last *lastmonthcount* entries, the sum of counts in the entries before
    that window (or in all entries when the list is shorter than the window),
    and the median count.  The result is the median of each of these across
    all persons, returned as ``(medianFirstMonth, medianLastMonth, median)``.
    """
    def identity(x):
        return x

    perPersonMedians = []
    tailSums = []
    headSums = []
    for values in factors:
        values.sort(key=lambda myc: (myc.year, myc.month))
        n = len(values)

        tailStart = max(n - lastmonthcount, 0)
        tailSums.append(sum(myc.count for myc in values[tailStart:n]))

        # Lists shorter than the window contribute all their entries here.
        headEnd = n - lastmonthcount
        if headEnd < 0:
            headEnd = n
        headSums.append(sum(myc.count for myc in values[0:headEnd]))

        perPersonMedians.append(getMedian(values, lambda myc: myc.count))

    median = getMedian(perPersonMedians, identity)
    medianLastMonth = getMedian(tailSums, identity)
    medianFirstMonth = getMedian(headSums, identity)

    return (medianFirstMonth, medianLastMonth, median)
145+
146+
def readTimeParams(persons, factorFiles, friendFiles):
    """Read per-person monthly counts and aggregate them over friends.

    persons     -- accepted for the caller's convenience; not used here.
    factorFiles -- paths of factor files.  The first line holds the number of
                   person rows that follow.  Each row is comma-separated:
                   field 0 is the person id, fields 8..44 are the 37 monthly
                   post counts and all remaining fields are group counts.
                   Rows for the same person are summed element-wise.
    friendFiles -- paths of friend files.  Each line is
                   ``person,friend,friend,...``; a later line for the same
                   person replaces an earlier one.

    Returns ``(friendsPostsCounts, ffPostCounts, ffGroupCount)``, each a dict
    mapping a person id to a list of 37 monthly totals:
      friendsPostsCounts -- post counts summed over the person's friends,
      ffPostCounts       -- friendsPostsCounts summed over the person's friends,
      ffGroupCount       -- friends' summed group counts, summed once more
                            over the person's friends.
    """
    offset = 8
    monthcount = 12*3 + 1

    postCounts = {}
    groupCounts = {}
    for inputFactorFile in factorFiles:
        with open(inputFactorFile, 'r') as f:
            personCount = int(f.readline())
            for _ in range(personCount):
                line = f.readline().split(",")
                person = int(line[0])
                # Real lists (not map objects) so the counts can be indexed
                # and re-read on Python 3 as well.
                localPostCounts = [int(x) for x in line[offset:offset+monthcount]]
                localGroupCounts = [int(x) for x in line[offset+monthcount:]]
                _addCounts(postCounts, person, localPostCounts)
                _addCounts(groupCounts, person, localGroupCounts)

    # Parse the friend lists once; they are needed for both aggregation levels.
    friendLists = []
    for inputFriendFile in friendFiles:
        with open(inputFriendFile, 'r') as f:
            for line in f:
                if not line.strip():
                    continue  # tolerate blank lines
                people = [int(x) for x in line.split(",")]
                friendLists.append((people[0], people[1:]))

    friendsPostsCounts = _sumOverFriends(friendLists, postCounts, monthcount)
    fGroupCount = _sumOverFriends(friendLists, groupCounts, monthcount)

    ffPostCounts = _sumOverFriends(friendLists, friendsPostsCounts, monthcount)
    ffGroupCount = _sumOverFriends(friendLists, fGroupCount, monthcount)

    return (friendsPostsCounts, ffPostCounts, ffGroupCount)


def _addCounts(totals, person, counts):
    """Add *counts* element-wise to ``totals[person]``, creating the entry if missing."""
    if person not in totals:
        totals[person] = counts
    else:
        totals[person] = [sum(x) for x in zip(totals[person], counts)]


def _sumOverFriends(friendLists, counts, width):
    """Return {person: element-wise sum of counts[friend] over the person's friends}.

    Every total starts as ``[0] * width``; friends missing from *counts* are
    skipped, and zip() truncates to the shorter of the two lists.
    """
    totals = {}
    for person, friends in friendLists:
        acc = [0] * width
        for friend in friends:
            if friend in counts:
                acc = [x + y for x, y in zip(acc, counts[friend])]
        totals[person] = acc
    return totals
209+
210+
211+
212+
def findTimeParams(input, factorFiles, friendFiles, startYear):
    """Compute time parameters for every query in *input*.

    input       -- dict mapping a query id to a sequence whose element 0 is
                   the list of person ids, element 1 the factor tag ("f",
                   "ff" or "ffg") and element 2 the timestamp selection
                   callable passed on to findTimeParameters.
    factorFiles -- factor file paths, forwarded to readTimeParams.
    friendFiles -- friend file paths, forwarded to readTimeParams.
    startYear   -- year that month index 0 of the count lists belongs to;
                   also becomes the module-wide START_YEAR.

    Returns a dict mapping each query id to the result of findTimeParameters.
    Raises KeyError for an unknown factor tag or a person without counts.
    """
    # Without the global declaration the assignment would only bind a local
    # name and the module-level default would never be over-written.
    global START_YEAR
    START_YEAR = startYear

    persons = []
    for queryId in input:
        persons += input[queryId][0]

    (fPostCount, ffPostCount, ffGroupCount) = readTimeParams(set(persons), factorFiles, friendFiles)

    mapParam = {
        "f" : fPostCount,
        "ff": ffPostCount,
        "ffg": ffGroupCount
    }

    output = {}
    for queryId in input:
        queryPersons = input[queryId][0]
        factorTag = input[queryId][1]
        timestampSelection = input[queryId][2]

        factors = mapParam[factorTag]
        mycFactors = []
        for person in queryPersons:
            # Month index i maps to calendar month i % 12 + 1 of year
            # startYear + i // 12; months without activity are dropped.
            myc = [MonthYearCount(month % 12 + 1, int(startYear + month // 12), count)
                   for (month, count) in enumerate(factors[person])
                   if count != 0]
            mycFactors.append(myc)

        output[queryId] = findTimeParameters(queryPersons, mycFactors, factorTag, timestampSelection)

    return output
245+
246+
def main(argv=None):
    """Command-line entry point: compute time parameters for a persons file.

    argv -- argument list (defaults to ``sys.argv``); ``argv[1]`` is the path
            of the input persons file.

    Returns 1 after printing a usage line when the path is missing, and None
    otherwise.
    """
    if argv is None:
        argv = sys.argv

    if len(argv) < 2:
        # Parenthesised single argument prints the same on Python 2 and 3.
        print("arguments: <input persons file>")
        return 1

    # NOTE(review): prepareTimedFactors and getFriendFriendPostByTime are not
    # defined or imported in the visible part of this file -- confirm where
    # they are expected to come from.
    with open(argv[1]) as f:
        factors = prepareTimedFactors(f, getFriendFriendPostByTime)

        medians = computeTimeMedians(factors)

        # The computed parameters are currently not printed or returned.
        getTimeParamsWithMedian(factors, medians)


if __name__ == "__main__":
    sys.exit(main())

paramgenerator/timeparameters.pyc

-10 KB
Binary file not shown.

0 commit comments

Comments
 (0)