Skip to content

Commit 0fb7be9

Browse files
committed
2 parents 495553e + b24b1e0 commit 0fb7be9

File tree

5 files changed

+59
-33
lines changed

5 files changed

+59
-33
lines changed

paramgenerator/generateparams.py

Lines changed: 16 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -32,9 +32,10 @@ def findNameParameters(names, amount = 100):
3232
while counts[mid] - counts[i] < 0.1 * counts[mid]:
3333
res.extend([name for name in hist[counts[i]]])
3434
i -= 1
35-
3635
return res
3736

37+
38+
3839
class CSVSerializer:
3940
def __init__(self):
4041
self.handlers = []
@@ -49,7 +50,7 @@ def registerHandler(self, handler, inputParams, header):
4950
self.inputs.append(inputParams)
5051

5152
def writeCSV(self):
52-
output = codecs.open(self.outputFile, "w", encoding="utf-8")
53+
output = codecs.open( self.outputFile, "w",encoding="utf-8")
5354

5455
if len(self.inputs) == 0:
5556
return
@@ -65,8 +66,7 @@ def writeCSV(self):
6566
handler = self.handlers[j]
6667
data = self.inputs[j][i]
6768
csvLine.append(handler(data))
68-
69-
output.write("|".join(csvLine))
69+
output.write('|'.join([s for s in csvLine]))
7070
output.write("\n")
7171
output.close()
7272

@@ -75,6 +75,7 @@ def handlePersonParam(person):
7575
#return {"PersonID": person, "PersonURI":(PERSON_PREFIX+str("%020d"%person))}
7676

7777
def handleTimeParam(timeParam):
78+
#print timeParam.year
7879
#print timeParam.year
7980
res = str(timegm(date(year=int(timeParam.year),
8081
month=int(timeParam.month), day=int(timeParam.day)).timetuple())*1000)
@@ -141,7 +142,7 @@ def main(argv=None):
141142
friendsFiles.append(indir+file)
142143

143144
# read precomputed counts from files
144-
(personFactors, countryFactors, tagFactors, tagClassFactors, nameFactors, ts) = readfactors.load(factorFiles, friendsFiles)
145+
(personFactors, countryFactors, tagFactors, tagClassFactors, nameFactors, givenNames, ts) = readfactors.load(factorFiles, friendsFiles)
145146

146147
# find person parameters
147148
print "find parameter bindings for Persons"
@@ -233,12 +234,16 @@ def main(argv=None):
233234
HS.append((HS0, HS1))
234235

235236
# Query 1 takes first name as a parameter
236-
nameParams = findNameParameters(nameFactors)# discoverparams.generate(nameFactors)
237-
# if there are fewer first names than person parameters, repeat some of the names
238-
if len(nameParams) < len(selectedPersonParams[2]):
239-
oldlen = len(nameParams)
240-
newlen = len(selectedPersonParams[2])
241-
nameParams.extend([nameParams[random.randint(0, oldlen-1)] for j in range(newlen-oldlen)])
237+
#nameParams = findNameParameters(nameFactors)# discoverparams.generate(nameFactors)
238+
## if there are fewer first names than person parameters, repeat some of the names
239+
#if len(nameParams) < len(selectedPersonParams[2]):
240+
# oldlen = len(nameParams)
241+
# newlen = len(selectedPersonParams[2])
242+
# nameParams.extend([nameParams[random.randint(0, oldlen-1)] for j in range(newlen-oldlen)])
243+
nameParams = []
244+
for person in selectedPersonParams[1]:
245+
n = givenNames.getValue(person)
246+
nameParams.append(n)
242247

243248
# serialize all the parameters as CSV
244249
csvWriters = {}

paramgenerator/readfactors.py

Lines changed: 22 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -41,12 +41,23 @@ def getValue(self, person, factor):
4141
def addValue(self, person, factor, value):
4242
self.values[person].addValue(factor, value)
4343

44+
class NameParameter:
    """Mapping from a person to a single name value.

    Values are kept in the ``values`` dict, keyed by person. Persons passed
    to the constructor start with the placeholder value 0 until
    ``setValue`` stores something for them.
    """

    def __init__(self, persons=None):
        """Create the store, optionally pre-registering ``persons``.

        persons -- optional iterable of person keys; each is initialised
                   to 0. Omitting it (or passing None) gives an empty store.
        """
        # A None sentinel replaces the former ``persons=[]`` default so that
        # no mutable list object is shared between calls.
        self.values = {}
        if persons is not None:
            for p in persons:
                self.values[p] = 0

    def setValue(self, person, value):
        """Store ``value`` for ``person``, overwriting any earlier entry."""
        self.values[person] = value

    def getValue(self, person):
        """Return the value stored for ``person``.

        Raises KeyError when nothing has been stored for ``person``.
        """
        return self.values[person]
4455

4556
def load(factorFiles, friendFiles):
4657
print "loading input for parameter generation"
4758
results = Factors()
4859
countries = Factors()
49-
60+
givenNames = NameParameter()
5061

5162
tagClasses = {}
5263
tags = {}
@@ -62,13 +73,15 @@ def load(factorFiles, friendFiles):
6273
person = int(line[0])
6374
if not results.existParam(person):
6475
results.addNewParam(person)
65-
results.addValue(person, "f", int(line[1]))
66-
results.addValue(person, "p", int(line[2]))
67-
results.addValue(person, "pl", int(line[3]))
68-
results.addValue(person, "pt", int(line[4]))
69-
results.addValue(person, "g", int(line[5]))
70-
results.addValue(person, "w", int(line[6]))
71-
results.addValue(person, "pr", int(line[7]))
76+
name = line[1]
77+
givenNames.setValue(person, name)
78+
results.addValue(person, "f", int(line[2]))
79+
results.addValue(person, "p", int(line[3]))
80+
results.addValue(person, "pl", int(line[4]))
81+
results.addValue(person, "pt", int(line[5]))
82+
results.addValue(person, "g", int(line[6]))
83+
results.addValue(person, "w", int(line[7]))
84+
results.addValue(person, "pr", int(line[8]))
7285

7386
countryCount = int(f.readline())
7487
for i in range(countryCount):
@@ -91,11 +104,6 @@ def load(factorFiles, friendFiles):
91104
line = f.readline()
92105
count = line[1+line.rfind(","):]
93106
name = line[:line.rfind(",")]
94-
try:
95-
name.decode('ascii')
96-
except UnicodeEncodeError:
97-
continue
98-
99107
if not name in tags:
100108
tags[name] = 0
101109
tags[name] += int(count)
@@ -104,11 +112,6 @@ def load(factorFiles, friendFiles):
104112
for i in range(nameCount):
105113
line = f.readline().split(",")
106114
name = line[0]
107-
try:
108-
name.decode('ascii')
109-
except UnicodeEncodeError:
110-
continue
111-
112115
if not name in names:
113116
names[name] = 0
114117
names[name] += int(line[1])
@@ -120,7 +123,7 @@ def load(factorFiles, friendFiles):
120123

121124
loadFriends(friendFiles, results)
122125

123-
return (results, countries, tags.items(), tagClasses.items(), names.items(), timestamp)
126+
return (results, countries, tags.items(), tagClasses.items(), names.items(), givenNames,timestamp)
124127

125128
def loadFriends(friendFiles, factors):
126129

paramgenerator/timeparameters.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
import sys
22
import getopt
33
import math
4-
import random
54
from readfactors import FactorCount
65
from operator import itemgetter
76
import itertools
@@ -68,7 +67,7 @@ def getTimeParamsWithMedian(factors, (medianFirstMonth, medianLastMonth, median)
6867
for values in factors:
6968
input = sorted(values,key=lambda myc: (myc.year, myc.month))
7069
currentMedian = getMedian(values,lambda myc: myc.count, True)
71-
if int(median) == 0 or int(currentMedian.count) == 0:
70+
if int(median) == 0 or int(currentMedian.count) == 0 or int(currentMedian.year) == 0:
7271
res.append(TimeParameter(START_YEAR,1,1,0))
7372
continue
7473
if currentMedian.count > median:

src/main/java/ldbc/socialnet/dbgen/dictionary/NamesDictionary.java

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -219,5 +219,17 @@ public String getRandomGivenName(Random random, int locationId, boolean isMale,
219219

220220
return name;
221221
}
222+
223+
/**
224+
* return a given name which is the median of topN for a given location/gender/year
225+
* we use it for parameter generation
226+
*/
227+
public String getMedianGivenName(int locationId, boolean isMale, int birthYear){
228+
int period = 0;
229+
Vector<HashMap<Integer, Vector<String>>> target = (isMale) ? givenNamesByLocationsMale : givenNamesByLocationsFemale;
230+
int size = target.get(period).get(locationId).size();
231+
String name = target.get(period).get(locationId).get(size/2);
232+
return name;
233+
}
222234
}
223235

src/main/java/ldbc/socialnet/dbgen/generator/ScalableGenerator.java

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -252,6 +252,7 @@ public enum OrganisationType {
252252
private HashMap<Integer, Integer> tagClassCount;
253253
private HashMap<String, Integer> firstNameCount;
254254
private HashMap<Integer, Integer> tagNameCount;
255+
private HashMap<Long, String> medianFirstName;
255256
// For blocking
256257
private static final int reducerShift[] = { 26, 8, 1 };
257258

@@ -429,6 +430,7 @@ public ScalableGenerator(int threadId, Configuration conf ){
429430
this.postsPerCountry = new HashMap<Integer, Integer>();
430431
this.tagClassCount = new HashMap<Integer, Integer>();
431432
this.firstNameCount = new HashMap<String, Integer>();
433+
this.medianFirstName = new HashMap<Long, String>();
432434
this.tagNameCount = new HashMap<Integer, Integer>();
433435
if (threadId != -1){
434436
outUserProfile = "mr" + threadId + "_" + outUserProfileName;
@@ -793,6 +795,9 @@ public void generateUserActivity( ReducedUserProfile userProfile, Reducer<MapRed
793795
dataExporter.export(userInfo);
794796
int nameCount = firstNameCount.containsKey(extraInfo.getFirstName())? firstNameCount.get(extraInfo.getFirstName()):0;
795797
firstNameCount.put(extraInfo.getFirstName(), nameCount+1);
798+
String medianName = namesDictionary.getMedianGivenName(userProfile.getLocationId(),
799+
userProfile.getGender()==1, dateTimeGenerator.getBirthYear(userProfile.getBirthDay()));
800+
medianFirstName.put(userProfile.getAccountId(), medianName);
796801
long init = System.currentTimeMillis();
797802
if(conf.getBoolean("activity",true)) {
798803
Group wall = generateWall(userInfo);
@@ -1570,7 +1575,9 @@ private void writeFactorTable(){
15701575
// correct the group counts
15711576
//count.numberOfGroups += count.numberOfFriends;
15721577
StringBuffer strbuf = new StringBuffer();
1573-
strbuf.append(c.getKey()); strbuf.append(",");
1578+
strbuf.append(c.getKey()); strbuf.append(",");
1579+
String name = medianFirstName.get(c.getKey());
1580+
strbuf.append(name); strbuf.append(",");
15741581
strbuf.append(count.numberOfFriends); strbuf.append(",");
15751582
strbuf.append(count.numberOfPosts); strbuf.append(",");
15761583
strbuf.append(count.numberOfLikes); strbuf.append(",");

0 commit comments

Comments
 (0)