Skip to content

Commit a685816

Browse files
committed
Added empirical degree distribution generator
Added dblp distribution dataset
1 parent c34bf5c commit a685816

24 files changed

+353
-65
lines changed

src/main/java/ldbc/snb/datagen/generator/CommentGenerator.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
import ldbc.snb.datagen.util.RandomGeneratorFarm;
1212
import ldbc.snb.datagen.vocabulary.SN;
1313

14+
import java.io.IOException;
1415
import java.util.ArrayList;
1516
import java.util.Iterator;
1617
import java.util.Properties;
@@ -33,7 +34,7 @@ public CommentGenerator(TextGenerator generator, LikeGenerator likeGenerator){
3334
this.comment_ = new Comment();
3435
}
3536

36-
public long createComments(RandomGeneratorFarm randomFarm, final Forum forum, final Post post, long numComments, long startId, PersonActivityExporter exporter){
37+
public long createComments(RandomGeneratorFarm randomFarm, final Forum forum, final Post post, long numComments, long startId, PersonActivityExporter exporter) throws IOException {
3738
long nextId = startId;
3839
ArrayList<Message> replyCandidates = new ArrayList<Message>();
3940
replyCandidates.add(post);

src/main/java/ldbc/snb/datagen/generator/LikeGenerator.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
import ldbc.snb.datagen.objects.Message;
1414
import ldbc.snb.datagen.serializer.PersonActivityExporter;
1515

16+
import java.io.IOException;
1617
import java.util.ArrayList;
1718
import java.util.Random;
1819

@@ -31,7 +32,7 @@ public LikeGenerator() {
3132
this.like = new Like();
3233
}
3334

34-
public void generateLikes(Random random, final Forum forum, final Message message, LikeType type, PersonActivityExporter exporter) {
35+
public void generateLikes(Random random, final Forum forum, final Message message, LikeType type, PersonActivityExporter exporter) throws IOException {
3536
int numMembers = forum.memberships().size();
3637
int numLikes = likesGenerator_.getValue(random);
3738
numLikes = numLikes >= numMembers ? numMembers : numLikes;

src/main/java/ldbc/snb/datagen/generator/PersonActivityGenerator.java

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212

1313
import org.apache.hadoop.mapreduce.Reducer.Context;
1414

15+
import java.io.IOException;
1516
import java.io.OutputStream;
1617
import java.util.ArrayList;
1718
import java.util.Random;
@@ -47,7 +48,7 @@ public PersonActivityGenerator( PersonActivitySerializer serializer, UpdateEvent
4748
exporter_ = new PersonActivityExporter(personActivitySerializer_, updateSerializer_, factorTable_);
4849
}
4950

50-
private void generateActivity( Person person, ArrayList<Person> block ) {
51+
private void generateActivity( Person person, ArrayList<Person> block ) throws IOException {
5152
try {
5253
factorTable_.extractFactors(person);
5354
generateWall(person, block);
@@ -65,7 +66,7 @@ public void reset() {
6566
personActivitySerializer_.reset();
6667
}
6768

68-
private void generateWall( Person person, ArrayList<Person> block ) {
69+
private void generateWall( Person person, ArrayList<Person> block ) throws IOException {
6970
// generate wall
7071
Forum wall = forumGenerator_.createWall(randomFarm_, forumId++, person);
7172
exporter_.export(wall);
@@ -83,7 +84,7 @@ private void generateWall( Person person, ArrayList<Person> block ) {
8384
messageId = flashmobPostGenerator_.createPosts(randomFarm_, wall, fakeMembers, numPostsPerGroup(randomFarm_, wall, DatagenParams.maxNumFlashmobPostPerMonth, DatagenParams.maxNumFriends), messageId, exporter_ );
8485
}
8586

86-
private void generateGroups( Person person, ArrayList<Person> block ) {
87+
private void generateGroups( Person person, ArrayList<Person> block ) throws IOException {
8788
// generate user created groups
8889
double moderatorProb = randomFarm_.get(RandomGeneratorFarm.Aspect.FORUM_MODERATOR).nextDouble();
8990
if (moderatorProb <= DatagenParams.groupModeratorProb) {
@@ -104,7 +105,7 @@ private void generateGroups( Person person, ArrayList<Person> block ) {
104105

105106
}
106107

107-
private void generateAlbums(Person person, ArrayList<Person> block ) {
108+
private void generateAlbums(Person person, ArrayList<Person> block ) throws IOException {
108109
// generate albums
109110
int numOfmonths = (int) Dictionaries.dates.numberOfMonths(person);
110111
int numPhotoAlbums = randomFarm_.get(RandomGeneratorFarm.Aspect.NUM_PHOTO_ALBUM).nextInt(DatagenParams.maxNumPhotoAlbumsPerMonth+1);
@@ -141,7 +142,7 @@ private int numPostsPerGroup( RandomGeneratorFarm randomFarm, Forum forum, int m
141142
return (numberPost * forum.memberships().size()) / maxMembersPerForum;
142143
}
143144

144-
public void generateActivityForBlock( int seed, ArrayList<Person> block, Context context ) {
145+
public void generateActivityForBlock( int seed, ArrayList<Person> block, Context context ) throws IOException {
145146
randomFarm_.resetRandomGenerators(seed);
146147
forumId = 0;
147148
messageId = 0;

src/main/java/ldbc/snb/datagen/generator/PhotoGenerator.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
import ldbc.snb.datagen.util.RandomGeneratorFarm;
1212
import ldbc.snb.datagen.vocabulary.SN;
1313

14+
import java.io.IOException;
1415
import java.util.ArrayList;
1516
import java.util.TreeSet;
1617

@@ -29,7 +30,7 @@ public PhotoGenerator(LikeGenerator likeGenerator) {
2930
this.likeGenerator_ = likeGenerator;
3031
this.photo_ = new Photo();
3132
}
32-
public long createPhotos(RandomGeneratorFarm randomFarm, final Forum album, final ArrayList<ForumMembership> memberships, long numPhotos, long startId, PersonActivityExporter exporter){
33+
public long createPhotos(RandomGeneratorFarm randomFarm, final Forum album, final ArrayList<ForumMembership> memberships, long numPhotos, long startId, PersonActivityExporter exporter) throws IOException {
3334
long nextId = startId;
3435
ArrayList<Photo> photos = new ArrayList<Photo>();
3536
int numPopularPlaces = randomFarm.get(RandomGeneratorFarm.Aspect.NUM_POPULAR).nextInt(DatagenParams.maxNumPopularPlaces + 1);

src/main/java/ldbc/snb/datagen/generator/PostGenerator.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@
4343
import ldbc.snb.datagen.util.RandomGeneratorFarm;
4444
import ldbc.snb.datagen.vocabulary.SN;
4545

46+
import java.io.IOException;
4647
import java.util.ArrayList;
4748
import java.util.Properties;
4849
import java.util.Random;
@@ -80,7 +81,7 @@ public void initialize() {
8081
}
8182

8283

83-
public long createPosts(RandomGeneratorFarm randomFarm, final Forum forum, final ArrayList<ForumMembership> memberships, long numPosts, long startId, PersonActivityExporter exporter){
84+
public long createPosts(RandomGeneratorFarm randomFarm, final Forum forum, final ArrayList<ForumMembership> memberships, long numPosts, long startId, PersonActivityExporter exporter) throws IOException {
8485
long postId = startId;
8586
Properties prop = new Properties();
8687
prop.setProperty("type","post");
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
package ldbc.snb.datagen.generator.distribution;
2+
3+
import javafx.util.Pair;
4+
import ldbc.snb.datagen.generator.DatagenParams;
5+
import ldbc.snb.datagen.generator.distribution.utils.Bucket;
6+
import ldbc.snb.datagen.generator.distribution.utils.BucketedDistribution;
7+
import org.apache.hadoop.conf.Configuration;
8+
9+
import java.io.BufferedReader;
10+
import java.io.IOException;
11+
import java.io.InputStreamReader;
12+
import java.util.ArrayList;
13+
14+
/**
15+
* Created by aprat on 27/06/16.
16+
*/
17+
public class EmpiricalDistribution extends BucketedDistribution{
18+
19+
String fileName = null;
20+
21+
@Override
22+
public ArrayList<Bucket> getBuckets(Configuration conf) {
23+
fileName = conf.get("ldbc.snb.datagen.generator.distribution.EmpiricalDistribution.fileName");
24+
ArrayList<Pair<Integer,Integer>> histogram = new ArrayList<Pair<Integer,Integer>>();
25+
try {
26+
BufferedReader reader = new BufferedReader(
27+
new InputStreamReader(getClass().getResourceAsStream(fileName), "UTF-8"));
28+
String line;
29+
while ((line = reader.readLine()) != null) {
30+
String data[] = line.split(" ");
31+
histogram.add(new Pair<Integer,Integer>(Integer.parseInt(data[0]),Integer.parseInt(data[1])));
32+
}
33+
reader.close();
34+
return Bucket.bucketizeHistogram(histogram,1000);
35+
} catch (IOException e) {
36+
e.printStackTrace();
37+
} catch (Exception e) {
38+
e.printStackTrace();
39+
}
40+
return null;
41+
}
42+
}

src/main/java/ldbc/snb/datagen/generator/distribution/FacebookDegreeDistribution.java

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@ public FacebookDegreeDistribution() {
7070
}
7171

7272
@Override
73-
public ArrayList<Bucket> getBuckets() {
73+
public ArrayList<Bucket> getBuckets(Configuration conf) {
7474
mean_ = (int) Math.round(Math.pow(DatagenParams.numPersons, (0.512 - 0.028 * Math.log10(DatagenParams.numPersons))));
7575
System.out.println("Mean = " + mean_);
7676
buckets_ = new ArrayList<Bucket>();
@@ -86,7 +86,7 @@ public void loadFBBuckets() {
8686
String line;
8787
while ((line = fbDataReader.readLine()) != null) {
8888
String data[] = line.split(" ");
89-
buckets_.add(new Bucket(Double.parseDouble(data[0]), Double.parseDouble(data[1])));
89+
buckets_.add(new Bucket(Integer.parseInt(data[0]), Integer.parseInt(data[1])));
9090
}
9191
fbDataReader.close();
9292
} catch (IOException e) {
@@ -95,7 +95,7 @@ public void loadFBBuckets() {
9595
}
9696

9797
public void rebuildBucketRange() {
98-
double newMin, newMax;
98+
int newMin, newMax;
9999
for (int i = 0; i < buckets_.size(); i++) {
100100
newMin = buckets_.get(i).min() * mean_ / FB_MEAN_;
101101
newMax = buckets_.get(i).max() * mean_ / FB_MEAN_;
Lines changed: 47 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1,59 +1,84 @@
11
package ldbc.snb.datagen.generator.distribution.utils;
22

3+
import javafx.util.Pair;
4+
import ldbc.snb.datagen.generator.DatagenParams;
5+
36
import java.util.ArrayList;
47

58
/**
69
* Created by aprat on 3/03/15.
710
*/
811
public class Bucket {
912

10-
public static ArrayList<Bucket> bucketizeHistogram(ArrayList<Double> histogram, int num_buckets) {
13+
public static ArrayList<Bucket> bucketizeHistogram(ArrayList<Pair<Integer,Integer>> histogram, int num_buckets) {
14+
1115

1216
ArrayList<Bucket> buckets = new ArrayList<Bucket>();
13-
Double population = 0.0;
14-
for( Double d : histogram ) {
15-
population+=d;
17+
int population = 0;
18+
int num_edges = 0;
19+
for( Pair<Integer,Integer> i : histogram ) {
20+
population+=i.getValue();
21+
num_edges+=i.getValue()*i.getKey();
1622
}
17-
double percentile = 1 / (double)num_buckets;
23+
24+
int avgDegreeAt1B = 200;
25+
int avgDegree = num_edges/population;
26+
double aCoeff = Math.log(avgDegreeAt1B) / Math.log(1000000000) ;
27+
double bCoeff = (aCoeff - (Math.log(avgDegree) / Math.log(population))) / Math.log10(population);
28+
29+
int target_mean = (int) Math.round(Math.pow(DatagenParams.numPersons, (aCoeff - bCoeff * Math.log10(DatagenParams.numPersons))));
30+
System.out.println("Distribution mean degree: "+avgDegree+" Distribution target mean "+target_mean);
31+
int bucket_size = (int)(Math.ceil(population / (double)(num_buckets)));
1832
int current_histogram_index = 0;
19-
int current_histogram_counter = histogram.get(current_histogram_index).intValue();
20-
for( int i = 0; i < num_buckets; ++i ) {
21-
double min = population;
22-
double max = 0;
23-
for( int j = 0; j/(double)population < percentile; ++j ) {
24-
min = min > (current_histogram_index+1) ? (current_histogram_index+1) : min;
25-
max = max < (current_histogram_index+1) ? (current_histogram_index+1) : max;
26-
if(--current_histogram_counter <= 0) {
33+
int current_histogram_left = histogram.get(current_histogram_index).getValue();
34+
for( int i = 0; i < num_buckets && (current_histogram_index < histogram.size()); ++i ) {
35+
int current_bucket_count = 0;
36+
int min = population;
37+
int max = 0;
38+
while(current_bucket_count < bucket_size && current_histogram_index < histogram.size()) {
39+
int degree = histogram.get(current_histogram_index).getKey();
40+
min = degree < min ? degree : min;
41+
max = degree > max ? degree : max;
42+
if( (bucket_size - current_bucket_count) > current_histogram_left) {
43+
current_bucket_count += current_histogram_left;
2744
current_histogram_index++;
28-
current_histogram_counter = histogram.get(current_histogram_index).intValue();
45+
if(current_histogram_index < histogram.size()) {
46+
current_histogram_left = histogram.get(current_histogram_index).getValue();
47+
}
48+
} else {
49+
current_histogram_left -= (bucket_size - current_bucket_count);
50+
current_bucket_count = bucket_size;
2951
}
3052
}
31-
buckets.add(new Bucket(min, max));
53+
min = (int)(min*target_mean/(double)avgDegree);
54+
max = (int)(max*target_mean/(double)avgDegree);
55+
buckets.add(new Bucket(min,max));
56+
//System.out.println(i+" "+min+" "+max);
3257
}
3358
return buckets;
3459
}
3560

36-
double min_;
37-
double max_;
61+
int min_;
62+
int max_;
3863

39-
public Bucket(double min, double max) {
64+
public Bucket(int min, int max) {
4065
this.min_ = min;
4166
this.max_ = max;
4267
}
4368

44-
public double min() {
69+
public int min() {
4570
return min_;
4671
}
4772

48-
public void min(double min) {
73+
public void min(int min) {
4974
min_ = min;
5075
}
5176

52-
public double max() {
77+
public int max() {
5378
return max_;
5479
}
5580

56-
public void max(double max) {
81+
public void max(int max) {
5782
max_ = max;
5883
}
5984
}

src/main/java/ldbc/snb/datagen/generator/distribution/utils/BucketedDistribution.java

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -15,10 +15,10 @@ public abstract class BucketedDistribution implements DegreeDistribution {
1515
private ArrayList<Random> randomDegree_;
1616
private Random randomPercentile_;
1717

18-
public abstract ArrayList<Bucket> getBuckets();
18+
public abstract ArrayList<Bucket> getBuckets(Configuration conf);
1919

2020
public void initialize( Configuration conf ) {
21-
buckets_ = this.getBuckets();
21+
buckets_ = this.getBuckets(conf);
2222
randomPercentile_ = new Random(0);
2323
randomDegree_ = new ArrayList<Random>();
2424
for (int i = 0; i < buckets_.size(); i++) {
@@ -36,8 +36,8 @@ public void reset(long seed) {
3636

3737
public long nextDegree() {
3838
int idx = randomPercentile_.nextInt(buckets_.size());
39-
int minRange = (int)(buckets_.get(idx).min());
40-
int maxRange = (int)(buckets_.get(idx).max());
39+
int minRange = (buckets_.get(idx).min());
40+
int maxRange = (buckets_.get(idx).max());
4141
if( maxRange < minRange ) {
4242
maxRange = minRange;
4343
}

src/main/java/ldbc/snb/datagen/hadoop/HadoopFileKeyChanger.java

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,10 +41,13 @@ public void setup( Context context ) {
4141
keySetter = (HadoopFileKeyChanger.KeySetter) Class.forName(className).newInstance();
4242
} catch(ClassNotFoundException e) {
4343
System.out.print(e.getMessage());
44+
e.printStackTrace();
4445
} catch(IllegalAccessException e) {
4546
System.out.print(e.getMessage());
47+
e.printStackTrace();
4648
} catch(InstantiationException e) {
4749
System.out.print(e.getMessage());
50+
e.printStackTrace();
4851
}
4952
}
5053

0 commit comments

Comments
 (0)