Skip to content

Commit c3da811

Browse files
authored
Scale option (#32)
* Added scale option * Improved performance of Zipf distribution at the expense of accuracy * Fixed bug in serializers when numPartitions was larger than 1 * Added .travis.yml file * Improved tests
1 parent 40424bd commit c3da811

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

48 files changed

+3156
-520
lines changed

.travis.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@ sudo: required
33
language: java
44
jdk:
55
- openjdk7
6+
python:
7+
- "2.7"
68
install:
79
- curl -s http://www.eu.apache.org/dist/hadoop/common/hadoop-2.6.5/hadoop-2.6.5.tar.gz | tar -xz
810
before_script:

CHANGELOG.txt

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,12 @@
11

2+
v0.2.7 CHANGELOG
3+
4+
- Added the gscale option, which allows specifying the size of the generated dataset
5+
based on the graphalytics scaling metric.
6+
7+
- Fixed a bug which caused incorrect serialization of data when the numPartitions
8+
parameter was larger than 1.
9+
210
v0.2.6 CHANGELOG
311
Bi parameter generation
412
Added testing

pom.xml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66

77
<groupId>ldbc.snb.datagen</groupId>
88
<artifactId>ldbc_snb_datagen</artifactId>
9-
<version>0.2.5</version>
9+
<version>0.2.7</version>
1010
<packaging>jar</packaging>
1111

1212
<properties>

src/main/java/ldbc/snb/datagen/dictionary/Dictionaries.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ public static void loadDictionaries(Configuration conf) {
4949

5050
emails = new EmailDictionary();
5151

52-
places = new PlaceDictionary(DatagenParams.numPersons);
52+
places = new PlaceDictionary();
5353

5454
ips = new IPAddressDictionary( places,
5555
DatagenParams.probDiffIPinTravelSeason,

src/main/java/ldbc/snb/datagen/dictionary/PlaceDictionary.java

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -100,10 +100,9 @@ public int compareTo(PlaceZOrder obj) {
100100
}
101101

102102
/**
103-
* @param numUsers: The total number of users.
104103
* @brief Creator.
105104
*/
106-
public PlaceDictionary(int numUsers) {
105+
public PlaceDictionary() {
107106
this.cumulativeDistribution = new ArrayList<Float>();
108107
this.countryNames = new HashMap<String, Integer>();
109108
this.cityNames = new HashMap<String, Integer>();

src/main/java/ldbc/snb/datagen/generator/DatagenParams.java

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -38,16 +38,9 @@
3838

3939
package ldbc.snb.datagen.generator;
4040

41-
import ldbc.snb.datagen.util.ScaleFactor;
41+
import ldbc.snb.datagen.generator.distribution.DegreeDistribution;
42+
import ldbc.snb.datagen.generator.distribution.utils.Algorithms;
4243
import org.apache.hadoop.conf.Configuration;
43-
import org.w3c.dom.Document;
44-
import org.w3c.dom.Element;
45-
import org.w3c.dom.Node;
46-
import org.w3c.dom.NodeList;
47-
48-
import javax.xml.parsers.DocumentBuilder;
49-
import javax.xml.parsers.DocumentBuilderFactory;
50-
import java.util.TreeMap;
5144

5245
public class DatagenParams {
5346

@@ -217,12 +210,12 @@ public String toString() {
217210
public static final double alpha = 0.4;
218211

219212

220-
public static String outputDir = "./";
213+
public static String outputDir = "./";
221214
public static String hadoopDir = "./";
222215
public static String socialNetworkDir = "./";
223216
public static int numThreads = 1;
224217
public static int deltaTime = 10000;
225-
public static int numPersons = 10000;
218+
public static long numPersons = 10000;
226219
public static int startYear = 2010;
227220
public static int endYear = 2013;
228221
public static int numYears = 3;
@@ -307,7 +300,7 @@ public static void readConf( Configuration conf ) {
307300
}
308301

309302
try {
310-
numPersons = Integer.parseInt(conf.get("ldbc.snb.datagen.generator.numPersons"));
303+
numPersons = Long.parseLong(conf.get("ldbc.snb.datagen.generator.numPersons"));
311304
startYear = Integer.parseInt(conf.get("ldbc.snb.datagen.generator.startYear"));
312305
numYears = Integer.parseInt(conf.get("ldbc.snb.datagen.generator.numYears"));
313306
endYear = startYear + numYears;
@@ -320,6 +313,13 @@ public static void readConf( Configuration conf ) {
320313
outputDir = conf.get("ldbc.snb.datagen.serializer.outputDir");
321314
hadoopDir = outputDir+"/hadoop";
322315
socialNetworkDir = outputDir+"social_network";
316+
if(conf.get("ldbc.snb.datagen.generator.gscale") != null) {
317+
double scale = conf.getDouble("ldbc.snb.datagen.generator.gscale", 6.0);
318+
String degreeDistributionName = conf.get("ldbc.snb.datagen.generator.distribution.degreeDistribution");
319+
DegreeDistribution degreeDistribution = (DegreeDistribution)Class.forName(degreeDistributionName).newInstance();
320+
degreeDistribution.initialize(conf);
321+
numPersons = Algorithms.findNumPersonsFromGraphalyticsScale(degreeDistribution,scale);
322+
}
323323
System.out.println(" ... Num Persons " + numPersons);
324324
System.out.println(" ... Start Year " + startYear);
325325
System.out.println(" ... Num Years " + numYears);

src/main/java/ldbc/snb/datagen/generator/LDBCDatagen.java

Lines changed: 17 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -39,15 +39,12 @@
3939
import ldbc.snb.datagen.dictionary.Dictionaries;
4040
import ldbc.snb.datagen.hadoop.*;
4141
import ldbc.snb.datagen.objects.Person;
42-
import ldbc.snb.datagen.objects.similarity.GeoDistanceSimilarity;
4342
import ldbc.snb.datagen.util.ConfigParser;
4443
import ldbc.snb.datagen.vocabulary.SN;
4544
import org.apache.hadoop.conf.Configuration;
4645
import org.apache.hadoop.fs.FSDataInputStream;
4746
import org.apache.hadoop.fs.FileSystem;
4847
import org.apache.hadoop.fs.Path;
49-
import org.apache.hadoop.io.LongWritable;
50-
import org.apache.hadoop.io.Text;
5148

5249

5350
import java.io.File;
@@ -59,7 +56,7 @@
5956
public class LDBCDatagen {
6057

6158
static boolean initialized = false;
62-
public static synchronized void init (Configuration conf) {
59+
public static synchronized void initializeContext(Configuration conf) {
6360
if(!initialized) {
6461
DatagenParams.readConf(conf);
6562
Dictionaries.loadDictionaries(conf);
@@ -352,26 +349,31 @@ public int runGenerateJob(Configuration conf) throws Exception {
352349
return 0;
353350
}
354351

355-
public static void main(String[] args) /*throws Exception*/ {
352+
public static void prepareConfiguration(Configuration conf) throws Exception {
356353

357-
try {
358-
Configuration conf = ConfigParser.initialize();
359-
ConfigParser.readConfig(conf, args[0]);
360-
ConfigParser.readConfig(conf, LDBCDatagen.class.getResourceAsStream("/params.ini"));
361354
conf.set("ldbc.snb.datagen.serializer.hadoopDir",conf.get("ldbc.snb.datagen.serializer.outputDir")+"/hadoop");
362355
conf.set("ldbc.snb.datagen.serializer.socialNetworkDir",conf.get("ldbc.snb.datagen.serializer.outputDir")+"/social_network");
363-
ConfigParser.printConfig(conf);
364-
// conf.setBoolean("mapreduce.map.output.compress", true);
365-
// conf.setBoolean("mapreduce.output.fileoutputformat.compress", false);
366356

367357
// Deleting existing files
368358
FileSystem dfs = FileSystem.get(conf);
369359
dfs.delete(new Path(conf.get("ldbc.snb.datagen.serializer.hadoopDir")), true);
370360
dfs.delete(new Path(conf.get("ldbc.snb.datagen.serializer.socialNetworkDir")), true);
371361

372-
// Create input text file in HDFS
373-
LDBCDatagen datagen = new LDBCDatagen();
374-
LDBCDatagen.init(conf);
362+
ConfigParser.printConfig(conf);
363+
364+
}
365+
366+
public static void main(String[] args) /*throws Exception*/ {
367+
368+
try {
369+
Configuration conf = ConfigParser.initialize();
370+
ConfigParser.readConfig(conf, args[0]);
371+
ConfigParser.readConfig(conf, LDBCDatagen.class.getResourceAsStream("/params.ini"));
372+
373+
// Create input text file in HDFS
374+
LDBCDatagen.prepareConfiguration(conf);
375+
LDBCDatagen.initializeContext(conf);
376+
LDBCDatagen datagen = new LDBCDatagen();
375377
datagen.runGenerateJob(conf);
376378
}catch(AssertionError e ) {
377379
System.err.println("Error during execution");

src/main/java/ldbc/snb/datagen/generator/distribution/AltmannDistribution.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
*/
1313
public class AltmannDistribution extends CumulativeBasedDegreeDistribution {
1414

15-
private int POPULATION_ = 10000;
15+
private long POPULATION_ = 10000;
1616
private double normalization_factor_ = 0.0;
1717
private double ALPHA_ = 0.4577;
1818
private double BETA_ = 0.0162;

src/main/java/ldbc/snb/datagen/generator/distribution/CumulativeBasedDegreeDistribution.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
/**
99
* Created by aprat on 12/05/15.
1010
*/
11-
public abstract class CumulativeBasedDegreeDistribution implements DegreeDistribution{
11+
public abstract class CumulativeBasedDegreeDistribution extends DegreeDistribution {
1212

1313
public class CumulativeEntry {
1414
double prob_;

src/main/java/ldbc/snb/datagen/generator/distribution/DegreeDistribution.java

Lines changed: 8 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -5,22 +5,21 @@
55
*/
66
package ldbc.snb.datagen.generator.distribution;
77

8-
import ldbc.snb.datagen.generator.distribution.utils.Bucket;
9-
import ldbc.snb.datagen.generator.distribution.utils.BucketedDistribution;
108
import org.apache.hadoop.conf.Configuration;
119

12-
import java.util.ArrayList;
13-
import java.util.Random;
14-
1510
/**
1611
*
1712
* @author aprat
1813
*/
19-
public interface DegreeDistribution {
14+
public abstract class DegreeDistribution {
15+
16+
public abstract void initialize( Configuration conf );
2017

21-
public void initialize( Configuration conf );
18+
public abstract void reset (long seed);
2219

23-
public void reset (long seed);
20+
public abstract long nextDegree();
2421

25-
public long nextDegree();
22+
public double mean(long numPersons) {
23+
return -1;
24+
}
2625
}

0 commit comments

Comments
 (0)