import org.apache.hadoop.io.Text;

import java.io.OutputStream;
+ import java.util.ArrayList;
import java.util.Properties;

public class LDBCDatagen {
@@ -70,39 +71,45 @@ private void printProgress(String message) {
public int runGenerateJob(Configuration conf) throws Exception {

- String personsFileName1 = conf.get("ldbc.snb.datagen.serializer.hadoopDir") + "/persons1";
- String personsFileName2 = conf.get("ldbc.snb.datagen.serializer.hadoopDir") + "/persons2";
+ String hadoopPrefix = conf.get("ldbc.snb.datagen.serializer.hadoopDir");

FileSystem fs = FileSystem.get(conf);

long start = System.currentTimeMillis();
printProgress("Starting: Person generation");
long startPerson = System.currentTimeMillis();
HadoopPersonGenerator personGenerator = new HadoopPersonGenerator(conf);
- personGenerator.run(personsFileName1, "ldbc.snb.datagen.hadoop.UniversityKeySetter");
+ personGenerator.run(hadoopPrefix + "/persons", "ldbc.snb.datagen.hadoop.UniversityKeySetter");
long endPerson = System.currentTimeMillis();

-
printProgress("Creating university location correlated edges");
long startUniversity = System.currentTimeMillis();
- HadoopKnowsGenerator knowsGenerator = new HadoopKnowsGenerator(conf,null, "ldbc.snb.datagen.hadoop.InterestKeySetter", 0.45f);
- knowsGenerator.run(personsFileName1, personsFileName2);
- fs.delete(new Path(personsFileName1), true);
+ HadoopKnowsGenerator knowsGenerator = new HadoopKnowsGenerator(conf, "ldbc.snb.datagen.hadoop.UniversityKeySetter", "ldbc.snb.datagen.hadoop.RandomKeySetter", 0.45f);
+ knowsGenerator.run(hadoopPrefix + "/persons", hadoopPrefix + "/universityEdges");
long endUniversity = System.currentTimeMillis();

printProgress("Creating main interest correlated edges");
long startInterest = System.currentTimeMillis();
- knowsGenerator = new HadoopKnowsGenerator(conf,null, "ldbc.snb.datagen.hadoop.RandomKeySetter", 0.90f);
- knowsGenerator.run(personsFileName2, personsFileName1);
- fs.delete(new Path(personsFileName2), true);
+ knowsGenerator = new HadoopKnowsGenerator(conf, "ldbc.snb.datagen.hadoop.InterestKeySetter", "ldbc.snb.datagen.hadoop.RandomKeySetter", 0.45f);
+ knowsGenerator.run(hadoopPrefix + "/persons", hadoopPrefix + "/interestEdges");
long endInterest = System.currentTimeMillis();

printProgress("Creating random correlated edges");
long startRandom = System.currentTimeMillis();
- knowsGenerator = new HadoopKnowsGenerator(conf,null, "ldbc.snb.datagen.hadoop.RandomKeySetter", 1.0f);
- knowsGenerator.run(personsFileName1, personsFileName2);
- fs.delete(new Path(personsFileName1), true);
+ knowsGenerator = new HadoopKnowsGenerator(conf, "ldbc.snb.datagen.hadoop.RandomKeySetter", "ldbc.snb.datagen.hadoop.RandomKeySetter", 0.1f);
+ knowsGenerator.run(hadoopPrefix + "/persons", hadoopPrefix + "/randomEdges");
long endRandom = System.currentTimeMillis();

+
+ fs.delete(new Path(DatagenParams.hadoopDir + "/persons"), true);
+ printProgress("Merging the different edge files");
+ ArrayList<String> edgeFileNames = new ArrayList<String>();
+ edgeFileNames.add(hadoopPrefix + "/universityEdges");
+ edgeFileNames.add(hadoopPrefix + "/interestEdges");
+ edgeFileNames.add(hadoopPrefix + "/randomEdges");
+ long startMerge = System.currentTimeMillis();
+ HadoopMergeFriendshipFiles merger = new HadoopMergeFriendshipFiles(conf, "ldbc.snb.datagen.hadoop.RandomKeySetter");
+ merger.run(hadoopPrefix + "/mergedPersons", edgeFileNames);
+ long endMerge = System.currentTimeMillis();
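The net effect of this hunk: instead of ping-ponging between persons1 and persons2, each knows-generation pass now reads the same persons file and writes its own edge file, and the new HadoopMergeFriendshipFiles job folds them into mergedPersons. A condensed sketch of that flow, reusing only the class names, key setters and paths visible in the diff (an illustration, not a drop-in snippet):

    String prefix = conf.get("ldbc.snb.datagen.serializer.hadoopDir");
    ArrayList<String> edgeFiles = new ArrayList<String>();
    // One pass per correlation dimension; each writes a separate edge file.
    new HadoopKnowsGenerator(conf, "ldbc.snb.datagen.hadoop.UniversityKeySetter",
            "ldbc.snb.datagen.hadoop.RandomKeySetter", 0.45f)
            .run(prefix + "/persons", prefix + "/universityEdges");
    edgeFiles.add(prefix + "/universityEdges");
    // ...the interest (0.45f) and random (0.1f) passes add their edge files the same way...
    // A single merge job then combines all edge files with the person records.
    new HadoopMergeFriendshipFiles(conf, "ldbc.snb.datagen.hadoop.RandomKeySetter")
            .run(prefix + "/mergedPersons", edgeFiles);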
/*printProgress("Creating edges to fill the degree gap");
long startGap = System.currentTimeMillis();
knowsGenerator = new HadoopKnowsGenerator(conf,null, "ldbc.snb.datagen.hadoop.DegreeGapKeySetter", 1.0f);
@@ -114,14 +121,14 @@ public int runGenerateJob(Configuration conf) throws Exception {
printProgress("Serializing persons");
long startPersonSerializing = System.currentTimeMillis();
HadoopPersonSerializer serializer = new HadoopPersonSerializer(conf);
- serializer.run(personsFileName2);
+ serializer.run(hadoopPrefix + "/mergedPersons");
long endPersonSerializing = System.currentTimeMillis();

long startPersonActivity = System.currentTimeMillis();
if (conf.getBoolean("ldbc.snb.datagen.generator.activity", true)) {
printProgress("Generating and serializing person activity");
HadoopPersonActivityGenerator activityGenerator = new HadoopPersonActivityGenerator(conf);
- activityGenerator.run(personsFileName2);
+ activityGenerator.run(hadoopPrefix + "/mergedPersons");

int numThreads = DatagenParams.numThreads;
int blockSize = DatagenParams.blockSize;
@@ -134,41 +141,41 @@ public int runGenerateJob(Configuration conf) throws Exception {
}
}
}
- fs.delete(new Path(personsFileName2), true);
long endPersonActivity = System.currentTimeMillis();

long startSortingUpdateStreams = System.currentTimeMillis();
- if (conf.getBoolean("ldbc.snb.datagen.serializer.updateStreams", false)) {
- printProgress("Sorting update streams ");

- int blockSize = DatagenParams.blockSize;
- int numBlocks = (int)Math.ceil(DatagenParams.numPersons / (double)blockSize);
-
- for (int i = 0; i < DatagenParams.numThreads; ++i) {
- int numPartitions = conf.getInt("ldbc.snb.datagen.serializer.numUpdatePartitions", 1);
- if (i < numBlocks) {
- for (int j = 0; j < numPartitions; ++j) {
- HadoopFileSorter updateStreamSorter = new HadoopFileSorter(conf, LongWritable.class, Text.class);
- updateStreamSorter.run(DatagenParams.hadoopDir + "/temp_updateStream_person_" + i + "_" + j, DatagenParams.hadoopDir + "/updateStream_person_" + i + "_" + j);
- updateStreamSorter.run(DatagenParams.hadoopDir + "/temp_updateStream_forum_" + i + "_" + j, DatagenParams.hadoopDir + "/updateStream_forum_" + i + "_" + j);
+ if (conf.getBoolean("ldbc.snb.datagen.serializer.updateStreams", false)) {

- fs.delete(new Path(DatagenParams.hadoopDir + "/temp_updateStream_person_" + i + "_" + j), true);
- fs.delete(new Path(DatagenParams.hadoopDir + "/temp_updateStream_forum_" + i + "_" + j), true);
+ printProgress("Sorting update streams ");

- HadoopUpdateStreamSerializer updateSerializer = new HadoopUpdateStreamSerializer(conf);
- updateSerializer.run(DatagenParams.hadoopDir + "/updateStream_person_" + i + "_" + j, i, j, "person");
- updateSerializer.run(DatagenParams.hadoopDir + "/updateStream_forum_" + i + "_" + j, i, j, "forum");
+ int blockSize = DatagenParams.blockSize;
+ int numBlocks = (int)Math.ceil(DatagenParams.numPersons / (double)blockSize);

- fs.delete(new Path(DatagenParams.hadoopDir + "/updateStream_person_" + i + "_" + j), true);
- fs.delete(new Path(DatagenParams.hadoopDir + "/updateStream_forum_" + i + "_" + j), true);
- }
- } else {
- for (int j = 0; j < numPartitions; ++j) {
- fs.delete(new Path(DatagenParams.hadoopDir + "/temp_updateStream_person_" + i + "_" + j), true);
- fs.delete(new Path(DatagenParams.hadoopDir + "/temp_updateStream_forum_" + i + "_" + j), true);
+ for (int i = 0; i < DatagenParams.numThreads; ++i) {
+ int numPartitions = conf.getInt("ldbc.snb.datagen.serializer.numUpdatePartitions", 1);
+ if (i < numBlocks) {
+ for (int j = 0; j < numPartitions; ++j) {
+ HadoopFileSorter updateStreamSorter = new HadoopFileSorter(conf, LongWritable.class, Text.class);
+ HadoopUpdateStreamSerializer updateSerializer = new HadoopUpdateStreamSerializer(conf);
+ updateStreamSorter.run(DatagenParams.hadoopDir + "/temp_updateStream_person_" + i + "_" + j, DatagenParams.hadoopDir + "/updateStream_person_" + i + "_" + j);
+ fs.delete(new Path(DatagenParams.hadoopDir + "/temp_updateStream_person_" + i + "_" + j), true);
+ updateSerializer.run(DatagenParams.hadoopDir + "/updateStream_person_" + i + "_" + j, i, j, "person");
+ fs.delete(new Path(DatagenParams.hadoopDir + "/updateStream_person_" + i + "_" + j), true);
+ if (conf.getBoolean("ldbc.snb.datagen.generator.activity", false)) {
+ updateStreamSorter.run(DatagenParams.hadoopDir + "/temp_updateStream_forum_" + i + "_" + j, DatagenParams.hadoopDir + "/updateStream_forum_" + i + "_" + j);
+ fs.delete(new Path(DatagenParams.hadoopDir + "/temp_updateStream_forum_" + i + "_" + j), true);
+ updateSerializer.run(DatagenParams.hadoopDir + "/updateStream_forum_" + i + "_" + j, i, j, "forum");
+ fs.delete(new Path(DatagenParams.hadoopDir + "/updateStream_forum_" + i + "_" + j), true);
+ }
+ }
+ } else {
+ for (int j = 0; j < numPartitions; ++j) {
+ fs.delete(new Path(DatagenParams.hadoopDir + "/temp_updateStream_person_" + i + "_" + j), true);
+ fs.delete(new Path(DatagenParams.hadoopDir + "/temp_updateStream_forum_" + i + "_" + j), true);
+ }
}
}
- }
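For readability, the per-thread, per-partition work in the rewritten loop above boils down to the following sketch; sortAndSerialize is a hypothetical helper, while the path pattern, classes and the ldbc.snb.datagen.generator.activity guard are taken from the diff:

    // Hypothetical helper condensing the body of the nested loops above.
    void sortAndSerialize(Configuration conf, FileSystem fs, int i, int j, String kind) throws Exception {
        String temp = DatagenParams.hadoopDir + "/temp_updateStream_" + kind + "_" + i + "_" + j;
        String sorted = DatagenParams.hadoopDir + "/updateStream_" + kind + "_" + i + "_" + j;
        new HadoopFileSorter(conf, LongWritable.class, Text.class).run(temp, sorted); // sort the raw stream
        fs.delete(new Path(temp), true);                                              // drop the unsorted temp file
        new HadoopUpdateStreamSerializer(conf).run(sorted, i, j, kind);               // emit the final stream
        fs.delete(new Path(sorted), true);                                            // drop the sorted intermediate
    }
    // "person" streams are always processed; "forum" streams only when
    // ldbc.snb.datagen.generator.activity is enabled, since they exist only
    // if the activity generator ran.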

long minDate = Long.MAX_VALUE;
long maxDate = Long.MIN_VALUE;
@@ -188,18 +195,20 @@ public int runGenerateJob(Configuration conf) throws Exception {
file.close();
fs.delete(propertiesFile, true);

- propertiesFile = new Path(DatagenParams.hadoopDir + "/temp_updateStream_forum_" + i + ".properties");
- file = fs.open(propertiesFile);
- properties = new Properties();
- properties.load(file);
- aux = Long.parseLong(properties.getProperty("ldbc.snb.interactive.min_write_event_start_time"));
- minDate = aux < minDate ? aux : minDate;
- aux = Long.parseLong(properties.getProperty("ldbc.snb.interactive.max_write_event_start_time"));
- maxDate = aux > maxDate ? aux : maxDate;
- aux = Long.parseLong(properties.getProperty("ldbc.snb.interactive.num_events"));
- count += aux;
- file.close();
- fs.delete(propertiesFile, true);
+ if (conf.getBoolean("ldbc.snb.datagen.generator.activity", false)) {
+ propertiesFile = new Path(DatagenParams.hadoopDir + "/temp_updateStream_forum_" + i + ".properties");
+ file = fs.open(propertiesFile);
+ properties = new Properties();
+ properties.load(file);
+ aux = Long.parseLong(properties.getProperty("ldbc.snb.interactive.min_write_event_start_time"));
+ minDate = aux < minDate ? aux : minDate;
+ aux = Long.parseLong(properties.getProperty("ldbc.snb.interactive.max_write_event_start_time"));
+ maxDate = aux > maxDate ? aux : maxDate;
+ aux = Long.parseLong(properties.getProperty("ldbc.snb.interactive.num_events"));
+ count += aux;
+ file.close();
+ fs.delete(propertiesFile, true);
+ }
}

OutputStream output = fs.create(new Path(DatagenParams.socialNetworkDir + "/updateStream" + ".properties"), true);
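The minDate/maxDate/count values accumulated above are what end up in this single updateStream.properties file; a minimal sketch of the write-out, limited to the keys read earlier in this method (the real serializer may write additional keys):

    Properties aggregated = new Properties();
    aggregated.setProperty("ldbc.snb.interactive.min_write_event_start_time", Long.toString(minDate));
    aggregated.setProperty("ldbc.snb.interactive.max_write_event_start_time", Long.toString(maxDate));
    aggregated.setProperty("ldbc.snb.interactive.num_events", Long.toString(count));
    aggregated.store(output, null);
    output.close();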
@@ -229,8 +238,9 @@ public int runGenerateJob(Configuration conf) throws Exception {
+ " total seconds");
System.out.println("Person generation time: " + ((endPerson - startPerson) / 1000));
System.out.println("University correlated edge generation time: " + ((endUniversity - startUniversity) / 1000));
- // System.out.println("Interest correlated edge generation time: "+((endInterest - startInterest) / 1000));
- // System.out.println("Random correlated edge generation time: "+((endRandom - startRandom) / 1000));
+ System.out.println("Interest correlated edge generation time: " + ((endInterest - startInterest) / 1000));
+ System.out.println("Random correlated edge generation time: " + ((endRandom - startRandom) / 1000));
+ System.out.println("Edges merge time: " + ((endMerge - startMerge) / 1000));
System.out.println("Person serialization time: " + ((endPersonSerializing - startPersonSerializing) / 1000));
System.out.println("Person activity generation and serialization time: " + ((endPersonActivity - startPersonActivity) / 1000));
System.out.println("Sorting update streams time: " + ((endSortingUpdateStreams - startSortingUpdateStreams) / 1000));