Commit 3ebbedd

Fully parallelized update stream sorting and serialization

1 parent 4bd1dfd · commit 3ebbedd

10 files changed: +259 −13 lines changed

src/main/java/ldbc/snb/datagen/generator/LDBCDatagen.java

Lines changed: 31 additions & 1 deletion

@@ -49,6 +49,7 @@
 
 import java.io.OutputStream;
 import java.util.ArrayList;
+import java.util.List;
 import java.util.Properties;
 
 public class LDBCDatagen {
@@ -157,7 +158,36 @@ public int runGenerateJob(Configuration conf) throws Exception {
         int blockSize = DatagenParams.blockSize;
         int numBlocks = (int)Math.ceil(DatagenParams.numPersons / (double)blockSize);
 
+        List<String> personStreamsFileNames = new ArrayList<String>();
+        List<String> forumStreamsFileNames = new ArrayList<String>();
         for( int i = 0; i < DatagenParams.numThreads; ++i) {
+            int numPartitions = conf.getInt("ldbc.snb.datagen.serializer.numUpdatePartitions", 1);
+            if( i < numBlocks ) {
+                for (int j = 0; j < numPartitions; ++j) {
+                    personStreamsFileNames.add(DatagenParams.hadoopDir + "/temp_updateStream_person_" + i + "_" + j);
+                    if( conf.getBoolean("ldbc.snb.datagen.generator.activity", false)) {
+                        forumStreamsFileNames.add(DatagenParams.hadoopDir + "/temp_updateStream_forum_" + i + "_" + j);
+                    }
+                }
+            } else {
+                for (int j = 0; j < numPartitions; ++j) {
+                    fs.delete(new Path(DatagenParams.hadoopDir + "/temp_updateStream_person_" + i + "_" + j), true);
+                    fs.delete(new Path(DatagenParams.hadoopDir + "/temp_updateStream_forum_" + i + "_" + j), true);
+                }
+            }
+        }
+        HadoopUpdateStreamSorterAndSerializer updateSorterAndSerializer = new HadoopUpdateStreamSorterAndSerializer(conf);
+        updateSorterAndSerializer.run(personStreamsFileNames, "person");
+        updateSorterAndSerializer.run(forumStreamsFileNames, "forum");
+        for(String file : personStreamsFileNames) {
+            fs.delete(new Path(file), true);
+        }
+
+        for(String file : forumStreamsFileNames) {
+            fs.delete(new Path(file), true);
+        }
+
+        /*for( int i = 0; i < DatagenParams.numThreads; ++i) {
             int numPartitions = conf.getInt("ldbc.snb.datagen.serializer.numUpdatePartitions", 1);
             if( i < numBlocks ) {
                 for (int j = 0; j < numPartitions; ++j) {
@@ -180,7 +210,7 @@ public int runGenerateJob(Configuration conf) throws Exception {
                     fs.delete(new Path(DatagenParams.hadoopDir + "/temp_updateStream_forum_" + i + "_" + j), true);
                 }
             }
-        }
+        }*/
 
         long minDate = Long.MAX_VALUE;
         long maxDate = Long.MIN_VALUE;
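
The driver itself no longer sorts or writes the final update streams: it only collects the per-reducer, per-partition temp SequenceFile names (deleting those belonging to unused reducer slots), hands them to the new HadoopUpdateStreamSorterAndSerializer once for the person streams and once for the forum streams, and removes the temp files afterwards; the old in-driver loop is left commented out. A minimal standalone sketch of driving that job: the paths and values are hypothetical, only the property keys and the API calls come from this commit.

package ldbc.snb.datagen.example;   // hypothetical package, illustration only

import ldbc.snb.datagen.hadoop.HadoopUpdateStreamSorterAndSerializer;
import org.apache.hadoop.conf.Configuration;

import java.util.ArrayList;
import java.util.List;

public class UpdateStreamJobSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Property keys read by the new job; the values here are made up.
        conf.set("ldbc.snb.datagen.serializer.hadoopDir", "/tmp/ldbc/hadoop");
        conf.set("ldbc.snb.datagen.serializer.socialNetworkDir", "/tmp/ldbc/social_network");
        conf.setInt("ldbc.snb.datagen.generator.numThreads", 2);

        // Temp SequenceFiles written by the person serializer, one per (reducer, partition).
        List<String> personStreams = new ArrayList<String>();
        personStreams.add("/tmp/ldbc/hadoop/temp_updateStream_person_0_0");
        personStreams.add("/tmp/ldbc/hadoop/temp_updateStream_person_1_0");

        // One MapReduce job sorts and serializes all person update streams in parallel.
        new HadoopUpdateStreamSorterAndSerializer(conf).run(personStreams, "person");
    }
}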

src/main/java/ldbc/snb/datagen/hadoop/BlockKey.java

Lines changed: 0 additions & 3 deletions

@@ -28,19 +28,16 @@ public BlockKey( long block, TupleKey tk) {
         this.tk = tk;
     }
 
-    @Override
     public void write(DataOutput out) throws IOException {
         out.writeLong(block);
         tk.write(out);
     }
 
-    @Override
     public void readFields(DataInput in) throws IOException {
         block = in.readLong();
         tk.readFields(in);
     }
 
-    @Override
     public int compareTo( BlockKey mpk) {
         if (block < mpk.block) return -1;
         if (block > mpk.block) return 1;

src/main/java/ldbc/snb/datagen/hadoop/HadoopPersonActivityGenerator.java

Lines changed: 1 addition & 1 deletion

@@ -54,7 +54,7 @@ protected void setup(Context context) {
             personActivitySerializer_ = (PersonActivitySerializer) Class.forName(conf.get("ldbc.snb.datagen.serializer.personActivitySerializer")).newInstance();
             personActivitySerializer_.initialize(conf,reducerId);
             if(DatagenParams.updateStreams) {
-                updateSerializer_ = new UpdateEventSerializer(conf, DatagenParams.hadoopDir + "/temp_updateStream_forum_" + reducerId, DatagenParams.numUpdatePartitions);
+                updateSerializer_ = new UpdateEventSerializer(conf, DatagenParams.hadoopDir + "/temp_updateStream_forum_" + reducerId, reducerId, DatagenParams.numUpdatePartitions);
             }
             personActivityGenerator_ = new PersonActivityGenerator(personActivitySerializer_, updateSerializer_);
 

src/main/java/ldbc/snb/datagen/hadoop/HadoopPersonSerializer.java

Lines changed: 1 addition & 1 deletion

@@ -39,7 +39,7 @@ protected void setup(Context context) {
             personSerializer_ = (PersonSerializer) Class.forName(conf.get("ldbc.snb.datagen.serializer.personSerializer")).newInstance();
             personSerializer_.initialize(conf,reducerId);
             if (DatagenParams.updateStreams) {
-                updateSerializer_ = new UpdateEventSerializer(conf, DatagenParams.hadoopDir + "/temp_updateStream_person_" + reducerId, DatagenParams.numUpdatePartitions);
+                updateSerializer_ = new UpdateEventSerializer(conf, DatagenParams.hadoopDir + "/temp_updateStream_person_" + reducerId, reducerId, DatagenParams.numUpdatePartitions);
             }
         } catch( Exception e ) {
             System.err.println(e.getMessage());
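
Both reducers change in the same way: they now pass their own reducerId into UpdateEventSerializer, and the serializer stamps that id into every event key it writes (see the UpdateEventSerializer diff below). A hedged sketch of the new call, wrapped in a hypothetical helper:

package ldbc.snb.datagen.example;   // hypothetical illustration, not part of the commit

import ldbc.snb.datagen.generator.DatagenParams;
import ldbc.snb.datagen.serializer.UpdateEventSerializer;
import org.apache.hadoop.conf.Configuration;

public class SerializerSetupSketch {
    // Mirrors what the reducers' setup() now does: the extra reducerId argument
    // identifies which reducer produced the events in this temp stream.
    static UpdateEventSerializer createPersonStreamSerializer(Configuration conf, int reducerId) {
        return new UpdateEventSerializer(
                conf,
                DatagenParams.hadoopDir + "/temp_updateStream_person_" + reducerId,
                reducerId,
                DatagenParams.numUpdatePartitions);
    }
}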

src/main/java/ldbc/snb/datagen/hadoop/HadoopUpdateEventKeyPartitioner.java (new file)

Lines changed: 20 additions & 0 deletions

@@ -0,0 +1,20 @@
+package ldbc.snb.datagen.hadoop;
+
+import ldbc.snb.datagen.objects.Person;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Partitioner;
+
+/**
+ * Created by aprat on 25/08/15.
+ */
+public class HadoopUpdateEventKeyPartitioner extends Partitioner<UpdateEventKey, Text> {
+
+    public HadoopUpdateEventKeyPartitioner() {
+        super();
+    }
+
+    @Override
+    public int getPartition(UpdateEventKey key, Text text, int numReduceTasks) {
+        return (int)(key.reducerId);
+    }
+}
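
The partitioner does not hash the key: it returns the reducerId stored inside the key, so every event produced by one generator reducer lands on the same reduce task of the sorting job (which runs one reduce task per generator thread). A small sketch with made-up values, assuming the reducerId is smaller than the number of reduce tasks:

package ldbc.snb.datagen.example;   // hypothetical illustration, not part of the commit

import ldbc.snb.datagen.hadoop.HadoopUpdateEventKeyPartitioner;
import ldbc.snb.datagen.hadoop.UpdateEventKey;
import org.apache.hadoop.io.Text;

public class PartitionerSketch {
    public static void main(String[] args) {
        HadoopUpdateEventKeyPartitioner partitioner = new HadoopUpdateEventKeyPartitioner();
        // (date, reducerId, partition): the values are made up.
        UpdateEventKey key = new UpdateEventKey(1325376000000L, 2, 0);
        // With 4 reduce tasks the event goes to task 2, i.e. to its original reducer's slot.
        System.out.println(partitioner.getPartition(key, new Text("event data"), 4));   // prints 2
    }
}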

src/main/java/ldbc/snb/datagen/hadoop/HadoopUpdateStreamSorterAndSerializer.java (new file)

Lines changed: 119 additions & 0 deletions

@@ -0,0 +1,119 @@
+package ldbc.snb.datagen.hadoop;
+
+import ldbc.snb.datagen.serializer.PersonSerializer;
+import ldbc.snb.datagen.serializer.UpdateEventSerializer;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.hadoop.mapreduce.Reducer;
+import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
+import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
+import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
+import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
+
+import java.io.IOException;
+import java.io.OutputStream;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.zip.GZIPOutputStream;
+
+/**
+ * Created by aprat on 10/15/14.
+ */
+public class HadoopUpdateStreamSorterAndSerializer {
+
+    public static class HadoopUpdateStreamSorterAndSerializerReducer extends Reducer<UpdateEventKey, Text, UpdateEventKey, Text> {
+
+        private int reducerId; /** The id of the reducer.**/
+        private PersonSerializer personSerializer_; /** The person serializer **/
+        private UpdateEventSerializer updateSerializer_;
+        private boolean compressed = false;
+        private Configuration conf;
+        private String streamType;
+
+        protected void setup(Context context) {
+            conf = context.getConfiguration();
+            streamType = conf.get("streamType");
+            try {
+                compressed = Boolean.parseBoolean(conf.get("ldbc.snb.datagen.serializer.compressed"));
+            } catch( Exception e) {
+                System.err.println(e.getMessage());
+            }
+        }
+
+        @Override
+        public void reduce(UpdateEventKey key, Iterable<Text> valueSet,Context context)
+                throws IOException, InterruptedException {
+            OutputStream out;
+            try {
+                FileSystem fs = FileSystem.get(conf);
+                if( compressed ) {
+                    Path outFile = new Path(context.getConfiguration().get("ldbc.snb.datagen.serializer.socialNetworkDir")+"/updateStream_"+reducerId+"_"+key.partition+"_"+streamType+".csv.gz");
+                    out = new GZIPOutputStream( fs.create(outFile));
+                } else {
+                    Path outFile = new Path(context.getConfiguration().get("ldbc.snb.datagen.serializer.socialNetworkDir")+"/updateStream_"+reducerId+"_"+key.partition+"_"+streamType+".csv");
+                    out = fs.create(outFile);
+                }
+                for( Text t : valueSet ) {
+                    out.write(t.toString().getBytes("UTF8"));
+                }
+                out.close();
+            } catch( Exception e ) {
+                System.err.println(e.getMessage());
+            }
+        }
+        protected void cleanup(Context context){
+            try {
+            } catch( Exception e ) {
+                System.err.println(e.getMessage());
+            }
+        }
+    }
+
+
+    private Configuration conf;
+
+    public HadoopUpdateStreamSorterAndSerializer(Configuration conf ) {
+        this.conf = new Configuration(conf);
+    }
+
+    public void run(List<String> inputFileNames, String type ) throws Exception {
+
+        int numThreads = conf.getInt("ldbc.snb.datagen.generator.numThreads",1);
+        conf.set("streamType", type);
+
+        Job job = Job.getInstance(conf, "Update Stream Serializer");
+        job.setMapOutputKeyClass(UpdateEventKey.class);
+        job.setMapOutputValueClass(Text.class);
+        job.setOutputKeyClass(UpdateEventKey.class);
+        job.setOutputValueClass(Text.class);
+        job.setJarByClass(HadoopUpdateStreamSorterAndSerializerReducer.class);
+        job.setReducerClass(HadoopUpdateStreamSorterAndSerializerReducer.class);
+        job.setNumReduceTasks(numThreads);
+        job.setInputFormatClass(SequenceFileInputFormat.class);
+        job.setOutputFormatClass(SequenceFileOutputFormat.class);
+        job.setPartitionerClass(HadoopUpdateEventKeyPartitioner.class);
+        job.setGroupingComparatorClass(UpdateEventKeyGroupComparator.class);
+
+        for(String s : inputFileNames) {
+            FileInputFormat.addInputPath(job, new Path(s));
+        }
+        FileOutputFormat.setOutputPath(job, new Path(conf.get("ldbc.snb.datagen.serializer.hadoopDir")+"/aux"));
+        if(!job.waitForCompletion(true)) {
+            throw new Exception();
+        }
+
+
+        try{
+            FileSystem fs = FileSystem.get(conf);
+            fs.delete(new Path(conf.get("ldbc.snb.datagen.serializer.hadoopDir")+"/aux"),true);
+        } catch(IOException e) {
+            System.err.println(e.getMessage());
+        }
+    }
+}
+
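
Each reduce() call corresponds to one (reducerId, partition) group and writes exactly one final CSV into the social network directory; compression only changes the extension to .csv.gz. A small sketch of the file name the reducer builds, with a hypothetical output directory:

package ldbc.snb.datagen.example;   // hypothetical illustration, not part of the commit

public class UpdateStreamFileNameSketch {
    public static void main(String[] args) {
        // Made-up value of ldbc.snb.datagen.serializer.socialNetworkDir.
        String socialNetworkDir = "/tmp/ldbc/social_network";
        int reducerId = 0;
        int partition = 1;
        String streamType = "person";   // "person" or "forum", set by the driver per run()
        String outFile = socialNetworkDir + "/updateStream_" + reducerId + "_" + partition
                + "_" + streamType + ".csv";
        System.out.println(outFile);    // /tmp/ldbc/social_network/updateStream_0_1_person.csv
    }
}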

src/main/java/ldbc/snb/datagen/hadoop/TupleKey.java

Lines changed: 0 additions & 3 deletions

@@ -26,19 +26,16 @@ public TupleKey( long key, long id) {
         this.id = id;
     }
 
-    @Override
     public void write(DataOutput out) throws IOException {
         out.writeLong(key);
         out.writeLong(id);
     }
 
-    @Override
     public void readFields(DataInput in) throws IOException {
         key = in.readLong();
         id = in.readLong();
     }
 
-    @Override
     public int compareTo( TupleKey tk) {
         if (key < tk.key) return -1;
         if (key > tk.key) return 1;

src/main/java/ldbc/snb/datagen/hadoop/UpdateEventKey.java (new file)

Lines changed: 55 additions & 0 deletions

@@ -0,0 +1,55 @@
+package ldbc.snb.datagen.hadoop;
+
+import ldbc.snb.datagen.objects.UpdateEvent;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.io.WritableComparable;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+
+/**
+ * Created by aprat on 5/01/16.
+ */
+public class UpdateEventKey implements WritableComparable<UpdateEventKey> {
+
+    public long date;
+    public int reducerId;
+    public int partition;
+
+    public UpdateEventKey( ) {
+    }
+
+    public UpdateEventKey(UpdateEventKey key) {
+        this.date = key.date;
+        this.reducerId = key.reducerId;
+        this.partition = key.partition;
+    }
+
+    public UpdateEventKey( long date, int reducerId, int partition) {
+
+        this.date = date;
+        this.reducerId = reducerId;
+        this.partition = partition;
+    }
+
+    public void write(DataOutput out) throws IOException {
+        out.writeLong(date);
+        out.writeInt(reducerId);
+        out.writeInt(partition);
+    }
+
+    public void readFields(DataInput in) throws IOException {
+        date = in.readLong();
+        reducerId = in.readInt();
+        partition = in.readInt();
+    }
+
+    public int compareTo( UpdateEventKey key) {
+        if( date < key.date) return -1;
+        if( date > key.date) return 1;
+        if (reducerId != key.reducerId) return reducerId - key.reducerId;
+        if (partition != key.partition) return partition - key.partition;
+        return 0;
+    }
+}
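
compareTo sorts primarily by date, which is what orders the events chronologically during the shuffle of the new job; reducerId and partition only break ties. A short sketch sorting a few made-up keys:

package ldbc.snb.datagen.example;   // hypothetical illustration, not part of the commit

import ldbc.snb.datagen.hadoop.UpdateEventKey;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

public class UpdateEventKeySortSketch {
    public static void main(String[] args) {
        List<UpdateEventKey> keys = new ArrayList<UpdateEventKey>();
        keys.add(new UpdateEventKey(30L, 0, 1));   // (date, reducerId, partition), made-up values
        keys.add(new UpdateEventKey(10L, 1, 0));
        keys.add(new UpdateEventKey(10L, 0, 0));

        // Natural key order: date first, then reducerId, then partition.
        Collections.sort(keys);
        for (UpdateEventKey k : keys) {
            System.out.println(k.date + " " + k.reducerId + " " + k.partition);
        }
        // Prints: 10 0 0, then 10 1 0, then 30 0 1.
    }
}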

src/main/java/ldbc/snb/datagen/hadoop/UpdateEventKeyGroupComparator.java (new file)

Lines changed: 24 additions & 0 deletions

@@ -0,0 +1,24 @@
+package ldbc.snb.datagen.hadoop;
+
+import org.apache.hadoop.io.WritableComparable;
+import org.apache.hadoop.io.WritableComparator;
+
+/**
+ * Created by aprat on 11/17/14.
+ */
+
+public class UpdateEventKeyGroupComparator extends WritableComparator {
+
+    protected UpdateEventKeyGroupComparator() {
+        super(UpdateEventKey.class,true);
+    }
+
+    @Override
+    public int compare(WritableComparable a, WritableComparable b) {
+        UpdateEventKey keyA = (UpdateEventKey)a;
+        UpdateEventKey keyB = (UpdateEventKey)b;
+        if (keyA.reducerId != keyB.reducerId) return keyA.reducerId - keyB.reducerId;
+        if (keyA.partition != keyB.partition) return keyA.partition - keyB.partition;
+        return 0;
+    }
+}
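
Unlike the sort order above, the grouping comparator ignores the date, so all keys sharing a (reducerId, partition) pair are folded into a single reduce() call, and the values arrive there already date-sorted. A sketch, placed in the same package only because the constructor is protected:

package ldbc.snb.datagen.hadoop;   // same package so the protected constructor is visible; illustration only

public class GroupComparatorSketch {
    public static void main(String[] args) {
        UpdateEventKeyGroupComparator grouping = new UpdateEventKeyGroupComparator();
        UpdateEventKey a = new UpdateEventKey(10L, 1, 0);
        UpdateEventKey b = new UpdateEventKey(99L, 1, 0);   // later date, same reducer and partition
        UpdateEventKey c = new UpdateEventKey(10L, 1, 1);   // same date, different partition

        System.out.println(grouping.compare(a, b));   // 0: a and b end up in the same reduce() call
        System.out.println(grouping.compare(a, c));   // non-zero: different partition, separate output file
    }
}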

src/main/java/ldbc/snb/datagen/serializer/UpdateEventSerializer.java

Lines changed: 8 additions & 4 deletions

@@ -38,6 +38,8 @@
 
 import ldbc.snb.datagen.dictionary.Dictionaries;
 import ldbc.snb.datagen.generator.DatagenParams;
+import ldbc.snb.datagen.hadoop.TupleKey;
+import ldbc.snb.datagen.hadoop.UpdateEventKey;
 import ldbc.snb.datagen.objects.*;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.*;
@@ -75,9 +77,11 @@ private class UpdateStreamStats {
     private Configuration conf_;
     private UpdateStreamStats stats_;
     private String fileNamePrefix_;
+    private int reducerId_;
 
-    public UpdateEventSerializer(Configuration conf, String fileNamePrefix, int numPartitions ) {
+    public UpdateEventSerializer(Configuration conf, String fileNamePrefix, int reducerId, int numPartitions ) {
         conf_ = conf;
+        reducerId_ = reducerId;
         stringBuffer_ = new StringBuffer(512);
         data_ = new ArrayList<String>();
         list_ = new ArrayList<String>();
@@ -90,7 +94,7 @@ public UpdateEventSerializer(Configuration conf, String fileNamePrefix, int numP
         FileContext fc = FileContext.getFileContext(conf);
         for( int i = 0; i < numPartitions_; ++i ) {
             Path outFile = new Path(fileNamePrefix_+"_"+i);
-            streamWriter_[i] = SequenceFile.createWriter(fc, conf, outFile, LongWritable.class, Text.class, CompressionType.NONE, new DefaultCodec(),new SequenceFile.Metadata(), EnumSet.of(CreateFlag.CREATE), Options.CreateOpts.checksumParam(Options.ChecksumOpt.createDisabled()));
+            streamWriter_[i] = SequenceFile.createWriter(fc, conf, outFile, UpdateEventKey.class, Text.class, CompressionType.NONE, new DefaultCodec(),new SequenceFile.Metadata(), EnumSet.of(CreateFlag.CREATE), Options.CreateOpts.checksumParam(Options.ChecksumOpt.createDisabled()));
             FileSystem fs = FileSystem.get(conf);
             Path propertiesFile = new Path(fileNamePrefix+".properties");
             if(fs.exists(propertiesFile)){
@@ -116,7 +120,7 @@ public void changePartition() {
 
     public void writeKeyValue( UpdateEvent event ) {
         try{
-            StringBuffer string = new StringBuffer();
+            StringBuilder string = new StringBuilder();
             string.append(Long.toString(event.date));
            string.append("|");
             string.append(Long.toString(event.dependantDate));
@@ -125,7 +129,7 @@ public void writeKeyValue( UpdateEvent event ) {
             string.append("|");
             string.append(event.eventData);
             string.append("\n");
-            streamWriter_[nextPartition_].append(new LongWritable(event.date),new Text(string.toString()));
+            streamWriter_[nextPartition_].append(new UpdateEventKey(event.date, reducerId_, nextPartition_),new Text(string.toString()));
         } catch(IOException e){
             System.err.println(e.getMessage());
             System.exit(-1);
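
With this change the temp SequenceFiles are keyed by UpdateEventKey instead of LongWritable, which is what lets the new job partition by reducer, group by partition and sort by date without re-parsing the serialized event text. A hedged sketch of reading such a temp file back, with a hypothetical path:

package ldbc.snb.datagen.example;   // hypothetical illustration, not part of the commit

import ldbc.snb.datagen.hadoop.UpdateEventKey;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;

public class TempStreamReaderSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path tempFile = new Path("/tmp/ldbc/hadoop/temp_updateStream_person_0_0");   // hypothetical
        SequenceFile.Reader reader = new SequenceFile.Reader(conf, SequenceFile.Reader.file(tempFile));
        UpdateEventKey key = new UpdateEventKey();   // keys are now UpdateEventKey, not LongWritable
        Text value = new Text();
        while (reader.next(key, value)) {
            System.out.println(key.date + " " + key.reducerId + " " + key.partition + " | " + value.toString().trim());
        }
        reader.close();
    }
}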
