Commit 0dfdcf4

Re-enabled sorting prior to person serialization.
Fixed a bug in the TTL serializer that was producing repeated knows, studyAt, and workAt ids.
1 parent 0b28e62 commit 0dfdcf4
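
The bug and the fix are easiest to see together in the diff below: the reducer now receives persons grouped by BlockKey and sets SN.machineId to the block before serializing, so ids minted while serializing a block are drawn from that block's range. Here is a minimal sketch of the underlying idea, assuming an id scheme that composes a per-block (machine) id with a local counter; the names nextId, BLOCK_SIZE, and the formula are illustrative, not the actual SN API:

// Hypothetical sketch of why ids collided: if every reducer composes edge ids
// from the same machineId and a local counter, two reducers emit the same id
// sequence. Setting the machine id per block makes the ranges disjoint.
public final class IdSketch {
    public static long machineId = 0;           // set per reduce group, as in SN.machineId = key.block
    private static final long BLOCK_SIZE = 10_000_000L;  // assumed range width, illustrative
    private static long localCounter = 0;

    /** Compose a globally unique id: the block (machine) id selects a
     *  disjoint range, the local counter enumerates within it. */
    public static long nextId() {
        return machineId * BLOCK_SIZE + localCounter++;
    }

    public static void main(String[] args) {
        machineId = 3;                          // e.g. SN.machineId = key.block
        System.out.println(nextId());           // 30000000
        System.out.println(nextId());           // 30000001
    }
}

Without the block grouping, SN.machineId is never updated per block, every reducer mints ids from the same range, and the TTL serializer emits colliding knows, studyAt, and workAt ids across reducers.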

1 file changed: +13 -13 lines changed
src/main/java/ldbc/snb/datagen/hadoop/HadoopPersonSerializer.java

Lines changed: 13 additions & 13 deletions

@@ -7,6 +7,7 @@
 import ldbc.snb.datagen.objects.Person;
 import ldbc.snb.datagen.serializer.PersonSerializer;
 import ldbc.snb.datagen.serializer.UpdateEventSerializer;
+import ldbc.snb.datagen.vocabulary.SN;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
@@ -25,7 +26,7 @@
  */
 public class HadoopPersonSerializer {
 
-    public static class HadoopPersonSerializerReducer extends Reducer<TupleKey, Person, LongWritable, Person> {
+    public static class HadoopPersonSerializerReducer extends Reducer<BlockKey, Person, LongWritable, Person> {
 
         private int reducerId; /** The id of the reducer. **/
         private PersonSerializer personSerializer_; /** The person serializer. **/
@@ -47,8 +48,9 @@ protected void setup(Context context) {
         }
 
         @Override
-        public void reduce(TupleKey key, Iterable<Person> valueSet, Context context)
+        public void reduce(BlockKey key, Iterable<Person> valueSet, Context context)
             throws IOException, InterruptedException {
+            SN.machineId = key.block;
             personSerializer_.reset();
             for( Person p : valueSet ) {
                 if( p.creationDate() < Dictionaries.dates.getUpdateThreshold() || !DatagenParams.updateStreams ) {
@@ -85,42 +87,40 @@ public void run( String inputFileName ) throws Exception {
 
         FileSystem fs = FileSystem.get(conf);
 
-        /*String rankedFileName = conf.get("ldbc.snb.datagen.serializer.hadoopDir") + "/ranked";
+        String rankedFileName = conf.get("ldbc.snb.datagen.serializer.hadoopDir") + "/ranked";
         HadoopFileRanker hadoopFileRanker = new HadoopFileRanker( conf, TupleKey.class, Person.class, null );
         hadoopFileRanker.run(inputFileName, rankedFileName);
-        */
 
         int numThreads = Integer.parseInt(conf.get("ldbc.snb.datagen.generator.numThreads"));
         Job job = Job.getInstance(conf, "Person Serializer");
-        //job.setMapOutputKeyClass(BlockKey.class);
-        job.setMapOutputKeyClass(TupleKey.class);
+        job.setMapOutputKeyClass(BlockKey.class);
+        //job.setMapOutputKeyClass(TupleKey.class);
         job.setMapOutputValueClass(Person.class);
         job.setOutputKeyClass(LongWritable.class);
         job.setOutputValueClass(Person.class);
         job.setJarByClass(HadoopBlockMapper.class);
-        //job.setMapperClass(HadoopBlockMapper.class);
+        job.setMapperClass(HadoopBlockMapper.class);
         job.setReducerClass(HadoopPersonSerializerReducer.class);
         job.setNumReduceTasks(numThreads);
         job.setInputFormatClass(SequenceFileInputFormat.class);
         job.setOutputFormatClass(SequenceFileOutputFormat.class);
 
         job.setPartitionerClass(HadoopTuplePartitioner.class);
 
-        /*job.setSortComparatorClass(BlockKeyComparator.class);
+        job.setSortComparatorClass(BlockKeyComparator.class);
         job.setGroupingComparatorClass(BlockKeyGroupComparator.class);
         job.setPartitionerClass(HadoopBlockPartitioner.class);
-        */
-
-        //FileInputFormat.setInputPaths(job, new Path(rankedFileName));
-        FileInputFormat.setInputPaths(job, new Path(inputFileName));
+
+        FileInputFormat.setInputPaths(job, new Path(rankedFileName));
+        //FileInputFormat.setInputPaths(job, new Path(inputFileName));
         FileOutputFormat.setOutputPath(job, new Path(conf.get("ldbc.snb.datagen.serializer.hadoopDir") + "/aux"));
         if(!job.waitForCompletion(true)) {
             throw new Exception();
         }
 
         try {
-            // fs.delete(new Path(rankedFileName), true);
+            fs.delete(new Path(rankedFileName), true);
             fs.delete(new Path(conf.get("ldbc.snb.datagen.serializer.hadoopDir") + "/aux"), true);
         } catch(IOException e) {
            System.err.println(e.getMessage());
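
For context, the re-enabled comparator and partitioner wiring is the standard Hadoop secondary-sort pattern: map output is sorted by (block, key), the grouping comparator collapses everything with the same block into a single reduce() call, and the partitioner routes each block to one reducer. Below is a minimal, self-contained sketch of that pattern, assuming a key shaped like the datagen's BlockKey (two longs); the real BlockKey, BlockKeyComparator, and BlockKeyGroupComparator in the repository may differ in detail. Here the key's own compareTo provides the sort order, standing in for BlockKeyComparator:

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

// Illustrative reconstruction of the block-grouping pattern the job relies on.
public class BlockKeySketch implements WritableComparable<BlockKeySketch> {
    public long block;   // the block a person belongs to
    public long key;     // order of the person within the block

    public BlockKeySketch() { }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeLong(block);
        out.writeLong(key);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        block = in.readLong();
        key = in.readLong();
    }

    /** Sort by block first, then by key, so each reducer sees one
     *  contiguous, ordered run of persons per block. */
    @Override
    public int compareTo(BlockKeySketch o) {
        if (block != o.block) return Long.compare(block, o.block);
        return Long.compare(key, o.key);
    }

    /** Grouping comparator: all keys with the same block reach a single
     *  reduce() call, so SN.machineId is set exactly once per block. */
    public static class GroupComparator extends WritableComparator {
        public GroupComparator() { super(BlockKeySketch.class, true); }

        @Override
        @SuppressWarnings("rawtypes")
        public int compare(WritableComparable a, WritableComparable b) {
            return Long.compare(((BlockKeySketch) a).block,
                                ((BlockKeySketch) b).block);
        }
    }
}

Wired into a job the same way the diff does it: job.setMapOutputKeyClass(BlockKeySketch.class) and job.setGroupingComparatorClass(BlockKeySketch.GroupComparator.class), with a partitioner that hashes only the block field.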
