Commit d936879

Merge pull request #19 from RADAR-base/release-0.3.2

Release 0.3.2

2 parents d225f6f + 1b7b554, commit d936879

12 files changed: +207 −90 lines changed

README.md

Lines changed: 2 additions & 2 deletions
@@ -1,6 +1,6 @@
 # Restructure HDFS files
 
-[![Build Status](https://travis-ci.org/RADAR-CNS/Restructure-HDFS-topic.svg?branch=master)](https://travis-ci.org/RADAR-CNS/Restructure-HDFS-topic)
+[![Build Status](https://travis-ci.org/RADAR-base/Restructure-HDFS-topic.svg?branch=master)](https://travis-ci.org/RADAR-base/Restructure-HDFS-topic)
 
 Data streamed to HDFS using the [RADAR HDFS sink connector](https://github.com/RADAR-CNS/RADAR-HDFS-Sink-Connector) is streamed to files based on sensor only. This package can transform that output to a local directory structure as follows: `userId/topic/date_hour.csv`. The date and hour is extracted from the `time` field of each record, and is formatted in UTC time.
 
@@ -31,4 +31,4 @@ Another option is to output the data in compressed form. All files will get the
 java -Dorg.radarcns.compress=gzip -jar restructurehdfs-0.3.1-all.jar <webhdfs_url> <hdfs_topic_path> <output_folder>
 ```
 
-Finally, files records are deduplicated after writing. To disable this behaviour, specify the option `-Dorg.radarcns.deduplicate=false`.
+Finally, by default, file records are not deduplicated after writing. To enable this behaviour, specify the option `-Dorg.radarcns.deduplicate=true`. This is set to false by default because of an issue with Biovotion data; please see [issue #16](https://github.com/RADAR-base/Restructure-HDFS-topic/issues/16) before enabling it.
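
For clarity, enabling deduplication would then look like the README's earlier invocation (a sketch only; the jar name assumes the 0.3.2 version set in build.gradle below):

```
java -Dorg.radarcns.deduplicate=true -jar restructurehdfs-0.3.2-all.jar <webhdfs_url> <hdfs_topic_path> <output_folder>
```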

build.gradle

Lines changed: 1 addition & 1 deletion
@@ -2,7 +2,7 @@ apply plugin: 'java'
 apply plugin: 'application'
 
 group 'org.radarcns.restructurehdfs'
-version '0.3.1'
+version '0.3.2'
 mainClassName = 'org.radarcns.RestructureAvroRecords'
 
 run {

src/main/java/org/radarcns/RestructureAvroRecords.java

Lines changed: 51 additions & 17 deletions
@@ -16,6 +16,7 @@
 
 package org.radarcns;
 
+import com.fasterxml.jackson.databind.JsonMappingException;
 import org.apache.avro.Schema.Field;
 import org.apache.avro.file.DataFileReader;
 import org.apache.avro.generic.GenericDatumReader;
@@ -70,7 +71,9 @@ public class RestructureAvroRecords {
     private long processedFileCount;
     private long processedRecordsCount;
     private static final boolean USE_GZIP = "gzip".equalsIgnoreCase(System.getProperty("org.radarcns.compression"));
-    private static final boolean DO_DEDUPLICATE = "true".equalsIgnoreCase(System.getProperty("org.radarcns.deduplicate", "true"));
+
+    // Default is false because deduplication causes loss of records from Biovotion data. https://github.com/RADAR-base/Restructure-HDFS-topic/issues/16
+    private static final boolean DO_DEDUPLICATE = "true".equalsIgnoreCase(System.getProperty("org.radarcns.deduplicate", "false"));
 
     public static void main(String [] args) throws Exception {
         if (args.length != 3) {
@@ -178,7 +181,12 @@ public void start(String directoryName) throws IOException {
         for (Map.Entry<String, List<Path>> entry : topicPaths.entrySet()) {
             try (FileCacheStore cache = new FileCacheStore(converterFactory, 100, USE_GZIP, DO_DEDUPLICATE)) {
                 for (Path filePath : entry.getValue()) {
-                    this.processFile(filePath, entry.getKey(), cache, offsets);
+                    // If a JsonMappingException occurs, log the error and continue with the other files
+                    try {
+                        this.processFile(filePath, entry.getKey(), cache, offsets);
+                    } catch (JsonMappingException exc) {
+                        logger.error("Cannot map values", exc);
+                    }
                     progressBar.update(++processedFileCount);
                 }
             }
@@ -232,7 +240,7 @@ private void processFile(Path filePath, String topicName, FileCacheStore cache,
             record = dataFileReader.next(record);
 
             // Get the fields
-            this.writeRecord(record, topicName, cache);
+            this.writeRecord(record, topicName, cache, 0);
         }
 
         // Write which file has been processed and update bins
@@ -245,7 +253,7 @@ record = dataFileReader.next(record);
         }
     }
 
-    private void writeRecord(GenericRecord record, String topicName, FileCacheStore cache)
+    private void writeRecord(GenericRecord record, String topicName, FileCacheStore cache, int suffix)
             throws IOException {
         GenericRecord keyField = (GenericRecord) record.get("key");
         GenericRecord valueField = (GenericRecord) record.get("value");
@@ -256,37 +264,63 @@ private void writeRecord(GenericRecord record, String topicName, FileCacheStore
         }
 
         Date time = getDate(keyField, valueField);
-        java.nio.file.Path outputFileName = createFilename(time);
+        java.nio.file.Path outputFileName = createFilename(time, suffix);
+
+        String projectId;
+
+        if (keyField.get("projectId") == null) {
+            projectId = "unknown-project";
+        } else {
+            // Clean the project id for use in the final pathname
+            projectId = keyField.get("projectId").toString().replaceAll("[^a-zA-Z0-9_-]+", "");
+        }
 
         // Clean user id and create final output pathname
         String userId = keyField.get("userId").toString().replaceAll("[^a-zA-Z0-9_-]+", "");
-        java.nio.file.Path userDir = this.outputPath.resolve(userId);
+
+        java.nio.file.Path projectDir = this.outputPath.resolve(projectId);
+        java.nio.file.Path userDir = projectDir.resolve(userId);
         java.nio.file.Path userTopicDir = userDir.resolve(topicName);
         java.nio.file.Path outputPath = userTopicDir.resolve(outputFileName);
 
         // Write data
-        cache.writeRecord(outputPath, record);
+        int response = cache.writeRecord(outputPath, record);
 
-        java.nio.file.Path schemaPath = userTopicDir.resolve(SCHEMA_OUTPUT_FILE_NAME);
-        if (!Files.exists(schemaPath)) {
-            try (Writer writer = Files.newBufferedWriter(schemaPath)) {
-                writer.write(record.getSchema().toString(true));
+        if (response == FileCacheStore.CACHE_AND_NO_WRITE || response == FileCacheStore.NO_CACHE_AND_NO_WRITE) {
+            // The write was unsuccessful due to a different number of columns,
+            // so try again with a new file name
+            writeRecord(record, topicName, cache, ++suffix);
+        } else {
+            // The write was successful, so finalize it
+            java.nio.file.Path schemaPath = userTopicDir.resolve(SCHEMA_OUTPUT_FILE_NAME);
+            if (!Files.exists(schemaPath)) {
+                try (Writer writer = Files.newBufferedWriter(schemaPath)) {
+                    writer.write(record.getSchema().toString(true));
+                }
             }
-        }
 
-        // Count data (binned and total)
-        bins.add(topicName, keyField.get("sourceId").toString(), time);
-        processedRecordsCount++;
+            // Count data (binned and total)
+            bins.add(topicName, keyField.get("sourceId").toString(), time);
+            processedRecordsCount++;
+        }
     }
 
-    private java.nio.file.Path createFilename(Date date) {
+    private java.nio.file.Path createFilename(Date date, int suffix) {
         if (date == null) {
             logger.warn("Time field of record valueField is not set");
             return Paths.get("unknown_date." + outputFileExtension);
         }
+
+        String finalSuffix;
+        if (suffix == 0) {
+            finalSuffix = "";
+        } else {
+            finalSuffix = "_" + suffix;
+        }
+
         // Make a timestamped filename YYYYMMDD_HH00.json
         String hourlyTimestamp = createHourTimestamp(date);
-        return Paths.get(hourlyTimestamp + "00." + outputFileExtension);
+        return Paths.get(hourlyTimestamp + "00" + finalSuffix + "." + outputFileExtension);
     }
 
     public static String createHourTimestamp(Date date) {
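
To make the retry naming concrete: below is a minimal, runnable sketch of the suffix scheme that `createFilename(Date, int)` implements. The class name and the pre-formatted `hourlyTimestamp` argument are stand-ins for illustration; the real method derives the timestamp via `createHourTimestamp(date)`.

```java
import java.nio.file.Path;
import java.nio.file.Paths;

public class FilenameSuffixSketch {
    // Mirrors the suffix logic of createFilename: suffix 0 yields the plain
    // hourly name, any other suffix is appended as "_<suffix>".
    static Path createFilename(String hourlyTimestamp, int suffix, String extension) {
        String finalSuffix = suffix == 0 ? "" : "_" + suffix;
        return Paths.get(hourlyTimestamp + "00" + finalSuffix + "." + extension);
    }

    public static void main(String[] args) {
        System.out.println(createFilename("20180101_14", 0, "csv")); // 20180101_1400.csv
        System.out.println(createFilename("20180101_14", 2, "csv")); // 20180101_1400_2.csv
    }
}
```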

src/main/java/org/radarcns/util/CsvAvroConverter.java

Lines changed: 52 additions & 10 deletions
@@ -16,6 +16,9 @@
 
 package org.radarcns.util;
 
+import com.fasterxml.jackson.databind.JsonMappingException;
+import com.fasterxml.jackson.databind.MappingIterator;
+import com.fasterxml.jackson.databind.ObjectReader;
 import com.fasterxml.jackson.databind.ObjectWriter;
 import com.fasterxml.jackson.dataformat.csv.CsvFactory;
 import com.fasterxml.jackson.dataformat.csv.CsvGenerator;
@@ -28,11 +31,10 @@
 import org.apache.avro.generic.GenericRecord;
 
 import java.io.IOException;
+import java.io.Reader;
 import java.io.Writer;
 import java.nio.ByteBuffer;
-import java.util.LinkedHashMap;
-import java.util.List;
-import java.util.Map;
+import java.util.*;
 
 /**
  * Converts deep hierarchical Avro records into flat CSV format. It uses a simple dot syntax in the
@@ -45,8 +47,8 @@ public static RecordConverterFactory getFactory() {
         CsvFactory factory = new CsvFactory();
         return new RecordConverterFactory() {
             @Override
-            public RecordConverter converterFor(Writer writer, GenericRecord record, boolean writeHeader) throws IOException {
-                return new CsvAvroConverter(factory, writer, record, writeHeader);
+            public RecordConverter converterFor(Writer writer, GenericRecord record, boolean writeHeader, Reader reader) throws IOException {
+                return new CsvAvroConverter(factory, writer, record, writeHeader, reader);
             }
 
             @Override
@@ -59,28 +61,68 @@ public boolean hasHeader() {
     private final ObjectWriter csvWriter;
     private final Map<String, Object> map;
    private final CsvGenerator generator;
+    private CsvSchema schema;
 
-    public CsvAvroConverter(CsvFactory factory, Writer writer, GenericRecord record, boolean writeHeader)
+    public CsvAvroConverter(CsvFactory factory, Writer writer, GenericRecord record, boolean writeHeader, Reader reader)
             throws IOException {
         map = new LinkedHashMap<>();
-        Map<String, Object> value = convertRecord(record);
+
+        CsvMapper mapper = new CsvMapper(factory);
+        Map<String, Object> value;
+
+        schema = CsvSchema.emptySchema().withHeader();
+        if (!writeHeader) {
+            // If the file already exists, read the schema from the CSV file
+            ObjectReader objectReader = mapper.readerFor(Map.class).with(schema);
+            MappingIterator<Map<String, Object>> iterator = objectReader.readValues(reader);
+            value = iterator.next();
+        } else {
+            value = convertRecord(record);
+        }
+
         CsvSchema.Builder builder = new CsvSchema.Builder();
         for (String key : value.keySet()) {
             builder.addColumn(key);
         }
-        CsvSchema schema = builder.build();
+        schema = builder.build();
+
         if (writeHeader) {
             schema = schema.withHeader();
         }
+
         generator = factory.createGenerator(writer);
-        csvWriter = new CsvMapper(factory).writer(schema);
+        csvWriter = mapper.writer(schema);
     }
 
+    /**
+     * Write an Avro record to the CSV file.
+     * @param record the Avro record to be written
+     * @return true if the write was successful, false if the record cannot be written to the current CSV file
+     * @throws IOException for other IO and mapping errors
+     */
     @Override
-    public void writeRecord(GenericRecord record) throws IOException {
+    public boolean writeRecord(GenericRecord record) throws IOException {
         Map<String, Object> localMap = convertRecord(record);
+
+        if (localMap.size() > schema.size()) {
+            // Cannot write to the same file, so return false
+            return false;
+        } else {
+            Iterator<String> localColumnIterator = localMap.keySet().iterator();
+            for (int i = 0; i < schema.size(); i++) {
+                if (!schema.columnName(i).equals(localColumnIterator.next())) {
+                    // The order or name of the columns is different, so this record
+                    // cannot be written to this CSV file; return false.
+                    return false;
+                }
+            }
+        }
+
         csvWriter.writeValue(generator, localMap);
         localMap.clear();
+        return true;
     }
 
     public Map<String, Object> convertRecord(GenericRecord record) {
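
The key idea in the new constructor is that when appending to an existing CSV file, the column schema is recovered from that file's header rather than from the incoming record. Below is a minimal, self-contained sketch of that step; the class name and sample data are hypothetical, and it assumes only the jackson-dataformat-csv dependency the converter already uses.

```java
import com.fasterxml.jackson.databind.MappingIterator;
import com.fasterxml.jackson.dataformat.csv.CsvMapper;
import com.fasterxml.jackson.dataformat.csv.CsvSchema;

import java.io.Reader;
import java.io.StringReader;
import java.util.Map;

public class CsvHeaderSketch {
    public static void main(String[] args) throws Exception {
        // Stand-in for the Reader over an existing CSV file
        Reader reader = new StringReader("key.userId,value.time\nuser1,1514764800\n");

        // Parse the first row with a header-aware schema, as the constructor does
        CsvMapper mapper = new CsvMapper();
        MappingIterator<Map<String, Object>> iterator = mapper
                .readerFor(Map.class)
                .with(CsvSchema.emptySchema().withHeader())
                .readValues(reader);
        Map<String, Object> firstRow = iterator.next();

        // Rebuild the CsvSchema from the recovered column names; Jackson binds
        // to LinkedHashMap by default, so key order matches column order
        CsvSchema.Builder builder = new CsvSchema.Builder();
        for (String column : firstRow.keySet()) {
            builder.addColumn(column);
        }
        CsvSchema schema = builder.build();

        System.out.println(schema.columnName(0)); // key.userId
        System.out.println(schema.columnName(1)); // value.time
    }
}
```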

src/main/java/org/radarcns/util/FileCache.java

Lines changed: 17 additions & 13 deletions
@@ -16,17 +16,11 @@
 
 package org.radarcns.util;
 
-import java.io.BufferedOutputStream;
-import java.io.Closeable;
-import java.io.FileOutputStream;
-import java.io.Flushable;
-import java.io.IOException;
-import java.io.OutputStream;
-import java.io.OutputStreamWriter;
-import java.io.Writer;
+import java.io.*;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.nio.file.StandardOpenOption;
+import java.util.zip.GZIPInputStream;
 import java.util.zip.GZIPOutputStream;
 import javax.annotation.Nonnull;
 import org.apache.avro.generic.GenericRecord;
@@ -57,15 +51,19 @@ public FileCache(RecordConverterFactory converterFactory, Path path,
 
         OutputStream outFile = Files.newOutputStream(path,
                 StandardOpenOption.APPEND, StandardOpenOption.CREATE);
+        InputStream inputStream = new BufferedInputStream(Files.newInputStream(path));
         OutputStream bufOut = new BufferedOutputStream(outFile);
         if (gzip) {
             bufOut = new GZIPOutputStream(bufOut);
+            if (!fileIsNew) {
+                inputStream = new GZIPInputStream(inputStream);
+            }
         }
 
         this.writer = new OutputStreamWriter(bufOut);
 
-        try {
-            this.recordConverter = converterFactory.converterFor(writer, record, fileIsNew);
+        try (Reader reader = new InputStreamReader(inputStream)) {
+            this.recordConverter = converterFactory.converterFor(writer, record, fileIsNew, reader);
         } catch (IOException ex) {
             try {
                 writer.close();
@@ -76,10 +74,16 @@ public FileCache(RecordConverterFactory converterFactory, Path path,
         }
     }
 
-    /** Write a record to the cache. */
-    public void writeRecord(GenericRecord record) throws IOException {
-        this.recordConverter.writeRecord(record);
+    /**
+     * Write a record to the cache.
+     * @param record Avro record
+     * @return true or false based on the {@link RecordConverter} write result
+     * @throws IOException if the record cannot be written to the file
+     */
+    public boolean writeRecord(GenericRecord record) throws IOException {
+        boolean result = this.recordConverter.writeRecord(record);
         lastUse = System.nanoTime();
+        return result;
    }
 
     @Override
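
The constructor change opens the existing file for reading as well as appending, so the CSV header can be recovered even when the file is gzipped. Below is a standalone sketch of that conditional stream wrapping; the class and method names are hypothetical.

```java
import java.io.*;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;

public class GzipReadbackSketch {
    // Mirrors the constructor's stream setup: gunzip the existing file only when
    // compression is on and the file already has content to read a header from.
    static Reader openForHeader(Path path, boolean gzip, boolean fileIsNew) throws IOException {
        InputStream in = new BufferedInputStream(Files.newInputStream(path));
        if (gzip && !fileIsNew) {
            in = new GZIPInputStream(in);
        }
        return new InputStreamReader(in);
    }

    public static void main(String[] args) throws IOException {
        // Write a small gzipped CSV file, then read its header line back
        Path tmp = Files.createTempFile("cache", ".csv.gz");
        try (Writer w = new OutputStreamWriter(new GZIPOutputStream(Files.newOutputStream(tmp)))) {
            w.write("a,b\n1,2\n");
        }
        try (BufferedReader r = new BufferedReader(openForHeader(tmp, true, false))) {
            System.out.println(r.readLine()); // a,b
        }
    }
}
```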

src/main/java/org/radarcns/util/FileCacheStore.java

Lines changed: 24 additions & 6 deletions
@@ -40,6 +40,13 @@ public class FileCacheStore implements Flushable, Closeable {
     private final int maxFiles;
     private final Map<Path, FileCache> caches;
 
+    // Response codes for each write record case
+    public static final int CACHE_AND_WRITE = 1;       // used the cache and the write was successful
+    public static final int NO_CACHE_AND_WRITE = 2;    // opened a new file and the write was successful
+    public static final int CACHE_AND_NO_WRITE = 3;    // used the cache but the write failed
+    public static final int NO_CACHE_AND_NO_WRITE = 4; // opened a new file but the write failed
+
+
     public FileCacheStore(RecordConverterFactory converterFactory, int maxFiles, boolean gzip, boolean deduplicate) {
         this.converterFactory = converterFactory;
         this.maxFiles = maxFiles;
@@ -54,14 +61,19 @@ public FileCacheStore(RecordConverterFactory converterFactory, int maxFiles, boo
      *
      * @param path file to append data to
      * @param record data
-     * @return true if the cache was used, false if a new file was opened.
+     * @return an integer value matching one of the response codes above.
      * @throws IOException when failing to open a file or writing to it.
      */
-    public boolean writeRecord(Path path, GenericRecord record) throws IOException {
+    public int writeRecord(Path path, GenericRecord record) throws IOException {
         FileCache cache = caches.get(path);
         if (cache != null) {
-            cache.writeRecord(record);
-            return true;
+            if (cache.writeRecord(record)) {
+                return CACHE_AND_WRITE;
+            } else {
+                // The cache was used but the write was unsuccessful
+                // because of a different number of columns in the same topic
+                return CACHE_AND_NO_WRITE;
+            }
         } else {
             ensureCapacity();
 
@@ -70,8 +82,14 @@ public boolean writeRecord(Path path, GenericRecord record) throws IOException {
 
             cache = new FileCache(converterFactory, path, record, gzip);
             caches.put(path, cache);
-            cache.writeRecord(record);
-            return false;
+            if (cache.writeRecord(record)) {
+                return NO_CACHE_AND_WRITE;
+            } else {
+                // The file path was not in the cache but the file exists, and this write
+                // was unsuccessful because of a different number of columns
+                return NO_CACHE_AND_NO_WRITE;
+            }
+
         }
     }
 
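
RestructureAvroRecords reacts to these codes recursively; an equivalent iterative caller can make the contract easier to see. Here is a minimal runnable sketch where all names are stand-ins and a stub simulates two incompatible files before a write succeeds. Keeping the cache/no-cache distinction in the return value preserves the information the old boolean carried while adding the write-failed cases.

```java
public class ResponseCodeSketch {
    // The four response codes, copied from FileCacheStore
    static final int CACHE_AND_WRITE = 1;
    static final int NO_CACHE_AND_WRITE = 2;
    static final int CACHE_AND_NO_WRITE = 3;
    static final int NO_CACHE_AND_NO_WRITE = 4;

    // Stub standing in for FileCacheStore.writeRecord: pretend the plain hourly
    // file and its first variant both have an incompatible column set.
    static int writeRecord(String path) {
        return path.endsWith("_2.csv") ? NO_CACHE_AND_WRITE : NO_CACHE_AND_NO_WRITE;
    }

    public static void main(String[] args) {
        int suffix = 0;
        int response;
        String path;
        do {
            // Same suffix scheme as createFilename: retry with the next file name
            path = "20180101_1400" + (suffix == 0 ? "" : "_" + suffix) + ".csv";
            response = writeRecord(path);
            suffix++;
        } while (response == CACHE_AND_NO_WRITE || response == NO_CACHE_AND_NO_WRITE);
        System.out.println("Record landed in " + path); // 20180101_1400_2.csv
    }
}
```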

src/main/java/org/radarcns/util/JsonAvroConverter.java

Lines changed: 3 additions & 2 deletions
@@ -42,7 +42,7 @@ public final class JsonAvroConverter implements RecordConverter {
 
     public static RecordConverterFactory getFactory() {
         JsonFactory factory = new JsonFactory();
-        return (writer, record, writeHeader) -> new JsonAvroConverter(factory, writer);
+        return (writer, record, writeHeader, reader) -> new JsonAvroConverter(factory, writer);
     }
 
     private final ObjectWriter jsonWriter;
@@ -54,8 +54,9 @@ public JsonAvroConverter(JsonFactory factory, Writer writer) throws IOException
     }
 
     @Override
-    public void writeRecord(GenericRecord record) throws IOException {
+    public boolean writeRecord(GenericRecord record) throws IOException {
         jsonWriter.writeValue(generator, convertRecord(record));
+        return true;
     }
 
     public Map<String, Object> convertRecord(GenericRecord record) {
