
Commit b7ce9b6

Merge pull request #11 from RADAR-CNS/v0.3-release
V0.3 release
2 parents 1cd1350 + 8b5b0dd commit b7ce9b6

17 files changed: +390 −185 lines

README.md

Lines changed: 6 additions & 4 deletions
@@ -15,18 +15,20 @@ Build jar from source with
 ```shell
 ./gradlew build
 ```
-and find the output JAR file as `build/libs/restructurehdfs-0.2.1-all.jar`. Then run with:
+and find the output JAR file as `build/libs/restructurehdfs-0.3-all.jar`. Then run with:
 
 ```shell
-java -jar restructurehdfs-0.2.1-all.jar <webhdfs_url> <hdfs_topic_path> <output_folder>
+java -jar restructurehdfs-0.3-all.jar <webhdfs_url> <hdfs_topic_path> <output_folder>
 ```
 
 By default, this will output the data in CSV format. If JSON format is preferred, use the following instead:
 ```
-java -Dorg.radarcns.format=json -jar restructurehdfs-0.2.1-all.jar <webhdfs_url> <hdfs_topic_path> <output_folder>
+java -Dorg.radarcns.format=json -jar restructurehdfs-0.3-all.jar <webhdfs_url> <hdfs_topic_path> <output_folder>
 ```
 
 Another option is to output the data in compressed form. All files will get the `gz` suffix, and can be decompressed with a GZIP decoder. Note that for a very small number of records, this may actually increase the file size.
 ```
-java -Dorg.radarcns.compress=gzip -jar restructurehdfs-0.2.1-all.jar <webhdfs_url> <hdfs_topic_path> <output_folder>
+java -Dorg.radarcns.compress=gzip -jar restructurehdfs-0.3-all.jar <webhdfs_url> <hdfs_topic_path> <output_folder>
 ```
+
+Finally, file records are deduplicated after writing. To disable this behaviour, specify the option `-Dorg.radarcns.deduplicate=false`.
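
Note: since these are all plain JVM system properties, they can be combined in a single invocation. A hypothetical combination (not itself part of the README):

```shell
java -Dorg.radarcns.format=json -Dorg.radarcns.deduplicate=false -jar restructurehdfs-0.3-all.jar <webhdfs_url> <hdfs_topic_path> <output_folder>
```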

build.gradle

Lines changed: 1 addition & 1 deletion
@@ -2,7 +2,7 @@ apply plugin: 'java'
 apply plugin: 'application'
 
 group 'org.radarcns.restructurehdfs'
-version '0.2.1'
+version '0.3'
 mainClassName = 'org.radarcns.RestructureAvroRecords'
 
 run {

src/main/java/org/radarcns/Frequency.java

Lines changed: 17 additions & 18 deletions
@@ -16,40 +16,40 @@
 
 package org.radarcns;
 
-import java.nio.file.Files;
-import java.util.Date;
-import java.util.List;
-import java.util.Objects;
-import javax.annotation.Nonnull;
-import org.apache.avro.Schema.Field;
-import org.apache.avro.generic.GenericRecord;
 import org.apache.commons.collections.MapIterator;
 import org.apache.commons.collections.keyvalue.MultiKey;
 import org.apache.commons.collections.map.MultiKeyMap;
-
-import java.io.*;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import javax.annotation.Nonnull;
+import java.io.BufferedWriter;
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.Date;
+import java.util.List;
+import java.util.Objects;
+
 
 public class Frequency {
     private static final Logger logger = LoggerFactory.getLogger(Frequency.class);
 
     private final MultiKeyMap bins;
-    private final File file;
+    private final Path path;
 
-    public Frequency(@Nonnull File file, @Nonnull MultiKeyMap initialData) {
-        Objects.requireNonNull(file);
+    public Frequency(@Nonnull Path path, @Nonnull MultiKeyMap initialData) {
+        Objects.requireNonNull(path);
         Objects.requireNonNull(initialData);
-        this.file = file;
+        this.path = path;
         this.bins = initialData;
     }
 
-    public static Frequency read(File file) {
+    public static Frequency read(Path path) {
         MultiKeyMap map = new MultiKeyMap();
         try {
             // Read in all lines as multikeymap (key, key, key, value)
-            List<String> lines = Files.readAllLines(file.toPath());
+            List<String> lines = Files.readAllLines(path);
             lines.subList(1, lines.size()).forEach(line -> {
                 String[] columns = line.split(",");
                 try {
@@ -61,7 +61,7 @@ public static Frequency read(File file) {
         } catch (IOException e) {
             logger.warn("Could not read the file with bins. Creating new file when writing.");
         }
-        return new Frequency(file, map);
+        return new Frequency(path, map);
     }
 
     public void add(String topicName, String id, Date date) {
@@ -88,8 +88,7 @@ public void print() {
     public void write() {
         // Write all bins to csv
         MapIterator mapIterator = bins.mapIterator();
-        try (FileWriter fw = new FileWriter(file, false);
-                BufferedWriter bw = new BufferedWriter(fw)) {
+        try (BufferedWriter bw = Files.newBufferedWriter(path)) {
             String header = String.join(",","topic","device","timestamp","count");
             bw.write(header);
             bw.write('\n');
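
The pattern in this file recurs throughout the commit: `java.io.File` plus `FileWriter`/`FileReader` is replaced by `java.nio.file.Path` with the `Files` factory methods. A minimal standalone sketch of the after-state (the `bins-example.csv` path is made up for illustration):

```java
import java.io.BufferedWriter;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;

public class PathMigrationSketch {
    public static void main(String[] args) throws IOException {
        Path path = Paths.get("bins-example.csv");
        // With no OpenOptions, Files.newBufferedWriter defaults to
        // CREATE + TRUNCATE_EXISTING + WRITE, matching the truncating
        // behaviour of the old `new FileWriter(file, false)`.
        try (BufferedWriter bw = Files.newBufferedWriter(path)) {
            bw.write(String.join(",", "topic", "device", "timestamp", "count"));
            bw.write('\n');
        }
        // Reading is symmetric: Files.readAllLines(path) replaces
        // going through file.toPath() on a java.io.File.
        Files.readAllLines(path).forEach(System.out::println);
    }
}
```

One behavioural difference worth keeping in mind: `FileWriter` uses the platform default charset, while `Files.newBufferedWriter` defaults to UTF-8.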

src/main/java/org/radarcns/OffsetRangeFile.java

Lines changed: 13 additions & 17 deletions
@@ -23,15 +23,15 @@
 import com.fasterxml.jackson.dataformat.csv.CsvGenerator;
 import com.fasterxml.jackson.dataformat.csv.CsvMapper;
 import com.fasterxml.jackson.dataformat.csv.CsvSchema;
+
 import java.io.BufferedReader;
 import java.io.BufferedWriter;
 import java.io.Closeable;
-import java.io.File;
-import java.io.FileReader;
-import java.io.FileWriter;
 import java.io.Flushable;
 import java.io.IOException;
 import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.StandardOpenOption;
 
 import static java.nio.file.StandardCopyOption.REPLACE_EXISTING;
 
@@ -56,19 +56,18 @@ private OffsetRangeFile() {
         // utility class
     }
 
-    public static void cleanUp(File file) throws IOException {
-        File tmpFile = File.createTempFile("offsets", ".csv.tmp");
-        try (OffsetRangeFile.Writer offsets = new OffsetRangeFile.Writer(tmpFile)) {
-            offsets.write(OffsetRangeFile.read(file));
+    public static void cleanUp(Path path) throws IOException {
+        Path tmpPath = Files.createTempFile("offsets", ".csv.tmp");
+        try (OffsetRangeFile.Writer offsets = new OffsetRangeFile.Writer(tmpPath)) {
+            offsets.write(OffsetRangeFile.read(path));
         }
-        Files.move(tmpFile.toPath(), file.toPath(), REPLACE_EXISTING);
+        Files.move(tmpPath, path, REPLACE_EXISTING);
     }
 
-    public static OffsetRangeSet read(File inputFile) throws IOException {
+    public static OffsetRangeSet read(Path path) throws IOException {
         OffsetRangeSet set = new OffsetRangeSet();
 
-        try (FileReader fr = new FileReader(inputFile);
-                BufferedReader br = new BufferedReader(fr)) {
+        try (BufferedReader br = Files.newBufferedReader(path)) {
             MappingIterator<OffsetRange> ranges = CSV_READER.readValues(br);
             while(ranges.hasNext()) {
                 set.add(ranges.next());
@@ -78,15 +77,13 @@ public static OffsetRangeSet read(File inputFile) throws IOException {
     }
 
     public static class Writer implements Flushable, Closeable {
-        private final FileWriter fileWriter;
         private final BufferedWriter bufferedWriter;
         private final CsvGenerator generator;
         private final ObjectWriter writer;
 
-        public Writer(File outputFile) throws IOException {
-            boolean fileIsNew = !outputFile.exists() || outputFile.length() == 0;
-            this.fileWriter = new FileWriter(outputFile, true);
-            this.bufferedWriter = new BufferedWriter(this.fileWriter);
+        public Writer(Path path) throws IOException {
+            boolean fileIsNew = !Files.exists(path) || Files.size(path) == 0;
+            this.bufferedWriter = Files.newBufferedWriter(path, StandardOpenOption.APPEND, StandardOpenOption.CREATE);
             this.generator = CSV_FACTORY.createGenerator(bufferedWriter);
             this.writer = CSV_MAPPER.writerFor(OffsetRange.class)
                     .with(fileIsNew ? SCHEMA.withHeader() : SCHEMA);
@@ -111,7 +108,6 @@ public void flush() throws IOException {
         public void close() throws IOException {
             generator.close();
             bufferedWriter.close();
-            fileWriter.close();
         }
     }
 }
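
The reworked `Writer` checks for an existing, non-empty file before opening it in append mode, so the CSV header is written exactly once across repeated appends. A self-contained sketch of that pattern, with made-up column names (the real schema comes from Jackson's `CsvSchema` for `OffsetRange`):

```java
import java.io.BufferedWriter;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.StandardOpenOption;

public class AppendWithHeaderSketch {
    public static void main(String[] args) throws IOException {
        Path path = Paths.get("offsets-example.csv");
        // Decide on the header *before* opening the file: APPEND + CREATE
        // never truncates, so an existing file keeps its previous contents.
        // The short-circuit avoids Files.size() on a missing file.
        boolean fileIsNew = !Files.exists(path) || Files.size(path) == 0;
        try (BufferedWriter bw = Files.newBufferedWriter(path,
                StandardOpenOption.APPEND, StandardOpenOption.CREATE)) {
            if (fileIsNew) {
                bw.write("topic,partition,offsetFrom,offsetTo\n"); // hypothetical columns
            }
            bw.write("example_topic,0,0,100\n"); // hypothetical row
        }
    }
}
```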

src/main/java/org/radarcns/OffsetRangeSet.java

Lines changed: 0 additions & 2 deletions
@@ -17,9 +17,7 @@
 package org.radarcns;
 
 import javax.annotation.Nonnull;
-import java.util.HashMap;
 import java.util.Iterator;
-import java.util.Map;
 import java.util.NoSuchElementException;
 import java.util.SortedMap;
 import java.util.SortedSet;

src/main/java/org/radarcns/RestructureAvroRecords.java

Lines changed: 20 additions & 18 deletions
@@ -34,9 +34,10 @@
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import java.io.File;
-import java.io.FileWriter;
 import java.io.IOException;
+import java.io.Writer;
+import java.nio.file.Files;
+import java.nio.file.Paths;
 import java.text.SimpleDateFormat;
 import java.util.ArrayList;
 import java.util.Date;
@@ -49,9 +50,9 @@ public class RestructureAvroRecords {
     private static final Logger logger = LoggerFactory.getLogger(RestructureAvroRecords.class);
 
     private final String outputFileExtension;
-    private static final String OFFSETS_FILE_NAME = "offsets.csv";
-    private static final String BINS_FILE_NAME = "bins.csv";
-    private static final String SCHEMA_OUTPUT_FILE_NAME = "schema.json";
+    private static final java.nio.file.Path OFFSETS_FILE_NAME = Paths.get("offsets.csv");
+    private static final java.nio.file.Path BINS_FILE_NAME = Paths.get("bins.csv");
+    private static final java.nio.file.Path SCHEMA_OUTPUT_FILE_NAME = Paths.get("schema.json");
     private static final SimpleDateFormat FILE_DATE_FORMAT = new SimpleDateFormat("yyyyMMdd_HH");
 
     static {
@@ -60,15 +61,16 @@ public class RestructureAvroRecords {
 
     private final RecordConverterFactory converterFactory;
 
-    private File outputPath;
-    private File offsetsPath;
+    private java.nio.file.Path outputPath;
+    private java.nio.file.Path offsetsPath;
     private Frequency bins;
 
     private final Configuration conf = new Configuration();
 
     private long processedFileCount;
     private long processedRecordsCount;
     private static final boolean USE_GZIP = "gzip".equalsIgnoreCase(System.getProperty("org.radarcns.compression"));
+    private static final boolean DO_DEDUPLICATE = "true".equalsIgnoreCase(System.getProperty("org.radarcns.deduplicate", "true"));
 
     public static void main(String [] args) throws Exception {
         if (args.length != 3) {
@@ -121,9 +123,9 @@ public void setInputWebHdfsURL(String fileSystemURL) {
 
     public void setOutputPath(String path) {
         // Remove trailing backslash
-        outputPath = new File(path.replaceAll("/$",""));
-        offsetsPath = new File(outputPath, OFFSETS_FILE_NAME);
-        bins = Frequency.read(new File(outputPath, BINS_FILE_NAME));
+        outputPath = Paths.get(path.replaceAll("/$", ""));
+        offsetsPath = outputPath.resolve(OFFSETS_FILE_NAME);
+        bins = Frequency.read(outputPath.resolve(BINS_FILE_NAME));
     }
 
     public long getProcessedFileCount() {
@@ -174,7 +176,7 @@ public void start(String directoryName) throws IOException {
 
         // Actually process the files
         for (Map.Entry<String, List<Path>> entry : topicPaths.entrySet()) {
-            try (FileCacheStore cache = new FileCacheStore(converterFactory, 100, USE_GZIP)) {
+            try (FileCacheStore cache = new FileCacheStore(converterFactory, 100, USE_GZIP, DO_DEDUPLICATE)) {
                 for (Path filePath : entry.getValue()) {
                     this.processFile(filePath, entry.getKey(), cache, offsets);
                     progressBar.update(++processedFileCount);
@@ -258,16 +260,16 @@ private void writeRecord(GenericRecord record, String topicName, FileCacheStore
 
         // Clean user id and create final output pathname
         String userId = keyField.get("userId").toString().replaceAll("[^a-zA-Z0-9_-]+", "");
-        File userDir = new File(this.outputPath, userId);
-        File userTopicDir = new File(userDir, topicName);
-        File outputFile = new File(userTopicDir, outputFileName);
+        java.nio.file.Path userDir = this.outputPath.resolve(userId);
+        java.nio.file.Path userTopicDir = userDir.resolve(topicName);
+        java.nio.file.Path outputPath = userTopicDir.resolve(outputFileName);
 
         // Write data
-        cache.writeRecord(outputFile, record);
+        cache.writeRecord(outputPath, record);
 
-        File schemaFile = new File(userTopicDir, SCHEMA_OUTPUT_FILE_NAME);
-        if (!schemaFile.exists()) {
-            try (FileWriter writer = new FileWriter(schemaFile, false)) {
+        java.nio.file.Path schemaPath = userTopicDir.resolve(SCHEMA_OUTPUT_FILE_NAME);
+        if (!Files.exists(schemaPath)) {
+            try (Writer writer = Files.newBufferedWriter(schemaPath)) {
                 writer.write(record.getSchema().toString(true));
             }
         }
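
The new `DO_DEDUPLICATE` flag is read once, at class-load time, from a system property whose absence counts as `"true"`. A minimal sketch of that idiom, lifted from the constant above:

```java
public class DeduplicateFlagSketch {
    // Unset defaults to "true"; any value other than "true"
    // (case-insensitive), e.g. -Dorg.radarcns.deduplicate=false,
    // disables deduplication.
    private static final boolean DO_DEDUPLICATE =
            "true".equalsIgnoreCase(System.getProperty("org.radarcns.deduplicate", "true"));

    public static void main(String[] args) {
        System.out.println("deduplicate = " + DO_DEDUPLICATE);
    }
}
```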

src/main/java/org/radarcns/util/CsvAvroConverter.java

Lines changed: 17 additions & 6 deletions
@@ -21,17 +21,18 @@
 import com.fasterxml.jackson.dataformat.csv.CsvGenerator;
 import com.fasterxml.jackson.dataformat.csv.CsvMapper;
 import com.fasterxml.jackson.dataformat.csv.CsvSchema;
+import org.apache.avro.Schema;
+import org.apache.avro.Schema.Field;
+import org.apache.avro.generic.GenericData;
+import org.apache.avro.generic.GenericFixed;
+import org.apache.avro.generic.GenericRecord;
+
 import java.io.IOException;
 import java.io.Writer;
 import java.nio.ByteBuffer;
 import java.util.LinkedHashMap;
 import java.util.List;
 import java.util.Map;
-import org.apache.avro.Schema;
-import org.apache.avro.Schema.Field;
-import org.apache.avro.generic.GenericData;
-import org.apache.avro.generic.GenericFixed;
-import org.apache.avro.generic.GenericRecord;
 
 /**
  * Converts deep hierarchical Avro records into flat CSV format. It uses a simple dot syntax in the
@@ -42,7 +43,17 @@ public class CsvAvroConverter implements RecordConverter {
 
     public static RecordConverterFactory getFactory() {
         CsvFactory factory = new CsvFactory();
-        return (writer, record, writeHeader) -> new CsvAvroConverter(factory, writer, record, writeHeader);
+        return new RecordConverterFactory() {
+            @Override
+            public RecordConverter converterFor(Writer writer, GenericRecord record, boolean writeHeader) throws IOException {
+                return new CsvAvroConverter(factory, writer, record, writeHeader);
+            }
+
+            @Override
+            public boolean hasHeader() {
+                return true;
+            }
+        };
     }
 
     private final ObjectWriter csvWriter;
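
The lambda had to go because a lambda can only implement a functional interface, i.e. one with a single abstract method, and this diff implies `RecordConverterFactory` now also declares `hasHeader()`. A sketch of the presumed interface shape (inferred from the anonymous class above, not taken from the commit):

```java
import java.io.IOException;
import java.io.Writer;
import org.apache.avro.generic.GenericRecord;

interface RecordConverter { /* stand-in for the real org.radarcns.util contract */ }

// Inferred shape: two abstract methods force an anonymous (or named) class.
// Had hasHeader() been added as a default method instead, the interface would
// have stayed functional and the original one-line lambda would still compile.
interface RecordConverterFactorySketch {
    RecordConverter converterFor(Writer writer, GenericRecord record, boolean writeHeader)
            throws IOException;

    boolean hasHeader();
}
```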
