 import java.io.File;
 import java.io.IOException;
 import java.text.SimpleDateFormat;
+import java.util.ArrayList;
 import java.util.Date;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
 import java.util.TimeZone;
 import org.apache.avro.Schema.Field;
 import org.apache.avro.file.DataFileReader;
 import org.apache.avro.generic.GenericDatumReader;
 import org.apache.avro.generic.GenericRecord;
 import org.apache.avro.mapred.FsInput;
-import org.apache.commons.io.FilenameUtils;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.LocatedFileStatus;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.fs.RemoteIterator;
 import org.radarcns.util.CsvAvroConverter;
-import org.radarcns.util.FileCache;
+import org.radarcns.util.FileCacheStore;
 import org.radarcns.util.JsonAvroConverter;
+import org.radarcns.util.ProgressBar;
 import org.radarcns.util.RecordConverterFactory;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -55,17 +59,17 @@ public class RestructureAvroRecords {
 
     private File outputPath;
     private File offsetsPath;
-    private OffsetRangeSet seenFiles;
     private Frequency bins;
 
     private final Configuration conf = new Configuration();
 
-    private int processedFileCount;
-    private int processedRecordsCount;
+    private long processedFileCount;
+    private long processedRecordsCount;
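+    // Gzip the output files only when the JVM is started with -Dorg.radarcns.compression=gzip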
+    private static final boolean USE_GZIP = "gzip".equalsIgnoreCase(System.getProperty("org.radarcns.compression"));
 
     public static void main(String[] args) throws Exception {
         if (args.length != 3) {
-            System.out.println("Usage: hadoop jar restructurehdfs-all-0.1.0.jar <webhdfs_url> <hdfs_topic> <output_folder>");
+            System.out.println("Usage: hadoop jar restructurehdfs-all-0.2.jar <webhdfs_url> <hdfs_root_directory> <output_folder>");
             System.exit(1);
         }
 
@@ -91,13 +95,21 @@ public RestructureAvroRecords(String inputPath, String outputPath) {
         this.setInputWebHdfsURL(inputPath);
         this.setOutputPath(outputPath);
 
+        String extension;
         if (System.getProperty("org.radarcns.format", "csv").equalsIgnoreCase("json")) {
+            logger.info("Writing output files in JSON format");
             converterFactory = JsonAvroConverter.getFactory();
-            outputFileExtension = "json";
+            extension = "json";
         } else {
+            logger.info("Writing output files in CSV format");
             converterFactory = CsvAvroConverter.getFactory();
-            outputFileExtension = "csv";
+            extension = "csv";
         }
+        if (USE_GZIP) {
+            logger.info("Compressing output files in GZIP format");
+            extension += ".gz";
+        }
+        outputFileExtension = extension;
     }
 
     public void setInputWebHdfsURL(String fileSystemURL) {
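The output format and compression are selected with JVM system properties rather than extra command-line arguments. A minimal invocation sketch, assuming the properties are passed to the client JVM through HADOOP_OPTS (the exact variable may differ per Hadoop installation); leaving both properties unset keeps the defaults of uncompressed CSV:

    export HADOOP_OPTS="-Dorg.radarcns.format=json -Dorg.radarcns.compression=gzip"
    hadoop jar restructurehdfs-all-0.2.jar <webhdfs_url> <hdfs_root_directory> <output_folder>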
@@ -111,83 +123,100 @@ public void setOutputPath(String path) {
         bins = Frequency.read(new File(outputPath, BINS_FILE_NAME));
     }
 
-    public int getProcessedFileCount() {
+    public long getProcessedFileCount() {
         return processedFileCount;
     }
 
-    public int getProcessedRecordsCount() {
+    public long getProcessedRecordsCount() {
         return processedRecordsCount;
     }
 
     public void start(String directoryName) throws IOException {
         // Get files and directories
         Path path = new Path(directoryName);
         FileSystem fs = FileSystem.get(conf);
-        RemoteIterator<LocatedFileStatus> files = fs.listLocatedStatus(path);
+
 
         try (OffsetRangeFile offsets = new OffsetRangeFile(offsetsPath)) {
+            OffsetRangeSet seenFiles;
             try {
                 seenFiles = offsets.read();
             } catch (IOException ex) {
                 logger.error("Error reading offsets file. Processing all offsets.");
                 seenFiles = new OffsetRangeSet();
             }
-            // Process the directories topics
-            processedFileCount = 0;
+            logger.info("Retrieving file list from {}", path);
+            // Get filenames to process
+            Map<String, List<Path>> topicPaths = new HashMap<>();
+            long toProcessFileCount = 0L;
+            processedFileCount = 0L;
+            RemoteIterator<LocatedFileStatus> files = fs.listFiles(path, true);
             while (files.hasNext()) {
                 LocatedFileStatus locatedFileStatus = files.next();
-                Path filePath = locatedFileStatus.getPath();
-
-                if (filePath.toString().contains("+tmp")) {
+                if (locatedFileStatus.isDirectory()) {
                     continue;
                 }
+                Path filePath = locatedFileStatus.getPath();
 
-                if (locatedFileStatus.isDirectory()) {
-                    processTopic(filePath, converterFactory, offsets);
+                String topic = getTopic(filePath, seenFiles);
+                if (topic != null) {
+                    topicPaths.computeIfAbsent(topic, k -> new ArrayList<>()).add(filePath);
+                    toProcessFileCount++;
                 }
             }
-        }
-    }
 
-    private void processTopic(Path topicPath, RecordConverterFactory converterFactory,
-                              OffsetRangeFile offsets) throws IOException {
-        // Get files in this topic directory
-        FileSystem fs = FileSystem.get(conf);
-        RemoteIterator<LocatedFileStatus> files = fs.listFiles(topicPath, true);
+            logger.info("Converting {} files", toProcessFileCount);
 
-        String topicName = topicPath.getName();
-
-        try (FileCache cache = new FileCache(converterFactory, 100)) {
-            while (files.hasNext()) {
-                LocatedFileStatus locatedFileStatus = files.next();
+            ProgressBar progressBar = new ProgressBar(toProcessFileCount, 10);
+            progressBar.update(0);
 
-                if (locatedFileStatus.isFile()) {
-                    this.processFile(locatedFileStatus.getPath(), topicName, cache, offsets);
+            // Actually process the files
+            for (Map.Entry<String, List<Path>> entry : topicPaths.entrySet()) {
+                try (FileCacheStore cache = new FileCacheStore(converterFactory, 100, USE_GZIP)) {
+                    for (Path filePath : entry.getValue()) {
+                        this.processFile(filePath, entry.getKey(), cache, offsets);
+                        progressBar.update(++processedFileCount);
+                    }
                 }
             }
         }
     }
 
-    private void processFile(Path filePath, String topicName, FileCache cache,
-                             OffsetRangeFile offsets) throws IOException {
-        String fileName = filePath.getName();
+    private static String getTopic(Path filePath, OffsetRangeSet seenFiles) {
+        if (filePath.toString().contains("+tmp")) {
+            return null;
+        }
 
+        String fileName = filePath.getName();
         // Skip if extension is not .avro
-        if (!FilenameUtils.getExtension(fileName).equals("avro")) {
-            logger.info("Skipped non-avro file: {}", fileName);
-            return;
+        if (!fileName.endsWith(".avro")) {
+            logger.info("Skipping non-avro file: {}", fileName);
+            return null;
         }
 
         OffsetRange range = OffsetRange.parse(fileName);
         // Skip already processed avro files
         if (seenFiles.contains(range)) {
-            return;
+            return null;
         }
 
-        logger.info("{}", filePath);
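+        // The topic name is taken from the file's grandparent directory (layout: <topic>/<partition>/<file>.avro)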
+        return filePath.getParent().getParent().getName();
+    }
+
+    private void processFile(Path filePath, String topicName, FileCacheStore cache,
+            OffsetRangeFile offsets) throws IOException {
+        logger.debug("Reading {}", filePath);
 
         // Read and parse avro file
         FsInput input = new FsInput(filePath, conf);
+
+        // processing zero-length files may trigger a stall. See:
+        // https://github.com/RADAR-CNS/Restructure-HDFS-topic/issues/3
+        if (input.length() == 0) {
+            logger.warn("File {} has zero length, skipping.", filePath);
+            return;
+        }
+
         DataFileReader<GenericRecord> dataFileReader = new DataFileReader<>(input,
                 new GenericDatumReader<>());
 
@@ -201,15 +230,15 @@ record = dataFileReader.next(record);
 
         // Write which file has been processed and update bins
         try {
+            OffsetRange range = OffsetRange.parse(filePath.getName());
             offsets.write(range);
             bins.write();
         } catch (IOException ex) {
             logger.warn("Failed to update status. Continuing processing.", ex);
         }
-        processedFileCount++;
     }
 
-    private void writeRecord(GenericRecord record, String topicName, FileCache cache)
+    private void writeRecord(GenericRecord record, String topicName, FileCacheStore cache)
             throws IOException {
         GenericRecord keyField = (GenericRecord) record.get("key");
         GenericRecord valueField = (GenericRecord) record.get("value");
@@ -256,4 +285,5 @@ public static String createHourTimestamp(GenericRecord valueField, Field timeFie
         Date date = new Date((long) (time * 1000d));
         return FILE_DATE_FORMAT.format(date);
     }
+
 }