Merge pull request #12 from RADAR-CNS/v0.3.1_release

blootsvoets · web-flow · commit d225f6f900b2 · 2017-11-15T12:29:16.000+01:00
V0.3.1 release
diff --git a/README.md b/README.md
@@ -15,20 +15,20 @@ Build jar from source with
 ```shell
 ./gradlew build
 ```
-and find the output JAR file as `build/libs/restructurehdfs-0.3-all.jar`. Then run with:
+and find the output JAR file as `build/libs/restructurehdfs-0.3.1-all.jar`. Then run with:
 
 ```shell
-java -jar restructurehdfs-0.3-all.jar <webhdfs_url> <hdfs_topic_path> <output_folder>
+java -jar restructurehdfs-0.3.1-all.jar <webhdfs_url> <hdfs_topic_path> <output_folder>
 ```
 
 By default, this will output the data in CSV format. If JSON format is preferred, use the following instead:
 ```
-java -Dorg.radarcns.format=json -jar restructurehdfs-0.3-all.jar <webhdfs_url> <hdfs_topic_path> <output_folder>
+java -Dorg.radarcns.format=json -jar restructurehdfs-0.3.1-all.jar <webhdfs_url> <hdfs_topic_path> <output_folder>
 ```
 
 Another option is to output the data in compressed form. All files will get the `gz` suffix, and can be decompressed with a GZIP decoder. Note that for a very small number of records, this may actually increase the file size.
 ```
-java -Dorg.radarcns.compress=gzip -jar restructurehdfs-0.3-all.jar <webhdfs_url> <hdfs_topic_path> <output_folder>
+java -Dorg.radarcns.compress=gzip -jar restructurehdfs-0.3.1-all.jar <webhdfs_url> <hdfs_topic_path> <output_folder>
 ```
 
 Finally, file records are deduplicated after writing. To disable this behaviour, specify the option `-Dorg.radarcns.deduplicate=false`.
diff --git a/build.gradle b/build.gradle
@@ -2,7 +2,7 @@ apply plugin: 'java'
 apply plugin: 'application'
 
 group 'org.radarcns.restructurehdfs'
-version '0.3'
+version '0.3.1'
 mainClassName = 'org.radarcns.RestructureAvroRecords'
 
 run {
diff --git a/src/main/java/org/radarcns/RestructureAvroRecords.java b/src/main/java/org/radarcns/RestructureAvroRecords.java
@@ -256,7 +256,7 @@ private void writeRecord(GenericRecord record, String topicName, FileCacheStore
         }
 
         Date time = getDate(keyField, valueField);
-        String outputFileName = createFilename(time);
+        java.nio.file.Path outputFileName = createFilename(time);
 
         // Clean user id and create final output pathname
         String userId = keyField.get("userId").toString().replaceAll("[^a-zA-Z0-9_-]+", "");
@@ -279,14 +279,14 @@ private void writeRecord(GenericRecord record, String topicName, FileCacheStore
         processedRecordsCount++;
     }
 
-    private String createFilename(Date date) {
+    private java.nio.file.Path createFilename(Date date) {
         if (date == null) {
             logger.warn("Time field of record valueField is not set");
-            return "unknown_date." + outputFileExtension;
+            return Paths.get("unknown_date." + outputFileExtension);
         }
         // Make a timestamped filename YYYYMMDD_HH00.json
         String hourlyTimestamp = createHourTimestamp(date);
-        return hourlyTimestamp + "00." + outputFileExtension;
+        return Paths.get(hourlyTimestamp + "00." + outputFileExtension);
     }
 
     public static String createHourTimestamp(Date date) {
diff --git a/src/main/java/org/radarcns/util/RecordConverterFactory.java b/src/main/java/org/radarcns/util/RecordConverterFactory.java
@@ -27,8 +27,10 @@
 import java.io.OutputStreamWriter;
 import java.io.Reader;
 import java.io.Writer;
+import java.nio.file.FileSystems;
 import java.nio.file.Files;
 import java.nio.file.Path;
+import java.nio.file.PathMatcher;
 import java.util.ArrayList;
 import java.util.Collection;
 import java.util.Collections;
@@ -40,6 +42,8 @@
 
 @FunctionalInterface
 public interface RecordConverterFactory {
+    PathMatcher GZ_FILE_MATCHER = FileSystems.getDefault().getPathMatcher("glob:**.gz");
+
     /**
      * Create a converter to write records of given type to given writer. A header is needed only
      * in certain converters. The given record is not converted yet, it is only used as an example.
@@ -61,7 +65,7 @@ default void sortUnique(Path path) throws IOException {
         Path tempOut = Files.createTempFile("tempfile", ".tmp");
         String header;
         boolean withHeader = hasHeader();
-        if (path.getFileName().endsWith(".gz")) {
+        if (GZ_FILE_MATCHER.matches(path)) {
             try (InputStream fileIn = Files.newInputStream(path);
                  GZIPInputStream gzipIn = new GZIPInputStream(fileIn);
                  Reader inReader = new InputStreamReader(gzipIn);
diff --git a/src/test/java/org/radarcns/util/CsvAvroConverterTest.java b/src/test/java/org/radarcns/util/CsvAvroConverterTest.java
@@ -19,14 +19,21 @@
 import static org.junit.Assert.assertArrayEquals;
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertNull;
 import static org.junit.Assert.assertTrue;
 
 import com.fasterxml.jackson.databind.JsonMappingException;
 
 import java.io.BufferedReader;
 import java.io.BufferedWriter;
 import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.OutputStream;
+import java.io.OutputStreamWriter;
+import java.io.Reader;
 import java.io.StringWriter;
+import java.io.Writer;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.util.ArrayList;
@@ -36,6 +43,9 @@
 import java.util.List;
 import java.util.Map;
 import java.util.Set;
+import java.util.zip.GZIPInputStream;
+import java.util.zip.GZIPOutputStream;
+
 import org.apache.avro.Schema;
 import org.apache.avro.Schema.Parser;
 import org.apache.avro.SchemaBuilder;
@@ -146,19 +156,46 @@ public void subSchema() throws IOException {
         System.out.println(writer.toString());
     }
 
+    static void writeTestNumbers(Writer writer) throws IOException {
+        writer.write("a,b\n");
+        writer.write("1,2\n");
+        writer.write("3,4\n");
+        writer.write("1,3\n");
+        writer.write("3,4\n");
+        writer.write("1,2\n");
+        writer.write("a,a\n");
+    }
+
     @Test
     public void deduplicate() throws IOException {
         Path path = folder.newFile().toPath();
         try (BufferedWriter writer = Files.newBufferedWriter(path)) {
-            writer.write("a,b\n");
-            writer.write("1,2\n");
-            writer.write("3,4\n");
-            writer.write("1,3\n");
-            writer.write("3,4\n");
-            writer.write("1,2\n");
-            writer.write("a,a\n");
+            writeTestNumbers(writer);
         }
         CsvAvroConverter.getFactory().sortUnique(path);
         assertEquals(Arrays.asList("a,b", "1,2", "1,3", "3,4", "a,a"), Files.readAllLines(path));
     }
+
+
+    @Test
+    public void deduplicateGzip() throws IOException {
+        Path path = folder.newFile("test.csv.gz").toPath();
+        try (OutputStream out = Files.newOutputStream(path);
+             GZIPOutputStream gzipOut = new GZIPOutputStream(out);
+             Writer writer = new OutputStreamWriter(gzipOut)) {
+            writeTestNumbers(writer);
+        }
+        CsvAvroConverter.getFactory().sortUnique(path);
+        try (InputStream in = Files.newInputStream(path);
+                GZIPInputStream gzipIn = new GZIPInputStream(in);
+                Reader inReader = new InputStreamReader(gzipIn);
+                BufferedReader reader = new BufferedReader(inReader)) {
+            assertEquals("a,b", reader.readLine());
+            assertEquals("1,2", reader.readLine());
+            assertEquals("1,3", reader.readLine());
+            assertEquals("3,4", reader.readLine());
+            assertEquals("a,a", reader.readLine());
+            assertNull(reader.readLine());
+        }
+    }
 }
diff --git a/src/test/java/org/radarcns/util/JsonAvroConverterTest.java b/src/test/java/org/radarcns/util/JsonAvroConverterTest.java
@@ -17,6 +17,7 @@
 package org.radarcns.util;
 
 import static org.junit.Assert.assertEquals;
+import static org.radarcns.util.CsvAvroConverterTest.writeTestNumbers;
 
 import com.fasterxml.jackson.databind.ObjectMapper;
 import com.fasterxml.jackson.databind.ObjectWriter;
@@ -81,15 +82,9 @@ public void fullAvroTest() throws IOException {
     public void deduplicate() throws IOException {
         Path path = folder.newFile().toPath();
         try (BufferedWriter writer = Files.newBufferedWriter(path)) {
-            writer.write("\"a,b\"\n");
-            writer.write("\"1,2\"\n");
-            writer.write("\"3,4\"\n");
-            writer.write("\"1,3\"\n");
-            writer.write("\"3,4\"\n");
-            writer.write("\"1,2\"\n");
-            writer.write("\"a,a\"\n");
+            writeTestNumbers(writer);
         }
         JsonAvroConverter.getFactory().sortUnique(path);
-        assertEquals(Arrays.asList("\"1,2\"", "\"1,3\"", "\"3,4\"", "\"a,a\"", "\"a,b\""), Files.readAllLines(path));
+        assertEquals(Arrays.asList("1,2", "1,3", "3,4", "a,a", "a,b"), Files.readAllLines(path));
     }
 }

Original file line number	Diff line number	Diff line change
`@@ -256,7 +256,7 @@ private void writeRecord(GenericRecord record, String topicName, FileCacheStore`
`256`	`256`	`}`
`257`	`257`
`258`	`258`	`Date time = getDate(keyField, valueField);`
`259`		`- String outputFileName = createFilename(time);`
	`259`	`+ java.nio.file.Path outputFileName = createFilename(time);`
`260`	`260`
`261`	`261`	`// Clean user id and create final output pathname`
`262`	`262`	`String userId = keyField.get("userId").toString().replaceAll("[^a-zA-Z0-9_-]+", "");`
`@@ -279,14 +279,14 @@ private void writeRecord(GenericRecord record, String topicName, FileCacheStore`
`279`	`279`	`processedRecordsCount++;`
`280`	`280`	`}`
`281`	`281`
`282`		`- private String createFilename(Date date) {`
	`282`	`+ private java.nio.file.Path createFilename(Date date) {`
`283`	`283`	`if (date == null) {`
`284`	`284`	`logger.warn("Time field of record valueField is not set");`
`285`		`- return "unknown_date." + outputFileExtension;`
	`285`	`+ return Paths.get("unknown_date." + outputFileExtension);`
`286`	`286`	`}`
`287`	`287`	`// Make a timestamped filename YYYYMMDD_HH00.json`
`288`	`288`	`String hourlyTimestamp = createHourTimestamp(date);`
`289`		`- return hourlyTimestamp + "00." + outputFileExtension;`
	`289`	`+ return Paths.get(hourlyTimestamp + "00." + outputFileExtension);`
`290`	`290`	`}`
`291`	`291`
`292`	`292`	`public static String createHourTimestamp(Date date) {`