Skip to content

Commit d225f6f

Browse files
authored
Merge pull request #12 from RADAR-CNS/v0.3.1_release
V0.3.1 release
2 parents b7ce9b6 + 46be271 commit d225f6f

File tree

6 files changed

+61
-25
lines changed

6 files changed

+61
-25
lines changed

README.md

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -15,20 +15,20 @@ Build jar from source with
1515
```shell
1616
./gradlew build
1717
```
18-
and find the output JAR file as `build/libs/restructurehdfs-0.3-all.jar`. Then run with:
18+
and find the output JAR file as `build/libs/restructurehdfs-0.3.1-all.jar`. Then run with:
1919

2020
```shell
21-
java -jar restructurehdfs-0.3-all.jar <webhdfs_url> <hdfs_topic_path> <output_folder>
21+
java -jar restructurehdfs-0.3.1-all.jar <webhdfs_url> <hdfs_topic_path> <output_folder>
2222
```
2323

2424
By default, this will output the data in CSV format. If JSON format is preferred, use the following instead:
2525
```
26-
java -Dorg.radarcns.format=json -jar restructurehdfs-0.3-all.jar <webhdfs_url> <hdfs_topic_path> <output_folder>
26+
java -Dorg.radarcns.format=json -jar restructurehdfs-0.3.1-all.jar <webhdfs_url> <hdfs_topic_path> <output_folder>
2727
```
2828

2929
Another option is to output the data in compressed form. All files will get the `gz` suffix, and can be decompressed with a GZIP decoder. Note that for a very small number of records, this may actually increase the file size.
3030
```
31-
java -Dorg.radarcns.compress=gzip -jar restructurehdfs-0.3-all.jar <webhdfs_url> <hdfs_topic_path> <output_folder>
31+
java -Dorg.radarcns.compress=gzip -jar restructurehdfs-0.3.1-all.jar <webhdfs_url> <hdfs_topic_path> <output_folder>
3232
```
3333

3434
Finally, files records are deduplicated after writing. To disable this behaviour, specify the option `-Dorg.radarcns.deduplicate=false`.

build.gradle

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ apply plugin: 'java'
22
apply plugin: 'application'
33

44
group 'org.radarcns.restructurehdfs'
5-
version '0.3'
5+
version '0.3.1'
66
mainClassName = 'org.radarcns.RestructureAvroRecords'
77

88
run {

src/main/java/org/radarcns/RestructureAvroRecords.java

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -256,7 +256,7 @@ private void writeRecord(GenericRecord record, String topicName, FileCacheStore
256256
}
257257

258258
Date time = getDate(keyField, valueField);
259-
String outputFileName = createFilename(time);
259+
java.nio.file.Path outputFileName = createFilename(time);
260260

261261
// Clean user id and create final output pathname
262262
String userId = keyField.get("userId").toString().replaceAll("[^a-zA-Z0-9_-]+", "");
@@ -279,14 +279,14 @@ private void writeRecord(GenericRecord record, String topicName, FileCacheStore
279279
processedRecordsCount++;
280280
}
281281

282-
private String createFilename(Date date) {
282+
private java.nio.file.Path createFilename(Date date) {
283283
if (date == null) {
284284
logger.warn("Time field of record valueField is not set");
285-
return "unknown_date." + outputFileExtension;
285+
return Paths.get("unknown_date." + outputFileExtension);
286286
}
287287
// Make a timestamped filename YYYYMMDD_HH00.json
288288
String hourlyTimestamp = createHourTimestamp(date);
289-
return hourlyTimestamp + "00." + outputFileExtension;
289+
return Paths.get(hourlyTimestamp + "00." + outputFileExtension);
290290
}
291291

292292
public static String createHourTimestamp(Date date) {

src/main/java/org/radarcns/util/RecordConverterFactory.java

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,8 +27,10 @@
2727
import java.io.OutputStreamWriter;
2828
import java.io.Reader;
2929
import java.io.Writer;
30+
import java.nio.file.FileSystems;
3031
import java.nio.file.Files;
3132
import java.nio.file.Path;
33+
import java.nio.file.PathMatcher;
3234
import java.util.ArrayList;
3335
import java.util.Collection;
3436
import java.util.Collections;
@@ -40,6 +42,8 @@
4042

4143
@FunctionalInterface
4244
public interface RecordConverterFactory {
45+
PathMatcher GZ_FILE_MATCHER = FileSystems.getDefault().getPathMatcher("glob:**.gz");
46+
4347
/**
4448
* Create a converter to write records of given type to given writer. A header is needed only
4549
* in certain converters. The given record is not converted yet, it is only used as an example.
@@ -61,7 +65,7 @@ default void sortUnique(Path path) throws IOException {
6165
Path tempOut = Files.createTempFile("tempfile", ".tmp");
6266
String header;
6367
boolean withHeader = hasHeader();
64-
if (path.getFileName().endsWith(".gz")) {
68+
if (GZ_FILE_MATCHER.matches(path)) {
6569
try (InputStream fileIn = Files.newInputStream(path);
6670
GZIPInputStream gzipIn = new GZIPInputStream(fileIn);
6771
Reader inReader = new InputStreamReader(gzipIn);

src/test/java/org/radarcns/util/CsvAvroConverterTest.java

Lines changed: 44 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -19,14 +19,21 @@
1919
import static org.junit.Assert.assertArrayEquals;
2020
import static org.junit.Assert.assertEquals;
2121
import static org.junit.Assert.assertFalse;
22+
import static org.junit.Assert.assertNull;
2223
import static org.junit.Assert.assertTrue;
2324

2425
import com.fasterxml.jackson.databind.JsonMappingException;
2526

2627
import java.io.BufferedReader;
2728
import java.io.BufferedWriter;
2829
import java.io.IOException;
30+
import java.io.InputStream;
31+
import java.io.InputStreamReader;
32+
import java.io.OutputStream;
33+
import java.io.OutputStreamWriter;
34+
import java.io.Reader;
2935
import java.io.StringWriter;
36+
import java.io.Writer;
3037
import java.nio.file.Files;
3138
import java.nio.file.Path;
3239
import java.util.ArrayList;
@@ -36,6 +43,9 @@
3643
import java.util.List;
3744
import java.util.Map;
3845
import java.util.Set;
46+
import java.util.zip.GZIPInputStream;
47+
import java.util.zip.GZIPOutputStream;
48+
3949
import org.apache.avro.Schema;
4050
import org.apache.avro.Schema.Parser;
4151
import org.apache.avro.SchemaBuilder;
@@ -146,19 +156,46 @@ public void subSchema() throws IOException {
146156
System.out.println(writer.toString());
147157
}
148158

159+
static void writeTestNumbers(Writer writer) throws IOException {
160+
writer.write("a,b\n");
161+
writer.write("1,2\n");
162+
writer.write("3,4\n");
163+
writer.write("1,3\n");
164+
writer.write("3,4\n");
165+
writer.write("1,2\n");
166+
writer.write("a,a\n");
167+
}
168+
149169
@Test
150170
public void deduplicate() throws IOException {
151171
Path path = folder.newFile().toPath();
152172
try (BufferedWriter writer = Files.newBufferedWriter(path)) {
153-
writer.write("a,b\n");
154-
writer.write("1,2\n");
155-
writer.write("3,4\n");
156-
writer.write("1,3\n");
157-
writer.write("3,4\n");
158-
writer.write("1,2\n");
159-
writer.write("a,a\n");
173+
writeTestNumbers(writer);
160174
}
161175
CsvAvroConverter.getFactory().sortUnique(path);
162176
assertEquals(Arrays.asList("a,b", "1,2", "1,3", "3,4", "a,a"), Files.readAllLines(path));
163177
}
178+
179+
180+
@Test
181+
public void deduplicateGzip() throws IOException {
182+
Path path = folder.newFile("test.csv.gz").toPath();
183+
try (OutputStream out = Files.newOutputStream(path);
184+
GZIPOutputStream gzipOut = new GZIPOutputStream(out);
185+
Writer writer = new OutputStreamWriter(gzipOut)) {
186+
writeTestNumbers(writer);
187+
}
188+
CsvAvroConverter.getFactory().sortUnique(path);
189+
try (InputStream in = Files.newInputStream(path);
190+
GZIPInputStream gzipIn = new GZIPInputStream(in);
191+
Reader inReader = new InputStreamReader(gzipIn);
192+
BufferedReader reader = new BufferedReader(inReader)) {
193+
assertEquals("a,b", reader.readLine());
194+
assertEquals("1,2", reader.readLine());
195+
assertEquals("1,3", reader.readLine());
196+
assertEquals("3,4", reader.readLine());
197+
assertEquals("a,a", reader.readLine());
198+
assertNull(reader.readLine());
199+
}
200+
}
164201
}

src/test/java/org/radarcns/util/JsonAvroConverterTest.java

Lines changed: 3 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
package org.radarcns.util;
1818

1919
import static org.junit.Assert.assertEquals;
20+
import static org.radarcns.util.CsvAvroConverterTest.writeTestNumbers;
2021

2122
import com.fasterxml.jackson.databind.ObjectMapper;
2223
import com.fasterxml.jackson.databind.ObjectWriter;
@@ -81,15 +82,9 @@ public void fullAvroTest() throws IOException {
8182
public void deduplicate() throws IOException {
8283
Path path = folder.newFile().toPath();
8384
try (BufferedWriter writer = Files.newBufferedWriter(path)) {
84-
writer.write("\"a,b\"\n");
85-
writer.write("\"1,2\"\n");
86-
writer.write("\"3,4\"\n");
87-
writer.write("\"1,3\"\n");
88-
writer.write("\"3,4\"\n");
89-
writer.write("\"1,2\"\n");
90-
writer.write("\"a,a\"\n");
85+
writeTestNumbers(writer);
9186
}
9287
JsonAvroConverter.getFactory().sortUnique(path);
93-
assertEquals(Arrays.asList("\"1,2\"", "\"1,3\"", "\"3,4\"", "\"a,a\"", "\"a,b\""), Files.readAllLines(path));
88+
assertEquals(Arrays.asList("1,2", "1,3", "3,4", "a,a", "a,b"), Files.readAllLines(path));
9489
}
9590
}

0 commit comments

Comments
 (0)