
Commit b41bd06

Merge pull request #17 from RADAR-base/enhancements
Enhancements and fixes
2 parents f12fa1f + fe1971e commit b41bd06

3 files changed: +28 −9 lines changed

README.md

Lines changed: 5 additions & 5 deletions
@@ -15,20 +15,20 @@ Build jar from source with
 ```shell
 ./gradlew build
 ```
-and find the output JAR file as `build/libs/restructurehdfs-0.3-all.jar`. Then run with:
+and find the output JAR file as `build/libs/restructurehdfs-0.3.1-all.jar`. Then run with:
 
 ```shell
-java -jar restructurehdfs-0.3-all.jar <webhdfs_url> <hdfs_topic_path> <output_folder>
+java -jar restructurehdfs-0.3.1-all.jar <webhdfs_url> <hdfs_topic_path> <output_folder>
 ```
 
 By default, this will output the data in CSV format. If JSON format is preferred, use the following instead:
 ```
-java -Dorg.radarcns.format=json -jar restructurehdfs-0.3-all.jar <webhdfs_url> <hdfs_topic_path> <output_folder>
+java -Dorg.radarcns.format=json -jar restructurehdfs-0.3.1-all.jar <webhdfs_url> <hdfs_topic_path> <output_folder>
 ```
 
 Another option is to output the data in compressed form. All files will get the `gz` suffix, and can be decompressed with a GZIP decoder. Note that for a very small number of records, this may actually increase the file size.
 ```
-java -Dorg.radarcns.compress=gzip -jar restructurehdfs-0.3-all.jar <webhdfs_url> <hdfs_topic_path> <output_folder>
+java -Dorg.radarcns.compress=gzip -jar restructurehdfs-0.3.1-all.jar <webhdfs_url> <hdfs_topic_path> <output_folder>
 ```
 
-Finally, files records are deduplicated after writing. To disable this behaviour, specify the option `-Dorg.radarcns.deduplicate=false`.
+Finally, file records are not deduplicated after writing by default. To enable this behaviour, specify the option `-Dorg.radarcns.deduplicate=true`. It is disabled by default because of an issue with Biovotion data; please see [issue #16](https://github.com/RADAR-base/Restructure-HDFS-topic/issues/16) before enabling it.
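
Unlike the other options documented above, the deduplication paragraph does not show a full command. A minimal sketch of an invocation that turns deduplication back on, reusing the placeholders from the earlier examples, could look like:

```shell
# Sketch only: explicitly re-enables deduplication; see issue #16 before doing so.
java -Dorg.radarcns.deduplicate=true -jar restructurehdfs-0.3.1-all.jar <webhdfs_url> <hdfs_topic_path> <output_folder>
```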

build.gradle

Lines changed: 1 addition & 1 deletion
@@ -2,7 +2,7 @@ apply plugin: 'java'
 apply plugin: 'application'
 
 group 'org.radarcns.restructurehdfs'
-version '0.3'
+version '0.3.2-SNAPSHOT'
 mainClassName = 'org.radarcns.RestructureAvroRecords'
 
 run {

src/main/java/org/radarcns/RestructureAvroRecords.java

Lines changed: 22 additions & 3 deletions
@@ -16,6 +16,7 @@
 
 package org.radarcns;
 
+import com.fasterxml.jackson.databind.JsonMappingException;
 import org.apache.avro.Schema.Field;
 import org.apache.avro.file.DataFileReader;
 import org.apache.avro.generic.GenericDatumReader;
@@ -70,7 +71,9 @@ public class RestructureAvroRecords {
     private long processedFileCount;
     private long processedRecordsCount;
     private static final boolean USE_GZIP = "gzip".equalsIgnoreCase(System.getProperty("org.radarcns.compression"));
-    private static final boolean DO_DEDUPLICATE = "true".equalsIgnoreCase(System.getProperty("org.radarcns.deduplicate", "true"));
+
+    // Default is false because deduplication causes loss of records from Biovotion data. https://github.com/RADAR-base/Restructure-HDFS-topic/issues/16
+    private static final boolean DO_DEDUPLICATE = "true".equalsIgnoreCase(System.getProperty("org.radarcns.deduplicate", "false"));
 
     public static void main(String [] args) throws Exception {
         if (args.length != 3) {
@@ -178,7 +181,12 @@ public void start(String directoryName) throws IOException {
         for (Map.Entry<String, List<Path>> entry : topicPaths.entrySet()) {
             try (FileCacheStore cache = new FileCacheStore(converterFactory, 100, USE_GZIP, DO_DEDUPLICATE)) {
                 for (Path filePath : entry.getValue()) {
-                    this.processFile(filePath, entry.getKey(), cache, offsets);
+                    // If a JsonMappingException occurs, log the error and continue with the other files
+                    try {
+                        this.processFile(filePath, entry.getKey(), cache, offsets);
+                    } catch (JsonMappingException exc) {
+                        logger.error("Cannot map values", exc);
+                    }
                     progressBar.update(++processedFileCount);
                 }
             }
@@ -258,9 +266,20 @@ private void writeRecord(GenericRecord record, String topicName, FileCacheStore
         Date time = getDate(keyField, valueField);
         java.nio.file.Path outputFileName = createFilename(time);
 
+        String projectId;
+
+        if (keyField.get("projectId") == null) {
+            projectId = "unknown-project";
+        } else {
+            // Clean project id for use in the final pathname
+            projectId = keyField.get("projectId").toString().replaceAll("[^a-zA-Z0-9_-]+", "");
+        }
+
         // Clean user id and create final output pathname
         String userId = keyField.get("userId").toString().replaceAll("[^a-zA-Z0-9_-]+", "");
-        java.nio.file.Path userDir = this.outputPath.resolve(userId);
+
+        java.nio.file.Path projectDir = this.outputPath.resolve(projectId);
+        java.nio.file.Path userDir = projectDir.resolve(userId);
         java.nio.file.Path userTopicDir = userDir.resolve(topicName);
         java.nio.file.Path outputPath = userTopicDir.resolve(outputFileName);

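
The projectId addition inserts a project-level directory between the output folder and the user directory. Using the variable names from the diff (the actual file name comes from createFilename), the output paths before and after this change look roughly like:

```
before: <outputPath>/<userId>/<topicName>/<outputFileName>
after:  <outputPath>/<projectId>/<userId>/<topicName>/<outputFileName>
```

Records whose key carries no projectId are written under the `unknown-project` directory.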
