
Commit b7ce9b6

Merge pull request #11 from RADAR-CNS/v0.3-release
V0.3 release
2 parents 1cd1350 + 8b5b0dd commit b7ce9b6

17 files changed: +390 −185 lines

README.md

Lines changed: 6 additions & 4 deletions
@@ -15,18 +15,20 @@ Build jar from source with
 ```shell
 ./gradlew build
 ```
-and find the output JAR file as `build/libs/restructurehdfs-0.2.1-all.jar`. Then run with:
+and find the output JAR file as `build/libs/restructurehdfs-0.3-all.jar`. Then run with:
 
 ```shell
-java -jar restructurehdfs-0.2.1-all.jar <webhdfs_url> <hdfs_topic_path> <output_folder>
+java -jar restructurehdfs-0.3-all.jar <webhdfs_url> <hdfs_topic_path> <output_folder>
 ```
 
 By default, this will output the data in CSV format. If JSON format is preferred, use the following instead:
 ```
-java -Dorg.radarcns.format=json -jar restructurehdfs-0.2.1-all.jar <webhdfs_url> <hdfs_topic_path> <output_folder>
+java -Dorg.radarcns.format=json -jar restructurehdfs-0.3-all.jar <webhdfs_url> <hdfs_topic_path> <output_folder>
 ```
 
 Another option is to output the data in compressed form. All files will get the `gz` suffix, and can be decompressed with a GZIP decoder. Note that for a very small number of records, this may actually increase the file size.
 ```
-java -Dorg.radarcns.compress=gzip -jar restructurehdfs-0.2.1-all.jar <webhdfs_url> <hdfs_topic_path> <output_folder>
+java -Dorg.radarcns.compress=gzip -jar restructurehdfs-0.3-all.jar <webhdfs_url> <hdfs_topic_path> <output_folder>
 ```
+
+Finally, file records are deduplicated after writing. To disable this behaviour, specify the option `-Dorg.radarcns.deduplicate=false`.
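
Note: since these are all plain JVM system properties, they can be combined in a single invocation. A hypothetical combination (not itself part of the README):

```shell
java -Dorg.radarcns.format=json -Dorg.radarcns.deduplicate=false -jar restructurehdfs-0.3-all.jar <webhdfs_url> <hdfs_topic_path> <output_folder>
```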

build.gradle

Lines changed: 1 addition & 1 deletion
@@ -2,7 +2,7 @@ apply plugin: 'java'
 apply plugin: 'application'
 
 group 'org.radarcns.restructurehdfs'
-version '0.2.1'
+version '0.3'
 mainClassName = 'org.radarcns.RestructureAvroRecords'
 
 run {

src/main/java/org/radarcns/Frequency.java

Lines changed: 17 additions & 18 deletions
@@ -16,40 +16,40 @@
 
 package org.radarcns;
 
-import java.nio.file.Files;
-import java.util.Date;
-import java.util.List;
-import java.util.Objects;
-import javax.annotation.Nonnull;
-import org.apache.avro.Schema.Field;
-import org.apache.avro.generic.GenericRecord;
 import org.apache.commons.collections.MapIterator;
 import org.apache.commons.collections.keyvalue.MultiKey;
 import org.apache.commons.collections.map.MultiKeyMap;
-
-import java.io.*;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import javax.annotation.Nonnull;
+import java.io.BufferedWriter;
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.Date;
+import java.util.List;
+import java.util.Objects;
+
 
 public class Frequency {
     private static final Logger logger = LoggerFactory.getLogger(Frequency.class);
 
     private final MultiKeyMap bins;
-    private final File file;
+    private final Path path;
 
-    public Frequency(@Nonnull File file, @Nonnull MultiKeyMap initialData) {
-        Objects.requireNonNull(file);
+    public Frequency(@Nonnull Path path, @Nonnull MultiKeyMap initialData) {
+        Objects.requireNonNull(path);
         Objects.requireNonNull(initialData);
-        this.file = file;
+        this.path = path;
         this.bins = initialData;
     }
 
-    public static Frequency read(File file) {
+    public static Frequency read(Path path) {
         MultiKeyMap map = new MultiKeyMap();
         try {
             // Read in all lines as multikeymap (key, key, key, value)
-            List<String> lines = Files.readAllLines(file.toPath());
+            List<String> lines = Files.readAllLines(path);
             lines.subList(1, lines.size()).forEach(line -> {
                 String[] columns = line.split(",");
                 try {
@@ -61,7 +61,7 @@ public static Frequency read(File file) {
         } catch (IOException e) {
             logger.warn("Could not read the file with bins. Creating new file when writing.");
         }
-        return new Frequency(file, map);
+        return new Frequency(path, map);
     }
 
     public void add(String topicName, String id, Date date) {
@@ -88,8 +88,7 @@ public void print() {
     public void write() {
         // Write all bins to csv
         MapIterator mapIterator = bins.mapIterator();
-        try (FileWriter fw = new FileWriter(file, false);
-                BufferedWriter bw = new BufferedWriter(fw)) {
+        try (BufferedWriter bw = Files.newBufferedWriter(path)) {
             String header = String.join(",","topic","device","timestamp","count");
             bw.write(header);
             bw.write('\n');
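
The pattern in this file recurs throughout the commit: `java.io.File` plus `FileWriter`/`FileReader` is replaced by `java.nio.file.Path` with the `Files` factory methods. A minimal standalone sketch of the after-state (the `bins-example.csv` path is made up for illustration):

```java
import java.io.BufferedWriter;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;

public class PathMigrationSketch {
    public static void main(String[] args) throws IOException {
        Path path = Paths.get("bins-example.csv");
        // With no OpenOptions, Files.newBufferedWriter defaults to
        // CREATE + TRUNCATE_EXISTING + WRITE, matching the truncating
        // behaviour of the old `new FileWriter(file, false)`.
        try (BufferedWriter bw = Files.newBufferedWriter(path)) {
            bw.write(String.join(",", "topic", "device", "timestamp", "count"));
            bw.write('\n');
        }
        // Reading is symmetric: Files.readAllLines(path) replaces
        // going through file.toPath() on a java.io.File.
        Files.readAllLines(path).forEach(System.out::println);
    }
}
```

One behavioural difference worth keeping in mind: `FileWriter` uses the platform default charset, while `Files.newBufferedWriter` defaults to UTF-8.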

src/main/java/org/radarcns/OffsetRangeFile.java

Lines changed: 13 additions & 17 deletions
@@ -23,15 +23,15 @@
 import com.fasterxml.jackson.dataformat.csv.CsvGenerator;
 import com.fasterxml.jackson.dataformat.csv.CsvMapper;
 import com.fasterxml.jackson.dataformat.csv.CsvSchema;
+
 import java.io.BufferedReader;
 import java.io.BufferedWriter;
 import java.io.Closeable;
-import java.io.File;
-import java.io.FileReader;
-import java.io.FileWriter;
 import java.io.Flushable;
 import java.io.IOException;
 import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.StandardOpenOption;
 
 import static java.nio.file.StandardCopyOption.REPLACE_EXISTING;
 
@@ -56,19 +56,18 @@ private OffsetRangeFile() {
         // utility class
     }
 
-    public static void cleanUp(File file) throws IOException {
-        File tmpFile = File.createTempFile("offsets", ".csv.tmp");
-        try (OffsetRangeFile.Writer offsets = new OffsetRangeFile.Writer(tmpFile)) {
-            offsets.write(OffsetRangeFile.read(file));
+    public static void cleanUp(Path path) throws IOException {
+        Path tmpPath = Files.createTempFile("offsets", ".csv.tmp");
+        try (OffsetRangeFile.Writer offsets = new OffsetRangeFile.Writer(tmpPath)) {
+            offsets.write(OffsetRangeFile.read(path));
         }
-        Files.move(tmpFile.toPath(), file.toPath(), REPLACE_EXISTING);
+        Files.move(tmpPath, path, REPLACE_EXISTING);
     }
 
-    public static OffsetRangeSet read(File inputFile) throws IOException {
+    public static OffsetRangeSet read(Path path) throws IOException {
         OffsetRangeSet set = new OffsetRangeSet();
 
-        try (FileReader fr = new FileReader(inputFile);
-                BufferedReader br = new BufferedReader(fr)) {
+        try (BufferedReader br = Files.newBufferedReader(path)) {
             MappingIterator<OffsetRange> ranges = CSV_READER.readValues(br);
             while(ranges.hasNext()) {
                 set.add(ranges.next());
@@ -78,15 +77,13 @@ public static OffsetRangeSet read(File inputFile) throws IOException {
     }
 
     public static class Writer implements Flushable, Closeable {
-        private final FileWriter fileWriter;
         private final BufferedWriter bufferedWriter;
         private final CsvGenerator generator;
         private final ObjectWriter writer;
 
-        public Writer(File outputFile) throws IOException {
-            boolean fileIsNew = !outputFile.exists() || outputFile.length() == 0;
-            this.fileWriter = new FileWriter(outputFile, true);
-            this.bufferedWriter = new BufferedWriter(this.fileWriter);
+        public Writer(Path path) throws IOException {
+            boolean fileIsNew = !Files.exists(path) || Files.size(path) == 0;
+            this.bufferedWriter = Files.newBufferedWriter(path, StandardOpenOption.APPEND, StandardOpenOption.CREATE);
             this.generator = CSV_FACTORY.createGenerator(bufferedWriter);
             this.writer = CSV_MAPPER.writerFor(OffsetRange.class)
                     .with(fileIsNew ? SCHEMA.withHeader() : SCHEMA);
@@ -111,7 +108,6 @@ public void flush() throws IOException {
         public void close() throws IOException {
             generator.close();
             bufferedWriter.close();
-            fileWriter.close();
         }
     }
 }
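
The reworked `Writer` checks for an existing, non-empty file before opening it in append mode, so the CSV header is written exactly once across repeated appends. A self-contained sketch of that pattern, with made-up column names (the real schema comes from Jackson's `CsvSchema` for `OffsetRange`):

```java
import java.io.BufferedWriter;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.StandardOpenOption;

public class AppendWithHeaderSketch {
    public static void main(String[] args) throws IOException {
        Path path = Paths.get("offsets-example.csv");
        // Decide on the header *before* opening the file: APPEND + CREATE
        // never truncates, so an existing file keeps its previous contents.
        // The short-circuit avoids Files.size() on a missing file.
        boolean fileIsNew = !Files.exists(path) || Files.size(path) == 0;
        try (BufferedWriter bw = Files.newBufferedWriter(path,
                StandardOpenOption.APPEND, StandardOpenOption.CREATE)) {
            if (fileIsNew) {
                bw.write("topic,partition,offsetFrom,offsetTo\n"); // hypothetical columns
            }
            bw.write("example_topic,0,0,100\n"); // hypothetical row
        }
    }
}
```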

src/main/java/org/radarcns/OffsetRangeSet.java

Lines changed: 0 additions & 2 deletions
@@ -17,9 +17,7 @@
 package org.radarcns;
 
 import javax.annotation.Nonnull;
-import java.util.HashMap;
 import java.util.Iterator;
-import java.util.Map;
 import java.util.NoSuchElementException;
 import java.util.SortedMap;
 import java.util.SortedSet;

src/main/java/org/radarcns/RestructureAvroRecords.java

Lines changed: 20 additions & 18 deletions
@@ -34,9 +34,10 @@
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import java.io.File;
-import java.io.FileWriter;
 import java.io.IOException;
+import java.io.Writer;
+import java.nio.file.Files;
+import java.nio.file.Paths;
 import java.text.SimpleDateFormat;
 import java.util.ArrayList;
 import java.util.Date;
@@ -49,9 +50,9 @@ public class RestructureAvroRecords {
     private static final Logger logger = LoggerFactory.getLogger(RestructureAvroRecords.class);
 
     private final String outputFileExtension;
-    private static final String OFFSETS_FILE_NAME = "offsets.csv";
-    private static final String BINS_FILE_NAME = "bins.csv";
-    private static final String SCHEMA_OUTPUT_FILE_NAME = "schema.json";
+    private static final java.nio.file.Path OFFSETS_FILE_NAME = Paths.get("offsets.csv");
+    private static final java.nio.file.Path BINS_FILE_NAME = Paths.get("bins.csv");
+    private static final java.nio.file.Path SCHEMA_OUTPUT_FILE_NAME = Paths.get("schema.json");
     private static final SimpleDateFormat FILE_DATE_FORMAT = new SimpleDateFormat("yyyyMMdd_HH");
 
     static {
@@ -60,15 +61,16 @@ public class RestructureAvroRecords {
 
     private final RecordConverterFactory converterFactory;
 
-    private File outputPath;
-    private File offsetsPath;
+    private java.nio.file.Path outputPath;
+    private java.nio.file.Path offsetsPath;
     private Frequency bins;
 
     private final Configuration conf = new Configuration();
 
     private long processedFileCount;
     private long processedRecordsCount;
     private static final boolean USE_GZIP = "gzip".equalsIgnoreCase(System.getProperty("org.radarcns.compression"));
+    private static final boolean DO_DEDUPLICATE = "true".equalsIgnoreCase(System.getProperty("org.radarcns.deduplicate", "true"));
 
     public static void main(String [] args) throws Exception {
         if (args.length != 3) {
@@ -121,9 +123,9 @@ public void setInputWebHdfsURL(String fileSystemURL) {
 
     public void setOutputPath(String path) {
         // Remove trailing backslash
-        outputPath = new File(path.replaceAll("/$",""));
-        offsetsPath = new File(outputPath, OFFSETS_FILE_NAME);
-        bins = Frequency.read(new File(outputPath, BINS_FILE_NAME));
+        outputPath = Paths.get(path.replaceAll("/$", ""));
+        offsetsPath = outputPath.resolve(OFFSETS_FILE_NAME);
+        bins = Frequency.read(outputPath.resolve(BINS_FILE_NAME));
     }
 
     public long getProcessedFileCount() {
@@ -174,7 +176,7 @@ public void start(String directoryName) throws IOException {
 
         // Actually process the files
         for (Map.Entry<String, List<Path>> entry : topicPaths.entrySet()) {
-            try (FileCacheStore cache = new FileCacheStore(converterFactory, 100, USE_GZIP)) {
+            try (FileCacheStore cache = new FileCacheStore(converterFactory, 100, USE_GZIP, DO_DEDUPLICATE)) {
                 for (Path filePath : entry.getValue()) {
                     this.processFile(filePath, entry.getKey(), cache, offsets);
                     progressBar.update(++processedFileCount);
@@ -258,16 +260,16 @@ private void writeRecord(GenericRecord record, String topicName, FileCacheStore
 
         // Clean user id and create final output pathname
         String userId = keyField.get("userId").toString().replaceAll("[^a-zA-Z0-9_-]+", "");
-        File userDir = new File(this.outputPath, userId);
-        File userTopicDir = new File(userDir, topicName);
-        File outputFile = new File(userTopicDir, outputFileName);
+        java.nio.file.Path userDir = this.outputPath.resolve(userId);
+        java.nio.file.Path userTopicDir = userDir.resolve(topicName);
+        java.nio.file.Path outputPath = userTopicDir.resolve(outputFileName);
 
         // Write data
-        cache.writeRecord(outputFile, record);
+        cache.writeRecord(outputPath, record);
 
-        File schemaFile = new File(userTopicDir, SCHEMA_OUTPUT_FILE_NAME);
-        if (!schemaFile.exists()) {
-            try (FileWriter writer = new FileWriter(schemaFile, false)) {
+        java.nio.file.Path schemaPath = userTopicDir.resolve(SCHEMA_OUTPUT_FILE_NAME);
+        if (!Files.exists(schemaPath)) {
+            try (Writer writer = Files.newBufferedWriter(schemaPath)) {
                 writer.write(record.getSchema().toString(true));
             }
         }
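
The new `DO_DEDUPLICATE` flag is read once, at class-load time, from a system property whose absence counts as `"true"`. A minimal sketch of that idiom, lifted from the constant above:

```java
public class DeduplicateFlagSketch {
    // Unset defaults to "true"; any value other than "true"
    // (case-insensitive), e.g. -Dorg.radarcns.deduplicate=false,
    // disables deduplication.
    private static final boolean DO_DEDUPLICATE =
            "true".equalsIgnoreCase(System.getProperty("org.radarcns.deduplicate", "true"));

    public static void main(String[] args) {
        System.out.println("deduplicate = " + DO_DEDUPLICATE);
    }
}
```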

src/main/java/org/radarcns/util/CsvAvroConverter.java

Lines changed: 17 additions & 6 deletions
@@ -21,17 +21,18 @@
 import com.fasterxml.jackson.dataformat.csv.CsvGenerator;
 import com.fasterxml.jackson.dataformat.csv.CsvMapper;
 import com.fasterxml.jackson.dataformat.csv.CsvSchema;
+import org.apache.avro.Schema;
+import org.apache.avro.Schema.Field;
+import org.apache.avro.generic.GenericData;
+import org.apache.avro.generic.GenericFixed;
+import org.apache.avro.generic.GenericRecord;
+
 import java.io.IOException;
 import java.io.Writer;
 import java.nio.ByteBuffer;
 import java.util.LinkedHashMap;
 import java.util.List;
 import java.util.Map;
-import org.apache.avro.Schema;
-import org.apache.avro.Schema.Field;
-import org.apache.avro.generic.GenericData;
-import org.apache.avro.generic.GenericFixed;
-import org.apache.avro.generic.GenericRecord;
 
 /**
  * Converts deep hierarchical Avro records into flat CSV format. It uses a simple dot syntax in the
@@ -42,7 +43,17 @@ public class CsvAvroConverter implements RecordConverter {
 
     public static RecordConverterFactory getFactory() {
         CsvFactory factory = new CsvFactory();
-        return (writer, record, writeHeader) -> new CsvAvroConverter(factory, writer, record, writeHeader);
+        return new RecordConverterFactory() {
+            @Override
+            public RecordConverter converterFor(Writer writer, GenericRecord record, boolean writeHeader) throws IOException {
+                return new CsvAvroConverter(factory, writer, record, writeHeader);
+            }
+
+            @Override
+            public boolean hasHeader() {
+                return true;
+            }
+        };
     }
 
     private final ObjectWriter csvWriter;
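
The lambda had to go because a lambda can only implement a functional interface, i.e. one with a single abstract method, and this diff implies `RecordConverterFactory` now also declares `hasHeader()`. A sketch of the presumed interface shape (inferred from the anonymous class above, not taken from the commit):

```java
import java.io.IOException;
import java.io.Writer;
import org.apache.avro.generic.GenericRecord;

interface RecordConverter { /* stand-in for the real org.radarcns.util contract */ }

// Inferred shape: two abstract methods force an anonymous (or named) class.
// Had hasHeader() been added as a default method instead, the interface would
// have stayed functional and the original one-line lambda would still compile.
interface RecordConverterFactorySketch {
    RecordConverter converterFor(Writer writer, GenericRecord record, boolean writeHeader)
            throws IOException;

    boolean hasHeader();
}
```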
