Skip to content

Commit 090a1dc

Browse files
Use builder pattern
1 parent 6eeee1f commit 090a1dc

File tree

4 files changed

+54
-32
lines changed

4 files changed

+54
-32
lines changed

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,4 +41,4 @@ Another option is to output the data in compressed form. All files will get the
4141
java -jar restructurehdfs-0.3.3-all.jar --compression gzip --hdfs-uri <webhdfs_url> --hdfs-root-directory <hdfs_topic_path> --output-directory <output_folder>
4242
```
4343

44-
Finally, by default, file records are not deduplicated after writing. To enable this behaviour, specify the option `--deduplicate true`. This is set to false by default because of an issue with Biovotion data. Please see [issue #16](https://github.com/RADAR-base/Restructure-HDFS-topic/issues/16) before enabling it.
44+
Finally, by default, file records are not deduplicated after writing. To enable this behaviour, specify the option `--deduplicate` or `-d`. This is set to false by default because of an issue with Biovotion data. Please see [issue #16](https://github.com/RADAR-base/Restructure-HDFS-topic/issues/16) before enabling it.

src/main/java/org/radarcns/RestructureAvroRecords.java

Lines changed: 51 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -72,8 +72,8 @@ public class RestructureAvroRecords {
7272

7373
private long processedFileCount;
7474
private long processedRecordsCount;
75-
private static boolean USE_GZIP;
76-
private static boolean DO_DEDUPLICATE;
75+
private boolean useGzip;
76+
private boolean doDeduplicate;
7777

7878
private static final CommandLineArgs commandLineArgs = new CommandLineArgs();
7979
private static final JCommander parser = JCommander.newBuilder().addObject(commandLineArgs).build();
@@ -88,18 +88,20 @@ public static void main(String [] args) throws Exception {
8888
System.exit(0);
8989
}
9090

91-
USE_GZIP = "gzip".equalsIgnoreCase(commandLineArgs.compression);
92-
DO_DEDUPLICATE = commandLineArgs.deduplicate;
93-
9491
logger.info(new SimpleDateFormat("yyyy-MM-dd HH:mm:ss").format(new Date()));
9592
logger.info("Starting...");
9693
logger.info("In: " + commandLineArgs.hdfsUri + commandLineArgs.hdfsRootDirectory);
9794
logger.info("Out: " + commandLineArgs.outputDirectory);
98-
logger.info("Deduplicate set to {}", DO_DEDUPLICATE);
95+
9996

10097
long time1 = System.currentTimeMillis();
10198

102-
RestructureAvroRecords restr = new RestructureAvroRecords(commandLineArgs.hdfsUri, commandLineArgs.outputDirectory);
99+
RestructureAvroRecords restr = new RestructureAvroRecords.Builder(commandLineArgs.hdfsUri,
100+
commandLineArgs.outputDirectory)
101+
.useGzip("gzip".equalsIgnoreCase(commandLineArgs.compression))
102+
.doDeuplicate(commandLineArgs.deduplicate).format(commandLineArgs.format)
103+
.build();
104+
103105
try {
104106
restr.start(commandLineArgs.hdfsRootDirectory);
105107
} catch (IOException ex) {
@@ -110,12 +112,16 @@ public static void main(String [] args) throws Exception {
110112
logger.info("Time taken: {} seconds", (System.currentTimeMillis() - time1)/1000d);
111113
}
112114

113-
public RestructureAvroRecords(String inputPath, String outputPath) {
114-
this.setInputWebHdfsURL(inputPath);
115+
private RestructureAvroRecords(String hdfsUri, String outputPath, boolean gzip, boolean dedup, String format) {
116+
this.setInputWebHdfsURL(hdfsUri);
115117
this.setOutputPath(outputPath);
116118

119+
this.useGzip = gzip;
120+
this.doDeduplicate = dedup;
121+
logger.info("Deduplicate set to {}", doDeduplicate);
122+
117123
String extension;
118-
if (commandLineArgs.format.equalsIgnoreCase("json")) {
124+
if (format.equalsIgnoreCase("json")) {
119125
logger.info("Writing output files in JSON format");
120126
converterFactory = JsonAvroConverter.getFactory();
121127
extension = "json";
@@ -124,7 +130,7 @@ public RestructureAvroRecords(String inputPath, String outputPath) {
124130
converterFactory = CsvAvroConverter.getFactory();
125131
extension = "csv";
126132
}
127-
if (USE_GZIP) {
133+
if (this.useGzip) {
128134
logger.info("Compressing output files in GZIP format");
129135
extension += ".gz";
130136
}
@@ -190,7 +196,7 @@ public void start(String directoryName) throws IOException {
190196

191197
// Actually process the files
192198
for (Map.Entry<String, List<Path>> entry : topicPaths.entrySet()) {
193-
try (FileCacheStore cache = new FileCacheStore(converterFactory, 100, USE_GZIP, DO_DEDUPLICATE)) {
199+
try (FileCacheStore cache = new FileCacheStore(converterFactory, 100, useGzip, doDeduplicate)) {
194200
for (Path filePath : entry.getValue()) {
195201
// If JsonMappingException occurs, log the error and continue with other files
196202
try {
@@ -358,4 +364,37 @@ public static Date getDate(GenericRecord keyField, GenericRecord valueField) {
358364
long time = (Long) keyField.get("start");
359365
return new Date(time);
360366
}
367+
368+
public static class Builder {
369+
private boolean useGzip;
370+
private boolean doDeduplicate;
371+
private String hdfsUri;
372+
private String outputPath;
373+
private String format;
374+
375+
public Builder(final String uri, final String outputPath) {
376+
this.hdfsUri = uri;
377+
this.outputPath = outputPath;
378+
}
379+
380+
public Builder useGzip(final boolean gzip) {
381+
this.useGzip = gzip;
382+
return this;
383+
}
384+
385+
public Builder doDeuplicate(final boolean dedup) {
386+
this.doDeduplicate = dedup;
387+
return this;
388+
}
389+
390+
public Builder format(final String format) {
391+
this.format = format;
392+
return this;
393+
}
394+
395+
public RestructureAvroRecords build() {
396+
return new RestructureAvroRecords(hdfsUri, outputPath, useGzip, doDeduplicate, format);
397+
}
398+
399+
}
361400
}

src/main/java/org/radarcns/util/commandline/BooleanValidator.java

Lines changed: 0 additions & 17 deletions
This file was deleted.

src/main/java/org/radarcns/util/commandline/CommandLineArgs.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,8 @@ public class CommandLineArgs {
1111
public String compression = "none";
1212

1313
// Default is set to false because deduplication causes loss of records from Biovotion data. See https://github.com/RADAR-base/Restructure-HDFS-topic/issues/16
14-
@Parameter(names = { "-d", "--deduplicate" }, description = "Boolean to define if to use deduplication or not.", validateWith = BooleanValidator.class)
15-
public Boolean deduplicate = false;
14+
@Parameter(names = { "-d", "--deduplicate" }, description = "Boolean to define if to use deduplication or not.")
15+
public boolean deduplicate;
1616

1717
@Parameter(names = { "-u", "--hdfs-uri" }, description = "The HDFS uri to connect to. Eg - 'hdfs://<HOST>:<RPC_PORT>/<PATH>'.", required = true, validateWith = { HdfsUriValidator.class, PathValidator.class })
1818
public String hdfsUri;

0 commit comments

Comments
 (0)