
Commit e015f31

Merge pull request #23 from RADAR-base/command_line_parser
Command line parser
2 parents 7a43d0f + f33c622 · commit e015f31

File tree

6 files changed (+149 −23 lines)


README.md

Lines changed: 16 additions & 6 deletions
@@ -15,20 +15,30 @@ Build jar from source with
 ```shell
 ./gradlew build
 ```
-and find the output JAR file as `build/libs/restructurehdfs-0.3.1-all.jar`. Then run with:
+and find the output JAR file as `build/libs/restructurehdfs-0.3.3-all.jar`. Then run with:
 
 ```shell
-java -jar restructurehdfs-0.3.1-all.jar <webhdfs_url> <hdfs_topic_path> <output_folder>
+java -jar restructurehdfs-0.3.3-all.jar --hdfs-uri <webhdfs_url> --output-directory <output_folder> <input_path_1> [<input_path_2> ...]
+```
+or use the short form:
+```shell
+java -jar restructurehdfs-0.3.3-all.jar -u <webhdfs_url> -o <output_folder> <input_path_1> [<input_path_2> ...]
 ```
 
-By default, this will output the data in CSV format. If JSON format is preferred, use the following instead:
+To display the usage and all available options, use the help option:
+```shell
+java -jar restructurehdfs-0.3.3-all.jar --help
 ```
-java -Dorg.radarcns.format=json -jar restructurehdfs-0.3.1-all.jar <webhdfs_url> <hdfs_topic_path> <output_folder>
+Note that the options preceded by `*` in the help output are required to run the app. There can be multiple input paths from which to read files, e.g. `/topicAndroidNew/topic1 /topicAndroidNew/topic2 ...`; at least one input path is required.
+
+By default, this will output the data in CSV format. If JSON format is preferred, use the following instead:
+```shell
+java -jar restructurehdfs-0.3.3-all.jar --format json --hdfs-uri <webhdfs_url> --output-directory <output_folder> <input_path_1> [<input_path_2> ...]
 ```
 
 Another option is to output the data in compressed form. All files will get the `gz` suffix, and can be decompressed with a GZIP decoder. Note that for a very small number of records, this may actually increase the file size.
 ```
-java -Dorg.radarcns.compression=gzip -jar restructurehdfs-0.3.1-all.jar <webhdfs_url> <hdfs_topic_path> <output_folder>
+java -jar restructurehdfs-0.3.3-all.jar --compression gzip --hdfs-uri <webhdfs_url> --output-directory <output_folder> <input_path_1> [<input_path_2> ...]
 ```
 
-Finally, by default, files records are not deduplicated after writing. To enable this behaviour, specify the option `-Dorg.radarcns.deduplicate=true`. This set to false by default because of an issue with Biovotion data. Please see - [issue #16](https://github.com/RADAR-base/Restructure-HDFS-topic/issues/16) before enabling it.
+Finally, by default, file records are not deduplicated after writing. To enable this behaviour, specify the option `--deduplicate` or `-d`. This is set to false by default because of an issue with Biovotion data. Please see [issue #16](https://github.com/RADAR-base/Restructure-HDFS-topic/issues/16) before enabling it.
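
For reference, the options introduced above can be combined in a single invocation. A sketch with placeholder values only; the host, port, output folder, and topic paths are examples, not defaults:
```shell
java -jar restructurehdfs-0.3.3-all.jar \
    --format json \
    --compression gzip \
    --hdfs-uri hdfs://<HOST>:<RPC_PORT> \
    --output-directory <output_folder> \
    /topicAndroidNew/topic1 /topicAndroidNew/topic2
```
The `--deduplicate` flag is intentionally left out here, per the warning about issue #16 above.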

build.gradle

Lines changed: 3 additions & 0 deletions
@@ -15,6 +15,7 @@ targetCompatibility = '1.8'
 ext.avroVersion = '1.8.2'
 ext.jacksonVersion = '2.8.9'
 ext.hadoopVersion = '2.7.3'
+ext.jCommanderVersion = '1.72'
 
 repositories {
     jcenter()
@@ -27,6 +28,8 @@ dependencies {
     compile group: 'com.fasterxml.jackson.core', name: 'jackson-databind', version: jacksonVersion
     compile group: 'com.fasterxml.jackson.dataformat', name: 'jackson-dataformat-csv', version: jacksonVersion
 
+    compile group: 'com.beust', name: 'jcommander', version: jCommanderVersion
+
     runtime group: 'org.apache.hadoop', name: 'hadoop-hdfs', version: hadoopVersion
 
     testCompile group: 'junit', name: 'junit', version: '4.12'
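
To check that the new jcommander dependency resolves after this change, one option (standard Gradle tooling, not something added by this PR) is:
```shell
./gradlew dependencies --configuration compile | grep -i jcommander
```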

src/main/java/org/radarcns/RestructureAvroRecords.java

Lines changed: 68 additions & 17 deletions
@@ -16,6 +16,7 @@
 
 package org.radarcns;
 
+import com.beust.jcommander.JCommander;
 import com.fasterxml.jackson.databind.JsonMappingException;
 import org.apache.avro.Schema.Field;
 import org.apache.avro.file.DataFileReader;
@@ -32,6 +33,7 @@
 import org.radarcns.util.JsonAvroConverter;
 import org.radarcns.util.ProgressBar;
 import org.radarcns.util.RecordConverterFactory;
+import org.radarcns.util.commandline.CommandLineArgs;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -70,27 +72,39 @@ public class RestructureAvroRecords {
 
     private long processedFileCount;
     private long processedRecordsCount;
-    private static final boolean USE_GZIP = "gzip".equalsIgnoreCase(System.getProperty("org.radarcns.compression"));
-
-    // Default set to false because causes loss of records from Biovotion data. https://github.com/RADAR-base/Restructure-HDFS-topic/issues/16
-    private static final boolean DO_DEDUPLICATE = "true".equalsIgnoreCase(System.getProperty("org.radarcns.deduplicate", "false"));
+    private final boolean useGzip;
+    private final boolean doDeduplicate;
 
     public static void main(String [] args) throws Exception {
-        if (args.length != 3) {
-            System.out.println("Usage: hadoop jar restructurehdfs-all-0.2.jar <webhdfs_url> <hdfs_root_directory> <output_folder>");
-            System.exit(1);
+
+        final CommandLineArgs commandLineArgs = new CommandLineArgs();
+        final JCommander parser = JCommander.newBuilder().addObject(commandLineArgs).build();
+
+        parser.setProgramName("hadoop jar restructurehdfs-all-0.3.3.jar");
+        parser.parse(args);
+
+        if (commandLineArgs.help) {
+            parser.usage();
+            System.exit(0);
         }
 
         logger.info(new SimpleDateFormat("yyyy-MM-dd HH:mm:ss").format(new Date()));
         logger.info("Starting...");
-        logger.info("In: " + args[0] + args[1]);
-        logger.info("Out: " + args[2]);
 
         long time1 = System.currentTimeMillis();
 
-        RestructureAvroRecords restr = new RestructureAvroRecords(args[0], args[2]);
+        RestructureAvroRecords restr = new RestructureAvroRecords.Builder(commandLineArgs.hdfsUri,
+                commandLineArgs.outputDirectory)
+                .useGzip("gzip".equalsIgnoreCase(commandLineArgs.compression))
+                .doDeduplicate(commandLineArgs.deduplicate).format(commandLineArgs.format)
+                .build();
+
         try {
-            restr.start(args[1]);
+            for (String input : commandLineArgs.inputPaths) {
+                logger.info("In: " + commandLineArgs.hdfsUri + input);
+                logger.info("Out: " + commandLineArgs.outputDirectory);
+                restr.start(input);
+            }
         } catch (IOException ex) {
             logger.error("Processing failed", ex);
         }
@@ -99,12 +113,16 @@ public static void main(String [] args) throws Exception {
         logger.info("Time taken: {} seconds", (System.currentTimeMillis() - time1)/1000d);
     }
 
-    public RestructureAvroRecords(String inputPath, String outputPath) {
-        this.setInputWebHdfsURL(inputPath);
-        this.setOutputPath(outputPath);
+    private RestructureAvroRecords(RestructureAvroRecords.Builder builder) {
+        this.setInputWebHdfsURL(builder.hdfsUri);
+        this.setOutputPath(builder.outputPath);
+
+        this.useGzip = builder.useGzip;
+        this.doDeduplicate = builder.doDeduplicate;
+        logger.info("Deduplicate set to {}", doDeduplicate);
 
         String extension;
-        if (System.getProperty("org.radarcns.format", "csv").equalsIgnoreCase("json")) {
+        if (builder.format.equalsIgnoreCase("json")) {
             logger.info("Writing output files in JSON format");
             converterFactory = JsonAvroConverter.getFactory();
             extension = "json";
@@ -113,7 +131,7 @@ public RestructureAvroRecords(String inputPath, String outputPath) {
             converterFactory = CsvAvroConverter.getFactory();
             extension = "csv";
         }
-        if (USE_GZIP) {
+        if (this.useGzip) {
             logger.info("Compressing output files in GZIP format");
             extension += ".gz";
         }
@@ -179,7 +197,7 @@ public void start(String directoryName) throws IOException {
 
         // Actually process the files
         for (Map.Entry<String, List<Path>> entry : topicPaths.entrySet()) {
-            try (FileCacheStore cache = new FileCacheStore(converterFactory, 100, USE_GZIP, DO_DEDUPLICATE)) {
+            try (FileCacheStore cache = new FileCacheStore(converterFactory, 100, useGzip, doDeduplicate)) {
                 for (Path filePath : entry.getValue()) {
                     // If JsonMappingException occurs, log the error and continue with other files
                     try {
@@ -347,4 +365,37 @@ public static Date getDate(GenericRecord keyField, GenericRecord valueField) {
         long time = (Long) keyField.get("start");
         return new Date(time);
     }
+
+    public static class Builder {
+        private boolean useGzip;
+        private boolean doDeduplicate;
+        private String hdfsUri;
+        private String outputPath;
+        private String format;
+
+        public Builder(final String uri, final String outputPath) {
+            this.hdfsUri = uri;
+            this.outputPath = outputPath;
+        }
+
+        public Builder useGzip(final boolean gzip) {
+            this.useGzip = gzip;
+            return this;
+        }
+
+        public Builder doDeduplicate(final boolean dedup) {
+            this.doDeduplicate = dedup;
+            return this;
+        }
+
+        public Builder format(final String format) {
+            this.format = format;
+            return this;
+        }
+
+        public RestructureAvroRecords build() {
+            return new RestructureAvroRecords(this);
+        }
+
+    }
 }
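
Since the constructor is now private, other code would go through the new `Builder`. Below is a minimal usage sketch based only on the methods added in this diff; the class name `BuilderUsageSketch`, the URI, the output folder, and the topic path are placeholders:
```java
import java.io.IOException;

import org.radarcns.RestructureAvroRecords;

public class BuilderUsageSketch {
    public static void main(String[] args) throws IOException {
        // Placeholder HDFS URI and output directory; adjust to the target cluster.
        RestructureAvroRecords restructure = new RestructureAvroRecords.Builder(
                "webhdfs://<HOST>:<PORT>", "<output_folder>")
                .useGzip(true)          // same effect as --compression gzip
                .doDeduplicate(false)   // left off by default, see issue #16
                .format("json")         // "csv" is the CLI default
                .build();

        // Process one input path, as main() does for each entry in --inputPaths.
        restructure.start("/topicAndroidNew/topic1");
    }
}
```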
src/main/java/org/radarcns/util/commandline/CommandLineArgs.java

Lines changed: 31 additions & 0 deletions

@@ -0,0 +1,31 @@
+package org.radarcns.util.commandline;
+
+import com.beust.jcommander.Parameter;
+
+import java.util.ArrayList;
+import java.util.List;
+
+public class CommandLineArgs {
+
+    @Parameter(description = "<input_path_1> [<input_path_2> ...]", variableArity = true, required = true)
+    public List<String> inputPaths = new ArrayList<>();
+
+    @Parameter(names = { "-f", "--format" }, description = "Format to use when converting the files. JSON and CSV are available.")
+    public String format = "csv";
+
+    @Parameter(names = { "-c", "--compression" }, description = "Compression to use when converting the files. Gzip is available.")
+    public String compression = "none";
+
+    // Defaults to false because deduplication causes loss of records from Biovotion data. https://github.com/RADAR-base/Restructure-HDFS-topic/issues/16
+    @Parameter(names = { "-d", "--deduplicate" }, description = "Whether to deduplicate the records.")
+    public boolean deduplicate;
+
+    @Parameter(names = { "-u", "--hdfs-uri" }, description = "The HDFS URI to connect to. Eg - 'hdfs://<HOST>:<RPC_PORT>/<PATH>'.", required = true, validateWith = { HdfsUriValidator.class, PathValidator.class })
+    public String hdfsUri;
+
+    @Parameter(names = { "-o", "--output-directory" }, description = "The output folder where the files are to be extracted.", required = true, validateWith = PathValidator.class)
+    public String outputDirectory;
+
+    @Parameter(names = { "-h", "--help" }, help = true, description = "Display the usage of the program with available options.")
+    public boolean help;
+}
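
For illustration, a minimal sketch of how JCommander binds these annotated fields, using the same `JCommander.newBuilder()` API that `RestructureAvroRecords.main` uses; the class name `ParseSketch` and all argument values are placeholders:
```java
import com.beust.jcommander.JCommander;

import org.radarcns.util.commandline.CommandLineArgs;

public class ParseSketch {
    public static void main(String[] argv) {
        // Example argument vector; values are placeholders.
        String[] args = {
                "-u", "hdfs://<HOST>:<RPC_PORT>",
                "-o", "<output_folder>",
                "--format", "json",
                "/topicAndroidNew/topic1", "/topicAndroidNew/topic2"
        };

        CommandLineArgs parsed = new CommandLineArgs();
        // Validators attached via validateWith run during parse().
        JCommander.newBuilder().addObject(parsed).build().parse(args);

        // Named options land in their fields; unnamed arguments land in inputPaths.
        System.out.println(parsed.hdfsUri);     // hdfs://<HOST>:<RPC_PORT>
        System.out.println(parsed.format);      // json
        System.out.println(parsed.inputPaths);  // [/topicAndroidNew/topic1, /topicAndroidNew/topic2]
    }
}
```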
src/main/java/org/radarcns/util/commandline/HdfsUriValidator.java

Lines changed: 16 additions & 0 deletions

@@ -0,0 +1,16 @@
+package org.radarcns.util.commandline;
+
+
+import com.beust.jcommander.ParameterException;
+import com.beust.jcommander.IParameterValidator;
+
+public class HdfsUriValidator implements IParameterValidator {
+    @Override
+    public void validate(String name, String value) throws ParameterException {
+        if (!value.matches("((hdfs)|(webhdfs)):(/?/?)[^\\s]+")) {
+            throw new ParameterException("Parameter " + name + " should be a valid HDFS or WebHDFS URI. "
+                    + "Eg - hdfs://<HOST>:<RPC_PORT>/<PATH>. (found " + value
+                    + "). Please run with --help or -h for more information.");
+        }
+    }
+}
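
As a quick illustration of what this regular expression accepts and rejects, a standalone sketch; the host names and ports are made-up examples:
```java
public class HdfsUriRegexSketch {
    public static void main(String[] args) {
        // The same pattern used by HdfsUriValidator above.
        String pattern = "((hdfs)|(webhdfs)):(/?/?)[^\\s]+";

        System.out.println("hdfs://namenode:8020/radar".matches(pattern));  // true
        System.out.println("webhdfs://namenode:50070".matches(pattern));    // true
        System.out.println("file:///tmp/data".matches(pattern));            // false: scheme must be hdfs or webhdfs
        System.out.println("hdfs://name node".matches(pattern));            // false: whitespace is rejected
    }
}
```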
src/main/java/org/radarcns/util/commandline/PathValidator.java

Lines changed: 15 additions & 0 deletions

@@ -0,0 +1,15 @@
+package org.radarcns.util.commandline;
+
+import com.beust.jcommander.ParameterException;
+import com.beust.jcommander.IParameterValidator;
+
+public class PathValidator implements IParameterValidator {
+    @Override
+    public void validate(String name, String value) throws ParameterException {
+        if (value == null || value.isEmpty()) {
+            throw new ParameterException("Parameter " + name + " should be supplied. "
+                    + "It cannot be empty or null. (found " + value + "). "
+                    + "Please run with --help or -h for more information.");
+        }
+    }
+}
