
Commit 463e6f7

Merge branch 'perHeaderDeduplication' into s3Integration
2 parents: 9e2c708 + b4131fa

44 files changed: 705 additions and 617 deletions. Only a subset of the changed files is shown below.

.gitignore

Lines changed: 1 addition & 0 deletions
@@ -98,3 +98,4 @@ fabric.properties
 ## Pebble 2
 .lock*
 /data/
+/output/

README.md

Lines changed: 37 additions & 27 deletions
@@ -12,22 +12,6 @@ docker run --rm -t --network hadoop -v "$PWD/output:/output" radarbase/radar-hdf
 ```
 if your docker cluster is running in the `hadoop` network and your output directory should be `./output`.

-## Local build
-
-This package requires at least Java JDK 8. Build the distribution with
-
-```shell
-./gradlew build
-```
-
-and install the package into `/usr/local` with for example
-```shell
-sudo mkdir -p /usr/local
-sudo tar -xzf build/distributions/radar-hdfs-restructure-0.5.7.tar.gz -C /usr/local --strip-components=1
-```
-
-Now the `radar-hdfs-restructure` command should be available.
-
 ## Command line usage

 When the application is installed, it can be used as follows:
@@ -46,32 +30,58 @@ radar-hdfs-restructure --help
 ```
 Note that the options preceded by the `*` in the above output are required to run the app. Also note that there can be multiple input paths from which to read the files. Eg - `/topicAndroidNew/topic1 /topicAndroidNew/topic2 ...`. At least one input path is required.

+Each argument, as well as many more options, can be supplied in a config file. The default name of the config file is `restructure.yml`. Please refer to `restructure.yml` in the current directory for all available options. An alternative file can be specified with the `-F` flag.
+
+### File Format
+
 By default, this will output the data in CSV format. If JSON format is preferred, use the following instead:
 ```shell
 radar-hdfs-restructure --format json --nameservice <hdfs_node> --output-directory <output_folder> <input_path_1> [<input_path_2> ...]
 ```

+By default, file records are not deduplicated after writing. To enable this behaviour, specify the option `--deduplicate` or `-d`. This is set to false by default because of an issue with Biovotion data; please see [issue #16](https://github.com/RADAR-base/Restructure-HDFS-topic/issues/16) before enabling it. Deduplication can also be enabled or disabled per topic using the config file. If lines should be deduplicated using a subset of fields, e.g. only `sourceId` and `time` define a unique record and only the last record with duplicate values should be kept, then specify `topics: <topicName>: deduplicateFields: [sourceId, time]`.
+
+### Compression
+
 Another option is to output the data in compressed form. All files will get the `gz` suffix, and can be decompressed with a GZIP decoder. Note that for a very small number of records, this may actually increase the file size.
 ```
 radar-hdfs-restructure --compression gzip --nameservice <hdfs_node> --output-directory <output_folder> <input_path_1> [<input_path_2> ...]
 ```

-By default, files records are not deduplicated after writing. To enable this behaviour, specify the option `--deduplicate` or `-d`. This set to false by default because of an issue with Biovotion data. Please see - [issue #16](https://github.com/RADAR-base/Restructure-HDFS-topic/issues/16) before enabling it.
+### Storage
+
+When using local storage, to set the output user ID and group ID, specify the `-p local-uid=123` and `-p local-gid=12` properties.
+
+### Service
+
+To run the output generator as a service that will regularly poll the HDFS directory, add the `--service` flag and optionally the `--interval` flag to adjust the polling interval, or use the corresponding configuration file parameters.
+
+## Local build
+
+This package requires at least Java JDK 8. Build the distribution with
+
+```shell
+./gradlew build
+```

-To set the output user ID and group ID, specify the `-p local-uid=123` and `-p local-gid=12` properties.
+and install the package into `/usr/local` with for example
+```shell
+sudo mkdir -p /usr/local
+sudo tar -xzf build/distributions/radar-hdfs-restructure-0.5.7.tar.gz -C /usr/local --strip-components=1
+```

-To run the output generator as a service that will regularly poll the HDFS directory, add the `--service` flag and optionally the `--interval` flag to adjust the polling interval.
+Now the `radar-hdfs-restructure` command should be available.

-## Extending the connector
+### Extending the connector

 To implement alternative storage paths, storage drivers or storage formats, put your custom JAR in
 `$APP_DIR/lib/radar-hdfs-plugins`. To load them, use the following options:

-| Option | Class | Behaviour | Default |
-| ----------------------- | ------------------------------------------- | ------------------------------------------ | ------------------------- |
-| `--path-factory` | `org.radarbase.hdfs.RecordPathFactory` | Factory to create output path names with. | ObservationKeyPathFactory |
-| `--storage-driver` | `org.radarbase.hdfs.data.StorageDriver` | Storage driver to use for storing data. | LocalStorageDriver |
-| `--format-factory` | `org.radarbase.hdfs.data.FormatFactory` | Factory for output formats. | FormatFactory |
-| `--compression-factory` | `org.radarbase.hdfs.data.CompressionFactory` | Factory class to use for data compression. | CompressionFactory |
+| Parameter | Base class | Behaviour | Default |
+| --------------------------- | --------------------------------------------------- | ------------------------------------------ | ------------------------- |
+| `paths: factory: ...` | `org.radarbase.hdfs.path.RecordPathFactory` | Factory to create output path names with. | ObservationKeyPathFactory |
+| `storage: factory: ...` | `org.radarbase.hdfs.storage.StorageDriver` | Storage driver to use for storing data. | LocalStorageDriver |
+| `format: factory: ...` | `org.radarbase.hdfs.format.FormatFactory` | Factory for output formats. | FormatFactory |
+| `compression: factory: ...` | `org.radarbase.hdfs.compression.CompressionFactory` | Factory class to use for data compression. | CompressionFactory |

-To pass arguments to self-assigned plugins, use `-p arg1=value1 -p arg2=value2` command-line flags and read those arguments in the `Plugin#init(Map<String, String>)` method.
+The respective `<type>: properties: {}` configuration parameters can be used to provide custom configuration of the factory. This configuration will be passed to the `Plugin#init(Map<String, String>)` method.
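
The command-line flags documented in the updated README can be combined in a single invocation. The following sketch is illustrative only: the HDFS name node, output directory, and topic paths are placeholders, and all flags used (`--format`, `--compression`, `--deduplicate`, `--nameservice`, `--output-directory`) come from the README text above.

```shell
# Illustrative invocation; replace the name node, output directory and
# topic paths with values for your own cluster.
radar-hdfs-restructure \
  --format json \
  --compression gzip \
  --deduplicate \
  --nameservice hdfs-namenode \
  --output-directory ./output \
  /topicAndroidNew/topic1 /topicAndroidNew/topic2
```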

build.gradle

Lines changed: 4 additions & 0 deletions
@@ -36,11 +36,15 @@ repositories {
 dependencies {
     api group: 'org.apache.avro', name: 'avro', version: avroVersion
     implementation group: 'com.fasterxml.jackson.core' , name: 'jackson-databind', version: jacksonVersion
+    implementation group: 'com.fasterxml.jackson.dataformat' , name: 'jackson-dataformat-yaml', version: jacksonVersion
     implementation group: 'com.fasterxml.jackson.dataformat' , name: 'jackson-dataformat-csv', version: jacksonVersion
+    implementation("com.fasterxml.jackson.module:jackson-module-kotlin:$jacksonVersion")
+
     implementation group: 'com.beust', name: 'jcommander', version: jCommanderVersion
     implementation group: 'com.almworks.integers', name: 'integers', version: almworksVersion

     implementation 'software.amazon.awssdk:s3:2.10.3'
+    implementation 'com.opencsv:opencsv:5.0'

     implementation group: 'org.apache.avro', name: 'avro-mapred', version: avroVersion
     implementation group: 'org.apache.hadoop', name: 'hadoop-common', version: hadoopVersion

restructure.yml

Lines changed: 92 additions & 0 deletions
@@ -0,0 +1,92 @@
+service:
+  # Whether to run the application as a polling service.
+  enable: false
+  # Polling interval in seconds.
+  interval: 30
+
+# Compression characteristics
+compression:
+  # Compression type: none, zip or gzip
+  type: gzip
+  # Compression Factory class
+  # factory: org.radarbase.hdfs.compression.CompressionFactory
+  # Additional compression properties
+  # properties: {}
+
+# File format
+format:
+  # Format type: CSV or JSON
+  type: csv
+  # Whether to deduplicate the files in each topic by default
+  deduplication:
+    enable: true
+    # Use specific fields to consider records distinct. Disregarded if empty.
+    # distinctFields: []
+    # Ignore specific fields to consider records distinct. Disregarded if empty.
+    # ignoreFields: []
+  # Format factory class
+  # factory: org.radarbase.hdfs.format.FormatFactory
+  # Additional format properties
+  # properties: {}
+
+# Worker settings
+worker:
+  # Maximum number of files and converters to keep open while processing
+  cacheSize: 300
+  # Number of threads to do processing on
+  numThreads: 2
+  # Maximum number of files to process in any given topic.
+  maxFilesPerTopic: null
+
+# Path settings
+paths:
+  # Input directories in HDFS
+  inputs:
+    - /topicAndroidNew
+  # Root temporary directory for local file processing.
+  temp: ./output/+tmp
+  # Output directory
+  output: ./output
+  # Output path construction factory
+  factory: org.radarbase.hdfs.path.MonthlyObservationKeyPathFactory
+  # Additional properties
+  # properties: {}
+
+# Individual topic configuration
+topics:
+  # topic name
+  connect_fitbit_source:
+    # deduplicate this topic, regardless of the format settings
+    deduplication:
+      enable: true
+      # deduplicate this topic only using given fields.
+      distinctFields: [key.sourceId, value.time]
+  # topic name
+  connect_fitbit_source2:
+    # deduplicate this topic, regardless of the format settings
+    deduplication:
+      enable: true
+      # deduplicate this topic without regard of given fields.
+      ignoreFields: [value.timeReceived]
+  connect_fitbit_bad:
+    # Do not process this topic
+    exclude: true
+  biovotion_acceleration:
+    # Disable deduplication
+    deduplication:
+      enable: false
+
+# HDFS settings
+hdfs:
+  # HDFS name node in case of a single name node, or HDFS cluster ID in case of high availability.
+  name: hdfs-namenode
+  # High availability settings:
+  # nameNodes:
+  #   - name: hdfs1
+  #     hostname: hdfs-namenode-1
+  #   - name: hdfs2
+  #     hostname: hdfs-namenode-2
+  # Where files will be locked. This value should be the same for all restructure processes.
+  lockPath: /logs/org.radarbase.hdfs/lock
+  # Additional raw HDFS configuration properties
+  # properties: {}
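
Per the README changes above, a file named `restructure.yml` in the working directory is picked up by default, and an alternative path can be selected with `-F`. A sketch of both invocations, assuming the `radar-hdfs-restructure` command is on the PATH; the `/etc/radar/restructure.yml` path is only an example:

```shell
# Uses ./restructure.yml from the working directory by default
radar-hdfs-restructure

# Explicitly select a config file; with service.enable set to true in the
# config, the process keeps polling HDFS at the configured interval.
radar-hdfs-restructure -F /etc/radar/restructure.yml
```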
