
Commit b168d12

Merge pull request #63 from RADAR-base/release-1.0.0
radar-output-restructure release 1.0.0
2 parents 779d34c + 5dcf128


92 files changed (+2830, -1519 lines)


.travis.yml

Lines changed: 18 additions & 11 deletions
```diff
@@ -1,20 +1,27 @@
 language: java
-# Force use of dist where oraclejdk8 is supported
-dist: trusty
-jdk:
-  - oraclejdk8
-sudo: false
+sudo: required
+
+services:
+  - docker
+
+before_cache:
+  - rm -f $HOME/.gradle/caches/modules-2/modules-2.lock
+  - rm -fr $HOME/.gradle/caches/*/plugin-resolution/
+  - rm -f $HOME/.gradle/caches/*/*/*.lock
+  - rm -fr $HOME/.gradle/caches/journal-1

 cache:
   directories:
-    - $HOME/.gradle/caches/jars-1
-    - $HOME/.gradle/caches/jars-2
-    - $HOME/.gradle/caches/jars-3
-    - $HOME/.gradle/caches/modules-2/files-2.1/
-    - $HOME/.gradle/native
-    - $HOME/.gradle/wrapper
+    - $HOME/.gradle/caches/
+    - $HOME/.gradle/wrapper/
+
+env:
+  - DOCKER_COMPOSE_VERSION=1.25.4

 before_install:
+  - curl -L https://github.com/docker/compose/releases/download/${DOCKER_COMPOSE_VERSION}/docker-compose-`uname -s`-`uname -m` > docker-compose
+  - chmod +x docker-compose
+  - sudo mv docker-compose /usr/local/bin
   - ./gradlew downloadDependencies

 deploy:
```
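The new Travis steps install Docker Compose so the build can start the services this release depends on (Redis, and optionally an S3-compatible store) for integration tests. A minimal sketch of what such a compose file could look like follows; the service names, images, and ports are illustrative assumptions, not the repository's actual docker-compose.yml:

```yaml
# Illustrative docker-compose.yml sketch for local integration testing.
# Images, ports, and service names are assumptions, not taken from the repo.
version: '3'
services:
  redis:
    # used by radar-output-restructure for locking and offset management
    image: redis:5-alpine
    ports:
      - "6379:6379"
  minio:
    # S3-compatible object store for exercising the s3 source/target settings
    image: minio/minio
    command: server /data
    environment:
      MINIO_ACCESS_KEY: minioadmin
      MINIO_SECRET_KEY: minioadmin
    ports:
      - "9000:9000"
```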

Dockerfile

Lines changed: 5 additions & 8 deletions
```diff
@@ -10,7 +10,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-FROM openjdk:8-alpine AS builder
+FROM openjdk:14 AS builder

 RUN mkdir /code
 WORKDIR /code
@@ -29,19 +29,16 @@ COPY ./src /code/src

 RUN ./gradlew jar

-FROM gradiant/hadoop-base:3.1.2
+FROM openjdk:14

 MAINTAINER Joris Borgdorff <[email protected]>, Yatharth Ranjan<[email protected]>

-LABEL description="RADAR-base HDFS data restructuring"
+LABEL description="RADAR-base output data restructuring"

-ENV JAVA_OPTS="-Djava.library.path=${HADOOP_HOME}/lib/native -Djava.security.egd=file:/dev/./urandom -XX:+UseG1GC -XX:MaxHeapFreeRatio=10 -XX:MinHeapFreeRatio=10" \
-    LD_LIBRARY_PATH=/lib64
-
-RUN apk add --no-cache libc6-compat
+ENV JAVA_OPTS="-Djava.security.egd=file:/dev/./urandom -XX:+UseG1GC -XX:MaxHeapFreeRatio=10 -XX:MinHeapFreeRatio=10"

 COPY --from=builder /code/build/third-party/* /usr/lib/
 COPY --from=builder /code/build/scripts/* /usr/bin/
 COPY --from=builder /code/build/libs/* /usr/lib/

-ENTRYPOINT ["radar-hdfs-restructure"]
+ENTRYPOINT ["radar-output-restructure"]
```

Dockerfile.hdfs

Lines changed: 47 additions & 0 deletions
```diff
@@ -0,0 +1,47 @@
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+FROM openjdk:8-alpine AS builder
+
+RUN mkdir /code
+WORKDIR /code
+
+ENV GRADLE_OPTS -Dorg.gradle.daemon=false
+
+COPY ./gradle /code/gradle
+COPY ./gradlew /code/
+RUN ./gradlew --version
+
+COPY ./build.gradle ./gradle.properties ./settings.gradle /code/
+
+RUN ./gradlew downloadDependencies copyDependencies startScripts
+
+COPY ./src /code/src
+
+RUN ./gradlew jar
+
+FROM gradiant/hadoop-base:3.1.2
+
+MAINTAINER Joris Borgdorff <[email protected]>, Yatharth Ranjan<[email protected]>
+
+LABEL description="RADAR-base HDFS data restructuring"
+
+ENV JAVA_OPTS="-Djava.library.path=${HADOOP_HOME}/lib/native -Djava.security.egd=file:/dev/./urandom -XX:+UseG1GC -XX:MaxHeapFreeRatio=10 -XX:MinHeapFreeRatio=10" \
+    LD_LIBRARY_PATH=/lib64
+
+RUN apk add --no-cache libc6-compat
+
+COPY --from=builder /code/build/third-party/* /usr/lib/
+COPY --from=builder /code/build/scripts/* /usr/bin/
+COPY --from=builder /code/build/libs/* /usr/lib/
+
+ENTRYPOINT ["radar-output-restructure"]
```

README.md

Lines changed: 101 additions & 41 deletions
````diff
@@ -1,46 +1,97 @@
-# Restructure HDFS files
+# Restructure Kafka connector output files

 [![Build Status](https://travis-ci.org/RADAR-base/Restructure-HDFS-topic.svg?branch=master)](https://travis-ci.org/RADAR-base/Restructure-HDFS-topic)

-Data streamed to HDFS using the [RADAR HDFS sink connector](https://github.com/RADAR-base/RADAR-HDFS-Sink-Connector) is streamed to files based on sensor only. This package can transform that output to a local directory structure as follows: `userId/topic/date_hour.csv`. The date and hour is extracted from the `time` field of each record, and is formatted in UTC time. This package is included in the [RADAR-Docker](https://github.com/RADAR-base/RADAR-Docker) repository, in the `dcompose/radar-cp-hadoop-stack/hdfs_restructure.sh` script.
-
-_Note_: when upgrading to version 0.6.0, please follow the following instructions:
-- Write configuration file `restructure.yml` to match settings used with 0.5.x.
+Data streamed by a Kafka Connector will be converted to a RADAR-base oriented output directory, by organizing it by project, user and collection date.
+It supports data written by the [RADAR HDFS sink connector](https://github.com/RADAR-base/RADAR-HDFS-Sink-Connector), which streams records to files based on topic name only. This package transforms that output to a local directory structure as follows: `projectId/userId/topic/date_hour.csv`. The date and hour are extracted from the `time` field of each record, and are formatted in UTC time. This package is included in the [RADAR-Docker](https://github.com/RADAR-base/RADAR-Docker) repository, in the `dcompose/radar-cp-hadoop-stack/bin/hdfs-restructure` script.
+
+## Upgrade instructions
+
+When upgrading to version 1.0.0 from version 0.6.0, please follow these instructions:
+
+- This package now relies on Redis for locking and offset management. Please install Redis or use
+  the docker-compose.yml file to start it.
+- Write configuration file `restructure.yml` to match settings used with 0.6.0.
+- HDFS settings have moved to `source`. Specify all name nodes in the `nameNodes`
+  property. The `name` property is no longer used.
+
+  ```yaml
+  source:
+    type: hdfs
+    hdfs:
+      nameNodes: [hdfs-namenode]
+  ```
+- Add a `redis` block:
+
+  ```yaml
+  redis:
+    uri: redis://localhost:6379
+  ```
+- Offset accounting will automatically be migrated from file-based storage to a Redis entry
+  as radar-output processes the topic. Please do not remove the offsets directory until it is
+  empty.
+- Storage settings have moved to the `target` block. Using a local output directory:
+
+  ```yaml
+  target:
+    type: local
+    local:
+      # User ID to write data as
+      userId: 123
+      # Group ID to write data as
+      groupId: 123
+  ```
+
+  With the `S3StorageDriver`, use the following configuration instead:
+  ```yaml
+  target:
+    type: s3
+    s3:
+      endpoint: https://my-region.s3.aws.amazon.com # or http://localhost:9000 for local minio
+      accessToken: ABA...
+      secretKey: CSD...
+      bucket: myBucketName
+  ```
+
+When upgrading to version 0.6.0 from version 0.5.x or earlier, please follow these instructions:
+- Write configuration file `restructure.yml` to match command-line settings used with 0.5.x.
 - If needed, move all entries of `offsets.csv` to their per-topic file in `offsets/<topic>.csv`. First go to the output directory, then run the `bin/migrate-offsets-to-0.6.0.sh` script.

 ## Docker usage

-This package is available as docker image [`radarbase/radar-hdfs-restructure`](https://hub.docker.com/r/radarbase/radar-hdfs-restructure). The entrypoint of the image is the current application. So in all of the commands listed in usage, replace `radar-hdfs-restructure` with for example:
+This package is available as docker image [`radarbase/radar-output-restructure`](https://hub.docker.com/r/radarbase/radar-output-restructure). The entrypoint of the image is the current application. So in all the commands listed in usage, replace `radar-output-restructure` with for example:
 ```shell
-docker run --rm -t --network hadoop -v "$PWD/output:/output" radarbase/radar-hdfs-restructure:0.6.0 -n hdfs-namenode -o /output /myTopic
+docker run --rm -t --network hadoop -v "$PWD/output:/output" radarbase/radar-output-restructure:1.0.0-hdfs -n hdfs-namenode -o /output /myTopic
 ```
 if your docker cluster is running in the `hadoop` network and your output directory should be `./output`.

+Docker image tags that are optimized for HDFS are suffixed with `-hdfs`. Otherwise, please use the image without that suffix.
+
 ## Command line usage

 When the application is installed, it can be used as follows:

 ```shell
-radar-hdfs-restructure --nameservice <hdfs_node> --output-directory <output_folder> <input_path_1> [<input_path_2> ...]
+radar-output-restructure --nameservice <hdfs_node> --output-directory <output_folder> <input_path_1> [<input_path_2> ...]
 ```
-or you can use the short form as well like -
+or you can use the short form as well:
 ```shell
-radar-hdfs-restructure -n <hdfs_node> -o <output_folder> <input_path_1> [<input_path_2> ...]
+radar-output-restructure -n <hdfs_node> -o <output_folder> <input_path_1> [<input_path_2> ...]
 ```

-To display the usage and all available options you can use the help option as follows -
+To display the usage and all available options, you can use the help option as follows:
 ```shell
-radar-hdfs-restructure --help
+radar-output-restructure --help
 ```
-Note that the options preceded by the `*` in the above output are required to run the app. Also note that there can be multiple input paths from which to read the files. Eg - `/topicAndroidNew/topic1 /topicAndroidNew/topic2 ...`. At least one input path is required.
+Note that the options preceded by `*` in the above output are required to run the app. Also note that there can be multiple input paths from which to read files, e.g. `/topicAndroidNew/topic1 /topicAndroidNew/topic2 ...`. Provide at least one input path.

 Each argument, as well as much more, can be supplied in a config file. The default name of the config file is `restructure.yml`. Please refer to `restructure.yml` in the current directory for all available options. An alternative file can be specified with the `-F` flag.

 ### File Format

 By default, this will output the data in CSV format. If JSON format is preferred, use the following instead:
 ```shell
-radar-hdfs-restructure --format json --nameservice <hdfs_node> --output-directory <output_folder> <input_path_1> [<input_path_2> ...]
+radar-output-restructure --format json --nameservice <hdfs_node> --output-directory <output_folder> <input_path_1> [<input_path_2> ...]
 ```

 By default, file records are not deduplicated after writing. To enable this behaviour, specify the option `--deduplicate` or `-d`. This is set to false by default because of an issue with Biovotion data; please see [issue #16](https://github.com/RADAR-base/Restructure-HDFS-topic/issues/16) before enabling it. Deduplication can also be enabled or disabled per topic using the config file. If lines should be deduplicated using a subset of fields, e.g. only `sourceId` and `time` define a unique record and only the last record with duplicate values should be kept, then specify `topics: <topicName>: deduplication: distinctFields: [key.sourceId, value.time]`.
````
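The per-topic deduplication setting quoted in that last paragraph nests inside `restructure.yml`. A minimal sketch of that nesting follows; the topic name is a placeholder, and the `enable` key is an assumption about the config schema rather than a documented option:

```yaml
# Sketch of per-topic deduplication in restructure.yml.
# "my_topic" is a placeholder; "enable" is an assumed key name.
topics:
  my_topic:
    deduplication:
      enable: true
      # keep only the last record per unique (sourceId, time) combination
      distinctFields: [key.sourceId, value.time]
```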
````diff
@@ -49,35 +100,45 @@ By default, file records are not deduplicated after writing. To enable this beh

 Another option is to output the data in compressed form. All files will get the `gz` suffix, and can be decompressed with a GZIP decoder. Note that for a very small number of records, this may actually increase the file size. Zip compression is also available.
 ```
-radar-hdfs-restructure --compression gzip --nameservice <hdfs_node> --output-directory <output_folder> <input_path_1> [<input_path_2> ...]
+radar-output-restructure --compression gzip --nameservice <hdfs_node> --output-directory <output_folder> <input_path_1> [<input_path_2> ...]
 ```

-### Storage
+### Redis
+
+This package assumes a running Redis service. See the example `restructure.yml` for configuration options.

-There are two storage drivers implemented: `org.radarbase.hdfs.storage.LocalStorageDriver` for an output directory on the local file system or `org.radarbase.hdfs.storage.S3StorageDriver` for storage on an object store.
+### Source and target
+
+The `source` and `target` properties contain resource descriptions. The source can have two types, `hdfs` and `s3`:

-`LocalStorageDriver` takes the following properties:
 ```yaml
-storage:
-  factory: org.radarbase.hdfs.storage.LocalStorageDriver
-  properties:
-    # User ID to write data as
-    localUid: 123
-    # Group ID to write data as
-    localGid: 123
+source:
+  type: s3 # hdfs or s3
+  s3:
+    endpoint: http://localhost:9000 # using an AWS S3 endpoint is also possible.
+    bucket: radar
+    accessToken: minioadmin
+    secretKey: minioadmin
+  # only actually needed if source type is hdfs
+  hdfs:
+    nameNodes: [hdfs-namenode-1, hdfs-namenode-2]
 ```

-With the `S3StorageDriver`, use the following configuration instead:
+The target is similar, but instead of HDFS it supports the local file system (`local`) or `s3`.
+
 ```yaml
-storage:
-  factory: org.radarbase.hdfs.storage.S3StorageDriver
-  properties:
-    # Object store URL
-    s3EndpointUrl: s3://my-region.s3.aws.amazon.com
-    # Bucket to use
-    s3Bucket: myBucketName
+target:
+  type: s3 # s3 or local
+  s3:
+    endpoint: http://localhost:9000
+    bucket: out
+    accessToken: minioadmin
+    secretKey: minioadmin
+  # only actually needed if target type is local
+  local:
+    userId: 1000 # write as regular user, use -1 to use current user (default).
+    groupId: 100 # write as regular group, use -1 to use current group (default).
 ```
-Ensure that the environment variables contain the authorized AWS keys that allow the service to list, download and upload files to the respective bucket.

 ### Service

````
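Combining the `redis`, `source` and `target` blocks from this hunk, a minimal end-to-end `restructure.yml` might look like the sketch below. It only rearranges values shown above (minio defaults for an S3 source, a local target), so treat it as an illustration rather than a reference configuration:

```yaml
# Minimal restructure.yml sketch assembled from the blocks above.
redis:
  uri: redis://localhost:6379

source:
  type: s3
  s3:
    endpoint: http://localhost:9000
    bucket: radar
    accessToken: minioadmin
    secretKey: minioadmin

target:
  type: local
  local:
    userId: 1000  # -1 to use the current user (default)
    groupId: 100  # -1 to use the current group (default)
```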
````diff
@@ -94,21 +155,20 @@ This package requires at least Java JDK 8. Build the distribution with
 and install the package into `/usr/local` with for example
 ```shell
 sudo mkdir -p /usr/local
-sudo tar -xzf build/distributions/radar-hdfs-restructure-0.6.0.tar.gz -C /usr/local --strip-components=1
+sudo tar -xzf build/distributions/radar-output-restructure-1.0.0.tar.gz -C /usr/local --strip-components=1
 ```

-Now the `radar-hdfs-restructure` command should be available.
+Now the `radar-output-restructure` command should be available.

 ### Extending the connector

 To implement alternative storage paths, storage drivers or storage formats, put your custom JAR in
-`$APP_DIR/lib/radar-hdfs-plugins`. To load them, use the following options:
+`$APP_DIR/lib/radar-output-plugins`. To load them, use the following options:

 | Parameter                   | Base class                                             | Behaviour                                  | Default                   |
 | --------------------------- | ------------------------------------------------------ | ------------------------------------------ | ------------------------- |
-| `paths: factory: ...`       | `org.radarbase.hdfs.path.RecordPathFactory`            | Factory to create output path names with.  | ObservationKeyPathFactory |
-| `storage: factory: ...`     | `org.radarbase.hdfs.storage.StorageDriver`             | Storage driver to use for storing data.    | LocalStorageDriver        |
-| `format: factory: ...`      | `org.radarbase.hdfs.format.FormatFactory`              | Factory for output formats.                | FormatFactory             |
-| `compression: factory: ...` | `org.radarbase.hdfs.compression.CompressionFactory`    | Factory class to use for data compression. | CompressionFactory        |
+| `paths: factory: ...`       | `org.radarbase.output.path.RecordPathFactory`          | Factory to create output path names with.  | ObservationKeyPathFactory |
+| `format: factory: ...`      | `org.radarbase.output.format.FormatFactory`            | Factory for output formats.                | FormatFactory             |
+| `compression: factory: ...` | `org.radarbase.output.compression.CompressionFactory`  | Factory class to use for data compression. | CompressionFactory        |

 The respective `<type>: properties: {}` configuration parameters can be used to provide custom configuration of the factory. This configuration will be passed to the `Plugin#init(Map<String, String>)` method.
````
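To make the `<type>: properties: {}` pattern concrete, a custom compression plugin might be wired up as in the sketch below; `org.example.MyCompressionFactory` and its `level` property are hypothetical placeholders, with the JAR dropped into `$APP_DIR/lib/radar-output-plugins`:

```yaml
# Hypothetical plugin configuration; the factory class and its "level"
# property are placeholders, not part of radar-output-restructure.
compression:
  factory: org.example.MyCompressionFactory
  properties:
    level: "9"  # passed verbatim to Plugin#init(Map<String, String>)
```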
