Commit 1073a5e

Merge pull request #78 in CATS/pyclowder2 from r-extractor to develop

* commit '2ebddbb5b9dba23d259c62b1322a8c91f7ac8451':
  - use launcher script
  - Add missing Dockerfile example
  - Created generic SimpleExtractor class used by both the R and Python implementations.
  - initial example of r extractor wordcount example

2 parents: 14ce71b + 2ebddbb

13 files changed (+357, -38)

docker.sh (3 additions, 1 deletion)

```diff
@@ -11,7 +11,9 @@ ${DEBUG} docker build --tag clowder/pyclowder:latest .
 ${DEBUG} docker build --tag clowder/pyclowder:onbuild --file Dockerfile.onbuild .
 ${DEBUG} docker build --tag clowder/extractors-binary-preview:onbuild sample-extractors/binary-preview
 ${DEBUG} docker build --tag clowder/extractors-simple-extractor:onbuild sample-extractors/simple-extractor
+${DEBUG} docker build --tag clowder/extractors-simple-r-extractor:onbuild sample-extractors/simple-r-extractor
 
 # build sample extractors
 ${DEBUG} docker build --tag clowder/extractors-wordcount:latest sample-extractors/wordcount
-${DEBUG} docker build --tag clowder/extractors-wordcount-simpleextractor:latest sample-extractors/wordcount-simple-extractor
+${DEBUG} docker build --tag clowder/extractors-wordcount-simple-extractor:latest sample-extractors/wordcount-simple-extractor
+${DEBUG} docker build --tag clowder/extractors-wordcount-simple-r-extractor:latest sample-extractors/wordcount-simple-r-extractor
```

pyclowder/extractors.py (78 additions, 0 deletions)

```diff
@@ -21,6 +21,7 @@
 
 from pyclowder.connectors import RabbitMQConnector, HPCConnector, LocalConnector
 from pyclowder.utils import CheckMessage, setup_logging
+import pyclowder.files
 
 
 class Extractor(object):
@@ -287,3 +288,80 @@ def process_message(self, connector, host, secret_key, resource, parameters):
             parameters (dict): the message received
         """
         logging.getLogger(__name__).debug("default process message : " + str(parameters))
+
+
+class SimpleExtractor(Extractor):
+    """
+    Simple extractor. All that needs to be done is to extend the process_file function.
+    """
+
+    def __init__(self):
+        """
+        Initialize the extractor and set up the logger.
+        """
+        Extractor.__init__(self)
+        self.setup()
+
+        # setup logging for the extractor
+        logging.getLogger('pyclowder').setLevel(logging.INFO)
+        self.logger = logging.getLogger('__main__')
+        self.logger.setLevel(logging.INFO)
+
+    def process_message(self, connector, host, secret_key, resource, parameters):
+        """
+        Process a Clowder message. This will download the file to local disk and call
+        process_file to do the actual processing of the file. The resulting dict is then
+        parsed and, based on the keys in the dict, the results are uploaded to the right
+        location in Clowder.
+        """
+        input_file = resource["local_paths"][0]
+        file_id = resource['id']
+
+        # call the actual function that processes the file
+        if file_id and input_file:
+            result = self.process_file(input_file)
+        else:
+            result = dict()
+
+        # return information to clowder
+        try:
+            if 'metadata' in result.keys():
+                metadata = self.get_metadata(result.get('metadata'), 'file', file_id, host)
+                self.logger.info("upload metadata")
+                self.logger.debug(metadata)
+                pyclowder.files.upload_metadata(connector, host, secret_key, file_id, metadata)
+            if 'previews' in result.keys():
+                self.logger.info("upload previews")
+                for preview in result['previews']:
+                    preview = str(preview)
+                    if os.path.exists(preview):
+                        self.logger.info("upload preview")
+                        pyclowder.files.upload_preview(connector, host, secret_key, file_id, preview)
+        finally:
+            self.cleanup_data(result)
+
+    def process_file(self, input_file):
+        """
+        This function will process the file and return a dict that contains the result. This
+        dict can have the following keys:
+        - metadata: the metadata to be associated with the file
+        - previews: files on disk with the previews to be uploaded
+        :param input_file: the file to be processed.
+        :return: the specially formatted dict.
+        """
+        return dict()
+
+    def cleanup_data(self, result):
+        """
+        Once the information is uploaded to Clowder this function is called for cleanup. This
+        enables the extractor to remove any preview images or clean up other resources
+        that were opened. The argument is the same dict as returned by process_file.
+
+        The default behaviour is to remove all the files listed in previews.
+
+        :param result: the result returned from process_file.
+        """
+        for preview in result.get("previews", []):
+            if os.path.exists(preview):
+                os.remove(preview)
```
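For reference, a complete extractor built on the new class can be as small as the sketch below (not part of this commit's diff; the WordCount name and the metadata it returns are illustrative). Only process_file needs to be implemented; downloading the file, uploading metadata, and cleaning up previews are all handled by SimpleExtractor.process_message above.

```python
#!/usr/bin/env python
from pyclowder.extractors import SimpleExtractor


class WordCount(SimpleExtractor):
    def process_file(self, input_file):
        # count the words in the downloaded file and return them as metadata
        with open(input_file) as f:
            words = len(f.read().split())
        return {"metadata": {"words": words}}


WordCount().start()
```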

release.sh (1 addition, 1 deletion)

```diff
@@ -37,7 +37,7 @@ for i in pyclowder extractors-wordcount; do
     done
 done
 
-for i in pyclowder extractors-binary-preview extractors-simple-extractor; do
+for i in pyclowder extractors-binary-preview extractors-simple-extractor extractors-simple-r-extractor; do
     for v in ${VERSION}; do
         if [ "$v" != "latest" ]; then
             ${DEBUG} docker tag clowder/${i}:onbuild ${SERVER}clowder/${i}:${v}-onbuild
```

sample-extractors/simple-extractor/Dockerfile (1 addition, 1 deletion)

```diff
@@ -20,4 +20,4 @@ ONBUILD RUN if [ -e requirements.txt ]; then \
 # copy all files
 ONBUILD ADD . /home/clowder/
 
-CMD python -c "from simple_extractor import SimpleExtractor; from ${EXTRACTION_MODULE} import *; SimpleExtractor(${EXTRACTION_FUNC}).start()"
+CMD python -c "from simple_extractor import SimplePythonExtractor; from ${EXTRACTION_MODULE} import *; SimplePythonExtractor(${EXTRACTION_FUNC}).start()"
```

sample-extractors/simple-extractor/README.md (5 additions, 6 deletions)

````diff
@@ -31,9 +31,8 @@ can contain either metadata information ("metadata"), details about file previews
 
 2. Let's create a Dockerfile for your extractor. Dockerfile contents need to be:
 
-FROM clowder/extractors-simple-extractor:onbuild
-ENV EXTRACTION_FUNC="your_main_function"
-ENV EXTRACTION_MODULE="your_python_program"
-
-
-
+```Dockerfile
+FROM clowder/extractors-simple-extractor:onbuild
+ENV EXTRACTION_FUNC="your_main_function"
+ENV EXTRACTION_MODULE="your_python_program"
+```
````
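For reference, `EXTRACTION_MODULE` and `EXTRACTION_FUNC` name your own Python file and the function inside it. A minimal sketch of such a module, using the placeholder names from the Dockerfile above (the word-count body is illustrative):

```python
# your_python_program.py -- the module named by EXTRACTION_MODULE
def your_main_function(input_file):
    """Named by EXTRACTION_FUNC: takes a file path, returns the result dict."""
    with open(input_file) as f:
        words = len(f.read().split())
    # "metadata" is uploaded to Clowder; "previews" would list preview files on disk
    return {"metadata": {"words": words}}
```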
sample-extractors/simple-extractor/simple_extractor.py (5 additions, 29 deletions)

```diff
@@ -1,36 +1,12 @@
 #!/usr/bin/env python
 
-import logging
-from pyclowder.extractors import Extractor
-import pyclowder.files
+from pyclowder.extractors import SimpleExtractor
 
 
-class SimpleExtractor(Extractor):
+class SimplePythonExtractor(SimpleExtractor):
     def __init__(self, extraction):
-        Extractor.__init__(self)
+        SimpleExtractor.__init__(self)
         self.extraction = extraction
-        self.setup()
-        # setup logging for the exctractor
-        logging.getLogger('pyclowder').setLevel(logging.INFO)
-        self.logger = logging.getLogger('__main__')
-        self.logger.setLevel(logging.INFO)
 
-    def process_message(self, connector, host, secret_key, resource, parameters):
-        input_file = resource["local_paths"][0]
-        file_id = resource['id']
-        result = self.extraction(input_file)
-        if 'metadata' in result.keys():
-            metadata = self.get_metadata(result.get('metadata'), 'file', file_id, host)
-            self.logger.info("upload metadata")
-            self.logger.debug(metadata)
-            pyclowder.files.upload_metadata(connector, host, secret_key, file_id, metadata)
-        if 'previews' in result.keys():
-            self.logger.info("upload previews")
-            for preview in result['previews']:
-                if isinstance(preview, basestring):
-                    preview = {'file': preview}
-                else:
-                    continue
-                self.logger.info("upload preview")
-                pyclowder.files.upload_preview(connector, host, secret_key, file_id, preview.get('file'),
-                                               preview.get('metadata'), preview.get('mimetype'))
+    def process_file(self, input_file):
+        return self.extraction(input_file)
```
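With this change, the simple-extractor Dockerfile's CMD (see above) is just a thin launcher around this class. Unrolled from the `python -c` one-liner, it is equivalent to the following sketch (module and function names are the README's placeholders; the actual CMD uses `import *`):

```python
from simple_extractor import SimplePythonExtractor
from your_python_program import your_main_function  # ${EXTRACTION_MODULE} / ${EXTRACTION_FUNC}

SimplePythonExtractor(your_main_function).start()
```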
sample-extractors/simple-r-extractor/Dockerfile (32 additions, new file)

```diff
@@ -0,0 +1,32 @@
+FROM clowder/pyclowder:latest
+
+ENV R_SCRIPT="" \
+    R_FUNCTION=""
+
+RUN echo "deb http://cran.rstudio.com/bin/linux/ubuntu xenial/" > /etc/apt/sources.list.d/R.list \
+    && apt-key adv --keyserver keyserver.ubuntu.com --recv E084DAB9 \
+    && apt-get -q -q update \
+    && apt-get -y install --no-install-recommends r-base-core r-base-dev \
+    && rm -rf /var/lib/apt/lists/* \
+    && Rscript --vanilla -e "install.packages('jsonlite', repos='http://cran.rstudio.com/'); print(system.file(package = 'jsonlite')); q(status=as.integer(system.file(package = 'jsonlite') == ''))"
+
+COPY r_extractor.py launcher.R /home/clowder/
+
+# install any packages
+ONBUILD COPY packages.* Dockerfile /home/clowder/
+ONBUILD RUN if [ -e packages.apt ]; then \
+        apt-get -q -q update \
+        && xargs apt-get -y install --no-install-recommends < packages.apt \
+        && rm -rf /var/lib/apt/lists/*; \
+    fi
+
+# install any r packages
+ONBUILD COPY docker.R* Dockerfile /home/clowder/
+ONBUILD RUN if [ -e docker.R ]; then \
+        Rscript docker.R; \
+    fi
+
+# copy all files
+ONBUILD ADD . /home/clowder/
+
+CMD python r_extractor.py
```
sample-extractors/simple-r-extractor/README.md (128 additions, new file; contents below)

# Introduction

Clowder is an open-source research data management system that supports curation of long-tail data and metadata across multiple research domains and diverse data types. It uses a metadata extraction bus to perform data curation. Extractors are software programs that extract specific metadata from a file or dataset (a group of related files). The Simple Extractor Wrapper is a piece of software being developed to make the process of developing an extractor easier. This document describes how to write an extractor program using the Simple Extractor Wrapper.
# Goals of Simple Extractor Wrapper

An extractor can be written in any programming language as long as it can communicate with Clowder using a simple HTTP web service API and RabbitMQ. It can be hard to develop an extractor from scratch when you also consider the code needed for this communication. To reduce this effort and avoid code duplication, we created libraries written in Python (PyClowder) and Java (JClowder) that make writing extractors easy in these languages. We chose these languages since they are among the most popular and are likely to remain so. Even so, there is still some overhead in developing an extractor with these libraries. To make the process of writing extractors even easier, we created the Simple Extractor Wrapper, which wraps around your existing source code and turns it into an extractor. As the name says, the extractor itself needs to be simple in nature: it processes a file and generates metadata in JSON format and/or creates a file preview. Other Clowder API endpoints are not currently available through the Simple Extractor; for those the developer has to fall back to PyClowder, JClowder, or writing the extractor from scratch.
# Creating an Extractor

The main function of your program needs to accept the file path of the input file as a string. It also needs to return an object containing either metadata information ("metadata"), details about file previews ("previews"), or both, in the following format:

```json
{
    "metadata": {},
    "previews": []
}
```

The metadata sub-document contains the metadata that is uploaded directly back to Clowder and associated with the file. The previews array is a list of filenames of previews that will be uploaded to Clowder and associated with the file. Once the previews are uploaded they are removed from disk.

When writing the code for the extractor you don't have to worry about the interaction with Clowder or any of its subpieces; you can test your code locally in your development environment by calling the function that processes the file and seeing whether the result matches the output described above.
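This contract is language-independent. As a concrete illustration, a Python function satisfying it might look like the sketch below (the word-count logic is illustrative, analogous to the wordcount examples in this commit):

```python
def process_file(input_file):
    # compute something from the file and return the expected dict shape
    with open(input_file) as f:
        words = len(f.read().split())
    return {"metadata": {"words": words}}

# local test, with no Clowder or RabbitMQ involved
result = process_file("test.txt")
assert set(result.keys()) <= {"metadata", "previews"}
```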
# Using the Extractor in Clowder

Once you are done with the extractor and have tested your code, you can wrap the extractor in a Docker image and test this image in the full Clowder environment. To do this you will need to create a Dockerfile and an extractor_info.json file, as well as some optional additional files needed by the Docker build process. Once you have these files you can build your image using `docker build -t extractor-example .`. This will build the Docker image and tag it with the name extractor-example (you should replace this with a better name).

The Dockerfile has two environment variables that need to be set:
- R_SCRIPT : the path on disk to the file that needs to be sourced for the function. This can be left blank if no file needs to be sourced (for example, when the function is installed as part of a package).
- R_FUNCTION : the name of the function to call; it takes a file path as input and returns an object that contains the data described above.

Two additional files can be used when creating the Docker image:
- packages.apt : a list of Ubuntu packages to install from the default Ubuntu repositories.
- docker.R : an R script that is run during the Docker build process. This can be used to install any required R packages. Another option is to use it to install your code if it is provided as an R package.

An example of the Dockerfile is:

```Dockerfile
FROM clowder/extractors-simple-r-extractor:onbuild

ENV R_SCRIPT="wordcount.R" \
    R_FUNCTION="process_file"
```
There also has to be an extractor_info.json file, which contains information about the extractor and is used by the extractor framework to initialize the extractor, as well as to upload information about the extractor to Clowder.

```json
{
    "@context": "<context root URL>",
    "name": "<extractor name>",
    "version": "<version number>",
    "description": "<extractor description>",
    "author": "<first name> <last name> <<email address>>",
    "contributors": [
        "<first name> <last name> <<email address>>",
        "<first name> <last name> <<email address>>"
    ],
    "contexts": [
        {
            "<metadata term 1>": "<URL definition of metadata term 1>",
            "<metadata term 2>": "<URL definition of metadata term 2>"
        }
    ],
    "repository": [
        {
            "repType": "git",
            "repUrl": "<source code URL>"
        }
    ],
    "process": {
        "file": [
            "<MIME type/subtype>",
            "<MIME type/subtype>"
        ]
    },
    "external_services": [],
    "dependencies": [],
    "bibtex": []
}
```
Once the image with the extractor is built, you can test the extractor in the Clowder environment. To do this you will need to start Clowder first. This can be done using a single [docker-compose file](https://opensource.ncsa.illinois.edu/bitbucket/projects/CATS/repos/pyclowder2/raw/docker-compose.yml). You can start the full Clowder stack using `docker-compose -p clowder up` in the same folder where you downloaded the docker-compose file. After some time you will have an instance of Clowder running that you can access at http://localhost:9000/ (if you use Docker with VirtualBox the URL will probably be http://192.168.99.100:9000/).

If this is the first time you have started Clowder you will need to create an account. You will be asked to enter an email address (use [email protected]). If you look at the console where you started Clowder using docker-compose you will see some text and a URL of the form http://localhost:9000/signup/57d93076-7eca-418e-be7e-4a06c06f3259. If you follow this URL you will be able to create an account for Clowder. If you used the [email protected] email address, this account will have admin privileges.

Once you have the full Clowder stack running, you can start your extractor using `docker run --rm -ti --network clowder_clowder extractor-example`. This will start the extractor and show its output on the command line. Once the extractor has started successfully, you can upload an appropriate file and the output should show that it is being processed by the extractor. At this point you have successfully created an extractor and deployed it in Clowder.
sample-extractors/simple-r-extractor/launcher.R (27 additions, new file)

```diff
@@ -0,0 +1,27 @@
+#!/usr/bin/env Rscript
+
+# get environment variables
+r_script <- Sys.getenv('R_SCRIPT')
+r_function <- Sys.getenv('R_FUNCTION')
+if (r_function == '') {
+    stop("Need a function to call.")
+}
+
+# command line arguments
+args <- commandArgs(trailingOnly=TRUE)
+if (length(args) != 2) {
+    stop("Need 2 arguments (input_file, json_file)")
+}
+input_file <- args[1]
+json_file <- args[2]
+
+# source script file
+if (r_script != '') {
+    source(r_script)
+}
+
+# call function
+result <- do.call(r_function, list(input_file))
+
+# write result as json
+write(jsonlite::toJSON(result, auto_unbox=TRUE), json_file)
```
sample-extractors/simple-r-extractor/r_extractor.py (17 additions, new file)

```diff
@@ -0,0 +1,17 @@
+#!/usr/bin/env python
+
+import json
+import subprocess
+import tempfile
+
+from pyclowder.extractors import SimpleExtractor
+
+
+class RExtractor(SimpleExtractor):
+    def process_file(self, input_file):
+        with tempfile.NamedTemporaryFile(suffix=".json") as json_file:
+            subprocess.check_call(['/home/clowder/launcher.R', input_file, json_file.name])
+            return json.load(json_file.file)
+
+
+RExtractor().start()
```
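The temp-file hand-off keeps the R code unaware of RabbitMQ and Clowder: launcher.R only reads a file path and writes JSON, which also makes the R side easy to exercise on its own. A local smoke test might look like this sketch (assumes launcher.R is executable in the current directory, `test.txt` exists, and R_SCRIPT/R_FUNCTION are set in the environment):

```python
# invoke launcher.R by hand, the same way RExtractor.process_file does
import json
import subprocess
import tempfile

with tempfile.NamedTemporaryFile(suffix=".json") as json_file:
    subprocess.check_call(['./launcher.R', 'test.txt', json_file.name])
    print(json.load(json_file.file))
```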
