Commit 0751f96 (1 parent: 5f20e20)

Created generic SimpleExtractor class used by both the R and Python implementations.
Initial example of an R extractor (wordcount).

File tree: 12 files changed (+327, -38 lines)

docker.sh (3 additions, 1 deletion)

```diff
@@ -11,7 +11,9 @@ ${DEBUG} docker build --tag clowder/pyclowder:latest .
 ${DEBUG} docker build --tag clowder/pyclowder:onbuild --file Dockerfile.onbuild .
 ${DEBUG} docker build --tag clowder/extractors-binary-preview:onbuild sample-extractors/binary-preview
 ${DEBUG} docker build --tag clowder/extractors-simple-extractor:onbuild sample-extractors/simple-extractor
+${DEBUG} docker build --tag clowder/extractors-simple-r-extractor:onbuild sample-extractors/simple-r-extractor
 
 # build sample extractors
 ${DEBUG} docker build --tag clowder/extractors-wordcount:latest sample-extractors/wordcount
-${DEBUG} docker build --tag clowder/extractors-wordcount-simpleextractor:latest sample-extractors/wordcount-simple-extractor
+${DEBUG} docker build --tag clowder/extractors-wordcount-simple-extractor:latest sample-extractors/wordcount-simple-extractor
+${DEBUG} docker build --tag clowder/extractors-wordcount-simple-r-extractor:latest sample-extractors/wordcount-simple-r-extractor
```

pyclowder/extractors.py (78 additions, 0 deletions)

```diff
@@ -21,6 +21,7 @@
 
 from pyclowder.connectors import RabbitMQConnector, HPCConnector, LocalConnector
 from pyclowder.utils import CheckMessage, setup_logging
+import pyclowder.files
 
 
 class Extractor(object):
@@ -287,3 +288,80 @@ def process_message(self, connector, host, secret_key, resource, parameters):
             parameters (dict): the message received
         """
         logging.getLogger(__name__).debug("default process message : " + str(parameters))
+
+
+class SimpleExtractor(Extractor):
+    """
+    Simple extractor. All that needs to be done is to extend the process_file function.
+    """
+
+    def __init__(self):
+        """
+        Initialize the extractor and set up the logger.
+        """
+        Extractor.__init__(self)
+        self.setup()
+
+        # set up logging for the extractor
+        logging.getLogger('pyclowder').setLevel(logging.INFO)
+        self.logger = logging.getLogger('__main__')
+        self.logger.setLevel(logging.INFO)
+
+    def process_message(self, connector, host, secret_key, resource, parameters):
+        """
+        Process a Clowder message. This downloads the file to local disk and calls
+        process_file to do the actual processing. The resulting dict is then parsed
+        and, based on the keys in the dict, the results are uploaded to the right
+        location in Clowder.
+        """
+        input_file = resource["local_paths"][0]
+        file_id = resource['id']
+
+        # call the actual function that processes the file
+        if file_id and input_file:
+            result = self.process_file(input_file)
+        else:
+            result = dict()
+
+        # return information to clowder
+        try:
+            if 'metadata' in result.keys():
+                metadata = self.get_metadata(result.get('metadata'), 'file', file_id, host)
+                self.logger.info("upload metadata")
+                self.logger.debug(metadata)
+                pyclowder.files.upload_metadata(connector, host, secret_key, file_id, metadata)
+            if 'previews' in result.keys():
+                self.logger.info("upload previews")
+                for preview in result['previews']:
+                    # only upload previews that actually exist on disk
+                    if os.path.exists(str(preview)):
+                        self.logger.info("upload preview")
+                        pyclowder.files.upload_preview(connector, host, secret_key, file_id, str(preview))
+        finally:
+            self.cleanup_data(result)
+
+    def process_file(self, input_file):
+        """
+        Process the file and return a dict that contains the result. This dict can
+        have the following keys:
+        - metadata: the metadata to be associated with the file
+        - previews: files on disk with the previews to be uploaded
+        :param input_file: the file to be processed.
+        :return: the specially formatted dict.
+        """
+        return dict()
+
+    def cleanup_data(self, result):
+        """
+        Once the information is uploaded to Clowder this function is called for
+        cleanup. This enables the extractor to remove preview images and clean up
+        any other resources that were opened. The argument is the same dict as
+        returned by process_file.
+
+        The default behaviour is to remove all the files listed in previews.
+
+        :param result: the result returned from process_file.
+        """
+        for preview in result.get("previews", []):
+            if os.path.exists(preview):
+                os.remove(preview)
```

release.sh (1 addition, 1 deletion)

```diff
@@ -37,7 +37,7 @@ for i in pyclowder extractors-wordcount; do
     done
 done
 
-for i in pyclowder extractors-binary-preview extractors-simple-extractor; do
+for i in pyclowder extractors-binary-preview extractors-simple-extractor extractors-simple-r-extractor; do
     for v in ${VERSION}; do
         if [ "$v" != "latest" ]; then
             ${DEBUG} docker tag clowder/${i}:onbuild ${SERVER}clowder/${i}:${v}-onbuild
```

sample-extractors/simple-extractor/Dockerfile (1 addition, 1 deletion)

```diff
@@ -20,4 +20,4 @@ ONBUILD RUN if [ -e requirements.txt ]; then \
 # copy all files
 ONBUILD ADD . /home/clowder/
 
-CMD python -c "from simple_extractor import SimpleExtractor; from ${EXTRACTION_MODULE} import *; SimpleExtractor(${EXTRACTION_FUNC}).start()"
+CMD python -c "from simple_extractor import SimplePythonExtractor; from ${EXTRACTION_MODULE} import *; SimplePythonExtractor(${EXTRACTION_FUNC}).start()"
```

sample-extractors/simple-extractor/README.md (5 additions, 6 deletions)

````diff
@@ -31,9 +31,8 @@ can contain either metadata information ("metadata"), details about file preview
 
 2. Let's create a Dockerfile for your extractor. Dockerfile contents need to be:
 
-    FROM clowder/extractors-simple-extractor:onbuild
-    ENV EXTRACTION_FUNC="your_main_function"
-    ENV EXTRACTION_MODULE="your_python_program"
-
-
-
+    ```Dockerfile
+    FROM clowder/extractors-simple-extractor:onbuild
+    ENV EXTRACTION_FUNC="your_main_function"
+    ENV EXTRACTION_MODULE="your_python_program"
+    ```
````
sample-extractors/simple-extractor/simple_extractor.py (5 additions, 29 deletions)

```diff
@@ -1,36 +1,12 @@
 #!/usr/bin/env python
 
-import logging
-from pyclowder.extractors import Extractor
-import pyclowder.files
+from pyclowder.extractors import SimpleExtractor
 
 
-class SimpleExtractor(Extractor):
+class SimplePythonExtractor(SimpleExtractor):
     def __init__(self, extraction):
-        Extractor.__init__(self)
+        SimpleExtractor.__init__(self)
         self.extraction = extraction
-        self.setup()
-        # setup logging for the exctractor
-        logging.getLogger('pyclowder').setLevel(logging.INFO)
-        self.logger = logging.getLogger('__main__')
-        self.logger.setLevel(logging.INFO)
 
-    def process_message(self, connector, host, secret_key, resource, parameters):
-        input_file = resource["local_paths"][0]
-        file_id = resource['id']
-        result = self.extraction(input_file)
-        if 'metadata' in result.keys():
-            metadata = self.get_metadata(result.get('metadata'), 'file', file_id, host)
-            self.logger.info("upload metadata")
-            self.logger.debug(metadata)
-            pyclowder.files.upload_metadata(connector, host, secret_key, file_id, metadata)
-        if 'previews' in result.keys():
-            self.logger.info("upload previews")
-            for preview in result['previews']:
-                if isinstance(preview, basestring):
-                    preview = {'file': preview}
-                else:
-                    continue
-                self.logger.info("upload preview")
-                pyclowder.files.upload_preview(connector, host, secret_key, file_id, preview.get('file'),
-                                               preview.get('metadata'), preview.get('mimetype'))
+    def process_file(self, input_file):
+        return self.extraction(input_file)
```
sample-extractors/simple-r-extractor/Dockerfile (new file, 31 additions)

```Dockerfile
FROM clowder/pyclowder

ENV R_SCRIPT="" \
    R_FUNCTION=""

RUN apt-get -q -q update \
    && apt-get -y install --no-install-recommends r-base-core r-base-dev python-dev \
    && rm -rf /var/lib/apt/lists/* \
    && pip install rpy2==2.8.6
RUN Rscript --vanilla -e "install.packages('jsonlite', repos='http://cran.rstudio.com/'); print(system.file(package = 'jsonlite')); q(status=as.integer(system.file(package = 'jsonlite') == ''))"

COPY r_extractor.py /home/clowder/

# install any packages
ONBUILD COPY packages.* Dockerfile /home/clowder/
ONBUILD RUN if [ -e packages.apt ]; then \
        apt-get -q -q update \
        && xargs apt-get -y install --no-install-recommends < packages.apt \
        && rm -rf /var/lib/apt/lists/*; \
    fi

# install any r packages
ONBUILD COPY docker.R* Dockerfile /home/clowder/
ONBUILD RUN if [ -e docker.R ]; then \
        Rscript docker.R; \
    fi

# copy all files
ONBUILD ADD . /home/clowder/

CMD python r_extractor.py
```
sample-extractors/simple-r-extractor/README.md (new file, 118 additions)
# Introduction

Clowder is an open-source research data management system that supports curation of long-tail data and metadata across multiple research domains and diverse data types. It uses a metadata extraction bus to perform data curation. Extractors are software programs that extract specific metadata from a file or dataset (a group of related files). The Simple Extractor Wrapper is a piece of software being developed to make the process of developing an extractor easier. This document describes how to write an extractor program using the Simple Extractor Wrapper.
# Goals of the Simple Extractor Wrapper

An extractor can be written in any programming language as long as it can communicate with Clowder using a simple HTTP web service API and RabbitMQ. Developing an extractor from scratch can be hard when you also consider the code needed for this communication. To reduce this effort and to avoid code duplication, we created libraries written in Python (PyClowder) and Java (JClowder) to make writing extractors easy in these languages. We chose these languages since they are among the most popular and are likely to remain so. Even with these libraries, there is still some overhead in developing an extractor. To make the process of writing extractors even easier, we created the Simple Extractor Wrapper, which wraps around your existing source code and turns it into an extractor. As the name says, the extractor itself needs to be simple in nature: it processes a file and generates metadata in JSON format and/or creates a file preview. Other Clowder API endpoints are not currently available through the Simple Extractor; for those, the developer has to fall back to using PyClowder, JClowder, or writing the extractor from scratch.
# Creating an Extractor

The main function of your program needs to accept the file path of the input file as a string. It also needs to return an object containing metadata information ("metadata"), details about file previews ("previews"), or both, in the following format:

```json
{
    "metadata": {},
    "previews": []
}
```

The metadata sub-document contains the metadata that is uploaded back to Clowder and associated with the file. The previews array is a list of filenames of previews that will be uploaded to Clowder and associated with the file. Once the previews are uploaded they are removed from the drive.

When writing the extractor code you don't have to worry about the interaction with Clowder at all: you can test your code locally in your development environment by calling the function that processes the file and checking that the result matches the output described above.
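
As a concrete illustration, here is a minimal sketch of such a function in R. The function name, the metadata fields, and the preview content are only examples; the wrapper fixes nothing but the argument and the shape of the returned list:

```R
# A minimal sketch of an extractor function: it takes the path of the
# input file and returns a list matching the JSON shape above.
extract_info <- function(input_file) {
  info <- file.info(input_file)
  # a preview must be a file on disk; here we just write a tiny text file
  preview <- tempfile(fileext = ".txt")
  writeLines(sprintf("%s bytes", info$size), preview)
  list(metadata = list(size_bytes = info$size),
       previews = list(preview))
}
```

You can test this locally with a call such as `extract_info("test.txt")` before wrapping it in a Docker image.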
# Using the Extractor in Clowder

Once you are done with the extractor and have tested your code, you can wrap the extractor in a Docker image and test this image in the full Clowder environment. To do this you will need to create a Dockerfile and an extractor_info.json file, as well as some optional additional files needed by the Docker build process. Once you have these files you can build your image using `docker build -t extractor-example .`. This builds the Docker image and tags it with the name extractor-example (you should replace this with a better name).
The Dockerfile has two environment variables that need to be set:

- R_SCRIPT : the path on disk to the file that needs to be sourced for the function. This can be left blank if no file needs to be sourced (for example, when the function is installed as a package).
- R_FUNCTION : the name of the function to be called; it takes a file path as input and returns an object containing the data described above.

There can be two additional files that are used when creating the Docker image:

- packages.apt : a list of Ubuntu packages to be installed from the default Ubuntu repositories.
- docker.R : an R script that is run during the Docker build process. This can be used to install any required R packages (see the sketch below). Another option is to install your code if it is provided as an R package.
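
For example, a hypothetical docker.R that installs one extra CRAN package could look like the following (stringr stands in for whatever your function actually needs):

```R
# docker.R: runs once at image build time.
# 'stringr' is only an example dependency; substitute your own.
install.packages('stringr', repos = 'http://cran.rstudio.com/')
# fail the build early if the package did not install
if (system.file(package = 'stringr') == '') quit(status = 1)
```

The final check mirrors the way the base image verifies its own jsonlite install.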
There also has to be an extractor_info.json file, which contains information about the extractor and is used by the extractor framework to initialize the extractor as well as to upload information about the extractor to Clowder.
```json
{
    "@context": "<context root URL>",
    "name": "<extractor name>",
    "version": "<version number>",
    "description": "<extractor description>",
    "author": "<first name> <last name> <<email address>>",
    "contributors": [
        "<first name> <last name> <<email address>>",
        "<first name> <last name> <<email address>>"
    ],
    "contexts": [
        {
            "<metadata term 1>": "<URL definition of metadata term 1>",
            "<metadata term 2>": "<URL definition of metadata term 2>"
        }
    ],
    "repository": [
        {
            "repType": "git",
            "repUrl": "<source code URL>"
        }
    ],
    "process": {
        "file": [
            "<MIME type/subtype>",
            "<MIME type/subtype>"
        ]
    },
    "external_services": [],
    "dependencies": [],
    "bibtex": []
}
```
Once the image with the extractor is built you can test the extractor in the Clowder environment. To do this you will need to start Clowder first. This can be done using a single [docker-compose file](https://opensource.ncsa.illinois.edu/bitbucket/projects/CATS/repos/pyclowder2/raw/docker-compose.yml). You can start the full Clowder stack using `docker-compose -p clowder up` in the same folder where you downloaded the docker-compose file. After some time you will have an instance of Clowder running that you can access at http://localhost:9000/ (if you use Docker with VirtualBox the URL will probably be http://192.168.99.100:9000/).
If this is the first time you have started Clowder you will need to create an account. You will be asked to enter an email address (use [email protected]). If you look at the console where you started Clowder using docker-compose you will see some text and a URL of the form http://localhost:9000/signup/57d93076-7eca-418e-be7e-4a06c06f3259. If you follow this URL you will be able to create an account for Clowder. If you used the [email protected] email address the account will have admin privileges.

Once you have the full Clowder stack running, you can start your extractor using `docker run --rm -ti --network clowder_clowder extractor-example`. This starts the extractor and shows its output on the command line. Once the extractor has started successfully, you can upload an appropriate file and you should see it being processed by the extractor. At this point you have successfully created an extractor and deployed it in Clowder.
sample-extractors/simple-r-extractor/r_extractor.py (new file, 25 additions)
@@ -0,0 +1,25 @@
1+
#!/usr/bin/env python
2+
3+
import json
4+
import os
5+
6+
from pyclowder.extractors import SimpleExtractor
7+
import rpy2.robjects as robjects
8+
9+
r_script = os.getenv("R_SCRIPT")
10+
r_function = os.getenv("R_FUNCTION")
11+
12+
13+
class RExtractor(SimpleExtractor):
14+
def process_file(self, input_file):
15+
r_result = robjects.r('''
16+
if ("%s" != "") {
17+
source("%s")
18+
}
19+
result <- do.call("%s", list("%s"))
20+
jsonlite::toJSON(result, auto_unbox=TRUE)
21+
''' % (r_script, r_script, r_function, input_file))
22+
return json.loads(str(r_result))
23+
24+
25+
RExtractor().start()
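
For reference, with the settings from the wordcount Dockerfile below (R_SCRIPT="wordcount.R", R_FUNCTION="process_file") and a hypothetical downloaded file /tmp/input.txt, the string handed to robjects.r expands to:

```R
# expansion of the template above after %s substitution
# (/tmp/input.txt is an illustrative path; the real one is wherever
# pyclowder downloaded the file)
if ("wordcount.R" != "") {
  source("wordcount.R")
}
result <- do.call("process_file", list("/tmp/input.txt"))
jsonlite::toJSON(result, auto_unbox=TRUE)
```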
sample-extractors/wordcount-simple-r-extractor/Dockerfile (new file, 4 additions)

```Dockerfile
FROM clowder/extractors-simple-r-extractor:onbuild

ENV R_SCRIPT="wordcount.R" \
    R_FUNCTION="process_file"
```
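
The wordcount.R script itself is not shown in this diff view; a process_file function compatible with the settings above might look like the following sketch (only the argument and the shape of the returned list are fixed by the wrapper):

```R
# wordcount.R (sketch): count lines and words in the input file and
# return them as metadata. The commit's actual script may differ.
process_file <- function(input_file) {
  lines <- readLines(input_file, warn = FALSE)
  words <- scan(input_file, what = character(), quiet = TRUE)
  list(metadata = list(lines = length(lines), words = length(words)))
}
```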
