
Commit 9aa47b3

Merge branch 'develop' into separate-extractor-message-and-type

2 parents: fa7fb11 + c629530

6 files changed: +134 -22 lines

CHANGELOG.md (+5 -0)

@@ -4,6 +4,11 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](http://keepachangelog.com/)
 and this project adheres to [Semantic Versioning](http://semver.org/).
 
+## Unreleased
+
+### Added
+- Simple extractors now support datasets and can also create new datasets.
+
 ## 2.2.3 - 2019-10-14
 
 ### Fixed

docs/requirements.txt (+1 -1)

@@ -1,4 +1,4 @@
 enum34==1.1.6
 Sphinx==1.6.2
 pika==0.10.0
-PyYAML==3.11
+PyYAML==5.1

pyclowder/connectors.py (+20 -3)

@@ -761,7 +761,12 @@ def on_message(self, channel, method, header, body):
         if 'routing_key' not in json_body and method.routing_key:
             json_body['routing_key'] = method.routing_key
 
-        self.worker = RabbitMQHandler(self.extractor_name, self.extractor_info, self.check_message,
+        if 'jobid' not in json_body:
+            job_id = None
+        else:
+            job_id = json_body['jobid']
+
+        self.worker = RabbitMQHandler(self.extractor_name, self.extractor_info, job_id, self.check_message,
                                       self.process_message, self.ssl_verify, self.mounted_paths,
                                       method, header, body)
         self.worker.start_thread(json_body)
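
The hunk above pulls the job id out of the incoming extraction message. For context, a minimal sketch of such a message body, with every value invented for illustration (only 'jobid' and 'routing_key' are attested by this diff):

# Hypothetical message body after json.loads(body); values are made up.
json_body = {
    "routing_key": "extractors.wordcount",
    "jobid": "5d9f9d8e1b2c3a0001a2b3c4",  # absent on older Clowder servers
}

# A more compact equivalent of the added if/else: dict.get returns None
# when the key is missing.
job_id = json_body.get('jobid')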
@@ -836,13 +841,14 @@ class RabbitMQHandler(Connector):
     a queue of messages that the super-loop can access and send later.
     """
 
-    def __init__(self, extractor_name, extractor_info, check_message=None, process_message=None, ssl_verify=True,
+    def __init__(self, extractor_name, extractor_info, job_id, check_message=None, process_message=None, ssl_verify=True,
                  mounted_paths=None, method=None, header=None, body=None):
         super(RabbitMQHandler, self).__init__(extractor_name, extractor_info, check_message, process_message,
                                               ssl_verify, mounted_paths)
         self.method = method
         self.header = header
         self.body = body
+        self.job_id = job_id
         self.messages = []
         self.thread = None
         self.finished = False
@@ -921,6 +927,15 @@ def process_messages(self, channel, rabbitmq_queue):
 
     def status_update(self, status, resource, message):
         super(RabbitMQHandler, self).status_update(status, resource, message)
+
+        status_report = dict()
+        # TODO: Update this to check resource["type"] once Clowder better supports dataset events
+        status_report['file_id'] = resource["id"]
+        status_report['job_id'] = self.job_id
+        status_report['extractor_id'] = self.extractor_info['name']
+        status_report['status'] = "%s: %s" % (status, message)
+        status_report['start'] = pyclowder.utils.iso8601time()
+
         with self.lock:
             # TODO: Remove 'status' from payload later and read from message_type and message in Clowder 2.0
             self.messages.append({"type": "status",
@@ -959,7 +974,8 @@ class HPCConnector(Connector):
     def __init__(self, extractor_name, extractor_info, picklefile,
                  check_message=None, process_message=None, ssl_verify=True, mounted_paths=None):
         super(HPCConnector, self).__init__(extractor_name, extractor_info, check_message, process_message,
-                                           ssl_verify, mounted_paths)
+                                           ssl_verify, job_id, mounted_paths)
+        self.job_id = job_id
         self.picklefile = picklefile
         self.logfile = None

@@ -998,6 +1014,7 @@ def status_update(self, status, resource, message):
         statusreport = dict()
         statusreport['file_id'] = resource["id"]
         statusreport['extractor_id'] = self.extractor_info['name']
+        statusreport['job_id'] = self.job_id
         statusreport['status'] = "%s: %s" % (status, message)
         statusreport['start'] = time.strftime('%Y-%m-%dT%H:%M:%S')
         log.write(json.dumps(statusreport) + '\n')
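
Net effect of these connectors.py changes: status reports from both handlers now carry the job id alongside the file and extractor ids. A sketch of the assembled payload, with hypothetical values:

import time

# Illustrative values only; mirrors the statusreport dict built above.
statusreport = {
    'file_id': '5d9f9d8e1b2c3a0001a2b3c5',    # resource["id"]
    'job_id': '5d9f9d8e1b2c3a0001a2b3c4',     # self.job_id; None if the message had no 'jobid'
    'extractor_id': 'wordcount',              # self.extractor_info['name']
    'status': 'StatusMessage.processing: counting words',  # "%s: %s" % (status, message)
    'start': time.strftime('%Y-%m-%dT%H:%M:%S'),
}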

pyclowder/extractors.py (+104 -17)

@@ -308,50 +308,137 @@ def __init__(self):
         self.logger = logging.getLogger('__main__')
         self.logger.setLevel(logging.INFO)
 
+    # TODO: Support check_message() in simple extractors
+
     def process_message(self, connector, host, secret_key, resource, parameters):
         """
-        Process a clowder message. This will download the file to local disk and call the
-        process_file to do the actual processing of the file. The resulting dict is then
+        Process a clowder message. This will download the file(s) to local disk and call
+        process_file or process_dataset to do the actual processing. The resulting dict is then
         parsed and based on the keys in the dict it will upload the results to the right
         location in clowder.
         """
-        input_file = resource["local_paths"][0]
-        file_id = resource['id']
+        if 'files' in resource:
+            type = 'dataset'
+            input_files = resource['local_paths']
+            dataset_id = resource['id']
+
+        elif 'local_paths' in resource:
+            type = 'file'
+            input_file = resource['local_paths'][0]
+            file_id = resource['id']
+            dataset_id = resource['parent']['id']
+        else:
+            # TODO: Eventually support other messages such as metadata.added
+            type = 'unknown'
 
-        # call the actual function that processes the file
-        if file_id and input_file:
+        # call the actual function that processes the message
+        if type == 'file' and file_id and input_file:
             result = self.process_file(input_file)
+        elif type == 'dataset' and dataset_id and input_files:
+            result = self.process_dataset(input_files)
         else:
             result = dict()
 
-        # return information to clowder
         try:
+            # upload metadata to the processed file or dataset
             if 'metadata' in result.keys():
-                metadata = self.get_metadata(result.get('metadata'), 'file', file_id, host)
                 self.logger.info("upload metadata")
-                self.logger.debug(metadata)
-                pyclowder.files.upload_metadata(connector, host, secret_key, file_id, metadata)
+                if type == 'file':
+                    metadata = self.get_metadata(result.get('metadata'), 'file', file_id, host)
+                    self.logger.debug(metadata)
+                    pyclowder.files.upload_metadata(connector, host, secret_key, file_id, metadata)
+                elif type == 'dataset':
+                    metadata = self.get_metadata(result.get('metadata'), 'dataset', dataset_id, host)
+                    self.logger.debug(metadata)
+                    pyclowder.datasets.upload_metadata(connector, host, secret_key, dataset_id, metadata)
+                else:
+                    self.logger.error("unable to attach metadata to resource type: %s" % type)
+
+            # upload previews to the processed file
             if 'previews' in result.keys():
                 self.logger.info("upload previews")
-                for preview in result['previews']:
-                    if os.path.exists(str(preview)):
-                        preview = {'file': preview}
-                    self.logger.info("upload preview")
-                    pyclowder.files.upload_preview(connector, host, secret_key, file_id, str(preview))
+                if type == 'file':
+                    for preview in result['previews']:
+                        if os.path.exists(str(preview)):
+                            preview = {'file': preview}
+                        self.logger.info("upload preview")
+                        pyclowder.files.upload_preview(connector, host, secret_key, file_id, str(preview))
+                else:
+                    # TODO: Add Clowder endpoint (& pyclowder method) to attach previews to datasets
+                    self.logger.error("previews not currently supported for resource type: %s" % type)
+
+            # upload output files to the processed file's parent dataset or the processed dataset
+            if 'outputs' in result.keys():
+                self.logger.info("upload output files")
+                if type == 'file' or type == 'dataset':
+                    for output in result['outputs']:
+                        if os.path.exists(str(output)):
+                            pyclowder.files.upload_to_dataset(connector, host, secret_key, dataset_id, str(output))
+                else:
+                    self.logger.error("unable to upload outputs to resource type: %s" % type)
+
+            if 'new_dataset' in result.keys():
+                if type == 'dataset':
+                    nds = result['new_dataset']
+                    if 'name' not in nds.keys():
+                        self.logger.error("new datasets require a name")
+                    else:
+                        description = nds['description'] if 'description' in nds.keys() else ""
+                        new_dataset_id = pyclowder.datasets.create_empty(connector, host, secret_key, nds['name'],
+                                                                         description)
+                        self.logger.info("created new dataset: %s" % new_dataset_id)
+
+                        if 'metadata' in nds.keys():
+                            self.logger.info("upload metadata to new dataset")
+                            metadata = self.get_metadata(nds.get('metadata'), 'dataset', new_dataset_id, host)
+                            self.logger.debug(metadata)
+                            pyclowder.datasets.upload_metadata(connector, host, secret_key, new_dataset_id, metadata)
+
+                        if 'outputs' in nds.keys():
+                            self.logger.info("upload output files to new dataset")
+                            for output in nds['outputs']:
+                                if os.path.exists(str(output)):
+                                    pyclowder.files.upload_to_dataset(connector, host, secret_key, new_dataset_id,
+                                                                      str(output))
+
+                        if 'previews' in nds.keys():
+                            # TODO: Add Clowder endpoint (& pyclowder method) to attach previews to datasets
+                            self.logger.error("previews not currently supported for resource type: %s" % type)
+
         finally:
             self.cleanup_data(result)
 
     def process_file(self, input_file):
         """
         This function will process the file and return a dict that contains the result. This
         dict can have the following keys:
-        - metadata: the metadata to be associated with the file
-        - previews: files on disk with the preview to be uploaded
+        - metadata: the metadata to be associated with the processed file
+        - previews: images on disk with the preview to be uploaded to the processed file
+        - outputs: files on disk to be added to the processed file's parent dataset
        :param input_file: the file to be processed.
        :return: the specially formatted dict.
        """
        return dict()
 
+    def process_dataset(self, input_files):
+        """
+        This function will process the file list and return a dict that contains the result. This
+        dict can have the following keys:
+        - metadata: the metadata to be associated with the processed dataset
+        - outputs: files on disk to be added to the dataset
+        - previews: images to be associated with the dataset
+        - new_dataset: a dict describing a new dataset to be created for the outputs (the outputs,
+          metadata and previews contained in new_dataset are attached to the newly created dataset),
+          with the following keys:
+            - name: the name of the new dataset to be created
+            - description: description for the new dataset to be created
+            - previews: (see above)
+            - metadata: (see above)
+            - outputs: (see above)
+        :param input_files: the files to be processed.
+        :return: the specially formatted dict.
+        """
+        return dict()
+
     def cleanup_data(self, result):
        """
        Once the information is uploaded to clowder this function is called for cleanup. This
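
As a usage sketch, not taken from this commit: assuming the class modified above is the SimpleExtractor exposed by pyclowder.extractors, a subclass can now override process_dataset and return any of the keys the dispatch code understands. The class name, paths, and metadata below are invented:

import os

from pyclowder.extractors import SimpleExtractor  # assumed import path


class LineCountExtractor(SimpleExtractor):
    """Hypothetical example: count lines across every file in a dataset."""

    def process_dataset(self, input_files):
        total = 0
        for path in input_files:
            with open(path, 'rb') as fh:
                total += sum(1 for _ in fh)

        # Files listed under 'outputs' are uploaded to the processed dataset.
        out_path = os.path.join('/tmp', 'line_counts.txt')
        with open(out_path, 'w') as fh:
            fh.write('total lines: %d\n' % total)

        return {
            'metadata': {'total_lines': total},
            'outputs': [out_path],
            # 'new_dataset': {...} could be returned instead to route outputs
            # into a newly created dataset, per the docstring above.
        }


if __name__ == '__main__':
    LineCountExtractor().start()  # assumes the base Extractor's start() loop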

requirements.txt (+1 -1)

@@ -3,7 +3,7 @@ pika==1.0.0
 PyYAML==5.1
 requests==2.21.0
 wheel==0.33.1
-urllib3==1.24.1
+urllib3==1.24.2
 pytest==4.3.1
 pytest-pep8==1.0.6
 requests-toolbelt==0.9.1

sample-extractors/simple-extractor/simple_extractor.py (+3 -0)

@@ -10,3 +10,6 @@ def __init__(self, extraction):
 
     def process_file(self, input_file):
         return self.extraction(input_file)
+
+    def process_dataset(self, input_files):
+        return self.extraction(input_files)
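
With this three-line addition, the same wrapped function serves both message types: it receives a single path for file events and a list of paths for dataset events, so it should handle both shapes. A hedged wiring sketch (the extraction function is invented; the constructor signature comes from this file):

def extraction(paths):
    # File messages pass one path, dataset messages a list; normalize.
    files = paths if isinstance(paths, list) else [paths]
    return {'metadata': {'file_count': len(files)}}


extractor = SimpleExtractor(extraction)
extractor.start()  # assumes the base Extractor's start() entry point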
