Merge pull request #2 from clowder-framework/separate-extractor-message-and-type

robkooper · web-flow · commit b9738cb750d4 · 2020-08-24T11:42:56.000-05:00
Improve error handling; report the extractor status message and the message type as separate fields
diff --git a/pyclowder/collections.py b/pyclowder/collections.py
@@ -8,7 +8,6 @@
 import requests
 
 from pyclowder.client import ClowderClient
-from pyclowder.utils import StatusMessage
 
 
 def create_empty(connector, host, key, collectionname, description, parentid=None, spaceid=None):
@@ -121,8 +120,7 @@ def upload_preview(connector, host, key, collectionid, previewfile, previewmetad
                     section this preview should be associated with.
     """
 
-    connector.status_update(StatusMessage.processing, {"type": "collection", "id": collectionid},
-                            "Uploading collection preview.")
+    connector.message_process({"type": "collection", "id": collectionid}, "Uploading collection preview.")
 
     logger = logging.getLogger(__name__)
     headers = {'Content-Type': 'application/json'}
diff --git a/pyclowder/connectors.py b/pyclowder/connectors.py
@@ -230,7 +230,6 @@ def _build_resource(self, body, host, secret_key):
                     "type": "dataset",
                     "id": datasetid
                 }
-                self.status_update(pyclowder.utils.StatusMessage.error, resource, msg)
                 self.message_error(resource)
                 return None
 
@@ -392,7 +391,7 @@ def _process_message(self, body):
             self.register_extractor("%s?key=%s" % (url, secret_key))
 
         # tell everybody we are starting to process the file
-        self.status_update(pyclowder.utils.StatusMessage.start, resource, "Started processing")
+        self.status_update(pyclowder.utils.StatusMessage.start.value, resource, "Started processing.")
 
         # checks whether to process the file in this message or not
         # pylint: disable=too-many-nested-blocks
@@ -456,41 +455,41 @@ def _process_message(self, body):
                                     logger.exception("Error removing temporary dataset directory")
 
             else:
-                self.status_update(pyclowder.utils.StatusMessage.processing, resource, "Skipped in check_message")
+                self.status_update(pyclowder.utils.StatusMessage.skip.value, resource, "Skipped in check_message")
 
             self.message_ok(resource)
 
         except SystemExit as exc:
-            status = "sys.exit : " + str(exc)
-            logger.exception("[%s] %s", resource['id'], status)
-            self.status_update(pyclowder.utils.StatusMessage.error, resource, status)
-            self.message_resubmit(resource, retry_count)
+            message = str.format("sys.exit: {}", str(exc))
+            logger.exception("[%s] %s", resource['id'], message)
+            self.message_resubmit(resource, retry_count, message)
             raise
         except KeyboardInterrupt:
-            status = "keyboard interrupt"
-            logger.exception("[%s] %s", resource['id'], status)
-            self.status_update(pyclowder.utils.StatusMessage.error, resource, status)
-            self.message_resubmit(resource, retry_count)
+            message = "keyboard interrupt"
+            logger.exception("[%s] %s", resource['id'], message)
+            self.message_resubmit(resource, retry_count, message)
             raise
         except GeneratorExit:
-            status = "generator exit"
-            logger.exception("[%s] %s", resource['id'], status)
-            self.status_update(pyclowder.utils.StatusMessage.error, resource, status)
-            self.message_resubmit(resource, retry_count)
+            message = "generator exit"
+            logger.exception("[%s] %s", resource['id'], message)
+            self.message_resubmit(resource, retry_count, message)
             raise
         except subprocess.CalledProcessError as exc:
-            status = str.format("Error processing [exit code={}]\n{}", exc.returncode, exc.output)
-            logger.exception("[%s] %s", resource['id'], status)
-            self.status_update(pyclowder.utils.StatusMessage.error, resource, status)
-            self.message_error(resource)
+            message = str.format("Error in subprocess [exit code={}]:\n{}", exc.returncode, exc.output)
+            logger.exception("[%s] %s", resource['id'], message)
+            self.message_error(resource, message)
+        except PyClowderExtractionAbort as exc:
+            message = str.format("Aborting message: {}", exc.message)
+            logger.exception("[%s] %s", resource['id'], message)
+            self.message_error(resource, message)
         except Exception as exc:  # pylint: disable=broad-except
-            status = "Error processing : " + str(exc)
-            logger.exception("[%s] %s", resource['id'], status)
-            self.status_update(pyclowder.utils.StatusMessage.error, resource, status)
+            message = str(exc)
+            logger.exception("[%s] %s", resource['id'], message)
             if retry_count < 10:
-                self.message_resubmit(resource, retry_count + 1)
+                message = "(#%s) %s" % (retry_count+1, message)
+                self.message_resubmit(resource, retry_count+1, message)
             else:
-                self.message_error(resource)
+                self.message_error(resource, message)
 
     def register_extractor(self, endpoints):
         """Register extractor info with Clowder.
@@ -528,21 +527,23 @@ def status_update(self, status, resource, message):
         the instance know the progress of the extractor.
 
         Keyword arguments:
-        status - START | PROCESSING | DONE | ERROR
+        status - pyclowder.utils.StatusMessage value
         resource  - descriptor object with {"type", "id"} fields
         message - contents of the status update
         """
         logging.getLogger(__name__).info("[%s] : %s: %s", resource["id"], status, message)
 
-    def message_ok(self, resource):
-        self.status_update(pyclowder.utils.StatusMessage.done, resource, "Done processing")
+    def message_ok(self, resource, message="Done processing."):
+        self.status_update(pyclowder.utils.StatusMessage.done.value, resource, message)
+
+    def message_error(self, resource, message="Error processing message."):
+        self.status_update(pyclowder.utils.StatusMessage.error.value, resource, message)
 
-    def message_error(self, resource):
-        self.status_update(pyclowder.utils.StatusMessage.error, resource, "Error processing message")
+    def message_resubmit(self, resource, retry_count, message="Resubmitting message."):
+        self.status_update(pyclowder.utils.StatusMessage.retry.value, resource, message)
 
-    def message_resubmit(self, resource, retry_count):
-        self.status_update(pyclowder.utils.StatusMessage.processing, resource, "Resubmitting message (attempt #%s)"
-                           % retry_count)
+    def message_process(self, resource, message):
+        self.status_update(pyclowder.utils.StatusMessage.processing.value, resource, message)
 
     def get(self, url, params=None, raise_status=True, **kwargs):
         """
@@ -877,19 +878,22 @@ def process_messages(self, channel, rabbitmq_queue):
             with self.lock:
                 msg = self.messages.pop(0)
 
+            # PROCESSING - Standard update message during extractor processing
             if msg["type"] == 'status':
                 if self.header.reply_to:
                     properties = pika.BasicProperties(delivery_mode=2, correlation_id=self.header.correlation_id)
                     channel.basic_publish(exchange='',
                                           routing_key=self.header.reply_to,
                                           properties=properties,
-                                          body=json.dumps(msg['status']))
+                                          body=json.dumps(msg['payload']))
 
+            # DONE - Extractor finished without error
             elif msg["type"] == 'ok':
                 channel.basic_ack(self.method.delivery_tag)
                 with self.lock:
                     self.finished = True
 
+            # ERROR - Extractor encountered error and message goes to error queue
             elif msg["type"] == 'error':
                 properties = pika.BasicProperties(delivery_mode=2, reply_to=self.header.reply_to)
                 channel.basic_publish(exchange='',
@@ -900,18 +904,18 @@ def process_messages(self, channel, rabbitmq_queue):
                 with self.lock:
                     self.finished = True
 
+            # RESUBMITTING - Extractor encountered error and message is resubmitted to same queue
             elif msg["type"] == 'resubmit':
-                retry_count = msg['retry_count']
-                queue = rabbitmq_queue
-                properties = pika.BasicProperties(delivery_mode=2, reply_to=self.header.reply_to)
                 jbody = json.loads(self.body)
-                jbody['retry_count'] = retry_count
+                jbody['retry_count'] = msg['retry_count']
                 if 'exchange' not in jbody and self.method.exchange:
                     jbody['exchange'] = self.method.exchange
-                if 'routing_key' not in jbody and self.method.routing_key and self.method.routing_key != queue:
+                if 'routing_key' not in jbody and self.method.routing_key and self.method.routing_key != rabbitmq_queue:
                     jbody['routing_key'] = self.method.routing_key
+
+                properties = pika.BasicProperties(delivery_mode=2, reply_to=self.header.reply_to)
                 channel.basic_publish(exchange='',
-                                      routing_key=queue,
+                                      routing_key=rabbitmq_queue,
                                       properties=properties,
                                       body=json.dumps(jbody))
                 channel.basic_ack(self.method.delivery_tag)
@@ -923,31 +927,35 @@ def process_messages(self, channel, rabbitmq_queue):
 
     def status_update(self, status, resource, message):
         super(RabbitMQHandler, self).status_update(status, resource, message)
-        status_report = dict()
-        # TODO: Update this to check resource["type"] once Clowder better supports dataset events
-        status_report['file_id'] = resource["id"]
-        status_report['job_id'] = self.job_id
-        status_report['extractor_id'] = self.extractor_info['name']
-        status_report['status'] = "%s: %s" % (status, message)
-        status_report['start'] = pyclowder.utils.iso8601time()
+
         with self.lock:
+            # TODO: Remove 'status' from payload later and read from message_type and message in Clowder 2.0
             self.messages.append({"type": "status",
-                                  "status": status_report,
                                   "resource": resource,
-                                  "message": message})
-
-    def message_ok(self, resource):
-        super(RabbitMQHandler, self).message_ok(resource)
+                                  "payload": {
+                                      "file_id":      resource["id"],
+                                      "extractor_id": self.extractor_info['name'],
+                                      "job_id":       self.job_id,
+                                      "status":       "%s: %s" % (status, message),
+                                      "start":        pyclowder.utils.iso8601time(),
+                                      "message_type": status,
+                                      "message":      message
+                                  }})
+
+    def message_ok(self, resource, message="Done processing."):
+        super(RabbitMQHandler, self).message_ok(resource, message)
         with self.lock:
             self.messages.append({"type": "ok"})
 
-    def message_error(self, resource):
-        super(RabbitMQHandler, self).message_error(resource)
+    def message_error(self, resource, message="Error processing message."):
+        super(RabbitMQHandler, self).message_error(resource, message)
         with self.lock:
             self.messages.append({"type": "error"})
 
-    def message_resubmit(self, resource, retry_count):
-        super(RabbitMQHandler, self).message_resubmit(resource, retry_count)
+    def message_resubmit(self, resource, retry_count, message=None):
+        if message is None:
+            message = "(#%s)" % retry_count
+        super(RabbitMQHandler, self).message_resubmit(resource, retry_count, message)
         with self.lock:
             self.messages.append({"type": "resubmit", "retry_count": retry_count})
 
@@ -1105,3 +1113,14 @@ def put(self, url, data=None, raise_status=True, **kwargs):
     def delete(self, url, raise_status=True, **kwargs):
         logging.getLogger(__name__).debug("DELETE: " + url)
         return None
+
+
+class PyClowderExtractionAbort(Exception):
+    """Exception that is not subject to retry attempts (i.e. for errors that are expected to fail again).
+
+    Attributes:
+        message -- explanation of the error
+    """
+
+    def __init__(self, message):
+        self.message = message
diff --git a/pyclowder/datasets.py b/pyclowder/datasets.py
@@ -12,7 +12,6 @@
 
 from pyclowder.client import ClowderClient
 from pyclowder.collections import get_datasets, get_child_collections, delete as delete_collection
-from pyclowder.utils import StatusMessage
 
 
 def create_empty(connector, host, key, datasetname, description, parentid=None, spaceid=None):
@@ -113,7 +112,7 @@ def download(connector, host, key, datasetid):
     datasetid -- the file that is currently being processed
     """
 
-    connector.status_update(StatusMessage.processing, {"type": "dataset", "id": datasetid}, "Downloading dataset.")
+    connector.message_process({"type": "dataset", "id": datasetid}, "Downloading dataset.")
 
     # fetch dataset zipfile
     url = '%sapi/datasets/%s/download?key=%s' % (host, datasetid, key)
@@ -287,8 +286,7 @@ def upload_metadata(connector, host, key, datasetid, metadata):
     metadata -- the metadata to be uploaded
     """
 
-    connector.status_update(StatusMessage.processing, {"type": "dataset", "id": datasetid},
-                            "Uploading dataset metadata.")
+    connector.message_process({"type": "dataset", "id": datasetid}, "Uploading dataset metadata.")
 
     headers = {'Content-Type': 'application/json'}
     url = '%sapi/datasets/%s/metadata.jsonld?key=%s' % (host, datasetid, key)
diff --git a/pyclowder/files.py b/pyclowder/files.py
@@ -14,7 +14,6 @@
 
 from pyclowder.datasets import get_file_list
 from pyclowder.collections import get_datasets, get_child_collections
-from pyclowder.utils import StatusMessage
 
 # Some sources of urllib3 support warning suppression, but not all
 try:
@@ -38,7 +37,7 @@ def download(connector, host, key, fileid, intermediatefileid=None, ext=""):
     ext -- the file extension, the downloaded file will end with this extension
     """
 
-    connector.status_update(StatusMessage.processing, {"type": "file", "id": fileid}, "Downloading file.")
+    connector.message_process({"type": "file", "id": fileid}, "Downloading file.")
 
     # TODO: intermediateid doesn't really seem to be used here, can we remove entirely?
     if not intermediatefileid:
@@ -180,7 +179,7 @@ def upload_metadata(connector, host, key, fileid, metadata):
     metadata -- the metadata to be uploaded
     """
 
-    connector.status_update(StatusMessage.processing, {"type": "file", "id": fileid}, "Uploading file metadata.")
+    connector.message_process({"type": "file", "id": fileid}, "Uploading file metadata.")
 
     headers = {'Content-Type': 'application/json'}
     url = '%sapi/files/%s/metadata.jsonld?key=%s' % (host, fileid, key)
@@ -204,7 +203,7 @@ def upload_preview(connector, host, key, fileid, previewfile, previewmetadata=No
                     file itself and this parameter can be ignored. E.g. 'application/vnd.clowder+custom+xml'
     """
 
-    connector.status_update(StatusMessage.processing, {"type": "file", "id": fileid}, "Uploading file preview.")
+    connector.message_process({"type": "file", "id": fileid}, "Uploading file preview.")
 
     logger = logging.getLogger(__name__)
     headers = {'Content-Type': 'application/json'}
@@ -248,7 +247,7 @@ def upload_tags(connector, host, key, fileid, tags):
     tags -- the tags to be uploaded
     """
 
-    connector.status_update(StatusMessage.processing, {"type": "file", "id": fileid}, "Uploading file tags.")
+    connector.message_process({"type": "file", "id": fileid}, "Uploading file tags.")
 
     headers = {'Content-Type': 'application/json'}
     url = '%sapi/files/%s/tags?key=%s' % (host, fileid, key)
diff --git a/pyclowder/sections.py b/pyclowder/sections.py
@@ -8,8 +8,6 @@
 
 import requests
 
-from pyclowder.utils import StatusMessage
-
 
 def upload(connector, host, key, sectiondata):
     """Upload section to Clowder.
@@ -47,7 +45,7 @@ def upload_tags(connector, host, key, sectionid, tags):
     tags -- the tags to be uploaded
     """
 
-    connector.status_update(StatusMessage.processing, {"type": "section", "id": sectionid}, "Uploading section tags.")
+    connector.message_process({"type": "section", "id": sectionid}, "Uploading section tags.")
 
     headers = {'Content-Type': 'application/json'}
     url = '%sapi/sections/%s/tags?key=%s' % (host, sectionid, key)
@@ -67,8 +65,8 @@ def upload_description(connector, host, key, sectionid, description):
     description -- the description to be uploaded
     """
 
-    connector.status_update(StatusMessage.processing, {"type": "section", "id": sectionid},
-                            "Uploading section description.")
+    connector.message_process({"type": "section", "id": sectionid},
+                              "Uploading section description.")
 
     headers = {'Content-Type': 'application/json'}
     url = '%sapi/sections/%s/description?key=%s' % (host, sectionid, key)
diff --git a/pyclowder/utils.py b/pyclowder/utils.py
@@ -44,10 +44,12 @@ class StatusMessage(Enum):
     full string will be STATUS: MESSAGE.
     """
 
-    start = "START"
+    start = "STARTED"
     processing = "PROCESSING"
-    done = "DONE"
+    done = "SUCCEEDED"
+    skip = "SKIPPED"
     error = "ERROR"
+    retry = "RESUBMITTED"
 
 
 def iso8601time():
diff --git a/sample-extractors/wordcount/wordcount.py b/sample-extractors/wordcount/wordcount.py
@@ -32,24 +32,30 @@ def process_message(self, connector, host, secret_key, resource, parameters):
         inputfile = resource["local_paths"][0]
         file_id = resource['id']
 
-        # call actual program
+        # These process messages will appear in the Clowder UI under Extractions.
+        connector.message_process(resource, "Loading contents of file...")
+
+        # Call actual program
         result = subprocess.check_output(['wc', inputfile], stderr=subprocess.STDOUT)
         result = result.decode('utf-8')
         (lines, words, characters, _) = result.split()
 
-        # store results as metadata
+        connector.message_process(resource, "Found %s lines and %s words..." % (lines, words))
+
+        # Store results as metadata
         result = {
             'lines': lines,
             'words': words,
             'characters': characters
         }
         metadata = self.get_metadata(result, 'file', file_id, host)
+
+        # Normal logs will appear in the extractor log, but NOT in the Clowder UI.
         logger.debug(metadata)
 
-        # upload metadata
+        # Upload metadata to original file
         pyclowder.files.upload_metadata(connector, host, secret_key, file_id, metadata)
 
-
 if __name__ == "__main__":
     extractor = WordCount()
     extractor.start()