Skip to content

Commit 90b2a60

Browse files
authored
Merge pull request #34 from mwvgroup/tjraen/gcs2bq
tjraen/gcs2bq
2 parents 9b2d306 + a1a7fd2 commit 90b2a60

File tree

11 files changed

+538
-38
lines changed

11 files changed

+538
-38
lines changed
Lines changed: 141 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,141 @@
1+
#!/usr/bin/env python3
2+
# -*- coding: UTF-8 -*-
3+
4+
"""This module is intended to be deployed as a Google Cloud Function so that it
5+
listens to a Google Cloud Storage (GCS) bucket. When a new file is detected in
6+
the bucket (Avro file format expected), it will automatically load it into a
7+
BigQuery (BQ) table and publish a message to PubSub (PS). This code borrows
8+
heavily from
9+
https://cloud.google.com/bigquery/docs/loading-data-cloud-storage-avro.
10+
11+
Usage Example
12+
-------------
13+
14+
First, check that the buckets (GCS), datasets (BQ), tables (BQ), and topics (PS)
15+
referenced in the ``bucket_resources`` dictionary (below) point to the
16+
appropriate Google Cloud Platform (GCP) resources. (These should have been
17+
initialized during the GCP setup, see
18+
https://pitt-broker.readthedocs.io/en/latest/installation.html#setting-up-gcp.)
19+
Buckets and datasets must exist (with appropriate permissions) prior to
20+
invoking this module. Tables are created automatically and on-the-fly if they
21+
don't already exist.
22+
23+
Deploy the ``stream_GCS_to_BQ`` function by running the following command in
24+
the directory where this module is located. Be sure to replace
25+
``<YOUR_TRIGGER_BUCKET_NAME>`` with the name of the GCS bucket that this
26+
function should listen to. For more information, see
27+
https://cloud.google.com/functions/docs/calling/storage.
28+
29+
.. code-block:: bash
30+
:linenos:
31+
32+
gcloud functions deploy stream_GCS_to_BQ --runtime python37 --set-env-vars
33+
GOOGLE_CLOUD_PROJECT=${GOOGLE_CLOUD_PROJECT} --trigger-resource
34+
<YOUR_TRIGGER_BUCKET_NAME> --trigger-event google.storage.object.finalize
35+
36+
The script ``broker/deploy_cloudfnc.sh`` automates the deployment.
37+
38+
Module Documentation
39+
--------------------
40+
"""
41+
42+
import logging
import os
from typing import Optional

from google.cloud import bigquery
from google.cloud import pubsub
from google.cloud.pubsub_v1.publisher.futures import Future
48+
# Module-level logger and client are created once at cold start and then
# reused across Cloud Function invocations.
log = logging.getLogger(__name__)
PROJECT_ID = os.getenv('GOOGLE_CLOUD_PROJECT')
BQ = bigquery.Client()

# The bucket_resources dictionary determines which BQ table the alert data
# will be uploaded to, based on which GCS bucket the alert Avro file is
# stored in, plus the PubSub topic notified after a successful upload.
ztf_bucket = '_'.join([PROJECT_ID, 'ztf_alert_avro_bucket'])
testing_bucket = '_'.join([PROJECT_ID, 'testing_bucket'])
bucket_resources = {
    ztf_bucket: {
        'BQ_DATASET': 'ztf_alerts',
        'BQ_TABLE': 'alerts',
        'PS_TOPIC': 'ztf_alerts_in_BQ'},
    testing_bucket: {
        'BQ_DATASET': 'testing_dataset',
        'BQ_TABLE': 'test_GCS_to_BQ',
        'PS_TOPIC': 'test_alerts_in_BQ'},
}
67+
68+
def stream_GCS_to_BQ(data: dict, context: dict) -> Optional[str]:
    """Load a newly finalized GCS Avro file into BigQuery.

    Executed whenever a file is added to Cloud Storage (deployed with the
    ``google.storage.object.finalize`` trigger). Most of this function is
    taken from
    https://cloud.google.com/bigquery/docs/loading-data-cloud-storage-avro

    Args:
        data: Event payload; must contain the ``bucket`` and ``name`` keys
            identifying the finalized file.
        context: Event metadata (unused).

    Returns:
        None on success (a PubSub message is also published), otherwise a
        string describing the error. (The original annotation ``-> str`` was
        wrong: the success path returns ``load_job.error_result``, which is
        None when the load succeeds.)
    """

    bucket_name = data['bucket']
    file_name = data['name']

    # Configure an append-only Avro load job
    job_config = bigquery.LoadJobConfig()
    job_config.write_disposition = bigquery.WriteDisposition.WRITE_APPEND
    job_config.source_format = bigquery.SourceFormat.AVRO
    uri = f'gs://{bucket_name}/{file_name}'

    try:
        BQ_TABLE_ID = get_BQ_TABLE_ID(bucket_name)
    except KeyError as e:
        # The triggering bucket has no entry in ``bucket_resources``
        msg = (f'GCS bucket {e} does not have an associated BigQuery dataset '
               f'configured for the `stream_GCS_to_BQ` Cloud Function. '
               f'Data in {file_name} cannot be uploaded to BigQuery.')
        log.error(msg)
        return f'GCS bucket {e} not configured'  # used in testing

    # API request
    load_job = BQ.load_table_from_uri(uri, BQ_TABLE_ID, job_config=job_config)
    msg = (f'Starting stream_GCS_to_BQ job {load_job.job_id} | '
           f'file name: {file_name} | '
           f'GCS Bucket: {bucket_name} | '
           f'BQ Table ID: {BQ_TABLE_ID}'
           )
    log.info(msg)

    # Run the job: start it, wait for it to complete, get the result
    load_job.result()
    error_result = load_job.error_result

    # Publish PubSub message if BQ upload was successful
    if error_result is None:
        topic = bucket_resources[bucket_name]['PS_TOPIC']
        publish_pubsub(topic, file_name)

    return error_result
109+
110+
111+
def get_BQ_TABLE_ID(bucket_name: str) -> str:
    """Return the fully-qualified BigQuery table ID for a GCS bucket.

    The ID has the form ``<project>.<dataset>.<table>`` as configured in the
    module-level ``bucket_resources`` mapping.

    Raises:
        KeyError: If ``bucket_name`` has no entry in ``bucket_resources``.
    """

    resources = bucket_resources[bucket_name]
    return '.'.join(
        [PROJECT_ID, resources['BQ_DATASET'], resources['BQ_TABLE']])
120+
121+
122+
def publish_pubsub(topic: str, message: str) -> str:
    """Publish a PubSub alert and block until it is acknowledged.

    Args:
        topic: Name of the PubSub topic to publish to
        message: The message to publish

    Returns:
        The Id of the published message
    """

    # Configure PubSub topic
    publisher = pubsub.PublisherClient()
    topic_path = publisher.topic_path(PROJECT_ID, topic)

    # Publish
    log.debug(f'Publishing message: {message}')
    message_data = message.encode('UTF-8')
    future = publisher.publish(topic_path, data=message_data)

    # ``future.result()`` blocks until the server acknowledges the message
    # and returns the server-assigned message ID string — NOT the Future
    # itself, so the original ``-> Future`` annotation was incorrect.
    return future.result()
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
# As explained here
2+
# https://cloud.google.com/functions/docs/writing/specifying-dependencies-python
3+
# dependencies for a Cloud Function must be specified in a `requirements.txt`
4+
# file (or packaged with the function) in the same directory as `main.py`
5+
# which contains the `stream_GCS_to_BQ()` function.
6+
7+
google-cloud-bigquery
8+
google-cloud-pubsub

broker/alert_ingestion/consume.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -24,9 +24,9 @@
2424
# Create a GCS consumer object
2525
c = consume.GCSKafkaConsumer(
2626
kafka_config=config,
27-
bucket_name='my-gcs-bucket-name',
27+
bucket_name='<PROJECT_ID>_ztf_alert_avro_bucket',
2828
kafka_topic='my_kafka_topic_name',
29-
pubsub_topic='my-gcs-pubsub-name',
29+
pubsub_topic='ztf_alert_avro_in_bucket',
3030
debug=True # Use debug to run without updating your kafka offset
3131
)
3232
@@ -331,7 +331,6 @@ def guess_schema_version(alert_bytes: bytes) -> str:
331331

332332
return version_match.group(2).decode()
333333

334-
335334
def guess_schema_survey(alert_bytes: bytes) -> str:
336335
"""Retrieve the ZTF schema version
337336

broker/alert_ingestion/gen_valid_schema.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@
3434

3535
from pathlib import Path
3636
import logging
37-
from typing import Tuple, BinaryIO, Union
37+
from typing import Tuple, List, BinaryIO, Union
3838
import pickle
3939
import json
4040
import fastavro
@@ -144,7 +144,7 @@ def _reverse_types(field: dict) -> dict:
144144
return field
145145

146146

147-
def _load_Avro(fin: Union[Path, BinaryIO]) -> Tuple[dict, dict]:
147+
def _load_Avro(fin: Union[Path, BinaryIO]) -> Tuple[dict, List[dict]]:
148148
"""
149149
Args:
150150
fin (str or file-like) : Path to, or file-like object representing,

broker/deploy_cloudfnc.sh

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
#!/bin/sh

###
# This script deploys the ``stream_GCS_to_BQ`` function in
# ``broker/alert_ingestion/GCS_to_BQ/main.py`` as a Google Cloud Function
# so that it listens for new Avro files added to a Google Cloud Storage bucket
# and uploads them to a BigQuery table.
#
# Note that the `GOOGLE_CLOUD_PROJECT` environment variable must be set
# explicitly within the `gcloud` command.
###

# Resolve the function directory relative to this script's own location so
# the deployment works regardless of the caller's working directory.
# (The original `cd broker/alert_ingestion/GCS_to_BQ` silently failed unless
# invoked from the repository root.)
cd "$(dirname "$0")/alert_ingestion/GCS_to_BQ" || exit 1

# deploy stream_GCS_to_BQ() to listen to the ztf_alert_avro_bucket
bucket="${GOOGLE_CLOUD_PROJECT}_ztf_alert_avro_bucket"
gcloud functions deploy stream_GCS_to_BQ --runtime python37 --set-env-vars GOOGLE_CLOUD_PROJECT=${GOOGLE_CLOUD_PROJECT} --trigger-resource ${bucket} --trigger-event google.storage.object.finalize

broker/gcp_setup.py

Lines changed: 77 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -28,46 +28,100 @@
2828
"""
2929

3030
import os
31+
from pathlib import Path
3132

3233
if not os.getenv('GPB_OFFLINE', False):
3334
from google.api_core.exceptions import NotFound
34-
from google.cloud import bigquery, logging, storage
35+
from google.cloud import bigquery, pubsub, logging, storage
3536

36-
_tables = ('alert', 'candidate')
37+
PROJECT_ID = os.getenv('GOOGLE_CLOUD_PROJECT')
38+
39+
_tables = ('alerts', 'test_GCS_to_BQ')
3740

3841

3942
def setup_big_query() -> None:
    """Create the necessary Big Query datasets if they do not already exist

    New datasets include:
        ``ztf_alerts``
        ``testing_dataset``
    """

    client = bigquery.Client()
    # ``exists_ok=True`` makes the call idempotent across repeated setups
    for dataset_name in ('ztf_alerts', 'testing_dataset'):
        client.create_dataset(dataset_name, exists_ok=True)

4954

5055
def setup_buckets() -> None:
    """Create new storage buckets and upload testing files.
    Files are expected to reside in the ``tests/test_alerts`` directory.

    New buckets [files] include:
        ``<PROJECT_ID>_ztf_alert_avro_bucket``
        ``<PROJECT_ID>_testing_bucket`` [``ztf_3.3_validschema_1154446891615015011.avro``]
    """

    buckets = {  # '<bucket name>': ['file name',]
        f'{PROJECT_ID}_ztf_alert_avro_bucket': [],
        f'{PROJECT_ID}_testing_bucket':
            ['ztf_3.3_validschema_1154446891615015011.avro']
    }

    storage_client = storage.Client()

    for bucket_name, files in buckets.items():
        # Create buckets if they do not exist. Keep the bucket handle so it
        # is not re-fetched from the API once per file below (the original
        # called ``get_bucket`` inside the upload loop).
        try:
            bucket = storage_client.get_bucket(bucket_name)
        except NotFound:
            bucket = storage_client.create_bucket(bucket_name)

        # Upload any files
        for filename in files:
            blob = bucket.blob(filename)
            inpath = Path('tests/test_alerts') / filename
            with inpath.open('rb') as infile:
                blob.upload_from_file(infile)
86+
87+
88+
def setup_pubsub() -> None:
    """ Create new Pub/Sub topics and subscriptions

    New topics [subscriptions] include:
        ``ztf_alert_avro_in_bucket``
        ``ztf_alerts_in_BQ``
        ``test_alerts_in_BQ``
        ``test_alerts_PS_publish`` [``test_alerts_PS_subscribe``]
    """

    # '<topic_name>': ['<subscription_name>', ]
    topics = {
        'ztf_alert_avro_in_bucket': [],
        'ztf_alerts_in_BQ': [],
        'test_alerts_in_BQ': [],
        'test_alerts_PS_publish': ['test_alerts_PS_subscribe'],
    }

    publisher = pubsub.PublisherClient()
    subscriber = pubsub.SubscriberClient()

    for topic_name, sub_names in topics.items():
        topic_path = publisher.topic_path(PROJECT_ID, topic_name)

        # Create the topic only if it is missing
        try:
            publisher.get_topic(topic_path)
        except NotFound:
            publisher.create_topic(topic_path)

        # Create any missing subscriptions on this topic
        for sub_name in sub_names:
            sub_path = subscriber.subscription_path(PROJECT_ID, sub_name)
            try:
                subscriber.get_subscription(sub_path)
            except NotFound:
                subscriber.create_subscription(sub_path, topic_path)
124+
71125

72126
def setup_logging_sinks() -> None:
73127
"""Create sinks for exporting log entries to GCP
@@ -92,16 +146,22 @@ def auto_setup() -> None:
92146
"""Create and setup GCP products required by the ``broker`` package
93147
94148
New data sets include:
95-
``ztf_alerts``
149+
``ztf_alerts``
150+
``testing_dataset``
96151
97152
New buckets include:
98-
``<project_id>_logging_bucket``
99-
``<project_id>_ztf_images``
153+
``<PROJECT_ID>_ztf_alert_avro_bucket``
154+
``<PROJECT_ID>_testing_bucket``
155+
156+
New topics include:
157+
``ztf_alerts_in_BQ``
158+
``test_alerts_in_BQ``
100159
101160
New sinks include:
102-
``broker_logging_sink``
161+
``broker_logging_sink``
103162
"""
104163

105164
setup_big_query()
106165
setup_buckets()
166+
setup_pubsub()
107167
setup_logging_sinks()

0 commit comments

Comments
 (0)