|
| 1 | +#!/usr/bin/env python3 |
| 2 | +# -*- coding: UTF-8 -*- |
| 3 | + |
| 4 | +"""This module is intended to be deployed as a Google Cloud Function so that it |
| 5 | +listens to a Google Cloud Storage (GCS) bucket. When a new file is detected in |
| 6 | +the bucket (Avro file format expected), it will automatically load it into a |
| 7 | +BigQuery (BQ) table and publish a message to PubSub (PS). This code borrows |
| 8 | +heavily from |
| 9 | +https://cloud.google.com/bigquery/docs/loading-data-cloud-storage-avro. |
| 10 | +
|
| 11 | +Usage Example |
| 12 | +------------- |
| 13 | +
|
| 14 | +First, check that the buckets (GCS), datasets (BQ), tables (BQ), and topics (PS) |
| 15 | +referenced in the ``bucket_resources`` dictionary (below) point to the |
| 16 | +appropriate Google Cloud Platform (GCP) resources. (These should have been |
| 17 | +initialized during the GCP setup, see |
| 18 | +https://pitt-broker.readthedocs.io/en/latest/installation.html#setting-up-gcp.) |
| 19 | +Buckets and datasets must exist (with appropriate permissions) prior to |
| 20 | +invoking this module. Tables are created automatically and on-the-fly if they |
| 21 | +don't already exist. |
| 22 | +
|
| 23 | +Deploy the ``stream_GCS_to_BQ`` function by running the following command in |
| 24 | +the directory where this module is located. Be sure to replace |
| 25 | +``<YOUR_TRIGGER_BUCKET_NAME>`` with the name of the GCS bucket that this |
| 26 | +function should listen to. For more information, see |
| 27 | +https://cloud.google.com/functions/docs/calling/storage. |
| 28 | +
|
| 29 | +.. code-block:: bash |
| 30 | + :linenos: |
| 31 | +
|
| 32 | + gcloud functions deploy stream_GCS_to_BQ --runtime python37 --set-env-vars |
| 33 | + GOOGLE_CLOUD_PROJECT=${GOOGLE_CLOUD_PROJECT} --trigger-resource |
| 34 | + <YOUR_TRIGGER_BUCKET_NAME> --trigger-event google.storage.object.finalize |
| 35 | +
|
| 36 | +The script ``broker/deploy_cloudfnc.sh`` automates the deployment. |
| 37 | +
|
| 38 | +Module Documentation |
| 39 | +-------------------- |
| 40 | +""" |
| 41 | + |
import logging
import os
from typing import Optional

from google.cloud import bigquery
from google.cloud import pubsub
from google.cloud.pubsub_v1.publisher.futures import Future
| 47 | + |
# Module-level clients and configuration (created once per Cloud Function
# instance and reused across invocations).
log = logging.getLogger(__name__)
# NOTE(review): os.getenv returns None if GOOGLE_CLOUD_PROJECT is unset, which
# would make the '_'.join calls below raise TypeError — the env var must be
# set at deploy time (see the --set-env-vars flag in the module docstring).
PROJECT_ID = os.getenv('GOOGLE_CLOUD_PROJECT')
BQ = bigquery.Client()

# The bucket_resources dictionary determines which BQ table the alert data will
# be uploaded to based on which GCS bucket the alert Avro file is stored in.
# Each entry maps a GCS bucket name to the BigQuery dataset/table the Avro
# files should be loaded into, plus the PubSub topic notified on success.
ztf_bucket = '_'.join([PROJECT_ID, 'ztf_alert_avro_bucket'])
testing_bucket = '_'.join([PROJECT_ID, 'testing_bucket'])
bucket_resources = {
    ztf_bucket: {'BQ_DATASET': 'ztf_alerts',
                 'BQ_TABLE': 'alerts',
                 'PS_TOPIC': 'ztf_alerts_in_BQ'
                 },
    testing_bucket: {'BQ_DATASET': 'testing_dataset',
                     'BQ_TABLE': 'test_GCS_to_BQ',
                     'PS_TOPIC': 'test_alerts_in_BQ'
                     }
}
| 66 | + |
| 67 | + |
def stream_GCS_to_BQ(data: dict, context: dict) -> Optional[str]:
    """Load a newly finalized GCS Avro file into BigQuery and notify PubSub.

    This function is executed whenever a file is added to Cloud Storage.
    Most of this function is taken from
    https://cloud.google.com/bigquery/docs/loading-data-cloud-storage-avro

    Args:
        data: Event payload; must contain ``bucket`` (GCS bucket name) and
            ``name`` (path of the finalized object within the bucket).
        context: Event metadata supplied by Cloud Functions (unused).

    Returns:
        A short error string if ``bucket`` is not configured in
        ``bucket_resources``; otherwise the load job's ``error_result``,
        which is ``None`` on success. (The original annotated this as
        ``-> str``, but the success path returns ``None``.)
    """
    bucket_name = data['bucket']
    file_name = data['name']

    # Configure the load job: append Avro rows to the existing table
    # (the table is created automatically if it doesn't exist).
    job_config = bigquery.LoadJobConfig()
    job_config.write_disposition = bigquery.WriteDisposition.WRITE_APPEND
    job_config.source_format = bigquery.SourceFormat.AVRO
    uri = f'gs://{bucket_name}/{file_name}'

    try:
        BQ_TABLE_ID = get_BQ_TABLE_ID(bucket_name)
    except KeyError as e:
        msg = (f'GCS bucket {e} does not have an associated BigQuery dataset '
               f'configured for the `stream_GCS_to_BQ` Cloud Function. '
               f'Data in {file_name} cannot be uploaded to BigQuery.')
        log.error(msg)
        return f'GCS bucket {e} not configured'  # used in testing

    # API request
    load_job = BQ.load_table_from_uri(uri, BQ_TABLE_ID, job_config=job_config)
    msg = (f'Starting stream_GCS_to_BQ job {load_job.job_id} | '
           f'file name: {file_name} | '
           f'GCS Bucket: {bucket_name} | '
           f'BQ Table ID: {BQ_TABLE_ID}'
           )
    log.info(msg)

    # Start the job and wait for it to complete.
    # NOTE(review): `result()` raises on job failure, so `error_result` is
    # expected to be None here; failures surfaced as exceptions propagate to
    # the Cloud Functions runtime (triggering its retry/error handling).
    load_job.result()
    error_result = load_job.error_result

    # Publish a PubSub message only if the BQ upload was successful.
    if error_result is None:
        topic = bucket_resources[bucket_name]['PS_TOPIC']
        publish_pubsub(topic, file_name)

    return error_result
| 109 | + |
| 110 | + |
def get_BQ_TABLE_ID(bucket_name: str) -> str:
    """Return the fully qualified BigQuery table ID for a GCS bucket.

    Looks up ``bucket_name`` in the module-level ``bucket_resources``
    mapping and assembles the ``project.dataset.table`` identifier.
    Raises ``KeyError`` if the bucket is not configured.
    """
    resources = bucket_resources[bucket_name]
    return f"{PROJECT_ID}.{resources['BQ_DATASET']}.{resources['BQ_TABLE']}"
| 120 | + |
| 121 | + |
def publish_pubsub(topic: str, message: str) -> str:
    """Publish a PubSub alert and block until it has been sent.

    Args:
        topic: Short name of the PubSub topic; the full topic path is built
            from the module-level ``PROJECT_ID``
        message: The message to publish

    Returns:
        The Id of the published message. (``future.result()`` blocks until
        the publish completes and returns the server-assigned message ID
        string — the original ``-> Future`` annotation was incorrect, since
        the future itself is resolved before returning.)
    """

    # Configure PubSub topic
    publisher = pubsub.PublisherClient()
    topic_path = publisher.topic_path(PROJECT_ID, topic)

    # Publish
    log.debug(f'Publishing message: {message}')
    message_data = message.encode('UTF-8')
    future = publisher.publish(topic_path, data=message_data)

    return future.result()