|
1 | 1 | import arrow
|
2 | 2 | import cattr
|
3 | 3 |
|
| 4 | +import base64 |
4 | 5 | import datetime
|
5 | 6 | import os
|
6 | 7 | import json
|
@@ -113,3 +114,62 @@ def process_fastly_log(data, context):
|
113 | 114 | except exceptions.NotFound:
|
114 | 115 | # Sometimes we try to delete twice
|
115 | 116 | pass
|
| 117 | + |
| 118 | + |
def load_processed_files_into_bigquery(event, context):
    """Load the day's processed linehaul result files into BigQuery.

    Triggered either by cron (Pub/Sub) or manually.  When triggered manually,
    an explicit ``partition`` (``YYYYMMDD``) may be supplied via the event's
    attributes; otherwise the current UTC date is used.

    Loads ``downloads-*.json`` into DOWNLOAD_TABLE and ``simple-*.json`` into
    SIMPLE_TABLE for every dataset in DATASETS, then deletes the source blobs.

    :param event: Pub/Sub event payload; ``event["attributes"]["partition"]``
        optionally overrides the partition date.
    :param context: Cloud Functions event context (unused).
    """
    if "attributes" in event and "partition" in event["attributes"]:
        # Check to see if we've manually triggered the function and provided a partition
        partition = event["attributes"]["partition"]
    else:
        # Otherwise, this was triggered via cron, use the current time
        partition = datetime.datetime.utcnow().strftime("%Y%m%d")

    # Object-name prefix inside the bucket (list_blobs/delete_blobs take plain
    # object names — no "gs://" scheme, no glob).
    prefix = f"processed/{partition}"

    # Load the data into the dataset(s)
    job_config = bigquery.LoadJobConfig()
    job_config.source_format = bigquery.SourceFormat.NEWLINE_DELIMITED_JSON
    job_config.ignore_unknown_values = True

    storage_client = storage.Client()
    bucket = storage_client.bucket(RESULT_BUCKET)

    bigquery_client = bigquery.Client()

    # load_table_from_uri expects full gs:// URIs; a trailing "*" wildcard is
    # supported by the BigQuery load API.
    download_source_uri = f"gs://{RESULT_BUCKET}/{prefix}/downloads-*.json"
    simple_source_uri = f"gs://{RESULT_BUCKET}/{prefix}/simple-*.json"

    for dataset in DATASETS:
        dataset_ref = bigquery.dataset.DatasetReference.from_string(
            dataset, default_project=DEFAULT_PROJECT
        )

        # Load the files for the downloads table
        load_job = bigquery_client.load_table_from_uri(
            download_source_uri,
            dataset_ref.table(DOWNLOAD_TABLE),
            job_id_prefix="linehaul_file_downloads",
            location="US",
            job_config=job_config,
        )
        load_job.result()  # Block until the load job completes.
        print(f"Loaded {load_job.output_rows} rows into {dataset}:{DOWNLOAD_TABLE}")

        # Load the files for the simple table
        load_job = bigquery_client.load_table_from_uri(
            simple_source_uri,
            dataset_ref.table(SIMPLE_TABLE),
            job_id_prefix="linehaul_file_simple",
            location="US",
            job_config=job_config,
        )
        load_job.result()  # Block until the load job completes.
        print(f"Loaded {load_job.output_rows} rows into {dataset}:{SIMPLE_TABLE}")

    # Only delete the source files once every dataset has loaded them.
    bucket.delete_blobs(blobs=list(bucket.list_blobs(prefix=f"{prefix}/downloads-")))
    bucket.delete_blobs(blobs=list(bucket.list_blobs(prefix=f"{prefix}/simple-")))
0 commit comments