Commit 54bdc3d

implement a lookback (#65)
1 parent e0e8bfd commit 54bdc3d

File tree

2 files changed: +60 −16 lines changed


main.py

Lines changed: 30 additions & 9 deletions
@@ -145,19 +145,40 @@ def _delete_blobs(
     )


+def _fetch_blobs(bucket, blob_type="downloads", past_partition=None, partition=None):
+    # Get the processed files we're loading
+
+    if past_partition is not None:
+        folder = f"processed/{past_partition}"
+        prefix = f"{folder}/{blob_type}-"
+        source_blobs = list(
+            bucket.list_blobs(prefix=prefix, max_results=MAX_BLOBS_PER_RUN)
+        )
+        if len(source_blobs) > 0:
+            return (source_blobs, prefix)
+
+    folder = f"processed/{partition}"
+    prefix = f"{folder}/{blob_type}-"
+    source_blobs = list(bucket.list_blobs(prefix=prefix, max_results=MAX_BLOBS_PER_RUN))
+    return (source_blobs, prefix)
+
+
 def load_processed_files_into_bigquery(event, context):
     continue_publishing = False
     if "attributes" in event and "partition" in event["attributes"]:
         # Check to see if we've manually triggered the function and provided a partition
+        past_partition = None
         partition = event["attributes"]["partition"]
         if "continue_publishing" in event["attributes"]:
             continue_publishing = bool(event["attributes"]["continue_publishing"])
     else:
         # Otherwise, this was triggered via cron, use the current time
+        # checking the past day first
+        past_partition = (
+            datetime.datetime.utcnow() - datetime.timedelta(days=1)
+        ).strftime("%Y%m%d")
         partition = datetime.datetime.utcnow().strftime("%Y%m%d")

-    folder = f"processed/{partition}"
-
     # Load the data into the dataset(s)
     job_config = bigquery.LoadJobConfig()
     job_config.source_format = bigquery.SourceFormat.NEWLINE_DELIMITED_JSON
@@ -168,17 +189,17 @@ def load_processed_files_into_bigquery(event, context):

     bigquery_client = bigquery.Client()

-    # Get the processed files we're loading
-    download_prefix = f"{folder}/downloads-"
-    download_source_blobs = list(
-        bucket.list_blobs(prefix=download_prefix, max_results=MAX_BLOBS_PER_RUN)
+    download_source_blobs, download_prefix = _fetch_blobs(
+        bucket,
+        blob_type="downloads",
+        past_partition=past_partition,
+        partition=partition,
     )
     download_source_uris = [
         f"gs://{blob.bucket.name}/{blob.name}" for blob in download_source_blobs
     ]
-    simple_prefix = f"{folder}/simple-"
-    simple_source_blobs = list(
-        bucket.list_blobs(prefix=simple_prefix, max_results=MAX_BLOBS_PER_RUN)
+    simple_source_blobs, simple_prefix = _fetch_blobs(
+        bucket, blob_type="simple", past_partition=past_partition, partition=partition
     )
     simple_source_uris = [
         f"gs://{blob.bucket.name}/{blob.name}" for blob in simple_source_blobs

test_functions.py

Lines changed: 30 additions & 7 deletions
@@ -115,24 +115,32 @@ def upload_from_file(self, file_handler, rewind=False):
     ],
 )
 @pytest.mark.parametrize(
-    "blobs, expected_load_jobs, expected_delete_calls",
+    "blobs, simple_fetch_current, expected_load_jobs, expected_delete_calls",
     [
-        ({"simple": [], "downloads": ["blob0", "blob1", "blob2"]}, 1, 3),
-        ({"simple": ["blob0", "blob1", "blob2"], "downloads": []}, 1, 3),
+        ({"simple": [], "downloads": ["blob0", "blob1", "blob2"]}, True, 1, 3),
+        ({"simple": ["blob0", "blob1", "blob2"], "downloads": []}, True, 1, 3),
         (
             {
                 "simple": ["blob0", "blob1", "blob2"],
                 "downloads": ["blob0", "blob1", "blob2"],
             },
+            True,
             2,
             6,
         ),
+        (
+            {"simple": ["pastblob0", "pastblob1"], "downloads": ["blob0", "blob1"]},
+            False,
+            2,
+            4,
+        ),
     ],
 )
 def test_load_processed_files_into_bigquery(
     monkeypatch,
     bigquery_dataset,
     blobs,
+    simple_fetch_current,
     expected_load_jobs,
     expected_delete_calls,
 ):
@@ -150,11 +158,22 @@ def test_load_processed_files_into_bigquery(
         name="blobname", bucket=bucket, delete=pretend.call_recorder(lambda: None)
     )

+    past_partition = (datetime.datetime.utcnow() - datetime.timedelta(days=1)).strftime(
+        "%Y%m%d"
+    )
+    partition = datetime.datetime.utcnow().strftime("%Y%m%d")
+
     def _generate_blob_list(prefix, max_results):
         if "simple" in prefix:
-            _blobs = blobs["simple"]
+            if past_partition in prefix:
+                _blobs = [b for b in blobs["simple"] if b.startswith("past")]
+            else:
+                _blobs = blobs["simple"]
         elif "downloads" in prefix:
-            _blobs = blobs["downloads"]
+            if past_partition in prefix:
+                _blobs = [b for b in blobs["downloads"] if b.startswith("past")]
+            else:
+                _blobs = blobs["downloads"]
         else:
             _blobs = []
         blob_list = [blob_stub for b in _blobs]
@@ -204,17 +223,21 @@ def fake_batch(*a, **kw):

     event = {}
     context = pretend.stub()
-    partition = datetime.datetime.utcnow().strftime("%Y%m%d")

     main.load_processed_files_into_bigquery(event, context)

     assert storage_client_stub.bucket.calls == [
         pretend.call(RESULT_BUCKET),
     ]
-    assert bucket_stub.list_blobs.calls == [
+    expected_list_blob_calls = [
+        pretend.call(prefix=f"processed/{past_partition}/downloads-", max_results=1000),
         pretend.call(prefix=f"processed/{partition}/downloads-", max_results=1000),
+        pretend.call(prefix=f"processed/{past_partition}/simple-", max_results=1000),
         pretend.call(prefix=f"processed/{partition}/simple-", max_results=1000),
     ]
+    if not simple_fetch_current:
+        expected_list_blob_calls = expected_list_blob_calls[:3]
+    assert bucket_stub.list_blobs.calls == expected_list_blob_calls
     assert (
         load_job_stub.result.calls
         == [pretend.call()] * len(bigquery_dataset.split()) * expected_load_jobs
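
In the newly added parametrized case the "simple" blobs all start with "past", so the stubbed lookback listing for the simple- prefix returns results and _fetch_blobs never lists today's simple- prefix; slicing expected_list_blob_calls to its first three entries mirrors that. A hypothetical illustration of the resulting prefix order, using made-up partition values:

past_partition, partition = "20240101", "20240102"  # example values only

# Order of bucket.list_blobs prefixes when the lookback finds "simple" blobs;
# the current simple- prefix is never requested in this case.
expected_prefixes = [
    f"processed/{past_partition}/downloads-",  # lookback miss for downloads
    f"processed/{partition}/downloads-",       # fall back to today's downloads
    f"processed/{past_partition}/simple-",     # lookback hit for simple
]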
