Commit bb76058

Split processing out from loading into BQ (#53)
* write results to intermediate bucket rather than loading to bigquery on demand
* make blob uri aware of date in file, not date of process time
* fix tests
* test function completely
* restore gutted config for load function
1 parent 8d68205 commit bb76058

4 files changed: +46 / -126 lines
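
The gist of the change, visible in the main.py diff below: process_fastly_log no longer touches BigQuery at all. It parses the log, derives a date partition from the earliest timestamp found in the file (rather than the processing time), and writes the newline-delimited JSON results to an intermediate bucket for a separate load function to pick up later. A minimal sketch of that flow, assuming a RESULT_BUCKET environment variable and hypothetical names (records, file_name, results_file) standing in for what process_fastly_log builds while streaming the downloaded log:

# Sketch only; "records", "file_name" and "results_file" are hypothetical stand-ins
# for what process_fastly_log builds while streaming the downloaded log.
import os

import arrow
from google.cloud import storage

RESULT_BUCKET = os.environ.get("RESULT_BUCKET")


def upload_results(records, file_name, results_file):
    # Partition by the earliest event timestamp in the file, not by wall-clock
    # processing time, so a replayed log lands in the same prefix as the original run.
    min_timestamp = arrow.utcnow()
    for record in records:
        min_timestamp = min(min_timestamp, record.timestamp)
    partition = min_timestamp.strftime("%Y%m%d")

    # Write the newline-delimited JSON into the intermediate bucket; loading it
    # into BigQuery is now a separate step.
    bucket = storage.Client().bucket(RESULT_BUCKET)
    blob = bucket.blob(f"processed/{partition}/downloads-{file_name}.json")
    blob.upload_from_file(results_file, rewind=True)

The real function keeps separate temporary files for simple requests and file downloads and uploads each under its own prefix, as the diff shows.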

main.py

Lines changed: 15 additions & 43 deletions
@@ -20,6 +20,8 @@
 )

 DEFAULT_PROJECT = os.environ.get("GCP_PROJECT", "the-psf")
+RESULT_BUCKET = os.environ.get("RESULT_BUCKET")
+
 # Multiple datasets can be specified by separating them with whitespace
 # Datasets in other projects can be referenced by using the full dataset id:
 # <project_id>.<dataset_name>
@@ -28,16 +30,13 @@
 DATASETS = os.environ.get("BIGQUERY_DATASET", "").strip().split()
 SIMPLE_TABLE = os.environ.get("BIGQUERY_SIMPLE_TABLE")
 DOWNLOAD_TABLE = os.environ.get("BIGQUERY_DOWNLOAD_TABLE")
-RESULT_BUCKET = os.environ.get("RESULT_BUCKET")

 prefix = {Simple.__name__: "simple_requests", Download.__name__: "file_downloads"}


 def process_fastly_log(data, context):
     storage_client = storage.Client()
-    bigquery_client = bigquery.Client()
-    identifier = os.path.basename(data["name"]).split("-", 3)[-1].rstrip(".log.gz")
-    default_partition = datetime.datetime.utcnow().strftime("%Y%m%d")
+    file_name = os.path.basename(data["name"]).rstrip(".log.gz")

     print(f"Beginning processing for gs://{data['bucket']}/{data['name']}")

@@ -59,9 +58,11 @@ def process_fastly_log(data, context):
         simple_results_file = stack.enter_context(NamedTemporaryFile())
         download_results_file = stack.enter_context(NamedTemporaryFile())

+        min_timestamp = arrow.utcnow()
         for line in input_file:
             try:
                 res = parse(line.decode())
+                min_timestamp = min(min_timestamp, res.timestamp)
                 if res is not None:
                     if res.__class__.__name__ == Simple.__name__:
                         simple_results_file.write(
@@ -88,47 +89,18 @@ def process_fastly_log(data, context):
             f"Processed gs://{data['bucket']}/{data['name']}: {total} lines, {simple_lines} simple_requests, {download_lines} file_downloads, {unprocessed_lines} unprocessed"
         )

-        # Load the data into the dataset(s)
-        job_config = bigquery.LoadJobConfig()
-        job_config.source_format = bigquery.SourceFormat.NEWLINE_DELIMITED_JSON
-        job_config.ignore_unknown_values = True
-
-        for DATASET in DATASETS:
-            dataset_ref = bigquery.dataset.DatasetReference.from_string(
-                DATASET, default_project=DEFAULT_PROJECT
-            )
-            if download_lines > 0:
-                load_job = bigquery_client.load_table_from_file(
-                    download_results_file,
-                    dataset_ref.table(DOWNLOAD_TABLE),
-                    job_id_prefix="linehaul_file_downloads",
-                    location="US",
-                    job_config=job_config,
-                    rewind=True,
-                )
-                load_job.result()
-                print(
-                    f"Loaded {load_job.output_rows} rows into {DATASET}:{DOWNLOAD_TABLE}"
-                )
-
-            if simple_lines > 0:
-                load_job = bigquery_client.load_table_from_file(
-                    simple_results_file,
-                    dataset_ref.table(SIMPLE_TABLE),
-                    job_id_prefix="linehaul_file_downloads",
-                    location="US",
-                    job_config=job_config,
-                    rewind=True,
-                )
-                load_job.result()
-                print(
-                    f"Loaded {load_job.output_rows} rows into {DATASET}:{SIMPLE_TABLE}"
-                )
-
-        bucket = storage_client.bucket(RESULT_BUCKET)
+        bucket = storage_client.bucket(RESULT_BUCKET)
+        partition = min_timestamp.strftime("%Y%m%d")
+
+        if simple_lines > 0:
+            blob = bucket.blob(f"processed/{partition}/simple-{file_name}.json")
+            blob.upload_from_file(simple_results_file, rewind=True)
+        if download_lines > 0:
+            blob = bucket.blob(f"processed/{partition}/downloads-{file_name}.json")
+            blob.upload_from_file(download_results_file, rewind=True)

         if unprocessed_lines > 0:
-            blob = bucket.blob(f"unprocessed/{default_partition}/{identifier}.txt")
+            blob = bucket.blob(f"unprocessed/{partition}/{file_name}.txt")
             try:
                 blob.upload_from_file(unprocessed_file, rewind=True)
             except Exception:
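
For concreteness, a small worked example of where a processed object now lands (the values mirror the test fixtures below). Note that file_name keeps the log's own "downloads-"/"simple-" prefix, so the resulting object names carry it twice:

import os

# Name of the incoming log object, as delivered by the storage trigger.
data = {"name": "downloads-2021-01-07-20-55-2021-01-07T20-55-00.000-B8Hs_G6d6xN61En2ypwk.log.gz"}

file_name = os.path.basename(data["name"]).rstrip(".log.gz")
partition = "20210107"  # earliest timestamp observed inside the file

print(f"processed/{partition}/downloads-{file_name}.json")
# processed/20210107/downloads-downloads-2021-01-07-20-55-2021-01-07T20-55-00.000-B8Hs_G6d6xN61En2ypwk.json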

test_function.py

Lines changed: 31 additions & 83 deletions
@@ -7,57 +7,38 @@
 import main

 GCP_PROJECT = "my-gcp-project"
-BIGQUERY_DATASET = "my-bigquery-dataset"
-BIGQUERY_SIMPLE_TABLE = "my-simple-table"
-BIGQUERY_DOWNLOAD_TABLE = "my-download-table"
 RESULT_BUCKET = "my-result-bucket"

-
-@pytest.mark.parametrize(
-    "bigquery_dataset, expected_from_string_calls",
-    [
-        (
-            "my-bigquery-dataset",
-            [pretend.call("my-bigquery-dataset", default_project=GCP_PROJECT)],
-        ),
-        (
-            "my-bigquery-dataset some-other-dataset",
-            [
-                pretend.call("my-bigquery-dataset", default_project=GCP_PROJECT),
-                pretend.call("some-other-dataset", default_project=GCP_PROJECT),
-            ],
-        ),
-    ],
-)
 @pytest.mark.parametrize(
-    "log_filename, table_name, expected",
+    "log_filename, expected_data, expected_unprocessed, expected_unprocessed_filename, expected_data_filename",
     [
         (
             "downloads-2021-01-07-20-55-2021-01-07T20-55-00.000-B8Hs_G6d6xN61En2ypwk.log.gz",
-            BIGQUERY_DOWNLOAD_TABLE,
             b'{"timestamp": "2021-01-07 20:54:54 +00:00", "url": "/packages/f7/12/ec3f2e203afa394a149911729357aa48affc59c20e2c1c8297a60f33f133/threadpoolctl-2.1.0-py3-none-any.whl", "project": "threadpoolctl", "file": {"filename": "threadpoolctl-2.1.0-py3-none-any.whl", "project": "threadpoolctl", "version": "2.1.0", "type": "bdist_wheel"}, "tls_protocol": "TLSv1.2", "tls_cipher": "ECDHE-RSA-AES128-GCM-SHA256", "country_code": "US", "details": {"installer": {"name": "pip", "version": "20.1.1"}, "python": "3.7.9", "implementation": {"name": "CPython", "version": "3.7.9"}, "distro": {"name": "Debian GNU/Linux", "version": "9", "id": "stretch", "libc": {"lib": "glibc", "version": "2.24"}}, "system": {"name": "Linux", "release": "4.15.0-112-generic"}, "cpu": "x86_64", "openssl_version": "OpenSSL 1.1.0l 10 Sep 2019", "setuptools_version": "47.1.0", "ci": null}}\n'
             b'{"timestamp": "2021-01-07 20:54:54 +00:00", "url": "/packages/cd/f9/8fad70a3bd011a6be7c5c6067278f006a25341eb39d901fbda307e26804c/django_crum-0.7.9-py2.py3-none-any.whl", "project": "django-crum", "file": {"filename": "django_crum-0.7.9-py2.py3-none-any.whl", "project": "django-crum", "version": "0.7.9", "type": "bdist_wheel"}, "tls_protocol": "TLSv1.2", "tls_cipher": "ECDHE-RSA-AES128-GCM-SHA256", "country_code": "US", "details": {"installer": {"name": "pip", "version": "20.0.2"}, "python": "3.8.5", "implementation": {"name": "CPython", "version": "3.8.5"}, "distro": {"name": "Ubuntu", "version": "16.04", "id": "xenial", "libc": {"lib": "glibc", "version": "2.23"}}, "system": {"name": "Linux", "release": "4.4.0-1113-aws"}, "cpu": "x86_64", "openssl_version": "OpenSSL 1.0.2g 1 Mar 2016", "setuptools_version": "44.1.0", "ci": null}}\n',
+            b'download|Thu, 07 Jan 2021 20:54:56 GMT|US|/packages/c5/db/e56e6b4bbac7c4a06de1c50de6fe1ef3810018ae11732a50f15f62c7d050/enum34-1.1.6-py2-none-any.whl|TLSv1.2|ECDHE-RSA-AES128-GCM-SHA256|enum34|1.1.6|bdist_wheel|(null)\n',
+            "unprocessed/20210107/downloads-2021-01-07-20-55-2021-01-07T20-55-00.000-B8Hs_G6d6xN61En2ypwk.txt",
+            "processed/20210107/downloads-downloads-2021-01-07-20-55-2021-01-07T20-55-00.000-B8Hs_G6d6xN61En2ypwk.json",
         ),
         (
             "simple-2021-01-07-20-55-2021-01-07T20-55-00.000-3wuB00t9tqgbGLFI2fSI.log.gz",
-            BIGQUERY_SIMPLE_TABLE,
             b'{"timestamp": "2021-01-07 20:54:52 +00:00", "url": "/simple/azureml-model-management-sdk/", "project": "azureml-model-management-sdk", "tls_protocol": "TLSv1.3", "tls_cipher": "AES256-GCM", "country_code": "US", "details": {"installer": {"name": "pip", "version": "20.0.2"}, "python": "3.7.5", "implementation": {"name": "CPython", "version": "3.7.5"}, "distro": {"name": "Ubuntu", "version": "18.04", "id": "bionic", "libc": {"lib": "glibc", "version": "2.27"}}, "system": {"name": "Linux", "release": "4.15.0-1092-azure"}, "cpu": "x86_64", "openssl_version": "OpenSSL 1.1.1 11 Sep 2018", "setuptools_version": "45.2.0", "ci": null}}\n'
             b'{"timestamp": "2021-01-07 20:54:52 +00:00", "url": "/simple/pyrsistent/", "project": "pyrsistent", "tls_protocol": "TLSv1.3", "tls_cipher": "AES256-GCM", "country_code": "US", "details": {"installer": {"name": "pip", "version": "20.0.2"}, "python": "3.8.5", "implementation": {"name": "CPython", "version": "3.8.5"}, "distro": {"name": "Ubuntu", "version": "20.04", "id": "focal", "libc": {"lib": "glibc", "version": "2.31"}}, "system": {"name": "Linux", "release": "5.4.72-flatcar"}, "cpu": "x86_64", "openssl_version": "OpenSSL 1.1.1f 31 Mar 2020", "setuptools_version": "45.2.0", "ci": true}}\n',
+            b'simple|Thu, 07 Jan 2021 20:54:52 GMT|US|/simple/numpy/|TLSv1.2|ECDHE-RSA-AES128-GCM-SHA256||||(null)\n',
+            "unprocessed/20210107/simple-2021-01-07-20-55-2021-01-07T20-55-00.000-3wuB00t9tqgbGLFI2fSI.txt",
+            "processed/20210107/simple-simple-2021-01-07-20-55-2021-01-07T20-55-00.000-3wuB00t9tqgbGLFI2fSI.json",
         ),
     ],
 )
 def test_function(
     monkeypatch,
     log_filename,
-    table_name,
-    expected,
-    bigquery_dataset,
-    expected_from_string_calls,
+    expected_data,
+    expected_unprocessed,
+    expected_data_filename,
+    expected_unprocessed_filename,
 ):
     monkeypatch.setenv("GCP_PROJECT", GCP_PROJECT)
-    monkeypatch.setenv("BIGQUERY_DATASET", bigquery_dataset)
-    monkeypatch.setenv("BIGQUERY_SIMPLE_TABLE", BIGQUERY_SIMPLE_TABLE)
-    monkeypatch.setenv("BIGQUERY_DOWNLOAD_TABLE", BIGQUERY_DOWNLOAD_TABLE)
     monkeypatch.setenv("RESULT_BUCKET", RESULT_BUCKET)

     reload(main)
@@ -66,46 +47,28 @@ def _download_to_file(file_handler):
         with open(Path(".") / "fixtures" / log_filename, "rb") as f:
             file_handler.write(f.read())

-    blob_stub = pretend.stub(
+    get_blob_stub = pretend.stub(
         download_to_file=_download_to_file, delete=pretend.call_recorder(lambda: None),
     )
-    bucket_stub = pretend.stub(get_blob=pretend.call_recorder(lambda a: blob_stub),)
-    storage_client_stub = pretend.stub(
-        bucket=pretend.call_recorder(lambda a: bucket_stub),
-    )
-    monkeypatch.setattr(
-        main, "storage", pretend.stub(Client=lambda: storage_client_stub)
-    )

-    table_stub = pretend.stub()
-    dataset_stub = pretend.stub(table=pretend.call_recorder(lambda a: table_stub))
-    load_job_stub = pretend.stub(
-        result=pretend.call_recorder(lambda: None), output_rows=pretend.stub(),
-    )
+    blobs = {}
+    class Blob(object):
+        def __init__(self, blob_uri):
+            self.uri = blob_uri
+            self.data = None
+            blobs[blob_uri] = self

-    def _load_table_from_file(fh, *a, **kw):
-        fh.flush()
-        with open(fh.name, "rb") as f:
-            load_job_stub._result = f.read()
-        return load_job_stub
+        def upload_from_file(self, file_handler, rewind=False):
+            if rewind:
+                file_handler.seek(0)
+            self.data = file_handler.read()

-    bigquery_client_stub = pretend.stub(
-        load_table_from_file=pretend.call_recorder(_load_table_from_file),
-    )
-    job_config_stub = pretend.stub()
-    dataset_reference_stub = pretend.stub(
-        from_string=pretend.call_recorder(lambda *a, **kw: dataset_stub)
+    bucket_stub = pretend.stub(get_blob=pretend.call_recorder(lambda a: get_blob_stub), blob=pretend.call_recorder(lambda a: Blob(a)),)
+    storage_client_stub = pretend.stub(
+        bucket=pretend.call_recorder(lambda a: bucket_stub),
     )
-
     monkeypatch.setattr(
-        main,
-        "bigquery",
-        pretend.stub(
-            Client=lambda: bigquery_client_stub,
-            LoadJobConfig=lambda: job_config_stub,
-            SourceFormat=pretend.stub(NEWLINE_DELIMITED_JSON=pretend.stub()),
-            dataset=pretend.stub(DatasetReference=dataset_reference_stub),
-        ),
+        main, "storage", pretend.stub(Client=lambda: storage_client_stub)
     )

     data = {
@@ -118,24 +81,9 @@ def _load_table_from_file(fh, *a, **kw):

     assert storage_client_stub.bucket.calls == [pretend.call("my-bucket")] + [
         pretend.call(RESULT_BUCKET),
-    ] * len(expected_from_string_calls)
+    ]
     assert bucket_stub.get_blob.calls == [pretend.call(log_filename)]
-    assert dataset_reference_stub.from_string.calls == expected_from_string_calls
-    assert bigquery_client_stub.load_table_from_file.calls == [
-        pretend.call(
-            bigquery_client_stub.load_table_from_file.calls[0].args[0], # shh
-            table_stub,
-            job_id_prefix="linehaul_file_downloads",
-            location="US",
-            job_config=job_config_stub,
-            rewind=True,
-        )
-    ] * len(expected_from_string_calls)
-    assert dataset_stub.table.calls == [pretend.call(table_name)] * len(
-        expected_from_string_calls
-    )
-    assert blob_stub.delete.calls == [pretend.call()]
-    assert load_job_stub.result.calls == [pretend.call()] * len(
-        expected_from_string_calls
-    )
-    assert load_job_stub._result == expected
+    assert bucket_stub.blob.calls == [pretend.call(expected_data_filename), pretend.call(expected_unprocessed_filename)]
+    assert get_blob_stub.delete.calls == [pretend.call()]
+    assert blobs[expected_data_filename].data == expected_data
+    assert blobs[expected_unprocessed_filename].data == expected_unprocessed
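
The test leans on two pretend idioms: pretend.stub builds an object with exactly the attributes you pass in, and pretend.call_recorder wraps a callable and records every invocation in its .calls list as pretend.call objects, which is what the assertions above compare against. A self-contained illustration (not part of the commit):

import pretend

# A fake bucket whose blob() both returns a value and records how it was called.
recorder = pretend.call_recorder(lambda name: f"blob:{name}")
bucket = pretend.stub(blob=recorder)

bucket.blob("processed/20210107/example.json")
assert bucket.blob.calls == [pretend.call("processed/20210107/example.json")]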
