This repository was archived by the owner on Dec 18, 2024. It is now read-only.

Commit dd1ec00

Tech report pipeline - add flex template (#251)

* added pipeline runner run_tech_report.py
* added flex template metadata flex_template_metadata_tech_report.json
* added tech_report to flex template build script build_flex_template.sh
* added tech_report to flex template run script run_flex_template.sh
* added .venv to .gitignore
* updated README with minimal information

1 parent de42ed9

6 files changed: +75 -4 lines

.gitignore

Lines changed: 1 addition & 0 deletions

```diff
@@ -4,6 +4,7 @@
 /build
 /env
 /venv
+/.venv
 *.pyc
 /.vscode
 .coverage
```

README.md

Lines changed: 5 additions & 0 deletions

````diff
@@ -37,11 +37,15 @@ The new HTTP Archive data pipeline built entirely on GCP
 
 This repo handles the HTTP Archive data pipeline, which takes the results of the monthly HTTP Archive run and saves them to the `httparchive` dataset in BigQuery.
 
+A secondary pipeline is responsible for populating the Technology Report Firestore collections.
+
 There are currently two main pipelines:
 
 - The `all` pipeline, which saves data to the new `httparchive.all` dataset
 - The `combined` pipeline, which saves data to the legacy tables. This processes both the `summary` tables (`summary_pages` and `summary_requests`) and the `non-summary` pipeline (`pages`, `requests`, `response_bodies`, etc.)
 
+The secondary `tech_report` pipeline saves data to a Firestore database (e.g. `tech-report-apis-prod`) across various collections ([see `TECHNOLOGY_QUERIES` in constants.py](modules/constants.py))
+
 The pipelines are run in Google Cloud Platform (GCP) and are kicked off automatically on crawl completion, based on the code in the `main` branch, which is deployed to GCP on each merge.
 
 The [`data-pipeline` workflow](https://console.cloud.google.com/workflows/workflow/us-west1/data-pipeline/executions?project=httparchive), as defined by the [data-pipeline-workflows.yaml](./data-pipeline-workflows.yaml) file, runs the whole process from start to finish, including generating the manifest file for each of the two runs (desktop and mobile) and then starting the four Dataflow jobs (desktop all, mobile all, desktop combined, mobile combined) in sequence to upload the HAR files to the BigQuery tables. This can be rerun in case of failure by [publishing a crawl-complete message](#publishing-a-pubsub-message), provided no data was saved to the final BigQuery tables.
@@ -156,6 +160,7 @@ This method is best used when developing locally, as a convenience for running t
 # run the pipeline using a flex template
 ./run_flex_template all [...]
 ./run_flex_template combined [...]
+./run_flex_template tech_report [...]
 ```
 
 ### Running a flex template from the Cloud Console
````
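As background for the README addition above: the `tech_report` pipeline reads query results and upserts them into Firestore collections. The sketch below is a minimal illustration of that pattern with Apache Beam and the `google-cloud-firestore` client; the collection name, document-id scheme, and row shape are assumptions for illustration only, since the real logic (and the `TECHNOLOGY_QUERIES` it runs) lives in the pipeline modules, not in this diff.

```python
import apache_beam as beam


class WriteToFirestore(beam.DoFn):
    """Upserts one query-result row as a Firestore document (illustrative)."""

    def __init__(self, project, database, collection):
        self.project = project
        self.database = database
        self.collection = collection

    def setup(self):
        # One client per worker, created lazily so the DoFn stays picklable.
        # The named-database argument needs a recent google-cloud-firestore.
        from google.cloud import firestore
        self.client = firestore.Client(project=self.project, database=self.database)

    def process(self, row):
        # Hypothetical id: one document per technology/date pair.
        doc_id = f"{row['technology']}-{row['date']}"
        self.client.collection(self.collection).document(doc_id).set(row)


with beam.Pipeline() as p:
    (p
     | beam.Create([{"technology": "WordPress", "date": "2024-01-01", "origins": 123}])
     | beam.ParDo(WriteToFirestore("tech-report-apis-prod", "(default)", "adoption")))
```

Creating the client in `setup()` rather than `__init__()` reuses one connection per worker and avoids pickling the client with the DoFn.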

build_flex_template.sh

Lines changed: 5 additions & 1 deletion

```diff
@@ -5,9 +5,13 @@ set -u
 
 BUILD_TAG=$(date -u +"%Y-%m-%d_%H-%M-%S")
 
+# all and combined pipelines
 for type in all combined
 do
-  gcloud builds submit --substitutions=_TYPE="${type}",_BUILD_TAG="${BUILD_TAG}" .
+  gcloud builds submit --substitutions=_TYPE="${type}",_BUILD_TAG="${BUILD_TAG}",_WORKER_TYPE=n1-standard-32 .
 done
 
+# tech_report pipeline
+gcloud builds submit --substitutions=_TYPE=tech_report,_BUILD_TAG="${BUILD_TAG}",_WORKER_TYPE=n1-standard-1 .
+
 echo "${BUILD_TAG}"
```
flex_template_metadata_tech_report.json

Lines changed: 36 additions & 0 deletions

```diff
@@ -0,0 +1,36 @@
+{
+  "name": "Technology Report API pipeline",
+  "description": "Runs a pipeline to generate firestore API results",
+  "parameters": [
+    {
+      "name": "query_type",
+      "label": "Query type",
+      "helpText": "Technology query type",
+      "isOptional": true
+    },
+    {
+      "name": "firestore_project",
+      "label": "Firestore project",
+      "helpText": "Google Cloud project with Firestore",
+      "isOptional": true
+    },
+    {
+      "name": "firestore_collection",
+      "label": "Firestore collection",
+      "helpText": "Firestore collection with HTTPArchive data",
+      "isOptional": true
+    },
+    {
+      "name": "firestore_database",
+      "label": "Firestore database",
+      "helpText": "Firestore database with HTTPArchive data",
+      "isOptional": true
+    },
+    {
+      "name": "date",
+      "label": "Date",
+      "helpText": "Date to query",
+      "isOptional": true
+    }
+  ]
+}
```
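In a Beam flex template, parameters like these are usually surfaced to the job as pipeline options. A minimal sketch of how the five parameters above could be declared, assuming a custom `PipelineOptions` subclass (the class name is hypothetical; the real option definitions live in the pipeline modules, which are not part of this diff):

```python
from apache_beam.options.pipeline_options import PipelineOptions


class TechReportOptions(PipelineOptions):
    """Illustrative options mirroring the flex template parameters above."""

    @classmethod
    def _add_argparse_args(cls, parser):
        parser.add_argument("--query_type", help="Technology query type")
        parser.add_argument("--firestore_project", help="Google Cloud project with Firestore")
        parser.add_argument("--firestore_collection", help="Firestore collection with HTTPArchive data")
        parser.add_argument("--firestore_database", help="Firestore database with HTTPArchive data")
        parser.add_argument("--date", help="Date to query")


# Example: parse argv-style flags, as a flex template launch would pass them.
opts = TechReportOptions(["--query_type=adoption", "--date=2024-01-01"])
print(opts.get_all_options()["query_type"])  # -> adoption
```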

run_flex_template.sh

Lines changed: 9 additions & 3 deletions

```diff
@@ -12,26 +12,32 @@ DF_JOB_ID="${REPO}-${TYPE}-$(date -u +%Y%m%d-%H%M%S)"
 DF_TEMP_BUCKET="gs://${PROJECT}-staging/dataflow"
 TEMPLATE_BASE_PATH="gs://${PROJECT}/dataflow/templates"
 
+# find the latest template if unset
+: "${TEMPLATE_PATH:=$(gsutil ls ${TEMPLATE_BASE_PATH}/${REPO}-"${TYPE}"*.json | sort -r | head -n 1)}"
+
 case "${TYPE}~${TEMPLATE_PATH}" in
 all~|combined~) : ;;
 all~gs://*all*) : ;;
 combined~gs://*combined*) : ;;
+tech_report~gs://*tech_report*) : ;;
 *)
-  echo "Expected an argument of either [all|combined] and optionally TEMPLATE_PATH to be set (otherwise the latest template will be used)"
+  echo "Expected an argument of either [all|combined|tech_report] and optionally TEMPLATE_PATH to be set (otherwise the latest template will be used)"
   echo "Examples"
   echo " $(basename "$0") all ..."
   echo " $(basename "$0") combined ..."
+  echo " $(basename "$0") tech_report ..."
   echo " TEMPLATE_PATH=${TEMPLATE_BASE_PATH}/${REPO}-all-2022-10-12_00-19-44.json $(basename "$0") all ..."
   echo " TEMPLATE_PATH=${TEMPLATE_BASE_PATH}/${REPO}-combined-2022-10-12_00-19-44.json $(basename "$0") combined ..."
+  echo " TEMPLATE_PATH=${TEMPLATE_BASE_PATH}/${REPO}-tech_report-2022-10-12_00-19-44.json $(basename "$0") tech_report ..."
   exit 1
   ;;
 esac
 
 # drop the first argument
 shift
 
-# find the latest template if unset
-: "${TEMPLATE_PATH:=$(gsutil ls ${TEMPLATE_BASE_PATH}/${REPO}-"${TYPE}"*.json | sort -r | head -n 1)}"
+# replace underscores with hyphens in the job id
+DF_JOB_ID=${DF_JOB_ID//_/-}
 
 set -u
 
```

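The new `DF_JOB_ID=${DF_JOB_ID//_/-}` line exists because Dataflow job names only allow lowercase letters, digits, and hyphens, so the underscore in `tech_report` would otherwise make the job id invalid. A small Python sketch of the same sanitization (the exact regex is an assumption based on Dataflow's documented naming rules):

```python
import re

# Dataflow job names: lowercase letters, digits and hyphens, starting with a
# letter and ending with a letter or digit (assumed from Dataflow's rules).
JOB_NAME_RE = re.compile(r"^[a-z]([-a-z0-9]*[a-z0-9])?$")


def sanitize_job_id(job_id: str) -> str:
    """Mirrors the shell rewrite: replace underscores with hyphens."""
    candidate = job_id.replace("_", "-").lower()
    if not JOB_NAME_RE.match(candidate):
        raise ValueError(f"invalid Dataflow job name: {candidate!r}")
    return candidate


print(sanitize_job_id("data-pipeline-tech_report-20240101-000000"))
# -> data-pipeline-tech-report-20240101-000000
```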
run_tech_report.py

Lines changed: 19 additions & 0 deletions

```diff
@@ -0,0 +1,19 @@
+#!/usr/bin/env python3
+
+import logging
+
+from apache_beam.runners import DataflowRunner
+
+from modules import tech_report_pipeline
+
+
+def run(argv=None):
+    logging.getLogger().setLevel(logging.INFO)
+    p = tech_report_pipeline.create_pipeline()
+    pipeline_result = p.run(argv)
+    if not isinstance(p.runner, DataflowRunner):
+        pipeline_result.wait_until_finish()
+
+
+if __name__ == "__main__":
+    run()
```
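The `isinstance(p.runner, DataflowRunner)` guard means the script only blocks on `wait_until_finish()` for local runs; when submitted to Dataflow, the job is left to run asynchronously. A self-contained illustration of the same pattern using Beam's default DirectRunner (the pipeline contents here are placeholder data, not the tech report queries):

```python
import apache_beam as beam
from apache_beam.runners import DataflowRunner

p = beam.Pipeline()  # DirectRunner by default
_ = p | beam.Create([1, 2, 3]) | beam.Map(print)

result = p.run()
if not isinstance(p.runner, DataflowRunner):
    # Local runs block here until the pipeline finishes;
    # on Dataflow this branch is skipped and the job runs detached.
    result.wait_until_finish()
```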
