Skip to content

Commit ef00169

Browse files
Truncate Firestore data before export (#56)
* firestore truncate option
* conditional db environment
* modularized functions
* trigger using both env
* truncate required
* lint
* lint
1 parent 24ecd03 commit ef00169

File tree

16 files changed: 321 additions (+321) and 151 deletions (-151).

infra/bigquery-export/firestore.js

Lines changed: 11 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -15,9 +15,6 @@ export class FirestoreBatch {
1515
constructor () {
1616
this.firestore = new Firestore()
1717
this.bigquery = new BigQueryExport()
18-
this.firestore.settings({
19-
databaseId: 'tech-report-apis-prod'
20-
})
2118
this.batchSize = 500
2219
this.maxConcurrentBatches = 200
2320
}
@@ -139,13 +136,18 @@ export class FirestoreBatch {
139136
console.info(`Transfer to ${this.collectionName} complete. Total rows processed: ${totalRowsProcessed}. Time: ${duration} seconds`)
140137
}
141138

142-
async export (config, query) {
143-
this.date = config.date
144-
this.collectionName = config.name
145-
this.collectionType = config.type
139+
async export (exportConfig, query) {
140+
this.date = exportConfig.date
141+
this.collectionName = exportConfig.name
142+
this.collectionType = exportConfig.type
143+
this.firestore.settings({
144+
databaseId: 'tech-report-apis-' + exportConfig.environment
145+
})
146146

147-
// Delete documents before writing new ones
148-
await this.batchDelete()
147+
// Delete all the documents before writing the new ones
148+
if (exportConfig.truncate !== 'false') {
149+
await this.batchDelete()
150+
}
149151

150152
const rowStream = await this.bigquery.queryResultsStream(query)
151153
await this.streamFromBigQuery(rowStream)

infra/bigquery-export/package.json

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -4,7 +4,7 @@
44
"main": "index.js",
55
"scripts": {
66
"start": "node index.js",
7-
"buildpack": "rm -rf node_modules; gcloud builds submit --pack image=gcr.io/httparchive/bigquery-export"
7+
"buildpack": "rm -rf node_modules; gcloud builds submit --pack image=us.gcr.io/httparchive/cloud-run/bigquery-export"
88
},
99
"type": "module",
1010
"dependencies": {

infra/bigquery-export/reports.js

Lines changed: 19 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -9,8 +9,8 @@ export class ReportsExporter {
99
}
1010

1111
// Export timeseries reports
12-
async exportTimeseries (exportData) {
13-
const metric = exportData.name
12+
async exportTimeseries (exportConfig) {
13+
const metric = exportConfig.name
1414
const query = `
1515
SELECT
1616
FORMAT_DATE('%Y_%m_%d', date) AS date,
@@ -22,9 +22,9 @@ FROM reports.${metric}_timeseries
2222
}
2323

2424
// Export monthly histogram report
25-
async exportHistogram (exportData) {
26-
const metric = exportData.name
27-
const date = exportData.date
25+
async exportHistogram (exportConfig) {
26+
const metric = exportConfig.name
27+
const date = exportConfig.date
2828

2929
const query = `
3030
SELECT * EXCEPT(date)
@@ -35,16 +35,16 @@ WHERE date = '${date}'
3535
await this.storage.exportToJson(rows, `${this.storagePath}${date.replaceAll('-', '_')}/${metric}.json`)
3636
}
3737

38-
async export (exportData) {
39-
if (exportData.dataform_trigger !== 'report_complete') {
38+
async export (exportConfig) {
39+
if (exportConfig.dataform_trigger !== 'report_complete') {
4040
console.error('Invalid dataform trigger')
4141
return
4242
}
4343

44-
if (exportData.type === 'histogram') {
45-
await this.exportHistogram(exportData)
46-
} else if (exportData.type === 'timeseries') {
47-
await this.exportTimeseries(exportData)
44+
if (exportConfig.type === 'histogram') {
45+
await this.exportHistogram(exportConfig)
46+
} else if (exportConfig.type === 'timeseries') {
47+
await this.exportTimeseries(exportConfig)
4848
} else {
4949
console.error('Invalid report type')
5050
}
@@ -56,30 +56,30 @@ export class TechReportsExporter {
5656
this.firestore = new FirestoreBatch()
5757
}
5858

59-
async export (exportData) {
60-
if (exportData.dataform_trigger !== 'report_cwv_tech_complete') {
59+
async export (exportConfig) {
60+
if (exportConfig.dataform_trigger !== 'report_cwv_tech_complete') {
6161
console.error('Invalid dataform trigger')
6262
return
6363
}
6464

6565
let query = ''
66-
if (exportData.type === 'report') {
66+
if (exportConfig.type === 'report') {
6767
query = `
6868
SELECT
6969
STRING(date) AS date,
7070
* EXCEPT(date)
71-
FROM httparchive.reports.cwv_tech_${exportData.name}
72-
WHERE date = '${exportData.date}'
71+
FROM httparchive.reports.cwv_tech_${exportConfig.name}
72+
WHERE date = '${exportConfig.date}'
7373
`
74-
} else if (exportData.type === 'dict') {
74+
} else if (exportConfig.type === 'dict') {
7575
query = `
7676
SELECT *
77-
FROM reports.cwv_tech_${exportData.name}
77+
FROM reports.cwv_tech_${exportConfig.name}
7878
`
7979
} else {
8080
console.error('Invalid export type')
8181
}
8282

83-
await this.firestore.export(exportData, query)
83+
await this.firestore.export(exportConfig, query)
8484
}
8585
}

infra/dataform-export/index.js

Lines changed: 11 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -54,6 +54,12 @@ functions.http('dataform-export', async (req, res) => {
5454
res.status(400).send('Bad Request: no query found')
5555
}
5656

57+
const repoEnvironment = messageData.protoPayload.serviceData.jobCompletedEvent.job.jobConfiguration.labels.dataform_repository_id
58+
if (!repoEnvironment) {
59+
console.log(`no repo environment found: ${JSON.stringify(messageData)}`)
60+
res.status(400).send('Bad Request: no repo environment found')
61+
}
62+
5763
const regex = /\/\* ({"dataform_trigger":.+) \*\//
5864
const reportConfig = regex.exec(query)
5965
if (!reportConfig) {
@@ -62,6 +68,11 @@ functions.http('dataform-export', async (req, res) => {
6268
}
6369

6470
const eventData = JSON.parse(reportConfig[1])
71+
if (!eventData) {
72+
console.log(`no event data found: ${reportConfig[1]}`)
73+
res.status(400).send('Bad Request: no event data found')
74+
}
75+
eventData.environment = repoEnvironment === 'crawl-data' ? 'prod' : 'dev'
6576
await callRunJob(eventData)
6677

6778
res.status(200).send('OK')

infra/tf/.terraform.lock.hcl

Lines changed: 26 additions & 26 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

infra/tf/bigquery_export/main.tf

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
terraform {
2+
required_version = ">= 1.9.7"
3+
4+
required_providers {
5+
archive = {
6+
source = "hashicorp/archive"
7+
version = "2.6.0"
8+
}
9+
google = {
10+
source = "hashicorp/google"
11+
version = ">= 6.13.0"
12+
}
13+
}
14+
}
15+
16+
data "archive_file" "zip" {
17+
type = "zip"
18+
source_dir = "../${var.function_name}/"
19+
output_path = "./tmp/${var.function_name}.zip"
20+
}
21+
22+
resource "google_storage_bucket_object" "zource" {
23+
bucket = "gcf-v2-uploads-${var.project_number}-${var.region}"
24+
name = "${var.function_name}_${data.archive_file.zip.id}.zip"
25+
source = data.archive_file.zip.output_path
26+
}
27+
28+
resource "google_cloud_run_v2_job" "bigquery_export" {
29+
name = var.function_name
30+
location = var.region
31+
32+
deletion_protection = false
33+
34+
template {
35+
template {
36+
containers {
37+
image = "${var.location}.gcr.io/${var.project}/cloud-run/${var.function_name}:latest"
38+
resources {
39+
limits = {
40+
cpu = "4"
41+
memory = "4Gi"
42+
}
43+
}
44+
env {
45+
name = "EXPORT_CONFIG"
46+
value = ""
47+
}
48+
}
49+
timeout = "3600s"
50+
service_account = var.function_identity
51+
}
52+
}
53+
}
54+

infra/tf/bigquery_export/variables.tf

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
variable "project" {
2+
type = string
3+
}
4+
5+
variable "project_number" {
6+
type = string
7+
}
8+
9+
variable "region" {
10+
type = string
11+
}
12+
13+
variable "function_identity" {
14+
type = string
15+
}
16+
17+
variable "function_name" {
18+
type = string
19+
}
20+
21+
variable "location" {
22+
type = string
23+
}

infra/tf/bigquery_export_job.tf

Lines changed: 0 additions & 25 deletions
This file was deleted.

0 commit comments

Comments
 (0)