Skip to content

Commit 75f999b

Browse files
feat(monitoring): k8s CronJob monitoring (#3118)
This PR adds Terraform config to auto-generate alerting policy for all Kubernetes CronJobs so that we are informed of failures in them. Much credit goes to @rjerrems for doing most of the legwork, I've just picked up his Terraform and massaged it into something that appears to work, based on `terraform plan` (any additional feedback on implementation greatly appreciated): ``` Terraform will perform the following actions: # module.k8s_cron_alert["CronJob--alias-computation"].google_monitoring_alert_policy.cron_alert_policy will be created + resource "google_monitoring_alert_policy" "cron_alert_policy" { + combiner = "OR" + creation_record = (known after apply) + display_name = "Cronjob: alias-computation has not run recently." + enabled = true + id = (known after apply) + name = (known after apply) + project = "oss-vdb-test" + conditions { + display_name = "Cronjob: alias-computation has not run recently." + name = (known after apply) + condition_prometheus_query_language { + alert_rule = "AlwaysOn" + duration = "60s" + evaluation_interval = "60s" + query = "((time() - kube_cronjob_status_last_successful_time{cronjob=\"alias-computation\"})/60) > 45" + rule_group = "cronjob alias-computation" } } } # module.k8s_cron_alert["CronJob--alpine-cve-convert"].google_monitoring_alert_policy.cron_alert_policy will be created + resource "google_monitoring_alert_policy" "cron_alert_policy" { + combiner = "OR" + creation_record = (known after apply) + display_name = "Cronjob: alpine-cve-convert has not run recently." + enabled = true + id = (known after apply) + name = (known after apply) + project = "oss-vdb-test" + conditions { + display_name = "Cronjob: alpine-cve-convert has not run recently." 
+ name = (known after apply) + condition_prometheus_query_language { + alert_rule = "AlwaysOn" + duration = "60s" + evaluation_interval = "60s" + query = "((time() - kube_cronjob_status_last_successful_time{cronjob=\"alpine-cve-convert\"})/60) > 180" + rule_group = "cronjob alpine-cve-convert" } } } # module.k8s_cron_alert["CronJob--backup"].google_monitoring_alert_policy.cron_alert_policy will be created + resource "google_monitoring_alert_policy" "cron_alert_policy" { + combiner = "OR" + creation_record = (known after apply) + display_name = "Cronjob: backup has not run recently." + enabled = true + id = (known after apply) + name = (known after apply) + project = "oss-vdb-test" + conditions { + display_name = "Cronjob: backup has not run recently." + name = (known after apply) + condition_prometheus_query_language { + alert_rule = "AlwaysOn" + duration = "60s" + evaluation_interval = "60s" + query = "((time() - kube_cronjob_status_last_successful_time{cronjob=\"backup\"})/60) > 2880" + rule_group = "cronjob backup" } } } # module.k8s_cron_alert["CronJob--combine-to-osv"].google_monitoring_alert_policy.cron_alert_policy will be created + resource "google_monitoring_alert_policy" "cron_alert_policy" { + combiner = "OR" + creation_record = (known after apply) + display_name = "Cronjob: combine-to-osv has not run recently." + enabled = true + id = (known after apply) + name = (known after apply) + project = "oss-vdb-test" + conditions { + display_name = "Cronjob: combine-to-osv has not run recently." 
+ name = (known after apply) + condition_prometheus_query_language { + alert_rule = "AlwaysOn" + duration = "60s" + evaluation_interval = "60s" + query = "((time() - kube_cronjob_status_last_successful_time{cronjob=\"combine-to-osv\"})/60) > 90" + rule_group = "cronjob combine-to-osv" } } } # module.k8s_cron_alert["CronJob--cpe-repo-gen"].google_monitoring_alert_policy.cron_alert_policy will be created + resource "google_monitoring_alert_policy" "cron_alert_policy" { + combiner = "OR" + creation_record = (known after apply) + display_name = "Cronjob: cpe-repo-gen has not run recently." + enabled = true + id = (known after apply) + name = (known after apply) + project = "oss-vdb-test" + conditions { + display_name = "Cronjob: cpe-repo-gen has not run recently." + name = (known after apply) + condition_prometheus_query_language { + alert_rule = "AlwaysOn" + duration = "60s" + evaluation_interval = "60s" + query = "((time() - kube_cronjob_status_last_successful_time{cronjob=\"cpe-repo-gen\"})/60) > 2880" + rule_group = "cronjob cpe-repo-gen" } } } # module.k8s_cron_alert["CronJob--debian-convert"].google_monitoring_alert_policy.cron_alert_policy will be created + resource "google_monitoring_alert_policy" "cron_alert_policy" { + combiner = "OR" + creation_record = (known after apply) + display_name = "Cronjob: debian-convert has not run recently." + enabled = true + id = (known after apply) + name = (known after apply) + project = "oss-vdb-test" + conditions { + display_name = "Cronjob: debian-convert has not run recently." 
+ name = (known after apply) + condition_prometheus_query_language { + alert_rule = "AlwaysOn" + duration = "60s" + evaluation_interval = "60s" + query = "((time() - kube_cronjob_status_last_successful_time{cronjob=\"debian-convert\"})/60) > 180" + rule_group = "cronjob debian-convert" } } } # module.k8s_cron_alert["CronJob--debian-copyright-mirror"].google_monitoring_alert_policy.cron_alert_policy will be created + resource "google_monitoring_alert_policy" "cron_alert_policy" { + combiner = "OR" + creation_record = (known after apply) + display_name = "Cronjob: debian-copyright-mirror has not run recently." + enabled = true + id = (known after apply) + name = (known after apply) + project = "oss-vdb-test" + conditions { + display_name = "Cronjob: debian-copyright-mirror has not run recently." + name = (known after apply) + condition_prometheus_query_language { + alert_rule = "AlwaysOn" + duration = "60s" + evaluation_interval = "60s" + query = "((time() - kube_cronjob_status_last_successful_time{cronjob=\"debian-copyright-mirror\"})/60) > 2880" + rule_group = "cronjob debian-copyright-mirror" } } } # module.k8s_cron_alert["CronJob--debian-cve-convert"].google_monitoring_alert_policy.cron_alert_policy will be created + resource "google_monitoring_alert_policy" "cron_alert_policy" { + combiner = "OR" + creation_record = (known after apply) + display_name = "Cronjob: debian-cve-convert has not run recently." + enabled = true + id = (known after apply) + name = (known after apply) + project = "oss-vdb-test" + conditions { + display_name = "Cronjob: debian-cve-convert has not run recently." 
+ name = (known after apply) + condition_prometheus_query_language { + alert_rule = "AlwaysOn" + duration = "60s" + evaluation_interval = "60s" + query = "((time() - kube_cronjob_status_last_successful_time{cronjob=\"debian-cve-convert\"})/60) > 120" + rule_group = "cronjob debian-cve-convert" } } } # module.k8s_cron_alert["CronJob--debian-first-version"].google_monitoring_alert_policy.cron_alert_policy will be created + resource "google_monitoring_alert_policy" "cron_alert_policy" { + combiner = "OR" + creation_record = (known after apply) + display_name = "Cronjob: debian-first-version has not run recently." + enabled = true + id = (known after apply) + name = (known after apply) + project = "oss-vdb-test" + conditions { + display_name = "Cronjob: debian-first-version has not run recently." + name = (known after apply) + condition_prometheus_query_language { + alert_rule = "AlwaysOn" + duration = "60s" + evaluation_interval = "60s" + query = "((time() - kube_cronjob_status_last_successful_time{cronjob=\"debian-first-version\"})/60) > 120" + rule_group = "cronjob debian-first-version" } } } # module.k8s_cron_alert["CronJob--exporter"].google_monitoring_alert_policy.cron_alert_policy will be created + resource "google_monitoring_alert_policy" "cron_alert_policy" { + combiner = "OR" + creation_record = (known after apply) + display_name = "Cronjob: exporter has not run recently." + enabled = true + id = (known after apply) + name = (known after apply) + project = "oss-vdb-test" + conditions { + display_name = "Cronjob: exporter has not run recently." 
+ name = (known after apply) + condition_prometheus_query_language { + alert_rule = "AlwaysOn" + duration = "60s" + evaluation_interval = "60s" + query = "((time() - kube_cronjob_status_last_successful_time{cronjob=\"exporter\"})/60) > 90" + rule_group = "cronjob exporter" } } } # module.k8s_cron_alert["CronJob--generate-sitemap"].google_monitoring_alert_policy.cron_alert_policy will be created + resource "google_monitoring_alert_policy" "cron_alert_policy" { + combiner = "OR" + creation_record = (known after apply) + display_name = "Cronjob: generate-sitemap has not run recently." + enabled = true + id = (known after apply) + name = (known after apply) + project = "oss-vdb-test" + conditions { + display_name = "Cronjob: generate-sitemap has not run recently." + name = (known after apply) + condition_prometheus_query_language { + alert_rule = "AlwaysOn" + duration = "60s" + evaluation_interval = "60s" + query = "((time() - kube_cronjob_status_last_successful_time{cronjob=\"generate-sitemap\"})/60) > 2880" + rule_group = "cronjob generate-sitemap" } } } # module.k8s_cron_alert["CronJob--importer"].google_monitoring_alert_policy.cron_alert_policy will be created + resource "google_monitoring_alert_policy" "cron_alert_policy" { + combiner = "OR" + creation_record = (known after apply) + display_name = "Cronjob: importer has not run recently." + enabled = true + id = (known after apply) + name = (known after apply) + project = "oss-vdb-test" + conditions { + display_name = "Cronjob: importer has not run recently." 
+ name = (known after apply) + condition_prometheus_query_language { + alert_rule = "AlwaysOn" + duration = "60s" + evaluation_interval = "60s" + query = "((time() - kube_cronjob_status_last_successful_time{cronjob=\"importer\"})/60) > 90" + rule_group = "cronjob importer" } } } # module.k8s_cron_alert["CronJob--importer-deleter"].google_monitoring_alert_policy.cron_alert_policy will be created + resource "google_monitoring_alert_policy" "cron_alert_policy" { + combiner = "OR" + creation_record = (known after apply) + display_name = "Cronjob: importer-deleter has not run recently." + enabled = true + id = (known after apply) + name = (known after apply) + project = "oss-vdb-test" + conditions { + display_name = "Cronjob: importer-deleter has not run recently." + name = (known after apply) + condition_prometheus_query_language { + alert_rule = "AlwaysOn" + duration = "60s" + evaluation_interval = "60s" + query = "((time() - kube_cronjob_status_last_successful_time{cronjob=\"importer-deleter\"})/60) > 360" + rule_group = "cronjob importer-deleter" } } } # module.k8s_cron_alert["CronJob--nvd-cve-osv"].google_monitoring_alert_policy.cron_alert_policy will be created + resource "google_monitoring_alert_policy" "cron_alert_policy" { + combiner = "OR" + creation_record = (known after apply) + display_name = "Cronjob: nvd-cve-osv has not run recently." + enabled = true + id = (known after apply) + name = (known after apply) + project = "oss-vdb-test" + conditions { + display_name = "Cronjob: nvd-cve-osv has not run recently." 
+ name = (known after apply) + condition_prometheus_query_language { + alert_rule = "AlwaysOn" + duration = "60s" + evaluation_interval = "60s" + query = "((time() - kube_cronjob_status_last_successful_time{cronjob=\"nvd-cve-osv\"})/60) > 86400" + rule_group = "cronjob nvd-cve-osv" } } } # module.k8s_cron_alert["CronJob--nvd-mirror"].google_monitoring_alert_policy.cron_alert_policy will be created + resource "google_monitoring_alert_policy" "cron_alert_policy" { + combiner = "OR" + creation_record = (known after apply) + display_name = "Cronjob: nvd-mirror has not run recently." + enabled = true + id = (known after apply) + name = (known after apply) + project = "oss-vdb-test" + conditions { + display_name = "Cronjob: nvd-mirror has not run recently." + name = (known after apply) + condition_prometheus_query_language { + alert_rule = "AlwaysOn" + duration = "60s" + evaluation_interval = "60s" + query = "((time() - kube_cronjob_status_last_successful_time{cronjob=\"nvd-mirror\"})/60) > 240" + rule_group = "cronjob nvd-mirror" } } } # module.k8s_cron_alert["CronJob--staging-api-test"].google_monitoring_alert_policy.cron_alert_policy will be created + resource "google_monitoring_alert_policy" "cron_alert_policy" { + combiner = "OR" + creation_record = (known after apply) + display_name = "Cronjob: staging-api-test has not run recently." + enabled = true + id = (known after apply) + name = (known after apply) + project = "oss-vdb-test" + conditions { + display_name = "Cronjob: staging-api-test has not run recently." + name = (known after apply) + condition_prometheus_query_language { + alert_rule = "AlwaysOn" + duration = "60s" + evaluation_interval = "60s" + query = "((time() - kube_cronjob_status_last_successful_time{cronjob=\"staging-api-test\"})/60) > 2880" + rule_group = "cronjob staging-api-test" } } } Plan: 16 to add, 0 to change, 0 to destroy. ```
1 parent 17bc251 commit 75f999b

19 files changed

+93
-0
lines changed

deployment/clouddeploy/gke-workers/base/alias-computation.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@ apiVersion: batch/v1
 kind: CronJob
 metadata:
   name: alias-computation
+  labels:
+    cronLastSuccessfulTimeMins: 45
 spec:
   schedule: "10/15 * * * *"
   concurrencyPolicy: Forbid

deployment/clouddeploy/gke-workers/base/alpine-cve-convert.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@ apiVersion: batch/v1
 kind: CronJob
 metadata:
   name: alpine-cve-convert
+  labels:
+    cronLastSuccessfulTimeMins: 180
 spec:
   schedule: "0 */1 * * *"
   concurrencyPolicy: Forbid

deployment/clouddeploy/gke-workers/base/backup.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@ apiVersion: batch/v1
 kind: CronJob
 metadata:
   name: backup
+  labels:
+    cronLastSuccessfulTimeMins: 2880
 spec:
   schedule: "0 18 * * *"
   concurrencyPolicy: Forbid

deployment/clouddeploy/gke-workers/base/combine-to-osv.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@ apiVersion: batch/v1
 kind: CronJob
 metadata:
   name: combine-to-osv
+  labels:
+    cronLastSuccessfulTimeMins: 90
 spec:
   schedule: "30 */1 * * *"
   concurrencyPolicy: Forbid

deployment/clouddeploy/gke-workers/base/cpe-repo-gen.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@ apiVersion: batch/v1
 kind: CronJob
 metadata:
   name: cpe-repo-gen
+  labels:
+    cronLastSuccessfulTimeMins: 2880
 spec:
   schedule: "0 6 */1 * *"
   concurrencyPolicy: Forbid

deployment/clouddeploy/gke-workers/base/debian-convert.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@ apiVersion: batch/v1
 kind: CronJob
 metadata:
   name: debian-convert
+  labels:
+    cronLastSuccessfulTimeMins: 180
 spec:
   schedule: "0 */1 * * *"
   concurrencyPolicy: Forbid

deployment/clouddeploy/gke-workers/base/debian-copyright-mirror.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@ apiVersion: batch/v1
 kind: CronJob
 metadata:
   name: debian-copyright-mirror
+  labels:
+    cronLastSuccessfulTimeMins: 2880
 spec:
   schedule: "0 6 */1 * *"
   concurrencyPolicy: Forbid

deployment/clouddeploy/gke-workers/base/debian-cve-convert.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@ apiVersion: batch/v1
 kind: CronJob
 metadata:
   name: debian-cve-convert
+  labels:
+    cronLastSuccessfulTimeMins: 120
 spec:
   schedule: "0 */1 * * *"
   concurrencyPolicy: Forbid

deployment/clouddeploy/gke-workers/base/debian-first-version.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@ apiVersion: batch/v1
 kind: CronJob
 metadata:
   name: debian-first-version
+  labels:
+    cronLastSuccessfulTimeMins: 120
 spec:
   schedule: "0 1 * * *"
   concurrencyPolicy: Forbid

deployment/clouddeploy/gke-workers/base/exporter.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@ apiVersion: batch/v1
 kind: CronJob
 metadata:
   name: exporter
+  labels:
+    cronLastSuccessfulTimeMins: 90
 spec:
   schedule: "*/30 * * * *"
   concurrencyPolicy: Forbid

0 commit comments

Comments
 (0)