Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
251 changes: 251 additions & 0 deletions common/alert_group/standard/1.0/facets.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,251 @@
# Module identity: intent / flavor / version, plus a human-readable summary.
intent: alert_group
flavor: standard
version: "1.0"
description: >-
  Creates Prometheus alert rules for monitoring and alerting with comprehensive
  validation and enhanced UI features
# Clouds listed for this flavor.
clouds: [aws, azure, gcp, kubernetes]
# Resources this module takes as inputs. Each entry declares the input type,
# whether it is optional, and the default resource reference.
inputs:
  kubernetes_details:
    type: "@facets/kubernetes-details"
    displayName: Kubernetes Cluster
    optional: false
    default:
      resource_type: kubernetes_cluster
      resource_name: default
    providers: [kubernetes, kubernetes-alpha, helm]
  prometheus:
    type: "@facets/prometheus"
    displayName: Prometheus Configuration
    description: >-
      Prometheus instance details for alert rule configuration and deployment
    optional: false
    default:
      resource_type: configuration
      resource_name: prometheus
# Schema for the alert group spec: a map of alert rules keyed by rule name.
spec:
  title: Alert Group
  type: object
  description: Specification of the Alert Group resource intent
  properties:
    rules:
      title: Alert Rules
      type: object
      description: Define alert rules for Prometheus monitoring
      x-ui-toggle: false
      patternProperties:
        # Rule names: letters, digits, underscore and hyphen only.
        ^[a-zA-Z0-9_-]+$:
          title: Alert Rule Configuration
          type: object
          properties:
            expr:
              type: string
              title: Prometheus Expression
              description: PromQL expression for the alert condition
              # NOTE(review): "." does not match newlines in most regex
              # engines, so a multi-line expression would be rejected -
              # confirm whether that is intended.
              pattern: ^.{1,2000}$
              x-ui-error-message: Prometheus expression is required and must be between
                1-2000 characters
            for:
              type: string
              title: Alert Duration
              description: Duration for which the condition must be true (e.g., 5m,
                10s, 1h, 2d)
              # Single integer followed by one unit: s, m, h or d.
              pattern: ^[0-9]+[smhd]$
              default: 5m
              x-ui-error-message: Duration must be in format like 5m, 30s, 1h, 2d
            message:
              type: string
              title: Alert Message
              description: Detailed message when alert fires
              minLength: 10
              maxLength: 1000
              x-ui-error-message: Alert message must be between 10-1000 characters
                and provide clear context
            summary:
              type: string
              title: Alert Summary
              description: Brief summary of the alert (appears in notifications)
              minLength: 5
              maxLength: 200
              x-ui-error-message: Alert summary must be between 5-200 characters
            resource_type:
              type: string
              title: Resource Type
              description: Type of Kubernetes resource being monitored
              minLength: 1
              maxLength: 100
              # Dropdown options come from the stack's resources-info endpoint;
              # entries whose resourceType is UNKNOWN are filtered out.
              x-ui-api-source:
                endpoint: /cc-ui/v1/dropdown/stack/{{stackName}}/resources-info
                method: GET
                params:
                  includeContent: false
                labelKey: resourceType
                valueKey: resourceType
                filterConditions:
                - field: resourceType
                  value: UNKNOWN
                  type: negation
              x-ui-typeable: true
              x-ui-error-message: Please enter a valid Kubernetes resource type
            resource_name:
              type: string
              title: Resource Name
              description: Name of the resource being monitored (must follow Kubernetes
                naming conventions)
              # Accepts either a {{...}} template expression or a lowercase
              # alphanumeric name with inner hyphens.
              pattern: ^(\{\{.*\}\}|[a-z0-9]([-a-z0-9]*[a-z0-9])?)$
              minLength: 1
              maxLength: 63
              # Options are filtered by the resource_type chosen on the same rule.
              x-ui-api-source:
                endpoint: /cc-ui/v1/dropdown/stack/{{stackName}}/resources-info
                method: GET
                params:
                  includeContent: false
                labelKey: resourceName
                valueKey: resourceName
                filterConditions:
                - field: resourceType
                  value: spec.rules.{{this}}.resource_type
                  type: dynamic
              x-ui-error-message: Resource name must be valid Kubernetes name (lowercase,
                alphanumeric, hyphens, 1-63 chars)
            alert_type:
              type: string
              title: Alert Type
              description: Type of alert supported by Facets monitoring system
              # NOTE(review): enum restricts values to this list, while
              # x-ui-typeable and the error message suggest free-form entry -
              # confirm which is intended.
              enum:
              - performance
              - availability
              - security
              - capacity
              - network
              - database
              - application
              - infrastructure
              - custom
              minLength: 1
              maxLength: 100
              x-ui-typeable: true
              x-ui-error-message: Select from predefined alert types or enter a custom
                alert type
            severity:
              type: string
              title: Severity Level
              description: Alert severity level for prioritization and routing
              # NOTE(review): same enum vs. x-ui-typeable tension as alert_type.
              enum:
              - high
              - normal
              - urgent
              - warning
              - critical
              default: normal
              x-ui-typeable: true
              x-ui-error-message: Select from predefined severity levels or enter
                a custom severity
            disabled:
              type: boolean
              title: Disabled
              description: Whether this alert rule is disabled (won't trigger when
                conditions are met)
              default: false
            runbook_url:
              type: string
              title: Runbook URL
              description: URL to documentation or runbook for handling this alert
              pattern: ^https?://.*
              x-ui-error-message: Runbook URL must be a valid HTTP/HTTPS URL
            escalation_policy:
              type: string
              title: Escalation Policy
              description: Name of the escalation policy for this alert
              enum:
              - immediate
              - standard
              - low_priority
              - business_hours
              - weekend_only
              - custom
              default: standard
              x-ui-error-message: Please select a valid escalation policy
            thresholds:
              type: object
              title: Alert Thresholds
              description: Configurable thresholds for the alert condition
              x-ui-toggle: true
              properties:
                warning:
                  type: number
                  title: Warning Threshold
                  description: Threshold value for warning level alerts
                  minimum: 0
                critical:
                  type: number
                  title: Critical Threshold
                  description: Threshold value for critical level alerts
                  minimum: 0
                unit:
                  type: string
                  title: Unit
                  description: Unit of measurement for thresholds
                  enum:
                  - percent
                  - bytes
                  - count
                  - seconds
                  - milliseconds
                  - requests_per_second
                  - errors_per_minute
                  default: percent
            labels:
              type: object
              title: Additional Labels
              description: Additional labels for the alert (key-value pairs for routing
                and grouping)
              x-ui-toggle: true
              x-ui-yaml-editor: true
              x-ui-error-message: Labels should be key-value pairs with valid Kubernetes
                label format
            annotations:
              type: object
              title: Additional Annotations
              description: Additional annotations for the alert (key-value pairs for
                metadata)
              x-ui-toggle: true
              x-ui-yaml-editor: true
              x-ui-error-message: Annotations should be key-value pairs providing
                additional context
          # Fields every rule must provide.
          required:
          - expr
          - for
          - resource_name
          - resource_type
          - summary
          - message
          - alert_type
  required:
  - rules
# Output type published by this module under the "default" key.
outputs:
  default:
    type: "@facets/alert_group"
    title: Alert Group Configuration
# Example instance: a single CPU rule that fills in every required field.
sample:
  version: "1.0"
  flavor: standard
  kind: alert_group
  disabled: true
  spec:
    rules:
      high_cpu_usage:
        expr: "cpu_usage_percent > 80"
        for: "5m"
        message: "CPU usage is above 80% for more than 5 minutes"
        summary: "High CPU usage detected"
        resource_type: pod
        resource_name: my-application
        alert_type: performance
        severity: normal
99 changes: 99 additions & 0 deletions common/alert_group/standard/1.0/main.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
locals {
  # Instance spec; falls back to an empty object when "spec" is not set.
  spec = lookup(var.instance, "spec", {})

  # Alert rules from the spec, keyed by rule name.
  rules = lookup(local.spec, "rules", {})

  # Helm release id read from the prometheus input; "prometheus" when absent.
  prometheus_release = lookup(var.inputs.prometheus.attributes, "helm_release_id", "prometheus")

  # Transform rules into PrometheusRule format, filtering out disabled rules.
  # NOTE(review): runbook_url, escalation_policy and thresholds are not read
  # anywhere in this block - confirm whether they should be surfaced.
  alert_rules = [
    for rule_name, rule_object in local.rules :
    {
      alert = rule_name
      expr  = rule_object.expr
      for   = rule_object.for
      # User-supplied labels come first so the module-managed labels below
      # take precedence on key conflicts.
      labels = merge(
        lookup(rule_object, "labels", {}),
        {
          # alert_type / severity resolve to null when unset; drop null
          # entries so no null-valued label reaches the manifest.
          for label_key, label_value in {
            resource_type = rule_object.resource_type
            resource_name = rule_object.resource_name
            resourceType  = rule_object.resource_type
            resourceName  = rule_object.resource_name
            alert_type    = lookup(rule_object, "alert_type", null)
            severity      = lookup(rule_object, "severity", null)
          } : label_key => label_value if label_value != null
        }
      )
      # message and summary always override same-named user annotations.
      annotations = merge(
        lookup(rule_object, "annotations", {}),
        {
          message = rule_object.message
          summary = rule_object.summary
        }
      )
    } if !lookup(rule_object, "disabled", false)
  ]

  # Names of the enabled rules, for outputs.
  rule_names = [for key, rule in local.rules : key if !lookup(rule, "disabled", false)]

  # Metadata for PrometheusRule
  prometheus_rule_metadata = {
    name      = "${var.instance_name}-alert-group"
    namespace = var.environment.namespace
    # cloud_tags is merged last, so it wins on key conflicts.
    labels = merge(
      {
        alert_group_name               = var.instance_name
        role                           = "alert-rules"
        release                        = local.prometheus_release
        "app.kubernetes.io/name"       = var.instance_name
        "app.kubernetes.io/instance"   = var.instance_name
        "app.kubernetes.io/component"  = "alert-rules"
        "app.kubernetes.io/managed-by" = "facets"
      },
      var.environment.cloud_tags
    )
    annotations = {
      owner                      = "facets"
      "facets.cloud/instance"    = var.instance_name
      "facets.cloud/environment" = var.environment.name
    }
  }

  # PrometheusRule manifest: a single rule group holding all enabled rules.
  prometheus_rule_manifest = {
    apiVersion = "monitoring.coreos.com/v1"
    kind       = "PrometheusRule"
    metadata   = local.prometheus_rule_metadata
    spec = {
      groups = [
        {
          name  = "${var.instance_name}-alert-rules"
          rules = local.alert_rules
        }
      ]
    }
  }
}

# Deploy PrometheusRule using helm_release with any-k8s-resource chart
resource "helm_release" "alert_group" {
  # Release identity and target namespace.
  name             = "${var.instance_name}-alert-group"
  namespace        = var.environment.namespace
  create_namespace = true

  # Chart archive used to render the manifest passed in `values`.
  chart   = "https://github.com/Facets-cloud/facets-utility-modules/raw/master/any-k8s-resource/dynamic-k8s-resource-0.1.0.tgz"
  version = "0.1.0"

  # Release behaviour.
  wait            = false
  timeout         = 300
  cleanup_on_fail = true
  max_history     = 10

  # The PrometheusRule manifest is handed to the chart under the `resource` key.
  values = [
    yamlencode({ resource = local.prometheus_rule_manifest })
  ]
}
9 changes: 9 additions & 0 deletions common/alert_group/standard/1.0/outputs.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
locals {
  # Attributes exposed for this alert group: the rule object's name and
  # namespace, plus the names and count of the enabled rules.
  output_attributes = {
    prometheus_rule_name = "${var.instance_name}-alert-group"
    namespace            = var.environment.namespace
    alert_names          = local.rule_names
    alert_count          = length(local.rule_names)
  }

  # No interfaces are defined for this module.
  output_interfaces = {}
}
33 changes: 33 additions & 0 deletions common/alert_group/standard/1.0/variables.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# Instance payload; only `spec` is declared and its shape is left open.
variable "instance" {
  type        = object({ spec = any })
  description = "Instance configuration for alert group"
}

# Instance name as a plain string.
variable "instance_name" {
  type        = string
  description = "Name of the alert group instance"
}

# Environment settings: name, namespace and a string map of cloud tags.
variable "environment" {
  type = object({
    cloud_tags = map(string)
    name       = string
    namespace  = string
  })
  description = "Environment configuration"
}

# Input resources wired into the module: the Kubernetes cluster reference and
# the Prometheus input, whose attributes are left untyped.
variable "inputs" {
  type = object({
    prometheus = object({
      attributes = any
    })
    kubernetes_details = object({
      resource_type = string
      resource_name = string
    })
  })
  description = "Input resources for the module"
}
Loading