diff --git a/common/alert_group/standard/1.0/facets.yaml b/common/alert_group/standard/1.0/facets.yaml
new file mode 100644
index 00000000..b095b9a4
--- /dev/null
+++ b/common/alert_group/standard/1.0/facets.yaml
@@ -0,0 +1,251 @@
+intent: alert_group
+flavor: standard
+version: '1.0'
+description: Creates Prometheus alert rules for monitoring and alerting with comprehensive
+  validation and enhanced UI features
+clouds:
+- aws
+- azure
+- gcp
+- kubernetes
+inputs:
+  kubernetes_details:
+    type: '@facets/kubernetes-details'
+    displayName: Kubernetes Cluster
+    optional: false
+    default:
+      resource_type: kubernetes_cluster
+      resource_name: default
+    providers:
+    - kubernetes
+    - kubernetes-alpha
+    - helm
+  prometheus:
+    type: '@facets/prometheus'
+    displayName: Prometheus Configuration
+    description: Prometheus instance details for alert rule configuration and deployment
+    optional: false
+    default:
+      resource_type: configuration
+      resource_name: prometheus
+spec:
+  title: Alert Group
+  type: object
+  description: Specification of the Alert Group resource intent
+  properties:
+    rules:
+      title: Alert Rules
+      type: object
+      description: Define alert rules for Prometheus monitoring
+      x-ui-toggle: false
+      patternProperties:
+        ^[a-zA-Z0-9_-]+$:
+          title: Alert Rule Configuration
+          type: object
+          properties:
+            expr:
+              type: string
+              title: Prometheus Expression
+              description: PromQL expression for the alert condition
+              pattern: ^.{1,2000}$
+              x-ui-error-message: Prometheus expression is required and must be between
+                1-2000 characters
+            for:
+              type: string
+              title: Alert Duration
+              description: Duration for which the condition must be true (e.g., 5m,
+                10s, 1h, 2d)
+              pattern: ^[0-9]+[smhd]$
+              default: 5m
+              x-ui-error-message: Duration must be in format like 5m, 30s, 1h, 2d
+            message:
+              type: string
+              title: Alert Message
+              description: Detailed message when alert fires
+              minLength: 10
+              maxLength: 1000
+              x-ui-error-message: Alert message must be between 10-1000 characters
+                and provide clear context
+            summary:
+              type: string
+              title: Alert Summary
+              description: Brief summary of the alert (appears in notifications)
+              minLength: 5
+              maxLength: 200
+              x-ui-error-message: Alert summary must be between 5-200 characters
+            resource_type:
+              type: string
+              title: Resource Type
+              description: Type of Kubernetes resource being monitored
+              minLength: 1
+              maxLength: 100
+              x-ui-api-source:
+                endpoint: /cc-ui/v1/dropdown/stack/{{stackName}}/resources-info
+                method: GET
+                params:
+                  includeContent: false
+                labelKey: resourceType
+                valueKey: resourceType
+                filterConditions:
+                - field: resourceType
+                  value: UNKNOWN
+                  type: negation
+              x-ui-typeable: true
+              x-ui-error-message: Please enter a valid Kubernetes resource type
+            resource_name:
+              type: string
+              resource_name: null
+              title: Resource Name
+              description: Name of the resource being monitored (must follow Kubernetes
+                naming conventions)
+              pattern: ^(\{\{.*\}\}|[a-z0-9]([-a-z0-9]*[a-z0-9])?)$
+              minLength: 1
+              maxLength: 63
+              x-ui-api-source:
+                endpoint: /cc-ui/v1/dropdown/stack/{{stackName}}/resources-info
+                method: GET
+                params:
+                  includeContent: false
+                labelKey: resourceName
+                valueKey: resourceName
+                filterConditions:
+                - field: resourceType
+                  value: spec.rules.{{this}}.resource_type
+                  type: dynamic
+              x-ui-error-message: Resource name must be valid Kubernetes name (lowercase,
+                alphanumeric, hyphens, 1-63 chars)
+            alert_type:
+              type: string
+              title: Alert Type
+              description: Type of alert supported by Facets monitoring system
+              enum:
+              - performance
+              - availability
+              - security
+              - capacity
+              - network
+              - database
+              - application
+              - infrastructure
+              - custom
+              minLength: 1
+              maxLength: 100
+              x-ui-typeable: true
+              x-ui-error-message: Select from predefined alert types or enter a custom
+                alert type
+            severity:
+              type: string
+              title: Severity Level
+              description: Alert severity level for prioritization and routing
+              enum:
+              - high
+              - normal
+              - urgent
+              - warning
+              - critical
+              default: normal
+              x-ui-typeable: true
+              x-ui-error-message: Select from predefined severity levels or enter
+                a custom severity
+            disabled:
+              type: boolean
+              title: Disabled
+              description: Whether this alert rule is disabled (won't trigger when
+                conditions are met)
+              default: false
+            runbook_url:
+              type: string
+              title: Runbook URL
+              description: URL to documentation or runbook for handling this alert
+              pattern: ^https?://.*
+              x-ui-error-message: Runbook URL must be a valid HTTP/HTTPS URL
+            escalation_policy:
+              type: string
+              title: Escalation Policy
+              description: Name of the escalation policy for this alert
+              enum:
+              - immediate
+              - standard
+              - low_priority
+              - business_hours
+              - weekend_only
+              - custom
+              default: standard
+              x-ui-error-message: Please select a valid escalation policy
+            thresholds:
+              type: object
+              title: Alert Thresholds
+              description: Configurable thresholds for the alert condition
+              x-ui-toggle: true
+              properties:
+                warning:
+                  type: number
+                  title: Warning Threshold
+                  description: Threshold value for warning level alerts
+                  minimum: 0
+                critical:
+                  type: number
+                  title: Critical Threshold
+                  description: Threshold value for critical level alerts
+                  minimum: 0
+                unit:
+                  type: string
+                  title: Unit
+                  description: Unit of measurement for thresholds
+                  enum:
+                  - percent
+                  - bytes
+                  - count
+                  - seconds
+                  - milliseconds
+                  - requests_per_second
+                  - errors_per_minute
+                  default: percent
+            labels:
+              type: object
+              title: Additional Labels
+              description: Additional labels for the alert (key-value pairs for routing
+                and grouping)
+              x-ui-toggle: true
+              x-ui-yaml-editor: true
+              x-ui-error-message: Labels should be key-value pairs with valid Kubernetes
+                label format
+            annotations:
+              type: object
+              title: Additional Annotations
+              description: Additional annotations for the alert (key-value pairs for
+                metadata)
+              x-ui-toggle: true
+              x-ui-yaml-editor: true
+              x-ui-error-message: Annotations should be key-value pairs providing
+                additional context
+          required:
+          - expr
+          - for
+          - resource_name
+          - resource_type
+          - summary
+          - message
+          - alert_type
+  required:
+  - rules
+outputs:
+  default:
+    type: '@facets/alert_group'
+    title: Alert Group Configuration
+sample:
+  version: '1.0'
+  flavor: standard
+  kind: alert_group
+  disabled: true
+  spec:
+    rules:
+      high_cpu_usage:
+        expr: cpu_usage_percent > 80
+        for: 5m
+        message: CPU usage is above 80% for more than 5 minutes
+        summary: High CPU usage detected
+        resource_type: pod
+        resource_name: my-application
+        alert_type: performance
+        severity: normal
diff --git a/common/alert_group/standard/1.0/main.tf b/common/alert_group/standard/1.0/main.tf
new file mode 100644
index 00000000..41369662
--- /dev/null
+++ b/common/alert_group/standard/1.0/main.tf
@@ -0,0 +1,99 @@
+locals {
+  spec = lookup(var.instance, "spec", {})
+
+  # Get alert rules from spec
+  rules = lookup(local.spec, "rules", {})
+
+  # Get Prometheus release ID from prometheus input
+  prometheus_release = lookup(var.inputs.prometheus.attributes, "helm_release_id", "prometheus")
+
+  # Transform rules into PrometheusRule format, filtering out disabled rules
+  alert_rules = [
+    for rule_name, rule_object in local.rules :
+    {
+      alert = rule_name
+      expr  = rule_object.expr
+      for   = rule_object.for
+      labels = merge(
+        lookup(rule_object, "labels", {}),
+        {
+          resource_type = rule_object.resource_type
+          resource_name = rule_object.resource_name
+          resourceType  = rule_object.resource_type
+          resourceName  = rule_object.resource_name
+          alert_type    = lookup(rule_object, "alert_type", null)
+          severity      = lookup(rule_object, "severity", null)
+        }
+      )
+      annotations = merge(
+        lookup(rule_object, "annotations", {}),
+        {
+          message = rule_object.message
+          summary = rule_object.summary
+        }
+      )
+    } if !lookup(rule_object, "disabled", false)
+  ]
+
+  # Extract rule names for outputs
+  rule_names = [for key, rule in local.rules : key if !lookup(rule, "disabled", false)]
+
+  # Metadata for PrometheusRule
+  prometheus_rule_metadata = {
+    name      = "${var.instance_name}-alert-group"
+    namespace = var.environment.namespace
+    labels = merge(
+      {
+        alert_group_name               = var.instance_name
+        role                           = "alert-rules"
+        release                        = local.prometheus_release
+        "app.kubernetes.io/name"       = var.instance_name
+        "app.kubernetes.io/instance"   = var.instance_name
+        "app.kubernetes.io/component"  = "alert-rules"
+        "app.kubernetes.io/managed-by" = "facets"
+      },
+      var.environment.cloud_tags
+    )
+    annotations = merge(
+      {
+        owner                      = "facets"
+        "facets.cloud/instance"    = var.instance_name
+        "facets.cloud/environment" = var.environment.name
+      }
+    )
+  }
+
+  # PrometheusRule manifest
+  prometheus_rule_manifest = {
+    apiVersion = "monitoring.coreos.com/v1"
+    kind       = "PrometheusRule"
+    metadata   = local.prometheus_rule_metadata
+    spec = {
+      groups = [
+        {
+          name  = "${var.instance_name}-alert-rules"
+          rules = local.alert_rules
+        }
+      ]
+    }
+  }
+}
+
+# Deploy PrometheusRule using helm_release with any-k8s-resource chart
+resource "helm_release" "alert_group" {
+  name             = "${var.instance_name}-alert-group"
+  chart            = "https://github.com/Facets-cloud/facets-utility-modules/raw/master/any-k8s-resource/dynamic-k8s-resource-0.1.0.tgz"
+  namespace        = var.environment.namespace
+  create_namespace = true
+  version          = "0.1.0"
+  timeout          = 300
+  cleanup_on_fail  = true
+  wait             = false
+  max_history      = 10
+
+  values = [
+    yamlencode({
+      resource = local.prometheus_rule_manifest
+    })
+  ]
+}
diff --git a/common/alert_group/standard/1.0/outputs.tf b/common/alert_group/standard/1.0/outputs.tf
new file mode 100644
index 00000000..aa3b1cb9
--- /dev/null
+++ b/common/alert_group/standard/1.0/outputs.tf
@@ -0,0 +1,9 @@
+locals {
+  output_attributes = {
+    alert_count          = length(local.rule_names)
+    alert_names          = local.rule_names
+    namespace            = var.environment.namespace
+    prometheus_rule_name = "${var.instance_name}-alert-group"
+  }
+  output_interfaces = {}
+}
diff --git a/common/alert_group/standard/1.0/variables.tf b/common/alert_group/standard/1.0/variables.tf
new file mode 100644
index 00000000..844db25a
--- /dev/null
+++ b/common/alert_group/standard/1.0/variables.tf
@@ -0,0 +1,33 @@
+variable "instance" {
+  description = "Instance configuration for alert group"
+  type = object({
+    spec = any
+  })
+}
+
+variable "instance_name" {
+  description = "Name of the alert group instance"
+  type        = string
+}
+
+variable "environment" {
+  description = "Environment configuration"
+  type = object({
+    name       = string
+    namespace  = string
+    cloud_tags = map(string)
+  })
+}
+
+variable "inputs" {
+  description = "Input resources for the module"
+  type = object({
+    kubernetes_details = object({
+      resource_name = string
+      resource_type = string
+    })
+    prometheus = object({
+      attributes = any
+    })
+  })
+}
diff --git a/common/prometheus/k8s_standard/1.0/main.tf b/common/prometheus/k8s_standard/1.0/main.tf
index b0bdca95..e34f0542 100644
--- a/common/prometheus/k8s_standard/1.0/main.tf
+++ b/common/prometheus/k8s_standard/1.0/main.tf
@@ -48,6 +48,9 @@ resource "helm_release" "prometheus-operator" {
         }
       },
       alertmanager = {
+        annotations = {
+          "cluster-autoscaler.kubernetes.io/safe-to-evict" = "true"
+        }
         alertmanagerSpec = {
           storage = {
             volumeClaimTemplate = {
@@ -68,6 +71,37 @@
             }
           }
         }
+        config = {
+          global = {
+            resolve_timeout = "60m"
+          }
+          route = {
+            receiver        = "default"
+            group_by        = ["alertname", "entity"]
+            routes          = []
+            group_wait      = "30s"
+            group_interval  = "5m"
+            repeat_interval = "6h"
+          }
+          receivers : [
+            {
+              name = "default"
+              webhook_configs : [
+                {
+                  url           = "http://alertmanager-webhook.default/alerts"
+                  send_resolved = true
+                },
+                {
+                  url           = "https://${var.cc_metadata.cc_host}/cc/v1/clusters/${var.cluster.id}/alerts"
+                  send_resolved = true
+                  http_config = {
+                    bearer_token = var.cc_metadata.cc_auth_token
+                  }
+                }
+              ]
+            }
+          ]
+        }
       },
       kube-state-metrics = {
         enabled = true
diff --git a/common/prometheus/k8s_standard/1.0/outputs.tf b/common/prometheus/k8s_standard/1.0/outputs.tf
index d18e47f8..187d7d0f 100644
--- a/common/prometheus/k8s_standard/1.0/outputs.tf
+++ b/common/prometheus/k8s_standard/1.0/outputs.tf
@@ -1,12 +1,14 @@
 locals {
   output_interfaces = {}
   output_attributes = {
-    prometheus_url       = "http://${module.name.name}.${var.environment.namespace}.svc.cluster.local:9090"
-    alertmanager_url     = "http://${module.name.name}-alertmanager.${var.environment.namespace}.svc.cluster.local:9093"
-    grafana_url          = "http://${module.name.name}-grafana.${var.environment.namespace}.svc.cluster.local:80"
-    helm_release_id      = helm_release.prometheus-operator.id
-    prometheus_service   = "${module.name.name}-prometheus"
-    alertmanager_service = "${module.name.name}-alertmanager"
-    grafana_service      = "${module.name.name}-grafana"
+    prometheus_url          = "http://${module.name.name}.${var.environment.namespace}.svc.cluster.local:9090"
+    alertmanager_url        = "http://${module.name.name}-alertmanager.${var.environment.namespace}.svc.cluster.local:9093"
+    grafana_url             = "http://${module.name.name}-grafana.${var.environment.namespace}.svc.cluster.local:80"
+    helm_release_id         = helm_release.prometheus-operator.id
+    prometheus_release_name = module.name.name
+    namespace               = local.namespace
+    prometheus_service      = "${module.name.name}-prometheus"
+    alertmanager_service    = "${module.name.name}-alertmanager"
+    grafana_service         = "${module.name.name}-grafana"
   }
 }
\ No newline at end of file