diff --git a/docs/resources/asserts_prom_rule_file.md b/docs/resources/asserts_prom_rule_file.md
new file mode 100644
index 000000000..b7a99d645
--- /dev/null
+++ b/docs/resources/asserts_prom_rule_file.md
@@ -0,0 +1,314 @@
+---
+# generated by https://github.com/hashicorp/terraform-plugin-docs
+page_title: "grafana_asserts_prom_rule_file Resource - terraform-provider-grafana"
+subcategory: "Knowledge Graph"
+description: |-
+  Manages Prometheus rules configurations through the Grafana Asserts API. Allows creation and management of custom Prometheus recording and alerting rules.
+---
+
+# grafana_asserts_prom_rule_file (Resource)
+
+Manages Prometheus rules configurations through the Grafana Asserts API. Allows creation and management of custom Prometheus recording and alerting rules.
+
+## Example Usage
+
+```terraform
+# Basic recording rule for latency metrics
+resource "grafana_asserts_prom_rule_file" "latency_metrics" {
+  name = "custom-latency-metrics"
+  active = true
+
+  group {
+    name = "latency_recording_rules"
+    interval = "30s"
+
+    rule {
+      record = "custom:latency:p95"
+      expr = "histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m]))"
+      labels = {
+        source = "custom_instrumentation"
+        severity = "info"
+      }
+    }
+
+    rule {
+      record = "custom:latency:p99"
+      expr = "histogram_quantile(0.99, rate(http_request_duration_seconds_bucket[5m]))"
+      labels = {
+        source = "custom_instrumentation"
+        severity = "info"
+      }
+    }
+  }
+}
+
+# Alert rules for high latency
+resource "grafana_asserts_prom_rule_file" "latency_alerts" {
+  name = "custom-latency-alerts"
+  active = true
+
+  group {
+    name = "latency_alerting"
+    interval = "30s"
+
+    rule {
+      alert = "HighLatency"
+      expr = "histogram_quantile(0.99, rate(http_request_duration_seconds_bucket[5m])) > 0.5"
+      duration = "5m"
+      labels = {
+        severity = "warning"
+        category = "Latency"
+      }
+      annotations = {
+        summary = "High latency detected"
+        description = "P99 latency is above 500ms for 5 minutes"
+      }
+    }
+
+    rule {
+      alert = "VeryHighLatency"
+      expr = "histogram_quantile(0.99, rate(http_request_duration_seconds_bucket[5m])) > 1.0"
+      duration = "2m"
+      labels = {
+        severity = "critical"
+        category = "Latency"
+      }
+      annotations = {
+        summary = "Very high latency detected"
+        description = "P99 latency is above 1 second"
+      }
+    }
+  }
+}
+
+# Comprehensive monitoring rules with multiple groups
+resource "grafana_asserts_prom_rule_file" "comprehensive_monitoring" {
+  name = "custom-comprehensive-monitoring"
+  active = true
+
+  # Latency monitoring
+  group {
+    name = "latency_monitoring"
+    interval = "30s"
+
+    rule {
+      record = "custom:latency:p99"
+      expr = "histogram_quantile(0.99, rate(http_request_duration_seconds_bucket[5m]))"
+      labels = {
+        source = "custom"
+      }
+    }
+
+    rule {
+      alert = "HighLatency"
+      expr = "custom:latency:p99 > 0.5"
+      duration = "5m"
+      labels = {
+        severity = "warning"
+      }
+      annotations = {
+        summary = "High latency detected"
+      }
+    }
+  }
+
+  # Error rate monitoring
+  group {
+    name = "error_monitoring"
+    interval = "1m"
+
+    rule {
+      record = "custom:error:rate"
+      expr = "rate(http_requests_total{status=~\"5..\"}[5m])"
+      labels = {
+        source = "custom"
+      }
+    }
+
+    rule {
+      alert = "HighErrorRate"
+      expr = "custom:error:rate > 0.1"
+      duration = "10m"
+      labels = {
+        severity = "critical"
+        category = "Errors"
+      }
+      annotations = {
+        summary = "High error rate detected"
+        description = "Error rate is above 10%"
+      }
+    }
+  }
+
+  # Throughput monitoring
+  group {
+    name = "throughput_monitoring"
+    interval = "1m"
+
+    rule {
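+      # Aggregate request rate; the LowThroughput alert below consumes this series.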
record = "custom:throughput:total" + expr = "sum(rate(http_requests_total[5m]))" + labels = { + source = "custom" + } + } + + rule { + alert = "LowThroughput" + expr = "custom:throughput:total < 10" + duration = "5m" + labels = { + severity = "warning" + category = "Throughput" + } + annotations = { + summary = "Low throughput detected" + description = "Request throughput is below 10 requests/second" + } + } + } +} + +# Rules with conditional enablement +resource "grafana_asserts_prom_rule_file" "conditional_rules" { + name = "custom-conditional-rules" + active = true + + group { + name = "environment_specific_rules" + interval = "30s" + + rule { + alert = "TestAlert" + expr = "up == 0" + duration = "1m" + labels = { + severity = "info" + } + annotations = { + summary = "Test alert that is disabled in production" + } + # This rule will be disabled in the production group + disable_in_groups = ["production"] + } + + rule { + alert = "CriticalAlert" + expr = "up == 0" + duration = "30s" + labels = { + severity = "critical" + } + annotations = { + summary = "Critical alert that fires in all environments" + } + } + } +} + +# Inactive rules (for staging/testing) +resource "grafana_asserts_prom_rule_file" "staging_rules" { + name = "custom-staging-rules" + active = false # Rules file is inactive + + group { + name = "staging_tests" + interval = "1m" + + rule { + record = "staging:test:metric" + expr = "up" + labels = { + environment = "staging" + } + } + } +} + +# SLO-based alerting +resource "grafana_asserts_prom_rule_file" "slo_alerts" { + name = "custom-slo-alerts" + active = true + + group { + name = "slo_monitoring" + interval = "1m" + + rule { + record = "custom:slo:availability" + expr = "sum(rate(http_requests_total{status!~\"5..\"}[5m])) / sum(rate(http_requests_total[5m]))" + labels = { + slo_type = "availability" + } + } + + rule { + alert = "SLOAvailabilityBreach" + expr = "custom:slo:availability < 0.995" + duration = "5m" + labels = { + severity = "critical" + category = "SLO" + } + annotations = { + summary = "SLO availability breach" + description = "Availability is below 99.5% SLO target" + runbook_url = "https://docs.example.com/runbooks/availability-breach" + } + } + } +} +``` + + +## Schema + +### Required + +- `group` (Block List, Min: 1) List of Prometheus rule groups. Each group contains one or more rules and can have its own evaluation interval. (see [below for nested schema](#nestedblock--group)) +- `name` (String) The name of the Prometheus rules file. This will be stored with a .custom extension. Must follow naming validation rules (alphanumeric, hyphens, underscores). + +### Optional + +- `active` (Boolean) Whether the rules file is active. Inactive rules are not evaluated. Defaults to `true`. + +### Read-Only + +- `id` (String) The ID of this resource. + + +### Nested Schema for `group` + +Required: + +- `name` (String) The name of the rule group (e.g., 'latency_monitoring'). +- `rule` (Block List, Min: 1) List of Prometheus rules in this group. (see [below for nested schema](#nestedblock--group--rule)) + +Optional: + +- `interval` (String) Evaluation interval for this group (e.g., '30s', '1m'). If not specified, uses the global evaluation interval. + + +### Nested Schema for `group.rule` + +Required: + +- `expr` (String) The PromQL expression to evaluate. + +Optional: + +- `active` (Boolean) Whether this specific rule is active. This field is read-only and controlled by the API. +- `alert` (String) The name of the alert for alerting rules. 
Either 'record' or 'alert' must be specified, but not both. +- `annotations` (Map of String) Annotations to add to alerts (e.g., summary, description). +- `disable_in_groups` (Set of String) List of group names where this rule should be disabled. Useful for conditional rule enablement. +- `duration` (String) How long the condition must be true before firing the alert (e.g., '5m'). Only applicable for alerting rules. Maps to 'for' in Prometheus. +- `labels` (Map of String) Labels to attach to the resulting time series or alert. +- `record` (String) The name of the time series to output for recording rules. Either 'record' or 'alert' must be specified, but not both. + +## Import + +Import is supported using the following syntax: + +```shell +terraform import grafana_asserts_prom_rule_file.name "{{ name }}" +``` diff --git a/examples/resources/grafana_asserts_prom_rule_file/import.sh b/examples/resources/grafana_asserts_prom_rule_file/import.sh new file mode 100644 index 000000000..64588c5df --- /dev/null +++ b/examples/resources/grafana_asserts_prom_rule_file/import.sh @@ -0,0 +1 @@ +terraform import grafana_asserts_prom_rule_file.name "{{ name }}" diff --git a/examples/resources/grafana_asserts_prom_rule_file/resource.tf b/examples/resources/grafana_asserts_prom_rule_file/resource.tf new file mode 100644 index 000000000..f848ef4da --- /dev/null +++ b/examples/resources/grafana_asserts_prom_rule_file/resource.tf @@ -0,0 +1,246 @@ +# Basic recording rule for latency metrics +resource "grafana_asserts_prom_rule_file" "latency_metrics" { + name = "custom-latency-metrics" + active = true + + group { + name = "latency_recording_rules" + interval = "30s" + + rule { + record = "custom:latency:p95" + expr = "histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m]))" + labels = { + source = "custom_instrumentation" + severity = "info" + } + } + + rule { + record = "custom:latency:p99" + expr = "histogram_quantile(0.99, rate(http_request_duration_seconds_bucket[5m]))" + labels = { + source = "custom_instrumentation" + severity = "info" + } + } + } +} + +# Alert rules for high latency +resource "grafana_asserts_prom_rule_file" "latency_alerts" { + name = "custom-latency-alerts" + active = true + + group { + name = "latency_alerting" + interval = "30s" + + rule { + alert = "HighLatency" + expr = "histogram_quantile(0.99, rate(http_request_duration_seconds_bucket[5m])) > 0.5" + duration = "5m" + labels = { + severity = "warning" + category = "Latency" + } + annotations = { + summary = "High latency detected" + description = "P99 latency is above 500ms for 5 minutes" + } + } + + rule { + alert = "VeryHighLatency" + expr = "histogram_quantile(0.99, rate(http_request_duration_seconds_bucket[5m])) > 1.0" + duration = "2m" + labels = { + severity = "critical" + category = "Latency" + } + annotations = { + summary = "Very high latency detected" + description = "P99 latency is above 1 second" + } + } + } +} + +# Comprehensive monitoring rules with multiple groups +resource "grafana_asserts_prom_rule_file" "comprehensive_monitoring" { + name = "custom-comprehensive-monitoring" + active = true + + # Latency monitoring + group { + name = "latency_monitoring" + interval = "30s" + + rule { + record = "custom:latency:p99" + expr = "histogram_quantile(0.99, rate(http_request_duration_seconds_bucket[5m]))" + labels = { + source = "custom" + } + } + + rule { + alert = "HighLatency" + expr = "custom:latency:p99 > 0.5" + duration = "5m" + labels = { + severity = "warning" + } + annotations = { + summary = 
"High latency detected" + } + } + } + + # Error rate monitoring + group { + name = "error_monitoring" + interval = "1m" + + rule { + record = "custom:error:rate" + expr = "rate(http_requests_total{status=~\"5..\"}[5m])" + labels = { + source = "custom" + } + } + + rule { + alert = "HighErrorRate" + expr = "custom:error:rate > 0.1" + duration = "10m" + labels = { + severity = "critical" + category = "Errors" + } + annotations = { + summary = "High error rate detected" + description = "Error rate is above 10%" + } + } + } + + # Throughput monitoring + group { + name = "throughput_monitoring" + interval = "1m" + + rule { + record = "custom:throughput:total" + expr = "sum(rate(http_requests_total[5m]))" + labels = { + source = "custom" + } + } + + rule { + alert = "LowThroughput" + expr = "custom:throughput:total < 10" + duration = "5m" + labels = { + severity = "warning" + category = "Throughput" + } + annotations = { + summary = "Low throughput detected" + description = "Request throughput is below 10 requests/second" + } + } + } +} + +# Rules with conditional enablement +resource "grafana_asserts_prom_rule_file" "conditional_rules" { + name = "custom-conditional-rules" + active = true + + group { + name = "environment_specific_rules" + interval = "30s" + + rule { + alert = "TestAlert" + expr = "up == 0" + duration = "1m" + labels = { + severity = "info" + } + annotations = { + summary = "Test alert that is disabled in production" + } + # This rule will be disabled in the production group + disable_in_groups = ["production"] + } + + rule { + alert = "CriticalAlert" + expr = "up == 0" + duration = "30s" + labels = { + severity = "critical" + } + annotations = { + summary = "Critical alert that fires in all environments" + } + } + } +} + +# Inactive rules (for staging/testing) +resource "grafana_asserts_prom_rule_file" "staging_rules" { + name = "custom-staging-rules" + active = false # Rules file is inactive + + group { + name = "staging_tests" + interval = "1m" + + rule { + record = "staging:test:metric" + expr = "up" + labels = { + environment = "staging" + } + } + } +} + +# SLO-based alerting +resource "grafana_asserts_prom_rule_file" "slo_alerts" { + name = "custom-slo-alerts" + active = true + + group { + name = "slo_monitoring" + interval = "1m" + + rule { + record = "custom:slo:availability" + expr = "sum(rate(http_requests_total{status!~\"5..\"}[5m])) / sum(rate(http_requests_total[5m]))" + labels = { + slo_type = "availability" + } + } + + rule { + alert = "SLOAvailabilityBreach" + expr = "custom:slo:availability < 0.995" + duration = "5m" + labels = { + severity = "critical" + category = "SLO" + } + annotations = { + summary = "SLO availability breach" + description = "Availability is below 99.5% SLO target" + runbook_url = "https://docs.example.com/runbooks/availability-breach" + } + } + } +} + diff --git a/internal/resources/asserts/catalog-resource.yaml b/internal/resources/asserts/catalog-resource.yaml index 60399e98f..2a1be9b3e 100644 --- a/internal/resources/asserts/catalog-resource.yaml +++ b/internal/resources/asserts/catalog-resource.yaml @@ -53,6 +53,19 @@ spec: --- apiVersion: backstage.io/v1alpha1 kind: Component +metadata: + name: resource-grafana_asserts_prom_rule_file + title: grafana_asserts_prom_rule_file (resource) + description: | + resource `grafana_asserts_prom_rule_file` in Grafana Labs' Terraform Provider +spec: + subcomponentOf: component:default/terraform-provider-grafana + type: terraform-resource + owner: group:default/asserts + lifecycle: production +--- 
+apiVersion: backstage.io/v1alpha1
+kind: Component
 metadata:
   name: resource-grafana_asserts_thresholds
   title: grafana_asserts_thresholds (resource)
diff --git a/internal/resources/asserts/common_lister.go b/internal/resources/asserts/common_lister.go
index de681d768..ff5203093 100644
--- a/internal/resources/asserts/common_lister.go
+++ b/internal/resources/asserts/common_lister.go
@@ -98,3 +98,17 @@ func listLogConfigs(ctx context.Context, client *assertsapi.APIClient, stackID s
 	}
 	return names, nil
 }
+
+// listPromRules retrieves the list of all Prometheus rules file names for a specific stack
+func listPromRules(ctx context.Context, client *assertsapi.APIClient, stackID string) ([]string, error) {
+	request := client.PromRulesConfigControllerAPI.ListPromRules(ctx).
+		XScopeOrgID(stackID)
+
+	namesDto, _, err := request.Execute()
+	if err != nil {
+		return nil, err
+	}
+
+	// The DTO contains an array of rule file names
+	return namesDto.RuleNames, nil
+}
diff --git a/internal/resources/asserts/resource_prom_rules.go b/internal/resources/asserts/resource_prom_rules.go
new file mode 100644
index 000000000..d310614d2
--- /dev/null
+++ b/internal/resources/asserts/resource_prom_rules.go
@@ -0,0 +1,501 @@
+package asserts
+
+import (
+	"context"
+	"fmt"
+
+	"github.com/hashicorp/terraform-plugin-sdk/v2/diag"
+	"github.com/hashicorp/terraform-plugin-sdk/v2/helper/retry"
+	"github.com/hashicorp/terraform-plugin-sdk/v2/helper/schema"
+
+	assertsapi "github.com/grafana/grafana-asserts-public-clients/go/gcom"
+	"github.com/grafana/terraform-provider-grafana/v4/internal/common"
+)
+
+func makeResourcePromRules() *common.Resource {
+	schema := &schema.Resource{
+		Description: "Manages Prometheus rules configurations through the Grafana Asserts API. " +
+			"Allows creation and management of custom Prometheus recording and alerting rules.",
+
+		CreateContext: resourcePromRulesCreate,
+		ReadContext: resourcePromRulesRead,
+		UpdateContext: resourcePromRulesUpdate,
+		DeleteContext: resourcePromRulesDelete,
+
+		Importer: &schema.ResourceImporter{
+			StateContext: schema.ImportStatePassthroughContext,
+		},
+
+		Schema: map[string]*schema.Schema{
+			"name": {
+				Type: schema.TypeString,
+				Required: true,
+				ForceNew: true, // Force recreation if name changes
+				Description: "The name of the Prometheus rules file. This will be stored with a .custom extension. " +
+					"Must follow naming validation rules (alphanumeric, hyphens, underscores).",
+			},
+			"active": {
+				Type: schema.TypeBool,
+				Optional: true,
+				Default: true,
+				Description: "Whether the rules file is active. Inactive rules are not evaluated.",
+			},
+			"group": {
+				Type: schema.TypeList,
+				Required: true,
+				Description: "List of Prometheus rule groups. Each group contains one or more rules " +
+					"and can have its own evaluation interval.",
+				Elem: &schema.Resource{
+					Schema: map[string]*schema.Schema{
+						"name": {
+							Type: schema.TypeString,
+							Required: true,
+							Description: "The name of the rule group (e.g., 'latency_monitoring').",
+						},
+						"interval": {
+							Type: schema.TypeString,
+							Optional: true,
+							Description: "Evaluation interval for this group (e.g., '30s', '1m'). " +
+								"If not specified, uses the global evaluation interval.",
+						},
+						"rule": {
+							Type: schema.TypeList,
+							Required: true,
+							Description: "List of Prometheus rules in this group.",
+							Elem: &schema.Resource{
+								Schema: map[string]*schema.Schema{
+									"record": {
+										Type: schema.TypeString,
+										Optional: true,
+										Description: "The name of the time series to output for recording rules. 
" + + "Either 'record' or 'alert' must be specified, but not both.", + }, + "alert": { + Type: schema.TypeString, + Optional: true, + Description: "The name of the alert for alerting rules. " + + "Either 'record' or 'alert' must be specified, but not both.", + }, + "expr": { + Type: schema.TypeString, + Required: true, + Description: "The PromQL expression to evaluate.", + }, + "duration": { + Type: schema.TypeString, + Optional: true, + Description: "How long the condition must be true before firing the alert " + + "(e.g., '5m'). Only applicable for alerting rules. Maps to 'for' in Prometheus.", + }, + "active": { + Type: schema.TypeBool, + Optional: true, + Computed: true, + Description: "Whether this specific rule is active. " + + "This field is read-only and controlled by the API.", + }, + "labels": { + Type: schema.TypeMap, + Optional: true, + Description: "Labels to attach to the resulting time series or alert.", + Elem: &schema.Schema{Type: schema.TypeString}, + }, + "annotations": { + Type: schema.TypeMap, + Optional: true, + Description: "Annotations to add to alerts (e.g., summary, description).", + Elem: &schema.Schema{Type: schema.TypeString}, + }, + "disable_in_groups": { + Type: schema.TypeSet, + Optional: true, + Description: "List of group names where this rule should be disabled. " + + "Useful for conditional rule enablement.", + Elem: &schema.Schema{Type: schema.TypeString}, + }, + }, + }, + }, + }, + }, + }, + }, + } + + return common.NewLegacySDKResource( + common.CategoryAsserts, + "grafana_asserts_prom_rule_file", + common.NewResourceID(common.StringIDField("name")), + schema, + ).WithLister(assertsListerFunction(listPromRules)) +} + +func resourcePromRulesCreate(ctx context.Context, d *schema.ResourceData, meta interface{}) diag.Diagnostics { + client, stackID, diags := validateAssertsClient(meta) + if diags.HasError() { + return diags + } + + name := d.Get("name").(string) + active := d.Get("active").(bool) + + // Build the PrometheusRulesDto + rulesDto := assertsapi.PrometheusRulesDto{ + Name: &name, + Active: &active, + } + + // Build groups + groups, err := buildRuleGroups(d.Get("group").([]interface{})) + if err != nil { + return diag.FromErr(err) + } + rulesDto.Groups = groups + + // Call the API to create/update the rules file + // Note: PUT is idempotent, so create and update use the same operation + request := client.PromRulesConfigControllerAPI.PutPromRules(ctx). + PrometheusRulesDto(rulesDto). + XScopeOrgID(fmt.Sprintf("%d", stackID)) + + resp, err := request.Execute() + if err != nil { + // Try to extract more details from the error + apiErr := fmt.Errorf("failed to create Prometheus rules file: %w", err) + if resp != nil { + apiErr = fmt.Errorf("failed to create Prometheus rules file (HTTP %d): %w", resp.StatusCode, err) + } + return diag.FromErr(apiErr) + } + + d.SetId(name) + + return resourcePromRulesRead(ctx, d, meta) +} + +func resourcePromRulesRead(ctx context.Context, d *schema.ResourceData, meta interface{}) diag.Diagnostics { + client, stackID, diags := validateAssertsClient(meta) + if diags.HasError() { + return diags + } + + name := d.Id() + + // Retry logic for read operation to handle eventual consistency + var foundRules *assertsapi.PrometheusRulesDto + err := withRetryRead(ctx, func(retryCount, maxRetries int) *retry.RetryError { + // Get specific rules file + request := client.PromRulesConfigControllerAPI.GetPromRules(ctx, name). 
+ XScopeOrgID(fmt.Sprintf("%d", stackID)) + + rules, resp, err := request.Execute() + if err != nil { + // If 404, the resource doesn't exist + if resp != nil && resp.StatusCode == 404 { + // Check if we should give up or retry + if retryCount >= maxRetries { + return createNonRetryableError("Prometheus rules file", name, retryCount) + } + return createRetryableError("Prometheus rules file", name, retryCount, maxRetries) + } + return createAPIError("get Prometheus rules file", retryCount, maxRetries, err) + } + + foundRules = rules + return nil + }) + + if err != nil { + // If not found after retries, remove from state + if foundRules == nil { + d.SetId("") + return nil + } + return diag.FromErr(err) + } + + // Set the resource data + if foundRules.Name != nil { + if err := d.Set("name", *foundRules.Name); err != nil { + return diag.FromErr(err) + } + } + + // Set active field - trust what the API returns + // If API doesn't return it, Terraform will use the schema default (true) + if foundRules.Active != nil { + if err := d.Set("active", *foundRules.Active); err != nil { + return diag.FromErr(err) + } + } + + // Flatten groups back into Terraform state + if len(foundRules.Groups) > 0 { + groups, err := flattenRuleGroups(foundRules.Groups) + if err != nil { + return diag.FromErr(err) + } + if err := d.Set("group", groups); err != nil { + return diag.FromErr(err) + } + } + + return nil +} + +func resourcePromRulesUpdate(ctx context.Context, d *schema.ResourceData, meta interface{}) diag.Diagnostics { + client, stackID, diags := validateAssertsClient(meta) + if diags.HasError() { + return diags + } + + name := d.Get("name").(string) + active := d.Get("active").(bool) + + // Build the PrometheusRulesDto + rulesDto := assertsapi.PrometheusRulesDto{ + Name: &name, + Active: &active, + } + + // Build groups + groups, err := buildRuleGroups(d.Get("group").([]interface{})) + if err != nil { + return diag.FromErr(err) + } + rulesDto.Groups = groups + + // Update using PUT (idempotent) + request := client.PromRulesConfigControllerAPI.PutPromRules(ctx). + PrometheusRulesDto(rulesDto). + XScopeOrgID(fmt.Sprintf("%d", stackID)) + + resp, err := request.Execute() + if err != nil { + // Try to extract more details from the error + apiErr := fmt.Errorf("failed to update Prometheus rules file: %w", err) + if resp != nil { + apiErr = fmt.Errorf("failed to update Prometheus rules file (HTTP %d): %w", resp.StatusCode, err) + } + return diag.FromErr(apiErr) + } + + return resourcePromRulesRead(ctx, d, meta) +} + +func resourcePromRulesDelete(ctx context.Context, d *schema.ResourceData, meta interface{}) diag.Diagnostics { + client, stackID, diags := validateAssertsClient(meta) + if diags.HasError() { + return diags + } + + name := d.Id() + + // Delete the rules file + request := client.PromRulesConfigControllerAPI.DeletePromRules(ctx, name). 
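+		// Scope the delete to the same stack; a 404 below simply means the file is already gone.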
+ XScopeOrgID(fmt.Sprintf("%d", stackID)) + + resp, err := request.Execute() + if err != nil { + // Ignore 404 errors - resource already deleted + if resp != nil && resp.StatusCode == 404 { + return nil + } + if common.IsNotFoundError(err) { + return nil + } + if resp != nil { + return diag.FromErr(fmt.Errorf("failed to delete Prometheus rules file (HTTP %d): %w", resp.StatusCode, err)) + } + return diag.FromErr(fmt.Errorf("failed to delete Prometheus rules file: %w", err)) + } + + return nil +} + +// buildRuleGroups converts Terraform schema data into PrometheusRuleGroupDto slice +func buildRuleGroups(groupsData []interface{}) ([]assertsapi.PrometheusRuleGroupDto, error) { + if len(groupsData) == 0 { + return nil, fmt.Errorf("at least one rule group is required") + } + + groups := make([]assertsapi.PrometheusRuleGroupDto, 0, len(groupsData)) + + for _, groupItem := range groupsData { + groupMap := groupItem.(map[string]interface{}) + + groupName := groupMap["name"].(string) + group := assertsapi.PrometheusRuleGroupDto{ + Name: &groupName, + } + + // Optional interval + if interval, ok := groupMap["interval"].(string); ok && interval != "" { + group.Interval = &interval + } + + // Build rules + rulesData := groupMap["rule"].([]interface{}) + if len(rulesData) == 0 { + return nil, fmt.Errorf("group '%s' must have at least one rule", groupName) + } + + rules, err := buildRules(rulesData, groupName) + if err != nil { + return nil, err + } + + group.Rules = rules + groups = append(groups, group) + } + + return groups, nil +} + +// buildRules converts Terraform schema data for rules into PrometheusRuleDto slice +func buildRules(rulesData []interface{}, groupName string) ([]assertsapi.PrometheusRuleDto, error) { + rules := make([]assertsapi.PrometheusRuleDto, 0, len(rulesData)) + + for _, ruleItem := range rulesData { + ruleMap := ruleItem.(map[string]interface{}) + + rule, err := buildRule(ruleMap, groupName) + if err != nil { + return nil, err + } + + rules = append(rules, rule) + } + + return rules, nil +} + +// buildRule converts a single rule from Terraform schema data into PrometheusRuleDto +func buildRule(ruleMap map[string]interface{}, groupName string) (assertsapi.PrometheusRuleDto, error) { + // Validate record/alert fields + record, hasRecord := ruleMap["record"].(string) + alert, hasAlert := ruleMap["alert"].(string) + + if (hasRecord && record != "") && (hasAlert && alert != "") { + return assertsapi.PrometheusRuleDto{}, fmt.Errorf("rule in group '%s' cannot have both 'record' and 'alert' specified", groupName) + } + if (!hasRecord || record == "") && (!hasAlert || alert == "") { + return assertsapi.PrometheusRuleDto{}, fmt.Errorf("rule in group '%s' must have either 'record' or 'alert' specified", groupName) + } + + expr := ruleMap["expr"].(string) + if expr == "" { + return assertsapi.PrometheusRuleDto{}, fmt.Errorf("rule in group '%s' must have 'expr' specified", groupName) + } + + rule := assertsapi.PrometheusRuleDto{ + Expr: &expr, + } + + if hasRecord && record != "" { + rule.Record = &record + } + + if hasAlert && alert != "" { + rule.Alert = &alert + } + + // Optional fields + if duration, ok := ruleMap["duration"].(string); ok && duration != "" { + rule.For = &duration + } + + // Don't send rule-level active - it's not persisted by the API yet + // Only file-level active is supported + // if activeVal, ok := ruleMap["active"].(bool); ok && !activeVal { + // rule.Active = &activeVal + // } + + // Labels + if labelsData, ok := ruleMap["labels"].(map[string]interface{}); ok && 
len(labelsData) > 0 { + labels := make(map[string]string) + for k, v := range labelsData { + labels[k] = v.(string) + } + rule.Labels = labels + } + + // Annotations + if annotationsData, ok := ruleMap["annotations"].(map[string]interface{}); ok && len(annotationsData) > 0 { + annotations := make(map[string]string) + for k, v := range annotationsData { + annotations[k] = v.(string) + } + rule.Annotations = annotations + } + + // Disable in groups + if disableInGroupsData, ok := ruleMap["disable_in_groups"].(*schema.Set); ok && disableInGroupsData.Len() > 0 { + disableInGroups := make([]string, 0, disableInGroupsData.Len()) + for _, item := range disableInGroupsData.List() { + disableInGroups = append(disableInGroups, item.(string)) + } + rule.DisableInGroups = disableInGroups + } + + return rule, nil +} + +// flattenRuleGroups converts PrometheusRuleGroupDto slice into Terraform schema data +func flattenRuleGroups(groups []assertsapi.PrometheusRuleGroupDto) ([]interface{}, error) { + result := make([]interface{}, 0, len(groups)) + + for _, group := range groups { + groupMap := make(map[string]interface{}) + + if group.Name != nil { + groupMap["name"] = *group.Name + } + + if group.Interval != nil { + groupMap["interval"] = *group.Interval + } + + // Flatten rules + rules := make([]interface{}, 0, len(group.Rules)) + for _, rule := range group.Rules { + ruleMap := make(map[string]interface{}) + + if rule.Record != nil { + ruleMap["record"] = *rule.Record + } + + if rule.Alert != nil { + ruleMap["alert"] = *rule.Alert + } + + if rule.Expr != nil { + ruleMap["expr"] = *rule.Expr + } + + if rule.For != nil { + ruleMap["duration"] = *rule.For + } + + // Only set collections if they have values - don't add empty ones + if len(rule.Labels) > 0 { + ruleMap["labels"] = rule.Labels + } + + if len(rule.Annotations) > 0 { + ruleMap["annotations"] = rule.Annotations + } + + if len(rule.DisableInGroups) > 0 { + ruleMap["disable_in_groups"] = rule.DisableInGroups + } + + rules = append(rules, ruleMap) + } + + groupMap["rule"] = rules + result = append(result, groupMap) + } + + return result, nil +} diff --git a/internal/resources/asserts/resource_prom_rules_test.go b/internal/resources/asserts/resource_prom_rules_test.go new file mode 100644 index 000000000..be27d0f27 --- /dev/null +++ b/internal/resources/asserts/resource_prom_rules_test.go @@ -0,0 +1,508 @@ +package asserts_test + +import ( + "context" + "fmt" + "strings" + "testing" + "time" + + "github.com/grafana/terraform-provider-grafana/v4/internal/common" + "github.com/grafana/terraform-provider-grafana/v4/internal/testutils" + "github.com/hashicorp/terraform-plugin-sdk/v2/helper/acctest" + "github.com/hashicorp/terraform-plugin-sdk/v2/helper/resource" + "github.com/hashicorp/terraform-plugin-sdk/v2/terraform" +) + +// cleanupDanglingPromRules removes any test prom rules that may have been left behind +// from previous test runs to avoid conflicts and ensure clean test state. +// Note: This function includes longer wait times due to backend JPA/Hibernate caching issues +// where deleted entities can remain visible in the cache for several seconds. 
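+// Only files prefixed with "test-" or "stress-test-" are deleted, so non-test rule files are never touched.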
+func cleanupDanglingPromRules(t *testing.T) { + client := testutils.Provider.Meta().(*common.Client).AssertsAPIClient + ctx := context.Background() + stackID := fmt.Sprintf("%d", testutils.Provider.Meta().(*common.Client).GrafanaStackID) + + t.Log("Cleaning up dangling prom rules from previous test runs...") + + // List all prom rules + listReq := client.PromRulesConfigControllerAPI.ListPromRules(ctx). + XScopeOrgID(stackID) + + namesDto, _, err := listReq.Execute() + if err != nil { + t.Logf("Warning: could not list prom rules for cleanup: %v", err) + return + } + + // Delete any test rules (prefixed with test- or stress-test-) + deletedCount := 0 + for _, name := range namesDto.RuleNames { + if strings.HasPrefix(name, "test-") || strings.HasPrefix(name, "stress-test-") { + t.Logf("Deleting dangling rule: %s", name) + + _, err := client.PromRulesConfigControllerAPI.DeletePromRules(ctx, name). + XScopeOrgID(stackID).Execute() + if err != nil { + t.Logf("Warning: failed to delete %s: %v", name, err) + } else { + deletedCount++ + } + } + } + + if deletedCount > 0 { + // Wait longer due to backend JPA/Hibernate caching issues + // The JpaKeyValueStore.delete() doesn't flush the EntityManager or clear caches + t.Logf("Deleted %d dangling rules, waiting 10s for backend cache to clear...", deletedCount) + time.Sleep(10 * time.Second) + } else { + t.Log("No dangling rules found") + } +} + +func TestAccAssertsPromRules_basic(t *testing.T) { + testutils.CheckCloudInstanceTestsEnabled(t) + cleanupDanglingPromRules(t) + + stackID := getTestStackID(t) + rName := fmt.Sprintf("test-acc-%s", acctest.RandString(8)) + + resource.ParallelTest(t, resource.TestCase{ + ProtoV5ProviderFactories: testutils.ProtoV5ProviderFactories, + CheckDestroy: testAccAssertsPromRulesCheckDestroy, + Steps: []resource.TestStep{ + { + Config: testAccAssertsPromRulesConfig(stackID, rName), + Check: resource.ComposeTestCheckFunc( + testAccAssertsPromRulesCheckExists("grafana_asserts_prom_rule_file.test", stackID, rName), + resource.TestCheckResourceAttr("grafana_asserts_prom_rule_file.test", "name", rName), + resource.TestCheckResourceAttr("grafana_asserts_prom_rule_file.test", "group.#", "1"), + resource.TestCheckResourceAttr("grafana_asserts_prom_rule_file.test", "group.0.name", "test_rules"), + resource.TestCheckResourceAttr("grafana_asserts_prom_rule_file.test", "group.0.rule.#", "1"), + resource.TestCheckResourceAttr("grafana_asserts_prom_rule_file.test", "group.0.rule.0.record", "custom:test:metric"), + testutils.CheckLister("grafana_asserts_prom_rule_file.test"), + ), + }, + { + // Test import + ResourceName: "grafana_asserts_prom_rule_file.test", + ImportState: true, + ImportStateVerify: true, + // Ignore active field - API may not return it if it's the default (true) + ImportStateVerifyIgnore: []string{"active"}, + }, + { + // Test update + Config: testAccAssertsPromRulesConfigUpdated(stackID, rName), + Check: resource.ComposeTestCheckFunc( + testAccAssertsPromRulesCheckExists("grafana_asserts_prom_rule_file.test", stackID, rName), + resource.TestCheckResourceAttr("grafana_asserts_prom_rule_file.test", "name", rName), + resource.TestCheckResourceAttr("grafana_asserts_prom_rule_file.test", "group.#", "2"), + resource.TestCheckResourceAttr("grafana_asserts_prom_rule_file.test", "group.0.name", "test_rules"), + resource.TestCheckResourceAttr("grafana_asserts_prom_rule_file.test", "group.1.name", "additional_rules"), + ), + }, + }, + }) +} + +func TestAccAssertsPromRules_recordingRule(t *testing.T) { + 
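+	// Verifies that labels on a recording rule round-trip through the Asserts API unchanged.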
testutils.CheckCloudInstanceTestsEnabled(t) + cleanupDanglingPromRules(t) + + stackID := getTestStackID(t) + rName := fmt.Sprintf("test-recording-%s", acctest.RandString(8)) + + resource.ParallelTest(t, resource.TestCase{ + ProtoV5ProviderFactories: testutils.ProtoV5ProviderFactories, + CheckDestroy: testAccAssertsPromRulesCheckDestroy, + Steps: []resource.TestStep{ + { + Config: testAccAssertsPromRulesRecordingConfig(stackID, rName), + Check: resource.ComposeTestCheckFunc( + testAccAssertsPromRulesCheckExists("grafana_asserts_prom_rule_file.test", stackID, rName), + resource.TestCheckResourceAttr("grafana_asserts_prom_rule_file.test", "name", rName), + resource.TestCheckResourceAttr("grafana_asserts_prom_rule_file.test", "group.0.rule.0.record", "custom:requests:rate"), + resource.TestCheckResourceAttr("grafana_asserts_prom_rule_file.test", "group.0.rule.0.labels.source", "custom"), + resource.TestCheckResourceAttr("grafana_asserts_prom_rule_file.test", "group.0.rule.0.labels.severity", "info"), + ), + }, + }, + }) +} + +func TestAccAssertsPromRules_alertingRule(t *testing.T) { + testutils.CheckCloudInstanceTestsEnabled(t) + cleanupDanglingPromRules(t) + + stackID := getTestStackID(t) + rName := fmt.Sprintf("test-alerting-%s", acctest.RandString(8)) + + resource.ParallelTest(t, resource.TestCase{ + ProtoV5ProviderFactories: testutils.ProtoV5ProviderFactories, + CheckDestroy: testAccAssertsPromRulesCheckDestroy, + Steps: []resource.TestStep{ + { + Config: testAccAssertsPromRulesAlertingConfig(stackID, rName), + Check: resource.ComposeTestCheckFunc( + testAccAssertsPromRulesCheckExists("grafana_asserts_prom_rule_file.test", stackID, rName), + resource.TestCheckResourceAttr("grafana_asserts_prom_rule_file.test", "name", rName), + resource.TestCheckResourceAttr("grafana_asserts_prom_rule_file.test", "group.0.rule.0.alert", "TestAlert"), + resource.TestCheckResourceAttr("grafana_asserts_prom_rule_file.test", "group.0.rule.0.expr", "up == 0"), + resource.TestCheckResourceAttr("grafana_asserts_prom_rule_file.test", "group.0.rule.0.duration", "1m"), + resource.TestCheckResourceAttr("grafana_asserts_prom_rule_file.test", "group.0.rule.0.labels.asserts_alert_category", "error"), + resource.TestCheckResourceAttr("grafana_asserts_prom_rule_file.test", "group.0.rule.0.labels.asserts_severity", "warning"), + resource.TestCheckResourceAttr("grafana_asserts_prom_rule_file.test", "group.0.rule.0.annotations.summary", "Instance is down"), + ), + }, + }, + }) +} + +func TestAccAssertsPromRules_multipleGroups(t *testing.T) { + testutils.CheckCloudInstanceTestsEnabled(t) + cleanupDanglingPromRules(t) + + stackID := getTestStackID(t) + rName := fmt.Sprintf("test-multi-%s", acctest.RandString(8)) + + resource.ParallelTest(t, resource.TestCase{ + ProtoV5ProviderFactories: testutils.ProtoV5ProviderFactories, + CheckDestroy: testAccAssertsPromRulesCheckDestroy, + Steps: []resource.TestStep{ + { + Config: testAccAssertsPromRulesMultiGroupConfig(stackID, rName), + Check: resource.ComposeTestCheckFunc( + testAccAssertsPromRulesCheckExists("grafana_asserts_prom_rule_file.test", stackID, rName), + resource.TestCheckResourceAttr("grafana_asserts_prom_rule_file.test", "group.#", "3"), + resource.TestCheckResourceAttr("grafana_asserts_prom_rule_file.test", "group.0.name", "latency_rules"), + resource.TestCheckResourceAttr("grafana_asserts_prom_rule_file.test", "group.1.name", "error_rules"), + resource.TestCheckResourceAttr("grafana_asserts_prom_rule_file.test", "group.2.name", "throughput_rules"), + ), + }, + }, + }) +} 
+ +func TestAccAssertsPromRules_inactive(t *testing.T) { + testutils.CheckCloudInstanceTestsEnabled(t) + cleanupDanglingPromRules(t) + + stackID := getTestStackID(t) + rName := fmt.Sprintf("test-inactive-%s", acctest.RandString(8)) + + resource.ParallelTest(t, resource.TestCase{ + ProtoV5ProviderFactories: testutils.ProtoV5ProviderFactories, + CheckDestroy: testAccAssertsPromRulesCheckDestroy, + Steps: []resource.TestStep{ + { + Config: testAccAssertsPromRulesInactiveConfig(stackID, rName), + Check: resource.ComposeTestCheckFunc( + testAccAssertsPromRulesCheckExists("grafana_asserts_prom_rule_file.test", stackID, rName), + resource.TestCheckResourceAttr("grafana_asserts_prom_rule_file.test", "active", "false"), + ), + }, + }, + }) +} + +func TestAccAssertsPromRules_eventualConsistencyStress(t *testing.T) { + testutils.CheckCloudInstanceTestsEnabled(t) + testutils.CheckStressTestsEnabled(t) + cleanupDanglingPromRules(t) + + stackID := getTestStackID(t) + baseName := fmt.Sprintf("stress-test-%s", acctest.RandString(8)) + + resource.ParallelTest(t, resource.TestCase{ + ProtoV5ProviderFactories: testutils.ProtoV5ProviderFactories, + CheckDestroy: testAccAssertsPromRulesCheckDestroy, + Steps: []resource.TestStep{ + { + Config: testAccAssertsPromRulesStressConfig(stackID, baseName), + Check: resource.ComposeTestCheckFunc( + // Check that all resources were created successfully + testAccAssertsPromRulesCheckExists("grafana_asserts_prom_rule_file.test1", stackID, baseName+"-1"), + testAccAssertsPromRulesCheckExists("grafana_asserts_prom_rule_file.test2", stackID, baseName+"-2"), + testAccAssertsPromRulesCheckExists("grafana_asserts_prom_rule_file.test3", stackID, baseName+"-3"), + resource.TestCheckResourceAttr("grafana_asserts_prom_rule_file.test1", "name", baseName+"-1"), + resource.TestCheckResourceAttr("grafana_asserts_prom_rule_file.test2", "name", baseName+"-2"), + resource.TestCheckResourceAttr("grafana_asserts_prom_rule_file.test3", "name", baseName+"-3"), + ), + }, + }, + }) +} + +func testAccAssertsPromRulesCheckExists(rn string, stackID int64, name string) resource.TestCheckFunc { + return func(s *terraform.State) error { + rs, ok := s.RootModule().Resources[rn] + if !ok { + return fmt.Errorf("resource not found: %s\n %#v", rn, s.RootModule().Resources) + } + + if rs.Primary.ID == "" { + return fmt.Errorf("resource id not set") + } + + client := testutils.Provider.Meta().(*common.Client).AssertsAPIClient + ctx := context.Background() + + // Get specific rules file + request := client.PromRulesConfigControllerAPI.GetPromRules(ctx, name). 
+ XScopeOrgID(fmt.Sprintf("%d", stackID)) + + _, resp, err := request.Execute() + if err != nil { + if resp != nil && resp.StatusCode == 404 { + return fmt.Errorf("Prometheus rules file %s not found", name) + } + return fmt.Errorf("error getting Prometheus rules file: %s", err) + } + + return nil + } +} + +func testAccAssertsPromRulesCheckDestroy(s *terraform.State) error { + client := testutils.Provider.Meta().(*common.Client).AssertsAPIClient + ctx := context.Background() + + // Increased timeout to 180s (3 minutes) due to backend JPA/Hibernate caching issues + // The JpaKeyValueStore.delete() doesn't flush the EntityManager, so deleted entities + // can remain visible in the cache for an extended period + deadline := time.Now().Add(180 * time.Second) + + for _, rs := range s.RootModule().Resources { + if rs.Type != "grafana_asserts_prom_rule_file" { + continue + } + + // Resource ID is just the name now + name := rs.Primary.ID + stackID := fmt.Sprintf("%d", testutils.Provider.Meta().(*common.Client).GrafanaStackID) + + for { + // Try to get the rules file + request := client.PromRulesConfigControllerAPI.GetPromRules(ctx, name). + XScopeOrgID(stackID) + + _, resp, err := request.Execute() + if err != nil { + // If 404, resource is deleted - that's what we want + if resp != nil && resp.StatusCode == 404 { + break + } + // If we can't get it for other reasons, assume it's deleted + if common.IsNotFoundError(err) { + break + } + return fmt.Errorf("error checking Prometheus rules file destruction: %s", err) + } + + // Resource still exists + if time.Now().After(deadline) { + return fmt.Errorf("Prometheus rules file %s still exists after 180s (likely backend JPA cache issue)", name) + } + + // Use longer sleep interval due to caching delays + time.Sleep(5 * time.Second) + } + } + + return nil +} + +func testAccAssertsPromRulesConfig(stackID int64, name string) string { + return fmt.Sprintf(` +resource "grafana_asserts_prom_rule_file" "test" { + name = "%s" + + group { + name = "test_rules" + + rule { + record = "custom:test:metric" + expr = "up" + } + } +} +`, name) +} + +func testAccAssertsPromRulesConfigUpdated(stackID int64, name string) string { + return fmt.Sprintf(` +resource "grafana_asserts_prom_rule_file" "test" { + name = "%s" + + group { + name = "test_rules" + + rule { + record = "custom:test:metric:v2" + expr = "up" + } + + rule { + record = "custom:new:metric" + expr = "up" + } + } + + group { + name = "additional_rules" + + rule { + record = "custom:another:metric" + expr = "up" + } + } +} +`, name) +} + +func testAccAssertsPromRulesRecordingConfig(stackID int64, name string) string { + return fmt.Sprintf(` +resource "grafana_asserts_prom_rule_file" "test" { + name = "%s" + + group { + name = "recording_rules" + + rule { + record = "custom:requests:rate" + expr = "up" + labels = { + source = "custom" + severity = "info" + } + } + } +} +`, name) +} + +func testAccAssertsPromRulesAlertingConfig(stackID int64, name string) string { + return fmt.Sprintf(` +resource "grafana_asserts_prom_rule_file" "test" { + name = "%s" + + group { + name = "alerting_rules" + + rule { + alert = "TestAlert" + expr = "up == 0" + duration = "1m" + labels = { + asserts_alert_category = "error" + asserts_severity = "warning" + } + annotations = { + summary = "Instance is down" + } + } + } +} +`, name) +} + +func testAccAssertsPromRulesMultiGroupConfig(stackID int64, name string) string { + return fmt.Sprintf(` +resource "grafana_asserts_prom_rule_file" "test" { + name = "%s" + + group { + name = 
"latency_rules" + + rule { + record = "custom:latency:p95" + expr = "up" + } + } + + group { + name = "error_rules" + + rule { + record = "custom:error:rate" + expr = "up" + } + } + + group { + name = "throughput_rules" + + rule { + record = "custom:throughput:total" + expr = "up" + } + } +} +`, name) +} + +func testAccAssertsPromRulesInactiveConfig(stackID int64, name string) string { + return fmt.Sprintf(` +resource "grafana_asserts_prom_rule_file" "test" { + name = "%s" + active = false + + group { + name = "inactive_rules" + + rule { + record = "custom:test:metric" + expr = "up" + } + } +} +`, name) +} + +func testAccAssertsPromRulesStressConfig(stackID int64, baseName string) string { + return fmt.Sprintf(` +resource "grafana_asserts_prom_rule_file" "test1" { + name = "%s-1" + active = true + + group { + name = "stress_test_group_1" + + rule { + record = "stress:test:metric1" + expr = "up" + } + } +} + +resource "grafana_asserts_prom_rule_file" "test2" { + name = "%s-2" + active = true + + group { + name = "stress_test_group_2" + + rule { + record = "stress:test:metric2" + expr = "up" + } + } +} + +resource "grafana_asserts_prom_rule_file" "test3" { + name = "%s-3" + active = true + + group { + name = "stress_test_group_3" + + rule { + record = "stress:test:metric3" + expr = "up" + } + } +} +`, baseName, baseName, baseName) +} diff --git a/internal/resources/asserts/resources.go b/internal/resources/asserts/resources.go index 36da880be..a219614cd 100644 --- a/internal/resources/asserts/resources.go +++ b/internal/resources/asserts/resources.go @@ -11,6 +11,7 @@ var Resources = []*common.Resource{ makeResourceDisabledAlertConfig(), makeResourceCustomModelRules(), makeResourceLogConfig(), + makeResourcePromRules(), makeResourceThresholds(), }