diff --git a/.gitignore b/.gitignore index 7ef77d321a..4ed41012d0 100644 --- a/.gitignore +++ b/.gitignore @@ -14,6 +14,7 @@ !/cluster/testdata/*.yml !/config/testdata/*.yml !/examples/ha/tls/*.yml +!/examples/slack-update-messages.yml !/notify/email/testdata/*.yml !/doc/examples/simple.yml !/circle.yml diff --git a/config/notifiers.go b/config/notifiers.go index b83db4c97f..570602de0f 100644 --- a/config/notifiers.go +++ b/config/notifiers.go @@ -522,6 +522,9 @@ type SlackConfig struct { LinkNames bool `yaml:"link_names" json:"link_names,omitempty"` MrkdwnIn []string `yaml:"mrkdwn_in,omitempty" json:"mrkdwn_in,omitempty"` Actions []*SlackAction `yaml:"actions,omitempty" json:"actions,omitempty"` + // UpdateMessage enables updating existing Slack messages instead of creating new ones. + // Requires bot token with chat:write scope. Webhook URLs do not support updates. + UpdateMessage bool `yaml:"update_message" json:"update_message,omitempty"` // Timeout is the maximum time allowed to invoke the slack. Setting this to 0 // does not impose a timeout. Timeout time.Duration `yaml:"timeout" json:"timeout"` diff --git a/examples/slack-update-messages.yml b/examples/slack-update-messages.yml new file mode 100644 index 0000000000..a0179fc70f --- /dev/null +++ b/examples/slack-update-messages.yml @@ -0,0 +1,37 @@ +# Alertmanager configuration with Slack message updates +global: + resolve_timeout: 5m + +# Route configuration - required! +route: + receiver: 'slack' + group_by: ['alertname', 'cluster', 'service'] + group_wait: 10s + group_interval: 10s + repeat_interval: 12h + +receivers: + - name: 'slack' + slack_configs: + - send_resolved: true # Required for message updates! + api_url: 'https://slack.com/api/chat.postMessage' + http_config: + authorization: + credentials: 'xoxb-your-bot-token' + channel: '#your-channel' + update_message: true # This enables message updates! + + # Template for updated messages + title: '{{ .GroupLabels.alertname }} - {{ .Status | toUpper }}' + text: | + {{ if eq .Status "firing" }} + 🔥 *{{ .Alerts.Firing | len }} alert(s) firing* + {{ else }} + ✅ *All alerts resolved* + {{ end }} + + {{ range .Alerts }} + • {{ if .Annotations.summary }}{{ .Annotations.summary }}{{ else }}{{ .Labels.alertname }}{{ end }} + {{ end }} + + color: '{{ if eq .Status "firing" }}danger{{ else }}good{{ end }}' \ No newline at end of file diff --git a/nflog/nflogpb/nflog.proto b/nflog/nflogpb/nflog.proto index eb4fd8ba9e..50e028811f 100644 --- a/nflog/nflogpb/nflog.proto +++ b/nflog/nflogpb/nflog.proto @@ -39,6 +39,9 @@ message Entry { repeated uint64 firing_alerts = 6; // ResolvedAlerts list of hashes of resolved alerts at the last notification time. repeated uint64 resolved_alerts = 7; + // Metadata holds integration-specific metadata (e.g. Slack message_ts, Jira issue key). + // This allows integrations to store identifiers for updating existing notifications. + map metadata = 8; } // MeshEntry is a wrapper message to communicate a notify log diff --git a/notify/metadata_store.go b/notify/metadata_store.go new file mode 100644 index 0000000000..80730f253e --- /dev/null +++ b/notify/metadata_store.go @@ -0,0 +1,81 @@ +// Copyright The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package notify + +import ( + "fmt" + "sync" + + "github.com/prometheus/alertmanager/nflog/nflogpb" +) + +// MetadataStore is a temporary in-memory store for notification metadata +// (e.g., Slack message_ts, Jira issue keys) until protobuf Entry supports metadata field. +// This allows integrations to update existing notifications instead of creating new ones. +type MetadataStore struct { + mtx sync.RWMutex + data map[string]map[string]string // key: stateKey(groupKey, receiver) -> metadata map +} + +// NewMetadataStore creates a new MetadataStore. +func NewMetadataStore() *MetadataStore { + return &MetadataStore{ + data: make(map[string]map[string]string), + } +} + +// Set stores metadata for a given receiver and group key. +func (s *MetadataStore) Set(receiver *nflogpb.Receiver, groupKey string, metadata map[string]string) { + s.mtx.Lock() + defer s.mtx.Unlock() + + key := stateKey(groupKey, receiver) + s.data[key] = metadata +} + +// Get retrieves metadata for a given receiver and group key. +func (s *MetadataStore) Get(receiver *nflogpb.Receiver, groupKey string) (map[string]string, bool) { + s.mtx.RLock() + defer s.mtx.RUnlock() + + key := stateKey(groupKey, receiver) + metadata, ok := s.data[key] + return metadata, ok +} + +// Delete removes metadata for a given receiver and group key. +func (s *MetadataStore) Delete(receiver *nflogpb.Receiver, groupKey string) { + s.mtx.Lock() + defer s.mtx.Unlock() + + key := stateKey(groupKey, receiver) + delete(s.data, key) +} + +// stateKey returns a string key for a log entry consisting of the group key and receiver. +// This matches the key generation in nflog. +func stateKey(gkey string, r *nflogpb.Receiver) string { + return receiverKey(gkey, r) +} + +// receiverKey creates a unique key from group key and receiver. +// Format matches nflog's internal stateKey format: "groupKey:groupName/integration/idx". +func receiverKey(groupKey string, r *nflogpb.Receiver) string { + return groupKey + ":" + receiverString(r) +} + +// receiverString returns a string representation of the receiver. +func receiverString(r *nflogpb.Receiver) string { + return fmt.Sprintf("%s/%s/%d", r.GroupName, r.Integration, r.Idx) +} diff --git a/notify/slack/slack.go b/notify/slack/slack.go index 006a6eb88c..8daa7104f5 100644 --- a/notify/slack/slack.go +++ b/notify/slack/slack.go @@ -27,6 +27,7 @@ import ( commoncfg "github.com/prometheus/common/config" "github.com/prometheus/alertmanager/config" + "github.com/prometheus/alertmanager/nflog/nflogpb" "github.com/prometheus/alertmanager/notify" "github.com/prometheus/alertmanager/template" "github.com/prometheus/alertmanager/types" @@ -37,11 +38,12 @@ const maxTitleLenRunes = 1024 // Notifier implements a Notifier for Slack notifications. type Notifier struct { - conf *config.SlackConfig - tmpl *template.Template - logger *slog.Logger - client *http.Client - retrier *notify.Retrier + conf *config.SlackConfig + tmpl *template.Template + logger *slog.Logger + client *http.Client + retrier *notify.Retrier + metadataStore *notify.MetadataStore postJSONFunc func(ctx context.Context, client *http.Client, url string, body io.Reader) (*http.Response, error) } @@ -54,12 +56,13 @@ func New(c *config.SlackConfig, t *template.Template, l *slog.Logger, httpOpts . } return &Notifier{ - conf: c, - tmpl: t, - logger: l, - client: client, - retrier: ¬ify.Retrier{}, - postJSONFunc: notify.PostJSON, + conf: c, + tmpl: t, + logger: l, + client: client, + retrier: ¬ify.Retrier{}, + metadataStore: notify.NewMetadataStore(), + postJSONFunc: notify.PostJSON, }, nil } @@ -71,6 +74,16 @@ type request struct { IconURL string `json:"icon_url,omitempty"` LinkNames bool `json:"link_names,omitempty"` Attachments []attachment `json:"attachments"` + // Timestamp is used for updating existing messages (chat.update API) + Timestamp string `json:"ts,omitempty"` +} + +// slackResponse represents the response from Slack API. +type slackResponse struct { + OK bool `json:"ok"` + Error string `json:"error,omitempty"` + Channel string `json:"channel,omitempty"` + TS string `json:"ts,omitempty"` // Message timestamp, used for updates } // attachment is used to display a richly-formatted message block. @@ -92,7 +105,17 @@ type attachment struct { // Notify implements the Notifier interface. func (n *Notifier) Notify(ctx context.Context, as ...*types.Alert) (bool, error) { - var err error + // Extract group key and receiver name for message tracking + key, err := notify.ExtractGroupKey(ctx) + if err != nil { + return false, err + } + + receiverName, ok := notify.ReceiverName(ctx) + if !ok { + n.logger.Warn("receiver name missing from context") + } + var ( data = notify.GetTemplateData(ctx, n.tmpl, as, n.logger) tmplText = notify.TmplText(n.tmpl, data, &err) @@ -107,10 +130,6 @@ func (n *Notifier) Notify(ctx context.Context, as ...*types.Alert) (bool, error) title, truncated := notify.TruncateInRunes(tmplText(n.conf.Title), maxTitleLenRunes) if truncated { - key, err := notify.ExtractGroupKey(ctx) - if err != nil { - return false, err - } n.logger.Warn("Truncated title", "key", key, "max_runes", maxTitleLenRunes) } att := &attachment{ @@ -188,6 +207,53 @@ func (n *Notifier) Notify(ctx context.Context, as ...*types.Alert) (bool, error) return false, err } + // Check for existing message if update_message is enabled + var existingTS string + if n.conf.UpdateMessage && receiverName != "" { + // Create a simple receiver representation for metadata lookup + simpleReceiver := &nflogpb.Receiver{ + GroupName: receiverName, + Integration: "slack", + Idx: 0, + } + + n.logger.Debug("checking for existing message", + "key", key, + "receiver", receiverName, + "update_message", n.conf.UpdateMessage) + + if metadata, ok := n.metadataStore.Get(simpleReceiver, key.String()); ok { + if ts, exists := metadata["message_ts"]; exists && ts != "" { + existingTS = ts + req.Timestamp = ts + + // For chat.update, we need to use channel ID, not channel name + if channelID, ok := metadata["channel_id"]; ok && channelID != "" { + req.Channel = channelID + n.logger.Debug("FOUND existing Slack message - will UPDATE", + "key", key, + "receiver", receiverName, + "message_ts", ts, + "channel_id", channelID) + } else { + n.logger.Debug("FOUND message but no channel_id, using channel name", + "key", key, + "message_ts", ts) + } + } else { + n.logger.Debug("metadata exists but no message_ts", "metadata", metadata) + } + } else { + n.logger.Debug("no existing message found - will create NEW", + "key", key, + "receiver", receiverName) + } + } else { + n.logger.Debug("update disabled or no receiver", + "update_message", n.conf.UpdateMessage, + "receiver", receiverName) + } + var buf bytes.Buffer if err := json.NewEncoder(&buf).Encode(req); err != nil { return false, err @@ -204,6 +270,13 @@ func (n *Notifier) Notify(ctx context.Context, as ...*types.Alert) (bool, error) u = strings.TrimSpace(string(content)) } + // Use chat.update endpoint if we're updating an existing message + if existingTS != "" && n.conf.UpdateMessage { + // Replace chat.postMessage with chat.update for updates + u = strings.Replace(u, "chat.postMessage", "chat.update", 1) + n.logger.Debug("using chat.update endpoint for message update", "url", u) + } + if n.conf.Timeout > 0 { postCtx, cancel := context.WithTimeoutCause(ctx, n.conf.Timeout, fmt.Errorf("configured slack timeout reached (%s)", n.conf.Timeout)) defer cancel() @@ -219,9 +292,15 @@ func (n *Notifier) Notify(ctx context.Context, as ...*types.Alert) (bool, error) } defer notify.Drain(resp) + // Read response body once for all checks + body, err := io.ReadAll(resp.Body) + if err != nil { + return true, fmt.Errorf("channel %q: failed to read response body: %w", req.Channel, err) + } + // Use a retrier to generate an error message for non-200 responses and // classify them as retriable or not. - retry, err := n.retrier.Check(resp.StatusCode, resp.Body) + retry, err := n.retrier.Check(resp.StatusCode, bytes.NewReader(body)) if err != nil { err = fmt.Errorf("channel %q: %w", req.Channel, err) return retry, notify.NewErrorWithReason(notify.GetFailureReasonFromStatusCode(resp.StatusCode), err) @@ -229,48 +308,62 @@ func (n *Notifier) Notify(ctx context.Context, as ...*types.Alert) (bool, error) // Slack web API might return errors with a 200 response code. // https://slack.dev/node-slack-sdk/web-api#handle-errors - retry, err = checkResponseError(resp) + retry, err = checkJSONResponseErrorFromBody(body) if err != nil { err = fmt.Errorf("channel %q: %w", req.Channel, err) return retry, notify.NewErrorWithReason(notify.ClientErrorReason, err) } - return retry, nil -} + // Save message timestamp and channel ID for future updates if update_message is enabled + if n.conf.UpdateMessage && receiverName != "" { + if slackResp, err := extractSlackResponseFromBody(body); err == nil && slackResp.TS != "" { + simpleReceiver := &nflogpb.Receiver{ + GroupName: receiverName, + Integration: "slack", + Idx: 0, + } -// checkResponseError parses out the error message from Slack API response. -func checkResponseError(resp *http.Response) (bool, error) { - body, err := io.ReadAll(resp.Body) - if err != nil { - return true, fmt.Errorf("could not read response body: %w", err) - } + metadata := map[string]string{ + "message_ts": slackResp.TS, + "channel": req.Channel, + } - if strings.HasPrefix(resp.Header.Get("Content-Type"), "application/json") { - return checkJSONResponseError(body) + // Save channel ID for future updates (required by chat.update) + if slackResp.Channel != "" { + metadata["channel_id"] = slackResp.Channel + } + + n.metadataStore.Set(simpleReceiver, key.String(), metadata) + + n.logger.Debug("saved Slack message_ts for future updates", + "key", key, + "receiver", receiverName, + "message_ts", slackResp.TS, + "channel_id", slackResp.Channel, + "is_update", existingTS != "") + } } - return checkTextResponseError(body) + + return retry, nil } -// checkTextResponseError classifies plaintext responses from Slack. -// A plaintext (non-JSON) response is successful if it's a string "ok". -// This is typically a response for an Incoming Webhook -// (https://api.slack.com/messaging/webhooks#handling_errors) -func checkTextResponseError(body []byte) (bool, error) { - if !bytes.Equal(body, []byte("ok")) { - return false, fmt.Errorf("received an error response from Slack: %s", string(body)) +// extractSlackResponseFromBody extracts the full Slack response from body. +func extractSlackResponseFromBody(body []byte) (*slackResponse, error) { + var slackResp slackResponse + if err := json.Unmarshal(body, &slackResp); err != nil { + return nil, fmt.Errorf("failed to unmarshal Slack response: %w", err) } - return false, nil -} -// checkJSONResponseError classifies JSON responses from Slack. -func checkJSONResponseError(body []byte) (bool, error) { - // response is for parsing out errors from the JSON response. - type response struct { - OK bool `json:"ok"` - Error string `json:"error"` + if !slackResp.OK { + return nil, fmt.Errorf("slack API returned error: %s", slackResp.Error) } - var data response + return &slackResp, nil +} + +// checkJSONResponseErrorFromBody classifies JSON responses from body bytes. +func checkJSONResponseErrorFromBody(body []byte) (bool, error) { + var data slackResponse if err := json.Unmarshal(body, &data); err != nil { return true, fmt.Errorf("could not unmarshal JSON response %q: %w", string(body), err) }