feat(dispatch): add start delay #4704
@@ -143,6 +143,7 @@ func run() int {
 		maxSilenceSizeBytes         = kingpin.Flag("silences.max-silence-size-bytes", "Maximum silence size in bytes. If negative or zero, no limit is set.").Default("0").Int()
 		alertGCInterval             = kingpin.Flag("alerts.gc-interval", "Interval between alert GC.").Default("30m").Duration()
 		dispatchMaintenanceInterval = kingpin.Flag("dispatch.maintenance-interval", "Interval between maintenance of aggregation groups in the dispatcher.").Default("30s").Duration()
+		DispatchStartDelay          = kingpin.Flag("dispatch.start-delay", "Minimum amount of time to wait before dispatching alerts. This option should be synced with value of --rules.alert.resend-delay on Prometheus.").Default("0s").Duration()

 		webConfig   = webflag.AddFlags(kingpin.CommandLine, ":9093")
 		externalURL = kingpin.Flag("web.external-url", "The URL under which Alertmanager is externally reachable (for example, if Alertmanager is served via a reverse proxy). Used for generating relative and absolute links back to Alertmanager itself. If the URL has a path portion, it will be used to prefix all HTTP endpoints served by Alertmanager. If omitted, relevant URL components will be derived automatically.").String()
@@ -415,7 +416,7 @@ func run() int {
 		prometheus.DefaultRegisterer,
 		configLogger,
 	)
-	configCoordinator.Subscribe(func(conf *config.Config) error {
+	configCoordinator.Subscribe(func(conf *config.Config, initial bool) error {
Contributor
Would it make sense to instead pass a delay here, of type time.Duration? Then we can pass 0 on reload and *DispatchStartDelay on the initial load.
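A rough sketch of what that suggestion could look like on the caller side, written as a fragment of run() rather than a standalone program; the dispatchDelay parameter name is illustrative and not part of the PR:

```go
// Hypothetical variant of the subscriber: the coordinator hands over a delay
// instead of an "initial" flag, so the "if initial" block later in run()
// would no longer be needed.
configCoordinator.Subscribe(func(conf *config.Config, dispatchDelay time.Duration) error {
	// ... rebuild templates, routes and the pipeline as before ...

	// The coordinator would pass *DispatchStartDelay on the initial load and 0 on reloads.
	go disp.Run(dispatchDelay)
	return nil
})
```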
 		tmpl, err = template.FromGlobs(conf.Templates)
 		if err != nil {
 			return fmt.Errorf("failed to parse templates: %w", err)
@@ -493,7 +494,17 @@ func run() int {
 			silencer.Mutes(labels)
 		})

-		disp = dispatch.NewDispatcher(alerts, routes, pipeline, marker, timeoutFunc, *dispatchMaintenanceInterval, nil, logger, dispMetrics)
+		disp = dispatch.NewDispatcher(
+			alerts,
+			routes,
+			pipeline,
+			marker,
+			timeoutFunc,
+			*dispatchMaintenanceInterval,
+			nil,
+			logger,
+			dispMetrics,
+		)
 		routes.Walk(func(r *dispatch.Route) {
 			if r.RouteOpts.RepeatInterval > *retention {
 				configLogger.Warn(
@@ -520,7 +531,13 @@ func run() int {
 			}
 		})

-		go disp.Run()
+		dispatchDelay := time.Duration(0)
+		if initial {
+			// Only set minDispatchTime if we're in the initial start and not in a reload.
+			// This ensures immediate dispatch after a reload and optional delay after initial start.
+			dispatchDelay = *DispatchStartDelay
+		}
+		go disp.Run(dispatchDelay)
 		go inhibitor.Run()

 		return nil
@@ -30,9 +30,10 @@ type Coordinator struct {
 	logger         *slog.Logger

 	// Protects config and subscribers
-	mutex       sync.Mutex
-	config      *Config
-	subscribers []func(*Config) error
+	mutex         sync.Mutex
+	config        *Config
+	subscribers   []func(*Config, bool) error
+	initialReload bool
Contributor
Here we could have an initialDelay, and pass it into the function, plus set it to 0 after the initial notification.

Contributor (Author)
In that case we can bypass this and just do the reset to zero after the first time dispatch runs in …
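A small, self-contained sketch of the alternative being discussed here: the coordinator holds the delay itself and zeroes it after the first notification, so subscribers receive a time.Duration instead of a bool. All names (Subscriber, initialDelay, notifySubscribers) are illustrative, not the PR's actual API.

```go
package main

import (
	"fmt"
	"time"
)

// Subscriber receives the parsed config and the delay to apply before dispatching.
type Subscriber func(cfg string, dispatchDelay time.Duration) error

// Coordinator notifies subscribers on config (re)loads.
type Coordinator struct {
	initialDelay time.Duration // e.g. *DispatchStartDelay; zeroed after the first notification
	subscribers  []Subscriber
}

func (c *Coordinator) notifySubscribers(cfg string) error {
	for _, s := range c.subscribers {
		if err := s(cfg, c.initialDelay); err != nil {
			return err
		}
	}
	// After the initial load, reloads dispatch immediately.
	c.initialDelay = 0
	return nil
}

func main() {
	c := &Coordinator{initialDelay: time.Minute}
	c.subscribers = append(c.subscribers, func(cfg string, d time.Duration) error {
		fmt.Printf("config %q loaded, dispatch delay %s\n", cfg, d)
		return nil
	})
	_ = c.notifySubscribers("initial") // dispatch delay 1m0s
	_ = c.notifySubscribers("reload")  // dispatch delay 0s
}
```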

 	configHashMetric    prometheus.Gauge
 	configSuccessMetric prometheus.Gauge
@@ -46,6 +47,7 @@ func NewCoordinator(configFilePath string, r prometheus.Registerer, l *slog.Logg
 	c := &Coordinator{
 		configFilePath: configFilePath,
 		logger:         l,
+		initialReload:  true,
 	}

 	c.registerMetrics(r)
@@ -73,7 +75,7 @@ func (c *Coordinator) registerMetrics(r prometheus.Registerer) {
 }

 // Subscribe subscribes the given Subscribers to configuration changes.
-func (c *Coordinator) Subscribe(ss ...func(*Config) error) {
+func (c *Coordinator) Subscribe(ss ...func(*Config, bool) error) {
 	c.mutex.Lock()
 	defer c.mutex.Unlock()
@@ -82,11 +84,13 @@ func (c *Coordinator) Subscribe(ss ...func(*Config) error) {

 func (c *Coordinator) notifySubscribers() error {
 	for _, s := range c.subscribers {
-		if err := s(c.config); err != nil {
+		if err := s(c.config, c.initialReload); err != nil {
 			return err
 		}
 	}

+	// Set initialReload to false after the first notification.
+	c.initialReload = false
 	return nil
 }
@@ -25,13 +25,26 @@ import (
 	"github.com/prometheus/client_golang/prometheus"
 	"github.com/prometheus/client_golang/prometheus/promauto"
 	"github.com/prometheus/common/model"
+	"go.uber.org/atomic"

 	"github.com/prometheus/alertmanager/notify"
 	"github.com/prometheus/alertmanager/provider"
 	"github.com/prometheus/alertmanager/store"
 	"github.com/prometheus/alertmanager/types"
 )

+const (
+	DispatcherStateUnknown = iota
+	DispatcherStateWaitingToStart
+	DispatcherStateRunning
+)
+
+var state = map[int]string{
Contributor
It's probably more idiomatic to implement e.g. …
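The comment is cut off, but the idiomatic pattern it most likely points at is implementing fmt.Stringer on a dedicated state type instead of keeping a package-level map. A possible sketch, with illustrative names:

```go
// DispatcherState names the lifecycle phase of the dispatcher.
type DispatcherState int

const (
	DispatcherStateUnknown DispatcherState = iota
	DispatcherStateWaitingToStart
	DispatcherStateRunning
)

// String implements fmt.Stringer, so the state logs nicely with %s / %v.
func (s DispatcherState) String() string {
	switch s {
	case DispatcherStateWaitingToStart:
		return "waiting_to_start"
	case DispatcherStateRunning:
		return "running"
	default:
		return "unknown"
	}
}
```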
+	DispatcherStateUnknown:        "unknown",
+	DispatcherStateWaitingToStart: "waiting_to_start",
+	DispatcherStateRunning:        "running",
+}
+
 // DispatcherMetrics represents metrics associated to a dispatcher.
 type DispatcherMetrics struct {
 	aggrGroups            prometheus.Gauge
@@ -90,6 +103,9 @@ type Dispatcher struct {
 	cancel  func()

 	logger *slog.Logger
+
+	startTimer *time.Timer
+	state      int
 }

 // Limits describes limits used by Dispatcher.
@@ -102,39 +118,44 @@ type Limits interface {

 // NewDispatcher returns a new Dispatcher.
 func NewDispatcher(
-	ap provider.Alerts,
-	r *Route,
-	s notify.Stage,
-	mk types.GroupMarker,
-	to func(time.Duration) time.Duration,
-	mi time.Duration,
-	lim Limits,
-	l *slog.Logger,
-	m *DispatcherMetrics,
+	alerts provider.Alerts,
+	route *Route,
+	stage notify.Stage,
+	marker types.GroupMarker,
+	timeout func(time.Duration) time.Duration,
+	maintenanceInterval time.Duration,
+	limits Limits,
+	logger *slog.Logger,
+	metrics *DispatcherMetrics,
 ) *Dispatcher {
-	if lim == nil {
-		lim = nilLimits{}
+	if limits == nil {
+		limits = nilLimits{}
 	}

 	disp := &Dispatcher{
-		alerts:              ap,
-		stage:               s,
-		route:               r,
-		marker:              mk,
-		timeout:             to,
-		maintenanceInterval: mi,
-		logger:              l.With("component", "dispatcher"),
-		metrics:             m,
-		limits:              lim,
+		alerts:              alerts,
+		stage:               stage,
+		route:               route,
+		marker:              marker,
+		timeout:             timeout,
+		maintenanceInterval: maintenanceInterval,
+		logger:              logger.With("component", "dispatcher"),
+		metrics:             metrics,
+		limits:              limits,
+		state:               DispatcherStateUnknown,
 	}
 	return disp
 }

 // Run starts dispatching alerts incoming via the updates channel.
-func (d *Dispatcher) Run() {
+func (d *Dispatcher) Run(dispatchDelay time.Duration) {
Contributor
Tiny nitpick, take it or leave it: I think it'd be slightly better if this took a …
 	d.done = make(chan struct{})

 	d.mtx.Lock()
+	d.logger.Debug("preparing to start", "dispatchDelay", dispatchDelay)
+	d.startTimer = time.NewTimer(dispatchDelay)
+	d.state = DispatcherStateWaitingToStart
+	d.logger.Debug("setting state", "state", state[d.state])
 	d.aggrGroupsPerRoute = map[*Route]map[model.Fingerprint]*aggrGroup{}
 	d.aggrGroupsNum = 0
 	d.metrics.aggrGroups.Set(0)
@@ -176,6 +197,18 @@ func (d *Dispatcher) run(it provider.AlertIterator) {
 			}
 			d.metrics.processingDuration.Observe(time.Since(now).Seconds())

+		case <-d.startTimer.C:
+			if d.state == DispatcherStateWaitingToStart {
+				d.state = DispatcherStateRunning
+				d.logger.Debug("started", "state", state[d.state])
+				d.logger.Debug("Starting all existing aggregation groups")
+				for _, groups := range d.aggrGroupsPerRoute {
+					for _, ag := range groups {
+						d.runAG(ag)
+					}
+				}
+			}
+
 		case <-maintenance.C:
 			d.doMaintenance()
 		case <-d.ctx.Done():
@@ -311,6 +344,7 @@ type notifyFunc func(context.Context, ...*types.Alert) bool
 // processAlert determines in which aggregation group the alert falls
 // and inserts it.
 func (d *Dispatcher) processAlert(alert *types.Alert, route *Route) {
+	now := time.Now()
 	groupLabels := getGroupLabels(alert, route)

 	fp := groupLabels.Fingerprint()
@@ -347,6 +381,34 @@ func (d *Dispatcher) processAlert(alert *types.Alert, route *Route) {
 	// alert is already there.
 	ag.insert(alert)

+	if alert.StartsAt.Add(ag.opts.GroupWait).Before(now) {
+		ag.logger.Debug(
+			"Alert is old enough for immediate flush, resetting timer to zero",
+			"alert", alert.Name(),
+			"fingerprint", alert.Fingerprint(),
+			"startsAt", alert.StartsAt,
+		)
+		ag.resetTimer(0)
+	}
+	// Check dispatcher and alert state to determine if we should run the AG now.
+	switch d.state {
+	case DispatcherStateWaitingToStart:
+		d.logger.Debug("Dispatcher still waiting to start")
+	case DispatcherStateRunning:
+		d.runAG(ag)
+	default:
+		s, ok := state[d.state]
+		if !ok {
+			s = "unknown"
+		}
+		d.logger.Warn("unknown state detected", "state", s)
+	}
+}
+
+func (d *Dispatcher) runAG(ag *aggrGroup) {
+	if ag.running.Load() {
+		return
+	}
 	go ag.run(func(ctx context.Context, alerts ...*types.Alert) bool {
 		_, _, err := d.stage.Exec(ctx, d.logger, alerts...)
 		if err != nil {
@@ -392,13 +454,18 @@ type aggrGroup struct {
 	done    chan struct{}
 	next    *time.Timer
 	timeout func(time.Duration) time.Duration

 	mtx        sync.RWMutex
 	hasFlushed bool
+	running    atomic.Bool
Contributor
I know we use go.uber.org/atomic elsewhere, but does this buy us anything over a sync/atomic Bool? https://pkg.go.dev/sync/atomic#pkg-types Should we use this and start a switch?

Contributor (Author)
There is currently a lint check which enforces this.

Member
FYI, there's an open effort in Prometheus to switch to the new stdlib atomic types (those only got added in Go 1.19): …

Member
Yeah, I'm not sure. This was introduced in Prometheus in 2020 (prometheus/prometheus#7647). I guess we'll need to find out from the rest of the Prometheus devs whether we still need it.

Member
We can remove the check; the stdlib is what we want, as per prometheus/prometheus#14866.

Member
Looks like we can migrate to the stdlib now: prometheus/prometheus#14866
 }

 // newAggrGroup returns a new aggregation group.
-func newAggrGroup(ctx context.Context, labels model.LabelSet, r *Route, to func(time.Duration) time.Duration, marker types.AlertMarker, logger *slog.Logger) *aggrGroup {
+func newAggrGroup(
+	ctx context.Context,
+	labels model.LabelSet,
+	r *Route,
+	to func(time.Duration) time.Duration,
+	marker types.AlertMarker,
+	logger *slog.Logger,
+) *aggrGroup {
 	if to == nil {
 		to = func(d time.Duration) time.Duration { return d }
 	}
@@ -436,6 +503,7 @@ func (ag *aggrGroup) String() string {
 }

 func (ag *aggrGroup) run(nf notifyFunc) {
+	ag.running.Store(true)
 	defer close(ag.done)
 	defer ag.next.Stop()
@@ -462,10 +530,7 @@ func (ag *aggrGroup) run(nf notifyFunc) {
 			ctx = notify.WithRouteID(ctx, ag.routeID)

 			// Wait the configured interval before calling flush again.
-			ag.mtx.Lock()
-			ag.next.Reset(ag.opts.GroupInterval)
-			ag.hasFlushed = true
-			ag.mtx.Unlock()
+			ag.resetTimer(ag.opts.GroupInterval)

 			ag.flush(func(alerts ...*types.Alert) bool {
 				return nf(ctx, alerts...)
@@ -486,19 +551,16 @@ func (ag *aggrGroup) stop() {
 	<-ag.done
 }

+// resetTimer resets the timer for the AG.
+func (ag *aggrGroup) resetTimer(t time.Duration) {
+	ag.next.Reset(t)
+}
+
 // insert inserts the alert into the aggregation group.
 func (ag *aggrGroup) insert(alert *types.Alert) {
 	if err := ag.alerts.Set(alert); err != nil {
 		ag.logger.Error("error on set alert", "err", err)
 	}
-
-	// Immediately trigger a flush if the wait duration for this
-	// alert is already over.
-	ag.mtx.Lock()
-	defer ag.mtx.Unlock()
-	if !ag.hasFlushed && alert.StartsAt.Add(ag.opts.GroupWait).Before(time.Now()) {
Contributor
Somewhat unrelated to this change, but I noticed it when reviewing the new code: I think there's a very minor logic bug here, if an alert's … E.g.

wantedFlush := time.Since(alert.StartsAt.Add(ag.opts.GroupWait))
if wantedFlush < time.Duration(0) {
	wantedFlush = time.Duration(0)
}
actualFlush := ag.durationToNextFlush()
if wantedFlush < actualFlush {
	timer.Reset(wantedFlush)
}

I don't think we should change the behavior in this PR though. Perhaps as a follow-up.

Contributor (Author)
Good catch, we can add it here or as a follow-up.
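Reading the snippet above, the bug the reviewer appears to describe is that the timer is only zeroed when the group has never flushed and the alert's group_wait has fully elapsed; in other cases the alert can sit out the whole group_interval even though an earlier flush is due. Below is a small, self-contained sketch of the "shorten the timer to the earlier deadline" idea. It uses time.Until rather than time.Since (which seems to be the intent), and group, maybeShortenTimer and durationToNextFlush are hypothetical names, not code from this PR:

```go
package main

import (
	"fmt"
	"time"
)

// group is a toy stand-in for aggrGroup: it only tracks when the next flush is due.
type group struct {
	next      *time.Timer
	nextFlush time.Time
}

// durationToNextFlush plays the role of the helper hypothesized in the comment above.
func (g *group) durationToNextFlush() time.Duration {
	return time.Until(g.nextFlush)
}

// maybeShortenTimer resets the timer if this alert warrants an earlier flush than
// the one currently scheduled, instead of only handling the "never flushed" case.
func (g *group) maybeShortenTimer(startsAt time.Time, groupWait time.Duration) {
	wantedFlush := time.Until(startsAt.Add(groupWait)) // remaining wait for this alert
	if wantedFlush < 0 {
		wantedFlush = 0 // the wait is already over: flush as soon as possible
	}
	if wantedFlush < g.durationToNextFlush() {
		g.next.Reset(wantedFlush)
		g.nextFlush = time.Now().Add(wantedFlush)
	}
}

func main() {
	g := &group{next: time.NewTimer(5 * time.Minute), nextFlush: time.Now().Add(5 * time.Minute)}
	// An alert that fired 40s ago with a 30s group_wait is already overdue, so the
	// 5-minute timer gets pulled in to fire immediately.
	g.maybeShortenTimer(time.Now().Add(-40*time.Second), 30*time.Second)
	fmt.Println("time to next flush:", g.durationToNextFlush().Round(time.Second))
}
```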
-		ag.next.Reset(0)
-	}
 }

 func (ag *aggrGroup) empty() bool {
Default in Prometheus is 1m, so should the default in AM also be 1m?

I initially set this to 1m, but it changes the default behaviour and lots of acceptance tests fail. We can avoid breaking the existing tests by setting it to 0 for all tests. Adjusting timings is not possible since we add +1m to each test. But I thought the same thing could happen to users unexpectedly if they don't pay attention to the changelog and this new cmd flag. I'm open to setting the default to 1m to sync it with Prometheus defaults.

I see, whatever you choose, you will choose wrong :D
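For illustration only, this is roughly what the flag definition would look like with the default aligned to Prometheus's 1m resend delay; the PR as written keeps "0s":

```go
// Sketch: the same flag as in the diff above, but defaulting to 1m instead of 0s.
DispatchStartDelay = kingpin.Flag(
	"dispatch.start-delay",
	"Minimum amount of time to wait before dispatching alerts. This option should be synced with value of --rules.alert.resend-delay on Prometheus.",
).Default("1m").Duration()
```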