feat(go): add dynamic rate-limiting (#823)

Proximyst · web-flow · commit 80b783e335e7 · 2025-10-21T14:18:24.000+02:00
diff --git a/cmd/server/cmd.go b/cmd/server/cmd.go
@@ -19,7 +19,7 @@ func NewCmd() *cli.Command {
 	return &cli.Command{
 		Name:   "server",
 		Usage:  "Run the server part of the service.",
-		Flags:  slices.Concat(config.ServerFlags(), config.TracingFlags(), config.BrowserFlags()),
+		Flags:  slices.Concat(config.ServerFlags(), config.TracingFlags(), config.BrowserFlags(), config.RateLimitFlags()),
 		Action: run,
 	}
 }
@@ -37,6 +37,10 @@ func run(ctx context.Context, c *cli.Command) error {
 	if err != nil {
 		return fmt.Errorf("failed to parse tracing config: %w", err)
 	}
+	rateLimitConfig, err := config.RateLimitConfigFromCommand(c)
+	if err != nil {
+		return fmt.Errorf("failed to parse process tracker config: %w", err)
+	}
 	tracerProvider, err := traces.NewTracerProvider(ctx, tracingConfig)
 	if err != nil {
 		return fmt.Errorf("failed to set up tracer: %w", err)
@@ -47,10 +51,11 @@ func run(ctx context.Context, c *cli.Command) error {
 		otel.SetTracerProvider(tracerProvider)
 		otel.SetTextMapPropagator(propagation.TraceContext{})
 	}
-	browser := service.NewBrowserService(browserConfig)
+	processStatService := service.NewProcessStatService(rateLimitConfig)
+	browser := service.NewBrowserService(browserConfig, processStatService)
 	versions := service.NewVersionService()
 	metrics := metrics.NewRegistry()
-	handler, err := api.NewHandler(metrics, serverConfig, browser, versions)
+	handler, err := api.NewHandler(metrics, serverConfig, rateLimitConfig, processStatService, browser, versions)
 	if err != nil {
 		return fmt.Errorf("failed to create API handler: %w", err)
 	}
diff --git a/devenv/docker/go-build/docker-compose.yaml b/devenv/docker/go-build/docker-compose.yaml
@@ -46,6 +46,10 @@ services:
     environment:
       TRACING_ENDPOINT: http://tempo:4318/v1/traces
       LOG_LEVEL: debug
+    command:
+      - server
+      # 1 GiB
+      - --rate-limit.max-available=1073741824
     ports:
       - 8081:8081
     depends_on:
diff --git a/go.mod b/go.mod
@@ -10,6 +10,7 @@ require (
 	github.com/docker/go-connections v0.6.0
 	github.com/gen2brain/go-fitz v1.24.15
 	github.com/go-jose/go-jose/v4 v4.1.2
+	github.com/pbnjay/memory v0.0.0-20210728143218-7b4eea64cf58
 	github.com/prometheus/client_golang v1.23.0
 	github.com/shirou/gopsutil/v4 v4.25.7
 	github.com/stretchr/testify v1.11.1
diff --git a/go.sum b/go.sum
@@ -123,6 +123,8 @@ github.com/opencontainers/image-spec v1.1.1 h1:y0fUlFfIZhPF1W537XOLg0/fcx6zcHCJw
 github.com/opencontainers/image-spec v1.1.1/go.mod h1:qpqAh3Dmcf36wStyyWU+kCeDgrGnAve2nCC8+7h8Q0M=
 github.com/orisano/pixelmatch v0.0.0-20220722002657-fb0b55479cde h1:x0TT0RDC7UhAVbbWWBzr41ElhJx5tXPWkIHA2HWPRuw=
 github.com/orisano/pixelmatch v0.0.0-20220722002657-fb0b55479cde/go.mod h1:nZgzbfBr3hhjoZnS66nKrHmduYNpc34ny7RK4z5/HM0=
+github.com/pbnjay/memory v0.0.0-20210728143218-7b4eea64cf58 h1:onHthvaw9LFnH4t2DcNVpwGmV9E1BkGknEliJkfwQj0=
+github.com/pbnjay/memory v0.0.0-20210728143218-7b4eea64cf58/go.mod h1:DXv8WO4yhMYhSNPKjeNKa5WY9YCIEBRbNzFFPJbWO6Y=
 github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4=
 github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
 github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
diff --git a/pkg/api/middleware/auth.go b/pkg/api/middleware/auth.go
@@ -8,7 +8,7 @@ import (
 )
 
 var MetricAuthenticatedRequestAttempt = prometheus.NewCounterVec(prometheus.CounterOpts{
-	Name: "http_authenticated_request_attempt",
+	Name: "http_authenticated_request_attempts_total",
 	Help: "Counts the attempts of authenticated requests",
 }, []string{"result"})
 
diff --git a/pkg/api/middleware/ratelimiter.go b/pkg/api/middleware/ratelimiter.go
@@ -0,0 +1,148 @@
+package middleware
+
+import (
+	"context"
+	"log/slog"
+	"net/http"
+	"sync/atomic"
+
+	"github.com/grafana/grafana-image-renderer/pkg/config"
+	"github.com/grafana/grafana-image-renderer/pkg/service"
+	"github.com/pbnjay/memory"
+	"github.com/prometheus/client_golang/prometheus"
+	"go.opentelemetry.io/otel/attribute"
+	"go.opentelemetry.io/otel/codes"
+	"go.opentelemetry.io/otel/trace"
+)
+
+// MetricRateLimiterRequests counts the requests seen by the rate-limiter.
+// The "result" label is the outcome of the decision and "why" is the reason
+// string that accompanied it.
+var MetricRateLimiterRequests = prometheus.NewCounterVec(
+	prometheus.CounterOpts{
+		Name: "http_rate_limiter_requests_total",
+		Help: "Number of HTTP requests that pass through the rate-limiter, and their outcomes.",
+	},
+	[]string{"result", "why"},
+)
+
+// MetricRateLimiterSlots exposes the slot counts computed by the rate-limiter,
+// partitioned by the "type" label.
+var MetricRateLimiterSlots = prometheus.NewGaugeVec(
+	prometheus.GaugeOpts{
+		Name: "http_rate_limiter_slots",
+		Help: "The number of total available slots for handling requests, based on memory.",
+	},
+	[]string{"type"},
+)
+
+// Limiter is the common shape shared by the rate-limiting middlewares.
+// Limit receives the handler to protect and returns the handler that should
+// be served in its place.
+type Limiter interface {
+	Limit(next http.Handler) http.Handler
+}
+
+// noOpLimiter is a Limiter that never rejects anything.
+type noOpLimiter struct{}
+
+// Limit hands the wrapped handler back untouched.
+func (noOpLimiter) Limit(handler http.Handler) http.Handler {
+	return handler
+}
+
+// processBasedLimiter is the Limiter that decides per request whether another
+// render fits, using the configured limits, system memory figures and the
+// peak memory reported by the process stat service.
+type processBasedLimiter struct {
+	// svc supplies PeakMemory, used when sizing the memory needed per browser.
+	svc     *service.ProcessStatService
+	// cfg holds the limits, headroom and per-browser memory settings.
+	cfg     config.RateLimitConfig
+	// running counts the requests currently inside the wrapped handler. It is
+	// a pointer so that copies of this value-receiver type share one counter.
+	running *atomic.Uint32
+	// logger is tagged with the middleware name at construction.
+	// NOTE(review): no method in this file appears to use it yet.
+	logger  *slog.Logger
+}
+
+// Limit wraps next so that every request is first checked by canFitRequest.
+//
+// A rejected request is answered with 429 Too Many Requests and a Retry-After
+// of 5 seconds; next is never invoked for it. An accepted request is counted
+// in p.running for exactly as long as next.ServeHTTP runs. Both outcomes are
+// recorded on the span and in MetricRateLimiterRequests.
+//
+// NOTE(review): the capacity check and the increment are two separate atomic
+// operations, so concurrent requests can all pass the check before any of them
+// is counted. Confirm that treating the limits as soft is acceptable.
+func (p processBasedLimiter) Limit(next http.Handler) http.Handler {
+	return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		tracer := tracer(r.Context())
+		ctx, span := tracer.Start(r.Context(), "processBasedLimiter.Limit")
+		defer span.End()
+
+		fits, why := p.canFitRequest(ctx)
+		// Set once for both outcomes; the rejection path needs no second copy.
+		span.SetAttributes(attribute.Bool("accepted", fits), attribute.String("reason", why))
+
+		if !fits {
+			span.SetStatus(codes.Error, "rate limit exceeded")
+			MetricRateLimiterRequests.WithLabelValues("rejected", why).Inc()
+
+			w.Header().Set("Retry-After", "5")
+			w.WriteHeader(http.StatusTooManyRequests)
+			_, _ = w.Write([]byte("server is too busy, try again later"))
+			return
+		}
+
+		p.running.Add(1)
+		// Adding ^uint32(0) is the documented way to decrement an unsigned
+		// atomic. Deferring it right after the increment guarantees the slot
+		// is released even if anything below panics.
+		defer p.running.Add(^uint32(0))
+		MetricRateLimiterRequests.WithLabelValues("accepted", why).Inc()
+
+		next.ServeHTTP(w, r)
+	})
+}
+
+// canFitRequest reports whether one more request may start, together with a
+// short human-readable reason that is used as a metric label and span attribute.
+//
+// The checks run in this order:
+//  1. Fewer than cfg.MinLimit requests running: always accept.
+//  2. cfg.MaxLimit set and reached: always reject.
+//  3. Slots derived from total memory (capped by cfg.MaxAvailable) divided by
+//     cfg.MinMemoryPerBrowser: reject when all are taken.
+//  4. Free memory: reject unless, after reserving cfg.Headroom, there is room
+//     for max(cfg.MinMemoryPerBrowser, svc.PeakMemory).
+//
+// A memory figure of 0 is treated as "could not be determined" and the
+// corresponding check is skipped.
+func (p processBasedLimiter) canFitRequest(ctx context.Context) (bool, string) {
+	tracer := tracer(ctx)
+	_, span := tracer.Start(ctx, "processBasedLimiter.canFitRequest", trace.WithAttributes(
+		attribute.Int64("headroom", int64(p.cfg.Headroom)),
+		attribute.Int64("min_memory_per_browser", int64(p.cfg.MinMemoryPerBrowser)),
+		attribute.Int64("min_limit", int64(p.cfg.MinLimit)),
+		attribute.Int64("max_limit", int64(p.cfg.MaxLimit)),
+		attribute.Int64("max_available", int64(p.cfg.MaxAvailable))))
+	defer span.End()
+
+	currentlyRunning := p.running.Load()
+	span.SetAttributes(attribute.Int64("currently_running", int64(currentlyRunning)))
+	if currentlyRunning < p.cfg.MinLimit {
+		return true, "below minimum limit"
+	}
+	if p.cfg.MaxLimit > 0 && currentlyRunning >= p.cfg.MaxLimit {
+		return false, "hit maximum limit"
+	}
+
+	totalMemory := memory.TotalMemory()
+	if p.cfg.MaxAvailable > 0 && totalMemory > p.cfg.MaxAvailable {
+		span.AddEvent("capping total memory to configured maximum")
+		totalMemory = p.cfg.MaxAvailable
+	}
+	freeMemory := memory.FreeMemory()
+	span.SetAttributes(
+		attribute.Int64("total_memory", int64(totalMemory)),
+		attribute.Int64("free_memory", int64(freeMemory)))
+
+	switch {
+	case totalMemory == 0:
+		span.AddEvent("unable to determine total memory, skipping total memory slot check")
+	case p.cfg.MinMemoryPerBrowser == 0:
+		// A minimum of 0 means "disabled"; dividing by it would panic.
+		span.AddEvent("minimum memory per browser is disabled, skipping total memory slot check")
+	default:
+		// Work in uint64 throughout: narrowing the slot count to uint32 could
+		// truncate it when the per-browser minimum is very small.
+		running := uint64(currentlyRunning)
+		totalSlots := totalMemory / p.cfg.MinMemoryPerBrowser
+		// MinLimit can let more requests run than there are slots, so clamp
+		// instead of letting the unsigned subtraction wrap around.
+		var freeSlots uint64
+		if totalSlots > running {
+			freeSlots = totalSlots - running
+		}
+		MetricRateLimiterSlots.WithLabelValues("total").Set(float64(totalSlots))
+		MetricRateLimiterSlots.WithLabelValues("free").Set(float64(freeSlots))
+		span.SetAttributes(attribute.Int64("total_slots", int64(totalSlots)))
+		if running >= totalSlots {
+			return false, "no memory slots exist based on total memory"
+		}
+	}
+
+	if freeMemory == 0 {
+		span.AddEvent("unable to determine free memory, skipping free memory check")
+		return true, "sufficient memory slots exist"
+	}
+
+	// Calculate whether we have enough for another slot.
+	minRequired := max(p.cfg.MinMemoryPerBrowser, uint64(p.svc.PeakMemory))
+	span.SetAttributes(attribute.Int64("min_required_per_browser", int64(minRequired)))
+	if freeMemory < p.cfg.Headroom {
+		return false, "free memory smaller than required headroom"
+	}
+	// Safe subtraction: freeMemory >= Headroom was established just above.
+	if freeMemory-p.cfg.Headroom < minRequired {
+		return false, "not enough free memory without headroom for another browser"
+	}
+
+	return true, "sufficient memory slots exist"
+}
+
+// NewRateLimiter builds the Limiter selected by cfg: a process-based limiter
+// backed by svc when rate limiting is enabled, or a pass-through limiter when
+// cfg.Disabled is set. The returned error is currently always nil.
+func NewRateLimiter(svc *service.ProcessStatService, cfg config.RateLimitConfig) (Limiter, error) {
+	if !cfg.Disabled {
+		limiter := processBasedLimiter{
+			svc:     svc,
+			cfg:     cfg,
+			running: new(atomic.Uint32),
+			logger:  slog.With("middleware", "rate_limiter"),
+		}
+		return limiter, nil
+	}
+
+	return noOpLimiter{}, nil
+}
diff --git a/pkg/api/middleware/recovery.go b/pkg/api/middleware/recovery.go
@@ -9,7 +9,7 @@ import (
 )
 
 var MetricRecoveredRequests = prometheus.NewCounter(prometheus.CounterOpts{
-	Name: "http_recovered_requests",
+	Name: "http_recovered_requests_total",
 	Help: "How many HTTP requests have panicked but recovered to not crash the application?",
 })
 
diff --git a/pkg/api/middleware/trustedurl.go b/pkg/api/middleware/trustedurl.go
@@ -8,7 +8,7 @@ import (
 )
 
 var MetricTrustedURLRequests = prometheus.NewCounterVec(prometheus.CounterOpts{
-	Name: "http_trusted_url_requests",
+	Name: "http_trusted_url_requests_total",
 	Help: "Counts the requests with URL queries",
 }, []string{"result"})
 
diff --git a/pkg/api/mux.go b/pkg/api/mux.go
@@ -1,6 +1,7 @@
 package api
 
 import (
+	"fmt"
 	"net/http"
 
 	"github.com/grafana/grafana-image-renderer/pkg/api/middleware"
@@ -17,18 +18,25 @@ func NewHandler(
 		prometheus.Registerer
 	},
 	serverConfig config.ServerConfig,
+	rateLimitConfig config.RateLimitConfig,
+	processStatService *service.ProcessStatService,
 	browser *service.BrowserService,
 	versions *service.VersionService,
 ) (http.Handler, error) {
+	limiter, err := middleware.NewRateLimiter(processStatService, rateLimitConfig)
+	if err != nil {
+		return nil, fmt.Errorf("failed to create rate limiter: %w", err)
+	}
+
 	mux := http.NewServeMux()
 	mux.Handle("GET /", http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
 		_, _ = w.Write([]byte("Grafana Image Renderer (Go)"))
 	}))
 	mux.Handle("GET /metrics", middleware.TracingFor("promhttp.HandlerFor", promhttp.HandlerFor(metrics, promhttp.HandlerOpts{Registry: metrics})))
 	mux.Handle("GET /healthz", HandleGetHealthz())
 	mux.Handle("GET /version", HandleGetVersion(versions, browser))
-	mux.Handle("GET /render", middleware.RequireAuthToken(middleware.TrustedURL(HandleGetRender(browser)), serverConfig.AuthTokens...))
-	mux.Handle("GET /render/csv", middleware.RequireAuthToken(middleware.TrustedURL(HandleGetRenderCSV(browser)), serverConfig.AuthTokens...))
+	mux.Handle("GET /render", middleware.RequireAuthToken(middleware.TrustedURL(limiter.Limit(HandleGetRender(browser))), serverConfig.AuthTokens...))
+	mux.Handle("GET /render/csv", middleware.RequireAuthToken(middleware.TrustedURL(limiter.Limit(HandleGetRenderCSV(browser))), serverConfig.AuthTokens...))
 	mux.Handle("GET /render/version", HandleGetRenderVersion(versions))
 
 	handler := middleware.RequestMetrics(mux)
diff --git a/pkg/config/config.go b/pkg/config/config.go
@@ -588,3 +588,114 @@ func BrowserConfigFromCommand(c *cli.Command) (BrowserConfig, error) {
 		Landscape:                       !c.Bool("browser.portrait"),
 	}, nil
 }
+
+// RateLimitConfig holds the settings for the dynamic, memory-aware rate limiter
+// and for the process statistics tracker that feeds it.
+type RateLimitConfig struct {
+	// Disabled indicates whether rate limiting is disabled.
+	Disabled bool
+
+	// TrackerDecay is the number N in decaying averages, `avg = ((N-1)*avg + new) / N`.
+	// This must be a minimum of 1, which will not use a slow-moving average at all.
+	TrackerDecay int64
+	// TrackerInterval is how often to sample process statistics on the browser processes.
+	// This must be a minimum of 1ms.
+	TrackerInterval time.Duration
+
+	// MinLimit is the minimum number of requests to permit.
+	// Even if we don't have slots for it, we will permit at least this many requests.
+	// Set to 0 to disable minimum; this is generally not recommended, especially in containerised environments like Kubernetes.
+	MinLimit uint32
+	// MaxLimit is the maximum number of requests to permit.
+	// Even if we have memory slots for more, we won't exceed this.
+	// Set to 0 to disable maximum; this is generally the way to go in horizontally scaled deployments.
+	MaxLimit uint32
+
+	// MaxAvailable is the maximum amount of memory (in bytes) available to processes.
+	// If there is more memory than this, we will only consider this amount.
+	// Set to 0 to use all available memory.
+	MaxAvailable uint64
+	// MinMemoryPerBrowser is the minimum amount of memory (in bytes) each browser process is expected to use.
+	// If the process tracker reports less, this is the value used. Otherwise, we use the process tracker's value.
+	// Set to 0 to disable the minimum.
+	MinMemoryPerBrowser uint64
+	// Headroom is how much memory (in bytes) should be left after the request's browser takes its share.
+	// If this cannot be accommodated, we will reject the request.
+	// Set to 0 to disable headroom.
+	Headroom uint64
+}
+
+// RateLimitFlags returns the command-line flags that configure rate limiting
+// and the process tracker. Each flag can also be supplied through the config
+// key and environment variable(s) named in its Sources.
+func RateLimitFlags() []cli.Flag {
+	// validateDecay enforces the documented lower bound on the decay factor.
+	validateDecay := func(n int64) error {
+		if n < 1 {
+			return fmt.Errorf("rate-limit.process-tracker.decay must be at least 1")
+		}
+		return nil
+	}
+	// validateInterval enforces the documented lower bound on the sampling interval.
+	validateInterval := func(d time.Duration) error {
+		if d < time.Millisecond {
+			return fmt.Errorf("rate-limit.process-tracker.interval must be at least 1ms")
+		}
+		return nil
+	}
+
+	return []cli.Flag{
+		&cli.BoolFlag{
+			Name:    "rate-limit.disabled",
+			Usage:   "Disable rate limiting entirely.",
+			Sources: FromConfig("rate-limit.disabled", "RATE_LIMIT_DISABLED"),
+		},
+		&cli.Int64Flag{
+			Name: "rate-limit.process-tracker.decay",
+			// No back quotes around the formula: the CLI library treats a
+			// back-quoted span in Usage as the name of the value placeholder.
+			Usage:     "The decay factor N to use in slow-moving averages of process statistics, where avg = ((N-1)*avg + new) / N. Must be at least 1.",
+			Value:     5,
+			Sources:   FromConfig("rate-limit.process-tracker.decay", "RATE_LIMIT_PROCESS_TRACKER_DECAY"),
+			Validator: validateDecay,
+		},
+		&cli.DurationFlag{
+			Name:      "rate-limit.process-tracker.interval",
+			Usage:     "How often to sample process statistics on the browser processes. Must be >= 1ms.",
+			Value:     50 * time.Millisecond,
+			Sources:   FromConfig("rate-limit.process-tracker.interval", "RATE_LIMIT_PROCESS_TRACKER_INTERVAL"),
+			Validator: validateInterval,
+		},
+		&cli.Uint32Flag{
+			Name:    "rate-limit.min-limit",
+			Usage:   "The minimum number of requests to permit. Ratelimiting will not reject requests if the number of currently running requests is below this value. Set to 0 to disable minimum (not recommended).",
+			Value:   3,
+			Sources: FromConfig("rate-limit.min-limit", "RATE_LIMIT_MIN_LIMIT"),
+		},
+		&cli.Uint32Flag{
+			Name:    "rate-limit.max-limit",
+			Usage:   "The maximum number of requests to permit. Ratelimiting will reject requests if the number of currently running requests is at or above this value. Set to 0 to disable maximum. The v4 service used 5 by default.",
+			Value:   0,
+			Sources: FromConfig("rate-limit.max-limit", "RATE_LIMIT_MAX_LIMIT"),
+		},
+		&cli.Uint64Flag{
+			Name:  "rate-limit.max-available",
+			Usage: "The maximum amount of memory (in bytes) available to processes. If more memory exists, only this amount is used. 0 disables the maximum.",
+			Value: 0,
+			// NOTE(review): GOMEMLIMIT may carry a unit suffix (e.g. "1GiB"),
+			// which is not a plain unsigned integer; confirm how such a value
+			// is handled when it is picked up as a source here.
+			Sources: FromConfig("rate-limit.max-available", "RATE_LIMIT_MAX_AVAILABLE", "GOMEMLIMIT"),
+		},
+		&cli.Uint64Flag{
+			Name:    "rate-limit.min-memory-per-browser",
+			Usage:   "The minimum amount of memory (in bytes) each browser process is expected to use. Set to 0 to disable the minimum.",
+			Value:   64 * 1024 * 1024, // 64 MiB
+			Sources: FromConfig("rate-limit.min-memory-per-browser", "RATE_LIMIT_MIN_MEMORY_PER_BROWSER"),
+		},
+		&cli.Uint64Flag{
+			Name:    "rate-limit.headroom",
+			Usage:   "The amount of memory (in bytes) to leave as headroom after allocating memory for browser processes. Set to 0 to disable headroom.",
+			Value:   32 * 1024 * 1024, // 32 MiB
+			Sources: FromConfig("rate-limit.headroom", "RATE_LIMIT_HEADROOM"),
+		},
+	}
+}
+
+// RateLimitConfigFromCommand reads the rate-limit flags off the parsed command
+// and assembles them into a RateLimitConfig. The returned error is currently
+// always nil.
+func RateLimitConfigFromCommand(c *cli.Command) (RateLimitConfig, error) {
+	var cfg RateLimitConfig
+
+	cfg.Disabled = c.Bool("rate-limit.disabled")
+
+	// Process tracker sampling.
+	cfg.TrackerDecay = c.Int64("rate-limit.process-tracker.decay")
+	cfg.TrackerInterval = c.Duration("rate-limit.process-tracker.interval")
+
+	// Hard bounds on concurrent requests.
+	cfg.MinLimit = c.Uint32("rate-limit.min-limit")
+	cfg.MaxLimit = c.Uint32("rate-limit.max-limit")
+
+	// Memory-based sizing.
+	cfg.MaxAvailable = c.Uint64("rate-limit.max-available")
+	cfg.MinMemoryPerBrowser = c.Uint64("rate-limit.min-memory-per-browser")
+	cfg.Headroom = c.Uint64("rate-limit.headroom")
+
+	return cfg, nil
+}
diff --git a/pkg/metrics/registry.go b/pkg/metrics/registry.go
@@ -16,6 +16,8 @@ func NewRegistry() *prometheus.Registry {
 		collectors.NewBuildInfoCollector(),
 
 		middleware.MetricAuthenticatedRequestAttempt,
+		middleware.MetricRateLimiterSlots,
+		middleware.MetricRateLimiterRequests,
 		middleware.MetricRequestsInFlight,
 		middleware.MetricRequestDurations,
 		middleware.MetricRecoveredRequests,
diff --git a/pkg/service/browser.go b/pkg/service/browser.go
diff --git a/pkg/service/process.go b/pkg/service/process.go
diff --git a/tests/acceptance/fixtures/dashboards/grafana-image-renderer.json b/tests/acceptance/fixtures/dashboards/grafana-image-renderer.json

Original file line number	Diff line number	Diff line change
`@@ -8,7 +8,7 @@ import (`
`8`	`8`	`)`
`9`	`9`
`10`	`10`	`var MetricAuthenticatedRequestAttempt = prometheus.NewCounterVec(prometheus.CounterOpts{`
`11`		`- Name: "http_authenticated_request_attempt",`
	`11`	`+ Name: "http_authenticated_request_attempts_total",`
`12`	`12`	`Help: "Counts the attempts of authenticated requests",`
`13`	`13`	`}, []string{"result"})`
`14`	`14`
Original file line number	Diff line number	Diff line change
`@@ -9,7 +9,7 @@ import (`
`9`	`9`	`)`
`10`	`10`
`11`	`11`	`var MetricRecoveredRequests = prometheus.NewCounter(prometheus.CounterOpts{`
`12`		`- Name: "http_recovered_requests",`
	`12`	`+ Name: "http_recovered_requests_total",`
`13`	`13`	`Help: "How many HTTP requests have panicked but recovered to not crash the application?",`
`14`	`14`	`})`
`15`	`15`