This repository was archived by the owner on Jul 19, 2023. It is now read-only.

Commit 05b16c9

Improve Worker timeout (#506)
* Setup timeout for frontend and querier worker
* Increase timeout by 2 when using query-scheduler
* Review feedback
1 parent 8f4777c commit 05b16c9

File tree: 5 files changed, +127 −58 lines

  pkg/frontend/frontend.go
  pkg/frontend/frontend_scheduler_worker.go
  pkg/phlare/modules.go
  pkg/querier/worker/scheduler_processor.go
  pkg/querier/worker/worker.go

pkg/frontend/frontend.go

Lines changed: 1 addition & 0 deletions
@@ -52,6 +52,7 @@ type Config struct {
 
 	// This configuration is injected internally.
 	QuerySchedulerDiscovery schedulerdiscovery.Config `yaml:"-"`
+	MaxLoopDuration time.Duration `yaml:"-"`
 }
 
 func (cfg *Config) RegisterFlags(f *flag.FlagSet, logger log.Logger) {
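
A note on the tag: like the neighbouring QuerySchedulerDiscovery field, the new MaxLoopDuration fields are tagged `yaml:"-"`, so they never appear in the user-facing configuration file and are only set programmatically (see pkg/phlare/modules.go below). A minimal illustration of the tag's effect, using a hypothetical exampleConfig type and gopkg.in/yaml.v3 rather than anything from this repo:

package main

import (
	"fmt"

	"gopkg.in/yaml.v3"
)

// exampleConfig is a made-up type: the only point is that a field tagged
// `yaml:"-"` is skipped by the YAML codec, which keeps internally-injected
// settings such as MaxLoopDuration out of the config file.
type exampleConfig struct {
	LogLevel        string `yaml:"log_level"`
	MaxLoopDuration string `yaml:"-"`
}

func main() {
	out, err := yaml.Marshal(exampleConfig{LogLevel: "info", MaxLoopDuration: "27s"})
	if err != nil {
		panic(err)
	}
	fmt.Print(string(out)) // prints only: log_level: info
}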

pkg/frontend/frontend_scheduler_worker.go

Lines changed: 31 additions & 4 deletions
@@ -8,6 +8,7 @@ package frontend
 import (
 	"context"
 	"io"
+	"math/rand"
 	"net/http"
 	"sync"
 	"time"
@@ -28,6 +29,10 @@ import (
 	"github.com/grafana/phlare/pkg/util/servicediscovery"
 )
 
+func init() {
+	rand.Seed(time.Now().UnixNano())
+}
+
 const (
 	schedulerAddressLabel = "scheduler_address"
 	// schedulerWorkerCancelChanCapacity should be at least as big as the number of sub-queries issued by a single query
@@ -135,7 +140,7 @@ func (f *frontendSchedulerWorkers) addScheduler(address string) {
 	}
 
 	// No worker for this address yet, start a new one.
-	w = newFrontendSchedulerWorker(conn, address, f.frontendAddress, f.requestsCh, f.cfg.WorkerConcurrency, f.enqueuedRequests.WithLabelValues(address), f.log)
+	w = newFrontendSchedulerWorker(conn, address, f.frontendAddress, f.requestsCh, f.cfg.WorkerConcurrency, f.enqueuedRequests.WithLabelValues(address), f.cfg.MaxLoopDuration, f.log)
 
 	f.mu.Lock()
 	defer f.mu.Unlock()
@@ -229,9 +234,11 @@ type frontendSchedulerWorker struct {
 
 	// Number of queries sent to this scheduler.
 	enqueuedRequests prometheus.Counter
+
+	maxLoopDuration time.Duration
 }
 
-func newFrontendSchedulerWorker(conn *grpc.ClientConn, schedulerAddr string, frontendAddr string, requestCh <-chan *frontendRequest, concurrency int, enqueuedRequests prometheus.Counter, log log.Logger) *frontendSchedulerWorker {
+func newFrontendSchedulerWorker(conn *grpc.ClientConn, schedulerAddr string, frontendAddr string, requestCh <-chan *frontendRequest, concurrency int, enqueuedRequests prometheus.Counter, maxLoopDuration time.Duration, log log.Logger) *frontendSchedulerWorker {
 	w := &frontendSchedulerWorker{
 		log: log,
 		conn: conn,
@@ -241,6 +248,7 @@ func newFrontendSchedulerWorker(conn *grpc.ClientConn, schedulerAddr string, fro
 		requestCh: requestCh,
 		cancelCh: make(chan uint64, schedulerWorkerCancelChanCapacity),
 		enqueuedRequests: enqueuedRequests,
+		maxLoopDuration: maxLoopDuration,
 	}
 	w.ctx, w.cancel = context.WithCancel(context.Background())
 
@@ -308,6 +316,11 @@ func (w *frontendSchedulerWorker) runOne(ctx context.Context, client schedulerpb
 	}
 }
 
+func jitter(d time.Duration, factor float64) time.Duration {
+	maxJitter := time.Duration(float64(d) * factor)
+	return d - time.Duration(rand.Int63n(int64(maxJitter)))
+}
+
 func (w *frontendSchedulerWorker) schedulerLoop(loop schedulerpb.SchedulerForFrontend_FrontendLoopClient) error {
 	if err := loop.Send(&schedulerpb.FrontendToScheduler{
 		Type: schedulerpb.FrontendToSchedulerType_INIT,
@@ -323,7 +336,22 @@ func (w *frontendSchedulerWorker) schedulerLoop(loop schedulerpb.SchedulerForFro
 		return errors.Errorf("unexpected status received for init: %v", resp.Status)
 	}
 
-	ctx := loop.Context()
+	ctx, cancel := context.WithCancel(loop.Context())
+	defer cancel()
+	if w.maxLoopDuration > 0 {
+		go func() {
+			timer := time.NewTimer(jitter(w.maxLoopDuration, 0.3))
+			defer timer.Stop()
+
+			select {
+			case <-ctx.Done():
+				return
+			case <-timer.C:
+				cancel()
+				return
+			}
+		}()
+	}
 
 	for {
 		select {
@@ -335,7 +363,6 @@ func (w *frontendSchedulerWorker) schedulerLoop(loop schedulerpb.SchedulerForFro
 			// Reporting error here would delay reopening the stream (if the worker context is not done yet).
 			level.Debug(w.log).Log("msg", "stream context finished", "err", ctx.Err())
 			return nil
-
 		case req := <-w.requestCh:
 			err := loop.Send(&schedulerpb.FrontendToScheduler{
 				Type: schedulerpb.FrontendToSchedulerType_ENQUEUE,
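
The jitter helper and the timer goroutine above give each frontend worker a bounded, randomized stream lifetime: the worker derives a cancellable context from the scheduler stream and cancels it somewhere between 70% and 100% of MaxLoopDuration (factor 0.3), so workers started together re-dial the scheduler at staggered times. A self-contained sketch of the same pattern, with hypothetical names (runLoop, maxLoopDuration) rather than the commit's actual code:

package main

import (
	"context"
	"fmt"
	"math/rand"
	"time"
)

// jitter is the same helper the commit adds: it returns a duration between
// d*(1-factor) and d, so loops started together expire at staggered times.
func jitter(d time.Duration, factor float64) time.Duration {
	maxJitter := time.Duration(float64(d) * factor)
	return d - time.Duration(rand.Int63n(int64(maxJitter)))
}

// runLoop is a hypothetical stand-in for schedulerLoop: it derives a child
// context from the stream context and cancels it once the jittered timer
// fires, forcing the caller to re-establish the stream.
func runLoop(streamCtx context.Context, maxLoopDuration time.Duration) {
	ctx, cancel := context.WithCancel(streamCtx)
	defer cancel()

	if maxLoopDuration > 0 {
		go func() {
			timer := time.NewTimer(jitter(maxLoopDuration, 0.3))
			defer timer.Stop()
			select {
			case <-ctx.Done():
			case <-timer.C:
				cancel() // end the loop below so the worker reconnects
			}
		}()
	}

	<-ctx.Done() // stand-in for the select loop over incoming requests
	fmt.Println("loop ended:", ctx.Err())
}

func main() {
	rand.Seed(time.Now().UnixNano())
	runLoop(context.Background(), 2*time.Second)
}

Note that rand.Int63n panics if its argument is not positive, so jitter relies on maxJitter being non-zero; the maxLoopDuration > 0 guard covers this for any realistic HTTP-timeout-derived duration.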

pkg/phlare/modules.go

Lines changed: 15 additions & 13 deletions
@@ -4,8 +4,8 @@ import (
 	"context"
 	"fmt"
 	"os"
+	"time"
 
-	"github.com/bufbuild/connect-go"
 	"github.com/felixge/fgprof"
 	"github.com/go-kit/log"
 	"github.com/go-kit/log/level"
@@ -116,28 +116,24 @@ func (f *Phlare) initQueryScheduler() (services.Service, error) {
 		return nil, errors.Wrap(err, "query-scheduler init")
 	}
 	schedulerpbconnect.RegisterSchedulerForFrontendHandler(f.Server.HTTP, s)
-	schedulerpbconnect.RegisterSchedulerForQuerierHandler(f.Server.HTTP, s, f.schedulerQuerierTimeout())
+	schedulerpbconnect.RegisterSchedulerForQuerierHandler(f.Server.HTTP, s)
 	return s, nil
 }
 
-// schedulerQuerierTimeout returns a HandlerOption that sets the timeout for the
-// communication between the scheduler and the querier.
-// This is required because connect streaming handler does not propagate timeouts
-// through the context.
-// Adding a timeout options to the handler enforce the timeout to be propagated
-// and cancel the stream if the timeout is reached.
-// Querier expects this and will gracefully reconnects.
-func (f *Phlare) schedulerQuerierTimeout() connect.HandlerOption {
-	opts := []connect.HandlerOption{}
+// setupWorkerTimeout sets the max loop duration for the querier worker and frontend worker
+// to 90% of the read or write http timeout, whichever is smaller.
+// This is to ensure that the worker doesn't timeout before the http handler and that the connection
+// is refreshed.
+func (f *Phlare) setupWorkerTimeout() {
 	timeout := f.Cfg.Server.HTTPServerReadTimeout
 	if f.Cfg.Server.HTTPServerWriteTimeout < timeout {
 		timeout = f.Cfg.Server.HTTPServerWriteTimeout
 	}
 
 	if timeout > 0 {
-		opts = append(opts, connect.WithInterceptors(util.WithTimeout(timeout)))
+		f.Cfg.Worker.MaxLoopDuration = time.Duration(float64(timeout) * 0.9)
+		f.Cfg.Frontend.MaxLoopDuration = time.Duration(float64(timeout) * 0.9)
 	}
-	return connect.WithHandlerOptions(opts...)
 }
 
 func (f *Phlare) initQuerier() (services.Service, error) {
@@ -309,6 +305,12 @@ func (f *Phlare) initServer() (services.Service, error) {
 	// see https://github.com/grafana/phlare/issues/231
 	f.Cfg.Server.DoNotAddDefaultHTTPMiddleware = true
 
+	f.setupWorkerTimeout()
+	if f.isModuleActive(QueryScheduler) {
+		// to ensure that the query scheduler is always able to handle the request, we need to double the timeout
+		f.Cfg.Server.HTTPServerReadTimeout = 2 * f.Cfg.Server.HTTPServerReadTimeout
+		f.Cfg.Server.HTTPServerWriteTimeout = 2 * f.Cfg.Server.HTTPServerWriteTimeout
+	}
 	serv, err := server.New(f.Cfg.Server)
 	if err != nil {
 		return nil, err
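
To make the derivation concrete, here is a small standalone sketch of the arithmetic setupWorkerTimeout and initServer now perform; the 30s/1m values are invented for illustration and are not defaults from this repo:

package main

import (
	"fmt"
	"time"
)

func main() {
	// Hypothetical server settings, for illustration only.
	readTimeout := 30 * time.Second
	writeTimeout := 1 * time.Minute
	querySchedulerActive := true

	// Same derivation as setupWorkerTimeout: 90% of the smaller HTTP timeout.
	timeout := readTimeout
	if writeTimeout < timeout {
		timeout = writeTimeout
	}
	maxLoopDuration := time.Duration(float64(timeout) * 0.9)

	// Same adjustment as initServer: double the server timeouts when the
	// query-scheduler module is active, mirroring the commit's comment that the
	// scheduler must stay able to handle requests while workers refresh.
	if querySchedulerActive {
		readTimeout *= 2
		writeTimeout *= 2
	}

	fmt.Println("worker/frontend MaxLoopDuration:", maxLoopDuration) // 27s
	fmt.Println("server read/write timeouts:", readTimeout, writeTimeout)
}

With these inputs the workers refresh their scheduler loops after roughly 27s (minus jitter), comfortably inside the doubled 1m/2m server timeouts.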

pkg/querier/worker/scheduler_processor.go

Lines changed: 79 additions & 41 deletions
@@ -8,6 +8,7 @@ package worker
 import (
 	"context"
 	"fmt"
+	"math/rand"
 	"net/http"
 	"strings"
 	"time"
@@ -37,18 +38,23 @@ import (
 	"github.com/grafana/phlare/pkg/util/httpgrpcutil"
 )
 
+func init() {
+	rand.Seed(time.Now().UnixNano())
+}
+
 var processorBackoffConfig = backoff.Config{
 	MinBackoff: 250 * time.Millisecond,
 	MaxBackoff: 2 * time.Second,
 }
 
 func newSchedulerProcessor(cfg Config, handler RequestHandler, log log.Logger, reg prometheus.Registerer) (*schedulerProcessor, []services.Service) {
 	p := &schedulerProcessor{
-		log:            log,
-		handler:        handler,
-		maxMessageSize: cfg.GRPCClientConfig.MaxSendMsgSize,
-		querierID:      cfg.QuerierID,
-		grpcConfig:     cfg.GRPCClientConfig,
+		log:             log,
+		handler:         handler,
+		maxMessageSize:  cfg.GRPCClientConfig.MaxSendMsgSize,
+		querierID:       cfg.QuerierID,
+		grpcConfig:      cfg.GRPCClientConfig,
+		maxLoopDuration: cfg.MaxLoopDuration,
 
 		schedulerClientFactory: func(conn *grpc.ClientConn) schedulerpb.SchedulerForQuerierClient {
 			return schedulerpb.NewSchedulerForQuerierClient(conn)
@@ -78,11 +84,12 @@ func newSchedulerProcessor(cfg Config, handler RequestHandler, log log.Logger, r
 
 // Handles incoming queries from query-scheduler.
 type schedulerProcessor struct {
-	log            log.Logger
-	handler        RequestHandler
-	grpcConfig     grpcclient.Config
-	maxMessageSize int
-	querierID      string
+	log             log.Logger
+	handler         RequestHandler
+	grpcConfig      grpcclient.Config
+	maxMessageSize  int
+	querierID       string
+	maxLoopDuration time.Duration
 
 	frontendPool *client.Pool
 	frontendClientRequestDuration *prometheus.HistogramVec
@@ -111,42 +118,68 @@ func (sp *schedulerProcessor) processQueriesOnSingleStream(workerCtx context.Con
 
 	backoff := backoff.New(execCtx, processorBackoffConfig)
 	for backoff.Ongoing() {
-		c, err := schedulerClient.QuerierLoop(execCtx)
-		if err == nil {
-			err = c.Send(&schedulerpb.QuerierToScheduler{QuerierID: sp.querierID})
-		}
-
-		if err != nil {
-			level.Warn(sp.log).Log("msg", "error contacting scheduler", "err", err, "addr", address)
-			backoff.Wait()
-			continue
-		}
-
-		if err := sp.querierLoop(c, address, inflightQuery); err != nil {
-			// Do not log an error is the query-scheduler is shutting down.
-			if s, ok := status.FromError(err); !ok ||
-				(!strings.Contains(s.Message(), schedulerpb.ErrSchedulerIsNotRunning.Error()) &&
-					!strings.Contains(s.Message(), context.DeadlineExceeded.Error()) &&
-					!strings.Contains(s.Message(), "stream terminated")) {
-				level.Error(sp.log).Log("msg", "error processing requests from scheduler", "err", err, "addr", address)
-			}
-			if strings.Contains(err.Error(), context.DeadlineExceeded.Error()) || strings.Contains(err.Error(), "stream terminated") {
-				backoff.Reset()
-				continue
+		func() {
+			if err := sp.querierLoop(execCtx, schedulerClient, address, inflightQuery); err != nil {
+				// Do not log an error is the query-scheduler is shutting down.
+				if s, ok := status.FromError(err); !ok ||
+					(!strings.Contains(s.Message(), schedulerpb.ErrSchedulerIsNotRunning.Error()) &&
+						!strings.Contains(s.Message(), context.Canceled.Error()) &&
+						!strings.Contains(s.Message(), "stream terminated")) {
+					level.Error(sp.log).Log("msg", "error processing requests from scheduler", "err", err, "addr", address)
+				}
+				if strings.Contains(err.Error(), context.Canceled.Error()) || strings.Contains(err.Error(), "stream terminated") {
+					backoff.Reset()
+					return
+				}
+				backoff.Wait()
+				return
 			}
-			backoff.Wait()
-			continue
-		}
 
-		backoff.Reset()
+			backoff.Reset()
+		}()
 	}
 }
 
 // process loops processing requests on an established stream.
-func (sp *schedulerProcessor) querierLoop(c schedulerpb.SchedulerForQuerier_QuerierLoopClient, address string, inflightQuery *atomic.Bool) error {
-	// Build a child context so we can cancel a query when the stream is closed.
-	ctx, cancel := context.WithCancel(c.Context())
-	defer cancel()
+func (sp *schedulerProcessor) querierLoop(parentCtx context.Context, schedulerClient schedulerpb.SchedulerForQuerierClient, address string, inflightQuery *atomic.Bool) error {
+	loopCtx, loopCancel := context.WithCancel(parentCtx)
+	defer loopCancel()
+
+	if sp.maxLoopDuration > 0 {
+		go func() {
+			timer := time.NewTimer(jitter(sp.maxLoopDuration, 0.3))
+			defer timer.Stop()
+
+			select {
+			case <-timer.C:
+				level.Debug(sp.log).Log("msg", "waiting for inflight queries to complete")
+				for inflightQuery.Load() {
+					select {
+					case <-parentCtx.Done():
+						// In the meanwhile, the execution context has been explicitly canceled, so we should just terminate.
+						return
+					default:
+						// Wait and check again inflight queries.
+						time.Sleep(100 * time.Millisecond)
+					}
+				}
+				level.Debug(sp.log).Log("msg", "refreshing scheduler connection")
+				loopCancel()
+			case <-parentCtx.Done():
+				return
+			}
+		}()
+	}
+
+	c, err := schedulerClient.QuerierLoop(loopCtx)
+	if err == nil {
+		err = c.Send(&schedulerpb.QuerierToScheduler{QuerierID: sp.querierID})
+	}
+
+	if err != nil {
+		level.Warn(sp.log).Log("msg", "error contacting scheduler", "err", err, "addr", address)
+		return err
+	}
 
 	for {
 		request, err := c.Recv()
@@ -165,7 +198,7 @@ func (sp *schedulerProcessor) querierLoop(c schedulerpb.SchedulerForQuerier_Quer
 		defer inflightQuery.Store(false)
 
 		// We need to inject user into context for sending response back.
-		ctx := user.InjectOrgID(ctx, request.UserID)
+		ctx := user.InjectOrgID(c.Context(), request.UserID)
 
 		tracer := opentracing.GlobalTracer()
 		// Ignore errors here. If we cannot get parent span, we just don't create new one.
@@ -188,6 +221,11 @@ func (sp *schedulerProcessor) querierLoop(c schedulerpb.SchedulerForQuerier_Quer
 	}
 }
 
+func jitter(d time.Duration, factor float64) time.Duration {
+	maxJitter := time.Duration(float64(d) * factor)
+	return d - time.Duration(rand.Int63n(int64(maxJitter)))
+}
+
 func (sp *schedulerProcessor) runRequest(ctx context.Context, logger log.Logger, queryID uint64, frontendAddress string, statsEnabled bool, request *httpgrpc.HTTPRequest) {
 	var stats *querier_stats.Stats
 	if statsEnabled {
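
On the querier side the jittered timer is paired with a drain step: when the timer fires, the goroutine waits for any inflight query to finish (polling every 100ms) before cancelling the loop context, and the surrounding backoff loop then re-establishes QuerierLoop against the scheduler. A rough standalone sketch of that drain-then-cancel shape, using the standard library's sync/atomic.Bool in place of the repo's atomic package and a made-up name (drainThenCancel):

package main

import (
	"context"
	"fmt"
	"sync/atomic"
	"time"
)

// drainThenCancel mirrors the goroutine added to querierLoop: once maxLoopDuration
// elapses, wait until no query is inflight, then cancel the loop context so the
// caller tears down the stream and reconnects.
func drainThenCancel(parentCtx context.Context, loopCancel context.CancelFunc, inflight *atomic.Bool, maxLoopDuration time.Duration) {
	timer := time.NewTimer(maxLoopDuration)
	defer timer.Stop()

	select {
	case <-timer.C:
		for inflight.Load() {
			select {
			case <-parentCtx.Done():
				return // execution context canceled; nothing left to refresh
			default:
				time.Sleep(100 * time.Millisecond) // poll the inflight flag
			}
		}
		loopCancel()
	case <-parentCtx.Done():
	}
}

func main() {
	var inflight atomic.Bool
	inflight.Store(true)

	loopCtx, loopCancel := context.WithCancel(context.Background())
	go drainThenCancel(context.Background(), loopCancel, &inflight, 200*time.Millisecond)

	// Simulate an inflight query finishing after the timer has already fired.
	time.Sleep(500 * time.Millisecond)
	inflight.Store(false)

	<-loopCtx.Done()
	fmt.Println("loop context canceled; the worker would now re-dial the scheduler")
}

This is also why the error handling above now treats context.Canceled (rather than context.DeadlineExceeded) as a benign reason to reset the backoff and reconnect immediately.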

pkg/querier/worker/worker.go

Lines changed: 1 addition & 0 deletions
@@ -35,6 +35,7 @@ type Config struct {
 	// This configuration is injected internally.
 	MaxConcurrentRequests int `yaml:"-"` // Must be same as passed to PromQL Engine.
 	QuerySchedulerDiscovery schedulerdiscovery.Config `yaml:"-"`
+	MaxLoopDuration time.Duration `yaml:"-"`
 }
 
 func (cfg *Config) RegisterFlags(f *flag.FlagSet) {
