Skip to content

Commit da59f58

Browse files
craig[bot]zachlite
andcommitted
107796: ui, server: add a timeout per node while collecting hot ranges r=zachlite a=zachlite Requests for hot ranges are serviced by a cluster wide fan-out, where non-trivial work is done on each node to provide a response. For each store, and for each hot range, we start a transaction with KV to look up descriptor info. Previously, there was no upper-bound set on the time a node could take to provide a response. This commit introduces a per-node timeout in the pagination logic, and is configurable with the new cluster setting server.hot_ranges.node.timeout. A value of 0 will disable the timeout. Error behavior and semantics are preserved. If a particular node times out, The fan-out continues as before, as if a node failed to provide a response. Informs cockroachdb#104269 Resolves cockroachdb#107627 Epic: none Release note (ops change): Added a new cluster setting named server.hot_ranges.node.timeout, with a default value of 5 minutes. The setting controls the maximum amount of time that a hot ranges request will spend waiting for a node to provide a response. Set to 0 to disable timeouts. Co-authored-by: zachlite <[email protected]>
2 parents e9add29 + 472d414 commit da59f58

File tree

9 files changed

+115
-17
lines changed

9 files changed

+115
-17
lines changed

docs/generated/settings/settings-for-tenants.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,7 @@ server.clock.persist_upper_bound_interval duration 0s the interval between persi
5858
server.eventlog.enabled boolean true if set, logged notable events are also stored in the table system.eventlog tenant-rw
5959
server.eventlog.ttl duration 2160h0m0s if nonzero, entries in system.eventlog older than this duration are periodically purged tenant-rw
6060
server.host_based_authentication.configuration string host-based authentication configuration to use during connection authentication tenant-rw
61+
server.hot_ranges_request.node.timeout duration 5m0s the duration allowed for a single node to return hot range data before the request is cancelled; if set to 0, there is no timeout tenant-rw
6162
server.hsts.enabled boolean false if true, HSTS headers will be sent along with all HTTP requests. The headers will contain a max-age setting of one year. Browsers honoring the header will always use HTTPS to access the DB Console. Ensure that TLS is correctly configured prior to enabling. tenant-rw
6263
server.identity_map.configuration string system-identity to database-username mappings tenant-rw
6364
server.log_gc.max_deletions_per_cycle integer 1000 the maximum number of entries to delete on each purge of log-like system tables tenant-rw

docs/generated/settings/settings.html

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,7 @@
8888
<tr><td><div id="setting-server-eventlog-enabled" class="anchored"><code>server.eventlog.enabled</code></div></td><td>boolean</td><td><code>true</code></td><td>if set, logged notable events are also stored in the table system.eventlog</td><td>Serverless/Dedicated/Self-Hosted</td></tr>
8989
<tr><td><div id="setting-server-eventlog-ttl" class="anchored"><code>server.eventlog.ttl</code></div></td><td>duration</td><td><code>2160h0m0s</code></td><td>if nonzero, entries in system.eventlog older than this duration are periodically purged</td><td>Serverless/Dedicated/Self-Hosted</td></tr>
9090
<tr><td><div id="setting-server-host-based-authentication-configuration" class="anchored"><code>server.host_based_authentication.configuration</code></div></td><td>string</td><td><code></code></td><td>host-based authentication configuration to use during connection authentication</td><td>Serverless/Dedicated/Self-Hosted</td></tr>
91+
<tr><td><div id="setting-server-hot-ranges-request-node-timeout" class="anchored"><code>server.hot_ranges_request.node.timeout</code></div></td><td>duration</td><td><code>5m0s</code></td><td>the duration allowed for a single node to return hot range data before the request is cancelled; if set to 0, there is no timeout</td><td>Serverless/Dedicated/Self-Hosted</td></tr>
9192
<tr><td><div id="setting-server-hsts-enabled" class="anchored"><code>server.hsts.enabled</code></div></td><td>boolean</td><td><code>false</code></td><td>if true, HSTS headers will be sent along with all HTTP requests. The headers will contain a max-age setting of one year. Browsers honoring the header will always use HTTPS to access the DB Console. Ensure that TLS is correctly configured prior to enabling.</td><td>Serverless/Dedicated/Self-Hosted</td></tr>
9293
<tr><td><div id="setting-server-identity-map-configuration" class="anchored"><code>server.identity_map.configuration</code></div></td><td>string</td><td><code></code></td><td>system-identity to database-username mappings</td><td>Serverless/Dedicated/Self-Hosted</td></tr>
9394
<tr><td><div id="setting-server-log-gc-max-deletions-per-cycle" class="anchored"><code>server.log_gc.max_deletions_per_cycle</code></div></td><td>integer</td><td><code>1000</code></td><td>the maximum number of entries to delete on each purge of log-like system tables</td><td>Serverless/Dedicated/Self-Hosted</td></tr>

pkg/server/BUILD.bazel

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ go_library(
2727
"fanout_clients.go",
2828
"grpc_gateway.go",
2929
"grpc_server.go",
30+
"hot_ranges.go",
3031
"import_ts.go",
3132
"index_usage_stats.go",
3233
"init.go",

pkg/server/api_v2_ranges.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -566,8 +566,9 @@ func (a *apiV2Server) listHotRanges(w http.ResponseWriter, r *http.Request) {
566566
})
567567
}
568568

569+
timeout := HotRangesRequestNodeTimeout.Get(&a.status.st.SV)
569570
next, err := a.status.paginatedIterateNodes(
570-
ctx, "hot ranges", limit, start, requestedNodes, dialFn,
571+
ctx, "hot ranges", limit, start, requestedNodes, timeout, dialFn,
571572
nodeFn, responseFn, errorFn)
572573

573574
if err != nil {

pkg/server/hot_ranges.go

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
// Copyright 2023 The Cockroach Authors.
2+
//
3+
// Use of this software is governed by the Business Source License
4+
// included in the file licenses/BSL.txt.
5+
//
6+
// As of the Change Date specified in that file, in accordance with
7+
// the Business Source License, use of this software will be governed
8+
// by the Apache License, Version 2.0, included in the file
9+
// licenses/APL.txt.
10+
11+
package server
12+
13+
import (
14+
"time"
15+
16+
"github.com/cockroachdb/cockroach/pkg/settings"
17+
)
18+
19+
// HotRangesRequestNodeTimeout controls the timeout of a serverpb.HotRangesRequest.
20+
// A default value of 5 minutes is meant to be an escape hatch for a node that is taking
21+
// too long to fulfill the request. The setting should be lowered for a faster response,
22+
// at the expense of possibly incomplete data, or raised for complete data, at the cost
23+
// of a possibly slower response.
24+
var HotRangesRequestNodeTimeout = settings.RegisterDurationSetting(
25+
settings.TenantWritable,
26+
"server.hot_ranges_request.node.timeout",
27+
"the duration allowed for a single node to return hot range data before the request is cancelled; if set to 0, there is no timeout",
28+
time.Minute*5,
29+
settings.NonNegativeDuration,
30+
).WithPublic()

pkg/server/pagination.go

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ import (
2323
"strings"
2424
"sync"
2525
"sync/atomic"
26+
"time"
2627

2728
"github.com/cockroachdb/cockroach/pkg/base"
2829
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/liveness/livenesspb"
@@ -323,10 +324,14 @@ func (r *rpcNodePaginator) init() {
323324
r.responseChan = make(chan paginatedNodeResponse, r.numNodes)
324325
}
325326

327+
const noTimeout time.Duration = 0
328+
326329
// queryNode queries the given node, and sends the responses back through responseChan
327330
// in order of idx (i.e. when all nodes with a lower idx have already sent theirs).
328331
// Safe for concurrent use.
329-
func (r *rpcNodePaginator) queryNode(ctx context.Context, nodeID roachpb.NodeID, idx int) {
332+
func (r *rpcNodePaginator) queryNode(
333+
ctx context.Context, nodeID roachpb.NodeID, idx int, timeout time.Duration,
334+
) {
330335
if atomic.LoadInt32(&r.done) != 0 {
331336
// There are more values than we need. currentLen >= limit.
332337
return
@@ -379,7 +384,18 @@ func (r *rpcNodePaginator) queryNode(ctx context.Context, nodeID roachpb.NodeID,
379384
return
380385
}
381386

382-
res, err := r.nodeFn(ctx, client, nodeID)
387+
var res interface{}
388+
var err error
389+
if timeout == noTimeout {
390+
res, err = r.nodeFn(ctx, client, nodeID)
391+
} else {
392+
err = timeutil.RunWithTimeout(ctx, "node fn", timeout, func(ctx context.Context) error {
393+
var _err error
394+
res, _err = r.nodeFn(ctx, client, nodeID)
395+
return _err
396+
})
397+
}
398+
383399
if err != nil {
384400
err = errors.Wrapf(err, "error requesting %s from node %d (%s)",
385401
r.errorCtx, nodeID, r.nodeStatuses[serverID(nodeID)])

pkg/server/pagination_test.go

Lines changed: 54 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,9 +19,11 @@ import (
1919
"testing"
2020
"time"
2121

22+
"github.com/cockroachdb/cockroach/pkg/base"
2223
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/liveness/livenesspb"
2324
"github.com/cockroachdb/cockroach/pkg/roachpb"
2425
"github.com/cockroachdb/cockroach/pkg/testutils/datapathutils"
26+
"github.com/cockroachdb/cockroach/pkg/testutils/serverutils"
2527
"github.com/cockroachdb/cockroach/pkg/util/leaktest"
2628
"github.com/cockroachdb/cockroach/pkg/util/log"
2729
"github.com/cockroachdb/datadriven"
@@ -389,7 +391,7 @@ func TestRPCPaginator(t *testing.T) {
389391
// Issue requests in parallel.
390392
for idx, nodeID := range nodesToQuery {
391393
go func(nodeID roachpb.NodeID, idx int) {
392-
paginator.queryNode(ctx, nodeID, idx)
394+
paginator.queryNode(ctx, nodeID, idx, noTimeout)
393395
}(nodeID, idx)
394396
}
395397

@@ -412,3 +414,54 @@ func TestRPCPaginator(t *testing.T) {
412414
}
413415

414416
}
417+
418+
func TestRPCPaginatorWithTimeout(t *testing.T) {
419+
defer leaktest.AfterTest(t)()
420+
defer log.Scope(t).Close(t)
421+
ctx := context.Background()
422+
server := serverutils.StartServerOnly(t, base.TestServerArgs{})
423+
defer server.Stopper().Stop(ctx)
424+
425+
s := server.StatusServer().(*systemStatusServer)
426+
427+
dialFn := func(ctx context.Context, nodeID roachpb.NodeID) (interface{}, error) {
428+
client, err := s.dialNode(ctx, nodeID)
429+
return client, err
430+
}
431+
nodeFn := func(ctx context.Context, client interface{}, nodeID roachpb.NodeID) (interface{}, error) {
432+
select {
433+
case <-time.After(time.Second * 10):
434+
case <-ctx.Done():
435+
break
436+
}
437+
438+
// Return an error that mimics the error returned when a rpc's context is cancelled:
439+
return nil, errors.New("some error")
440+
}
441+
responseFn := func(nodeID roachpb.NodeID, resp interface{}) {
442+
// noop
443+
}
444+
445+
var timeoutError error
446+
errorFn := func(nodeID roachpb.NodeID, err error) {
447+
timeoutError = err
448+
log.Infof(ctx, "error from node %d: %v", nodeID, err)
449+
}
450+
451+
pagState := paginationState{}
452+
453+
_, _ = s.paginatedIterateNodes(
454+
ctx,
455+
"test-paginate-with-timeout",
456+
1000,
457+
pagState,
458+
[]roachpb.NodeID{},
459+
time.Second*2,
460+
dialFn,
461+
nodeFn,
462+
responseFn,
463+
errorFn,
464+
)
465+
466+
require.ErrorContains(t, timeoutError, "operation \"node fn\" timed out")
467+
}

pkg/server/status.go

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2809,8 +2809,9 @@ func (s *systemStatusServer) HotRangesV2(
28092809
response.ErrorsByNodeID[nodeID] = err.Error()
28102810
}
28112811

2812+
timeout := HotRangesRequestNodeTimeout.Get(&s.st.SV)
28122813
next, err := s.paginatedIterateNodes(
2813-
ctx, "hotRanges", size, start, requestedNodes, dialFn,
2814+
ctx, "hotRanges", size, start, requestedNodes, timeout, dialFn,
28142815
nodeFn, responseFn, errorFn)
28152816

28162817
if err != nil {
@@ -3087,12 +3088,14 @@ func (s *statusServer) iterateNodes(
30873088
// and nodeError on every error result. It returns the next `limit` results
30883089
// after `start`. If `requestedNodes` is specified and non-empty, iteration is
30893090
// only done on that subset of nodes in addition to any nodes already in pagState.
3091+
// If non-zero, nodeFn will run with a timeout specified by nodeFnTimeout.
30903092
func (s *statusServer) paginatedIterateNodes(
30913093
ctx context.Context,
30923094
errorCtx string,
30933095
limit int,
30943096
pagState paginationState,
30953097
requestedNodes []roachpb.NodeID,
3098+
nodeFnTimeout time.Duration,
30963099
dialFn func(ctx context.Context, nodeID roachpb.NodeID) (interface{}, error),
30973100
nodeFn func(ctx context.Context, client interface{}, nodeID roachpb.NodeID) (interface{}, error),
30983101
responseFn func(nodeID roachpb.NodeID, resp interface{}),
@@ -3154,7 +3157,9 @@ func (s *statusServer) paginatedIterateNodes(
31543157
Sem: sem,
31553158
WaitForSem: true,
31563159
},
3157-
func(ctx context.Context) { paginator.queryNode(ctx, nodeID, idx) },
3160+
func(ctx context.Context) {
3161+
paginator.queryNode(ctx, nodeID, idx, nodeFnTimeout)
3162+
},
31583163
); err != nil {
31593164
return pagState, err
31603165
}
@@ -3208,7 +3213,7 @@ func (s *statusServer) listSessionsHelper(
32083213
var err error
32093214
var pagState paginationState
32103215
if pagState, err = s.paginatedIterateNodes(
3211-
ctx, "session list", limit, start, nil, dialFn, nodeFn, responseFn, errorFn); err != nil {
3216+
ctx, "session list", limit, start, nil, noTimeout, dialFn, nodeFn, responseFn, errorFn); err != nil {
32123217
err := serverpb.ListSessionsError{Message: err.Error()}
32133218
response.Errors = append(response.Errors, err)
32143219
}

pkg/ui/workspaces/db-console/src/views/hotRanges/index.tsx

Lines changed: 0 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -55,16 +55,6 @@ const HotRangesPage = () => {
5555
}
5656
}, [dispatch, isValid]);
5757

58-
useEffect(() => {
59-
dispatch(
60-
refreshHotRanges(
61-
new HotRangesRequest({
62-
page_size: 1000,
63-
}),
64-
),
65-
);
66-
}, [dispatch]);
67-
6858
const [filteredHotRanges, setFilteredHotRanges] =
6959
useState<HotRange[]>(hotRanges);
7060

0 commit comments

Comments
 (0)