Skip to content

Commit 17e9fdd

Browse files
craig[bot]shaikzakiriitm
andcommitted
Merge #143633
143633: server: add EngineStats endpoint in multitenant setup r=shubhamdhama,cthumuluru-crdb a=shaikzakiriitm EngineStats endpoint was inaccessible from secondary tenant. EngineStats endpoint provides statistics of storage layer which can help diagnose myriad of issues critical to database performance. To address this, we created EngineStatus endpoint in tenant status server. Implementation uses tenant connector to redirect call to system status server on requested kv node. Access to this endpoint is guarded by `can_view_node_info` capability as this endpoint returns info of all stores on a given node. Updated corresponding unit tests. Epic: CRDB-38968 Fixes: #110020 Release note: None Co-authored-by: Shaik Zakir Hussain <[email protected]>
2 parents 5ce759e + a9e7ccf commit 17e9fdd

File tree

9 files changed

+89
-13
lines changed

9 files changed

+89
-13
lines changed

pkg/ccl/changefeedccl/mocks/tenant_status_server_generated.go

Lines changed: 15 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pkg/cli/testdata/zip/testzip_external_process_virtualization

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -43,9 +43,7 @@ debug zip --concurrency=1 --cpu-profile-duration=1s /dev/null
4343
[node 1] requesting stacks... received response... writing binary output: debug/nodes/1/stacks.txt... done
4444
[node 1] requesting stacks with labels... received response... writing binary output: debug/nodes/1/stacks_with_labels.txt... done
4545
[node 1] requesting heap profile... received response... writing binary output: debug/nodes/1/heap.pprof... done
46-
[node 1] requesting engine stats... received response...
47-
[node 1] requesting engine stats: last request failed: rpc error: ...
48-
[node 1] requesting engine stats: creating error output: debug/nodes/1/lsm.txt.err.txt... done
46+
[node 1] requesting engine stats... received response... writing binary output: debug/nodes/1/lsm.txt... done
4947
[node 1] requesting heap profile list... received response... done
5048
[node ?] ? heap profiles found
5149
[node 1] requesting goroutine dump list... received response... done

pkg/cli/testdata/zip/testzip_shared_process_virtualization

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -82,9 +82,7 @@ debug zip --concurrency=1 --cpu-profile-duration=1s /dev/null
8282
[node 1] requesting stacks... received response... writing binary output: debug/cluster/test-tenant/nodes/1/stacks.txt... done
8383
[node 1] requesting stacks with labels... received response... writing binary output: debug/cluster/test-tenant/nodes/1/stacks_with_labels.txt... done
8484
[node 1] requesting heap profile... received response... writing binary output: debug/cluster/test-tenant/nodes/1/heap.pprof... done
85-
[node 1] requesting engine stats... received response...
86-
[node 1] requesting engine stats: last request failed: rpc error: ...
87-
[node 1] requesting engine stats: creating error output: debug/cluster/test-tenant/nodes/1/lsm.txt.err.txt... done
85+
[node 1] requesting engine stats... received response... writing binary output: debug/cluster/test-tenant/nodes/1/lsm.txt... done
8886
[node 1] requesting heap profile list... received response...
8987
[node 1] requesting heap profile list: last request failed: rpc error: ...
9088
[node 1] requesting heap profile list: creating error output: debug/cluster/test-tenant/nodes/1/heapprof.err.txt... done

pkg/cli/testdata/zip/testzip_shared_process_virtualization_with_default_tenant

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -82,9 +82,7 @@ debug zip --concurrency=1 --cpu-profile-duration=1s /dev/null
8282
[node 1] requesting stacks... received response... writing binary output: debug/cluster/test-tenant/nodes/1/stacks.txt... done
8383
[node 1] requesting stacks with labels... received response... writing binary output: debug/cluster/test-tenant/nodes/1/stacks_with_labels.txt... done
8484
[node 1] requesting heap profile... received response... writing binary output: debug/cluster/test-tenant/nodes/1/heap.pprof... done
85-
[node 1] requesting engine stats... received response...
86-
[node 1] requesting engine stats: last request failed: rpc error: ...
87-
[node 1] requesting engine stats: creating error output: debug/cluster/test-tenant/nodes/1/lsm.txt.err.txt... done
85+
[node 1] requesting engine stats... received response... writing binary output: debug/cluster/test-tenant/nodes/1/lsm.txt... done
8886
[node 1] requesting heap profile list... received response...
8987
[node 1] requesting heap profile list: last request failed: rpc error: ...
9088
[node 1] requesting heap profile list: creating error output: debug/cluster/test-tenant/nodes/1/heapprof.err.txt... done

pkg/kv/kvclient/kvtenant/connector.go

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -635,6 +635,16 @@ func (c *connector) Gossip(
635635
return
636636
}
637637

638+
func (c *connector) EngineStats(
639+
ctx context.Context, req *serverpb.EngineStatsRequest,
640+
) (resp *serverpb.EngineStatsResponse, retErr error) {
641+
retErr = c.withClient(ctx, func(ctx context.Context, client *client) (err error) {
642+
resp, err = client.EngineStats(ctx, req)
643+
return
644+
})
645+
return
646+
}
647+
638648
// NewIterator implements the rangedesc.IteratorFactory interface.
639649
func (c *connector) NewIterator(
640650
ctx context.Context, span roachpb.Span,

pkg/rpc/auth_tenant.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -121,7 +121,7 @@ func (a tenantAuthorizer) authorize(
121121
case "/cockroach.server.serverpb.Status/NetworkConnectivity":
122122
return a.capabilitiesAuthorizer.HasProcessDebugCapability(ctx, tenID)
123123

124-
case "/cockroach.server.serverpb.Status/Gossip":
124+
case "/cockroach.server.serverpb.Status/Gossip", "/cockroach.server.serverpb.Status/EngineStats":
125125
return a.capabilitiesAuthorizer.HasNodeStatusCapability(ctx, tenID)
126126

127127
case "/cockroach.server.serverpb.Status/TransactionContentionEvents":

pkg/server/serverpb/status.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,7 @@ type TenantStatusServer interface {
9494
DownloadSpan(ctx context.Context, request *DownloadSpanRequest) (*DownloadSpanResponse, error)
9595
NetworkConnectivity(context.Context, *NetworkConnectivityRequest) (*NetworkConnectivityResponse, error)
9696
Gossip(context.Context, *GossipRequest) (*gossip.InfoStatus, error)
97+
EngineStats(context.Context, *EngineStatsRequest) (*EngineStatsResponse, error)
9798
}
9899

99100
// OptionalNodesStatusServer returns the wrapped NodesStatusServer, if it is

pkg/server/status.go

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -788,6 +788,22 @@ func (s *statusServer) redactGossipResponse(resp *gossip.InfoStatus) *gossip.Inf
788788
return resp
789789
}
790790

791+
// EngineStats returns statistical information of storage layer on the given node
792+
// which is crucial for diagnosing issues related to disk usage,compaction efficiency,
793+
// read/write amplification and other storage engine metrics critical for database
794+
// performance.
795+
func (t *statusServer) EngineStats(
796+
ctx context.Context, req *serverpb.EngineStatsRequest,
797+
) (*serverpb.EngineStatsResponse, error) {
798+
ctx = t.AnnotateCtx(ctx)
799+
800+
if err := t.privilegeChecker.RequireViewClusterMetadataPermission(ctx); err != nil {
801+
return nil, err
802+
}
803+
804+
return t.sqlServer.tenantConnect.EngineStats(ctx, req)
805+
}
806+
791807
func (s *systemStatusServer) EngineStats(
792808
ctx context.Context, req *serverpb.EngineStatsRequest,
793809
) (*serverpb.EngineStatsResponse, error) {

pkg/server/storage_api/engine_test.go

Lines changed: 43 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,11 @@ package storage_api_test
88
import (
99
"context"
1010
"regexp"
11+
"strings"
1112
"testing"
1213

1314
"github.com/cockroachdb/cockroach/pkg/base"
15+
"github.com/cockroachdb/cockroach/pkg/multitenant/tenantcapabilitiespb"
1416
"github.com/cockroachdb/cockroach/pkg/server/debug"
1517
"github.com/cockroachdb/cockroach/pkg/server/serverpb"
1618
"github.com/cockroachdb/cockroach/pkg/server/srvtestutils"
@@ -19,25 +21,34 @@ import (
1921
"github.com/cockroachdb/cockroach/pkg/util/leaktest"
2022
"github.com/cockroachdb/cockroach/pkg/util/log"
2123
"github.com/pkg/errors"
24+
"github.com/stretchr/testify/require"
2225
)
2326

2427
// TestStatusEngineStatsJson ensures that the output response for the engine
2528
// stats contains the required fields.
2629
func TestStatusEngineStatsJson(t *testing.T) {
2730
defer leaktest.AfterTest(t)()
2831
defer log.Scope(t).Close(t)
32+
ctx := context.Background()
2933

3034
dir, cleanupFn := testutils.TempDir(t)
3135
defer cleanupFn()
3236

3337
srv := serverutils.StartServerOnly(t, base.TestServerArgs{
34-
DefaultTestTenant: base.TestIsForStuffThatShouldWorkWithSecondaryTenantsButDoesntYet(110020),
35-
3638
StoreSpecs: []base.StoreSpec{{
3739
Path: dir,
3840
}},
3941
})
40-
defer srv.Stopper().Stop(context.Background())
42+
defer srv.Stopper().Stop(ctx)
43+
44+
if srv.DeploymentMode().IsExternal() {
45+
// Explicitly enabling CanViewNodeInfo capability for the secondary/application tenant
46+
// when in external process mode, as shared process mode already has all capabilities.
47+
require.NoError(t, srv.GrantTenantCapabilities(
48+
ctx, serverutils.TestTenantID(),
49+
map[tenantcapabilitiespb.ID]string{tenantcapabilitiespb.CanViewNodeInfo: "true"}))
50+
}
51+
4152
s := srv.ApplicationLayer()
4253

4354
t.Logf("using admin URL %s", s.AdminURL())
@@ -57,3 +68,32 @@ func TestStatusEngineStatsJson(t *testing.T) {
5768
t.Fatal(errors.Errorf("expected engine metrics to be correctly formatted, got:\n %s", formattedStats))
5869
}
5970
}
71+
72+
func TestStatusEngineStatsJsonWithoutTenantCapability(t *testing.T) {
73+
defer leaktest.AfterTest(t)()
74+
defer log.Scope(t).Close(t)
75+
76+
srv := serverutils.StartServerOnly(t, base.TestServerArgs{
77+
// Note: We're only testing external-process mode because shared service
78+
// mode tenants have all capabilities.
79+
DefaultTestTenant: base.ExternalTestTenantAlwaysEnabled,
80+
})
81+
defer srv.Stopper().Stop(context.Background())
82+
83+
s := srv.ApplicationLayer()
84+
85+
var engineStats serverpb.EngineStatsResponse
86+
// Using SucceedsSoon because we have seen in the wild that
87+
// occasionally requests don't go through with error "transport:
88+
// error while dialing: connection interrupted (did the remote node
89+
// shut down or are there networking issues?)"
90+
testutils.SucceedsSoon(t, func() error {
91+
actualErr := srvtestutils.GetStatusJSONProto(s, "enginestats/local", &engineStats)
92+
require.Error(t, actualErr)
93+
if !strings.Contains(actualErr.Error(), "client tenant does not have capability to query cluster node metadata") {
94+
return errors.Wrap(actualErr, "unexpected error message")
95+
}
96+
return nil
97+
})
98+
99+
}

0 commit comments

Comments
 (0)