
Commit 9364bf1

craig[bot] and dhartunian committed
Merge #147700

147700: structlogging: rewrite test to use stub hot ranges r=angles-n-daemons a=dhartunian

Previously, this test used a full server to get hot range data, but that proved to be quite difficult to work with. For the purposes of this test, which is meant to exercise the business logic of the hot range logger, we assume that the endpoint will work correctly and return ranges with a QPS and a CPU time.

Resolves: #145412
Resolves: #147238

Release note: None

Co-authored-by: David Hartunian <[email protected]>
2 parents 1fc8830 + 923a17c commit 9364bf1
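The core of the change is a dependency seam: the hot-ranges scheduler's business logic now depends on a one-method interface instead of a live status server, so the test can inject canned data. Below is a minimal, self-contained sketch of that pattern; the type and method names here are simplified stand-ins for the real serverpb types, not the actual CockroachDB API.

```go
package main

import (
	"context"
	"fmt"
)

// hotRange is a hypothetical stand-in for serverpb.HotRangesResponseV2_HotRange.
type hotRange struct {
	RangeID          int
	QPS              float64
	CPUTimePerSecond float64
}

// hotRangeGetter mirrors the shape of the HotRangeGetter interface the
// commit introduces: the scheduler sees only this method, not a full
// TenantStatusServer backed by a running cluster.
type hotRangeGetter interface {
	hotRangesV2(ctx context.Context, perNodeLimit int) ([]hotRange, error)
}

// stubGetter returns canned ranges, the way the test stub in this diff does.
type stubGetter struct{}

func (stubGetter) hotRangesV2(_ context.Context, perNodeLimit int) ([]hotRange, error) {
	all := []hotRange{
		{RangeID: 1, QPS: 100, CPUTimePerSecond: 0.1},
		{RangeID: 2, QPS: 100, CPUTimePerSecond: 1.3},
	}
	// Honor the per-node limit so limit-related behavior can be tested too.
	if perNodeLimit > 0 && perNodeLimit < len(all) {
		all = all[:perNodeLimit]
	}
	return all, nil
}

func main() {
	var g hotRangeGetter = stubGetter{} // business logic sees only the interface
	ranges, _ := g.hotRangesV2(context.Background(), 1)
	fmt.Println(len(ranges)) // prints 1
}
```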

File tree

6 files changed (+98 -85 lines changed)


pkg/server/server.go

Lines changed: 0 additions & 1 deletion
@@ -2324,7 +2324,6 @@ func (s *topLevelServer) AcceptClients(ctx context.Context) error {
         ctx,
         s.stopper,
         s.status,
-        *s.sqlServer.internalExecutor,
         s.ClusterSettings(),
         nil,
     ); err != nil {

pkg/server/structlogging/BUILD.bazel

Lines changed: 3 additions & 5 deletions
@@ -15,7 +15,6 @@ go_library(
         "//pkg/server/serverpb",
         "//pkg/settings",
         "//pkg/settings/cluster",
-        "//pkg/sql",
         "//pkg/util/log",
         "//pkg/util/log/eventpb",
         "//pkg/util/log/logpb",
@@ -38,13 +37,10 @@ go_test(
     }),
     deps = [
         ":structlogging",
-        "//pkg/base",
-        "//pkg/kv/kvserver",
-        "//pkg/kv/kvserver/allocator/plan",
-        "//pkg/roachpb",
         "//pkg/security/securityassets",
         "//pkg/security/securitytest",
         "//pkg/server",
+        "//pkg/server/serverpb",
         "//pkg/settings/cluster",
         "//pkg/testutils",
         "//pkg/testutils/serverutils",
@@ -55,7 +51,9 @@ go_test(
         "//pkg/util/log/eventpb",
         "//pkg/util/log/logpb",
         "//pkg/util/randutil",
+        "//pkg/util/stop",
         "//pkg/util/syncutil",
+        "@com_github_cockroachdb_errors//:errors",
         "@com_github_stretchr_testify//require",
     ],
 )

pkg/server/structlogging/hot_ranges_log.go

Lines changed: 7 additions & 5 deletions
@@ -15,7 +15,6 @@ import (
     "github.com/cockroachdb/cockroach/pkg/server/serverpb"
     "github.com/cockroachdb/cockroach/pkg/settings"
     "github.com/cockroachdb/cockroach/pkg/settings/cluster"
-    "github.com/cockroachdb/cockroach/pkg/sql"
     "github.com/cockroachdb/cockroach/pkg/util/log"
     "github.com/cockroachdb/cockroach/pkg/util/log/eventpb"
     "github.com/cockroachdb/cockroach/pkg/util/log/logpb"
@@ -34,7 +33,7 @@ var CheckInterval = time.Minute
 // TestLoopChannel triggers the hot ranges logging loop to start again.
 // It's useful in the context of a test, where we don't want to wait
 // for whatever the last time the interval was.
-var TestLoopChannel = make(chan struct{}, 1)
+var TestLoopChannel = make(chan struct{})
 
 var TelemetryHotRangesStatsInterval = settings.RegisterDurationSetting(
     settings.ApplicationLevel,
@@ -70,10 +69,14 @@ var TelemetryHotRangesStatsCPUThreshold = settings.RegisterDurationSetting(
     time.Second/4,
 )
 
+type HotRangeGetter interface {
+    HotRangesV2(ctx context.Context, req *serverpb.HotRangesRequest) (*serverpb.HotRangesResponseV2, error)
+}
+
 // hotRangesLoggingScheduler is responsible for logging index usage stats
 // on a scheduled interval.
 type hotRangesLoggingScheduler struct {
-    sServer serverpb.TenantStatusServer
+    sServer HotRangeGetter
     st      *cluster.Settings
     stopper *stop.Stopper
     job     *jobs.Job
@@ -92,8 +95,7 @@ type hotRangesLoggingScheduler struct {
 func StartHotRangesLoggingScheduler(
     ctx context.Context,
     stopper *stop.Stopper,
-    sServer serverpb.TenantStatusServer,
-    ie sql.InternalExecutor,
+    sServer HotRangeGetter,
     st *cluster.Settings,
     ti *tenantcapabilities.Entry,
 ) error {

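One subtle detail in the hunk above: TestLoopChannel changes from a buffered channel of capacity 1 to an unbuffered one. Presumably this turns the test trigger into a rendezvous: the send can no longer complete early and sit queued for a stale loop iteration; it only succeeds once the logging loop is actually at its receive. A tiny standalone sketch of that behavior, with hypothetical names:

```go
package main

import "fmt"

func main() {
	trigger := make(chan struct{}) // unbuffered, as in the new TestLoopChannel
	done := make(chan struct{})

	go func() { // stands in for the hot-ranges logging loop
		<-trigger // the loop is provably at its receive when the send completes
		fmt.Println("loop woken by test")
		close(done)
	}()

	trigger <- struct{}{} // blocks until the loop actually receives
	<-done
}
```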
pkg/server/structlogging/hot_ranges_log_test.go

Lines changed: 86 additions & 73 deletions
@@ -8,90 +8,87 @@ package structlogging_test
 import (
     "context"
     "encoding/json"
-    "errors"
     "fmt"
     "regexp"
     "testing"
     "time"
 
-    "github.com/cockroachdb/cockroach/pkg/base"
-    "github.com/cockroachdb/cockroach/pkg/kv/kvserver"
-    "github.com/cockroachdb/cockroach/pkg/kv/kvserver/allocator/plan"
-    "github.com/cockroachdb/cockroach/pkg/roachpb"
+    "github.com/cockroachdb/cockroach/pkg/server/serverpb"
     "github.com/cockroachdb/cockroach/pkg/server/structlogging"
     "github.com/cockroachdb/cockroach/pkg/settings/cluster"
     "github.com/cockroachdb/cockroach/pkg/testutils"
-    "github.com/cockroachdb/cockroach/pkg/testutils/serverutils"
     "github.com/cockroachdb/cockroach/pkg/testutils/skip"
     "github.com/cockroachdb/cockroach/pkg/util/leaktest"
     "github.com/cockroachdb/cockroach/pkg/util/log"
     "github.com/cockroachdb/cockroach/pkg/util/log/eventpb"
     "github.com/cockroachdb/cockroach/pkg/util/log/logpb"
+    "github.com/cockroachdb/cockroach/pkg/util/stop"
     "github.com/cockroachdb/cockroach/pkg/util/syncutil"
+    "github.com/cockroachdb/errors"
     "github.com/stretchr/testify/require"
 )
 
 // setup an impossibly low cpu threshold, test clusters
 // do not seem to record cpu utilization per replica.
 const lowCPUThreshold = time.Duration(-1)
 const highCPUThreshold = time.Second
-const defaultTestWait = time.Second
 const lowDelay = 50 * time.Millisecond
 const highDelay = time.Minute
 
 // TestHotRangeLogger tests that hot ranges stats are logged per node.
 // It uses system ranges to verify behavior.
 func TestHotRangeLoggerSettings(t *testing.T) {
-    skip.WithIssue(t, 145412)
+    defer leaktest.AfterTest(t)()
+
     skip.UnderRace(t)
-    skip.UnderStress(t)
     ctx := context.Background()
 
-    // We only want to run this once within the suite, as the
-    // subsystem we depend on takes on the order of whole seconds
-    // to warm.
-    s, spy, teardown := setupTestServer(t, ctx)
+    settings, spy, teardown := setupTestServer(t, ctx)
     defer teardown()
 
     for _, test := range []struct {
         enabled            bool
         tickerInterval     time.Duration
         logSettingInterval time.Duration
-        waitFor            time.Duration
         logCPUThreshold    time.Duration
         hasLogs            bool
     }{
         // Tests the straightforward use case, where we expect no threshold,
         // a minimal interval, minimal loop, and zero threshold should
         // result in multiple logs.
-        {true, lowDelay, lowDelay, defaultTestWait, lowCPUThreshold, true},
+        {true, lowDelay, lowDelay, lowCPUThreshold, true},
 
         // This test is the same as the default case, except the
         // cluster setting which controls logging is off.
-        {false, lowDelay, lowDelay, defaultTestWait, lowCPUThreshold, false},
+        {false, lowDelay, lowDelay, lowCPUThreshold, false},
 
         // This test validates that even when we check on a low cadance,
         // if the threshold is not passed and the interval is long,
         // no logs will appear.
-        {true, lowDelay, highDelay, defaultTestWait, highCPUThreshold, false},
+        {true, lowDelay, highDelay, highCPUThreshold, false},
 
         // This test validates that even if the interval is long,
         // if the cpu threshold is low, and its checked, the system
         // will produce logs.
-        {true, lowDelay, highDelay, defaultTestWait, lowCPUThreshold, true},
+        {true, lowDelay, highDelay, lowCPUThreshold, true},
 
         // This test validates with a high check cadance, no logs
         // will appear, even if the interval and thresholds are low.
-        {true, highDelay, lowDelay, defaultTestWait, lowCPUThreshold, false},
+        {true, highDelay, lowDelay, lowCPUThreshold, false},
 
         // This test checks that if there's a low logging interval
         // if the cpuThreshold is high, logs will still appear.
-        {true, lowDelay, lowDelay, defaultTestWait, highCPUThreshold, true},
+        {true, lowDelay, lowDelay, highCPUThreshold, true},
     } {
         t.Run(fmt.Sprintf("settings tests %v", test), func(t *testing.T) {
-            setupTest(ctx, s.ClusterSettings(), test.enabled, test.logSettingInterval, test.tickerInterval, test.logCPUThreshold, spy)
-            time.Sleep(test.waitFor)
-            require.Equal(t, test.hasLogs, hasNonZeroQPSRange(spy.Logs()))
+            setupTest(ctx, settings, test.enabled, test.logSettingInterval, test.tickerInterval, test.logCPUThreshold, spy)
+            testutils.SucceedsSoon(t, func() error {
+                actual := hasNonZeroQPSRange(spy.Logs())
+                if test.hasLogs != actual {
+                    return errors.Errorf("expected hasLogs %v, got %v", test.hasLogs, actual)
+                }
+                return nil
+            })
         })
     }
 
@@ -105,15 +102,23 @@ func TestHotRangeLoggerSettings(t *testing.T) {
         }
 
         // without a limit set, we should see many ranges.
-        setupTest(ctx, s.ClusterSettings(), true, lowDelay, lowDelay, lowCPUThreshold, spy)
-        time.Sleep(time.Second)
-        require.Greater(t, countSeenRanges(spy.Logs()), 1)
+        setupTest(ctx, settings, true, lowDelay, lowDelay, lowCPUThreshold, spy)
+        testutils.SucceedsSoon(t, func() error {
+            if actual := countSeenRanges(spy.Logs()); actual <= 1 {
+                return fmt.Errorf("expected >1 range, got %d", actual)
+            }
+            return nil
+        })
 
         // with a limit, only one range should show up.
         structlogging.ReportTopHottestRanges = 1
-        setupTest(ctx, s.ClusterSettings(), true, lowDelay, lowDelay, lowCPUThreshold, spy)
-        time.Sleep(time.Second)
-        require.Equal(t, countSeenRanges(spy.Logs()), 1)
+        setupTest(ctx, settings, true, lowDelay, lowDelay, lowCPUThreshold, spy)
+        testutils.SucceedsSoon(t, func() error {
+            if actual := countSeenRanges(spy.Logs()); actual != 1 {
+                return fmt.Errorf("expected 1 range, got %d", actual)
+            }
+            return nil
+        })
     })
 }
 
@@ -125,16 +130,12 @@ func TestHotRangeLoggerSettings(t *testing.T) {
 // - For app tenants, a job is initialized for the hot ranges
 //   logger, whereas for the system tenant it runs as a task.
 func TestHotRangeLoggerMultitenant(t *testing.T) {
+    defer leaktest.AfterTest(t)()
+
     skip.UnderRace(t)
     ctx := context.Background()
-    s, spy, teardown := setupTestServer(t, ctx)
-    tenantID := roachpb.MustMakeTenantID(2)
-    tt, err := s.TenantController().StartTenant(ctx, base.TestTenantArgs{
-        TenantID: tenantID,
-    })
+    _, spy, teardown := setupTestServer(t, ctx)
     spy.Logs()
-    require.NoError(t, err)
-    require.NotNil(t, tt)
     // TODO (brian): the jobs system isn't registering this correctly,
     // this will be fixed in a short follow pr.
     defer teardown()
@@ -188,65 +189,78 @@ func (spy *hotRangesLogSpy) Reset() {
     spy.mu.logs = nil
 }
 
+type testHotRangeGetter struct{}
+
+func (t testHotRangeGetter) HotRangesV2(
+    ctx context.Context, req *serverpb.HotRangesRequest,
+) (*serverpb.HotRangesResponseV2, error) {
+    if req.PerNodeLimit == 1 {
+        return &serverpb.HotRangesResponseV2{
+            Ranges: []*serverpb.HotRangesResponseV2_HotRange{
+                {
+                    RangeID:          1,
+                    CPUTimePerSecond: float64(100 * time.Millisecond),
+                    QPS:              float64(100),
+                },
+            },
+        }, nil
+    }
+    return &serverpb.HotRangesResponseV2{
+        Ranges: []*serverpb.HotRangesResponseV2_HotRange{
+            {
+                RangeID:          1,
+                CPUTimePerSecond: float64(100 * time.Millisecond),
+                QPS:              float64(100),
+            },
+            {
+                RangeID:          2,
+                CPUTimePerSecond: float64(1300 * time.Millisecond),
+                QPS:              float64(100),
+            },
+            {
+                RangeID:          3,
+                CPUTimePerSecond: float64(900 * time.Millisecond),
+                QPS:              float64(100),
+            },
+        },
+    }, nil
+}
+
+var _ structlogging.HotRangeGetter = testHotRangeGetter{}
+
 // setupTestServer is a somewhat lengthy warmup process
 // to ensure that the hot ranges tests are ready to run.
 // It sets up a cluster, runs it until the hot range stats are
 // warm by dialing the knobs to noisy, and checking for output,
 // then redials the knobs back to quiet so the test can take over.
 func setupTestServer(
     t *testing.T, ctx context.Context,
-) (serverutils.TestServerInterface, *hotRangesLogSpy, func()) {
+) (*cluster.Settings, *hotRangesLogSpy, func()) {
     sc := log.ScopeWithoutShowLogs(t)
     spy := &hotRangesLogSpy{t: t}
 
     // override internal settings.
     structlogging.ReportTopHottestRanges = 1000
     structlogging.CheckInterval = 100 * time.Millisecond
 
-    s := serverutils.StartServerOnly(t, base.TestServerArgs{
-        DefaultTestTenant: base.TestControlsTenantsExplicitly,
-        Knobs: base.TestingKnobs{
-            Store: &kvserver.StoreTestingKnobs{
-                ReplicaPlannerKnobs: plan.ReplicaPlannerTestingKnobs{
-                    DisableReplicaRebalancing: true,
-                },
-            },
-        },
-    })
-
-    leakChecker := leaktest.AfterTest(t)
     logInterceptor := log.InterceptWith(ctx, spy)
-    stopper := s.Stopper()
+    stopper := stop.NewStopper()
+    settings := cluster.MakeTestingClusterSettings()
     teardown := func() {
         stopper.Stop(ctx)
         sc.Close(t)
         logInterceptor()
-        leakChecker()
     }
 
-    ts := s.ApplicationLayer()
-
     // lower settings so that we can wait for the stats to warm.
-    structlogging.TelemetryHotRangesStatsEnabled.Override(ctx, &ts.ClusterSettings().SV, true)
-    structlogging.TelemetryHotRangesStatsInterval.Override(ctx, &ts.ClusterSettings().SV, time.Millisecond)
-    structlogging.TelemetryHotRangesStatsLoggingDelay.Override(ctx, &ts.ClusterSettings().SV, 0*time.Millisecond)
-
-    // simulate some queries.
-    for range 1000 {
-        _, err := ts.SQLConn(t).Exec("SELECT * FROM system.namespace")
-        require.NoError(t, err)
-    }
-
-    testutils.SucceedsSoon(t, func() error {
-        logs := spy.Logs()
+    structlogging.TelemetryHotRangesStatsEnabled.Override(ctx, &settings.SV, true)
+    structlogging.TelemetryHotRangesStatsInterval.Override(ctx, &settings.SV, time.Millisecond)
+    structlogging.TelemetryHotRangesStatsLoggingDelay.Override(ctx, &settings.SV, 0*time.Millisecond)
 
-        if hasNonZeroQPSRange(logs) {
-            return nil
-        }
-        return errors.New("waited too long for the synthetic data")
-    })
+    err := structlogging.StartHotRangesLoggingScheduler(ctx, stopper, testHotRangeGetter{}, settings, nil)
+    require.NoError(t, err)
 
-    return s, spy, teardown
+    return settings, spy, teardown
 }
 
 // Utility function which generally indicates that the hot ranges
@@ -273,8 +287,7 @@ func setupTest(
     structlogging.TelemetryHotRangesStatsInterval.Override(ctx, &st.SV, logInterval)
     structlogging.TelemetryHotRangesStatsCPUThreshold.Override(ctx, &st.SV, logCPUThreshold)
     structlogging.CheckInterval = tickerInterval
-    // wait for the activity from the previous test to drain.
-    time.Sleep(100 * time.Millisecond)
     structlogging.TestLoopChannel <- struct{}{}
+    log.FlushAllSync()
     spy.Reset()
 }

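Throughout the rewritten test, fixed time.Sleep waits are replaced with testutils.SucceedsSoon, which retries an assertion until it passes or a deadline expires. A rough standalone sketch of that polling pattern follows, assuming a simple exponential backoff; the real helper's signature takes a testing.TB and its tuning may differ.

```go
package main

import (
	"errors"
	"fmt"
	"time"
)

// succeedsSoon polls fn with backoff until it returns nil or the
// timeout elapses. This is a generic stand-in, not the cockroach code.
func succeedsSoon(timeout time.Duration, fn func() error) error {
	deadline := time.Now().Add(timeout)
	wait := time.Millisecond
	for {
		err := fn()
		if err == nil {
			return nil
		}
		if time.Now().After(deadline) {
			return fmt.Errorf("condition never met: %w", err)
		}
		time.Sleep(wait)
		if wait < 200*time.Millisecond {
			wait *= 2 // back off to avoid busy-waiting
		}
	}
}

func main() {
	start := time.Now()
	err := succeedsSoon(5*time.Second, func() error {
		if time.Since(start) < 50*time.Millisecond {
			return errors.New("not yet")
		}
		return nil
	})
	fmt.Println(err) // <nil>
}
```

Compared to a fixed sleep, this keeps the fast path fast and makes slow CI machines fail with a diagnostic error instead of a flake.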
pkg/server/structlogging/main_test.go

Lines changed: 2 additions & 0 deletions
@@ -24,3 +24,5 @@ func TestMain(m *testing.M) {
     serverutils.InitTestClusterFactory(testcluster.TestClusterFactory)
     os.Exit(m.Run())
 }
+
+//go:generate ../../util/leaktest/add-leaktest.sh *_test.go

pkg/server/tenant.go

Lines changed: 0 additions & 1 deletion
@@ -994,7 +994,6 @@ func (s *SQLServerWrapper) AcceptClients(ctx context.Context) error {
         ctx,
         s.stopper,
         s.sqlServer.tenantConnect,
-        *s.sqlServer.internalExecutor,
         s.ClusterSettings(),
         &ti,
     ); err != nil {

0 commit comments