
Commit 9364bf1

craig[bot] and dhartunian committed
Merge #147700

147700: structlogging: rewrite test to use stub hot ranges r=angles-n-daemons a=dhartunian

Previously, this test used a full server to get hot range data, but that proved to be quite difficult to work with. For the purposes of this test, which is meant to exercise the business logic of the hot range logger, we assume that the endpoint will work correctly and return ranges with a QPS and a CPU time.

Resolves: #145412
Resolves: #147238

Release note: None

Co-authored-by: David Hartunian <[email protected]>
2 parents 1fc8830 + 923a17c commit 9364bf1
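The core of the change is a dependency seam: the hot-ranges scheduler's business logic now depends on a one-method interface instead of a live status server, so the test can inject canned data. Below is a minimal, self-contained sketch of that pattern; the type and method names here are simplified stand-ins for the real serverpb types, not the actual CockroachDB API.

```go
package main

import (
	"context"
	"fmt"
)

// hotRange is a hypothetical stand-in for serverpb.HotRangesResponseV2_HotRange.
type hotRange struct {
	RangeID          int
	QPS              float64
	CPUTimePerSecond float64
}

// hotRangeGetter mirrors the shape of the HotRangeGetter interface the
// commit introduces: the scheduler sees only this method, not a full
// TenantStatusServer backed by a running cluster.
type hotRangeGetter interface {
	hotRangesV2(ctx context.Context, perNodeLimit int) ([]hotRange, error)
}

// stubGetter returns canned ranges, the way the test stub in this diff does.
type stubGetter struct{}

func (stubGetter) hotRangesV2(_ context.Context, perNodeLimit int) ([]hotRange, error) {
	all := []hotRange{
		{RangeID: 1, QPS: 100, CPUTimePerSecond: 0.1},
		{RangeID: 2, QPS: 100, CPUTimePerSecond: 1.3},
	}
	// Honor the per-node limit so limit-related behavior can be tested too.
	if perNodeLimit > 0 && perNodeLimit < len(all) {
		all = all[:perNodeLimit]
	}
	return all, nil
}

func main() {
	var g hotRangeGetter = stubGetter{} // business logic sees only the interface
	ranges, _ := g.hotRangesV2(context.Background(), 1)
	fmt.Println(len(ranges)) // prints 1
}
```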

File tree

6 files changed (+98 -85 lines changed)


pkg/server/server.go

Lines changed: 0 additions & 1 deletion
@@ -2324,7 +2324,6 @@ func (s *topLevelServer) AcceptClients(ctx context.Context) error {
         ctx,
         s.stopper,
         s.status,
-        *s.sqlServer.internalExecutor,
         s.ClusterSettings(),
         nil,
     ); err != nil {

pkg/server/structlogging/BUILD.bazel

Lines changed: 3 additions & 5 deletions
@@ -15,7 +15,6 @@ go_library(
         "//pkg/server/serverpb",
         "//pkg/settings",
         "//pkg/settings/cluster",
-        "//pkg/sql",
         "//pkg/util/log",
         "//pkg/util/log/eventpb",
         "//pkg/util/log/logpb",
@@ -38,13 +37,10 @@ go_test(
     }),
     deps = [
         ":structlogging",
-        "//pkg/base",
-        "//pkg/kv/kvserver",
-        "//pkg/kv/kvserver/allocator/plan",
-        "//pkg/roachpb",
         "//pkg/security/securityassets",
         "//pkg/security/securitytest",
         "//pkg/server",
+        "//pkg/server/serverpb",
         "//pkg/settings/cluster",
         "//pkg/testutils",
         "//pkg/testutils/serverutils",
@@ -55,7 +51,9 @@ go_test(
         "//pkg/util/log/eventpb",
         "//pkg/util/log/logpb",
         "//pkg/util/randutil",
+        "//pkg/util/stop",
         "//pkg/util/syncutil",
+        "@com_github_cockroachdb_errors//:errors",
         "@com_github_stretchr_testify//require",
     ],
 )

pkg/server/structlogging/hot_ranges_log.go

Lines changed: 7 additions & 5 deletions
@@ -15,7 +15,6 @@ import (
     "github.com/cockroachdb/cockroach/pkg/server/serverpb"
     "github.com/cockroachdb/cockroach/pkg/settings"
     "github.com/cockroachdb/cockroach/pkg/settings/cluster"
-    "github.com/cockroachdb/cockroach/pkg/sql"
     "github.com/cockroachdb/cockroach/pkg/util/log"
     "github.com/cockroachdb/cockroach/pkg/util/log/eventpb"
     "github.com/cockroachdb/cockroach/pkg/util/log/logpb"
@@ -34,7 +33,7 @@ var CheckInterval = time.Minute
 // TestLoopChannel triggers the hot ranges logging loop to start again.
 // It's useful in the context of a test, where we don't want to wait
 // for whatever the last time the interval was.
-var TestLoopChannel = make(chan struct{}, 1)
+var TestLoopChannel = make(chan struct{})
 
 var TelemetryHotRangesStatsInterval = settings.RegisterDurationSetting(
     settings.ApplicationLevel,
@@ -70,10 +69,14 @@ var TelemetryHotRangesStatsCPUThreshold = settings.RegisterDurationSetting(
     time.Second/4,
 )
 
+type HotRangeGetter interface {
+    HotRangesV2(ctx context.Context, req *serverpb.HotRangesRequest) (*serverpb.HotRangesResponseV2, error)
+}
+
 // hotRangesLoggingScheduler is responsible for logging index usage stats
 // on a scheduled interval.
 type hotRangesLoggingScheduler struct {
-    sServer serverpb.TenantStatusServer
+    sServer HotRangeGetter
     st      *cluster.Settings
     stopper *stop.Stopper
     job     *jobs.Job
@@ -92,8 +95,7 @@ type hotRangesLoggingScheduler struct {
 func StartHotRangesLoggingScheduler(
     ctx context.Context,
     stopper *stop.Stopper,
-    sServer serverpb.TenantStatusServer,
-    ie sql.InternalExecutor,
+    sServer HotRangeGetter,
     st *cluster.Settings,
     ti *tenantcapabilities.Entry,
 ) error {

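One subtle detail in the hunk above: TestLoopChannel changes from a buffered channel of capacity 1 to an unbuffered one. Presumably this turns the test trigger into a rendezvous: the send can no longer complete early and sit queued for a stale loop iteration; it only succeeds once the logging loop is actually at its receive. A tiny standalone sketch of that behavior, with hypothetical names:

```go
package main

import "fmt"

func main() {
	trigger := make(chan struct{}) // unbuffered, as in the new TestLoopChannel
	done := make(chan struct{})

	go func() { // stands in for the hot-ranges logging loop
		<-trigger // the loop is provably at its receive when the send completes
		fmt.Println("loop woken by test")
		close(done)
	}()

	trigger <- struct{}{} // blocks until the loop actually receives
	<-done
}
```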
pkg/server/structlogging/hot_ranges_log_test.go

Lines changed: 86 additions & 73 deletions
@@ -8,90 +8,87 @@ package structlogging_test
 import (
     "context"
     "encoding/json"
-    "errors"
     "fmt"
     "regexp"
     "testing"
     "time"
 
-    "github.com/cockroachdb/cockroach/pkg/base"
-    "github.com/cockroachdb/cockroach/pkg/kv/kvserver"
-    "github.com/cockroachdb/cockroach/pkg/kv/kvserver/allocator/plan"
-    "github.com/cockroachdb/cockroach/pkg/roachpb"
+    "github.com/cockroachdb/cockroach/pkg/server/serverpb"
     "github.com/cockroachdb/cockroach/pkg/server/structlogging"
     "github.com/cockroachdb/cockroach/pkg/settings/cluster"
     "github.com/cockroachdb/cockroach/pkg/testutils"
-    "github.com/cockroachdb/cockroach/pkg/testutils/serverutils"
     "github.com/cockroachdb/cockroach/pkg/testutils/skip"
     "github.com/cockroachdb/cockroach/pkg/util/leaktest"
     "github.com/cockroachdb/cockroach/pkg/util/log"
     "github.com/cockroachdb/cockroach/pkg/util/log/eventpb"
     "github.com/cockroachdb/cockroach/pkg/util/log/logpb"
+    "github.com/cockroachdb/cockroach/pkg/util/stop"
     "github.com/cockroachdb/cockroach/pkg/util/syncutil"
+    "github.com/cockroachdb/errors"
     "github.com/stretchr/testify/require"
 )
 
 // setup an impossibly low cpu threshold, test clusters
 // do not seem to record cpu utilization per replica.
 const lowCPUThreshold = time.Duration(-1)
 const highCPUThreshold = time.Second
-const defaultTestWait = time.Second
 const lowDelay = 50 * time.Millisecond
 const highDelay = time.Minute
 
 // TestHotRangeLogger tests that hot ranges stats are logged per node.
 // It uses system ranges to verify behavior.
 func TestHotRangeLoggerSettings(t *testing.T) {
-    skip.WithIssue(t, 145412)
+    defer leaktest.AfterTest(t)()
+
     skip.UnderRace(t)
-    skip.UnderStress(t)
     ctx := context.Background()
 
-    // We only want to run this once within the suite, as the
-    // subsystem we depend on takes on the order of whole seconds
-    // to warm.
-    s, spy, teardown := setupTestServer(t, ctx)
+    settings, spy, teardown := setupTestServer(t, ctx)
     defer teardown()
 
     for _, test := range []struct {
         enabled            bool
         tickerInterval     time.Duration
         logSettingInterval time.Duration
-        waitFor            time.Duration
         logCPUThreshold    time.Duration
         hasLogs            bool
     }{
         // Tests the straightforward use case, where we expect no threshold,
         // a minimal interval, minimal loop, and zero threshold should
         // result in multiple logs.
-        {true, lowDelay, lowDelay, defaultTestWait, lowCPUThreshold, true},
+        {true, lowDelay, lowDelay, lowCPUThreshold, true},
 
         // This test is the same as the default case, except the
         // cluster setting which controls logging is off.
-        {false, lowDelay, lowDelay, defaultTestWait, lowCPUThreshold, false},
+        {false, lowDelay, lowDelay, lowCPUThreshold, false},
 
         // This test validates that even when we check on a low cadance,
         // if the threshold is not passed and the interval is long,
         // no logs will appear.
-        {true, lowDelay, highDelay, defaultTestWait, highCPUThreshold, false},
+        {true, lowDelay, highDelay, highCPUThreshold, false},
 
         // This test validates that even if the interval is long,
         // if the cpu threshold is low, and its checked, the system
         // will produce logs.
-        {true, lowDelay, highDelay, defaultTestWait, lowCPUThreshold, true},
+        {true, lowDelay, highDelay, lowCPUThreshold, true},
 
         // This test validates with a high check cadance, no logs
         // will appear, even if the interval and thresholds are low.
-        {true, highDelay, lowDelay, defaultTestWait, lowCPUThreshold, false},
+        {true, highDelay, lowDelay, lowCPUThreshold, false},
 
         // This test checks that if there's a low logging interval
         // if the cpuThreshold is high, logs will still appear.
-        {true, lowDelay, lowDelay, defaultTestWait, highCPUThreshold, true},
+        {true, lowDelay, lowDelay, highCPUThreshold, true},
     } {
         t.Run(fmt.Sprintf("settings tests %v", test), func(t *testing.T) {
-            setupTest(ctx, s.ClusterSettings(), test.enabled, test.logSettingInterval, test.tickerInterval, test.logCPUThreshold, spy)
-            time.Sleep(test.waitFor)
-            require.Equal(t, test.hasLogs, hasNonZeroQPSRange(spy.Logs()))
+            setupTest(ctx, settings, test.enabled, test.logSettingInterval, test.tickerInterval, test.logCPUThreshold, spy)
+            testutils.SucceedsSoon(t, func() error {
+                actual := hasNonZeroQPSRange(spy.Logs())
+                if test.hasLogs != actual {
+                    return errors.Errorf("expected hasLogs %v, got %v", test.hasLogs, actual)
+                }
+                return nil
+            })
         })
     }
 
@@ -105,15 +102,23 @@ func TestHotRangeLoggerSettings(t *testing.T) {
         }
 
         // without a limit set, we should see many ranges.
-        setupTest(ctx, s.ClusterSettings(), true, lowDelay, lowDelay, lowCPUThreshold, spy)
-        time.Sleep(time.Second)
-        require.Greater(t, countSeenRanges(spy.Logs()), 1)
+        setupTest(ctx, settings, true, lowDelay, lowDelay, lowCPUThreshold, spy)
+        testutils.SucceedsSoon(t, func() error {
+            if actual := countSeenRanges(spy.Logs()); actual <= 1 {
+                return fmt.Errorf("expected >1 range, got %d", actual)
+            }
+            return nil
+        })
 
         // with a limit, only one range should show up.
         structlogging.ReportTopHottestRanges = 1
-        setupTest(ctx, s.ClusterSettings(), true, lowDelay, lowDelay, lowCPUThreshold, spy)
-        time.Sleep(time.Second)
-        require.Equal(t, countSeenRanges(spy.Logs()), 1)
+        setupTest(ctx, settings, true, lowDelay, lowDelay, lowCPUThreshold, spy)
+        testutils.SucceedsSoon(t, func() error {
+            if actual := countSeenRanges(spy.Logs()); actual != 1 {
+                return fmt.Errorf("expected 1 range, got %d", actual)
+            }
+            return nil
+        })
     })
 }
 
@@ -125,16 +130,12 @@ func TestHotRangeLoggerSettings(t *testing.T) {
 // - For app tenants, a job is initialized for the hot ranges
 //   logger, whereas for the system tenant it runs as a task.
 func TestHotRangeLoggerMultitenant(t *testing.T) {
+    defer leaktest.AfterTest(t)()
+
     skip.UnderRace(t)
     ctx := context.Background()
-    s, spy, teardown := setupTestServer(t, ctx)
-    tenantID := roachpb.MustMakeTenantID(2)
-    tt, err := s.TenantController().StartTenant(ctx, base.TestTenantArgs{
-        TenantID: tenantID,
-    })
+    _, spy, teardown := setupTestServer(t, ctx)
     spy.Logs()
-    require.NoError(t, err)
-    require.NotNil(t, tt)
     // TODO (brian): the jobs system isn't registering this correctly,
     // this will be fixed in a short follow pr.
     defer teardown()
@@ -188,65 +189,78 @@ func (spy *hotRangesLogSpy) Reset() {
     spy.mu.logs = nil
 }
 
+type testHotRangeGetter struct{}
+
+func (t testHotRangeGetter) HotRangesV2(
+    ctx context.Context, req *serverpb.HotRangesRequest,
+) (*serverpb.HotRangesResponseV2, error) {
+    if req.PerNodeLimit == 1 {
+        return &serverpb.HotRangesResponseV2{
+            Ranges: []*serverpb.HotRangesResponseV2_HotRange{
+                {
+                    RangeID:          1,
+                    CPUTimePerSecond: float64(100 * time.Millisecond),
+                    QPS:              float64(100),
+                },
+            },
+        }, nil
+    }
+    return &serverpb.HotRangesResponseV2{
+        Ranges: []*serverpb.HotRangesResponseV2_HotRange{
+            {
+                RangeID:          1,
+                CPUTimePerSecond: float64(100 * time.Millisecond),
+                QPS:              float64(100),
+            },
+            {
+                RangeID:          2,
+                CPUTimePerSecond: float64(1300 * time.Millisecond),
+                QPS:              float64(100),
+            },
+            {
+                RangeID:          3,
+                CPUTimePerSecond: float64(900 * time.Millisecond),
+                QPS:              float64(100),
+            },
+        },
+    }, nil
+}
+
+var _ structlogging.HotRangeGetter = testHotRangeGetter{}
+
 // setupTestServer is a somewhat lengthy warmup process
 // to ensure that the hot ranges tests are ready to run.
 // It sets up a cluster, runs it until the hot range stats are
 // warm by dialing the knobs to noisy, and checking for output,
 // then redials the knobs back to quiet so the test can take over.
 func setupTestServer(
     t *testing.T, ctx context.Context,
-) (serverutils.TestServerInterface, *hotRangesLogSpy, func()) {
+) (*cluster.Settings, *hotRangesLogSpy, func()) {
     sc := log.ScopeWithoutShowLogs(t)
     spy := &hotRangesLogSpy{t: t}
 
     // override internal settings.
     structlogging.ReportTopHottestRanges = 1000
     structlogging.CheckInterval = 100 * time.Millisecond
 
-    s := serverutils.StartServerOnly(t, base.TestServerArgs{
-        DefaultTestTenant: base.TestControlsTenantsExplicitly,
-        Knobs: base.TestingKnobs{
-            Store: &kvserver.StoreTestingKnobs{
-                ReplicaPlannerKnobs: plan.ReplicaPlannerTestingKnobs{
-                    DisableReplicaRebalancing: true,
-                },
-            },
-        },
-    })
-
-    leakChecker := leaktest.AfterTest(t)
     logInterceptor := log.InterceptWith(ctx, spy)
-    stopper := s.Stopper()
+    stopper := stop.NewStopper()
+    settings := cluster.MakeTestingClusterSettings()
     teardown := func() {
         stopper.Stop(ctx)
         sc.Close(t)
         logInterceptor()
-        leakChecker()
     }
 
-    ts := s.ApplicationLayer()
-
     // lower settings so that we can wait for the stats to warm.
-    structlogging.TelemetryHotRangesStatsEnabled.Override(ctx, &ts.ClusterSettings().SV, true)
-    structlogging.TelemetryHotRangesStatsInterval.Override(ctx, &ts.ClusterSettings().SV, time.Millisecond)
-    structlogging.TelemetryHotRangesStatsLoggingDelay.Override(ctx, &ts.ClusterSettings().SV, 0*time.Millisecond)
-
-    // simulate some queries.
-    for range 1000 {
-        _, err := ts.SQLConn(t).Exec("SELECT * FROM system.namespace")
-        require.NoError(t, err)
-    }
-
-    testutils.SucceedsSoon(t, func() error {
-        logs := spy.Logs()
+    structlogging.TelemetryHotRangesStatsEnabled.Override(ctx, &settings.SV, true)
+    structlogging.TelemetryHotRangesStatsInterval.Override(ctx, &settings.SV, time.Millisecond)
+    structlogging.TelemetryHotRangesStatsLoggingDelay.Override(ctx, &settings.SV, 0*time.Millisecond)
 
-        if hasNonZeroQPSRange(logs) {
-            return nil
-        }
-        return errors.New("waited too long for the synthetic data")
-    })
+    err := structlogging.StartHotRangesLoggingScheduler(ctx, stopper, testHotRangeGetter{}, settings, nil)
+    require.NoError(t, err)
 
-    return s, spy, teardown
+    return settings, spy, teardown
 }
 
 // Utility function which generally indicates that the hot ranges
@@ -273,8 +287,7 @@ func setupTest(
     structlogging.TelemetryHotRangesStatsInterval.Override(ctx, &st.SV, logInterval)
     structlogging.TelemetryHotRangesStatsCPUThreshold.Override(ctx, &st.SV, logCPUThreshold)
     structlogging.CheckInterval = tickerInterval
-    // wait for the activity from the previous test to drain.
-    time.Sleep(100 * time.Millisecond)
     structlogging.TestLoopChannel <- struct{}{}
+    log.FlushAllSync()
     spy.Reset()
 }

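Throughout the rewritten test, fixed time.Sleep waits are replaced with testutils.SucceedsSoon, which retries an assertion until it passes or a deadline expires. A rough standalone sketch of that polling pattern follows, assuming a simple exponential backoff; the real helper's signature takes a testing.TB and its tuning may differ.

```go
package main

import (
	"errors"
	"fmt"
	"time"
)

// succeedsSoon polls fn with backoff until it returns nil or the
// timeout elapses. This is a generic stand-in, not the cockroach code.
func succeedsSoon(timeout time.Duration, fn func() error) error {
	deadline := time.Now().Add(timeout)
	wait := time.Millisecond
	for {
		err := fn()
		if err == nil {
			return nil
		}
		if time.Now().After(deadline) {
			return fmt.Errorf("condition never met: %w", err)
		}
		time.Sleep(wait)
		if wait < 200*time.Millisecond {
			wait *= 2 // back off to avoid busy-waiting
		}
	}
}

func main() {
	start := time.Now()
	err := succeedsSoon(5*time.Second, func() error {
		if time.Since(start) < 50*time.Millisecond {
			return errors.New("not yet")
		}
		return nil
	})
	fmt.Println(err) // <nil>
}
```

Compared to a fixed sleep, this keeps the fast path fast and makes slow CI machines fail with a diagnostic error instead of a flake.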
pkg/server/structlogging/main_test.go

Lines changed: 2 additions & 0 deletions
@@ -24,3 +24,5 @@ func TestMain(m *testing.M) {
     serverutils.InitTestClusterFactory(testcluster.TestClusterFactory)
     os.Exit(m.Run())
 }
+
+//go:generate ../../util/leaktest/add-leaktest.sh *_test.go

pkg/server/tenant.go

Lines changed: 0 additions & 1 deletion
@@ -994,7 +994,6 @@ func (s *SQLServerWrapper) AcceptClients(ctx context.Context) error {
         ctx,
         s.stopper,
         s.sqlServer.tenantConnect,
-        *s.sqlServer.internalExecutor,
         s.ClusterSettings(),
         &ti,
     ); err != nil {

0 commit comments