@@ -25,7 +25,16 @@ import (
2525)
2626
// ReportTopHottestRanges limits the number of ranges to be reported per iteration.
// It is an int32 var (previously a const) so it can be assigned directly to
// HotRangesRequest.PerNodeLimit; presumably left mutable so tests can
// override it — TODO confirm.
var ReportTopHottestRanges int32 = 5

// CheckInterval is the interval at which the system checks
// whether or not to log the hot ranges.
// NOTE(review): a package-level var, apparently so tests can shorten
// the wait — confirm against callers.
var CheckInterval = time.Second

// TestLoopChannel triggers the hot ranges logging loop to start again.
// It's useful in the context of a test, where we don't want to wait
// for whatever the last time the interval was.
// The buffer of 1 lets a test send a nudge without blocking.
var TestLoopChannel = make(chan struct{}, 1)
2938
3039var TelemetryHotRangesStatsInterval = settings .RegisterDurationSetting (
3140 settings .ApplicationLevel ,
@@ -48,18 +57,38 @@ var TelemetryHotRangesStatsLoggingDelay = settings.RegisterDurationSetting(
4857 1 * time .Second ,
4958)
5059
// TelemetryHotRangesStatsCPUThreshold defines the cpu duration
// per second which needs to be exceeded for the system to automatically
// log the hot ranges. It tracks the reasoning that the kv layer
// uses to determine when to begin sampling reads for a given
// range in the keyspace; more information can be found where the cluster
// setting SplitByLoadCPUThreshold is defined.
var TelemetryHotRangesStatsCPUThreshold = settings.RegisterDurationSetting(
	settings.SystemOnly,
	"server.telemetry.hot_ranges_stats.cpu_threshold",
	"the cpu time over which the system will automatically begin logging hot ranges",
	// Default: 250ms of CPU time per second of wall time.
	time.Second/4,
)
72+
// hotRangesLoggingScheduler is responsible for logging hot ranges
// on a scheduled interval.
// NOTE(review): the previous comment said "index usage stats", which
// looks like a copy/paste leftover from another scheduler.
type hotRangesLoggingScheduler struct {
	sServer serverpb.TenantStatusServer // status server used to fetch hot ranges via HotRangesV2
	st      *cluster.Settings           // cluster settings: enabled flag, interval, cpu threshold
	stopper *stop.Stopper
	job     *jobs.Job
	// multiTenant is true for app tenants in a multi-tenant deployment;
	// it widens the hot-ranges request to a fanout (Nodes left unset)
	// and slows the check loop (5x CheckInterval).
	multiTenant bool
	// lastLogged is the last time hot ranges were emitted; shouldLog
	// compares it against the logging interval.
	lastLogged time.Time
}
6183
62- // StartHotRangesLoggingScheduler starts the capture index usage statistics logging scheduler.
84+ // StartHotRangesLoggingScheduler starts the hot range log task
85+ // or job.
86+ //
87+ // For system tenants, or single tenant deployments, it runs as
88+ // a task on each node, logging only the ranges on the node in
89+ // which it runs. For app tenants in a multi-tenant deployment,
90+ // it runs on a single node in the sql cluster, applying a fanout
91+ // to the kv layer to collect the hot ranges from all nodes.
6392func StartHotRangesLoggingScheduler (
6493 ctx context.Context ,
6594 stopper * stop.Stopper ,
@@ -68,30 +97,32 @@ func StartHotRangesLoggingScheduler(
6897 st * cluster.Settings ,
6998 ti * tenantcapabilities.Entry ,
7099) error {
71- multiTenant := ti != nil && ! ti .TenantID .IsSystem ()
100+ multiTenant := ti != nil && ti . TenantID . IsSet () && ! ti .TenantID .IsSystem ()
72101 scheduler := hotRangesLoggingScheduler {
73- ie : ie ,
74102 sServer : sServer ,
75103 st : st ,
76104 stopper : stopper ,
77105 multiTenant : multiTenant ,
106+ lastLogged : timeutil .Now (),
78107 }
79108
80109 if multiTenant {
81- return scheduler .startJob (ctx , stopper )
110+ return scheduler .startJob ()
82111 }
83112
84113 return scheduler .startTask (ctx , stopper )
85114}
86115
116+ // startTask is for usage in a system-tenant or non-multi-tenant
117+ // installation.
87118func (s * hotRangesLoggingScheduler ) startTask (ctx context.Context , stopper * stop.Stopper ) error {
88119 return stopper .RunAsyncTask (ctx , "hot-ranges-stats" , func (ctx context.Context ) {
89120 err := s .start (ctx , stopper )
90121 log .Warningf (ctx , "hot ranges stats logging scheduler stopped: %s" , err )
91122 })
92123}
93124
94- func (s * hotRangesLoggingScheduler ) startJob (ctx context. Context , stopper * stop. Stopper ) error {
125+ func (s * hotRangesLoggingScheduler ) startJob () error {
95126 jobs .RegisterConstructor (
96127 jobspb .TypeHotRangesLogger ,
97128 func (job * jobs.Job , settings * cluster.Settings ) jobs.Resumer {
@@ -103,57 +134,99 @@ func (s *hotRangesLoggingScheduler) startJob(ctx context.Context, stopper *stop.
103134}
104135
105136func (s * hotRangesLoggingScheduler ) start (ctx context.Context , stopper * stop.Stopper ) error {
106- intervalChangedChan := make (chan struct {})
107- // We have to register this callback first. Otherwise we may run into
108- // an unlikely but possible scenario where we've started the ticker,
109- // and the setting is changed before we register the callback and the
110- // ticker will not be reset to the new value.
111- TelemetryHotRangesStatsInterval .SetOnChange (& s .st .SV , func (ctx context.Context ) {
112- intervalChangedChan <- struct {}{}
113- })
114-
115- ticker := time .NewTicker (TelemetryHotRangesStatsInterval .Get (& s .st .SV ))
116-
117137 for {
138+ ci := CheckInterval
139+ if s .multiTenant {
140+ ci *= 5
141+ }
118142 select {
119143 case <- stopper .ShouldQuiesce ():
120144 return nil
121145 case <- ctx .Done ():
122146 return nil
123- case <- ticker . C :
147+ case <- time . After ( ci ) :
124148 s .maybeLogHotRanges (ctx , stopper )
125- case <- intervalChangedChan :
126- ticker . Reset ( TelemetryHotRangesStatsInterval . Get ( & s . st . SV ))
149+ case <- TestLoopChannel :
150+ continue
127151 }
128152 }
129153}
130154
131155// maybeLogHotRanges is a small helper function which couples the
132156// functionality of checking whether to log and logging.
133157func (s * hotRangesLoggingScheduler ) maybeLogHotRanges (ctx context.Context , stopper * stop.Stopper ) {
134- if s .shouldLog () {
158+ if s .shouldLog (ctx ) {
135159 s .logHotRanges (ctx , stopper )
160+ s .lastLogged = timeutil .Now ()
161+ }
162+ }
163+
164+ // shouldLog checks the below conditions to see whether it
165+ // should emit logs.
166+ //
167+ // To return true, we verify that both:
168+ // - The logging setting is enabled.
169+ // - One of the following conditions is met:
170+ // -- It's been greater than the log interval since we last logged.
171+ // -- One of the replicas see exceeds our cpu threshold.
172+ func (s * hotRangesLoggingScheduler ) shouldLog (ctx context.Context ) bool {
173+ enabled := TelemetryHotRangesStatsEnabled .Get (& s .st .SV )
174+ if ! enabled {
175+ return false
176+ }
177+
178+ logInterval := TelemetryHotRangesStatsInterval .Get (& s .st .SV )
179+ if timeutil .Since (s .lastLogged ) > logInterval {
180+ return true
136181 }
182+
183+ // Getting the hot ranges with the statsOnly flag will
184+ // ensure the call doesn't touch the keyspace. Therefore
185+ // drastically lightening the overhead of fetching them.
186+ resp , err := s .getHotRanges (context .Background (), true )
187+ if err != nil {
188+ log .Warningf (ctx , "failed to get hot ranges: %s" , err )
189+ return false
190+ }
191+ cpuThreshold := TelemetryHotRangesStatsCPUThreshold .Get (& s .st .SV )
192+ return maxCPU (resp .Ranges ) > cpuThreshold
137193}
138194
139- // shouldLog checks the below conditions to see whether it should emit logs.
140- // - Is the cluster setting server.telemetry.hot_ranges_stats.enabled true?
141- func (s * hotRangesLoggingScheduler ) shouldLog () bool {
142- return TelemetryHotRangesStatsEnabled .Get (& s .st .SV )
195+ func maxCPU (ranges []* serverpb.HotRangesResponseV2_HotRange ) time.Duration {
196+ maxSeen := float64 (0 )
197+ for _ , r := range ranges {
198+ if r .CPUTimePerSecond > maxSeen {
199+ maxSeen = r .CPUTimePerSecond
200+ }
201+ }
202+ return time .Duration (maxSeen )
143203}
144204
145- // logHotRanges collects the hot ranges from this node's status server and
146- // sends them to the TELEMETRY log channel.
147- func (s * hotRangesLoggingScheduler ) logHotRanges (ctx context.Context , stopper * stop.Stopper ) {
148- req := & serverpb.HotRangesRequest {}
205+ // getHotRanges is a simple utility function for making a hot ranges
206+ // request to the status server. It can be used to fetch only the
207+ // stats for ranges requested, or everything. It also determines
208+ // whether to limit the request to only the local node, or to
209+ // issue a fanout for multi-tenant apps.
210+ func (s * hotRangesLoggingScheduler ) getHotRanges (
211+ ctx context.Context , statsOnly bool ,
212+ ) (* serverpb.HotRangesResponseV2 , error ) {
213+ req := & serverpb.HotRangesRequest {
214+ PerNodeLimit : ReportTopHottestRanges ,
215+ StatsOnly : statsOnly ,
216+ }
149217
150218 // if we are running in single tenant mode, only log the ranges on the status server.
151219 if ! s .multiTenant {
152220 req .Nodes = []string {"local" }
153- req .PageSize = ReportTopHottestRanges
154221 }
155222
156- resp , err := s .sServer .HotRangesV2 (ctx , req )
223+ return s .sServer .HotRangesV2 (ctx , req )
224+ }
225+
226+ // logHotRanges collects the hot ranges from this node's status server and
227+ // sends them to the HEALTH log channel.
228+ func (s * hotRangesLoggingScheduler ) logHotRanges (ctx context.Context , stopper * stop.Stopper ) {
229+ resp , err := s .getHotRanges (ctx , false )
157230 if err != nil {
158231 log .Warningf (ctx , "failed to get hot ranges: %s" , err )
159232 return
0 commit comments