@@ -7,6 +7,9 @@ package changefeedccl
7
7
8
8
import (
9
9
"context"
10
+ "fmt"
11
+ "math"
12
+ "sync/atomic"
10
13
"testing"
11
14
"time"
12
15
@@ -17,6 +20,7 @@ import (
17
20
"github.com/cockroachdb/cockroach/pkg/jobs/jobspb"
18
21
"github.com/cockroachdb/cockroach/pkg/roachpb"
19
22
"github.com/cockroachdb/cockroach/pkg/sql/catalog/desctestutils"
23
+ "github.com/cockroachdb/cockroach/pkg/sql/execinfra"
20
24
"github.com/cockroachdb/cockroach/pkg/sql/isql"
21
25
"github.com/cockroachdb/cockroach/pkg/sql/rowenc"
22
26
"github.com/cockroachdb/cockroach/pkg/testutils"
@@ -40,7 +44,8 @@ func TestChangefeedFrontierPersistence(t *testing.T) {
40
44
ctx := context .Background ()
41
45
42
46
// Set a short interval for frontier persistence.
43
- sqlDB .Exec (t , "SET CLUSTER SETTING changefeed.progress.frontier_persistence.interval = '5s'" )
47
+ changefeedbase .FrontierPersistenceInterval .Override (ctx ,
48
+ & s .Server .ClusterSettings ().SV , 5 * time .Second )
44
49
45
50
// Get frontier persistence metric.
46
51
registry := s .Server .JobRegistry ().(* jobs.Registry )
@@ -179,3 +184,128 @@ RETURNING cluster_logical_timestamp()`).Scan(&tsStr)
179
184
180
185
cdcTest (t , testFn , feedTestEnterpriseSinks )
181
186
}
187
+
188
+ func TestChangefeedProgressSkewMetrics (t * testing.T ) {
189
+ defer leaktest .AfterTest (t )()
190
+ defer log .Scope (t ).Close (t )
191
+
192
+ testutils .RunTrueAndFalse (t , "per-table tracking" , func (t * testing.T , perTableTracking bool ) {
193
+ testFn := func (t * testing.T , s TestServer , f cdctest.TestFeedFactory ) {
194
+ sqlDB := sqlutils .MakeSQLRunner (s .DB )
195
+ ctx := context .Background ()
196
+
197
+ // Enable/disable per-table tracking.
198
+ changefeedbase .TrackPerTableProgress .Override (ctx ,
199
+ & s .Server .ClusterSettings ().SV , perTableTracking )
200
+
201
+ registry := s .Server .JobRegistry ().(* jobs.Registry )
202
+ aggMetrics := registry .MetricsStruct ().Changefeed .(* Metrics ).AggMetrics
203
+ const scope = "skew"
204
+ scopedMetrics , err := aggMetrics .getOrCreateScope (scope )
205
+ require .NoError (t , err )
206
+
207
+ // Progress skew metrics should start at zero.
208
+ require .Zero (t , aggMetrics .SpanProgressSkew .Value ())
209
+ require .Zero (t , aggMetrics .TableProgressSkew .Value ())
210
+ require .Zero (t , scopedMetrics .SpanProgressSkew .Value ())
211
+ require .Zero (t , scopedMetrics .TableProgressSkew .Value ())
212
+
213
+ // Create two tables and insert some initial data.
214
+ sqlDB .Exec (t , `CREATE TABLE foo (a INT PRIMARY KEY)` )
215
+ sqlDB .Exec (t , `CREATE TABLE bar (b INT PRIMARY KEY)` )
216
+ sqlDB .Exec (t , `INSERT INTO foo VALUES (1), (2), (3)` )
217
+ sqlDB .Exec (t , `INSERT INTO bar VALUES (1), (2), (3)` )
218
+
219
+ // Set up testing knobs to block all progress updates for bar.
220
+ var blockBarProgress atomic.Bool
221
+ blockBarProgress .Store (true )
222
+ {
223
+ barTableSpan := desctestutils .
224
+ TestingGetPublicTableDescriptor (s .Server .DB (), s .Codec , "d" , "bar" ).
225
+ PrimaryIndexSpan (s .Codec )
226
+
227
+ knobs := s .TestingKnobs .
228
+ DistSQL .(* execinfra.TestingKnobs ).
229
+ Changefeed .(* TestingKnobs )
230
+
231
+ knobs .FilterSpanWithMutation = func (rs * jobspb.ResolvedSpan ) (bool , error ) {
232
+ if blockBarProgress .Load () && barTableSpan .Contains (rs .Span ) {
233
+ return true , nil
234
+ }
235
+ return false , nil
236
+ }
237
+ }
238
+
239
+ // Create changefeed for both tables with no initial scan.
240
+ feed := feed (t , f , fmt .Sprintf (`CREATE CHANGEFEED FOR foo, bar
241
+ WITH no_initial_scan, min_checkpoint_frequency='1s', resolved, metrics_label='%s'` , scope ))
242
+ defer closeFeed (t , feed )
243
+
244
+ assertSpanSkewInRange := func (start int64 , end int64 ) int64 {
245
+ var spanSkew int64
246
+ testutils .SucceedsSoon (t , func () error {
247
+ spanSkew = aggMetrics .SpanProgressSkew .Value ()
248
+ scopedSpanSkew := scopedMetrics .SpanProgressSkew .Value ()
249
+ if spanSkew != scopedSpanSkew {
250
+ return errors .Newf ("aggregate and scoped span skew don't match" )
251
+ }
252
+ if spanSkew < start {
253
+ return errors .Newf ("expected span skew to be at least %d, got %d" , start , spanSkew )
254
+ }
255
+ if spanSkew >= end {
256
+ return errors .Newf ("expected span skew to be less than %d, got %d" , end , spanSkew )
257
+ }
258
+ return nil
259
+ })
260
+ return spanSkew
261
+ }
262
+ assertTableSkewInRange := func (start int64 , end int64 ) int64 {
263
+ var tableSkew int64
264
+ testutils .SucceedsSoon (t , func () error {
265
+ tableSkew = aggMetrics .TableProgressSkew .Value ()
266
+ scopedTableSkew := scopedMetrics .TableProgressSkew .Value ()
267
+ if tableSkew != scopedTableSkew {
268
+ return errors .Newf ("aggregate and scoped table skew don't match" )
269
+ }
270
+ if ! perTableTracking {
271
+ if tableSkew != 0 {
272
+ return errors .Newf ("expected table skew to be 0, got %d" , tableSkew )
273
+ }
274
+ return nil
275
+ }
276
+ if tableSkew < start {
277
+ return errors .Newf ("expected table skew to be at least %d, got %d" , start , tableSkew )
278
+ }
279
+ if tableSkew >= end {
280
+ return errors .Newf ("expected table skew to be less than %d, got %d" , end , tableSkew )
281
+ }
282
+ return nil
283
+ })
284
+ return tableSkew
285
+ }
286
+
287
+ // Verify that progress skew metrics show a non-negligible amount of lag
288
+ // since bar progress is blocked. Some amount of skew is often unavoidable
289
+ // due to the fact the aggregator processes the rangefeed checkpoints for
290
+ // different spans separately and at the time of a flush, may have only
291
+ // processed a portion of the checkpoints for a specific closed timestamp.
292
+ // The duration of 5s has been chosen given the default closed timestamp
293
+ // interval is 3s.
294
+ startingSpanSkew := assertSpanSkewInRange (int64 (5 * time .Second ), math .MaxInt64 )
295
+ startingTableSkew := assertTableSkewInRange (int64 (5 * time .Second ), math .MaxInt64 )
296
+
297
+ // Verify that skew continues to increase since bar progress is still blocked.
298
+ assertSpanSkewInRange (startingSpanSkew + int64 (5 * time .Second ), math .MaxInt64 )
299
+ assertTableSkewInRange (startingTableSkew + int64 (5 * time .Second ), math .MaxInt64 )
300
+
301
+ // Re-enable progress updates for bar.
302
+ blockBarProgress .Store (false )
303
+
304
+ // Verify that skew drops below the skew observed at the start.
305
+ assertSpanSkewInRange (0 , startingSpanSkew )
306
+ assertTableSkewInRange (0 , startingTableSkew )
307
+ }
308
+
309
+ cdcTest (t , testFn )
310
+ })
311
+ }
0 commit comments