
Commit 101352f

craig[bot], kyle-a-wong, kev-cao, angles-n-daemons, and yuzefovich committed
152275: log,*: find and replace log.*Warning* with log.Dev.*Warning* r=kyle-a-wong a=kyle-a-wong

Epic: CRDB-53410
Release note: None

152348: backup: remove mixed-version check for incremental path suffix r=msbutler a=kev-cao

The `start_time` suffix was added to incremental backup paths in 25.2. A mixed-version check was added to ensure that backups written in the mixed-version state while upgrading to 25.2 would be written correctly. This check is no longer needed.

Epic: None
Release note: None

152366: unsafesql: convert tests to table tests r=angles-n-daemons a=angles-n-daemons

This is just a bit of cleanup in the unsafesql tests: many of the existing tests share a similar structure, so I created a table-test layout so that they work better together.

Fixes: none
Epic: none
Release note: none

152371: jobs: mark a few auto partial stats metrics as essential r=yuzefovich a=yuzefovich

To match the metrics we mark as "essential" for AUTO CREATE STATS jobs, we now also mark the following AUTO CREATE PARTIAL STATS job metrics as essential:

- `jobs.auto_create_partial_stats.currently_paused`
- `jobs.auto_create_partial_stats.currently_running`
- `jobs.auto_create_partial_stats.resume_failed`

Epic: CRDB-52656
Release note: None

Co-authored-by: Kyle Wong <[email protected]>
Co-authored-by: Kevin Cao <[email protected]>
Co-authored-by: Brian Dillmann <[email protected]>
Co-authored-by: Yahor Yuzefovich <[email protected]>
5 parents 7b270fe + 2a0c14d + 0a5989d + 3e2aa8c + a081d93 commit 101352f
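For context on the 152366 change, the sketch below shows the general shape of a Go table test as described in that PR. It is a hypothetical, minimal example: the test name, cases, and the `checkAccess` closure are stand-ins, not the actual unsafesql test code.

```go
package unsafesql_test

import (
	"errors"
	"testing"
)

// TestTableAccessLayout illustrates the table-test layout: one shared test
// body iterates over a slice of cases instead of copy-pasted test functions.
func TestTableAccessLayout(t *testing.T) {
	errUnsafe := errors.New("unsafe internal table access")

	// checkAccess stands in for whatever behavior the real tests exercise.
	checkAccess := func(allowUnsafe bool) error {
		if !allowUnsafe {
			return errUnsafe
		}
		return nil
	}

	testCases := []struct {
		name        string
		allowUnsafe bool
		wantErr     bool
	}{
		{name: "blocked by default", allowUnsafe: false, wantErr: true},
		{name: "allowed with override", allowUnsafe: true, wantErr: false},
	}

	for _, tc := range testCases {
		t.Run(tc.name, func(t *testing.T) {
			err := checkAccess(tc.allowUnsafe)
			if gotErr := err != nil; gotErr != tc.wantErr {
				t.Fatalf("gotErr = %v, wantErr = %v (err: %v)", gotErr, tc.wantErr, err)
			}
		})
	}
}
```

Adding a new scenario then only requires appending a case to the slice, which is the maintenance win the PR description points at.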

333 files changed, +1058 −1031 lines changed

docs/generated/metrics/metrics.yaml

Lines changed: 34 additions & 28 deletions
@@ -330,6 +330,39 @@ layers:
     essential: true
 - name: SQL
   metrics:
+  - name: jobs.auto_create_partial_stats.currently_paused
+    exported_name: jobs_auto_create_partial_stats_currently_paused
+    labeled_name: 'jobs{name: auto_create_partial_stats, status: currently_paused}'
+    description: Number of auto_create_partial_stats jobs currently considered Paused
+    y_axis_label: jobs
+    type: GAUGE
+    unit: COUNT
+    aggregation: AVG
+    derivative: NONE
+    how_to_use: This metric is a high-level indicator that automatically generated partial statistics jobs are paused which can lead to the query optimizer running with stale statistics. Stale statistics can cause suboptimal query plans to be selected leading to poor query performance.
+    essential: true
+  - name: jobs.auto_create_partial_stats.currently_running
+    exported_name: jobs_auto_create_partial_stats_currently_running
+    labeled_name: 'jobs{type: auto_create_partial_stats, status: currently_running}'
+    description: Number of auto_create_partial_stats jobs currently running in Resume or OnFailOrCancel state
+    y_axis_label: jobs
+    type: GAUGE
+    unit: COUNT
+    aggregation: AVG
+    derivative: NONE
+    how_to_use: This metric tracks the number of active automatically generated partial statistics jobs that could also be consuming resources. Ensure that foreground SQL traffic is not impacted by correlating this metric with SQL latency and query volume metrics.
+    essential: true
+  - name: jobs.auto_create_partial_stats.resume_failed
+    exported_name: jobs_auto_create_partial_stats_resume_failed
+    labeled_name: 'jobs.resume{name: auto_create_partial_stats, status: failed}'
+    description: Number of auto_create_partial_stats jobs which failed with a non-retriable error
+    y_axis_label: jobs
+    type: COUNTER
+    unit: COUNT
+    aggregation: AVG
+    derivative: NON_NEGATIVE_DERIVATIVE
+    how_to_use: This metric is a high-level indicator that automatically generated partial table statistics is failing. Failed statistic creation can lead to the query optimizer running with stale statistics. Stale statistics can cause suboptimal query plans to be selected leading to poor query performance.
+    essential: true
   - name: jobs.auto_create_stats.currently_paused
     exported_name: jobs_auto_create_stats_currently_paused
     labeled_name: 'jobs{name: auto_create_stats, status: currently_paused}'
@@ -394,7 +427,7 @@ layers:
     unit: COUNT
     aggregation: AVG
     derivative: NONE
-    how_to_use: This metric tracks the number of active create statistics jobs that may be consuming resources. Ensure that foreground SQL traffic is not impacted by correlating this metric with SQL latency and query volume metrics.
+    how_to_use: This metric tracks the number of active create statistics jobs that could also be consuming resources. Ensure that foreground SQL traffic is not impacted by correlating this metric with SQL latency and query volume metrics.
     essential: true
   - name: schedules.BACKUP.failed
     exported_name: schedules_BACKUP_failed
@@ -4055,24 +4088,6 @@ layers:
     unit: COUNT
     aggregation: AVG
     derivative: NONE
-  - name: jobs.auto_create_partial_stats.currently_paused
-    exported_name: jobs_auto_create_partial_stats_currently_paused
-    labeled_name: 'jobs{name: auto_create_partial_stats, status: currently_paused}'
-    description: Number of auto_create_partial_stats jobs currently considered Paused
-    y_axis_label: jobs
-    type: GAUGE
-    unit: COUNT
-    aggregation: AVG
-    derivative: NONE
-  - name: jobs.auto_create_partial_stats.currently_running
-    exported_name: jobs_auto_create_partial_stats_currently_running
-    labeled_name: 'jobs{type: auto_create_partial_stats, status: currently_running}'
-    description: Number of auto_create_partial_stats jobs currently running in Resume or OnFailOrCancel state
-    y_axis_label: jobs
-    type: GAUGE
-    unit: COUNT
-    aggregation: AVG
-    derivative: NONE
   - name: jobs.auto_create_partial_stats.expired_pts_records
     exported_name: jobs_auto_create_partial_stats_expired_pts_records
     labeled_name: 'jobs.expired_pts_records{type: auto_create_partial_stats}'
@@ -4136,15 +4151,6 @@ layers:
     unit: COUNT
     aggregation: AVG
     derivative: NON_NEGATIVE_DERIVATIVE
-  - name: jobs.auto_create_partial_stats.resume_failed
-    exported_name: jobs_auto_create_partial_stats_resume_failed
-    labeled_name: 'jobs.resume{name: auto_create_partial_stats, status: failed}'
-    description: Number of auto_create_partial_stats jobs which failed with a non-retriable error
-    y_axis_label: jobs
-    type: COUNTER
-    unit: COUNT
-    aggregation: AVG
-    derivative: NON_NEGATIVE_DERIVATIVE
   - name: jobs.auto_create_partial_stats.resume_retry_error
     exported_name: jobs_auto_create_partial_stats_resume_retry_error
     labeled_name: 'jobs.resume{name: auto_create_partial_stats, status: retry_error}'
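The three entries added above are generated from the job metrics' metadata in Go. As a rough sketch of what marking such a metric "essential" with usage guidance might look like, assuming the `metric.Metadata` struct exposes fields corresponding to the `essential` and `how_to_use` YAML keys (those two field names are assumptions, not verified against `pkg/util/metric`):

```go
package jobs // hypothetical placement; the real definitions live elsewhere in pkg/jobs

import "github.com/cockroachdb/cockroach/pkg/util/metric"

// metaAutoCreatePartialStatsPaused is a hypothetical sketch only: the
// Essential and HowToUse field names mirror the generated YAML keys
// (essential, how_to_use) and may not match the actual struct exactly.
var metaAutoCreatePartialStatsPaused = metric.Metadata{
	Name:        "jobs.auto_create_partial_stats.currently_paused",
	Help:        "Number of auto_create_partial_stats jobs currently considered Paused",
	Measurement: "jobs",
	Unit:        metric.Unit_COUNT,
	Essential:   true,
	HowToUse:    "Indicator that automatic partial statistics jobs are paused, which can leave the optimizer with stale statistics.",
}
```

Marking a metric essential is what promotes it from the long tail of generated metrics (the entries removed near line 4088 above) into the curated section at the top of the file.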

pkg/acceptance/cluster/docker.go

Lines changed: 3 additions & 3 deletions
@@ -307,7 +307,7 @@ func (c *Container) WaitUntilNotRunning(ctx context.Context) error {
 
 	out := io.MultiWriter(cmdLog, os.Stderr)
 	if err := c.Logs(ctx, out); err != nil {
-		log.Warningf(ctx, "%v", err)
+		log.Dev.Warningf(ctx, "%v", err)
 	}
 
 	if exitCode := waitOKBody.StatusCode; exitCode != 0 {
@@ -405,7 +405,7 @@ func (cli resilientDockerClient) ContainerStart(
 		// Keep going if ContainerStart timed out, but client's context is not
 		// expired.
 		if errors.Is(err, context.DeadlineExceeded) && clientCtx.Err() == nil {
-			log.Warningf(clientCtx, "ContainerStart timed out, retrying")
+			log.Dev.Warningf(clientCtx, "ContainerStart timed out, retrying")
			continue
 		}
 		return err
@@ -452,7 +452,7 @@ func (cli resilientDockerClient) ContainerCreate(
 			return cli.ContainerCreate(ctx, config, hostConfig, networkingConfig, platformSpec, containerName)
 		}
 	}
-	log.Warningf(ctx, "error indicated existing container %s, "+
+	log.Dev.Warningf(ctx, "error indicated existing container %s, "+
 		"but none found:\nerror: %s\ncontainers: %+v",
 		containerName, err, containers)
 	// We likely raced with a previous (late) removal of the container.
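The same mechanical substitution from 152275 repeats throughout the remaining files below. A minimal, hypothetical sketch of the call-site change (the helper and its name are illustrative; only the `log.Dev.Warningf` call shape is taken from this commit):

```go
package main

import (
	"context"

	"github.com/cockroachdb/cockroach/pkg/util/log"
)

// warnOnCleanupError is a hypothetical helper showing the before/after shape:
// the DEV channel is now named explicitly rather than going through the
// channel-less log.Warningf alias.
func warnOnCleanupError(ctx context.Context, err error) {
	if err != nil {
		// Before this commit: log.Warningf(ctx, "%v", err)
		log.Dev.Warningf(ctx, "%v", err) // after: explicit DEV channel
	}
}

func main() {
	warnOnCleanupError(context.Background(), nil)
}
```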

pkg/acceptance/cluster/dockercluster.go

Lines changed: 3 additions & 3 deletions
@@ -303,7 +303,7 @@ func (l *DockerCluster) createNetwork(ctx context.Context) {
 	})
 	maybePanic(err)
 	if resp.Warning != "" {
-		log.Warningf(ctx, "creating network: %s", resp.Warning)
+		log.Dev.Warningf(ctx, "creating network: %s", resp.Warning)
 	}
 	l.networkID = resp.ID
 }
@@ -920,7 +920,7 @@ func (l *DockerCluster) ExecCLI(ctx context.Context, i int, cmd []string) (strin
 func (l *DockerCluster) Cleanup(ctx context.Context, preserveLogs bool) {
 	volumes, err := os.ReadDir(l.volumesDir)
 	if err != nil {
-		log.Warningf(ctx, "%v", err)
+		log.Dev.Warningf(ctx, "%v", err)
 		return
 	}
 	for _, v := range volumes {
@@ -929,7 +929,7 @@ func (l *DockerCluster) Cleanup(ctx context.Context, preserveLogs bool) {
 			continue
 		}
 		if err := os.RemoveAll(filepath.Join(l.volumesDir, v.Name())); err != nil {
-			log.Warningf(ctx, "%v", err)
+			log.Dev.Warningf(ctx, "%v", err)
 		}
 	}
 }

pkg/acceptance/localcluster/cluster.go

Lines changed: 7 additions & 7 deletions
@@ -564,10 +564,10 @@ func (n *Node) startAsyncInnerLocked(ctx context.Context, joins ...string) error
 
 	if err := n.cmd.Start(); err != nil {
 		if err := stdout.Close(); err != nil {
-			log.Warningf(ctx, "%v", err)
+			log.Dev.Warningf(ctx, "%v", err)
 		}
 		if err := stderr.Close(); err != nil {
-			log.Warningf(ctx, "%v", err)
+			log.Dev.Warningf(ctx, "%v", err)
 		}
 		return errors.Wrapf(err, "running %s %v", n.cmd.Path, n.cmd.Args)
 	}
@@ -577,13 +577,13 @@ func (n *Node) startAsyncInnerLocked(ctx context.Context, joins ...string) error
 	go func(cmd *exec.Cmd) {
 		waitErr := cmd.Wait()
 		if waitErr != nil {
-			log.Warningf(ctx, "%v", waitErr)
+			log.Dev.Warningf(ctx, "%v", waitErr)
 		}
 		if err := stdout.Close(); err != nil {
-			log.Warningf(ctx, "%v", err)
+			log.Dev.Warningf(ctx, "%v", err)
 		}
 		if err := stderr.Close(); err != nil {
-			log.Warningf(ctx, "%v", err)
+			log.Dev.Warningf(ctx, "%v", err)
 		}
 
 		log.Dev.Infof(ctx, "process %d: %s", cmd.Process.Pid, cmd.ProcessState)
@@ -795,7 +795,7 @@ func (n *Node) Signal(s os.Signal) {
 		return
 	}
 	if err := n.cmd.Process.Signal(s); err != nil {
-		log.Warningf(context.Background(), "%v", err)
+		log.Dev.Warningf(context.Background(), "%v", err)
 	}
 }
 
@@ -806,7 +806,7 @@ func (n *Node) Wait() *exec.ExitError {
 	ch := n.notRunning
 	n.Unlock()
 	if ch == nil {
-		log.Warning(context.Background(), "(*Node).Wait called when node was not running")
+		log.Dev.Warning(context.Background(), "(*Node).Wait called when node was not running")
 		return nil
 	}
 	<-ch

pkg/backup/backup_job.go

Lines changed: 8 additions & 8 deletions
@@ -478,7 +478,7 @@ func releaseProtectedTimestamp(
 	if errors.Is(err, protectedts.ErrNotExists) {
 		// No reason to return an error which might cause problems if it doesn't
 		// seem to exist.
-		log.Warningf(ctx, "failed to release protected which seems not to exist: %v", err)
+		log.Dev.Warningf(ctx, "failed to release protected which seems not to exist: %v", err)
 		err = nil
 	}
 	return err
@@ -500,7 +500,7 @@ func getTableStatsForBackup(
 		tableDesc := tabledesc.NewBuilder(tbl).BuildImmutableTable()
 		tableStatisticsAcc, err := stats.GetTableStatsProtosFromDB(ctx, tableDesc, executor)
 		if err != nil {
-			log.Warningf(
+			log.Dev.Warningf(
 				ctx, "failed to collect stats for table: %s, table ID: %d during a backup: %s",
 				tableDesc.GetName(), tableDesc.GetID(), err,
 			)
@@ -771,7 +771,7 @@ func (b *backupResumer) Resume(ctx context.Context, execCtx interface{}) error {
 			return jobs.MarkAsRetryJobError(errors.Wrapf(err, "job encountered retryable error on draining node"))
 		}
 
-		log.Warningf(ctx, "encountered retryable error: %+v", err)
+		log.Dev.Warningf(ctx, "encountered retryable error: %+v", err)
 
 		// Reload the backup manifest to pick up any spans we may have completed on
 		// previous attempts.
@@ -789,7 +789,7 @@
 			if ctx.Err() != nil {
 				return ctx.Err()
 			}
-			log.Warningf(ctx, "BACKUP job %d could not reload job progress when retrying: %+v",
+			log.Dev.Warningf(ctx, "BACKUP job %d could not reload job progress when retrying: %+v",
 				b.job.ID(), reloadErr)
 		} else {
 			curProgress := reloadedJob.FractionCompleted()
@@ -1947,7 +1947,7 @@ func (b *backupResumer) processScheduledBackupCompletion(
 		if _, err := maybeStartCompactionJob(
 			ctx, execCtx.ExecCfg(), execCtx.User(), details,
 		); err != nil {
-			log.Warningf(ctx, "failed to trigger backup compaction for schedule %d: %v", scheduleID, err)
+			log.Dev.Warningf(ctx, "failed to trigger backup compaction for schedule %d: %v", scheduleID, err)
 		}
 	}
 	return nil
@@ -2028,19 +2028,19 @@ func (b *backupResumer) deleteCheckpoint(
 		// backups, and then from the progress directory.
 		err = exportStore.Delete(ctx, backupinfo.BackupManifestCheckpointName)
 		if err != nil {
-			log.Warningf(ctx, "unable to delete checkpointed backup descriptor file in base directory: %+v", err)
+			log.Dev.Warningf(ctx, "unable to delete checkpointed backup descriptor file in base directory: %+v", err)
 		}
 		err = exportStore.Delete(ctx, backupinfo.BackupManifestCheckpointName+backupinfo.BackupManifestChecksumSuffix)
 		if err != nil {
-			log.Warningf(ctx, "unable to delete checkpoint checksum file in base directory: %+v", err)
+			log.Dev.Warningf(ctx, "unable to delete checkpoint checksum file in base directory: %+v", err)
 		}
 		// Delete will not delete a nonempty directory, so we have to go through
 		// all files and delete each file one by one.
 		return exportStore.List(ctx, backupinfo.BackupProgressDirectory, "", func(p string) error {
 			return exportStore.Delete(ctx, backupinfo.BackupProgressDirectory+p)
 		})
 	}(); err != nil {
-		log.Warningf(ctx, "unable to delete checkpointed backup descriptor file in progress directory: %+v", err)
+		log.Dev.Warningf(ctx, "unable to delete checkpointed backup descriptor file in progress directory: %+v", err)
 	}
 }
 

pkg/backup/backup_processor.go

Lines changed: 6 additions & 6 deletions
@@ -232,7 +232,7 @@ func (bp *backupDataProcessor) constructProgressProducerMeta(
 	// Annotate the progress with the fraction completed by this backupDataProcessor.
 	progDetails := backuppb.BackupManifest_Progress{}
 	if err := gogotypes.UnmarshalAny(&prog.ProgressDetails, &progDetails); err != nil {
-		log.Warningf(bp.Ctx(), "failed to unmarshal progress details %v", err)
+		log.Dev.Warningf(bp.Ctx(), "failed to unmarshal progress details %v", err)
 	} else {
 		totalSpans := int32(len(bp.spec.Spans) + len(bp.spec.IntroducedSpans))
 		bp.completedSpans += progDetails.CompletedSpans
@@ -464,7 +464,7 @@ func runBackupProcessor(
 	sink := backupsink.MakeFileSSTSink(sinkConf, storage, pacer)
 	defer func() {
 		if err := sink.Flush(ctx); err != nil {
-			log.Warningf(ctx, "failed to flush SST sink: %s", err)
+			log.Dev.Warningf(ctx, "failed to flush SST sink: %s", err)
 		}
 		logClose(ctx, sink, "SST sink")
 	}()
@@ -561,7 +561,7 @@
 		tracer = flowCtx.Cfg.Tracer
 	}
 	if tracer == nil {
-		log.Warning(ctx, "nil tracer in backup processor")
+		log.Dev.Warning(ctx, "nil tracer in backup processor")
 	}
 	opts := make([]tracing.SpanOption, 0)
 	opts = append(opts, tracing.WithParent(sp))
@@ -657,7 +657,7 @@
 	}
 
 	if len(resp.Files) > 1 {
-		log.Warning(ctx, "unexpected multi-file response using header.TargetBytes = 1")
+		log.Dev.Warning(ctx, "unexpected multi-file response using header.TargetBytes = 1")
 	}
 
 	// Even if the ExportRequest did not export any data we want to report
@@ -756,7 +756,7 @@ func reserveWorkerMemory(
 	workerCount := minimumWorkerCount
 	for i := 0; i < (maxWorkerCount - minimumWorkerCount); i++ {
 		if err := memAcc.Grow(ctx, perWorkerMemory); err != nil {
-			log.Warningf(ctx, "backup worker count restricted by memory limit")
+			log.Dev.Warningf(ctx, "backup worker count restricted by memory limit")
 			break
 		}
 		workerCount++
@@ -766,7 +766,7 @@
 
 func logClose(ctx context.Context, c io.Closer, desc string) {
 	if err := c.Close(); err != nil {
-		log.Warningf(ctx, "failed to close %s: %s", redact.SafeString(desc), err.Error())
+		log.Dev.Warningf(ctx, "failed to close %s: %s", redact.SafeString(desc), err.Error())
 	}
 }
 

pkg/backup/backup_telemetry.go

Lines changed: 6 additions & 6 deletions
@@ -119,12 +119,12 @@ func createBackupRecoveryEvent(
 	storageTypes := make(map[string]struct{})
 	defaultURI, urisByLocalityKV, err := backupdest.GetURIsByLocalityKV(initialDetails.Destination.To, "")
 	if err != nil {
-		log.Warningf(ctx, "failed to get URIs by locality: %v", err)
+		log.Dev.Warningf(ctx, "failed to get URIs by locality: %v", err)
 	}
 
 	if defaultURI != "" {
 		if storageType, authType, err := parseStorageAndAuth(defaultURI); err != nil {
-			log.Warningf(ctx, "failed to parse backup default URI: %v", err)
+			log.Dev.Warningf(ctx, "failed to parse backup default URI: %v", err)
 		} else {
 			storageTypes[storageType] = struct{}{}
 			authTypes[authType] = struct{}{}
@@ -133,7 +133,7 @@
 
 	for _, uri := range urisByLocalityKV {
 		if storageType, authType, err := parseStorageAndAuth(uri); err != nil {
-			log.Warningf(ctx, "failed to parse locality URI: %v", err)
+			log.Dev.Warningf(ctx, "failed to parse locality URI: %v", err)
 		} else {
 			storageTypes[storageType] = struct{}{}
 			authTypes[authType] = struct{}{}
@@ -218,7 +218,7 @@ func getPassphraseAndKMS(
 	if enc.KMSInfo != nil {
 		parsedKMSURI, err := url.ParseRequestURI(enc.KMSInfo.Uri)
 		if err != nil {
-			log.Warningf(ctx, "failed to parse KMS URI %s: %v", enc.KMSInfo.Uri, err)
+			log.Dev.Warningf(ctx, "failed to parse KMS URI %s: %v", enc.KMSInfo.Uri, err)
 		} else {
 			kms = parsedKMSURI.Scheme
 		}
@@ -348,7 +348,7 @@ func logRestoreTelemetry(
 
 	for _, uri := range details.URIs {
 		if storage, auth, err := parseStorageAndAuth(uri); err != nil {
-			log.Warningf(ctx, "failed to parse URI: %v", err)
+			log.Dev.Warningf(ctx, "failed to parse URI: %v", err)
 		} else {
 			authTypes[auth] = struct{}{}
 			storageTypes[storage] = struct{}{}
@@ -362,7 +362,7 @@
 
 	for _, uri := range localityInfo.URIsByOriginalLocalityKV {
 		if storage, auth, err := parseStorageAndAuth(uri); err != nil {
-			log.Warningf(ctx, "failed to parse URI: %v", err)
+			log.Dev.Warningf(ctx, "failed to parse URI: %v", err)
 		} else {
 			authTypes[auth] = struct{}{}
 			storageTypes[storage] = struct{}{}
