Skip to content

Commit e227d62

Browse files
craig[bot]jlinderjeffswensonannrpomrafiss
committed
151365: bazel: enable remote build on mesolite from linux r=rickystewart a=jlinder We are merging EngFlow clusters. This transitions to using the mesolite cluster for running remote builds from gceworkers. Part of: DEVINF-1489 Release note: None 151668: backupsink: increase test size to large r=msbutler a=jeffswenson The test allocates a relatively large buffers and we have seen periodic ooms as a result. Release note: None Informs: #145892 Informs: #150623 Informs: #151665 Informs: #151032 151796: kvserver: add timeseries metric for value retrieval count r=annrpom a=annrpom Epic: none Release note: None 151810: scplan: unredact logs for unable to make progress error r=rafiss a=rafiss We have seen this error in tests and Sentry reports. The details for the error do not show up when logs are redacted, so this change makes the details appear in the AssertionError itself. This also makes stageBuilder implement redact.SafeFormatter so that the error is formatted safely. fixes #151769 informs #147315 Release note: None Co-authored-by: James H. Linder <[email protected]> Co-authored-by: Jeff Swenson <[email protected]> Co-authored-by: Annie Pompa <[email protected]> Co-authored-by: Rafi Shamim <[email protected]>
5 parents 993bd51 + 8af161d + 0427fa9 + 06af161 + 118a936 commit e227d62

File tree

10 files changed

+130
-50
lines changed

10 files changed

+130
-50
lines changed

.bazelrc

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -147,17 +147,16 @@ build:engflowbase --remote_upload_local_results=false
147147
build:engflowbase --remote_download_toplevel
148148
build:engflowbase --test_env=GO_TEST_WRAP_TESTV=1
149149
build:engflowbase --config=lintonbuild
150+
build:engflowbase --remote_cache=grpcs://mesolite.cluster.engflow.com
151+
build:engflowbase --remote_executor=grpcs://mesolite.cluster.engflow.com
152+
build:engflowbase --bes_backend=grpcs://mesolite.cluster.engflow.com
150153
test:engflowbase --test_env=REMOTE_EXEC=1
151154
test:engflowbase --test_env=GOTRACEBACK=all
152155
build:engflow --config=engflowbase
153-
build:engflow --remote_cache=grpcs://tanzanite.cluster.engflow.com
154-
build:engflow --remote_executor=grpcs://tanzanite.cluster.engflow.com
155-
build:engflow --bes_backend=grpcs://tanzanite.cluster.engflow.com
156-
build:engflow --bes_results_url=https://tanzanite.cluster.engflow.com/invocation/
156+
build:engflow --bes_results_url=https://mesolite.cluster.engflow.com/invocations/private
157+
build:engflow --remote_instance_name=private
158+
build:engflow --bes_instance_name=private
157159
build:engflowpublic --config=engflowbase
158-
build:engflowpublic --remote_cache=grpcs://mesolite.cluster.engflow.com
159-
build:engflowpublic --remote_executor=grpcs://mesolite.cluster.engflow.com
160-
build:engflowpublic --bes_backend=grpcs://mesolite.cluster.engflow.com
161160
build:engflowpublic --bes_results_url=https://mesolite.cluster.engflow.com/invocation/
162161

163162
try-import %workspace%/.bazelrc.user

dev

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ fi
88
set -euo pipefail
99

1010
# Bump this counter to force rebuilding `dev` on all machines.
11-
DEV_VERSION=112
11+
DEV_VERSION=113
1212

1313
THIS_DIR=$(cd "$(dirname "$0")" && pwd)
1414
BINARY_DIR=$THIS_DIR/bin/dev-versions

docs/generated/metrics/metrics.yaml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17366,6 +17366,14 @@ layers:
1736617366
unit: BYTES
1736717367
aggregation: AVG
1736817368
derivative: NONE
17369+
- name: storage.value_separation.value_retrieval.count
17370+
exported_name: storage_value_separation_value_retrieval_count
17371+
description: The number of value retrievals of values separated into blob files.
17372+
y_axis_label: Events
17373+
type: COUNTER
17374+
unit: COUNT
17375+
aggregation: AVG
17376+
derivative: NON_NEGATIVE_DERIVATIVE
1736917377
- name: storage.wal.bytes_in
1737017378
exported_name: storage_wal_bytes_in
1737117379
description: The number of logical bytes the storage engine has written to the WAL

pkg/backup/backupsink/BUILD.bazel

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ go_library(
3232

3333
go_test(
3434
name = "backupsink_test",
35+
size = "large",
3536
srcs = [
3637
"file_sst_sink_test.go",
3738
"sst_sink_key_writer_test.go",

pkg/cmd/dev/doctor.go

Lines changed: 44 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ const (
3030
// doctorStatusVersion is the current "version" of the status checks
3131
// performed by `dev doctor``. Increasing it will force doctor to be re-run
3232
// before other dev commands can be run.
33-
doctorStatusVersion = 11
33+
doctorStatusVersion = 12
3434

3535
noCacheFlag = "no-cache"
3636
interactiveFlag = "interactive"
@@ -276,8 +276,15 @@ Make sure one of the following lines is in the file %s/.bazelrc.user:
276276
if d.checkUsingConfig(cfg.workspace, "dev") {
277277
return "In --remote mode, you cannot use the `dev` build configuration."
278278
}
279-
if !d.checkLinePresenceInBazelRcUser(cfg.workspace, "build:engflow --jobs=200") {
280-
return fmt.Sprintf("Make sure the following line is in %s/.bazelrc.user: build:engflow --jobs=200", cfg.workspace)
279+
requiredLines := []string{
280+
"build:engflow --jobs=200",
281+
"build:engflow --credential_helper=mesolite.cluster.engflow.com=engflow_auth",
282+
"build:engflow --remote_execution_priority=-1",
283+
}
284+
for _, line := range requiredLines {
285+
if !d.checkLinePresenceInBazelRcUser(cfg.workspace, line) {
286+
return fmt.Sprintf("Make sure the following line is in %s/.bazelrc.user: %s", cfg.workspace, line)
287+
}
281288
}
282289
return ""
283290
},
@@ -304,13 +311,19 @@ Make sure one of the following lines is in the file %s/.bazelrc.user:
304311
return err
305312
}
306313
}
307-
if !d.checkLinePresenceInBazelRcUser(cfg.workspace, "build:engflow --jobs=200") {
308-
err := d.addLineToBazelRcUser(cfg.workspace, "build:engflow --jobs=200")
309-
if err != nil {
310-
return err
314+
requiredLines := []string{
315+
"build:engflow --jobs=200",
316+
"build:engflow --credential_helper=mesolite.cluster.engflow.com=engflow_auth",
317+
"build:engflow --remote_execution_priority=-1",
318+
}
319+
for _, line := range requiredLines {
320+
if !d.checkLinePresenceInBazelRcUser(cfg.workspace, line) {
321+
err := d.addLineToBazelRcUser(cfg.workspace, line)
322+
if err != nil {
323+
return err
324+
}
311325
}
312326
}
313-
314327
return nil
315328
},
316329
remoteOnly: true,
@@ -359,13 +372,16 @@ slightly slower and introduce a noticeable delay in first-time build setup.`
359372
},
360373
},
361374
{
362-
name: "engflow_certificates",
375+
name: "engflow_auth",
363376
check: func(d *dev, ctx context.Context, cfg doctorConfig) string {
364-
if !d.checkLinePresenceInBazelRcUser(cfg.workspace, "build:engflow --tls_client_certificate=") {
365-
return fmt.Sprintf("Must specify the --tls_client_certificate to use for EngFlow builds in %s/.bazelrc.user. This is a line of the form: `build:engflow --tls_client_certificate=/path/to/file`.", cfg.workspace)
377+
if !d.checkFileExistsUnderHomedir(".config/engflow_auth/tokens/mesolite.cluster.engflow.com") {
378+
return "Make sure you've installed engflow_auth (https://github.com/EngFlow/auth), added it to your PATH, and login using `engflow_auth login -store=file https://mesolite.cluster.engflow.com`."
379+
}
380+
if d.checkLinePresenceInBazelRcUser(cfg.workspace, "build:engflow --tls_client_certificate=") {
381+
return fmt.Sprintf("Please remove the --tls_client_certificate line from %s/.bazelrc.user. It is no longer necessary when using engflow_auth.", cfg.workspace)
366382
}
367-
if !d.checkLinePresenceInBazelRcUser(cfg.workspace, "build:engflow --tls_client_key=") {
368-
return fmt.Sprintf("Must specify the --tls_client_key to use for EngFlow builds in %s/.bazelrc.user. This is a line of the form: `build:engflow --tls_client_key=/path/to/file`.", cfg.workspace)
383+
if d.checkLinePresenceInBazelRcUser(cfg.workspace, "build:engflow --tls_client_key=") {
384+
return fmt.Sprintf("Please remove the --tls_client_key line from %s/.bazelrc.user. It is no longer necessary when using engflow_auth.", cfg.workspace)
369385
}
370386
return ""
371387
},
@@ -924,3 +940,18 @@ func (d *dev) removeAllPrefixesInFile(filename, prefixToRemove string) error {
924940
outStr = strings.TrimSpace(outStr) + "\n"
925941
return d.os.WriteFile(filename, outStr)
926942
}
943+
944+
// checkFileExistsUnderHomedir checks whether the given file or path is present
945+
// under the home directory. If it is, this function returns true. Otherwise,
946+
// it returns false.
947+
func (d *dev) checkFileExistsUnderHomedir(expectedFile string) bool {
948+
homeDir, err := os.UserHomeDir()
949+
if err != nil {
950+
return false
951+
}
952+
exists, err := d.os.Exists(filepath.Join(homeDir, expectedFile))
953+
if err != nil {
954+
return false
955+
}
956+
return exists
957+
}

pkg/cmd/dev/io/os/os.go

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -296,6 +296,26 @@ func (o *OS) IsDir(dirname string) (bool, error) {
296296
return strconv.ParseBool(strings.TrimSpace(output))
297297
}
298298

299+
// Exists wraps around os.Stat, which returns the os.FileInfo of the named
300+
// path. Exists returns true if and only if it exists. If there is an error,
301+
// it will return false and the error.
302+
func (o *OS) Exists(filename string) (bool, error) {
303+
command := fmt.Sprintf("cat %s", filename)
304+
if !o.knobs.silent {
305+
o.logger.Print(command)
306+
}
307+
308+
output, _ := o.Next(command, func() (output string, err error) {
309+
_, err = os.Stat(filename)
310+
exists := !errors.Is(err, fs.ErrNotExist)
311+
if exists && err != nil {
312+
return "false", err
313+
}
314+
return strconv.FormatBool(exists), nil
315+
})
316+
return strconv.ParseBool(strings.TrimSpace(output))
317+
}
318+
299319
// ReadFile wraps around os.ReadFile, reading a file from disk and
300320
// returning the contents.
301321
func (o *OS) ReadFile(filename string) (string, error) {

pkg/kv/kvserver/metrics.go

Lines changed: 30 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -2675,6 +2675,12 @@ Note that the measurement does not include the duration for replicating the eval
26752675
Measurement: "Bytes",
26762676
Unit: metric.Unit_BYTES,
26772677
}
2678+
metaValueSeparationValueRetrievalCount = metric.Metadata{
2679+
Name: "storage.value_separation.value_retrieval.count",
2680+
Help: "The number of value retrievals of values separated into blob files.",
2681+
Measurement: "Events",
2682+
Unit: metric.Unit_COUNT,
2683+
}
26782684
metaWALBytesWritten = metric.Metadata{
26792685
Name: "storage.wal.bytes_written",
26802686
Help: "The number of bytes the storage engine has written to the WAL",
@@ -3002,18 +3008,19 @@ type StoreMetrics struct {
30023008
CompressionUnknownBytes *metric.Gauge
30033009
CompressionOverallCR *metric.GaugeFloat64
30043010

3005-
categoryIterMetrics pebbleCategoryIterMetricsContainer
3006-
categoryDiskWriteMetrics pebbleCategoryDiskWriteMetricsContainer
3007-
ValueSeparationBytesReferenced *metric.Gauge
3008-
ValueSeparationBytesUnreferenced *metric.Gauge
3009-
ValueSeparationBlobFileCount *metric.Gauge
3010-
ValueSeparationBlobFileSize *metric.Gauge
3011-
WALBytesWritten *metric.Counter
3012-
WALBytesIn *metric.Counter
3013-
WALFailoverSwitchCount *metric.Counter
3014-
WALFailoverPrimaryDuration *metric.Counter
3015-
WALFailoverSecondaryDuration *metric.Counter
3016-
WALFailoverWriteAndSyncLatency *metric.ManualWindowHistogram
3011+
categoryIterMetrics pebbleCategoryIterMetricsContainer
3012+
categoryDiskWriteMetrics pebbleCategoryDiskWriteMetricsContainer
3013+
ValueSeparationBytesReferenced *metric.Gauge
3014+
ValueSeparationBytesUnreferenced *metric.Gauge
3015+
ValueSeparationBlobFileCount *metric.Gauge
3016+
ValueSeparationBlobFileSize *metric.Gauge
3017+
ValueSeparationValueRetrievalCount *metric.Counter
3018+
WALBytesWritten *metric.Counter
3019+
WALBytesIn *metric.Counter
3020+
WALFailoverSwitchCount *metric.Counter
3021+
WALFailoverPrimaryDuration *metric.Counter
3022+
WALFailoverSecondaryDuration *metric.Counter
3023+
WALFailoverWriteAndSyncLatency *metric.ManualWindowHistogram
30173024

30183025
RdbCheckpoints *metric.Gauge
30193026

@@ -3740,15 +3747,16 @@ func newStoreMetrics(histogramWindow time.Duration) *StoreMetrics {
37403747
categoryDiskWriteMetrics: pebbleCategoryDiskWriteMetricsContainer{
37413748
registry: storeRegistry,
37423749
},
3743-
ValueSeparationBytesReferenced: metric.NewGauge(metaValueSeparationBytesReferenced),
3744-
ValueSeparationBytesUnreferenced: metric.NewGauge(metaValueSeparationBytesUnreferenced),
3745-
ValueSeparationBlobFileCount: metric.NewGauge(metaValueSeparationBlobFileCount),
3746-
ValueSeparationBlobFileSize: metric.NewGauge(metaValueSeparationBlobFileSize),
3747-
WALBytesWritten: metric.NewCounter(metaWALBytesWritten),
3748-
WALBytesIn: metric.NewCounter(metaWALBytesIn),
3749-
WALFailoverSwitchCount: metric.NewCounter(metaStorageWALFailoverSwitchCount),
3750-
WALFailoverPrimaryDuration: metric.NewCounter(metaStorageWALFailoverPrimaryDuration),
3751-
WALFailoverSecondaryDuration: metric.NewCounter(metaStorageWALFailoverSecondaryDuration),
3750+
ValueSeparationBytesReferenced: metric.NewGauge(metaValueSeparationBytesReferenced),
3751+
ValueSeparationBytesUnreferenced: metric.NewGauge(metaValueSeparationBytesUnreferenced),
3752+
ValueSeparationBlobFileCount: metric.NewGauge(metaValueSeparationBlobFileCount),
3753+
ValueSeparationBlobFileSize: metric.NewGauge(metaValueSeparationBlobFileSize),
3754+
ValueSeparationValueRetrievalCount: metric.NewCounter(metaValueSeparationValueRetrievalCount),
3755+
WALBytesWritten: metric.NewCounter(metaWALBytesWritten),
3756+
WALBytesIn: metric.NewCounter(metaWALBytesIn),
3757+
WALFailoverSwitchCount: metric.NewCounter(metaStorageWALFailoverSwitchCount),
3758+
WALFailoverPrimaryDuration: metric.NewCounter(metaStorageWALFailoverPrimaryDuration),
3759+
WALFailoverSecondaryDuration: metric.NewCounter(metaStorageWALFailoverSecondaryDuration),
37523760
WALFailoverWriteAndSyncLatency: metric.NewManualWindowHistogram(
37533761
metaStorageWALFailoverWriteAndSyncLatency,
37543762
pebble.FsyncLatencyBuckets,
@@ -4187,6 +4195,7 @@ func (sm *StoreMetrics) updateEngineMetrics(m storage.Metrics) {
41874195
sm.ValueSeparationBytesUnreferenced.Update(int64(m.BlobFiles.ValueSize - m.BlobFiles.ReferencedValueSize))
41884196
sm.ValueSeparationBlobFileCount.Update(int64(m.BlobFiles.LiveCount))
41894197
sm.ValueSeparationBlobFileSize.Update(int64(m.BlobFiles.LiveSize))
4198+
sm.ValueSeparationValueRetrievalCount.Update(int64(m.Iterator.ValueRetrievalCount))
41904199
// NB: `UpdateIfHigher` is used here since there is a race in pebble where
41914200
// sometimes the WAL is rotated but metrics are retrieved prior to the update
41924201
// to BytesIn to account for the previous WAL.

pkg/sql/schemachanger/scplan/internal/scstage/build.go

Lines changed: 16 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -269,9 +269,10 @@ func buildPostCommitStages(bc buildContext, bs buildState) (stages []Stage) {
269269
var trace []string
270270
bs.trace = &trace
271271
sb = bc.makeStageBuilder(bs)
272-
panic(errors.WithDetailf(
273-
errors.AssertionFailedf("unable to make progress"),
274-
"terminal state:\n%s\nrule trace:\n%s", sb, strings.Join(trace, "\n")))
272+
panic(errors.AssertionFailedf(
273+
"unable to make progress\nterminal state:\n%v\nrule trace:\n%s",
274+
sb, redact.SafeString(strings.Join(trace, "\n")),
275+
))
275276
}
276277
build(sb)
277278
}
@@ -728,15 +729,22 @@ func (sb stageBuilder) hasAnyNonRevertibleOps() bool {
728729

729730
// String returns a string representation of the stageBuilder.
730731
func (sb stageBuilder) String() string {
731-
var str strings.Builder
732+
return redact.StringWithoutMarkers(sb)
733+
}
734+
735+
// SafeFormat implements redact.SafeFormatter.
736+
func (sb stageBuilder) SafeFormat(p redact.SafePrinter, verb rune) {
732737
for _, t := range sb.current {
733-
str.WriteString(" - ")
734-
str.WriteString(screl.NodeString(t.n))
735-
str.WriteString("\n")
738+
p.SafeString(" - ")
739+
if err := screl.FormatNode(p, t.n); err != nil {
740+
p.UnsafeString(screl.NodeString(t.n))
741+
}
742+
p.SafeString("\n")
736743
}
737-
return str.String()
738744
}
739745

746+
var _ redact.SafeFormatter = stageBuilder{}
747+
740748
// computeExtraOps generates extra operations to decorate a stage with.
741749
// These are typically job-related.
742750
//

pkg/storage/engine.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1303,6 +1303,9 @@ type AggregatedIteratorStats struct {
13031303
// ExternalSteps, it's a good indication that there's an accumulation of
13041304
// garbage within the LSM (NOT MVCC garbage).
13051305
InternalSteps int
1306+
// ValueRetrievalCount is the total count of value retrievals of values
1307+
// separated into blob files.
1308+
ValueRetrievalCount uint64
13061309
}
13071310

13081311
// AggregatedBatchCommitStats hold cumulative stats summed over all the

pkg/storage/pebble.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1420,6 +1420,7 @@ func (p *Pebble) aggregateIterStats(stats IteratorStats) {
14201420
p.iterStats.ExternalSteps += stats.Stats.ForwardStepCount[pebble.InterfaceCall] + stats.Stats.ReverseStepCount[pebble.InterfaceCall]
14211421
p.iterStats.InternalSeeks += stats.Stats.ForwardSeekCount[pebble.InternalIterCall] + stats.Stats.ReverseSeekCount[pebble.InternalIterCall]
14221422
p.iterStats.InternalSteps += stats.Stats.ForwardStepCount[pebble.InternalIterCall] + stats.Stats.ReverseStepCount[pebble.InternalIterCall]
1423+
p.iterStats.ValueRetrievalCount += stats.Stats.InternalStats.SeparatedPointValue.CountFetched
14231424
}
14241425

14251426
func (p *Pebble) aggregateBatchCommitStats(stats BatchCommitStats) {

0 commit comments

Comments
 (0)