Skip to content

Commit 4713613

Browse files
craig[bot]dhartunian
andcommitted
Merge #153048
153048: metric: reset net/disk counters on subtraction if new val is lower r=dhartunian a=dhartunian Previously, we would occasionally see negative values for counters in tests when the OS would report lower values than initial or earlier snapshots. This can cause confusion for customers. Now, when `subtractNetworkCounters` or `subtractDiskCounters` are invoked, we will reset the baseline values in order to prevent a negative counter. Some reasons this could happen were pointed out in #147137. Resolves: #147137 Resolves: #152484 Epic: None Release note (observability change): network and disk metric counters collected from the operating system could occasionally report negative values in scenarios where unerlying hardware changes occurred. Certain counters will now proactively reset their values to a new baseline to prevent this. Co-authored-by: David Hartunian <[email protected]>
2 parents e7bdf19 + 4605fda commit 4713613

File tree

2 files changed

+177
-34
lines changed

2 files changed

+177
-34
lines changed

pkg/server/status/runtime.go

Lines changed: 75 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -1042,7 +1042,7 @@ func (rsr *RuntimeStatSampler) SampleEnvironment(ctx context.Context, cs *CGoMem
10421042
log.Ops.Warningf(ctx, "problem fetching disk stats: %s; disk stats will be empty.", err)
10431043
} else {
10441044
rsr.last.disk = diskCounters
1045-
subtractDiskCounters(&diskCounters, rsr.initialDiskCounters)
1045+
subtractDiskCounters(ctx, &diskCounters, &rsr.initialDiskCounters)
10461046

10471047
rsr.HostDiskReadBytes.Update(diskCounters.ReadBytes)
10481048
rsr.HostDiskReadCount.Update(diskCounters.readCount)
@@ -1063,11 +1063,11 @@ func (rsr *RuntimeStatSampler) SampleEnvironment(ctx context.Context, cs *CGoMem
10631063
}
10641064
} else {
10651065
deltaNet = nc // delta since *last* scrape
1066-
subtractNetworkCounters(&deltaNet, rsr.last.net)
1066+
subtractNetworkCounters(ctx, &deltaNet, &rsr.last.net)
10671067
rsr.last.net = nc
10681068

10691069
// `nc` will now be the delta since *first* scrape.
1070-
subtractNetworkCounters(&nc, rsr.initialNetCounters)
1070+
subtractNetworkCounters(ctx, &nc, &rsr.initialNetCounters)
10711071
// TODO(tbg): this is awkward: we're computing the delta above,
10721072
// why don't we increment the counters?
10731073
rsr.HostNetRecvBytes.Update(int64(nc.IOCounters.BytesRecv))
@@ -1443,19 +1443,38 @@ func sumAndFilterDiskCounters(disksStats []DiskStats) (DiskStats, error) {
14431443
return output, nil
14441444
}
14451445

1446-
// subtractDiskCounters subtracts the counters in `sub` from the counters in `from`,
1447-
// saving the results in `from`.
1448-
func subtractDiskCounters(from *DiskStats, sub DiskStats) {
1449-
from.writeCount -= sub.writeCount
1450-
from.WriteBytes -= sub.WriteBytes
1451-
from.writeTime -= sub.writeTime
1452-
1453-
from.readCount -= sub.readCount
1454-
from.ReadBytes -= sub.ReadBytes
1455-
from.readTime -= sub.readTime
1456-
1457-
from.ioTime -= sub.ioTime
1458-
from.weightedIOTime -= sub.weightedIOTime
1446+
// subtractDiskCounters subtracts the counters in `baseline` from the
1447+
// counters in `stats`, saving the results in `stats`. If any counter
1448+
// in `stats` is lower than the corresponding counter in `baseline`
1449+
// (indicating a reset), the value for all metrics in `baseline`
1450+
// is updated to the current value in `stats` to establish a new
1451+
// baseline.
1452+
func subtractDiskCounters(ctx context.Context, stats *DiskStats, baseline *DiskStats) {
1453+
if stats.WriteBytes < baseline.WriteBytes ||
1454+
stats.writeCount < baseline.writeCount ||
1455+
stats.writeTime < baseline.writeTime ||
1456+
stats.ReadBytes < baseline.ReadBytes ||
1457+
stats.readCount < baseline.readCount ||
1458+
stats.readTime < baseline.readTime ||
1459+
stats.ioTime < baseline.ioTime ||
1460+
stats.weightedIOTime < baseline.weightedIOTime {
1461+
*baseline = *stats
1462+
*stats = DiskStats{}
1463+
log.Ops.Info(ctx, "runtime: new baseline in disk stats from host. disk metric counters have been reset.")
1464+
return
1465+
}
1466+
1467+
// Perform normal subtraction
1468+
stats.writeCount -= baseline.writeCount
1469+
stats.WriteBytes -= baseline.WriteBytes
1470+
stats.writeTime -= baseline.writeTime
1471+
1472+
stats.readCount -= baseline.readCount
1473+
stats.ReadBytes -= baseline.ReadBytes
1474+
stats.readTime -= baseline.readTime
1475+
1476+
stats.ioTime -= baseline.ioTime
1477+
stats.weightedIOTime -= baseline.weightedIOTime
14591478
}
14601479

14611480
// sumNetworkCounters returns a new net.IOCountersStat whose values are the sum of the
@@ -1475,22 +1494,46 @@ func sumNetworkCounters(netCounters []net.IOCountersStat) net.IOCountersStat {
14751494
return output
14761495
}
14771496

1478-
// subtractNetworkCounters subtracts the counters in `sub` from the counters in `from`,
1479-
// saving the results in `from`.
1480-
func subtractNetworkCounters(from *netCounters, sub netCounters) {
1481-
from.IOCounters.BytesRecv -= sub.IOCounters.BytesRecv
1482-
from.IOCounters.PacketsRecv -= sub.IOCounters.PacketsRecv
1483-
from.IOCounters.Errin -= sub.IOCounters.Errin
1484-
from.IOCounters.Dropin -= sub.IOCounters.Dropin
1485-
from.IOCounters.BytesSent -= sub.IOCounters.BytesSent
1486-
from.IOCounters.PacketsSent -= sub.IOCounters.PacketsSent
1487-
from.IOCounters.Errout -= sub.IOCounters.Errout
1488-
from.IOCounters.Dropout -= sub.IOCounters.Dropout
1489-
from.TCPRetransSegs -= sub.TCPRetransSegs
1490-
from.TCPFastRetrans -= sub.TCPFastRetrans
1491-
from.TCPTimeouts -= sub.TCPTimeouts
1492-
from.TCPSlowStartRetrans -= sub.TCPSlowStartRetrans
1493-
from.TCPLossProbes -= sub.TCPLossProbes
1497+
// subtractNetworkCounters subtracts the counters in `baseline`
1498+
// from the counters in `stats`, saving the results in `stats`. If
1499+
// any counter in `stats` is lower than the corresponding counter
1500+
// in `baseline` (indicating a reset), the value for all metrics in
1501+
// `baseline` is updated to the current value in `stats` to establish
1502+
// a new baseline.
1503+
func subtractNetworkCounters(ctx context.Context, stats *netCounters, baseline *netCounters) {
1504+
if stats.IOCounters.BytesRecv < baseline.IOCounters.BytesRecv ||
1505+
stats.IOCounters.PacketsRecv < baseline.IOCounters.PacketsRecv ||
1506+
stats.IOCounters.Errin < baseline.IOCounters.Errin ||
1507+
stats.IOCounters.Dropin < baseline.IOCounters.Dropin ||
1508+
stats.IOCounters.BytesSent < baseline.IOCounters.BytesSent ||
1509+
stats.IOCounters.PacketsSent < baseline.IOCounters.PacketsSent ||
1510+
stats.IOCounters.Errout < baseline.IOCounters.Errout ||
1511+
stats.IOCounters.Dropout < baseline.IOCounters.Dropout ||
1512+
stats.TCPRetransSegs < baseline.TCPRetransSegs ||
1513+
stats.TCPFastRetrans < baseline.TCPFastRetrans ||
1514+
stats.TCPTimeouts < baseline.TCPTimeouts ||
1515+
stats.TCPSlowStartRetrans < baseline.TCPSlowStartRetrans ||
1516+
stats.TCPLossProbes < baseline.TCPLossProbes {
1517+
*baseline = *stats
1518+
*stats = netCounters{}
1519+
log.Ops.Info(ctx, "runtime: new baseline in network stats from host. network metric counters have been reset.")
1520+
return
1521+
}
1522+
1523+
// Perform normal subtraction
1524+
stats.IOCounters.BytesRecv -= baseline.IOCounters.BytesRecv
1525+
stats.IOCounters.PacketsRecv -= baseline.IOCounters.PacketsRecv
1526+
stats.IOCounters.Errin -= baseline.IOCounters.Errin
1527+
stats.IOCounters.Dropin -= baseline.IOCounters.Dropin
1528+
stats.IOCounters.BytesSent -= baseline.IOCounters.BytesSent
1529+
stats.IOCounters.PacketsSent -= baseline.IOCounters.PacketsSent
1530+
stats.IOCounters.Errout -= baseline.IOCounters.Errout
1531+
stats.IOCounters.Dropout -= baseline.IOCounters.Dropout
1532+
stats.TCPRetransSegs -= baseline.TCPRetransSegs
1533+
stats.TCPFastRetrans -= baseline.TCPFastRetrans
1534+
stats.TCPTimeouts -= baseline.TCPTimeouts
1535+
stats.TCPSlowStartRetrans -= baseline.TCPSlowStartRetrans
1536+
stats.TCPLossProbes -= baseline.TCPLossProbes
14941537
}
14951538

14961539
// GetProcCPUTime returns the cumulative user/system time (in ms) since the process start.

pkg/server/status/runtime_test.go

Lines changed: 102 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ import (
1212
"runtime"
1313
"runtime/metrics"
1414
"testing"
15+
"time"
1516

1617
"github.com/cockroachdb/cockroach/pkg/testutils/skip"
1718
"github.com/cockroachdb/cockroach/pkg/util/hlc"
@@ -129,12 +130,54 @@ func TestSubtractDiskCounters(t *testing.T) {
129130
// Don't touch iops in progress; it is a gauge, not a counter.
130131
iopsInProgress: 3,
131132
}
132-
subtractDiskCounters(&from, sub)
133+
subtractDiskCounters(context.Background(), &from, &sub)
133134
if !reflect.DeepEqual(from, expected) {
134135
t.Fatalf("expected %+v; got %+v", expected, from)
135136
}
136137
}
137138

139+
func TestSubtractDiskCountersReset(t *testing.T) {
140+
defer leaktest.AfterTest(t)()
141+
142+
// Test case where current values are lower than baseline (indicating counter reset)
143+
stats := DiskStats{
144+
ReadBytes: 2, // lower than sub (10)
145+
readCount: 5, // higher than sub (3)
146+
readTime: time.Millisecond * 100, // lower than sub (200ms)
147+
WriteBytes: 3, // lower than sub (8)
148+
writeCount: 7, // higher than sub (4)
149+
writeTime: time.Millisecond * 150, // higher than sub (50ms)
150+
ioTime: time.Millisecond * 80, // lower than sub (120ms)
151+
weightedIOTime: time.Millisecond * 200, // higher than sub (180ms)
152+
iopsInProgress: 5, // gauge - should not be affected
153+
}
154+
baseline := DiskStats{
155+
ReadBytes: 10,
156+
readCount: 3,
157+
readTime: time.Millisecond * 200,
158+
WriteBytes: 8,
159+
writeCount: 4,
160+
writeTime: time.Millisecond * 50,
161+
ioTime: time.Millisecond * 120,
162+
weightedIOTime: time.Millisecond * 180,
163+
iopsInProgress: 2,
164+
}
165+
166+
// Expected: all values should be reset
167+
expected := DiskStats{}
168+
169+
// Verify that baselines were updated correctly for reset fields
170+
expectedBaseline := stats
171+
172+
subtractDiskCounters(context.Background(), &stats, &baseline)
173+
if !reflect.DeepEqual(stats, expected) {
174+
t.Fatalf("expected %+v; got %+v", expected, stats)
175+
}
176+
if !reflect.DeepEqual(baseline, expectedBaseline) {
177+
t.Fatalf("expected sub %+v; got %+v", expectedBaseline, baseline)
178+
}
179+
}
180+
138181
func TestSubtractNetCounters(t *testing.T) {
139182
defer leaktest.AfterTest(t)()
140183

@@ -180,12 +223,69 @@ func TestSubtractNetCounters(t *testing.T) {
180223
TCPRetransSegs: 3,
181224
TCPFastRetrans: 6,
182225
}
183-
subtractNetworkCounters(&from, sub)
226+
subtractNetworkCounters(context.Background(), &from, &sub)
184227
if !reflect.DeepEqual(from, expected) {
185228
t.Fatalf("expected %+v; got %+v", expected, from)
186229
}
187230
}
188231

232+
func TestSubtractNetCountersReset(t *testing.T) {
233+
defer leaktest.AfterTest(t)()
234+
235+
// Test case where current values are lower than baseline (indicating counter reset)
236+
stats := netCounters{
237+
IOCounters: net.IOCountersStat{
238+
PacketsRecv: 2, // lower than sub (5)
239+
BytesRecv: 1, // lower than sub (10)
240+
Errin: 3, // higher than sub (2)
241+
Dropin: 1, // lower than sub (4)
242+
BytesSent: 5, // lower than sub (8)
243+
PacketsSent: 7, // higher than sub (3)
244+
Errout: 2, // lower than sub (6)
245+
Dropout: 4, // higher than sub (1)
246+
},
247+
TCPRetransSegs: 15, // lower than sub (20)
248+
TCPFastRetrans: 8, // higher than sub (5)
249+
TCPTimeouts: 3, // lower than sub (12)
250+
TCPSlowStartRetrans: 9, // higher than sub (7)
251+
TCPLossProbes: 1, // lower than sub (4)
252+
}
253+
baseline := netCounters{
254+
IOCounters: net.IOCountersStat{
255+
PacketsRecv: 5,
256+
BytesRecv: 10,
257+
Errin: 2,
258+
Dropin: 4,
259+
BytesSent: 8,
260+
PacketsSent: 3,
261+
Errout: 6,
262+
Dropout: 1,
263+
},
264+
TCPRetransSegs: 20,
265+
TCPFastRetrans: 5,
266+
TCPTimeouts: 12,
267+
TCPSlowStartRetrans: 7,
268+
TCPLossProbes: 4,
269+
}
270+
271+
// Expected: reset values should be set to sub values when from < sub,
272+
// then normal subtraction occurs, resulting in 0 for reset fields
273+
expected := netCounters{
274+
IOCounters: net.IOCountersStat{},
275+
}
276+
277+
// Verify that baselines were updated correctly for reset fields
278+
expectedBaseilne := stats
279+
280+
subtractNetworkCounters(context.Background(), &stats, &baseline)
281+
if !reflect.DeepEqual(stats, expected) {
282+
t.Fatalf("expected %+v; got %+v", expected, stats)
283+
}
284+
if !reflect.DeepEqual(baseline, expectedBaseilne) {
285+
t.Fatalf("expected sub %+v; got %+v", expectedBaseilne, baseline)
286+
}
287+
}
288+
189289
func TestFloat64HistogramSum(t *testing.T) {
190290
defer leaktest.AfterTest(t)()
191291
type testCase struct {

0 commit comments

Comments
 (0)