Skip to content

Commit b339daf

Browse files
committed
kvserver: improve WAL metric descriptions
Epic: none Release note: None
1 parent 9e89769 commit b339daf

File tree

2 files changed

+30
-20
lines changed

2 files changed

+30
-20
lines changed

docs/generated/metrics/metrics.yaml

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -10553,13 +10553,13 @@ layers:
1055310553
essential: true
1055410554
- name: storage.wal.fsync.latency
1055510555
exported_name: storage_wal_fsync_latency
10556-
description: The write ahead log fsync latency
10556+
description: The fsync latency to the Write-Ahead Log device.
1055710557
y_axis_label: Fsync Latency
1055810558
type: HISTOGRAM
1055910559
unit: NANOSECONDS
1056010560
aggregation: AVG
1056110561
derivative: NONE
10562-
how_to_use: If this value is greater than `100ms`, it is an indication of a disk stall. To mitigate the effects of disk stalls, consider deploying your cluster with WAL failover configured.
10562+
how_to_use: If this value is greater than 100ms, it is an indication of a disk stall. To mitigate the effects of disk stalls, consider deploying your cluster with WAL failover configured. When WAL failover is configured, the more relevant metric is storage.wal.failover.write_and_sync.latency, as this metric reflects the fsync latency of the primary and/or the secondary WAL device.
1056310563
essential: true
1056410564
- name: storage.write-stalls
1056510565
exported_name: storage_write_stalls
@@ -17287,36 +17287,38 @@ layers:
1728717287
derivative: NON_NEGATIVE_DERIVATIVE
1728817288
- name: storage.wal.bytes_in
1728917289
exported_name: storage_wal_bytes_in
17290-
description: The number of logical bytes the storage engine has written to the WAL
17290+
description: The number of logical bytes the storage engine has written to the Write-Ahead Log.
1729117291
y_axis_label: Events
1729217292
type: COUNTER
1729317293
unit: COUNT
1729417294
aggregation: AVG
1729517295
derivative: NON_NEGATIVE_DERIVATIVE
1729617296
- name: storage.wal.bytes_written
1729717297
exported_name: storage_wal_bytes_written
17298-
description: The number of bytes the storage engine has written to the WAL
17298+
description: The number of bytes the storage engine has written to the Write-Ahead Log.
1729917299
y_axis_label: Events
1730017300
type: COUNTER
1730117301
unit: COUNT
1730217302
aggregation: AVG
1730317303
derivative: NON_NEGATIVE_DERIVATIVE
1730417304
- name: storage.wal.failover.primary.duration
1730517305
exported_name: storage_wal_failover_primary_duration
17306-
description: Cumulative time spent writing to the primary WAL directory. Only populated when WAL failover is configured
17306+
description: Cumulative time spent writing to the primary WAL directory.
1730717307
y_axis_label: Nanoseconds
1730817308
type: COUNTER
1730917309
unit: NANOSECONDS
1731017310
aggregation: AVG
1731117311
derivative: NON_NEGATIVE_DERIVATIVE
17312+
how_to_use: Only populated when WAL failover is configured.
1731217313
- name: storage.wal.failover.secondary.duration
1731317314
exported_name: storage_wal_failover_secondary_duration
17314-
description: Cumulative time spent writing to the secondary WAL directory. Only populated when WAL failover is configured
17315+
description: Cumulative time spent writing to the secondary WAL directory.
1731517316
y_axis_label: Nanoseconds
1731617317
type: COUNTER
1731717318
unit: NANOSECONDS
1731817319
aggregation: AVG
1731917320
derivative: NON_NEGATIVE_DERIVATIVE
17321+
how_to_use: Only populated when WAL failover is configured.
1732017322
- name: storage.wal.failover.switch.count
1732117323
exported_name: storage_wal_failover_switch_count
1732217324
description: Count of the number of times WAL writing has switched from primary to secondary and vice versa.
@@ -17325,14 +17327,16 @@ layers:
1732517327
unit: COUNT
1732617328
aggregation: AVG
1732717329
derivative: NON_NEGATIVE_DERIVATIVE
17330+
how_to_use: Only populated when WAL failover is configured. A high switch count indicates that many disk stalls were encountered.
1732817331
- name: storage.wal.failover.write_and_sync.latency
1732917332
exported_name: storage_wal_failover_write_and_sync_latency
17330-
description: The observed latency for writing and syncing to the write ahead log. Only populated when WAL failover is configured
17333+
description: The observed latency for writing and syncing to the logical Write-Ahead Log.
1733117334
y_axis_label: Nanoseconds
1733217335
type: HISTOGRAM
1733317336
unit: NANOSECONDS
1733417337
aggregation: AVG
1733517338
derivative: NONE
17339+
how_to_use: Only populated when WAL failover is configured. Without WAL failover, the relevant metric is storage.wal.fsync.latency.
1733617340
- name: storage.write-amplification
1733717341
exported_name: storage_write_amplification
1733817342
description: |-

pkg/kv/kvserver/metrics.go

Lines changed: 19 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -2774,52 +2774,58 @@ Note that the measurement does not include the duration for replicating the eval
27742774
}
27752775
metaWALBytesWritten = metric.Metadata{
27762776
Name: "storage.wal.bytes_written",
2777-
Help: "The number of bytes the storage engine has written to the WAL",
2777+
Help: "The number of bytes the storage engine has written to the Write-Ahead Log.",
27782778
Measurement: "Events",
27792779
Unit: metric.Unit_COUNT,
27802780
}
27812781
metaWALBytesIn = metric.Metadata{
27822782
Name: "storage.wal.bytes_in",
2783-
Help: "The number of logical bytes the storage engine has written to the WAL",
2783+
Help: "The number of logical bytes the storage engine has written to the Write-Ahead Log.",
27842784
Measurement: "Events",
27852785
Unit: metric.Unit_COUNT,
27862786
}
27872787
metaStorageFsyncLatency = metric.Metadata{
27882788
Name: "storage.wal.fsync.latency",
2789-
Help: "The write ahead log fsync latency",
2789+
Help: "The fsync latency to the Write-Ahead Log device.",
27902790
Measurement: "Fsync Latency",
27912791
Unit: metric.Unit_NANOSECONDS,
27922792
Essential: true,
27932793
Category: metric.Metadata_STORAGE,
2794-
HowToUse: "If this value is greater than `100ms`, it is an indication of a disk stall. To mitigate the effects of disk stalls, consider deploying your cluster with WAL failover configured.",
2794+
HowToUse: "If this value is greater than 100ms, it is an indication of a disk stall. " +
2795+
"To mitigate the effects of disk stalls, consider deploying your cluster with WAL failover configured. " +
2796+
"When WAL failover is configured, the more relevant metric is storage.wal.failover.write_and_sync.latency, as " +
2797+
"this metric reflects the fsync latency of the primary and/or the secondary WAL device.",
27952798
}
27962799
metaStorageWALFailoverSwitchCount = metric.Metadata{
27972800
Name: "storage.wal.failover.switch.count",
27982801
Help: "Count of the number of times WAL writing has switched from primary to secondary " +
27992802
"and vice versa.",
28002803
Measurement: "Events",
28012804
Unit: metric.Unit_COUNT,
2805+
HowToUse: "Only populated when WAL failover is configured. A high switch count indicates that " +
2806+
"many disk stalls were encountered.",
28022807
}
28032808
metaStorageWALFailoverPrimaryDuration = metric.Metadata{
2804-
Name: "storage.wal.failover.primary.duration",
2805-
Help: "Cumulative time spent writing to the primary WAL directory. Only populated " +
2806-
"when WAL failover is configured",
2809+
Name: "storage.wal.failover.primary.duration",
2810+
Help: "Cumulative time spent writing to the primary WAL directory.",
28072811
Measurement: "Nanoseconds",
28082812
Unit: metric.Unit_NANOSECONDS,
2813+
HowToUse: "Only populated when WAL failover is configured.",
28092814
}
28102815
metaStorageWALFailoverSecondaryDuration = metric.Metadata{
2811-
Name: "storage.wal.failover.secondary.duration",
2812-
Help: "Cumulative time spent writing to the secondary WAL directory. Only populated " +
2813-
"when WAL failover is configured",
2816+
Name: "storage.wal.failover.secondary.duration",
2817+
Help: "Cumulative time spent writing to the secondary WAL directory.",
28142818
Measurement: "Nanoseconds",
28152819
Unit: metric.Unit_NANOSECONDS,
2820+
HowToUse: "Only populated when WAL failover is configured.",
28162821
}
28172822
metaStorageWALFailoverWriteAndSyncLatency = metric.Metadata{
2818-
Name: "storage.wal.failover.write_and_sync.latency",
2819-
Help: "The observed latency for writing and syncing to the write ahead log. Only populated " +
2820-
"when WAL failover is configured",
2823+
Name: "storage.wal.failover.write_and_sync.latency",
2824+
Help: "The observed latency for writing and syncing to the logical Write-Ahead Log.",
28212825
Measurement: "Nanoseconds",
28222826
Unit: metric.Unit_NANOSECONDS,
2827+
HowToUse: "Only populated when WAL failover is configured. Without WAL failover, the relevant " +
2828+
"metric is storage.wal.fsync.latency.",
28232829
}
28242830
metaReplicaReadBatchDroppedLatchesBeforeEval = metric.Metadata{
28252831
Name: "kv.replica_read_batch_evaluate.dropped_latches_before_eval",

0 commit comments

Comments
 (0)