Skip to content

Commit c5ac4c9

Browse files
craig[bot] and RaduBerinde committed
Merge #155069
155069: kvserver: improve WAL metric descriptions r=RaduBerinde a=RaduBerinde Epic: none Release note: None Co-authored-by: Radu Berinde <[email protected]>
2 parents 2eff7bd + b339daf commit c5ac4c9

File tree

2 files changed

+30
-20
lines changed

2 files changed

+30
-20
lines changed

docs/generated/metrics/metrics.yaml

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -10553,13 +10553,13 @@ layers:
1055310553
essential: true
1055410554
- name: storage.wal.fsync.latency
1055510555
exported_name: storage_wal_fsync_latency
10556-
description: The write ahead log fsync latency
10556+
description: The fsync latency to the Write-Ahead Log device.
1055710557
y_axis_label: Fsync Latency
1055810558
type: HISTOGRAM
1055910559
unit: NANOSECONDS
1056010560
aggregation: AVG
1056110561
derivative: NONE
10562-
how_to_use: If this value is greater than `100ms`, it is an indication of a disk stall. To mitigate the effects of disk stalls, consider deploying your cluster with WAL failover configured.
10562+
how_to_use: If this value is greater than 100ms, it is an indication of a disk stall. To mitigate the effects of disk stalls, consider deploying your cluster with WAL failover configured. When WAL failover is configured, the more relevant metric is storage.wal.failover.write_and_sync.latency, as this metric reflects the fsync latency of the primary and/or the secondary WAL device.
1056310563
essential: true
1056410564
- name: storage.write-stalls
1056510565
exported_name: storage_write_stalls
@@ -17311,36 +17311,38 @@ layers:
1731117311
derivative: NON_NEGATIVE_DERIVATIVE
1731217312
- name: storage.wal.bytes_in
1731317313
exported_name: storage_wal_bytes_in
17314-
description: The number of logical bytes the storage engine has written to the WAL
17314+
description: The number of logical bytes the storage engine has written to the Write-Ahead Log.
1731517315
y_axis_label: Events
1731617316
type: COUNTER
1731717317
unit: COUNT
1731817318
aggregation: AVG
1731917319
derivative: NON_NEGATIVE_DERIVATIVE
1732017320
- name: storage.wal.bytes_written
1732117321
exported_name: storage_wal_bytes_written
17322-
description: The number of bytes the storage engine has written to the WAL
17322+
description: The number of bytes the storage engine has written to the Write-Ahead Log.
1732317323
y_axis_label: Events
1732417324
type: COUNTER
1732517325
unit: COUNT
1732617326
aggregation: AVG
1732717327
derivative: NON_NEGATIVE_DERIVATIVE
1732817328
- name: storage.wal.failover.primary.duration
1732917329
exported_name: storage_wal_failover_primary_duration
17330-
description: Cumulative time spent writing to the primary WAL directory. Only populated when WAL failover is configured
17330+
description: Cumulative time spent writing to the primary WAL directory.
1733117331
y_axis_label: Nanoseconds
1733217332
type: COUNTER
1733317333
unit: NANOSECONDS
1733417334
aggregation: AVG
1733517335
derivative: NON_NEGATIVE_DERIVATIVE
17336+
how_to_use: Only populated when WAL failover is configured.
1733617337
- name: storage.wal.failover.secondary.duration
1733717338
exported_name: storage_wal_failover_secondary_duration
17338-
description: Cumulative time spent writing to the secondary WAL directory. Only populated when WAL failover is configured
17339+
description: Cumulative time spent writing to the secondary WAL directory.
1733917340
y_axis_label: Nanoseconds
1734017341
type: COUNTER
1734117342
unit: NANOSECONDS
1734217343
aggregation: AVG
1734317344
derivative: NON_NEGATIVE_DERIVATIVE
17345+
how_to_use: Only populated when WAL failover is configured.
1734417346
- name: storage.wal.failover.switch.count
1734517347
exported_name: storage_wal_failover_switch_count
1734617348
description: Count of the number of times WAL writing has switched from primary to secondary and vice versa.
@@ -17349,14 +17351,16 @@ layers:
1734917351
unit: COUNT
1735017352
aggregation: AVG
1735117353
derivative: NON_NEGATIVE_DERIVATIVE
17354+
how_to_use: Only populated when WAL failover is configured. A high switch count indicates that many disk stalls were encountered.
1735217355
- name: storage.wal.failover.write_and_sync.latency
1735317356
exported_name: storage_wal_failover_write_and_sync_latency
17354-
description: The observed latency for writing and syncing to the write ahead log. Only populated when WAL failover is configured
17357+
description: The observed latency for writing and syncing to the logical Write-Ahead Log.
1735517358
y_axis_label: Nanoseconds
1735617359
type: HISTOGRAM
1735717360
unit: NANOSECONDS
1735817361
aggregation: AVG
1735917362
derivative: NONE
17363+
how_to_use: Only populated when WAL failover is configured. Without WAL failover, the relevant metric is storage.wal.fsync.latency.
1736017364
- name: storage.write-amplification
1736117365
exported_name: storage_write_amplification
1736217366
description: |-

pkg/kv/kvserver/metrics.go

Lines changed: 19 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -2774,52 +2774,58 @@ Note that the measurement does not include the duration for replicating the eval
27742774
}
27752775
metaWALBytesWritten = metric.Metadata{
27762776
Name: "storage.wal.bytes_written",
2777-
Help: "The number of bytes the storage engine has written to the WAL",
2777+
Help: "The number of bytes the storage engine has written to the Write-Ahead Log.",
27782778
Measurement: "Events",
27792779
Unit: metric.Unit_COUNT,
27802780
}
27812781
metaWALBytesIn = metric.Metadata{
27822782
Name: "storage.wal.bytes_in",
2783-
Help: "The number of logical bytes the storage engine has written to the WAL",
2783+
Help: "The number of logical bytes the storage engine has written to the Write-Ahead Log.",
27842784
Measurement: "Events",
27852785
Unit: metric.Unit_COUNT,
27862786
}
27872787
metaStorageFsyncLatency = metric.Metadata{
27882788
Name: "storage.wal.fsync.latency",
2789-
Help: "The write ahead log fsync latency",
2789+
Help: "The fsync latency to the Write-Ahead Log device.",
27902790
Measurement: "Fsync Latency",
27912791
Unit: metric.Unit_NANOSECONDS,
27922792
Essential: true,
27932793
Category: metric.Metadata_STORAGE,
2794-
HowToUse: "If this value is greater than `100ms`, it is an indication of a disk stall. To mitigate the effects of disk stalls, consider deploying your cluster with WAL failover configured.",
2794+
HowToUse: "If this value is greater than 100ms, it is an indication of a disk stall. " +
2795+
"To mitigate the effects of disk stalls, consider deploying your cluster with WAL failover configured. " +
2796+
"When WAL failover is configured, the more relevant metric is storage.wal.failover.write_and_sync.latency, as " +
2797+
"this metric reflects the fsync latency of the primary and/or the secondary WAL device.",
27952798
}
27962799
metaStorageWALFailoverSwitchCount = metric.Metadata{
27972800
Name: "storage.wal.failover.switch.count",
27982801
Help: "Count of the number of times WAL writing has switched from primary to secondary " +
27992802
"and vice versa.",
28002803
Measurement: "Events",
28012804
Unit: metric.Unit_COUNT,
2805+
HowToUse: "Only populated when WAL failover is configured. A high switch count indicates that " +
2806+
"many disk stalls were encountered.",
28022807
}
28032808
metaStorageWALFailoverPrimaryDuration = metric.Metadata{
2804-
Name: "storage.wal.failover.primary.duration",
2805-
Help: "Cumulative time spent writing to the primary WAL directory. Only populated " +
2806-
"when WAL failover is configured",
2809+
Name: "storage.wal.failover.primary.duration",
2810+
Help: "Cumulative time spent writing to the primary WAL directory.",
28072811
Measurement: "Nanoseconds",
28082812
Unit: metric.Unit_NANOSECONDS,
2813+
HowToUse: "Only populated when WAL failover is configured.",
28092814
}
28102815
metaStorageWALFailoverSecondaryDuration = metric.Metadata{
2811-
Name: "storage.wal.failover.secondary.duration",
2812-
Help: "Cumulative time spent writing to the secondary WAL directory. Only populated " +
2813-
"when WAL failover is configured",
2816+
Name: "storage.wal.failover.secondary.duration",
2817+
Help: "Cumulative time spent writing to the secondary WAL directory.",
28142818
Measurement: "Nanoseconds",
28152819
Unit: metric.Unit_NANOSECONDS,
2820+
HowToUse: "Only populated when WAL failover is configured.",
28162821
}
28172822
metaStorageWALFailoverWriteAndSyncLatency = metric.Metadata{
2818-
Name: "storage.wal.failover.write_and_sync.latency",
2819-
Help: "The observed latency for writing and syncing to the write ahead log. Only populated " +
2820-
"when WAL failover is configured",
2823+
Name: "storage.wal.failover.write_and_sync.latency",
2824+
Help: "The observed latency for writing and syncing to the logical Write-Ahead Log.",
28212825
Measurement: "Nanoseconds",
28222826
Unit: metric.Unit_NANOSECONDS,
2827+
HowToUse: "Only populated when WAL failover is configured. Without WAL failover, the relevant " +
2828+
"metric is storage.wal.fsync.latency.",
28232829
}
28242830
metaReplicaReadBatchDroppedLatchesBeforeEval = metric.Metadata{
28252831
Name: "kv.replica_read_batch_evaluate.dropped_latches_before_eval",

0 commit comments

Comments
 (0)