Skip to content

Commit 26dbe94

Browse files
authored
Aggregate multiprocess replica metrics for console overview view (#31850)
We use this query in the Console to power the Environment Overview page and cluster replica graphs for the 14 day time period. <!-- Describe the contents of the PR briefly but completely. If you write detailed commit messages, it is acceptable to copy/paste them here, or write "see commit messages for details." If there is only one commit in the PR, GitHub will have already added its commit message above. --> ### Motivation Sibling PR to MaterializeInc/console#3733. The query here should be kept in sync with the duplicated query there. Gabor noticed the utilization graphs were broken for clusters with multiple processes. https://materializeinc.slack.com/archives/CU7ELJ6E9/p1741596295446089 <!-- Which of the following best describes the motivation behind this PR? * This PR fixes a recognized bug. [Ensure issue is linked somewhere.] * This PR adds a known-desirable feature. [Ensure issue is linked somewhere.] * This PR fixes a previously unreported bug. [Describe the bug in detail, as if you were filing a bug report.] * This PR adds a feature that has not yet been specified. [Write a brief specification for the feature, including justification for its inclusion in Materialize, as if you were writing the original feature specification.] * This PR refactors existing code. [Describe what was wrong with the existing code, if it is not obvious.] --> ### Tips for reviewer <!-- Leave some tips for your reviewer, like: * The diff is much smaller if viewed with whitespace hidden. * [Some function/module/file] deserves extra attention. * [Some function/module/file] is pure code movement and only needs a skim. Delete this section if no tips. --> ### Checklist - [x] This PR has adequate test coverage / QA involvement has been duly considered. ([trigger-ci for additional test/nightly runs](https://trigger-ci.dev.materialize.com/)) I've tested this manually, and I also added a unit test in MaterializeInc/console#3733
1 parent e5475a3 commit 26dbe94

File tree

2 files changed

+47
-30
lines changed

2 files changed

+47
-30
lines changed

src/catalog/src/builtin.rs

Lines changed: 42 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -8105,7 +8105,8 @@ JOIN root_times r USING (id)",
81058105

81068106
/**
81078107
* This view is used to display the cluster utilization over 14 days bucketed by 8 hours.
8108-
* It's specifically for the Console's environment overview page to speed up load times
8108+
* It's specifically for the Console's environment overview page to speed up load times.
8109+
* This query should be kept in sync with MaterializeInc/console/src/api/materialize/cluster/replicaUtilizationHistory.ts
81098110
*/
81108111
pub static MZ_CONSOLE_CLUSTER_UTILIZATION_OVERVIEW: LazyLock<BuiltinView> = LazyLock::new(|| {
81118112
BuiltinView {
@@ -8143,70 +8144,87 @@ pub static MZ_CONSOLE_CLUSTER_UTILIZATION_OVERVIEW: LazyLock<BuiltinView> = Lazy
81438144
cluster_id
81448145
FROM mz_catalog.mz_cluster_replicas
81458146
),
8147+
replica_metrics_history AS (
8148+
SELECT
8149+
m.occurred_at,
8150+
m.replica_id,
8151+
r.size,
8152+
(SUM(m.cpu_nano_cores::float8) / s.cpu_nano_cores) / s.processes AS cpu_percent,
8153+
(SUM(m.memory_bytes::float8) / s.memory_bytes) / s.processes AS memory_percent,
8154+
(SUM(m.disk_bytes::float8) / s.disk_bytes) / s.processes AS disk_percent,
8155+
SUM(m.disk_bytes::float8) AS disk_bytes,
8156+
SUM(m.memory_bytes::float8) AS memory_bytes,
8157+
s.disk_bytes::numeric * s.processes AS total_disk_bytes,
8158+
s.memory_bytes::numeric * s.processes AS total_memory_bytes
8159+
FROM
8160+
replica_history AS r
8161+
INNER JOIN mz_catalog.mz_cluster_replica_sizes AS s ON r.size = s.size
8162+
INNER JOIN mz_internal.mz_cluster_replica_metrics_history AS m ON m.replica_id = r.replica_id
8163+
GROUP BY
8164+
m.occurred_at,
8165+
m.replica_id,
8166+
r.size,
8167+
s.cpu_nano_cores,
8168+
s.memory_bytes,
8169+
s.disk_bytes,
8170+
s.processes
8171+
),
81468172
replica_utilization_history_binned AS (
81478173
SELECT m.occurred_at,
81488174
m.replica_id,
8149-
m.process_id,
8150-
(m.cpu_nano_cores::float8 / s.cpu_nano_cores) AS cpu_percent,
8151-
(m.memory_bytes::float8 / s.memory_bytes) AS memory_percent,
8152-
(m.disk_bytes::float8 / s.disk_bytes) AS disk_percent,
8153-
m.disk_bytes::float8 AS disk_bytes,
8154-
m.memory_bytes::float8 AS memory_bytes,
8155-
s.disk_bytes AS total_disk_bytes,
8156-
s.memory_bytes AS total_memory_bytes,
8157-
r.size,
8175+
m.cpu_percent,
8176+
m.memory_percent,
8177+
m.memory_bytes,
8178+
m.disk_percent,
8179+
m.disk_bytes,
8180+
m.total_disk_bytes,
8181+
m.total_memory_bytes,
8182+
m.size,
81588183
date_bin(
81598184
'8 HOURS',
81608185
occurred_at,
81618186
'1970-01-01'::timestamp
81628187
) AS bucket_start
81638188
FROM replica_history AS r
8164-
JOIN mz_catalog.mz_cluster_replica_sizes AS s ON r.size = s.size
8165-
JOIN mz_internal.mz_cluster_replica_metrics_history AS m ON m.replica_id = r.replica_id
8189+
JOIN replica_metrics_history AS m ON m.replica_id = r.replica_id
81668190
WHERE mz_now() <= date_bin(
81678191
'8 HOURS',
81688192
occurred_at,
81698193
'1970-01-01'::timestamp
81708194
) + INTERVAL '14 DAYS'
81718195
),
8172-
-- For each (replica, process_id, bucket), take the (replica, process_id, bucket) with the highest memory
8196+
-- For each (replica, bucket), take the (replica, bucket) with the highest memory
81738197
max_memory AS (
8174-
SELECT DISTINCT ON (bucket_start, replica_id, process_id) bucket_start,
8198+
SELECT DISTINCT ON (bucket_start, replica_id) bucket_start,
81758199
replica_id,
8176-
process_id,
81778200
memory_percent,
81788201
occurred_at
81798202
FROM replica_utilization_history_binned
81808203
OPTIONS (DISTINCT ON INPUT GROUP SIZE = 480)
81818204
ORDER BY bucket_start,
81828205
replica_id,
8183-
process_id,
81848206
COALESCE(memory_bytes, 0) DESC
81858207
),
81868208
max_disk AS (
8187-
SELECT DISTINCT ON (bucket_start, replica_id, process_id) bucket_start,
8209+
SELECT DISTINCT ON (bucket_start, replica_id) bucket_start,
81888210
replica_id,
8189-
process_id,
81908211
disk_percent,
81918212
occurred_at
81928213
FROM replica_utilization_history_binned
81938214
OPTIONS (DISTINCT ON INPUT GROUP SIZE = 480)
81948215
ORDER BY bucket_start,
81958216
replica_id,
8196-
process_id,
81978217
COALESCE(disk_bytes, 0) DESC
81988218
),
81998219
max_cpu AS (
8200-
SELECT DISTINCT ON (bucket_start, replica_id, process_id) bucket_start,
8220+
SELECT DISTINCT ON (bucket_start, replica_id) bucket_start,
82018221
replica_id,
8202-
process_id,
82038222
cpu_percent,
82048223
occurred_at
82058224
FROM replica_utilization_history_binned
82068225
OPTIONS (DISTINCT ON INPUT GROUP SIZE = 480)
82078226
ORDER BY bucket_start,
82088227
replica_id,
8209-
process_id,
82108228
COALESCE(cpu_percent, 0) DESC
82118229
),
82128230
/*
@@ -8216,7 +8234,7 @@ max_cpu AS (
82168234
values may not occur at the same time if the bucket interval is large.
82178235
*/
82188236
max_memory_and_disk AS (
8219-
SELECT DISTINCT ON (bucket_start, replica_id, process_id) bucket_start,
8237+
SELECT DISTINCT ON (bucket_start, replica_id) bucket_start,
82208238
replica_id,
82218239
memory_percent,
82228240
disk_percent,
@@ -8242,10 +8260,9 @@ max_memory_and_disk AS (
82428260
OPTIONS (DISTINCT ON INPUT GROUP SIZE = 480)
82438261
ORDER BY bucket_start,
82448262
replica_id,
8245-
process_id,
82468263
COALESCE(memory_and_disk_percent, 0) DESC
82478264
),
8248-
-- For each (replica, process_id, bucket), get its offline events at that time
8265+
-- For each (replica, bucket), get its offline events at that time
82498266
replica_offline_event_history AS (
82508267
SELECT date_bin(
82518268
'8 HOURS',

test/sqllogictest/distinct_arrangements.slt

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -980,12 +980,12 @@ WHERE mdod.dataflow_name NOT LIKE '%introspection-subscribe%'
980980
GROUP BY mdod.name
981981
ORDER BY mdod.name;
982982
----
983-
AccumulableErrorCheck 9
983+
AccumulableErrorCheck 10
984984
Arrange␠ReduceMinsMaxes 3
985985
Arrange␠export␠iterative 2
986986
Arrange␠export␠iterative␠err 2
987987
Arrange␠recursive␠err 4
988-
ArrangeAccumulable␠[val:␠empty] 9
988+
ArrangeAccumulable␠[val:␠empty] 10
989989
ArrangeBy[[CallBinary␠{␠func:␠JsonbGetString␠{␠stringify:␠true␠},␠expr1:␠Column(1),␠expr2:␠Literal(Ok(Row{[String("id")]}),␠ColumnType␠{␠scalar_type:␠String,␠nullable:␠false␠})␠}]] 2
990990
ArrangeBy[[CallBinary␠{␠func:␠JsonbGetString␠{␠stringify:␠true␠},␠expr1:␠Column(2),␠expr2:␠Literal(Ok(Row{[String("id")]}),␠ColumnType␠{␠scalar_type:␠String,␠nullable:␠false␠})␠}]] 1
991991
ArrangeBy[[CallVariadic␠{␠func:␠Coalesce,␠exprs:␠[Column(2),␠Column(3)]␠}]] 2
@@ -1004,12 +1004,12 @@ ArrangeBy[[Column(0),␠Column(1)]] 2
10041004
ArrangeBy[[Column(0),␠Column(2)]] 4
10051005
ArrangeBy[[Column(0),␠Column(3)]] 4
10061006
ArrangeBy[[Column(0),␠Column(4)]] 1
1007-
ArrangeBy[[Column(0)]] 154
1007+
ArrangeBy[[Column(0)]] 155
10081008
ArrangeBy[[Column(0)]]-errors 44
10091009
ArrangeBy[[Column(1),␠Column(0)]] 1
10101010
ArrangeBy[[Column(1),␠Column(2)]] 2
10111011
ArrangeBy[[Column(1),␠Column(3)]] 1
1012-
ArrangeBy[[Column(1)]] 25
1012+
ArrangeBy[[Column(1)]] 26
10131013
ArrangeBy[[Column(1)]]-errors 7
10141014
ArrangeBy[[Column(13)]] 1
10151015
ArrangeBy[[Column(15)]] 1
@@ -1034,7 +1034,7 @@ Arranged␠TopK␠input 68
10341034
Distinct␠recursive␠err 4
10351035
DistinctBy 47
10361036
DistinctByErrorCheck 47
1037-
ReduceAccumulable 9
1037+
ReduceAccumulable 10
10381038
ReduceInaccumulable 3
10391039
ReduceInaccumulable␠Error␠Check 3
10401040
ReduceMinsMaxes 3

0 commit comments

Comments
 (0)