Skip to content

Commit 2609d4f

Browse files
committed
mgr/dashboard: Add /health/snapshot api
Fixes https://tracker.ceph.com/issues/72609
- The current minimal API relies on fetching data from osdmap and pgmap.
- These commands produce large, detailed payloads that become a performance bottleneck and impact scalability, especially in large clusters.
- To address this, we propose switching to a snapshot API built on the `ceph status` command, which retrieves essential information directly from the cluster map.
- `ceph status` is significantly more lightweight than osdmap/pgmap, reducing payload sizes and processing overhead.
- This change ensures faster response times, improves system efficiency in large deployments, and minimizes unnecessary data transfer.
- Update tests.
Signed-off-by: Afreen Misbah <[email protected]>
1 parent 86006fb commit 2609d4f

21 files changed

+734
-468
lines changed

src/pybind/mgr/dashboard/controllers/health.py

Lines changed: 149 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,55 @@
110110
'scrub_status': (str, '')
111111
})
112112

113+
# Documented 200-response shape of the ``snapshot`` endpoint.
# Every leaf is a ``(type, description)`` pair; nested objects are
# ``(dict, description)`` and lists are ``([item], description)``.
# Apart from 'fsid' and 'health', the endpoint only adds a section when the
# caller has READ permission on the matching scope, so any of the other keys
# may be absent from an actual response.
HEALTH_SNAPSHOT_SCHEMA = ({
    'fsid': (str, 'Cluster filesystem ID'),
    'health': ({
        'status': (str, 'Overall health status'),
        'checks': ({
            '<check_name>': ({
                'severity': (str, 'Health severity level'),
                'summary': ({
                    'message': (str, 'Human-readable summary'),
                    'count': (int, 'Occurrence count')
                }, 'Summary details'),
                'muted': (bool, 'Whether the check is muted')
            }, 'Individual health check object')
        }, 'Health checks keyed by name'),
        'mutes': ([str], 'List of muted check names')
    }, 'Cluster health overview'),
    'monmap': ({
        'num_mons': (int, 'Number of monitors')
    }, 'Monitor map details'),
    'osdmap': ({
        'in': (int, 'Number of OSDs in'),
        'up': (int, 'Number of OSDs up'),
        'num_osds': (int, 'Total OSD count')
    }, 'OSD map details'),
    'pgmap': ({
        'pgs_by_state': ([{
            'state_name': (str, 'Placement group state'),
            'count': (int, 'Count of PGs in this state')
        }], 'List of PG counts by state'),
        'num_pools': (int, 'Number of pools'),
        'num_pgs': (int, 'Total PG count'),
        'bytes_used': (int, 'Used capacity in bytes'),
        'bytes_total': (int, 'Total capacity in bytes'),
    }, 'Placement group map details'),
    'mgrmap': ({
        'num_active': (int, 'Number of active managers'),
        'num_standbys': (int, 'Standby manager count')
    }, 'Manager map details'),
    'fsmap': ({
        'num_active': (int, 'Number of active mds'),
        'num_standbys': (int, 'Standby MDS count'),
    }, 'Filesystem map details'),
    'num_rgw_gateways': (int, 'Count of RGW gateway daemons running'),
    'num_iscsi_gateways': ({
        'up': (int, 'Count of iSCSI gateways running'),
        'down': (int, 'Count of iSCSI gateways not running')
    }, 'Iscsi gateways status'),
    # Returned by snapshot() for callers with READ on the hosts scope; it was
    # missing from the documented response.
    'num_hosts': (int, 'Count of hosts'),
})
161+
113162

114163
class HealthData(object):
115164
"""
@@ -281,15 +330,28 @@ def scrub_status(self):
281330
class Health(BaseController):
282331
def __init__(self):
283332
super().__init__()
284-
self.health_full = HealthData(self._has_permissions, minimal=False)
285-
self.health_minimal = HealthData(self._has_permissions, minimal=True)
333+
self._health_full = None
334+
self._health_minimal = None
335+
336+
@property
def health_full(self):
    """Return the ``HealthData`` helper built with ``minimal=False``.

    The helper is created on first access and cached on the instance, so
    constructing the controller does not build it eagerly.
    """
    if self._health_full is not None:
        return self._health_full
    self._health_full = HealthData(self._has_permissions, minimal=False)
    return self._health_full
341+
342+
@property
def health_minimal(self):
    """Return the ``HealthData`` helper built with ``minimal=True``.

    The helper is created on first access and cached on the instance, so
    constructing the controller does not build it eagerly.
    """
    if self._health_minimal is not None:
        return self._health_minimal
    self._health_minimal = HealthData(self._has_permissions, minimal=True)
    return self._health_minimal
286347

287348
@Endpoint()
349+
@EndpointDoc("Get Cluster's detailed health report")
288350
def full(self):
289351
return self.health_full.all_health()
290352

291353
@Endpoint()
292-
@EndpointDoc("Get Cluster's minimal health report",
354+
@EndpointDoc("Get Cluster's health report with lesser details",
293355
responses={200: HEALTH_MINIMAL_SCHEMA})
294356
def minimal(self):
295357
return self.health_minimal.all_health()
@@ -305,3 +367,87 @@ def get_cluster_fsid(self):
305367
@Endpoint()
306368
def get_telemetry_status(self):
307369
return mgr.get_module_option_ex('telemetry', 'enabled', False)
370+
371+
@Endpoint()
@EndpointDoc(
    "Get a quick overview of cluster health at a moment, analogous to "
    "the ceph status command in CLI.",
    responses={200: HEALTH_SNAPSHOT_SCHEMA})
def snapshot(self):
    """Build a compact cluster overview from the mon ``status`` command.

    'fsid' and 'health' are always returned. Every other section is added
    only when the caller has READ permission on the corresponding scope
    (monitor, OSD, manager, CephFS, RGW, iSCSI, hosts).

    :return: dict holding the subset of the status output selected below
    """
    status = CephService.send_command('mon', 'status')
    health = status.get('health', {})

    report = {
        'fsid': status.get('fsid'),
        'health': {
            'status': health.get('status'),
            'checks': health.get('checks', {}),
            'mutes': health.get('mutes', []),
        },
    }

    if self._has_permissions(Permission.READ, Scope.MONITOR):
        report['monmap'] = {
            'num_mons': status.get('monmap', {}).get('num_mons'),
        }

    if self._has_permissions(Permission.READ, Scope.OSD):
        # OSD and PG figures are both gated by the OSD scope.
        osd_stats = status.get('osdmap', {})
        pg_stats = status.get('pgmap', {})
        report['osdmap'] = {
            'in': osd_stats.get('num_in_osds'),
            'up': osd_stats.get('num_up_osds'),
            'num_osds': osd_stats.get('num_osds'),
        }
        report['pgmap'] = {
            'pgs_by_state': pg_stats.get('pgs_by_state', []),
            **{
                field: pg_stats.get(field)
                for field in ('num_pools', 'num_pgs', 'bytes_used', 'bytes_total')
            },
        }

    if self._has_permissions(Permission.READ, Scope.MANAGER):
        mgr_stats = status.get('mgrmap', {})
        # The status output only exposes an 'available' flag, so the active
        # manager count is reported as either 1 or 0.
        report['mgrmap'] = {
            'num_active': 1 if mgr_stats.get('available', False) else 0,
            'num_standbys': mgr_stats.get('num_standbys'),
        }

    if self._has_permissions(Permission.READ, Scope.CEPHFS):
        fs_stats = status.get('fsmap', {})
        rank_states = [entry.get('status', '') for entry in fs_stats.get('by_rank', [])]

        # Standby-replay daemons are listed by rank but are reported with
        # the standbys; every other 'up:*' state counts as active.
        replay_total = rank_states.count('up:standby-replay')
        active_total = sum(
            1 for state in rank_states
            if state != 'up:standby-replay' and state.startswith('up:')
        )

        report['fsmap'] = {
            'num_active': active_total,
            'num_standbys': fs_stats.get('up:standby', 0) + replay_total,
        }

    if self._has_permissions(Permission.READ, Scope.RGW):
        rgw_service = status.get('servicemap', {}).get('services', {}).get('rgw', {})
        rgw_daemons = rgw_service.get('daemons', {}) or {}
        # Drop the 'summary' key before counting (presumably it is not a
        # daemon entry -- confirm against the servicemap format).
        rgw_daemons.pop("summary", None)
        report['num_rgw_gateways'] = len(rgw_daemons)

    if self._has_permissions(Permission.READ, Scope.ISCSI):
        report['num_iscsi_gateways'] = self.health_minimal.iscsi_daemons()

    if self._has_permissions(Permission.READ, Scope.HOSTS):
        report['num_hosts'] = len(get_hosts())

    return report

src/pybind/mgr/dashboard/frontend/package-lock.json

Lines changed: 56 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

src/pybind/mgr/dashboard/frontend/package.json

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -142,10 +142,11 @@
142142
"postcss-scss": "4.0.9",
143143
"prettier": "2.1.2",
144144
"pretty-quick": "3.0.2",
145+
"purgecss": "7.0.2",
145146
"start-server-and-test": "2.0.3",
146147
"stylelint": "16.20.0",
147-
"stylelint-scss": "6.12.1",
148148
"stylelint-config-standard": "38.0.0",
149+
"stylelint-scss": "6.12.1",
149150
"table": "6.8.0",
150151
"ts-node": "10.9.2",
151152
"typescript": "5.4.5",
@@ -155,6 +156,6 @@
155156
"stepDefinitions": "cypress/e2e/common"
156157
},
157158
"optionalDependencies": {
158-
"@rollup/rollup-linux-arm64-gnu": "4.22.4"
159+
"@rollup/rollup-linux-arm64-gnu": "4.22.4"
159160
}
160-
}
161+
}

src/pybind/mgr/dashboard/frontend/src/app/ceph/dashboard-v3/dashboard/dashboard-v3.component.html

Lines changed: 17 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -73,22 +73,22 @@
7373
[dropdownData]="(isHardwareEnabled$ | async) && (hardwareSummary$ | async)">
7474
</cd-card-row>
7575
<!-- Monitors -->
76-
<cd-card-row [data]="monMap?.monmap.mons.length"
76+
<cd-card-row [data]="monCount"
7777
link="/monitor"
7878
title="Monitor"
7979
summaryType="simplified"></cd-card-row>
8080
<!-- Managers -->
81-
<cd-card-row [data]="mgrMap | mgrSummary"
81+
<cd-card-row [data]="mgrStatus"
8282
title="Manager"></cd-card-row>
8383

8484
<!-- OSDs -->
85-
<cd-card-row [data]="osdMap | osdSummary"
85+
<cd-card-row [data]="osdCount"
8686
link="/osd"
8787
title="OSD"
8888
summaryType="osd"></cd-card-row>
8989

9090
<!-- Pools -->
91-
<cd-card-row [data]="poolStatus?.length"
91+
<cd-card-row [data]="poolCount"
9292
link="/pool"
9393
title="Pool"
9494
summaryType="simplified"></cd-card-row>
@@ -106,7 +106,7 @@
106106
*ngIf="enabledFeature.rgw"></cd-card-row>
107107

108108
<!-- Metadata Servers -->
109-
<cd-card-row [data]="mdsMap | mdsSummary"
109+
<cd-card-row [data]="mdsStatus"
110110
title="Metadata Server"
111111
id="mds-item"
112112
*ngIf="enabledFeature.cephfs"></cd-card-row>
@@ -142,28 +142,28 @@
142142
</div>
143143
<div class="d-flex flex-column ms-4 me-4 mt-4 mb-4">
144144
<div class="d-flex flex-row col-md-3 ms-4">
145-
<i *ngIf="healthData?.status else loadingTpl"
146-
[ngClass]="[healthData.status | healthIcon, icons.large2x]"
147-
[ngStyle]="healthData.status | healthColor"
148-
[title]="healthData.status">
145+
<i *ngIf="healthCardData?.status else loadingTpl"
146+
[ngClass]="[healthCardData.status | healthIcon, icons.large2x]"
147+
[ngStyle]="healthCardData.status | healthColor"
148+
[title]="healthCardData.status">
149149
</i>
150150
<span class="ms-2 mt-n1 lead"
151-
*ngIf="!healthData?.checks?.length"
151+
*ngIf="!hasHealthChecks"
152152
i18n>Cluster</span>
153153
<cds-toggletip [dropShadow]="true"
154154
[autoAlign]="true">
155155
<div cdsToggletipButton>
156156
<a class="ms-2 mt-n1 lead text-primary"
157157
popoverClass="info-card-popover-cluster-status"
158-
*ngIf="healthData?.checks?.length"
158+
*ngIf="hasHealthChecks"
159159
i18n>Cluster
160160
</a>
161161
</div>
162162
<div cdsToggletipContent
163163
#healthCheck>
164164
<div class="cds--popover-scroll-container">
165-
<cd-health-checks *ngIf="healthData?.checks"
166-
[healthData]="healthData.checks">
165+
<cd-health-checks *ngIf="hasHealthChecks"
166+
[healthData]="healthCardData.checks">
167167
</cd-health-checks>
168168
</div>
169169
</div>
@@ -227,8 +227,8 @@
227227
[fullHeight]="true"
228228
aria-label="Capacity card">
229229
<ng-container class="ms-4 me-4"
230-
*ngIf="capacity">
231-
<cd-dashboard-pie [data]="{max: capacity.total_bytes, current: capacity.total_used_raw_bytes}"
230+
*ngIf="totalCapacity && usedCapacity">
231+
<cd-dashboard-pie [data]="{max: totalCapacity, current: usedCapacity}"
232232
[lowThreshold]="capacityCardData.osdNearfull"
233233
[highThreshold]="capacityCardData.osdFull">
234234
</cd-dashboard-pie>
@@ -244,9 +244,9 @@
244244
<div class="ms-4 me-4 mt-0">
245245
<cd-dashboard-time-selector (selectedTime)="getPrometheusData($event)">
246246
</cd-dashboard-time-selector>
247-
<ng-container *ngIf="capacity">
247+
<ng-container *ngIf="usedCapacity">
248248
<cd-dashboard-area-chart chartTitle="Used Capacity (RAW)"
249-
[maxValue]="capacity.total_bytes"
249+
[maxValue]="usedCapacity"
250250
dataUnits="B"
251251
[labelsArray]="['Used Capacity']"
252252
[dataArray]="[queriesResults.USEDCAPACITY]">
@@ -318,13 +318,6 @@ <h6 class="card-title bold">{{ alert.labels.alertname }}</h6>
318318
</ng-container>
319319
</ng-template>
320320

321-
<ng-template #logsLink>
322-
<ng-container *ngIf="permissions.log.read">
323-
<p class="logs-link"
324-
i18n><i [ngClass]="[icons.infoCircle]"></i> See <a routerLink="/logs">Logs</a> for more details.</p>
325-
</ng-container>
326-
</ng-template>
327-
328321
<ng-template #loadingTpl>
329322
<cds-inline-loading></cds-inline-loading>
330323
</ng-template>

0 commit comments

Comments
 (0)