110110 'scrub_status' : (str , '' )
111111})
112112
# Response schema advertised for the ``snapshot`` endpoint (passed as the
# 200 response to ``EndpointDoc``). Each key maps to a
# ``(type, description)`` tuple; nested dicts and single-element lists
# describe nested objects and lists of objects respectively.
# ``num_hosts`` is listed because ``snapshot`` emits that key as well.
HEALTH_SNAPSHOT_SCHEMA = ({
    'fsid': (str, 'Cluster filesystem ID'),
    'health': ({
        'status': (str, 'Overall health status'),
        'checks': ({
            '<check_name>': ({
                'severity': (str, 'Health severity level'),
                'summary': ({
                    'message': (str, 'Human-readable summary'),
                    'count': (int, 'Occurrence count')
                }, 'Summary details'),
                'muted': (bool, 'Whether the check is muted')
            }, 'Individual health check object')
        }, 'Health checks keyed by name'),
        'mutes': ([str], 'List of muted check names')
    }, 'Cluster health overview'),
    'monmap': ({
        'num_mons': (int, 'Number of monitors')
    }, 'Monitor map details'),
    'osdmap': ({
        'in': (int, 'Number of OSDs in'),
        'up': (int, 'Number of OSDs up'),
        'num_osds': (int, 'Total OSD count')
    }, 'OSD map details'),
    'pgmap': ({
        'pgs_by_state': ([{
            'state_name': (str, 'Placement group state'),
            'count': (int, 'Count of PGs in this state')
        }], 'List of PG counts by state'),
        'num_pools': (int, 'Number of pools'),
        'num_pgs': (int, 'Total PG count'),
        'bytes_used': (int, 'Used capacity in bytes'),
        'bytes_total': (int, 'Total capacity in bytes'),
    }, 'Placement group map details'),
    'mgrmap': ({
        'num_active': (int, 'Number of active managers'),
        'num_standbys': (int, 'Standby manager count')
    }, 'Manager map details'),
    'fsmap': ({
        'num_active': (int, 'Number of active mds'),
        'num_standbys': (int, 'Standby MDS count'),
    }, 'Filesystem map details'),
    'num_rgw_gateways': (int, 'Count of RGW gateway daemons running'),
    'num_iscsi_gateways': ({
        'up': (int, 'Count of iSCSI gateways running'),
        'down': (int, 'Count of iSCSI gateways not running')
    }, 'Iscsi gateways status'),
    'num_hosts': (int, 'Count of hosts'),
})
161+
113162
114163class HealthData (object ):
115164 """
@@ -281,15 +330,28 @@ def scrub_status(self):
281330class Health (BaseController ):
def __init__(self):
    """Initialise the controller with empty slots for both collectors.

    The two ``HealthData`` instances are not built here; the slots start
    as ``None`` and are filled the first time the corresponding property
    is read.
    """
    super().__init__()
    self._health_full = self._health_minimal = None
335+
@property
def health_full(self):
    """Return the ``HealthData`` built with ``minimal=False``.

    The instance is created on first access and stored in
    ``self._health_full`` so later reads return the same object.
    """
    collector = self._health_full
    if collector is None:
        collector = HealthData(self._has_permissions, minimal=False)
        self._health_full = collector
    return collector
341+
@property
def health_minimal(self):
    """Return the ``HealthData`` built with ``minimal=True``.

    The instance is created on first access and stored in
    ``self._health_minimal`` so later reads return the same object.
    """
    collector = self._health_minimal
    if collector is None:
        collector = HealthData(self._has_permissions, minimal=True)
        self._health_minimal = collector
    return collector
286347
@Endpoint()
@EndpointDoc("Get Cluster's detailed health report")
def full(self):
    """Return ``all_health()`` from the ``health_full`` collector."""
    collector = self.health_full
    return collector.all_health()
290352
@Endpoint()
@EndpointDoc("Get Cluster's health report with lesser details ",
             responses={200: HEALTH_MINIMAL_SCHEMA})
def minimal(self):
    """Return ``all_health()`` from the ``health_minimal`` collector."""
    collector = self.health_minimal
    return collector.all_health()
@@ -305,3 +367,87 @@ def get_cluster_fsid(self):
@Endpoint()
def get_telemetry_status(self):
    """Return the ``enabled`` option of the ``telemetry`` mgr module.

    ``False`` is passed as the third argument to
    ``mgr.get_module_option_ex`` (presumably the fallback when the option
    is unset -- confirm against the mgr API).
    """
    telemetry_enabled = mgr.get_module_option_ex('telemetry', 'enabled', False)
    return telemetry_enabled
370+
@Endpoint()
@EndpointDoc(
    "Get a quick overview of cluster health at a moment, analogous to "
    "the ceph status command in CLI.",
    responses={200: HEALTH_SNAPSHOT_SCHEMA})
def snapshot(self):
    """Build a condensed status report from the mon ``status`` command.

    ``fsid`` and ``health`` are always included. Every other section is
    added only when ``self._has_permissions`` grants ``Permission.READ``
    on the matching scope.

    :return: dict with ``fsid`` and ``health`` and, permission permitting,
        ``monmap``, ``osdmap``, ``pgmap``, ``mgrmap``, ``fsmap``,
        ``num_rgw_gateways``, ``num_iscsi_gateways`` and ``num_hosts``.
    """
    def can_read(scope):
        # Single place for the READ-permission check used by every section.
        return self._has_permissions(Permission.READ, scope)

    status = CephService.send_command('mon', 'status')
    health = status.get('health', {})

    report = {
        'fsid': status.get('fsid'),
        'health': {
            'status': health.get('status'),
            'checks': health.get('checks', {}),
            'mutes': health.get('mutes', []),
        },
    }

    if can_read(Scope.MONITOR):
        monmap = status.get('monmap', {})
        report['monmap'] = {'num_mons': monmap.get('num_mons')}

    if can_read(Scope.OSD):
        # OSD permission gates both the OSD map and the PG map sections.
        osdmap = status.get('osdmap', {})
        report['osdmap'] = {
            'in': osdmap.get('num_in_osds'),
            'up': osdmap.get('num_up_osds'),
            'num_osds': osdmap.get('num_osds'),
        }
        pgmap = status.get('pgmap', {})
        report['pgmap'] = {
            'pgs_by_state': pgmap.get('pgs_by_state', []),
            'num_pools': pgmap.get('num_pools'),
            'num_pgs': pgmap.get('num_pgs'),
            'bytes_used': pgmap.get('bytes_used'),
            'bytes_total': pgmap.get('bytes_total'),
        }

    if can_read(Scope.MANAGER):
        mgrmap = status.get('mgrmap', {})
        report['mgrmap'] = {
            # The 'available' flag is reported as one active manager, else zero.
            'num_active': 1 if mgrmap.get('available', False) else 0,
            'num_standbys': mgrmap.get('num_standbys'),
        }

    if can_read(Scope.CEPHFS):
        fsmap = status.get('fsmap', {})
        mds_states = [entry.get('status', '') for entry in fsmap.get('by_rank', [])]

        # Ranked daemons in standby-replay count as standbys; any other
        # 'up:*' state counts as active.
        replay_total = mds_states.count('up:standby-replay')
        active_total = sum(
            1 for mds_state in mds_states
            if mds_state != 'up:standby-replay' and mds_state.startswith('up:')
        )

        report['fsmap'] = {
            'num_active': active_total,
            'num_standbys': fsmap.get('up:standby', 0) + replay_total,
        }

    if can_read(Scope.RGW):
        services = status.get('servicemap', {}).get('services', {})
        rgw_daemons = services.get('rgw', {}).get('daemons', {}) or {}
        # Drop the 'summary' key before counting (presumably a bookkeeping
        # entry rather than a daemon -- confirm against the servicemap format).
        rgw_daemons.pop("summary", None)
        report['num_rgw_gateways'] = len(rgw_daemons)

    if can_read(Scope.ISCSI):
        report['num_iscsi_gateways'] = self.health_minimal.iscsi_daemons()

    if can_read(Scope.HOSTS):
        report['num_hosts'] = len(get_hosts())

    return report
0 commit comments