@@ -193,6 +193,50 @@ def _check_if_connect(self, config):
         except Exception:
             return False

+    def _check_mon_netsplit_warning(self):
+        """
+        Returns True if MON_NETSPLIT warning exists in health checks.
+        """
+        (client,) = self.ctx.cluster.only(self.CLIENT).remotes.keys()
+        arg = ['ceph', 'health', 'detail', '--format=json']
+        proc = client.run(args=arg, wait=True, stdout=StringIO(), timeout=30)
+        if proc.exitstatus != 0:
+            log.error("ceph health detail failed")
+            raise Exception("ceph health detail failed")
+        out = proc.stdout.getvalue()
+        j = json.loads(out)
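+        # Assumed (illustrative) shape of `ceph health detail --format=json`:
+        #   {"status": "HEALTH_WARN", "checks": {"MON_NETSPLIT": {...}, ...}}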
+        checks = j.get("checks", {})
+        return "MON_NETSPLIT" in checks
+
+    def _check_mon_netsplit_warning_raised(self, detail):
+        """
+        Check if the MON_NETSPLIT warning with the given detail is raised.
+        """
+        log.info("Checking if MON_NETSPLIT warning is raised with detail: {}".format(detail))
+        (client,) = self.ctx.cluster.only(self.CLIENT).remotes.keys()
+        arg = ['ceph', 'health', 'detail', '--format=json']
+        proc = client.run(args=arg, wait=True, stdout=StringIO(), timeout=30)
+        if proc.exitstatus != 0:
+            log.error("ceph health detail failed")
+            raise Exception("ceph health detail failed")
+        out = proc.stdout.getvalue()
+        j = json.loads(out)
+        # Access health checks
+        checks = j.get("checks", {})
+        netsplit = checks.get("MON_NETSPLIT", {})
+        if not netsplit:
+            log.info("MON_NETSPLIT not found in health checks")
+            return False
+
+        # Check if the expected detail is present
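+        # (each "detail" entry is assumed to be a dict of the form {"message": "<text>"})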
+        for d in netsplit.get("detail", []):
+            if detail in d.get("message", ""):
+                log.info("Found MON_NETSPLIT warning with detail: {}".format(d))
+                return True
+
+        log.info("MON_NETSPLIT found but detail does not match")
+        return False
+
     def test_netsplit_dc1_dc2(self):
         """
         Test Netsplit between dc1 and dc2
@@ -220,6 +264,13 @@ def test_netsplit_dc1_dc2(self):
             lambda: self._check_if_disconnect(config),
             timeout=self.RECOVERY_PERIOD,
         )
+        # check that the MON_NETSPLIT warning is not raised; we expect
+        # none because this is stretch mode
+        self.wait_until_true_and_hold(
+            lambda: not self._check_mon_netsplit_warning(),
+            timeout=self.RECOVERY_PERIOD,
+            success_hold_time=self.SUCCESS_HOLD_TIME
+        )
         # check the cluster is accessible
         self.wait_until_true_and_hold(
             lambda: self._reply_to_mon_command(),
@@ -263,6 +314,12 @@ def test_netsplit_dc1_dc2(self):
             lambda: self._check_if_connect(config),
             timeout=self.RECOVERY_PERIOD,
         )
+        # check if no MON_NETSPLIT warning is raised
+        self.wait_until_true_and_hold(
+            lambda: not self._check_mon_netsplit_warning(),
+            timeout=self.RECOVERY_PERIOD,
+            success_hold_time=self.SUCCESS_HOLD_TIME
+        )
         # check if all the PGs are active+clean
         self.wait_until_true_and_hold(
             lambda: self._pg_all_active_clean(),
@@ -305,6 +362,13 @@ def test_netsplit_arbiter_dc1_and_dc1_dc2(self):
             lambda: self._check_if_disconnect(dc1_dc2),
             timeout=self.RECOVERY_PERIOD,
         )
+        # check that the MON_NETSPLIT warning is not raised; we expect
+        # none because this is stretch mode
+        self.wait_until_true_and_hold(
+            lambda: not self._check_mon_netsplit_warning(),
+            timeout=self.RECOVERY_PERIOD,
+            success_hold_time=self.SUCCESS_HOLD_TIME
+        )
         # check the cluster is accessible
         self.wait_until_true_and_hold(
             lambda: self._reply_to_mon_command(),
@@ -352,6 +416,12 @@ def test_netsplit_arbiter_dc1_and_dc1_dc2(self):
             lambda: self._check_if_connect(dc1_dc2),
             timeout=self.RECOVERY_PERIOD,
         )
+        # check if the MON_NETSPLIT warning is not raised
+        self.wait_until_true_and_hold(
+            lambda: not self._check_mon_netsplit_warning(),
+            timeout=self.RECOVERY_PERIOD,
+            success_hold_time=self.SUCCESS_HOLD_TIME
+        )
         # check if all the PGs are active+clean
         self.wait_until_true_and_hold(
             lambda: self._pg_all_active_clean(),