 from textwrap import dedent

 from teuthology.exceptions import CommandFailedError
+from teuthology import contextutil
+from tasks.cephfs.filesystem import FSDamaged
 from tasks.cephfs.cephfs_test_case import CephFSTestCase, for_teuthology

 log = logging.getLogger(__name__)
@@ -84,6 +86,18 @@ def damage(self):
         pool = self._filesystem.get_metadata_pool_name()
         self._filesystem.rados(["purge", pool, '--yes-i-really-really-mean-it'])

+    def is_damaged(self):
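+        # Default damage check: poll the MDS daemons until rank 0 is reported
+        # damaged (wait_for_daemons raises FSDamaged for damaged ranks).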
+        sleep = 2
+        timeout = 120
+        with contextutil.safe_while(sleep=sleep, tries=timeout/sleep) as proceed:
+            while proceed():
+                try:
+                    self._filesystem.wait_for_daemons()
+                except FSDamaged as e:
+                    if 0 in e.ranks:
+                        return True
+        return False
+
     def flush(self):
         """
         Called after client unmount, after write: flush whatever you want
@@ -150,6 +164,90 @@ def validate(self):
         self.assert_equal(target, "symdir/onemegs")
         return self._errors

+class NestedDirWorkload(Workload):
+    """
+    Nested directories, one is lost.
+    """
+
+    def write(self):
+        self._mount.run_shell_payload("mkdir -p dir_x/dir_xx/dir_xxx/")
+        self._mount.run_shell_payload("dd if=/dev/urandom of=dir_x/dir_xx/dir_xxx/file_y conv=fsync bs=1 count=1")
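+        # Record dir_xx's cached metadata (including its inode number) so that
+        # damage() can locate its dirfrag object later.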
+        self._initial_state = self._filesystem.read_cache("dir_x/dir_xx", depth=0)
+
+    def damage(self):
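+        # A directory's dentries are stored in dirfrag objects named
+        # "<ino hex>.<frag>" in the metadata pool; removing the default
+        # fragment loses dir_xx's contents.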
+        dirfrag_obj = "{0:x}.00000000".format(self._initial_state[0]['ino'])
+        self._filesystem.radosm(["rm", dirfrag_obj])
+
+    def is_damaged(self):
+        # workload runner expects MDS to be offline
+        self._filesystem.fail()
+        return True
+
+    def validate(self):
+        self._mount.run_shell_payload("find dir_x -execdir stat {} +")
+        self._mount.run_shell_payload("stat dir_x/dir_xx/dir_xxx/file_y")
+        return self._errors
+
+class NestedDirWorkloadRename(Workload):
+    """
+    Nested directories, one is lost. With renames.
+    """
+
+    def write(self):
+        self._mount.run_shell_payload("mkdir -p dir_x/dir_xx/dir_xxx/; mkdir -p dir_y")
+        self._mount.run_shell_payload("dd if=/dev/urandom of=dir_x/dir_xx/dir_xxx/file_y conv=fsync bs=1 count=1")
+        self._initial_state = self._filesystem.read_cache("dir_x/dir_xx", depth=0)
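+        # Flush journaled metadata to the metadata pool so the pre-rename
+        # hierarchy is what ends up on disk before dir_xx is moved.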
+        self._filesystem.flush()
+        self._mount.run_shell_payload("mv dir_x/dir_xx dir_y/dir_yy; sync dir_y")
+
+    def damage(self):
+        dirfrag_obj = "{0:x}.00000000".format(self._initial_state[0]['ino'])
+        self._filesystem.radosm(["rm", dirfrag_obj])
+
+    def is_damaged(self):
+        # workload runner expects MDS to be offline
+        self._filesystem.fail()
+        return True
+
+    def validate(self):
+        self._mount.run_shell_payload("find . -execdir stat {} +")
+        self._mount.run_shell_payload("stat dir_y/dir_yy/dir_xxx/file_y")
+        return self._errors
+
+class NestedDoubleDirWorkloadRename(Workload):
+    """
+    Nested directories, two lost with backtraces to rebuild. With renames.
+    """
+
+    def write(self):
+        self._mount.run_shell_payload("mkdir -p dir_x/dir_xx/dir_xxx/; mkdir -p dir_y")
+        self._mount.run_shell_payload("dd if=/dev/urandom of=dir_x/dir_xx/dir_xxx/file_y conv=fsync bs=1 count=1")
+        self._initial_state = []
+        self._initial_state.append(self._filesystem.read_cache("dir_x/dir_xx", depth=0))
+        self._initial_state.append(self._filesystem.read_cache("dir_y", depth=0))
+        self._filesystem.flush()
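+        # Rename dir_xx under dir_y, then write file_z so its backtrace
+        # already points at the post-rename dir_yy location.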
+        self._mount.run_shell_payload("""
+            mv dir_x/dir_xx dir_y/dir_yy
+            sync dir_y
+            dd if=/dev/urandom of=dir_y/dir_yy/dir_xxx/file_z conv=fsync bs=1 count=1
+        """)
+
+    def damage(self):
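+        # Remove the dirfrag objects of both dir_xx and dir_y; recovery must
+        # rebuild their contents from backtraces.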
+        for o in self._initial_state:
+            dirfrag_obj = "{0:x}.00000000".format(o[0]['ino'])
+            self._filesystem.radosm(["rm", dirfrag_obj])
+
+    def is_damaged(self):
+        # workload runner expects MDS to be offline
+        self._filesystem.fail()
+        return True
+
+    def validate(self):
+        self._mount.run_shell_payload("find . -execdir stat {} +")
+        # during recovery we may get dir_x/dir_xx or dir_y/dir_yy, depending on RADOS PG iteration order
+        self._mount.run_shell_payload("stat dir_y/dir_yy/dir_xxx/file_y || stat dir_x/dir_xx/dir_xxx/file_y")
+        return self._errors
+

 class MovedFile(Workload):
     def write(self):
@@ -391,10 +489,6 @@ def validate(self):
 class TestDataScan(CephFSTestCase):
     MDSS_REQUIRED = 2

-    def is_marked_damaged(self, rank):
-        mds_map = self.fs.get_mds_map()
-        return rank in mds_map['damaged']
-
     def _rebuild_metadata(self, workload, workers=1, unmount=True):
         """
         That when all objects in metadata pool are removed, we can rebuild a metadata pool
@@ -425,19 +519,11 @@ def _rebuild_metadata(self, workload, workers=1, unmount=True):
         # Reset the MDS map in case multiple ranks were in play: recovery procedure
         # only understands how to rebuild metadata under rank 0
         self.fs.reset()
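+        # reset drops the file system back to a single active rank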
+        self.assertEqual(self.fs.get_var('max_mds'), 1)

         self.fs.set_joinable()  # redundant with reset

-        def get_state(mds_id):
-            info = self.mds_cluster.get_mds_info(mds_id)
-            return info['state'] if info is not None else None
-
-        self.wait_until_true(lambda: self.is_marked_damaged(0), 60)
-        for mds_id in self.fs.mds_ids:
-            self.wait_until_equal(
-                lambda: get_state(mds_id),
-                "up:standby",
-                timeout=60)
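+        # each workload decides how to detect (or force) the damaged state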
+        self.assertTrue(workload.is_damaged())

         self.fs.table_tool([self.fs.name + ":0", "reset", "session"])
         self.fs.table_tool([self.fs.name + ":0", "reset", "snap"])
@@ -450,7 +536,7 @@ def get_state(mds_id):
                 self.fs.journal_tool(["journal", "reset", "--yes-i-really-really-mean-it"], 0)

         self.fs.journal_tool(["journal", "reset", "--force", "--yes-i-really-really-mean-it"], 0)
-        self.fs.data_scan(["init"])
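+        # --force-init recreates the root inodes even if objects already exist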
+        self.fs.data_scan(["init", "--force-init"])
         self.fs.data_scan(["scan_extents"], worker_count=workers)
         self.fs.data_scan(["scan_inodes"], worker_count=workers)
         self.fs.data_scan(["scan_links"])
@@ -461,6 +547,7 @@ def get_state(mds_id):
         self.run_ceph_cmd('mds', 'repaired', '0')

         # Start the MDS
+        self.fs.set_joinable()  # necessary for some tests without damage
         self.fs.wait_for_daemons()
         log.info(str(self.mds_cluster.status()))

@@ -489,6 +576,15 @@ def test_rebuild_simple(self):
     def test_rebuild_symlink(self):
         self._rebuild_metadata(SymlinkWorkload(self.fs, self.mount_a))

+    def test_rebuild_nested(self):
+        self._rebuild_metadata(NestedDirWorkload(self.fs, self.mount_a))
+
+    def test_rebuild_nested_rename(self):
+        self._rebuild_metadata(NestedDirWorkloadRename(self.fs, self.mount_a))
+
+    def test_rebuild_nested_double_rename(self):
+        self._rebuild_metadata(NestedDoubleDirWorkloadRename(self.fs, self.mount_a))
+
     def test_rebuild_moved_file(self):
         self._rebuild_metadata(MovedFile(self.fs, self.mount_a))
