@@ -3,6 +3,7 @@
 import logging
 import errno
 import re
+import time
 from teuthology.contextutil import MaxWhileTries
 from teuthology.exceptions import CommandFailedError
 from teuthology.orchestra.run import wait
@@ -562,3 +563,99 @@ def test_open_ino_errors(self):
         self.fs.mon_manager.raw_cluster_cmd(
             'tell', 'mds.{0}'.format(self.fs.get_active_names()[0]),
             "damage", "rm", str(entry['id']))
+
+    def test_dentry_first_existing(self):
+        """
+        That the MDS won't abort when the dentry is already known to be damaged.
+        """
+
+        def verify_corrupt():
+            # dump rank 0's cache for /a and check that the renamed dentry kept the corrupt first value
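+            # (the injected value is asserted below as snap_first)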
+            info = self.fs.read_cache("/a", 0)
+            log.debug('%s', info)
+            self.assertEqual(len(info), 1)
+            dirfrags = info[0]['dirfrags']
+            self.assertEqual(len(dirfrags), 1)
+            dentries = dirfrags[0]['dentries']
+            self.assertEqual([dn['path'] for dn in dentries if dn['is_primary']], ['a/c'])
+            self.assertEqual(dentries[0]['snap_first'], 18446744073709551606) # SNAP_HEAD
+
+        self.mount_a.run_shell_payload("mkdir -p a/b")
+        self.fs.flush()
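+        # disable the abort so the MDS survives creating the damage, then inject corruption of CDentry::first on rename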
+        self.config_set("mds", "mds_abort_on_newly_corrupt_dentry", False)
+        self.config_set("mds", "mds_inject_rename_corrupt_dentry_first", "1.0")
+        time.sleep(5) # for conf to percolate
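+        # the rename writes a dentry with a corrupt first; with the abort disabled the rank stays up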
+        self.mount_a.run_shell_payload("mv a/b a/c; sync .")
+        self.mount_a.umount()
+        verify_corrupt()
+        self.fs.fail()
+        self.config_rm("mds", "mds_inject_rename_corrupt_dentry_first")
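+        # restart with the injection removed and the abort re-enabled: damage that already exists on disk should not trip it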
+        self.config_set("mds", "mds_abort_on_newly_corrupt_dentry", True)
+        self.fs.set_joinable()
+        status = self.fs.status()
+        self.fs.flush()
+        self.assertFalse(self.fs.status().hadfailover(status))
+        verify_corrupt()
+
+    def test_dentry_first_preflush(self):
+        """
+        That the MDS won't write a dentry whose CDentry::first is newly
+        damaged to the journal.
+        """
+
+        rank0 = self.fs.get_rank()
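+        # freeze rank 0 so the mons do not replace it when the injected abort kills the daemon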
+        self.fs.rank_freeze(True, rank=0)
+        self.mount_a.run_shell_payload("mkdir -p a/{b,c}/d")
+        self.fs.flush()
+        self.config_set("mds", "mds_inject_rename_corrupt_dentry_first", "1.0")
+        time.sleep(5) # for conf to percolate
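+        # the rename should trip the injection and abort the MDS before the corrupt dentry is journaled, leaving the rank laggy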
+        p = self.mount_a.run_shell_payload("timeout 60 mv a/b a/z", wait=False)
+        self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(), timeout=self.fs.beacon_timeout)
+        self.config_rm("mds", "mds_inject_rename_corrupt_dentry_first")
+        self.fs.rank_freeze(False, rank=0)
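+        # clean up the expected coredump, restart the rank, and verify the tree is still intact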
+        self.delete_mds_coredump(rank0['name'])
+        self.fs.mds_restart(rank0['name'])
+        self.fs.wait_for_daemons()
+        p.wait()
+        self.mount_a.run_shell_payload("stat a/ && find a/")
+        self.fs.flush()
+
+    def test_dentry_first_precommit(self):
+        """
+        That the MDS won't write a dentry whose CDentry::first is newly
+        damaged to the directory object.
+        """
+
+        fscid = self.fs.id
+        self.mount_a.run_shell_payload("mkdir -p a/{b,c}/d; sync .")
+        self.mount_a.umount() # allow immediate scatter write back
+        self.fs.flush()
+        # now just twiddle some inode metadata on a directory
+        self.mount_a.mount_wait()
+        self.mount_a.run_shell_payload("chmod 711 a/b/d; sync .")
+        self.mount_a.umount() # avoid journaling session related things
+        # okay, now cause the dentry to get damaged after loading from the journal
+        self.fs.fail()
+        self.config_set("mds", "mds_inject_journal_corrupt_dentry_first", "1.0")
+        time.sleep(5) # for conf to percolate
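+        # on restart, journal replay loads the dentry and the injection corrupts its CDentry::first in cache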
+        self.fs.set_joinable()
+        self.fs.wait_for_daemons()
+        rank0 = self.fs.get_rank()
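+        # freeze rank 0 so the expected crash does not trigger a failover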
+        self.fs.rank_freeze(True, rank=0)
+        # so now we want to trigger commit but this will crash, so:
+        c = ['--connect-timeout=60', 'tell', f"mds.{fscid}:0", "flush", "journal"]
+        p = self.ceph_cluster.mon_manager.run_cluster_cmd(args=c, wait=False, timeoutcmd=30)
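+        # the flush forces a dirfrag commit; the MDS should detect the corrupt first before writing it and abort, so the rank goes laggy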
+        self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(), timeout=self.fs.beacon_timeout)
+        self.config_rm("mds", "mds_inject_journal_corrupt_dentry_first")
+        self.fs.rank_freeze(False, rank=0)
+        self.delete_mds_coredump(rank0['name'])
+        self.fs.mds_restart(rank0['name'])
+        self.fs.wait_for_daemons()
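+        # the tell should have failed because the target MDS aborted mid-flush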
+        try:
+            p.wait()
+        except CommandFailedError as e:
+            print(e)
+        else:
+            self.fail("flush journal should fail!")
+        self.mount_a.mount_wait()
+        self.mount_a.run_shell_payload("stat a/ && find a/")
+        self.fs.flush()