@@ -697,8 +697,66 @@ def test_quiesce_path_splitauth(self):
697697 op = self .fs .rank_tell (["quiesce" , "path" , self .subvolume , '--await' ], rank = 0 , check_status = False )['op' ]
698698 self .assertEqual (op ['result' ], - 1 ) # EPERM
699699
700- @unittest .skip ("https://tracker.ceph.com/issues/66152" )
701- def test_quiesce_drops_remote_authpins_on_failure (self ):
def test_quiesce_drops_remote_authpins_when_done(self):
    """
    That a quiesce operation drops remote authpins after marking the node as quiesced

    It's important that a remote quiesce doesn't stall freezing ops on the auth

    Scenario:
      1. pin0 is pinned to rank 0 and pin1 to rank 1; the patient file
         lives under pin0.
      2. An exclusive policy lock is taken on the patient at rank 0.
      3. A quiesce of the patient is started at rank 1 and is expected
         to stay pending while the policy lock is held.
      4. The op holding the policy lock is killed; the quiesce is then
         expected to report result 0.
      5. An `--ap-freeze` lock request is issued at rank 1 with a 5s
         lifetime; it must go through while the quiesce op is still active.
    """
    self._configure_subvolume()

    # create two dirs for pinning
    self.mount_a.run_shell_payload("mkdir -p pin0 pin1")
    # enable export by populating the directories
    self.mount_a.run_shell_payload("touch pin0/export_dummy pin1/export_dummy")
    # pin the directories to different ranks
    self.mount_a.setfattr("pin0", "ceph.dir.pin", "0")
    self.mount_a.setfattr("pin1", "ceph.dir.pin", "1")

    # prepare the patient at rank 0
    self.mount_a.write_file("pin0/thefile", "I'm ready, doc")

    # wait for the export to settle
    self._wait_subtrees([(f"{self.mntpnt}/pin0", 0), (f"{self.mntpnt}/pin1", 1)])

    def reqid(cmd):
        """Extract the request id string from the JSON output of a tell command.

        The 'reqid' key is looked up after unwrapping the optional
        'type_data' and 'op' envelopes, so the same helper works for
        `op get`, `quiesce path` and `lock path` replies.
        """
        reply = json.loads(cmd.stdout.getvalue())
        reply = reply.get('type_data', reply)  # for op get
        reply = reply.get('op', reply)  # for quiesce path
        # lock path returns the op directly
        return self._reqid_tostr(reply['reqid'])

    def assertQuiesceOpDone(expected_done, quiesce_op, rank):
        """Assert the `op get` result of quiesce_op on the given rank.

        A finished op is expected to report result 0, a pending one None.
        """
        cmd = self.fs.run_ceph_cmd(f"tell mds.{self.fs.name}:{rank} op get {quiesce_op}", stdout=StringIO())

        J = json.loads(cmd.stdout.getvalue())
        self.assertEqual(J['type_data']['result'], 0 if expected_done else None)

    # Take the policy lock on the auth to cause a quiesce operation to request the remote authpin
    # This is needed to cause the next command to block
    cmd = self.fs.run_ceph_cmd(f"tell mds.{self.fs.name}:0 lock path {self.mntpnt}/pin0/thefile policy:x --await", stdout=StringIO())
    policy_block_op = reqid(cmd)

    # Try quiescing on the replica. This should block for the policy lock
    # As a side effect, it should take the remote authpin
    cmd = self.fs.run_ceph_cmd(f"tell mds.{self.fs.name}:1 quiesce path {self.mntpnt}/pin0/thefile", stdout=StringIO())
    quiesce_op = reqid(cmd)

    # verify the quiesce is pending
    assertQuiesceOpDone(False, quiesce_op, rank=1)

    # kill the op that holds the policy lock exclusively and verify the quiesce succeeds
    self.fs.kill_op(policy_block_op, rank=0)
    assertQuiesceOpDone(True, quiesce_op, rank=1)

    # If all is good, the ap-freeze operation below should succeed
    # despite the quiesce_op that's still active.
    # We payload this with some lock that we know shouldn't block
    # The call below will block on freezing if the quiesce failed to release
    # remote authpins, and after the lifetime elapses will return ECANCELED
    # NOTE(review): presumably run_ceph_cmd raises on a non-zero exit status,
    # which is what fails the test in that case -- confirm. The return value
    # is not needed, so it is intentionally not bound.
    self.fs.run_ceph_cmd(f"tell mds.{self.fs.name}:1 lock path {self.mntpnt}/pin0/thefile policy:r --ap-freeze --await --lifetime 5")
758+
759+ def test_request_drops_remote_authpins_when_waiting_for_quiescelock (self ):
702760 """
703761 That remote authpins are dropped when the request fails to acquire the quiesce lock
704762
0 commit comments