
Commit 100415d (2 parents: 16126b1 + 8547e57)

Merge PR ceph#64663 into main

* refs/pull/64663/head:

Reviewed-by: Venky Shankar <[email protected]>
Reviewed-by: Patrick Donnelly <[email protected]>

7 files changed, +139 -8 lines

qa/cephfs/clusters/1-node-1-mds-1-osd.yaml renamed to qa/cephfs/clusters/1-node-4-mds-1-osd.yaml

Lines changed: 1 addition & 1 deletion
@@ -1,5 +1,5 @@
 roles:
-- [mon.a, mgr.x, mds.a, osd.0, client.0]
+- [mon.a, mgr.x, mds.a, mds.b, mds.c, mds.d, osd.0, client.0]
 openstack:
 - volumes: # attached to each instance
     count: 1

qa/suites/fs/full/clusters/1-node-1-mds-1-osd.yaml

Lines changed: 0 additions & 1 deletion
This file was deleted.

qa/suites/fs/full/clusters/1-node-4-mds-1-osd.yaml (new file)

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+.qa/cephfs/clusters/1-node-4-mds-1-osd.yaml

qa/suites/fs/full/overrides.yaml

Lines changed: 1 addition & 0 deletions
@@ -17,3 +17,4 @@ overrides:
       - OSD_OUT_OF_ORDER_FULL
       - OSD_NEARFULL
       - OSD_FULL
+      - MGR_DOWN
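
MGR_DOWN is ignorelisted here because the new workunit (added later in this commit) repeatedly fails the active mgr to invalidate its readdir cache, so transient MGR_DOWN health warnings are expected while the test runs. As a hedged illustration only, on a disposable test cluster with a single mgr named x (matching the mgr.x role in the cluster yaml above), the warning can be observed like this:

    # illustration only; run against a throwaway test cluster, never production
    ceph mgr fail x                              # the same command the workunit's restart_mgr uses
    ceph health detail | grep MGR_DOWN || true   # transient warning until a mgr is active again
    ceph status | grep mgr                       # shows the mgr coming back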

qa/suites/fs/full/tasks/mgr-osd-full.yaml

Lines changed: 5 additions & 0 deletions
@@ -29,3 +29,8 @@ tasks:
     clients:
       client.0:
         - fs/full/subvolume_snapshot_rm.sh
+- workunit:
+    cleanup: true
+    clients:
+      client.0:
+        - fs/full/subvolume_ls.sh
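
For orientation: the workunit task runs each listed script from the qa/workunits/ tree on the named client, with that client's CephFS mount exported to the script as CEPH_MNT (which is why the new script below refers to $CEPH_MNT). A rough local equivalent, assuming a test cluster with CephFS mounted at /mnt/cephfs and a local checkout of the ceph repository (both assumptions, not part of this change), would be:

    # minimal local sketch, not the teuthology workunit task itself
    export CEPH_MNT=/mnt/cephfs    # assumption: where the test CephFS is mounted
    cd ceph/qa/workunits           # assumption: path inside a local checkout of the ceph repo
    bash fs/full/subvolume_ls.sh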

qa/workunits/fs/full/subvolume_ls.sh (new file)

Lines changed: 119 additions & 0 deletions
@@ -0,0 +1,119 @@
#!/usr/bin/env bash
set -ex

# This testcase tests the scenario of the 'ceph fs subvolume ls' mgr command
# when the osd is full. The command used to miss out a few subvolumes in the
# list. The issue happens in a multi-mds active setup. Please see the tracker
# https://tracker.ceph.com/issues/72260

# The suite sets the 'bluestore block size' to 2GiB, so the osd is 2GiB in
# size. 25 subvolumes are created and a 1GB file is written at the root.
# The full-ratios are set such that the osd is treated as full with less than
# 500MB of data. The subvolumes are then listed 20 times, with an mgr failover
# before each listing (to invalidate the readdir cache), and the count is
# validated each time.

SUBVOL_CNT=25

expect_failure() {
  if "$@"; then return 1; else return 0; fi
}

validate_subvol_cnt() {
  if [ $1 -eq $SUBVOL_CNT ]; then return 0; else return 1; fi
}

restart_mgr() {
  ceph mgr fail x
  timeout=30
  while [ $timeout -gt 0 ]
  do
    active_mgr_cnt=$(ceph status | grep mgr | grep active | grep -v no | wc -l)
    if [ $active_mgr_cnt -eq 1 ]; then break; fi
    echo "Waiting for mgr to be active after failover: $timeout"
    sleep 1
    let "timeout-=1"
  done
}

# Set client_use_random_mds
ceph config set client client_use_random_mds true

# Set max_mds to 3
ceph fs set cephfs max_mds 3
timeout=30
while [ $timeout -gt 0 ]
do
  active_cnt=$(ceph fs status | grep active | wc -l)
  if [ $active_cnt -eq 2 ]; then break; fi
  echo "Waiting for the active mds count to be 2: $timeout"
  sleep 1
  let "timeout-=1"
done

# Create subvolumes
for i in $(seq 1 $SUBVOL_CNT); do ceph fs subvolume create cephfs sub_$i; done

# For debugging
echo "Before write"
df -h
ceph osd df

sudo dd if=/dev/urandom of=$CEPH_MNT/1GB_file-1 status=progress bs=1M count=1000

# The suite (qa/suites/fs/full/tasks/mgr-osd-full.yaml) sets the 'bluestore block size'
# to 2GiB, so the osd is 2GiB in size. The full-ratios set below make sure that the
# osd is treated as full with less than 500MB of data.
ceph osd set-full-ratio 0.2
ceph osd set-nearfull-ratio 0.16
ceph osd set-backfillfull-ratio 0.18

timeout=30
while [ $timeout -gt 0 ]
do
  health=$(ceph health detail)
  [[ $health = *"OSD_FULL"* ]] && echo "OSD is full" && break
  echo "Waiting for osd to be full: $timeout"
  sleep 1
  let "timeout-=1"
done

# For debugging
echo "After ratio set"
df -h
ceph osd df

# Clear the readdir cache by failing over the mgr, which forces a new libcephfs connection.
# Validate subvolume ls 20 times.
for i in {1..20};
do
  restart_mgr
  # List and validate the subvolume count
  subvol_cnt=$(ceph fs subvolume ls cephfs --format=json-pretty | grep sub_ | wc -l)
  validate_subvol_cnt $subvol_cnt
done

# Delete all subvolumes
for i in $(seq 1 $SUBVOL_CNT); do ceph fs subvolume rm cephfs sub_$i; done

# Wait for the subvolume data to be deleted
trashdir=$CEPH_MNT/volumes/_deleting
timeout=30
while [ $timeout -gt 0 ]
do
  [ -z "$(sudo ls -A $trashdir)" ] && echo "Trash directory $trashdir is empty" && break
  echo "Waiting for trash dir to be empty: $timeout"
  sleep 1
  let "timeout-=1"
done

sudo rm -f $CEPH_MNT/1GB_file-1

# Set the ratios back for other full tests to run
ceph osd set-full-ratio 0.95
ceph osd set-nearfull-ratio 0.95
ceph osd set-backfillfull-ratio 0.95

# After test
echo "After test"
df -h
ceph osd df

echo OK
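
To make the ratio arithmetic in the script's comments concrete: with the 2GiB bluestore block size from the suite and the full-ratio of 0.2 the script sets, the OSD reports full at roughly 410MiB, well under the ~1000MiB the dd writes. A small standalone sketch of that math (plain shell arithmetic, no Ceph commands involved):

    # minimal sketch of the threshold math the test relies on
    osd_bytes=$((2 * 1024 * 1024 * 1024))        # 2GiB 'bluestore block size' -> OSD capacity
    full_pct=20                                  # full-ratio 0.2, expressed in percent for integer math
    full_bytes=$((osd_bytes * full_pct / 100))
    echo "OSD_FULL trips at about $((full_bytes / 1024 / 1024)) MiB"   # ~409 MiB, i.e. under 500MB
    write_bytes=$((1000 * 1024 * 1024))          # the dd above writes ~1000 MiB
    [ "$write_bytes" -gt "$full_bytes" ] && echo "the 1GB write is comfortably past the full threshold"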

src/mds/Server.cc

Lines changed: 12 additions & 6 deletions
@@ -2783,11 +2783,6 @@ void Server::dispatch_client_request(const MDRequestRef& mdr)
   }

   if (is_full) {
-    CInode *cur = try_get_auth_inode(mdr, req->get_filepath().get_ino());
-    if (!cur) {
-      // the request is already responded to
-      return;
-    }
     if (req->get_op() == CEPH_MDS_OP_SETLAYOUT ||
         req->get_op() == CEPH_MDS_OP_SETDIRLAYOUT ||
         req->get_op() == CEPH_MDS_OP_SETLAYOUT ||
@@ -2800,7 +2795,18 @@ void Server::dispatch_client_request(const MDRequestRef& mdr)
          req->get_op() == CEPH_MDS_OP_RENAME) &&
         (!mdr->has_more() || mdr->more()->witnessed.empty())) // haven't started peer request
        ) {
-
+      /*
+       * The inode fetch below is specific to the operations above and the inode is
+       * expected to be in memory, as these operations are likely preceded by a lookup.
+       * Doing this generically outside the condition was incorrect: ops like getattr
+       * might not have the inode in memory (e.g. on a non-auth mds) and would fail
+       * with ESTALE, confusing the client instead of being forwarded to the auth mds.
+       */
+      CInode *cur = try_get_auth_inode(mdr, req->get_filepath().get_ino());
+      if (!cur) {
+        // the request is already responded to
+        return;
+      }
       if (check_access(mdr, cur, MAY_FULL)) {
         dout(20) << __func__ << ": full, has FULL caps, permitting op " << ceph_mds_op_name(req->get_op()) << dendl;
       } else {
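
As a rough client-side sanity check of what this change is about: since the auth-inode fetch now happens only for the mutating ops listed in the condition, a plain metadata read should keep working once the cluster is full instead of surfacing an error. A hedged sketch, assuming $CEPH_MNT points at a mounted CephFS on a cluster already driven to OSD_FULL (for example by the workunit above):

    # minimal sketch, assuming a mounted CephFS and a cluster already at OSD_FULL
    ceph health detail | grep -q OSD_FULL || echo "note: cluster is not actually full"
    # a plain getattr-style operation should not error out just because the OSD is full
    stat "$CEPH_MNT"/ > /dev/null && echo "getattr on a full cluster succeeded as expected"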
