mgr/cephadm: additional debug logging for autotuner

adk3798 · adk3798 · commit a3f69e45cd80 · 2024-04-10T14:01:02.000-04:00
This came from trying to debug autotuner behavior that was
reported on the upstream mailing list. The test case being
added reproduces what that user was seeing. The debug logging
being added was useful in getting a full understanding of how
the autotuner got the result it did. Therefore, add the
logging to the actual codebase so we can make use of it to
debug autotuner issues in the future.

Signed-off-by: Adam King <adking@redhat.com>
diff --git a/src/pybind/mgr/cephadm/autotune.py b/src/pybind/mgr/cephadm/autotune.py
@@ -32,24 +32,38 @@ def __init__(
     def tune(self) -> Tuple[Optional[int], List[str]]:
         tuned_osds: List[str] = []
         total = self.total_mem
+        logger.debug('Autotuning OSD memory with given parameters:\n'
+                     f'Total memory: {total}\nDaemons: {self.daemons}')
         for d in self.daemons:
             if d.daemon_type == 'mds':
-                total -= self.config_get(d.name(), 'mds_cache_memory_limit')
+                mds_mem = self.config_get(d.name(), 'mds_cache_memory_limit')
+                logger.debug(f'Subtracting {mds_mem} from total for mds daemon')
+                total -= mds_mem
+                logger.debug(f'new total: {total}')
                 continue
             if d.daemon_type != 'osd':
                 assert d.daemon_type
-                total -= max(
+                daemon_mem = max(
                     self.min_size_by_type.get(d.daemon_type, self.default_size),
                     d.memory_usage or 0
                 )
+                logger.debug(f'Subtracting {daemon_mem} from total for {d.daemon_type} daemon')
+                total -= daemon_mem
+                logger.debug(f'new total: {total}')
                 continue
             if not self.config_get(d.name(), 'osd_memory_target_autotune'):
-                total -= self.config_get(d.name(), 'osd_memory_target')
+                osd_mem = self.config_get(d.name(), 'osd_memory_target')
+                logger.debug('osd_memory_target_autotune disabled. '
+                             f'Subtracting {osd_mem} from total for osd daemon')
+                total -= osd_mem
+                logger.debug(f'new total: {total}')
                 continue
             tuned_osds.append(d.name())
         if total < 0:
             return None, []
         if not tuned_osds:
             return None, []
+        logger.debug(f'Final total is {total} to be split among {len(tuned_osds)} OSDs')
         per = total // len(tuned_osds)
+        logger.debug(f'Result is {per} per OSD')
         return int(per), tuned_osds
diff --git a/src/pybind/mgr/cephadm/serve.py b/src/pybind/mgr/cephadm/serve.py
@@ -195,6 +195,9 @@ def _autotune_host_memory(self, host: str) -> None:
             val = None
         else:
             total_mem *= 1024   # kb -> bytes
+            self.log.debug(f'Autotuning memory for host {host} with '
+                           f'{total_mem} total bytes of memory and '
+                           f'{self.mgr.autotune_memory_target_ratio} target ratio')
             total_mem *= self.mgr.autotune_memory_target_ratio
             a = MemoryAutotuner(
                 daemons=self.mgr.cache.get_daemons_by_host(host),
@@ -231,6 +234,9 @@ def _autotune_host_memory(self, host: str) -> None:
             # options as users may be using them. Since there is no way to set autotuning
             # on/off at a host level, best we can do is check if it is globally on.
             if self.mgr.get_foreign_ceph_option('osd', 'osd_memory_target_autotune'):
+                self.mgr.log.debug(f'Removing osd_memory_target for OSDs on {host}'
+                                   ' as either there were no OSDs to tune or the '
+                                   ' per OSD memory calculation result was <= 0')
                 self.mgr.check_mon_command({
                     'prefix': 'config rm',
                     'who': f'osd/host:{host.split(".")[0]}',
diff --git a/src/pybind/mgr/cephadm/tests/test_autotune.py b/src/pybind/mgr/cephadm/tests/test_autotune.py
@@ -57,7 +57,31 @@
             ],
             {},
             60 * 1024 * 1024 * 1024,
-        )
+        ),
+        (  # Taken from an actual user case
+            int(32827840 * 1024 * 0.7),
+            [
+                DaemonDescription('crash', 'a', 'host1'),
+                DaemonDescription('grafana', 'a', 'host1'),
+                DaemonDescription('mds', 'a', 'host1'),
+                DaemonDescription('mds', 'b', 'host1'),
+                DaemonDescription('mds', 'c', 'host1'),
+                DaemonDescription('mgr', 'a', 'host1'),
+                DaemonDescription('mon', 'a', 'host1'),
+                DaemonDescription('node-exporter', 'a', 'host1'),
+                DaemonDescription('osd', '1', 'host1'),
+                DaemonDescription('osd', '2', 'host1'),
+                DaemonDescription('osd', '3', 'host1'),
+                DaemonDescription('osd', '4', 'host1'),
+                DaemonDescription('prometheus', 'a', 'host1'),
+            ],
+            {
+                'mds.a': 4 * 1024 * 1024 * 1024,  # 4294967296
+                'mds.b': 4 * 1024 * 1024 * 1024,
+                'mds.c': 4 * 1024 * 1024 * 1024,
+            },
+            480485376,
+        ),
     ])
 def test_autotune(total, daemons, config, result):
     def fake_getter(who, opt):
@@ -69,6 +93,8 @@ def fake_getter(who, opt):
         if opt == 'osd_memory_target':
             return config.get(who, 4 * 1024 * 1024 * 1024)
         if opt == 'mds_cache_memory_limit':
+            if who in config:
+                return config.get(who, 16 * 1024 * 1024 * 1024)
             return 16 * 1024 * 1024 * 1024
 
     a = MemoryAutotuner(