Skip to content

Commit 010acce

Browse files
committed
libvirt: call get_capabilities() with all CPUs online
While we do cache the hosts's capabilities in self._caps in the libvirt Host object, if we happen to fist call get_capabilities() with some of our dedicated CPUs offline, libvirt erroneously reports them as being on socket 0 regardless of their real socket. We would then cache that topology, thus breaking pretty much all of our NUMA accounting. To fix this, this patch makes sure to call get_capabilities() immediately upon host init, and to power up all our dedicated CPUs before doing so. That way, we cache their real socket ID. For testing, because we don't really want to implement a libvirt bug in our Python libvirt fixture, we make due with a simple unit tests that asserts that init_host() has powered on the correct CPUs. Closes-bug: 2077228 Change-Id: I9a2a7614313297f11a55d99fb94916d3583a9504 (cherry picked from commit 79d1f06) (cherry picked from commit 294444b) (cherry picked from commit 6448e38)
1 parent 6c3d72f commit 010acce

File tree

3 files changed

+34
-5
lines changed

3 files changed

+34
-5
lines changed

nova/tests/unit/virt/libvirt/test_driver.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -894,6 +894,16 @@ def test_driver_capabilities_secure_boot(self, mock_supports):
894894
)
895895
mock_supports.assert_called_once_with()
896896

897+
@mock.patch.object(hardware, 'get_cpu_dedicated_set',
898+
return_value=set([0, 42, 1337]))
899+
@mock.patch.object(libvirt_driver.LibvirtDriver,
900+
'_register_all_undefined_instance_details')
901+
def test_init_host_topology(self, mock_get_cpu_dedicated_set, _):
902+
driver = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False)
903+
with mock.patch.object(driver.cpu_api, 'power_up') as mock_power_up:
904+
driver.init_host('goat')
905+
mock_power_up.assert_called_with(set([0, 42, 1337]))
906+
897907
@mock.patch.object(
898908
libvirt_driver.LibvirtDriver,
899909
'_register_all_undefined_instance_details',

nova/virt/libvirt/cpu/api.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,7 @@ def core(self, i):
9191
"""
9292
return Core(i)
9393

94-
def _power_up(self, cpus: ty.Set[int]) -> None:
94+
def power_up(self, cpus: ty.Set[int]) -> None:
9595
if not CONF.libvirt.cpu_power_management:
9696
return
9797
cpu_dedicated_set = hardware.get_cpu_dedicated_set_nozero() or set()
@@ -111,7 +111,7 @@ def power_up_for_instance(self, instance: objects.Instance) -> None:
111111
return
112112
pcpus = instance.numa_topology.cpu_pinning.union(
113113
instance.numa_topology.cpuset_reserved)
114-
self._power_up(pcpus)
114+
self.power_up(pcpus)
115115

116116
def power_up_for_migration(
117117
self, dst_numa_info: objects.LibvirtLiveMigrateNUMAInfo
@@ -121,7 +121,7 @@ def power_up_for_migration(
121121
pcpus = dst_numa_info.emulator_pins
122122
for pins in dst_numa_info.cpu_pins.values():
123123
pcpus = pcpus.union(pins)
124-
self._power_up(pcpus)
124+
self.power_up(pcpus)
125125

126126
def _power_down(self, cpus: ty.Set[int]) -> None:
127127
if not CONF.libvirt.cpu_power_management:

nova/virt/libvirt/driver.py

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -735,12 +735,31 @@ def _handle_conn_event(self, enabled, reason):
735735
{'enabled': enabled, 'reason': reason})
736736
self._set_host_enabled(enabled, reason)
737737

738+
def _init_host_topology(self):
739+
"""To work around a bug in libvirt that reports offline CPUs as always
740+
being on socket 0 regardless of their real socket, power up all
741+
dedicated CPUs (the only ones whose socket we actually care about),
742+
then call get_capabilities() to initialize the topology with the
743+
correct socket values. get_capabilities()'s implementation will reuse
744+
these initial socket value, and avoid clobbering them with 0 for
745+
offline CPUs.
746+
"""
747+
cpus = hardware.get_cpu_dedicated_set()
748+
if cpus:
749+
self.cpu_api.power_up(cpus)
750+
self._host.get_capabilities()
751+
738752
def init_host(self, host):
739753
self._host.initialize()
740754

741-
self._update_host_specific_capabilities()
742-
755+
# NOTE(artom) Do this first to make sure our first call to
756+
# get_capabilities() happens with all dedicated CPUs online and caches
757+
# their correct socket ID. Unused dedicated CPUs will be powered down
758+
# further down in this method.
743759
self._check_cpu_set_configuration()
760+
self._init_host_topology()
761+
762+
self._update_host_specific_capabilities()
744763

745764
self._do_quality_warnings()
746765

0 commit comments

Comments
 (0)