Skip to content

Commit 0b1fa9b

Browse files
author
Balazs Gibizer
committed
Reproduce bug 1944759
Add functional tests to reproduce the race between resize_instance() and update_available_resources(). Related-Bug: #1944759 Change-Id: Icb7e3379248fe00f9a94f9860181b5de44902379 (cherry picked from commit 3e4e448) (cherry picked from commit e6c6880) (cherry picked from commit 140ae45)
1 parent 7f00f7b commit 0b1fa9b

File tree

1 file changed

+121
-0
lines changed

1 file changed

+121
-0
lines changed

nova/tests/functional/libvirt/test_numa_servers.py

Lines changed: 121 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -711,6 +711,127 @@ def fake_confirm_migration(*args, **kwargs):
711711

712712
server = self._wait_for_state_change(server, 'ACTIVE')
713713

714+
def _assert_pinned_cpus(self, hostname, expected_number_of_pinned):
    """Assert the number of pinned CPUs on the first NUMA cell of a host.

    Looks up the compute node record named ``hostname``, converts its
    ``numa_topology`` field with ``NUMATopology.obj_from_db_obj`` and
    compares the size of cell 0's ``pinned_cpus`` with
    ``expected_number_of_pinned``.
    """
    compute_node = objects.ComputeNode.get_by_nodename(self.ctxt, hostname)
    topology = objects.NUMATopology.obj_from_db_obj(
        compute_node.numa_topology)
    # Only the first cell is inspected by this helper.
    first_cell = topology.cells[0]
    self.assertEqual(
        expected_number_of_pinned, len(first_cell.pinned_cpus))
722+
723+
def _create_server_and_resize_bug_1944759(self):
    """Boot a pinned server and resize it while a periodic job interleaves.

    Starts two computes, boots a server whose flavor uses the dedicated
    CPU policy and resizes it with the ``finish_resize`` RPC call stubbed
    so that the periodic jobs are run right before the real call. This is
    the interleaving that reproduces bug 1944759.

    :returns: a ``(server, src_host, dst_host)`` tuple; ``server`` is the
        value returned by waiting for the ``VERIFY_RESIZE`` state.
    """
    self.flags(
        cpu_dedicated_set='0-3', cpu_shared_set='4-7', group='compute')
    self.flags(vcpu_pin_set=None)

    # Bring up the two compute services the resize moves between.
    for compute_name in ('test_compute0', 'test_compute1'):
        self.start_compute(hostname=compute_name)

    old_flavor_id = self._create_flavor(
        vcpu=2, extra_spec={'hw:cpu_policy': 'dedicated'})
    server = self._create_server(flavor_id=old_flavor_id)

    source = server['OS-EXT-SRV-ATTR:host']
    self._assert_pinned_cpus(source, 2)

    # The target flavor is irrelevant as long as the original flavor
    # uses pinning, so an equivalent flavor is used for simplicity.
    new_flavor_id = self._create_flavor(
        vcpu=2, extra_spec={'hw:cpu_policy': 'dedicated'})

    real_finish_resize = nova.compute.rpcapi.ComputeAPI.finish_resize

    # Make the finish_resize call overlap with an
    # update_available_resource periodic job.
    def finish_resize_after_periodics(*args, **kwargs):
        self._run_periodics()
        return real_finish_resize(*args, **kwargs)

    self.stub_out(
        'nova.compute.rpcapi.ComputeAPI.finish_resize',
        finish_resize_after_periodics,
    )

    resize_request = {'resize': {'flavorRef': new_flavor_id}}
    patch_target = (
        'nova.virt.libvirt.driver.LibvirtDriver'
        '.migrate_disk_and_power_off')

    # TODO(stephenfin): The mock of 'migrate_disk_and_power_off' should
    # probably be less...dumb
    with mock.patch(patch_target, return_value='{}'):
        self.api.post_server_action(server['id'], resize_request)
        server = self._wait_for_state_change(server, 'VERIFY_RESIZE')

    destination = server['OS-EXT-SRV-ATTR:host']

    # This is a resource accounting bug: 2 CPUs should be pinned on both
    # computes, on the source because of the outbound migration and on
    # the destination because the instance is running there.
    self._assert_pinned_cpus(source, 0)
    self._assert_pinned_cpus(destination, 2)

    return server, source, destination
776+
777+
def test_resize_confirm_bug_1944759(self):
    """Confirming the resize fails after the racy resize of bug 1944759."""
    server, source, destination = (
        self._create_server_and_resize_bug_1944759())

    def assert_instance_accounted_on_dest_only():
        self._assert_pinned_cpus(source, 0)
        self._assert_pinned_cpus(destination, 2)

    # Now confirm the resize.
    confirm_request = {'confirmResize': None}

    # FIXME(gibi): This is bug 1944759. During the resize, the
    # resize_instance() call on the source node overlaps, at the point
    # where it calls finish_resize, with an update_available_resources()
    # periodic job. As a result the periodic job tracks neither the
    # migration nor the instance and therefore frees the resource
    # allocation. When the resize is confirmed later, confirm_resize on
    # the source compute also tries to free the resources (the pinned
    # CPUs) and fails because they are already freed.
    error = self.assertRaises(
        client.OpenStackApiException,
        self.api.post_server_action, server['id'], confirm_request
    )
    self.assertEqual(500, error.response.status_code)
    self.assertIn('CPUUnpinningInvalid', str(error))

    # The confirm failed above, but the resource allocation reflects
    # that the VM is running on the destination node.
    assert_instance_accounted_on_dest_only()

    self._run_periodics()

    # This allocation state is stable, so as a recovery step the VM can
    # be reset-state to ACTIVE without problems.
    assert_instance_accounted_on_dest_only()
810+
811+
def test_resize_revert_bug_1944759(self):
    """Reverting the racy resize of bug 1944759 leaves accounting wrong
    until the next periodic run.
    """
    server, source, destination = (
        self._create_server_and_resize_bug_1944759())

    # Now revert the resize. Unlike the confirm, the revert actually
    # succeeds, but the resource allocation is still flaky.
    revert_request = {'revertResize': None}
    self.api.post_server_action(server['id'], revert_request)
    self._wait_for_state_change(server, 'ACTIVE')

    # This is a resource accounting bug: after the revert the source
    # host should have 2 CPUs pinned because of the instance.
    self._assert_pinned_cpus(source, 0)
    self._assert_pinned_cpus(destination, 0)

    # Running the periodic job fixes the resource accounting...
    self._run_periodics()

    # ...so the pinning is now correct.
    self._assert_pinned_cpus(source, 2)
    self._assert_pinned_cpus(destination, 0)
834+
714835

715836
class NUMAServerTestWithCountingQuotaFromPlacement(NUMAServersTest):
716837

0 commit comments

Comments
 (0)