@@ -711,6 +711,127 @@ def fake_confirm_migration(*args, **kwargs):
        server = self._wait_for_state_change(server, 'ACTIVE')

+    def _assert_pinned_cpus(self, hostname, expected_number_of_pinned):
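+        # Look up the compute node's NUMA topology as recorded in the DB and
+        # count how many pCPUs the resource tracker currently has pinned in
+        # cell 0.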
+        numa_topology = objects.NUMATopology.obj_from_db_obj(
+            objects.ComputeNode.get_by_nodename(
+                self.ctxt, hostname,
+            ).numa_topology,
+        )
+        self.assertEqual(
+            expected_number_of_pinned, len(numa_topology.cells[0].pinned_cpus))
+
+    def _create_server_and_resize_bug_1944759(self):
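+        # pCPUs 0-3 are available for pinned instances, 4-7 back shared
+        # vCPUs, and the legacy vcpu_pin_set option is cleared in favor of
+        # the per-compute cpu_dedicated_set / cpu_shared_set options.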
+        self.flags(
+            cpu_dedicated_set='0-3', cpu_shared_set='4-7', group='compute')
+        self.flags(vcpu_pin_set=None)
+
+        # start services
+        self.start_compute(hostname='test_compute0')
+        self.start_compute(hostname='test_compute1')
+
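+        # Boot a server with a 2 vCPU dedicated (pinned) flavor; this pins
+        # 2 pCPUs on whichever host the scheduler picks.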
+        flavor_a_id = self._create_flavor(
+            vcpu=2, extra_spec={'hw:cpu_policy': 'dedicated'})
+        server = self._create_server(flavor_id=flavor_a_id)
+
+        src_host = server['OS-EXT-SRV-ATTR:host']
+        self._assert_pinned_cpus(src_host, 2)
+
+        # we don't really care what the new flavor is, so long as the old
+        # flavor is using pinning. We use a similar flavor for simplicity.
+        flavor_b_id = self._create_flavor(
+            vcpu=2, extra_spec={'hw:cpu_policy': 'dedicated'})
+
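+        # Keep a reference to the real RPC method so the stub below can
+        # still delegate to it after running the periodic.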
+        orig_rpc_finish_resize = nova.compute.rpcapi.ComputeAPI.finish_resize
+
+        # Simulate that the finish_resize call overlaps with an
+        # update_available_resource periodic job
+        def inject_periodic_to_finish_resize(*args, **kwargs):
+            self._run_periodics()
+            return orig_rpc_finish_resize(*args, **kwargs)
+
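+        # Install the wrapper in place of the real RPC call so the periodic
+        # fires right as the source compute hands the resize over to the
+        # destination.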
+        self.stub_out(
+            'nova.compute.rpcapi.ComputeAPI.finish_resize',
+            inject_periodic_to_finish_resize,
+        )
+
+        # TODO(stephenfin): The mock of 'migrate_disk_and_power_off' should
+        # probably be less...dumb
+        with mock.patch(
+            'nova.virt.libvirt.driver.LibvirtDriver'
+            '.migrate_disk_and_power_off', return_value='{}',
+        ):
+            post = {'resize': {'flavorRef': flavor_b_id}}
+            self.api.post_server_action(server['id'], post)
+            server = self._wait_for_state_change(server, 'VERIFY_RESIZE')
+
+        dst_host = server['OS-EXT-SRV-ATTR:host']
+
+        # This is a resource accounting bug: we should have 2 CPUs pinned on
+        # both computes, the source because of the outbound migration and the
+        # destination because of the instance running there.
+        self._assert_pinned_cpus(src_host, 0)
+        self._assert_pinned_cpus(dst_host, 2)
+
+        return server, src_host, dst_host
+
+    def test_resize_confirm_bug_1944759(self):
+        server, src_host, dst_host = (
+            self._create_server_and_resize_bug_1944759())
+
+        # Now confirm the resize
+        post = {'confirmResize': None}
+
+        # FIXME(gibi): This is bug 1944759: during resize, the
+        # resize_instance() call on the source node, at the point where it
+        # calls finish_resize, overlaps with an update_available_resource()
+        # periodic job. As a result the periodic job tracks neither the
+        # migration nor the instance, and therefore frees the resource
+        # allocation. When the resize is later confirmed, confirm_resize on
+        # the source compute also tries to free up the resources, the pinned
+        # CPUs, and it fails as they are already freed.
+        exc = self.assertRaises(
+            client.OpenStackApiException,
+            self.api.post_server_action, server['id'], post
+        )
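+        # The CPU unpinning failure propagates back to the API call, which
+        # returns it as an HTTP 500.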
+        self.assertEqual(500, exc.response.status_code)
+        self.assertIn('CPUUnpinningInvalid', str(exc))
+
+        # confirm failed above but the resource allocation reflects that the
+        # VM is running on the dest node
+        self._assert_pinned_cpus(src_host, 0)
+        self._assert_pinned_cpus(dst_host, 2)
+
+        self._run_periodics()
+
+        # This allocation situation is stable, so as a recovery the VM can be
+        # reset-state to ACTIVE without problems.
+        self._assert_pinned_cpus(src_host, 0)
+        self._assert_pinned_cpus(dst_host, 2)
+
+    def test_resize_revert_bug_1944759(self):
+        server, src_host, dst_host = (
+            self._create_server_and_resize_bug_1944759())
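+        # Same broken starting state as in the confirm test: 0 CPUs pinned
+        # on the source and 2 on the destination.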
+
+        # Now revert the resize
+        post = {'revertResize': None}
+
+        # The revert actually succeeds (unlike confirm) but the resource
+        # accounting is still broken
+        self.api.post_server_action(server['id'], post)
+        self._wait_for_state_change(server, 'ACTIVE')
+
+
+        # This is a resource accounting bug. After the revert the source host
+        # should have 2 CPUs pinned due to the instance.
+        self._assert_pinned_cpus(src_host, 0)
+        self._assert_pinned_cpus(dst_host, 0)
+
+        # running the periodic job will fix the resource accounting
+        self._run_periodics()
+
+        # this is now correct
+        self._assert_pinned_cpus(src_host, 2)
+        self._assert_pinned_cpus(dst_host, 0)
+

class NUMAServerTestWithCountingQuotaFromPlacement(NUMAServersTest):