Skip to content

Commit ab8d473

Browse files
committed
virt: Delete LSP at external process killing VM
When something like the OOM killer kills the running virt-launcher pod, the code enters the branch at [1]; this prevents ovnk from deleting the LSP, so when the VM is restarted its traffic is blocked. This change just returns a nil live migration status in that case, since there is no live migration going on. [1] https://github.com/openshift/ovn-kubernetes/blob/release-4.18/go-controller/pkg/kubevirt/pod.go#L475 Signed-off-by: Enrique Llorente <[email protected]>
1 parent fd5a1d1 commit ab8d473

File tree

5 files changed

+92
-27
lines changed

5 files changed

+92
-27
lines changed

go-controller/pkg/kubevirt/pod.go

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -470,11 +470,15 @@ func DiscoverLiveMigrationStatus(client *factory.WatchFactory, pod *corev1.Pod)
470470

471471
targetPod := vmPods[len(vmPods)-1]
472472
livingPods := filterNotComplete(vmPods)
473+
474+
// If there is no living pod we should state no live migration status
475+
if len(livingPods) == 0 {
476+
return nil, nil
477+
}
478+
479+
// There is a living pod but it is not the target one, so the migration
480+
// has failed.
473481
if util.PodCompleted(targetPod) {
474-
// if target pod failed, then there should be only one living source pod.
475-
if len(livingPods) != 1 {
476-
return nil, fmt.Errorf("unexpected live migration state: should have a single living pod")
477-
}
478482
return &LiveMigrationStatus{
479483
SourcePod: livingPods[0],
480484
TargetPod: targetPod,

go-controller/pkg/kubevirt/pod_test.go

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,11 @@ var _ = Describe("Kubevirt Pod", func() {
9898
pods: []corev1.Pod{successfullyMigratedKvSourcePod, failedMigrationKvTargetPod, successfulMigrationKvTargetPod},
9999
},
100100
),
101+
Entry("returns nil when there is all the pods are completed (not running vm after migration)",
102+
testParams{
103+
pods: []corev1.Pod{completedKubevirtPod(t0), completedKubevirtPod(t1), completedKubevirtPod(t3)},
104+
},
105+
),
101106
Entry("returns Migration in progress status when 2 pods are running, target pod is not yet ready",
102107
testParams{
103108
pods: []corev1.Pod{runningKvSourcePod, duringMigrationKvTargetPod},
@@ -148,12 +153,6 @@ var _ = Describe("Kubevirt Pod", func() {
148153
},
149154
},
150155
),
151-
Entry("returns err when kubevirt VM has several living pods and target pod failed",
152-
testParams{
153-
pods: []corev1.Pod{runningKvSourcePod, successfulMigrationKvTargetPod, anotherFailedMigrationKvTargetPod},
154-
expectedError: fmt.Errorf("unexpected live migration state: should have a single living pod"),
155-
},
156-
),
157156
Entry("returns err when kubevirt VM has several living pods",
158157
testParams{
159158
pods: []corev1.Pod{runningKvSourcePod, duringMigrationKvTargetPod, yetAnotherDuringMigrationKvTargetPod},

go-controller/pkg/ovn/kubevirt_test.go

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -146,10 +146,6 @@ var _ = Describe("OVN Kubevirt Operations", func() {
146146
addressIPv6: "fd11::3",
147147
},
148148
}
149-
logicalSwitch *nbdb.LogicalSwitch
150-
ovnClusterRouter *nbdb.LogicalRouter
151-
logicalRouterPort *nbdb.LogicalRouterPort
152-
migrationSourceLSRP, migrationTargetLSRP *nbdb.LogicalSwitchPort
153149

154150
lrpIP = func(network string) string {
155151
return strings.Split(network, "/")[0]
@@ -497,6 +493,12 @@ var _ = Describe("OVN Kubevirt Operations", func() {
497493

498494
Context("during execution", func() {
499495
DescribeTable("reconcile migratable vm pods", func(t testData) {
496+
var (
497+
logicalSwitch *nbdb.LogicalSwitch
498+
ovnClusterRouter *nbdb.LogicalRouter
499+
logicalRouterPort *nbdb.LogicalRouterPort
500+
migrationSourceLSRP, migrationTargetLSRP *nbdb.LogicalSwitchPort
501+
)
500502

501503
_, parsedClusterCIDRIPv4, err := net.ParseCIDR(clusterCIDRIPv4)
502504
Expect(err).ToNot(HaveOccurred())

test/e2e/kubevirt.go

Lines changed: 56 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ import (
5050
e2eskipper "k8s.io/kubernetes/test/e2e/framework/skipper"
5151
testutils "k8s.io/kubernetes/test/utils"
5252
utilnet "k8s.io/utils/net"
53-
"k8s.io/utils/pointer"
53+
"k8s.io/utils/ptr"
5454
crclient "sigs.k8s.io/controller-runtime/pkg/client"
5555

5656
butaneconfig "github.com/coreos/butane/config"
@@ -794,9 +794,9 @@ var _ = Describe("Kubevirt Virtual Machines", feature.VirtualMachineSupport, fun
794794
}).WithPolling(time.Second).WithTimeout(time.Minute).Should(Succeed())
795795
}
796796

797-
waitVirtualMachineInstanceReadiness = func(vmi *kubevirtv1.VirtualMachineInstance) {
797+
waitVirtualMachineInstanceReadinessWith = func(vmi *kubevirtv1.VirtualMachineInstance, conditionStatus corev1.ConditionStatus) {
798798
GinkgoHelper()
799-
By(fmt.Sprintf("Waiting for readiness at virtual machine %s", vmi.Name))
799+
By(fmt.Sprintf("Waiting for readiness=%q at virtual machine %s", conditionStatus, vmi.Name))
800800
Eventually(func() []kubevirtv1.VirtualMachineInstanceCondition {
801801
err := crClient.Get(context.Background(), crclient.ObjectKeyFromObject(vmi), vmi)
802802
Expect(err).To(SatisfyAny(
@@ -807,10 +807,20 @@ var _ = Describe("Kubevirt Virtual Machines", feature.VirtualMachineSupport, fun
807807
}).WithPolling(time.Second).WithTimeout(5 * time.Minute).Should(
808808
ContainElement(SatisfyAll(
809809
HaveField("Type", kubevirtv1.VirtualMachineInstanceReady),
810-
HaveField("Status", corev1.ConditionTrue),
810+
HaveField("Status", conditionStatus),
811811
)))
812812
}
813813

814+
// waitVirtualMachineInstanceReadiness waits for the given VMI to report its
// Ready condition with status True. It is a thin wrapper that delegates to
// waitVirtualMachineInstanceReadinessWith using corev1.ConditionTrue.
waitVirtualMachineInstanceReadiness = func(vmi *kubevirtv1.VirtualMachineInstance) {
815+
GinkgoHelper()
816+
waitVirtualMachineInstanceReadinessWith(vmi, corev1.ConditionTrue)
817+
}
818+
819+
// waitVirtualMachineInstanceFailed waits for the given VMI to report its
// Ready condition with status False, i.e. until the VMI is observed as not
// ready. It delegates to waitVirtualMachineInstanceReadinessWith using
// corev1.ConditionFalse.
waitVirtualMachineInstanceFailed = func(vmi *kubevirtv1.VirtualMachineInstance) {
820+
GinkgoHelper()
821+
waitVirtualMachineInstanceReadinessWith(vmi, corev1.ConditionFalse)
822+
}
823+
814824
waitVirtualMachineAddresses = func(vmi *kubevirtv1.VirtualMachineInstance) []kubevirt.Address {
815825
GinkgoHelper()
816826
step := by(vmi.Name, "Wait for virtual machine to receive IPv4 address from DHCP")
@@ -903,7 +913,7 @@ var _ = Describe("Kubevirt Virtual Machines", feature.VirtualMachineSupport, fun
903913
NetworkSource: networkSource,
904914
},
905915
},
906-
TerminationGracePeriodSeconds: pointer.Int64(5),
916+
TerminationGracePeriodSeconds: ptr.To(int64(5)),
907917
Volumes: []kubevirtv1.Volume{
908918
{
909919
Name: "containerdisk",
@@ -929,7 +939,7 @@ var _ = Describe("Kubevirt Virtual Machines", feature.VirtualMachineSupport, fun
929939
GenerateName: vmi.GenerateName,
930940
},
931941
Spec: kubevirtv1.VirtualMachineSpec{
932-
Running: pointer.Bool(true),
942+
RunStrategy: ptr.To(kubevirtv1.RunStrategyAlways),
933943
Template: &kubevirtv1.VirtualMachineInstanceTemplateSpec{
934944
ObjectMeta: metav1.ObjectMeta{
935945
Annotations: vmi.Annotations,
@@ -1414,8 +1424,8 @@ fi
14141424
Name: "force-post-copy",
14151425
},
14161426
Spec: kvmigrationsv1alpha1.MigrationPolicySpec{
1417-
AllowPostCopy: pointer.Bool(true),
1418-
CompletionTimeoutPerGiB: pointer.Int64(1),
1427+
AllowPostCopy: ptr.To(true),
1428+
CompletionTimeoutPerGiB: ptr.To(int64(1)),
14191429
BandwidthPerMigration: &bandwidthPerMigration,
14201430
Selectors: &kvmigrationsv1alpha1.Selectors{
14211431
VirtualMachineInstanceSelector: kvmigrationsv1alpha1.LabelSelector{
@@ -2219,15 +2229,20 @@ chpasswd: { expire: False }
22192229
networkData, err := staticIPsNetworkData(selectCIDRs(vmiIPv4, vmiIPv6))
22202230
Expect(err).NotTo(HaveOccurred())
22212231

2222-
vmi := fedoraWithTestToolingVMI(nil /*labels*/, nil /*annotations*/, nil /*nodeSelector*/, kubevirtv1.NetworkSource{
2232+
vm := fedoraWithTestToolingVM(nil /*labels*/, nil /*annotations*/, nil /*nodeSelector*/, kubevirtv1.NetworkSource{
22232233
Multus: &kubevirtv1.MultusNetwork{
22242234
NetworkName: cudn.Name,
22252235
},
22262236
}, userData, networkData)
22272237
// Hardcode the MAC address so it's the same after live migration
2228-
vmi.Spec.Domain.Devices.Interfaces[0].MacAddress = vmiMAC
2229-
createVirtualMachineInstance(vmi)
2230-
2238+
vm.Spec.Template.Spec.Domain.Devices.Interfaces[0].MacAddress = vmiMAC
2239+
createVirtualMachine(vm)
2240+
vmi := &kubevirtv1.VirtualMachineInstance{
2241+
ObjectMeta: metav1.ObjectMeta{
2242+
Namespace: namespace,
2243+
Name: vm.Name,
2244+
},
2245+
}
22312246
waitVirtualMachineInstanceReadiness(vmi)
22322247
Expect(crClient.Get(context.TODO(), crclient.ObjectKeyFromObject(vmi), vmi)).To(Succeed())
22332248

@@ -2253,11 +2268,39 @@ chpasswd: { expire: False }
22532268
by(vmi.Name, "Running live migration for virtual machine instance")
22542269
td(vmi)
22552270

2256-
step = by(vmi.Name, fmt.Sprintf("Login to virtual machine after virtual machine instance live migration"))
2271+
// Update vmi status after live migration
2272+
Expect(crClient.Get(context.Background(), crclient.ObjectKeyFromObject(vmi), vmi)).To(Succeed())
2273+
2274+
step = by(vmi.Name, "Login to virtual machine after virtual machine instance live migration")
22572275
Expect(kubevirt.LoginToFedora(vmi, "fedora", "fedora")).To(Succeed(), step)
22582276

22592277
step = by(vmi.Name, "Check east/west traffic after virtual machine instance live migration")
22602278
checkEastWestIperfTraffic(vmi, testPodsIPs, step)
2279+
2280+
By("Stop iperf3 traffic before force killing vm, so iperf3 server do not get stuck")
2281+
output, err = kubevirt.RunCommand(vmi, "killall iperf3", 5*time.Second)
2282+
Expect(err).ToNot(HaveOccurred(), output)
2283+
2284+
step = by(vmi.Name, fmt.Sprintf("Force kill qemu at node %q where VM is running on", vmi.Status.NodeName))
2285+
Expect(kubevirt.ForceKillVirtLauncherAtNode(infraprovider.Get(), vmi.Status.NodeName, vmi.Namespace, vmi.Name)).To(Succeed())
2286+
2287+
step = by(vmi.Name, "Waiting for failed restarted VMI to reach ready state")
2288+
waitVirtualMachineInstanceFailed(vmi)
2289+
waitVirtualMachineInstanceReadiness(vmi)
2290+
Expect(crClient.Get(context.TODO(), crclient.ObjectKeyFromObject(vmi), vmi)).To(Succeed())
2291+
2292+
step = by(vmi.Name, "Login to virtual machine after virtual machine instance force killed")
2293+
Expect(kubevirt.LoginToFedora(vmi, "fedora", "fedora")).To(Succeed(), step)
2294+
2295+
step = by(vmi.Name, "Restart iperf traffic after forcing a vm failure")
2296+
Expect(startEastWestIperfTraffic(vmi, testPodsIPs, step)).To(Succeed(), step)
2297+
checkEastWestIperfTraffic(vmi, testPodsIPs, step)
2298+
2299+
by(vmi.Name, "Running live migration after forcing vm failure")
2300+
td(vmi)
2301+
2302+
step = by(vmi.Name, "Check east/west traffic for failed virtual machine after live migration")
2303+
checkEastWestIperfTraffic(vmi, testPodsIPs, step)
22612304
},
22622305
Entry("after succeeded live migration", liveMigrateSucceed),
22632306
Entry("after failed live migration", liveMigrateFailed),

test/e2e/kubevirt/pod.go

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,9 @@
11
package kubevirt
22

33
import (
4+
"fmt"
5+
6+
infraapi "github.com/ovn-org/ovn-kubernetes/test/e2e/infraprovider/api"
47
corev1 "k8s.io/api/core/v1"
58
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
69
"k8s.io/utils/ptr"
@@ -31,3 +34,17 @@ func GenerateFakeVirtLauncherPod(namespace, vmName string) *corev1.Pod {
3134
},
3235
}
3336
}
37+
38+
func ForceKillVirtLauncherAtNode(p infraapi.Provider, nodeName, vmNamespace, vmName string) error {
39+
// /usr/bin/virt-launcher --qemu-timeout 312s --name worker-dcf9j --uid bcf975f4-7bdd-4264-948b-b6080320e38a --namespace kv-live-migration-2575 --kubevirt-share-dir /var/run/kubevirt --ephemeral-disk-dir /var/run/kubevirt-ephemeral-disks --container-disk-dir /var/run/kubevirt/container-disks --grace-period-seconds 20 --hook-sidecars 0 --ovmf-path /usr/share/OVMF --run-as-nonroot
40+
killScript := fmt.Sprintf(`
41+
pid=$(pgrep -f 'virt-launcher .*--name %s.*--namespace %s'|grep -v $$)
42+
ps aux |grep virt-launcher
43+
kill -9 $pid
44+
`, vmName, vmNamespace)
45+
output, err := p.ExecK8NodeCommand(nodeName, []string{"bash", "-xe", "-c", killScript})
46+
if err != nil {
47+
return fmt.Errorf("%s:%w", output, err)
48+
}
49+
return nil
50+
}

0 commit comments

Comments
 (0)