
Commit 36ba17d

Merge pull request #5368 from qinqon/kv-prevent-oom-killer-failure
virt: Delete LSP at external process killing VM
2 parents 96a59fe + ab8d473 commit 36ba17d

File tree

5 files changed: +92 -27 lines changed


go-controller/pkg/kubevirt/pod.go

Lines changed: 8 additions & 4 deletions

@@ -470,11 +470,15 @@ func DiscoverLiveMigrationStatus(client *factory.WatchFactory, pod *corev1.Pod)

 	targetPod := vmPods[len(vmPods)-1]
 	livingPods := filterNotComplete(vmPods)
+
+	// If there is no living pod we should state no live migration status
+	if len(livingPods) == 0 {
+		return nil, nil
+	}
+
+	// There is a living pod but is not the target one so the migration
+	// has failed.
 	if util.PodCompleted(targetPod) {
-		// if target pod failed, then there should be only one living source pod.
-		if len(livingPods) != 1 {
-			return nil, fmt.Errorf("unexpected live migration state: should have a single living pod")
-		}
 		return &LiveMigrationStatus{
 			SourcePod: livingPods[0],
 			TargetPod: targetPod,
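For context, this hunk changes DiscoverLiveMigrationStatus so that a VM whose pods have all completed yields no migration status at all, instead of the old "should have a single living pod" error. Below is a minimal, self-contained sketch of that decision flow; the pod, migrationStatus, notComplete and discover names are illustrative stand-ins, not the production types and helpers from pod.go.

```go
// Illustrative sketch only: simplified stand-ins for the production types.
package main

import "fmt"

type pod struct {
	name      string
	completed bool
}

type migrationStatus struct {
	source, target pod
}

// notComplete mirrors the role of filterNotComplete: keep pods still running.
func notComplete(pods []pod) []pod {
	var living []pod
	for _, p := range pods {
		if !p.completed {
			living = append(living, p)
		}
	}
	return living
}

// discover mirrors the reworked flow: no living pod means "no live migration
// status" (nil, nil) rather than an error about living pod counts.
func discover(vmPods []pod) (*migrationStatus, error) {
	if len(vmPods) == 0 {
		return nil, nil // guard for the sketch
	}
	targetPod := vmPods[len(vmPods)-1]
	livingPods := notComplete(vmPods)
	if len(livingPods) == 0 {
		return nil, nil // VM is simply not running after migration
	}
	if targetPod.completed {
		// A pod is still alive but the target is not: the migration failed.
		return &migrationStatus{source: livingPods[0], target: targetPod}, nil
	}
	return nil, nil // in-progress / succeeded handling omitted in this sketch
}

func main() {
	allDone := []pod{{"virt-launcher-a", true}, {"virt-launcher-b", true}}
	status, err := discover(allDone)
	fmt.Println(status, err) // <nil> <nil>: no living pod, no status, no error
}
```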

go-controller/pkg/kubevirt/pod_test.go

Lines changed: 5 additions & 6 deletions

@@ -98,6 +98,11 @@ var _ = Describe("Kubevirt Pod", func() {
 				pods: []corev1.Pod{successfullyMigratedKvSourcePod, failedMigrationKvTargetPod, successfulMigrationKvTargetPod},
 			},
 		),
+		Entry("returns nil when there is all the pods are completed (not running vm after migration)",
+			testParams{
+				pods: []corev1.Pod{completedKubevirtPod(t0), completedKubevirtPod(t1), completedKubevirtPod(t3)},
+			},
+		),
 		Entry("returns Migration in progress status when 2 pods are running, target pod is not yet ready",
 			testParams{
 				pods: []corev1.Pod{runningKvSourcePod, duringMigrationKvTargetPod},
@@ -148,12 +153,6 @@ var _ = Describe("Kubevirt Pod", func() {
 				},
 			},
 		),
-		Entry("returns err when kubevirt VM has several living pods and target pod failed",
-			testParams{
-				pods:          []corev1.Pod{runningKvSourcePod, successfulMigrationKvTargetPod, anotherFailedMigrationKvTargetPod},
-				expectedError: fmt.Errorf("unexpected live migration state: should have a single living pod"),
-			},
-		),
 		Entry("returns err when kubevirt VM has several living pods",
 			testParams{
 				pods: []corev1.Pod{runningKvSourcePod, duringMigrationKvTargetPod, yetAnotherDuringMigrationKvTargetPod},

go-controller/pkg/ovn/kubevirt_test.go

Lines changed: 6 additions & 4 deletions

@@ -146,10 +146,6 @@ var _ = Describe("OVN Kubevirt Operations", func() {
 			addressIPv6: "fd11::3",
 		},
 	}
-	logicalSwitch                            *nbdb.LogicalSwitch
-	ovnClusterRouter                         *nbdb.LogicalRouter
-	logicalRouterPort                        *nbdb.LogicalRouterPort
-	migrationSourceLSRP, migrationTargetLSRP *nbdb.LogicalSwitchPort

 	lrpIP = func(network string) string {
 		return strings.Split(network, "/")[0]
@@ -497,6 +493,12 @@ var _ = Describe("OVN Kubevirt Operations", func() {

 	Context("during execution", func() {
 		DescribeTable("reconcile migratable vm pods", func(t testData) {
+			var (
+				logicalSwitch                            *nbdb.LogicalSwitch
+				ovnClusterRouter                         *nbdb.LogicalRouter
+				logicalRouterPort                        *nbdb.LogicalRouterPort
+				migrationSourceLSRP, migrationTargetLSRP *nbdb.LogicalSwitchPort
+			)

 			_, parsedClusterCIDRIPv4, err := net.ParseCIDR(clusterCIDRIPv4)
 			Expect(err).ToNot(HaveOccurred())
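The hunks above move the nbdb pointers from the outer Describe scope into the body of the "reconcile migratable vm pods" table function, presumably so each table entry starts from fresh zero values rather than from whatever a previous entry stored in the shared closure. A minimal sketch of that scoping difference, using plain closures instead of Ginkgo and made-up names (sharedLS, perRunLS):

```go
// Illustrative sketch only: plain closures standing in for Ginkgo's
// DescribeTable/Entry mechanics; sharedLS and perRunLS are made-up names.
package main

import "fmt"

func main() {
	deref := func(p *string) string {
		if p == nil {
			return "<nil>"
		}
		return *p
	}

	var sharedLS *string // declared once in the outer scope, shared by every "entry"

	runEntry := func(name string) {
		var perRunLS *string // declared inside the entry body, nil at the start of each run
		fmt.Printf("%s start: sharedLS=%s perRunLS=%s\n", name, deref(sharedLS), deref(perRunLS))

		v := name + "-logical-switch"
		sharedLS = &v
		perRunLS = &v
		_ = perRunLS
	}

	runEntry("entry-1") // sharedLS=<nil>
	runEntry("entry-2") // sharedLS still holds entry-1's value; perRunLS is nil again
}
```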

test/e2e/kubevirt.go

Lines changed: 56 additions & 13 deletions

@@ -50,7 +50,7 @@ import (
 	e2eskipper "k8s.io/kubernetes/test/e2e/framework/skipper"
 	testutils "k8s.io/kubernetes/test/utils"
 	utilnet "k8s.io/utils/net"
-	"k8s.io/utils/pointer"
+	"k8s.io/utils/ptr"
 	crclient "sigs.k8s.io/controller-runtime/pkg/client"

 	butaneconfig "github.com/coreos/butane/config"
@@ -794,9 +794,9 @@ var _ = Describe("Kubevirt Virtual Machines", feature.VirtualMachineSupport, fun
 		}).WithPolling(time.Second).WithTimeout(time.Minute).Should(Succeed())
 	}

-	waitVirtualMachineInstanceReadiness = func(vmi *kubevirtv1.VirtualMachineInstance) {
+	waitVirtualMachineInstanceReadinessWith = func(vmi *kubevirtv1.VirtualMachineInstance, conditionStatus corev1.ConditionStatus) {
 		GinkgoHelper()
-		By(fmt.Sprintf("Waiting for readiness at virtual machine %s", vmi.Name))
+		By(fmt.Sprintf("Waiting for readiness=%q at virtual machine %s", conditionStatus, vmi.Name))
 		Eventually(func() []kubevirtv1.VirtualMachineInstanceCondition {
 			err := crClient.Get(context.Background(), crclient.ObjectKeyFromObject(vmi), vmi)
 			Expect(err).To(SatisfyAny(
@@ -807,10 +807,20 @@ var _ = Describe("Kubevirt Virtual Machines", feature.VirtualMachineSupport, fun
 		}).WithPolling(time.Second).WithTimeout(5 * time.Minute).Should(
 			ContainElement(SatisfyAll(
 				HaveField("Type", kubevirtv1.VirtualMachineInstanceReady),
-				HaveField("Status", corev1.ConditionTrue),
+				HaveField("Status", conditionStatus),
 			)))
 	}

+	waitVirtualMachineInstanceReadiness = func(vmi *kubevirtv1.VirtualMachineInstance) {
+		GinkgoHelper()
+		waitVirtualMachineInstanceReadinessWith(vmi, corev1.ConditionTrue)
+	}
+
+	waitVirtualMachineInstanceFailed = func(vmi *kubevirtv1.VirtualMachineInstance) {
+		GinkgoHelper()
+		waitVirtualMachineInstanceReadinessWith(vmi, corev1.ConditionFalse)
+	}
+
 	waitVirtualMachineAddresses = func(vmi *kubevirtv1.VirtualMachineInstance) []kubevirt.Address {
 		GinkgoHelper()
 		step := by(vmi.Name, "Wait for virtual machine to receive IPv4 address from DHCP")
@@ -903,7 +913,7 @@ var _ = Describe("Kubevirt Virtual Machines", feature.VirtualMachineSupport, fun
 					NetworkSource: networkSource,
 				},
 			},
-			TerminationGracePeriodSeconds: pointer.Int64(5),
+			TerminationGracePeriodSeconds: ptr.To(int64(5)),
 			Volumes: []kubevirtv1.Volume{
 				{
 					Name: "containerdisk",
@@ -929,7 +939,7 @@ var _ = Describe("Kubevirt Virtual Machines", feature.VirtualMachineSupport, fun
 			GenerateName: vmi.GenerateName,
 		},
 		Spec: kubevirtv1.VirtualMachineSpec{
-			Running: pointer.Bool(true),
+			RunStrategy: ptr.To(kubevirtv1.RunStrategyAlways),
 			Template: &kubevirtv1.VirtualMachineInstanceTemplateSpec{
 				ObjectMeta: metav1.ObjectMeta{
 					Annotations: vmi.Annotations,
@@ -1414,8 +1424,8 @@ fi
 			Name: "force-post-copy",
 		},
 		Spec: kvmigrationsv1alpha1.MigrationPolicySpec{
-			AllowPostCopy:           pointer.Bool(true),
-			CompletionTimeoutPerGiB: pointer.Int64(1),
+			AllowPostCopy:           ptr.To(true),
+			CompletionTimeoutPerGiB: ptr.To(int64(1)),
 			BandwidthPerMigration:   &bandwidthPerMigration,
 			Selectors: &kvmigrationsv1alpha1.Selectors{
 				VirtualMachineInstanceSelector: kvmigrationsv1alpha1.LabelSelector{
@@ -2219,15 +2229,20 @@ chpasswd: { expire: False }
 			networkData, err := staticIPsNetworkData(filterCIDRs(fr.ClientSet, vmiIPv4, vmiIPv6))
 			Expect(err).NotTo(HaveOccurred())

-			vmi := fedoraWithTestToolingVMI(nil /*labels*/, nil /*annotations*/, nil /*nodeSelector*/, kubevirtv1.NetworkSource{
+			vm := fedoraWithTestToolingVM(nil /*labels*/, nil /*annotations*/, nil /*nodeSelector*/, kubevirtv1.NetworkSource{
 				Multus: &kubevirtv1.MultusNetwork{
 					NetworkName: cudn.Name,
 				},
 			}, userData, networkData)
 			// Harcode mac address so it's the same after live migration
-			vmi.Spec.Domain.Devices.Interfaces[0].MacAddress = vmiMAC
-			createVirtualMachineInstance(vmi)
-
+			vm.Spec.Template.Spec.Domain.Devices.Interfaces[0].MacAddress = vmiMAC
+			createVirtualMachine(vm)
+			vmi := &kubevirtv1.VirtualMachineInstance{
+				ObjectMeta: metav1.ObjectMeta{
+					Namespace: namespace,
+					Name:      vm.Name,
+				},
+			}
 			waitVirtualMachineInstanceReadiness(vmi)
 			Expect(crClient.Get(context.TODO(), crclient.ObjectKeyFromObject(vmi), vmi)).To(Succeed())

@@ -2253,11 +2268,39 @@ chpasswd: { expire: False }
 			by(vmi.Name, "Running live migration for virtual machine instance")
 			td(vmi)

-			step = by(vmi.Name, fmt.Sprintf("Login to virtual machine after virtual machine instance live migration"))
+			// Update vmi status after live migration
+			Expect(crClient.Get(context.Background(), crclient.ObjectKeyFromObject(vmi), vmi)).To(Succeed())
+
+			step = by(vmi.Name, "Login to virtual machine after virtual machine instance live migration")
 			Expect(kubevirt.LoginToFedora(vmi, "fedora", "fedora")).To(Succeed(), step)

 			step = by(vmi.Name, "Check east/west traffic after virtual machine instance live migration")
 			checkEastWestIperfTraffic(vmi, testPodsIPs, step)
+
+			By("Stop iperf3 traffic before force killing vm, so iperf3 server do not get stuck")
+			output, err = kubevirt.RunCommand(vmi, "killall iperf3", 5*time.Second)
+			Expect(err).ToNot(HaveOccurred(), output)
+
+			step = by(vmi.Name, fmt.Sprintf("Force kill qemu at node %q where VM is running on", vmi.Status.NodeName))
+			Expect(kubevirt.ForceKillVirtLauncherAtNode(infraprovider.Get(), vmi.Status.NodeName, vmi.Namespace, vmi.Name)).To(Succeed())
+
+			step = by(vmi.Name, "Waiting for failed restarted VMI to reach ready state")
+			waitVirtualMachineInstanceFailed(vmi)
+			waitVirtualMachineInstanceReadiness(vmi)
+			Expect(crClient.Get(context.TODO(), crclient.ObjectKeyFromObject(vmi), vmi)).To(Succeed())
+
+			step = by(vmi.Name, "Login to virtual machine after virtual machine instance force killed")
+			Expect(kubevirt.LoginToFedora(vmi, "fedora", "fedora")).To(Succeed(), step)
+
+			step = by(vmi.Name, "Restart iperf traffic after forcing a vm failure")
+			Expect(startEastWestIperfTraffic(vmi, testPodsIPs, step)).To(Succeed(), step)
+			checkEastWestIperfTraffic(vmi, testPodsIPs, step)
+
+			by(vmi.Name, "Running live migration after forcing vm failure")
+			td(vmi)
+
+			step = by(vmi.Name, "Check east/west traffic for failed virtual machine after live migration")
+			checkEastWestIperfTraffic(vmi, testPodsIPs, step)
 		},
 			Entry("after succeeded live migration", liveMigrateSucceed),
 			Entry("after failed live migration", liveMigrateFailed),

test/e2e/kubevirt/pod.go

Lines changed: 17 additions & 0 deletions

@@ -1,6 +1,9 @@
 package kubevirt

 import (
+	"fmt"
+
+	infraapi "github.com/ovn-org/ovn-kubernetes/test/e2e/infraprovider/api"
 	corev1 "k8s.io/api/core/v1"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/utils/ptr"
@@ -31,3 +34,17 @@ func GenerateFakeVirtLauncherPod(namespace, vmName string) *corev1.Pod {
 		},
 	}
 }
+
+func ForceKillVirtLauncherAtNode(p infraapi.Provider, nodeName, vmNamespace, vmName string) error {
+	// /usr/bin/virt-launcher --qemu-timeout 312s --name worker-dcf9j --uid bcf975f4-7bdd-4264-948b-b6080320e38a --namespace kv-live-migration-2575 --kubevirt-share-dir /var/run/kubevirt --ephemeral-disk-dir /var/run/kubevirt-ephemeral-disks --container-disk-dir /var/run/kubevirt/container-disks --grace-period-seconds 20 --hook-sidecars 0 --ovmf-path /usr/share/OVMF --run-as-nonroot
+	killScript := fmt.Sprintf(`
+pid=$(pgrep -f 'virt-launcher .*--name %s.*--namespace %s'|grep -v $$)
+ps aux |grep virt-launcher
+kill -9 $pid
+`, vmName, vmNamespace)
+	output, err := p.ExecK8NodeCommand(nodeName, []string{"bash", "-xe", "-c", killScript})
+	if err != nil {
+		return fmt.Errorf("%s:%w", output, err)
+	}
+	return nil
+}
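ForceKillVirtLauncherAtNode shells into the node and kills the virt-launcher process whose command line matches the VM's name and namespace (the long comment shows a sample command line). A hedged, self-contained sketch of just the process-matching part follows: pgrep -f matches against full command lines, and grep -v $$ drops the bash process running the script itself, whose own argv also contains the pattern. The VM name and namespace below are placeholders taken from the sample in the comment, and the node execution through infraapi.Provider is replaced with a local exec call so the snippet runs on its own.

```go
// Hedged sketch of the kill script the helper builds; ExecK8NodeCommand is
// replaced with a local exec call purely for illustration.
package main

import (
	"fmt"
	"os/exec"
)

func buildKillScript(vmName, vmNamespace string) string {
	// pgrep -f matches the full command line; grep -v $$ removes the shell
	// running this very script, whose own argv also contains the pattern.
	return fmt.Sprintf(`
pid=$(pgrep -f 'virt-launcher .*--name %s.*--namespace %s'|grep -v $$)
kill -9 $pid
`, vmName, vmNamespace)
}

func main() {
	script := buildKillScript("worker-dcf9j", "kv-live-migration-2575")
	// On a cluster node the e2e helper runs this as: bash -xe -c "<script>"
	out, err := exec.Command("bash", "-xe", "-c", script).CombinedOutput()
	fmt.Printf("%s\n%v\n", out, err)
}
```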
