Skip to content

Commit bfa0dc7

Browse files
committed
test: change the echo server in test_5_snapshots
With AL2023, a 4.14 kernel, and some machine configurations, this test can fail because the vsock clients time out during connect. The problem disappears if we run socat instead of the internal echo server that we run in a Python thread. It may be that the echo server is buggy or that the Python GIL is acting up. So, replace the single-threaded echo server with socat, which is an external process and can fork per client. Signed-off-by: Pablo Barbáchano <[email protected]>
1 parent aa499ee commit bfa0dc7

File tree

3 files changed

+24
-83
lines changed

3 files changed

+24
-83
lines changed

tests/framework/utils_vsock.py

Lines changed: 21 additions & 81 deletions
Original file line numberDiff line numberDiff line change
@@ -5,11 +5,11 @@
55
import hashlib
66
import os.path
77
import re
8-
from select import select
8+
import time
9+
from pathlib import Path
910
from socket import AF_UNIX, SOCK_STREAM, socket
10-
from threading import Event, Thread
11-
12-
from framework import utils
11+
from subprocess import Popen
12+
from threading import Thread
1313

1414
ECHO_SERVER_PORT = 5252
1515
SERVER_ACCEPT_BACKLOG = 128
@@ -19,78 +19,6 @@
1919
VSOCK_UDS_PATH = "v.sock"
2020

2121

22-
class HostEchoServer(Thread):
23-
"""A simple "echo" server for vsock.
24-
25-
This server will accept incoming connections (initiated by the guest vm),
26-
and, for each connection, it will read any incoming data and then echo it
27-
right back.
28-
"""
29-
30-
def __init__(self, vm, path):
31-
"""."""
32-
super().__init__()
33-
self.vm = vm
34-
self.path = path
35-
self.sock = socket(AF_UNIX, SOCK_STREAM)
36-
self.sock.bind(path)
37-
self.sock.listen(SERVER_ACCEPT_BACKLOG)
38-
self.error = None
39-
self.clients = []
40-
self.exit_evt = Event()
41-
42-
# Link the listening Unix socket into the VM's jail, so that
43-
# Firecracker can connect to it.
44-
vm.create_jailed_resource(path)
45-
46-
def run(self):
47-
"""Thread code payload.
48-
49-
Wrap up the real "run" into a catch-all block, because Python cannot
50-
into threads - if this thread were to raise an unhandled exception,
51-
the whole process would lock.
52-
"""
53-
try:
54-
self._run()
55-
# pylint: disable=broad-except
56-
except Exception as err:
57-
self.error = err
58-
59-
def _run(self):
60-
while not self.exit_evt.is_set():
61-
watch_list = self.clients + [self.sock]
62-
rd_list, _, _ = select(watch_list, [], [], 1)
63-
for rdo in rd_list:
64-
if rdo == self.sock:
65-
# Read event on the listening socket: a new client
66-
# wants to connect.
67-
(client, _) = self.sock.accept()
68-
self.clients.append(client)
69-
continue
70-
# Read event on a connected socket: new data is
71-
# available from some client.
72-
buf = rdo.recv(BUF_SIZE)
73-
if not buf:
74-
# Zero-length read: connection reset by peer.
75-
self.clients.remove(rdo)
76-
continue
77-
sent = 0
78-
while sent < len(buf):
79-
# Send back everything we just read.
80-
sent += rdo.send(buf[sent:])
81-
82-
def exit(self):
83-
"""Shut down the echo server and wait for it to exit.
84-
85-
This method can be called from any thread. Upon returning, the
86-
echo server will have shut down.
87-
"""
88-
self.exit_evt.set()
89-
self.join()
90-
self.sock.close()
91-
utils.run_cmd("rm -f {}".format(self.path))
92-
93-
9422
class HostEchoWorker(Thread):
9523
"""A vsock echo worker, connecting to a guest echo server.
9624
@@ -199,8 +127,19 @@ def check_guest_connections(vm, server_port_path, blob_path, blob_hash):
199127
start `TEST_CONNECTION_COUNT` workers inside the guest VM, all
200128
communicating with the echo server.
201129
"""
202-
echo_server = HostEchoServer(vm, server_port_path)
203-
echo_server.start()
130+
131+
echo_server = Popen(
132+
["socat", f"UNIX-LISTEN:{server_port_path},fork,backlog=5", "exec:'/bin/cat'"]
133+
)
134+
135+
# Link the listening Unix socket into the VM's jail, so that
136+
# Firecracker can connect to it.
137+
attempt = 0
138+
# But first, give socat a bit of time to create the socket
139+
while not Path(server_port_path).exists() and attempt < 3:
140+
time.sleep(0.2)
141+
attempt += 1
142+
vm.create_jailed_resource(server_port_path)
204143

205144
# Increase maximum process count for the ssh service.
206145
# Avoids: "bash: fork: retry: Resource temporarily unavailable"
@@ -236,9 +175,10 @@ def check_guest_connections(vm, server_port_path, blob_path, blob_hash):
236175
cmd += "for w in $workers; do wait $w || exit -1; done"
237176

238177
ecode, stdout, stderr = vm.ssh.run(cmd)
239-
240-
echo_server.exit()
241-
assert echo_server.error is None
178+
echo_server.terminate()
179+
rc = echo_server.wait()
180+
# socat exits with 128 + 15 (SIGTERM)
181+
assert rc == 143
242182

243183
print(stdout.read())
244184
assert ecode == 0, stderr.read()

tools/devctr/Dockerfile

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,8 @@ RUN apt-get update \
4141
libcurl4-openssl-dev \
4242
lsof \
4343
musl-tools \
44-
net-tools iproute2 iperf3 fdisk \
44+
# needed for integration tests
45+
net-tools iproute2 iperf3 socat fdisk \
4546
openssh-client \
4647
pkgconf \
4748
python3 python3-dev python3-pip \

tools/devtool

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,7 @@
7272
DEVCTR_IMAGE_NO_TAG="public.ecr.aws/firecracker/fcuvm"
7373

7474
# Development container tag
75-
DEVCTR_IMAGE_TAG=${DEVCTR_IMAGE_TAG:-v59}
75+
DEVCTR_IMAGE_TAG=${DEVCTR_IMAGE_TAG:-v60}
7676

7777
# Development container image (name:tag)
7878
# This should be updated whenever we upgrade the development container.

0 commit comments

Comments
 (0)