Commit 24220c4
tests: upgrade snap restore baseline infra
The purpose of this change is to align the test framework with the one in the main branch (as of 32c7f1e), so that snap restore baselines can be collected on firecracker-v1.1 and compared with the baselines collected on main. The major difference compensated for is the baseline config file schema.

The following has been updated:
- test_snapshot_restore_performance.py
- snap restore baseline config files
- the parse_baseline script

The style of some Python files had to be updated to satisfy the linter that was used before the switch to black after v1.1.

Signed-off-by: Nikita Kalyazin <[email protected]>
1 parent 1859426 · commit 24220c4

File tree

6 files changed: +4693 −1942 lines

tests/integration_tests/performance/configs/test_snap_restore_performance_config_4.14.json

Lines changed: 2229 additions & 905 deletions
Large diffs are not rendered by default.

tests/integration_tests/performance/configs/test_snap_restore_performance_config_5.10.json

Lines changed: 2229 additions & 905 deletions
Large diffs are not rendered by default.
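Since these large config diffs are not rendered, the following minimal sketch shows the nesting the updated lookup code (below) expects from these files. The structure is inferred from how SnapRestoreBaselinesProvider queries CONFIG_DICT and builds its tag; every concrete name and number here is a placeholder, not taken from the actual files.

# Inferred, illustrative schema sketch: all values are placeholders.
EXAMPLE_CONFIG = {
    "hosts": {
        "instances": {
            "m5d.metal": {  # hypothetical instance type
                "cpus": [
                    {
                        "model": "Intel(R) Xeon(R) Platinum 8259CL CPU",
                        # tag layout: baselines/<measurement>/<kernel>/
                        #     <disk>/<microvm spec>/<statistic>/<workload>
                        "baselines": {
                            "latency": {
                                "vmlinux-4.14.bin": {
                                    "ubuntu-18.04.ext4": {
                                        "1vcpu_128mb": {
                                            "Avg": {"restore": {"target": 25}}
                                        }
                                    }
                                }
                            }
                        },
                    }
                ]
            }
        }
    },
    # statistic definitions for the measurements, consumed by
    # DictMetadataProvider in default_lambda_consumer()
    "measurements": {},
}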

tests/integration_tests/performance/test_snapshot_restore_performance.py

Lines changed: 98 additions & 80 deletions
@@ -9,8 +9,11 @@
 import pytest
 from conftest import _test_images_s3_bucket
-from framework.artifacts import ArtifactCollection, ArtifactSet, \
-    create_net_devices_configuration
+from framework.artifacts import (
+    ArtifactCollection,
+    ArtifactSet,
+    create_net_devices_configuration,
+)
 from framework.builder import MicrovmBuilder, SnapshotBuilder, SnapshotType
 from framework.matrix import TestContext, TestMatrix
 from framework.stats import core
@@ -25,10 +28,10 @@
 from integration_tests.performance.utils import handle_failure

 TEST_ID = "snap_restore_performance"
-CONFIG_NAME_REL = "test_{}_config_{}.json".format(
-    TEST_ID, get_kernel_version(level=1))
+CONFIG_NAME_REL = "test_{}_config_{}.json"\
+    .format(TEST_ID, get_kernel_version(level=1))
 CONFIG_NAME_ABS = os.path.join(defs.CFG_LOCATION, CONFIG_NAME_REL)
-CONFIG_DICT = json.load(open(CONFIG_NAME_ABS, encoding='utf-8'))
+CONFIG_DICT = json.load(open(CONFIG_NAME_ABS, encoding="utf-8"))

 DEBUG = False
 BASE_VCPU_COUNT = 1
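As a quick sanity check, the reformatted CONFIG_NAME_REL expression resolves to the config file names in this commit's file tree; a minimal sketch assuming a 4.14 host kernel:

# Sanity check for the expression above, assuming
# get_kernel_version(level=1) returns "4.14" on the host:
TEST_ID = "snap_restore_performance"
kernel_version = "4.14"  # stand-in for get_kernel_version(level=1)
config_name = "test_{}_config_{}.json".format(TEST_ID, kernel_version)
assert config_name == "test_snap_restore_performance_config_4.14.json"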
@@ -38,7 +41,7 @@
 USEC_IN_MSEC = 1000

 # Measurements tags.
-RESTORE_LATENCY = "restore_latency"
+RESTORE_LATENCY = "latency"

 # Define 4 net device configurations.
 net_ifaces = create_net_devices_configuration(4)
@@ -53,18 +56,21 @@
 class SnapRestoreBaselinesProvider(BaselineProvider):
     """Baselines provider for snapshot restore latency."""

-    def __init__(self, env_id):
+    def __init__(self, env_id, workload):
         """Snapshot baseline provider initialization."""
         cpu_model_name = get_cpu_model_name()
-        baselines = list(filter(
-            lambda cpu_baseline: cpu_baseline["model"] == cpu_model_name,
-            CONFIG_DICT["hosts"]["instances"][get_instance_type()]["cpus"]))
+        baselines = list(
+            filter(
+                lambda cpu_baseline: cpu_baseline["model"] == cpu_model_name,
+                CONFIG_DICT["hosts"]["instances"][get_instance_type()]["cpus"],
+            )
+        )

         super().__init__(DictQuery({}))
         if len(baselines) > 0:
             super().__init__(DictQuery(baselines[0]))

-        self._tag = "baselines/{}/" + env_id + "/{}"
+        self._tag = "baselines/{}/" + env_id + "/{}/" + workload

     def get(self, ms_name: str, st_name: str) -> dict:
         """Return the baseline value corresponding to the key."""
@@ -84,33 +90,35 @@ def construct_scratch_drives():
     """Create an array of scratch disks."""
     scratchdisks = ["vdb", "vdc", "vdd", "vde"]
     disk_files = [
-        drive_tools.FilesystemFile(tempfile.mktemp(), size=64)
-        for _ in scratchdisks
+        drive_tools.FilesystemFile(
+            tempfile.mktemp(), size=64
+        ) for _ in scratchdisks
     ]
     return list(zip(scratchdisks, disk_files))


-def default_lambda_consumer(env_id):
+def default_lambda_consumer(env_id, workload):
     """Create a default lambda consumer for the snapshot restore test."""
     return st.consumer.LambdaConsumer(
         metadata_provider=DictMetadataProvider(
             CONFIG_DICT["measurements"],
-            SnapRestoreBaselinesProvider(env_id)
+            SnapRestoreBaselinesProvider(env_id, workload)
         ),
         func=consume_output,
-        func_kwargs={})
+        func_kwargs={},
+    )


 def get_snap_restore_latency(
-        context,
-        vcpus,
-        mem_size,
-        nets=1,
-        blocks=1,
-        all_devices=False,
-        iterations=10):
+    context, vcpus, mem_size, nets=1, blocks=1, all_devices=False,
+    iterations=10
+):
     """Restore snapshots with various configs to measure latency."""
-    vm_builder = context.custom['builder']
+    vm_builder = context.custom["builder"]
+    logger = context.custom["logger"]
+    balloon = vsock = 1 if all_devices else 0
+    microvm_spec = f"{vcpus}vcpu_{mem_size}mb_{nets}net_{blocks}\
+block_{vsock}vsock_{balloon}balloon"

     # Create a rw copy artifact.
     rw_disk = context.disk.copy()
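Note the trailing backslash in the added microvm_spec literal: it continues the f-string without inserting a newline, and the continuation line starts at column 0 to avoid embedding leading spaces. A minimal evaluation sketch with illustrative values (1 vcpu, 128 MiB stand in for the test parameters):

# Evaluation sketch for the added microvm_spec f-string.
vcpus, mem_size, nets, blocks = 1, 128, 1, 1
all_devices = False
balloon = vsock = 1 if all_devices else 0
microvm_spec = f"{vcpus}vcpu_{mem_size}mb_{nets}net_{blocks}\
block_{vsock}vsock_{balloon}balloon"
assert microvm_spec == "1vcpu_128mb_1net_1block_0vsock_0balloon"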
@@ -128,7 +136,9 @@ def get_snap_restore_latency(
         ssh_key=ssh_key,
         config=context.microvm,
         net_ifaces=ifaces,
-        use_ramdisk=True)
+        use_ramdisk=True,
+        io_engine="Sync",
+    )
     basevm = vm_instance.vm
     response = basevm.machine_cfg.put(
         vcpu_count=vcpus,
@@ -138,44 +148,45 @@ def get_snap_restore_latency(

     extra_disk_paths = []
     if blocks > 1:
-        for (name, diskfile) in scratch_drives[:(blocks - 1)]:
-            basevm.add_drive(name, diskfile.path, use_ramdisk=True)
+        for (name, diskfile) in scratch_drives[: (blocks - 1)]:
+            basevm.add_drive(
+                name, diskfile.path, use_ramdisk=True, io_engine="Sync"
+            )
             extra_disk_paths.append(diskfile.path)
         assert len(extra_disk_paths) > 0

     if all_devices:
         response = basevm.balloon.put(
-            amount_mib=0,
-            deflate_on_oom=True,
-            stats_polling_interval_s=1
+            amount_mib=0, deflate_on_oom=True, stats_polling_interval_s=1
         )
         assert basevm.api_session.is_status_no_content(response.status_code)

         response = basevm.vsock.put(
-            vsock_id="vsock0",
-            guest_cid=3,
-            uds_path="/v.sock"
+            vsock_id="vsock0", guest_cid=3, uds_path="/v.sock"
         )
         assert basevm.api_session.is_status_no_content(response.status_code)

     basevm.start()

+    logger.info(
+        'Testing with microvm: "{}", kernel {}, disk {}'.format(
+            microvm_spec, context.kernel.name(), context.disk.name()
+        )
+    )
     # Create a snapshot builder from a microvm.
     snapshot_builder = SnapshotBuilder(basevm)
     full_snapshot = snapshot_builder.create(
         [rw_disk.local_path()] + extra_disk_paths,
         ssh_key,
         SnapshotType.FULL,
         net_ifaces=ifaces,
-        use_ramdisk=True
+        use_ramdisk=True,
     )
     basevm.kill()
     values = []
     for _ in range(iterations):
         microvm, metrics_fifo = vm_builder.build_from_snapshot(
-            full_snapshot,
-            resume=True,
-            use_ramdisk=True
+            full_snapshot, resume=True, use_ramdisk=True
         )
         # Attempt to connect to resumed microvm.
         ssh_connection = net_tools.SSHConnection(microvm.ssh_config)
@@ -188,7 +199,7 @@ def get_snap_restore_latency(
         metrics = microvm.get_all_metrics(metrics_fifo)
         for data_point in metrics:
             metrics = json.loads(data_point)
-            cur_value = metrics['latencies_us']['load_snapshot']
+            cur_value = metrics["latencies_us"]["load_snapshot"]
             if cur_value > 0:
                 value = cur_value / USEC_IN_MSEC
                 break
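For clarity, the loop above takes the first positive load_snapshot sample from the emitted metrics lines and converts it from microseconds to milliseconds; a self-contained sketch with a fabricated metrics line:

import json

USEC_IN_MSEC = 1000

# Fabricated metrics line; only the keys read by the loop above matter.
data_point = '{"latencies_us": {"load_snapshot": 25000}}'
metrics = json.loads(data_point)
cur_value = metrics["latencies_us"]["load_snapshot"]
value = None
if cur_value > 0:
    value = cur_value / USEC_IN_MSEC  # microseconds -> milliseconds
assert value == 25.0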
@@ -215,9 +226,7 @@ def consume_output(cons, result):
 @pytest.mark.nonci
 @pytest.mark.timeout(300 * 1000)  # 1.40 hours
 @pytest.mark.parametrize(
-    'results_file_dumper',
-    [CONFIG_NAME_ABS],
-    indirect=True
+    "results_file_dumper", [CONFIG_NAME_ABS], indirect=True
 )
 def test_snap_restore_performance(bin_cloner_path, results_file_dumper):
     """
@@ -236,124 +245,133 @@ def test_snap_restore_performance(bin_cloner_path, results_file_dumper):
     # Create a test context and add builder, logger, network.
     test_context = TestContext()
     test_context.custom = {
-        'builder': MicrovmBuilder(bin_cloner_path),
-        'logger': logger,
-        'name': TEST_ID,
-        'results_file_dumper': results_file_dumper
+        "builder": MicrovmBuilder(bin_cloner_path),
+        "logger": logger,
+        "name": TEST_ID,
+        "results_file_dumper": results_file_dumper,
+        "workload": "restore",
     }

-    test_matrix = TestMatrix(context=test_context,
-                             artifact_sets=[
-                                 microvm_artifacts,
-                                 kernel_artifacts,
-                                 disk_artifacts
-                             ])
+    test_matrix = TestMatrix(
+        context=test_context,
+        artifact_sets=[microvm_artifacts, kernel_artifacts, disk_artifacts],
+    )
     test_matrix.run_test(snapshot_workload)

 def snapshot_scaling_vcpus(context, st_core, vcpu_count=10):
     """Restore snapshots with variable vcpu count."""
+    workload = context.custom["workload"]
     for i in range(vcpu_count):
-        env_id = f"{context.kernel.name()}/{context.disk.name()}/" \
+        env_id = (
+            f"{context.kernel.name()}/{context.disk.name()}/"
             f"{BASE_VCPU_COUNT + i}vcpu_{BASE_MEM_SIZE_MIB}mb"
+        )

         st_prod = st.producer.LambdaProducer(
             func=get_snap_restore_latency,
             func_kwargs={
                 "context": context,
                 "vcpus": BASE_VCPU_COUNT + i,
-                "mem_size": BASE_MEM_SIZE_MIB
-            }
+                "mem_size": BASE_MEM_SIZE_MIB,
+            },
         )
-        st_cons = default_lambda_consumer(env_id)
-        st_core.add_pipe(st_prod, st_cons, f"{env_id}/restore_latency")
+        st_cons = default_lambda_consumer(env_id, workload)
+        st_core.add_pipe(st_prod, st_cons, f"{env_id}/{workload}")
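The net effect of threading workload through here (and in the scaling functions below): pipe ids, and via the consumer the baseline tags, now end in the workload name instead of the fixed restore_latency suffix. An illustrative pipe id for the first iteration above, assuming placeholder kernel/disk names and BASE_VCPU_COUNT == 1, BASE_MEM_SIZE_MIB == 128:

# Illustrative pipe id produced by the loop above for i == 0.
env_id = "vmlinux-4.14.bin/ubuntu-18.04.ext4/1vcpu_128mb"
workload = "restore"
pipe_id = f"{env_id}/{workload}"
assert pipe_id == "vmlinux-4.14.bin/ubuntu-18.04.ext4/1vcpu_128mb/restore"
# Previously: f"{env_id}/restore_latency"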


 def snapshot_scaling_mem(context, st_core, mem_exponent=9):
     """Restore snapshots with variable memory size."""
+    workload = context.custom["workload"]
     for i in range(1, mem_exponent):
-        env_id = f"{context.kernel.name()}/{context.disk.name()}/" \
+        env_id = (
+            f"{context.kernel.name()}/{context.disk.name()}/"
             f"{BASE_VCPU_COUNT}vcpu_{BASE_MEM_SIZE_MIB * (2 ** i)}mb"
+        )

         st_prod = st.producer.LambdaProducer(
             func=get_snap_restore_latency,
             func_kwargs={
                 "context": context,
                 "vcpus": BASE_VCPU_COUNT,
-                "mem_size": BASE_MEM_SIZE_MIB * (2 ** i)
-            }
+                "mem_size": BASE_MEM_SIZE_MIB * (2**i),
+            },
         )
-        st_cons = default_lambda_consumer(env_id)
-        st_core.add_pipe(st_prod, st_cons, f"{env_id}/restore_latency")
+        st_cons = default_lambda_consumer(env_id, workload)
+        st_core.add_pipe(st_prod, st_cons, f"{env_id}/{workload}")


 def snapshot_scaling_net(context, st_core, net_count=4):
     """Restore snapshots with variable net device count."""
+    workload = context.custom["workload"]
     for i in range(1, net_count):
-        env_id = f"{context.kernel.name()}/{context.disk.name()}/" \
+        env_id = (
+            f"{context.kernel.name()}/{context.disk.name()}/"
             f"{BASE_NET_COUNT + i}net_dev"
+        )

         st_prod = st.producer.LambdaProducer(
             func=get_snap_restore_latency,
             func_kwargs={
                 "context": context,
                 "vcpus": BASE_VCPU_COUNT,
                 "mem_size": BASE_MEM_SIZE_MIB,
-                "nets": BASE_NET_COUNT + i
-            }
+                "nets": BASE_NET_COUNT + i,
+            },
         )
-        st_cons = default_lambda_consumer(env_id)
-        st_core.add_pipe(st_prod, st_cons, f"{env_id}/restore_latency")
+        st_cons = default_lambda_consumer(env_id, workload)
+        st_core.add_pipe(st_prod, st_cons, f"{env_id}/{workload}")


 def snapshot_scaling_block(context, st_core, block_count=4):
     """Restore snapshots with variable block device count."""
     # pylint: disable=W0603
+    workload = context.custom["workload"]
     global scratch_drives
     scratch_drives = construct_scratch_drives()

     for i in range(1, block_count):
-        env_id = f"{context.kernel.name()}/{context.disk.name()}/" \
+        env_id = (
+            f"{context.kernel.name()}/{context.disk.name()}/"
             f"{BASE_BLOCK_COUNT + i}block_dev"
+        )

         st_prod = st.producer.LambdaProducer(
             func=get_snap_restore_latency,
             func_kwargs={
                 "context": context,
                 "vcpus": BASE_VCPU_COUNT,
                 "mem_size": BASE_MEM_SIZE_MIB,
-                "blocks": BASE_BLOCK_COUNT + i
-            }
+                "blocks": BASE_BLOCK_COUNT + i,
+            },
         )
-        st_cons = default_lambda_consumer(env_id)
-        st_core.add_pipe(st_prod, st_cons, f"{env_id}/restore_latency")
+        st_cons = default_lambda_consumer(env_id, workload)
+        st_core.add_pipe(st_prod, st_cons, f"{env_id}/{workload}")


 def snapshot_all_devices(context, st_core):
     """Restore snapshots with one of each devices."""
-    env_id = f"{context.kernel.name()}/{context.disk.name()}/" \
-        f"all_dev"
-
+    workload = context.custom["workload"]
+    env_id = f"{context.kernel.name()}/{context.disk.name()}/" f"all_dev"
     st_prod = st.producer.LambdaProducer(
         func=get_snap_restore_latency,
         func_kwargs={
             "context": context,
             "vcpus": BASE_VCPU_COUNT,
             "mem_size": BASE_MEM_SIZE_MIB,
-            "all_devices": True
-        }
+            "all_devices": True,
+        },
     )
-    st_cons = default_lambda_consumer(env_id)
-    st_core.add_pipe(st_prod, st_cons, f"{env_id}/restore_latency")
+    st_cons = default_lambda_consumer(env_id, workload)
+    st_core.add_pipe(st_prod, st_cons, f"{env_id}/{workload}")

349368

350369
def snapshot_workload(context):
351370
"""Test all VM configurations for snapshot restore."""
352371
file_dumper = context.custom["results_file_dumper"]
353372

354373
st_core = core.Core(
355-
name=TEST_ID,
356-
iterations=1,
374+
name=TEST_ID, iterations=1,
357375
custom={"cpu_model_name": get_cpu_model_name()}
358376
)
359377
