Skip to content

Commit 3634039

Browse files
dwmw2 authored and mstsirkin committed
hw/acpi: Add vmclock device
The vmclock device addresses the problem of live migration with precision clocks. The tolerances of a hardware counter (e.g. TSC) are typically around ±50PPM. A guest will use NTP/PTP/PPS to discipline that counter against an external source of 'real' time, and track the precise frequency of the counter as it changes with environmental conditions. When a guest is live migrated, anything it knows about the frequency of the underlying counter becomes invalid. It may move from a host where the counter is running at -50PPM of its nominal frequency, to a host where it runs at +50PPM. There will also be a step change in the value of the counter, as the correctness of its absolute value at migration is limited by the accuracy of the source and destination hosts' time synchronization. The device exposes a shared memory region to guests, which can be mapped all the way to userspace. In the first phase, this merely advertises a 'disruption_marker', which indicates that the guest should throw away any NTP synchronization it thinks it has, and start again. Because the region can be exposed all the way to userspace, applications can still use time from a fast vDSO 'system call', and check the disruption marker to be sure that their timestamp is indeed truthful. The structure also allows for the precise time, as known by the host, to be exposed directly to guests so that they don't have to wait for NTP to resync from scratch. The values and fields are based on the nascent virtio-rtc specification, and the intent is that a version (hopefully precisely this version) of this structure will be included as an optional part of that spec. In the meantime, a simple ACPI device along the lines of VMGENID is perfectly sufficient and is compatible with what's being shipped in certain commercial hypervisors. 
Linux guest support was merged into the 6.13-rc1 kernel: https://git.kernel.org/torvalds/c/205032724226 Signed-off-by: David Woodhouse <[email protected]> Reviewed-by: Paul Durrant <[email protected]> Message-Id: <[email protected]> Reviewed-by: Michael S. Tsirkin <[email protected]> Signed-off-by: Michael S. Tsirkin <[email protected]>
1 parent 60f543a commit 3634039

File tree

8 files changed

+412
-1
lines changed

8 files changed

+412
-1
lines changed

hw/acpi/Kconfig

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,11 @@ config ACPI_VMGENID
6060
default y
6161
depends on PC
6262

63+
config ACPI_VMCLOCK
64+
bool
65+
default y
66+
depends on PC
67+
6368
config ACPI_VIOT
6469
bool
6570
depends on ACPI

hw/acpi/meson.build

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ acpi_ss.add(when: 'CONFIG_ACPI_NVDIMM', if_false: files('acpi-nvdimm-stub.c'))
1515
acpi_ss.add(when: 'CONFIG_ACPI_PCI', if_true: files('pci.c'))
1616
acpi_ss.add(when: 'CONFIG_ACPI_CXL', if_true: files('cxl.c'), if_false: files('cxl-stub.c'))
1717
acpi_ss.add(when: 'CONFIG_ACPI_VMGENID', if_true: files('vmgenid.c'))
18+
acpi_ss.add(when: 'CONFIG_ACPI_VMCLOCK', if_true: files('vmclock.c'))
1819
acpi_ss.add(when: 'CONFIG_ACPI_HW_REDUCED', if_true: files('generic_event_device.c'))
1920
acpi_ss.add(when: 'CONFIG_ACPI_HMAT', if_true: files('hmat.c'))
2021
acpi_ss.add(when: 'CONFIG_ACPI_APEI', if_true: files('ghes.c'), if_false: files('ghes-stub.c'))

hw/acpi/vmclock.c

Lines changed: 179 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,179 @@
1+
/*
2+
* Virtual Machine Clock Device
3+
*
4+
* Copyright © 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved.
5+
*
6+
* Authors: David Woodhouse <[email protected]>
7+
*
8+
* This work is licensed under the terms of the GNU GPL, version 2 or later.
9+
* See the COPYING file in the top-level directory.
10+
*/
11+
12+
#include "qemu/osdep.h"
13+
#include "qapi/error.h"
14+
#include "qemu/module.h"
15+
#include "hw/i386/e820_memory_layout.h"
16+
#include "hw/acpi/acpi.h"
17+
#include "hw/acpi/aml-build.h"
18+
#include "hw/acpi/vmclock.h"
19+
#include "hw/nvram/fw_cfg.h"
20+
#include "hw/qdev-properties.h"
21+
#include "hw/qdev-properties-system.h"
22+
#include "migration/vmstate.h"
23+
#include "system/reset.h"
24+
25+
#include "standard-headers/linux/vmclock-abi.h"
26+
27+
/*
 * Build the ACPI description of the vmclock device and append it to
 * @table_data as a standalone SSDT.
 *
 * The table declares a device named VCLK under \_SB whose _CRS holds a single
 * QWord memory descriptor covering VMCLOCK_SIZE bytes starting at
 * vms->physaddr.
 *
 * @vms: device state; only vms->physaddr is read here
 * @table_data: blob that the table header and AML body are appended to
 * @linker: handed to acpi_table_end() when the table is closed
 * @oem_id: OEM ID stored in the table header
 */
void vmclock_build_acpi(VmclockState *vms, GArray *table_data,
28+
BIOSLinker *linker, const char *oem_id)
29+
{
30+
Aml *ssdt, *dev, *scope, *crs;
31+
AcpiTable table = { .sig = "SSDT", .rev = 1,
32+
.oem_id = oem_id, .oem_table_id = "VMCLOCK" };
33+
34+
/* Put VMCLOCK into a separate SSDT table */
35+
acpi_table_begin(&table, table_data);
36+
ssdt = init_aml_allocator();
37+
38+
scope = aml_scope("\\_SB");
39+
dev = aml_device("VCLK");
40+
aml_append(dev, aml_name_decl("_HID", aml_string("AMZNC10C")));
41+
aml_append(dev, aml_name_decl("_CID", aml_string("VMCLOCK")));
42+
aml_append(dev, aml_name_decl("_DDN", aml_string("VMCLOCK")));
43+
44+
/* Simple status method */
/* _STA is declared as the constant 0xf rather than as evaluated code */
45+
aml_append(dev, aml_name_decl("_STA", aml_int(0xf)));
46+
47+
crs = aml_resource_template();
/*
 * Describe the page as a fixed, cacheable, read-only range
 * [physaddr, physaddr + VMCLOCK_SIZE - 1] of length VMCLOCK_SIZE.
 * NOTE(review): the all-ones value is presumably the descriptor's
 * address-space granularity -- confirm against aml_qword_memory().
 */
48+
aml_append(crs, aml_qword_memory(AML_POS_DECODE,
49+
AML_MIN_FIXED, AML_MAX_FIXED,
50+
AML_CACHEABLE, AML_READ_ONLY,
51+
0xffffffffffffffffULL,
52+
vms->physaddr,
53+
vms->physaddr + VMCLOCK_SIZE - 1,
54+
0, VMCLOCK_SIZE));
55+
aml_append(dev, aml_name_decl("_CRS", crs));
56+
aml_append(scope, dev);
57+
aml_append(ssdt, scope);
58+
59+
/* Append the generated AML bytes, then close the table opened above */
g_array_append_vals(table_data, ssdt->buf->data, ssdt->buf->len);
60+
acpi_table_end(linker, &table);
61+
free_aml_allocator();
62+
}
63+
64+
/*
 * Signal a clock disruption to the guest by incrementing the
 * disruption_marker field of the shared vmclock page.
 *
 * The change is bracketed by two writes to seq_count: its low bit is set
 * before the marker is touched, and the count is advanced by one more
 * (which clears the low bit again) once the marker has been written.  A
 * write barrier separates each step so the stores are ordered.  Fields in
 * the page are stored little-endian.
 *
 * Does nothing if the page has not been set up (vms->clk is NULL).
 */
static void vmclock_update_guest(VmclockState *vms)
65+
{
66+
uint64_t disruption_marker;
67+
uint32_t seq_count;
68+
69+
if (!vms->clk) {
70+
return;
71+
}
72+
73+
/* Force the sequence count odd to mark an update in progress */
seq_count = le32_to_cpu(vms->clk->seq_count) | 1;
74+
vms->clk->seq_count = cpu_to_le32(seq_count);
75+
/* These barriers pair with read barriers in the guest */
76+
smp_wmb();
77+
78+
disruption_marker = le64_to_cpu(vms->clk->disruption_marker);
79+
disruption_marker++;
80+
vms->clk->disruption_marker = cpu_to_le64(disruption_marker);
81+
82+
/* These barriers pair with read barriers in the guest */
83+
smp_wmb();
/* seq_count is odd here, so +1 yields the next even value */
84+
vms->clk->seq_count = cpu_to_le32(seq_count + 1);
85+
}
86+
87+
/*
88+
* After restoring an image, we need to update the guest memory to notify
89+
* it of clock disruption.
90+
*/
91+
static int vmclock_post_load(void *opaque, int version_id)
92+
{
93+
/* opaque is used as the VmclockState; version_id is ignored */
VmclockState *vms = opaque;
94+
95+
vmclock_update_guest(vms);
96+
/* Always reports success */
return 0;
97+
}
98+
99+
/*
 * Migration description for the vmclock device.  The only field listed is
 * physaddr; vmclock_post_load() is installed as the post_load hook.
 */
static const VMStateDescription vmstate_vmclock = {
100+
.name = "vmclock",
101+
.version_id = 1,
102+
.minimum_version_id = 1,
103+
.post_load = vmclock_post_load,
104+
.fields = (const VMStateField[]) {
105+
VMSTATE_UINT64(physaddr, VmclockState),
106+
VMSTATE_END_OF_LIST()
107+
},
108+
};
109+
110+
/*
 * Reset handler, registered by vmclock_realize() with the VmclockState as
 * its opaque argument.  Maps the clock page into the system memory region
 * at vms->physaddr unless it is already mapped.
 */
static void vmclock_handle_reset(void *opaque)
111+
{
112+
VmclockState *vms = VMCLOCK(opaque);
113+
114+
if (!memory_region_is_mapped(&vms->clk_page)) {
115+
/* NOTE(review): the trailing 0 is presumably the overlap priority */
memory_region_add_subregion_overlap(get_system_memory(),
116+
vms->physaddr,
117+
&vms->clk_page, 0);
118+
}
119+
}
120+
121+
/*
 * Realize the vmclock device.
 *
 * Sets an error in @errp and returns early when find_vmclock_dev() cannot
 * resolve a unique device.  Otherwise the page address is fixed at
 * VMCLOCK_ADDR, a reserved e820 entry is added for it, the backing RAM is
 * allocated and zeroed, the constant header fields are filled in, the reset
 * handler that maps the page is registered, and an initial
 * disruption_marker update is published.
 *
 * @dev: the device being realized
 * @errp: receives the error when more than one device exists
 */
static void vmclock_realize(DeviceState *dev, Error **errp)
122+
{
123+
VmclockState *vms = VMCLOCK(dev);
124+
125+
/*
126+
* Given that this function is executing, there is at least one VMCLOCK
127+
* device. Check if there are several.
128+
*/
129+
if (!find_vmclock_dev()) {
130+
error_setg(errp, "at most one %s device is permitted", TYPE_VMCLOCK);
131+
return;
132+
}
133+
134+
vms->physaddr = VMCLOCK_ADDR;
135+
136+
e820_add_entry(vms->physaddr, VMCLOCK_SIZE, E820_RESERVED);
137+
138+
memory_region_init_ram(&vms->clk_page, OBJECT(dev), "vmclock_page",
139+
VMCLOCK_SIZE, &error_abort);
140+
memory_region_set_enabled(&vms->clk_page, true);
141+
vms->clk = memory_region_get_ram_ptr(&vms->clk_page);
142+
memset(vms->clk, 0, VMCLOCK_SIZE);
143+
144+
vms->clk->magic = cpu_to_le32(VMCLOCK_MAGIC);
145+
/*
 * size is a 32-bit little-endian field in struct vmclock_abi, so it needs
 * a 32-bit conversion; a 16-bit swap would publish the wrong value on
 * big-endian hosts.
 */
vms->clk->size = cpu_to_le32(VMCLOCK_SIZE);
146+
vms->clk->version = cpu_to_le16(1);
147+
148+
/* These are all zero and thus default, but be explicit */
149+
vms->clk->clock_status = VMCLOCK_STATUS_UNKNOWN;
150+
vms->clk->counter_id = VMCLOCK_COUNTER_INVALID;
151+
152+
/* The page is mapped into guest memory by the reset handler */
qemu_register_reset(vmclock_handle_reset, vms);
153+
154+
vmclock_update_guest(vms);
155+
}
156+
157+
/*
 * Class initializer: install the migration description and the realize
 * callback, mark the device as not hotpluggable and file it under the
 * DEVICE_CATEGORY_MISC category.  @data is unused.
 */
static void vmclock_device_class_init(ObjectClass *klass, void *data)
158+
{
159+
DeviceClass *dc = DEVICE_CLASS(klass);
160+
161+
dc->vmsd = &vmstate_vmclock;
162+
dc->realize = vmclock_realize;
163+
dc->hotpluggable = false;
164+
set_bit(DEVICE_CATEGORY_MISC, dc->categories);
165+
}
166+
167+
/*
 * Type description for TYPE_VMCLOCK: a child of TYPE_DEVICE whose instances
 * are VmclockState structures, initialized by vmclock_device_class_init().
 */
static const TypeInfo vmclock_device_info = {
168+
.name = TYPE_VMCLOCK,
169+
.parent = TYPE_DEVICE,
170+
.instance_size = sizeof(VmclockState),
171+
.class_init = vmclock_device_class_init,
172+
};
173+
174+
/* Register the vmclock type; hooked up through type_init() below. */
static void vmclock_register_types(void)
175+
{
176+
type_register_static(&vmclock_device_info);
177+
}
178+
179+
type_init(vmclock_register_types)

hw/i386/Kconfig

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ config PC
4343
select SERIAL_ISA
4444
select ACPI_PCI
4545
select ACPI_VMGENID
46+
select ACPI_VMCLOCK
4647
select VIRTIO_PMEM_SUPPORTED
4748
select VIRTIO_MEM_SUPPORTED
4849
select HV_BALLOON_SUPPORTED

hw/i386/acpi-build.c

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@
4343
#include "system/tpm.h"
4444
#include "hw/acpi/tpm.h"
4545
#include "hw/acpi/vmgenid.h"
46+
#include "hw/acpi/vmclock.h"
4647
#include "hw/acpi/erst.h"
4748
#include "hw/acpi/piix4.h"
4849
#include "system/tpm_backend.h"
@@ -2445,7 +2446,7 @@ void acpi_build(AcpiBuildTables *tables, MachineState *machine)
24452446
uint8_t *u;
24462447
GArray *tables_blob = tables->table_data;
24472448
AcpiSlicOem slic_oem = { .id = NULL, .table_id = NULL };
2448-
Object *vmgenid_dev;
2449+
Object *vmgenid_dev, *vmclock_dev;
24492450
char *oem_id;
24502451
char *oem_table_id;
24512452

@@ -2518,6 +2519,13 @@ void acpi_build(AcpiBuildTables *tables, MachineState *machine)
25182519
tables->vmgenid, tables->linker, x86ms->oem_id);
25192520
}
25202521

2522+
vmclock_dev = find_vmclock_dev();
2523+
if (vmclock_dev) {
2524+
acpi_add_table(table_offsets, tables_blob);
2525+
vmclock_build_acpi(VMCLOCK(vmclock_dev), tables_blob, tables->linker,
2526+
x86ms->oem_id);
2527+
}
2528+
25212529
if (misc.has_hpet) {
25222530
acpi_add_table(table_offsets, tables_blob);
25232531
build_hpet(tables_blob, tables->linker, x86ms->oem_id,

include/hw/acpi/vmclock.h

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
#ifndef ACPI_VMCLOCK_H
2+
#define ACPI_VMCLOCK_H
3+
4+
#include "hw/acpi/bios-linker-loader.h"
5+
#include "hw/qdev-core.h"
6+
#include "qemu/uuid.h"
7+
#include "qom/object.h"
8+
9+
#define TYPE_VMCLOCK "vmclock"
10+
11+
#define VMCLOCK_ADDR 0xfeffb000
12+
#define VMCLOCK_SIZE 0x1000
13+
14+
OBJECT_DECLARE_SIMPLE_TYPE(VmclockState, VMCLOCK)
15+
16+
struct vmclock_abi;
17+
18+
/* Per-device state of the vmclock device. */
struct VmclockState {
19+
/* Parent device object */
DeviceState parent_obj;
20+
/* RAM region of VMCLOCK_SIZE bytes backing the page shared with the guest */
MemoryRegion clk_page;
21+
/* Guest-physical address of the page; set to VMCLOCK_ADDR at realize */
uint64_t physaddr;
22+
/* Host pointer to the contents of clk_page; NULL until realize sets it */
struct vmclock_abi *clk;
23+
};
24+
25+
/*
 * Look up the vmclock device by type.
 *
 * Returns NULL unless there is exactly one device, so a NULL result means
 * either that no vmclock device exists or that there are several.
 */
26+
static inline Object *find_vmclock_dev(void)
27+
{
28+
return object_resolve_path_type("", TYPE_VMCLOCK, NULL);
29+
}
30+
31+
void vmclock_build_acpi(VmclockState *vms, GArray *table_data,
32+
BIOSLinker *linker, const char *oem_id);
33+
34+
#endif

0 commit comments

Comments
 (0)