Skip to content

Commit 2f3d08f

Browse files
fenrus75 authored and rafaeljw committed
intel_idle: Add support for using intel_idle in a VM guest using just hlt
In a typical VM guest, the mwait instruction is not available, leaving only the 'hlt' instruction (which causes a VMEXIT to the host). So for this common case, intel_idle will detect the lack of mwait, and fail to initialize (after which another idle method would step in which will just use hlt always). Other (non-common) cases exist; the table below shows the before/after for these: +------------+--------------------------+-------------------------+ | Hypervisor | Idle method before patch | Idle method after patch | | exposes | | | +============+==========================+=========================+ | nothing | default_idle fallback | intel_idle VM table | | (common) | (straight "hlt") | | +------------+--------------------------+-------------------------+ | mwait | intel_idle mwait table | intel_idle mwait table | +------------+--------------------------+-------------------------+ | ACPI | ACPI C1 state ("hlt") | intel_idle VM table | +------------+--------------------------+-------------------------+ This is only applicable to CPUs known by intel_idle. For the bare metal case, unknown CPU models will use the ACPI tables (when available) to get estimates for latency and break even point for longer idle states. In guests, the common case is that ACPI tables are not available, but even when they are available, they can't and don't provide the latency information for the longer (mwait based) states. For this scenario (unknown CPU model), the default_idle mode (no ACPI) or ACPI C1 (ACPI available) will be used. By providing capability to do this with the intel_idle driver, we can do better than the fallback or ACPI table methods. While this current change only gets us to the existing behavior, later patches in this series will add new capabilities such as optimized TLB flushing.
In order to do this, a simplified version of the initialization function for VM guests is created, and this will be called if the CPU is recognized, but mwait is not supported, and we're in a VM guest. One thing to note is that the max latency (and break even) of this C1 state is higher than the typical bare metal C1 state. Because hlt causes a vmexit, and the cost of vmexit + hypervisor overhead + vmenter is typically in the order of up to 5 microseconds... even if the hypervisor does not actually go into a hardware power saving state. Signed-off-by: Arjan van de Ven <[email protected]> [ rjw: Dropped redundant checks from should_verify_mwait() ] Signed-off-by: Rafael J. Wysocki <[email protected]>
1 parent 7826c06 commit 2f3d08f

File tree

1 file changed

+116
-1
lines changed

1 file changed

+116
-1
lines changed

drivers/idle/intel_idle.c

Lines changed: 116 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -199,6 +199,43 @@ static __cpuidle int intel_idle_xstate(struct cpuidle_device *dev,
199199
return __intel_idle(dev, drv, index);
200200
}
201201

202+
/*
 * Common body for the HLT-based idle entry points below: halt the CPU via
 * raw_safe_halt(), then disable interrupts again before returning, so the
 * caller gets back the entered state index with interrupts off (matching the
 * "called under local_irq_disable()" contract of intel_idle_hlt()).
 */
static __always_inline int __intel_idle_hlt(struct cpuidle_device *dev,
					    struct cpuidle_driver *drv, int index)
{
	raw_safe_halt();
	raw_local_irq_disable();
	return index;
}
209+
210+
/**
 * intel_idle_hlt - Ask the processor to enter the given idle state using hlt.
 * @dev: cpuidle device of the target CPU.
 * @drv: cpuidle driver (assumed to point to intel_idle_driver).
 * @index: Target idle state index.
 *
 * Use the HLT instruction to notify the processor that the CPU represented by
 * @dev is idle and it can try to enter the idle state corresponding to @index.
 *
 * Must be called under local_irq_disable().
 *
 * Return: the entered idle state index (@index).
 */
static __cpuidle int intel_idle_hlt(struct cpuidle_device *dev,
				    struct cpuidle_driver *drv, int index)
{
	return __intel_idle_hlt(dev, drv, index);
}
226+
227+
/*
 * Variant of intel_idle_hlt() for states flagged CPUIDLE_FLAG_IRQ_ENABLE:
 * enter the halt with interrupts enabled, then restore the interrupts-off
 * state the cpuidle core expects before returning the entered state index.
 */
static __cpuidle int intel_idle_hlt_irq_on(struct cpuidle_device *dev,
					   struct cpuidle_driver *drv, int index)
{
	int ret;

	raw_local_irq_enable();
	ret = __intel_idle_hlt(dev, drv, index);
	raw_local_irq_disable();

	return ret;
}
238+
202239
/**
203240
* intel_idle_s2idle - Ask the processor to enter the given idle state.
204241
* @dev: cpuidle device of the target CPU.
@@ -1242,6 +1279,18 @@ static struct cpuidle_state snr_cstates[] __initdata = {
12421279
.enter = NULL }
12431280
};
12441281

1282+
/*
 * Idle state table for VM guests that do not expose MWAIT: a single "C1"
 * state that executes HLT (which causes a VMEXIT to the host).  The exit
 * latency and target residency are higher than a typical bare-metal C1
 * because the VMEXIT + hypervisor overhead + VMENTER round trip is on the
 * order of up to 5 microseconds, even if the hypervisor does not enter a
 * hardware power-saving state.
 */
static struct cpuidle_state vmguest_cstates[] __initdata = {
	{
		.name = "C1",
		.desc = "HLT",
		.flags = MWAIT2flg(0x00) | CPUIDLE_FLAG_IRQ_ENABLE,
		.exit_latency = 5,
		.target_residency = 10,
		.enter = &intel_idle_hlt, },
	{
		.enter = NULL }
};
1293+
12451294
static const struct idle_cpu idle_cpu_nehalem __initconst = {
12461295
.state_table = nehalem_cstates,
12471296
.auto_demotion_disable_flags = NHM_C1_AUTO_DEMOTE | NHM_C3_AUTO_DEMOTE,
@@ -1841,6 +1890,16 @@ static bool __init intel_idle_verify_cstate(unsigned int mwait_hint)
18411890

18421891
static void state_update_enter_method(struct cpuidle_state *state, int cstate)
18431892
{
1893+
if (state->enter == intel_idle_hlt) {
1894+
if (force_irq_on) {
1895+
pr_info("forced intel_idle_irq for state %d\n", cstate);
1896+
state->enter = intel_idle_hlt_irq_on;
1897+
}
1898+
return;
1899+
}
1900+
if (state->enter == intel_idle_hlt_irq_on)
1901+
return; /* no update scenarios */
1902+
18441903
if (state->flags & CPUIDLE_FLAG_INIT_XSTATE) {
18451904
/*
18461905
* Combining with XSTATE with IBRS or IRQ_ENABLE flags
@@ -1874,6 +1933,21 @@ static void state_update_enter_method(struct cpuidle_state *state, int cstate)
18741933
}
18751934
}
18761935

1936+
/*
 * For mwait based states, we want to verify the cpuid data to see if the state
 * is actually supported by this specific CPU.
 * For non-mwait based states (i.e. the HLT-only states used in VM guests),
 * this check should be skipped.
 */
static bool should_verify_mwait(struct cpuidle_state *state)
{
	/* HLT-based entry methods carry no MWAIT hint to validate. */
	if (state->enter == intel_idle_hlt)
		return false;
	if (state->enter == intel_idle_hlt_irq_on)
		return false;

	return true;
}
1950+
18771951
static void __init intel_idle_init_cstates_icpu(struct cpuidle_driver *drv)
18781952
{
18791953
int cstate;
@@ -1922,7 +1996,7 @@ static void __init intel_idle_init_cstates_icpu(struct cpuidle_driver *drv)
19221996
}
19231997

19241998
mwait_hint = flg2MWAIT(cpuidle_state_table[cstate].flags);
1925-
if (!intel_idle_verify_cstate(mwait_hint))
1999+
if (should_verify_mwait(&cpuidle_state_table[cstate]) && !intel_idle_verify_cstate(mwait_hint))
19262000
continue;
19272001

19282002
/* Structure copy. */
@@ -2056,6 +2130,45 @@ static void __init intel_idle_cpuidle_devices_uninit(void)
20562130
cpuidle_unregister_device(per_cpu_ptr(intel_idle_cpuidle_devices, i));
20572131
}
20582132

2133+
/*
 * intel_idle_vminit - Simplified driver initialization for VM guests.
 * @id: Matched x86 CPU ID entry (its driver_data points to the idle_cpu info).
 *
 * Used when the CPU model is recognized but MWAIT is unavailable and we are
 * running under a hypervisor: install the HLT-only vmguest_cstates table,
 * register the cpuidle driver and set up the CPU-online hotplug callback.
 *
 * Return: 0 on success or a negative error code, unwinding any partially
 * completed setup on failure.
 */
static int __init intel_idle_vminit(const struct x86_cpu_id *id)
{
	int retval;

	cpuidle_state_table = vmguest_cstates;

	icpu = (const struct idle_cpu *)id->driver_data;

	pr_debug("v" INTEL_IDLE_VERSION " model 0x%X\n",
		 boot_cpu_data.x86_model);

	intel_idle_cpuidle_devices = alloc_percpu(struct cpuidle_device);
	if (!intel_idle_cpuidle_devices)
		return -ENOMEM;

	intel_idle_cpuidle_driver_init(&intel_idle_driver);

	retval = cpuidle_register_driver(&intel_idle_driver);
	if (retval) {
		/* Another cpuidle driver may already be registered. */
		struct cpuidle_driver *drv = cpuidle_get_driver();
		printk(KERN_DEBUG pr_fmt("intel_idle yielding to %s\n"),
		       drv ? drv->name : "none");
		goto init_driver_fail;
	}

	/* Bring up the per-CPU cpuidle devices as CPUs come online. */
	retval = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "idle/intel:online",
				   intel_idle_cpu_online, NULL);
	if (retval < 0)
		goto hp_setup_fail;

	return 0;
hp_setup_fail:
	intel_idle_cpuidle_devices_uninit();
	cpuidle_unregister_driver(&intel_idle_driver);
init_driver_fail:
	free_percpu(intel_idle_cpuidle_devices);
	return retval;
}
2171+
20592172
static int __init intel_idle_init(void)
20602173
{
20612174
const struct x86_cpu_id *id;
@@ -2074,6 +2187,8 @@ static int __init intel_idle_init(void)
20742187
id = x86_match_cpu(intel_idle_ids);
20752188
if (id) {
20762189
if (!boot_cpu_has(X86_FEATURE_MWAIT)) {
2190+
if (boot_cpu_has(X86_FEATURE_HYPERVISOR))
2191+
return intel_idle_vminit(id);
20772192
pr_debug("Please enable MWAIT in BIOS SETUP\n");
20782193
return -ENODEV;
20792194
}

0 commit comments

Comments
 (0)