Commit f6aee50

Merge tag 'x86-timers-2020-06-03' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull x86 timer updates from Thomas Gleixner:
 "X86 timer specific updates:

   - Add TPAUSE based delay, which allows the CPU to enter an optimized
     power state while waiting for the delay to pass. The delay is based
     on TSC cycles.

   - Add the tsc_early_khz command line parameter to work around the
     problem that overclocked CPUs can report the wrong frequency via
     CPUID.16h, which causes the refined calibration to fail because the
     delta to the initial frequency value is too big. With the parameter,
     users can provide a halfway accurate initial value"

* tag 'x86-timers-2020-06-03' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  x86/tsc: Add tsc_early_khz command line parameter
  x86/delay: Introduce TPAUSE delay
  x86/delay: Refactor delay_mwaitx() for TPAUSE support
  x86/delay: Preparatory code cleanup
2 parents: dabc4df + bd35c77
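(Editor's illustration, not part of the commit: a minimal user-space sketch
of the same TPAUSE technique using the compiler's WAITPKG intrinsics.
Assumes a WAITPKG-capable CPU and gcc/clang built with -mwaitpkg;
tpause_delay_cycles() is a hypothetical helper name.)

	#include <x86intrin.h>

	#define TPAUSE_C02_STATE 0	/* deeper sleep, longer wakeup latency */

	static void tpause_delay_cycles(unsigned long long cycles)
	{
		/* TPAUSE takes an absolute TSC deadline, not a relative count */
		unsigned long long deadline = __rdtsc() + cycles;

		/*
		 * TPAUSE may wake early (interrupt, IA32_UMWAIT_CONTROL clamp),
		 * so re-arm until the deadline has really passed.
		 */
		while (__rdtsc() < deadline)
			_tpause(TPAUSE_C02_STATE, deadline);
	}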

7 files changed: +128 −39 lines changed

Documentation/admin-guide/kernel-parameters.txt
Lines changed: 6 additions & 0 deletions

@@ -5093,6 +5093,12 @@
 			interruptions from clocksource watchdog are not
 			acceptable).
 
+	tsc_early_khz=	[X86] Skip early TSC calibration and use the given
+			value instead. Useful when the early TSC frequency discovery
+			procedure is not reliable, such as on overclocked systems
+			with CPUID.16h support and partial CPUID.15h support.
+			Format: <unsigned int>
+
 	tsx=		[X86] Control Transactional Synchronization
 			Extensions (TSX) feature in Intel processors that
 			support TSX control.
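(Usage illustration, not from the commit: on a CPU overclocked to, say,
4.2 GHz, booting with "tsc_early_khz=4200000" seeds the early TSC
frequency with that value so the later refined calibration can succeed.)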

arch/x86/Kconfig.assembler
Lines changed: 4 additions & 0 deletions

@@ -15,3 +15,7 @@ config AS_SHA256_NI
 	def_bool $(as-instr,sha256msg1 %xmm0$(comma)%xmm1)
 	help
 	  Supported by binutils >= 2.24 and LLVM integrated assembler
+config AS_TPAUSE
+	def_bool $(as-instr,tpause %ecx)
+	help
+	  Supported by binutils >= 2.31.1 and LLVM integrated assembler >= V7
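(Editor's note: $(as-instr,tpause %ecx) test-assembles the instruction at
build configuration time, so AS_TPAUSE is set only when the toolchain can
encode TPAUSE; older assemblers take the raw-opcode fallback added to
__tpause() in mwait.h below.)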

arch/x86/include/asm/delay.h
Lines changed: 3 additions & 1 deletion

@@ -3,8 +3,10 @@
 #define _ASM_X86_DELAY_H
 
 #include <asm-generic/delay.h>
+#include <linux/init.h>
 
-void use_tsc_delay(void);
+void __init use_tsc_delay(void);
+void __init use_tpause_delay(void);
 void use_mwaitx_delay(void);
 
 #endif /* _ASM_X86_DELAY_H */

arch/x86/include/asm/mwait.h
Lines changed: 23 additions & 1 deletion

@@ -20,8 +20,10 @@
 
 #define MWAIT_ECX_INTERRUPT_BREAK	0x1
 #define MWAITX_ECX_TIMER_ENABLE		BIT(1)
-#define MWAITX_MAX_LOOPS		((u32)-1)
+#define MWAITX_MAX_WAIT_CYCLES		UINT_MAX
 #define MWAITX_DISABLE_CSTATES		0xf0
+#define TPAUSE_C01_STATE		1
+#define TPAUSE_C02_STATE		0
 
 u32 get_umwait_control_msr(void);
 
@@ -122,4 +124,24 @@ static inline void mwait_idle_with_hints(unsigned long eax, unsigned long ecx)
 	current_clr_polling();
 }
 
+/*
+ * Caller can specify whether to enter C0.1 (low latency, less
+ * power saving) or C0.2 state (saves more power, but longer wakeup
+ * latency). This may be overridden by the IA32_UMWAIT_CONTROL MSR
+ * which can force requests for C0.2 to be downgraded to C0.1.
+ */
+static inline void __tpause(u32 ecx, u32 edx, u32 eax)
+{
+	/* "tpause %ecx, %edx, %eax;" */
+#ifdef CONFIG_AS_TPAUSE
+	asm volatile("tpause %%ecx\n"
+		     :
+		     : "c"(ecx), "d"(edx), "a"(eax));
+#else
+	asm volatile(".byte 0x66, 0x0f, 0xae, 0xf1\t\n"
+		     :
+		     : "c"(ecx), "d"(edx), "a"(eax));
+#endif
+}
+
 #endif /* _ASM_X86_MWAIT_H */
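(Editor's note: the ".byte 0x66, 0x0f, 0xae, 0xf1" fallback is the raw
encoding of "tpause %ecx". The 0x0f 0xae opcode group is shared with
UMONITOR/UMWAIT; the 0x66 prefix selects TPAUSE, and the ModRM byte 0xf1
encodes /6 with %ecx as the operand, so the helper works even when the
assembler predates WAITPKG support.)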

arch/x86/kernel/time.c
Lines changed: 3 additions & 0 deletions

@@ -103,6 +103,9 @@ static __init void x86_late_time_init(void)
 	 */
 	x86_init.irqs.intr_mode_init();
 	tsc_init();
+
+	if (static_cpu_has(X86_FEATURE_WAITPKG))
+		use_tpause_delay();
 }
 
 /*
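(Editor's note: X86_FEATURE_WAITPKG is the kernel's flag for the CPUID
"WAITPKG" capability bit that advertises the UMONITOR/UMWAIT/TPAUSE
instructions, so the TPAUSE delay is selected only on hardware that
actually has them.)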

arch/x86/kernel/tsc.c
Lines changed: 11 additions & 1 deletion

@@ -41,6 +41,7 @@ EXPORT_SYMBOL(tsc_khz);
  * TSC can be unstable due to cpufreq or due to unsynced TSCs
  */
 static int __read_mostly tsc_unstable;
+static unsigned int __initdata tsc_early_khz;
 
 static DEFINE_STATIC_KEY_FALSE(__use_tsc);
 
@@ -59,6 +60,12 @@ struct cyc2ns {
 
 static DEFINE_PER_CPU_ALIGNED(struct cyc2ns, cyc2ns);
 
+static int __init tsc_early_khz_setup(char *buf)
+{
+	return kstrtouint(buf, 0, &tsc_early_khz);
+}
+early_param("tsc_early_khz", tsc_early_khz_setup);
+
 __always_inline void cyc2ns_read_begin(struct cyc2ns_data *data)
 {
 	int seq, idx;
@@ -1412,7 +1419,10 @@ static bool __init determine_cpu_tsc_frequencies(bool early)
 
 	if (early) {
 		cpu_khz = x86_platform.calibrate_cpu();
-		tsc_khz = x86_platform.calibrate_tsc();
+		if (tsc_early_khz)
+			tsc_khz = tsc_early_khz;
+		else
+			tsc_khz = x86_platform.calibrate_tsc();
 	} else {
 		/* We should not be here with non-native cpu calibration */
 		WARN_ON(x86_platform.calibrate_cpu != native_calibrate_cpu);
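(Editor's note: early_param() handlers are expected to return 0 on
success; by returning kstrtouint()'s result directly, a malformed
tsc_early_khz value propagates a negative error and is reported as a bad
early option rather than being silently accepted.)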

arch/x86/lib/delay.c
Lines changed: 78 additions & 36 deletions

@@ -27,9 +27,20 @@
 # include <asm/smp.h>
 #endif
 
+static void delay_loop(u64 __loops);
+
+/*
+ * Calibration and selection of the delay mechanism happens only once
+ * during boot.
+ */
+static void (*delay_fn)(u64) __ro_after_init = delay_loop;
+static void (*delay_halt_fn)(u64 start, u64 cycles) __ro_after_init;
+
 /* simple loop based delay: */
-static void delay_loop(unsigned long loops)
+static void delay_loop(u64 __loops)
 {
+	unsigned long loops = (unsigned long)__loops;
+
 	asm volatile(
 		"	test %0,%0	\n"
 		"	jz 3f		\n"
@@ -49,17 +60,17 @@ static void delay_loop(unsigned long loops)
 }
 
 /* TSC based delay: */
-static void delay_tsc(unsigned long __loops)
+static void delay_tsc(u64 cycles)
 {
-	u64 bclock, now, loops = __loops;
+	u64 bclock, now;
 	int cpu;
 
 	preempt_disable();
 	cpu = smp_processor_id();
 	bclock = rdtsc_ordered();
 	for (;;) {
 		now = rdtsc_ordered();
-		if ((now - bclock) >= loops)
+		if ((now - bclock) >= cycles)
 			break;
 
 		/* Allow RT tasks to run */
@@ -77,74 +88,105 @@ static void delay_tsc(unsigned long __loops)
 		 * counter for this CPU.
 		 */
 		if (unlikely(cpu != smp_processor_id())) {
-			loops -= (now - bclock);
+			cycles -= (now - bclock);
 			cpu = smp_processor_id();
 			bclock = rdtsc_ordered();
 		}
 	}
 	preempt_enable();
 }
 
+/*
+ * On Intel the TPAUSE instruction waits until any of:
+ * 1) the TSC counter exceeds the value provided in EDX:EAX
+ * 2) global timeout in IA32_UMWAIT_CONTROL is exceeded
+ * 3) an external interrupt occurs
+ */
+static void delay_halt_tpause(u64 start, u64 cycles)
+{
+	u64 until = start + cycles;
+	u32 eax, edx;
+
+	eax = lower_32_bits(until);
+	edx = upper_32_bits(until);
+
+	/*
+	 * Hard code the deeper (C0.2) sleep state because exit latency is
+	 * small compared to the "microseconds" that usleep() will delay.
+	 */
+	__tpause(TPAUSE_C02_STATE, edx, eax);
+}
+
 /*
  * On some AMD platforms, MWAITX has a configurable 32-bit timer, that
- * counts with TSC frequency. The input value is the loop of the
- * counter, it will exit when the timer expires.
+ * counts with TSC frequency. The input value is the number of TSC cycles
+ * to wait. MWAITX will also exit when the timer expires.
  */
-static void delay_mwaitx(unsigned long __loops)
+static void delay_halt_mwaitx(u64 unused, u64 cycles)
 {
-	u64 start, end, delay, loops = __loops;
+	u64 delay;
+
+	delay = min_t(u64, MWAITX_MAX_WAIT_CYCLES, cycles);
+	/*
+	 * Use cpu_tss_rw as a cacheline-aligned, seldomly accessed per-cpu
+	 * variable as the monitor target.
+	 */
+	__monitorx(raw_cpu_ptr(&cpu_tss_rw), 0, 0);
+
+	/*
+	 * AMD, like Intel, supports the EAX hint and EAX=0xf means, do not
+	 * enter any deep C-state and we use it here in delay() to minimize
+	 * wakeup latency.
	 */
+	__mwaitx(MWAITX_DISABLE_CSTATES, delay, MWAITX_ECX_TIMER_ENABLE);
+}
+
+/*
+ * Call a vendor specific function to delay for a given amount of time. Because
+ * these functions may return earlier than requested, check for actual elapsed
+ * time and call again until done.
+ */
+static void delay_halt(u64 __cycles)
+{
+	u64 start, end, cycles = __cycles;
 
 	/*
 	 * Timer value of 0 causes MWAITX to wait indefinitely, unless there
 	 * is a store on the memory monitored by MONITORX.
 	 */
-	if (loops == 0)
+	if (!cycles)
 		return;
 
 	start = rdtsc_ordered();
 
 	for (;;) {
-		delay = min_t(u64, MWAITX_MAX_LOOPS, loops);
-
-		/*
-		 * Use cpu_tss_rw as a cacheline-aligned, seldomly
-		 * accessed per-cpu variable as the monitor target.
-		 */
-		__monitorx(raw_cpu_ptr(&cpu_tss_rw), 0, 0);
-
-		/*
-		 * AMD, like Intel's MWAIT version, supports the EAX hint and
-		 * EAX=0xf0 means, do not enter any deep C-state and we use it
-		 * here in delay() to minimize wakeup latency.
-		 */
-		__mwaitx(MWAITX_DISABLE_CSTATES, delay, MWAITX_ECX_TIMER_ENABLE);
-
+		delay_halt_fn(start, cycles);
 		end = rdtsc_ordered();
 
-		if (loops <= end - start)
+		if (cycles <= end - start)
 			break;
 
-		loops -= end - start;
-
+		cycles -= end - start;
 		start = end;
 	}
 }
 
-/*
- * Since we calibrate only once at boot, this
- * function should be set once at boot and not changed
- */
-static void (*delay_fn)(unsigned long) = delay_loop;
-
-void use_tsc_delay(void)
+void __init use_tsc_delay(void)
 {
 	if (delay_fn == delay_loop)
 		delay_fn = delay_tsc;
 }
 
+void __init use_tpause_delay(void)
+{
+	delay_halt_fn = delay_halt_tpause;
+	delay_fn = delay_halt;
+}
+
 void use_mwaitx_delay(void)
 {
-	delay_fn = delay_mwaitx;
+	delay_halt_fn = delay_halt_mwaitx;
+	delay_fn = delay_halt;
 }
 
 int read_current_timer(unsigned long *timer_val)
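(Editor's note: the two delay_halt_fn implementations take the same
arguments but use them differently. delay_halt_tpause() arms TPAUSE with
an absolute TSC deadline in EDX:EAX, while delay_halt_mwaitx() ignores
"start" and programs a relative 32-bit timer clamped to
MWAITX_MAX_WAIT_CYCLES; the retry loop in delay_halt() covers both early
wakeups and that clamp by re-checking the elapsed TSC cycles.)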
