
Commit 47125db

Kan Liang authored and Peter Zijlstra committed
perf/x86/intel/lbr: Support Architectural LBR
Last Branch Records (LBR) enables recording of software path history by logging taken branches and other control flows within architectural registers. Intel CPUs have had model-specific LBR for quite some time, but this evolves them into an architectural feature now.

The main improvements of Architectural LBR include:

- The Linux kernel can support the LBR features without knowing the model number of the current CPU.
- Architectural LBR capabilities can be enumerated by CPUID. The lbr_ctl_map is based on the CPUID enumeration.
- The possible LBR depth can be retrieved from CPUID enumeration. The max value is written to the new MSR_ARCH_LBR_DEPTH as the number of LBR entries.
- A new IA32_LBR_CTL MSR is introduced to enable and configure LBRs, which replaces the IA32_DEBUGCTL[bit 0] and the LBR_SELECT MSR.
- Each LBR record or entry is still comprised of three MSRs, IA32_LBR_x_FROM_IP, IA32_LBR_x_TO_IP and IA32_LBR_x_INFO, but they become architectural MSRs.
- Architectural LBR is stack-like. Entry 0 is always the youngest branch, entry 1 the next youngest, and so on. The TOS MSR has been removed.

The way to enable/disable Architectural LBR is similar to the previous model-specific LBR. __intel_pmu_lbr_enable/disable() can be reused, but some modifications are required, which include:

- MSR_ARCH_LBR_CTL is used to enable and configure the Architectural LBR.
- When checking the value of the IA32_DEBUGCTL MSR, ignore DEBUGCTLMSR_LBR (bit 0) for Architectural LBR; it has no meaning there and always reads as 0.
- FREEZE_LBRS_ON_PMI has to be explicitly set/cleared, because MSR_IA32_DEBUGCTLMSR is not touched in __intel_pmu_lbr_disable() for Architectural LBR.
- Only MSR_ARCH_LBR_CTL is cleared in __intel_pmu_lbr_disable() for Architectural LBR.

Some Architectural LBR dedicated functions are implemented to reset/read/save/restore LBR:

- For reset, writing to the ARCH_LBR_DEPTH MSR clears all Arch LBR entries, which is a lot faster and can improve the context switch latency.
- For read, the branch type information can be retrieved from MSR_ARCH_LBR_INFO_*, but it is not fully compatible due to the OTHER_BRANCH type; software decoding is still required for the OTHER_BRANCH case. LBR records are stored in age order as well, so intel_pmu_store_lbr() is reused. Check the CPUID enumeration before accessing the corresponding bits in LBR_INFO.
- For save/restore, apply the fast reset (writing ARCH_LBR_DEPTH). Read 'lbr_from' of entry 0 instead of the TOS MSR to check whether the LBR registers were reset in a deep C-state. If the 'deep C-state reset' bit is not set in the CPUID enumeration, skip the check.

XSAVE support for Architectural LBR will be implemented later.

The number of LBR entries can no longer be hardcoded; it is retrieved from CPUID enumeration. A new structure x86_perf_task_context_arch_lbr is introduced for Architectural LBR.

Signed-off-by: Kan Liang <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Link: https://lkml.kernel.org/r/[email protected]
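For context, here is a minimal userspace sketch (not part of this commit) of the CPUID leaf 0x1C enumeration that intel_pmu_arch_lbr_init() consumes. The depth calculation mirrors the kernel's fls(lbr_depth_mask) * 8; the exact bit positions are my reading of the cpuid28_* layouts used by this series and should be treated as assumptions.

/*
 * Sketch: probe Architectural LBR capabilities from userspace.
 * Bit positions (EAX[7:0] depth mask, EAX[30] deep C-state reset,
 * EBX[0..2] CPL/filter/call-stack, ECX[2] branch type) are assumptions.
 */
#include <cpuid.h>
#include <stdio.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx;

	if (!__get_cpuid_count(0x1c, 0, &eax, &ebx, &ecx, &edx)) {
		puts("CPUID leaf 0x1C not available: no Arch LBR");
		return 1;
	}

	unsigned int depth_mask = eax & 0xff;	/* supported depth bitmap */
	/* Same math as the kernel: fls(depth_mask) * 8 gives the max depth. */
	unsigned int lbr_nr = depth_mask ? (32 - __builtin_clz(depth_mask)) * 8 : 0;

	printf("max LBR depth     : %u\n", lbr_nr);
	printf("deep C-state reset: %u\n", (eax >> 30) & 1);
	printf("CPL filtering     : %u\n", ebx & 1);
	printf("branch filtering  : %u\n", (ebx >> 1) & 1);
	printf("call-stack mode   : %u\n", (ebx >> 2) & 1);
	printf("branch type field : %u\n", (ecx >> 2) & 1);
	return 0;
}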
1 parent 631618a commit 47125db

File tree

3 files changed: +253 -11 lines changed

arch/x86/events/intel/core.c

Lines changed: 3 additions & 0 deletions
@@ -4664,6 +4664,9 @@ __init int intel_pmu_init(void)
                 x86_pmu.lbr_read = intel_pmu_lbr_read_32;
         }
 
+        if (boot_cpu_has(X86_FEATURE_ARCH_LBR))
+                intel_pmu_arch_lbr_init();
+
         intel_ds_init();
 
         x86_add_quirk(intel_arch_events_quirk); /* Install first, so it runs last */

arch/x86/events/intel/lbr.c

Lines changed: 240 additions & 11 deletions
@@ -172,6 +172,14 @@ enum {
 
 static void intel_pmu_lbr_filter(struct cpu_hw_events *cpuc);
 
+static __always_inline bool is_lbr_call_stack_bit_set(u64 config)
+{
+        if (static_cpu_has(X86_FEATURE_ARCH_LBR))
+                return !!(config & ARCH_LBR_CALL_STACK);
+
+        return !!(config & LBR_CALL_STACK);
+}
+
 /*
  * We only support LBR implementations that have FREEZE_LBRS_ON_PMI
  * otherwise it becomes near impossible to get a reliable stack.
@@ -195,27 +203,40 @@ static void __intel_pmu_lbr_enable(bool pmi)
          */
         if (cpuc->lbr_sel)
                 lbr_select = cpuc->lbr_sel->config & x86_pmu.lbr_sel_mask;
-        if (!pmi && cpuc->lbr_sel)
+        if (!static_cpu_has(X86_FEATURE_ARCH_LBR) && !pmi && cpuc->lbr_sel)
                 wrmsrl(MSR_LBR_SELECT, lbr_select);
 
         rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
         orig_debugctl = debugctl;
-        debugctl |= DEBUGCTLMSR_LBR;
+
+        if (!static_cpu_has(X86_FEATURE_ARCH_LBR))
+                debugctl |= DEBUGCTLMSR_LBR;
+
         /*
          * LBR callstack does not work well with FREEZE_LBRS_ON_PMI.
          * If FREEZE_LBRS_ON_PMI is set, PMI near call/return instructions
          * may cause superfluous increase/decrease of LBR_TOS.
          */
-        if (!(lbr_select & LBR_CALL_STACK))
+        if (is_lbr_call_stack_bit_set(lbr_select))
+                debugctl &= ~DEBUGCTLMSR_FREEZE_LBRS_ON_PMI;
+        else
                 debugctl |= DEBUGCTLMSR_FREEZE_LBRS_ON_PMI;
+
         if (orig_debugctl != debugctl)
                 wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
+
+        if (static_cpu_has(X86_FEATURE_ARCH_LBR))
+                wrmsrl(MSR_ARCH_LBR_CTL, lbr_select | ARCH_LBR_CTL_LBREN);
 }
 
 static void __intel_pmu_lbr_disable(void)
 {
         u64 debugctl;
 
+        if (static_cpu_has(X86_FEATURE_ARCH_LBR)) {
+                wrmsrl(MSR_ARCH_LBR_CTL, 0);
+                return;
+        }
+
         rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
         debugctl &= ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI);
         wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
@@ -241,6 +262,12 @@ void intel_pmu_lbr_reset_64(void)
         }
 }
 
+static void intel_pmu_arch_lbr_reset(void)
+{
+        /* Write to ARCH_LBR_DEPTH MSR, all LBR entries are reset to 0 */
+        wrmsrl(MSR_ARCH_LBR_DEPTH, x86_pmu.lbr_nr);
+}
+
 void intel_pmu_lbr_reset(void)
 {
         struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
@@ -439,8 +466,28 @@ void intel_pmu_lbr_restore(void *ctx)
                 wrmsrl(MSR_LBR_SELECT, task_ctx->lbr_sel);
 }
 
+static void intel_pmu_arch_lbr_restore(void *ctx)
+{
+        struct x86_perf_task_context_arch_lbr *task_ctx = ctx;
+        struct lbr_entry *entries = task_ctx->entries;
+        int i;
+
+        /* Fast reset the LBRs before restore if the call stack is not full. */
+        if (!entries[x86_pmu.lbr_nr - 1].from)
+                intel_pmu_arch_lbr_reset();
+
+        for (i = 0; i < x86_pmu.lbr_nr; i++) {
+                if (!entries[i].from)
+                        break;
+                wrlbr_all(&entries[i], i, true);
+        }
+}
+
 static __always_inline bool lbr_is_reset_in_cstate(void *ctx)
 {
+        if (static_cpu_has(X86_FEATURE_ARCH_LBR))
+                return x86_pmu.lbr_deep_c_reset && !rdlbr_from(0, NULL);
+
         return !rdlbr_from(((struct x86_perf_task_context *)ctx)->tos, NULL);
 }
 
@@ -494,6 +541,22 @@ void intel_pmu_lbr_save(void *ctx)
                 rdmsrl(MSR_LBR_SELECT, task_ctx->lbr_sel);
 }
 
+static void intel_pmu_arch_lbr_save(void *ctx)
+{
+        struct x86_perf_task_context_arch_lbr *task_ctx = ctx;
+        struct lbr_entry *entries = task_ctx->entries;
+        int i;
+
+        for (i = 0; i < x86_pmu.lbr_nr; i++) {
+                if (!rdlbr_all(&entries[i], i, true))
+                        break;
+        }
+
+        /* LBR call stack is not full. Reset is required in restore. */
+        if (i < x86_pmu.lbr_nr)
+                entries[x86_pmu.lbr_nr - 1].from = 0;
+}
+
 static void __intel_pmu_lbr_save(void *ctx)
 {
         struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
@@ -786,6 +849,39 @@ void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)
         cpuc->lbr_stack.hw_idx = tos;
 }
 
+static __always_inline int get_lbr_br_type(u64 info)
+{
+        if (!static_cpu_has(X86_FEATURE_ARCH_LBR) || !x86_pmu.lbr_br_type)
+                return 0;
+
+        return (info & LBR_INFO_BR_TYPE) >> LBR_INFO_BR_TYPE_OFFSET;
+}
+
+static __always_inline bool get_lbr_mispred(u64 info)
+{
+        if (static_cpu_has(X86_FEATURE_ARCH_LBR) && !x86_pmu.lbr_mispred)
+                return 0;
+
+        return !!(info & LBR_INFO_MISPRED);
+}
+
+static __always_inline bool get_lbr_predicted(u64 info)
+{
+        if (static_cpu_has(X86_FEATURE_ARCH_LBR) && !x86_pmu.lbr_mispred)
+                return 0;
+
+        return !(info & LBR_INFO_MISPRED);
+}
+
+static __always_inline bool get_lbr_cycles(u64 info)
+{
+        if (static_cpu_has(X86_FEATURE_ARCH_LBR) &&
+            !(x86_pmu.lbr_timed_lbr && info & LBR_INFO_CYC_CNT_VALID))
+                return 0;
+
+        return info & LBR_INFO_CYCLES;
+}
+
 static void intel_pmu_store_lbr(struct cpu_hw_events *cpuc,
                                 struct lbr_entry *entries)
 {
@@ -810,18 +906,23 @@ static void intel_pmu_store_lbr(struct cpu_hw_events *cpuc,
 
                 e->from = from;
                 e->to = to;
-                e->mispred = !!(info & LBR_INFO_MISPRED);
-                e->predicted = !(info & LBR_INFO_MISPRED);
+                e->mispred = get_lbr_mispred(info);
+                e->predicted = get_lbr_predicted(info);
                 e->in_tx = !!(info & LBR_INFO_IN_TX);
                 e->abort = !!(info & LBR_INFO_ABORT);
-                e->cycles = info & LBR_INFO_CYCLES;
-                e->type = 0;
+                e->cycles = get_lbr_cycles(info);
+                e->type = get_lbr_br_type(info);
                 e->reserved = 0;
         }
 
         cpuc->lbr_stack.nr = i;
 }
 
+static void intel_pmu_arch_lbr_read(struct cpu_hw_events *cpuc)
+{
+        intel_pmu_store_lbr(cpuc, NULL);
+}
+
 void intel_pmu_lbr_read(void)
 {
         struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
@@ -1197,6 +1298,27 @@ common_branch_type(int type)
         return PERF_BR_UNKNOWN;
 }
 
+enum {
+        ARCH_LBR_BR_TYPE_JCC            = 0,
+        ARCH_LBR_BR_TYPE_NEAR_IND_JMP   = 1,
+        ARCH_LBR_BR_TYPE_NEAR_REL_JMP   = 2,
+        ARCH_LBR_BR_TYPE_NEAR_IND_CALL  = 3,
+        ARCH_LBR_BR_TYPE_NEAR_REL_CALL  = 4,
+        ARCH_LBR_BR_TYPE_NEAR_RET       = 5,
+        ARCH_LBR_BR_TYPE_KNOWN_MAX      = ARCH_LBR_BR_TYPE_NEAR_RET,
+
+        ARCH_LBR_BR_TYPE_MAP_MAX        = 16,
+};
+
+static const int arch_lbr_br_type_map[ARCH_LBR_BR_TYPE_MAP_MAX] = {
+        [ARCH_LBR_BR_TYPE_JCC]                  = X86_BR_JCC,
+        [ARCH_LBR_BR_TYPE_NEAR_IND_JMP]         = X86_BR_IND_JMP,
+        [ARCH_LBR_BR_TYPE_NEAR_REL_JMP]         = X86_BR_JMP,
+        [ARCH_LBR_BR_TYPE_NEAR_IND_CALL]        = X86_BR_IND_CALL,
+        [ARCH_LBR_BR_TYPE_NEAR_REL_CALL]        = X86_BR_CALL,
+        [ARCH_LBR_BR_TYPE_NEAR_RET]             = X86_BR_RET,
+};
+
 /*
  * implement actual branch filter based on user demand.
  * Hardware may not exactly satisfy that request, thus
@@ -1209,7 +1331,7 @@ intel_pmu_lbr_filter(struct cpu_hw_events *cpuc)
 {
         u64 from, to;
         int br_sel = cpuc->br_sel;
-        int i, j, type;
+        int i, j, type, to_plm;
         bool compress = false;
 
         /* if sampling all branches, then nothing to filter */
@@ -1221,8 +1343,19 @@ intel_pmu_lbr_filter(struct cpu_hw_events *cpuc)
 
                 from = cpuc->lbr_entries[i].from;
                 to = cpuc->lbr_entries[i].to;
+                type = cpuc->lbr_entries[i].type;
 
-                type = branch_type(from, to, cpuc->lbr_entries[i].abort);
+                /*
+                 * Parse the branch type recorded in LBR_x_INFO MSR.
+                 * Doesn't support OTHER_BRANCH decoding for now.
+                 * OTHER_BRANCH branch type still rely on software decoding.
+                 */
+                if (static_cpu_has(X86_FEATURE_ARCH_LBR) &&
+                    type <= ARCH_LBR_BR_TYPE_KNOWN_MAX) {
+                        to_plm = kernel_ip(to) ? X86_BR_KERNEL : X86_BR_USER;
+                        type = arch_lbr_br_type_map[type] | to_plm;
+                } else
+                        type = branch_type(from, to, cpuc->lbr_entries[i].abort);
                 if (type != X86_BR_NONE && (br_sel & X86_BR_ANYTX)) {
                         if (cpuc->lbr_entries[i].in_tx)
                                 type |= X86_BR_IN_TX;
@@ -1261,8 +1394,9 @@ void intel_pmu_store_pebs_lbrs(struct lbr_entry *lbr)
 {
         struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
 
-        /* Cannot get TOS for large PEBS */
-        if (cpuc->n_pebs == cpuc->n_large_pebs)
+        /* Cannot get TOS for large PEBS and Arch LBR */
+        if (static_cpu_has(X86_FEATURE_ARCH_LBR) ||
+            (cpuc->n_pebs == cpuc->n_large_pebs))
                 cpuc->lbr_stack.hw_idx = -1ULL;
         else
                 cpuc->lbr_stack.hw_idx = intel_pmu_lbr_tos();
@@ -1324,6 +1458,26 @@ static const int hsw_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = {
         [PERF_SAMPLE_BRANCH_CALL_SHIFT] = LBR_REL_CALL,
 };
 
+static int arch_lbr_ctl_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = {
+        [PERF_SAMPLE_BRANCH_ANY_SHIFT]          = ARCH_LBR_ANY,
+        [PERF_SAMPLE_BRANCH_USER_SHIFT]         = ARCH_LBR_USER,
+        [PERF_SAMPLE_BRANCH_KERNEL_SHIFT]       = ARCH_LBR_KERNEL,
+        [PERF_SAMPLE_BRANCH_HV_SHIFT]           = LBR_IGN,
+        [PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT]   = ARCH_LBR_RETURN |
+                                                  ARCH_LBR_OTHER_BRANCH,
+        [PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT]     = ARCH_LBR_REL_CALL |
+                                                  ARCH_LBR_IND_CALL |
+                                                  ARCH_LBR_OTHER_BRANCH,
+        [PERF_SAMPLE_BRANCH_IND_CALL_SHIFT]     = ARCH_LBR_IND_CALL,
+        [PERF_SAMPLE_BRANCH_COND_SHIFT]         = ARCH_LBR_JCC,
+        [PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT]   = ARCH_LBR_REL_CALL |
+                                                  ARCH_LBR_IND_CALL |
+                                                  ARCH_LBR_RETURN |
+                                                  ARCH_LBR_CALL_STACK,
+        [PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT]     = ARCH_LBR_IND_JMP,
+        [PERF_SAMPLE_BRANCH_CALL_SHIFT]         = ARCH_LBR_REL_CALL,
+};
+
 /* core */
 void __init intel_pmu_lbr_init_core(void)
 {
@@ -1471,6 +1625,81 @@ void intel_pmu_lbr_init_knl(void)
         x86_pmu.intel_cap.lbr_format = LBR_FORMAT_EIP_FLAGS;
 }
 
+void __init intel_pmu_arch_lbr_init(void)
+{
+        union cpuid28_eax eax;
+        union cpuid28_ebx ebx;
+        union cpuid28_ecx ecx;
+        unsigned int unused_edx;
+        u64 lbr_nr;
+
+        /* Arch LBR Capabilities */
+        cpuid(28, &eax.full, &ebx.full, &ecx.full, &unused_edx);
+
+        lbr_nr = fls(eax.split.lbr_depth_mask) * 8;
+        if (!lbr_nr)
+                goto clear_arch_lbr;
+
+        /* Apply the max depth of Arch LBR */
+        if (wrmsrl_safe(MSR_ARCH_LBR_DEPTH, lbr_nr))
+                goto clear_arch_lbr;
+
+        x86_pmu.lbr_depth_mask = eax.split.lbr_depth_mask;
+        x86_pmu.lbr_deep_c_reset = eax.split.lbr_deep_c_reset;
+        x86_pmu.lbr_lip = eax.split.lbr_lip;
+        x86_pmu.lbr_cpl = ebx.split.lbr_cpl;
+        x86_pmu.lbr_filter = ebx.split.lbr_filter;
+        x86_pmu.lbr_call_stack = ebx.split.lbr_call_stack;
+        x86_pmu.lbr_mispred = ecx.split.lbr_mispred;
+        x86_pmu.lbr_timed_lbr = ecx.split.lbr_timed_lbr;
+        x86_pmu.lbr_br_type = ecx.split.lbr_br_type;
+        x86_pmu.lbr_nr = lbr_nr;
+
+        x86_get_pmu()->task_ctx_size = sizeof(struct x86_perf_task_context_arch_lbr) +
+                                       lbr_nr * sizeof(struct lbr_entry);
+
+        x86_pmu.lbr_from = MSR_ARCH_LBR_FROM_0;
+        x86_pmu.lbr_to = MSR_ARCH_LBR_TO_0;
+        x86_pmu.lbr_info = MSR_ARCH_LBR_INFO_0;
+
+        /* LBR callstack requires both CPL and Branch Filtering support */
+        if (!x86_pmu.lbr_cpl ||
+            !x86_pmu.lbr_filter ||
+            !x86_pmu.lbr_call_stack)
+                arch_lbr_ctl_map[PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT] = LBR_NOT_SUPP;
+
+        if (!x86_pmu.lbr_cpl) {
+                arch_lbr_ctl_map[PERF_SAMPLE_BRANCH_USER_SHIFT] = LBR_NOT_SUPP;
+                arch_lbr_ctl_map[PERF_SAMPLE_BRANCH_KERNEL_SHIFT] = LBR_NOT_SUPP;
+        } else if (!x86_pmu.lbr_filter) {
+                arch_lbr_ctl_map[PERF_SAMPLE_BRANCH_ANY_SHIFT] = LBR_NOT_SUPP;
+                arch_lbr_ctl_map[PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT] = LBR_NOT_SUPP;
+                arch_lbr_ctl_map[PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT] = LBR_NOT_SUPP;
+                arch_lbr_ctl_map[PERF_SAMPLE_BRANCH_IND_CALL_SHIFT] = LBR_NOT_SUPP;
+                arch_lbr_ctl_map[PERF_SAMPLE_BRANCH_COND_SHIFT] = LBR_NOT_SUPP;
+                arch_lbr_ctl_map[PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT] = LBR_NOT_SUPP;
+                arch_lbr_ctl_map[PERF_SAMPLE_BRANCH_CALL_SHIFT] = LBR_NOT_SUPP;
+        }
+
+        x86_pmu.lbr_ctl_mask = ARCH_LBR_CTL_MASK;
+        x86_pmu.lbr_ctl_map = arch_lbr_ctl_map;
+
+        if (!x86_pmu.lbr_cpl && !x86_pmu.lbr_filter)
+                x86_pmu.lbr_ctl_map = NULL;
+
+        x86_pmu.lbr_reset = intel_pmu_arch_lbr_reset;
+        x86_pmu.lbr_read = intel_pmu_arch_lbr_read;
+        x86_pmu.lbr_save = intel_pmu_arch_lbr_save;
+        x86_pmu.lbr_restore = intel_pmu_arch_lbr_restore;
+
+        pr_cont("Architectural LBR, ");
+
+        return;
+
+clear_arch_lbr:
+        clear_cpu_cap(&boot_cpu_data, X86_FEATURE_ARCH_LBR);
+}
+
 /**
  * x86_perf_get_lbr - get the LBR records information
  *
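As a usage note, here is a hedged userspace sketch (not part of this commit) of the kind of perf_event_open() request that the new arch_lbr_ctl_map services: on Arch LBR hardware, the kernel translates branch_sample_type into MSR_ARCH_LBR_CTL bits. The event choice and sample period below are illustrative only.

/*
 * Illustrative only: request user-space branch records on a cycles event.
 * All fields are standard perf_event_attr fields.
 */
#include <linux/perf_event.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	struct perf_event_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_HARDWARE;
	attr.config = PERF_COUNT_HW_CPU_CYCLES;
	attr.sample_period = 100000;
	attr.sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_BRANCH_STACK;
	attr.branch_sample_type = PERF_SAMPLE_BRANCH_ANY | PERF_SAMPLE_BRANCH_USER;
	attr.exclude_kernel = 1;

	/* pid = 0 (this task), cpu = -1 (any), no group, no flags */
	int fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
	if (fd < 0) {
		perror("perf_event_open");
		return 1;
	}
	puts("branch-stack sampling event opened");
	close(fd);
	return 0;
}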
