Skip to content

Commit 1470a10

Browse files
ftang1 authored and
acmel committed
perf c2c: Add report option to show false sharing in adjacent cachelines
Many platforms have an adjacent cacheline prefetch feature. When it is enabled, for data in RAM at 2-cacheline (2N and 2N+1) granularity, if one cacheline is fetched into the cache, the other one is likely to be fetched too, which effectively doubles the cacheline size, so false sharing can happen across adjacent cachelines. 0Day has captured performance changes related to this [1], and some commercial software explicitly makes its hot global variables 128 bytes aligned (2 cache lines) to avoid this kind of extended false sharing. So add an option "--double-cl" for 'perf c2c report' to show false sharing in double cache line granularity, which acts just as if the cacheline size were doubled. There is no change to c2c record. The hardware events of shared cachelines are still per cacheline, and this option just changes the granularity of how events are grouped and displayed. In the 'perf c2c report' output below (will-it-scale's 'pagefault2' case on an old kernel): ---------------------------------------------------------------------- 26 31 2 0 0 0 0xffff888103ec6000 ---------------------------------------------------------------------- 35.48% 50.00% 0.00% 0.00% 0.00% 0x10 0 1 0xffffffff8133148b 1153 66 971 3748 74 [k] get_mem_cgroup_from_mm 6.45% 0.00% 0.00% 0.00% 0.00% 0x10 0 1 0xffffffff813396e4 570 0 1531 879 75 [k] mem_cgroup_charge 25.81% 50.00% 0.00% 0.00% 0.00% 0x54 0 1 0xffffffff81331472 949 70 593 3359 74 [k] get_mem_cgroup_from_mm 19.35% 0.00% 0.00% 0.00% 0.00% 0x54 0 1 0xffffffff81339686 1352 0 1073 1022 74 [k] mem_cgroup_charge 9.68% 0.00% 0.00% 0.00% 0.00% 0x54 0 1 0xffffffff813396d6 1401 0 863 768 74 [k] mem_cgroup_charge 3.23% 0.00% 0.00% 0.00% 0.00% 0x54 0 1 0xffffffff81333106 618 0 804 11 9 [k] uncharge_batch The offsets 0x10 and 0x54 used to be displayed in 2 groups, and now they are listed together to give users a hint of extended false sharing. [1].
https://lore.kernel.org/lkml/20201102091543.GM31092@shao2-debian/ Committer notes: Link: https://lore.kernel.org/r/Y+wvVNWqXb70l4uy@feng-clx Removed -a, leaving just --double-cl, as this is probably not used very frequently and perhaps will even be auto-detected if we manage to record the MSR where this is configured. Reviewed-by: Andi Kleen <[email protected]> Reviewed-by: Leo Yan <[email protected]> Signed-off-by: Feng Tang <[email protected]> Tested-by: Leo Yan <[email protected]> Acked-by: Joe Mario <[email protected]> Cc: Alexander Shishkin <[email protected]> Cc: Ingo Molnar <[email protected]> Cc: Jiri Olsa <[email protected]> Cc: Kan Liang <[email protected]> Cc: Mark Rutland <[email protected]> Cc: Namhyung Kim <[email protected]> Cc: Peter Zijlstra <[email protected]> Cc: Tim Chen <[email protected]> Cc: Xing Zhengjun <[email protected]> Link: https://lore.kernel.org/r/[email protected] Signed-off-by: Arnaldo Carvalho de Melo <[email protected]>
1 parent 91621be commit 1470a10

File tree

5 files changed

+49
-17
lines changed

5 files changed

+49
-17
lines changed

tools/perf/Documentation/perf-c2c.txt

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -130,6 +130,12 @@ REPORT OPTIONS
130130
The known limitations include exception handling such as
131131
setjmp/longjmp will have calls/returns not match.
132132

133+
--double-cl::
134+
Group the detection of shared cacheline events into double cacheline
135+
granularity. Some architectures have an Adjacent Cacheline Prefetch
136+
feature, which causes cacheline sharing to behave like the cacheline
137+
size is doubled.
138+
133139
C2C RECORD
134140
----------
135141
The perf c2c record command setup options related to HITM cacheline analysis

tools/perf/builtin-c2c.c

Lines changed: 12 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -524,7 +524,7 @@ static int dcacheline_entry(struct perf_hpp_fmt *fmt, struct perf_hpp *hpp,
524524
char buf[20];
525525

526526
if (he->mem_info)
527-
addr = cl_address(he->mem_info->daddr.addr);
527+
addr = cl_address(he->mem_info->daddr.addr, chk_double_cl);
528528

529529
return scnprintf(hpp->buf, hpp->size, "%*s", width, HEX_STR(buf, addr));
530530
}
@@ -562,7 +562,7 @@ static int offset_entry(struct perf_hpp_fmt *fmt, struct perf_hpp *hpp,
562562
char buf[20];
563563

564564
if (he->mem_info)
565-
addr = cl_offset(he->mem_info->daddr.al_addr);
565+
addr = cl_offset(he->mem_info->daddr.al_addr, chk_double_cl);
566566

567567
return scnprintf(hpp->buf, hpp->size, "%*s", width, HEX_STR(buf, addr));
568568
}
@@ -574,9 +574,10 @@ offset_cmp(struct perf_hpp_fmt *fmt __maybe_unused,
574574
uint64_t l = 0, r = 0;
575575

576576
if (left->mem_info)
577-
l = cl_offset(left->mem_info->daddr.addr);
577+
l = cl_offset(left->mem_info->daddr.addr, chk_double_cl);
578+
578579
if (right->mem_info)
579-
r = cl_offset(right->mem_info->daddr.addr);
580+
r = cl_offset(right->mem_info->daddr.addr, chk_double_cl);
580581

581582
return (int64_t)(r - l);
582583
}
@@ -2590,7 +2591,7 @@ perf_c2c_cacheline_browser__title(struct hist_browser *browser,
25902591
he = cl_browser->he;
25912592

25922593
if (he->mem_info)
2593-
addr = cl_address(he->mem_info->daddr.addr);
2594+
addr = cl_address(he->mem_info->daddr.addr, chk_double_cl);
25942595

25952596
scnprintf(bf, size, "Cacheline 0x%lx", addr);
25962597
return 0;
@@ -2788,15 +2789,16 @@ static int ui_quirks(void)
27882789
if (!c2c.use_stdio) {
27892790
dim_offset.width = 5;
27902791
dim_offset.header = header_offset_tui;
2791-
nodestr = "CL";
2792+
nodestr = chk_double_cl ? "Double-CL" : "CL";
27922793
}
27932794

27942795
dim_percent_costly_snoop.header = percent_costly_snoop_header[c2c.display];
27952796

27962797
/* Fix the zero line for dcacheline column. */
2797-
buf = fill_line("Cacheline", dim_dcacheline.width +
2798-
dim_dcacheline_node.width +
2799-
dim_dcacheline_count.width + 4);
2798+
buf = fill_line(chk_double_cl ? "Double-Cacheline" : "Cacheline",
2799+
dim_dcacheline.width +
2800+
dim_dcacheline_node.width +
2801+
dim_dcacheline_count.width + 4);
28002802
if (!buf)
28012803
return -ENOMEM;
28022804

@@ -3037,6 +3039,7 @@ static int perf_c2c__report(int argc, const char **argv)
30373039
OPT_BOOLEAN('f', "force", &symbol_conf.force, "don't complain, do it"),
30383040
OPT_BOOLEAN(0, "stitch-lbr", &c2c.stitch_lbr,
30393041
"Enable LBR callgraph stitching approach"),
3042+
OPT_BOOLEAN(0, "double-cl", &chk_double_cl, "Detect adjacent cacheline false sharing"),
30403043
OPT_PARENT(c2c_options),
30413044
OPT_END()
30423045
};

tools/perf/util/cacheline.h

Lines changed: 20 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -6,16 +6,31 @@
66

77
int __pure cacheline_size(void);
88

9-
static inline u64 cl_address(u64 address)
9+
10+
/*
11+
* Some architectures have 'Adjacent Cacheline Prefetch' feature,
12+
* which performs like the cacheline size being doubled.
13+
*/
14+
static inline u64 cl_address(u64 address, bool double_cl)
1015
{
16+
u64 size = cacheline_size();
17+
18+
if (double_cl)
19+
size *= 2;
20+
1121
/* return the cacheline of the address */
12-
return (address & ~(cacheline_size() - 1));
22+
return (address & ~(size - 1));
1323
}
1424

15-
static inline u64 cl_offset(u64 address)
25+
static inline u64 cl_offset(u64 address, bool double_cl)
1626
{
17-
/* return the cacheline of the address */
18-
return (address & (cacheline_size() - 1));
27+
u64 size = cacheline_size();
28+
29+
if (double_cl)
30+
size *= 2;
31+
32+
/* return the offset inside cacheline */
33+
return (address & (size - 1));
1934
}
2035

2136
#endif // PERF_CACHELINE_H

tools/perf/util/sort.c

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,13 @@ enum sort_mode sort__mode = SORT_MODE__NORMAL;
5353
static const char *const dynamic_headers[] = {"local_ins_lat", "ins_lat", "local_p_stage_cyc", "p_stage_cyc"};
5454
static const char *const arch_specific_sort_keys[] = {"local_p_stage_cyc", "p_stage_cyc"};
5555

56+
/*
57+
* Some architectures have Adjacent Cacheline Prefetch feature, which
58+
* behaves like the cacheline size is doubled. Enable this flag to
59+
* check things in double cacheline granularity.
60+
*/
61+
bool chk_double_cl;
62+
5663
/*
5764
* Replaces all occurrences of a char used with the:
5865
*
@@ -1500,8 +1507,8 @@ sort__dcacheline_cmp(struct hist_entry *left, struct hist_entry *right)
15001507

15011508
addr:
15021509
/* al_addr does all the right addr - start + offset calculations */
1503-
l = cl_address(left->mem_info->daddr.al_addr);
1504-
r = cl_address(right->mem_info->daddr.al_addr);
1510+
l = cl_address(left->mem_info->daddr.al_addr, chk_double_cl);
1511+
r = cl_address(right->mem_info->daddr.al_addr, chk_double_cl);
15051512

15061513
if (l > r) return -1;
15071514
if (l < r) return 1;
@@ -1520,7 +1527,7 @@ static int hist_entry__dcacheline_snprintf(struct hist_entry *he, char *bf,
15201527
if (he->mem_info) {
15211528
struct map *map = he->mem_info->daddr.ms.map;
15221529

1523-
addr = cl_address(he->mem_info->daddr.al_addr);
1530+
addr = cl_address(he->mem_info->daddr.al_addr, chk_double_cl);
15241531
ms = &he->mem_info->daddr.ms;
15251532

15261533
/* print [s] for shared data mmaps */

tools/perf/util/sort.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ extern struct sort_entry sort_sym_from;
3535
extern struct sort_entry sort_sym_to;
3636
extern struct sort_entry sort_srcline;
3737
extern const char default_mem_sort_order[];
38+
extern bool chk_double_cl;
3839

3940
struct res_sample {
4041
u64 time;

0 commit comments

Comments
 (0)