Skip to content

Commit 5738891

Browse files
kimphillamd and Peter Zijlstra
authored and committed
perf/x86/amd: Add support for Large Increment per Cycle Events
Description of hardware operation --------------------------------- The core AMD PMU has a 4-bit wide per-cycle increment for each performance monitor counter. That works for most events, but now with AMD Family 17h and above processors, some events can occur more than 15 times in a cycle. Those events are called "Large Increment per Cycle" events. In order to count these events, two adjacent h/w PMCs get their count signals merged to form 8 bits per cycle total. In addition, the PERF_CTR count registers are merged to be able to count up to 64 bits. Normally, events like instructions retired, get programmed on a single counter like so: PERF_CTL0 (MSR 0xc0010200) 0x000000000053ff0c # event 0x0c, umask 0xff PERF_CTR0 (MSR 0xc0010201) 0x0000800000000001 # r/w 48-bit count The next counter at MSRs 0xc0010202-3 remains unused, or can be used independently to count something else. When counting Large Increment per Cycle events, such as FLOPs, however, we now have to reserve the next counter and program the PERF_CTL (config) register with the Merge event (0xFFF), like so: PERF_CTL0 (msr 0xc0010200) 0x000000000053ff03 # FLOPs event, umask 0xff PERF_CTR0 (msr 0xc0010201) 0x0000800000000001 # rd 64-bit cnt, wr lo 48b PERF_CTL1 (msr 0xc0010202) 0x0000000f004000ff # Merge event, enable bit PERF_CTR1 (msr 0xc0010203) 0x0000000000000000 # wr hi 16-bits count The count is widened from the normal 48-bits to 64 bits by having the second counter carry the higher 16 bits of the count in its lower 16 bits of its counter register. The odd counter, e.g., PERF_CTL1, is programmed with the enabled Merge event before the even counter, PERF_CTL0. The Large Increment feature is available starting with Family 17h. For more details, search any Family 17h PPR for the "Large Increment per Cycle Events" section, e.g., section 2.1.15.3 on p. 
173 in this version: https://www.amd.com/system/files/TechDocs/56176_ppr_Family_17h_Model_71h_B0_pub_Rev_3.06.zip Description of software operation --------------------------------- The following steps are taken in order to support reserving and enabling the extra counter for Large Increment per Cycle events: 1. In the main x86 scheduler, we reduce the number of available counters by the number of Large Increment per Cycle events being scheduled, tracked by a new cpuc variable 'n_pair' and a new amd_put_event_constraints_f17h(). This improves the counter scheduler success rate. 2. In perf_assign_events(), if a counter is assigned to a Large Increment event, we increment the current counter variable, so the counter used for the Merge event is removed from assignment consideration by upcoming event assignments. 3. In find_counter(), if a counter has been found for the Large Increment event, we set the next counter as used, to prevent other events from using it. 4. We perform steps 2 & 3 also in the x86 scheduler fastpath, i.e., we add Merge event accounting to the existing used_mask logic. 5. Finally, we add on the programming of Merge event to the neighbouring PMC counters in the counter enable/disable{_all} code paths. Currently, software does not support a single PMU with mixed 48- and 64-bit counting, so Large increment event counts are limited to 48 bits. In set_period, we zero-out the upper 16 bits of the count, so the hardware doesn't copy them to the even counter's higher bits. Simple invocation example showing counting 8 FLOPs per 256-bit/%ymm vaddps instruction executed in a loop 100 million times: perf stat -e cpu/fp_ret_sse_avx_ops.all/,cpu/instructions/ <workload> Performance counter stats for '<workload>': 800,000,000 cpu/fp_ret_sse_avx_ops.all/u 300,042,101 cpu/instructions/u Prior to this patch, the reported SSE/AVX FLOPs retired count would be wrong. 
[peterz: lots of renames and edits to the code] Signed-off-by: Kim Phillips <[email protected]> Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
1 parent 471af00 commit 5738891

File tree

3 files changed

+95
-15
lines changed

3 files changed

+95
-15
lines changed

arch/x86/events/amd/core.c

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,10 @@
1414
static DEFINE_PER_CPU(unsigned long, perf_nmi_tstamp);
1515
static unsigned long perf_nmi_window;
1616

17+
/* AMD Event 0xFFF: Merge. Used with Large Increment per Cycle events */
18+
#define AMD_MERGE_EVENT ((0xFULL << 32) | 0xFFULL)
19+
#define AMD_MERGE_EVENT_ENABLE (AMD_MERGE_EVENT | ARCH_PERFMON_EVENTSEL_ENABLE)
20+
1721
static __initconst const u64 amd_hw_cache_event_ids
1822
[PERF_COUNT_HW_CACHE_MAX]
1923
[PERF_COUNT_HW_CACHE_OP_MAX]
@@ -335,6 +339,9 @@ static int amd_core_hw_config(struct perf_event *event)
335339
else if (event->attr.exclude_guest)
336340
event->hw.config |= AMD64_EVENTSEL_HOSTONLY;
337341

342+
if ((x86_pmu.flags & PMU_FL_PAIR) && amd_is_pair_event_code(&event->hw))
343+
event->hw.flags |= PERF_X86_EVENT_PAIR;
344+
338345
return 0;
339346
}
340347

@@ -880,6 +887,15 @@ amd_get_event_constraints_f17h(struct cpu_hw_events *cpuc, int idx,
880887
return &unconstrained;
881888
}
882889

890+
static void amd_put_event_constraints_f17h(struct cpu_hw_events *cpuc,
891+
struct perf_event *event)
892+
{
893+
struct hw_perf_event *hwc = &event->hw;
894+
895+
if (is_counter_pair(hwc))
896+
--cpuc->n_pair;
897+
}
898+
883899
static ssize_t amd_event_sysfs_show(char *page, u64 config)
884900
{
885901
u64 event = (config & ARCH_PERFMON_EVENTSEL_EVENT) |
@@ -967,6 +983,8 @@ static int __init amd_core_pmu_init(void)
967983
PERF_X86_EVENT_PAIR);
968984

969985
x86_pmu.get_event_constraints = amd_get_event_constraints_f17h;
986+
x86_pmu.put_event_constraints = amd_put_event_constraints_f17h;
987+
x86_pmu.perf_ctr_pair_en = AMD_MERGE_EVENT_ENABLE;
970988
x86_pmu.flags |= PMU_FL_PAIR;
971989
}
972990

arch/x86/events/core.c

Lines changed: 59 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -618,6 +618,7 @@ void x86_pmu_disable_all(void)
618618
int idx;
619619

620620
for (idx = 0; idx < x86_pmu.num_counters; idx++) {
621+
struct hw_perf_event *hwc = &cpuc->events[idx]->hw;
621622
u64 val;
622623

623624
if (!test_bit(idx, cpuc->active_mask))
@@ -627,6 +628,8 @@ void x86_pmu_disable_all(void)
627628
continue;
628629
val &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
629630
wrmsrl(x86_pmu_config_addr(idx), val);
631+
if (is_counter_pair(hwc))
632+
wrmsrl(x86_pmu_config_addr(idx + 1), 0);
630633
}
631634
}
632635

@@ -699,7 +702,7 @@ struct sched_state {
699702
int counter; /* counter index */
700703
int unassigned; /* number of events to be assigned left */
701704
int nr_gp; /* number of GP counters used */
702-
unsigned long used[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
705+
u64 used;
703706
};
704707

705708
/* Total max is X86_PMC_IDX_MAX, but we are O(n!) limited */
@@ -756,8 +759,12 @@ static bool perf_sched_restore_state(struct perf_sched *sched)
756759
sched->saved_states--;
757760
sched->state = sched->saved[sched->saved_states];
758761

759-
/* continue with next counter: */
760-
clear_bit(sched->state.counter++, sched->state.used);
762+
/* this assignment didn't work out */
763+
/* XXX broken vs EVENT_PAIR */
764+
sched->state.used &= ~BIT_ULL(sched->state.counter);
765+
766+
/* try the next one */
767+
sched->state.counter++;
761768

762769
return true;
763770
}
@@ -782,20 +789,32 @@ static bool __perf_sched_find_counter(struct perf_sched *sched)
782789
if (c->idxmsk64 & (~0ULL << INTEL_PMC_IDX_FIXED)) {
783790
idx = INTEL_PMC_IDX_FIXED;
784791
for_each_set_bit_from(idx, c->idxmsk, X86_PMC_IDX_MAX) {
785-
if (!__test_and_set_bit(idx, sched->state.used))
786-
goto done;
792+
u64 mask = BIT_ULL(idx);
793+
794+
if (sched->state.used & mask)
795+
continue;
796+
797+
sched->state.used |= mask;
798+
goto done;
787799
}
788800
}
789801

790802
/* Grab the first unused counter starting with idx */
791803
idx = sched->state.counter;
792804
for_each_set_bit_from(idx, c->idxmsk, INTEL_PMC_IDX_FIXED) {
793-
if (!__test_and_set_bit(idx, sched->state.used)) {
794-
if (sched->state.nr_gp++ >= sched->max_gp)
795-
return false;
805+
u64 mask = BIT_ULL(idx);
796806

797-
goto done;
798-
}
807+
if (c->flags & PERF_X86_EVENT_PAIR)
808+
mask |= mask << 1;
809+
810+
if (sched->state.used & mask)
811+
continue;
812+
813+
if (sched->state.nr_gp++ >= sched->max_gp)
814+
return false;
815+
816+
sched->state.used |= mask;
817+
goto done;
799818
}
800819

801820
return false;
@@ -872,12 +891,10 @@ EXPORT_SYMBOL_GPL(perf_assign_events);
872891
int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
873892
{
874893
struct event_constraint *c;
875-
unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
876894
struct perf_event *e;
877895
int n0, i, wmin, wmax, unsched = 0;
878896
struct hw_perf_event *hwc;
879-
880-
bitmap_zero(used_mask, X86_PMC_IDX_MAX);
897+
u64 used_mask = 0;
881898

882899
/*
883900
* Compute the number of events already present; see x86_pmu_add(),
@@ -920,6 +937,8 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
920937
* fastpath, try to reuse previous register
921938
*/
922939
for (i = 0; i < n; i++) {
940+
u64 mask;
941+
923942
hwc = &cpuc->event_list[i]->hw;
924943
c = cpuc->event_constraint[i];
925944

@@ -931,11 +950,16 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
931950
if (!test_bit(hwc->idx, c->idxmsk))
932951
break;
933952

953+
mask = BIT_ULL(hwc->idx);
954+
if (is_counter_pair(hwc))
955+
mask |= mask << 1;
956+
934957
/* not already used */
935-
if (test_bit(hwc->idx, used_mask))
958+
if (used_mask & mask)
936959
break;
937960

938-
__set_bit(hwc->idx, used_mask);
961+
used_mask |= mask;
962+
939963
if (assign)
940964
assign[i] = hwc->idx;
941965
}
@@ -958,6 +982,15 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
958982
READ_ONCE(cpuc->excl_cntrs->exclusive_present))
959983
gpmax /= 2;
960984

985+
/*
986+
* Reduce the amount of available counters to allow fitting
987+
* the extra Merge events needed by large increment events.
988+
*/
989+
if (x86_pmu.flags & PMU_FL_PAIR) {
990+
gpmax = x86_pmu.num_counters - cpuc->n_pair;
991+
WARN_ON(gpmax <= 0);
992+
}
993+
961994
unsched = perf_assign_events(cpuc->event_constraint, n, wmin,
962995
wmax, gpmax, assign);
963996
}
@@ -1038,6 +1071,8 @@ static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader,
10381071
return -EINVAL;
10391072
cpuc->event_list[n] = leader;
10401073
n++;
1074+
if (is_counter_pair(&leader->hw))
1075+
cpuc->n_pair++;
10411076
}
10421077
if (!dogrp)
10431078
return n;
@@ -1052,6 +1087,8 @@ static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader,
10521087

10531088
cpuc->event_list[n] = event;
10541089
n++;
1090+
if (is_counter_pair(&event->hw))
1091+
cpuc->n_pair++;
10551092
}
10561093
return n;
10571094
}
@@ -1237,6 +1274,13 @@ int x86_perf_event_set_period(struct perf_event *event)
12371274

12381275
wrmsrl(hwc->event_base, (u64)(-left) & x86_pmu.cntval_mask);
12391276

1277+
/*
1278+
* Clear the Merge event counter's upper 16 bits since
1279+
* we currently declare a 48-bit counter width
1280+
*/
1281+
if (is_counter_pair(hwc))
1282+
wrmsrl(x86_pmu_event_addr(idx + 1), 0);
1283+
12401284
/*
12411285
* Due to erratum on certan cpu we need
12421286
* a second write to be sure the register

arch/x86/events/perf_event.h

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -273,6 +273,7 @@ struct cpu_hw_events {
273273
struct amd_nb *amd_nb;
274274
/* Inverted mask of bits to clear in the perf_ctr ctrl registers */
275275
u64 perf_ctr_virt_mask;
276+
int n_pair; /* Large increment events */
276277

277278
void *kfree_on_online[X86_PERF_KFREE_MAX];
278279
};
@@ -695,6 +696,7 @@ struct x86_pmu {
695696
* AMD bits
696697
*/
697698
unsigned int amd_nb_constraints : 1;
699+
u64 perf_ctr_pair_en;
698700

699701
/*
700702
* Extra registers for events
@@ -840,13 +842,26 @@ int x86_pmu_hw_config(struct perf_event *event);
840842

841843
void x86_pmu_disable_all(void);
842844

845+
static inline bool is_counter_pair(struct hw_perf_event *hwc)
846+
{
847+
return hwc->flags & PERF_X86_EVENT_PAIR;
848+
}
849+
843850
/*
 * Program and enable a counter: write the event config plus
 * @enable_mask to the counter's config MSR, masking out any bits a
 * hypervisor reserved via perf_ctr_virt_mask.
 *
 * For a Large Increment per Cycle event, the adjacent (odd) counter
 * is programmed with the enabled Merge event *before* this (even)
 * counter is enabled, so the merged 8-bit increment path is armed
 * when counting starts.
 */
static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc,
					  u64 enable_mask)
{
	u64 disable_mask = __this_cpu_read(cpu_hw_events.perf_ctr_virt_mask);

	/* Program any extra (offcore/config1-style) register first. */
	if (hwc->extra_reg.reg)
		wrmsrl(hwc->extra_reg.reg, hwc->extra_reg.config);

	/*
	 * Add enabled Merge event on next counter
	 * if large increment event being enabled on this counter
	 */
	if (is_counter_pair(hwc))
		wrmsrl(x86_pmu_config_addr(hwc->idx + 1), x86_pmu.perf_ctr_pair_en);

	wrmsrl(hwc->config_base, (hwc->config | enable_mask) & ~disable_mask);
}
852867

static inline void x86_pmu_disable_event(struct perf_event *event)
{
	struct hw_perf_event *hwc = &event->hw;

	/*
	 * Write back the raw config, i.e. without the enable_mask bits
	 * that __x86_pmu_enable_event() ORed in — this stops the counter.
	 */
	wrmsrl(hwc->config_base, hwc->config);

	/*
	 * For a Large Increment per Cycle event, also clear the paired
	 * Merge event on the adjacent counter so it stops as well.
	 */
	if (is_counter_pair(hwc))
		wrmsrl(x86_pmu_config_addr(hwc->idx + 1), 0);
}

868886
void x86_pmu_enable_event(struct perf_event *event);

0 commit comments

Comments
 (0)