
Commit d166453

48ca authored and sean-jc committed
KVM: selftests: access_tracking_perf_test: Use MGLRU for access tracking
Use MGLRU's debugfs interface to do access tracking instead of page_idle. The logic to use the page_idle bitmap is left in, as it is useful for kernels that do not have MGLRU built in.

When MGLRU is enabled, page_idle will report pages as still idle even after being accessed, as MGLRU doesn't necessarily clear the Idle folio flag when accessing an idle page, so the test will not attempt to use page_idle if MGLRU is enabled but otherwise not usable.

Aging pages with MGLRU is much faster than marking pages as idle with page_idle.

Co-developed-by: Axel Rasmussen <[email protected]>
Signed-off-by: Axel Rasmussen <[email protected]>
Signed-off-by: James Houghton <[email protected]>
Link: https://lore.kernel.org/r/[email protected]
[sean: print parsed features, not raw string]
Signed-off-by: Sean Christopherson <[email protected]>
1 parent b11fcb5 commit d166453
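For context on the aging mechanism this commit adopts: per Documentation/admin-guide/mm/multigen_lru.rst, writing "+ memcg_id node_id max_gen [can_swap [force_scan]]" to /sys/kernel/debug/lru_gen creates a new MGLRU generation, i.e. performs one aging pass over the memcg. A minimal sketch of what a helper like the new lru_gen_do_aging() presumably boils down to (the function name aging_pass() and its error handling are illustrative, not part of this commit; memcg_id and max_gen must first be parsed from a read of the same file):

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

/* Illustrative only: trigger one MGLRU aging pass via debugfs. */
static int aging_pass(int memcg_id, int node_id, int max_gen)
{
	char cmd[64];
	int fd = open("/sys/kernel/debug/lru_gen", O_WRONLY);

	if (fd < 0)
		return -1;	/* CONFIG_LRU_GEN off or debugfs not mounted */

	/* "+ memcg_id node_id max_gen" asks the kernel to age this memcg. */
	snprintf(cmd, sizeof(cmd), "+ %d %d %d", memcg_id, node_id, max_gen);
	if (write(fd, cmd, strlen(cmd)) < 0) {
		close(fd);
		return -1;
	}
	return close(fd);
}

One such write ages the whole memcg in a single page table scan, which is why the commit message calls it much faster than marking pages idle one PFN at a time.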

File tree: 4 files changed, +638 −26 lines changed


tools/testing/selftests/kvm/Makefile.kvm

Lines changed: 1 addition & 0 deletions
@@ -8,6 +8,7 @@ LIBKVM += lib/elf.c
 LIBKVM += lib/guest_modes.c
 LIBKVM += lib/io.c
 LIBKVM += lib/kvm_util.c
+LIBKVM += lib/lru_gen_util.c
 LIBKVM += lib/memstress.c
 LIBKVM += lib/guest_sprintf.c
 LIBKVM += lib/rbtree.c

tools/testing/selftests/kvm/access_tracking_perf_test.c

Lines changed: 199 additions & 26 deletions
@@ -7,20 +7,23 @@
  * This test measures the performance effects of KVM's access tracking.
  * Access tracking is driven by the MMU notifiers test_young, clear_young, and
  * clear_flush_young. These notifiers do not have a direct userspace API,
- * however the clear_young notifier can be triggered by marking a pages as idle
- * in /sys/kernel/mm/page_idle/bitmap. This test leverages that mechanism to
- * enable access tracking on guest memory.
+ * however the clear_young notifier can be triggered either by
+ * 1. marking a pages as idle in /sys/kernel/mm/page_idle/bitmap OR
+ * 2. adding a new MGLRU generation using the lru_gen debugfs file.
+ * This test leverages page_idle to enable access tracking on guest memory
+ * unless MGLRU is enabled, in which case MGLRU is used.
  *
  * To measure performance this test runs a VM with a configurable number of
  * vCPUs that each touch every page in disjoint regions of memory. Performance
  * is measured in the time it takes all vCPUs to finish touching their
  * predefined region.
  *
  * Note that a deterministic correctness test of access tracking is not possible
- * by using page_idle as it exists today. This is for a few reasons:
+ * by using page_idle or MGLRU aging as it exists today. This is for a few
+ * reasons:
  *
- * 1. page_idle only issues clear_young notifiers, which lack a TLB flush. This
- *    means subsequent guest accesses are not guaranteed to see page table
+ * 1. page_idle and MGLRU only issue clear_young notifiers, which lack a TLB flush.
+ *    This means subsequent guest accesses are not guaranteed to see page table
  *    updates made by KVM until some time in the future.
  *
  * 2. page_idle only operates on LRU pages. Newly allocated pages are not
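As background for the page_idle path the test keeps as a fallback: /sys/kernel/mm/page_idle/bitmap exposes one bit per PFN and must be read and written in aligned 8-byte chunks (see Documentation/admin-guide/mm/idle_page_tracking.rst). A rough sketch of the mark-then-recheck cycle that the test's mark_page_idle() builds on (helper names here are illustrative, not the test's):

#include <stdint.h>
#include <unistd.h>

/* Illustrative only: mark one PFN idle; reading the same bit back
 * later tells you whether the page was accessed in the meantime.
 */
static int pfn_mark_idle(int page_idle_fd, uint64_t pfn)
{
	uint64_t bits = 1ULL << (pfn % 64);

	/* The bitmap is accessed in aligned 8-byte chunks, 64 PFNs each. */
	return pwrite(page_idle_fd, &bits, sizeof(bits),
		      (pfn / 64) * sizeof(bits)) == sizeof(bits) ? 0 : -1;
}

static int pfn_still_idle(int page_idle_fd, uint64_t pfn)
{
	uint64_t bits;

	if (pread(page_idle_fd, &bits, sizeof(bits),
		  (pfn / 64) * sizeof(bits)) != sizeof(bits))
		return -1;
	return !!(bits & (1ULL << (pfn % 64)));
}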
@@ -48,9 +51,17 @@
 #include "guest_modes.h"
 #include "processor.h"
 
+#include "cgroup_util.h"
+#include "lru_gen_util.h"
+
+static const char *TEST_MEMCG_NAME = "access_tracking_perf_test";
+
 /* Global variable used to synchronize all of the vCPU threads. */
 static int iteration;
 
+/* The cgroup memory controller root. Needed for lru_gen-based aging. */
+char cgroup_root[PATH_MAX];
+
 /* Defines what vCPU threads should do during a given iteration. */
 static enum {
 	/* Run the vCPU to access all its memory. */
@@ -75,6 +86,15 @@ static bool overlap_memory_access;
  */
 static int idle_pages_warn_only = -1;
 
+/* Whether or not to use MGLRU instead of page_idle for access tracking */
+static bool use_lru_gen;
+
+/* Total number of pages to expect in the memcg after touching everything */
+static long test_pages;
+
+/* Last generation we found the pages in */
+static int lru_gen_last_gen = -1;
+
 struct test_params {
 	/* The backing source for the region of memory. */
 	enum vm_mem_backing_src_type backing_src;
@@ -133,8 +153,24 @@ static void mark_page_idle(int page_idle_fd, uint64_t pfn)
 		    "Set page_idle bits for PFN 0x%" PRIx64, pfn);
 }
 
-static void mark_vcpu_memory_idle(struct kvm_vm *vm,
-				  struct memstress_vcpu_args *vcpu_args)
+static void too_many_idle_pages(long idle_pages, long total_pages, int vcpu_idx)
+{
+	char prefix[18] = {};
+
+	if (vcpu_idx >= 0)
+		snprintf(prefix, 18, "vCPU%d: ", vcpu_idx);
+
+	TEST_ASSERT(idle_pages_warn_only,
+		    "%sToo many pages still idle (%lu out of %lu)",
+		    prefix, idle_pages, total_pages);
+
+	printf("WARNING: %sToo many pages still idle (%lu out of %lu), "
+	       "this will affect performance results.\n",
+	       prefix, idle_pages, total_pages);
+}
+
+static void pageidle_mark_vcpu_memory_idle(struct kvm_vm *vm,
+					   struct memstress_vcpu_args *vcpu_args)
 {
 	int vcpu_idx = vcpu_args->vcpu_idx;
 	uint64_t base_gva = vcpu_args->gva;
@@ -188,20 +224,78 @@ static void mark_vcpu_memory_idle(struct kvm_vm *vm,
 	 * access tracking but low enough as to not make the test too brittle
 	 * over time and across architectures.
 	 */
-	if (still_idle >= pages / 10) {
-		TEST_ASSERT(idle_pages_warn_only,
-			    "vCPU%d: Too many pages still idle (%lu out of %lu)",
-			    vcpu_idx, still_idle, pages);
-
-		printf("WARNING: vCPU%d: Too many pages still idle (%lu out of %lu), "
-		       "this will affect performance results.\n",
-		       vcpu_idx, still_idle, pages);
-	}
+	if (still_idle >= pages / 10)
+		too_many_idle_pages(still_idle, pages,
+				    overlap_memory_access ? -1 : vcpu_idx);
 
 	close(page_idle_fd);
 	close(pagemap_fd);
 }
 
+int find_generation(struct memcg_stats *stats, long total_pages)
+{
+	/*
+	 * For finding the generation that contains our pages, use the same
+	 * 90% threshold that page_idle uses.
+	 */
+	int gen = lru_gen_find_generation(stats, total_pages * 9 / 10);
+
+	if (gen >= 0)
+		return gen;
+
+	if (!idle_pages_warn_only) {
+		TEST_FAIL("Could not find a generation with 90%% of guest memory (%ld pages).",
+			  total_pages * 9 / 10);
+		return gen;
+	}
+
+	/*
+	 * We couldn't find a generation with 90% of guest memory, which can
+	 * happen if access tracking is unreliable. Simply look for a majority
+	 * of pages.
+	 */
+	puts("WARNING: Couldn't find a generation with 90% of guest memory. "
+	     "Performance results may not be accurate.");
+	gen = lru_gen_find_generation(stats, total_pages / 2);
+	TEST_ASSERT(gen >= 0,
+		    "Could not find a generation with 50%% of guest memory (%ld pages).",
+		    total_pages / 2);
+	return gen;
+}
+
+static void lru_gen_mark_memory_idle(struct kvm_vm *vm)
+{
+	struct timespec ts_start;
+	struct timespec ts_elapsed;
+	struct memcg_stats stats;
+	int new_gen;
+
+	/* Make a new generation */
+	clock_gettime(CLOCK_MONOTONIC, &ts_start);
+	lru_gen_do_aging(&stats, TEST_MEMCG_NAME);
+	ts_elapsed = timespec_elapsed(ts_start);
+
+	/* Check the generation again */
+	new_gen = find_generation(&stats, test_pages);
+
+	/*
+	 * This function should only be invoked with newly-accessed pages,
+	 * so pages should always move to a newer generation.
+	 */
+	if (new_gen <= lru_gen_last_gen) {
+		/* We did not move to a newer generation. */
+		long idle_pages = lru_gen_sum_memcg_stats_for_gen(lru_gen_last_gen,
+								  &stats);
+
+		too_many_idle_pages(min_t(long, idle_pages, test_pages),
+				    test_pages, -1);
+	}
+	pr_info("%-30s: %ld.%09lds\n",
+		"Mark memory idle (lru_gen)", ts_elapsed.tv_sec,
+		ts_elapsed.tv_nsec);
+	lru_gen_last_gen = new_gen;
+}
+
 static void assert_ucall(struct kvm_vcpu *vcpu, uint64_t expected_ucall)
 {
 	struct ucall uc;
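For reference, the stats consumed by find_generation() and lru_gen_do_aging() above come from reading /sys/kernel/debug/lru_gen, whose documented per-memcg layout looks roughly like this (the values below are illustrative, not from this commit):

memcg  3  /access_tracking_perf_test
  node  0
    0  120000  0  131072
    1   45000  0       0
    2    1000  0       0

Each row under a node is one generation: generation number, age in milliseconds, and anon/file page counts. The parsing itself lives in the new lib/lru_gen_util.c added to the Makefile above; lru_gen_find_generation() presumably scans these per-generation counts for a generation holding at least the requested number of pages.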
@@ -241,7 +335,7 @@ static void vcpu_thread_main(struct memstress_vcpu_args *vcpu_args)
 			assert_ucall(vcpu, UCALL_SYNC);
 			break;
 		case ITERATION_MARK_IDLE:
-			mark_vcpu_memory_idle(vm, vcpu_args);
+			pageidle_mark_vcpu_memory_idle(vm, vcpu_args);
 			break;
 		}
 
@@ -293,15 +387,18 @@ static void access_memory(struct kvm_vm *vm, int nr_vcpus,
 
 static void mark_memory_idle(struct kvm_vm *vm, int nr_vcpus)
 {
+	if (use_lru_gen)
+		return lru_gen_mark_memory_idle(vm);
+
 	/*
 	 * Even though this parallelizes the work across vCPUs, this is still a
 	 * very slow operation because page_idle forces the test to mark one pfn
-	 * at a time and the clear_young notifier serializes on the KVM MMU
+	 * at a time and the clear_young notifier may serialize on the KVM MMU
 	 * lock.
 	 */
 	pr_debug("Marking VM memory idle (slow)...\n");
 	iteration_work = ITERATION_MARK_IDLE;
-	run_iteration(vm, nr_vcpus, "Mark memory idle");
+	run_iteration(vm, nr_vcpus, "Mark memory idle (page_idle)");
 }
 
 static void run_test(enum vm_guest_mode mode, void *arg)
@@ -313,11 +410,38 @@ static void run_test(enum vm_guest_mode mode, void *arg)
 	vm = memstress_create_vm(mode, nr_vcpus, params->vcpu_memory_bytes, 1,
 				 params->backing_src, !overlap_memory_access);
 
+	/*
+	 * If guest_page_size is larger than the host's page size, the
+	 * guest (memstress) will only fault in a subset of the host's pages.
+	 */
+	test_pages = params->nr_vcpus * params->vcpu_memory_bytes /
+		     max(memstress_args.guest_page_size,
+			 (uint64_t)getpagesize());
+
 	memstress_start_vcpu_threads(nr_vcpus, vcpu_thread_main);
 
 	pr_info("\n");
 	access_memory(vm, nr_vcpus, ACCESS_WRITE, "Populating memory");
 
+	if (use_lru_gen) {
+		struct memcg_stats stats;
+
+		/*
+		 * Do a page table scan now. Following initial population, aging
+		 * may not cause the pages to move to a newer generation. Do
+		 * an aging pass now so that future aging passes always move
+		 * pages to a newer generation.
+		 */
+		printf("Initial aging pass (lru_gen)\n");
+		lru_gen_do_aging(&stats, TEST_MEMCG_NAME);
+		TEST_ASSERT(lru_gen_sum_memcg_stats(&stats) >= test_pages,
+			    "Not all pages accounted for (looking for %ld). "
+			    "Was the memcg set up correctly?", test_pages);
+		access_memory(vm, nr_vcpus, ACCESS_WRITE, "Re-populating memory");
+		lru_gen_read_memcg_stats(&stats, TEST_MEMCG_NAME);
+		lru_gen_last_gen = find_generation(&stats, test_pages);
+	}
+
 	/* As a control, read and write to the populated memory first. */
 	access_memory(vm, nr_vcpus, ACCESS_WRITE, "Writing to populated memory");
 	access_memory(vm, nr_vcpus, ACCESS_READ, "Reading from populated memory");
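To make the test_pages computation above concrete with illustrative numbers: 4 vCPUs at 1 GiB each with 2 MiB guest pages on a 4 KiB host gives test_pages = 4 GiB / max(2 MiB, 4 KiB) = 2048, since memstress touches each guest page once and therefore faults in only one host page per guest page; with 4 KiB guest pages the divisor is the host page size instead, and every host page is counted.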
@@ -354,7 +478,12 @@ static int access_tracking_unreliable(void)
 		puts("Skipping idle page count sanity check, because NUMA balancing is enabled");
 		return 1;
 	}
+	return 0;
+}
 
+static int run_test_for_each_guest_mode(const char *cgroup, void *arg)
+{
+	for_each_guest_mode(run_test, arg);
 	return 0;
 }
 
@@ -383,13 +512,19 @@ static void help(char *name)
 	exit(0);
 }
 
+void destroy_cgroup(char *cg)
+{
+	printf("Destroying cgroup: %s\n", cg);
+}
+
 int main(int argc, char *argv[])
 {
 	struct test_params params = {
 		.backing_src = DEFAULT_VM_MEM_SRC,
 		.vcpu_memory_bytes = DEFAULT_PER_VCPU_MEM_SIZE,
 		.nr_vcpus = 1,
 	};
+	char *new_cg = NULL;
 	int page_idle_fd;
 	int opt;
 
@@ -424,15 +559,53 @@ int main(int argc, char *argv[])
 		}
 	}
 
-	page_idle_fd = open("/sys/kernel/mm/page_idle/bitmap", O_RDWR);
-	__TEST_REQUIRE(page_idle_fd >= 0,
-		       "CONFIG_IDLE_PAGE_TRACKING is not enabled");
-	close(page_idle_fd);
-
 	if (idle_pages_warn_only == -1)
 		idle_pages_warn_only = access_tracking_unreliable();
 
-	for_each_guest_mode(run_test, &params);
+	if (lru_gen_usable()) {
+		bool cg_created = true;
+		int ret;
+
+		puts("Using lru_gen for aging");
+		use_lru_gen = true;
+
+		if (cg_find_controller_root(cgroup_root, sizeof(cgroup_root), "memory"))
+			ksft_exit_skip("Cannot find memory cgroup controller\n");
+
+		new_cg = cg_name(cgroup_root, TEST_MEMCG_NAME);
+		printf("Creating cgroup: %s\n", new_cg);
+		if (cg_create(new_cg)) {
+			if (errno == EEXIST) {
+				printf("Found existing cgroup");
+				cg_created = false;
+			} else {
+				ksft_exit_skip("could not create new cgroup: %s\n", new_cg);
+			}
+		}
+
+		/*
+		 * This will fork off a new process to run the test within
+		 * a new memcg, so we need to properly propagate the return
+		 * value up.
+		 */
+		ret = cg_run(new_cg, &run_test_for_each_guest_mode, &params);
+		if (cg_created)
+			cg_destroy(new_cg);
+		if (ret < 0)
+			TEST_FAIL("child did not spawn or was abnormally killed");
+		if (ret)
+			return ret;
+	} else {
+		page_idle_fd = open("/sys/kernel/mm/page_idle/bitmap", O_RDWR);
+		__TEST_REQUIRE(page_idle_fd >= 0,
+			       "Couldn't open /sys/kernel/mm/page_idle/bitmap. "
+			       "Is CONFIG_IDLE_PAGE_TRACKING enabled?");
+
+		close(page_idle_fd);
+
+		puts("Using page_idle for aging");
+		run_test_for_each_guest_mode(NULL, &params);
+	}
 
 	return 0;
 }