 * This test measures the performance effects of KVM's access tracking.
 *
 * Access tracking is driven by the MMU notifiers test_young, clear_young, and
 * clear_flush_young. These notifiers do not have a direct userspace API,
 * however the clear_young notifier can be triggered either by
 *   1. marking a page as idle in /sys/kernel/mm/page_idle/bitmap OR
 *   2. adding a new MGLRU generation using the lru_gen debugfs file.
 * This test leverages page_idle to enable access tracking on guest memory
 * unless MGLRU is enabled, in which case MGLRU is used.
 *
 * To measure performance this test runs a VM with a configurable number of
 * vCPUs that each touch every page in disjoint regions of memory. Performance
 * is measured in the time it takes all vCPUs to finish touching their
 * predefined region.
 *
 * Note that a deterministic correctness test of access tracking is not possible
 * by using page_idle or MGLRU aging as it exists today. This is for a few
 * reasons:
 *
 * 1. page_idle and MGLRU only issue clear_young notifiers, which lack a TLB
 *    flush. This means subsequent guest accesses are not guaranteed to see
 *    page table updates made by KVM until some time in the future.
 *
 * 2. page_idle only operates on LRU pages. Newly allocated pages are not
4851#include "guest_modes.h"
4952#include "processor.h"
5053
54+ #include "cgroup_util.h"
55+ #include "lru_gen_util.h"
56+
57+ static const char * TEST_MEMCG_NAME = "access_tracking_perf_test" ;
58+
5159/* Global variable used to synchronize all of the vCPU threads. */
5260static int iteration ;
5361
62+ /* The cgroup memory controller root. Needed for lru_gen-based aging. */
63+ char cgroup_root [PATH_MAX ];
64+
5465/* Defines what vCPU threads should do during a given iteration. */
5566static enum {
5667 /* Run the vCPU to access all its memory. */
@@ -75,6 +86,15 @@ static bool overlap_memory_access;
7586 */
7687static int idle_pages_warn_only = -1 ;
7788
89+ /* Whether or not to use MGLRU instead of page_idle for access tracking */
90+ static bool use_lru_gen ;
91+
92+ /* Total number of pages to expect in the memcg after touching everything */
93+ static long test_pages ;
94+
95+ /* Last generation we found the pages in */
96+ static int lru_gen_last_gen = -1 ;
97+
7898struct test_params {
7999 /* The backing source for the region of memory. */
80100 enum vm_mem_backing_src_type backing_src ;
@@ -133,8 +153,24 @@ static void mark_page_idle(int page_idle_fd, uint64_t pfn)
133153 "Set page_idle bits for PFN 0x%" PRIx64 , pfn );
134154}
135155
136- static void mark_vcpu_memory_idle (struct kvm_vm * vm ,
137- struct memstress_vcpu_args * vcpu_args )
156+ static void too_many_idle_pages (long idle_pages , long total_pages , int vcpu_idx )
157+ {
158+ char prefix [18 ] = {};
159+
160+ if (vcpu_idx >= 0 )
161+ snprintf (prefix , 18 , "vCPU%d: " , vcpu_idx );
162+
163+ TEST_ASSERT (idle_pages_warn_only ,
164+ "%sToo many pages still idle (%lu out of %lu)" ,
165+ prefix , idle_pages , total_pages );
166+
167+ printf ("WARNING: %sToo many pages still idle (%lu out of %lu), "
168+ "this will affect performance results.\n" ,
169+ prefix , idle_pages , total_pages );
170+ }
171+
172+ static void pageidle_mark_vcpu_memory_idle (struct kvm_vm * vm ,
173+ struct memstress_vcpu_args * vcpu_args )
138174{
139175 int vcpu_idx = vcpu_args -> vcpu_idx ;
140176 uint64_t base_gva = vcpu_args -> gva ;
@@ -188,20 +224,78 @@ static void mark_vcpu_memory_idle(struct kvm_vm *vm,
188224 * access tracking but low enough as to not make the test too brittle
189225 * over time and across architectures.
190226 */
191- if (still_idle >= pages / 10 ) {
192- TEST_ASSERT (idle_pages_warn_only ,
193- "vCPU%d: Too many pages still idle (%lu out of %lu)" ,
194- vcpu_idx , still_idle , pages );
195-
196- printf ("WARNING: vCPU%d: Too many pages still idle (%lu out of %lu), "
197- "this will affect performance results.\n" ,
198- vcpu_idx , still_idle , pages );
199- }
227+ if (still_idle >= pages / 10 )
228+ too_many_idle_pages (still_idle , pages ,
229+ overlap_memory_access ? -1 : vcpu_idx );
200230
201231 close (page_idle_fd );
202232 close (pagemap_fd );
203233}
204234
235+ int find_generation (struct memcg_stats * stats , long total_pages )
236+ {
237+ /*
238+ * For finding the generation that contains our pages, use the same
239+ * 90% threshold that page_idle uses.
240+ */
241+ int gen = lru_gen_find_generation (stats , total_pages * 9 / 10 );
242+
243+ if (gen >= 0 )
244+ return gen ;
245+
246+ if (!idle_pages_warn_only ) {
247+ TEST_FAIL ("Could not find a generation with 90%% of guest memory (%ld pages)." ,
248+ total_pages * 9 / 10 );
249+ return gen ;
250+ }
251+
252+ /*
253+ * We couldn't find a generation with 90% of guest memory, which can
254+ * happen if access tracking is unreliable. Simply look for a majority
255+ * of pages.
256+ */
257+ puts ("WARNING: Couldn't find a generation with 90% of guest memory. "
258+ "Performance results may not be accurate." );
259+ gen = lru_gen_find_generation (stats , total_pages / 2 );
260+ TEST_ASSERT (gen >= 0 ,
261+ "Could not find a generation with 50%% of guest memory (%ld pages)." ,
262+ total_pages / 2 );
263+ return gen ;
264+ }
265+
266+ static void lru_gen_mark_memory_idle (struct kvm_vm * vm )
267+ {
268+ struct timespec ts_start ;
269+ struct timespec ts_elapsed ;
270+ struct memcg_stats stats ;
271+ int new_gen ;
272+
273+ /* Make a new generation */
274+ clock_gettime (CLOCK_MONOTONIC , & ts_start );
275+ lru_gen_do_aging (& stats , TEST_MEMCG_NAME );
276+ ts_elapsed = timespec_elapsed (ts_start );
277+
278+ /* Check the generation again */
279+ new_gen = find_generation (& stats , test_pages );
280+
281+ /*
282+ * This function should only be invoked with newly-accessed pages,
283+ * so pages should always move to a newer generation.
284+ */
285+ if (new_gen <= lru_gen_last_gen ) {
286+ /* We did not move to a newer generation. */
287+ long idle_pages = lru_gen_sum_memcg_stats_for_gen (lru_gen_last_gen ,
288+ & stats );
289+
290+ too_many_idle_pages (min_t (long , idle_pages , test_pages ),
291+ test_pages , -1 );
292+ }
293+ pr_info ("%-30s: %ld.%09lds\n" ,
294+ "Mark memory idle (lru_gen)" , ts_elapsed .tv_sec ,
295+ ts_elapsed .tv_nsec );
296+ lru_gen_last_gen = new_gen ;
297+ }
298+
205299static void assert_ucall (struct kvm_vcpu * vcpu , uint64_t expected_ucall )
206300{
207301 struct ucall uc ;
@@ -241,7 +335,7 @@ static void vcpu_thread_main(struct memstress_vcpu_args *vcpu_args)
241335 assert_ucall (vcpu , UCALL_SYNC );
242336 break ;
243337 case ITERATION_MARK_IDLE :
244- mark_vcpu_memory_idle (vm , vcpu_args );
338+ pageidle_mark_vcpu_memory_idle (vm , vcpu_args );
245339 break ;
246340 }
247341
@@ -293,15 +387,18 @@ static void access_memory(struct kvm_vm *vm, int nr_vcpus,
293387
294388static void mark_memory_idle (struct kvm_vm * vm , int nr_vcpus )
295389{
390+ if (use_lru_gen )
391+ return lru_gen_mark_memory_idle (vm );
392+
296393 /*
297394 * Even though this parallelizes the work across vCPUs, this is still a
298395 * very slow operation because page_idle forces the test to mark one pfn
299- * at a time and the clear_young notifier serializes on the KVM MMU
396+ * at a time and the clear_young notifier may serialize on the KVM MMU
300397 * lock.
301398 */
302399 pr_debug ("Marking VM memory idle (slow)...\n" );
303400 iteration_work = ITERATION_MARK_IDLE ;
304- run_iteration (vm , nr_vcpus , "Mark memory idle" );
401+ run_iteration (vm , nr_vcpus , "Mark memory idle (page_idle) " );
305402}
306403
307404static void run_test (enum vm_guest_mode mode , void * arg )
@@ -313,11 +410,38 @@ static void run_test(enum vm_guest_mode mode, void *arg)
313410 vm = memstress_create_vm (mode , nr_vcpus , params -> vcpu_memory_bytes , 1 ,
314411 params -> backing_src , !overlap_memory_access );
315412
413+ /*
414+ * If guest_page_size is larger than the host's page size, the
415+ * guest (memstress) will only fault in a subset of the host's pages.
416+ */
417+ test_pages = params -> nr_vcpus * params -> vcpu_memory_bytes /
418+ max (memstress_args .guest_page_size ,
419+ (uint64_t )getpagesize ());
420+
316421 memstress_start_vcpu_threads (nr_vcpus , vcpu_thread_main );
317422
318423 pr_info ("\n" );
319424 access_memory (vm , nr_vcpus , ACCESS_WRITE , "Populating memory" );
320425
426+ if (use_lru_gen ) {
427+ struct memcg_stats stats ;
428+
429+ /*
430+ * Do a page table scan now. Following initial population, aging
431+ * may not cause the pages to move to a newer generation. Do
432+ * an aging pass now so that future aging passes always move
433+ * pages to a newer generation.
434+ */
435+ printf ("Initial aging pass (lru_gen)\n" );
436+ lru_gen_do_aging (& stats , TEST_MEMCG_NAME );
437+ TEST_ASSERT (lru_gen_sum_memcg_stats (& stats ) >= test_pages ,
438+ "Not all pages accounted for (looking for %ld). "
439+ "Was the memcg set up correctly?" , test_pages );
440+ access_memory (vm , nr_vcpus , ACCESS_WRITE , "Re-populating memory" );
441+ lru_gen_read_memcg_stats (& stats , TEST_MEMCG_NAME );
442+ lru_gen_last_gen = find_generation (& stats , test_pages );
443+ }
444+
321445 /* As a control, read and write to the populated memory first. */
322446 access_memory (vm , nr_vcpus , ACCESS_WRITE , "Writing to populated memory" );
323447 access_memory (vm , nr_vcpus , ACCESS_READ , "Reading from populated memory" );
@@ -354,7 +478,12 @@ static int access_tracking_unreliable(void)
354478 puts ("Skipping idle page count sanity check, because NUMA balancing is enabled" );
355479 return 1 ;
356480 }
481+ return 0 ;
482+ }
357483
484+ static int run_test_for_each_guest_mode (const char * cgroup , void * arg )
485+ {
486+ for_each_guest_mode (run_test , arg );
358487 return 0 ;
359488}
360489
@@ -383,13 +512,19 @@ static void help(char *name)
383512 exit (0 );
384513}
385514
/*
 * Log cgroup teardown.
 *
 * NOTE(review): this helper only prints the path — the visible teardown in
 * main() uses cg_destroy() directly. Confirm this function is intentionally
 * log-only (or dead) before relying on it to remove anything.
 */
void destroy_cgroup(char *cg)
{
	printf("Destroying cgroup: %s\n", cg);
}
519+
386520int main (int argc , char * argv [])
387521{
388522 struct test_params params = {
389523 .backing_src = DEFAULT_VM_MEM_SRC ,
390524 .vcpu_memory_bytes = DEFAULT_PER_VCPU_MEM_SIZE ,
391525 .nr_vcpus = 1 ,
392526 };
527+ char * new_cg = NULL ;
393528 int page_idle_fd ;
394529 int opt ;
395530
@@ -424,15 +559,53 @@ int main(int argc, char *argv[])
424559 }
425560 }
426561
427- page_idle_fd = open ("/sys/kernel/mm/page_idle/bitmap" , O_RDWR );
428- __TEST_REQUIRE (page_idle_fd >= 0 ,
429- "CONFIG_IDLE_PAGE_TRACKING is not enabled" );
430- close (page_idle_fd );
431-
432562 if (idle_pages_warn_only == -1 )
433563 idle_pages_warn_only = access_tracking_unreliable ();
434564
435- for_each_guest_mode (run_test , & params );
565+ if (lru_gen_usable ()) {
566+ bool cg_created = true;
567+ int ret ;
568+
569+ puts ("Using lru_gen for aging" );
570+ use_lru_gen = true;
571+
572+ if (cg_find_controller_root (cgroup_root , sizeof (cgroup_root ), "memory" ))
573+ ksft_exit_skip ("Cannot find memory cgroup controller\n" );
574+
575+ new_cg = cg_name (cgroup_root , TEST_MEMCG_NAME );
576+ printf ("Creating cgroup: %s\n" , new_cg );
577+ if (cg_create (new_cg )) {
578+ if (errno == EEXIST ) {
579+ printf ("Found existing cgroup" );
580+ cg_created = false;
581+ } else {
582+ ksft_exit_skip ("could not create new cgroup: %s\n" , new_cg );
583+ }
584+ }
585+
586+ /*
587+ * This will fork off a new process to run the test within
588+ * a new memcg, so we need to properly propagate the return
589+ * value up.
590+ */
591+ ret = cg_run (new_cg , & run_test_for_each_guest_mode , & params );
592+ if (cg_created )
593+ cg_destroy (new_cg );
594+ if (ret < 0 )
595+ TEST_FAIL ("child did not spawn or was abnormally killed" );
596+ if (ret )
597+ return ret ;
598+ } else {
599+ page_idle_fd = open ("/sys/kernel/mm/page_idle/bitmap" , O_RDWR );
600+ __TEST_REQUIRE (page_idle_fd >= 0 ,
601+ "Couldn't open /sys/kernel/mm/page_idle/bitmap. "
602+ "Is CONFIG_IDLE_PAGE_TRACKING enabled?" );
603+
604+ close (page_idle_fd );
605+
606+ puts ("Using page_idle for aging" );
607+ run_test_for_each_guest_mode (NULL , & params );
608+ }
436609
437610 return 0 ;
438611}
0 commit comments