Skip to content

Commit 255f598

Browse files
davidhildenbrand authored and mstsirkin committed
virtio-mem: Paravirtualized memory hotunplug part 2
We also want to unplug online memory (contained in online memory blocks and, therefore, managed by the buddy), and eventually replug it later. When requested to unplug memory, we use alloc_contig_range() to allocate subblocks in online memory blocks (so we are the owner) and send them to our hypervisor. When requested to plug memory, we can replug such memory using free_contig_range() after asking our hypervisor. We also want to mark all allocated pages PG_offline, so nobody will touch them. To differentiate pages that were never onlined when onlining the memory block from pages allocated via alloc_contig_range(), we use PageDirty(). Based on this flag, virtio_mem_fake_online() can either online the pages for the first time or use free_contig_range(). It is worth noting that there are no guarantees on how much memory can actually get unplugged again. All device memory might be completely fragmented with unmovable data, such that no subblock can get unplugged. We are not touching the ZONE_MOVABLE. If memory is onlined to the ZONE_MOVABLE, it can only get unplugged after that memory was offlined manually by user space. In normal operation, virtio-mem memory is suggested to be onlined to ZONE_NORMAL. In the future, we will try to make unplug more likely to succeed. Add a module parameter to control if online memory shall be touched. As we want to access alloc_contig_range()/free_contig_range() from kernel module context, export the symbols. Note: Whenever virtio-mem uses alloc_contig_range(), all affected pages are on the same node, in the same zone, and contain no holes.
Acked-by: Michal Hocko <[email protected]> # to export contig range allocator API Tested-by: Pankaj Gupta <[email protected]> Cc: "Michael S. Tsirkin" <[email protected]> Cc: Jason Wang <[email protected]> Cc: Oscar Salvador <[email protected]> Cc: Michal Hocko <[email protected]> Cc: Igor Mammedov <[email protected]> Cc: Dave Young <[email protected]> Cc: Andrew Morton <[email protected]> Cc: Dan Williams <[email protected]> Cc: Pavel Tatashin <[email protected]> Cc: Stefan Hajnoczi <[email protected]> Cc: Vlastimil Babka <[email protected]> Cc: Mel Gorman <[email protected]> Cc: Mike Rapoport <[email protected]> Cc: Alexander Duyck <[email protected]> Cc: Alexander Potapenko <[email protected]> Signed-off-by: David Hildenbrand <[email protected]> Link: https://lore.kernel.org/r/[email protected] Signed-off-by: Michael S. Tsirkin <[email protected]>
1 parent c627ff5 commit 255f598

File tree

3 files changed

+146
-14
lines changed

3 files changed

+146
-14
lines changed

drivers/virtio/Kconfig

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,7 @@ config VIRTIO_MEM
8585
depends on VIRTIO
8686
depends on MEMORY_HOTPLUG_SPARSE
8787
depends on MEMORY_HOTREMOVE
88+
select CONTIG_ALLOC
8889
help
8990
This driver provides access to virtio-mem paravirtualized memory
9091
devices, allowing to hotplug and hotunplug memory.

drivers/virtio/virtio_mem.c

Lines changed: 143 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,10 @@
2323

2424
#include <acpi/acpi_numa.h>
2525

26+
static bool unplug_online = true;
27+
module_param(unplug_online, bool, 0644);
28+
MODULE_PARM_DESC(unplug_online, "Try to unplug online memory");
29+
2630
enum virtio_mem_mb_state {
2731
/* Unplugged, not added to Linux. Can be reused later. */
2832
VIRTIO_MEM_MB_STATE_UNUSED = 0,
@@ -654,23 +658,35 @@ static int virtio_mem_memory_notifier_cb(struct notifier_block *nb,
654658
}
655659

656660
/*
657-
* Set a range of pages PG_offline.
661+
* Set a range of pages PG_offline. Remember pages that were never onlined
662+
* (via generic_online_page()) using PageDirty().
658663
*/
659664
static void virtio_mem_set_fake_offline(unsigned long pfn,
660-
unsigned int nr_pages)
665+
unsigned int nr_pages, bool onlined)
661666
{
662-
for (; nr_pages--; pfn++)
663-
__SetPageOffline(pfn_to_page(pfn));
667+
for (; nr_pages--; pfn++) {
668+
struct page *page = pfn_to_page(pfn);
669+
670+
__SetPageOffline(page);
671+
if (!onlined)
672+
SetPageDirty(page);
673+
}
664674
}
665675

666676
/*
667-
* Clear PG_offline from a range of pages.
677+
* Clear PG_offline from a range of pages. If the pages were never onlined
678+
* (via generic_online_page()), clear PageDirty().
668679
*/
669680
static void virtio_mem_clear_fake_offline(unsigned long pfn,
670-
unsigned int nr_pages)
681+
unsigned int nr_pages, bool onlined)
671682
{
672-
for (; nr_pages--; pfn++)
673-
__ClearPageOffline(pfn_to_page(pfn));
683+
for (; nr_pages--; pfn++) {
684+
struct page *page = pfn_to_page(pfn);
685+
686+
__ClearPageOffline(page);
687+
if (!onlined)
688+
ClearPageDirty(page);
689+
}
674690
}
675691

676692
/*
@@ -686,10 +702,26 @@ static void virtio_mem_fake_online(unsigned long pfn, unsigned int nr_pages)
686702
* We are always called with subblock granularity, which is at least
687703
* aligned to MAX_ORDER - 1.
688704
*/
689-
virtio_mem_clear_fake_offline(pfn, nr_pages);
705+
for (i = 0; i < nr_pages; i += 1 << order) {
706+
struct page *page = pfn_to_page(pfn + i);
690707

691-
for (i = 0; i < nr_pages; i += 1 << order)
692-
generic_online_page(pfn_to_page(pfn + i), order);
708+
/*
709+
* If the page is PageDirty(), it was kept fake-offline when
710+
* onlining the memory block. Otherwise, it was allocated
711+
* using alloc_contig_range(). All pages in a subblock are
712+
* alike.
713+
*/
714+
if (PageDirty(page)) {
715+
virtio_mem_clear_fake_offline(pfn + i, 1 << order,
716+
false);
717+
generic_online_page(page, order);
718+
} else {
719+
virtio_mem_clear_fake_offline(pfn + i, 1 << order,
720+
true);
721+
free_contig_range(pfn + i, 1 << order);
722+
adjust_managed_page_count(page, 1 << order);
723+
}
724+
}
693725
}
694726

695727
static void virtio_mem_online_page_cb(struct page *page, unsigned int order)
@@ -718,7 +750,8 @@ static void virtio_mem_online_page_cb(struct page *page, unsigned int order)
718750
if (virtio_mem_mb_test_sb_plugged(vm, mb_id, sb_id, 1))
719751
generic_online_page(page, order);
720752
else
721-
virtio_mem_set_fake_offline(PFN_DOWN(addr), 1 << order);
753+
virtio_mem_set_fake_offline(PFN_DOWN(addr), 1 << order,
754+
false);
722755
rcu_read_unlock();
723756
return;
724757
}
@@ -1186,6 +1219,72 @@ static int virtio_mem_mb_unplug_any_sb_offline(struct virtio_mem *vm,
11861219
return 0;
11871220
}
11881221

1222+
/*
1223+
* Unplug the desired number of plugged subblocks of an online memory block.
1224+
* Will skip subblocks that are busy.
1225+
*
1226+
* Will modify the state of the memory block.
1227+
*
1228+
* Note: Can fail after some subblocks were successfully unplugged. Can
1229+
* return 0 even if subblocks were busy and could not get unplugged.
1230+
*/
1231+
static int virtio_mem_mb_unplug_any_sb_online(struct virtio_mem *vm,
1232+
unsigned long mb_id,
1233+
uint64_t *nb_sb)
1234+
{
1235+
const unsigned long nr_pages = PFN_DOWN(vm->subblock_size);
1236+
unsigned long start_pfn;
1237+
int rc, sb_id;
1238+
1239+
/*
1240+
* TODO: To increase the performance we want to try bigger, consecutive
1241+
* subblocks first before falling back to single subblocks. Also,
1242+
* we should sense via something like is_mem_section_removable()
1243+
* first if it makes sense to go ahead and try to allocate.
1244+
*/
1245+
for (sb_id = 0; sb_id < vm->nb_sb_per_mb && *nb_sb; sb_id++) {
1246+
/* Find the next candidate subblock */
1247+
while (sb_id < vm->nb_sb_per_mb &&
1248+
!virtio_mem_mb_test_sb_plugged(vm, mb_id, sb_id, 1))
1249+
sb_id++;
1250+
if (sb_id >= vm->nb_sb_per_mb)
1251+
break;
1252+
1253+
start_pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) +
1254+
sb_id * vm->subblock_size);
1255+
rc = alloc_contig_range(start_pfn, start_pfn + nr_pages,
1256+
MIGRATE_MOVABLE, GFP_KERNEL);
1257+
if (rc == -ENOMEM)
1258+
/* whoops, out of memory */
1259+
return rc;
1260+
if (rc)
1261+
/* memory busy, we can't unplug this chunk */
1262+
continue;
1263+
1264+
/* Mark it as fake-offline before unplugging it */
1265+
virtio_mem_set_fake_offline(start_pfn, nr_pages, true);
1266+
adjust_managed_page_count(pfn_to_page(start_pfn), -nr_pages);
1267+
1268+
/* Try to unplug the allocated memory */
1269+
rc = virtio_mem_mb_unplug_sb(vm, mb_id, sb_id, 1);
1270+
if (rc) {
1271+
/* Return the memory to the buddy. */
1272+
virtio_mem_fake_online(start_pfn, nr_pages);
1273+
return rc;
1274+
}
1275+
1276+
virtio_mem_mb_set_state(vm, mb_id,
1277+
VIRTIO_MEM_MB_STATE_ONLINE_PARTIAL);
1278+
*nb_sb -= 1;
1279+
}
1280+
1281+
/*
1282+
* TODO: Once all subblocks of a memory block were unplugged, we want
1283+
* to offline the memory block and remove it.
1284+
*/
1285+
return 0;
1286+
}
1287+
11891288
/*
11901289
* Try to unplug the requested amount of memory.
11911290
*/
@@ -1225,8 +1324,37 @@ static int virtio_mem_unplug_request(struct virtio_mem *vm, uint64_t diff)
12251324
cond_resched();
12261325
}
12271326

1327+
if (!unplug_online) {
1328+
mutex_unlock(&vm->hotplug_mutex);
1329+
return 0;
1330+
}
1331+
1332+
/* Try to unplug subblocks of partially plugged online blocks. */
1333+
virtio_mem_for_each_mb_state_rev(vm, mb_id,
1334+
VIRTIO_MEM_MB_STATE_ONLINE_PARTIAL) {
1335+
rc = virtio_mem_mb_unplug_any_sb_online(vm, mb_id,
1336+
&nb_sb);
1337+
if (rc || !nb_sb)
1338+
goto out_unlock;
1339+
mutex_unlock(&vm->hotplug_mutex);
1340+
cond_resched();
1341+
mutex_lock(&vm->hotplug_mutex);
1342+
}
1343+
1344+
/* Try to unplug subblocks of plugged online blocks. */
1345+
virtio_mem_for_each_mb_state_rev(vm, mb_id,
1346+
VIRTIO_MEM_MB_STATE_ONLINE) {
1347+
rc = virtio_mem_mb_unplug_any_sb_online(vm, mb_id,
1348+
&nb_sb);
1349+
if (rc || !nb_sb)
1350+
goto out_unlock;
1351+
mutex_unlock(&vm->hotplug_mutex);
1352+
cond_resched();
1353+
mutex_lock(&vm->hotplug_mutex);
1354+
}
1355+
12281356
mutex_unlock(&vm->hotplug_mutex);
1229-
return 0;
1357+
return nb_sb ? -EBUSY : 0;
12301358
out_unlock:
12311359
mutex_unlock(&vm->hotplug_mutex);
12321360
return rc;
@@ -1332,7 +1460,8 @@ static void virtio_mem_run_wq(struct work_struct *work)
13321460
case -EBUSY:
13331461
/*
13341462
* The hypervisor cannot process our request right now
1335-
* (e.g., out of memory, migrating).
1463+
* (e.g., out of memory, migrating) or we cannot free up
1464+
* any memory to unplug it (all plugged memory is busy).
13361465
*/
13371466
case -ENOMEM:
13381467
/* Out of memory, try again later. */

mm/page_alloc.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8603,6 +8603,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
86038603
pfn_max_align_up(end), migratetype);
86048604
return ret;
86058605
}
8606+
EXPORT_SYMBOL(alloc_contig_range);
86068607

86078608
static int __alloc_contig_pages(unsigned long start_pfn,
86088609
unsigned long nr_pages, gfp_t gfp_mask)
@@ -8718,6 +8719,7 @@ void free_contig_range(unsigned long pfn, unsigned int nr_pages)
87188719
}
87198720
WARN(count != 0, "%d pages are still in use!\n", count);
87208721
}
8722+
EXPORT_SYMBOL(free_contig_range);
87218723

87228724
/*
87238725
* The zone indicated has a new number of managed_pages; batch sizes and percpu

0 commit comments

Comments
 (0)