
Commit 85b3258

nhatsmrt authored and akpm00 committed
zsmalloc: fix a race with deferred_handles storing
Currently, there is a race between zs_free() and zs_reclaim_page():
zs_reclaim_page() finds a handle to an allocated object, but before the
eviction happens, an independent zs_free() call to the same handle could
come in and overwrite the object value stored at the handle with the last
deferred handle. When zs_reclaim_page() finally gets to call the eviction
handler, it will see an invalid object value (i.e. the previous deferred
handle instead of the original object value).

This race happens quite infrequently. We only managed to produce it with
out-of-tree developmental code that triggers zsmalloc writeback with a
much higher frequency than usual.

This patch fixes this race by storing the deferred handle in the object
header instead. We differentiate the deferred handle from the other two
cases (handle for allocated object, and linkage for free object) with a
new tag. If zspage reclamation succeeds, we will free these deferred
handles by walking through the zspage objects. On the other hand, if
zspage reclamation fails, we reconstruct the zspage freelist (with the
deferred handle tag and allocated tag) before trying again with the
reclamation.

[[email protected]: avoid unused-function warning]
Link: https://lkml.kernel.org/r/[email protected]
Link: https://lkml.kernel.org/r/[email protected]
Fixes: 9997bc0 ("zsmalloc: implement writeback mechanism for zsmalloc")
Signed-off-by: Nhat Pham <[email protected]>
Signed-off-by: Arnd Bergmann <[email protected]>
Suggested-by: Johannes Weiner <[email protected]>
Cc: Dan Streetman <[email protected]>
Cc: Minchan Kim <[email protected]>
Cc: Nitin Gupta <[email protected]>
Cc: Sergey Senozhatsky <[email protected]>
Cc: Seth Jennings <[email protected]>
Cc: Vitaly Wool <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
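To make the failure mode concrete, here is a minimal single-threaded sketch in plain user-space C (all names and values are illustrative, not zsmalloc's) of what the old deferred-handle chaining permitted: a concurrent zs_free() could overwrite the value a reclaimer had already looked up through the handle but not yet consumed.

#include <stdio.h>

int main(void)
{
        /* the object value the reclaimer looked up via the handle */
        unsigned long object_value = 0xabcd0;
        unsigned long handle_slot = object_value;   /* memory the handle points to */
        unsigned long prev_deferred = 0xdead0;      /* head of the old deferred list */

        /* reclaimer: remembers the handle, but eviction has not run yet */
        unsigned long *handle = &handle_slot;

        /*
         * concurrent zs_free() under the old scheme: chain the deferred
         * handles through the handle slot itself, clobbering the object
         * value the reclaimer still expects to find there
         */
        *handle = prev_deferred;

        /* reclaimer resumes: it reads a stale deferred handle, not the object */
        printf("reclaim reads %#lx, expected %#lx\n", *handle, object_value);
        return 0;
}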
1 parent 023f47a commit 85b3258

File tree

1 file changed: +205 -32 lines changed


mm/zsmalloc.c

Lines changed: 205 additions & 32 deletions
@@ -113,7 +113,23 @@
  * have room for two bit at least.
  */
 #define OBJ_ALLOCATED_TAG 1
-#define OBJ_TAG_BITS 1
+
+#ifdef CONFIG_ZPOOL
+/*
+ * The second least-significant bit in the object's header identifies if the
+ * value stored at the header is a deferred handle from the last reclaim
+ * attempt.
+ *
+ * As noted above, this is valid because we have room for two bits.
+ */
+#define OBJ_DEFERRED_HANDLE_TAG 2
+#define OBJ_TAG_BITS 2
+#define OBJ_TAG_MASK (OBJ_ALLOCATED_TAG | OBJ_DEFERRED_HANDLE_TAG)
+#else
+#define OBJ_TAG_BITS 1
+#define OBJ_TAG_MASK OBJ_ALLOCATED_TAG
+#endif /* CONFIG_ZPOOL */
+
 #define OBJ_INDEX_BITS (BITS_PER_LONG - _PFN_BITS - OBJ_TAG_BITS)
 #define OBJ_INDEX_MASK ((_AC(1, UL) << OBJ_INDEX_BITS) - 1)
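As a quick illustration of how the new tag bits are meant to compose, the following stands alone in user space (the three constants mirror the hunk above; everything else is hypothetical): a word-aligned handle is marked as deferred, the allocated bit is dropped, and masking with OBJ_TAG_MASK recovers the original handle.

#include <assert.h>

#define OBJ_ALLOCATED_TAG       1
#define OBJ_DEFERRED_HANDLE_TAG 2
#define OBJ_TAG_MASK    (OBJ_ALLOCATED_TAG | OBJ_DEFERRED_HANDLE_TAG)

int main(void)
{
        unsigned long handle = 0x1000; /* handles are word-aligned, low bits free */

        /* what zs_free() now does before parking the handle in the header */
        unsigned long hdr = handle;
        hdr |= OBJ_DEFERRED_HANDLE_TAG;
        hdr &= ~(unsigned long)OBJ_ALLOCATED_TAG;

        assert(!(hdr & OBJ_ALLOCATED_TAG));     /* not an allocated object */
        assert(hdr & OBJ_DEFERRED_HANDLE_TAG);  /* header carries a deferred handle */
        assert((hdr & ~(unsigned long)OBJ_TAG_MASK) == handle); /* tags strip cleanly */
        return 0;
}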

@@ -222,6 +238,12 @@ struct link_free {
                 * Handle of allocated object.
                 */
                unsigned long handle;
+#ifdef CONFIG_ZPOOL
+               /*
+                * Deferred handle of a reclaimed object.
+                */
+               unsigned long deferred_handle;
+#endif
        };
 };
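For orientation, the first word of an object slot can now be read three ways, and the tag bits defined earlier say which reading applies. A simplified stand-alone model of that layout (plain C, not the kernel's struct, which also has huge-page special cases):

#include <stdio.h>

/* simplified stand-in for struct link_free: one machine word, three readings */
union slot_header {
        unsigned long next;             /* free object: next free index << tag bits */
        unsigned long handle;           /* allocated object: handle | 1 */
        unsigned long deferred_handle;  /* reclaimed object: handle | 2 */
};

int main(void)
{
        union slot_header h = { .handle = 0x2000 | 1 };

        printf("allocated? %d  deferred? %d\n",
               (int)(h.handle & 1), (int)(h.handle & 2));
        return 0;
}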

@@ -272,8 +294,6 @@ struct zspage {
        /* links the zspage to the lru list in the pool */
        struct list_head lru;
        bool under_reclaim;
-       /* list of unfreed handles whose objects have been reclaimed */
-       unsigned long *deferred_handles;
 #endif
 
        struct zs_pool *pool;
@@ -897,7 +917,8 @@ static unsigned long handle_to_obj(unsigned long handle)
        return *(unsigned long *)handle;
 }
 
-static bool obj_allocated(struct page *page, void *obj, unsigned long *phandle)
+static bool obj_tagged(struct page *page, void *obj, unsigned long *phandle,
+               int tag)
 {
        unsigned long handle;
        struct zspage *zspage = get_zspage(page);
@@ -908,13 +929,27 @@ static bool obj_allocated(struct page *page, void *obj, unsigned long *phandle)
        } else
                handle = *(unsigned long *)obj;
 
-       if (!(handle & OBJ_ALLOCATED_TAG))
+       if (!(handle & tag))
                return false;
 
-       *phandle = handle & ~OBJ_ALLOCATED_TAG;
+       /* Clear all tags before returning the handle */
+       *phandle = handle & ~OBJ_TAG_MASK;
        return true;
 }
 
+static inline bool obj_allocated(struct page *page, void *obj, unsigned long *phandle)
+{
+       return obj_tagged(page, obj, phandle, OBJ_ALLOCATED_TAG);
+}
+
+#ifdef CONFIG_ZPOOL
+static bool obj_stores_deferred_handle(struct page *page, void *obj,
+               unsigned long *phandle)
+{
+       return obj_tagged(page, obj, phandle, OBJ_DEFERRED_HANDLE_TAG);
+}
+#endif
+
 static void reset_page(struct page *page)
 {
        __ClearPageMovable(page);
@@ -946,22 +981,36 @@ static int trylock_zspage(struct zspage *zspage)
 }
 
 #ifdef CONFIG_ZPOOL
+static unsigned long find_deferred_handle_obj(struct size_class *class,
+               struct page *page, int *obj_idx);
+
 /*
  * Free all the deferred handles whose objects are freed in zs_free.
  */
-static void free_handles(struct zs_pool *pool, struct zspage *zspage)
+static void free_handles(struct zs_pool *pool, struct size_class *class,
+               struct zspage *zspage)
 {
-       unsigned long handle = (unsigned long)zspage->deferred_handles;
+       int obj_idx = 0;
+       struct page *page = get_first_page(zspage);
+       unsigned long handle;
 
-       while (handle) {
-               unsigned long nxt_handle = handle_to_obj(handle);
+       while (1) {
+               handle = find_deferred_handle_obj(class, page, &obj_idx);
+               if (!handle) {
+                       page = get_next_page(page);
+                       if (!page)
+                               break;
+                       obj_idx = 0;
+                       continue;
+               }
 
                cache_free_handle(pool, handle);
-               handle = nxt_handle;
+               obj_idx++;
        }
 }
 #else
-static inline void free_handles(struct zs_pool *pool, struct zspage *zspage) {}
+static inline void free_handles(struct zs_pool *pool, struct size_class *class,
+               struct zspage *zspage) {}
 #endif
 
 static void __free_zspage(struct zs_pool *pool, struct size_class *class,
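With the per-zspage deferred_handles list gone, free_handles() becomes a linear scan over every object slot, releasing any slot whose header carries the deferred tag. A condensed user-space model of that scan (a fixed array stands in for the zspage's pages; all names are hypothetical):

#include <stdio.h>

#define OBJ_ALLOCATED_TAG       1UL
#define OBJ_DEFERRED_HANDLE_TAG 2UL
#define OBJ_TAG_MASK    (OBJ_ALLOCATED_TAG | OBJ_DEFERRED_HANDLE_TAG)
#define NR_OBJS 8

/* headers[] stands in for the first word of each object slot in a zspage */
static unsigned long headers[NR_OBJS] = {
        0x1000 | 1, 0x2000 | 2, 0, 0x3000 | 2, 0x4000 | 1, 0, 0x5000 | 2, 0,
};

static void cache_free_handle_stub(unsigned long handle)
{
        printf("freeing deferred handle %#lx\n", handle);
}

int main(void)
{
        for (int i = 0; i < NR_OBJS; i++)
                if (headers[i] & OBJ_DEFERRED_HANDLE_TAG)
                        cache_free_handle_stub(headers[i] & ~OBJ_TAG_MASK);
        return 0;
}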
@@ -979,7 +1028,7 @@ static void __free_zspage(struct zs_pool *pool, struct size_class *class,
        VM_BUG_ON(fg != ZS_EMPTY);
 
        /* Free all deferred handles from zs_free */
-       free_handles(pool, zspage);
+       free_handles(pool, class, zspage);
 
        next = page = get_first_page(zspage);
        do {
@@ -1067,7 +1116,6 @@ static void init_zspage(struct size_class *class, struct zspage *zspage)
 #ifdef CONFIG_ZPOOL
        INIT_LIST_HEAD(&zspage->lru);
        zspage->under_reclaim = false;
-       zspage->deferred_handles = NULL;
 #endif
 
        set_freeobj(zspage, 0);
@@ -1568,7 +1616,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp)
 }
 EXPORT_SYMBOL_GPL(zs_malloc);
 
-static void obj_free(int class_size, unsigned long obj)
+static void obj_free(int class_size, unsigned long obj, unsigned long *handle)
 {
        struct link_free *link;
        struct zspage *zspage;
@@ -1582,15 +1630,29 @@ static void obj_free(int class_size, unsigned long obj)
        zspage = get_zspage(f_page);
 
        vaddr = kmap_atomic(f_page);
-
-       /* Insert this object in containing zspage's freelist */
        link = (struct link_free *)(vaddr + f_offset);
-       if (likely(!ZsHugePage(zspage)))
-               link->next = get_freeobj(zspage) << OBJ_TAG_BITS;
-       else
-               f_page->index = 0;
+
+       if (handle) {
+#ifdef CONFIG_ZPOOL
+               /* Stores the (deferred) handle in the object's header */
+               *handle |= OBJ_DEFERRED_HANDLE_TAG;
+               *handle &= ~OBJ_ALLOCATED_TAG;
+
+               if (likely(!ZsHugePage(zspage)))
+                       link->deferred_handle = *handle;
+               else
+                       f_page->index = *handle;
+#endif
+       } else {
+               /* Insert this object in containing zspage's freelist */
+               if (likely(!ZsHugePage(zspage)))
+                       link->next = get_freeobj(zspage) << OBJ_TAG_BITS;
+               else
+                       f_page->index = 0;
+               set_freeobj(zspage, f_objidx);
+       }
+
        kunmap_atomic(vaddr);
-       set_freeobj(zspage, f_objidx);
        mod_zspage_inuse(zspage, -1);
 }
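obj_free() now does one of two things depending on whether the caller passes a handle pointer: with NULL it threads the slot back onto the freelist as before, and with a non-NULL handle it tags that handle and parks it in the slot for a later reclaim pass to release. A stripped-down model of the branch (user space, one slot, the freelist reduced to an index; names are hypothetical):

#include <stdio.h>

#define OBJ_ALLOCATED_TAG       1UL
#define OBJ_DEFERRED_HANDLE_TAG 2UL
#define OBJ_TAG_BITS    2

static unsigned long slot;              /* header word of the object being freed */
static unsigned long freeobj = 3;       /* index of the current freelist head */

static void obj_free_sketch(unsigned long f_objidx, unsigned long *handle)
{
        if (handle) {
                /* reclaim-in-progress path: park the tagged handle in the header */
                *handle |= OBJ_DEFERRED_HANDLE_TAG;
                *handle &= ~OBJ_ALLOCATED_TAG;
                slot = *handle;
        } else {
                /* normal path: link the slot back into the freelist */
                slot = freeobj << OBJ_TAG_BITS;
                freeobj = f_objidx;
        }
}

int main(void)
{
        unsigned long handle = 0x7000 | OBJ_ALLOCATED_TAG;

        obj_free_sketch(5, NULL);
        printf("freelist head %lu, slot links to %lu\n", freeobj, slot >> OBJ_TAG_BITS);

        obj_free_sketch(5, &handle);
        printf("deferred header %#lx\n", slot);
        return 0;
}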

@@ -1615,7 +1677,6 @@ void zs_free(struct zs_pool *pool, unsigned long handle)
        zspage = get_zspage(f_page);
        class = zspage_class(pool, zspage);
 
-       obj_free(class->size, obj);
        class_stat_dec(class, OBJ_USED, 1);
 
 #ifdef CONFIG_ZPOOL
@@ -1624,15 +1685,15 @@ void zs_free(struct zs_pool *pool, unsigned long handle)
                 * Reclaim needs the handles during writeback. It'll free
                 * them along with the zspage when it's done with them.
                 *
-                * Record current deferred handle at the memory location
-                * whose address is given by handle.
+                * Record current deferred handle in the object's header.
                 */
-               record_obj(handle, (unsigned long)zspage->deferred_handles);
-               zspage->deferred_handles = (unsigned long *)handle;
+               obj_free(class->size, obj, &handle);
                spin_unlock(&pool->lock);
                return;
        }
 #endif
+       obj_free(class->size, obj, NULL);
+
        fullness = fix_fullness_group(class, zspage);
        if (fullness == ZS_EMPTY)
                free_zspage(pool, class, zspage);
@@ -1713,11 +1774,11 @@ static void zs_object_copy(struct size_class *class, unsigned long dst,
 }
 
 /*
- * Find alloced object in zspage from index object and
+ * Find object with a certain tag in zspage from index object and
  * return handle.
  */
-static unsigned long find_alloced_obj(struct size_class *class,
-                                       struct page *page, int *obj_idx)
+static unsigned long find_tagged_obj(struct size_class *class,
+                                       struct page *page, int *obj_idx, int tag)
 {
        unsigned int offset;
        int index = *obj_idx;
@@ -1728,7 +1789,7 @@ static unsigned long find_alloced_obj(struct size_class *class,
        offset += class->size * index;
 
        while (offset < PAGE_SIZE) {
-               if (obj_allocated(page, addr + offset, &handle))
+               if (obj_tagged(page, addr + offset, &handle, tag))
                        break;
 
                offset += class->size;
@@ -1742,6 +1803,28 @@ static unsigned long find_alloced_obj(struct size_class *class,
        return handle;
 }
 
+/*
+ * Find alloced object in zspage from index object and
+ * return handle.
+ */
+static unsigned long find_alloced_obj(struct size_class *class,
+                                       struct page *page, int *obj_idx)
+{
+       return find_tagged_obj(class, page, obj_idx, OBJ_ALLOCATED_TAG);
+}
+
+#ifdef CONFIG_ZPOOL
+/*
+ * Find object storing a deferred handle in header in zspage from index object
+ * and return handle.
+ */
+static unsigned long find_deferred_handle_obj(struct size_class *class,
+               struct page *page, int *obj_idx)
+{
+       return find_tagged_obj(class, page, obj_idx, OBJ_DEFERRED_HANDLE_TAG);
+}
+#endif
+
 struct zs_compact_control {
        /* Source spage for migration which could be a subpage of zspage */
        struct page *s_page;
@@ -1784,7 +1867,7 @@ static int migrate_zspage(struct zs_pool *pool, struct size_class *class,
                zs_object_copy(class, free_obj, used_obj);
                obj_idx++;
                record_obj(handle, free_obj);
-               obj_free(class->size, used_obj);
+               obj_free(class->size, used_obj, NULL);
        }
 
        /* Remember last position in this iteration */
@@ -2478,6 +2561,90 @@ void zs_destroy_pool(struct zs_pool *pool)
 EXPORT_SYMBOL_GPL(zs_destroy_pool);
 
 #ifdef CONFIG_ZPOOL
+static void restore_freelist(struct zs_pool *pool, struct size_class *class,
+               struct zspage *zspage)
+{
+       unsigned int obj_idx = 0;
+       unsigned long handle, off = 0; /* off is within-page offset */
+       struct page *page = get_first_page(zspage);
+       struct link_free *prev_free = NULL;
+       void *prev_page_vaddr = NULL;
+
+       /* in case no free object found */
+       set_freeobj(zspage, (unsigned int)(-1UL));
+
+       while (page) {
+               void *vaddr = kmap_atomic(page);
+               struct page *next_page;
+
+               while (off < PAGE_SIZE) {
+                       void *obj_addr = vaddr + off;
+
+                       /* skip allocated object */
+                       if (obj_allocated(page, obj_addr, &handle)) {
+                               obj_idx++;
+                               off += class->size;
+                               continue;
+                       }
+
+                       /* free deferred handle from reclaim attempt */
+                       if (obj_stores_deferred_handle(page, obj_addr, &handle))
+                               cache_free_handle(pool, handle);
+
+                       if (prev_free)
+                               prev_free->next = obj_idx << OBJ_TAG_BITS;
+                       else /* first free object found */
+                               set_freeobj(zspage, obj_idx);
+
+                       prev_free = (struct link_free *)vaddr + off / sizeof(*prev_free);
+                       /* if last free object in a previous page, need to unmap */
+                       if (prev_page_vaddr) {
+                               kunmap_atomic(prev_page_vaddr);
+                               prev_page_vaddr = NULL;
+                       }
+
+                       obj_idx++;
+                       off += class->size;
+               }
+
+               /*
+                * Handle the last (full or partial) object on this page.
+                */
+               next_page = get_next_page(page);
+               if (next_page) {
+                       if (!prev_free || prev_page_vaddr) {
+                               /*
+                                * There is no free object in this page, so we can safely
+                                * unmap it.
+                                */
+                               kunmap_atomic(vaddr);
+                       } else {
+                               /* update prev_page_vaddr since prev_free is on this page */
+                               prev_page_vaddr = vaddr;
+                       }
+               } else { /* this is the last page */
+                       if (prev_free) {
+                               /*
+                                * Reset OBJ_TAG_BITS bit to last link to tell
+                                * whether it's allocated object or not.
+                                */
+                               prev_free->next = -1UL << OBJ_TAG_BITS;
+                       }
+
+                       /* unmap previous page (if not done yet) */
+                       if (prev_page_vaddr) {
+                               kunmap_atomic(prev_page_vaddr);
+                               prev_page_vaddr = NULL;
+                       }
+
+                       kunmap_atomic(vaddr);
+               }
+
+               page = next_page;
+               off %= PAGE_SIZE;
+       }
+}
+
 static int zs_reclaim_page(struct zs_pool *pool, unsigned int retries)
 {
        int i, obj_idx, ret = 0;
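restore_freelist() rebuilds the singly linked freelist by visiting every slot: allocated objects are skipped, deferred handles are released, and each remaining free slot is chained to the previous one by object index, with the last link terminated. A much-reduced single-page model of that walk (user space, names hypothetical):

#include <stdio.h>

#define OBJ_ALLOCATED_TAG       1UL
#define OBJ_DEFERRED_HANDLE_TAG 2UL
#define OBJ_TAG_MASK    (OBJ_ALLOCATED_TAG | OBJ_DEFERRED_HANDLE_TAG)
#define OBJ_TAG_BITS    2
#define NR_OBJS 6

/* headers[] models the object headers of a one-page zspage */
static unsigned long headers[NR_OBJS] = {
        0x1000 | 1,     /* allocated: keep */
        0x2000 | 2,     /* deferred handle: free it, slot becomes free */
        0,              /* already free */
        0x3000 | 1,     /* allocated: keep */
        0x4000 | 2,     /* deferred handle */
        0,              /* free */
};

int main(void)
{
        long prev_free = -1, freeobj = -1;

        for (long idx = 0; idx < NR_OBJS; idx++) {
                if (headers[idx] & OBJ_ALLOCATED_TAG)
                        continue;               /* skip allocated objects */

                if (headers[idx] & OBJ_DEFERRED_HANDLE_TAG)
                        printf("free deferred handle %#lx\n",
                               headers[idx] & ~OBJ_TAG_MASK);

                if (prev_free >= 0)
                        headers[prev_free] = (unsigned long)idx << OBJ_TAG_BITS;
                else
                        freeobj = idx;          /* first free object found */
                prev_free = idx;
        }
        if (prev_free >= 0)
                headers[prev_free] = -1UL << OBJ_TAG_BITS; /* terminate the list */

        printf("freelist head: %ld\n", freeobj);
        return 0;
}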
@@ -2561,6 +2728,12 @@ static int zs_reclaim_page(struct zs_pool *pool, unsigned int retries)
                return 0;
        }
 
+       /*
+        * Eviction fails on one of the handles, so we need to restore zspage.
+        * We need to rebuild its freelist (and free stored deferred handles),
+        * put it back to the correct size class, and add it to the LRU list.
+        */
+       restore_freelist(pool, class, zspage);
        putback_zspage(class, zspage);
        list_add(&zspage->lru, &pool->lru);
        unlock_zspage(zspage);
