
Commit 2498812

apopple-nvidia authored and akpm00 committed
nouveau/dmem: evict device private memory during release
When the module is unloaded or a GPU is unbound from the module it is
possible for device private pages to still be mapped in currently running
processes.  This can lead to hangs and RCU stall warnings when unbinding
the device, as memunmap_pages() will wait in an uninterruptible state
until all device pages have been freed, which may never happen.

Fix this by migrating device mappings back to normal CPU memory prior to
freeing the GPU memory chunks and associated device private pages.

Link: https://lkml.kernel.org/r/66277601fb8fda9af408b33da9887192bf895bda.1664366292.git-series.apopple@nvidia.com
Signed-off-by: Alistair Popple <[email protected]>
Cc: Lyude Paul <[email protected]>
Cc: Ben Skeggs <[email protected]>
Cc: Ralph Campbell <[email protected]>
Cc: John Hubbard <[email protected]>
Cc: Alex Deucher <[email protected]>
Cc: Alex Sierra <[email protected]>
Cc: Christian König <[email protected]>
Cc: Dan Williams <[email protected]>
Cc: David Hildenbrand <[email protected]>
Cc: Felix Kuehling <[email protected]>
Cc: "Huang, Ying" <[email protected]>
Cc: Jason Gunthorpe <[email protected]>
Cc: Matthew Wilcox <[email protected]>
Cc: Michael Ellerman <[email protected]>
Cc: Yang Shi <[email protected]>
Cc: Zi Yan <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
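For background, the fix builds on the kernel's device page migration helpers (migrate_device_range(), migrate_device_pages(), migrate_device_finalize()). Below is a minimal, hypothetical sketch of the eviction flow; evict_device_range() is an illustrative name, and the GPU fence, DMA mapping and device-side copy from the real patch are omitted:

/*
 * Hypothetical, condensed sketch of the eviction flow; fencing, DMA
 * mapping and the device-side copy from the patch below are omitted.
 */
#include <linux/memremap.h>
#include <linux/migrate.h>
#include <linux/slab.h>

static void evict_device_range(struct dev_pagemap *pagemap)
{
	unsigned long npages = range_len(&pagemap->range) >> PAGE_SHIFT;
	unsigned long *src = kcalloc(npages, sizeof(*src), GFP_KERNEL);
	unsigned long *dst = kcalloc(npages, sizeof(*dst), GFP_KERNEL);
	unsigned long i;

	/* Collect and lock the device private pages backing the range. */
	migrate_device_range(src, pagemap->range.start >> PAGE_SHIFT, npages);

	for (i = 0; i < npages; i++) {
		if (!(src[i] & MIGRATE_PFN_MIGRATE))
			continue;
		/*
		 * Allocate replacement system memory; __GFP_NOFAIL because
		 * there is no sensible fallback once the device is gone.
		 */
		dst[i] = migrate_pfn(page_to_pfn(
				alloc_page(GFP_HIGHUSER | __GFP_NOFAIL)));
		/* ... driver copies migrate_pfn_to_page(src[i]) here ... */
	}

	/* Remap the CPU mappings and drop the device private pages. */
	migrate_device_pages(src, dst, npages);
	migrate_device_finalize(src, dst, npages);

	kfree(src);
	kfree(dst);
}

migrate_device_range() isolates the device private pages backing a physical range, and migrate_device_finalize() drops their final references, which is what allows memunmap_pages() to complete during teardown.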
1 parent d9b7193 commit 2498812

File tree

1 file changed: +48, -0 lines

drivers/gpu/drm/nouveau/nouveau_dmem.c

Lines changed: 48 additions & 0 deletions
@@ -367,6 +367,52 @@ nouveau_dmem_suspend(struct nouveau_drm *drm)
 	mutex_unlock(&drm->dmem->mutex);
 }
 
+/*
+ * Evict all pages mapping a chunk.
+ */
+static void
+nouveau_dmem_evict_chunk(struct nouveau_dmem_chunk *chunk)
+{
+	unsigned long i, npages = range_len(&chunk->pagemap.range) >> PAGE_SHIFT;
+	unsigned long *src_pfns, *dst_pfns;
+	dma_addr_t *dma_addrs;
+	struct nouveau_fence *fence;
+
+	src_pfns = kcalloc(npages, sizeof(*src_pfns), GFP_KERNEL);
+	dst_pfns = kcalloc(npages, sizeof(*dst_pfns), GFP_KERNEL);
+	dma_addrs = kcalloc(npages, sizeof(*dma_addrs), GFP_KERNEL);
+
+	migrate_device_range(src_pfns, chunk->pagemap.range.start >> PAGE_SHIFT,
+			npages);
+
+	for (i = 0; i < npages; i++) {
+		if (src_pfns[i] & MIGRATE_PFN_MIGRATE) {
+			struct page *dpage;
+
+			/*
+			 * _GFP_NOFAIL because the GPU is going away and there
+			 * is nothing sensible we can do if we can't copy the
+			 * data back.
+			 */
+			dpage = alloc_page(GFP_HIGHUSER | __GFP_NOFAIL);
+			dst_pfns[i] = migrate_pfn(page_to_pfn(dpage));
+			nouveau_dmem_copy_one(chunk->drm,
+					migrate_pfn_to_page(src_pfns[i]), dpage,
+					&dma_addrs[i]);
+		}
+	}
+
+	nouveau_fence_new(chunk->drm->dmem->migrate.chan, false, &fence);
+	migrate_device_pages(src_pfns, dst_pfns, npages);
+	nouveau_dmem_fence_done(&fence);
+	migrate_device_finalize(src_pfns, dst_pfns, npages);
+	kfree(src_pfns);
+	kfree(dst_pfns);
+	for (i = 0; i < npages; i++)
+		dma_unmap_page(chunk->drm->dev->dev, dma_addrs[i], PAGE_SIZE, DMA_BIDIRECTIONAL);
+	kfree(dma_addrs);
+}
+
 void
 nouveau_dmem_fini(struct nouveau_drm *drm)
 {
@@ -378,8 +424,10 @@ nouveau_dmem_fini(struct nouveau_drm *drm)
 	mutex_lock(&drm->dmem->mutex);
 
 	list_for_each_entry_safe(chunk, tmp, &drm->dmem->chunks, list) {
+		nouveau_dmem_evict_chunk(chunk);
 		nouveau_bo_unpin(chunk->bo);
 		nouveau_bo_ref(NULL, &chunk->bo);
+		WARN_ON(chunk->callocated);
 		list_del(&chunk->list);
 		memunmap_pages(&chunk->pagemap);
 		release_mem_region(chunk->pagemap.range.start,
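Note the ordering in nouveau_dmem_fini(): each chunk is evicted before memunmap_pages() runs, because memunmap_pages() waits uninterruptibly until every device private page has been freed. The new WARN_ON(chunk->callocated) flags any pages still allocated from the chunk after eviction.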
