Skip to content

Commit 833ccea

Browse files
authored
Merge pull request #164 from bo3z/bugfix/iommu-mappings
Fix memory mapping of writeback region
2 parents a29cf61 + 73be990 commit 833ccea

File tree

8 files changed

+211
-51
lines changed

8 files changed

+211
-51
lines changed

contrib/vm/driver-guest/guest_dev.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -45,10 +45,10 @@
4545
#define USER_HASH_TABLE_ORDER 8
4646

4747
/* MMAP Regions */
48-
#define MMAP_CTRL 0x0
48+
#define MMAP_WB 0x0
4949
#define MMAP_CNFG 0x1
5050
#define MMAP_CNFG_AVX 0x2
51-
#define MMAP_WB 0x3
51+
#define MMAP_CTRL 0x3
5252
#define MMAP_BUFF 0x200
5353
#define MMAP_PR 0x400
5454
#define FPGA_CTRL_SIZE 256 * 1024

driver/include/coyote_defs.h

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,7 @@
8080
#include <linux/dma-buf.h>
8181
#include <linux/dma-direct.h>
8282
#include <linux/dma-resv.h>
83+
#include <linux/dma-mapping.h>
8384

8485
// Driver arguments; see coyote_driver.c for details
8586
extern char *ip_addr;
@@ -269,6 +270,7 @@ extern bool en_hmm;
269270
#define MAX_N_MAP_PAGES 256
270271
#define MAX_N_MAP_HUGE_PAGES 256
271272
#define MAX_SINGLE_DMA_SYNC 4 // 4 pages
273+
#define BUFF_NEEDS_EXP_SYNC_RET_CODE 99
272274

273275
// Card memory constants
274276
#define MEM_START (256UL * 1024UL * 1024UL)
@@ -305,10 +307,10 @@ extern bool en_hmm;
305307
#define MAX_CHAR_FDEV 32
306308

307309
// Offsets for memory-mapped regions
308-
#define MMAP_CTRL 0x0
310+
#define MMAP_WB 0x0
309311
#define MMAP_CNFG 0x1
310312
#define MMAP_CNFG_AVX 0x2
311-
#define MMAP_WB 0x3
313+
#define MMAP_CTRL 0x3
312314
#define MMAP_RECONFIG 0x100
313315

314316
// vFPGA IOCTL calls; see vfpga_ops.c for more details
@@ -652,6 +654,9 @@ struct user_pages {
652654

653655
/// Array of physical addresses on the host, one for each page in the pages array, simply obtained by calling page_to_phys on each page
654656
uint64_t *hpages;
657+
658+
/// Set to true if explicit synchronization (i.e. dma_sync_single_for_{device,cpu}) is needed for this buffer, false otherwise
659+
bool needs_explicit_sync;
655660
};
656661

657662
/**

driver/src/coyote_setup.c

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -394,10 +394,17 @@ int setup_vfpga_devices(struct bus_driver_data *data) {
394394
pr_err("failed to allocate writeback memory\n");
395395
goto err_wb;
396396
}
397+
398+
int ret_val = set_memory_uc((uint64_t) data->vfpga_dev[i].wb_addr_virt, N_WB_PAGES);
399+
if (ret_val) {
400+
pr_err("failed to set UC for writeback memory\n");
401+
goto err_wb;
402+
}
397403

398404
for (int j = 0; j < WB_BLOCKS; j++) {
399405
data->vfpga_dev[i].cnfg_regs->wback[j] = data->vfpga_dev[i].wb_phys_addr + j * (N_CTID_MAX * sizeof(uint32_t));
400406
}
407+
401408
dbg_info(
402409
"allocated memory for descriptor writeback, vaddr %llx, paddr %llx\n",
403410
(uint64_t)data->vfpga_dev[i].wb_addr_virt, data->vfpga_dev[i].wb_phys_addr

driver/src/vfpga/vfpga_gup.c

Lines changed: 173 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,13 @@ int mmu_handler_gup(struct vfpga_dev *device, uint64_t vaddr, uint64_t len, int3
9292
return -ENOMEM;
9393
}
9494

95+
// In case there are caching effects, return a non-zero code to the user space
96+
// The application continues as normal, but this code will generate a warning
97+
// in the user space
98+
if (user_pg->needs_explicit_sync) {
99+
ret_val = BUFF_NEEDS_EXP_SYNC_RET_CODE;
100+
}
101+
95102
if(stream) {
96103
tlb_map_gup(device, &pf_desc, user_pg, hpid);
97104
} else {
@@ -257,6 +264,7 @@ void tlb_unmap_gup(struct vfpga_dev *device, struct user_pages *user_pg, pid_t h
257264

258265
struct user_pages* tlb_get_user_pages(struct vfpga_dev *device, struct pf_aligned_desc *pf_desc, pid_t hpid, struct task_struct *curr_task, struct mm_struct *curr_mm) {
259266
int ret_val = 0;
267+
int pg_inc, pg_size;
260268
struct bus_driver_data *bd_data = device->bd_data;
261269

262270
// Error handling
@@ -287,27 +295,97 @@ struct user_pages* tlb_get_user_pages(struct vfpga_dev *device, struct pf_aligne
287295
dbg_info("pages=0x%p\n", user_pg->pages);
288296

289297
// Pin the pages
298+
// On newer kernels, pin_user_pages_remote is preferred over get_user_pages_remote for DMA,
299+
// as it guarantees that the pages remain pinned (and not just the page struct) until explicitly unpinned
290300
#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 5, 0)
291-
ret_val = get_user_pages_remote(curr_mm, (unsigned long) pf_desc->vaddr << PAGE_SHIFT, pf_desc->n_pages, 1, user_pg->pages, NULL);
301+
ret_val = pin_user_pages_remote(curr_mm, (unsigned long) pf_desc->vaddr << PAGE_SHIFT, pf_desc->n_pages, FOLL_WRITE | FOLL_LONGTERM, user_pg->pages, NULL);
292302
#elif LINUX_VERSION_CODE >= KERNEL_VERSION(5, 9, 0)
293-
ret_val = get_user_pages_remote(curr_mm, (unsigned long) pf_desc->vaddr << PAGE_SHIFT, pf_desc->n_pages, 1, user_pg->pages, NULL, NULL);
294-
#else
295-
ret_val = get_user_pages_remote(curr_task, curr_mm, (unsigned long)pf_desc->vaddr << PAGE_SHIFT, pf_desc->n_pages, 1, user_pg->pages, NULL, NULL);
303+
ret_val = pin_user_pages_remote(curr_mm, (unsigned long) pf_desc->vaddr << PAGE_SHIFT, pf_desc->n_pages, FOLL_WRITE | FOLL_LONGTERM, user_pg->pages, NULL, NULL);
304+
#elif LINUX_VERSION_CODE >= KERNEL_VERSION(5, 4, 0)
305+
ret_val = pin_user_pages_remote(curr_task, curr_mm, (unsigned long) pf_desc->vaddr << PAGE_SHIFT, pf_desc->n_pages, FOLL_WRITE | FOLL_LONGTERM, user_pg->pages, NULL, NULL);
306+
#else
307+
ret_val = get_user_pages_remote(curr_task, curr_mm, (unsigned long) pf_desc->vaddr << PAGE_SHIFT, pf_desc->n_pages, 1, user_pg->pages, NULL, NULL);
296308
#endif
297-
dbg_info("get_user_pages_remote(%llx, n_pages = %d, page start = %lx, hugepages = %d)\n", pf_desc->vaddr, pf_desc->n_pages, page_to_pfn(user_pg->pages[0]), pf_desc->hugepages);
309+
dbg_info("pin_user_pages_remote(%llx, n_pages = %d, page start = %lx, hugepages = %d)\n", pf_desc->vaddr, pf_desc->n_pages, page_to_pfn(user_pg->pages[0]), pf_desc->hugepages);
298310

299-
if(ret_val < pf_desc->n_pages) {
300-
dbg_info("could not get all user pages, %d\n", ret_val);
301-
goto fail_host_unmap;
311+
if (ret_val < pf_desc->n_pages) {
312+
pr_warn("could not get all user pages, %d\n", ret_val);
313+
goto fail_host_alloc;
302314
}
303315

304316
// Flush cache
305-
for(int i = 0; i < pf_desc->n_pages; i++)
317+
for (int i = 0; i < pf_desc->n_pages; i++) {
306318
flush_dcache_page(user_pg->pages[i]);
319+
}
307320

308321
// Find the physical address of the pages
309-
for(int i = 0;i < pf_desc->n_pages; i++)
310-
user_pg->hpages[i] = page_to_phys(user_pg->pages[i]);
322+
// NOTE: The CPU's physical address and the physical address
323+
// that the FPGA should write to may be different if there is
324+
// an additional layer of translation via the system IOMMU
325+
// Therefore, we use the dma_map_single to obtain a physical address
326+
// suitable for FPGA accesses.
327+
user_pg->needs_explicit_sync = false;
328+
if (pf_desc->hugepages) {
329+
// Map each hugepage (e.g., 2MB) chunk
330+
for (int i = 0; i < pf_desc->n_pages; i+=device->bd_data->n_pages_in_huge) {
331+
// Obtain the physical address of this hugepage chunk
332+
// Generally, for the hardware TLB, only the starting address of the page is needed
333+
// The exact physical address is calculated from the starting address and the virtual address offset
334+
user_pg->hpages[i] = dma_map_single(
335+
&device->bd_data->pci_dev->dev,
336+
page_to_virt(user_pg->pages[i]),
337+
device->bd_data->ltlb_meta->page_size,
338+
DMA_BIDIRECTIONAL
339+
);
340+
341+
if (dma_mapping_error(&device->bd_data->pci_dev->dev, user_pg->hpages[i])) {
342+
pr_warn("failed to map user pages and obtain physical address");
343+
goto fail_dma_map;
344+
}
345+
346+
if (dma_need_sync(&device->bd_data->pci_dev->dev, user_pg->hpages[i])) {
347+
pr_warn("the DMA buffer with virt_addr %lx, phys_addr %lx, may be subject to cache coherency issues and may require explicit synchronization which is not supported out of the box by Coyote\n",
348+
(unsigned long) page_to_virt(user_pg->pages[i]), (unsigned long) user_pg->hpages[i]
349+
);
350+
user_pg->needs_explicit_sync = true;
351+
}
352+
353+
// However, in some cases (e.g., migrating data between the host and FPGA memory)
354+
// Coyote still needs all the entries in the hpages array; since the transfers
355+
// are issued in 4k granularity from the driver
356+
for (int j = i + 1; j < i + device->bd_data->n_pages_in_huge; j++) {
357+
user_pg->hpages[j] = user_pg->hpages[i] + (j - i) * PAGE_SIZE;
358+
359+
if (j >= pf_desc->n_pages) {
360+
break;
361+
}
362+
}
363+
364+
}
365+
} else {
366+
// For each page, map it to the FPGA char dev
367+
// and obtain its physical address
368+
for (int i = 0; i < pf_desc->n_pages; i++) {
369+
user_pg->hpages[i] = dma_map_single(
370+
&device->bd_data->pci_dev->dev,
371+
page_to_virt(user_pg->pages[i]),
372+
PAGE_SIZE,
373+
DMA_BIDIRECTIONAL
374+
);
375+
376+
if (dma_mapping_error(&device->bd_data->pci_dev->dev, user_pg->hpages[i])) {
377+
pr_warn("failed to map user pages and obtain physical address");
378+
goto fail_dma_map;
379+
}
380+
381+
if (dma_need_sync(&device->bd_data->pci_dev->dev, user_pg->hpages[i])) {
382+
pr_warn("the DMA buffer with virt_addr %lx, phys_addr %lx, may be subject to cache coherency issues and may require explicit synchronization which is not supported out of the box by Coyote\n",
383+
(unsigned long) page_to_virt(user_pg->pages[i]), (unsigned long) user_pg->hpages[i]
384+
);
385+
user_pg->needs_explicit_sync = true;
386+
}
387+
}
388+
}
311389

312390
// Allocate memory on the card if available
313391
if(bd_data->en_mem) {
@@ -317,7 +395,7 @@ struct user_pages* tlb_get_user_pages(struct vfpga_dev *device, struct pf_aligne
317395
ret_val = alloc_card_memory(device, user_pg->cpages, pf_desc->n_pages, pf_desc->hugepages);
318396
if (ret_val) {
319397
dbg_info("could not get all card pages, %d\n", ret_val);
320-
goto fail_card_unmap;
398+
goto fail_card_alloc;
321399
}
322400
}
323401

@@ -332,25 +410,64 @@ struct user_pages* tlb_get_user_pages(struct vfpga_dev *device, struct pf_aligne
332410

333411
return user_pg;
334412

335-
fail_host_unmap:
336-
// Release the pages
337-
for(int i = 0; i < ret_val; i++) {
338-
put_page(user_pg->pages[i]);
413+
fail_host_alloc:
414+
// Unpin the pages
415+
#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 4, 0)
416+
unpin_user_pages(user_pg->pages, ret_val);
417+
#else
418+
for(int i = 0; i < ret_val; i++) {
419+
put_page(user_pg->pages[i]);
420+
}
421+
#endif
422+
423+
// Free the dynamically allocated memory
424+
vfree(user_pg->pages);
425+
vfree(user_pg->hpages);
426+
kfree(user_pg);
427+
428+
return NULL;
429+
430+
fail_dma_map:
431+
// Unmap DMA
432+
pg_inc = pf_desc->hugepages ? device->bd_data->n_pages_in_huge : 1;
433+
pg_size = pf_desc->hugepages ? device->bd_data->ltlb_meta->page_size : PAGE_SIZE;
434+
for (int i = 0; i < pf_desc->n_pages; i+=pg_inc) {
435+
dma_unmap_single(&device->bd_data->pci_dev->dev, user_pg->hpages[i], pg_size, DMA_BIDIRECTIONAL);
339436
}
340437

438+
// Unpin the pages
439+
#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 4, 0)
440+
unpin_user_pages(user_pg->pages, pf_desc->n_pages);
441+
#else
442+
for(int i = 0; i < pf_desc->n_pages; i++) {
443+
put_page(user_pg->pages[i]);
444+
}
445+
#endif
446+
341447
// Free the dynamically allocated memory
342448
vfree(user_pg->pages);
343449
vfree(user_pg->hpages);
344450
kfree(user_pg);
345451

346452
return NULL;
347453

348-
fail_card_unmap:
349-
// Release the pages
350-
for(int i = 0; i < user_pg->n_pages; i++) {
351-
put_page(user_pg->pages[i]);
454+
fail_card_alloc:
455+
// Unmap DMA
456+
pg_inc = pf_desc->hugepages ? device->bd_data->n_pages_in_huge : 1;
457+
pg_size = pf_desc->hugepages ? device->bd_data->ltlb_meta->page_size : PAGE_SIZE;
458+
for (int i = 0; i < pf_desc->n_pages; i+=pg_inc) {
459+
dma_unmap_single(&device->bd_data->pci_dev->dev, user_pg->hpages[i], pg_size, DMA_BIDIRECTIONAL);
352460
}
353461

462+
// Unpin the pages
463+
#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 4, 0)
464+
unpin_user_pages(user_pg->pages, pf_desc->n_pages);
465+
#else
466+
for(int i = 0; i < pf_desc->n_pages; i++) {
467+
put_page(user_pg->pages[i]);
468+
}
469+
#endif
470+
354471
// Free the dynamically allocated memory
355472
vfree(user_pg->pages);
356473
vfree(user_pg->hpages);
@@ -403,12 +520,23 @@ int tlb_put_user_pages(struct vfpga_dev *device, uint64_t vaddr, int32_t ctid, p
403520
SetPageDirty(tmp_entry->pages[i]);
404521
}
405522
}
406-
407-
// Release page
408-
for(int i = 0; i < tmp_entry->n_pages; i++) {
409-
put_page(tmp_entry->pages[i]);
523+
524+
// Unmap DMA
525+
int pg_inc = tmp_entry->huge ? device->bd_data->n_pages_in_huge : 1;
526+
int pg_size = tmp_entry->huge ? device->bd_data->ltlb_meta->page_size : PAGE_SIZE;
527+
for (int i = 0; i < tmp_entry->n_pages; i+=pg_inc) {
528+
dma_unmap_single(&device->bd_data->pci_dev->dev, tmp_entry->hpages[i], pg_size, DMA_BIDIRECTIONAL);
410529
}
411530

531+
// Unpin the pages
532+
#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 4, 0)
533+
unpin_user_pages(tmp_entry->pages, tmp_entry->n_pages);
534+
#else
535+
for(int i = 0; i < tmp_entry->n_pages; i++) {
536+
put_page(tmp_entry->pages[i]);
537+
}
538+
#endif
539+
412540
// Release memory to hold pages
413541
vfree(tmp_entry->pages);
414542
}
@@ -460,14 +588,29 @@ int tlb_put_user_pages_ctid(struct vfpga_dev *device, int32_t ctid, pid_t hpid,
460588
return -1;
461589
#endif
462590
} else {
463-
if(dirtied)
464-
for(i = 0; i < tmp_entry->n_pages; i++)
591+
if (dirtied) {
592+
for (i = 0; i < tmp_entry->n_pages; i++) {
465593
SetPageDirty(tmp_entry->pages[i]);
594+
}
595+
}
596+
597+
// Unmap DMA
598+
int pg_inc = tmp_entry->huge ? device->bd_data->n_pages_in_huge : 1;
599+
int pg_size = tmp_entry->huge ? device->bd_data->ltlb_meta->page_size : PAGE_SIZE;
600+
for (int i = 0; i < tmp_entry->n_pages; i+=pg_inc) {
601+
dma_unmap_single(&device->bd_data->pci_dev->dev, tmp_entry->hpages[i], pg_size, DMA_BIDIRECTIONAL);
602+
}
603+
604+
// Unpin the pages
605+
#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 4, 0)
606+
unpin_user_pages(tmp_entry->pages, tmp_entry->n_pages);
607+
#else
608+
for(int i = 0; i < tmp_entry->n_pages; i++) {
609+
put_page(tmp_entry->pages[i]);
610+
}
611+
#endif
612+
466613

467-
// Release host pages
468-
for(i = 0; i < tmp_entry->n_pages; i++)
469-
put_page(tmp_entry->pages[i]);
470-
471614
// Release memory to hold pages
472615
vfree(tmp_entry->pages);
473616
}

driver/src/vfpga/vfpga_isr.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -175,7 +175,7 @@ void vfpga_pfault_handler(struct work_struct *work) {
175175
ret_val = mmu_handler_gup(device, irq_pf->vaddr, irq_pf->len, irq_pf->ctid, irq_pf->stream, hpid);
176176
#endif
177177

178-
if (ret_val) {
178+
if (ret_val && ret_val != BUFF_NEEDS_EXP_SYNC_RET_CODE) {
179179
drop_irq_pfault(device, irq_pf->wr, irq_pf->ctid);
180180
pr_err("MMU handler error, vFPGA %d, error %d\n", device->id, ret_val);
181181
goto err_mmu;

driver/src/vfpga/vfpga_ops.c

Lines changed: 7 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -303,7 +303,7 @@ long vfpga_dev_ioctl(struct file *file, unsigned int command, unsigned long arg)
303303
#endif
304304
ret_val = mmu_handler_gup(device, tmp[0], tmp[1], ctid, true, hpid);
305305

306-
if (ret_val) {
306+
if (ret_val && ret_val != BUFF_NEEDS_EXP_SYNC_RET_CODE) {
307307
dbg_info("buffer could not be mapped, ret_val: %d\n", ret_val);
308308
}
309309

@@ -654,20 +654,18 @@ int vfpga_dev_mmap(struct file *file, struct vm_area_struct *vma) {
654654

655655
// Memory map writeback region
656656
if (vma->vm_pgoff == MMAP_WB) {
657-
set_memory_uc((uint64_t) device->wb_addr_virt, N_WB_PAGES);
658657
dbg_info(
659658
"fpga dev. %d, memory mapping writeback regions at %llx of size %lx\n",
660659
device->id, device->wb_phys_addr, WB_SIZE
661660
);
662-
int ret_val = remap_pfn_range(
663-
vma,
664-
vma->vm_start,
665-
(device->wb_phys_addr) >> PAGE_SHIFT,
666-
WB_SIZE,
667-
vma->vm_page_prot
661+
662+
// dma_mmap_coherent expects vma->vm_pgoff to be 0; hence MMAP_WB was changed to 0 and MMAP_CTRL to 3
663+
int ret_val = dma_mmap_coherent(
664+
&device->bd_data->pci_dev->dev, vma, (void *) device->wb_addr_virt, device->wb_phys_addr, PAGE_SIZE
668665
);
666+
669667
if (ret_val) {
670-
pr_warn("remap_pfn_range failed for writeback region, ret_val: %d\n", ret_val);
668+
pr_warn("dma_mmap_coherent failed for writeback region, ret_val: %d\n", ret_val);
671669
return -EIO;
672670
} else {
673671
return 0;

0 commit comments

Comments (0)