Commit 40f5175

iommufd: Implement sw_msi support natively
iommufd has a model where the iommu_domain can be changed while the VFIO device is attached. In this case, the MSI should continue to work. This corner case has not worked because the dma-iommu implementation of sw_msi is tied to a single domain.

Implement the sw_msi mapping directly and use a global per-fd table to associate assigned IOVA to the MSI pages. This allows the MSI pages to be loaded into a domain before it is attached, ensuring that MSI is not disrupted.

Link: https://patch.msgid.link/r/e13d23eeacd67c0a692fc468c85b483f4dd51c57.1740014950.git.nicolinc@nvidia.com
Signed-off-by: Nicolin Chen <[email protected]>
Signed-off-by: Jason Gunthorpe <[email protected]>
1 parent 748706d commit 40f5175
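To make the fixed corner case concrete, here is a hedged user-space sketch: a VFIO device bound to an iommufd stays attached while userspace swaps the page table it is attached to. The ioctls are the existing iommufd/VFIO cdev uAPI; the device path and surrounding setup are illustrative assumptions, and error handling is elided.

/* Hedged sketch: swap the attached page table while MSIs stay live.
 * The /dev/vfio/devices/vfio0 path is an example; error checks elided. */
#include <fcntl.h>
#include <sys/ioctl.h>
#include <linux/iommufd.h>
#include <linux/vfio.h>

int main(void)
{
        int iommufd = open("/dev/iommu", O_RDWR);
        int devfd = open("/dev/vfio/devices/vfio0", O_RDWR);

        struct vfio_device_bind_iommufd bind = {
                .argsz = sizeof(bind), .iommufd = iommufd,
        };
        ioctl(devfd, VFIO_DEVICE_BIND_IOMMUFD, &bind);

        struct iommu_ioas_alloc alloc_cmd = { .size = sizeof(alloc_cmd) };
        ioctl(iommufd, IOMMU_IOAS_ALLOC, &alloc_cmd);

        /* First attach; once an irq is requested, iommufd_sw_msi() assigns
         * a per-fd IOVA for the MSI doorbell page in this domain. */
        struct vfio_device_attach_iommufd_pt attach = {
                .argsz = sizeof(attach), .pt_id = alloc_cmd.out_ioas_id,
        };
        ioctl(devfd, VFIO_DEVICE_ATTACH_IOMMUFD_PT, &attach);

        /* ... MSIs are now in use by the device ... */

        /* Attach a second IOAS. iommufd_group_setup_msi() replays the
         * group's required MSI mappings into the new domain at the same
         * IOVA, so interrupts are not disrupted by the switch. */
        struct iommu_ioas_alloc alloc2 = { .size = sizeof(alloc2) };
        ioctl(iommufd, IOMMU_IOAS_ALLOC, &alloc2);
        attach.pt_id = alloc2.out_ioas_id;
        ioctl(devfd, VFIO_DEVICE_ATTACH_IOMMUFD_PT, &attach);
        return 0;
}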

File tree: 4 files changed (+173, -23 lines)

drivers/iommu/iommufd/device.c

Lines changed: 139 additions & 22 deletions
@@ -5,6 +5,7 @@
 #include <linux/iommufd.h>
 #include <linux/slab.h>
 #include <uapi/linux/iommufd.h>
+#include <linux/msi.h>
 
 #include "../iommu-priv.h"
 #include "io_pagetable.h"
@@ -293,36 +294,152 @@ u32 iommufd_device_to_id(struct iommufd_device *idev)
 }
 EXPORT_SYMBOL_NS_GPL(iommufd_device_to_id, "IOMMUFD");
 
+/*
+ * Get a iommufd_sw_msi_map for the msi physical address requested by the irq
+ * layer. The mapping to IOVA is global to the iommufd file descriptor, every
+ * domain that is attached to a device using the same MSI parameters will use
+ * the same IOVA.
+ */
+static __maybe_unused struct iommufd_sw_msi_map *
+iommufd_sw_msi_get_map(struct iommufd_ctx *ictx, phys_addr_t msi_addr,
+                       phys_addr_t sw_msi_start)
+{
+        struct iommufd_sw_msi_map *cur;
+        unsigned int max_pgoff = 0;
+
+        lockdep_assert_held(&ictx->sw_msi_lock);
+
+        list_for_each_entry(cur, &ictx->sw_msi_list, sw_msi_item) {
+                if (cur->sw_msi_start != sw_msi_start)
+                        continue;
+                max_pgoff = max(max_pgoff, cur->pgoff + 1);
+                if (cur->msi_addr == msi_addr)
+                        return cur;
+        }
+
+        if (ictx->sw_msi_id >=
+            BITS_PER_BYTE * sizeof_field(struct iommufd_sw_msi_maps, bitmap))
+                return ERR_PTR(-EOVERFLOW);
+
+        cur = kzalloc(sizeof(*cur), GFP_KERNEL);
+        if (!cur)
+                return ERR_PTR(-ENOMEM);
+
+        cur->sw_msi_start = sw_msi_start;
+        cur->msi_addr = msi_addr;
+        cur->pgoff = max_pgoff;
+        cur->id = ictx->sw_msi_id++;
+        list_add_tail(&cur->sw_msi_item, &ictx->sw_msi_list);
+        return cur;
+}
+
+static int iommufd_sw_msi_install(struct iommufd_ctx *ictx,
+                                  struct iommufd_hwpt_paging *hwpt_paging,
+                                  struct iommufd_sw_msi_map *msi_map)
+{
+        unsigned long iova;
+
+        lockdep_assert_held(&ictx->sw_msi_lock);
+
+        iova = msi_map->sw_msi_start + msi_map->pgoff * PAGE_SIZE;
+        if (!test_bit(msi_map->id, hwpt_paging->present_sw_msi.bitmap)) {
+                int rc;
+
+                rc = iommu_map(hwpt_paging->common.domain, iova,
+                               msi_map->msi_addr, PAGE_SIZE,
+                               IOMMU_WRITE | IOMMU_READ | IOMMU_MMIO,
+                               GFP_KERNEL_ACCOUNT);
+                if (rc)
+                        return rc;
+                __set_bit(msi_map->id, hwpt_paging->present_sw_msi.bitmap);
+        }
+        return 0;
+}
+
+/*
+ * Called by the irq code if the platform translates the MSI address through the
+ * IOMMU. msi_addr is the physical address of the MSI page. iommufd will
+ * allocate a fd global iova for the physical page that is the same on all
+ * domains and devices.
+ */
+#ifdef CONFIG_IRQ_MSI_IOMMU
+int iommufd_sw_msi(struct iommu_domain *domain, struct msi_desc *desc,
+                   phys_addr_t msi_addr)
+{
+        struct device *dev = msi_desc_to_dev(desc);
+        struct iommufd_hwpt_paging *hwpt_paging;
+        struct iommu_attach_handle *raw_handle;
+        struct iommufd_attach_handle *handle;
+        struct iommufd_sw_msi_map *msi_map;
+        struct iommufd_ctx *ictx;
+        unsigned long iova;
+        int rc;
+
+        /*
+         * It is safe to call iommu_attach_handle_get() here because the iommu
+         * core code invokes this under the group mutex which also prevents any
+         * change of the attach handle for the duration of this function.
+         */
+        iommu_group_mutex_assert(dev);
+
+        raw_handle =
+                iommu_attach_handle_get(dev->iommu_group, IOMMU_NO_PASID, 0);
+        if (IS_ERR(raw_handle))
+                return 0;
+        hwpt_paging = find_hwpt_paging(domain->iommufd_hwpt);
+
+        handle = to_iommufd_handle(raw_handle);
+        /* No IOMMU_RESV_SW_MSI means no change to the msi_msg */
+        if (handle->idev->igroup->sw_msi_start == PHYS_ADDR_MAX)
+                return 0;
+
+        ictx = handle->idev->ictx;
+        guard(mutex)(&ictx->sw_msi_lock);
+        /*
+         * The input msi_addr is the exact byte offset of the MSI doorbell, we
+         * assume the caller has checked that it is contained with a MMIO region
+         * that is secure to map at PAGE_SIZE.
+         */
+        msi_map = iommufd_sw_msi_get_map(handle->idev->ictx,
+                                         msi_addr & PAGE_MASK,
+                                         handle->idev->igroup->sw_msi_start);
+        if (IS_ERR(msi_map))
+                return PTR_ERR(msi_map);
+
+        rc = iommufd_sw_msi_install(ictx, hwpt_paging, msi_map);
+        if (rc)
+                return rc;
+        __set_bit(msi_map->id, handle->idev->igroup->required_sw_msi.bitmap);
+
+        iova = msi_map->sw_msi_start + msi_map->pgoff * PAGE_SIZE;
+        msi_desc_set_iommu_msi_iova(desc, iova, PAGE_SHIFT);
+        return 0;
+}
+#endif
+
 static int iommufd_group_setup_msi(struct iommufd_group *igroup,
                                    struct iommufd_hwpt_paging *hwpt_paging)
 {
-        phys_addr_t sw_msi_start = igroup->sw_msi_start;
-        int rc;
+        struct iommufd_ctx *ictx = igroup->ictx;
+        struct iommufd_sw_msi_map *cur;
+
+        if (igroup->sw_msi_start == PHYS_ADDR_MAX)
+                return 0;
 
         /*
-         * If the IOMMU driver gives a IOMMU_RESV_SW_MSI then it is asking us to
-         * call iommu_get_msi_cookie() on its behalf. This is necessary to setup
-         * the MSI window so iommu_dma_prepare_msi() can install pages into our
-         * domain after request_irq(). If it is not done interrupts will not
-         * work on this domain.
-         *
-         * FIXME: This is conceptually broken for iommufd since we want to allow
-         * userspace to change the domains, eg switch from an identity IOAS to a
-         * DMA IOAS. There is currently no way to create a MSI window that
-         * matches what the IRQ layer actually expects in a newly created
-         * domain.
+         * Install all the MSI pages the device has been using into the domain
          */
-        if (sw_msi_start != PHYS_ADDR_MAX && !hwpt_paging->msi_cookie) {
-                rc = iommu_get_msi_cookie(hwpt_paging->common.domain,
-                                          sw_msi_start);
+        guard(mutex)(&ictx->sw_msi_lock);
+        list_for_each_entry(cur, &ictx->sw_msi_list, sw_msi_item) {
+                int rc;
+
+                if (cur->sw_msi_start != igroup->sw_msi_start ||
+                    !test_bit(cur->id, igroup->required_sw_msi.bitmap))
+                        continue;
+
+                rc = iommufd_sw_msi_install(ictx, hwpt_paging, cur);
                 if (rc)
                         return rc;
-
-                /*
-                 * iommu_get_msi_cookie() can only be called once per domain,
-                 * it returns -EBUSY on later calls.
-                 */
-                hwpt_paging->msi_cookie = true;
         }
         return 0;
 }
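The net effect of iommufd_sw_msi_get_map() is a simple, stable allocator: every distinct doorbell page under the same IOMMU_RESV_SW_MSI window takes the next free page slot, so iova = sw_msi_start + pgoff * PAGE_SIZE is identical in every domain attached through this fd. A minimal user-space model of that rule (illustrative assumption: single window, fixed-size table, no locking, example addresses):

/* User-space model of the per-fd slot assignment; not kernel code. */
#include <assert.h>
#include <stdint.h>

#define MODEL_PAGE_SHIFT 12

struct model_map { uint64_t msi_addr; unsigned int pgoff; };

static struct model_map maps[64]; /* mirrors the 64-bit id bitmap cap */
static unsigned int nr_maps;

static uint64_t model_get_iova(uint64_t sw_msi_start, uint64_t msi_addr)
{
        /* A repeated lookup returns the same slot... */
        for (unsigned int i = 0; i != nr_maps; i++)
                if (maps[i].msi_addr == msi_addr)
                        return sw_msi_start +
                               ((uint64_t)maps[i].pgoff << MODEL_PAGE_SHIFT);
        /* ...and a new doorbell page gets the next free slot. */
        maps[nr_maps] = (struct model_map){ .msi_addr = msi_addr,
                                            .pgoff = nr_maps };
        return sw_msi_start + ((uint64_t)nr_maps++ << MODEL_PAGE_SHIFT);
}

int main(void)
{
        /* Two example doorbell pages under an example 0x8000000 window */
        assert(model_get_iova(0x8000000, 0xa0000000) == 0x8000000);
        assert(model_get_iova(0x8000000, 0xa0010000) == 0x8001000);
        /* Same doorbell again: same IOVA, regardless of attached domain */
        assert(model_get_iova(0x8000000, 0xa0000000) == 0x8000000);
        return 0;
}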

drivers/iommu/iommufd/hw_pagetable.c

Lines changed: 3 additions & 0 deletions
@@ -156,6 +156,7 @@ iommufd_hwpt_paging_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas,
                         goto out_abort;
                 }
         }
+        iommu_domain_set_sw_msi(hwpt->domain, iommufd_sw_msi);
 
         /*
          * Set the coherency mode before we do iopt_table_add_domain() as some
@@ -251,6 +252,7 @@ iommufd_hwpt_nested_alloc(struct iommufd_ctx *ictx,
                 goto out_abort;
         }
         hwpt->domain->owner = ops;
+        iommu_domain_set_sw_msi(hwpt->domain, iommufd_sw_msi);
 
         if (WARN_ON_ONCE(hwpt->domain->type != IOMMU_DOMAIN_NESTED)) {
                 rc = -EINVAL;
@@ -307,6 +309,7 @@ iommufd_viommu_alloc_hwpt_nested(struct iommufd_viommu *viommu, u32 flags,
                 goto out_abort;
         }
         hwpt->domain->owner = viommu->iommu_dev->ops;
+        iommu_domain_set_sw_msi(hwpt->domain, iommufd_sw_msi);
 
         if (WARN_ON_ONCE(hwpt->domain->type != IOMMU_DOMAIN_NESTED)) {
                 rc = -EINVAL;
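All three allocation paths register the same callback. iommu_domain_set_sw_msi() itself comes from the preparatory patches in this series; as a reading aid, it is essentially a setter that stashes the callback on the domain so the irq layer can dispatch to it. A sketch of that shape, assumed rather than quoted from this commit:

/* Assumed shape of the hook being wired up here (from the preparatory
 * series, not this commit); shown for orientation only. */
#ifdef CONFIG_IRQ_MSI_IOMMU
static inline void iommu_domain_set_sw_msi(
        struct iommu_domain *domain,
        int (*sw_msi)(struct iommu_domain *domain, struct msi_desc *desc,
                      phys_addr_t msi_addr))
{
        domain->sw_msi = sw_msi;
}
#endif

/* The irq layer then reaches iommufd_sw_msi() roughly like this: */
int iommu_dma_prepare_msi(struct msi_desc *desc, phys_addr_t msi_addr)
{
        struct device *dev = msi_desc_to_dev(desc);
        struct iommu_domain *domain = iommu_get_domain_for_dev(dev);

        if (domain && domain->sw_msi)
                return domain->sw_msi(domain, desc, msi_addr);
        return 0;
}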

drivers/iommu/iommufd/iommufd_private.h

Lines changed: 22 additions & 1 deletion
@@ -19,13 +19,33 @@ struct iommu_group;
 struct iommu_option;
 struct iommufd_device;
 
+struct iommufd_sw_msi_map {
+        struct list_head sw_msi_item;
+        phys_addr_t sw_msi_start;
+        phys_addr_t msi_addr;
+        unsigned int pgoff;
+        unsigned int id;
+};
+
+/* Bitmap of struct iommufd_sw_msi_map::id */
+struct iommufd_sw_msi_maps {
+        DECLARE_BITMAP(bitmap, 64);
+};
+
+int iommufd_sw_msi(struct iommu_domain *domain, struct msi_desc *desc,
+                   phys_addr_t msi_addr);
+
 struct iommufd_ctx {
         struct file *file;
         struct xarray objects;
         struct xarray groups;
         wait_queue_head_t destroy_wait;
         struct rw_semaphore ioas_creation_lock;
 
+        struct mutex sw_msi_lock;
+        struct list_head sw_msi_list;
+        unsigned int sw_msi_id;
+
         u8 account_mode;
         /* Compatibility with VFIO no iommu */
         u8 no_iommu_mode;
@@ -283,10 +303,10 @@ struct iommufd_hwpt_paging {
         struct iommufd_ioas *ioas;
         bool auto_domain : 1;
         bool enforce_cache_coherency : 1;
-        bool msi_cookie : 1;
         bool nest_parent : 1;
         /* Head at iommufd_ioas::hwpt_list */
         struct list_head hwpt_item;
+        struct iommufd_sw_msi_maps present_sw_msi;
 };
 
 struct iommufd_hwpt_nested {
@@ -383,6 +403,7 @@ struct iommufd_group {
         struct iommu_group *group;
         struct iommufd_hw_pagetable *hwpt;
         struct list_head device_list;
+        struct iommufd_sw_msi_maps required_sw_msi;
         phys_addr_t sw_msi_start;
 };
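The two new bitmap fields split the bookkeeping: required_sw_msi (per iommufd_group) records which sw_msi ids the group's devices have programmed, while present_sw_msi (per iommufd_hwpt_paging) records which ids are already mapped into that domain. Replay on attach means mapping every required id that is not yet present, which is what makes iommufd_sw_msi_install() idempotent. A user-space model of that logic (illustrative only; the kernel uses DECLARE_BITMAP with test_bit/__set_bit):

/* User-space model of the required/present bitmap interplay; not kernel
 * code. Each bit position stands for one iommufd_sw_msi_map::id. */
#include <stdint.h>
#include <stdio.h>

static void replay(uint64_t required, uint64_t *present)
{
        for (unsigned int id = 0; id != 64; id++) {
                uint64_t bit = 1ULL << id;

                /* Map only ids the group needs and the domain lacks */
                if ((required & bit) && !(*present & bit)) {
                        printf("iommu_map() sw_msi id %u into domain\n", id);
                        *present |= bit;
                }
        }
}

int main(void)
{
        uint64_t present = 0;   /* fresh hwpt: nothing mapped yet */
        uint64_t required = 0x5;/* group uses sw_msi ids 0 and 2 */

        replay(required, &present); /* maps ids 0 and 2 */
        replay(required, &present); /* idempotent: nothing to do */
        return 0;
}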

drivers/iommu/iommufd/main.c

Lines changed: 9 additions & 0 deletions
@@ -227,13 +227,17 @@ static int iommufd_fops_open(struct inode *inode, struct file *filp)
         xa_init(&ictx->groups);
         ictx->file = filp;
         init_waitqueue_head(&ictx->destroy_wait);
+        mutex_init(&ictx->sw_msi_lock);
+        INIT_LIST_HEAD(&ictx->sw_msi_list);
         filp->private_data = ictx;
         return 0;
 }
 
 static int iommufd_fops_release(struct inode *inode, struct file *filp)
 {
         struct iommufd_ctx *ictx = filp->private_data;
+        struct iommufd_sw_msi_map *next;
+        struct iommufd_sw_msi_map *cur;
         struct iommufd_object *obj;
 
         /*
@@ -262,6 +266,11 @@ static int iommufd_fops_release(struct inode *inode, struct file *filp)
                         break;
         }
         WARN_ON(!xa_empty(&ictx->groups));
+
+        mutex_destroy(&ictx->sw_msi_lock);
+        list_for_each_entry_safe(cur, next, &ictx->sw_msi_list, sw_msi_item)
+                kfree(cur);
+
         kfree(ictx);
         return 0;
 }
