@@ -1613,7 +1613,7 @@ static void hv_pci_compose_compl(void *context, struct pci_response *resp,
1613
1613
}
1614
1614
1615
1615
static u32 hv_compose_msi_req_v1 (
1616
- struct pci_create_interrupt * int_pkt , const struct cpumask * affinity ,
1616
+ struct pci_create_interrupt * int_pkt ,
1617
1617
u32 slot , u8 vector , u16 vector_count )
1618
1618
{
1619
1619
int_pkt -> message_type .type = PCI_CREATE_INTERRUPT_MESSAGE ;
@@ -1631,6 +1631,35 @@ static u32 hv_compose_msi_req_v1(
1631
1631
return sizeof (* int_pkt );
1632
1632
}
1633
1633
1634
+ /*
1635
+ * The vCPU selected by hv_compose_multi_msi_req_get_cpu() and
1636
+ * hv_compose_msi_req_get_cpu() is a "dummy" vCPU because the final vCPU to be
1637
+ * interrupted is specified later in hv_irq_unmask() and communicated to Hyper-V
1638
+ * via the HVCALL_RETARGET_INTERRUPT hypercall. But the choice of dummy vCPU is
1639
+ * not irrelevant because Hyper-V chooses the physical CPU to handle the
1640
+ * interrupts based on the vCPU specified in the message sent to the vPCI VSP in
1641
+ * hv_compose_msi_msg(). Hyper-V's choice of pCPU is not visible to the guest,
1642
+ * but assigning too many vPCI device interrupts to the same pCPU can cause a
1643
+ * performance bottleneck. So we spread out the dummy vCPUs to influence Hyper-V
1644
+ * to spread out the pCPUs that it selects.
1645
+ *
1646
+ * For the single-MSI and MSI-X cases, it's OK for hv_compose_msi_req_get_cpu()
1647
+ * to always return the same dummy vCPU, because a second call to
1648
+ * hv_compose_msi_msg() contains the "real" vCPU, causing Hyper-V to choose a
1649
+ * new pCPU for the interrupt. But for the multi-MSI case, the second call to
1650
+ * hv_compose_msi_msg() exits without sending a message to the vPCI VSP, so the
1651
+ * original dummy vCPU is used. This dummy vCPU must be round-robin'ed so that
1652
+ * the pCPUs are spread out. All interrupts for a multi-MSI device end up using
1653
+ * the same pCPU, even though the vCPUs will be spread out by later calls
1654
+ * to hv_irq_unmask(), but that is the best we can do now.
1655
+ *
1656
+ * With Hyper-V in Nov 2022, the HVCALL_RETARGET_INTERRUPT hypercall does *not*
1657
+ * cause Hyper-V to reselect the pCPU based on the specified vCPU. Such an
1658
+ * enhancement is planned for a future version. With that enhancement, the
1659
+ * dummy vCPU selection won't matter, and interrupts for the same multi-MSI
1660
+ * device will be spread across multiple pCPUs.
1661
+ */
1662
+
1634
1663
/*
1635
1664
* Create MSI w/ dummy vCPU set targeting just one vCPU, overwritten
1636
1665
* by subsequent retarget in hv_irq_unmask().
@@ -1640,18 +1669,39 @@ static int hv_compose_msi_req_get_cpu(const struct cpumask *affinity)
1640
1669
return cpumask_first_and (affinity , cpu_online_mask );
1641
1670
}
1642
1671
1643
- static u32 hv_compose_msi_req_v2 (
1644
- struct pci_create_interrupt2 * int_pkt , const struct cpumask * affinity ,
1645
- u32 slot , u8 vector , u16 vector_count )
1672
+ /*
1673
+ * Make sure the dummy vCPU values for multi-MSI don't all point to vCPU0.
+ *
+ * Each call advances a function-local static cursor (cpu_next) over
+ * cpu_online_mask via cpumask_next_wrap() and returns the CPU it lands on,
+ * so successive calls hand out different online CPUs. The cursor is shared
+ * by all callers and is only read and updated while holding a function-local
+ * spinlock taken with interrupts saved/disabled.
+ *
+ * Takes no arguments. Returns the selected CPU number as an int.
+ * NOTE(review): the exact wrap-around/termination behaviour depends on the
+ * cpumask_next_wrap() variant in this tree - confirm against its definition.
1674
+ */
1675
+ static int hv_compose_multi_msi_req_get_cpu (void )
1646
1676
{
1677
+ static DEFINE_SPINLOCK (multi_msi_cpu_lock ); /* guards cpu_next below */
1678
+
1679
+ /* -1 means starting with CPU 0 */
1680
+ static int cpu_next = -1 ;
1681
+
1682
+ unsigned long flags ; /* saved IRQ state for the irqsave/irqrestore pair */
1647
1683
int cpu ;
1648
1684
1685
+ spin_lock_irqsave (& multi_msi_cpu_lock , flags );
1686
+
1687
+ cpu_next = cpumask_next_wrap (cpu_next , cpu_online_mask , nr_cpu_ids ,
1688
+ false); /* advance the shared cursor within the online CPU mask */
1689
+ cpu = cpu_next ; /* snapshot under the lock so the return value is this call's pick */
1690
+
1691
+ spin_unlock_irqrestore (& multi_msi_cpu_lock , flags );
1692
+
1693
+ return cpu ;
1694
+ }
1695
+
1696
+ static u32 hv_compose_msi_req_v2 (
1697
+ struct pci_create_interrupt2 * int_pkt , int cpu ,
1698
+ u32 slot , u8 vector , u16 vector_count )
1699
+ {
1649
1700
int_pkt -> message_type .type = PCI_CREATE_INTERRUPT_MESSAGE2 ;
1650
1701
int_pkt -> wslot .slot = slot ;
1651
1702
int_pkt -> int_desc .vector = vector ;
1652
1703
int_pkt -> int_desc .vector_count = vector_count ;
1653
1704
int_pkt -> int_desc .delivery_mode = DELIVERY_MODE ;
1654
- cpu = hv_compose_msi_req_get_cpu (affinity );
1655
1705
int_pkt -> int_desc .processor_array [0 ] =
1656
1706
hv_cpu_number_to_vp_number (cpu );
1657
1707
int_pkt -> int_desc .processor_count = 1 ;
@@ -1660,18 +1710,15 @@ static u32 hv_compose_msi_req_v2(
1660
1710
}
1661
1711
1662
1712
static u32 hv_compose_msi_req_v3 (
1663
- struct pci_create_interrupt3 * int_pkt , const struct cpumask * affinity ,
1713
+ struct pci_create_interrupt3 * int_pkt , int cpu ,
1664
1714
u32 slot , u32 vector , u16 vector_count )
1665
1715
{
1666
- int cpu ;
1667
-
1668
1716
int_pkt -> message_type .type = PCI_CREATE_INTERRUPT_MESSAGE3 ;
1669
1717
int_pkt -> wslot .slot = slot ;
1670
1718
int_pkt -> int_desc .vector = vector ;
1671
1719
int_pkt -> int_desc .reserved = 0 ;
1672
1720
int_pkt -> int_desc .vector_count = vector_count ;
1673
1721
int_pkt -> int_desc .delivery_mode = DELIVERY_MODE ;
1674
- cpu = hv_compose_msi_req_get_cpu (affinity );
1675
1722
int_pkt -> int_desc .processor_array [0 ] =
1676
1723
hv_cpu_number_to_vp_number (cpu );
1677
1724
int_pkt -> int_desc .processor_count = 1 ;
@@ -1715,20 +1762,25 @@ static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
1715
1762
struct pci_create_interrupt3 v3 ;
1716
1763
} int_pkts ;
1717
1764
} __packed ctxt ;
1765
+ bool multi_msi ;
1718
1766
u64 trans_id ;
1719
1767
u32 size ;
1720
1768
int ret ;
1769
+ int cpu ;
1770
+
1771
+ msi_desc = irq_data_get_msi_desc (data );
1772
+ multi_msi = !msi_desc -> pci .msi_attrib .is_msix &&
1773
+ msi_desc -> nvec_used > 1 ;
1721
1774
1722
1775
/* Reuse the previous allocation */
1723
- if (data -> chip_data ) {
1776
+ if (data -> chip_data && multi_msi ) {
1724
1777
int_desc = data -> chip_data ;
1725
1778
msg -> address_hi = int_desc -> address >> 32 ;
1726
1779
msg -> address_lo = int_desc -> address & 0xffffffff ;
1727
1780
msg -> data = int_desc -> data ;
1728
1781
return ;
1729
1782
}
1730
1783
1731
- msi_desc = irq_data_get_msi_desc (data );
1732
1784
pdev = msi_desc_to_pci_dev (msi_desc );
1733
1785
dest = irq_data_get_effective_affinity_mask (data );
1734
1786
pbus = pdev -> bus ;
@@ -1738,11 +1790,18 @@ static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
1738
1790
if (!hpdev )
1739
1791
goto return_null_message ;
1740
1792
1793
+ /* Free any previous message that might have already been composed. */
1794
+ if (data -> chip_data && !multi_msi ) {
1795
+ int_desc = data -> chip_data ;
1796
+ data -> chip_data = NULL ;
1797
+ hv_int_desc_free (hpdev , int_desc );
1798
+ }
1799
+
1741
1800
int_desc = kzalloc (sizeof (* int_desc ), GFP_ATOMIC );
1742
1801
if (!int_desc )
1743
1802
goto drop_reference ;
1744
1803
1745
- if (! msi_desc -> pci . msi_attrib . is_msix && msi_desc -> nvec_used > 1 ) {
1804
+ if (multi_msi ) {
1746
1805
/*
1747
1806
* If this is not the first MSI of Multi MSI, we already have
1748
1807
* a mapping. Can exit early.
@@ -1767,9 +1826,11 @@ static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
1767
1826
*/
1768
1827
vector = 32 ;
1769
1828
vector_count = msi_desc -> nvec_used ;
1829
+ cpu = hv_compose_multi_msi_req_get_cpu ();
1770
1830
} else {
1771
1831
vector = hv_msi_get_int_vector (data );
1772
1832
vector_count = 1 ;
1833
+ cpu = hv_compose_msi_req_get_cpu (dest );
1773
1834
}
1774
1835
1775
1836
/*
@@ -1785,7 +1846,6 @@ static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
1785
1846
switch (hbus -> protocol_version ) {
1786
1847
case PCI_PROTOCOL_VERSION_1_1 :
1787
1848
size = hv_compose_msi_req_v1 (& ctxt .int_pkts .v1 ,
1788
- dest ,
1789
1849
hpdev -> desc .win_slot .slot ,
1790
1850
(u8 )vector ,
1791
1851
vector_count );
@@ -1794,15 +1854,15 @@ static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
1794
1854
case PCI_PROTOCOL_VERSION_1_2 :
1795
1855
case PCI_PROTOCOL_VERSION_1_3 :
1796
1856
size = hv_compose_msi_req_v2 (& ctxt .int_pkts .v2 ,
1797
- dest ,
1857
+ cpu ,
1798
1858
hpdev -> desc .win_slot .slot ,
1799
1859
(u8 )vector ,
1800
1860
vector_count );
1801
1861
break ;
1802
1862
1803
1863
case PCI_PROTOCOL_VERSION_1_4 :
1804
1864
size = hv_compose_msi_req_v3 (& ctxt .int_pkts .v3 ,
1805
- dest ,
1865
+ cpu ,
1806
1866
hpdev -> desc .win_slot .slot ,
1807
1867
vector ,
1808
1868
vector_count );
0 commit comments