Skip to content

Commit 07d4419

Browse files
msudheer337Jeff Kirsher
authored andcommitted
i40e/i40evf: Detect and recover hung queue scenario
In VFs, there is a known issue which can cause writebacks to not occur when interrupts are disabled and there are fewer than 4 descriptors, resulting in a TX timeout. A timeout can also occur due to a lost interrupt. The current implementation for detecting and recovering from hung queues in the PF is problematic because it actively encourages lost interrupts. By triggering a SW interrupt, interrupts are forced on. If we are already in napi_poll and an interrupt fires, napi_poll will not be rescheduled and the interrupt is effectively lost, thereby potentially *causing* hung queues. This patch checks whether packets are being processed between every watchdog cycle to identify a potentially hung queue, and triggers a SW interrupt only for that particular queue. Signed-off-by: Sudheer Mogilappagari <[email protected]> Tested-by: Andrew Bowers <[email protected]> Signed-off-by: Jeff Kirsher <[email protected]>
1 parent d95cd48 commit 07d4419

File tree

6 files changed

+115
-99
lines changed

6 files changed

+115
-99
lines changed

drivers/net/ethernet/intel/i40e/i40e_main.c

Lines changed: 1 addition & 99 deletions
Original file line numberDiff line numberDiff line change
@@ -4876,104 +4876,6 @@ static int i40e_pf_wait_queues_disabled(struct i40e_pf *pf)
48764876

48774877
#endif
48784878

4879-
/**
4880-
* i40e_detect_recover_hung_queue - Function to detect and recover hung_queue
4881-
* @q_idx: TX queue number
4882-
* @vsi: Pointer to VSI struct
4883-
*
4884-
* This function checks specified queue for given VSI. Detects hung condition.
4885-
* We proactively detect hung TX queues by checking if interrupts are disabled
4886-
* but there are pending descriptors. If it appears hung, attempt to recover
4887-
* by triggering a SW interrupt.
4888-
**/
4889-
static void i40e_detect_recover_hung_queue(int q_idx, struct i40e_vsi *vsi)
4890-
{
4891-
struct i40e_ring *tx_ring = NULL;
4892-
struct i40e_pf *pf;
4893-
u32 val, tx_pending;
4894-
int i;
4895-
4896-
pf = vsi->back;
4897-
4898-
/* now that we have an index, find the tx_ring struct */
4899-
for (i = 0; i < vsi->num_queue_pairs; i++) {
4900-
if (vsi->tx_rings[i] && vsi->tx_rings[i]->desc) {
4901-
if (q_idx == vsi->tx_rings[i]->queue_index) {
4902-
tx_ring = vsi->tx_rings[i];
4903-
break;
4904-
}
4905-
}
4906-
}
4907-
4908-
if (!tx_ring)
4909-
return;
4910-
4911-
/* Read interrupt register */
4912-
if (pf->flags & I40E_FLAG_MSIX_ENABLED)
4913-
val = rd32(&pf->hw,
4914-
I40E_PFINT_DYN_CTLN(tx_ring->q_vector->v_idx +
4915-
tx_ring->vsi->base_vector - 1));
4916-
else
4917-
val = rd32(&pf->hw, I40E_PFINT_DYN_CTL0);
4918-
4919-
tx_pending = i40e_get_tx_pending(tx_ring);
4920-
4921-
/* Interrupts are disabled and TX pending is non-zero,
4922-
* trigger the SW interrupt (don't wait). Worst case
4923-
* there will be one extra interrupt which may result
4924-
* into not cleaning any queues because queues are cleaned.
4925-
*/
4926-
if (tx_pending && (!(val & I40E_PFINT_DYN_CTLN_INTENA_MASK)))
4927-
i40e_force_wb(vsi, tx_ring->q_vector);
4928-
}
4929-
4930-
/**
4931-
* i40e_detect_recover_hung - Function to detect and recover hung_queues
4932-
* @pf: pointer to PF struct
4933-
*
4934-
* LAN VSI has netdev and netdev has TX queues. This function is to check
4935-
* each of those TX queues if they are hung, trigger recovery by issuing
4936-
* SW interrupt.
4937-
**/
4938-
static void i40e_detect_recover_hung(struct i40e_pf *pf)
4939-
{
4940-
struct net_device *netdev;
4941-
struct i40e_vsi *vsi;
4942-
unsigned int i;
4943-
4944-
/* Only for LAN VSI */
4945-
vsi = pf->vsi[pf->lan_vsi];
4946-
4947-
if (!vsi)
4948-
return;
4949-
4950-
/* Make sure, VSI state is not DOWN/RECOVERY_PENDING */
4951-
if (test_bit(__I40E_VSI_DOWN, vsi->back->state) ||
4952-
test_bit(__I40E_RESET_RECOVERY_PENDING, vsi->back->state))
4953-
return;
4954-
4955-
/* Make sure type is MAIN VSI */
4956-
if (vsi->type != I40E_VSI_MAIN)
4957-
return;
4958-
4959-
netdev = vsi->netdev;
4960-
if (!netdev)
4961-
return;
4962-
4963-
/* Bail out if netif_carrier is not OK */
4964-
if (!netif_carrier_ok(netdev))
4965-
return;
4966-
4967-
/* Go thru' TX queues for netdev */
4968-
for (i = 0; i < netdev->num_tx_queues; i++) {
4969-
struct netdev_queue *q;
4970-
4971-
q = netdev_get_tx_queue(netdev, i);
4972-
if (q)
4973-
i40e_detect_recover_hung_queue(i, vsi);
4974-
}
4975-
}
4976-
49774879
/**
49784880
* i40e_get_iscsi_tc_map - Return TC map for iSCSI APP
49794881
* @pf: pointer to PF
@@ -9695,7 +9597,7 @@ static void i40e_service_task(struct work_struct *work)
96959597
if (test_and_set_bit(__I40E_SERVICE_SCHED, pf->state))
96969598
return;
96979599

9698-
i40e_detect_recover_hung(pf);
9600+
i40e_detect_recover_hung(pf->vsi[pf->lan_vsi]);
96999601
i40e_sync_filters_subtask(pf);
97009602
i40e_reset_subtask(pf);
97019603
i40e_handle_mdd_event(pf);

drivers/net/ethernet/intel/i40e/i40e_txrx.c

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -726,6 +726,59 @@ u32 i40e_get_tx_pending(struct i40e_ring *ring)
726726
return 0;
727727
}
728728

729+
/**
730+
* i40e_detect_recover_hung - detect stalled TX queues and kick them
731+
* @vsi: pointer to vsi struct with tx queues; may be NULL (no-op)
732+
*
733+
* Does nothing if the VSI is down, has no netdev, or carrier is off. Otherwise
734+
* each TX ring whose packet count matches the saved one gets i40e_force_wb().
735+
**/
736+
void i40e_detect_recover_hung(struct i40e_vsi *vsi)
737+
{
738+
struct i40e_ring *tx_ring = NULL;
739+
struct net_device *netdev;
740+
unsigned int i;
741+
int packets;
742+
743+
if (!vsi)
744+
return;
745+
746+
if (test_bit(__I40E_VSI_DOWN, vsi->state))
747+
return;
748+
749+
netdev = vsi->netdev;
750+
if (!netdev)
751+
return;
752+
753+
if (!netif_carrier_ok(netdev))
754+
return;
755+
756+
for (i = 0; i < vsi->num_queue_pairs; i++) {
757+
tx_ring = vsi->tx_rings[i];
758+
if (tx_ring && tx_ring->desc) {
759+
/* If the packet counter has not changed since it was
760+
* saved, the queue is likely stalled, so force an
761+
* interrupt for this queue only.
762+
*
763+
* prev_pkt_ctr is -1 when there was no pending work;
764+
* packets is masked with INT_MAX so it never equals -1.
765+
*/
766+
packets = tx_ring->stats.packets & INT_MAX;
767+
if (tx_ring->tx_stats.prev_pkt_ctr == packets) {
768+
i40e_force_wb(vsi, tx_ring->q_vector);
769+
continue;
770+
}
771+
772+
/* Memory barrier between read of packet count and call
773+
* to i40e_get_tx_pending(); save the count only if work is pending
774+
*/
775+
smp_rmb();
776+
tx_ring->tx_stats.prev_pkt_ctr =
777+
i40e_get_tx_pending(tx_ring) ? packets : -1;
778+
}
779+
}
780+
}
781+
729782
#define WB_STRIDE 4
730783

731784
/**
@@ -1163,6 +1216,7 @@ int i40e_setup_tx_descriptors(struct i40e_ring *tx_ring)
11631216

11641217
tx_ring->next_to_use = 0;
11651218
tx_ring->next_to_clean = 0;
1219+
tx_ring->tx_stats.prev_pkt_ctr = -1;
11661220
return 0;
11671221

11681222
err:

drivers/net/ethernet/intel/i40e/i40e_txrx.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -333,6 +333,7 @@ struct i40e_tx_queue_stats {
333333
u64 tx_done_old;
334334
u64 tx_linearize;
335335
u64 tx_force_wb;
336+
int prev_pkt_ctr;
336337
};
337338

338339
struct i40e_rx_queue_stats {
@@ -501,6 +502,7 @@ void i40e_free_rx_resources(struct i40e_ring *rx_ring);
501502
int i40e_napi_poll(struct napi_struct *napi, int budget);
502503
void i40e_force_wb(struct i40e_vsi *vsi, struct i40e_q_vector *q_vector);
503504
u32 i40e_get_tx_pending(struct i40e_ring *ring);
505+
void i40e_detect_recover_hung(struct i40e_vsi *vsi);
504506
int __i40e_maybe_stop_tx(struct i40e_ring *tx_ring, int size);
505507
bool __i40e_chk_linearize(struct sk_buff *skb);
506508

drivers/net/ethernet/intel/i40evf/i40e_txrx.c

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -148,6 +148,59 @@ u32 i40evf_get_tx_pending(struct i40e_ring *ring, bool in_sw)
148148
return 0;
149149
}
150150

151+
/**
152+
* i40evf_detect_recover_hung - detect stalled TX queues and kick them
153+
* @vsi: pointer to vsi struct with tx queues; may be NULL (no-op)
154+
*
155+
* Does nothing if the VSI is down, has no netdev, or carrier is off. Otherwise
156+
* each TX ring whose packet count matches the saved one gets i40evf_force_wb().
157+
**/
158+
void i40evf_detect_recover_hung(struct i40e_vsi *vsi)
159+
{
160+
struct i40e_ring *tx_ring = NULL;
161+
struct net_device *netdev;
162+
unsigned int i;
163+
int packets;
164+
165+
if (!vsi)
166+
return;
167+
168+
if (test_bit(__I40E_VSI_DOWN, vsi->state))
169+
return;
170+
171+
netdev = vsi->netdev;
172+
if (!netdev)
173+
return;
174+
175+
if (!netif_carrier_ok(netdev))
176+
return;
177+
178+
for (i = 0; i < vsi->back->num_active_queues; i++) {
179+
tx_ring = &vsi->back->tx_rings[i];
180+
if (tx_ring && tx_ring->desc) {
181+
/* If the packet counter has not changed since it was
182+
* saved, the queue is likely stalled, so force an
183+
* interrupt for this queue only.
184+
*
185+
* prev_pkt_ctr is -1 when there was no pending work;
186+
* packets is masked with INT_MAX so it never equals -1.
187+
*/
188+
packets = tx_ring->stats.packets & INT_MAX;
189+
if (tx_ring->tx_stats.prev_pkt_ctr == packets) {
190+
i40evf_force_wb(vsi, tx_ring->q_vector);
191+
continue;
192+
}
193+
194+
/* Memory barrier between read of packet count and call
195+
* to i40evf_get_tx_pending(); save the count only if work is pending
196+
*/
197+
smp_rmb();
198+
tx_ring->tx_stats.prev_pkt_ctr =
199+
i40evf_get_tx_pending(tx_ring, false) ? packets : -1;
200+
}
201+
}
202+
}
203+
151204
#define WB_STRIDE 4
152205

153206
/**
@@ -469,6 +522,7 @@ int i40evf_setup_tx_descriptors(struct i40e_ring *tx_ring)
469522

470523
tx_ring->next_to_use = 0;
471524
tx_ring->next_to_clean = 0;
525+
tx_ring->tx_stats.prev_pkt_ctr = -1;
472526
return 0;
473527

474528
err:

drivers/net/ethernet/intel/i40evf/i40e_txrx.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -313,6 +313,7 @@ struct i40e_tx_queue_stats {
313313
u64 tx_done_old;
314314
u64 tx_linearize;
315315
u64 tx_force_wb;
316+
int prev_pkt_ctr;
316317
u64 tx_lost_interrupt;
317318
};
318319

@@ -467,6 +468,7 @@ void i40evf_free_rx_resources(struct i40e_ring *rx_ring);
467468
int i40evf_napi_poll(struct napi_struct *napi, int budget);
468469
void i40evf_force_wb(struct i40e_vsi *vsi, struct i40e_q_vector *q_vector);
469470
u32 i40evf_get_tx_pending(struct i40e_ring *ring, bool in_sw);
471+
void i40evf_detect_recover_hung(struct i40e_vsi *vsi);
470472
int __i40evf_maybe_stop_tx(struct i40e_ring *tx_ring, int size);
471473
bool __i40evf_chk_linearize(struct sk_buff *skb);
472474

drivers/net/ethernet/intel/i40evf/i40evf_main.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1716,6 +1716,8 @@ static void i40evf_watchdog_task(struct work_struct *work)
17161716
if (adapter->state == __I40EVF_RUNNING)
17171717
i40evf_request_stats(adapter);
17181718
watchdog_done:
1719+
if (adapter->state == __I40EVF_RUNNING)
1720+
i40evf_detect_recover_hung(&adapter->vsi);
17191721
clear_bit(__I40EVF_IN_CRITICAL_TASK, &adapter->crit_section);
17201722
restart_watchdog:
17211723
if (adapter->state == __I40EVF_REMOVE)

0 commit comments

Comments
 (0)