1515
1616struct dentry * mana_debugfs_root ;
1717
18+ struct mana_dev_recovery {
19+ struct list_head list ;
20+ struct pci_dev * pdev ;
21+ enum gdma_eqe_type type ;
22+ };
23+
24+ static struct mana_dev_recovery_work {
25+ struct list_head dev_list ;
26+ struct delayed_work work ;
27+
28+ /* Lock for dev_list above */
29+ spinlock_t lock ;
30+ } mana_dev_recovery_work ;
31+
1832static u32 mana_gd_r32 (struct gdma_context * g , u64 offset )
1933{
2034 return readl (g -> bar0_va + offset );
@@ -387,6 +401,25 @@ EXPORT_SYMBOL_NS(mana_gd_ring_cq, "NET_MANA");
387401
388402#define MANA_SERVICE_PERIOD 10
389403
404+ static void mana_serv_rescan (struct pci_dev * pdev )
405+ {
406+ struct pci_bus * parent ;
407+
408+ pci_lock_rescan_remove ();
409+
410+ parent = pdev -> bus ;
411+ if (!parent ) {
412+ dev_err (& pdev -> dev , "MANA service: no parent bus\n" );
413+ goto out ;
414+ }
415+
416+ pci_stop_and_remove_bus_device (pdev );
417+ pci_rescan_bus (parent );
418+
419+ out :
420+ pci_unlock_rescan_remove ();
421+ }
422+
390423static void mana_serv_fpga (struct pci_dev * pdev )
391424{
392425 struct pci_bus * bus , * parent ;
@@ -419,9 +452,12 @@ static void mana_serv_reset(struct pci_dev *pdev)
419452{
420453 struct gdma_context * gc = pci_get_drvdata (pdev );
421454 struct hw_channel_context * hwc ;
455+ int ret ;
422456
423457 if (!gc ) {
424- dev_err (& pdev -> dev , "MANA service: no GC\n" );
458+ /* Perform PCI rescan on device if GC is not set up */
459+ dev_err (& pdev -> dev , "MANA service: GC not setup, rescanning\n" );
460+ mana_serv_rescan (pdev );
425461 return ;
426462 }
427463
@@ -440,9 +476,18 @@ static void mana_serv_reset(struct pci_dev *pdev)
440476
441477 msleep (MANA_SERVICE_PERIOD * 1000 );
442478
443- mana_gd_resume (pdev );
479+ ret = mana_gd_resume (pdev );
480+ if (ret == - ETIMEDOUT || ret == - EPROTO ) {
481+ /* Perform PCI rescan on device if we failed on HWC */
482+ dev_err (& pdev -> dev , "MANA service: resume failed, rescanning\n" );
483+ mana_serv_rescan (pdev );
484+ goto out ;
485+ }
444486
445- dev_info (& pdev -> dev , "MANA reset cycle completed\n" );
487+ if (ret )
488+ dev_info (& pdev -> dev , "MANA reset cycle failed err %d\n" , ret );
489+ else
490+ dev_info (& pdev -> dev , "MANA reset cycle completed\n" );
446491
447492out :
448493 gc -> in_service = false;
@@ -454,18 +499,9 @@ struct mana_serv_work {
454499 enum gdma_eqe_type type ;
455500};
456501
457- static void mana_serv_func ( struct work_struct * w )
502+ static void mana_do_service ( enum gdma_eqe_type type , struct pci_dev * pdev )
458503{
459- struct mana_serv_work * mns_wk ;
460- struct pci_dev * pdev ;
461-
462- mns_wk = container_of (w , struct mana_serv_work , serv_work );
463- pdev = mns_wk -> pdev ;
464-
465- if (!pdev )
466- goto out ;
467-
468- switch (mns_wk -> type ) {
504+ switch (type ) {
469505 case GDMA_EQE_HWC_FPGA_RECONFIG :
470506 mana_serv_fpga (pdev );
471507 break ;
@@ -475,12 +511,48 @@ static void mana_serv_func(struct work_struct *w)
475511 break ;
476512
477513 default :
478- dev_err (& pdev -> dev , "MANA service: unknown type %d\n" ,
479- mns_wk -> type );
514+ dev_err (& pdev -> dev , "MANA service: unknown type %d\n" , type );
480515 break ;
481516 }
517+ }
518+
519+ static void mana_recovery_delayed_func (struct work_struct * w )
520+ {
521+ struct mana_dev_recovery_work * work ;
522+ struct mana_dev_recovery * dev ;
523+ unsigned long flags ;
524+
525+ work = container_of (w , struct mana_dev_recovery_work , work .work );
526+
527+ spin_lock_irqsave (& work -> lock , flags );
528+
529+ while (!list_empty (& work -> dev_list )) {
530+ dev = list_first_entry (& work -> dev_list ,
531+ struct mana_dev_recovery , list );
532+ list_del (& dev -> list );
533+ spin_unlock_irqrestore (& work -> lock , flags );
534+
535+ mana_do_service (dev -> type , dev -> pdev );
536+ pci_dev_put (dev -> pdev );
537+ kfree (dev );
538+
539+ spin_lock_irqsave (& work -> lock , flags );
540+ }
541+
542+ spin_unlock_irqrestore (& work -> lock , flags );
543+ }
544+
545+ static void mana_serv_func (struct work_struct * w )
546+ {
547+ struct mana_serv_work * mns_wk ;
548+ struct pci_dev * pdev ;
549+
550+ mns_wk = container_of (w , struct mana_serv_work , serv_work );
551+ pdev = mns_wk -> pdev ;
552+
553+ if (pdev )
554+ mana_do_service (mns_wk -> type , pdev );
482555
483- out :
484556 pci_dev_put (pdev );
485557 kfree (mns_wk );
486558 module_put (THIS_MODULE );
@@ -541,6 +613,17 @@ static void mana_gd_process_eqe(struct gdma_queue *eq)
541613 case GDMA_EQE_HWC_RESET_REQUEST :
542614 dev_info (gc -> dev , "Recv MANA service type:%d\n" , type );
543615
616+ if (!test_and_set_bit (GC_PROBE_SUCCEEDED , & gc -> flags )) {
617+ /*
618+ * Device is in probe and we received a hardware reset
619+ * event, the probe function will detect that the flag
620+ * has changed and perform service procedure.
621+ */
622+ dev_info (gc -> dev ,
623+ "Service is to be processed in probe\n" );
624+ break ;
625+ }
626+
544627 if (gc -> in_service ) {
545628 dev_info (gc -> dev , "Already in service\n" );
546629 break ;
@@ -1938,8 +2021,19 @@ static int mana_gd_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
19382021 if (err )
19392022 goto cleanup_mana ;
19402023
2024+ /*
2025+ * If a hardware reset event has occurred over HWC during probe,
2026+ * rollback and perform hardware reset procedure.
2027+ */
2028+ if (test_and_set_bit (GC_PROBE_SUCCEEDED , & gc -> flags )) {
2029+ err = - EPROTO ;
2030+ goto cleanup_mana_rdma ;
2031+ }
2032+
19412033 return 0 ;
19422034
2035+ cleanup_mana_rdma :
2036+ mana_rdma_remove (& gc -> mana_ib );
19432037cleanup_mana :
19442038 mana_remove (& gc -> mana , false);
19452039cleanup_gd :
@@ -1963,6 +2057,35 @@ static int mana_gd_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
19632057disable_dev :
19642058 pci_disable_device (pdev );
19652059 dev_err (& pdev -> dev , "gdma probe failed: err = %d\n" , err );
2060+
2061+ /*
2062+ * Hardware could be in recovery mode and the HWC returns TIMEDOUT or
2063+ * EPROTO from mana_gd_setup(), mana_probe() or mana_rdma_probe(), or
2064+ * we received a hardware reset event over HWC interrupt. In this case,
2065+ * perform the device recovery procedure after MANA_SERVICE_PERIOD
2066+ * seconds.
2067+ */
2068+ if (err == - ETIMEDOUT || err == - EPROTO ) {
2069+ struct mana_dev_recovery * dev ;
2070+ unsigned long flags ;
2071+
2072+ dev_info (& pdev -> dev , "Start MANA recovery mode\n" );
2073+
2074+ dev = kzalloc (sizeof (* dev ), GFP_KERNEL );
2075+ if (!dev )
2076+ return err ;
2077+
2078+ dev -> pdev = pci_dev_get (pdev );
2079+ dev -> type = GDMA_EQE_HWC_RESET_REQUEST ;
2080+
2081+ spin_lock_irqsave (& mana_dev_recovery_work .lock , flags );
2082+ list_add_tail (& dev -> list , & mana_dev_recovery_work .dev_list );
2083+ spin_unlock_irqrestore (& mana_dev_recovery_work .lock , flags );
2084+
2085+ schedule_delayed_work (& mana_dev_recovery_work .work ,
2086+ secs_to_jiffies (MANA_SERVICE_PERIOD ));
2087+ }
2088+
19662089 return err ;
19672090}
19682091
@@ -2067,6 +2190,10 @@ static int __init mana_driver_init(void)
20672190{
20682191 int err ;
20692192
2193+ INIT_LIST_HEAD (& mana_dev_recovery_work .dev_list );
2194+ spin_lock_init (& mana_dev_recovery_work .lock );
2195+ INIT_DELAYED_WORK (& mana_dev_recovery_work .work , mana_recovery_delayed_func );
2196+
20702197 mana_debugfs_root = debugfs_create_dir ("mana" , NULL );
20712198
20722199 err = pci_register_driver (& mana_driver );
@@ -2080,6 +2207,21 @@ static int __init mana_driver_init(void)
20802207
20812208static void __exit mana_driver_exit (void )
20822209{
2210+ struct mana_dev_recovery * dev ;
2211+ unsigned long flags ;
2212+
2213+ disable_delayed_work_sync (& mana_dev_recovery_work .work );
2214+
2215+ spin_lock_irqsave (& mana_dev_recovery_work .lock , flags );
2216+ while (!list_empty (& mana_dev_recovery_work .dev_list )) {
2217+ dev = list_first_entry (& mana_dev_recovery_work .dev_list ,
2218+ struct mana_dev_recovery , list );
2219+ list_del (& dev -> list );
2220+ pci_dev_put (dev -> pdev );
2221+ kfree (dev );
2222+ }
2223+ spin_unlock_irqrestore (& mana_dev_recovery_work .lock , flags );
2224+
20832225 pci_unregister_driver (& mana_driver );
20842226
20852227 debugfs_remove (mana_debugfs_root );
0 commit comments