IB/mthca: Recover from catastrophic errors

Trigger device remove and then add when a catastrophic error is
detected in hardware.  This, in turn, will cause a device reset, which
we hope will recover from the catastrophic condition.

Since this might interefere with debugging the root cause, add a
module option to suppress this behaviour.

Signed-off-by: Jack Morgenstein <jackm@mellanox.co.il>
Signed-off-by: Michael S. Tsirkin <mst@mellanox.co.il>
Signed-off-by: Roland Dreier <rolandd@cisco.com>
This commit is contained in:
Jack Morgenstein
2006-08-15 21:11:18 +03:00
committed by Roland Dreier
parent 07eeec0627
commit b3b30f5e8a
3 changed files with 136 additions and 21 deletions

View File

@@ -45,6 +45,7 @@
#include <linux/dma-mapping.h>
#include <linux/timer.h>
#include <linux/mutex.h>
#include <linux/list.h>
#include <asm/semaphore.h>
@@ -283,8 +284,11 @@ struct mthca_catas_err {
unsigned long stop;
u32 size;
struct timer_list timer;
struct list_head list;
};
extern struct mutex mthca_device_mutex;
struct mthca_dev {
struct ib_device ib_dev;
struct pci_dev *pdev;
@@ -450,6 +454,9 @@ void mthca_unregister_device(struct mthca_dev *dev);
void mthca_start_catas_poll(struct mthca_dev *dev);
void mthca_stop_catas_poll(struct mthca_dev *dev);
int __mthca_restart_one(struct pci_dev *pdev);
int mthca_catas_init(void);
void mthca_catas_cleanup(void);
int mthca_uar_alloc(struct mthca_dev *dev, struct mthca_uar *uar);
void mthca_uar_free(struct mthca_dev *dev, struct mthca_uar *uar);