From b3b30f5e8a0c50db3d76b6f7c7cc50245aeb57fd Mon Sep 17 00:00:00 2001 From: Jack Morgenstein Date: Tue, 15 Aug 2006 21:11:18 +0300 Subject: IB/mthca: Recover from catastrophic errors Trigger device remove and then add when a catastrophic error is detected in hardware. This, in turn, will cause a device reset, which we hope will recover from the catastrophic condition. Since this might interefere with debugging the root cause, add a module option to suppress this behaviour. Signed-off-by: Jack Morgenstein Signed-off-by: Michael S. Tsirkin Signed-off-by: Roland Dreier --- drivers/infiniband/hw/mthca/mthca_catas.c | 62 ++++++++++++++++++++++ drivers/infiniband/hw/mthca/mthca_dev.h | 7 +++ drivers/infiniband/hw/mthca/mthca_main.c | 88 +++++++++++++++++++++++-------- 3 files changed, 136 insertions(+), 21 deletions(-) (limited to 'drivers/infiniband/hw') diff --git a/drivers/infiniband/hw/mthca/mthca_catas.c b/drivers/infiniband/hw/mthca/mthca_catas.c index c3bec7490f5..cd044ea2dfa 100644 --- a/drivers/infiniband/hw/mthca/mthca_catas.c +++ b/drivers/infiniband/hw/mthca/mthca_catas.c @@ -34,6 +34,7 @@ #include #include +#include #include "mthca_dev.h" @@ -48,9 +49,41 @@ enum { static DEFINE_SPINLOCK(catas_lock); +static LIST_HEAD(catas_list); +static struct workqueue_struct *catas_wq; +static struct work_struct catas_work; + +static int catas_reset_disable; +module_param_named(catas_reset_disable, catas_reset_disable, int, 0644); +MODULE_PARM_DESC(catas_reset_disable, "disable reset on catastrophic event if nonzero"); + +static void catas_reset(void *work_ptr) +{ + struct mthca_dev *dev, *tmpdev; + LIST_HEAD(tlist); + int ret; + + mutex_lock(&mthca_device_mutex); + + spin_lock_irq(&catas_lock); + list_splice_init(&catas_list, &tlist); + spin_unlock_irq(&catas_lock); + + list_for_each_entry_safe(dev, tmpdev, &tlist, catas_err.list) { + ret = __mthca_restart_one(dev->pdev); + if (ret) + mthca_err(dev, "Reset failed (%d)\n", ret); + else + mthca_dbg(dev, "Reset succeeded\n"); + } + + mutex_unlock(&mthca_device_mutex); +} + static void handle_catas(struct mthca_dev *dev) { struct ib_event event; + unsigned long flags; const char *type; int i; @@ -82,6 +115,14 @@ static void handle_catas(struct mthca_dev *dev) for (i = 0; i < dev->catas_err.size; ++i) mthca_err(dev, " buf[%02x]: %08x\n", i, swab32(readl(dev->catas_err.map + i))); + + if (catas_reset_disable) + return; + + spin_lock_irqsave(&catas_lock, flags); + list_add(&dev->catas_err.list, &catas_list); + queue_work(catas_wq, &catas_work); + spin_unlock_irqrestore(&catas_lock, flags); } static void poll_catas(unsigned long dev_ptr) @@ -135,6 +176,7 @@ void mthca_start_catas_poll(struct mthca_dev *dev) dev->catas_err.timer.data = (unsigned long) dev; dev->catas_err.timer.function = poll_catas; dev->catas_err.timer.expires = jiffies + MTHCA_CATAS_POLL_INTERVAL; + INIT_LIST_HEAD(&dev->catas_err.list); add_timer(&dev->catas_err.timer); } @@ -153,4 +195,24 @@ void mthca_stop_catas_poll(struct mthca_dev *dev) dev->catas_err.addr), dev->catas_err.size * 4); } + + spin_lock_irq(&catas_lock); + list_del(&dev->catas_err.list); + spin_unlock_irq(&catas_lock); +} + +int __init mthca_catas_init(void) +{ + INIT_WORK(&catas_work, catas_reset, NULL); + + catas_wq = create_singlethread_workqueue("mthca_catas"); + if (!catas_wq) + return -ENOMEM; + + return 0; +} + +void mthca_catas_cleanup(void) +{ + destroy_workqueue(catas_wq); } diff --git a/drivers/infiniband/hw/mthca/mthca_dev.h b/drivers/infiniband/hw/mthca/mthca_dev.h index 33bd0b8bfd1..fe5cecf70fe 100644 --- a/drivers/infiniband/hw/mthca/mthca_dev.h +++ b/drivers/infiniband/hw/mthca/mthca_dev.h @@ -45,6 +45,7 @@ #include #include #include +#include #include @@ -283,8 +284,11 @@ struct mthca_catas_err { unsigned long stop; u32 size; struct timer_list timer; + struct list_head list; }; +extern struct mutex mthca_device_mutex; + struct mthca_dev { struct ib_device ib_dev; struct pci_dev *pdev; @@ -450,6 +454,9 @@ void mthca_unregister_device(struct mthca_dev *dev); void mthca_start_catas_poll(struct mthca_dev *dev); void mthca_stop_catas_poll(struct mthca_dev *dev); +int __mthca_restart_one(struct pci_dev *pdev); +int mthca_catas_init(void); +void mthca_catas_cleanup(void); int mthca_uar_alloc(struct mthca_dev *dev, struct mthca_uar *uar); void mthca_uar_free(struct mthca_dev *dev, struct mthca_uar *uar); diff --git a/drivers/infiniband/hw/mthca/mthca_main.c b/drivers/infiniband/hw/mthca/mthca_main.c index 7b82c1907f0..47ea0214836 100644 --- a/drivers/infiniband/hw/mthca/mthca_main.c +++ b/drivers/infiniband/hw/mthca/mthca_main.c @@ -80,6 +80,8 @@ static int tune_pci = 0; module_param(tune_pci, int, 0444); MODULE_PARM_DESC(tune_pci, "increase PCI burst from the default set by BIOS if nonzero"); +struct mutex mthca_device_mutex; + static const char mthca_version[] __devinitdata = DRV_NAME ": Mellanox InfiniBand HCA driver v" DRV_VERSION " (" DRV_RELDATE ")\n"; @@ -978,28 +980,15 @@ static struct { MTHCA_FLAG_SINAI_OPT } }; -static int __devinit mthca_init_one(struct pci_dev *pdev, - const struct pci_device_id *id) +static int __mthca_init_one(struct pci_dev *pdev, int hca_type) { - static int mthca_version_printed = 0; int ddr_hidden = 0; int err; struct mthca_dev *mdev; - if (!mthca_version_printed) { - printk(KERN_INFO "%s", mthca_version); - ++mthca_version_printed; - } - printk(KERN_INFO PFX "Initializing %s\n", pci_name(pdev)); - if (id->driver_data >= ARRAY_SIZE(mthca_hca_table)) { - printk(KERN_ERR PFX "%s has invalid driver data %lx\n", - pci_name(pdev), id->driver_data); - return -ENODEV; - } - err = pci_enable_device(pdev); if (err) { dev_err(&pdev->dev, "Cannot enable PCI device, " @@ -1065,7 +1054,7 @@ static int __devinit mthca_init_one(struct pci_dev *pdev, mdev->pdev = pdev; - mdev->mthca_flags = mthca_hca_table[id->driver_data].flags; + mdev->mthca_flags = mthca_hca_table[hca_type].flags; if (ddr_hidden) mdev->mthca_flags |= MTHCA_FLAG_DDR_HIDDEN; @@ -1099,13 +1088,13 @@ static int __devinit mthca_init_one(struct pci_dev *pdev, if (err) goto err_cmd; - if (mdev->fw_ver < mthca_hca_table[id->driver_data].latest_fw) { + if (mdev->fw_ver < mthca_hca_table[hca_type].latest_fw) { mthca_warn(mdev, "HCA FW version %d.%d.%d is old (%d.%d.%d is current).\n", (int) (mdev->fw_ver >> 32), (int) (mdev->fw_ver >> 16) & 0xffff, (int) (mdev->fw_ver & 0xffff), - (int) (mthca_hca_table[id->driver_data].latest_fw >> 32), - (int) (mthca_hca_table[id->driver_data].latest_fw >> 16) & 0xffff, - (int) (mthca_hca_table[id->driver_data].latest_fw & 0xffff)); + (int) (mthca_hca_table[hca_type].latest_fw >> 32), + (int) (mthca_hca_table[hca_type].latest_fw >> 16) & 0xffff, + (int) (mthca_hca_table[hca_type].latest_fw & 0xffff)); mthca_warn(mdev, "If you have problems, try updating your HCA FW.\n"); } @@ -1122,6 +1111,7 @@ static int __devinit mthca_init_one(struct pci_dev *pdev, goto err_unregister; pci_set_drvdata(pdev, mdev); + mdev->hca_type = hca_type; return 0; @@ -1166,7 +1156,7 @@ err_disable_pdev: return err; } -static void __devexit mthca_remove_one(struct pci_dev *pdev) +static void __mthca_remove_one(struct pci_dev *pdev) { struct mthca_dev *mdev = pci_get_drvdata(pdev); u8 status; @@ -1211,6 +1201,51 @@ static void __devexit mthca_remove_one(struct pci_dev *pdev) } } +int __mthca_restart_one(struct pci_dev *pdev) +{ + struct mthca_dev *mdev; + + mdev = pci_get_drvdata(pdev); + if (!mdev) + return -ENODEV; + __mthca_remove_one(pdev); + return __mthca_init_one(pdev, mdev->hca_type); +} + +static int __devinit mthca_init_one(struct pci_dev *pdev, + const struct pci_device_id *id) +{ + static int mthca_version_printed = 0; + int ret; + + mutex_lock(&mthca_device_mutex); + + if (!mthca_version_printed) { + printk(KERN_INFO "%s", mthca_version); + ++mthca_version_printed; + } + + if (id->driver_data >= ARRAY_SIZE(mthca_hca_table)) { + printk(KERN_ERR PFX "%s has invalid driver data %lx\n", + pci_name(pdev), id->driver_data); + mutex_unlock(&mthca_device_mutex); + return -ENODEV; + } + + ret = __mthca_init_one(pdev, id->driver_data); + + mutex_unlock(&mthca_device_mutex); + + return ret; +} + +static void __devexit mthca_remove_one(struct pci_dev *pdev) +{ + mutex_lock(&mthca_device_mutex); + __mthca_remove_one(pdev); + mutex_unlock(&mthca_device_mutex); +} + static struct pci_device_id mthca_pci_table[] = { { PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, PCI_DEVICE_ID_MELLANOX_TAVOR), .driver_data = TAVOR }, @@ -1248,13 +1283,24 @@ static int __init mthca_init(void) { int ret; + mutex_init(&mthca_device_mutex); + ret = mthca_catas_init(); + if (ret) + return ret; + ret = pci_register_driver(&mthca_driver); - return ret < 0 ? ret : 0; + if (ret < 0) { + mthca_catas_cleanup(); + return ret; + } + + return 0; } static void __exit mthca_cleanup(void) { pci_unregister_driver(&mthca_driver); + mthca_catas_cleanup(); } module_init(mthca_init); -- cgit v1.2.3