From ac325acd50013fa8f4953208cbb96504dec9b12a Mon Sep 17 00:00:00 2001 From: Linas Vepstas Date: Tue, 18 Apr 2006 21:05:21 -0700 Subject: [PATCH] powerpc/pseries: clear PCI failure counter if no new failures The current PCI error recovery system keeps track of the number of PCI card resets, and refuses to bring a card back up if this number is too large. The goal of doing this was to avoid an infinite loop of resets if a card is obviously dead. However, if the failures are rare, but the machine has a high uptime, this mechanism might still be triggered; this is too harsh. This patch will avoids this problem by decrementing the fail count after an hour. Thus, as long as a pci card BSOD's less than 6 times an hour, it will continue to be reset indefinitely. If it's failure rate is greater than that, it will be taken off-line permanently. This patch is larger than it might otherwise be because it changes indentation by removing a pointless while-loop. The while loop is not needed, as the handler is invoked once fo each event (by schedule_work()); the loop is leftover cruft from an earlier implementation. Signed-off-by: Linas Vepstas Signed-off-by: Andrew Morton Signed-off-by: Paul Mackerras --- arch/powerpc/platforms/pseries/eeh_driver.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'arch/powerpc/platforms/pseries/eeh_driver.c') diff --git a/arch/powerpc/platforms/pseries/eeh_driver.c b/arch/powerpc/platforms/pseries/eeh_driver.c index 1fba695e32e..2a9eb263073 100644 --- a/arch/powerpc/platforms/pseries/eeh_driver.c +++ b/arch/powerpc/platforms/pseries/eeh_driver.c @@ -23,9 +23,8 @@ * */ #include -#include #include -#include +#include #include #include #include @@ -250,7 +249,7 @@ static int eeh_reset_device (struct pci_dn *pe_dn, struct pci_bus *bus) */ #define MAX_WAIT_FOR_RECOVERY 15 -void handle_eeh_events (struct eeh_event *event) +struct pci_dn * handle_eeh_events (struct eeh_event *event) { struct device_node *frozen_dn; struct pci_dn *frozen_pdn; @@ -265,7 +264,7 @@ void handle_eeh_events (struct eeh_event *event) if (!frozen_dn) { printk(KERN_ERR "EEH: Error: Cannot find partition endpoint for %s\n", pci_name(event->dev)); - return; + return NULL; } /* There are two different styles for coming up with the PE. @@ -280,7 +279,7 @@ void handle_eeh_events (struct eeh_event *event) if (!frozen_bus) { printk(KERN_ERR "EEH: Cannot find PCI bus for %s\n", frozen_dn->full_name); - return; + return NULL; } #if 0 @@ -355,7 +354,7 @@ void handle_eeh_events (struct eeh_event *event) /* Tell all device drivers that they can resume operations */ pci_walk_bus(frozen_bus, eeh_report_resume, NULL); - return; + return frozen_pdn; excess_failures: /* @@ -384,6 +383,8 @@ perm_error: /* Shut down the device drivers for good. */ pcibios_remove_pci_devices(frozen_bus); + + return NULL; } /* ---------- end of file ---------- */ -- cgit v1.2.3