From 70ef6d595b6e51618a0cbe44b848d8c9db11a010 Mon Sep 17 00:00:00 2001 From: Kevin Hao Date: Thu, 29 May 2008 18:41:04 +0800 Subject: x86: get irq for hpet timer HPET timer's IRQ is 0 by default. So we have to select which irq will be used by these timers. We wait to set the timer's irq until we really open it in order to reduce the chance of conflicting with other device. Signed-off-by: Kevin Hao Signed-off-by: Ingo Molnar --- drivers/char/hpet.c | 63 ++++++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/hpet.h | 3 ++- 2 files changed, 65 insertions(+), 1 deletion(-) diff --git a/drivers/char/hpet.c b/drivers/char/hpet.c index e7fb0bca366..c9bf5d44402 100644 --- a/drivers/char/hpet.c +++ b/drivers/char/hpet.c @@ -184,6 +184,67 @@ static irqreturn_t hpet_interrupt(int irq, void *data) return IRQ_HANDLED; } +static void hpet_timer_set_irq(struct hpet_dev *devp) +{ + unsigned long v; + int irq, gsi; + struct hpet_timer __iomem *timer; + + spin_lock_irq(&hpet_lock); + if (devp->hd_hdwirq) { + spin_unlock_irq(&hpet_lock); + return; + } + + timer = devp->hd_timer; + + /* we prefer level triggered mode */ + v = readl(&timer->hpet_config); + if (!(v & Tn_INT_TYPE_CNF_MASK)) { + v |= Tn_INT_TYPE_CNF_MASK; + writel(v, &timer->hpet_config); + } + spin_unlock_irq(&hpet_lock); + + v = (readq(&timer->hpet_config) & Tn_INT_ROUTE_CAP_MASK) >> + Tn_INT_ROUTE_CAP_SHIFT; + + /* + * In PIC mode, skip IRQ0-4, IRQ6-9, IRQ12-15 which is always used by + * legacy device. In IO APIC mode, we skip all the legacy IRQS. + */ + if (acpi_irq_model == ACPI_IRQ_MODEL_PIC) + v &= ~0xf3df; + else + v &= ~0xffff; + + for (irq = find_first_bit(&v, HPET_MAX_IRQ); irq < HPET_MAX_IRQ; + irq = find_next_bit(&v, HPET_MAX_IRQ, 1 + irq)) { + + if (irq >= NR_IRQS) { + irq = HPET_MAX_IRQ; + break; + } + + gsi = acpi_register_gsi(irq, ACPI_LEVEL_SENSITIVE, + ACPI_ACTIVE_LOW); + if (gsi > 0) + break; + + /* FIXME: Setup interrupt source table */ + } + + if (irq < HPET_MAX_IRQ) { + spin_lock_irq(&hpet_lock); + v = readl(&timer->hpet_config); + v |= irq << Tn_INT_ROUTE_CNF_SHIFT; + writel(v, &timer->hpet_config); + devp->hd_hdwirq = gsi; + spin_unlock_irq(&hpet_lock); + } + return; +} + static int hpet_open(struct inode *inode, struct file *file) { struct hpet_dev *devp; @@ -215,6 +276,8 @@ static int hpet_open(struct inode *inode, struct file *file) devp->hd_flags |= HPET_OPEN; spin_unlock_irq(&hpet_lock); + hpet_timer_set_irq(devp); + return 0; } diff --git a/include/linux/hpet.h b/include/linux/hpet.h index 2dc29ce6c8e..6d2626b63a9 100644 --- a/include/linux/hpet.h +++ b/include/linux/hpet.h @@ -37,6 +37,7 @@ struct hpet { #define hpet_compare _u1._hpet_compare #define HPET_MAX_TIMERS (32) +#define HPET_MAX_IRQ (32) /* * HPET general capabilities register @@ -64,7 +65,7 @@ struct hpet { */ #define Tn_INT_ROUTE_CAP_MASK (0xffffffff00000000ULL) -#define Tn_INI_ROUTE_CAP_SHIFT (32UL) +#define Tn_INT_ROUTE_CAP_SHIFT (32UL) #define Tn_FSB_INT_DELCAP_MASK (0x8000UL) #define Tn_FSB_INT_DELCAP_SHIFT (15) #define Tn_FSB_EN_CNF_MASK (0x4000UL) -- cgit v1.2.3 From 3243b4ed817fad04e56e7b7dc78ec28dc8322ff2 Mon Sep 17 00:00:00 2001 From: Krzysztof Oledzki Date: Wed, 4 Jun 2008 03:40:17 +0200 Subject: x86: add another PCI ID for ICH6 force-hpet Tested on Asus P5GDC-V $ lspci -n -n |grep ISA 00:1f.0 ISA bridge [0601]: Intel Corporation 82801FB/FR (ICH6/ICH6R) LPC Interface Bridge [8086:2640] (rev 03) Force enabled HPET at base address 0xfed00000 hpet clockevent registered hpet0: at MMIO 0xfed00000, IRQs 2, 8, 0 hpet0: 3 64-bit timers, 
14318180 Hz Signed-off-by: Krzysztof Piotr Oledzki Signed-off-by: Ingo Molnar --- arch/x86/kernel/quirks.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c index d89a648fe71..06e1fd6be83 100644 --- a/arch/x86/kernel/quirks.c +++ b/arch/x86/kernel/quirks.c @@ -158,6 +158,8 @@ static void ich_force_enable_hpet(struct pci_dev *dev) DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ESB2_0, ich_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH6_0, + ich_force_enable_hpet); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH6_1, ich_force_enable_hpet); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH7_0, -- cgit v1.2.3 From f3561810b163aca5388ad550abbbc82ae5323189 Mon Sep 17 00:00:00 2001 From: Joe Buehler Date: Mon, 9 Jun 2008 08:55:20 -0400 Subject: x86: add PCI ID for 6300ESB force hpet 00:1f.0 ISA bridge: Intel Corporation 6300ESB LPC Interface Controller (rev 02) 00:1f.0 Class 0601: 8086:25a1 (rev 02) kernel: pci 0000:00:1f.0: Force enabled HPET at 0xfed00000 kernel: hpet clockevent registered kernel: hpet0: at MMIO 0xfed00000, IRQs 2, 8, 0 kernel: hpet0: 3 64-bit timers, 14318180 Hz Signed-off-by: Ingo Molnar --- arch/x86/kernel/quirks.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c index 06e1fd6be83..f327abafe3e 100644 --- a/arch/x86/kernel/quirks.c +++ b/arch/x86/kernel/quirks.c @@ -257,6 +257,8 @@ static void old_ich_force_enable_hpet_user(struct pci_dev *dev) old_ich_force_enable_hpet(dev); } +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ESB_1, + old_ich_force_enable_hpet_user); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801CA_0, old_ich_force_enable_hpet_user); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801CA_12, -- cgit v1.2.3 From a300bec952127d9a15e666b391bb35c9aecb3002 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 22 Jul 2008 09:46:13 -0700 Subject: documentation: move hpet.txt to timers/ subdirectory Move hpet.txt to Documentation/timers/ subdirectory. Add 00-INDEX to Documentation/timers/ subdirectory. Signed-off-by: Randy Dunlap Cc: tglx Signed-off-by: Thomas Gleixner --- Documentation/00-INDEX | 2 - Documentation/hpet.txt | 300 ------------------------------------------ Documentation/timers/00-INDEX | 10 ++ Documentation/timers/hpet.txt | 300 ++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 310 insertions(+), 302 deletions(-) delete mode 100644 Documentation/hpet.txt create mode 100644 Documentation/timers/00-INDEX create mode 100644 Documentation/timers/hpet.txt diff --git a/Documentation/00-INDEX b/Documentation/00-INDEX index 1977fab3865..6c40c4c300d 100644 --- a/Documentation/00-INDEX +++ b/Documentation/00-INDEX @@ -161,8 +161,6 @@ hayes-esp.txt - info on using the Hayes ESP serial driver. highuid.txt - notes on the change from 16 bit to 32 bit user/group IDs. -hpet.txt - - High Precision Event Timer Driver for Linux. timers/ - info on the timer related topics hw_random.txt diff --git a/Documentation/hpet.txt b/Documentation/hpet.txt deleted file mode 100644 index 6ad52d9dad6..00000000000 --- a/Documentation/hpet.txt +++ /dev/null @@ -1,300 +0,0 @@ - High Precision Event Timer Driver for Linux - -The High Precision Event Timer (HPET) hardware is the future replacement -for the 8254 and Real Time Clock (RTC) periodic timer functionality. -Each HPET can have up to 32 timers. 
It is possible to configure the -first two timers as legacy replacements for 8254 and RTC periodic timers. -A specification done by Intel and Microsoft can be found at -. - -The driver supports detection of HPET driver allocation and initialization -of the HPET before the driver module_init routine is called. This enables -platform code which uses timer 0 or 1 as the main timer to intercept HPET -initialization. An example of this initialization can be found in -arch/i386/kernel/time_hpet.c. - -The driver provides two APIs which are very similar to the API found in -the rtc.c driver. There is a user space API and a kernel space API. -An example user space program is provided below. - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - - -extern void hpet_open_close(int, const char **); -extern void hpet_info(int, const char **); -extern void hpet_poll(int, const char **); -extern void hpet_fasync(int, const char **); -extern void hpet_read(int, const char **); - -#include -#include -#include - -struct hpet_command { - char *command; - void (*func)(int argc, const char ** argv); -} hpet_command[] = { - { - "open-close", - hpet_open_close - }, - { - "info", - hpet_info - }, - { - "poll", - hpet_poll - }, - { - "fasync", - hpet_fasync - }, -}; - -int -main(int argc, const char ** argv) -{ - int i; - - argc--; - argv++; - - if (!argc) { - fprintf(stderr, "-hpet: requires command\n"); - return -1; - } - - - for (i = 0; i < (sizeof (hpet_command) / sizeof (hpet_command[0])); i++) - if (!strcmp(argv[0], hpet_command[i].command)) { - argc--; - argv++; - fprintf(stderr, "-hpet: executing %s\n", - hpet_command[i].command); - hpet_command[i].func(argc, argv); - return 0; - } - - fprintf(stderr, "do_hpet: command %s not implemented\n", argv[0]); - - return -1; -} - -void -hpet_open_close(int argc, const char **argv) -{ - int fd; - - if (argc != 1) { - fprintf(stderr, "hpet_open_close: device-name\n"); - return; - } - - fd = open(argv[0], O_RDONLY); - if (fd < 0) - fprintf(stderr, "hpet_open_close: open failed\n"); - else - close(fd); - - return; -} - -void -hpet_info(int argc, const char **argv) -{ -} - -void -hpet_poll(int argc, const char **argv) -{ - unsigned long freq; - int iterations, i, fd; - struct pollfd pfd; - struct hpet_info info; - struct timeval stv, etv; - struct timezone tz; - long usec; - - if (argc != 3) { - fprintf(stderr, "hpet_poll: device-name freq iterations\n"); - return; - } - - freq = atoi(argv[1]); - iterations = atoi(argv[2]); - - fd = open(argv[0], O_RDONLY); - - if (fd < 0) { - fprintf(stderr, "hpet_poll: open of %s failed\n", argv[0]); - return; - } - - if (ioctl(fd, HPET_IRQFREQ, freq) < 0) { - fprintf(stderr, "hpet_poll: HPET_IRQFREQ failed\n"); - goto out; - } - - if (ioctl(fd, HPET_INFO, &info) < 0) { - fprintf(stderr, "hpet_poll: failed to get info\n"); - goto out; - } - - fprintf(stderr, "hpet_poll: info.hi_flags 0x%lx\n", info.hi_flags); - - if (info.hi_flags && (ioctl(fd, HPET_EPI, 0) < 0)) { - fprintf(stderr, "hpet_poll: HPET_EPI failed\n"); - goto out; - } - - if (ioctl(fd, HPET_IE_ON, 0) < 0) { - fprintf(stderr, "hpet_poll, HPET_IE_ON failed\n"); - goto out; - } - - pfd.fd = fd; - pfd.events = POLLIN; - - for (i = 0; i < iterations; i++) { - pfd.revents = 0; - gettimeofday(&stv, &tz); - if (poll(&pfd, 1, -1) < 0) - fprintf(stderr, "hpet_poll: poll failed\n"); - else { - long data; - - gettimeofday(&etv, &tz); - usec = stv.tv_sec * 1000000 + 
stv.tv_usec; - usec = (etv.tv_sec * 1000000 + etv.tv_usec) - usec; - - fprintf(stderr, - "hpet_poll: expired time = 0x%lx\n", usec); - - fprintf(stderr, "hpet_poll: revents = 0x%x\n", - pfd.revents); - - if (read(fd, &data, sizeof(data)) != sizeof(data)) { - fprintf(stderr, "hpet_poll: read failed\n"); - } - else - fprintf(stderr, "hpet_poll: data 0x%lx\n", - data); - } - } - -out: - close(fd); - return; -} - -static int hpet_sigio_count; - -static void -hpet_sigio(int val) -{ - fprintf(stderr, "hpet_sigio: called\n"); - hpet_sigio_count++; -} - -void -hpet_fasync(int argc, const char **argv) -{ - unsigned long freq; - int iterations, i, fd, value; - sig_t oldsig; - struct hpet_info info; - - hpet_sigio_count = 0; - fd = -1; - - if ((oldsig = signal(SIGIO, hpet_sigio)) == SIG_ERR) { - fprintf(stderr, "hpet_fasync: failed to set signal handler\n"); - return; - } - - if (argc != 3) { - fprintf(stderr, "hpet_fasync: device-name freq iterations\n"); - goto out; - } - - fd = open(argv[0], O_RDONLY); - - if (fd < 0) { - fprintf(stderr, "hpet_fasync: failed to open %s\n", argv[0]); - return; - } - - - if ((fcntl(fd, F_SETOWN, getpid()) == 1) || - ((value = fcntl(fd, F_GETFL)) == 1) || - (fcntl(fd, F_SETFL, value | O_ASYNC) == 1)) { - fprintf(stderr, "hpet_fasync: fcntl failed\n"); - goto out; - } - - freq = atoi(argv[1]); - iterations = atoi(argv[2]); - - if (ioctl(fd, HPET_IRQFREQ, freq) < 0) { - fprintf(stderr, "hpet_fasync: HPET_IRQFREQ failed\n"); - goto out; - } - - if (ioctl(fd, HPET_INFO, &info) < 0) { - fprintf(stderr, "hpet_fasync: failed to get info\n"); - goto out; - } - - fprintf(stderr, "hpet_fasync: info.hi_flags 0x%lx\n", info.hi_flags); - - if (info.hi_flags && (ioctl(fd, HPET_EPI, 0) < 0)) { - fprintf(stderr, "hpet_fasync: HPET_EPI failed\n"); - goto out; - } - - if (ioctl(fd, HPET_IE_ON, 0) < 0) { - fprintf(stderr, "hpet_fasync, HPET_IE_ON failed\n"); - goto out; - } - - for (i = 0; i < iterations; i++) { - (void) pause(); - fprintf(stderr, "hpet_fasync: count = %d\n", hpet_sigio_count); - } - -out: - signal(SIGIO, oldsig); - - if (fd >= 0) - close(fd); - - return; -} - -The kernel API has three interfaces exported from the driver: - - hpet_register(struct hpet_task *tp, int periodic) - hpet_unregister(struct hpet_task *tp) - hpet_control(struct hpet_task *tp, unsigned int cmd, unsigned long arg) - -The kernel module using this interface fills in the ht_func and ht_data -members of the hpet_task structure before calling hpet_register. -hpet_control simply vectors to the hpet_ioctl routine and has the same -commands and respective arguments as the user API. hpet_unregister -is used to terminate usage of the HPET timer reserved by hpet_register. diff --git a/Documentation/timers/00-INDEX b/Documentation/timers/00-INDEX new file mode 100644 index 00000000000..397dc35e132 --- /dev/null +++ b/Documentation/timers/00-INDEX @@ -0,0 +1,10 @@ +00-INDEX + - this file +highres.txt + - High resolution timers and dynamic ticks design notes +hpet.txt + - High Precision Event Timer Driver for Linux +hrtimers.txt + - subsystem for high-resolution kernel timers +timer_stats.txt + - timer usage statistics diff --git a/Documentation/timers/hpet.txt b/Documentation/timers/hpet.txt new file mode 100644 index 00000000000..6ad52d9dad6 --- /dev/null +++ b/Documentation/timers/hpet.txt @@ -0,0 +1,300 @@ + High Precision Event Timer Driver for Linux + +The High Precision Event Timer (HPET) hardware is the future replacement +for the 8254 and Real Time Clock (RTC) periodic timer functionality. 
+Each HPET can have up to 32 timers. It is possible to configure the +first two timers as legacy replacements for 8254 and RTC periodic timers. +A specification done by Intel and Microsoft can be found at +. + +The driver supports detection of HPET driver allocation and initialization +of the HPET before the driver module_init routine is called. This enables +platform code which uses timer 0 or 1 as the main timer to intercept HPET +initialization. An example of this initialization can be found in +arch/i386/kernel/time_hpet.c. + +The driver provides two APIs which are very similar to the API found in +the rtc.c driver. There is a user space API and a kernel space API. +An example user space program is provided below. + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +extern void hpet_open_close(int, const char **); +extern void hpet_info(int, const char **); +extern void hpet_poll(int, const char **); +extern void hpet_fasync(int, const char **); +extern void hpet_read(int, const char **); + +#include +#include +#include + +struct hpet_command { + char *command; + void (*func)(int argc, const char ** argv); +} hpet_command[] = { + { + "open-close", + hpet_open_close + }, + { + "info", + hpet_info + }, + { + "poll", + hpet_poll + }, + { + "fasync", + hpet_fasync + }, +}; + +int +main(int argc, const char ** argv) +{ + int i; + + argc--; + argv++; + + if (!argc) { + fprintf(stderr, "-hpet: requires command\n"); + return -1; + } + + + for (i = 0; i < (sizeof (hpet_command) / sizeof (hpet_command[0])); i++) + if (!strcmp(argv[0], hpet_command[i].command)) { + argc--; + argv++; + fprintf(stderr, "-hpet: executing %s\n", + hpet_command[i].command); + hpet_command[i].func(argc, argv); + return 0; + } + + fprintf(stderr, "do_hpet: command %s not implemented\n", argv[0]); + + return -1; +} + +void +hpet_open_close(int argc, const char **argv) +{ + int fd; + + if (argc != 1) { + fprintf(stderr, "hpet_open_close: device-name\n"); + return; + } + + fd = open(argv[0], O_RDONLY); + if (fd < 0) + fprintf(stderr, "hpet_open_close: open failed\n"); + else + close(fd); + + return; +} + +void +hpet_info(int argc, const char **argv) +{ +} + +void +hpet_poll(int argc, const char **argv) +{ + unsigned long freq; + int iterations, i, fd; + struct pollfd pfd; + struct hpet_info info; + struct timeval stv, etv; + struct timezone tz; + long usec; + + if (argc != 3) { + fprintf(stderr, "hpet_poll: device-name freq iterations\n"); + return; + } + + freq = atoi(argv[1]); + iterations = atoi(argv[2]); + + fd = open(argv[0], O_RDONLY); + + if (fd < 0) { + fprintf(stderr, "hpet_poll: open of %s failed\n", argv[0]); + return; + } + + if (ioctl(fd, HPET_IRQFREQ, freq) < 0) { + fprintf(stderr, "hpet_poll: HPET_IRQFREQ failed\n"); + goto out; + } + + if (ioctl(fd, HPET_INFO, &info) < 0) { + fprintf(stderr, "hpet_poll: failed to get info\n"); + goto out; + } + + fprintf(stderr, "hpet_poll: info.hi_flags 0x%lx\n", info.hi_flags); + + if (info.hi_flags && (ioctl(fd, HPET_EPI, 0) < 0)) { + fprintf(stderr, "hpet_poll: HPET_EPI failed\n"); + goto out; + } + + if (ioctl(fd, HPET_IE_ON, 0) < 0) { + fprintf(stderr, "hpet_poll, HPET_IE_ON failed\n"); + goto out; + } + + pfd.fd = fd; + pfd.events = POLLIN; + + for (i = 0; i < iterations; i++) { + pfd.revents = 0; + gettimeofday(&stv, &tz); + if (poll(&pfd, 1, -1) < 0) + fprintf(stderr, "hpet_poll: poll failed\n"); + else { + long data; + + gettimeofday(&etv, &tz); + usec 
= stv.tv_sec * 1000000 + stv.tv_usec; + usec = (etv.tv_sec * 1000000 + etv.tv_usec) - usec; + + fprintf(stderr, + "hpet_poll: expired time = 0x%lx\n", usec); + + fprintf(stderr, "hpet_poll: revents = 0x%x\n", + pfd.revents); + + if (read(fd, &data, sizeof(data)) != sizeof(data)) { + fprintf(stderr, "hpet_poll: read failed\n"); + } + else + fprintf(stderr, "hpet_poll: data 0x%lx\n", + data); + } + } + +out: + close(fd); + return; +} + +static int hpet_sigio_count; + +static void +hpet_sigio(int val) +{ + fprintf(stderr, "hpet_sigio: called\n"); + hpet_sigio_count++; +} + +void +hpet_fasync(int argc, const char **argv) +{ + unsigned long freq; + int iterations, i, fd, value; + sig_t oldsig; + struct hpet_info info; + + hpet_sigio_count = 0; + fd = -1; + + if ((oldsig = signal(SIGIO, hpet_sigio)) == SIG_ERR) { + fprintf(stderr, "hpet_fasync: failed to set signal handler\n"); + return; + } + + if (argc != 3) { + fprintf(stderr, "hpet_fasync: device-name freq iterations\n"); + goto out; + } + + fd = open(argv[0], O_RDONLY); + + if (fd < 0) { + fprintf(stderr, "hpet_fasync: failed to open %s\n", argv[0]); + return; + } + + + if ((fcntl(fd, F_SETOWN, getpid()) == 1) || + ((value = fcntl(fd, F_GETFL)) == 1) || + (fcntl(fd, F_SETFL, value | O_ASYNC) == 1)) { + fprintf(stderr, "hpet_fasync: fcntl failed\n"); + goto out; + } + + freq = atoi(argv[1]); + iterations = atoi(argv[2]); + + if (ioctl(fd, HPET_IRQFREQ, freq) < 0) { + fprintf(stderr, "hpet_fasync: HPET_IRQFREQ failed\n"); + goto out; + } + + if (ioctl(fd, HPET_INFO, &info) < 0) { + fprintf(stderr, "hpet_fasync: failed to get info\n"); + goto out; + } + + fprintf(stderr, "hpet_fasync: info.hi_flags 0x%lx\n", info.hi_flags); + + if (info.hi_flags && (ioctl(fd, HPET_EPI, 0) < 0)) { + fprintf(stderr, "hpet_fasync: HPET_EPI failed\n"); + goto out; + } + + if (ioctl(fd, HPET_IE_ON, 0) < 0) { + fprintf(stderr, "hpet_fasync, HPET_IE_ON failed\n"); + goto out; + } + + for (i = 0; i < iterations; i++) { + (void) pause(); + fprintf(stderr, "hpet_fasync: count = %d\n", hpet_sigio_count); + } + +out: + signal(SIGIO, oldsig); + + if (fd >= 0) + close(fd); + + return; +} + +The kernel API has three interfaces exported from the driver: + + hpet_register(struct hpet_task *tp, int periodic) + hpet_unregister(struct hpet_task *tp) + hpet_control(struct hpet_task *tp, unsigned int cmd, unsigned long arg) + +The kernel module using this interface fills in the ht_func and ht_data +members of the hpet_task structure before calling hpet_register. +hpet_control simply vectors to the hpet_ioctl routine and has the same +commands and respective arguments as the user API. hpet_unregister +is used to terminate usage of the HPET timer reserved by hpet_register. 
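A minimal kernel-space counterpart to the user program above might look as follows. This is a sketch only: it assumes the hpet_task layout (ht_func, ht_data) from include/linux/hpet.h of this era, and the example_* names are illustrative rather than part of the driver.

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/hpet.h>

static void example_hpet_func(void *data)
{
	/* invoked for each interrupt of the reserved HPET timer */
	printk(KERN_DEBUG "hpet example: tick, data=%p\n", data);
}

static struct hpet_task example_task = {
	.ht_func = example_hpet_func,
	.ht_data = NULL,
};

static int __init example_init(void)
{
	int ret;

	ret = hpet_register(&example_task, 1);	/* request a periodic timer */
	if (ret)
		return ret;

	/* same commands and respective arguments as the user-space ioctls */
	ret = hpet_control(&example_task, HPET_IRQFREQ, 100);
	if (!ret)
		ret = hpet_control(&example_task, HPET_IE_ON, 0);
	if (ret)
		hpet_unregister(&example_task);
	return ret;
}

static void __exit example_exit(void)
{
	hpet_control(&example_task, HPET_IE_OFF, 0);
	hpet_unregister(&example_task);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");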
-- cgit v1.2.3 From 021f8b75e78f9da67421a2c2e320e8934a90914a Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Tue, 22 Jul 2008 21:08:45 +0200 Subject: x86: add PCI IDs for AMD Barcelona PCI devices Signed-off-by: Robert Richter Cc: oprofile-list Cc: Barry Kasindorf Signed-off-by: Ingo Molnar --- include/linux/pci_ids.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index c3b1761aba2..917d48e5ea0 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -497,6 +497,11 @@ #define PCI_DEVICE_ID_AMD_K8_NB_ADDRMAP 0x1101 #define PCI_DEVICE_ID_AMD_K8_NB_MEMCTL 0x1102 #define PCI_DEVICE_ID_AMD_K8_NB_MISC 0x1103 +#define PCI_DEVICE_ID_AMD_10H_NB_HT 0x1200 +#define PCI_DEVICE_ID_AMD_10H_NB_MAP 0x1201 +#define PCI_DEVICE_ID_AMD_10H_NB_DRAM 0x1202 +#define PCI_DEVICE_ID_AMD_10H_NB_MISC 0x1203 +#define PCI_DEVICE_ID_AMD_10H_NB_LINK 0x1204 #define PCI_DEVICE_ID_AMD_LANCE 0x2000 #define PCI_DEVICE_ID_AMD_LANCE_HOME 0x2001 #define PCI_DEVICE_ID_AMD_SCSI 0x2020 -- cgit v1.2.3 From 286f571837ba9d67625afd015366d79345abb414 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Tue, 22 Jul 2008 21:08:46 +0200 Subject: x86: apic_*.c: add description to AMD's extended LVT functions Signed-off-by: Robert Richter Cc: oprofile-list Cc: Barry Kasindorf Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic_32.c | 3 +++ arch/x86/kernel/apic_64.c | 3 +++ 2 files changed, 6 insertions(+) diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c index d6c89835837..fad94b0decc 100644 --- a/arch/x86/kernel/apic_32.c +++ b/arch/x86/kernel/apic_32.c @@ -646,6 +646,9 @@ int setup_profiling_timer(unsigned int multiplier) * * Vector mappings are hard coded. On K8 only offset 0 (APIC500) and * MCE interrupts are supported. Thus MCE offset must be set to 0. + * + * If mask=1, the LVT entry does not generate interrupts while mask=0 + * enables the vector. See also the BKDGs. */ #define APIC_EILVT_LVTOFF_MCE 0 diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c index 7f1f030da7e..42bf69f8bc8 100644 --- a/arch/x86/kernel/apic_64.c +++ b/arch/x86/kernel/apic_64.c @@ -205,6 +205,9 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen) * * Vector mappings are hard coded. On K8 only offset 0 (APIC500) and * MCE interrupts are supported. Thus MCE offset must be set to 0. + * + * If mask=1, the LVT entry does not generate interrupts while mask=0 + * enables the vector. See also the BKDGs. */ #define APIC_EILVT_LVTOFF_MCE 0 -- cgit v1.2.3 From 12f2b2610e812627acf338aaf043fef20bb726ca Mon Sep 17 00:00:00 2001 From: Barry Kasindorf Date: Tue, 22 Jul 2008 21:08:47 +0200 Subject: oprofile: Add support for AMD Family 11h This patch add support for AMD Family 11h CPUs. 
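The hook point is the vendor/family switch in op_nmi_init(); condensed excerpt of the AMD branch as it stands after this patch (a sketch that omits the Intel cases and the surrounding oprofile_operations setup, see arch/x86/oprofile/nmi_int.c for the full function):

switch (boot_cpu_data.x86) {	/* CPU family, X86_VENDOR_AMD branch */
default:
	return -ENODEV;
case 6:			/* Athlon/K7 */
	model = &op_athlon_spec;
	cpu_type = "i386/athlon";
	break;
case 0xf:		/* K8 */
	model = &op_athlon_spec;
	cpu_type = "x86-64/hammer";
	break;
case 0x10:		/* family 10h (Barcelona) */
	model = &op_athlon_spec;
	cpu_type = "x86-64/family10";
	break;
case 0x11:		/* family 11h, the case added here */
	model = &op_athlon_spec;
	cpu_type = "x86-64/family11h";
	break;
}

All four families share the same model code (op_athlon_spec, renamed op_amd_spec later in this series); only the cpu_type string reported to user space differs.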
Signed-off-by: Barry Kasindorf Signed-off-by: Robert Richter Cc: oprofile-list Signed-off-by: Ingo Molnar --- arch/x86/oprofile/nmi_int.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index 3f90289410e..33db99ab90c 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -436,6 +436,10 @@ int __init op_nmi_init(struct oprofile_operations *ops) model = &op_athlon_spec; cpu_type = "x86-64/family10"; break; + case 0x11: + model = &op_athlon_spec; + cpu_type = "x86-64/family11h"; + break; } break; -- cgit v1.2.3 From adf5ec0bca553b763a6b9baed2677a4c7470025b Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Tue, 22 Jul 2008 21:08:48 +0200 Subject: x86/oprofile: introduce model specific init/exit functions This patch implements model specific OProfile init/exit functions for x86 CPUs. Though there is more rework needed at the initialization code, this new introduced functions allow it to keep model specific code in the corresponding op_model_*.c files. The function interface is the same as for oprofile_arch_init/exit(). Signed-off-by: Robert Richter Cc: oprofile-list Signed-off-by: Ingo Molnar --- arch/x86/oprofile/nmi_int.c | 11 ++++++++++- arch/x86/oprofile/op_model_athlon.c | 18 +++++++++++++++--- arch/x86/oprofile/op_x86_model.h | 2 ++ 3 files changed, 27 insertions(+), 4 deletions(-) diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index 33db99ab90c..75e889156f2 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -1,10 +1,11 @@ /** * @file nmi_int.c * - * @remark Copyright 2002 OProfile authors + * @remark Copyright 2002-2008 OProfile authors * @remark Read the file COPYING * * @author John Levon + * @author Robert Richter */ #include @@ -411,6 +412,7 @@ int __init op_nmi_init(struct oprofile_operations *ops) __u8 vendor = boot_cpu_data.x86_vendor; __u8 family = boot_cpu_data.x86; char *cpu_type; + int ret = 0; if (!cpu_has_apic) return -ENODEV; @@ -466,6 +468,11 @@ int __init op_nmi_init(struct oprofile_operations *ops) return -ENODEV; } + if (model->init) + ret = model->init(ops); + if (ret) + return ret; + init_sysfs(); using_nmi = 1; ops->create_files = nmi_create_files; @@ -482,4 +489,6 @@ void op_nmi_exit(void) { if (using_nmi) exit_sysfs(); + if (model->exit) + model->exit(); } diff --git a/arch/x86/oprofile/op_model_athlon.c b/arch/x86/oprofile/op_model_athlon.c index 3d534879a9d..dd8b1dcd163 100644 --- a/arch/x86/oprofile/op_model_athlon.c +++ b/arch/x86/oprofile/op_model_athlon.c @@ -1,14 +1,15 @@ /* - * @file op_model_athlon.h + * @file op_model_athlon.c * athlon / K7 / K8 / Family 10h model-specific MSR operations * - * @remark Copyright 2002 OProfile authors + * @remark Copyright 2002-2008 OProfile authors * @remark Read the file COPYING * * @author John Levon * @author Philippe Elie * @author Graydon Hoare - */ + * @author Robert Richter +*/ #include #include @@ -178,7 +179,18 @@ static void athlon_shutdown(struct op_msrs const * const msrs) } } +static int op_amd_init(struct oprofile_operations *ops) +{ + return 0; +} + +static void op_amd_exit(void) +{ +} + struct op_x86_model_spec const op_athlon_spec = { + .init = op_amd_init, + .exit = op_amd_exit, .num_counters = NUM_COUNTERS, .num_controls = NUM_CONTROLS, .fill_in_addresses = &athlon_fill_in_addresses, diff --git a/arch/x86/oprofile/op_x86_model.h b/arch/x86/oprofile/op_x86_model.h index 45b605fa71d..ee9ca96253f 100644 --- a/arch/x86/oprofile/op_x86_model.h +++ 
b/arch/x86/oprofile/op_x86_model.h @@ -32,6 +32,8 @@ struct pt_regs; * various x86 CPU models' perfctr support. */ struct op_x86_model_spec { + int (*init)(struct oprofile_operations *ops); + void (*exit)(void); unsigned int const num_counters; unsigned int const num_controls; void (*fill_in_addresses)(struct op_msrs * const msrs); -- cgit v1.2.3 From dfa154289701ed315909b4e371a0381c52c8bdc9 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Tue, 22 Jul 2008 21:08:49 +0200 Subject: x86/oprofile: Minor changes in op_model_athlon.c Signed-off-by: Robert Richter Cc: oprofile-list Cc: Barry Kasindorf Signed-off-by: Ingo Molnar --- arch/x86/oprofile/op_model_athlon.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/oprofile/op_model_athlon.c b/arch/x86/oprofile/op_model_athlon.c index dd8b1dcd163..d25d7f19503 100644 --- a/arch/x86/oprofile/op_model_athlon.c +++ b/arch/x86/oprofile/op_model_athlon.c @@ -45,6 +45,8 @@ static unsigned long reset_value[NUM_COUNTERS]; +/* functions for op_athlon_spec */ + static void athlon_fill_in_addresses(struct op_msrs * const msrs) { int i; -- cgit v1.2.3 From 6657fe4f5650ff7174d418d4bfa50c4640e81a2b Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Tue, 22 Jul 2008 21:08:50 +0200 Subject: x86/oprofile: renaming athlon_*() into op_amd_*() These functions contain code for all AMD CPUs. The new names fit better. Signed-off-by: Robert Richter Cc: oprofile-list Cc: Barry Kasindorf Signed-off-by: Ingo Molnar --- arch/x86/oprofile/nmi_int.c | 8 ++++---- arch/x86/oprofile/op_model_athlon.c | 28 ++++++++++++++-------------- arch/x86/oprofile/op_x86_model.h | 2 +- 3 files changed, 19 insertions(+), 19 deletions(-) diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index 75e889156f2..b29819313f2 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -425,21 +425,21 @@ int __init op_nmi_init(struct oprofile_operations *ops) default: return -ENODEV; case 6: - model = &op_athlon_spec; + model = &op_amd_spec; cpu_type = "i386/athlon"; break; case 0xf: - model = &op_athlon_spec; + model = &op_amd_spec; /* Actually it could be i386/hammer too, but give user space an consistent name. 
*/ cpu_type = "x86-64/hammer"; break; case 0x10: - model = &op_athlon_spec; + model = &op_amd_spec; cpu_type = "x86-64/family10"; break; case 0x11: - model = &op_athlon_spec; + model = &op_amd_spec; cpu_type = "x86-64/family11h"; break; } diff --git a/arch/x86/oprofile/op_model_athlon.c b/arch/x86/oprofile/op_model_athlon.c index d25d7f19503..40ecb020c7d 100644 --- a/arch/x86/oprofile/op_model_athlon.c +++ b/arch/x86/oprofile/op_model_athlon.c @@ -45,9 +45,9 @@ static unsigned long reset_value[NUM_COUNTERS]; -/* functions for op_athlon_spec */ +/* functions for op_amd_spec */ -static void athlon_fill_in_addresses(struct op_msrs * const msrs) +static void op_amd_fill_in_addresses(struct op_msrs * const msrs) { int i; @@ -67,7 +67,7 @@ static void athlon_fill_in_addresses(struct op_msrs * const msrs) } -static void athlon_setup_ctrs(struct op_msrs const * const msrs) +static void op_amd_setup_ctrs(struct op_msrs const * const msrs) { unsigned int low, high; int i; @@ -116,7 +116,7 @@ static void athlon_setup_ctrs(struct op_msrs const * const msrs) } -static int athlon_check_ctrs(struct pt_regs * const regs, +static int op_amd_check_ctrs(struct pt_regs * const regs, struct op_msrs const * const msrs) { unsigned int low, high; @@ -137,7 +137,7 @@ static int athlon_check_ctrs(struct pt_regs * const regs, } -static void athlon_start(struct op_msrs const * const msrs) +static void op_amd_start(struct op_msrs const * const msrs) { unsigned int low, high; int i; @@ -151,7 +151,7 @@ static void athlon_start(struct op_msrs const * const msrs) } -static void athlon_stop(struct op_msrs const * const msrs) +static void op_amd_stop(struct op_msrs const * const msrs) { unsigned int low, high; int i; @@ -167,7 +167,7 @@ static void athlon_stop(struct op_msrs const * const msrs) } } -static void athlon_shutdown(struct op_msrs const * const msrs) +static void op_amd_shutdown(struct op_msrs const * const msrs) { int i; @@ -190,15 +190,15 @@ static void op_amd_exit(void) { } -struct op_x86_model_spec const op_athlon_spec = { +struct op_x86_model_spec const op_amd_spec = { .init = op_amd_init, .exit = op_amd_exit, .num_counters = NUM_COUNTERS, .num_controls = NUM_CONTROLS, - .fill_in_addresses = &athlon_fill_in_addresses, - .setup_ctrs = &athlon_setup_ctrs, - .check_ctrs = &athlon_check_ctrs, - .start = &athlon_start, - .stop = &athlon_stop, - .shutdown = &athlon_shutdown + .fill_in_addresses = &op_amd_fill_in_addresses, + .setup_ctrs = &op_amd_setup_ctrs, + .check_ctrs = &op_amd_check_ctrs, + .start = &op_amd_start, + .stop = &op_amd_stop, + .shutdown = &op_amd_shutdown }; diff --git a/arch/x86/oprofile/op_x86_model.h b/arch/x86/oprofile/op_x86_model.h index ee9ca96253f..05a0261ba0c 100644 --- a/arch/x86/oprofile/op_x86_model.h +++ b/arch/x86/oprofile/op_x86_model.h @@ -48,6 +48,6 @@ struct op_x86_model_spec { extern struct op_x86_model_spec const op_ppro_spec; extern struct op_x86_model_spec const op_p4_spec; extern struct op_x86_model_spec const op_p4_ht2_spec; -extern struct op_x86_model_spec const op_athlon_spec; +extern struct op_x86_model_spec const op_amd_spec; #endif /* OP_X86_MODEL_H */ -- cgit v1.2.3 From 73185e0a5d11d729d451692034fbe55a9eba7468 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Tue, 22 Jul 2008 21:08:51 +0200 Subject: drivers/oprofile: coding style fixes in buffer_sync.c Signed-off-by: Robert Richter Cc: oprofile-list Cc: Barry Kasindorf Signed-off-by: Ingo Molnar --- drivers/oprofile/buffer_sync.c | 111 +++++++++++++++++++++-------------------- 1 file changed, 57 insertions(+), 54 
deletions(-) diff --git a/drivers/oprofile/buffer_sync.c b/drivers/oprofile/buffer_sync.c index 9304c455507..69a732778ba 100644 --- a/drivers/oprofile/buffer_sync.c +++ b/drivers/oprofile/buffer_sync.c @@ -33,7 +33,7 @@ #include "event_buffer.h" #include "cpu_buffer.h" #include "buffer_sync.h" - + static LIST_HEAD(dying_tasks); static LIST_HEAD(dead_tasks); static cpumask_t marked_cpus = CPU_MASK_NONE; @@ -48,10 +48,11 @@ static void process_task_mortuary(void); * Can be invoked from softirq via RCU callback due to * call_rcu() of the task struct, hence the _irqsave. */ -static int task_free_notify(struct notifier_block * self, unsigned long val, void * data) +static int +task_free_notify(struct notifier_block *self, unsigned long val, void *data) { unsigned long flags; - struct task_struct * task = data; + struct task_struct *task = data; spin_lock_irqsave(&task_mortuary, flags); list_add(&task->tasks, &dying_tasks); spin_unlock_irqrestore(&task_mortuary, flags); @@ -62,13 +63,14 @@ static int task_free_notify(struct notifier_block * self, unsigned long val, voi /* The task is on its way out. A sync of the buffer means we can catch * any remaining samples for this task. */ -static int task_exit_notify(struct notifier_block * self, unsigned long val, void * data) +static int +task_exit_notify(struct notifier_block *self, unsigned long val, void *data) { /* To avoid latency problems, we only process the current CPU, * hoping that most samples for the task are on this CPU */ sync_buffer(raw_smp_processor_id()); - return 0; + return 0; } @@ -77,11 +79,12 @@ static int task_exit_notify(struct notifier_block * self, unsigned long val, voi * we don't lose any. This does not have to be exact, it's a QoI issue * only. */ -static int munmap_notify(struct notifier_block * self, unsigned long val, void * data) +static int +munmap_notify(struct notifier_block *self, unsigned long val, void *data) { unsigned long addr = (unsigned long)data; - struct mm_struct * mm = current->mm; - struct vm_area_struct * mpnt; + struct mm_struct *mm = current->mm; + struct vm_area_struct *mpnt; down_read(&mm->mmap_sem); @@ -99,11 +102,12 @@ static int munmap_notify(struct notifier_block * self, unsigned long val, void * return 0; } - + /* We need to be told about new modules so we don't attribute to a previously * loaded module, or drop the samples on the floor. 
*/ -static int module_load_notify(struct notifier_block * self, unsigned long val, void * data) +static int +module_load_notify(struct notifier_block *self, unsigned long val, void *data) { #ifdef CONFIG_MODULES if (val != MODULE_STATE_COMING) @@ -118,7 +122,7 @@ static int module_load_notify(struct notifier_block * self, unsigned long val, v return 0; } - + static struct notifier_block task_free_nb = { .notifier_call = task_free_notify, }; @@ -135,7 +139,7 @@ static struct notifier_block module_load_nb = { .notifier_call = module_load_notify, }; - + static void end_sync(void) { end_cpu_work(); @@ -208,14 +212,14 @@ static inline unsigned long fast_get_dcookie(struct path *path) * not strictly necessary but allows oprofile to associate * shared-library samples with particular applications */ -static unsigned long get_exec_dcookie(struct mm_struct * mm) +static unsigned long get_exec_dcookie(struct mm_struct *mm) { unsigned long cookie = NO_COOKIE; - struct vm_area_struct * vma; - + struct vm_area_struct *vma; + if (!mm) goto out; - + for (vma = mm->mmap; vma; vma = vma->vm_next) { if (!vma->vm_file) continue; @@ -235,13 +239,14 @@ out: * sure to do this lookup before a mm->mmap modification happens so * we don't lose track. */ -static unsigned long lookup_dcookie(struct mm_struct * mm, unsigned long addr, off_t * offset) +static unsigned long +lookup_dcookie(struct mm_struct *mm, unsigned long addr, off_t *offset) { unsigned long cookie = NO_COOKIE; - struct vm_area_struct * vma; + struct vm_area_struct *vma; for (vma = find_vma(mm, addr); vma; vma = vma->vm_next) { - + if (addr < vma->vm_start || addr >= vma->vm_end) continue; @@ -265,7 +270,7 @@ static unsigned long lookup_dcookie(struct mm_struct * mm, unsigned long addr, o static unsigned long last_cookie = INVALID_COOKIE; - + static void add_cpu_switch(int i) { add_event_entry(ESCAPE_CODE); @@ -278,16 +283,16 @@ static void add_kernel_ctx_switch(unsigned int in_kernel) { add_event_entry(ESCAPE_CODE); if (in_kernel) - add_event_entry(KERNEL_ENTER_SWITCH_CODE); + add_event_entry(KERNEL_ENTER_SWITCH_CODE); else - add_event_entry(KERNEL_EXIT_SWITCH_CODE); + add_event_entry(KERNEL_EXIT_SWITCH_CODE); } - + static void -add_user_ctx_switch(struct task_struct const * task, unsigned long cookie) +add_user_ctx_switch(struct task_struct const *task, unsigned long cookie) { add_event_entry(ESCAPE_CODE); - add_event_entry(CTX_SWITCH_CODE); + add_event_entry(CTX_SWITCH_CODE); add_event_entry(task->pid); add_event_entry(cookie); /* Another code for daemon back-compat */ @@ -296,7 +301,7 @@ add_user_ctx_switch(struct task_struct const * task, unsigned long cookie) add_event_entry(task->tgid); } - + static void add_cookie_switch(unsigned long cookie) { add_event_entry(ESCAPE_CODE); @@ -304,7 +309,7 @@ static void add_cookie_switch(unsigned long cookie) add_event_entry(cookie); } - + static void add_trace_begin(void) { add_event_entry(ESCAPE_CODE); @@ -319,13 +324,13 @@ static void add_sample_entry(unsigned long offset, unsigned long event) } -static int add_us_sample(struct mm_struct * mm, struct op_sample * s) +static int add_us_sample(struct mm_struct *mm, struct op_sample *s) { unsigned long cookie; off_t offset; - - cookie = lookup_dcookie(mm, s->eip, &offset); - + + cookie = lookup_dcookie(mm, s->eip, &offset); + if (cookie == INVALID_COOKIE) { atomic_inc(&oprofile_stats.sample_lost_no_mapping); return 0; @@ -341,13 +346,13 @@ static int add_us_sample(struct mm_struct * mm, struct op_sample * s) return 1; } - + /* Add a sample to the global event 
buffer. If possible the * sample is converted into a persistent dentry/offset pair * for later lookup from userspace. */ static int -add_sample(struct mm_struct * mm, struct op_sample * s, int in_kernel) +add_sample(struct mm_struct *mm, struct op_sample *s, int in_kernel) { if (in_kernel) { add_sample_entry(s->eip, s->event); @@ -359,9 +364,9 @@ add_sample(struct mm_struct * mm, struct op_sample * s, int in_kernel) } return 0; } - -static void release_mm(struct mm_struct * mm) + +static void release_mm(struct mm_struct *mm) { if (!mm) return; @@ -370,9 +375,9 @@ static void release_mm(struct mm_struct * mm) } -static struct mm_struct * take_tasks_mm(struct task_struct * task) +static struct mm_struct *take_tasks_mm(struct task_struct *task) { - struct mm_struct * mm = get_task_mm(task); + struct mm_struct *mm = get_task_mm(task); if (mm) down_read(&mm->mmap_sem); return mm; @@ -383,10 +388,10 @@ static inline int is_code(unsigned long val) { return val == ESCAPE_CODE; } - + /* "acquire" as many cpu buffer slots as we can */ -static unsigned long get_slots(struct oprofile_cpu_buffer * b) +static unsigned long get_slots(struct oprofile_cpu_buffer *b) { unsigned long head = b->head_pos; unsigned long tail = b->tail_pos; @@ -412,7 +417,7 @@ static unsigned long get_slots(struct oprofile_cpu_buffer * b) } -static void increment_tail(struct oprofile_cpu_buffer * b) +static void increment_tail(struct oprofile_cpu_buffer *b) { unsigned long new_tail = b->tail_pos + 1; @@ -435,8 +440,8 @@ static void process_task_mortuary(void) { unsigned long flags; LIST_HEAD(local_dead_tasks); - struct task_struct * task; - struct task_struct * ttask; + struct task_struct *task; + struct task_struct *ttask; spin_lock_irqsave(&task_mortuary, flags); @@ -493,7 +498,7 @@ void sync_buffer(int cpu) { struct oprofile_cpu_buffer *cpu_buf = &per_cpu(cpu_buffer, cpu); struct mm_struct *mm = NULL; - struct task_struct * new; + struct task_struct *new; unsigned long cookie = 0; int in_kernel = 1; unsigned int i; @@ -501,7 +506,7 @@ void sync_buffer(int cpu) unsigned long available; mutex_lock(&buffer_mutex); - + add_cpu_switch(cpu); /* Remember, only we can modify tail_pos */ @@ -509,8 +514,8 @@ void sync_buffer(int cpu) available = get_slots(cpu_buf); for (i = 0; i < available; ++i) { - struct op_sample * s = &cpu_buf->buffer[cpu_buf->tail_pos]; - + struct op_sample *s = &cpu_buf->buffer[cpu_buf->tail_pos]; + if (is_code(s->eip)) { if (s->event <= CPU_IS_KERNEL) { /* kernel/userspace switch */ @@ -522,7 +527,7 @@ void sync_buffer(int cpu) state = sb_bt_start; add_trace_begin(); } else { - struct mm_struct * oldmm = mm; + struct mm_struct *oldmm = mm; /* userspace context switch */ new = (struct task_struct *)s->event; @@ -533,13 +538,11 @@ void sync_buffer(int cpu) cookie = get_exec_dcookie(mm); add_user_ctx_switch(new, cookie); } - } else { - if (state >= sb_bt_start && - !add_sample(mm, s, in_kernel)) { - if (state == sb_bt_start) { - state = sb_bt_ignore; - atomic_inc(&oprofile_stats.bt_lost_no_mapping); - } + } else if (state >= sb_bt_start && + !add_sample(mm, s, in_kernel)) { + if (state == sb_bt_start) { + state = sb_bt_ignore; + atomic_inc(&oprofile_stats.bt_lost_no_mapping); } } -- cgit v1.2.3 From 5e11f98dcebc40c9d67e8d21a5f47248444c17a8 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Tue, 22 Jul 2008 21:08:52 +0200 Subject: OProfile: moving increment_tail() in buffer_sync.c Signed-off-by: Robert Richter Cc: oprofile-list Cc: Barry Kasindorf Signed-off-by: Ingo Molnar --- drivers/oprofile/buffer_sync.c | 24 
+++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/drivers/oprofile/buffer_sync.c b/drivers/oprofile/buffer_sync.c index 69a732778ba..615929f6f0c 100644 --- a/drivers/oprofile/buffer_sync.c +++ b/drivers/oprofile/buffer_sync.c @@ -268,6 +268,17 @@ lookup_dcookie(struct mm_struct *mm, unsigned long addr, off_t *offset) return cookie; } +static void increment_tail(struct oprofile_cpu_buffer *b) +{ + unsigned long new_tail = b->tail_pos + 1; + + rmb(); + + if (new_tail < b->buffer_size) + b->tail_pos = new_tail; + else + b->tail_pos = 0; +} static unsigned long last_cookie = INVALID_COOKIE; @@ -417,19 +428,6 @@ static unsigned long get_slots(struct oprofile_cpu_buffer *b) } -static void increment_tail(struct oprofile_cpu_buffer *b) -{ - unsigned long new_tail = b->tail_pos + 1; - - rmb(); - - if (new_tail < b->buffer_size) - b->tail_pos = new_tail; - else - b->tail_pos = 0; -} - - /* Move tasks along towards death. Any tasks on dead_tasks * will definitely have no remaining references in any * CPU buffers at this point, because we use two lists, -- cgit v1.2.3 From ee648bc77f11b57d15a68d336fc30e343198f893 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Tue, 22 Jul 2008 21:08:53 +0200 Subject: OProfile: add IBS code macros Signed-off-by: Robert Richter Cc: oprofile-list Cc: Barry Kasindorf Signed-off-by: Ingo Molnar --- include/linux/oprofile.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/linux/oprofile.h b/include/linux/oprofile.h index 041bb31100f..bcb8f725427 100644 --- a/include/linux/oprofile.h +++ b/include/linux/oprofile.h @@ -36,6 +36,8 @@ #define XEN_ENTER_SWITCH_CODE 10 #define SPU_PROFILING_CODE 11 #define SPU_CTX_SWITCH_CODE 12 +#define IBS_FETCH_CODE 13 +#define IBS_OP_CODE 14 struct super_block; struct dentry; -- cgit v1.2.3 From 345c25730d085c45622ac779da4dbd97dc3a10fe Mon Sep 17 00:00:00 2001 From: Barry Kasindorf Date: Tue, 22 Jul 2008 21:08:54 +0200 Subject: x86/oprofile: add IBS support for AMD CPUs, IBS buffer handling routines This patchset supports the new profiling hardware available in the latest AMD CPUs in the oProfile driver. Signed-off-by: Barry Kasindorf Signed-off-by: Robert Richter Cc: oprofile-list Signed-off-by: Ingo Molnar --- drivers/oprofile/buffer_sync.c | 72 +++++++++++++++++++++++++++++++++++++++++- drivers/oprofile/cpu_buffer.c | 68 ++++++++++++++++++++++++++++++++++++++- drivers/oprofile/cpu_buffer.h | 2 ++ 3 files changed, 140 insertions(+), 2 deletions(-) diff --git a/drivers/oprofile/buffer_sync.c b/drivers/oprofile/buffer_sync.c index 615929f6f0c..e1782d2df09 100644 --- a/drivers/oprofile/buffer_sync.c +++ b/drivers/oprofile/buffer_sync.c @@ -5,6 +5,7 @@ * @remark Read the file COPYING * * @author John Levon + * @author Barry Kasindorf * * This is the core of the buffer management. 
Each * CPU buffer is processed and entered into the @@ -272,7 +273,7 @@ static void increment_tail(struct oprofile_cpu_buffer *b) { unsigned long new_tail = b->tail_pos + 1; - rmb(); + rmb(); /* be sure fifo pointers are synchromized */ if (new_tail < b->buffer_size) b->tail_pos = new_tail; @@ -327,6 +328,67 @@ static void add_trace_begin(void) add_event_entry(TRACE_BEGIN_CODE); } +#define IBS_FETCH_CODE_SIZE 2 +#define IBS_OP_CODE_SIZE 5 +#define IBS_EIP(offset) \ + (((struct op_sample *)&cpu_buf->buffer[(offset)])->eip) +#define IBS_EVENT(offset) \ + (((struct op_sample *)&cpu_buf->buffer[(offset)])->event) + +/* + * Add IBS fetch and op entries to event buffer + */ +static void add_ibs_begin(struct oprofile_cpu_buffer *cpu_buf, int code, + int in_kernel, struct mm_struct *mm) +{ + unsigned long rip; + int i, count; + unsigned long ibs_cookie = 0; + off_t offset; + + increment_tail(cpu_buf); /* move to RIP entry */ + + rip = IBS_EIP(cpu_buf->tail_pos); + +#ifdef __LP64__ + rip += IBS_EVENT(cpu_buf->tail_pos) << 32; +#endif + + if (mm) { + ibs_cookie = lookup_dcookie(mm, rip, &offset); + + if (ibs_cookie == NO_COOKIE) + offset = rip; + if (ibs_cookie == INVALID_COOKIE) { + atomic_inc(&oprofile_stats.sample_lost_no_mapping); + offset = rip; + } + if (ibs_cookie != last_cookie) { + add_cookie_switch(ibs_cookie); + last_cookie = ibs_cookie; + } + } else + offset = rip; + + add_event_entry(ESCAPE_CODE); + add_event_entry(code); + add_event_entry(offset); /* Offset from Dcookie */ + + /* we send the Dcookie offset, but send the raw Linear Add also*/ + add_event_entry(IBS_EIP(cpu_buf->tail_pos)); + add_event_entry(IBS_EVENT(cpu_buf->tail_pos)); + + if (code == IBS_FETCH_CODE) + count = IBS_FETCH_CODE_SIZE; /*IBS FETCH is 2 int64s*/ + else + count = IBS_OP_CODE_SIZE; /*IBS OP is 5 int64s*/ + + for (i = 0; i < count; i++) { + increment_tail(cpu_buf); + add_event_entry(IBS_EIP(cpu_buf->tail_pos)); + add_event_entry(IBS_EVENT(cpu_buf->tail_pos)); + } +} static void add_sample_entry(unsigned long offset, unsigned long event) { @@ -524,6 +586,14 @@ void sync_buffer(int cpu) } else if (s->event == CPU_TRACE_BEGIN) { state = sb_bt_start; add_trace_begin(); + } else if (s->event == IBS_FETCH_BEGIN) { + state = sb_bt_start; + add_ibs_begin(cpu_buf, + IBS_FETCH_CODE, in_kernel, mm); + } else if (s->event == IBS_OP_BEGIN) { + state = sb_bt_start; + add_ibs_begin(cpu_buf, + IBS_OP_CODE, in_kernel, mm); } else { struct mm_struct *oldmm = mm; diff --git a/drivers/oprofile/cpu_buffer.c b/drivers/oprofile/cpu_buffer.c index 2450b3a393f..c9ac4e15691 100644 --- a/drivers/oprofile/cpu_buffer.c +++ b/drivers/oprofile/cpu_buffer.c @@ -5,6 +5,7 @@ * @remark Read the file COPYING * * @author John Levon + * @author Barry Kasindorf * * Each CPU has a local buffer that stores PC value/event * pairs. We also log context switches when we notice them. 
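As an aid to reading the buffer-handling hunks that follow: each slot of the per-CPU ring is a two-word {eip, event} pair, and control records are encoded in-band by setting eip to ESCAPE_CODE. A sketch of the layout (struct op_sample per cpu_buffer.h of this era; the IBS record shape follows the new log_ibs_sample()/add_ibs_begin() code in this patch):

/* one slot of the per-CPU ring (cpu_buffer.h) */
struct op_sample {
	unsigned long eip;	/* PC, or ESCAPE_CODE for control entries */
	unsigned long event;	/* event/count, or the control code */
};

/*
 * An IBS fetch record as log_ibs_sample() writes it and
 * add_ibs_begin() later decodes it:
 *
 *   { ESCAPE_CODE,         IBS_FETCH_BEGIN     }  control entry
 *   { IbsFetchLinAd low,   IbsFetchLinAd high  }  the sampled RIP
 *   { IbsFetchCtl low,     IbsFetchCtl high    }
 *   { IbsFetchPhysAd low,  IbsFetchPhysAd high }
 *
 * Op records are tagged IBS_OP_BEGIN and carry six data pairs
 * (RIP, Data1-3, DC linear, DC physical), hence the
 * MAX_IBS_SAMPLE_SIZE of 14 words including the control entry.
 */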
@@ -207,7 +208,7 @@ static int log_sample(struct oprofile_cpu_buffer * cpu_buf, unsigned long pc, return 1; } -static int oprofile_begin_trace(struct oprofile_cpu_buffer * cpu_buf) +static int oprofile_begin_trace(struct oprofile_cpu_buffer *cpu_buf) { if (nr_available_slots(cpu_buf) < 4) { cpu_buf->sample_lost_overflow++; @@ -252,6 +253,71 @@ void oprofile_add_sample(struct pt_regs * const regs, unsigned long event) oprofile_add_ext_sample(pc, regs, event, is_kernel); } +#define MAX_IBS_SAMPLE_SIZE 14 +static int log_ibs_sample(struct oprofile_cpu_buffer *cpu_buf, + unsigned long pc, int is_kernel, unsigned int *ibs, int ibs_code) +{ + struct task_struct *task; + + cpu_buf->sample_received++; + + if (nr_available_slots(cpu_buf) < MAX_IBS_SAMPLE_SIZE) { + cpu_buf->sample_lost_overflow++; + return 0; + } + + is_kernel = !!is_kernel; + + /* notice a switch from user->kernel or vice versa */ + if (cpu_buf->last_is_kernel != is_kernel) { + cpu_buf->last_is_kernel = is_kernel; + add_code(cpu_buf, is_kernel); + } + + /* notice a task switch */ + if (!is_kernel) { + task = current; + + if (cpu_buf->last_task != task) { + cpu_buf->last_task = task; + add_code(cpu_buf, (unsigned long)task); + } + } + + add_code(cpu_buf, ibs_code); + add_sample(cpu_buf, ibs[0], ibs[1]); + add_sample(cpu_buf, ibs[2], ibs[3]); + add_sample(cpu_buf, ibs[4], ibs[5]); + + if (ibs_code == IBS_OP_BEGIN) { + add_sample(cpu_buf, ibs[6], ibs[7]); + add_sample(cpu_buf, ibs[8], ibs[9]); + add_sample(cpu_buf, ibs[10], ibs[11]); + } + + return 1; +} + +void oprofile_add_ibs_sample(struct pt_regs *const regs, + unsigned int * const ibs_sample, u8 code) +{ + int is_kernel = !user_mode(regs); + unsigned long pc = profile_pc(regs); + + struct oprofile_cpu_buffer *cpu_buf = + &per_cpu(cpu_buffer, smp_processor_id()); + + if (!backtrace_depth) { + log_ibs_sample(cpu_buf, pc, is_kernel, ibs_sample, code); + return; + } + + /* if log_sample() fails we can't backtrace since we lost the source + * of this event */ + if (log_ibs_sample(cpu_buf, pc, is_kernel, ibs_sample, code)) + oprofile_ops.backtrace(regs, backtrace_depth); +} + void oprofile_add_pc(unsigned long pc, int is_kernel, unsigned long event) { struct oprofile_cpu_buffer *cpu_buf = &__get_cpu_var(cpu_buffer); diff --git a/drivers/oprofile/cpu_buffer.h b/drivers/oprofile/cpu_buffer.h index c3e366b5226..9c44d004da6 100644 --- a/drivers/oprofile/cpu_buffer.h +++ b/drivers/oprofile/cpu_buffer.h @@ -55,5 +55,7 @@ void cpu_buffer_reset(struct oprofile_cpu_buffer * cpu_buf); /* transient events for the CPU buffer -> event buffer */ #define CPU_IS_KERNEL 1 #define CPU_TRACE_BEGIN 2 +#define IBS_FETCH_BEGIN 3 +#define IBS_OP_BEGIN 4 #endif /* OPROFILE_CPU_BUFFER_H */ -- cgit v1.2.3 From 56784f11df473b4c1d9d0e37777fd7c0b77b6bca Mon Sep 17 00:00:00 2001 From: Barry Kasindorf Date: Tue, 22 Jul 2008 21:08:55 +0200 Subject: x86/oprofile: add IBS support for AMD CPUs, model specific code This patchset supports the new profiling hardware available in the latest AMD CPUs in the oProfile driver. 
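Most of the new model-specific code is MSR programming of the two IBS engines. As a reading aid, a sketch of the fetch-side arming sequence (bit positions follow the macros in the patch below; that IbsFetchMaxCnt counts in units of 16 fetches is an assumption taken from AMD's BKDG, hence the shift):

#include <asm/msr.h>

#define SKETCH_IBS_FETCH_ENABLE	(1UL << 16)	/* IbsFetchEn, MSR bit 48 */

/* arm IBS fetch sampling; max_cnt is the desired fetch interval */
static void sketch_arm_ibs_fetch(unsigned long max_cnt)
{
	unsigned int low, high;

	low  = (max_cnt >> 4) & 0xFFFF;		/* IbsFetchMaxCnt field */
	high = SKETCH_IBS_FETCH_ENABLE;		/* start counting */
	wrmsr(MSR_AMD64_IBSFETCHCTL, low, high);
}

On each NMI the handler checks the valid bit (IbsFetchVal, MSR bit 49), reads the control, linear-address and physical-address MSRs into a struct ibs_fetch_sample, passes that to oprofile_add_ibs_sample(), and re-arms by clearing the valid bit and setting the enable bit again; this is the sequence implemented in op_amd_check_ctrs() below.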
Signed-off-by: Barry Kasindorf Signed-off-by: Robert Richter Cc: oprofile-list Signed-off-by: Ingo Molnar --- arch/x86/oprofile/op_model_athlon.c | 257 ++++++++++++++++++++++++++++++++++++ 1 file changed, 257 insertions(+) diff --git a/arch/x86/oprofile/op_model_athlon.c b/arch/x86/oprofile/op_model_athlon.c index 40ecb020c7d..229e0b4e21e 100644 --- a/arch/x86/oprofile/op_model_athlon.c +++ b/arch/x86/oprofile/op_model_athlon.c @@ -9,9 +9,13 @@ * @author Philippe Elie * @author Graydon Hoare * @author Robert Richter + * @author Barry Kasindorf */ #include +#include +#include + #include #include #include @@ -43,7 +47,83 @@ #define CTRL_SET_HOST_ONLY(val, h) (val |= ((h & 1) << 9)) #define CTRL_SET_GUEST_ONLY(val, h) (val |= ((h & 1) << 8)) +#define IBS_FETCH_CTL_HIGH_MASK 0xFFFFFFFF +/* high dword bit IbsFetchCtl[bit 49] */ +#define IBS_FETCH_VALID_BIT (1UL << 17) +/* high dword bit IbsFetchCtl[bit 52] */ +#define IBS_FETCH_PHY_ADDR_VALID_BIT (1UL << 20) +/* high dword bit IbsFetchCtl[bit 48] */ +#define IBS_FETCH_ENABLE (1UL << 16) + +#define IBS_FETCH_CTL_CNT_MASK 0x00000000FFFF0000UL +#define IBS_FETCH_CTL_MAX_CNT_MASK 0x000000000000FFFFUL + +/*IbsOpCtl masks/bits */ +#define IBS_OP_VALID_BIT (1ULL<<18) /* IbsOpCtl[bit18] */ +#define IBS_OP_ENABLE (1ULL<<17) /* IBS_OP_ENABLE[bit17]*/ + +/* Codes used in cpu_buffer.c */ +#define IBS_FETCH_BEGIN 3 +#define IBS_OP_BEGIN 4 + +/*IbsOpData3 masks */ +#define IBS_CTL_LVT_OFFSET_VALID_BIT (1ULL<<8) + +/*PCI Extended Configuration Constants */ +/* MSR to set the IBS control register APIC LVT offset */ +#define IBS_LVT_OFFSET_PCI 0x1CC + +struct ibs_fetch_sample { + /* MSRC001_1031 IBS Fetch Linear Address Register */ + unsigned int ibs_fetch_lin_addr_low; + unsigned int ibs_fetch_lin_addr_high; + /* MSRC001_1030 IBS Fetch Control Register */ + unsigned int ibs_fetch_ctl_low; + unsigned int ibs_fetch_ctl_high; + /* MSRC001_1032 IBS Fetch Physical Address Register */ + unsigned int ibs_fetch_phys_addr_low; + unsigned int ibs_fetch_phys_addr_high; +}; + +struct ibs_op_sample { + /* MSRC001_1034 IBS Op Logical Address Register (IbsRIP) */ + unsigned int ibs_op_rip_low; + unsigned int ibs_op_rip_high; + /* MSRC001_1035 IBS Op Data Register */ + unsigned int ibs_op_data1_low; + unsigned int ibs_op_data1_high; + /* MSRC001_1036 IBS Op Data 2 Register */ + unsigned int ibs_op_data2_low; + unsigned int ibs_op_data2_high; + /* MSRC001_1037 IBS Op Data 3 Register */ + unsigned int ibs_op_data3_low; + unsigned int ibs_op_data3_high; + /* MSRC001_1038 IBS DC Linear Address Register (IbsDcLinAd) */ + unsigned int ibs_dc_linear_low; + unsigned int ibs_dc_linear_high; + /* MSRC001_1039 IBS DC Physical Address Register (IbsDcPhysAd) */ + unsigned int ibs_dc_phys_low; + unsigned int ibs_dc_phys_high; +}; + +/* + * unitialize the APIC for the IBS interrupts if needed on AMD Family10h+ +*/ +static void clear_ibs_nmi(void); + static unsigned long reset_value[NUM_COUNTERS]; +static int ibs_allowed; /* AMD Family10h and later */ + +struct op_ibs_config { + unsigned long op_enabled; + unsigned long fetch_enabled; + unsigned long max_cnt_fetch; + unsigned long max_cnt_op; + unsigned long rand_en; + unsigned long dispatched_ops; +}; + +static struct op_ibs_config ibs_config; /* functions for op_amd_spec */ @@ -121,6 +201,8 @@ static int op_amd_check_ctrs(struct pt_regs * const regs, { unsigned int low, high; int i; + struct ibs_fetch_sample ibs_fetch; + struct ibs_op_sample ibs_op; for (i = 0 ; i < NUM_COUNTERS; ++i) { if (!reset_value[i]) @@ -132,6 +214,65 @@ static int 
op_amd_check_ctrs(struct pt_regs * const regs, } } + /*If AMD and IBS is available */ + if (ibs_allowed && ibs_config.fetch_enabled) { + rdmsr(MSR_AMD64_IBSFETCHCTL, low, high); + if (high & IBS_FETCH_VALID_BIT) { + ibs_fetch.ibs_fetch_ctl_high = high; + ibs_fetch.ibs_fetch_ctl_low = low; + rdmsr(MSR_AMD64_IBSFETCHLINAD, low, high); + ibs_fetch.ibs_fetch_lin_addr_high = high; + ibs_fetch.ibs_fetch_lin_addr_low = low; + rdmsr(MSR_AMD64_IBSFETCHPHYSAD, low, high); + ibs_fetch.ibs_fetch_phys_addr_high = high; + ibs_fetch.ibs_fetch_phys_addr_low = low; + + oprofile_add_ibs_sample(regs, + (unsigned int *)&ibs_fetch, + IBS_FETCH_BEGIN); + + /*reenable the IRQ */ + rdmsr(MSR_AMD64_IBSFETCHCTL, low, high); + high &= ~(IBS_FETCH_VALID_BIT); + high |= IBS_FETCH_ENABLE; + low &= IBS_FETCH_CTL_MAX_CNT_MASK; + wrmsr(MSR_AMD64_IBSFETCHCTL, low, high); + } + } + + if (ibs_allowed && ibs_config.op_enabled) { + rdmsr(MSR_AMD64_IBSOPCTL, low, high); + if (low & IBS_OP_VALID_BIT) { + rdmsr(MSR_AMD64_IBSOPRIP, low, high); + ibs_op.ibs_op_rip_low = low; + ibs_op.ibs_op_rip_high = high; + rdmsr(MSR_AMD64_IBSOPDATA, low, high); + ibs_op.ibs_op_data1_low = low; + ibs_op.ibs_op_data1_high = high; + rdmsr(MSR_AMD64_IBSOPDATA2, low, high); + ibs_op.ibs_op_data2_low = low; + ibs_op.ibs_op_data2_high = high; + rdmsr(MSR_AMD64_IBSOPDATA3, low, high); + ibs_op.ibs_op_data3_low = low; + ibs_op.ibs_op_data3_high = high; + rdmsr(MSR_AMD64_IBSDCLINAD, low, high); + ibs_op.ibs_dc_linear_low = low; + ibs_op.ibs_dc_linear_high = high; + rdmsr(MSR_AMD64_IBSDCPHYSAD, low, high); + ibs_op.ibs_dc_phys_low = low; + ibs_op.ibs_dc_phys_high = high; + + /* reenable the IRQ */ + oprofile_add_ibs_sample(regs, + (unsigned int *)&ibs_op, + IBS_OP_BEGIN); + rdmsr(MSR_AMD64_IBSOPCTL, low, high); + low &= ~(IBS_OP_VALID_BIT); + low |= IBS_OP_ENABLE; + wrmsr(MSR_AMD64_IBSOPCTL, low, high); + } + } + /* See op_model_ppro.c */ return 1; } @@ -148,6 +289,17 @@ static void op_amd_start(struct op_msrs const * const msrs) CTRL_WRITE(low, high, msrs, i); } } + if (ibs_allowed && ibs_config.fetch_enabled) { + low = (ibs_config.max_cnt_fetch >> 4) & 0xFFFF; + high = IBS_FETCH_ENABLE; + wrmsr(MSR_AMD64_IBSFETCHCTL, low, high); + } + + if (ibs_allowed && ibs_config.op_enabled) { + low = ((ibs_config.max_cnt_op >> 4) & 0xFFFF) + IBS_OP_ENABLE; + high = 0; + wrmsr(MSR_AMD64_IBSOPCTL, low, high); + } } @@ -165,6 +317,18 @@ static void op_amd_stop(struct op_msrs const * const msrs) CTRL_SET_INACTIVE(low); CTRL_WRITE(low, high, msrs, i); } + + if (ibs_allowed && ibs_config.fetch_enabled) { + low = 0; /* clear max count and enable */ + high = 0; + wrmsr(MSR_AMD64_IBSFETCHCTL, low, high); + } + + if (ibs_allowed && ibs_config.op_enabled) { + low = 0; /* clear max count and enable */ + high = 0; + wrmsr(MSR_AMD64_IBSOPCTL, low, high); + } } static void op_amd_shutdown(struct op_msrs const * const msrs) @@ -181,6 +345,99 @@ static void op_amd_shutdown(struct op_msrs const * const msrs) } } +static inline void apic_init_ibs_nmi_per_cpu(void *arg) +{ + setup_APIC_eilvt_ibs(0, APIC_EILVT_MSG_NMI, 0); +} + +static inline void apic_clear_ibs_nmi_per_cpu(void *arg) +{ + setup_APIC_eilvt_ibs(0, APIC_EILVT_MSG_FIX, 1); +} + +/* + * initialize the APIC for the IBS interrupts + * if needed on AMD Family10h rev B0 and later + */ +static void setup_ibs(void) +{ + struct pci_dev *gh_device = NULL; + u32 low, high; + u8 vector; + + ibs_allowed = boot_cpu_has(X86_FEATURE_IBS); + + if (!ibs_allowed) + return; + + /* This gets the APIC_EILVT_LVTOFF_IBS value */ + vector = 
setup_APIC_eilvt_ibs(0, 0, 1); + + /*see if the IBS control register is already set correctly*/ + /*remove this when we know for sure it is done + in the kernel init*/ + rdmsr(MSR_AMD64_IBSCTL, low, high); + if ((low & (IBS_CTL_LVT_OFFSET_VALID_BIT | vector)) != + (IBS_CTL_LVT_OFFSET_VALID_BIT | vector)) { + + /**** Be sure to run loop until NULL is returned to + decrement reference count on any pci_dev structures + returned ****/ + while ((gh_device = pci_get_device(PCI_VENDOR_ID_AMD, + PCI_DEVICE_ID_AMD_10H_NB_MISC, gh_device)) + != NULL) { + /* This code may change if we can find a proper + * way to get at the PCI extended config space */ + pci_write_config_dword( + gh_device, IBS_LVT_OFFSET_PCI, + (vector | IBS_CTL_LVT_OFFSET_VALID_BIT)); + } + } + on_each_cpu(apic_init_ibs_nmi_per_cpu, NULL, 1, 1); +} + + +/* + * unitialize the APIC for the IBS interrupts if needed on AMD Family10h + * rev B0 and later */ +static void clear_ibs_nmi(void) +{ + if (ibs_allowed) + on_each_cpu(apic_clear_ibs_nmi_per_cpu, NULL, 1, 1); +} + +static void setup_ibs_files(struct super_block *sb, struct dentry *root) +{ + char buf[12]; + struct dentry *dir; + + if (!ibs_allowed) + return; + + /* setup some reasonable defaults */ + ibs_config.max_cnt_fetch = 250000; + ibs_config.fetch_enabled = 0; + ibs_config.max_cnt_op = 250000; + ibs_config.op_enabled = 0; + ibs_config.dispatched_ops = 1; + snprintf(buf, sizeof(buf), "ibs_fetch"); + dir = oprofilefs_mkdir(sb, root, buf); + oprofilefs_create_ulong(sb, dir, "rand_enable", + &ibs_config.rand_en); + oprofilefs_create_ulong(sb, dir, "enable", + &ibs_config.fetch_enabled); + oprofilefs_create_ulong(sb, dir, "max_count", + &ibs_config.max_cnt_fetch); + snprintf(buf, sizeof(buf), "ibs_uops"); + dir = oprofilefs_mkdir(sb, root, buf); + oprofilefs_create_ulong(sb, dir, "enable", + &ibs_config.op_enabled); + oprofilefs_create_ulong(sb, dir, "max_count", + &ibs_config.max_cnt_op); + oprofilefs_create_ulong(sb, dir, "dispatched_ops", + &ibs_config.dispatched_ops); +} + static int op_amd_init(struct oprofile_operations *ops) { return 0; -- cgit v1.2.3 From 7939d2bf7e30353d40d14f967b7fe2b2a7be5bd9 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Tue, 22 Jul 2008 21:08:56 +0200 Subject: x86/oprofile: separating the IBS handler Signed-off-by: Robert Richter Cc: oprofile-list Cc: Barry Kasindorf Signed-off-by: Ingo Molnar --- arch/x86/oprofile/op_model_athlon.c | 45 +++++++++++++++++++++++-------------- 1 file changed, 28 insertions(+), 17 deletions(-) diff --git a/arch/x86/oprofile/op_model_athlon.c b/arch/x86/oprofile/op_model_athlon.c index 229e0b4e21e..a2c8e2e372b 100644 --- a/arch/x86/oprofile/op_model_athlon.c +++ b/arch/x86/oprofile/op_model_athlon.c @@ -195,27 +195,18 @@ static void op_amd_setup_ctrs(struct op_msrs const * const msrs) } } - -static int op_amd_check_ctrs(struct pt_regs * const regs, - struct op_msrs const * const msrs) +static inline int +op_amd_handle_ibs(struct pt_regs * const regs, + struct op_msrs const * const msrs) { unsigned int low, high; - int i; struct ibs_fetch_sample ibs_fetch; struct ibs_op_sample ibs_op; - for (i = 0 ; i < NUM_COUNTERS; ++i) { - if (!reset_value[i]) - continue; - CTR_READ(low, high, msrs, i); - if (CTR_OVERFLOWED(low)) { - oprofile_add_sample(regs, i); - CTR_WRITE(reset_value[i], msrs, i); - } - } + if (!ibs_allowed) + return 1; - /*If AMD and IBS is available */ - if (ibs_allowed && ibs_config.fetch_enabled) { + if (ibs_config.fetch_enabled) { rdmsr(MSR_AMD64_IBSFETCHCTL, low, high); if (high & IBS_FETCH_VALID_BIT) { 
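/* IbsFetchVal is architectural bit 49 of IbsFetchCtl, i.e. bit 17 of the
   high dword returned by rdmsr(); when set, a completed fetch sample is
   ready and is read out below */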
ibs_fetch.ibs_fetch_ctl_high = high; @@ -240,7 +231,7 @@ static int op_amd_check_ctrs(struct pt_regs * const regs, } } - if (ibs_allowed && ibs_config.op_enabled) { + if (ibs_config.op_enabled) { rdmsr(MSR_AMD64_IBSOPCTL, low, high); if (low & IBS_OP_VALID_BIT) { rdmsr(MSR_AMD64_IBSOPRIP, low, high); @@ -273,10 +264,30 @@ static int op_amd_check_ctrs(struct pt_regs * const regs, } } - /* See op_model_ppro.c */ return 1; } +static int op_amd_check_ctrs(struct pt_regs * const regs, + struct op_msrs const * const msrs) +{ + unsigned int low, high; + int i; + + for (i = 0 ; i < NUM_COUNTERS; ++i) { + if (!reset_value[i]) + continue; + CTR_READ(low, high, msrs, i); + if (CTR_OVERFLOWED(low)) { + oprofile_add_sample(regs, i); + CTR_WRITE(reset_value[i], msrs, i); + } + } + + op_amd_handle_ibs(regs, msrs); + + /* See op_model_ppro.c */ + return 1; +} static void op_amd_start(struct op_msrs const * const msrs) { -- cgit v1.2.3 From 7d77f2dcae37cf232950cd0181fb0a2cddb18130 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Tue, 22 Jul 2008 21:08:57 +0200 Subject: OProfile: change IBS interrupt initialization Signed-off-by: Robert Richter Cc: oprofile-list Cc: Barry Kasindorf Signed-off-by: Ingo Molnar --- arch/x86/oprofile/op_model_athlon.c | 84 ++++++++++++++++++++++++------------- 1 file changed, 54 insertions(+), 30 deletions(-) diff --git a/arch/x86/oprofile/op_model_athlon.c b/arch/x86/oprofile/op_model_athlon.c index a2c8e2e372b..90193b1538a 100644 --- a/arch/x86/oprofile/op_model_athlon.c +++ b/arch/x86/oprofile/op_model_athlon.c @@ -356,9 +356,11 @@ static void op_amd_shutdown(struct op_msrs const * const msrs) } } +static u8 ibs_eilvt_off; + static inline void apic_init_ibs_nmi_per_cpu(void *arg) { - setup_APIC_eilvt_ibs(0, APIC_EILVT_MSG_NMI, 0); + ibs_eilvt_off = setup_APIC_eilvt_ibs(0, APIC_EILVT_MSG_NMI, 0); } static inline void apic_clear_ibs_nmi_per_cpu(void *arg) @@ -366,45 +368,67 @@ static inline void apic_clear_ibs_nmi_per_cpu(void *arg) setup_APIC_eilvt_ibs(0, APIC_EILVT_MSG_FIX, 1); } +static int pfm_amd64_setup_eilvt(void) +{ +#define IBSCTL_LVTOFFSETVAL (1 << 8) +#define IBSCTL 0x1cc + struct pci_dev *cpu_cfg; + int nodes; + u32 value = 0; + + /* per CPU setup */ + on_each_cpu(apic_init_ibs_nmi_per_cpu, NULL, 0, 1); + + nodes = 0; + cpu_cfg = NULL; + do { + cpu_cfg = pci_get_device(PCI_VENDOR_ID_AMD, + PCI_DEVICE_ID_AMD_10H_NB_MISC, + cpu_cfg); + if (!cpu_cfg) + break; + ++nodes; + pci_write_config_dword(cpu_cfg, IBSCTL, ibs_eilvt_off + | IBSCTL_LVTOFFSETVAL); + pci_read_config_dword(cpu_cfg, IBSCTL, &value); + if (value != (ibs_eilvt_off | IBSCTL_LVTOFFSETVAL)) { + printk(KERN_DEBUG "Failed to setup IBS LVT offset, " + "IBSCTL = 0x%08x", value); + return 1; + } + } while (1); + + if (!nodes) { + printk(KERN_DEBUG "No CPU node configured for IBS"); + return 1; + } + +#ifdef CONFIG_NUMA + /* Sanity check */ + /* Works only for 64bit with proper numa implementation. 
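Each node's northbridge shows up as one PCI misc-control device, so the
device count found above should equal num_possible_nodes().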
*/ + if (nodes != num_possible_nodes()) { + printk(KERN_DEBUG "Failed to setup CPU node(s) for IBS, " + "found: %d, expected %d", + nodes, num_possible_nodes()); + return 1; + } +#endif + return 0; +} + /* * initialize the APIC for the IBS interrupts - * if needed on AMD Family10h rev B0 and later + * if available (AMD Family10h rev B0 and later) */ static void setup_ibs(void) { - struct pci_dev *gh_device = NULL; - u32 low, high; - u8 vector; - ibs_allowed = boot_cpu_has(X86_FEATURE_IBS); if (!ibs_allowed) return; - /* This gets the APIC_EILVT_LVTOFF_IBS value */ - vector = setup_APIC_eilvt_ibs(0, 0, 1); - - /*see if the IBS control register is already set correctly*/ - /*remove this when we know for sure it is done - in the kernel init*/ - rdmsr(MSR_AMD64_IBSCTL, low, high); - if ((low & (IBS_CTL_LVT_OFFSET_VALID_BIT | vector)) != - (IBS_CTL_LVT_OFFSET_VALID_BIT | vector)) { - - /**** Be sure to run loop until NULL is returned to - decrement reference count on any pci_dev structures - returned ****/ - while ((gh_device = pci_get_device(PCI_VENDOR_ID_AMD, - PCI_DEVICE_ID_AMD_10H_NB_MISC, gh_device)) - != NULL) { - /* This code may change if we can find a proper - * way to get at the PCI extended config space */ - pci_write_config_dword( - gh_device, IBS_LVT_OFFSET_PCI, - (vector | IBS_CTL_LVT_OFFSET_VALID_BIT)); - } - } - on_each_cpu(apic_init_ibs_nmi_per_cpu, NULL, 1, 1); + if (pfm_amd64_setup_eilvt()) + ibs_allowed = 0; } -- cgit v1.2.3 From 90645700ef8393cd9cdc02d83da36726fc88f49e Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Tue, 22 Jul 2008 21:08:58 +0200 Subject: OProfile: Fix build error in op_model_athlon.c Signed-off-by: Robert Richter Cc: oprofile-list Cc: Barry Kasindorf Signed-off-by: Ingo Molnar --- arch/x86/oprofile/op_model_athlon.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/x86/oprofile/op_model_athlon.c b/arch/x86/oprofile/op_model_athlon.c index 90193b1538a..284c45661ed 100644 --- a/arch/x86/oprofile/op_model_athlon.c +++ b/arch/x86/oprofile/op_model_athlon.c @@ -73,6 +73,11 @@ /* MSR to set the IBS control register APIC LVT offset */ #define IBS_LVT_OFFSET_PCI 0x1CC +/* The function interface needs to be fixed, something like add + data. Should then be added to linux/oprofile.h. 
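Until then the sample is handed over as a raw array of 32-bit words, with a
code (IBS_FETCH_BEGIN or IBS_OP_BEGIN) telling the buffer code how to size
and tag it.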
*/ +extern void oprofile_add_ibs_sample(struct pt_regs *const regs, + unsigned int * const ibs_sample, u8 code); + struct ibs_fetch_sample { /* MSRC001_1031 IBS Fetch Linear Address Register */ unsigned int ibs_fetch_lin_addr_low; -- cgit v1.2.3 From ebb535de267386f659e0348185b1e361dbba3b59 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Tue, 22 Jul 2008 21:08:59 +0200 Subject: OProfile: on_each_cpu(): kill unused retry parameter Signed-off-by: Robert Richter Cc: oprofile-list Cc: Barry Kasindorf Signed-off-by: Ingo Molnar --- arch/x86/oprofile/op_model_athlon.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/oprofile/op_model_athlon.c b/arch/x86/oprofile/op_model_athlon.c index 284c45661ed..ce732362dbb 100644 --- a/arch/x86/oprofile/op_model_athlon.c +++ b/arch/x86/oprofile/op_model_athlon.c @@ -382,7 +382,7 @@ static int pfm_amd64_setup_eilvt(void) u32 value = 0; /* per CPU setup */ - on_each_cpu(apic_init_ibs_nmi_per_cpu, NULL, 0, 1); + on_each_cpu(apic_init_ibs_nmi_per_cpu, NULL, 1); nodes = 0; cpu_cfg = NULL; @@ -443,7 +443,7 @@ static void setup_ibs(void) static void clear_ibs_nmi(void) { if (ibs_allowed) - on_each_cpu(apic_clear_ibs_nmi_per_cpu, NULL, 1, 1); + on_each_cpu(apic_clear_ibs_nmi_per_cpu, NULL, 1); } static void setup_ibs_files(struct super_block *sb, struct dentry *root) -- cgit v1.2.3 From fc2bd7345b4e006a34c2ea3711d8c6b83cba50f7 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Tue, 22 Jul 2008 21:09:00 +0200 Subject: OProfile: fix setup_ibs_files() function interface Signed-off-by: Robert Richter Cc: oprofile-list Cc: Barry Kasindorf Signed-off-by: Ingo Molnar --- arch/x86/oprofile/op_model_athlon.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/x86/oprofile/op_model_athlon.c b/arch/x86/oprofile/op_model_athlon.c index ce732362dbb..2650b12a0c5 100644 --- a/arch/x86/oprofile/op_model_athlon.c +++ b/arch/x86/oprofile/op_model_athlon.c @@ -446,13 +446,13 @@ static void clear_ibs_nmi(void) on_each_cpu(apic_clear_ibs_nmi_per_cpu, NULL, 1); } -static void setup_ibs_files(struct super_block *sb, struct dentry *root) +static int setup_ibs_files(struct super_block * sb, struct dentry * root) { char buf[12]; struct dentry *dir; if (!ibs_allowed) - return; + return 0; /* setup some reasonable defaults */ ibs_config.max_cnt_fetch = 250000; @@ -476,6 +476,8 @@ static void setup_ibs_files(struct super_block *sb, struct dentry *root) &ibs_config.max_cnt_op); oprofilefs_create_ulong(sb, dir, "dispatched_ops", &ibs_config.dispatched_ops); + + return 0; } static int op_amd_init(struct oprofile_operations *ops) -- cgit v1.2.3 From 270d3e1a10e6ef85d5a085377e01d91dbcbe3726 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Tue, 22 Jul 2008 21:09:01 +0200 Subject: OProfile: enable IBS for AMD CPUs Signed-off-by: Robert Richter Cc: oprofile-list Cc: Barry Kasindorf Signed-off-by: Ingo Molnar --- arch/x86/oprofile/nmi_int.c | 14 ++++++++------ arch/x86/oprofile/op_model_athlon.c | 18 +++++++++++++++++- 2 files changed, 25 insertions(+), 7 deletions(-) diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index b29819313f2..287513a0981 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -468,6 +468,14 @@ int __init op_nmi_init(struct oprofile_operations *ops) return -ENODEV; } + /* default values, can be overwritten by model */ + ops->create_files = nmi_create_files; + ops->setup = nmi_setup; + ops->shutdown = nmi_shutdown; + ops->start = nmi_start; + ops->stop = nmi_stop; + 
ops->cpu_type = cpu_type; + if (model->init) ret = model->init(ops); if (ret) @@ -475,12 +483,6 @@ int __init op_nmi_init(struct oprofile_operations *ops) init_sysfs(); using_nmi = 1; - ops->create_files = nmi_create_files; - ops->setup = nmi_setup; - ops->shutdown = nmi_shutdown; - ops->start = nmi_start; - ops->stop = nmi_stop; - ops->cpu_type = cpu_type; printk(KERN_INFO "oprofile: using NMI interrupt.\n"); return 0; } diff --git a/arch/x86/oprofile/op_model_athlon.c b/arch/x86/oprofile/op_model_athlon.c index 2650b12a0c5..0d839031979 100644 --- a/arch/x86/oprofile/op_model_athlon.c +++ b/arch/x86/oprofile/op_model_athlon.c @@ -446,13 +446,25 @@ static void clear_ibs_nmi(void) on_each_cpu(apic_clear_ibs_nmi_per_cpu, NULL, 1); } +static int (*create_arch_files)(struct super_block * sb, struct dentry * root); + static int setup_ibs_files(struct super_block * sb, struct dentry * root) { char buf[12]; struct dentry *dir; + int ret = 0; + + /* architecture specific files */ + if (create_arch_files) + ret = create_arch_files(sb, root); + + if (ret) + return ret; if (!ibs_allowed) - return 0; + return ret; + + /* model specific files */ /* setup some reasonable defaults */ ibs_config.max_cnt_fetch = 250000; @@ -482,11 +494,15 @@ static int setup_ibs_files(struct super_block * sb, struct dentry * root) static int op_amd_init(struct oprofile_operations *ops) { + setup_ibs(); + create_arch_files = ops->create_files; + ops->create_files = setup_ibs_files; return 0; } static void op_amd_exit(void) { + clear_ibs_nmi(); } struct op_x86_model_spec const op_amd_spec = { -- cgit v1.2.3 From a4c408a41167949f820e2740e56a8f2f7bb6177c Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Tue, 22 Jul 2008 21:09:02 +0200 Subject: OProfile: fix IBS build error for UP Signed-off-by: Robert Richter Cc: oprofile-list Cc: Barry Kasindorf Signed-off-by: Ingo Molnar --- arch/x86/oprofile/op_model_athlon.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/arch/x86/oprofile/op_model_athlon.c b/arch/x86/oprofile/op_model_athlon.c index 0d839031979..1acb067bd34 100644 --- a/arch/x86/oprofile/op_model_athlon.c +++ b/arch/x86/oprofile/op_model_athlon.c @@ -361,6 +361,26 @@ static void op_amd_shutdown(struct op_msrs const * const msrs) } } +#ifndef CONFIG_SMP + +/* no IBS support */ + +static void setup_ibs(void) +{ + ibs_allowed = 0; +} + +static void clear_ibs_nmi(void) {} + +static int op_amd_init(struct oprofile_operations *ops) +{ + return 0; +} + +static void op_amd_exit(void) {} + +#else + static u8 ibs_eilvt_off; static inline void apic_init_ibs_nmi_per_cpu(void *arg) @@ -505,6 +525,8 @@ static void op_amd_exit(void) clear_ibs_nmi(); } +#endif + struct op_x86_model_spec const op_amd_spec = { .init = op_amd_init, .exit = op_amd_exit, -- cgit v1.2.3 From 87f0baccc2e4f194c931186d3c8499314494a484 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Tue, 22 Jul 2008 21:09:03 +0200 Subject: x86/oprofile: macro definition cleanup in op_model_athlon.c Signed-off-by: Robert Richter Cc: oprofile-list Cc: Barry Kasindorf Signed-off-by: Ingo Molnar --- arch/x86/oprofile/op_model_athlon.c | 46 ++++++++++++++----------------------- 1 file changed, 17 insertions(+), 29 deletions(-) diff --git a/arch/x86/oprofile/op_model_athlon.c b/arch/x86/oprofile/op_model_athlon.c index 1acb067bd34..a3a2058c372 100644 --- a/arch/x86/oprofile/op_model_athlon.c +++ b/arch/x86/oprofile/op_model_athlon.c @@ -47,32 +47,20 @@ #define CTRL_SET_HOST_ONLY(val, h) (val |= ((h & 1) << 9)) #define CTRL_SET_GUEST_ONLY(val, h) (val |= 
((h & 1) << 8)) -#define IBS_FETCH_CTL_HIGH_MASK 0xFFFFFFFF -/* high dword bit IbsFetchCtl[bit 49] */ -#define IBS_FETCH_VALID_BIT (1UL << 17) -/* high dword bit IbsFetchCtl[bit 52] */ -#define IBS_FETCH_PHY_ADDR_VALID_BIT (1UL << 20) -/* high dword bit IbsFetchCtl[bit 48] */ -#define IBS_FETCH_ENABLE (1UL << 16) +/* IbsFetchCtl bits/masks */ +#define IBS_FETCH_HIGH_VALID_BIT (1UL << 17) /* bit 49 */ +#define IBS_FETCH_HIGH_ENABLE (1UL << 16) /* bit 48 */ +#define IBS_FETCH_LOW_MAX_CNT_MASK 0x0000FFFFUL /* MaxCnt mask */ -#define IBS_FETCH_CTL_CNT_MASK 0x00000000FFFF0000UL -#define IBS_FETCH_CTL_MAX_CNT_MASK 0x000000000000FFFFUL - -/*IbsOpCtl masks/bits */ -#define IBS_OP_VALID_BIT (1ULL<<18) /* IbsOpCtl[bit18] */ -#define IBS_OP_ENABLE (1ULL<<17) /* IBS_OP_ENABLE[bit17]*/ +/*IbsOpCtl bits */ +#define IBS_OP_LOW_VALID_BIT (1ULL<<18) /* bit 18 */ +#define IBS_OP_LOW_ENABLE (1ULL<<17) /* bit 17 */ /* Codes used in cpu_buffer.c */ +/* This produces duplicate code, need to be fixed */ #define IBS_FETCH_BEGIN 3 #define IBS_OP_BEGIN 4 -/*IbsOpData3 masks */ -#define IBS_CTL_LVT_OFFSET_VALID_BIT (1ULL<<8) - -/*PCI Extended Configuration Constants */ -/* MSR to set the IBS control register APIC LVT offset */ -#define IBS_LVT_OFFSET_PCI 0x1CC - /* The function interface needs to be fixed, something like add data. Should then be added to linux/oprofile.h. */ extern void oprofile_add_ibs_sample(struct pt_regs *const regs, @@ -213,7 +201,7 @@ op_amd_handle_ibs(struct pt_regs * const regs, if (ibs_config.fetch_enabled) { rdmsr(MSR_AMD64_IBSFETCHCTL, low, high); - if (high & IBS_FETCH_VALID_BIT) { + if (high & IBS_FETCH_HIGH_VALID_BIT) { ibs_fetch.ibs_fetch_ctl_high = high; ibs_fetch.ibs_fetch_ctl_low = low; rdmsr(MSR_AMD64_IBSFETCHLINAD, low, high); @@ -229,16 +217,16 @@ op_amd_handle_ibs(struct pt_regs * const regs, /*reenable the IRQ */ rdmsr(MSR_AMD64_IBSFETCHCTL, low, high); - high &= ~(IBS_FETCH_VALID_BIT); - high |= IBS_FETCH_ENABLE; - low &= IBS_FETCH_CTL_MAX_CNT_MASK; + high &= ~IBS_FETCH_HIGH_VALID_BIT; + high |= IBS_FETCH_HIGH_ENABLE; + low &= IBS_FETCH_LOW_MAX_CNT_MASK; wrmsr(MSR_AMD64_IBSFETCHCTL, low, high); } } if (ibs_config.op_enabled) { rdmsr(MSR_AMD64_IBSOPCTL, low, high); - if (low & IBS_OP_VALID_BIT) { + if (low & IBS_OP_LOW_VALID_BIT) { rdmsr(MSR_AMD64_IBSOPRIP, low, high); ibs_op.ibs_op_rip_low = low; ibs_op.ibs_op_rip_high = high; @@ -263,8 +251,8 @@ op_amd_handle_ibs(struct pt_regs * const regs, (unsigned int *)&ibs_op, IBS_OP_BEGIN); rdmsr(MSR_AMD64_IBSOPCTL, low, high); - low &= ~(IBS_OP_VALID_BIT); - low |= IBS_OP_ENABLE; + low &= ~IBS_OP_LOW_VALID_BIT; + low |= IBS_OP_LOW_ENABLE; wrmsr(MSR_AMD64_IBSOPCTL, low, high); } } @@ -307,12 +295,12 @@ static void op_amd_start(struct op_msrs const * const msrs) } if (ibs_allowed && ibs_config.fetch_enabled) { low = (ibs_config.max_cnt_fetch >> 4) & 0xFFFF; - high = IBS_FETCH_ENABLE; + high = IBS_FETCH_HIGH_ENABLE; wrmsr(MSR_AMD64_IBSFETCHCTL, low, high); } if (ibs_allowed && ibs_config.op_enabled) { - low = ((ibs_config.max_cnt_op >> 4) & 0xFFFF) + IBS_OP_ENABLE; + low = ((ibs_config.max_cnt_op >> 4) & 0xFFFF) + IBS_OP_LOW_ENABLE; high = 0; wrmsr(MSR_AMD64_IBSOPCTL, low, high); } -- cgit v1.2.3 From 543a157bbdfae8eb997506031c3b2d4d17957098 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Tue, 22 Jul 2008 21:09:04 +0200 Subject: x86/oprofile: op_model_athlon.c: fix counter reset when reenabling IBS OP Signed-off-by: Robert Richter Cc: oprofile-list Cc: Barry Kasindorf Signed-off-by: Ingo Molnar --- 
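Note: the high dword of IBSOPCTL carries the running IbsOpCurCnt, so writing
a stale value back restarts the next sample period mid-count instead of at
zero. A minimal sketch of the corrected re-enable sequence, assuming the
Family 10h register layout the driver already encodes:

    rdmsr(MSR_AMD64_IBSOPCTL, low, high);
    high = 0;                       /* discard stale IbsOpCurCnt */
    low &= ~IBS_OP_LOW_VALID_BIT;   /* acknowledge the sample */
    low |= IBS_OP_LOW_ENABLE;       /* arm the next op sample */
    wrmsr(MSR_AMD64_IBSOPCTL, low, high);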
arch/x86/oprofile/op_model_athlon.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/oprofile/op_model_athlon.c b/arch/x86/oprofile/op_model_athlon.c index a3a2058c372..9c8c8c58313 100644 --- a/arch/x86/oprofile/op_model_athlon.c +++ b/arch/x86/oprofile/op_model_athlon.c @@ -251,6 +251,7 @@ op_amd_handle_ibs(struct pt_regs * const regs, (unsigned int *)&ibs_op, IBS_OP_BEGIN); rdmsr(MSR_AMD64_IBSOPCTL, low, high); + high = 0; low &= ~IBS_OP_LOW_VALID_BIT; low |= IBS_OP_LOW_ENABLE; wrmsr(MSR_AMD64_IBSOPCTL, low, high); -- cgit v1.2.3 From 09691616850b3614dfb44790e1e1419b6a7f5d13 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Tue, 22 Jul 2008 21:09:05 +0200 Subject: x86: apic: export symbols for extended interrupt LVT functions This patch adds EXPORT_SYMBOLs to allow OProfile to be built as module. Cc: Arjan van de Ven Signed-off-by: Robert Richter Cc: oprofile-list Cc: Arjan van de Ven Cc: Barry Kasindorf Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic_32.c | 1 + arch/x86/kernel/apic_64.c | 1 + 2 files changed, 2 insertions(+) diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c index fad94b0decc..ed3a6d7e9de 100644 --- a/arch/x86/kernel/apic_32.c +++ b/arch/x86/kernel/apic_32.c @@ -672,6 +672,7 @@ u8 setup_APIC_eilvt_ibs(u8 vector, u8 msg_type, u8 mask) setup_APIC_eilvt(APIC_EILVT_LVTOFF_IBS, vector, msg_type, mask); return APIC_EILVT_LVTOFF_IBS; } +EXPORT_SYMBOL(setup_APIC_eilvt_ibs); /* * Local APIC start and shutdown diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c index 42bf69f8bc8..45ec90e37b9 100644 --- a/arch/x86/kernel/apic_64.c +++ b/arch/x86/kernel/apic_64.c @@ -232,6 +232,7 @@ u8 setup_APIC_eilvt_ibs(u8 vector, u8 msg_type, u8 mask) setup_APIC_eilvt(APIC_EILVT_LVTOFF_IBS, vector, msg_type, mask); return APIC_EILVT_LVTOFF_IBS; } +EXPORT_SYMBOL(setup_APIC_eilvt_ibs); /* * Program the next event, relative to now -- cgit v1.2.3 From 6aa360e6c16c145edf1837690e0f7aaea6b86ef3 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 23 Jul 2008 15:28:14 +0200 Subject: x86: apic: changing export symbols to *_GPL This fits better here. 
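EXPORT_SYMBOL_GPL makes the symbol resolvable only from modules that declare
a GPL-compatible license. A hypothetical consumer sketch, not taken from this
patch:

    #include <linux/module.h>
    #include <linux/init.h>

    MODULE_LICENSE("GPL"); /* required to resolve EXPORT_SYMBOL_GPL symbols */

    static int __init ibs_demo_init(void)
    {
            return 0;      /* a real module would call setup_APIC_eilvt_ibs() */
    }

    static void __exit ibs_demo_exit(void)
    {
    }

    module_init(ibs_demo_init);
    module_exit(ibs_demo_exit);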
Signed-off-by: Robert Richter Cc: Arjan van de Ven Cc: Barry Kasindorf Cc: oprofile-list Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic_32.c | 2 +- arch/x86/kernel/apic_64.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c index ed3a6d7e9de..0059e7a8a9e 100644 --- a/arch/x86/kernel/apic_32.c +++ b/arch/x86/kernel/apic_32.c @@ -672,7 +672,7 @@ u8 setup_APIC_eilvt_ibs(u8 vector, u8 msg_type, u8 mask) setup_APIC_eilvt(APIC_EILVT_LVTOFF_IBS, vector, msg_type, mask); return APIC_EILVT_LVTOFF_IBS; } -EXPORT_SYMBOL(setup_APIC_eilvt_ibs); +EXPORT_SYMBOL_GPL(setup_APIC_eilvt_ibs); /* * Local APIC start and shutdown diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c index 45ec90e37b9..e571351f2a9 100644 --- a/arch/x86/kernel/apic_64.c +++ b/arch/x86/kernel/apic_64.c @@ -232,7 +232,7 @@ u8 setup_APIC_eilvt_ibs(u8 vector, u8 msg_type, u8 mask) setup_APIC_eilvt(APIC_EILVT_LVTOFF_IBS, vector, msg_type, mask); return APIC_EILVT_LVTOFF_IBS; } -EXPORT_SYMBOL(setup_APIC_eilvt_ibs); +EXPORT_SYMBOL_GPL(setup_APIC_eilvt_ibs); /* * Program the next event, relative to now -- cgit v1.2.3 From 852402cc27bfa1200164e9e8dc7f6e5f0a4fbd46 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Tue, 22 Jul 2008 21:09:06 +0200 Subject: x86/oprofile: add CONFIG_OPROFILE_IBS option Signed-off-by: Robert Richter Cc: oprofile-list Cc: Robert Richter Cc: Barry Kasindorf Signed-off-by: Ingo Molnar --- arch/Kconfig | 14 ++++++++++++++ arch/x86/oprofile/op_model_athlon.c | 33 +++++++++++++++++++++++---------- drivers/oprofile/buffer_sync.c | 6 ++++++ drivers/oprofile/cpu_buffer.c | 4 ++++ 4 files changed, 47 insertions(+), 10 deletions(-) diff --git a/arch/Kconfig b/arch/Kconfig index b0fabfa864f..2651af48b2e 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -13,6 +13,20 @@ config OPROFILE If unsure, say N. +config OPROFILE_IBS + bool "OProfile AMD IBS support (EXPERIMENTAL)" + default n + depends on OPROFILE && SMP && X86 + help + Instruction-Based Sampling (IBS) is a new profiling + technique that provides rich, precise program performance + information. IBS is introduced by AMD Family10h processors + (AMD Opteron Quad-Core processor “Barcelona”) to overcome + the limitations of conventional performance counter + sampling. + + If unsure, say N. 
+ config HAVE_OPROFILE def_bool n diff --git a/arch/x86/oprofile/op_model_athlon.c b/arch/x86/oprofile/op_model_athlon.c index 9c8c8c58313..fb6015c1132 100644 --- a/arch/x86/oprofile/op_model_athlon.c +++ b/arch/x86/oprofile/op_model_athlon.c @@ -47,6 +47,10 @@ #define CTRL_SET_HOST_ONLY(val, h) (val |= ((h & 1) << 9)) #define CTRL_SET_GUEST_ONLY(val, h) (val |= ((h & 1) << 8)) +static unsigned long reset_value[NUM_COUNTERS]; + +#ifdef CONFIG_OPROFILE_IBS + /* IbsFetchCtl bits/masks */ #define IBS_FETCH_HIGH_VALID_BIT (1UL << 17) /* bit 49 */ #define IBS_FETCH_HIGH_ENABLE (1UL << 16) /* bit 48 */ @@ -104,7 +108,6 @@ struct ibs_op_sample { */ static void clear_ibs_nmi(void); -static unsigned long reset_value[NUM_COUNTERS]; static int ibs_allowed; /* AMD Family10h and later */ struct op_ibs_config { @@ -118,6 +121,8 @@ struct op_ibs_config { static struct op_ibs_config ibs_config; +#endif + /* functions for op_amd_spec */ static void op_amd_fill_in_addresses(struct op_msrs * const msrs) @@ -188,6 +193,8 @@ static void op_amd_setup_ctrs(struct op_msrs const * const msrs) } } +#ifdef CONFIG_OPROFILE_IBS + static inline int op_amd_handle_ibs(struct pt_regs * const regs, struct op_msrs const * const msrs) @@ -261,6 +268,8 @@ op_amd_handle_ibs(struct pt_regs * const regs, return 1; } +#endif + static int op_amd_check_ctrs(struct pt_regs * const regs, struct op_msrs const * const msrs) { @@ -277,7 +286,9 @@ static int op_amd_check_ctrs(struct pt_regs * const regs, } } +#ifdef CONFIG_OPROFILE_IBS op_amd_handle_ibs(regs, msrs); +#endif /* See op_model_ppro.c */ return 1; @@ -294,6 +305,8 @@ static void op_amd_start(struct op_msrs const * const msrs) CTRL_WRITE(low, high, msrs, i); } } + +#ifdef CONFIG_OPROFILE_IBS if (ibs_allowed && ibs_config.fetch_enabled) { low = (ibs_config.max_cnt_fetch >> 4) & 0xFFFF; high = IBS_FETCH_HIGH_ENABLE; @@ -305,6 +318,7 @@ static void op_amd_start(struct op_msrs const * const msrs) high = 0; wrmsr(MSR_AMD64_IBSOPCTL, low, high); } +#endif } @@ -323,6 +337,7 @@ static void op_amd_stop(struct op_msrs const * const msrs) CTRL_WRITE(low, high, msrs, i); } +#ifdef CONFIG_OPROFILE_IBS if (ibs_allowed && ibs_config.fetch_enabled) { low = 0; /* clear max count and enable */ high = 0; @@ -334,6 +349,7 @@ static void op_amd_stop(struct op_msrs const * const msrs) high = 0; wrmsr(MSR_AMD64_IBSOPCTL, low, high); } +#endif } static void op_amd_shutdown(struct op_msrs const * const msrs) @@ -350,17 +366,10 @@ static void op_amd_shutdown(struct op_msrs const * const msrs) } } -#ifndef CONFIG_SMP +#ifndef CONFIG_OPROFILE_IBS /* no IBS support */ -static void setup_ibs(void) -{ - ibs_allowed = 0; -} - -static void clear_ibs_nmi(void) {} - static int op_amd_init(struct oprofile_operations *ops) { return 0; @@ -441,8 +450,12 @@ static void setup_ibs(void) if (!ibs_allowed) return; - if (pfm_amd64_setup_eilvt()) + if (pfm_amd64_setup_eilvt()) { ibs_allowed = 0; + return; + } + + printk(KERN_INFO "oprofile: AMD IBS detected\n"); } diff --git a/drivers/oprofile/buffer_sync.c b/drivers/oprofile/buffer_sync.c index e1782d2df09..ed982273fb8 100644 --- a/drivers/oprofile/buffer_sync.c +++ b/drivers/oprofile/buffer_sync.c @@ -328,6 +328,8 @@ static void add_trace_begin(void) add_event_entry(TRACE_BEGIN_CODE); } +#ifdef CONFIG_OPROFILE_IBS + #define IBS_FETCH_CODE_SIZE 2 #define IBS_OP_CODE_SIZE 5 #define IBS_EIP(offset) \ @@ -390,6 +392,8 @@ static void add_ibs_begin(struct oprofile_cpu_buffer *cpu_buf, int code, } } +#endif + static void add_sample_entry(unsigned long offset, unsigned long 
event) { add_event_entry(offset); @@ -586,6 +590,7 @@ void sync_buffer(int cpu) } else if (s->event == CPU_TRACE_BEGIN) { state = sb_bt_start; add_trace_begin(); +#ifdef CONFIG_OPROFILE_IBS } else if (s->event == IBS_FETCH_BEGIN) { state = sb_bt_start; add_ibs_begin(cpu_buf, @@ -594,6 +599,7 @@ void sync_buffer(int cpu) state = sb_bt_start; add_ibs_begin(cpu_buf, IBS_OP_CODE, in_kernel, mm); +#endif } else { struct mm_struct *oldmm = mm; diff --git a/drivers/oprofile/cpu_buffer.c b/drivers/oprofile/cpu_buffer.c index c9ac4e15691..aba905b3afb 100644 --- a/drivers/oprofile/cpu_buffer.c +++ b/drivers/oprofile/cpu_buffer.c @@ -253,6 +253,8 @@ void oprofile_add_sample(struct pt_regs * const regs, unsigned long event) oprofile_add_ext_sample(pc, regs, event, is_kernel); } +#ifdef CONFIG_OPROFILE_IBS + #define MAX_IBS_SAMPLE_SIZE 14 static int log_ibs_sample(struct oprofile_cpu_buffer *cpu_buf, unsigned long pc, int is_kernel, unsigned int *ibs, int ibs_code) @@ -318,6 +320,8 @@ void oprofile_add_ibs_sample(struct pt_regs *const regs, oprofile_ops.backtrace(regs, backtrace_depth); } +#endif + void oprofile_add_pc(unsigned long pc, int is_kernel, unsigned long event) { struct oprofile_cpu_buffer *cpu_buf = &__get_cpu_var(cpu_buffer); -- cgit v1.2.3 From bd17b625c09d1ed14c4d98604186b0bbb314f6b2 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Tue, 22 Jul 2008 21:09:07 +0200 Subject: oprofile: fix printk in cpu_buffer.c Signed-off-by: Robert Richter Cc: oprofile-list Cc: Robert Richter Cc: Barry Kasindorf Signed-off-by: Ingo Molnar --- drivers/oprofile/cpu_buffer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/oprofile/cpu_buffer.c b/drivers/oprofile/cpu_buffer.c index aba905b3afb..4decab624e7 100644 --- a/drivers/oprofile/cpu_buffer.c +++ b/drivers/oprofile/cpu_buffer.c @@ -364,7 +364,7 @@ static void wq_sync_buffer(struct work_struct *work) struct oprofile_cpu_buffer * b = container_of(work, struct oprofile_cpu_buffer, work.work); if (b->cpu != smp_processor_id()) { - printk("WQ on CPU%d, prefer CPU%d\n", + printk(KERN_DEBUG "WQ on CPU%d, prefer CPU%d\n", smp_processor_id(), b->cpu); } sync_buffer(b->cpu); -- cgit v1.2.3 From 6852fd9b86d05063c6ef49d2e12e061cc7f6a105 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Tue, 22 Jul 2008 21:09:08 +0200 Subject: x86/oprofile: reanaming op_model_athlon.c to op_model_amd.c Signed-off-by: Robert Richter Cc: oprofile-list Cc: Barry Kasindorf Signed-off-by: Ingo Molnar --- arch/x86/oprofile/Makefile | 2 +- arch/x86/oprofile/op_model_amd.c | 543 ++++++++++++++++++++++++++++++++++++ arch/x86/oprofile/op_model_athlon.c | 543 ------------------------------------ 3 files changed, 544 insertions(+), 544 deletions(-) create mode 100644 arch/x86/oprofile/op_model_amd.c delete mode 100644 arch/x86/oprofile/op_model_athlon.c diff --git a/arch/x86/oprofile/Makefile b/arch/x86/oprofile/Makefile index 30f3eb36666..446902b2a6b 100644 --- a/arch/x86/oprofile/Makefile +++ b/arch/x86/oprofile/Makefile @@ -7,6 +7,6 @@ DRIVER_OBJS = $(addprefix ../../../drivers/oprofile/, \ timer_int.o ) oprofile-y := $(DRIVER_OBJS) init.o backtrace.o -oprofile-$(CONFIG_X86_LOCAL_APIC) += nmi_int.o op_model_athlon.o \ +oprofile-$(CONFIG_X86_LOCAL_APIC) += nmi_int.o op_model_amd.o \ op_model_ppro.o op_model_p4.o oprofile-$(CONFIG_X86_IO_APIC) += nmi_timer_int.o diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c new file mode 100644 index 00000000000..d9faf607b3a --- /dev/null +++ b/arch/x86/oprofile/op_model_amd.c @@ -0,0 +1,543 @@ 
+/* + * @file op_model_amd.c + * athlon / K7 / K8 / Family 10h model-specific MSR operations + * + * @remark Copyright 2002-2008 OProfile authors + * @remark Read the file COPYING + * + * @author John Levon + * @author Philippe Elie + * @author Graydon Hoare + * @author Robert Richter + * @author Barry Kasindorf +*/ + +#include +#include +#include + +#include +#include +#include + +#include "op_x86_model.h" +#include "op_counter.h" + +#define NUM_COUNTERS 4 +#define NUM_CONTROLS 4 + +#define CTR_IS_RESERVED(msrs, c) (msrs->counters[(c)].addr ? 1 : 0) +#define CTR_READ(l, h, msrs, c) do {rdmsr(msrs->counters[(c)].addr, (l), (h)); } while (0) +#define CTR_WRITE(l, msrs, c) do {wrmsr(msrs->counters[(c)].addr, -(unsigned int)(l), -1); } while (0) +#define CTR_OVERFLOWED(n) (!((n) & (1U<<31))) + +#define CTRL_IS_RESERVED(msrs, c) (msrs->controls[(c)].addr ? 1 : 0) +#define CTRL_READ(l, h, msrs, c) do {rdmsr(msrs->controls[(c)].addr, (l), (h)); } while (0) +#define CTRL_WRITE(l, h, msrs, c) do {wrmsr(msrs->controls[(c)].addr, (l), (h)); } while (0) +#define CTRL_SET_ACTIVE(n) (n |= (1<<22)) +#define CTRL_SET_INACTIVE(n) (n &= ~(1<<22)) +#define CTRL_CLEAR_LO(x) (x &= (1<<21)) +#define CTRL_CLEAR_HI(x) (x &= 0xfffffcf0) +#define CTRL_SET_ENABLE(val) (val |= 1<<20) +#define CTRL_SET_USR(val, u) (val |= ((u & 1) << 16)) +#define CTRL_SET_KERN(val, k) (val |= ((k & 1) << 17)) +#define CTRL_SET_UM(val, m) (val |= (m << 8)) +#define CTRL_SET_EVENT_LOW(val, e) (val |= (e & 0xff)) +#define CTRL_SET_EVENT_HIGH(val, e) (val |= ((e >> 8) & 0xf)) +#define CTRL_SET_HOST_ONLY(val, h) (val |= ((h & 1) << 9)) +#define CTRL_SET_GUEST_ONLY(val, h) (val |= ((h & 1) << 8)) + +static unsigned long reset_value[NUM_COUNTERS]; + +#ifdef CONFIG_OPROFILE_IBS + +/* IbsFetchCtl bits/masks */ +#define IBS_FETCH_HIGH_VALID_BIT (1UL << 17) /* bit 49 */ +#define IBS_FETCH_HIGH_ENABLE (1UL << 16) /* bit 48 */ +#define IBS_FETCH_LOW_MAX_CNT_MASK 0x0000FFFFUL /* MaxCnt mask */ + +/*IbsOpCtl bits */ +#define IBS_OP_LOW_VALID_BIT (1ULL<<18) /* bit 18 */ +#define IBS_OP_LOW_ENABLE (1ULL<<17) /* bit 17 */ + +/* Codes used in cpu_buffer.c */ +/* This produces duplicate code, need to be fixed */ +#define IBS_FETCH_BEGIN 3 +#define IBS_OP_BEGIN 4 + +/* The function interface needs to be fixed, something like add + data. Should then be added to linux/oprofile.h. 
*/ +extern void oprofile_add_ibs_sample(struct pt_regs *const regs, + unsigned int * const ibs_sample, u8 code); + +struct ibs_fetch_sample { + /* MSRC001_1031 IBS Fetch Linear Address Register */ + unsigned int ibs_fetch_lin_addr_low; + unsigned int ibs_fetch_lin_addr_high; + /* MSRC001_1030 IBS Fetch Control Register */ + unsigned int ibs_fetch_ctl_low; + unsigned int ibs_fetch_ctl_high; + /* MSRC001_1032 IBS Fetch Physical Address Register */ + unsigned int ibs_fetch_phys_addr_low; + unsigned int ibs_fetch_phys_addr_high; +}; + +struct ibs_op_sample { + /* MSRC001_1034 IBS Op Logical Address Register (IbsRIP) */ + unsigned int ibs_op_rip_low; + unsigned int ibs_op_rip_high; + /* MSRC001_1035 IBS Op Data Register */ + unsigned int ibs_op_data1_low; + unsigned int ibs_op_data1_high; + /* MSRC001_1036 IBS Op Data 2 Register */ + unsigned int ibs_op_data2_low; + unsigned int ibs_op_data2_high; + /* MSRC001_1037 IBS Op Data 3 Register */ + unsigned int ibs_op_data3_low; + unsigned int ibs_op_data3_high; + /* MSRC001_1038 IBS DC Linear Address Register (IbsDcLinAd) */ + unsigned int ibs_dc_linear_low; + unsigned int ibs_dc_linear_high; + /* MSRC001_1039 IBS DC Physical Address Register (IbsDcPhysAd) */ + unsigned int ibs_dc_phys_low; + unsigned int ibs_dc_phys_high; +}; + +/* + * unitialize the APIC for the IBS interrupts if needed on AMD Family10h+ +*/ +static void clear_ibs_nmi(void); + +static int ibs_allowed; /* AMD Family10h and later */ + +struct op_ibs_config { + unsigned long op_enabled; + unsigned long fetch_enabled; + unsigned long max_cnt_fetch; + unsigned long max_cnt_op; + unsigned long rand_en; + unsigned long dispatched_ops; +}; + +static struct op_ibs_config ibs_config; + +#endif + +/* functions for op_amd_spec */ + +static void op_amd_fill_in_addresses(struct op_msrs * const msrs) +{ + int i; + + for (i = 0; i < NUM_COUNTERS; i++) { + if (reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i)) + msrs->counters[i].addr = MSR_K7_PERFCTR0 + i; + else + msrs->counters[i].addr = 0; + } + + for (i = 0; i < NUM_CONTROLS; i++) { + if (reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i)) + msrs->controls[i].addr = MSR_K7_EVNTSEL0 + i; + else + msrs->controls[i].addr = 0; + } +} + + +static void op_amd_setup_ctrs(struct op_msrs const * const msrs) +{ + unsigned int low, high; + int i; + + /* clear all counters */ + for (i = 0 ; i < NUM_CONTROLS; ++i) { + if (unlikely(!CTRL_IS_RESERVED(msrs, i))) + continue; + CTRL_READ(low, high, msrs, i); + CTRL_CLEAR_LO(low); + CTRL_CLEAR_HI(high); + CTRL_WRITE(low, high, msrs, i); + } + + /* avoid a false detection of ctr overflows in NMI handler */ + for (i = 0; i < NUM_COUNTERS; ++i) { + if (unlikely(!CTR_IS_RESERVED(msrs, i))) + continue; + CTR_WRITE(1, msrs, i); + } + + /* enable active counters */ + for (i = 0; i < NUM_COUNTERS; ++i) { + if ((counter_config[i].enabled) && (CTR_IS_RESERVED(msrs, i))) { + reset_value[i] = counter_config[i].count; + + CTR_WRITE(counter_config[i].count, msrs, i); + + CTRL_READ(low, high, msrs, i); + CTRL_CLEAR_LO(low); + CTRL_CLEAR_HI(high); + CTRL_SET_ENABLE(low); + CTRL_SET_USR(low, counter_config[i].user); + CTRL_SET_KERN(low, counter_config[i].kernel); + CTRL_SET_UM(low, counter_config[i].unit_mask); + CTRL_SET_EVENT_LOW(low, counter_config[i].event); + CTRL_SET_EVENT_HIGH(high, counter_config[i].event); + CTRL_SET_HOST_ONLY(high, 0); + CTRL_SET_GUEST_ONLY(high, 0); + + CTRL_WRITE(low, high, msrs, i); + } else { + reset_value[i] = 0; + } + } +} + +#ifdef CONFIG_OPROFILE_IBS + +static inline int +op_amd_handle_ibs(struct pt_regs * const 
regs, + struct op_msrs const * const msrs) +{ + unsigned int low, high; + struct ibs_fetch_sample ibs_fetch; + struct ibs_op_sample ibs_op; + + if (!ibs_allowed) + return 1; + + if (ibs_config.fetch_enabled) { + rdmsr(MSR_AMD64_IBSFETCHCTL, low, high); + if (high & IBS_FETCH_HIGH_VALID_BIT) { + ibs_fetch.ibs_fetch_ctl_high = high; + ibs_fetch.ibs_fetch_ctl_low = low; + rdmsr(MSR_AMD64_IBSFETCHLINAD, low, high); + ibs_fetch.ibs_fetch_lin_addr_high = high; + ibs_fetch.ibs_fetch_lin_addr_low = low; + rdmsr(MSR_AMD64_IBSFETCHPHYSAD, low, high); + ibs_fetch.ibs_fetch_phys_addr_high = high; + ibs_fetch.ibs_fetch_phys_addr_low = low; + + oprofile_add_ibs_sample(regs, + (unsigned int *)&ibs_fetch, + IBS_FETCH_BEGIN); + + /*reenable the IRQ */ + rdmsr(MSR_AMD64_IBSFETCHCTL, low, high); + high &= ~IBS_FETCH_HIGH_VALID_BIT; + high |= IBS_FETCH_HIGH_ENABLE; + low &= IBS_FETCH_LOW_MAX_CNT_MASK; + wrmsr(MSR_AMD64_IBSFETCHCTL, low, high); + } + } + + if (ibs_config.op_enabled) { + rdmsr(MSR_AMD64_IBSOPCTL, low, high); + if (low & IBS_OP_LOW_VALID_BIT) { + rdmsr(MSR_AMD64_IBSOPRIP, low, high); + ibs_op.ibs_op_rip_low = low; + ibs_op.ibs_op_rip_high = high; + rdmsr(MSR_AMD64_IBSOPDATA, low, high); + ibs_op.ibs_op_data1_low = low; + ibs_op.ibs_op_data1_high = high; + rdmsr(MSR_AMD64_IBSOPDATA2, low, high); + ibs_op.ibs_op_data2_low = low; + ibs_op.ibs_op_data2_high = high; + rdmsr(MSR_AMD64_IBSOPDATA3, low, high); + ibs_op.ibs_op_data3_low = low; + ibs_op.ibs_op_data3_high = high; + rdmsr(MSR_AMD64_IBSDCLINAD, low, high); + ibs_op.ibs_dc_linear_low = low; + ibs_op.ibs_dc_linear_high = high; + rdmsr(MSR_AMD64_IBSDCPHYSAD, low, high); + ibs_op.ibs_dc_phys_low = low; + ibs_op.ibs_dc_phys_high = high; + + /* reenable the IRQ */ + oprofile_add_ibs_sample(regs, + (unsigned int *)&ibs_op, + IBS_OP_BEGIN); + rdmsr(MSR_AMD64_IBSOPCTL, low, high); + high = 0; + low &= ~IBS_OP_LOW_VALID_BIT; + low |= IBS_OP_LOW_ENABLE; + wrmsr(MSR_AMD64_IBSOPCTL, low, high); + } + } + + return 1; +} + +#endif + +static int op_amd_check_ctrs(struct pt_regs * const regs, + struct op_msrs const * const msrs) +{ + unsigned int low, high; + int i; + + for (i = 0 ; i < NUM_COUNTERS; ++i) { + if (!reset_value[i]) + continue; + CTR_READ(low, high, msrs, i); + if (CTR_OVERFLOWED(low)) { + oprofile_add_sample(regs, i); + CTR_WRITE(reset_value[i], msrs, i); + } + } + +#ifdef CONFIG_OPROFILE_IBS + op_amd_handle_ibs(regs, msrs); +#endif + + /* See op_model_ppro.c */ + return 1; +} + +static void op_amd_start(struct op_msrs const * const msrs) +{ + unsigned int low, high; + int i; + for (i = 0 ; i < NUM_COUNTERS ; ++i) { + if (reset_value[i]) { + CTRL_READ(low, high, msrs, i); + CTRL_SET_ACTIVE(low); + CTRL_WRITE(low, high, msrs, i); + } + } + +#ifdef CONFIG_OPROFILE_IBS + if (ibs_allowed && ibs_config.fetch_enabled) { + low = (ibs_config.max_cnt_fetch >> 4) & 0xFFFF; + high = IBS_FETCH_HIGH_ENABLE; + wrmsr(MSR_AMD64_IBSFETCHCTL, low, high); + } + + if (ibs_allowed && ibs_config.op_enabled) { + low = ((ibs_config.max_cnt_op >> 4) & 0xFFFF) + IBS_OP_LOW_ENABLE; + high = 0; + wrmsr(MSR_AMD64_IBSOPCTL, low, high); + } +#endif +} + + +static void op_amd_stop(struct op_msrs const * const msrs) +{ + unsigned int low, high; + int i; + + /* Subtle: stop on all counters to avoid race with + * setting our pm callback */ + for (i = 0 ; i < NUM_COUNTERS ; ++i) { + if (!reset_value[i]) + continue; + CTRL_READ(low, high, msrs, i); + CTRL_SET_INACTIVE(low); + CTRL_WRITE(low, high, msrs, i); + } + +#ifdef CONFIG_OPROFILE_IBS + if (ibs_allowed && 
ibs_config.fetch_enabled) { + low = 0; /* clear max count and enable */ + high = 0; + wrmsr(MSR_AMD64_IBSFETCHCTL, low, high); + } + + if (ibs_allowed && ibs_config.op_enabled) { + low = 0; /* clear max count and enable */ + high = 0; + wrmsr(MSR_AMD64_IBSOPCTL, low, high); + } +#endif +} + +static void op_amd_shutdown(struct op_msrs const * const msrs) +{ + int i; + + for (i = 0 ; i < NUM_COUNTERS ; ++i) { + if (CTR_IS_RESERVED(msrs, i)) + release_perfctr_nmi(MSR_K7_PERFCTR0 + i); + } + for (i = 0 ; i < NUM_CONTROLS ; ++i) { + if (CTRL_IS_RESERVED(msrs, i)) + release_evntsel_nmi(MSR_K7_EVNTSEL0 + i); + } +} + +#ifndef CONFIG_OPROFILE_IBS + +/* no IBS support */ + +static int op_amd_init(struct oprofile_operations *ops) +{ + return 0; +} + +static void op_amd_exit(void) {} + +#else + +static u8 ibs_eilvt_off; + +static inline void apic_init_ibs_nmi_per_cpu(void *arg) +{ + ibs_eilvt_off = setup_APIC_eilvt_ibs(0, APIC_EILVT_MSG_NMI, 0); +} + +static inline void apic_clear_ibs_nmi_per_cpu(void *arg) +{ + setup_APIC_eilvt_ibs(0, APIC_EILVT_MSG_FIX, 1); +} + +static int pfm_amd64_setup_eilvt(void) +{ +#define IBSCTL_LVTOFFSETVAL (1 << 8) +#define IBSCTL 0x1cc + struct pci_dev *cpu_cfg; + int nodes; + u32 value = 0; + + /* per CPU setup */ + on_each_cpu(apic_init_ibs_nmi_per_cpu, NULL, 1); + + nodes = 0; + cpu_cfg = NULL; + do { + cpu_cfg = pci_get_device(PCI_VENDOR_ID_AMD, + PCI_DEVICE_ID_AMD_10H_NB_MISC, + cpu_cfg); + if (!cpu_cfg) + break; + ++nodes; + pci_write_config_dword(cpu_cfg, IBSCTL, ibs_eilvt_off + | IBSCTL_LVTOFFSETVAL); + pci_read_config_dword(cpu_cfg, IBSCTL, &value); + if (value != (ibs_eilvt_off | IBSCTL_LVTOFFSETVAL)) { + printk(KERN_DEBUG "Failed to setup IBS LVT offset, " + "IBSCTL = 0x%08x", value); + return 1; + } + } while (1); + + if (!nodes) { + printk(KERN_DEBUG "No CPU node configured for IBS"); + return 1; + } + +#ifdef CONFIG_NUMA + /* Sanity check */ + /* Works only for 64bit with proper numa implementation. 
*/ + if (nodes != num_possible_nodes()) { + printk(KERN_DEBUG "Failed to setup CPU node(s) for IBS, " + "found: %d, expected %d", + nodes, num_possible_nodes()); + return 1; + } +#endif + return 0; +} + +/* + * initialize the APIC for the IBS interrupts + * if available (AMD Family10h rev B0 and later) + */ +static void setup_ibs(void) +{ + ibs_allowed = boot_cpu_has(X86_FEATURE_IBS); + + if (!ibs_allowed) + return; + + if (pfm_amd64_setup_eilvt()) { + ibs_allowed = 0; + return; + } + + printk(KERN_INFO "oprofile: AMD IBS detected\n"); +} + + +/* + * unitialize the APIC for the IBS interrupts if needed on AMD Family10h + * rev B0 and later */ +static void clear_ibs_nmi(void) +{ + if (ibs_allowed) + on_each_cpu(apic_clear_ibs_nmi_per_cpu, NULL, 1); +} + +static int (*create_arch_files)(struct super_block * sb, struct dentry * root); + +static int setup_ibs_files(struct super_block * sb, struct dentry * root) +{ + char buf[12]; + struct dentry *dir; + int ret = 0; + + /* architecture specific files */ + if (create_arch_files) + ret = create_arch_files(sb, root); + + if (ret) + return ret; + + if (!ibs_allowed) + return ret; + + /* model specific files */ + + /* setup some reasonable defaults */ + ibs_config.max_cnt_fetch = 250000; + ibs_config.fetch_enabled = 0; + ibs_config.max_cnt_op = 250000; + ibs_config.op_enabled = 0; + ibs_config.dispatched_ops = 1; + snprintf(buf, sizeof(buf), "ibs_fetch"); + dir = oprofilefs_mkdir(sb, root, buf); + oprofilefs_create_ulong(sb, dir, "rand_enable", + &ibs_config.rand_en); + oprofilefs_create_ulong(sb, dir, "enable", + &ibs_config.fetch_enabled); + oprofilefs_create_ulong(sb, dir, "max_count", + &ibs_config.max_cnt_fetch); + snprintf(buf, sizeof(buf), "ibs_uops"); + dir = oprofilefs_mkdir(sb, root, buf); + oprofilefs_create_ulong(sb, dir, "enable", + &ibs_config.op_enabled); + oprofilefs_create_ulong(sb, dir, "max_count", + &ibs_config.max_cnt_op); + oprofilefs_create_ulong(sb, dir, "dispatched_ops", + &ibs_config.dispatched_ops); + + return 0; +} + +static int op_amd_init(struct oprofile_operations *ops) +{ + setup_ibs(); + create_arch_files = ops->create_files; + ops->create_files = setup_ibs_files; + return 0; +} + +static void op_amd_exit(void) +{ + clear_ibs_nmi(); +} + +#endif + +struct op_x86_model_spec const op_amd_spec = { + .init = op_amd_init, + .exit = op_amd_exit, + .num_counters = NUM_COUNTERS, + .num_controls = NUM_CONTROLS, + .fill_in_addresses = &op_amd_fill_in_addresses, + .setup_ctrs = &op_amd_setup_ctrs, + .check_ctrs = &op_amd_check_ctrs, + .start = &op_amd_start, + .stop = &op_amd_stop, + .shutdown = &op_amd_shutdown +}; diff --git a/arch/x86/oprofile/op_model_athlon.c b/arch/x86/oprofile/op_model_athlon.c deleted file mode 100644 index fb6015c1132..00000000000 --- a/arch/x86/oprofile/op_model_athlon.c +++ /dev/null @@ -1,543 +0,0 @@ -/* - * @file op_model_athlon.c - * athlon / K7 / K8 / Family 10h model-specific MSR operations - * - * @remark Copyright 2002-2008 OProfile authors - * @remark Read the file COPYING - * - * @author John Levon - * @author Philippe Elie - * @author Graydon Hoare - * @author Robert Richter - * @author Barry Kasindorf -*/ - -#include -#include -#include - -#include -#include -#include - -#include "op_x86_model.h" -#include "op_counter.h" - -#define NUM_COUNTERS 4 -#define NUM_CONTROLS 4 - -#define CTR_IS_RESERVED(msrs, c) (msrs->counters[(c)].addr ? 
1 : 0) -#define CTR_READ(l, h, msrs, c) do {rdmsr(msrs->counters[(c)].addr, (l), (h)); } while (0) -#define CTR_WRITE(l, msrs, c) do {wrmsr(msrs->counters[(c)].addr, -(unsigned int)(l), -1); } while (0) -#define CTR_OVERFLOWED(n) (!((n) & (1U<<31))) - -#define CTRL_IS_RESERVED(msrs, c) (msrs->controls[(c)].addr ? 1 : 0) -#define CTRL_READ(l, h, msrs, c) do {rdmsr(msrs->controls[(c)].addr, (l), (h)); } while (0) -#define CTRL_WRITE(l, h, msrs, c) do {wrmsr(msrs->controls[(c)].addr, (l), (h)); } while (0) -#define CTRL_SET_ACTIVE(n) (n |= (1<<22)) -#define CTRL_SET_INACTIVE(n) (n &= ~(1<<22)) -#define CTRL_CLEAR_LO(x) (x &= (1<<21)) -#define CTRL_CLEAR_HI(x) (x &= 0xfffffcf0) -#define CTRL_SET_ENABLE(val) (val |= 1<<20) -#define CTRL_SET_USR(val, u) (val |= ((u & 1) << 16)) -#define CTRL_SET_KERN(val, k) (val |= ((k & 1) << 17)) -#define CTRL_SET_UM(val, m) (val |= (m << 8)) -#define CTRL_SET_EVENT_LOW(val, e) (val |= (e & 0xff)) -#define CTRL_SET_EVENT_HIGH(val, e) (val |= ((e >> 8) & 0xf)) -#define CTRL_SET_HOST_ONLY(val, h) (val |= ((h & 1) << 9)) -#define CTRL_SET_GUEST_ONLY(val, h) (val |= ((h & 1) << 8)) - -static unsigned long reset_value[NUM_COUNTERS]; - -#ifdef CONFIG_OPROFILE_IBS - -/* IbsFetchCtl bits/masks */ -#define IBS_FETCH_HIGH_VALID_BIT (1UL << 17) /* bit 49 */ -#define IBS_FETCH_HIGH_ENABLE (1UL << 16) /* bit 48 */ -#define IBS_FETCH_LOW_MAX_CNT_MASK 0x0000FFFFUL /* MaxCnt mask */ - -/*IbsOpCtl bits */ -#define IBS_OP_LOW_VALID_BIT (1ULL<<18) /* bit 18 */ -#define IBS_OP_LOW_ENABLE (1ULL<<17) /* bit 17 */ - -/* Codes used in cpu_buffer.c */ -/* This produces duplicate code, need to be fixed */ -#define IBS_FETCH_BEGIN 3 -#define IBS_OP_BEGIN 4 - -/* The function interface needs to be fixed, something like add - data. Should then be added to linux/oprofile.h. 
*/ -extern void oprofile_add_ibs_sample(struct pt_regs *const regs, - unsigned int * const ibs_sample, u8 code); - -struct ibs_fetch_sample { - /* MSRC001_1031 IBS Fetch Linear Address Register */ - unsigned int ibs_fetch_lin_addr_low; - unsigned int ibs_fetch_lin_addr_high; - /* MSRC001_1030 IBS Fetch Control Register */ - unsigned int ibs_fetch_ctl_low; - unsigned int ibs_fetch_ctl_high; - /* MSRC001_1032 IBS Fetch Physical Address Register */ - unsigned int ibs_fetch_phys_addr_low; - unsigned int ibs_fetch_phys_addr_high; -}; - -struct ibs_op_sample { - /* MSRC001_1034 IBS Op Logical Address Register (IbsRIP) */ - unsigned int ibs_op_rip_low; - unsigned int ibs_op_rip_high; - /* MSRC001_1035 IBS Op Data Register */ - unsigned int ibs_op_data1_low; - unsigned int ibs_op_data1_high; - /* MSRC001_1036 IBS Op Data 2 Register */ - unsigned int ibs_op_data2_low; - unsigned int ibs_op_data2_high; - /* MSRC001_1037 IBS Op Data 3 Register */ - unsigned int ibs_op_data3_low; - unsigned int ibs_op_data3_high; - /* MSRC001_1038 IBS DC Linear Address Register (IbsDcLinAd) */ - unsigned int ibs_dc_linear_low; - unsigned int ibs_dc_linear_high; - /* MSRC001_1039 IBS DC Physical Address Register (IbsDcPhysAd) */ - unsigned int ibs_dc_phys_low; - unsigned int ibs_dc_phys_high; -}; - -/* - * unitialize the APIC for the IBS interrupts if needed on AMD Family10h+ -*/ -static void clear_ibs_nmi(void); - -static int ibs_allowed; /* AMD Family10h and later */ - -struct op_ibs_config { - unsigned long op_enabled; - unsigned long fetch_enabled; - unsigned long max_cnt_fetch; - unsigned long max_cnt_op; - unsigned long rand_en; - unsigned long dispatched_ops; -}; - -static struct op_ibs_config ibs_config; - -#endif - -/* functions for op_amd_spec */ - -static void op_amd_fill_in_addresses(struct op_msrs * const msrs) -{ - int i; - - for (i = 0; i < NUM_COUNTERS; i++) { - if (reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i)) - msrs->counters[i].addr = MSR_K7_PERFCTR0 + i; - else - msrs->counters[i].addr = 0; - } - - for (i = 0; i < NUM_CONTROLS; i++) { - if (reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i)) - msrs->controls[i].addr = MSR_K7_EVNTSEL0 + i; - else - msrs->controls[i].addr = 0; - } -} - - -static void op_amd_setup_ctrs(struct op_msrs const * const msrs) -{ - unsigned int low, high; - int i; - - /* clear all counters */ - for (i = 0 ; i < NUM_CONTROLS; ++i) { - if (unlikely(!CTRL_IS_RESERVED(msrs, i))) - continue; - CTRL_READ(low, high, msrs, i); - CTRL_CLEAR_LO(low); - CTRL_CLEAR_HI(high); - CTRL_WRITE(low, high, msrs, i); - } - - /* avoid a false detection of ctr overflows in NMI handler */ - for (i = 0; i < NUM_COUNTERS; ++i) { - if (unlikely(!CTR_IS_RESERVED(msrs, i))) - continue; - CTR_WRITE(1, msrs, i); - } - - /* enable active counters */ - for (i = 0; i < NUM_COUNTERS; ++i) { - if ((counter_config[i].enabled) && (CTR_IS_RESERVED(msrs, i))) { - reset_value[i] = counter_config[i].count; - - CTR_WRITE(counter_config[i].count, msrs, i); - - CTRL_READ(low, high, msrs, i); - CTRL_CLEAR_LO(low); - CTRL_CLEAR_HI(high); - CTRL_SET_ENABLE(low); - CTRL_SET_USR(low, counter_config[i].user); - CTRL_SET_KERN(low, counter_config[i].kernel); - CTRL_SET_UM(low, counter_config[i].unit_mask); - CTRL_SET_EVENT_LOW(low, counter_config[i].event); - CTRL_SET_EVENT_HIGH(high, counter_config[i].event); - CTRL_SET_HOST_ONLY(high, 0); - CTRL_SET_GUEST_ONLY(high, 0); - - CTRL_WRITE(low, high, msrs, i); - } else { - reset_value[i] = 0; - } - } -} - -#ifdef CONFIG_OPROFILE_IBS - -static inline int -op_amd_handle_ibs(struct pt_regs * const 
regs, - struct op_msrs const * const msrs) -{ - unsigned int low, high; - struct ibs_fetch_sample ibs_fetch; - struct ibs_op_sample ibs_op; - - if (!ibs_allowed) - return 1; - - if (ibs_config.fetch_enabled) { - rdmsr(MSR_AMD64_IBSFETCHCTL, low, high); - if (high & IBS_FETCH_HIGH_VALID_BIT) { - ibs_fetch.ibs_fetch_ctl_high = high; - ibs_fetch.ibs_fetch_ctl_low = low; - rdmsr(MSR_AMD64_IBSFETCHLINAD, low, high); - ibs_fetch.ibs_fetch_lin_addr_high = high; - ibs_fetch.ibs_fetch_lin_addr_low = low; - rdmsr(MSR_AMD64_IBSFETCHPHYSAD, low, high); - ibs_fetch.ibs_fetch_phys_addr_high = high; - ibs_fetch.ibs_fetch_phys_addr_low = low; - - oprofile_add_ibs_sample(regs, - (unsigned int *)&ibs_fetch, - IBS_FETCH_BEGIN); - - /*reenable the IRQ */ - rdmsr(MSR_AMD64_IBSFETCHCTL, low, high); - high &= ~IBS_FETCH_HIGH_VALID_BIT; - high |= IBS_FETCH_HIGH_ENABLE; - low &= IBS_FETCH_LOW_MAX_CNT_MASK; - wrmsr(MSR_AMD64_IBSFETCHCTL, low, high); - } - } - - if (ibs_config.op_enabled) { - rdmsr(MSR_AMD64_IBSOPCTL, low, high); - if (low & IBS_OP_LOW_VALID_BIT) { - rdmsr(MSR_AMD64_IBSOPRIP, low, high); - ibs_op.ibs_op_rip_low = low; - ibs_op.ibs_op_rip_high = high; - rdmsr(MSR_AMD64_IBSOPDATA, low, high); - ibs_op.ibs_op_data1_low = low; - ibs_op.ibs_op_data1_high = high; - rdmsr(MSR_AMD64_IBSOPDATA2, low, high); - ibs_op.ibs_op_data2_low = low; - ibs_op.ibs_op_data2_high = high; - rdmsr(MSR_AMD64_IBSOPDATA3, low, high); - ibs_op.ibs_op_data3_low = low; - ibs_op.ibs_op_data3_high = high; - rdmsr(MSR_AMD64_IBSDCLINAD, low, high); - ibs_op.ibs_dc_linear_low = low; - ibs_op.ibs_dc_linear_high = high; - rdmsr(MSR_AMD64_IBSDCPHYSAD, low, high); - ibs_op.ibs_dc_phys_low = low; - ibs_op.ibs_dc_phys_high = high; - - /* reenable the IRQ */ - oprofile_add_ibs_sample(regs, - (unsigned int *)&ibs_op, - IBS_OP_BEGIN); - rdmsr(MSR_AMD64_IBSOPCTL, low, high); - high = 0; - low &= ~IBS_OP_LOW_VALID_BIT; - low |= IBS_OP_LOW_ENABLE; - wrmsr(MSR_AMD64_IBSOPCTL, low, high); - } - } - - return 1; -} - -#endif - -static int op_amd_check_ctrs(struct pt_regs * const regs, - struct op_msrs const * const msrs) -{ - unsigned int low, high; - int i; - - for (i = 0 ; i < NUM_COUNTERS; ++i) { - if (!reset_value[i]) - continue; - CTR_READ(low, high, msrs, i); - if (CTR_OVERFLOWED(low)) { - oprofile_add_sample(regs, i); - CTR_WRITE(reset_value[i], msrs, i); - } - } - -#ifdef CONFIG_OPROFILE_IBS - op_amd_handle_ibs(regs, msrs); -#endif - - /* See op_model_ppro.c */ - return 1; -} - -static void op_amd_start(struct op_msrs const * const msrs) -{ - unsigned int low, high; - int i; - for (i = 0 ; i < NUM_COUNTERS ; ++i) { - if (reset_value[i]) { - CTRL_READ(low, high, msrs, i); - CTRL_SET_ACTIVE(low); - CTRL_WRITE(low, high, msrs, i); - } - } - -#ifdef CONFIG_OPROFILE_IBS - if (ibs_allowed && ibs_config.fetch_enabled) { - low = (ibs_config.max_cnt_fetch >> 4) & 0xFFFF; - high = IBS_FETCH_HIGH_ENABLE; - wrmsr(MSR_AMD64_IBSFETCHCTL, low, high); - } - - if (ibs_allowed && ibs_config.op_enabled) { - low = ((ibs_config.max_cnt_op >> 4) & 0xFFFF) + IBS_OP_LOW_ENABLE; - high = 0; - wrmsr(MSR_AMD64_IBSOPCTL, low, high); - } -#endif -} - - -static void op_amd_stop(struct op_msrs const * const msrs) -{ - unsigned int low, high; - int i; - - /* Subtle: stop on all counters to avoid race with - * setting our pm callback */ - for (i = 0 ; i < NUM_COUNTERS ; ++i) { - if (!reset_value[i]) - continue; - CTRL_READ(low, high, msrs, i); - CTRL_SET_INACTIVE(low); - CTRL_WRITE(low, high, msrs, i); - } - -#ifdef CONFIG_OPROFILE_IBS - if (ibs_allowed && 
ibs_config.fetch_enabled) { - low = 0; /* clear max count and enable */ - high = 0; - wrmsr(MSR_AMD64_IBSFETCHCTL, low, high); - } - - if (ibs_allowed && ibs_config.op_enabled) { - low = 0; /* clear max count and enable */ - high = 0; - wrmsr(MSR_AMD64_IBSOPCTL, low, high); - } -#endif -} - -static void op_amd_shutdown(struct op_msrs const * const msrs) -{ - int i; - - for (i = 0 ; i < NUM_COUNTERS ; ++i) { - if (CTR_IS_RESERVED(msrs, i)) - release_perfctr_nmi(MSR_K7_PERFCTR0 + i); - } - for (i = 0 ; i < NUM_CONTROLS ; ++i) { - if (CTRL_IS_RESERVED(msrs, i)) - release_evntsel_nmi(MSR_K7_EVNTSEL0 + i); - } -} - -#ifndef CONFIG_OPROFILE_IBS - -/* no IBS support */ - -static int op_amd_init(struct oprofile_operations *ops) -{ - return 0; -} - -static void op_amd_exit(void) {} - -#else - -static u8 ibs_eilvt_off; - -static inline void apic_init_ibs_nmi_per_cpu(void *arg) -{ - ibs_eilvt_off = setup_APIC_eilvt_ibs(0, APIC_EILVT_MSG_NMI, 0); -} - -static inline void apic_clear_ibs_nmi_per_cpu(void *arg) -{ - setup_APIC_eilvt_ibs(0, APIC_EILVT_MSG_FIX, 1); -} - -static int pfm_amd64_setup_eilvt(void) -{ -#define IBSCTL_LVTOFFSETVAL (1 << 8) -#define IBSCTL 0x1cc - struct pci_dev *cpu_cfg; - int nodes; - u32 value = 0; - - /* per CPU setup */ - on_each_cpu(apic_init_ibs_nmi_per_cpu, NULL, 1); - - nodes = 0; - cpu_cfg = NULL; - do { - cpu_cfg = pci_get_device(PCI_VENDOR_ID_AMD, - PCI_DEVICE_ID_AMD_10H_NB_MISC, - cpu_cfg); - if (!cpu_cfg) - break; - ++nodes; - pci_write_config_dword(cpu_cfg, IBSCTL, ibs_eilvt_off - | IBSCTL_LVTOFFSETVAL); - pci_read_config_dword(cpu_cfg, IBSCTL, &value); - if (value != (ibs_eilvt_off | IBSCTL_LVTOFFSETVAL)) { - printk(KERN_DEBUG "Failed to setup IBS LVT offset, " - "IBSCTL = 0x%08x", value); - return 1; - } - } while (1); - - if (!nodes) { - printk(KERN_DEBUG "No CPU node configured for IBS"); - return 1; - } - -#ifdef CONFIG_NUMA - /* Sanity check */ - /* Works only for 64bit with proper numa implementation. 
*/ - if (nodes != num_possible_nodes()) { - printk(KERN_DEBUG "Failed to setup CPU node(s) for IBS, " - "found: %d, expected %d", - nodes, num_possible_nodes()); - return 1; - } -#endif - return 0; -} - -/* - * initialize the APIC for the IBS interrupts - * if available (AMD Family10h rev B0 and later) - */ -static void setup_ibs(void) -{ - ibs_allowed = boot_cpu_has(X86_FEATURE_IBS); - - if (!ibs_allowed) - return; - - if (pfm_amd64_setup_eilvt()) { - ibs_allowed = 0; - return; - } - - printk(KERN_INFO "oprofile: AMD IBS detected\n"); -} - - -/* - * unitialize the APIC for the IBS interrupts if needed on AMD Family10h - * rev B0 and later */ -static void clear_ibs_nmi(void) -{ - if (ibs_allowed) - on_each_cpu(apic_clear_ibs_nmi_per_cpu, NULL, 1); -} - -static int (*create_arch_files)(struct super_block * sb, struct dentry * root); - -static int setup_ibs_files(struct super_block * sb, struct dentry * root) -{ - char buf[12]; - struct dentry *dir; - int ret = 0; - - /* architecture specific files */ - if (create_arch_files) - ret = create_arch_files(sb, root); - - if (ret) - return ret; - - if (!ibs_allowed) - return ret; - - /* model specific files */ - - /* setup some reasonable defaults */ - ibs_config.max_cnt_fetch = 250000; - ibs_config.fetch_enabled = 0; - ibs_config.max_cnt_op = 250000; - ibs_config.op_enabled = 0; - ibs_config.dispatched_ops = 1; - snprintf(buf, sizeof(buf), "ibs_fetch"); - dir = oprofilefs_mkdir(sb, root, buf); - oprofilefs_create_ulong(sb, dir, "rand_enable", - &ibs_config.rand_en); - oprofilefs_create_ulong(sb, dir, "enable", - &ibs_config.fetch_enabled); - oprofilefs_create_ulong(sb, dir, "max_count", - &ibs_config.max_cnt_fetch); - snprintf(buf, sizeof(buf), "ibs_uops"); - dir = oprofilefs_mkdir(sb, root, buf); - oprofilefs_create_ulong(sb, dir, "enable", - &ibs_config.op_enabled); - oprofilefs_create_ulong(sb, dir, "max_count", - &ibs_config.max_cnt_op); - oprofilefs_create_ulong(sb, dir, "dispatched_ops", - &ibs_config.dispatched_ops); - - return 0; -} - -static int op_amd_init(struct oprofile_operations *ops) -{ - setup_ibs(); - create_arch_files = ops->create_files; - ops->create_files = setup_ibs_files; - return 0; -} - -static void op_amd_exit(void) -{ - clear_ibs_nmi(); -} - -#endif - -struct op_x86_model_spec const op_amd_spec = { - .init = op_amd_init, - .exit = op_amd_exit, - .num_counters = NUM_COUNTERS, - .num_controls = NUM_CONTROLS, - .fill_in_addresses = &op_amd_fill_in_addresses, - .setup_ctrs = &op_amd_setup_ctrs, - .check_ctrs = &op_amd_check_ctrs, - .start = &op_amd_start, - .stop = &op_amd_stop, - .shutdown = &op_amd_shutdown -}; -- cgit v1.2.3 From 64a76f667d987a559ad0726b4692c987800b22bc Mon Sep 17 00:00:00 2001 From: David Brownell Date: Tue, 29 Jul 2008 12:47:38 -0700 Subject: hpet: /dev/hpet - fixes and cleanup Minor /dev/hpet updates and bugfixes: * Remove dead code, mostly remnants of an incomplete/unusable kernel interface ... noted when addressing "sparse" warnings: + hpet_unregister() and a routine it calls + hpet_task and all references, including hpet_task_lock + hpet_data.hd_flags (and HPET_DATA_PLATFORM) * Correct and improve boot message: + displays *counter* (shared between comparators) bit width, not *timer* bit widths (which are often mixed) + relabel "timers" as "comparators"; this is less confusing, they are not independent like normal timers are (sigh) + display MHz not Hz; it's never less than 10 MHz. 
* Tighten and correct the userspace interface code + don't accidentally program comparators in 64-bit mode using 32-bit values ... always force comparators into 32-bit mode + provide the correct bit definition flagging comparators with periodic capability ... the ABI is unchanged * Update Documentation/hpet.txt + be more correct and current + expand description a bit + don't mention that now-gone kernel interface Plus, add a FIXME comment for something that could cause big trouble on systems with more capable HPETs than at least Intel seems to ship. It seems that few folk use this userspace interface; it's not very usable given the general lack of HPET IRQ routing. I'm told that the only real point of it any more is to mmap for fast timestamps; IMO that's handled better through the gettimeofday() vsyscall. Signed-off-by: David Brownell Acked-by: Clemens Ladisch Signed-off-by: Ingo Molnar --- Documentation/timers/hpet.txt | 43 ++++++++++----------- arch/x86/kernel/hpet.c | 6 ++- drivers/char/hpet.c | 90 ++++++++++++------------------------------- include/linux/hpet.h | 11 +----- 4 files changed, 51 insertions(+), 99 deletions(-) diff --git a/Documentation/timers/hpet.txt b/Documentation/timers/hpet.txt index 6ad52d9dad6..e7c09abcfab 100644 --- a/Documentation/timers/hpet.txt +++ b/Documentation/timers/hpet.txt @@ -1,21 +1,32 @@ High Precision Event Timer Driver for Linux -The High Precision Event Timer (HPET) hardware is the future replacement -for the 8254 and Real Time Clock (RTC) periodic timer functionality. -Each HPET can have up to 32 timers. It is possible to configure the -first two timers as legacy replacements for 8254 and RTC periodic timers. -A specification done by Intel and Microsoft can be found at -. +The High Precision Event Timer (HPET) hardware follows a specification +by Intel and Microsoft which can be found at + + http://www.intel.com/technology/architecture/hpetspec.htm + +Each HPET has one fixed-rate counter (at 10+ MHz, hence "High Precision") +and up to 32 comparators. Normally three or more comparators are provided, +each of which can generate oneshot interupts and at least one of which has +additional hardware to support periodic interrupts. The comparators are +also called "timers", which can be misleading since usually timers are +independent of each other ... these share a counter, complicating resets. + +HPET devices can support two interrupt routing modes. In one mode, the +comparators are additional interrupt sources with no particular system +role. Many x86 BIOS writers don't route HPET interrupts at all, which +prevents use of that mode. They support the other "legacy replacement" +mode where the first two comparators block interrupts from 8254 timers +and from the RTC. The driver supports detection of HPET driver allocation and initialization of the HPET before the driver module_init routine is called. This enables platform code which uses timer 0 or 1 as the main timer to intercept HPET initialization. An example of this initialization can be found in -arch/i386/kernel/time_hpet.c. +arch/x86/kernel/hpet.c. -The driver provides two APIs which are very similar to the API found in -the rtc.c driver. There is a user space API and a kernel space API. -An example user space program is provided below. +The driver provides a userspace API which resembles the API found in the +RTC driver framework. An example user space program is provided below. 
#include #include @@ -286,15 +297,3 @@ out: return; } - -The kernel API has three interfaces exported from the driver: - - hpet_register(struct hpet_task *tp, int periodic) - hpet_unregister(struct hpet_task *tp) - hpet_control(struct hpet_task *tp, unsigned int cmd, unsigned long arg) - -The kernel module using this interface fills in the ht_func and ht_data -members of the hpet_task structure before calling hpet_register. -hpet_control simply vectors to the hpet_ioctl routine and has the same -commands and respective arguments as the user API. hpet_unregister -is used to terminate usage of the HPET timer reserved by hpet_register. diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index ad2b15a1334..82d459186fd 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -115,13 +115,17 @@ static void hpet_reserve_platform_timers(unsigned long id) hd.hd_phys_address = hpet_address; hd.hd_address = hpet; hd.hd_nirqs = nrtimers; - hd.hd_flags = HPET_DATA_PLATFORM; hpet_reserve_timer(&hd, 0); #ifdef CONFIG_HPET_EMULATE_RTC hpet_reserve_timer(&hd, 1); #endif + /* + * NOTE that hd_irq[] reflects IOAPIC input pins (LEGACY_8254 + * is wrong for i8259!) not the output IRQ. Many BIOS writers + * don't bother configuring *any* comparator interrupts. + */ hd.hd_irq[0] = HPET_LEGACY_8254; hd.hd_irq[1] = HPET_LEGACY_RTC; diff --git a/drivers/char/hpet.c b/drivers/char/hpet.c index f3981ffe20f..4bc1da4d4f8 100644 --- a/drivers/char/hpet.c +++ b/drivers/char/hpet.c @@ -53,6 +53,11 @@ #define HPET_RANGE_SIZE 1024 /* from HPET spec */ + +/* WARNING -- don't get confused. These macros are never used + * to write the (single) counter, and rarely to read it. + * They're badly named; to fix, someday. + */ #if BITS_PER_LONG == 64 #define write_counter(V, MC) writeq(V, MC) #define read_counter(MC) readq(MC) @@ -77,7 +82,7 @@ static struct clocksource clocksource_hpet = { .rating = 250, .read = read_hpet, .mask = CLOCKSOURCE_MASK(64), - .mult = 0, /*to be caluclated*/ + .mult = 0, /* to be calculated */ .shift = 10, .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; @@ -86,8 +91,6 @@ static struct clocksource *hpet_clocksource; /* A lock for concurrent access by app and isr hpet activity. */ static DEFINE_SPINLOCK(hpet_lock); -/* A lock for concurrent intermodule access to hpet and isr hpet activity. 
*/ -static DEFINE_SPINLOCK(hpet_task_lock); #define HPET_DEV_NAME (7) @@ -99,7 +102,6 @@ struct hpet_dev { unsigned long hd_irqdata; wait_queue_head_t hd_waitqueue; struct fasync_struct *hd_async_queue; - struct hpet_task *hd_task; unsigned int hd_flags; unsigned int hd_irq; unsigned int hd_hdwirq; @@ -173,11 +175,6 @@ static irqreturn_t hpet_interrupt(int irq, void *data) writel(isr, &devp->hd_hpet->hpet_isr); spin_unlock(&hpet_lock); - spin_lock(&hpet_task_lock); - if (devp->hd_task) - devp->hd_task->ht_func(devp->hd_task->ht_data); - spin_unlock(&hpet_task_lock); - wake_up_interruptible(&devp->hd_waitqueue); kill_fasync(&devp->hd_async_queue, SIGIO, POLL_IN); @@ -260,8 +257,7 @@ static int hpet_open(struct inode *inode, struct file *file) for (devp = NULL, hpetp = hpets; hpetp && !devp; hpetp = hpetp->hp_next) for (i = 0; i < hpetp->hp_ntimer; i++) - if (hpetp->hp_dev[i].hd_flags & HPET_OPEN - || hpetp->hp_dev[i].hd_task) + if (hpetp->hp_dev[i].hd_flags & HPET_OPEN) continue; else { devp = &hpetp->hp_dev[i]; @@ -504,7 +500,11 @@ static int hpet_ioctl_ieon(struct hpet_dev *devp) devp->hd_irq = irq; t = devp->hd_ireqfreq; v = readq(&timer->hpet_config); - g = v | Tn_INT_ENB_CNF_MASK; + + /* 64-bit comparators are not yet supported through the ioctls, + * so force this into 32-bit mode if it supports both modes + */ + g = v | Tn_32MODE_CNF_MASK | Tn_INT_ENB_CNF_MASK; if (devp->hd_flags & HPET_PERIODIC) { write_counter(t, &timer->hpet_compare); @@ -514,6 +514,12 @@ static int hpet_ioctl_ieon(struct hpet_dev *devp) v |= Tn_VAL_SET_CNF_MASK; writeq(v, &timer->hpet_config); local_irq_save(flags); + + /* NOTE: what we modify here is a hidden accumulator + * register supported by periodic-capable comparators. + * We never want to modify the (single) counter; that + * would affect all the comparators. + */ m = read_counter(&hpet->hpet_mc); write_counter(t + m + hpetp->hp_delta, &timer->hpet_compare); } else { @@ -667,57 +673,6 @@ static int hpet_is_known(struct hpet_data *hdp) return 0; } -static inline int hpet_tpcheck(struct hpet_task *tp) -{ - struct hpet_dev *devp; - struct hpets *hpetp; - - devp = tp->ht_opaque; - - if (!devp) - return -ENXIO; - - for (hpetp = hpets; hpetp; hpetp = hpetp->hp_next) - if (devp >= hpetp->hp_dev - && devp < (hpetp->hp_dev + hpetp->hp_ntimer) - && devp->hd_hpet == hpetp->hp_hpet) - return 0; - - return -ENXIO; -} - -#if 0 -int hpet_unregister(struct hpet_task *tp) -{ - struct hpet_dev *devp; - struct hpet_timer __iomem *timer; - int err; - - if ((err = hpet_tpcheck(tp))) - return err; - - spin_lock_irq(&hpet_task_lock); - spin_lock(&hpet_lock); - - devp = tp->ht_opaque; - if (devp->hd_task != tp) { - spin_unlock(&hpet_lock); - spin_unlock_irq(&hpet_task_lock); - return -ENXIO; - } - - timer = devp->hd_timer; - writeq((readq(&timer->hpet_config) & ~Tn_INT_ENB_CNF_MASK), - &timer->hpet_config); - devp->hd_flags &= ~(HPET_IE | HPET_PERIODIC); - devp->hd_task = NULL; - spin_unlock(&hpet_lock); - spin_unlock_irq(&hpet_task_lock); - - return 0; -} -#endif /* 0 */ - static ctl_table hpet_table[] = { { .ctl_name = CTL_UNNUMBERED, @@ -872,9 +827,12 @@ int hpet_alloc(struct hpet_data *hdp) printk("%s %d", i > 0 ? "," : "", hdp->hd_irq[i]); printk("\n"); - printk(KERN_INFO "hpet%u: %u %d-bit timers, %Lu Hz\n", - hpetp->hp_which, hpetp->hp_ntimer, - cap & HPET_COUNTER_SIZE_MASK ? 64 : 32, hpetp->hp_tick_freq); + printk(KERN_INFO + "hpet%u: %u comparators, %d-bit %u.%06u MHz counter\n", + hpetp->hp_which, hpetp->hp_ntimer, + cap & HPET_COUNTER_SIZE_MASK ? 
64 : 32, + (unsigned) (hpetp->hp_tick_freq / 1000000), + (unsigned) (hpetp->hp_tick_freq % 1000000)); mcfg = readq(&hpet->hpet_config); if ((mcfg & HPET_ENABLE_CNF_MASK) == 0) { diff --git a/include/linux/hpet.h b/include/linux/hpet.h index 6d2626b63a9..79f63a27bce 100644 --- a/include/linux/hpet.h +++ b/include/linux/hpet.h @@ -92,23 +92,14 @@ struct hpet { * exported interfaces */ -struct hpet_task { - void (*ht_func) (void *); - void *ht_data; - void *ht_opaque; -}; - struct hpet_data { unsigned long hd_phys_address; void __iomem *hd_address; unsigned short hd_nirqs; - unsigned short hd_flags; unsigned int hd_state; /* timer allocated */ unsigned int hd_irq[HPET_MAX_TIMERS]; }; -#define HPET_DATA_PLATFORM 0x0001 /* platform call to hpet_alloc */ - static inline void hpet_reserve_timer(struct hpet_data *hd, int timer) { hd->hd_state |= (1 << timer); @@ -126,7 +117,7 @@ struct hpet_info { unsigned short hi_timer; }; -#define HPET_INFO_PERIODIC 0x0001 /* timer is periodic */ +#define HPET_INFO_PERIODIC 0x0010 /* periodic-capable comparator */ #define HPET_IE_ON _IO('h', 0x01) /* interrupt on */ #define HPET_IE_OFF _IO('h', 0x02) /* interrupt off */ -- cgit v1.2.3 From f92a789d259eb95afe7498ff5938fe2a93d39c82 Mon Sep 17 00:00:00 2001 From: David Brownell Date: Thu, 31 Jul 2008 12:59:56 -0700 Subject: hpet: /dev/hpet - fixes and cleanup, fix MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit fix: On Thursday 31 July 2008, Ingo Molnar wrote: >   drivers/built-in.o: In function `hpet_alloc': >   : undefined reference to `__udivdi3' >   drivers/built-in.o: In function `hpet_alloc': >   : undefined reference to `__umoddi3' Signed-off-by: David Brownell Signed-off-by: Ingo Molnar --- drivers/char/hpet.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/char/hpet.c b/drivers/char/hpet.c index 4bc1da4d4f8..2908a0eb63a 100644 --- a/drivers/char/hpet.c +++ b/drivers/char/hpet.c @@ -764,6 +764,7 @@ int hpet_alloc(struct hpet_data *hdp) static struct hpets *last = NULL; unsigned long period; unsigned long long temp; + u32 remainder; /* * hpet_alloc can be called by platform dependent code. @@ -827,12 +828,13 @@ int hpet_alloc(struct hpet_data *hdp) printk("%s %d", i > 0 ? "," : "", hdp->hd_irq[i]); printk("\n"); + temp = hpetp->hp_tick_freq; + remainder = do_div(temp, 1000000); printk(KERN_INFO "hpet%u: %u comparators, %d-bit %u.%06u MHz counter\n", hpetp->hp_which, hpetp->hp_ntimer, cap & HPET_COUNTER_SIZE_MASK ? 64 : 32, - (unsigned) (hpetp->hp_tick_freq / 1000000), - (unsigned) (hpetp->hp_tick_freq % 1000000)); + (unsigned) temp, remainder); mcfg = readq(&hpet->hpet_config); if ((mcfg & HPET_ENABLE_CNF_MASK) == 0) { -- cgit v1.2.3 From e7250b8ae3870f37f660c2f65cafcaba85e3bfd3 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Fri, 5 Sep 2008 18:33:26 +0200 Subject: x86: hpet: modify IXP400 quirk to enable interrupts The current quirk is incomplete. Some more chipset fiddling has to be done to enable HPET interrupts. This patch aims to do this. From my tests it seems to work faultlessly. But the official statement is that HPET is not supported on SB4X0. Users will still have to use hpet=force to enable it. Use it at your own risk. 
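The extra chipset fiddling goes through the south bridge's index/data register pair at I/O ports 0xcd6/0xcd7. A minimal sketch of that access pattern, with illustrative helper names (only the ports and PM register 0x72 come from the patch itself):

	#include <linux/io.h>

	#define SB_PM_INDEX	0xcd6	/* write the PM register number here... */
	#define SB_PM_DATA	0xcd7	/* ...then read or write its value here */

	static u8 sb_pm_read(u8 reg)
	{
		outb(reg, SB_PM_INDEX);
		return inb(SB_PM_DATA);
	}

	static void sb_pm_set_bit(u8 reg, u8 mask)
	{
		u8 val = sb_pm_read(reg);

		outb(reg, SB_PM_INDEX);
		outb(val | mask, SB_PM_DATA);
	}

The quirk below uses exactly this sequence to set bit 0 of PM register 0x72, then reads the bit back to verify the write stuck before going on to set the HPET bits in PCI config space.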
Signed-off-by: Andreas Herrmann Signed-off-by: Ingo Molnar --- arch/x86/kernel/quirks.c | 41 +++++++++++++++++++++++++++++++++++++++-- 1 file changed, 39 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c index d1385881810..f6a11b9b1f9 100644 --- a/arch/x86/kernel/quirks.c +++ b/arch/x86/kernel/quirks.c @@ -354,9 +354,27 @@ static void ati_force_hpet_resume(void) printk(KERN_DEBUG "Force enabled HPET at resume\n"); } +static u32 ati_ixp4x0_rev(struct pci_dev *dev) +{ + u32 d; + u8 b; + + pci_read_config_byte(dev, 0xac, &b); + b &= ~(1<<5); + pci_write_config_byte(dev, 0xac, b); + pci_read_config_dword(dev, 0x70, &d); + d |= 1<<8; + pci_write_config_dword(dev, 0x70, d); + pci_read_config_dword(dev, 0x8, &d); + d &= 0xff; + dev_printk(KERN_DEBUG, &dev->dev, "SB4X0 revision 0x%x\n", d); + return d; +} + static void ati_force_enable_hpet(struct pci_dev *dev) { - u32 uninitialized_var(val); + u32 d, val; + u8 b; if (hpet_address || force_hpet_address) return; @@ -366,14 +384,33 @@ static void ati_force_enable_hpet(struct pci_dev *dev) return; } + d = ati_ixp4x0_rev(dev); + if (d < 0x82) + return; + + /* base address */ pci_write_config_dword(dev, 0x14, 0xfed00000); pci_read_config_dword(dev, 0x14, &val); + + /* enable interrupt */ + outb(0x72, 0xcd6); b = inb(0xcd7); + b |= 0x1; + outb(0x72, 0xcd6); outb(b, 0xcd7); + outb(0x72, 0xcd6); b = inb(0xcd7); + if (!(b & 0x1)) + return; + pci_read_config_dword(dev, 0x64, &d); + d |= (1<<10); + pci_write_config_dword(dev, 0x64, d); + pci_read_config_dword(dev, 0x64, &d); + if (!(d & (1<<10))) + return; + force_hpet_address = val; force_hpet_resume_type = ATI_FORCE_HPET_RESUME; dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at 0x%lx\n", force_hpet_address); cached_dev = dev; - return; } DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_IXP400_SMBUS, ati_force_enable_hpet); -- cgit v1.2.3 From d7451fca18e2ec62641ae4bbfe946069f13765a3 Mon Sep 17 00:00:00 2001 From: Jordan Crouse Date: Fri, 12 Sep 2008 11:45:22 -0600 Subject: x86, hpet: SB600 - remove HPET resources from PCI device Prevent the HPET resources from appearing in PCI device 14.0 which confuses the PCI resource engine. 
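The tricky part is that SB600 and SB700 share a device ID, so the fixup below keys off the PCI revision ID at config offset 0x08. Condensed to its shape (hypothetical function name, constants as in the patch):

	#include <linux/pci.h>

	static void sb600_hide_hpet_bar_sketch(struct pci_dev *dev)
	{
		u8 rev;

		/* offset 0x08 is the standard PCI revision ID register */
		pci_read_config_byte(dev, PCI_REVISION_ID, &rev);
		if (rev >= 0x2f)
			return;	/* SB700: PM register 0x55 means something else */

		/* SB600: set bit 7 in PM register 0x55 to hide BAR1 */
	}
	DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_ATI, 0x4385, sb600_hide_hpet_bar_sketch);

An early fixup, rather than a header fixup, matters here: the BAR has to disappear before the PCI core sizes and claims resources.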
Signed-off-by: Jordan Crouse Signed-off-by: Ingo Molnar --- arch/x86/pci/fixup.c | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c index 4bdaa590375..3c27a809393 100644 --- a/arch/x86/pci/fixup.c +++ b/arch/x86/pci/fixup.c @@ -511,3 +511,31 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1201, fam10h_pci_cfg_space_size); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1202, fam10h_pci_cfg_space_size); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1203, fam10h_pci_cfg_space_size); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1204, fam10h_pci_cfg_space_size); + +/* + * SB600: Disable BAR1 on device 14.0 to avoid HPET resources from + * confusing the PCI engine: + */ +static void sb600_disable_hpet_bar(struct pci_dev *dev) +{ + u8 val; + + /* + * The SB600 and SB700 both share the same device + * ID, but the PM register 0x55 does something different + * for the SB700, so make sure we are dealing with the + * SB600 before touching the bit: + */ + + pci_read_config_byte(dev, 0x08, &val); + + if (val < 0x2F) { + outb(0x55, 0xCD6); + val = inb(0xCD7); + + /* Set bit 7 in PM register 0x55 */ + outb(0x55, 0xCD6); + outb(val | 0x80, 0xCD7); + } +} +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_ATI, 0x4385, sb600_disable_hpet_bar); -- cgit v1.2.3 From f26ed116c0baa8f9cbbe3d5db84034d75f6250f8 Mon Sep 17 00:00:00 2001 From: David John Date: Fri, 10 Oct 2008 11:42:44 +0530 Subject: HPET: Remove spurious HPET busy warning message. On x86 systems with CONFIG_HPET_TIMER enabled, when the HPET driver (drivers/char/hpet.c) is loaded, an incorrect busy message is printed when the driver initializes since the HPET has already been allocated by the core timer code. Remove the warning message. Signed-off-by: David John Acked-by: Venkatesh Pallipadi Signed-off-by: Ingo Molnar --- drivers/char/hpet.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/char/hpet.c b/drivers/char/hpet.c index 2908a0eb63a..f3cfb4c7612 100644 --- a/drivers/char/hpet.c +++ b/drivers/char/hpet.c @@ -897,8 +897,6 @@ static acpi_status hpet_resources(struct acpi_resource *res, void *data) hdp->hd_address = ioremap(addr.minimum, addr.address_length); if (hpet_is_known(hdp)) { - printk(KERN_DEBUG "%s: 0x%lx is busy\n", - __func__, hdp->hd_phys_address); iounmap(hdp->hd_address); return AE_ALREADY_EXISTS; } @@ -914,8 +912,6 @@ static acpi_status hpet_resources(struct acpi_resource *res, void *data) HPET_RANGE_SIZE); if (hpet_is_known(hdp)) { - printk(KERN_DEBUG "%s: 0x%lx is busy\n", - __func__, hdp->hd_phys_address); iounmap(hdp->hd_address); return AE_ALREADY_EXISTS; } -- cgit v1.2.3 From 9f8305fe164077bcef42034c56275c8208042e88 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Fri, 29 Aug 2008 13:14:30 +0100 Subject: x86-64: reduce boot fixmap space Just like for 32-bit - as 256 entries are needed, aligning to a 256-entry boundary is sufficient and still guarantees the single pte table requirement. Likewise move up __end_of_permanent_fixed_addresses, to match 32-bit. 
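To make the arithmetic concrete: a pte table covers 512 fixmap slots on 64-bit, and the boot-time maps need NR_FIX_BTMAPS * FIX_BTMAPS_NESTING = 64 * 4 = 256 slots. A 256-slot block that starts on a 256-slot boundary lands entirely in the lower or upper half of a single pte table, so it can never straddle two tables, and the stricter 512 alignment buys nothing. A standalone check of the rounding expression, with a made-up end value:

	#include <assert.h>
	#include <stdio.h>

	int main(void)
	{
		/* stand-in for __end_of_permanent_fixed_addresses */
		unsigned long end = 37;
		unsigned long btmap_end = end + 256 - (end & 255);

		assert(btmap_end % 256 == 0);	/* rounded up as intended */
		/* all 256 slots share one 512-slot pte table */
		assert(btmap_end / 512 == (btmap_end + 255) / 512);
		printf("FIX_BTMAP_END = %lu\n", btmap_end);
		return 0;
	}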
Signed-off-by: Jan Beulich Signed-off-by: Ingo Molnar --- include/asm-x86/fixmap_64.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/asm-x86/fixmap_64.h b/include/asm-x86/fixmap_64.h index dafb24bc042..33df64fd0e1 100644 --- a/include/asm-x86/fixmap_64.h +++ b/include/asm-x86/fixmap_64.h @@ -49,6 +49,7 @@ enum fixed_addresses { #ifdef CONFIG_PARAVIRT FIX_PARAVIRT_BOOTMAP, #endif + __end_of_permanent_fixed_addresses, #ifdef CONFIG_ACPI FIX_ACPI_BEGIN, FIX_ACPI_END = FIX_ACPI_BEGIN + FIX_ACPI_PAGES - 1, @@ -56,18 +57,17 @@ enum fixed_addresses { #ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT FIX_OHCI1394_BASE, #endif - __end_of_permanent_fixed_addresses, /* * 256 temporary boot-time mappings, used by early_ioremap(), * before ioremap() is functional. * - * We round it up to the next 512 pages boundary so that we + * We round it up to the next 256 pages boundary so that we * can have a single pgd entry and a single pte table: */ #define NR_FIX_BTMAPS 64 #define FIX_BTMAPS_NESTING 4 - FIX_BTMAP_END = __end_of_permanent_fixed_addresses + 512 - - (__end_of_permanent_fixed_addresses & 511), + FIX_BTMAP_END = __end_of_permanent_fixed_addresses + 256 - + (__end_of_permanent_fixed_addresses & 255), FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS*FIX_BTMAPS_NESTING - 1, __end_of_fixed_addresses }; -- cgit v1.2.3 From 295286a89107c353b9677bc604361c537fd6a1c0 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Fri, 29 Aug 2008 13:21:11 +0100 Subject: x86-64: slightly stream-line 32-bit syscall entry code Avoid updating registers or memory twice as well as needlessly loading or copying registers. Signed-off-by: Jan Beulich Signed-off-by: Ingo Molnar --- arch/x86/ia32/ia32entry.S | 26 ++++++++++---------------- 1 file changed, 10 insertions(+), 16 deletions(-) diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index ffc1bb4fed7..eb4314768bf 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -39,11 +39,11 @@ .endm /* clobbers %eax */ - .macro CLEAR_RREGS + .macro CLEAR_RREGS _r9=rax xorl %eax,%eax movq %rax,R11(%rsp) movq %rax,R10(%rsp) - movq %rax,R9(%rsp) + movq %\_r9,R9(%rsp) movq %rax,R8(%rsp) .endm @@ -52,11 +52,10 @@ * We don't reload %eax because syscall_trace_enter() returned * the value it wants us to use in the table lookup. 
*/ - .macro LOAD_ARGS32 offset - movl \offset(%rsp),%r11d - movl \offset+8(%rsp),%r10d + .macro LOAD_ARGS32 offset, _r9=0 + .if \_r9 movl \offset+16(%rsp),%r9d - movl \offset+24(%rsp),%r8d + .endif movl \offset+40(%rsp),%ecx movl \offset+48(%rsp),%edx movl \offset+56(%rsp),%esi @@ -145,7 +144,7 @@ ENTRY(ia32_sysenter_target) SAVE_ARGS 0,0,1 /* no need to do an access_ok check here because rbp has been 32bit zero extended */ -1: movl (%rbp),%r9d +1: movl (%rbp),%ebp .section __ex_table,"a" .quad 1b,ia32_badarg .previous @@ -157,7 +156,7 @@ ENTRY(ia32_sysenter_target) cmpl $(IA32_NR_syscalls-1),%eax ja ia32_badsys sysenter_do_call: - IA32_ARG_FIXUP 1 + IA32_ARG_FIXUP sysenter_dispatch: call *ia32_sys_call_table(,%rax,8) movq %rax,RAX-ARGOFFSET(%rsp) @@ -234,20 +233,17 @@ sysexit_audit: #endif sysenter_tracesys: - xchgl %r9d,%ebp #ifdef CONFIG_AUDITSYSCALL testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%r10) jz sysenter_auditsys #endif SAVE_REST CLEAR_RREGS - movq %r9,R9(%rsp) movq $-ENOSYS,RAX(%rsp)/* ptrace can change this for a bad syscall */ movq %rsp,%rdi /* &pt_regs -> arg1 */ call syscall_trace_enter LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */ RESTORE_REST - xchgl %ebp,%r9d cmpl $(IA32_NR_syscalls-1),%eax ja int_ret_from_sys_call /* sysenter_tracesys has set RAX(%rsp) */ jmp sysenter_do_call @@ -314,9 +310,9 @@ ENTRY(ia32_cstar_target) testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10) CFI_REMEMBER_STATE jnz cstar_tracesys -cstar_do_call: cmpl $IA32_NR_syscalls-1,%eax ja ia32_badsys +cstar_do_call: IA32_ARG_FIXUP 1 cstar_dispatch: call *ia32_sys_call_table(,%rax,8) @@ -357,15 +353,13 @@ cstar_tracesys: #endif xchgl %r9d,%ebp SAVE_REST - CLEAR_RREGS - movq %r9,R9(%rsp) + CLEAR_RREGS r9 movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */ movq %rsp,%rdi /* &pt_regs -> arg1 */ call syscall_trace_enter - LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */ + LOAD_ARGS32 ARGOFFSET, 1 /* reload args from stack in case ptrace changed it */ RESTORE_REST xchgl %ebp,%r9d - movl RSP-ARGOFFSET(%rsp), %r8d cmpl $(IA32_NR_syscalls-1),%eax ja int_ret_from_sys_call /* cstar_tracesys has set RAX(%rsp) */ jmp cstar_do_call -- cgit v1.2.3 From 0cefa5b9b0a61b62442c5d0ca00a304c5896b6e9 Mon Sep 17 00:00:00 2001 From: Manfred Spraul Date: Sun, 7 Sep 2008 11:29:58 +0200 Subject: arch/x86/kernel/smpboot.c: Clarify when irq processing begins. Secondary cpus start with local interrupts disabled. start_secondary() first initializes the new cpu, then it enables the local interrupts. (although interrupts are enabled within smp_callin() as well). Right now, the local interrupts are enabled as a side effect of calling ipi_call_lock_irq(). The attached patch clarifies when local interrupts are enabled. Signed-off-by: Manfred Spraul Signed-off-by: Ingo Molnar --- arch/x86/kernel/smpboot.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 76b6f50978f..b700c9a1064 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -334,14 +334,17 @@ static void __cpuinit start_secondary(void *unused) * does not change while we are assigning vectors to cpus. Holding * this lock ensures we don't half assign or remove an irq from a cpu. 
*/ - ipi_call_lock_irq(); + ipi_call_lock(); lock_vector_lock(); __setup_vector_irq(smp_processor_id()); cpu_set(smp_processor_id(), cpu_online_map); unlock_vector_lock(); - ipi_call_unlock_irq(); + ipi_call_unlock(); per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE; + /* enable local interrupts */ + local_irq_enable(); + setup_secondary_clock(); wmb(); -- cgit v1.2.3 From 927604c7592473742891dc13e1da09febc06e01b Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 9 Sep 2008 23:34:17 -0700 Subject: x86: rename discontig_32.c to numa_32.c name it in line with its purpose. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/mm/Makefile | 6 +- arch/x86/mm/discontig_32.c | 444 --------------------------------------------- arch/x86/mm/numa_32.c | 444 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 445 insertions(+), 449 deletions(-) delete mode 100644 arch/x86/mm/discontig_32.c create mode 100644 arch/x86/mm/numa_32.c diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index dfb932dcf13..59f89b434b4 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -13,12 +13,8 @@ obj-$(CONFIG_MMIOTRACE) += mmiotrace.o mmiotrace-y := pf_in.o mmio-mod.o obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o -ifeq ($(CONFIG_X86_32),y) -obj-$(CONFIG_NUMA) += discontig_32.o -else -obj-$(CONFIG_NUMA) += numa_64.o +obj-$(CONFIG_NUMA) += numa_$(BITS).o obj-$(CONFIG_K8_NUMA) += k8topology_64.o -endif obj-$(CONFIG_ACPI_NUMA) += srat_$(BITS).o obj-$(CONFIG_MEMTEST) += memtest.o diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c deleted file mode 100644 index 847c164725f..00000000000 --- a/arch/x86/mm/discontig_32.c +++ /dev/null @@ -1,444 +0,0 @@ -/* - * Written by: Patricia Gaughen , IBM Corporation - * August 2002: added remote node KVA remap - Martin J. Bligh - * - * Copyright (C) 2002, IBM Corp. - * - * All rights reserved. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or - * NON INFRINGEMENT. See the GNU General Public License for more - * details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include - -struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; -EXPORT_SYMBOL(node_data); - -/* - * numa interface - we expect the numa architecture specific code to have - * populated the following initialisation. 
- * - * 1) node_online_map - the map of all nodes configured (online) in the system - * 2) node_start_pfn - the starting page frame number for a node - * 3) node_end_pfn - the ending page fram number for a node - */ -unsigned long node_start_pfn[MAX_NUMNODES] __read_mostly; -unsigned long node_end_pfn[MAX_NUMNODES] __read_mostly; - - -#ifdef CONFIG_DISCONTIGMEM -/* - * 4) physnode_map - the mapping between a pfn and owning node - * physnode_map keeps track of the physical memory layout of a generic - * numa node on a 64Mb break (each element of the array will - * represent 64Mb of memory and will be marked by the node id. so, - * if the first gig is on node 0, and the second gig is on node 1 - * physnode_map will contain: - * - * physnode_map[0-15] = 0; - * physnode_map[16-31] = 1; - * physnode_map[32- ] = -1; - */ -s8 physnode_map[MAX_ELEMENTS] __read_mostly = { [0 ... (MAX_ELEMENTS - 1)] = -1}; -EXPORT_SYMBOL(physnode_map); - -void memory_present(int nid, unsigned long start, unsigned long end) -{ - unsigned long pfn; - - printk(KERN_INFO "Node: %d, start_pfn: %lx, end_pfn: %lx\n", - nid, start, end); - printk(KERN_DEBUG " Setting physnode_map array to node %d for pfns:\n", nid); - printk(KERN_DEBUG " "); - for (pfn = start; pfn < end; pfn += PAGES_PER_ELEMENT) { - physnode_map[pfn / PAGES_PER_ELEMENT] = nid; - printk(KERN_CONT "%lx ", pfn); - } - printk(KERN_CONT "\n"); -} - -unsigned long node_memmap_size_bytes(int nid, unsigned long start_pfn, - unsigned long end_pfn) -{ - unsigned long nr_pages = end_pfn - start_pfn; - - if (!nr_pages) - return 0; - - return (nr_pages + 1) * sizeof(struct page); -} -#endif - -extern unsigned long find_max_low_pfn(void); -extern unsigned long highend_pfn, highstart_pfn; - -#define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE) - -unsigned long node_remap_size[MAX_NUMNODES]; -static void *node_remap_start_vaddr[MAX_NUMNODES]; -void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags); - -static unsigned long kva_start_pfn; -static unsigned long kva_pages; -/* - * FLAT - support for basic PC memory model with discontig enabled, essentially - * a single node with all available processors in it with a flat - * memory map. - */ -int __init get_memcfg_numa_flat(void) -{ - printk(KERN_DEBUG "NUMA - single node, flat memory mode\n"); - - node_start_pfn[0] = 0; - node_end_pfn[0] = max_pfn; - e820_register_active_regions(0, 0, max_pfn); - memory_present(0, 0, max_pfn); - node_remap_size[0] = node_memmap_size_bytes(0, 0, max_pfn); - - /* Indicate there is one node available. */ - nodes_clear(node_online_map); - node_set_online(0); - return 1; -} - -/* - * Find the highest page frame number we have available for the node - */ -static void __init propagate_e820_map_node(int nid) -{ - if (node_end_pfn[nid] > max_pfn) - node_end_pfn[nid] = max_pfn; - /* - * if a user has given mem=XXXX, then we need to make sure - * that the node _starts_ before that, too, not just ends - */ - if (node_start_pfn[nid] > max_pfn) - node_start_pfn[nid] = max_pfn; - BUG_ON(node_start_pfn[nid] > node_end_pfn[nid]); -} - -/* - * Allocate memory for the pg_data_t for this node via a crude pre-bootmem - * method. For node zero take this from the bottom of memory, for - * subsequent nodes place them at node_remap_start_vaddr which contains - * node local data in physically node local memory. See setup_memory() - * for details. 
- */ -static void __init allocate_pgdat(int nid) -{ - char buf[16]; - - if (node_has_online_mem(nid) && node_remap_start_vaddr[nid]) - NODE_DATA(nid) = (pg_data_t *)node_remap_start_vaddr[nid]; - else { - unsigned long pgdat_phys; - pgdat_phys = find_e820_area(min_low_pfn<>PAGE_SHIFT)); - memset(buf, 0, sizeof(buf)); - sprintf(buf, "NODE_DATA %d", nid); - reserve_early(pgdat_phys, pgdat_phys + sizeof(pg_data_t), buf); - } - printk(KERN_DEBUG "allocate_pgdat: node %d NODE_DATA %08lx\n", - nid, (unsigned long)NODE_DATA(nid)); -} - -/* - * In the DISCONTIGMEM and SPARSEMEM memory model, a portion of the kernel - * virtual address space (KVA) is reserved and portions of nodes are mapped - * using it. This is to allow node-local memory to be allocated for - * structures that would normally require ZONE_NORMAL. The memory is - * allocated with alloc_remap() and callers should be prepared to allocate - * from the bootmem allocator instead. - */ -static unsigned long node_remap_start_pfn[MAX_NUMNODES]; -static void *node_remap_end_vaddr[MAX_NUMNODES]; -static void *node_remap_alloc_vaddr[MAX_NUMNODES]; -static unsigned long node_remap_offset[MAX_NUMNODES]; - -void *alloc_remap(int nid, unsigned long size) -{ - void *allocation = node_remap_alloc_vaddr[nid]; - - size = ALIGN(size, L1_CACHE_BYTES); - - if (!allocation || (allocation + size) >= node_remap_end_vaddr[nid]) - return 0; - - node_remap_alloc_vaddr[nid] += size; - memset(allocation, 0, size); - - return allocation; -} - -static void __init remap_numa_kva(void) -{ - void *vaddr; - unsigned long pfn; - int node; - - for_each_online_node(node) { - printk(KERN_DEBUG "remap_numa_kva: node %d\n", node); - for (pfn=0; pfn < node_remap_size[node]; pfn += PTRS_PER_PTE) { - vaddr = node_remap_start_vaddr[node]+(pfn< max_pfn) - continue; - if (!node_end_pfn[nid]) - continue; - if (node_end_pfn[nid] > max_pfn) - node_end_pfn[nid] = max_pfn; - - /* ensure the remap includes space for the pgdat. */ - size = node_remap_size[nid] + sizeof(pg_data_t); - - /* convert size to large (pmd size) pages, rounding up */ - size = (size + LARGE_PAGE_BYTES - 1) / LARGE_PAGE_BYTES; - /* now the roundup is correct, convert to PAGE_SIZE pages */ - size = size * PTRS_PER_PTE; - - node_kva_target = round_down(node_end_pfn[nid] - size, - PTRS_PER_PTE); - node_kva_target <<= PAGE_SHIFT; - do { - node_kva_final = find_e820_area(node_kva_target, - ((u64)node_end_pfn[nid])<>PAGE_SHIFT) > (node_start_pfn[nid])); - - if (node_kva_final == -1ULL) - panic("Can not get kva ram\n"); - - node_remap_size[nid] = size; - node_remap_offset[nid] = reserve_pages; - reserve_pages += size; - printk(KERN_DEBUG "Reserving %ld pages of KVA for lmem_map of" - " node %d at %llx\n", - size, nid, node_kva_final>>PAGE_SHIFT); - - /* - * prevent kva address below max_low_pfn want it on system - * with less memory later. - * layout will be: KVA address , KVA RAM - * - * we are supposed to only record the one less then max_low_pfn - * but we could have some hole in high memory, and it will only - * check page_is_ram(pfn) && !page_is_reserved_early(pfn) to decide - * to use it as free. 
- * So reserve_early here, hope we don't run out of that array - */ - reserve_early(node_kva_final, - node_kva_final+(((u64)size)<>PAGE_SHIFT; - remove_active_range(nid, node_remap_start_pfn[nid], - node_remap_start_pfn[nid] + size); - } - printk(KERN_INFO "Reserving total of %lx pages for numa KVA remap\n", - reserve_pages); - return reserve_pages; -} - -static void init_remap_allocator(int nid) -{ - node_remap_start_vaddr[nid] = pfn_to_kaddr( - kva_start_pfn + node_remap_offset[nid]); - node_remap_end_vaddr[nid] = node_remap_start_vaddr[nid] + - (node_remap_size[nid] * PAGE_SIZE); - node_remap_alloc_vaddr[nid] = node_remap_start_vaddr[nid] + - ALIGN(sizeof(pg_data_t), PAGE_SIZE); - - printk(KERN_DEBUG "node %d will remap to vaddr %08lx - %08lx\n", nid, - (ulong) node_remap_start_vaddr[nid], - (ulong) node_remap_end_vaddr[nid]); -} - -void __init initmem_init(unsigned long start_pfn, - unsigned long end_pfn) -{ - int nid; - long kva_target_pfn; - - /* - * When mapping a NUMA machine we allocate the node_mem_map arrays - * from node local memory. They are then mapped directly into KVA - * between zone normal and vmalloc space. Calculate the size of - * this space and use it to adjust the boundary between ZONE_NORMAL - * and ZONE_HIGHMEM. - */ - - get_memcfg_numa(); - - kva_pages = roundup(calculate_numa_remap_pages(), PTRS_PER_PTE); - - kva_target_pfn = round_down(max_low_pfn - kva_pages, PTRS_PER_PTE); - do { - kva_start_pfn = find_e820_area(kva_target_pfn<> PAGE_SHIFT; - kva_target_pfn -= PTRS_PER_PTE; - } while (kva_start_pfn == -1UL && kva_target_pfn > min_low_pfn); - - if (kva_start_pfn == -1UL) - panic("Can not get kva space\n"); - - printk(KERN_INFO "kva_start_pfn ~ %lx max_low_pfn ~ %lx\n", - kva_start_pfn, max_low_pfn); - printk(KERN_INFO "max_pfn = %lx\n", max_pfn); - - /* avoid clash with initrd */ - reserve_early(kva_start_pfn< max_low_pfn) - highstart_pfn = max_low_pfn; - printk(KERN_NOTICE "%ldMB HIGHMEM available.\n", - pages_to_mb(highend_pfn - highstart_pfn)); - num_physpages = highend_pfn; - high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1; -#else - num_physpages = max_low_pfn; - high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1; -#endif - printk(KERN_NOTICE "%ldMB LOWMEM available.\n", - pages_to_mb(max_low_pfn)); - printk(KERN_DEBUG "max_low_pfn = %lx, highstart_pfn = %lx\n", - max_low_pfn, highstart_pfn); - - printk(KERN_DEBUG "Low memory ends at vaddr %08lx\n", - (ulong) pfn_to_kaddr(max_low_pfn)); - for_each_online_node(nid) { - init_remap_allocator(nid); - - allocate_pgdat(nid); - } - remap_numa_kva(); - - printk(KERN_DEBUG "High memory starts at vaddr %08lx\n", - (ulong) pfn_to_kaddr(highstart_pfn)); - for_each_online_node(nid) - propagate_e820_map_node(nid); - - for_each_online_node(nid) - memset(NODE_DATA(nid), 0, sizeof(struct pglist_data)); - - NODE_DATA(0)->bdata = &bootmem_node_data[0]; - setup_bootmem_allocator(); -} - -void __init set_highmem_pages_init(void) -{ -#ifdef CONFIG_HIGHMEM - struct zone *zone; - int nid; - - for_each_zone(zone) { - unsigned long zone_start_pfn, zone_end_pfn; - - if (!is_highmem(zone)) - continue; - - zone_start_pfn = zone->zone_start_pfn; - zone_end_pfn = zone_start_pfn + zone->spanned_pages; - - nid = zone_to_nid(zone); - printk(KERN_INFO "Initializing %s for node %d (%08lx:%08lx)\n", - zone->name, nid, zone_start_pfn, zone_end_pfn); - - add_highpages_with_active_regions(nid, zone_start_pfn, - zone_end_pfn); - } - totalram_pages += totalhigh_pages; -#endif -} - -#ifdef CONFIG_MEMORY_HOTPLUG -static int 
paddr_to_nid(u64 addr) -{ - int nid; - unsigned long pfn = PFN_DOWN(addr); - - for_each_node(nid) - if (node_start_pfn[nid] <= pfn && - pfn < node_end_pfn[nid]) - return nid; - - return -1; -} - -/* - * This function is used to ask node id BEFORE memmap and mem_section's - * initialization (pfn_to_nid() can't be used yet). - * If _PXM is not defined on ACPI's DSDT, node id must be found by this. - */ -int memory_add_physaddr_to_nid(u64 addr) -{ - int nid = paddr_to_nid(addr); - return (nid >= 0) ? nid : 0; -} - -EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); -#endif - diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c new file mode 100644 index 00000000000..847c164725f --- /dev/null +++ b/arch/x86/mm/numa_32.c @@ -0,0 +1,444 @@ +/* + * Written by: Patricia Gaughen , IBM Corporation + * August 2002: added remote node KVA remap - Martin J. Bligh + * + * Copyright (C) 2002, IBM Corp. + * + * All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; +EXPORT_SYMBOL(node_data); + +/* + * numa interface - we expect the numa architecture specific code to have + * populated the following initialisation. + * + * 1) node_online_map - the map of all nodes configured (online) in the system + * 2) node_start_pfn - the starting page frame number for a node + * 3) node_end_pfn - the ending page fram number for a node + */ +unsigned long node_start_pfn[MAX_NUMNODES] __read_mostly; +unsigned long node_end_pfn[MAX_NUMNODES] __read_mostly; + + +#ifdef CONFIG_DISCONTIGMEM +/* + * 4) physnode_map - the mapping between a pfn and owning node + * physnode_map keeps track of the physical memory layout of a generic + * numa node on a 64Mb break (each element of the array will + * represent 64Mb of memory and will be marked by the node id. so, + * if the first gig is on node 0, and the second gig is on node 1 + * physnode_map will contain: + * + * physnode_map[0-15] = 0; + * physnode_map[16-31] = 1; + * physnode_map[32- ] = -1; + */ +s8 physnode_map[MAX_ELEMENTS] __read_mostly = { [0 ... 
(MAX_ELEMENTS - 1)] = -1}; +EXPORT_SYMBOL(physnode_map); + +void memory_present(int nid, unsigned long start, unsigned long end) +{ + unsigned long pfn; + + printk(KERN_INFO "Node: %d, start_pfn: %lx, end_pfn: %lx\n", + nid, start, end); + printk(KERN_DEBUG " Setting physnode_map array to node %d for pfns:\n", nid); + printk(KERN_DEBUG " "); + for (pfn = start; pfn < end; pfn += PAGES_PER_ELEMENT) { + physnode_map[pfn / PAGES_PER_ELEMENT] = nid; + printk(KERN_CONT "%lx ", pfn); + } + printk(KERN_CONT "\n"); +} + +unsigned long node_memmap_size_bytes(int nid, unsigned long start_pfn, + unsigned long end_pfn) +{ + unsigned long nr_pages = end_pfn - start_pfn; + + if (!nr_pages) + return 0; + + return (nr_pages + 1) * sizeof(struct page); +} +#endif + +extern unsigned long find_max_low_pfn(void); +extern unsigned long highend_pfn, highstart_pfn; + +#define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE) + +unsigned long node_remap_size[MAX_NUMNODES]; +static void *node_remap_start_vaddr[MAX_NUMNODES]; +void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags); + +static unsigned long kva_start_pfn; +static unsigned long kva_pages; +/* + * FLAT - support for basic PC memory model with discontig enabled, essentially + * a single node with all available processors in it with a flat + * memory map. + */ +int __init get_memcfg_numa_flat(void) +{ + printk(KERN_DEBUG "NUMA - single node, flat memory mode\n"); + + node_start_pfn[0] = 0; + node_end_pfn[0] = max_pfn; + e820_register_active_regions(0, 0, max_pfn); + memory_present(0, 0, max_pfn); + node_remap_size[0] = node_memmap_size_bytes(0, 0, max_pfn); + + /* Indicate there is one node available. */ + nodes_clear(node_online_map); + node_set_online(0); + return 1; +} + +/* + * Find the highest page frame number we have available for the node + */ +static void __init propagate_e820_map_node(int nid) +{ + if (node_end_pfn[nid] > max_pfn) + node_end_pfn[nid] = max_pfn; + /* + * if a user has given mem=XXXX, then we need to make sure + * that the node _starts_ before that, too, not just ends + */ + if (node_start_pfn[nid] > max_pfn) + node_start_pfn[nid] = max_pfn; + BUG_ON(node_start_pfn[nid] > node_end_pfn[nid]); +} + +/* + * Allocate memory for the pg_data_t for this node via a crude pre-bootmem + * method. For node zero take this from the bottom of memory, for + * subsequent nodes place them at node_remap_start_vaddr which contains + * node local data in physically node local memory. See setup_memory() + * for details. + */ +static void __init allocate_pgdat(int nid) +{ + char buf[16]; + + if (node_has_online_mem(nid) && node_remap_start_vaddr[nid]) + NODE_DATA(nid) = (pg_data_t *)node_remap_start_vaddr[nid]; + else { + unsigned long pgdat_phys; + pgdat_phys = find_e820_area(min_low_pfn<>PAGE_SHIFT)); + memset(buf, 0, sizeof(buf)); + sprintf(buf, "NODE_DATA %d", nid); + reserve_early(pgdat_phys, pgdat_phys + sizeof(pg_data_t), buf); + } + printk(KERN_DEBUG "allocate_pgdat: node %d NODE_DATA %08lx\n", + nid, (unsigned long)NODE_DATA(nid)); +} + +/* + * In the DISCONTIGMEM and SPARSEMEM memory model, a portion of the kernel + * virtual address space (KVA) is reserved and portions of nodes are mapped + * using it. This is to allow node-local memory to be allocated for + * structures that would normally require ZONE_NORMAL. The memory is + * allocated with alloc_remap() and callers should be prepared to allocate + * from the bootmem allocator instead. 
+ */ +static unsigned long node_remap_start_pfn[MAX_NUMNODES]; +static void *node_remap_end_vaddr[MAX_NUMNODES]; +static void *node_remap_alloc_vaddr[MAX_NUMNODES]; +static unsigned long node_remap_offset[MAX_NUMNODES]; + +void *alloc_remap(int nid, unsigned long size) +{ + void *allocation = node_remap_alloc_vaddr[nid]; + + size = ALIGN(size, L1_CACHE_BYTES); + + if (!allocation || (allocation + size) >= node_remap_end_vaddr[nid]) + return 0; + + node_remap_alloc_vaddr[nid] += size; + memset(allocation, 0, size); + + return allocation; +} + +static void __init remap_numa_kva(void) +{ + void *vaddr; + unsigned long pfn; + int node; + + for_each_online_node(node) { + printk(KERN_DEBUG "remap_numa_kva: node %d\n", node); + for (pfn=0; pfn < node_remap_size[node]; pfn += PTRS_PER_PTE) { + vaddr = node_remap_start_vaddr[node]+(pfn< max_pfn) + continue; + if (!node_end_pfn[nid]) + continue; + if (node_end_pfn[nid] > max_pfn) + node_end_pfn[nid] = max_pfn; + + /* ensure the remap includes space for the pgdat. */ + size = node_remap_size[nid] + sizeof(pg_data_t); + + /* convert size to large (pmd size) pages, rounding up */ + size = (size + LARGE_PAGE_BYTES - 1) / LARGE_PAGE_BYTES; + /* now the roundup is correct, convert to PAGE_SIZE pages */ + size = size * PTRS_PER_PTE; + + node_kva_target = round_down(node_end_pfn[nid] - size, + PTRS_PER_PTE); + node_kva_target <<= PAGE_SHIFT; + do { + node_kva_final = find_e820_area(node_kva_target, + ((u64)node_end_pfn[nid])<>PAGE_SHIFT) > (node_start_pfn[nid])); + + if (node_kva_final == -1ULL) + panic("Can not get kva ram\n"); + + node_remap_size[nid] = size; + node_remap_offset[nid] = reserve_pages; + reserve_pages += size; + printk(KERN_DEBUG "Reserving %ld pages of KVA for lmem_map of" + " node %d at %llx\n", + size, nid, node_kva_final>>PAGE_SHIFT); + + /* + * prevent kva address below max_low_pfn want it on system + * with less memory later. + * layout will be: KVA address , KVA RAM + * + * we are supposed to only record the one less then max_low_pfn + * but we could have some hole in high memory, and it will only + * check page_is_ram(pfn) && !page_is_reserved_early(pfn) to decide + * to use it as free. + * So reserve_early here, hope we don't run out of that array + */ + reserve_early(node_kva_final, + node_kva_final+(((u64)size)<>PAGE_SHIFT; + remove_active_range(nid, node_remap_start_pfn[nid], + node_remap_start_pfn[nid] + size); + } + printk(KERN_INFO "Reserving total of %lx pages for numa KVA remap\n", + reserve_pages); + return reserve_pages; +} + +static void init_remap_allocator(int nid) +{ + node_remap_start_vaddr[nid] = pfn_to_kaddr( + kva_start_pfn + node_remap_offset[nid]); + node_remap_end_vaddr[nid] = node_remap_start_vaddr[nid] + + (node_remap_size[nid] * PAGE_SIZE); + node_remap_alloc_vaddr[nid] = node_remap_start_vaddr[nid] + + ALIGN(sizeof(pg_data_t), PAGE_SIZE); + + printk(KERN_DEBUG "node %d will remap to vaddr %08lx - %08lx\n", nid, + (ulong) node_remap_start_vaddr[nid], + (ulong) node_remap_end_vaddr[nid]); +} + +void __init initmem_init(unsigned long start_pfn, + unsigned long end_pfn) +{ + int nid; + long kva_target_pfn; + + /* + * When mapping a NUMA machine we allocate the node_mem_map arrays + * from node local memory. They are then mapped directly into KVA + * between zone normal and vmalloc space. Calculate the size of + * this space and use it to adjust the boundary between ZONE_NORMAL + * and ZONE_HIGHMEM. 
+ */ + + get_memcfg_numa(); + + kva_pages = roundup(calculate_numa_remap_pages(), PTRS_PER_PTE); + + kva_target_pfn = round_down(max_low_pfn - kva_pages, PTRS_PER_PTE); + do { + kva_start_pfn = find_e820_area(kva_target_pfn<> PAGE_SHIFT; + kva_target_pfn -= PTRS_PER_PTE; + } while (kva_start_pfn == -1UL && kva_target_pfn > min_low_pfn); + + if (kva_start_pfn == -1UL) + panic("Can not get kva space\n"); + + printk(KERN_INFO "kva_start_pfn ~ %lx max_low_pfn ~ %lx\n", + kva_start_pfn, max_low_pfn); + printk(KERN_INFO "max_pfn = %lx\n", max_pfn); + + /* avoid clash with initrd */ + reserve_early(kva_start_pfn< max_low_pfn) + highstart_pfn = max_low_pfn; + printk(KERN_NOTICE "%ldMB HIGHMEM available.\n", + pages_to_mb(highend_pfn - highstart_pfn)); + num_physpages = highend_pfn; + high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1; +#else + num_physpages = max_low_pfn; + high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1; +#endif + printk(KERN_NOTICE "%ldMB LOWMEM available.\n", + pages_to_mb(max_low_pfn)); + printk(KERN_DEBUG "max_low_pfn = %lx, highstart_pfn = %lx\n", + max_low_pfn, highstart_pfn); + + printk(KERN_DEBUG "Low memory ends at vaddr %08lx\n", + (ulong) pfn_to_kaddr(max_low_pfn)); + for_each_online_node(nid) { + init_remap_allocator(nid); + + allocate_pgdat(nid); + } + remap_numa_kva(); + + printk(KERN_DEBUG "High memory starts at vaddr %08lx\n", + (ulong) pfn_to_kaddr(highstart_pfn)); + for_each_online_node(nid) + propagate_e820_map_node(nid); + + for_each_online_node(nid) + memset(NODE_DATA(nid), 0, sizeof(struct pglist_data)); + + NODE_DATA(0)->bdata = &bootmem_node_data[0]; + setup_bootmem_allocator(); +} + +void __init set_highmem_pages_init(void) +{ +#ifdef CONFIG_HIGHMEM + struct zone *zone; + int nid; + + for_each_zone(zone) { + unsigned long zone_start_pfn, zone_end_pfn; + + if (!is_highmem(zone)) + continue; + + zone_start_pfn = zone->zone_start_pfn; + zone_end_pfn = zone_start_pfn + zone->spanned_pages; + + nid = zone_to_nid(zone); + printk(KERN_INFO "Initializing %s for node %d (%08lx:%08lx)\n", + zone->name, nid, zone_start_pfn, zone_end_pfn); + + add_highpages_with_active_regions(nid, zone_start_pfn, + zone_end_pfn); + } + totalram_pages += totalhigh_pages; +#endif +} + +#ifdef CONFIG_MEMORY_HOTPLUG +static int paddr_to_nid(u64 addr) +{ + int nid; + unsigned long pfn = PFN_DOWN(addr); + + for_each_node(nid) + if (node_start_pfn[nid] <= pfn && + pfn < node_end_pfn[nid]) + return nid; + + return -1; +} + +/* + * This function is used to ask node id BEFORE memmap and mem_section's + * initialization (pfn_to_nid() can't be used yet). + * If _PXM is not defined on ACPI's DSDT, node id must be found by this. + */ +int memory_add_physaddr_to_nid(u64 addr) +{ + int nid = paddr_to_nid(addr); + return (nid >= 0) ? nid : 0; +} + +EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); +#endif + -- cgit v1.2.3 From 762db4347060c5d23e9675846e02f7d95d7f944f Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Tue, 9 Sep 2008 21:55:55 +0200 Subject: i386: remove kprobes' restore_interrupts in favour of conditional_sti x86_64 uses a helper function conditional_sti in traps_64.c which is equal to restore_interrupts in kprobes.h. The only user of restore_interrupts is in traps_32.c. Introduce conditional_sti for i386 and remove restore_interrupts. 
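The intended calling pattern looks like this, with an illustrative handler name (the real conversions follow in later patches): an interrupt gate clears EFLAGS.IF on entry, and conditional_sti() then re-enables interrupts only if the interrupted context had them on, rather than unconditionally.

	void do_example_trap(struct pt_regs *regs, long error_code)
	{
		/* entered through an interrupt gate: IF is clear here, so
		 * per-cpu state can be examined before anything interrupts us */
		conditional_sti(regs);	/* mirror the interrupted context's IF */

		/* ... the usual trap handling and signal delivery ... */
	}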
Signed-off-by: Alexander van Heukelum Signed-off-by: Ingo Molnar --- arch/x86/kernel/traps_32.c | 8 +++++++- include/asm-x86/kprobes.h | 9 --------- 2 files changed, 7 insertions(+), 10 deletions(-) diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c index 0429c5de5ea..203b863ac2a 100644 --- a/arch/x86/kernel/traps_32.c +++ b/arch/x86/kernel/traps_32.c @@ -84,6 +84,12 @@ static unsigned int code_bytes = 64; static int ignore_nmis; static int die_counter; +static inline void conditional_sti(struct pt_regs *regs) +{ + if (regs->flags & X86_EFLAGS_IF) + local_irq_enable(); +} + void printk_address(unsigned long address, int reliable) { #ifdef CONFIG_KALLSYMS @@ -859,7 +865,7 @@ void __kprobes do_int3(struct pt_regs *regs, long error_code) * This is an interrupt gate, because kprobes wants interrupts * disabled. Normal trap handlers don't. */ - restore_interrupts(regs); + conditional_sti(regs); do_trap(3, SIGTRAP, "int3", 1, regs, error_code, NULL); } diff --git a/include/asm-x86/kprobes.h b/include/asm-x86/kprobes.h index bd8407863c1..8a0748d0103 100644 --- a/include/asm-x86/kprobes.h +++ b/include/asm-x86/kprobes.h @@ -82,15 +82,6 @@ struct kprobe_ctlblk { struct prev_kprobe prev_kprobe; }; -/* trap3/1 are intr gates for kprobes. So, restore the status of IF, - * if necessary, before executing the original int3/1 (trap) handler. - */ -static inline void restore_interrupts(struct pt_regs *regs) -{ - if (regs->flags & X86_EFLAGS_IF) - local_irq_enable(); -} - extern int kprobe_fault_handler(struct pt_regs *regs, int trapnr); extern int kprobe_exceptions_notify(struct notifier_block *self, unsigned long val, void *data); -- cgit v1.2.3 From 61aef7d24909b95f9eddaa78d78902fbea1d7059 Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Tue, 9 Sep 2008 21:55:56 +0200 Subject: i386: prepare to convert exceptions to interrupts There is some macro magic in traps_32.c to construct standard exception dispatch functions. This patch renames the DO_ERROR- like macros to DO_TRAP, and introduces new DO_ERROR ones that conditionally reenable interrupts explicitly, like x86_64. No code changes. 
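Expanded by hand, the difference between the two families is a single statement. Applying the reworked DO_ERROR macro to the invalid TSS case (which this patch still dispatches via DO_TRAP) would expand to roughly:

	void do_invalid_TSS(struct pt_regs *regs, long error_code)
	{
		trace_hardirqs_fixup();
		if (notify_die(DIE_TRAP, "invalid TSS", regs, error_code,
				10, SIGSEGV) == NOTIFY_STOP)
			return;
		conditional_sti(regs);	/* only the DO_ERROR form has this */
		do_trap(10, SIGSEGV, "invalid TSS", 0, regs, error_code, NULL);
	}

The DO_TRAP expansion is identical except that it never touches the interrupt flag.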
Signed-off-by: Alexander van Heukelum Signed-off-by: Ingo Molnar --- arch/x86/kernel/traps_32.c | 76 +++++++++++++++++++++++++++++++++++++++------- 1 file changed, 65 insertions(+), 11 deletions(-) diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c index 203b863ac2a..fe6dd972068 100644 --- a/arch/x86/kernel/traps_32.c +++ b/arch/x86/kernel/traps_32.c @@ -528,6 +528,56 @@ vm86_trap: return; } +#define DO_TRAP(trapnr, signr, str, name) \ +void do_##name(struct pt_regs *regs, long error_code) \ +{ \ + trace_hardirqs_fixup(); \ + if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ + == NOTIFY_STOP) \ + return; \ + do_trap(trapnr, signr, str, 0, regs, error_code, NULL); \ +} + +#define DO_TRAP_INFO(trapnr, signr, str, name, sicode, siaddr, irq) \ +void do_##name(struct pt_regs *regs, long error_code) \ +{ \ + siginfo_t info; \ + if (irq) \ + local_irq_enable(); \ + info.si_signo = signr; \ + info.si_errno = 0; \ + info.si_code = sicode; \ + info.si_addr = (void __user *)siaddr; \ + if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ + == NOTIFY_STOP) \ + return; \ + do_trap(trapnr, signr, str, 0, regs, error_code, &info); \ +} + +#define DO_VM86_TRAP(trapnr, signr, str, name) \ +void do_##name(struct pt_regs *regs, long error_code) \ +{ \ + if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ + == NOTIFY_STOP) \ + return; \ + do_trap(trapnr, signr, str, 1, regs, error_code, NULL); \ +} + +#define DO_VM86_TRAP_INFO(trapnr, signr, str, name, sicode, siaddr) \ +void do_##name(struct pt_regs *regs, long error_code) \ +{ \ + siginfo_t info; \ + info.si_signo = signr; \ + info.si_errno = 0; \ + info.si_code = sicode; \ + info.si_addr = (void __user *)siaddr; \ + trace_hardirqs_fixup(); \ + if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ + == NOTIFY_STOP) \ + return; \ + do_trap(trapnr, signr, str, 1, regs, error_code, &info); \ +} + #define DO_ERROR(trapnr, signr, str, name) \ void do_##name(struct pt_regs *regs, long error_code) \ { \ @@ -535,6 +585,7 @@ void do_##name(struct pt_regs *regs, long error_code) \ if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ == NOTIFY_STOP) \ return; \ + conditional_sti(regs); \ do_trap(trapnr, signr, str, 0, regs, error_code, NULL); \ } @@ -551,6 +602,7 @@ void do_##name(struct pt_regs *regs, long error_code) \ if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ == NOTIFY_STOP) \ return; \ + conditional_sti(regs); \ do_trap(trapnr, signr, str, 0, regs, error_code, &info); \ } @@ -560,6 +612,7 @@ void do_##name(struct pt_regs *regs, long error_code) \ if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ == NOTIFY_STOP) \ return; \ + conditional_sti(regs); \ do_trap(trapnr, signr, str, 1, regs, error_code, NULL); \ } @@ -575,22 +628,23 @@ void do_##name(struct pt_regs *regs, long error_code) \ if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ == NOTIFY_STOP) \ return; \ + conditional_sti(regs); \ do_trap(trapnr, signr, str, 1, regs, error_code, &info); \ } -DO_VM86_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip) +DO_VM86_TRAP_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip) #ifndef CONFIG_KPROBES -DO_VM86_ERROR(3, SIGTRAP, "int3", int3) +DO_VM86_TRAP(3, SIGTRAP, "int3", int3) #endif -DO_VM86_ERROR(4, SIGSEGV, "overflow", overflow) -DO_VM86_ERROR(5, SIGSEGV, "bounds", bounds) -DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip, 0) -DO_ERROR(9, SIGFPE, "coprocessor segment 
overrun", coprocessor_segment_overrun) -DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS) -DO_ERROR(11, SIGBUS, "segment not present", segment_not_present) -DO_ERROR(12, SIGBUS, "stack segment", stack_segment) -DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0, 0) -DO_ERROR_INFO(32, SIGILL, "iret exception", iret_error, ILL_BADSTK, 0, 1) +DO_VM86_TRAP(4, SIGSEGV, "overflow", overflow) +DO_VM86_TRAP(5, SIGSEGV, "bounds", bounds) +DO_TRAP_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip, 0) +DO_TRAP(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun) +DO_TRAP(10, SIGSEGV, "invalid TSS", invalid_TSS) +DO_TRAP(11, SIGBUS, "segment not present", segment_not_present) +DO_TRAP(12, SIGBUS, "stack segment", stack_segment) +DO_TRAP_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0, 0) +DO_TRAP_INFO(32, SIGILL, "iret exception", iret_error, ILL_BADSTK, 0, 1) void __kprobes do_general_protection(struct pt_regs *regs, long error_code) -- cgit v1.2.3 From 976382dcbe3a2233f48cda5b0840874f43a30825 Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Tue, 9 Sep 2008 21:55:57 +0200 Subject: i386: convert hardware exception 0 to an interrupt gate Handle divide error exception with interrupt initially off. Signed-off-by: Alexander van Heukelum Signed-off-by: Ingo Molnar --- arch/x86/kernel/traps_32.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c index fe6dd972068..c18824b9d1a 100644 --- a/arch/x86/kernel/traps_32.c +++ b/arch/x86/kernel/traps_32.c @@ -632,7 +632,7 @@ void do_##name(struct pt_regs *regs, long error_code) \ do_trap(trapnr, signr, str, 1, regs, error_code, &info); \ } -DO_VM86_TRAP_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip) +DO_VM86_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip) #ifndef CONFIG_KPROBES DO_VM86_TRAP(3, SIGTRAP, "int3", int3) #endif @@ -1247,7 +1247,7 @@ void __init trap_init(void) early_iounmap(p, 4); #endif - set_trap_gate(0, &divide_error); + set_intr_gate(0, &divide_error); set_intr_gate(1, &debug); set_intr_gate(2, &nmi); set_system_intr_gate(3, &int3); /* int3 can be called from all */ -- cgit v1.2.3 From b94da1e4b7d396abe10ce4d566f1a5f623602c21 Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Tue, 9 Sep 2008 21:55:58 +0200 Subject: i386: expand exception 3 DO_TRAP macro The int3 exception was already taken as an interrupt and do_int3 does not fit in the new DO_ERROR macro. This patch just expands the DO_TRAP macro and rearranges the code a bit. No functional changes intended.
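For reference, the body being folded in is the expansion of DO_VM86_TRAP(3, SIGTRAP, "int3", int3) from the previous patches, roughly:

	void do_int3(struct pt_regs *regs, long error_code)
	{
		if (notify_die(DIE_TRAP, "int3", regs, error_code, 3, SIGTRAP)
				== NOTIFY_STOP)
			return;
		do_trap(3, SIGTRAP, "int3", 1, regs, error_code, NULL);
	}

which becomes the !CONFIG_KPROBES branch of the rearranged do_int3() below.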
Signed-off-by: Alexander van Heukelum Signed-off-by: Ingo Molnar --- arch/x86/kernel/traps_32.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c index c18824b9d1a..e3c4809f414 100644 --- a/arch/x86/kernel/traps_32.c +++ b/arch/x86/kernel/traps_32.c @@ -633,9 +633,6 @@ void do_##name(struct pt_regs *regs, long error_code) \ } DO_VM86_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip) -#ifndef CONFIG_KPROBES -DO_VM86_TRAP(3, SIGTRAP, "int3", int3) -#endif DO_VM86_TRAP(4, SIGSEGV, "overflow", overflow) DO_VM86_TRAP(5, SIGSEGV, "bounds", bounds) DO_TRAP_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip, 0) @@ -907,9 +904,9 @@ void restart_nmi(void) acpi_nmi_enable(); } -#ifdef CONFIG_KPROBES void __kprobes do_int3(struct pt_regs *regs, long error_code) { +#ifdef CONFIG_KPROBES trace_hardirqs_fixup(); if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) @@ -920,10 +917,14 @@ void __kprobes do_int3(struct pt_regs *regs, long error_code) * disabled. Normal trap handlers don't. */ conditional_sti(regs); +#else + if (notify_die(DIE_TRAP, "int3", regs, error_code, 3, SIGTRAP) + == NOTIFY_STOP) + return; +#endif do_trap(3, SIGTRAP, "int3", 1, regs, error_code, NULL); } -#endif /* * Our handling of the processor debug registers is non-trivial. -- cgit v1.2.3 From 8d6f9d69bdc1dcf75a3ba436c9b2efb20478619b Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Tue, 9 Sep 2008 21:55:59 +0200 Subject: i386: convert hardware exception 4 to an interrupt gate Handle overflow exception with interrupt initially off. Signed-off-by: Alexander van Heukelum Signed-off-by: Ingo Molnar --- arch/x86/kernel/traps_32.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c index e3c4809f414..6f685e61325 100644 --- a/arch/x86/kernel/traps_32.c +++ b/arch/x86/kernel/traps_32.c @@ -633,7 +633,7 @@ void do_##name(struct pt_regs *regs, long error_code) \ } DO_VM86_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip) -DO_VM86_TRAP(4, SIGSEGV, "overflow", overflow) +DO_VM86_ERROR(4, SIGSEGV, "overflow", overflow) DO_VM86_TRAP(5, SIGSEGV, "bounds", bounds) DO_TRAP_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip, 0) DO_TRAP(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun) @@ -1252,7 +1252,7 @@ void __init trap_init(void) set_intr_gate(1, &debug); set_intr_gate(2, &nmi); set_system_intr_gate(3, &int3); /* int3 can be called from all */ - set_system_gate(4, &overflow); /* int4 can be called from all */ + set_system_intr_gate(4, &overflow); /* int4 can be called from all */ set_trap_gate(5, &bounds); set_trap_gate(6, &invalid_op); set_trap_gate(7, &device_not_available); -- cgit v1.2.3 From 64f644c0b4b8b7818867fb27de077c98c4f0022b Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Tue, 9 Sep 2008 21:56:00 +0200 Subject: i386: convert hardware exception 5 to an interrupt gate Handle bounds exception with interrupt initially off. 
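The gate setters used throughout these conversions differ only in descriptor type and privilege level; as a summary sketch (semantics only, not the actual implementation):

        /*
         * set_intr_gate(n, h)        - interrupt gate (type 0xE), DPL 0:
         *                              the CPU clears EFLAGS.IF on entry.
         * set_trap_gate(n, h)        - trap gate (type 0xF), DPL 0:
         *                              EFLAGS.IF is left unchanged.
         * set_system_intr_gate(n, h) - interrupt gate with DPL 3, so the
         *                              vector is reachable via "int $n"
         *                              from user mode (used for int3/int4).
         */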
Signed-off-by: Alexander van Heukelum Signed-off-by: Ingo Molnar --- arch/x86/kernel/traps_32.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c index 6f685e61325..06149083763 100644 --- a/arch/x86/kernel/traps_32.c +++ b/arch/x86/kernel/traps_32.c @@ -634,7 +634,7 @@ void do_##name(struct pt_regs *regs, long error_code) \ DO_VM86_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip) DO_VM86_ERROR(4, SIGSEGV, "overflow", overflow) -DO_VM86_TRAP(5, SIGSEGV, "bounds", bounds) +DO_VM86_ERROR(5, SIGSEGV, "bounds", bounds) DO_TRAP_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip, 0) DO_TRAP(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun) DO_TRAP(10, SIGSEGV, "invalid TSS", invalid_TSS) @@ -1253,7 +1253,7 @@ void __init trap_init(void) set_intr_gate(2, &nmi); set_system_intr_gate(3, &int3); /* int3 can be called from all */ set_system_intr_gate(4, &overflow); /* int4 can be called from all */ - set_trap_gate(5, &bounds); + set_intr_gate(5, &bounds); set_trap_gate(6, &invalid_op); set_trap_gate(7, &device_not_available); set_task_gate(8, GDT_ENTRY_DOUBLEFAULT_TSS); -- cgit v1.2.3 From 12394cf567d509f6ab5d94544e1f414b35286d9e Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Tue, 9 Sep 2008 21:56:01 +0200 Subject: i386: convert hardware exception 6 to an interrupt gate Handle invalid opcode exception with interrupt initially off. Signed-off-by: Alexander van Heukelum Signed-off-by: Ingo Molnar --- arch/x86/kernel/traps_32.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c index 06149083763..39a6101a612 100644 --- a/arch/x86/kernel/traps_32.c +++ b/arch/x86/kernel/traps_32.c @@ -635,7 +635,7 @@ void do_##name(struct pt_regs *regs, long error_code) \ DO_VM86_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip) DO_VM86_ERROR(4, SIGSEGV, "overflow", overflow) DO_VM86_ERROR(5, SIGSEGV, "bounds", bounds) -DO_TRAP_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip, 0) +DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip, 0) DO_TRAP(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun) DO_TRAP(10, SIGSEGV, "invalid TSS", invalid_TSS) DO_TRAP(11, SIGBUS, "segment not present", segment_not_present) @@ -1254,7 +1254,7 @@ void __init trap_init(void) set_system_intr_gate(3, &int3); /* int3 can be called from all */ set_system_intr_gate(4, &overflow); /* int4 can be called from all */ set_intr_gate(5, &bounds); - set_trap_gate(6, &invalid_op); + set_intr_gate(6, &invalid_op); set_trap_gate(7, &device_not_available); set_task_gate(8, GDT_ENTRY_DOUBLEFAULT_TSS); set_trap_gate(9, &coprocessor_segment_overrun); -- cgit v1.2.3 From 7643e9b936b4af31ba4851eb7d5b3a3bfad52502 Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Tue, 9 Sep 2008 21:56:02 +0200 Subject: i386: convert hardware exception 7 to an interrupt gate Handle no coprocessor exception with interrupt initially off. device_not_available in entry_32.S calls either math_state_restore or math_emulate. 
This patch adds an extra indirection to be able to re-enable interrupts explicitly in traps_32.c Signed-off-by: Alexander van Heukelum Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_32.S | 15 ++------------- arch/x86/kernel/traps_32.c | 14 +++++++++++++- 2 files changed, 15 insertions(+), 14 deletions(-) diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 109792bc7cf..5a885855fc8 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -760,20 +760,9 @@ ENTRY(device_not_available) RING0_INT_FRAME pushl $-1 # mark this as an int CFI_ADJUST_CFA_OFFSET 4 - SAVE_ALL - GET_CR0_INTO_EAX - testl $0x4, %eax # EM (math emulation bit) - jne device_not_available_emulate - preempt_stop(CLBR_ANY) - call math_state_restore - jmp ret_from_exception -device_not_available_emulate: - pushl $0 # temporary storage for ORIG_EIP + pushl $do_device_not_available CFI_ADJUST_CFA_OFFSET 4 - call math_emulate - addl $4, %esp - CFI_ADJUST_CFA_OFFSET -4 - jmp ret_from_exception + jmp error_code CFI_ENDPROC END(device_not_available) diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c index 39a6101a612..5b15f75d059 100644 --- a/arch/x86/kernel/traps_32.c +++ b/arch/x86/kernel/traps_32.c @@ -46,6 +46,7 @@ #include #endif +#include #include #include #include @@ -1236,6 +1237,17 @@ asmlinkage void math_emulate(long arg) #endif /* CONFIG_MATH_EMULATION */ +void __kprobes do_device_not_available(struct pt_regs *regs, long error) +{ + if (read_cr0() & X86_CR0_EM) { + conditional_sti(regs); + math_emulate(0); + } else { + math_state_restore(); /* interrupts still off */ + conditional_sti(regs); + } +} + void __init trap_init(void) { int i; @@ -1255,7 +1267,7 @@ void __init trap_init(void) set_system_intr_gate(4, &overflow); /* int4 can be called from all */ set_intr_gate(5, &bounds); set_intr_gate(6, &invalid_op); - set_trap_gate(7, &device_not_available); + set_intr_gate(7, &device_not_available); set_task_gate(8, GDT_ENTRY_DOUBLEFAULT_TSS); set_trap_gate(9, &coprocessor_segment_overrun); set_trap_gate(10, &invalid_TSS); -- cgit v1.2.3 From 51bc1ed6060e96de5099f604071961540880efc2 Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Tue, 9 Sep 2008 21:56:03 +0200 Subject: i386: convert hardware exception 9 to an interrupt gate Handle coprocessor segment overrun exception with interrupt initially off. 
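With conditional_sti() in place, DO_ERROR(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun) generates roughly the following handler (expanded by hand as a sketch):

        void do_coprocessor_segment_overrun(struct pt_regs *regs, long error_code)
        {
                if (notify_die(DIE_TRAP, "coprocessor segment overrun", regs,
                               error_code, 9, SIGFPE) == NOTIFY_STOP)
                        return;
                conditional_sti(regs);
                do_trap(9, SIGFPE, "coprocessor segment overrun", 0, regs,
                        error_code, NULL);
        }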
Signed-off-by: Alexander van Heukelum Signed-off-by: Ingo Molnar --- arch/x86/kernel/traps_32.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c index 5b15f75d059..d26f32308db 100644 --- a/arch/x86/kernel/traps_32.c +++ b/arch/x86/kernel/traps_32.c @@ -637,7 +637,7 @@ DO_VM86_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip DO_VM86_ERROR(4, SIGSEGV, "overflow", overflow) DO_VM86_ERROR(5, SIGSEGV, "bounds", bounds) DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip, 0) -DO_TRAP(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun) +DO_ERROR(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun) DO_TRAP(10, SIGSEGV, "invalid TSS", invalid_TSS) DO_TRAP(11, SIGBUS, "segment not present", segment_not_present) DO_TRAP(12, SIGBUS, "stack segment", stack_segment) @@ -1269,7 +1269,7 @@ void __init trap_init(void) set_intr_gate(6, &invalid_op); set_intr_gate(7, &device_not_available); set_task_gate(8, GDT_ENTRY_DOUBLEFAULT_TSS); - set_trap_gate(9, &coprocessor_segment_overrun); + set_intr_gate(9, &coprocessor_segment_overrun); set_trap_gate(10, &invalid_TSS); set_trap_gate(11, &segment_not_present); set_trap_gate(12, &stack_segment); -- cgit v1.2.3 From 6bf77bf93938ae6baad9945a0ba57b3225e8e58b Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Tue, 9 Sep 2008 21:56:04 +0200 Subject: i386: convert hardware exception 10 to an interrupt gate Handle invalid TSS exception with interrupt initially off. Signed-off-by: Alexander van Heukelum Signed-off-by: Ingo Molnar --- arch/x86/kernel/traps_32.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c index d26f32308db..069075e19ea 100644 --- a/arch/x86/kernel/traps_32.c +++ b/arch/x86/kernel/traps_32.c @@ -638,7 +638,7 @@ DO_VM86_ERROR(4, SIGSEGV, "overflow", overflow) DO_VM86_ERROR(5, SIGSEGV, "bounds", bounds) DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip, 0) DO_ERROR(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun) -DO_TRAP(10, SIGSEGV, "invalid TSS", invalid_TSS) +DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS) DO_TRAP(11, SIGBUS, "segment not present", segment_not_present) DO_TRAP(12, SIGBUS, "stack segment", stack_segment) DO_TRAP_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0, 0) @@ -1270,7 +1270,7 @@ void __init trap_init(void) set_intr_gate(7, &device_not_available); set_task_gate(8, GDT_ENTRY_DOUBLEFAULT_TSS); set_intr_gate(9, &coprocessor_segment_overrun); - set_trap_gate(10, &invalid_TSS); + set_intr_gate(10, &invalid_TSS); set_trap_gate(11, &segment_not_present); set_trap_gate(12, &stack_segment); set_trap_gate(13, &general_protection); -- cgit v1.2.3 From 36d936c7988ac5caf575bc0e2dc618dbfebc6065 Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Tue, 9 Sep 2008 21:56:05 +0200 Subject: i386: convert hardware exception 11 to an interrupt gate Handle segment not present exception with interrupt initially off. 
Signed-off-by: Alexander van Heukelum Signed-off-by: Ingo Molnar --- arch/x86/kernel/traps_32.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c index 069075e19ea..6910bbe94f6 100644 --- a/arch/x86/kernel/traps_32.c +++ b/arch/x86/kernel/traps_32.c @@ -639,7 +639,7 @@ DO_VM86_ERROR(5, SIGSEGV, "bounds", bounds) DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip, 0) DO_ERROR(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun) DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS) -DO_TRAP(11, SIGBUS, "segment not present", segment_not_present) +DO_ERROR(11, SIGBUS, "segment not present", segment_not_present) DO_TRAP(12, SIGBUS, "stack segment", stack_segment) DO_TRAP_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0, 0) DO_TRAP_INFO(32, SIGILL, "iret exception", iret_error, ILL_BADSTK, 0, 1) @@ -1271,7 +1271,7 @@ void __init trap_init(void) set_task_gate(8, GDT_ENTRY_DOUBLEFAULT_TSS); set_intr_gate(9, &coprocessor_segment_overrun); set_intr_gate(10, &invalid_TSS); - set_trap_gate(11, &segment_not_present); + set_intr_gate(11, &segment_not_present); set_trap_gate(12, &stack_segment); set_trap_gate(13, &general_protection); set_intr_gate(14, &page_fault); -- cgit v1.2.3 From f5ca81878b42ae7d1b00d0ed5f62bb1a158bfac1 Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Tue, 9 Sep 2008 21:56:06 +0200 Subject: i386: convert hardware exception 12 to an interrupt gate Handle stack segment exception with interrupt initially off. Signed-off-by: Alexander van Heukelum Signed-off-by: Ingo Molnar --- arch/x86/kernel/traps_32.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c index 6910bbe94f6..16c056ba14e 100644 --- a/arch/x86/kernel/traps_32.c +++ b/arch/x86/kernel/traps_32.c @@ -640,7 +640,7 @@ DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip, 0) DO_ERROR(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun) DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS) DO_ERROR(11, SIGBUS, "segment not present", segment_not_present) -DO_TRAP(12, SIGBUS, "stack segment", stack_segment) +DO_ERROR(12, SIGBUS, "stack segment", stack_segment) DO_TRAP_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0, 0) DO_TRAP_INFO(32, SIGILL, "iret exception", iret_error, ILL_BADSTK, 0, 1) @@ -1272,7 +1272,7 @@ void __init trap_init(void) set_intr_gate(9, &coprocessor_segment_overrun); set_intr_gate(10, &invalid_TSS); set_intr_gate(11, &segment_not_present); - set_trap_gate(12, &stack_segment); + set_intr_gate(12, &stack_segment); set_trap_gate(13, &general_protection); set_intr_gate(14, &page_fault); set_trap_gate(15, &spurious_interrupt_bug); -- cgit v1.2.3 From c6df0d71bec391e78e0a38109d63154acd69a937 Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Tue, 9 Sep 2008 21:56:07 +0200 Subject: i386: convert hardware exception 13 to an interrupt gate Handle general protection exception with interrupt initially off. 
Signed-off-by: Alexander van Heukelum Signed-off-by: Ingo Molnar --- arch/x86/kernel/traps_32.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c index 16c056ba14e..e2598505ef5 100644 --- a/arch/x86/kernel/traps_32.c +++ b/arch/x86/kernel/traps_32.c @@ -652,6 +652,8 @@ do_general_protection(struct pt_regs *regs, long error_code) struct tss_struct *tss; int cpu; + conditional_sti(regs); + cpu = get_cpu(); tss = &per_cpu(init_tss, cpu); thread = &current->thread; @@ -1273,7 +1273,7 @@ void __init trap_init(void) set_intr_gate(10, &invalid_TSS); set_intr_gate(11, &segment_not_present); set_intr_gate(12, &stack_segment); - set_trap_gate(13, &general_protection); + set_intr_gate(13, &general_protection); set_intr_gate(14, &page_fault); set_trap_gate(15, &spurious_interrupt_bug); set_trap_gate(16, &coprocessor_error); -- cgit v1.2.3 From cf81978d5fb32ab75f701690b372e1126b41861f Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Tue, 9 Sep 2008 21:56:08 +0200 Subject: i386: convert hardware exception 15 to an interrupt gate Handle exception 15 with interrupt initially off. Signed-off-by: Alexander van Heukelum Signed-off-by: Ingo Molnar --- arch/x86/kernel/traps_32.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c index e2598505ef5..0039856b634 100644 --- a/arch/x86/kernel/traps_32.c +++ b/arch/x86/kernel/traps_32.c @@ -1164,6 +1164,7 @@ void do_simd_coprocessor_error(struct pt_regs *regs, long error_code) void do_spurious_interrupt_bug(struct pt_regs *regs, long error_code) { + conditional_sti(regs); #if 0 /* No need to warn about this any longer. */ printk(KERN_INFO "Ignoring P6 Local APIC Spurious Interrupt Bug...\n"); @@ -1277,7 +1277,7 @@ void __init trap_init(void) set_intr_gate(12, &stack_segment); set_intr_gate(13, &general_protection); set_intr_gate(14, &page_fault); - set_trap_gate(15, &spurious_interrupt_bug); + set_intr_gate(15, &spurious_interrupt_bug); set_trap_gate(16, &coprocessor_error); set_trap_gate(17, &alignment_check); #ifdef CONFIG_X86_MCE -- cgit v1.2.3 From 252d28fe65adc2e51bc71358eae43ba94b74da91 Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Tue, 9 Sep 2008 21:56:09 +0200 Subject: i386: convert hardware exception 16 to an interrupt gate Handle coprocessor error exception with interrupt initially off.
Signed-off-by: Alexander van Heukelum Signed-off-by: Ingo Molnar --- arch/x86/kernel/traps_32.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c index 0039856b634..5ab4c3fc7c1 100644 --- a/arch/x86/kernel/traps_32.c +++ b/arch/x86/kernel/traps_32.c @@ -1088,6 +1088,7 @@ void math_error(void __user *ip) void do_coprocessor_error(struct pt_regs *regs, long error_code) { + conditional_sti(regs); ignore_fpu_irq = 1; math_error((void __user *)regs->ip); } @@ -1279,7 +1280,7 @@ void __init trap_init(void) set_intr_gate(13, &general_protection); set_intr_gate(14, &page_fault); set_intr_gate(15, &spurious_interrupt_bug); - set_trap_gate(16, &coprocessor_error); + set_intr_gate(16, &coprocessor_error); set_trap_gate(17, &alignment_check); #ifdef CONFIG_X86_MCE set_trap_gate(18, &machine_check); -- cgit v1.2.3 From 5feedfd401c91e41bbe31ddec93cbe809dc98309 Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Tue, 9 Sep 2008 21:56:10 +0200 Subject: i386: convert hardware exception 17 to an interrupt gate Handle alignment check exception with interrupt initially off. Signed-off-by: Alexander van Heukelum Signed-off-by: Ingo Molnar --- arch/x86/kernel/traps_32.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c index 5ab4c3fc7c1..86c808b4daf 100644 --- a/arch/x86/kernel/traps_32.c +++ b/arch/x86/kernel/traps_32.c @@ -641,7 +641,7 @@ DO_ERROR(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun) DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS) DO_ERROR(11, SIGBUS, "segment not present", segment_not_present) DO_ERROR(12, SIGBUS, "stack segment", stack_segment) -DO_TRAP_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0, 0) +DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0, 0) DO_TRAP_INFO(32, SIGILL, "iret exception", iret_error, ILL_BADSTK, 0, 1) void __kprobes @@ -1281,7 +1281,7 @@ void __init trap_init(void) set_intr_gate(14, &page_fault); set_intr_gate(15, &spurious_interrupt_bug); set_intr_gate(16, &coprocessor_error); - set_trap_gate(17, &alignment_check); + set_intr_gate(17, &alignment_check); #ifdef CONFIG_X86_MCE set_trap_gate(18, &machine_check); #endif -- cgit v1.2.3 From eb642f62082348c33ead53f736a9698953aa517d Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Tue, 9 Sep 2008 21:56:11 +0200 Subject: i386: convert hardware exception 18 to an interrupt gate Handle machine check exception with interrupt initially off. 
Signed-off-by: Alexander van Heukelum Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_32.S | 2 +- arch/x86/kernel/traps_32.c | 11 ++++++++++- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 5a885855fc8..c5fe01bca6c 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -1019,7 +1019,7 @@ ENTRY(machine_check) RING0_INT_FRAME pushl $0 CFI_ADJUST_CFA_OFFSET 4 - pushl machine_check_vector + pushl $do_machine_check CFI_ADJUST_CFA_OFFSET 4 jmp error_code CFI_ENDPROC diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c index 86c808b4daf..54b89f497ba 100644 --- a/arch/x86/kernel/traps_32.c +++ b/arch/x86/kernel/traps_32.c @@ -62,6 +62,7 @@ #include #include "mach_traps.h" +#include "cpu/mcheck/mce.h" DECLARE_BITMAP(used_vectors, NR_VECTORS); EXPORT_SYMBOL_GPL(used_vectors); @@ -1252,6 +1253,14 @@ void __kprobes do_device_not_available(struct pt_regs *regs, long error) } } +#ifdef CONFIG_X86_MCE +void __kprobes do_machine_check(struct pt_regs *regs, long error) +{ + conditional_sti(regs); + machine_check_vector(regs, error); +} +#endif + void __init trap_init(void) { int i; @@ -1283,7 +1292,7 @@ void __init trap_init(void) set_intr_gate(16, &coprocessor_error); set_intr_gate(17, &alignment_check); #ifdef CONFIG_X86_MCE - set_trap_gate(18, &machine_check); + set_intr_gate(18, &machine_check); #endif set_trap_gate(19, &simd_coprocessor_error); -- cgit v1.2.3 From b939bde2788a7f16741e563ee90f6f3ad38935cf Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Tue, 9 Sep 2008 21:56:12 +0200 Subject: i386: convert hardware exception 19 to an interrupt gate Handle SIMD coprocessor exception with interrupt initially off. Signed-off-by: Alexander van Heukelum Signed-off-by: Ingo Molnar --- arch/x86/kernel/traps_32.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c index 54b89f497ba..1bd7960b135 100644 --- a/arch/x86/kernel/traps_32.c +++ b/arch/x86/kernel/traps_32.c @@ -1144,6 +1144,8 @@ static void simd_math_error(void __user *ip) void do_simd_coprocessor_error(struct pt_regs *regs, long error_code) { + conditional_sti(regs); + if (cpu_has_xmm) { /* Handle SIMD FPU exceptions on PIII+ processors. */ ignore_fpu_irq = 1; @@ -1294,7 +1296,7 @@ void __init trap_init(void) #ifdef CONFIG_X86_MCE set_intr_gate(18, &machine_check); #endif - set_trap_gate(19, &simd_coprocessor_error); + set_intr_gate(19, &simd_coprocessor_error); if (cpu_has_fxsr) { printk(KERN_INFO "Enabling fast FPU save and restore... "); -- cgit v1.2.3 From f8e0870f589cfa45575dc7333cdf1b8f769e7900 Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Tue, 9 Sep 2008 21:56:13 +0200 Subject: i386: remove temporary DO_TRAP macros, expanding the last one used Only one use of the DO_TRAP macros remains. Expand that one and remove the macros now. 
Signed-off-by: Alexander van Heukelum Signed-off-by: Ingo Molnar --- arch/x86/kernel/traps_32.c | 66 +++++++++++----------------------------------- 1 file changed, 15 insertions(+), 51 deletions(-) diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c index 1bd7960b135..8fa8485b49c 100644 --- a/arch/x86/kernel/traps_32.c +++ b/arch/x86/kernel/traps_32.c @@ -530,56 +530,6 @@ vm86_trap: return; } -#define DO_TRAP(trapnr, signr, str, name) \ -void do_##name(struct pt_regs *regs, long error_code) \ -{ \ - trace_hardirqs_fixup(); \ - if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ - == NOTIFY_STOP) \ - return; \ - do_trap(trapnr, signr, str, 0, regs, error_code, NULL); \ -} - -#define DO_TRAP_INFO(trapnr, signr, str, name, sicode, siaddr, irq) \ -void do_##name(struct pt_regs *regs, long error_code) \ -{ \ - siginfo_t info; \ - if (irq) \ - local_irq_enable(); \ - info.si_signo = signr; \ - info.si_errno = 0; \ - info.si_code = sicode; \ - info.si_addr = (void __user *)siaddr; \ - if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ - == NOTIFY_STOP) \ - return; \ - do_trap(trapnr, signr, str, 0, regs, error_code, &info); \ -} - -#define DO_VM86_TRAP(trapnr, signr, str, name) \ -void do_##name(struct pt_regs *regs, long error_code) \ -{ \ - if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ - == NOTIFY_STOP) \ - return; \ - do_trap(trapnr, signr, str, 1, regs, error_code, NULL); \ -} - -#define DO_VM86_TRAP_INFO(trapnr, signr, str, name, sicode, siaddr) \ -void do_##name(struct pt_regs *regs, long error_code) \ -{ \ - siginfo_t info; \ - info.si_signo = signr; \ - info.si_errno = 0; \ - info.si_code = sicode; \ - info.si_addr = (void __user *)siaddr; \ - trace_hardirqs_fixup(); \ - if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ - == NOTIFY_STOP) \ - return; \ - do_trap(trapnr, signr, str, 1, regs, error_code, &info); \ -} - #define DO_ERROR(trapnr, signr, str, name) \ void do_##name(struct pt_regs *regs, long error_code) \ { \ @@ -643,7 +593,6 @@ DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS) DO_ERROR(11, SIGBUS, "segment not present", segment_not_present) DO_ERROR(12, SIGBUS, "stack segment", stack_segment) DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0, 0) -DO_TRAP_INFO(32, SIGILL, "iret exception", iret_error, ILL_BADSTK, 0, 1) void __kprobes do_general_protection(struct pt_regs *regs, long error_code) @@ -1263,6 +1212,21 @@ void __kprobes do_machine_check(struct pt_regs *regs, long error) } #endif +void do_iret_error(struct pt_regs *regs, long error_code) +{ + siginfo_t info; + local_irq_enable(); + + info.si_signo = SIGILL; + info.si_errno = 0; + info.si_code = ILL_BADSTK; + info.si_addr = 0; + if (notify_die(DIE_TRAP, "iret exception", + regs, error_code, 32, SIGILL) == NOTIFY_STOP) + return; + do_trap(32, SIGILL, "iret exception", 0, regs, error_code, &info); +} + void __init trap_init(void) { int i; -- cgit v1.2.3 From 85cea51d7e7b8d3408c8e933d88fa067309395fa Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Tue, 9 Sep 2008 21:56:14 +0200 Subject: i386: add TRACE_IRQS_OFF to entry_32.S in 'error_code' Many exceptions use the same code path via the label 'error_code' in entry_32.S. At this point interrupts are off, so let's inform the tracing code of that fact before calling into C. 
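TRACE_IRQS_OFF only records what the hardware has already done; a rough C rendering of the asm macro (a sketch, using a hypothetical TRACE_IRQS_OFF_C name):

        #ifdef CONFIG_TRACE_IRQFLAGS
        # define TRACE_IRQS_OFF_C()     trace_hardirqs_off()
        #else
        # define TRACE_IRQS_OFF_C()     do { } while (0)
        #endif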
Signed-off-by: Alexander van Heukelum Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_32.S | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index c5fe01bca6c..49438e58291 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -730,6 +730,7 @@ error_code: movl $(__USER_DS), %ecx movl %ecx, %ds movl %ecx, %es + TRACE_IRQS_OFF movl %esp,%eax # pt_regs pointer call *%edi jmp ret_from_exception -- cgit v1.2.3 From 43024a8a5d4c63952687286f3083f7f34d4da2cc Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Tue, 9 Sep 2008 21:56:15 +0200 Subject: i386: add TRACE_IRQS_OFF for exception 1 (debug) At this point interrupts are off, so let's inform the tracing code of that fact before calling into C. Signed-off-by: Alexander van Heukelum Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_32.S | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 49438e58291..a278505baa1 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -804,6 +804,7 @@ debug_stack_correct: pushl $-1 # mark this as an int CFI_ADJUST_CFA_OFFSET 4 SAVE_ALL + TRACE_IRQS_OFF xorl %edx,%edx # error code 0 movl %esp,%eax # pt_regs pointer call do_debug -- cgit v1.2.3 From e0c7317557c8fc8eacf611e30c2a80f4e24e47a3 Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Tue, 9 Sep 2008 21:56:16 +0200 Subject: i386: add TRACE_IRQS_OFF for the nmi At this point interrupts are off, so let's inform the tracing code of that fact before calling into C. Signed-off-by: Alexander van Heukelum Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_32.S | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index a278505baa1..2abdc9a9f7b 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -849,6 +849,7 @@ nmi_stack_correct: pushl %eax CFI_ADJUST_CFA_OFFSET 4 SAVE_ALL + TRACE_IRQS_OFF xorl %edx,%edx # zero error code movl %esp,%eax # pt_regs pointer call do_nmi @@ -889,6 +890,7 @@ nmi_espfix_stack: pushl %eax CFI_ADJUST_CFA_OFFSET 4 SAVE_ALL + TRACE_IRQS_OFF FIXUP_ESPFIX_STACK # %eax == %esp xorl %edx,%edx # zero error code call do_nmi -- cgit v1.2.3 From a790392faa3a6138b6e90d0fe320a2829652ce22 Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Tue, 9 Sep 2008 21:56:17 +0200 Subject: i386: add TRACE_IRQS_OFF for the exception 3 (int3) At this point interrupts are off, so let's inform the tracing code of that fact before calling into C. Signed-off-by: Alexander van Heukelum Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_32.S | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 2abdc9a9f7b..b21fbfaffe3 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -921,6 +921,7 @@ KPROBE_ENTRY(int3) pushl $-1 # mark this as an int CFI_ADJUST_CFA_OFFSET 4 SAVE_ALL + TRACE_IRQS_OFF xorl %edx,%edx # zero error code movl %esp,%eax # pt_regs pointer call do_int3 -- cgit v1.2.3 From 07bb2f6236f11169fbd8a8916b16715b25fea9b6 Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Tue, 9 Sep 2008 21:56:18 +0200 Subject: i386: trace_hardirqs_fixup should now not be necessary: irqs are off. The exception handlers in entry_32.S should now all call TRACE_IRQS_OFF before calling the C code. The calls to trace_hardirqs_fixup should now be unnecessary. Remove them. 
Signed-off-by: Alexander van Heukelum Signed-off-by: Ingo Molnar --- arch/x86/kernel/traps_32.c | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c index 8fa8485b49c..ab559365089 100644 --- a/arch/x86/kernel/traps_32.c +++ b/arch/x86/kernel/traps_32.c @@ -533,7 +533,6 @@ vm86_trap: #define DO_ERROR(trapnr, signr, str, name) \ void do_##name(struct pt_regs *regs, long error_code) \ { \ - trace_hardirqs_fixup(); \ if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ == NOTIFY_STOP) \ return; \ @@ -576,7 +575,6 @@ void do_##name(struct pt_regs *regs, long error_code) \ info.si_errno = 0; \ info.si_code = sicode; \ info.si_addr = (void __user *)siaddr; \ - trace_hardirqs_fixup(); \ if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ == NOTIFY_STOP) \ return; \ @@ -860,15 +858,9 @@ void restart_nmi(void) void __kprobes do_int3(struct pt_regs *regs, long error_code) { #ifdef CONFIG_KPROBES - trace_hardirqs_fixup(); - if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) == NOTIFY_STOP) return; - /* - * This is an interrupt gate, because kprobes wants interrupts - * disabled. Normal trap handlers don't. - */ conditional_sti(regs); #else if (notify_die(DIE_TRAP, "int3", regs, error_code, 3, SIGTRAP) @@ -907,8 +899,6 @@ void __kprobes do_debug(struct pt_regs *regs, long error_code) unsigned int condition; int si_code; - trace_hardirqs_fixup(); - get_debugreg(condition, 6); /* -- cgit v1.2.3 From be43d72835ba610e4af274f2d123b26f66f4f7ed Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Sun, 7 Sep 2008 15:21:13 -0700 Subject: x86: add _PAGE_IOMAP pte flag for IO mappings Use one of the software-defined PTE bits to indicate that a mapping is intended for an IO address. On native hardware this is irrelevant, since a physical address is a physical address. But in a virtual environment, physical addresses are also virtualized, so there needs to be some way to distinguish between pseudo-physical addresses and actual hardware addresses; _PAGE_IOMAP indicates this intent. By default, __supported_pte_mask masks out _PAGE_IOMAP, so it doesn't even appear in the final pagetable.
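A paravirtualized backend can then tell IO mappings apart from RAM mappings with a simple flags test; a hypothetical helper (not part of this patch) might look like:

        /* Sketch: nonzero if the pte was created for a hardware IO address. */
        static inline int pte_is_iomap(pte_t pte)
        {
                return !!(pte_flags(pte) & _PAGE_IOMAP);
        }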
Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/mm/init_32.c | 2 +- arch/x86/mm/init_64.c | 2 +- arch/x86/mm/ioremap.c | 8 ++++---- include/asm-x86/pgtable.h | 14 ++++++++++++-- 4 files changed, 18 insertions(+), 8 deletions(-) diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index bbe044dbe01..8396868e82c 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -558,7 +558,7 @@ void zap_low_mappings(void) int nx_enabled; -pteval_t __supported_pte_mask __read_mostly = ~(_PAGE_NX | _PAGE_GLOBAL); +pteval_t __supported_pte_mask __read_mostly = ~(_PAGE_NX | _PAGE_GLOBAL | _PAGE_IOMAP); EXPORT_SYMBOL_GPL(__supported_pte_mask); #ifdef CONFIG_X86_PAE diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 3e10054c573..dec5c775e92 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -89,7 +89,7 @@ early_param("gbpages", parse_direct_gbpages_on); int after_bootmem; -unsigned long __supported_pte_mask __read_mostly = ~0UL; +pteval_t __supported_pte_mask __read_mostly = ~_PAGE_IOMAP; EXPORT_SYMBOL_GPL(__supported_pte_mask); static int do_not_nx __cpuinitdata; diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 8cbeda15cd2..43c3b6896cd 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -242,16 +242,16 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, switch (prot_val) { case _PAGE_CACHE_UC: default: - prot = PAGE_KERNEL_NOCACHE; + prot = PAGE_KERNEL_IO_NOCACHE; break; case _PAGE_CACHE_UC_MINUS: - prot = PAGE_KERNEL_UC_MINUS; + prot = PAGE_KERNEL_IO_UC_MINUS; break; case _PAGE_CACHE_WC: - prot = PAGE_KERNEL_WC; + prot = PAGE_KERNEL_IO_WC; break; case _PAGE_CACHE_WB: - prot = PAGE_KERNEL; + prot = PAGE_KERNEL_IO; break; } diff --git a/include/asm-x86/pgtable.h b/include/asm-x86/pgtable.h index ed932453ef2..81805403b64 100644 --- a/include/asm-x86/pgtable.h +++ b/include/asm-x86/pgtable.h @@ -15,7 +15,7 @@ #define _PAGE_BIT_PAT 7 /* on 4KB pages */ #define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */ #define _PAGE_BIT_UNUSED1 9 /* available for programmer */ -#define _PAGE_BIT_UNUSED2 10 +#define _PAGE_BIT_IOMAP 10 /* flag used to indicate IO mapping */ #define _PAGE_BIT_UNUSED3 11 #define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */ #define _PAGE_BIT_SPECIAL _PAGE_BIT_UNUSED1 @@ -32,7 +32,7 @@ #define _PAGE_PSE (_AT(pteval_t, 1) << _PAGE_BIT_PSE) #define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL) #define _PAGE_UNUSED1 (_AT(pteval_t, 1) << _PAGE_BIT_UNUSED1) -#define _PAGE_UNUSED2 (_AT(pteval_t, 1) << _PAGE_BIT_UNUSED2) +#define _PAGE_IOMAP (_AT(pteval_t, 1) << _PAGE_BIT_IOMAP) #define _PAGE_UNUSED3 (_AT(pteval_t, 1) << _PAGE_BIT_UNUSED3) #define _PAGE_PAT (_AT(pteval_t, 1) << _PAGE_BIT_PAT) #define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE) @@ -99,6 +99,11 @@ #define __PAGE_KERNEL_LARGE_NOCACHE (__PAGE_KERNEL | _PAGE_CACHE_UC | _PAGE_PSE) #define __PAGE_KERNEL_LARGE_EXEC (__PAGE_KERNEL_EXEC | _PAGE_PSE) +#define __PAGE_KERNEL_IO (__PAGE_KERNEL | _PAGE_IOMAP) +#define __PAGE_KERNEL_IO_NOCACHE (__PAGE_KERNEL_NOCACHE | _PAGE_IOMAP) +#define __PAGE_KERNEL_IO_UC_MINUS (__PAGE_KERNEL_UC_MINUS | _PAGE_IOMAP) +#define __PAGE_KERNEL_IO_WC (__PAGE_KERNEL_WC | _PAGE_IOMAP) + #define PAGE_KERNEL __pgprot(__PAGE_KERNEL) #define PAGE_KERNEL_RO __pgprot(__PAGE_KERNEL_RO) #define PAGE_KERNEL_EXEC __pgprot(__PAGE_KERNEL_EXEC) @@ -113,6 +118,11 @@ #define PAGE_KERNEL_VSYSCALL __pgprot(__PAGE_KERNEL_VSYSCALL) #define PAGE_KERNEL_VSYSCALL_NOCACHE 
__pgprot(__PAGE_KERNEL_VSYSCALL_NOCACHE) +#define PAGE_KERNEL_IO __pgprot(__PAGE_KERNEL_IO) +#define PAGE_KERNEL_IO_NOCACHE __pgprot(__PAGE_KERNEL_IO_NOCACHE) +#define PAGE_KERNEL_IO_UC_MINUS __pgprot(__PAGE_KERNEL_IO_UC_MINUS) +#define PAGE_KERNEL_IO_WC __pgprot(__PAGE_KERNEL_IO_WC) + /* xwr */ #define __P000 PAGE_NONE #define __P001 PAGE_READONLY -- cgit v1.2.3 From efc9eb20b2f5125642fc37a1dbabadc3ce5d321c Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Sun, 7 Sep 2008 15:21:14 -0700 Subject: x86: remove duplicate early_ioremap declarations early_ioremap() is redeclared in several places; remove them. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- include/asm-x86/io.h | 14 -------------- include/asm-x86/io_64.h | 3 --- 2 files changed, 17 deletions(-) diff --git a/include/asm-x86/io.h b/include/asm-x86/io.h index 72b7719523b..e091f3949ec 100644 --- a/include/asm-x86/io.h +++ b/include/asm-x86/io.h @@ -5,20 +5,6 @@ #include -/* - * early_ioremap() and early_iounmap() are for temporary early boot-time - * mappings, before the real ioremap() is functional. - * A boot-time mapping is currently limited to at most 16 pages. - */ -#ifndef __ASSEMBLY__ -extern void early_ioremap_init(void); -extern void early_ioremap_clear(void); -extern void early_ioremap_reset(void); -extern void *early_ioremap(unsigned long offset, unsigned long size); -extern void early_iounmap(void *addr, unsigned long size); -extern void __iomem *fix_ioremap(unsigned idx, unsigned long phys); -#endif - #define build_mmio_read(name, size, type, reg, barrier) \ static inline type name(const volatile void __iomem *addr) \ { type ret; asm volatile("mov" size " %1,%0":reg (ret) \ diff --git a/include/asm-x86/io_64.h b/include/asm-x86/io_64.h index 64429e9431a..ee6e086b7df 100644 --- a/include/asm-x86/io_64.h +++ b/include/asm-x86/io_64.h @@ -165,9 +165,6 @@ static inline void *phys_to_virt(unsigned long address) #include -extern void *early_ioremap(unsigned long addr, unsigned long size); -extern void early_iounmap(void *addr, unsigned long size); - /* * This one maps high address device memory and turns off caching for that area. * it's useful if some control registers are in such an area and write combining -- cgit v1.2.3 From 1494177942b23b7094ca291d37e6f6263fa60fdd Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Sun, 7 Sep 2008 15:21:15 -0700 Subject: x86: add early_memremap() early_ioremap() is also used to map normal memory when constructing the linear memory mapping. However, since we sometimes need to be able to distinguish between actual IO mappings and normal memory mappings, add an early_memremap() call, which maps with PAGE_KERNEL (as opposed to PAGE_KERNEL_IO for early_ioremap()), and use it when constructing pagetables.
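Usage mirrors early_ioremap(): map, use, unmap, but with plain RAM attributes. A sketch of the boot-time pattern (phys, len and buf are assumed inputs), which the setup.c conversion below follows:

        void *p = early_memremap(phys, len);    /* ordinary RAM, PAGE_KERNEL */
        memcpy(buf, p, len);
        early_iounmap(p, len);                  /* unmapping is common to both */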
Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/mm/init_64.c | 2 +- arch/x86/mm/ioremap.c | 22 +++++++++++++++++----- include/asm-x86/io.h | 1 + 3 files changed, 19 insertions(+), 6 deletions(-) diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index dec5c775e92..5beb8968345 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -313,7 +313,7 @@ static __ref void *alloc_low_page(unsigned long *phys) if (pfn >= table_top) panic("alloc_low_page: ran out of memory"); - adr = early_ioremap(pfn * PAGE_SIZE, PAGE_SIZE); + adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE); memset(adr, 0, PAGE_SIZE); *phys = pfn * PAGE_SIZE; return adr; diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 43c3b6896cd..7fb737c6b54 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -568,12 +568,12 @@ static void __init __early_set_fixmap(enum fixed_addresses idx, } static inline void __init early_set_fixmap(enum fixed_addresses idx, - unsigned long phys) + unsigned long phys, pgprot_t prot) { if (after_paging_init) - set_fixmap(idx, phys); + __set_fixmap(idx, phys, prot); else - __early_set_fixmap(idx, phys, PAGE_KERNEL); + __early_set_fixmap(idx, phys, prot); } static inline void __init early_clear_fixmap(enum fixed_addresses idx) @@ -601,7 +601,7 @@ static int __init check_early_ioremap_leak(void) } late_initcall(check_early_ioremap_leak); -void __init *early_ioremap(unsigned long phys_addr, unsigned long size) +static void __init *__early_ioremap(unsigned long phys_addr, unsigned long size, pgprot_t prot) { unsigned long offset, last_addr; unsigned int nrpages, nesting; @@ -650,7 +650,7 @@ void __init *early_ioremap(unsigned long phys_addr, unsigned long size) idx0 = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*nesting; idx = idx0; while (nrpages > 0) { - early_set_fixmap(idx, phys_addr); + early_set_fixmap(idx, phys_addr, prot); phys_addr += PAGE_SIZE; --idx; --nrpages; @@ -661,6 +661,18 @@ void __init *early_ioremap(unsigned long phys_addr, unsigned long size) return (void *) (offset + fix_to_virt(idx0)); } +/* Remap an IO device */ +void __init *early_ioremap(unsigned long phys_addr, unsigned long size) +{ + return __early_ioremap(phys_addr, size, PAGE_KERNEL_IO); +} + +/* Remap memory */ +void __init *early_memremap(unsigned long phys_addr, unsigned long size) +{ + return __early_ioremap(phys_addr, size, PAGE_KERNEL); +} + void __init early_iounmap(void *addr, unsigned long size) { unsigned long virt_addr; diff --git a/include/asm-x86/io.h b/include/asm-x86/io.h index e091f3949ec..a233f835e0b 100644 --- a/include/asm-x86/io.h +++ b/include/asm-x86/io.h @@ -83,6 +83,7 @@ extern void early_ioremap_init(void); extern void early_ioremap_clear(void); extern void early_ioremap_reset(void); extern void *early_ioremap(unsigned long offset, unsigned long size); +extern void *early_memremap(unsigned long offset, unsigned long size); extern void early_iounmap(void *addr, unsigned long size); extern void __iomem *fix_ioremap(unsigned idx, unsigned long phys); -- cgit v1.2.3 From 88b4c146961f4178f38a8c3e6e54709fa39a3f36 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Sun, 7 Sep 2008 15:21:16 -0700 Subject: x86: use early_memremap() in setup.c The remappings in setup.c are all just ordinary memory, so use early_memremap() rather than early_ioremap(). 
Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 21b8e0a5978..de6a9d6d599 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -302,7 +302,7 @@ static void __init relocate_initrd(void) if (clen > MAX_MAP_CHUNK-slop) clen = MAX_MAP_CHUNK-slop; mapaddr = ramdisk_image & PAGE_MASK; - p = early_ioremap(mapaddr, clen+slop); + p = early_memremap(mapaddr, clen+slop); memcpy(q, p+slop, clen); early_iounmap(p, clen+slop); q += clen; @@ -379,7 +379,7 @@ static void __init parse_setup_data(void) return; pa_data = boot_params.hdr.setup_data; while (pa_data) { - data = early_ioremap(pa_data, PAGE_SIZE); + data = early_memremap(pa_data, PAGE_SIZE); switch (data->type) { case SETUP_E820_EXT: parse_e820_ext(data, pa_data); @@ -402,7 +402,7 @@ static void __init e820_reserve_setup_data(void) return; pa_data = boot_params.hdr.setup_data; while (pa_data) { - data = early_ioremap(pa_data, sizeof(*data)); + data = early_memremap(pa_data, sizeof(*data)); e820_update_range(pa_data, sizeof(*data)+data->len, E820_RAM, E820_RESERVED_KERN); found = 1; @@ -428,7 +428,7 @@ static void __init reserve_early_setup_data(void) return; pa_data = boot_params.hdr.setup_data; while (pa_data) { - data = early_ioremap(pa_data, sizeof(*data)); + data = early_memremap(pa_data, sizeof(*data)); sprintf(buf, "setup data %x", data->type); reserve_early(pa_data, pa_data+sizeof(*data)+data->len, buf); pa_data = data->next; -- cgit v1.2.3 From a32ad4626776f09b30ef98a872a5f6fb64fe6607 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Sun, 7 Sep 2008 15:21:17 -0700 Subject: x86-64: don't check for map replacement The check prevents flags on mappings from being changed, which is not desirable. There's no need to check for replacing a mapping, and x86-32 does not do this check.
Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/mm/init_64.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 5beb8968345..7c8bb46e83e 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -196,9 +196,6 @@ set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte) } pte = pte_offset_kernel(pmd, vaddr); - if (!pte_none(*pte) && pte_val(new_pte) && - pte_val(*pte) != (pte_val(new_pte) & __supported_pte_mask)) - pte_ERROR(*pte); set_pte(pte, new_pte); /* -- cgit v1.2.3 From a73aaedd95703bd49f4c3f9df06fb7b7373ba905 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sun, 14 Sep 2008 02:33:14 -0700 Subject: x86: check dsdt before find oem table for es7000, v2 v2: use __acpi_unmap_table() Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/es7000_32.c | 28 +++++++++++++++++++++++----- arch/x86/mach-generic/es7000.c | 20 +++++++++++++++----- include/asm-x86/es7000/mpparse.h | 1 + 3 files changed, 39 insertions(+), 10 deletions(-) diff --git a/arch/x86/kernel/es7000_32.c b/arch/x86/kernel/es7000_32.c index 849e5cd485b..f454c78fcef 100644 --- a/arch/x86/kernel/es7000_32.c +++ b/arch/x86/kernel/es7000_32.c @@ -109,6 +109,7 @@ struct oem_table { }; extern int find_unisys_acpi_oem_table(unsigned long *oem_addr); +extern void unmap_unisys_acpi_oem_table(unsigned long oem_addr); #endif struct mip_reg { @@ -243,21 +244,38 @@ parse_unisys_oem (char *oemptr) } #ifdef CONFIG_ACPI -int __init -find_unisys_acpi_oem_table(unsigned long *oem_addr) +static unsigned long oem_addrX; +static unsigned long oem_size; +int __init find_unisys_acpi_oem_table(unsigned long *oem_addr) { struct acpi_table_header *header = NULL; int i = 0; - while (ACPI_SUCCESS(acpi_get_table("OEM1", i++, &header))) { + acpi_size tbl_size; + + while (ACPI_SUCCESS(acpi_get_table_with_size("OEM1", i++, &header, &tbl_size))) { if (!memcmp((char *) &header->oem_id, "UNISYS", 6)) { struct oem_table *t = (struct oem_table *)header; - *oem_addr = (unsigned long)__acpi_map_table(t->OEMTableAddr, - t->OEMTableSize); + + oem_addrX = t->OEMTableAddr; + oem_size = t->OEMTableSize; + early_acpi_os_unmap_memory(header, tbl_size); + + *oem_addr = (unsigned long)__acpi_map_table(oem_addrX, + oem_size); return 0; } + early_acpi_os_unmap_memory(header, tbl_size); } return -1; } + +void __init unmap_unisys_acpi_oem_table(unsigned long oem_addr) +{ + if (!oem_addr) + return; + + __acpi_unmap_table((char *)oem_addr, oem_size); +} #endif static void diff --git a/arch/x86/mach-generic/es7000.c b/arch/x86/mach-generic/es7000.c index 520cca0ee04..6513d41ea21 100644 --- a/arch/x86/mach-generic/es7000.c +++ b/arch/x86/mach-generic/es7000.c @@ -47,16 +47,26 @@ static __init int mps_oem_check(struct mp_config_table *mpc, char *oem, /* Hook from generic ACPI tables.c */ static int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id) { - unsigned long oem_addr; + unsigned long oem_addr = 0; + int check_dsdt; + int ret = 0; + + /* check dsdt at first to avoid clear fix_map for oem_addr */ + check_dsdt = es7000_check_dsdt(); + if (!find_unisys_acpi_oem_table(&oem_addr)) { - if (es7000_check_dsdt()) - return parse_unisys_oem((char *)oem_addr); + if (check_dsdt) + ret = parse_unisys_oem((char *)oem_addr); else { setup_unisys(); - return 1; + ret = 1; } + /* + * we need to unmap it + */ + unmap_unisys_acpi_oem_table(oem_addr); } - return 0; + return ret; } #else static int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id) diff --git 
a/include/asm-x86/es7000/mpparse.h b/include/asm-x86/es7000/mpparse.h index 7b5c889d8e7..ed5a3caae14 100644 --- a/include/asm-x86/es7000/mpparse.h +++ b/include/asm-x86/es7000/mpparse.h @@ -5,6 +5,7 @@ extern int parse_unisys_oem (char *oemptr); extern int find_unisys_acpi_oem_table(unsigned long *oem_addr); +extern void unmap_unisys_acpi_oem_table(unsigned long oem_addr); extern void setup_unisys(void); #ifndef CONFIG_X86_GENERICARCH -- cgit v1.2.3 From 3e6de5a393661c5cdabe44115e93bcbde6a742fc Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Sep 2008 08:26:15 +0200 Subject: x86: print out EBDA/lowmem address it's useful for debugging purposes to know the location of the EBDA. Signed-off-by: Ingo Molnar --- arch/x86/kernel/head.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kernel/head.c b/arch/x86/kernel/head.c index 3e66bd364a9..1dcb0f13897 100644 --- a/arch/x86/kernel/head.c +++ b/arch/x86/kernel/head.c @@ -35,6 +35,7 @@ void __init reserve_ebda_region(void) /* start of EBDA area */ ebda_addr = get_bios_ebda(); + printk(KERN_INFO "BIOS EBDA/lowmem at: %08x/%08x\n", ebda_addr, lowmem); /* Fixup: bios puts an EBDA in the top 64K segment */ /* of conventional memory, but does not adjust lowmem. */ -- cgit v1.2.3 From 33f8c40a30ae99e971d068c9ec6088e713c46f5f Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Sun, 14 Sep 2008 19:03:53 +0200 Subject: x86: add memory clobber in switch_to() Segment registers are reloaded, so we should add a memory clobber. The generated assembly code is identical in my tests, but this doesn't mean it is necessarily true for all configurations/compilers. x86_64 already has the memory clobber. Signed-off-by: Vegard Nossum Signed-off-by: Ingo Molnar --- include/asm-x86/system.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/include/asm-x86/system.h b/include/asm-x86/system.h index 34505dd7b24..b20c894660f 100644 --- a/include/asm-x86/system.h +++ b/include/asm-x86/system.h @@ -64,7 +64,10 @@ do { \ \ /* regparm parameters for __switch_to(): */ \ [prev] "a" (prev), \ - [next] "d" (next)); \ + [next] "d" (next) \ + \ + : /* reloaded segment registers */ \ + "memory"); \ } while (0) /* -- cgit v1.2.3 From 59ef48a58e59cc27255d526ae3fa60ddcd977208 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Sun, 14 Sep 2008 21:58:49 +0400 Subject: x86: smpboot - check if we have ESR register in wakeup_secondary_cpu We should check if we have ESR register before reading from it. Signed-off-by: Cyrill Gorcunov Cc: Yinghai Lu Cc: "Maciej W. Rozycki" Signed-off-by: Ingo Molnar --- arch/x86/kernel/smpboot.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index b700c9a1064..a778e221ccf 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -599,10 +599,12 @@ wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip) * Give the other CPU some time to accept the IPI. */ udelay(200); - maxlvt = lapic_get_maxlvt(); - if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ - apic_write(APIC_ESR, 0); - accept_status = (apic_read(APIC_ESR) & 0xEF); + if (APIC_INTEGRATED(apic_version[phys_apicid])) { + maxlvt = lapic_get_maxlvt(); + if (maxlvt > 3) /* Due to the Pentium erratum 3AP. 
*/ + apic_write(APIC_ESR, 0); + accept_status = (apic_read(APIC_ESR) & 0xEF); + } pr_debug("NMI sent.\n"); if (send_status) -- cgit v1.2.3 From 5e72d9e4850c91b6a0f06fa803f7393b55a38aa8 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Fri, 12 Sep 2008 15:43:04 +0100 Subject: x86-64: fix combining of regions in init_memory_mapping() When nr_range gets decremented, the same slot must be considered for coalescing with its new successor again. The issue is apparently pretty benign to native code, but surfaces as a boot time crash in our forward ported Xen tree (where the page table setup overall works differently than in native). Signed-off-by: Jan Beulich Acked-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/mm/init_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 7c8bb46e83e..b8e461d4941 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -746,7 +746,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, old_start = mr[i].start; memmove(&mr[i], &mr[i+1], (nr_range - 1 - i) * sizeof (struct map_range)); - mr[i].start = old_start; + mr[i--].start = old_start; nr_range--; } -- cgit v1.2.3 From 606ee44dbb72fd48beb47f171d7b9cecf6ade6dd Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Wed, 17 Sep 2008 16:48:17 +0100 Subject: x86: make mm/gup.c more virtualization friendly Since pte_flags() is much cheaper than pte_val() in some virtualized environments (namely, Xen), use the former wherever possible. Signed-off-by: Jan Beulich Cc: "Nick Piggin" Signed-off-by: Ingo Molnar --- arch/x86/mm/gup.c | 10 +++++----- include/asm-x86/pgtable.h | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c index 007bb06c750..4ba373c5b8c 100644 --- a/arch/x86/mm/gup.c +++ b/arch/x86/mm/gup.c @@ -82,7 +82,7 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr, pte_t pte = gup_get_pte(ptep); struct page *page; - if ((pte_val(pte) & (mask | _PAGE_SPECIAL)) != mask) { + if ((pte_flags(pte) & (mask | _PAGE_SPECIAL)) != mask) { pte_unmap(ptep); return 0; } @@ -116,10 +116,10 @@ static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr, mask = _PAGE_PRESENT|_PAGE_USER; if (write) mask |= _PAGE_RW; - if ((pte_val(pte) & mask) != mask) + if ((pte_flags(pte) & mask) != mask) return 0; /* hugepages are never "special" */ - VM_BUG_ON(pte_val(pte) & _PAGE_SPECIAL); + VM_BUG_ON(pte_flags(pte) & _PAGE_SPECIAL); VM_BUG_ON(!pfn_valid(pte_pfn(pte))); refs = 0; @@ -173,10 +173,10 @@ static noinline int gup_huge_pud(pud_t pud, unsigned long addr, mask = _PAGE_PRESENT|_PAGE_USER; if (write) mask |= _PAGE_RW; - if ((pte_val(pte) & mask) != mask) + if ((pte_flags(pte) & mask) != mask) return 0; /* hugepages are never "special" */ - VM_BUG_ON(pte_flags(pte) & _PAGE_SPECIAL); + VM_BUG_ON(pte_flags(pte) & _PAGE_SPECIAL); VM_BUG_ON(!pfn_valid(pte_pfn(pte))); refs = 0; diff --git a/include/asm-x86/pgtable.h b/include/asm-x86/pgtable.h index 81805403b64..182f9d4c570 100644 --- a/include/asm-x86/pgtable.h +++ b/include/asm-x86/pgtable.h @@ -206,7 +206,7 @@ static inline int pte_exec(pte_t pte) static inline int pte_special(pte_t pte) { - return pte_val(pte) & _PAGE_SPECIAL; + return pte_flags(pte) & _PAGE_SPECIAL; } static inline unsigned long pte_pfn(pte_t pte) -- cgit v1.2.3 From bd32a8cfa8f172bd943655a3663d8005e5c1d83c Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 19 Sep 2008 18:41:16 -0700 Subject: x86: cpu don't print duplicated vendor string Some CPUs
have vendor string in the middle of model_id instead of beginning Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index fb789dd9e69..088fbdb6734 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -797,7 +797,7 @@ void __cpuinit print_cpu_info(struct cpuinfo_x86 *c) else if (c->cpuid_level >= 0) vendor = c->x86_vendor_id; - if (vendor && strncmp(c->x86_model_id, vendor, strlen(vendor))) + if (vendor && !strstr(c->x86_model_id, vendor)) printk(KERN_CONT "%s ", vendor); if (c->x86_model_id[0]) -- cgit v1.2.3 From cf4cfb225ab2b48611cb4f9e8db23c87486416d6 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Fri, 11 Jul 2008 13:45:46 -0300 Subject: x86: use user_mode macro Instead of using SEGMENT_IS_KERNEL_CODE, use the "user_mode" macro, which can play the same role. Delete the former, since it now lacks any user. Signed-off-by: Glauber Costa Signed-off-by: Ingo Molnar --- arch/x86/kernel/time_32.c | 2 +- include/asm-x86/segment.h | 3 --- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/arch/x86/kernel/time_32.c b/arch/x86/kernel/time_32.c index bbecf8b6bf9..8abcb7b0869 100644 --- a/arch/x86/kernel/time_32.c +++ b/arch/x86/kernel/time_32.c @@ -47,7 +47,7 @@ unsigned long profile_pc(struct pt_regs *regs) unsigned long pc = instruction_pointer(regs); #ifdef CONFIG_SMP - if (!v8086_mode(regs) && SEGMENT_IS_KERNEL_CODE(regs->cs) && + if (!v8086_mode(regs) && !user_mode(regs) && in_lock_functions(pc)) { #ifdef CONFIG_FRAME_POINTER return *(unsigned long *)(regs->bp + 4); diff --git a/include/asm-x86/segment.h b/include/asm-x86/segment.h index ea5f0a8686f..4dcd8c6dd22 100644 --- a/include/asm-x86/segment.h +++ b/include/asm-x86/segment.h @@ -131,9 +131,6 @@ * Matching rules for certain types of segments. */ -/* Matches only __KERNEL_CS, ignoring PnP / USER / APM segments */ -#define SEGMENT_IS_KERNEL_CODE(x) (((x) & 0xfc) == GDT_ENTRY_KERNEL_CS * 8) - /* Matches __KERNEL_CS and __USER_CS (they must be 2 entries apart) */ #define SEGMENT_IS_FLAT_CODE(x) (((x) & 0xec) == GDT_ENTRY_KERNEL_CS * 8) -- cgit v1.2.3 From 2c44e66843cd50c5ef4f4271fbd63a4f4bf4d083 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Fri, 11 Jul 2008 13:48:07 -0300 Subject: x86: coalesce tests Coalesce v8086_mode and user_mode into a single user_mode_vm() test. Signed-off-by: Glauber Costa Signed-off-by: Ingo Molnar --- arch/x86/kernel/time_32.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/x86/kernel/time_32.c b/arch/x86/kernel/time_32.c index 8abcb7b0869..ca0a4aab12c 100644 --- a/arch/x86/kernel/time_32.c +++ b/arch/x86/kernel/time_32.c @@ -47,8 +47,7 @@ unsigned long profile_pc(struct pt_regs *regs) unsigned long pc = instruction_pointer(regs); #ifdef CONFIG_SMP - if (!v8086_mode(regs) && !user_mode(regs) && - in_lock_functions(pc)) { + if (!user_mode_vm(regs) && in_lock_functions(pc)) { #ifdef CONFIG_FRAME_POINTER return *(unsigned long *)(regs->bp + 4); #else -- cgit v1.2.3 From 097a0788df71b0f3328c70ab5f4e41c27ee66817 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Thu, 14 Aug 2008 17:33:12 -0300 Subject: x86: set bp field in pt_regs properly Save rbp twice: One is for marking the stack frame, as usual (already there), and the other, to fill pt_regs properly. This is because bx comes right before the last saved register in that structure, and not bp. 
If the base pointer were in the place bx is today, this would not be needed. Signed-off-by: Glauber Costa Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_64.S | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index cf3a0b2d005..25bb3f9b255 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -667,6 +667,13 @@ END(stub_rt_sigreturn) SAVE_ARGS leaq -ARGOFFSET(%rsp),%rdi # arg1 for handler pushq %rbp + /* + * Save rbp twice: One is for marking the stack frame, as usual, and the + * other, to fill pt_regs properly. This is because bx comes right + * before the last saved register in that structure, and not bp. If the + * base pointer were in the place bx is today, this would not be needed. + */ + movq %rbp, -8(%rsp) CFI_ADJUST_CFA_OFFSET 8 CFI_REL_OFFSET rbp, 0 movq %rsp,%rbp -- cgit v1.2.3 From 3927fa9e4b5d5f346d12aa0531744daef106ebd3 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Fri, 11 Jul 2008 13:53:43 -0300 Subject: x86: use frame pointer information on x86_64 profile_pc Signed-off-by: Glauber Costa Signed-off-by: Ingo Molnar --- arch/x86/kernel/time_64.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/x86/kernel/time_64.c b/arch/x86/kernel/time_64.c index e3d49c553af..7fd995edb76 100644 --- a/arch/x86/kernel/time_64.c +++ b/arch/x86/kernel/time_64.c @@ -34,11 +34,15 @@ unsigned long profile_pc(struct pt_regs *regs) of flags from PUSHF Eflags always has bits 22 and up cleared unlike kernel addresses. */ if (!user_mode(regs) && in_lock_functions(pc)) { +#ifdef CONFIG_FRAME_POINTER + return *(unsigned long *)(regs->bp + sizeof(long)); +#else unsigned long *sp = (unsigned long *)regs->sp; if (sp[0] >> 22) return sp[0]; if (sp[1] >> 22) return sp[1]; +#endif } return pc; } -- cgit v1.2.3 From efa323abd424a2450f810b96203c1fbf138998b9 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Fri, 11 Jul 2008 14:00:14 -0300 Subject: x86: remove SEGMENT_IS_FLAT_CODE There are no users in the kernel. Signed-off-by: Glauber Costa Signed-off-by: Ingo Molnar --- include/asm-x86/segment.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/include/asm-x86/segment.h b/include/asm-x86/segment.h index 4dcd8c6dd22..5d6e6945489 100644 --- a/include/asm-x86/segment.h +++ b/include/asm-x86/segment.h @@ -131,9 +131,6 @@ * Matching rules for certain types of segments. */ -/* Matches __KERNEL_CS and __USER_CS (they must be 2 entries apart) */ -#define SEGMENT_IS_FLAT_CODE(x) (((x) & 0xec) == GDT_ENTRY_KERNEL_CS * 8) - /* Matches PNP_CS32 and PNP_CS16 (they must be consecutive) */ #define SEGMENT_IS_PNP_CODE(x) (((x) & 0xf4) == GDT_ENTRY_PNPBIOS_BASE * 8) -- cgit v1.2.3 From 8de0b8a7eaf274d197698b035090eeb805f62de6 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Fri, 11 Jul 2008 14:10:13 -0300 Subject: x86: use user_mode_vm instead of user_mode For x86_64, it does not really matter. But makes the code equal to i386. Signed-off-by: Glauber Costa Signed-off-by: Ingo Molnar --- arch/x86/kernel/time_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/time_64.c b/arch/x86/kernel/time_64.c index 7fd995edb76..0469243ae1b 100644 --- a/arch/x86/kernel/time_64.c +++ b/arch/x86/kernel/time_64.c @@ -33,7 +33,7 @@ unsigned long profile_pc(struct pt_regs *regs) /* Assume the lock function has either no stack frame or a copy of flags from PUSHF Eflags always has bits 22 and up cleared unlike kernel addresses. 
*/ - if (!user_mode(regs) && in_lock_functions(pc)) { + if (!user_mode_vm(regs) && in_lock_functions(pc)) { #ifdef CONFIG_FRAME_POINTER return *(unsigned long *)(regs->bp + sizeof(long)); #else -- cgit v1.2.3 From 2ff298372d03b01c9ca8738ee2791227a81928e2 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Fri, 11 Jul 2008 14:21:29 -0300 Subject: x86: bind irq0 irq data to cpu0 Signed-off-by: Glauber Costa Signed-off-by: Ingo Molnar --- arch/x86/kernel/time_64.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kernel/time_64.c b/arch/x86/kernel/time_64.c index 0469243ae1b..841938297c2 100644 --- a/arch/x86/kernel/time_64.c +++ b/arch/x86/kernel/time_64.c @@ -115,6 +115,7 @@ void __init hpet_time_init(void) if (!hpet_enable()) setup_pit_timer(); + irq0.mask = cpumask_of_cpu(0); setup_irq(0, &irq0); } -- cgit v1.2.3 From 2f97435e57926a5cdc104fe5a7c64932cb62cdda Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Fri, 11 Jul 2008 15:08:39 -0300 Subject: x86: factor out irq initialization for x86_64 Provide apic_intr_init and smp_intr_init functions. Signed-off-by: Glauber Costa Signed-off-by: Ingo Molnar --- arch/x86/kernel/irqinit_64.c | 43 ++++++++++++++++++++++++++++--------------- 1 file changed, 28 insertions(+), 15 deletions(-) diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c index 1f26fd9ec4f..18968a78ab3 100644 --- a/arch/x86/kernel/irqinit_64.c +++ b/arch/x86/kernel/irqinit_64.c @@ -164,22 +164,8 @@ static void __init init_ISA_irqs (void) void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); -void __init native_init_IRQ(void) +void __init smp_intr_init(void) { - int i; - - init_ISA_irqs(); - /* - * Cover the whole vector space, no vector can escape - * us. (some of these will be overridden and become - * 'special' SMP interrupts) - */ - for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) { - int vector = FIRST_EXTERNAL_VECTOR + i; - if (vector != IA32_SYSCALL_VECTOR) - set_intr_gate(vector, interrupt[i]); - } - #ifdef CONFIG_SMP /* * The reschedule interrupt is a CPU-to-CPU reschedule-helper @@ -207,6 +193,14 @@ void __init native_init_IRQ(void) /* Low priority IPI to cleanup after moving an irq */ set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt); #endif +} + +void __init apic_intr_init(void) +{ +#ifdef CONFIG_SMP + smp_intr_init(); +#endif + alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt); @@ -216,6 +210,25 @@ void __init native_init_IRQ(void) /* IPI vectors for APIC spurious and error interrupts */ alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); +} + +void __init native_init_IRQ(void) +{ + int i; + + init_ISA_irqs(); + /* + * Cover the whole vector space, no vector can escape + * us. 
(some of these will be overridden and become + * 'special' SMP interrupts) + */ + for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) { + int vector = FIRST_EXTERNAL_VECTOR + i; + if (vector != IA32_SYSCALL_VECTOR) + set_intr_gate(vector, interrupt[i]); + } + + apic_intr_init(); if (!acpi_ioapic) setup_irq(2, &irq2); -- cgit v1.2.3 From 780209af714ba733cd9e2f3526107becfc1969bc Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Fri, 11 Jul 2008 15:12:39 -0300 Subject: x86: make init_ISA_irqs nonstatic Signed-off-by: Glauber Costa Signed-off-by: Ingo Molnar --- arch/x86/kernel/irqinit_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c index 18968a78ab3..b01d9549b3d 100644 --- a/arch/x86/kernel/irqinit_64.c +++ b/arch/x86/kernel/irqinit_64.c @@ -135,7 +135,7 @@ DEFINE_PER_CPU(vector_irq_t, vector_irq) = { [IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1 }; -static void __init init_ISA_irqs (void) +void __init init_ISA_irqs(void) { int i; -- cgit v1.2.3 From 461ebd109569d666d367f774f74002add6444d49 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Fri, 11 Jul 2008 15:25:13 -0300 Subject: x86: rename timer_event_interrupt to timer_interrupt Signed-off-by: Glauber Costa Signed-off-by: Ingo Molnar --- arch/x86/kernel/time_64.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/time_64.c b/arch/x86/kernel/time_64.c index 841938297c2..2be67c24909 100644 --- a/arch/x86/kernel/time_64.c +++ b/arch/x86/kernel/time_64.c @@ -48,7 +48,7 @@ unsigned long profile_pc(struct pt_regs *regs) } EXPORT_SYMBOL(profile_pc); -static irqreturn_t timer_event_interrupt(int irq, void *dev_id) +irqreturn_t timer_interrupt(int irq, void *dev_id) { add_pda(irq0_irqs, 1); @@ -104,7 +104,7 @@ unsigned long __init calibrate_cpu(void) } static struct irqaction irq0 = { - .handler = timer_event_interrupt, + .handler = timer_interrupt, .flags = IRQF_DISABLED | IRQF_IRQPOLL | IRQF_NOBALANCING, .mask = CPU_MASK_NONE, .name = "timer" -- cgit v1.2.3 From 2c460d0b6813a5d2a7d571b0b561e4e727036dd7 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Fri, 11 Jul 2008 16:06:40 -0300 Subject: x86: replace hardcoded number Replace "4" in time_32.c code by sizeof(long). This way, it can work on x86_64 too. Signed-off-by: Glauber Costa Signed-off-by: Ingo Molnar --- arch/x86/kernel/time_32.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/time_32.c b/arch/x86/kernel/time_32.c index ca0a4aab12c..7215bc6adfc 100644 --- a/arch/x86/kernel/time_32.c +++ b/arch/x86/kernel/time_32.c @@ -49,7 +49,7 @@ unsigned long profile_pc(struct pt_regs *regs) #ifdef CONFIG_SMP if (!user_mode_vm(regs) && in_lock_functions(pc)) { #ifdef CONFIG_FRAME_POINTER - return *(unsigned long *)(regs->bp + 4); + return *(unsigned long *)(regs->bp + sizeof(long)); #else unsigned long *sp = (unsigned long *)&regs->sp; -- cgit v1.2.3 From 33c053d0aec344c8b2ad6966d904ebeff64e590b Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Mon, 21 Jul 2008 18:42:52 -0300 Subject: x86: wrap MCA_bus test around an ifdef Only test for MCA_bus if support for MCA is compiled in. Also, for x86_64, write the code inside the conditional for consistency with i386. It won't bite us, since it'll probably never select CONFIG_MCA anyway.
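The sizeof(long) change above relies on the standard frame-pointer layout: with CONFIG_FRAME_POINTER, each function prologue pushes the caller's bp and then copies sp into bp, so *bp holds the saved frame pointer and the word just above it holds the return address. A minimal sketch of that layout, for illustration only (the struct mirrors the one the dumpstack code later in this series uses):

struct stack_frame {
	struct stack_frame *next_frame;	/* saved caller bp, found at *bp */
	unsigned long return_address;	/* found at bp + sizeof(long) */
};

/* sketch: what profile_pc() effectively reads in the frame-pointer case */
static unsigned long caller_pc(unsigned long bp)
{
	struct stack_frame *frame = (struct stack_frame *)bp;

	return frame->return_address;
}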
Signed-off-by: Glauber Costa Signed-off-by: Ingo Molnar --- arch/x86/kernel/time_32.c | 2 ++ arch/x86/kernel/time_64.c | 8 ++++++++ 2 files changed, 10 insertions(+) diff --git a/arch/x86/kernel/time_32.c b/arch/x86/kernel/time_32.c index 7215bc6adfc..77b400f06ea 100644 --- a/arch/x86/kernel/time_32.c +++ b/arch/x86/kernel/time_32.c @@ -94,6 +94,7 @@ irqreturn_t timer_interrupt(int irq, void *dev_id) do_timer_interrupt_hook(); +#ifdef CONFIG_MCA if (MCA_bus) { /* The PS/2 uses level-triggered interrupts. You can't turn them off, nor would you want to (any attempt to @@ -107,6 +108,7 @@ irqreturn_t timer_interrupt(int irq, void *dev_id) u8 irq_v = inb_p( 0x61 ); /* read the current state */ outb_p( irq_v|0x80, 0x61 ); /* reset the IRQ */ } +#endif return IRQ_HANDLED; } diff --git a/arch/x86/kernel/time_64.c b/arch/x86/kernel/time_64.c index 2be67c24909..207a7a1d7ac 100644 --- a/arch/x86/kernel/time_64.c +++ b/arch/x86/kernel/time_64.c @@ -16,6 +16,7 @@ #include #include #include +#include <linux/mca.h> #include #include @@ -54,6 +55,13 @@ irqreturn_t timer_interrupt(int irq, void *dev_id) global_clock_event->event_handler(global_clock_event); +#ifdef CONFIG_MCA + if (MCA_bus) { + u8 irq_v = inb_p(0x61); /* read the current state */ + outb_p(irq_v|0x80, 0x61); /* reset the IRQ */ + } +#endif + return IRQ_HANDLED; } -- cgit v1.2.3 From e04d645f326bc8e591bc4ae21c4ab535987a17c1 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Mon, 22 Sep 2008 14:35:08 -0300 Subject: x86: move vgetcpu mode probing to cpu detection Take it out of time initialization and move it to cpu detection time. Signed-off-by: Glauber Costa Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/common.c | 12 ++++++++++++ arch/x86/kernel/time_64.c | 4 ---- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 088fbdb6734..f1af7185191 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -719,12 +719,24 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) #endif } +#ifdef CONFIG_X86_64 +static void vgetcpu_set_mode(void) +{ + if (cpu_has(&boot_cpu_data, X86_FEATURE_RDTSCP)) + vgetcpu_mode = VGETCPU_RDTSCP; + else + vgetcpu_mode = VGETCPU_LSL; +} +#endif + void __init identify_boot_cpu(void) { identify_cpu(&boot_cpu_data); #ifdef CONFIG_X86_32 sysenter_setup(); enable_sep_cpu(); +#else + vgetcpu_set_mode(); #endif } diff --git a/arch/x86/kernel/time_64.c b/arch/x86/kernel/time_64.c index 207a7a1d7ac..cb19d650c21 100644 --- a/arch/x86/kernel/time_64.c +++ b/arch/x86/kernel/time_64.c @@ -130,10 +130,6 @@ void __init hpet_time_init(void) void __init time_init(void) { tsc_init(); - if (cpu_has(&boot_cpu_data, X86_FEATURE_RDTSCP)) - vgetcpu_mode = VGETCPU_RDTSCP; - else - vgetcpu_mode = VGETCPU_LSL; late_time_init = choose_time_init(); } -- cgit v1.2.3 From 2e42060c19cb79adacc48beb5e9ec5361df976a2 Mon Sep 17 00:00:00 2001 From: Jack Steiner Date: Tue, 23 Sep 2008 15:37:13 -0500 Subject: x86, uv: add early detection of UV system types Portions of the ACPI code need to know if a system is a UV system prior to genapic initialization. This patch adds a call to early_acpi_boot_init() so that the apic type is discovered earlier. V2 of the patch adding fixes from Yinghai Lu. Much cleaner and smaller.
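The change is purely one of ordering during setup_arch(); a rough sketch of the intended sequence, with comments that are assumptions drawn from the hunks below rather than quotes from setup.c:

	acpi_boot_table_init();		/* ACPI tables become accessible */
	early_acpi_boot_init();		/* new: apic/UV type known from here on */
	/* ... later, SRAT parsing can already ask get_uv_system_type() */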
Signed-off-by: Jack Steiner Acked-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup.c | 2 ++ arch/x86/mm/srat_64.c | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index de6a9d6d599..2255782e8d4 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -998,6 +998,8 @@ void __init setup_arch(char **cmdline_p) */ acpi_boot_table_init(); + early_acpi_boot_init(); + #ifdef CONFIG_ACPI_NUMA /* * Parse SRAT to discover nodes. diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c index 1b4763e26ea..51c0a2fc14f 100644 --- a/arch/x86/mm/srat_64.c +++ b/arch/x86/mm/srat_64.c @@ -138,7 +138,7 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa) return; } - if (is_uv_system()) + if (get_uv_system_type() >= UV_X2APIC) apic_id = (pa->apic_id << 8) | pa->local_sapic_eid; else apic_id = pa->apic_id; -- cgit v1.2.3 From 6b11d4ef3e4d6ce83fb177aa50fdf385abb17d1a Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Fri, 26 Sep 2008 14:03:02 +0200 Subject: traps: x86_64: add TRACE_IRQS_OFF in error_entry Add TRACE_IRQS_OFF just before entering the C code. All exceptions are taken via interrupt gates. If irq tracing is enabled, it should be notified as soon as possible. Interrupts are only (conditionally) re-enabled in C code. Signed-off-by: Alexander van Heukelum Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_64.S | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 25bb3f9b255..963b35b81bc 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1065,7 +1065,8 @@ KPROBE_ENTRY(error_entry) je error_kernelspace error_swapgs: SWAPGS -error_sti: +error_sti: + TRACE_IRQS_OFF movq %rdi,RDI(%rsp) CFI_REL_OFFSET rdi,RDI movq %rsp,%rdi -- cgit v1.2.3 From 7e61a7932495e37685e95ec9a59ad08810dec959 Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Fri, 26 Sep 2008 14:03:03 +0200 Subject: traps: x86_64: add TRACE_IRQS_OFF in paranoidentry macro Add TRACE_IRQS_OFF just before entering the C code. All exceptions are taken via interrupt gates. If irq tracing is enabled, it should be notified as soon as possible. Interrupts are only (conditionally) re-enabled in C code. Signed-off-by: Alexander van Heukelum Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_64.S | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 963b35b81bc..977c8ea6602 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -939,6 +939,9 @@ END(spurious_interrupt) .if \ist movq %gs:pda_data_offset, %rbp .endif + .if \irqtrace + TRACE_IRQS_OFF + .endif movq %rsp,%rdi movq ORIG_RAX(%rsp),%rsi movq $-1,ORIG_RAX(%rsp) -- cgit v1.2.3 From 4b986a365253b57d8ab4ed7b796ba0893ff4c05c Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Fri, 26 Sep 2008 14:03:04 +0200 Subject: traps: x86_64: remove trace_hardirqs_fixup from DO_ERROR_INFO macro All exceptions are taken via interrupt gates. TRACE_IRQS_OFF is called just before entering the C code, so the irq state is known to the irq tracer at this point. No need to call trace_hardirqs_fixup. 
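The "(conditionally) re-enabled in C code" mentioned in the two patches above refers to the conditional_sti() pattern visible in the traps code further down in this series: interrupts are turned back on only if the interrupted context had them enabled. A sketch of the helper, assuming the traps_64.c version of this era:

static inline void conditional_sti(struct pt_regs *regs)
{
	/* re-enable interrupts only if the trapped context had IF set */
	if (regs->flags & X86_EFLAGS_IF)
		local_irq_enable();
}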
Signed-off-by: Alexander van Heukelum Signed-off-by: Ingo Molnar --- arch/x86/kernel/traps_64.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c index 9c0ac0cab01..9f487f374c7 100644 --- a/arch/x86/kernel/traps_64.c +++ b/arch/x86/kernel/traps_64.c @@ -658,7 +658,6 @@ asmlinkage void do_##name(struct pt_regs *regs, long error_code) \ info.si_errno = 0; \ info.si_code = sicode; \ info.si_addr = (void __user *)siaddr; \ - trace_hardirqs_fixup(); \ if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ == NOTIFY_STOP) \ return; \ -- cgit v1.2.3 From 8b1c870f19849235b8c7e4dfe2ec6b0d691fa9d7 Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Fri, 26 Sep 2008 14:03:05 +0200 Subject: traps: x86_64: remove trace_hardirqs_fixup from int3 handler All exceptions are taken via interrupt gates. TRACE_IRQS_OFF is called just before entering the C code, so the irq state is known to the irq tracer at this point. No need to call trace_hardirqs_fixup. Signed-off-by: Alexander van Heukelum Signed-off-by: Ingo Molnar --- arch/x86/kernel/traps_64.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c index 9f487f374c7..7d56c875dd0 100644 --- a/arch/x86/kernel/traps_64.c +++ b/arch/x86/kernel/traps_64.c @@ -861,8 +861,6 @@ void restart_nmi(void) /* runs on IST stack. */ asmlinkage void __kprobes do_int3(struct pt_regs *regs, long error_code) { - trace_hardirqs_fixup(); - if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) == NOTIFY_STOP) return; -- cgit v1.2.3 From a491503e4d0cb739f409069826e2746e38826099 Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Fri, 26 Sep 2008 14:03:06 +0200 Subject: traps: x86_64: remove trace_hardirqs_fixup from debug handler All exceptions are taken via interrupt gates. TRACE_IRQS_OFF is called just before entering the C code, so the irq state is known to the irq tracer at this point. No need to call trace_hardirqs_fixup. Signed-off-by: Alexander van Heukelum Signed-off-by: Ingo Molnar --- arch/x86/kernel/traps_64.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c index 7d56c875dd0..7d59559c2df 100644 --- a/arch/x86/kernel/traps_64.c +++ b/arch/x86/kernel/traps_64.c @@ -899,8 +899,6 @@ asmlinkage void __kprobes do_debug(struct pt_regs *regs, unsigned long condition; siginfo_t info; - trace_hardirqs_fixup(); - get_debugreg(condition, 6); /* -- cgit v1.2.3 From 69c89b5bf7f253756f3056e84b8603abe1c50f5b Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Fri, 26 Sep 2008 14:03:07 +0200 Subject: traps: x86: remove trace_hardirqs_fixup from pagefault handler The last use of trace_hardirqs_fixup is unnecessary, because the trap is taken with interrupt off on i386 as well as x86_64, and the irq-tracer is notified of this from the assembly code. trace_hardirqs_fixup and trace_hardirqs_fixup_flags are removed from include/asm-x86/irqflags.h as they are no longer used. 
Signed-off-by: Alexander van Heukelum Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 5 ----- include/asm-x86/irqflags.h | 21 --------------------- 2 files changed, 26 deletions(-) diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index a742d753d5b..3f2b8962cbd 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -592,11 +592,6 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) unsigned long flags; #endif - /* - * We can fault from pretty much anywhere, with unknown IRQ state. - */ - trace_hardirqs_fixup(); - tsk = current; mm = tsk->mm; prefetchw(&mm->mmap_sem); diff --git a/include/asm-x86/irqflags.h b/include/asm-x86/irqflags.h index 424acb48cd6..2bdab21f089 100644 --- a/include/asm-x86/irqflags.h +++ b/include/asm-x86/irqflags.h @@ -166,27 +166,6 @@ static inline int raw_irqs_disabled(void) return raw_irqs_disabled_flags(flags); } -/* - * makes the traced hardirq state match with the machine state - * - * should be a rarely used function, only in places where its - * otherwise impossible to know the irq state, like in traps. - */ -static inline void trace_hardirqs_fixup_flags(unsigned long flags) -{ - if (raw_irqs_disabled_flags(flags)) - trace_hardirqs_off(); - else - trace_hardirqs_on(); -} - -static inline void trace_hardirqs_fixup(void) -{ - unsigned long flags = __raw_local_save_flags(); - - trace_hardirqs_fixup_flags(flags); -} - #else #ifdef CONFIG_X86_64 -- cgit v1.2.3 From 3c1326f8a6d8b9815ca88c95441330f96eef7352 Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Fri, 26 Sep 2008 14:03:08 +0200 Subject: traps: i386: make do_trap more like x86_64 This patch hardcodes which traps should be forwarded to handle_vm86_trap in do_trap. This allows removing the vm86 parameter from the i386-version of do_trap, which makes the DO_VM86_ERROR and DO_VM86_ERROR_INFO macros unnecessary. The x86_64 part is whitespace only. Signed-off-by: Alexander van Heukelum Signed-off-by: Ingo Molnar --- arch/x86/kernel/traps_32.c | 55 ++++++++++++++-------------------------------- arch/x86/kernel/traps_64.c | 2 +- 2 files changed, 17 insertions(+), 40 deletions(-) diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c index ab559365089..bf4d71231d4 100644 --- a/arch/x86/kernel/traps_32.c +++ b/arch/x86/kernel/traps_32.c @@ -482,13 +482,17 @@ die_if_kernel(const char *str, struct pt_regs *regs, long err) } static void __kprobes -do_trap(int trapnr, int signr, char *str, int vm86, struct pt_regs *regs, +do_trap(int trapnr, int signr, char *str, struct pt_regs *regs, long error_code, siginfo_t *info) { struct task_struct *tsk = current; if (regs->flags & X86_VM_MASK) { + /* + * traps 0, 1, 3, 4, and 5 should be forwarded to vm86. + * On nmi (interrupt 2), do_trap should not be called.
+ */ + if (trapnr < 6) goto vm86_trap; goto trap_signal; } @@ -537,37 +541,10 @@ void do_##name(struct pt_regs *regs, long error_code) \ == NOTIFY_STOP) \ return; \ conditional_sti(regs); \ - do_trap(trapnr, signr, str, 0, regs, error_code, NULL); \ -} - -#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr, irq) \ -void do_##name(struct pt_regs *regs, long error_code) \ -{ \ - siginfo_t info; \ - if (irq) \ - local_irq_enable(); \ - info.si_signo = signr; \ - info.si_errno = 0; \ - info.si_code = sicode; \ - info.si_addr = (void __user *)siaddr; \ - if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ - == NOTIFY_STOP) \ - return; \ - conditional_sti(regs); \ - do_trap(trapnr, signr, str, 0, regs, error_code, &info); \ -} - -#define DO_VM86_ERROR(trapnr, signr, str, name) \ -void do_##name(struct pt_regs *regs, long error_code) \ -{ \ - if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ - == NOTIFY_STOP) \ - return; \ - conditional_sti(regs); \ - do_trap(trapnr, signr, str, 1, regs, error_code, NULL); \ + do_trap(trapnr, signr, str, regs, error_code, NULL); \ } -#define DO_VM86_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \ +#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \ void do_##name(struct pt_regs *regs, long error_code) \ { \ siginfo_t info; \ @@ -579,18 +556,18 @@ void do_##name(struct pt_regs *regs, long error_code) \ == NOTIFY_STOP) \ return; \ conditional_sti(regs); \ - do_trap(trapnr, signr, str, 1, regs, error_code, &info); \ + do_trap(trapnr, signr, str, regs, error_code, &info); \ } -DO_VM86_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip) -DO_VM86_ERROR(4, SIGSEGV, "overflow", overflow) -DO_VM86_ERROR(5, SIGSEGV, "bounds", bounds) -DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip, 0) +DO_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip) +DO_ERROR(4, SIGSEGV, "overflow", overflow) +DO_ERROR(5, SIGSEGV, "bounds", bounds) +DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip) DO_ERROR(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun) DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS) DO_ERROR(11, SIGBUS, "segment not present", segment_not_present) DO_ERROR(12, SIGBUS, "stack segment", stack_segment) -DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0, 0) +DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0) void __kprobes do_general_protection(struct pt_regs *regs, long error_code) @@ -868,7 +845,7 @@ void __kprobes do_int3(struct pt_regs *regs, long error_code) return; #endif - do_trap(3, SIGTRAP, "int3", 1, regs, error_code, NULL); + do_trap(3, SIGTRAP, "int3", regs, error_code, NULL); } /* @@ -1214,7 +1191,7 @@ void do_iret_error(struct pt_regs *regs, long error_code) if (notify_die(DIE_TRAP, "iret exception", regs, error_code, 32, SIGILL) == NOTIFY_STOP) return; - do_trap(32, SIGILL, "iret exception", 0, regs, error_code, &info); + do_trap(32, SIGILL, "iret exception", regs, error_code, &info); } void __init trap_init(void) diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c index 7d59559c2df..1efd1ea6466 100644 --- a/arch/x86/kernel/traps_64.c +++ b/arch/x86/kernel/traps_64.c @@ -640,7 +640,7 @@ kernel_trap: return; } -#define DO_ERROR(trapnr, signr, str, name) \ +#define DO_ERROR(trapnr, signr, str, name) \ asmlinkage void do_##name(struct pt_regs *regs, long error_code) \ { \ if (notify_die(DIE_TRAP, str, regs, error_code, 
trapnr, signr) \ -- cgit v1.2.3 From 9b658f6f8bd30b91aec527325e398771511ea61b Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 25 Sep 2008 22:22:12 -0700 Subject: x86: cleanup, remove extra ifdef also change two functions to static. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/irqinit_64.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c index b01d9549b3d..5b5be9d43c2 100644 --- a/arch/x86/kernel/irqinit_64.c +++ b/arch/x86/kernel/irqinit_64.c @@ -164,7 +164,7 @@ void __init init_ISA_irqs(void) void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); -void __init smp_intr_init(void) +static void __init smp_intr_init(void) { #ifdef CONFIG_SMP /* @@ -195,11 +195,9 @@ void __init smp_intr_init(void) #endif } -void __init apic_intr_init(void) +static void __init apic_intr_init(void) { -#ifdef CONFIG_SMP smp_intr_init(); -#endif alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt); -- cgit v1.2.3 From d2f904bb9a1ba88a58a03612abd8c6c54bdaf73a Mon Sep 17 00:00:00 2001 From: Jack Steiner Date: Thu, 25 Sep 2008 07:52:10 -0500 Subject: x86, uv: fix for size of hub mappings Fix the size of the mappings of UV hub registers. Size must be a function of the maximum node number within the SSI. Signed-off-by: Jack Steiner Signed-off-by: Ingo Molnar --- arch/x86/kernel/genx2apic_uv_x.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/genx2apic_uv_x.c b/arch/x86/kernel/genx2apic_uv_x.c index ae2ffc8a400..8cf4ec2a37d 100644 --- a/arch/x86/kernel/genx2apic_uv_x.c +++ b/arch/x86/kernel/genx2apic_uv_x.c @@ -286,12 +286,13 @@ static __init void map_low_mmrs(void) enum map_type {map_wb, map_uc}; -static __init void map_high(char *id, unsigned long base, int shift, enum map_type map_type) +static __init void map_high(char *id, unsigned long base, int shift, + int max_pnode, enum map_type map_type) { unsigned long bytes, paddr; paddr = base << shift; - bytes = (1UL << shift); + bytes = (1UL << shift) * (max_pnode + 1); printk(KERN_INFO "UV: Map %s_HI 0x%lx - 0x%lx\n", id, paddr, paddr + bytes); if (map_type == map_uc) @@ -307,7 +308,7 @@ static __init void map_gru_high(int max_pnode) gru.v = uv_read_local_mmr(UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR); if (gru.s.enable) - map_high("GRU", gru.s.base, shift, map_wb); + map_high("GRU", gru.s.base, shift, max_pnode, map_wb); } static __init void map_config_high(int max_pnode) @@ -317,7 +318,7 @@ static __init void map_config_high(int max_pnode) cfg.v = uv_read_local_mmr(UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR); if (cfg.s.enable) - map_high("CONFIG", cfg.s.base, shift, map_uc); + map_high("CONFIG", cfg.s.base, shift, max_pnode, map_uc); } static __init void map_mmr_high(int max_pnode) @@ -327,7 +328,7 @@ static __init void map_mmr_high(int max_pnode) mmr.v = uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR); if (mmr.s.enable) - map_high("MMR", mmr.s.base, shift, map_uc); + map_high("MMR", mmr.s.base, shift, max_pnode, map_uc); } static __init void map_mmioh_high(int max_pnode) @@ -337,7 +338,7 @@ static __init void map_mmioh_high(int max_pnode) mmioh.v = uv_read_local_mmr(UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR); if (mmioh.s.enable) - map_high("MMIOH", mmioh.s.base, shift, map_uc); + map_high("MMIOH", mmioh.s.base, shift, max_pnode, map_uc); } static __init void uv_rtc_init(void) -- cgit v1.2.3 From 69d45dd1c3bb512a9f5f9c464ac625eb707669ec Mon Sep 17 
00:00:00 2001 From: Krzysztof Helt Date: Sun, 28 Sep 2008 21:28:15 +0200 Subject: x86: merge winchip-2 and winchip-2a cpu choices The Winchip-2 and Winchip-2A cpu choices select the same options for kernel and compiler. Merge them to save a few bytes and reduce confusion. Signed-off-by: Krzysztof Helt Acked-by: H. Peter Anvin Signed-off-by: Ingo Molnar --- arch/x86/Kconfig.cpu | 27 +++++++++------------------ arch/x86/Makefile_32.cpu | 1 - arch/x86/configs/i386_defconfig | 1 - arch/x86/configs/x86_64_defconfig | 1 - include/asm-x86/module.h | 2 -- 5 files changed, 9 insertions(+), 23 deletions(-) diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu index c5f10136052..f4bdc0492d3 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -38,8 +38,7 @@ config M386 - "Crusoe" for the Transmeta Crusoe series. - "Efficeon" for the Transmeta Efficeon series. - "Winchip-C6" for original IDT Winchip. - - "Winchip-2" for IDT Winchip 2. - - "Winchip-2A" for IDT Winchips with 3dNow! capabilities. + - "Winchip-2" for IDT Winchips with 3dNow! capabilities. - "GeodeGX1" for Geode GX1 (Cyrix MediaGX). - "Geode GX/LX" For AMD Geode GX and LX processors. - "CyrixIII/VIA C3" for VIA Cyrix III or VIA C3. @@ -194,19 +193,11 @@ config MWINCHIPC6 treat this chip as a 586TSC with some extended instructions and alignment requirements. -config MWINCHIP2 - bool "Winchip-2" - depends on X86_32 - help - Select this for an IDT Winchip-2. Linux and GCC - treat this chip as a 586TSC with some extended instructions - and alignment requirements. - config MWINCHIP3D - bool "Winchip-2A/Winchip-3" + bool "Winchip-2/Winchip-2A/Winchip-3" depends on X86_32 help - Select this for an IDT Winchip-2A or 3. Linux and GCC + Select this for an IDT Winchip-2, 2A or 3. Linux and GCC treat this chip as a 586TSC with some extended instructions and alignment requirements.
Also enable out of order memory stores for this CPU, which can increase performance of some @@ -318,7 +309,7 @@ config X86_L1_CACHE_SHIFT int default "7" if MPENTIUM4 || X86_GENERIC || GENERIC_CPU || MPSC default "4" if X86_ELAN || M486 || M386 || MGEODEGX1 - default "5" if MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX + default "5" if MWINCHIP3D || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MVIAC7 config X86_XADD @@ -360,7 +351,7 @@ config X86_POPAD_OK config X86_ALIGNMENT_16 def_bool y - depends on MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 || MCYRIXIII || X86_ELAN || MK6 || M586MMX || M586TSC || M586 || M486 || MVIAC3_2 || MGEODEGX1 + depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || X86_ELAN || MK6 || M586MMX || M586TSC || M586 || M486 || MVIAC3_2 || MGEODEGX1 config X86_INTEL_USERCOPY def_bool y @@ -368,7 +359,7 @@ config X86_INTEL_USERCOPY config X86_USE_PPRO_CHECKSUM def_bool y - depends on MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MEFFICEON || MGEODE_LX || MCORE2 + depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MEFFICEON || MGEODE_LX || MCORE2 config X86_USE_3DNOW def_bool y @@ -376,7 +367,7 @@ config X86_USE_3DNOW config X86_OOSTORE def_bool y - depends on (MWINCHIP3D || MWINCHIP2 || MWINCHIPC6) && MTRR + depends on (MWINCHIP3D || MWINCHIPC6) && MTRR # # P6_NOPs are a relatively minor optimization that require a family >= @@ -396,7 +387,7 @@ config X86_P6_NOP config X86_TSC def_bool y - depends on ((MWINCHIP3D || MWINCHIP2 || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MVIAC7 || MGEODEGX1 || MGEODE_LX || MCORE2) && !X86_NUMAQ) || X86_64 + depends on ((MWINCHIP3D || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MVIAC7 || MGEODEGX1 || MGEODE_LX || MCORE2) && !X86_NUMAQ) || X86_64 config X86_CMPXCHG64 def_bool y @@ -417,7 +408,7 @@ config X86_MINIMUM_CPU_FAMILY config X86_DEBUGCTLMSR def_bool y - depends on !(MK6 || MWINCHIPC6 || MWINCHIP2 || MWINCHIP3D || MCYRIXIII || M586MMX || M586TSC || M586 || M486 || M386) + depends on !(MK6 || MWINCHIPC6 || MWINCHIP3D || MCYRIXIII || M586MMX || M586TSC || M586 || M486 || M386) menuconfig PROCESSOR_SELECT bool "Supported processor vendors" if EMBEDDED diff --git a/arch/x86/Makefile_32.cpu b/arch/x86/Makefile_32.cpu index b72b4f75311..80177ec052f 100644 --- a/arch/x86/Makefile_32.cpu +++ b/arch/x86/Makefile_32.cpu @@ -28,7 +28,6 @@ cflags-$(CONFIG_MK8) += $(call cc-option,-march=k8,-march=athlon) cflags-$(CONFIG_MCRUSOE) += -march=i686 $(align)-functions=0 $(align)-jumps=0 $(align)-loops=0 cflags-$(CONFIG_MEFFICEON) += -march=i686 $(call tune,pentium3) $(align)-functions=0 $(align)-jumps=0 $(align)-loops=0 cflags-$(CONFIG_MWINCHIPC6) += $(call cc-option,-march=winchip-c6,-march=i586) -cflags-$(CONFIG_MWINCHIP2) += $(call cc-option,-march=winchip2,-march=i586) cflags-$(CONFIG_MWINCHIP3D) += $(call cc-option,-march=winchip2,-march=i586) 
cflags-$(CONFIG_MCYRIXIII) += $(call cc-option,-march=c3,-march=i486) $(align)-functions=0 $(align)-jumps=0 $(align)-loops=0 cflags-$(CONFIG_MVIAC3_2) += $(call cc-option,-march=c3-2,-march=i686) diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig index ca226ca3128..52d0359719d 100644 --- a/arch/x86/configs/i386_defconfig +++ b/arch/x86/configs/i386_defconfig @@ -213,7 +213,6 @@ CONFIG_M686=y # CONFIG_MCRUSOE is not set # CONFIG_MEFFICEON is not set # CONFIG_MWINCHIPC6 is not set -# CONFIG_MWINCHIP2 is not set # CONFIG_MWINCHIP3D is not set # CONFIG_MGEODEGX1 is not set # CONFIG_MGEODE_LX is not set diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig index 2c4b1c771e2..f0a03d7a7d6 100644 --- a/arch/x86/configs/x86_64_defconfig +++ b/arch/x86/configs/x86_64_defconfig @@ -210,7 +210,6 @@ CONFIG_X86_PC=y # CONFIG_MCRUSOE is not set # CONFIG_MEFFICEON is not set # CONFIG_MWINCHIPC6 is not set -# CONFIG_MWINCHIP2 is not set # CONFIG_MWINCHIP3D is not set # CONFIG_MGEODEGX1 is not set # CONFIG_MGEODE_LX is not set diff --git a/include/asm-x86/module.h b/include/asm-x86/module.h index 48dc3e0c07d..864f2005fc1 100644 --- a/include/asm-x86/module.h +++ b/include/asm-x86/module.h @@ -52,8 +52,6 @@ struct mod_arch_specific {}; #define MODULE_PROC_FAMILY "EFFICEON " #elif defined CONFIG_MWINCHIPC6 #define MODULE_PROC_FAMILY "WINCHIPC6 " -#elif defined CONFIG_MWINCHIP2 -#define MODULE_PROC_FAMILY "WINCHIP2 " #elif defined CONFIG_MWINCHIP3D #define MODULE_PROC_FAMILY "WINCHIP3D " #elif defined CONFIG_MCYRIXIII -- cgit v1.2.3 From 14adf855baefad5ac3b545be23a64e6b61d6b74a Mon Sep 17 00:00:00 2001 From: Chuck Ebbert Date: Mon, 29 Sep 2008 18:29:42 -0400 Subject: x86: move prefill_possible_map calling early, fix, V2 Commit 4a701737 ("x86: move prefill_possible_map calling early, fix") is the wrong fix: prefill_possible_map() needs to be available even when CONFIG_HOTPLUG_CPU is not set. A follow-on patch will do that. Fix this correctly by making prefill_possible_map() available even when CONFIG_HOTPLUG_CPU is not set. The function is needed so that the number of possible CPUs can be determined. Tested on a uniprocessor machine with CPU hotplug disabled.
From boot log: Before: NR_CPUS: 512, nr_cpu_ids: 512, nr_node_ids 1 After: NR_CPUS: 512, nr_cpu_ids: 1, nr_node_ids 1 Signed-off-by: Chuck Ebbert Signed-off-by: Ingo Molnar --- arch/x86/kernel/smpboot.c | 62 +++++++++++++++++++++++------------------------ include/asm-x86/smp.h | 8 +++--- 2 files changed, 34 insertions(+), 36 deletions(-) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index a778e221ccf..23913785c26 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1261,39 +1261,8 @@ void __init native_smp_cpus_done(unsigned int max_cpus) check_nmi_watchdog(); } -#ifdef CONFIG_HOTPLUG_CPU - -static void remove_siblinginfo(int cpu) -{ - int sibling; - struct cpuinfo_x86 *c = &cpu_data(cpu); - - for_each_cpu_mask_nr(sibling, per_cpu(cpu_core_map, cpu)) { - cpu_clear(cpu, per_cpu(cpu_core_map, sibling)); - /*/ - * last thread sibling in this cpu core going down - */ - if (cpus_weight(per_cpu(cpu_sibling_map, cpu)) == 1) - cpu_data(sibling).booted_cores--; - } - - for_each_cpu_mask_nr(sibling, per_cpu(cpu_sibling_map, cpu)) - cpu_clear(cpu, per_cpu(cpu_sibling_map, sibling)); - cpus_clear(per_cpu(cpu_sibling_map, cpu)); - cpus_clear(per_cpu(cpu_core_map, cpu)); - c->phys_proc_id = 0; - c->cpu_core_id = 0; - cpu_clear(cpu, cpu_sibling_setup_map); -} - static int additional_cpus __initdata = -1; -static __init int setup_additional_cpus(char *s) -{ - return s && get_option(&s, &additional_cpus) ? 0 : -EINVAL; -} -early_param("additional_cpus", setup_additional_cpus); - /* * cpu_possible_map should be static, it cannot change as cpu's * are onlined, or offlined. The reason is per-cpu data-structures @@ -1340,6 +1309,37 @@ __init void prefill_possible_map(void) nr_cpu_ids = possible; } +#ifdef CONFIG_HOTPLUG_CPU + +static void remove_siblinginfo(int cpu) +{ + int sibling; + struct cpuinfo_x86 *c = &cpu_data(cpu); + + for_each_cpu_mask_nr(sibling, per_cpu(cpu_core_map, cpu)) { + cpu_clear(cpu, per_cpu(cpu_core_map, sibling)); + /*/ + * last thread sibling in this cpu core going down + */ + if (cpus_weight(per_cpu(cpu_sibling_map, cpu)) == 1) + cpu_data(sibling).booted_cores--; + } + + for_each_cpu_mask_nr(sibling, per_cpu(cpu_sibling_map, cpu)) + cpu_clear(cpu, per_cpu(cpu_sibling_map, sibling)); + cpus_clear(per_cpu(cpu_sibling_map, cpu)); + cpus_clear(per_cpu(cpu_core_map, cpu)); + c->phys_proc_id = 0; + c->cpu_core_id = 0; + cpu_clear(cpu, cpu_sibling_setup_map); +} + +static __init int setup_additional_cpus(char *s) +{ + return s && get_option(&s, &additional_cpus) ? 
0 : -EINVAL; +} +early_param("additional_cpus", setup_additional_cpus); + static void __ref remove_cpu_from_maps(int cpu) { cpu_clear(cpu, cpu_online_map); diff --git a/include/asm-x86/smp.h b/include/asm-x86/smp.h index 6df2615f913..a6afc29f2dd 100644 --- a/include/asm-x86/smp.h +++ b/include/asm-x86/smp.h @@ -141,6 +141,8 @@ void play_dead_common(void); void native_send_call_func_ipi(cpumask_t mask); void native_send_call_func_single_ipi(int cpu); +extern void prefill_possible_map(void); + void smp_store_cpu_info(int id); #define cpu_physical_id(cpu) per_cpu(x86_cpu_to_apicid, cpu) @@ -149,15 +151,11 @@ static inline int num_booting_cpus(void) { return cpus_weight(cpu_callout_map); } -#endif /* CONFIG_SMP */ - -#if defined(CONFIG_SMP) && defined(CONFIG_HOTPLUG_CPU) -extern void prefill_possible_map(void); #else static inline void prefill_possible_map(void) { } -#endif +#endif /* CONFIG_SMP */ extern unsigned disabled_cpus __cpuinitdata; -- cgit v1.2.3 From 1e0b5d00b230ceffe1bb33284b46b8572e418423 Mon Sep 17 00:00:00 2001 From: Jack Steiner Date: Mon, 29 Sep 2008 08:45:29 -0500 Subject: x86, UV: new UV genapic functions for x2apic Add functions that use the infrastructure added by the x2apic code. These functions were originally stubbed out since the UV code went into the tree prior to the x2apic code. Signed-off-by: Jack Steiner Signed-off-by: Ingo Molnar --- arch/x86/kernel/genx2apic_uv_x.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/genx2apic_uv_x.c b/arch/x86/kernel/genx2apic_uv_x.c index 8cf4ec2a37d..33581d94a90 100644 --- a/arch/x86/kernel/genx2apic_uv_x.c +++ b/arch/x86/kernel/genx2apic_uv_x.c @@ -114,7 +114,7 @@ static void uv_send_IPI_one(int cpu, int vector) unsigned long val, apicid, lapicid; int pnode; - apicid = per_cpu(x86_cpu_to_apicid, cpu); /* ZZZ - cache node-local ? */ + apicid = per_cpu(x86_cpu_to_apicid, cpu); lapicid = apicid & 0x3f; /* ZZZ macro needed */ pnode = uv_apicid_to_pnode(apicid); val = @@ -202,12 +202,10 @@ static unsigned int phys_pkg_id(int index_msb) return uv_read_apic_id() >> index_msb; } -#ifdef ZZZ /* Needs x2apic patch */ static void uv_send_IPI_self(int vector) { apic_write(APIC_SELF_IPI, vector); } -#endif struct genapic apic_x2apic_uv_x = { .name = "UV large system", @@ -215,15 +213,15 @@ struct genapic apic_x2apic_uv_x = { .int_delivery_mode = dest_Fixed, .int_dest_mode = (APIC_DEST_PHYSICAL != 0), .target_cpus = uv_target_cpus, - .vector_allocation_domain = uv_vector_allocation_domain,/* Fixme ZZZ */ + .vector_allocation_domain = uv_vector_allocation_domain, .apic_id_registered = uv_apic_id_registered, .init_apic_ldr = uv_init_apic_ldr, .send_IPI_all = uv_send_IPI_all, .send_IPI_allbutself = uv_send_IPI_allbutself, .send_IPI_mask = uv_send_IPI_mask, - /* ZZZ.send_IPI_self = uv_send_IPI_self, */ + .send_IPI_self = uv_send_IPI_self, .cpu_mask_to_apicid = uv_cpu_mask_to_apicid, - .phys_pkg_id = phys_pkg_id, /* Fixme ZZZ */ + .phys_pkg_id = phys_pkg_id, .get_apic_id = get_apic_id, .set_apic_id = set_apic_id, .apic_id_mask = (0xFFFFFFFFu), -- cgit v1.2.3 From e2ce07c8042975e52df4cec1f41faf15b83f2e42 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Thu, 3 Apr 2008 16:40:48 +0300 Subject: x86: __show_registers() and __show_regs() API unification Currently the low-level function to dump user-passed registers on i386 is called __show_registers() whereas on x86-64 it's called __show_regs(). Unify the API to simplify porting of kmemcheck to x86-64. 
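Both architectures end up exposing the same entry point, with the second argument selecting how much state to print. Judging from the process_32.c and process_64.c hunks below, callers use it like this (hypothetical call sites, for illustration):

	__show_regs(regs, 0);	/* basic register frame only, e.g. oops output */
	__show_regs(regs, 1);	/* also dump control/debug registers (cr0..cr4, d0..d7) */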
Signed-off-by: Pekka Enberg Acked-by: Vegard Nossum Signed-off-by: Ingo Molnar --- arch/x86/kernel/process_32.c | 4 ++-- arch/x86/kernel/process_64.c | 7 +++++-- arch/x86/kernel/traps_32.c | 2 +- arch/x86/kernel/traps_64.c | 2 +- include/asm-x86/kdebug.h | 3 +-- 5 files changed, 10 insertions(+), 8 deletions(-) diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 922c14058f9..0a1302fe6d4 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -123,7 +123,7 @@ void cpu_idle(void) } } -void __show_registers(struct pt_regs *regs, int all) +void __show_regs(struct pt_regs *regs, int all) { unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L; unsigned long d0, d1, d2, d3, d6, d7; @@ -189,7 +189,7 @@ void __show_registers(struct pt_regs *regs, int all) void show_regs(struct pt_regs *regs) { - __show_registers(regs, 1); + __show_regs(regs, 1); show_trace(NULL, regs, &regs->sp, regs->bp); } diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index ca80394ef5b..cd8c0ed02b7 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -136,7 +136,7 @@ void cpu_idle(void) } /* Prints also some state that isn't saved in the pt_regs */ -void __show_regs(struct pt_regs *regs) +void __show_regs(struct pt_regs *regs, int all) { unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs; unsigned long d0, d1, d2, d3, d6, d7; @@ -175,6 +175,9 @@ void __show_regs(struct pt_regs *regs) rdmsrl(MSR_GS_BASE, gs); rdmsrl(MSR_KERNEL_GS_BASE, shadowgs); + if (!all) + return; + cr0 = read_cr0(); cr2 = read_cr2(); cr3 = read_cr3(); @@ -200,7 +203,7 @@ void __show_regs(struct pt_regs *regs) void show_regs(struct pt_regs *regs) { printk(KERN_INFO "CPU %d:", smp_processor_id()); - __show_regs(regs); + __show_regs(regs, 1); show_trace(NULL, regs, (void *)(regs + 1), regs->bp); } diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c index bf4d71231d4..4f0831593e3 100644 --- a/arch/x86/kernel/traps_32.c +++ b/arch/x86/kernel/traps_32.c @@ -319,7 +319,7 @@ void show_registers(struct pt_regs *regs) int i; print_modules(); - __show_registers(regs, 0); + __show_regs(regs, 0); printk(KERN_EMERG "Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)", TASK_COMM_LEN, current->comm, task_pid_nr(current), diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c index 1efd1ea6466..729157ee4c1 100644 --- a/arch/x86/kernel/traps_64.c +++ b/arch/x86/kernel/traps_64.c @@ -430,7 +430,7 @@ void show_registers(struct pt_regs *regs) sp = regs->sp; printk("CPU %d ", cpu); - __show_regs(regs); + __show_regs(regs, 1); printk("Process %s (pid: %d, threadinfo %p, task %p)\n", cur->comm, cur->pid, task_thread_info(cur), cur); diff --git a/include/asm-x86/kdebug.h b/include/asm-x86/kdebug.h index 5ec3ad3e825..fbbab66ee9d 100644 --- a/include/asm-x86/kdebug.h +++ b/include/asm-x86/kdebug.h @@ -27,10 +27,9 @@ extern void printk_address(unsigned long address, int reliable); extern void die(const char *, struct pt_regs *,long); extern int __must_check __die(const char *, struct pt_regs *, long); extern void show_registers(struct pt_regs *regs); -extern void __show_registers(struct pt_regs *, int all); extern void show_trace(struct task_struct *t, struct pt_regs *regs, unsigned long *sp, unsigned long bp); -extern void __show_regs(struct pt_regs *regs); +extern void __show_regs(struct pt_regs *regs, int all); extern void show_regs(struct pt_regs *regs); extern unsigned long oops_begin(void); extern void oops_end(unsigned long, struct
pt_regs *, int signr); -- cgit v1.2.3 From 94f6bac1058fd59a8bd472d18c4b77f220d930b0 Mon Sep 17 00:00:00 2001 From: Krzysztof Helt Date: Tue, 30 Sep 2008 23:17:51 +0200 Subject: x86: do not allow to optimize flag_is_changeable_p() (rev. 2) The flag_is_changeable_p() is used by have_cpuid_p() which can return different results in the code sequence below: if (!have_cpuid_p()) identify_cpu_without_cpuid(c); /* cyrix could have cpuid enabled via c_identify()*/ if (!have_cpuid_p()) return; Otherwise, gcc 3.4.6 optimizes these two calls into one, which makes the code not work correctly. Cyrix cpus have the CPUID instruction enabled before the second call to have_cpuid_p() but it is not detected due to the gcc optimization. Thus the ARR registers (mtrr-like) are not detected on such a cpu. Signed-off-by: Krzysztof Helt Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/common.c | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index f1af7185191..25581dcb280 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -124,18 +124,25 @@ static inline int flag_is_changeable_p(u32 flag) { u32 f1, f2; - asm("pushfl\n\t" - "pushfl\n\t" - "popl %0\n\t" - "movl %0,%1\n\t" - "xorl %2,%0\n\t" - "pushl %0\n\t" - "popfl\n\t" - "pushfl\n\t" - "popl %0\n\t" - "popfl\n\t" - : "=&r" (f1), "=&r" (f2) - : "ir" (flag)); + /* + * Cyrix and IDT cpus allow disabling of CPUID + * so the code below may return different results + * when it is executed before and after enabling + * the CPUID. Add "volatile" to not allow gcc to + * optimize the subsequent calls to this function. + */ + asm volatile ("pushfl\n\t" + "pushfl\n\t" + "popl %0\n\t" + "movl %0,%1\n\t" + "xorl %2,%0\n\t" + "pushl %0\n\t" + "popfl\n\t" + "pushfl\n\t" + "popl %0\n\t" + "popfl\n\t" + : "=&r" (f1), "=&r" (f2) + : "ir" (flag)); return ((f1^f2) & flag) != 0; } -- cgit v1.2.3 From 7f2f49a58283110083a7358d2d98025a11653373 Mon Sep 17 00:00:00 2001 From: Chuck Ebbert Date: Thu, 2 Oct 2008 15:30:07 -0400 Subject: x86: allow number of additional hotplug CPUs to be set at compile time, V2 The default number of additional CPU IDs for hotplugging is determined by asking ACPI or mptables how many "disabled" CPUs there are in the system, but many systems get this wrong so that e.g. a uniprocessor machine gets an extra CPU allocated and never switches to single CPU mode. And sometimes CPU hotplugging is enabled only for suspend/hibernate anyway, so the additional CPU IDs are not wanted. Allow the number to be set to zero at compile time. Also, force the number of extra CPUs to zero if hotplugging is disabled, which allows removing some conditional code. Tested on a uniprocessor x86_64 that ACPI claims has a disabled processor, with CPU hotplugging configured. ("After" has the number of additional CPUs set to 0) Before: NR_CPUS: 512, nr_cpu_ids: 2, nr_node_ids 1 After: NR_CPUS: 512, nr_cpu_ids: 1, nr_node_ids 1 [Changed the name of the option and the prompt according to Ingo's suggestion.] Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 18 ++++++++++++++++++ arch/x86/kernel/smpboot.c | 2 +- 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index fc8351f374f..adaca6fa927 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1438,6 +1438,24 @@ config HOTPLUG_CPU automatically on SMP systems.
) Say N if you want to disable CPU hotplug. +config HOTPLUG_RESTRICT_TO_BOOTUP_CPUS + def_bool n + prompt "Restrict CPU hotplugging to processors found during boot" if HOTPLUG_CPU + ---help--- + Say no here to use the default, which allows as many CPUs as are marked + "disabled" by ACPI or MPTABLES to be hotplugged after bootup. + + Say yes if you do not want to allow CPUs to be added after booting, for + example if you only need CPU hotplugging enabled for suspend/resume. + + If CPU_HOTPLUG is enabled this value may be overridden at boot time + with the "additional_cpus" kernel parameter. + +config HOTPLUG_ADDITIONAL_CPUS + int + default 0 if !HOTPLUG_CPU || HOTPLUG_RESTRICT_TO_BOOTUP_CPUS + default -1 + config COMPAT_VDSO def_bool y prompt "Compat VDSO support" diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 23913785c26..857a88bb919 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1261,7 +1261,7 @@ void __init native_smp_cpus_done(unsigned int max_cpus) check_nmi_watchdog(); } -static int additional_cpus __initdata = -1; +static int additional_cpus __initdata = CONFIG_HOTPLUG_ADDITIONAL_CPUS; /* * cpu_possible_map should be static, it cannot change as cpu's -- cgit v1.2.3 From af5c2bd16ac2e5688c3bf46ea1f95112d696d294 Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Fri, 3 Oct 2008 17:54:25 +0200 Subject: x86: fix virt_addr_valid() with CONFIG_DEBUG_VIRTUAL=y, v2 virt_addr_valid() calls __pa(), which calls __phys_addr(). With CONFIG_DEBUG_VIRTUAL=y, __phys_addr() will kill the kernel if the address *isn't* valid. That's clearly wrong for virt_addr_valid(). We also incorporate the debugging checks into virt_addr_valid(). Signed-off-by: Vegard Nossum Acked-by: Jiri Slaby Signed-off-by: Ingo Molnar --- arch/x86/kernel/doublefault_32.c | 2 +- arch/x86/mm/ioremap.c | 36 ++++++++++++++++++++++++++++++++++-- include/asm-x86/page.h | 8 +++++++- include/asm-x86/page_32.h | 4 ++-- 4 files changed, 44 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/doublefault_32.c b/arch/x86/kernel/doublefault_32.c index 395acb12b0d..b4f14c6c09d 100644 --- a/arch/x86/kernel/doublefault_32.c +++ b/arch/x86/kernel/doublefault_32.c @@ -66,6 +66,6 @@ struct tss_struct doublefault_tss __cacheline_aligned = { .ds = __USER_DS, .fs = __KERNEL_PERCPU, - .__cr3 = __phys_addr_const((unsigned long)swapper_pg_dir) + .__cr3 = __pa_nodebug(swapper_pg_dir), } }; diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 7fb737c6b54..8876220d906 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -45,6 +45,27 @@ unsigned long __phys_addr(unsigned long x) } EXPORT_SYMBOL(__phys_addr); +bool __virt_addr_valid(unsigned long x) +{ + if (x >= __START_KERNEL_map) { + x -= __START_KERNEL_map; + if (x >= KERNEL_IMAGE_SIZE) + return false; + x += phys_base; + } else { + if (x < PAGE_OFFSET) + return false; + x -= PAGE_OFFSET; + if (system_state == SYSTEM_BOOTING ? 
+ x > MAXMEM : !phys_addr_valid(x)) { + return false; + } + } + + return pfn_valid(x >> PAGE_SHIFT); +} +EXPORT_SYMBOL(__virt_addr_valid); + #else static inline int phys_addr_valid(unsigned long addr) @@ -56,13 +77,24 @@ static inline int phys_addr_valid(unsigned long addr) unsigned long __phys_addr(unsigned long x) { /* VMALLOC_* aren't constants; not available at the boot time */ - VIRTUAL_BUG_ON(x < PAGE_OFFSET || (system_state != SYSTEM_BOOTING && - is_vmalloc_addr((void *)x))); + VIRTUAL_BUG_ON(x < PAGE_OFFSET); + VIRTUAL_BUG_ON(system_state != SYSTEM_BOOTING && + is_vmalloc_addr((void *) x)); return x - PAGE_OFFSET; } EXPORT_SYMBOL(__phys_addr); #endif +bool __virt_addr_valid(unsigned long x) +{ + if (x < PAGE_OFFSET) + return false; + if (system_state != SYSTEM_BOOTING && is_vmalloc_addr((void *) x)) + return false; + return pfn_valid((x - PAGE_OFFSET) >> PAGE_SHIFT); +} +EXPORT_SYMBOL(__virt_addr_valid); + #endif int page_is_ram(unsigned long pagenr) diff --git a/include/asm-x86/page.h b/include/asm-x86/page.h index c9157477675..d4f1d5791fc 100644 --- a/include/asm-x86/page.h +++ b/include/asm-x86/page.h @@ -179,6 +179,7 @@ static inline pteval_t native_pte_flags(pte_t pte) #endif /* CONFIG_PARAVIRT */ #define __pa(x) __phys_addr((unsigned long)(x)) +#define __pa_nodebug(x) __phys_addr_nodebug((unsigned long)(x)) /* __pa_symbol should be used for C visible symbols. This seems to be the official gcc blessed way to do such arithmetic. */ #define __pa_symbol(x) __pa(__phys_reloc_hide((unsigned long)(x))) @@ -188,9 +189,14 @@ static inline pteval_t native_pte_flags(pte_t pte) #define __boot_va(x) __va(x) #define __boot_pa(x) __pa(x) +/* + * virt_to_page(kaddr) returns a valid pointer if and only if + * virt_addr_valid(kaddr) returns true. + */ #define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT) #define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT) -#define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT) +extern bool __virt_addr_valid(unsigned long kaddr); +#define virt_addr_valid(kaddr) __virt_addr_valid((unsigned long) (kaddr)) #endif /* __ASSEMBLY__ */ diff --git a/include/asm-x86/page_32.h b/include/asm-x86/page_32.h index 9c5a737a9af..5d6a68a1067 100644 --- a/include/asm-x86/page_32.h +++ b/include/asm-x86/page_32.h @@ -73,11 +73,11 @@ typedef struct page *pgtable_t; #endif #ifndef __ASSEMBLY__ -#define __phys_addr_const(x) ((x) - PAGE_OFFSET) +#define __phys_addr_nodebug(x) ((x) - PAGE_OFFSET) #ifdef CONFIG_DEBUG_VIRTUAL extern unsigned long __phys_addr(unsigned long); #else -#define __phys_addr(x) ((x) - PAGE_OFFSET) +#define __phys_addr(x) __phys_addr_nodebug(x) #endif #define __phys_reloc_hide(x) RELOC_HIDE((x), 0) -- cgit v1.2.3 From 2bc5f927d489f9e47b6fa71f323b653e8ec81782 Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Tue, 30 Sep 2008 13:12:14 +0200 Subject: i386: split out dumpstack code from traps_32.c The dumpstack code is logically quite independent from the hardware traps. Split it out into its own file. 
Signed-off-by: Alexander van Heukelum Signed-off-by: Ingo Molnar --- arch/x86/kernel/Makefile | 2 +- arch/x86/kernel/dumpstack_32.c | 422 +++++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/traps_32.c | 404 --------------------------------------- 3 files changed, 423 insertions(+), 405 deletions(-) create mode 100644 arch/x86/kernel/dumpstack_32.c diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 5098585f87c..d5bde5d8c2b 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -28,7 +28,7 @@ obj-y += time_$(BITS).o ioport.o ldt.o obj-y += setup.o i8259.o irqinit_$(BITS).o setup_percpu.o obj-$(CONFIG_X86_VISWS) += visws_quirks.o obj-$(CONFIG_X86_32) += probe_roms_32.o -obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o +obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o dumpstack_32.o obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o obj-y += bootflag.o e820.o diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c new file mode 100644 index 00000000000..c398b27df6c --- /dev/null +++ b/arch/x86/kernel/dumpstack_32.c @@ -0,0 +1,422 @@ +/* + * Copyright (C) 1991, 1992 Linus Torvalds + * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs + */ +#include <linux/kallsyms.h> +#include <linux/kprobes.h> +#include <linux/uaccess.h> +#include <linux/utsname.h> +#include <linux/hardirq.h> +#include <linux/kdebug.h> +#include <linux/module.h> +#include <linux/ptrace.h> +#include <linux/kexec.h> +#include <linux/bug.h> +#include <linux/nmi.h> + +#include <asm/stacktrace.h> + +int panic_on_unrecovered_nmi; +int kstack_depth_to_print = 24; +static unsigned int code_bytes = 64; +static int die_counter; + +void printk_address(unsigned long address, int reliable) +{ +#ifdef CONFIG_KALLSYMS + unsigned long offset = 0; + unsigned long symsize; + const char *symname; + char *modname; + char *delim = ":"; + char namebuf[KSYM_NAME_LEN]; + char reliab[4] = ""; + + symname = kallsyms_lookup(address, &symsize, &offset, + &modname, namebuf); + if (!symname) { + printk(" [<%08lx>]\n", address); + return; + } + if (!reliable) + strcpy(reliab, "? 
"); + + if (!modname) + modname = delim = ""; + printk(" [<%08lx>] %s%s%s%s%s+0x%lx/0x%lx\n", + address, reliab, delim, modname, delim, symname, offset, symsize); +#else + printk(" [<%08lx>]\n", address); +#endif +} + +static inline int valid_stack_ptr(struct thread_info *tinfo, + void *p, unsigned int size) +{ + void *t = tinfo; + return p > t && p <= t + THREAD_SIZE - size; +} + +/* The form of the top of the frame on the stack */ +struct stack_frame { + struct stack_frame *next_frame; + unsigned long return_address; +}; + +static inline unsigned long +print_context_stack(struct thread_info *tinfo, + unsigned long *stack, unsigned long bp, + const struct stacktrace_ops *ops, void *data) +{ + struct stack_frame *frame = (struct stack_frame *)bp; + + while (valid_stack_ptr(tinfo, stack, sizeof(*stack))) { + unsigned long addr; + + addr = *stack; + if (__kernel_text_address(addr)) { + if ((unsigned long) stack == bp + 4) { + ops->address(data, addr, 1); + frame = frame->next_frame; + bp = (unsigned long) frame; + } else { + ops->address(data, addr, bp == 0); + } + } + stack++; + } + return bp; +} + +void dump_trace(struct task_struct *task, struct pt_regs *regs, + unsigned long *stack, unsigned long bp, + const struct stacktrace_ops *ops, void *data) +{ + if (!task) + task = current; + + if (!stack) { + unsigned long dummy; + stack = &dummy; + if (task != current) + stack = (unsigned long *)task->thread.sp; + } + +#ifdef CONFIG_FRAME_POINTER + if (!bp) { + if (task == current) { + /* Grab bp right from our regs */ + asm("movl %%ebp, %0" : "=r" (bp) :); + } else { + /* bp is the last reg pushed by switch_to */ + bp = *(unsigned long *) task->thread.sp; + } + } +#endif + + for (;;) { + struct thread_info *context; + + context = (struct thread_info *) + ((unsigned long)stack & (~(THREAD_SIZE - 1))); + bp = print_context_stack(context, stack, bp, ops, data); + /* + * Should be after the line below, but somewhere + * in early boot context comes out corrupted and we + * can't reference it: + */ + if (ops->stack(data, "IRQ") < 0) + break; + stack = (unsigned long *)context->previous_esp; + if (!stack) + break; + touch_nmi_watchdog(); + } +} +EXPORT_SYMBOL(dump_trace); + +static void +print_trace_warning_symbol(void *data, char *msg, unsigned long symbol) +{ + printk(data); + print_symbol(msg, symbol); + printk("\n"); +} + +static void print_trace_warning(void *data, char *msg) +{ + printk("%s%s\n", (char *)data, msg); +} + +static int print_trace_stack(void *data, char *name) +{ + return 0; +} + +/* + * Print one address/symbol entries per line. + */ +static void print_trace_address(void *data, unsigned long addr, int reliable) +{ + printk("%s [<%08lx>] ", (char *)data, addr); + if (!reliable) + printk("? 
"); + print_symbol("%s\n", addr); + touch_nmi_watchdog(); +} + +static const struct stacktrace_ops print_trace_ops = { + .warning = print_trace_warning, + .warning_symbol = print_trace_warning_symbol, + .stack = print_trace_stack, + .address = print_trace_address, +}; + +static void +show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, + unsigned long *stack, unsigned long bp, char *log_lvl) +{ + dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl); + printk("%s =======================\n", log_lvl); +} + +void show_trace(struct task_struct *task, struct pt_regs *regs, + unsigned long *stack, unsigned long bp) +{ + show_trace_log_lvl(task, regs, stack, bp, ""); +} + +static void +show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, + unsigned long *sp, unsigned long bp, char *log_lvl) +{ + unsigned long *stack; + int i; + + if (sp == NULL) { + if (task) + sp = (unsigned long *)task->thread.sp; + else + sp = (unsigned long *)&sp; + } + + stack = sp; + for (i = 0; i < kstack_depth_to_print; i++) { + if (kstack_end(stack)) + break; + if (i && ((i % 8) == 0)) + printk("\n%s ", log_lvl); + printk("%08lx ", *stack++); + } + printk("\n%sCall Trace:\n", log_lvl); + + show_trace_log_lvl(task, regs, sp, bp, log_lvl); +} + +void show_stack(struct task_struct *task, unsigned long *sp) +{ + printk(" "); + show_stack_log_lvl(task, NULL, sp, 0, ""); +} + +/* + * The architecture-independent dump_stack generator + */ +void dump_stack(void) +{ + unsigned long bp = 0; + unsigned long stack; + +#ifdef CONFIG_FRAME_POINTER + if (!bp) + asm("movl %%ebp, %0" : "=r" (bp):); +#endif + + printk("Pid: %d, comm: %.20s %s %s %.*s\n", + current->pid, current->comm, print_tainted(), + init_utsname()->release, + (int)strcspn(init_utsname()->version, " "), + init_utsname()->version); + + show_trace(current, NULL, &stack, bp); +} + +EXPORT_SYMBOL(dump_stack); + +void show_registers(struct pt_regs *regs) +{ + int i; + + print_modules(); + __show_regs(regs, 0); + + printk(KERN_EMERG "Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)", + TASK_COMM_LEN, current->comm, task_pid_nr(current), + current_thread_info(), current, task_thread_info(current)); + /* + * When in-kernel, we also print out the stack and code at the + * time of the fault.. 
+ */ + if (!user_mode_vm(regs)) { + unsigned int code_prologue = code_bytes * 43 / 64; + unsigned int code_len = code_bytes; + unsigned char c; + u8 *ip; + + printk("\n" KERN_EMERG "Stack: "); + show_stack_log_lvl(NULL, regs, &regs->sp, 0, KERN_EMERG); + + printk(KERN_EMERG "Code: "); + + ip = (u8 *)regs->ip - code_prologue; + if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) { + /* try starting at EIP */ + ip = (u8 *)regs->ip; + code_len = code_len - code_prologue + 1; + } + for (i = 0; i < code_len; i++, ip++) { + if (ip < (u8 *)PAGE_OFFSET || + probe_kernel_address(ip, c)) { + printk(" Bad EIP value."); + break; + } + if (ip == (u8 *)regs->ip) + printk("<%02x> ", c); + else + printk("%02x ", c); + } + } + printk("\n"); +} + +int is_valid_bugaddr(unsigned long ip) +{ + unsigned short ud2; + + if (ip < PAGE_OFFSET) + return 0; + if (probe_kernel_address((unsigned short *)ip, ud2)) + return 0; + + return ud2 == 0x0b0f; +} + +static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED; +static int die_owner = -1; +static unsigned int die_nest_count; + +unsigned __kprobes long oops_begin(void) +{ + unsigned long flags; + + oops_enter(); + + if (die_owner != raw_smp_processor_id()) { + console_verbose(); + raw_local_irq_save(flags); + __raw_spin_lock(&die_lock); + die_owner = smp_processor_id(); + die_nest_count = 0; + bust_spinlocks(1); + } else { + raw_local_irq_save(flags); + } + die_nest_count++; + return flags; +} + +void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr) +{ + bust_spinlocks(0); + die_owner = -1; + add_taint(TAINT_DIE); + __raw_spin_unlock(&die_lock); + raw_local_irq_restore(flags); + + if (!regs) + return; + + if (kexec_should_crash(current)) + crash_kexec(regs); + + if (in_interrupt()) + panic("Fatal exception in interrupt"); + + if (panic_on_oops) + panic("Fatal exception"); + + oops_exit(); + do_exit(signr); +} + +int __kprobes __die(const char *str, struct pt_regs *regs, long err) +{ + unsigned short ss; + unsigned long sp; + + printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter); +#ifdef CONFIG_PREEMPT + printk("PREEMPT "); +#endif +#ifdef CONFIG_SMP + printk("SMP "); +#endif +#ifdef CONFIG_DEBUG_PAGEALLOC + printk("DEBUG_PAGEALLOC"); +#endif + printk("\n"); + if (notify_die(DIE_OOPS, str, regs, err, + current->thread.trap_no, SIGSEGV) == NOTIFY_STOP) + return 1; + + show_registers(regs); + /* Executive summary in case the oops scrolled away */ + sp = (unsigned long) (&regs->sp); + savesegment(ss, ss); + if (user_mode(regs)) { + sp = regs->sp; + ss = regs->ss & 0xffff; + } + printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip); + print_symbol("%s", regs->ip); + printk(" SS:ESP %04x:%08lx\n", ss, sp); + return 0; +} + +/* + * This is gone through when something in the kernel has done something bad + * and is about to be terminated: + */ +void die(const char *str, struct pt_regs *regs, long err) +{ + unsigned long flags = oops_begin(); + + if (die_nest_count < 3) { + report_bug(regs->ip, regs); + + if (__die(str, regs, err)) + regs = NULL; + } else { + printk(KERN_EMERG "Recursive die() failure, output suppressed\n"); + } + + oops_end(flags, regs, SIGSEGV); +} + +static int __init kstack_setup(char *s) +{ + kstack_depth_to_print = simple_strtoul(s, NULL, 0); + + return 1; +} +__setup("kstack=", kstack_setup); + +static int __init code_bytes_setup(char *s) +{ + code_bytes = simple_strtoul(s, NULL, 0); + if (code_bytes > 8192) + code_bytes = 8192; + + return 1; +} +__setup("code_bytes=", code_bytes_setup); diff --git 
a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c index 4f0831593e3..ce1063c141f 100644 --- a/arch/x86/kernel/traps_32.c +++ b/arch/x86/kernel/traps_32.c @@ -80,11 +80,7 @@ char ignore_fpu_irq; gate_desc idt_table[256] __attribute__((__section__(".data.idt"))) = { { { { 0, 0 } } }, }; -int panic_on_unrecovered_nmi; -int kstack_depth_to_print = 24; -static unsigned int code_bytes = 64; static int ignore_nmis; -static int die_counter; static inline void conditional_sti(struct pt_regs *regs) { @@ -92,388 +88,6 @@ static inline void conditional_sti(struct pt_regs *regs) local_irq_enable(); } -void printk_address(unsigned long address, int reliable) -{ -#ifdef CONFIG_KALLSYMS - unsigned long offset = 0; - unsigned long symsize; - const char *symname; - char *modname; - char *delim = ":"; - char namebuf[KSYM_NAME_LEN]; - char reliab[4] = ""; - - symname = kallsyms_lookup(address, &symsize, &offset, - &modname, namebuf); - if (!symname) { - printk(" [<%08lx>]\n", address); - return; - } - if (!reliable) - strcpy(reliab, "? "); - - if (!modname) - modname = delim = ""; - printk(" [<%08lx>] %s%s%s%s%s+0x%lx/0x%lx\n", - address, reliab, delim, modname, delim, symname, offset, symsize); -#else - printk(" [<%08lx>]\n", address); -#endif -} - -static inline int valid_stack_ptr(struct thread_info *tinfo, - void *p, unsigned int size) -{ - void *t = tinfo; - return p > t && p <= t + THREAD_SIZE - size; -} - -/* The form of the top of the frame on the stack */ -struct stack_frame { - struct stack_frame *next_frame; - unsigned long return_address; -}; - -static inline unsigned long -print_context_stack(struct thread_info *tinfo, - unsigned long *stack, unsigned long bp, - const struct stacktrace_ops *ops, void *data) -{ - struct stack_frame *frame = (struct stack_frame *)bp; - - while (valid_stack_ptr(tinfo, stack, sizeof(*stack))) { - unsigned long addr; - - addr = *stack; - if (__kernel_text_address(addr)) { - if ((unsigned long) stack == bp + 4) { - ops->address(data, addr, 1); - frame = frame->next_frame; - bp = (unsigned long) frame; - } else { - ops->address(data, addr, bp == 0); - } - } - stack++; - } - return bp; -} - -void dump_trace(struct task_struct *task, struct pt_regs *regs, - unsigned long *stack, unsigned long bp, - const struct stacktrace_ops *ops, void *data) -{ - if (!task) - task = current; - - if (!stack) { - unsigned long dummy; - stack = &dummy; - if (task != current) - stack = (unsigned long *)task->thread.sp; - } - -#ifdef CONFIG_FRAME_POINTER - if (!bp) { - if (task == current) { - /* Grab bp right from our regs */ - asm("movl %%ebp, %0" : "=r" (bp) :); - } else { - /* bp is the last reg pushed by switch_to */ - bp = *(unsigned long *) task->thread.sp; - } - } -#endif - - for (;;) { - struct thread_info *context; - - context = (struct thread_info *) - ((unsigned long)stack & (~(THREAD_SIZE - 1))); - bp = print_context_stack(context, stack, bp, ops, data); - /* - * Should be after the line below, but somewhere - * in early boot context comes out corrupted and we - * can't reference it: - */ - if (ops->stack(data, "IRQ") < 0) - break; - stack = (unsigned long *)context->previous_esp; - if (!stack) - break; - touch_nmi_watchdog(); - } -} -EXPORT_SYMBOL(dump_trace); - -static void -print_trace_warning_symbol(void *data, char *msg, unsigned long symbol) -{ - printk(data); - print_symbol(msg, symbol); - printk("\n"); -} - -static void print_trace_warning(void *data, char *msg) -{ - printk("%s%s\n", (char *)data, msg); -} - -static int print_trace_stack(void *data, char *name) 
-{ - return 0; -} - -/* - * Print one address/symbol entries per line. - */ -static void print_trace_address(void *data, unsigned long addr, int reliable) -{ - printk("%s [<%08lx>] ", (char *)data, addr); - if (!reliable) - printk("? "); - print_symbol("%s\n", addr); - touch_nmi_watchdog(); -} - -static const struct stacktrace_ops print_trace_ops = { - .warning = print_trace_warning, - .warning_symbol = print_trace_warning_symbol, - .stack = print_trace_stack, - .address = print_trace_address, -}; - -static void -show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *stack, unsigned long bp, char *log_lvl) -{ - dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl); - printk("%s =======================\n", log_lvl); -} - -void show_trace(struct task_struct *task, struct pt_regs *regs, - unsigned long *stack, unsigned long bp) -{ - show_trace_log_lvl(task, regs, stack, bp, ""); -} - -static void -show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *sp, unsigned long bp, char *log_lvl) -{ - unsigned long *stack; - int i; - - if (sp == NULL) { - if (task) - sp = (unsigned long *)task->thread.sp; - else - sp = (unsigned long *)&sp; - } - - stack = sp; - for (i = 0; i < kstack_depth_to_print; i++) { - if (kstack_end(stack)) - break; - if (i && ((i % 8) == 0)) - printk("\n%s ", log_lvl); - printk("%08lx ", *stack++); - } - printk("\n%sCall Trace:\n", log_lvl); - - show_trace_log_lvl(task, regs, sp, bp, log_lvl); -} - -void show_stack(struct task_struct *task, unsigned long *sp) -{ - printk(" "); - show_stack_log_lvl(task, NULL, sp, 0, ""); -} - -/* - * The architecture-independent dump_stack generator - */ -void dump_stack(void) -{ - unsigned long bp = 0; - unsigned long stack; - -#ifdef CONFIG_FRAME_POINTER - if (!bp) - asm("movl %%ebp, %0" : "=r" (bp):); -#endif - - printk("Pid: %d, comm: %.20s %s %s %.*s\n", - current->pid, current->comm, print_tainted(), - init_utsname()->release, - (int)strcspn(init_utsname()->version, " "), - init_utsname()->version); - - show_trace(current, NULL, &stack, bp); -} - -EXPORT_SYMBOL(dump_stack); - -void show_registers(struct pt_regs *regs) -{ - int i; - - print_modules(); - __show_regs(regs, 0); - - printk(KERN_EMERG "Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)", - TASK_COMM_LEN, current->comm, task_pid_nr(current), - current_thread_info(), current, task_thread_info(current)); - /* - * When in-kernel, we also print out the stack and code at the - * time of the fault.. 
- */ - if (!user_mode_vm(regs)) { - unsigned int code_prologue = code_bytes * 43 / 64; - unsigned int code_len = code_bytes; - unsigned char c; - u8 *ip; - - printk("\n" KERN_EMERG "Stack: "); - show_stack_log_lvl(NULL, regs, &regs->sp, 0, KERN_EMERG); - - printk(KERN_EMERG "Code: "); - - ip = (u8 *)regs->ip - code_prologue; - if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) { - /* try starting at EIP */ - ip = (u8 *)regs->ip; - code_len = code_len - code_prologue + 1; - } - for (i = 0; i < code_len; i++, ip++) { - if (ip < (u8 *)PAGE_OFFSET || - probe_kernel_address(ip, c)) { - printk(" Bad EIP value."); - break; - } - if (ip == (u8 *)regs->ip) - printk("<%02x> ", c); - else - printk("%02x ", c); - } - } - printk("\n"); -} - -int is_valid_bugaddr(unsigned long ip) -{ - unsigned short ud2; - - if (ip < PAGE_OFFSET) - return 0; - if (probe_kernel_address((unsigned short *)ip, ud2)) - return 0; - - return ud2 == 0x0b0f; -} - -static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED; -static int die_owner = -1; -static unsigned int die_nest_count; - -unsigned __kprobes long oops_begin(void) -{ - unsigned long flags; - - oops_enter(); - - if (die_owner != raw_smp_processor_id()) { - console_verbose(); - raw_local_irq_save(flags); - __raw_spin_lock(&die_lock); - die_owner = smp_processor_id(); - die_nest_count = 0; - bust_spinlocks(1); - } else { - raw_local_irq_save(flags); - } - die_nest_count++; - return flags; -} - -void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr) -{ - bust_spinlocks(0); - die_owner = -1; - add_taint(TAINT_DIE); - __raw_spin_unlock(&die_lock); - raw_local_irq_restore(flags); - - if (!regs) - return; - - if (kexec_should_crash(current)) - crash_kexec(regs); - - if (in_interrupt()) - panic("Fatal exception in interrupt"); - - if (panic_on_oops) - panic("Fatal exception"); - - oops_exit(); - do_exit(signr); -} - -int __kprobes __die(const char *str, struct pt_regs *regs, long err) -{ - unsigned short ss; - unsigned long sp; - - printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter); -#ifdef CONFIG_PREEMPT - printk("PREEMPT "); -#endif -#ifdef CONFIG_SMP - printk("SMP "); -#endif -#ifdef CONFIG_DEBUG_PAGEALLOC - printk("DEBUG_PAGEALLOC"); -#endif - printk("\n"); - if (notify_die(DIE_OOPS, str, regs, err, - current->thread.trap_no, SIGSEGV) == NOTIFY_STOP) - return 1; - - show_registers(regs); - /* Executive summary in case the oops scrolled away */ - sp = (unsigned long) (&regs->sp); - savesegment(ss, ss); - if (user_mode(regs)) { - sp = regs->sp; - ss = regs->ss & 0xffff; - } - printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip); - print_symbol("%s", regs->ip); - printk(" SS:ESP %04x:%08lx\n", ss, sp); - return 0; -} - -/* - * This is gone through when something in the kernel has done something bad - * and is about to be terminated: - */ -void die(const char *str, struct pt_regs *regs, long err) -{ - unsigned long flags = oops_begin(); - - if (die_nest_count < 3) { - report_bug(regs->ip, regs); - - if (__die(str, regs, err)) - regs = NULL; - } else { - printk(KERN_EMERG "Recursive die() failure, output suppressed\n"); - } - - oops_end(flags, regs, SIGSEGV); -} - static inline void die_if_kernel(const char *str, struct pt_regs *regs, long err) { @@ -1256,21 +870,3 @@ void __init trap_init(void) trap_init_hook(); } - -static int __init kstack_setup(char *s) -{ - kstack_depth_to_print = simple_strtoul(s, NULL, 0); - - return 1; -} -__setup("kstack=", kstack_setup); - -static int __init code_bytes_setup(char *s) -{ - code_bytes = 
simple_strtoul(s, NULL, 0); - if (code_bytes > 8192) - code_bytes = 8192; - - return 1; -} -__setup("code_bytes=", code_bytes_setup); -- cgit v1.2.3 From 6fcbede3fdfbd83d8de97296286f5a9ff5a8f371 Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Tue, 30 Sep 2008 13:12:15 +0200 Subject: x86_64: split out dumpstack code from traps_64.c The dumpstack code is logically quite independent from the hardware traps. Split it out into its own file. Signed-off-by: Alexander van Heukelum Signed-off-by: Ingo Molnar --- arch/x86/kernel/Makefile | 4 +- arch/x86/kernel/dumpstack_64.c | 565 +++++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/traps_64.c | 545 --------------------------------------- 3 files changed, 567 insertions(+), 547 deletions(-) create mode 100644 arch/x86/kernel/dumpstack_64.c diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index d5bde5d8c2b..de63fed9fae 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -23,12 +23,12 @@ CFLAGS_hpet.o := $(nostackp) CFLAGS_tsc.o := $(nostackp) obj-y := process_$(BITS).o signal_$(BITS).o entry_$(BITS).o -obj-y += traps_$(BITS).o irq_$(BITS).o +obj-y += traps_$(BITS).o irq_$(BITS).o dumpstack_$(BITS).o obj-y += time_$(BITS).o ioport.o ldt.o obj-y += setup.o i8259.o irqinit_$(BITS).o setup_percpu.o obj-$(CONFIG_X86_VISWS) += visws_quirks.o obj-$(CONFIG_X86_32) += probe_roms_32.o -obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o dumpstack_32.o +obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o obj-y += bootflag.o e820.o diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c new file mode 100644 index 00000000000..6f1505074db --- /dev/null +++ b/arch/x86/kernel/dumpstack_64.c @@ -0,0 +1,565 @@ +/* + * Copyright (C) 1991, 1992 Linus Torvalds + * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +int panic_on_unrecovered_nmi; +int kstack_depth_to_print = 12; +static unsigned int code_bytes = 64; +static int die_counter; + +void printk_address(unsigned long address, int reliable) +{ + printk(" [<%016lx>] %s%pS\n", + address, reliable ? "" : "? ", (void *) address); +} + +static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, + unsigned *usedp, char **idp) +{ + static char ids[][8] = { + [DEBUG_STACK - 1] = "#DB", + [NMI_STACK - 1] = "NMI", + [DOUBLEFAULT_STACK - 1] = "#DF", + [STACKFAULT_STACK - 1] = "#SS", + [MCE_STACK - 1] = "#MC", +#if DEBUG_STKSZ > EXCEPTION_STKSZ + [N_EXCEPTION_STACKS ... + N_EXCEPTION_STACKS + DEBUG_STKSZ / EXCEPTION_STKSZ - 2] = "#DB[?]" +#endif + }; + unsigned k; + + /* + * Iterate over all exception stacks, and figure out whether + * 'stack' is in one of them: + */ + for (k = 0; k < N_EXCEPTION_STACKS; k++) { + unsigned long end = per_cpu(orig_ist, cpu).ist[k]; + /* + * Is 'stack' above this exception frame's end? + * If yes then skip to the next frame. + */ + if (stack >= end) + continue; + /* + * Is 'stack' above this exception frame's start address? + * If yes then we found the right frame. + */ + if (stack >= end - EXCEPTION_STKSZ) { + /* + * Make sure we only iterate through an exception + * stack once. 
If it comes up for the second time + * then there's something wrong going on - just + * break out and return NULL: + */ + if (*usedp & (1U << k)) + break; + *usedp |= 1U << k; + *idp = ids[k]; + return (unsigned long *)end; + } + /* + * If this is a debug stack, and if it has a larger size than + * the usual exception stacks, then 'stack' might still + * be within the lower portion of the debug stack: + */ +#if DEBUG_STKSZ > EXCEPTION_STKSZ + if (k == DEBUG_STACK - 1 && stack >= end - DEBUG_STKSZ) { + unsigned j = N_EXCEPTION_STACKS - 1; + + /* + * Black magic. A large debug stack is composed of + * multiple exception stack entries, which we + * iterate through now. Dont look: + */ + do { + ++j; + end -= EXCEPTION_STKSZ; + ids[j][4] = '1' + (j - N_EXCEPTION_STACKS); + } while (stack < end - EXCEPTION_STKSZ); + if (*usedp & (1U << j)) + break; + *usedp |= 1U << j; + *idp = ids[j]; + return (unsigned long *)end; + } +#endif + } + return NULL; +} + +/* + * x86-64 can have up to three kernel stacks: + * process stack + * interrupt stack + * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack + */ + +static inline int valid_stack_ptr(struct thread_info *tinfo, + void *p, unsigned int size, void *end) +{ + void *t = tinfo; + if (end) { + if (p < end && p >= (end-THREAD_SIZE)) + return 1; + else + return 0; + } + return p > t && p < t + THREAD_SIZE - size; +} + +/* The form of the top of the frame on the stack */ +struct stack_frame { + struct stack_frame *next_frame; + unsigned long return_address; +}; + +static inline unsigned long +print_context_stack(struct thread_info *tinfo, + unsigned long *stack, unsigned long bp, + const struct stacktrace_ops *ops, void *data, + unsigned long *end) +{ + struct stack_frame *frame = (struct stack_frame *)bp; + + while (valid_stack_ptr(tinfo, stack, sizeof(*stack), end)) { + unsigned long addr; + + addr = *stack; + if (__kernel_text_address(addr)) { + if ((unsigned long) stack == bp + 8) { + ops->address(data, addr, 1); + frame = frame->next_frame; + bp = (unsigned long) frame; + } else { + ops->address(data, addr, bp == 0); + } + } + stack++; + } + return bp; +} + +void dump_trace(struct task_struct *task, struct pt_regs *regs, + unsigned long *stack, unsigned long bp, + const struct stacktrace_ops *ops, void *data) +{ + const unsigned cpu = get_cpu(); + unsigned long *irqstack_end = (unsigned long *)cpu_pda(cpu)->irqstackptr; + unsigned used = 0; + struct thread_info *tinfo; + + if (!task) + task = current; + + if (!stack) { + unsigned long dummy; + stack = &dummy; + if (task && task != current) + stack = (unsigned long *)task->thread.sp; + } + +#ifdef CONFIG_FRAME_POINTER + if (!bp) { + if (task == current) { + /* Grab bp right from our regs */ + asm("movq %%rbp, %0" : "=r" (bp) : ); + } else { + /* bp is the last reg pushed by switch_to */ + bp = *(unsigned long *) task->thread.sp; + } + } +#endif + + /* + * Print function call entries in all stacks, starting at the + * current stack address. 
If the stacks consist of nested + * exceptions + */ + tinfo = task_thread_info(task); + for (;;) { + char *id; + unsigned long *estack_end; + estack_end = in_exception_stack(cpu, (unsigned long)stack, + &used, &id); + + if (estack_end) { + if (ops->stack(data, id) < 0) + break; + + bp = print_context_stack(tinfo, stack, bp, ops, + data, estack_end); + ops->stack(data, ""); + /* + * We link to the next stack via the + * second-to-last pointer (index -2 to end) in the + * exception stack: + */ + stack = (unsigned long *) estack_end[-2]; + continue; + } + if (irqstack_end) { + unsigned long *irqstack; + irqstack = irqstack_end - + (IRQSTACKSIZE - 64) / sizeof(*irqstack); + + if (stack >= irqstack && stack < irqstack_end) { + if (ops->stack(data, "IRQ") < 0) + break; + bp = print_context_stack(tinfo, stack, bp, + ops, data, irqstack_end); + /* + * We link to the next stack (which would be + * the process stack normally) the last + * pointer (index -1 to end) in the IRQ stack: + */ + stack = (unsigned long *) (irqstack_end[-1]); + irqstack_end = NULL; + ops->stack(data, "EOI"); + continue; + } + } + break; + } + + /* + * This handles the process stack: + */ + bp = print_context_stack(tinfo, stack, bp, ops, data, NULL); + put_cpu(); +} +EXPORT_SYMBOL(dump_trace); + +static void +print_trace_warning_symbol(void *data, char *msg, unsigned long symbol) +{ + print_symbol(msg, symbol); + printk("\n"); +} + +static void print_trace_warning(void *data, char *msg) +{ + printk("%s\n", msg); +} + +static int print_trace_stack(void *data, char *name) +{ + printk(" <%s> ", name); + return 0; +} + +static void print_trace_address(void *data, unsigned long addr, int reliable) +{ + touch_nmi_watchdog(); + printk_address(addr, reliable); +} + +static const struct stacktrace_ops print_trace_ops = { + .warning = print_trace_warning, + .warning_symbol = print_trace_warning_symbol, + .stack = print_trace_stack, + .address = print_trace_address, +}; + +static void +show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, + unsigned long *stack, unsigned long bp, char *log_lvl) +{ + printk("Call Trace:\n"); + dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl); +} + +void show_trace(struct task_struct *task, struct pt_regs *regs, + unsigned long *stack, unsigned long bp) +{ + show_trace_log_lvl(task, regs, stack, bp, ""); +} + +static void +show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, + unsigned long *sp, unsigned long bp, char *log_lvl) +{ + unsigned long *stack; + int i; + const int cpu = smp_processor_id(); + unsigned long *irqstack_end = + (unsigned long *) (cpu_pda(cpu)->irqstackptr); + unsigned long *irqstack = + (unsigned long *) (cpu_pda(cpu)->irqstackptr - IRQSTACKSIZE); + + /* + * debugging aid: "show_stack(NULL, NULL);" prints the + * back trace for this cpu. 
+ */ + + if (sp == NULL) { + if (task) + sp = (unsigned long *)task->thread.sp; + else + sp = (unsigned long *)&sp; + } + + stack = sp; + for (i = 0; i < kstack_depth_to_print; i++) { + if (stack >= irqstack && stack <= irqstack_end) { + if (stack == irqstack_end) { + stack = (unsigned long *) (irqstack_end[-1]); + printk(" "); + } + } else { + if (((long) stack & (THREAD_SIZE-1)) == 0) + break; + } + if (i && ((i % 4) == 0)) + printk("\n"); + printk(" %016lx", *stack++); + touch_nmi_watchdog(); + } + printk("\n"); + show_trace_log_lvl(task, regs, sp, bp, log_lvl); +} + +void show_stack(struct task_struct *task, unsigned long *sp) +{ + show_stack_log_lvl(task, NULL, sp, 0, ""); +} + +/* + * The architecture-independent dump_stack generator + */ +void dump_stack(void) +{ + unsigned long bp = 0; + unsigned long stack; + +#ifdef CONFIG_FRAME_POINTER + if (!bp) + asm("movq %%rbp, %0" : "=r" (bp) : ); +#endif + + printk("Pid: %d, comm: %.20s %s %s %.*s\n", + current->pid, current->comm, print_tainted(), + init_utsname()->release, + (int)strcspn(init_utsname()->version, " "), + init_utsname()->version); + show_trace(NULL, NULL, &stack, bp); +} +EXPORT_SYMBOL(dump_stack); + +void show_registers(struct pt_regs *regs) +{ + int i; + unsigned long sp; + const int cpu = smp_processor_id(); + struct task_struct *cur = cpu_pda(cpu)->pcurrent; + + sp = regs->sp; + printk("CPU %d ", cpu); + __show_regs(regs, 1); + printk("Process %s (pid: %d, threadinfo %p, task %p)\n", + cur->comm, cur->pid, task_thread_info(cur), cur); + + /* + * When in-kernel, we also print out the stack and code at the + * time of the fault.. + */ + if (!user_mode(regs)) { + unsigned int code_prologue = code_bytes * 43 / 64; + unsigned int code_len = code_bytes; + unsigned char c; + u8 *ip; + + printk("Stack: "); + show_stack_log_lvl(NULL, regs, (unsigned long *)sp, + regs->bp, ""); + + printk(KERN_EMERG "Code: "); + + ip = (u8 *)regs->ip - code_prologue; + if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) { + /* try starting at RIP */ + ip = (u8 *)regs->ip; + code_len = code_len - code_prologue + 1; + } + for (i = 0; i < code_len; i++, ip++) { + if (ip < (u8 *)PAGE_OFFSET || + probe_kernel_address(ip, c)) { + printk(" Bad RIP value."); + break; + } + if (ip == (u8 *)regs->ip) + printk("<%02x> ", c); + else + printk("%02x ", c); + } + } + printk("\n"); +} + +int is_valid_bugaddr(unsigned long ip) +{ + unsigned short ud2; + + if (__copy_from_user(&ud2, (const void __user *) ip, sizeof(ud2))) + return 0; + + return ud2 == 0x0b0f; +} + +static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED; +static int die_owner = -1; +static unsigned int die_nest_count; + +unsigned __kprobes long oops_begin(void) +{ + int cpu; + unsigned long flags; + + oops_enter(); + + /* racy, but better than risking deadlock. */ + raw_local_irq_save(flags); + cpu = smp_processor_id(); + if (!__raw_spin_trylock(&die_lock)) { + if (cpu == die_owner) + /* nested oops. should stop eventually */; + else + __raw_spin_lock(&die_lock); + } + die_nest_count++; + die_owner = cpu; + console_verbose(); + bust_spinlocks(1); + return flags; +} + +void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr) +{ + die_owner = -1; + bust_spinlocks(0); + die_nest_count--; + if (!die_nest_count) + /* Nest count reaches zero, release the lock. 
*/ + __raw_spin_unlock(&die_lock); + raw_local_irq_restore(flags); + if (!regs) { + oops_exit(); + return; + } + if (in_interrupt()) + panic("Fatal exception in interrupt"); + if (panic_on_oops) + panic("Fatal exception"); + oops_exit(); + do_exit(signr); +} + +int __kprobes __die(const char *str, struct pt_regs *regs, long err) +{ + printk(KERN_EMERG "%s: %04lx [%u] ", str, err & 0xffff, ++die_counter); +#ifdef CONFIG_PREEMPT + printk("PREEMPT "); +#endif +#ifdef CONFIG_SMP + printk("SMP "); +#endif +#ifdef CONFIG_DEBUG_PAGEALLOC + printk("DEBUG_PAGEALLOC"); +#endif + printk("\n"); + if (notify_die(DIE_OOPS, str, regs, err, + current->thread.trap_no, SIGSEGV) == NOTIFY_STOP) + return 1; + + show_registers(regs); + add_taint(TAINT_DIE); + /* Executive summary in case the oops scrolled away */ + printk(KERN_ALERT "RIP "); + printk_address(regs->ip, 1); + printk(" RSP <%016lx>\n", regs->sp); + if (kexec_should_crash(current)) + crash_kexec(regs); + return 0; +} + +void die(const char *str, struct pt_regs *regs, long err) +{ + unsigned long flags = oops_begin(); + + if (!user_mode(regs)) + report_bug(regs->ip, regs); + + if (__die(str, regs, err)) + regs = NULL; + oops_end(flags, regs, SIGSEGV); +} + +notrace __kprobes void +die_nmi(char *str, struct pt_regs *regs, int do_panic) +{ + unsigned long flags; + + if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP) + return; + + flags = oops_begin(); + /* + * We are in trouble anyway, lets at least try + * to get a message out. + */ + printk(KERN_EMERG "%s", str); + printk(" on CPU%d, ip %08lx, registers:\n", + smp_processor_id(), regs->ip); + show_registers(regs); + if (kexec_should_crash(current)) + crash_kexec(regs); + if (do_panic || panic_on_oops) + panic("Non maskable interrupt"); + oops_end(flags, NULL, SIGBUS); + nmi_exit(); + local_irq_enable(); + do_exit(SIGBUS); +} + +static int __init oops_setup(char *s) +{ + if (!s) + return -EINVAL; + if (!strcmp(s, "panic")) + panic_on_oops = 1; + return 0; +} +early_param("oops", oops_setup); + +static int __init kstack_setup(char *s) +{ + if (!s) + return -EINVAL; + kstack_depth_to_print = simple_strtoul(s, NULL, 0); + return 0; +} +early_param("kstack", kstack_setup); + +static int __init code_bytes_setup(char *s) +{ + code_bytes = simple_strtoul(s, NULL, 0); + if (code_bytes > 8192) + code_bytes = 8192; + + return 1; +} +__setup("code_bytes=", code_bytes_setup); diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c index 729157ee4c1..1cd61ddd90b 100644 --- a/arch/x86/kernel/traps_64.c +++ b/arch/x86/kernel/traps_64.c @@ -54,11 +54,7 @@ #include -int panic_on_unrecovered_nmi; -int kstack_depth_to_print = 12; -static unsigned int code_bytes = 64; static int ignore_nmis; -static int die_counter; static inline void conditional_sti(struct pt_regs *regs) { @@ -82,518 +78,6 @@ static inline void preempt_conditional_cli(struct pt_regs *regs) dec_preempt_count(); } -void printk_address(unsigned long address, int reliable) -{ - printk(" [<%016lx>] %s%pS\n", - address, reliable ? "" : "? ", (void *) address); -} - -static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, - unsigned *usedp, char **idp) -{ - static char ids[][8] = { - [DEBUG_STACK - 1] = "#DB", - [NMI_STACK - 1] = "NMI", - [DOUBLEFAULT_STACK - 1] = "#DF", - [STACKFAULT_STACK - 1] = "#SS", - [MCE_STACK - 1] = "#MC", -#if DEBUG_STKSZ > EXCEPTION_STKSZ - [N_EXCEPTION_STACKS ... 
- N_EXCEPTION_STACKS + DEBUG_STKSZ / EXCEPTION_STKSZ - 2] = "#DB[?]" -#endif - }; - unsigned k; - - /* - * Iterate over all exception stacks, and figure out whether - * 'stack' is in one of them: - */ - for (k = 0; k < N_EXCEPTION_STACKS; k++) { - unsigned long end = per_cpu(orig_ist, cpu).ist[k]; - /* - * Is 'stack' above this exception frame's end? - * If yes then skip to the next frame. - */ - if (stack >= end) - continue; - /* - * Is 'stack' above this exception frame's start address? - * If yes then we found the right frame. - */ - if (stack >= end - EXCEPTION_STKSZ) { - /* - * Make sure we only iterate through an exception - * stack once. If it comes up for the second time - * then there's something wrong going on - just - * break out and return NULL: - */ - if (*usedp & (1U << k)) - break; - *usedp |= 1U << k; - *idp = ids[k]; - return (unsigned long *)end; - } - /* - * If this is a debug stack, and if it has a larger size than - * the usual exception stacks, then 'stack' might still - * be within the lower portion of the debug stack: - */ -#if DEBUG_STKSZ > EXCEPTION_STKSZ - if (k == DEBUG_STACK - 1 && stack >= end - DEBUG_STKSZ) { - unsigned j = N_EXCEPTION_STACKS - 1; - - /* - * Black magic. A large debug stack is composed of - * multiple exception stack entries, which we - * iterate through now. Dont look: - */ - do { - ++j; - end -= EXCEPTION_STKSZ; - ids[j][4] = '1' + (j - N_EXCEPTION_STACKS); - } while (stack < end - EXCEPTION_STKSZ); - if (*usedp & (1U << j)) - break; - *usedp |= 1U << j; - *idp = ids[j]; - return (unsigned long *)end; - } -#endif - } - return NULL; -} - -/* - * x86-64 can have up to three kernel stacks: - * process stack - * interrupt stack - * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack - */ - -static inline int valid_stack_ptr(struct thread_info *tinfo, - void *p, unsigned int size, void *end) -{ - void *t = tinfo; - if (end) { - if (p < end && p >= (end-THREAD_SIZE)) - return 1; - else - return 0; - } - return p > t && p < t + THREAD_SIZE - size; -} - -/* The form of the top of the frame on the stack */ -struct stack_frame { - struct stack_frame *next_frame; - unsigned long return_address; -}; - -static inline unsigned long -print_context_stack(struct thread_info *tinfo, - unsigned long *stack, unsigned long bp, - const struct stacktrace_ops *ops, void *data, - unsigned long *end) -{ - struct stack_frame *frame = (struct stack_frame *)bp; - - while (valid_stack_ptr(tinfo, stack, sizeof(*stack), end)) { - unsigned long addr; - - addr = *stack; - if (__kernel_text_address(addr)) { - if ((unsigned long) stack == bp + 8) { - ops->address(data, addr, 1); - frame = frame->next_frame; - bp = (unsigned long) frame; - } else { - ops->address(data, addr, bp == 0); - } - } - stack++; - } - return bp; -} - -void dump_trace(struct task_struct *task, struct pt_regs *regs, - unsigned long *stack, unsigned long bp, - const struct stacktrace_ops *ops, void *data) -{ - const unsigned cpu = get_cpu(); - unsigned long *irqstack_end = (unsigned long *)cpu_pda(cpu)->irqstackptr; - unsigned used = 0; - struct thread_info *tinfo; - - if (!task) - task = current; - - if (!stack) { - unsigned long dummy; - stack = &dummy; - if (task && task != current) - stack = (unsigned long *)task->thread.sp; - } - -#ifdef CONFIG_FRAME_POINTER - if (!bp) { - if (task == current) { - /* Grab bp right from our regs */ - asm("movq %%rbp, %0" : "=r" (bp) : ); - } else { - /* bp is the last reg pushed by switch_to */ - bp = *(unsigned long *) task->thread.sp; - } - 
} -#endif - - /* - * Print function call entries in all stacks, starting at the - * current stack address. If the stacks consist of nested - * exceptions - */ - tinfo = task_thread_info(task); - for (;;) { - char *id; - unsigned long *estack_end; - estack_end = in_exception_stack(cpu, (unsigned long)stack, - &used, &id); - - if (estack_end) { - if (ops->stack(data, id) < 0) - break; - - bp = print_context_stack(tinfo, stack, bp, ops, - data, estack_end); - ops->stack(data, ""); - /* - * We link to the next stack via the - * second-to-last pointer (index -2 to end) in the - * exception stack: - */ - stack = (unsigned long *) estack_end[-2]; - continue; - } - if (irqstack_end) { - unsigned long *irqstack; - irqstack = irqstack_end - - (IRQSTACKSIZE - 64) / sizeof(*irqstack); - - if (stack >= irqstack && stack < irqstack_end) { - if (ops->stack(data, "IRQ") < 0) - break; - bp = print_context_stack(tinfo, stack, bp, - ops, data, irqstack_end); - /* - * We link to the next stack (which would be - * the process stack normally) the last - * pointer (index -1 to end) in the IRQ stack: - */ - stack = (unsigned long *) (irqstack_end[-1]); - irqstack_end = NULL; - ops->stack(data, "EOI"); - continue; - } - } - break; - } - - /* - * This handles the process stack: - */ - bp = print_context_stack(tinfo, stack, bp, ops, data, NULL); - put_cpu(); -} -EXPORT_SYMBOL(dump_trace); - -static void -print_trace_warning_symbol(void *data, char *msg, unsigned long symbol) -{ - print_symbol(msg, symbol); - printk("\n"); -} - -static void print_trace_warning(void *data, char *msg) -{ - printk("%s\n", msg); -} - -static int print_trace_stack(void *data, char *name) -{ - printk(" <%s> ", name); - return 0; -} - -static void print_trace_address(void *data, unsigned long addr, int reliable) -{ - touch_nmi_watchdog(); - printk_address(addr, reliable); -} - -static const struct stacktrace_ops print_trace_ops = { - .warning = print_trace_warning, - .warning_symbol = print_trace_warning_symbol, - .stack = print_trace_stack, - .address = print_trace_address, -}; - -static void -show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *stack, unsigned long bp, char *log_lvl) -{ - printk("Call Trace:\n"); - dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl); -} - -void show_trace(struct task_struct *task, struct pt_regs *regs, - unsigned long *stack, unsigned long bp) -{ - show_trace_log_lvl(task, regs, stack, bp, ""); -} - -static void -show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *sp, unsigned long bp, char *log_lvl) -{ - unsigned long *stack; - int i; - const int cpu = smp_processor_id(); - unsigned long *irqstack_end = - (unsigned long *) (cpu_pda(cpu)->irqstackptr); - unsigned long *irqstack = - (unsigned long *) (cpu_pda(cpu)->irqstackptr - IRQSTACKSIZE); - - /* - * debugging aid: "show_stack(NULL, NULL);" prints the - * back trace for this cpu. 
- */ - - if (sp == NULL) { - if (task) - sp = (unsigned long *)task->thread.sp; - else - sp = (unsigned long *)&sp; - } - - stack = sp; - for (i = 0; i < kstack_depth_to_print; i++) { - if (stack >= irqstack && stack <= irqstack_end) { - if (stack == irqstack_end) { - stack = (unsigned long *) (irqstack_end[-1]); - printk(" "); - } - } else { - if (((long) stack & (THREAD_SIZE-1)) == 0) - break; - } - if (i && ((i % 4) == 0)) - printk("\n"); - printk(" %016lx", *stack++); - touch_nmi_watchdog(); - } - printk("\n"); - show_trace_log_lvl(task, regs, sp, bp, log_lvl); -} - -void show_stack(struct task_struct *task, unsigned long *sp) -{ - show_stack_log_lvl(task, NULL, sp, 0, ""); -} - -/* - * The architecture-independent dump_stack generator - */ -void dump_stack(void) -{ - unsigned long bp = 0; - unsigned long stack; - -#ifdef CONFIG_FRAME_POINTER - if (!bp) - asm("movq %%rbp, %0" : "=r" (bp) : ); -#endif - - printk("Pid: %d, comm: %.20s %s %s %.*s\n", - current->pid, current->comm, print_tainted(), - init_utsname()->release, - (int)strcspn(init_utsname()->version, " "), - init_utsname()->version); - show_trace(NULL, NULL, &stack, bp); -} -EXPORT_SYMBOL(dump_stack); - -void show_registers(struct pt_regs *regs) -{ - int i; - unsigned long sp; - const int cpu = smp_processor_id(); - struct task_struct *cur = cpu_pda(cpu)->pcurrent; - - sp = regs->sp; - printk("CPU %d ", cpu); - __show_regs(regs, 1); - printk("Process %s (pid: %d, threadinfo %p, task %p)\n", - cur->comm, cur->pid, task_thread_info(cur), cur); - - /* - * When in-kernel, we also print out the stack and code at the - * time of the fault.. - */ - if (!user_mode(regs)) { - unsigned int code_prologue = code_bytes * 43 / 64; - unsigned int code_len = code_bytes; - unsigned char c; - u8 *ip; - - printk("Stack: "); - show_stack_log_lvl(NULL, regs, (unsigned long *)sp, - regs->bp, ""); - - printk(KERN_EMERG "Code: "); - - ip = (u8 *)regs->ip - code_prologue; - if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) { - /* try starting at RIP */ - ip = (u8 *)regs->ip; - code_len = code_len - code_prologue + 1; - } - for (i = 0; i < code_len; i++, ip++) { - if (ip < (u8 *)PAGE_OFFSET || - probe_kernel_address(ip, c)) { - printk(" Bad RIP value."); - break; - } - if (ip == (u8 *)regs->ip) - printk("<%02x> ", c); - else - printk("%02x ", c); - } - } - printk("\n"); -} - -int is_valid_bugaddr(unsigned long ip) -{ - unsigned short ud2; - - if (__copy_from_user(&ud2, (const void __user *) ip, sizeof(ud2))) - return 0; - - return ud2 == 0x0b0f; -} - -static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED; -static int die_owner = -1; -static unsigned int die_nest_count; - -unsigned __kprobes long oops_begin(void) -{ - int cpu; - unsigned long flags; - - oops_enter(); - - /* racy, but better than risking deadlock. */ - raw_local_irq_save(flags); - cpu = smp_processor_id(); - if (!__raw_spin_trylock(&die_lock)) { - if (cpu == die_owner) - /* nested oops. should stop eventually */; - else - __raw_spin_lock(&die_lock); - } - die_nest_count++; - die_owner = cpu; - console_verbose(); - bust_spinlocks(1); - return flags; -} - -void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr) -{ - die_owner = -1; - bust_spinlocks(0); - die_nest_count--; - if (!die_nest_count) - /* Nest count reaches zero, release the lock. 
*/ - __raw_spin_unlock(&die_lock); - raw_local_irq_restore(flags); - if (!regs) { - oops_exit(); - return; - } - if (panic_on_oops) - panic("Fatal exception"); - oops_exit(); - do_exit(signr); -} - -int __kprobes __die(const char *str, struct pt_regs *regs, long err) -{ - printk(KERN_EMERG "%s: %04lx [%u] ", str, err & 0xffff, ++die_counter); -#ifdef CONFIG_PREEMPT - printk("PREEMPT "); -#endif -#ifdef CONFIG_SMP - printk("SMP "); -#endif -#ifdef CONFIG_DEBUG_PAGEALLOC - printk("DEBUG_PAGEALLOC"); -#endif - printk("\n"); - if (notify_die(DIE_OOPS, str, regs, err, - current->thread.trap_no, SIGSEGV) == NOTIFY_STOP) - return 1; - - show_registers(regs); - add_taint(TAINT_DIE); - /* Executive summary in case the oops scrolled away */ - printk(KERN_ALERT "RIP "); - printk_address(regs->ip, 1); - printk(" RSP <%016lx>\n", regs->sp); - if (kexec_should_crash(current)) - crash_kexec(regs); - return 0; -} - -void die(const char *str, struct pt_regs *regs, long err) -{ - unsigned long flags = oops_begin(); - - if (!user_mode(regs)) - report_bug(regs->ip, regs); - - if (__die(str, regs, err)) - regs = NULL; - oops_end(flags, regs, SIGSEGV); -} - -notrace __kprobes void -die_nmi(char *str, struct pt_regs *regs, int do_panic) -{ - unsigned long flags; - - if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP) - return; - - flags = oops_begin(); - /* - * We are in trouble anyway, lets at least try - * to get a message out. - */ - printk(KERN_EMERG "%s", str); - printk(" on CPU%d, ip %08lx, registers:\n", - smp_processor_id(), regs->ip); - show_registers(regs); - if (kexec_should_crash(current)) - crash_kexec(regs); - if (do_panic || panic_on_oops) - panic("Non maskable interrupt"); - oops_end(flags, NULL, SIGBUS); - nmi_exit(); - local_irq_enable(); - do_exit(SIGBUS); -} - static void __kprobes do_trap(int trapnr, int signr, char *str, struct pt_regs *regs, long error_code, siginfo_t *info) @@ -1178,32 +662,3 @@ void __init trap_init(void) */ cpu_init(); } - -static int __init oops_setup(char *s) -{ - if (!s) - return -EINVAL; - if (!strcmp(s, "panic")) - panic_on_oops = 1; - return 0; -} -early_param("oops", oops_setup); - -static int __init kstack_setup(char *s) -{ - if (!s) - return -EINVAL; - kstack_depth_to_print = simple_strtoul(s, NULL, 0); - return 0; -} -early_param("kstack", kstack_setup); - -static int __init code_bytes_setup(char *s) -{ - code_bytes = simple_strtoul(s, NULL, 0); - if (code_bytes > 8192) - code_bytes = 8192; - - return 1; -} -__setup("code_bytes=", code_bytes_setup); -- cgit v1.2.3 From a28680b4b821a262fd3b5e57a28c148b5f9e662a Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Tue, 30 Sep 2008 18:41:34 +0200 Subject: x86, traps: split out math_error and simd_math_error Split out math_error from do_coprocessor_error and simd_math_error from do_simd_coprocessor_error, like on i386. While at it, add the "error_code" parameter to do_coprocessor_error, do_simd_coprocessor_error and do_spurious_interrupt_bug. This does not change the generated code, but brings the declarations in line with all the other trap handlers. 
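As background for the split, a sketch of what math_error() decodes: the x87 control word (cwd) records which exceptions are masked, the status word (swd) records which occurred, and only exceptions both raised and unmasked select the si_code delivered with SIGFPE. A simplified stand-alone illustration (hypothetical helper; bit names follow the x87 status-word layout, and the real handler reads cwd/swd with fnstcw/fnstsw and also handles the combined denormal cases):

    #include <stdio.h>

    /* x87 exception bits, identical in the control word (set = masked)
     * and the status word (set = occurred). */
    #define X87_IE 0x001   /* invalid operation */
    #define X87_DE 0x002   /* denormalized operand */
    #define X87_ZE 0x004   /* divide by zero */
    #define X87_OE 0x008   /* overflow */
    #define X87_UE 0x010   /* underflow */
    #define X87_PE 0x020   /* precision (inexact) */

    /* Mirrors the switch in math_error(): report only exceptions that
     * occurred (swd) and are unmasked (cwd bit clear). */
    static const char *fpe_si_code(unsigned short cwd, unsigned short swd)
    {
            switch (swd & ~cwd & 0x3f) {
            case X87_IE: return "FPE_FLTINV";
            case X87_ZE: return "FPE_FLTDIV";
            case X87_OE: return "FPE_FLTOVF";
            case X87_UE: return "FPE_FLTUND";
            case X87_PE: return "FPE_FLTRES";
            default:     return "spurious or masked";
            }
    }

    int main(void)
    {
            /* default control word 0x037f with zero-divide unmasked */
            printf("%s\n", fpe_si_code(0x037b, X87_ZE)); /* FPE_FLTDIV */
            return 0;
    }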
Signed-off-by: Alexander van Heukelum Signed-off-by: Ingo Molnar --- arch/x86/kernel/traps_64.c | 36 +++++++++++++++++++++--------------- include/asm-x86/traps.h | 6 +++--- 2 files changed, 24 insertions(+), 18 deletions(-) diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c index 1cd61ddd90b..b295ebf0cb3 100644 --- a/arch/x86/kernel/traps_64.c +++ b/arch/x86/kernel/traps_64.c @@ -452,18 +452,12 @@ static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr) * the correct behaviour even in the presence of the asynchronous * IRQ13 behaviour */ -asmlinkage void do_coprocessor_error(struct pt_regs *regs) +void math_error(void __user *ip) { - void __user *ip = (void __user *)(regs->ip); struct task_struct *task; siginfo_t info; unsigned short cwd, swd; - conditional_sti(regs); - if (!user_mode(regs) && - kernel_math_error(regs, "kernel x87 math error", 16)) - return; - /* * Save the info for the exception handler and clear the error. */ @@ -516,23 +510,26 @@ asmlinkage void do_coprocessor_error(struct pt_regs *regs) force_sig_info(SIGFPE, &info, task); } +asmlinkage void do_coprocessor_error(struct pt_regs *regs, long error_code) +{ + conditional_sti(regs); + if (!user_mode(regs) && + kernel_math_error(regs, "kernel x87 math error", 16)) + return; + math_error((void __user *)regs->ip); +} + asmlinkage void bad_intr(void) { printk("bad interrupt"); } -asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs) +static void simd_math_error(void __user *ip) { - void __user *ip = (void __user *)(regs->ip); struct task_struct *task; siginfo_t info; unsigned short mxcsr; - conditional_sti(regs); - if (!user_mode(regs) && - kernel_math_error(regs, "kernel simd math error", 19)) - return; - /* * Save the info for the exception handler and clear the error. */ @@ -575,7 +572,16 @@ asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs) force_sig_info(SIGFPE, &info, task); } -asmlinkage void do_spurious_interrupt_bug(struct pt_regs *regs) +asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs, long error_code) +{ + conditional_sti(regs); + if (!user_mode(regs) && + kernel_math_error(regs, "kernel simd math error", 19)) + return; + simd_math_error((void __user *)regs->ip); +} + +asmlinkage void do_spurious_interrupt_bug(struct pt_regs *regs, long error_code) { } diff --git a/include/asm-x86/traps.h b/include/asm-x86/traps.h index 7a692baa51a..c82c39c7b5e 100644 --- a/include/asm-x86/traps.h +++ b/include/asm-x86/traps.h @@ -72,9 +72,9 @@ asmlinkage void double_fault(void); asmlinkage void do_int3(struct pt_regs *, long); asmlinkage void do_stack_segment(struct pt_regs *, long); asmlinkage void do_debug(struct pt_regs *, unsigned long); -asmlinkage void do_coprocessor_error(struct pt_regs *); -asmlinkage void do_simd_coprocessor_error(struct pt_regs *); -asmlinkage void do_spurious_interrupt_bug(struct pt_regs *); +asmlinkage void do_coprocessor_error(struct pt_regs *, long); +asmlinkage void do_simd_coprocessor_error(struct pt_regs *, long); +asmlinkage void do_spurious_interrupt_bug(struct pt_regs *, long); asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long error_code); -- cgit v1.2.3 From ae82157b3d8bb4902f76b56c7450a945288786ac Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Tue, 30 Sep 2008 18:41:35 +0200 Subject: x86, traps, i386: factor out lazy io-bitmap copy x86_64 does not do the lazy io-bitmap dance. Putting it in its own function makes i386's do_general_protection look much more like x86_64's. 
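The helper being factored out is mechanically simple: copy the faulting thread's I/O bitmap into the per-cpu TSS and, when the previous owner's map covered more ports, pad the tail with 0xff so stale permissions cannot leak (in an x86 I/O bitmap a clear bit grants access to a port, a set bit denies it). A stand-alone sketch of just that copy-and-pad step, shown below (hypothetical buffer sizes; the real code operates on tss_struct/thread_struct under get_cpu()):

    #include <stdio.h>
    #include <string.h>

    /* Copy the new owner's bitmap; bytes beyond new_max become 0xff,
     * i.e. "no access", exactly as the kernel pads when the previous
     * map was larger. */
    static void install_iobitmap(unsigned char *tss_map, size_t old_max,
                                 const unsigned char *thread_map,
                                 size_t new_max)
    {
            memcpy(tss_map, thread_map, new_max);
            if (new_max < old_max)
                    memset(tss_map + new_max, 0xff, old_max - new_max);
    }

    int main(void)
    {
            unsigned char tss[8];
            const unsigned char mine[4] = { 0x00, 0xff, 0x00, 0xff };
            size_t i;

            memset(tss, 0x00, sizeof(tss)); /* old owner allowed ports 0-63 */
            install_iobitmap(tss, sizeof(tss), mine, sizeof(mine));

            for (i = 0; i < sizeof(tss); i++)
                    printf("%02x ", tss[i]); /* 00 ff 00 ff ff ff ff ff */
            printf("\n");
            return 0;
    }

After the copy the kernel also sets io_bitmap_base to the valid offset, so the CPU consults the fresh bitmap when the faulting instruction restarts.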
Signed-off-by: Alexander van Heukelum Signed-off-by: Ingo Molnar --- arch/x86/kernel/traps_32.c | 76 ++++++++++++++++++++++++++-------------------- 1 file changed, 43 insertions(+), 33 deletions(-) diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c index ce1063c141f..0206c915748 100644 --- a/arch/x86/kernel/traps_32.c +++ b/arch/x86/kernel/traps_32.c @@ -95,6 +95,47 @@ die_if_kernel(const char *str, struct pt_regs *regs, long err) die(str, regs, err); } +/* + * Perform the lazy TSS's I/O bitmap copy. If the TSS has an + * invalid offset set (the LAZY one) and the faulting thread has + * a valid I/O bitmap pointer, we copy the I/O bitmap in the TSS, + * we set the offset field correctly and return 1. + */ +static int lazy_iobitmap_copy(void) +{ + struct thread_struct *thread; + struct tss_struct *tss; + int cpu; + + cpu = get_cpu(); + tss = &per_cpu(init_tss, cpu); + thread = &current->thread; + + if (tss->x86_tss.io_bitmap_base == INVALID_IO_BITMAP_OFFSET_LAZY && + thread->io_bitmap_ptr) { + memcpy(tss->io_bitmap, thread->io_bitmap_ptr, + thread->io_bitmap_max); + /* + * If the previously set map was extending to higher ports + * than the current one, pad extra space with 0xff (no access). + */ + if (thread->io_bitmap_max < tss->io_bitmap_max) { + memset((char *) tss->io_bitmap + + thread->io_bitmap_max, 0xff, + tss->io_bitmap_max - thread->io_bitmap_max); + } + tss->io_bitmap_max = thread->io_bitmap_max; + tss->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET; + tss->io_bitmap_owner = thread; + put_cpu(); + + return 1; + } + put_cpu(); + + return 0; +} + static void __kprobes do_trap(int trapnr, int signr, char *str, struct pt_regs *regs, long error_code, siginfo_t *info) @@ -187,44 +228,13 @@ void __kprobes do_general_protection(struct pt_regs *regs, long error_code) { struct task_struct *tsk; - struct thread_struct *thread; - struct tss_struct *tss; - int cpu; conditional_sti(regs); - cpu = get_cpu(); - tss = &per_cpu(init_tss, cpu); - thread = &current->thread; - - /* - * Perform the lazy TSS's I/O bitmap copy. If the TSS has an - * invalid offset set (the LAZY one) and the faulting thread has - * a valid I/O bitmap pointer, we copy the I/O bitmap in the TSS - * and we set the offset field correctly. Then we let the CPU to - * restart the faulting instruction. - */ - if (tss->x86_tss.io_bitmap_base == INVALID_IO_BITMAP_OFFSET_LAZY && - thread->io_bitmap_ptr) { - memcpy(tss->io_bitmap, thread->io_bitmap_ptr, - thread->io_bitmap_max); - /* - * If the previously set map was extending to higher ports - * than the current one, pad extra space with 0xff (no access). - */ - if (thread->io_bitmap_max < tss->io_bitmap_max) { - memset((char *) tss->io_bitmap + - thread->io_bitmap_max, 0xff, - tss->io_bitmap_max - thread->io_bitmap_max); - } - tss->io_bitmap_max = thread->io_bitmap_max; - tss->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET; - tss->io_bitmap_owner = thread; - put_cpu(); - + if (lazy_iobitmap_copy()) { + /* restart the faulting instruction */ return; } - put_cpu(); if (regs->flags & X86_VM_MASK) goto gp_in_vm86; -- cgit v1.2.3 From e407d62088b7f61f38e1086062650c75a4f2757a Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Tue, 30 Sep 2008 18:41:36 +0200 Subject: x86, traps: introduce dotraplinkage Mark the exception handlers with "dotraplinkage" to hide the calling convention differences between i386 and x86_64. 
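The payoff of a single linkage macro is that the DO_ERROR()-style stamping used throughout these files can emit every handler with one uniform, per-arch-correct signature. A reduced stand-alone demonstration of that stamping pattern (hypothetical: dotraplinkage defined empty as in the 32-bit case, struct pt_regs shrunk to a single field):

    #include <stdio.h>

    #define dotraplinkage   /* empty here, as in the CONFIG_X86_32 case */

    struct pt_regs { unsigned long ip; };

    /* Each DO_ERROR() use expands to a complete handler; changing the
     * linkage in one place changes all of them. */
    #define DO_ERROR(trapnr, str, name)                                   \
    dotraplinkage void do_##name(struct pt_regs *regs, long error_code)  \
    {                                                                     \
            printf("trap %d (%s) at %#lx, error %ld\n",                   \
                   trapnr, str, regs->ip, error_code);                    \
    }

    DO_ERROR(0, "divide error", divide_error)
    DO_ERROR(6, "invalid opcode", invalid_op)

    int main(void)
    {
            struct pt_regs regs = { .ip = 0xc0100000UL };

            do_divide_error(&regs, 0);
            do_invalid_op(&regs, 0);
            return 0;
    }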
Signed-off-by: Alexander van Heukelum Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_64.S | 2 +- arch/x86/kernel/traps_32.c | 28 ++++++++++-------- arch/x86/kernel/traps_64.c | 31 ++++++++++++-------- include/asm-x86/traps.h | 73 +++++++++++++++++++++++----------------------- 4 files changed, 72 insertions(+), 62 deletions(-) diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 977c8ea6602..1db6ce4314e 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1243,7 +1243,7 @@ ENTRY(simd_coprocessor_error) END(simd_coprocessor_error) ENTRY(device_not_available) - zeroentry math_state_restore + zeroentry do_device_not_available END(device_not_available) /* runs on exception stack */ diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c index 0206c915748..7eb1203c909 100644 --- a/arch/x86/kernel/traps_32.c +++ b/arch/x86/kernel/traps_32.c @@ -190,7 +190,7 @@ vm86_trap: } #define DO_ERROR(trapnr, signr, str, name) \ -void do_##name(struct pt_regs *regs, long error_code) \ +dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \ { \ if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ == NOTIFY_STOP) \ @@ -200,7 +200,7 @@ void do_##name(struct pt_regs *regs, long error_code) \ } #define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \ -void do_##name(struct pt_regs *regs, long error_code) \ +dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \ { \ siginfo_t info; \ info.si_signo = signr; \ @@ -224,7 +224,7 @@ DO_ERROR(11, SIGBUS, "segment not present", segment_not_present) DO_ERROR(12, SIGBUS, "stack segment", stack_segment) DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0) -void __kprobes +dotraplinkage void __kprobes do_general_protection(struct pt_regs *regs, long error_code) { struct task_struct *tsk; @@ -428,7 +428,8 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs) reassert_nmi(); } -notrace __kprobes void do_nmi(struct pt_regs *regs, long error_code) +dotraplinkage notrace __kprobes void +do_nmi(struct pt_regs *regs, long error_code) { int cpu; @@ -456,7 +457,7 @@ void restart_nmi(void) acpi_nmi_enable(); } -void __kprobes do_int3(struct pt_regs *regs, long error_code) +dotraplinkage void __kprobes do_int3(struct pt_regs *regs, long error_code) { #ifdef CONFIG_KPROBES if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) @@ -494,7 +495,7 @@ void __kprobes do_int3(struct pt_regs *regs, long error_code) * find every occurrence of the TF bit that could be saved away even * by user code) */ -void __kprobes do_debug(struct pt_regs *regs, long error_code) +dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) { struct task_struct *tsk = current; unsigned int condition; @@ -627,7 +628,7 @@ void math_error(void __user *ip) force_sig_info(SIGFPE, &info, task); } -void do_coprocessor_error(struct pt_regs *regs, long error_code) +dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code) { conditional_sti(regs); ignore_fpu_irq = 1; @@ -682,7 +683,8 @@ static void simd_math_error(void __user *ip) force_sig_info(SIGFPE, &info, task); } -void do_simd_coprocessor_error(struct pt_regs *regs, long error_code) +dotraplinkage void +do_simd_coprocessor_error(struct pt_regs *regs, long error_code) { conditional_sti(regs); @@ -706,7 +708,8 @@ void do_simd_coprocessor_error(struct pt_regs *regs, long error_code) force_sig(SIGSEGV, current); } -void do_spurious_interrupt_bug(struct pt_regs *regs, long 
error_code) +dotraplinkage void +do_spurious_interrupt_bug(struct pt_regs *regs, long error_code) { conditional_sti(regs); #if 0 @@ -784,7 +787,8 @@ asmlinkage void math_emulate(long arg) #endif /* CONFIG_MATH_EMULATION */ -void __kprobes do_device_not_available(struct pt_regs *regs, long error) +dotraplinkage void __kprobes +do_device_not_available(struct pt_regs *regs, long error) { if (read_cr0() & X86_CR0_EM) { conditional_sti(regs); @@ -796,14 +800,14 @@ void __kprobes do_device_not_available(struct pt_regs *regs, long error) } #ifdef CONFIG_X86_MCE -void __kprobes do_machine_check(struct pt_regs *regs, long error) +dotraplinkage void __kprobes do_machine_check(struct pt_regs *regs, long error) { conditional_sti(regs); machine_check_vector(regs, error); } #endif -void do_iret_error(struct pt_regs *regs, long error_code) +dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code) { siginfo_t info; local_irq_enable(); diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c index b295ebf0cb3..be73323ca95 100644 --- a/arch/x86/kernel/traps_64.c +++ b/arch/x86/kernel/traps_64.c @@ -125,7 +125,7 @@ kernel_trap: } #define DO_ERROR(trapnr, signr, str, name) \ -asmlinkage void do_##name(struct pt_regs *regs, long error_code) \ +dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \ { \ if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ == NOTIFY_STOP) \ @@ -135,7 +135,7 @@ asmlinkage void do_##name(struct pt_regs *regs, long error_code) \ } #define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \ -asmlinkage void do_##name(struct pt_regs *regs, long error_code) \ +dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \ { \ siginfo_t info; \ info.si_signo = signr; \ @@ -159,7 +159,7 @@ DO_ERROR(11, SIGBUS, "segment not present", segment_not_present) DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0) /* Runs on IST stack */ -asmlinkage void do_stack_segment(struct pt_regs *regs, long error_code) +dotraplinkage void do_stack_segment(struct pt_regs *regs, long error_code) { if (notify_die(DIE_TRAP, "stack segment", regs, error_code, 12, SIGBUS) == NOTIFY_STOP) @@ -169,7 +169,7 @@ asmlinkage void do_stack_segment(struct pt_regs *regs, long error_code) preempt_conditional_cli(regs); } -asmlinkage void do_double_fault(struct pt_regs *regs, long error_code) +dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) { static const char str[] = "double fault"; struct task_struct *tsk = current; @@ -186,7 +186,7 @@ asmlinkage void do_double_fault(struct pt_regs *regs, long error_code) die(str, regs, error_code); } -asmlinkage void __kprobes +dotraplinkage void __kprobes do_general_protection(struct pt_regs *regs, long error_code) { struct task_struct *tsk; @@ -317,7 +317,7 @@ asmlinkage notrace __kprobes void default_do_nmi(struct pt_regs *regs) io_check_error(reason, regs); } -asmlinkage notrace __kprobes void +dotraplinkage notrace __kprobes void do_nmi(struct pt_regs *regs, long error_code) { nmi_enter(); @@ -343,7 +343,7 @@ void restart_nmi(void) } /* runs on IST stack. */ -asmlinkage void __kprobes do_int3(struct pt_regs *regs, long error_code) +dotraplinkage void __kprobes do_int3(struct pt_regs *regs, long error_code) { if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) == NOTIFY_STOP) @@ -376,8 +376,7 @@ asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs) } /* runs on IST stack. 
*/ -asmlinkage void __kprobes do_debug(struct pt_regs *regs, - unsigned long error_code) +dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) { struct task_struct *tsk = current; unsigned long condition; @@ -510,7 +509,7 @@ void math_error(void __user *ip) force_sig_info(SIGFPE, &info, task); } -asmlinkage void do_coprocessor_error(struct pt_regs *regs, long error_code) +dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code) { conditional_sti(regs); if (!user_mode(regs) && @@ -572,7 +571,8 @@ static void simd_math_error(void __user *ip) force_sig_info(SIGFPE, &info, task); } -asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs, long error_code) +dotraplinkage void +do_simd_coprocessor_error(struct pt_regs *regs, long error_code) { conditional_sti(regs); if (!user_mode(regs) && @@ -581,7 +581,8 @@ asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs, long error_code) simd_math_error((void __user *)regs->ip); } -asmlinkage void do_spurious_interrupt_bug(struct pt_regs *regs, long error_code) +dotraplinkage void +do_spurious_interrupt_bug(struct pt_regs *regs, long error_code) { } @@ -633,6 +634,12 @@ asmlinkage void math_state_restore(void) } EXPORT_SYMBOL_GPL(math_state_restore); +dotraplinkage void __kprobes +do_device_not_available(struct pt_regs *regs, long error) +{ + math_state_restore(); +} + void __init trap_init(void) { set_intr_gate(0, ÷_error); diff --git a/include/asm-x86/traps.h b/include/asm-x86/traps.h index c82c39c7b5e..6c3dc2c6575 100644 --- a/include/asm-x86/traps.h +++ b/include/asm-x86/traps.h @@ -3,7 +3,12 @@ #include -/* Common in X86_32 and X86_64 */ +#ifdef CONFIG_X86_32 +#define dotraplinkage +#else +#define dotraplinkage asmlinkage +#endif + asmlinkage void divide_error(void); asmlinkage void debug(void); asmlinkage void nmi(void); @@ -12,31 +17,47 @@ asmlinkage void overflow(void); asmlinkage void bounds(void); asmlinkage void invalid_op(void); asmlinkage void device_not_available(void); +#ifdef CONFIG_X86_64 +asmlinkage void double_fault(void); +#endif asmlinkage void coprocessor_segment_overrun(void); asmlinkage void invalid_TSS(void); asmlinkage void segment_not_present(void); asmlinkage void stack_segment(void); asmlinkage void general_protection(void); asmlinkage void page_fault(void); +asmlinkage void spurious_interrupt_bug(void); asmlinkage void coprocessor_error(void); -asmlinkage void simd_coprocessor_error(void); asmlinkage void alignment_check(void); -asmlinkage void spurious_interrupt_bug(void); #ifdef CONFIG_X86_MCE asmlinkage void machine_check(void); #endif /* CONFIG_X86_MCE */ +asmlinkage void simd_coprocessor_error(void); -void do_divide_error(struct pt_regs *, long); -void do_overflow(struct pt_regs *, long); -void do_bounds(struct pt_regs *, long); -void do_coprocessor_segment_overrun(struct pt_regs *, long); -void do_invalid_TSS(struct pt_regs *, long); -void do_segment_not_present(struct pt_regs *, long); -void do_stack_segment(struct pt_regs *, long); -void do_alignment_check(struct pt_regs *, long); -void do_invalid_op(struct pt_regs *, long); -void do_general_protection(struct pt_regs *, long); -void do_nmi(struct pt_regs *, long); +dotraplinkage void do_divide_error(struct pt_regs *, long); +dotraplinkage void do_debug(struct pt_regs *, long); +dotraplinkage void do_nmi(struct pt_regs *, long); +dotraplinkage void do_int3(struct pt_regs *, long); +dotraplinkage void do_overflow(struct pt_regs *, long); +dotraplinkage void do_bounds(struct pt_regs *, long); +dotraplinkage 
void do_invalid_op(struct pt_regs *, long); +dotraplinkage void do_device_not_available(struct pt_regs *, long); +dotraplinkage void do_coprocessor_segment_overrun(struct pt_regs *, long); +dotraplinkage void do_invalid_TSS(struct pt_regs *, long); +dotraplinkage void do_segment_not_present(struct pt_regs *, long); +dotraplinkage void do_stack_segment(struct pt_regs *, long); +dotraplinkage void do_general_protection(struct pt_regs *, long); +dotraplinkage void do_page_fault(struct pt_regs *, unsigned long); +dotraplinkage void do_spurious_interrupt_bug(struct pt_regs *, long); +dotraplinkage void do_coprocessor_error(struct pt_regs *, long); +dotraplinkage void do_alignment_check(struct pt_regs *, long); +#ifdef CONFIG_X86_MCE +dotraplinkage void do_machine_check(struct pt_regs *, long); +#endif +dotraplinkage void do_simd_coprocessor_error(struct pt_regs *, long); +#ifdef CONFIG_X86_32 +dotraplinkage void do_iret_error(struct pt_regs *, long); +#endif static inline int get_si_code(unsigned long condition) { @@ -52,31 +73,9 @@ extern int panic_on_unrecovered_nmi; extern int kstack_depth_to_print; #ifdef CONFIG_X86_32 - -void do_iret_error(struct pt_regs *, long); -void do_int3(struct pt_regs *, long); -void do_debug(struct pt_regs *, long); void math_error(void __user *); -void do_coprocessor_error(struct pt_regs *, long); -void do_simd_coprocessor_error(struct pt_regs *, long); -void do_spurious_interrupt_bug(struct pt_regs *, long); unsigned long patch_espfix_desc(unsigned long, unsigned long); asmlinkage void math_emulate(long); +#endif -void do_page_fault(struct pt_regs *regs, unsigned long error_code); - -#else /* CONFIG_X86_32 */ - -asmlinkage void double_fault(void); - -asmlinkage void do_int3(struct pt_regs *, long); -asmlinkage void do_stack_segment(struct pt_regs *, long); -asmlinkage void do_debug(struct pt_regs *, unsigned long); -asmlinkage void do_coprocessor_error(struct pt_regs *, long); -asmlinkage void do_simd_coprocessor_error(struct pt_regs *, long); -asmlinkage void do_spurious_interrupt_bug(struct pt_regs *, long); - -asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long error_code); - -#endif /* CONFIG_X86_32 */ #endif /* ASM_X86__TRAPS_H */ -- cgit v1.2.3 From 3d2a71a596bd9c761c8487a2178e95f8a61da083 Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Tue, 30 Sep 2008 18:41:37 +0200 Subject: x86, traps: converge do_debug handlers Make the x86_64-version and the i386-version of do_debug more similar. - introduce preempt_conditional_sti/cli to i386. The preempt-count is now elevated during the trap handler, like on x86_64. It does not run on a separate stack, however. 
- replace an open-coded "send_sigtrap" - copy some comments Signed-off-by: Alexander van Heukelum Signed-off-by: Ingo Molnar --- arch/x86/kernel/traps_32.c | 30 +++++++++++++++++++++--------- arch/x86/kernel/traps_64.c | 17 +++++++++-------- include/asm-x86/ptrace.h | 4 ---- 3 files changed, 30 insertions(+), 21 deletions(-) diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c index 7eb1203c909..953172af6e5 100644 --- a/arch/x86/kernel/traps_32.c +++ b/arch/x86/kernel/traps_32.c @@ -88,6 +88,20 @@ static inline void conditional_sti(struct pt_regs *regs) local_irq_enable(); } +static inline void preempt_conditional_sti(struct pt_regs *regs) +{ + inc_preempt_count(); + if (regs->flags & X86_EFLAGS_IF) + local_irq_enable(); +} + +static inline void preempt_conditional_cli(struct pt_regs *regs) +{ + if (regs->flags & X86_EFLAGS_IF) + local_irq_disable(); + dec_preempt_count(); +} + static inline void die_if_kernel(const char *str, struct pt_regs *regs, long err) { @@ -498,7 +512,7 @@ dotraplinkage void __kprobes do_int3(struct pt_regs *regs, long error_code) dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) { struct task_struct *tsk = current; - unsigned int condition; + unsigned long condition; int si_code; get_debugreg(condition, 6); @@ -512,9 +526,9 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code, SIGTRAP) == NOTIFY_STOP) return; + /* It's safe to allow irq's after DR6 has been saved */ - if (regs->flags & X86_EFLAGS_IF) - local_irq_enable(); + preempt_conditional_sti(regs); /* Mask out spurious debug traps due to lazy DR7 setting */ if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) { @@ -533,16 +547,11 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) * kernel space (but re-enable TF when returning to user mode). */ if (condition & DR_STEP) { - /* - * We already checked v86 mode above, so we can - * check for kernel mode by just checking the CPL - * of CS. 
- */ if (!user_mode(regs)) goto clear_TF_reenable; } - si_code = get_si_code((unsigned long)condition); + si_code = get_si_code(condition); /* Ok, finally something we can handle */ send_sigtrap(tsk, regs, error_code, si_code); @@ -552,15 +561,18 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) */ clear_dr7: set_debugreg(0, 7); + preempt_conditional_cli(regs); return; debug_vm86: handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, 1); + preempt_conditional_cli(regs); return; clear_TF_reenable: set_tsk_thread_flag(tsk, TIF_SINGLESTEP); regs->flags &= ~X86_EFLAGS_TF; + preempt_conditional_cli(regs); return; } diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c index be73323ca95..a851eca6d04 100644 --- a/arch/x86/kernel/traps_64.c +++ b/arch/x86/kernel/traps_64.c @@ -380,7 +380,7 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) { struct task_struct *tsk = current; unsigned long condition; - siginfo_t info; + int si_code; get_debugreg(condition, 6); @@ -394,6 +394,7 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) SIGTRAP) == NOTIFY_STOP) return; + /* It's safe to allow irq's after DR6 has been saved */ preempt_conditional_sti(regs); /* Mask out spurious debug traps due to lazy DR7 setting */ @@ -402,6 +403,7 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) goto clear_dr7; } + /* Save debug status register where ptrace can see it */ tsk->thread.debugreg6 = condition; /* @@ -413,15 +415,14 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) goto clear_TF_reenable; } + si_code = get_si_code(condition); /* Ok, finally something we can handle */ - tsk->thread.trap_no = 1; - tsk->thread.error_code = error_code; - info.si_signo = SIGTRAP; - info.si_errno = 0; - info.si_code = get_si_code(condition); - info.si_addr = user_mode(regs) ? (void __user *)regs->ip : NULL; - force_sig_info(SIGTRAP, &info, tsk); + send_sigtrap(tsk, regs, error_code, si_code); + /* + * Disable additional traps. They'll be re-enabled when + * the signal is delivered. + */ clear_dr7: set_debugreg(0, 7); preempt_conditional_cli(regs); diff --git a/include/asm-x86/ptrace.h b/include/asm-x86/ptrace.h index ac578f11c1c..a2025525a15 100644 --- a/include/asm-x86/ptrace.h +++ b/include/asm-x86/ptrace.h @@ -174,12 +174,8 @@ extern unsigned long profile_pc(struct pt_regs *regs); extern unsigned long convert_ip_to_linear(struct task_struct *child, struct pt_regs *regs); - -#ifdef CONFIG_X86_32 extern void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code, int si_code); -#endif - void signal_fault(struct pt_regs *regs, void __user *frame, char *where); extern long syscall_trace_enter(struct pt_regs *); -- cgit v1.2.3 From 699d2937d45d9dabc1772d0d07501ccc43885c23 Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Fri, 3 Oct 2008 22:00:32 +0200 Subject: traps: x86: converge trap_init functions - set_system_gate on i386 is really set_system_trap_gate - set_system_gate on x86_64 is really set_system_intr_gate - ist=0 means no special stack switch is done: - introduce STACKFAULT_STACK, DOUBLEFAULT_STACK, NMI_STACK, DEBUG_STACK and MCE_STACK as on x86_64. - use the _ist variants with XXX_STACK set to zero - remove set_system_gate Signed-off-by: Alexander van Heukelum Signed-off-by: Ingo Molnar traps: x86: correct copy/paste bug: a trap is a GATE_TRAP Fix copy/paste/forgot-to-edit bug in desc.h. 
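For orientation, a condensed sketch of the gate-setter semantics this patch converges on (simplified from the desc.h hunk below; the BUG_ON range checks and the _ist variants are omitted here, and the set_intr_gate body is inferred from its _ist counterpart). DPL 0x3 marks a gate as callable from ring 3 via "int n"; a GATE_TRAP entry leaves interrupts enabled on entry, while GATE_INTERRUPT clears IF:

static inline void set_intr_gate(unsigned int n, void *addr)
{
        /* kernel-only interrupt gate: DPL 0, no IST stack switch */
        _set_gate(n, GATE_INTERRUPT, addr, 0, 0, __KERNEL_CS);
}

static inline void set_system_intr_gate(unsigned int n, void *addr)
{
        /* userspace-callable interrupt gate (the old x86_64 set_system_gate) */
        _set_gate(n, GATE_INTERRUPT, addr, 0x3, 0, __KERNEL_CS);
}

static inline void set_system_trap_gate(unsigned int n, void *addr)
{
        /* userspace-callable trap gate (the old i386 set_system_gate) */
        _set_gate(n, GATE_TRAP, addr, 0x3, 0, __KERNEL_CS);
}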
Signed-off-by: Alexander van Heukelum Signed-off-by: Ingo Molnar --- arch/x86/kernel/traps_32.c | 16 +++++++++------- arch/x86/kernel/traps_64.c | 6 +++--- include/asm-x86/desc.h | 14 +++++--------- include/asm-x86/page_32.h | 6 ++++++ 4 files changed, 23 insertions(+), 19 deletions(-) diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c index 953172af6e5..2c7ea382771 100644 --- a/arch/x86/kernel/traps_32.c +++ b/arch/x86/kernel/traps_32.c @@ -847,10 +847,12 @@ void __init trap_init(void) #endif set_intr_gate(0, ÷_error); - set_intr_gate(1, &debug); - set_intr_gate(2, &nmi); - set_system_intr_gate(3, &int3); /* int3 can be called from all */ - set_system_intr_gate(4, &overflow); /* int4 can be called from all */ + set_intr_gate_ist(1, &debug, DEBUG_STACK); + set_intr_gate_ist(2, &nmi, NMI_STACK); + /* int3 can be called from all */ + set_system_intr_gate_ist(3, &int3, DEBUG_STACK); + /* int4 can be called from all */ + set_system_intr_gate(4, &overflow); set_intr_gate(5, &bounds); set_intr_gate(6, &invalid_op); set_intr_gate(7, &device_not_available); @@ -858,14 +860,14 @@ void __init trap_init(void) set_intr_gate(9, &coprocessor_segment_overrun); set_intr_gate(10, &invalid_TSS); set_intr_gate(11, &segment_not_present); - set_intr_gate(12, &stack_segment); + set_intr_gate_ist(12, &stack_segment, STACKFAULT_STACK); set_intr_gate(13, &general_protection); set_intr_gate(14, &page_fault); set_intr_gate(15, &spurious_interrupt_bug); set_intr_gate(16, &coprocessor_error); set_intr_gate(17, &alignment_check); #ifdef CONFIG_X86_MCE - set_intr_gate(18, &machine_check); + set_intr_gate_ist(18, &machine_check, MCE_STACK); #endif set_intr_gate(19, &simd_coprocessor_error); @@ -881,7 +883,7 @@ void __init trap_init(void) printk("done.\n"); } - set_system_gate(SYSCALL_VECTOR, &system_call); + set_system_trap_gate(SYSCALL_VECTOR, &system_call); /* Reserve all the builtin and the syscall vector: */ for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++) diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c index a851eca6d04..ea091dfe0cd 100644 --- a/arch/x86/kernel/traps_64.c +++ b/arch/x86/kernel/traps_64.c @@ -647,9 +647,9 @@ void __init trap_init(void) set_intr_gate_ist(1, &debug, DEBUG_STACK); set_intr_gate_ist(2, &nmi, NMI_STACK); /* int3 can be called from all */ - set_system_gate_ist(3, &int3, DEBUG_STACK); + set_system_intr_gate_ist(3, &int3, DEBUG_STACK); /* int4 can be called from all */ - set_system_gate(4, &overflow); + set_system_intr_gate(4, &overflow); set_intr_gate(5, &bounds); set_intr_gate(6, &invalid_op); set_intr_gate(7, &device_not_available); @@ -669,7 +669,7 @@ void __init trap_init(void) set_intr_gate(19, &simd_coprocessor_error); #ifdef CONFIG_IA32_EMULATION - set_system_gate(IA32_SYSCALL_VECTOR, ia32_syscall); + set_system_intr_gate(IA32_SYSCALL_VECTOR, ia32_syscall); #endif /* * Should be a barrier for any external CPU state: diff --git a/include/asm-x86/desc.h b/include/asm-x86/desc.h index ebc307817e9..f06adac7938 100644 --- a/include/asm-x86/desc.h +++ b/include/asm-x86/desc.h @@ -351,20 +351,16 @@ static inline void set_system_intr_gate(unsigned int n, void *addr) _set_gate(n, GATE_INTERRUPT, addr, 0x3, 0, __KERNEL_CS); } -static inline void set_trap_gate(unsigned int n, void *addr) +static inline void set_system_trap_gate(unsigned int n, void *addr) { BUG_ON((unsigned)n > 0xFF); - _set_gate(n, GATE_TRAP, addr, 0, 0, __KERNEL_CS); + _set_gate(n, GATE_TRAP, addr, 0x3, 0, __KERNEL_CS); } -static inline void set_system_gate(unsigned int n, void *addr) +static 
inline void set_trap_gate(unsigned int n, void *addr) { BUG_ON((unsigned)n > 0xFF); -#ifdef CONFIG_X86_32 - _set_gate(n, GATE_TRAP, addr, 0x3, 0, __KERNEL_CS); -#else - _set_gate(n, GATE_INTERRUPT, addr, 0x3, 0, __KERNEL_CS); -#endif + _set_gate(n, GATE_TRAP, addr, 0, 0, __KERNEL_CS); } static inline void set_task_gate(unsigned int n, unsigned int gdt_entry) @@ -379,7 +375,7 @@ static inline void set_intr_gate_ist(int n, void *addr, unsigned ist) _set_gate(n, GATE_INTERRUPT, addr, 0, ist, __KERNEL_CS); } -static inline void set_system_gate_ist(int n, void *addr, unsigned ist) +static inline void set_system_intr_gate_ist(int n, void *addr, unsigned ist) { BUG_ON((unsigned)n > 0xFF); _set_gate(n, GATE_INTERRUPT, addr, 0x3, ist, __KERNEL_CS); diff --git a/include/asm-x86/page_32.h b/include/asm-x86/page_32.h index 5d6a68a1067..e8d80d1de23 100644 --- a/include/asm-x86/page_32.h +++ b/include/asm-x86/page_32.h @@ -20,6 +20,12 @@ #endif #define THREAD_SIZE (PAGE_SIZE << THREAD_ORDER) +#define STACKFAULT_STACK 0 +#define DOUBLEFAULT_STACK 1 +#define NMI_STACK 0 +#define DEBUG_STACK 0 +#define MCE_STACK 0 +#define N_EXCEPTION_STACKS 1 #ifdef CONFIG_X86_PAE /* 44=32+12, the limit we can fit into an unsigned long pfn */ -- cgit v1.2.3 From 091d30c8f7744f43b0bb507fd30ceb95f9ff9e1b Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Fri, 3 Oct 2008 23:16:12 +0200 Subject: traps: x86_64: make math_state_restore more like i386 - rename variable me -> tsk - get thread and tsk like i386 - expand used_math() - copy comment Signed-off-by: Alexander van Heukelum Signed-off-by: Ingo Molnar --- arch/x86/kernel/traps_64.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c index ea091dfe0cd..00406c99aee 100644 --- a/arch/x86/kernel/traps_64.c +++ b/arch/x86/kernel/traps_64.c @@ -604,14 +604,15 @@ asmlinkage void __attribute__((weak)) mce_threshold_interrupt(void) */ asmlinkage void math_state_restore(void) { - struct task_struct *me = current; + struct thread_info *thread = current_thread_info(); + struct task_struct *tsk = thread->task; - if (!used_math()) { + if (!tsk_used_math(tsk)) { local_irq_enable(); /* * does a slab alloc which can sleep */ - if (init_fpu(me)) { + if (init_fpu(tsk)) { /* * ran out of memory! */ @@ -625,13 +626,13 @@ asmlinkage void math_state_restore(void) /* * Paranoid restore. send a SIGSEGV if we fail to restore the state. */ - if (unlikely(restore_fpu_checking(me))) { + if (unlikely(restore_fpu_checking(tsk))) { stts(); - force_sig(SIGSEGV, me); + force_sig(SIGSEGV, tsk); return; } - task_thread_info(me)->status |= TS_USEDFPU; - me->fpu_counter++; + thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */ + tsk->fpu_counter++; } EXPORT_SYMBOL_GPL(math_state_restore); -- cgit v1.2.3 From 4915a35e35a037254550a2ba9f367a812bc37d40 Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Fri, 3 Oct 2008 22:00:34 +0200 Subject: traps: i386: use preempt_conditional_sti/cli in do_int3 Use preempt_conditional_sti/cli in do_int3, like on x86_64. 
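The helper pair, introduced to i386 earlier in this series, elevates the preempt count for the duration of the handler and only toggles interrupts when the trapped context had them enabled (reproduced from the traps_32.c hunk in the do_debug convergence patch above; the resulting do_int3 body is in the hunk below):

static inline void preempt_conditional_sti(struct pt_regs *regs)
{
        inc_preempt_count();
        if (regs->flags & X86_EFLAGS_IF)
                local_irq_enable();
}

static inline void preempt_conditional_cli(struct pt_regs *regs)
{
        if (regs->flags & X86_EFLAGS_IF)
                local_irq_disable();
        dec_preempt_count();
}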
Signed-off-by: Alexander van Heukelum Signed-off-by: Ingo Molnar --- arch/x86/kernel/traps_32.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c index 2c7ea382771..67953bbe193 100644 --- a/arch/x86/kernel/traps_32.c +++ b/arch/x86/kernel/traps_32.c @@ -477,14 +477,15 @@ dotraplinkage void __kprobes do_int3(struct pt_regs *regs, long error_code) if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) == NOTIFY_STOP) return; - conditional_sti(regs); #else if (notify_die(DIE_TRAP, "int3", regs, error_code, 3, SIGTRAP) == NOTIFY_STOP) return; #endif + preempt_conditional_sti(regs); do_trap(3, SIGTRAP, "int3", regs, error_code, NULL); + preempt_conditional_cli(regs); } /* -- cgit v1.2.3 From 1c9af8a9f448abfe13f17fa76b7ca72b588a1edb Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Fri, 3 Oct 2008 22:00:35 +0200 Subject: traps: x86_64: make io_check_error equal to the one on i386 Make io_check_error equal to the one on i386. Signed-off-by: Alexander van Heukelum Signed-off-by: Ingo Molnar --- arch/x86/kernel/traps_64.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c index 00406c99aee..7853f488cd6 100644 --- a/arch/x86/kernel/traps_64.c +++ b/arch/x86/kernel/traps_64.c @@ -252,13 +252,19 @@ mem_parity_error(unsigned char reason, struct pt_regs *regs) static notrace __kprobes void io_check_error(unsigned char reason, struct pt_regs *regs) { - printk("NMI: IOCK error (debug interrupt?)\n"); + unsigned long i; + + printk(KERN_EMERG "NMI: IOCK error (debug interrupt?)\n"); show_registers(regs); /* Re-enable the IOCK line, wait for a few seconds */ reason = (reason & 0xf) | 8; outb(reason, 0x61); - mdelay(2000); + + i = 2000; + while (--i) + udelay(1000); + reason &= ~8; outb(reason, 0x61); } -- cgit v1.2.3 From 7970479c4881e156a0c07c1a7fdc564c8e3b2bfc Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Fri, 3 Oct 2008 22:00:36 +0200 Subject: traps: i386: expand clear_mem_error and remove from mach_traps.h This is the last user of clear_mem_error, which is defined only on i386. Expand the inline function and remove it from include/asm-x86/mach-default/mach_traps.h Signed-off-by: Alexander van Heukelum Signed-off-by: Ingo Molnar --- arch/x86/kernel/traps_32.c | 3 ++- include/asm-x86/mach-default/mach_traps.h | 6 ------ 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c index 67953bbe193..01e7ca8c766 100644 --- a/arch/x86/kernel/traps_32.c +++ b/arch/x86/kernel/traps_32.c @@ -313,7 +313,8 @@ mem_parity_error(unsigned char reason, struct pt_regs *regs) printk(KERN_EMERG "Dazed and confused, but trying to continue\n"); /* Clear and disable the memory parity error line. 
*/ - clear_mem_error(reason); + reason = (reason & 0xf) | 4; + outb(reason, 0x61); } static notrace __kprobes void diff --git a/include/asm-x86/mach-default/mach_traps.h b/include/asm-x86/mach-default/mach_traps.h index de9ac3f5c4c..ff8778f26b8 100644 --- a/include/asm-x86/mach-default/mach_traps.h +++ b/include/asm-x86/mach-default/mach_traps.h @@ -7,12 +7,6 @@ #include -static inline void clear_mem_error(unsigned char reason) -{ - reason = (reason & 0xf) | 4; - outb(reason, 0x61); -} - static inline unsigned char get_nmi_reason(void) { return inb(0x61); -- cgit v1.2.3 From a5ae2330a5a6e7948866715a845ad2e8900bd4c2 Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Fri, 3 Oct 2008 22:00:37 +0200 Subject: traps: x86_64: use task_pid_nr(tsk) instead of tsk->pid in do_general_protection Use task_pid_nr(tsk) instead of tsk->pid in do_general_protection. Signed-off-by: Alexander van Heukelum Signed-off-by: Ingo Molnar --- arch/x86/kernel/traps_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c index 7853f488cd6..71c2629997b 100644 --- a/arch/x86/kernel/traps_64.c +++ b/arch/x86/kernel/traps_64.c @@ -204,7 +204,7 @@ do_general_protection(struct pt_regs *regs, long error_code) printk_ratelimit()) { printk(KERN_INFO "%s[%d] general protection ip:%lx sp:%lx error:%lx", - tsk->comm, tsk->pid, + tsk->comm, task_pid_nr(tsk), regs->ip, regs->sp, error_code); print_vma_addr(" in ", regs->ip); printk("\n"); -- cgit v1.2.3 From c1d518c8422ff7d3f377958771b265753028579c Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Fri, 3 Oct 2008 23:17:11 +0200 Subject: traps: x86: various noop-changes preparing for unification of traps_xx.c - reordering include files - whitespace changes - comment changes - removed unused bad_intr() - make default_do_nmi static Signed-off-by: Alexander van Heukelum Signed-off-by: Ingo Molnar --- arch/x86/kernel/traps_32.c | 28 +++++++++++--------- arch/x86/kernel/traps_64.c | 65 ++++++++++++++++++++++++++++++---------------- include/asm-x86/nmi.h | 4 --- 3 files changed, 58 insertions(+), 39 deletions(-) diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c index 01e7ca8c766..076739863d2 100644 --- a/arch/x86/kernel/traps_32.c +++ b/arch/x86/kernel/traps_32.c @@ -7,13 +7,11 @@ */ /* - * 'Traps.c' handles hardware traps and faults after we have saved some - * state in 'asm.s'. + * Handle hardware traps and faults. 
*/ #include #include #include -#include #include #include #include @@ -32,6 +30,8 @@ #include #include #include +#include +#include #ifdef CONFIG_EISA #include @@ -46,22 +46,26 @@ #include #endif -#include -#include #include #include +#include #include #include #include #include +#include #include #include + +#include + +#include +#include #include #include #include #include -#include "mach_traps.h" #include "cpu/mcheck/mce.h" DECLARE_BITMAP(used_vectors, NR_VECTORS); @@ -340,7 +344,8 @@ io_check_error(unsigned char reason, struct pt_regs *regs) static notrace __kprobes void unknown_nmi_error(unsigned char reason, struct pt_regs *regs) { - if (notify_die(DIE_NMIUNKNOWN, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP) + if (notify_die(DIE_NMIUNKNOWN, "nmi", regs, reason, 2, SIGINT) == + NOTIFY_STOP) return; #ifdef CONFIG_MCA /* @@ -446,13 +451,9 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs) dotraplinkage notrace __kprobes void do_nmi(struct pt_regs *regs, long error_code) { - int cpu; - nmi_enter(); - cpu = smp_processor_id(); - - ++nmi_count(cpu); + { int cpu; cpu = smp_processor_id(); ++nmi_count(cpu); } if (!ignore_nmis) default_do_nmi(regs); @@ -472,6 +473,7 @@ void restart_nmi(void) acpi_nmi_enable(); } +/* May run on IST stack. */ dotraplinkage void __kprobes do_int3(struct pt_regs *regs, long error_code) { #ifdef CONFIG_KPROBES @@ -510,6 +512,8 @@ dotraplinkage void __kprobes do_int3(struct pt_regs *regs, long error_code) * about restoring all the debug state, and ptrace doesn't have to * find every occurrence of the TF bit that could be saved away even * by user code) + * + * May run on IST stack. */ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) { diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c index 71c2629997b..22fe62a24ed 100644 --- a/arch/x86/kernel/traps_64.c +++ b/arch/x86/kernel/traps_64.c @@ -7,10 +7,8 @@ */ /* - * 'Traps.c' handles hardware traps and faults after we have saved some - * state in 'entry.S'. + * Handle hardware traps and faults. */ -#include #include #include #include @@ -41,18 +39,21 @@ #include #include +#include #include #include #include #include +#include #include #include + +#include + #include #include #include -#include -#include static int ignore_nmis; @@ -73,8 +74,6 @@ static inline void preempt_conditional_cli(struct pt_regs *regs) { if (regs->flags & X86_EFLAGS_IF) local_irq_disable(); - /* Make sure to not schedule here because we could be running - on an exception stack. */ dec_preempt_count(); } @@ -228,9 +227,12 @@ gp_in_kernel: static notrace __kprobes void mem_parity_error(unsigned char reason, struct pt_regs *regs) { - printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n", - reason); - printk(KERN_EMERG "You have some hardware problem, likely on the PCI bus.\n"); + printk(KERN_EMERG + "Uhhuh. NMI received for unknown reason %02x on CPU %d.\n", + reason, smp_processor_id()); + + printk(KERN_EMERG + "You have some hardware problem, likely on the PCI bus.\n"); #if defined(CONFIG_EDAC) if (edac_handler_set()) { @@ -275,19 +277,18 @@ unknown_nmi_error(unsigned char reason, struct pt_regs *regs) if (notify_die(DIE_NMIUNKNOWN, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP) return; - printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n", - reason); - printk(KERN_EMERG "Do you have a strange power saving mode enabled?\n"); + printk(KERN_EMERG + "Uhhuh. 
NMI received for unknown reason %02x on CPU %d.\n", + reason, smp_processor_id()); + printk(KERN_EMERG "Do you have a strange power saving mode enabled?\n"); if (panic_on_unrecovered_nmi) panic("NMI: Not continuing"); printk(KERN_EMERG "Dazed and confused, but trying to continue\n"); } -/* Runs on IST stack. This code must keep interrupts off all the time. - Nested NMIs are prevented by the CPU. */ -asmlinkage notrace __kprobes void default_do_nmi(struct pt_regs *regs) +static notrace __kprobes void default_do_nmi(struct pt_regs *regs) { unsigned char reason = 0; int cpu; @@ -348,7 +349,7 @@ void restart_nmi(void) acpi_nmi_enable(); } -/* runs on IST stack. */ +/* May run on IST stack. */ dotraplinkage void __kprobes do_int3(struct pt_regs *regs, long error_code) { if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) @@ -381,7 +382,30 @@ asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs) return regs; } -/* runs on IST stack. */ +/* + * Our handling of the processor debug registers is non-trivial. + * We do not clear them on entry and exit from the kernel. Therefore + * it is possible to get a watchpoint trap here from inside the kernel. + * However, the code in ./ptrace.c has ensured that the user can + * only set watchpoints on userspace addresses. Therefore the in-kernel + * watchpoint trap can only occur in code which is reading/writing + * from user space. Such code must not hold kernel locks (since it + * can equally take a page fault), therefore it is safe to call + * force_sig_info even though that claims and releases locks. + * + * Code in ./signal.c ensures that the debug control register + * is restored before we deliver any signal, and therefore that + * user code runs with the correct debug control register even though + * we clear it here. + * + * Being careful here means that we don't have to be as careful in a + * lot of more complicated places (task switching can be a bit lazy + * about restoring all the debug state, and ptrace doesn't have to + * find every occurrence of the TF bit that could be saved away even + * by user code) + * + * May run on IST stack. + */ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) { struct task_struct *tsk = current; @@ -525,11 +549,6 @@ dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code) math_error((void __user *)regs->ip); } -asmlinkage void bad_intr(void) -{ - printk("bad interrupt"); -} - static void simd_math_error(void __user *ip) { struct task_struct *task; diff --git a/include/asm-x86/nmi.h b/include/asm-x86/nmi.h index d5e715f024d..a53f829a97c 100644 --- a/include/asm-x86/nmi.h +++ b/include/asm-x86/nmi.h @@ -15,10 +15,6 @@ */ int do_nmi_callback(struct pt_regs *regs, int cpu); -#ifdef CONFIG_X86_64 -extern void default_do_nmi(struct pt_regs *); -#endif - extern void die_nmi(char *str, struct pt_regs *regs, int do_panic); extern int check_nmi_watchdog(void); extern int nmi_watchdog_enabled; -- cgit v1.2.3 From 081f75bbdc86de53537e1b5aca01de72bd2fea6b Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Fri, 3 Oct 2008 22:00:39 +0200 Subject: traps: x86: make traps_32.c and traps_64.c equal Use CONFIG_X86_64/CONFIG_X86_32 to condtionally compile the parts needed for x86_64 or i386 only. Runs a small userspace for a number of minimal configurations and boots the defconfigs. 
Signed-off-by: Alexander van Heukelum Signed-off-by: Ingo Molnar --- arch/x86/kernel/traps_32.c | 173 ++++++++++++++++++++- arch/x86/kernel/traps_64.c | 368 ++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 536 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c index 076739863d2..ffb131f74f7 100644 --- a/arch/x86/kernel/traps_32.c +++ b/arch/x86/kernel/traps_32.c @@ -48,7 +48,6 @@ #include #include -#include #include #include #include @@ -59,6 +58,11 @@ #include +#ifdef CONFIG_X86_64 +#include +#include +#include +#else #include #include #include @@ -83,6 +87,7 @@ char ignore_fpu_irq; */ gate_desc idt_table[256] __attribute__((__section__(".data.idt"))) = { { { { 0, 0 } } }, }; +#endif static int ignore_nmis; @@ -106,6 +111,7 @@ static inline void preempt_conditional_cli(struct pt_regs *regs) dec_preempt_count(); } +#ifdef CONFIG_X86_32 static inline void die_if_kernel(const char *str, struct pt_regs *regs, long err) { @@ -153,6 +159,7 @@ static int lazy_iobitmap_copy(void) return 0; } +#endif static void __kprobes do_trap(int trapnr, int signr, char *str, struct pt_regs *regs, @@ -160,6 +167,7 @@ do_trap(int trapnr, int signr, char *str, struct pt_regs *regs, { struct task_struct *tsk = current; +#ifdef CONFIG_X86_32 if (regs->flags & X86_VM_MASK) { /* * traps 0, 1, 3, 4, and 5 should be forwarded to vm86. @@ -169,11 +177,14 @@ do_trap(int trapnr, int signr, char *str, struct pt_regs *regs, goto vm86_trap; goto trap_signal; } +#endif if (!user_mode(regs)) goto kernel_trap; +#ifdef CONFIG_X86_32 trap_signal: +#endif /* * We want error_code and trap_no set for userspace faults and * kernelspace faults which result in die(), but not @@ -186,6 +197,18 @@ trap_signal: tsk->thread.error_code = error_code; tsk->thread.trap_no = trapnr; +#ifdef CONFIG_X86_64 + if (show_unhandled_signals && unhandled_signal(tsk, signr) && + printk_ratelimit()) { + printk(KERN_INFO + "%s[%d] trap %s ip:%lx sp:%lx error:%lx", + tsk->comm, tsk->pid, str, + regs->ip, regs->sp, error_code); + print_vma_addr(" in ", regs->ip); + printk("\n"); + } +#endif + if (info) force_sig_info(signr, info, tsk); else @@ -200,11 +223,13 @@ kernel_trap: } return; +#ifdef CONFIG_X86_32 vm86_trap: if (handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, trapnr)) goto trap_signal; return; +#endif } #define DO_ERROR(trapnr, signr, str, name) \ @@ -239,9 +264,41 @@ DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip) DO_ERROR(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun) DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS) DO_ERROR(11, SIGBUS, "segment not present", segment_not_present) +#ifdef CONFIG_X86_32 DO_ERROR(12, SIGBUS, "stack segment", stack_segment) +#endif DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0) +#ifdef CONFIG_X86_64 +/* Runs on IST stack */ +dotraplinkage void do_stack_segment(struct pt_regs *regs, long error_code) +{ + if (notify_die(DIE_TRAP, "stack segment", regs, error_code, + 12, SIGBUS) == NOTIFY_STOP) + return; + preempt_conditional_sti(regs); + do_trap(12, SIGBUS, "stack segment", regs, error_code, NULL); + preempt_conditional_cli(regs); +} + +dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) +{ + static const char str[] = "double fault"; + struct task_struct *tsk = current; + + /* Return not checked because double check cannot be ignored */ + notify_die(DIE_TRAP, str, regs, error_code, 8, SIGSEGV); + + tsk->thread.error_code = error_code; + 
tsk->thread.trap_no = 8; + + /* This is always a kernel trap and never fixable (and thus must + never return). */ + for (;;) + die(str, regs, error_code); +} +#endif + dotraplinkage void __kprobes do_general_protection(struct pt_regs *regs, long error_code) { @@ -249,6 +306,7 @@ do_general_protection(struct pt_regs *regs, long error_code) conditional_sti(regs); +#ifdef CONFIG_X86_32 if (lazy_iobitmap_copy()) { /* restart the faulting instruction */ return; @@ -256,6 +314,7 @@ do_general_protection(struct pt_regs *regs, long error_code) if (regs->flags & X86_VM_MASK) goto gp_in_vm86; +#endif tsk = current; if (!user_mode(regs)) @@ -277,10 +336,12 @@ do_general_protection(struct pt_regs *regs, long error_code) force_sig(SIGSEGV, tsk); return; +#ifdef CONFIG_X86_32 gp_in_vm86: local_irq_enable(); handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code); return; +#endif gp_in_kernel: if (fixup_exception(regs)) @@ -368,6 +429,7 @@ unknown_nmi_error(unsigned char reason, struct pt_regs *regs) printk(KERN_EMERG "Dazed and confused, but trying to continue\n"); } +#ifdef CONFIG_X86_32 static DEFINE_SPINLOCK(nmi_print_lock); void notrace __kprobes die_nmi(char *str, struct pt_regs *regs, int do_panic) @@ -402,6 +464,7 @@ void notrace __kprobes die_nmi(char *str, struct pt_regs *regs, int do_panic) do_exit(SIGSEGV); } +#endif static notrace __kprobes void default_do_nmi(struct pt_regs *regs) { @@ -441,11 +504,13 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs) mem_parity_error(reason, regs); if (reason & 0x40) io_check_error(reason, regs); +#ifdef CONFIG_X86_32 /* * Reassert NMI in case it became active meanwhile * as it's edge-triggered: */ reassert_nmi(); +#endif } dotraplinkage notrace __kprobes void @@ -453,7 +518,11 @@ do_nmi(struct pt_regs *regs, long error_code) { nmi_enter(); +#ifdef CONFIG_X86_32 { int cpu; cpu = smp_processor_id(); ++nmi_count(cpu); } +#else + add_pda(__nmi_count, 1); +#endif if (!ignore_nmis) default_do_nmi(regs); @@ -491,6 +560,29 @@ dotraplinkage void __kprobes do_int3(struct pt_regs *regs, long error_code) preempt_conditional_cli(regs); } +#ifdef CONFIG_X86_64 +/* Help handler running on IST stack to switch back to user stack + for scheduling or signal handling. The actual stack switch is done in + entry.S */ +asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs) +{ + struct pt_regs *regs = eregs; + /* Did already sync */ + if (eregs == (struct pt_regs *)eregs->sp) + ; + /* Exception from user space */ + else if (user_mode(eregs)) + regs = task_pt_regs(current); + /* Exception from kernel and interrupts are enabled. Move to + kernel process stack. */ + else if (eregs->flags & X86_EFLAGS_IF) + regs = (struct pt_regs *)(eregs->sp -= sizeof(struct pt_regs)); + if (eregs != regs) + *regs = *eregs; + return regs; +} +#endif + /* * Our handling of the processor debug registers is non-trivial. * We do not clear them on entry and exit from the kernel. 
Therefore @@ -542,8 +634,10 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) goto clear_dr7; } +#ifdef CONFIG_X86_32 if (regs->flags & X86_VM_MASK) goto debug_vm86; +#endif /* Save debug status register where ptrace can see it */ tsk->thread.debugreg6 = condition; @@ -570,10 +664,12 @@ clear_dr7: preempt_conditional_cli(regs); return; +#ifdef CONFIG_X86_32 debug_vm86: handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, 1); preempt_conditional_cli(regs); return; +#endif clear_TF_reenable: set_tsk_thread_flag(tsk, TIF_SINGLESTEP); @@ -582,6 +678,20 @@ clear_TF_reenable: return; } +#ifdef CONFIG_X86_64 +static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr) +{ + if (fixup_exception(regs)) + return 1; + + notify_die(DIE_GPF, str, regs, 0, trapnr, SIGFPE); + /* Illegal floating point operation in the kernel */ + current->thread.trap_no = trapnr; + die(str, regs, 0); + return 0; +} +#endif + /* * Note that we play around with the 'TS' bit in an attempt to get * the correct behaviour even in the presence of the asynchronous @@ -618,7 +728,9 @@ void math_error(void __user *ip) swd = get_fpu_swd(task); switch (swd & ~cwd & 0x3f) { case 0x000: /* No unmasked exception */ +#ifdef CONFIG_X86_32 return; +#endif default: /* Multiple exceptions */ break; case 0x001: /* Invalid Op */ @@ -649,7 +761,15 @@ void math_error(void __user *ip) dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code) { conditional_sti(regs); + +#ifdef CONFIG_X86_32 ignore_fpu_irq = 1; +#else + if (!user_mode(regs) && + kernel_math_error(regs, "kernel x87 math error", 16)) + return; +#endif + math_error((void __user *)regs->ip); } @@ -706,6 +826,7 @@ do_simd_coprocessor_error(struct pt_regs *regs, long error_code) { conditional_sti(regs); +#ifdef CONFIG_X86_32 if (cpu_has_xmm) { /* Handle SIMD FPU exceptions on PIII+ processors. */ ignore_fpu_irq = 1; @@ -724,6 +845,12 @@ do_simd_coprocessor_error(struct pt_regs *regs, long error_code) current->thread.error_code = error_code; die_if_kernel("cache flush denied", regs, error_code); force_sig(SIGSEGV, current); +#else + if (!user_mode(regs) && + kernel_math_error(regs, "kernel simd math error", 19)) + return; + simd_math_error((void __user *)regs->ip); +#endif } dotraplinkage void @@ -736,6 +863,7 @@ do_spurious_interrupt_bug(struct pt_regs *regs, long error_code) #endif } +#ifdef CONFIG_X86_32 unsigned long patch_espfix_desc(unsigned long uesp, unsigned long kesp) { struct desc_struct *gdt = get_cpu_gdt_table(smp_processor_id()); @@ -754,6 +882,15 @@ unsigned long patch_espfix_desc(unsigned long uesp, unsigned long kesp) return new_kesp; } +#else +asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void) +{ +} + +asmlinkage void __attribute__((weak)) mce_threshold_interrupt(void) +{ +} +#endif /* * 'math_state_restore()' saves the current math information in the @@ -786,14 +923,24 @@ asmlinkage void math_state_restore(void) } clts(); /* Allow maths ops (or we recurse) */ +#ifdef CONFIG_X86_32 restore_fpu(tsk); +#else + /* + * Paranoid restore. send a SIGSEGV if we fail to restore the state. 
+ */ + if (unlikely(restore_fpu_checking(tsk))) { + stts(); + force_sig(SIGSEGV, tsk); + return; + } +#endif thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */ tsk->fpu_counter++; } EXPORT_SYMBOL_GPL(math_state_restore); #ifndef CONFIG_MATH_EMULATION - asmlinkage void math_emulate(long arg) { printk(KERN_EMERG @@ -802,12 +949,12 @@ asmlinkage void math_emulate(long arg) force_sig(SIGFPE, current); schedule(); } - #endif /* CONFIG_MATH_EMULATION */ dotraplinkage void __kprobes do_device_not_available(struct pt_regs *regs, long error) { +#ifdef CONFIG_X86_32 if (read_cr0() & X86_CR0_EM) { conditional_sti(regs); math_emulate(0); @@ -815,8 +962,12 @@ do_device_not_available(struct pt_regs *regs, long error) math_state_restore(); /* interrupts still off */ conditional_sti(regs); } +#else + math_state_restore(); +#endif } +#ifdef CONFIG_X86_32 #ifdef CONFIG_X86_MCE dotraplinkage void __kprobes do_machine_check(struct pt_regs *regs, long error) { @@ -839,10 +990,13 @@ dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code) return; do_trap(32, SIGILL, "iret exception", regs, error_code, &info); } +#endif void __init trap_init(void) { +#ifdef CONFIG_X86_32 int i; +#endif #ifdef CONFIG_EISA void __iomem *p = early_ioremap(0x0FFFD9, 4); @@ -862,7 +1016,11 @@ void __init trap_init(void) set_intr_gate(5, &bounds); set_intr_gate(6, &invalid_op); set_intr_gate(7, &device_not_available); +#ifdef CONFIG_X86_32 set_task_gate(8, GDT_ENTRY_DOUBLEFAULT_TSS); +#else + set_intr_gate_ist(8, &double_fault, DOUBLEFAULT_STACK); +#endif set_intr_gate(9, &coprocessor_segment_overrun); set_intr_gate(10, &invalid_TSS); set_intr_gate(11, &segment_not_present); @@ -877,6 +1035,11 @@ void __init trap_init(void) #endif set_intr_gate(19, &simd_coprocessor_error); +#ifdef CONFIG_IA32_EMULATION + set_system_intr_gate(IA32_SYSCALL_VECTOR, ia32_syscall); +#endif + +#ifdef CONFIG_X86_32 if (cpu_has_fxsr) { printk(KERN_INFO "Enabling fast FPU save and restore... "); set_in_cr4(X86_CR4_OSFXSR); @@ -896,11 +1059,13 @@ void __init trap_init(void) set_bit(i, used_vectors); set_bit(SYSCALL_VECTOR, used_vectors); - +#endif /* * Should be a barrier for any external CPU state: */ cpu_init(); +#ifdef CONFIG_X86_32 trap_init_hook(); +#endif } diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c index 22fe62a24ed..60ecc855ab8 100644 --- a/arch/x86/kernel/traps_64.c +++ b/arch/x86/kernel/traps_64.c @@ -33,13 +33,21 @@ #include #include +#ifdef CONFIG_EISA +#include +#include +#endif + +#ifdef CONFIG_MCA +#include +#endif + #if defined(CONFIG_EDAC) #include #endif #include #include -#include #include #include #include @@ -50,10 +58,35 @@ #include +#ifdef CONFIG_X86_64 #include #include #include +#else +#include +#include +#include +#include +#include + +#include "cpu/mcheck/mce.h" +DECLARE_BITMAP(used_vectors, NR_VECTORS); +EXPORT_SYMBOL_GPL(used_vectors); + +asmlinkage int system_call(void); + +/* Do we ignore FPU interrupts ? */ +char ignore_fpu_irq; + +/* + * The IDT has to be page-aligned to simplify the Pentium + * F0 0F bug workaround.. We have a special link segment + * for this. 
+ */ +gate_desc idt_table[256] + __attribute__((__section__(".data.idt"))) = { { { { 0, 0 } } }, }; +#endif static int ignore_nmis; @@ -77,15 +110,80 @@ static inline void preempt_conditional_cli(struct pt_regs *regs) dec_preempt_count(); } +#ifdef CONFIG_X86_32 +static inline void +die_if_kernel(const char *str, struct pt_regs *regs, long err) +{ + if (!user_mode_vm(regs)) + die(str, regs, err); +} + +/* + * Perform the lazy TSS's I/O bitmap copy. If the TSS has an + * invalid offset set (the LAZY one) and the faulting thread has + * a valid I/O bitmap pointer, we copy the I/O bitmap in the TSS, + * we set the offset field correctly and return 1. + */ +static int lazy_iobitmap_copy(void) +{ + struct thread_struct *thread; + struct tss_struct *tss; + int cpu; + + cpu = get_cpu(); + tss = &per_cpu(init_tss, cpu); + thread = ¤t->thread; + + if (tss->x86_tss.io_bitmap_base == INVALID_IO_BITMAP_OFFSET_LAZY && + thread->io_bitmap_ptr) { + memcpy(tss->io_bitmap, thread->io_bitmap_ptr, + thread->io_bitmap_max); + /* + * If the previously set map was extending to higher ports + * than the current one, pad extra space with 0xff (no access). + */ + if (thread->io_bitmap_max < tss->io_bitmap_max) { + memset((char *) tss->io_bitmap + + thread->io_bitmap_max, 0xff, + tss->io_bitmap_max - thread->io_bitmap_max); + } + tss->io_bitmap_max = thread->io_bitmap_max; + tss->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET; + tss->io_bitmap_owner = thread; + put_cpu(); + + return 1; + } + put_cpu(); + + return 0; +} +#endif + static void __kprobes do_trap(int trapnr, int signr, char *str, struct pt_regs *regs, long error_code, siginfo_t *info) { struct task_struct *tsk = current; +#ifdef CONFIG_X86_32 + if (regs->flags & X86_VM_MASK) { + /* + * traps 0, 1, 3, 4, and 5 should be forwarded to vm86. + * On nmi (interrupt 2), do_trap should not be called. 
+ */ + if (trapnr < 6) + goto vm86_trap; + goto trap_signal; + } +#endif + if (!user_mode(regs)) goto kernel_trap; +#ifdef CONFIG_X86_32 +trap_signal: +#endif /* * We want error_code and trap_no set for userspace faults and * kernelspace faults which result in die(), but not @@ -98,6 +196,7 @@ do_trap(int trapnr, int signr, char *str, struct pt_regs *regs, tsk->thread.error_code = error_code; tsk->thread.trap_no = trapnr; +#ifdef CONFIG_X86_64 if (show_unhandled_signals && unhandled_signal(tsk, signr) && printk_ratelimit()) { printk(KERN_INFO @@ -107,6 +206,7 @@ do_trap(int trapnr, int signr, char *str, struct pt_regs *regs, print_vma_addr(" in ", regs->ip); printk("\n"); } +#endif if (info) force_sig_info(signr, info, tsk); @@ -121,6 +221,14 @@ kernel_trap: die(str, regs, error_code); } return; + +#ifdef CONFIG_X86_32 +vm86_trap: + if (handle_vm86_trap((struct kernel_vm86_regs *) regs, + error_code, trapnr)) + goto trap_signal; + return; +#endif } #define DO_ERROR(trapnr, signr, str, name) \ @@ -155,8 +263,12 @@ DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip) DO_ERROR(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun) DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS) DO_ERROR(11, SIGBUS, "segment not present", segment_not_present) +#ifdef CONFIG_X86_32 +DO_ERROR(12, SIGBUS, "stack segment", stack_segment) +#endif DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0) +#ifdef CONFIG_X86_64 /* Runs on IST stack */ dotraplinkage void do_stack_segment(struct pt_regs *regs, long error_code) { @@ -184,6 +296,7 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) for (;;) die(str, regs, error_code); } +#endif dotraplinkage void __kprobes do_general_protection(struct pt_regs *regs, long error_code) @@ -192,6 +305,16 @@ do_general_protection(struct pt_regs *regs, long error_code) conditional_sti(regs); +#ifdef CONFIG_X86_32 + if (lazy_iobitmap_copy()) { + /* restart the faulting instruction */ + return; + } + + if (regs->flags & X86_VM_MASK) + goto gp_in_vm86; +#endif + tsk = current; if (!user_mode(regs)) goto gp_in_kernel; @@ -212,6 +335,13 @@ do_general_protection(struct pt_regs *regs, long error_code) force_sig(SIGSEGV, tsk); return; +#ifdef CONFIG_X86_32 +gp_in_vm86: + local_irq_enable(); + handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code); + return; +#endif + gp_in_kernel: if (fixup_exception(regs)) return; @@ -277,6 +407,16 @@ unknown_nmi_error(unsigned char reason, struct pt_regs *regs) if (notify_die(DIE_NMIUNKNOWN, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP) return; +#ifdef CONFIG_MCA + /* + * Might actually be able to figure out what the guilty party + * is: + */ + if (MCA_bus) { + mca_handle_nmi(); + return; + } +#endif printk(KERN_EMERG "Uhhuh. 
NMI received for unknown reason %02x on CPU %d.\n", reason, smp_processor_id()); @@ -288,6 +428,43 @@ unknown_nmi_error(unsigned char reason, struct pt_regs *regs) printk(KERN_EMERG "Dazed and confused, but trying to continue\n"); } +#ifdef CONFIG_X86_32 +static DEFINE_SPINLOCK(nmi_print_lock); + +void notrace __kprobes die_nmi(char *str, struct pt_regs *regs, int do_panic) +{ + if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP) + return; + + spin_lock(&nmi_print_lock); + /* + * We are in trouble anyway, lets at least try + * to get a message out: + */ + bust_spinlocks(1); + printk(KERN_EMERG "%s", str); + printk(" on CPU%d, ip %08lx, registers:\n", + smp_processor_id(), regs->ip); + show_registers(regs); + if (do_panic) + panic("Non maskable interrupt"); + console_silent(); + spin_unlock(&nmi_print_lock); + bust_spinlocks(0); + + /* + * If we are in kernel we are probably nested up pretty bad + * and might aswell get out now while we still can: + */ + if (!user_mode_vm(regs)) { + current->thread.trap_no = 2; + crash_kexec(regs); + } + + do_exit(SIGSEGV); +} +#endif + static notrace __kprobes void default_do_nmi(struct pt_regs *regs) { unsigned char reason = 0; @@ -303,6 +480,7 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs) if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT) == NOTIFY_STOP) return; +#ifdef CONFIG_X86_LOCAL_APIC /* * Ok, so this is none of the documented NMI sources, * so it must be the NMI watchdog. @@ -311,6 +489,9 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs) return; if (!do_nmi_callback(regs, cpu)) unknown_nmi_error(reason, regs); +#else + unknown_nmi_error(reason, regs); +#endif return; } @@ -322,6 +503,13 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs) mem_parity_error(reason, regs); if (reason & 0x40) io_check_error(reason, regs); +#ifdef CONFIG_X86_32 + /* + * Reassert NMI in case it became active meanwhile + * as it's edge-triggered: + */ + reassert_nmi(); +#endif } dotraplinkage notrace __kprobes void @@ -329,7 +517,11 @@ do_nmi(struct pt_regs *regs, long error_code) { nmi_enter(); +#ifdef CONFIG_X86_32 + { int cpu; cpu = smp_processor_id(); ++nmi_count(cpu); } +#else add_pda(__nmi_count, 1); +#endif if (!ignore_nmis) default_do_nmi(regs); @@ -352,15 +544,22 @@ void restart_nmi(void) /* May run on IST stack. */ dotraplinkage void __kprobes do_int3(struct pt_regs *regs, long error_code) { +#ifdef CONFIG_KPROBES if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) == NOTIFY_STOP) return; +#else + if (notify_die(DIE_TRAP, "int3", regs, error_code, 3, SIGTRAP) + == NOTIFY_STOP) + return; +#endif preempt_conditional_sti(regs); do_trap(3, SIGTRAP, "int3", regs, error_code, NULL); preempt_conditional_cli(regs); } +#ifdef CONFIG_X86_64 /* Help handler running on IST stack to switch back to user stack for scheduling or signal handling. The actual stack switch is done in entry.S */ @@ -381,6 +580,7 @@ asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs) *regs = *eregs; return regs; } +#endif /* * Our handling of the processor debug registers is non-trivial. 
@@ -433,6 +633,11 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) goto clear_dr7; } +#ifdef CONFIG_X86_32 + if (regs->flags & X86_VM_MASK) + goto debug_vm86; +#endif + /* Save debug status register where ptrace can see it */ tsk->thread.debugreg6 = condition; @@ -458,6 +663,13 @@ clear_dr7: preempt_conditional_cli(regs); return; +#ifdef CONFIG_X86_32 +debug_vm86: + handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, 1); + preempt_conditional_cli(regs); + return; +#endif + clear_TF_reenable: set_tsk_thread_flag(tsk, TIF_SINGLESTEP); regs->flags &= ~X86_EFLAGS_TF; @@ -465,6 +677,7 @@ clear_TF_reenable: return; } +#ifdef CONFIG_X86_64 static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr) { if (fixup_exception(regs)) @@ -476,6 +689,7 @@ static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr) die(str, regs, 0); return 0; } +#endif /* * Note that we play around with the 'TS' bit in an attempt to get @@ -513,6 +727,9 @@ void math_error(void __user *ip) swd = get_fpu_swd(task); switch (swd & ~cwd & 0x3f) { case 0x000: /* No unmasked exception */ +#ifdef CONFIG_X86_32 + return; +#endif default: /* Multiple exceptions */ break; case 0x001: /* Invalid Op */ @@ -543,9 +760,15 @@ void math_error(void __user *ip) dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code) { conditional_sti(regs); + +#ifdef CONFIG_X86_32 + ignore_fpu_irq = 1; +#else if (!user_mode(regs) && kernel_math_error(regs, "kernel x87 math error", 16)) return; +#endif + math_error((void __user *)regs->ip); } @@ -601,17 +824,64 @@ dotraplinkage void do_simd_coprocessor_error(struct pt_regs *regs, long error_code) { conditional_sti(regs); + +#ifdef CONFIG_X86_32 + if (cpu_has_xmm) { + /* Handle SIMD FPU exceptions on PIII+ processors. */ + ignore_fpu_irq = 1; + simd_math_error((void __user *)regs->ip); + return; + } + /* + * Handle strange cache flush from user space exception + * in all other cases. This is undocumented behaviour. + */ + if (regs->flags & X86_VM_MASK) { + handle_vm86_fault((struct kernel_vm86_regs *)regs, error_code); + return; + } + current->thread.trap_no = 19; + current->thread.error_code = error_code; + die_if_kernel("cache flush denied", regs, error_code); + force_sig(SIGSEGV, current); +#else if (!user_mode(regs) && kernel_math_error(regs, "kernel simd math error", 19)) return; simd_math_error((void __user *)regs->ip); +#endif } dotraplinkage void do_spurious_interrupt_bug(struct pt_regs *regs, long error_code) { + conditional_sti(regs); +#if 0 + /* No need to warn about this any longer. 
*/ + printk(KERN_INFO "Ignoring P6 Local APIC Spurious Interrupt Bug...\n"); +#endif } +#ifdef CONFIG_X86_32 +unsigned long patch_espfix_desc(unsigned long uesp, unsigned long kesp) +{ + struct desc_struct *gdt = get_cpu_gdt_table(smp_processor_id()); + unsigned long base = (kesp - uesp) & -THREAD_SIZE; + unsigned long new_kesp = kesp - base; + unsigned long lim_pages = (new_kesp | (THREAD_SIZE - 1)) >> PAGE_SHIFT; + __u64 desc = *(__u64 *)&gdt[GDT_ENTRY_ESPFIX_SS]; + + /* Set up base for espfix segment */ + desc &= 0x00f0ff0000000000ULL; + desc |= ((((__u64)base) << 16) & 0x000000ffffff0000ULL) | + ((((__u64)base) << 32) & 0xff00000000000000ULL) | + ((((__u64)lim_pages) << 32) & 0x000f000000000000ULL) | + (lim_pages & 0xffff); + *(__u64 *)&gdt[GDT_ENTRY_ESPFIX_SS] = desc; + + return new_kesp; +} +#else asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void) { } @@ -619,6 +889,7 @@ asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void) asmlinkage void __attribute__((weak)) mce_threshold_interrupt(void) { } +#endif /* * 'math_state_restore()' saves the current math information in the @@ -626,6 +897,9 @@ asmlinkage void __attribute__((weak)) mce_threshold_interrupt(void) * * Careful.. There are problems with IBM-designed IRQ13 behaviour. * Don't touch unless you *really* know how it works. + * + * Must be called with kernel preemption disabled (in this case, + * local interrupts are disabled at the call-site in entry.S). */ asmlinkage void math_state_restore(void) { @@ -648,6 +922,9 @@ asmlinkage void math_state_restore(void) } clts(); /* Allow maths ops (or we recurse) */ +#ifdef CONFIG_X86_32 + restore_fpu(tsk); +#else /* * Paranoid restore. send a SIGSEGV if we fail to restore the state. */ @@ -656,19 +933,78 @@ asmlinkage void math_state_restore(void) force_sig(SIGSEGV, tsk); return; } +#endif thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */ tsk->fpu_counter++; } EXPORT_SYMBOL_GPL(math_state_restore); +#ifndef CONFIG_MATH_EMULATION +asmlinkage void math_emulate(long arg) +{ + printk(KERN_EMERG + "math-emulation not enabled and no coprocessor found.\n"); + printk(KERN_EMERG "killing %s.\n", current->comm); + force_sig(SIGFPE, current); + schedule(); +} +#endif /* CONFIG_MATH_EMULATION */ + dotraplinkage void __kprobes do_device_not_available(struct pt_regs *regs, long error) { +#ifdef CONFIG_X86_32 + if (read_cr0() & X86_CR0_EM) { + conditional_sti(regs); + math_emulate(0); + } else { + math_state_restore(); /* interrupts still off */ + conditional_sti(regs); + } +#else math_state_restore(); +#endif +} + +#ifdef CONFIG_X86_32 +#ifdef CONFIG_X86_MCE +dotraplinkage void __kprobes do_machine_check(struct pt_regs *regs, long error) +{ + conditional_sti(regs); + machine_check_vector(regs, error); } +#endif + +dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code) +{ + siginfo_t info; + local_irq_enable(); + + info.si_signo = SIGILL; + info.si_errno = 0; + info.si_code = ILL_BADSTK; + info.si_addr = 0; + if (notify_die(DIE_TRAP, "iret exception", + regs, error_code, 32, SIGILL) == NOTIFY_STOP) + return; + do_trap(32, SIGILL, "iret exception", regs, error_code, &info); +} +#endif void __init trap_init(void) { +#ifdef CONFIG_X86_32 + int i; +#endif + +#ifdef CONFIG_EISA + void __iomem *p = early_ioremap(0x0FFFD9, 4); + + if (readl(p) == 'E' + ('I'<<8) + ('S'<<16) + ('A'<<24)) + EISA_bus = 1; + early_iounmap(p, 4); +#endif + set_intr_gate(0, ÷_error); set_intr_gate_ist(1, &debug, DEBUG_STACK); set_intr_gate_ist(2, &nmi, NMI_STACK); @@ -679,7 
+1015,11 @@ void __init trap_init(void) set_intr_gate(5, &bounds); set_intr_gate(6, &invalid_op); set_intr_gate(7, &device_not_available); +#ifdef CONFIG_X86_32 + set_task_gate(8, GDT_ENTRY_DOUBLEFAULT_TSS); +#else set_intr_gate_ist(8, &double_fault, DOUBLEFAULT_STACK); +#endif set_intr_gate(9, &coprocessor_segment_overrun); set_intr_gate(10, &invalid_TSS); set_intr_gate(11, &segment_not_present); @@ -697,8 +1037,34 @@ void __init trap_init(void) #ifdef CONFIG_IA32_EMULATION set_system_intr_gate(IA32_SYSCALL_VECTOR, ia32_syscall); #endif + +#ifdef CONFIG_X86_32 + if (cpu_has_fxsr) { + printk(KERN_INFO "Enabling fast FPU save and restore... "); + set_in_cr4(X86_CR4_OSFXSR); + printk("done.\n"); + } + if (cpu_has_xmm) { + printk(KERN_INFO + "Enabling unmasked SIMD FPU exception support... "); + set_in_cr4(X86_CR4_OSXMMEXCPT); + printk("done.\n"); + } + + set_system_trap_gate(SYSCALL_VECTOR, &system_call); + + /* Reserve all the builtin and the syscall vector: */ + for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++) + set_bit(i, used_vectors); + + set_bit(SYSCALL_VECTOR, used_vectors); +#endif /* * Should be a barrier for any external CPU state: */ cpu_init(); + +#ifdef CONFIG_X86_32 + trap_init_hook(); +#endif } -- cgit v1.2.3 From 8728861b4fead8119a1b7bb856a387320859cd98 Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Fri, 3 Oct 2008 22:00:40 +0200 Subject: traps: x86: finalize unification of traps.c traps_32.c and traps_64.c are now equal. Move one to traps.c, delete the other one and change the Makefile Signed-off-by: Alexander van Heukelum Signed-off-by: Ingo Molnar --- arch/x86/kernel/Makefile | 2 +- arch/x86/kernel/traps.c | 1071 ++++++++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/traps_32.c | 1071 -------------------------------------------- arch/x86/kernel/traps_64.c | 1070 ------------------------------------------- 4 files changed, 1072 insertions(+), 2142 deletions(-) create mode 100644 arch/x86/kernel/traps.c delete mode 100644 arch/x86/kernel/traps_32.c delete mode 100644 arch/x86/kernel/traps_64.c diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index de63fed9fae..0d41f0343dc 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -23,7 +23,7 @@ CFLAGS_hpet.o := $(nostackp) CFLAGS_tsc.o := $(nostackp) obj-y := process_$(BITS).o signal_$(BITS).o entry_$(BITS).o -obj-y += traps_$(BITS).o irq_$(BITS).o dumpstack_$(BITS).o +obj-y += traps.o irq_$(BITS).o dumpstack_$(BITS).o obj-y += time_$(BITS).o ioport.o ldt.o obj-y += setup.o i8259.o irqinit_$(BITS).o setup_percpu.o obj-$(CONFIG_X86_VISWS) += visws_quirks.o diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c new file mode 100644 index 00000000000..ffb131f74f7 --- /dev/null +++ b/arch/x86/kernel/traps.c @@ -0,0 +1,1071 @@ +/* + * Copyright (C) 1991, 1992 Linus Torvalds + * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs + * + * Pentium III FXSR, SSE support + * Gareth Hughes , May 2000 + */ + +/* + * Handle hardware traps and faults. 
+ */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_EISA +#include +#include +#endif + +#ifdef CONFIG_MCA +#include +#endif + +#if defined(CONFIG_EDAC) +#include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#ifdef CONFIG_X86_64 +#include +#include +#include +#else +#include +#include +#include +#include +#include +#include + +#include "cpu/mcheck/mce.h" + +DECLARE_BITMAP(used_vectors, NR_VECTORS); +EXPORT_SYMBOL_GPL(used_vectors); + +asmlinkage int system_call(void); + +/* Do we ignore FPU interrupts ? */ +char ignore_fpu_irq; + +/* + * The IDT has to be page-aligned to simplify the Pentium + * F0 0F bug workaround.. We have a special link segment + * for this. + */ +gate_desc idt_table[256] + __attribute__((__section__(".data.idt"))) = { { { { 0, 0 } } }, }; +#endif + +static int ignore_nmis; + +static inline void conditional_sti(struct pt_regs *regs) +{ + if (regs->flags & X86_EFLAGS_IF) + local_irq_enable(); +} + +static inline void preempt_conditional_sti(struct pt_regs *regs) +{ + inc_preempt_count(); + if (regs->flags & X86_EFLAGS_IF) + local_irq_enable(); +} + +static inline void preempt_conditional_cli(struct pt_regs *regs) +{ + if (regs->flags & X86_EFLAGS_IF) + local_irq_disable(); + dec_preempt_count(); +} + +#ifdef CONFIG_X86_32 +static inline void +die_if_kernel(const char *str, struct pt_regs *regs, long err) +{ + if (!user_mode_vm(regs)) + die(str, regs, err); +} + +/* + * Perform the lazy TSS's I/O bitmap copy. If the TSS has an + * invalid offset set (the LAZY one) and the faulting thread has + * a valid I/O bitmap pointer, we copy the I/O bitmap in the TSS, + * we set the offset field correctly and return 1. + */ +static int lazy_iobitmap_copy(void) +{ + struct thread_struct *thread; + struct tss_struct *tss; + int cpu; + + cpu = get_cpu(); + tss = &per_cpu(init_tss, cpu); + thread = ¤t->thread; + + if (tss->x86_tss.io_bitmap_base == INVALID_IO_BITMAP_OFFSET_LAZY && + thread->io_bitmap_ptr) { + memcpy(tss->io_bitmap, thread->io_bitmap_ptr, + thread->io_bitmap_max); + /* + * If the previously set map was extending to higher ports + * than the current one, pad extra space with 0xff (no access). + */ + if (thread->io_bitmap_max < tss->io_bitmap_max) { + memset((char *) tss->io_bitmap + + thread->io_bitmap_max, 0xff, + tss->io_bitmap_max - thread->io_bitmap_max); + } + tss->io_bitmap_max = thread->io_bitmap_max; + tss->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET; + tss->io_bitmap_owner = thread; + put_cpu(); + + return 1; + } + put_cpu(); + + return 0; +} +#endif + +static void __kprobes +do_trap(int trapnr, int signr, char *str, struct pt_regs *regs, + long error_code, siginfo_t *info) +{ + struct task_struct *tsk = current; + +#ifdef CONFIG_X86_32 + if (regs->flags & X86_VM_MASK) { + /* + * traps 0, 1, 3, 4, and 5 should be forwarded to vm86. + * On nmi (interrupt 2), do_trap should not be called. + */ + if (trapnr < 6) + goto vm86_trap; + goto trap_signal; + } +#endif + + if (!user_mode(regs)) + goto kernel_trap; + +#ifdef CONFIG_X86_32 +trap_signal: +#endif + /* + * We want error_code and trap_no set for userspace faults and + * kernelspace faults which result in die(), but not + * kernelspace faults which are fixed up. 
die() gives the + * process no chance to handle the signal and notice the + * kernel fault information, so that won't result in polluting + * the information about previously queued, but not yet + * delivered, faults. See also do_general_protection below. + */ + tsk->thread.error_code = error_code; + tsk->thread.trap_no = trapnr; + +#ifdef CONFIG_X86_64 + if (show_unhandled_signals && unhandled_signal(tsk, signr) && + printk_ratelimit()) { + printk(KERN_INFO + "%s[%d] trap %s ip:%lx sp:%lx error:%lx", + tsk->comm, tsk->pid, str, + regs->ip, regs->sp, error_code); + print_vma_addr(" in ", regs->ip); + printk("\n"); + } +#endif + + if (info) + force_sig_info(signr, info, tsk); + else + force_sig(signr, tsk); + return; + +kernel_trap: + if (!fixup_exception(regs)) { + tsk->thread.error_code = error_code; + tsk->thread.trap_no = trapnr; + die(str, regs, error_code); + } + return; + +#ifdef CONFIG_X86_32 +vm86_trap: + if (handle_vm86_trap((struct kernel_vm86_regs *) regs, + error_code, trapnr)) + goto trap_signal; + return; +#endif +} + +#define DO_ERROR(trapnr, signr, str, name) \ +dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \ +{ \ + if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ + == NOTIFY_STOP) \ + return; \ + conditional_sti(regs); \ + do_trap(trapnr, signr, str, regs, error_code, NULL); \ +} + +#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \ +dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \ +{ \ + siginfo_t info; \ + info.si_signo = signr; \ + info.si_errno = 0; \ + info.si_code = sicode; \ + info.si_addr = (void __user *)siaddr; \ + if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ + == NOTIFY_STOP) \ + return; \ + conditional_sti(regs); \ + do_trap(trapnr, signr, str, regs, error_code, &info); \ +} + +DO_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip) +DO_ERROR(4, SIGSEGV, "overflow", overflow) +DO_ERROR(5, SIGSEGV, "bounds", bounds) +DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip) +DO_ERROR(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun) +DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS) +DO_ERROR(11, SIGBUS, "segment not present", segment_not_present) +#ifdef CONFIG_X86_32 +DO_ERROR(12, SIGBUS, "stack segment", stack_segment) +#endif +DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0) + +#ifdef CONFIG_X86_64 +/* Runs on IST stack */ +dotraplinkage void do_stack_segment(struct pt_regs *regs, long error_code) +{ + if (notify_die(DIE_TRAP, "stack segment", regs, error_code, + 12, SIGBUS) == NOTIFY_STOP) + return; + preempt_conditional_sti(regs); + do_trap(12, SIGBUS, "stack segment", regs, error_code, NULL); + preempt_conditional_cli(regs); +} + +dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) +{ + static const char str[] = "double fault"; + struct task_struct *tsk = current; + + /* Return not checked because double check cannot be ignored */ + notify_die(DIE_TRAP, str, regs, error_code, 8, SIGSEGV); + + tsk->thread.error_code = error_code; + tsk->thread.trap_no = 8; + + /* This is always a kernel trap and never fixable (and thus must + never return). 
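
To make the DO_ERROR macros above concrete: expanding DO_ERROR(4, SIGSEGV, "overflow", overflow) by hand yields the handler below. This is a direct substitution of the macro body, shown against the kernel definitions above rather than as a standalone program.

/* Hand expansion of DO_ERROR(4, SIGSEGV, "overflow", overflow). */
dotraplinkage void do_overflow(struct pt_regs *regs, long error_code)
{
        if (notify_die(DIE_TRAP, "overflow", regs, error_code, 4, SIGSEGV)
                        == NOTIFY_STOP)
                return;
        conditional_sti(regs);
        do_trap(4, SIGSEGV, "overflow", regs, error_code, NULL);
}
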
*/ + for (;;) + die(str, regs, error_code); +} +#endif + +dotraplinkage void __kprobes +do_general_protection(struct pt_regs *regs, long error_code) +{ + struct task_struct *tsk; + + conditional_sti(regs); + +#ifdef CONFIG_X86_32 + if (lazy_iobitmap_copy()) { + /* restart the faulting instruction */ + return; + } + + if (regs->flags & X86_VM_MASK) + goto gp_in_vm86; +#endif + + tsk = current; + if (!user_mode(regs)) + goto gp_in_kernel; + + tsk->thread.error_code = error_code; + tsk->thread.trap_no = 13; + + if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) && + printk_ratelimit()) { + printk(KERN_INFO + "%s[%d] general protection ip:%lx sp:%lx error:%lx", + tsk->comm, task_pid_nr(tsk), + regs->ip, regs->sp, error_code); + print_vma_addr(" in ", regs->ip); + printk("\n"); + } + + force_sig(SIGSEGV, tsk); + return; + +#ifdef CONFIG_X86_32 +gp_in_vm86: + local_irq_enable(); + handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code); + return; +#endif + +gp_in_kernel: + if (fixup_exception(regs)) + return; + + tsk->thread.error_code = error_code; + tsk->thread.trap_no = 13; + if (notify_die(DIE_GPF, "general protection fault", regs, + error_code, 13, SIGSEGV) == NOTIFY_STOP) + return; + die("general protection fault", regs, error_code); +} + +static notrace __kprobes void +mem_parity_error(unsigned char reason, struct pt_regs *regs) +{ + printk(KERN_EMERG + "Uhhuh. NMI received for unknown reason %02x on CPU %d.\n", + reason, smp_processor_id()); + + printk(KERN_EMERG + "You have some hardware problem, likely on the PCI bus.\n"); + +#if defined(CONFIG_EDAC) + if (edac_handler_set()) { + edac_atomic_assert_error(); + return; + } +#endif + + if (panic_on_unrecovered_nmi) + panic("NMI: Not continuing"); + + printk(KERN_EMERG "Dazed and confused, but trying to continue\n"); + + /* Clear and disable the memory parity error line. */ + reason = (reason & 0xf) | 4; + outb(reason, 0x61); +} + +static notrace __kprobes void +io_check_error(unsigned char reason, struct pt_regs *regs) +{ + unsigned long i; + + printk(KERN_EMERG "NMI: IOCK error (debug interrupt?)\n"); + show_registers(regs); + + /* Re-enable the IOCK line, wait for a few seconds */ + reason = (reason & 0xf) | 8; + outb(reason, 0x61); + + i = 2000; + while (--i) + udelay(1000); + + reason &= ~8; + outb(reason, 0x61); +} + +static notrace __kprobes void +unknown_nmi_error(unsigned char reason, struct pt_regs *regs) +{ + if (notify_die(DIE_NMIUNKNOWN, "nmi", regs, reason, 2, SIGINT) == + NOTIFY_STOP) + return; +#ifdef CONFIG_MCA + /* + * Might actually be able to figure out what the guilty party + * is: + */ + if (MCA_bus) { + mca_handle_nmi(); + return; + } +#endif + printk(KERN_EMERG + "Uhhuh. 
NMI received for unknown reason %02x on CPU %d.\n", + reason, smp_processor_id()); + + printk(KERN_EMERG "Do you have a strange power saving mode enabled?\n"); + if (panic_on_unrecovered_nmi) + panic("NMI: Not continuing"); + + printk(KERN_EMERG "Dazed and confused, but trying to continue\n"); +} + +#ifdef CONFIG_X86_32 +static DEFINE_SPINLOCK(nmi_print_lock); + +void notrace __kprobes die_nmi(char *str, struct pt_regs *regs, int do_panic) +{ + if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP) + return; + + spin_lock(&nmi_print_lock); + /* + * We are in trouble anyway, lets at least try + * to get a message out: + */ + bust_spinlocks(1); + printk(KERN_EMERG "%s", str); + printk(" on CPU%d, ip %08lx, registers:\n", + smp_processor_id(), regs->ip); + show_registers(regs); + if (do_panic) + panic("Non maskable interrupt"); + console_silent(); + spin_unlock(&nmi_print_lock); + bust_spinlocks(0); + + /* + * If we are in kernel we are probably nested up pretty bad + * and might aswell get out now while we still can: + */ + if (!user_mode_vm(regs)) { + current->thread.trap_no = 2; + crash_kexec(regs); + } + + do_exit(SIGSEGV); +} +#endif + +static notrace __kprobes void default_do_nmi(struct pt_regs *regs) +{ + unsigned char reason = 0; + int cpu; + + cpu = smp_processor_id(); + + /* Only the BSP gets external NMIs from the system. */ + if (!cpu) + reason = get_nmi_reason(); + + if (!(reason & 0xc0)) { + if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT) + == NOTIFY_STOP) + return; +#ifdef CONFIG_X86_LOCAL_APIC + /* + * Ok, so this is none of the documented NMI sources, + * so it must be the NMI watchdog. + */ + if (nmi_watchdog_tick(regs, reason)) + return; + if (!do_nmi_callback(regs, cpu)) + unknown_nmi_error(reason, regs); +#else + unknown_nmi_error(reason, regs); +#endif + + return; + } + if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP) + return; + + /* AK: following checks seem to be broken on modern chipsets. FIXME */ + if (reason & 0x80) + mem_parity_error(reason, regs); + if (reason & 0x40) + io_check_error(reason, regs); +#ifdef CONFIG_X86_32 + /* + * Reassert NMI in case it became active meanwhile + * as it's edge-triggered: + */ + reassert_nmi(); +#endif +} + +dotraplinkage notrace __kprobes void +do_nmi(struct pt_regs *regs, long error_code) +{ + nmi_enter(); + +#ifdef CONFIG_X86_32 + { int cpu; cpu = smp_processor_id(); ++nmi_count(cpu); } +#else + add_pda(__nmi_count, 1); +#endif + + if (!ignore_nmis) + default_do_nmi(regs); + + nmi_exit(); +} + +void stop_nmi(void) +{ + acpi_nmi_disable(); + ignore_nmis++; +} + +void restart_nmi(void) +{ + ignore_nmis--; + acpi_nmi_enable(); +} + +/* May run on IST stack. */ +dotraplinkage void __kprobes do_int3(struct pt_regs *regs, long error_code) +{ +#ifdef CONFIG_KPROBES + if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) + == NOTIFY_STOP) + return; +#else + if (notify_die(DIE_TRAP, "int3", regs, error_code, 3, SIGTRAP) + == NOTIFY_STOP) + return; +#endif + + preempt_conditional_sti(regs); + do_trap(3, SIGTRAP, "int3", regs, error_code, NULL); + preempt_conditional_cli(regs); +} + +#ifdef CONFIG_X86_64 +/* Help handler running on IST stack to switch back to user stack + for scheduling or signal handling. 
The actual stack switch is done in + entry.S */ +asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs) +{ + struct pt_regs *regs = eregs; + /* Did already sync */ + if (eregs == (struct pt_regs *)eregs->sp) + ; + /* Exception from user space */ + else if (user_mode(eregs)) + regs = task_pt_regs(current); + /* Exception from kernel and interrupts are enabled. Move to + kernel process stack. */ + else if (eregs->flags & X86_EFLAGS_IF) + regs = (struct pt_regs *)(eregs->sp -= sizeof(struct pt_regs)); + if (eregs != regs) + *regs = *eregs; + return regs; +} +#endif + +/* + * Our handling of the processor debug registers is non-trivial. + * We do not clear them on entry and exit from the kernel. Therefore + * it is possible to get a watchpoint trap here from inside the kernel. + * However, the code in ./ptrace.c has ensured that the user can + * only set watchpoints on userspace addresses. Therefore the in-kernel + * watchpoint trap can only occur in code which is reading/writing + * from user space. Such code must not hold kernel locks (since it + * can equally take a page fault), therefore it is safe to call + * force_sig_info even though that claims and releases locks. + * + * Code in ./signal.c ensures that the debug control register + * is restored before we deliver any signal, and therefore that + * user code runs with the correct debug control register even though + * we clear it here. + * + * Being careful here means that we don't have to be as careful in a + * lot of more complicated places (task switching can be a bit lazy + * about restoring all the debug state, and ptrace doesn't have to + * find every occurrence of the TF bit that could be saved away even + * by user code) + * + * May run on IST stack. + */ +dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) +{ + struct task_struct *tsk = current; + unsigned long condition; + int si_code; + + get_debugreg(condition, 6); + + /* + * The processor cleared BTF, so don't mark that we need it set. + */ + clear_tsk_thread_flag(tsk, TIF_DEBUGCTLMSR); + tsk->thread.debugctlmsr = 0; + + if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code, + SIGTRAP) == NOTIFY_STOP) + return; + + /* It's safe to allow irq's after DR6 has been saved */ + preempt_conditional_sti(regs); + + /* Mask out spurious debug traps due to lazy DR7 setting */ + if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) { + if (!tsk->thread.debugreg7) + goto clear_dr7; + } + +#ifdef CONFIG_X86_32 + if (regs->flags & X86_VM_MASK) + goto debug_vm86; +#endif + + /* Save debug status register where ptrace can see it */ + tsk->thread.debugreg6 = condition; + + /* + * Single-stepping through TF: make sure we ignore any events in + * kernel space (but re-enable TF when returning to user mode). + */ + if (condition & DR_STEP) { + if (!user_mode(regs)) + goto clear_TF_reenable; + } + + si_code = get_si_code(condition); + /* Ok, finally something we can handle */ + send_sigtrap(tsk, regs, error_code, si_code); + + /* + * Disable additional traps. They'll be re-enabled when + * the signal is delivered. 
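
A small illustration of the DR6 masking that do_debug() performs above: the four hardware-breakpoint hit bits are ignored unless the thread actually armed DR7, and the single-step bit is handled separately. The bit values below are the architectural DR6 layout (B0-B3 in bits 0-3, BS in bit 14), matching the kernel's DR_TRAP0..DR_TRAP3 and DR_STEP constants; the sketch itself is standalone, not kernel code.

/* Illustrative DR6 decode mirroring the masking in do_debug(). */
#include <stdio.h>

#define DR_TRAP0        0x0001  /* breakpoint 0 hit */
#define DR_TRAP1        0x0002  /* breakpoint 1 hit */
#define DR_TRAP2        0x0004  /* breakpoint 2 hit */
#define DR_TRAP3        0x0008  /* breakpoint 3 hit */
#define DR_STEP         0x4000  /* single-step trap */

static void decode_dr6(unsigned long condition, unsigned long dr7)
{
        /* Breakpoint bits are spurious unless DR7 armed a watchpoint. */
        if ((condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) && !dr7)
                printf("spurious trap from lazy DR7 setting\n");
        if (condition & DR_STEP)
                printf("single-step (TF) trap\n");
}

int main(void)
{
        decode_dr6(DR_TRAP1, 0);        /* spurious: nothing armed */
        decode_dr6(DR_STEP, 0);         /* genuine single-step     */
        return 0;
}
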
+ */ +clear_dr7: + set_debugreg(0, 7); + preempt_conditional_cli(regs); + return; + +#ifdef CONFIG_X86_32 +debug_vm86: + handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, 1); + preempt_conditional_cli(regs); + return; +#endif + +clear_TF_reenable: + set_tsk_thread_flag(tsk, TIF_SINGLESTEP); + regs->flags &= ~X86_EFLAGS_TF; + preempt_conditional_cli(regs); + return; +} + +#ifdef CONFIG_X86_64 +static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr) +{ + if (fixup_exception(regs)) + return 1; + + notify_die(DIE_GPF, str, regs, 0, trapnr, SIGFPE); + /* Illegal floating point operation in the kernel */ + current->thread.trap_no = trapnr; + die(str, regs, 0); + return 0; +} +#endif + +/* + * Note that we play around with the 'TS' bit in an attempt to get + * the correct behaviour even in the presence of the asynchronous + * IRQ13 behaviour + */ +void math_error(void __user *ip) +{ + struct task_struct *task; + siginfo_t info; + unsigned short cwd, swd; + + /* + * Save the info for the exception handler and clear the error. + */ + task = current; + save_init_fpu(task); + task->thread.trap_no = 16; + task->thread.error_code = 0; + info.si_signo = SIGFPE; + info.si_errno = 0; + info.si_code = __SI_FAULT; + info.si_addr = ip; + /* + * (~cwd & swd) will mask out exceptions that are not set to unmasked + * status. 0x3f is the exception bits in these regs, 0x200 is the + * C1 reg you need in case of a stack fault, 0x040 is the stack + * fault bit. We should only be taking one exception at a time, + * so if this combination doesn't produce any single exception, + * then we have a bad program that isn't synchronizing its FPU usage + * and it will suffer the consequences since we won't be able to + * fully reproduce the context of the exception + */ + cwd = get_fpu_cwd(task); + swd = get_fpu_swd(task); + switch (swd & ~cwd & 0x3f) { + case 0x000: /* No unmasked exception */ +#ifdef CONFIG_X86_32 + return; +#endif + default: /* Multiple exceptions */ + break; + case 0x001: /* Invalid Op */ + /* + * swd & 0x240 == 0x040: Stack Underflow + * swd & 0x240 == 0x240: Stack Overflow + * User must clear the SF bit (0x40) if set + */ + info.si_code = FPE_FLTINV; + break; + case 0x002: /* Denormalize */ + case 0x010: /* Underflow */ + info.si_code = FPE_FLTUND; + break; + case 0x004: /* Zero Divide */ + info.si_code = FPE_FLTDIV; + break; + case 0x008: /* Overflow */ + info.si_code = FPE_FLTOVF; + break; + case 0x020: /* Precision */ + info.si_code = FPE_FLTRES; + break; + } + force_sig_info(SIGFPE, &info, task); +} + +dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code) +{ + conditional_sti(regs); + +#ifdef CONFIG_X86_32 + ignore_fpu_irq = 1; +#else + if (!user_mode(regs) && + kernel_math_error(regs, "kernel x87 math error", 16)) + return; +#endif + + math_error((void __user *)regs->ip); +} + +static void simd_math_error(void __user *ip) +{ + struct task_struct *task; + siginfo_t info; + unsigned short mxcsr; + + /* + * Save the info for the exception handler and clear the error. + */ + task = current; + save_init_fpu(task); + task->thread.trap_no = 19; + task->thread.error_code = 0; + info.si_signo = SIGFPE; + info.si_errno = 0; + info.si_code = __SI_FAULT; + info.si_addr = ip; + /* + * The SIMD FPU exceptions are handled a little differently, as there + * is only a single status/control register. 
Thus, to determine which + * unmasked exception was caught we must mask the exception mask bits + * at 0x1f80, and then use these to mask the exception bits at 0x3f. + */ + mxcsr = get_fpu_mxcsr(task); + switch (~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f)) { + case 0x000: + default: + break; + case 0x001: /* Invalid Op */ + info.si_code = FPE_FLTINV; + break; + case 0x002: /* Denormalize */ + case 0x010: /* Underflow */ + info.si_code = FPE_FLTUND; + break; + case 0x004: /* Zero Divide */ + info.si_code = FPE_FLTDIV; + break; + case 0x008: /* Overflow */ + info.si_code = FPE_FLTOVF; + break; + case 0x020: /* Precision */ + info.si_code = FPE_FLTRES; + break; + } + force_sig_info(SIGFPE, &info, task); +} + +dotraplinkage void +do_simd_coprocessor_error(struct pt_regs *regs, long error_code) +{ + conditional_sti(regs); + +#ifdef CONFIG_X86_32 + if (cpu_has_xmm) { + /* Handle SIMD FPU exceptions on PIII+ processors. */ + ignore_fpu_irq = 1; + simd_math_error((void __user *)regs->ip); + return; + } + /* + * Handle strange cache flush from user space exception + * in all other cases. This is undocumented behaviour. + */ + if (regs->flags & X86_VM_MASK) { + handle_vm86_fault((struct kernel_vm86_regs *)regs, error_code); + return; + } + current->thread.trap_no = 19; + current->thread.error_code = error_code; + die_if_kernel("cache flush denied", regs, error_code); + force_sig(SIGSEGV, current); +#else + if (!user_mode(regs) && + kernel_math_error(regs, "kernel simd math error", 19)) + return; + simd_math_error((void __user *)regs->ip); +#endif +} + +dotraplinkage void +do_spurious_interrupt_bug(struct pt_regs *regs, long error_code) +{ + conditional_sti(regs); +#if 0 + /* No need to warn about this any longer. */ + printk(KERN_INFO "Ignoring P6 Local APIC Spurious Interrupt Bug...\n"); +#endif +} + +#ifdef CONFIG_X86_32 +unsigned long patch_espfix_desc(unsigned long uesp, unsigned long kesp) +{ + struct desc_struct *gdt = get_cpu_gdt_table(smp_processor_id()); + unsigned long base = (kesp - uesp) & -THREAD_SIZE; + unsigned long new_kesp = kesp - base; + unsigned long lim_pages = (new_kesp | (THREAD_SIZE - 1)) >> PAGE_SHIFT; + __u64 desc = *(__u64 *)&gdt[GDT_ENTRY_ESPFIX_SS]; + + /* Set up base for espfix segment */ + desc &= 0x00f0ff0000000000ULL; + desc |= ((((__u64)base) << 16) & 0x000000ffffff0000ULL) | + ((((__u64)base) << 32) & 0xff00000000000000ULL) | + ((((__u64)lim_pages) << 32) & 0x000f000000000000ULL) | + (lim_pages & 0xffff); + *(__u64 *)&gdt[GDT_ENTRY_ESPFIX_SS] = desc; + + return new_kesp; +} +#else +asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void) +{ +} + +asmlinkage void __attribute__((weak)) mce_threshold_interrupt(void) +{ +} +#endif + +/* + * 'math_state_restore()' saves the current math information in the + * old math state array, and gets the new ones from the current task + * + * Careful.. There are problems with IBM-designed IRQ13 behaviour. + * Don't touch unless you *really* know how it works. + * + * Must be called with kernel preemption disabled (in this case, + * local interrupts are disabled at the call-site in entry.S). + */ +asmlinkage void math_state_restore(void) +{ + struct thread_info *thread = current_thread_info(); + struct task_struct *tsk = thread->task; + + if (!tsk_used_math(tsk)) { + local_irq_enable(); + /* + * does a slab alloc which can sleep + */ + if (init_fpu(tsk)) { + /* + * ran out of memory! 
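
The MXCSR decode in simd_math_error() above is compact enough to warrant a worked example: exception flags occupy bits 0-5 (0x3f), their mask bits occupy bits 7-12 (0x1f80), so shifting the masks down by 7 and inverting selects only the unmasked exceptions. A standalone sketch of exactly that computation:

/* Illustrative MXCSR decode mirroring simd_math_error() above. */
#include <stdio.h>

static const char *simd_exception_name(unsigned short mxcsr)
{
        switch (~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f)) {
        case 0x001: return "invalid operation (FPE_FLTINV)";
        case 0x002:
        case 0x010: return "denormal/underflow (FPE_FLTUND)";
        case 0x004: return "divide by zero (FPE_FLTDIV)";
        case 0x008: return "overflow (FPE_FLTOVF)";
        case 0x020: return "precision (FPE_FLTRES)";
        default:    return "none or multiple";
        }
}

int main(void)
{
        /* Zero-divide flag set (0x004), all exceptions unmasked. */
        printf("%s\n", simd_exception_name(0x0004));
        /* Same flag, but zero-divide masked (bit 9 = 0x200): ignored. */
        printf("%s\n", simd_exception_name(0x0204));
        return 0;
}
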
+ */ + do_group_exit(SIGKILL); + return; + } + local_irq_disable(); + } + + clts(); /* Allow maths ops (or we recurse) */ +#ifdef CONFIG_X86_32 + restore_fpu(tsk); +#else + /* + * Paranoid restore. send a SIGSEGV if we fail to restore the state. + */ + if (unlikely(restore_fpu_checking(tsk))) { + stts(); + force_sig(SIGSEGV, tsk); + return; + } +#endif + thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */ + tsk->fpu_counter++; +} +EXPORT_SYMBOL_GPL(math_state_restore); + +#ifndef CONFIG_MATH_EMULATION +asmlinkage void math_emulate(long arg) +{ + printk(KERN_EMERG + "math-emulation not enabled and no coprocessor found.\n"); + printk(KERN_EMERG "killing %s.\n", current->comm); + force_sig(SIGFPE, current); + schedule(); +} +#endif /* CONFIG_MATH_EMULATION */ + +dotraplinkage void __kprobes +do_device_not_available(struct pt_regs *regs, long error) +{ +#ifdef CONFIG_X86_32 + if (read_cr0() & X86_CR0_EM) { + conditional_sti(regs); + math_emulate(0); + } else { + math_state_restore(); /* interrupts still off */ + conditional_sti(regs); + } +#else + math_state_restore(); +#endif +} + +#ifdef CONFIG_X86_32 +#ifdef CONFIG_X86_MCE +dotraplinkage void __kprobes do_machine_check(struct pt_regs *regs, long error) +{ + conditional_sti(regs); + machine_check_vector(regs, error); +} +#endif + +dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code) +{ + siginfo_t info; + local_irq_enable(); + + info.si_signo = SIGILL; + info.si_errno = 0; + info.si_code = ILL_BADSTK; + info.si_addr = 0; + if (notify_die(DIE_TRAP, "iret exception", + regs, error_code, 32, SIGILL) == NOTIFY_STOP) + return; + do_trap(32, SIGILL, "iret exception", regs, error_code, &info); +} +#endif + +void __init trap_init(void) +{ +#ifdef CONFIG_X86_32 + int i; +#endif + +#ifdef CONFIG_EISA + void __iomem *p = early_ioremap(0x0FFFD9, 4); + + if (readl(p) == 'E' + ('I'<<8) + ('S'<<16) + ('A'<<24)) + EISA_bus = 1; + early_iounmap(p, 4); +#endif + + set_intr_gate(0, ÷_error); + set_intr_gate_ist(1, &debug, DEBUG_STACK); + set_intr_gate_ist(2, &nmi, NMI_STACK); + /* int3 can be called from all */ + set_system_intr_gate_ist(3, &int3, DEBUG_STACK); + /* int4 can be called from all */ + set_system_intr_gate(4, &overflow); + set_intr_gate(5, &bounds); + set_intr_gate(6, &invalid_op); + set_intr_gate(7, &device_not_available); +#ifdef CONFIG_X86_32 + set_task_gate(8, GDT_ENTRY_DOUBLEFAULT_TSS); +#else + set_intr_gate_ist(8, &double_fault, DOUBLEFAULT_STACK); +#endif + set_intr_gate(9, &coprocessor_segment_overrun); + set_intr_gate(10, &invalid_TSS); + set_intr_gate(11, &segment_not_present); + set_intr_gate_ist(12, &stack_segment, STACKFAULT_STACK); + set_intr_gate(13, &general_protection); + set_intr_gate(14, &page_fault); + set_intr_gate(15, &spurious_interrupt_bug); + set_intr_gate(16, &coprocessor_error); + set_intr_gate(17, &alignment_check); +#ifdef CONFIG_X86_MCE + set_intr_gate_ist(18, &machine_check, MCE_STACK); +#endif + set_intr_gate(19, &simd_coprocessor_error); + +#ifdef CONFIG_IA32_EMULATION + set_system_intr_gate(IA32_SYSCALL_VECTOR, ia32_syscall); +#endif + +#ifdef CONFIG_X86_32 + if (cpu_has_fxsr) { + printk(KERN_INFO "Enabling fast FPU save and restore... "); + set_in_cr4(X86_CR4_OSFXSR); + printk("done.\n"); + } + if (cpu_has_xmm) { + printk(KERN_INFO + "Enabling unmasked SIMD FPU exception support... 
"); + set_in_cr4(X86_CR4_OSXMMEXCPT); + printk("done.\n"); + } + + set_system_trap_gate(SYSCALL_VECTOR, &system_call); + + /* Reserve all the builtin and the syscall vector: */ + for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++) + set_bit(i, used_vectors); + + set_bit(SYSCALL_VECTOR, used_vectors); +#endif + /* + * Should be a barrier for any external CPU state: + */ + cpu_init(); + +#ifdef CONFIG_X86_32 + trap_init_hook(); +#endif +} diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c deleted file mode 100644 index ffb131f74f7..00000000000 --- a/arch/x86/kernel/traps_32.c +++ /dev/null @@ -1,1071 +0,0 @@ -/* - * Copyright (C) 1991, 1992 Linus Torvalds - * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs - * - * Pentium III FXSR, SSE support - * Gareth Hughes , May 2000 - */ - -/* - * Handle hardware traps and faults. - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#ifdef CONFIG_EISA -#include -#include -#endif - -#ifdef CONFIG_MCA -#include -#endif - -#if defined(CONFIG_EDAC) -#include -#endif - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#ifdef CONFIG_X86_64 -#include -#include -#include -#else -#include -#include -#include -#include -#include -#include - -#include "cpu/mcheck/mce.h" - -DECLARE_BITMAP(used_vectors, NR_VECTORS); -EXPORT_SYMBOL_GPL(used_vectors); - -asmlinkage int system_call(void); - -/* Do we ignore FPU interrupts ? */ -char ignore_fpu_irq; - -/* - * The IDT has to be page-aligned to simplify the Pentium - * F0 0F bug workaround.. We have a special link segment - * for this. - */ -gate_desc idt_table[256] - __attribute__((__section__(".data.idt"))) = { { { { 0, 0 } } }, }; -#endif - -static int ignore_nmis; - -static inline void conditional_sti(struct pt_regs *regs) -{ - if (regs->flags & X86_EFLAGS_IF) - local_irq_enable(); -} - -static inline void preempt_conditional_sti(struct pt_regs *regs) -{ - inc_preempt_count(); - if (regs->flags & X86_EFLAGS_IF) - local_irq_enable(); -} - -static inline void preempt_conditional_cli(struct pt_regs *regs) -{ - if (regs->flags & X86_EFLAGS_IF) - local_irq_disable(); - dec_preempt_count(); -} - -#ifdef CONFIG_X86_32 -static inline void -die_if_kernel(const char *str, struct pt_regs *regs, long err) -{ - if (!user_mode_vm(regs)) - die(str, regs, err); -} - -/* - * Perform the lazy TSS's I/O bitmap copy. If the TSS has an - * invalid offset set (the LAZY one) and the faulting thread has - * a valid I/O bitmap pointer, we copy the I/O bitmap in the TSS, - * we set the offset field correctly and return 1. - */ -static int lazy_iobitmap_copy(void) -{ - struct thread_struct *thread; - struct tss_struct *tss; - int cpu; - - cpu = get_cpu(); - tss = &per_cpu(init_tss, cpu); - thread = ¤t->thread; - - if (tss->x86_tss.io_bitmap_base == INVALID_IO_BITMAP_OFFSET_LAZY && - thread->io_bitmap_ptr) { - memcpy(tss->io_bitmap, thread->io_bitmap_ptr, - thread->io_bitmap_max); - /* - * If the previously set map was extending to higher ports - * than the current one, pad extra space with 0xff (no access). 
- */ - if (thread->io_bitmap_max < tss->io_bitmap_max) { - memset((char *) tss->io_bitmap + - thread->io_bitmap_max, 0xff, - tss->io_bitmap_max - thread->io_bitmap_max); - } - tss->io_bitmap_max = thread->io_bitmap_max; - tss->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET; - tss->io_bitmap_owner = thread; - put_cpu(); - - return 1; - } - put_cpu(); - - return 0; -} -#endif - -static void __kprobes -do_trap(int trapnr, int signr, char *str, struct pt_regs *regs, - long error_code, siginfo_t *info) -{ - struct task_struct *tsk = current; - -#ifdef CONFIG_X86_32 - if (regs->flags & X86_VM_MASK) { - /* - * traps 0, 1, 3, 4, and 5 should be forwarded to vm86. - * On nmi (interrupt 2), do_trap should not be called. - */ - if (trapnr < 6) - goto vm86_trap; - goto trap_signal; - } -#endif - - if (!user_mode(regs)) - goto kernel_trap; - -#ifdef CONFIG_X86_32 -trap_signal: -#endif - /* - * We want error_code and trap_no set for userspace faults and - * kernelspace faults which result in die(), but not - * kernelspace faults which are fixed up. die() gives the - * process no chance to handle the signal and notice the - * kernel fault information, so that won't result in polluting - * the information about previously queued, but not yet - * delivered, faults. See also do_general_protection below. - */ - tsk->thread.error_code = error_code; - tsk->thread.trap_no = trapnr; - -#ifdef CONFIG_X86_64 - if (show_unhandled_signals && unhandled_signal(tsk, signr) && - printk_ratelimit()) { - printk(KERN_INFO - "%s[%d] trap %s ip:%lx sp:%lx error:%lx", - tsk->comm, tsk->pid, str, - regs->ip, regs->sp, error_code); - print_vma_addr(" in ", regs->ip); - printk("\n"); - } -#endif - - if (info) - force_sig_info(signr, info, tsk); - else - force_sig(signr, tsk); - return; - -kernel_trap: - if (!fixup_exception(regs)) { - tsk->thread.error_code = error_code; - tsk->thread.trap_no = trapnr; - die(str, regs, error_code); - } - return; - -#ifdef CONFIG_X86_32 -vm86_trap: - if (handle_vm86_trap((struct kernel_vm86_regs *) regs, - error_code, trapnr)) - goto trap_signal; - return; -#endif -} - -#define DO_ERROR(trapnr, signr, str, name) \ -dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \ -{ \ - if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ - == NOTIFY_STOP) \ - return; \ - conditional_sti(regs); \ - do_trap(trapnr, signr, str, regs, error_code, NULL); \ -} - -#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \ -dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \ -{ \ - siginfo_t info; \ - info.si_signo = signr; \ - info.si_errno = 0; \ - info.si_code = sicode; \ - info.si_addr = (void __user *)siaddr; \ - if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ - == NOTIFY_STOP) \ - return; \ - conditional_sti(regs); \ - do_trap(trapnr, signr, str, regs, error_code, &info); \ -} - -DO_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip) -DO_ERROR(4, SIGSEGV, "overflow", overflow) -DO_ERROR(5, SIGSEGV, "bounds", bounds) -DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip) -DO_ERROR(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun) -DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS) -DO_ERROR(11, SIGBUS, "segment not present", segment_not_present) -#ifdef CONFIG_X86_32 -DO_ERROR(12, SIGBUS, "stack segment", stack_segment) -#endif -DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0) - -#ifdef CONFIG_X86_64 -/* Runs on IST stack */ -dotraplinkage void 
do_stack_segment(struct pt_regs *regs, long error_code) -{ - if (notify_die(DIE_TRAP, "stack segment", regs, error_code, - 12, SIGBUS) == NOTIFY_STOP) - return; - preempt_conditional_sti(regs); - do_trap(12, SIGBUS, "stack segment", regs, error_code, NULL); - preempt_conditional_cli(regs); -} - -dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) -{ - static const char str[] = "double fault"; - struct task_struct *tsk = current; - - /* Return not checked because double check cannot be ignored */ - notify_die(DIE_TRAP, str, regs, error_code, 8, SIGSEGV); - - tsk->thread.error_code = error_code; - tsk->thread.trap_no = 8; - - /* This is always a kernel trap and never fixable (and thus must - never return). */ - for (;;) - die(str, regs, error_code); -} -#endif - -dotraplinkage void __kprobes -do_general_protection(struct pt_regs *regs, long error_code) -{ - struct task_struct *tsk; - - conditional_sti(regs); - -#ifdef CONFIG_X86_32 - if (lazy_iobitmap_copy()) { - /* restart the faulting instruction */ - return; - } - - if (regs->flags & X86_VM_MASK) - goto gp_in_vm86; -#endif - - tsk = current; - if (!user_mode(regs)) - goto gp_in_kernel; - - tsk->thread.error_code = error_code; - tsk->thread.trap_no = 13; - - if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) && - printk_ratelimit()) { - printk(KERN_INFO - "%s[%d] general protection ip:%lx sp:%lx error:%lx", - tsk->comm, task_pid_nr(tsk), - regs->ip, regs->sp, error_code); - print_vma_addr(" in ", regs->ip); - printk("\n"); - } - - force_sig(SIGSEGV, tsk); - return; - -#ifdef CONFIG_X86_32 -gp_in_vm86: - local_irq_enable(); - handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code); - return; -#endif - -gp_in_kernel: - if (fixup_exception(regs)) - return; - - tsk->thread.error_code = error_code; - tsk->thread.trap_no = 13; - if (notify_die(DIE_GPF, "general protection fault", regs, - error_code, 13, SIGSEGV) == NOTIFY_STOP) - return; - die("general protection fault", regs, error_code); -} - -static notrace __kprobes void -mem_parity_error(unsigned char reason, struct pt_regs *regs) -{ - printk(KERN_EMERG - "Uhhuh. NMI received for unknown reason %02x on CPU %d.\n", - reason, smp_processor_id()); - - printk(KERN_EMERG - "You have some hardware problem, likely on the PCI bus.\n"); - -#if defined(CONFIG_EDAC) - if (edac_handler_set()) { - edac_atomic_assert_error(); - return; - } -#endif - - if (panic_on_unrecovered_nmi) - panic("NMI: Not continuing"); - - printk(KERN_EMERG "Dazed and confused, but trying to continue\n"); - - /* Clear and disable the memory parity error line. */ - reason = (reason & 0xf) | 4; - outb(reason, 0x61); -} - -static notrace __kprobes void -io_check_error(unsigned char reason, struct pt_regs *regs) -{ - unsigned long i; - - printk(KERN_EMERG "NMI: IOCK error (debug interrupt?)\n"); - show_registers(regs); - - /* Re-enable the IOCK line, wait for a few seconds */ - reason = (reason & 0xf) | 8; - outb(reason, 0x61); - - i = 2000; - while (--i) - udelay(1000); - - reason &= ~8; - outb(reason, 0x61); -} - -static notrace __kprobes void -unknown_nmi_error(unsigned char reason, struct pt_regs *regs) -{ - if (notify_die(DIE_NMIUNKNOWN, "nmi", regs, reason, 2, SIGINT) == - NOTIFY_STOP) - return; -#ifdef CONFIG_MCA - /* - * Might actually be able to figure out what the guilty party - * is: - */ - if (MCA_bus) { - mca_handle_nmi(); - return; - } -#endif - printk(KERN_EMERG - "Uhhuh. 
NMI received for unknown reason %02x on CPU %d.\n", - reason, smp_processor_id()); - - printk(KERN_EMERG "Do you have a strange power saving mode enabled?\n"); - if (panic_on_unrecovered_nmi) - panic("NMI: Not continuing"); - - printk(KERN_EMERG "Dazed and confused, but trying to continue\n"); -} - -#ifdef CONFIG_X86_32 -static DEFINE_SPINLOCK(nmi_print_lock); - -void notrace __kprobes die_nmi(char *str, struct pt_regs *regs, int do_panic) -{ - if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP) - return; - - spin_lock(&nmi_print_lock); - /* - * We are in trouble anyway, lets at least try - * to get a message out: - */ - bust_spinlocks(1); - printk(KERN_EMERG "%s", str); - printk(" on CPU%d, ip %08lx, registers:\n", - smp_processor_id(), regs->ip); - show_registers(regs); - if (do_panic) - panic("Non maskable interrupt"); - console_silent(); - spin_unlock(&nmi_print_lock); - bust_spinlocks(0); - - /* - * If we are in kernel we are probably nested up pretty bad - * and might aswell get out now while we still can: - */ - if (!user_mode_vm(regs)) { - current->thread.trap_no = 2; - crash_kexec(regs); - } - - do_exit(SIGSEGV); -} -#endif - -static notrace __kprobes void default_do_nmi(struct pt_regs *regs) -{ - unsigned char reason = 0; - int cpu; - - cpu = smp_processor_id(); - - /* Only the BSP gets external NMIs from the system. */ - if (!cpu) - reason = get_nmi_reason(); - - if (!(reason & 0xc0)) { - if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT) - == NOTIFY_STOP) - return; -#ifdef CONFIG_X86_LOCAL_APIC - /* - * Ok, so this is none of the documented NMI sources, - * so it must be the NMI watchdog. - */ - if (nmi_watchdog_tick(regs, reason)) - return; - if (!do_nmi_callback(regs, cpu)) - unknown_nmi_error(reason, regs); -#else - unknown_nmi_error(reason, regs); -#endif - - return; - } - if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP) - return; - - /* AK: following checks seem to be broken on modern chipsets. FIXME */ - if (reason & 0x80) - mem_parity_error(reason, regs); - if (reason & 0x40) - io_check_error(reason, regs); -#ifdef CONFIG_X86_32 - /* - * Reassert NMI in case it became active meanwhile - * as it's edge-triggered: - */ - reassert_nmi(); -#endif -} - -dotraplinkage notrace __kprobes void -do_nmi(struct pt_regs *regs, long error_code) -{ - nmi_enter(); - -#ifdef CONFIG_X86_32 - { int cpu; cpu = smp_processor_id(); ++nmi_count(cpu); } -#else - add_pda(__nmi_count, 1); -#endif - - if (!ignore_nmis) - default_do_nmi(regs); - - nmi_exit(); -} - -void stop_nmi(void) -{ - acpi_nmi_disable(); - ignore_nmis++; -} - -void restart_nmi(void) -{ - ignore_nmis--; - acpi_nmi_enable(); -} - -/* May run on IST stack. */ -dotraplinkage void __kprobes do_int3(struct pt_regs *regs, long error_code) -{ -#ifdef CONFIG_KPROBES - if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) - == NOTIFY_STOP) - return; -#else - if (notify_die(DIE_TRAP, "int3", regs, error_code, 3, SIGTRAP) - == NOTIFY_STOP) - return; -#endif - - preempt_conditional_sti(regs); - do_trap(3, SIGTRAP, "int3", regs, error_code, NULL); - preempt_conditional_cli(regs); -} - -#ifdef CONFIG_X86_64 -/* Help handler running on IST stack to switch back to user stack - for scheduling or signal handling. 
The actual stack switch is done in - entry.S */ -asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs) -{ - struct pt_regs *regs = eregs; - /* Did already sync */ - if (eregs == (struct pt_regs *)eregs->sp) - ; - /* Exception from user space */ - else if (user_mode(eregs)) - regs = task_pt_regs(current); - /* Exception from kernel and interrupts are enabled. Move to - kernel process stack. */ - else if (eregs->flags & X86_EFLAGS_IF) - regs = (struct pt_regs *)(eregs->sp -= sizeof(struct pt_regs)); - if (eregs != regs) - *regs = *eregs; - return regs; -} -#endif - -/* - * Our handling of the processor debug registers is non-trivial. - * We do not clear them on entry and exit from the kernel. Therefore - * it is possible to get a watchpoint trap here from inside the kernel. - * However, the code in ./ptrace.c has ensured that the user can - * only set watchpoints on userspace addresses. Therefore the in-kernel - * watchpoint trap can only occur in code which is reading/writing - * from user space. Such code must not hold kernel locks (since it - * can equally take a page fault), therefore it is safe to call - * force_sig_info even though that claims and releases locks. - * - * Code in ./signal.c ensures that the debug control register - * is restored before we deliver any signal, and therefore that - * user code runs with the correct debug control register even though - * we clear it here. - * - * Being careful here means that we don't have to be as careful in a - * lot of more complicated places (task switching can be a bit lazy - * about restoring all the debug state, and ptrace doesn't have to - * find every occurrence of the TF bit that could be saved away even - * by user code) - * - * May run on IST stack. - */ -dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) -{ - struct task_struct *tsk = current; - unsigned long condition; - int si_code; - - get_debugreg(condition, 6); - - /* - * The processor cleared BTF, so don't mark that we need it set. - */ - clear_tsk_thread_flag(tsk, TIF_DEBUGCTLMSR); - tsk->thread.debugctlmsr = 0; - - if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code, - SIGTRAP) == NOTIFY_STOP) - return; - - /* It's safe to allow irq's after DR6 has been saved */ - preempt_conditional_sti(regs); - - /* Mask out spurious debug traps due to lazy DR7 setting */ - if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) { - if (!tsk->thread.debugreg7) - goto clear_dr7; - } - -#ifdef CONFIG_X86_32 - if (regs->flags & X86_VM_MASK) - goto debug_vm86; -#endif - - /* Save debug status register where ptrace can see it */ - tsk->thread.debugreg6 = condition; - - /* - * Single-stepping through TF: make sure we ignore any events in - * kernel space (but re-enable TF when returning to user mode). - */ - if (condition & DR_STEP) { - if (!user_mode(regs)) - goto clear_TF_reenable; - } - - si_code = get_si_code(condition); - /* Ok, finally something we can handle */ - send_sigtrap(tsk, regs, error_code, si_code); - - /* - * Disable additional traps. They'll be re-enabled when - * the signal is delivered. 
- */ -clear_dr7: - set_debugreg(0, 7); - preempt_conditional_cli(regs); - return; - -#ifdef CONFIG_X86_32 -debug_vm86: - handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, 1); - preempt_conditional_cli(regs); - return; -#endif - -clear_TF_reenable: - set_tsk_thread_flag(tsk, TIF_SINGLESTEP); - regs->flags &= ~X86_EFLAGS_TF; - preempt_conditional_cli(regs); - return; -} - -#ifdef CONFIG_X86_64 -static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr) -{ - if (fixup_exception(regs)) - return 1; - - notify_die(DIE_GPF, str, regs, 0, trapnr, SIGFPE); - /* Illegal floating point operation in the kernel */ - current->thread.trap_no = trapnr; - die(str, regs, 0); - return 0; -} -#endif - -/* - * Note that we play around with the 'TS' bit in an attempt to get - * the correct behaviour even in the presence of the asynchronous - * IRQ13 behaviour - */ -void math_error(void __user *ip) -{ - struct task_struct *task; - siginfo_t info; - unsigned short cwd, swd; - - /* - * Save the info for the exception handler and clear the error. - */ - task = current; - save_init_fpu(task); - task->thread.trap_no = 16; - task->thread.error_code = 0; - info.si_signo = SIGFPE; - info.si_errno = 0; - info.si_code = __SI_FAULT; - info.si_addr = ip; - /* - * (~cwd & swd) will mask out exceptions that are not set to unmasked - * status. 0x3f is the exception bits in these regs, 0x200 is the - * C1 reg you need in case of a stack fault, 0x040 is the stack - * fault bit. We should only be taking one exception at a time, - * so if this combination doesn't produce any single exception, - * then we have a bad program that isn't synchronizing its FPU usage - * and it will suffer the consequences since we won't be able to - * fully reproduce the context of the exception - */ - cwd = get_fpu_cwd(task); - swd = get_fpu_swd(task); - switch (swd & ~cwd & 0x3f) { - case 0x000: /* No unmasked exception */ -#ifdef CONFIG_X86_32 - return; -#endif - default: /* Multiple exceptions */ - break; - case 0x001: /* Invalid Op */ - /* - * swd & 0x240 == 0x040: Stack Underflow - * swd & 0x240 == 0x240: Stack Overflow - * User must clear the SF bit (0x40) if set - */ - info.si_code = FPE_FLTINV; - break; - case 0x002: /* Denormalize */ - case 0x010: /* Underflow */ - info.si_code = FPE_FLTUND; - break; - case 0x004: /* Zero Divide */ - info.si_code = FPE_FLTDIV; - break; - case 0x008: /* Overflow */ - info.si_code = FPE_FLTOVF; - break; - case 0x020: /* Precision */ - info.si_code = FPE_FLTRES; - break; - } - force_sig_info(SIGFPE, &info, task); -} - -dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code) -{ - conditional_sti(regs); - -#ifdef CONFIG_X86_32 - ignore_fpu_irq = 1; -#else - if (!user_mode(regs) && - kernel_math_error(regs, "kernel x87 math error", 16)) - return; -#endif - - math_error((void __user *)regs->ip); -} - -static void simd_math_error(void __user *ip) -{ - struct task_struct *task; - siginfo_t info; - unsigned short mxcsr; - - /* - * Save the info for the exception handler and clear the error. - */ - task = current; - save_init_fpu(task); - task->thread.trap_no = 19; - task->thread.error_code = 0; - info.si_signo = SIGFPE; - info.si_errno = 0; - info.si_code = __SI_FAULT; - info.si_addr = ip; - /* - * The SIMD FPU exceptions are handled a little differently, as there - * is only a single status/control register. 
Thus, to determine which - * unmasked exception was caught we must mask the exception mask bits - * at 0x1f80, and then use these to mask the exception bits at 0x3f. - */ - mxcsr = get_fpu_mxcsr(task); - switch (~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f)) { - case 0x000: - default: - break; - case 0x001: /* Invalid Op */ - info.si_code = FPE_FLTINV; - break; - case 0x002: /* Denormalize */ - case 0x010: /* Underflow */ - info.si_code = FPE_FLTUND; - break; - case 0x004: /* Zero Divide */ - info.si_code = FPE_FLTDIV; - break; - case 0x008: /* Overflow */ - info.si_code = FPE_FLTOVF; - break; - case 0x020: /* Precision */ - info.si_code = FPE_FLTRES; - break; - } - force_sig_info(SIGFPE, &info, task); -} - -dotraplinkage void -do_simd_coprocessor_error(struct pt_regs *regs, long error_code) -{ - conditional_sti(regs); - -#ifdef CONFIG_X86_32 - if (cpu_has_xmm) { - /* Handle SIMD FPU exceptions on PIII+ processors. */ - ignore_fpu_irq = 1; - simd_math_error((void __user *)regs->ip); - return; - } - /* - * Handle strange cache flush from user space exception - * in all other cases. This is undocumented behaviour. - */ - if (regs->flags & X86_VM_MASK) { - handle_vm86_fault((struct kernel_vm86_regs *)regs, error_code); - return; - } - current->thread.trap_no = 19; - current->thread.error_code = error_code; - die_if_kernel("cache flush denied", regs, error_code); - force_sig(SIGSEGV, current); -#else - if (!user_mode(regs) && - kernel_math_error(regs, "kernel simd math error", 19)) - return; - simd_math_error((void __user *)regs->ip); -#endif -} - -dotraplinkage void -do_spurious_interrupt_bug(struct pt_regs *regs, long error_code) -{ - conditional_sti(regs); -#if 0 - /* No need to warn about this any longer. */ - printk(KERN_INFO "Ignoring P6 Local APIC Spurious Interrupt Bug...\n"); -#endif -} - -#ifdef CONFIG_X86_32 -unsigned long patch_espfix_desc(unsigned long uesp, unsigned long kesp) -{ - struct desc_struct *gdt = get_cpu_gdt_table(smp_processor_id()); - unsigned long base = (kesp - uesp) & -THREAD_SIZE; - unsigned long new_kesp = kesp - base; - unsigned long lim_pages = (new_kesp | (THREAD_SIZE - 1)) >> PAGE_SHIFT; - __u64 desc = *(__u64 *)&gdt[GDT_ENTRY_ESPFIX_SS]; - - /* Set up base for espfix segment */ - desc &= 0x00f0ff0000000000ULL; - desc |= ((((__u64)base) << 16) & 0x000000ffffff0000ULL) | - ((((__u64)base) << 32) & 0xff00000000000000ULL) | - ((((__u64)lim_pages) << 32) & 0x000f000000000000ULL) | - (lim_pages & 0xffff); - *(__u64 *)&gdt[GDT_ENTRY_ESPFIX_SS] = desc; - - return new_kesp; -} -#else -asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void) -{ -} - -asmlinkage void __attribute__((weak)) mce_threshold_interrupt(void) -{ -} -#endif - -/* - * 'math_state_restore()' saves the current math information in the - * old math state array, and gets the new ones from the current task - * - * Careful.. There are problems with IBM-designed IRQ13 behaviour. - * Don't touch unless you *really* know how it works. - * - * Must be called with kernel preemption disabled (in this case, - * local interrupts are disabled at the call-site in entry.S). - */ -asmlinkage void math_state_restore(void) -{ - struct thread_info *thread = current_thread_info(); - struct task_struct *tsk = thread->task; - - if (!tsk_used_math(tsk)) { - local_irq_enable(); - /* - * does a slab alloc which can sleep - */ - if (init_fpu(tsk)) { - /* - * ran out of memory! 
- */ - do_group_exit(SIGKILL); - return; - } - local_irq_disable(); - } - - clts(); /* Allow maths ops (or we recurse) */ -#ifdef CONFIG_X86_32 - restore_fpu(tsk); -#else - /* - * Paranoid restore. send a SIGSEGV if we fail to restore the state. - */ - if (unlikely(restore_fpu_checking(tsk))) { - stts(); - force_sig(SIGSEGV, tsk); - return; - } -#endif - thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */ - tsk->fpu_counter++; -} -EXPORT_SYMBOL_GPL(math_state_restore); - -#ifndef CONFIG_MATH_EMULATION -asmlinkage void math_emulate(long arg) -{ - printk(KERN_EMERG - "math-emulation not enabled and no coprocessor found.\n"); - printk(KERN_EMERG "killing %s.\n", current->comm); - force_sig(SIGFPE, current); - schedule(); -} -#endif /* CONFIG_MATH_EMULATION */ - -dotraplinkage void __kprobes -do_device_not_available(struct pt_regs *regs, long error) -{ -#ifdef CONFIG_X86_32 - if (read_cr0() & X86_CR0_EM) { - conditional_sti(regs); - math_emulate(0); - } else { - math_state_restore(); /* interrupts still off */ - conditional_sti(regs); - } -#else - math_state_restore(); -#endif -} - -#ifdef CONFIG_X86_32 -#ifdef CONFIG_X86_MCE -dotraplinkage void __kprobes do_machine_check(struct pt_regs *regs, long error) -{ - conditional_sti(regs); - machine_check_vector(regs, error); -} -#endif - -dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code) -{ - siginfo_t info; - local_irq_enable(); - - info.si_signo = SIGILL; - info.si_errno = 0; - info.si_code = ILL_BADSTK; - info.si_addr = 0; - if (notify_die(DIE_TRAP, "iret exception", - regs, error_code, 32, SIGILL) == NOTIFY_STOP) - return; - do_trap(32, SIGILL, "iret exception", regs, error_code, &info); -} -#endif - -void __init trap_init(void) -{ -#ifdef CONFIG_X86_32 - int i; -#endif - -#ifdef CONFIG_EISA - void __iomem *p = early_ioremap(0x0FFFD9, 4); - - if (readl(p) == 'E' + ('I'<<8) + ('S'<<16) + ('A'<<24)) - EISA_bus = 1; - early_iounmap(p, 4); -#endif - - set_intr_gate(0, ÷_error); - set_intr_gate_ist(1, &debug, DEBUG_STACK); - set_intr_gate_ist(2, &nmi, NMI_STACK); - /* int3 can be called from all */ - set_system_intr_gate_ist(3, &int3, DEBUG_STACK); - /* int4 can be called from all */ - set_system_intr_gate(4, &overflow); - set_intr_gate(5, &bounds); - set_intr_gate(6, &invalid_op); - set_intr_gate(7, &device_not_available); -#ifdef CONFIG_X86_32 - set_task_gate(8, GDT_ENTRY_DOUBLEFAULT_TSS); -#else - set_intr_gate_ist(8, &double_fault, DOUBLEFAULT_STACK); -#endif - set_intr_gate(9, &coprocessor_segment_overrun); - set_intr_gate(10, &invalid_TSS); - set_intr_gate(11, &segment_not_present); - set_intr_gate_ist(12, &stack_segment, STACKFAULT_STACK); - set_intr_gate(13, &general_protection); - set_intr_gate(14, &page_fault); - set_intr_gate(15, &spurious_interrupt_bug); - set_intr_gate(16, &coprocessor_error); - set_intr_gate(17, &alignment_check); -#ifdef CONFIG_X86_MCE - set_intr_gate_ist(18, &machine_check, MCE_STACK); -#endif - set_intr_gate(19, &simd_coprocessor_error); - -#ifdef CONFIG_IA32_EMULATION - set_system_intr_gate(IA32_SYSCALL_VECTOR, ia32_syscall); -#endif - -#ifdef CONFIG_X86_32 - if (cpu_has_fxsr) { - printk(KERN_INFO "Enabling fast FPU save and restore... "); - set_in_cr4(X86_CR4_OSFXSR); - printk("done.\n"); - } - if (cpu_has_xmm) { - printk(KERN_INFO - "Enabling unmasked SIMD FPU exception support... 
"); - set_in_cr4(X86_CR4_OSXMMEXCPT); - printk("done.\n"); - } - - set_system_trap_gate(SYSCALL_VECTOR, &system_call); - - /* Reserve all the builtin and the syscall vector: */ - for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++) - set_bit(i, used_vectors); - - set_bit(SYSCALL_VECTOR, used_vectors); -#endif - /* - * Should be a barrier for any external CPU state: - */ - cpu_init(); - -#ifdef CONFIG_X86_32 - trap_init_hook(); -#endif -} diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c deleted file mode 100644 index 60ecc855ab8..00000000000 --- a/arch/x86/kernel/traps_64.c +++ /dev/null @@ -1,1070 +0,0 @@ -/* - * Copyright (C) 1991, 1992 Linus Torvalds - * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs - * - * Pentium III FXSR, SSE support - * Gareth Hughes , May 2000 - */ - -/* - * Handle hardware traps and faults. - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#ifdef CONFIG_EISA -#include -#include -#endif - -#ifdef CONFIG_MCA -#include -#endif - -#if defined(CONFIG_EDAC) -#include -#endif - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#ifdef CONFIG_X86_64 -#include -#include -#include -#else -#include -#include -#include -#include -#include - -#include "cpu/mcheck/mce.h" - -DECLARE_BITMAP(used_vectors, NR_VECTORS); -EXPORT_SYMBOL_GPL(used_vectors); - -asmlinkage int system_call(void); - -/* Do we ignore FPU interrupts ? */ -char ignore_fpu_irq; - -/* - * The IDT has to be page-aligned to simplify the Pentium - * F0 0F bug workaround.. We have a special link segment - * for this. - */ -gate_desc idt_table[256] - __attribute__((__section__(".data.idt"))) = { { { { 0, 0 } } }, }; -#endif - -static int ignore_nmis; - -static inline void conditional_sti(struct pt_regs *regs) -{ - if (regs->flags & X86_EFLAGS_IF) - local_irq_enable(); -} - -static inline void preempt_conditional_sti(struct pt_regs *regs) -{ - inc_preempt_count(); - if (regs->flags & X86_EFLAGS_IF) - local_irq_enable(); -} - -static inline void preempt_conditional_cli(struct pt_regs *regs) -{ - if (regs->flags & X86_EFLAGS_IF) - local_irq_disable(); - dec_preempt_count(); -} - -#ifdef CONFIG_X86_32 -static inline void -die_if_kernel(const char *str, struct pt_regs *regs, long err) -{ - if (!user_mode_vm(regs)) - die(str, regs, err); -} - -/* - * Perform the lazy TSS's I/O bitmap copy. If the TSS has an - * invalid offset set (the LAZY one) and the faulting thread has - * a valid I/O bitmap pointer, we copy the I/O bitmap in the TSS, - * we set the offset field correctly and return 1. - */ -static int lazy_iobitmap_copy(void) -{ - struct thread_struct *thread; - struct tss_struct *tss; - int cpu; - - cpu = get_cpu(); - tss = &per_cpu(init_tss, cpu); - thread = ¤t->thread; - - if (tss->x86_tss.io_bitmap_base == INVALID_IO_BITMAP_OFFSET_LAZY && - thread->io_bitmap_ptr) { - memcpy(tss->io_bitmap, thread->io_bitmap_ptr, - thread->io_bitmap_max); - /* - * If the previously set map was extending to higher ports - * than the current one, pad extra space with 0xff (no access). 
- */ - if (thread->io_bitmap_max < tss->io_bitmap_max) { - memset((char *) tss->io_bitmap + - thread->io_bitmap_max, 0xff, - tss->io_bitmap_max - thread->io_bitmap_max); - } - tss->io_bitmap_max = thread->io_bitmap_max; - tss->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET; - tss->io_bitmap_owner = thread; - put_cpu(); - - return 1; - } - put_cpu(); - - return 0; -} -#endif - -static void __kprobes -do_trap(int trapnr, int signr, char *str, struct pt_regs *regs, - long error_code, siginfo_t *info) -{ - struct task_struct *tsk = current; - -#ifdef CONFIG_X86_32 - if (regs->flags & X86_VM_MASK) { - /* - * traps 0, 1, 3, 4, and 5 should be forwarded to vm86. - * On nmi (interrupt 2), do_trap should not be called. - */ - if (trapnr < 6) - goto vm86_trap; - goto trap_signal; - } -#endif - - if (!user_mode(regs)) - goto kernel_trap; - -#ifdef CONFIG_X86_32 -trap_signal: -#endif - /* - * We want error_code and trap_no set for userspace faults and - * kernelspace faults which result in die(), but not - * kernelspace faults which are fixed up. die() gives the - * process no chance to handle the signal and notice the - * kernel fault information, so that won't result in polluting - * the information about previously queued, but not yet - * delivered, faults. See also do_general_protection below. - */ - tsk->thread.error_code = error_code; - tsk->thread.trap_no = trapnr; - -#ifdef CONFIG_X86_64 - if (show_unhandled_signals && unhandled_signal(tsk, signr) && - printk_ratelimit()) { - printk(KERN_INFO - "%s[%d] trap %s ip:%lx sp:%lx error:%lx", - tsk->comm, tsk->pid, str, - regs->ip, regs->sp, error_code); - print_vma_addr(" in ", regs->ip); - printk("\n"); - } -#endif - - if (info) - force_sig_info(signr, info, tsk); - else - force_sig(signr, tsk); - return; - -kernel_trap: - if (!fixup_exception(regs)) { - tsk->thread.error_code = error_code; - tsk->thread.trap_no = trapnr; - die(str, regs, error_code); - } - return; - -#ifdef CONFIG_X86_32 -vm86_trap: - if (handle_vm86_trap((struct kernel_vm86_regs *) regs, - error_code, trapnr)) - goto trap_signal; - return; -#endif -} - -#define DO_ERROR(trapnr, signr, str, name) \ -dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \ -{ \ - if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ - == NOTIFY_STOP) \ - return; \ - conditional_sti(regs); \ - do_trap(trapnr, signr, str, regs, error_code, NULL); \ -} - -#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \ -dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \ -{ \ - siginfo_t info; \ - info.si_signo = signr; \ - info.si_errno = 0; \ - info.si_code = sicode; \ - info.si_addr = (void __user *)siaddr; \ - if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ - == NOTIFY_STOP) \ - return; \ - conditional_sti(regs); \ - do_trap(trapnr, signr, str, regs, error_code, &info); \ -} - -DO_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip) -DO_ERROR(4, SIGSEGV, "overflow", overflow) -DO_ERROR(5, SIGSEGV, "bounds", bounds) -DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip) -DO_ERROR(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun) -DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS) -DO_ERROR(11, SIGBUS, "segment not present", segment_not_present) -#ifdef CONFIG_X86_32 -DO_ERROR(12, SIGBUS, "stack segment", stack_segment) -#endif -DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0) - -#ifdef CONFIG_X86_64 -/* Runs on IST stack */ -dotraplinkage void 
do_stack_segment(struct pt_regs *regs, long error_code) -{ - if (notify_die(DIE_TRAP, "stack segment", regs, error_code, - 12, SIGBUS) == NOTIFY_STOP) - return; - preempt_conditional_sti(regs); - do_trap(12, SIGBUS, "stack segment", regs, error_code, NULL); - preempt_conditional_cli(regs); -} - -dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) -{ - static const char str[] = "double fault"; - struct task_struct *tsk = current; - - /* Return not checked because double check cannot be ignored */ - notify_die(DIE_TRAP, str, regs, error_code, 8, SIGSEGV); - - tsk->thread.error_code = error_code; - tsk->thread.trap_no = 8; - - /* This is always a kernel trap and never fixable (and thus must - never return). */ - for (;;) - die(str, regs, error_code); -} -#endif - -dotraplinkage void __kprobes -do_general_protection(struct pt_regs *regs, long error_code) -{ - struct task_struct *tsk; - - conditional_sti(regs); - -#ifdef CONFIG_X86_32 - if (lazy_iobitmap_copy()) { - /* restart the faulting instruction */ - return; - } - - if (regs->flags & X86_VM_MASK) - goto gp_in_vm86; -#endif - - tsk = current; - if (!user_mode(regs)) - goto gp_in_kernel; - - tsk->thread.error_code = error_code; - tsk->thread.trap_no = 13; - - if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) && - printk_ratelimit()) { - printk(KERN_INFO - "%s[%d] general protection ip:%lx sp:%lx error:%lx", - tsk->comm, task_pid_nr(tsk), - regs->ip, regs->sp, error_code); - print_vma_addr(" in ", regs->ip); - printk("\n"); - } - - force_sig(SIGSEGV, tsk); - return; - -#ifdef CONFIG_X86_32 -gp_in_vm86: - local_irq_enable(); - handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code); - return; -#endif - -gp_in_kernel: - if (fixup_exception(regs)) - return; - - tsk->thread.error_code = error_code; - tsk->thread.trap_no = 13; - if (notify_die(DIE_GPF, "general protection fault", regs, - error_code, 13, SIGSEGV) == NOTIFY_STOP) - return; - die("general protection fault", regs, error_code); -} - -static notrace __kprobes void -mem_parity_error(unsigned char reason, struct pt_regs *regs) -{ - printk(KERN_EMERG - "Uhhuh. NMI received for unknown reason %02x on CPU %d.\n", - reason, smp_processor_id()); - - printk(KERN_EMERG - "You have some hardware problem, likely on the PCI bus.\n"); - -#if defined(CONFIG_EDAC) - if (edac_handler_set()) { - edac_atomic_assert_error(); - return; - } -#endif - - if (panic_on_unrecovered_nmi) - panic("NMI: Not continuing"); - - printk(KERN_EMERG "Dazed and confused, but trying to continue\n"); - - /* Clear and disable the memory parity error line. */ - reason = (reason & 0xf) | 4; - outb(reason, 0x61); -} - -static notrace __kprobes void -io_check_error(unsigned char reason, struct pt_regs *regs) -{ - unsigned long i; - - printk(KERN_EMERG "NMI: IOCK error (debug interrupt?)\n"); - show_registers(regs); - - /* Re-enable the IOCK line, wait for a few seconds */ - reason = (reason & 0xf) | 8; - outb(reason, 0x61); - - i = 2000; - while (--i) - udelay(1000); - - reason &= ~8; - outb(reason, 0x61); -} - -static notrace __kprobes void -unknown_nmi_error(unsigned char reason, struct pt_regs *regs) -{ - if (notify_die(DIE_NMIUNKNOWN, "nmi", regs, reason, 2, SIGINT) == - NOTIFY_STOP) - return; -#ifdef CONFIG_MCA - /* - * Might actually be able to figure out what the guilty party - * is: - */ - if (MCA_bus) { - mca_handle_nmi(); - return; - } -#endif - printk(KERN_EMERG - "Uhhuh. 
NMI received for unknown reason %02x on CPU %d.\n", - reason, smp_processor_id()); - - printk(KERN_EMERG "Do you have a strange power saving mode enabled?\n"); - if (panic_on_unrecovered_nmi) - panic("NMI: Not continuing"); - - printk(KERN_EMERG "Dazed and confused, but trying to continue\n"); -} - -#ifdef CONFIG_X86_32 -static DEFINE_SPINLOCK(nmi_print_lock); - -void notrace __kprobes die_nmi(char *str, struct pt_regs *regs, int do_panic) -{ - if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP) - return; - - spin_lock(&nmi_print_lock); - /* - * We are in trouble anyway, lets at least try - * to get a message out: - */ - bust_spinlocks(1); - printk(KERN_EMERG "%s", str); - printk(" on CPU%d, ip %08lx, registers:\n", - smp_processor_id(), regs->ip); - show_registers(regs); - if (do_panic) - panic("Non maskable interrupt"); - console_silent(); - spin_unlock(&nmi_print_lock); - bust_spinlocks(0); - - /* - * If we are in kernel we are probably nested up pretty bad - * and might aswell get out now while we still can: - */ - if (!user_mode_vm(regs)) { - current->thread.trap_no = 2; - crash_kexec(regs); - } - - do_exit(SIGSEGV); -} -#endif - -static notrace __kprobes void default_do_nmi(struct pt_regs *regs) -{ - unsigned char reason = 0; - int cpu; - - cpu = smp_processor_id(); - - /* Only the BSP gets external NMIs from the system. */ - if (!cpu) - reason = get_nmi_reason(); - - if (!(reason & 0xc0)) { - if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT) - == NOTIFY_STOP) - return; -#ifdef CONFIG_X86_LOCAL_APIC - /* - * Ok, so this is none of the documented NMI sources, - * so it must be the NMI watchdog. - */ - if (nmi_watchdog_tick(regs, reason)) - return; - if (!do_nmi_callback(regs, cpu)) - unknown_nmi_error(reason, regs); -#else - unknown_nmi_error(reason, regs); -#endif - - return; - } - if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP) - return; - - /* AK: following checks seem to be broken on modern chipsets. FIXME */ - if (reason & 0x80) - mem_parity_error(reason, regs); - if (reason & 0x40) - io_check_error(reason, regs); -#ifdef CONFIG_X86_32 - /* - * Reassert NMI in case it became active meanwhile - * as it's edge-triggered: - */ - reassert_nmi(); -#endif -} - -dotraplinkage notrace __kprobes void -do_nmi(struct pt_regs *regs, long error_code) -{ - nmi_enter(); - -#ifdef CONFIG_X86_32 - { int cpu; cpu = smp_processor_id(); ++nmi_count(cpu); } -#else - add_pda(__nmi_count, 1); -#endif - - if (!ignore_nmis) - default_do_nmi(regs); - - nmi_exit(); -} - -void stop_nmi(void) -{ - acpi_nmi_disable(); - ignore_nmis++; -} - -void restart_nmi(void) -{ - ignore_nmis--; - acpi_nmi_enable(); -} - -/* May run on IST stack. */ -dotraplinkage void __kprobes do_int3(struct pt_regs *regs, long error_code) -{ -#ifdef CONFIG_KPROBES - if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) - == NOTIFY_STOP) - return; -#else - if (notify_die(DIE_TRAP, "int3", regs, error_code, 3, SIGTRAP) - == NOTIFY_STOP) - return; -#endif - - preempt_conditional_sti(regs); - do_trap(3, SIGTRAP, "int3", regs, error_code, NULL); - preempt_conditional_cli(regs); -} - -#ifdef CONFIG_X86_64 -/* Help handler running on IST stack to switch back to user stack - for scheduling or signal handling. 
The actual stack switch is done in - entry.S */ -asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs) -{ - struct pt_regs *regs = eregs; - /* Did already sync */ - if (eregs == (struct pt_regs *)eregs->sp) - ; - /* Exception from user space */ - else if (user_mode(eregs)) - regs = task_pt_regs(current); - /* Exception from kernel and interrupts are enabled. Move to - kernel process stack. */ - else if (eregs->flags & X86_EFLAGS_IF) - regs = (struct pt_regs *)(eregs->sp -= sizeof(struct pt_regs)); - if (eregs != regs) - *regs = *eregs; - return regs; -} -#endif - -/* - * Our handling of the processor debug registers is non-trivial. - * We do not clear them on entry and exit from the kernel. Therefore - * it is possible to get a watchpoint trap here from inside the kernel. - * However, the code in ./ptrace.c has ensured that the user can - * only set watchpoints on userspace addresses. Therefore the in-kernel - * watchpoint trap can only occur in code which is reading/writing - * from user space. Such code must not hold kernel locks (since it - * can equally take a page fault), therefore it is safe to call - * force_sig_info even though that claims and releases locks. - * - * Code in ./signal.c ensures that the debug control register - * is restored before we deliver any signal, and therefore that - * user code runs with the correct debug control register even though - * we clear it here. - * - * Being careful here means that we don't have to be as careful in a - * lot of more complicated places (task switching can be a bit lazy - * about restoring all the debug state, and ptrace doesn't have to - * find every occurrence of the TF bit that could be saved away even - * by user code) - * - * May run on IST stack. - */ -dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) -{ - struct task_struct *tsk = current; - unsigned long condition; - int si_code; - - get_debugreg(condition, 6); - - /* - * The processor cleared BTF, so don't mark that we need it set. - */ - clear_tsk_thread_flag(tsk, TIF_DEBUGCTLMSR); - tsk->thread.debugctlmsr = 0; - - if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code, - SIGTRAP) == NOTIFY_STOP) - return; - - /* It's safe to allow irq's after DR6 has been saved */ - preempt_conditional_sti(regs); - - /* Mask out spurious debug traps due to lazy DR7 setting */ - if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) { - if (!tsk->thread.debugreg7) - goto clear_dr7; - } - -#ifdef CONFIG_X86_32 - if (regs->flags & X86_VM_MASK) - goto debug_vm86; -#endif - - /* Save debug status register where ptrace can see it */ - tsk->thread.debugreg6 = condition; - - /* - * Single-stepping through TF: make sure we ignore any events in - * kernel space (but re-enable TF when returning to user mode). - */ - if (condition & DR_STEP) { - if (!user_mode(regs)) - goto clear_TF_reenable; - } - - si_code = get_si_code(condition); - /* Ok, finally something we can handle */ - send_sigtrap(tsk, regs, error_code, si_code); - - /* - * Disable additional traps. They'll be re-enabled when - * the signal is delivered. 
- */ -clear_dr7: - set_debugreg(0, 7); - preempt_conditional_cli(regs); - return; - -#ifdef CONFIG_X86_32 -debug_vm86: - handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, 1); - preempt_conditional_cli(regs); - return; -#endif - -clear_TF_reenable: - set_tsk_thread_flag(tsk, TIF_SINGLESTEP); - regs->flags &= ~X86_EFLAGS_TF; - preempt_conditional_cli(regs); - return; -} - -#ifdef CONFIG_X86_64 -static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr) -{ - if (fixup_exception(regs)) - return 1; - - notify_die(DIE_GPF, str, regs, 0, trapnr, SIGFPE); - /* Illegal floating point operation in the kernel */ - current->thread.trap_no = trapnr; - die(str, regs, 0); - return 0; -} -#endif - -/* - * Note that we play around with the 'TS' bit in an attempt to get - * the correct behaviour even in the presence of the asynchronous - * IRQ13 behaviour - */ -void math_error(void __user *ip) -{ - struct task_struct *task; - siginfo_t info; - unsigned short cwd, swd; - - /* - * Save the info for the exception handler and clear the error. - */ - task = current; - save_init_fpu(task); - task->thread.trap_no = 16; - task->thread.error_code = 0; - info.si_signo = SIGFPE; - info.si_errno = 0; - info.si_code = __SI_FAULT; - info.si_addr = ip; - /* - * (~cwd & swd) will mask out exceptions that are not set to unmasked - * status. 0x3f is the exception bits in these regs, 0x200 is the - * C1 reg you need in case of a stack fault, 0x040 is the stack - * fault bit. We should only be taking one exception at a time, - * so if this combination doesn't produce any single exception, - * then we have a bad program that isn't synchronizing its FPU usage - * and it will suffer the consequences since we won't be able to - * fully reproduce the context of the exception - */ - cwd = get_fpu_cwd(task); - swd = get_fpu_swd(task); - switch (swd & ~cwd & 0x3f) { - case 0x000: /* No unmasked exception */ -#ifdef CONFIG_X86_32 - return; -#endif - default: /* Multiple exceptions */ - break; - case 0x001: /* Invalid Op */ - /* - * swd & 0x240 == 0x040: Stack Underflow - * swd & 0x240 == 0x240: Stack Overflow - * User must clear the SF bit (0x40) if set - */ - info.si_code = FPE_FLTINV; - break; - case 0x002: /* Denormalize */ - case 0x010: /* Underflow */ - info.si_code = FPE_FLTUND; - break; - case 0x004: /* Zero Divide */ - info.si_code = FPE_FLTDIV; - break; - case 0x008: /* Overflow */ - info.si_code = FPE_FLTOVF; - break; - case 0x020: /* Precision */ - info.si_code = FPE_FLTRES; - break; - } - force_sig_info(SIGFPE, &info, task); -} - -dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code) -{ - conditional_sti(regs); - -#ifdef CONFIG_X86_32 - ignore_fpu_irq = 1; -#else - if (!user_mode(regs) && - kernel_math_error(regs, "kernel x87 math error", 16)) - return; -#endif - - math_error((void __user *)regs->ip); -} - -static void simd_math_error(void __user *ip) -{ - struct task_struct *task; - siginfo_t info; - unsigned short mxcsr; - - /* - * Save the info for the exception handler and clear the error. - */ - task = current; - save_init_fpu(task); - task->thread.trap_no = 19; - task->thread.error_code = 0; - info.si_signo = SIGFPE; - info.si_errno = 0; - info.si_code = __SI_FAULT; - info.si_addr = ip; - /* - * The SIMD FPU exceptions are handled a little differently, as there - * is only a single status/control register. 
Thus, to determine which - * unmasked exception was caught we must mask the exception mask bits - * at 0x1f80, and then use these to mask the exception bits at 0x3f. - */ - mxcsr = get_fpu_mxcsr(task); - switch (~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f)) { - case 0x000: - default: - break; - case 0x001: /* Invalid Op */ - info.si_code = FPE_FLTINV; - break; - case 0x002: /* Denormalize */ - case 0x010: /* Underflow */ - info.si_code = FPE_FLTUND; - break; - case 0x004: /* Zero Divide */ - info.si_code = FPE_FLTDIV; - break; - case 0x008: /* Overflow */ - info.si_code = FPE_FLTOVF; - break; - case 0x020: /* Precision */ - info.si_code = FPE_FLTRES; - break; - } - force_sig_info(SIGFPE, &info, task); -} - -dotraplinkage void -do_simd_coprocessor_error(struct pt_regs *regs, long error_code) -{ - conditional_sti(regs); - -#ifdef CONFIG_X86_32 - if (cpu_has_xmm) { - /* Handle SIMD FPU exceptions on PIII+ processors. */ - ignore_fpu_irq = 1; - simd_math_error((void __user *)regs->ip); - return; - } - /* - * Handle strange cache flush from user space exception - * in all other cases. This is undocumented behaviour. - */ - if (regs->flags & X86_VM_MASK) { - handle_vm86_fault((struct kernel_vm86_regs *)regs, error_code); - return; - } - current->thread.trap_no = 19; - current->thread.error_code = error_code; - die_if_kernel("cache flush denied", regs, error_code); - force_sig(SIGSEGV, current); -#else - if (!user_mode(regs) && - kernel_math_error(regs, "kernel simd math error", 19)) - return; - simd_math_error((void __user *)regs->ip); -#endif -} - -dotraplinkage void -do_spurious_interrupt_bug(struct pt_regs *regs, long error_code) -{ - conditional_sti(regs); -#if 0 - /* No need to warn about this any longer. */ - printk(KERN_INFO "Ignoring P6 Local APIC Spurious Interrupt Bug...\n"); -#endif -} - -#ifdef CONFIG_X86_32 -unsigned long patch_espfix_desc(unsigned long uesp, unsigned long kesp) -{ - struct desc_struct *gdt = get_cpu_gdt_table(smp_processor_id()); - unsigned long base = (kesp - uesp) & -THREAD_SIZE; - unsigned long new_kesp = kesp - base; - unsigned long lim_pages = (new_kesp | (THREAD_SIZE - 1)) >> PAGE_SHIFT; - __u64 desc = *(__u64 *)&gdt[GDT_ENTRY_ESPFIX_SS]; - - /* Set up base for espfix segment */ - desc &= 0x00f0ff0000000000ULL; - desc |= ((((__u64)base) << 16) & 0x000000ffffff0000ULL) | - ((((__u64)base) << 32) & 0xff00000000000000ULL) | - ((((__u64)lim_pages) << 32) & 0x000f000000000000ULL) | - (lim_pages & 0xffff); - *(__u64 *)&gdt[GDT_ENTRY_ESPFIX_SS] = desc; - - return new_kesp; -} -#else -asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void) -{ -} - -asmlinkage void __attribute__((weak)) mce_threshold_interrupt(void) -{ -} -#endif - -/* - * 'math_state_restore()' saves the current math information in the - * old math state array, and gets the new ones from the current task - * - * Careful.. There are problems with IBM-designed IRQ13 behaviour. - * Don't touch unless you *really* know how it works. - * - * Must be called with kernel preemption disabled (in this case, - * local interrupts are disabled at the call-site in entry.S). - */ -asmlinkage void math_state_restore(void) -{ - struct thread_info *thread = current_thread_info(); - struct task_struct *tsk = thread->task; - - if (!tsk_used_math(tsk)) { - local_irq_enable(); - /* - * does a slab alloc which can sleep - */ - if (init_fpu(tsk)) { - /* - * ran out of memory! 
- */ - do_group_exit(SIGKILL); - return; - } - local_irq_disable(); - } - - clts(); /* Allow maths ops (or we recurse) */ -#ifdef CONFIG_X86_32 - restore_fpu(tsk); -#else - /* - * Paranoid restore. send a SIGSEGV if we fail to restore the state. - */ - if (unlikely(restore_fpu_checking(tsk))) { - stts(); - force_sig(SIGSEGV, tsk); - return; - } -#endif - thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */ - tsk->fpu_counter++; -} -EXPORT_SYMBOL_GPL(math_state_restore); - -#ifndef CONFIG_MATH_EMULATION -asmlinkage void math_emulate(long arg) -{ - printk(KERN_EMERG - "math-emulation not enabled and no coprocessor found.\n"); - printk(KERN_EMERG "killing %s.\n", current->comm); - force_sig(SIGFPE, current); - schedule(); -} -#endif /* CONFIG_MATH_EMULATION */ - -dotraplinkage void __kprobes -do_device_not_available(struct pt_regs *regs, long error) -{ -#ifdef CONFIG_X86_32 - if (read_cr0() & X86_CR0_EM) { - conditional_sti(regs); - math_emulate(0); - } else { - math_state_restore(); /* interrupts still off */ - conditional_sti(regs); - } -#else - math_state_restore(); -#endif -} - -#ifdef CONFIG_X86_32 -#ifdef CONFIG_X86_MCE -dotraplinkage void __kprobes do_machine_check(struct pt_regs *regs, long error) -{ - conditional_sti(regs); - machine_check_vector(regs, error); -} -#endif - -dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code) -{ - siginfo_t info; - local_irq_enable(); - - info.si_signo = SIGILL; - info.si_errno = 0; - info.si_code = ILL_BADSTK; - info.si_addr = 0; - if (notify_die(DIE_TRAP, "iret exception", - regs, error_code, 32, SIGILL) == NOTIFY_STOP) - return; - do_trap(32, SIGILL, "iret exception", regs, error_code, &info); -} -#endif - -void __init trap_init(void) -{ -#ifdef CONFIG_X86_32 - int i; -#endif - -#ifdef CONFIG_EISA - void __iomem *p = early_ioremap(0x0FFFD9, 4); - - if (readl(p) == 'E' + ('I'<<8) + ('S'<<16) + ('A'<<24)) - EISA_bus = 1; - early_iounmap(p, 4); -#endif - - set_intr_gate(0, ÷_error); - set_intr_gate_ist(1, &debug, DEBUG_STACK); - set_intr_gate_ist(2, &nmi, NMI_STACK); - /* int3 can be called from all */ - set_system_intr_gate_ist(3, &int3, DEBUG_STACK); - /* int4 can be called from all */ - set_system_intr_gate(4, &overflow); - set_intr_gate(5, &bounds); - set_intr_gate(6, &invalid_op); - set_intr_gate(7, &device_not_available); -#ifdef CONFIG_X86_32 - set_task_gate(8, GDT_ENTRY_DOUBLEFAULT_TSS); -#else - set_intr_gate_ist(8, &double_fault, DOUBLEFAULT_STACK); -#endif - set_intr_gate(9, &coprocessor_segment_overrun); - set_intr_gate(10, &invalid_TSS); - set_intr_gate(11, &segment_not_present); - set_intr_gate_ist(12, &stack_segment, STACKFAULT_STACK); - set_intr_gate(13, &general_protection); - set_intr_gate(14, &page_fault); - set_intr_gate(15, &spurious_interrupt_bug); - set_intr_gate(16, &coprocessor_error); - set_intr_gate(17, &alignment_check); -#ifdef CONFIG_X86_MCE - set_intr_gate_ist(18, &machine_check, MCE_STACK); -#endif - set_intr_gate(19, &simd_coprocessor_error); - -#ifdef CONFIG_IA32_EMULATION - set_system_intr_gate(IA32_SYSCALL_VECTOR, ia32_syscall); -#endif - -#ifdef CONFIG_X86_32 - if (cpu_has_fxsr) { - printk(KERN_INFO "Enabling fast FPU save and restore... "); - set_in_cr4(X86_CR4_OSFXSR); - printk("done.\n"); - } - if (cpu_has_xmm) { - printk(KERN_INFO - "Enabling unmasked SIMD FPU exception support... 
"); - set_in_cr4(X86_CR4_OSXMMEXCPT); - printk("done.\n"); - } - - set_system_trap_gate(SYSCALL_VECTOR, &system_call); - - /* Reserve all the builtin and the syscall vector: */ - for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++) - set_bit(i, used_vectors); - - set_bit(SYSCALL_VECTOR, used_vectors); -#endif - /* - * Should be a barrier for any external CPU state: - */ - cpu_init(); - -#ifdef CONFIG_X86_32 - trap_init_hook(); -#endif -} -- cgit v1.2.3 From dd6e4eba1c03c9562fced21736453396c045f869 Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Sat, 4 Oct 2008 23:12:40 +0200 Subject: dumpstack: x86: move die_nmi to dumpstack_32.c For some reason die_nmi is still defined in traps.c for i386, but is found in dumpstack_64.c for x86_64. Move it to dumpstack_32.c Signed-off-by: Alexander van Heukelum Signed-off-by: Ingo Molnar --- arch/x86/kernel/dumpstack_32.c | 36 ++++++++++++++++++++++++++++++++++++ arch/x86/kernel/traps.c | 37 ------------------------------------- 2 files changed, 36 insertions(+), 37 deletions(-) diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index c398b27df6c..dc9ca7ee1c4 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -403,6 +403,42 @@ void die(const char *str, struct pt_regs *regs, long err) oops_end(flags, regs, SIGSEGV); } +static DEFINE_SPINLOCK(nmi_print_lock); + +void notrace __kprobes +die_nmi(char *str, struct pt_regs *regs, int do_panic) +{ + if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP) + return; + + spin_lock(&nmi_print_lock); + /* + * We are in trouble anyway, lets at least try + * to get a message out: + */ + bust_spinlocks(1); + printk(KERN_EMERG "%s", str); + printk(" on CPU%d, ip %08lx, registers:\n", + smp_processor_id(), regs->ip); + show_registers(regs); + if (do_panic) + panic("Non maskable interrupt"); + console_silent(); + spin_unlock(&nmi_print_lock); + bust_spinlocks(0); + + /* + * If we are in kernel we are probably nested up pretty bad + * and might aswell get out now while we still can: + */ + if (!user_mode_vm(regs)) { + current->thread.trap_no = 2; + crash_kexec(regs); + } + + do_exit(SIGSEGV); +} + static int __init kstack_setup(char *s) { kstack_depth_to_print = simple_strtoul(s, NULL, 0); diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index ffb131f74f7..e062974cce3 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -429,43 +429,6 @@ unknown_nmi_error(unsigned char reason, struct pt_regs *regs) printk(KERN_EMERG "Dazed and confused, but trying to continue\n"); } -#ifdef CONFIG_X86_32 -static DEFINE_SPINLOCK(nmi_print_lock); - -void notrace __kprobes die_nmi(char *str, struct pt_regs *regs, int do_panic) -{ - if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP) - return; - - spin_lock(&nmi_print_lock); - /* - * We are in trouble anyway, lets at least try - * to get a message out: - */ - bust_spinlocks(1); - printk(KERN_EMERG "%s", str); - printk(" on CPU%d, ip %08lx, registers:\n", - smp_processor_id(), regs->ip); - show_registers(regs); - if (do_panic) - panic("Non maskable interrupt"); - console_silent(); - spin_unlock(&nmi_print_lock); - bust_spinlocks(0); - - /* - * If we are in kernel we are probably nested up pretty bad - * and might aswell get out now while we still can: - */ - if (!user_mode_vm(regs)) { - current->thread.trap_no = 2; - crash_kexec(regs); - } - - do_exit(SIGSEGV); -} -#endif - static notrace __kprobes void default_do_nmi(struct pt_regs *regs) { unsigned char reason = 0; 
-- cgit v1.2.3 From 161827903bdc124655f4cd976b9f0a5ac6ebf21c Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Sat, 4 Oct 2008 23:12:41 +0200 Subject: dumpstack: x86: make printk_address equal - x86_64: use %p to print an address - make i386-version the same as the above The result should be the same on x86_64; on i386 the output only changes if CONFIG_KALLSYMS is turned off, in which case the address is printed twice. Signed-off-by: Alexander van Heukelum Signed-off-by: Ingo Molnar --- arch/x86/kernel/dumpstack_32.c | 27 ++------------------------- arch/x86/kernel/dumpstack_64.c | 4 ++-- 2 files changed, 4 insertions(+), 27 deletions(-) diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index dc9ca7ee1c4..4e67a005c7a 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -23,31 +23,8 @@ static int die_counter; void printk_address(unsigned long address, int reliable) { -#ifdef CONFIG_KALLSYMS - unsigned long offset = 0; - unsigned long symsize; - const char *symname; - char *modname; - char *delim = ":"; - char namebuf[KSYM_NAME_LEN]; - char reliab[4] = ""; - - symname = kallsyms_lookup(address, &symsize, &offset, - &modname, namebuf); - if (!symname) { - printk(" [<%08lx>]\n", address); - return; - } - if (!reliable) - strcpy(reliab, "? "); - - if (!modname) - modname = delim = ""; - printk(" [<%08lx>] %s%s%s%s%s+0x%lx/0x%lx\n", - address, reliab, delim, modname, delim, symname, offset, symsize); -#else - printk(" [<%08lx>]\n", address); -#endif + printk(" [<%p>] %s%pS\n", (void *) address, + reliable ? "" : "? ", (void *) address); } static inline int valid_stack_ptr(struct thread_info *tinfo, diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 6f1505074db..563554c9068 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -23,8 +23,8 @@ static int die_counter; void printk_address(unsigned long address, int reliable) { - printk(" [<%016lx>] %s%pS\n", - address, reliable ? "" : "? ", (void *) address); + printk(" [<%p>] %s%pS\n", (void *) address, + reliable ? "" : "? 
", (void *) address); } static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, -- cgit v1.2.3 From 3a18512db00e0eedca86e3db4d2e81f8fe0b1774 Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Sat, 4 Oct 2008 23:12:42 +0200 Subject: dumpstack: x86: add "end" parameter to valid_stack_ptr and print_context_stack - Add "end" parameter to valid_stack_ptr and print_context_stack - use sizeof(long) as the size of a word on the stack Signed-off-by: Alexander van Heukelum Signed-off-by: Ingo Molnar --- arch/x86/kernel/dumpstack_32.c | 19 +++++++++++++------ arch/x86/kernel/dumpstack_64.c | 2 +- 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index 4e67a005c7a..466d55dfeb8 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -28,10 +28,16 @@ void printk_address(unsigned long address, int reliable) } static inline int valid_stack_ptr(struct thread_info *tinfo, - void *p, unsigned int size) + void *p, unsigned int size, void *end) { void *t = tinfo; - return p > t && p <= t + THREAD_SIZE - size; + if (end) { + if (p < end && p >= (end-THREAD_SIZE)) + return 1; + else + return 0; + } + return p > t && p < t + THREAD_SIZE - size; } /* The form of the top of the frame on the stack */ @@ -43,16 +49,17 @@ struct stack_frame { static inline unsigned long print_context_stack(struct thread_info *tinfo, unsigned long *stack, unsigned long bp, - const struct stacktrace_ops *ops, void *data) + const struct stacktrace_ops *ops, void *data, + unsigned long *end) { struct stack_frame *frame = (struct stack_frame *)bp; - while (valid_stack_ptr(tinfo, stack, sizeof(*stack))) { + while (valid_stack_ptr(tinfo, stack, sizeof(*stack), end)) { unsigned long addr; addr = *stack; if (__kernel_text_address(addr)) { - if ((unsigned long) stack == bp + 4) { + if ((unsigned long) stack == bp + sizeof(long)) { ops->address(data, addr, 1); frame = frame->next_frame; bp = (unsigned long) frame; @@ -96,7 +103,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, context = (struct thread_info *) ((unsigned long)stack & (~(THREAD_SIZE - 1))); - bp = print_context_stack(context, stack, bp, ops, data); + bp = print_context_stack(context, stack, bp, ops, data, NULL); /* * Should be after the line below, but somewhere * in early boot context comes out corrupted and we diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 563554c9068..361afa8864b 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -141,7 +141,7 @@ print_context_stack(struct thread_info *tinfo, addr = *stack; if (__kernel_text_address(addr)) { - if ((unsigned long) stack == bp + 8) { + if ((unsigned long) stack == bp + sizeof(long)) { ops->address(data, addr, 1); frame = frame->next_frame; bp = (unsigned long) frame; -- cgit v1.2.3 From 2ac53721f37c79acddaf60f6ff232f56b7abddba Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Sat, 4 Oct 2008 23:12:43 +0200 Subject: dumptrace: x86: consistently include loglevel, print stack switch - i386 and x86_64: always printk the 'data' parameter - i386: announce stack switch (irq -> normal) - i386: check if there is a stack switch before announcing it There is a warning that 'context' might come out corrupt in early boot. If this is true it should be fixed, not worked around. 
Signed-off-by: Alexander van Heukelum Signed-off-by: Ingo Molnar --- arch/x86/kernel/dumpstack_32.c | 17 ++++++----------- arch/x86/kernel/dumpstack_64.c | 9 +++++++-- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index 466d55dfeb8..517b507bc56 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -104,16 +104,12 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, context = (struct thread_info *) ((unsigned long)stack & (~(THREAD_SIZE - 1))); bp = print_context_stack(context, stack, bp, ops, data, NULL); - /* - * Should be after the line below, but somewhere - * in early boot context comes out corrupted and we - * can't reference it: - */ - if (ops->stack(data, "IRQ") < 0) - break; + stack = (unsigned long *)context->previous_esp; if (!stack) break; + if (ops->stack(data, "IRQ") < 0) + break; touch_nmi_watchdog(); } } @@ -134,6 +130,7 @@ static void print_trace_warning(void *data, char *msg) static int print_trace_stack(void *data, char *name) { + printk("%s <%s> ", (char *)data, name); return 0; } @@ -142,11 +139,9 @@ static int print_trace_stack(void *data, char *name) */ static void print_trace_address(void *data, unsigned long addr, int reliable) { - printk("%s [<%08lx>] ", (char *)data, addr); - if (!reliable) - printk("? "); - print_symbol("%s\n", addr); touch_nmi_watchdog(); + printk(data); + printk_address(addr, reliable); } static const struct stacktrace_ops print_trace_ops = { diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 361afa8864b..521c833cdc3 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -247,24 +247,29 @@ EXPORT_SYMBOL(dump_trace); static void print_trace_warning_symbol(void *data, char *msg, unsigned long symbol) { + printk(data); print_symbol(msg, symbol); printk("\n"); } static void print_trace_warning(void *data, char *msg) { - printk("%s\n", msg); + printk("%s%s\n", (char *)data, msg); } static int print_trace_stack(void *data, char *name) { - printk(" <%s> ", name); + printk("%s <%s> ", (char *)data, name); return 0; } +/* + * Print one address/symbol entries per line. 
+ */ static void print_trace_address(void *data, unsigned long addr, int reliable) { touch_nmi_watchdog(); + printk(data); printk_address(addr, reliable); } -- cgit v1.2.3 From ca0a816403c53411bb6b6fb8bf60cef30695b09d Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Sat, 4 Oct 2008 23:12:44 +0200 Subject: dumpstack: x86: use log_lvl and unify trace formatting - x86: Write log_lvl strings if available - start raw stack dumps on new line - i386: Remove extra indentation for raw stack dumps Signed-off-by: Alexander van Heukelum Signed-off-by: Ingo Molnar --- arch/x86/kernel/dumpstack_32.c | 18 +++++++++--------- arch/x86/kernel/dumpstack_64.c | 8 ++++---- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index 517b507bc56..09bc3330d55 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -155,8 +155,8 @@ static void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, unsigned long *stack, unsigned long bp, char *log_lvl) { + printk("%sCall Trace:\n", log_lvl); dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl); - printk("%s =======================\n", log_lvl); } void show_trace(struct task_struct *task, struct pt_regs *regs, @@ -184,17 +184,16 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, if (kstack_end(stack)) break; if (i && ((i % 8) == 0)) - printk("\n%s ", log_lvl); - printk("%08lx ", *stack++); + printk("\n%s", log_lvl); + printk(" %08lx", *stack++); + touch_nmi_watchdog(); } - printk("\n%sCall Trace:\n", log_lvl); - + printk("\n"); show_trace_log_lvl(task, regs, sp, bp, log_lvl); } void show_stack(struct task_struct *task, unsigned long *sp) { - printk(" "); show_stack_log_lvl(task, NULL, sp, 0, ""); } @@ -229,7 +228,7 @@ void show_registers(struct pt_regs *regs) print_modules(); __show_regs(regs, 0); - printk(KERN_EMERG "Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)", + printk(KERN_EMERG "Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)\n", TASK_COMM_LEN, current->comm, task_pid_nr(current), current_thread_info(), current, task_thread_info(current)); /* @@ -242,8 +241,9 @@ void show_registers(struct pt_regs *regs) unsigned char c; u8 *ip; - printk("\n" KERN_EMERG "Stack: "); - show_stack_log_lvl(NULL, regs, ®s->sp, 0, KERN_EMERG); + printk(KERN_EMERG "Stack:\n"); + show_stack_log_lvl(NULL, regs, ®s->sp, + 0, KERN_EMERG); printk(KERN_EMERG "Code: "); diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 521c833cdc3..7fd32944cea 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -284,7 +284,7 @@ static void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, unsigned long *stack, unsigned long bp, char *log_lvl) { - printk("Call Trace:\n"); + printk("%sCall Trace:\n", log_lvl); dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl); } @@ -330,7 +330,7 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, break; } if (i && ((i % 4) == 0)) - printk("\n"); + printk("\n%s", log_lvl); printk(" %016lx", *stack++); touch_nmi_watchdog(); } @@ -388,9 +388,9 @@ void show_registers(struct pt_regs *regs) unsigned char c; u8 *ip; - printk("Stack: "); + printk(KERN_EMERG "Stack:\n"); show_stack_log_lvl(NULL, regs, (unsigned long *)sp, - regs->bp, ""); + regs->bp, KERN_EMERG); printk(KERN_EMERG "Code: "); -- cgit v1.2.3 From 802a67de0cbd1ef10df80ad48b840e2103b13e52 Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Sat, 4 Oct 
2008 23:12:45 +0200 Subject: dumpstack: i386: make kstack= an early boot-param and add oops=panic - make kstack= and early_param - add oops=panic, setting panic_on_oops Signed-off-by: Alexander van Heukelum Signed-off-by: Ingo Molnar --- arch/x86/kernel/dumpstack_32.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index 09bc3330d55..9a8920abe4b 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -418,13 +418,24 @@ die_nmi(char *str, struct pt_regs *regs, int do_panic) do_exit(SIGSEGV); } +static int __init oops_setup(char *s) +{ + if (!s) + return -EINVAL; + if (!strcmp(s, "panic")) + panic_on_oops = 1; + return 0; +} +early_param("oops", oops_setup); + static int __init kstack_setup(char *s) { + if (!s) + return -EINVAL; kstack_depth_to_print = simple_strtoul(s, NULL, 0); - - return 1; + return 0; } -__setup("kstack=", kstack_setup); +early_param("kstack", kstack_setup); static int __init code_bytes_setup(char *s) { -- cgit v1.2.3 From 8a541665b9063c5006b370d4488cf9f6beb6083f Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Sat, 4 Oct 2008 23:12:46 +0200 Subject: dumpstack: x86: various small unification steps - define STACKSLOTS_PER_LINE and use it - define get_bp macro to hide the %%ebp/%%rbp difference - i386: check task==NULL in dump_trace, like x86_64 - i386: show_trace(NULL, ...) uses current automatically - x86_64: use [#%d] for die_counter, like i386 - whitespace and comments Signed-off-by: Alexander van Heukelum Signed-off-by: Ingo Molnar --- arch/x86/kernel/dumpstack_32.c | 23 +++++++++++------------ arch/x86/kernel/dumpstack_64.c | 15 +++++++++------ 2 files changed, 20 insertions(+), 18 deletions(-) diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index 9a8920abe4b..201ee359a1a 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -16,8 +16,11 @@ #include +#define STACKSLOTS_PER_LINE 8 +#define get_bp(bp) asm("movl %%ebp, %0" : "=r" (bp) :) + int panic_on_unrecovered_nmi; -int kstack_depth_to_print = 24; +int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE; static unsigned int code_bytes = 64; static int die_counter; @@ -82,7 +85,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, if (!stack) { unsigned long dummy; stack = &dummy; - if (task != current) + if (task && task != current) stack = (unsigned long *)task->thread.sp; } @@ -90,7 +93,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, if (!bp) { if (task == current) { /* Grab bp right from our regs */ - asm("movl %%ebp, %0" : "=r" (bp) :); + get_bp(bp); } else { /* bp is the last reg pushed by switch_to */ bp = *(unsigned long *) task->thread.sp; @@ -167,7 +170,7 @@ void show_trace(struct task_struct *task, struct pt_regs *regs, static void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *sp, unsigned long bp, char *log_lvl) + unsigned long *sp, unsigned long bp, char *log_lvl) { unsigned long *stack; int i; @@ -183,7 +186,7 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, for (i = 0; i < kstack_depth_to_print; i++) { if (kstack_end(stack)) break; - if (i && ((i % 8) == 0)) + if (i && ((i % STACKSLOTS_PER_LINE) == 0)) printk("\n%s", log_lvl); printk(" %08lx", *stack++); touch_nmi_watchdog(); @@ -207,7 +210,7 @@ void dump_stack(void) #ifdef CONFIG_FRAME_POINTER if (!bp) - asm("movl %%ebp, %0" : "=r" (bp):); + get_bp(bp); #endif 
printk("Pid: %d, comm: %.20s %s %s %.*s\n", @@ -215,8 +218,7 @@ void dump_stack(void) init_utsname()->release, (int)strcspn(init_utsname()->version, " "), init_utsname()->version); - - show_trace(current, NULL, &stack, bp); + show_trace(NULL, NULL, &stack, bp); } EXPORT_SYMBOL(dump_stack); @@ -249,7 +251,7 @@ void show_registers(struct pt_regs *regs) ip = (u8 *)regs->ip - code_prologue; if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) { - /* try starting at EIP */ + /* try starting at IP */ ip = (u8 *)regs->ip; code_len = code_len - code_prologue + 1; } @@ -317,13 +319,10 @@ void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr) if (kexec_should_crash(current)) crash_kexec(regs); - if (in_interrupt()) panic("Fatal exception in interrupt"); - if (panic_on_oops) panic("Fatal exception"); - oops_exit(); do_exit(signr); } diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 7fd32944cea..13379a98829 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -16,8 +16,11 @@ #include +#define STACKSLOTS_PER_LINE 4 +#define get_bp(bp) asm("movl %%rbp, %0" : "=r" (bp) :) + int panic_on_unrecovered_nmi; -int kstack_depth_to_print = 12; +int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE; static unsigned int code_bytes = 64; static int die_counter; @@ -177,7 +180,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, if (!bp) { if (task == current) { /* Grab bp right from our regs */ - asm("movq %%rbp, %0" : "=r" (bp) : ); + get_bp(bp); } else { /* bp is the last reg pushed by switch_to */ bp = *(unsigned long *) task->thread.sp; @@ -329,7 +332,7 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, if (((long) stack & (THREAD_SIZE-1)) == 0) break; } - if (i && ((i % 4) == 0)) + if (i && ((i % STACKSLOTS_PER_LINE) == 0)) printk("\n%s", log_lvl); printk(" %016lx", *stack++); touch_nmi_watchdog(); @@ -353,7 +356,7 @@ void dump_stack(void) #ifdef CONFIG_FRAME_POINTER if (!bp) - asm("movq %%rbp, %0" : "=r" (bp) : ); + get_bp(bp); #endif printk("Pid: %d, comm: %.20s %s %s %.*s\n", @@ -396,7 +399,7 @@ void show_registers(struct pt_regs *regs) ip = (u8 *)regs->ip - code_prologue; if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) { - /* try starting at RIP */ + /* try starting at IP */ ip = (u8 *)regs->ip; code_len = code_len - code_prologue + 1; } @@ -475,7 +478,7 @@ void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr) int __kprobes __die(const char *str, struct pt_regs *regs, long err) { - printk(KERN_EMERG "%s: %04lx [%u] ", str, err & 0xffff, ++die_counter); + printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter); #ifdef CONFIG_PREEMPT printk("PREEMPT "); #endif -- cgit v1.2.3 From 649c6653fa94ec8f3ea32b19c97b790ec4e8e4ac Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 5 Oct 2008 16:52:24 +0200 Subject: x86: improve UP kernel when CPU-hotplug and SMP is enabled num_possible_cpus() can be > 1 when disabled CPUs have been accounted. Disabled CPUs are not in the cpu_present_map, so we can use num_present_cpus() as a safe indicator to switch to UP alternatives. 
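For illustration (the numbers here are hypothetical, not from the commit): on a dual-core board with one core disabled by firmware, the CPU accounting diverges as follows, which is why the present count is the reliable UP test:

        /*
         * Illustrative only: 2 physical cores, one ACPI-disabled.
         *
         *   cpu_present_map  = { 0 }      num_present_cpus()  == 1
         *   disabled_cpus    = 1          (kept around for later hotplug)
         *   cpu_possible_map = { 0, 1 }   num_possible_cpus() == 2
         *
         * The old num_possible_cpus() == 1 check therefore kept the SMP
         * alternatives on such a box even though only one CPU runs at boot.
         */
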
Reported-by: Chuck Ebbert Signed-off-by: Thomas Gleixner Cc: Signed-off-by: Ingo Molnar --- arch/x86/kernel/alternative.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index fb04e49776b..a84ac7b570e 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -444,7 +444,7 @@ void __init alternative_instructions(void) _text, _etext); /* Only switch to UP mode if we don't immediately boot others */ - if (num_possible_cpus() == 1 || setup_max_cpus <= 1) + if (num_present_cpus() == 1 || setup_max_cpus <= 1) alternatives_smp_switch(0); } #endif -- cgit v1.2.3 From b807305059c28fb8197496c944bfaa6b372a40ad Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 5 Oct 2008 17:12:36 +0200 Subject: x86: remove additional_cpus configurability additional_cpus= parameter is dangerous and broken: for example if we boot additional_cpus=-2 on a stock dual-core system it will crash the box on bootup. So reduce the maze of code a bit by removingthe user-configurability angle. Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 18 ------------------ arch/x86/kernel/smpboot.c | 8 +------- 2 files changed, 1 insertion(+), 25 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index adaca6fa927..fc8351f374f 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1438,24 +1438,6 @@ config HOTPLUG_CPU automatically on SMP systems. ) Say N if you want to disable CPU hotplug. -config HOTPLUG_RESTRICT_TO_BOOTUP_CPUS - def_bool n - prompt "Restrict CPU hotplugging to processors found during boot" if HOTPLUG_CPU - ---help--- - Say no here to use the default, which allows as many CPUs as are marked - "disabled" by ACPI or MPTABLES to be hotplugged after bootup. - - Say yes if you do not want to allow CPUs to be added after booting, for - example if you only need CPU hotplugging enabled for suspend/resume. - - If CPU_HOTPLUG is enabled this value may be overridden at boot time - with the "additional_cpus" kernel parameter. - -config HOTPLUG_ADDITIONAL_CPUS - int - default 0 if !HOTPLUG_CPU || HOTPLUG_RESTRICT_TO_BOOTUP_CPUS - default -1 - config COMPAT_VDSO def_bool y prompt "Compat VDSO support" diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 857a88bb919..8dd201c3132 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1261,7 +1261,7 @@ void __init native_smp_cpus_done(unsigned int max_cpus) check_nmi_watchdog(); } -static int additional_cpus __initdata = CONFIG_HOTPLUG_ADDITIONAL_CPUS; +static int additional_cpus = -1; /* * cpu_possible_map should be static, it cannot change as cpu's @@ -1334,12 +1334,6 @@ static void remove_siblinginfo(int cpu) cpu_clear(cpu, cpu_sibling_setup_map); } -static __init int setup_additional_cpus(char *s) -{ - return s && get_option(&s, &additional_cpus) ? 0 : -EINVAL; -} -early_param("additional_cpus", setup_additional_cpus); - static void __ref remove_cpu_from_maps(int cpu) { cpu_clear(cpu, cpu_online_map); -- cgit v1.2.3 From cb48bb59995d2d14a0c732835c80bbcfb354de31 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 5 Oct 2008 17:51:52 +0200 Subject: x86: remove additional_cpus remove remainder of additional_cpus logic. We now just listen to the disabled_cpus value like we did for years. disabled_cpus is always >= 0 so no need for an extra check. 
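The resulting sizing rule is small enough to state directly; a sketch of the simplified prefill_possible_map() arithmetic, mirroring the diff below:

        /* Possible CPUs are the ones found in the tables plus the ones
         * firmware disabled, capped at the compile-time limit. */
        possible = num_processors + disabled_cpus;
        if (possible > NR_CPUS)
                possible = NR_CPUS;
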
Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/smpboot.c | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 8dd201c3132..8c3aca7cb34 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1261,8 +1261,6 @@ void __init native_smp_cpus_done(unsigned int max_cpus) check_nmi_watchdog(); } -static int additional_cpus = -1; - /* * cpu_possible_map should be static, it cannot change as cpu's * are onlined, or offlined. The reason is per-cpu data-structures @@ -1282,21 +1280,13 @@ static int additional_cpus = -1; */ __init void prefill_possible_map(void) { - int i; - int possible; + int i, possible; /* no processor from mptable or madt */ if (!num_processors) num_processors = 1; - if (additional_cpus == -1) { - if (disabled_cpus > 0) - additional_cpus = disabled_cpus; - else - additional_cpus = 0; - } - - possible = num_processors + additional_cpus; + possible = num_processors + disabled_cpus; if (possible > NR_CPUS) possible = NR_CPUS; -- cgit v1.2.3 From 6a2ae2d9f9212de47c16e23080162aada882c8b6 Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Sun, 5 Oct 2008 12:39:36 +0200 Subject: dumpstack: x86: various small unification steps, fix After "dumpstack: x86: various small unification steps", the assembler gives the following compile error. The error is in dumpstack_64.c. {standard input}: Assembler messages: {standard input}:720: Error: Incorrect register `%rbx' used with `l' suffix {standard input}:1340: Error: Incorrect register `%r12' used with `l' suffix Indeed the suffix in get_bp() was wrong. Signed-off-by: Alexander van Heukelum Signed-off-by: Ingo Molnar --- arch/x86/kernel/dumpstack_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 13379a98829..086cc8118e3 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -17,7 +17,7 @@ #include #define STACKSLOTS_PER_LINE 4 -#define get_bp(bp) asm("movl %%rbp, %0" : "=r" (bp) :) +#define get_bp(bp) asm("movq %%rbp, %0" : "=r" (bp) :) int panic_on_unrecovered_nmi; int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE; -- cgit v1.2.3 From 79aa10dd9fb0f896ce358314fdba20606c038864 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Fri, 29 Aug 2008 12:50:38 +0100 Subject: x86: adjust dependencies for CONFIG_X86_CMOV Signed-off-by: Jan Beulich Signed-off-by: Ingo Molnar --- arch/x86/Kconfig.cpu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu index f4bdc0492d3..0b7c4a3f065 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -397,7 +397,7 @@ config X86_CMPXCHG64 # generates cmov. config X86_CMOV def_bool y - depends on (MK7 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || X86_64) + depends on (MK8 || MK7 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MCRUSOE || MEFFICEON || X86_64) config X86_MINIMUM_CPU_FAMILY int -- cgit v1.2.3 From c1a2f4b10852ce68e70f7e4c53600c36cc63ea45 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sun, 14 Sep 2008 02:33:12 -0700 Subject: x86: change early_ioremap to use slots instead of nesting so we could remove the requirement that one needs to call early_iounmap() in exactly reverse order of early_ioremap(). 
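Concretely, the slot scheme legalizes out-of-order teardown; a hedged usage sketch follows (the physical addresses are made up for the example):

        /* Usage sketch: with slots, early boot code may unmap in any
         * order; under the old nesting counter the first unmap below
         * would have been a bug. Addresses are illustrative. */
        void *hpet = early_ioremap(0xfed00000, PAGE_SIZE);
        void *apic = early_ioremap(0xfee00000, PAGE_SIZE);

        early_iounmap(hpet, PAGE_SIZE); /* no longer has to be LIFO */
        early_iounmap(apic, PAGE_SIZE);
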
Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/mm/ioremap.c | 77 +++++++++++++++++++++++++++++++++------------ include/asm-x86/fixmap_32.h | 4 +-- include/asm-x86/fixmap_64.h | 4 +-- 3 files changed, 61 insertions(+), 24 deletions(-) diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 8876220d906..e4c43ec71b2 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -616,16 +616,22 @@ static inline void __init early_clear_fixmap(enum fixed_addresses idx) __early_set_fixmap(idx, 0, __pgprot(0)); } - -static int __initdata early_ioremap_nested; - +static void *prev_map[FIX_BTMAPS_SLOTS] __initdata; +static unsigned long prev_size[FIX_BTMAPS_SLOTS] __initdata; static int __init check_early_ioremap_leak(void) { - if (!early_ioremap_nested) + int count = 0; + int i; + + for (i = 0; i < FIX_BTMAPS_SLOTS; i++) + if (prev_map[i]) + count++; + + if (!count) return 0; WARN(1, KERN_WARNING "Debug warning: early ioremap leak of %d areas detected.\n", - early_ioremap_nested); + count); printk(KERN_WARNING "please boot with early_ioremap_debug and report the dmesg.\n"); @@ -636,15 +642,30 @@ late_initcall(check_early_ioremap_leak); static void __init *__early_ioremap(unsigned long phys_addr, unsigned long size, pgprot_t prot) { unsigned long offset, last_addr; - unsigned int nrpages, nesting; + unsigned int nrpages; enum fixed_addresses idx0, idx; + int i, slot; WARN_ON(system_state != SYSTEM_BOOTING); - nesting = early_ioremap_nested; + slot = -1; + for (i = 0; i < FIX_BTMAPS_SLOTS; i++) { + if (!prev_map[i]) { + slot = i; + break; + } + } + + if (slot < 0) { + printk(KERN_INFO "early_iomap(%08lx, %08lx) not found slot\n", + phys_addr, size); + WARN_ON(1); + return NULL; + } + if (early_ioremap_debug) { printk(KERN_INFO "early_ioremap(%08lx, %08lx) [%d] => ", - phys_addr, size, nesting); + phys_addr, size, slot); dump_stack(); } @@ -655,11 +676,7 @@ static void __init *__early_ioremap(unsigned long phys_addr, unsigned long size, return NULL; } - if (nesting >= FIX_BTMAPS_NESTING) { - WARN_ON(1); - return NULL; - } - early_ioremap_nested++; + prev_size[slot] = size; /* * Mappings have to be page-aligned */ @@ -679,7 +696,7 @@ static void __init *__early_ioremap(unsigned long phys_addr, unsigned long size, /* * Ok, go for it.. 
*/ - idx0 = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*nesting; + idx0 = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot; idx = idx0; while (nrpages > 0) { early_set_fixmap(idx, phys_addr, prot); @@ -690,7 +707,8 @@ static void __init *__early_ioremap(unsigned long phys_addr, unsigned long size, if (early_ioremap_debug) printk(KERN_CONT "%08lx + %08lx\n", offset, fix_to_virt(idx0)); - return (void *) (offset + fix_to_virt(idx0)); + prev_map[slot] = (void *) (offset + fix_to_virt(idx0)); + return prev_map[slot]; } /* Remap an IO device */ @@ -711,15 +729,33 @@ void __init early_iounmap(void *addr, unsigned long size) unsigned long offset; unsigned int nrpages; enum fixed_addresses idx; - int nesting; + int i, slot; + + slot = -1; + for (i = 0; i < FIX_BTMAPS_SLOTS; i++) { + if (prev_map[i] == addr) { + slot = i; + break; + } + } - nesting = --early_ioremap_nested; - if (WARN_ON(nesting < 0)) + if (slot < 0) { + printk(KERN_INFO "early_iounmap(%p, %08lx) not found slot\n", + addr, size); + WARN_ON(1); + return; + } + + if (prev_size[slot] != size) { + printk(KERN_INFO "early_iounmap(%p, %08lx) [%d] size not consistent %08lx\n", + addr, size, slot, prev_size[slot]); + WARN_ON(1); return; + } if (early_ioremap_debug) { printk(KERN_INFO "early_iounmap(%p, %08lx) [%d]\n", addr, - size, nesting); + size, slot); dump_stack(); } @@ -731,12 +767,13 @@ void __init early_iounmap(void *addr, unsigned long size) offset = virt_addr & ~PAGE_MASK; nrpages = PAGE_ALIGN(offset + size - 1) >> PAGE_SHIFT; - idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*nesting; + idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot; while (nrpages > 0) { early_clear_fixmap(idx); --idx; --nrpages; } + prev_map[slot] = 0; } void __this_fixmap_does_not_exist(void) diff --git a/include/asm-x86/fixmap_32.h b/include/asm-x86/fixmap_32.h index 784e3e75986..8844002da0e 100644 --- a/include/asm-x86/fixmap_32.h +++ b/include/asm-x86/fixmap_32.h @@ -94,10 +94,10 @@ enum fixed_addresses { * can have a single pgd entry and a single pte table: */ #define NR_FIX_BTMAPS 64 -#define FIX_BTMAPS_NESTING 4 +#define FIX_BTMAPS_SLOTS 4 FIX_BTMAP_END = __end_of_permanent_fixed_addresses + 256 - (__end_of_permanent_fixed_addresses & 255), - FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS*FIX_BTMAPS_NESTING - 1, + FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS*FIX_BTMAPS_SLOTS - 1, FIX_WP_TEST, #ifdef CONFIG_ACPI FIX_ACPI_BEGIN, diff --git a/include/asm-x86/fixmap_64.h b/include/asm-x86/fixmap_64.h index 33df64fd0e1..dab4751d130 100644 --- a/include/asm-x86/fixmap_64.h +++ b/include/asm-x86/fixmap_64.h @@ -65,10 +65,10 @@ enum fixed_addresses { * can have a single pgd entry and a single pte table: */ #define NR_FIX_BTMAPS 64 -#define FIX_BTMAPS_NESTING 4 +#define FIX_BTMAPS_SLOTS 4 FIX_BTMAP_END = __end_of_permanent_fixed_addresses + 256 - (__end_of_permanent_fixed_addresses & 255), - FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS*FIX_BTMAPS_NESTING - 1, + FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS*FIX_BTMAPS_SLOTS - 1, __end_of_fixed_addresses }; -- cgit v1.2.3
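
A closing note on the data structure behind the patch above: early_iounmap() now locates a mapping by its base pointer rather than by call order. A minimal sketch of that lookup, simplified from the diff (error handling omitted):

        /* Simplified from the patch: each live mapping occupies one of
         * FIX_BTMAPS_SLOTS entries; lookup is by recorded base pointer. */
        static void *prev_map[FIX_BTMAPS_SLOTS];
        static unsigned long prev_size[FIX_BTMAPS_SLOTS];

        static int find_slot(void *addr)
        {
                int i;

                for (i = 0; i < FIX_BTMAPS_SLOTS; i++)
                        if (prev_map[i] == addr)
                                return i;
                return -1;      /* not an early_ioremap() mapping */
        }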