diff options
Diffstat (limited to 'arch/i386/kernel')
46 files changed, 3575 insertions, 2077 deletions
diff --git a/arch/i386/kernel/Makefile b/arch/i386/kernel/Makefile index 1e8988e558c..4ae3dcf1d2f 100644 --- a/arch/i386/kernel/Makefile +++ b/arch/i386/kernel/Makefile @@ -18,7 +18,7 @@ obj-$(CONFIG_X86_MSR) += msr.o obj-$(CONFIG_X86_CPUID) += cpuid.o obj-$(CONFIG_MICROCODE) += microcode.o obj-$(CONFIG_APM) += apm.o -obj-$(CONFIG_X86_SMP) += smp.o smpboot.o +obj-$(CONFIG_X86_SMP) += smp.o smpboot.o tsc_sync.o obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o obj-$(CONFIG_X86_MPPARSE) += mpparse.o obj-$(CONFIG_X86_LOCAL_APIC) += apic.o nmi.o @@ -32,7 +32,6 @@ obj-$(CONFIG_KPROBES) += kprobes.o obj-$(CONFIG_MODULES) += module.o obj-y += sysenter.o vsyscall.o obj-$(CONFIG_ACPI_SRAT) += srat.o -obj-$(CONFIG_HPET_TIMER) += time_hpet.o obj-$(CONFIG_EFI) += efi.o efi_stub.o obj-$(CONFIG_DOUBLEFAULT) += doublefault.o obj-$(CONFIG_VM86) += vm86.o @@ -40,8 +39,9 @@ obj-$(CONFIG_EARLY_PRINTK) += early_printk.o obj-$(CONFIG_HPET_TIMER) += hpet.o obj-$(CONFIG_K8_NB) += k8.o -# Make sure this is linked after any other paravirt_ops structs: see head.S +obj-$(CONFIG_VMI) += vmi.o vmitime.o obj-$(CONFIG_PARAVIRT) += paravirt.o +obj-y += pcspeaker.o EXTRA_AFLAGS := -traditional diff --git a/arch/i386/kernel/acpi/boot.c b/arch/i386/kernel/acpi/boot.c index e94aff6888c..fb3e72328a5 100644 --- a/arch/i386/kernel/acpi/boot.c +++ b/arch/i386/kernel/acpi/boot.c @@ -25,6 +25,7 @@ #include <linux/init.h> #include <linux/acpi.h> +#include <linux/acpi_pmtmr.h> #include <linux/efi.h> #include <linux/cpumask.h> #include <linux/module.h> @@ -615,6 +616,7 @@ static int __init acpi_parse_sbf(struct acpi_table_header *table) } #ifdef CONFIG_HPET_TIMER +#include <asm/hpet.h> static int __init acpi_parse_hpet(struct acpi_table_header *table) { @@ -645,24 +647,11 @@ static int __init acpi_parse_hpet(struct acpi_table_header *table) hpet_res->end = (1 * 1024) - 1; } -#ifdef CONFIG_X86_64 - vxtime.hpet_address = hpet_tbl->address.address; - + hpet_address = hpet_tbl->address.address; printk(KERN_INFO PREFIX "HPET id: %#x base: %#lx\n", - hpet_tbl->id, vxtime.hpet_address); - - res_start = vxtime.hpet_address; -#else /* X86 */ - { - extern unsigned long hpet_address; - - hpet_address = hpet_tbl->address.address; - printk(KERN_INFO PREFIX "HPET id: %#x base: %#lx\n", - hpet_tbl->id, hpet_address); + hpet_tbl->id, hpet_address); - res_start = hpet_address; - } -#endif /* X86 */ + res_start = hpet_address; if (hpet_res) { hpet_res->start = res_start; @@ -676,10 +665,6 @@ static int __init acpi_parse_hpet(struct acpi_table_header *table) #define acpi_parse_hpet NULL #endif -#ifdef CONFIG_X86_PM_TIMER -extern u32 pmtmr_ioport; -#endif - static int __init acpi_parse_fadt(struct acpi_table_header *table) { diff --git a/arch/i386/kernel/apic.c b/arch/i386/kernel/apic.c index 776d9be26af..9655c233e6f 100644 --- a/arch/i386/kernel/apic.c +++ b/arch/i386/kernel/apic.c @@ -25,6 +25,8 @@ #include <linux/kernel_stat.h> #include <linux/sysdev.h> #include <linux/cpu.h> +#include <linux/clockchips.h> +#include <linux/acpi_pmtmr.h> #include <linux/module.h> #include <asm/atomic.h> @@ -36,6 +38,7 @@ #include <asm/hpet.h> #include <asm/i8253.h> #include <asm/nmi.h> +#include <asm/idle.h> #include <mach_apic.h> #include <mach_apicdef.h> @@ -44,128 +47,549 @@ #include "io_ports.h" /* - * cpu_mask that denotes the CPUs that needs timer interrupt coming in as - * IPIs in place of local APIC timers + * Sanity check */ -static cpumask_t timer_bcast_ipi; +#if (SPURIOUS_APIC_VECTOR & 0x0F) != 0x0F +# error SPURIOUS_APIC_VECTOR definition error +#endif /* * Knob to control our willingness to enable the local APIC. + * + * -1=force-disable, +1=force-enable */ -static int enable_local_apic __initdata = 0; /* -1=force-disable, +1=force-enable */ - -static inline void lapic_disable(void) -{ - enable_local_apic = -1; - clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability); -} +static int enable_local_apic __initdata = 0; -static inline void lapic_enable(void) -{ - enable_local_apic = 1; -} +/* Local APIC timer verification ok */ +static int local_apic_timer_verify_ok; /* - * Debug level + * Debug level, exported for io_apic.c */ int apic_verbosity; +static unsigned int calibration_result; +static int lapic_next_event(unsigned long delta, + struct clock_event_device *evt); +static void lapic_timer_setup(enum clock_event_mode mode, + struct clock_event_device *evt); +static void lapic_timer_broadcast(cpumask_t mask); static void apic_pm_activate(void); +/* + * The local apic timer can be used for any function which is CPU local. + */ +static struct clock_event_device lapic_clockevent = { + .name = "lapic", + .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT + | CLOCK_EVT_FEAT_C3STOP | CLOCK_EVT_FEAT_DUMMY, + .shift = 32, + .set_mode = lapic_timer_setup, + .set_next_event = lapic_next_event, + .broadcast = lapic_timer_broadcast, + .rating = 100, + .irq = -1, +}; +static DEFINE_PER_CPU(struct clock_event_device, lapic_events); + +/* Local APIC was disabled by the BIOS and enabled by the kernel */ +static int enabled_via_apicbase; + +/* + * Get the LAPIC version + */ +static inline int lapic_get_version(void) +{ + return GET_APIC_VERSION(apic_read(APIC_LVR)); +} + +/* + * Check, if the APIC is integrated or a seperate chip + */ +static inline int lapic_is_integrated(void) +{ + return APIC_INTEGRATED(lapic_get_version()); +} + +/* + * Check, whether this is a modern or a first generation APIC + */ static int modern_apic(void) { - unsigned int lvr, version; /* AMD systems use old APIC versions, so check the CPU */ if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD && - boot_cpu_data.x86 >= 0xf) + boot_cpu_data.x86 >= 0xf) return 1; - lvr = apic_read(APIC_LVR); - version = GET_APIC_VERSION(lvr); - return version >= 0x14; + return lapic_get_version() >= 0x14; } +/** + * enable_NMI_through_LVT0 - enable NMI through local vector table 0 + */ +void enable_NMI_through_LVT0 (void * dummy) +{ + unsigned int v = APIC_DM_NMI; + + /* Level triggered for 82489DX */ + if (!lapic_is_integrated()) + v |= APIC_LVT_LEVEL_TRIGGER; + apic_write_around(APIC_LVT0, v); +} + +/** + * get_physical_broadcast - Get number of physical broadcast IDs + */ +int get_physical_broadcast(void) +{ + return modern_apic() ? 0xff : 0xf; +} + +/** + * lapic_get_maxlvt - get the maximum number of local vector table entries + */ +int lapic_get_maxlvt(void) +{ + unsigned int v = apic_read(APIC_LVR); + + /* 82489DXs do not report # of LVT entries. */ + return APIC_INTEGRATED(GET_APIC_VERSION(v)) ? GET_APIC_MAXLVT(v) : 2; +} + +/* + * Local APIC timer + */ + +/* Clock divisor is set to 16 */ +#define APIC_DIVISOR 16 + /* - * 'what should we do if we get a hw irq event on an illegal vector'. - * each architecture has to answer this themselves. + * This function sets up the local APIC timer, with a timeout of + * 'clocks' APIC bus clock. During calibration we actually call + * this function twice on the boot CPU, once with a bogus timeout + * value, second time for real. The other (noncalibrating) CPUs + * call this function only once, with the real, calibrated value. + * + * We do reads before writes even if unnecessary, to get around the + * P5 APIC double write bug. */ -void ack_bad_irq(unsigned int irq) +static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen) { - printk("unexpected IRQ trap at vector %02x\n", irq); + unsigned int lvtt_value, tmp_value; + + lvtt_value = LOCAL_TIMER_VECTOR; + if (!oneshot) + lvtt_value |= APIC_LVT_TIMER_PERIODIC; + if (!lapic_is_integrated()) + lvtt_value |= SET_APIC_TIMER_BASE(APIC_TIMER_BASE_DIV); + + if (!irqen) + lvtt_value |= APIC_LVT_MASKED; + + apic_write_around(APIC_LVTT, lvtt_value); + /* - * Currently unexpected vectors happen only on SMP and APIC. - * We _must_ ack these because every local APIC has only N - * irq slots per priority level, and a 'hanging, unacked' IRQ - * holds up an irq slot - in excessive cases (when multiple - * unexpected vectors occur) that might lock up the APIC - * completely. - * But only ack when the APIC is enabled -AK + * Divide PICLK by 16 */ - if (cpu_has_apic) - ack_APIC_irq(); + tmp_value = apic_read(APIC_TDCR); + apic_write_around(APIC_TDCR, (tmp_value + & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) + | APIC_TDR_DIV_16); + + if (!oneshot) + apic_write_around(APIC_TMICT, clocks/APIC_DIVISOR); } -void __init apic_intr_init(void) +/* + * Program the next event, relative to now + */ +static int lapic_next_event(unsigned long delta, + struct clock_event_device *evt) +{ + apic_write_around(APIC_TMICT, delta); + return 0; +} + +/* + * Setup the lapic timer in periodic or oneshot mode + */ +static void lapic_timer_setup(enum clock_event_mode mode, + struct clock_event_device *evt) +{ + unsigned long flags; + unsigned int v; + + /* Lapic used for broadcast ? */ + if (!local_apic_timer_verify_ok) + return; + + local_irq_save(flags); + + switch (mode) { + case CLOCK_EVT_MODE_PERIODIC: + case CLOCK_EVT_MODE_ONESHOT: + __setup_APIC_LVTT(calibration_result, + mode != CLOCK_EVT_MODE_PERIODIC, 1); + break; + case CLOCK_EVT_MODE_UNUSED: + case CLOCK_EVT_MODE_SHUTDOWN: + v = apic_read(APIC_LVTT); + v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); + apic_write_around(APIC_LVTT, v); + break; + } + + local_irq_restore(flags); +} + +/* + * Local APIC timer broadcast function + */ +static void lapic_timer_broadcast(cpumask_t mask) { #ifdef CONFIG_SMP - smp_intr_init(); + send_IPI_mask(mask, LOCAL_TIMER_VECTOR); #endif - /* self generated IPI for local APIC timer */ - set_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); +} - /* IPI vectors for APIC spurious and error interrupts */ - set_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); - set_intr_gate(ERROR_APIC_VECTOR, error_interrupt); +/* + * Setup the local APIC timer for this CPU. Copy the initilized values + * of the boot CPU and register the clock event in the framework. + */ +static void __devinit setup_APIC_timer(void) +{ + struct clock_event_device *levt = &__get_cpu_var(lapic_events); - /* thermal monitor LVT interrupt */ -#ifdef CONFIG_X86_MCE_P4THERMAL - set_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); -#endif + memcpy(levt, &lapic_clockevent, sizeof(*levt)); + levt->cpumask = cpumask_of_cpu(smp_processor_id()); + + clockevents_register_device(levt); } -/* Using APIC to generate smp_local_timer_interrupt? */ -int using_apic_timer __read_mostly = 0; +/* + * In this functions we calibrate APIC bus clocks to the external timer. + * + * We want to do the calibration only once since we want to have local timer + * irqs syncron. CPUs connected by the same APIC bus have the very same bus + * frequency. + * + * This was previously done by reading the PIT/HPET and waiting for a wrap + * around to find out, that a tick has elapsed. I have a box, where the PIT + * readout is broken, so it never gets out of the wait loop again. This was + * also reported by others. + * + * Monitoring the jiffies value is inaccurate and the clockevents + * infrastructure allows us to do a simple substitution of the interrupt + * handler. + * + * The calibration routine also uses the pm_timer when possible, as the PIT + * happens to run way too slow (factor 2.3 on my VAIO CoreDuo, which goes + * back to normal later in the boot process). + */ + +#define LAPIC_CAL_LOOPS (HZ/10) -static int enabled_via_apicbase; +static __initdata volatile int lapic_cal_loops = -1; +static __initdata long lapic_cal_t1, lapic_cal_t2; +static __initdata unsigned long long lapic_cal_tsc1, lapic_cal_tsc2; +static __initdata unsigned long lapic_cal_pm1, lapic_cal_pm2; +static __initdata unsigned long lapic_cal_j1, lapic_cal_j2; -void enable_NMI_through_LVT0 (void * dummy) +/* + * Temporary interrupt handler. + */ +static void __init lapic_cal_handler(struct clock_event_device *dev) { - unsigned int v, ver; + unsigned long long tsc = 0; + long tapic = apic_read(APIC_TMCCT); + unsigned long pm = acpi_pm_read_early(); - ver = apic_read(APIC_LVR); - ver = GET_APIC_VERSION(ver); - v = APIC_DM_NMI; /* unmask and set to NMI */ - if (!APIC_INTEGRATED(ver)) /* 82489DX */ - v |= APIC_LVT_LEVEL_TRIGGER; - apic_write_around(APIC_LVT0, v); + if (cpu_has_tsc) + rdtscll(tsc); + + switch (lapic_cal_loops++) { + case 0: + lapic_cal_t1 = tapic; + lapic_cal_tsc1 = tsc; + lapic_cal_pm1 = pm; + lapic_cal_j1 = jiffies; + break; + + case LAPIC_CAL_LOOPS: + lapic_cal_t2 = tapic; + lapic_cal_tsc2 = tsc; + if (pm < lapic_cal_pm1) + pm += ACPI_PM_OVRRUN; + lapic_cal_pm2 = pm; + lapic_cal_j2 = jiffies; + break; + } } -int get_physical_broadcast(void) +/* + * Setup the boot APIC + * + * Calibrate and verify the result. + */ +void __init setup_boot_APIC_clock(void) { - if (modern_apic()) - return 0xff; - else - return 0xf; + struct clock_event_device *levt = &__get_cpu_var(lapic_events); + const long pm_100ms = PMTMR_TICKS_PER_SEC/10; + const long pm_thresh = pm_100ms/100; + void (*real_handler)(struct clock_event_device *dev); + unsigned long deltaj; + long delta, deltapm; + + apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n" + "calibrating APIC timer ...\n"); + + local_irq_disable(); + + /* Replace the global interrupt handler */ + real_handler = global_clock_event->event_handler; + global_clock_event->event_handler = lapic_cal_handler; + + /* + * Setup the APIC counter to 1e9. There is no way the lapic + * can underflow in the 100ms detection time frame + */ + __setup_APIC_LVTT(1000000000, 0, 0); + + /* Let the interrupts run */ + local_irq_enable(); + + while(lapic_cal_loops <= LAPIC_CAL_LOOPS); + + local_irq_disable(); + + /* Restore the real event handler */ + global_clock_event->event_handler = real_handler; + + /* Build delta t1-t2 as apic timer counts down */ + delta = lapic_cal_t1 - lapic_cal_t2; + apic_printk(APIC_VERBOSE, "... lapic delta = %ld\n", delta); + + /* Check, if the PM timer is available */ + deltapm = lapic_cal_pm2 - lapic_cal_pm1; + apic_printk(APIC_VERBOSE, "... PM timer delta = %ld\n", deltapm); + + if (deltapm) { + unsigned long mult; + u64 res; + + mult = clocksource_hz2mult(PMTMR_TICKS_PER_SEC, 22); + + if (deltapm > (pm_100ms - pm_thresh) && + deltapm < (pm_100ms + pm_thresh)) { + apic_printk(APIC_VERBOSE, "... PM timer result ok\n"); + } else { + res = (((u64) deltapm) * mult) >> 22; + do_div(res, 1000000); + printk(KERN_WARNING "APIC calibration not consistent " + "with PM Timer: %ldms instead of 100ms\n", + (long)res); + /* Correct the lapic counter value */ + res = (((u64) delta ) * pm_100ms); + do_div(res, deltapm); + printk(KERN_INFO "APIC delta adjusted to PM-Timer: " + "%lu (%ld)\n", (unsigned long) res, delta); + delta = (long) res; + } + } + + /* Calculate the scaled math multiplication factor */ + lapic_clockevent.mult = div_sc(delta, TICK_NSEC * LAPIC_CAL_LOOPS, 32); + lapic_clockevent.max_delta_ns = + clockevent_delta2ns(0x7FFFFF, &lapic_clockevent); + lapic_clockevent.min_delta_ns = + clockevent_delta2ns(0xF, &lapic_clockevent); + + calibration_result = (delta * APIC_DIVISOR) / LAPIC_CAL_LOOPS; + + apic_printk(APIC_VERBOSE, "..... delta %ld\n", delta); + apic_printk(APIC_VERBOSE, "..... mult: %ld\n", lapic_clockevent.mult); + apic_printk(APIC_VERBOSE, "..... calibration result: %u\n", + calibration_result); + + if (cpu_has_tsc) { + delta = (long)(lapic_cal_tsc2 - lapic_cal_tsc1); + apic_printk(APIC_VERBOSE, "..... CPU clock speed is " + "%ld.%04ld MHz.\n", + (delta / LAPIC_CAL_LOOPS) / (1000000 / HZ), + (delta / LAPIC_CAL_LOOPS) % (1000000 / HZ)); + } + + apic_printk(APIC_VERBOSE, "..... host bus clock speed is " + "%u.%04u MHz.\n", + calibration_result / (1000000 / HZ), + calibration_result % (1000000 / HZ)); + + + apic_printk(APIC_VERBOSE, "... verify APIC timer\n"); + + /* + * Setup the apic timer manually + */ + local_apic_timer_verify_ok = 1; + levt->event_handler = lapic_cal_handler; + lapic_timer_setup(CLOCK_EVT_MODE_PERIODIC, levt); + lapic_cal_loops = -1; + + /* Let the interrupts run */ + local_irq_enable(); + + while(lapic_cal_loops <= LAPIC_CAL_LOOPS); + + local_irq_disable(); + + /* Stop the lapic timer */ + lapic_timer_setup(CLOCK_EVT_MODE_SHUTDOWN, levt); + + local_irq_enable(); + + /* Jiffies delta */ + deltaj = lapic_cal_j2 - lapic_cal_j1; + apic_printk(APIC_VERBOSE, "... jiffies delta = %lu\n", deltaj); + + /* Check, if the PM timer is available */ + deltapm = lapic_cal_pm2 - lapic_cal_pm1; + apic_printk(APIC_VERBOSE, "... PM timer delta = %ld\n", deltapm); + + local_apic_timer_verify_ok = 0; + + if (deltapm) { + if (deltapm > (pm_100ms - pm_thresh) && + deltapm < (pm_100ms + pm_thresh)) { + apic_printk(APIC_VERBOSE, "... PM timer result ok\n"); + /* Check, if the jiffies result is consistent */ + if (deltaj < LAPIC_CAL_LOOPS-2 || + deltaj > LAPIC_CAL_LOOPS+2) { + /* + * Not sure, what we can do about this one. + * When high resultion timers are active + * and the lapic timer does not stop in C3 + * we are fine. Otherwise more trouble might + * be waiting. -- tglx + */ + printk(KERN_WARNING "Global event device %s " + "has wrong frequency " + "(%lu ticks instead of %d)\n", + global_clock_event->name, deltaj, + LAPIC_CAL_LOOPS); + } + local_apic_timer_verify_ok = 1; + } + } else { + /* Check, if the jiffies result is consistent */ + if (deltaj >= LAPIC_CAL_LOOPS-2 && + deltaj <= LAPIC_CAL_LOOPS+2) { + apic_printk(APIC_VERBOSE, "... jiffies result ok\n"); + local_apic_timer_verify_ok = 1; + } + } + + if (!local_apic_timer_verify_ok) { + printk(KERN_WARNING + "APIC timer disabled due to verification failure.\n"); + /* No broadcast on UP ! */ + if (num_possible_cpus() == 1) + return; + } else + lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY; + + /* Setup the lapic or request the broadcast */ + setup_APIC_timer(); +} + +void __devinit setup_secondary_APIC_clock(void) +{ + setup_APIC_timer(); } -int get_maxlvt(void) +/* + * The guts of the apic timer interrupt + */ +static void local_apic_timer_interrupt(void) { - unsigned int v, ver, maxlvt; + int cpu = smp_processor_id(); + struct clock_event_device *evt = &per_cpu(lapic_events, cpu); - v = apic_read(APIC_LVR); - ver = GET_APIC_VERSION(v); - /* 82489DXs do not report # of LVT entries. */ - maxlvt = APIC_INTEGRATED(ver) ? GET_APIC_MAXLVT(v) : 2; - return maxlvt; + /* + * Normally we should not be here till LAPIC has been initialized but + * in some cases like kdump, its possible that there is a pending LAPIC + * timer interrupt from previous kernel's context and is delivered in + * new kernel the moment interrupts are enabled. + * + * Interrupts are enabled early and LAPIC is setup much later, hence + * its possible that when we get here evt->event_handler is NULL. + * Check for event_handler being NULL and discard the interrupt as + * spurious. + */ + if (!evt->event_handler) { + printk(KERN_WARNING + "Spurious LAPIC timer interrupt on cpu %d\n", cpu); + /* Switch it off */ + lapic_timer_setup(CLOCK_EVT_MODE_SHUTDOWN, evt); + return; + } + + per_cpu(irq_stat, cpu).apic_timer_irqs++; + + evt->event_handler(evt); } +/* + * Local APIC timer interrupt. This is the most natural way for doing + * local interrupts, but local timer interrupts can be emulated by + * broadcast interrupts too. [in case the hw doesn't support APIC timers] + * + * [ if a single-CPU system runs an SMP kernel then we call the local + * interrupt as well. Thus we cannot inline the local irq ... ] + */ + +void fastcall smp_apic_timer_interrupt(struct pt_regs *regs) +{ + struct pt_regs *old_regs = set_irq_regs(regs); + + /* + * NOTE! We'd better ACK the irq immediately, + * because timer handling can be slow. + */ + ack_APIC_irq(); + /* + * update_process_times() expects us to have done irq_enter(). + * Besides, if we don't timer interrupts ignore the global + * interrupt lock, which is the WrongThing (tm) to do. + */ + exit_idle(); + irq_enter(); + local_apic_timer_interrupt(); + irq_exit(); + + set_irq_regs(old_regs); +} + +int setup_profiling_timer(unsigned int multiplier) +{ + return -EINVAL; +} + +/* + * Local APIC start and shutdown + */ + +/** + * clear_local_APIC - shutdown the local APIC + * + * This is called, when a CPU is disabled and before rebooting, so the state of + * the local APIC has no dangling leftovers. Also used to cleanout any BIOS + * leftovers during boot. + */ void clear_local_APIC(void) { - int maxlvt; + int maxlvt = lapic_get_maxlvt(); unsigned long v; - maxlvt = get_maxlvt(); - /* * Masking an LVT entry can trigger a local APIC error * if the vector is zero. Mask LVTERR first to prevent this. @@ -189,7 +613,7 @@ void clear_local_APIC(void) apic_write_around(APIC_LVTPC, v | APIC_LVT_MASKED); } -/* lets not touch this if we didn't frob it */ + /* lets not touch this if we didn't frob it */ #ifdef CONFIG_X86_MCE_P4THERMAL if (maxlvt >= 5) { v = apic_read(APIC_LVTTHMR); @@ -211,85 +635,18 @@ void clear_local_APIC(void) if (maxlvt >= 5) apic_write_around(APIC_LVTTHMR, APIC_LVT_MASKED); #endif - v = GET_APIC_VERSION(apic_read(APIC_LVR)); - if (APIC_INTEGRATED(v)) { /* !82489DX */ - if (maxlvt > 3) /* Due to Pentium errata 3AP and 11AP. */ + /* Integrated APIC (!82489DX) ? */ + if (lapic_is_integrated()) { + if (maxlvt > 3) + /* Clear ESR due to Pentium errata 3AP and 11AP */ apic_write(APIC_ESR, 0); apic_read(APIC_ESR); } } -void __init connect_bsp_APIC(void) -{ - if (pic_mode) { - /* - * Do not trust the local APIC being empty at bootup. - */ - clear_local_APIC(); - /* - * PIC mode, enable APIC mode in the IMCR, i.e. - * connect BSP's local APIC to INT and NMI lines. - */ - apic_printk(APIC_VERBOSE, "leaving PIC mode, " - "enabling APIC mode.\n"); - outb(0x70, 0x22); - outb(0x01, 0x23); - } - enable_apic_mode(); -} - -void disconnect_bsp_APIC(int virt_wire_setup) -{ - if (pic_mode) { - /* - * Put the board back into PIC mode (has an effect - * only on certain older boards). Note that APIC - * interrupts, including IPIs, won't work beyond - * this point! The only exception are INIT IPIs. - */ - apic_printk(APIC_VERBOSE, "disabling APIC mode, " - "entering PIC mode.\n"); - outb(0x70, 0x22); - outb(0x00, 0x23); - } - else { - /* Go back to Virtual Wire compatibility mode */ - unsigned long value; - - /* For the spurious interrupt use vector F, and enable it */ - value = apic_read(APIC_SPIV); - value &= ~APIC_VECTOR_MASK; - value |= APIC_SPIV_APIC_ENABLED; - value |= 0xf; - apic_write_around(APIC_SPIV, value); - - if (!virt_wire_setup) { - /* For LVT0 make it edge triggered, active high, external and enabled */ - value = apic_read(APIC_LVT0); - value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING | - APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | - APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED ); - value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; - value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT); - apic_write_around(APIC_LVT0, value); - } - else { - /* Disable LVT0 */ - apic_write_around(APIC_LVT0, APIC_LVT_MASKED); - } - - /* For LVT1 make it edge triggered, active high, nmi and enabled */ - value = apic_read(APIC_LVT1); - value &= ~( - APIC_MODE_MASK | APIC_SEND_PENDING | - APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | - APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED); - value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; - value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI); - apic_write_around(APIC_LVT1, value); - } -} - +/** + * disable_local_APIC - clear and disable the local APIC + */ void disable_local_APIC(void) { unsigned long value; @@ -304,8 +661,13 @@ void disable_local_APIC(void) value &= ~APIC_SPIV_APIC_ENABLED; apic_write_around(APIC_SPIV, value); + /* + * When LAPIC was disabled by the BIOS and enabled by the kernel, + * restore the disabled state. + */ if (enabled_via_apicbase) { unsigned int l, h; + rdmsr(MSR_IA32_APICBASE, l, h); l &= ~MSR_IA32_APICBASE_ENABLE; wrmsr(MSR_IA32_APICBASE, l, h); @@ -313,6 +675,28 @@ void disable_local_APIC(void) } /* + * If Linux enabled the LAPIC against the BIOS default disable it down before + * re-entering the BIOS on shutdown. Otherwise the BIOS may get confused and + * not power-off. Additionally clear all LVT entries before disable_local_APIC + * for the case where Linux didn't enable the LAPIC. + */ +void lapic_shutdown(void) +{ + unsigned long flags; + + if (!cpu_has_apic) + return; + + local_irq_save(flags); + clear_local_APIC(); + + if (enabled_via_apicbase) + disable_local_APIC(); + + local_irq_restore(flags); +} + +/* * This is to verify that we're looking at a real local APIC. * Check these against your board if the CPUs aren't getting * started for no apparent reason. @@ -344,7 +728,7 @@ int __init verify_local_APIC(void) reg1 = GET_APIC_VERSION(reg0); if (reg1 == 0x00 || reg1 == 0xff) return 0; - reg1 = get_maxlvt(); + reg1 = lapic_get_maxlvt(); if (reg1 < 0x02 || reg1 == 0xff) return 0; @@ -367,10 +751,15 @@ int __init verify_local_APIC(void) return 1; } +/** + * sync_Arb_IDs - synchronize APIC bus arbitration IDs + */ void __init sync_Arb_IDs(void) { - /* Unsupported on P4 - see Intel Dev. Manual Vol. 3, Ch. 8.6.1 - And not needed on AMD */ + /* + * Unsupported on P4 - see Intel Dev. Manual Vol. 3, Ch. 8.6.1 And not + * needed on AMD. + */ if (modern_apic()) return; /* @@ -383,14 +772,12 @@ void __init sync_Arb_IDs(void) | APIC_DM_INIT); } -extern void __error_in_apic_c (void); - /* * An initial setup of the virtual wire mode. */ void __init init_bsp_APIC(void) { - unsigned long value, ver; + unsigned long value; /* * Don't do the setup now if we have a SMP BIOS as the @@ -399,9 +786,6 @@ void __init init_bsp_APIC(void) if (smp_found_config || !cpu_has_apic) return; - value = apic_read(APIC_LVR); - ver = GET_APIC_VERSION(value); - /* * Do not trust the local APIC being empty at bootup. */ @@ -413,9 +797,10 @@ void __init init_bsp_APIC(void) value = apic_read(APIC_SPIV); value &= ~APIC_VECTOR_MASK; value |= APIC_SPIV_APIC_ENABLED; - + /* This bit is reserved on P4/Xeon and should be cleared */ - if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && (boot_cpu_data.x86 == 15)) + if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && + (boot_cpu_data.x86 == 15)) value &= ~APIC_SPIV_FOCUS_DISABLED; else value |= APIC_SPIV_FOCUS_DISABLED; @@ -427,14 +812,17 @@ void __init init_bsp_APIC(void) */ apic_write_around(APIC_LVT0, APIC_DM_EXTINT); value = APIC_DM_NMI; - if (!APIC_INTEGRATED(ver)) /* 82489DX */ + if (!lapic_is_integrated()) /* 82489DX */ value |= APIC_LVT_LEVEL_TRIGGER; apic_write_around(APIC_LVT1, value); } +/** + * setup_local_APIC - setup the local APIC + */ void __devinit setup_local_APIC(void) { - unsigned long oldvalue, value, ver, maxlvt; + unsigned long oldvalue, value, maxlvt, integrated; int i, j; /* Pound the ESR really hard over the head with a big hammer - mbligh */ @@ -445,11 +833,7 @@ void __devinit setup_local_APIC(void) apic_write(APIC_ESR, 0); } - value = apic_read(APIC_LVR); - ver = GET_APIC_VERSION(value); - - if ((SPURIOUS_APIC_VECTOR & 0x0f) != 0x0f) - __error_in_apic_c(); + integrated = lapic_is_integrated(); /* * Double-check whether this APIC is really registered. @@ -520,13 +904,10 @@ void __devinit setup_local_APIC(void) * like LRU than MRU (the short-term load is more even across CPUs). * See also the comment in end_level_ioapic_irq(). --macro */ -#if 1 + /* Enable focus processor (bit==0) */ value &= ~APIC_SPIV_FOCUS_DISABLED; -#else - /* Disable focus processor (bit==1) */ - value |= APIC_SPIV_FOCUS_DISABLED; -#endif + /* * Set spurious IRQ vector */ @@ -562,17 +943,18 @@ void __devinit setup_local_APIC(void) value = APIC_DM_NMI; else value = APIC_DM_NMI | APIC_LVT_MASKED; - if (!APIC_INTEGRATED(ver)) /* 82489DX */ + if (!integrated) /* 82489DX */ value |= APIC_LVT_LEVEL_TRIGGER; apic_write_around(APIC_LVT1, value); - if (APIC_INTEGRATED(ver) && !esr_disable) { /* !82489DX */ - maxlvt = get_maxlvt(); + if (integrated && !esr_disable) { /* !82489DX */ + maxlvt = lapic_get_maxlvt(); if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ apic_write(APIC_ESR, 0); oldvalue = apic_read(APIC_ESR); - value = ERROR_APIC_VECTOR; // enables sending errors + /* enables sending errors */ + value = ERROR_APIC_VECTOR; apic_write_around(APIC_LVTERR, value); /* * spec says clear errors after enabling vector. @@ -585,207 +967,30 @@ void __devinit setup_local_APIC(void) "vector: 0x%08lx after: 0x%08lx\n", oldvalue, value); } else { - if (esr_disable) - /* - * Something untraceble is creating bad interrupts on + if (esr_disable) + /* + * Something untraceble is creating bad interrupts on * secondary quads ... for the moment, just leave the * ESR disabled - we can't do anything useful with the * errors anyway - mbligh */ - printk("Leaving ESR disabled.\n"); - else - printk("No ESR for 82489DX.\n"); + printk(KERN_INFO "Leaving ESR disabled.\n"); + else + printk(KERN_INFO "No ESR for 82489DX.\n"); } + /* Disable the local apic timer */ + value = apic_read(APIC_LVTT); + value |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); + apic_write_around(APIC_LVTT, value); + setup_apic_nmi_watchdog(NULL); apic_pm_activate(); } /* - * If Linux enabled the LAPIC against the BIOS default - * disable it down before re-entering the BIOS on shutdown. - * Otherwise the BIOS may get confused and not power-off. - * Additionally clear all LVT entries before disable_local_APIC - * for the case where Linux didn't enable the LAPIC. + * Detect and initialize APIC */ -void lapic_shutdown(void) -{ - unsigned long flags; - - if (!cpu_has_apic) - return; - - local_irq_save(flags); - clear_local_APIC(); - - if (enabled_via_apicbase) - disable_local_APIC(); - - local_irq_restore(flags); -} - -#ifdef CONFIG_PM - -static struct { - int active; - /* r/w apic fields */ - unsigned int apic_id; - unsigned int apic_taskpri; - unsigned int apic_ldr; - unsigned int apic_dfr; - unsigned int apic_spiv; - unsigned int apic_lvtt; - unsigned int apic_lvtpc; - unsigned int apic_lvt0; - unsigned int apic_lvt1; - unsigned int apic_lvterr; - unsigned int apic_tmict; - unsigned int apic_tdcr; - unsigned int apic_thmr; -} apic_pm_state; - -static int lapic_suspend(struct sys_device *dev, pm_message_t state) -{ - unsigned long flags; - int maxlvt; - - if (!apic_pm_state.active) - return 0; - - maxlvt = get_maxlvt(); - - apic_pm_state.apic_id = apic_read(APIC_ID); - apic_pm_state.apic_taskpri = apic_read(APIC_TASKPRI); - apic_pm_state.apic_ldr = apic_read(APIC_LDR); - apic_pm_state.apic_dfr = apic_read(APIC_DFR); - apic_pm_state.apic_spiv = apic_read(APIC_SPIV); - apic_pm_state.apic_lvtt = apic_read(APIC_LVTT); - if (maxlvt >= 4) - apic_pm_state.apic_lvtpc = apic_read(APIC_LVTPC); - apic_pm_state.apic_lvt0 = apic_read(APIC_LVT0); - apic_pm_state.apic_lvt1 = apic_read(APIC_LVT1); - apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR); - apic_pm_state.apic_tmict = apic_read(APIC_TMICT); - apic_pm_state.apic_tdcr = apic_read(APIC_TDCR); -#ifdef CONFIG_X86_MCE_P4THERMAL - if (maxlvt >= 5) - apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR); -#endif - - local_irq_save(flags); - disable_local_APIC(); - local_irq_restore(flags); - return 0; -} - -static int lapic_resume(struct sys_device *dev) -{ - unsigned int l, h; - unsigned long flags; - int maxlvt; - - if (!apic_pm_state.active) - return 0; - - maxlvt = get_maxlvt(); - - local_irq_save(flags); - - /* - * Make sure the APICBASE points to the right address - * - * FIXME! This will be wrong if we ever support suspend on - * SMP! We'll need to do this as part of the CPU restore! - */ - rdmsr(MSR_IA32_APICBASE, l, h); - l &= ~MSR_IA32_APICBASE_BASE; - l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr; - wrmsr(MSR_IA32_APICBASE, l, h); - - apic_write(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED); - apic_write(APIC_ID, apic_pm_state.apic_id); - apic_write(APIC_DFR, apic_pm_state.apic_dfr); - apic_write(APIC_LDR, apic_pm_state.apic_ldr); - apic_write(APIC_TASKPRI, apic_pm_state.apic_taskpri); - apic_write(APIC_SPIV, apic_pm_state.apic_spiv); - apic_write(APIC_LVT0, apic_pm_state.apic_lvt0); - apic_write(APIC_LVT1, apic_pm_state.apic_lvt1); -#ifdef CONFIG_X86_MCE_P4THERMAL - if (maxlvt >= 5) - apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr); -#endif - if (maxlvt >= 4) - apic_write(APIC_LVTPC, apic_pm_state.apic_lvtpc); - apic_write(APIC_LVTT, apic_pm_state.apic_lvtt); - apic_write(APIC_TDCR, apic_pm_state.apic_tdcr); - apic_write(APIC_TMICT, apic_pm_state.apic_tmict); - apic_write(APIC_ESR, 0); - apic_read(APIC_ESR); - apic_write(APIC_LVTERR, apic_pm_state.apic_lvterr); - apic_write(APIC_ESR, 0); - apic_read(APIC_ESR); - local_irq_restore(flags); - return 0; -} - -/* - * This device has no shutdown method - fully functioning local APICs - * are needed on every CPU up until machine_halt/restart/poweroff. - */ - -static struct sysdev_class lapic_sysclass = { - set_kset_name("lapic"), - .resume = lapic_resume, - .suspend = lapic_suspend, -}; - -static struct sys_device device_lapic = { - .id = 0, - .cls = &lapic_sysclass, -}; - -static void __devinit apic_pm_activate(void) -{ - apic_pm_state.active = 1; -} - -static int __init init_lapic_sysfs(void) -{ - int error; - - if (!cpu_has_apic) - return 0; - /* XXX: remove suspend/resume procs if !apic_pm_state.active? */ - - error = sysdev_class_register(&lapic_sysclass); - if (!error) - error = sysdev_register(&device_lapic); - return error; -} -device_initcall(init_lapic_sysfs); - -#else /* CONFIG_PM */ - -static void apic_pm_activate(void) { } - -#endif /* CONFIG_PM */ - -/* - * Detect and enable local APICs on non-SMP boards. - * Original code written by Keir Fraser. - */ - -static int __init apic_set_verbosity(char *str) -{ - if (strcmp("debug", str) == 0) - apic_verbosity = APIC_DEBUG; - else if (strcmp("verbose", str) == 0) - apic_verbosity = APIC_VERBOSE; - return 1; -} - -__setup("apic=", apic_set_verbosity); - static int __init detect_init_APIC (void) { u32 h, l, features; @@ -797,7 +1002,7 @@ static int __init detect_init_APIC (void) switch (boot_cpu_data.x86_vendor) { case X86_VENDOR_AMD: if ((boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model > 1) || - (boot_cpu_data.x86 == 15)) + (boot_cpu_data.x86 == 15)) break; goto no_apic; case X86_VENDOR_INTEL: @@ -811,23 +1016,23 @@ static int __init detect_init_APIC (void) if (!cpu_has_apic) { /* - * Over-ride BIOS and try to enable the local - * APIC only if "lapic" specified. + * Over-ride BIOS and try to enable the local APIC only if + * "lapic" specified. */ if (enable_local_apic <= 0) { - printk("Local APIC disabled by BIOS -- " + printk(KERN_INFO "Local APIC disabled by BIOS -- " "you can enable it with \"lapic\"\n"); return -1; } /* - * Some BIOSes disable the local APIC in the - * APIC_BASE MSR. This can only be done in - * software for Intel P6 or later and AMD K7 - * (Model > 1) or later. + * Some BIOSes disable the local APIC in the APIC_BASE + * MSR. This can only be done in software for Intel P6 or later + * and AMD K7 (Model > 1) or later. */ rdmsr(MSR_IA32_APICBASE, l, h); if (!(l & MSR_IA32_APICBASE_ENABLE)) { - printk("Local APIC disabled by BIOS -- reenabling.\n"); + printk(KERN_INFO + "Local APIC disabled by BIOS -- reenabling.\n"); l &= ~MSR_IA32_APICBASE_BASE; l |= MSR_IA32_APICBASE_ENABLE | APIC_DEFAULT_PHYS_BASE; wrmsr(MSR_IA32_APICBASE, l, h); @@ -840,7 +1045,7 @@ static int __init detect_init_APIC (void) */ features = cpuid_edx(1); if (!(features & (1 << X86_FEATURE_APIC))) { - printk("Could not enable APIC!\n"); + printk(KERN_WARNING "Could not enable APIC!\n"); return -1; } set_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability); @@ -854,17 +1059,20 @@ static int __init detect_init_APIC (void) if (nmi_watchdog != NMI_NONE) nmi_watchdog = NMI_LOCAL_APIC; - printk("Found and enabled local APIC!\n"); + printk(KERN_INFO "Found and enabled local APIC!\n"); apic_pm_activate(); return 0; no_apic: - printk("No local APIC present or hardware disabled\n"); + printk(KERN_INFO "No local APIC present or hardware disabled\n"); return -1; } +/** + * init_apic_mappings - initialize APIC mappings + */ void __init init_apic_mappings(void) { unsigned long apic_phys; @@ -924,387 +1132,96 @@ fake_ioapic_page: } /* - * This part sets up the APIC 32 bit clock in LVTT1, with HZ interrupts - * per second. We assume that the caller has already set up the local - * APIC. - * - * The APIC timer is not exactly sync with the external timer chip, it - * closely follows bus clocks. - */ - -/* - * The timer chip is already set up at HZ interrupts per second here, - * but we do not accept timer interrupts yet. We only allow the BP - * to calibrate. - */ -static unsigned int __devinit get_8254_timer_count(void) -{ - unsigned long flags; - - unsigned int count; - - spin_lock_irqsave(&i8253_lock, flags); - - outb_p(0x00, PIT_MODE); - count = inb_p(PIT_CH0); - count |= inb_p(PIT_CH0) << 8; - - spin_unlock_irqrestore(&i8253_lock, flags); - - return count; -} - -/* next tick in 8254 can be caught by catching timer wraparound */ -static void __devinit wait_8254_wraparound(void) -{ - unsigned int curr_count, prev_count; - - curr_count = get_8254_timer_count(); - do { - prev_count = curr_count; - curr_count = get_8254_timer_count(); - - /* workaround for broken Mercury/Neptune */ - if (prev_count >= curr_count + 0x100) - curr_count = get_8254_timer_count(); - - } while (prev_count >= curr_count); -} - -/* - * Default initialization for 8254 timers. If we use other timers like HPET, - * we override this later - */ -void (*wait_timer_tick)(void) __devinitdata = wait_8254_wraparound; - -/* - * This function sets up the local APIC timer, with a timeout of - * 'clocks' APIC bus clock. During calibration we actually call - * this function twice on the boot CPU, once with a bogus timeout - * value, second time for real. The other (noncalibrating) CPUs - * call this function only once, with the real, calibrated value. - * - * We do reads before writes even if unnecessary, to get around the - * P5 APIC double write bug. + * This initializes the IO-APIC and APIC hardware if this is + * a UP kernel. */ - -#define APIC_DIVISOR 16 - -static void __setup_APIC_LVTT(unsigned int clocks) +int __init APIC_init_uniprocessor (void) { - unsigned int lvtt_value, tmp_value, ver; - int cpu = smp_processor_id(); - - ver = GET_APIC_VERSION(apic_read(APIC_LVR)); - lvtt_value = APIC_LVT_TIMER_PERIODIC | LOCAL_TIMER_VECTOR; - if (!APIC_INTEGRATED(ver)) - lvtt_value |= SET_APIC_TIMER_BASE(APIC_TIMER_BASE_DIV); - - if (cpu_isset(cpu, timer_bcast_ipi)) - lvtt_value |= APIC_LVT_MASKED; + if (enable_local_apic < 0) + clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability); - apic_write_around(APIC_LVTT, lvtt_value); + if (!smp_found_config && !cpu_has_apic) + return -1; /* - * Divide PICLK by 16 + * Complain if the BIOS pretends there is one. */ - tmp_value = apic_read(APIC_TDCR); - apic_write_around(APIC_TDCR, (tmp_value - & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) - | APIC_TDR_DIV_16); - - apic_write_around(APIC_TMICT, clocks/APIC_DIVISOR); -} + if (!cpu_has_apic && + APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) { + printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n", + boot_cpu_physical_apicid); + clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability); + return -1; + } -static void __devinit setup_APIC_timer(unsigned int clocks) -{ - unsigned long flags; + verify_local_APIC(); - local_irq_save(flags); + connect_bsp_APIC(); /* - * Wait for IRQ0's slice: + * Hack: In case of kdump, after a crash, kernel might be booting + * on a cpu with non-zero lapic id. But boot_cpu_physical_apicid + * might be zero if read from MP tables. Get it from LAPIC. */ - wait_timer_tick(); +#ifdef CONFIG_CRASH_DUMP + boot_cpu_physical_apicid = GET_APIC_ID(apic_read(APIC_ID)); +#endif + phys_cpu_present_map = physid_mask_of_physid(boot_cpu_physical_apicid); - __setup_APIC_LVTT(clocks); + setup_local_APIC(); - local_irq_restore(flags); +#ifdef CONFIG_X86_IO_APIC + if (smp_found_config) + if (!skip_ioapic_setup && nr_ioapics) + setup_IO_APIC(); +#endif + setup_boot_clock(); + + return 0; } /* - * In this function we calibrate APIC bus clocks to the external - * timer. Unfortunately we cannot use jiffies and the timer irq - * to calibrate, since some later bootup code depends on getting - * the first irq? Ugh. - * - * We want to do the calibration only once since we - * want to have local timer irqs syncron. CPUs connected - * by the same APIC bus have the very same bus frequency. - * And we want to have irqs off anyways, no accidental - * APIC irq that way. + * APIC command line parameters */ - -static int __init calibrate_APIC_clock(void) -{ - unsigned long long t1 = 0, t2 = 0; - long tt1, tt2; - long result; - int i; - const int LOOPS = HZ/10; - - apic_printk(APIC_VERBOSE, "calibrating APIC timer ...\n"); - - /* - * Put whatever arbitrary (but long enough) timeout - * value into the APIC clock, we just want to get the - * counter running for calibration. - */ - __setup_APIC_LVTT(1000000000); - - /* - * The timer chip counts down to zero. Let's wait - * for a wraparound to start exact measurement: - * (the current tick might have been already half done) - */ - - wait_timer_tick(); - - /* - * We wrapped around just now. Let's start: - */ - if (cpu_has_tsc) - rdtscll(t1); - tt1 = apic_read(APIC_TMCCT); - - /* - * Let's wait LOOPS wraprounds: - */ - for (i = 0; i < LOOPS; i++) - wait_timer_tick(); - - tt2 = apic_read(APIC_TMCCT); - if (cpu_has_tsc) - rdtscll(t2); - - /* - * The APIC bus clock counter is 32 bits only, it - * might have overflown, but note that we use signed - * longs, thus no extra care needed. - * - * underflown to be exact, as the timer counts down ;) - */ - - result = (tt1-tt2)*APIC_DIVISOR/LOOPS; - - if (cpu_has_tsc) - apic_printk(APIC_VERBOSE, "..... CPU clock speed is " - "%ld.%04ld MHz.\n", - ((long)(t2-t1)/LOOPS)/(1000000/HZ), - ((long)(t2-t1)/LOOPS)%(1000000/HZ)); - - apic_printk(APIC_VERBOSE, "..... host bus clock speed is " - "%ld.%04ld MHz.\n", - result/(1000000/HZ), - result%(1000000/HZ)); - - return result; -} - -static unsigned int calibration_result; - -void __init setup_boot_APIC_clock(void) -{ - unsigned long flags; - apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n"); - using_apic_timer = 1; - - local_irq_save(flags); - - calibration_result = calibrate_APIC_clock(); - /* - * Now set up the timer for real. - */ - setup_APIC_timer(calibration_result); - - local_irq_restore(flags); -} - -void __devinit setup_secondary_APIC_clock(void) -{ - setup_APIC_timer(calibration_result); -} - -void disable_APIC_timer(void) -{ - if (using_apic_timer) { - unsigned long v; - - v = apic_read(APIC_LVTT); - /* - * When an illegal vector value (0-15) is written to an LVT - * entry and delivery mode is Fixed, the APIC may signal an - * illegal vector error, with out regard to whether the mask - * bit is set or whether an interrupt is actually seen on input. - * - * Boot sequence might call this function when the LVTT has - * '0' vector value. So make sure vector field is set to - * valid value. - */ - v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); - apic_write_around(APIC_LVTT, v); - } -} - -void enable_APIC_timer(void) +static int __init parse_lapic(char *arg) { - int cpu = smp_processor_id(); - - if (using_apic_timer && - !cpu_isset(cpu, timer_bcast_ipi)) { - unsigned long v; - - v = apic_read(APIC_LVTT); - apic_write_around(APIC_LVTT, v & ~APIC_LVT_MASKED); - } + enable_local_apic = 1; + return 0; } +early_param("lapic", parse_lapic); -void switch_APIC_timer_to_ipi(void *cpumask) +static int __init parse_nolapic(char *arg) { - cpumask_t mask = *(cpumask_t *)cpumask; - int cpu = smp_processor_id(); - - if (cpu_isset(cpu, mask) && - !cpu_isset(cpu, timer_bcast_ipi)) { - disable_APIC_timer(); - cpu_set(cpu, timer_bcast_ipi); - } + enable_local_apic = -1; + clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability); + return 0; } -EXPORT_SYMBOL(switch_APIC_timer_to_ipi); +early_param("nolapic", parse_nolapic); -void switch_ipi_to_APIC_timer(void *cpumask) +static int __init apic_set_verbosity(char *str) { - cpumask_t mask = *(cpumask_t *)cpumask; - int cpu = smp_processor_id(); - - if (cpu_isset(cpu, mask) && - cpu_isset(cpu, timer_bcast_ipi)) { - cpu_clear(cpu, timer_bcast_ipi); - enable_APIC_timer(); - } + if (strcmp("debug", str) == 0) + apic_verbosity = APIC_DEBUG; + else if (strcmp("verbose", str) == 0) + apic_verbosity = APIC_VERBOSE; + return 1; } -EXPORT_SYMBOL(switch_ipi_to_APIC_timer); - -#undef APIC_DIVISOR -/* - * Local timer interrupt handler. It does both profiling and - * process statistics/rescheduling. - * - * We do profiling in every local tick, statistics/rescheduling - * happen only every 'profiling multiplier' ticks. The default - * multiplier is 1 and it can be changed by writing the new multiplier - * value into /proc/profile. - */ - -inline void smp_local_timer_interrupt(void) -{ - profile_tick(CPU_PROFILING); -#ifdef CONFIG_SMP - update_process_times(user_mode_vm(get_irq_regs())); -#endif +__setup("apic=", apic_set_verbosity); - /* - * We take the 'long' return path, and there every subsystem - * grabs the apropriate locks (kernel lock/ irq lock). - * - * we might want to decouple profiling from the 'long path', - * and do the profiling totally in assembly. - * - * Currently this isn't too much of an issue (performance wise), - * we can take more than 100K local irqs per second on a 100 MHz P5. - */ -} /* - * Local APIC timer interrupt. This is the most natural way for doing - * local interrupts, but local timer interrupts can be emulated by - * broadcast interrupts too. [in case the hw doesn't support APIC timers] - * - * [ if a single-CPU system runs an SMP kernel then we call the local - * interrupt as well. Thus we cannot inline the local irq ... ] + * Local APIC interrupts */ -fastcall void smp_apic_timer_interrupt(struct pt_regs *regs) -{ - struct pt_regs *old_regs = set_irq_regs(regs); - int cpu = smp_processor_id(); - - /* - * the NMI deadlock-detector uses this. - */ - per_cpu(irq_stat, cpu).apic_timer_irqs++; - - /* - * NOTE! We'd better ACK the irq immediately, - * because timer handling can be slow. - */ - ack_APIC_irq(); - /* - * update_process_times() expects us to have done irq_enter(). - * Besides, if we don't timer interrupts ignore the global - * interrupt lock, which is the WrongThing (tm) to do. - */ - irq_enter(); - smp_local_timer_interrupt(); - irq_exit(); - set_irq_regs(old_regs); -} - -#ifndef CONFIG_SMP -static void up_apic_timer_interrupt_call(void) -{ - int cpu = smp_processor_id(); - - /* - * the NMI deadlock-detector uses this. - */ - per_cpu(irq_stat, cpu).apic_timer_irqs++; - - smp_local_timer_interrupt(); -} -#endif - -void smp_send_timer_broadcast_ipi(void) -{ - cpumask_t mask; - - cpus_and(mask, cpu_online_map, timer_bcast_ipi); - if (!cpus_empty(mask)) { -#ifdef CONFIG_SMP - send_IPI_mask(mask, LOCAL_TIMER_VECTOR); -#else - /* - * We can directly call the apic timer interrupt handler - * in UP case. Minus all irq related functions - */ - up_apic_timer_interrupt_call(); -#endif - } -} - -int setup_profiling_timer(unsigned int multiplier) -{ - return -EINVAL; -} - /* * This interrupt should _never_ happen with our APIC/SMP architecture */ -fastcall void smp_spurious_interrupt(struct pt_regs *regs) +void smp_spurious_interrupt(struct pt_regs *regs) { unsigned long v; + exit_idle(); irq_enter(); /* * Check if this really is a spurious interrupt and ACK it @@ -1316,19 +1233,19 @@ fastcall void smp_spurious_interrupt(struct pt_regs *regs) ack_APIC_irq(); /* see sw-dev-man vol 3, chapter 7.4.13.5 */ - printk(KERN_INFO "spurious APIC interrupt on CPU#%d, should never happen.\n", - smp_processor_id()); + printk(KERN_INFO "spurious APIC interrupt on CPU#%d, " + "should never happen.\n", smp_processor_id()); irq_exit(); } /* * This interrupt should never happen with our APIC/SMP architecture */ - -fastcall void smp_error_interrupt(struct pt_regs *regs) +void smp_error_interrupt(struct pt_regs *regs) { unsigned long v, v1; + exit_idle(); irq_enter(); /* First tickle the hardware, only then report what went on. -- REW */ v = apic_read(APIC_ESR); @@ -1348,69 +1265,261 @@ fastcall void smp_error_interrupt(struct pt_regs *regs) 7: Illegal register address */ printk (KERN_DEBUG "APIC error on CPU%d: %02lx(%02lx)\n", - smp_processor_id(), v , v1); + smp_processor_id(), v , v1); irq_exit(); } /* - * This initializes the IO-APIC and APIC hardware if this is - * a UP kernel. + * Initialize APIC interrupts */ -int __init APIC_init_uniprocessor (void) +void __init apic_intr_init(void) { - if (enable_local_apic < 0) - clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability); +#ifdef CONFIG_SMP + smp_intr_init(); +#endif + /* self generated IPI for local APIC timer */ + set_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); - if (!smp_found_config && !cpu_has_apic) - return -1; + /* IPI vectors for APIC spurious and error interrupts */ + set_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); + set_intr_gate(ERROR_APIC_VECTOR, error_interrupt); - /* - * Complain if the BIOS pretends there is one. - */ - if (!cpu_has_apic && APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) { - printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n", - boot_cpu_physical_apicid); - clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability); - return -1; + /* thermal monitor LVT interrupt */ +#ifdef CONFIG_X86_MCE_P4THERMAL + set_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); +#endif +} + +/** + * connect_bsp_APIC - attach the APIC to the interrupt system + */ +void __init connect_bsp_APIC(void) +{ + if (pic_mode) { + /* + * Do not trust the local APIC being empty at bootup. + */ + clear_local_APIC(); + /* + * PIC mode, enable APIC mode in the IMCR, i.e. connect BSP's + * local APIC to INT and NMI lines. + */ + apic_printk(APIC_VERBOSE, "leaving PIC mode, " + "enabling APIC mode.\n"); + outb(0x70, 0x22); + outb(0x01, 0x23); } + enable_apic_mode(); +} - verify_local_APIC(); +/** + * disconnect_bsp_APIC - detach the APIC from the interrupt system + * @virt_wire_setup: indicates, whether virtual wire mode is selected + * + * Virtual wire mode is necessary to deliver legacy interrupts even when the + * APIC is disabled. + */ +void disconnect_bsp_APIC(int virt_wire_setup) +{ + if (pic_mode) { + /* + * Put the board back into PIC mode (has an effect only on + * certain older boards). Note that APIC interrupts, including + * IPIs, won't work beyond this point! The only exception are + * INIT IPIs. + */ + apic_printk(APIC_VERBOSE, "disabling APIC mode, " + "entering PIC mode.\n"); + outb(0x70, 0x22); + outb(0x00, 0x23); + } else { + /* Go back to Virtual Wire compatibility mode */ + unsigned long value; - connect_bsp_APIC(); + /* For the spurious interrupt use vector F, and enable it */ + value = apic_read(APIC_SPIV); + value &= ~APIC_VECTOR_MASK; + value |= APIC_SPIV_APIC_ENABLED; + value |= 0xf; + apic_write_around(APIC_SPIV, value); - /* - * Hack: In case of kdump, after a crash, kernel might be booting - * on a cpu with non-zero lapic id. But boot_cpu_physical_apicid - * might be zero if read from MP tables. Get it from LAPIC. - */ -#ifdef CONFIG_CRASH_DUMP - boot_cpu_physical_apicid = GET_APIC_ID(apic_read(APIC_ID)); -#endif - phys_cpu_present_map = physid_mask_of_physid(boot_cpu_physical_apicid); + if (!virt_wire_setup) { + /* + * For LVT0 make it edge triggered, active high, + * external and enabled + */ + value = apic_read(APIC_LVT0); + value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING | + APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | + APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED ); + value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; + value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT); + apic_write_around(APIC_LVT0, value); + } else { + /* Disable LVT0 */ + apic_write_around(APIC_LVT0, APIC_LVT_MASKED); + } - setup_local_APIC(); + /* + * For LVT1 make it edge triggered, active high, nmi and + * enabled + */ + value = apic_read(APIC_LVT1); + value &= ~( + APIC_MODE_MASK | APIC_SEND_PENDING | + APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | + APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED); + value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; + value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI); + apic_write_around(APIC_LVT1, value); + } +} -#ifdef CONFIG_X86_IO_APIC - if (smp_found_config) - if (!skip_ioapic_setup && nr_ioapics) - setup_IO_APIC(); +/* + * Power management + */ +#ifdef CONFIG_PM + +static struct { + int active; + /* r/w apic fields */ + unsigned int apic_id; + unsigned int apic_taskpri; + unsigned int apic_ldr; + unsigned int apic_dfr; + unsigned int apic_spiv; + unsigned int apic_lvtt; + unsigned int apic_lvtpc; + unsigned int apic_lvt0; + unsigned int apic_lvt1; + unsigned int apic_lvterr; + unsigned int apic_tmict; + unsigned int apic_tdcr; + unsigned int apic_thmr; +} apic_pm_state; + +static int lapic_suspend(struct sys_device *dev, pm_message_t state) +{ + unsigned long flags; + int maxlvt; + + if (!apic_pm_state.active) + return 0; + + maxlvt = lapic_get_maxlvt(); + + apic_pm_state.apic_id = apic_read(APIC_ID); + apic_pm_state.apic_taskpri = apic_read(APIC_TASKPRI); + apic_pm_state.apic_ldr = apic_read(APIC_LDR); + apic_pm_state.apic_dfr = apic_read(APIC_DFR); + apic_pm_state.apic_spiv = apic_read(APIC_SPIV); + apic_pm_state.apic_lvtt = apic_read(APIC_LVTT); + if (maxlvt >= 4) + apic_pm_state.apic_lvtpc = apic_read(APIC_LVTPC); + apic_pm_state.apic_lvt0 = apic_read(APIC_LVT0); + apic_pm_state.apic_lvt1 = apic_read(APIC_LVT1); + apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR); + apic_pm_state.apic_tmict = apic_read(APIC_TMICT); + apic_pm_state.apic_tdcr = apic_read(APIC_TDCR); +#ifdef CONFIG_X86_MCE_P4THERMAL + if (maxlvt >= 5) + apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR); #endif - setup_boot_APIC_clock(); + local_irq_save(flags); + disable_local_APIC(); + local_irq_restore(flags); return 0; } -static int __init parse_lapic(char *arg) +static int lapic_resume(struct sys_device *dev) { - lapic_enable(); + unsigned int l, h; + unsigned long flags; + int maxlvt; + + if (!apic_pm_state.active) + return 0; + + maxlvt = lapic_get_maxlvt(); + + local_irq_save(flags); + + /* + * Make sure the APICBASE points to the right address + * + * FIXME! This will be wrong if we ever support suspend on + * SMP! We'll need to do this as part of the CPU restore! + */ + rdmsr(MSR_IA32_APICBASE, l, h); + l &= ~MSR_IA32_APICBASE_BASE; + l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr; + wrmsr(MSR_IA32_APICBASE, l, h); + + apic_write(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED); + apic_write(APIC_ID, apic_pm_state.apic_id); + apic_write(APIC_DFR, apic_pm_state.apic_dfr); + apic_write(APIC_LDR, apic_pm_state.apic_ldr); + apic_write(APIC_TASKPRI, apic_pm_state.apic_taskpri); + apic_write(APIC_SPIV, apic_pm_state.apic_spiv); + apic_write(APIC_LVT0, apic_pm_state.apic_lvt0); + apic_write(APIC_LVT1, apic_pm_state.apic_lvt1); +#ifdef CONFIG_X86_MCE_P4THERMAL + if (maxlvt >= 5) + apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr); +#endif + if (maxlvt >= 4) + apic_write(APIC_LVTPC, apic_pm_state.apic_lvtpc); + apic_write(APIC_LVTT, apic_pm_state.apic_lvtt); + apic_write(APIC_TDCR, apic_pm_state.apic_tdcr); + apic_write(APIC_TMICT, apic_pm_state.apic_tmict); + apic_write(APIC_ESR, 0); + apic_read(APIC_ESR); + apic_write(APIC_LVTERR, apic_pm_state.apic_lvterr); + apic_write(APIC_ESR, 0); + apic_read(APIC_ESR); + local_irq_restore(flags); return 0; } -early_param("lapic", parse_lapic); -static int __init parse_nolapic(char *arg) +/* + * This device has no shutdown method - fully functioning local APICs + * are needed on every CPU up until machine_halt/restart/poweroff. + */ + +static struct sysdev_class lapic_sysclass = { + set_kset_name("lapic"), + .resume = lapic_resume, + .suspend = lapic_suspend, +}; + +static struct sys_device device_lapic = { + .id = 0, + .cls = &lapic_sysclass, +}; + +static void __devinit apic_pm_activate(void) { - lapic_disable(); - return 0; + apic_pm_state.active = 1; } -early_param("nolapic", parse_nolapic); +static int __init init_lapic_sysfs(void) +{ + int error; + + if (!cpu_has_apic) + return 0; + /* XXX: remove suspend/resume procs if !apic_pm_state.active? */ + + error = sysdev_class_register(&lapic_sysclass); + if (!error) + error = sysdev_register(&device_lapic); + return error; +} +device_initcall(init_lapic_sysfs); + +#else /* CONFIG_PM */ + +static void apic_pm_activate(void) { } + +#endif /* CONFIG_PM */ diff --git a/arch/i386/kernel/apm.c b/arch/i386/kernel/apm.c index 19901692754..064bbf2861f 100644 --- a/arch/i386/kernel/apm.c +++ b/arch/i386/kernel/apm.c @@ -211,6 +211,7 @@ #include <linux/slab.h> #include <linux/stat.h> #include <linux/proc_fs.h> +#include <linux/seq_file.h> #include <linux/miscdevice.h> #include <linux/apm_bios.h> #include <linux/init.h> @@ -235,7 +236,6 @@ #include "io_ports.h" -extern unsigned long get_cmos_time(void); extern void machine_real_restart(unsigned char *, int); #if defined(CONFIG_APM_DISPLAY_BLANK) && defined(CONFIG_VT) @@ -1175,28 +1175,6 @@ out: spin_unlock(&user_list_lock); } -static void set_time(void) -{ - struct timespec ts; - if (got_clock_diff) { /* Must know time zone in order to set clock */ - ts.tv_sec = get_cmos_time() + clock_cmos_diff; - ts.tv_nsec = 0; - do_settimeofday(&ts); - } -} - -static void get_time_diff(void) -{ -#ifndef CONFIG_APM_RTC_IS_GMT - /* - * Estimate time zone so that set_time can update the clock - */ - clock_cmos_diff = -get_cmos_time(); - clock_cmos_diff += get_seconds(); - got_clock_diff = 1; -#endif -} - static void reinit_timer(void) { #ifdef INIT_TIMER_AFTER_SUSPEND @@ -1236,19 +1214,6 @@ static int suspend(int vetoable) local_irq_disable(); device_power_down(PMSG_SUSPEND); - /* serialize with the timer interrupt */ - write_seqlock(&xtime_lock); - - /* protect against access to timer chip registers */ - spin_lock(&i8253_lock); - - get_time_diff(); - /* - * Irq spinlock must be dropped around set_system_power_state. - * We'll undo any timer changes due to interrupts below. - */ - spin_unlock(&i8253_lock); - write_sequnlock(&xtime_lock); local_irq_enable(); save_processor_state(); @@ -1257,7 +1222,6 @@ static int suspend(int vetoable) restore_processor_state(); local_irq_disable(); - set_time(); reinit_timer(); if (err == APM_NO_ERROR) @@ -1287,11 +1251,6 @@ static void standby(void) local_irq_disable(); device_power_down(PMSG_SUSPEND); - /* serialize with the timer interrupt */ - write_seqlock(&xtime_lock); - /* If needed, notify drivers here */ - get_time_diff(); - write_sequnlock(&xtime_lock); local_irq_enable(); err = set_system_power_state(APM_STATE_STANDBY); @@ -1385,7 +1344,6 @@ static void check_events(void) ignore_bounce = 1; if ((event != APM_NORMAL_RESUME) || (ignore_normal_resume == 0)) { - set_time(); device_resume(); pm_send_all(PM_RESUME, (void *)0); queue_event(event, NULL); @@ -1401,7 +1359,6 @@ static void check_events(void) break; case APM_UPDATE_TIME: - set_time(); break; case APM_CRITICAL_SUSPEND: @@ -1636,9 +1593,8 @@ static int do_open(struct inode * inode, struct file * filp) return 0; } -static int apm_get_info(char *buf, char **start, off_t fpos, int length) +static int proc_apm_show(struct seq_file *m, void *v) { - char * p; unsigned short bx; unsigned short cx; unsigned short dx; @@ -1650,8 +1606,6 @@ static int apm_get_info(char *buf, char **start, off_t fpos, int length) int time_units = -1; char *units = "?"; - p = buf; - if ((num_online_cpus() == 1) && !(error = apm_get_power_status(&bx, &cx, &dx))) { ac_line_status = (bx >> 8) & 0xff; @@ -1705,7 +1659,7 @@ static int apm_get_info(char *buf, char **start, off_t fpos, int length) -1: Unknown 8) min = minutes; sec = seconds */ - p += sprintf(p, "%s %d.%d 0x%02x 0x%02x 0x%02x 0x%02x %d%% %d %s\n", + seq_printf(m, "%s %d.%d 0x%02x 0x%02x 0x%02x 0x%02x %d%% %d %s\n", driver_version, (apm_info.bios.version >> 8) & 0xff, apm_info.bios.version & 0xff, @@ -1716,10 +1670,22 @@ static int apm_get_info(char *buf, char **start, off_t fpos, int length) percentage, time_units, units); + return 0; +} - return p - buf; +static int proc_apm_open(struct inode *inode, struct file *file) +{ + return single_open(file, proc_apm_show, NULL); } +static const struct file_operations apm_file_ops = { + .owner = THIS_MODULE, + .open = proc_apm_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + static int apm(void *unused) { unsigned short bx; @@ -1894,7 +1860,7 @@ static int __init apm_setup(char *str) __setup("apm=", apm_setup); #endif -static struct file_operations apm_bios_fops = { +static const struct file_operations apm_bios_fops = { .owner = THIS_MODULE, .read = do_read, .poll = do_poll, @@ -2341,9 +2307,9 @@ static int __init apm_init(void) set_base(gdt[APM_DS >> 3], __va((unsigned long)apm_info.bios.dseg << 4)); - apm_proc = create_proc_info_entry("apm", 0, NULL, apm_get_info); + apm_proc = create_proc_entry("apm", 0, NULL); if (apm_proc) - apm_proc->owner = THIS_MODULE; + apm_proc->proc_fops = &apm_file_ops; kapmd_task = kthread_create(apm, NULL, "kapmd"); if (IS_ERR(kapmd_task)) { diff --git a/arch/i386/kernel/asm-offsets.c b/arch/i386/kernel/asm-offsets.c index 1b2f3cd3327..c37535163bf 100644 --- a/arch/i386/kernel/asm-offsets.c +++ b/arch/i386/kernel/asm-offsets.c @@ -72,7 +72,7 @@ void foo(void) OFFSET(PT_EAX, pt_regs, eax); OFFSET(PT_DS, pt_regs, xds); OFFSET(PT_ES, pt_regs, xes); - OFFSET(PT_GS, pt_regs, xgs); + OFFSET(PT_FS, pt_regs, xfs); OFFSET(PT_ORIG_EAX, pt_regs, orig_eax); OFFSET(PT_EIP, pt_regs, eip); OFFSET(PT_CS, pt_regs, xcs); diff --git a/arch/i386/kernel/cpu/common.c b/arch/i386/kernel/cpu/common.c index 8a8bbdaaf38..dcbbd0a8bfc 100644 --- a/arch/i386/kernel/cpu/common.c +++ b/arch/i386/kernel/cpu/common.c @@ -605,7 +605,7 @@ void __init early_cpu_init(void) struct pt_regs * __devinit idle_regs(struct pt_regs *regs) { memset(regs, 0, sizeof(struct pt_regs)); - regs->xgs = __KERNEL_PDA; + regs->xfs = __KERNEL_PDA; return regs; } @@ -662,12 +662,12 @@ struct i386_pda boot_pda = { .pcurrent = &init_task, }; -static inline void set_kernel_gs(void) +static inline void set_kernel_fs(void) { - /* Set %gs for this CPU's PDA. Memory clobber is to create a + /* Set %fs for this CPU's PDA. Memory clobber is to create a barrier with respect to any PDA operations, so the compiler doesn't move any before here. */ - asm volatile ("mov %0, %%gs" : : "r" (__KERNEL_PDA) : "memory"); + asm volatile ("mov %0, %%fs" : : "r" (__KERNEL_PDA) : "memory"); } /* Initialize the CPU's GDT and PDA. The boot CPU does this for @@ -718,7 +718,7 @@ void __cpuinit cpu_set_gdt(int cpu) the boot CPU, this will transition from the boot gdt+pda to the real ones). */ load_gdt(cpu_gdt_descr); - set_kernel_gs(); + set_kernel_fs(); } /* Common CPU init for both boot and secondary CPUs */ @@ -764,8 +764,8 @@ static void __cpuinit _cpu_init(int cpu, struct task_struct *curr) __set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss); #endif - /* Clear %fs. */ - asm volatile ("mov %0, %%fs" : : "r" (0)); + /* Clear %gs. */ + asm volatile ("mov %0, %%gs" : : "r" (0)); /* Clear all 6 debug registers: */ set_debugreg(0, 0); diff --git a/arch/i386/kernel/cpu/cyrix.c b/arch/i386/kernel/cpu/cyrix.c index c0c3b59de32..de27bd07bc9 100644 --- a/arch/i386/kernel/cpu/cyrix.c +++ b/arch/i386/kernel/cpu/cyrix.c @@ -6,6 +6,7 @@ #include <asm/io.h> #include <asm/processor.h> #include <asm/timer.h> +#include <asm/pci-direct.h> #include "cpu.h" @@ -161,19 +162,19 @@ static void __cpuinit set_cx86_inc(void) static void __cpuinit geode_configure(void) { unsigned long flags; - u8 ccr3, ccr4; + u8 ccr3; local_irq_save(flags); /* Suspend on halt power saving and enable #SUSP pin */ setCx86(CX86_CCR2, getCx86(CX86_CCR2) | 0x88); ccr3 = getCx86(CX86_CCR3); - setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* Enable */ - - ccr4 = getCx86(CX86_CCR4); - ccr4 |= 0x38; /* FPU fast, DTE cache, Mem bypass */ + setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */ - setCx86(CX86_CCR3, ccr3); + + /* FPU fast, DTE cache, Mem bypass */ + setCx86(CX86_CCR4, getCx86(CX86_CCR4) | 0x38); + setCx86(CX86_CCR3, ccr3); /* disable MAPEN */ set_cx86_memwb(); set_cx86_reorder(); @@ -183,14 +184,6 @@ static void __cpuinit geode_configure(void) } -#ifdef CONFIG_PCI -static struct pci_device_id __cpuinitdata cyrix_55x0[] = { - { PCI_DEVICE(PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5510) }, - { PCI_DEVICE(PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5520) }, - { }, -}; -#endif - static void __cpuinit init_cyrix(struct cpuinfo_x86 *c) { unsigned char dir0, dir0_msn, dir0_lsn, dir1 = 0; @@ -258,6 +251,8 @@ static void __cpuinit init_cyrix(struct cpuinfo_x86 *c) case 4: /* MediaGX/GXm or Geode GXM/GXLV/GX1 */ #ifdef CONFIG_PCI + { + u32 vendor, device; /* It isn't really a PCI quirk directly, but the cure is the same. The MediaGX has deep magic SMM stuff that handles the SB emulation. It thows away the fifo on disable_dma() which @@ -273,22 +268,34 @@ static void __cpuinit init_cyrix(struct cpuinfo_x86 *c) printk(KERN_INFO "Working around Cyrix MediaGX virtual DMA bugs.\n"); isa_dma_bridge_buggy = 2; + /* We do this before the PCI layer is running. However we + are safe here as we know the bridge must be a Cyrix + companion and must be present */ + vendor = read_pci_config_16(0, 0, 0x12, PCI_VENDOR_ID); + device = read_pci_config_16(0, 0, 0x12, PCI_DEVICE_ID); /* * The 5510/5520 companion chips have a funky PIT. */ - if (pci_dev_present(cyrix_55x0)) + if (vendor == PCI_VENDOR_ID_CYRIX && + (device == PCI_DEVICE_ID_CYRIX_5510 || device == PCI_DEVICE_ID_CYRIX_5520)) pit_latch_buggy = 1; + } #endif c->x86_cache_size=16; /* Yep 16K integrated cache thats it */ /* GXm supports extended cpuid levels 'ala' AMD */ if (c->cpuid_level == 2) { /* Enable cxMMX extensions (GX1 Datasheet 54) */ - setCx86(CX86_CCR7, getCx86(CX86_CCR7)|1); + setCx86(CX86_CCR7, getCx86(CX86_CCR7) | 1); - /* GXlv/GXm/GX1 */ - if((dir1 >= 0x50 && dir1 <= 0x54) || dir1 >= 0x63) + /* + * GXm : 0x30 ... 0x5f GXm datasheet 51 + * GXlv: 0x6x GXlv datasheet 54 + * ? : 0x7x + * GX1 : 0x8x GX1 datasheet 56 + */ + if((0x30 <= dir1 && dir1 <= 0x6f) || (0x80 <=dir1 && dir1 <= 0x8f)) geode_configure(); get_model_name(c); /* get CPU marketing name */ return; @@ -415,15 +422,14 @@ static void __cpuinit cyrix_identify(struct cpuinfo_x86 * c) if (dir0 == 5 || dir0 == 3) { - unsigned char ccr3, ccr4; + unsigned char ccr3; unsigned long flags; printk(KERN_INFO "Enabling CPUID on Cyrix processor.\n"); local_irq_save(flags); ccr3 = getCx86(CX86_CCR3); - setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */ - ccr4 = getCx86(CX86_CCR4); - setCx86(CX86_CCR4, ccr4 | 0x80); /* enable cpuid */ - setCx86(CX86_CCR3, ccr3); /* disable MAPEN */ + setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */ + setCx86(CX86_CCR4, getCx86(CX86_CCR4) | 0x80); /* enable cpuid */ + setCx86(CX86_CCR3, ccr3); /* disable MAPEN */ local_irq_restore(flags); } } diff --git a/arch/i386/kernel/cpu/mcheck/mce.c b/arch/i386/kernel/cpu/mcheck/mce.c index d555bec0db9..4f10c62d180 100644 --- a/arch/i386/kernel/cpu/mcheck/mce.c +++ b/arch/i386/kernel/cpu/mcheck/mce.c @@ -12,6 +12,7 @@ #include <asm/processor.h> #include <asm/system.h> +#include <asm/mce.h> #include "mce.h" diff --git a/arch/i386/kernel/cpu/mcheck/mce.h b/arch/i386/kernel/cpu/mcheck/mce.h index 84fd4cf7d0f..81fb6e2d35f 100644 --- a/arch/i386/kernel/cpu/mcheck/mce.h +++ b/arch/i386/kernel/cpu/mcheck/mce.h @@ -1,4 +1,5 @@ #include <linux/init.h> +#include <asm/mce.h> void amd_mcheck_init(struct cpuinfo_x86 *c); void intel_p4_mcheck_init(struct cpuinfo_x86 *c); @@ -9,6 +10,5 @@ void winchip_mcheck_init(struct cpuinfo_x86 *c); /* Call the installed machine check handler for this CPU setup. */ extern fastcall void (*machine_check_vector)(struct pt_regs *, long error_code); -extern int mce_disabled; extern int nr_mce_banks; diff --git a/arch/i386/kernel/cpu/mcheck/p4.c b/arch/i386/kernel/cpu/mcheck/p4.c index 504434a4601..8359c19d3a2 100644 --- a/arch/i386/kernel/cpu/mcheck/p4.c +++ b/arch/i386/kernel/cpu/mcheck/p4.c @@ -12,6 +12,7 @@ #include <asm/system.h> #include <asm/msr.h> #include <asm/apic.h> +#include <asm/idle.h> #include <asm/therm_throt.h> @@ -59,6 +60,7 @@ static void (*vendor_thermal_interrupt)(struct pt_regs *regs) = unexpected_therm fastcall void smp_thermal_interrupt(struct pt_regs *regs) { + exit_idle(); irq_enter(); vendor_thermal_interrupt(regs); irq_exit(); diff --git a/arch/i386/kernel/cpu/mtrr/if.c b/arch/i386/kernel/cpu/mtrr/if.c index 5ae1705eafa..c7d8f175674 100644 --- a/arch/i386/kernel/cpu/mtrr/if.c +++ b/arch/i386/kernel/cpu/mtrr/if.c @@ -211,6 +211,9 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg) default: return -ENOTTY; case MTRRIOC_ADD_ENTRY: +#ifdef CONFIG_COMPAT + case MTRRIOC32_ADD_ENTRY: +#endif if (!capable(CAP_SYS_ADMIN)) return -EPERM; err = @@ -218,21 +221,33 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg) file, 0); break; case MTRRIOC_SET_ENTRY: +#ifdef CONFIG_COMPAT + case MTRRIOC32_SET_ENTRY: +#endif if (!capable(CAP_SYS_ADMIN)) return -EPERM; err = mtrr_add(sentry.base, sentry.size, sentry.type, 0); break; case MTRRIOC_DEL_ENTRY: +#ifdef CONFIG_COMPAT + case MTRRIOC32_DEL_ENTRY: +#endif if (!capable(CAP_SYS_ADMIN)) return -EPERM; err = mtrr_file_del(sentry.base, sentry.size, file, 0); break; case MTRRIOC_KILL_ENTRY: +#ifdef CONFIG_COMPAT + case MTRRIOC32_KILL_ENTRY: +#endif if (!capable(CAP_SYS_ADMIN)) return -EPERM; err = mtrr_del(-1, sentry.base, sentry.size); break; case MTRRIOC_GET_ENTRY: +#ifdef CONFIG_COMPAT + case MTRRIOC32_GET_ENTRY: +#endif if (gentry.regnum >= num_var_ranges) return -EINVAL; mtrr_if->get(gentry.regnum, &gentry.base, &size, &type); @@ -249,6 +264,9 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg) break; case MTRRIOC_ADD_PAGE_ENTRY: +#ifdef CONFIG_COMPAT + case MTRRIOC32_ADD_PAGE_ENTRY: +#endif if (!capable(CAP_SYS_ADMIN)) return -EPERM; err = @@ -256,21 +274,33 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg) file, 1); break; case MTRRIOC_SET_PAGE_ENTRY: +#ifdef CONFIG_COMPAT + case MTRRIOC32_SET_PAGE_ENTRY: +#endif if (!capable(CAP_SYS_ADMIN)) return -EPERM; err = mtrr_add_page(sentry.base, sentry.size, sentry.type, 0); break; case MTRRIOC_DEL_PAGE_ENTRY: +#ifdef CONFIG_COMPAT + case MTRRIOC32_DEL_PAGE_ENTRY: +#endif if (!capable(CAP_SYS_ADMIN)) return -EPERM; err = mtrr_file_del(sentry.base, sentry.size, file, 1); break; case MTRRIOC_KILL_PAGE_ENTRY: +#ifdef CONFIG_COMPAT + case MTRRIOC32_KILL_PAGE_ENTRY: +#endif if (!capable(CAP_SYS_ADMIN)) return -EPERM; err = mtrr_del_page(-1, sentry.base, sentry.size); break; case MTRRIOC_GET_PAGE_ENTRY: +#ifdef CONFIG_COMPAT + case MTRRIOC32_GET_PAGE_ENTRY: +#endif if (gentry.regnum >= num_var_ranges) return -EINVAL; mtrr_if->get(gentry.regnum, &gentry.base, &size, &type); @@ -339,7 +369,7 @@ static int mtrr_open(struct inode *inode, struct file *file) return single_open(file, mtrr_seq_show, NULL); } -static struct file_operations mtrr_fops = { +static const struct file_operations mtrr_fops = { .owner = THIS_MODULE, .open = mtrr_open, .read = seq_read, diff --git a/arch/i386/kernel/cpu/mtrr/main.c b/arch/i386/kernel/cpu/mtrr/main.c index 16bb7ea8714..0acfb6a5a22 100644 --- a/arch/i386/kernel/cpu/mtrr/main.c +++ b/arch/i386/kernel/cpu/mtrr/main.c @@ -50,7 +50,7 @@ u32 num_var_ranges = 0; unsigned int *usage_table; static DEFINE_MUTEX(mtrr_mutex); -u32 size_or_mask, size_and_mask; +u64 size_or_mask, size_and_mask; static struct mtrr_ops * mtrr_ops[X86_VENDOR_NUM] = {}; @@ -662,8 +662,8 @@ void __init mtrr_bp_init(void) boot_cpu_data.x86_mask == 0x4)) phys_addr = 36; - size_or_mask = ~((1 << (phys_addr - PAGE_SHIFT)) - 1); - size_and_mask = ~size_or_mask & 0xfff00000; + size_or_mask = ~((1ULL << (phys_addr - PAGE_SHIFT)) - 1); + size_and_mask = ~size_or_mask & 0xfffff00000ULL; } else if (boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR && boot_cpu_data.x86 == 6) { /* VIA C* family have Intel style MTRRs, but diff --git a/arch/i386/kernel/cpu/mtrr/mtrr.h b/arch/i386/kernel/cpu/mtrr/mtrr.h index d61ea9db6cf..289dfe6030e 100644 --- a/arch/i386/kernel/cpu/mtrr/mtrr.h +++ b/arch/i386/kernel/cpu/mtrr/mtrr.h @@ -84,7 +84,7 @@ void get_mtrr_state(void); extern void set_mtrr_ops(struct mtrr_ops * ops); -extern u32 size_or_mask, size_and_mask; +extern u64 size_or_mask, size_and_mask; extern struct mtrr_ops * mtrr_if; #define is_cpu(vnd) (mtrr_if && mtrr_if->vendor == X86_VENDOR_##vnd) diff --git a/arch/i386/kernel/cpu/proc.c b/arch/i386/kernel/cpu/proc.c index 6624d8583c4..47e3ebbfb28 100644 --- a/arch/i386/kernel/cpu/proc.c +++ b/arch/i386/kernel/cpu/proc.c @@ -29,7 +29,7 @@ static int show_cpuinfo(struct seq_file *m, void *v) NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, "syscall", NULL, NULL, NULL, NULL, NULL, NULL, NULL, "mp", "nx", NULL, "mmxext", NULL, - NULL, "fxsr_opt", "rdtscp", NULL, NULL, "lm", "3dnowext", "3dnow", + NULL, "fxsr_opt", "pdpe1gb", "rdtscp", NULL, "lm", "3dnowext", "3dnow", /* Transmeta-defined */ "recovery", "longrun", NULL, "lrti", NULL, NULL, NULL, NULL, @@ -47,7 +47,7 @@ static int show_cpuinfo(struct seq_file *m, void *v) /* Intel-defined (#2) */ "pni", NULL, NULL, "monitor", "ds_cpl", "vmx", "smx", "est", "tm2", "ssse3", "cid", NULL, NULL, "cx16", "xtpr", NULL, - NULL, NULL, "dca", NULL, NULL, NULL, NULL, NULL, + NULL, NULL, "dca", NULL, NULL, NULL, NULL, "popcnt", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, /* VIA/Cyrix/Centaur-defined */ @@ -57,8 +57,9 @@ static int show_cpuinfo(struct seq_file *m, void *v) NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, /* AMD-defined (#2) */ - "lahf_lm", "cmp_legacy", "svm", NULL, "cr8legacy", NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + "lahf_lm", "cmp_legacy", "svm", "extapic", "cr8legacy", "abm", + "sse4a", "misalignsse", + "3dnowprefetch", "osvw", "ibs", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, }; @@ -69,8 +70,11 @@ static int show_cpuinfo(struct seq_file *m, void *v) "ttp", /* thermal trip */ "tm", "stc", + "100mhzsteps", + "hwpstate", NULL, - /* nothing */ /* constant_tsc - moved to flags */ + NULL, /* constant_tsc - moved to flags */ + /* nothing */ }; struct cpuinfo_x86 *c = v; int i, n = c - cpu_data; diff --git a/arch/i386/kernel/cpu/transmeta.c b/arch/i386/kernel/cpu/transmeta.c index 4056fb7d2cd..5678d46863c 100644 --- a/arch/i386/kernel/cpu/transmeta.c +++ b/arch/i386/kernel/cpu/transmeta.c @@ -9,7 +9,7 @@ static void __cpuinit init_transmeta(struct cpuinfo_x86 *c) { unsigned int cap_mask, uk, max, dummy; unsigned int cms_rev1, cms_rev2; - unsigned int cpu_rev, cpu_freq, cpu_flags, new_cpu_rev; + unsigned int cpu_rev, cpu_freq = 0, cpu_flags, new_cpu_rev; char cpu_info[65]; get_model_name(c); /* Same as AMD/Cyrix */ @@ -72,6 +72,9 @@ static void __cpuinit init_transmeta(struct cpuinfo_x86 *c) wrmsr(0x80860004, ~0, uk); c->x86_capability[0] = cpuid_edx(0x00000001); wrmsr(0x80860004, cap_mask, uk); + + /* All Transmeta CPUs have a constant TSC */ + set_bit(X86_FEATURE_CONSTANT_TSC, c->x86_capability); /* If we can run i686 user-space code, call us an i686 */ #define USER686 (X86_FEATURE_TSC|X86_FEATURE_CX8|X86_FEATURE_CMOV) diff --git a/arch/i386/kernel/cpuid.c b/arch/i386/kernel/cpuid.c index 51130b39cd2..eeae0d99233 100644 --- a/arch/i386/kernel/cpuid.c +++ b/arch/i386/kernel/cpuid.c @@ -48,7 +48,6 @@ static struct class *cpuid_class; #ifdef CONFIG_SMP struct cpuid_command { - int cpu; u32 reg; u32 *data; }; @@ -57,8 +56,7 @@ static void cpuid_smp_cpuid(void *cmd_block) { struct cpuid_command *cmd = (struct cpuid_command *)cmd_block; - if (cmd->cpu == smp_processor_id()) - cpuid(cmd->reg, &cmd->data[0], &cmd->data[1], &cmd->data[2], + cpuid(cmd->reg, &cmd->data[0], &cmd->data[1], &cmd->data[2], &cmd->data[3]); } @@ -70,11 +68,10 @@ static inline void do_cpuid(int cpu, u32 reg, u32 * data) if (cpu == smp_processor_id()) { cpuid(reg, &data[0], &data[1], &data[2], &data[3]); } else { - cmd.cpu = cpu; cmd.reg = reg; cmd.data = data; - smp_call_function(cpuid_smp_cpuid, &cmd, 1, 1); + smp_call_function_single(cpu, cpuid_smp_cpuid, &cmd, 1, 1); } preempt_enable(); } @@ -148,7 +145,7 @@ static int cpuid_open(struct inode *inode, struct file *file) /* * File operations we support */ -static struct file_operations cpuid_fops = { +static const struct file_operations cpuid_fops = { .owner = THIS_MODULE, .llseek = cpuid_seek, .read = cpuid_read, diff --git a/arch/i386/kernel/e820.c b/arch/i386/kernel/e820.c index f391abcf7da..70f39560846 100644 --- a/arch/i386/kernel/e820.c +++ b/arch/i386/kernel/e820.c @@ -14,6 +14,7 @@ #include <asm/pgtable.h> #include <asm/page.h> #include <asm/e820.h> +#include <asm/setup.h> #ifdef CONFIG_EFI int efi_enabled = 0; @@ -156,21 +157,22 @@ static struct resource standard_io_resources[] = { { .flags = IORESOURCE_BUSY | IORESOURCE_IO } }; -static int romsignature(const unsigned char *x) +#define ROMSIGNATURE 0xaa55 + +static int __init romsignature(const unsigned char *rom) { unsigned short sig; - int ret = 0; - if (probe_kernel_address((const unsigned short *)x, sig) == 0) - ret = (sig == 0xaa55); - return ret; + + return probe_kernel_address((const unsigned short *)rom, sig) == 0 && + sig == ROMSIGNATURE; } static int __init romchecksum(unsigned char *rom, unsigned long length) { - unsigned char *p, sum = 0; + unsigned char sum; - for (p = rom; p < rom + length; p++) - sum += *p; + for (sum = 0; length; length--) + sum += *rom++; return sum == 0; } diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S index 5e47683fc63..18bddcb8e9e 100644 --- a/arch/i386/kernel/entry.S +++ b/arch/i386/kernel/entry.S @@ -30,7 +30,7 @@ * 18(%esp) - %eax * 1C(%esp) - %ds * 20(%esp) - %es - * 24(%esp) - %gs + * 24(%esp) - %fs * 28(%esp) - orig_eax * 2C(%esp) - %eip * 30(%esp) - %cs @@ -99,9 +99,9 @@ VM_MASK = 0x00020000 #define SAVE_ALL \ cld; \ - pushl %gs; \ + pushl %fs; \ CFI_ADJUST_CFA_OFFSET 4;\ - /*CFI_REL_OFFSET gs, 0;*/\ + /*CFI_REL_OFFSET fs, 0;*/\ pushl %es; \ CFI_ADJUST_CFA_OFFSET 4;\ /*CFI_REL_OFFSET es, 0;*/\ @@ -133,7 +133,7 @@ VM_MASK = 0x00020000 movl %edx, %ds; \ movl %edx, %es; \ movl $(__KERNEL_PDA), %edx; \ - movl %edx, %gs + movl %edx, %fs #define RESTORE_INT_REGS \ popl %ebx; \ @@ -166,9 +166,9 @@ VM_MASK = 0x00020000 2: popl %es; \ CFI_ADJUST_CFA_OFFSET -4;\ /*CFI_RESTORE es;*/\ -3: popl %gs; \ +3: popl %fs; \ CFI_ADJUST_CFA_OFFSET -4;\ - /*CFI_RESTORE gs;*/\ + /*CFI_RESTORE fs;*/\ .pushsection .fixup,"ax"; \ 4: movl $0,(%esp); \ jmp 1b; \ @@ -227,6 +227,7 @@ ENTRY(ret_from_fork) CFI_ADJUST_CFA_OFFSET -4 jmp syscall_exit CFI_ENDPROC +END(ret_from_fork) /* * Return to user mode is not as complex as all this looks, @@ -258,6 +259,7 @@ ENTRY(resume_userspace) # int/exception return? jne work_pending jmp restore_all +END(ret_from_exception) #ifdef CONFIG_PREEMPT ENTRY(resume_kernel) @@ -272,6 +274,7 @@ need_resched: jz restore_all call preempt_schedule_irq jmp need_resched +END(resume_kernel) #endif CFI_ENDPROC @@ -349,16 +352,17 @@ sysenter_past_esp: movl PT_OLDESP(%esp), %ecx xorl %ebp,%ebp TRACE_IRQS_ON -1: mov PT_GS(%esp), %gs +1: mov PT_FS(%esp), %fs ENABLE_INTERRUPTS_SYSEXIT CFI_ENDPROC .pushsection .fixup,"ax" -2: movl $0,PT_GS(%esp) +2: movl $0,PT_FS(%esp) jmp 1b .section __ex_table,"a" .align 4 .long 1b,2b .popsection +ENDPROC(sysenter_entry) # system call handler stub ENTRY(system_call) @@ -459,6 +463,7 @@ ldt_ss: CFI_ADJUST_CFA_OFFSET -8 jmp restore_nocheck CFI_ENDPROC +ENDPROC(system_call) # perform work that needs to be done immediately before resumption ALIGN @@ -504,6 +509,7 @@ work_notifysig_v86: xorl %edx, %edx call do_notify_resume jmp resume_userspace_sig +END(work_pending) # perform syscall exit tracing ALIGN @@ -519,6 +525,7 @@ syscall_trace_entry: cmpl $(nr_syscalls), %eax jnae syscall_call jmp syscall_exit +END(syscall_trace_entry) # perform syscall exit tracing ALIGN @@ -532,6 +539,7 @@ syscall_exit_work: movl $1, %edx call do_syscall_trace jmp resume_userspace +END(syscall_exit_work) CFI_ENDPROC RING0_INT_FRAME # can't unwind into user space anyway @@ -542,15 +550,17 @@ syscall_fault: GET_THREAD_INFO(%ebp) movl $-EFAULT,PT_EAX(%esp) jmp resume_userspace +END(syscall_fault) syscall_badsys: movl $-ENOSYS,PT_EAX(%esp) jmp resume_userspace +END(syscall_badsys) CFI_ENDPROC #define FIXUP_ESPFIX_STACK \ /* since we are on a wrong stack, we cant make it a C code :( */ \ - movl %gs:PDA_cpu, %ebx; \ + movl %fs:PDA_cpu, %ebx; \ PER_CPU(cpu_gdt_descr, %ebx); \ movl GDS_address(%ebx), %ebx; \ GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah); \ @@ -581,9 +591,9 @@ syscall_badsys: ENTRY(interrupt) .text -vector=0 ENTRY(irq_entries_start) RING0_INT_FRAME +vector=0 .rept NR_IRQS ALIGN .if vector @@ -592,11 +602,16 @@ ENTRY(irq_entries_start) 1: pushl $~(vector) CFI_ADJUST_CFA_OFFSET 4 jmp common_interrupt -.data + .previous .long 1b -.text + .text vector=vector+1 .endr +END(irq_entries_start) + +.previous +END(interrupt) +.previous /* * the CPU automatically disables interrupts when executing an IRQ vector, @@ -609,6 +624,7 @@ common_interrupt: movl %esp,%eax call do_IRQ jmp ret_from_intr +ENDPROC(common_interrupt) CFI_ENDPROC #define BUILD_INTERRUPT(name, nr) \ @@ -621,18 +637,24 @@ ENTRY(name) \ movl %esp,%eax; \ call smp_/**/name; \ jmp ret_from_intr; \ - CFI_ENDPROC + CFI_ENDPROC; \ +ENDPROC(name) /* The include is where all of the SMP etc. interrupts come from */ #include "entry_arch.h" +/* This alternate entry is needed because we hijack the apic LVTT */ +#if defined(CONFIG_VMI) && defined(CONFIG_X86_LOCAL_APIC) +BUILD_INTERRUPT(apic_vmi_timer_interrupt,LOCAL_TIMER_VECTOR) +#endif + KPROBE_ENTRY(page_fault) RING0_EC_FRAME pushl $do_page_fault CFI_ADJUST_CFA_OFFSET 4 ALIGN error_code: - /* the function address is in %gs's slot on the stack */ + /* the function address is in %fs's slot on the stack */ pushl %es CFI_ADJUST_CFA_OFFSET 4 /*CFI_REL_OFFSET es, 0*/ @@ -661,20 +683,20 @@ error_code: CFI_ADJUST_CFA_OFFSET 4 CFI_REL_OFFSET ebx, 0 cld - pushl %gs + pushl %fs CFI_ADJUST_CFA_OFFSET 4 - /*CFI_REL_OFFSET gs, 0*/ + /*CFI_REL_OFFSET fs, 0*/ movl $(__KERNEL_PDA), %ecx - movl %ecx, %gs + movl %ecx, %fs UNWIND_ESPFIX_STACK popl %ecx CFI_ADJUST_CFA_OFFSET -4 /*CFI_REGISTER es, ecx*/ - movl PT_GS(%esp), %edi # get the function address + movl PT_FS(%esp), %edi # get the function address movl PT_ORIG_EAX(%esp), %edx # get the error code movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart - mov %ecx, PT_GS(%esp) - /*CFI_REL_OFFSET gs, ES*/ + mov %ecx, PT_FS(%esp) + /*CFI_REL_OFFSET fs, ES*/ movl $(__USER_DS), %ecx movl %ecx, %ds movl %ecx, %es @@ -692,6 +714,7 @@ ENTRY(coprocessor_error) CFI_ADJUST_CFA_OFFSET 4 jmp error_code CFI_ENDPROC +END(coprocessor_error) ENTRY(simd_coprocessor_error) RING0_INT_FRAME @@ -701,6 +724,7 @@ ENTRY(simd_coprocessor_error) CFI_ADJUST_CFA_OFFSET 4 jmp error_code CFI_ENDPROC +END(simd_coprocessor_error) ENTRY(device_not_available) RING0_INT_FRAME @@ -721,6 +745,7 @@ device_not_available_emulate: CFI_ADJUST_CFA_OFFSET -4 jmp ret_from_exception CFI_ENDPROC +END(device_not_available) /* * Debug traps and NMI can happen at the one SYSENTER instruction @@ -864,10 +889,12 @@ ENTRY(native_iret) .align 4 .long 1b,iret_exc .previous +END(native_iret) ENTRY(native_irq_enable_sysexit) sti sysexit +END(native_irq_enable_sysexit) #endif KPROBE_ENTRY(int3) @@ -890,6 +917,7 @@ ENTRY(overflow) CFI_ADJUST_CFA_OFFSET 4 jmp error_code CFI_ENDPROC +END(overflow) ENTRY(bounds) RING0_INT_FRAME @@ -899,6 +927,7 @@ ENTRY(bounds) CFI_ADJUST_CFA_OFFSET 4 jmp error_code CFI_ENDPROC +END(bounds) ENTRY(invalid_op) RING0_INT_FRAME @@ -908,6 +937,7 @@ ENTRY(invalid_op) CFI_ADJUST_CFA_OFFSET 4 jmp error_code CFI_ENDPROC +END(invalid_op) ENTRY(coprocessor_segment_overrun) RING0_INT_FRAME @@ -917,6 +947,7 @@ ENTRY(coprocessor_segment_overrun) CFI_ADJUST_CFA_OFFSET 4 jmp error_code CFI_ENDPROC +END(coprocessor_segment_overrun) ENTRY(invalid_TSS) RING0_EC_FRAME @@ -924,6 +955,7 @@ ENTRY(invalid_TSS) CFI_ADJUST_CFA_OFFSET 4 jmp error_code CFI_ENDPROC +END(invalid_TSS) ENTRY(segment_not_present) RING0_EC_FRAME @@ -931,6 +963,7 @@ ENTRY(segment_not_present) CFI_ADJUST_CFA_OFFSET 4 jmp error_code CFI_ENDPROC +END(segment_not_present) ENTRY(stack_segment) RING0_EC_FRAME @@ -938,6 +971,7 @@ ENTRY(stack_segment) CFI_ADJUST_CFA_OFFSET 4 jmp error_code CFI_ENDPROC +END(stack_segment) KPROBE_ENTRY(general_protection) RING0_EC_FRAME @@ -953,6 +987,7 @@ ENTRY(alignment_check) CFI_ADJUST_CFA_OFFSET 4 jmp error_code CFI_ENDPROC +END(alignment_check) ENTRY(divide_error) RING0_INT_FRAME @@ -962,6 +997,7 @@ ENTRY(divide_error) CFI_ADJUST_CFA_OFFSET 4 jmp error_code CFI_ENDPROC +END(divide_error) #ifdef CONFIG_X86_MCE ENTRY(machine_check) @@ -972,6 +1008,7 @@ ENTRY(machine_check) CFI_ADJUST_CFA_OFFSET 4 jmp error_code CFI_ENDPROC +END(machine_check) #endif ENTRY(spurious_interrupt_bug) @@ -982,6 +1019,7 @@ ENTRY(spurious_interrupt_bug) CFI_ADJUST_CFA_OFFSET 4 jmp error_code CFI_ENDPROC +END(spurious_interrupt_bug) ENTRY(kernel_thread_helper) pushl $0 # fake return address for unwinder diff --git a/arch/i386/kernel/head.S b/arch/i386/kernel/head.S index edef5084ce1..3fa7f9389af 100644 --- a/arch/i386/kernel/head.S +++ b/arch/i386/kernel/head.S @@ -53,6 +53,7 @@ * any particular GDT layout, because we load our own as soon as we * can. */ +.section .text.head,"ax",@progbits ENTRY(startup_32) #ifdef CONFIG_PARAVIRT @@ -103,7 +104,7 @@ ENTRY(startup_32) movzwl OLD_CL_OFFSET,%esi addl $(OLD_CL_BASE_ADDR),%esi 2: - movl $(saved_command_line - __PAGE_OFFSET),%edi + movl $(boot_command_line - __PAGE_OFFSET),%edi movl $(COMMAND_LINE_SIZE/4),%ecx rep movsl @@ -141,16 +142,25 @@ page_pde_offset = (__PAGE_OFFSET >> 20); jb 10b movl %edi,(init_pg_tables_end - __PAGE_OFFSET) -#ifdef CONFIG_SMP xorl %ebx,%ebx /* This is the boot CPU (BSP) */ jmp 3f - /* * Non-boot CPU entry point; entered from trampoline.S * We can't lgdt here, because lgdt itself uses a data segment, but * we know the trampoline has already loaded the boot_gdt_table GDT * for us. + * + * If cpu hotplug is not supported then this code can go in init section + * which will be freed later */ + +#ifdef CONFIG_HOTPLUG_CPU +.section .text,"ax",@progbits +#else +.section .init.text,"ax",@progbits +#endif + +#ifdef CONFIG_SMP ENTRY(startup_32_smp) cld movl $(__BOOT_DS),%eax @@ -208,8 +218,8 @@ ENTRY(startup_32_smp) xorl %ebx,%ebx incl %ebx -3: #endif /* CONFIG_SMP */ +3: /* * Enable paging @@ -309,7 +319,7 @@ is386: movl $2,%ecx # set MP call check_x87 call setup_pda - lgdt cpu_gdt_descr + lgdt early_gdt_descr lidt idt_descr ljmp $(__KERNEL_CS),$1f 1: movl $(__KERNEL_DS),%eax # reload all the segment registers @@ -319,12 +329,12 @@ is386: movl $2,%ecx # set MP movl %eax,%ds movl %eax,%es - xorl %eax,%eax # Clear FS and LDT - movl %eax,%fs + xorl %eax,%eax # Clear GS and LDT + movl %eax,%gs lldt %ax movl $(__KERNEL_PDA),%eax - mov %eax,%gs + mov %eax,%fs cld # gcc2 wants the direction flag cleared at all times pushl $0 # fake return address for unwinder @@ -360,12 +370,12 @@ check_x87: * cpu_gdt_table and boot_pda; for secondary CPUs, these will be * that CPU's GDT and PDA. */ -setup_pda: +ENTRY(setup_pda) /* get the PDA pointer */ movl start_pda, %eax /* slot the PDA address into the GDT */ - mov cpu_gdt_descr+2, %ecx + mov early_gdt_descr+2, %ecx mov %ax, (__KERNEL_PDA+0+2)(%ecx) /* base & 0x0000ffff */ shr $16, %eax mov %al, (__KERNEL_PDA+4+0)(%ecx) /* base & 0x00ff0000 */ @@ -492,6 +502,7 @@ ignore_int: #endif iret +.section .text #ifdef CONFIG_PARAVIRT startup_paravirt: cld @@ -502,10 +513,11 @@ startup_paravirt: pushl %ecx pushl %eax - /* paravirt.o is last in link, and that probe fn never returns */ pushl $__start_paravirtprobe 1: movl 0(%esp), %eax + cmpl $__stop_paravirtprobe, %eax + je unhandled_paravirt pushl (%eax) movl 8(%esp), %eax call *(%esp) @@ -517,6 +529,10 @@ startup_paravirt: addl $4, (%esp) jmp 1b + +unhandled_paravirt: + /* Nothing wanted us: we're screwed. */ + ud2 #endif /* @@ -581,7 +597,7 @@ idt_descr: # boot GDT descriptor (later on used by CPU#0): .word 0 # 32 bit align gdt_desc.address -ENTRY(cpu_gdt_descr) +ENTRY(early_gdt_descr) .word GDT_ENTRIES*8-1 .long cpu_gdt_table diff --git a/arch/i386/kernel/hpet.c b/arch/i386/kernel/hpet.c index 0b29d41322a..e1006b7acc9 100644 --- a/arch/i386/kernel/hpet.c +++ b/arch/i386/kernel/hpet.c @@ -1,4 +1,5 @@ #include <linux/clocksource.h> +#include <linux/clockchips.h> #include <linux/errno.h> #include <linux/hpet.h> #include <linux/init.h> @@ -6,17 +7,278 @@ #include <asm/hpet.h> #include <asm/io.h> +extern struct clock_event_device *global_clock_event; + #define HPET_MASK CLOCKSOURCE_MASK(32) #define HPET_SHIFT 22 /* FSEC = 10^-15 NSEC = 10^-9 */ #define FSEC_PER_NSEC 1000000 -static void __iomem *hpet_ptr; +/* + * HPET address is set in acpi/boot.c, when an ACPI entry exists + */ +unsigned long hpet_address; +static void __iomem * hpet_virt_address; + +static inline unsigned long hpet_readl(unsigned long a) +{ + return readl(hpet_virt_address + a); +} + +static inline void hpet_writel(unsigned long d, unsigned long a) +{ + writel(d, hpet_virt_address + a); +} + +/* + * HPET command line enable / disable + */ +static int boot_hpet_disable; + +static int __init hpet_setup(char* str) +{ + if (str) { + if (!strncmp("disable", str, 7)) + boot_hpet_disable = 1; + } + return 1; +} +__setup("hpet=", hpet_setup); + +static inline int is_hpet_capable(void) +{ + return (!boot_hpet_disable && hpet_address); +} + +/* + * HPET timer interrupt enable / disable + */ +static int hpet_legacy_int_enabled; + +/** + * is_hpet_enabled - check whether the hpet timer interrupt is enabled + */ +int is_hpet_enabled(void) +{ + return is_hpet_capable() && hpet_legacy_int_enabled; +} + +/* + * When the hpet driver (/dev/hpet) is enabled, we need to reserve + * timer 0 and timer 1 in case of RTC emulation. + */ +#ifdef CONFIG_HPET +static void hpet_reserve_platform_timers(unsigned long id) +{ + struct hpet __iomem *hpet = hpet_virt_address; + struct hpet_timer __iomem *timer = &hpet->hpet_timers[2]; + unsigned int nrtimers, i; + struct hpet_data hd; + + nrtimers = ((id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT) + 1; + + memset(&hd, 0, sizeof (hd)); + hd.hd_phys_address = hpet_address; + hd.hd_address = hpet_virt_address; + hd.hd_nirqs = nrtimers; + hd.hd_flags = HPET_DATA_PLATFORM; + hpet_reserve_timer(&hd, 0); + +#ifdef CONFIG_HPET_EMULATE_RTC + hpet_reserve_timer(&hd, 1); +#endif + + hd.hd_irq[0] = HPET_LEGACY_8254; + hd.hd_irq[1] = HPET_LEGACY_RTC; + + for (i = 2; i < nrtimers; timer++, i++) + hd.hd_irq[i] = (timer->hpet_config & Tn_INT_ROUTE_CNF_MASK) >> + Tn_INT_ROUTE_CNF_SHIFT; + + hpet_alloc(&hd); + +} +#else +static void hpet_reserve_platform_timers(unsigned long id) { } +#endif + +/* + * Common hpet info + */ +static unsigned long hpet_period; + +static void hpet_set_mode(enum clock_event_mode mode, + struct clock_event_device *evt); +static int hpet_next_event(unsigned long delta, + struct clock_event_device *evt); + +/* + * The hpet clock event device + */ +static struct clock_event_device hpet_clockevent = { + .name = "hpet", + .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT, + .set_mode = hpet_set_mode, + .set_next_event = hpet_next_event, + .shift = 32, + .irq = 0, +}; + +static void hpet_start_counter(void) +{ + unsigned long cfg = hpet_readl(HPET_CFG); + + cfg &= ~HPET_CFG_ENABLE; + hpet_writel(cfg, HPET_CFG); + hpet_writel(0, HPET_COUNTER); + hpet_writel(0, HPET_COUNTER + 4); + cfg |= HPET_CFG_ENABLE; + hpet_writel(cfg, HPET_CFG); +} + +static void hpet_enable_int(void) +{ + unsigned long cfg = hpet_readl(HPET_CFG); + + cfg |= HPET_CFG_LEGACY; + hpet_writel(cfg, HPET_CFG); + hpet_legacy_int_enabled = 1; +} + +static void hpet_set_mode(enum clock_event_mode mode, + struct clock_event_device *evt) +{ + unsigned long cfg, cmp, now; + uint64_t delta; + + switch(mode) { + case CLOCK_EVT_MODE_PERIODIC: + delta = ((uint64_t)(NSEC_PER_SEC/HZ)) * hpet_clockevent.mult; + delta >>= hpet_clockevent.shift; + now = hpet_readl(HPET_COUNTER); + cmp = now + (unsigned long) delta; + cfg = hpet_readl(HPET_T0_CFG); + cfg |= HPET_TN_ENABLE | HPET_TN_PERIODIC | + HPET_TN_SETVAL | HPET_TN_32BIT; + hpet_writel(cfg, HPET_T0_CFG); + /* + * The first write after writing TN_SETVAL to the + * config register sets the counter value, the second + * write sets the period. + */ + hpet_writel(cmp, HPET_T0_CMP); + udelay(1); + hpet_writel((unsigned long) delta, HPET_T0_CMP); + break; + + case CLOCK_EVT_MODE_ONESHOT: + cfg = hpet_readl(HPET_T0_CFG); + cfg &= ~HPET_TN_PERIODIC; + cfg |= HPET_TN_ENABLE | HPET_TN_32BIT; + hpet_writel(cfg, HPET_T0_CFG); + break; + + case CLOCK_EVT_MODE_UNUSED: + case CLOCK_EVT_MODE_SHUTDOWN: + cfg = hpet_readl(HPET_T0_CFG); + cfg &= ~HPET_TN_ENABLE; + hpet_writel(cfg, HPET_T0_CFG); + break; + } +} + +static int hpet_next_event(unsigned long delta, + struct clock_event_device *evt) +{ + unsigned long cnt; + + cnt = hpet_readl(HPET_COUNTER); + cnt += delta; + hpet_writel(cnt, HPET_T0_CMP); + + return ((long)(hpet_readl(HPET_COUNTER) - cnt ) > 0); +} + +/* + * Try to setup the HPET timer + */ +int __init hpet_enable(void) +{ + unsigned long id; + uint64_t hpet_freq; + + if (!is_hpet_capable()) + return 0; + + hpet_virt_address = ioremap_nocache(hpet_address, HPET_MMAP_SIZE); + + /* + * Read the period and check for a sane value: + */ + hpet_period = hpet_readl(HPET_PERIOD); + if (hpet_period < HPET_MIN_PERIOD || hpet_period > HPET_MAX_PERIOD) + goto out_nohpet; + + /* + * The period is a femto seconds value. We need to calculate the + * scaled math multiplication factor for nanosecond to hpet tick + * conversion. + */ + hpet_freq = 1000000000000000ULL; + do_div(hpet_freq, hpet_period); + hpet_clockevent.mult = div_sc((unsigned long) hpet_freq, + NSEC_PER_SEC, 32); + /* Calculate the min / max delta */ + hpet_clockevent.max_delta_ns = clockevent_delta2ns(0x7FFFFFFF, + &hpet_clockevent); + hpet_clockevent.min_delta_ns = clockevent_delta2ns(0x30, + &hpet_clockevent); + + /* + * Read the HPET ID register to retrieve the IRQ routing + * information and the number of channels + */ + id = hpet_readl(HPET_ID); + +#ifdef CONFIG_HPET_EMULATE_RTC + /* + * The legacy routing mode needs at least two channels, tick timer + * and the rtc emulation channel. + */ + if (!(id & HPET_ID_NUMBER)) + goto out_nohpet; +#endif + + /* Start the counter */ + hpet_start_counter(); + + if (id & HPET_ID_LEGSUP) { + hpet_enable_int(); + hpet_reserve_platform_timers(id); + /* + * Start hpet with the boot cpu mask and make it + * global after the IO_APIC has been initialized. + */ + hpet_clockevent.cpumask =cpumask_of_cpu(0); + clockevents_register_device(&hpet_clockevent); + global_clock_event = &hpet_clockevent; + return 1; + } + return 0; +out_nohpet: + iounmap(hpet_virt_address); + hpet_virt_address = NULL; + return 0; +} + +/* + * Clock source related code + */ static cycle_t read_hpet(void) { - return (cycle_t)readl(hpet_ptr); + return (cycle_t)hpet_readl(HPET_COUNTER); } static struct clocksource clocksource_hpet = { @@ -24,28 +286,17 @@ static struct clocksource clocksource_hpet = { .rating = 250, .read = read_hpet, .mask = HPET_MASK, - .mult = 0, /* set below */ .shift = HPET_SHIFT, - .is_continuous = 1, + .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; static int __init init_hpet_clocksource(void) { - unsigned long hpet_period; - void __iomem* hpet_base; u64 tmp; - int err; - if (!is_hpet_enabled()) + if (!hpet_virt_address) return -ENODEV; - /* calculate the hpet address: */ - hpet_base = ioremap_nocache(hpet_address, HPET_MMAP_SIZE); - hpet_ptr = hpet_base + HPET_COUNTER; - - /* calculate the frequency: */ - hpet_period = readl(hpet_base + HPET_PERIOD); - /* * hpet period is in femto seconds per cycle * so we need to convert this to ns/cyc units @@ -61,11 +312,218 @@ static int __init init_hpet_clocksource(void) do_div(tmp, FSEC_PER_NSEC); clocksource_hpet.mult = (u32)tmp; - err = clocksource_register(&clocksource_hpet); - if (err) - iounmap(hpet_base); - - return err; + return clocksource_register(&clocksource_hpet); } module_init(init_hpet_clocksource); + +#ifdef CONFIG_HPET_EMULATE_RTC + +/* HPET in LegacyReplacement Mode eats up RTC interrupt line. When, HPET + * is enabled, we support RTC interrupt functionality in software. + * RTC has 3 kinds of interrupts: + * 1) Update Interrupt - generate an interrupt, every sec, when RTC clock + * is updated + * 2) Alarm Interrupt - generate an interrupt at a specific time of day + * 3) Periodic Interrupt - generate periodic interrupt, with frequencies + * 2Hz-8192Hz (2Hz-64Hz for non-root user) (all freqs in powers of 2) + * (1) and (2) above are implemented using polling at a frequency of + * 64 Hz. The exact frequency is a tradeoff between accuracy and interrupt + * overhead. (DEFAULT_RTC_INT_FREQ) + * For (3), we use interrupts at 64Hz or user specified periodic + * frequency, whichever is higher. + */ +#include <linux/mc146818rtc.h> +#include <linux/rtc.h> + +#define DEFAULT_RTC_INT_FREQ 64 +#define DEFAULT_RTC_SHIFT 6 +#define RTC_NUM_INTS 1 + +static unsigned long hpet_rtc_flags; +static unsigned long hpet_prev_update_sec; +static struct rtc_time hpet_alarm_time; +static unsigned long hpet_pie_count; +static unsigned long hpet_t1_cmp; +static unsigned long hpet_default_delta; +static unsigned long hpet_pie_delta; +static unsigned long hpet_pie_limit; + +/* + * Timer 1 for RTC emulation. We use one shot mode, as periodic mode + * is not supported by all HPET implementations for timer 1. + * + * hpet_rtc_timer_init() is called when the rtc is initialized. + */ +int hpet_rtc_timer_init(void) +{ + unsigned long cfg, cnt, delta, flags; + + if (!is_hpet_enabled()) + return 0; + + if (!hpet_default_delta) { + uint64_t clc; + + clc = (uint64_t) hpet_clockevent.mult * NSEC_PER_SEC; + clc >>= hpet_clockevent.shift + DEFAULT_RTC_SHIFT; + hpet_default_delta = (unsigned long) clc; + } + + if (!(hpet_rtc_flags & RTC_PIE) || hpet_pie_limit) + delta = hpet_default_delta; + else + delta = hpet_pie_delta; + + local_irq_save(flags); + + cnt = delta + hpet_readl(HPET_COUNTER); + hpet_writel(cnt, HPET_T1_CMP); + hpet_t1_cmp = cnt; + + cfg = hpet_readl(HPET_T1_CFG); + cfg &= ~HPET_TN_PERIODIC; + cfg |= HPET_TN_ENABLE | HPET_TN_32BIT; + hpet_writel(cfg, HPET_T1_CFG); + + local_irq_restore(flags); + + return 1; +} + +/* + * The functions below are called from rtc driver. + * Return 0 if HPET is not being used. + * Otherwise do the necessary changes and return 1. + */ +int hpet_mask_rtc_irq_bit(unsigned long bit_mask) +{ + if (!is_hpet_enabled()) + return 0; + + hpet_rtc_flags &= ~bit_mask; + return 1; +} + +int hpet_set_rtc_irq_bit(unsigned long bit_mask) +{ + unsigned long oldbits = hpet_rtc_flags; + + if (!is_hpet_enabled()) + return 0; + + hpet_rtc_flags |= bit_mask; + + if (!oldbits) + hpet_rtc_timer_init(); + + return 1; +} + +int hpet_set_alarm_time(unsigned char hrs, unsigned char min, + unsigned char sec) +{ + if (!is_hpet_enabled()) + return 0; + + hpet_alarm_time.tm_hour = hrs; + hpet_alarm_time.tm_min = min; + hpet_alarm_time.tm_sec = sec; + + return 1; +} + +int hpet_set_periodic_freq(unsigned long freq) +{ + uint64_t clc; + + if (!is_hpet_enabled()) + return 0; + + if (freq <= DEFAULT_RTC_INT_FREQ) + hpet_pie_limit = DEFAULT_RTC_INT_FREQ / freq; + else { + clc = (uint64_t) hpet_clockevent.mult * NSEC_PER_SEC; + do_div(clc, freq); + clc >>= hpet_clockevent.shift; + hpet_pie_delta = (unsigned long) clc; + } + return 1; +} + +int hpet_rtc_dropped_irq(void) +{ + return is_hpet_enabled(); +} + +static void hpet_rtc_timer_reinit(void) +{ + unsigned long cfg, delta; + int lost_ints = -1; + + if (unlikely(!hpet_rtc_flags)) { + cfg = hpet_readl(HPET_T1_CFG); + cfg &= ~HPET_TN_ENABLE; + hpet_writel(cfg, HPET_T1_CFG); + return; + } + + if (!(hpet_rtc_flags & RTC_PIE) || hpet_pie_limit) + delta = hpet_default_delta; + else + delta = hpet_pie_delta; + + /* + * Increment the comparator value until we are ahead of the + * current count. + */ + do { + hpet_t1_cmp += delta; + hpet_writel(hpet_t1_cmp, HPET_T1_CMP); + lost_ints++; + } while ((long)(hpet_readl(HPET_COUNTER) - hpet_t1_cmp) > 0); + + if (lost_ints) { + if (hpet_rtc_flags & RTC_PIE) + hpet_pie_count += lost_ints; + if (printk_ratelimit()) + printk(KERN_WARNING "rtc: lost %d interrupts\n", + lost_ints); + } +} + +irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id) +{ + struct rtc_time curr_time; + unsigned long rtc_int_flag = 0; + + hpet_rtc_timer_reinit(); + + if (hpet_rtc_flags & (RTC_UIE | RTC_AIE)) + rtc_get_rtc_time(&curr_time); + + if (hpet_rtc_flags & RTC_UIE && + curr_time.tm_sec != hpet_prev_update_sec) { + rtc_int_flag = RTC_UF; + hpet_prev_update_sec = curr_time.tm_sec; + } + + if (hpet_rtc_flags & RTC_PIE && + ++hpet_pie_count >= hpet_pie_limit) { + rtc_int_flag |= RTC_PF; + hpet_pie_count = 0; + } + + if (hpet_rtc_flags & RTC_PIE && + (curr_time.tm_sec == hpet_alarm_time.tm_sec) && + (curr_time.tm_min == hpet_alarm_time.tm_min) && + (curr_time.tm_hour == hpet_alarm_time.tm_hour)) + rtc_int_flag |= RTC_AF; + + if (rtc_int_flag) { + rtc_int_flag |= (RTC_IRQF | (RTC_NUM_INTS << 8)); + rtc_interrupt(rtc_int_flag, dev_id); + } + return IRQ_HANDLED; +} +#endif diff --git a/arch/i386/kernel/i8253.c b/arch/i386/kernel/i8253.c index 9a0060b92e3..a6bc7bb3883 100644 --- a/arch/i386/kernel/i8253.c +++ b/arch/i386/kernel/i8253.c @@ -2,7 +2,7 @@ * i8253.c 8253/PIT functions * */ -#include <linux/clocksource.h> +#include <linux/clockchips.h> #include <linux/spinlock.h> #include <linux/jiffies.h> #include <linux/sysdev.h> @@ -19,17 +19,97 @@ DEFINE_SPINLOCK(i8253_lock); EXPORT_SYMBOL(i8253_lock); -void setup_pit_timer(void) +/* + * HPET replaces the PIT, when enabled. So we need to know, which of + * the two timers is used + */ +struct clock_event_device *global_clock_event; + +/* + * Initialize the PIT timer. + * + * This is also called after resume to bring the PIT into operation again. + */ +static void init_pit_timer(enum clock_event_mode mode, + struct clock_event_device *evt) +{ + unsigned long flags; + + spin_lock_irqsave(&i8253_lock, flags); + + switch(mode) { + case CLOCK_EVT_MODE_PERIODIC: + /* binary, mode 2, LSB/MSB, ch 0 */ + outb_p(0x34, PIT_MODE); + udelay(10); + outb_p(LATCH & 0xff , PIT_CH0); /* LSB */ + udelay(10); + outb(LATCH >> 8 , PIT_CH0); /* MSB */ + break; + + case CLOCK_EVT_MODE_ONESHOT: + case CLOCK_EVT_MODE_SHUTDOWN: + case CLOCK_EVT_MODE_UNUSED: + /* One shot setup */ + outb_p(0x38, PIT_MODE); + udelay(10); + break; + } + spin_unlock_irqrestore(&i8253_lock, flags); +} + +/* + * Program the next event in oneshot mode + * + * Delta is given in PIT ticks + */ +static int pit_next_event(unsigned long delta, struct clock_event_device *evt) { unsigned long flags; spin_lock_irqsave(&i8253_lock, flags); - outb_p(0x34,PIT_MODE); /* binary, mode 2, LSB/MSB, ch 0 */ - udelay(10); - outb_p(LATCH & 0xff , PIT_CH0); /* LSB */ - udelay(10); - outb(LATCH >> 8 , PIT_CH0); /* MSB */ + outb_p(delta & 0xff , PIT_CH0); /* LSB */ + outb(delta >> 8 , PIT_CH0); /* MSB */ spin_unlock_irqrestore(&i8253_lock, flags); + + return 0; +} + +/* + * On UP the PIT can serve all of the possible timer functions. On SMP systems + * it can be solely used for the global tick. + * + * The profiling and update capabilites are switched off once the local apic is + * registered. This mechanism replaces the previous #ifdef LOCAL_APIC - + * !using_apic_timer decisions in do_timer_interrupt_hook() + */ +struct clock_event_device pit_clockevent = { + .name = "pit", + .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT, + .set_mode = init_pit_timer, + .set_next_event = pit_next_event, + .shift = 32, + .irq = 0, +}; + +/* + * Initialize the conversion factor and the min/max deltas of the clock event + * structure and register the clock event source with the framework. + */ +void __init setup_pit_timer(void) +{ + /* + * Start pit with the boot cpu mask and make it global after the + * IO_APIC has been initialized. + */ + pit_clockevent.cpumask = cpumask_of_cpu(0); + pit_clockevent.mult = div_sc(CLOCK_TICK_RATE, NSEC_PER_SEC, 32); + pit_clockevent.max_delta_ns = + clockevent_delta2ns(0x7FFF, &pit_clockevent); + pit_clockevent.min_delta_ns = + clockevent_delta2ns(0xF, &pit_clockevent); + clockevents_register_device(&pit_clockevent); + global_clock_event = &pit_clockevent; } /* @@ -46,7 +126,7 @@ static cycle_t pit_read(void) static u32 old_jifs; spin_lock_irqsave(&i8253_lock, flags); - /* + /* * Although our caller may have the read side of xtime_lock, * this is now a seqlock, and we are cheating in this routine * by having side effects on state that we cannot undo if diff --git a/arch/i386/kernel/i8259.c b/arch/i386/kernel/i8259.c index c8d45821c78..03abfdb1a6e 100644 --- a/arch/i386/kernel/i8259.c +++ b/arch/i386/kernel/i8259.c @@ -41,6 +41,7 @@ static void mask_and_ack_8259A(unsigned int); static struct irq_chip i8259A_chip = { .name = "XT-PIC", .mask = disable_8259A_irq, + .disable = disable_8259A_irq, .unmask = enable_8259A_irq, .mask_ack = mask_and_ack_8259A, }; @@ -410,12 +411,6 @@ void __init native_init_IRQ(void) intr_init_hook(); /* - * Set the clock to HZ Hz, we already have a valid - * vector now: - */ - setup_pit_timer(); - - /* * External FPU? Set up irq13 if so, for * original braindamaged IBM FERR coupling. */ diff --git a/arch/i386/kernel/io_apic.c b/arch/i386/kernel/io_apic.c index ba8d302a0b7..4ccebd454e2 100644 --- a/arch/i386/kernel/io_apic.c +++ b/arch/i386/kernel/io_apic.c @@ -482,8 +482,8 @@ static void do_irq_balance(void) package_index = CPU_TO_PACKAGEINDEX(i); for (j = 0; j < NR_IRQS; j++) { unsigned long value_now, delta; - /* Is this an active IRQ? */ - if (!irq_desc[j].action) + /* Is this an active IRQ or balancing disabled ? */ + if (!irq_desc[j].action || irq_balancing_disabled(j)) continue; if ( package_index == i ) IRQ_DELTA(package_index,j) = 0; @@ -1281,11 +1281,9 @@ static void ioapic_register_intr(int irq, int vector, unsigned long trigger) trigger == IOAPIC_LEVEL) set_irq_chip_and_handler_name(irq, &ioapic_chip, handle_fasteoi_irq, "fasteoi"); - else { - irq_desc[irq].status |= IRQ_DELAYED_DISABLE; + else set_irq_chip_and_handler_name(irq, &ioapic_chip, handle_edge_irq, "edge"); - } set_intr_gate(vector, interrupt[irq]); } @@ -1588,7 +1586,7 @@ void /*__init*/ print_local_APIC(void * dummy) v = apic_read(APIC_LVR); printk(KERN_INFO "... APIC VERSION: %08x\n", v); ver = GET_APIC_VERSION(v); - maxlvt = get_maxlvt(); + maxlvt = lapic_get_maxlvt(); v = apic_read(APIC_TASKPRI); printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK); @@ -1920,7 +1918,7 @@ static void __init setup_ioapic_ids_from_mpc(void) static void __init setup_ioapic_ids_from_mpc(void) { } #endif -static int no_timer_check __initdata; +int no_timer_check __initdata; static int __init notimercheck(char *s) { @@ -2310,7 +2308,7 @@ static inline void __init check_timer(void) disable_8259A_irq(0); set_irq_chip_and_handler_name(0, &lapic_chip, handle_fasteoi_irq, - "fasteio"); + "fasteoi"); apic_write_around(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */ enable_8259A_irq(0); diff --git a/arch/i386/kernel/irq.c b/arch/i386/kernel/irq.c index 3201d421090..0f2ca590bf2 100644 --- a/arch/i386/kernel/irq.c +++ b/arch/i386/kernel/irq.c @@ -10,7 +10,6 @@ * io_apic.c.) */ -#include <asm/uaccess.h> #include <linux/module.h> #include <linux/seq_file.h> #include <linux/interrupt.h> @@ -19,19 +18,36 @@ #include <linux/cpu.h> #include <linux/delay.h> +#include <asm/idle.h> + +#include <asm/apic.h> +#include <asm/uaccess.h> + DEFINE_PER_CPU(irq_cpustat_t, irq_stat) ____cacheline_internodealigned_in_smp; EXPORT_PER_CPU_SYMBOL(irq_stat); -#ifndef CONFIG_X86_LOCAL_APIC /* * 'what should we do if we get a hw irq event on an illegal vector'. * each architecture has to answer this themselves. */ void ack_bad_irq(unsigned int irq) { - printk("unexpected IRQ trap at vector %02x\n", irq); -} + printk(KERN_ERR "unexpected IRQ trap at vector %02x\n", irq); + +#ifdef CONFIG_X86_LOCAL_APIC + /* + * Currently unexpected vectors happen only on SMP and APIC. + * We _must_ ack these because every local APIC has only N + * irq slots per priority level, and a 'hanging, unacked' IRQ + * holds up an irq slot - in excessive cases (when multiple + * unexpected vectors occur) that might lock up the APIC + * completely. + * But only ack when the APIC is enabled -AK + */ + if (cpu_has_apic) + ack_APIC_irq(); #endif +} #ifdef CONFIG_4KSTACKS /* @@ -61,6 +77,7 @@ fastcall unsigned int do_IRQ(struct pt_regs *regs) union irq_ctx *curctx, *irqctx; u32 *isp; #endif + exit_idle(); if (unlikely((unsigned)irq >= NR_IRQS)) { printk(KERN_EMERG "%s: cannot handle IRQ %d\n", diff --git a/arch/i386/kernel/kprobes.c b/arch/i386/kernel/kprobes.c index af1d5334499..b545bc746fc 100644 --- a/arch/i386/kernel/kprobes.c +++ b/arch/i386/kernel/kprobes.c @@ -363,7 +363,7 @@ no_kprobe: " pushf\n" /* skip cs, eip, orig_eax */ " subl $12, %esp\n" - " pushl %gs\n" + " pushl %fs\n" " pushl %ds\n" " pushl %es\n" " pushl %eax\n" @@ -387,7 +387,7 @@ no_kprobe: " popl %edi\n" " popl %ebp\n" " popl %eax\n" - /* skip eip, orig_eax, es, ds, gs */ + /* skip eip, orig_eax, es, ds, fs */ " addl $20, %esp\n" " popf\n" " ret\n"); @@ -408,7 +408,7 @@ fastcall void *__kprobes trampoline_handler(struct pt_regs *regs) spin_lock_irqsave(&kretprobe_lock, flags); head = kretprobe_inst_table_head(current); /* fixup registers */ - regs->xcs = __KERNEL_CS; + regs->xcs = __KERNEL_CS | get_kernel_rpl(); regs->eip = trampoline_address; regs->orig_eax = 0xffffffff; diff --git a/arch/i386/kernel/microcode.c b/arch/i386/kernel/microcode.c index c8fa13721bc..b8f16633a6e 100644 --- a/arch/i386/kernel/microcode.c +++ b/arch/i386/kernel/microcode.c @@ -384,7 +384,7 @@ static int do_microcode_update (void) { long cursor = 0; int error = 0; - void *new_mc; + void *new_mc = NULL; int cpu; cpumask_t old; @@ -451,7 +451,7 @@ static ssize_t microcode_write (struct file *file, const char __user *buf, size_ return ret; } -static struct file_operations microcode_fops = { +static const struct file_operations microcode_fops = { .owner = THIS_MODULE, .write = microcode_write, .open = microcode_open, diff --git a/arch/i386/kernel/msr.c b/arch/i386/kernel/msr.c index 4a472a17d1c..bcaa6e9b619 100644 --- a/arch/i386/kernel/msr.c +++ b/arch/i386/kernel/msr.c @@ -68,7 +68,6 @@ static inline int rdmsr_eio(u32 reg, u32 *eax, u32 *edx) #ifdef CONFIG_SMP struct msr_command { - int cpu; int err; u32 reg; u32 data[2]; @@ -78,16 +77,14 @@ static void msr_smp_wrmsr(void *cmd_block) { struct msr_command *cmd = (struct msr_command *)cmd_block; - if (cmd->cpu == smp_processor_id()) - cmd->err = wrmsr_eio(cmd->reg, cmd->data[0], cmd->data[1]); + cmd->err = wrmsr_eio(cmd->reg, cmd->data[0], cmd->data[1]); } static void msr_smp_rdmsr(void *cmd_block) { struct msr_command *cmd = (struct msr_command *)cmd_block; - if (cmd->cpu == smp_processor_id()) - cmd->err = rdmsr_eio(cmd->reg, &cmd->data[0], &cmd->data[1]); + cmd->err = rdmsr_eio(cmd->reg, &cmd->data[0], &cmd->data[1]); } static inline int do_wrmsr(int cpu, u32 reg, u32 eax, u32 edx) @@ -99,12 +96,11 @@ static inline int do_wrmsr(int cpu, u32 reg, u32 eax, u32 edx) if (cpu == smp_processor_id()) { ret = wrmsr_eio(reg, eax, edx); } else { - cmd.cpu = cpu; cmd.reg = reg; cmd.data[0] = eax; cmd.data[1] = edx; - smp_call_function(msr_smp_wrmsr, &cmd, 1, 1); + smp_call_function_single(cpu, msr_smp_wrmsr, &cmd, 1, 1); ret = cmd.err; } preempt_enable(); @@ -120,10 +116,9 @@ static inline int do_rdmsr(int cpu, u32 reg, u32 * eax, u32 * edx) if (cpu == smp_processor_id()) { ret = rdmsr_eio(reg, eax, edx); } else { - cmd.cpu = cpu; cmd.reg = reg; - smp_call_function(msr_smp_rdmsr, &cmd, 1, 1); + smp_call_function_single(cpu, msr_smp_rdmsr, &cmd, 1, 1); *eax = cmd.data[0]; *edx = cmd.data[1]; @@ -230,7 +225,7 @@ static int msr_open(struct inode *inode, struct file *file) /* * File operations we support */ -static struct file_operations msr_fops = { +static const struct file_operations msr_fops = { .owner = THIS_MODULE, .llseek = msr_seek, .read = msr_read, diff --git a/arch/i386/kernel/nmi.c b/arch/i386/kernel/nmi.c index 1a6f8bb8881..821df34d2b3 100644 --- a/arch/i386/kernel/nmi.c +++ b/arch/i386/kernel/nmi.c @@ -23,6 +23,7 @@ #include <linux/dmi.h> #include <linux/kprobes.h> #include <linux/cpumask.h> +#include <linux/kernel_stat.h> #include <asm/smp.h> #include <asm/nmi.h> @@ -185,7 +186,8 @@ static __cpuinit inline int nmi_known_cpu(void) { switch (boot_cpu_data.x86_vendor) { case X86_VENDOR_AMD: - return ((boot_cpu_data.x86 == 15) || (boot_cpu_data.x86 == 6)); + return ((boot_cpu_data.x86 == 15) || (boot_cpu_data.x86 == 6) + || (boot_cpu_data.x86 == 16)); case X86_VENDOR_INTEL: if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) return 1; @@ -216,6 +218,28 @@ static __init void nmi_cpu_busy(void *data) } #endif +static unsigned int adjust_for_32bit_ctr(unsigned int hz) +{ + u64 counter_val; + unsigned int retval = hz; + + /* + * On Intel CPUs with P6/ARCH_PERFMON only 32 bits in the counter + * are writable, with higher bits sign extending from bit 31. + * So, we can only program the counter with 31 bit values and + * 32nd bit should be 1, for 33.. to be 1. + * Find the appropriate nmi_hz + */ + counter_val = (u64)cpu_khz * 1000; + do_div(counter_val, retval); + if (counter_val > 0x7fffffffULL) { + u64 count = (u64)cpu_khz * 1000; + do_div(count, 0x7fffffffUL); + retval = count + 1; + } + return retval; +} + static int __init check_nmi_watchdog(void) { unsigned int *prev_nmi_count; @@ -281,18 +305,10 @@ static int __init check_nmi_watchdog(void) struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); nmi_hz = 1; - /* - * On Intel CPUs with ARCH_PERFMON only 32 bits in the counter - * are writable, with higher bits sign extending from bit 31. - * So, we can only program the counter with 31 bit values and - * 32nd bit should be 1, for 33.. to be 1. - * Find the appropriate nmi_hz - */ - if (wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0 && - ((u64)cpu_khz * 1000) > 0x7fffffffULL) { - u64 count = (u64)cpu_khz * 1000; - do_div(count, 0x7fffffffUL); - nmi_hz = count + 1; + + if (wd->perfctr_msr == MSR_P6_PERFCTR0 || + wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0) { + nmi_hz = adjust_for_32bit_ctr(nmi_hz); } } @@ -369,6 +385,34 @@ void enable_timer_nmi_watchdog(void) } } +static void __acpi_nmi_disable(void *__unused) +{ + apic_write_around(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED); +} + +/* + * Disable timer based NMIs on all CPUs: + */ +void acpi_nmi_disable(void) +{ + if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC) + on_each_cpu(__acpi_nmi_disable, NULL, 0, 1); +} + +static void __acpi_nmi_enable(void *__unused) +{ + apic_write_around(APIC_LVT0, APIC_DM_NMI); +} + +/* + * Enable timer based NMIs on all CPUs: + */ +void acpi_nmi_enable(void) +{ + if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC) + on_each_cpu(__acpi_nmi_enable, NULL, 0, 1); +} + #ifdef CONFIG_PM static int nmi_pm_active; /* nmi_active before suspend */ @@ -442,6 +486,17 @@ static void write_watchdog_counter(unsigned int perfctr_msr, const char *descr) wrmsrl(perfctr_msr, 0 - count); } +static void write_watchdog_counter32(unsigned int perfctr_msr, + const char *descr) +{ + u64 count = (u64)cpu_khz * 1000; + + do_div(count, nmi_hz); + if(descr) + Dprintk("setting %s to -0x%08Lx\n", descr, count); + wrmsr(perfctr_msr, (u32)(-count), 0); +} + /* Note that these events don't tick when the CPU idles. This means the frequency varies with CPU load. */ @@ -531,7 +586,8 @@ static int setup_p6_watchdog(void) /* setup the timer */ wrmsr(evntsel_msr, evntsel, 0); - write_watchdog_counter(perfctr_msr, "P6_PERFCTR0"); + nmi_hz = adjust_for_32bit_ctr(nmi_hz); + write_watchdog_counter32(perfctr_msr, "P6_PERFCTR0"); apic_write(APIC_LVTPC, APIC_DM_NMI); evntsel |= P6_EVNTSEL0_ENABLE; wrmsr(evntsel_msr, evntsel, 0); @@ -704,7 +760,8 @@ static int setup_intel_arch_watchdog(void) /* setup the timer */ wrmsr(evntsel_msr, evntsel, 0); - write_watchdog_counter(perfctr_msr, "INTEL_ARCH_PERFCTR0"); + nmi_hz = adjust_for_32bit_ctr(nmi_hz); + write_watchdog_counter32(perfctr_msr, "INTEL_ARCH_PERFCTR0"); apic_write(APIC_LVTPC, APIC_DM_NMI); evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE; wrmsr(evntsel_msr, evntsel, 0); @@ -762,7 +819,8 @@ void setup_apic_nmi_watchdog (void *unused) if (nmi_watchdog == NMI_LOCAL_APIC) { switch (boot_cpu_data.x86_vendor) { case X86_VENDOR_AMD: - if (boot_cpu_data.x86 != 6 && boot_cpu_data.x86 != 15) + if (boot_cpu_data.x86 != 6 && boot_cpu_data.x86 != 15 && + boot_cpu_data.x86 != 16) return; if (!setup_k7_watchdog()) return; @@ -916,9 +974,13 @@ __kprobes int nmi_watchdog_tick(struct pt_regs * regs, unsigned reason) cpu_clear(cpu, backtrace_mask); } - sum = per_cpu(irq_stat, cpu).apic_timer_irqs; + /* + * Take the local apic timer and PIT/HPET into account. We don't + * know which one is active, when we have highres/dyntick on + */ + sum = per_cpu(irq_stat, cpu).apic_timer_irqs + kstat_irqs(0); - /* if the apic timer isn't firing, this cpu isn't doing much */ + /* if the none of the timers isn't firing, this cpu isn't doing much */ if (!touched && last_irq_sums[cpu] == sum) { /* * Ayiee, looks like this CPU is stuck ... @@ -956,6 +1018,8 @@ __kprobes int nmi_watchdog_tick(struct pt_regs * regs, unsigned reason) dummy &= ~P4_CCCR_OVF; wrmsrl(wd->cccr_msr, dummy); apic_write(APIC_LVTPC, APIC_DM_NMI); + /* start the cycle over again */ + write_watchdog_counter(wd->perfctr_msr, NULL); } else if (wd->perfctr_msr == MSR_P6_PERFCTR0 || wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0) { @@ -964,9 +1028,12 @@ __kprobes int nmi_watchdog_tick(struct pt_regs * regs, unsigned reason) * other P6 variant. * ArchPerfom/Core Duo also needs this */ apic_write(APIC_LVTPC, APIC_DM_NMI); + /* P6/ARCH_PERFMON has 32 bit counter write */ + write_watchdog_counter32(wd->perfctr_msr, NULL); + } else { + /* start the cycle over again */ + write_watchdog_counter(wd->perfctr_msr, NULL); } - /* start the cycle over again */ - write_watchdog_counter(wd->perfctr_msr, NULL); rc = 1; } else if (nmi_watchdog == NMI_IO_APIC) { /* don't know how to accurately check for this. diff --git a/arch/i386/kernel/paravirt.c b/arch/i386/kernel/paravirt.c index e55fd05da0f..c156ecfa387 100644 --- a/arch/i386/kernel/paravirt.c +++ b/arch/i386/kernel/paravirt.c @@ -92,7 +92,7 @@ static unsigned native_patch(u8 type, u16 clobbers, void *insns, unsigned len) return insn_len; } -static fastcall unsigned long native_get_debugreg(int regno) +static unsigned long native_get_debugreg(int regno) { unsigned long val = 0; /* Damn you, gcc! */ @@ -115,7 +115,7 @@ static fastcall unsigned long native_get_debugreg(int regno) return val; } -static fastcall void native_set_debugreg(int regno, unsigned long value) +static void native_set_debugreg(int regno, unsigned long value) { switch (regno) { case 0: @@ -146,55 +146,55 @@ void init_IRQ(void) paravirt_ops.init_IRQ(); } -static fastcall void native_clts(void) +static void native_clts(void) { asm volatile ("clts"); } -static fastcall unsigned long native_read_cr0(void) +static unsigned long native_read_cr0(void) { unsigned long val; asm volatile("movl %%cr0,%0\n\t" :"=r" (val)); return val; } -static fastcall void native_write_cr0(unsigned long val) +static void native_write_cr0(unsigned long val) { asm volatile("movl %0,%%cr0": :"r" (val)); } -static fastcall unsigned long native_read_cr2(void) +static unsigned long native_read_cr2(void) { unsigned long val; asm volatile("movl %%cr2,%0\n\t" :"=r" (val)); return val; } -static fastcall void native_write_cr2(unsigned long val) +static void native_write_cr2(unsigned long val) { asm volatile("movl %0,%%cr2": :"r" (val)); } -static fastcall unsigned long native_read_cr3(void) +static unsigned long native_read_cr3(void) { unsigned long val; asm volatile("movl %%cr3,%0\n\t" :"=r" (val)); return val; } -static fastcall void native_write_cr3(unsigned long val) +static void native_write_cr3(unsigned long val) { asm volatile("movl %0,%%cr3": :"r" (val)); } -static fastcall unsigned long native_read_cr4(void) +static unsigned long native_read_cr4(void) { unsigned long val; asm volatile("movl %%cr4,%0\n\t" :"=r" (val)); return val; } -static fastcall unsigned long native_read_cr4_safe(void) +static unsigned long native_read_cr4_safe(void) { unsigned long val; /* This could fault if %cr4 does not exist */ @@ -207,51 +207,51 @@ static fastcall unsigned long native_read_cr4_safe(void) return val; } -static fastcall void native_write_cr4(unsigned long val) +static void native_write_cr4(unsigned long val) { asm volatile("movl %0,%%cr4": :"r" (val)); } -static fastcall unsigned long native_save_fl(void) +static unsigned long native_save_fl(void) { unsigned long f; asm volatile("pushfl ; popl %0":"=g" (f): /* no input */); return f; } -static fastcall void native_restore_fl(unsigned long f) +static void native_restore_fl(unsigned long f) { asm volatile("pushl %0 ; popfl": /* no output */ :"g" (f) :"memory", "cc"); } -static fastcall void native_irq_disable(void) +static void native_irq_disable(void) { asm volatile("cli": : :"memory"); } -static fastcall void native_irq_enable(void) +static void native_irq_enable(void) { asm volatile("sti": : :"memory"); } -static fastcall void native_safe_halt(void) +static void native_safe_halt(void) { asm volatile("sti; hlt": : :"memory"); } -static fastcall void native_halt(void) +static void native_halt(void) { asm volatile("hlt": : :"memory"); } -static fastcall void native_wbinvd(void) +static void native_wbinvd(void) { asm volatile("wbinvd": : :"memory"); } -static fastcall unsigned long long native_read_msr(unsigned int msr, int *err) +static unsigned long long native_read_msr(unsigned int msr, int *err) { unsigned long long val; @@ -270,7 +270,7 @@ static fastcall unsigned long long native_read_msr(unsigned int msr, int *err) return val; } -static fastcall int native_write_msr(unsigned int msr, unsigned long long val) +static int native_write_msr(unsigned int msr, unsigned long long val) { int err; asm volatile("2: wrmsr ; xorl %0,%0\n" @@ -288,53 +288,53 @@ static fastcall int native_write_msr(unsigned int msr, unsigned long long val) return err; } -static fastcall unsigned long long native_read_tsc(void) +static unsigned long long native_read_tsc(void) { unsigned long long val; asm volatile("rdtsc" : "=A" (val)); return val; } -static fastcall unsigned long long native_read_pmc(void) +static unsigned long long native_read_pmc(void) { unsigned long long val; asm volatile("rdpmc" : "=A" (val)); return val; } -static fastcall void native_load_tr_desc(void) +static void native_load_tr_desc(void) { asm volatile("ltr %w0"::"q" (GDT_ENTRY_TSS*8)); } -static fastcall void native_load_gdt(const struct Xgt_desc_struct *dtr) +static void native_load_gdt(const struct Xgt_desc_struct *dtr) { asm volatile("lgdt %0"::"m" (*dtr)); } -static fastcall void native_load_idt(const struct Xgt_desc_struct *dtr) +static void native_load_idt(const struct Xgt_desc_struct *dtr) { asm volatile("lidt %0"::"m" (*dtr)); } -static fastcall void native_store_gdt(struct Xgt_desc_struct *dtr) +static void native_store_gdt(struct Xgt_desc_struct *dtr) { asm ("sgdt %0":"=m" (*dtr)); } -static fastcall void native_store_idt(struct Xgt_desc_struct *dtr) +static void native_store_idt(struct Xgt_desc_struct *dtr) { asm ("sidt %0":"=m" (*dtr)); } -static fastcall unsigned long native_store_tr(void) +static unsigned long native_store_tr(void) { unsigned long tr; asm ("str %0":"=r" (tr)); return tr; } -static fastcall void native_load_tls(struct thread_struct *t, unsigned int cpu) +static void native_load_tls(struct thread_struct *t, unsigned int cpu) { #define C(i) get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i] C(0); C(1); C(2); @@ -348,22 +348,22 @@ static inline void native_write_dt_entry(void *dt, int entry, u32 entry_low, u32 lp[1] = entry_high; } -static fastcall void native_write_ldt_entry(void *dt, int entrynum, u32 low, u32 high) +static void native_write_ldt_entry(void *dt, int entrynum, u32 low, u32 high) { native_write_dt_entry(dt, entrynum, low, high); } -static fastcall void native_write_gdt_entry(void *dt, int entrynum, u32 low, u32 high) +static void native_write_gdt_entry(void *dt, int entrynum, u32 low, u32 high) { native_write_dt_entry(dt, entrynum, low, high); } -static fastcall void native_write_idt_entry(void *dt, int entrynum, u32 low, u32 high) +static void native_write_idt_entry(void *dt, int entrynum, u32 low, u32 high) { native_write_dt_entry(dt, entrynum, low, high); } -static fastcall void native_load_esp0(struct tss_struct *tss, +static void native_load_esp0(struct tss_struct *tss, struct thread_struct *thread) { tss->esp0 = thread->esp0; @@ -375,12 +375,12 @@ static fastcall void native_load_esp0(struct tss_struct *tss, } } -static fastcall void native_io_delay(void) +static void native_io_delay(void) { asm volatile("outb %al,$0x80"); } -static fastcall void native_flush_tlb(void) +static void native_flush_tlb(void) { __native_flush_tlb(); } @@ -389,49 +389,49 @@ static fastcall void native_flush_tlb(void) * Global pages have to be flushed a bit differently. Not a real * performance problem because this does not happen often. */ -static fastcall void native_flush_tlb_global(void) +static void native_flush_tlb_global(void) { __native_flush_tlb_global(); } -static fastcall void native_flush_tlb_single(u32 addr) +static void native_flush_tlb_single(u32 addr) { __native_flush_tlb_single(addr); } #ifndef CONFIG_X86_PAE -static fastcall void native_set_pte(pte_t *ptep, pte_t pteval) +static void native_set_pte(pte_t *ptep, pte_t pteval) { *ptep = pteval; } -static fastcall void native_set_pte_at(struct mm_struct *mm, u32 addr, pte_t *ptep, pte_t pteval) +static void native_set_pte_at(struct mm_struct *mm, u32 addr, pte_t *ptep, pte_t pteval) { *ptep = pteval; } -static fastcall void native_set_pmd(pmd_t *pmdp, pmd_t pmdval) +static void native_set_pmd(pmd_t *pmdp, pmd_t pmdval) { *pmdp = pmdval; } #else /* CONFIG_X86_PAE */ -static fastcall void native_set_pte(pte_t *ptep, pte_t pte) +static void native_set_pte(pte_t *ptep, pte_t pte) { ptep->pte_high = pte.pte_high; smp_wmb(); ptep->pte_low = pte.pte_low; } -static fastcall void native_set_pte_at(struct mm_struct *mm, u32 addr, pte_t *ptep, pte_t pte) +static void native_set_pte_at(struct mm_struct *mm, u32 addr, pte_t *ptep, pte_t pte) { ptep->pte_high = pte.pte_high; smp_wmb(); ptep->pte_low = pte.pte_low; } -static fastcall void native_set_pte_present(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) +static void native_set_pte_present(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) { ptep->pte_low = 0; smp_wmb(); @@ -440,29 +440,29 @@ static fastcall void native_set_pte_present(struct mm_struct *mm, unsigned long ptep->pte_low = pte.pte_low; } -static fastcall void native_set_pte_atomic(pte_t *ptep, pte_t pteval) +static void native_set_pte_atomic(pte_t *ptep, pte_t pteval) { set_64bit((unsigned long long *)ptep,pte_val(pteval)); } -static fastcall void native_set_pmd(pmd_t *pmdp, pmd_t pmdval) +static void native_set_pmd(pmd_t *pmdp, pmd_t pmdval) { set_64bit((unsigned long long *)pmdp,pmd_val(pmdval)); } -static fastcall void native_set_pud(pud_t *pudp, pud_t pudval) +static void native_set_pud(pud_t *pudp, pud_t pudval) { *pudp = pudval; } -static fastcall void native_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) +static void native_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { ptep->pte_low = 0; smp_wmb(); ptep->pte_high = 0; } -static fastcall void native_pmd_clear(pmd_t *pmd) +static void native_pmd_clear(pmd_t *pmd) { u32 *tmp = (u32 *)pmd; *tmp = 0; @@ -472,8 +472,8 @@ static fastcall void native_pmd_clear(pmd_t *pmd) #endif /* CONFIG_X86_PAE */ /* These are in entry.S */ -extern fastcall void native_iret(void); -extern fastcall void native_irq_enable_sysexit(void); +extern void native_iret(void); +extern void native_irq_enable_sysexit(void); static int __init print_banner(void) { @@ -482,9 +482,6 @@ static int __init print_banner(void) } core_initcall(print_banner); -/* We simply declare start_kernel to be the paravirt probe of last resort. */ -paravirt_probe(start_kernel); - struct paravirt_ops paravirt_ops = { .name = "bare hardware", .paravirt_enabled = 0, @@ -544,12 +541,21 @@ struct paravirt_ops paravirt_ops = { .apic_write = native_apic_write, .apic_write_atomic = native_apic_write_atomic, .apic_read = native_apic_read, + .setup_boot_clock = setup_boot_APIC_clock, + .setup_secondary_clock = setup_secondary_APIC_clock, #endif + .set_lazy_mode = (void *)native_nop, .flush_tlb_user = native_flush_tlb, .flush_tlb_kernel = native_flush_tlb_global, .flush_tlb_single = native_flush_tlb_single, + .alloc_pt = (void *)native_nop, + .alloc_pd = (void *)native_nop, + .alloc_pd_clone = (void *)native_nop, + .release_pt = (void *)native_nop, + .release_pd = (void *)native_nop, + .set_pte = native_set_pte, .set_pte_at = native_set_pte_at, .set_pmd = native_set_pmd, @@ -565,6 +571,8 @@ struct paravirt_ops paravirt_ops = { .irq_enable_sysexit = native_irq_enable_sysexit, .iret = native_iret, + + .startup_ipi_hook = (void *)native_nop, }; /* diff --git a/arch/i386/kernel/pcspeaker.c b/arch/i386/kernel/pcspeaker.c new file mode 100644 index 00000000000..bc1f2d3ea27 --- /dev/null +++ b/arch/i386/kernel/pcspeaker.c @@ -0,0 +1,20 @@ +#include <linux/platform_device.h> +#include <linux/errno.h> +#include <linux/init.h> + +static __init int add_pcspkr(void) +{ + struct platform_device *pd; + int ret; + + pd = platform_device_alloc("pcspkr", -1); + if (!pd) + return -ENOMEM; + + ret = platform_device_add(pd); + if (ret) + platform_device_put(pd); + + return ret; +} +device_initcall(add_pcspkr); diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c index c641056233a..bea304d48cd 100644 --- a/arch/i386/kernel/process.c +++ b/arch/i386/kernel/process.c @@ -38,6 +38,7 @@ #include <linux/ptrace.h> #include <linux/random.h> #include <linux/personality.h> +#include <linux/tick.h> #include <asm/uaccess.h> #include <asm/pgtable.h> @@ -48,6 +49,7 @@ #include <asm/i387.h> #include <asm/desc.h> #include <asm/vm86.h> +#include <asm/idle.h> #ifdef CONFIG_MATH_EMULATION #include <asm/math_emu.h> #endif @@ -80,6 +82,42 @@ void (*pm_idle)(void); EXPORT_SYMBOL(pm_idle); static DEFINE_PER_CPU(unsigned int, cpu_idle_state); +static ATOMIC_NOTIFIER_HEAD(idle_notifier); + +void idle_notifier_register(struct notifier_block *n) +{ + atomic_notifier_chain_register(&idle_notifier, n); +} + +void idle_notifier_unregister(struct notifier_block *n) +{ + atomic_notifier_chain_unregister(&idle_notifier, n); +} + +static DEFINE_PER_CPU(volatile unsigned long, idle_state); + +void enter_idle(void) +{ + /* needs to be atomic w.r.t. interrupts, not against other CPUs */ + __set_bit(0, &__get_cpu_var(idle_state)); + atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL); +} + +static void __exit_idle(void) +{ + /* needs to be atomic w.r.t. interrupts, not against other CPUs */ + if (__test_and_clear_bit(0, &__get_cpu_var(idle_state)) == 0) + return; + atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL); +} + +void exit_idle(void) +{ + if (current->pid) + return; + __exit_idle(); +} + void disable_hlt(void) { hlt_counter++; @@ -130,6 +168,7 @@ EXPORT_SYMBOL(default_idle); */ static void poll_idle (void) { + local_irq_enable(); cpu_relax(); } @@ -173,6 +212,7 @@ void cpu_idle(void) /* endless idle loop with no priority at all */ while (1) { + tick_nohz_stop_sched_tick(); while (!need_resched()) { void (*idle)(void); @@ -189,8 +229,18 @@ void cpu_idle(void) play_dead(); __get_cpu_var(irq_stat).idle_timestamp = jiffies; + + /* + * Idle routines should keep interrupts disabled + * from here on, until they go to idle. + * Otherwise, idle callbacks can misfire. + */ + local_irq_disable(); + enter_idle(); idle(); + __exit_idle(); } + tick_nohz_restart_sched_tick(); preempt_enable_no_resched(); schedule(); preempt_disable(); @@ -243,7 +293,11 @@ void mwait_idle_with_hints(unsigned long eax, unsigned long ecx) __monitor((void *)¤t_thread_info()->flags, 0, 0); smp_mb(); if (!need_resched()) - __mwait(eax, ecx); + __sti_mwait(eax, ecx); + else + local_irq_enable(); + } else { + local_irq_enable(); } } @@ -308,8 +362,8 @@ void show_regs(struct pt_regs * regs) regs->eax,regs->ebx,regs->ecx,regs->edx); printk("ESI: %08lx EDI: %08lx EBP: %08lx", regs->esi, regs->edi, regs->ebp); - printk(" DS: %04x ES: %04x GS: %04x\n", - 0xffff & regs->xds,0xffff & regs->xes, 0xffff & regs->xgs); + printk(" DS: %04x ES: %04x FS: %04x\n", + 0xffff & regs->xds,0xffff & regs->xes, 0xffff & regs->xfs); cr0 = read_cr0(); cr2 = read_cr2(); @@ -340,7 +394,7 @@ int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags) regs.xds = __USER_DS; regs.xes = __USER_DS; - regs.xgs = __KERNEL_PDA; + regs.xfs = __KERNEL_PDA; regs.orig_eax = -1; regs.eip = (unsigned long) kernel_thread_helper; regs.xcs = __KERNEL_CS | get_kernel_rpl(); @@ -425,7 +479,7 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long esp, p->thread.eip = (unsigned long) ret_from_fork; - savesegment(fs,p->thread.fs); + savesegment(gs,p->thread.gs); tsk = current; if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) { @@ -501,8 +555,8 @@ void dump_thread(struct pt_regs * regs, struct user * dump) dump->regs.eax = regs->eax; dump->regs.ds = regs->xds; dump->regs.es = regs->xes; - savesegment(fs,dump->regs.fs); - dump->regs.gs = regs->xgs; + dump->regs.fs = regs->xfs; + savesegment(gs,dump->regs.gs); dump->regs.orig_eax = regs->orig_eax; dump->regs.eip = regs->eip; dump->regs.cs = regs->xcs; @@ -653,7 +707,7 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas load_esp0(tss, next); /* - * Save away %fs. No need to save %gs, as it was saved on the + * Save away %gs. No need to save %fs, as it was saved on the * stack on entry. No need to save %es and %ds, as those are * always kernel segments while inside the kernel. Doing this * before setting the new TLS descriptors avoids the situation @@ -662,7 +716,7 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas * used %fs or %gs (it does not today), or if the kernel is * running inside of a hypervisor layer. */ - savesegment(fs, prev->fs); + savesegment(gs, prev->gs); /* * Load the per-thread Thread-Local Storage descriptor. @@ -670,14 +724,13 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas load_TLS(next, cpu); /* - * Restore %fs if needed. - * - * Glibc normally makes %fs be zero. + * Restore IOPL if needed. In normal use, the flags restore + * in the switch assembly will handle this. But if the kernel + * is running virtualized at a non-zero CPL, the popf will + * not restore flags, so it must be done in a separate step. */ - if (unlikely(prev->fs | next->fs)) - loadsegment(fs, next->fs); - - write_pda(pcurrent, next_p); + if (get_kernel_rpl() && unlikely(prev->iopl != next->iopl)) + set_iopl_mask(next->iopl); /* * Now maybe handle debug registers and/or IO bitmaps @@ -688,6 +741,15 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas disable_tsc(prev_p, next_p); + /* + * Leave lazy mode, flushing any hypercalls made here. + * This must be done before restoring TLS segments so + * the GDT and LDT are properly updated, and must be + * done before math_state_restore, so the TS bit is up + * to date. + */ + arch_leave_lazy_cpu_mode(); + /* If the task has used fpu the last 5 timeslices, just do a full * restore of the math state immediately to avoid the trap; the * chances of needing FPU soon are obviously high now @@ -695,6 +757,14 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas if (next_p->fpu_counter > 5) math_state_restore(); + /* + * Restore %gs if needed (which is common) + */ + if (prev->gs | next->gs) + loadsegment(gs, next->gs); + + write_pda(pcurrent, next_p); + return prev_p; } diff --git a/arch/i386/kernel/ptrace.c b/arch/i386/kernel/ptrace.c index af8aabe8580..4a8f8a25972 100644 --- a/arch/i386/kernel/ptrace.c +++ b/arch/i386/kernel/ptrace.c @@ -89,14 +89,14 @@ static int putreg(struct task_struct *child, unsigned long regno, unsigned long value) { switch (regno >> 2) { - case FS: + case GS: if (value && (value & 3) != 3) return -EIO; - child->thread.fs = value; + child->thread.gs = value; return 0; case DS: case ES: - case GS: + case FS: if (value && (value & 3) != 3) return -EIO; value &= 0xffff; @@ -112,7 +112,7 @@ static int putreg(struct task_struct *child, value |= get_stack_long(child, EFL_OFFSET) & ~FLAG_MASK; break; } - if (regno > ES*4) + if (regno > FS*4) regno -= 1*4; put_stack_long(child, regno, value); return 0; @@ -124,18 +124,18 @@ static unsigned long getreg(struct task_struct *child, unsigned long retval = ~0UL; switch (regno >> 2) { - case FS: - retval = child->thread.fs; + case GS: + retval = child->thread.gs; break; case DS: case ES: - case GS: + case FS: case SS: case CS: retval = 0xffff; /* fall through */ default: - if (regno > ES*4) + if (regno > FS*4) regno -= 1*4; retval &= get_stack_long(child, regno); } diff --git a/arch/i386/kernel/setup.c b/arch/i386/kernel/setup.c index 4b31ad70c1a..122623dcc6e 100644 --- a/arch/i386/kernel/setup.c +++ b/arch/i386/kernel/setup.c @@ -33,7 +33,6 @@ #include <linux/initrd.h> #include <linux/bootmem.h> #include <linux/seq_file.h> -#include <linux/platform_device.h> #include <linux/console.h> #include <linux/mca.h> #include <linux/root_dev.h> @@ -60,6 +59,7 @@ #include <asm/io_apic.h> #include <asm/ist.h> #include <asm/io.h> +#include <asm/vmi.h> #include <setup_arch.h> #include <bios_ebda.h> @@ -132,7 +132,7 @@ unsigned long saved_videomode; #define RAMDISK_PROMPT_FLAG 0x8000 #define RAMDISK_LOAD_FLAG 0x4000 -static char command_line[COMMAND_LINE_SIZE]; +static char __initdata command_line[COMMAND_LINE_SIZE]; unsigned char __initdata boot_params[PARAM_SIZE]; @@ -576,11 +576,19 @@ void __init setup_arch(char **cmdline_p) print_memory_map("user"); } - strlcpy(command_line, saved_command_line, COMMAND_LINE_SIZE); + strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE); *cmdline_p = command_line; max_low_pfn = setup_memory(); +#ifdef CONFIG_VMI + /* + * Must be after max_low_pfn is determined, and before kernel + * pagetables are setup. + */ + vmi_init(); +#endif + /* * NOTE: before this point _nobody_ is allowed to allocate * any memory using the bootmem allocator. Although the @@ -651,28 +659,3 @@ void __init setup_arch(char **cmdline_p) #endif tsc_init(); } - -static __init int add_pcspkr(void) -{ - struct platform_device *pd; - int ret; - - pd = platform_device_alloc("pcspkr", -1); - if (!pd) - return -ENOMEM; - - ret = platform_device_add(pd); - if (ret) - platform_device_put(pd); - - return ret; -} -device_initcall(add_pcspkr); - -/* - * Local Variables: - * mode:c - * c-file-style:"k&r" - * c-basic-offset:8 - * End: - */ diff --git a/arch/i386/kernel/signal.c b/arch/i386/kernel/signal.c index 65d7620eaa0..4f99e870c98 100644 --- a/arch/i386/kernel/signal.c +++ b/arch/i386/kernel/signal.c @@ -21,6 +21,7 @@ #include <linux/suspend.h> #include <linux/ptrace.h> #include <linux/elf.h> +#include <linux/binfmts.h> #include <asm/processor.h> #include <asm/ucontext.h> #include <asm/uaccess.h> @@ -128,8 +129,8 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, int *peax X86_EFLAGS_TF | X86_EFLAGS_SF | X86_EFLAGS_ZF | \ X86_EFLAGS_AF | X86_EFLAGS_PF | X86_EFLAGS_CF) - COPY_SEG(gs); - GET_SEG(fs); + GET_SEG(gs); + COPY_SEG(fs); COPY_SEG(es); COPY_SEG(ds); COPY(edi); @@ -244,9 +245,9 @@ setup_sigcontext(struct sigcontext __user *sc, struct _fpstate __user *fpstate, { int tmp, err = 0; - err |= __put_user(regs->xgs, (unsigned int __user *)&sc->gs); - savesegment(fs, tmp); - err |= __put_user(tmp, (unsigned int __user *)&sc->fs); + err |= __put_user(regs->xfs, (unsigned int __user *)&sc->fs); + savesegment(gs, tmp); + err |= __put_user(tmp, (unsigned int __user *)&sc->gs); err |= __put_user(regs->xes, (unsigned int __user *)&sc->es); err |= __put_user(regs->xds, (unsigned int __user *)&sc->ds); @@ -349,7 +350,10 @@ static int setup_frame(int sig, struct k_sigaction *ka, goto give_sigsegv; } - restorer = (void *)VDSO_SYM(&__kernel_sigreturn); + if (current->binfmt->hasvdso) + restorer = (void *)VDSO_SYM(&__kernel_sigreturn); + else + restorer = (void *)&frame->retcode; if (ka->sa.sa_flags & SA_RESTORER) restorer = ka->sa.sa_restorer; diff --git a/arch/i386/kernel/smp.c b/arch/i386/kernel/smp.c index 5285aff8367..9bd9637ae69 100644 --- a/arch/i386/kernel/smp.c +++ b/arch/i386/kernel/smp.c @@ -23,6 +23,7 @@ #include <asm/mtrr.h> #include <asm/tlbflush.h> +#include <asm/idle.h> #include <mach_apic.h> /* @@ -374,8 +375,7 @@ static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm, /* * i'm not happy about this global shared spinlock in the * MM hot path, but we'll see how contended it is. - * Temporarily this turns IRQs off, so that lockups are - * detected by the NMI watchdog. + * AK: x86-64 has a faster method that could be ported. */ spin_lock(&tlbstate_lock); @@ -400,7 +400,7 @@ static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm, while (!cpus_empty(flush_cpumask)) /* nothing. lockup detection does not belong here */ - mb(); + cpu_relax(); flush_mm = NULL; flush_va = 0; @@ -624,6 +624,7 @@ fastcall void smp_call_function_interrupt(struct pt_regs *regs) /* * At this point the info structure may be out of scope unless wait==1 */ + exit_idle(); irq_enter(); (*func)(info); irq_exit(); diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c index 8c6c8c52b95..48bfcaa13ec 100644 --- a/arch/i386/kernel/smpboot.c +++ b/arch/i386/kernel/smpboot.c @@ -63,6 +63,7 @@ #include <mach_apic.h> #include <mach_wakecpu.h> #include <smpboot_hooks.h> +#include <asm/vmi.h> /* Set if we find a B stepping CPU */ static int __devinitdata smp_b_stepping; @@ -93,12 +94,6 @@ cpumask_t cpu_possible_map; EXPORT_SYMBOL(cpu_possible_map); static cpumask_t smp_commenced_mask; -/* TSC's upper 32 bits can't be written in eariler CPU (before prescott), there - * is no way to resync one AP against BP. TBD: for prescott and above, we - * should use IA64's algorithm - */ -static int __devinitdata tsc_sync_disabled; - /* Per CPU bogomips and other parameters */ struct cpuinfo_x86 cpu_data[NR_CPUS] __cacheline_aligned; EXPORT_SYMBOL(cpu_data); @@ -215,151 +210,6 @@ valid_k7: ; } -/* - * TSC synchronization. - * - * We first check whether all CPUs have their TSC's synchronized, - * then we print a warning if not, and always resync. - */ - -static struct { - atomic_t start_flag; - atomic_t count_start; - atomic_t count_stop; - unsigned long long values[NR_CPUS]; -} tsc __cpuinitdata = { - .start_flag = ATOMIC_INIT(0), - .count_start = ATOMIC_INIT(0), - .count_stop = ATOMIC_INIT(0), -}; - -#define NR_LOOPS 5 - -static void __init synchronize_tsc_bp(void) -{ - int i; - unsigned long long t0; - unsigned long long sum, avg; - long long delta; - unsigned int one_usec; - int buggy = 0; - - printk(KERN_INFO "checking TSC synchronization across %u CPUs: ", num_booting_cpus()); - - /* convert from kcyc/sec to cyc/usec */ - one_usec = cpu_khz / 1000; - - atomic_set(&tsc.start_flag, 1); - wmb(); - - /* - * We loop a few times to get a primed instruction cache, - * then the last pass is more or less synchronized and - * the BP and APs set their cycle counters to zero all at - * once. This reduces the chance of having random offsets - * between the processors, and guarantees that the maximum - * delay between the cycle counters is never bigger than - * the latency of information-passing (cachelines) between - * two CPUs. - */ - for (i = 0; i < NR_LOOPS; i++) { - /* - * all APs synchronize but they loop on '== num_cpus' - */ - while (atomic_read(&tsc.count_start) != num_booting_cpus()-1) - cpu_relax(); - atomic_set(&tsc.count_stop, 0); - wmb(); - /* - * this lets the APs save their current TSC: - */ - atomic_inc(&tsc.count_start); - - rdtscll(tsc.values[smp_processor_id()]); - /* - * We clear the TSC in the last loop: - */ - if (i == NR_LOOPS-1) - write_tsc(0, 0); - - /* - * Wait for all APs to leave the synchronization point: - */ - while (atomic_read(&tsc.count_stop) != num_booting_cpus()-1) - cpu_relax(); - atomic_set(&tsc.count_start, 0); - wmb(); - atomic_inc(&tsc.count_stop); - } - - sum = 0; - for (i = 0; i < NR_CPUS; i++) { - if (cpu_isset(i, cpu_callout_map)) { - t0 = tsc.values[i]; - sum += t0; - } - } - avg = sum; - do_div(avg, num_booting_cpus()); - - for (i = 0; i < NR_CPUS; i++) { - if (!cpu_isset(i, cpu_callout_map)) - continue; - delta = tsc.values[i] - avg; - if (delta < 0) - delta = -delta; - /* - * We report bigger than 2 microseconds clock differences. - */ - if (delta > 2*one_usec) { - long long realdelta; - - if (!buggy) { - buggy = 1; - printk("\n"); - } - realdelta = delta; - do_div(realdelta, one_usec); - if (tsc.values[i] < avg) - realdelta = -realdelta; - - if (realdelta) - printk(KERN_INFO "CPU#%d had %Ld usecs TSC " - "skew, fixed it up.\n", i, realdelta); - } - } - if (!buggy) - printk("passed.\n"); -} - -static void __cpuinit synchronize_tsc_ap(void) -{ - int i; - - /* - * Not every cpu is online at the time - * this gets called, so we first wait for the BP to - * finish SMP initialization: - */ - while (!atomic_read(&tsc.start_flag)) - cpu_relax(); - - for (i = 0; i < NR_LOOPS; i++) { - atomic_inc(&tsc.count_start); - while (atomic_read(&tsc.count_start) != num_booting_cpus()) - cpu_relax(); - - rdtscll(tsc.values[smp_processor_id()]); - if (i == NR_LOOPS-1) - write_tsc(0, 0); - - atomic_inc(&tsc.count_stop); - while (atomic_read(&tsc.count_stop) != num_booting_cpus()) - cpu_relax(); - } -} -#undef NR_LOOPS - extern void calibrate_delay(void); static atomic_t init_deasserted; @@ -437,20 +287,12 @@ static void __cpuinit smp_callin(void) /* * Save our processor parameters */ - smp_store_cpu_info(cpuid); - - disable_APIC_timer(); + smp_store_cpu_info(cpuid); /* * Allow the master to continue. */ cpu_set(cpuid, cpu_callin_map); - - /* - * Synchronize the TSC with the BP - */ - if (cpu_has_tsc && cpu_khz && !tsc_sync_disabled) - synchronize_tsc_ap(); } static int cpucount; @@ -545,18 +387,25 @@ static void __cpuinit start_secondary(void *unused) * booting is too fragile that we want to limit the * things done here to the most necessary things. */ +#ifdef CONFIG_VMI + vmi_bringup(); +#endif secondary_cpu_init(); preempt_disable(); smp_callin(); while (!cpu_isset(smp_processor_id(), smp_commenced_mask)) rep_nop(); - setup_secondary_APIC_clock(); + /* + * Check TSC synchronization with the BP: + */ + check_tsc_sync_target(); + + setup_secondary_clock(); if (nmi_watchdog == NMI_IO_APIC) { disable_8259A_irq(0); enable_NMI_through_LVT0(NULL); enable_8259A_irq(0); } - enable_APIC_timer(); /* * low-memory mappings have been cleared, flush them from * the local TLBs too. @@ -619,7 +468,6 @@ extern struct { unsigned short ss; } stack_start; extern struct i386_pda *start_pda; -extern struct Xgt_desc_struct cpu_gdt_descr; #ifdef CONFIG_NUMA @@ -749,7 +597,7 @@ wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip) /* * Due to the Pentium erratum 3AP. */ - maxlvt = get_maxlvt(); + maxlvt = lapic_get_maxlvt(); if (maxlvt > 3) { apic_read_around(APIC_SPIV); apic_write(APIC_ESR, 0); @@ -835,11 +683,18 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip) num_starts = 0; /* + * Paravirt / VMI wants a startup IPI hook here to set up the + * target processor state. + */ + startup_ipi_hook(phys_apicid, (unsigned long) start_secondary, + (unsigned long) stack_start.esp); + + /* * Run STARTUP IPI loop. */ Dprintk("#startup loops: %d.\n", num_starts); - maxlvt = get_maxlvt(); + maxlvt = lapic_get_maxlvt(); for (j = 1; j <= num_starts; j++) { Dprintk("Sending STARTUP #%d.\n",j); @@ -1115,8 +970,6 @@ static int __cpuinit __smp_prepare_cpu(int cpu) info.cpu = cpu; INIT_WORK(&info.task, do_warm_boot_cpu); - tsc_sync_disabled = 1; - /* init low mem mapping */ clone_pgd_range(swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS, min_t(unsigned long, KERNEL_PGD_PTRS, USER_PGD_PTRS)); @@ -1124,7 +977,6 @@ static int __cpuinit __smp_prepare_cpu(int cpu) schedule_work(&info.task); wait_for_completion(&done); - tsc_sync_disabled = 0; zap_low_mappings(); ret = 0; exit: @@ -1320,13 +1172,7 @@ static void __init smp_boot_cpus(unsigned int max_cpus) smpboot_setup_io_apic(); - setup_boot_APIC_clock(); - - /* - * Synchronize the TSC with the AP - */ - if (cpu_has_tsc && cpucount && cpu_khz) - synchronize_tsc_bp(); + setup_boot_clock(); } /* These are wrappers to interface to the new boot process. Someone @@ -1461,9 +1307,16 @@ int __cpuinit __cpu_up(unsigned int cpu) } local_irq_enable(); + per_cpu(cpu_state, cpu) = CPU_UP_PREPARE; /* Unleash the CPU! */ cpu_set(cpu, smp_commenced_mask); + + /* + * Check TSC synchronization with the AP: + */ + check_tsc_sync_source(cpu); + while (!cpu_isset(cpu, cpu_online_map)) cpu_relax(); diff --git a/arch/i386/kernel/sysenter.c b/arch/i386/kernel/sysenter.c index bc882a2b1db..13ca54a85a1 100644 --- a/arch/i386/kernel/sysenter.c +++ b/arch/i386/kernel/sysenter.c @@ -78,7 +78,7 @@ int __init sysenter_setup(void) syscall_pages[0] = virt_to_page(syscall_page); #ifdef CONFIG_COMPAT_VDSO - __set_fixmap(FIX_VDSO, __pa(syscall_page), PAGE_READONLY); + __set_fixmap(FIX_VDSO, __pa(syscall_page), PAGE_READONLY_EXEC); printk("Compat vDSO mapped to %08lx.\n", __fix_to_virt(FIX_VDSO)); #endif diff --git a/arch/i386/kernel/time.c b/arch/i386/kernel/time.c index c505b16c099..a5350059557 100644 --- a/arch/i386/kernel/time.c +++ b/arch/i386/kernel/time.c @@ -131,15 +131,13 @@ unsigned long profile_pc(struct pt_regs *regs) unsigned long pc = instruction_pointer(regs); #ifdef CONFIG_SMP - if (!user_mode_vm(regs) && in_lock_functions(pc)) { + if (!v8086_mode(regs) && SEGMENT_IS_KERNEL_CODE(regs->xcs) && + in_lock_functions(pc)) { #ifdef CONFIG_FRAME_POINTER return *(unsigned long *)(regs->ebp + 4); #else - unsigned long *sp; - if ((regs->xcs & 3) == 0) - sp = (unsigned long *)®s->esp; - else - sp = (unsigned long *)regs->esp; + unsigned long *sp = (unsigned long *)®s->esp; + /* Return address is either directly at stack pointer or above a saved eflags. Eflags has bits 22-31 zero, kernel addresses don't. */ @@ -161,15 +159,6 @@ EXPORT_SYMBOL(profile_pc); */ irqreturn_t timer_interrupt(int irq, void *dev_id) { - /* - * Here we are in the timer irq handler. We just have irqs locally - * disabled but we don't know if the timer_bh is running on the other - * CPU. We need to avoid to SMP race with it. NOTE: we don' t need - * the irq version of write_lock because as just said we have irq - * locally disabled. -arca - */ - write_seqlock(&xtime_lock); - #ifdef CONFIG_X86_IO_APIC if (timer_ack) { /* @@ -188,7 +177,6 @@ irqreturn_t timer_interrupt(int irq, void *dev_id) do_timer_interrupt_hook(); - if (MCA_bus) { /* The PS/2 uses level-triggered interrupts. You can't turn them off, nor would you want to (any attempt to @@ -203,18 +191,11 @@ irqreturn_t timer_interrupt(int irq, void *dev_id) outb_p( irq_v|0x80, 0x61 ); /* reset the IRQ */ } - write_sequnlock(&xtime_lock); - -#ifdef CONFIG_X86_LOCAL_APIC - if (using_apic_timer) - smp_send_timer_broadcast_ipi(); -#endif - return IRQ_HANDLED; } /* not static: needed by APM */ -unsigned long get_cmos_time(void) +unsigned long read_persistent_clock(void) { unsigned long retval; unsigned long flags; @@ -227,11 +208,11 @@ unsigned long get_cmos_time(void) return retval; } -EXPORT_SYMBOL(get_cmos_time); static void sync_cmos_clock(unsigned long dummy); static DEFINE_TIMER(sync_cmos_timer, sync_cmos_clock, 0, 0); +int no_sync_cmos_clock; static void sync_cmos_clock(unsigned long dummy) { @@ -275,117 +256,20 @@ static void sync_cmos_clock(unsigned long dummy) void notify_arch_cmos_timer(void) { - mod_timer(&sync_cmos_timer, jiffies + 1); -} - -static long clock_cmos_diff; -static unsigned long sleep_start; - -static int timer_suspend(struct sys_device *dev, pm_message_t state) -{ - /* - * Estimate time zone so that set_time can update the clock - */ - unsigned long ctime = get_cmos_time(); - - clock_cmos_diff = -ctime; - clock_cmos_diff += get_seconds(); - sleep_start = ctime; - return 0; -} - -static int timer_resume(struct sys_device *dev) -{ - unsigned long flags; - unsigned long sec; - unsigned long ctime = get_cmos_time(); - long sleep_length = (ctime - sleep_start) * HZ; - struct timespec ts; - - if (sleep_length < 0) { - printk(KERN_WARNING "CMOS clock skew detected in timer resume!\n"); - /* The time after the resume must not be earlier than the time - * before the suspend or some nasty things will happen - */ - sleep_length = 0; - ctime = sleep_start; - } -#ifdef CONFIG_HPET_TIMER - if (is_hpet_enabled()) - hpet_reenable(); -#endif - setup_pit_timer(); - - sec = ctime + clock_cmos_diff; - ts.tv_sec = sec; - ts.tv_nsec = 0; - do_settimeofday(&ts); - write_seqlock_irqsave(&xtime_lock, flags); - jiffies_64 += sleep_length; - write_sequnlock_irqrestore(&xtime_lock, flags); - touch_softlockup_watchdog(); - return 0; -} - -static struct sysdev_class timer_sysclass = { - .resume = timer_resume, - .suspend = timer_suspend, - set_kset_name("timer"), -}; - - -/* XXX this driverfs stuff should probably go elsewhere later -john */ -static struct sys_device device_timer = { - .id = 0, - .cls = &timer_sysclass, -}; - -static int time_init_device(void) -{ - int error = sysdev_class_register(&timer_sysclass); - if (!error) - error = sysdev_register(&device_timer); - return error; + if (!no_sync_cmos_clock) + mod_timer(&sync_cmos_timer, jiffies + 1); } -device_initcall(time_init_device); - -#ifdef CONFIG_HPET_TIMER extern void (*late_time_init)(void); /* Duplicate of time_init() below, with hpet_enable part added */ static void __init hpet_time_init(void) { - struct timespec ts; - ts.tv_sec = get_cmos_time(); - ts.tv_nsec = (INITIAL_JIFFIES % HZ) * (NSEC_PER_SEC / HZ); - - do_settimeofday(&ts); - - if ((hpet_enable() >= 0) && hpet_use_timer) { - printk("Using HPET for base-timer\n"); - } - + if (!hpet_enable()) + setup_pit_timer(); do_time_init(); } -#endif void __init time_init(void) { - struct timespec ts; -#ifdef CONFIG_HPET_TIMER - if (is_hpet_capable()) { - /* - * HPET initialization needs to do memory-mapped io. So, let - * us do a late initialization after mem_init(). - */ - late_time_init = hpet_time_init; - return; - } -#endif - ts.tv_sec = get_cmos_time(); - ts.tv_nsec = (INITIAL_JIFFIES % HZ) * (NSEC_PER_SEC / HZ); - - do_settimeofday(&ts); - - do_time_init(); + late_time_init = hpet_time_init; } diff --git a/arch/i386/kernel/time_hpet.c b/arch/i386/kernel/time_hpet.c deleted file mode 100644 index 1e4702dfcd0..00000000000 --- a/arch/i386/kernel/time_hpet.c +++ /dev/null @@ -1,497 +0,0 @@ -/* - * linux/arch/i386/kernel/time_hpet.c - * This code largely copied from arch/x86_64/kernel/time.c - * See that file for credits. - * - * 2003-06-30 Venkatesh Pallipadi - Additional changes for HPET support - */ - -#include <linux/errno.h> -#include <linux/kernel.h> -#include <linux/param.h> -#include <linux/string.h> -#include <linux/init.h> -#include <linux/smp.h> - -#include <asm/timer.h> -#include <asm/fixmap.h> -#include <asm/apic.h> - -#include <linux/timex.h> - -#include <asm/hpet.h> -#include <linux/hpet.h> - -static unsigned long hpet_period; /* fsecs / HPET clock */ -unsigned long hpet_tick; /* hpet clks count per tick */ -unsigned long hpet_address; /* hpet memory map physical address */ -int hpet_use_timer; - -static int use_hpet; /* can be used for runtime check of hpet */ -static int boot_hpet_disable; /* boottime override for HPET timer */ -static void __iomem * hpet_virt_address; /* hpet kernel virtual address */ - -#define FSEC_TO_USEC (1000000000UL) - -int hpet_readl(unsigned long a) -{ - return readl(hpet_virt_address + a); -} - -static void hpet_writel(unsigned long d, unsigned long a) -{ - writel(d, hpet_virt_address + a); -} - -#ifdef CONFIG_X86_LOCAL_APIC -/* - * HPET counters dont wrap around on every tick. They just change the - * comparator value and continue. Next tick can be caught by checking - * for a change in the comparator value. Used in apic.c. - */ -static void __devinit wait_hpet_tick(void) -{ - unsigned int start_cmp_val, end_cmp_val; - - start_cmp_val = hpet_readl(HPET_T0_CMP); - do { - end_cmp_val = hpet_readl(HPET_T0_CMP); - } while (start_cmp_val == end_cmp_val); -} -#endif - -static int hpet_timer_stop_set_go(unsigned long tick) -{ - unsigned int cfg; - - /* - * Stop the timers and reset the main counter. - */ - cfg = hpet_readl(HPET_CFG); - cfg &= ~HPET_CFG_ENABLE; - hpet_writel(cfg, HPET_CFG); - hpet_writel(0, HPET_COUNTER); - hpet_writel(0, HPET_COUNTER + 4); - - if (hpet_use_timer) { - /* - * Set up timer 0, as periodic with first interrupt to happen at - * hpet_tick, and period also hpet_tick. - */ - cfg = hpet_readl(HPET_T0_CFG); - cfg |= HPET_TN_ENABLE | HPET_TN_PERIODIC | - HPET_TN_SETVAL | HPET_TN_32BIT; - hpet_writel(cfg, HPET_T0_CFG); - - /* - * The first write after writing TN_SETVAL to the config register sets - * the counter value, the second write sets the threshold. - */ - hpet_writel(tick, HPET_T0_CMP); - hpet_writel(tick, HPET_T0_CMP); - } - /* - * Go! - */ - cfg = hpet_readl(HPET_CFG); - if (hpet_use_timer) - cfg |= HPET_CFG_LEGACY; - cfg |= HPET_CFG_ENABLE; - hpet_writel(cfg, HPET_CFG); - - return 0; -} - -/* - * Check whether HPET was found by ACPI boot parse. If yes setup HPET - * counter 0 for kernel base timer. - */ -int __init hpet_enable(void) -{ - unsigned int id; - unsigned long tick_fsec_low, tick_fsec_high; /* tick in femto sec */ - unsigned long hpet_tick_rem; - - if (boot_hpet_disable) - return -1; - - if (!hpet_address) { - return -1; - } - hpet_virt_address = ioremap_nocache(hpet_address, HPET_MMAP_SIZE); - /* - * Read the period, compute tick and quotient. - */ - id = hpet_readl(HPET_ID); - - /* - * We are checking for value '1' or more in number field if - * CONFIG_HPET_EMULATE_RTC is set because we will need an - * additional timer for RTC emulation. - * However, we can do with one timer otherwise using the - * the single HPET timer for system time. - */ -#ifdef CONFIG_HPET_EMULATE_RTC - if (!(id & HPET_ID_NUMBER)) { - iounmap(hpet_virt_address); - hpet_virt_address = NULL; - return -1; - } -#endif - - - hpet_period = hpet_readl(HPET_PERIOD); - if ((hpet_period < HPET_MIN_PERIOD) || (hpet_period > HPET_MAX_PERIOD)) { - iounmap(hpet_virt_address); - hpet_virt_address = NULL; - return -1; - } - - /* - * 64 bit math - * First changing tick into fsec - * Then 64 bit div to find number of hpet clk per tick - */ - ASM_MUL64_REG(tick_fsec_low, tick_fsec_high, - KERNEL_TICK_USEC, FSEC_TO_USEC); - ASM_DIV64_REG(hpet_tick, hpet_tick_rem, - hpet_period, tick_fsec_low, tick_fsec_high); - - if (hpet_tick_rem > (hpet_period >> 1)) - hpet_tick++; /* rounding the result */ - - hpet_use_timer = id & HPET_ID_LEGSUP; - - if (hpet_timer_stop_set_go(hpet_tick)) { - iounmap(hpet_virt_address); - hpet_virt_address = NULL; - return -1; - } - - use_hpet = 1; - -#ifdef CONFIG_HPET - { - struct hpet_data hd; - unsigned int ntimer; - - memset(&hd, 0, sizeof (hd)); - - ntimer = hpet_readl(HPET_ID); - ntimer = (ntimer & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT; - ntimer++; - - /* - * Register with driver. - * Timer0 and Timer1 is used by platform. - */ - hd.hd_phys_address = hpet_address; - hd.hd_address = hpet_virt_address; - hd.hd_nirqs = ntimer; - hd.hd_flags = HPET_DATA_PLATFORM; - hpet_reserve_timer(&hd, 0); -#ifdef CONFIG_HPET_EMULATE_RTC - hpet_reserve_timer(&hd, 1); -#endif - hd.hd_irq[0] = HPET_LEGACY_8254; - hd.hd_irq[1] = HPET_LEGACY_RTC; - if (ntimer > 2) { - struct hpet __iomem *hpet; - struct hpet_timer __iomem *timer; - int i; - - hpet = hpet_virt_address; - - for (i = 2, timer = &hpet->hpet_timers[2]; i < ntimer; - timer++, i++) - hd.hd_irq[i] = (timer->hpet_config & - Tn_INT_ROUTE_CNF_MASK) >> - Tn_INT_ROUTE_CNF_SHIFT; - - } - - hpet_alloc(&hd); - } -#endif - -#ifdef CONFIG_X86_LOCAL_APIC - if (hpet_use_timer) - wait_timer_tick = wait_hpet_tick; -#endif - return 0; -} - -int hpet_reenable(void) -{ - return hpet_timer_stop_set_go(hpet_tick); -} - -int is_hpet_enabled(void) -{ - return use_hpet; -} - -int is_hpet_capable(void) -{ - if (!boot_hpet_disable && hpet_address) - return 1; - return 0; -} - -static int __init hpet_setup(char* str) -{ - if (str) { - if (!strncmp("disable", str, 7)) - boot_hpet_disable = 1; - } - return 1; -} - -__setup("hpet=", hpet_setup); - -#ifdef CONFIG_HPET_EMULATE_RTC -/* HPET in LegacyReplacement Mode eats up RTC interrupt line. When, HPET - * is enabled, we support RTC interrupt functionality in software. - * RTC has 3 kinds of interrupts: - * 1) Update Interrupt - generate an interrupt, every sec, when RTC clock - * is updated - * 2) Alarm Interrupt - generate an interrupt at a specific time of day - * 3) Periodic Interrupt - generate periodic interrupt, with frequencies - * 2Hz-8192Hz (2Hz-64Hz for non-root user) (all freqs in powers of 2) - * (1) and (2) above are implemented using polling at a frequency of - * 64 Hz. The exact frequency is a tradeoff between accuracy and interrupt - * overhead. (DEFAULT_RTC_INT_FREQ) - * For (3), we use interrupts at 64Hz or user specified periodic - * frequency, whichever is higher. - */ -#include <linux/mc146818rtc.h> -#include <linux/rtc.h> - -#define DEFAULT_RTC_INT_FREQ 64 -#define RTC_NUM_INTS 1 - -static unsigned long UIE_on; -static unsigned long prev_update_sec; - -static unsigned long AIE_on; -static struct rtc_time alarm_time; - -static unsigned long PIE_on; -static unsigned long PIE_freq = DEFAULT_RTC_INT_FREQ; -static unsigned long PIE_count; - -static unsigned long hpet_rtc_int_freq; /* RTC interrupt frequency */ -static unsigned int hpet_t1_cmp; /* cached comparator register */ - -/* - * Timer 1 for RTC, we do not use periodic interrupt feature, - * even if HPET supports periodic interrupts on Timer 1. - * The reason being, to set up a periodic interrupt in HPET, we need to - * stop the main counter. And if we do that everytime someone diables/enables - * RTC, we will have adverse effect on main kernel timer running on Timer 0. - * So, for the time being, simulate the periodic interrupt in software. - * - * hpet_rtc_timer_init() is called for the first time and during subsequent - * interuppts reinit happens through hpet_rtc_timer_reinit(). - */ -int hpet_rtc_timer_init(void) -{ - unsigned int cfg, cnt; - unsigned long flags; - - if (!is_hpet_enabled()) - return 0; - /* - * Set the counter 1 and enable the interrupts. - */ - if (PIE_on && (PIE_freq > DEFAULT_RTC_INT_FREQ)) - hpet_rtc_int_freq = PIE_freq; - else - hpet_rtc_int_freq = DEFAULT_RTC_INT_FREQ; - - local_irq_save(flags); - - cnt = hpet_readl(HPET_COUNTER); - cnt += ((hpet_tick*HZ)/hpet_rtc_int_freq); - hpet_writel(cnt, HPET_T1_CMP); - hpet_t1_cmp = cnt; - - cfg = hpet_readl(HPET_T1_CFG); - cfg &= ~HPET_TN_PERIODIC; - cfg |= HPET_TN_ENABLE | HPET_TN_32BIT; - hpet_writel(cfg, HPET_T1_CFG); - - local_irq_restore(flags); - - return 1; -} - -static void hpet_rtc_timer_reinit(void) -{ - unsigned int cfg, cnt, ticks_per_int, lost_ints; - - if (unlikely(!(PIE_on | AIE_on | UIE_on))) { - cfg = hpet_readl(HPET_T1_CFG); - cfg &= ~HPET_TN_ENABLE; - hpet_writel(cfg, HPET_T1_CFG); - return; - } - - if (PIE_on && (PIE_freq > DEFAULT_RTC_INT_FREQ)) - hpet_rtc_int_freq = PIE_freq; - else - hpet_rtc_int_freq = DEFAULT_RTC_INT_FREQ; - - /* It is more accurate to use the comparator value than current count.*/ - ticks_per_int = hpet_tick * HZ / hpet_rtc_int_freq; - hpet_t1_cmp += ticks_per_int; - hpet_writel(hpet_t1_cmp, HPET_T1_CMP); - - /* - * If the interrupt handler was delayed too long, the write above tries - * to schedule the next interrupt in the past and the hardware would - * not interrupt until the counter had wrapped around. - * So we have to check that the comparator wasn't set to a past time. - */ - cnt = hpet_readl(HPET_COUNTER); - if (unlikely((int)(cnt - hpet_t1_cmp) > 0)) { - lost_ints = (cnt - hpet_t1_cmp) / ticks_per_int + 1; - /* Make sure that, even with the time needed to execute - * this code, the next scheduled interrupt has been moved - * back to the future: */ - lost_ints++; - - hpet_t1_cmp += lost_ints * ticks_per_int; - hpet_writel(hpet_t1_cmp, HPET_T1_CMP); - - if (PIE_on) - PIE_count += lost_ints; - - printk(KERN_WARNING "rtc: lost some interrupts at %ldHz.\n", - hpet_rtc_int_freq); - } -} - -/* - * The functions below are called from rtc driver. - * Return 0 if HPET is not being used. - * Otherwise do the necessary changes and return 1. - */ -int hpet_mask_rtc_irq_bit(unsigned long bit_mask) -{ - if (!is_hpet_enabled()) - return 0; - - if (bit_mask & RTC_UIE) - UIE_on = 0; - if (bit_mask & RTC_PIE) - PIE_on = 0; - if (bit_mask & RTC_AIE) - AIE_on = 0; - - return 1; -} - -int hpet_set_rtc_irq_bit(unsigned long bit_mask) -{ - int timer_init_reqd = 0; - - if (!is_hpet_enabled()) - return 0; - - if (!(PIE_on | AIE_on | UIE_on)) - timer_init_reqd = 1; - - if (bit_mask & RTC_UIE) { - UIE_on = 1; - } - if (bit_mask & RTC_PIE) { - PIE_on = 1; - PIE_count = 0; - } - if (bit_mask & RTC_AIE) { - AIE_on = 1; - } - - if (timer_init_reqd) - hpet_rtc_timer_init(); - - return 1; -} - -int hpet_set_alarm_time(unsigned char hrs, unsigned char min, unsigned char sec) -{ - if (!is_hpet_enabled()) - return 0; - - alarm_time.tm_hour = hrs; - alarm_time.tm_min = min; - alarm_time.tm_sec = sec; - - return 1; -} - -int hpet_set_periodic_freq(unsigned long freq) -{ - if (!is_hpet_enabled()) - return 0; - - PIE_freq = freq; - PIE_count = 0; - - return 1; -} - -int hpet_rtc_dropped_irq(void) -{ - if (!is_hpet_enabled()) - return 0; - - return 1; -} - -irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id) -{ - struct rtc_time curr_time; - unsigned long rtc_int_flag = 0; - int call_rtc_interrupt = 0; - - hpet_rtc_timer_reinit(); - - if (UIE_on | AIE_on) { - rtc_get_rtc_time(&curr_time); - } - if (UIE_on) { - if (curr_time.tm_sec != prev_update_sec) { - /* Set update int info, call real rtc int routine */ - call_rtc_interrupt = 1; - rtc_int_flag = RTC_UF; - prev_update_sec = curr_time.tm_sec; - } - } - if (PIE_on) { - PIE_count++; - if (PIE_count >= hpet_rtc_int_freq/PIE_freq) { - /* Set periodic int info, call real rtc int routine */ - call_rtc_interrupt = 1; - rtc_int_flag |= RTC_PF; - PIE_count = 0; - } - } - if (AIE_on) { - if ((curr_time.tm_sec == alarm_time.tm_sec) && - (curr_time.tm_min == alarm_time.tm_min) && - (curr_time.tm_hour == alarm_time.tm_hour)) { - /* Set alarm int info, call real rtc int routine */ - call_rtc_interrupt = 1; - rtc_int_flag |= RTC_AF; - } - } - if (call_rtc_interrupt) { - rtc_int_flag |= (RTC_IRQF | (RTC_NUM_INTS << 8)); - rtc_interrupt(rtc_int_flag, dev_id); - } - return IRQ_HANDLED; -} -#endif - diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c index 0efad8aeb41..af0d3f70a81 100644 --- a/arch/i386/kernel/traps.c +++ b/arch/i386/kernel/traps.c @@ -94,6 +94,7 @@ asmlinkage void spurious_interrupt_bug(void); asmlinkage void machine_check(void); int kstack_depth_to_print = 24; +static unsigned int code_bytes = 64; ATOMIC_NOTIFIER_HEAD(i386die_chain); int register_die_notifier(struct notifier_block *nb) @@ -291,10 +292,11 @@ void show_registers(struct pt_regs *regs) int i; int in_kernel = 1; unsigned long esp; - unsigned short ss; + unsigned short ss, gs; esp = (unsigned long) (®s->esp); savesegment(ss, ss); + savesegment(gs, gs); if (user_mode_vm(regs)) { in_kernel = 0; esp = regs->esp; @@ -313,8 +315,8 @@ void show_registers(struct pt_regs *regs) regs->eax, regs->ebx, regs->ecx, regs->edx); printk(KERN_EMERG "esi: %08lx edi: %08lx ebp: %08lx esp: %08lx\n", regs->esi, regs->edi, regs->ebp, esp); - printk(KERN_EMERG "ds: %04x es: %04x ss: %04x\n", - regs->xds & 0xffff, regs->xes & 0xffff, ss); + printk(KERN_EMERG "ds: %04x es: %04x fs: %04x gs: %04x ss: %04x\n", + regs->xds & 0xffff, regs->xes & 0xffff, regs->xfs & 0xffff, gs, ss); printk(KERN_EMERG "Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)", TASK_COMM_LEN, current->comm, current->pid, current_thread_info(), current, current->thread_info); @@ -324,7 +326,8 @@ void show_registers(struct pt_regs *regs) */ if (in_kernel) { u8 *eip; - int code_bytes = 64; + unsigned int code_prologue = code_bytes * 43 / 64; + unsigned int code_len = code_bytes; unsigned char c; printk("\n" KERN_EMERG "Stack: "); @@ -332,14 +335,14 @@ void show_registers(struct pt_regs *regs) printk(KERN_EMERG "Code: "); - eip = (u8 *)regs->eip - 43; + eip = (u8 *)regs->eip - code_prologue; if (eip < (u8 *)PAGE_OFFSET || probe_kernel_address(eip, c)) { /* try starting at EIP */ eip = (u8 *)regs->eip; - code_bytes = 32; + code_len = code_len - code_prologue + 1; } - for (i = 0; i < code_bytes; i++, eip++) { + for (i = 0; i < code_len; i++, eip++) { if (eip < (u8 *)PAGE_OFFSET || probe_kernel_address(eip, c)) { printk(" Bad EIP value."); @@ -1191,3 +1194,13 @@ static int __init kstack_setup(char *s) return 1; } __setup("kstack=", kstack_setup); + +static int __init code_bytes_setup(char *s) +{ + code_bytes = simple_strtoul(s, NULL, 0); + if (code_bytes > 8192) + code_bytes = 8192; + + return 1; +} +__setup("code_bytes=", code_bytes_setup); diff --git a/arch/i386/kernel/tsc.c b/arch/i386/kernel/tsc.c index 2cfc7b09b92..3082a418635 100644 --- a/arch/i386/kernel/tsc.c +++ b/arch/i386/kernel/tsc.c @@ -23,6 +23,7 @@ * an extra value to store the TSC freq */ unsigned int tsc_khz; +unsigned long long (*custom_sched_clock)(void); int tsc_disable; @@ -59,12 +60,6 @@ static inline int check_tsc_unstable(void) return tsc_unstable; } -void mark_tsc_unstable(void) -{ - tsc_unstable = 1; -} -EXPORT_SYMBOL_GPL(mark_tsc_unstable); - /* Accellerators for sched_clock() * convert from cycles(64bits) => nanoseconds (64bits) * basic equation: @@ -107,14 +102,14 @@ unsigned long long sched_clock(void) { unsigned long long this_offset; + if (unlikely(custom_sched_clock)) + return (*custom_sched_clock)(); + /* - * in the NUMA case we dont use the TSC as they are not - * synchronized across all CPUs. + * Fall back to jiffies if there's no TSC available: */ -#ifndef CONFIG_NUMA - if (!cpu_khz || check_tsc_unstable()) -#endif - /* no locking but a rare wrong value is not a big deal */ + if (unlikely(tsc_disable)) + /* No locking but a rare wrong value is not a big deal: */ return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ); /* read the Time Stamp Counter: */ @@ -194,13 +189,13 @@ EXPORT_SYMBOL(recalibrate_cpu_khz); void __init tsc_init(void) { if (!cpu_has_tsc || tsc_disable) - return; + goto out_no_tsc; cpu_khz = calculate_cpu_khz(); tsc_khz = cpu_khz; if (!cpu_khz) - return; + goto out_no_tsc; printk("Detected %lu.%03lu MHz processor.\n", (unsigned long)cpu_khz / 1000, @@ -208,37 +203,18 @@ void __init tsc_init(void) set_cyc2ns_scale(cpu_khz); use_tsc_delay(); -} + return; -#ifdef CONFIG_CPU_FREQ - -static unsigned int cpufreq_delayed_issched = 0; -static unsigned int cpufreq_init = 0; -static struct work_struct cpufreq_delayed_get_work; - -static void handle_cpufreq_delayed_get(struct work_struct *work) -{ - unsigned int cpu; - - for_each_online_cpu(cpu) - cpufreq_get(cpu); - - cpufreq_delayed_issched = 0; +out_no_tsc: + /* + * Set the tsc_disable flag if there's no TSC support, this + * makes it a fast flag for the kernel to see whether it + * should be using the TSC. + */ + tsc_disable = 1; } -/* - * if we notice cpufreq oddness, schedule a call to cpufreq_get() as it tries - * to verify the CPU frequency the timing core thinks the CPU is running - * at is still correct. - */ -static inline void cpufreq_delayed_get(void) -{ - if (cpufreq_init && !cpufreq_delayed_issched) { - cpufreq_delayed_issched = 1; - printk(KERN_DEBUG "Checking if CPU frequency changed.\n"); - schedule_work(&cpufreq_delayed_get_work); - } -} +#ifdef CONFIG_CPU_FREQ /* * if the CPU frequency is scaled, TSC-based delays will need a different @@ -303,17 +279,9 @@ static struct notifier_block time_cpufreq_notifier_block = { static int __init cpufreq_tsc(void) { - int ret; - - INIT_WORK(&cpufreq_delayed_get_work, handle_cpufreq_delayed_get); - ret = cpufreq_register_notifier(&time_cpufreq_notifier_block, - CPUFREQ_TRANSITION_NOTIFIER); - if (!ret) - cpufreq_init = 1; - - return ret; + return cpufreq_register_notifier(&time_cpufreq_notifier_block, + CPUFREQ_TRANSITION_NOTIFIER); } - core_initcall(cpufreq_tsc); #endif @@ -321,7 +289,6 @@ core_initcall(cpufreq_tsc); /* clock source code */ static unsigned long current_tsc_khz = 0; -static int tsc_update_callback(void); static cycle_t read_tsc(void) { @@ -339,37 +306,28 @@ static struct clocksource clocksource_tsc = { .mask = CLOCKSOURCE_MASK(64), .mult = 0, /* to be set */ .shift = 22, - .update_callback = tsc_update_callback, - .is_continuous = 1, + .flags = CLOCK_SOURCE_IS_CONTINUOUS | + CLOCK_SOURCE_MUST_VERIFY, }; -static int tsc_update_callback(void) +void mark_tsc_unstable(void) { - int change = 0; - - /* check to see if we should switch to the safe clocksource: */ - if (clocksource_tsc.rating != 0 && check_tsc_unstable()) { - clocksource_tsc.rating = 0; - clocksource_reselect(); - change = 1; - } - - /* only update if tsc_khz has changed: */ - if (current_tsc_khz != tsc_khz) { - current_tsc_khz = tsc_khz; - clocksource_tsc.mult = clocksource_khz2mult(current_tsc_khz, - clocksource_tsc.shift); - change = 1; + if (!tsc_unstable) { + tsc_unstable = 1; + /* Can be called before registration */ + if (clocksource_tsc.mult) + clocksource_change_rating(&clocksource_tsc, 0); + else + clocksource_tsc.rating = 0; } - - return change; } +EXPORT_SYMBOL_GPL(mark_tsc_unstable); static int __init dmi_mark_tsc_unstable(struct dmi_system_id *d) { printk(KERN_NOTICE "%s detected: marking TSC unstable.\n", d->ident); - mark_tsc_unstable(); + tsc_unstable = 1; return 0; } @@ -386,65 +344,44 @@ static struct dmi_system_id __initdata bad_tsc_dmi_table[] = { {} }; -#define TSC_FREQ_CHECK_INTERVAL (10*MSEC_PER_SEC) /* 10sec in MS */ -static struct timer_list verify_tsc_freq_timer; - -/* XXX - Probably should add locking */ -static void verify_tsc_freq(unsigned long unused) -{ - static u64 last_tsc; - static unsigned long last_jiffies; - - u64 now_tsc, interval_tsc; - unsigned long now_jiffies, interval_jiffies; - - - if (check_tsc_unstable()) - return; - - rdtscll(now_tsc); - now_jiffies = jiffies; - - if (!last_jiffies) { - goto out; - } - - interval_jiffies = now_jiffies - last_jiffies; - interval_tsc = now_tsc - last_tsc; - interval_tsc *= HZ; - do_div(interval_tsc, cpu_khz*1000); - - if (interval_tsc < (interval_jiffies * 3 / 4)) { - printk("TSC appears to be running slowly. " - "Marking it as unstable\n"); - mark_tsc_unstable(); - return; - } - -out: - last_tsc = now_tsc; - last_jiffies = now_jiffies; - /* set us up to go off on the next interval: */ - mod_timer(&verify_tsc_freq_timer, - jiffies + msecs_to_jiffies(TSC_FREQ_CHECK_INTERVAL)); -} - /* * Make an educated guess if the TSC is trustworthy and synchronized * over all CPUs. */ -static __init int unsynchronized_tsc(void) +__cpuinit int unsynchronized_tsc(void) { + if (!cpu_has_tsc || tsc_unstable) + return 1; /* * Intel systems are normally all synchronized. * Exceptions must mark TSC as unstable: */ - if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) - return 0; + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) { + /* assume multi socket systems are not synchronized: */ + if (num_possible_cpus() > 1) + tsc_unstable = 1; + } + return tsc_unstable; +} + +/* + * Geode_LX - the OLPC CPU has a possibly a very reliable TSC + */ +#ifdef CONFIG_MGEODE_LX +/* RTSC counts during suspend */ +#define RTSC_SUSP 0x100 + +static void __init check_geode_tsc_reliable(void) +{ + unsigned long val; - /* assume multi socket systems are not synchronized: */ - return num_possible_cpus() > 1; + rdmsrl(MSR_GEODE_BUSCONT_CONF0, val); + if ((val & RTSC_SUSP)) + clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY; } +#else +static inline void check_geode_tsc_reliable(void) { } +#endif static int __init init_tsc_clocksource(void) { @@ -453,20 +390,16 @@ static int __init init_tsc_clocksource(void) /* check blacklist */ dmi_check_system(bad_tsc_dmi_table); - if (unsynchronized_tsc()) /* mark unstable if unsynced */ - mark_tsc_unstable(); + unsynchronized_tsc(); + check_geode_tsc_reliable(); current_tsc_khz = tsc_khz; clocksource_tsc.mult = clocksource_khz2mult(current_tsc_khz, clocksource_tsc.shift); /* lower the rating if we already know its unstable: */ - if (check_tsc_unstable()) + if (check_tsc_unstable()) { clocksource_tsc.rating = 0; - - init_timer(&verify_tsc_freq_timer); - verify_tsc_freq_timer.function = verify_tsc_freq; - verify_tsc_freq_timer.expires = - jiffies + msecs_to_jiffies(TSC_FREQ_CHECK_INTERVAL); - add_timer(&verify_tsc_freq_timer); + clocksource_tsc.flags &= ~CLOCK_SOURCE_IS_CONTINUOUS; + } return clocksource_register(&clocksource_tsc); } diff --git a/arch/i386/kernel/tsc_sync.c b/arch/i386/kernel/tsc_sync.c new file mode 100644 index 00000000000..12424629af8 --- /dev/null +++ b/arch/i386/kernel/tsc_sync.c @@ -0,0 +1 @@ +#include "../../x86_64/kernel/tsc_sync.c" diff --git a/arch/i386/kernel/vm86.c b/arch/i386/kernel/vm86.c index be2f96e67f7..d1b8f2b7aea 100644 --- a/arch/i386/kernel/vm86.c +++ b/arch/i386/kernel/vm86.c @@ -96,12 +96,12 @@ static int copy_vm86_regs_to_user(struct vm86_regs __user *user, { int ret = 0; - /* kernel_vm86_regs is missing xfs, so copy everything up to - (but not including) xgs, and then rest after xgs. */ - ret += copy_to_user(user, regs, offsetof(struct kernel_vm86_regs, pt.xgs)); - ret += copy_to_user(&user->__null_gs, ®s->pt.xgs, + /* kernel_vm86_regs is missing xgs, so copy everything up to + (but not including) orig_eax, and then rest including orig_eax. */ + ret += copy_to_user(user, regs, offsetof(struct kernel_vm86_regs, pt.orig_eax)); + ret += copy_to_user(&user->orig_eax, ®s->pt.orig_eax, sizeof(struct kernel_vm86_regs) - - offsetof(struct kernel_vm86_regs, pt.xgs)); + offsetof(struct kernel_vm86_regs, pt.orig_eax)); return ret; } @@ -113,12 +113,13 @@ static int copy_vm86_regs_from_user(struct kernel_vm86_regs *regs, { int ret = 0; - ret += copy_from_user(regs, user, offsetof(struct kernel_vm86_regs, pt.xgs)); - ret += copy_from_user(®s->pt.xgs, &user->__null_gs, + /* copy eax-xfs inclusive */ + ret += copy_from_user(regs, user, offsetof(struct kernel_vm86_regs, pt.orig_eax)); + /* copy orig_eax-__gsh+extra */ + ret += copy_from_user(®s->pt.orig_eax, &user->orig_eax, sizeof(struct kernel_vm86_regs) - - offsetof(struct kernel_vm86_regs, pt.xgs) + + offsetof(struct kernel_vm86_regs, pt.orig_eax) + extra); - return ret; } @@ -157,8 +158,8 @@ struct pt_regs * fastcall save_v86_state(struct kernel_vm86_regs * regs) ret = KVM86->regs32; - loadsegment(fs, current->thread.saved_fs); - ret->xgs = current->thread.saved_gs; + ret->xfs = current->thread.saved_fs; + loadsegment(gs, current->thread.saved_gs); return ret; } @@ -285,9 +286,9 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk */ info->regs.pt.xds = 0; info->regs.pt.xes = 0; - info->regs.pt.xgs = 0; + info->regs.pt.xfs = 0; -/* we are clearing fs later just before "jmp resume_userspace", +/* we are clearing gs later just before "jmp resume_userspace", * because it is not saved/restored. */ @@ -321,8 +322,8 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk */ info->regs32->eax = 0; tsk->thread.saved_esp0 = tsk->thread.esp0; - savesegment(fs, tsk->thread.saved_fs); - tsk->thread.saved_gs = info->regs32->xgs; + tsk->thread.saved_fs = info->regs32->xfs; + savesegment(gs, tsk->thread.saved_gs); tss = &per_cpu(init_tss, get_cpu()); tsk->thread.esp0 = (unsigned long) &info->VM86_TSS_ESP0; @@ -342,7 +343,7 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk __asm__ __volatile__( "movl %0,%%esp\n\t" "movl %1,%%ebp\n\t" - "mov %2, %%fs\n\t" + "mov %2, %%gs\n\t" "jmp resume_userspace" : /* no outputs */ :"r" (&info->regs), "r" (task_thread_info(tsk)), "r" (0)); diff --git a/arch/i386/kernel/vmi.c b/arch/i386/kernel/vmi.c new file mode 100644 index 00000000000..bb5a7abf949 --- /dev/null +++ b/arch/i386/kernel/vmi.c @@ -0,0 +1,949 @@ +/* + * VMI specific paravirt-ops implementation + * + * Copyright (C) 2005, VMware, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + * Send feedback to zach@vmware.com + * + */ + +#include <linux/module.h> +#include <linux/license.h> +#include <linux/cpu.h> +#include <linux/bootmem.h> +#include <linux/mm.h> +#include <asm/vmi.h> +#include <asm/io.h> +#include <asm/fixmap.h> +#include <asm/apicdef.h> +#include <asm/apic.h> +#include <asm/processor.h> +#include <asm/timer.h> +#include <asm/vmi_time.h> + +/* Convenient for calling VMI functions indirectly in the ROM */ +typedef u32 __attribute__((regparm(1))) (VROMFUNC)(void); +typedef u64 __attribute__((regparm(2))) (VROMLONGFUNC)(int); + +#define call_vrom_func(rom,func) \ + (((VROMFUNC *)(rom->func))()) + +#define call_vrom_long_func(rom,func,arg) \ + (((VROMLONGFUNC *)(rom->func)) (arg)) + +static struct vrom_header *vmi_rom; +static int license_gplok; +static int disable_nodelay; +static int disable_pge; +static int disable_pse; +static int disable_sep; +static int disable_tsc; +static int disable_mtrr; + +/* Cached VMI operations */ +struct { + void (*cpuid)(void /* non-c */); + void (*_set_ldt)(u32 selector); + void (*set_tr)(u32 selector); + void (*set_kernel_stack)(u32 selector, u32 esp0); + void (*allocate_page)(u32, u32, u32, u32, u32); + void (*release_page)(u32, u32); + void (*set_pte)(pte_t, pte_t *, unsigned); + void (*update_pte)(pte_t *, unsigned); + void (*set_linear_mapping)(int, u32, u32, u32); + void (*flush_tlb)(int); + void (*set_initial_ap_state)(int, int); + void (*halt)(void); +} vmi_ops; + +/* XXX move this to alternative.h */ +extern struct paravirt_patch __start_parainstructions[], + __stop_parainstructions[]; + +/* + * VMI patching routines. + */ +#define MNEM_CALL 0xe8 +#define MNEM_JMP 0xe9 +#define MNEM_RET 0xc3 + +static char irq_save_disable_callout[] = { + MNEM_CALL, 0, 0, 0, 0, + MNEM_CALL, 0, 0, 0, 0, + MNEM_RET +}; +#define IRQ_PATCH_INT_MASK 0 +#define IRQ_PATCH_DISABLE 5 + +static inline void patch_offset(unsigned char *eip, unsigned char *dest) +{ + *(unsigned long *)(eip+1) = dest-eip-5; +} + +static unsigned patch_internal(int call, unsigned len, void *insns) +{ + u64 reloc; + struct vmi_relocation_info *const rel = (struct vmi_relocation_info *)&reloc; + reloc = call_vrom_long_func(vmi_rom, get_reloc, call); + switch(rel->type) { + case VMI_RELOCATION_CALL_REL: + BUG_ON(len < 5); + *(char *)insns = MNEM_CALL; + patch_offset(insns, rel->eip); + return 5; + + case VMI_RELOCATION_JUMP_REL: + BUG_ON(len < 5); + *(char *)insns = MNEM_JMP; + patch_offset(insns, rel->eip); + return 5; + + case VMI_RELOCATION_NOP: + /* obliterate the whole thing */ + return 0; + + case VMI_RELOCATION_NONE: + /* leave native code in place */ + break; + + default: + BUG(); + } + return len; +} + +/* + * Apply patch if appropriate, return length of new instruction + * sequence. The callee does nop padding for us. + */ +static unsigned vmi_patch(u8 type, u16 clobbers, void *insns, unsigned len) +{ + switch (type) { + case PARAVIRT_IRQ_DISABLE: + return patch_internal(VMI_CALL_DisableInterrupts, len, insns); + case PARAVIRT_IRQ_ENABLE: + return patch_internal(VMI_CALL_EnableInterrupts, len, insns); + case PARAVIRT_RESTORE_FLAGS: + return patch_internal(VMI_CALL_SetInterruptMask, len, insns); + case PARAVIRT_SAVE_FLAGS: + return patch_internal(VMI_CALL_GetInterruptMask, len, insns); + case PARAVIRT_SAVE_FLAGS_IRQ_DISABLE: + if (len >= 10) { + patch_internal(VMI_CALL_GetInterruptMask, len, insns); + patch_internal(VMI_CALL_DisableInterrupts, len-5, insns+5); + return 10; + } else { + /* + * You bastards didn't leave enough room to + * patch save_flags_irq_disable inline. Patch + * to a helper + */ + BUG_ON(len < 5); + *(char *)insns = MNEM_CALL; + patch_offset(insns, irq_save_disable_callout); + return 5; + } + case PARAVIRT_INTERRUPT_RETURN: + return patch_internal(VMI_CALL_IRET, len, insns); + case PARAVIRT_STI_SYSEXIT: + return patch_internal(VMI_CALL_SYSEXIT, len, insns); + default: + break; + } + return len; +} + +/* CPUID has non-C semantics, and paravirt-ops API doesn't match hardware ISA */ +static void vmi_cpuid(unsigned int *eax, unsigned int *ebx, + unsigned int *ecx, unsigned int *edx) +{ + int override = 0; + if (*eax == 1) + override = 1; + asm volatile ("call *%6" + : "=a" (*eax), + "=b" (*ebx), + "=c" (*ecx), + "=d" (*edx) + : "0" (*eax), "2" (*ecx), "r" (vmi_ops.cpuid)); + if (override) { + if (disable_pse) + *edx &= ~X86_FEATURE_PSE; + if (disable_pge) + *edx &= ~X86_FEATURE_PGE; + if (disable_sep) + *edx &= ~X86_FEATURE_SEP; + if (disable_tsc) + *edx &= ~X86_FEATURE_TSC; + if (disable_mtrr) + *edx &= ~X86_FEATURE_MTRR; + } +} + +static inline void vmi_maybe_load_tls(struct desc_struct *gdt, int nr, struct desc_struct *new) +{ + if (gdt[nr].a != new->a || gdt[nr].b != new->b) + write_gdt_entry(gdt, nr, new->a, new->b); +} + +static void vmi_load_tls(struct thread_struct *t, unsigned int cpu) +{ + struct desc_struct *gdt = get_cpu_gdt_table(cpu); + vmi_maybe_load_tls(gdt, GDT_ENTRY_TLS_MIN + 0, &t->tls_array[0]); + vmi_maybe_load_tls(gdt, GDT_ENTRY_TLS_MIN + 1, &t->tls_array[1]); + vmi_maybe_load_tls(gdt, GDT_ENTRY_TLS_MIN + 2, &t->tls_array[2]); +} + +static void vmi_set_ldt(const void *addr, unsigned entries) +{ + unsigned cpu = smp_processor_id(); + u32 low, high; + + pack_descriptor(&low, &high, (unsigned long)addr, + entries * sizeof(struct desc_struct) - 1, + DESCTYPE_LDT, 0); + write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_LDT, low, high); + vmi_ops._set_ldt(entries ? GDT_ENTRY_LDT*sizeof(struct desc_struct) : 0); +} + +static void vmi_set_tr(void) +{ + vmi_ops.set_tr(GDT_ENTRY_TSS*sizeof(struct desc_struct)); +} + +static void vmi_load_esp0(struct tss_struct *tss, + struct thread_struct *thread) +{ + tss->esp0 = thread->esp0; + + /* This can only happen when SEP is enabled, no need to test "SEP"arately */ + if (unlikely(tss->ss1 != thread->sysenter_cs)) { + tss->ss1 = thread->sysenter_cs; + wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0); + } + vmi_ops.set_kernel_stack(__KERNEL_DS, tss->esp0); +} + +static void vmi_flush_tlb_user(void) +{ + vmi_ops.flush_tlb(VMI_FLUSH_TLB); +} + +static void vmi_flush_tlb_kernel(void) +{ + vmi_ops.flush_tlb(VMI_FLUSH_TLB | VMI_FLUSH_GLOBAL); +} + +/* Stub to do nothing at all; used for delays and unimplemented calls */ +static void vmi_nop(void) +{ +} + +/* For NO_IDLE_HZ, we stop the clock when halting the kernel */ +#ifdef CONFIG_NO_IDLE_HZ +static fastcall void vmi_safe_halt(void) +{ + int idle = vmi_stop_hz_timer(); + vmi_ops.halt(); + if (idle) { + local_irq_disable(); + vmi_account_time_restart_hz_timer(); + local_irq_enable(); + } +} +#endif + +#ifdef CONFIG_DEBUG_PAGE_TYPE + +#ifdef CONFIG_X86_PAE +#define MAX_BOOT_PTS (2048+4+1) +#else +#define MAX_BOOT_PTS (1024+1) +#endif + +/* + * During boot, mem_map is not yet available in paging_init, so stash + * all the boot page allocations here. + */ +static struct { + u32 pfn; + int type; +} boot_page_allocations[MAX_BOOT_PTS]; +static int num_boot_page_allocations; +static int boot_allocations_applied; + +void vmi_apply_boot_page_allocations(void) +{ + int i; + BUG_ON(!mem_map); + for (i = 0; i < num_boot_page_allocations; i++) { + struct page *page = pfn_to_page(boot_page_allocations[i].pfn); + page->type = boot_page_allocations[i].type; + page->type = boot_page_allocations[i].type & + ~(VMI_PAGE_ZEROED | VMI_PAGE_CLONE); + } + boot_allocations_applied = 1; +} + +static void record_page_type(u32 pfn, int type) +{ + BUG_ON(num_boot_page_allocations >= MAX_BOOT_PTS); + boot_page_allocations[num_boot_page_allocations].pfn = pfn; + boot_page_allocations[num_boot_page_allocations].type = type; + num_boot_page_allocations++; +} + +static void check_zeroed_page(u32 pfn, int type, struct page *page) +{ + u32 *ptr; + int i; + int limit = PAGE_SIZE / sizeof(int); + + if (page_address(page)) + ptr = (u32 *)page_address(page); + else + ptr = (u32 *)__va(pfn << PAGE_SHIFT); + /* + * When cloning the root in non-PAE mode, only the userspace + * pdes need to be zeroed. + */ + if (type & VMI_PAGE_CLONE) + limit = USER_PTRS_PER_PGD; + for (i = 0; i < limit; i++) + BUG_ON(ptr[i]); +} + +/* + * We stash the page type into struct page so we can verify the page + * types are used properly. + */ +static void vmi_set_page_type(u32 pfn, int type) +{ + /* PAE can have multiple roots per page - don't track */ + if (PTRS_PER_PMD > 1 && (type & VMI_PAGE_PDP)) + return; + + if (boot_allocations_applied) { + struct page *page = pfn_to_page(pfn); + if (type != VMI_PAGE_NORMAL) + BUG_ON(page->type); + else + BUG_ON(page->type == VMI_PAGE_NORMAL); + page->type = type & ~(VMI_PAGE_ZEROED | VMI_PAGE_CLONE); + if (type & VMI_PAGE_ZEROED) + check_zeroed_page(pfn, type, page); + } else { + record_page_type(pfn, type); + } +} + +static void vmi_check_page_type(u32 pfn, int type) +{ + /* PAE can have multiple roots per page - skip checks */ + if (PTRS_PER_PMD > 1 && (type & VMI_PAGE_PDP)) + return; + + type &= ~(VMI_PAGE_ZEROED | VMI_PAGE_CLONE); + if (boot_allocations_applied) { + struct page *page = pfn_to_page(pfn); + BUG_ON((page->type ^ type) & VMI_PAGE_PAE); + BUG_ON(type == VMI_PAGE_NORMAL && page->type); + BUG_ON((type & page->type) == 0); + } +} +#else +#define vmi_set_page_type(p,t) do { } while (0) +#define vmi_check_page_type(p,t) do { } while (0) +#endif + +static void vmi_allocate_pt(u32 pfn) +{ + vmi_set_page_type(pfn, VMI_PAGE_L1); + vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0); +} + +static void vmi_allocate_pd(u32 pfn) +{ + /* + * This call comes in very early, before mem_map is setup. + * It is called only for swapper_pg_dir, which already has + * data on it. + */ + vmi_set_page_type(pfn, VMI_PAGE_L2); + vmi_ops.allocate_page(pfn, VMI_PAGE_L2, 0, 0, 0); +} + +static void vmi_allocate_pd_clone(u32 pfn, u32 clonepfn, u32 start, u32 count) +{ + vmi_set_page_type(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE); + vmi_check_page_type(clonepfn, VMI_PAGE_L2); + vmi_ops.allocate_page(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE, clonepfn, start, count); +} + +static void vmi_release_pt(u32 pfn) +{ + vmi_ops.release_page(pfn, VMI_PAGE_L1); + vmi_set_page_type(pfn, VMI_PAGE_NORMAL); +} + +static void vmi_release_pd(u32 pfn) +{ + vmi_ops.release_page(pfn, VMI_PAGE_L2); + vmi_set_page_type(pfn, VMI_PAGE_NORMAL); +} + +/* + * Helper macros for MMU update flags. We can defer updates until a flush + * or page invalidation only if the update is to the current address space + * (otherwise, there is no flush). We must check against init_mm, since + * this could be a kernel update, which usually passes init_mm, although + * sometimes this check can be skipped if we know the particular function + * is only called on user mode PTEs. We could change the kernel to pass + * current->active_mm here, but in particular, I was unsure if changing + * mm/highmem.c to do this would still be correct on other architectures. + */ +#define is_current_as(mm, mustbeuser) ((mm) == current->active_mm || \ + (!mustbeuser && (mm) == &init_mm)) +#define vmi_flags_addr(mm, addr, level, user) \ + ((level) | (is_current_as(mm, user) ? \ + (VMI_PAGE_CURRENT_AS | ((addr) & VMI_PAGE_VA_MASK)) : 0)) +#define vmi_flags_addr_defer(mm, addr, level, user) \ + ((level) | (is_current_as(mm, user) ? \ + (VMI_PAGE_DEFER | VMI_PAGE_CURRENT_AS | ((addr) & VMI_PAGE_VA_MASK)) : 0)) + +static void vmi_update_pte(struct mm_struct *mm, u32 addr, pte_t *ptep) +{ + vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE); + vmi_ops.update_pte(ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0)); +} + +static void vmi_update_pte_defer(struct mm_struct *mm, u32 addr, pte_t *ptep) +{ + vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE); + vmi_ops.update_pte(ptep, vmi_flags_addr_defer(mm, addr, VMI_PAGE_PT, 0)); +} + +static void vmi_set_pte(pte_t *ptep, pte_t pte) +{ + /* XXX because of set_pmd_pte, this can be called on PT or PD layers */ + vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE | VMI_PAGE_PD); + vmi_ops.set_pte(pte, ptep, VMI_PAGE_PT); +} + +static void vmi_set_pte_at(struct mm_struct *mm, u32 addr, pte_t *ptep, pte_t pte) +{ + vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE); + vmi_ops.set_pte(pte, ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0)); +} + +static void vmi_set_pmd(pmd_t *pmdp, pmd_t pmdval) +{ +#ifdef CONFIG_X86_PAE + const pte_t pte = { pmdval.pmd, pmdval.pmd >> 32 }; + vmi_check_page_type(__pa(pmdp) >> PAGE_SHIFT, VMI_PAGE_PMD); +#else + const pte_t pte = { pmdval.pud.pgd.pgd }; + vmi_check_page_type(__pa(pmdp) >> PAGE_SHIFT, VMI_PAGE_PGD); +#endif + vmi_ops.set_pte(pte, (pte_t *)pmdp, VMI_PAGE_PD); +} + +#ifdef CONFIG_X86_PAE + +static void vmi_set_pte_atomic(pte_t *ptep, pte_t pteval) +{ + /* + * XXX This is called from set_pmd_pte, but at both PT + * and PD layers so the VMI_PAGE_PT flag is wrong. But + * it is only called for large page mapping changes, + * the Xen backend, doesn't support large pages, and the + * ESX backend doesn't depend on the flag. + */ + set_64bit((unsigned long long *)ptep,pte_val(pteval)); + vmi_ops.update_pte(ptep, VMI_PAGE_PT); +} + +static void vmi_set_pte_present(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) +{ + vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE); + vmi_ops.set_pte(pte, ptep, vmi_flags_addr_defer(mm, addr, VMI_PAGE_PT, 1)); +} + +static void vmi_set_pud(pud_t *pudp, pud_t pudval) +{ + /* Um, eww */ + const pte_t pte = { pudval.pgd.pgd, pudval.pgd.pgd >> 32 }; + vmi_check_page_type(__pa(pudp) >> PAGE_SHIFT, VMI_PAGE_PGD); + vmi_ops.set_pte(pte, (pte_t *)pudp, VMI_PAGE_PDP); +} + +static void vmi_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) +{ + const pte_t pte = { 0 }; + vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE); + vmi_ops.set_pte(pte, ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0)); +} + +void vmi_pmd_clear(pmd_t *pmd) +{ + const pte_t pte = { 0 }; + vmi_check_page_type(__pa(pmd) >> PAGE_SHIFT, VMI_PAGE_PMD); + vmi_ops.set_pte(pte, (pte_t *)pmd, VMI_PAGE_PD); +} +#endif + +#ifdef CONFIG_SMP +struct vmi_ap_state ap; +extern void setup_pda(void); + +static void __init /* XXX cpu hotplug */ +vmi_startup_ipi_hook(int phys_apicid, unsigned long start_eip, + unsigned long start_esp) +{ + /* Default everything to zero. This is fine for most GPRs. */ + memset(&ap, 0, sizeof(struct vmi_ap_state)); + + ap.gdtr_limit = GDT_SIZE - 1; + ap.gdtr_base = (unsigned long) get_cpu_gdt_table(phys_apicid); + + ap.idtr_limit = IDT_ENTRIES * 8 - 1; + ap.idtr_base = (unsigned long) idt_table; + + ap.ldtr = 0; + + ap.cs = __KERNEL_CS; + ap.eip = (unsigned long) start_eip; + ap.ss = __KERNEL_DS; + ap.esp = (unsigned long) start_esp; + + ap.ds = __USER_DS; + ap.es = __USER_DS; + ap.fs = __KERNEL_PDA; + ap.gs = 0; + + ap.eflags = 0; + + setup_pda(); + +#ifdef CONFIG_X86_PAE + /* efer should match BSP efer. */ + if (cpu_has_nx) { + unsigned l, h; + rdmsr(MSR_EFER, l, h); + ap.efer = (unsigned long long) h << 32 | l; + } +#endif + + ap.cr3 = __pa(swapper_pg_dir); + /* Protected mode, paging, AM, WP, NE, MP. */ + ap.cr0 = 0x80050023; + ap.cr4 = mmu_cr4_features; + vmi_ops.set_initial_ap_state(__pa(&ap), phys_apicid); +} +#endif + +static inline int __init check_vmi_rom(struct vrom_header *rom) +{ + struct pci_header *pci; + struct pnp_header *pnp; + const char *manufacturer = "UNKNOWN"; + const char *product = "UNKNOWN"; + const char *license = "unspecified"; + + if (rom->rom_signature != 0xaa55) + return 0; + if (rom->vrom_signature != VMI_SIGNATURE) + return 0; + if (rom->api_version_maj != VMI_API_REV_MAJOR || + rom->api_version_min+1 < VMI_API_REV_MINOR+1) { + printk(KERN_WARNING "VMI: Found mismatched rom version %d.%d\n", + rom->api_version_maj, + rom->api_version_min); + return 0; + } + + /* + * Relying on the VMI_SIGNATURE field is not 100% safe, so check + * the PCI header and device type to make sure this is really a + * VMI device. + */ + if (!rom->pci_header_offs) { + printk(KERN_WARNING "VMI: ROM does not contain PCI header.\n"); + return 0; + } + + pci = (struct pci_header *)((char *)rom+rom->pci_header_offs); + if (pci->vendorID != PCI_VENDOR_ID_VMWARE || + pci->deviceID != PCI_DEVICE_ID_VMWARE_VMI) { + /* Allow it to run... anyways, but warn */ + printk(KERN_WARNING "VMI: ROM from unknown manufacturer\n"); + } + + if (rom->pnp_header_offs) { + pnp = (struct pnp_header *)((char *)rom+rom->pnp_header_offs); + if (pnp->manufacturer_offset) + manufacturer = (const char *)rom+pnp->manufacturer_offset; + if (pnp->product_offset) + product = (const char *)rom+pnp->product_offset; + } + + if (rom->license_offs) + license = (char *)rom+rom->license_offs; + + printk(KERN_INFO "VMI: Found %s %s, API version %d.%d, ROM version %d.%d\n", + manufacturer, product, + rom->api_version_maj, rom->api_version_min, + pci->rom_version_maj, pci->rom_version_min); + + license_gplok = license_is_gpl_compatible(license); + if (!license_gplok) { + printk(KERN_WARNING "VMI: ROM license '%s' taints kernel... " + "inlining disabled\n", + license); + add_taint(TAINT_PROPRIETARY_MODULE); + } + return 1; +} + +/* + * Probe for the VMI option ROM + */ +static inline int __init probe_vmi_rom(void) +{ + unsigned long base; + + /* VMI ROM is in option ROM area, check signature */ + for (base = 0xC0000; base < 0xE0000; base += 2048) { + struct vrom_header *romstart; + romstart = (struct vrom_header *)isa_bus_to_virt(base); + if (check_vmi_rom(romstart)) { + vmi_rom = romstart; + return 1; + } + } + return 0; +} + +/* + * VMI setup common to all processors + */ +void vmi_bringup(void) +{ + /* We must establish the lowmem mapping for MMU ops to work */ + if (vmi_rom) + vmi_ops.set_linear_mapping(0, __PAGE_OFFSET, max_low_pfn, 0); +} + +/* + * Return a pointer to the VMI function or a NOP stub + */ +static void *vmi_get_function(int vmicall) +{ + u64 reloc; + const struct vmi_relocation_info *rel = (struct vmi_relocation_info *)&reloc; + reloc = call_vrom_long_func(vmi_rom, get_reloc, vmicall); + BUG_ON(rel->type == VMI_RELOCATION_JUMP_REL); + if (rel->type == VMI_RELOCATION_CALL_REL) + return (void *)rel->eip; + else + return (void *)vmi_nop; +} + +/* + * Helper macro for making the VMI paravirt-ops fill code readable. + * For unimplemented operations, fall back to default. + */ +#define para_fill(opname, vmicall) \ +do { \ + reloc = call_vrom_long_func(vmi_rom, get_reloc, \ + VMI_CALL_##vmicall); \ + if (rel->type != VMI_RELOCATION_NONE) { \ + BUG_ON(rel->type != VMI_RELOCATION_CALL_REL); \ + paravirt_ops.opname = (void *)rel->eip; \ + } \ +} while (0) + +/* + * Activate the VMI interface and switch into paravirtualized mode + */ +static inline int __init activate_vmi(void) +{ + short kernel_cs; + u64 reloc; + const struct vmi_relocation_info *rel = (struct vmi_relocation_info *)&reloc; + + if (call_vrom_func(vmi_rom, vmi_init) != 0) { + printk(KERN_ERR "VMI ROM failed to initialize!"); + return 0; + } + savesegment(cs, kernel_cs); + + paravirt_ops.paravirt_enabled = 1; + paravirt_ops.kernel_rpl = kernel_cs & SEGMENT_RPL_MASK; + + paravirt_ops.patch = vmi_patch; + paravirt_ops.name = "vmi"; + + /* + * Many of these operations are ABI compatible with VMI. + * This means we can fill in the paravirt-ops with direct + * pointers into the VMI ROM. If the calling convention for + * these operations changes, this code needs to be updated. + * + * Exceptions + * CPUID paravirt-op uses pointers, not the native ISA + * halt has no VMI equivalent; all VMI halts are "safe" + * no MSR support yet - just trap and emulate. VMI uses the + * same ABI as the native ISA, but Linux wants exceptions + * from bogus MSR read / write handled + * rdpmc is not yet used in Linux + */ + + /* CPUID is special, so very special */ + reloc = call_vrom_long_func(vmi_rom, get_reloc, VMI_CALL_CPUID); + if (rel->type != VMI_RELOCATION_NONE) { + BUG_ON(rel->type != VMI_RELOCATION_CALL_REL); + vmi_ops.cpuid = (void *)rel->eip; + paravirt_ops.cpuid = vmi_cpuid; + } + + para_fill(clts, CLTS); + para_fill(get_debugreg, GetDR); + para_fill(set_debugreg, SetDR); + para_fill(read_cr0, GetCR0); + para_fill(read_cr2, GetCR2); + para_fill(read_cr3, GetCR3); + para_fill(read_cr4, GetCR4); + para_fill(write_cr0, SetCR0); + para_fill(write_cr2, SetCR2); + para_fill(write_cr3, SetCR3); + para_fill(write_cr4, SetCR4); + para_fill(save_fl, GetInterruptMask); + para_fill(restore_fl, SetInterruptMask); + para_fill(irq_disable, DisableInterrupts); + para_fill(irq_enable, EnableInterrupts); + /* irq_save_disable !!! sheer pain */ + patch_offset(&irq_save_disable_callout[IRQ_PATCH_INT_MASK], + (char *)paravirt_ops.save_fl); + patch_offset(&irq_save_disable_callout[IRQ_PATCH_DISABLE], + (char *)paravirt_ops.irq_disable); +#ifndef CONFIG_NO_IDLE_HZ + para_fill(safe_halt, Halt); +#else + vmi_ops.halt = vmi_get_function(VMI_CALL_Halt); + paravirt_ops.safe_halt = vmi_safe_halt; +#endif + para_fill(wbinvd, WBINVD); + /* paravirt_ops.read_msr = vmi_rdmsr */ + /* paravirt_ops.write_msr = vmi_wrmsr */ + para_fill(read_tsc, RDTSC); + /* paravirt_ops.rdpmc = vmi_rdpmc */ + + /* TR interface doesn't pass TR value */ + reloc = call_vrom_long_func(vmi_rom, get_reloc, VMI_CALL_SetTR); + if (rel->type != VMI_RELOCATION_NONE) { + BUG_ON(rel->type != VMI_RELOCATION_CALL_REL); + vmi_ops.set_tr = (void *)rel->eip; + paravirt_ops.load_tr_desc = vmi_set_tr; + } + + /* LDT is special, too */ + reloc = call_vrom_long_func(vmi_rom, get_reloc, VMI_CALL_SetLDT); + if (rel->type != VMI_RELOCATION_NONE) { + BUG_ON(rel->type != VMI_RELOCATION_CALL_REL); + vmi_ops._set_ldt = (void *)rel->eip; + paravirt_ops.set_ldt = vmi_set_ldt; + } + + para_fill(load_gdt, SetGDT); + para_fill(load_idt, SetIDT); + para_fill(store_gdt, GetGDT); + para_fill(store_idt, GetIDT); + para_fill(store_tr, GetTR); + paravirt_ops.load_tls = vmi_load_tls; + para_fill(write_ldt_entry, WriteLDTEntry); + para_fill(write_gdt_entry, WriteGDTEntry); + para_fill(write_idt_entry, WriteIDTEntry); + reloc = call_vrom_long_func(vmi_rom, get_reloc, + VMI_CALL_UpdateKernelStack); + if (rel->type != VMI_RELOCATION_NONE) { + BUG_ON(rel->type != VMI_RELOCATION_CALL_REL); + vmi_ops.set_kernel_stack = (void *)rel->eip; + paravirt_ops.load_esp0 = vmi_load_esp0; + } + + para_fill(set_iopl_mask, SetIOPLMask); + paravirt_ops.io_delay = (void *)vmi_nop; + if (!disable_nodelay) { + paravirt_ops.const_udelay = (void *)vmi_nop; + } + + para_fill(set_lazy_mode, SetLazyMode); + + reloc = call_vrom_long_func(vmi_rom, get_reloc, VMI_CALL_FlushTLB); + if (rel->type != VMI_RELOCATION_NONE) { + vmi_ops.flush_tlb = (void *)rel->eip; + paravirt_ops.flush_tlb_user = vmi_flush_tlb_user; + paravirt_ops.flush_tlb_kernel = vmi_flush_tlb_kernel; + } + para_fill(flush_tlb_single, InvalPage); + + /* + * Until a standard flag format can be agreed on, we need to + * implement these as wrappers in Linux. Get the VMI ROM + * function pointers for the two backend calls. + */ +#ifdef CONFIG_X86_PAE + vmi_ops.set_pte = vmi_get_function(VMI_CALL_SetPxELong); + vmi_ops.update_pte = vmi_get_function(VMI_CALL_UpdatePxELong); +#else + vmi_ops.set_pte = vmi_get_function(VMI_CALL_SetPxE); + vmi_ops.update_pte = vmi_get_function(VMI_CALL_UpdatePxE); +#endif + vmi_ops.set_linear_mapping = vmi_get_function(VMI_CALL_SetLinearMapping); + vmi_ops.allocate_page = vmi_get_function(VMI_CALL_AllocatePage); + vmi_ops.release_page = vmi_get_function(VMI_CALL_ReleasePage); + + paravirt_ops.alloc_pt = vmi_allocate_pt; + paravirt_ops.alloc_pd = vmi_allocate_pd; + paravirt_ops.alloc_pd_clone = vmi_allocate_pd_clone; + paravirt_ops.release_pt = vmi_release_pt; + paravirt_ops.release_pd = vmi_release_pd; + paravirt_ops.set_pte = vmi_set_pte; + paravirt_ops.set_pte_at = vmi_set_pte_at; + paravirt_ops.set_pmd = vmi_set_pmd; + paravirt_ops.pte_update = vmi_update_pte; + paravirt_ops.pte_update_defer = vmi_update_pte_defer; +#ifdef CONFIG_X86_PAE + paravirt_ops.set_pte_atomic = vmi_set_pte_atomic; + paravirt_ops.set_pte_present = vmi_set_pte_present; + paravirt_ops.set_pud = vmi_set_pud; + paravirt_ops.pte_clear = vmi_pte_clear; + paravirt_ops.pmd_clear = vmi_pmd_clear; +#endif + /* + * These MUST always be patched. Don't support indirect jumps + * through these operations, as the VMI interface may use either + * a jump or a call to get to these operations, depending on + * the backend. They are performance critical anyway, so requiring + * a patch is not a big problem. + */ + paravirt_ops.irq_enable_sysexit = (void *)0xfeedbab0; + paravirt_ops.iret = (void *)0xbadbab0; + +#ifdef CONFIG_SMP + paravirt_ops.startup_ipi_hook = vmi_startup_ipi_hook; + vmi_ops.set_initial_ap_state = vmi_get_function(VMI_CALL_SetInitialAPState); +#endif + +#ifdef CONFIG_X86_LOCAL_APIC + paravirt_ops.apic_read = vmi_get_function(VMI_CALL_APICRead); + paravirt_ops.apic_write = vmi_get_function(VMI_CALL_APICWrite); + paravirt_ops.apic_write_atomic = vmi_get_function(VMI_CALL_APICWrite); +#endif + + /* + * Check for VMI timer functionality by probing for a cycle frequency method + */ + reloc = call_vrom_long_func(vmi_rom, get_reloc, VMI_CALL_GetCycleFrequency); + if (rel->type != VMI_RELOCATION_NONE) { + vmi_timer_ops.get_cycle_frequency = (void *)rel->eip; + vmi_timer_ops.get_cycle_counter = + vmi_get_function(VMI_CALL_GetCycleCounter); + vmi_timer_ops.get_wallclock = + vmi_get_function(VMI_CALL_GetWallclockTime); + vmi_timer_ops.wallclock_updated = + vmi_get_function(VMI_CALL_WallclockUpdated); + vmi_timer_ops.set_alarm = vmi_get_function(VMI_CALL_SetAlarm); + vmi_timer_ops.cancel_alarm = + vmi_get_function(VMI_CALL_CancelAlarm); + paravirt_ops.time_init = vmi_time_init; + paravirt_ops.get_wallclock = vmi_get_wallclock; + paravirt_ops.set_wallclock = vmi_set_wallclock; +#ifdef CONFIG_X86_LOCAL_APIC + paravirt_ops.setup_boot_clock = vmi_timer_setup_boot_alarm; + paravirt_ops.setup_secondary_clock = vmi_timer_setup_secondary_alarm; +#endif + custom_sched_clock = vmi_sched_clock; + } + + /* + * Alternative instruction rewriting doesn't happen soon enough + * to convert VMI_IRET to a call instead of a jump; so we have + * to do this before IRQs get reenabled. Fortunately, it is + * idempotent. + */ + apply_paravirt(__start_parainstructions, __stop_parainstructions); + + vmi_bringup(); + + return 1; +} + +#undef para_fill + +void __init vmi_init(void) +{ + unsigned long flags; + + if (!vmi_rom) + probe_vmi_rom(); + else + check_vmi_rom(vmi_rom); + + /* In case probing for or validating the ROM failed, basil */ + if (!vmi_rom) + return; + + reserve_top_address(-vmi_rom->virtual_top); + + local_irq_save(flags); + activate_vmi(); +#ifdef CONFIG_SMP + no_timer_check = 1; +#endif + local_irq_restore(flags & X86_EFLAGS_IF); +} + +static int __init parse_vmi(char *arg) +{ + if (!arg) + return -EINVAL; + + if (!strcmp(arg, "disable_nodelay")) + disable_nodelay = 1; + else if (!strcmp(arg, "disable_pge")) { + clear_bit(X86_FEATURE_PGE, boot_cpu_data.x86_capability); + disable_pge = 1; + } else if (!strcmp(arg, "disable_pse")) { + clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability); + disable_pse = 1; + } else if (!strcmp(arg, "disable_sep")) { + clear_bit(X86_FEATURE_SEP, boot_cpu_data.x86_capability); + disable_sep = 1; + } else if (!strcmp(arg, "disable_tsc")) { + clear_bit(X86_FEATURE_TSC, boot_cpu_data.x86_capability); + disable_tsc = 1; + } else if (!strcmp(arg, "disable_mtrr")) { + clear_bit(X86_FEATURE_MTRR, boot_cpu_data.x86_capability); + disable_mtrr = 1; + } + return 0; +} + +early_param("vmi", parse_vmi); diff --git a/arch/i386/kernel/vmitime.c b/arch/i386/kernel/vmitime.c new file mode 100644 index 00000000000..76d2adcae5a --- /dev/null +++ b/arch/i386/kernel/vmitime.c @@ -0,0 +1,499 @@ +/* + * VMI paravirtual timer support routines. + * + * Copyright (C) 2005, VMware, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + * Send feedback to dhecht@vmware.com + * + */ + +/* + * Portions of this code from arch/i386/kernel/timers/timer_tsc.c. + * Portions of the CONFIG_NO_IDLE_HZ code from arch/s390/kernel/time.c. + * See comments there for proper credits. + */ + +#include <linux/spinlock.h> +#include <linux/init.h> +#include <linux/errno.h> +#include <linux/jiffies.h> +#include <linux/interrupt.h> +#include <linux/kernel_stat.h> +#include <linux/rcupdate.h> +#include <linux/clocksource.h> + +#include <asm/timer.h> +#include <asm/io.h> +#include <asm/apic.h> +#include <asm/div64.h> +#include <asm/timer.h> +#include <asm/desc.h> + +#include <asm/vmi.h> +#include <asm/vmi_time.h> + +#include <mach_timer.h> +#include <io_ports.h> + +#ifdef CONFIG_X86_LOCAL_APIC +#define VMI_ALARM_WIRING VMI_ALARM_WIRED_LVTT +#else +#define VMI_ALARM_WIRING VMI_ALARM_WIRED_IRQ0 +#endif + +/* Cached VMI operations */ +struct vmi_timer_ops vmi_timer_ops; + +#ifdef CONFIG_NO_IDLE_HZ + +/* /proc/sys/kernel/hz_timer state. */ +int sysctl_hz_timer; + +/* Some stats */ +static DEFINE_PER_CPU(unsigned long, vmi_idle_no_hz_irqs); +static DEFINE_PER_CPU(unsigned long, vmi_idle_no_hz_jiffies); +static DEFINE_PER_CPU(unsigned long, idle_start_jiffies); + +#endif /* CONFIG_NO_IDLE_HZ */ + +/* Number of alarms per second. By default this is CONFIG_VMI_ALARM_HZ. */ +static int alarm_hz = CONFIG_VMI_ALARM_HZ; + +/* Cache of the value get_cycle_frequency / HZ. */ +static signed long long cycles_per_jiffy; + +/* Cache of the value get_cycle_frequency / alarm_hz. */ +static signed long long cycles_per_alarm; + +/* The number of cycles accounted for by the 'jiffies'/'xtime' count. + * Protected by xtime_lock. */ +static unsigned long long real_cycles_accounted_system; + +/* The number of cycles accounted for by update_process_times(), per cpu. */ +static DEFINE_PER_CPU(unsigned long long, process_times_cycles_accounted_cpu); + +/* The number of stolen cycles accounted, per cpu. */ +static DEFINE_PER_CPU(unsigned long long, stolen_cycles_accounted_cpu); + +/* Clock source. */ +static cycle_t read_real_cycles(void) +{ + return vmi_timer_ops.get_cycle_counter(VMI_CYCLES_REAL); +} + +static cycle_t read_available_cycles(void) +{ + return vmi_timer_ops.get_cycle_counter(VMI_CYCLES_AVAILABLE); +} + +#if 0 +static cycle_t read_stolen_cycles(void) +{ + return vmi_timer_ops.get_cycle_counter(VMI_CYCLES_STOLEN); +} +#endif /* 0 */ + +static struct clocksource clocksource_vmi = { + .name = "vmi-timer", + .rating = 450, + .read = read_real_cycles, + .mask = CLOCKSOURCE_MASK(64), + .mult = 0, /* to be set */ + .shift = 22, + .flags = CLOCK_SOURCE_IS_CONTINUOUS, +}; + + +/* Timer interrupt handler. */ +static irqreturn_t vmi_timer_interrupt(int irq, void *dev_id); + +static struct irqaction vmi_timer_irq = { + vmi_timer_interrupt, + SA_INTERRUPT, + CPU_MASK_NONE, + "VMI-alarm", + NULL, + NULL +}; + +/* Alarm rate */ +static int __init vmi_timer_alarm_rate_setup(char* str) +{ + int alarm_rate; + if (get_option(&str, &alarm_rate) == 1 && alarm_rate > 0) { + alarm_hz = alarm_rate; + printk(KERN_WARNING "VMI timer alarm HZ set to %d\n", alarm_hz); + } + return 1; +} +__setup("vmi_timer_alarm_hz=", vmi_timer_alarm_rate_setup); + + +/* Initialization */ +static void vmi_get_wallclock_ts(struct timespec *ts) +{ + unsigned long long wallclock; + wallclock = vmi_timer_ops.get_wallclock(); // nsec units + ts->tv_nsec = do_div(wallclock, 1000000000); + ts->tv_sec = wallclock; +} + +static void update_xtime_from_wallclock(void) +{ + struct timespec ts; + vmi_get_wallclock_ts(&ts); + do_settimeofday(&ts); +} + +unsigned long vmi_get_wallclock(void) +{ + struct timespec ts; + vmi_get_wallclock_ts(&ts); + return ts.tv_sec; +} + +int vmi_set_wallclock(unsigned long now) +{ + return -1; +} + +unsigned long long vmi_sched_clock(void) +{ + return read_available_cycles(); +} + +void __init vmi_time_init(void) +{ + unsigned long long cycles_per_sec, cycles_per_msec; + unsigned long flags; + + local_irq_save(flags); + setup_irq(0, &vmi_timer_irq); +#ifdef CONFIG_X86_LOCAL_APIC + set_intr_gate(LOCAL_TIMER_VECTOR, apic_vmi_timer_interrupt); +#endif + + no_sync_cmos_clock = 1; + + vmi_get_wallclock_ts(&xtime); + set_normalized_timespec(&wall_to_monotonic, + -xtime.tv_sec, -xtime.tv_nsec); + + real_cycles_accounted_system = read_real_cycles(); + update_xtime_from_wallclock(); + per_cpu(process_times_cycles_accounted_cpu, 0) = read_available_cycles(); + + cycles_per_sec = vmi_timer_ops.get_cycle_frequency(); + + cycles_per_jiffy = cycles_per_sec; + (void)do_div(cycles_per_jiffy, HZ); + cycles_per_alarm = cycles_per_sec; + (void)do_div(cycles_per_alarm, alarm_hz); + cycles_per_msec = cycles_per_sec; + (void)do_div(cycles_per_msec, 1000); + cpu_khz = cycles_per_msec; + + printk(KERN_WARNING "VMI timer cycles/sec = %llu ; cycles/jiffy = %llu ;" + "cycles/alarm = %llu\n", cycles_per_sec, cycles_per_jiffy, + cycles_per_alarm); + + clocksource_vmi.mult = clocksource_khz2mult(cycles_per_msec, + clocksource_vmi.shift); + if (clocksource_register(&clocksource_vmi)) + printk(KERN_WARNING "Error registering VMITIME clocksource."); + + /* Disable PIT. */ + outb_p(0x3a, PIT_MODE); /* binary, mode 5, LSB/MSB, ch 0 */ + + /* schedule the alarm. do this in phase with process_times_cycles_accounted_cpu + * reduce the latency calling update_process_times. */ + vmi_timer_ops.set_alarm( + VMI_ALARM_WIRED_IRQ0 | VMI_ALARM_IS_PERIODIC | VMI_CYCLES_AVAILABLE, + per_cpu(process_times_cycles_accounted_cpu, 0) + cycles_per_alarm, + cycles_per_alarm); + + local_irq_restore(flags); +} + +#ifdef CONFIG_X86_LOCAL_APIC + +void __init vmi_timer_setup_boot_alarm(void) +{ + local_irq_disable(); + + /* Route the interrupt to the correct vector. */ + apic_write_around(APIC_LVTT, LOCAL_TIMER_VECTOR); + + /* Cancel the IRQ0 wired alarm, and setup the LVTT alarm. */ + vmi_timer_ops.cancel_alarm(VMI_CYCLES_AVAILABLE); + vmi_timer_ops.set_alarm( + VMI_ALARM_WIRED_LVTT | VMI_ALARM_IS_PERIODIC | VMI_CYCLES_AVAILABLE, + per_cpu(process_times_cycles_accounted_cpu, 0) + cycles_per_alarm, + cycles_per_alarm); + local_irq_enable(); +} + +/* Initialize the time accounting variables for an AP on an SMP system. + * Also, set the local alarm for the AP. */ +void __init vmi_timer_setup_secondary_alarm(void) +{ + int cpu = smp_processor_id(); + + /* Route the interrupt to the correct vector. */ + apic_write_around(APIC_LVTT, LOCAL_TIMER_VECTOR); + + per_cpu(process_times_cycles_accounted_cpu, cpu) = read_available_cycles(); + + vmi_timer_ops.set_alarm( + VMI_ALARM_WIRED_LVTT | VMI_ALARM_IS_PERIODIC | VMI_CYCLES_AVAILABLE, + per_cpu(process_times_cycles_accounted_cpu, cpu) + cycles_per_alarm, + cycles_per_alarm); +} + +#endif + +/* Update system wide (real) time accounting (e.g. jiffies, xtime). */ +static void vmi_account_real_cycles(unsigned long long cur_real_cycles) +{ + long long cycles_not_accounted; + + write_seqlock(&xtime_lock); + + cycles_not_accounted = cur_real_cycles - real_cycles_accounted_system; + while (cycles_not_accounted >= cycles_per_jiffy) { + /* systems wide jiffies and wallclock. */ + do_timer(1); + + cycles_not_accounted -= cycles_per_jiffy; + real_cycles_accounted_system += cycles_per_jiffy; + } + + if (vmi_timer_ops.wallclock_updated()) + update_xtime_from_wallclock(); + + write_sequnlock(&xtime_lock); +} + +/* Update per-cpu process times. */ +static void vmi_account_process_times_cycles(struct pt_regs *regs, int cpu, + unsigned long long cur_process_times_cycles) +{ + long long cycles_not_accounted; + cycles_not_accounted = cur_process_times_cycles - + per_cpu(process_times_cycles_accounted_cpu, cpu); + + while (cycles_not_accounted >= cycles_per_jiffy) { + /* Account time to the current process. This includes + * calling into the scheduler to decrement the timeslice + * and possibly reschedule.*/ + update_process_times(user_mode(regs)); + /* XXX handle /proc/profile multiplier. */ + profile_tick(CPU_PROFILING); + + cycles_not_accounted -= cycles_per_jiffy; + per_cpu(process_times_cycles_accounted_cpu, cpu) += cycles_per_jiffy; + } +} + +#ifdef CONFIG_NO_IDLE_HZ +/* Update per-cpu idle times. Used when a no-hz halt is ended. */ +static void vmi_account_no_hz_idle_cycles(int cpu, + unsigned long long cur_process_times_cycles) +{ + long long cycles_not_accounted; + unsigned long no_idle_hz_jiffies = 0; + + cycles_not_accounted = cur_process_times_cycles - + per_cpu(process_times_cycles_accounted_cpu, cpu); + + while (cycles_not_accounted >= cycles_per_jiffy) { + no_idle_hz_jiffies++; + cycles_not_accounted -= cycles_per_jiffy; + per_cpu(process_times_cycles_accounted_cpu, cpu) += cycles_per_jiffy; + } + /* Account time to the idle process. */ + account_steal_time(idle_task(cpu), jiffies_to_cputime(no_idle_hz_jiffies)); +} +#endif + +/* Update per-cpu stolen time. */ +static void vmi_account_stolen_cycles(int cpu, + unsigned long long cur_real_cycles, + unsigned long long cur_avail_cycles) +{ + long long stolen_cycles_not_accounted; + unsigned long stolen_jiffies = 0; + + if (cur_real_cycles < cur_avail_cycles) + return; + + stolen_cycles_not_accounted = cur_real_cycles - cur_avail_cycles - + per_cpu(stolen_cycles_accounted_cpu, cpu); + + while (stolen_cycles_not_accounted >= cycles_per_jiffy) { + stolen_jiffies++; + stolen_cycles_not_accounted -= cycles_per_jiffy; + per_cpu(stolen_cycles_accounted_cpu, cpu) += cycles_per_jiffy; + } + /* HACK: pass NULL to force time onto cpustat->steal. */ + account_steal_time(NULL, jiffies_to_cputime(stolen_jiffies)); +} + +/* Body of either IRQ0 interrupt handler (UP no local-APIC) or + * local-APIC LVTT interrupt handler (UP & local-APIC or SMP). */ +static void vmi_local_timer_interrupt(int cpu) +{ + unsigned long long cur_real_cycles, cur_process_times_cycles; + + cur_real_cycles = read_real_cycles(); + cur_process_times_cycles = read_available_cycles(); + /* Update system wide (real) time state (xtime, jiffies). */ + vmi_account_real_cycles(cur_real_cycles); + /* Update per-cpu process times. */ + vmi_account_process_times_cycles(get_irq_regs(), cpu, cur_process_times_cycles); + /* Update time stolen from this cpu by the hypervisor. */ + vmi_account_stolen_cycles(cpu, cur_real_cycles, cur_process_times_cycles); +} + +#ifdef CONFIG_NO_IDLE_HZ + +/* Must be called only from idle loop, with interrupts disabled. */ +int vmi_stop_hz_timer(void) +{ + /* Note that cpu_set, cpu_clear are (SMP safe) atomic on x86. */ + + unsigned long seq, next; + unsigned long long real_cycles_expiry; + int cpu = smp_processor_id(); + int idle; + + BUG_ON(!irqs_disabled()); + if (sysctl_hz_timer != 0) + return 0; + + cpu_set(cpu, nohz_cpu_mask); + smp_mb(); + if (rcu_needs_cpu(cpu) || local_softirq_pending() || + (next = next_timer_interrupt(), time_before_eq(next, jiffies))) { + cpu_clear(cpu, nohz_cpu_mask); + next = jiffies; + idle = 0; + } else + idle = 1; + + /* Convert jiffies to the real cycle counter. */ + do { + seq = read_seqbegin(&xtime_lock); + real_cycles_expiry = real_cycles_accounted_system + + (long)(next - jiffies) * cycles_per_jiffy; + } while (read_seqretry(&xtime_lock, seq)); + + /* This cpu is going idle. Disable the periodic alarm. */ + if (idle) { + vmi_timer_ops.cancel_alarm(VMI_CYCLES_AVAILABLE); + per_cpu(idle_start_jiffies, cpu) = jiffies; + } + + /* Set the real time alarm to expire at the next event. */ + vmi_timer_ops.set_alarm( + VMI_ALARM_WIRING | VMI_ALARM_IS_ONESHOT | VMI_CYCLES_REAL, + real_cycles_expiry, 0); + + return idle; +} + +static void vmi_reenable_hz_timer(int cpu) +{ + /* For /proc/vmi/info idle_hz stat. */ + per_cpu(vmi_idle_no_hz_jiffies, cpu) += jiffies - per_cpu(idle_start_jiffies, cpu); + per_cpu(vmi_idle_no_hz_irqs, cpu)++; + + /* Don't bother explicitly cancelling the one-shot alarm -- at + * worse we will receive a spurious timer interrupt. */ + vmi_timer_ops.set_alarm( + VMI_ALARM_WIRING | VMI_ALARM_IS_PERIODIC | VMI_CYCLES_AVAILABLE, + per_cpu(process_times_cycles_accounted_cpu, cpu) + cycles_per_alarm, + cycles_per_alarm); + /* Indicate this cpu is no longer nohz idle. */ + cpu_clear(cpu, nohz_cpu_mask); +} + +/* Called from interrupt handlers when (local) HZ timer is disabled. */ +void vmi_account_time_restart_hz_timer(void) +{ + unsigned long long cur_real_cycles, cur_process_times_cycles; + int cpu = smp_processor_id(); + + BUG_ON(!irqs_disabled()); + /* Account the time during which the HZ timer was disabled. */ + cur_real_cycles = read_real_cycles(); + cur_process_times_cycles = read_available_cycles(); + /* Update system wide (real) time state (xtime, jiffies). */ + vmi_account_real_cycles(cur_real_cycles); + /* Update per-cpu idle times. */ + vmi_account_no_hz_idle_cycles(cpu, cur_process_times_cycles); + /* Update time stolen from this cpu by the hypervisor. */ + vmi_account_stolen_cycles(cpu, cur_real_cycles, cur_process_times_cycles); + /* Reenable the hz timer. */ + vmi_reenable_hz_timer(cpu); +} + +#endif /* CONFIG_NO_IDLE_HZ */ + +/* UP (and no local-APIC) VMI-timer alarm interrupt handler. + * Handler for IRQ0. Not used when SMP or X86_LOCAL_APIC after + * APIC setup and setup_boot_vmi_alarm() is called. */ +static irqreturn_t vmi_timer_interrupt(int irq, void *dev_id) +{ + vmi_local_timer_interrupt(smp_processor_id()); + return IRQ_HANDLED; +} + +#ifdef CONFIG_X86_LOCAL_APIC + +/* SMP VMI-timer alarm interrupt handler. Handler for LVTT vector. + * Also used in UP when CONFIG_X86_LOCAL_APIC. + * The wrapper code is from arch/i386/kernel/apic.c#smp_apic_timer_interrupt. */ +void smp_apic_vmi_timer_interrupt(struct pt_regs *regs) +{ + struct pt_regs *old_regs = set_irq_regs(regs); + int cpu = smp_processor_id(); + + /* + * the NMI deadlock-detector uses this. + */ + per_cpu(irq_stat,cpu).apic_timer_irqs++; + + /* + * NOTE! We'd better ACK the irq immediately, + * because timer handling can be slow. + */ + ack_APIC_irq(); + + /* + * update_process_times() expects us to have done irq_enter(). + * Besides, if we don't timer interrupts ignore the global + * interrupt lock, which is the WrongThing (tm) to do. + */ + irq_enter(); + vmi_local_timer_interrupt(cpu); + irq_exit(); + set_irq_regs(old_regs); +} + +#endif /* CONFIG_X86_LOCAL_APIC */ diff --git a/arch/i386/kernel/vmlinux.lds.S b/arch/i386/kernel/vmlinux.lds.S index a53c8b1854b..ca51610955d 100644 --- a/arch/i386/kernel/vmlinux.lds.S +++ b/arch/i386/kernel/vmlinux.lds.S @@ -37,9 +37,14 @@ SECTIONS { . = LOAD_OFFSET + LOAD_PHYSICAL_ADDR; phys_startup_32 = startup_32 - LOAD_OFFSET; + + .text.head : AT(ADDR(.text.head) - LOAD_OFFSET) { + _text = .; /* Text and read-only data */ + *(.text.head) + } :text = 0x9090 + /* read-only */ .text : AT(ADDR(.text) - LOAD_OFFSET) { - _text = .; /* Text and read-only data */ *(.text) SCHED_TEXT LOCK_TEXT @@ -181,12 +186,14 @@ SECTIONS from .altinstructions and .eh_frame */ .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { *(.exit.text) } .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) { *(.exit.data) } +#if defined(CONFIG_BLK_DEV_INITRD) . = ALIGN(4096); .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) { __initramfs_start = .; *(.init.ramfs) __initramfs_end = .; } +#endif . = ALIGN(L1_CACHE_BYTES); .data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) { __per_cpu_start = .; |