aboutsummaryrefslogtreecommitdiff
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig30
-rw-r--r--arch/x86/kernel/tsc.c235
-rw-r--r--arch/x86/kvm/paging_tmpl.h2
-rw-r--r--arch/x86/pci/i386.c87
4 files changed, 198 insertions, 156 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 0a80d6a5e9f..21ef9dd3618 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -577,35 +577,29 @@ config SWIOTLB
config IOMMU_HELPER
def_bool (CALGARY_IOMMU || GART_IOMMU || SWIOTLB || AMD_IOMMU)
+
config MAXSMP
bool "Configure Maximum number of SMP Processors and NUMA Nodes"
- depends on X86_64 && SMP
+ depends on X86_64 && SMP && BROKEN
default n
help
Configure maximum number of CPUS and NUMA Nodes for this architecture.
If unsure, say N.
-if MAXSMP
-config NR_CPUS
- int
- default "4096"
-endif
-
-if !MAXSMP
config NR_CPUS
- int "Maximum number of CPUs (2-4096)"
- range 2 4096
+ int "Maximum number of CPUs (2-512)" if !MAXSMP
+ range 2 512
depends on SMP
+ default "4096" if MAXSMP
default "32" if X86_NUMAQ || X86_SUMMIT || X86_BIGSMP || X86_ES7000
default "8"
help
This allows you to specify the maximum number of CPUs which this
- kernel will support. The maximum supported value is 4096 and the
+ kernel will support. The maximum supported value is 512 and the
minimum value which makes sense is 2.
This is purely to save memory - each supported CPU adds
approximately eight kilobytes to the kernel image.
-endif
config SCHED_SMT
bool "SMT (Hyperthreading) scheduler support"
@@ -996,17 +990,10 @@ config NUMA_EMU
into virtual nodes when booted with "numa=fake=N", where N is the
number of nodes. This is only useful for debugging.
-if MAXSMP
-
config NODES_SHIFT
- int
- default "9"
-endif
-
-if !MAXSMP
-config NODES_SHIFT
- int "Maximum NUMA Nodes (as a power of 2)"
+ int "Maximum NUMA Nodes (as a power of 2)" if !MAXSMP
range 1 9 if X86_64
+ default "9" if MAXSMP
default "6" if X86_64
default "4" if X86_NUMAQ
default "3"
@@ -1014,7 +1001,6 @@ config NODES_SHIFT
help
Specify the maximum number of NUMA Nodes available on the target
system. Increases memory reserved to accomodate various tables.
-endif
config HAVE_ARCH_BOOTMEM_NODE
def_bool y
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 8e786b0d665..346cae5ac42 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -122,80 +122,217 @@ static u64 tsc_read_refs(u64 *pm, u64 *hpet)
return ULLONG_MAX;
}
-/**
- * native_calibrate_tsc - calibrate the tsc on boot
+/*
+ * Try to calibrate the TSC against the Programmable
+ * Interrupt Timer and return the frequency of the TSC
+ * in kHz.
+ *
+ * Return ULONG_MAX on failure to calibrate.
*/
-unsigned long native_calibrate_tsc(void)
+static unsigned long pit_calibrate_tsc(void)
{
- unsigned long flags;
- u64 tsc1, tsc2, tr1, tr2, delta, pm1, pm2, hpet1, hpet2;
- int hpet = is_hpet_enabled();
- unsigned int tsc_khz_val = 0;
-
- local_irq_save(flags);
-
- tsc1 = tsc_read_refs(&pm1, hpet ? &hpet1 : NULL);
+ u64 tsc, t1, t2, delta;
+ unsigned long tscmin, tscmax;
+ int pitcnt;
+ /* Set the Gate high, disable speaker */
outb((inb(0x61) & ~0x02) | 0x01, 0x61);
+ /*
+ * Setup CTC channel 2* for mode 0, (interrupt on terminal
+ * count mode), binary count. Set the latch register to 50ms
+ * (LSB then MSB) to begin countdown.
+ */
outb(0xb0, 0x43);
outb((CLOCK_TICK_RATE / (1000 / 50)) & 0xff, 0x42);
outb((CLOCK_TICK_RATE / (1000 / 50)) >> 8, 0x42);
- tr1 = get_cycles();
- while ((inb(0x61) & 0x20) == 0);
- tr2 = get_cycles();
- tsc2 = tsc_read_refs(&pm2, hpet ? &hpet2 : NULL);
+ tsc = t1 = t2 = get_cycles();
- local_irq_restore(flags);
+ pitcnt = 0;
+ tscmax = 0;
+ tscmin = ULONG_MAX;
+ while ((inb(0x61) & 0x20) == 0) {
+ t2 = get_cycles();
+ delta = t2 - tsc;
+ tsc = t2;
+ if ((unsigned long) delta < tscmin)
+ tscmin = (unsigned int) delta;
+ if ((unsigned long) delta > tscmax)
+ tscmax = (unsigned int) delta;
+ pitcnt++;
+ }
/*
- * Preset the result with the raw and inaccurate PIT
- * calibration value
+ * Sanity checks:
+ *
+ * If we were not able to read the PIT more than 5000
+ * times, then we have been hit by a massive SMI
+ *
+ * If the maximum is 10 times larger than the minimum,
+ * then we got hit by an SMI as well.
*/
- delta = (tr2 - tr1);
+ if (pitcnt < 5000 || tscmax > 10 * tscmin)
+ return ULONG_MAX;
+
+ /* Calculate the PIT value */
+ delta = t2 - t1;
do_div(delta, 50);
- tsc_khz_val = delta;
+ return delta;
+}
+
+
+/**
+ * native_calibrate_tsc - calibrate the tsc on boot
+ */
+unsigned long native_calibrate_tsc(void)
+{
+ u64 tsc1, tsc2, delta, pm1, pm2, hpet1, hpet2;
+ unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX;
+ unsigned long flags;
+ int hpet = is_hpet_enabled(), i;
- /* hpet or pmtimer available ? */
+ /*
+ * Run 5 calibration loops to get the lowest frequency value
+ * (the best estimate). We use two different calibration modes
+ * here:
+ *
+ * 1) PIT loop. We set the PIT Channel 2 to oneshot mode and
+ * load a timeout of 50ms. We read the time right after we
+ * started the timer and wait until the PIT count down reaches
+ * zero. In each wait loop iteration we read the TSC and check
+ * the delta to the previous read. We keep track of the min
+ * and max values of that delta. The delta is mostly defined
+ * by the IO time of the PIT access, so we can detect when a
+ * SMI/SMM disturbance happend between the two reads. If the
+ * maximum time is significantly larger than the minimum time,
+ * then we discard the result and have another try.
+ *
+ * 2) Reference counter. If available we use the HPET or the
+ * PMTIMER as a reference to check the sanity of that value.
+ * We use separate TSC readouts and check inside of the
+ * reference read for a SMI/SMM disturbance. We dicard
+ * disturbed values here as well. We do that around the PIT
+ * calibration delay loop as we have to wait for a certain
+ * amount of time anyway.
+ */
+ for (i = 0; i < 5; i++) {
+ unsigned long tsc_pit_khz;
+
+ /*
+ * Read the start value and the reference count of
+ * hpet/pmtimer when available. Then do the PIT
+ * calibration, which will take at least 50ms, and
+ * read the end value.
+ */
+ local_irq_save(flags);
+ tsc1 = tsc_read_refs(&pm1, hpet ? &hpet1 : NULL);
+ tsc_pit_khz = pit_calibrate_tsc();
+ tsc2 = tsc_read_refs(&pm2, hpet ? &hpet2 : NULL);
+ local_irq_restore(flags);
+
+ /* Pick the lowest PIT TSC calibration so far */
+ tsc_pit_min = min(tsc_pit_min, tsc_pit_khz);
+
+ /* hpet or pmtimer available ? */
+ if (!hpet && !pm1 && !pm2)
+ continue;
+
+ /* Check, whether the sampling was disturbed by an SMI */
+ if (tsc1 == ULLONG_MAX || tsc2 == ULLONG_MAX)
+ continue;
+
+ tsc2 = (tsc2 - tsc1) * 1000000LL;
+
+ if (hpet) {
+ if (hpet2 < hpet1)
+ hpet2 += 0x100000000ULL;
+ hpet2 -= hpet1;
+ tsc1 = ((u64)hpet2 * hpet_readl(HPET_PERIOD));
+ do_div(tsc1, 1000000);
+ } else {
+ if (pm2 < pm1)
+ pm2 += (u64)ACPI_PM_OVRRUN;
+ pm2 -= pm1;
+ tsc1 = pm2 * 1000000000LL;
+ do_div(tsc1, PMTMR_TICKS_PER_SEC);
+ }
+
+ do_div(tsc2, tsc1);
+ tsc_ref_min = min(tsc_ref_min, (unsigned long) tsc2);
+ }
+
+ /*
+ * Now check the results.
+ */
+ if (tsc_pit_min == ULONG_MAX) {
+ /* PIT gave no useful value */
+ printk(KERN_WARNING "TSC: PIT calibration failed due to "
+ "SMI disturbance.\n");
+
+ /* We don't have an alternative source, disable TSC */
+ if (!hpet && !pm1 && !pm2) {
+ printk("TSC: No reference (HPET/PMTIMER) available\n");
+ return 0;
+ }
+
+ /* The alternative source failed as well, disable TSC */
+ if (tsc_ref_min == ULONG_MAX) {
+ printk(KERN_WARNING "TSC: HPET/PMTIMER calibration "
+ "failed due to SMI disturbance.\n");
+ return 0;
+ }
+
+ /* Use the alternative source */
+ printk(KERN_INFO "TSC: using %s reference calibration\n",
+ hpet ? "HPET" : "PMTIMER");
+
+ return tsc_ref_min;
+ }
+
+ /* We don't have an alternative source, use the PIT calibration value */
if (!hpet && !pm1 && !pm2) {
- printk(KERN_INFO "TSC calibrated against PIT\n");
- goto out;
+ printk(KERN_INFO "TSC: Using PIT calibration value\n");
+ return tsc_pit_min;
}
- /* Check, whether the sampling was disturbed by an SMI */
- if (tsc1 == ULLONG_MAX || tsc2 == ULLONG_MAX) {
- printk(KERN_WARNING "TSC calibration disturbed by SMI, "
- "using PIT calibration result\n");
- goto out;
+ /* The alternative source failed, use the PIT calibration value */
+ if (tsc_ref_min == ULONG_MAX) {
+ printk(KERN_WARNING "TSC: HPET/PMTIMER calibration failed due "
+ "to SMI disturbance. Using PIT calibration\n");
+ return tsc_pit_min;
}
- tsc2 = (tsc2 - tsc1) * 1000000LL;
-
- if (hpet) {
- printk(KERN_INFO "TSC calibrated against HPET\n");
- if (hpet2 < hpet1)
- hpet2 += 0x100000000ULL;
- hpet2 -= hpet1;
- tsc1 = ((u64)hpet2 * hpet_readl(HPET_PERIOD));
- do_div(tsc1, 1000000);
- } else {
- printk(KERN_INFO "TSC calibrated against PM_TIMER\n");
- if (pm2 < pm1)
- pm2 += (u64)ACPI_PM_OVRRUN;
- pm2 -= pm1;
- tsc1 = pm2 * 1000000000LL;
- do_div(tsc1, PMTMR_TICKS_PER_SEC);
+ /* Check the reference deviation */
+ delta = ((u64) tsc_pit_min) * 100;
+ do_div(delta, tsc_ref_min);
+
+ /*
+ * If both calibration results are inside a 5% window, the we
+ * use the lower frequency of those as it is probably the
+ * closest estimate.
+ */
+ if (delta >= 95 && delta <= 105) {
+ printk(KERN_INFO "TSC: PIT calibration confirmed by %s.\n",
+ hpet ? "HPET" : "PMTIMER");
+ printk(KERN_INFO "TSC: using %s calibration value\n",
+ tsc_pit_min <= tsc_ref_min ? "PIT" :
+ hpet ? "HPET" : "PMTIMER");
+ return tsc_pit_min <= tsc_ref_min ? tsc_pit_min : tsc_ref_min;
}
- do_div(tsc2, tsc1);
- tsc_khz_val = tsc2;
+ printk(KERN_WARNING "TSC: PIT calibration deviates from %s: %lu %lu.\n",
+ hpet ? "HPET" : "PMTIMER", tsc_pit_min, tsc_ref_min);
-out:
- return tsc_khz_val;
+ /*
+ * The calibration values differ too much. In doubt, we use
+ * the PIT value as we know that there are PMTIMERs around
+ * running at double speed.
+ */
+ printk(KERN_INFO "TSC: Using PIT calibration value\n");
+ return tsc_pit_min;
}
-
#ifdef CONFIG_X86_32
/* Only called from the Powernow K7 cpu freq driver */
int recalibrate_cpu_khz(void)
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index f72ac1fa35f..4a814bff21f 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -345,7 +345,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
shadow_addr = __pa(shadow_page->spt);
shadow_pte = shadow_addr | PT_PRESENT_MASK | PT_ACCESSED_MASK
| PT_WRITABLE_MASK | PT_USER_MASK;
- *shadow_ent = shadow_pte;
+ set_shadow_pte(shadow_ent, shadow_pte);
}
mmu_set_spte(vcpu, shadow_ent, access, walker->pte_access & access,
diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c
index d765da91384..8791fc55e71 100644
--- a/arch/x86/pci/i386.c
+++ b/arch/x86/pci/i386.c
@@ -31,11 +31,8 @@
#include <linux/ioport.h>
#include <linux/errno.h>
#include <linux/bootmem.h>
-#include <linux/acpi.h>
#include <asm/pat.h>
-#include <asm/hpet.h>
-#include <asm/io_apic.h>
#include "pci.h"
@@ -80,77 +77,6 @@ pcibios_align_resource(void *data, struct resource *res,
}
EXPORT_SYMBOL(pcibios_align_resource);
-static int check_res_with_valid(struct pci_dev *dev, struct resource *res)
-{
- unsigned long base;
- unsigned long size;
- int i;
-
- base = res->start;
- size = (res->start == 0 && res->end == res->start) ? 0 :
- (res->end - res->start + 1);
-
- if (!base || !size)
- return 0;
-
-#ifdef CONFIG_HPET_TIMER
- /* for hpet */
- if (base == hpet_address && (res->flags & IORESOURCE_MEM)) {
- dev_info(&dev->dev, "BAR has HPET at %08lx-%08lx\n",
- base, base + size - 1);
- return 1;
- }
-#endif
-
-#ifdef CONFIG_X86_IO_APIC
- for (i = 0; i < nr_ioapics; i++) {
- unsigned long ioapic_phys = mp_ioapics[i].mp_apicaddr;
-
- if (base == ioapic_phys && (res->flags & IORESOURCE_MEM)) {
- dev_info(&dev->dev, "BAR has ioapic at %08lx-%08lx\n",
- base, base + size - 1);
- return 1;
- }
- }
-#endif
-
-#ifdef CONFIG_PCI_MMCONFIG
- for (i = 0; i < pci_mmcfg_config_num; i++) {
- unsigned long addr;
-
- addr = pci_mmcfg_config[i].address;
- if (base == addr && (res->flags & IORESOURCE_MEM)) {
- dev_info(&dev->dev, "BAR has MMCONFIG at %08lx-%08lx\n",
- base, base + size - 1);
- return 1;
- }
- }
-#endif
-
- return 0;
-}
-
-static int check_platform(struct pci_dev *dev, struct resource *res)
-{
- struct resource *root = NULL;
-
- /*
- * forcibly insert it into the
- * resource tree
- */
- if (res->flags & IORESOURCE_MEM)
- root = &iomem_resource;
- else if (res->flags & IORESOURCE_IO)
- root = &ioport_resource;
-
- if (root && check_res_with_valid(dev, res)) {
- insert_resource(root, res);
-
- return 1;
- }
-
- return 0;
-}
/*
* Handle resources of PCI devices. If the world were perfect, we could
* just allocate all the resource regions and do nothing more. It isn't.
@@ -202,10 +128,7 @@ static void __init pcibios_allocate_bus_resources(struct list_head *bus_list)
pr = pci_find_parent_resource(dev, r);
if (!r->start || !pr ||
request_resource(pr, r) < 0) {
- if (check_platform(dev, r))
- continue;
- dev_err(&dev->dev, "BAR %d: can't "
- "allocate resource\n", idx);
+ dev_err(&dev->dev, "BAR %d: can't allocate resource\n", idx);
/*
* Something is wrong with the region.
* Invalidate the resource to prevent
@@ -240,17 +163,13 @@ static void __init pcibios_allocate_resources(int pass)
else
disabled = !(command & PCI_COMMAND_MEMORY);
if (pass == disabled) {
- dev_dbg(&dev->dev, "resource %#08llx-%#08llx "
- "(f=%lx, d=%d, p=%d)\n",
+ dev_dbg(&dev->dev, "resource %#08llx-%#08llx (f=%lx, d=%d, p=%d)\n",
(unsigned long long) r->start,
(unsigned long long) r->end,
r->flags, disabled, pass);
pr = pci_find_parent_resource(dev, r);
if (!pr || request_resource(pr, r) < 0) {
- if (check_platform(dev, r))
- continue;
- dev_err(&dev->dev, "BAR %d: can't "
- "allocate resource\n", idx);
+ dev_err(&dev->dev, "BAR %d: can't allocate resource\n", idx);
/* We'll assign a new address later */
r->end -= r->start;
r->start = 0;