diff options
Diffstat (limited to 'arch/x86')
99 files changed, 2911 insertions, 1310 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 98a0ed52b5c..249d1e0824b 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -247,6 +247,28 @@ config X86_HAS_BOOT_CPU_ID def_bool y depends on X86_VOYAGER +config SPARSE_IRQ + bool "Support sparse irq numbering" + depends on PCI_MSI || HT_IRQ + help + This enables support for sparse irqs. This is useful for distro + kernels that want to define a high CONFIG_NR_CPUS value but still + want to have low kernel memory footprint on smaller machines. + + ( Sparse IRQs can also be beneficial on NUMA boxes, as they spread + out the irq_desc[] array in a more NUMA-friendly way. ) + + If you don't know what to do here, say N. + +config NUMA_MIGRATE_IRQ_DESC + bool "Move irq desc when changing irq smp_affinity" + depends on SPARSE_IRQ && NUMA + default n + help + This enables moving irq_desc to cpu/node that irq will use handled. + + If you don't know what to do here, say N. + config X86_FIND_SMP_CONFIG def_bool y depends on X86_MPPARSE || X86_VOYAGER @@ -479,7 +501,7 @@ config HPET_TIMER The HPET provides a stable time base on SMP systems, unlike the TSC, but it is more expensive to access, as it is off-chip. You can find the HPET spec at - <http://www.intel.com/hardwaredesign/hpetspec.htm>. + <http://www.intel.com/hardwaredesign/hpetspec_1.pdf>. You can safely choose Y here. However, HPET will only be activated if the platform and the BIOS support this feature. @@ -579,19 +601,20 @@ config IOMMU_HELPER config MAXSMP bool "Configure Maximum number of SMP Processors and NUMA Nodes" - depends on X86_64 && SMP && BROKEN + depends on X86_64 && SMP && DEBUG_KERNEL && EXPERIMENTAL + select CPUMASK_OFFSTACK default n help Configure maximum number of CPUS and NUMA Nodes for this architecture. If unsure, say N. config NR_CPUS - int "Maximum number of CPUs (2-512)" if !MAXSMP - range 2 512 - depends on SMP + int "Maximum number of CPUs" if SMP && !MAXSMP + range 2 512 if SMP && !MAXSMP + default "1" if !SMP default "4096" if MAXSMP - default "32" if X86_NUMAQ || X86_SUMMIT || X86_BIGSMP || X86_ES7000 - default "8" + default "32" if SMP && (X86_NUMAQ || X86_SUMMIT || X86_BIGSMP || X86_ES7000) + default "8" if SMP help This allows you to specify the maximum number of CPUs which this kernel will support. The maximum supported value is 512 and the diff --git a/arch/x86/include/asm/bigsmp/apic.h b/arch/x86/include/asm/bigsmp/apic.h index ce547f24a1c..d8dd9f53791 100644 --- a/arch/x86/include/asm/bigsmp/apic.h +++ b/arch/x86/include/asm/bigsmp/apic.h @@ -9,12 +9,12 @@ static inline int apic_id_registered(void) return (1); } -static inline cpumask_t target_cpus(void) +static inline const cpumask_t *target_cpus(void) { #ifdef CONFIG_SMP - return cpu_online_map; + return &cpu_online_map; #else - return cpumask_of_cpu(0); + return &cpumask_of_cpu(0); #endif } @@ -79,7 +79,7 @@ static inline int apicid_to_node(int logical_apicid) static inline int cpu_present_to_apicid(int mps_cpu) { - if (mps_cpu < NR_CPUS) + if (mps_cpu < nr_cpu_ids) return (int) per_cpu(x86_bios_cpu_apicid, mps_cpu); return BAD_APICID; @@ -94,7 +94,7 @@ extern u8 cpu_2_logical_apicid[]; /* Mapping from cpu number to logical apicid */ static inline int cpu_to_logical_apicid(int cpu) { - if (cpu >= NR_CPUS) + if (cpu >= nr_cpu_ids) return BAD_APICID; return cpu_physical_id(cpu); } @@ -119,16 +119,34 @@ static inline int check_phys_apicid_present(int boot_cpu_physical_apicid) } /* As we are using single CPU as destination, pick only one CPU here */ -static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask) +static inline unsigned int cpu_mask_to_apicid(const cpumask_t *cpumask) { int cpu; int apicid; - cpu = first_cpu(cpumask); + cpu = first_cpu(*cpumask); apicid = cpu_to_logical_apicid(cpu); return apicid; } +static inline unsigned int cpu_mask_to_apicid_and(const struct cpumask *cpumask, + const struct cpumask *andmask) +{ + int cpu; + + /* + * We're using fixed IRQ delivery, can only return one phys APIC ID. + * May as well be the first. + */ + for_each_cpu_and(cpu, cpumask, andmask) + if (cpumask_test_cpu(cpu, cpu_online_mask)) + break; + if (cpu < nr_cpu_ids) + return cpu_to_logical_apicid(cpu); + + return BAD_APICID; +} + static inline u32 phys_pkg_id(u32 cpuid_apic, int index_msb) { return cpuid_apic >> index_msb; diff --git a/arch/x86/include/asm/bigsmp/ipi.h b/arch/x86/include/asm/bigsmp/ipi.h index 9404c535b7e..27fcd01b3ae 100644 --- a/arch/x86/include/asm/bigsmp/ipi.h +++ b/arch/x86/include/asm/bigsmp/ipi.h @@ -1,25 +1,22 @@ #ifndef __ASM_MACH_IPI_H #define __ASM_MACH_IPI_H -void send_IPI_mask_sequence(cpumask_t mask, int vector); +void send_IPI_mask_sequence(const struct cpumask *mask, int vector); +void send_IPI_mask_allbutself(const struct cpumask *mask, int vector); -static inline void send_IPI_mask(cpumask_t mask, int vector) +static inline void send_IPI_mask(const struct cpumask *mask, int vector) { send_IPI_mask_sequence(mask, vector); } static inline void send_IPI_allbutself(int vector) { - cpumask_t mask = cpu_online_map; - cpu_clear(smp_processor_id(), mask); - - if (!cpus_empty(mask)) - send_IPI_mask(mask, vector); + send_IPI_mask_allbutself(cpu_online_mask, vector); } static inline void send_IPI_all(int vector) { - send_IPI_mask(cpu_online_map, vector); + send_IPI_mask(cpu_online_mask, vector); } #endif /* __ASM_MACH_IPI_H */ diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h index e6b82b17b07..dc27705f544 100644 --- a/arch/x86/include/asm/desc.h +++ b/arch/x86/include/asm/desc.h @@ -320,16 +320,14 @@ static inline void set_intr_gate(unsigned int n, void *addr) _set_gate(n, GATE_INTERRUPT, addr, 0, 0, __KERNEL_CS); } -#define SYS_VECTOR_FREE 0 -#define SYS_VECTOR_ALLOCED 1 - extern int first_system_vector; -extern char system_vectors[]; +/* used_vectors is BITMAP for irq is not managed by percpu vector_irq */ +extern unsigned long used_vectors[]; static inline void alloc_system_vector(int vector) { - if (system_vectors[vector] == SYS_VECTOR_FREE) { - system_vectors[vector] = SYS_VECTOR_ALLOCED; + if (!test_bit(vector, used_vectors)) { + set_bit(vector, used_vectors); if (first_system_vector > vector) first_system_vector = vector; } else diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h index dc22c073328..4035357f5b9 100644 --- a/arch/x86/include/asm/dma-mapping.h +++ b/arch/x86/include/asm/dma-mapping.h @@ -65,7 +65,7 @@ static inline struct dma_mapping_ops *get_dma_ops(struct device *dev) return dma_ops; else return dev->archdata.dma_ops; -#endif /* _ASM_X86_DMA_MAPPING_H */ +#endif } /* Make sure we keep the same behaviour */ diff --git a/arch/x86/include/asm/es7000/apic.h b/arch/x86/include/asm/es7000/apic.h index e24ef876915..51ac1230294 100644 --- a/arch/x86/include/asm/es7000/apic.h +++ b/arch/x86/include/asm/es7000/apic.h @@ -9,14 +9,14 @@ static inline int apic_id_registered(void) return (1); } -static inline cpumask_t target_cpus_cluster(void) +static inline const cpumask_t *target_cpus_cluster(void) { - return CPU_MASK_ALL; + return &CPU_MASK_ALL; } -static inline cpumask_t target_cpus(void) +static inline const cpumask_t *target_cpus(void) { - return cpumask_of_cpu(smp_processor_id()); + return &cpumask_of_cpu(smp_processor_id()); } #define APIC_DFR_VALUE_CLUSTER (APIC_DFR_CLUSTER) @@ -80,9 +80,10 @@ extern int apic_version [MAX_APICS]; static inline void setup_apic_routing(void) { int apic = per_cpu(x86_bios_cpu_apicid, smp_processor_id()); - printk("Enabling APIC mode: %s. Using %d I/O APICs, target cpus %lx\n", + printk("Enabling APIC mode: %s. Using %d I/O APICs, target cpus %lx\n", (apic_version[apic] == 0x14) ? - "Physical Cluster" : "Logical Cluster", nr_ioapics, cpus_addr(target_cpus())[0]); + "Physical Cluster" : "Logical Cluster", + nr_ioapics, cpus_addr(*target_cpus())[0]); } static inline int multi_timer_check(int apic, int irq) @@ -100,7 +101,7 @@ static inline int cpu_present_to_apicid(int mps_cpu) { if (!mps_cpu) return boot_cpu_physical_apicid; - else if (mps_cpu < NR_CPUS) + else if (mps_cpu < nr_cpu_ids) return (int) per_cpu(x86_bios_cpu_apicid, mps_cpu); else return BAD_APICID; @@ -120,9 +121,9 @@ extern u8 cpu_2_logical_apicid[]; static inline int cpu_to_logical_apicid(int cpu) { #ifdef CONFIG_SMP - if (cpu >= NR_CPUS) - return BAD_APICID; - return (int)cpu_2_logical_apicid[cpu]; + if (cpu >= nr_cpu_ids) + return BAD_APICID; + return (int)cpu_2_logical_apicid[cpu]; #else return logical_smp_processor_id(); #endif @@ -146,14 +147,15 @@ static inline int check_phys_apicid_present(int cpu_physical_apicid) return (1); } -static inline unsigned int cpu_mask_to_apicid_cluster(cpumask_t cpumask) +static inline unsigned int +cpu_mask_to_apicid_cluster(const struct cpumask *cpumask) { int num_bits_set; int cpus_found = 0; int cpu; int apicid; - num_bits_set = cpus_weight(cpumask); + num_bits_set = cpumask_weight(cpumask); /* Return id to all */ if (num_bits_set == NR_CPUS) return 0xFF; @@ -161,10 +163,10 @@ static inline unsigned int cpu_mask_to_apicid_cluster(cpumask_t cpumask) * The cpus in the mask must all be on the apic cluster. If are not * on the same apicid cluster return default value of TARGET_CPUS. */ - cpu = first_cpu(cpumask); + cpu = cpumask_first(cpumask); apicid = cpu_to_logical_apicid(cpu); while (cpus_found < num_bits_set) { - if (cpu_isset(cpu, cpumask)) { + if (cpumask_test_cpu(cpu, cpumask)) { int new_apicid = cpu_to_logical_apicid(cpu); if (apicid_cluster(apicid) != apicid_cluster(new_apicid)){ @@ -179,14 +181,14 @@ static inline unsigned int cpu_mask_to_apicid_cluster(cpumask_t cpumask) return apicid; } -static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask) +static inline unsigned int cpu_mask_to_apicid(const cpumask_t *cpumask) { int num_bits_set; int cpus_found = 0; int cpu; int apicid; - num_bits_set = cpus_weight(cpumask); + num_bits_set = cpus_weight(*cpumask); /* Return id to all */ if (num_bits_set == NR_CPUS) return cpu_to_logical_apicid(0); @@ -194,10 +196,52 @@ static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask) * The cpus in the mask must all be on the apic cluster. If are not * on the same apicid cluster return default value of TARGET_CPUS. */ - cpu = first_cpu(cpumask); + cpu = first_cpu(*cpumask); + apicid = cpu_to_logical_apicid(cpu); + while (cpus_found < num_bits_set) { + if (cpu_isset(cpu, *cpumask)) { + int new_apicid = cpu_to_logical_apicid(cpu); + if (apicid_cluster(apicid) != + apicid_cluster(new_apicid)){ + printk ("%s: Not a valid mask!\n", __func__); + return cpu_to_logical_apicid(0); + } + apicid = new_apicid; + cpus_found++; + } + cpu++; + } + return apicid; +} + + +static inline unsigned int cpu_mask_to_apicid_and(const struct cpumask *inmask, + const struct cpumask *andmask) +{ + int num_bits_set; + int cpus_found = 0; + int cpu; + int apicid = cpu_to_logical_apicid(0); + cpumask_var_t cpumask; + + if (!alloc_cpumask_var(&cpumask, GFP_ATOMIC)) + return apicid; + + cpumask_and(cpumask, inmask, andmask); + cpumask_and(cpumask, cpumask, cpu_online_mask); + + num_bits_set = cpumask_weight(cpumask); + /* Return id to all */ + if (num_bits_set == NR_CPUS) + goto exit; + /* + * The cpus in the mask must all be on the apic cluster. If are not + * on the same apicid cluster return default value of TARGET_CPUS. + */ + cpu = cpumask_first(cpumask); apicid = cpu_to_logical_apicid(cpu); while (cpus_found < num_bits_set) { - if (cpu_isset(cpu, cpumask)) { + if (cpumask_test_cpu(cpu, cpumask)) { int new_apicid = cpu_to_logical_apicid(cpu); if (apicid_cluster(apicid) != apicid_cluster(new_apicid)){ @@ -209,6 +253,8 @@ static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask) } cpu++; } +exit: + free_cpumask_var(cpumask); return apicid; } diff --git a/arch/x86/include/asm/es7000/ipi.h b/arch/x86/include/asm/es7000/ipi.h index 632a955fcc0..7e8ed24d4b8 100644 --- a/arch/x86/include/asm/es7000/ipi.h +++ b/arch/x86/include/asm/es7000/ipi.h @@ -1,24 +1,22 @@ #ifndef __ASM_ES7000_IPI_H #define __ASM_ES7000_IPI_H -void send_IPI_mask_sequence(cpumask_t mask, int vector); +void send_IPI_mask_sequence(const struct cpumask *mask, int vector); +void send_IPI_mask_allbutself(const struct cpumask *mask, int vector); -static inline void send_IPI_mask(cpumask_t mask, int vector) +static inline void send_IPI_mask(const struct cpumask *mask, int vector) { send_IPI_mask_sequence(mask, vector); } static inline void send_IPI_allbutself(int vector) { - cpumask_t mask = cpu_online_map; - cpu_clear(smp_processor_id(), mask); - if (!cpus_empty(mask)) - send_IPI_mask(mask, vector); + send_IPI_mask_allbutself(cpu_online_mask, vector); } static inline void send_IPI_all(int vector) { - send_IPI_mask(cpu_online_map, vector); + send_IPI_mask(cpu_online_mask, vector); } #endif /* __ASM_ES7000_IPI_H */ diff --git a/arch/x86/include/asm/genapic_32.h b/arch/x86/include/asm/genapic_32.h index 0ac17d33a8c..746f37a7963 100644 --- a/arch/x86/include/asm/genapic_32.h +++ b/arch/x86/include/asm/genapic_32.h @@ -24,7 +24,7 @@ struct genapic { int (*probe)(void); int (*apic_id_registered)(void); - cpumask_t (*target_cpus)(void); + const struct cpumask *(*target_cpus)(void); int int_delivery_mode; int int_dest_mode; int ESR_DISABLE; @@ -57,12 +57,16 @@ struct genapic { unsigned (*get_apic_id)(unsigned long x); unsigned long apic_id_mask; - unsigned int (*cpu_mask_to_apicid)(cpumask_t cpumask); - cpumask_t (*vector_allocation_domain)(int cpu); + unsigned int (*cpu_mask_to_apicid)(const struct cpumask *cpumask); + unsigned int (*cpu_mask_to_apicid_and)(const struct cpumask *cpumask, + const struct cpumask *andmask); + void (*vector_allocation_domain)(int cpu, struct cpumask *retmask); #ifdef CONFIG_SMP /* ipi */ - void (*send_IPI_mask)(cpumask_t mask, int vector); + void (*send_IPI_mask)(const struct cpumask *mask, int vector); + void (*send_IPI_mask_allbutself)(const struct cpumask *mask, + int vector); void (*send_IPI_allbutself)(int vector); void (*send_IPI_all)(int vector); #endif @@ -114,6 +118,7 @@ struct genapic { APICFUNC(get_apic_id) \ .apic_id_mask = APIC_ID_MASK, \ APICFUNC(cpu_mask_to_apicid) \ + APICFUNC(cpu_mask_to_apicid_and) \ APICFUNC(vector_allocation_domain) \ APICFUNC(acpi_madt_oem_check) \ IPIFUNC(send_IPI_mask) \ diff --git a/arch/x86/include/asm/genapic_64.h b/arch/x86/include/asm/genapic_64.h index 2cae011668b..adf32fb56aa 100644 --- a/arch/x86/include/asm/genapic_64.h +++ b/arch/x86/include/asm/genapic_64.h @@ -1,6 +1,8 @@ #ifndef _ASM_X86_GENAPIC_64_H #define _ASM_X86_GENAPIC_64_H +#include <linux/cpumask.h> + /* * Copyright 2004 James Cleverdon, IBM. * Subject to the GNU Public License, v.2 @@ -18,16 +20,20 @@ struct genapic { u32 int_delivery_mode; u32 int_dest_mode; int (*apic_id_registered)(void); - cpumask_t (*target_cpus)(void); - cpumask_t (*vector_allocation_domain)(int cpu); + const struct cpumask *(*target_cpus)(void); + void (*vector_allocation_domain)(int cpu, struct cpumask *retmask); void (*init_apic_ldr)(void); /* ipi */ - void (*send_IPI_mask)(cpumask_t mask, int vector); + void (*send_IPI_mask)(const struct cpumask *mask, int vector); + void (*send_IPI_mask_allbutself)(const struct cpumask *mask, + int vector); void (*send_IPI_allbutself)(int vector); void (*send_IPI_all)(int vector); void (*send_IPI_self)(int vector); /* */ - unsigned int (*cpu_mask_to_apicid)(cpumask_t cpumask); + unsigned int (*cpu_mask_to_apicid)(const struct cpumask *cpumask); + unsigned int (*cpu_mask_to_apicid_and)(const struct cpumask *cpumask, + const struct cpumask *andmask); unsigned int (*phys_pkg_id)(int index_msb); unsigned int (*get_apic_id)(unsigned long x); unsigned long (*set_apic_id)(unsigned int id); diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h index e475e009ae5..7a1f44ac1f1 100644 --- a/arch/x86/include/asm/io_apic.h +++ b/arch/x86/include/asm/io_apic.h @@ -198,17 +198,14 @@ extern void restore_IO_APIC_setup(void); extern void reinit_intr_remapped_IO_APIC(int); #endif -extern int probe_nr_irqs(void); +extern void probe_nr_irqs_gsi(void); #else /* !CONFIG_X86_IO_APIC */ #define io_apic_assign_pci_irqs 0 static const int timer_through_8259 = 0; -static inline void ioapic_init_mappings(void) { } +static inline void ioapic_init_mappings(void) { } -static inline int probe_nr_irqs(void) -{ - return NR_IRQS; -} +static inline void probe_nr_irqs_gsi(void) { } #endif #endif /* _ASM_X86_IO_APIC_H */ diff --git a/arch/x86/include/asm/iommu.h b/arch/x86/include/asm/iommu.h index 295b13193f4..a6ee9e6f530 100644 --- a/arch/x86/include/asm/iommu.h +++ b/arch/x86/include/asm/iommu.h @@ -7,8 +7,6 @@ extern struct dma_mapping_ops nommu_dma_ops; extern int force_iommu, no_iommu; extern int iommu_detected; -extern unsigned long iommu_nr_pages(unsigned long addr, unsigned long len); - /* 10 seconds */ #define DMAR_OPERATION_TIMEOUT ((cycles_t) tsc_khz*10*1000) diff --git a/arch/x86/include/asm/ipi.h b/arch/x86/include/asm/ipi.h index f89dffb28aa..c745a306f7d 100644 --- a/arch/x86/include/asm/ipi.h +++ b/arch/x86/include/asm/ipi.h @@ -117,7 +117,8 @@ static inline void __send_IPI_dest_field(unsigned int mask, int vector, native_apic_mem_write(APIC_ICR, cfg); } -static inline void send_IPI_mask_sequence(cpumask_t mask, int vector) +static inline void send_IPI_mask_sequence(const struct cpumask *mask, + int vector) { unsigned long flags; unsigned long query_cpu; @@ -128,11 +129,29 @@ static inline void send_IPI_mask_sequence(cpumask_t mask, int vector) * - mbligh */ local_irq_save(flags); - for_each_cpu_mask_nr(query_cpu, mask) { + for_each_cpu(query_cpu, mask) { __send_IPI_dest_field(per_cpu(x86_cpu_to_apicid, query_cpu), vector, APIC_DEST_PHYSICAL); } local_irq_restore(flags); } +static inline void send_IPI_mask_allbutself(const struct cpumask *mask, + int vector) +{ + unsigned long flags; + unsigned int query_cpu; + unsigned int this_cpu = smp_processor_id(); + + /* See Hack comment above */ + + local_irq_save(flags); + for_each_cpu(query_cpu, mask) + if (query_cpu != this_cpu) + __send_IPI_dest_field( + per_cpu(x86_cpu_to_apicid, query_cpu), + vector, APIC_DEST_PHYSICAL); + local_irq_restore(flags); +} + #endif /* _ASM_X86_IPI_H */ diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h index 28e409fc73f..592688ed04d 100644 --- a/arch/x86/include/asm/irq.h +++ b/arch/x86/include/asm/irq.h @@ -33,7 +33,7 @@ static inline int irq_canonicalize(int irq) #ifdef CONFIG_HOTPLUG_CPU #include <linux/cpumask.h> -extern void fixup_irqs(cpumask_t map); +extern void fixup_irqs(void); #endif extern unsigned int do_IRQ(struct pt_regs *regs); @@ -42,5 +42,6 @@ extern void native_init_IRQ(void); /* Interrupt vector management */ extern DECLARE_BITMAP(used_vectors, NR_VECTORS); +extern int vector_used_by_percpu_irq(unsigned int vector); #endif /* _ASM_X86_IRQ_H */ diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h index 0005adb0f94..f7ff65032b9 100644 --- a/arch/x86/include/asm/irq_vectors.h +++ b/arch/x86/include/asm/irq_vectors.h @@ -101,12 +101,23 @@ #define LAST_VM86_IRQ 15 #define invalid_vm86_irq(irq) ((irq) < 3 || (irq) > 15) +#define NR_IRQS_LEGACY 16 + #if defined(CONFIG_X86_IO_APIC) && !defined(CONFIG_X86_VOYAGER) + +#ifndef CONFIG_SPARSE_IRQ # if NR_CPUS < MAX_IO_APICS # define NR_IRQS (NR_VECTORS + (32 * NR_CPUS)) # else # define NR_IRQS (NR_VECTORS + (32 * MAX_IO_APICS)) # endif +#else +# if (8 * NR_CPUS) > (32 * MAX_IO_APICS) +# define NR_IRQS (NR_VECTORS + (8 * NR_CPUS)) +# else +# define NR_IRQS (NR_VECTORS + (32 * MAX_IO_APICS)) +# endif +#endif #elif defined(CONFIG_X86_VOYAGER) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 8346be87cfa..97215a458e5 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -21,6 +21,7 @@ #include <asm/pvclock-abi.h> #include <asm/desc.h> +#include <asm/mtrr.h> #define KVM_MAX_VCPUS 16 #define KVM_MEMORY_SLOTS 32 @@ -86,6 +87,7 @@ #define KVM_MIN_FREE_MMU_PAGES 5 #define KVM_REFILL_PAGES 25 #define KVM_MAX_CPUID_ENTRIES 40 +#define KVM_NR_FIXED_MTRR_REGION 88 #define KVM_NR_VAR_MTRR 8 extern spinlock_t kvm_lock; @@ -180,6 +182,8 @@ struct kvm_mmu_page { struct list_head link; struct hlist_node hash_link; + struct list_head oos_link; + /* * The following two entries are used to key the shadow page in the * hash table. @@ -190,13 +194,16 @@ struct kvm_mmu_page { u64 *spt; /* hold the gfn of each spte inside spt */ gfn_t *gfns; - unsigned long slot_bitmap; /* One bit set per slot which has memory - * in this shadow page. - */ + /* + * One bit set per slot which has memory + * in this shadow page. + */ + DECLARE_BITMAP(slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS); int multimapped; /* More than one parent_pte? */ int root_count; /* Currently serving as active root */ bool unsync; - bool unsync_children; + bool global; + unsigned int unsync_children; union { u64 *parent_pte; /* !multimapped */ struct hlist_head parent_ptes; /* multimapped, kvm_pte_chain */ @@ -327,8 +334,10 @@ struct kvm_vcpu_arch { bool nmi_pending; bool nmi_injected; + bool nmi_window_open; - u64 mtrr[0x100]; + struct mtrr_state_type mtrr_state; + u32 pat; }; struct kvm_mem_alias { @@ -350,11 +359,13 @@ struct kvm_arch{ */ struct list_head active_mmu_pages; struct list_head assigned_dev_head; + struct list_head oos_global_pages; struct dmar_domain *intel_iommu_domain; struct kvm_pic *vpic; struct kvm_ioapic *vioapic; struct kvm_pit *vpit; struct hlist_head irq_ack_notifier_list; + int vapics_in_nmi_mode; int round_robin_prev_vcpu; unsigned int tss_addr; @@ -378,6 +389,7 @@ struct kvm_vm_stat { u32 mmu_recycled; u32 mmu_cache_miss; u32 mmu_unsync; + u32 mmu_unsync_global; u32 remote_tlb_flush; u32 lpages; }; @@ -397,6 +409,7 @@ struct kvm_vcpu_stat { u32 halt_exits; u32 halt_wakeup; u32 request_irq_exits; + u32 request_nmi_exits; u32 irq_exits; u32 host_state_reload; u32 efer_reload; @@ -405,6 +418,7 @@ struct kvm_vcpu_stat { u32 insn_emulation_fail; u32 hypercalls; u32 irq_injections; + u32 nmi_injections; }; struct descriptor_table { @@ -477,6 +491,7 @@ struct kvm_x86_ops { int (*set_tss_addr)(struct kvm *kvm, unsigned int addr); int (*get_tdp_level)(void); + int (*get_mt_mask_shift)(void); }; extern struct kvm_x86_ops *kvm_x86_ops; @@ -490,7 +505,7 @@ int kvm_mmu_setup(struct kvm_vcpu *vcpu); void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte); void kvm_mmu_set_base_ptes(u64 base_pte); void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, - u64 dirty_mask, u64 nx_mask, u64 x_mask); + u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 mt_mask); int kvm_mmu_reset_context(struct kvm_vcpu *vcpu); void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot); @@ -587,12 +602,14 @@ unsigned long segment_base(u16 selector); void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu); void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, - const u8 *new, int bytes); + const u8 *new, int bytes, + bool guest_initiated); int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva); void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu); int kvm_mmu_load(struct kvm_vcpu *vcpu); void kvm_mmu_unload(struct kvm_vcpu *vcpu); void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu); +void kvm_mmu_sync_global(struct kvm_vcpu *vcpu); int kvm_emulate_hypercall(struct kvm_vcpu *vcpu); @@ -607,6 +624,8 @@ void kvm_disable_tdp(void); int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3); int complete_pio(struct kvm_vcpu *vcpu); +struct kvm_memory_slot *gfn_to_memslot_unaliased(struct kvm *kvm, gfn_t gfn); + static inline struct kvm_mmu_page *page_header(hpa_t shadow_page) { struct page *page = pfn_to_page(shadow_page >> PAGE_SHIFT); @@ -702,18 +721,6 @@ static inline void kvm_inject_gp(struct kvm_vcpu *vcpu, u32 error_code) kvm_queue_exception_e(vcpu, GP_VECTOR, error_code); } -#define ASM_VMX_VMCLEAR_RAX ".byte 0x66, 0x0f, 0xc7, 0x30" -#define ASM_VMX_VMLAUNCH ".byte 0x0f, 0x01, 0xc2" -#define ASM_VMX_VMRESUME ".byte 0x0f, 0x01, 0xc3" -#define ASM_VMX_VMPTRLD_RAX ".byte 0x0f, 0xc7, 0x30" -#define ASM_VMX_VMREAD_RDX_RAX ".byte 0x0f, 0x78, 0xd0" -#define ASM_VMX_VMWRITE_RAX_RDX ".byte 0x0f, 0x79, 0xd0" -#define ASM_VMX_VMWRITE_RSP_RDX ".byte 0x0f, 0x79, 0xd4" -#define ASM_VMX_VMXOFF ".byte 0x0f, 0x01, 0xc4" -#define ASM_VMX_VMXON_RAX ".byte 0xf3, 0x0f, 0xc7, 0x30" -#define ASM_VMX_INVEPT ".byte 0x66, 0x0f, 0x38, 0x80, 0x08" -#define ASM_VMX_INVVPID ".byte 0x66, 0x0f, 0x38, 0x81, 0x08" - #define MSR_IA32_TIME_STAMP_COUNTER 0x010 #define TSS_IOPB_BASE_OFFSET 0x66 diff --git a/arch/x86/include/asm/kvm_x86_emulate.h b/arch/x86/include/asm/kvm_x86_emulate.h index 25179a29f20..6a159732881 100644 --- a/arch/x86/include/asm/kvm_x86_emulate.h +++ b/arch/x86/include/asm/kvm_x86_emulate.h @@ -123,6 +123,7 @@ struct decode_cache { u8 ad_bytes; u8 rex_prefix; struct operand src; + struct operand src2; struct operand dst; bool has_seg_override; u8 seg_override; @@ -146,22 +147,18 @@ struct x86_emulate_ctxt { /* Register state before/after emulation. */ struct kvm_vcpu *vcpu; - /* Linear faulting address (if emulating a page-faulting instruction) */ unsigned long eflags; - /* Emulated execution mode, represented by an X86EMUL_MODE value. */ int mode; - u32 cs_base; /* decode cache */ - struct decode_cache decode; }; /* Repeat String Operation Prefix */ -#define REPE_PREFIX 1 -#define REPNE_PREFIX 2 +#define REPE_PREFIX 1 +#define REPNE_PREFIX 2 /* Execution mode, passed to the emulator. */ #define X86EMUL_MODE_REAL 0 /* Real mode. */ @@ -170,7 +167,7 @@ struct x86_emulate_ctxt { #define X86EMUL_MODE_PROT64 8 /* 64-bit (long) mode. */ /* Host execution mode. */ -#if defined(__i386__) +#if defined(CONFIG_X86_32) #define X86EMUL_MODE_HOST X86EMUL_MODE_PROT32 #elif defined(CONFIG_X86_64) #define X86EMUL_MODE_HOST X86EMUL_MODE_PROT64 diff --git a/arch/x86/include/asm/mach-default/mach_apic.h b/arch/x86/include/asm/mach-default/mach_apic.h index 6cb3a467e06..cc09cbbee27 100644 --- a/arch/x86/include/asm/mach-default/mach_apic.h +++ b/arch/x86/include/asm/mach-default/mach_apic.h @@ -8,12 +8,12 @@ #define APIC_DFR_VALUE (APIC_DFR_FLAT) -static inline cpumask_t target_cpus(void) +static inline const struct cpumask *target_cpus(void) { #ifdef CONFIG_SMP - return cpu_online_map; + return cpu_online_mask; #else - return cpumask_of_cpu(0); + return cpumask_of(0); #endif } @@ -28,6 +28,7 @@ static inline cpumask_t target_cpus(void) #define apic_id_registered (genapic->apic_id_registered) #define init_apic_ldr (genapic->init_apic_ldr) #define cpu_mask_to_apicid (genapic->cpu_mask_to_apicid) +#define cpu_mask_to_apicid_and (genapic->cpu_mask_to_apicid_and) #define phys_pkg_id (genapic->phys_pkg_id) #define vector_allocation_domain (genapic->vector_allocation_domain) #define read_apic_id() (GET_APIC_ID(apic_read(APIC_ID))) @@ -61,9 +62,19 @@ static inline int apic_id_registered(void) return physid_isset(read_apic_id(), phys_cpu_present_map); } -static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask) +static inline unsigned int cpu_mask_to_apicid(const struct cpumask *cpumask) { - return cpus_addr(cpumask)[0]; + return cpumask_bits(cpumask)[0]; +} + +static inline unsigned int cpu_mask_to_apicid_and(const struct cpumask *cpumask, + const struct cpumask *andmask) +{ + unsigned long mask1 = cpumask_bits(cpumask)[0]; + unsigned long mask2 = cpumask_bits(andmask)[0]; + unsigned long mask3 = cpumask_bits(cpu_online_mask)[0]; + + return (unsigned int)(mask1 & mask2 & mask3); } static inline u32 phys_pkg_id(u32 cpuid_apic, int index_msb) @@ -88,7 +99,7 @@ static inline int apicid_to_node(int logical_apicid) #endif } -static inline cpumask_t vector_allocation_domain(int cpu) +static inline void vector_allocation_domain(int cpu, struct cpumask *retmask) { /* Careful. Some cpus do not strictly honor the set of cpus * specified in the interrupt destination when using lowest @@ -98,8 +109,7 @@ static inline cpumask_t vector_allocation_domain(int cpu) * deliver interrupts to the wrong hyperthread when only one * hyperthread was specified in the interrupt desitination. */ - cpumask_t domain = { { [0] = APIC_ALL_CPUS, } }; - return domain; + *retmask = (cpumask_t) { { [0] = APIC_ALL_CPUS } }; } #endif @@ -131,7 +141,7 @@ static inline int cpu_to_logical_apicid(int cpu) static inline int cpu_present_to_apicid(int mps_cpu) { - if (mps_cpu < NR_CPUS && cpu_present(mps_cpu)) + if (mps_cpu < nr_cpu_ids && cpu_present(mps_cpu)) return (int)per_cpu(x86_bios_cpu_apicid, mps_cpu); else return BAD_APICID; diff --git a/arch/x86/include/asm/mach-default/mach_ipi.h b/arch/x86/include/asm/mach-default/mach_ipi.h index fabca01ebac..191312d155d 100644 --- a/arch/x86/include/asm/mach-default/mach_ipi.h +++ b/arch/x86/include/asm/mach-default/mach_ipi.h @@ -4,7 +4,8 @@ /* Avoid include hell */ #define NMI_VECTOR 0x02 -void send_IPI_mask_bitmask(cpumask_t mask, int vector); +void send_IPI_mask_bitmask(const struct cpumask *mask, int vector); +void send_IPI_mask_allbutself(const struct cpumask *mask, int vector); void __send_IPI_shortcut(unsigned int shortcut, int vector); extern int no_broadcast; @@ -12,28 +13,27 @@ extern int no_broadcast; #ifdef CONFIG_X86_64 #include <asm/genapic.h> #define send_IPI_mask (genapic->send_IPI_mask) +#define send_IPI_mask_allbutself (genapic->send_IPI_mask_allbutself) #else -static inline void send_IPI_mask(cpumask_t mask, int vector) +static inline void send_IPI_mask(const struct cpumask *mask, int vector) { send_IPI_mask_bitmask(mask, vector); } +void send_IPI_mask_allbutself(const struct cpumask *mask, int vector); #endif static inline void __local_send_IPI_allbutself(int vector) { - if (no_broadcast || vector == NMI_VECTOR) { - cpumask_t mask = cpu_online_map; - - cpu_clear(smp_processor_id(), mask); - send_IPI_mask(mask, vector); - } else + if (no_broadcast || vector == NMI_VECTOR) + send_IPI_mask_allbutself(cpu_online_mask, vector); + else __send_IPI_shortcut(APIC_DEST_ALLBUT, vector); } static inline void __local_send_IPI_all(int vector) { if (no_broadcast || vector == NMI_VECTOR) - send_IPI_mask(cpu_online_map, vector); + send_IPI_mask(cpu_online_mask, vector); else __send_IPI_shortcut(APIC_DEST_ALLINC, vector); } diff --git a/arch/x86/include/asm/mach-generic/mach_apic.h b/arch/x86/include/asm/mach-generic/mach_apic.h index e430f47df66..48553e958ad 100644 --- a/arch/x86/include/asm/mach-generic/mach_apic.h +++ b/arch/x86/include/asm/mach-generic/mach_apic.h @@ -24,6 +24,7 @@ #define check_phys_apicid_present (genapic->check_phys_apicid_present) #define check_apicid_used (genapic->check_apicid_used) #define cpu_mask_to_apicid (genapic->cpu_mask_to_apicid) +#define cpu_mask_to_apicid_and (genapic->cpu_mask_to_apicid_and) #define vector_allocation_domain (genapic->vector_allocation_domain) #define enable_apic_mode (genapic->enable_apic_mode) #define phys_pkg_id (genapic->phys_pkg_id) diff --git a/arch/x86/include/asm/mtrr.h b/arch/x86/include/asm/mtrr.h index 7c1e4258b31..cb988aab716 100644 --- a/arch/x86/include/asm/mtrr.h +++ b/arch/x86/include/asm/mtrr.h @@ -57,6 +57,31 @@ struct mtrr_gentry { }; #endif /* !__i386__ */ +struct mtrr_var_range { + u32 base_lo; + u32 base_hi; + u32 mask_lo; + u32 mask_hi; +}; + +/* In the Intel processor's MTRR interface, the MTRR type is always held in + an 8 bit field: */ +typedef u8 mtrr_type; + +#define MTRR_NUM_FIXED_RANGES 88 +#define MTRR_MAX_VAR_RANGES 256 + +struct mtrr_state_type { + struct mtrr_var_range var_ranges[MTRR_MAX_VAR_RANGES]; + mtrr_type fixed_ranges[MTRR_NUM_FIXED_RANGES]; + unsigned char enabled; + unsigned char have_fixed; + mtrr_type def_type; +}; + +#define MTRRphysBase_MSR(reg) (0x200 + 2 * (reg)) +#define MTRRphysMask_MSR(reg) (0x200 + 2 * (reg) + 1) + /* These are the various ioctls */ #define MTRRIOC_ADD_ENTRY _IOW(MTRR_IOCTL_BASE, 0, struct mtrr_sentry) #define MTRRIOC_SET_ENTRY _IOW(MTRR_IOCTL_BASE, 1, struct mtrr_sentry) diff --git a/arch/x86/include/asm/numaq/apic.h b/arch/x86/include/asm/numaq/apic.h index 0bf2a06b7a4..c80f00d2996 100644 --- a/arch/x86/include/asm/numaq/apic.h +++ b/arch/x86/include/asm/numaq/apic.h @@ -7,9 +7,9 @@ #define APIC_DFR_VALUE (APIC_DFR_CLUSTER) -static inline cpumask_t target_cpus(void) +static inline const cpumask_t *target_cpus(void) { - return CPU_MASK_ALL; + return &CPU_MASK_ALL; } #define NO_BALANCE_IRQ (1) @@ -122,7 +122,13 @@ static inline void enable_apic_mode(void) * We use physical apicids here, not logical, so just return the default * physical broadcast to stop people from breaking us */ -static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask) +static inline unsigned int cpu_mask_to_apicid(const cpumask_t *cpumask) +{ + return (int) 0xF; +} + +static inline unsigned int cpu_mask_to_apicid_and(const struct cpumask *cpumask, + const struct cpumask *andmask) { return (int) 0xF; } diff --git a/arch/x86/include/asm/numaq/ipi.h b/arch/x86/include/asm/numaq/ipi.h index 935588d286c..a8374c65277 100644 --- a/arch/x86/include/asm/numaq/ipi.h +++ b/arch/x86/include/asm/numaq/ipi.h @@ -1,25 +1,22 @@ #ifndef __ASM_NUMAQ_IPI_H #define __ASM_NUMAQ_IPI_H -void send_IPI_mask_sequence(cpumask_t, int vector); +void send_IPI_mask_sequence(const struct cpumask *mask, int vector); +void send_IPI_mask_allbutself(const struct cpumask *mask, int vector); -static inline void send_IPI_mask(cpumask_t mask, int vector) +static inline void send_IPI_mask(const struct cpumask *mask, int vector) { send_IPI_mask_sequence(mask, vector); } static inline void send_IPI_allbutself(int vector) { - cpumask_t mask = cpu_online_map; - cpu_clear(smp_processor_id(), mask); - - if (!cpus_empty(mask)) - send_IPI_mask(mask, vector); + send_IPI_mask_allbutself(cpu_online_mask, vector); } static inline void send_IPI_all(int vector) { - send_IPI_mask(cpu_online_map, vector); + send_IPI_mask(cpu_online_mask, vector); } #endif /* __ASM_NUMAQ_IPI_H */ diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h index 647781298e7..66834c41c04 100644 --- a/arch/x86/include/asm/pci.h +++ b/arch/x86/include/asm/pci.h @@ -84,6 +84,8 @@ static inline void pci_dma_burst_advice(struct pci_dev *pdev, static inline void early_quirks(void) { } #endif +extern void pci_iommu_alloc(void); + #endif /* __KERNEL__ */ #ifdef CONFIG_X86_32 diff --git a/arch/x86/include/asm/pci_64.h b/arch/x86/include/asm/pci_64.h index d02d936840a..4da20798277 100644 --- a/arch/x86/include/asm/pci_64.h +++ b/arch/x86/include/asm/pci_64.h @@ -23,7 +23,6 @@ extern int (*pci_config_write)(int seg, int bus, int dev, int fn, int reg, int len, u32 value); extern void dma32_reserve_bootmem(void); -extern void pci_iommu_alloc(void); /* The PCI address space does equal the physical memory * address space. The networking and block device layers use diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index d12811ce51d..830b9fcb642 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h @@ -60,7 +60,7 @@ struct smp_ops { void (*cpu_die)(unsigned int cpu); void (*play_dead)(void); - void (*send_call_func_ipi)(cpumask_t mask); + void (*send_call_func_ipi)(const struct cpumask *mask); void (*send_call_func_single_ipi)(int cpu); }; @@ -125,7 +125,7 @@ static inline void arch_send_call_function_single_ipi(int cpu) static inline void arch_send_call_function_ipi(cpumask_t mask) { - smp_ops.send_call_func_ipi(mask); + smp_ops.send_call_func_ipi(&mask); } void cpu_disable_common(void); @@ -138,7 +138,7 @@ void native_cpu_die(unsigned int cpu); void native_play_dead(void); void play_dead_common(void); -void native_send_call_func_ipi(cpumask_t mask); +void native_send_call_func_ipi(const struct cpumask *mask); void native_send_call_func_single_ipi(int cpu); extern void prefill_possible_map(void); diff --git a/arch/x86/include/asm/summit/apic.h b/arch/x86/include/asm/summit/apic.h index 9b3070f1c2a..99327d1be49 100644 --- a/arch/x86/include/asm/summit/apic.h +++ b/arch/x86/include/asm/summit/apic.h @@ -14,13 +14,13 @@ #define APIC_DFR_VALUE (APIC_DFR_CLUSTER) -static inline cpumask_t target_cpus(void) +static inline const cpumask_t *target_cpus(void) { /* CPU_MASK_ALL (0xff) has undefined behaviour with * dest_LowestPrio mode logical clustered apic interrupt routing * Just start on cpu 0. IRQ balancing will spread load */ - return cpumask_of_cpu(0); + return &cpumask_of_cpu(0); } #define INT_DELIVERY_MODE (dest_LowestPrio) @@ -137,14 +137,14 @@ static inline void enable_apic_mode(void) { } -static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask) +static inline unsigned int cpu_mask_to_apicid(const cpumask_t *cpumask) { int num_bits_set; int cpus_found = 0; int cpu; int apicid; - num_bits_set = cpus_weight(cpumask); + num_bits_set = cpus_weight(*cpumask); /* Return id to all */ if (num_bits_set == NR_CPUS) return (int) 0xFF; @@ -152,10 +152,10 @@ static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask) * The cpus in the mask must all be on the apic cluster. If are not * on the same apicid cluster return default value of TARGET_CPUS. */ - cpu = first_cpu(cpumask); + cpu = first_cpu(*cpumask); apicid = cpu_to_logical_apicid(cpu); while (cpus_found < num_bits_set) { - if (cpu_isset(cpu, cpumask)) { + if (cpu_isset(cpu, *cpumask)) { int new_apicid = cpu_to_logical_apicid(cpu); if (apicid_cluster(apicid) != apicid_cluster(new_apicid)){ @@ -170,6 +170,49 @@ static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask) return apicid; } +static inline unsigned int cpu_mask_to_apicid_and(const struct cpumask *inmask, + const struct cpumask *andmask) +{ + int num_bits_set; + int cpus_found = 0; + int cpu; + int apicid = 0xFF; + cpumask_var_t cpumask; + + if (!alloc_cpumask_var(&cpumask, GFP_ATOMIC)) + return (int) 0xFF; + + cpumask_and(cpumask, inmask, andmask); + cpumask_and(cpumask, cpumask, cpu_online_mask); + + num_bits_set = cpumask_weight(cpumask); + /* Return id to all */ + if (num_bits_set == nr_cpu_ids) + goto exit; + /* + * The cpus in the mask must all be on the apic cluster. If are not + * on the same apicid cluster return default value of TARGET_CPUS. + */ + cpu = cpumask_first(cpumask); + apicid = cpu_to_logical_apicid(cpu); + while (cpus_found < num_bits_set) { + if (cpumask_test_cpu(cpu, cpumask)) { + int new_apicid = cpu_to_logical_apicid(cpu); + if (apicid_cluster(apicid) != + apicid_cluster(new_apicid)){ + printk ("%s: Not a valid mask!\n", __func__); + return 0xFF; + } + apicid = apicid | new_apicid; + cpus_found++; + } + cpu++; + } +exit: + free_cpumask_var(cpumask); + return apicid; +} + /* cpuid returns the value latched in the HW at reset, not the APIC ID * register's value. For any box whose BIOS changes APIC IDs, like * clustered APIC systems, we must use hard_smp_processor_id. diff --git a/arch/x86/include/asm/summit/ipi.h b/arch/x86/include/asm/summit/ipi.h index 53bd1e7bd7b..a8a2c24f50c 100644 --- a/arch/x86/include/asm/summit/ipi.h +++ b/arch/x86/include/asm/summit/ipi.h @@ -1,9 +1,10 @@ #ifndef __ASM_SUMMIT_IPI_H #define __ASM_SUMMIT_IPI_H -void send_IPI_mask_sequence(cpumask_t mask, int vector); +void send_IPI_mask_sequence(const cpumask_t *mask, int vector); +void send_IPI_mask_allbutself(const cpumask_t *mask, int vector); -static inline void send_IPI_mask(cpumask_t mask, int vector) +static inline void send_IPI_mask(const cpumask_t *mask, int vector) { send_IPI_mask_sequence(mask, vector); } @@ -14,12 +15,12 @@ static inline void send_IPI_allbutself(int vector) cpu_clear(smp_processor_id(), mask); if (!cpus_empty(mask)) - send_IPI_mask(mask, vector); + send_IPI_mask(&mask, vector); } static inline void send_IPI_all(int vector) { - send_IPI_mask(cpu_online_map, vector); + send_IPI_mask(&cpu_online_map, vector); } #endif /* __ASM_SUMMIT_IPI_H */ diff --git a/arch/x86/kvm/svm.h b/arch/x86/include/asm/svm.h index 1b8afa78e86..1b8afa78e86 100644 --- a/arch/x86/kvm/svm.h +++ b/arch/x86/include/asm/svm.h diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index ff386ff50ed..79e31e9dcdd 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -226,6 +226,8 @@ extern cpumask_t cpu_coregroup_map(int cpu); #define topology_core_id(cpu) (cpu_data(cpu).cpu_core_id) #define topology_core_siblings(cpu) (per_cpu(cpu_core_map, cpu)) #define topology_thread_siblings(cpu) (per_cpu(cpu_sibling_map, cpu)) +#define topology_core_cpumask(cpu) (&per_cpu(cpu_core_map, cpu)) +#define topology_thread_cpumask(cpu) (&per_cpu(cpu_sibling_map, cpu)) /* indicates that pointers to the topology cpumask_t maps are valid */ #define arch_provides_topology_pointers yes diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 580c3ee6c58..4340055b755 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -157,6 +157,7 @@ extern int __get_user_bad(void); int __ret_gu; \ unsigned long __val_gu; \ __chk_user_ptr(ptr); \ + might_fault(); \ switch (sizeof(*(ptr))) { \ case 1: \ __get_user_x(1, __ret_gu, __val_gu, ptr); \ @@ -241,6 +242,7 @@ extern void __put_user_8(void); int __ret_pu; \ __typeof__(*(ptr)) __pu_val; \ __chk_user_ptr(ptr); \ + might_fault(); \ __pu_val = x; \ switch (sizeof(*(ptr))) { \ case 1: \ diff --git a/arch/x86/include/asm/uaccess_32.h b/arch/x86/include/asm/uaccess_32.h index d095a3aeea1..5e06259e90e 100644 --- a/arch/x86/include/asm/uaccess_32.h +++ b/arch/x86/include/asm/uaccess_32.h @@ -82,8 +82,8 @@ __copy_to_user_inatomic(void __user *to, const void *from, unsigned long n) static __always_inline unsigned long __must_check __copy_to_user(void __user *to, const void *from, unsigned long n) { - might_sleep(); - return __copy_to_user_inatomic(to, from, n); + might_fault(); + return __copy_to_user_inatomic(to, from, n); } static __always_inline unsigned long @@ -137,7 +137,7 @@ __copy_from_user_inatomic(void *to, const void __user *from, unsigned long n) static __always_inline unsigned long __copy_from_user(void *to, const void __user *from, unsigned long n) { - might_sleep(); + might_fault(); if (__builtin_constant_p(n)) { unsigned long ret; @@ -159,7 +159,7 @@ __copy_from_user(void *to, const void __user *from, unsigned long n) static __always_inline unsigned long __copy_from_user_nocache(void *to, const void __user *from, unsigned long n) { - might_sleep(); + might_fault(); if (__builtin_constant_p(n)) { unsigned long ret; diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h index f8cfd00db45..84210c479fc 100644 --- a/arch/x86/include/asm/uaccess_64.h +++ b/arch/x86/include/asm/uaccess_64.h @@ -29,6 +29,8 @@ static __always_inline __must_check int __copy_from_user(void *dst, const void __user *src, unsigned size) { int ret = 0; + + might_fault(); if (!__builtin_constant_p(size)) return copy_user_generic(dst, (__force void *)src, size); switch (size) { @@ -71,6 +73,8 @@ static __always_inline __must_check int __copy_to_user(void __user *dst, const void *src, unsigned size) { int ret = 0; + + might_fault(); if (!__builtin_constant_p(size)) return copy_user_generic((__force void *)dst, src, size); switch (size) { @@ -113,6 +117,8 @@ static __always_inline __must_check int __copy_in_user(void __user *dst, const void __user *src, unsigned size) { int ret = 0; + + might_fault(); if (!__builtin_constant_p(size)) return copy_user_generic((__force void *)dst, (__force void *)src, size); diff --git a/arch/x86/include/asm/virtext.h b/arch/x86/include/asm/virtext.h new file mode 100644 index 00000000000..59363627523 --- /dev/null +++ b/arch/x86/include/asm/virtext.h @@ -0,0 +1,132 @@ +/* CPU virtualization extensions handling + * + * This should carry the code for handling CPU virtualization extensions + * that needs to live in the kernel core. + * + * Author: Eduardo Habkost <ehabkost@redhat.com> + * + * Copyright (C) 2008, Red Hat Inc. + * + * Contains code from KVM, Copyright (C) 2006 Qumranet, Inc. + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + */ +#ifndef _ASM_X86_VIRTEX_H +#define _ASM_X86_VIRTEX_H + +#include <asm/processor.h> +#include <asm/system.h> + +#include <asm/vmx.h> +#include <asm/svm.h> + +/* + * VMX functions: + */ + +static inline int cpu_has_vmx(void) +{ + unsigned long ecx = cpuid_ecx(1); + return test_bit(5, &ecx); /* CPUID.1:ECX.VMX[bit 5] -> VT */ +} + + +/** Disable VMX on the current CPU + * + * vmxoff causes a undefined-opcode exception if vmxon was not run + * on the CPU previously. Only call this function if you know VMX + * is enabled. + */ +static inline void cpu_vmxoff(void) +{ + asm volatile (ASM_VMX_VMXOFF : : : "cc"); + write_cr4(read_cr4() & ~X86_CR4_VMXE); +} + +static inline int cpu_vmx_enabled(void) +{ + return read_cr4() & X86_CR4_VMXE; +} + +/** Disable VMX if it is enabled on the current CPU + * + * You shouldn't call this if cpu_has_vmx() returns 0. + */ +static inline void __cpu_emergency_vmxoff(void) +{ + if (cpu_vmx_enabled()) + cpu_vmxoff(); +} + +/** Disable VMX if it is supported and enabled on the current CPU + */ +static inline void cpu_emergency_vmxoff(void) +{ + if (cpu_has_vmx()) + __cpu_emergency_vmxoff(); +} + + + + +/* + * SVM functions: + */ + +/** Check if the CPU has SVM support + * + * You can use the 'msg' arg to get a message describing the problem, + * if the function returns zero. Simply pass NULL if you are not interested + * on the messages; gcc should take care of not generating code for + * the messages on this case. + */ +static inline int cpu_has_svm(const char **msg) +{ + uint32_t eax, ebx, ecx, edx; + + if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) { + if (msg) + *msg = "not amd"; + return 0; + } + + cpuid(0x80000000, &eax, &ebx, &ecx, &edx); + if (eax < SVM_CPUID_FUNC) { + if (msg) + *msg = "can't execute cpuid_8000000a"; + return 0; + } + + cpuid(0x80000001, &eax, &ebx, &ecx, &edx); + if (!(ecx & (1 << SVM_CPUID_FEATURE_SHIFT))) { + if (msg) + *msg = "svm not available"; + return 0; + } + return 1; +} + + +/** Disable SVM on the current CPU + * + * You should call this only if cpu_has_svm() returned true. + */ +static inline void cpu_svm_disable(void) +{ + uint64_t efer; + + wrmsrl(MSR_VM_HSAVE_PA, 0); + rdmsrl(MSR_EFER, efer); + wrmsrl(MSR_EFER, efer & ~MSR_EFER_SVME_MASK); +} + +/** Makes sure SVM is disabled, if it is supported on the CPU + */ +static inline void cpu_emergency_svm_disable(void) +{ + if (cpu_has_svm(NULL)) + cpu_svm_disable(); +} + +#endif /* _ASM_X86_VIRTEX_H */ diff --git a/arch/x86/kvm/vmx.h b/arch/x86/include/asm/vmx.h index ec5edc339da..d0238e6151d 100644 --- a/arch/x86/kvm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -63,10 +63,13 @@ #define VM_EXIT_HOST_ADDR_SPACE_SIZE 0x00000200 #define VM_EXIT_ACK_INTR_ON_EXIT 0x00008000 +#define VM_EXIT_SAVE_IA32_PAT 0x00040000 +#define VM_EXIT_LOAD_IA32_PAT 0x00080000 #define VM_ENTRY_IA32E_MODE 0x00000200 #define VM_ENTRY_SMM 0x00000400 #define VM_ENTRY_DEACT_DUAL_MONITOR 0x00000800 +#define VM_ENTRY_LOAD_IA32_PAT 0x00004000 /* VMCS Encodings */ enum vmcs_field { @@ -112,6 +115,8 @@ enum vmcs_field { VMCS_LINK_POINTER_HIGH = 0x00002801, GUEST_IA32_DEBUGCTL = 0x00002802, GUEST_IA32_DEBUGCTL_HIGH = 0x00002803, + GUEST_IA32_PAT = 0x00002804, + GUEST_IA32_PAT_HIGH = 0x00002805, GUEST_PDPTR0 = 0x0000280a, GUEST_PDPTR0_HIGH = 0x0000280b, GUEST_PDPTR1 = 0x0000280c, @@ -120,6 +125,8 @@ enum vmcs_field { GUEST_PDPTR2_HIGH = 0x0000280f, GUEST_PDPTR3 = 0x00002810, GUEST_PDPTR3_HIGH = 0x00002811, + HOST_IA32_PAT = 0x00002c00, + HOST_IA32_PAT_HIGH = 0x00002c01, PIN_BASED_VM_EXEC_CONTROL = 0x00004000, CPU_BASED_VM_EXEC_CONTROL = 0x00004002, EXCEPTION_BITMAP = 0x00004004, @@ -331,8 +338,9 @@ enum vmcs_field { #define AR_RESERVD_MASK 0xfffe0f00 -#define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT 9 -#define IDENTITY_PAGETABLE_PRIVATE_MEMSLOT 10 +#define TSS_PRIVATE_MEMSLOT (KVM_MEMORY_SLOTS + 0) +#define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT (KVM_MEMORY_SLOTS + 1) +#define IDENTITY_PAGETABLE_PRIVATE_MEMSLOT (KVM_MEMORY_SLOTS + 2) #define VMX_NR_VPIDS (1 << 16) #define VMX_VPID_EXTENT_SINGLE_CONTEXT 1 @@ -356,4 +364,19 @@ enum vmcs_field { #define VMX_EPT_IDENTITY_PAGETABLE_ADDR 0xfffbc000ul + +#define ASM_VMX_VMCLEAR_RAX ".byte 0x66, 0x0f, 0xc7, 0x30" +#define ASM_VMX_VMLAUNCH ".byte 0x0f, 0x01, 0xc2" +#define ASM_VMX_VMRESUME ".byte 0x0f, 0x01, 0xc3" +#define ASM_VMX_VMPTRLD_RAX ".byte 0x0f, 0xc7, 0x30" +#define ASM_VMX_VMREAD_RDX_RAX ".byte 0x0f, 0x78, 0xd0" +#define ASM_VMX_VMWRITE_RAX_RDX ".byte 0x0f, 0x79, 0xd0" +#define ASM_VMX_VMWRITE_RSP_RDX ".byte 0x0f, 0x79, 0xd4" +#define ASM_VMX_VMXOFF ".byte 0x0f, 0x01, 0xc4" +#define ASM_VMX_VMXON_RAX ".byte 0xf3, 0x0f, 0xc7, 0x30" +#define ASM_VMX_INVEPT ".byte 0x66, 0x0f, 0x38, 0x80, 0x08" +#define ASM_VMX_INVVPID ".byte 0x66, 0x0f, 0x38, 0x81, 0x08" + + + #endif diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 88dd768eab6..d364df03c1d 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -109,6 +109,8 @@ obj-$(CONFIG_MICROCODE) += microcode.o obj-$(CONFIG_X86_CHECK_BIOS_CORRUPTION) += check.o +obj-$(CONFIG_SWIOTLB) += pci-swiotlb_64.o # NB rename without _64 + ### # 64 bit specific files ifeq ($(CONFIG_X86_64),y) @@ -122,7 +124,6 @@ ifeq ($(CONFIG_X86_64),y) obj-$(CONFIG_GART_IOMMU) += pci-gart_64.o aperture_64.o obj-$(CONFIG_CALGARY_IOMMU) += pci-calgary_64.o tce_64.o obj-$(CONFIG_AMD_IOMMU) += amd_iommu_init.o amd_iommu.o - obj-$(CONFIG_SWIOTLB) += pci-swiotlb_64.o obj-$(CONFIG_PCI_MMCONFIG) += mmconf-fam10h_64.o endif diff --git a/arch/x86/kernel/apic.c b/arch/x86/kernel/apic.c index 66198cbe464..d652515e285 100644 --- a/arch/x86/kernel/apic.c +++ b/arch/x86/kernel/apic.c @@ -119,8 +119,6 @@ EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok); int first_system_vector = 0xfe; -char system_vectors[NR_VECTORS] = { [0 ... NR_VECTORS-1] = SYS_VECTOR_FREE}; - /* * Debug level, exported for io_apic.c */ @@ -142,7 +140,7 @@ static int lapic_next_event(unsigned long delta, struct clock_event_device *evt); static void lapic_timer_setup(enum clock_event_mode mode, struct clock_event_device *evt); -static void lapic_timer_broadcast(cpumask_t mask); +static void lapic_timer_broadcast(const cpumask_t *mask); static void apic_pm_activate(void); /* @@ -455,7 +453,7 @@ static void lapic_timer_setup(enum clock_event_mode mode, /* * Local APIC timer broadcast function */ -static void lapic_timer_broadcast(cpumask_t mask) +static void lapic_timer_broadcast(const cpumask_t *mask) { #ifdef CONFIG_SMP send_IPI_mask(mask, LOCAL_TIMER_VECTOR); @@ -471,7 +469,7 @@ static void __cpuinit setup_APIC_timer(void) struct clock_event_device *levt = &__get_cpu_var(lapic_events); memcpy(levt, &lapic_clockevent, sizeof(*levt)); - levt->cpumask = cpumask_of_cpu(smp_processor_id()); + levt->cpumask = cpumask_of(smp_processor_id()); clockevents_register_device(levt); } @@ -1807,28 +1805,32 @@ void disconnect_bsp_APIC(int virt_wire_setup) void __cpuinit generic_processor_info(int apicid, int version) { int cpu; - cpumask_t tmp_map; /* * Validate version */ if (version == 0x0) { pr_warning("BIOS bug, APIC version is 0 for CPU#%d! " - "fixing up to 0x10. (tell your hw vendor)\n", - version); + "fixing up to 0x10. (tell your hw vendor)\n", + version); version = 0x10; } apic_version[apicid] = version; - if (num_processors >= NR_CPUS) { - pr_warning("WARNING: NR_CPUS limit of %i reached." - " Processor ignored.\n", NR_CPUS); + if (num_processors >= nr_cpu_ids) { + int max = nr_cpu_ids; + int thiscpu = max + disabled_cpus; + + pr_warning( + "ACPI: NR_CPUS/possible_cpus limit of %i reached." + " Processor %d/0x%x ignored.\n", max, thiscpu, apicid); + + disabled_cpus++; return; } num_processors++; - cpus_complement(tmp_map, cpu_present_map); - cpu = first_cpu(tmp_map); + cpu = cpumask_next_zero(-1, cpu_present_mask); physid_set(apicid, phys_cpu_present_map); if (apicid == boot_cpu_physical_apicid) { @@ -1878,8 +1880,8 @@ void __cpuinit generic_processor_info(int apicid, int version) } #endif - cpu_set(cpu, cpu_possible_map); - cpu_set(cpu, cpu_present_map); + set_cpu_possible(cpu, true); + set_cpu_present(cpu, true); } #ifdef CONFIG_X86_64 @@ -2081,7 +2083,7 @@ __cpuinit int apic_is_clustered_box(void) bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid); bitmap_zero(clustermap, NUM_APIC_CLUSTERS); - for (i = 0; i < NR_CPUS; i++) { + for (i = 0; i < nr_cpu_ids; i++) { /* are we being called early in kernel startup? */ if (bios_cpu_apicid) { id = bios_cpu_apicid[i]; diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 68b5d8681cb..c6ecda64f5f 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -534,31 +534,16 @@ static void __cpuinit free_cache_attributes(unsigned int cpu) per_cpu(cpuid4_info, cpu) = NULL; } -static int __cpuinit detect_cache_attributes(unsigned int cpu) +static void get_cpu_leaves(void *_retval) { - struct _cpuid4_info *this_leaf; - unsigned long j; - int retval; - cpumask_t oldmask; - - if (num_cache_leaves == 0) - return -ENOENT; - - per_cpu(cpuid4_info, cpu) = kzalloc( - sizeof(struct _cpuid4_info) * num_cache_leaves, GFP_KERNEL); - if (per_cpu(cpuid4_info, cpu) == NULL) - return -ENOMEM; - - oldmask = current->cpus_allowed; - retval = set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); - if (retval) - goto out; + int j, *retval = _retval, cpu = smp_processor_id(); /* Do cpuid and store the results */ for (j = 0; j < num_cache_leaves; j++) { + struct _cpuid4_info *this_leaf; this_leaf = CPUID4_INFO_IDX(cpu, j); - retval = cpuid4_cache_lookup(j, this_leaf); - if (unlikely(retval < 0)) { + *retval = cpuid4_cache_lookup(j, this_leaf); + if (unlikely(*retval < 0)) { int i; for (i = 0; i < j; i++) @@ -567,9 +552,21 @@ static int __cpuinit detect_cache_attributes(unsigned int cpu) } cache_shared_cpu_map_setup(cpu, j); } - set_cpus_allowed_ptr(current, &oldmask); +} + +static int __cpuinit detect_cache_attributes(unsigned int cpu) +{ + int retval; + + if (num_cache_leaves == 0) + return -ENOENT; + + per_cpu(cpuid4_info, cpu) = kzalloc( + sizeof(struct _cpuid4_info) * num_cache_leaves, GFP_KERNEL); + if (per_cpu(cpuid4_info, cpu) == NULL) + return -ENOMEM; -out: + smp_call_function_single(cpu, get_cpu_leaves, &retval, true); if (retval) { kfree(per_cpu(cpuid4_info, cpu)); per_cpu(cpuid4_info, cpu) = NULL; @@ -626,8 +623,8 @@ static ssize_t show_shared_cpu_map_func(struct _cpuid4_info *this_leaf, cpumask_t *mask = &this_leaf->shared_cpu_map; n = type? - cpulist_scnprintf(buf, len-2, *mask): - cpumask_scnprintf(buf, len-2, *mask); + cpulist_scnprintf(buf, len-2, mask) : + cpumask_scnprintf(buf, len-2, mask); buf[n++] = '\n'; buf[n] = '\0'; } diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c index 748c8f9e7a0..a5a5e053037 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c @@ -83,34 +83,41 @@ static DEFINE_PER_CPU(unsigned char, bank_map); /* see which banks are on */ * CPU Initialization */ +struct thresh_restart { + struct threshold_block *b; + int reset; + u16 old_limit; +}; + /* must be called with correct cpu affinity */ -static void threshold_restart_bank(struct threshold_block *b, - int reset, u16 old_limit) +static long threshold_restart_bank(void *_tr) { + struct thresh_restart *tr = _tr; u32 mci_misc_hi, mci_misc_lo; - rdmsr(b->address, mci_misc_lo, mci_misc_hi); + rdmsr(tr->b->address, mci_misc_lo, mci_misc_hi); - if (b->threshold_limit < (mci_misc_hi & THRESHOLD_MAX)) - reset = 1; /* limit cannot be lower than err count */ + if (tr->b->threshold_limit < (mci_misc_hi & THRESHOLD_MAX)) + tr->reset = 1; /* limit cannot be lower than err count */ - if (reset) { /* reset err count and overflow bit */ + if (tr->reset) { /* reset err count and overflow bit */ mci_misc_hi = (mci_misc_hi & ~(MASK_ERR_COUNT_HI | MASK_OVERFLOW_HI)) | - (THRESHOLD_MAX - b->threshold_limit); - } else if (old_limit) { /* change limit w/o reset */ + (THRESHOLD_MAX - tr->b->threshold_limit); + } else if (tr->old_limit) { /* change limit w/o reset */ int new_count = (mci_misc_hi & THRESHOLD_MAX) + - (old_limit - b->threshold_limit); + (tr->old_limit - tr->b->threshold_limit); mci_misc_hi = (mci_misc_hi & ~MASK_ERR_COUNT_HI) | (new_count & THRESHOLD_MAX); } - b->interrupt_enable ? + tr->b->interrupt_enable ? (mci_misc_hi = (mci_misc_hi & ~MASK_INT_TYPE_HI) | INT_TYPE_APIC) : (mci_misc_hi &= ~MASK_INT_TYPE_HI); mci_misc_hi |= MASK_COUNT_EN_HI; - wrmsr(b->address, mci_misc_lo, mci_misc_hi); + wrmsr(tr->b->address, mci_misc_lo, mci_misc_hi); + return 0; } /* cpu init entry point, called from mce.c with preempt off */ @@ -120,6 +127,7 @@ void __cpuinit mce_amd_feature_init(struct cpuinfo_x86 *c) unsigned int cpu = smp_processor_id(); u8 lvt_off; u32 low = 0, high = 0, address = 0; + struct thresh_restart tr; for (bank = 0; bank < NR_BANKS; ++bank) { for (block = 0; block < NR_BLOCKS; ++block) { @@ -162,7 +170,10 @@ void __cpuinit mce_amd_feature_init(struct cpuinfo_x86 *c) wrmsr(address, low, high); threshold_defaults.address = address; - threshold_restart_bank(&threshold_defaults, 0, 0); + tr.b = &threshold_defaults; + tr.reset = 0; + tr.old_limit = 0; + threshold_restart_bank(&tr); } } } @@ -251,20 +262,6 @@ struct threshold_attr { ssize_t(*store) (struct threshold_block *, const char *, size_t count); }; -static void affinity_set(unsigned int cpu, cpumask_t *oldmask, - cpumask_t *newmask) -{ - *oldmask = current->cpus_allowed; - cpus_clear(*newmask); - cpu_set(cpu, *newmask); - set_cpus_allowed_ptr(current, newmask); -} - -static void affinity_restore(const cpumask_t *oldmask) -{ - set_cpus_allowed_ptr(current, oldmask); -} - #define SHOW_FIELDS(name) \ static ssize_t show_ ## name(struct threshold_block * b, char *buf) \ { \ @@ -277,15 +274,16 @@ static ssize_t store_interrupt_enable(struct threshold_block *b, const char *buf, size_t count) { char *end; - cpumask_t oldmask, newmask; + struct thresh_restart tr; unsigned long new = simple_strtoul(buf, &end, 0); if (end == buf) return -EINVAL; b->interrupt_enable = !!new; - affinity_set(b->cpu, &oldmask, &newmask); - threshold_restart_bank(b, 0, 0); - affinity_restore(&oldmask); + tr.b = b; + tr.reset = 0; + tr.old_limit = 0; + work_on_cpu(b->cpu, threshold_restart_bank, &tr); return end - buf; } @@ -294,8 +292,7 @@ static ssize_t store_threshold_limit(struct threshold_block *b, const char *buf, size_t count) { char *end; - cpumask_t oldmask, newmask; - u16 old; + struct thresh_restart tr; unsigned long new = simple_strtoul(buf, &end, 0); if (end == buf) return -EINVAL; @@ -303,34 +300,36 @@ static ssize_t store_threshold_limit(struct threshold_block *b, new = THRESHOLD_MAX; if (new < 1) new = 1; - old = b->threshold_limit; + tr.old_limit = b->threshold_limit; b->threshold_limit = new; + tr.b = b; + tr.reset = 0; - affinity_set(b->cpu, &oldmask, &newmask); - threshold_restart_bank(b, 0, old); - affinity_restore(&oldmask); + work_on_cpu(b->cpu, threshold_restart_bank, &tr); return end - buf; } -static ssize_t show_error_count(struct threshold_block *b, char *buf) +static long local_error_count(void *_b) { - u32 high, low; - cpumask_t oldmask, newmask; - affinity_set(b->cpu, &oldmask, &newmask); + struct threshold_block *b = _b; + u32 low, high; + rdmsr(b->address, low, high); - affinity_restore(&oldmask); - return sprintf(buf, "%x\n", - (high & 0xFFF) - (THRESHOLD_MAX - b->threshold_limit)); + return (high & 0xFFF) - (THRESHOLD_MAX - b->threshold_limit); +} + +static ssize_t show_error_count(struct threshold_block *b, char *buf) +{ + return sprintf(buf, "%lx\n", work_on_cpu(b->cpu, local_error_count, b)); } static ssize_t store_error_count(struct threshold_block *b, const char *buf, size_t count) { - cpumask_t oldmask, newmask; - affinity_set(b->cpu, &oldmask, &newmask); - threshold_restart_bank(b, 1, 0); - affinity_restore(&oldmask); + struct thresh_restart tr = { .b = b, .reset = 1, .old_limit = 0 }; + + work_on_cpu(b->cpu, threshold_restart_bank, &tr); return 1; } @@ -463,12 +462,19 @@ out_free: return err; } +static long local_allocate_threshold_blocks(void *_bank) +{ + unsigned int *bank = _bank; + + return allocate_threshold_blocks(smp_processor_id(), *bank, 0, + MSR_IA32_MC0_MISC + *bank * 4); +} + /* symlinks sibling shared banks to first core. first core owns dir/files. */ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) { int i, err = 0; struct threshold_bank *b = NULL; - cpumask_t oldmask, newmask; char name[32]; sprintf(name, "threshold_bank%i", bank); @@ -519,11 +525,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) per_cpu(threshold_banks, cpu)[bank] = b; - affinity_set(cpu, &oldmask, &newmask); - err = allocate_threshold_blocks(cpu, bank, 0, - MSR_IA32_MC0_MISC + bank * 4); - affinity_restore(&oldmask); - + err = work_on_cpu(cpu, local_allocate_threshold_blocks, &bank); if (err) goto out_free; diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 4e8d77f01ee..b59ddcc88cd 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -14,14 +14,6 @@ #include <asm/pat.h> #include "mtrr.h" -struct mtrr_state { - struct mtrr_var_range var_ranges[MAX_VAR_RANGES]; - mtrr_type fixed_ranges[NUM_FIXED_RANGES]; - unsigned char enabled; - unsigned char have_fixed; - mtrr_type def_type; -}; - struct fixed_range_block { int base_msr; /* start address of an MTRR block */ int ranges; /* number of MTRRs in this block */ @@ -35,10 +27,12 @@ static struct fixed_range_block fixed_range_blocks[] = { }; static unsigned long smp_changes_mask; -static struct mtrr_state mtrr_state = {}; static int mtrr_state_set; u64 mtrr_tom2; +struct mtrr_state_type mtrr_state = {}; +EXPORT_SYMBOL_GPL(mtrr_state); + #undef MODULE_PARAM_PREFIX #define MODULE_PARAM_PREFIX "mtrr." diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 44fcb237bd5..d259e5d2e05 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -49,7 +49,7 @@ u32 num_var_ranges = 0; -unsigned int mtrr_usage_table[MAX_VAR_RANGES]; +unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES]; static DEFINE_MUTEX(mtrr_mutex); u64 size_or_mask, size_and_mask; @@ -574,7 +574,7 @@ struct mtrr_value { unsigned long lsize; }; -static struct mtrr_value mtrr_state[MAX_VAR_RANGES]; +static struct mtrr_value mtrr_state[MTRR_MAX_VAR_RANGES]; static int mtrr_save(struct sys_device * sysdev, pm_message_t state) { diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h index 2dc4ec656b2..ffd60409cc6 100644 --- a/arch/x86/kernel/cpu/mtrr/mtrr.h +++ b/arch/x86/kernel/cpu/mtrr/mtrr.h @@ -8,11 +8,6 @@ #define MTRRcap_MSR 0x0fe #define MTRRdefType_MSR 0x2ff -#define MTRRphysBase_MSR(reg) (0x200 + 2 * (reg)) -#define MTRRphysMask_MSR(reg) (0x200 + 2 * (reg) + 1) - -#define NUM_FIXED_RANGES 88 -#define MAX_VAR_RANGES 256 #define MTRRfix64K_00000_MSR 0x250 #define MTRRfix16K_80000_MSR 0x258 #define MTRRfix16K_A0000_MSR 0x259 @@ -29,11 +24,7 @@ #define MTRR_CHANGE_MASK_VARIABLE 0x02 #define MTRR_CHANGE_MASK_DEFTYPE 0x04 -/* In the Intel processor's MTRR interface, the MTRR type is always held in - an 8 bit field: */ -typedef u8 mtrr_type; - -extern unsigned int mtrr_usage_table[MAX_VAR_RANGES]; +extern unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES]; struct mtrr_ops { u32 vendor; @@ -70,13 +61,6 @@ struct set_mtrr_context { u32 ccr3; }; -struct mtrr_var_range { - u32 base_lo; - u32 base_hi; - u32 mask_lo; - u32 mask_hi; -}; - void set_mtrr_done(struct set_mtrr_context *ctxt); void set_mtrr_cache_disable(struct set_mtrr_context *ctxt); void set_mtrr_prepare_save(struct set_mtrr_context *ctxt); diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index d84a852e4cd..c689d19e35a 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -26,6 +26,7 @@ #include <linux/kdebug.h> #include <asm/smp.h> #include <asm/reboot.h> +#include <asm/virtext.h> #include <mach_ipi.h> @@ -49,6 +50,15 @@ static void kdump_nmi_callback(int cpu, struct die_args *args) #endif crash_save_cpu(regs, cpu); + /* Disable VMX or SVM if needed. + * + * We need to disable virtualization on all CPUs. + * Having VMX or SVM enabled on any CPU may break rebooting + * after the kdump kernel has finished its task. + */ + cpu_emergency_vmxoff(); + cpu_emergency_svm_disable(); + disable_local_APIC(); } @@ -80,6 +90,14 @@ void native_machine_crash_shutdown(struct pt_regs *regs) local_irq_disable(); kdump_nmi_shootdown_cpus(); + + /* Booting kdump kernel with VMX or SVM enabled won't work, + * because (among other limitations) we can't disable paging + * with the virt flags. + */ + cpu_emergency_vmxoff(); + cpu_emergency_svm_disable(); + lapic_shutdown(); #if defined(CONFIG_X86_IO_APIC) disable_IO_APIC(); diff --git a/arch/x86/kernel/genapic_flat_64.c b/arch/x86/kernel/genapic_flat_64.c index c0262791bda..34185488e4f 100644 --- a/arch/x86/kernel/genapic_flat_64.c +++ b/arch/x86/kernel/genapic_flat_64.c @@ -30,12 +30,12 @@ static int flat_acpi_madt_oem_check(char *oem_id, char *oem_table_id) return 1; } -static cpumask_t flat_target_cpus(void) +static const struct cpumask *flat_target_cpus(void) { - return cpu_online_map; + return cpu_online_mask; } -static cpumask_t flat_vector_allocation_domain(int cpu) +static void flat_vector_allocation_domain(int cpu, struct cpumask *retmask) { /* Careful. Some cpus do not strictly honor the set of cpus * specified in the interrupt destination when using lowest @@ -45,8 +45,8 @@ static cpumask_t flat_vector_allocation_domain(int cpu) * deliver interrupts to the wrong hyperthread when only one * hyperthread was specified in the interrupt desitination. */ - cpumask_t domain = { { [0] = APIC_ALL_CPUS, } }; - return domain; + cpumask_clear(retmask); + cpumask_bits(retmask)[0] = APIC_ALL_CPUS; } /* @@ -69,9 +69,8 @@ static void flat_init_apic_ldr(void) apic_write(APIC_LDR, val); } -static void flat_send_IPI_mask(cpumask_t cpumask, int vector) +static inline void _flat_send_IPI_mask(unsigned long mask, int vector) { - unsigned long mask = cpus_addr(cpumask)[0]; unsigned long flags; local_irq_save(flags); @@ -79,20 +78,41 @@ static void flat_send_IPI_mask(cpumask_t cpumask, int vector) local_irq_restore(flags); } +static void flat_send_IPI_mask(const struct cpumask *cpumask, int vector) +{ + unsigned long mask = cpumask_bits(cpumask)[0]; + + _flat_send_IPI_mask(mask, vector); +} + +static void flat_send_IPI_mask_allbutself(const struct cpumask *cpumask, + int vector) +{ + unsigned long mask = cpumask_bits(cpumask)[0]; + int cpu = smp_processor_id(); + + if (cpu < BITS_PER_LONG) + clear_bit(cpu, &mask); + _flat_send_IPI_mask(mask, vector); +} + static void flat_send_IPI_allbutself(int vector) { + int cpu = smp_processor_id(); #ifdef CONFIG_HOTPLUG_CPU int hotplug = 1; #else int hotplug = 0; #endif if (hotplug || vector == NMI_VECTOR) { - cpumask_t allbutme = cpu_online_map; + if (!cpumask_equal(cpu_online_mask, cpumask_of(cpu))) { + unsigned long mask = cpumask_bits(cpu_online_mask)[0]; - cpu_clear(smp_processor_id(), allbutme); + if (cpu < BITS_PER_LONG) + clear_bit(cpu, &mask); - if (!cpus_empty(allbutme)) - flat_send_IPI_mask(allbutme, vector); + _flat_send_IPI_mask(mask, vector); + } } else if (num_online_cpus() > 1) { __send_IPI_shortcut(APIC_DEST_ALLBUT, vector,APIC_DEST_LOGICAL); } @@ -101,7 +121,7 @@ static void flat_send_IPI_allbutself(int vector) static void flat_send_IPI_all(int vector) { if (vector == NMI_VECTOR) - flat_send_IPI_mask(cpu_online_map, vector); + flat_send_IPI_mask(cpu_online_mask, vector); else __send_IPI_shortcut(APIC_DEST_ALLINC, vector, APIC_DEST_LOGICAL); } @@ -135,9 +155,18 @@ static int flat_apic_id_registered(void) return physid_isset(read_xapic_id(), phys_cpu_present_map); } -static unsigned int flat_cpu_mask_to_apicid(cpumask_t cpumask) +static unsigned int flat_cpu_mask_to_apicid(const struct cpumask *cpumask) +{ + return cpumask_bits(cpumask)[0] & APIC_ALL_CPUS; +} + +static unsigned int flat_cpu_mask_to_apicid_and(const struct cpumask *cpumask, + const struct cpumask *andmask) { - return cpus_addr(cpumask)[0] & APIC_ALL_CPUS; + unsigned long mask1 = cpumask_bits(cpumask)[0] & APIC_ALL_CPUS; + unsigned long mask2 = cpumask_bits(andmask)[0] & APIC_ALL_CPUS; + + return mask1 & mask2; } static unsigned int phys_pkg_id(int index_msb) @@ -157,8 +186,10 @@ struct genapic apic_flat = { .send_IPI_all = flat_send_IPI_all, .send_IPI_allbutself = flat_send_IPI_allbutself, .send_IPI_mask = flat_send_IPI_mask, + .send_IPI_mask_allbutself = flat_send_IPI_mask_allbutself, .send_IPI_self = apic_send_IPI_self, .cpu_mask_to_apicid = flat_cpu_mask_to_apicid, + .cpu_mask_to_apicid_and = flat_cpu_mask_to_apicid_and, .phys_pkg_id = phys_pkg_id, .get_apic_id = get_apic_id, .set_apic_id = set_apic_id, @@ -188,35 +219,39 @@ static int physflat_acpi_madt_oem_check(char *oem_id, char *oem_table_id) return 0; } -static cpumask_t physflat_target_cpus(void) +static const struct cpumask *physflat_target_cpus(void) { - return cpu_online_map; + return cpu_online_mask; } -static cpumask_t physflat_vector_allocation_domain(int cpu) +static void physflat_vector_allocation_domain(int cpu, struct cpumask *retmask) { - return cpumask_of_cpu(cpu); + cpumask_clear(retmask); + cpumask_set_cpu(cpu, retmask); } -static void physflat_send_IPI_mask(cpumask_t cpumask, int vector) +static void physflat_send_IPI_mask(const struct cpumask *cpumask, int vector) { send_IPI_mask_sequence(cpumask, vector); } -static void physflat_send_IPI_allbutself(int vector) +static void physflat_send_IPI_mask_allbutself(const struct cpumask *cpumask, + int vector) { - cpumask_t allbutme = cpu_online_map; + send_IPI_mask_allbutself(cpumask, vector); +} - cpu_clear(smp_processor_id(), allbutme); - physflat_send_IPI_mask(allbutme, vector); +static void physflat_send_IPI_allbutself(int vector) +{ + send_IPI_mask_allbutself(cpu_online_mask, vector); } static void physflat_send_IPI_all(int vector) { - physflat_send_IPI_mask(cpu_online_map, vector); + physflat_send_IPI_mask(cpu_online_mask, vector); } -static unsigned int physflat_cpu_mask_to_apicid(cpumask_t cpumask) +static unsigned int physflat_cpu_mask_to_apicid(const struct cpumask *cpumask) { int cpu; @@ -224,13 +259,31 @@ static unsigned int physflat_cpu_mask_to_apicid(cpumask_t cpumask) * We're using fixed IRQ delivery, can only return one phys APIC ID. * May as well be the first. */ - cpu = first_cpu(cpumask); + cpu = cpumask_first(cpumask); if ((unsigned)cpu < nr_cpu_ids) return per_cpu(x86_cpu_to_apicid, cpu); else return BAD_APICID; } +static unsigned int +physflat_cpu_mask_to_apicid_and(const struct cpumask *cpumask, + const struct cpumask *andmask) +{ + int cpu; + + /* + * We're using fixed IRQ delivery, can only return one phys APIC ID. + * May as well be the first. + */ + for_each_cpu_and(cpu, cpumask, andmask) + if (cpumask_test_cpu(cpu, cpu_online_mask)) + break; + if (cpu < nr_cpu_ids) + return per_cpu(x86_cpu_to_apicid, cpu); + return BAD_APICID; +} + struct genapic apic_physflat = { .name = "physical flat", .acpi_madt_oem_check = physflat_acpi_madt_oem_check, @@ -243,8 +296,10 @@ struct genapic apic_physflat = { .send_IPI_all = physflat_send_IPI_all, .send_IPI_allbutself = physflat_send_IPI_allbutself, .send_IPI_mask = physflat_send_IPI_mask, + .send_IPI_mask_allbutself = physflat_send_IPI_mask_allbutself, .send_IPI_self = apic_send_IPI_self, .cpu_mask_to_apicid = physflat_cpu_mask_to_apicid, + .cpu_mask_to_apicid_and = physflat_cpu_mask_to_apicid_and, .phys_pkg_id = phys_pkg_id, .get_apic_id = get_apic_id, .set_apic_id = set_apic_id, diff --git a/arch/x86/kernel/genx2apic_cluster.c b/arch/x86/kernel/genx2apic_cluster.c index f6a2c8eb48a..6ce497cc372 100644 --- a/arch/x86/kernel/genx2apic_cluster.c +++ b/arch/x86/kernel/genx2apic_cluster.c @@ -22,19 +22,18 @@ static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id) /* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */ -static cpumask_t x2apic_target_cpus(void) +static const struct cpumask *x2apic_target_cpus(void) { - return cpumask_of_cpu(0); + return cpumask_of(0); } /* * for now each logical cpu is in its own vector allocation domain. */ -static cpumask_t x2apic_vector_allocation_domain(int cpu) +static void x2apic_vector_allocation_domain(int cpu, struct cpumask *retmask) { - cpumask_t domain = CPU_MASK_NONE; - cpu_set(cpu, domain); - return domain; + cpumask_clear(retmask); + cpumask_set_cpu(cpu, retmask); } static void __x2apic_send_IPI_dest(unsigned int apicid, int vector, @@ -56,32 +55,53 @@ static void __x2apic_send_IPI_dest(unsigned int apicid, int vector, * at once. We have 16 cpu's in a cluster. This will minimize IPI register * writes. */ -static void x2apic_send_IPI_mask(cpumask_t mask, int vector) +static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector) { unsigned long flags; unsigned long query_cpu; local_irq_save(flags); - for_each_cpu_mask(query_cpu, mask) { - __x2apic_send_IPI_dest(per_cpu(x86_cpu_to_logical_apicid, query_cpu), - vector, APIC_DEST_LOGICAL); - } + for_each_cpu(query_cpu, mask) + __x2apic_send_IPI_dest( + per_cpu(x86_cpu_to_logical_apicid, query_cpu), + vector, APIC_DEST_LOGICAL); local_irq_restore(flags); } -static void x2apic_send_IPI_allbutself(int vector) +static void x2apic_send_IPI_mask_allbutself(const struct cpumask *mask, + int vector) { - cpumask_t mask = cpu_online_map; + unsigned long flags; + unsigned long query_cpu; + unsigned long this_cpu = smp_processor_id(); - cpu_clear(smp_processor_id(), mask); + local_irq_save(flags); + for_each_cpu(query_cpu, mask) + if (query_cpu != this_cpu) + __x2apic_send_IPI_dest( + per_cpu(x86_cpu_to_logical_apicid, query_cpu), + vector, APIC_DEST_LOGICAL); + local_irq_restore(flags); +} + +static void x2apic_send_IPI_allbutself(int vector) +{ + unsigned long flags; + unsigned long query_cpu; + unsigned long this_cpu = smp_processor_id(); - if (!cpus_empty(mask)) - x2apic_send_IPI_mask(mask, vector); + local_irq_save(flags); + for_each_online_cpu(query_cpu) + if (query_cpu != this_cpu) + __x2apic_send_IPI_dest( + per_cpu(x86_cpu_to_logical_apicid, query_cpu), + vector, APIC_DEST_LOGICAL); + local_irq_restore(flags); } static void x2apic_send_IPI_all(int vector) { - x2apic_send_IPI_mask(cpu_online_map, vector); + x2apic_send_IPI_mask(cpu_online_mask, vector); } static int x2apic_apic_id_registered(void) @@ -89,21 +109,38 @@ static int x2apic_apic_id_registered(void) return 1; } -static unsigned int x2apic_cpu_mask_to_apicid(cpumask_t cpumask) +static unsigned int x2apic_cpu_mask_to_apicid(const struct cpumask *cpumask) { int cpu; /* - * We're using fixed IRQ delivery, can only return one phys APIC ID. + * We're using fixed IRQ delivery, can only return one logical APIC ID. * May as well be the first. */ - cpu = first_cpu(cpumask); - if ((unsigned)cpu < NR_CPUS) + cpu = cpumask_first(cpumask); + if ((unsigned)cpu < nr_cpu_ids) return per_cpu(x86_cpu_to_logical_apicid, cpu); else return BAD_APICID; } +static unsigned int x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask, + const struct cpumask *andmask) +{ + int cpu; + + /* + * We're using fixed IRQ delivery, can only return one logical APIC ID. + * May as well be the first. + */ + for_each_cpu_and(cpu, cpumask, andmask) + if (cpumask_test_cpu(cpu, cpu_online_mask)) + break; + if (cpu < nr_cpu_ids) + return per_cpu(x86_cpu_to_logical_apicid, cpu); + return BAD_APICID; +} + static unsigned int get_apic_id(unsigned long x) { unsigned int id; @@ -150,8 +187,10 @@ struct genapic apic_x2apic_cluster = { .send_IPI_all = x2apic_send_IPI_all, .send_IPI_allbutself = x2apic_send_IPI_allbutself, .send_IPI_mask = x2apic_send_IPI_mask, + .send_IPI_mask_allbutself = x2apic_send_IPI_mask_allbutself, .send_IPI_self = x2apic_send_IPI_self, .cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid, + .cpu_mask_to_apicid_and = x2apic_cpu_mask_to_apicid_and, .phys_pkg_id = phys_pkg_id, .get_apic_id = get_apic_id, .set_apic_id = set_apic_id, diff --git a/arch/x86/kernel/genx2apic_phys.c b/arch/x86/kernel/genx2apic_phys.c index a177c7880ab..21bcc0e098b 100644 --- a/arch/x86/kernel/genx2apic_phys.c +++ b/arch/x86/kernel/genx2apic_phys.c @@ -29,16 +29,15 @@ static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id) /* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */ -static cpumask_t x2apic_target_cpus(void) +static const struct cpumask *x2apic_target_cpus(void) { - return cpumask_of_cpu(0); + return cpumask_of(0); } -static cpumask_t x2apic_vector_allocation_domain(int cpu) +static void x2apic_vector_allocation_domain(int cpu, struct cpumask *retmask) { - cpumask_t domain = CPU_MASK_NONE; - cpu_set(cpu, domain); - return domain; + cpumask_clear(retmask); + cpumask_set_cpu(cpu, retmask); } static void __x2apic_send_IPI_dest(unsigned int apicid, int vector, @@ -54,32 +53,54 @@ static void __x2apic_send_IPI_dest(unsigned int apicid, int vector, x2apic_icr_write(cfg, apicid); } -static void x2apic_send_IPI_mask(cpumask_t mask, int vector) +static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector) { unsigned long flags; unsigned long query_cpu; local_irq_save(flags); - for_each_cpu_mask(query_cpu, mask) { + for_each_cpu(query_cpu, mask) { __x2apic_send_IPI_dest(per_cpu(x86_cpu_to_apicid, query_cpu), vector, APIC_DEST_PHYSICAL); } local_irq_restore(flags); } -static void x2apic_send_IPI_allbutself(int vector) +static void x2apic_send_IPI_mask_allbutself(const struct cpumask *mask, + int vector) { - cpumask_t mask = cpu_online_map; + unsigned long flags; + unsigned long query_cpu; + unsigned long this_cpu = smp_processor_id(); + + local_irq_save(flags); + for_each_cpu(query_cpu, mask) { + if (query_cpu != this_cpu) + __x2apic_send_IPI_dest( + per_cpu(x86_cpu_to_apicid, query_cpu), + vector, APIC_DEST_PHYSICAL); + } + local_irq_restore(flags); +} - cpu_clear(smp_processor_id(), mask); +static void x2apic_send_IPI_allbutself(int vector) +{ + unsigned long flags; + unsigned long query_cpu; + unsigned long this_cpu = smp_processor_id(); - if (!cpus_empty(mask)) - x2apic_send_IPI_mask(mask, vector); + local_irq_save(flags); + for_each_online_cpu(query_cpu) + if (query_cpu != this_cpu) + __x2apic_send_IPI_dest( + per_cpu(x86_cpu_to_apicid, query_cpu), + vector, APIC_DEST_PHYSICAL); + local_irq_restore(flags); } static void x2apic_send_IPI_all(int vector) { - x2apic_send_IPI_mask(cpu_online_map, vector); + x2apic_send_IPI_mask(cpu_online_mask, vector); } static int x2apic_apic_id_registered(void) @@ -87,7 +108,7 @@ static int x2apic_apic_id_registered(void) return 1; } -static unsigned int x2apic_cpu_mask_to_apicid(cpumask_t cpumask) +static unsigned int x2apic_cpu_mask_to_apicid(const struct cpumask *cpumask) { int cpu; @@ -95,13 +116,30 @@ static unsigned int x2apic_cpu_mask_to_apicid(cpumask_t cpumask) * We're using fixed IRQ delivery, can only return one phys APIC ID. * May as well be the first. */ - cpu = first_cpu(cpumask); - if ((unsigned)cpu < NR_CPUS) + cpu = cpumask_first(cpumask); + if ((unsigned)cpu < nr_cpu_ids) return per_cpu(x86_cpu_to_apicid, cpu); else return BAD_APICID; } +static unsigned int x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask, + const struct cpumask *andmask) +{ + int cpu; + + /* + * We're using fixed IRQ delivery, can only return one phys APIC ID. + * May as well be the first. + */ + for_each_cpu_and(cpu, cpumask, andmask) + if (cpumask_test_cpu(cpu, cpu_online_mask)) + break; + if (cpu < nr_cpu_ids) + return per_cpu(x86_cpu_to_apicid, cpu); + return BAD_APICID; +} + static unsigned int get_apic_id(unsigned long x) { unsigned int id; @@ -145,8 +183,10 @@ struct genapic apic_x2apic_phys = { .send_IPI_all = x2apic_send_IPI_all, .send_IPI_allbutself = x2apic_send_IPI_allbutself, .send_IPI_mask = x2apic_send_IPI_mask, + .send_IPI_mask_allbutself = x2apic_send_IPI_mask_allbutself, .send_IPI_self = x2apic_send_IPI_self, .cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid, + .cpu_mask_to_apicid_and = x2apic_cpu_mask_to_apicid_and, .phys_pkg_id = phys_pkg_id, .get_apic_id = get_apic_id, .set_apic_id = set_apic_id, diff --git a/arch/x86/kernel/genx2apic_uv_x.c b/arch/x86/kernel/genx2apic_uv_x.c index dece1728973..b193e082f6c 100644 --- a/arch/x86/kernel/genx2apic_uv_x.c +++ b/arch/x86/kernel/genx2apic_uv_x.c @@ -79,16 +79,15 @@ EXPORT_SYMBOL(sn_rtc_cycles_per_second); /* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */ -static cpumask_t uv_target_cpus(void) +static const struct cpumask *uv_target_cpus(void) { - return cpumask_of_cpu(0); + return cpumask_of(0); } -static cpumask_t uv_vector_allocation_domain(int cpu) +static void uv_vector_allocation_domain(int cpu, struct cpumask *retmask) { - cpumask_t domain = CPU_MASK_NONE; - cpu_set(cpu, domain); - return domain; + cpumask_clear(retmask); + cpumask_set_cpu(cpu, retmask); } int uv_wakeup_secondary(int phys_apicid, unsigned int start_rip) @@ -127,28 +126,37 @@ static void uv_send_IPI_one(int cpu, int vector) uv_write_global_mmr64(pnode, UVH_IPI_INT, val); } -static void uv_send_IPI_mask(cpumask_t mask, int vector) +static void uv_send_IPI_mask(const struct cpumask *mask, int vector) { unsigned int cpu; - for_each_possible_cpu(cpu) - if (cpu_isset(cpu, mask)) + for_each_cpu(cpu, mask) + uv_send_IPI_one(cpu, vector); +} + +static void uv_send_IPI_mask_allbutself(const struct cpumask *mask, int vector) +{ + unsigned int cpu; + unsigned int this_cpu = smp_processor_id(); + + for_each_cpu(cpu, mask) + if (cpu != this_cpu) uv_send_IPI_one(cpu, vector); } static void uv_send_IPI_allbutself(int vector) { - cpumask_t mask = cpu_online_map; - - cpu_clear(smp_processor_id(), mask); + unsigned int cpu; + unsigned int this_cpu = smp_processor_id(); - if (!cpus_empty(mask)) - uv_send_IPI_mask(mask, vector); + for_each_online_cpu(cpu) + if (cpu != this_cpu) + uv_send_IPI_one(cpu, vector); } static void uv_send_IPI_all(int vector) { - uv_send_IPI_mask(cpu_online_map, vector); + uv_send_IPI_mask(cpu_online_mask, vector); } static int uv_apic_id_registered(void) @@ -160,7 +168,7 @@ static void uv_init_apic_ldr(void) { } -static unsigned int uv_cpu_mask_to_apicid(cpumask_t cpumask) +static unsigned int uv_cpu_mask_to_apicid(const struct cpumask *cpumask) { int cpu; @@ -168,13 +176,30 @@ static unsigned int uv_cpu_mask_to_apicid(cpumask_t cpumask) * We're using fixed IRQ delivery, can only return one phys APIC ID. * May as well be the first. */ - cpu = first_cpu(cpumask); + cpu = cpumask_first(cpumask); if ((unsigned)cpu < nr_cpu_ids) return per_cpu(x86_cpu_to_apicid, cpu); else return BAD_APICID; } +static unsigned int uv_cpu_mask_to_apicid_and(const struct cpumask *cpumask, + const struct cpumask *andmask) +{ + int cpu; + + /* + * We're using fixed IRQ delivery, can only return one phys APIC ID. + * May as well be the first. + */ + for_each_cpu_and(cpu, cpumask, andmask) + if (cpumask_test_cpu(cpu, cpu_online_mask)) + break; + if (cpu < nr_cpu_ids) + return per_cpu(x86_cpu_to_apicid, cpu); + return BAD_APICID; +} + static unsigned int get_apic_id(unsigned long x) { unsigned int id; @@ -222,8 +247,10 @@ struct genapic apic_x2apic_uv_x = { .send_IPI_all = uv_send_IPI_all, .send_IPI_allbutself = uv_send_IPI_allbutself, .send_IPI_mask = uv_send_IPI_mask, + .send_IPI_mask_allbutself = uv_send_IPI_mask_allbutself, .send_IPI_self = uv_send_IPI_self, .cpu_mask_to_apicid = uv_cpu_mask_to_apicid, + .cpu_mask_to_apicid_and = uv_cpu_mask_to_apicid_and, .phys_pkg_id = phys_pkg_id, .get_apic_id = get_apic_id, .set_apic_id = set_apic_id, diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 3f0a3edf0a5..cd759ad9069 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -248,7 +248,7 @@ static void hpet_legacy_clockevent_register(void) * Start hpet with the boot cpu mask and make it * global after the IO_APIC has been initialized. */ - hpet_clockevent.cpumask = cpumask_of_cpu(smp_processor_id()); + hpet_clockevent.cpumask = cpumask_of(smp_processor_id()); clockevents_register_device(&hpet_clockevent); global_clock_event = &hpet_clockevent; printk(KERN_DEBUG "hpet clockevent registered\n"); @@ -303,7 +303,7 @@ static void hpet_set_mode(enum clock_event_mode mode, struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt); hpet_setup_msi_irq(hdev->irq); disable_irq(hdev->irq); - irq_set_affinity(hdev->irq, cpumask_of_cpu(hdev->cpu)); + irq_set_affinity(hdev->irq, cpumask_of(hdev->cpu)); enable_irq(hdev->irq); } break; @@ -451,7 +451,7 @@ static int hpet_setup_irq(struct hpet_dev *dev) return -1; disable_irq(dev->irq); - irq_set_affinity(dev->irq, cpumask_of_cpu(dev->cpu)); + irq_set_affinity(dev->irq, cpumask_of(dev->cpu)); enable_irq(dev->irq); printk(KERN_DEBUG "hpet: %s irq %d for MSI\n", @@ -502,7 +502,7 @@ static void init_one_hpet_msi_clockevent(struct hpet_dev *hdev, int cpu) /* 5 usec minimum reprogramming delta. */ evt->min_delta_ns = 5000; - evt->cpumask = cpumask_of_cpu(hdev->cpu); + evt->cpumask = cpumask_of(hdev->cpu); clockevents_register_device(evt); } @@ -813,7 +813,7 @@ int __init hpet_enable(void) out_nohpet: hpet_clear_mapping(); - boot_hpet_disable = 1; + hpet_address = 0; return 0; } @@ -836,10 +836,11 @@ static __init int hpet_late_init(void) hpet_address = force_hpet_address; hpet_enable(); - if (!hpet_virt_address) - return -ENODEV; } + if (!hpet_virt_address) + return -ENODEV; + hpet_reserve_platform_timers(hpet_readl(HPET_ID)); for_each_online_cpu(cpu) { diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c index c1b5e3ece1f..10f92fb532f 100644 --- a/arch/x86/kernel/i8253.c +++ b/arch/x86/kernel/i8253.c @@ -114,7 +114,7 @@ void __init setup_pit_timer(void) * Start pit with the boot cpu mask and make it global after the * IO_APIC has been initialized. */ - pit_clockevent.cpumask = cpumask_of_cpu(smp_processor_id()); + pit_clockevent.cpumask = cpumask_of(smp_processor_id()); pit_clockevent.mult = div_sc(CLOCK_TICK_RATE, NSEC_PER_SEC, pit_clockevent.shift); pit_clockevent.max_delta_ns = diff --git a/arch/x86/kernel/init_task.c b/arch/x86/kernel/init_task.c index d39918076bb..df3bf269bea 100644 --- a/arch/x86/kernel/init_task.c +++ b/arch/x86/kernel/init_task.c @@ -10,7 +10,6 @@ #include <asm/pgtable.h> #include <asm/desc.h> -static struct fs_struct init_fs = INIT_FS; static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); struct mm_struct init_mm = INIT_MM(init_mm); diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/io_apic.c index b8c8a8e9934..69911722b9d 100644 --- a/arch/x86/kernel/io_apic.c +++ b/arch/x86/kernel/io_apic.c @@ -108,94 +108,277 @@ static int __init parse_noapic(char *str) early_param("noapic", parse_noapic); struct irq_pin_list; + +/* + * This is performance-critical, we want to do it O(1) + * + * the indexing order of this array favors 1:1 mappings + * between pins and IRQs. + */ + +struct irq_pin_list { + int apic, pin; + struct irq_pin_list *next; +}; + +static struct irq_pin_list *get_one_free_irq_2_pin(int cpu) +{ + struct irq_pin_list *pin; + int node; + + node = cpu_to_node(cpu); + + pin = kzalloc_node(sizeof(*pin), GFP_ATOMIC, node); + printk(KERN_DEBUG " alloc irq_2_pin on cpu %d node %d\n", cpu, node); + + return pin; +} + struct irq_cfg { - unsigned int irq; struct irq_pin_list *irq_2_pin; - cpumask_t domain; - cpumask_t old_domain; + cpumask_var_t domain; + cpumask_var_t old_domain; unsigned move_cleanup_count; u8 vector; u8 move_in_progress : 1; +#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC + u8 move_desc_pending : 1; +#endif }; /* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */ +#ifdef CONFIG_SPARSE_IRQ +static struct irq_cfg irq_cfgx[] = { +#else static struct irq_cfg irq_cfgx[NR_IRQS] = { - [0] = { .irq = 0, .domain = CPU_MASK_ALL, .vector = IRQ0_VECTOR, }, - [1] = { .irq = 1, .domain = CPU_MASK_ALL, .vector = IRQ1_VECTOR, }, - [2] = { .irq = 2, .domain = CPU_MASK_ALL, .vector = IRQ2_VECTOR, }, - [3] = { .irq = 3, .domain = CPU_MASK_ALL, .vector = IRQ3_VECTOR, }, - [4] = { .irq = 4, .domain = CPU_MASK_ALL, .vector = IRQ4_VECTOR, }, - [5] = { .irq = 5, .domain = CPU_MASK_ALL, .vector = IRQ5_VECTOR, }, - [6] = { .irq = 6, .domain = CPU_MASK_ALL, .vector = IRQ6_VECTOR, }, - [7] = { .irq = 7, .domain = CPU_MASK_ALL, .vector = IRQ7_VECTOR, }, - [8] = { .irq = 8, .domain = CPU_MASK_ALL, .vector = IRQ8_VECTOR, }, - [9] = { .irq = 9, .domain = CPU_MASK_ALL, .vector = IRQ9_VECTOR, }, - [10] = { .irq = 10, .domain = CPU_MASK_ALL, .vector = IRQ10_VECTOR, }, - [11] = { .irq = 11, .domain = CPU_MASK_ALL, .vector = IRQ11_VECTOR, }, - [12] = { .irq = 12, .domain = CPU_MASK_ALL, .vector = IRQ12_VECTOR, }, - [13] = { .irq = 13, .domain = CPU_MASK_ALL, .vector = IRQ13_VECTOR, }, - [14] = { .irq = 14, .domain = CPU_MASK_ALL, .vector = IRQ14_VECTOR, }, - [15] = { .irq = 15, .domain = CPU_MASK_ALL, .vector = IRQ15_VECTOR, }, +#endif + [0] = { .vector = IRQ0_VECTOR, }, + [1] = { .vector = IRQ1_VECTOR, }, + [2] = { .vector = IRQ2_VECTOR, }, + [3] = { .vector = IRQ3_VECTOR, }, + [4] = { .vector = IRQ4_VECTOR, }, + [5] = { .vector = IRQ5_VECTOR, }, + [6] = { .vector = IRQ6_VECTOR, }, + [7] = { .vector = IRQ7_VECTOR, }, + [8] = { .vector = IRQ8_VECTOR, }, + [9] = { .vector = IRQ9_VECTOR, }, + [10] = { .vector = IRQ10_VECTOR, }, + [11] = { .vector = IRQ11_VECTOR, }, + [12] = { .vector = IRQ12_VECTOR, }, + [13] = { .vector = IRQ13_VECTOR, }, + [14] = { .vector = IRQ14_VECTOR, }, + [15] = { .vector = IRQ15_VECTOR, }, }; -#define for_each_irq_cfg(irq, cfg) \ - for (irq = 0, cfg = irq_cfgx; irq < nr_irqs; irq++, cfg++) +int __init arch_early_irq_init(void) +{ + struct irq_cfg *cfg; + struct irq_desc *desc; + int count; + int i; + + cfg = irq_cfgx; + count = ARRAY_SIZE(irq_cfgx); + + for (i = 0; i < count; i++) { + desc = irq_to_desc(i); + desc->chip_data = &cfg[i]; + alloc_bootmem_cpumask_var(&cfg[i].domain); + alloc_bootmem_cpumask_var(&cfg[i].old_domain); + if (i < NR_IRQS_LEGACY) + cpumask_setall(cfg[i].domain); + } + + return 0; +} +#ifdef CONFIG_SPARSE_IRQ static struct irq_cfg *irq_cfg(unsigned int irq) { - return irq < nr_irqs ? irq_cfgx + irq : NULL; + struct irq_cfg *cfg = NULL; + struct irq_desc *desc; + + desc = irq_to_desc(irq); + if (desc) + cfg = desc->chip_data; + + return cfg; } -static struct irq_cfg *irq_cfg_alloc(unsigned int irq) +static struct irq_cfg *get_one_free_irq_cfg(int cpu) { - return irq_cfg(irq); + struct irq_cfg *cfg; + int node; + + node = cpu_to_node(cpu); + + cfg = kzalloc_node(sizeof(*cfg), GFP_ATOMIC, node); + if (cfg) { + /* FIXME: needs alloc_cpumask_var_node() */ + if (!alloc_cpumask_var(&cfg->domain, GFP_ATOMIC)) { + kfree(cfg); + cfg = NULL; + } else if (!alloc_cpumask_var(&cfg->old_domain, GFP_ATOMIC)) { + free_cpumask_var(cfg->domain); + kfree(cfg); + cfg = NULL; + } else { + cpumask_clear(cfg->domain); + cpumask_clear(cfg->old_domain); + } + } + printk(KERN_DEBUG " alloc irq_cfg on cpu %d node %d\n", cpu, node); + + return cfg; } -/* - * Rough estimation of how many shared IRQs there are, can be changed - * anytime. - */ -#define MAX_PLUS_SHARED_IRQS NR_IRQS -#define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS) +int arch_init_chip_data(struct irq_desc *desc, int cpu) +{ + struct irq_cfg *cfg; -/* - * This is performance-critical, we want to do it O(1) - * - * the indexing order of this array favors 1:1 mappings - * between pins and IRQs. - */ + cfg = desc->chip_data; + if (!cfg) { + desc->chip_data = get_one_free_irq_cfg(cpu); + if (!desc->chip_data) { + printk(KERN_ERR "can not alloc irq_cfg\n"); + BUG_ON(1); + } + } -struct irq_pin_list { - int apic, pin; - struct irq_pin_list *next; -}; + return 0; +} -static struct irq_pin_list irq_2_pin_head[PIN_MAP_SIZE]; -static struct irq_pin_list *irq_2_pin_ptr; +#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC -static void __init irq_2_pin_init(void) +static void +init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg, int cpu) { - struct irq_pin_list *pin = irq_2_pin_head; - int i; + struct irq_pin_list *old_entry, *head, *tail, *entry; + + cfg->irq_2_pin = NULL; + old_entry = old_cfg->irq_2_pin; + if (!old_entry) + return; + + entry = get_one_free_irq_2_pin(cpu); + if (!entry) + return; - for (i = 1; i < PIN_MAP_SIZE; i++) - pin[i-1].next = &pin[i]; + entry->apic = old_entry->apic; + entry->pin = old_entry->pin; + head = entry; + tail = entry; + old_entry = old_entry->next; + while (old_entry) { + entry = get_one_free_irq_2_pin(cpu); + if (!entry) { + entry = head; + while (entry) { + head = entry->next; + kfree(entry); + entry = head; + } + /* still use the old one */ + return; + } + entry->apic = old_entry->apic; + entry->pin = old_entry->pin; + tail->next = entry; + tail = entry; + old_entry = old_entry->next; + } - irq_2_pin_ptr = &pin[0]; + tail->next = NULL; + cfg->irq_2_pin = head; } -static struct irq_pin_list *get_one_free_irq_2_pin(void) +static void free_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg) { - struct irq_pin_list *pin = irq_2_pin_ptr; + struct irq_pin_list *entry, *next; - if (!pin) - panic("can not get more irq_2_pin\n"); + if (old_cfg->irq_2_pin == cfg->irq_2_pin) + return; - irq_2_pin_ptr = pin->next; - pin->next = NULL; - return pin; + entry = old_cfg->irq_2_pin; + + while (entry) { + next = entry->next; + kfree(entry); + entry = next; + } + old_cfg->irq_2_pin = NULL; +} + +void arch_init_copy_chip_data(struct irq_desc *old_desc, + struct irq_desc *desc, int cpu) +{ + struct irq_cfg *cfg; + struct irq_cfg *old_cfg; + + cfg = get_one_free_irq_cfg(cpu); + + if (!cfg) + return; + + desc->chip_data = cfg; + + old_cfg = old_desc->chip_data; + + memcpy(cfg, old_cfg, sizeof(struct irq_cfg)); + + init_copy_irq_2_pin(old_cfg, cfg, cpu); } +static void free_irq_cfg(struct irq_cfg *old_cfg) +{ + kfree(old_cfg); +} + +void arch_free_chip_data(struct irq_desc *old_desc, struct irq_desc *desc) +{ + struct irq_cfg *old_cfg, *cfg; + + old_cfg = old_desc->chip_data; + cfg = desc->chip_data; + + if (old_cfg == cfg) + return; + + if (old_cfg) { + free_irq_2_pin(old_cfg, cfg); + free_irq_cfg(old_cfg); + old_desc->chip_data = NULL; + } +} + +static void +set_extra_move_desc(struct irq_desc *desc, const struct cpumask *mask) +{ + struct irq_cfg *cfg = desc->chip_data; + + if (!cfg->move_in_progress) { + /* it means that domain is not changed */ + if (!cpumask_intersects(&desc->affinity, mask)) + cfg->move_desc_pending = 1; + } +} +#endif + +#else +static struct irq_cfg *irq_cfg(unsigned int irq) +{ + return irq < nr_irqs ? irq_cfgx + irq : NULL; +} + +#endif + +#ifndef CONFIG_NUMA_MIGRATE_IRQ_DESC +static inline void +set_extra_move_desc(struct irq_desc *desc, const struct cpumask *mask) +{ +} +#endif + struct io_apic { unsigned int index; unsigned int unused[3]; @@ -237,11 +420,10 @@ static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned writel(value, &io_apic->data); } -static bool io_apic_level_ack_pending(unsigned int irq) +static bool io_apic_level_ack_pending(struct irq_cfg *cfg) { struct irq_pin_list *entry; unsigned long flags; - struct irq_cfg *cfg = irq_cfg(irq); spin_lock_irqsave(&ioapic_lock, flags); entry = cfg->irq_2_pin; @@ -323,13 +505,32 @@ static void ioapic_mask_entry(int apic, int pin) } #ifdef CONFIG_SMP -static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, u8 vector) +static void send_cleanup_vector(struct irq_cfg *cfg) +{ + cpumask_var_t cleanup_mask; + + if (unlikely(!alloc_cpumask_var(&cleanup_mask, GFP_ATOMIC))) { + unsigned int i; + cfg->move_cleanup_count = 0; + for_each_cpu_and(i, cfg->old_domain, cpu_online_mask) + cfg->move_cleanup_count++; + for_each_cpu_and(i, cfg->old_domain, cpu_online_mask) + send_IPI_mask(cpumask_of(i), IRQ_MOVE_CLEANUP_VECTOR); + } else { + cpumask_and(cleanup_mask, cfg->old_domain, cpu_online_mask); + cfg->move_cleanup_count = cpumask_weight(cleanup_mask); + send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); + free_cpumask_var(cleanup_mask); + } + cfg->move_in_progress = 0; +} + +static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq_cfg *cfg) { int apic, pin; - struct irq_cfg *cfg; struct irq_pin_list *entry; + u8 vector = cfg->vector; - cfg = irq_cfg(irq); entry = cfg->irq_2_pin; for (;;) { unsigned int reg; @@ -359,36 +560,61 @@ static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, u8 vector) } } -static int assign_irq_vector(int irq, cpumask_t mask); +static int +assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask); -static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask) +/* + * Either sets desc->affinity to a valid value, and returns cpu_mask_to_apicid + * of that, or returns BAD_APICID and leaves desc->affinity untouched. + */ +static unsigned int +set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask) +{ + struct irq_cfg *cfg; + unsigned int irq; + + if (!cpumask_intersects(mask, cpu_online_mask)) + return BAD_APICID; + + irq = desc->irq; + cfg = desc->chip_data; + if (assign_irq_vector(irq, cfg, mask)) + return BAD_APICID; + + cpumask_and(&desc->affinity, cfg->domain, mask); + set_extra_move_desc(desc, mask); + return cpu_mask_to_apicid_and(&desc->affinity, cpu_online_mask); +} + +static void +set_ioapic_affinity_irq_desc(struct irq_desc *desc, const struct cpumask *mask) { struct irq_cfg *cfg; unsigned long flags; unsigned int dest; - cpumask_t tmp; - struct irq_desc *desc; + unsigned int irq; - cpus_and(tmp, mask, cpu_online_map); - if (cpus_empty(tmp)) - return; + irq = desc->irq; + cfg = desc->chip_data; - cfg = irq_cfg(irq); - if (assign_irq_vector(irq, mask)) - return; + spin_lock_irqsave(&ioapic_lock, flags); + dest = set_desc_affinity(desc, mask); + if (dest != BAD_APICID) { + /* Only the high 8 bits are valid. */ + dest = SET_APIC_LOGICAL_ID(dest); + __target_IO_APIC_irq(irq, dest, cfg); + } + spin_unlock_irqrestore(&ioapic_lock, flags); +} - cpus_and(tmp, cfg->domain, mask); - dest = cpu_mask_to_apicid(tmp); - /* - * Only the high 8 bits are valid. - */ - dest = SET_APIC_LOGICAL_ID(dest); +static void +set_ioapic_affinity_irq(unsigned int irq, const struct cpumask *mask) +{ + struct irq_desc *desc; desc = irq_to_desc(irq); - spin_lock_irqsave(&ioapic_lock, flags); - __target_IO_APIC_irq(irq, dest, cfg->vector); - desc->affinity = mask; - spin_unlock_irqrestore(&ioapic_lock, flags); + + set_ioapic_affinity_irq_desc(desc, mask); } #endif /* CONFIG_SMP */ @@ -397,16 +623,18 @@ static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask) * shared ISA-space IRQs, so we have to support them. We are super * fast in the common case, and fast for shared ISA-space IRQs. */ -static void add_pin_to_irq(unsigned int irq, int apic, int pin) +static void add_pin_to_irq_cpu(struct irq_cfg *cfg, int cpu, int apic, int pin) { - struct irq_cfg *cfg; struct irq_pin_list *entry; - /* first time to refer irq_cfg, so with new */ - cfg = irq_cfg_alloc(irq); entry = cfg->irq_2_pin; if (!entry) { - entry = get_one_free_irq_2_pin(); + entry = get_one_free_irq_2_pin(cpu); + if (!entry) { + printk(KERN_ERR "can not alloc irq_2_pin to add %d - %d\n", + apic, pin); + return; + } cfg->irq_2_pin = entry; entry->apic = apic; entry->pin = pin; @@ -421,7 +649,7 @@ static void add_pin_to_irq(unsigned int irq, int apic, int pin) entry = entry->next; } - entry->next = get_one_free_irq_2_pin(); + entry->next = get_one_free_irq_2_pin(cpu); entry = entry->next; entry->apic = apic; entry->pin = pin; @@ -430,11 +658,10 @@ static void add_pin_to_irq(unsigned int irq, int apic, int pin) /* * Reroute an IRQ to a different pin. */ -static void __init replace_pin_at_irq(unsigned int irq, +static void __init replace_pin_at_irq_cpu(struct irq_cfg *cfg, int cpu, int oldapic, int oldpin, int newapic, int newpin) { - struct irq_cfg *cfg = irq_cfg(irq); struct irq_pin_list *entry = cfg->irq_2_pin; int replaced = 0; @@ -451,18 +678,16 @@ static void __init replace_pin_at_irq(unsigned int irq, /* why? call replace before add? */ if (!replaced) - add_pin_to_irq(irq, newapic, newpin); + add_pin_to_irq_cpu(cfg, cpu, newapic, newpin); } -static inline void io_apic_modify_irq(unsigned int irq, +static inline void io_apic_modify_irq(struct irq_cfg *cfg, int mask_and, int mask_or, void (*final)(struct irq_pin_list *entry)) { int pin; - struct irq_cfg *cfg; struct irq_pin_list *entry; - cfg = irq_cfg(irq); for (entry = cfg->irq_2_pin; entry != NULL; entry = entry->next) { unsigned int reg; pin = entry->pin; @@ -475,9 +700,9 @@ static inline void io_apic_modify_irq(unsigned int irq, } } -static void __unmask_IO_APIC_irq(unsigned int irq) +static void __unmask_IO_APIC_irq(struct irq_cfg *cfg) { - io_apic_modify_irq(irq, ~IO_APIC_REDIR_MASKED, 0, NULL); + io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED, 0, NULL); } #ifdef CONFIG_X86_64 @@ -492,47 +717,64 @@ static void io_apic_sync(struct irq_pin_list *entry) readl(&io_apic->data); } -static void __mask_IO_APIC_irq(unsigned int irq) +static void __mask_IO_APIC_irq(struct irq_cfg *cfg) { - io_apic_modify_irq(irq, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync); + io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync); } #else /* CONFIG_X86_32 */ -static void __mask_IO_APIC_irq(unsigned int irq) +static void __mask_IO_APIC_irq(struct irq_cfg *cfg) { - io_apic_modify_irq(irq, ~0, IO_APIC_REDIR_MASKED, NULL); + io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, NULL); } -static void __mask_and_edge_IO_APIC_irq(unsigned int irq) +static void __mask_and_edge_IO_APIC_irq(struct irq_cfg *cfg) { - io_apic_modify_irq(irq, ~IO_APIC_REDIR_LEVEL_TRIGGER, + io_apic_modify_irq(cfg, ~IO_APIC_REDIR_LEVEL_TRIGGER, IO_APIC_REDIR_MASKED, NULL); } -static void __unmask_and_level_IO_APIC_irq(unsigned int irq) +static void __unmask_and_level_IO_APIC_irq(struct irq_cfg *cfg) { - io_apic_modify_irq(irq, ~IO_APIC_REDIR_MASKED, + io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED, IO_APIC_REDIR_LEVEL_TRIGGER, NULL); } #endif /* CONFIG_X86_32 */ -static void mask_IO_APIC_irq (unsigned int irq) +static void mask_IO_APIC_irq_desc(struct irq_desc *desc) { + struct irq_cfg *cfg = desc->chip_data; unsigned long flags; + BUG_ON(!cfg); + spin_lock_irqsave(&ioapic_lock, flags); - __mask_IO_APIC_irq(irq); + __mask_IO_APIC_irq(cfg); spin_unlock_irqrestore(&ioapic_lock, flags); } -static void unmask_IO_APIC_irq (unsigned int irq) +static void unmask_IO_APIC_irq_desc(struct irq_desc *desc) { + struct irq_cfg *cfg = desc->chip_data; unsigned long flags; spin_lock_irqsave(&ioapic_lock, flags); - __unmask_IO_APIC_irq(irq); + __unmask_IO_APIC_irq(cfg); spin_unlock_irqrestore(&ioapic_lock, flags); } +static void mask_IO_APIC_irq(unsigned int irq) +{ + struct irq_desc *desc = irq_to_desc(irq); + + mask_IO_APIC_irq_desc(desc); +} +static void unmask_IO_APIC_irq(unsigned int irq) +{ + struct irq_desc *desc = irq_to_desc(irq); + + unmask_IO_APIC_irq_desc(desc); +} + static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) { struct IO_APIC_route_entry entry; @@ -809,7 +1051,7 @@ EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector); */ static int EISA_ELCR(unsigned int irq) { - if (irq < 16) { + if (irq < NR_IRQS_LEGACY) { unsigned int port = 0x4d0 + (irq >> 3); return (inb(port) >> (irq & 7)) & 1; } @@ -1034,7 +1276,8 @@ void unlock_vector_lock(void) spin_unlock(&vector_lock); } -static int __assign_irq_vector(int irq, cpumask_t mask) +static int +__assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask) { /* * NOTE! The local APIC isn't very good at handling @@ -1049,52 +1292,49 @@ static int __assign_irq_vector(int irq, cpumask_t mask) */ static int current_vector = FIRST_DEVICE_VECTOR, current_offset = 0; unsigned int old_vector; - int cpu; - struct irq_cfg *cfg; - - cfg = irq_cfg(irq); - - /* Only try and allocate irqs on cpus that are present */ - cpus_and(mask, mask, cpu_online_map); + int cpu, err; + cpumask_var_t tmp_mask; if ((cfg->move_in_progress) || cfg->move_cleanup_count) return -EBUSY; + if (!alloc_cpumask_var(&tmp_mask, GFP_ATOMIC)) + return -ENOMEM; + old_vector = cfg->vector; if (old_vector) { - cpumask_t tmp; - cpus_and(tmp, cfg->domain, mask); - if (!cpus_empty(tmp)) + cpumask_and(tmp_mask, mask, cpu_online_mask); + cpumask_and(tmp_mask, cfg->domain, tmp_mask); + if (!cpumask_empty(tmp_mask)) { + free_cpumask_var(tmp_mask); return 0; + } } - for_each_cpu_mask_nr(cpu, mask) { - cpumask_t domain, new_mask; + /* Only try and allocate irqs on cpus that are present */ + err = -ENOSPC; + for_each_cpu_and(cpu, mask, cpu_online_mask) { int new_cpu; int vector, offset; - domain = vector_allocation_domain(cpu); - cpus_and(new_mask, domain, cpu_online_map); + vector_allocation_domain(cpu, tmp_mask); vector = current_vector; offset = current_offset; next: vector += 8; if (vector >= first_system_vector) { - /* If we run out of vectors on large boxen, must share them. */ + /* If out of vectors on large boxen, must share them. */ offset = (offset + 1) % 8; vector = FIRST_DEVICE_VECTOR + offset; } if (unlikely(current_vector == vector)) continue; -#ifdef CONFIG_X86_64 - if (vector == IA32_SYSCALL_VECTOR) - goto next; -#else - if (vector == SYSCALL_VECTOR) + + if (test_bit(vector, used_vectors)) goto next; -#endif - for_each_cpu_mask_nr(new_cpu, new_mask) + + for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask) if (per_cpu(vector_irq, new_cpu)[vector] != -1) goto next; /* Found one! */ @@ -1102,49 +1342,47 @@ next: current_offset = offset; if (old_vector) { cfg->move_in_progress = 1; - cfg->old_domain = cfg->domain; + cpumask_copy(cfg->old_domain, cfg->domain); } - for_each_cpu_mask_nr(new_cpu, new_mask) + for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask) per_cpu(vector_irq, new_cpu)[vector] = irq; cfg->vector = vector; - cfg->domain = domain; - return 0; + cpumask_copy(cfg->domain, tmp_mask); + err = 0; + break; } - return -ENOSPC; + free_cpumask_var(tmp_mask); + return err; } -static int assign_irq_vector(int irq, cpumask_t mask) +static int +assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask) { int err; unsigned long flags; spin_lock_irqsave(&vector_lock, flags); - err = __assign_irq_vector(irq, mask); + err = __assign_irq_vector(irq, cfg, mask); spin_unlock_irqrestore(&vector_lock, flags); return err; } -static void __clear_irq_vector(int irq) +static void __clear_irq_vector(int irq, struct irq_cfg *cfg) { - struct irq_cfg *cfg; - cpumask_t mask; int cpu, vector; - cfg = irq_cfg(irq); BUG_ON(!cfg->vector); vector = cfg->vector; - cpus_and(mask, cfg->domain, cpu_online_map); - for_each_cpu_mask_nr(cpu, mask) + for_each_cpu_and(cpu, cfg->domain, cpu_online_mask) per_cpu(vector_irq, cpu)[vector] = -1; cfg->vector = 0; - cpus_clear(cfg->domain); + cpumask_clear(cfg->domain); if (likely(!cfg->move_in_progress)) return; - cpus_and(mask, cfg->old_domain, cpu_online_map); - for_each_cpu_mask_nr(cpu, mask) { + for_each_cpu_and(cpu, cfg->old_domain, cpu_online_mask) { for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { if (per_cpu(vector_irq, cpu)[vector] != irq) @@ -1162,10 +1400,12 @@ void __setup_vector_irq(int cpu) /* This function must be called with vector_lock held */ int irq, vector; struct irq_cfg *cfg; + struct irq_desc *desc; /* Mark the inuse vectors */ - for_each_irq_cfg(irq, cfg) { - if (!cpu_isset(cpu, cfg->domain)) + for_each_irq_desc(irq, desc) { + cfg = desc->chip_data; + if (!cpumask_test_cpu(cpu, cfg->domain)) continue; vector = cfg->vector; per_cpu(vector_irq, cpu)[vector] = irq; @@ -1177,7 +1417,7 @@ void __setup_vector_irq(int cpu) continue; cfg = irq_cfg(irq); - if (!cpu_isset(cpu, cfg->domain)) + if (!cpumask_test_cpu(cpu, cfg->domain)) per_cpu(vector_irq, cpu)[vector] = -1; } } @@ -1215,11 +1455,8 @@ static inline int IO_APIC_irq_trigger(int irq) } #endif -static void ioapic_register_intr(int irq, unsigned long trigger) +static void ioapic_register_intr(int irq, struct irq_desc *desc, unsigned long trigger) { - struct irq_desc *desc; - - desc = irq_to_desc(irq); if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || trigger == IOAPIC_LEVEL) @@ -1311,23 +1548,22 @@ static int setup_ioapic_entry(int apic, int irq, return 0; } -static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq, +static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq, struct irq_desc *desc, int trigger, int polarity) { struct irq_cfg *cfg; struct IO_APIC_route_entry entry; - cpumask_t mask; + unsigned int dest; if (!IO_APIC_IRQ(irq)) return; - cfg = irq_cfg(irq); + cfg = desc->chip_data; - mask = TARGET_CPUS; - if (assign_irq_vector(irq, mask)) + if (assign_irq_vector(irq, cfg, TARGET_CPUS)) return; - cpus_and(mask, cfg->domain, mask); + dest = cpu_mask_to_apicid_and(cfg->domain, TARGET_CPUS); apic_printk(APIC_VERBOSE,KERN_DEBUG "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> " @@ -1337,16 +1573,15 @@ static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq, if (setup_ioapic_entry(mp_ioapics[apic].mp_apicid, irq, &entry, - cpu_mask_to_apicid(mask), trigger, polarity, - cfg->vector)) { + dest, trigger, polarity, cfg->vector)) { printk("Failed to setup ioapic entry for ioapic %d, pin %d\n", mp_ioapics[apic].mp_apicid, pin); - __clear_irq_vector(irq); + __clear_irq_vector(irq, cfg); return; } - ioapic_register_intr(irq, trigger); - if (irq < 16) + ioapic_register_intr(irq, desc, trigger); + if (irq < NR_IRQS_LEGACY) disable_8259A_irq(irq); ioapic_write_entry(apic, pin, entry); @@ -1356,6 +1591,9 @@ static void __init setup_IO_APIC_irqs(void) { int apic, pin, idx, irq; int notcon = 0; + struct irq_desc *desc; + struct irq_cfg *cfg; + int cpu = boot_cpu_id; apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); @@ -1387,9 +1625,15 @@ static void __init setup_IO_APIC_irqs(void) if (multi_timer_check(apic, irq)) continue; #endif - add_pin_to_irq(irq, apic, pin); + desc = irq_to_desc_alloc_cpu(irq, cpu); + if (!desc) { + printk(KERN_INFO "can not get irq_desc for %d\n", irq); + continue; + } + cfg = desc->chip_data; + add_pin_to_irq_cpu(cfg, cpu, apic, pin); - setup_IO_APIC_irq(apic, pin, irq, + setup_IO_APIC_irq(apic, pin, irq, desc, irq_trigger(idx), irq_polarity(idx)); } } @@ -1448,6 +1692,7 @@ __apicdebuginit(void) print_IO_APIC(void) union IO_APIC_reg_03 reg_03; unsigned long flags; struct irq_cfg *cfg; + struct irq_desc *desc; unsigned int irq; if (apic_verbosity == APIC_QUIET) @@ -1537,8 +1782,11 @@ __apicdebuginit(void) print_IO_APIC(void) } } printk(KERN_DEBUG "IRQ to pin mappings:\n"); - for_each_irq_cfg(irq, cfg) { - struct irq_pin_list *entry = cfg->irq_2_pin; + for_each_irq_desc(irq, desc) { + struct irq_pin_list *entry; + + cfg = desc->chip_data; + entry = cfg->irq_2_pin; if (!entry) continue; printk(KERN_DEBUG "IRQ%d ", irq); @@ -2022,14 +2270,16 @@ static unsigned int startup_ioapic_irq(unsigned int irq) { int was_pending = 0; unsigned long flags; + struct irq_cfg *cfg; spin_lock_irqsave(&ioapic_lock, flags); - if (irq < 16) { + if (irq < NR_IRQS_LEGACY) { disable_8259A_irq(irq); if (i8259A_irq_pending(irq)) was_pending = 1; } - __unmask_IO_APIC_irq(irq); + cfg = irq_cfg(irq); + __unmask_IO_APIC_irq(cfg); spin_unlock_irqrestore(&ioapic_lock, flags); return was_pending; @@ -2043,7 +2293,7 @@ static int ioapic_retrigger_irq(unsigned int irq) unsigned long flags; spin_lock_irqsave(&vector_lock, flags); - send_IPI_mask(cpumask_of_cpu(first_cpu(cfg->domain)), cfg->vector); + send_IPI_mask(cpumask_of(cpumask_first(cfg->domain)), cfg->vector); spin_unlock_irqrestore(&vector_lock, flags); return 1; @@ -2092,35 +2342,35 @@ static DECLARE_DELAYED_WORK(ir_migration_work, ir_irq_migration); * as simple as edge triggered migration and we can do the irq migration * with a simple atomic update to IO-APIC RTE. */ -static void migrate_ioapic_irq(int irq, cpumask_t mask) +static void +migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask) { struct irq_cfg *cfg; - struct irq_desc *desc; - cpumask_t tmp, cleanup_mask; struct irte irte; int modify_ioapic_rte; unsigned int dest; unsigned long flags; + unsigned int irq; - cpus_and(tmp, mask, cpu_online_map); - if (cpus_empty(tmp)) + if (!cpumask_intersects(mask, cpu_online_mask)) return; + irq = desc->irq; if (get_irte(irq, &irte)) return; - if (assign_irq_vector(irq, mask)) + cfg = desc->chip_data; + if (assign_irq_vector(irq, cfg, mask)) return; - cfg = irq_cfg(irq); - cpus_and(tmp, cfg->domain, mask); - dest = cpu_mask_to_apicid(tmp); + set_extra_move_desc(desc, mask); + + dest = cpu_mask_to_apicid_and(cfg->domain, mask); - desc = irq_to_desc(irq); modify_ioapic_rte = desc->status & IRQ_LEVEL; if (modify_ioapic_rte) { spin_lock_irqsave(&ioapic_lock, flags); - __target_IO_APIC_irq(irq, dest, cfg->vector); + __target_IO_APIC_irq(irq, dest, cfg); spin_unlock_irqrestore(&ioapic_lock, flags); } @@ -2132,24 +2382,20 @@ static void migrate_ioapic_irq(int irq, cpumask_t mask) */ modify_irte(irq, &irte); - if (cfg->move_in_progress) { - cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map); - cfg->move_cleanup_count = cpus_weight(cleanup_mask); - send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); - cfg->move_in_progress = 0; - } + if (cfg->move_in_progress) + send_cleanup_vector(cfg); - desc->affinity = mask; + cpumask_copy(&desc->affinity, mask); } -static int migrate_irq_remapped_level(int irq) +static int migrate_irq_remapped_level_desc(struct irq_desc *desc) { int ret = -1; - struct irq_desc *desc = irq_to_desc(irq); + struct irq_cfg *cfg = desc->chip_data; - mask_IO_APIC_irq(irq); + mask_IO_APIC_irq_desc(desc); - if (io_apic_level_ack_pending(irq)) { + if (io_apic_level_ack_pending(cfg)) { /* * Interrupt in progress. Migrating irq now will change the * vector information in the IO-APIC RTE and that will confuse @@ -2161,14 +2407,15 @@ static int migrate_irq_remapped_level(int irq) } /* everthing is clear. we have right of way */ - migrate_ioapic_irq(irq, desc->pending_mask); + migrate_ioapic_irq_desc(desc, &desc->pending_mask); ret = 0; desc->status &= ~IRQ_MOVE_PENDING; - cpus_clear(desc->pending_mask); + cpumask_clear(&desc->pending_mask); unmask: - unmask_IO_APIC_irq(irq); + unmask_IO_APIC_irq_desc(desc); + return ret; } @@ -2189,7 +2436,7 @@ static void ir_irq_migration(struct work_struct *work) continue; } - desc->chip->set_affinity(irq, desc->pending_mask); + desc->chip->set_affinity(irq, &desc->pending_mask); spin_unlock_irqrestore(&desc->lock, flags); } } @@ -2198,18 +2445,24 @@ static void ir_irq_migration(struct work_struct *work) /* * Migrates the IRQ destination in the process context. */ -static void set_ir_ioapic_affinity_irq(unsigned int irq, cpumask_t mask) +static void set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc, + const struct cpumask *mask) { - struct irq_desc *desc = irq_to_desc(irq); - if (desc->status & IRQ_LEVEL) { desc->status |= IRQ_MOVE_PENDING; - desc->pending_mask = mask; - migrate_irq_remapped_level(irq); + cpumask_copy(&desc->pending_mask, mask); + migrate_irq_remapped_level_desc(desc); return; } - migrate_ioapic_irq(irq, mask); + migrate_ioapic_irq_desc(desc, mask); +} +static void set_ir_ioapic_affinity_irq(unsigned int irq, + const struct cpumask *mask) +{ + struct irq_desc *desc = irq_to_desc(irq); + + set_ir_ioapic_affinity_irq_desc(desc, mask); } #endif @@ -2228,6 +2481,9 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void) struct irq_cfg *cfg; irq = __get_cpu_var(vector_irq)[vector]; + if (irq == -1) + continue; + desc = irq_to_desc(irq); if (!desc) continue; @@ -2237,7 +2493,7 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void) if (!cfg->move_cleanup_count) goto unlock; - if ((vector == cfg->vector) && cpu_isset(me, cfg->domain)) + if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain)) goto unlock; __get_cpu_var(vector_irq)[vector] = -1; @@ -2249,28 +2505,44 @@ unlock: irq_exit(); } -static void irq_complete_move(unsigned int irq) +static void irq_complete_move(struct irq_desc **descp) { - struct irq_cfg *cfg = irq_cfg(irq); + struct irq_desc *desc = *descp; + struct irq_cfg *cfg = desc->chip_data; unsigned vector, me; - if (likely(!cfg->move_in_progress)) + if (likely(!cfg->move_in_progress)) { +#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC + if (likely(!cfg->move_desc_pending)) + return; + + /* domain has not changed, but affinity did */ + me = smp_processor_id(); + if (cpu_isset(me, desc->affinity)) { + *descp = desc = move_irq_desc(desc, me); + /* get the new one */ + cfg = desc->chip_data; + cfg->move_desc_pending = 0; + } +#endif return; + } vector = ~get_irq_regs()->orig_ax; me = smp_processor_id(); - if ((vector == cfg->vector) && cpu_isset(me, cfg->domain)) { - cpumask_t cleanup_mask; +#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC + *descp = desc = move_irq_desc(desc, me); + /* get the new one */ + cfg = desc->chip_data; +#endif - cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map); - cfg->move_cleanup_count = cpus_weight(cleanup_mask); - send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); - cfg->move_in_progress = 0; - } + if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain)) + send_cleanup_vector(cfg); } #else -static inline void irq_complete_move(unsigned int irq) {} +static inline void irq_complete_move(struct irq_desc **descp) {} #endif + #ifdef CONFIG_INTR_REMAP static void ack_x2apic_level(unsigned int irq) { @@ -2281,11 +2553,14 @@ static void ack_x2apic_edge(unsigned int irq) { ack_x2APIC_irq(); } + #endif static void ack_apic_edge(unsigned int irq) { - irq_complete_move(irq); + struct irq_desc *desc = irq_to_desc(irq); + + irq_complete_move(&desc); move_native_irq(irq); ack_APIC_irq(); } @@ -2294,18 +2569,21 @@ atomic_t irq_mis_count; static void ack_apic_level(unsigned int irq) { + struct irq_desc *desc = irq_to_desc(irq); + #ifdef CONFIG_X86_32 unsigned long v; int i; #endif + struct irq_cfg *cfg; int do_unmask_irq = 0; - irq_complete_move(irq); + irq_complete_move(&desc); #ifdef CONFIG_GENERIC_PENDING_IRQ /* If we are moving the irq we need to mask it */ - if (unlikely(irq_to_desc(irq)->status & IRQ_MOVE_PENDING)) { + if (unlikely(desc->status & IRQ_MOVE_PENDING)) { do_unmask_irq = 1; - mask_IO_APIC_irq(irq); + mask_IO_APIC_irq_desc(desc); } #endif @@ -2329,7 +2607,8 @@ static void ack_apic_level(unsigned int irq) * operation to prevent an edge-triggered interrupt escaping meanwhile. * The idea is from Manfred Spraul. --macro */ - i = irq_cfg(irq)->vector; + cfg = desc->chip_data; + i = cfg->vector; v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1)); #endif @@ -2368,17 +2647,18 @@ static void ack_apic_level(unsigned int irq) * accurate and is causing problems then it is a hardware bug * and you can go talk to the chipset vendor about it. */ - if (!io_apic_level_ack_pending(irq)) + cfg = desc->chip_data; + if (!io_apic_level_ack_pending(cfg)) move_masked_irq(irq); - unmask_IO_APIC_irq(irq); + unmask_IO_APIC_irq_desc(desc); } #ifdef CONFIG_X86_32 if (!(v & (1 << (i & 0x1f)))) { atomic_inc(&irq_mis_count); spin_lock(&ioapic_lock); - __mask_and_edge_IO_APIC_irq(irq); - __unmask_and_level_IO_APIC_irq(irq); + __mask_and_edge_IO_APIC_irq(cfg); + __unmask_and_level_IO_APIC_irq(cfg); spin_unlock(&ioapic_lock); } #endif @@ -2429,20 +2709,19 @@ static inline void init_IO_APIC_traps(void) * Also, we've got to be careful not to trash gate * 0x80, because int 0x80 is hm, kind of importantish. ;) */ - for_each_irq_cfg(irq, cfg) { - if (IO_APIC_IRQ(irq) && !cfg->vector) { + for_each_irq_desc(irq, desc) { + cfg = desc->chip_data; + if (IO_APIC_IRQ(irq) && cfg && !cfg->vector) { /* * Hmm.. We don't have an entry for this, * so default to an old-fashioned 8259 * interrupt if we can.. */ - if (irq < 16) + if (irq < NR_IRQS_LEGACY) make_8259A_irq(irq); - else { - desc = irq_to_desc(irq); + else /* Strange. Oh, well.. */ desc->chip = &no_irq_chip; - } } } } @@ -2467,7 +2746,7 @@ static void unmask_lapic_irq(unsigned int irq) apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED); } -static void ack_lapic_irq (unsigned int irq) +static void ack_lapic_irq(unsigned int irq) { ack_APIC_irq(); } @@ -2479,11 +2758,8 @@ static struct irq_chip lapic_chip __read_mostly = { .ack = ack_lapic_irq, }; -static void lapic_register_intr(int irq) +static void lapic_register_intr(int irq, struct irq_desc *desc) { - struct irq_desc *desc; - - desc = irq_to_desc(irq); desc->status &= ~IRQ_LEVEL; set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq, "edge"); @@ -2587,7 +2863,9 @@ int timer_through_8259 __initdata; */ static inline void __init check_timer(void) { - struct irq_cfg *cfg = irq_cfg(0); + struct irq_desc *desc = irq_to_desc(0); + struct irq_cfg *cfg = desc->chip_data; + int cpu = boot_cpu_id; int apic1, pin1, apic2, pin2; unsigned long flags; unsigned int ver; @@ -2602,7 +2880,7 @@ static inline void __init check_timer(void) * get/set the timer IRQ vector: */ disable_8259A_irq(0); - assign_irq_vector(0, TARGET_CPUS); + assign_irq_vector(0, cfg, TARGET_CPUS); /* * As IRQ0 is to be enabled in the 8259A, the virtual @@ -2653,10 +2931,10 @@ static inline void __init check_timer(void) * Ok, does IRQ0 through the IOAPIC work? */ if (no_pin1) { - add_pin_to_irq(0, apic1, pin1); + add_pin_to_irq_cpu(cfg, cpu, apic1, pin1); setup_timer_IRQ0_pin(apic1, pin1, cfg->vector); } - unmask_IO_APIC_irq(0); + unmask_IO_APIC_irq_desc(desc); if (timer_irq_works()) { if (nmi_watchdog == NMI_IO_APIC) { setup_nmi(); @@ -2682,9 +2960,9 @@ static inline void __init check_timer(void) /* * legacy devices should be connected to IO APIC #0 */ - replace_pin_at_irq(0, apic1, pin1, apic2, pin2); + replace_pin_at_irq_cpu(cfg, cpu, apic1, pin1, apic2, pin2); setup_timer_IRQ0_pin(apic2, pin2, cfg->vector); - unmask_IO_APIC_irq(0); + unmask_IO_APIC_irq_desc(desc); enable_8259A_irq(0); if (timer_irq_works()) { apic_printk(APIC_QUIET, KERN_INFO "....... works.\n"); @@ -2716,7 +2994,7 @@ static inline void __init check_timer(void) apic_printk(APIC_QUIET, KERN_INFO "...trying to set up timer as Virtual Wire IRQ...\n"); - lapic_register_intr(0); + lapic_register_intr(0, desc); apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */ enable_8259A_irq(0); @@ -2901,22 +3179,26 @@ unsigned int create_irq_nr(unsigned int irq_want) unsigned int irq; unsigned int new; unsigned long flags; - struct irq_cfg *cfg_new; - - irq_want = nr_irqs - 1; + struct irq_cfg *cfg_new = NULL; + int cpu = boot_cpu_id; + struct irq_desc *desc_new = NULL; irq = 0; spin_lock_irqsave(&vector_lock, flags); - for (new = irq_want; new > 0; new--) { + for (new = irq_want; new < NR_IRQS; new++) { if (platform_legacy_irq(new)) continue; - cfg_new = irq_cfg(new); - if (cfg_new && cfg_new->vector != 0) + + desc_new = irq_to_desc_alloc_cpu(new, cpu); + if (!desc_new) { + printk(KERN_INFO "can not get irq_desc for %d\n", new); continue; - /* check if need to create one */ - if (!cfg_new) - cfg_new = irq_cfg_alloc(new); - if (__assign_irq_vector(new, TARGET_CPUS) == 0) + } + cfg_new = desc_new->chip_data; + + if (cfg_new->vector != 0) + continue; + if (__assign_irq_vector(new, cfg_new, TARGET_CPUS) == 0) irq = new; break; } @@ -2924,15 +3206,21 @@ unsigned int create_irq_nr(unsigned int irq_want) if (irq > 0) { dynamic_irq_init(irq); + /* restore it, in case dynamic_irq_init clear it */ + if (desc_new) + desc_new->chip_data = cfg_new; } return irq; } +static int nr_irqs_gsi = NR_IRQS_LEGACY; int create_irq(void) { + unsigned int irq_want; int irq; - irq = create_irq_nr(nr_irqs - 1); + irq_want = nr_irqs_gsi; + irq = create_irq_nr(irq_want); if (irq == 0) irq = -1; @@ -2943,14 +3231,22 @@ int create_irq(void) void destroy_irq(unsigned int irq) { unsigned long flags; + struct irq_cfg *cfg; + struct irq_desc *desc; + /* store it, in case dynamic_irq_cleanup clear it */ + desc = irq_to_desc(irq); + cfg = desc->chip_data; dynamic_irq_cleanup(irq); + /* connect back irq_cfg */ + if (desc) + desc->chip_data = cfg; #ifdef CONFIG_INTR_REMAP free_irte(irq); #endif spin_lock_irqsave(&vector_lock, flags); - __clear_irq_vector(irq); + __clear_irq_vector(irq, cfg); spin_unlock_irqrestore(&vector_lock, flags); } @@ -2963,16 +3259,13 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms struct irq_cfg *cfg; int err; unsigned dest; - cpumask_t tmp; - tmp = TARGET_CPUS; - err = assign_irq_vector(irq, tmp); + cfg = irq_cfg(irq); + err = assign_irq_vector(irq, cfg, TARGET_CPUS); if (err) return err; - cfg = irq_cfg(irq); - cpus_and(tmp, cfg->domain, tmp); - dest = cpu_mask_to_apicid(tmp); + dest = cpu_mask_to_apicid_and(cfg->domain, TARGET_CPUS); #ifdef CONFIG_INTR_REMAP if (irq_remapped(irq)) { @@ -3026,64 +3319,48 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms } #ifdef CONFIG_SMP -static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask) +static void set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask) { + struct irq_desc *desc = irq_to_desc(irq); struct irq_cfg *cfg; struct msi_msg msg; unsigned int dest; - cpumask_t tmp; - struct irq_desc *desc; - cpus_and(tmp, mask, cpu_online_map); - if (cpus_empty(tmp)) + dest = set_desc_affinity(desc, mask); + if (dest == BAD_APICID) return; - if (assign_irq_vector(irq, mask)) - return; + cfg = desc->chip_data; - cfg = irq_cfg(irq); - cpus_and(tmp, cfg->domain, mask); - dest = cpu_mask_to_apicid(tmp); - - read_msi_msg(irq, &msg); + read_msi_msg_desc(desc, &msg); msg.data &= ~MSI_DATA_VECTOR_MASK; msg.data |= MSI_DATA_VECTOR(cfg->vector); msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; msg.address_lo |= MSI_ADDR_DEST_ID(dest); - write_msi_msg(irq, &msg); - desc = irq_to_desc(irq); - desc->affinity = mask; + write_msi_msg_desc(desc, &msg); } - #ifdef CONFIG_INTR_REMAP /* * Migrate the MSI irq to another cpumask. This migration is * done in the process context using interrupt-remapping hardware. */ -static void ir_set_msi_irq_affinity(unsigned int irq, cpumask_t mask) +static void +ir_set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask) { - struct irq_cfg *cfg; + struct irq_desc *desc = irq_to_desc(irq); + struct irq_cfg *cfg = desc->chip_data; unsigned int dest; - cpumask_t tmp, cleanup_mask; struct irte irte; - struct irq_desc *desc; - - cpus_and(tmp, mask, cpu_online_map); - if (cpus_empty(tmp)) - return; if (get_irte(irq, &irte)) return; - if (assign_irq_vector(irq, mask)) + dest = set_desc_affinity(desc, mask); + if (dest == BAD_APICID) return; - cfg = irq_cfg(irq); - cpus_and(tmp, cfg->domain, mask); - dest = cpu_mask_to_apicid(tmp); - irte.vector = cfg->vector; irte.dest_id = IRTE_DEST(dest); @@ -3097,16 +3374,10 @@ static void ir_set_msi_irq_affinity(unsigned int irq, cpumask_t mask) * at the new destination. So, time to cleanup the previous * vector allocation. */ - if (cfg->move_in_progress) { - cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map); - cfg->move_cleanup_count = cpus_weight(cleanup_mask); - send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); - cfg->move_in_progress = 0; - } - - desc = irq_to_desc(irq); - desc->affinity = mask; + if (cfg->move_in_progress) + send_cleanup_vector(cfg); } + #endif #endif /* CONFIG_SMP */ @@ -3165,7 +3436,7 @@ static int msi_alloc_irte(struct pci_dev *dev, int irq, int nvec) } #endif -static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc, int irq) +static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq) { int ret; struct msi_msg msg; @@ -3174,7 +3445,7 @@ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc, int irq) if (ret < 0) return ret; - set_irq_msi(irq, desc); + set_irq_msi(irq, msidesc); write_msi_msg(irq, &msg); #ifdef CONFIG_INTR_REMAP @@ -3194,26 +3465,13 @@ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc, int irq) return 0; } -static unsigned int build_irq_for_pci_dev(struct pci_dev *dev) -{ - unsigned int irq; - - irq = dev->bus->number; - irq <<= 8; - irq |= dev->devfn; - irq <<= 12; - - return irq; -} - -int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc) +int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc) { unsigned int irq; int ret; unsigned int irq_want; - irq_want = build_irq_for_pci_dev(dev) + 0x100; - + irq_want = nr_irqs_gsi; irq = create_irq_nr(irq_want); if (irq == 0) return -1; @@ -3227,7 +3485,7 @@ int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc) goto error; no_ir: #endif - ret = setup_msi_irq(dev, desc, irq); + ret = setup_msi_irq(dev, msidesc, irq); if (ret < 0) { destroy_irq(irq); return ret; @@ -3245,7 +3503,7 @@ int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) { unsigned int irq; int ret, sub_handle; - struct msi_desc *desc; + struct msi_desc *msidesc; unsigned int irq_want; #ifdef CONFIG_INTR_REMAP @@ -3253,10 +3511,11 @@ int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) int index = 0; #endif - irq_want = build_irq_for_pci_dev(dev) + 0x100; + irq_want = nr_irqs_gsi; sub_handle = 0; - list_for_each_entry(desc, &dev->msi_list, list) { - irq = create_irq_nr(irq_want--); + list_for_each_entry(msidesc, &dev->msi_list, list) { + irq = create_irq_nr(irq_want); + irq_want++; if (irq == 0) return -1; #ifdef CONFIG_INTR_REMAP @@ -3288,7 +3547,7 @@ int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) } no_ir: #endif - ret = setup_msi_irq(dev, desc, irq); + ret = setup_msi_irq(dev, msidesc, irq); if (ret < 0) goto error; sub_handle++; @@ -3307,24 +3566,18 @@ void arch_teardown_msi_irq(unsigned int irq) #ifdef CONFIG_DMAR #ifdef CONFIG_SMP -static void dmar_msi_set_affinity(unsigned int irq, cpumask_t mask) +static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask) { + struct irq_desc *desc = irq_to_desc(irq); struct irq_cfg *cfg; struct msi_msg msg; unsigned int dest; - cpumask_t tmp; - struct irq_desc *desc; - cpus_and(tmp, mask, cpu_online_map); - if (cpus_empty(tmp)) + dest = set_desc_affinity(desc, mask); + if (dest == BAD_APICID) return; - if (assign_irq_vector(irq, mask)) - return; - - cfg = irq_cfg(irq); - cpus_and(tmp, cfg->domain, mask); - dest = cpu_mask_to_apicid(tmp); + cfg = desc->chip_data; dmar_msi_read(irq, &msg); @@ -3334,9 +3587,8 @@ static void dmar_msi_set_affinity(unsigned int irq, cpumask_t mask) msg.address_lo |= MSI_ADDR_DEST_ID(dest); dmar_msi_write(irq, &msg); - desc = irq_to_desc(irq); - desc->affinity = mask; } + #endif /* CONFIG_SMP */ struct irq_chip dmar_msi_type = { @@ -3368,24 +3620,18 @@ int arch_setup_dmar_msi(unsigned int irq) #ifdef CONFIG_HPET_TIMER #ifdef CONFIG_SMP -static void hpet_msi_set_affinity(unsigned int irq, cpumask_t mask) +static void hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask) { + struct irq_desc *desc = irq_to_desc(irq); struct irq_cfg *cfg; - struct irq_desc *desc; struct msi_msg msg; unsigned int dest; - cpumask_t tmp; - cpus_and(tmp, mask, cpu_online_map); - if (cpus_empty(tmp)) + dest = set_desc_affinity(desc, mask); + if (dest == BAD_APICID) return; - if (assign_irq_vector(irq, mask)) - return; - - cfg = irq_cfg(irq); - cpus_and(tmp, cfg->domain, mask); - dest = cpu_mask_to_apicid(tmp); + cfg = desc->chip_data; hpet_msi_read(irq, &msg); @@ -3395,9 +3641,8 @@ static void hpet_msi_set_affinity(unsigned int irq, cpumask_t mask) msg.address_lo |= MSI_ADDR_DEST_ID(dest); hpet_msi_write(irq, &msg); - desc = irq_to_desc(irq); - desc->affinity = mask; } + #endif /* CONFIG_SMP */ struct irq_chip hpet_msi_type = { @@ -3450,28 +3695,21 @@ static void target_ht_irq(unsigned int irq, unsigned int dest, u8 vector) write_ht_irq_msg(irq, &msg); } -static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask) +static void set_ht_irq_affinity(unsigned int irq, const struct cpumask *mask) { + struct irq_desc *desc = irq_to_desc(irq); struct irq_cfg *cfg; unsigned int dest; - cpumask_t tmp; - struct irq_desc *desc; - cpus_and(tmp, mask, cpu_online_map); - if (cpus_empty(tmp)) + dest = set_desc_affinity(desc, mask); + if (dest == BAD_APICID) return; - if (assign_irq_vector(irq, mask)) - return; - - cfg = irq_cfg(irq); - cpus_and(tmp, cfg->domain, mask); - dest = cpu_mask_to_apicid(tmp); + cfg = desc->chip_data; target_ht_irq(irq, dest, cfg->vector); - desc = irq_to_desc(irq); - desc->affinity = mask; } + #endif static struct irq_chip ht_irq_chip = { @@ -3489,17 +3727,14 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) { struct irq_cfg *cfg; int err; - cpumask_t tmp; - tmp = TARGET_CPUS; - err = assign_irq_vector(irq, tmp); + cfg = irq_cfg(irq); + err = assign_irq_vector(irq, cfg, TARGET_CPUS); if (!err) { struct ht_irq_msg msg; unsigned dest; - cfg = irq_cfg(irq); - cpus_and(tmp, cfg->domain, tmp); - dest = cpu_mask_to_apicid(tmp); + dest = cpu_mask_to_apicid_and(cfg->domain, TARGET_CPUS); msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest); @@ -3535,7 +3770,7 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade, unsigned long mmr_offset) { - const cpumask_t *eligible_cpu = get_cpu_mask(cpu); + const struct cpumask *eligible_cpu = cpumask_of(cpu); struct irq_cfg *cfg; int mmr_pnode; unsigned long mmr_value; @@ -3543,7 +3778,9 @@ int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade, unsigned long flags; int err; - err = assign_irq_vector(irq, *eligible_cpu); + cfg = irq_cfg(irq); + + err = assign_irq_vector(irq, cfg, eligible_cpu); if (err != 0) return err; @@ -3552,8 +3789,6 @@ int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade, irq_name); spin_unlock_irqrestore(&vector_lock, flags); - cfg = irq_cfg(irq); - mmr_value = 0; entry = (struct uv_IO_APIC_route_entry *)&mmr_value; BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long)); @@ -3564,7 +3799,7 @@ int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade, entry->polarity = 0; entry->trigger = 0; entry->mask = 0; - entry->dest = cpu_mask_to_apicid(*eligible_cpu); + entry->dest = cpu_mask_to_apicid(eligible_cpu); mmr_pnode = uv_blade_to_pnode(mmr_blade); uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); @@ -3605,9 +3840,16 @@ int __init io_apic_get_redir_entries (int ioapic) return reg_01.bits.entries; } -int __init probe_nr_irqs(void) +void __init probe_nr_irqs_gsi(void) { - return NR_IRQS; + int idx; + int nr = 0; + + for (idx = 0; idx < nr_ioapics; idx++) + nr += io_apic_get_redir_entries(idx) + 1; + + if (nr > nr_irqs_gsi) + nr_irqs_gsi = nr; } /* -------------------------------------------------------------------------- @@ -3706,19 +3948,31 @@ int __init io_apic_get_version(int ioapic) int io_apic_set_pci_routing (int ioapic, int pin, int irq, int triggering, int polarity) { + struct irq_desc *desc; + struct irq_cfg *cfg; + int cpu = boot_cpu_id; + if (!IO_APIC_IRQ(irq)) { apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n", ioapic); return -EINVAL; } + desc = irq_to_desc_alloc_cpu(irq, cpu); + if (!desc) { + printk(KERN_INFO "can not get irq_desc %d\n", irq); + return 0; + } + /* * IRQs < 16 are already in the irq_2_pin[] map */ - if (irq >= 16) - add_pin_to_irq(irq, ioapic, pin); + if (irq >= NR_IRQS_LEGACY) { + cfg = desc->chip_data; + add_pin_to_irq_cpu(cfg, cpu, ioapic, pin); + } - setup_IO_APIC_irq(ioapic, pin, irq, triggering, polarity); + setup_IO_APIC_irq(ioapic, pin, irq, desc, triggering, polarity); return 0; } @@ -3756,7 +4010,7 @@ void __init setup_ioapic_dest(void) int pin, ioapic, irq, irq_entry; struct irq_desc *desc; struct irq_cfg *cfg; - cpumask_t mask; + const struct cpumask *mask; if (skip_ioapic_setup == 1) return; @@ -3772,9 +4026,10 @@ void __init setup_ioapic_dest(void) * when you have too many devices, because at that time only boot * cpu is online. */ - cfg = irq_cfg(irq); + desc = irq_to_desc(irq); + cfg = desc->chip_data; if (!cfg->vector) { - setup_IO_APIC_irq(ioapic, pin, irq, + setup_IO_APIC_irq(ioapic, pin, irq, desc, irq_trigger(irq_entry), irq_polarity(irq_entry)); continue; @@ -3784,19 +4039,18 @@ void __init setup_ioapic_dest(void) /* * Honour affinities which have been set in early boot */ - desc = irq_to_desc(irq); if (desc->status & (IRQ_NO_BALANCING | IRQ_AFFINITY_SET)) - mask = desc->affinity; + mask = &desc->affinity; else mask = TARGET_CPUS; #ifdef CONFIG_INTR_REMAP if (intr_remapping_enabled) - set_ir_ioapic_affinity_irq(irq, mask); + set_ir_ioapic_affinity_irq_desc(desc, mask); else #endif - set_ioapic_affinity_irq(irq, mask); + set_ioapic_affinity_irq_desc(desc, mask); } } @@ -3845,7 +4099,6 @@ void __init ioapic_init_mappings(void) struct resource *ioapic_res; int i; - irq_2_pin_init(); ioapic_res = ioapic_setup_resources(); for (i = 0; i < nr_ioapics; i++) { if (smp_found_config) { diff --git a/arch/x86/kernel/ipi.c b/arch/x86/kernel/ipi.c index f1c688e46f3..285bbf8831f 100644 --- a/arch/x86/kernel/ipi.c +++ b/arch/x86/kernel/ipi.c @@ -116,18 +116,18 @@ static inline void __send_IPI_dest_field(unsigned long mask, int vector) /* * This is only used on smaller machines. */ -void send_IPI_mask_bitmask(cpumask_t cpumask, int vector) +void send_IPI_mask_bitmask(const struct cpumask *cpumask, int vector) { - unsigned long mask = cpus_addr(cpumask)[0]; + unsigned long mask = cpumask_bits(cpumask)[0]; unsigned long flags; local_irq_save(flags); - WARN_ON(mask & ~cpus_addr(cpu_online_map)[0]); + WARN_ON(mask & ~cpumask_bits(cpu_online_mask)[0]); __send_IPI_dest_field(mask, vector); local_irq_restore(flags); } -void send_IPI_mask_sequence(cpumask_t mask, int vector) +void send_IPI_mask_sequence(const struct cpumask *mask, int vector) { unsigned long flags; unsigned int query_cpu; @@ -139,12 +139,24 @@ void send_IPI_mask_sequence(cpumask_t mask, int vector) */ local_irq_save(flags); - for_each_possible_cpu(query_cpu) { - if (cpu_isset(query_cpu, mask)) { + for_each_cpu(query_cpu, mask) + __send_IPI_dest_field(cpu_to_logical_apicid(query_cpu), vector); + local_irq_restore(flags); +} + +void send_IPI_mask_allbutself(const struct cpumask *mask, int vector) +{ + unsigned long flags; + unsigned int query_cpu; + unsigned int this_cpu = smp_processor_id(); + + /* See Hack comment above */ + + local_irq_save(flags); + for_each_cpu(query_cpu, mask) + if (query_cpu != this_cpu) __send_IPI_dest_field(cpu_to_logical_apicid(query_cpu), vector); - } - } local_irq_restore(flags); } diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index d1d4dc52f64..bce53e1352a 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -9,6 +9,7 @@ #include <asm/apic.h> #include <asm/io_apic.h> #include <asm/smp.h> +#include <asm/irq.h> atomic_t irq_err_count; @@ -118,6 +119,9 @@ int show_interrupts(struct seq_file *p, void *v) } desc = irq_to_desc(i); + if (!desc) + return 0; + spin_lock_irqsave(&desc->lock, flags); #ifndef CONFIG_SMP any_count = kstat_irqs(i); @@ -187,3 +191,5 @@ u64 arch_irq_stat(void) #endif return sum; } + +EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq); diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index a51382672de..9dc5588f336 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -233,25 +233,28 @@ unsigned int do_IRQ(struct pt_regs *regs) #ifdef CONFIG_HOTPLUG_CPU #include <mach_apic.h> -void fixup_irqs(cpumask_t map) +/* A cpu has been removed from cpu_online_mask. Reset irq affinities. */ +void fixup_irqs(void) { unsigned int irq; static int warned; struct irq_desc *desc; for_each_irq_desc(irq, desc) { - cpumask_t mask; + const struct cpumask *affinity; + if (!desc) + continue; if (irq == 2) continue; - cpus_and(mask, desc->affinity, map); - if (any_online_cpu(mask) == NR_CPUS) { + affinity = &desc->affinity; + if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) { printk("Breaking affinity for irq %i\n", irq); - mask = map; + affinity = cpu_all_mask; } if (desc->chip->set_affinity) - desc->chip->set_affinity(irq, mask); + desc->chip->set_affinity(irq, affinity); else if (desc->action && !(warned++)) printk("Cannot set affinity for irq %i\n", irq); } diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index 1df869e5bd0..6383d50f82e 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -80,40 +80,43 @@ asmlinkage unsigned int __irq_entry do_IRQ(struct pt_regs *regs) } #ifdef CONFIG_HOTPLUG_CPU -void fixup_irqs(cpumask_t map) +/* A cpu has been removed from cpu_online_mask. Reset irq affinities. */ +void fixup_irqs(void) { unsigned int irq; static int warned; struct irq_desc *desc; for_each_irq_desc(irq, desc) { - cpumask_t mask; int break_affinity = 0; int set_affinity = 1; + const struct cpumask *affinity; + if (!desc) + continue; if (irq == 2) continue; /* interrupt's are disabled at this point */ spin_lock(&desc->lock); + affinity = &desc->affinity; if (!irq_has_action(irq) || - cpus_equal(desc->affinity, map)) { + cpumask_equal(affinity, cpu_online_mask)) { spin_unlock(&desc->lock); continue; } - cpus_and(mask, desc->affinity, map); - if (cpus_empty(mask)) { + if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) { break_affinity = 1; - mask = map; + affinity = cpu_all_mask; } if (desc->chip->mask) desc->chip->mask(irq); if (desc->chip->set_affinity) - desc->chip->set_affinity(irq, mask); + desc->chip->set_affinity(irq, affinity); else if (!(warned++)) set_affinity = 0; diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c index 607db63044a..84723295f88 100644 --- a/arch/x86/kernel/irqinit_32.c +++ b/arch/x86/kernel/irqinit_32.c @@ -68,8 +68,7 @@ void __init init_ISA_irqs (void) /* * 16 old-style INTA-cycle interrupts: */ - for (i = 0; i < 16; i++) { - /* first time call this irq_desc */ + for (i = 0; i < NR_IRQS_LEGACY; i++) { struct irq_desc *desc = irq_to_desc(i); desc->status = IRQ_DISABLED; @@ -111,6 +110,18 @@ DEFINE_PER_CPU(vector_irq_t, vector_irq) = { [IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1 }; +int vector_used_by_percpu_irq(unsigned int vector) +{ + int cpu; + + for_each_online_cpu(cpu) { + if (per_cpu(vector_irq, cpu)[vector] != -1) + return 1; + } + + return 0; +} + /* Overridden in paravirt.c */ void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); @@ -147,10 +158,12 @@ void __init native_init_IRQ(void) alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); /* IPI for single call function */ - set_intr_gate(CALL_FUNCTION_SINGLE_VECTOR, call_function_single_interrupt); + alloc_intr_gate(CALL_FUNCTION_SINGLE_VECTOR, + call_function_single_interrupt); /* Low priority IPI to cleanup after moving an irq */ set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt); + set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors); #endif #ifdef CONFIG_X86_LOCAL_APIC diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c index 8670b3ce626..31ebfe38e96 100644 --- a/arch/x86/kernel/irqinit_64.c +++ b/arch/x86/kernel/irqinit_64.c @@ -69,6 +69,18 @@ DEFINE_PER_CPU(vector_irq_t, vector_irq) = { [IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1 }; +int vector_used_by_percpu_irq(unsigned int vector) +{ + int cpu; + + for_each_online_cpu(cpu) { + if (per_cpu(vector_irq, cpu)[vector] != -1) + return 1; + } + + return 0; +} + void __init init_ISA_irqs(void) { int i; @@ -76,8 +88,7 @@ void __init init_ISA_irqs(void) init_bsp_APIC(); init_8259A(0); - for (i = 0; i < 16; i++) { - /* first time call this irq_desc */ + for (i = 0; i < NR_IRQS_LEGACY; i++) { struct irq_desc *desc = irq_to_desc(i); desc->status = IRQ_DISABLED; @@ -122,6 +133,7 @@ static void __init smp_intr_init(void) /* Low priority IPI to cleanup after moving an irq */ set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt); + set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors); #endif } diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index e169ae9b6a6..652fce6d2cc 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -89,17 +89,17 @@ static cycle_t kvm_clock_read(void) */ static unsigned long kvm_get_tsc_khz(void) { - return preset_lpj; + struct pvclock_vcpu_time_info *src; + src = &per_cpu(hv_clock, 0); + return pvclock_tsc_khz(src); } static void kvm_get_preset_lpj(void) { - struct pvclock_vcpu_time_info *src; unsigned long khz; u64 lpj; - src = &per_cpu(hv_clock, 0); - khz = pvclock_tsc_khz(src); + khz = kvm_get_tsc_khz(); lpj = ((u64)khz * 1000); do_div(lpj, HZ); @@ -194,5 +194,7 @@ void __init kvmclock_init(void) #endif kvm_get_preset_lpj(); clocksource_register(&kvm_clock); + pv_info.paravirt_enabled = 1; + pv_info.name = "KVM"; } } diff --git a/arch/x86/kernel/mfgpt_32.c b/arch/x86/kernel/mfgpt_32.c index 3b599518c32..c12314c9e86 100644 --- a/arch/x86/kernel/mfgpt_32.c +++ b/arch/x86/kernel/mfgpt_32.c @@ -287,7 +287,7 @@ static struct clock_event_device mfgpt_clockevent = { .set_mode = mfgpt_set_mode, .set_next_event = mfgpt_next_event, .rating = 250, - .cpumask = CPU_MASK_ALL, + .cpumask = cpu_all_mask, .shift = 32 }; diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 7a3dfceb90e..19a1044a0cd 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -101,11 +101,15 @@ static void __init dma32_free_bootmem(void) dma32_bootmem_ptr = NULL; dma32_bootmem_size = 0; } +#endif void __init pci_iommu_alloc(void) { +#ifdef CONFIG_X86_64 /* free the range so iommu could get some range less than 4G */ dma32_free_bootmem(); +#endif + /* * The order of these functions is important for * fall-back/fail-over reasons @@ -121,15 +125,6 @@ void __init pci_iommu_alloc(void) pci_swiotlb_init(); } -unsigned long iommu_nr_pages(unsigned long addr, unsigned long len) -{ - unsigned long size = roundup((addr & ~PAGE_MASK) + len, PAGE_SIZE); - - return size >> PAGE_SHIFT; -} -EXPORT_SYMBOL(iommu_nr_pages); -#endif - void *dma_generic_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr, gfp_t flag) { diff --git a/arch/x86/kernel/pci-swiotlb_64.c b/arch/x86/kernel/pci-swiotlb_64.c index 3c539d111ab..242c3440687 100644 --- a/arch/x86/kernel/pci-swiotlb_64.c +++ b/arch/x86/kernel/pci-swiotlb_64.c @@ -3,6 +3,8 @@ #include <linux/pci.h> #include <linux/cache.h> #include <linux/module.h> +#include <linux/swiotlb.h> +#include <linux/bootmem.h> #include <linux/dma-mapping.h> #include <asm/iommu.h> @@ -11,6 +13,31 @@ int swiotlb __read_mostly; +void *swiotlb_alloc_boot(size_t size, unsigned long nslabs) +{ + return alloc_bootmem_low_pages(size); +} + +void *swiotlb_alloc(unsigned order, unsigned long nslabs) +{ + return (void *)__get_free_pages(GFP_DMA | __GFP_NOWARN, order); +} + +dma_addr_t swiotlb_phys_to_bus(phys_addr_t paddr) +{ + return paddr; +} + +phys_addr_t swiotlb_bus_to_phys(dma_addr_t baddr) +{ + return baddr; +} + +int __weak swiotlb_arch_range_needs_mapping(void *ptr, size_t size) +{ + return 0; +} + static dma_addr_t swiotlb_map_single_phys(struct device *hwdev, phys_addr_t paddr, size_t size, int direction) @@ -50,8 +77,10 @@ struct dma_mapping_ops swiotlb_dma_ops = { void __init pci_swiotlb_init(void) { /* don't initialize swiotlb if iommu=off (no_iommu=1) */ +#ifdef CONFIG_X86_64 if (!iommu_detected && !no_iommu && max_pfn > MAX_DMA32_PFN) swiotlb = 1; +#endif if (swiotlb_force) swiotlb = 1; if (swiotlb) { diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c index 67465ed8931..309949e9e1c 100644 --- a/arch/x86/kernel/quirks.c +++ b/arch/x86/kernel/quirks.c @@ -168,6 +168,8 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH7_31, ich_force_enable_hpet); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH8_1, ich_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH8_4, + ich_force_enable_hpet); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH9_7, ich_force_enable_hpet); diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index a90913cccfb..bf088c61fa4 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -13,6 +13,7 @@ #include <asm/reboot_fixups.h> #include <asm/reboot.h> #include <asm/pci_x86.h> +#include <asm/virtext.h> #ifdef CONFIG_X86_32 # include <linux/dmi.h> @@ -39,6 +40,12 @@ int reboot_force; static int reboot_cpu = -1; #endif +/* This is set if we need to go through the 'emergency' path. + * When machine_emergency_restart() is called, we may be on + * an inconsistent state and won't be able to do a clean cleanup + */ +static int reboot_emergency; + /* This is set by the PCI code if either type 1 or type 2 PCI is detected */ bool port_cf9_safe = false; @@ -368,6 +375,48 @@ static inline void kb_wait(void) } } +static void vmxoff_nmi(int cpu, struct die_args *args) +{ + cpu_emergency_vmxoff(); +} + +/* Use NMIs as IPIs to tell all CPUs to disable virtualization + */ +static void emergency_vmx_disable_all(void) +{ + /* Just make sure we won't change CPUs while doing this */ + local_irq_disable(); + + /* We need to disable VMX on all CPUs before rebooting, otherwise + * we risk hanging up the machine, because the CPU ignore INIT + * signals when VMX is enabled. + * + * We can't take any locks and we may be on an inconsistent + * state, so we use NMIs as IPIs to tell the other CPUs to disable + * VMX and halt. + * + * For safety, we will avoid running the nmi_shootdown_cpus() + * stuff unnecessarily, but we don't have a way to check + * if other CPUs have VMX enabled. So we will call it only if the + * CPU we are running on has VMX enabled. + * + * We will miss cases where VMX is not enabled on all CPUs. This + * shouldn't do much harm because KVM always enable VMX on all + * CPUs anyway. But we can miss it on the small window where KVM + * is still enabling VMX. + */ + if (cpu_has_vmx() && cpu_vmx_enabled()) { + /* Disable VMX on this CPU. + */ + cpu_vmxoff(); + + /* Halt and disable VMX on the other CPUs */ + nmi_shootdown_cpus(vmxoff_nmi); + + } +} + + void __attribute__((weak)) mach_reboot_fixups(void) { } @@ -376,6 +425,9 @@ static void native_machine_emergency_restart(void) { int i; + if (reboot_emergency) + emergency_vmx_disable_all(); + /* Tell the BIOS if we want cold or warm reboot */ *((unsigned short *)__va(0x472)) = reboot_mode; @@ -482,13 +534,19 @@ void native_machine_shutdown(void) #endif } +static void __machine_emergency_restart(int emergency) +{ + reboot_emergency = emergency; + machine_ops.emergency_restart(); +} + static void native_machine_restart(char *__unused) { printk("machine restart\n"); if (!reboot_force) machine_shutdown(); - machine_emergency_restart(); + __machine_emergency_restart(0); } static void native_machine_halt(void) @@ -532,7 +590,7 @@ void machine_shutdown(void) void machine_emergency_restart(void) { - machine_ops.emergency_restart(); + __machine_emergency_restart(1); } void machine_restart(char *cmd) @@ -592,10 +650,7 @@ static int crash_nmi_callback(struct notifier_block *self, static void smp_send_nmi_allbutself(void) { - cpumask_t mask = cpu_online_map; - cpu_clear(safe_smp_processor_id(), mask); - if (!cpus_empty(mask)) - send_IPI_mask(mask, NMI_VECTOR); + send_IPI_allbutself(NMI_VECTOR); } static struct notifier_block crash_nmi_nb = { diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 08e02e8453c..ae0d8042cf6 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -953,7 +953,7 @@ void __init setup_arch(char **cmdline_p) ioapic_init_mappings(); /* need to wait for io_apic is mapped */ - nr_irqs = probe_nr_irqs(); + probe_nr_irqs_gsi(); kvm_guest_init(); diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index ae0c0d3bb77..0b63b08e753 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -152,6 +152,11 @@ void __init setup_per_cpu_areas(void) old_size = PERCPU_ENOUGH_ROOM; align = max_t(unsigned long, PAGE_SIZE, align); size = roundup(old_size, align); + + printk(KERN_INFO + "NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n", + NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids); + printk(KERN_INFO "PERCPU: Allocating %zd bytes of per cpu data\n", size); @@ -168,24 +173,24 @@ void __init setup_per_cpu_areas(void) "cpu %d has no node %d or node-local memory\n", cpu, node); if (ptr) - printk(KERN_DEBUG "per cpu data for cpu%d at %016lx\n", + printk(KERN_DEBUG + "per cpu data for cpu%d at %016lx\n", cpu, __pa(ptr)); } else { ptr = __alloc_bootmem_node(NODE_DATA(node), size, align, __pa(MAX_DMA_ADDRESS)); if (ptr) - printk(KERN_DEBUG "per cpu data for cpu%d on node%d at %016lx\n", - cpu, node, __pa(ptr)); + printk(KERN_DEBUG + "per cpu data for cpu%d on node%d " + "at %016lx\n", + cpu, node, __pa(ptr)); } #endif per_cpu_offset(cpu) = ptr - __per_cpu_start; memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start); } - printk(KERN_DEBUG "NR_CPUS: %d, nr_cpu_ids: %d, nr_node_ids %d\n", - NR_CPUS, nr_cpu_ids, nr_node_ids); - /* Setup percpu data maps */ setup_per_cpu_maps(); @@ -282,7 +287,7 @@ static void __cpuinit numa_set_cpumask(int cpu, int enable) else cpu_clear(cpu, *mask); - cpulist_scnprintf(buf, sizeof(buf), *mask); + cpulist_scnprintf(buf, sizeof(buf), mask); printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n", enable? "numa_add_cpu":"numa_remove_cpu", cpu, node, buf); } diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index 7e558db362c..beea2649a24 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -118,22 +118,22 @@ static void native_smp_send_reschedule(int cpu) WARN_ON(1); return; } - send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR); + send_IPI_mask(cpumask_of(cpu), RESCHEDULE_VECTOR); } void native_send_call_func_single_ipi(int cpu) { - send_IPI_mask(cpumask_of_cpu(cpu), CALL_FUNCTION_SINGLE_VECTOR); + send_IPI_mask(cpumask_of(cpu), CALL_FUNCTION_SINGLE_VECTOR); } -void native_send_call_func_ipi(cpumask_t mask) +void native_send_call_func_ipi(const struct cpumask *mask) { cpumask_t allbutself; allbutself = cpu_online_map; cpu_clear(smp_processor_id(), allbutself); - if (cpus_equal(mask, allbutself) && + if (cpus_equal(*mask, allbutself) && cpus_equal(cpu_online_map, cpu_callout_map)) send_IPI_allbutself(CALL_FUNCTION_VECTOR); else diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index f8500c96944..31869bf5fab 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -102,14 +102,8 @@ EXPORT_SYMBOL(smp_num_siblings); /* Last level cache ID of each logical CPU */ DEFINE_PER_CPU(u16, cpu_llc_id) = BAD_APICID; -/* bitmap of online cpus */ -cpumask_t cpu_online_map __read_mostly; -EXPORT_SYMBOL(cpu_online_map); - cpumask_t cpu_callin_map; cpumask_t cpu_callout_map; -cpumask_t cpu_possible_map; -EXPORT_SYMBOL(cpu_possible_map); /* representing HT siblings of each logical CPU */ DEFINE_PER_CPU(cpumask_t, cpu_sibling_map); @@ -1260,6 +1254,15 @@ void __init native_smp_cpus_done(unsigned int max_cpus) check_nmi_watchdog(); } +static int __initdata setup_possible_cpus = -1; +static int __init _setup_possible_cpus(char *str) +{ + get_option(&str, &setup_possible_cpus); + return 0; +} +early_param("possible_cpus", _setup_possible_cpus); + + /* * cpu_possible_map should be static, it cannot change as cpu's * are onlined, or offlined. The reason is per-cpu data-structures @@ -1272,7 +1275,7 @@ void __init native_smp_cpus_done(unsigned int max_cpus) * * Three ways to find out the number of additional hotplug CPUs: * - If the BIOS specified disabled CPUs in ACPI/mptables use that. - * - The user can overwrite it with additional_cpus=NUM + * - The user can overwrite it with possible_cpus=NUM * - Otherwise don't reserve additional CPUs. * We do this because additional CPUs waste a lot of memory. * -AK @@ -1285,9 +1288,17 @@ __init void prefill_possible_map(void) if (!num_processors) num_processors = 1; - possible = num_processors + disabled_cpus; - if (possible > NR_CPUS) - possible = NR_CPUS; + if (setup_possible_cpus == -1) + possible = num_processors + disabled_cpus; + else + possible = setup_possible_cpus; + + if (possible > CONFIG_NR_CPUS) { + printk(KERN_WARNING + "%d Processors exceeds NR_CPUS limit of %d\n", + possible, CONFIG_NR_CPUS); + possible = CONFIG_NR_CPUS; + } printk(KERN_INFO "SMP: Allowing %d CPUs, %d hotplug CPUs\n", possible, max_t(int, possible - num_processors, 0)); @@ -1352,7 +1363,7 @@ void cpu_disable_common(void) lock_vector_lock(); remove_cpu_from_maps(cpu); unlock_vector_lock(); - fixup_irqs(cpu_online_map); + fixup_irqs(); } int native_cpu_disable(void) diff --git a/arch/x86/kernel/tlb_32.c b/arch/x86/kernel/tlb_32.c index 8da059f949b..ce505464224 100644 --- a/arch/x86/kernel/tlb_32.c +++ b/arch/x86/kernel/tlb_32.c @@ -163,7 +163,7 @@ void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm, * We have to send the IPI only to * CPUs affected. */ - send_IPI_mask(cpumask, INVALIDATE_TLB_VECTOR); + send_IPI_mask(&cpumask, INVALIDATE_TLB_VECTOR); while (!cpus_empty(flush_cpumask)) /* nothing. lockup detection does not belong here */ diff --git a/arch/x86/kernel/tlb_64.c b/arch/x86/kernel/tlb_64.c index 29887d7081a..f8be6f1d2e4 100644 --- a/arch/x86/kernel/tlb_64.c +++ b/arch/x86/kernel/tlb_64.c @@ -191,7 +191,7 @@ void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm, * We have to send the IPI only to * CPUs affected. */ - send_IPI_mask(cpumask, INVALIDATE_TLB_VECTOR_START + sender); + send_IPI_mask(&cpumask, INVALIDATE_TLB_VECTOR_START + sender); while (!cpus_empty(f->flush_cpumask)) cpu_relax(); diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 961e26a69d5..ce6650eb64e 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -72,9 +72,6 @@ #include "cpu/mcheck/mce.h" -DECLARE_BITMAP(used_vectors, NR_VECTORS); -EXPORT_SYMBOL_GPL(used_vectors); - asmlinkage int system_call(void); /* Do we ignore FPU interrupts ? */ @@ -89,6 +86,9 @@ gate_desc idt_table[256] __attribute__((__section__(".data.idt"))) = { { { { 0, 0 } } }, }; #endif +DECLARE_BITMAP(used_vectors, NR_VECTORS); +EXPORT_SYMBOL_GPL(used_vectors); + static int ignore_nmis; static inline void conditional_sti(struct pt_regs *regs) @@ -946,9 +946,7 @@ dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code) void __init trap_init(void) { -#ifdef CONFIG_X86_32 int i; -#endif #ifdef CONFIG_EISA void __iomem *p = early_ioremap(0x0FFFD9, 4); @@ -1005,11 +1003,15 @@ void __init trap_init(void) } set_system_trap_gate(SYSCALL_VECTOR, &system_call); +#endif /* Reserve all the builtin and the syscall vector: */ for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++) set_bit(i, used_vectors); +#ifdef CONFIG_X86_64 + set_bit(IA32_SYSCALL_VECTOR, used_vectors); +#else set_bit(SYSCALL_VECTOR, used_vectors); #endif /* diff --git a/arch/x86/kernel/vmiclock_32.c b/arch/x86/kernel/vmiclock_32.c index 254ee07f863..c4c1f9e0940 100644 --- a/arch/x86/kernel/vmiclock_32.c +++ b/arch/x86/kernel/vmiclock_32.c @@ -226,7 +226,7 @@ static void __devinit vmi_time_init_clockevent(void) /* Upper bound is clockevent's use of ulong for cycle deltas. */ evt->max_delta_ns = clockevent_delta2ns(ULONG_MAX, evt); evt->min_delta_ns = clockevent_delta2ns(1, evt); - evt->cpumask = cpumask_of_cpu(cpu); + evt->cpumask = cpumask_of(cpu); printk(KERN_WARNING "vmi: registering clock event %s. mult=%lu shift=%u\n", evt->name, evt->mult, evt->shift); diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 59ebd37ad79..e665d1c623c 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -603,10 +603,29 @@ void kvm_free_pit(struct kvm *kvm) static void __inject_pit_timer_intr(struct kvm *kvm) { + struct kvm_vcpu *vcpu; + int i; + mutex_lock(&kvm->lock); kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 1); kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 0); mutex_unlock(&kvm->lock); + + /* + * Provides NMI watchdog support via Virtual Wire mode. + * The route is: PIT -> PIC -> LVT0 in NMI mode. + * + * Note: Our Virtual Wire implementation is simplified, only + * propagating PIT interrupts to all VCPUs when they have set + * LVT0 to NMI delivery. Other PIC interrupts are just sent to + * VCPU0, and only if its LVT0 is in EXTINT mode. + */ + if (kvm->arch.vapics_in_nmi_mode > 0) + for (i = 0; i < KVM_MAX_VCPUS; ++i) { + vcpu = kvm->vcpus[i]; + if (vcpu) + kvm_apic_nmi_wd_deliver(vcpu); + } } void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index 17e41e165f1..179dcb0103f 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -26,10 +26,40 @@ * Port from Qemu. */ #include <linux/mm.h> +#include <linux/bitops.h> #include "irq.h" #include <linux/kvm_host.h> +static void pic_lock(struct kvm_pic *s) +{ + spin_lock(&s->lock); +} + +static void pic_unlock(struct kvm_pic *s) +{ + struct kvm *kvm = s->kvm; + unsigned acks = s->pending_acks; + bool wakeup = s->wakeup_needed; + struct kvm_vcpu *vcpu; + + s->pending_acks = 0; + s->wakeup_needed = false; + + spin_unlock(&s->lock); + + while (acks) { + kvm_notify_acked_irq(kvm, __ffs(acks)); + acks &= acks - 1; + } + + if (wakeup) { + vcpu = s->kvm->vcpus[0]; + if (vcpu) + kvm_vcpu_kick(vcpu); + } +} + static void pic_clear_isr(struct kvm_kpic_state *s, int irq) { s->isr &= ~(1 << irq); @@ -136,17 +166,21 @@ static void pic_update_irq(struct kvm_pic *s) void kvm_pic_update_irq(struct kvm_pic *s) { + pic_lock(s); pic_update_irq(s); + pic_unlock(s); } void kvm_pic_set_irq(void *opaque, int irq, int level) { struct kvm_pic *s = opaque; + pic_lock(s); if (irq >= 0 && irq < PIC_NUM_PINS) { pic_set_irq1(&s->pics[irq >> 3], irq & 7, level); pic_update_irq(s); } + pic_unlock(s); } /* @@ -172,6 +206,7 @@ int kvm_pic_read_irq(struct kvm *kvm) int irq, irq2, intno; struct kvm_pic *s = pic_irqchip(kvm); + pic_lock(s); irq = pic_get_irq(&s->pics[0]); if (irq >= 0) { pic_intack(&s->pics[0], irq); @@ -196,6 +231,7 @@ int kvm_pic_read_irq(struct kvm *kvm) intno = s->pics[0].irq_base + irq; } pic_update_irq(s); + pic_unlock(s); kvm_notify_acked_irq(kvm, irq); return intno; @@ -203,7 +239,7 @@ int kvm_pic_read_irq(struct kvm *kvm) void kvm_pic_reset(struct kvm_kpic_state *s) { - int irq, irqbase; + int irq, irqbase, n; struct kvm *kvm = s->pics_state->irq_request_opaque; struct kvm_vcpu *vcpu0 = kvm->vcpus[0]; @@ -214,8 +250,10 @@ void kvm_pic_reset(struct kvm_kpic_state *s) for (irq = 0; irq < PIC_NUM_PINS/2; irq++) { if (vcpu0 && kvm_apic_accept_pic_intr(vcpu0)) - if (s->irr & (1 << irq) || s->isr & (1 << irq)) - kvm_notify_acked_irq(kvm, irq+irqbase); + if (s->irr & (1 << irq) || s->isr & (1 << irq)) { + n = irq + irqbase; + s->pics_state->pending_acks |= 1 << n; + } } s->last_irr = 0; s->irr = 0; @@ -406,6 +444,7 @@ static void picdev_write(struct kvm_io_device *this, printk(KERN_ERR "PIC: non byte write\n"); return; } + pic_lock(s); switch (addr) { case 0x20: case 0x21: @@ -418,6 +457,7 @@ static void picdev_write(struct kvm_io_device *this, elcr_ioport_write(&s->pics[addr & 1], addr, data); break; } + pic_unlock(s); } static void picdev_read(struct kvm_io_device *this, @@ -431,6 +471,7 @@ static void picdev_read(struct kvm_io_device *this, printk(KERN_ERR "PIC: non byte read\n"); return; } + pic_lock(s); switch (addr) { case 0x20: case 0x21: @@ -444,6 +485,7 @@ static void picdev_read(struct kvm_io_device *this, break; } *(unsigned char *)val = data; + pic_unlock(s); } /* @@ -459,7 +501,7 @@ static void pic_irq_request(void *opaque, int level) s->output = level; if (vcpu && level && (s->pics[0].isr_ack & (1 << irq))) { s->pics[0].isr_ack &= ~(1 << irq); - kvm_vcpu_kick(vcpu); + s->wakeup_needed = true; } } @@ -469,6 +511,8 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm) s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL); if (!s) return NULL; + spin_lock_init(&s->lock); + s->kvm = kvm; s->pics[0].elcr_mask = 0xf8; s->pics[1].elcr_mask = 0xde; s->irq_request = pic_irq_request; diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index f17c8f5bbf3..2bf32a03cee 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -25,6 +25,7 @@ #include <linux/mm_types.h> #include <linux/hrtimer.h> #include <linux/kvm_host.h> +#include <linux/spinlock.h> #include "iodev.h" #include "ioapic.h" @@ -59,6 +60,10 @@ struct kvm_kpic_state { }; struct kvm_pic { + spinlock_t lock; + bool wakeup_needed; + unsigned pending_acks; + struct kvm *kvm; struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */ irq_request_func *irq_request; void *irq_request_opaque; @@ -87,6 +92,7 @@ void kvm_pic_reset(struct kvm_kpic_state *s); void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec); void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu); void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu); +void kvm_apic_nmi_wd_deliver(struct kvm_vcpu *vcpu); void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu); void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu); void __kvm_migrate_timers(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/kvm_svm.h b/arch/x86/kvm/kvm_svm.h index 65ef0fc2c03..8e5ee99551f 100644 --- a/arch/x86/kvm/kvm_svm.h +++ b/arch/x86/kvm/kvm_svm.h @@ -7,7 +7,7 @@ #include <linux/kvm_host.h> #include <asm/msr.h> -#include "svm.h" +#include <asm/svm.h> static const u32 host_save_user_msrs[] = { #ifdef CONFIG_X86_64 diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 0fc3cab4894..afac68c0815 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -130,6 +130,11 @@ static inline int apic_lvtt_period(struct kvm_lapic *apic) return apic_get_reg(apic, APIC_LVTT) & APIC_LVT_TIMER_PERIODIC; } +static inline int apic_lvt_nmi_mode(u32 lvt_val) +{ + return (lvt_val & (APIC_MODE_MASK | APIC_LVT_MASKED)) == APIC_DM_NMI; +} + static unsigned int apic_lvt_mask[APIC_LVT_NUM] = { LVT_MASK | APIC_LVT_TIMER_PERIODIC, /* LVTT */ LVT_MASK | APIC_MODE_MASK, /* LVTTHMR */ @@ -354,6 +359,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, case APIC_DM_NMI: kvm_inject_nmi(vcpu); + kvm_vcpu_kick(vcpu); break; case APIC_DM_INIT: @@ -380,6 +386,14 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, } break; + case APIC_DM_EXTINT: + /* + * Should only be called by kvm_apic_local_deliver() with LVT0, + * before NMI watchdog was enabled. Already handled by + * kvm_apic_accept_pic_intr(). + */ + break; + default: printk(KERN_ERR "TODO: unsupported delivery mode %x\n", delivery_mode); @@ -663,6 +677,20 @@ static void start_apic_timer(struct kvm_lapic *apic) apic->timer.period))); } +static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val) +{ + int nmi_wd_enabled = apic_lvt_nmi_mode(apic_get_reg(apic, APIC_LVT0)); + + if (apic_lvt_nmi_mode(lvt0_val)) { + if (!nmi_wd_enabled) { + apic_debug("Receive NMI setting on APIC_LVT0 " + "for cpu %d\n", apic->vcpu->vcpu_id); + apic->vcpu->kvm->arch.vapics_in_nmi_mode++; + } + } else if (nmi_wd_enabled) + apic->vcpu->kvm->arch.vapics_in_nmi_mode--; +} + static void apic_mmio_write(struct kvm_io_device *this, gpa_t address, int len, const void *data) { @@ -743,10 +771,11 @@ static void apic_mmio_write(struct kvm_io_device *this, apic_set_reg(apic, APIC_ICR2, val & 0xff000000); break; + case APIC_LVT0: + apic_manage_nmi_watchdog(apic, val); case APIC_LVTT: case APIC_LVTTHMR: case APIC_LVTPC: - case APIC_LVT0: case APIC_LVT1: case APIC_LVTERR: /* TODO: Check vector */ @@ -961,12 +990,26 @@ int apic_has_pending_timer(struct kvm_vcpu *vcpu) return 0; } -static int __inject_apic_timer_irq(struct kvm_lapic *apic) +static int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type) +{ + u32 reg = apic_get_reg(apic, lvt_type); + int vector, mode, trig_mode; + + if (apic_hw_enabled(apic) && !(reg & APIC_LVT_MASKED)) { + vector = reg & APIC_VECTOR_MASK; + mode = reg & APIC_MODE_MASK; + trig_mode = reg & APIC_LVT_LEVEL_TRIGGER; + return __apic_accept_irq(apic, mode, vector, 1, trig_mode); + } + return 0; +} + +void kvm_apic_nmi_wd_deliver(struct kvm_vcpu *vcpu) { - int vector; + struct kvm_lapic *apic = vcpu->arch.apic; - vector = apic_lvt_vector(apic, APIC_LVTT); - return __apic_accept_irq(apic, APIC_DM_FIXED, vector, 1, 0); + if (apic) + kvm_apic_local_deliver(apic, APIC_LVT0); } static enum hrtimer_restart apic_timer_fn(struct hrtimer *data) @@ -1061,9 +1104,8 @@ void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; - if (apic && apic_lvt_enabled(apic, APIC_LVTT) && - atomic_read(&apic->timer.pending) > 0) { - if (__inject_apic_timer_irq(apic)) + if (apic && atomic_read(&apic->timer.pending) > 0) { + if (kvm_apic_local_deliver(apic, APIC_LVTT)) atomic_dec(&apic->timer.pending); } } diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 410ddbc1aa2..83f11c7474a 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -17,7 +17,6 @@ * */ -#include "vmx.h" #include "mmu.h" #include <linux/kvm_host.h> @@ -33,6 +32,7 @@ #include <asm/page.h> #include <asm/cmpxchg.h> #include <asm/io.h> +#include <asm/vmx.h> /* * When setting this variable to true it enables Two-Dimensional-Paging @@ -168,6 +168,7 @@ static u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */ static u64 __read_mostly shadow_user_mask; static u64 __read_mostly shadow_accessed_mask; static u64 __read_mostly shadow_dirty_mask; +static u64 __read_mostly shadow_mt_mask; void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte) { @@ -183,13 +184,14 @@ void kvm_mmu_set_base_ptes(u64 base_pte) EXPORT_SYMBOL_GPL(kvm_mmu_set_base_ptes); void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, - u64 dirty_mask, u64 nx_mask, u64 x_mask) + u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 mt_mask) { shadow_user_mask = user_mask; shadow_accessed_mask = accessed_mask; shadow_dirty_mask = dirty_mask; shadow_nx_mask = nx_mask; shadow_x_mask = x_mask; + shadow_mt_mask = mt_mask; } EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes); @@ -384,7 +386,9 @@ static void account_shadowed(struct kvm *kvm, gfn_t gfn) { int *write_count; - write_count = slot_largepage_idx(gfn, gfn_to_memslot(kvm, gfn)); + gfn = unalias_gfn(kvm, gfn); + write_count = slot_largepage_idx(gfn, + gfn_to_memslot_unaliased(kvm, gfn)); *write_count += 1; } @@ -392,16 +396,20 @@ static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn) { int *write_count; - write_count = slot_largepage_idx(gfn, gfn_to_memslot(kvm, gfn)); + gfn = unalias_gfn(kvm, gfn); + write_count = slot_largepage_idx(gfn, + gfn_to_memslot_unaliased(kvm, gfn)); *write_count -= 1; WARN_ON(*write_count < 0); } static int has_wrprotected_page(struct kvm *kvm, gfn_t gfn) { - struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn); + struct kvm_memory_slot *slot; int *largepage_idx; + gfn = unalias_gfn(kvm, gfn); + slot = gfn_to_memslot_unaliased(kvm, gfn); if (slot) { largepage_idx = slot_largepage_idx(gfn, slot); return *largepage_idx; @@ -613,7 +621,7 @@ static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte) return NULL; } -static void rmap_write_protect(struct kvm *kvm, u64 gfn) +static int rmap_write_protect(struct kvm *kvm, u64 gfn) { unsigned long *rmapp; u64 *spte; @@ -659,8 +667,7 @@ static void rmap_write_protect(struct kvm *kvm, u64 gfn) spte = rmap_next(kvm, rmapp, spte); } - if (write_protected) - kvm_flush_remote_tlbs(kvm); + return write_protected; } static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp) @@ -786,9 +793,11 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE); set_page_private(virt_to_page(sp->spt), (unsigned long)sp); list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages); + INIT_LIST_HEAD(&sp->oos_link); ASSERT(is_empty_shadow_page(sp->spt)); - sp->slot_bitmap = 0; + bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS); sp->multimapped = 0; + sp->global = 1; sp->parent_pte = parent_pte; --vcpu->kvm->arch.n_free_mmu_pages; return sp; @@ -900,8 +909,9 @@ static void kvm_mmu_update_unsync_bitmap(u64 *spte) struct kvm_mmu_page *sp = page_header(__pa(spte)); index = spte - sp->spt; - __set_bit(index, sp->unsync_child_bitmap); - sp->unsync_children = 1; + if (!__test_and_set_bit(index, sp->unsync_child_bitmap)) + sp->unsync_children++; + WARN_ON(!sp->unsync_children); } static void kvm_mmu_update_parents_unsync(struct kvm_mmu_page *sp) @@ -928,7 +938,6 @@ static void kvm_mmu_update_parents_unsync(struct kvm_mmu_page *sp) static int unsync_walk_fn(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) { - sp->unsync_children = 1; kvm_mmu_update_parents_unsync(sp); return 1; } @@ -959,38 +968,66 @@ static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva) { } +#define KVM_PAGE_ARRAY_NR 16 + +struct kvm_mmu_pages { + struct mmu_page_and_offset { + struct kvm_mmu_page *sp; + unsigned int idx; + } page[KVM_PAGE_ARRAY_NR]; + unsigned int nr; +}; + #define for_each_unsync_children(bitmap, idx) \ for (idx = find_first_bit(bitmap, 512); \ idx < 512; \ idx = find_next_bit(bitmap, 512, idx+1)) -static int mmu_unsync_walk(struct kvm_mmu_page *sp, - struct kvm_unsync_walk *walker) +int mmu_pages_add(struct kvm_mmu_pages *pvec, struct kvm_mmu_page *sp, + int idx) { - int i, ret; + int i; - if (!sp->unsync_children) - return 0; + if (sp->unsync) + for (i=0; i < pvec->nr; i++) + if (pvec->page[i].sp == sp) + return 0; + + pvec->page[pvec->nr].sp = sp; + pvec->page[pvec->nr].idx = idx; + pvec->nr++; + return (pvec->nr == KVM_PAGE_ARRAY_NR); +} + +static int __mmu_unsync_walk(struct kvm_mmu_page *sp, + struct kvm_mmu_pages *pvec) +{ + int i, ret, nr_unsync_leaf = 0; for_each_unsync_children(sp->unsync_child_bitmap, i) { u64 ent = sp->spt[i]; - if (is_shadow_present_pte(ent)) { + if (is_shadow_present_pte(ent) && !is_large_pte(ent)) { struct kvm_mmu_page *child; child = page_header(ent & PT64_BASE_ADDR_MASK); if (child->unsync_children) { - ret = mmu_unsync_walk(child, walker); - if (ret) + if (mmu_pages_add(pvec, child, i)) + return -ENOSPC; + + ret = __mmu_unsync_walk(child, pvec); + if (!ret) + __clear_bit(i, sp->unsync_child_bitmap); + else if (ret > 0) + nr_unsync_leaf += ret; + else return ret; - __clear_bit(i, sp->unsync_child_bitmap); } if (child->unsync) { - ret = walker->entry(child, walker); - __clear_bit(i, sp->unsync_child_bitmap); - if (ret) - return ret; + nr_unsync_leaf++; + if (mmu_pages_add(pvec, child, i)) + return -ENOSPC; } } } @@ -998,7 +1035,17 @@ static int mmu_unsync_walk(struct kvm_mmu_page *sp, if (find_first_bit(sp->unsync_child_bitmap, 512) == 512) sp->unsync_children = 0; - return 0; + return nr_unsync_leaf; +} + +static int mmu_unsync_walk(struct kvm_mmu_page *sp, + struct kvm_mmu_pages *pvec) +{ + if (!sp->unsync_children) + return 0; + + mmu_pages_add(pvec, sp, 0); + return __mmu_unsync_walk(sp, pvec); } static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn) @@ -1021,10 +1068,18 @@ static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn) return NULL; } +static void kvm_unlink_unsync_global(struct kvm *kvm, struct kvm_mmu_page *sp) +{ + list_del(&sp->oos_link); + --kvm->stat.mmu_unsync_global; +} + static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp) { WARN_ON(!sp->unsync); sp->unsync = 0; + if (sp->global) + kvm_unlink_unsync_global(kvm, sp); --kvm->stat.mmu_unsync; } @@ -1037,7 +1092,8 @@ static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) return 1; } - rmap_write_protect(vcpu->kvm, sp->gfn); + if (rmap_write_protect(vcpu->kvm, sp->gfn)) + kvm_flush_remote_tlbs(vcpu->kvm); kvm_unlink_unsync_page(vcpu->kvm, sp); if (vcpu->arch.mmu.sync_page(vcpu, sp)) { kvm_mmu_zap_page(vcpu->kvm, sp); @@ -1048,30 +1104,89 @@ static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) return 0; } -struct sync_walker { - struct kvm_vcpu *vcpu; - struct kvm_unsync_walk walker; +struct mmu_page_path { + struct kvm_mmu_page *parent[PT64_ROOT_LEVEL-1]; + unsigned int idx[PT64_ROOT_LEVEL-1]; }; -static int mmu_sync_fn(struct kvm_mmu_page *sp, struct kvm_unsync_walk *walk) +#define for_each_sp(pvec, sp, parents, i) \ + for (i = mmu_pages_next(&pvec, &parents, -1), \ + sp = pvec.page[i].sp; \ + i < pvec.nr && ({ sp = pvec.page[i].sp; 1;}); \ + i = mmu_pages_next(&pvec, &parents, i)) + +int mmu_pages_next(struct kvm_mmu_pages *pvec, struct mmu_page_path *parents, + int i) { - struct sync_walker *sync_walk = container_of(walk, struct sync_walker, - walker); - struct kvm_vcpu *vcpu = sync_walk->vcpu; + int n; - kvm_sync_page(vcpu, sp); - return (need_resched() || spin_needbreak(&vcpu->kvm->mmu_lock)); + for (n = i+1; n < pvec->nr; n++) { + struct kvm_mmu_page *sp = pvec->page[n].sp; + + if (sp->role.level == PT_PAGE_TABLE_LEVEL) { + parents->idx[0] = pvec->page[n].idx; + return n; + } + + parents->parent[sp->role.level-2] = sp; + parents->idx[sp->role.level-1] = pvec->page[n].idx; + } + + return n; } -static void mmu_sync_children(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) +void mmu_pages_clear_parents(struct mmu_page_path *parents) { - struct sync_walker walker = { - .walker = { .entry = mmu_sync_fn, }, - .vcpu = vcpu, - }; + struct kvm_mmu_page *sp; + unsigned int level = 0; + + do { + unsigned int idx = parents->idx[level]; + + sp = parents->parent[level]; + if (!sp) + return; + + --sp->unsync_children; + WARN_ON((int)sp->unsync_children < 0); + __clear_bit(idx, sp->unsync_child_bitmap); + level++; + } while (level < PT64_ROOT_LEVEL-1 && !sp->unsync_children); +} + +static void kvm_mmu_pages_init(struct kvm_mmu_page *parent, + struct mmu_page_path *parents, + struct kvm_mmu_pages *pvec) +{ + parents->parent[parent->role.level-1] = NULL; + pvec->nr = 0; +} + +static void mmu_sync_children(struct kvm_vcpu *vcpu, + struct kvm_mmu_page *parent) +{ + int i; + struct kvm_mmu_page *sp; + struct mmu_page_path parents; + struct kvm_mmu_pages pages; + + kvm_mmu_pages_init(parent, &parents, &pages); + while (mmu_unsync_walk(parent, &pages)) { + int protected = 0; - while (mmu_unsync_walk(sp, &walker.walker)) + for_each_sp(pages, sp, parents, i) + protected |= rmap_write_protect(vcpu->kvm, sp->gfn); + + if (protected) + kvm_flush_remote_tlbs(vcpu->kvm); + + for_each_sp(pages, sp, parents, i) { + kvm_sync_page(vcpu, sp); + mmu_pages_clear_parents(&parents); + } cond_resched_lock(&vcpu->kvm->mmu_lock); + kvm_mmu_pages_init(parent, &parents, &pages); + } } static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, @@ -1129,7 +1244,8 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, sp->role = role; hlist_add_head(&sp->hash_link, bucket); if (!metaphysical) { - rmap_write_protect(vcpu->kvm, gfn); + if (rmap_write_protect(vcpu->kvm, gfn)) + kvm_flush_remote_tlbs(vcpu->kvm); account_shadowed(vcpu->kvm, gfn); } if (shadow_trap_nonpresent_pte != shadow_notrap_nonpresent_pte) @@ -1153,6 +1269,8 @@ static int walk_shadow(struct kvm_shadow_walk *walker, if (level == PT32E_ROOT_LEVEL) { shadow_addr = vcpu->arch.mmu.pae_root[(addr >> 30) & 3]; shadow_addr &= PT64_BASE_ADDR_MASK; + if (!shadow_addr) + return 1; --level; } @@ -1237,33 +1355,29 @@ static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp) } } -struct zap_walker { - struct kvm_unsync_walk walker; - struct kvm *kvm; - int zapped; -}; - -static int mmu_zap_fn(struct kvm_mmu_page *sp, struct kvm_unsync_walk *walk) +static int mmu_zap_unsync_children(struct kvm *kvm, + struct kvm_mmu_page *parent) { - struct zap_walker *zap_walk = container_of(walk, struct zap_walker, - walker); - kvm_mmu_zap_page(zap_walk->kvm, sp); - zap_walk->zapped = 1; - return 0; -} + int i, zapped = 0; + struct mmu_page_path parents; + struct kvm_mmu_pages pages; -static int mmu_zap_unsync_children(struct kvm *kvm, struct kvm_mmu_page *sp) -{ - struct zap_walker walker = { - .walker = { .entry = mmu_zap_fn, }, - .kvm = kvm, - .zapped = 0, - }; - - if (sp->role.level == PT_PAGE_TABLE_LEVEL) + if (parent->role.level == PT_PAGE_TABLE_LEVEL) return 0; - mmu_unsync_walk(sp, &walker.walker); - return walker.zapped; + + kvm_mmu_pages_init(parent, &parents, &pages); + while (mmu_unsync_walk(parent, &pages)) { + struct kvm_mmu_page *sp; + + for_each_sp(pages, sp, parents, i) { + kvm_mmu_zap_page(kvm, sp); + mmu_pages_clear_parents(&parents); + } + zapped += pages.nr; + kvm_mmu_pages_init(parent, &parents, &pages); + } + + return zapped; } static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp) @@ -1362,7 +1476,7 @@ static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn) int slot = memslot_id(kvm, gfn_to_memslot(kvm, gfn)); struct kvm_mmu_page *sp = page_header(__pa(pte)); - __set_bit(slot, &sp->slot_bitmap); + __set_bit(slot, sp->slot_bitmap); } static void mmu_convert_notrap(struct kvm_mmu_page *sp) @@ -1393,6 +1507,110 @@ struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva) return page; } +/* + * The function is based on mtrr_type_lookup() in + * arch/x86/kernel/cpu/mtrr/generic.c + */ +static int get_mtrr_type(struct mtrr_state_type *mtrr_state, + u64 start, u64 end) +{ + int i; + u64 base, mask; + u8 prev_match, curr_match; + int num_var_ranges = KVM_NR_VAR_MTRR; + + if (!mtrr_state->enabled) + return 0xFF; + + /* Make end inclusive end, instead of exclusive */ + end--; + + /* Look in fixed ranges. Just return the type as per start */ + if (mtrr_state->have_fixed && (start < 0x100000)) { + int idx; + + if (start < 0x80000) { + idx = 0; + idx += (start >> 16); + return mtrr_state->fixed_ranges[idx]; + } else if (start < 0xC0000) { + idx = 1 * 8; + idx += ((start - 0x80000) >> 14); + return mtrr_state->fixed_ranges[idx]; + } else if (start < 0x1000000) { + idx = 3 * 8; + idx += ((start - 0xC0000) >> 12); + return mtrr_state->fixed_ranges[idx]; + } + } + + /* + * Look in variable ranges + * Look of multiple ranges matching this address and pick type + * as per MTRR precedence + */ + if (!(mtrr_state->enabled & 2)) + return mtrr_state->def_type; + + prev_match = 0xFF; + for (i = 0; i < num_var_ranges; ++i) { + unsigned short start_state, end_state; + + if (!(mtrr_state->var_ranges[i].mask_lo & (1 << 11))) + continue; + + base = (((u64)mtrr_state->var_ranges[i].base_hi) << 32) + + (mtrr_state->var_ranges[i].base_lo & PAGE_MASK); + mask = (((u64)mtrr_state->var_ranges[i].mask_hi) << 32) + + (mtrr_state->var_ranges[i].mask_lo & PAGE_MASK); + + start_state = ((start & mask) == (base & mask)); + end_state = ((end & mask) == (base & mask)); + if (start_state != end_state) + return 0xFE; + + if ((start & mask) != (base & mask)) + continue; + + curr_match = mtrr_state->var_ranges[i].base_lo & 0xff; + if (prev_match == 0xFF) { + prev_match = curr_match; + continue; + } + + if (prev_match == MTRR_TYPE_UNCACHABLE || + curr_match == MTRR_TYPE_UNCACHABLE) + return MTRR_TYPE_UNCACHABLE; + + if ((prev_match == MTRR_TYPE_WRBACK && + curr_match == MTRR_TYPE_WRTHROUGH) || + (prev_match == MTRR_TYPE_WRTHROUGH && + curr_match == MTRR_TYPE_WRBACK)) { + prev_match = MTRR_TYPE_WRTHROUGH; + curr_match = MTRR_TYPE_WRTHROUGH; + } + + if (prev_match != curr_match) + return MTRR_TYPE_UNCACHABLE; + } + + if (prev_match != 0xFF) + return prev_match; + + return mtrr_state->def_type; +} + +static u8 get_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn) +{ + u8 mtrr; + + mtrr = get_mtrr_type(&vcpu->arch.mtrr_state, gfn << PAGE_SHIFT, + (gfn << PAGE_SHIFT) + PAGE_SIZE); + if (mtrr == 0xfe || mtrr == 0xff) + mtrr = MTRR_TYPE_WRBACK; + return mtrr; +} + static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) { unsigned index; @@ -1409,9 +1627,15 @@ static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) if (s->role.word != sp->role.word) return 1; } - kvm_mmu_mark_parents_unsync(vcpu, sp); ++vcpu->kvm->stat.mmu_unsync; sp->unsync = 1; + + if (sp->global) { + list_add(&sp->oos_link, &vcpu->kvm->arch.oos_global_pages); + ++vcpu->kvm->stat.mmu_unsync_global; + } else + kvm_mmu_mark_parents_unsync(vcpu, sp); + mmu_convert_notrap(sp); return 0; } @@ -1437,11 +1661,24 @@ static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, unsigned pte_access, int user_fault, int write_fault, int dirty, int largepage, - gfn_t gfn, pfn_t pfn, bool speculative, + int global, gfn_t gfn, pfn_t pfn, bool speculative, bool can_unsync) { u64 spte; int ret = 0; + u64 mt_mask = shadow_mt_mask; + struct kvm_mmu_page *sp = page_header(__pa(shadow_pte)); + + if (!(vcpu->arch.cr4 & X86_CR4_PGE)) + global = 0; + if (!global && sp->global) { + sp->global = 0; + if (sp->unsync) { + kvm_unlink_unsync_global(vcpu->kvm, sp); + kvm_mmu_mark_parents_unsync(vcpu, sp); + } + } + /* * We don't set the accessed bit, since we sometimes want to see * whether the guest actually used the pte (in order to detect @@ -1460,6 +1697,11 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, spte |= shadow_user_mask; if (largepage) spte |= PT_PAGE_SIZE_MASK; + if (mt_mask) { + mt_mask = get_memory_type(vcpu, gfn) << + kvm_x86_ops->get_mt_mask_shift(); + spte |= mt_mask; + } spte |= (u64)pfn << PAGE_SHIFT; @@ -1474,6 +1716,15 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, spte |= PT_WRITABLE_MASK; + /* + * Optimization: for pte sync, if spte was writable the hash + * lookup is unnecessary (and expensive). Write protection + * is responsibility of mmu_get_page / kvm_sync_page. + * Same reasoning can be applied to dirty page accounting. + */ + if (!can_unsync && is_writeble_pte(*shadow_pte)) + goto set_pte; + if (mmu_need_write_protect(vcpu, gfn, can_unsync)) { pgprintk("%s: found shadow page for %lx, marking ro\n", __func__, gfn); @@ -1495,8 +1746,8 @@ set_pte: static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, unsigned pt_access, unsigned pte_access, int user_fault, int write_fault, int dirty, - int *ptwrite, int largepage, gfn_t gfn, - pfn_t pfn, bool speculative) + int *ptwrite, int largepage, int global, + gfn_t gfn, pfn_t pfn, bool speculative) { int was_rmapped = 0; int was_writeble = is_writeble_pte(*shadow_pte); @@ -1529,7 +1780,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, } } if (set_spte(vcpu, shadow_pte, pte_access, user_fault, write_fault, - dirty, largepage, gfn, pfn, speculative, true)) { + dirty, largepage, global, gfn, pfn, speculative, true)) { if (write_fault) *ptwrite = 1; kvm_x86_ops->tlb_flush(vcpu); @@ -1586,7 +1837,7 @@ static int direct_map_entry(struct kvm_shadow_walk *_walk, || (walk->largepage && level == PT_DIRECTORY_LEVEL)) { mmu_set_spte(vcpu, sptep, ACC_ALL, ACC_ALL, 0, walk->write, 1, &walk->pt_write, - walk->largepage, gfn, walk->pfn, false); + walk->largepage, 0, gfn, walk->pfn, false); ++vcpu->stat.pf_fixed; return 1; } @@ -1773,6 +2024,15 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu) } } +static void mmu_sync_global(struct kvm_vcpu *vcpu) +{ + struct kvm *kvm = vcpu->kvm; + struct kvm_mmu_page *sp, *n; + + list_for_each_entry_safe(sp, n, &kvm->arch.oos_global_pages, oos_link) + kvm_sync_page(vcpu, sp); +} + void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) { spin_lock(&vcpu->kvm->mmu_lock); @@ -1780,6 +2040,13 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) spin_unlock(&vcpu->kvm->mmu_lock); } +void kvm_mmu_sync_global(struct kvm_vcpu *vcpu) +{ + spin_lock(&vcpu->kvm->mmu_lock); + mmu_sync_global(vcpu); + spin_unlock(&vcpu->kvm->mmu_lock); +} + static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr) { return vaddr; @@ -2178,7 +2445,8 @@ static void kvm_mmu_access_page(struct kvm_vcpu *vcpu, gfn_t gfn) } void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, - const u8 *new, int bytes) + const u8 *new, int bytes, + bool guest_initiated) { gfn_t gfn = gpa >> PAGE_SHIFT; struct kvm_mmu_page *sp; @@ -2204,15 +2472,17 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, kvm_mmu_free_some_pages(vcpu); ++vcpu->kvm->stat.mmu_pte_write; kvm_mmu_audit(vcpu, "pre pte write"); - if (gfn == vcpu->arch.last_pt_write_gfn - && !last_updated_pte_accessed(vcpu)) { - ++vcpu->arch.last_pt_write_count; - if (vcpu->arch.last_pt_write_count >= 3) - flooded = 1; - } else { - vcpu->arch.last_pt_write_gfn = gfn; - vcpu->arch.last_pt_write_count = 1; - vcpu->arch.last_pte_updated = NULL; + if (guest_initiated) { + if (gfn == vcpu->arch.last_pt_write_gfn + && !last_updated_pte_accessed(vcpu)) { + ++vcpu->arch.last_pt_write_count; + if (vcpu->arch.last_pt_write_count >= 3) + flooded = 1; + } else { + vcpu->arch.last_pt_write_gfn = gfn; + vcpu->arch.last_pt_write_count = 1; + vcpu->arch.last_pte_updated = NULL; + } } index = kvm_page_table_hashfn(gfn); bucket = &vcpu->kvm->arch.mmu_page_hash[index]; @@ -2352,9 +2622,7 @@ EXPORT_SYMBOL_GPL(kvm_mmu_page_fault); void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva) { - spin_lock(&vcpu->kvm->mmu_lock); vcpu->arch.mmu.invlpg(vcpu, gva); - spin_unlock(&vcpu->kvm->mmu_lock); kvm_mmu_flush_tlb(vcpu); ++vcpu->stat.invlpg; } @@ -2451,7 +2719,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) int i; u64 *pt; - if (!test_bit(slot, &sp->slot_bitmap)) + if (!test_bit(slot, sp->slot_bitmap)) continue; pt = sp->spt; @@ -2860,8 +3128,8 @@ static void audit_write_protection(struct kvm_vcpu *vcpu) if (sp->role.metaphysical) continue; - slot = gfn_to_memslot(vcpu->kvm, sp->gfn); gfn = unalias_gfn(vcpu->kvm, sp->gfn); + slot = gfn_to_memslot_unaliased(vcpu->kvm, sp->gfn); rmapp = &slot->rmap[gfn - slot->base_gfn]; if (*rmapp) printk(KERN_ERR "%s: (%s) shadow page has writable" diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 84eee43bbe7..9fd78b6e17a 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -82,6 +82,7 @@ struct shadow_walker { int *ptwrite; pfn_t pfn; u64 *sptep; + gpa_t pte_gpa; }; static gfn_t gpte_to_gfn(pt_element_t gpte) @@ -222,7 +223,7 @@ walk: if (ret) goto walk; pte |= PT_DIRTY_MASK; - kvm_mmu_pte_write(vcpu, pte_gpa, (u8 *)&pte, sizeof(pte)); + kvm_mmu_pte_write(vcpu, pte_gpa, (u8 *)&pte, sizeof(pte), 0); walker->ptes[walker->level - 1] = pte; } @@ -274,7 +275,8 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, return; kvm_get_pfn(pfn); mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0, - gpte & PT_DIRTY_MASK, NULL, largepage, gpte_to_gfn(gpte), + gpte & PT_DIRTY_MASK, NULL, largepage, + gpte & PT_GLOBAL_MASK, gpte_to_gfn(gpte), pfn, true); } @@ -301,8 +303,9 @@ static int FNAME(shadow_walk_entry)(struct kvm_shadow_walk *_sw, mmu_set_spte(vcpu, sptep, access, gw->pte_access & access, sw->user_fault, sw->write_fault, gw->ptes[gw->level-1] & PT_DIRTY_MASK, - sw->ptwrite, sw->largepage, gw->gfn, sw->pfn, - false); + sw->ptwrite, sw->largepage, + gw->ptes[gw->level-1] & PT_GLOBAL_MASK, + gw->gfn, sw->pfn, false); sw->sptep = sptep; return 1; } @@ -466,10 +469,22 @@ static int FNAME(shadow_invlpg_entry)(struct kvm_shadow_walk *_sw, struct kvm_vcpu *vcpu, u64 addr, u64 *sptep, int level) { + struct shadow_walker *sw = + container_of(_sw, struct shadow_walker, walker); - if (level == PT_PAGE_TABLE_LEVEL) { - if (is_shadow_present_pte(*sptep)) + /* FIXME: properly handle invlpg on large guest pages */ + if (level == PT_PAGE_TABLE_LEVEL || + ((level == PT_DIRECTORY_LEVEL) && is_large_pte(*sptep))) { + struct kvm_mmu_page *sp = page_header(__pa(sptep)); + + sw->pte_gpa = (sp->gfn << PAGE_SHIFT); + sw->pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t); + + if (is_shadow_present_pte(*sptep)) { rmap_remove(vcpu->kvm, sptep); + if (is_large_pte(*sptep)) + --vcpu->kvm->stat.lpages; + } set_shadow_pte(sptep, shadow_trap_nonpresent_pte); return 1; } @@ -480,11 +495,26 @@ static int FNAME(shadow_invlpg_entry)(struct kvm_shadow_walk *_sw, static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) { + pt_element_t gpte; struct shadow_walker walker = { .walker = { .entry = FNAME(shadow_invlpg_entry), }, + .pte_gpa = -1, }; + spin_lock(&vcpu->kvm->mmu_lock); walk_shadow(&walker.walker, vcpu, gva); + spin_unlock(&vcpu->kvm->mmu_lock); + if (walker.pte_gpa == -1) + return; + if (kvm_read_guest_atomic(vcpu->kvm, walker.pte_gpa, &gpte, + sizeof(pt_element_t))) + return; + if (is_present_pte(gpte) && (gpte & PT_ACCESSED_MASK)) { + if (mmu_topup_memory_caches(vcpu)) + return; + kvm_mmu_pte_write(vcpu, walker.pte_gpa, (const u8 *)&gpte, + sizeof(pt_element_t), 0); + } } static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr) @@ -580,7 +610,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) nr_present++; pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); set_spte(vcpu, &sp->spt[i], pte_access, 0, 0, - is_dirty_pte(gpte), 0, gfn, + is_dirty_pte(gpte), 0, gpte & PT_GLOBAL_MASK, gfn, spte_to_pfn(sp->spt[i]), true, false); } diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 9c4ce657d96..1452851ae25 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -28,6 +28,8 @@ #include <asm/desc.h> +#include <asm/virtext.h> + #define __ex(x) __kvm_handle_fault_on_reboot(x) MODULE_AUTHOR("Qumranet"); @@ -245,34 +247,19 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu) static int has_svm(void) { - uint32_t eax, ebx, ecx, edx; - - if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) { - printk(KERN_INFO "has_svm: not amd\n"); - return 0; - } + const char *msg; - cpuid(0x80000000, &eax, &ebx, &ecx, &edx); - if (eax < SVM_CPUID_FUNC) { - printk(KERN_INFO "has_svm: can't execute cpuid_8000000a\n"); + if (!cpu_has_svm(&msg)) { + printk(KERN_INFO "has_svn: %s\n", msg); return 0; } - cpuid(0x80000001, &eax, &ebx, &ecx, &edx); - if (!(ecx & (1 << SVM_CPUID_FEATURE_SHIFT))) { - printk(KERN_DEBUG "has_svm: svm not available\n"); - return 0; - } return 1; } static void svm_hardware_disable(void *garbage) { - uint64_t efer; - - wrmsrl(MSR_VM_HSAVE_PA, 0); - rdmsrl(MSR_EFER, efer); - wrmsrl(MSR_EFER, efer & ~MSR_EFER_SVME_MASK); + cpu_svm_disable(); } static void svm_hardware_enable(void *garbage) @@ -772,6 +759,22 @@ static void svm_get_segment(struct kvm_vcpu *vcpu, var->l = (s->attrib >> SVM_SELECTOR_L_SHIFT) & 1; var->db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1; var->g = (s->attrib >> SVM_SELECTOR_G_SHIFT) & 1; + + /* + * SVM always stores 0 for the 'G' bit in the CS selector in + * the VMCB on a VMEXIT. This hurts cross-vendor migration: + * Intel's VMENTRY has a check on the 'G' bit. + */ + if (seg == VCPU_SREG_CS) + var->g = s->limit > 0xfffff; + + /* + * Work around a bug where the busy flag in the tr selector + * isn't exposed + */ + if (seg == VCPU_SREG_TR) + var->type |= 0x2; + var->unusable = !var->present; } @@ -1099,6 +1102,7 @@ static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) rep = (io_info & SVM_IOIO_REP_MASK) != 0; down = (svm->vmcb->save.rflags & X86_EFLAGS_DF) != 0; + skip_emulated_instruction(&svm->vcpu); return kvm_emulate_pio(&svm->vcpu, kvm_run, in, size, port); } @@ -1912,6 +1916,11 @@ static int get_npt_level(void) #endif } +static int svm_get_mt_mask_shift(void) +{ + return 0; +} + static struct kvm_x86_ops svm_x86_ops = { .cpu_has_kvm_support = has_svm, .disabled_by_bios = is_disabled, @@ -1967,6 +1976,7 @@ static struct kvm_x86_ops svm_x86_ops = { .set_tss_addr = svm_set_tss_addr, .get_tdp_level = get_npt_level, + .get_mt_mask_shift = svm_get_mt_mask_shift, }; static int __init svm_init(void) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index a4018b01e1f..6259d746764 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -16,7 +16,6 @@ */ #include "irq.h" -#include "vmx.h" #include "mmu.h" #include <linux/kvm_host.h> @@ -31,6 +30,8 @@ #include <asm/io.h> #include <asm/desc.h> +#include <asm/vmx.h> +#include <asm/virtext.h> #define __ex(x) __kvm_handle_fault_on_reboot(x) @@ -90,6 +91,11 @@ struct vcpu_vmx { } rmode; int vpid; bool emulation_required; + + /* Support for vnmi-less CPUs */ + int soft_vnmi_blocked; + ktime_t entry_time; + s64 vnmi_blocked_time; }; static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) @@ -122,7 +128,7 @@ static struct vmcs_config { u32 vmentry_ctrl; } vmcs_config; -struct vmx_capability { +static struct vmx_capability { u32 ept; u32 vpid; } vmx_capability; @@ -957,6 +963,13 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) pr_unimpl(vcpu, "unimplemented perfctr wrmsr: 0x%x data 0x%llx\n", msr_index, data); break; + case MSR_IA32_CR_PAT: + if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { + vmcs_write64(GUEST_IA32_PAT, data); + vcpu->arch.pat = data; + break; + } + /* Otherwise falls through to kvm_set_msr_common */ default: vmx_load_host_state(vmx); msr = find_msr_entry(vmx, msr_index); @@ -1032,8 +1045,7 @@ static int vmx_get_irq(struct kvm_vcpu *vcpu) static __init int cpu_has_kvm_support(void) { - unsigned long ecx = cpuid_ecx(1); - return test_bit(5, &ecx); /* CPUID.1:ECX.VMX[bit 5] -> VT */ + return cpu_has_vmx(); } static __init int vmx_disabled_by_bios(void) @@ -1079,13 +1091,22 @@ static void vmclear_local_vcpus(void) __vcpu_clear(vmx); } -static void hardware_disable(void *garbage) + +/* Just like cpu_vmxoff(), but with the __kvm_handle_fault_on_reboot() + * tricks. + */ +static void kvm_cpu_vmxoff(void) { - vmclear_local_vcpus(); asm volatile (__ex(ASM_VMX_VMXOFF) : : : "cc"); write_cr4(read_cr4() & ~X86_CR4_VMXE); } +static void hardware_disable(void *garbage) +{ + vmclear_local_vcpus(); + kvm_cpu_vmxoff(); +} + static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, u32 msr, u32 *result) { @@ -1176,12 +1197,13 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) #ifdef CONFIG_X86_64 min |= VM_EXIT_HOST_ADDR_SPACE_SIZE; #endif - opt = 0; + opt = VM_EXIT_SAVE_IA32_PAT | VM_EXIT_LOAD_IA32_PAT; if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS, &_vmexit_control) < 0) return -EIO; - min = opt = 0; + min = 0; + opt = VM_ENTRY_LOAD_IA32_PAT; if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS, &_vmentry_control) < 0) return -EIO; @@ -2087,8 +2109,9 @@ static void vmx_disable_intercept_for_msr(struct page *msr_bitmap, u32 msr) */ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) { - u32 host_sysenter_cs; + u32 host_sysenter_cs, msr_low, msr_high; u32 junk; + u64 host_pat; unsigned long a; struct descriptor_table dt; int i; @@ -2176,6 +2199,20 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) rdmsrl(MSR_IA32_SYSENTER_EIP, a); vmcs_writel(HOST_IA32_SYSENTER_EIP, a); /* 22.2.3 */ + if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) { + rdmsr(MSR_IA32_CR_PAT, msr_low, msr_high); + host_pat = msr_low | ((u64) msr_high << 32); + vmcs_write64(HOST_IA32_PAT, host_pat); + } + if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { + rdmsr(MSR_IA32_CR_PAT, msr_low, msr_high); + host_pat = msr_low | ((u64) msr_high << 32); + /* Write the default value follow host pat */ + vmcs_write64(GUEST_IA32_PAT, host_pat); + /* Keep arch.pat sync with GUEST_IA32_PAT */ + vmx->vcpu.arch.pat = host_pat; + } + for (i = 0; i < NR_VMX_MSR; ++i) { u32 index = vmx_msr_index[i]; u32 data_low, data_high; @@ -2230,6 +2267,8 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) vmx->vcpu.arch.rmode.active = 0; + vmx->soft_vnmi_blocked = 0; + vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val(); kvm_set_cr8(&vmx->vcpu, 0); msr = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; @@ -2335,6 +2374,29 @@ out: return ret; } +static void enable_irq_window(struct kvm_vcpu *vcpu) +{ + u32 cpu_based_vm_exec_control; + + cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); + cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING; + vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); +} + +static void enable_nmi_window(struct kvm_vcpu *vcpu) +{ + u32 cpu_based_vm_exec_control; + + if (!cpu_has_virtual_nmis()) { + enable_irq_window(vcpu); + return; + } + + cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); + cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_NMI_PENDING; + vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); +} + static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -2358,10 +2420,54 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq) static void vmx_inject_nmi(struct kvm_vcpu *vcpu) { + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (!cpu_has_virtual_nmis()) { + /* + * Tracking the NMI-blocked state in software is built upon + * finding the next open IRQ window. This, in turn, depends on + * well-behaving guests: They have to keep IRQs disabled at + * least as long as the NMI handler runs. Otherwise we may + * cause NMI nesting, maybe breaking the guest. But as this is + * highly unlikely, we can live with the residual risk. + */ + vmx->soft_vnmi_blocked = 1; + vmx->vnmi_blocked_time = 0; + } + + ++vcpu->stat.nmi_injections; + if (vcpu->arch.rmode.active) { + vmx->rmode.irq.pending = true; + vmx->rmode.irq.vector = NMI_VECTOR; + vmx->rmode.irq.rip = kvm_rip_read(vcpu); + vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, + NMI_VECTOR | INTR_TYPE_SOFT_INTR | + INTR_INFO_VALID_MASK); + vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1); + kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1); + return; + } vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR); } +static void vmx_update_window_states(struct kvm_vcpu *vcpu) +{ + u32 guest_intr = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); + + vcpu->arch.nmi_window_open = + !(guest_intr & (GUEST_INTR_STATE_STI | + GUEST_INTR_STATE_MOV_SS | + GUEST_INTR_STATE_NMI)); + if (!cpu_has_virtual_nmis() && to_vmx(vcpu)->soft_vnmi_blocked) + vcpu->arch.nmi_window_open = 0; + + vcpu->arch.interrupt_window_open = + ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) && + !(guest_intr & (GUEST_INTR_STATE_STI | + GUEST_INTR_STATE_MOV_SS))); +} + static void kvm_do_inject_irq(struct kvm_vcpu *vcpu) { int word_index = __ffs(vcpu->arch.irq_summary); @@ -2374,40 +2480,49 @@ static void kvm_do_inject_irq(struct kvm_vcpu *vcpu) kvm_queue_interrupt(vcpu, irq); } - static void do_interrupt_requests(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { - u32 cpu_based_vm_exec_control; - - vcpu->arch.interrupt_window_open = - ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) && - (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0); + vmx_update_window_states(vcpu); - if (vcpu->arch.interrupt_window_open && - vcpu->arch.irq_summary && !vcpu->arch.interrupt.pending) - kvm_do_inject_irq(vcpu); + if (vcpu->arch.nmi_pending && !vcpu->arch.nmi_injected) { + if (vcpu->arch.interrupt.pending) { + enable_nmi_window(vcpu); + } else if (vcpu->arch.nmi_window_open) { + vcpu->arch.nmi_pending = false; + vcpu->arch.nmi_injected = true; + } else { + enable_nmi_window(vcpu); + return; + } + } + if (vcpu->arch.nmi_injected) { + vmx_inject_nmi(vcpu); + if (vcpu->arch.nmi_pending) + enable_nmi_window(vcpu); + else if (vcpu->arch.irq_summary + || kvm_run->request_interrupt_window) + enable_irq_window(vcpu); + return; + } - if (vcpu->arch.interrupt_window_open && vcpu->arch.interrupt.pending) - vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr); + if (vcpu->arch.interrupt_window_open) { + if (vcpu->arch.irq_summary && !vcpu->arch.interrupt.pending) + kvm_do_inject_irq(vcpu); - cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); + if (vcpu->arch.interrupt.pending) + vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr); + } if (!vcpu->arch.interrupt_window_open && (vcpu->arch.irq_summary || kvm_run->request_interrupt_window)) - /* - * Interrupts blocked. Wait for unblock. - */ - cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING; - else - cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING; - vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); + enable_irq_window(vcpu); } static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr) { int ret; struct kvm_userspace_memory_region tss_mem = { - .slot = 8, + .slot = TSS_PRIVATE_MEMSLOT, .guest_phys_addr = addr, .memory_size = PAGE_SIZE * 3, .flags = 0, @@ -2492,7 +2607,7 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) set_bit(irq / BITS_PER_LONG, &vcpu->arch.irq_summary); } - if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200) /* nmi */ + if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR) return 1; /* already handled by vmx_vcpu_run() */ if (is_no_device(intr_info)) { @@ -2581,6 +2696,7 @@ static int handle_io(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) rep = (exit_qualification & 32) != 0; port = exit_qualification >> 16; + skip_emulated_instruction(vcpu); return kvm_emulate_pio(vcpu, kvm_run, in, size, port); } @@ -2767,6 +2883,7 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu, vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); KVMTRACE_0D(PEND_INTR, vcpu, handler); + ++vcpu->stat.irq_window_exits; /* * If the user space waits to inject interrupts, exit as soon as @@ -2775,7 +2892,6 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu, if (kvm_run->request_interrupt_window && !vcpu->arch.irq_summary) { kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; - ++vcpu->stat.irq_window_exits; return 0; } return 1; @@ -2832,6 +2948,7 @@ static int handle_apic_access(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) static int handle_task_switch(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { + struct vcpu_vmx *vmx = to_vmx(vcpu); unsigned long exit_qualification; u16 tss_selector; int reason; @@ -2839,6 +2956,15 @@ static int handle_task_switch(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) exit_qualification = vmcs_readl(EXIT_QUALIFICATION); reason = (u32)exit_qualification >> 30; + if (reason == TASK_SWITCH_GATE && vmx->vcpu.arch.nmi_injected && + (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK) && + (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK) + == INTR_TYPE_NMI_INTR) { + vcpu->arch.nmi_injected = false; + if (cpu_has_virtual_nmis()) + vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, + GUEST_INTR_STATE_NMI); + } tss_selector = exit_qualification; return kvm_task_switch(vcpu, tss_selector, reason); @@ -2927,16 +3053,12 @@ static void handle_invalid_guest_state(struct kvm_vcpu *vcpu, while (!guest_state_valid(vcpu)) { err = emulate_instruction(vcpu, kvm_run, 0, 0, 0); - switch (err) { - case EMULATE_DONE: - break; - case EMULATE_DO_MMIO: - kvm_report_emulation_failure(vcpu, "mmio"); - /* TODO: Handle MMIO */ - return; - default: - kvm_report_emulation_failure(vcpu, "emulation failure"); - return; + if (err == EMULATE_DO_MMIO) + break; + + if (err != EMULATE_DONE) { + kvm_report_emulation_failure(vcpu, "emulation failure"); + return; } if (signal_pending(current)) @@ -2948,8 +3070,10 @@ static void handle_invalid_guest_state(struct kvm_vcpu *vcpu, local_irq_disable(); preempt_disable(); - /* Guest state should be valid now, no more emulation should be needed */ - vmx->emulation_required = 0; + /* Guest state should be valid now except if we need to + * emulate an MMIO */ + if (guest_state_valid(vcpu)) + vmx->emulation_required = 0; } /* @@ -2996,6 +3120,11 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) KVMTRACE_3D(VMEXIT, vcpu, exit_reason, (u32)kvm_rip_read(vcpu), (u32)((u64)kvm_rip_read(vcpu) >> 32), entryexit); + /* If we need to emulate an MMIO from handle_invalid_guest_state + * we just return 0 */ + if (vmx->emulation_required && emulate_invalid_guest_state) + return 0; + /* Access CR3 don't cause VMExit in paging mode, so we need * to sync with guest real CR3. */ if (vm_need_ept() && is_paging(vcpu)) { @@ -3012,9 +3141,32 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) if ((vectoring_info & VECTORING_INFO_VALID_MASK) && (exit_reason != EXIT_REASON_EXCEPTION_NMI && - exit_reason != EXIT_REASON_EPT_VIOLATION)) - printk(KERN_WARNING "%s: unexpected, valid vectoring info and " - "exit reason is 0x%x\n", __func__, exit_reason); + exit_reason != EXIT_REASON_EPT_VIOLATION && + exit_reason != EXIT_REASON_TASK_SWITCH)) + printk(KERN_WARNING "%s: unexpected, valid vectoring info " + "(0x%x) and exit reason is 0x%x\n", + __func__, vectoring_info, exit_reason); + + if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) { + if (vcpu->arch.interrupt_window_open) { + vmx->soft_vnmi_blocked = 0; + vcpu->arch.nmi_window_open = 1; + } else if (vmx->vnmi_blocked_time > 1000000000LL && + vcpu->arch.nmi_pending) { + /* + * This CPU don't support us in finding the end of an + * NMI-blocked window if the guest runs with IRQs + * disabled. So we pull the trigger after 1 s of + * futile waiting, but inform the user about this. + */ + printk(KERN_WARNING "%s: Breaking out of NMI-blocked " + "state on VCPU %d after 1 s timeout\n", + __func__, vcpu->vcpu_id); + vmx->soft_vnmi_blocked = 0; + vmx->vcpu.arch.nmi_window_open = 1; + } + } + if (exit_reason < kvm_vmx_max_exit_handlers && kvm_vmx_exit_handlers[exit_reason]) return kvm_vmx_exit_handlers[exit_reason](vcpu, kvm_run); @@ -3042,51 +3194,6 @@ static void update_tpr_threshold(struct kvm_vcpu *vcpu) vmcs_write32(TPR_THRESHOLD, (max_irr > tpr) ? tpr >> 4 : max_irr >> 4); } -static void enable_irq_window(struct kvm_vcpu *vcpu) -{ - u32 cpu_based_vm_exec_control; - - cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); - cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING; - vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); -} - -static void enable_nmi_window(struct kvm_vcpu *vcpu) -{ - u32 cpu_based_vm_exec_control; - - if (!cpu_has_virtual_nmis()) - return; - - cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); - cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_NMI_PENDING; - vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); -} - -static int vmx_nmi_enabled(struct kvm_vcpu *vcpu) -{ - u32 guest_intr = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); - return !(guest_intr & (GUEST_INTR_STATE_NMI | - GUEST_INTR_STATE_MOV_SS | - GUEST_INTR_STATE_STI)); -} - -static int vmx_irq_enabled(struct kvm_vcpu *vcpu) -{ - u32 guest_intr = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); - return (!(guest_intr & (GUEST_INTR_STATE_MOV_SS | - GUEST_INTR_STATE_STI)) && - (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF)); -} - -static void enable_intr_window(struct kvm_vcpu *vcpu) -{ - if (vcpu->arch.nmi_pending) - enable_nmi_window(vcpu); - else if (kvm_cpu_has_interrupt(vcpu)) - enable_irq_window(vcpu); -} - static void vmx_complete_interrupts(struct vcpu_vmx *vmx) { u32 exit_intr_info; @@ -3109,7 +3216,9 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx) if (unblock_nmi && vector != DF_VECTOR) vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI); - } + } else if (unlikely(vmx->soft_vnmi_blocked)) + vmx->vnmi_blocked_time += + ktime_to_ns(ktime_sub(ktime_get(), vmx->entry_time)); idt_vectoring_info = vmx->idt_vectoring_info; idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK; @@ -3147,26 +3256,29 @@ static void vmx_intr_assist(struct kvm_vcpu *vcpu) { update_tpr_threshold(vcpu); - if (cpu_has_virtual_nmis()) { - if (vcpu->arch.nmi_pending && !vcpu->arch.nmi_injected) { - if (vcpu->arch.interrupt.pending) { - enable_nmi_window(vcpu); - } else if (vmx_nmi_enabled(vcpu)) { - vcpu->arch.nmi_pending = false; - vcpu->arch.nmi_injected = true; - } else { - enable_intr_window(vcpu); - return; - } - } - if (vcpu->arch.nmi_injected) { - vmx_inject_nmi(vcpu); - enable_intr_window(vcpu); + vmx_update_window_states(vcpu); + + if (vcpu->arch.nmi_pending && !vcpu->arch.nmi_injected) { + if (vcpu->arch.interrupt.pending) { + enable_nmi_window(vcpu); + } else if (vcpu->arch.nmi_window_open) { + vcpu->arch.nmi_pending = false; + vcpu->arch.nmi_injected = true; + } else { + enable_nmi_window(vcpu); return; } } + if (vcpu->arch.nmi_injected) { + vmx_inject_nmi(vcpu); + if (vcpu->arch.nmi_pending) + enable_nmi_window(vcpu); + else if (kvm_cpu_has_interrupt(vcpu)) + enable_irq_window(vcpu); + return; + } if (!vcpu->arch.interrupt.pending && kvm_cpu_has_interrupt(vcpu)) { - if (vmx_irq_enabled(vcpu)) + if (vcpu->arch.interrupt_window_open) kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu)); else enable_irq_window(vcpu); @@ -3174,6 +3286,8 @@ static void vmx_intr_assist(struct kvm_vcpu *vcpu) if (vcpu->arch.interrupt.pending) { vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr); kvm_timer_intr_post(vcpu, vcpu->arch.interrupt.nr); + if (kvm_cpu_has_interrupt(vcpu)) + enable_irq_window(vcpu); } } @@ -3213,6 +3327,10 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) struct vcpu_vmx *vmx = to_vmx(vcpu); u32 intr_info; + /* Record the guest's net vcpu time for enforced NMI injections. */ + if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) + vmx->entry_time = ktime_get(); + /* Handle invalid guest state instead of entering VMX */ if (vmx->emulation_required && emulate_invalid_guest_state) { handle_invalid_guest_state(vcpu, kvm_run); @@ -3327,9 +3445,7 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) if (vmx->rmode.irq.pending) fixup_rmode_irq(vmx); - vcpu->arch.interrupt_window_open = - (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & - (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS)) == 0; + vmx_update_window_states(vcpu); asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS)); vmx->launched = 1; @@ -3337,7 +3453,7 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) intr_info = vmcs_read32(VM_EXIT_INTR_INFO); /* We need to handle NMIs before interrupts are enabled */ - if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200 && + if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR && (intr_info & INTR_INFO_VALID_MASK)) { KVMTRACE_0D(NMI, vcpu, handler); asm("int $2"); @@ -3455,6 +3571,11 @@ static int get_ept_level(void) return VMX_EPT_DEFAULT_GAW + 1; } +static int vmx_get_mt_mask_shift(void) +{ + return VMX_EPT_MT_EPTE_SHIFT; +} + static struct kvm_x86_ops vmx_x86_ops = { .cpu_has_kvm_support = cpu_has_kvm_support, .disabled_by_bios = vmx_disabled_by_bios, @@ -3510,6 +3631,7 @@ static struct kvm_x86_ops vmx_x86_ops = { .set_tss_addr = vmx_set_tss_addr, .get_tdp_level = get_ept_level, + .get_mt_mask_shift = vmx_get_mt_mask_shift, }; static int __init vmx_init(void) @@ -3566,10 +3688,10 @@ static int __init vmx_init(void) bypass_guest_pf = 0; kvm_mmu_set_base_ptes(VMX_EPT_READABLE_MASK | VMX_EPT_WRITABLE_MASK | - VMX_EPT_DEFAULT_MT << VMX_EPT_MT_EPTE_SHIFT | VMX_EPT_IGMT_BIT); kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull, - VMX_EPT_EXECUTABLE_MASK); + VMX_EPT_EXECUTABLE_MASK, + VMX_EPT_DEFAULT_MT << VMX_EPT_MT_EPTE_SHIFT); kvm_enable_tdp(); } else kvm_disable_tdp(); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index f1f8ff2f1fa..0e6aa8141dc 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -39,6 +39,7 @@ #include <asm/uaccess.h> #include <asm/msr.h> #include <asm/desc.h> +#include <asm/mtrr.h> #define MAX_IO_MSRS 256 #define CR0_RESERVED_BITS \ @@ -86,6 +87,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "halt_wakeup", VCPU_STAT(halt_wakeup) }, { "hypercalls", VCPU_STAT(hypercalls) }, { "request_irq", VCPU_STAT(request_irq_exits) }, + { "request_nmi", VCPU_STAT(request_nmi_exits) }, { "irq_exits", VCPU_STAT(irq_exits) }, { "host_state_reload", VCPU_STAT(host_state_reload) }, { "efer_reload", VCPU_STAT(efer_reload) }, @@ -93,6 +95,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "insn_emulation", VCPU_STAT(insn_emulation) }, { "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) }, { "irq_injections", VCPU_STAT(irq_injections) }, + { "nmi_injections", VCPU_STAT(nmi_injections) }, { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) }, { "mmu_pte_write", VM_STAT(mmu_pte_write) }, { "mmu_pte_updated", VM_STAT(mmu_pte_updated) }, @@ -101,6 +104,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "mmu_recycled", VM_STAT(mmu_recycled) }, { "mmu_cache_miss", VM_STAT(mmu_cache_miss) }, { "mmu_unsync", VM_STAT(mmu_unsync) }, + { "mmu_unsync_global", VM_STAT(mmu_unsync_global) }, { "remote_tlb_flush", VM_STAT(remote_tlb_flush) }, { "largepages", VM_STAT(lpages) }, { NULL } @@ -312,6 +316,7 @@ void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) kvm_x86_ops->set_cr0(vcpu, cr0); vcpu->arch.cr0 = cr0; + kvm_mmu_sync_global(vcpu); kvm_mmu_reset_context(vcpu); return; } @@ -355,6 +360,7 @@ void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) } kvm_x86_ops->set_cr4(vcpu, cr4); vcpu->arch.cr4 = cr4; + kvm_mmu_sync_global(vcpu); kvm_mmu_reset_context(vcpu); } EXPORT_SYMBOL_GPL(kvm_set_cr4); @@ -449,7 +455,7 @@ static u32 msrs_to_save[] = { MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, #endif MSR_IA32_TIME_STAMP_COUNTER, MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, - MSR_IA32_PERF_STATUS, + MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT }; static unsigned num_msrs_to_save; @@ -648,10 +654,38 @@ static bool msr_mtrr_valid(unsigned msr) static int set_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 data) { + u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges; + if (!msr_mtrr_valid(msr)) return 1; - vcpu->arch.mtrr[msr - 0x200] = data; + if (msr == MSR_MTRRdefType) { + vcpu->arch.mtrr_state.def_type = data; + vcpu->arch.mtrr_state.enabled = (data & 0xc00) >> 10; + } else if (msr == MSR_MTRRfix64K_00000) + p[0] = data; + else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000) + p[1 + msr - MSR_MTRRfix16K_80000] = data; + else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000) + p[3 + msr - MSR_MTRRfix4K_C0000] = data; + else if (msr == MSR_IA32_CR_PAT) + vcpu->arch.pat = data; + else { /* Variable MTRRs */ + int idx, is_mtrr_mask; + u64 *pt; + + idx = (msr - 0x200) / 2; + is_mtrr_mask = msr - 0x200 - 2 * idx; + if (!is_mtrr_mask) + pt = + (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo; + else + pt = + (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo; + *pt = data; + } + + kvm_mmu_reset_context(vcpu); return 0; } @@ -747,10 +781,37 @@ int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) static int get_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) { + u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges; + if (!msr_mtrr_valid(msr)) return 1; - *pdata = vcpu->arch.mtrr[msr - 0x200]; + if (msr == MSR_MTRRdefType) + *pdata = vcpu->arch.mtrr_state.def_type + + (vcpu->arch.mtrr_state.enabled << 10); + else if (msr == MSR_MTRRfix64K_00000) + *pdata = p[0]; + else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000) + *pdata = p[1 + msr - MSR_MTRRfix16K_80000]; + else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000) + *pdata = p[3 + msr - MSR_MTRRfix4K_C0000]; + else if (msr == MSR_IA32_CR_PAT) + *pdata = vcpu->arch.pat; + else { /* Variable MTRRs */ + int idx, is_mtrr_mask; + u64 *pt; + + idx = (msr - 0x200) / 2; + is_mtrr_mask = msr - 0x200 - 2 * idx; + if (!is_mtrr_mask) + pt = + (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo; + else + pt = + (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo; + *pdata = *pt; + } + return 0; } @@ -903,7 +964,6 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_IRQCHIP: case KVM_CAP_HLT: case KVM_CAP_MMU_SHADOW_CACHE_CONTROL: - case KVM_CAP_USER_MEMORY: case KVM_CAP_SET_TSS_ADDR: case KVM_CAP_EXT_CPUID: case KVM_CAP_CLOCKSOURCE: @@ -1188,6 +1248,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, int t, times = entry->eax & 0xff; entry->flags |= KVM_CPUID_FLAG_STATEFUL_FUNC; + entry->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT; for (t = 1; t < times && *nent < maxnent; ++t) { do_cpuid_1_ent(&entry[t], function, 0); entry[t].flags |= KVM_CPUID_FLAG_STATEFUL_FUNC; @@ -1218,7 +1279,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; /* read more entries until level_type is zero */ for (i = 1; *nent < maxnent; ++i) { - level_type = entry[i - 1].ecx & 0xff; + level_type = entry[i - 1].ecx & 0xff00; if (!level_type) break; do_cpuid_1_ent(&entry[i], function, i); @@ -1318,6 +1379,15 @@ static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, return 0; } +static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu) +{ + vcpu_load(vcpu); + kvm_inject_nmi(vcpu); + vcpu_put(vcpu); + + return 0; +} + static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu, struct kvm_tpr_access_ctl *tac) { @@ -1377,6 +1447,13 @@ long kvm_arch_vcpu_ioctl(struct file *filp, r = 0; break; } + case KVM_NMI: { + r = kvm_vcpu_ioctl_nmi(vcpu); + if (r) + goto out; + r = 0; + break; + } case KVM_SET_CPUID: { struct kvm_cpuid __user *cpuid_arg = argp; struct kvm_cpuid cpuid; @@ -1968,7 +2045,7 @@ int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes); if (ret < 0) return 0; - kvm_mmu_pte_write(vcpu, gpa, val, bytes); + kvm_mmu_pte_write(vcpu, gpa, val, bytes, 1); return 1; } @@ -2404,8 +2481,6 @@ int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, val = kvm_register_read(vcpu, VCPU_REGS_RAX); memcpy(vcpu->arch.pio_data, &val, 4); - kvm_x86_ops->skip_emulated_instruction(vcpu); - pio_dev = vcpu_find_pio_dev(vcpu, port, size, !in); if (pio_dev) { kernel_pio(pio_dev, vcpu, vcpu->arch.pio_data); @@ -2541,7 +2616,7 @@ int kvm_arch_init(void *opaque) kvm_mmu_set_nonpresent_ptes(0ull, 0ull); kvm_mmu_set_base_ptes(PT_PRESENT_MASK); kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, - PT_DIRTY_MASK, PT64_NX_MASK, 0); + PT_DIRTY_MASK, PT64_NX_MASK, 0, 0); return 0; out: @@ -2729,7 +2804,7 @@ static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i) e->flags &= ~KVM_CPUID_FLAG_STATE_READ_NEXT; /* when no next entry is found, the current entry[i] is reselected */ - for (j = i + 1; j == i; j = (j + 1) % nent) { + for (j = i + 1; ; j = (j + 1) % nent) { struct kvm_cpuid_entry2 *ej = &vcpu->arch.cpuid_entries[j]; if (ej->function == e->function) { ej->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT; @@ -2973,7 +3048,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) pr_debug("vcpu %d received sipi with vector # %x\n", vcpu->vcpu_id, vcpu->arch.sipi_vector); kvm_lapic_reset(vcpu); - r = kvm_x86_ops->vcpu_reset(vcpu); + r = kvm_arch_vcpu_reset(vcpu); if (r) return r; vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; @@ -3275,9 +3350,9 @@ static void seg_desct_to_kvm_desct(struct desc_struct *seg_desc, u16 selector, kvm_desct->padding = 0; } -static void get_segment_descritptor_dtable(struct kvm_vcpu *vcpu, - u16 selector, - struct descriptor_table *dtable) +static void get_segment_descriptor_dtable(struct kvm_vcpu *vcpu, + u16 selector, + struct descriptor_table *dtable) { if (selector & 1 << 2) { struct kvm_segment kvm_seg; @@ -3302,7 +3377,7 @@ static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, struct descriptor_table dtable; u16 index = selector >> 3; - get_segment_descritptor_dtable(vcpu, selector, &dtable); + get_segment_descriptor_dtable(vcpu, selector, &dtable); if (dtable.limit < index * 8 + 7) { kvm_queue_exception_e(vcpu, GP_VECTOR, selector & 0xfffc); @@ -3321,7 +3396,7 @@ static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, struct descriptor_table dtable; u16 index = selector >> 3; - get_segment_descritptor_dtable(vcpu, selector, &dtable); + get_segment_descriptor_dtable(vcpu, selector, &dtable); if (dtable.limit < index * 8 + 7) return 1; @@ -3900,6 +3975,7 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) /* We do fxsave: this must be aligned. */ BUG_ON((unsigned long)&vcpu->arch.host_fx_image & 0xF); + vcpu->arch.mtrr_state.have_fixed = 1; vcpu_load(vcpu); r = kvm_arch_vcpu_reset(vcpu); if (r == 0) @@ -3925,6 +4001,9 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu) { + vcpu->arch.nmi_pending = false; + vcpu->arch.nmi_injected = false; + return kvm_x86_ops->vcpu_reset(vcpu); } @@ -4012,6 +4091,7 @@ struct kvm *kvm_arch_create_vm(void) return ERR_PTR(-ENOMEM); INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); + INIT_LIST_HEAD(&kvm->arch.oos_global_pages); INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */ @@ -4048,8 +4128,8 @@ static void kvm_free_vcpus(struct kvm *kvm) void kvm_arch_destroy_vm(struct kvm *kvm) { - kvm_iommu_unmap_guest(kvm); kvm_free_all_assigned_devices(kvm); + kvm_iommu_unmap_guest(kvm); kvm_free_pit(kvm); kfree(kvm->arch.vpic); kfree(kvm->arch.vioapic); @@ -4127,7 +4207,8 @@ void kvm_arch_flush_shadow(struct kvm *kvm) int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) { return vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE - || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED; + || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED + || vcpu->arch.nmi_pending; } static void vcpu_kick_intr(void *info) diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c index ea051173b0d..d174db7a337 100644 --- a/arch/x86/kvm/x86_emulate.c +++ b/arch/x86/kvm/x86_emulate.c @@ -58,6 +58,7 @@ #define SrcMem32 (4<<4) /* Memory operand (32-bit). */ #define SrcImm (5<<4) /* Immediate operand. */ #define SrcImmByte (6<<4) /* 8-bit sign-extended immediate operand. */ +#define SrcOne (7<<4) /* Implied '1' */ #define SrcMask (7<<4) /* Generic ModRM decode. */ #define ModRM (1<<7) @@ -70,17 +71,23 @@ #define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */ #define GroupDual (1<<15) /* Alternate decoding of mod == 3 */ #define GroupMask 0xff /* Group number stored in bits 0:7 */ +/* Source 2 operand type */ +#define Src2None (0<<29) +#define Src2CL (1<<29) +#define Src2ImmByte (2<<29) +#define Src2One (3<<29) +#define Src2Mask (7<<29) enum { Group1_80, Group1_81, Group1_82, Group1_83, Group1A, Group3_Byte, Group3, Group4, Group5, Group7, }; -static u16 opcode_table[256] = { +static u32 opcode_table[256] = { /* 0x00 - 0x07 */ ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, - 0, 0, 0, 0, + ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, 0, 0, /* 0x08 - 0x0F */ ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, @@ -195,7 +202,7 @@ static u16 opcode_table[256] = { ImplicitOps, ImplicitOps, Group | Group4, Group | Group5, }; -static u16 twobyte_table[256] = { +static u32 twobyte_table[256] = { /* 0x00 - 0x0F */ 0, Group | GroupDual | Group7, 0, 0, 0, 0, ImplicitOps, 0, ImplicitOps, ImplicitOps, 0, 0, 0, ImplicitOps | ModRM, 0, 0, @@ -230,9 +237,14 @@ static u16 twobyte_table[256] = { /* 0x90 - 0x9F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xA0 - 0xA7 */ - 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 0, 0, 0, 0, + 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, + DstMem | SrcReg | Src2ImmByte | ModRM, + DstMem | SrcReg | Src2CL | ModRM, 0, 0, /* 0xA8 - 0xAF */ - 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 0, 0, ModRM, 0, + 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, + DstMem | SrcReg | Src2ImmByte | ModRM, + DstMem | SrcReg | Src2CL | ModRM, + ModRM, 0, /* 0xB0 - 0xB7 */ ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 0, DstMem | SrcReg | ModRM | BitOp, @@ -253,7 +265,7 @@ static u16 twobyte_table[256] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; -static u16 group_table[] = { +static u32 group_table[] = { [Group1_80*8] = ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, @@ -297,9 +309,9 @@ static u16 group_table[] = { SrcMem16 | ModRM | Mov, SrcMem | ModRM | ByteOp, }; -static u16 group2_table[] = { +static u32 group2_table[] = { [Group7*8] = - SrcNone | ModRM, 0, 0, 0, + SrcNone | ModRM, 0, 0, SrcNone | ModRM, SrcNone | ModRM | DstMem | Mov, 0, SrcMem16 | ModRM | Mov, 0, }; @@ -359,49 +371,48 @@ static u16 group2_table[] = { "andl %"_msk",%"_LO32 _tmp"; " \ "orl %"_LO32 _tmp",%"_sav"; " +#ifdef CONFIG_X86_64 +#define ON64(x) x +#else +#define ON64(x) +#endif + +#define ____emulate_2op(_op, _src, _dst, _eflags, _x, _y, _suffix) \ + do { \ + __asm__ __volatile__ ( \ + _PRE_EFLAGS("0", "4", "2") \ + _op _suffix " %"_x"3,%1; " \ + _POST_EFLAGS("0", "4", "2") \ + : "=m" (_eflags), "=m" ((_dst).val), \ + "=&r" (_tmp) \ + : _y ((_src).val), "i" (EFLAGS_MASK)); \ + } while (0) + + /* Raw emulation: instruction has two explicit operands. */ #define __emulate_2op_nobyte(_op,_src,_dst,_eflags,_wx,_wy,_lx,_ly,_qx,_qy) \ - do { \ - unsigned long _tmp; \ - \ - switch ((_dst).bytes) { \ - case 2: \ - __asm__ __volatile__ ( \ - _PRE_EFLAGS("0", "4", "2") \ - _op"w %"_wx"3,%1; " \ - _POST_EFLAGS("0", "4", "2") \ - : "=m" (_eflags), "=m" ((_dst).val), \ - "=&r" (_tmp) \ - : _wy ((_src).val), "i" (EFLAGS_MASK)); \ - break; \ - case 4: \ - __asm__ __volatile__ ( \ - _PRE_EFLAGS("0", "4", "2") \ - _op"l %"_lx"3,%1; " \ - _POST_EFLAGS("0", "4", "2") \ - : "=m" (_eflags), "=m" ((_dst).val), \ - "=&r" (_tmp) \ - : _ly ((_src).val), "i" (EFLAGS_MASK)); \ - break; \ - case 8: \ - __emulate_2op_8byte(_op, _src, _dst, \ - _eflags, _qx, _qy); \ - break; \ - } \ + do { \ + unsigned long _tmp; \ + \ + switch ((_dst).bytes) { \ + case 2: \ + ____emulate_2op(_op,_src,_dst,_eflags,_wx,_wy,"w"); \ + break; \ + case 4: \ + ____emulate_2op(_op,_src,_dst,_eflags,_lx,_ly,"l"); \ + break; \ + case 8: \ + ON64(____emulate_2op(_op,_src,_dst,_eflags,_qx,_qy,"q")); \ + break; \ + } \ } while (0) #define __emulate_2op(_op,_src,_dst,_eflags,_bx,_by,_wx,_wy,_lx,_ly,_qx,_qy) \ do { \ - unsigned long __tmp; \ + unsigned long _tmp; \ switch ((_dst).bytes) { \ case 1: \ - __asm__ __volatile__ ( \ - _PRE_EFLAGS("0", "4", "2") \ - _op"b %"_bx"3,%1; " \ - _POST_EFLAGS("0", "4", "2") \ - : "=m" (_eflags), "=m" ((_dst).val), \ - "=&r" (__tmp) \ - : _by ((_src).val), "i" (EFLAGS_MASK)); \ + ____emulate_2op(_op,_src,_dst,_eflags,_bx,_by,"b"); \ break; \ default: \ __emulate_2op_nobyte(_op, _src, _dst, _eflags, \ @@ -425,71 +436,68 @@ static u16 group2_table[] = { __emulate_2op_nobyte(_op, _src, _dst, _eflags, \ "w", "r", _LO32, "r", "", "r") -/* Instruction has only one explicit operand (no source operand). */ -#define emulate_1op(_op, _dst, _eflags) \ - do { \ - unsigned long _tmp; \ - \ - switch ((_dst).bytes) { \ - case 1: \ - __asm__ __volatile__ ( \ - _PRE_EFLAGS("0", "3", "2") \ - _op"b %1; " \ - _POST_EFLAGS("0", "3", "2") \ - : "=m" (_eflags), "=m" ((_dst).val), \ - "=&r" (_tmp) \ - : "i" (EFLAGS_MASK)); \ - break; \ - case 2: \ - __asm__ __volatile__ ( \ - _PRE_EFLAGS("0", "3", "2") \ - _op"w %1; " \ - _POST_EFLAGS("0", "3", "2") \ - : "=m" (_eflags), "=m" ((_dst).val), \ - "=&r" (_tmp) \ - : "i" (EFLAGS_MASK)); \ - break; \ - case 4: \ - __asm__ __volatile__ ( \ - _PRE_EFLAGS("0", "3", "2") \ - _op"l %1; " \ - _POST_EFLAGS("0", "3", "2") \ - : "=m" (_eflags), "=m" ((_dst).val), \ - "=&r" (_tmp) \ - : "i" (EFLAGS_MASK)); \ - break; \ - case 8: \ - __emulate_1op_8byte(_op, _dst, _eflags); \ - break; \ - } \ +/* Instruction has three operands and one operand is stored in ECX register */ +#define __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, _suffix, _type) \ + do { \ + unsigned long _tmp; \ + _type _clv = (_cl).val; \ + _type _srcv = (_src).val; \ + _type _dstv = (_dst).val; \ + \ + __asm__ __volatile__ ( \ + _PRE_EFLAGS("0", "5", "2") \ + _op _suffix " %4,%1 \n" \ + _POST_EFLAGS("0", "5", "2") \ + : "=m" (_eflags), "+r" (_dstv), "=&r" (_tmp) \ + : "c" (_clv) , "r" (_srcv), "i" (EFLAGS_MASK) \ + ); \ + \ + (_cl).val = (unsigned long) _clv; \ + (_src).val = (unsigned long) _srcv; \ + (_dst).val = (unsigned long) _dstv; \ } while (0) -/* Emulate an instruction with quadword operands (x86/64 only). */ -#if defined(CONFIG_X86_64) -#define __emulate_2op_8byte(_op, _src, _dst, _eflags, _qx, _qy) \ - do { \ - __asm__ __volatile__ ( \ - _PRE_EFLAGS("0", "4", "2") \ - _op"q %"_qx"3,%1; " \ - _POST_EFLAGS("0", "4", "2") \ - : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \ - : _qy ((_src).val), "i" (EFLAGS_MASK)); \ +#define emulate_2op_cl(_op, _cl, _src, _dst, _eflags) \ + do { \ + switch ((_dst).bytes) { \ + case 2: \ + __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \ + "w", unsigned short); \ + break; \ + case 4: \ + __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \ + "l", unsigned int); \ + break; \ + case 8: \ + ON64(__emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \ + "q", unsigned long)); \ + break; \ + } \ } while (0) -#define __emulate_1op_8byte(_op, _dst, _eflags) \ - do { \ - __asm__ __volatile__ ( \ - _PRE_EFLAGS("0", "3", "2") \ - _op"q %1; " \ - _POST_EFLAGS("0", "3", "2") \ - : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \ - : "i" (EFLAGS_MASK)); \ +#define __emulate_1op(_op, _dst, _eflags, _suffix) \ + do { \ + unsigned long _tmp; \ + \ + __asm__ __volatile__ ( \ + _PRE_EFLAGS("0", "3", "2") \ + _op _suffix " %1; " \ + _POST_EFLAGS("0", "3", "2") \ + : "=m" (_eflags), "+m" ((_dst).val), \ + "=&r" (_tmp) \ + : "i" (EFLAGS_MASK)); \ } while (0) -#elif defined(__i386__) -#define __emulate_2op_8byte(_op, _src, _dst, _eflags, _qx, _qy) -#define __emulate_1op_8byte(_op, _dst, _eflags) -#endif /* __i386__ */ +/* Instruction has only one explicit operand (no source operand). */ +#define emulate_1op(_op, _dst, _eflags) \ + do { \ + switch ((_dst).bytes) { \ + case 1: __emulate_1op(_op, _dst, _eflags, "b"); break; \ + case 2: __emulate_1op(_op, _dst, _eflags, "w"); break; \ + case 4: __emulate_1op(_op, _dst, _eflags, "l"); break; \ + case 8: ON64(__emulate_1op(_op, _dst, _eflags, "q")); break; \ + } \ + } while (0) /* Fetch next part of the instruction being emulated. */ #define insn_fetch(_type, _size, _eip) \ @@ -1041,6 +1049,33 @@ done_prefixes: c->src.bytes = 1; c->src.val = insn_fetch(s8, 1, c->eip); break; + case SrcOne: + c->src.bytes = 1; + c->src.val = 1; + break; + } + + /* + * Decode and fetch the second source operand: register, memory + * or immediate. + */ + switch (c->d & Src2Mask) { + case Src2None: + break; + case Src2CL: + c->src2.bytes = 1; + c->src2.val = c->regs[VCPU_REGS_RCX] & 0x8; + break; + case Src2ImmByte: + c->src2.type = OP_IMM; + c->src2.ptr = (unsigned long *)c->eip; + c->src2.bytes = 1; + c->src2.val = insn_fetch(u8, 1, c->eip); + break; + case Src2One: + c->src2.bytes = 1; + c->src2.val = 1; + break; } /* Decode and fetch the destination operand: register or memory. */ @@ -1100,20 +1135,33 @@ static inline void emulate_push(struct x86_emulate_ctxt *ctxt) c->regs[VCPU_REGS_RSP]); } -static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops) +static int emulate_pop(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops) { struct decode_cache *c = &ctxt->decode; int rc; - rc = ops->read_std(register_address(c, ss_base(ctxt), - c->regs[VCPU_REGS_RSP]), - &c->dst.val, c->dst.bytes, ctxt->vcpu); + rc = ops->read_emulated(register_address(c, ss_base(ctxt), + c->regs[VCPU_REGS_RSP]), + &c->src.val, c->src.bytes, ctxt->vcpu); if (rc != 0) return rc; - register_address_increment(c, &c->regs[VCPU_REGS_RSP], c->dst.bytes); + register_address_increment(c, &c->regs[VCPU_REGS_RSP], c->src.bytes); + return rc; +} + +static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops) +{ + struct decode_cache *c = &ctxt->decode; + int rc; + c->src.bytes = c->dst.bytes; + rc = emulate_pop(ctxt, ops); + if (rc != 0) + return rc; + c->dst.val = c->src.val; return 0; } @@ -1415,24 +1463,15 @@ special_insn: emulate_1op("dec", c->dst, ctxt->eflags); break; case 0x50 ... 0x57: /* push reg */ - c->dst.type = OP_MEM; - c->dst.bytes = c->op_bytes; - c->dst.val = c->src.val; - register_address_increment(c, &c->regs[VCPU_REGS_RSP], - -c->op_bytes); - c->dst.ptr = (void *) register_address( - c, ss_base(ctxt), c->regs[VCPU_REGS_RSP]); + emulate_push(ctxt); break; case 0x58 ... 0x5f: /* pop reg */ pop_instruction: - if ((rc = ops->read_std(register_address(c, ss_base(ctxt), - c->regs[VCPU_REGS_RSP]), c->dst.ptr, - c->op_bytes, ctxt->vcpu)) != 0) + c->src.bytes = c->op_bytes; + rc = emulate_pop(ctxt, ops); + if (rc != 0) goto done; - - register_address_increment(c, &c->regs[VCPU_REGS_RSP], - c->op_bytes); - c->dst.type = OP_NONE; /* Disable writeback. */ + c->dst.val = c->src.val; break; case 0x63: /* movsxd */ if (ctxt->mode != X86EMUL_MODE_PROT64) @@ -1591,7 +1630,9 @@ special_insn: emulate_push(ctxt); break; case 0x9d: /* popf */ + c->dst.type = OP_REG; c->dst.ptr = (unsigned long *) &ctxt->eflags; + c->dst.bytes = c->op_bytes; goto pop_instruction; case 0xa0 ... 0xa1: /* mov */ c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX]; @@ -1689,7 +1730,9 @@ special_insn: emulate_grp2(ctxt); break; case 0xc3: /* ret */ + c->dst.type = OP_REG; c->dst.ptr = &c->eip; + c->dst.bytes = c->op_bytes; goto pop_instruction; case 0xc6 ... 0xc7: /* mov (sole member of Grp11) */ mov: @@ -1778,7 +1821,7 @@ special_insn: c->eip = saved_eip; goto cannot_emulate; } - return 0; + break; case 0xf4: /* hlt */ ctxt->vcpu->arch.halt_request = 1; break; @@ -1999,12 +2042,20 @@ twobyte_insn: c->src.val &= (c->dst.bytes << 3) - 1; emulate_2op_SrcV_nobyte("bt", c->src, c->dst, ctxt->eflags); break; + case 0xa4: /* shld imm8, r, r/m */ + case 0xa5: /* shld cl, r, r/m */ + emulate_2op_cl("shld", c->src2, c->src, c->dst, ctxt->eflags); + break; case 0xab: bts: /* bts */ /* only subword offset */ c->src.val &= (c->dst.bytes << 3) - 1; emulate_2op_SrcV_nobyte("bts", c->src, c->dst, ctxt->eflags); break; + case 0xac: /* shrd imm8, r, r/m */ + case 0xad: /* shrd cl, r, r/m */ + emulate_2op_cl("shrd", c->src2, c->src, c->dst, ctxt->eflags); + break; case 0xae: /* clflush */ break; case 0xb0 ... 0xb1: /* cmpxchg */ diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index 50a779264bb..a7ed208f81e 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -738,7 +738,7 @@ static void lguest_time_init(void) /* We can't set cpumask in the initializer: damn C limitations! Set it * here and register our timer device. */ - lguest_clockevent.cpumask = cpumask_of_cpu(0); + lguest_clockevent.cpumask = cpumask_of(0); clockevents_register_device(&lguest_clockevent); /* Finally, we unblock the timer interrupt. */ diff --git a/arch/x86/lguest/i386_head.S b/arch/x86/lguest/i386_head.S index 5c7cef34c9e..10b9bd35a8f 100644 --- a/arch/x86/lguest/i386_head.S +++ b/arch/x86/lguest/i386_head.S @@ -30,21 +30,6 @@ ENTRY(lguest_entry) movl $lguest_data - __PAGE_OFFSET, %edx int $LGUEST_TRAP_ENTRY - /* The Host put the toplevel pagetable in lguest_data.pgdir. The movsl - * instruction uses %esi implicitly as the source for the copy we're - * about to do. */ - movl lguest_data - __PAGE_OFFSET + LGUEST_DATA_pgdir, %esi - - /* Copy first 32 entries of page directory to __PAGE_OFFSET entries. - * This means the first 128M of kernel memory will be mapped at - * PAGE_OFFSET where the kernel expects to run. This will get it far - * enough through boot to switch to its own pagetables. */ - movl $32, %ecx - movl %esi, %edi - addl $((__PAGE_OFFSET >> 22) * 4), %edi - rep - movsl - /* Set up the initial stack so we can run C code. */ movl $(init_thread_union+THREAD_SIZE),%esp diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c index 9e68075544f..4a20b2f9a38 100644 --- a/arch/x86/lib/usercopy_32.c +++ b/arch/x86/lib/usercopy_32.c @@ -39,7 +39,7 @@ static inline int __movsl_is_ok(unsigned long a1, unsigned long a2, unsigned lon #define __do_strncpy_from_user(dst, src, count, res) \ do { \ int __d0, __d1, __d2; \ - might_sleep(); \ + might_fault(); \ __asm__ __volatile__( \ " testl %1,%1\n" \ " jz 2f\n" \ @@ -126,7 +126,7 @@ EXPORT_SYMBOL(strncpy_from_user); #define __do_clear_user(addr,size) \ do { \ int __d0; \ - might_sleep(); \ + might_fault(); \ __asm__ __volatile__( \ "0: rep; stosl\n" \ " movl %2,%0\n" \ @@ -155,7 +155,7 @@ do { \ unsigned long clear_user(void __user *to, unsigned long n) { - might_sleep(); + might_fault(); if (access_ok(VERIFY_WRITE, to, n)) __do_clear_user(to, n); return n; @@ -197,7 +197,7 @@ long strnlen_user(const char __user *s, long n) unsigned long mask = -__addr_ok(s); unsigned long res, tmp; - might_sleep(); + might_fault(); __asm__ __volatile__( " testl %0, %0\n" diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c index f4df6e7c718..64d6c84e635 100644 --- a/arch/x86/lib/usercopy_64.c +++ b/arch/x86/lib/usercopy_64.c @@ -15,7 +15,7 @@ #define __do_strncpy_from_user(dst,src,count,res) \ do { \ long __d0, __d1, __d2; \ - might_sleep(); \ + might_fault(); \ __asm__ __volatile__( \ " testq %1,%1\n" \ " jz 2f\n" \ @@ -64,7 +64,7 @@ EXPORT_SYMBOL(strncpy_from_user); unsigned long __clear_user(void __user *addr, unsigned long size) { long __d0; - might_sleep(); + might_fault(); /* no memory constraint because it doesn't change any memory gcc knows about */ asm volatile( diff --git a/arch/x86/mach-generic/bigsmp.c b/arch/x86/mach-generic/bigsmp.c index 3624a364b7f..bc4c7840b2a 100644 --- a/arch/x86/mach-generic/bigsmp.c +++ b/arch/x86/mach-generic/bigsmp.c @@ -42,9 +42,10 @@ static const struct dmi_system_id bigsmp_dmi_table[] = { { } }; -static cpumask_t vector_allocation_domain(int cpu) +static void vector_allocation_domain(int cpu, cpumask_t *retmask) { - return cpumask_of_cpu(cpu); + cpus_clear(*retmask); + cpu_set(cpu, *retmask); } static int probe_bigsmp(void) diff --git a/arch/x86/mach-generic/es7000.c b/arch/x86/mach-generic/es7000.c index 7b4e6d0d169..4ba5ccaa158 100644 --- a/arch/x86/mach-generic/es7000.c +++ b/arch/x86/mach-generic/es7000.c @@ -87,7 +87,7 @@ static int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id) } #endif -static cpumask_t vector_allocation_domain(int cpu) +static void vector_allocation_domain(int cpu, cpumask_t *retmask) { /* Careful. Some cpus do not strictly honor the set of cpus * specified in the interrupt destination when using lowest @@ -97,8 +97,7 @@ static cpumask_t vector_allocation_domain(int cpu) * deliver interrupts to the wrong hyperthread when only one * hyperthread was specified in the interrupt desitination. */ - cpumask_t domain = { { [0] = APIC_ALL_CPUS, } }; - return domain; + *retmask = (cpumask_t){ { [0] = APIC_ALL_CPUS, } }; } struct genapic __initdata_refok apic_es7000 = APIC_INIT("es7000", probe_es7000); diff --git a/arch/x86/mach-generic/numaq.c b/arch/x86/mach-generic/numaq.c index 71a309b122e..511d7941364 100644 --- a/arch/x86/mach-generic/numaq.c +++ b/arch/x86/mach-generic/numaq.c @@ -38,7 +38,7 @@ static int acpi_madt_oem_check(char *oem_id, char *oem_table_id) return 0; } -static cpumask_t vector_allocation_domain(int cpu) +static void vector_allocation_domain(int cpu, cpumask_t *retmask) { /* Careful. Some cpus do not strictly honor the set of cpus * specified in the interrupt destination when using lowest @@ -48,8 +48,7 @@ static cpumask_t vector_allocation_domain(int cpu) * deliver interrupts to the wrong hyperthread when only one * hyperthread was specified in the interrupt desitination. */ - cpumask_t domain = { { [0] = APIC_ALL_CPUS, } }; - return domain; + *retmask = (cpumask_t){ { [0] = APIC_ALL_CPUS, } }; } struct genapic apic_numaq = APIC_INIT("NUMAQ", probe_numaq); diff --git a/arch/x86/mach-generic/summit.c b/arch/x86/mach-generic/summit.c index 2c6d234e000..2821ffc188b 100644 --- a/arch/x86/mach-generic/summit.c +++ b/arch/x86/mach-generic/summit.c @@ -24,7 +24,7 @@ static int probe_summit(void) return 0; } -static cpumask_t vector_allocation_domain(int cpu) +static void vector_allocation_domain(int cpu, cpumask_t *retmask) { /* Careful. Some cpus do not strictly honor the set of cpus * specified in the interrupt destination when using lowest @@ -34,8 +34,7 @@ static cpumask_t vector_allocation_domain(int cpu) * deliver interrupts to the wrong hyperthread when only one * hyperthread was specified in the interrupt desitination. */ - cpumask_t domain = { { [0] = APIC_ALL_CPUS, } }; - return domain; + *retmask = (cpumask_t){ { [0] = APIC_ALL_CPUS, } }; } struct genapic apic_summit = APIC_INIT("summit", probe_summit); diff --git a/arch/x86/mach-voyager/voyager_smp.c b/arch/x86/mach-voyager/voyager_smp.c index 52145007bd7..a5bc05492b1 100644 --- a/arch/x86/mach-voyager/voyager_smp.c +++ b/arch/x86/mach-voyager/voyager_smp.c @@ -63,11 +63,6 @@ static int voyager_extended_cpus = 1; /* Used for the invalidate map that's also checked in the spinlock */ static volatile unsigned long smp_invalidate_needed; -/* Bitmask of currently online CPUs - used by setup.c for - /proc/cpuinfo, visible externally but still physical */ -cpumask_t cpu_online_map = CPU_MASK_NONE; -EXPORT_SYMBOL(cpu_online_map); - /* Bitmask of CPUs present in the system - exported by i386_syms.c, used * by scheduler but indexed physically */ cpumask_t phys_cpu_present_map = CPU_MASK_NONE; @@ -218,8 +213,6 @@ static cpumask_t smp_commenced_mask = CPU_MASK_NONE; /* This is for the new dynamic CPU boot code */ cpumask_t cpu_callin_map = CPU_MASK_NONE; cpumask_t cpu_callout_map = CPU_MASK_NONE; -cpumask_t cpu_possible_map = CPU_MASK_NONE; -EXPORT_SYMBOL(cpu_possible_map); /* The per processor IRQ masks (these are usually kept in sync) */ static __u16 vic_irq_mask[NR_CPUS] __cacheline_aligned; @@ -679,7 +672,7 @@ void __init smp_boot_cpus(void) /* loop over all the extended VIC CPUs and boot them. The * Quad CPUs must be bootstrapped by their extended VIC cpu */ - for (i = 0; i < NR_CPUS; i++) { + for (i = 0; i < nr_cpu_ids; i++) { if (i == boot_cpu_id || !cpu_isset(i, phys_cpu_present_map)) continue; do_boot_cpu(i); diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index ad98b18f2b4..f99a6c6c432 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -21,6 +21,7 @@ #include <linux/init.h> #include <linux/highmem.h> #include <linux/pagemap.h> +#include <linux/pci.h> #include <linux/pfn.h> #include <linux/poison.h> #include <linux/bootmem.h> @@ -971,6 +972,8 @@ void __init mem_init(void) int codesize, reservedpages, datasize, initsize; int tmp; + pci_iommu_alloc(); + #ifdef CONFIG_FLATMEM BUG_ON(!mem_map); #endif diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index cebcbf152d4..71a14f89f89 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -278,7 +278,7 @@ void __init numa_init_array(void) int rr, i; rr = first_node(node_online_map); - for (i = 0; i < NR_CPUS; i++) { + for (i = 0; i < nr_cpu_ids; i++) { if (early_cpu_to_node(i) != NUMA_NO_NODE) continue; numa_set_node(i, rr); @@ -549,7 +549,7 @@ void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn) memnodemap[0] = 0; node_set_online(0); node_set(0, node_possible_map); - for (i = 0; i < NR_CPUS; i++) + for (i = 0; i < nr_cpu_ids; i++) numa_set_node(i, 0); e820_register_active_regions(0, start_pfn, last_pfn); setup_node_bootmem(0, start_pfn << PAGE_SHIFT, last_pfn << PAGE_SHIFT); diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c index 51c0a2fc14f..09737c8af07 100644 --- a/arch/x86/mm/srat_64.c +++ b/arch/x86/mm/srat_64.c @@ -382,7 +382,7 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end) if (!node_online(i)) setup_node_bootmem(i, nodes[i].start, nodes[i].end); - for (i = 0; i < NR_CPUS; i++) { + for (i = 0; i < nr_cpu_ids; i++) { int node = early_cpu_to_node(i); if (node == NUMA_NO_NODE) diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index 509513760a6..98658f25f54 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -65,11 +65,13 @@ static unsigned long reset_value[NUM_COUNTERS]; #define IBS_FETCH_BEGIN 3 #define IBS_OP_BEGIN 4 -/* The function interface needs to be fixed, something like add - data. Should then be added to linux/oprofile.h. */ +/* + * The function interface needs to be fixed, something like add + * data. Should then be added to linux/oprofile.h. + */ extern void -oprofile_add_ibs_sample(struct pt_regs *const regs, - unsigned int *const ibs_sample, int ibs_code); +oprofile_add_ibs_sample(struct pt_regs * const regs, + unsigned int * const ibs_sample, int ibs_code); struct ibs_fetch_sample { /* MSRC001_1031 IBS Fetch Linear Address Register */ @@ -104,11 +106,6 @@ struct ibs_op_sample { unsigned int ibs_dc_phys_high; }; -/* - * unitialize the APIC for the IBS interrupts if needed on AMD Family10h+ -*/ -static void clear_ibs_nmi(void); - static int ibs_allowed; /* AMD Family10h and later */ struct op_ibs_config { @@ -223,7 +220,7 @@ op_amd_handle_ibs(struct pt_regs * const regs, (unsigned int *)&ibs_fetch, IBS_FETCH_BEGIN); - /*reenable the IRQ */ + /* reenable the IRQ */ rdmsr(MSR_AMD64_IBSFETCHCTL, low, high); high &= ~IBS_FETCH_HIGH_VALID_BIT; high |= IBS_FETCH_HIGH_ENABLE; @@ -331,8 +328,10 @@ static void op_amd_stop(struct op_msrs const * const msrs) unsigned int low, high; int i; - /* Subtle: stop on all counters to avoid race with - * setting our pm callback */ + /* + * Subtle: stop on all counters to avoid race with setting our + * pm callback + */ for (i = 0 ; i < NUM_COUNTERS ; ++i) { if (!reset_value[i]) continue; @@ -343,13 +342,15 @@ static void op_amd_stop(struct op_msrs const * const msrs) #ifdef CONFIG_OPROFILE_IBS if (ibs_allowed && ibs_config.fetch_enabled) { - low = 0; /* clear max count and enable */ + /* clear max count and enable */ + low = 0; high = 0; wrmsr(MSR_AMD64_IBSFETCHCTL, low, high); } if (ibs_allowed && ibs_config.op_enabled) { - low = 0; /* clear max count and enable */ + /* clear max count and enable */ + low = 0; high = 0; wrmsr(MSR_AMD64_IBSOPCTL, low, high); } @@ -370,18 +371,7 @@ static void op_amd_shutdown(struct op_msrs const * const msrs) } } -#ifndef CONFIG_OPROFILE_IBS - -/* no IBS support */ - -static int op_amd_init(struct oprofile_operations *ops) -{ - return 0; -} - -static void op_amd_exit(void) {} - -#else +#ifdef CONFIG_OPROFILE_IBS static u8 ibs_eilvt_off; @@ -395,7 +385,7 @@ static inline void apic_clear_ibs_nmi_per_cpu(void *arg) setup_APIC_eilvt_ibs(0, APIC_EILVT_MSG_FIX, 1); } -static int pfm_amd64_setup_eilvt(void) +static int init_ibs_nmi(void) { #define IBSCTL_LVTOFFSETVAL (1 << 8) #define IBSCTL 0x1cc @@ -443,18 +433,22 @@ static int pfm_amd64_setup_eilvt(void) return 0; } -/* - * initialize the APIC for the IBS interrupts - * if available (AMD Family10h rev B0 and later) - */ -static void setup_ibs(void) +/* uninitialize the APIC for the IBS interrupts if needed */ +static void clear_ibs_nmi(void) +{ + if (ibs_allowed) + on_each_cpu(apic_clear_ibs_nmi_per_cpu, NULL, 1); +} + +/* initialize the APIC for the IBS interrupts if available */ +static void ibs_init(void) { ibs_allowed = boot_cpu_has(X86_FEATURE_IBS); if (!ibs_allowed) return; - if (pfm_amd64_setup_eilvt()) { + if (init_ibs_nmi()) { ibs_allowed = 0; return; } @@ -462,14 +456,12 @@ static void setup_ibs(void) printk(KERN_INFO "oprofile: AMD IBS detected\n"); } - -/* - * unitialize the APIC for the IBS interrupts if needed on AMD Family10h - * rev B0 and later */ -static void clear_ibs_nmi(void) +static void ibs_exit(void) { - if (ibs_allowed) - on_each_cpu(apic_clear_ibs_nmi_per_cpu, NULL, 1); + if (!ibs_allowed) + return; + + clear_ibs_nmi(); } static int (*create_arch_files)(struct super_block *sb, struct dentry *root); @@ -519,7 +511,7 @@ static int setup_ibs_files(struct super_block *sb, struct dentry *root) static int op_amd_init(struct oprofile_operations *ops) { - setup_ibs(); + ibs_init(); create_arch_files = ops->create_files; ops->create_files = setup_ibs_files; return 0; @@ -527,10 +519,21 @@ static int op_amd_init(struct oprofile_operations *ops) static void op_amd_exit(void) { - clear_ibs_nmi(); + ibs_exit(); } -#endif +#else + +/* no IBS support */ + +static int op_amd_init(struct oprofile_operations *ops) +{ + return 0; +} + +static void op_amd_exit(void) {} + +#endif /* CONFIG_OPROFILE_IBS */ struct op_x86_model_spec const op_amd_spec = { .init = op_amd_init, diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 773d68d3e91..503c240e26c 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1082,7 +1082,7 @@ static void drop_other_mm_ref(void *info) static void xen_drop_mm_ref(struct mm_struct *mm) { - cpumask_t mask; + cpumask_var_t mask; unsigned cpu; if (current->active_mm == mm) { @@ -1094,7 +1094,16 @@ static void xen_drop_mm_ref(struct mm_struct *mm) } /* Get the "official" set of cpus referring to our pagetable. */ - mask = mm->cpu_vm_mask; + if (!alloc_cpumask_var(&mask, GFP_ATOMIC)) { + for_each_online_cpu(cpu) { + if (!cpumask_test_cpu(cpu, &mm->cpu_vm_mask) + && per_cpu(xen_current_cr3, cpu) != __pa(mm->pgd)) + continue; + smp_call_function_single(cpu, drop_other_mm_ref, mm, 1); + } + return; + } + cpumask_copy(mask, &mm->cpu_vm_mask); /* It's possible that a vcpu may have a stale reference to our cr3, because its in lazy mode, and it hasn't yet flushed @@ -1103,11 +1112,12 @@ static void xen_drop_mm_ref(struct mm_struct *mm) if needed. */ for_each_online_cpu(cpu) { if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd)) - cpu_set(cpu, mask); + cpumask_set_cpu(cpu, mask); } - if (!cpus_empty(mask)) - smp_call_function_mask(mask, drop_other_mm_ref, mm, 1); + if (!cpumask_empty(mask)) + smp_call_function_many(mask, drop_other_mm_ref, mm, 1); + free_cpumask_var(mask); } #else static void xen_drop_mm_ref(struct mm_struct *mm) diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index acd9b6705e0..c44e2069c7c 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -33,7 +33,7 @@ #include "xen-ops.h" #include "mmu.h" -cpumask_t xen_cpu_initialized_map; +cpumask_var_t xen_cpu_initialized_map; static DEFINE_PER_CPU(int, resched_irq); static DEFINE_PER_CPU(int, callfunc_irq); @@ -158,7 +158,7 @@ static void __init xen_fill_possible_map(void) { int i, rc; - for (i = 0; i < NR_CPUS; i++) { + for (i = 0; i < nr_cpu_ids; i++) { rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL); if (rc >= 0) { num_processors++; @@ -192,11 +192,14 @@ static void __init xen_smp_prepare_cpus(unsigned int max_cpus) if (xen_smp_intr_init(0)) BUG(); - xen_cpu_initialized_map = cpumask_of_cpu(0); + if (!alloc_cpumask_var(&xen_cpu_initialized_map, GFP_KERNEL)) + panic("could not allocate xen_cpu_initialized_map\n"); + + cpumask_copy(xen_cpu_initialized_map, cpumask_of(0)); /* Restrict the possible_map according to max_cpus. */ while ((num_possible_cpus() > 1) && (num_possible_cpus() > max_cpus)) { - for (cpu = NR_CPUS - 1; !cpu_possible(cpu); cpu--) + for (cpu = nr_cpu_ids - 1; !cpu_possible(cpu); cpu--) continue; cpu_clear(cpu, cpu_possible_map); } @@ -221,7 +224,7 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle) struct vcpu_guest_context *ctxt; struct desc_struct *gdt; - if (cpu_test_and_set(cpu, xen_cpu_initialized_map)) + if (cpumask_test_and_set_cpu(cpu, xen_cpu_initialized_map)) return 0; ctxt = kzalloc(sizeof(*ctxt), GFP_KERNEL); @@ -408,24 +411,23 @@ static void xen_smp_send_reschedule(int cpu) xen_send_IPI_one(cpu, XEN_RESCHEDULE_VECTOR); } -static void xen_send_IPI_mask(cpumask_t mask, enum ipi_vector vector) +static void xen_send_IPI_mask(const struct cpumask *mask, + enum ipi_vector vector) { unsigned cpu; - cpus_and(mask, mask, cpu_online_map); - - for_each_cpu_mask_nr(cpu, mask) + for_each_cpu_and(cpu, mask, cpu_online_mask) xen_send_IPI_one(cpu, vector); } -static void xen_smp_send_call_function_ipi(cpumask_t mask) +static void xen_smp_send_call_function_ipi(const struct cpumask *mask) { int cpu; xen_send_IPI_mask(mask, XEN_CALL_FUNCTION_VECTOR); /* Make sure other vcpus get a chance to run if they need to. */ - for_each_cpu_mask_nr(cpu, mask) { + for_each_cpu(cpu, mask) { if (xen_vcpu_stolen(cpu)) { HYPERVISOR_sched_op(SCHEDOP_yield, 0); break; @@ -435,7 +437,8 @@ static void xen_smp_send_call_function_ipi(cpumask_t mask) static void xen_smp_send_call_function_single_ipi(int cpu) { - xen_send_IPI_mask(cpumask_of_cpu(cpu), XEN_CALL_FUNCTION_SINGLE_VECTOR); + xen_send_IPI_mask(cpumask_of(cpu), + XEN_CALL_FUNCTION_SINGLE_VECTOR); } static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id) diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c index 2a234db5949..212ffe012b7 100644 --- a/arch/x86/xen/suspend.c +++ b/arch/x86/xen/suspend.c @@ -35,7 +35,8 @@ void xen_post_suspend(int suspend_cancelled) pfn_to_mfn(xen_start_info->console.domU.mfn); } else { #ifdef CONFIG_SMP - xen_cpu_initialized_map = cpu_online_map; + BUG_ON(xen_cpu_initialized_map == NULL); + cpumask_copy(xen_cpu_initialized_map, cpu_online_mask); #endif xen_vcpu_restore(); } diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index c9f7cda48ed..65d75a6be0b 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -437,7 +437,7 @@ void xen_setup_timer(int cpu) evt = &per_cpu(xen_clock_events, cpu); memcpy(evt, xen_clockevent, sizeof(*evt)); - evt->cpumask = cpumask_of_cpu(cpu); + evt->cpumask = cpumask_of(cpu); evt->irq = irq; setup_runstate_info(cpu); diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 9e1afae8461..c1f8faf0a2c 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -58,7 +58,7 @@ void __init xen_init_spinlocks(void); __cpuinit void xen_init_lock_cpu(int cpu); void xen_uninit_lock_cpu(int cpu); -extern cpumask_t xen_cpu_initialized_map; +extern cpumask_var_t xen_cpu_initialized_map; #else static inline void xen_smp_init(void) {} #endif |