From 323ec001c6bb98eeabb5abbdbb8c8055d9496554 Mon Sep 17 00:00:00 2001 From: Dmitry Baryshkov Date: Sun, 29 Jun 2008 14:19:31 +0400 Subject: x86: use generic per-device dma coherent allocator Signed-off-by: Dmitry Baryshkov Signed-off-by: Ingo Molnar --- arch/x86/kernel/pci-dma.c | 122 +--------------------------------------------- 1 file changed, 2 insertions(+), 120 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index dc00a1331ac..b75c81a8d3e 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -197,124 +197,6 @@ static __init int iommu_setup(char *p) } early_param("iommu", iommu_setup); -#ifdef CONFIG_X86_32 -int dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr, - dma_addr_t device_addr, size_t size, int flags) -{ - void __iomem *mem_base = NULL; - int pages = size >> PAGE_SHIFT; - int bitmap_size = BITS_TO_LONGS(pages) * sizeof(long); - - if ((flags & (DMA_MEMORY_MAP | DMA_MEMORY_IO)) == 0) - goto out; - if (!size) - goto out; - if (dev->dma_mem) - goto out; - - /* FIXME: this routine just ignores DMA_MEMORY_INCLUDES_CHILDREN */ - - mem_base = ioremap(bus_addr, size); - if (!mem_base) - goto out; - - dev->dma_mem = kzalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL); - if (!dev->dma_mem) - goto out; - dev->dma_mem->bitmap = kzalloc(bitmap_size, GFP_KERNEL); - if (!dev->dma_mem->bitmap) - goto free1_out; - - dev->dma_mem->virt_base = mem_base; - dev->dma_mem->device_base = device_addr; - dev->dma_mem->size = pages; - dev->dma_mem->flags = flags; - - if (flags & DMA_MEMORY_MAP) - return DMA_MEMORY_MAP; - - return DMA_MEMORY_IO; - - free1_out: - kfree(dev->dma_mem); - out: - if (mem_base) - iounmap(mem_base); - return 0; -} -EXPORT_SYMBOL(dma_declare_coherent_memory); - -void dma_release_declared_memory(struct device *dev) -{ - struct dma_coherent_mem *mem = dev->dma_mem; - - if (!mem) - return; - dev->dma_mem = NULL; - iounmap(mem->virt_base); - kfree(mem->bitmap); - kfree(mem); -} -EXPORT_SYMBOL(dma_release_declared_memory); - -void *dma_mark_declared_memory_occupied(struct device *dev, - dma_addr_t device_addr, size_t size) -{ - struct dma_coherent_mem *mem = dev->dma_mem; - int pos, err; - int pages = (size + (device_addr & ~PAGE_MASK) + PAGE_SIZE - 1); - - pages >>= PAGE_SHIFT; - - if (!mem) - return ERR_PTR(-EINVAL); - - pos = (device_addr - mem->device_base) >> PAGE_SHIFT; - err = bitmap_allocate_region(mem->bitmap, pos, get_order(pages)); - if (err != 0) - return ERR_PTR(err); - return mem->virt_base + (pos << PAGE_SHIFT); -} -EXPORT_SYMBOL(dma_mark_declared_memory_occupied); - -static int dma_alloc_from_coherent_mem(struct device *dev, ssize_t size, - dma_addr_t *dma_handle, void **ret) -{ - struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL; - int order = get_order(size); - - if (mem) { - int page = bitmap_find_free_region(mem->bitmap, mem->size, - order); - if (page >= 0) { - *dma_handle = mem->device_base + (page << PAGE_SHIFT); - *ret = mem->virt_base + (page << PAGE_SHIFT); - memset(*ret, 0, size); - } - if (mem->flags & DMA_MEMORY_EXCLUSIVE) - *ret = NULL; - } - return (mem != NULL); -} - -static int dma_release_coherent(struct device *dev, int order, void *vaddr) -{ - struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL; - - if (mem && vaddr >= mem->virt_base && vaddr < - (mem->virt_base + (mem->size << PAGE_SHIFT))) { - int page = (vaddr - mem->virt_base) >> PAGE_SHIFT; - - bitmap_release_region(mem->bitmap, page, order); - return 1; - } - return 0; -} -#else -#define dma_alloc_from_coherent_mem(dev, size, handle, ret) (0) -#define dma_release_coherent(dev, order, vaddr) (0) -#endif /* CONFIG_X86_32 */ - int dma_supported(struct device *dev, u64 mask) { #ifdef CONFIG_PCI @@ -383,7 +265,7 @@ dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, /* ignore region specifiers */ gfp &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32); - if (dma_alloc_from_coherent_mem(dev, size, dma_handle, &memory)) + if (dma_alloc_from_coherent(dev, size, dma_handle, &memory)) return memory; if (!dev) { @@ -486,7 +368,7 @@ void dma_free_coherent(struct device *dev, size_t size, { int order = get_order(size); WARN_ON(irqs_disabled()); /* for portability */ - if (dma_release_coherent(dev, order, vaddr)) + if (dma_release_from_coherent(dev, order, vaddr)) return; if (dma_ops->unmap_single) dma_ops->unmap_single(dev, bus, size, 0); -- cgit v1.2.3 From a8132e5fe2c4f3f780b8bd3cce7851640f79f1c7 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 25 Jul 2008 14:57:59 +0200 Subject: x86, AMD IOMMU: replace to_pages macro with iommu_num_pages This patch removes the to_pages macro from AMD IOMMU code and calls the generic iommu_num_pages function instead. Signed-off-by: Joerg Roedel Cc: iommu@lists.linux-foundation.org Cc: bhavna.sarathy@amd.com Cc: robert.richter@amd.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/amd_iommu.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index c25210e6ac8..79eff735fde 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -29,9 +29,6 @@ #define CMD_SET_TYPE(cmd, t) ((cmd)->data[1] |= ((t) << 28)) -#define to_pages(addr, size) \ - (round_up(((addr) & ~PAGE_MASK) + (size), PAGE_SIZE) >> PAGE_SHIFT) - #define EXIT_LOOP_COUNT 10000000 static DEFINE_RWLOCK(amd_iommu_devtable_lock); @@ -185,7 +182,7 @@ static int iommu_flush_pages(struct amd_iommu *iommu, u16 domid, u64 address, size_t size) { int s = 0; - unsigned pages = to_pages(address, size); + unsigned pages = iommu_num_pages(address, size); address &= PAGE_MASK; @@ -557,8 +554,8 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu, if (iommu->exclusion_start && iommu->exclusion_start < dma_dom->aperture_size) { unsigned long startpage = iommu->exclusion_start >> PAGE_SHIFT; - int pages = to_pages(iommu->exclusion_start, - iommu->exclusion_length); + int pages = iommu_num_pages(iommu->exclusion_start, + iommu->exclusion_length); dma_ops_reserve_addresses(dma_dom, startpage, pages); } @@ -767,7 +764,7 @@ static dma_addr_t __map_single(struct device *dev, unsigned int pages; int i; - pages = to_pages(paddr, size); + pages = iommu_num_pages(paddr, size); paddr &= PAGE_MASK; address = dma_ops_alloc_addresses(dev, dma_dom, pages); @@ -802,7 +799,7 @@ static void __unmap_single(struct amd_iommu *iommu, if ((dma_addr == 0) || (dma_addr + size > dma_dom->aperture_size)) return; - pages = to_pages(dma_addr, size); + pages = iommu_num_pages(dma_addr, size); dma_addr &= PAGE_MASK; start = dma_addr; -- cgit v1.2.3 From 87e39ea5714dd59ba31e36c25833d2b20255a29d Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 25 Jul 2008 14:58:00 +0200 Subject: x86 gart: replace to_pages macro with iommu_num_pages This patch removes the to_pages macro from x86 GART code and calls the generic iommu_num_pages function instead. Signed-off-by: Joerg Roedel Cc: iommu@lists.linux-foundation.org Cc: bhavna.sarathy@amd.com Cc: robert.richter@amd.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/pci-gart_64.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index df5f142657d..c9824043156 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -67,9 +67,6 @@ static u32 gart_unmapped_entry; (((x) & 0xfffff000) | (((x) >> 32) << 4) | GPTE_VALID | GPTE_COHERENT) #define GPTE_DECODE(x) (((x) & 0xfffff000) | (((u64)(x) & 0xff0) << 28)) -#define to_pages(addr, size) \ - (round_up(((addr) & ~PAGE_MASK) + (size), PAGE_SIZE) >> PAGE_SHIFT) - #define EMERGENCY_PAGES 32 /* = 128KB */ #ifdef CONFIG_AGP @@ -241,7 +238,7 @@ nonforced_iommu(struct device *dev, unsigned long addr, size_t size) static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem, size_t size, int dir) { - unsigned long npages = to_pages(phys_mem, size); + unsigned long npages = iommu_num_pages(phys_mem, size); unsigned long iommu_page = alloc_iommu(dev, npages); int i; @@ -304,7 +301,7 @@ static void gart_unmap_single(struct device *dev, dma_addr_t dma_addr, return; iommu_page = (dma_addr - iommu_bus_base)>>PAGE_SHIFT; - npages = to_pages(dma_addr, size); + npages = iommu_num_pages(dma_addr, size); for (i = 0; i < npages; i++) { iommu_gatt_base[iommu_page + i] = gart_unmapped_entry; CLEAR_LEAK(iommu_page + i); @@ -387,7 +384,7 @@ static int __dma_map_cont(struct device *dev, struct scatterlist *start, } addr = phys_addr; - pages = to_pages(s->offset, s->length); + pages = iommu_num_pages(s->offset, s->length); while (pages--) { iommu_gatt_base[iommu_page] = GPTE_ENCODE(addr); SET_LEAK(iommu_page); @@ -470,7 +467,7 @@ gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir) seg_size += s->length; need = nextneed; - pages += to_pages(s->offset, s->length); + pages += iommu_num_pages(s->offset, s->length); ps = s; } if (dma_map_cont(dev, start_sg, i - start, sgmap, pages, need) < 0) -- cgit v1.2.3 From 6524d938b3360504b43a1278b5a8403e85383d1a Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Thu, 24 Jul 2008 18:21:30 -0700 Subject: cpumask: put cpumask_of_cpu_map in the initdata section * Create the cpumask_of_cpu_map statically in the init data section using NR_CPUS but replace it during boot up with one sized by nr_cpu_ids (num possible cpus). Signed-off-by: Mike Travis Cc: Andrew Morton Cc: Jack Steiner Cc: Rusty Russell Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup_percpu.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index f7745f94c00..1cd53dfcd30 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -81,10 +81,12 @@ static void __init setup_per_cpu_maps(void) } #ifdef CONFIG_HAVE_CPUMASK_OF_CPU_MAP -cpumask_t *cpumask_of_cpu_map __read_mostly; -EXPORT_SYMBOL(cpumask_of_cpu_map); - -/* requires nr_cpu_ids to be initialized */ +/* + * Replace static cpumask_of_cpu_map in the initdata section, + * with one that's allocated sized by the possible number of cpus. + * + * (requires nr_cpu_ids to be initialized) + */ static void __init setup_cpumask_of_cpu(void) { int i; -- cgit v1.2.3 From 0bc3cc03fa6e1c20aecb5a33356bcaae410640b9 Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Thu, 24 Jul 2008 18:21:31 -0700 Subject: cpumask: change cpumask_of_cpu_ptr to use new cpumask_of_cpu * Replace previous instances of the cpumask_of_cpu_ptr* macros with a the new (lvalue capable) generic cpumask_of_cpu(). Signed-off-by: Mike Travis Cc: Andrew Morton Cc: Jack Steiner Cc: Rusty Russell Signed-off-by: Ingo Molnar --- arch/x86/kernel/acpi/cstate.c | 3 +-- arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c | 10 +++------- arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 15 +++++---------- arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c | 12 ++++-------- arch/x86/kernel/cpu/cpufreq/speedstep-ich.c | 3 +-- arch/x86/kernel/cpu/intel_cacheinfo.c | 3 +-- arch/x86/kernel/ldt.c | 6 ++---- arch/x86/kernel/microcode.c | 17 +++++------------ arch/x86/kernel/reboot.c | 11 +++-------- 9 files changed, 25 insertions(+), 55 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c index 9220cf46aa1..c2502eb9aa8 100644 --- a/arch/x86/kernel/acpi/cstate.c +++ b/arch/x86/kernel/acpi/cstate.c @@ -73,7 +73,6 @@ int acpi_processor_ffh_cstate_probe(unsigned int cpu, struct cpuinfo_x86 *c = &cpu_data(cpu); cpumask_t saved_mask; - cpumask_of_cpu_ptr(new_mask, cpu); int retval; unsigned int eax, ebx, ecx, edx; unsigned int edx_part; @@ -92,7 +91,7 @@ int acpi_processor_ffh_cstate_probe(unsigned int cpu, /* Make sure we are running on right CPU */ saved_mask = current->cpus_allowed; - retval = set_cpus_allowed_ptr(current, new_mask); + retval = set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); if (retval) return -1; diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c index ff2fff56f0a..dd097b83583 100644 --- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c @@ -200,12 +200,10 @@ static void drv_read(struct drv_cmd *cmd) static void drv_write(struct drv_cmd *cmd) { cpumask_t saved_mask = current->cpus_allowed; - cpumask_of_cpu_ptr_declare(cpu_mask); unsigned int i; for_each_cpu_mask_nr(i, cmd->mask) { - cpumask_of_cpu_ptr_next(cpu_mask, i); - set_cpus_allowed_ptr(current, cpu_mask); + set_cpus_allowed_ptr(current, &cpumask_of_cpu(i)); do_drv_write(cmd); } @@ -269,12 +267,11 @@ static unsigned int get_measured_perf(unsigned int cpu) } aperf_cur, mperf_cur; cpumask_t saved_mask; - cpumask_of_cpu_ptr(cpu_mask, cpu); unsigned int perf_percent; unsigned int retval; saved_mask = current->cpus_allowed; - set_cpus_allowed_ptr(current, cpu_mask); + set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); if (get_cpu() != cpu) { /* We were not able to run on requested processor */ put_cpu(); @@ -340,7 +337,6 @@ static unsigned int get_measured_perf(unsigned int cpu) static unsigned int get_cur_freq_on_cpu(unsigned int cpu) { - cpumask_of_cpu_ptr(cpu_mask, cpu); struct acpi_cpufreq_data *data = per_cpu(drv_data, cpu); unsigned int freq; unsigned int cached_freq; @@ -353,7 +349,7 @@ static unsigned int get_cur_freq_on_cpu(unsigned int cpu) } cached_freq = data->freq_table[data->acpi_data->state].frequency; - freq = extract_freq(get_cur_val(cpu_mask), data); + freq = extract_freq(get_cur_val(&cpumask_of_cpu(cpu)), data); if (freq != cached_freq) { /* * The dreaded BIOS frequency change behind our back. diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c index 53c7b693697..c45ca6d4dce 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c @@ -479,12 +479,11 @@ static int core_voltage_post_transition(struct powernow_k8_data *data, u32 reqvi static int check_supported_cpu(unsigned int cpu) { cpumask_t oldmask; - cpumask_of_cpu_ptr(cpu_mask, cpu); u32 eax, ebx, ecx, edx; unsigned int rc = 0; oldmask = current->cpus_allowed; - set_cpus_allowed_ptr(current, cpu_mask); + set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); if (smp_processor_id() != cpu) { printk(KERN_ERR PFX "limiting to cpu %u failed\n", cpu); @@ -1017,7 +1016,6 @@ static int transition_frequency_pstate(struct powernow_k8_data *data, unsigned i static int powernowk8_target(struct cpufreq_policy *pol, unsigned targfreq, unsigned relation) { cpumask_t oldmask; - cpumask_of_cpu_ptr(cpu_mask, pol->cpu); struct powernow_k8_data *data = per_cpu(powernow_data, pol->cpu); u32 checkfid; u32 checkvid; @@ -1032,7 +1030,7 @@ static int powernowk8_target(struct cpufreq_policy *pol, unsigned targfreq, unsi /* only run on specific CPU from here on */ oldmask = current->cpus_allowed; - set_cpus_allowed_ptr(current, cpu_mask); + set_cpus_allowed_ptr(current, &cpumask_of_cpu(pol->cpu)); if (smp_processor_id() != pol->cpu) { printk(KERN_ERR PFX "limiting to cpu %u failed\n", pol->cpu); @@ -1107,7 +1105,6 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol) { struct powernow_k8_data *data; cpumask_t oldmask; - cpumask_of_cpu_ptr_declare(newmask); int rc; if (!cpu_online(pol->cpu)) @@ -1159,8 +1156,7 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol) /* only run on specific CPU from here on */ oldmask = current->cpus_allowed; - cpumask_of_cpu_ptr_next(newmask, pol->cpu); - set_cpus_allowed_ptr(current, newmask); + set_cpus_allowed_ptr(current, &cpumask_of_cpu(pol->cpu)); if (smp_processor_id() != pol->cpu) { printk(KERN_ERR PFX "limiting to cpu %u failed\n", pol->cpu); @@ -1182,7 +1178,7 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol) set_cpus_allowed_ptr(current, &oldmask); if (cpu_family == CPU_HW_PSTATE) - pol->cpus = *newmask; + pol->cpus = cpumask_of_cpu(pol->cpu); else pol->cpus = per_cpu(cpu_core_map, pol->cpu); data->available_cores = &(pol->cpus); @@ -1248,7 +1244,6 @@ static unsigned int powernowk8_get (unsigned int cpu) { struct powernow_k8_data *data; cpumask_t oldmask = current->cpus_allowed; - cpumask_of_cpu_ptr(newmask, cpu); unsigned int khz = 0; unsigned int first; @@ -1258,7 +1253,7 @@ static unsigned int powernowk8_get (unsigned int cpu) if (!data) return -EINVAL; - set_cpus_allowed_ptr(current, newmask); + set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); if (smp_processor_id() != cpu) { printk(KERN_ERR PFX "limiting to CPU %d failed in powernowk8_get\n", cpu); diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c index ca2ac13b7af..15e13c01cc3 100644 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c +++ b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c @@ -324,10 +324,9 @@ static unsigned int get_cur_freq(unsigned int cpu) unsigned l, h; unsigned clock_freq; cpumask_t saved_mask; - cpumask_of_cpu_ptr(new_mask, cpu); saved_mask = current->cpus_allowed; - set_cpus_allowed_ptr(current, new_mask); + set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); if (smp_processor_id() != cpu) return 0; @@ -585,15 +584,12 @@ static int centrino_target (struct cpufreq_policy *policy, * Best effort undo.. */ - if (!cpus_empty(*covered_cpus)) { - cpumask_of_cpu_ptr_declare(new_mask); - + if (!cpus_empty(*covered_cpus)) for_each_cpu_mask_nr(j, *covered_cpus) { - cpumask_of_cpu_ptr_next(new_mask, j); - set_cpus_allowed_ptr(current, new_mask); + set_cpus_allowed_ptr(current, + &cpumask_of_cpu(j)); wrmsr(MSR_IA32_PERF_CTL, oldmsr, h); } - } tmp = freqs.new; freqs.new = freqs.old; diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c index 2f3728dc24f..191f7263c61 100644 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c +++ b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c @@ -244,8 +244,7 @@ static unsigned int _speedstep_get(const cpumask_t *cpus) static unsigned int speedstep_get(unsigned int cpu) { - cpumask_of_cpu_ptr(newmask, cpu); - return _speedstep_get(newmask); + return _speedstep_get(&cpumask_of_cpu(cpu)); } /** diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 650d40f7912..6b0a10b002f 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -516,7 +516,6 @@ static int __cpuinit detect_cache_attributes(unsigned int cpu) unsigned long j; int retval; cpumask_t oldmask; - cpumask_of_cpu_ptr(newmask, cpu); if (num_cache_leaves == 0) return -ENOENT; @@ -527,7 +526,7 @@ static int __cpuinit detect_cache_attributes(unsigned int cpu) return -ENOMEM; oldmask = current->cpus_allowed; - retval = set_cpus_allowed_ptr(current, newmask); + retval = set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); if (retval) goto out; diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index 3fee2aa50f3..b68e21f06f4 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -62,12 +62,10 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int reload) if (reload) { #ifdef CONFIG_SMP - cpumask_of_cpu_ptr_declare(mask); - preempt_disable(); load_LDT(pc); - cpumask_of_cpu_ptr_next(mask, smp_processor_id()); - if (!cpus_equal(current->mm->cpu_vm_mask, *mask)) + if (!cpus_equal(current->mm->cpu_vm_mask, + cpumask_of_cpu(smp_processor_id()))) smp_call_function(flush_ldt, current->mm, 1); preempt_enable(); #else diff --git a/arch/x86/kernel/microcode.c b/arch/x86/kernel/microcode.c index 6994c751590..652fa5c38eb 100644 --- a/arch/x86/kernel/microcode.c +++ b/arch/x86/kernel/microcode.c @@ -388,7 +388,6 @@ static int do_microcode_update (void) void *new_mc = NULL; int cpu; cpumask_t old; - cpumask_of_cpu_ptr_declare(newmask); old = current->cpus_allowed; @@ -405,8 +404,7 @@ static int do_microcode_update (void) if (!uci->valid) continue; - cpumask_of_cpu_ptr_next(newmask, cpu); - set_cpus_allowed_ptr(current, newmask); + set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); error = get_maching_microcode(new_mc, cpu); if (error < 0) goto out; @@ -576,7 +574,6 @@ static int apply_microcode_check_cpu(int cpu) struct cpuinfo_x86 *c = &cpu_data(cpu); struct ucode_cpu_info *uci = ucode_cpu_info + cpu; cpumask_t old; - cpumask_of_cpu_ptr(newmask, cpu); unsigned int val[2]; int err = 0; @@ -585,7 +582,7 @@ static int apply_microcode_check_cpu(int cpu) return 0; old = current->cpus_allowed; - set_cpus_allowed_ptr(current, newmask); + set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); /* Check if the microcode we have in memory matches the CPU */ if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 || @@ -623,12 +620,11 @@ static int apply_microcode_check_cpu(int cpu) static void microcode_init_cpu(int cpu, int resume) { cpumask_t old; - cpumask_of_cpu_ptr(newmask, cpu); struct ucode_cpu_info *uci = ucode_cpu_info + cpu; old = current->cpus_allowed; - set_cpus_allowed_ptr(current, newmask); + set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); mutex_lock(µcode_mutex); collect_cpu_info(cpu); if (uci->valid && system_state == SYSTEM_RUNNING && !resume) @@ -661,13 +657,10 @@ static ssize_t reload_store(struct sys_device *dev, if (end == buf) return -EINVAL; if (val == 1) { - cpumask_t old; - cpumask_of_cpu_ptr(newmask, cpu); - - old = current->cpus_allowed; + cpumask_t old = current->cpus_allowed; get_online_cpus(); - set_cpus_allowed_ptr(current, newmask); + set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); mutex_lock(µcode_mutex); if (uci->valid) diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 06a9f643817..724adfc63cb 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -414,25 +414,20 @@ void native_machine_shutdown(void) /* The boot cpu is always logical cpu 0 */ int reboot_cpu_id = 0; - cpumask_of_cpu_ptr(newmask, reboot_cpu_id); #ifdef CONFIG_X86_32 /* See if there has been given a command line override */ if ((reboot_cpu != -1) && (reboot_cpu < NR_CPUS) && - cpu_online(reboot_cpu)) { + cpu_online(reboot_cpu)) reboot_cpu_id = reboot_cpu; - cpumask_of_cpu_ptr_next(newmask, reboot_cpu_id); - } #endif /* Make certain the cpu I'm about to reboot on is online */ - if (!cpu_online(reboot_cpu_id)) { + if (!cpu_online(reboot_cpu_id)) reboot_cpu_id = smp_processor_id(); - cpumask_of_cpu_ptr_next(newmask, reboot_cpu_id); - } /* Make certain I only run on the appropriate processor */ - set_cpus_allowed_ptr(current, newmask); + set_cpus_allowed_ptr(current, &cpumask_of_cpu(reboot_cpu_id)); /* O.K Now that I'm on the appropriate processor, * stop all of the others. -- cgit v1.2.3 From 8d8bb39b9eba32dd70e87fd5ad5c5dd4ba118e06 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Fri, 25 Jul 2008 19:44:49 -0700 Subject: dma-mapping: add the device argument to dma_mapping_error() Add per-device dma_mapping_ops support for CONFIG_X86_64 as POWER architecture does: This enables us to cleanly fix the Calgary IOMMU issue that some devices are not behind the IOMMU (http://lkml.org/lkml/2008/5/8/423). I think that per-device dma_mapping_ops support would be also helpful for KVM people to support PCI passthrough but Andi thinks that this makes it difficult to support the PCI passthrough (see the above thread). So I CC'ed this to KVM camp. Comments are appreciated. A pointer to dma_mapping_ops to struct dev_archdata is added. If the pointer is non NULL, DMA operations in asm/dma-mapping.h use it. If it's NULL, the system-wide dma_ops pointer is used as before. If it's useful for KVM people, I plan to implement a mechanism to register a hook called when a new pci (or dma capable) device is created (it works with hot plugging). It enables IOMMUs to set up an appropriate dma_mapping_ops per device. The major obstacle is that dma_mapping_error doesn't take a pointer to the device unlike other DMA operations. So x86 can't have dma_mapping_ops per device. Note all the POWER IOMMUs use the same dma_mapping_error function so this is not a problem for POWER but x86 IOMMUs use different dma_mapping_error functions. The first patch adds the device argument to dma_mapping_error. The patch is trivial but large since it touches lots of drivers and dma-mapping.h in all the architecture. This patch: dma_mapping_error() doesn't take a pointer to the device unlike other DMA operations. So we can't have dma_mapping_ops per device. Note that POWER already has dma_mapping_ops per device but all the POWER IOMMUs use the same dma_mapping_error function. x86 IOMMUs use device argument. [akpm@linux-foundation.org: fix sge] [akpm@linux-foundation.org: fix svc_rdma] [akpm@linux-foundation.org: build fix] [akpm@linux-foundation.org: fix bnx2x] [akpm@linux-foundation.org: fix s2io] [akpm@linux-foundation.org: fix pasemi_mac] [akpm@linux-foundation.org: fix sdhci] [akpm@linux-foundation.org: build fix] [akpm@linux-foundation.org: fix sparc] [akpm@linux-foundation.org: fix ibmvscsi] Signed-off-by: FUJITA Tomonori Cc: Muli Ben-Yehuda Cc: Andi Kleen Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Avi Kivity Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/pci-calgary_64.c | 2 +- arch/x86/kernel/pci-dma.c | 27 ++++++++++++++++----------- arch/x86/kernel/pci-gart_64.c | 3 +-- arch/x86/kernel/pci-nommu.c | 14 +------------- arch/x86/kernel/pci-swiotlb_64.c | 2 +- 5 files changed, 20 insertions(+), 28 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c index 19e7fc7c2c4..1eb86be93d7 100644 --- a/arch/x86/kernel/pci-calgary_64.c +++ b/arch/x86/kernel/pci-calgary_64.c @@ -544,7 +544,7 @@ error: return ret; } -static const struct dma_mapping_ops calgary_dma_ops = { +static struct dma_mapping_ops calgary_dma_ops = { .alloc_coherent = calgary_alloc_coherent, .map_single = calgary_map_single, .unmap_single = calgary_unmap_single, diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index cbecb05551b..37544123896 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -11,7 +11,7 @@ static int forbid_dac __read_mostly; -const struct dma_mapping_ops *dma_ops; +struct dma_mapping_ops *dma_ops; EXPORT_SYMBOL(dma_ops); static int iommu_sac_force __read_mostly; @@ -312,6 +312,8 @@ static int dma_release_coherent(struct device *dev, int order, void *vaddr) int dma_supported(struct device *dev, u64 mask) { + struct dma_mapping_ops *ops = get_dma_ops(dev); + #ifdef CONFIG_PCI if (mask > 0xffffffff && forbid_dac > 0) { dev_info(dev, "PCI: Disallowing DAC for device\n"); @@ -319,8 +321,8 @@ int dma_supported(struct device *dev, u64 mask) } #endif - if (dma_ops->dma_supported) - return dma_ops->dma_supported(dev, mask); + if (ops->dma_supported) + return ops->dma_supported(dev, mask); /* Copied from i386. Doesn't make much sense, because it will only work for pci_alloc_coherent. @@ -367,6 +369,7 @@ void * dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t gfp) { + struct dma_mapping_ops *ops = get_dma_ops(dev); void *memory = NULL; struct page *page; unsigned long dma_mask = 0; @@ -435,8 +438,8 @@ dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, /* Let low level make its own zone decisions */ gfp &= ~(GFP_DMA32|GFP_DMA); - if (dma_ops->alloc_coherent) - return dma_ops->alloc_coherent(dev, size, + if (ops->alloc_coherent) + return ops->alloc_coherent(dev, size, dma_handle, gfp); return NULL; } @@ -448,14 +451,14 @@ dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, } } - if (dma_ops->alloc_coherent) { + if (ops->alloc_coherent) { free_pages((unsigned long)memory, get_order(size)); gfp &= ~(GFP_DMA|GFP_DMA32); - return dma_ops->alloc_coherent(dev, size, dma_handle, gfp); + return ops->alloc_coherent(dev, size, dma_handle, gfp); } - if (dma_ops->map_simple) { - *dma_handle = dma_ops->map_simple(dev, virt_to_phys(memory), + if (ops->map_simple) { + *dma_handle = ops->map_simple(dev, virt_to_phys(memory), size, PCI_DMA_BIDIRECTIONAL); if (*dma_handle != bad_dma_address) @@ -477,12 +480,14 @@ EXPORT_SYMBOL(dma_alloc_coherent); void dma_free_coherent(struct device *dev, size_t size, void *vaddr, dma_addr_t bus) { + struct dma_mapping_ops *ops = get_dma_ops(dev); + int order = get_order(size); WARN_ON(irqs_disabled()); /* for portability */ if (dma_release_coherent(dev, order, vaddr)) return; - if (dma_ops->unmap_single) - dma_ops->unmap_single(dev, bus, size, 0); + if (ops->unmap_single) + ops->unmap_single(dev, bus, size, 0); free_pages((unsigned long)vaddr, order); } EXPORT_SYMBOL(dma_free_coherent); diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index df5f142657d..744126e6495 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -692,8 +692,7 @@ static __init int init_k8_gatt(struct agp_kern_info *info) extern int agp_amd64_init(void); -static const struct dma_mapping_ops gart_dma_ops = { - .mapping_error = NULL, +static struct dma_mapping_ops gart_dma_ops = { .map_single = gart_map_single, .map_simple = gart_map_simple, .unmap_single = gart_unmap_single, diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c index 792b9179eff..3f91f71cdc3 100644 --- a/arch/x86/kernel/pci-nommu.c +++ b/arch/x86/kernel/pci-nommu.c @@ -72,21 +72,9 @@ static int nommu_map_sg(struct device *hwdev, struct scatterlist *sg, return nents; } -/* Make sure we keep the same behaviour */ -static int nommu_mapping_error(dma_addr_t dma_addr) -{ -#ifdef CONFIG_X86_32 - return 0; -#else - return (dma_addr == bad_dma_address); -#endif -} - - -const struct dma_mapping_ops nommu_dma_ops = { +struct dma_mapping_ops nommu_dma_ops = { .map_single = nommu_map_single, .map_sg = nommu_map_sg, - .mapping_error = nommu_mapping_error, .is_phys = 1, }; diff --git a/arch/x86/kernel/pci-swiotlb_64.c b/arch/x86/kernel/pci-swiotlb_64.c index 20df839b9c2..c4ce0332759 100644 --- a/arch/x86/kernel/pci-swiotlb_64.c +++ b/arch/x86/kernel/pci-swiotlb_64.c @@ -18,7 +18,7 @@ swiotlb_map_single_phys(struct device *hwdev, phys_addr_t paddr, size_t size, return swiotlb_map_single(hwdev, phys_to_virt(paddr), size, direction); } -const struct dma_mapping_ops swiotlb_dma_ops = { +struct dma_mapping_ops swiotlb_dma_ops = { .mapping_error = swiotlb_dma_mapping_error, .alloc_coherent = swiotlb_alloc_coherent, .free_coherent = swiotlb_free_coherent, -- cgit v1.2.3 From 1956a96de488feb05e95c08c9d5e80f63a4be2b1 Mon Sep 17 00:00:00 2001 From: Alexis Bruemmer Date: Fri, 25 Jul 2008 19:44:51 -0700 Subject: x86 calgary: fix handling of devices that aren't behind the Calgary The calgary code can give drivers addresses above 4GB which is very bad for hardware that is only 32bit DMA addressable. With this patch, the calgary code sets the global dma_ops to swiotlb or nommu properly, and the dma_ops of devices behind the Calgary/CalIOC2 to calgary_dma_ops. So the calgary code can handle devices safely that aren't behind the Calgary/CalIOC2. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Alexis Bruemmer Signed-off-by: FUJITA Tomonori Cc: Muli Ben-Yehuda Cc: Ingo Molnar Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/pci-calgary_64.c | 71 +++++++++++++++------------------------- 1 file changed, 26 insertions(+), 45 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c index 1eb86be93d7..b67a4b1d4ea 100644 --- a/arch/x86/kernel/pci-calgary_64.c +++ b/arch/x86/kernel/pci-calgary_64.c @@ -37,6 +37,7 @@ #include #include #include + #include #include #include @@ -413,22 +414,6 @@ static void calgary_unmap_sg(struct device *dev, } } -static int calgary_nontranslate_map_sg(struct device* dev, - struct scatterlist *sg, int nelems, int direction) -{ - struct scatterlist *s; - int i; - - for_each_sg(sg, s, nelems, i) { - struct page *p = sg_page(s); - - BUG_ON(!p); - s->dma_address = virt_to_bus(sg_virt(s)); - s->dma_length = s->length; - } - return nelems; -} - static int calgary_map_sg(struct device *dev, struct scatterlist *sg, int nelems, int direction) { @@ -439,9 +424,6 @@ static int calgary_map_sg(struct device *dev, struct scatterlist *sg, unsigned long entry; int i; - if (!translation_enabled(tbl)) - return calgary_nontranslate_map_sg(dev, sg, nelems, direction); - for_each_sg(sg, s, nelems, i) { BUG_ON(!sg_page(s)); @@ -477,7 +459,6 @@ error: static dma_addr_t calgary_map_single(struct device *dev, phys_addr_t paddr, size_t size, int direction) { - dma_addr_t dma_handle = bad_dma_address; void *vaddr = phys_to_virt(paddr); unsigned long uaddr; unsigned int npages; @@ -486,12 +467,7 @@ static dma_addr_t calgary_map_single(struct device *dev, phys_addr_t paddr, uaddr = (unsigned long)vaddr; npages = num_dma_pages(uaddr, size); - if (translation_enabled(tbl)) - dma_handle = iommu_alloc(dev, tbl, vaddr, npages, direction); - else - dma_handle = virt_to_bus(vaddr); - - return dma_handle; + return iommu_alloc(dev, tbl, vaddr, npages, direction); } static void calgary_unmap_single(struct device *dev, dma_addr_t dma_handle, @@ -500,9 +476,6 @@ static void calgary_unmap_single(struct device *dev, dma_addr_t dma_handle, struct iommu_table *tbl = find_iommu_table(dev); unsigned int npages; - if (!translation_enabled(tbl)) - return; - npages = num_dma_pages(dma_handle, size); iommu_free(tbl, dma_handle, npages); } @@ -525,18 +498,12 @@ static void* calgary_alloc_coherent(struct device *dev, size_t size, goto error; memset(ret, 0, size); - if (translation_enabled(tbl)) { - /* set up tces to cover the allocated range */ - mapping = iommu_alloc(dev, tbl, ret, npages, DMA_BIDIRECTIONAL); - if (mapping == bad_dma_address) - goto free; - - *dma_handle = mapping; - } else /* non translated slot */ - *dma_handle = virt_to_bus(ret); - + /* set up tces to cover the allocated range */ + mapping = iommu_alloc(dev, tbl, ret, npages, DMA_BIDIRECTIONAL); + if (mapping == bad_dma_address) + goto free; + *dma_handle = mapping; return ret; - free: free_pages((unsigned long)ret, get_order(size)); ret = NULL; @@ -1241,6 +1208,16 @@ static int __init calgary_init(void) goto error; } while (1); + dev = NULL; + for_each_pci_dev(dev) { + struct iommu_table *tbl; + + tbl = find_iommu_table(&dev->dev); + + if (translation_enabled(tbl)) + dev->dev.archdata.dma_ops = &calgary_dma_ops; + } + return ret; error: @@ -1262,6 +1239,7 @@ error: calgary_disable_translation(dev); calgary_free_bus(dev); pci_dev_put(dev); /* Undo calgary_init_one()'s pci_dev_get() */ + dev->dev.archdata.dma_ops = NULL; } while (1); return ret; @@ -1503,6 +1481,10 @@ void __init detect_calgary(void) printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d, " "CONFIG_IOMMU_DEBUG is %s.\n", specified_table_size, debugging ? "enabled" : "disabled"); + + /* swiotlb for devices that aren't behind the Calgary. */ + if (max_pfn > MAX_DMA32_PFN) + swiotlb = 1; } return; @@ -1519,7 +1501,7 @@ int __init calgary_iommu_init(void) { int ret; - if (no_iommu || swiotlb) + if (no_iommu || (swiotlb && !calgary_detected)) return -ENODEV; if (!calgary_detected) @@ -1532,15 +1514,14 @@ int __init calgary_iommu_init(void) if (ret) { printk(KERN_ERR "PCI-DMA: Calgary init failed %d, " "falling back to no_iommu\n", ret); - if (max_pfn > MAX_DMA32_PFN) - printk(KERN_ERR "WARNING more than 4GB of memory, " - "32bit PCI may malfunction.\n"); return ret; } force_iommu = 1; bad_dma_address = 0x0; - dma_ops = &calgary_dma_ops; + /* dma_ops is set to swiotlb or nommu */ + if (!dma_ops) + dma_ops = &nommu_dma_ops; return 0; } -- cgit v1.2.3 From 3ab83521378268044a448113c6aa9a9e245f4d2f Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Fri, 25 Jul 2008 19:45:07 -0700 Subject: kexec jump This patch provides an enhancement to kexec/kdump. It implements the following features: - Backup/restore memory used by the original kernel before/after kexec. - Save/restore CPU state before/after kexec. The features of this patch can be used as a general method to call program in physical mode (paging turning off). This can be used to call BIOS code under Linux. kexec-tools needs to be patched to support kexec jump. The patches and the precompiled kexec can be download from the following URL: source: http://khibernation.sourceforge.net/download/release_v10/kexec-tools/kexec-tools-src_git_kh10.tar.bz2 patches: http://khibernation.sourceforge.net/download/release_v10/kexec-tools/kexec-tools-patches_git_kh10.tar.bz2 binary: http://khibernation.sourceforge.net/download/release_v10/kexec-tools/kexec_git_kh10 Usage example of calling some physical mode code and return: 1. Compile and install patched kernel with following options selected: CONFIG_X86_32=y CONFIG_KEXEC=y CONFIG_PM=y CONFIG_KEXEC_JUMP=y 2. Build patched kexec-tool or download the pre-built one. 3. Build some physical mode executable named such as "phy_mode" 4. Boot kernel compiled in step 1. 5. Load physical mode executable with /sbin/kexec. The shell command line can be as follow: /sbin/kexec --load-preserve-context --args-none phy_mode 6. Call physical mode executable with following shell command line: /sbin/kexec -e Implementation point: To support jumping without reserving memory. One shadow backup page (source page) is allocated for each page used by kexeced code image (destination page). When do kexec_load, the image of kexeced code is loaded into source pages, and before executing, the destination pages and the source pages are swapped, so the contents of destination pages are backupped. Before jumping to the kexeced code image and after jumping back to the original kernel, the destination pages and the source pages are swapped too. C ABI (calling convention) is used as communication protocol between kernel and called code. A flag named KEXEC_PRESERVE_CONTEXT for sys_kexec_load is added to indicate that the loaded kernel image is used for jumping back. Now, only the i386 architecture is supported. Signed-off-by: Huang Ying Acked-by: Vivek Goyal Cc: "Eric W. Biederman" Cc: Pavel Machek Cc: Nigel Cunningham Cc: "Rafael J. Wysocki" Cc: Ingo Molnar Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/machine_kexec_32.c | 27 ++++-- arch/x86/kernel/machine_kexec_64.c | 2 +- arch/x86/kernel/relocate_kernel_32.S | 174 ++++++++++++++++++++++++++++++----- 3 files changed, 172 insertions(+), 31 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c index 8864230d55a..2b67609d0a1 100644 --- a/arch/x86/kernel/machine_kexec_32.c +++ b/arch/x86/kernel/machine_kexec_32.c @@ -22,6 +22,7 @@ #include #include #include +#include #define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE))) static u32 kexec_pgd[1024] PAGE_ALIGNED; @@ -85,10 +86,12 @@ static void load_segments(void) * reboot code buffer to allow us to avoid allocations * later. * - * Currently nothing. + * Make control page executable. */ int machine_kexec_prepare(struct kimage *image) { + if (nx_enabled) + set_pages_x(image->control_code_page, 1); return 0; } @@ -98,16 +101,24 @@ int machine_kexec_prepare(struct kimage *image) */ void machine_kexec_cleanup(struct kimage *image) { + if (nx_enabled) + set_pages_nx(image->control_code_page, 1); } /* * Do not allocate memory (or fail in any way) in machine_kexec(). * We are past the point of no return, committed to rebooting now. */ -NORET_TYPE void machine_kexec(struct kimage *image) +void machine_kexec(struct kimage *image) { unsigned long page_list[PAGES_NR]; void *control_page; + asmlinkage unsigned long + (*relocate_kernel_ptr)(unsigned long indirection_page, + unsigned long control_page, + unsigned long start_address, + unsigned int has_pae, + unsigned int preserve_context); tracer_disable(); @@ -115,10 +126,11 @@ NORET_TYPE void machine_kexec(struct kimage *image) local_irq_disable(); control_page = page_address(image->control_code_page); - memcpy(control_page, relocate_kernel, PAGE_SIZE); + memcpy(control_page, relocate_kernel, PAGE_SIZE/2); + relocate_kernel_ptr = control_page; page_list[PA_CONTROL_PAGE] = __pa(control_page); - page_list[VA_CONTROL_PAGE] = (unsigned long)relocate_kernel; + page_list[VA_CONTROL_PAGE] = (unsigned long)control_page; page_list[PA_PGD] = __pa(kexec_pgd); page_list[VA_PGD] = (unsigned long)kexec_pgd; #ifdef CONFIG_X86_PAE @@ -131,6 +143,7 @@ NORET_TYPE void machine_kexec(struct kimage *image) page_list[VA_PTE_0] = (unsigned long)kexec_pte0; page_list[PA_PTE_1] = __pa(kexec_pte1); page_list[VA_PTE_1] = (unsigned long)kexec_pte1; + page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page) << PAGE_SHIFT); /* The segment registers are funny things, they have both a * visible and an invisible part. Whenever the visible part is @@ -149,8 +162,10 @@ NORET_TYPE void machine_kexec(struct kimage *image) set_idt(phys_to_virt(0),0); /* now call it */ - relocate_kernel((unsigned long)image->head, (unsigned long)page_list, - image->start, cpu_has_pae); + image->start = relocate_kernel_ptr((unsigned long)image->head, + (unsigned long)page_list, + image->start, cpu_has_pae, + image->preserve_context); } void arch_crash_save_vmcoreinfo(void) diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 9dd9262693a..c43caa3a91f 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -181,7 +181,7 @@ void machine_kexec_cleanup(struct kimage *image) * Do not allocate memory (or fail in any way) in machine_kexec(). * We are past the point of no return, committed to rebooting now. */ -NORET_TYPE void machine_kexec(struct kimage *image) +void machine_kexec(struct kimage *image) { unsigned long page_list[PAGES_NR]; void *control_page; diff --git a/arch/x86/kernel/relocate_kernel_32.S b/arch/x86/kernel/relocate_kernel_32.S index c30fe25d470..703310a9902 100644 --- a/arch/x86/kernel/relocate_kernel_32.S +++ b/arch/x86/kernel/relocate_kernel_32.S @@ -20,11 +20,44 @@ #define PAGE_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY) #define PAE_PGD_ATTR (_PAGE_PRESENT) +/* control_page + PAGE_SIZE/2 ~ control_page + PAGE_SIZE * 3/4 are + * used to save some data for jumping back + */ +#define DATA(offset) (PAGE_SIZE/2+(offset)) + +/* Minimal CPU state */ +#define ESP DATA(0x0) +#define CR0 DATA(0x4) +#define CR3 DATA(0x8) +#define CR4 DATA(0xc) + +/* other data */ +#define CP_VA_CONTROL_PAGE DATA(0x10) +#define CP_PA_PGD DATA(0x14) +#define CP_PA_SWAP_PAGE DATA(0x18) +#define CP_PA_BACKUP_PAGES_MAP DATA(0x1c) + .text .align PAGE_SIZE .globl relocate_kernel relocate_kernel: - movl 8(%esp), %ebp /* list of pages */ + /* Save the CPU context, used for jumping back */ + + pushl %ebx + pushl %esi + pushl %edi + pushl %ebp + pushf + + movl 20+8(%esp), %ebp /* list of pages */ + movl PTR(VA_CONTROL_PAGE)(%ebp), %edi + movl %esp, ESP(%edi) + movl %cr0, %eax + movl %eax, CR0(%edi) + movl %cr3, %eax + movl %eax, CR3(%edi) + movl %cr4, %eax + movl %eax, CR4(%edi) #ifdef CONFIG_X86_PAE /* map the control page at its virtual address */ @@ -138,15 +171,25 @@ relocate_kernel: relocate_new_kernel: /* read the arguments and say goodbye to the stack */ - movl 4(%esp), %ebx /* page_list */ - movl 8(%esp), %ebp /* list of pages */ - movl 12(%esp), %edx /* start address */ - movl 16(%esp), %ecx /* cpu_has_pae */ + movl 20+4(%esp), %ebx /* page_list */ + movl 20+8(%esp), %ebp /* list of pages */ + movl 20+12(%esp), %edx /* start address */ + movl 20+16(%esp), %ecx /* cpu_has_pae */ + movl 20+20(%esp), %esi /* preserve_context */ /* zero out flags, and disable interrupts */ pushl $0 popfl + /* save some information for jumping back */ + movl PTR(VA_CONTROL_PAGE)(%ebp), %edi + movl %edi, CP_VA_CONTROL_PAGE(%edi) + movl PTR(PA_PGD)(%ebp), %eax + movl %eax, CP_PA_PGD(%edi) + movl PTR(PA_SWAP_PAGE)(%ebp), %eax + movl %eax, CP_PA_SWAP_PAGE(%edi) + movl %ebx, CP_PA_BACKUP_PAGES_MAP(%edi) + /* get physical address of control page now */ /* this is impossible after page table switch */ movl PTR(PA_CONTROL_PAGE)(%ebp), %edi @@ -197,8 +240,90 @@ identity_mapped: xorl %eax, %eax movl %eax, %cr3 + movl CP_PA_SWAP_PAGE(%edi), %eax + pushl %eax + pushl %ebx + call swap_pages + addl $8, %esp + + /* To be certain of avoiding problems with self-modifying code + * I need to execute a serializing instruction here. + * So I flush the TLB, it's handy, and not processor dependent. + */ + xorl %eax, %eax + movl %eax, %cr3 + + /* set all of the registers to known values */ + /* leave %esp alone */ + + testl %esi, %esi + jnz 1f + xorl %edi, %edi + xorl %eax, %eax + xorl %ebx, %ebx + xorl %ecx, %ecx + xorl %edx, %edx + xorl %esi, %esi + xorl %ebp, %ebp + ret +1: + popl %edx + movl CP_PA_SWAP_PAGE(%edi), %esp + addl $PAGE_SIZE, %esp +2: + call *%edx + + /* get the re-entry point of the peer system */ + movl 0(%esp), %ebp + call 1f +1: + popl %ebx + subl $(1b - relocate_kernel), %ebx + movl CP_VA_CONTROL_PAGE(%ebx), %edi + lea PAGE_SIZE(%ebx), %esp + movl CP_PA_SWAP_PAGE(%ebx), %eax + movl CP_PA_BACKUP_PAGES_MAP(%ebx), %edx + pushl %eax + pushl %edx + call swap_pages + addl $8, %esp + movl CP_PA_PGD(%ebx), %eax + movl %eax, %cr3 + movl %cr0, %eax + orl $(1<<31), %eax + movl %eax, %cr0 + lea PAGE_SIZE(%edi), %esp + movl %edi, %eax + addl $(virtual_mapped - relocate_kernel), %eax + pushl %eax + ret + +virtual_mapped: + movl CR4(%edi), %eax + movl %eax, %cr4 + movl CR3(%edi), %eax + movl %eax, %cr3 + movl CR0(%edi), %eax + movl %eax, %cr0 + movl ESP(%edi), %esp + movl %ebp, %eax + + popf + popl %ebp + popl %edi + popl %esi + popl %ebx + ret + /* Do the copies */ - movl %ebx, %ecx +swap_pages: + movl 8(%esp), %edx + movl 4(%esp), %ecx + pushl %ebp + pushl %ebx + pushl %edi + pushl %esi + movl %ecx, %ebx jmp 1f 0: /* top, read another word from the indirection page */ @@ -226,27 +351,28 @@ identity_mapped: movl %ecx, %esi /* For every source page do a copy */ andl $0xfffff000, %esi + movl %edi, %eax + movl %esi, %ebp + + movl %edx, %edi movl $1024, %ecx rep ; movsl - jmp 0b -3: - - /* To be certain of avoiding problems with self-modifying code - * I need to execute a serializing instruction here. - * So I flush the TLB, it's handy, and not processor dependent. - */ - xorl %eax, %eax - movl %eax, %cr3 + movl %ebp, %edi + movl %eax, %esi + movl $1024, %ecx + rep ; movsl - /* set all of the registers to known values */ - /* leave %esp alone */ + movl %eax, %edi + movl %edx, %esi + movl $1024, %ecx + rep ; movsl - xorl %eax, %eax - xorl %ebx, %ebx - xorl %ecx, %ecx - xorl %edx, %edx - xorl %esi, %esi - xorl %edi, %edi - xorl %ebp, %ebp + lea PAGE_SIZE(%ebp), %esi + jmp 0b +3: + popl %esi + popl %edi + popl %ebx + popl %ebp ret -- cgit v1.2.3 From 89081d17f7bb81d89fa1aa9b70f821c5cf4d39e9 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Fri, 25 Jul 2008 19:45:10 -0700 Subject: kexec jump: save/restore device state This patch implements devices state save/restore before after kexec. This patch together with features in kexec_jump patch can be used for following: - A simple hibernation implementation without ACPI support. You can kexec a hibernating kernel, save the memory image of original system and shutdown the system. When resuming, you restore the memory image of original system via ordinary kexec load then jump back. - Kernel/system debug through making system snapshot. You can make system snapshot, jump back, do some thing and make another system snapshot. - Cooperative multi-kernel/system. With kexec jump, you can switch between several kernels/systems quickly without boot process except the first time. This appears like swap a whole kernel/system out/in. - A general method to call program in physical mode (paging turning off). This can be used to invoke BIOS code under Linux. The following user-space tools can be used with kexec jump: - kexec-tools needs to be patched to support kexec jump. The patches and the precompiled kexec can be download from the following URL: source: http://khibernation.sourceforge.net/download/release_v10/kexec-tools/kexec-tools-src_git_kh10.tar.bz2 patches: http://khibernation.sourceforge.net/download/release_v10/kexec-tools/kexec-tools-patches_git_kh10.tar.bz2 binary: http://khibernation.sourceforge.net/download/release_v10/kexec-tools/kexec_git_kh10 - makedumpfile with patches are used as memory image saving tool, it can exclude free pages from original kernel memory image file. The patches and the precompiled makedumpfile can be download from the following URL: source: http://khibernation.sourceforge.net/download/release_v10/makedumpfile/makedumpfile-src_cvs_kh10.tar.bz2 patches: http://khibernation.sourceforge.net/download/release_v10/makedumpfile/makedumpfile-patches_cvs_kh10.tar.bz2 binary: http://khibernation.sourceforge.net/download/release_v10/makedumpfile/makedumpfile_cvs_kh10 - An initramfs image can be used as the root file system of kexeced kernel. An initramfs image built with "BuildRoot" can be downloaded from the following URL: initramfs image: http://khibernation.sourceforge.net/download/release_v10/initramfs/rootfs_cvs_kh10.gz All user space tools above are included in the initramfs image. Usage example of simple hibernation: 1. Compile and install patched kernel with following options selected: CONFIG_X86_32=y CONFIG_RELOCATABLE=y CONFIG_KEXEC=y CONFIG_CRASH_DUMP=y CONFIG_PM=y CONFIG_HIBERNATION=y CONFIG_KEXEC_JUMP=y 2. Build an initramfs image contains kexec-tool and makedumpfile, or download the pre-built initramfs image, called rootfs.gz in following text. 3. Prepare a partition to save memory image of original kernel, called hibernating partition in following text. 4. Boot kernel compiled in step 1 (kernel A). 5. In the kernel A, load kernel compiled in step 1 (kernel B) with /sbin/kexec. The shell command line can be as follow: /sbin/kexec --load-preserve-context /boot/bzImage --mem-min=0x100000 --mem-max=0xffffff --initrd=rootfs.gz 6. Boot the kernel B with following shell command line: /sbin/kexec -e 7. The kernel B will boot as normal kexec. In kernel B the memory image of kernel A can be saved into hibernating partition as follow: jump_back_entry=`cat /proc/cmdline | tr ' ' '\n' | grep kexec_jump_back_entry | cut -d '='` echo $jump_back_entry > kexec_jump_back_entry cp /proc/vmcore dump.elf Then you can shutdown the machine as normal. 8. Boot kernel compiled in step 1 (kernel C). Use the rootfs.gz as root file system. 9. In kernel C, load the memory image of kernel A as follow: /sbin/kexec -l --args-none --entry=`cat kexec_jump_back_entry` dump.elf 10. Jump back to the kernel A as follow: /sbin/kexec -e Then, kernel A is resumed. Implementation point: To support jumping between two kernels, before jumping to (executing) the new kernel and jumping back to the original kernel, the devices are put into quiescent state, and the state of devices and CPU is saved. After jumping back from kexeced kernel and jumping to the new kernel, the state of devices and CPU are restored accordingly. The devices/CPU state save/restore code of software suspend is called to implement corresponding function. Known issues: - Because the segment number supported by sys_kexec_load is limited, hibernation image with many segments may not be load. This is planned to be eliminated by adding a new flag to sys_kexec_load to make a image can be loaded with multiple sys_kexec_load invoking. Now, only the i386 architecture is supported. Signed-off-by: Huang Ying Acked-by: Vivek Goyal Cc: "Eric W. Biederman" Cc: Pavel Machek Cc: Nigel Cunningham Cc: "Rafael J. Wysocki" Cc: Ingo Molnar Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/machine_kexec_32.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c index 2b67609d0a1..9fe478d9840 100644 --- a/arch/x86/kernel/machine_kexec_32.c +++ b/arch/x86/kernel/machine_kexec_32.c @@ -125,6 +125,18 @@ void machine_kexec(struct kimage *image) /* Interrupts aren't acceptable while we reboot */ local_irq_disable(); + if (image->preserve_context) { +#ifdef CONFIG_X86_IO_APIC + /* We need to put APICs in legacy mode so that we can + * get timer interrupts in second kernel. kexec/kdump + * paths already have calls to disable_IO_APIC() in + * one form or other. kexec jump path also need + * one. + */ + disable_IO_APIC(); +#endif + } + control_page = page_address(image->control_code_page); memcpy(control_page, relocate_kernel, PAGE_SIZE/2); -- cgit v1.2.3 From 583323b9d2f624884a8c9563fb5a4d795f39ab07 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 27 Jul 2008 21:43:11 +0200 Subject: x86: fix cpu hotplug on 32bit commit 3e9704739daf46a8ba6593d749c67b5f7cd633d2 ("x86: boot secondary cpus through initial_code") causes the kernel to crash when a CPU is brought online after the read only sections have been write protected. The write to initial_code in do_boot_cpu() fails. Move inital_code to .cpuinit.data section. Signed-off-by: Thomas Gleixner Acked-by: H. Peter Anvin --- arch/x86/kernel/head_32.S | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index f67e93441ca..a7010c3a377 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -456,9 +456,6 @@ is386: movl $2,%ecx # set MP 1: #endif /* CONFIG_SMP */ jmp *(initial_code) -.align 4 -ENTRY(initial_code) - .long i386_start_kernel /* * We depend on ET to be correct. This checks for 287/387. @@ -601,6 +598,11 @@ ignore_int: #endif iret +.section .cpuinit.data,"wa" +.align 4 +ENTRY(initial_code) + .long i386_start_kernel + .section .text /* * Real beginning of normal "text" segment -- cgit v1.2.3 From e56b3bc7942982ac2589c942fb345e38bc7a341a Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 28 Jul 2008 11:32:33 -0700 Subject: cpu masks: optimize and clean up cpumask_of_cpu() Clean up and optimize cpumask_of_cpu(), by sharing all the zero words. Instead of stupidly generating all possible i=0...NR_CPUS 2^i patterns creating a huge array of constant bitmasks, realize that the zero words can be shared. In other words, on a 64-bit architecture, we only ever need 64 of these arrays - with a different bit set in one single world (with enough zero words around it so that we can create any bitmask by just offsetting in that big array). And then we just put enough zeroes around it that we can point every single cpumask to be one of those things. So when we have 4k CPU's, instead of having 4k arrays (of 4k bits each, with one bit set in each array - 2MB memory total), we have exactly 64 arrays instead, each 8k bits in size (64kB total). And then we just point cpumask(n) to the right position (which we can calculate dynamically). Once we have the right arrays, getting "cpumask(n)" ends up being: static inline const cpumask_t *get_cpu_mask(unsigned int cpu) { const unsigned long *p = cpu_bit_bitmap[1 + cpu % BITS_PER_LONG]; p -= cpu / BITS_PER_LONG; return (const cpumask_t *)p; } This brings other advantages and simplifications as well: - we are not wasting memory that is just filled with a single bit in various different places - we don't need all those games to re-create the arrays in some dense format, because they're already going to be dense enough. if we compile a kernel for up to 4k CPU's, "wasting" that 64kB of memory is a non-issue (especially since by doing this "overlapping" trick we probably get better cache behaviour anyway). [ mingo@elte.hu: Converted Linus's mails into a commit. See: http://lkml.org/lkml/2008/7/27/156 http://lkml.org/lkml/2008/7/28/320 Also applied a family filter - which also has the side-effect of leaving out the bits where Linus calls me an idio... Oh, never mind ;-) ] Signed-off-by: Ingo Molnar Cc: Rusty Russell Cc: Andrew Morton Cc: Al Viro Cc: Mike Travis Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup_percpu.c | 23 ----------------------- 1 file changed, 23 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 1cd53dfcd30..76e305e064f 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -80,26 +80,6 @@ static void __init setup_per_cpu_maps(void) #endif } -#ifdef CONFIG_HAVE_CPUMASK_OF_CPU_MAP -/* - * Replace static cpumask_of_cpu_map in the initdata section, - * with one that's allocated sized by the possible number of cpus. - * - * (requires nr_cpu_ids to be initialized) - */ -static void __init setup_cpumask_of_cpu(void) -{ - int i; - - /* alloc_bootmem zeroes memory */ - cpumask_of_cpu_map = alloc_bootmem_low(sizeof(cpumask_t) * nr_cpu_ids); - for (i = 0; i < nr_cpu_ids; i++) - cpu_set(i, cpumask_of_cpu_map[i]); -} -#else -static inline void setup_cpumask_of_cpu(void) { } -#endif - #ifdef CONFIG_X86_32 /* * Great future not-so-futuristic plan: make i386 and x86_64 do it @@ -199,9 +179,6 @@ void __init setup_per_cpu_areas(void) /* Setup node to cpumask map */ setup_node_to_cpumask_map(); - - /* Setup cpumask_of_cpu map */ - setup_cpumask_of_cpu(); } #endif -- cgit v1.2.3