From 48adcf148c83faa41999fb0b3524299c4e160fd9 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Tue, 20 May 2008 01:03:16 +0300 Subject: [CPUFREQ] cpufreq: remove CVS keywords This patch removes CVS keywords that weren't updated for a long time from comments. Signed-off-by: Adrian Bunk Signed-off-by: Dave Jones --- arch/x86/kernel/cpu/cpufreq/powernow-k7.h | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k7.h b/arch/x86/kernel/cpu/cpufreq/powernow-k7.h index f8a63b3664e..35fb4eaf6e1 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k7.h +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k7.h @@ -1,5 +1,4 @@ /* - * $Id: powernow-k7.h,v 1.2 2003/02/10 18:26:01 davej Exp $ * (C) 2003 Dave Jones. * * Licensed under the terms of the GNU GPL License version 2. -- cgit v1.2.3 From 334ef7a7ab8f80b689a2be95d5e62d2167900865 Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Mon, 12 May 2008 21:21:13 +0200 Subject: x86: use performance variant for_each_cpu_mask_nr Change references from for_each_cpu_mask to for_each_cpu_mask_nr where appropriate Reviewed-by: Paul Jackson Reviewed-by: Christoph Lameter Signed-off-by: Mike Travis Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner commit 2d474871e2fb092eb46a0930aba5442e10eb96cc Author: Mike Travis Date: Mon May 12 21:21:13 2008 +0200 --- arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c | 6 +++--- arch/x86/kernel/cpu/cpufreq/p4-clockmod.c | 6 +++--- arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 8 ++++---- arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c | 10 +++++----- arch/x86/kernel/cpu/cpufreq/speedstep-ich.c | 4 ++-- arch/x86/kernel/cpu/intel_cacheinfo.c | 2 +- arch/x86/kernel/cpu/mcheck/mce_amd_64.c | 4 ++-- arch/x86/kernel/io_apic_64.c | 8 ++++---- arch/x86/kernel/smpboot.c | 8 ++++---- 9 files changed, 28 insertions(+), 28 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c index b0c8208df9f..dd097b83583 100644 --- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c @@ -202,7 +202,7 @@ static void drv_write(struct drv_cmd *cmd) cpumask_t saved_mask = current->cpus_allowed; unsigned int i; - for_each_cpu_mask(i, cmd->mask) { + for_each_cpu_mask_nr(i, cmd->mask) { set_cpus_allowed_ptr(current, &cpumask_of_cpu(i)); do_drv_write(cmd); } @@ -451,7 +451,7 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy, freqs.old = perf->states[perf->state].core_frequency * 1000; freqs.new = data->freq_table[next_state].frequency; - for_each_cpu_mask(i, cmd.mask) { + for_each_cpu_mask_nr(i, cmd.mask) { freqs.cpu = i; cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); } @@ -466,7 +466,7 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy, } } - for_each_cpu_mask(i, cmd.mask) { + for_each_cpu_mask_nr(i, cmd.mask) { freqs.cpu = i; cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); } diff --git a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c index 199e4e05e5d..f1685fb91fb 100644 --- a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c +++ b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c @@ -122,7 +122,7 @@ static int cpufreq_p4_target(struct cpufreq_policy *policy, return 0; /* notifiers */ - for_each_cpu_mask(i, policy->cpus) { + for_each_cpu_mask_nr(i, policy->cpus) { freqs.cpu = i; cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); } @@ -130,11 +130,11 @@ static int cpufreq_p4_target(struct cpufreq_policy *policy, /* run on each logical CPU, see section 13.15.3 of IA32 Intel Architecture Software * Developer's Manual, Volume 3 */ - for_each_cpu_mask(i, policy->cpus) + for_each_cpu_mask_nr(i, policy->cpus) cpufreq_p4_setdc(i, p4clockmod_table[newstate].index); /* notifiers */ - for_each_cpu_mask(i, policy->cpus) { + for_each_cpu_mask_nr(i, policy->cpus) { freqs.cpu = i; cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); } diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c index 46d4034d9f3..06d6eea5e07 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c @@ -966,7 +966,7 @@ static int transition_frequency_fidvid(struct powernow_k8_data *data, unsigned i freqs.old = find_khz_freq_from_fid(data->currfid); freqs.new = find_khz_freq_from_fid(fid); - for_each_cpu_mask(i, *(data->available_cores)) { + for_each_cpu_mask_nr(i, *(data->available_cores)) { freqs.cpu = i; cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); } @@ -974,7 +974,7 @@ static int transition_frequency_fidvid(struct powernow_k8_data *data, unsigned i res = transition_fid_vid(data, fid, vid); freqs.new = find_khz_freq_from_fid(data->currfid); - for_each_cpu_mask(i, *(data->available_cores)) { + for_each_cpu_mask_nr(i, *(data->available_cores)) { freqs.cpu = i; cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); } @@ -997,7 +997,7 @@ static int transition_frequency_pstate(struct powernow_k8_data *data, unsigned i freqs.old = find_khz_freq_from_pstate(data->powernow_table, data->currpstate); freqs.new = find_khz_freq_from_pstate(data->powernow_table, pstate); - for_each_cpu_mask(i, *(data->available_cores)) { + for_each_cpu_mask_nr(i, *(data->available_cores)) { freqs.cpu = i; cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); } @@ -1005,7 +1005,7 @@ static int transition_frequency_pstate(struct powernow_k8_data *data, unsigned i res = transition_pstate(data, pstate); freqs.new = find_khz_freq_from_pstate(data->powernow_table, pstate); - for_each_cpu_mask(i, *(data->available_cores)) { + for_each_cpu_mask_nr(i, *(data->available_cores)) { freqs.cpu = i; cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); } diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c index 908dd347c67..8b0dd6f2a1a 100644 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c +++ b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c @@ -476,7 +476,7 @@ static int centrino_target (struct cpufreq_policy *policy, saved_mask = current->cpus_allowed; first_cpu = 1; cpus_clear(covered_cpus); - for_each_cpu_mask(j, online_policy_cpus) { + for_each_cpu_mask_nr(j, online_policy_cpus) { /* * Support for SMP systems. * Make sure we are running on CPU that wants to change freq @@ -517,7 +517,7 @@ static int centrino_target (struct cpufreq_policy *policy, dprintk("target=%dkHz old=%d new=%d msr=%04x\n", target_freq, freqs.old, freqs.new, msr); - for_each_cpu_mask(k, online_policy_cpus) { + for_each_cpu_mask_nr(k, online_policy_cpus) { freqs.cpu = k; cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); @@ -540,7 +540,7 @@ static int centrino_target (struct cpufreq_policy *policy, preempt_enable(); } - for_each_cpu_mask(k, online_policy_cpus) { + for_each_cpu_mask_nr(k, online_policy_cpus) { freqs.cpu = k; cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); } @@ -554,7 +554,7 @@ static int centrino_target (struct cpufreq_policy *policy, */ if (!cpus_empty(covered_cpus)) { - for_each_cpu_mask(j, covered_cpus) { + for_each_cpu_mask_nr(j, covered_cpus) { set_cpus_allowed_ptr(current, &cpumask_of_cpu(j)); wrmsr(MSR_IA32_PERF_CTL, oldmsr, h); @@ -564,7 +564,7 @@ static int centrino_target (struct cpufreq_policy *policy, tmp = freqs.new; freqs.new = freqs.old; freqs.old = tmp; - for_each_cpu_mask(j, online_policy_cpus) { + for_each_cpu_mask_nr(j, online_policy_cpus) { freqs.cpu = j; cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c index 1b50244b1fd..191f7263c61 100644 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c +++ b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c @@ -279,7 +279,7 @@ static int speedstep_target (struct cpufreq_policy *policy, cpus_allowed = current->cpus_allowed; - for_each_cpu_mask(i, policy->cpus) { + for_each_cpu_mask_nr(i, policy->cpus) { freqs.cpu = i; cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); } @@ -292,7 +292,7 @@ static int speedstep_target (struct cpufreq_policy *policy, /* allow to be run on all CPUs */ set_cpus_allowed_ptr(current, &cpus_allowed); - for_each_cpu_mask(i, policy->cpus) { + for_each_cpu_mask_nr(i, policy->cpus) { freqs.cpu = i; cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); } diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 26d615dcb14..bfade3301c3 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -488,7 +488,7 @@ static void __cpuinit cache_remove_shared_cpu_map(unsigned int cpu, int index) int sibling; this_leaf = CPUID4_INFO_IDX(cpu, index); - for_each_cpu_mask(sibling, this_leaf->shared_cpu_map) { + for_each_cpu_mask_nr(sibling, this_leaf->shared_cpu_map) { sibling_leaf = CPUID4_INFO_IDX(sibling, index); cpu_clear(cpu, sibling_leaf->shared_cpu_map); } diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c index 7c9a813e119..88736cadbaa 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c @@ -527,7 +527,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) if (err) goto out_free; - for_each_cpu_mask(i, b->cpus) { + for_each_cpu_mask_nr(i, b->cpus) { if (i == cpu) continue; @@ -617,7 +617,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank) #endif /* remove all sibling symlinks before unregistering */ - for_each_cpu_mask(i, b->cpus) { + for_each_cpu_mask_nr(i, b->cpus) { if (i == cpu) continue; diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c index ef1a8dfcc52..e2838cbd2ff 100644 --- a/arch/x86/kernel/io_apic_64.c +++ b/arch/x86/kernel/io_apic_64.c @@ -718,7 +718,7 @@ static int __assign_irq_vector(int irq, cpumask_t mask) return 0; } - for_each_cpu_mask(cpu, mask) { + for_each_cpu_mask_nr(cpu, mask) { cpumask_t domain, new_mask; int new_cpu; int vector, offset; @@ -739,7 +739,7 @@ next: continue; if (vector == IA32_SYSCALL_VECTOR) goto next; - for_each_cpu_mask(new_cpu, new_mask) + for_each_cpu_mask_nr(new_cpu, new_mask) if (per_cpu(vector_irq, new_cpu)[vector] != -1) goto next; /* Found one! */ @@ -749,7 +749,7 @@ next: cfg->move_in_progress = 1; cfg->old_domain = cfg->domain; } - for_each_cpu_mask(new_cpu, new_mask) + for_each_cpu_mask_nr(new_cpu, new_mask) per_cpu(vector_irq, new_cpu)[vector] = irq; cfg->vector = vector; cfg->domain = domain; @@ -781,7 +781,7 @@ static void __clear_irq_vector(int irq) vector = cfg->vector; cpus_and(mask, cfg->domain, cpu_online_map); - for_each_cpu_mask(cpu, mask) + for_each_cpu_mask_nr(cpu, mask) per_cpu(vector_irq, cpu)[vector] = -1; cfg->vector = 0; diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 38988491c62..fff8ebaa554 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -487,7 +487,7 @@ void __cpuinit set_cpu_sibling_map(int cpu) cpu_set(cpu, cpu_sibling_setup_map); if (smp_num_siblings > 1) { - for_each_cpu_mask(i, cpu_sibling_setup_map) { + for_each_cpu_mask_nr(i, cpu_sibling_setup_map) { if (c->phys_proc_id == cpu_data(i).phys_proc_id && c->cpu_core_id == cpu_data(i).cpu_core_id) { cpu_set(i, per_cpu(cpu_sibling_map, cpu)); @@ -510,7 +510,7 @@ void __cpuinit set_cpu_sibling_map(int cpu) return; } - for_each_cpu_mask(i, cpu_sibling_setup_map) { + for_each_cpu_mask_nr(i, cpu_sibling_setup_map) { if (per_cpu(cpu_llc_id, cpu) != BAD_APICID && per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i)) { cpu_set(i, c->llc_shared_map); @@ -1298,7 +1298,7 @@ static void remove_siblinginfo(int cpu) int sibling; struct cpuinfo_x86 *c = &cpu_data(cpu); - for_each_cpu_mask(sibling, per_cpu(cpu_core_map, cpu)) { + for_each_cpu_mask_nr(sibling, per_cpu(cpu_core_map, cpu)) { cpu_clear(cpu, per_cpu(cpu_core_map, sibling)); /*/ * last thread sibling in this cpu core going down @@ -1307,7 +1307,7 @@ static void remove_siblinginfo(int cpu) cpu_data(sibling).booted_cores--; } - for_each_cpu_mask(sibling, per_cpu(cpu_sibling_map, cpu)) + for_each_cpu_mask_nr(sibling, per_cpu(cpu_sibling_map, cpu)) cpu_clear(cpu, per_cpu(cpu_sibling_map, sibling)); cpus_clear(per_cpu(cpu_sibling_map, cpu)); cpus_clear(per_cpu(cpu_core_map, cpu)); -- cgit v1.2.3 From 5e5a29bf2624a5984e1c36c3a2481ee91249ec9c Mon Sep 17 00:00:00 2001 From: Anders H Kaseorg Date: Sat, 28 Jun 2008 18:25:41 -0400 Subject: x86, 64-bit: patch paravirt inline replacements when loading modules small speedup. Paravirt replacements were added to the i386 module loader by commit 139ec7c416248b9ea227d21839235344edfee1e0. This adds the same code to the x86_64 module loader. Signed-off-by: Anders Kaseorg Acked-by: "H. Peter Anvin" Signed-off-by: Ingo Molnar --- arch/x86/kernel/module_64.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/module_64.c b/arch/x86/kernel/module_64.c index a888e67f587..0e867676b5a 100644 --- a/arch/x86/kernel/module_64.c +++ b/arch/x86/kernel/module_64.c @@ -150,7 +150,8 @@ int module_finalize(const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs, struct module *me) { - const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL; + const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL, + *para = NULL; char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) { @@ -160,6 +161,8 @@ int module_finalize(const Elf_Ehdr *hdr, alt = s; if (!strcmp(".smp_locks", secstrings + s->sh_name)) locks= s; + if (!strcmp(".parainstructions", secstrings + s->sh_name)) + para = s; } if (alt) { @@ -175,6 +178,11 @@ int module_finalize(const Elf_Ehdr *hdr, tseg, tseg + text->sh_size); } + if (para) { + void *pseg = (void *)para->sh_addr; + apply_paravirt(pseg, pseg + para->sh_size); + } + return module_bug_finalize(hdr, sechdrs, me); } -- cgit v1.2.3 From 299a140dacaa514be5e567b5851c187c42ec38c4 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 8 Jul 2008 14:47:16 +0200 Subject: x86, AMD IOMMU: ignore detection of GART IOMMU One of the last IOMMU updates covered a bug in the AMD IOMMU code. The early detection code does not succeed if the GART is already detected. This patch fixes this. Cc: Robert Richter Cc: Bhavna Sarathy Signed-off-by: Joerg Roedel Cc: iommu@lists.linux-foundation.org Cc: Joerg Roedel Cc: Robert Richter Cc: Bhavna Sarathy Signed-off-by: Ingo Molnar --- arch/x86/kernel/amd_iommu_init.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 2a13e430437..bb0280077a3 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -828,7 +828,7 @@ static int __init early_amd_iommu_detect(struct acpi_table_header *table) void __init amd_iommu_detect(void) { - if (swiotlb || no_iommu || iommu_detected) + if (swiotlb || no_iommu || (iommu_detected && !gart_iommu_aperture)) return; if (acpi_table_parse("IVRS", early_amd_iommu_detect) == 0) { -- cgit v1.2.3 From ab6bc3e343fbe3be4a0f67225e849d0db6b4b7ac Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Sat, 5 Jul 2008 15:53:36 +0400 Subject: x86: idle process - add checking for NULL early param Signed-off-by: Cyrill Gorcunov Cc: akpm@linux-foundation.org Cc: andi@firstfloor.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/process.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index ba370dc8685..58325a6604a 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -164,6 +164,9 @@ void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c) static int __init idle_setup(char *str) { + if (!str) + return -EINVAL; + if (!strcmp(str, "poll")) { printk("using polling idle threads.\n"); pm_idle = poll_idle; -- cgit v1.2.3 From d6cd7effcc5e0047faf15ab0a54c980f1a616a07 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Sat, 5 Jul 2008 15:53:37 +0400 Subject: x86: io delay - add checking for NULL early param Signed-off-by: Cyrill Gorcunov Cc: akpm@linux-foundation.org Cc: andi@firstfloor.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/io_delay.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/io_delay.c b/arch/x86/kernel/io_delay.c index 5921e5f0a64..1c3a66a67f8 100644 --- a/arch/x86/kernel/io_delay.c +++ b/arch/x86/kernel/io_delay.c @@ -103,6 +103,9 @@ void __init io_delay_init(void) static int __init io_delay_param(char *s) { + if (!s) + return -EINVAL; + if (!strcmp(s, "0x80")) io_delay_type = CONFIG_IO_DELAY_TYPE_0X80; else if (!strcmp(s, "0xed")) -- cgit v1.2.3 From 4d8cc874d7ed43eda72765e9c0e141e170fee4f3 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Sat, 5 Jul 2008 15:53:38 +0400 Subject: x86: smpboot maxcpus - add checking for NULL early param Signed-off-by: Cyrill Gorcunov Cc: akpm@linux-foundation.org Cc: andi@firstfloor.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/smpboot.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index fd933b5465b..e47bfac70c3 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1452,7 +1452,8 @@ static int __init parse_maxcpus(char *arg) { extern unsigned int maxcpus; - maxcpus = simple_strtoul(arg, NULL, 0); + if (arg) + maxcpus = simple_strtoul(arg, NULL, 0); return 0; } early_param("maxcpus", parse_maxcpus); -- cgit v1.2.3 From 46a7fa270afbe5fddc6042a598cfe22977b0e989 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Fri, 11 Jul 2008 10:23:42 +0900 Subject: x86: make only GART code include gart.h gart.h has only GART-specific stuff. Only GART code needs it. Other IOMMU stuff should include iommu.h instead of gart.h. Signed-off-by: FUJITA Tomonori Acked-by: Muli Ben-Yehuda Signed-off-by: Ingo Molnar --- arch/x86/kernel/amd_iommu.c | 2 +- arch/x86/kernel/amd_iommu_init.c | 2 +- arch/x86/kernel/aperture_64.c | 1 + arch/x86/kernel/early-quirks.c | 5 +---- arch/x86/kernel/pci-calgary_64.c | 2 +- arch/x86/kernel/pci-dma.c | 2 +- arch/x86/kernel/pci-gart_64.c | 1 + arch/x86/kernel/pci-nommu.c | 2 +- arch/x86/kernel/pci-swiotlb_64.c | 2 +- arch/x86/kernel/setup.c | 2 +- 10 files changed, 10 insertions(+), 11 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index f2766d84c7a..cf2f74bcde5 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -23,7 +23,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 2a13e430437..66438284c69 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -25,7 +25,7 @@ #include #include #include -#include +#include /* * definitions for the ACPI scanning code diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index 9f907806c1a..44e21826db1 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index a4665f37cfc..510b8e36773 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -16,10 +16,7 @@ #include #include #include - -#ifdef CONFIG_GART_IOMMU -#include -#endif +#include static void __init fix_hypertransport_config(int num, int slot, int func) { diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c index 6959b5c45df..151f2d171f7 100644 --- a/arch/x86/kernel/pci-calgary_64.c +++ b/arch/x86/kernel/pci-calgary_64.c @@ -36,7 +36,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 8467ec2320f..f581a4b63b4 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -5,7 +5,7 @@ #include #include -#include +#include #include #include diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index d0d18db5d2a..949ca985deb 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c index aec43d56f49..792b9179eff 100644 --- a/arch/x86/kernel/pci-nommu.c +++ b/arch/x86/kernel/pci-nommu.c @@ -7,7 +7,7 @@ #include #include -#include +#include #include #include diff --git a/arch/x86/kernel/pci-swiotlb_64.c b/arch/x86/kernel/pci-swiotlb_64.c index 82299cd1d04..20df839b9c2 100644 --- a/arch/x86/kernel/pci-swiotlb_64.c +++ b/arch/x86/kernel/pci-swiotlb_64.c @@ -5,7 +5,7 @@ #include #include -#include +#include #include #include diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 86fc2d62427..e5d208934bf 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -96,7 +96,7 @@ #include #include #include -#include +#include #include #include -- cgit v1.2.3 From ac7ded2adb2e43152fe7385ddd53bf45f5c92285 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Fri, 11 Jul 2008 10:23:43 +0900 Subject: x86: remove ifdef CONFIG_GART_IOMMU in pci-dma.c Our way to handle gart_* functions for CONFIG_GART_IOMMU and !CONFIG_GART_IOMMU cases is inconsistent. We have some dummy gart_* functions in !CONFIG_GART_IOMMU case and also use ifdef CONFIG_GART_IOMMU tricks in pci-dma.c to call some gart_* functions in only CONFIG_GART_IOMMU case. This patch removes ifdef CONFIG_GART_IOMMU in pci-dma.c and always use dummy gart_* functions in iommu.h. Signed-off-by: FUJITA Tomonori Acked-by: Muli Ben-Yehuda Signed-off-by: Ingo Molnar --- arch/x86/kernel/pci-dma.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index f581a4b63b4..dd57c5bbe2d 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -114,9 +114,7 @@ void __init pci_iommu_alloc(void) * The order of these functions is important for * fall-back/fail-over reasons */ -#ifdef CONFIG_GART_IOMMU gart_iommu_hole_init(); -#endif #ifdef CONFIG_CALGARY_IOMMU detect_calgary(); @@ -184,9 +182,7 @@ static __init int iommu_setup(char *p) swiotlb = 1; #endif -#ifdef CONFIG_GART_IOMMU gart_parse_options(p); -#endif #ifdef CONFIG_CALGARY_IOMMU if (!strncmp(p, "calgary", 7)) @@ -508,9 +504,7 @@ static int __init pci_iommu_init(void) amd_iommu_init(); -#ifdef CONFIG_GART_IOMMU gart_iommu_init(); -#endif no_iommu_init(); return 0; -- cgit v1.2.3 From b8b48326f312026af12799917383c54c25d05482 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Fri, 11 Jul 2008 10:23:44 +0900 Subject: x86: remove ifdef CONFIG_CALGARY_IOMMU in pci-dma.c asm-x86/calgary.h has dummy calgary_iommu_init() and detect_calgary() in !CONFIG_CALGARY_IOMMU case. So we don't need ifdef CONFIG_CALGARY_IOMMU in pci-dma.c. Signed-off-by: FUJITA Tomonori Acked-by: Muli Ben-Yehuda Cc: Alexis Bruemmer Signed-off-by: Ingo Molnar --- arch/x86/kernel/pci-dma.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index dd57c5bbe2d..f16cbbe424a 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -116,9 +116,7 @@ void __init pci_iommu_alloc(void) */ gart_iommu_hole_init(); -#ifdef CONFIG_CALGARY_IOMMU detect_calgary(); -#endif detect_intel_iommu(); @@ -496,9 +494,7 @@ EXPORT_SYMBOL(dma_free_coherent); static int __init pci_iommu_init(void) { -#ifdef CONFIG_CALGARY_IOMMU calgary_iommu_init(); -#endif intel_iommu_init(); -- cgit v1.2.3 From be54f9d1c8df93c4998e134a306652caaa58f67f Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Fri, 11 Jul 2008 10:23:45 +0900 Subject: x86: remove ifdef CONFIG_SWIOTLB in pci-dma.c As other IOMMUs do, this puts dummy pci_swiotlb_init() in swiotlb.h and remove ifdef CONFIG_SWIOTLB in pci-dma.c. Signed-off-by: FUJITA Tomonori Acked-by: Muli Ben-Yehuda Signed-off-by: Ingo Molnar --- arch/x86/kernel/pci-dma.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index f16cbbe424a..d12945de056 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -122,9 +122,7 @@ void __init pci_iommu_alloc(void) amd_iommu_detect(); -#ifdef CONFIG_SWIOTLB pci_swiotlb_init(); -#endif } #endif -- cgit v1.2.3 From b65233a9c1da587bf19ee161982f4f0ec59941c0 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 11 Jul 2008 17:14:21 +0200 Subject: x86, AMD IOMMU: add comments to the initialization code This patch adds some comments to the AMD IOMMU initialization code to increase its readability. Signed-off-by: Joerg Roedel Cc: iommu@lists.linux-foundation.org Cc: bhavna.sarathy@amd.com Cc: robert.richter@amd.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/amd_iommu_init.c | 214 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 206 insertions(+), 8 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index bb0280077a3..9ddb46d7c52 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -71,6 +71,17 @@ #define ACPI_DEVFLAG_LINT1 0x80 #define ACPI_DEVFLAG_ATSDIS 0x10000000 +/* + * ACPI table definitions + * + * These data structures are laid over the table to parse the important values + * out of it. + */ + +/* + * structure describing one IOMMU in the ACPI table. Typically followed by one + * or more ivhd_entrys. + */ struct ivhd_header { u8 type; u8 flags; @@ -83,6 +94,10 @@ struct ivhd_header { u32 reserved; } __attribute__((packed)); +/* + * A device entry describing which devices a specific IOMMU translates and + * which requestor ids they use. + */ struct ivhd_entry { u8 type; u16 devid; @@ -90,6 +105,10 @@ struct ivhd_entry { u32 ext; } __attribute__((packed)); +/* + * An AMD IOMMU memory definition structure. It defines things like exclusion + * ranges for devices and regions that should be unity mapped. + */ struct ivmd_header { u8 type; u8 flags; @@ -103,22 +122,66 @@ struct ivmd_header { static int __initdata amd_iommu_detected; -u16 amd_iommu_last_bdf; -struct list_head amd_iommu_unity_map; -unsigned amd_iommu_aperture_order = 26; -int amd_iommu_isolate; +u16 amd_iommu_last_bdf; /* largest PCI device id we have + to handle */ +struct list_head amd_iommu_unity_map; /* a list of required unity mappings + we find in ACPI */ +unsigned amd_iommu_aperture_order = 26; /* size of aperture in power of 2 */ +int amd_iommu_isolate; /* if 1, device isolation is enabled */ -struct list_head amd_iommu_list; +struct list_head amd_iommu_list; /* list of all AMD IOMMUs in the + system */ + +/* + * Pointer to the device table which is shared by all AMD IOMMUs + * it is indexed by the PCI device id or the HT unit id and contains + * information about the domain the device belongs to as well as the + * page table root pointer. + */ struct dev_table_entry *amd_iommu_dev_table; + +/* + * The alias table is a driver specific data structure which contains the + * mappings of the PCI device ids to the actual requestor ids on the IOMMU. + * More than one device can share the same requestor id. + */ u16 *amd_iommu_alias_table; + +/* + * The rlookup table is used to find the IOMMU which is responsible + * for a specific device. It is also indexed by the PCI device id. + */ struct amd_iommu **amd_iommu_rlookup_table; + +/* + * The pd table (protection domain table) is used to find the protection domain + * data structure a device belongs to. Indexed with the PCI device id too. + */ struct protection_domain **amd_iommu_pd_table; + +/* + * AMD IOMMU allows up to 2^16 differend protection domains. This is a bitmap + * to know which ones are already in use. + */ unsigned long *amd_iommu_pd_alloc_bitmap; -static u32 dev_table_size; -static u32 alias_table_size; -static u32 rlookup_table_size; +static u32 dev_table_size; /* size of the device table */ +static u32 alias_table_size; /* size of the alias table */ +static u32 rlookup_table_size; /* size if the rlookup table */ + +/**************************************************************************** + * + * AMD IOMMU MMIO register space handling functions + * + * These functions are used to program the IOMMU device registers in + * MMIO space required for that driver. + * + ****************************************************************************/ +/* + * This function set the exclusion range in the IOMMU. DMA accesses to the + * exclusion range are passed through untranslated + */ static void __init iommu_set_exclusion_range(struct amd_iommu *iommu) { u64 start = iommu->exclusion_start & PAGE_MASK; @@ -137,6 +200,7 @@ static void __init iommu_set_exclusion_range(struct amd_iommu *iommu) &entry, sizeof(entry)); } +/* Programs the physical address of the device table into the IOMMU hardware */ static void __init iommu_set_device_table(struct amd_iommu *iommu) { u32 entry; @@ -149,6 +213,7 @@ static void __init iommu_set_device_table(struct amd_iommu *iommu) &entry, sizeof(entry)); } +/* Generic functions to enable/disable certain features of the IOMMU. */ static void __init iommu_feature_enable(struct amd_iommu *iommu, u8 bit) { u32 ctrl; @@ -167,6 +232,7 @@ static void __init iommu_feature_disable(struct amd_iommu *iommu, u8 bit) writel(ctrl, iommu->mmio_base + MMIO_CONTROL_OFFSET); } +/* Function to enable the hardware */ void __init iommu_enable(struct amd_iommu *iommu) { printk(KERN_INFO "AMD IOMMU: Enabling IOMMU at "); @@ -176,6 +242,10 @@ void __init iommu_enable(struct amd_iommu *iommu) iommu_feature_enable(iommu, CONTROL_IOMMU_EN); } +/* + * mapping and unmapping functions for the IOMMU MMIO space. Each AMD IOMMU in + * the system has one. + */ static u8 * __init iommu_map_mmio_space(u64 address) { u8 *ret; @@ -199,6 +269,19 @@ static void __init iommu_unmap_mmio_space(struct amd_iommu *iommu) release_mem_region(iommu->mmio_phys, MMIO_REGION_LENGTH); } +/**************************************************************************** + * + * The functions below belong to the first pass of AMD IOMMU ACPI table + * parsing. In this pass we try to find out the highest device id this + * code has to handle. Upon this information the size of the shared data + * structures is determined later. + * + ****************************************************************************/ + +/* + * This function reads the last device id the IOMMU has to handle from the PCI + * capability header for this IOMMU + */ static int __init find_last_devid_on_pci(int bus, int dev, int fn, int cap_ptr) { u32 cap; @@ -209,6 +292,10 @@ static int __init find_last_devid_on_pci(int bus, int dev, int fn, int cap_ptr) return 0; } +/* + * After reading the highest device id from the IOMMU PCI capability header + * this function looks if there is a higher device id defined in the ACPI table + */ static int __init find_last_devid_from_ivhd(struct ivhd_header *h) { u8 *p = (void *)h, *end = (void *)h; @@ -229,6 +316,7 @@ static int __init find_last_devid_from_ivhd(struct ivhd_header *h) case IVHD_DEV_RANGE_END: case IVHD_DEV_ALIAS: case IVHD_DEV_EXT_SELECT: + /* all the above subfield types refer to device ids */ UPDATE_LAST_BDF(dev->devid); break; default: @@ -242,6 +330,11 @@ static int __init find_last_devid_from_ivhd(struct ivhd_header *h) return 0; } +/* + * Iterate over all IVHD entries in the ACPI table and find the highest device + * id which we need to handle. This is the first of three functions which parse + * the ACPI table. So we check the checksum here. + */ static int __init find_last_devid_acpi(struct acpi_table_header *table) { int i; @@ -277,6 +370,20 @@ static int __init find_last_devid_acpi(struct acpi_table_header *table) return 0; } +/**************************************************************************** + * + * The following functions belong the the code path which parses the ACPI table + * the second time. In this ACPI parsing iteration we allocate IOMMU specific + * data structures, initialize the device/alias/rlookup table and also + * basically initialize the hardware. + * + ****************************************************************************/ + +/* + * Allocates the command buffer. This buffer is per AMD IOMMU. We can + * write commands to that buffer later and the IOMMU will execute them + * asynchronously + */ static u8 * __init alloc_command_buffer(struct amd_iommu *iommu) { u8 *cmd_buf = (u8 *)__get_free_pages(GFP_KERNEL, @@ -307,6 +414,7 @@ static void __init free_command_buffer(struct amd_iommu *iommu) get_order(CMD_BUFFER_SIZE)); } +/* sets a specific bit in the device table entry. */ static void set_dev_entry_bit(u16 devid, u8 bit) { int i = (bit >> 5) & 0x07; @@ -315,6 +423,10 @@ static void set_dev_entry_bit(u16 devid, u8 bit) amd_iommu_dev_table[devid].data[i] |= (1 << _bit); } +/* + * This function takes the device specific flags read from the ACPI + * table and sets up the device table entry with that information + */ static void __init set_dev_entry_from_acpi(u16 devid, u32 flags, u32 ext_flags) { if (flags & ACPI_DEVFLAG_INITPASS) @@ -333,11 +445,16 @@ static void __init set_dev_entry_from_acpi(u16 devid, u32 flags, u32 ext_flags) set_dev_entry_bit(devid, DEV_ENTRY_LINT1_PASS); } +/* Writes the specific IOMMU for a device into the rlookup table */ static void __init set_iommu_for_device(struct amd_iommu *iommu, u16 devid) { amd_iommu_rlookup_table[devid] = iommu; } +/* + * Reads the device exclusion range from ACPI and initialize IOMMU with + * it + */ static void __init set_device_exclusion_range(u16 devid, struct ivmd_header *m) { struct amd_iommu *iommu = amd_iommu_rlookup_table[devid]; @@ -346,12 +463,22 @@ static void __init set_device_exclusion_range(u16 devid, struct ivmd_header *m) return; if (iommu) { + /* + * We only can configure exclusion ranges per IOMMU, not + * per device. But we can enable the exclusion range per + * device. This is done here + */ set_dev_entry_bit(m->devid, DEV_ENTRY_EX); iommu->exclusion_start = m->range_start; iommu->exclusion_length = m->range_length; } } +/* + * This function reads some important data from the IOMMU PCI space and + * initializes the driver data structure with it. It reads the hardware + * capabilities and the first/last device entries + */ static void __init init_iommu_from_pci(struct amd_iommu *iommu) { int bus = PCI_BUS(iommu->devid); @@ -367,6 +494,10 @@ static void __init init_iommu_from_pci(struct amd_iommu *iommu) iommu->last_device = DEVID(MMIO_GET_BUS(range), MMIO_GET_LD(range)); } +/* + * Takes a pointer to an AMD IOMMU entry in the ACPI table and + * initializes the hardware and our data structures with it. + */ static void __init init_iommu_from_acpi(struct amd_iommu *iommu, struct ivhd_header *h) { @@ -467,6 +598,7 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu, } } +/* Initializes the device->iommu mapping for the driver */ static int __init init_iommu_devices(struct amd_iommu *iommu) { u16 i; @@ -494,6 +626,11 @@ static void __init free_iommu_all(void) } } +/* + * This function clues the initialization function for one IOMMU + * together and also allocates the command buffer and programs the + * hardware. It does NOT enable the IOMMU. This is done afterwards. + */ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h) { spin_lock_init(&iommu->lock); @@ -521,6 +658,10 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h) return 0; } +/* + * Iterates over all IOMMU entries in the ACPI table, allocates the + * IOMMU structure and initializes it with init_iommu_one() + */ static int __init init_iommu_all(struct acpi_table_header *table) { u8 *p = (u8 *)table, *end = (u8 *)table; @@ -555,6 +696,14 @@ static int __init init_iommu_all(struct acpi_table_header *table) return 0; } +/**************************************************************************** + * + * The next functions belong to the third pass of parsing the ACPI + * table. In this last pass the memory mapping requirements are + * gathered (like exclusion and unity mapping reanges). + * + ****************************************************************************/ + static void __init free_unity_maps(void) { struct unity_map_entry *entry, *next; @@ -565,6 +714,7 @@ static void __init free_unity_maps(void) } } +/* called when we find an exclusion range definition in ACPI */ static int __init init_exclusion_range(struct ivmd_header *m) { int i; @@ -588,6 +738,7 @@ static int __init init_exclusion_range(struct ivmd_header *m) return 0; } +/* called for unity map ACPI definition */ static int __init init_unity_map_range(struct ivmd_header *m) { struct unity_map_entry *e = 0; @@ -619,6 +770,7 @@ static int __init init_unity_map_range(struct ivmd_header *m) return 0; } +/* iterates over all memory definitions we find in the ACPI table */ static int __init init_memory_definitions(struct acpi_table_header *table) { u8 *p = (u8 *)table, *end = (u8 *)table; @@ -642,6 +794,10 @@ static int __init init_memory_definitions(struct acpi_table_header *table) return 0; } +/* + * This function finally enables all IOMMUs found in the system after + * they have been initialized + */ static void __init enable_iommus(void) { struct amd_iommu *iommu; @@ -678,6 +834,34 @@ static struct sys_device device_amd_iommu = { .cls = &amd_iommu_sysdev_class, }; +/* + * This is the core init function for AMD IOMMU hardware in the system. + * This function is called from the generic x86 DMA layer initialization + * code. + * + * This function basically parses the ACPI table for AMD IOMMU (IVRS) + * three times: + * + * 1 pass) Find the highest PCI device id the driver has to handle. + * Upon this information the size of the data structures is + * determined that needs to be allocated. + * + * 2 pass) Initialize the data structures just allocated with the + * information in the ACPI table about available AMD IOMMUs + * in the system. It also maps the PCI devices in the + * system to specific IOMMUs + * + * 3 pass) After the basic data structures are allocated and + * initialized we update them with information about memory + * remapping requirements parsed out of the ACPI table in + * this last pass. + * + * After that the hardware is initialized and ready to go. In the last + * step we do some Linux specific things like registering the driver in + * the dma_ops interface and initializing the suspend/resume support + * functions. Finally it prints some information about AMD IOMMUs and + * the driver state and enables the hardware. + */ int __init amd_iommu_init(void) { int i, ret = 0; @@ -821,6 +1005,13 @@ free: goto out; } +/**************************************************************************** + * + * Early detect code. This code runs at IOMMU detection time in the DMA + * layer. It just looks if there is an IVRS ACPI table to detect AMD + * IOMMUs + * + ****************************************************************************/ static int __init early_amd_iommu_detect(struct acpi_table_header *table) { return 0; @@ -841,6 +1032,13 @@ void __init amd_iommu_detect(void) } } +/**************************************************************************** + * + * Parsing functions for the AMD IOMMU specific kernel command line + * options. + * + ****************************************************************************/ + static int __init parse_amd_iommu_options(char *str) { for (; *str; ++str) { -- cgit v1.2.3 From 431b2a2015337533f1a9e39a840266a8a2c93144 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 11 Jul 2008 17:14:22 +0200 Subject: x86, AMD IOMMU: add comments to core code This patch adds comments about how the AMD IOMMU core code works for the DMA remapping functionality. Signed-off-by: Joerg Roedel Cc: iommu@lists.linux-foundation.org Cc: bhavna.sarathy@amd.com Cc: robert.richter@amd.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/amd_iommu.c | 201 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 199 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index f2766d84c7a..4bae96ca7c1 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -34,6 +34,9 @@ static DEFINE_RWLOCK(amd_iommu_devtable_lock); +/* + * general struct to manage commands send to an IOMMU + */ struct command { u32 data[4]; }; @@ -41,11 +44,22 @@ struct command { static int dma_ops_unity_map(struct dma_ops_domain *dma_dom, struct unity_map_entry *e); +/* returns !0 if the IOMMU is caching non-present entries in its TLB */ static int iommu_has_npcache(struct amd_iommu *iommu) { return iommu->cap & IOMMU_CAP_NPCACHE; } +/**************************************************************************** + * + * IOMMU command queuing functions + * + ****************************************************************************/ + +/* + * Writes the command to the IOMMUs command buffer and informs the + * hardware about the new command. Must be called with iommu->lock held. + */ static int __iommu_queue_command(struct amd_iommu *iommu, struct command *cmd) { u32 tail, head; @@ -63,6 +77,10 @@ static int __iommu_queue_command(struct amd_iommu *iommu, struct command *cmd) return 0; } +/* + * General queuing function for commands. Takes iommu->lock and calls + * __iommu_queue_command(). + */ static int iommu_queue_command(struct amd_iommu *iommu, struct command *cmd) { unsigned long flags; @@ -75,6 +93,13 @@ static int iommu_queue_command(struct amd_iommu *iommu, struct command *cmd) return ret; } +/* + * This function is called whenever we need to ensure that the IOMMU has + * completed execution of all commands we sent. It sends a + * COMPLETION_WAIT command and waits for it to finish. The IOMMU informs + * us about that by writing a value to a physical address we pass with + * the command. + */ static int iommu_completion_wait(struct amd_iommu *iommu) { int ret; @@ -101,6 +126,9 @@ static int iommu_completion_wait(struct amd_iommu *iommu) return 0; } +/* + * Command send function for invalidating a device table entry + */ static int iommu_queue_inv_dev_entry(struct amd_iommu *iommu, u16 devid) { struct command cmd; @@ -116,6 +144,9 @@ static int iommu_queue_inv_dev_entry(struct amd_iommu *iommu, u16 devid) return iommu_queue_command(iommu, &cmd); } +/* + * Generic command send function for invalidaing TLB entries + */ static int iommu_queue_inv_iommu_pages(struct amd_iommu *iommu, u64 address, u16 domid, int pde, int s) { @@ -127,9 +158,9 @@ static int iommu_queue_inv_iommu_pages(struct amd_iommu *iommu, cmd.data[1] |= domid; cmd.data[2] = LOW_U32(address); cmd.data[3] = HIGH_U32(address); - if (s) + if (s) /* size bit - we flush more than one 4kb page */ cmd.data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK; - if (pde) + if (pde) /* PDE bit - we wan't flush everything not only the PTEs */ cmd.data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK; iommu->need_sync = 1; @@ -137,6 +168,11 @@ static int iommu_queue_inv_iommu_pages(struct amd_iommu *iommu, return iommu_queue_command(iommu, &cmd); } +/* + * TLB invalidation function which is called from the mapping functions. + * It invalidates a single PTE if the range to flush is within a single + * page. Otherwise it flushes the whole TLB of the IOMMU. + */ static int iommu_flush_pages(struct amd_iommu *iommu, u16 domid, u64 address, size_t size) { @@ -159,6 +195,20 @@ static int iommu_flush_pages(struct amd_iommu *iommu, u16 domid, return 0; } +/**************************************************************************** + * + * The functions below are used the create the page table mappings for + * unity mapped regions. + * + ****************************************************************************/ + +/* + * Generic mapping functions. It maps a physical address into a DMA + * address space. It allocates the page table pages if necessary. + * In the future it can be extended to a generic mapping function + * supporting all features of AMD IOMMU page tables like level skipping + * and full 64 bit address spaces. + */ static int iommu_map(struct protection_domain *dom, unsigned long bus_addr, unsigned long phys_addr, @@ -209,6 +259,10 @@ static int iommu_map(struct protection_domain *dom, return 0; } +/* + * This function checks if a specific unity mapping entry is needed for + * this specific IOMMU. + */ static int iommu_for_unity_map(struct amd_iommu *iommu, struct unity_map_entry *entry) { @@ -223,6 +277,12 @@ static int iommu_for_unity_map(struct amd_iommu *iommu, return 0; } +/* + * Init the unity mappings for a specific IOMMU in the system + * + * Basically iterates over all unity mapping entries and applies them to + * the default domain DMA of that IOMMU if necessary. + */ static int iommu_init_unity_mappings(struct amd_iommu *iommu) { struct unity_map_entry *entry; @@ -239,6 +299,10 @@ static int iommu_init_unity_mappings(struct amd_iommu *iommu) return 0; } +/* + * This function actually applies the mapping to the page table of the + * dma_ops domain. + */ static int dma_ops_unity_map(struct dma_ops_domain *dma_dom, struct unity_map_entry *e) { @@ -261,6 +325,9 @@ static int dma_ops_unity_map(struct dma_ops_domain *dma_dom, return 0; } +/* + * Inits the unity mappings required for a specific device + */ static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom, u16 devid) { @@ -278,12 +345,26 @@ static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom, return 0; } +/**************************************************************************** + * + * The next functions belong to the address allocator for the dma_ops + * interface functions. They work like the allocators in the other IOMMU + * drivers. Its basically a bitmap which marks the allocated pages in + * the aperture. Maybe it could be enhanced in the future to a more + * efficient allocator. + * + ****************************************************************************/ static unsigned long dma_mask_to_pages(unsigned long mask) { return (mask >> PAGE_SHIFT) + (PAGE_ALIGN(mask & ~PAGE_MASK) >> PAGE_SHIFT); } +/* + * The address allocator core function. + * + * called with domain->lock held + */ static unsigned long dma_ops_alloc_addresses(struct device *dev, struct dma_ops_domain *dom, unsigned int pages) @@ -317,6 +398,11 @@ static unsigned long dma_ops_alloc_addresses(struct device *dev, return address; } +/* + * The address free function. + * + * called with domain->lock held + */ static void dma_ops_free_addresses(struct dma_ops_domain *dom, unsigned long address, unsigned int pages) @@ -325,6 +411,16 @@ static void dma_ops_free_addresses(struct dma_ops_domain *dom, iommu_area_free(dom->bitmap, address, pages); } +/**************************************************************************** + * + * The next functions belong to the domain allocation. A domain is + * allocated for every IOMMU as the default domain. If device isolation + * is enabled, every device get its own domain. The most important thing + * about domains is the page table mapping the DMA address space they + * contain. + * + ****************************************************************************/ + static u16 domain_id_alloc(void) { unsigned long flags; @@ -342,6 +438,10 @@ static u16 domain_id_alloc(void) return id; } +/* + * Used to reserve address ranges in the aperture (e.g. for exclusion + * ranges. + */ static void dma_ops_reserve_addresses(struct dma_ops_domain *dom, unsigned long start_page, unsigned int pages) @@ -382,6 +482,10 @@ static void dma_ops_free_pagetable(struct dma_ops_domain *dma_dom) free_page((unsigned long)p1); } +/* + * Free a domain, only used if something went wrong in the + * allocation path and we need to free an already allocated page table + */ static void dma_ops_domain_free(struct dma_ops_domain *dom) { if (!dom) @@ -396,6 +500,11 @@ static void dma_ops_domain_free(struct dma_ops_domain *dom) kfree(dom); } +/* + * Allocates a new protection domain usable for the dma_ops functions. + * It also intializes the page table and the address allocator data + * structures required for the dma_ops interface + */ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu, unsigned order) { @@ -436,6 +545,7 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu, dma_dom->bitmap[0] = 1; dma_dom->next_bit = 0; + /* Intialize the exclusion range if necessary */ if (iommu->exclusion_start && iommu->exclusion_start < dma_dom->aperture_size) { unsigned long startpage = iommu->exclusion_start >> PAGE_SHIFT; @@ -444,6 +554,11 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu, dma_ops_reserve_addresses(dma_dom, startpage, pages); } + /* + * At the last step, build the page tables so we don't need to + * allocate page table pages in the dma_ops mapping/unmapping + * path. + */ num_pte_pages = dma_dom->aperture_size / (PAGE_SIZE * 512); dma_dom->pte_pages = kzalloc(num_pte_pages * sizeof(void *), GFP_KERNEL); @@ -472,6 +587,10 @@ free_dma_dom: return NULL; } +/* + * Find out the protection domain structure for a given PCI device. This + * will give us the pointer to the page table root for example. + */ static struct protection_domain *domain_for_device(u16 devid) { struct protection_domain *dom; @@ -484,6 +603,10 @@ static struct protection_domain *domain_for_device(u16 devid) return dom; } +/* + * If a device is not yet associated with a domain, this function does + * assigns it visible for the hardware + */ static void set_device_domain(struct amd_iommu *iommu, struct protection_domain *domain, u16 devid) @@ -508,6 +631,19 @@ static void set_device_domain(struct amd_iommu *iommu, iommu->need_sync = 1; } +/***************************************************************************** + * + * The next functions belong to the dma_ops mapping/unmapping code. + * + *****************************************************************************/ + +/* + * In the dma_ops path we only have the struct device. This function + * finds the corresponding IOMMU, the protection domain and the + * requestor id for a given device. + * If the device is not yet associated with a domain this is also done + * in this function. + */ static int get_device_resources(struct device *dev, struct amd_iommu **iommu, struct protection_domain **domain, @@ -522,6 +658,7 @@ static int get_device_resources(struct device *dev, pcidev = to_pci_dev(dev); _bdf = (pcidev->bus->number << 8) | pcidev->devfn; + /* device not translated by any IOMMU in the system? */ if (_bdf >= amd_iommu_last_bdf) { *iommu = NULL; *domain = NULL; @@ -547,6 +684,10 @@ static int get_device_resources(struct device *dev, return 1; } +/* + * This is the generic map function. It maps one 4kb page at paddr to + * the given address in the DMA address space for the domain. + */ static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu, struct dma_ops_domain *dom, unsigned long address, @@ -578,6 +719,9 @@ static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu, return (dma_addr_t)address; } +/* + * The generic unmapping function for on page in the DMA address space. + */ static void dma_ops_domain_unmap(struct amd_iommu *iommu, struct dma_ops_domain *dom, unsigned long address) @@ -597,6 +741,12 @@ static void dma_ops_domain_unmap(struct amd_iommu *iommu, *pte = 0ULL; } +/* + * This function contains common code for mapping of a physically + * contiguous memory region into DMA address space. It is uses by all + * mapping functions provided by this IOMMU driver. + * Must be called with the domain lock held. + */ static dma_addr_t __map_single(struct device *dev, struct amd_iommu *iommu, struct dma_ops_domain *dma_dom, @@ -628,6 +778,10 @@ out: return address; } +/* + * Does the reverse of the __map_single function. Must be called with + * the domain lock held too + */ static void __unmap_single(struct amd_iommu *iommu, struct dma_ops_domain *dma_dom, dma_addr_t dma_addr, @@ -652,6 +806,9 @@ static void __unmap_single(struct amd_iommu *iommu, dma_ops_free_addresses(dma_dom, dma_addr, pages); } +/* + * The exported map_single function for dma_ops. + */ static dma_addr_t map_single(struct device *dev, phys_addr_t paddr, size_t size, int dir) { @@ -664,6 +821,7 @@ static dma_addr_t map_single(struct device *dev, phys_addr_t paddr, get_device_resources(dev, &iommu, &domain, &devid); if (iommu == NULL || domain == NULL) + /* device not handled by any AMD IOMMU */ return (dma_addr_t)paddr; spin_lock_irqsave(&domain->lock, flags); @@ -683,6 +841,9 @@ out: return addr; } +/* + * The exported unmap_single function for dma_ops. + */ static void unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size, int dir) { @@ -692,6 +853,7 @@ static void unmap_single(struct device *dev, dma_addr_t dma_addr, u16 devid; if (!get_device_resources(dev, &iommu, &domain, &devid)) + /* device not handled by any AMD IOMMU */ return; spin_lock_irqsave(&domain->lock, flags); @@ -706,6 +868,10 @@ static void unmap_single(struct device *dev, dma_addr_t dma_addr, spin_unlock_irqrestore(&domain->lock, flags); } +/* + * This is a special map_sg function which is used if we should map a + * device which is not handled by an AMD IOMMU in the system. + */ static int map_sg_no_iommu(struct device *dev, struct scatterlist *sglist, int nelems, int dir) { @@ -720,6 +886,10 @@ static int map_sg_no_iommu(struct device *dev, struct scatterlist *sglist, return nelems; } +/* + * The exported map_sg function for dma_ops (handles scatter-gather + * lists). + */ static int map_sg(struct device *dev, struct scatterlist *sglist, int nelems, int dir) { @@ -775,6 +945,10 @@ unmap: goto out; } +/* + * The exported map_sg function for dma_ops (handles scatter-gather + * lists). + */ static void unmap_sg(struct device *dev, struct scatterlist *sglist, int nelems, int dir) { @@ -804,6 +978,9 @@ static void unmap_sg(struct device *dev, struct scatterlist *sglist, spin_unlock_irqrestore(&domain->lock, flags); } +/* + * The exported alloc_coherent function for dma_ops. + */ static void *alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr, gfp_t flag) { @@ -851,6 +1028,11 @@ out: return virt_addr; } +/* + * The exported free_coherent function for dma_ops. + * FIXME: fix the generic x86 DMA layer so that it actually calls that + * function. + */ static void free_coherent(struct device *dev, size_t size, void *virt_addr, dma_addr_t dma_addr) { @@ -879,6 +1061,8 @@ free_mem: } /* + * The function for pre-allocating protection domains. + * * If the driver core informs the DMA layer if a driver grabs a device * we don't need to preallocate the protection domains anymore. * For now we have to. @@ -921,12 +1105,20 @@ static struct dma_mapping_ops amd_iommu_dma_ops = { .unmap_sg = unmap_sg, }; +/* + * The function which clues the AMD IOMMU driver into dma_ops. + */ int __init amd_iommu_init_dma_ops(void) { struct amd_iommu *iommu; int order = amd_iommu_aperture_order; int ret; + /* + * first allocate a default protection domain for every IOMMU we + * found in the system. Devices not assigned to any other + * protection domain will be assigned to the default one. + */ list_for_each_entry(iommu, &amd_iommu_list, list) { iommu->default_dom = dma_ops_domain_alloc(iommu, order); if (iommu->default_dom == NULL) @@ -936,6 +1128,10 @@ int __init amd_iommu_init_dma_ops(void) goto free_domains; } + /* + * If device isolation is enabled, pre-allocate the protection + * domains for each device. + */ if (amd_iommu_isolate) prealloc_protection_domains(); @@ -947,6 +1143,7 @@ int __init amd_iommu_init_dma_ops(void) gart_iommu_aperture = 0; #endif + /* Make the driver finally visible to the drivers */ dma_ops = &amd_iommu_dma_ops; return 0; -- cgit v1.2.3 From 8ea80d783efd0c50577ec8d69757ae54c408eacd Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 11 Jul 2008 17:14:23 +0200 Subject: x86, AMD IOMMU: replace HIGH_U32 macro with upper_32_bits function Removes a driver specific macro and replaces it with a generic function already available in Linux. Signed-off-by: Joerg Roedel Cc: iommu@lists.linux-foundation.org Cc: bhavna.sarathy@amd.com Cc: robert.richter@amd.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/amd_iommu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 4bae96ca7c1..9098f047c1a 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -109,7 +109,7 @@ static int iommu_completion_wait(struct amd_iommu *iommu) memset(&cmd, 0, sizeof(cmd)); cmd.data[0] = LOW_U32(ready_phys) | CMD_COMPL_WAIT_STORE_MASK; - cmd.data[1] = HIGH_U32(ready_phys); + cmd.data[1] = upper_32_bits(ready_phys); cmd.data[2] = 1; /* value written to 'ready' */ CMD_SET_TYPE(&cmd, CMD_COMPL_WAIT); @@ -157,7 +157,7 @@ static int iommu_queue_inv_iommu_pages(struct amd_iommu *iommu, CMD_SET_TYPE(&cmd, CMD_INV_IOMMU_PAGES); cmd.data[1] |= domid; cmd.data[2] = LOW_U32(address); - cmd.data[3] = HIGH_U32(address); + cmd.data[3] = upper_32_bits(address); if (s) /* size bit - we flush more than one 4kb page */ cmd.data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK; if (pde) /* PDE bit - we wan't flush everything not only the PTEs */ -- cgit v1.2.3 From 208ec8c94d818a3def0b424958493728871716d1 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 11 Jul 2008 17:14:24 +0200 Subject: x86, AMD IOMMU: replace UPDATE_LAST_BDF macro with a function This patch replaces the UPDATE_LAST_BDF macro in the init code with the update_last_devid function. Signed-off-by: Joerg Roedel Cc: iommu@lists.linux-foundation.org Cc: bhavna.sarathy@amd.com Cc: robert.richter@amd.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/amd_iommu_init.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 9ddb46d7c52..6e1c8ffc0c5 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -30,11 +30,6 @@ /* * definitions for the ACPI scanning code */ -#define UPDATE_LAST_BDF(x) do {\ - if ((x) > amd_iommu_last_bdf) \ - amd_iommu_last_bdf = (x); \ - } while (0); - #define DEVID(bus, devfn) (((bus) << 8) | (devfn)) #define PCI_BUS(x) (((x) >> 8) & 0xff) #define IVRS_HEADER_LENGTH 48 @@ -169,6 +164,12 @@ static u32 dev_table_size; /* size of the device table */ static u32 alias_table_size; /* size of the alias table */ static u32 rlookup_table_size; /* size if the rlookup table */ +static inline void update_last_devid(u16 devid) +{ + if (devid > amd_iommu_last_bdf) + amd_iommu_last_bdf = devid; +} + /**************************************************************************** * * AMD IOMMU MMIO register space handling functions @@ -287,7 +288,7 @@ static int __init find_last_devid_on_pci(int bus, int dev, int fn, int cap_ptr) u32 cap; cap = read_pci_config(bus, dev, fn, cap_ptr+MMIO_RANGE_OFFSET); - UPDATE_LAST_BDF(DEVID(MMIO_GET_BUS(cap), MMIO_GET_LD(cap))); + update_last_devid(DEVID(MMIO_GET_BUS(cap), MMIO_GET_LD(cap))); return 0; } @@ -317,7 +318,7 @@ static int __init find_last_devid_from_ivhd(struct ivhd_header *h) case IVHD_DEV_ALIAS: case IVHD_DEV_EXT_SELECT: /* all the above subfield types refer to device ids */ - UPDATE_LAST_BDF(dev->devid); + update_last_devid(dev->devid); break; default: break; -- cgit v1.2.3 From c571484e53f3e1d90bc5374528580c7419d28d4c Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 11 Jul 2008 17:14:25 +0200 Subject: x86, AMD IOMMU: replace TBL_SIZE macro with a function This patch converts the TBL_SIZE macro in the init code to a function. Signed-off-by: Joerg Roedel Cc: iommu@lists.linux-foundation.org Cc: bhavna.sarathy@amd.com Cc: robert.richter@amd.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/amd_iommu_init.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 6e1c8ffc0c5..1f148393cf7 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -33,7 +33,6 @@ #define DEVID(bus, devfn) (((bus) << 8) | (devfn)) #define PCI_BUS(x) (((x) >> 8) & 0xff) #define IVRS_HEADER_LENGTH 48 -#define TBL_SIZE(x) (1 << (PAGE_SHIFT + get_order(amd_iommu_last_bdf * (x)))) #define ACPI_IVHD_TYPE 0x10 #define ACPI_IVMD_TYPE_ALL 0x20 @@ -170,6 +169,14 @@ static inline void update_last_devid(u16 devid) amd_iommu_last_bdf = devid; } +static inline unsigned long tbl_size(int entry_size) +{ + unsigned shift = PAGE_SHIFT + + get_order(amd_iommu_last_bdf * entry_size); + + return 1UL << shift; +} + /**************************************************************************** * * AMD IOMMU MMIO register space handling functions @@ -884,9 +891,9 @@ int __init amd_iommu_init(void) if (acpi_table_parse("IVRS", find_last_devid_acpi) != 0) return -ENODEV; - dev_table_size = TBL_SIZE(DEV_TABLE_ENTRY_SIZE); - alias_table_size = TBL_SIZE(ALIAS_TABLE_ENTRY_SIZE); - rlookup_table_size = TBL_SIZE(RLOOKUP_TABLE_ENTRY_SIZE); + dev_table_size = tbl_size(DEV_TABLE_ENTRY_SIZE); + alias_table_size = tbl_size(ALIAS_TABLE_ENTRY_SIZE); + rlookup_table_size = tbl_size(RLOOKUP_TABLE_ENTRY_SIZE); ret = -ENOMEM; -- cgit v1.2.3 From 9a836de0c9944c42d006ec241712c72e74737c73 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 11 Jul 2008 17:14:26 +0200 Subject: x86, AMD IOMMU: remove unnecessary free checks from init code This patch removes unnecessary checks before memory is released. Signed-off-by: Joerg Roedel Cc: iommu@lists.linux-foundation.org Cc: bhavna.sarathy@amd.com Cc: robert.richter@amd.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/amd_iommu_init.c | 27 ++++++++++----------------- 1 file changed, 10 insertions(+), 17 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 1f148393cf7..0f5a9115a69 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -417,9 +417,7 @@ static u8 * __init alloc_command_buffer(struct amd_iommu *iommu) static void __init free_command_buffer(struct amd_iommu *iommu) { - if (iommu->cmd_buf) - free_pages((unsigned long)iommu->cmd_buf, - get_order(CMD_BUFFER_SIZE)); + free_pages((unsigned long)iommu->cmd_buf, get_order(CMD_BUFFER_SIZE)); } /* sets a specific bit in the device table entry. */ @@ -987,24 +985,19 @@ out: return ret; free: - if (amd_iommu_pd_alloc_bitmap) - free_pages((unsigned long)amd_iommu_pd_alloc_bitmap, 1); + free_pages((unsigned long)amd_iommu_pd_alloc_bitmap, 1); - if (amd_iommu_pd_table) - free_pages((unsigned long)amd_iommu_pd_table, - get_order(rlookup_table_size)); + free_pages((unsigned long)amd_iommu_pd_table, + get_order(rlookup_table_size)); - if (amd_iommu_rlookup_table) - free_pages((unsigned long)amd_iommu_rlookup_table, - get_order(rlookup_table_size)); + free_pages((unsigned long)amd_iommu_rlookup_table, + get_order(rlookup_table_size)); - if (amd_iommu_alias_table) - free_pages((unsigned long)amd_iommu_alias_table, - get_order(alias_table_size)); + free_pages((unsigned long)amd_iommu_alias_table, + get_order(alias_table_size)); - if (amd_iommu_dev_table) - free_pages((unsigned long)amd_iommu_dev_table, - get_order(dev_table_size)); + free_pages((unsigned long)amd_iommu_dev_table, + get_order(dev_table_size)); free_iommu_all(); -- cgit v1.2.3 From 136f78a19cf94d469f31a4009c7c0ac2301fbbf0 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 11 Jul 2008 17:14:27 +0200 Subject: x86, AMD IOMMU: add an emergency exit to the completion wait loop To make the loop waiting for the completion wait command not wait forever this patch adds a limit of cycles that loop. Signed-off-by: Joerg Roedel Cc: iommu@lists.linux-foundation.org Cc: bhavna.sarathy@amd.com Cc: robert.richter@amd.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/amd_iommu.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 9098f047c1a..7fa2d5d57dd 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -32,6 +32,8 @@ #define to_pages(addr, size) \ (round_up(((addr) & ~PAGE_MASK) + (size), PAGE_SIZE) >> PAGE_SHIFT) +#define EXIT_LOOP_COUNT 10000000 + static DEFINE_RWLOCK(amd_iommu_devtable_lock); /* @@ -106,6 +108,7 @@ static int iommu_completion_wait(struct amd_iommu *iommu) struct command cmd; volatile u64 ready = 0; unsigned long ready_phys = virt_to_phys(&ready); + unsigned long i = 0; memset(&cmd, 0, sizeof(cmd)); cmd.data[0] = LOW_U32(ready_phys) | CMD_COMPL_WAIT_STORE_MASK; @@ -120,8 +123,13 @@ static int iommu_completion_wait(struct amd_iommu *iommu) if (ret) return ret; - while (!ready) + while (!ready && (i < EXIT_LOOP_COUNT)) { + ++i; cpu_relax(); + } + + if (unlikely((i == EXIT_LOOP_COUNT) && printk_ratelimit())) + printk(KERN_WARNING "AMD IOMMU: Completion wait loop failed\n"); return 0; } -- cgit v1.2.3 From d64495366ff78fdbd5bd3176a7ada2f0c2cbfba6 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 11 Jul 2008 17:14:28 +0200 Subject: x86, AMD IOMMU: rename struct command to iommu_cmd This patch gives the struct command a more descriptive and not so generic name. Signed-off-by: Joerg Roedel Cc: iommu@lists.linux-foundation.org Cc: bhavna.sarathy@amd.com Cc: robert.richter@amd.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/amd_iommu.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 7fa2d5d57dd..dec10e1a397 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -39,7 +39,7 @@ static DEFINE_RWLOCK(amd_iommu_devtable_lock); /* * general struct to manage commands send to an IOMMU */ -struct command { +struct iommu_cmd { u32 data[4]; }; @@ -62,7 +62,7 @@ static int iommu_has_npcache(struct amd_iommu *iommu) * Writes the command to the IOMMUs command buffer and informs the * hardware about the new command. Must be called with iommu->lock held. */ -static int __iommu_queue_command(struct amd_iommu *iommu, struct command *cmd) +static int __iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd) { u32 tail, head; u8 *target; @@ -83,7 +83,7 @@ static int __iommu_queue_command(struct amd_iommu *iommu, struct command *cmd) * General queuing function for commands. Takes iommu->lock and calls * __iommu_queue_command(). */ -static int iommu_queue_command(struct amd_iommu *iommu, struct command *cmd) +static int iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd) { unsigned long flags; int ret; @@ -105,7 +105,7 @@ static int iommu_queue_command(struct amd_iommu *iommu, struct command *cmd) static int iommu_completion_wait(struct amd_iommu *iommu) { int ret; - struct command cmd; + struct iommu_cmd cmd; volatile u64 ready = 0; unsigned long ready_phys = virt_to_phys(&ready); unsigned long i = 0; @@ -139,7 +139,7 @@ static int iommu_completion_wait(struct amd_iommu *iommu) */ static int iommu_queue_inv_dev_entry(struct amd_iommu *iommu, u16 devid) { - struct command cmd; + struct iommu_cmd cmd; BUG_ON(iommu == NULL); @@ -158,7 +158,7 @@ static int iommu_queue_inv_dev_entry(struct amd_iommu *iommu, u16 devid) static int iommu_queue_inv_iommu_pages(struct amd_iommu *iommu, u64 address, u16 domid, int pde, int s) { - struct command cmd; + struct iommu_cmd cmd; memset(&cmd, 0, sizeof(cmd)); address &= PAGE_MASK; -- cgit v1.2.3 From d0312b2142ac7665031755c1cc3dba827d4eb711 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 11 Jul 2008 17:14:29 +0200 Subject: x86, AMD IOMMU: remove unneeded initializations from command buffer allocation This patch removes an unneeded initialization from the alloc_command_buffer function and replaces a memset with __GFP_ZERO. Signed-off-by: Joerg Roedel Cc: iommu@lists.linux-foundation.org Cc: bhavna.sarathy@amd.com Cc: robert.richter@amd.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/amd_iommu_init.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 0f5a9115a69..0124995c7b5 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -394,17 +394,15 @@ static int __init find_last_devid_acpi(struct acpi_table_header *table) */ static u8 * __init alloc_command_buffer(struct amd_iommu *iommu) { - u8 *cmd_buf = (u8 *)__get_free_pages(GFP_KERNEL, + u8 *cmd_buf = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(CMD_BUFFER_SIZE)); - u64 entry = 0; + u64 entry; if (cmd_buf == NULL) return NULL; iommu->cmd_buf_size = CMD_BUFFER_SIZE; - memset(cmd_buf, 0, CMD_BUFFER_SIZE); - entry = (u64)virt_to_phys(cmd_buf); entry |= MMIO_CMD_SIZE_512; memcpy_toio(iommu->mmio_base + MMIO_CMD_BUF_OFFSET, -- cgit v1.2.3 From 58a3bee567b588a84cdde05fecc45439b396362c Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 11 Jul 2008 17:14:30 +0200 Subject: x86, AMD IOMMU: use true/false instead of 0/1 for bool value This patch replaces the integer values used for the bool variable in ACPI scanning code with true and false. Signed-off-by: Joerg Roedel Cc: iommu@lists.linux-foundation.org Cc: bhavna.sarathy@amd.com Cc: robert.richter@amd.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/amd_iommu_init.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 0124995c7b5..316fe2eaeef 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -509,7 +509,7 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu, u8 *end = p, flags = 0; u16 dev_i, devid = 0, devid_start = 0, devid_to = 0; u32 ext_flags = 0; - bool alias = 0; + bool alias = false; struct ivhd_entry *e; /* @@ -559,7 +559,7 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu, devid_start = e->devid; flags = e->flags; ext_flags = 0; - alias = 0; + alias = false; break; case IVHD_DEV_ALIAS: devid = e->devid; @@ -572,7 +572,7 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu, flags = e->flags; devid_to = e->ext >> 8; ext_flags = 0; - alias = 1; + alias = true; break; case IVHD_DEV_EXT_SELECT: devid = e->devid; @@ -582,7 +582,7 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu, devid_start = e->devid; flags = e->flags; ext_flags = e->ext; - alias = 0; + alias = false; break; case IVHD_DEV_RANGE_END: devid = e->devid; -- cgit v1.2.3 From 2e22847fbe05f2543ccebd0c2df94d9cf3c52aa5 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 11 Jul 2008 17:14:31 +0200 Subject: x86, AMD IOMMU: do runtime list initialization at compile time This patch changes the list initialization for the iommu list and the unity map list from runtime to compile time. Signed-off-by: Joerg Roedel Cc: iommu@lists.linux-foundation.org Cc: bhavna.sarathy@amd.com Cc: robert.richter@amd.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/amd_iommu_init.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 316fe2eaeef..0c247032308 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -118,12 +118,12 @@ static int __initdata amd_iommu_detected; u16 amd_iommu_last_bdf; /* largest PCI device id we have to handle */ -struct list_head amd_iommu_unity_map; /* a list of required unity mappings +LIST_HEAD(amd_iommu_unity_map); /* a list of required unity mappings we find in ACPI */ unsigned amd_iommu_aperture_order = 26; /* size of aperture in power of 2 */ int amd_iommu_isolate; /* if 1, device isolation is enabled */ -struct list_head amd_iommu_list; /* list of all AMD IOMMUs in the +LIST_HEAD(amd_iommu_list); /* list of all AMD IOMMUs in the system */ /* @@ -673,8 +673,6 @@ static int __init init_iommu_all(struct acpi_table_header *table) struct amd_iommu *iommu; int ret; - INIT_LIST_HEAD(&amd_iommu_list); - end += table->length; p += IVRS_HEADER_LENGTH; @@ -780,8 +778,6 @@ static int __init init_memory_definitions(struct acpi_table_header *table) u8 *p = (u8 *)table, *end = (u8 *)table; struct ivmd_header *m; - INIT_LIST_HEAD(&amd_iommu_unity_map); - end += table->length; p += IVRS_HEADER_LENGTH; -- cgit v1.2.3 From 5dc8bff0f6d0dfeb1f1c6e694294ba7c33d099f1 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 11 Jul 2008 17:14:32 +0200 Subject: x86, AMD IOMMU: replace memset with __GFP_ZERO for table allocation This patch removes the memset from the data structure initialization code and allocate the structures with the __GFP_ZERO flag. Signed-off-by: Joerg Roedel Cc: iommu@lists.linux-foundation.org Cc: bhavna.sarathy@amd.com Cc: robert.richter@amd.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/amd_iommu_init.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 0c247032308..2efc3d59b7e 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -890,7 +890,7 @@ int __init amd_iommu_init(void) ret = -ENOMEM; /* Device table - directly used by all IOMMUs */ - amd_iommu_dev_table = (void *)__get_free_pages(GFP_KERNEL, + amd_iommu_dev_table = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(dev_table_size)); if (amd_iommu_dev_table == NULL) goto out; @@ -914,27 +914,23 @@ int __init amd_iommu_init(void) * Protection Domain table - maps devices to protection domains * This table has the same size as the rlookup_table */ - amd_iommu_pd_table = (void *)__get_free_pages(GFP_KERNEL, + amd_iommu_pd_table = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(rlookup_table_size)); if (amd_iommu_pd_table == NULL) goto free; - amd_iommu_pd_alloc_bitmap = (void *)__get_free_pages(GFP_KERNEL, + amd_iommu_pd_alloc_bitmap = (void *)__get_free_pages( + GFP_KERNEL | __GFP_ZERO, get_order(MAX_DOMAIN_ID/8)); if (amd_iommu_pd_alloc_bitmap == NULL) goto free; /* - * memory is allocated now; initialize the device table with all zeroes - * and let all alias entries point to itself + * let all alias entries point to itself */ - memset(amd_iommu_dev_table, 0, dev_table_size); for (i = 0; i < amd_iommu_last_bdf; ++i) amd_iommu_alias_table[i] = i; - memset(amd_iommu_pd_table, 0, rlookup_table_size); - memset(amd_iommu_pd_alloc_bitmap, 0, MAX_DOMAIN_ID / 8); - /* * never allocate domain 0 because its used as the non-allocated and * error value placeholder -- cgit v1.2.3 From 0906372e6cf372f3162481f24a0b8ccae0eff4d7 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 11 Jul 2008 17:14:33 +0200 Subject: x86, AMD IOMMU: replace self made size parsing with memparse call This patch replaces the self-made parsing of the amd_iommu_size option with the generic memparse function call. Signed-off-by: Joerg Roedel Cc: iommu@lists.linux-foundation.org Cc: bhavna.sarathy@amd.com Cc: robert.richter@amd.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/amd_iommu_init.c | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 2efc3d59b7e..e0ff9404e6c 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -1042,20 +1042,10 @@ static int __init parse_amd_iommu_options(char *str) static int __init parse_amd_iommu_size_options(char *str) { - for (; *str; ++str) { - if (strcmp(str, "32M") == 0) - amd_iommu_aperture_order = 25; - if (strcmp(str, "64M") == 0) - amd_iommu_aperture_order = 26; - if (strcmp(str, "128M") == 0) - amd_iommu_aperture_order = 27; - if (strcmp(str, "256M") == 0) - amd_iommu_aperture_order = 28; - if (strcmp(str, "512M") == 0) - amd_iommu_aperture_order = 29; - if (strcmp(str, "1G") == 0) - amd_iommu_aperture_order = 30; - } + unsigned order = PAGE_SHIFT + get_order(memparse(str, &str)); + + if ((order > 24) && (order < 31)) + amd_iommu_aperture_order = order; return 1; } -- cgit v1.2.3 From d591b0a3ae25f587d0c4da1e1d1a425143590790 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 11 Jul 2008 17:14:35 +0200 Subject: x86, AMD IOMMU: replace DEVID macro with a function This patch replaces the DEVID macro with a function and uses them where apropriate (also in the core code). Signed-off-by: Joerg Roedel Cc: iommu@lists.linux-foundation.org Cc: bhavna.sarathy@amd.com Cc: robert.richter@amd.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/amd_iommu.c | 2 +- arch/x86/kernel/amd_iommu_init.c | 9 +++++---- 2 files changed, 6 insertions(+), 5 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index dec10e1a397..8c3deb027d3 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -664,7 +664,7 @@ static int get_device_resources(struct device *dev, BUG_ON(!dev || dev->bus != &pci_bus_type || !dev->dma_mask); pcidev = to_pci_dev(dev); - _bdf = (pcidev->bus->number << 8) | pcidev->devfn; + _bdf = calc_devid(pcidev->bus->number, pcidev->devfn); /* device not translated by any IOMMU in the system? */ if (_bdf >= amd_iommu_last_bdf) { diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index e0ff9404e6c..9bf1b8111b0 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -30,7 +30,6 @@ /* * definitions for the ACPI scanning code */ -#define DEVID(bus, devfn) (((bus) << 8) | (devfn)) #define PCI_BUS(x) (((x) >> 8) & 0xff) #define IVRS_HEADER_LENGTH 48 @@ -295,7 +294,7 @@ static int __init find_last_devid_on_pci(int bus, int dev, int fn, int cap_ptr) u32 cap; cap = read_pci_config(bus, dev, fn, cap_ptr+MMIO_RANGE_OFFSET); - update_last_devid(DEVID(MMIO_GET_BUS(cap), MMIO_GET_LD(cap))); + update_last_devid(calc_devid(MMIO_GET_BUS(cap), MMIO_GET_LD(cap))); return 0; } @@ -494,8 +493,10 @@ static void __init init_iommu_from_pci(struct amd_iommu *iommu) iommu->cap = read_pci_config(bus, dev, fn, cap_ptr+MMIO_CAP_HDR_OFFSET); range = read_pci_config(bus, dev, fn, cap_ptr+MMIO_RANGE_OFFSET); - iommu->first_device = DEVID(MMIO_GET_BUS(range), MMIO_GET_FD(range)); - iommu->last_device = DEVID(MMIO_GET_BUS(range), MMIO_GET_LD(range)); + iommu->first_device = calc_devid(MMIO_GET_BUS(range), + MMIO_GET_FD(range)); + iommu->last_device = calc_devid(MMIO_GET_BUS(range), + MMIO_GET_LD(range)); } /* -- cgit v1.2.3 From d54191b85e294c46f05a2249b1f55ae54930bcc7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 22 Apr 2008 15:09:30 +0200 Subject: Kprobe smoke test lockdep warning On Mon, 2008-04-21 at 18:54 -0400, Masami Hiramatsu wrote: > Thank you for reporting. > > Actually, kprobes tries to fixup thread's flags in post_kprobe_handler > (which is called from kprobe_exceptions_notify) by > trace_hardirqs_fixup_flags(pt_regs->flags). However, even the irq flag > is set in pt_regs->flags, true hardirq is still off until returning > from do_debug. Thus, lockdep assumes that hardirq is off without annotation. > > IMHO, one possible solution is that fixing hardirq flags right after > notify_die in do_debug instead of in post_kprobe_handler. My reply to BZ 10489: > [ 2.707509] Kprobe smoke test started > [ 2.709300] ------------[ cut here ]------------ > [ 2.709420] WARNING: at kernel/lockdep.c:2658 check_flags+0x4d/0x12c() > [ 2.709541] Modules linked in: > [ 2.709588] Pid: 1, comm: swapper Not tainted 2.6.25.jml.057 #1 > [ 2.709588] [] warn_on_slowpath+0x41/0x51 > [ 2.709588] [] ? save_stack_trace+0x1d/0x3b > [ 2.709588] [] ? save_trace+0x37/0x89 > [ 2.709588] [] ? kernel_map_pages+0x103/0x11c > [ 2.709588] [] ? native_sched_clock+0xca/0xea > [ 2.709588] [] ? mark_held_locks+0x41/0x5c > [ 2.709588] [] ? kprobe_exceptions_notify+0x322/0x3af > [ 2.709588] [] ? trace_hardirqs_on+0xf1/0x119 > [ 2.709588] [] ? kprobe_exceptions_notify+0x355/0x3af > [ 2.709588] [] check_flags+0x4d/0x12c > [ 2.709588] [] lock_release+0x58/0x195 > [ 2.709588] [] ? __atomic_notifier_call_chain+0x0/0x80 > [ 2.709588] [] __atomic_notifier_call_chain+0x5a/0x80 > [ 2.709588] [] atomic_notifier_call_chain+0xc/0xe > [ 2.709588] [] notify_die+0x2d/0x2f > [ 2.709588] [] do_debug+0x67/0xfe > [ 2.709588] [] debug_stack_correct+0x27/0x30 > [ 2.709588] [] ? kprobe_target+0x1/0x34 > [ 2.709588] [] ? init_test_probes+0x50/0x186 > [ 2.709588] [] init_kprobes+0x85/0x8c > [ 2.709588] [] kernel_init+0x13d/0x298 > [ 2.709588] [] ? kernel_init+0x0/0x298 > [ 2.709588] [] ? kernel_init+0x0/0x298 > [ 2.709588] [] kernel_thread_helper+0x7/0x10 > [ 2.709588] ======================= > [ 2.709588] ---[ end trace 778e504de7e3b1e3 ]--- > [ 2.709588] possible reason: unannotated irqs-off. > [ 2.709588] irq event stamp: 370065 > [ 2.709588] hardirqs last enabled at (370065): [] kprobe_exceptions_notify+0x322/0x3af > [ 2.709588] hardirqs last disabled at (370064): [] do_int3+0x1d/0x7d > [ 2.709588] softirqs last enabled at (370050): [] __do_softirq+0xfa/0x100 > [ 2.709588] softirqs last disabled at (370045): [] do_softirq+0x74/0xd9 > [ 2.714751] Kprobe smoke test passed successfully how I love this stuff... Ok, do_debug() is a trap, this can happen at any time regardless of the machine's IRQ state. So the first thing we do is fix up the IRQ state. Then we call this die notifier stuff; and return with messed up IRQ state... YAY. So, kprobes fudges it.. notify_die(DIE_DEBUG) kprobe_exceptions_notify() post_kprobe_handler() modify regs->flags trace_hardirqs_fixup_flags(regs->flags); <--- must be it So what's the use of modifying flags if they're not meant to take effect at some point. /me tries to reproduce issue; enable kprobes test thingy && boot OK, that reproduces.. So the below makes it work - but I'm not getting this code; at the time I wrote that stuff I CC'ed each and every kprobe maintainer listed in the usual places but got no reposonse - can some please explain this stuff to me? Are the saved flags only for the TF bit or are they made in full effect later (and if so, where) ? Signed-off-by: Peter Zijlstra Acked-by: Masami Hiramatsu Signed-off-by: Ingo Molnar --- arch/x86/kernel/kprobes.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c index b8c6743a13d..43c019f85f0 100644 --- a/arch/x86/kernel/kprobes.c +++ b/arch/x86/kernel/kprobes.c @@ -860,7 +860,6 @@ static int __kprobes post_kprobe_handler(struct pt_regs *regs) resume_execution(cur, regs, kcb); regs->flags |= kcb->kprobe_saved_flags; - trace_hardirqs_fixup_flags(regs->flags); if ((kcb->kprobe_status != KPROBE_REENTER) && cur->post_handler) { kcb->kprobe_status = KPROBE_HIT_SSDONE; -- cgit v1.2.3 From a312b37b2a212fd2e227d1d6321f903b91b65ec7 Mon Sep 17 00:00:00 2001 From: Eduardo Habkost Date: Tue, 8 Jul 2008 15:06:23 -0700 Subject: x86/paravirt: call paravirt_pagetable_setup_{start, done} Call paravirt_pagetable_setup_{start,done} These paravirt_ops functions were not being called on x86_64. Signed-off-by: Eduardo Habkost Signed-off-by: Jeremy Fitzhardinge Cc: Stephen Tweedie Cc: Mark McLoughlin Signed-off-by: Ingo Molnar --- arch/x86/kernel/paravirt.c | 4 ++++ arch/x86/kernel/setup.c | 2 ++ 2 files changed, 6 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index e0f571d58c1..2963ab5d91e 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include #include @@ -373,6 +374,9 @@ struct pv_mmu_ops pv_mmu_ops = { #ifndef CONFIG_X86_64 .pagetable_setup_start = native_pagetable_setup_start, .pagetable_setup_done = native_pagetable_setup_done, +#else + .pagetable_setup_start = paravirt_nop, + .pagetable_setup_done = paravirt_nop, #endif .read_cr2 = native_read_cr2, diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 36c540d4ac4..8ce6a91ce10 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -819,7 +819,9 @@ void __init setup_arch(char **cmdline_p) vmi_init(); #endif + paravirt_pagetable_setup_start(swapper_pg_dir); paging_init(); + paravirt_pagetable_setup_done(swapper_pg_dir); #ifdef CONFIG_X86_64 map_vsyscall(); -- cgit v1.2.3 From c1f2f09ef66d5dadd5fe42ea909e708470c9636d Mon Sep 17 00:00:00 2001 From: Eduardo Habkost Date: Tue, 8 Jul 2008 15:06:24 -0700 Subject: pvops-64: call paravirt_post_allocator_init() on setup_arch() Signed-off-by: Eduardo Habkost Signed-off-by: Jeremy Fitzhardinge Cc: Stephen Tweedie Cc: Mark McLoughlin Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 8ce6a91ce10..2ed504b97d4 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -822,6 +822,7 @@ void __init setup_arch(char **cmdline_p) paravirt_pagetable_setup_start(swapper_pg_dir); paging_init(); paravirt_pagetable_setup_done(swapper_pg_dir); + paravirt_post_allocator_init(); #ifdef CONFIG_X86_64 map_vsyscall(); -- cgit v1.2.3 From 8840c0ccd763936a8e730ece118197a51be8dc8e Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 8 Jul 2008 15:06:25 -0700 Subject: x86_64: there's no need to preallocate level1_fixmap_pgt Early fixmap will allocate its own L1 pagetable page for fixmap mappings, so there's no need to preallocate one. Signed-off-by: Jeremy Fitzhardinge Cc: Stephen Tweedie Cc: Eduardo Habkost Cc: Mark McLoughlin Signed-off-by: Ingo Molnar --- arch/x86/kernel/head_64.S | 6 ------ 1 file changed, 6 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index b07ac7b217c..4b6bda21837 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -362,12 +362,6 @@ NEXT_PAGE(level3_kernel_pgt) .quad level2_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE NEXT_PAGE(level2_fixmap_pgt) - .fill 506,8,0 - .quad level1_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE - /* 8MB reserved for vsyscalls + a 2MB hole = 4 + 1 entries */ - .fill 5,8,0 - -NEXT_PAGE(level1_fixmap_pgt) .fill 512,8,0 NEXT_PAGE(level2_ident_pgt) -- cgit v1.2.3 From 87b935a0ef9a1ddf62f2f0c0fc17b10654ff41cd Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 8 Jul 2008 15:06:26 -0700 Subject: x86: clean up formatting of __switch_to process_64.c:__switch_to has some very old strange formatting, some of it dating back to pre-git. Fix it up. No functional changes. Signed-off-by: Jeremy Fitzhardinge Cc: Stephen Tweedie Cc: Eduardo Habkost Cc: Mark McLoughlin Signed-off-by: Ingo Molnar --- arch/x86/kernel/process_64.c | 56 ++++++++++++++++++++++---------------------- 1 file changed, 28 insertions(+), 28 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index a8e53626ac9..e8a8e1b9981 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -537,8 +537,8 @@ static inline void __switch_to_xtra(struct task_struct *prev_p, struct task_struct * __switch_to(struct task_struct *prev_p, struct task_struct *next_p) { - struct thread_struct *prev = &prev_p->thread, - *next = &next_p->thread; + struct thread_struct *prev = &prev_p->thread; + struct thread_struct *next = &next_p->thread; int cpu = smp_processor_id(); struct tss_struct *tss = &per_cpu(init_tss, cpu); unsigned fsindex, gsindex; @@ -586,35 +586,34 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) /* * Switch FS and GS. + * + * Segment register != 0 always requires a reload. Also + * reload when it has changed. When prev process used 64bit + * base always reload to avoid an information leak. */ - { - /* segment register != 0 always requires a reload. - also reload when it has changed. - when prev process used 64bit base always reload - to avoid an information leak. */ - if (unlikely(fsindex | next->fsindex | prev->fs)) { - loadsegment(fs, next->fsindex); - /* check if the user used a selector != 0 - * if yes clear 64bit base, since overloaded base - * is always mapped to the Null selector - */ - if (fsindex) + if (unlikely(fsindex | next->fsindex | prev->fs)) { + loadsegment(fs, next->fsindex); + /* + * Check if the user used a selector != 0; if yes + * clear 64bit base, since overloaded base is always + * mapped to the Null selector + */ + if (fsindex) prev->fs = 0; - } - /* when next process has a 64bit base use it */ - if (next->fs) - wrmsrl(MSR_FS_BASE, next->fs); - prev->fsindex = fsindex; - - if (unlikely(gsindex | next->gsindex | prev->gs)) { - load_gs_index(next->gsindex); - if (gsindex) + } + /* when next process has a 64bit base use it */ + if (next->fs) + wrmsrl(MSR_FS_BASE, next->fs); + prev->fsindex = fsindex; + + if (unlikely(gsindex | next->gsindex | prev->gs)) { + load_gs_index(next->gsindex); + if (gsindex) prev->gs = 0; - } - if (next->gs) - wrmsrl(MSR_KERNEL_GS_BASE, next->gs); - prev->gsindex = gsindex; } + if (next->gs) + wrmsrl(MSR_KERNEL_GS_BASE, next->gs); + prev->gsindex = gsindex; /* Must be after DS reload */ unlazy_fpu(prev_p); @@ -627,7 +626,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) write_pda(pcurrent, next_p); write_pda(kernelstack, - (unsigned long)task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET); + (unsigned long)task_stack_page(next_p) + + THREAD_SIZE - PDA_STACKOFFSET); #ifdef CONFIG_CC_STACKPROTECTOR write_pda(stack_canary, next_p->stack_canary); /* -- cgit v1.2.3 From cbcd79c2e5b496b84845618cef734b4c40736576 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 8 Jul 2008 15:06:27 -0700 Subject: x86: use __page_aligned_data/bss Update arch/x86's use of page-aligned variables. The change to arch/x86/xen/mmu.c fixes an actual bug, but the rest are cleanups and to set a precedent. Signed-off-by: Jeremy Fitzhardinge Cc: Stephen Tweedie Cc: Eduardo Habkost Cc: Mark McLoughlin Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/common_64.c | 4 ++-- arch/x86/kernel/irq_32.c | 7 ++----- 2 files changed, 4 insertions(+), 7 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/common_64.c b/arch/x86/kernel/cpu/common_64.c index 7b8cc72feb4..15419cd3c5a 100644 --- a/arch/x86/kernel/cpu/common_64.c +++ b/arch/x86/kernel/cpu/common_64.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -517,8 +518,7 @@ void pda_init(int cpu) } char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + - DEBUG_STKSZ] -__attribute__((section(".bss.page_aligned"))); + DEBUG_STKSZ] __page_aligned_bss; extern asmlinkage void ignore_sysret(void); diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index 47a6f6f1247..1cf8c1fcc08 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -83,11 +83,8 @@ union irq_ctx { static union irq_ctx *hardirq_ctx[NR_CPUS] __read_mostly; static union irq_ctx *softirq_ctx[NR_CPUS] __read_mostly; -static char softirq_stack[NR_CPUS * THREAD_SIZE] - __attribute__((__section__(".bss.page_aligned"))); - -static char hardirq_stack[NR_CPUS * THREAD_SIZE] - __attribute__((__section__(".bss.page_aligned"))); +static char softirq_stack[NR_CPUS * THREAD_SIZE] __page_aligned_bss; +static char hardirq_stack[NR_CPUS * THREAD_SIZE] __page_aligned_bss; static void call_on_stack(void *func, void *stack) { -- cgit v1.2.3 From 7c33b1e6ee26d67551109aca04d46544d0ce55b1 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 8 Jul 2008 15:06:29 -0700 Subject: x86_64: unstatic get_local_pda This allows Xen's xen_cpu_up() to allocate a pda for the new CPU. Signed-off-by: Jeremy Fitzhardinge Cc: Stephen Tweedie Cc: Eduardo Habkost Cc: Mark McLoughlin Signed-off-by: Ingo Molnar --- arch/x86/kernel/smpboot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 687376ab07e..1deb3b624a7 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -768,7 +768,7 @@ static void __cpuinit do_fork_idle(struct work_struct *work) * * Must be called after the _cpu_pda pointer table is initialized. */ -static int __cpuinit get_local_pda(int cpu) +int __cpuinit get_local_pda(int cpu) { struct x8664_pda *oldpda, *newpda; unsigned long size = sizeof(struct x8664_pda); -- cgit v1.2.3 From 5b09b2876ed1a8e34a0da8f069575fc6174e2077 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 8 Jul 2008 15:06:42 -0700 Subject: x86_64: add workaround for no %gs-based percpu As a stopgap until Mike Travis's x86-64 gs-based percpu patches are ready, provide workaround functions for x86_read/write_percpu for Xen's use. Specifically, this means that we can't really make use of vcpu placement, because we can't use a single gs-based memory access to get to vcpu fields. So disable all that for now. Signed-off-by: Jeremy Fitzhardinge Cc: Stephen Tweedie Cc: Eduardo Habkost Cc: Mark McLoughlin Signed-off-by: Ingo Molnar --- arch/x86/kernel/head64.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index c9781982914..1b318e903bf 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -39,6 +39,13 @@ static struct x8664_pda *__cpu_pda[NR_CPUS] __initdata; static struct x8664_pda *__cpu_pda[NR_CPUS] __read_mostly; #endif +void __init x86_64_init_pda(void) +{ + _cpu_pda = __cpu_pda; + cpu_pda(0) = &_boot_cpu_pda; + pda_init(0); +} + static void __init zap_identity_mappings(void) { pgd_t *pgd = pgd_offset_k(0UL); @@ -102,9 +109,7 @@ void __init x86_64_start_kernel(char * real_mode_data) early_printk("Kernel alive\n"); - _cpu_pda = __cpu_pda; - cpu_pda(0) = &_boot_cpu_pda; - pda_init(0); + x86_64_init_pda(); early_printk("Kernel really alive\n"); -- cgit v1.2.3 From 8c5e5ac32fe08793246709fbb94c055ec76a7c0e Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 8 Jul 2008 15:06:44 -0700 Subject: xen64: add xen-head code to head_64.S Add the Xen entrypoint and ELF notes to head_64.S. Adapts xen-head.S to compile either 32-bit or 64-bit. Signed-off-by: Jeremy Fitzhardinge Cc: Stephen Tweedie Cc: Eduardo Habkost Cc: Mark McLoughlin Signed-off-by: Ingo Molnar --- arch/x86/kernel/asm-offsets_64.c | 3 +++ arch/x86/kernel/head_64.S | 1 + 2 files changed, 4 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index bacf5deeec2..0f7e1f09aa0 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -131,5 +131,8 @@ int main(void) OFFSET(BP_loadflags, boot_params, hdr.loadflags); OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch); OFFSET(BP_version, boot_params, hdr.version); + + BLANK(); + DEFINE(PAGE_SIZE_asm, PAGE_SIZE); return 0; } diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 4b6bda21837..2240f823676 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -401,6 +401,7 @@ ENTRY(phys_base) /* This must match the first entry in level2_kernel_pgt */ .quad 0x0000000000000000 +#include "../../x86/xen/xen-head.S" .section .bss, "aw", @nobits .align L1_CACHE_BYTES -- cgit v1.2.3 From 555cf2b5805a213ba262a2830c4d22ad635a249e Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 8 Jul 2008 15:06:45 -0700 Subject: xen64: add asm-offsets Add Xen vcpu_info offsets to asm-offsets_64. Signed-off-by: Jeremy Fitzhardinge Cc: Stephen Tweedie Cc: Eduardo Habkost Cc: Mark McLoughlin Signed-off-by: Ingo Molnar --- arch/x86/kernel/asm-offsets_64.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index 0f7e1f09aa0..aa89387006f 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -18,6 +18,8 @@ #include #include +#include + #define __NO_STUBS 1 #undef __SYSCALL #undef _ASM_X86_64_UNISTD_H_ @@ -134,5 +136,11 @@ int main(void) BLANK(); DEFINE(PAGE_SIZE_asm, PAGE_SIZE); +#ifdef CONFIG_XEN + BLANK(); + OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask); + OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending); +#undef ENTRY +#endif return 0; } -- cgit v1.2.3 From 3d75e1b8ef1567348ceba93d4666a1c7c2333583 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 8 Jul 2008 15:06:49 -0700 Subject: xen64: add hypervisor callbacks for events, etc Signed-off-by: Jeremy Fitzhardinge Cc: Stephen Tweedie Cc: Eduardo Habkost Cc: Mark McLoughlin Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_64.S | 98 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 98 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index ae63e584c34..7cc2de79614 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1312,3 +1312,101 @@ KPROBE_ENTRY(ignore_sysret) sysret CFI_ENDPROC ENDPROC(ignore_sysret) + +#ifdef CONFIG_XEN +ENTRY(xen_hypervisor_callback) + zeroentry xen_do_hypervisor_callback +END(xen_hypervisor_callback) + +/* +# A note on the "critical region" in our callback handler. +# We want to avoid stacking callback handlers due to events occurring +# during handling of the last event. To do this, we keep events disabled +# until we've done all processing. HOWEVER, we must enable events before +# popping the stack frame (can't be done atomically) and so it would still +# be possible to get enough handler activations to overflow the stack. +# Although unlikely, bugs of that kind are hard to track down, so we'd +# like to avoid the possibility. +# So, on entry to the handler we detect whether we interrupted an +# existing activation in its critical region -- if so, we pop the current +# activation and restart the handler using the previous one. +*/ +ENTRY(xen_do_hypervisor_callback) # do_hypervisor_callback(struct *pt_regs) + CFI_STARTPROC +/* Since we don't modify %rdi, evtchn_do_upall(struct *pt_regs) will + see the correct pointer to the pt_regs */ + movq %rdi, %rsp # we don't return, adjust the stack frame + CFI_ENDPROC + CFI_DEFAULT_STACK +11: incl %gs:pda_irqcount + movq %rsp,%rbp + CFI_DEF_CFA_REGISTER rbp + cmovzq %gs:pda_irqstackptr,%rsp + pushq %rbp # backlink for old unwinder + call xen_evtchn_do_upcall + popq %rsp + CFI_DEF_CFA_REGISTER rsp + decl %gs:pda_irqcount + jmp error_exit + CFI_ENDPROC +END(do_hypervisor_callback) + +/* +# Hypervisor uses this for application faults while it executes. +# We get here for two reasons: +# 1. Fault while reloading DS, ES, FS or GS +# 2. Fault while executing IRET +# Category 1 we do not need to fix up as Xen has already reloaded all segment +# registers that could be reloaded and zeroed the others. +# Category 2 we fix up by killing the current process. We cannot use the +# normal Linux return path in this case because if we use the IRET hypercall +# to pop the stack frame we end up in an infinite loop of failsafe callbacks. +# We distinguish between categories by comparing each saved segment register +# with its current contents: any discrepancy means we in category 1. +*/ +ENTRY(xen_failsafe_callback) +#if 1 + ud2a +#else + _frame (RIP-0x30) + CFI_REL_OFFSET rcx, 0 + CFI_REL_OFFSET r11, 8 + movw %ds,%cx + cmpw %cx,0x10(%rsp) + CFI_REMEMBER_STATE + jne 1f + movw %es,%cx + cmpw %cx,0x18(%rsp) + jne 1f + movw %fs,%cx + cmpw %cx,0x20(%rsp) + jne 1f + movw %gs,%cx + cmpw %cx,0x28(%rsp) + jne 1f + /* All segments match their saved values => Category 2 (Bad IRET). */ + movq (%rsp),%rcx + CFI_RESTORE rcx + movq 8(%rsp),%r11 + CFI_RESTORE r11 + addq $0x30,%rsp + CFI_ADJUST_CFA_OFFSET -0x30 + movq $11,%rdi /* SIGSEGV */ + jmp do_exit + CFI_RESTORE_STATE +1: /* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */ + movq (%rsp),%rcx + CFI_RESTORE rcx + movq 8(%rsp),%r11 + CFI_RESTORE r11 + addq $0x30,%rsp + CFI_ADJUST_CFA_OFFSET -0x30 + pushq $0 + CFI_ADJUST_CFA_OFFSET 8 + SAVE_ALL + jmp error_exit + CFI_ENDPROC +#endif +END(xen_failsafe_callback) + +#endif /* CONFIG_XEN */ -- cgit v1.2.3 From 4a5c3e77f70b3ea8b361d7fa9eb2e4dad18f70ae Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 8 Jul 2008 15:07:09 -0700 Subject: xen64: implement failsafe callback Implement the failsafe callback, so that iret and segment register load exceptions are reported to the kernel. Signed-off-by: Jeremy Fitzhardinge Cc: Stephen Tweedie Cc: Eduardo Habkost Cc: Mark McLoughlin Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_64.S | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 7cc2de79614..6aa6932e21b 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1365,10 +1365,8 @@ END(do_hypervisor_callback) # with its current contents: any discrepancy means we in category 1. */ ENTRY(xen_failsafe_callback) -#if 1 - ud2a -#else - _frame (RIP-0x30) + framesz = (RIP-0x30) /* workaround buggy gas */ + _frame framesz CFI_REL_OFFSET rcx, 0 CFI_REL_OFFSET r11, 8 movw %ds,%cx @@ -1391,8 +1389,13 @@ ENTRY(xen_failsafe_callback) CFI_RESTORE r11 addq $0x30,%rsp CFI_ADJUST_CFA_OFFSET -0x30 - movq $11,%rdi /* SIGSEGV */ - jmp do_exit + pushq $0 + CFI_ADJUST_CFA_OFFSET 8 + pushq %r11 + CFI_ADJUST_CFA_OFFSET 8 + pushq %rcx + CFI_ADJUST_CFA_OFFSET 8 + jmp general_protection CFI_RESTORE_STATE 1: /* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */ movq (%rsp),%rcx @@ -1406,7 +1409,6 @@ ENTRY(xen_failsafe_callback) SAVE_ALL jmp error_exit CFI_ENDPROC -#endif END(xen_failsafe_callback) #endif /* CONFIG_XEN */ -- cgit v1.2.3 From 6596f2422306a05be2170efc114da49f26a047dd Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 9 Jul 2008 22:32:33 +0200 Subject: Revert "x86_64: there's no need to preallocate level1_fixmap_pgt" This reverts commit 033786969d1d1b5af12a32a19d3a760314d05329. Suresh Siddha reported that this broke booting on his 2GB testbox. Reported-by: Suresh Siddha Signed-off-by: Ingo Molnar --- arch/x86/kernel/head_64.S | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 2240f823676..db3280afe88 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -362,6 +362,12 @@ NEXT_PAGE(level3_kernel_pgt) .quad level2_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE NEXT_PAGE(level2_fixmap_pgt) + .fill 506,8,0 + .quad level1_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE + /* 8MB reserved for vsyscalls + a 2MB hole = 4 + 1 entries */ + .fill 5,8,0 + +NEXT_PAGE(level1_fixmap_pgt) .fill 512,8,0 NEXT_PAGE(level2_ident_pgt) -- cgit v1.2.3 From 6a52e4b1cddd90fbfde8fb67021657936ee74b07 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Sat, 12 Jul 2008 02:22:00 -0700 Subject: x86_64: further cleanup of 32-bit compat syscall mechanisms AMD only supports "syscall" from 32-bit compat usermode. Intel and Centaur(?) only support "sysenter" from 32-bit compat usermode. Set the X86 feature bits accordingly, and set up the vdso in accordance with those bits. On the offchance we run on in a 64-bit environment which supports neither syscall nor sysenter from 32-bit mode, then fall back to the int $0x80 vdso. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/amd_64.c | 2 ++ arch/x86/kernel/cpu/common_64.c | 3 --- 2 files changed, 2 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/amd_64.c b/arch/x86/kernel/cpu/amd_64.c index 7c36fb8a28d..d1692b2a41f 100644 --- a/arch/x86/kernel/cpu/amd_64.c +++ b/arch/x86/kernel/cpu/amd_64.c @@ -115,6 +115,8 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c) /* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */ if (c->x86_power & (1<<8)) set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); + + set_cpu_cap(c, X86_FEATURE_SYSCALL32); } static void __cpuinit init_amd(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/cpu/common_64.c b/arch/x86/kernel/cpu/common_64.c index 15419cd3c5a..736f50fa433 100644 --- a/arch/x86/kernel/cpu/common_64.c +++ b/arch/x86/kernel/cpu/common_64.c @@ -317,9 +317,6 @@ static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c) c->x86_phys_bits = eax & 0xff; } - /* Assume all 64-bit CPUs support 32-bit syscall */ - set_cpu_cap(c, X86_FEATURE_SYSCALL32); - if (c->x86_vendor != X86_VENDOR_UNKNOWN && cpu_devs[c->x86_vendor]->c_early_init) cpu_devs[c->x86_vendor]->c_early_init(c); -- cgit v1.2.3 From 094029479be8eb380447f42eff1b35362ef1a464 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Sat, 12 Jul 2008 02:22:12 -0700 Subject: x86_64: adjust exception frame on paranoid exceptions Exceptions using paranoidentry need to have their exception frames adjusted explicitly. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: H. Peter Anvin --- arch/x86/kernel/entry_64.S | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 6aa6932e21b..80d5663db3b 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1189,6 +1189,7 @@ END(device_not_available) /* runs on exception stack */ KPROBE_ENTRY(debug) INTR_FRAME + PARAVIRT_ADJUST_EXCEPTION_FRAME pushq $0 CFI_ADJUST_CFA_OFFSET 8 paranoidentry do_debug, DEBUG_STACK @@ -1198,6 +1199,7 @@ KPROBE_END(debug) /* runs on exception stack */ KPROBE_ENTRY(nmi) INTR_FRAME + PARAVIRT_ADJUST_EXCEPTION_FRAME pushq $-1 CFI_ADJUST_CFA_OFFSET 8 paranoidentry do_nmi, 0, 0 @@ -1211,6 +1213,7 @@ KPROBE_END(nmi) KPROBE_ENTRY(int3) INTR_FRAME + PARAVIRT_ADJUST_EXCEPTION_FRAME pushq $0 CFI_ADJUST_CFA_OFFSET 8 paranoidentry do_int3, DEBUG_STACK @@ -1237,6 +1240,7 @@ END(coprocessor_segment_overrun) /* runs on exception stack */ ENTRY(double_fault) XCPT_FRAME + PARAVIRT_ADJUST_EXCEPTION_FRAME paranoidentry do_double_fault jmp paranoid_exit1 CFI_ENDPROC @@ -1253,6 +1257,7 @@ END(segment_not_present) /* runs on exception stack */ ENTRY(stack_segment) XCPT_FRAME + PARAVIRT_ADJUST_EXCEPTION_FRAME paranoidentry do_stack_segment jmp paranoid_exit1 CFI_ENDPROC @@ -1278,6 +1283,7 @@ END(spurious_interrupt_bug) /* runs on exception stack */ ENTRY(machine_check) INTR_FRAME + PARAVIRT_ADJUST_EXCEPTION_FRAME pushq $0 CFI_ADJUST_CFA_OFFSET 8 paranoidentry do_machine_check -- cgit v1.2.3 From 74d4affde8feb8d5bdebf7fba8e90e4eae3b7b1d Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 7 Jul 2008 12:07:50 -0700 Subject: x86/paravirt: add hooks for spinlock operations Ticket spinlocks have absolutely ghastly worst-case performance characteristics in a virtual environment. If there is any contention for physical CPUs (ie, there are more runnable vcpus than cpus), then ticket locks can cause the system to end up spending 90+% of its time spinning. The problem is that (v)cpus waiting on a ticket spinlock will be granted access to the lock in strict order they got their tickets. If the hypervisor scheduler doesn't give the vcpus time in that order, they will burn timeslices waiting for the scheduler to give the right vcpu some time. In the worst case it could take O(n^2) vcpu scheduler timeslices for everyone waiting on the lock to get it, not counting new cpus trying to take the lock while the log-jam is sorted out. These hooks allow a paravirt backend to replace the spinlock implementation. At the very least, this could revert the implementation back to the old lock algorithm, which allows the next scheduled vcpu to take the lock, and has basically fairly good performance. It also allows the spinlocks to take advantages of the hypervisor features to make locks more efficient (spin and block, for example). The cost to native execution is an extra direct call when using a spinlock function. There's no overhead if CONFIG_PARAVIRT is turned off. The lock structure is fixed at a single "unsigned int", initialized to zero, but the spinlock implementation can use it as it wishes. Thanks to Thomas Friebel's Xen Summit talk "Preventing Guests from Spinning Around" for pointing out this problem. Signed-off-by: Jeremy Fitzhardinge Cc: Jens Axboe Cc: Peter Zijlstra Cc: Christoph Lameter Cc: Petr Tesarik Cc: Virtualization Cc: Xen devel Cc: Thomas Friebel Cc: Nick Piggin Signed-off-by: Ingo Molnar --- arch/x86/kernel/paravirt.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 2963ab5d91e..f3381686870 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -124,6 +124,7 @@ static void *get_call_destination(u8 type) .pv_irq_ops = pv_irq_ops, .pv_apic_ops = pv_apic_ops, .pv_mmu_ops = pv_mmu_ops, + .pv_lock_ops = pv_lock_ops, }; return *((void **)&tmpl + type); } @@ -450,6 +451,15 @@ struct pv_mmu_ops pv_mmu_ops = { .set_fixmap = native_set_fixmap, }; +struct pv_lock_ops pv_lock_ops = { + .spin_is_locked = __ticket_spin_is_locked, + .spin_is_contended = __ticket_spin_is_contended, + + .spin_lock = __ticket_spin_lock, + .spin_trylock = __ticket_spin_trylock, + .spin_unlock = __ticket_spin_unlock, +}; + EXPORT_SYMBOL_GPL(pv_time_ops); EXPORT_SYMBOL (pv_cpu_ops); EXPORT_SYMBOL (pv_mmu_ops); -- cgit v1.2.3 From 8efcbab674de2bee45a2e4cdf97de16b8e609ac8 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 7 Jul 2008 12:07:51 -0700 Subject: paravirt: introduce a "lock-byte" spinlock implementation Implement a version of the old spinlock algorithm, in which everyone spins waiting for a lock byte. In order to be compatible with the ticket-lock's use of a zero initializer, this uses the convention of '0' for unlocked and '1' for locked. This algorithm is much better than ticket locks in a virtual envionment, because it doesn't interact badly with the vcpu scheduler. If there are multiple vcpus spinning on a lock and the lock is released, the next vcpu to be scheduled will take the lock, rather than cycling around until the next ticketed vcpu gets it. To use this, you must call paravirt_use_bytelocks() very early, before any spinlocks have been taken. Signed-off-by: Jeremy Fitzhardinge Cc: Jens Axboe Cc: Peter Zijlstra Cc: Christoph Lameter Cc: Petr Tesarik Cc: Virtualization Cc: Xen devel Cc: Thomas Friebel Cc: Nick Piggin Signed-off-by: Ingo Molnar --- arch/x86/kernel/paravirt.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index f3381686870..bba4041bb7f 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -268,6 +268,15 @@ enum paravirt_lazy_mode paravirt_get_lazy_mode(void) return __get_cpu_var(paravirt_lazy_mode); } +void __init paravirt_use_bytelocks(void) +{ + pv_lock_ops.spin_is_locked = __byte_spin_is_locked; + pv_lock_ops.spin_is_contended = __byte_spin_is_contended; + pv_lock_ops.spin_lock = __byte_spin_lock; + pv_lock_ops.spin_trylock = __byte_spin_trylock; + pv_lock_ops.spin_unlock = __byte_spin_unlock; +} + struct pv_info pv_info = { .name = "bare hardware", .paravirt_enabled = 0, -- cgit v1.2.3 From 4bb689eee12ceb6d669a0c9a519037c049a8af38 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 9 Jul 2008 14:33:33 +0200 Subject: x86: paravirt spinlocks, !CONFIG_SMP build fixes Signed-off-by: Ingo Molnar --- arch/x86/kernel/paravirt.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index bba4041bb7f..6aa8aed06d5 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -270,11 +270,13 @@ enum paravirt_lazy_mode paravirt_get_lazy_mode(void) void __init paravirt_use_bytelocks(void) { +#ifdef CONFIG_SMP pv_lock_ops.spin_is_locked = __byte_spin_is_locked; pv_lock_ops.spin_is_contended = __byte_spin_is_contended; pv_lock_ops.spin_lock = __byte_spin_lock; pv_lock_ops.spin_trylock = __byte_spin_trylock; pv_lock_ops.spin_unlock = __byte_spin_unlock; +#endif } struct pv_info pv_info = { @@ -461,12 +463,14 @@ struct pv_mmu_ops pv_mmu_ops = { }; struct pv_lock_ops pv_lock_ops = { +#ifdef CONFIG_SMP .spin_is_locked = __ticket_spin_is_locked, .spin_is_contended = __ticket_spin_is_contended, .spin_lock = __ticket_spin_lock, .spin_trylock = __ticket_spin_trylock, .spin_unlock = __ticket_spin_unlock, +#endif }; EXPORT_SYMBOL_GPL(pv_time_ops); -- cgit v1.2.3 From 9af98578d6af588f52d0dacd64fe42caa405a327 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 9 Jul 2008 14:39:15 +0200 Subject: x86: paravirt spinlocks, modular build fix fix: MODPOST 408 modules ERROR: "pv_lock_ops" [net/dccp/dccp.ko] undefined! ERROR: "pv_lock_ops" [fs/jbd2/jbd2.ko] undefined! ERROR: "pv_lock_ops" [drivers/media/common/saa7146_vv.ko] undefined! Signed-off-by: Ingo Molnar --- arch/x86/kernel/paravirt.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 6aa8aed06d5..3edfd7af22a 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -472,6 +472,7 @@ struct pv_lock_ops pv_lock_ops = { .spin_unlock = __ticket_spin_unlock, #endif }; +EXPORT_SYMBOL_GPL(pv_lock_ops); EXPORT_SYMBOL_GPL(pv_time_ops); EXPORT_SYMBOL (pv_cpu_ops); -- cgit v1.2.3 From 34646bca474142e1424e5f6c4a33cb2ba0930ea1 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 9 Jul 2008 15:42:09 +0200 Subject: x86, paravirt-spinlocks: fix boot hang the paravirt-spinlock patches caused a boot hang with this config: http://redhat.com/~mingo/misc/config-Wed_Jul__9_14_47_04_CEST_2008.bad i have bisected it down to: | commit e17b58c2e85bc2ad2afc07fb8d898017c2b75ed1 | Author: Jeremy Fitzhardinge | Date: Mon Jul 7 12:07:53 2008 -0700 | | xen: implement Xen-specific spinlocks i.e. applying that patch alone causes the hang. The hang happens in the ftrace self-test: initcall utsname_sysctl_init+0x0/0x19 returned 0 after 0 msecs calling init_sched_switch_trace+0x0/0x4c Testing tracer sched_switch: PASSED initcall init_sched_switch_trace+0x0/0x4c returned 0 after 167 msecs calling init_function_trace+0x0/0x12 Testing tracer ftrace: [hard hang] it should have continued like this: Testing tracer ftrace: PASSED initcall init_function_trace+0x0/0x12 returned 0 after 198 msecs calling init_irqsoff_tracer+0x0/0x14 Testing tracer irqsoff: PASSED initcall init_irqsoff_tracer+0x0/0x14 returned 0 after 3 msecs calling init_mmio_trace+0x0/0x12 initcall init_mmio_trace+0x0/0x12 returned 0 after 0 msecs the problem is that such lowlevel primitives as spinlocks should never be built with -pg (which ftrace does). Marking paravirt.o as non-pg and marking all spinlock ops as always-inline solve the hang. Signed-off-by: Ingo Molnar --- arch/x86/kernel/Makefile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 5112c84f542..78d52171400 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -7,10 +7,11 @@ extra-y := head_$(BITS).o head$(BITS).o head.o init_task.o vmlinu CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE) ifdef CONFIG_FTRACE -# Do not profile debug utilities +# Do not profile debug and lowlevel utilities CFLAGS_REMOVE_tsc_64.o = -pg CFLAGS_REMOVE_tsc_32.o = -pg CFLAGS_REMOVE_rtc.o = -pg +CFLAGS_REMOVE_paravirt.o = -pg endif # -- cgit v1.2.3 From 6718d0d6da2749d3bff522e6057e97e6aa85e4d1 Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Wed, 9 Jul 2008 01:07:02 -0700 Subject: x86 ptrace: block-step fix The enable_single_step() logic bails out early if TF is already set. That skips some of the bookkeeping that keeps things straight. This makes PTRACE_SINGLEBLOCK break the behavior of a user task that was already setting TF itself in user mode. Fix the bookkeeping to notice the old TF setting as it should. Test case at: http://sources.redhat.com/cgi-bin/cvsweb.cgi/~checkout~/tests/ptrace-tests/tests/step-jump-cont-strict.c?cvsroot=systemtap Signed-off-by: Roland McGrath --- arch/x86/kernel/step.c | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c index 92c20fee678..0d2cb363ea7 100644 --- a/arch/x86/kernel/step.c +++ b/arch/x86/kernel/step.c @@ -105,6 +105,7 @@ static int is_setting_trap_flag(struct task_struct *child, struct pt_regs *regs) static int enable_single_step(struct task_struct *child) { struct pt_regs *regs = task_pt_regs(child); + unsigned long oflags; /* * Always set TIF_SINGLESTEP - this guarantees that @@ -113,11 +114,7 @@ static int enable_single_step(struct task_struct *child) */ set_tsk_thread_flag(child, TIF_SINGLESTEP); - /* - * If TF was already set, don't do anything else - */ - if (regs->flags & X86_EFLAGS_TF) - return 0; + oflags = regs->flags; /* Set TF on the kernel stack.. */ regs->flags |= X86_EFLAGS_TF; @@ -126,9 +123,22 @@ static int enable_single_step(struct task_struct *child) * ..but if TF is changed by the instruction we will trace, * don't mark it as being "us" that set it, so that we * won't clear it by hand later. + * + * Note that if we don't actually execute the popf because + * of a signal arriving right now or suchlike, we will lose + * track of the fact that it really was "us" that set it. */ - if (is_setting_trap_flag(child, regs)) + if (is_setting_trap_flag(child, regs)) { + clear_tsk_thread_flag(child, TIF_FORCED_TF); return 0; + } + + /* + * If TF was already set, check whether it was us who set it. + * If not, we should never attempt a block step. + */ + if (oflags & X86_EFLAGS_TF) + return test_tsk_thread_flag(child, TIF_FORCED_TF); set_tsk_thread_flag(child, TIF_FORCED_TF); -- cgit v1.2.3 From 64f097331928b01d704047c1dbc738bb6d2a9bf9 Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Wed, 9 Jul 2008 01:33:14 -0700 Subject: x86 ptrace: unify TIF_SINGLESTEP This unifies the treatment of TIF_SINGLESTEP on i386 and x86_64. The bit is now excluded from _TIF_WORK_MASK on i386 as it has been on x86_64. This means the do_notify_resume() path using it is never used, so TIF_SINGLESTEP is not cleared on returning to user mode. Both now leave TIF_SINGLESTEP set when returning to user, so that it's already set on an int $0x80 system call entry. This removes the need for testing TF on the system_call path. Doing it this way fixes the regression for PTRACE_SINGLESTEP into a sigreturn syscall, introduced by commit 1e2e99f0e4aa6363e8515ed17011c210c8f1b52a. The clear_TF_reenable case that sets TIF_SINGLESTEP can only happen on a non-exception kernel entry, i.e. sysenter/syscall instruction. That will always get to the syscall exit tracing path. Signed-off-by: Roland McGrath --- arch/x86/kernel/entry_32.S | 4 ---- arch/x86/kernel/signal_32.c | 6 ------ arch/x86/kernel/signal_64.c | 6 ------ 3 files changed, 16 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 6bc07f0f120..0ad987d02b7 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -383,10 +383,6 @@ syscall_exit: # setting need_resched or sigpending # between sampling and the iret TRACE_IRQS_OFF - testl $X86_EFLAGS_TF,PT_EFLAGS(%esp) # If tracing set singlestep flag on exit - jz no_singlestep - orl $_TIF_SINGLESTEP,TI_flags(%ebp) -no_singlestep: movl TI_flags(%ebp), %ecx testw $_TIF_ALLWORK_MASK, %cx # current->work jne syscall_exit_work diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c index d9237363096..295b5f5c938 100644 --- a/arch/x86/kernel/signal_32.c +++ b/arch/x86/kernel/signal_32.c @@ -657,12 +657,6 @@ static void do_signal(struct pt_regs *regs) void do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) { - /* Pending single-step? */ - if (thread_info_flags & _TIF_SINGLESTEP) { - regs->flags |= X86_EFLAGS_TF; - clear_thread_flag(TIF_SINGLESTEP); - } - /* deal with pending signal delivery */ if (thread_info_flags & _TIF_SIGPENDING) do_signal(regs); diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c index e53b267662e..bf87684474f 100644 --- a/arch/x86/kernel/signal_64.c +++ b/arch/x86/kernel/signal_64.c @@ -487,12 +487,6 @@ static void do_signal(struct pt_regs *regs) void do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) { - /* Pending single-step? */ - if (thread_info_flags & _TIF_SINGLESTEP) { - regs->flags |= X86_EFLAGS_TF; - clear_thread_flag(TIF_SINGLESTEP); - } - #ifdef CONFIG_X86_MCE /* notify userspace of pending MCEs */ if (thread_info_flags & _TIF_MCE_NOTIFY) -- cgit v1.2.3 From d4d67150165df8bf1cc05e532f6efca96f907cab Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Wed, 9 Jul 2008 02:38:07 -0700 Subject: x86 ptrace: unify syscall tracing This unifies and cleans up the syscall tracing code on i386 and x86_64. Using a single function for entry and exit tracing on 32-bit made the do_syscall_trace() into some terrible spaghetti. The logic is clear and simple using separate syscall_trace_enter() and syscall_trace_leave() functions as on 64-bit. The unification adds PTRACE_SYSEMU and PTRACE_SYSEMU_SINGLESTEP support on x86_64, for 32-bit ptrace() callers and for 64-bit ptrace() callers tracing either 32-bit or 64-bit tasks. It behaves just like 32-bit. Changing syscall_trace_enter() to return the syscall number shortens all the assembly paths, while adding the SYSEMU feature in a simple way. Signed-off-by: Roland McGrath --- arch/x86/kernel/entry_32.S | 19 +++--- arch/x86/kernel/entry_64.S | 14 +++-- arch/x86/kernel/ptrace.c | 141 +++++++++++++++------------------------------ 3 files changed, 62 insertions(+), 112 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 0ad987d02b7..cadf73f70d3 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -332,7 +332,7 @@ sysenter_past_esp: GET_THREAD_INFO(%ebp) /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */ - testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp) + testw $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp) jnz syscall_trace_entry cmpl $(nr_syscalls), %eax jae syscall_badsys @@ -370,7 +370,7 @@ ENTRY(system_call) GET_THREAD_INFO(%ebp) # system call tracing in operation / emulation /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */ - testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp) + testw $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp) jnz syscall_trace_entry cmpl $(nr_syscalls), %eax jae syscall_badsys @@ -510,12 +510,8 @@ END(work_pending) syscall_trace_entry: movl $-ENOSYS,PT_EAX(%esp) movl %esp, %eax - xorl %edx,%edx - call do_syscall_trace - cmpl $0, %eax - jne resume_userspace # ret != 0 -> running under PTRACE_SYSEMU, - # so must skip actual syscall - movl PT_ORIG_EAX(%esp), %eax + call syscall_trace_enter + /* What it returned is what we'll actually use. */ cmpl $(nr_syscalls), %eax jnae syscall_call jmp syscall_exit @@ -524,14 +520,13 @@ END(syscall_trace_entry) # perform syscall exit tracing ALIGN syscall_exit_work: - testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP), %cl + testb $_TIF_WORK_SYSCALL_EXIT, %cl jz work_pending TRACE_IRQS_ON - ENABLE_INTERRUPTS(CLBR_ANY) # could let do_syscall_trace() call + ENABLE_INTERRUPTS(CLBR_ANY) # could let syscall_trace_leave() call # schedule() instead movl %esp, %eax - movl $1, %edx - call do_syscall_trace + call syscall_trace_leave jmp resume_userspace END(syscall_exit_work) CFI_ENDPROC diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index ae63e584c34..63001c6ecf6 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -349,8 +349,7 @@ ENTRY(system_call_after_swapgs) movq %rcx,RIP-ARGOFFSET(%rsp) CFI_REL_OFFSET rip,RIP-ARGOFFSET GET_THREAD_INFO(%rcx) - testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP), \ - TI_flags(%rcx) + testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%rcx) jnz tracesys cmpq $__NR_syscall_max,%rax ja badsys @@ -430,7 +429,12 @@ tracesys: FIXUP_TOP_OF_STACK %rdi movq %rsp,%rdi call syscall_trace_enter - LOAD_ARGS ARGOFFSET /* reload args from stack in case ptrace changed it */ + /* + * Reload arg registers from stack in case ptrace changed them. + * We don't reload %rax because syscall_trace_enter() returned + * the value it wants us to use in the table lookup. + */ + LOAD_ARGS ARGOFFSET, 1 RESTORE_REST cmpq $__NR_syscall_max,%rax ja int_ret_from_sys_call /* RAX(%rsp) set to -ENOSYS above */ @@ -483,7 +487,7 @@ int_very_careful: ENABLE_INTERRUPTS(CLBR_NONE) SAVE_REST /* Check for syscall exit trace */ - testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edx + testl $_TIF_WORK_SYSCALL_EXIT,%edx jz int_signal pushq %rdi CFI_ADJUST_CFA_OFFSET 8 @@ -491,7 +495,7 @@ int_very_careful: call syscall_trace_leave popq %rdi CFI_ADJUST_CFA_OFFSET -8 - andl $~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edi + andl $~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU),%edi jmp int_restore_rest int_signal: diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 77040b6070e..34e77b16a42 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -1357,8 +1357,6 @@ const struct user_regset_view *task_user_regset_view(struct task_struct *task) #endif } -#ifdef CONFIG_X86_32 - void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code) { struct siginfo info; @@ -1377,89 +1375,10 @@ void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code) force_sig_info(SIGTRAP, &info, tsk); } -/* notification of system call entry/exit - * - triggered by current->work.syscall_trace - */ -int do_syscall_trace(struct pt_regs *regs, int entryexit) -{ - int is_sysemu = test_thread_flag(TIF_SYSCALL_EMU); - /* - * With TIF_SYSCALL_EMU set we want to ignore TIF_SINGLESTEP for syscall - * interception - */ - int is_singlestep = !is_sysemu && test_thread_flag(TIF_SINGLESTEP); - int ret = 0; - - /* do the secure computing check first */ - if (!entryexit) - secure_computing(regs->orig_ax); - - if (unlikely(current->audit_context)) { - if (entryexit) - audit_syscall_exit(AUDITSC_RESULT(regs->ax), - regs->ax); - /* Debug traps, when using PTRACE_SINGLESTEP, must be sent only - * on the syscall exit path. Normally, when TIF_SYSCALL_AUDIT is - * not used, entry.S will call us only on syscall exit, not - * entry; so when TIF_SYSCALL_AUDIT is used we must avoid - * calling send_sigtrap() on syscall entry. - * - * Note that when PTRACE_SYSEMU_SINGLESTEP is used, - * is_singlestep is false, despite his name, so we will still do - * the correct thing. - */ - else if (is_singlestep) - goto out; - } - - if (!(current->ptrace & PT_PTRACED)) - goto out; - - /* If a process stops on the 1st tracepoint with SYSCALL_TRACE - * and then is resumed with SYSEMU_SINGLESTEP, it will come in - * here. We have to check this and return */ - if (is_sysemu && entryexit) - return 0; - - /* Fake a debug trap */ - if (is_singlestep) - send_sigtrap(current, regs, 0); - - if (!test_thread_flag(TIF_SYSCALL_TRACE) && !is_sysemu) - goto out; - - /* the 0x80 provides a way for the tracing parent to distinguish - between a syscall stop and SIGTRAP delivery */ - /* Note that the debugger could change the result of test_thread_flag!*/ - ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD) ? 0x80:0)); - - /* - * this isn't the same as continuing with a signal, but it will do - * for normal use. strace only continues with a signal if the - * stopping signal is not SIGTRAP. -brl - */ - if (current->exit_code) { - send_sig(current->exit_code, current, 1); - current->exit_code = 0; - } - ret = is_sysemu; -out: - if (unlikely(current->audit_context) && !entryexit) - audit_syscall_entry(AUDIT_ARCH_I386, regs->orig_ax, - regs->bx, regs->cx, regs->dx, regs->si); - if (ret == 0) - return 0; - - regs->orig_ax = -1; /* force skip of syscall restarting */ - if (unlikely(current->audit_context)) - audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax); - return 1; -} - -#else /* CONFIG_X86_64 */ - static void syscall_trace(struct pt_regs *regs) { + if (!(current->ptrace & PT_PTRACED)) + return; #if 0 printk("trace %s ip %lx sp %lx ax %d origrax %d caller %lx tiflags %x ptrace %x\n", @@ -1481,39 +1400,71 @@ static void syscall_trace(struct pt_regs *regs) } } -asmlinkage void syscall_trace_enter(struct pt_regs *regs) +#ifdef CONFIG_X86_32 +# define IS_IA32 1 +#elif defined CONFIG_IA32_EMULATION +# define IS_IA32 test_thread_flag(TIF_IA32) +#else +# define IS_IA32 0 +#endif + +/* + * We must return the syscall number to actually look up in the table. + * This can be -1L to skip running any syscall at all. + */ +asmregparm long syscall_trace_enter(struct pt_regs *regs) { + long ret = 0; + /* do the secure computing check first */ secure_computing(regs->orig_ax); - if (test_thread_flag(TIF_SYSCALL_TRACE) - && (current->ptrace & PT_PTRACED)) + if (unlikely(test_thread_flag(TIF_SYSCALL_EMU))) + ret = -1L; + + if (ret || test_thread_flag(TIF_SYSCALL_TRACE)) syscall_trace(regs); if (unlikely(current->audit_context)) { - if (test_thread_flag(TIF_IA32)) { + if (IS_IA32) audit_syscall_entry(AUDIT_ARCH_I386, regs->orig_ax, regs->bx, regs->cx, regs->dx, regs->si); - } else { +#ifdef CONFIG_X86_64 + else audit_syscall_entry(AUDIT_ARCH_X86_64, regs->orig_ax, regs->di, regs->si, regs->dx, regs->r10); - } +#endif } + + return ret ?: regs->orig_ax; } -asmlinkage void syscall_trace_leave(struct pt_regs *regs) +asmregparm void syscall_trace_leave(struct pt_regs *regs) { if (unlikely(current->audit_context)) audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax); - if ((test_thread_flag(TIF_SYSCALL_TRACE) - || test_thread_flag(TIF_SINGLESTEP)) - && (current->ptrace & PT_PTRACED)) + if (test_thread_flag(TIF_SYSCALL_TRACE)) syscall_trace(regs); -} -#endif /* CONFIG_X86_32 */ + /* + * If TIF_SYSCALL_EMU is set, we only get here because of + * TIF_SINGLESTEP (i.e. this is PTRACE_SYSEMU_SINGLESTEP). + * We already reported this syscall instruction in + * syscall_trace_enter(), so don't do any more now. + */ + if (unlikely(test_thread_flag(TIF_SYSCALL_EMU))) + return; + + /* + * If we are single-stepping, synthesize a trap to follow the + * system call instruction. + */ + if (test_thread_flag(TIF_SINGLESTEP) && + (current->ptrace & PT_PTRACED)) + send_sigtrap(current, regs, 0); +} -- cgit v1.2.3 From 380fdd7585a4c2f41b48925eba85c0654b7b858b Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Wed, 9 Jul 2008 02:39:29 -0700 Subject: x86 ptrace: user-sets-TF nits This closes some arcane holes in single-step handling that can arise only when user programs set TF directly (via popf or sigreturn) and then use vDSO (syscall/sysenter) system call entry. In those entry paths, the clear_TF_reenable case hits and we must check TIF_SINGLESTEP to be sure our bookkeeping stays correct wrt the user's view of TF. Signed-off-by: Roland McGrath --- arch/x86/kernel/ptrace.c | 10 ++++++++++ arch/x86/kernel/step.c | 13 +++++++++++++ 2 files changed, 23 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 34e77b16a42..e37dccce85d 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -1416,6 +1416,16 @@ asmregparm long syscall_trace_enter(struct pt_regs *regs) { long ret = 0; + /* + * If we stepped into a sysenter/syscall insn, it trapped in + * kernel mode; do_debug() cleared TF and set TIF_SINGLESTEP. + * If user-mode had set TF itself, then it's still clear from + * do_debug() and we need to set it again to restore the user + * state. If we entered on the slow path, TF was already set. + */ + if (test_thread_flag(TIF_SINGLESTEP)) + regs->flags |= X86_EFLAGS_TF; + /* do the secure computing check first */ secure_computing(regs->orig_ax); diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c index 0d2cb363ea7..e8b9863ef8c 100644 --- a/arch/x86/kernel/step.c +++ b/arch/x86/kernel/step.c @@ -107,6 +107,19 @@ static int enable_single_step(struct task_struct *child) struct pt_regs *regs = task_pt_regs(child); unsigned long oflags; + /* + * If we stepped into a sysenter/syscall insn, it trapped in + * kernel mode; do_debug() cleared TF and set TIF_SINGLESTEP. + * If user-mode had set TF itself, then it's still clear from + * do_debug() and we need to set it again to restore the user + * state so we don't wrongly set TIF_FORCED_TF below. + * If enable_single_step() was used last and that is what + * set TIF_SINGLESTEP, then both TF and TIF_FORCED_TF are + * already set and our bookkeeping is fine. + */ + if (unlikely(test_tsk_thread_flag(child, TIF_SINGLESTEP))) + regs->flags |= X86_EFLAGS_TF; + /* * Always set TIF_SINGLESTEP - this guarantees that * we single-step system calls etc.. This will also -- cgit v1.2.3 From fab3b58d3b242b5903f78d60d86803a8aecdf6de Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 17 Jul 2008 13:50:15 +0200 Subject: x86 reboot quirks: add Dell Precision WorkStation T5400 as reported in: "reboot=bios is mandatory on Dell T5400 server." http://bugzilla.kernel.org/show_bug.cgi?id=11108 add a DMI reboot quirk. Signed-off-by: Ingo Molnar Cc: --- arch/x86/kernel/reboot.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index f8a62160e15..9dcf39c0297 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -177,6 +177,14 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 2400"), }, }, + { /* Handle problems with rebooting on Dell T5400's */ + .callback = set_bios_reboot, + .ident = "Dell Precision T5400", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "Precision WorkStation T5400"), + }, + }, { /* Handle problems with rebooting on HP laptops */ .callback = set_bios_reboot, .ident = "HP Compaq Laptop", -- cgit v1.2.3 From 4fdf08b5bf8d449cc9897395895157c6ff8ddc41 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Thu, 17 Jul 2008 11:29:24 -0700 Subject: x86: unify and correct the GDT_ENTRY() macro Merge the GDT_ENTRY() macro between arch/x86/boot/pm.c and arch/x86/kernel/acpi/sleep.c and put the new one in . While we're at it, correct the bitmasks for the limit and flags. The new version relies on using ULL constants in order to cause type promotion rather than explicit casts; this avoids having to include in . Signed-off-by: H. Peter Anvin --- arch/x86/kernel/acpi/sleep.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index 868de3d5c39..a3ddad18aaa 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -9,6 +9,7 @@ #include #include #include +#include #include "realmode/wakeup.h" #include "sleep.h" @@ -23,15 +24,6 @@ static unsigned long acpi_realmode; static char temp_stack[10240]; #endif -/* XXX: this macro should move to asm-x86/segment.h and be shared with the - boot code... */ -#define GDT_ENTRY(flags, base, limit) \ - (((u64)(base & 0xff000000) << 32) | \ - ((u64)flags << 40) | \ - ((u64)(limit & 0x00ff0000) << 32) | \ - ((u64)(base & 0x00ffffff) << 16) | \ - ((u64)(limit & 0x0000ffff))) - /** * acpi_save_state_mem - save kernel state * -- cgit v1.2.3 From 6879827f4e08da219c99b91e4e1d793a924103e3 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 18 Jul 2008 01:21:53 +0200 Subject: x86: remove arch/x86/kernel/smpcommon_32.c Yinghai Lu noticed that arch/x86/kernel/smpcommon_32.c got renamed to arch/x86/kernel/smpcommon.c but the old almost-empty file stayed around. Zap it. Reported-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/smpcommon_32.c | 1 - 1 file changed, 1 deletion(-) delete mode 100644 arch/x86/kernel/smpcommon_32.c (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/smpcommon_32.c b/arch/x86/kernel/smpcommon_32.c deleted file mode 100644 index 8b137891791..00000000000 --- a/arch/x86/kernel/smpcommon_32.c +++ /dev/null @@ -1 +0,0 @@ - -- cgit v1.2.3 From 29cbeb0e17d9d2ca824f62f71cfa7360b3157112 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 17 Jul 2008 21:50:23 -0700 Subject: x86: use cpu_clear in remove_cpu_from_maps Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/smpboot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 687376ab07e..27456574f07 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1311,7 +1311,7 @@ static void __ref remove_cpu_from_maps(int cpu) cpu_clear(cpu, cpu_callout_map); cpu_clear(cpu, cpu_callin_map); /* was set by cpu_init() */ - clear_bit(cpu, (unsigned long *)&cpu_initialized); + cpu_clear(cpu, cpu_initialized); numa_remove_cpu(cpu); } -- cgit v1.2.3 From 593f4a788e5d09e9f00182561437461b0b564de4 Mon Sep 17 00:00:00 2001 From: "Maciej W. Rozycki" Date: Wed, 16 Jul 2008 19:15:30 +0100 Subject: x86: APIC: remove apic_write_around(); use alternatives Use alternatives to select the workaround for the 11AP Pentium erratum for the affected steppings on the fly rather than build time. Remove the X86_GOOD_APIC configuration option and replace all the calls to apic_write_around() with plain apic_write(), protecting accesses to the ESR as appropriate due to the 3AP Pentium erratum. Remove apic_read_around() and all its invocations altogether as not needed. Remove apic_write_atomic() and all its implementing backends. The use of ASM_OUTPUT2() is not strictly needed for input constraints, but I have used it for readability's sake. I had the feeling no one else was brave enough to do it, so I went ahead and here it is. Verified by checking the generated assembly and tested with both a 32-bit and a 64-bit configuration, also with the 11AP "feature" forced on and verified with gdb on /proc/kcore to work as expected (as an 11AP machines are quite hard to get hands on these days). Some script complained about the use of "volatile", but apic_write() needs it for the same reason and is effectively a replacement for writel(), so I have disregarded it. I am not sure what the policy wrt defconfig files is, they are generated and there is risk of a conflict resulting from an unrelated change, so I have left changes to them out. The option will get removed from them at the next run. Some testing with machines other than mine will be needed to avoid some stupid mistake, but despite its volume, the change is not really that intrusive, so I am fairly confident that because it works for me, it will everywhere. Signed-off-by: Maciej W. Rozycki Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic_32.c | 75 ++++++++++++++++++++--------------------- arch/x86/kernel/cpu/bugs.c | 23 +------------ arch/x86/kernel/cpu/intel.c | 10 ++++++ arch/x86/kernel/cpu/mcheck/p4.c | 4 +-- arch/x86/kernel/io_apic_32.c | 14 ++++---- arch/x86/kernel/ipi.c | 6 ++-- arch/x86/kernel/nmi.c | 4 +-- arch/x86/kernel/paravirt.c | 1 - arch/x86/kernel/smpboot.c | 49 +++++++++++---------------- arch/x86/kernel/vmi_32.c | 1 - 10 files changed, 80 insertions(+), 107 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c index a437d027f20..2bc1186cc95 100644 --- a/arch/x86/kernel/apic_32.c +++ b/arch/x86/kernel/apic_32.c @@ -177,7 +177,7 @@ void __cpuinit enable_NMI_through_LVT0(void) /* Level triggered for 82489DX */ if (!lapic_is_integrated()) v |= APIC_LVT_LEVEL_TRIGGER; - apic_write_around(APIC_LVT0, v); + apic_write(APIC_LVT0, v); } /** @@ -212,9 +212,6 @@ int lapic_get_maxlvt(void) * this function twice on the boot CPU, once with a bogus timeout * value, second time for real. The other (noncalibrating) CPUs * call this function only once, with the real, calibrated value. - * - * We do reads before writes even if unnecessary, to get around the - * P5 APIC double write bug. */ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen) { @@ -229,18 +226,18 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen) if (!irqen) lvtt_value |= APIC_LVT_MASKED; - apic_write_around(APIC_LVTT, lvtt_value); + apic_write(APIC_LVTT, lvtt_value); /* * Divide PICLK by 16 */ tmp_value = apic_read(APIC_TDCR); - apic_write_around(APIC_TDCR, (tmp_value - & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) - | APIC_TDR_DIV_16); + apic_write(APIC_TDCR, + (tmp_value & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) | + APIC_TDR_DIV_16); if (!oneshot) - apic_write_around(APIC_TMICT, clocks/APIC_DIVISOR); + apic_write(APIC_TMICT, clocks / APIC_DIVISOR); } /* @@ -249,7 +246,7 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen) static int lapic_next_event(unsigned long delta, struct clock_event_device *evt) { - apic_write_around(APIC_TMICT, delta); + apic_write(APIC_TMICT, delta); return 0; } @@ -278,7 +275,7 @@ static void lapic_timer_setup(enum clock_event_mode mode, case CLOCK_EVT_MODE_SHUTDOWN: v = apic_read(APIC_LVTT); v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); - apic_write_around(APIC_LVTT, v); + apic_write(APIC_LVTT, v); break; case CLOCK_EVT_MODE_RESUME: /* Nothing to do here */ @@ -693,44 +690,44 @@ void clear_local_APIC(void) */ if (maxlvt >= 3) { v = ERROR_APIC_VECTOR; /* any non-zero vector will do */ - apic_write_around(APIC_LVTERR, v | APIC_LVT_MASKED); + apic_write(APIC_LVTERR, v | APIC_LVT_MASKED); } /* * Careful: we have to set masks only first to deassert * any level-triggered sources. */ v = apic_read(APIC_LVTT); - apic_write_around(APIC_LVTT, v | APIC_LVT_MASKED); + apic_write(APIC_LVTT, v | APIC_LVT_MASKED); v = apic_read(APIC_LVT0); - apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED); + apic_write(APIC_LVT0, v | APIC_LVT_MASKED); v = apic_read(APIC_LVT1); - apic_write_around(APIC_LVT1, v | APIC_LVT_MASKED); + apic_write(APIC_LVT1, v | APIC_LVT_MASKED); if (maxlvt >= 4) { v = apic_read(APIC_LVTPC); - apic_write_around(APIC_LVTPC, v | APIC_LVT_MASKED); + apic_write(APIC_LVTPC, v | APIC_LVT_MASKED); } /* lets not touch this if we didn't frob it */ #ifdef CONFIG_X86_MCE_P4THERMAL if (maxlvt >= 5) { v = apic_read(APIC_LVTTHMR); - apic_write_around(APIC_LVTTHMR, v | APIC_LVT_MASKED); + apic_write(APIC_LVTTHMR, v | APIC_LVT_MASKED); } #endif /* * Clean APIC state for other OSs: */ - apic_write_around(APIC_LVTT, APIC_LVT_MASKED); - apic_write_around(APIC_LVT0, APIC_LVT_MASKED); - apic_write_around(APIC_LVT1, APIC_LVT_MASKED); + apic_write(APIC_LVTT, APIC_LVT_MASKED); + apic_write(APIC_LVT0, APIC_LVT_MASKED); + apic_write(APIC_LVT1, APIC_LVT_MASKED); if (maxlvt >= 3) - apic_write_around(APIC_LVTERR, APIC_LVT_MASKED); + apic_write(APIC_LVTERR, APIC_LVT_MASKED); if (maxlvt >= 4) - apic_write_around(APIC_LVTPC, APIC_LVT_MASKED); + apic_write(APIC_LVTPC, APIC_LVT_MASKED); #ifdef CONFIG_X86_MCE_P4THERMAL if (maxlvt >= 5) - apic_write_around(APIC_LVTTHMR, APIC_LVT_MASKED); + apic_write(APIC_LVTTHMR, APIC_LVT_MASKED); #endif /* Integrated APIC (!82489DX) ? */ if (lapic_is_integrated()) { @@ -756,7 +753,7 @@ void disable_local_APIC(void) */ value = apic_read(APIC_SPIV); value &= ~APIC_SPIV_APIC_ENABLED; - apic_write_around(APIC_SPIV, value); + apic_write(APIC_SPIV, value); /* * When LAPIC was disabled by the BIOS and enabled by the kernel, @@ -865,8 +862,8 @@ void __init sync_Arb_IDs(void) apic_wait_icr_idle(); apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n"); - apic_write_around(APIC_ICR, APIC_DEST_ALLINC | APIC_INT_LEVELTRIG - | APIC_DM_INIT); + apic_write(APIC_ICR, + APIC_DEST_ALLINC | APIC_INT_LEVELTRIG | APIC_DM_INIT); } /* @@ -902,16 +899,16 @@ void __init init_bsp_APIC(void) else value |= APIC_SPIV_FOCUS_DISABLED; value |= SPURIOUS_APIC_VECTOR; - apic_write_around(APIC_SPIV, value); + apic_write(APIC_SPIV, value); /* * Set up the virtual wire mode. */ - apic_write_around(APIC_LVT0, APIC_DM_EXTINT); + apic_write(APIC_LVT0, APIC_DM_EXTINT); value = APIC_DM_NMI; if (!lapic_is_integrated()) /* 82489DX */ value |= APIC_LVT_LEVEL_TRIGGER; - apic_write_around(APIC_LVT1, value); + apic_write(APIC_LVT1, value); } static void __cpuinit lapic_setup_esr(void) @@ -926,7 +923,7 @@ static void __cpuinit lapic_setup_esr(void) /* enables sending errors */ value = ERROR_APIC_VECTOR; - apic_write_around(APIC_LVTERR, value); + apic_write(APIC_LVTERR, value); /* * spec says clear errors after enabling vector. */ @@ -989,7 +986,7 @@ void __cpuinit setup_local_APIC(void) */ value = apic_read(APIC_TASKPRI); value &= ~APIC_TPRI_MASK; - apic_write_around(APIC_TASKPRI, value); + apic_write(APIC_TASKPRI, value); /* * After a crash, we no longer service the interrupts and a pending @@ -1047,7 +1044,7 @@ void __cpuinit setup_local_APIC(void) * Set spurious IRQ vector */ value |= SPURIOUS_APIC_VECTOR; - apic_write_around(APIC_SPIV, value); + apic_write(APIC_SPIV, value); /* * Set up LVT0, LVT1: @@ -1069,7 +1066,7 @@ void __cpuinit setup_local_APIC(void) apic_printk(APIC_VERBOSE, "masked ExtINT on CPU#%d\n", smp_processor_id()); } - apic_write_around(APIC_LVT0, value); + apic_write(APIC_LVT0, value); /* * only the BP should see the LINT1 NMI signal, obviously. @@ -1080,7 +1077,7 @@ void __cpuinit setup_local_APIC(void) value = APIC_DM_NMI | APIC_LVT_MASKED; if (!integrated) /* 82489DX */ value |= APIC_LVT_LEVEL_TRIGGER; - apic_write_around(APIC_LVT1, value); + apic_write(APIC_LVT1, value); } void __cpuinit end_local_APIC_setup(void) @@ -1091,7 +1088,7 @@ void __cpuinit end_local_APIC_setup(void) /* Disable the local apic timer */ value = apic_read(APIC_LVTT); value |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); - apic_write_around(APIC_LVTT, value); + apic_write(APIC_LVTT, value); setup_apic_nmi_watchdog(NULL); apic_pm_activate(); @@ -1419,7 +1416,7 @@ void disconnect_bsp_APIC(int virt_wire_setup) value &= ~APIC_VECTOR_MASK; value |= APIC_SPIV_APIC_ENABLED; value |= 0xf; - apic_write_around(APIC_SPIV, value); + apic_write(APIC_SPIV, value); if (!virt_wire_setup) { /* @@ -1432,10 +1429,10 @@ void disconnect_bsp_APIC(int virt_wire_setup) APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED); value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT); - apic_write_around(APIC_LVT0, value); + apic_write(APIC_LVT0, value); } else { /* Disable LVT0 */ - apic_write_around(APIC_LVT0, APIC_LVT_MASKED); + apic_write(APIC_LVT0, APIC_LVT_MASKED); } /* @@ -1449,7 +1446,7 @@ void disconnect_bsp_APIC(int virt_wire_setup) APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED); value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI); - apic_write_around(APIC_LVT1, value); + apic_write(APIC_LVT1, value); } } diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 1b1c56bb338..c9b58a806e8 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -131,13 +131,7 @@ static void __init check_popad(void) * (for due to lack of "invlpg" and working WP on a i386) * - In order to run on anything without a TSC, we need to be * compiled for a i486. - * - In order to support the local APIC on a buggy Pentium machine, - * we need to be compiled with CONFIG_X86_GOOD_APIC disabled, - * which happens implicitly if compiled for a Pentium or lower - * (unless an advanced selection of CPU features is used) as an - * otherwise config implies a properly working local APIC without - * the need to do extra reads from the APIC. -*/ + */ static void __init check_config(void) { @@ -151,21 +145,6 @@ static void __init check_config(void) if (boot_cpu_data.x86 == 3) panic("Kernel requires i486+ for 'invlpg' and other features"); #endif - -/* - * If we were told we had a good local APIC, check for buggy Pentia, - * i.e. all B steppings and the C2 stepping of P54C when using their - * integrated APIC (see 11AP erratum in "Pentium Processor - * Specification Update"). - */ -#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_GOOD_APIC) - if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL - && cpu_has_apic - && boot_cpu_data.x86 == 5 - && boot_cpu_data.x86_model == 2 - && (boot_cpu_data.x86_mask < 6 || boot_cpu_data.x86_mask == 11)) - panic("Kernel compiled for PMMX+, assumes a local APIC without the read-before-write bug!"); -#endif } diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 70609efdf1d..b75f2569b8f 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -227,6 +227,16 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c) if (cpu_has_bts) ds_init_intel(c); + /* + * See if we have a good local APIC by checking for buggy Pentia, + * i.e. all B steppings and the C2 stepping of P54C when using their + * integrated APIC (see 11AP erratum in "Pentium Processor + * Specification Update"). + */ + if (cpu_has_apic && (c->x86<<8 | c->x86_model<<4) == 0x520 && + (c->x86_mask < 0x6 || c->x86_mask == 0xb)) + set_cpu_cap(c, X86_FEATURE_11AP); + #ifdef CONFIG_X86_NUMAQ numaq_tsc_disable(); #endif diff --git a/arch/x86/kernel/cpu/mcheck/p4.c b/arch/x86/kernel/cpu/mcheck/p4.c index eef001ad3bd..9b60fce09f7 100644 --- a/arch/x86/kernel/cpu/mcheck/p4.c +++ b/arch/x86/kernel/cpu/mcheck/p4.c @@ -102,7 +102,7 @@ static void intel_init_thermal(struct cpuinfo_x86 *c) /* The temperature transition interrupt handler setup */ h = THERMAL_APIC_VECTOR; /* our delivery vector */ h |= (APIC_DM_FIXED | APIC_LVT_MASKED); /* we'll mask till we're ready */ - apic_write_around(APIC_LVTTHMR, h); + apic_write(APIC_LVTTHMR, h); rdmsr(MSR_IA32_THERM_INTERRUPT, l, h); wrmsr(MSR_IA32_THERM_INTERRUPT, l | 0x03 , h); @@ -114,7 +114,7 @@ static void intel_init_thermal(struct cpuinfo_x86 *c) wrmsr(MSR_IA32_MISC_ENABLE, l | (1<<3), h); l = apic_read(APIC_LVTTHMR); - apic_write_around(APIC_LVTTHMR, l & ~APIC_LVT_MASKED); + apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED); printk(KERN_INFO "CPU%d: Thermal monitoring enabled\n", cpu); /* enable thermal throttle processing */ diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c index 558abf4c796..eabaf9244f5 100644 --- a/arch/x86/kernel/io_apic_32.c +++ b/arch/x86/kernel/io_apic_32.c @@ -756,7 +756,7 @@ void send_IPI_self(int vector) /* * Send the IPI. The write to APIC_ICR fires this off. */ - apic_write_around(APIC_ICR, cfg); + apic_write(APIC_ICR, cfg); } #endif /* !CONFIG_SMP */ @@ -2030,7 +2030,7 @@ static void mask_lapic_irq(unsigned int irq) unsigned long v; v = apic_read(APIC_LVT0); - apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED); + apic_write(APIC_LVT0, v | APIC_LVT_MASKED); } static void unmask_lapic_irq(unsigned int irq) @@ -2038,7 +2038,7 @@ static void unmask_lapic_irq(unsigned int irq) unsigned long v; v = apic_read(APIC_LVT0); - apic_write_around(APIC_LVT0, v & ~APIC_LVT_MASKED); + apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED); } static struct irq_chip lapic_chip __read_mostly = { @@ -2168,7 +2168,7 @@ static inline void __init check_timer(void) * The AEOI mode will finish them in the 8259A * automatically. */ - apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); + apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); init_8259A(1); timer_ack = (nmi_watchdog == NMI_IO_APIC && !APIC_INTEGRATED(ver)); @@ -2256,7 +2256,7 @@ static inline void __init check_timer(void) printk(KERN_INFO "...trying to set up timer as Virtual Wire IRQ..."); lapic_register_intr(0, vector); - apic_write_around(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */ + apic_write(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */ enable_8259A_irq(0); if (timer_irq_works()) { @@ -2264,14 +2264,14 @@ static inline void __init check_timer(void) goto out; } disable_8259A_irq(0); - apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | vector); + apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | vector); printk(" failed.\n"); printk(KERN_INFO "...trying to set up timer as ExtINT IRQ..."); init_8259A(0); make_8259A_irq(0); - apic_write_around(APIC_LVT0, APIC_DM_EXTINT); + apic_write(APIC_LVT0, APIC_DM_EXTINT); unlock_ExtINT_logic(); diff --git a/arch/x86/kernel/ipi.c b/arch/x86/kernel/ipi.c index 9d98cda39ad..3f7537b669d 100644 --- a/arch/x86/kernel/ipi.c +++ b/arch/x86/kernel/ipi.c @@ -70,7 +70,7 @@ void __send_IPI_shortcut(unsigned int shortcut, int vector) /* * Send the IPI. The write to APIC_ICR fires this off. */ - apic_write_around(APIC_ICR, cfg); + apic_write(APIC_ICR, cfg); } void send_IPI_self(int vector) @@ -98,7 +98,7 @@ static inline void __send_IPI_dest_field(unsigned long mask, int vector) * prepare target chip field */ cfg = __prepare_ICR2(mask); - apic_write_around(APIC_ICR2, cfg); + apic_write(APIC_ICR2, cfg); /* * program the ICR @@ -108,7 +108,7 @@ static inline void __send_IPI_dest_field(unsigned long mask, int vector) /* * Send the IPI. The write to APIC_ICR fires this off. */ - apic_write_around(APIC_ICR, cfg); + apic_write(APIC_ICR, cfg); } /* diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index ec024b3baad..384b49fed59 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -263,7 +263,7 @@ late_initcall(init_lapic_nmi_sysfs); static void __acpi_nmi_enable(void *__unused) { - apic_write_around(APIC_LVT0, APIC_DM_NMI); + apic_write(APIC_LVT0, APIC_DM_NMI); } /* @@ -277,7 +277,7 @@ void acpi_nmi_enable(void) static void __acpi_nmi_disable(void *__unused) { - apic_write_around(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED); + apic_write(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED); } /* diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index e0f571d58c1..5d7326a60b7 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -361,7 +361,6 @@ struct pv_cpu_ops pv_cpu_ops = { struct pv_apic_ops pv_apic_ops = { #ifdef CONFIG_X86_LOCAL_APIC .apic_write = native_apic_write, - .apic_write_atomic = native_apic_write_atomic, .apic_read = native_apic_read, .setup_boot_clock = setup_boot_APIC_clock, .setup_secondary_clock = setup_secondary_APIC_clock, diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 687376ab07e..f251f5c3882 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -546,8 +546,8 @@ static inline void __inquire_remote_apic(int apicid) printk(KERN_CONT "a previous APIC delivery may have failed\n"); - apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(apicid)); - apic_write_around(APIC_ICR, APIC_DM_REMRD | regs[i]); + apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(apicid)); + apic_write(APIC_ICR, APIC_DM_REMRD | regs[i]); timeout = 0; do { @@ -579,11 +579,11 @@ wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip) int maxlvt; /* Target chip */ - apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(logical_apicid)); + apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(logical_apicid)); /* Boot on the stack */ /* Kick the second */ - apic_write_around(APIC_ICR, APIC_DM_NMI | APIC_DEST_LOGICAL); + apic_write(APIC_ICR, APIC_DM_NMI | APIC_DEST_LOGICAL); Dprintk("Waiting for send to finish...\n"); send_status = safe_apic_wait_icr_idle(); @@ -592,14 +592,9 @@ wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip) * Give the other CPU some time to accept the IPI. */ udelay(200); - /* - * Due to the Pentium erratum 3AP. - */ maxlvt = lapic_get_maxlvt(); - if (maxlvt > 3) { - apic_read_around(APIC_SPIV); + if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ apic_write(APIC_ESR, 0); - } accept_status = (apic_read(APIC_ESR) & 0xEF); Dprintk("NMI sent.\n"); @@ -625,12 +620,14 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip) return send_status; } + maxlvt = lapic_get_maxlvt(); + /* * Be paranoid about clearing APIC errors. */ if (APIC_INTEGRATED(apic_version[phys_apicid])) { - apic_read_around(APIC_SPIV); - apic_write(APIC_ESR, 0); + if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ + apic_write(APIC_ESR, 0); apic_read(APIC_ESR); } @@ -639,13 +636,13 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip) /* * Turn INIT on target chip */ - apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid)); + apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid)); /* * Send IPI */ - apic_write_around(APIC_ICR, APIC_INT_LEVELTRIG | APIC_INT_ASSERT - | APIC_DM_INIT); + apic_write(APIC_ICR, + APIC_INT_LEVELTRIG | APIC_INT_ASSERT | APIC_DM_INIT); Dprintk("Waiting for send to finish...\n"); send_status = safe_apic_wait_icr_idle(); @@ -655,10 +652,10 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip) Dprintk("Deasserting INIT.\n"); /* Target chip */ - apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid)); + apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid)); /* Send IPI */ - apic_write_around(APIC_ICR, APIC_INT_LEVELTRIG | APIC_DM_INIT); + apic_write(APIC_ICR, APIC_INT_LEVELTRIG | APIC_DM_INIT); Dprintk("Waiting for send to finish...\n"); send_status = safe_apic_wait_icr_idle(); @@ -689,12 +686,10 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip) */ Dprintk("#startup loops: %d.\n", num_starts); - maxlvt = lapic_get_maxlvt(); - for (j = 1; j <= num_starts; j++) { Dprintk("Sending STARTUP #%d.\n", j); - apic_read_around(APIC_SPIV); - apic_write(APIC_ESR, 0); + if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ + apic_write(APIC_ESR, 0); apic_read(APIC_ESR); Dprintk("After apic_write.\n"); @@ -703,12 +698,11 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip) */ /* Target chip */ - apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid)); + apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid)); /* Boot on the stack */ /* Kick the second */ - apic_write_around(APIC_ICR, APIC_DM_STARTUP - | (start_eip >> 12)); + apic_write(APIC_ICR, APIC_DM_STARTUP | (start_eip >> 12)); /* * Give the other CPU some time to accept the IPI. @@ -724,13 +718,8 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip) * Give the other CPU some time to accept the IPI. */ udelay(200); - /* - * Due to the Pentium erratum 3AP. - */ - if (maxlvt > 3) { - apic_read_around(APIC_SPIV); + if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ apic_write(APIC_ESR, 0); - } accept_status = (apic_read(APIC_ESR) & 0xEF); if (send_status || accept_status) break; diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c index b15346092b7..0a1b1a9d922 100644 --- a/arch/x86/kernel/vmi_32.c +++ b/arch/x86/kernel/vmi_32.c @@ -906,7 +906,6 @@ static inline int __init activate_vmi(void) #ifdef CONFIG_X86_LOCAL_APIC para_fill(pv_apic_ops.apic_read, APICRead); para_fill(pv_apic_ops.apic_write, APICWrite); - para_fill(pv_apic_ops.apic_write_atomic, APICWrite); #endif /* -- cgit v1.2.3 From 89b3b1f41bd94085da2f08dcb719bdbf7e8e9d57 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Tue, 15 Jul 2008 21:02:54 +0400 Subject: x86: apic_64 - make calibrate_APIC_clock to return error code Make calibration_result to return error and check calibration_result to be sufficient inside calibrate_APIC_clock. Signed-off-by: Cyrill Gorcunov Cc: Cyrill Gorcunov Cc: macro@linux-mips.org Cc: yhlu.kernel@gmail.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic_64.c | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c index 1e3d32e27c1..ce294d623e5 100644 --- a/arch/x86/kernel/apic_64.c +++ b/arch/x86/kernel/apic_64.c @@ -314,7 +314,7 @@ static void setup_APIC_timer(void) #define TICK_COUNT 100000000 -static void __init calibrate_APIC_clock(void) +static int __init calibrate_APIC_clock(void) { unsigned apic, apic_start; unsigned long tsc, tsc_start; @@ -368,6 +368,17 @@ static void __init calibrate_APIC_clock(void) clockevent_delta2ns(0xF, &lapic_clockevent); calibration_result = result / HZ; + + /* + * Do a sanity check on the APIC calibration result + */ + if (calibration_result < (1000000 / HZ)) { + printk(KERN_WARNING + "APIC frequency too slow, disabling apic timer\n"); + return -1; + } + + return 0; } /* @@ -394,14 +405,7 @@ void __init setup_boot_APIC_clock(void) } printk(KERN_INFO "Using local APIC timer interrupts.\n"); - calibrate_APIC_clock(); - - /* - * Do a sanity check on the APIC calibration result - */ - if (calibration_result < (1000000 / HZ)) { - printk(KERN_WARNING - "APIC frequency too slow, disabling apic timer\n"); + if (calibrate_APIC_clock()) { /* No broadcast on UP ! */ if (num_possible_cpus() > 1) setup_APIC_timer(); -- cgit v1.2.3 From 836c129de971d526b6e85b8ad760bd635a00215e Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Tue, 15 Jul 2008 21:02:55 +0400 Subject: x86: apic_32 - introduce calibrate_APIC_clock Introduce calibrate_APIC_clock so it could help in further 32/64bit apic code merging. Signed-off-by: Cyrill Gorcunov Cc: Cyrill Gorcunov Cc: macro@linux-mips.org Cc: yhlu.kernel@gmail.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic_32.c | 93 +++++++++++++++++++++++++---------------------- 1 file changed, 50 insertions(+), 43 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c index 2bc1186cc95..d2a7eb511d6 100644 --- a/arch/x86/kernel/apic_32.c +++ b/arch/x86/kernel/apic_32.c @@ -369,12 +369,7 @@ static void __init lapic_cal_handler(struct clock_event_device *dev) } } -/* - * Setup the boot APIC - * - * Calibrate and verify the result. - */ -void __init setup_boot_APIC_clock(void) +static int __init calibrate_APIC_clock(void) { struct clock_event_device *levt = &__get_cpu_var(lapic_events); const long pm_100ms = PMTMR_TICKS_PER_SEC/10; @@ -384,24 +379,6 @@ void __init setup_boot_APIC_clock(void) long delta, deltapm; int pm_referenced = 0; - /* - * The local apic timer can be disabled via the kernel - * commandline or from the CPU detection code. Register the lapic - * timer as a dummy clock event source on SMP systems, so the - * broadcast mechanism is used. On UP systems simply ignore it. - */ - if (local_apic_timer_disabled) { - /* No broadcast on UP ! */ - if (num_possible_cpus() > 1) { - lapic_clockevent.mult = 1; - setup_APIC_timer(); - } - return; - } - - apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n" - "calibrating APIC timer ...\n"); - local_irq_disable(); /* Replace the global interrupt handler */ @@ -486,8 +463,6 @@ void __init setup_boot_APIC_clock(void) calibration_result / (1000000 / HZ), calibration_result % (1000000 / HZ)); - local_apic_timer_verify_ok = 1; - /* * Do a sanity check on the APIC calibration result */ @@ -495,12 +470,11 @@ void __init setup_boot_APIC_clock(void) local_irq_enable(); printk(KERN_WARNING "APIC frequency too slow, disabling apic timer\n"); - /* No broadcast on UP ! */ - if (num_possible_cpus() > 1) - setup_APIC_timer(); - return; + return -1; } + local_apic_timer_verify_ok = 1; + /* We trust the pm timer based calibration */ if (!pm_referenced) { apic_printk(APIC_VERBOSE, "... verify APIC timer\n"); @@ -540,22 +514,55 @@ void __init setup_boot_APIC_clock(void) if (!local_apic_timer_verify_ok) { printk(KERN_WARNING "APIC timer disabled due to verification failure.\n"); + return -1; + } + + return 0; +} + +/* + * Setup the boot APIC + * + * Calibrate and verify the result. + */ +void __init setup_boot_APIC_clock(void) +{ + /* + * The local apic timer can be disabled via the kernel + * commandline or from the CPU detection code. Register the lapic + * timer as a dummy clock event source on SMP systems, so the + * broadcast mechanism is used. On UP systems simply ignore it. + */ + if (local_apic_timer_disabled) { /* No broadcast on UP ! */ - if (num_possible_cpus() == 1) - return; - } else { - /* - * If nmi_watchdog is set to IO_APIC, we need the - * PIT/HPET going. Otherwise register lapic as a dummy - * device. - */ - if (nmi_watchdog != NMI_IO_APIC) - lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY; - else - printk(KERN_WARNING "APIC timer registered as dummy," - " due to nmi_watchdog=%d!\n", nmi_watchdog); + if (num_possible_cpus() > 1) { + lapic_clockevent.mult = 1; + setup_APIC_timer(); + } + return; + } + + apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n" + "calibrating APIC timer ...\n"); + + if (calibrate_APIC_clock()) { + /* No broadcast on UP ! */ + if (num_possible_cpus() > 1) + setup_APIC_timer(); + return; } + /* + * If nmi_watchdog is set to IO_APIC, we need the + * PIT/HPET going. Otherwise register lapic as a dummy + * device. + */ + if (nmi_watchdog != NMI_IO_APIC) + lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY; + else + printk(KERN_WARNING "APIC timer registered as dummy," + " due to nmi_watchdog=%d!\n", nmi_watchdog); + /* Setup the lapic or request the broadcast */ setup_APIC_timer(); } -- cgit v1.2.3 From 17c44697f293cf24cbbf51b4a5bd15f4fbc88b90 Mon Sep 17 00:00:00 2001 From: "Maciej W. Rozycki" Date: Mon, 14 Jul 2008 17:18:04 +0100 Subject: x86: I/O APIC: Include required by some code Include for i8259A_lock used in print_PIC() -- #if-0-ed out by default. The 32-bit version gets it right already. The plan is to enable this code with "apic=debug" eventually. This will aid with debugging strange problems without the need to ask people to apply patches. Signed-off-by: Maciej W. Rozycki Signed-off-by: Ingo Molnar --- arch/x86/kernel/io_apic_64.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c index 6510cde36b3..834b06afef4 100644 --- a/arch/x86/kernel/io_apic_64.c +++ b/arch/x86/kernel/io_apic_64.c @@ -45,6 +45,7 @@ #include #include #include +#include #include #include #include -- cgit v1.2.3 From baa1318841d4bc95d783e6c15219b264720002c8 Mon Sep 17 00:00:00 2001 From: "Maciej W. Rozycki" Date: Mon, 14 Jul 2008 18:44:51 +0100 Subject: x86: APIC: Make apic_verbosity unsigned As a microoptimisation, make apic_verbosity unsigned. This will make apic_printk(APIC_QUIET, ...) expand into just printk(...) with the surrounding condition and a reference to apic_verbosity removed. Signed-off-by: Maciej W. Rozycki Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic_32.c | 2 +- arch/x86/kernel/apic_64.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c index d2a7eb511d6..7f30c0f3dbe 100644 --- a/arch/x86/kernel/apic_32.c +++ b/arch/x86/kernel/apic_32.c @@ -75,7 +75,7 @@ char system_vectors[NR_VECTORS] = { [0 ... NR_VECTORS-1] = SYS_VECTOR_FREE}; /* * Debug level, exported for io_apic.c */ -int apic_verbosity; +unsigned int apic_verbosity; int pic_mode; diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c index ce294d623e5..98c70f044e1 100644 --- a/arch/x86/kernel/apic_64.c +++ b/arch/x86/kernel/apic_64.c @@ -54,7 +54,7 @@ EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok); /* * Debug level, exported for io_apic.c */ -int apic_verbosity; +unsigned int apic_verbosity; /* Have we found an MP table */ int smp_found_config; -- cgit v1.2.3 From 49a66a0bcea8737e82035a31ad0d8cd64d027a5d Mon Sep 17 00:00:00 2001 From: "Maciej W. Rozycki" Date: Mon, 14 Jul 2008 19:08:13 +0100 Subject: x86: I/O APIC: Always report how the timer has been set up Following recent (and less so) issues with the 8254 timer when routed through the I/O or local APIC, always report which configurations have been tried and which one has been set up eventually. This is so that logs posted by people for some other reason can be used as a cross-reference when investigating any possible future problems. The change unifies messages printed on 32-bit and 64-bit platforms and adds trailing newlines (removes leading ones), so that proper log level annotation can be used and any possible interspersed output will not cause a mess. I have chosen to use apic_printk(APIC_QUIET, ...) rather than printk(...) so that the distinction of these messages is maintained making possible future decisions about changes in this area easier. A change posted separately making apic_verbosity unsigned removes any extra code that would otherwise be generated as a result of this design decision. Signed-off-by: Maciej W. Rozycki Signed-off-by: Ingo Molnar --- arch/x86/kernel/io_apic_32.c | 39 ++++++++++++++++++++++----------------- arch/x86/kernel/io_apic_64.c | 40 ++++++++++++++++++++++------------------ 2 files changed, 44 insertions(+), 35 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c index eabaf9244f5..de9aa0e3a9c 100644 --- a/arch/x86/kernel/io_apic_32.c +++ b/arch/x86/kernel/io_apic_32.c @@ -2177,8 +2177,9 @@ static inline void __init check_timer(void) pin2 = ioapic_i8259.pin; apic2 = ioapic_i8259.apic; - printk(KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n", - vector, apic1, pin1, apic2, pin2); + apic_printk(APIC_QUIET, KERN_INFO "..TIMER: vector=0x%02X " + "apic1=%d pin1=%d apic2=%d pin2=%d\n", + vector, apic1, pin1, apic2, pin2); /* * Some BIOS writers are clueless and report the ExtINTA @@ -2216,12 +2217,13 @@ static inline void __init check_timer(void) } clear_IO_APIC_pin(apic1, pin1); if (!no_pin1) - printk(KERN_ERR "..MP-BIOS bug: " - "8254 timer not connected to IO-APIC\n"); + apic_printk(APIC_QUIET, KERN_ERR "..MP-BIOS bug: " + "8254 timer not connected to IO-APIC\n"); - printk(KERN_INFO "...trying to set up timer (IRQ0) " - "through the 8259A ... "); - printk("\n..... (found pin %d) ...", pin2); + apic_printk(APIC_QUIET, KERN_INFO "...trying to set up timer " + "(IRQ0) through the 8259A ...\n"); + apic_printk(APIC_QUIET, KERN_INFO + "..... (found apic %d pin %d) ...\n", apic2, pin2); /* * legacy devices should be connected to IO APIC #0 */ @@ -2230,7 +2232,7 @@ static inline void __init check_timer(void) unmask_IO_APIC_irq(0); enable_8259A_irq(0); if (timer_irq_works()) { - printk("works.\n"); + apic_printk(APIC_QUIET, KERN_INFO "....... works.\n"); timer_through_8259 = 1; if (nmi_watchdog == NMI_IO_APIC) { disable_8259A_irq(0); @@ -2244,30 +2246,33 @@ static inline void __init check_timer(void) */ disable_8259A_irq(0); clear_IO_APIC_pin(apic2, pin2); - printk(" failed.\n"); + apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n"); } if (nmi_watchdog == NMI_IO_APIC) { - printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n"); + apic_printk(APIC_QUIET, KERN_WARNING "timer doesn't work " + "through the IO-APIC - disabling NMI Watchdog!\n"); nmi_watchdog = NMI_NONE; } timer_ack = 0; - printk(KERN_INFO "...trying to set up timer as Virtual Wire IRQ..."); + apic_printk(APIC_QUIET, KERN_INFO + "...trying to set up timer as Virtual Wire IRQ...\n"); lapic_register_intr(0, vector); apic_write(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */ enable_8259A_irq(0); if (timer_irq_works()) { - printk(" works.\n"); + apic_printk(APIC_QUIET, KERN_INFO "..... works.\n"); goto out; } disable_8259A_irq(0); apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | vector); - printk(" failed.\n"); + apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n"); - printk(KERN_INFO "...trying to set up timer as ExtINT IRQ..."); + apic_printk(APIC_QUIET, KERN_INFO + "...trying to set up timer as ExtINT IRQ...\n"); init_8259A(0); make_8259A_irq(0); @@ -2276,12 +2281,12 @@ static inline void __init check_timer(void) unlock_ExtINT_logic(); if (timer_irq_works()) { - printk(" works.\n"); + apic_printk(APIC_QUIET, KERN_INFO "..... works.\n"); goto out; } - printk(" failed :(.\n"); + apic_printk(APIC_QUIET, KERN_INFO "..... failed :(.\n"); panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a " - "report. Then try booting with the 'noapic' option"); + "report. Then try booting with the 'noapic' option.\n"); out: local_irq_restore(flags); } diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c index 834b06afef4..64a46affd85 100644 --- a/arch/x86/kernel/io_apic_64.c +++ b/arch/x86/kernel/io_apic_64.c @@ -1697,8 +1697,9 @@ static inline void __init check_timer(void) pin2 = ioapic_i8259.pin; apic2 = ioapic_i8259.apic; - apic_printk(APIC_VERBOSE,KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n", - cfg->vector, apic1, pin1, apic2, pin2); + apic_printk(APIC_QUIET, KERN_INFO "..TIMER: vector=0x%02X " + "apic1=%d pin1=%d apic2=%d pin2=%d\n", + cfg->vector, apic1, pin1, apic2, pin2); /* * Some BIOS writers are clueless and report the ExtINTA @@ -1736,14 +1737,13 @@ static inline void __init check_timer(void) } clear_IO_APIC_pin(apic1, pin1); if (!no_pin1) - apic_printk(APIC_QUIET,KERN_ERR "..MP-BIOS bug: " + apic_printk(APIC_QUIET, KERN_ERR "..MP-BIOS bug: " "8254 timer not connected to IO-APIC\n"); - apic_printk(APIC_VERBOSE,KERN_INFO - "...trying to set up timer (IRQ0) " - "through the 8259A ... "); - apic_printk(APIC_VERBOSE,"\n..... (found apic %d pin %d) ...", - apic2, pin2); + apic_printk(APIC_QUIET, KERN_INFO "...trying to set up timer " + "(IRQ0) through the 8259A ...\n"); + apic_printk(APIC_QUIET, KERN_INFO + "..... (found apic %d pin %d) ...\n", apic2, pin2); /* * legacy devices should be connected to IO APIC #0 */ @@ -1752,7 +1752,7 @@ static inline void __init check_timer(void) unmask_IO_APIC_irq(0); enable_8259A_irq(0); if (timer_irq_works()) { - apic_printk(APIC_VERBOSE," works.\n"); + apic_printk(APIC_QUIET, KERN_INFO "....... works.\n"); timer_through_8259 = 1; if (nmi_watchdog == NMI_IO_APIC) { disable_8259A_irq(0); @@ -1766,29 +1766,32 @@ static inline void __init check_timer(void) */ disable_8259A_irq(0); clear_IO_APIC_pin(apic2, pin2); - apic_printk(APIC_VERBOSE," failed.\n"); + apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n"); } if (nmi_watchdog == NMI_IO_APIC) { - printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n"); + apic_printk(APIC_QUIET, KERN_WARNING "timer doesn't work " + "through the IO-APIC - disabling NMI Watchdog!\n"); nmi_watchdog = NMI_NONE; } - apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as Virtual Wire IRQ..."); + apic_printk(APIC_QUIET, KERN_INFO + "...trying to set up timer as Virtual Wire IRQ...\n"); lapic_register_intr(0); apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */ enable_8259A_irq(0); if (timer_irq_works()) { - apic_printk(APIC_VERBOSE," works.\n"); + apic_printk(APIC_QUIET, KERN_INFO "..... works.\n"); goto out; } disable_8259A_irq(0); apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector); - apic_printk(APIC_VERBOSE," failed.\n"); + apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n"); - apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as ExtINT IRQ..."); + apic_printk(APIC_QUIET, KERN_INFO + "...trying to set up timer as ExtINT IRQ...\n"); init_8259A(0); make_8259A_irq(0); @@ -1797,11 +1800,12 @@ static inline void __init check_timer(void) unlock_ExtINT_logic(); if (timer_irq_works()) { - apic_printk(APIC_VERBOSE," works.\n"); + apic_printk(APIC_QUIET, KERN_INFO "..... works.\n"); goto out; } - apic_printk(APIC_VERBOSE," failed :(.\n"); - panic("IO-APIC + timer doesn't work! Try using the 'noapic' kernel parameter\n"); + apic_printk(APIC_QUIET, KERN_INFO "..... failed :(.\n"); + panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a " + "report. Then try booting with the 'noapic' option.\n"); out: local_irq_restore(flags); } -- cgit v1.2.3 From 7019cc2dd6fafcdc6b104005482dc910dcdbb797 Mon Sep 17 00:00:00 2001 From: Russ Anderson Date: Wed, 9 Jul 2008 15:27:19 -0500 Subject: x86 BIOS interface for RTC on SGI UV Real-time code needs to know the number of cycles per second on SGI UV. The information is provided via a run time BIOS call. This patch provides the linux side of that interface. This is the first of several run time BIOS calls to be defined in uv/bios.h and bios_uv.c. Note that BIOS_CALL() is just a stub for now. The bios side is being worked on. Signed-off-by: Russ Anderson Cc: Jack Steiner Signed-off-by: Ingo Molnar --- arch/x86/kernel/Makefile | 1 + arch/x86/kernel/bios_uv.c | 48 ++++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/genx2apic_uv_x.c | 23 +++++++++++++++++++ 3 files changed, 72 insertions(+) create mode 100644 arch/x86/kernel/bios_uv.c (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index da140611bb5..b78a17b1281 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -102,6 +102,7 @@ obj-$(CONFIG_OLPC) += olpc.o # 64 bit specific files ifeq ($(CONFIG_X86_64),y) obj-y += genapic_64.o genapic_flat_64.o genx2apic_uv_x.o tlb_uv.o + obj-y += bios_uv.o obj-$(CONFIG_X86_PM_TIMER) += pmtimer_64.o obj-$(CONFIG_AUDIT) += audit_64.o diff --git a/arch/x86/kernel/bios_uv.c b/arch/x86/kernel/bios_uv.c new file mode 100644 index 00000000000..c639bd55391 --- /dev/null +++ b/arch/x86/kernel/bios_uv.c @@ -0,0 +1,48 @@ +/* + * BIOS run time interface routines. + * + * Copyright (c) 2008 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include + +const char * +x86_bios_strerror(long status) +{ + const char *str; + switch (status) { + case 0: str = "Call completed without error"; break; + case -1: str = "Not implemented"; break; + case -2: str = "Invalid argument"; break; + case -3: str = "Call completed with error"; break; + default: str = "Unknown BIOS status code"; break; + } + return str; +} + +long +x86_bios_freq_base(unsigned long which, unsigned long *ticks_per_second, + unsigned long *drift_info) +{ + struct uv_bios_retval isrv; + + BIOS_CALL(isrv, BIOS_FREQ_BASE, which, 0, 0, 0, 0, 0, 0); + *ticks_per_second = isrv.v0; + *drift_info = isrv.v1; + return isrv.status; +} +EXPORT_SYMBOL_GPL(x86_bios_freq_base); diff --git a/arch/x86/kernel/genx2apic_uv_x.c b/arch/x86/kernel/genx2apic_uv_x.c index 711f11c30b0..3c392934069 100644 --- a/arch/x86/kernel/genx2apic_uv_x.c +++ b/arch/x86/kernel/genx2apic_uv_x.c @@ -24,6 +24,7 @@ #include #include #include +#include DEFINE_PER_CPU(struct uv_hub_info_s, __uv_hub_info); EXPORT_PER_CPU_SYMBOL_GPL(__uv_hub_info); @@ -40,6 +41,9 @@ EXPORT_SYMBOL_GPL(uv_cpu_to_blade); short uv_possible_blades; EXPORT_SYMBOL_GPL(uv_possible_blades); +unsigned long sn_rtc_cycles_per_second; +EXPORT_SYMBOL(sn_rtc_cycles_per_second); + /* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */ static cpumask_t uv_target_cpus(void) @@ -272,6 +276,23 @@ static __init void map_mmioh_high(int max_pnode) map_high("MMIOH", mmioh.s.base, shift, map_uc); } +static __init void uv_rtc_init(void) +{ + long status, ticks_per_sec, drift; + + status = + x86_bios_freq_base(BIOS_FREQ_BASE_REALTIME_CLOCK, &ticks_per_sec, + &drift); + if (status != 0 || ticks_per_sec < 100000) { + printk(KERN_WARNING + "unable to determine platform RTC clock frequency, " + "guessing.\n"); + /* BIOS gives wrong value for clock freq. so guess */ + sn_rtc_cycles_per_second = 1000000000000UL / 30000UL; + } else + sn_rtc_cycles_per_second = ticks_per_sec; +} + static __init void uv_system_init(void) { union uvh_si_addr_map_config_u m_n_config; @@ -326,6 +347,8 @@ static __init void uv_system_init(void) gnode_upper = (((unsigned long)node_id.s.node_id) & ~((1 << n_val) - 1)) << m_val; + uv_rtc_init(); + for_each_present_cpu(cpu) { nid = cpu_to_node(cpu); pnode = uv_apicid_to_pnode(per_cpu(x86_cpu_to_apicid, cpu)); -- cgit v1.2.3 From 78cbac65fd77242f3e5d77f4d7a71e8bc869fe4d Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Thu, 10 Jul 2008 21:14:52 +0200 Subject: x86: traps_xx: refactor die() like in x86_64 Make the diff between the traps_32.c and traps_64.c a bit smaller. Change traps_32.c to look more like traps_64.c: - move lock information to file scope - split out oops_begin() and oops_end() from die() - increment nest counter in oops_begin Only whitespace change in traps_64.c No functional changes intended. Signed-off-by: Alexander van Heukelum Acked-by: Cyrill Gorcunov Signed-off-by: Ingo Molnar --- arch/x86/kernel/traps_32.c | 95 +++++++++++++++++++++++++--------------------- arch/x86/kernel/traps_64.c | 2 +- 2 files changed, 52 insertions(+), 45 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c index 8a768973c4f..51cccde376a 100644 --- a/arch/x86/kernel/traps_32.c +++ b/arch/x86/kernel/traps_32.c @@ -383,6 +383,54 @@ int is_valid_bugaddr(unsigned long ip) return ud2 == 0x0b0f; } +static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED; +static int die_owner = -1; +static unsigned int die_nest_count; + +unsigned __kprobes long oops_begin(void) +{ + unsigned long flags; + + oops_enter(); + + if (die_owner != raw_smp_processor_id()) { + console_verbose(); + raw_local_irq_save(flags); + __raw_spin_lock(&die_lock); + die_owner = smp_processor_id(); + die_nest_count = 0; + bust_spinlocks(1); + } else { + raw_local_irq_save(flags); + } + die_nest_count++; + return flags; +} + +void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr) +{ + bust_spinlocks(0); + die_owner = -1; + add_taint(TAINT_DIE); + __raw_spin_unlock(&die_lock); + raw_local_irq_restore(flags); + + if (!regs) + return; + + if (kexec_should_crash(current)) + crash_kexec(regs); + + if (in_interrupt()) + panic("Fatal exception in interrupt"); + + if (panic_on_oops) + panic("Fatal exception"); + + oops_exit(); + do_exit(signr); +} + int __kprobes __die(const char *str, struct pt_regs *regs, long err) { unsigned short ss; @@ -423,31 +471,9 @@ int __kprobes __die(const char *str, struct pt_regs *regs, long err) */ void die(const char *str, struct pt_regs *regs, long err) { - static struct { - raw_spinlock_t lock; - u32 lock_owner; - int lock_owner_depth; - } die = { - .lock = __RAW_SPIN_LOCK_UNLOCKED, - .lock_owner = -1, - .lock_owner_depth = 0 - }; - unsigned long flags; - - oops_enter(); - - if (die.lock_owner != raw_smp_processor_id()) { - console_verbose(); - raw_local_irq_save(flags); - __raw_spin_lock(&die.lock); - die.lock_owner = smp_processor_id(); - die.lock_owner_depth = 0; - bust_spinlocks(1); - } else { - raw_local_irq_save(flags); - } + unsigned long flags = oops_begin(); - if (++die.lock_owner_depth < 3) { + if (die_nest_count < 3) { report_bug(regs->ip, regs); if (__die(str, regs, err)) @@ -456,26 +482,7 @@ void die(const char *str, struct pt_regs *regs, long err) printk(KERN_EMERG "Recursive die() failure, output suppressed\n"); } - bust_spinlocks(0); - die.lock_owner = -1; - add_taint(TAINT_DIE); - __raw_spin_unlock(&die.lock); - raw_local_irq_restore(flags); - - if (!regs) - return; - - if (kexec_should_crash(current)) - crash_kexec(regs); - - if (in_interrupt()) - panic("Fatal exception in interrupt"); - - if (panic_on_oops) - panic("Fatal exception"); - - oops_exit(); - do_exit(SIGSEGV); + oops_end(flags, regs, SIGSEGV); } static inline void diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c index 2696a683778..babdbe673b7 100644 --- a/arch/x86/kernel/traps_64.c +++ b/arch/x86/kernel/traps_64.c @@ -518,7 +518,7 @@ unsigned __kprobes long oops_begin(void) } void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr) -{ +{ die_owner = -1; bust_spinlocks(0); die_nest_count--; -- cgit v1.2.3 From 7dedcee394a3f61475d08002bd12e8068d044216 Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Thu, 10 Jul 2008 21:16:39 +0200 Subject: x86: traps_xx: modify x86_64 to use _log_lvl variants i386 has show_trace_log_lvl and show_stack_log_lvl, allowing traces to be emitted with log-level annotations. This patch introduces them to x86_64, but log_lvl is only ever set to an empty string. Output of traces is unchanged. i386-chunk is whitespace-only. Signed-off-by: Alexander van Heukelum Cc: Cyrill Gorcunov Signed-off-by: Ingo Molnar --- arch/x86/kernel/traps_32.c | 2 +- arch/x86/kernel/traps_64.c | 24 ++++++++++++++++-------- 2 files changed, 17 insertions(+), 9 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c index 51cccde376a..c971dce3847 100644 --- a/arch/x86/kernel/traps_32.c +++ b/arch/x86/kernel/traps_32.c @@ -256,7 +256,7 @@ static const struct stacktrace_ops print_trace_ops = { static void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *stack, unsigned long bp, char *log_lvl) + unsigned long *stack, unsigned long bp, char *log_lvl) { dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl); printk("%s =======================\n", log_lvl); diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c index babdbe673b7..c664e696200 100644 --- a/arch/x86/kernel/traps_64.c +++ b/arch/x86/kernel/traps_64.c @@ -355,17 +355,24 @@ static const struct stacktrace_ops print_trace_ops = { .address = print_trace_address, }; -void show_trace(struct task_struct *task, struct pt_regs *regs, - unsigned long *stack, unsigned long bp) +static void +show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, + unsigned long *stack, unsigned long bp, char *log_lvl) { printk("\nCall Trace:\n"); - dump_trace(task, regs, stack, bp, &print_trace_ops, NULL); + dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl); printk("\n"); } +void show_trace(struct task_struct *task, struct pt_regs *regs, + unsigned long *stack, unsigned long bp) +{ + show_trace_log_lvl(task, regs, stack, bp, ""); +} + static void -_show_stack(struct task_struct *task, struct pt_regs *regs, - unsigned long *sp, unsigned long bp) +show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, + unsigned long *sp, unsigned long bp, char *log_lvl) { unsigned long *stack; int i; @@ -399,12 +406,12 @@ _show_stack(struct task_struct *task, struct pt_regs *regs, printk(" %016lx", *stack++); touch_nmi_watchdog(); } - show_trace(task, regs, sp, bp); + show_trace_log_lvl(task, regs, sp, bp, log_lvl); } void show_stack(struct task_struct *task, unsigned long *sp) { - _show_stack(task, NULL, sp, 0); + show_stack_log_lvl(task, NULL, sp, 0, ""); } /* @@ -454,7 +461,8 @@ void show_registers(struct pt_regs *regs) u8 *ip; printk("Stack: "); - _show_stack(NULL, regs, (unsigned long *)sp, regs->bp); + show_stack_log_lvl(NULL, regs, (unsigned long *)sp, + regs->bp, ""); printk("\n"); printk(KERN_EMERG "Code: "); -- cgit v1.2.3 From 8b2b9c1af065a45ef00c26964420489a53581779 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Tue, 15 Jul 2008 17:09:03 +0900 Subject: x86, intel_cacheinfo: fix use-after-free cache_kobject This avoids calling kobject_uevent() with cache_kobject that has already been deallocated in an error path. Signed-off-by: Akinobu Mita Cc: "H. Peter Anvin" Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/intel_cacheinfo.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 2c8afafa18e..ff517f0b8cc 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -780,15 +780,14 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev) } kobject_put(per_cpu(cache_kobject, cpu)); cpuid4_cache_sysfs_exit(cpu); - break; + return retval; } kobject_uevent(&(this_object->kobj), KOBJ_ADD); } - if (!retval) - cpu_set(cpu, cache_dev_map); + cpu_set(cpu, cache_dev_map); kobject_uevent(per_cpu(cache_kobject, cpu), KOBJ_ADD); - return retval; + return 0; } static void __cpuinit cache_remove_dev(struct sys_device * sys_dev) -- cgit v1.2.3 From b8f8c3cf0a4ac0632ec3f0e15e9dc0c29de917af Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 18 Jul 2008 17:27:28 +0200 Subject: nohz: prevent tick stop outside of the idle loop Jack Ren and Eric Miao tracked down the following long standing problem in the NOHZ code: scheduler switch to idle task enable interrupts Window starts here ----> interrupt happens (does not set NEED_RESCHED) irq_exit() stops the tick ----> interrupt happens (does set NEED_RESCHED) return from schedule() cpu_idle(): preempt_disable(); Window ends here The interrupts can happen at any point inside the race window. The first interrupt stops the tick, the second one causes the scheduler to rerun and switch away from idle again and we end up with the tick disabled. The fact that it needs two interrupts where the first one does not set NEED_RESCHED and the second one does made the bug obscure and extremly hard to reproduce and analyse. Kudos to Jack and Eric. Solution: Limit the NOHZ functionality to the idle loop to make sure that we can not run into such a situation ever again. cpu_idle() { preempt_disable(); while(1) { tick_nohz_stop_sched_tick(1); <- tell NOHZ code that we are in the idle loop while (!need_resched()) halt(); tick_nohz_restart_sched_tick(); <- disables NOHZ mode preempt_enable_no_resched(); schedule(); preempt_disable(); } } In hindsight we should have done this forever, but ... /me grabs a large brown paperbag. Debugged-by: Jack Ren , Debugged-by: eric miao Signed-off-by: Thomas Gleixner --- arch/x86/kernel/process_32.c | 2 +- arch/x86/kernel/process_64.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index f8476dfbb60..1f5fa1cf16d 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -166,7 +166,7 @@ void cpu_idle(void) /* endless idle loop with no priority at all */ while (1) { - tick_nohz_stop_sched_tick(); + tick_nohz_stop_sched_tick(1); while (!need_resched()) { void (*idle)(void); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index e2319f39988..c0a5c2a687e 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -148,7 +148,7 @@ void cpu_idle(void) current_thread_info()->status |= TS_POLLING; /* endless idle loop with no priority at all */ while (1) { - tick_nohz_stop_sched_tick(); + tick_nohz_stop_sched_tick(1); while (!need_resched()) { void (*idle)(void); -- cgit v1.2.3 From 47129654226b5bd418afe533ce4e11d6a0b6d6e4 Mon Sep 17 00:00:00 2001 From: Alexander Beregalov Date: Sun, 6 Jul 2008 20:13:49 +0400 Subject: x86 setup.c: cleanup includes x86: remove double includes in setup.c Signed-off-by: Alexander Beregalov Cc: yhlu.kernel@gmail.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 531b55b8e81..4a2b8acc1d9 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -57,12 +57,8 @@ #include #include #include -#include #include -#include -#include -#include #include #include #include @@ -104,7 +100,6 @@ #include #include -#include #include #include #ifdef CONFIG_X86_64 -- cgit v1.2.3 From 9781f39fd209cd93ab98b669814191acc67f32fd Mon Sep 17 00:00:00 2001 From: Thomas Petazzoni Date: Thu, 10 Jul 2008 17:13:19 +0200 Subject: x86: consolidate the definition of the force_mwait variable The force_mwait variable iss defined either in arch/x86/kernel/cpu/amd.c or in arch/x86/kernel/setup_64.c, but it is only initialized and used in arch/x86/kernel/process.c. This patch moves the declaration to arch/x86/kernel/process.c. Signed-off-by: Thomas Petazzoni Cc: michael@free-electrons.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/amd.c | 2 -- arch/x86/kernel/process.c | 1 + 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 81a07ca65d4..cae9cabc303 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -24,8 +24,6 @@ extern void vide(void); __asm__(".align 4\nvide: ret"); -int force_mwait __cpuinitdata; - static void __cpuinit early_init_amd(struct cpuinfo_x86 *c) { if (cpuid_eax(0x80000000) >= 0x80000007) { diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 4d629c62f4f..74f2d196adb 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -15,6 +15,7 @@ unsigned long idle_nomwait; EXPORT_SYMBOL(idle_nomwait); struct kmem_cache *task_xstate_cachep; +static int force_mwait __cpuinitdata; int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) { -- cgit v1.2.3 From 5ff4789d045cdaec7629e027e4f8ff8e34308b81 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 14 Jul 2008 20:11:18 +0200 Subject: AMD IOMMU: set iommu for device from ACPI code too The device<->iommu relationship has to be set from the information in the ACPI table too. This patch adds this logic to the driver. Signed-off-by: Joerg Roedel Cc: iommu@lists.linux-foundation.org Cc: bhavna.sarathy@amd.com Cc: robert.richter@amd.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/amd_iommu_init.c | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 9bf1b8111b0..7661b02d720 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -426,11 +426,18 @@ static void set_dev_entry_bit(u16 devid, u8 bit) amd_iommu_dev_table[devid].data[i] |= (1 << _bit); } +/* Writes the specific IOMMU for a device into the rlookup table */ +static void __init set_iommu_for_device(struct amd_iommu *iommu, u16 devid) +{ + amd_iommu_rlookup_table[devid] = iommu; +} + /* * This function takes the device specific flags read from the ACPI * table and sets up the device table entry with that information */ -static void __init set_dev_entry_from_acpi(u16 devid, u32 flags, u32 ext_flags) +static void __init set_dev_entry_from_acpi(struct amd_iommu *iommu, + u16 devid, u32 flags, u32 ext_flags) { if (flags & ACPI_DEVFLAG_INITPASS) set_dev_entry_bit(devid, DEV_ENTRY_INIT_PASS); @@ -446,12 +453,8 @@ static void __init set_dev_entry_from_acpi(u16 devid, u32 flags, u32 ext_flags) set_dev_entry_bit(devid, DEV_ENTRY_LINT0_PASS); if (flags & ACPI_DEVFLAG_LINT1) set_dev_entry_bit(devid, DEV_ENTRY_LINT1_PASS); -} -/* Writes the specific IOMMU for a device into the rlookup table */ -static void __init set_iommu_for_device(struct amd_iommu *iommu, u16 devid) -{ - amd_iommu_rlookup_table[devid] = iommu; + set_iommu_for_device(iommu, devid); } /* @@ -550,11 +553,12 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu, case IVHD_DEV_ALL: for (dev_i = iommu->first_device; dev_i <= iommu->last_device; ++dev_i) - set_dev_entry_from_acpi(dev_i, e->flags, 0); + set_dev_entry_from_acpi(iommu, dev_i, + e->flags, 0); break; case IVHD_DEV_SELECT: devid = e->devid; - set_dev_entry_from_acpi(devid, e->flags, 0); + set_dev_entry_from_acpi(iommu, devid, e->flags, 0); break; case IVHD_DEV_SELECT_RANGE_START: devid_start = e->devid; @@ -565,7 +569,7 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu, case IVHD_DEV_ALIAS: devid = e->devid; devid_to = e->ext >> 8; - set_dev_entry_from_acpi(devid, e->flags, 0); + set_dev_entry_from_acpi(iommu, devid, e->flags, 0); amd_iommu_alias_table[devid] = devid_to; break; case IVHD_DEV_ALIAS_RANGE: @@ -577,7 +581,8 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu, break; case IVHD_DEV_EXT_SELECT: devid = e->devid; - set_dev_entry_from_acpi(devid, e->flags, e->ext); + set_dev_entry_from_acpi(iommu, devid, e->flags, + e->ext); break; case IVHD_DEV_EXT_SELECT_RANGE: devid_start = e->devid; @@ -590,7 +595,7 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu, for (dev_i = devid_start; dev_i <= devid; ++dev_i) { if (alias) amd_iommu_alias_table[dev_i] = devid_to; - set_dev_entry_from_acpi( + set_dev_entry_from_acpi(iommu, amd_iommu_alias_table[dev_i], flags, ext_flags); } -- cgit v1.2.3 From 6ac8d51f01d345af5ea4209004a9ea29b2f20891 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Date: Tue, 15 Jul 2008 21:09:13 +0530 Subject: x86: introducing asm-x86/traps.h Declaring x86 traps under one hood. Declaring x86 do_traps before defining them. Signed-off-by: Jaswinder Singh Cc: Andi Kleen Cc: David Woodhouse Cc: Alexander van Heukelum Signed-off-by: Ingo Molnar --- arch/x86/kernel/traps_32.c | 21 +-------------------- arch/x86/kernel/traps_64.c | 22 +--------------------- 2 files changed, 2 insertions(+), 41 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c index c971dce3847..03df8e45e5a 100644 --- a/arch/x86/kernel/traps_32.c +++ b/arch/x86/kernel/traps_32.c @@ -58,6 +58,7 @@ #include #include #include +#include #include "mach_traps.h" @@ -77,26 +78,6 @@ char ignore_fpu_irq; gate_desc idt_table[256] __attribute__((__section__(".data.idt"))) = { { { { 0, 0 } } }, }; -asmlinkage void divide_error(void); -asmlinkage void debug(void); -asmlinkage void nmi(void); -asmlinkage void int3(void); -asmlinkage void overflow(void); -asmlinkage void bounds(void); -asmlinkage void invalid_op(void); -asmlinkage void device_not_available(void); -asmlinkage void coprocessor_segment_overrun(void); -asmlinkage void invalid_TSS(void); -asmlinkage void segment_not_present(void); -asmlinkage void stack_segment(void); -asmlinkage void general_protection(void); -asmlinkage void page_fault(void); -asmlinkage void coprocessor_error(void); -asmlinkage void simd_coprocessor_error(void); -asmlinkage void alignment_check(void); -asmlinkage void spurious_interrupt_bug(void); -asmlinkage void machine_check(void); - int panic_on_unrecovered_nmi; int kstack_depth_to_print = 24; static unsigned int code_bytes = 64; diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c index c664e696200..3f18d73f420 100644 --- a/arch/x86/kernel/traps_64.c +++ b/arch/x86/kernel/traps_64.c @@ -51,30 +51,10 @@ #include #include #include +#include #include -asmlinkage void divide_error(void); -asmlinkage void debug(void); -asmlinkage void nmi(void); -asmlinkage void int3(void); -asmlinkage void overflow(void); -asmlinkage void bounds(void); -asmlinkage void invalid_op(void); -asmlinkage void device_not_available(void); -asmlinkage void double_fault(void); -asmlinkage void coprocessor_segment_overrun(void); -asmlinkage void invalid_TSS(void); -asmlinkage void segment_not_present(void); -asmlinkage void stack_segment(void); -asmlinkage void general_protection(void); -asmlinkage void page_fault(void); -asmlinkage void coprocessor_error(void); -asmlinkage void simd_coprocessor_error(void); -asmlinkage void alignment_check(void); -asmlinkage void spurious_interrupt_bug(void); -asmlinkage void machine_check(void); - int panic_on_unrecovered_nmi; int kstack_depth_to_print = 12; static unsigned int code_bytes = 64; -- cgit v1.2.3 From 1181f8b5f0302580af0958169ef4497c3eb57a61 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Thu, 3 Jul 2008 13:12:13 -0700 Subject: x86_32: remove redundant KERN_INFO This printk has a KERN_ facility level in the format string. Signed-off-by: Hiroshi Shimamoto Signed-off-by: Ingo Molnar --- arch/x86/kernel/signal_32.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c index d9237363096..d633d801f85 100644 --- a/arch/x86/kernel/signal_32.c +++ b/arch/x86/kernel/signal_32.c @@ -212,7 +212,7 @@ asmlinkage unsigned long sys_sigreturn(unsigned long __unused) badframe: if (show_unhandled_signals && printk_ratelimit()) { - printk(KERN_INFO "%s%s[%d] bad frame in sigreturn frame:" + printk("%s%s[%d] bad frame in sigreturn frame:" "%p ip:%lx sp:%lx oeax:%lx", task_pid_nr(current) > 1 ? KERN_INFO : KERN_EMERG, current->comm, task_pid_nr(current), frame, regs->ip, -- cgit v1.2.3 From fa10c51a04a43ced5fd6033f19a74d2c82198b34 Mon Sep 17 00:00:00 2001 From: Alexander Beregalov Date: Wed, 9 Jul 2008 22:28:24 +0400 Subject: arch/x86/kernel/cpu/common_64.c: remove double inclusions x86: remove double inclusions in arch/x86/kernel/cpu/common_64.c Signed-off-by: Alexander Beregalov Cc: yhlu.kernel@gmail.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/common_64.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/common_64.c b/arch/x86/kernel/cpu/common_64.c index 7b8cc72feb4..2a4475beea4 100644 --- a/arch/x86/kernel/cpu/common_64.c +++ b/arch/x86/kernel/cpu/common_64.c @@ -7,12 +7,9 @@ #include #include #include -#include #include #include -#include #include -#include #include #include #include -- cgit v1.2.3 From 65c011845316d3c1381f478ca0d8265c43b3b039 Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Tue, 15 Jul 2008 14:14:30 -0700 Subject: cpumask: Replace cpumask_of_cpu with cpumask_of_cpu_ptr * This patch replaces the dangerous lvalue version of cpumask_of_cpu with new cpumask_of_cpu_ptr macros. These are patterned after the node_to_cpumask_ptr macros. In general terms, if there is a cpumask_of_cpu_map[] then a pointer to the cpumask_of_cpu_map[cpu] entry is used. The cpumask_of_cpu_map is provided when there is a large NR_CPUS count, reducing greatly the amount of code generated and stack space used for cpumask_of_cpu(). The pointer to the cpumask_t value is needed for calling set_cpus_allowed_ptr() to reduce the amount of stack space needed to pass the cpumask_t value. If there isn't a cpumask_of_cpu_map[], then a temporary variable is declared and filled in with value from cpumask_of_cpu(cpu) as well as a pointer variable pointing to this temporary variable. Afterwards, the pointer is used to reference the cpumask value. The compiler will optimize out the extra dereference through the pointer as well as the stack space used for the pointer, resulting in identical code. A good example of the orthogonal usages is in net/sunrpc/svc.c: case SVC_POOL_PERCPU: { unsigned int cpu = m->pool_to[pidx]; cpumask_of_cpu_ptr(cpumask, cpu); *oldmask = current->cpus_allowed; set_cpus_allowed_ptr(current, cpumask); return 1; } case SVC_POOL_PERNODE: { unsigned int node = m->pool_to[pidx]; node_to_cpumask_ptr(nodecpumask, node); *oldmask = current->cpus_allowed; set_cpus_allowed_ptr(current, nodecpumask); return 1; } Signed-off-by: Mike Travis Signed-off-by: Ingo Molnar --- arch/x86/kernel/acpi/cstate.c | 3 ++- arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c | 10 +++++++--- arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 15 ++++++++++----- arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c | 9 ++++++--- arch/x86/kernel/cpu/cpufreq/speedstep-ich.c | 3 ++- arch/x86/kernel/cpu/intel_cacheinfo.c | 3 ++- arch/x86/kernel/microcode.c | 13 +++++++++---- arch/x86/kernel/reboot.c | 14 +++++++++----- 8 files changed, 47 insertions(+), 23 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c index c2502eb9aa8..9220cf46aa1 100644 --- a/arch/x86/kernel/acpi/cstate.c +++ b/arch/x86/kernel/acpi/cstate.c @@ -73,6 +73,7 @@ int acpi_processor_ffh_cstate_probe(unsigned int cpu, struct cpuinfo_x86 *c = &cpu_data(cpu); cpumask_t saved_mask; + cpumask_of_cpu_ptr(new_mask, cpu); int retval; unsigned int eax, ebx, ecx, edx; unsigned int edx_part; @@ -91,7 +92,7 @@ int acpi_processor_ffh_cstate_probe(unsigned int cpu, /* Make sure we are running on right CPU */ saved_mask = current->cpus_allowed; - retval = set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); + retval = set_cpus_allowed_ptr(current, new_mask); if (retval) return -1; diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c index dd097b83583..ff2fff56f0a 100644 --- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c @@ -200,10 +200,12 @@ static void drv_read(struct drv_cmd *cmd) static void drv_write(struct drv_cmd *cmd) { cpumask_t saved_mask = current->cpus_allowed; + cpumask_of_cpu_ptr_declare(cpu_mask); unsigned int i; for_each_cpu_mask_nr(i, cmd->mask) { - set_cpus_allowed_ptr(current, &cpumask_of_cpu(i)); + cpumask_of_cpu_ptr_next(cpu_mask, i); + set_cpus_allowed_ptr(current, cpu_mask); do_drv_write(cmd); } @@ -267,11 +269,12 @@ static unsigned int get_measured_perf(unsigned int cpu) } aperf_cur, mperf_cur; cpumask_t saved_mask; + cpumask_of_cpu_ptr(cpu_mask, cpu); unsigned int perf_percent; unsigned int retval; saved_mask = current->cpus_allowed; - set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); + set_cpus_allowed_ptr(current, cpu_mask); if (get_cpu() != cpu) { /* We were not able to run on requested processor */ put_cpu(); @@ -337,6 +340,7 @@ static unsigned int get_measured_perf(unsigned int cpu) static unsigned int get_cur_freq_on_cpu(unsigned int cpu) { + cpumask_of_cpu_ptr(cpu_mask, cpu); struct acpi_cpufreq_data *data = per_cpu(drv_data, cpu); unsigned int freq; unsigned int cached_freq; @@ -349,7 +353,7 @@ static unsigned int get_cur_freq_on_cpu(unsigned int cpu) } cached_freq = data->freq_table[data->acpi_data->state].frequency; - freq = extract_freq(get_cur_val(&cpumask_of_cpu(cpu)), data); + freq = extract_freq(get_cur_val(cpu_mask), data); if (freq != cached_freq) { /* * The dreaded BIOS frequency change behind our back. diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c index c45ca6d4dce..53c7b693697 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c @@ -479,11 +479,12 @@ static int core_voltage_post_transition(struct powernow_k8_data *data, u32 reqvi static int check_supported_cpu(unsigned int cpu) { cpumask_t oldmask; + cpumask_of_cpu_ptr(cpu_mask, cpu); u32 eax, ebx, ecx, edx; unsigned int rc = 0; oldmask = current->cpus_allowed; - set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); + set_cpus_allowed_ptr(current, cpu_mask); if (smp_processor_id() != cpu) { printk(KERN_ERR PFX "limiting to cpu %u failed\n", cpu); @@ -1016,6 +1017,7 @@ static int transition_frequency_pstate(struct powernow_k8_data *data, unsigned i static int powernowk8_target(struct cpufreq_policy *pol, unsigned targfreq, unsigned relation) { cpumask_t oldmask; + cpumask_of_cpu_ptr(cpu_mask, pol->cpu); struct powernow_k8_data *data = per_cpu(powernow_data, pol->cpu); u32 checkfid; u32 checkvid; @@ -1030,7 +1032,7 @@ static int powernowk8_target(struct cpufreq_policy *pol, unsigned targfreq, unsi /* only run on specific CPU from here on */ oldmask = current->cpus_allowed; - set_cpus_allowed_ptr(current, &cpumask_of_cpu(pol->cpu)); + set_cpus_allowed_ptr(current, cpu_mask); if (smp_processor_id() != pol->cpu) { printk(KERN_ERR PFX "limiting to cpu %u failed\n", pol->cpu); @@ -1105,6 +1107,7 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol) { struct powernow_k8_data *data; cpumask_t oldmask; + cpumask_of_cpu_ptr_declare(newmask); int rc; if (!cpu_online(pol->cpu)) @@ -1156,7 +1159,8 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol) /* only run on specific CPU from here on */ oldmask = current->cpus_allowed; - set_cpus_allowed_ptr(current, &cpumask_of_cpu(pol->cpu)); + cpumask_of_cpu_ptr_next(newmask, pol->cpu); + set_cpus_allowed_ptr(current, newmask); if (smp_processor_id() != pol->cpu) { printk(KERN_ERR PFX "limiting to cpu %u failed\n", pol->cpu); @@ -1178,7 +1182,7 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol) set_cpus_allowed_ptr(current, &oldmask); if (cpu_family == CPU_HW_PSTATE) - pol->cpus = cpumask_of_cpu(pol->cpu); + pol->cpus = *newmask; else pol->cpus = per_cpu(cpu_core_map, pol->cpu); data->available_cores = &(pol->cpus); @@ -1244,6 +1248,7 @@ static unsigned int powernowk8_get (unsigned int cpu) { struct powernow_k8_data *data; cpumask_t oldmask = current->cpus_allowed; + cpumask_of_cpu_ptr(newmask, cpu); unsigned int khz = 0; unsigned int first; @@ -1253,7 +1258,7 @@ static unsigned int powernowk8_get (unsigned int cpu) if (!data) return -EINVAL; - set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); + set_cpus_allowed_ptr(current, newmask); if (smp_processor_id() != cpu) { printk(KERN_ERR PFX "limiting to CPU %d failed in powernowk8_get\n", cpu); diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c index 8b0dd6f2a1a..fd561bb26c6 100644 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c +++ b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c @@ -313,9 +313,10 @@ static unsigned int get_cur_freq(unsigned int cpu) unsigned l, h; unsigned clock_freq; cpumask_t saved_mask; + cpumask_of_cpu_ptr(new_mask, cpu); saved_mask = current->cpus_allowed; - set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); + set_cpus_allowed_ptr(current, new_mask); if (smp_processor_id() != cpu) return 0; @@ -554,9 +555,11 @@ static int centrino_target (struct cpufreq_policy *policy, */ if (!cpus_empty(covered_cpus)) { + cpumask_of_cpu_ptr_declare(new_mask); + for_each_cpu_mask_nr(j, covered_cpus) { - set_cpus_allowed_ptr(current, - &cpumask_of_cpu(j)); + cpumask_of_cpu_ptr_next(new_mask, j); + set_cpus_allowed_ptr(current, new_mask); wrmsr(MSR_IA32_PERF_CTL, oldmsr, h); } } diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c index 191f7263c61..2f3728dc24f 100644 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c +++ b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c @@ -244,7 +244,8 @@ static unsigned int _speedstep_get(const cpumask_t *cpus) static unsigned int speedstep_get(unsigned int cpu) { - return _speedstep_get(&cpumask_of_cpu(cpu)); + cpumask_of_cpu_ptr(newmask, cpu); + return _speedstep_get(newmask); } /** diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index a7b0f8f1736..e4b8d189d7e 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -516,6 +516,7 @@ static int __cpuinit detect_cache_attributes(unsigned int cpu) unsigned long j; int retval; cpumask_t oldmask; + cpumask_of_cpu_ptr(newmask, cpu); if (num_cache_leaves == 0) return -ENOENT; @@ -526,7 +527,7 @@ static int __cpuinit detect_cache_attributes(unsigned int cpu) return -ENOMEM; oldmask = current->cpus_allowed; - retval = set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); + retval = set_cpus_allowed_ptr(current, newmask); if (retval) goto out; diff --git a/arch/x86/kernel/microcode.c b/arch/x86/kernel/microcode.c index 56b933119a0..58520169e35 100644 --- a/arch/x86/kernel/microcode.c +++ b/arch/x86/kernel/microcode.c @@ -388,6 +388,7 @@ static int do_microcode_update (void) void *new_mc = NULL; int cpu; cpumask_t old; + cpumask_of_cpu_ptr_declare(newmask); old = current->cpus_allowed; @@ -404,7 +405,8 @@ static int do_microcode_update (void) if (!uci->valid) continue; - set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); + cpumask_of_cpu_ptr_next(newmask, cpu); + set_cpus_allowed_ptr(current, newmask); error = get_maching_microcode(new_mc, cpu); if (error < 0) goto out; @@ -574,6 +576,7 @@ static int apply_microcode_check_cpu(int cpu) struct cpuinfo_x86 *c = &cpu_data(cpu); struct ucode_cpu_info *uci = ucode_cpu_info + cpu; cpumask_t old; + cpumask_of_cpu_ptr(newmask, cpu); unsigned int val[2]; int err = 0; @@ -582,7 +585,7 @@ static int apply_microcode_check_cpu(int cpu) return 0; old = current->cpus_allowed; - set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); + set_cpus_allowed_ptr(current, newmask); /* Check if the microcode we have in memory matches the CPU */ if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 || @@ -620,11 +623,12 @@ static int apply_microcode_check_cpu(int cpu) static void microcode_init_cpu(int cpu, int resume) { cpumask_t old; + cpumask_of_cpu_ptr(newmask, cpu); struct ucode_cpu_info *uci = ucode_cpu_info + cpu; old = current->cpus_allowed; - set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); + set_cpus_allowed_ptr(current, newmask); mutex_lock(µcode_mutex); collect_cpu_info(cpu); if (uci->valid && system_state == SYSTEM_RUNNING && !resume) @@ -656,11 +660,12 @@ static ssize_t reload_store(struct sys_device *dev, const char *buf, size_t sz) return -EINVAL; if (val == 1) { cpumask_t old; + cpumask_of_cpu_ptr(newmask, cpu); old = current->cpus_allowed; get_online_cpus(); - set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); + set_cpus_allowed_ptr(current, newmask); mutex_lock(µcode_mutex); if (uci->valid) diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index f8a62160e15..214bbdfc851 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -403,24 +403,28 @@ void native_machine_shutdown(void) { /* Stop the cpus and apics */ #ifdef CONFIG_SMP - int reboot_cpu_id; /* The boot cpu is always logical cpu 0 */ - reboot_cpu_id = 0; + int reboot_cpu_id = 0; + cpumask_of_cpu_ptr(newmask, reboot_cpu_id); #ifdef CONFIG_X86_32 /* See if there has been given a command line override */ if ((reboot_cpu != -1) && (reboot_cpu < NR_CPUS) && - cpu_online(reboot_cpu)) + cpu_online(reboot_cpu)) { reboot_cpu_id = reboot_cpu; + cpumask_of_cpu_ptr_next(newmask, reboot_cpu_id); + } #endif /* Make certain the cpu I'm about to reboot on is online */ - if (!cpu_online(reboot_cpu_id)) + if (!cpu_online(reboot_cpu_id)) { reboot_cpu_id = smp_processor_id(); + cpumask_of_cpu_ptr_next(newmask, reboot_cpu_id); + } /* Make certain I only run on the appropriate processor */ - set_cpus_allowed_ptr(current, &cpumask_of_cpu(reboot_cpu_id)); + set_cpus_allowed_ptr(current, newmask); /* O.K Now that I'm on the appropriate processor, * stop all of the others. -- cgit v1.2.3 From cb6d2be60dc3ec9ac788f45d8e24b82a9faacdd9 Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Tue, 15 Jul 2008 14:14:31 -0700 Subject: cpumask: Optimize cpumask_of_cpu in arch/x86/kernel/io_apic_64.c * Optimize various places where a pointer to the cpumask_of_cpu value will result in reducing stack pressure. Signed-off-by: Mike Travis Signed-off-by: Ingo Molnar --- arch/x86/kernel/io_apic_64.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c index bf27114773d..a85db790754 100644 --- a/arch/x86/kernel/io_apic_64.c +++ b/arch/x86/kernel/io_apic_64.c @@ -1372,12 +1372,10 @@ static unsigned int startup_ioapic_irq(unsigned int irq) static int ioapic_retrigger_irq(unsigned int irq) { struct irq_cfg *cfg = &irq_cfg[irq]; - cpumask_t mask; unsigned long flags; spin_lock_irqsave(&vector_lock, flags); - mask = cpumask_of_cpu(first_cpu(cfg->domain)); - send_IPI_mask(mask, cfg->vector); + send_IPI_mask(cpumask_of_cpu(first_cpu(cfg->domain)), cfg->vector); spin_unlock_irqrestore(&vector_lock, flags); return 1; -- cgit v1.2.3 From c42f4f4c6dab3b2b7768c36173ee7c7ecf79eddb Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Tue, 15 Jul 2008 14:14:32 -0700 Subject: cpumask: Optimize cpumask_of_cpu in arch/x86/kernel/ldt.c * Optimize various places where a pointer to the cpumask_of_cpu value will result in reducing stack pressure. Signed-off-by: Mike Travis Signed-off-by: Ingo Molnar --- arch/x86/kernel/ldt.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index a8449571858..3fee2aa50f3 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -62,12 +62,12 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int reload) if (reload) { #ifdef CONFIG_SMP - cpumask_t mask; + cpumask_of_cpu_ptr_declare(mask); preempt_disable(); load_LDT(pc); - mask = cpumask_of_cpu(smp_processor_id()); - if (!cpus_equal(current->mm->cpu_vm_mask, mask)) + cpumask_of_cpu_ptr_next(mask, smp_processor_id()); + if (!cpus_equal(current->mm->cpu_vm_mask, *mask)) smp_call_function(flush_ldt, current->mm, 1); preempt_enable(); #else -- cgit v1.2.3 From eb53fac5cafc4b2f8443ff064938b4494a28c54e Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Tue, 15 Jul 2008 14:14:37 -0700 Subject: cpumask: Use optimized CPUMASK_ALLOC macros in the centrino_target * Use the CPUMASK_ALLOC macros in the centrino_target() function. Signed-off-by: Mike Travis Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c | 73 +++++++++++++++--------- 1 file changed, 45 insertions(+), 28 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c index fd561bb26c6..470c016cb25 100644 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c +++ b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c @@ -442,6 +442,13 @@ static int centrino_verify (struct cpufreq_policy *policy) * * Sets a new CPUFreq policy. */ +struct allmasks { + cpumask_t online_policy_cpus; + cpumask_t saved_mask; + cpumask_t set_mask; + cpumask_t covered_cpus; +}; + static int centrino_target (struct cpufreq_policy *policy, unsigned int target_freq, unsigned int relation) @@ -449,48 +456,55 @@ static int centrino_target (struct cpufreq_policy *policy, unsigned int newstate = 0; unsigned int msr, oldmsr = 0, h = 0, cpu = policy->cpu; struct cpufreq_freqs freqs; - cpumask_t online_policy_cpus; - cpumask_t saved_mask; - cpumask_t set_mask; - cpumask_t covered_cpus; int retval = 0; unsigned int j, k, first_cpu, tmp; - - if (unlikely(centrino_model[cpu] == NULL)) - return -ENODEV; + CPUMASK_ALLOC(allmasks); + CPUMASK_VAR(online_policy_cpus, allmasks); + CPUMASK_VAR(saved_mask, allmasks); + CPUMASK_VAR(set_mask, allmasks); + CPUMASK_VAR(covered_cpus, allmasks); + + if (unlikely(allmasks == NULL)) + return -ENOMEM; + + if (unlikely(centrino_model[cpu] == NULL)) { + retval = -ENODEV; + goto out; + } if (unlikely(cpufreq_frequency_table_target(policy, centrino_model[cpu]->op_points, target_freq, relation, &newstate))) { - return -EINVAL; + retval = -EINVAL; + goto out; } #ifdef CONFIG_HOTPLUG_CPU /* cpufreq holds the hotplug lock, so we are safe from here on */ - cpus_and(online_policy_cpus, cpu_online_map, policy->cpus); + cpus_and(*online_policy_cpus, cpu_online_map, policy->cpus); #else - online_policy_cpus = policy->cpus; + *online_policy_cpus = policy->cpus; #endif - saved_mask = current->cpus_allowed; + *saved_mask = current->cpus_allowed; first_cpu = 1; - cpus_clear(covered_cpus); - for_each_cpu_mask_nr(j, online_policy_cpus) { + cpus_clear(*covered_cpus); + for_each_cpu_mask_nr(j, *online_policy_cpus) { /* * Support for SMP systems. * Make sure we are running on CPU that wants to change freq */ - cpus_clear(set_mask); + cpus_clear(*set_mask); if (policy->shared_type == CPUFREQ_SHARED_TYPE_ANY) - cpus_or(set_mask, set_mask, online_policy_cpus); + cpus_or(*set_mask, *set_mask, *online_policy_cpus); else - cpu_set(j, set_mask); + cpu_set(j, *set_mask); - set_cpus_allowed_ptr(current, &set_mask); + set_cpus_allowed_ptr(current, set_mask); preempt_disable(); - if (unlikely(!cpu_isset(smp_processor_id(), set_mask))) { + if (unlikely(!cpu_isset(smp_processor_id(), *set_mask))) { dprintk("couldn't limit to CPUs in this domain\n"); retval = -EAGAIN; if (first_cpu) { @@ -518,7 +532,7 @@ static int centrino_target (struct cpufreq_policy *policy, dprintk("target=%dkHz old=%d new=%d msr=%04x\n", target_freq, freqs.old, freqs.new, msr); - for_each_cpu_mask_nr(k, online_policy_cpus) { + for_each_cpu_mask_nr(k, *online_policy_cpus) { freqs.cpu = k; cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); @@ -537,11 +551,11 @@ static int centrino_target (struct cpufreq_policy *policy, break; } - cpu_set(j, covered_cpus); + cpu_set(j, *covered_cpus); preempt_enable(); } - for_each_cpu_mask_nr(k, online_policy_cpus) { + for_each_cpu_mask_nr(k, *online_policy_cpus) { freqs.cpu = k; cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); } @@ -554,10 +568,10 @@ static int centrino_target (struct cpufreq_policy *policy, * Best effort undo.. */ - if (!cpus_empty(covered_cpus)) { + if (!cpus_empty(*covered_cpus)) { cpumask_of_cpu_ptr_declare(new_mask); - for_each_cpu_mask_nr(j, covered_cpus) { + for_each_cpu_mask_nr(j, *covered_cpus) { cpumask_of_cpu_ptr_next(new_mask, j); set_cpus_allowed_ptr(current, new_mask); wrmsr(MSR_IA32_PERF_CTL, oldmsr, h); @@ -567,19 +581,22 @@ static int centrino_target (struct cpufreq_policy *policy, tmp = freqs.new; freqs.new = freqs.old; freqs.old = tmp; - for_each_cpu_mask_nr(j, online_policy_cpus) { + for_each_cpu_mask_nr(j, *online_policy_cpus) { freqs.cpu = j; cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); } } - set_cpus_allowed_ptr(current, &saved_mask); - return 0; + set_cpus_allowed_ptr(current, saved_mask); + retval = 0; + goto out; migrate_end: preempt_enable(); - set_cpus_allowed_ptr(current, &saved_mask); - return 0; + set_cpus_allowed_ptr(current, saved_mask); +out: + CPUMASK_FREE(allmasks); + return retval; } static struct freq_attr* centrino_attr[] = { -- cgit v1.2.3 From 08e1a13e7d14ba5d6a22bf4b8c6e11128d3bcdfe Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Fri, 18 Jul 2008 13:44:16 +0100 Subject: x86: reduce forbid_dac's visibility It's not used anywhere outside its declaring file. Signed-off-by: Jan Beulich Signed-off-by: H. Peter Anvin --- arch/x86/kernel/pci-dma.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 8467ec2320f..702714bd151 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -9,8 +9,7 @@ #include #include -int forbid_dac __read_mostly; -EXPORT_SYMBOL(forbid_dac); +static int forbid_dac __read_mostly; const struct dma_mapping_ops *dma_ops; EXPORT_SYMBOL(dma_ops); -- cgit v1.2.3 From 08ad8afaa0f7343e9c64eec5dbbb178e390e03a2 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Fri, 18 Jul 2008 13:45:20 +0100 Subject: x86: reduce force_mwait visibility It's not used anywhere outside its single referencing file. Signed-off-by: Jan Beulich Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/amd.c | 2 -- arch/x86/kernel/process.c | 1 + 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 81a07ca65d4..cae9cabc303 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -24,8 +24,6 @@ extern void vide(void); __asm__(".align 4\nvide: ret"); -int force_mwait __cpuinitdata; - static void __cpuinit early_init_amd(struct cpuinfo_x86 *c) { if (cpuid_eax(0x80000000) >= 0x80000007) { diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 158bd6a16f6..9f94bb1c811 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -199,6 +199,7 @@ static void poll_idle(void) * * idle=mwait overrides this decision and forces the usage of mwait. */ +static int __cpuinitdata force_mwait; #define MWAIT_INFO 0x05 #define MWAIT_ECX_EXTENDED_INFO 0x01 -- cgit v1.2.3 From 2ddf9b7b3e6660199269e34cfa27148440ddc3bf Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Fri, 18 Jul 2008 13:32:23 +0100 Subject: i386/xen: add proper unwind annotations to xen_sysenter_target Signed-off-by: Jan Beulich Cc: Jeremy Fitzhardinge Signed-off-by: H. Peter Anvin --- arch/x86/kernel/entry_32.S | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 6bc07f0f120..ad5264c29e9 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -1024,6 +1024,7 @@ ENDPROC(kernel_thread_helper) ENTRY(xen_sysenter_target) RING0_INT_FRAME addl $5*4, %esp /* remove xen-provided frame */ + CFI_ADJUST_CFA_OFFSET -5*4 jmp sysenter_past_esp CFI_ENDPROC -- cgit v1.2.3 From ae79cdaacb5599781f8bb49f4bdd5723029669cf Mon Sep 17 00:00:00 2001 From: "venkatesh.pallipadi@intel.com" Date: Fri, 18 Jul 2008 16:08:13 -0700 Subject: x86: Add a arch directory for x86 under debugfs Add a directory for x86 arch under debugfs. Can be used to accumulate all x86 specific debugfs files. Signed-off-by: Venkatesh Pallipadi Signed-off-by: H. Peter Anvin --- arch/x86/kernel/kdebugfs.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/kdebugfs.c b/arch/x86/kernel/kdebugfs.c index c0320599171..f2d43bc7551 100644 --- a/arch/x86/kernel/kdebugfs.c +++ b/arch/x86/kernel/kdebugfs.c @@ -12,9 +12,13 @@ #include #include #include +#include #include +struct dentry *arch_debugfs_dir; +EXPORT_SYMBOL(arch_debugfs_dir); + #ifdef CONFIG_DEBUG_BOOT_PARAMS struct setup_data_node { u64 paddr; @@ -209,6 +213,10 @@ static int __init arch_kdebugfs_init(void) { int error = 0; + arch_debugfs_dir = debugfs_create_dir("x86", NULL); + if (!arch_debugfs_dir) + return -ENOMEM; + #ifdef CONFIG_DEBUG_BOOT_PARAMS error = boot_params_kdebugfs_init(); #endif -- cgit v1.2.3 From e5849e71adcbb774ce40f09c1bcb48acca3b6da7 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 18 Jul 2008 17:28:40 -0700 Subject: x86: remove arch_get_ram_range no user now Signed-off-by: Yinghai Lu Signed-off-by: H. Peter Anvin --- arch/x86/kernel/e820.c | 21 --------------------- 1 file changed, 21 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 28c29180b38..df1b32fa88d 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -1367,24 +1367,3 @@ void __init setup_memory_map(void) printk(KERN_INFO "BIOS-provided physical RAM map:\n"); e820_print_map(who); } - -#ifdef CONFIG_X86_64 -int __init arch_get_ram_range(int slot, u64 *addr, u64 *size) -{ - int i; - - if (slot < 0 || slot >= e820.nr_map) - return -1; - for (i = slot; i < e820.nr_map; i++) { - if (e820.map[i].type != E820_RAM) - continue; - break; - } - if (i == e820.nr_map || e820.map[i].addr > (max_pfn << PAGE_SHIFT)) - return -1; - *addr = e820.map[i].addr; - *size = min_t(u64, e820.map[i].size + e820.map[i].addr, - max_pfn << PAGE_SHIFT) - *addr; - return i + 1; -} -#endif -- cgit v1.2.3 From 5f1f2b3d9dbaee82cd532f28da459adcbf611499 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 18 Jul 2008 16:16:23 -0700 Subject: x86: improve debug printout: add target bootmem range in early_res_to_bootmem() Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/e820.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index df1b32fa88d..6c60aeaac15 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -877,7 +877,8 @@ void __init early_res_to_bootmem(u64 start, u64 end) for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) count++; - printk(KERN_INFO "(%d early reservations) ==> bootmem\n", count); + printk(KERN_INFO "(%d early reservations) ==> bootmem [%010llx - %010llx]\n", + count, start, end); for (i = 0; i < count; i++) { struct early_res *r = &early_res[i]; printk(KERN_INFO " #%d [%010llx - %010llx] %16s", i, -- cgit v1.2.3 From 3c9cb6de1e5ad37d1558fdb0d9d2bed5a7bac0d9 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sat, 19 Jul 2008 02:07:25 -0700 Subject: x86: introduce x86_quirks introduce x86_quirks array of boot-time quirk methods. No change in functionality intended. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/e820.c | 9 ++------- arch/x86/kernel/mpparse.c | 17 +++++------------ arch/x86/kernel/setup.c | 4 ++++ arch/x86/kernel/visws_quirks.c | 42 ++++++++++++++++++++---------------------- 4 files changed, 31 insertions(+), 41 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 6c60aeaac15..9af89078f7b 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -1299,11 +1299,6 @@ void __init e820_reserve_resources(void) } } -/* - * Non-standard memory setup can be specified via this quirk: - */ -char * (*arch_memory_setup_quirk)(void); - char *__init default_machine_specific_memory_setup(void) { char *who = "BIOS-e820"; @@ -1344,8 +1339,8 @@ char *__init default_machine_specific_memory_setup(void) char *__init __attribute__((weak)) machine_specific_memory_setup(void) { - if (arch_memory_setup_quirk) { - char *who = arch_memory_setup_quirk(); + if (x86_quirks->arch_memory_setup) { + char *who = x86_quirks->arch_memory_setup(); if (who) return who; diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 3b25e49380c..3cbd2df3abe 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #ifdef CONFIG_X86_32 @@ -725,12 +726,6 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type) static struct intel_mp_floating *mpf_found; -/* - * Machine specific quirk for finding the SMP config before other setup - * activities destroy the table: - */ -int (*mach_get_smp_config_quirk)(unsigned int early); - /* * Scan the memory blocks for an SMP configuration block. */ @@ -738,8 +733,8 @@ static void __init __get_smp_config(unsigned int early) { struct intel_mp_floating *mpf = mpf_found; - if (mach_get_smp_config_quirk) { - if (mach_get_smp_config_quirk(early)) + if (x86_quirks->mach_get_smp_config) { + if (x86_quirks->mach_get_smp_config(early)) return; } if (acpi_lapic && early) @@ -899,14 +894,12 @@ static int __init smp_scan_config(unsigned long base, unsigned long length, return 0; } -int (*mach_find_smp_config_quirk)(unsigned int reserve); - static void __init __find_smp_config(unsigned int reserve) { unsigned int address; - if (mach_find_smp_config_quirk) { - if (mach_find_smp_config_quirk(reserve)) + if (x86_quirks->mach_find_smp_config) { + if (x86_quirks->mach_find_smp_config(reserve)) return; } /* diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 4a2b8acc1d9..bbcc13d0b56 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -574,6 +574,10 @@ static int __init setup_elfcorehdr(char *arg) early_param("elfcorehdr", setup_elfcorehdr); #endif +static struct x86_quirks default_x86_quirks __initdata; + +struct x86_quirks *x86_quirks __initdata = &default_x86_quirks; + /* * Determine if we were loaded by an EFI loader. If so, then we have also been * passed the efi memmap, systab, etc., so we should use these data structures diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c index e94bdb6add1..41e01b145c4 100644 --- a/arch/x86/kernel/visws_quirks.c +++ b/arch/x86/kernel/visws_quirks.c @@ -73,7 +73,7 @@ int is_visws_box(void) return visws_board_type >= 0; } -static int __init visws_time_init_quirk(void) +static int __init visws_time_init(void) { printk(KERN_INFO "Starting Cobalt Timer system clock\n"); @@ -93,7 +93,7 @@ static int __init visws_time_init_quirk(void) return 0; } -static int __init visws_pre_intr_init_quirk(void) +static int __init visws_pre_intr_init(void) { init_VISWS_APIC_irqs(); @@ -114,7 +114,7 @@ EXPORT_SYMBOL(sgivwfb_mem_size); long long mem_size __initdata = 0; -static char * __init visws_memory_setup_quirk(void) +static char * __init visws_memory_setup(void) { long long gfx_mem_size = 8 * MB; @@ -176,7 +176,7 @@ static void visws_machine_power_off(void) outl(PIIX_SPECIAL_STOP, 0xCFC); } -static int __init visws_get_smp_config_quirk(unsigned int early) +static int __init visws_get_smp_config(unsigned int early) { /* * Prevent MP-table parsing by the generic code: @@ -192,7 +192,7 @@ extern unsigned int __cpuinitdata maxcpus; * No problem for Linux. */ -static void __init MP_processor_info (struct mpc_config_processor *m) +static void __init MP_processor_info(struct mpc_config_processor *m) { int ver, logical_apicid; physid_mask_t apic_cpus; @@ -232,7 +232,7 @@ static void __init MP_processor_info (struct mpc_config_processor *m) apic_version[m->mpc_apicid] = ver; } -int __init visws_find_smp_config_quirk(unsigned int reserve) +static int __init visws_find_smp_config(unsigned int reserve) { struct mpc_config_processor *mp = phys_to_virt(CO_CPU_TAB_PHYS); unsigned short ncpus = readw(phys_to_virt(CO_CPU_NUM_PHYS)); @@ -258,7 +258,17 @@ int __init visws_find_smp_config_quirk(unsigned int reserve) return 1; } -extern int visws_trap_init_quirk(void); +static int visws_trap_init(void); + +static struct x86_quirks visws_x86_quirks __initdata = { + .arch_time_init = visws_time_init, + .arch_pre_intr_init = visws_pre_intr_init, + .arch_memory_setup = visws_memory_setup, + .arch_intr_init = NULL, + .arch_trap_init = visws_trap_init, + .mach_get_smp_config = visws_get_smp_config, + .mach_find_smp_config = visws_find_smp_config, +}; void __init visws_early_detect(void) { @@ -272,16 +282,10 @@ void __init visws_early_detect(void) /* * Install special quirks for timer, interrupt and memory setup: - */ - arch_time_init_quirk = visws_time_init_quirk; - arch_pre_intr_init_quirk = visws_pre_intr_init_quirk; - arch_memory_setup_quirk = visws_memory_setup_quirk; - - /* * Fall back to generic behavior for traps: + * Override generic MP-table parsing: */ - arch_intr_init_quirk = NULL; - arch_trap_init_quirk = visws_trap_init_quirk; + x86_quirks = &visws_x86_quirks; /* * Install reboot quirks: @@ -294,12 +298,6 @@ void __init visws_early_detect(void) */ no_broadcast = 0; - /* - * Override generic MP-table parsing: - */ - mach_get_smp_config_quirk = visws_get_smp_config_quirk; - mach_find_smp_config_quirk = visws_find_smp_config_quirk; - #ifdef CONFIG_X86_IO_APIC /* * Turn off IO-APIC detection and initialization: @@ -426,7 +424,7 @@ static __init void cobalt_init(void) co_apic_read(CO_APIC_ID)); } -int __init visws_trap_init_quirk(void) +static int __init visws_trap_init(void) { lithium_init(); cobalt_init(); -- cgit v1.2.3 From 64898a8bad8c94ad7a4bd5cc86b66edfbb081f4a Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sat, 19 Jul 2008 18:01:16 -0700 Subject: x86: extend and use x86_quirks to clean up NUMAQ code add these new x86_quirks methods: int *mpc_record; int (*mpc_apic_id)(struct mpc_config_processor *m); void (*mpc_oem_bus_info)(struct mpc_config_bus *m, char *name); void (*mpc_oem_pci_bus)(struct mpc_config_bus *m); void (*smp_read_mpc_oem)(struct mp_config_oemtable *oemtable, unsigned short oemsize); ... and move NUMAQ related mps table handling to numaq_32.c. also move the call to smp_read_mpc_oem() to smp_read_mpc() directly. Should not change functionality, albeit it would be nice to get it tested on real NUMAQ as well ... Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/mpparse.c | 191 +++++---------------------------------------- arch/x86/kernel/numaq_32.c | 190 +++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 200 insertions(+), 181 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 3cbd2df3abe..6ae005ccaed 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -49,76 +49,6 @@ static int __init mpf_checksum(unsigned char *mp, int len) return sum & 0xFF; } -#ifdef CONFIG_X86_NUMAQ -int found_numaq; -/* - * Have to match translation table entries to main table entries by counter - * hence the mpc_record variable .... can't see a less disgusting way of - * doing this .... - */ -struct mpc_config_translation { - unsigned char mpc_type; - unsigned char trans_len; - unsigned char trans_type; - unsigned char trans_quad; - unsigned char trans_global; - unsigned char trans_local; - unsigned short trans_reserved; -}; - - -static int mpc_record; -static struct mpc_config_translation *translation_table[MAX_MPC_ENTRY] - __cpuinitdata; - -static inline int generate_logical_apicid(int quad, int phys_apicid) -{ - return (quad << 4) + (phys_apicid ? phys_apicid << 1 : 1); -} - - -static inline int mpc_apic_id(struct mpc_config_processor *m, - struct mpc_config_translation *translation_record) -{ - int quad = translation_record->trans_quad; - int logical_apicid = generate_logical_apicid(quad, m->mpc_apicid); - - printk(KERN_DEBUG "Processor #%d %u:%u APIC version %d (quad %d, apic %d)\n", - m->mpc_apicid, - (m->mpc_cpufeature & CPU_FAMILY_MASK) >> 8, - (m->mpc_cpufeature & CPU_MODEL_MASK) >> 4, - m->mpc_apicver, quad, logical_apicid); - return logical_apicid; -} - -int mp_bus_id_to_node[MAX_MP_BUSSES]; - -int mp_bus_id_to_local[MAX_MP_BUSSES]; - -static void mpc_oem_bus_info(struct mpc_config_bus *m, char *name, - struct mpc_config_translation *translation) -{ - int quad = translation->trans_quad; - int local = translation->trans_local; - - mp_bus_id_to_node[m->mpc_busid] = quad; - mp_bus_id_to_local[m->mpc_busid] = local; - printk(KERN_INFO "Bus #%d is %s (node %d)\n", - m->mpc_busid, name, quad); -} - -int quad_local_to_mp_bus_id [NR_CPUS/4][4]; -static void mpc_oem_pci_bus(struct mpc_config_bus *m, - struct mpc_config_translation *translation) -{ - int quad = translation->trans_quad; - int local = translation->trans_local; - - quad_local_to_mp_bus_id[quad][local] = m->mpc_busid; -} - -#endif - static void __cpuinit MP_processor_info(struct mpc_config_processor *m) { int apicid; @@ -128,14 +58,12 @@ static void __cpuinit MP_processor_info(struct mpc_config_processor *m) disabled_cpus++; return; } -#ifdef CONFIG_X86_NUMAQ - if (found_numaq) - apicid = mpc_apic_id(m, translation_table[mpc_record]); + + if (x86_quirks->mpc_apic_id) + apicid = x86_quirks->mpc_apic_id(m); else apicid = m->mpc_apicid; -#else - apicid = m->mpc_apicid; -#endif + if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) { bootup_cpu = " (Bootup-CPU)"; boot_cpu_physical_apicid = m->mpc_apicid; @@ -152,12 +80,10 @@ static void __init MP_bus_info(struct mpc_config_bus *m) memcpy(str, m->mpc_bustype, 6); str[6] = 0; -#ifdef CONFIG_X86_NUMAQ - if (found_numaq) - mpc_oem_bus_info(m, str, translation_table[mpc_record]); -#else - printk(KERN_INFO "Bus #%d is %s\n", m->mpc_busid, str); -#endif + if (x86_quirks->mpc_oem_bus_info) + x86_quirks->mpc_oem_bus_info(m, str); + else + printk(KERN_INFO "Bus #%d is %s\n", m->mpc_busid, str); #if MAX_MP_BUSSES < 256 if (m->mpc_busid >= MAX_MP_BUSSES) { @@ -174,10 +100,9 @@ static void __init MP_bus_info(struct mpc_config_bus *m) mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA; #endif } else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI) - 1) == 0) { -#ifdef CONFIG_X86_NUMAQ - if (found_numaq) - mpc_oem_pci_bus(m, translation_table[mpc_record]); -#endif + if (x86_quirks->mpc_oem_pci_bus) + x86_quirks->mpc_oem_pci_bus(m); + clear_bit(m->mpc_busid, mp_bus_not_pci); #if defined(CONFIG_EISA) || defined (CONFIG_MCA) mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI; @@ -317,83 +242,6 @@ static void __init MP_lintsrc_info(struct mpc_config_lintsrc *m) m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint); } -#ifdef CONFIG_X86_NUMAQ -static void __init MP_translation_info(struct mpc_config_translation *m) -{ - printk(KERN_INFO - "Translation: record %d, type %d, quad %d, global %d, local %d\n", - mpc_record, m->trans_type, m->trans_quad, m->trans_global, - m->trans_local); - - if (mpc_record >= MAX_MPC_ENTRY) - printk(KERN_ERR "MAX_MPC_ENTRY exceeded!\n"); - else - translation_table[mpc_record] = m; /* stash this for later */ - if (m->trans_quad < MAX_NUMNODES && !node_online(m->trans_quad)) - node_set_online(m->trans_quad); -} - -/* - * Read/parse the MPC oem tables - */ - -static void __init smp_read_mpc_oem(struct mp_config_oemtable *oemtable, - unsigned short oemsize) -{ - int count = sizeof(*oemtable); /* the header size */ - unsigned char *oemptr = ((unsigned char *)oemtable) + count; - - mpc_record = 0; - printk(KERN_INFO "Found an OEM MPC table at %8p - parsing it ... \n", - oemtable); - if (memcmp(oemtable->oem_signature, MPC_OEM_SIGNATURE, 4)) { - printk(KERN_WARNING - "SMP mpc oemtable: bad signature [%c%c%c%c]!\n", - oemtable->oem_signature[0], oemtable->oem_signature[1], - oemtable->oem_signature[2], oemtable->oem_signature[3]); - return; - } - if (mpf_checksum((unsigned char *)oemtable, oemtable->oem_length)) { - printk(KERN_WARNING "SMP oem mptable: checksum error!\n"); - return; - } - while (count < oemtable->oem_length) { - switch (*oemptr) { - case MP_TRANSLATION: - { - struct mpc_config_translation *m = - (struct mpc_config_translation *)oemptr; - MP_translation_info(m); - oemptr += sizeof(*m); - count += sizeof(*m); - ++mpc_record; - break; - } - default: - { - printk(KERN_WARNING - "Unrecognised OEM table entry type! - %d\n", - (int)*oemptr); - return; - } - } - } -} - -void numaq_mps_oem_check(struct mp_config_table *mpc, char *oem, - char *productid) -{ - if (strncmp(oem, "IBM NUMA", 8)) - printk("Warning! Not a NUMA-Q system!\n"); - else - found_numaq = 1; - - if (mpc->mpc_oemptr) - smp_read_mpc_oem((struct mp_config_oemtable *)mpc->mpc_oemptr, - mpc->mpc_oemsize); -} -#endif /* CONFIG_X86_NUMAQ */ - /* * Read/parse the MPC */ @@ -458,7 +306,6 @@ static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early) } else mps_oem_check(mpc, oem, str); #endif - /* save the local APIC address, it might be non-default */ if (!acpi_lapic) mp_lapic_addr = mpc->mpc_lapic; @@ -466,12 +313,17 @@ static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early) if (early) return 1; + if (mpc->mpc_oemptr && x86_quirks->smp_read_mpc_oem) { + struct mp_config_oemtable *oem_table = (struct mp_config_oemtable *)(unsigned long)mpc->mpc_oemptr; + x86_quirks->smp_read_mpc_oem(oem_table, mpc->mpc_oemsize); + } + /* * Now process the configuration blocks. */ -#ifdef CONFIG_X86_NUMAQ - mpc_record = 0; -#endif + if (x86_quirks->mpc_record) + *x86_quirks->mpc_record = 0; + while (count < mpc->mpc_length) { switch (*mpt) { case MP_PROCESSOR: @@ -537,9 +389,8 @@ static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early) count = mpc->mpc_length; break; } -#ifdef CONFIG_X86_NUMAQ - ++mpc_record; -#endif + if (x86_quirks->mpc_record) + (*x86_quirks->mpc_record)++; } #ifdef CONFIG_X86_GENERICARCH diff --git a/arch/x86/kernel/numaq_32.c b/arch/x86/kernel/numaq_32.c index a23e8233b9a..7f4e00d1d89 100644 --- a/arch/x86/kernel/numaq_32.c +++ b/arch/x86/kernel/numaq_32.c @@ -33,6 +33,7 @@ #include #include #include +#include #define MB_TO_PAGES(addr) ((addr) << (20 - PAGE_SHIFT)) @@ -71,6 +72,181 @@ static void __init smp_dump_qct(void) } } + +void __init numaq_tsc_disable(void) +{ + if (!found_numaq) + return; + + if (num_online_nodes() > 1) { + printk(KERN_DEBUG "NUMAQ: disabling TSC\n"); + setup_clear_cpu_cap(X86_FEATURE_TSC); + } +} + +int found_numaq; +/* + * Have to match translation table entries to main table entries by counter + * hence the mpc_record variable .... can't see a less disgusting way of + * doing this .... + */ +struct mpc_config_translation { + unsigned char mpc_type; + unsigned char trans_len; + unsigned char trans_type; + unsigned char trans_quad; + unsigned char trans_global; + unsigned char trans_local; + unsigned short trans_reserved; +}; + +/* x86_quirks member */ +static int mpc_record; +static struct mpc_config_translation *translation_table[MAX_MPC_ENTRY] + __cpuinitdata; + +static inline int generate_logical_apicid(int quad, int phys_apicid) +{ + return (quad << 4) + (phys_apicid ? phys_apicid << 1 : 1); +} + +/* x86_quirks member */ +static int mpc_apic_id(struct mpc_config_processor *m) +{ + int quad = translation_table[mpc_record]->trans_quad; + int logical_apicid = generate_logical_apicid(quad, m->mpc_apicid); + + printk(KERN_DEBUG "Processor #%d %u:%u APIC version %d (quad %d, apic %d)\n", + m->mpc_apicid, + (m->mpc_cpufeature & CPU_FAMILY_MASK) >> 8, + (m->mpc_cpufeature & CPU_MODEL_MASK) >> 4, + m->mpc_apicver, quad, logical_apicid); + return logical_apicid; +} + +int mp_bus_id_to_node[MAX_MP_BUSSES]; + +int mp_bus_id_to_local[MAX_MP_BUSSES]; + +/* x86_quirks member */ +static void mpc_oem_bus_info(struct mpc_config_bus *m, char *name) +{ + int quad = translation_table[mpc_record]->trans_quad; + int local = translation_table[mpc_record]->trans_local; + + mp_bus_id_to_node[m->mpc_busid] = quad; + mp_bus_id_to_local[m->mpc_busid] = local; + printk(KERN_INFO "Bus #%d is %s (node %d)\n", + m->mpc_busid, name, quad); +} + +int quad_local_to_mp_bus_id [NR_CPUS/4][4]; + +/* x86_quirks member */ +static void mpc_oem_pci_bus(struct mpc_config_bus *m) +{ + int quad = translation_table[mpc_record]->trans_quad; + int local = translation_table[mpc_record]->trans_local; + + quad_local_to_mp_bus_id[quad][local] = m->mpc_busid; +} + +static void __init MP_translation_info(struct mpc_config_translation *m) +{ + printk(KERN_INFO + "Translation: record %d, type %d, quad %d, global %d, local %d\n", + mpc_record, m->trans_type, m->trans_quad, m->trans_global, + m->trans_local); + + if (mpc_record >= MAX_MPC_ENTRY) + printk(KERN_ERR "MAX_MPC_ENTRY exceeded!\n"); + else + translation_table[mpc_record] = m; /* stash this for later */ + if (m->trans_quad < MAX_NUMNODES && !node_online(m->trans_quad)) + node_set_online(m->trans_quad); +} + +static int __init mpf_checksum(unsigned char *mp, int len) +{ + int sum = 0; + + while (len--) + sum += *mp++; + + return sum & 0xFF; +} + +/* + * Read/parse the MPC oem tables + */ + +static void __init smp_read_mpc_oem(struct mp_config_oemtable *oemtable, + unsigned short oemsize) +{ + int count = sizeof(*oemtable); /* the header size */ + unsigned char *oemptr = ((unsigned char *)oemtable) + count; + + mpc_record = 0; + printk(KERN_INFO "Found an OEM MPC table at %8p - parsing it ... \n", + oemtable); + if (memcmp(oemtable->oem_signature, MPC_OEM_SIGNATURE, 4)) { + printk(KERN_WARNING + "SMP mpc oemtable: bad signature [%c%c%c%c]!\n", + oemtable->oem_signature[0], oemtable->oem_signature[1], + oemtable->oem_signature[2], oemtable->oem_signature[3]); + return; + } + if (mpf_checksum((unsigned char *)oemtable, oemtable->oem_length)) { + printk(KERN_WARNING "SMP oem mptable: checksum error!\n"); + return; + } + while (count < oemtable->oem_length) { + switch (*oemptr) { + case MP_TRANSLATION: + { + struct mpc_config_translation *m = + (struct mpc_config_translation *)oemptr; + MP_translation_info(m); + oemptr += sizeof(*m); + count += sizeof(*m); + ++mpc_record; + break; + } + default: + { + printk(KERN_WARNING + "Unrecognised OEM table entry type! - %d\n", + (int)*oemptr); + return; + } + } + } +} + +static struct x86_quirks numaq_x86_quirks __initdata = { + .arch_time_init = NULL, + .arch_pre_intr_init = NULL, + .arch_memory_setup = NULL, + .arch_intr_init = NULL, + .arch_trap_init = NULL, + .mach_get_smp_config = NULL, + .mach_find_smp_config = NULL, + .mpc_record = &mpc_record, + .mpc_apic_id = mpc_apic_id, + .mpc_oem_bus_info = mpc_oem_bus_info, + .mpc_oem_pci_bus = mpc_oem_pci_bus, + .smp_read_mpc_oem = smp_read_mpc_oem, +}; + +void numaq_mps_oem_check(struct mp_config_table *mpc, char *oem, + char *productid) +{ + if (strncmp(oem, "IBM NUMA", 8)) + printk("Warning! Not a NUMA-Q system!\n"); + else + found_numaq = 1; +} + static __init void early_check_numaq(void) { /* @@ -82,6 +258,9 @@ static __init void early_check_numaq(void) */ if (smp_found_config) early_get_smp_config(); + + if (found_numaq) + x86_quirks = &numaq_x86_quirks; } int __init get_memcfg_numaq(void) @@ -92,14 +271,3 @@ int __init get_memcfg_numaq(void) smp_dump_qct(); return 1; } - -void __init numaq_tsc_disable(void) -{ - if (!found_numaq) - return; - - if (num_online_nodes() > 1) { - printk(KERN_DEBUG "NUMAQ: disabling TSC\n"); - setup_clear_cpu_cap(X86_FEATURE_TSC); - } -} -- cgit v1.2.3 From 63b5d7af2556a7de6bf72c5dd0b85a32fb4c3767 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sat, 19 Jul 2008 18:02:26 -0700 Subject: x86: add ->pre_time_init to x86_quirks so NUMAQ can use that to call numaq_pre_time_init() This allows us to remove a NUMAQ special from arch/x86/kernel/setup.c. (and paves the way to remove the NUMAQ subarch) Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/numaq_32.c | 7 +++++++ arch/x86/kernel/setup.c | 8 -------- arch/x86/kernel/time_32.c | 1 + 3 files changed, 8 insertions(+), 8 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/numaq_32.c b/arch/x86/kernel/numaq_32.c index 7f4e00d1d89..b8c45610b20 100644 --- a/arch/x86/kernel/numaq_32.c +++ b/arch/x86/kernel/numaq_32.c @@ -84,6 +84,12 @@ void __init numaq_tsc_disable(void) } } +static int __init numaq_pre_time_init(void) +{ + numaq_tsc_disable(); + return 0; +} + int found_numaq; /* * Have to match translation table entries to main table entries by counter @@ -224,6 +230,7 @@ static void __init smp_read_mpc_oem(struct mp_config_oemtable *oemtable, } static struct x86_quirks numaq_x86_quirks __initdata = { + .arch_pre_time_init = numaq_pre_time_init, .arch_time_init = NULL, .arch_pre_intr_init = NULL, .arch_memory_setup = NULL, diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index bbcc13d0b56..4064616cfa8 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -853,14 +853,6 @@ void __init setup_arch(char **cmdline_p) init_cpu_to_node(); #endif -#ifdef CONFIG_X86_NUMAQ - /* - * need to check online nodes num, call it - * here before time_init/tsc_init - */ - numaq_tsc_disable(); -#endif - init_apic_mappings(); ioapic_init_mappings(); diff --git a/arch/x86/kernel/time_32.c b/arch/x86/kernel/time_32.c index 059ca6ee59b..ffe3c664afc 100644 --- a/arch/x86/kernel/time_32.c +++ b/arch/x86/kernel/time_32.c @@ -129,6 +129,7 @@ void __init hpet_time_init(void) */ void __init time_init(void) { + pre_time_init_hook(); tsc_init(); late_time_init = choose_time_init(); } -- cgit v1.2.3 From e3a61b0a8c0e342e700a61cd554b01050f333a36 Mon Sep 17 00:00:00 2001 From: Simon Arlott Date: Sat, 19 Jul 2008 23:32:54 +0100 Subject: x86: add unknown_nmi_panic kernel parameter It's not possible to enable the unknown_nmi_panic sysctl option until init is run. It's useful to be able to panic the kernel during boot too, this adds a parameter to enable this option. Signed-off-by: Simon Arlott Signed-off-by: Ingo Molnar --- arch/x86/kernel/nmi.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index ec024b3baad..e0b44b7b717 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -448,6 +448,13 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason) #ifdef CONFIG_SYSCTL +static int __init setup_unknown_nmi_panic(char *str) +{ + unknown_nmi_panic = 1; + return 1; +} +__setup("unknown_nmi_panic", setup_unknown_nmi_panic); + static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu) { unsigned char reason = get_nmi_reason(); -- cgit v1.2.3 From 6bca67f951f80b9e61078f8cdf5fb7b3d9e51aa9 Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Fri, 18 Jul 2008 18:11:27 -0700 Subject: NR_CPUS: Replace NR_CPUS in arch/x86/kernel/cpu/mcheck/mce_64.c * nr_cpu_ids should be used to allocate arrays based on the number of cpu's present. Signed-off-by: Mike Travis Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/mce_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index c4a7ec31394..2fe06ab5c54 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c @@ -580,7 +580,7 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, char __user *buf = ubuf; int i, err; - cpu_tsc = kmalloc(NR_CPUS * sizeof(long), GFP_KERNEL); + cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL); if (!cpu_tsc) return -ENOMEM; -- cgit v1.2.3 From f2ad47ffeb1d292b7c7d1e2f6aedb37646c391db Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Fri, 18 Jul 2008 18:11:28 -0700 Subject: NR_CPUS: Replace NR_CPUS in arch/x86/kernel/cpu/proc.c * Use nr_cpu_ids instead of NR_CPUS to limit traversal of cpu online map. Signed-off-by: Mike Travis Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/proc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c index 0d0d9057e7c..a26c480b949 100644 --- a/arch/x86/kernel/cpu/proc.c +++ b/arch/x86/kernel/cpu/proc.c @@ -160,7 +160,7 @@ static void *c_start(struct seq_file *m, loff_t *pos) { if (*pos == 0) /* just in case, cpu 0 is not the first */ *pos = first_cpu(cpu_online_map); - if ((*pos) < NR_CPUS && cpu_online(*pos)) + if ((*pos) < nr_cpu_ids && cpu_online(*pos)) return &cpu_data(*pos); return NULL; } -- cgit v1.2.3 From 247bc6ca0f691e4617e7bdb70cbaccc939f754ec Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Fri, 18 Jul 2008 18:11:29 -0700 Subject: NR_CPUS: Replace NR_CPUS in arch/x86/kernel/genx2apic_uv_x.c * Replace NR_CPUS loop with for_each_possible_cpu(). * nr_cpu_ids should be used to determine if a percpu area is available for a given cpu. Signed-off-by: Mike Travis Signed-off-by: Ingo Molnar --- arch/x86/kernel/genx2apic_uv_x.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/genx2apic_uv_x.c b/arch/x86/kernel/genx2apic_uv_x.c index 711f11c30b0..3e5d7b8698f 100644 --- a/arch/x86/kernel/genx2apic_uv_x.c +++ b/arch/x86/kernel/genx2apic_uv_x.c @@ -94,7 +94,7 @@ static void uv_send_IPI_mask(cpumask_t mask, int vector) { unsigned int cpu; - for (cpu = 0; cpu < NR_CPUS; ++cpu) + for_each_possible_cpu(cpu) if (cpu_isset(cpu, mask)) uv_send_IPI_one(cpu, vector); } @@ -128,7 +128,7 @@ static unsigned int uv_cpu_mask_to_apicid(cpumask_t cpumask) * May as well be the first. */ cpu = first_cpu(cpumask); - if ((unsigned)cpu < NR_CPUS) + if ((unsigned)cpu < nr_cpu_ids) return per_cpu(x86_cpu_to_apicid, cpu); else return BAD_APICID; -- cgit v1.2.3 From 1bd9d6b64e1474f1a03f8660e8721d746cffae57 Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Fri, 18 Jul 2008 18:11:30 -0700 Subject: NR_CPUS: Replace NR_CPUS in arch/x86/kernel/genapic_flat_64.c * nr_cpu_ids should be used to determine if a percpu area is available for a given cpu. Signed-off-by: Mike Travis Signed-off-by: Ingo Molnar --- arch/x86/kernel/genapic_flat_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/genapic_flat_64.c b/arch/x86/kernel/genapic_flat_64.c index 1a9c68845ee..786548a62d3 100644 --- a/arch/x86/kernel/genapic_flat_64.c +++ b/arch/x86/kernel/genapic_flat_64.c @@ -168,7 +168,7 @@ static unsigned int physflat_cpu_mask_to_apicid(cpumask_t cpumask) * May as well be the first. */ cpu = first_cpu(cpumask); - if ((unsigned)cpu < NR_CPUS) + if ((unsigned)cpu < nr_cpu_ids) return per_cpu(x86_cpu_to_apicid, cpu); else return BAD_APICID; -- cgit v1.2.3 From c4762aba0b1f72659aae9ce37b772ca8bd8f06f4 Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Fri, 18 Jul 2008 18:11:34 -0700 Subject: NR_CPUS: Replace NR_CPUS in speedstep-centrino.c Some cleanups in speedstep-centrino.c for NR_CPUS=4096. * Use new CPUMASK_PTR (instead of old CPUMASK_VAR). * Replace arrays sized by NR_CPUS with percpu variables. * Cleanup some formatting problems (>80 chars per line) and other checkpatch complaints. Signed-off-by: Mike Travis Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c | 85 ++++++++++++++---------- 1 file changed, 51 insertions(+), 34 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c index 470c016cb25..ca2ac13b7af 100644 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c +++ b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c @@ -28,7 +28,8 @@ #define PFX "speedstep-centrino: " #define MAINTAINER "cpufreq@lists.linux.org.uk" -#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "speedstep-centrino", msg) +#define dprintk(msg...) \ + cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "speedstep-centrino", msg) #define INTEL_MSR_RANGE (0xffff) @@ -66,11 +67,12 @@ struct cpu_model struct cpufreq_frequency_table *op_points; /* clock/voltage pairs */ }; -static int centrino_verify_cpu_id(const struct cpuinfo_x86 *c, const struct cpu_id *x); +static int centrino_verify_cpu_id(const struct cpuinfo_x86 *c, + const struct cpu_id *x); /* Operating points for current CPU */ -static struct cpu_model *centrino_model[NR_CPUS]; -static const struct cpu_id *centrino_cpu[NR_CPUS]; +static DEFINE_PER_CPU(struct cpu_model *, centrino_model); +static DEFINE_PER_CPU(const struct cpu_id *, centrino_cpu); static struct cpufreq_driver centrino_driver; @@ -255,7 +257,7 @@ static int centrino_cpu_init_table(struct cpufreq_policy *policy) return -ENOENT; } - centrino_model[policy->cpu] = model; + per_cpu(centrino_model, policy->cpu) = model; dprintk("found \"%s\": max frequency: %dkHz\n", model->model_name, model->max_freq); @@ -264,10 +266,14 @@ static int centrino_cpu_init_table(struct cpufreq_policy *policy) } #else -static inline int centrino_cpu_init_table(struct cpufreq_policy *policy) { return -ENODEV; } +static inline int centrino_cpu_init_table(struct cpufreq_policy *policy) +{ + return -ENODEV; +} #endif /* CONFIG_X86_SPEEDSTEP_CENTRINO_TABLE */ -static int centrino_verify_cpu_id(const struct cpuinfo_x86 *c, const struct cpu_id *x) +static int centrino_verify_cpu_id(const struct cpuinfo_x86 *c, + const struct cpu_id *x) { if ((c->x86 == x->x86) && (c->x86_model == x->x86_model) && @@ -286,23 +292,28 @@ static unsigned extract_clock(unsigned msr, unsigned int cpu, int failsafe) * for centrino, as some DSDTs are buggy. * Ideally, this can be done using the acpi_data structure. */ - if ((centrino_cpu[cpu] == &cpu_ids[CPU_BANIAS]) || - (centrino_cpu[cpu] == &cpu_ids[CPU_DOTHAN_A1]) || - (centrino_cpu[cpu] == &cpu_ids[CPU_DOTHAN_B0])) { + if ((per_cpu(centrino_cpu, cpu) == &cpu_ids[CPU_BANIAS]) || + (per_cpu(centrino_cpu, cpu) == &cpu_ids[CPU_DOTHAN_A1]) || + (per_cpu(centrino_cpu, cpu) == &cpu_ids[CPU_DOTHAN_B0])) { msr = (msr >> 8) & 0xff; return msr * 100000; } - if ((!centrino_model[cpu]) || (!centrino_model[cpu]->op_points)) + if ((!per_cpu(centrino_model, cpu)) || + (!per_cpu(centrino_model, cpu)->op_points)) return 0; msr &= 0xffff; - for (i=0;centrino_model[cpu]->op_points[i].frequency != CPUFREQ_TABLE_END; i++) { - if (msr == centrino_model[cpu]->op_points[i].index) - return centrino_model[cpu]->op_points[i].frequency; + for (i = 0; + per_cpu(centrino_model, cpu)->op_points[i].frequency + != CPUFREQ_TABLE_END; + i++) { + if (msr == per_cpu(centrino_model, cpu)->op_points[i].index) + return per_cpu(centrino_model, cpu)-> + op_points[i].frequency; } if (failsafe) - return centrino_model[cpu]->op_points[i-1].frequency; + return per_cpu(centrino_model, cpu)->op_points[i-1].frequency; else return 0; } @@ -348,7 +359,8 @@ static int centrino_cpu_init(struct cpufreq_policy *policy) int i; /* Only Intel makes Enhanced Speedstep-capable CPUs */ - if (cpu->x86_vendor != X86_VENDOR_INTEL || !cpu_has(cpu, X86_FEATURE_EST)) + if (cpu->x86_vendor != X86_VENDOR_INTEL || + !cpu_has(cpu, X86_FEATURE_EST)) return -ENODEV; if (cpu_has(cpu, X86_FEATURE_CONSTANT_TSC)) @@ -362,9 +374,9 @@ static int centrino_cpu_init(struct cpufreq_policy *policy) break; if (i != N_IDS) - centrino_cpu[policy->cpu] = &cpu_ids[i]; + per_cpu(centrino_cpu, policy->cpu) = &cpu_ids[i]; - if (!centrino_cpu[policy->cpu]) { + if (!per_cpu(centrino_cpu, policy->cpu)) { dprintk("found unsupported CPU with " "Enhanced SpeedStep: send /proc/cpuinfo to " MAINTAINER "\n"); @@ -387,23 +399,26 @@ static int centrino_cpu_init(struct cpufreq_policy *policy) /* check to see if it stuck */ rdmsr(MSR_IA32_MISC_ENABLE, l, h); if (!(l & (1<<16))) { - printk(KERN_INFO PFX "couldn't enable Enhanced SpeedStep\n"); + printk(KERN_INFO PFX + "couldn't enable Enhanced SpeedStep\n"); return -ENODEV; } } freq = get_cur_freq(policy->cpu); - - policy->cpuinfo.transition_latency = 10000; /* 10uS transition latency */ + policy->cpuinfo.transition_latency = 10000; + /* 10uS transition latency */ policy->cur = freq; dprintk("centrino_cpu_init: cur=%dkHz\n", policy->cur); - ret = cpufreq_frequency_table_cpuinfo(policy, centrino_model[policy->cpu]->op_points); + ret = cpufreq_frequency_table_cpuinfo(policy, + per_cpu(centrino_model, policy->cpu)->op_points); if (ret) return (ret); - cpufreq_frequency_table_get_attr(centrino_model[policy->cpu]->op_points, policy->cpu); + cpufreq_frequency_table_get_attr( + per_cpu(centrino_model, policy->cpu)->op_points, policy->cpu); return 0; } @@ -412,12 +427,12 @@ static int centrino_cpu_exit(struct cpufreq_policy *policy) { unsigned int cpu = policy->cpu; - if (!centrino_model[cpu]) + if (!per_cpu(centrino_model, cpu)) return -ENODEV; cpufreq_frequency_table_put_attr(cpu); - centrino_model[cpu] = NULL; + per_cpu(centrino_model, cpu) = NULL; return 0; } @@ -431,14 +446,16 @@ static int centrino_cpu_exit(struct cpufreq_policy *policy) */ static int centrino_verify (struct cpufreq_policy *policy) { - return cpufreq_frequency_table_verify(policy, centrino_model[policy->cpu]->op_points); + return cpufreq_frequency_table_verify(policy, + per_cpu(centrino_model, policy->cpu)->op_points); } /** * centrino_setpolicy - set a new CPUFreq policy * @policy: new policy * @target_freq: the target frequency - * @relation: how that frequency relates to achieved frequency (CPUFREQ_RELATION_L or CPUFREQ_RELATION_H) + * @relation: how that frequency relates to achieved frequency + * (CPUFREQ_RELATION_L or CPUFREQ_RELATION_H) * * Sets a new CPUFreq policy. */ @@ -459,21 +476,21 @@ static int centrino_target (struct cpufreq_policy *policy, int retval = 0; unsigned int j, k, first_cpu, tmp; CPUMASK_ALLOC(allmasks); - CPUMASK_VAR(online_policy_cpus, allmasks); - CPUMASK_VAR(saved_mask, allmasks); - CPUMASK_VAR(set_mask, allmasks); - CPUMASK_VAR(covered_cpus, allmasks); + CPUMASK_PTR(online_policy_cpus, allmasks); + CPUMASK_PTR(saved_mask, allmasks); + CPUMASK_PTR(set_mask, allmasks); + CPUMASK_PTR(covered_cpus, allmasks); if (unlikely(allmasks == NULL)) return -ENOMEM; - if (unlikely(centrino_model[cpu] == NULL)) { + if (unlikely(per_cpu(centrino_model, cpu) == NULL)) { retval = -ENODEV; goto out; } if (unlikely(cpufreq_frequency_table_target(policy, - centrino_model[cpu]->op_points, + per_cpu(centrino_model, cpu)->op_points, target_freq, relation, &newstate))) { @@ -515,7 +532,7 @@ static int centrino_target (struct cpufreq_policy *policy, break; } - msr = centrino_model[cpu]->op_points[newstate].index; + msr = per_cpu(centrino_model, cpu)->op_points[newstate].index; if (first_cpu) { rdmsr(MSR_IA32_PERF_CTL, oldmsr, h); -- cgit v1.2.3 From 31656519e132f6612584815f128c83976a9aaaef Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 18 Jul 2008 18:01:23 +0200 Subject: sched, x86: clean up hrtick implementation random uvesafb failures were reported against Gentoo: http://bugs.gentoo.org/show_bug.cgi?id=222799 and Mihai Moldovan bisected it back to: > 8f4d37ec073c17e2d4aa8851df5837d798606d6f is first bad commit > commit 8f4d37ec073c17e2d4aa8851df5837d798606d6f > Author: Peter Zijlstra > Date: Fri Jan 25 21:08:29 2008 +0100 > > sched: high-res preemption tick Linus suspected it to be hrtick + vm86 interaction and observed: > Btw, Peter, Ingo: I think that commit is doing bad things. They aren't > _incorrect_ per se, but they are definitely bad. > > Why? > > Using random _TIF_WORK_MASK flags is really impolite for doing > "scheduling" work. There's a reason that arch/x86/kernel/entry_32.S > special-cases the _TIF_NEED_RESCHED flag: we don't want to exit out of > vm86 mode unnecessarily. > > See the "work_notifysig_v86" label, and how it does that > "save_v86_state()" thing etc etc. Right, I never liked having to fiddle with those TIF flags. Initially I needed it because the hrtimer base lock could not nest in the rq lock. That however is fixed these days. Currently the only reason left to fiddle with the TIF flags is remote wakeups. We cannot program a remote cpu's hrtimer. I've been thinking about using the new and improved IPI function call stuff to implement hrtimer_start_on(). However that does require that smp_call_function_single(.wait=0) works from interrupt context - /me looks at the latest series from Jens - Yes that does seem to be supported, good. Here's a stab at cleaning this stuff up ... Mihai reported test success as well. Signed-off-by: Peter Zijlstra Tested-by: Mihai Moldovan Cc: Michal Januszewski Cc: Antonino Daplas Signed-off-by: Ingo Molnar --- arch/x86/kernel/signal_32.c | 3 --- arch/x86/kernel/signal_64.c | 3 --- 2 files changed, 6 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c index d9237363096..e1fc7bd57bf 100644 --- a/arch/x86/kernel/signal_32.c +++ b/arch/x86/kernel/signal_32.c @@ -667,8 +667,5 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) if (thread_info_flags & _TIF_SIGPENDING) do_signal(regs); - if (thread_info_flags & _TIF_HRTICK_RESCHED) - hrtick_resched(); - clear_thread_flag(TIF_IRET); } diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c index e53b267662e..88023fcc704 100644 --- a/arch/x86/kernel/signal_64.c +++ b/arch/x86/kernel/signal_64.c @@ -502,9 +502,6 @@ void do_notify_resume(struct pt_regs *regs, void *unused, /* deal with pending signal delivery */ if (thread_info_flags & _TIF_SIGPENDING) do_signal(regs); - - if (thread_info_flags & _TIF_HRTICK_RESCHED) - hrtick_resched(); } void signal_fault(struct pt_regs *regs, void __user *frame, char *where) -- cgit v1.2.3 From 7e37c2998a5a0b00134f6227167694b710f57ac0 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Tue, 1 Jul 2008 01:19:19 +0300 Subject: x86: KVM guest: make kvm_smp_prepare_boot_cpu() static This patch makes the needlessly global kvm_smp_prepare_boot_cpu() static. Signed-off-by: Adrian Bunk Signed-off-by: Avi Kivity --- arch/x86/kernel/kvmclock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 87edf1ceb1d..d02def06ca9 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -113,7 +113,7 @@ static void kvm_setup_secondary_clock(void) #endif #ifdef CONFIG_SMP -void __init kvm_smp_prepare_boot_cpu(void) +static void __init kvm_smp_prepare_boot_cpu(void) { WARN_ON(kvm_register_clock("primary cpu clock")); native_smp_prepare_boot_cpu(); -- cgit v1.2.3 From 9175fc06aee79c349790672178d3fd7507d75c86 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 21 Jul 2008 01:38:14 -0700 Subject: x86: use setup_clear_cpu_cap() when disabling the lapic ... so don't need to call clear_cpu_cap again in early_identify_cpu, and could use cleared_cpu_caps like other places. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic_32.c | 5 +---- arch/x86/kernel/apic_64.c | 2 +- arch/x86/kernel/cpu/common_64.c | 4 ---- 3 files changed, 2 insertions(+), 9 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c index a437d027f20..e9a00e5074b 100644 --- a/arch/x86/kernel/apic_32.c +++ b/arch/x86/kernel/apic_32.c @@ -1214,9 +1214,6 @@ int apic_version[MAX_APICS]; int __init APIC_init_uniprocessor(void) { - if (disable_apic) - clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); - if (!smp_found_config && !cpu_has_apic) return -1; @@ -1700,7 +1697,7 @@ early_param("lapic", parse_lapic); static int __init parse_nolapic(char *arg) { disable_apic = 1; - clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); + setup_clear_cpu_cap(X86_FEATURE_APIC); return 0; } early_param("nolapic", parse_nolapic); diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c index 1e3d32e27c1..16e586cacbd 100644 --- a/arch/x86/kernel/apic_64.c +++ b/arch/x86/kernel/apic_64.c @@ -1337,7 +1337,7 @@ early_param("apic", apic_set_verbosity); static __init int setup_disableapic(char *str) { disable_apic = 1; - clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); + setup_clear_cpu_cap(X86_FEATURE_APIC); return 0; } early_param("disableapic", setup_disableapic); diff --git a/arch/x86/kernel/cpu/common_64.c b/arch/x86/kernel/cpu/common_64.c index 7b8cc72feb4..0485cf64452 100644 --- a/arch/x86/kernel/cpu/common_64.c +++ b/arch/x86/kernel/cpu/common_64.c @@ -324,10 +324,6 @@ static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c) cpu_devs[c->x86_vendor]->c_early_init(c); validate_pat_support(c); - - /* early_param could clear that, but recall get it set again */ - if (disable_apic) - clear_cpu_cap(c, X86_FEATURE_APIC); } /* -- cgit v1.2.3 From 7edf8891ad7aef5f4e97991fed6fb0e605e96ea0 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 21 Jul 2008 01:39:03 -0700 Subject: x86: remove extra calling to get ext cpuid level Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/common_64.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/common_64.c b/arch/x86/kernel/cpu/common_64.c index 0485cf64452..daee611f014 100644 --- a/arch/x86/kernel/cpu/common_64.c +++ b/arch/x86/kernel/cpu/common_64.c @@ -305,7 +305,6 @@ static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c) c->x86_capability[2] = cpuid_edx(0x80860001); } - c->extended_cpuid_level = cpuid_eax(0x80000000); if (c->extended_cpuid_level >= 0x80000007) c->x86_power = cpuid_edx(0x80000007); -- cgit v1.2.3 From cfc1b9a6a683c835a20d5b565ade55baf639f72f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 21 Jul 2008 21:35:38 +0200 Subject: x86: convert Dprintk to pr_debug There are a couple of places where (P)Dprintk is used which is an old compile time enabled printk wrapper. Convert it to the generic pr_debug(). Signed-off-by: Thomas Gleixner --- arch/x86/kernel/acpi/boot.c | 6 ++-- arch/x86/kernel/cpu/perfctr-watchdog.c | 4 +-- arch/x86/kernel/setup_percpu.c | 6 ++-- arch/x86/kernel/smpboot.c | 52 +++++++++++++++++----------------- 4 files changed, 34 insertions(+), 34 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index f489d7a9be9..fa88a1d7129 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -1021,7 +1021,7 @@ void __init mp_config_acpi_legacy_irqs(void) mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA; #endif set_bit(MP_ISA_BUS, mp_bus_not_pci); - Dprintk("Bus #%d is ISA\n", MP_ISA_BUS); + pr_debug("Bus #%d is ISA\n", MP_ISA_BUS); #ifdef CONFIG_X86_ES7000 /* @@ -1127,8 +1127,8 @@ int mp_register_gsi(u32 gsi, int triggering, int polarity) return gsi; } if (test_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed)) { - Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n", - mp_ioapic_routing[ioapic].apic_id, ioapic_pin); + pr_debug(KERN_DEBUG "Pin %d-%d already programmed\n", + mp_ioapic_routing[ioapic].apic_id, ioapic_pin); #ifdef CONFIG_X86_32 return (gsi < IRQ_COMPRESSION_START ? gsi : gsi_to_irq[gsi]); #else diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c index 6d4bdc02388..de7439f82b9 100644 --- a/arch/x86/kernel/cpu/perfctr-watchdog.c +++ b/arch/x86/kernel/cpu/perfctr-watchdog.c @@ -250,7 +250,7 @@ static void write_watchdog_counter(unsigned int perfctr_msr, do_div(count, nmi_hz); if(descr) - Dprintk("setting %s to -0x%08Lx\n", descr, count); + pr_debug("setting %s to -0x%08Lx\n", descr, count); wrmsrl(perfctr_msr, 0 - count); } @@ -261,7 +261,7 @@ static void write_watchdog_counter32(unsigned int perfctr_msr, do_div(count, nmi_hz); if(descr) - Dprintk("setting %s to -0x%08Lx\n", descr, count); + pr_debug("setting %s to -0x%08Lx\n", descr, count); wrmsr(perfctr_msr, (u32)(-count), 0); } diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index cac68430d31..f7745f94c00 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -227,8 +227,8 @@ static void __init setup_node_to_cpumask_map(void) /* allocate the map */ map = alloc_bootmem_low(nr_node_ids * sizeof(cpumask_t)); - Dprintk(KERN_DEBUG "Node to cpumask map at %p for %d nodes\n", - map, nr_node_ids); + pr_debug(KERN_DEBUG "Node to cpumask map at %p for %d nodes\n", + map, nr_node_ids); /* node_to_cpumask() will now work */ node_to_cpumask_map = map; @@ -248,7 +248,7 @@ void __cpuinit numa_set_node(int cpu, int node) per_cpu(x86_cpu_to_node_map, cpu) = node; else - Dprintk(KERN_INFO "Setting node for non-present cpu %d\n", cpu); + pr_debug("Setting node for non-present cpu %d\n", cpu); } void __cpuinit numa_clear_node(int cpu) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 27640196eb7..4b53a647bc0 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -216,7 +216,7 @@ static void __cpuinit smp_callin(void) panic("%s: phys CPU#%d, CPU#%d already present??\n", __func__, phys_id, cpuid); } - Dprintk("CPU#%d (phys ID: %d) waiting for CALLOUT\n", cpuid, phys_id); + pr_debug("CPU#%d (phys ID: %d) waiting for CALLOUT\n", cpuid, phys_id); /* * STARTUP IPIs are fragile beasts as they might sometimes @@ -251,7 +251,7 @@ static void __cpuinit smp_callin(void) * boards) */ - Dprintk("CALLIN, before setup_local_APIC().\n"); + pr_debug("CALLIN, before setup_local_APIC().\n"); smp_callin_clear_local_apic(); setup_local_APIC(); end_local_APIC_setup(); @@ -266,7 +266,7 @@ static void __cpuinit smp_callin(void) local_irq_enable(); calibrate_delay(); local_irq_disable(); - Dprintk("Stack at about %p\n", &cpuid); + pr_debug("Stack at about %p\n", &cpuid); /* * Save our processor parameters @@ -513,7 +513,7 @@ static void impress_friends(void) /* * Allow the user to impress friends. */ - Dprintk("Before bogomips.\n"); + pr_debug("Before bogomips.\n"); for_each_possible_cpu(cpu) if (cpu_isset(cpu, cpu_callout_map)) bogosum += cpu_data(cpu).loops_per_jiffy; @@ -523,7 +523,7 @@ static void impress_friends(void) bogosum/(500000/HZ), (bogosum/(5000/HZ))%100); - Dprintk("Before bogocount - setting activated=1.\n"); + pr_debug("Before bogocount - setting activated=1.\n"); } static inline void __inquire_remote_apic(int apicid) @@ -585,7 +585,7 @@ wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip) /* Kick the second */ apic_write(APIC_ICR, APIC_DM_NMI | APIC_DEST_LOGICAL); - Dprintk("Waiting for send to finish...\n"); + pr_debug("Waiting for send to finish...\n"); send_status = safe_apic_wait_icr_idle(); /* @@ -596,7 +596,7 @@ wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip) if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ apic_write(APIC_ESR, 0); accept_status = (apic_read(APIC_ESR) & 0xEF); - Dprintk("NMI sent.\n"); + pr_debug("NMI sent.\n"); if (send_status) printk(KERN_ERR "APIC never delivered???\n"); @@ -631,7 +631,7 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip) apic_read(APIC_ESR); } - Dprintk("Asserting INIT.\n"); + pr_debug("Asserting INIT.\n"); /* * Turn INIT on target chip @@ -644,12 +644,12 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip) apic_write(APIC_ICR, APIC_INT_LEVELTRIG | APIC_INT_ASSERT | APIC_DM_INIT); - Dprintk("Waiting for send to finish...\n"); + pr_debug("Waiting for send to finish...\n"); send_status = safe_apic_wait_icr_idle(); mdelay(10); - Dprintk("Deasserting INIT.\n"); + pr_debug("Deasserting INIT.\n"); /* Target chip */ apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid)); @@ -657,7 +657,7 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip) /* Send IPI */ apic_write(APIC_ICR, APIC_INT_LEVELTRIG | APIC_DM_INIT); - Dprintk("Waiting for send to finish...\n"); + pr_debug("Waiting for send to finish...\n"); send_status = safe_apic_wait_icr_idle(); mb(); @@ -684,14 +684,14 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip) /* * Run STARTUP IPI loop. */ - Dprintk("#startup loops: %d.\n", num_starts); + pr_debug("#startup loops: %d.\n", num_starts); for (j = 1; j <= num_starts; j++) { - Dprintk("Sending STARTUP #%d.\n", j); + pr_debug("Sending STARTUP #%d.\n", j); if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ apic_write(APIC_ESR, 0); apic_read(APIC_ESR); - Dprintk("After apic_write.\n"); + pr_debug("After apic_write.\n"); /* * STARTUP IPI @@ -709,9 +709,9 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip) */ udelay(300); - Dprintk("Startup point 1.\n"); + pr_debug("Startup point 1.\n"); - Dprintk("Waiting for send to finish...\n"); + pr_debug("Waiting for send to finish...\n"); send_status = safe_apic_wait_icr_idle(); /* @@ -724,7 +724,7 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip) if (send_status || accept_status) break; } - Dprintk("After Startup.\n"); + pr_debug("After Startup.\n"); if (send_status) printk(KERN_ERR "APIC never delivered???\n"); @@ -875,7 +875,7 @@ do_rest: if (get_uv_system_type() != UV_NON_UNIQUE_APIC) { - Dprintk("Setting warm reset code and vector.\n"); + pr_debug("Setting warm reset code and vector.\n"); store_NMI_vector(&nmi_high, &nmi_low); @@ -896,9 +896,9 @@ do_rest: /* * allow APs to start initializing. */ - Dprintk("Before Callout %d.\n", cpu); + pr_debug("Before Callout %d.\n", cpu); cpu_set(cpu, cpu_callout_map); - Dprintk("After Callout %d.\n", cpu); + pr_debug("After Callout %d.\n", cpu); /* * Wait 5s total for a response @@ -911,10 +911,10 @@ do_rest: if (cpu_isset(cpu, cpu_callin_map)) { /* number CPUs logically, starting from 1 (BSP is 0) */ - Dprintk("OK.\n"); + pr_debug("OK.\n"); printk(KERN_INFO "CPU%d: ", cpu); print_cpu_info(&cpu_data(cpu)); - Dprintk("CPU has booted.\n"); + pr_debug("CPU has booted.\n"); } else { boot_error = 1; if (*((volatile unsigned char *)trampoline_base) @@ -959,7 +959,7 @@ int __cpuinit native_cpu_up(unsigned int cpu) WARN_ON(irqs_disabled()); - Dprintk("++++++++++++++++++++=_---CPU UP %u\n", cpu); + pr_debug("++++++++++++++++++++=_---CPU UP %u\n", cpu); if (apicid == BAD_APICID || apicid == boot_cpu_physical_apicid || !physid_isset(apicid, phys_cpu_present_map)) { @@ -971,7 +971,7 @@ int __cpuinit native_cpu_up(unsigned int cpu) * Already booted CPU? */ if (cpu_isset(cpu, cpu_callin_map)) { - Dprintk("do_boot_cpu %d Already started\n", cpu); + pr_debug("do_boot_cpu %d Already started\n", cpu); return -ENOSYS; } @@ -998,7 +998,7 @@ int __cpuinit native_cpu_up(unsigned int cpu) err = do_boot_cpu(apicid, cpu); #endif if (err) { - Dprintk("do_boot_cpu failed %d\n", err); + pr_debug("do_boot_cpu failed %d\n", err); return -EIO; } @@ -1202,7 +1202,7 @@ void __init native_smp_prepare_boot_cpu(void) void __init native_smp_cpus_done(unsigned int max_cpus) { - Dprintk("Boot done.\n"); + pr_debug("Boot done.\n"); impress_friends(); smp_checks(); -- cgit v1.2.3 From 3bfd49c8ab1859ae0f5fa1df2b3781c99115f442 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Wed, 21 May 2008 12:52:33 -0700 Subject: device create: x86: convert device_create to device_create_drvdata device_create() is race-prone, so use the race-free device_create_drvdata() instead as device_create() is going away. Cc: Thomas Gleixner Cc: Ingo Molnar Cc: H. Peter Anvin Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/cpuid.c | 4 ++-- arch/x86/kernel/msr.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c index 2de5fa2bbf7..14b11b3be31 100644 --- a/arch/x86/kernel/cpuid.c +++ b/arch/x86/kernel/cpuid.c @@ -141,8 +141,8 @@ static __cpuinit int cpuid_device_create(int cpu) { struct device *dev; - dev = device_create(cpuid_class, NULL, MKDEV(CPUID_MAJOR, cpu), - "cpu%d", cpu); + dev = device_create_drvdata(cpuid_class, NULL, MKDEV(CPUID_MAJOR, cpu), + NULL, "cpu%d", cpu); return IS_ERR(dev) ? PTR_ERR(dev) : 0; } diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c index a153b3905f6..9fd80955244 100644 --- a/arch/x86/kernel/msr.c +++ b/arch/x86/kernel/msr.c @@ -149,8 +149,8 @@ static int __cpuinit msr_device_create(int cpu) { struct device *dev; - dev = device_create(msr_class, NULL, MKDEV(MSR_MAJOR, cpu), - "msr%d", cpu); + dev = device_create_drvdata(msr_class, NULL, MKDEV(MSR_MAJOR, cpu), + NULL, "msr%d", cpu); return IS_ERR(dev) ? PTR_ERR(dev) : 0; } -- cgit v1.2.3 From fc3a8828b139c24aade3f9d608775e36c248f8f5 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Fri, 2 May 2008 06:02:41 +0200 Subject: driver core: fix a lot of printk usages of bus_id We have the dev_printk() variants for this kind of thing, use them instead of directly trying to access the bus_id field of struct device. This is done in order to remove bus_id entirely. Cc: Kay Sievers Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/pci-dma.c | 6 ++---- arch/x86/kernel/pci-gart_64.c | 4 +--- 2 files changed, 3 insertions(+), 7 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index a4213c00dff..cbecb05551b 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -314,8 +314,7 @@ int dma_supported(struct device *dev, u64 mask) { #ifdef CONFIG_PCI if (mask > 0xffffffff && forbid_dac > 0) { - printk(KERN_INFO "PCI: Disallowing DAC for device %s\n", - dev->bus_id); + dev_info(dev, "PCI: Disallowing DAC for device\n"); return 0; } #endif @@ -342,8 +341,7 @@ int dma_supported(struct device *dev, u64 mask) type. Normally this doesn't make any difference, but gives more gentle handling of IOMMU overflow. */ if (iommu_sac_force && (mask >= DMA_40BIT_MASK)) { - printk(KERN_INFO "%s: Force SAC with mask %Lx\n", - dev->bus_id, mask); + dev_info(dev, "Force SAC with mask %Lx\n", mask); return 0; } diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index be60961f869..df5f142657d 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -198,9 +198,7 @@ static void iommu_full(struct device *dev, size_t size, int dir) * out. Hopefully no network devices use single mappings that big. */ - printk(KERN_ERR - "PCI-DMA: Out of IOMMU space for %lu bytes at device %s\n", - size, dev->bus_id); + dev_err(dev, "PCI-DMA: Out of IOMMU space for %lu bytes\n", size); if (size > PAGE_SIZE*EMERGENCY_PAGES) { if (dir == PCI_DMA_FROMDEVICE || dir == PCI_DMA_BIDIRECTIONAL) -- cgit v1.2.3 From 4a0b2b4dbe1335b8b9886ba3dc85a145d5d938ed Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 1 Jul 2008 18:48:41 +0200 Subject: sysdev: Pass the attribute to the low level sysdev show/store function This allow to dynamically generate attributes and share show/store functions between attributes. Right now most attributes are generated by special macros and lots of duplicated code. With the attribute passed it's instead possible to attach some data to the attribute and then use that in shared low level functions to do different things. I need this for the dynamically generated bank attributes in the x86 machine check code, but it'll allow some further cleanups. I converted all users in tree to the new show/store prototype. It's a single huge patch to avoid unbisectable sections. Runtime tested: x86-32, x86-64 Compiled only: ia64, powerpc Not compile tested/only grep converted: sh, arm, avr32 Signed-off-by: Andi Kleen Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/cpu/mcheck/mce_64.c | 14 ++++++++++---- arch/x86/kernel/cpu/mcheck/therm_throt.c | 1 + arch/x86/kernel/microcode.c | 10 +++++++--- 3 files changed, 18 insertions(+), 7 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index c4a7ec31394..e6a4d5f6764 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c @@ -762,10 +762,14 @@ DEFINE_PER_CPU(struct sys_device, device_mce); /* Why are there no generic functions for this? */ #define ACCESSOR(name, var, start) \ - static ssize_t show_ ## name(struct sys_device *s, char *buf) { \ + static ssize_t show_ ## name(struct sys_device *s, \ + struct sysdev_attribute *attr, \ + char *buf) { \ return sprintf(buf, "%lx\n", (unsigned long)var); \ } \ - static ssize_t set_ ## name(struct sys_device *s,const char *buf,size_t siz) { \ + static ssize_t set_ ## name(struct sys_device *s, \ + struct sysdev_attribute *attr, \ + const char *buf, size_t siz) { \ char *end; \ unsigned long new = simple_strtoul(buf, &end, 0); \ if (end == buf) return -EINVAL; \ @@ -786,14 +790,16 @@ ACCESSOR(bank3ctl,bank[3],mce_restart()) ACCESSOR(bank4ctl,bank[4],mce_restart()) ACCESSOR(bank5ctl,bank[5],mce_restart()) -static ssize_t show_trigger(struct sys_device *s, char *buf) +static ssize_t show_trigger(struct sys_device *s, struct sysdev_attribute *attr, + char *buf) { strcpy(buf, trigger); strcat(buf, "\n"); return strlen(trigger) + 1; } -static ssize_t set_trigger(struct sys_device *s,const char *buf,size_t siz) +static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr, + const char *buf,size_t siz) { char *p; int len; diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index 1f4cc48c14c..d5ae2243f0b 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -35,6 +35,7 @@ atomic_t therm_throt_en = ATOMIC_INIT(0); #define define_therm_throt_sysdev_show_func(name) \ static ssize_t therm_throt_sysdev_show_##name(struct sys_device *dev, \ + struct sysdev_attribute *attr, \ char *buf) \ { \ unsigned int cpu = dev->id; \ diff --git a/arch/x86/kernel/microcode.c b/arch/x86/kernel/microcode.c index 56b933119a0..fc4790638b6 100644 --- a/arch/x86/kernel/microcode.c +++ b/arch/x86/kernel/microcode.c @@ -644,7 +644,9 @@ static void microcode_fini_cpu(int cpu) mutex_unlock(µcode_mutex); } -static ssize_t reload_store(struct sys_device *dev, const char *buf, size_t sz) +static ssize_t reload_store(struct sys_device *dev, + struct sysdev_attribute *attr, + const char *buf, size_t sz) { struct ucode_cpu_info *uci = ucode_cpu_info + dev->id; char *end; @@ -674,14 +676,16 @@ static ssize_t reload_store(struct sys_device *dev, const char *buf, size_t sz) return sz; } -static ssize_t version_show(struct sys_device *dev, char *buf) +static ssize_t version_show(struct sys_device *dev, + struct sysdev_attribute *attr, char *buf) { struct ucode_cpu_info *uci = ucode_cpu_info + dev->id; return sprintf(buf, "0x%x\n", uci->rev); } -static ssize_t pf_show(struct sys_device *dev, char *buf) +static ssize_t pf_show(struct sys_device *dev, + struct sysdev_attribute *attr, char *buf) { struct ucode_cpu_info *uci = ucode_cpu_info + dev->id; -- cgit v1.2.3 From d95d62c018209355c0dc998682ff792432aa870c Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 1 Jul 2008 18:48:43 +0200 Subject: sysdev: Convert the x86 mce tolerant sysdev attribute to generic attribute Use the new generic int attribute accessors for the x86 mce tolerant attribute. Simple example to illustrate the new macros. There are much more places all over the tree that could be converted like this. Signed-off-by: Andi Kleen Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/cpu/mcheck/mce_64.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index e6a4d5f6764..9ab65be8242 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c @@ -812,12 +812,12 @@ static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr, } static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger); -ACCESSOR(tolerant,tolerant,) +static SYSDEV_INT_ATTR(tolerant, 0644, tolerant); ACCESSOR(check_interval,check_interval,mce_restart()) static struct sysdev_attribute *mce_attributes[] = { &attr_bank0ctl, &attr_bank1ctl, &attr_bank2ctl, &attr_bank3ctl, &attr_bank4ctl, &attr_bank5ctl, - &attr_tolerant, &attr_check_interval, &attr_trigger, + &attr_tolerant.attr, &attr_check_interval, &attr_trigger, NULL }; -- cgit v1.2.3 From 988781dc3e1d9209192b04458d279815923f5e76 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 21 Jul 2008 11:21:43 -0700 Subject: x86: use setup_clear_cpu_cap with disable_apic, fix beauty fix: /proc/cpuinfo will still show apic feature even if we booted up with it disabled. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index ec952aa5394..b4aacb9f52e 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -680,7 +680,7 @@ void __init setup_arch(char **cmdline_p) #ifdef CONFIG_X86_LOCAL_APIC disable_apic = 1; #endif - clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); + setup_clear_cpu_cap(X86_FEATURE_APIC); } #ifdef CONFIG_PCI -- cgit v1.2.3 From c2e3277f875b83e5adc34e96989d6d87ec5f80f7 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 22 Jul 2008 15:40:46 +1000 Subject: x86: fix pte_flags() to only return flags, fix lguest (updated) (Jeremy said: rusty: use PTE_MASK rusty: use PTE_MASK rusty: use PTE_MASK When I asked: jsgf: does that include the NX flag? He responded eloquently: rusty: use PTE_MASK rusty: use PTE_MASK yes, it's the official constant of masking flags out of ptes ) Change a15af1c9ea2750a9ff01e51615c45950bad8221b 'x86/paravirt: add pte_flags to just get pte flags' removed lguest's private pte_flags() in favor of a generic one. Unfortunately, the generic one doesn't filter out the non-flags bits: this results in lguest creating corrupt shadow page tables and blowing up host memory. Since noone is supposed to use the pfn part of pte_flags(), it seems safest to always do the filtering. Signed-off-by: Rusty Russell Acked-by: Jeremy Fitzhardinge Signed-off-and-morning-tea-spilled-by: Ingo Molnar --- arch/x86/kernel/paravirt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 097d8a6797f..94da4d52d79 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -443,7 +443,7 @@ struct pv_mmu_ops pv_mmu_ops = { #endif /* PAGETABLE_LEVELS >= 3 */ .pte_val = native_pte_val, - .pte_flags = native_pte_val, + .pte_flags = native_pte_flags, .pgd_val = native_pgd_val, .make_pte = native_make_pte, -- cgit v1.2.3 From 15e8f348db372dec21229fda5d52ae6ee7e64666 Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Mon, 23 Jun 2008 20:41:12 -0700 Subject: x86_64: remove bogus optimization in sysret_signal This short-circuit path in sysret_signal looks wrong to me. AFAICT, in practice the branch is never taken--and if it were, it would go wrong. To wit, try loading a module whose init function does set_thread_flag(TIF_IRET), and see insmod crash (presumably with a wrong user stack pointer). This is because the FIXUP_TOP_OF_STACK work hasn't been done yet when we jump around the call to ptregscall_common and get to int_with_check--where it expects the user RSP,SS,CS and EFLAGS to have been stored by FIXUP_TOP_OF_STACK. I don't think it's normally possible to get to sysret_signal with no _TIF_DO_NOTIFY_MASK bits set anyway, so these two instructions are already superfluous. If it ever did happen, it is harmless to call do_notify_resume with nothing for it to do. Signed-off-by: Roland McGrath --- arch/x86/kernel/entry_64.S | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 8410e26f418..a169225869c 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -402,16 +402,12 @@ sysret_careful: sysret_signal: TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_NONE) - testl $_TIF_DO_NOTIFY_MASK,%edx - jz 1f - - /* Really a signal */ /* edx: work flags (arg3) */ leaq do_notify_resume(%rip),%rax leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1 xorl %esi,%esi # oldset -> arg2 call ptregscall_common -1: movl $_TIF_WORK_MASK,%edi + movl $_TIF_WORK_MASK,%edi /* Use IRET because user could have changed frame. This works because ptregscall_common has called FIXUP_TOP_OF_STACK. */ DISABLE_INTERRUPTS(CLBR_NONE) -- cgit v1.2.3 From 86a1c34a929f30fde8ad01ea8245df61ddcf58b7 Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Mon, 23 Jun 2008 15:37:04 -0700 Subject: x86_64 syscall audit fast-path This adds a fast path for 64-bit syscall entry and exit when TIF_SYSCALL_AUDIT is set, but no other kind of syscall tracing. This path does not need to save and restore all registers as the general case of tracing does. Avoiding the iret return path when syscall audit is enabled helps performance a lot. Signed-off-by: Roland McGrath --- arch/x86/kernel/entry_64.S | 48 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index a169225869c..db7d34a89d2 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -53,6 +53,12 @@ #include #include +/* Avoid __ASSEMBLER__'ifying just for this. */ +#include +#define AUDIT_ARCH_X86_64 (EM_X86_64|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE) +#define __AUDIT_ARCH_64BIT 0x80000000 +#define __AUDIT_ARCH_LE 0x40000000 + .code64 #ifdef CONFIG_FTRACE @@ -351,6 +357,7 @@ ENTRY(system_call_after_swapgs) GET_THREAD_INFO(%rcx) testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%rcx) jnz tracesys +system_call_fastpath: cmpq $__NR_syscall_max,%rax ja badsys movq %r10,%rcx @@ -402,6 +409,10 @@ sysret_careful: sysret_signal: TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_NONE) +#ifdef CONFIG_AUDITSYSCALL + bt $TIF_SYSCALL_AUDIT,%edx + jc sysret_audit +#endif /* edx: work flags (arg3) */ leaq do_notify_resume(%rip),%rax leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1 @@ -418,8 +429,45 @@ badsys: movq $-ENOSYS,RAX-ARGOFFSET(%rsp) jmp ret_from_sys_call +#ifdef CONFIG_AUDITSYSCALL + /* + * Fast path for syscall audit without full syscall trace. + * We just call audit_syscall_entry() directly, and then + * jump back to the normal fast path. + */ +auditsys: + movq %r10,%r9 /* 6th arg: 4th syscall arg */ + movq %rdx,%r8 /* 5th arg: 3rd syscall arg */ + movq %rsi,%rcx /* 4th arg: 2nd syscall arg */ + movq %rdi,%rdx /* 3rd arg: 1st syscall arg */ + movq %rax,%rsi /* 2nd arg: syscall number */ + movl $AUDIT_ARCH_X86_64,%edi /* 1st arg: audit arch */ + call audit_syscall_entry + LOAD_ARGS 0 /* reload call-clobbered registers */ + jmp system_call_fastpath + + /* + * Return fast path for syscall audit. Call audit_syscall_exit() + * directly and then jump back to the fast path with TIF_SYSCALL_AUDIT + * masked off. + */ +sysret_audit: + movq %rax,%rsi /* second arg, syscall return value */ + cmpq $0,%rax /* is it < 0? */ + setl %al /* 1 if so, 0 if not */ + movzbl %al,%edi /* zero-extend that into %edi */ + inc %edi /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */ + call audit_syscall_exit + movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi + jmp sysret_check +#endif /* CONFIG_AUDITSYSCALL */ + /* Do syscall tracing */ tracesys: +#ifdef CONFIG_AUDITSYSCALL + testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%rcx) + jz auditsys +#endif SAVE_REST movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */ FIXUP_TOP_OF_STACK %rdi -- cgit v1.2.3 From 5cbf1565f29eb57a86a305b08836613508e294d7 Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Tue, 24 Jun 2008 01:13:31 -0700 Subject: x86_64 ia32 syscall audit fast-path This adds fast paths for 32-bit syscall entry and exit when TIF_SYSCALL_AUDIT is set, but no other kind of syscall tracing. These paths does not need to save and restore all registers as the general case of tracing does. Avoiding the iret return path when syscall audit is enabled helps performance a lot. Signed-off-by: Roland McGrath --- arch/x86/kernel/entry_64.S | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index db7d34a89d2..89434d43960 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -492,6 +492,7 @@ tracesys: * Has correct top of stack, but partial stack frame. */ .globl int_ret_from_sys_call + .globl int_with_check int_ret_from_sys_call: DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF -- cgit v1.2.3 From af0575bba0f46dd9054d46e0a88c57afad3bf4d2 Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Tue, 24 Jun 2008 04:16:52 -0700 Subject: i386 syscall audit fast-path This adds fast paths for 32-bit syscall entry and exit when TIF_SYSCALL_AUDIT is set, but no other kind of syscall tracing. These paths does not need to save and restore all registers as the general case of tracing does. Avoiding the iret return path when syscall audit is enabled helps performance a lot. Signed-off-by: Roland McGrath --- arch/x86/kernel/entry_32.S | 55 ++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 53 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index cdfd94cc6b1..109792bc7cf 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -54,6 +54,16 @@ #include #include +/* Avoid __ASSEMBLER__'ifying just for this. */ +#include +#define AUDIT_ARCH_I386 (EM_386|__AUDIT_ARCH_LE) +#define __AUDIT_ARCH_LE 0x40000000 + +#ifndef CONFIG_AUDITSYSCALL +#define sysenter_audit syscall_trace_entry +#define sysexit_audit syscall_exit_work +#endif + /* * We use macros for low-level operations which need to be overridden * for paravirtualization. The following will never clobber any registers: @@ -333,7 +343,8 @@ sysenter_past_esp: /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */ testw $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp) - jnz syscall_trace_entry + jnz sysenter_audit +sysenter_do_call: cmpl $(nr_syscalls), %eax jae syscall_badsys call *sys_call_table(,%eax,4) @@ -343,7 +354,8 @@ sysenter_past_esp: TRACE_IRQS_OFF movl TI_flags(%ebp), %ecx testw $_TIF_ALLWORK_MASK, %cx - jne syscall_exit_work + jne sysexit_audit +sysenter_exit: /* if something modifies registers it must also disable sysexit */ movl PT_EIP(%esp), %edx movl PT_OLDESP(%esp), %ecx @@ -351,6 +363,45 @@ sysenter_past_esp: TRACE_IRQS_ON 1: mov PT_FS(%esp), %fs ENABLE_INTERRUPTS_SYSEXIT + +#ifdef CONFIG_AUDITSYSCALL +sysenter_audit: + testw $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%ebp) + jnz syscall_trace_entry + addl $4,%esp + CFI_ADJUST_CFA_OFFSET -4 + /* %esi already in 8(%esp) 6th arg: 4th syscall arg */ + /* %edx already in 4(%esp) 5th arg: 3rd syscall arg */ + /* %ecx already in 0(%esp) 4th arg: 2nd syscall arg */ + movl %ebx,%ecx /* 3rd arg: 1st syscall arg */ + movl %eax,%edx /* 2nd arg: syscall number */ + movl $AUDIT_ARCH_I386,%eax /* 1st arg: audit arch */ + call audit_syscall_entry + pushl %ebx + CFI_ADJUST_CFA_OFFSET 4 + movl PT_EAX(%esp),%eax /* reload syscall number */ + jmp sysenter_do_call + +sysexit_audit: + testw $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %cx + jne syscall_exit_work + TRACE_IRQS_ON + ENABLE_INTERRUPTS(CLBR_ANY) + movl %eax,%edx /* second arg, syscall return value */ + cmpl $0,%eax /* is it < 0? */ + setl %al /* 1 if so, 0 if not */ + movzbl %al,%eax /* zero-extend that */ + inc %eax /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */ + call audit_syscall_exit + DISABLE_INTERRUPTS(CLBR_ANY) + TRACE_IRQS_OFF + movl TI_flags(%ebp), %ecx + testw $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %cx + jne syscall_exit_work + movl PT_EAX(%esp),%eax /* reload syscall return value */ + jmp sysenter_exit +#endif + CFI_ENDPROC .pushsection .fixup,"ax" 2: movl $0,PT_FS(%esp) -- cgit v1.2.3 From 9e882c9282512cc622752f29ec7c29ce338fc1eb Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 21 Jul 2008 16:49:54 -0700 Subject: x86: call early_cpu_init at the same point Call early_cpu_init() at the same (early) point in setup_arch(). The x86_64 code was calling it relatively late, after when other arch code need to do cpu-related setup which depends on it. Signed-off-by: Jeremy Fitzhardinge Cc: Mark McLoughlin Cc: Eduardo Habkost Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index b4aacb9f52e..b520dae02bf 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -597,11 +597,11 @@ void __init setup_arch(char **cmdline_p) memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data)); visws_early_detect(); pre_setup_arch_hook(); - early_cpu_init(); #else printk(KERN_INFO "Command line: %s\n", boot_command_line); #endif + early_cpu_init(); early_ioremap_init(); ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev); @@ -665,9 +665,6 @@ void __init setup_arch(char **cmdline_p) bss_resource.start = virt_to_phys(&__bss_start); bss_resource.end = virt_to_phys(&__bss_stop)-1; -#ifdef CONFIG_X86_64 - early_cpu_init(); -#endif strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE); *cmdline_p = command_line; -- cgit v1.2.3 From 9d25d4db81833029d30b7b03cc1000cbbe09e192 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 21 Jul 2008 18:41:26 +0100 Subject: x86: BUILD_IRQ say .text to avoid .data.percpu When I edit the x86_64 Makefile to -fno-unit-at-a-time, bootup panics on 0xCCs in IRQ0x3e_interrupt(): IRQ0x20_interrupt etc. have got linked into .data.percpu. Perhaps there are other ways of triggering that: specify ".text" in the BUILD_IRQ() macro for safety. I've been using -fno-unit-at-a-time (to lessen inlining, for easier debugging) for a long time. Signed-off-by: Hugh Dickins Cc: Mike Travis Signed-off-by: Ingo Molnar --- arch/x86/kernel/irqinit_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c index 0373e88de95..9414125f19c 100644 --- a/arch/x86/kernel/irqinit_64.c +++ b/arch/x86/kernel/irqinit_64.c @@ -43,7 +43,7 @@ #define BUILD_IRQ(nr) \ asmlinkage void IRQ_NAME(nr); \ - asm("\n.p2align\n" \ + asm("\n.text\n.p2align\n" \ "IRQ" #nr "_interrupt:\n\t" \ "push $~(" #nr ") ; " \ "jmp common_interrupt"); -- cgit v1.2.3 From 27ac792ca0b0a1e7e65f20342260650516c95864 Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Wed, 23 Jul 2008 21:28:13 -0700 Subject: PAGE_ALIGN(): correctly handle 64-bit values on 32-bit architectures On 32-bit architectures PAGE_ALIGN() truncates 64-bit values to the 32-bit boundary. For example: u64 val = PAGE_ALIGN(size); always returns a value < 4GB even if size is greater than 4GB. The problem resides in PAGE_MASK definition (from include/asm-x86/page.h for example): #define PAGE_SHIFT 12 #define PAGE_SIZE (_AC(1,UL) << PAGE_SHIFT) #define PAGE_MASK (~(PAGE_SIZE-1)) ... #define PAGE_ALIGN(addr) (((addr)+PAGE_SIZE-1)&PAGE_MASK) The "~" is performed on a 32-bit value, so everything in "and" with PAGE_MASK greater than 4GB will be truncated to the 32-bit boundary. Using the ALIGN() macro seems to be the right way, because it uses typeof(addr) for the mask. Also move the PAGE_ALIGN() definitions out of include/asm-*/page.h in include/linux/mm.h. See also lkml discussion: http://lkml.org/lkml/2008/6/11/237 [akpm@linux-foundation.org: fix drivers/media/video/uvc/uvc_queue.c] [akpm@linux-foundation.org: fix v850] [akpm@linux-foundation.org: fix powerpc] [akpm@linux-foundation.org: fix arm] [akpm@linux-foundation.org: fix mips] [akpm@linux-foundation.org: fix drivers/media/video/pvrusb2/pvrusb2-dvb.c] [akpm@linux-foundation.org: fix drivers/mtd/maps/uclinux.c] [akpm@linux-foundation.org: fix powerpc] Signed-off-by: Andrea Righi Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/module_64.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/module_64.c b/arch/x86/kernel/module_64.c index 0e867676b5a..6ba87830d4b 100644 --- a/arch/x86/kernel/module_64.c +++ b/arch/x86/kernel/module_64.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include -- cgit v1.2.3 From d75f65fd247fe85d90a3880d143b1bb22fe13a48 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Wed, 23 Jul 2008 21:28:34 -0700 Subject: remove include/linux/pm_legacy.h Remove the obsolete and no longer used include/linux/pm_legacy.h Reviewed-by: Robert P. J. Day Signed-off-by: Adrian Bunk Cc: Pavel Machek Acked-by: "Rafael J. Wysocki" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/apm_32.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index bf9b441331e..9ee24e6bc4b 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -219,7 +219,6 @@ #include #include #include -#include #include #include #include -- cgit v1.2.3 From bdfe6b7c681669148dae4db27eb24ee5408ba371 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Wed, 23 Jul 2008 21:28:41 -0700 Subject: pm: acpi hibernation: utilize hardware signature ACPI defines a hardware signature. BIOS calculates the signature according to hardware configure and if hardware changes while hibernated, the signature will change. In that case, S4 resume should fail. Still, there may be systems on which this mechanism does not work correctly, so it is better to provide a workaround for them. For this reason, add a new switch to the acpi_sleep= command line argument allowing one to disable hardware signature checking. [shaohua.li@intel.com: build fix] Signed-off-by: Shaohua Li Signed-off-by: Rafael J. Wysocki Cc: Andi Kleen Cc: Len Brown Acked-by: Pavel Machek Cc: Cc: Shaohua Li Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/acpi/sleep.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index a3ddad18aaa..fa2161d5003 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -150,6 +150,10 @@ static int __init acpi_sleep_setup(char *str) acpi_realmode_flags |= 2; if (strncmp(str, "s3_beep", 7) == 0) acpi_realmode_flags |= 4; +#ifdef CONFIG_HIBERNATION + if (strncmp(str, "s4_nohwsig", 10) == 0) + acpi_no_s4_hw_signature(); +#endif if (strncmp(str, "old_ordering", 12) == 0) acpi_old_suspend_ordering(); str = strchr(str, ','); -- cgit v1.2.3 From 9deb27baedb79759c3ab9435a7d8b841842d56e9 Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Wed, 23 Jul 2008 21:29:24 -0700 Subject: flag parameters: signalfd This patch adds the new signalfd4 syscall. It extends the old signalfd syscall by one parameter which is meant to hold a flag value. In this patch the only flag support is SFD_CLOEXEC which causes the close-on-exec flag for the returned file descriptor to be set. A new name SFD_CLOEXEC is introduced which in this implementation must have the same value as O_CLOEXEC. The following test must be adjusted for architectures other than x86 and x86-64 and in case the syscall numbers changed. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #include #include #include #include #include #ifndef __NR_signalfd4 # ifdef __x86_64__ # define __NR_signalfd4 289 # elif defined __i386__ # define __NR_signalfd4 327 # else # error "need __NR_signalfd4" # endif #endif #define SFD_CLOEXEC O_CLOEXEC int main (void) { sigset_t ss; sigemptyset (&ss); sigaddset (&ss, SIGUSR1); int fd = syscall (__NR_signalfd4, -1, &ss, 8, 0); if (fd == -1) { puts ("signalfd4(0) failed"); return 1; } int coe = fcntl (fd, F_GETFD); if (coe == -1) { puts ("fcntl failed"); return 1; } if (coe & FD_CLOEXEC) { puts ("signalfd4(0) set close-on-exec flag"); return 1; } close (fd); fd = syscall (__NR_signalfd4, -1, &ss, 8, SFD_CLOEXEC); if (fd == -1) { puts ("signalfd4(SFD_CLOEXEC) failed"); return 1; } coe = fcntl (fd, F_GETFD); if (coe == -1) { puts ("fcntl failed"); return 1; } if ((coe & FD_CLOEXEC) == 0) { puts ("signalfd4(SFD_CLOEXEC) does not set close-on-exec flag"); return 1; } close (fd); puts ("OK"); return 0; } ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ [akpm@linux-foundation.org: add sys_ni stub] Signed-off-by: Ulrich Drepper Acked-by: Davide Libenzi Cc: Michael Kerrisk Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/syscall_table_32.S | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S index adff5562f5f..c12a36c9fd5 100644 --- a/arch/x86/kernel/syscall_table_32.S +++ b/arch/x86/kernel/syscall_table_32.S @@ -326,3 +326,4 @@ ENTRY(sys_call_table) .long sys_fallocate .long sys_timerfd_settime /* 325 */ .long sys_timerfd_gettime + .long sys_signalfd4 -- cgit v1.2.3 From b087498eb5605673b0f260a7620d91818cd72304 Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Wed, 23 Jul 2008 21:29:25 -0700 Subject: flag parameters: eventfd This patch adds the new eventfd2 syscall. It extends the old eventfd syscall by one parameter which is meant to hold a flag value. In this patch the only flag support is EFD_CLOEXEC which causes the close-on-exec flag for the returned file descriptor to be set. A new name EFD_CLOEXEC is introduced which in this implementation must have the same value as O_CLOEXEC. The following test must be adjusted for architectures other than x86 and x86-64 and in case the syscall numbers changed. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #include #include #include #include #ifndef __NR_eventfd2 # ifdef __x86_64__ # define __NR_eventfd2 290 # elif defined __i386__ # define __NR_eventfd2 328 # else # error "need __NR_eventfd2" # endif #endif #define EFD_CLOEXEC O_CLOEXEC int main (void) { int fd = syscall (__NR_eventfd2, 1, 0); if (fd == -1) { puts ("eventfd2(0) failed"); return 1; } int coe = fcntl (fd, F_GETFD); if (coe == -1) { puts ("fcntl failed"); return 1; } if (coe & FD_CLOEXEC) { puts ("eventfd2(0) sets close-on-exec flag"); return 1; } close (fd); fd = syscall (__NR_eventfd2, 1, EFD_CLOEXEC); if (fd == -1) { puts ("eventfd2(EFD_CLOEXEC) failed"); return 1; } coe = fcntl (fd, F_GETFD); if (coe == -1) { puts ("fcntl failed"); return 1; } if ((coe & FD_CLOEXEC) == 0) { puts ("eventfd2(EFD_CLOEXEC) does not set close-on-exec flag"); return 1; } close (fd); puts ("OK"); return 0; } ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ [akpm@linux-foundation.org: add sys_ni stub] Signed-off-by: Ulrich Drepper Acked-by: Davide Libenzi Cc: Michael Kerrisk Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/syscall_table_32.S | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S index c12a36c9fd5..cf112cb11c3 100644 --- a/arch/x86/kernel/syscall_table_32.S +++ b/arch/x86/kernel/syscall_table_32.S @@ -327,3 +327,4 @@ ENTRY(sys_call_table) .long sys_timerfd_settime /* 325 */ .long sys_timerfd_gettime .long sys_signalfd4 + .long sys_eventfd2 -- cgit v1.2.3 From a0998b50c3f0b8fdd265c63e0032f86ebe377dbf Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Wed, 23 Jul 2008 21:29:27 -0700 Subject: flag parameters: epoll_create This patch adds the new epoll_create2 syscall. It extends the old epoll_create syscall by one parameter which is meant to hold a flag value. In this patch the only flag support is EPOLL_CLOEXEC which causes the close-on-exec flag for the returned file descriptor to be set. A new name EPOLL_CLOEXEC is introduced which in this implementation must have the same value as O_CLOEXEC. The following test must be adjusted for architectures other than x86 and x86-64 and in case the syscall numbers changed. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #include #include #include #include #include #ifndef __NR_epoll_create2 # ifdef __x86_64__ # define __NR_epoll_create2 291 # elif defined __i386__ # define __NR_epoll_create2 329 # else # error "need __NR_epoll_create2" # endif #endif #define EPOLL_CLOEXEC O_CLOEXEC int main (void) { int fd = syscall (__NR_epoll_create2, 1, 0); if (fd == -1) { puts ("epoll_create2(0) failed"); return 1; } int coe = fcntl (fd, F_GETFD); if (coe == -1) { puts ("fcntl failed"); return 1; } if (coe & FD_CLOEXEC) { puts ("epoll_create2(0) set close-on-exec flag"); return 1; } close (fd); fd = syscall (__NR_epoll_create2, 1, EPOLL_CLOEXEC); if (fd == -1) { puts ("epoll_create2(EPOLL_CLOEXEC) failed"); return 1; } coe = fcntl (fd, F_GETFD); if (coe == -1) { puts ("fcntl failed"); return 1; } if ((coe & FD_CLOEXEC) == 0) { puts ("epoll_create2(EPOLL_CLOEXEC) set close-on-exec flag"); return 1; } close (fd); puts ("OK"); return 0; } ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Signed-off-by: Ulrich Drepper Acked-by: Davide Libenzi Cc: Michael Kerrisk Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/syscall_table_32.S | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S index cf112cb11c3..4d7007ca263 100644 --- a/arch/x86/kernel/syscall_table_32.S +++ b/arch/x86/kernel/syscall_table_32.S @@ -328,3 +328,4 @@ ENTRY(sys_call_table) .long sys_timerfd_gettime .long sys_signalfd4 .long sys_eventfd2 + .long sys_epoll_create2 -- cgit v1.2.3 From 336dd1f70ff62d7dd8655228caed4c5bfc818c56 Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Wed, 23 Jul 2008 21:29:29 -0700 Subject: flag parameters: dup2 This patch adds the new dup3 syscall. It extends the old dup2 syscall by one parameter which is meant to hold a flag value. Support for the O_CLOEXEC flag is added in this patch. The following test must be adjusted for architectures other than x86 and x86-64 and in case the syscall numbers changed. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #include #include #include #include #include #ifndef __NR_dup3 # ifdef __x86_64__ # define __NR_dup3 292 # elif defined __i386__ # define __NR_dup3 330 # else # error "need __NR_dup3" # endif #endif int main (void) { int fd = syscall (__NR_dup3, 1, 4, 0); if (fd == -1) { puts ("dup3(0) failed"); return 1; } int coe = fcntl (fd, F_GETFD); if (coe == -1) { puts ("fcntl failed"); return 1; } if (coe & FD_CLOEXEC) { puts ("dup3(0) set close-on-exec flag"); return 1; } close (fd); fd = syscall (__NR_dup3, 1, 4, O_CLOEXEC); if (fd == -1) { puts ("dup3(O_CLOEXEC) failed"); return 1; } coe = fcntl (fd, F_GETFD); if (coe == -1) { puts ("fcntl failed"); return 1; } if ((coe & FD_CLOEXEC) == 0) { puts ("dup3(O_CLOEXEC) set close-on-exec flag"); return 1; } close (fd); puts ("OK"); return 0; } ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Signed-off-by: Ulrich Drepper Acked-by: Davide Libenzi Cc: Michael Kerrisk Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/syscall_table_32.S | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S index 4d7007ca263..24a3f1ea6a0 100644 --- a/arch/x86/kernel/syscall_table_32.S +++ b/arch/x86/kernel/syscall_table_32.S @@ -329,3 +329,4 @@ ENTRY(sys_call_table) .long sys_signalfd4 .long sys_eventfd2 .long sys_epoll_create2 + .long sys_dup3 /* 330 */ -- cgit v1.2.3 From ed8cae8ba01348bfd83333f4648dd807b04d7f08 Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Wed, 23 Jul 2008 21:29:30 -0700 Subject: flag parameters: pipe This patch introduces the new syscall pipe2 which is like pipe but it also takes an additional parameter which takes a flag value. This patch implements the handling of O_CLOEXEC for the flag. I did not add support for the new syscall for the architectures which have a special sys_pipe implementation. I think the maintainers of those archs have the chance to go with the unified implementation but that's up to them. The implementation introduces do_pipe_flags. I did that instead of changing all callers of do_pipe because some of the callers are written in assembler. I would probably screw up changing the assembly code. To avoid breaking code do_pipe is now a small wrapper around do_pipe_flags. Once all callers are changed over to do_pipe_flags the old do_pipe function can be removed. The following test must be adjusted for architectures other than x86 and x86-64 and in case the syscall numbers changed. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #include #include #include #include #ifndef __NR_pipe2 # ifdef __x86_64__ # define __NR_pipe2 293 # elif defined __i386__ # define __NR_pipe2 331 # else # error "need __NR_pipe2" # endif #endif int main (void) { int fd[2]; if (syscall (__NR_pipe2, fd, 0) != 0) { puts ("pipe2(0) failed"); return 1; } for (int i = 0; i < 2; ++i) { int coe = fcntl (fd[i], F_GETFD); if (coe == -1) { puts ("fcntl failed"); return 1; } if (coe & FD_CLOEXEC) { printf ("pipe2(0) set close-on-exit for fd[%d]\n", i); return 1; } } close (fd[0]); close (fd[1]); if (syscall (__NR_pipe2, fd, O_CLOEXEC) != 0) { puts ("pipe2(O_CLOEXEC) failed"); return 1; } for (int i = 0; i < 2; ++i) { int coe = fcntl (fd[i], F_GETFD); if (coe == -1) { puts ("fcntl failed"); return 1; } if ((coe & FD_CLOEXEC) == 0) { printf ("pipe2(O_CLOEXEC) does not set close-on-exit for fd[%d]\n", i); return 1; } } close (fd[0]); close (fd[1]); puts ("OK"); return 0; } ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Signed-off-by: Ulrich Drepper Acked-by: Davide Libenzi Cc: Michael Kerrisk Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/syscall_table_32.S | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S index 24a3f1ea6a0..66154769d52 100644 --- a/arch/x86/kernel/syscall_table_32.S +++ b/arch/x86/kernel/syscall_table_32.S @@ -330,3 +330,4 @@ ENTRY(sys_call_table) .long sys_eventfd2 .long sys_epoll_create2 .long sys_dup3 /* 330 */ + .long sys_pipe2 -- cgit v1.2.3 From 4006553b06306b34054529477b06b68a1c66249b Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Wed, 23 Jul 2008 21:29:32 -0700 Subject: flag parameters: inotify_init This patch introduces the new syscall inotify_init1 (note: the 1 stands for the one parameter the syscall takes, as opposed to no parameter before). The values accepted for this parameter are function-specific and defined in the inotify.h header. Here the values must match the O_* flags, though. In this patch CLOEXEC support is introduced. The following test must be adjusted for architectures other than x86 and x86-64 and in case the syscall numbers changed. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #include #include #include #include #ifndef __NR_inotify_init1 # ifdef __x86_64__ # define __NR_inotify_init1 294 # elif defined __i386__ # define __NR_inotify_init1 332 # else # error "need __NR_inotify_init1" # endif #endif #define IN_CLOEXEC O_CLOEXEC int main (void) { int fd; fd = syscall (__NR_inotify_init1, 0); if (fd == -1) { puts ("inotify_init1(0) failed"); return 1; } int coe = fcntl (fd, F_GETFD); if (coe == -1) { puts ("fcntl failed"); return 1; } if (coe & FD_CLOEXEC) { puts ("inotify_init1(0) set close-on-exit"); return 1; } close (fd); fd = syscall (__NR_inotify_init1, IN_CLOEXEC); if (fd == -1) { puts ("inotify_init1(IN_CLOEXEC) failed"); return 1; } coe = fcntl (fd, F_GETFD); if (coe == -1) { puts ("fcntl failed"); return 1; } if ((coe & FD_CLOEXEC) == 0) { puts ("inotify_init1(O_CLOEXEC) does not set close-on-exit"); return 1; } close (fd); puts ("OK"); return 0; } ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ [akpm@linux-foundation.org: add sys_ni stub] Signed-off-by: Ulrich Drepper Acked-by: Davide Libenzi Cc: Michael Kerrisk Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/syscall_table_32.S | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S index 66154769d52..f59aba5ff0f 100644 --- a/arch/x86/kernel/syscall_table_32.S +++ b/arch/x86/kernel/syscall_table_32.S @@ -331,3 +331,4 @@ ENTRY(sys_call_table) .long sys_epoll_create2 .long sys_dup3 /* 330 */ .long sys_pipe2 + .long sys_inotify_init1 -- cgit v1.2.3 From 9fe5ad9c8cef9ad5873d8ee55d1cf00d9b607df0 Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Wed, 23 Jul 2008 21:29:43 -0700 Subject: flag parameters add-on: remove epoll_create size param Remove the size parameter from the new epoll_create syscall and renames the syscall itself. The updated test program follows. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #include #include #include #include #include #ifndef __NR_epoll_create2 # ifdef __x86_64__ # define __NR_epoll_create2 291 # elif defined __i386__ # define __NR_epoll_create2 329 # else # error "need __NR_epoll_create2" # endif #endif #define EPOLL_CLOEXEC O_CLOEXEC int main (void) { int fd = syscall (__NR_epoll_create2, 0); if (fd == -1) { puts ("epoll_create2(0) failed"); return 1; } int coe = fcntl (fd, F_GETFD); if (coe == -1) { puts ("fcntl failed"); return 1; } if (coe & FD_CLOEXEC) { puts ("epoll_create2(0) set close-on-exec flag"); return 1; } close (fd); fd = syscall (__NR_epoll_create2, EPOLL_CLOEXEC); if (fd == -1) { puts ("epoll_create2(EPOLL_CLOEXEC) failed"); return 1; } coe = fcntl (fd, F_GETFD); if (coe == -1) { puts ("fcntl failed"); return 1; } if ((coe & FD_CLOEXEC) == 0) { puts ("epoll_create2(EPOLL_CLOEXEC) set close-on-exec flag"); return 1; } close (fd); puts ("OK"); return 0; } ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Signed-off-by: Ulrich Drepper Acked-by: Davide Libenzi Cc: Michael Kerrisk Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/syscall_table_32.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S index f59aba5ff0f..d44395ff34c 100644 --- a/arch/x86/kernel/syscall_table_32.S +++ b/arch/x86/kernel/syscall_table_32.S @@ -328,7 +328,7 @@ ENTRY(sys_call_table) .long sys_timerfd_gettime .long sys_signalfd4 .long sys_eventfd2 - .long sys_epoll_create2 + .long sys_epoll_create1 .long sys_dup3 /* 330 */ .long sys_pipe2 .long sys_inotify_init1 -- cgit v1.2.3 From 7e2a31da854dcf8324012a83a31b40bc11e52589 Mon Sep 17 00:00:00 2001 From: David Brownell Date: Wed, 23 Jul 2008 21:30:47 -0700 Subject: rtc-cmos: avoid spurious irqs This fixes kernel http://bugzilla.kernel.org/show_bug.cgi?id=11112 (bogus RTC update IRQs reported) for rtc-cmos, in two ways: - When HPET is stealing the IRQs, use the first IRQ to grab the seconds counter which will be monitored (instead of using whatever was previously in that memory); - In sane IRQ handling modes, scrub out old IRQ status before enabling IRQs. That latter is done by tightening up IRQ handling for rtc-cmos everywhere, also ensuring that when HPET is used it's the only thing triggering IRQ reports to userspace; net object shrink. Also fix a bogus HPET message related to its RTC emulation. Signed-off-by: David Brownell Report-by: W Unruh Cc: Andrew Victor Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/hpet.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 0ea6a19bfdf..ad2b15a1334 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -468,7 +468,7 @@ void hpet_disable(void) #define RTC_NUM_INTS 1 static unsigned long hpet_rtc_flags; -static unsigned long hpet_prev_update_sec; +static int hpet_prev_update_sec; static struct rtc_time hpet_alarm_time; static unsigned long hpet_pie_count; static unsigned long hpet_t1_cmp; @@ -575,6 +575,9 @@ int hpet_set_rtc_irq_bit(unsigned long bit_mask) hpet_rtc_flags |= bit_mask; + if ((bit_mask & RTC_UIE) && !(oldbits & RTC_UIE)) + hpet_prev_update_sec = -1; + if (!oldbits) hpet_rtc_timer_init(); @@ -652,7 +655,7 @@ static void hpet_rtc_timer_reinit(void) if (hpet_rtc_flags & RTC_PIE) hpet_pie_count += lost_ints; if (printk_ratelimit()) - printk(KERN_WARNING "rtc: lost %d interrupts\n", + printk(KERN_WARNING "hpet1: lost %d rtc interrupts\n", lost_ints); } } @@ -670,7 +673,8 @@ irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id) if (hpet_rtc_flags & RTC_UIE && curr_time.tm_sec != hpet_prev_update_sec) { - rtc_int_flag = RTC_UF; + if (hpet_prev_update_sec >= 0) + rtc_int_flag = RTC_UF; hpet_prev_update_sec = curr_time.tm_sec; } -- cgit v1.2.3 From 6209ed9d8443b63c36d340908530fa470c4d4fff Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 24 Jul 2008 12:49:26 -0700 Subject: x86-64: make BUILD_IRQ() also reset section back Commit 9d25d4db81833029d30b7b03cc1000cbbe09e192 ("x86: BUILD_IRQ say .text to avoid .data.percpu") added a ".text" specifier to make sure that BUILD_IRQ() builds the irq trampoline in the text segment rather than in some random left-over segment that the compiler happened to leave the asm in. However, we should also make sure that we switch back by adding a ".previous" at the end, so that there are no subtle issues with subsequent compiler-generated code. Signed-off-by: Linus Torvalds --- arch/x86/kernel/irqinit_64.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c index 9414125f19c..1f26fd9ec4f 100644 --- a/arch/x86/kernel/irqinit_64.c +++ b/arch/x86/kernel/irqinit_64.c @@ -46,7 +46,8 @@ asm("\n.text\n.p2align\n" \ "IRQ" #nr "_interrupt:\n\t" \ "push $~(" #nr ") ; " \ - "jmp common_interrupt"); + "jmp common_interrupt\n" \ + ".previous"); #define BI(x,y) \ BUILD_IRQ(x##y) -- cgit v1.2.3 From b30f3ae50cd03ef2ff433a5030fbf88dd8323528 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 24 Jul 2008 15:43:44 -0700 Subject: x86-64: Clean up 'save/restore_i387()' usage Suresh Siddha wants to fix a possible FPU leakage in error conditions, but the fact that save/restore_i387() are inlines in a header file makes that harder to do than necessary. So start off with an obvious cleanup. This just moves the x86-64 version of save/restore_i387() out of the header file, and moves it to the only file that it is actually used in: arch/x86/kernel/signal_64.c. So exposing it in a header file was wrong to begin with. [ Side note: I'd like to fix up some of the games we play with the 32-bit version of these functions too, but that's a separate matter. The 32-bit versions are shared - under different names at that! - by both the native x86-32 code and the x86-64 32-bit compatibility code ] Acked-by: Suresh Siddha Signed-off-by: Linus Torvalds --- arch/x86/kernel/signal_64.c | 53 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c index 47c3d249e63..b45ef8ddd65 100644 --- a/arch/x86/kernel/signal_64.c +++ b/arch/x86/kernel/signal_64.c @@ -53,6 +53,59 @@ sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss, return do_sigaltstack(uss, uoss, regs->sp); } +/* + * Signal frame handlers. + */ + +static inline int save_i387(struct _fpstate __user *buf) +{ + struct task_struct *tsk = current; + int err = 0; + + BUILD_BUG_ON(sizeof(struct user_i387_struct) != + sizeof(tsk->thread.xstate->fxsave)); + + if ((unsigned long)buf % 16) + printk("save_i387: bad fpstate %p\n", buf); + + if (!used_math()) + return 0; + clear_used_math(); /* trigger finit */ + if (task_thread_info(tsk)->status & TS_USEDFPU) { + err = save_i387_checking((struct i387_fxsave_struct __user *) + buf); + if (err) + return err; + task_thread_info(tsk)->status &= ~TS_USEDFPU; + stts(); + } else { + if (__copy_to_user(buf, &tsk->thread.xstate->fxsave, + sizeof(struct i387_fxsave_struct))) + return -1; + } + return 1; +} + +/* + * This restores directly out of user space. Exceptions are handled. + */ +static inline int restore_i387(struct _fpstate __user *buf) +{ + struct task_struct *tsk = current; + int err; + + if (!used_math()) { + err = init_fpu(tsk); + if (err) + return err; + } + + if (!(task_thread_info(current)->status & TS_USEDFPU)) { + clts(); + task_thread_info(current)->status |= TS_USEDFPU; + } + return restore_fpu_checking((__force struct i387_fxsave_struct *)buf); +} /* * Do a signal return; undo the signal stack. -- cgit v1.2.3 From ef53d9c5e4da147ecaa43c44c5e5945eb83970a2 Mon Sep 17 00:00:00 2001 From: Srinivasa D S Date: Fri, 25 Jul 2008 01:46:04 -0700 Subject: kprobes: improve kretprobe scalability with hashed locking Currently list of kretprobe instances are stored in kretprobe object (as used_instances,free_instances) and in kretprobe hash table. We have one global kretprobe lock to serialise the access to these lists. This causes only one kretprobe handler to execute at a time. Hence affects system performance, particularly on SMP systems and when return probe is set on lot of functions (like on all systemcalls). Solution proposed here gives fine-grain locks that performs better on SMP system compared to present kretprobe implementation. Solution: 1) Instead of having one global lock to protect kretprobe instances present in kretprobe object and kretprobe hash table. We will have two locks, one lock for protecting kretprobe hash table and another lock for kretporbe object. 2) We hold lock present in kretprobe object while we modify kretprobe instance in kretprobe object and we hold per-hash-list lock while modifying kretprobe instances present in that hash list. To prevent deadlock, we never grab a per-hash-list lock while holding a kretprobe lock. 3) We can remove used_instances from struct kretprobe, as we can track used instances of kretprobe instances using kretprobe hash table. Time duration for kernel compilation ("make -j 8") on a 8-way ppc64 system with return probes set on all systemcalls looks like this. cacheline non-cacheline Un-patched kernel aligned patch aligned patch =============================================================================== real 9m46.784s 9m54.412s 10m2.450s user 40m5.715s 40m7.142s 40m4.273s sys 2m57.754s 2m58.583s 3m17.430s =========================================================== Time duration for kernel compilation ("make -j 8) on the same system, when kernel is not probed. ========================= real 9m26.389s user 40m8.775s sys 2m7.283s ========================= Signed-off-by: Srinivasa DS Signed-off-by: Jim Keniston Acked-by: Ananth N Mavinakayanahalli Cc: Anil S Keshavamurthy Cc: David S. Miller Cc: Masami Hiramatsu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/kprobes.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c index 43c019f85f0..6c27679ec6a 100644 --- a/arch/x86/kernel/kprobes.c +++ b/arch/x86/kernel/kprobes.c @@ -431,7 +431,6 @@ static void __kprobes prepare_singlestep(struct kprobe *p, struct pt_regs *regs) regs->ip = (unsigned long)p->ainsn.insn; } -/* Called with kretprobe_lock held */ void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs) { @@ -682,8 +681,7 @@ static __used __kprobes void *trampoline_handler(struct pt_regs *regs) unsigned long trampoline_address = (unsigned long)&kretprobe_trampoline; INIT_HLIST_HEAD(&empty_rp); - spin_lock_irqsave(&kretprobe_lock, flags); - head = kretprobe_inst_table_head(current); + kretprobe_hash_lock(current, &head, &flags); /* fixup registers */ #ifdef CONFIG_X86_64 regs->cs = __KERNEL_CS; @@ -732,7 +730,7 @@ static __used __kprobes void *trampoline_handler(struct pt_regs *regs) kretprobe_assert(ri, orig_ret_address, trampoline_address); - spin_unlock_irqrestore(&kretprobe_lock, flags); + kretprobe_hash_unlock(current, &flags); hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) { hlist_del(&ri->hlist); -- cgit v1.2.3 From 95b68dec0d52c7b8fea3698b3938cf3ab936436b Mon Sep 17 00:00:00 2001 From: Chandru Date: Fri, 25 Jul 2008 01:47:55 -0700 Subject: calgary iommu: use the first kernels TCE tables in kdump kdump kernel fails to boot with calgary iommu and aacraid driver on a x366 box. The ongoing dma's of aacraid from the first kernel continue to exist until the driver is loaded in the kdump kernel. Calgary is initialized prior to aacraid and creation of new tce tables causes wrong dma's to occur. Here we try to get the tce tables of the first kernel in kdump kernel and use them. While in the kdump kernel we do not allocate new tce tables but instead read the base address register contents of calgary iommu and use the tables that the registers point to. With these changes the kdump kernel and hence aacraid now boots normally. Signed-off-by: Chandru Siddalingappa Acked-by: Muli Ben-Yehuda Cc: Ingo Molnar Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/pci-calgary_64.c | 85 +++++++++++++++++++++++++++++++++++++--- 1 file changed, 79 insertions(+), 6 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c index 151f2d171f7..19e7fc7c2c4 100644 --- a/arch/x86/kernel/pci-calgary_64.c +++ b/arch/x86/kernel/pci-calgary_64.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include #include @@ -167,6 +168,8 @@ static void calgary_dump_error_regs(struct iommu_table *tbl); static void calioc2_handle_quirks(struct iommu_table *tbl, struct pci_dev *dev); static void calioc2_tce_cache_blast(struct iommu_table *tbl); static void calioc2_dump_error_regs(struct iommu_table *tbl); +static void calgary_init_bitmap_from_tce_table(struct iommu_table *tbl); +static void get_tce_space_from_tar(void); static struct cal_chipset_ops calgary_chip_ops = { .handle_quirks = calgary_handle_quirks, @@ -830,7 +833,11 @@ static int __init calgary_setup_tar(struct pci_dev *dev, void __iomem *bbar) tbl = pci_iommu(dev->bus); tbl->it_base = (unsigned long)bus_info[dev->bus->number].tce_space; - tce_free(tbl, 0, tbl->it_size); + + if (is_kdump_kernel()) + calgary_init_bitmap_from_tce_table(tbl); + else + tce_free(tbl, 0, tbl->it_size); if (is_calgary(dev->device)) tbl->chip_ops = &calgary_chip_ops; @@ -1209,6 +1216,10 @@ static int __init calgary_init(void) if (ret) return ret; + /* Purely for kdump kernel case */ + if (is_kdump_kernel()) + get_tce_space_from_tar(); + do { dev = pci_get_device(PCI_VENDOR_ID_IBM, PCI_ANY_ID, dev); if (!dev) @@ -1339,6 +1350,61 @@ static int __init calgary_bus_has_devices(int bus, unsigned short pci_dev) return (val != 0xffffffff); } +/* + * calgary_init_bitmap_from_tce_table(): + * Funtion for kdump case. In the second/kdump kernel initialize + * the bitmap based on the tce table entries obtained from first kernel + */ +static void calgary_init_bitmap_from_tce_table(struct iommu_table *tbl) +{ + u64 *tp; + unsigned int index; + tp = ((u64 *)tbl->it_base); + for (index = 0 ; index < tbl->it_size; index++) { + if (*tp != 0x0) + set_bit(index, tbl->it_map); + tp++; + } +} + +/* + * get_tce_space_from_tar(): + * Function for kdump case. Get the tce tables from first kernel + * by reading the contents of the base adress register of calgary iommu + */ +static void get_tce_space_from_tar() +{ + int bus; + void __iomem *target; + unsigned long tce_space; + + for (bus = 0; bus < MAX_PHB_BUS_NUM; bus++) { + struct calgary_bus_info *info = &bus_info[bus]; + unsigned short pci_device; + u32 val; + + val = read_pci_config(bus, 0, 0, 0); + pci_device = (val & 0xFFFF0000) >> 16; + + if (!is_cal_pci_dev(pci_device)) + continue; + if (info->translation_disabled) + continue; + + if (calgary_bus_has_devices(bus, pci_device) || + translate_empty_slots) { + target = calgary_reg(bus_info[bus].bbar, + tar_offset(bus)); + tce_space = be64_to_cpu(readq(target)); + tce_space = tce_space & TAR_SW_BITS; + + tce_space = tce_space & (~specified_table_size); + info->tce_space = (u64 *)__va(tce_space); + } + } + return; +} + void __init detect_calgary(void) { int bus; @@ -1394,7 +1460,8 @@ void __init detect_calgary(void) return; } - specified_table_size = determine_tce_table_size(max_pfn * PAGE_SIZE); + specified_table_size = determine_tce_table_size((is_kdump_kernel() ? + saved_max_pfn : max_pfn) * PAGE_SIZE); for (bus = 0; bus < MAX_PHB_BUS_NUM; bus++) { struct calgary_bus_info *info = &bus_info[bus]; @@ -1412,10 +1479,16 @@ void __init detect_calgary(void) if (calgary_bus_has_devices(bus, pci_device) || translate_empty_slots) { - tbl = alloc_tce_table(); - if (!tbl) - goto cleanup; - info->tce_space = tbl; + /* + * If it is kdump kernel, find and use tce tables + * from first kernel, else allocate tce tables here + */ + if (!is_kdump_kernel()) { + tbl = alloc_tce_table(); + if (!tbl) + goto cleanup; + info->tce_space = tbl; + } calgary_found = 1; } } -- cgit v1.2.3 From 3a61ec387c9092dfc91a5959145d36835a72fc4c Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 25 Jul 2008 13:07:50 +0200 Subject: x86, AMD IOMMU: include amd_iommu_last_bdf in device initialization All the values read while searching for amd_iommu_last_bdf are defined as inclusive. Let the code handle this value as such. Found by Wei Wang. Thanks Wei. Signed-off-by: Joerg Roedel Cc: iommu@lists.linux-foundation.org Cc: bhavna.sarathy@amd.com Cc: robert.richter@amd.com Cc: Wei Wang Signed-off-by: Ingo Molnar --- arch/x86/kernel/amd_iommu.c | 4 ++-- arch/x86/kernel/amd_iommu_init.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index c25210e6ac8..74697408576 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -667,7 +667,7 @@ static int get_device_resources(struct device *dev, _bdf = calc_devid(pcidev->bus->number, pcidev->devfn); /* device not translated by any IOMMU in the system? */ - if (_bdf >= amd_iommu_last_bdf) { + if (_bdf > amd_iommu_last_bdf) { *iommu = NULL; *domain = NULL; *bdf = 0xffff; @@ -1085,7 +1085,7 @@ void prealloc_protection_domains(void) while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { devid = (dev->bus->number << 8) | dev->devfn; - if (devid >= amd_iommu_last_bdf) + if (devid > amd_iommu_last_bdf) continue; devid = amd_iommu_alias_table[devid]; if (domain_for_device(devid)) diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index c9d8ff2eb13..d9a9da597e7 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -732,7 +732,7 @@ static int __init init_exclusion_range(struct ivmd_header *m) set_device_exclusion_range(m->devid, m); break; case ACPI_IVMD_TYPE_ALL: - for (i = 0; i < amd_iommu_last_bdf; ++i) + for (i = 0; i <= amd_iommu_last_bdf; ++i) set_device_exclusion_range(i, m); break; case ACPI_IVMD_TYPE_RANGE: @@ -934,7 +934,7 @@ int __init amd_iommu_init(void) /* * let all alias entries point to itself */ - for (i = 0; i < amd_iommu_last_bdf; ++i) + for (i = 0; i <= amd_iommu_last_bdf; ++i) amd_iommu_alias_table[i] = i; /* -- cgit v1.2.3 From 8d8bb39b9eba32dd70e87fd5ad5c5dd4ba118e06 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Fri, 25 Jul 2008 19:44:49 -0700 Subject: dma-mapping: add the device argument to dma_mapping_error() Add per-device dma_mapping_ops support for CONFIG_X86_64 as POWER architecture does: This enables us to cleanly fix the Calgary IOMMU issue that some devices are not behind the IOMMU (http://lkml.org/lkml/2008/5/8/423). I think that per-device dma_mapping_ops support would be also helpful for KVM people to support PCI passthrough but Andi thinks that this makes it difficult to support the PCI passthrough (see the above thread). So I CC'ed this to KVM camp. Comments are appreciated. A pointer to dma_mapping_ops to struct dev_archdata is added. If the pointer is non NULL, DMA operations in asm/dma-mapping.h use it. If it's NULL, the system-wide dma_ops pointer is used as before. If it's useful for KVM people, I plan to implement a mechanism to register a hook called when a new pci (or dma capable) device is created (it works with hot plugging). It enables IOMMUs to set up an appropriate dma_mapping_ops per device. The major obstacle is that dma_mapping_error doesn't take a pointer to the device unlike other DMA operations. So x86 can't have dma_mapping_ops per device. Note all the POWER IOMMUs use the same dma_mapping_error function so this is not a problem for POWER but x86 IOMMUs use different dma_mapping_error functions. The first patch adds the device argument to dma_mapping_error. The patch is trivial but large since it touches lots of drivers and dma-mapping.h in all the architecture. This patch: dma_mapping_error() doesn't take a pointer to the device unlike other DMA operations. So we can't have dma_mapping_ops per device. Note that POWER already has dma_mapping_ops per device but all the POWER IOMMUs use the same dma_mapping_error function. x86 IOMMUs use device argument. [akpm@linux-foundation.org: fix sge] [akpm@linux-foundation.org: fix svc_rdma] [akpm@linux-foundation.org: build fix] [akpm@linux-foundation.org: fix bnx2x] [akpm@linux-foundation.org: fix s2io] [akpm@linux-foundation.org: fix pasemi_mac] [akpm@linux-foundation.org: fix sdhci] [akpm@linux-foundation.org: build fix] [akpm@linux-foundation.org: fix sparc] [akpm@linux-foundation.org: fix ibmvscsi] Signed-off-by: FUJITA Tomonori Cc: Muli Ben-Yehuda Cc: Andi Kleen Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Avi Kivity Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/pci-calgary_64.c | 2 +- arch/x86/kernel/pci-dma.c | 27 ++++++++++++++++----------- arch/x86/kernel/pci-gart_64.c | 3 +-- arch/x86/kernel/pci-nommu.c | 14 +------------- arch/x86/kernel/pci-swiotlb_64.c | 2 +- 5 files changed, 20 insertions(+), 28 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c index 19e7fc7c2c4..1eb86be93d7 100644 --- a/arch/x86/kernel/pci-calgary_64.c +++ b/arch/x86/kernel/pci-calgary_64.c @@ -544,7 +544,7 @@ error: return ret; } -static const struct dma_mapping_ops calgary_dma_ops = { +static struct dma_mapping_ops calgary_dma_ops = { .alloc_coherent = calgary_alloc_coherent, .map_single = calgary_map_single, .unmap_single = calgary_unmap_single, diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index cbecb05551b..37544123896 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -11,7 +11,7 @@ static int forbid_dac __read_mostly; -const struct dma_mapping_ops *dma_ops; +struct dma_mapping_ops *dma_ops; EXPORT_SYMBOL(dma_ops); static int iommu_sac_force __read_mostly; @@ -312,6 +312,8 @@ static int dma_release_coherent(struct device *dev, int order, void *vaddr) int dma_supported(struct device *dev, u64 mask) { + struct dma_mapping_ops *ops = get_dma_ops(dev); + #ifdef CONFIG_PCI if (mask > 0xffffffff && forbid_dac > 0) { dev_info(dev, "PCI: Disallowing DAC for device\n"); @@ -319,8 +321,8 @@ int dma_supported(struct device *dev, u64 mask) } #endif - if (dma_ops->dma_supported) - return dma_ops->dma_supported(dev, mask); + if (ops->dma_supported) + return ops->dma_supported(dev, mask); /* Copied from i386. Doesn't make much sense, because it will only work for pci_alloc_coherent. @@ -367,6 +369,7 @@ void * dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t gfp) { + struct dma_mapping_ops *ops = get_dma_ops(dev); void *memory = NULL; struct page *page; unsigned long dma_mask = 0; @@ -435,8 +438,8 @@ dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, /* Let low level make its own zone decisions */ gfp &= ~(GFP_DMA32|GFP_DMA); - if (dma_ops->alloc_coherent) - return dma_ops->alloc_coherent(dev, size, + if (ops->alloc_coherent) + return ops->alloc_coherent(dev, size, dma_handle, gfp); return NULL; } @@ -448,14 +451,14 @@ dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, } } - if (dma_ops->alloc_coherent) { + if (ops->alloc_coherent) { free_pages((unsigned long)memory, get_order(size)); gfp &= ~(GFP_DMA|GFP_DMA32); - return dma_ops->alloc_coherent(dev, size, dma_handle, gfp); + return ops->alloc_coherent(dev, size, dma_handle, gfp); } - if (dma_ops->map_simple) { - *dma_handle = dma_ops->map_simple(dev, virt_to_phys(memory), + if (ops->map_simple) { + *dma_handle = ops->map_simple(dev, virt_to_phys(memory), size, PCI_DMA_BIDIRECTIONAL); if (*dma_handle != bad_dma_address) @@ -477,12 +480,14 @@ EXPORT_SYMBOL(dma_alloc_coherent); void dma_free_coherent(struct device *dev, size_t size, void *vaddr, dma_addr_t bus) { + struct dma_mapping_ops *ops = get_dma_ops(dev); + int order = get_order(size); WARN_ON(irqs_disabled()); /* for portability */ if (dma_release_coherent(dev, order, vaddr)) return; - if (dma_ops->unmap_single) - dma_ops->unmap_single(dev, bus, size, 0); + if (ops->unmap_single) + ops->unmap_single(dev, bus, size, 0); free_pages((unsigned long)vaddr, order); } EXPORT_SYMBOL(dma_free_coherent); diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index df5f142657d..744126e6495 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -692,8 +692,7 @@ static __init int init_k8_gatt(struct agp_kern_info *info) extern int agp_amd64_init(void); -static const struct dma_mapping_ops gart_dma_ops = { - .mapping_error = NULL, +static struct dma_mapping_ops gart_dma_ops = { .map_single = gart_map_single, .map_simple = gart_map_simple, .unmap_single = gart_unmap_single, diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c index 792b9179eff..3f91f71cdc3 100644 --- a/arch/x86/kernel/pci-nommu.c +++ b/arch/x86/kernel/pci-nommu.c @@ -72,21 +72,9 @@ static int nommu_map_sg(struct device *hwdev, struct scatterlist *sg, return nents; } -/* Make sure we keep the same behaviour */ -static int nommu_mapping_error(dma_addr_t dma_addr) -{ -#ifdef CONFIG_X86_32 - return 0; -#else - return (dma_addr == bad_dma_address); -#endif -} - - -const struct dma_mapping_ops nommu_dma_ops = { +struct dma_mapping_ops nommu_dma_ops = { .map_single = nommu_map_single, .map_sg = nommu_map_sg, - .mapping_error = nommu_mapping_error, .is_phys = 1, }; diff --git a/arch/x86/kernel/pci-swiotlb_64.c b/arch/x86/kernel/pci-swiotlb_64.c index 20df839b9c2..c4ce0332759 100644 --- a/arch/x86/kernel/pci-swiotlb_64.c +++ b/arch/x86/kernel/pci-swiotlb_64.c @@ -18,7 +18,7 @@ swiotlb_map_single_phys(struct device *hwdev, phys_addr_t paddr, size_t size, return swiotlb_map_single(hwdev, phys_to_virt(paddr), size, direction); } -const struct dma_mapping_ops swiotlb_dma_ops = { +struct dma_mapping_ops swiotlb_dma_ops = { .mapping_error = swiotlb_dma_mapping_error, .alloc_coherent = swiotlb_alloc_coherent, .free_coherent = swiotlb_free_coherent, -- cgit v1.2.3 From 1956a96de488feb05e95c08c9d5e80f63a4be2b1 Mon Sep 17 00:00:00 2001 From: Alexis Bruemmer Date: Fri, 25 Jul 2008 19:44:51 -0700 Subject: x86 calgary: fix handling of devices that aren't behind the Calgary The calgary code can give drivers addresses above 4GB which is very bad for hardware that is only 32bit DMA addressable. With this patch, the calgary code sets the global dma_ops to swiotlb or nommu properly, and the dma_ops of devices behind the Calgary/CalIOC2 to calgary_dma_ops. So the calgary code can handle devices safely that aren't behind the Calgary/CalIOC2. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Alexis Bruemmer Signed-off-by: FUJITA Tomonori Cc: Muli Ben-Yehuda Cc: Ingo Molnar Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/pci-calgary_64.c | 71 +++++++++++++++------------------------- 1 file changed, 26 insertions(+), 45 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c index 1eb86be93d7..b67a4b1d4ea 100644 --- a/arch/x86/kernel/pci-calgary_64.c +++ b/arch/x86/kernel/pci-calgary_64.c @@ -37,6 +37,7 @@ #include #include #include + #include #include #include @@ -413,22 +414,6 @@ static void calgary_unmap_sg(struct device *dev, } } -static int calgary_nontranslate_map_sg(struct device* dev, - struct scatterlist *sg, int nelems, int direction) -{ - struct scatterlist *s; - int i; - - for_each_sg(sg, s, nelems, i) { - struct page *p = sg_page(s); - - BUG_ON(!p); - s->dma_address = virt_to_bus(sg_virt(s)); - s->dma_length = s->length; - } - return nelems; -} - static int calgary_map_sg(struct device *dev, struct scatterlist *sg, int nelems, int direction) { @@ -439,9 +424,6 @@ static int calgary_map_sg(struct device *dev, struct scatterlist *sg, unsigned long entry; int i; - if (!translation_enabled(tbl)) - return calgary_nontranslate_map_sg(dev, sg, nelems, direction); - for_each_sg(sg, s, nelems, i) { BUG_ON(!sg_page(s)); @@ -477,7 +459,6 @@ error: static dma_addr_t calgary_map_single(struct device *dev, phys_addr_t paddr, size_t size, int direction) { - dma_addr_t dma_handle = bad_dma_address; void *vaddr = phys_to_virt(paddr); unsigned long uaddr; unsigned int npages; @@ -486,12 +467,7 @@ static dma_addr_t calgary_map_single(struct device *dev, phys_addr_t paddr, uaddr = (unsigned long)vaddr; npages = num_dma_pages(uaddr, size); - if (translation_enabled(tbl)) - dma_handle = iommu_alloc(dev, tbl, vaddr, npages, direction); - else - dma_handle = virt_to_bus(vaddr); - - return dma_handle; + return iommu_alloc(dev, tbl, vaddr, npages, direction); } static void calgary_unmap_single(struct device *dev, dma_addr_t dma_handle, @@ -500,9 +476,6 @@ static void calgary_unmap_single(struct device *dev, dma_addr_t dma_handle, struct iommu_table *tbl = find_iommu_table(dev); unsigned int npages; - if (!translation_enabled(tbl)) - return; - npages = num_dma_pages(dma_handle, size); iommu_free(tbl, dma_handle, npages); } @@ -525,18 +498,12 @@ static void* calgary_alloc_coherent(struct device *dev, size_t size, goto error; memset(ret, 0, size); - if (translation_enabled(tbl)) { - /* set up tces to cover the allocated range */ - mapping = iommu_alloc(dev, tbl, ret, npages, DMA_BIDIRECTIONAL); - if (mapping == bad_dma_address) - goto free; - - *dma_handle = mapping; - } else /* non translated slot */ - *dma_handle = virt_to_bus(ret); - + /* set up tces to cover the allocated range */ + mapping = iommu_alloc(dev, tbl, ret, npages, DMA_BIDIRECTIONAL); + if (mapping == bad_dma_address) + goto free; + *dma_handle = mapping; return ret; - free: free_pages((unsigned long)ret, get_order(size)); ret = NULL; @@ -1241,6 +1208,16 @@ static int __init calgary_init(void) goto error; } while (1); + dev = NULL; + for_each_pci_dev(dev) { + struct iommu_table *tbl; + + tbl = find_iommu_table(&dev->dev); + + if (translation_enabled(tbl)) + dev->dev.archdata.dma_ops = &calgary_dma_ops; + } + return ret; error: @@ -1262,6 +1239,7 @@ error: calgary_disable_translation(dev); calgary_free_bus(dev); pci_dev_put(dev); /* Undo calgary_init_one()'s pci_dev_get() */ + dev->dev.archdata.dma_ops = NULL; } while (1); return ret; @@ -1503,6 +1481,10 @@ void __init detect_calgary(void) printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d, " "CONFIG_IOMMU_DEBUG is %s.\n", specified_table_size, debugging ? "enabled" : "disabled"); + + /* swiotlb for devices that aren't behind the Calgary. */ + if (max_pfn > MAX_DMA32_PFN) + swiotlb = 1; } return; @@ -1519,7 +1501,7 @@ int __init calgary_iommu_init(void) { int ret; - if (no_iommu || swiotlb) + if (no_iommu || (swiotlb && !calgary_detected)) return -ENODEV; if (!calgary_detected) @@ -1532,15 +1514,14 @@ int __init calgary_iommu_init(void) if (ret) { printk(KERN_ERR "PCI-DMA: Calgary init failed %d, " "falling back to no_iommu\n", ret); - if (max_pfn > MAX_DMA32_PFN) - printk(KERN_ERR "WARNING more than 4GB of memory, " - "32bit PCI may malfunction.\n"); return ret; } force_iommu = 1; bad_dma_address = 0x0; - dma_ops = &calgary_dma_ops; + /* dma_ops is set to swiotlb or nommu */ + if (!dma_ops) + dma_ops = &nommu_dma_ops; return 0; } -- cgit v1.2.3 From 3ab83521378268044a448113c6aa9a9e245f4d2f Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Fri, 25 Jul 2008 19:45:07 -0700 Subject: kexec jump This patch provides an enhancement to kexec/kdump. It implements the following features: - Backup/restore memory used by the original kernel before/after kexec. - Save/restore CPU state before/after kexec. The features of this patch can be used as a general method to call program in physical mode (paging turning off). This can be used to call BIOS code under Linux. kexec-tools needs to be patched to support kexec jump. The patches and the precompiled kexec can be download from the following URL: source: http://khibernation.sourceforge.net/download/release_v10/kexec-tools/kexec-tools-src_git_kh10.tar.bz2 patches: http://khibernation.sourceforge.net/download/release_v10/kexec-tools/kexec-tools-patches_git_kh10.tar.bz2 binary: http://khibernation.sourceforge.net/download/release_v10/kexec-tools/kexec_git_kh10 Usage example of calling some physical mode code and return: 1. Compile and install patched kernel with following options selected: CONFIG_X86_32=y CONFIG_KEXEC=y CONFIG_PM=y CONFIG_KEXEC_JUMP=y 2. Build patched kexec-tool or download the pre-built one. 3. Build some physical mode executable named such as "phy_mode" 4. Boot kernel compiled in step 1. 5. Load physical mode executable with /sbin/kexec. The shell command line can be as follow: /sbin/kexec --load-preserve-context --args-none phy_mode 6. Call physical mode executable with following shell command line: /sbin/kexec -e Implementation point: To support jumping without reserving memory. One shadow backup page (source page) is allocated for each page used by kexeced code image (destination page). When do kexec_load, the image of kexeced code is loaded into source pages, and before executing, the destination pages and the source pages are swapped, so the contents of destination pages are backupped. Before jumping to the kexeced code image and after jumping back to the original kernel, the destination pages and the source pages are swapped too. C ABI (calling convention) is used as communication protocol between kernel and called code. A flag named KEXEC_PRESERVE_CONTEXT for sys_kexec_load is added to indicate that the loaded kernel image is used for jumping back. Now, only the i386 architecture is supported. Signed-off-by: Huang Ying Acked-by: Vivek Goyal Cc: "Eric W. Biederman" Cc: Pavel Machek Cc: Nigel Cunningham Cc: "Rafael J. Wysocki" Cc: Ingo Molnar Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/machine_kexec_32.c | 27 ++++-- arch/x86/kernel/machine_kexec_64.c | 2 +- arch/x86/kernel/relocate_kernel_32.S | 174 ++++++++++++++++++++++++++++++----- 3 files changed, 172 insertions(+), 31 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c index 8864230d55a..2b67609d0a1 100644 --- a/arch/x86/kernel/machine_kexec_32.c +++ b/arch/x86/kernel/machine_kexec_32.c @@ -22,6 +22,7 @@ #include #include #include +#include #define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE))) static u32 kexec_pgd[1024] PAGE_ALIGNED; @@ -85,10 +86,12 @@ static void load_segments(void) * reboot code buffer to allow us to avoid allocations * later. * - * Currently nothing. + * Make control page executable. */ int machine_kexec_prepare(struct kimage *image) { + if (nx_enabled) + set_pages_x(image->control_code_page, 1); return 0; } @@ -98,16 +101,24 @@ int machine_kexec_prepare(struct kimage *image) */ void machine_kexec_cleanup(struct kimage *image) { + if (nx_enabled) + set_pages_nx(image->control_code_page, 1); } /* * Do not allocate memory (or fail in any way) in machine_kexec(). * We are past the point of no return, committed to rebooting now. */ -NORET_TYPE void machine_kexec(struct kimage *image) +void machine_kexec(struct kimage *image) { unsigned long page_list[PAGES_NR]; void *control_page; + asmlinkage unsigned long + (*relocate_kernel_ptr)(unsigned long indirection_page, + unsigned long control_page, + unsigned long start_address, + unsigned int has_pae, + unsigned int preserve_context); tracer_disable(); @@ -115,10 +126,11 @@ NORET_TYPE void machine_kexec(struct kimage *image) local_irq_disable(); control_page = page_address(image->control_code_page); - memcpy(control_page, relocate_kernel, PAGE_SIZE); + memcpy(control_page, relocate_kernel, PAGE_SIZE/2); + relocate_kernel_ptr = control_page; page_list[PA_CONTROL_PAGE] = __pa(control_page); - page_list[VA_CONTROL_PAGE] = (unsigned long)relocate_kernel; + page_list[VA_CONTROL_PAGE] = (unsigned long)control_page; page_list[PA_PGD] = __pa(kexec_pgd); page_list[VA_PGD] = (unsigned long)kexec_pgd; #ifdef CONFIG_X86_PAE @@ -131,6 +143,7 @@ NORET_TYPE void machine_kexec(struct kimage *image) page_list[VA_PTE_0] = (unsigned long)kexec_pte0; page_list[PA_PTE_1] = __pa(kexec_pte1); page_list[VA_PTE_1] = (unsigned long)kexec_pte1; + page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page) << PAGE_SHIFT); /* The segment registers are funny things, they have both a * visible and an invisible part. Whenever the visible part is @@ -149,8 +162,10 @@ NORET_TYPE void machine_kexec(struct kimage *image) set_idt(phys_to_virt(0),0); /* now call it */ - relocate_kernel((unsigned long)image->head, (unsigned long)page_list, - image->start, cpu_has_pae); + image->start = relocate_kernel_ptr((unsigned long)image->head, + (unsigned long)page_list, + image->start, cpu_has_pae, + image->preserve_context); } void arch_crash_save_vmcoreinfo(void) diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 9dd9262693a..c43caa3a91f 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -181,7 +181,7 @@ void machine_kexec_cleanup(struct kimage *image) * Do not allocate memory (or fail in any way) in machine_kexec(). * We are past the point of no return, committed to rebooting now. */ -NORET_TYPE void machine_kexec(struct kimage *image) +void machine_kexec(struct kimage *image) { unsigned long page_list[PAGES_NR]; void *control_page; diff --git a/arch/x86/kernel/relocate_kernel_32.S b/arch/x86/kernel/relocate_kernel_32.S index c30fe25d470..703310a9902 100644 --- a/arch/x86/kernel/relocate_kernel_32.S +++ b/arch/x86/kernel/relocate_kernel_32.S @@ -20,11 +20,44 @@ #define PAGE_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY) #define PAE_PGD_ATTR (_PAGE_PRESENT) +/* control_page + PAGE_SIZE/2 ~ control_page + PAGE_SIZE * 3/4 are + * used to save some data for jumping back + */ +#define DATA(offset) (PAGE_SIZE/2+(offset)) + +/* Minimal CPU state */ +#define ESP DATA(0x0) +#define CR0 DATA(0x4) +#define CR3 DATA(0x8) +#define CR4 DATA(0xc) + +/* other data */ +#define CP_VA_CONTROL_PAGE DATA(0x10) +#define CP_PA_PGD DATA(0x14) +#define CP_PA_SWAP_PAGE DATA(0x18) +#define CP_PA_BACKUP_PAGES_MAP DATA(0x1c) + .text .align PAGE_SIZE .globl relocate_kernel relocate_kernel: - movl 8(%esp), %ebp /* list of pages */ + /* Save the CPU context, used for jumping back */ + + pushl %ebx + pushl %esi + pushl %edi + pushl %ebp + pushf + + movl 20+8(%esp), %ebp /* list of pages */ + movl PTR(VA_CONTROL_PAGE)(%ebp), %edi + movl %esp, ESP(%edi) + movl %cr0, %eax + movl %eax, CR0(%edi) + movl %cr3, %eax + movl %eax, CR3(%edi) + movl %cr4, %eax + movl %eax, CR4(%edi) #ifdef CONFIG_X86_PAE /* map the control page at its virtual address */ @@ -138,15 +171,25 @@ relocate_kernel: relocate_new_kernel: /* read the arguments and say goodbye to the stack */ - movl 4(%esp), %ebx /* page_list */ - movl 8(%esp), %ebp /* list of pages */ - movl 12(%esp), %edx /* start address */ - movl 16(%esp), %ecx /* cpu_has_pae */ + movl 20+4(%esp), %ebx /* page_list */ + movl 20+8(%esp), %ebp /* list of pages */ + movl 20+12(%esp), %edx /* start address */ + movl 20+16(%esp), %ecx /* cpu_has_pae */ + movl 20+20(%esp), %esi /* preserve_context */ /* zero out flags, and disable interrupts */ pushl $0 popfl + /* save some information for jumping back */ + movl PTR(VA_CONTROL_PAGE)(%ebp), %edi + movl %edi, CP_VA_CONTROL_PAGE(%edi) + movl PTR(PA_PGD)(%ebp), %eax + movl %eax, CP_PA_PGD(%edi) + movl PTR(PA_SWAP_PAGE)(%ebp), %eax + movl %eax, CP_PA_SWAP_PAGE(%edi) + movl %ebx, CP_PA_BACKUP_PAGES_MAP(%edi) + /* get physical address of control page now */ /* this is impossible after page table switch */ movl PTR(PA_CONTROL_PAGE)(%ebp), %edi @@ -197,8 +240,90 @@ identity_mapped: xorl %eax, %eax movl %eax, %cr3 + movl CP_PA_SWAP_PAGE(%edi), %eax + pushl %eax + pushl %ebx + call swap_pages + addl $8, %esp + + /* To be certain of avoiding problems with self-modifying code + * I need to execute a serializing instruction here. + * So I flush the TLB, it's handy, and not processor dependent. + */ + xorl %eax, %eax + movl %eax, %cr3 + + /* set all of the registers to known values */ + /* leave %esp alone */ + + testl %esi, %esi + jnz 1f + xorl %edi, %edi + xorl %eax, %eax + xorl %ebx, %ebx + xorl %ecx, %ecx + xorl %edx, %edx + xorl %esi, %esi + xorl %ebp, %ebp + ret +1: + popl %edx + movl CP_PA_SWAP_PAGE(%edi), %esp + addl $PAGE_SIZE, %esp +2: + call *%edx + + /* get the re-entry point of the peer system */ + movl 0(%esp), %ebp + call 1f +1: + popl %ebx + subl $(1b - relocate_kernel), %ebx + movl CP_VA_CONTROL_PAGE(%ebx), %edi + lea PAGE_SIZE(%ebx), %esp + movl CP_PA_SWAP_PAGE(%ebx), %eax + movl CP_PA_BACKUP_PAGES_MAP(%ebx), %edx + pushl %eax + pushl %edx + call swap_pages + addl $8, %esp + movl CP_PA_PGD(%ebx), %eax + movl %eax, %cr3 + movl %cr0, %eax + orl $(1<<31), %eax + movl %eax, %cr0 + lea PAGE_SIZE(%edi), %esp + movl %edi, %eax + addl $(virtual_mapped - relocate_kernel), %eax + pushl %eax + ret + +virtual_mapped: + movl CR4(%edi), %eax + movl %eax, %cr4 + movl CR3(%edi), %eax + movl %eax, %cr3 + movl CR0(%edi), %eax + movl %eax, %cr0 + movl ESP(%edi), %esp + movl %ebp, %eax + + popf + popl %ebp + popl %edi + popl %esi + popl %ebx + ret + /* Do the copies */ - movl %ebx, %ecx +swap_pages: + movl 8(%esp), %edx + movl 4(%esp), %ecx + pushl %ebp + pushl %ebx + pushl %edi + pushl %esi + movl %ecx, %ebx jmp 1f 0: /* top, read another word from the indirection page */ @@ -226,27 +351,28 @@ identity_mapped: movl %ecx, %esi /* For every source page do a copy */ andl $0xfffff000, %esi + movl %edi, %eax + movl %esi, %ebp + + movl %edx, %edi movl $1024, %ecx rep ; movsl - jmp 0b -3: - - /* To be certain of avoiding problems with self-modifying code - * I need to execute a serializing instruction here. - * So I flush the TLB, it's handy, and not processor dependent. - */ - xorl %eax, %eax - movl %eax, %cr3 + movl %ebp, %edi + movl %eax, %esi + movl $1024, %ecx + rep ; movsl - /* set all of the registers to known values */ - /* leave %esp alone */ + movl %eax, %edi + movl %edx, %esi + movl $1024, %ecx + rep ; movsl - xorl %eax, %eax - xorl %ebx, %ebx - xorl %ecx, %ecx - xorl %edx, %edx - xorl %esi, %esi - xorl %edi, %edi - xorl %ebp, %ebp + lea PAGE_SIZE(%ebp), %esi + jmp 0b +3: + popl %esi + popl %edi + popl %ebx + popl %ebp ret -- cgit v1.2.3 From 89081d17f7bb81d89fa1aa9b70f821c5cf4d39e9 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Fri, 25 Jul 2008 19:45:10 -0700 Subject: kexec jump: save/restore device state This patch implements devices state save/restore before after kexec. This patch together with features in kexec_jump patch can be used for following: - A simple hibernation implementation without ACPI support. You can kexec a hibernating kernel, save the memory image of original system and shutdown the system. When resuming, you restore the memory image of original system via ordinary kexec load then jump back. - Kernel/system debug through making system snapshot. You can make system snapshot, jump back, do some thing and make another system snapshot. - Cooperative multi-kernel/system. With kexec jump, you can switch between several kernels/systems quickly without boot process except the first time. This appears like swap a whole kernel/system out/in. - A general method to call program in physical mode (paging turning off). This can be used to invoke BIOS code under Linux. The following user-space tools can be used with kexec jump: - kexec-tools needs to be patched to support kexec jump. The patches and the precompiled kexec can be download from the following URL: source: http://khibernation.sourceforge.net/download/release_v10/kexec-tools/kexec-tools-src_git_kh10.tar.bz2 patches: http://khibernation.sourceforge.net/download/release_v10/kexec-tools/kexec-tools-patches_git_kh10.tar.bz2 binary: http://khibernation.sourceforge.net/download/release_v10/kexec-tools/kexec_git_kh10 - makedumpfile with patches are used as memory image saving tool, it can exclude free pages from original kernel memory image file. The patches and the precompiled makedumpfile can be download from the following URL: source: http://khibernation.sourceforge.net/download/release_v10/makedumpfile/makedumpfile-src_cvs_kh10.tar.bz2 patches: http://khibernation.sourceforge.net/download/release_v10/makedumpfile/makedumpfile-patches_cvs_kh10.tar.bz2 binary: http://khibernation.sourceforge.net/download/release_v10/makedumpfile/makedumpfile_cvs_kh10 - An initramfs image can be used as the root file system of kexeced kernel. An initramfs image built with "BuildRoot" can be downloaded from the following URL: initramfs image: http://khibernation.sourceforge.net/download/release_v10/initramfs/rootfs_cvs_kh10.gz All user space tools above are included in the initramfs image. Usage example of simple hibernation: 1. Compile and install patched kernel with following options selected: CONFIG_X86_32=y CONFIG_RELOCATABLE=y CONFIG_KEXEC=y CONFIG_CRASH_DUMP=y CONFIG_PM=y CONFIG_HIBERNATION=y CONFIG_KEXEC_JUMP=y 2. Build an initramfs image contains kexec-tool and makedumpfile, or download the pre-built initramfs image, called rootfs.gz in following text. 3. Prepare a partition to save memory image of original kernel, called hibernating partition in following text. 4. Boot kernel compiled in step 1 (kernel A). 5. In the kernel A, load kernel compiled in step 1 (kernel B) with /sbin/kexec. The shell command line can be as follow: /sbin/kexec --load-preserve-context /boot/bzImage --mem-min=0x100000 --mem-max=0xffffff --initrd=rootfs.gz 6. Boot the kernel B with following shell command line: /sbin/kexec -e 7. The kernel B will boot as normal kexec. In kernel B the memory image of kernel A can be saved into hibernating partition as follow: jump_back_entry=`cat /proc/cmdline | tr ' ' '\n' | grep kexec_jump_back_entry | cut -d '='` echo $jump_back_entry > kexec_jump_back_entry cp /proc/vmcore dump.elf Then you can shutdown the machine as normal. 8. Boot kernel compiled in step 1 (kernel C). Use the rootfs.gz as root file system. 9. In kernel C, load the memory image of kernel A as follow: /sbin/kexec -l --args-none --entry=`cat kexec_jump_back_entry` dump.elf 10. Jump back to the kernel A as follow: /sbin/kexec -e Then, kernel A is resumed. Implementation point: To support jumping between two kernels, before jumping to (executing) the new kernel and jumping back to the original kernel, the devices are put into quiescent state, and the state of devices and CPU is saved. After jumping back from kexeced kernel and jumping to the new kernel, the state of devices and CPU are restored accordingly. The devices/CPU state save/restore code of software suspend is called to implement corresponding function. Known issues: - Because the segment number supported by sys_kexec_load is limited, hibernation image with many segments may not be load. This is planned to be eliminated by adding a new flag to sys_kexec_load to make a image can be loaded with multiple sys_kexec_load invoking. Now, only the i386 architecture is supported. Signed-off-by: Huang Ying Acked-by: Vivek Goyal Cc: "Eric W. Biederman" Cc: Pavel Machek Cc: Nigel Cunningham Cc: "Rafael J. Wysocki" Cc: Ingo Molnar Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/machine_kexec_32.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c index 2b67609d0a1..9fe478d9840 100644 --- a/arch/x86/kernel/machine_kexec_32.c +++ b/arch/x86/kernel/machine_kexec_32.c @@ -125,6 +125,18 @@ void machine_kexec(struct kimage *image) /* Interrupts aren't acceptable while we reboot */ local_irq_disable(); + if (image->preserve_context) { +#ifdef CONFIG_X86_IO_APIC + /* We need to put APICs in legacy mode so that we can + * get timer interrupts in second kernel. kexec/kdump + * paths already have calls to disable_IO_APIC() in + * one form or other. kexec jump path also need + * one. + */ + disable_IO_APIC(); +#endif + } + control_page = page_address(image->control_code_page); memcpy(control_page, relocate_kernel, PAGE_SIZE/2); -- cgit v1.2.3