aboutsummaryrefslogtreecommitdiff
path: root/arch/x86
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@woody.linux-foundation.org>2008-02-09 23:29:57 -0800
committerLinus Torvalds <torvalds@woody.linux-foundation.org>2008-02-09 23:29:57 -0800
commit0b6ca82af83a79f3d1001c8a0701ed34ac38126e (patch)
treedef8eb112c513b21e826e370f2f34249e97914eb /arch/x86
parentbfc1de0c40a26c6daa46c297e28138aecb4c5664 (diff)
parentfac84939609a683503947f41eb93e1917d026263 (diff)
Merge git://git.kernel.org/pub/scm/linux/kernel/git/x86/linux-2.6-x86
* git://git.kernel.org/pub/scm/linux/kernel/git/x86/linux-2.6-x86: (32 commits) x86: cpa, strict range check in try_preserve_large_page() x86: cpa, enable CONFIG_DEBUG_PAGEALLOC on 64-bit x86: cpa, use page pool x86: introduce page pool in cpa x86: DEBUG_PAGEALLOC: enable after mem_init() brk: help text typo fix lguest: accept guest _PAGE_PWT page table entries x86 PM: update stale comments x86 PM: consolidate suspend and hibernation code x86 PM: rename 32-bit files in arch/x86/power x86 PM: move 64-bit hibernation files to arch/x86/power x86: trivial printk optimizations x86: fix early_ioremap pagetable ops x86: construct 32-bit boot time page tables in native format. x86, core: remove CONFIG_FORCED_INLINING x86: avoid unused variable warning in mm/init_64.c x86: fixup more paravirt fallout brk: document randomize_va_space and CONFIG_COMPAT_BRK (was Re: x86: fix sparse warnings in acpi/bus.c x86: fix sparse warning in topology.c ...
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig.debug6
-rw-r--r--arch/x86/Makefile4
-rw-r--r--arch/x86/boot/printf.c24
-rw-r--r--arch/x86/configs/i386_defconfig1
-rw-r--r--arch/x86/configs/x86_64_defconfig1
-rw-r--r--arch/x86/kernel/Makefile2
-rw-r--r--arch/x86/kernel/cpu/mcheck/therm_throt.c2
-rw-r--r--arch/x86/kernel/entry_32.S15
-rw-r--r--arch/x86/kernel/entry_64.S18
-rw-r--r--arch/x86/kernel/geode_32.c5
-rw-r--r--arch/x86/kernel/head_32.S151
-rw-r--r--arch/x86/kernel/mfgpt_32.c123
-rw-r--r--arch/x86/kernel/setup_32.c4
-rw-r--r--arch/x86/kernel/topology.c2
-rw-r--r--arch/x86/mm/init_32.c74
-rw-r--r--arch/x86/mm/init_64.c8
-rw-r--r--arch/x86/mm/ioremap.c55
-rw-r--r--arch/x86/mm/pageattr.c140
-rw-r--r--arch/x86/power/Makefile4
-rw-r--r--arch/x86/power/cpu_32.c (renamed from arch/x86/power/cpu.c)2
-rw-r--r--arch/x86/power/cpu_64.c (renamed from arch/x86/kernel/suspend_64.c)160
-rw-r--r--arch/x86/power/hibernate_32.c (renamed from arch/x86/power/suspend.c)6
-rw-r--r--arch/x86/power/hibernate_64.c169
-rw-r--r--arch/x86/power/hibernate_asm_32.S (renamed from arch/x86/power/swsusp.S)3
-rw-r--r--arch/x86/power/hibernate_asm_64.S (renamed from arch/x86/kernel/suspend_asm_64.S)9
-rw-r--r--arch/x86/xen/mmu.c6
-rw-r--r--arch/x86/xen/time.c10
27 files changed, 609 insertions, 395 deletions
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index fa555148823..864affc9a7b 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -34,13 +34,9 @@ config DEBUG_STACK_USAGE
This option will slow down process creation somewhat.
-comment "Page alloc debug is incompatible with Software Suspend on i386"
- depends on DEBUG_KERNEL && HIBERNATION
- depends on X86_32
-
config DEBUG_PAGEALLOC
bool "Debug page memory allocations"
- depends on DEBUG_KERNEL && X86_32
+ depends on DEBUG_KERNEL
help
Unmap pages from the kernel linear mapping after free_pages().
This results in a large slowdown, but helps to find certain types
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 364865b1b08..204af43535c 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -191,8 +191,10 @@ drivers-$(CONFIG_PCI) += arch/x86/pci/
# must be linked after kernel/
drivers-$(CONFIG_OPROFILE) += arch/x86/oprofile/
-ifeq ($(CONFIG_X86_32),y)
+# suspend and hibernation support
drivers-$(CONFIG_PM) += arch/x86/power/
+
+ifeq ($(CONFIG_X86_32),y)
drivers-$(CONFIG_FB) += arch/x86/video/
endif
diff --git a/arch/x86/boot/printf.c b/arch/x86/boot/printf.c
index 1a09f9309d3..7e7e890699b 100644
--- a/arch/x86/boot/printf.c
+++ b/arch/x86/boot/printf.c
@@ -33,8 +33,8 @@ static int skip_atoi(const char **s)
#define PLUS 4 /* show plus */
#define SPACE 8 /* space if plus */
#define LEFT 16 /* left justified */
-#define SPECIAL 32 /* 0x */
-#define LARGE 64 /* use 'ABCDEF' instead of 'abcdef' */
+#define SMALL 32 /* Must be 32 == 0x20 */
+#define SPECIAL 64 /* 0x */
#define do_div(n,base) ({ \
int __res; \
@@ -45,12 +45,16 @@ __res; })
static char *number(char *str, long num, int base, int size, int precision,
int type)
{
- char c, sign, tmp[66];
- const char *digits = "0123456789abcdefghijklmnopqrstuvwxyz";
+ /* we are called with base 8, 10 or 16, only, thus don't need "G..." */
+ static const char digits[16] = "0123456789ABCDEF"; /* "GHIJKLMNOPQRSTUVWXYZ"; */
+
+ char tmp[66];
+ char c, sign, locase;
int i;
- if (type & LARGE)
- digits = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ";
+ /* locase = 0 or 0x20. ORing digits or letters with 'locase'
+ * produces same digits or (maybe lowercased) letters */
+ locase = (type & SMALL);
if (type & LEFT)
type &= ~ZEROPAD;
if (base < 2 || base > 36)
@@ -81,7 +85,7 @@ static char *number(char *str, long num, int base, int size, int precision,
tmp[i++] = '0';
else
while (num != 0)
- tmp[i++] = digits[do_div(num, base)];
+ tmp[i++] = (digits[do_div(num, base)] | locase);
if (i > precision)
precision = i;
size -= precision;
@@ -95,7 +99,7 @@ static char *number(char *str, long num, int base, int size, int precision,
*str++ = '0';
else if (base == 16) {
*str++ = '0';
- *str++ = digits[33];
+ *str++ = ('X' | locase);
}
}
if (!(type & LEFT))
@@ -244,9 +248,9 @@ int vsprintf(char *buf, const char *fmt, va_list args)
base = 8;
break;
- case 'X':
- flags |= LARGE;
case 'x':
+ flags |= SMALL;
+ case 'X':
base = 16;
break;
diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig
index 77562e7cdab..3df340b54e5 100644
--- a/arch/x86/configs/i386_defconfig
+++ b/arch/x86/configs/i386_defconfig
@@ -1421,7 +1421,6 @@ CONFIG_DEBUG_BUGVERBOSE=y
# CONFIG_DEBUG_VM is not set
# CONFIG_DEBUG_LIST is not set
# CONFIG_FRAME_POINTER is not set
-# CONFIG_FORCED_INLINING is not set
# CONFIG_RCU_TORTURE_TEST is not set
# CONFIG_LKDTM is not set
# CONFIG_FAULT_INJECTION is not set
diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig
index 9e2b0ef851d..eef98cb00c6 100644
--- a/arch/x86/configs/x86_64_defconfig
+++ b/arch/x86/configs/x86_64_defconfig
@@ -1346,7 +1346,6 @@ CONFIG_DEBUG_BUGVERBOSE=y
# CONFIG_DEBUG_VM is not set
# CONFIG_DEBUG_LIST is not set
# CONFIG_FRAME_POINTER is not set
-# CONFIG_FORCED_INLINING is not set
# CONFIG_RCU_TORTURE_TEST is not set
# CONFIG_LKDTM is not set
# CONFIG_FAULT_INJECTION is not set
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 21dc1a061bf..76ec0f8f138 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -84,8 +84,6 @@ ifeq ($(CONFIG_X86_64),y)
obj-y += genapic_64.o genapic_flat_64.o
obj-$(CONFIG_X86_PM_TIMER) += pmtimer_64.o
obj-$(CONFIG_AUDIT) += audit_64.o
- obj-$(CONFIG_PM) += suspend_64.o
- obj-$(CONFIG_HIBERNATION) += suspend_asm_64.o
obj-$(CONFIG_GART_IOMMU) += pci-gart_64.o aperture_64.o
obj-$(CONFIG_CALGARY_IOMMU) += pci-calgary_64.o tce_64.o
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index 24885be5c48..9b7e01daa1c 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -118,7 +118,7 @@ static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev)
static __cpuinit void thermal_throttle_remove_dev(struct sys_device *sys_dev)
{
- return sysfs_remove_group(&sys_dev->kobj, &thermal_throttle_attr_group);
+ sysfs_remove_group(&sys_dev->kobj, &thermal_throttle_attr_group);
}
/* Mutex protecting device creation against CPU hotplug */
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index be5c31d0488..824e21b80aa 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -409,7 +409,8 @@ restore_nocheck_notrace:
RESTORE_REGS
addl $4, %esp # skip orig_eax/error_code
CFI_ADJUST_CFA_OFFSET -4
-1: INTERRUPT_RETURN
+ENTRY(irq_return)
+ INTERRUPT_RETURN
.section .fixup,"ax"
iret_exc:
pushl $0 # no error code
@@ -418,7 +419,7 @@ iret_exc:
.previous
.section __ex_table,"a"
.align 4
- .long 1b,iret_exc
+ .long irq_return,iret_exc
.previous
CFI_RESTORE_STATE
@@ -865,20 +866,16 @@ nmi_espfix_stack:
RESTORE_REGS
lss 12+4(%esp), %esp # back to espfix stack
CFI_ADJUST_CFA_OFFSET -24
-1: INTERRUPT_RETURN
+ jmp irq_return
CFI_ENDPROC
-.section __ex_table,"a"
- .align 4
- .long 1b,iret_exc
-.previous
KPROBE_END(nmi)
#ifdef CONFIG_PARAVIRT
ENTRY(native_iret)
-1: iret
+ iret
.section __ex_table,"a"
.align 4
- .long 1b,iret_exc
+ .long native_iret, iret_exc
.previous
END(native_iret)
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index c7341e81941..6be39a387c5 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -581,16 +581,24 @@ retint_restore_args: /* return to kernel space */
*/
TRACE_IRQS_IRETQ
restore_args:
- RESTORE_ARGS 0,8,0
-#ifdef CONFIG_PARAVIRT
+ RESTORE_ARGS 0,8,0
+
+ENTRY(irq_return)
INTERRUPT_RETURN
-#endif
+
+ .section __ex_table, "a"
+ .quad irq_return, bad_iret
+ .previous
+
+#ifdef CONFIG_PARAVIRT
ENTRY(native_iret)
iretq
.section __ex_table,"a"
.quad native_iret, bad_iret
.previous
+#endif
+
.section .fixup,"ax"
bad_iret:
/*
@@ -804,7 +812,7 @@ paranoid_swapgs\trace:
SWAPGS_UNSAFE_STACK
paranoid_restore\trace:
RESTORE_ALL 8
- INTERRUPT_RETURN
+ jmp irq_return
paranoid_userspace\trace:
GET_THREAD_INFO(%rcx)
movl threadinfo_flags(%rcx),%ebx
@@ -919,7 +927,7 @@ error_kernelspace:
iret run with kernel gs again, so don't set the user space flag.
B stepping K8s sometimes report an truncated RIP for IRET
exceptions returning to compat mode. Check for these here too. */
- leaq native_iret(%rip),%rbp
+ leaq irq_return(%rip),%rbp
cmpq %rbp,RIP(%rsp)
je error_swapgs
movl %ebp,%ebp /* zero extend */
diff --git a/arch/x86/kernel/geode_32.c b/arch/x86/kernel/geode_32.c
index 9c7f7d39596..9dad6ca6cd7 100644
--- a/arch/x86/kernel/geode_32.c
+++ b/arch/x86/kernel/geode_32.c
@@ -163,14 +163,11 @@ EXPORT_SYMBOL_GPL(geode_gpio_setup_event);
static int __init geode_southbridge_init(void)
{
- int timers;
-
if (!is_geode())
return -ENODEV;
init_lbars();
- timers = geode_mfgpt_detect();
- printk(KERN_INFO "geode: %d MFGPT timers available.\n", timers);
+ (void) mfgpt_timer_setup();
return 0;
}
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index 5d8c5730686..74ef4a41f22 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -19,6 +19,10 @@
#include <asm/thread_info.h>
#include <asm/asm-offsets.h>
#include <asm/setup.h>
+#include <asm/processor-flags.h>
+
+/* Physical address */
+#define pa(X) ((X) - __PAGE_OFFSET)
/*
* References to members of the new_cpu_data structure.
@@ -80,10 +84,6 @@ INIT_MAP_BEYOND_END = BOOTBITMAP_SIZE + (PAGE_TABLE_SIZE + ALLOCATOR_SLOP)*PAGE_
*/
.section .text.head,"ax",@progbits
ENTRY(startup_32)
- /* check to see if KEEP_SEGMENTS flag is meaningful */
- cmpw $0x207, BP_version(%esi)
- jb 1f
-
/* test KEEP_SEGMENTS flag to see if the bootloader is asking
us to not reload segments */
testb $(1<<6), BP_loadflags(%esi)
@@ -92,7 +92,7 @@ ENTRY(startup_32)
/*
* Set segments to known values.
*/
-1: lgdt boot_gdt_descr - __PAGE_OFFSET
+ lgdt pa(boot_gdt_descr)
movl $(__BOOT_DS),%eax
movl %eax,%ds
movl %eax,%es
@@ -105,8 +105,8 @@ ENTRY(startup_32)
*/
cld
xorl %eax,%eax
- movl $__bss_start - __PAGE_OFFSET,%edi
- movl $__bss_stop - __PAGE_OFFSET,%ecx
+ movl $pa(__bss_start),%edi
+ movl $pa(__bss_stop),%ecx
subl %edi,%ecx
shrl $2,%ecx
rep ; stosl
@@ -118,31 +118,32 @@ ENTRY(startup_32)
* (kexec on panic case). Hence copy out the parameters before initializing
* page tables.
*/
- movl $(boot_params - __PAGE_OFFSET),%edi
+ movl $pa(boot_params),%edi
movl $(PARAM_SIZE/4),%ecx
cld
rep
movsl
- movl boot_params - __PAGE_OFFSET + NEW_CL_POINTER,%esi
+ movl pa(boot_params) + NEW_CL_POINTER,%esi
andl %esi,%esi
jz 1f # No comand line
- movl $(boot_command_line - __PAGE_OFFSET),%edi
+ movl $pa(boot_command_line),%edi
movl $(COMMAND_LINE_SIZE/4),%ecx
rep
movsl
1:
#ifdef CONFIG_PARAVIRT
- cmpw $0x207, (boot_params + BP_version - __PAGE_OFFSET)
+ /* This is can only trip for a broken bootloader... */
+ cmpw $0x207, pa(boot_params + BP_version)
jb default_entry
/* Paravirt-compatible boot parameters. Look to see what architecture
we're booting under. */
- movl (boot_params + BP_hardware_subarch - __PAGE_OFFSET), %eax
+ movl pa(boot_params + BP_hardware_subarch), %eax
cmpl $num_subarch_entries, %eax
jae bad_subarch
- movl subarch_entries - __PAGE_OFFSET(,%eax,4), %eax
+ movl pa(subarch_entries)(,%eax,4), %eax
subl $__PAGE_OFFSET, %eax
jmp *%eax
@@ -170,17 +171,68 @@ num_subarch_entries = (. - subarch_entries) / 4
* Mappings are created both at virtual address 0 (identity mapping)
* and PAGE_OFFSET for up to _end+sizeof(page tables)+INIT_MAP_BEYOND_END.
*
- * Warning: don't use %esi or the stack in this code. However, %esp
- * can be used as a GPR if you really need it...
+ * Note that the stack is not yet set up!
*/
-page_pde_offset = (__PAGE_OFFSET >> 20);
+#define PTE_ATTR 0x007 /* PRESENT+RW+USER */
+#define PDE_ATTR 0x067 /* PRESENT+RW+USER+DIRTY+ACCESSED */
+#define PGD_ATTR 0x001 /* PRESENT (no other attributes) */
default_entry:
- movl $(pg0 - __PAGE_OFFSET), %edi
- movl $(swapper_pg_dir - __PAGE_OFFSET), %edx
- movl $0x007, %eax /* 0x007 = PRESENT+RW+USER */
+#ifdef CONFIG_X86_PAE
+
+ /*
+ * In PAE mode swapper_pg_dir is statically defined to contain enough
+ * entries to cover the VMSPLIT option (that is the top 1, 2 or 3
+ * entries). The identity mapping is handled by pointing two PGD
+ * entries to the first kernel PMD.
+ *
+ * Note the upper half of each PMD or PTE are always zero at
+ * this stage.
+ */
+
+#define KPMDS ((0x100000000-__PAGE_OFFSET) >> 30) /* Number of kernel PMDs */
+
+ xorl %ebx,%ebx /* %ebx is kept at zero */
+
+ movl $pa(pg0), %edi
+ movl $pa(swapper_pg_pmd), %edx
+ movl $PTE_ATTR, %eax
+10:
+ leal PDE_ATTR(%edi),%ecx /* Create PMD entry */
+ movl %ecx,(%edx) /* Store PMD entry */
+ /* Upper half already zero */
+ addl $8,%edx
+ movl $512,%ecx
+11:
+ stosl
+ xchgl %eax,%ebx
+ stosl
+ xchgl %eax,%ebx
+ addl $0x1000,%eax
+ loop 11b
+
+ /*
+ * End condition: we must map up to and including INIT_MAP_BEYOND_END
+ * bytes beyond the end of our own page tables.
+ */
+ leal (INIT_MAP_BEYOND_END+PTE_ATTR)(%edi),%ebp
+ cmpl %ebp,%eax
+ jb 10b
+1:
+ movl %edi,pa(init_pg_tables_end)
+
+ /* Do early initialization of the fixmap area */
+ movl $pa(swapper_pg_fixmap)+PDE_ATTR,%eax
+ movl %eax,pa(swapper_pg_pmd+0x1000*KPMDS-8)
+#else /* Not PAE */
+
+page_pde_offset = (__PAGE_OFFSET >> 20);
+
+ movl $pa(pg0), %edi
+ movl $pa(swapper_pg_dir), %edx
+ movl $PTE_ATTR, %eax
10:
- leal 0x007(%edi),%ecx /* Create PDE entry */
+ leal PDE_ATTR(%edi),%ecx /* Create PDE entry */
movl %ecx,(%edx) /* Store identity PDE entry */
movl %ecx,page_pde_offset(%edx) /* Store kernel PDE entry */
addl $4,%edx
@@ -189,19 +241,20 @@ default_entry:
stosl
addl $0x1000,%eax
loop 11b
- /* End condition: we must map up to and including INIT_MAP_BEYOND_END */
- /* bytes beyond the end of our own page tables; the +0x007 is the attribute bits */
- leal (INIT_MAP_BEYOND_END+0x007)(%edi),%ebp
+ /*
+ * End condition: we must map up to and including INIT_MAP_BEYOND_END
+ * bytes beyond the end of our own page tables; the +0x007 is
+ * the attribute bits
+ */
+ leal (INIT_MAP_BEYOND_END+PTE_ATTR)(%edi),%ebp
cmpl %ebp,%eax
jb 10b
- movl %edi,(init_pg_tables_end - __PAGE_OFFSET)
-
- /* Do an early initialization of the fixmap area */
- movl $(swapper_pg_dir - __PAGE_OFFSET), %edx
- movl $(swapper_pg_pmd - __PAGE_OFFSET), %eax
- addl $0x67, %eax /* 0x67 == _PAGE_TABLE */
- movl %eax, 4092(%edx)
+ movl %edi,pa(init_pg_tables_end)
+ /* Do early initialization of the fixmap area */
+ movl $pa(swapper_pg_fixmap)+PDE_ATTR,%eax
+ movl %eax,pa(swapper_pg_dir+0xffc)
+#endif
jmp 3f
/*
* Non-boot CPU entry point; entered from trampoline.S
@@ -241,7 +294,7 @@ ENTRY(startup_32_smp)
* NOTE! We have to correct for the fact that we're
* not yet offset PAGE_OFFSET..
*/
-#define cr4_bits mmu_cr4_features-__PAGE_OFFSET
+#define cr4_bits pa(mmu_cr4_features)
movl cr4_bits,%edx
andl %edx,%edx
jz 6f
@@ -276,10 +329,10 @@ ENTRY(startup_32_smp)
/*
* Enable paging
*/
- movl $swapper_pg_dir-__PAGE_OFFSET,%eax
+ movl $pa(swapper_pg_dir),%eax
movl %eax,%cr3 /* set the page table pointer.. */
movl %cr0,%eax
- orl $0x80000000,%eax
+ orl $X86_CR0_PG,%eax
movl %eax,%cr0 /* ..and set paging (PG) bit */
ljmp $__BOOT_CS,$1f /* Clear prefetch and normalize %eip */
1:
@@ -552,16 +605,44 @@ ENTRY(_stext)
*/
.section ".bss.page_aligned","wa"
.align PAGE_SIZE_asm
+#ifdef CONFIG_X86_PAE
+ENTRY(swapper_pg_pmd)
+ .fill 1024*KPMDS,4,0
+#else
ENTRY(swapper_pg_dir)
.fill 1024,4,0
-ENTRY(swapper_pg_pmd)
+#endif
+ENTRY(swapper_pg_fixmap)
.fill 1024,4,0
ENTRY(empty_zero_page)
.fill 4096,1,0
-
/*
* This starts the data section.
*/
+#ifdef CONFIG_X86_PAE
+.section ".data.page_aligned","wa"
+ /* Page-aligned for the benefit of paravirt? */
+ .align PAGE_SIZE_asm
+ENTRY(swapper_pg_dir)
+ .long pa(swapper_pg_pmd+PGD_ATTR),0 /* low identity map */
+# if KPMDS == 3
+ .long pa(swapper_pg_pmd+PGD_ATTR),0
+ .long pa(swapper_pg_pmd+PGD_ATTR+0x1000),0
+ .long pa(swapper_pg_pmd+PGD_ATTR+0x2000),0
+# elif KPMDS == 2
+ .long 0,0
+ .long pa(swapper_pg_pmd+PGD_ATTR),0
+ .long pa(swapper_pg_pmd+PGD_ATTR+0x1000),0
+# elif KPMDS == 1
+ .long 0,0
+ .long 0,0
+ .long pa(swapper_pg_pmd+PGD_ATTR),0
+# else
+# error "Kernel PMDs should be 1, 2 or 3"
+# endif
+ .align PAGE_SIZE_asm /* needs to be page-sized too */
+#endif
+
.data
ENTRY(stack_start)
.long init_thread_union+THREAD_SIZE
diff --git a/arch/x86/kernel/mfgpt_32.c b/arch/x86/kernel/mfgpt_32.c
index 219f86eb612..027fc067b39 100644
--- a/arch/x86/kernel/mfgpt_32.c
+++ b/arch/x86/kernel/mfgpt_32.c
@@ -12,48 +12,37 @@
*/
/*
- * We are using the 32Khz input clock - its the only one that has the
+ * We are using the 32.768kHz input clock - it's the only one that has the
* ranges we find desirable. The following table lists the suitable
- * divisors and the associated hz, minimum interval
- * and the maximum interval:
+ * divisors and the associated Hz, minimum interval and the maximum interval:
*
- * Divisor Hz Min Delta (S) Max Delta (S)
- * 1 32000 .0005 2.048
- * 2 16000 .001 4.096
- * 4 8000 .002 8.192
- * 8 4000 .004 16.384
- * 16 2000 .008 32.768
- * 32 1000 .016 65.536
- * 64 500 .032 131.072
- * 128 250 .064 262.144
- * 256 125 .128 524.288
+ * Divisor Hz Min Delta (s) Max Delta (s)
+ * 1 32768 .00048828125 2.000
+ * 2 16384 .0009765625 4.000
+ * 4 8192 .001953125 8.000
+ * 8 4096 .00390625 16.000
+ * 16 2048 .0078125 32.000
+ * 32 1024 .015625 64.000
+ * 64 512 .03125 128.000
+ * 128 256 .0625 256.000
+ * 256 128 .125 512.000
*/
#include <linux/kernel.h>
#include <linux/interrupt.h>
-#include <linux/module.h>
#include <asm/geode.h>
-#define F_AVAIL 0x01
-
static struct mfgpt_timer_t {
- int flags;
- struct module *owner;
+ unsigned int avail:1;
} mfgpt_timers[MFGPT_MAX_TIMERS];
/* Selected from the table above */
#define MFGPT_DIVISOR 16
#define MFGPT_SCALE 4 /* divisor = 2^(scale) */
-#define MFGPT_HZ (32000 / MFGPT_DIVISOR)
+#define MFGPT_HZ (32768 / MFGPT_DIVISOR)
#define MFGPT_PERIODIC (MFGPT_HZ / HZ)
-#ifdef CONFIG_GEODE_MFGPT_TIMER
-static int __init mfgpt_timer_setup(void);
-#else
-#define mfgpt_timer_setup() (0)
-#endif
-
/* Allow for disabling of MFGPTs */
static int disable;
static int __init mfgpt_disable(char *s)
@@ -85,28 +74,37 @@ __setup("mfgptfix", mfgpt_fix);
* In other cases (such as with VSAless OpenFirmware), the system firmware
* leaves timers available for us to use.
*/
-int __init geode_mfgpt_detect(void)
+
+
+static int timers = -1;
+
+static void geode_mfgpt_detect(void)
{
- int count = 0, i;
+ int i;
u16 val;
+ timers = 0;
+
if (disable) {
- printk(KERN_INFO "geode-mfgpt: Skipping MFGPT setup\n");
- return 0;
+ printk(KERN_INFO "geode-mfgpt: MFGPT support is disabled\n");
+ goto done;
+ }
+
+ if (!geode_get_dev_base(GEODE_DEV_MFGPT)) {
+ printk(KERN_INFO "geode-mfgpt: MFGPT LBAR is not set up\n");
+ goto done;
}
for (i = 0; i < MFGPT_MAX_TIMERS; i++) {
val = geode_mfgpt_read(i, MFGPT_REG_SETUP);
if (!(val & MFGPT_SETUP_SETUP)) {
- mfgpt_timers[i].flags = F_AVAIL;
- count++;
+ mfgpt_timers[i].avail = 1;
+ timers++;
}
}
- /* set up clock event device, if desired */
- i = mfgpt_timer_setup();
-
- return count;
+done:
+ printk(KERN_INFO "geode-mfgpt: %d MFGPT timers available.\n", timers);
}
int geode_mfgpt_toggle_event(int timer, int cmp, int event, int enable)
@@ -183,36 +181,41 @@ int geode_mfgpt_set_irq(int timer, int cmp, int irq, int enable)
return 0;
}
-static int mfgpt_get(int timer, struct module *owner)
+static int mfgpt_get(int timer)
{
- mfgpt_timers[timer].flags &= ~F_AVAIL;
- mfgpt_timers[timer].owner = owner;
+ mfgpt_timers[timer].avail = 0;
printk(KERN_INFO "geode-mfgpt: Registered timer %d\n", timer);
return timer;
}
-int geode_mfgpt_alloc_timer(int timer, int domain, struct module *owner)
+int geode_mfgpt_alloc_timer(int timer, int domain)
{
int i;
- if (!geode_get_dev_base(GEODE_DEV_MFGPT))
- return -ENODEV;
+ if (timers == -1) {
+ /* timers haven't been detected yet */
+ geode_mfgpt_detect();
+ }
+
+ if (!timers)
+ return -1;
+
if (timer >= MFGPT_MAX_TIMERS)
- return -EIO;
+ return -1;
if (timer < 0) {
/* Try to find an available timer */
for (i = 0; i < MFGPT_MAX_TIMERS; i++) {
- if (mfgpt_timers[i].flags & F_AVAIL)
- return mfgpt_get(i, owner);
+ if (mfgpt_timers[i].avail)
+ return mfgpt_get(i);
if (i == 5 && domain == MFGPT_DOMAIN_WORKING)
break;
}
} else {
/* If they requested a specific timer, try to honor that */
- if (mfgpt_timers[timer].flags & F_AVAIL)
- return mfgpt_get(timer, owner);
+ if (mfgpt_timers[timer].avail)
+ return mfgpt_get(timer);
}
/* No timers available - too bad */
@@ -244,10 +247,11 @@ static int __init mfgpt_setup(char *str)
}
__setup("mfgpt_irq=", mfgpt_setup);
-static inline void mfgpt_disable_timer(u16 clock)
+static void mfgpt_disable_timer(u16 clock)
{
- u16 val = geode_mfgpt_read(clock, MFGPT_REG_SETUP);
- geode_mfgpt_write(clock, MFGPT_REG_SETUP, val & ~MFGPT_SETUP_CNTEN);
+ /* avoid races by clearing CMP1 and CMP2 unconditionally */
+ geode_mfgpt_write(clock, MFGPT_REG_SETUP, (u16) ~MFGPT_SETUP_CNTEN |
+ MFGPT_SETUP_CMP1 | MFGPT_SETUP_CMP2);
}
static int mfgpt_next_event(unsigned long, struct clock_event_device *);
@@ -263,7 +267,7 @@ static struct clock_event_device mfgpt_clockevent = {
.shift = 32
};
-static inline void mfgpt_start_timer(u16 clock, u16 delta)
+static void mfgpt_start_timer(u16 delta)
{
geode_mfgpt_write(mfgpt_event_clock, MFGPT_REG_CMP2, (u16) delta);
geode_mfgpt_write(mfgpt_event_clock, MFGPT_REG_COUNTER, 0);
@@ -278,21 +282,25 @@ static void mfgpt_set_mode(enum clock_event_mode mode,
mfgpt_disable_timer(mfgpt_event_clock);
if (mode == CLOCK_EVT_MODE_PERIODIC)
- mfgpt_start_timer(mfgpt_event_clock, MFGPT_PERIODIC);
+ mfgpt_start_timer(MFGPT_PERIODIC);
mfgpt_tick_mode = mode;
}
static int mfgpt_next_event(unsigned long delta, struct clock_event_device *evt)
{
- mfgpt_start_timer(mfgpt_event_clock, delta);
+ mfgpt_start_timer(delta);
return 0;
}
-/* Assume (foolishly?), that this interrupt was due to our tick */
-
static irqreturn_t mfgpt_tick(int irq, void *dev_id)
{
+ u16 val = geode_mfgpt_read(mfgpt_event_clock, MFGPT_REG_SETUP);
+
+ /* See if the interrupt was for us */
+ if (!(val & (MFGPT_SETUP_SETUP | MFGPT_SETUP_CMP2 | MFGPT_SETUP_CMP1)))
+ return IRQ_NONE;
+
/* Turn off the clock (and clear the event) */
mfgpt_disable_timer(mfgpt_event_clock);
@@ -320,13 +328,12 @@ static struct irqaction mfgptirq = {
.name = "mfgpt-timer"
};
-static int __init mfgpt_timer_setup(void)
+int __init mfgpt_timer_setup(void)
{
int timer, ret;
u16 val;
- timer = geode_mfgpt_alloc_timer(MFGPT_TIMER_ANY, MFGPT_DOMAIN_WORKING,
- THIS_MODULE);
+ timer = geode_mfgpt_alloc_timer(MFGPT_TIMER_ANY, MFGPT_DOMAIN_WORKING);
if (timer < 0) {
printk(KERN_ERR
"mfgpt-timer: Could not allocate a MFPGT timer\n");
@@ -363,7 +370,7 @@ static int __init mfgpt_timer_setup(void)
&mfgpt_clockevent);
printk(KERN_INFO
- "mfgpt-timer: registering the MFGT timer as a clock event.\n");
+ "mfgpt-timer: registering the MFGPT timer as a clock event.\n");
clockevents_register_device(&mfgpt_clockevent);
return 0;
diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c
index d1d8c347cc0..691ab4cb167 100644
--- a/arch/x86/kernel/setup_32.c
+++ b/arch/x86/kernel/setup_32.c
@@ -154,7 +154,11 @@ struct cpuinfo_x86 new_cpu_data __cpuinitdata = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
struct cpuinfo_x86 boot_cpu_data __read_mostly = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
EXPORT_SYMBOL(boot_cpu_data);
+#ifndef CONFIG_X86_PAE
unsigned long mmu_cr4_features;
+#else
+unsigned long mmu_cr4_features = X86_CR4_PAE;
+#endif
/* for MCA, but anyone else can use it if they want */
unsigned int machine_id;
diff --git a/arch/x86/kernel/topology.c b/arch/x86/kernel/topology.c
index e6757aaa202..a40051b71d9 100644
--- a/arch/x86/kernel/topology.c
+++ b/arch/x86/kernel/topology.c
@@ -53,7 +53,7 @@ EXPORT_SYMBOL(arch_register_cpu);
void arch_unregister_cpu(int num)
{
- return unregister_cpu(&per_cpu(cpu_devices, num).cpu);
+ unregister_cpu(&per_cpu(cpu_devices, num).cpu);
}
EXPORT_SYMBOL(arch_unregister_cpu);
#else
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index d1bc04006d1..8106bba41ec 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -46,6 +46,7 @@
#include <asm/pgalloc.h>
#include <asm/sections.h>
#include <asm/paravirt.h>
+#include <asm/setup.h>
unsigned int __VMALLOC_RESERVE = 128 << 20;
@@ -328,44 +329,38 @@ pteval_t __PAGE_KERNEL_EXEC = _PAGE_KERNEL_EXEC;
void __init native_pagetable_setup_start(pgd_t *base)
{
-#ifdef CONFIG_X86_PAE
- int i;
+ unsigned long pfn, va;
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
/*
- * Init entries of the first-level page table to the
- * zero page, if they haven't already been set up.
- *
- * In a normal native boot, we'll be running on a
- * pagetable rooted in swapper_pg_dir, but not in PAE
- * mode, so this will end up clobbering the mappings
- * for the lower 24Mbytes of the address space,
- * without affecting the kernel address space.
+ * Remove any mappings which extend past the end of physical
+ * memory from the boot time page table:
*/
- for (i = 0; i < USER_PTRS_PER_PGD; i++)
- set_pgd(&base[i],
- __pgd(__pa(empty_zero_page) | _PAGE_PRESENT));
-
- /* Make sure kernel address space is empty so that a pagetable
- will be allocated for it. */
- memset(&base[USER_PTRS_PER_PGD], 0,
- KERNEL_PGD_PTRS * sizeof(pgd_t));
-#else
+ for (pfn = max_low_pfn + 1; pfn < 1<<(32-PAGE_SHIFT); pfn++) {
+ va = PAGE_OFFSET + (pfn<<PAGE_SHIFT);
+ pgd = base + pgd_index(va);
+ if (!pgd_present(*pgd))
+ break;
+
+ pud = pud_offset(pgd, va);
+ pmd = pmd_offset(pud, va);
+ if (!pmd_present(*pmd))
+ break;
+
+ pte = pte_offset_kernel(pmd, va);
+ if (!pte_present(*pte))
+ break;
+
+ pte_clear(NULL, va, pte);
+ }
paravirt_alloc_pd(&init_mm, __pa(base) >> PAGE_SHIFT);
-#endif
}
void __init native_pagetable_setup_done(pgd_t *base)
{
-#ifdef CONFIG_X86_PAE
- /*
- * Add low memory identity-mappings - SMP needs it when
- * starting up on an AP from real-mode. In the non-PAE
- * case we already have these mappings through head.S.
- * All user-space mappings are explicitly cleared after
- * SMP startup.
- */
- set_pgd(&base[0], base[USER_PTRS_PER_PGD]);
-#endif
}
/*
@@ -374,9 +369,8 @@ void __init native_pagetable_setup_done(pgd_t *base)
* the boot process.
*
* If we're booting on native hardware, this will be a pagetable
- * constructed in arch/i386/kernel/head.S, and not running in PAE mode
- * (even if we'll end up running in PAE). The root of the pagetable
- * will be swapper_pg_dir.
+ * constructed in arch/x86/kernel/head_32.S. The root of the
+ * pagetable will be swapper_pg_dir.
*
* If we're booting paravirtualized under a hypervisor, then there are
* more options: we may already be running PAE, and the pagetable may
@@ -537,14 +531,6 @@ void __init paging_init(void)
load_cr3(swapper_pg_dir);
-#ifdef CONFIG_X86_PAE
- /*
- * We will bail out later - printk doesn't work right now so
- * the user would just see a hanging kernel.
- */
- if (cpu_has_pae)
- set_in_cr4(X86_CR4_PAE);
-#endif
__flush_tlb_all();
kmap_init();
@@ -675,13 +661,11 @@ void __init mem_init(void)
BUG_ON((unsigned long)high_memory > VMALLOC_START);
#endif /* double-sanity-check paranoia */
-#ifdef CONFIG_X86_PAE
- if (!cpu_has_pae)
- panic("cannot execute a PAE-enabled kernel on a PAE-less CPU!");
-#endif
if (boot_cpu_data.wp_works_ok < 0)
test_wp_bit();
+ cpa_init();
+
/*
* Subtle. SMP is doing it's boot stuff late (because it has to
* fork idle threads) - but it also needs low mappings for the
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 5fe880fc305..b59fc238151 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -528,13 +528,15 @@ void __init mem_init(void)
reservedpages << (PAGE_SHIFT-10),
datasize >> 10,
initsize >> 10);
+
+ cpa_init();
}
void free_init_pages(char *what, unsigned long begin, unsigned long end)
{
- unsigned long addr;
+ unsigned long addr = begin;
- if (begin >= end)
+ if (addr >= end)
return;
/*
@@ -549,7 +551,7 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end)
#else
printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);
- for (addr = begin; addr < end; addr += PAGE_SIZE) {
+ for (; addr < end; addr += PAGE_SIZE) {
ClearPageReserved(virt_to_page(addr));
init_page_count(virt_to_page(addr));
memset((void *)(addr & ~(PAGE_SIZE-1)),
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index ee6648fe6b1..a4897a85268 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -260,41 +260,46 @@ static int __init early_ioremap_debug_setup(char *str)
early_param("early_ioremap_debug", early_ioremap_debug_setup);
static __initdata int after_paging_init;
-static __initdata unsigned long bm_pte[1024]
+static __initdata pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)]
__attribute__((aligned(PAGE_SIZE)));
-static inline unsigned long * __init early_ioremap_pgd(unsigned long addr)
+static inline pmd_t * __init early_ioremap_pmd(unsigned long addr)
{
- return (unsigned long *)swapper_pg_dir + ((addr >> 22) & 1023);
+ pgd_t *pgd = &swapper_pg_dir[pgd_index(addr)];
+ pud_t *pud = pud_offset(pgd, addr);
+ pmd_t *pmd = pmd_offset(pud, addr);
+
+ return pmd;
}
-static inline unsigned long * __init early_ioremap_pte(unsigned long addr)
+static inline pte_t * __init early_ioremap_pte(unsigned long addr)
{
- return bm_pte + ((addr >> PAGE_SHIFT) & 1023);
+ return &bm_pte[pte_index(addr)];
}
void __init early_ioremap_init(void)
{
- unsigned long *pgd;
+ pmd_t *pmd;
if (early_ioremap_debug)
printk(KERN_INFO "early_ioremap_init()\n");
- pgd = early_ioremap_pgd(fix_to_virt(FIX_BTMAP_BEGIN));
- *pgd = __pa(bm_pte) | _PAGE_TABLE;
+ pmd = early_ioremap_pmd(fix_to_virt(FIX_BTMAP_BEGIN));
memset(bm_pte, 0, sizeof(bm_pte));
+ pmd_populate_kernel(&init_mm, pmd, bm_pte);
+
/*
- * The boot-ioremap range spans multiple pgds, for which
+ * The boot-ioremap range spans multiple pmds, for which
* we are not prepared:
*/
- if (pgd != early_ioremap_pgd(fix_to_virt(FIX_BTMAP_END))) {
+ if (pmd != early_ioremap_pmd(fix_to_virt(FIX_BTMAP_END))) {
WARN_ON(1);
- printk(KERN_WARNING "pgd %p != %p\n",
- pgd, early_ioremap_pgd(fix_to_virt(FIX_BTMAP_END)));
+ printk(KERN_WARNING "pmd %p != %p\n",
+ pmd, early_ioremap_pmd(fix_to_virt(FIX_BTMAP_END)));
printk(KERN_WARNING "fix_to_virt(FIX_BTMAP_BEGIN): %08lx\n",
- fix_to_virt(FIX_BTMAP_BEGIN));
+ fix_to_virt(FIX_BTMAP_BEGIN));
printk(KERN_WARNING "fix_to_virt(FIX_BTMAP_END): %08lx\n",
- fix_to_virt(FIX_BTMAP_END));
+ fix_to_virt(FIX_BTMAP_END));
printk(KERN_WARNING "FIX_BTMAP_END: %d\n", FIX_BTMAP_END);
printk(KERN_WARNING "FIX_BTMAP_BEGIN: %d\n",
@@ -304,28 +309,29 @@ void __init early_ioremap_init(void)
void __init early_ioremap_clear(void)
{
- unsigned long *pgd;
+ pmd_t *pmd;
if (early_ioremap_debug)
printk(KERN_INFO "early_ioremap_clear()\n");
- pgd = early_ioremap_pgd(fix_to_virt(FIX_BTMAP_BEGIN));
- *pgd = 0;
- paravirt_release_pt(__pa(pgd) >> PAGE_SHIFT);
+ pmd = early_ioremap_pmd(fix_to_virt(FIX_BTMAP_BEGIN));
+ pmd_clear(pmd);
+ paravirt_release_pt(__pa(bm_pte) >> PAGE_SHIFT);
__flush_tlb_all();
}
void __init early_ioremap_reset(void)
{
enum fixed_addresses idx;
- unsigned long *pte, phys, addr;
+ unsigned long addr, phys;
+ pte_t *pte;
after_paging_init = 1;
for (idx = FIX_BTMAP_BEGIN; idx >= FIX_BTMAP_END; idx--) {
addr = fix_to_virt(idx);
pte = early_ioremap_pte(addr);
- if (*pte & _PAGE_PRESENT) {
- phys = *pte & PAGE_MASK;
+ if (pte_present(*pte)) {
+ phys = pte_val(*pte) & PAGE_MASK;
set_fixmap(idx, phys);
}
}
@@ -334,7 +340,8 @@ void __init early_ioremap_reset(void)
static void __init __early_set_fixmap(enum fixed_addresses idx,
unsigned long phys, pgprot_t flags)
{
- unsigned long *pte, addr = __fix_to_virt(idx);
+ unsigned long addr = __fix_to_virt(idx);
+ pte_t *pte;
if (idx >= __end_of_fixed_addresses) {
BUG();
@@ -342,9 +349,9 @@ static void __init __early_set_fixmap(enum fixed_addresses idx,
}
pte = early_ioremap_pte(addr);
if (pgprot_val(flags))
- *pte = (phys & PAGE_MASK) | pgprot_val(flags);
+ set_pte(pte, pfn_pte(phys >> PAGE_SHIFT, flags));
else
- *pte = 0;
+ pte_clear(NULL, addr, pte);
__flush_tlb_one(addr);
}
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 8493c855582..440210a2277 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -8,6 +8,7 @@
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/mm.h>
+#include <linux/interrupt.h>
#include <asm/e820.h>
#include <asm/processor.h>
@@ -191,7 +192,7 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address)
* or when the present bit is not set. Otherwise we would return a
* pointer to a nonexisting mapping.
*/
-pte_t *lookup_address(unsigned long address, int *level)
+pte_t *lookup_address(unsigned long address, unsigned int *level)
{
pgd_t *pgd = pgd_offset_k(address);
pud_t *pud;
@@ -252,10 +253,11 @@ static int
try_preserve_large_page(pte_t *kpte, unsigned long address,
struct cpa_data *cpa)
{
- unsigned long nextpage_addr, numpages, pmask, psize, flags;
+ unsigned long nextpage_addr, numpages, pmask, psize, flags, addr;
pte_t new_pte, old_pte, *tmp;
pgprot_t old_prot, new_prot;
- int level, do_split = 1;
+ int i, do_split = 1;
+ unsigned int level;
spin_lock_irqsave(&pgd_lock, flags);
/*
@@ -302,6 +304,19 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
new_prot = static_protections(new_prot, address);
/*
+ * We need to check the full range, whether
+ * static_protection() requires a different pgprot for one of
+ * the pages in the range we try to preserve:
+ */
+ addr = address + PAGE_SIZE;
+ for (i = 1; i < cpa->numpages; i++, addr += PAGE_SIZE) {
+ pgprot_t chk_prot = static_protections(new_prot, addr);
+
+ if (pgprot_val(chk_prot) != pgprot_val(new_prot))
+ goto out_unlock;
+ }
+
+ /*
* If there are no changes, return. maxpages has been updated
* above:
*/
@@ -335,23 +350,103 @@ out_unlock:
return do_split;
}
+static LIST_HEAD(page_pool);
+static unsigned long pool_size, pool_pages, pool_low;
+static unsigned long pool_used, pool_failed, pool_refill;
+
+static void cpa_fill_pool(void)
+{
+ struct page *p;
+ gfp_t gfp = GFP_KERNEL;
+
+ /* Do not allocate from interrupt context */
+ if (in_irq() || irqs_disabled())
+ return;
+ /*
+ * Check unlocked. I does not matter when we have one more
+ * page in the pool. The bit lock avoids recursive pool
+ * allocations:
+ */
+ if (pool_pages >= pool_size || test_and_set_bit_lock(0, &pool_refill))
+ return;
+
+#ifdef CONFIG_DEBUG_PAGEALLOC
+ /*
+ * We could do:
+ * gfp = in_atomic() ? GFP_ATOMIC : GFP_KERNEL;
+ * but this fails on !PREEMPT kernels
+ */
+ gfp = GFP_ATOMIC | __GFP_NORETRY | __GFP_NOWARN;
+#endif
+
+ while (pool_pages < pool_size) {
+ p = alloc_pages(gfp, 0);
+ if (!p) {
+ pool_failed++;
+ break;
+ }
+ spin_lock_irq(&pgd_lock);
+ list_add(&p->lru, &page_pool);
+ pool_pages++;
+ spin_unlock_irq(&pgd_lock);
+ }
+ clear_bit_unlock(0, &pool_refill);
+}
+
+#define SHIFT_MB (20 - PAGE_SHIFT)
+#define ROUND_MB_GB ((1 << 10) - 1)
+#define SHIFT_MB_GB 10
+#define POOL_PAGES_PER_GB 16
+
+void __init cpa_init(void)
+{
+ struct sysinfo si;
+ unsigned long gb;
+
+ si_meminfo(&si);
+ /*
+ * Calculate the number of pool pages:
+ *
+ * Convert totalram (nr of pages) to MiB and round to the next
+ * GiB. Shift MiB to Gib and multiply the result by
+ * POOL_PAGES_PER_GB:
+ */
+ gb = ((si.totalram >> SHIFT_MB) + ROUND_MB_GB) >> SHIFT_MB_GB;
+ pool_size = POOL_PAGES_PER_GB * gb;
+ pool_low = pool_size;
+
+ cpa_fill_pool();
+ printk(KERN_DEBUG
+ "CPA: page pool initialized %lu of %lu pages preallocated\n",
+ pool_pages, pool_size);
+}
+
static int split_large_page(pte_t *kpte, unsigned long address)
{
unsigned long flags, pfn, pfninc = 1;
- gfp_t gfp_flags = GFP_KERNEL;
unsigned int i, level;
pte_t *pbase, *tmp;
pgprot_t ref_prot;
struct page *base;
-#ifdef CONFIG_DEBUG_PAGEALLOC
- gfp_flags = GFP_ATOMIC | __GFP_NOWARN;
-#endif
- base = alloc_pages(gfp_flags, 0);
- if (!base)
+ /*
+ * Get a page from the pool. The pool list is protected by the
+ * pgd_lock, which we have to take anyway for the split
+ * operation:
+ */
+ spin_lock_irqsave(&pgd_lock, flags);
+ if (list_empty(&page_pool)) {
+ spin_unlock_irqrestore(&pgd_lock, flags);
return -ENOMEM;
+ }
+
+ base = list_first_entry(&page_pool, struct page, lru);
+ list_del(&base->lru);
+ pool_pages--;
+
+ if (pool_pages < pool_low)
+ pool_low = pool_pages;
- spin_lock_irqsave(&pgd_lock, flags);
/*
* Check for races, another CPU might have split this page
* up for us already:
@@ -396,17 +491,24 @@ static int split_large_page(pte_t *kpte, unsigned long address)
base = NULL;
out_unlock:
+ /*
+ * If we dropped out via the lookup_address check under
+ * pgd_lock then stick the page back into the pool:
+ */
+ if (base) {
+ list_add(&base->lru, &page_pool);
+ pool_pages++;
+ } else
+ pool_used++;
spin_unlock_irqrestore(&pgd_lock, flags);
- if (base)
- __free_pages(base, 0);
-
return 0;
}
static int __change_page_attr(unsigned long address, struct cpa_data *cpa)
{
- int level, do_split, err;
+ int do_split, err;
+ unsigned int level;
struct page *kpte_page;
pte_t *kpte;
@@ -598,7 +700,7 @@ static int change_page_attr_set_clr(unsigned long addr, int numpages,
* Check whether we really changed something:
*/
if (!cpa.flushtlb)
- return ret;
+ goto out;
/*
* No need to flush, when we did not set any of the caching
@@ -617,6 +719,8 @@ static int change_page_attr_set_clr(unsigned long addr, int numpages,
else
cpa_flush_all(cache);
+out:
+ cpa_fill_pool();
return ret;
}
@@ -770,6 +874,12 @@ void kernel_map_pages(struct page *page, int numpages, int enable)
* but that can deadlock->flush only current cpu:
*/
__flush_tlb_all();
+
+ /*
+ * Try to refill the page pool here. We can do this only after
+ * the tlb flush.
+ */
+ cpa_fill_pool();
}
#endif
diff --git a/arch/x86/power/Makefile b/arch/x86/power/Makefile
index d764ec95006..9ff4d5b55ad 100644
--- a/arch/x86/power/Makefile
+++ b/arch/x86/power/Makefile
@@ -1,2 +1,2 @@
-obj-$(CONFIG_PM) += cpu.o
-obj-$(CONFIG_HIBERNATION) += swsusp.o suspend.o
+obj-$(CONFIG_PM_SLEEP) += cpu_$(BITS).o
+obj-$(CONFIG_HIBERNATION) += hibernate_$(BITS).o hibernate_asm_$(BITS).o
diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu_32.c
index efcf620d143..7f9c6da04a4 100644
--- a/arch/x86/power/cpu.c
+++ b/arch/x86/power/cpu_32.c
@@ -40,7 +40,7 @@ static void __save_processor_state(struct saved_context *ctxt)
savesegment(ss, ctxt->ss);
/*
- * control registers
+ * control registers
*/
ctxt->cr0 = read_cr0();
ctxt->cr2 = read_cr2();
diff --git a/arch/x86/kernel/suspend_64.c b/arch/x86/power/cpu_64.c
index 7ac7130022f..66bdfb591fd 100644
--- a/arch/x86/kernel/suspend_64.c
+++ b/arch/x86/power/cpu_64.c
@@ -1,8 +1,9 @@
/*
- * Suspend support specific for i386.
+ * Suspend and hibernation support for x86-64
*
* Distribute under GPLv2
*
+ * Copyright (c) 2007 Rafael J. Wysocki <rjw@sisk.pl>
* Copyright (c) 2002 Pavel Machek <pavel@suse.cz>
* Copyright (c) 2001 Patrick Mochel <mochel@osdl.org>
*/
@@ -14,9 +15,6 @@
#include <asm/pgtable.h>
#include <asm/mtrr.h>
-/* References to section boundaries */
-extern const void __nosave_begin, __nosave_end;
-
static void fix_processor_context(void);
struct saved_context saved_context;
@@ -63,7 +61,7 @@ static void __save_processor_state(struct saved_context *ctxt)
mtrr_save_fixed_ranges(NULL);
/*
- * control registers
+ * control registers
*/
rdmsrl(MSR_EFER, ctxt->efer);
ctxt->cr0 = read_cr0();
@@ -166,155 +164,3 @@ static void fix_processor_context(void)
loaddebug(&current->thread, 7);
}
}
-
-#ifdef CONFIG_HIBERNATION
-/* Defined in arch/x86_64/kernel/suspend_asm.S */
-extern int restore_image(void);
-
-/*
- * Address to jump to in the last phase of restore in order to get to the image
- * kernel's text (this value is passed in the image header).
- */
-unsigned long restore_jump_address;
-
-/*
- * Value of the cr3 register from before the hibernation (this value is passed
- * in the image header).
- */
-unsigned long restore_cr3;
-
-pgd_t *temp_level4_pgt;
-
-void *relocated_restore_code;
-
-static int res_phys_pud_init(pud_t *pud, unsigned long address, unsigned long end)
-{
- long i, j;
-
- i = pud_index(address);
- pud = pud + i;
- for (; i < PTRS_PER_PUD; pud++, i++) {
- unsigned long paddr;
- pmd_t *pmd;
-
- paddr = address + i*PUD_SIZE;
- if (paddr >= end)
- break;
-
- pmd = (pmd_t *)get_safe_page(GFP_ATOMIC);
- if (!pmd)
- return -ENOMEM;
- set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
- for (j = 0; j < PTRS_PER_PMD; pmd++, j++, paddr += PMD_SIZE) {
- unsigned long pe;
-
- if (paddr >= end)
- break;
- pe = __PAGE_KERNEL_LARGE_EXEC | paddr;
- pe &= __supported_pte_mask;
- set_pmd(pmd, __pmd(pe));
- }
- }
- return 0;
-}
-
-static int set_up_temporary_mappings(void)
-{
- unsigned long start, end, next;
- int error;
-
- temp_level4_pgt = (pgd_t *)get_safe_page(GFP_ATOMIC);
- if (!temp_level4_pgt)
- return -ENOMEM;
-
- /* It is safe to reuse the original kernel mapping */
- set_pgd(temp_level4_pgt + pgd_index(__START_KERNEL_map),
- init_level4_pgt[pgd_index(__START_KERNEL_map)]);
-
- /* Set up the direct mapping from scratch */
- start = (unsigned long)pfn_to_kaddr(0);
- end = (unsigned long)pfn_to_kaddr(end_pfn);
-
- for (; start < end; start = next) {
- pud_t *pud = (pud_t *)get_safe_page(GFP_ATOMIC);
- if (!pud)
- return -ENOMEM;
- next = start + PGDIR_SIZE;
- if (next > end)
- next = end;
- if ((error = res_phys_pud_init(pud, __pa(start), __pa(next))))
- return error;
- set_pgd(temp_level4_pgt + pgd_index(start),
- mk_kernel_pgd(__pa(pud)));
- }
- return 0;
-}
-
-int swsusp_arch_resume(void)
-{
- int error;
-
- /* We have got enough memory and from now on we cannot recover */
- if ((error = set_up_temporary_mappings()))
- return error;
-
- relocated_restore_code = (void *)get_safe_page(GFP_ATOMIC);
- if (!relocated_restore_code)
- return -ENOMEM;
- memcpy(relocated_restore_code, &core_restore_code,
- &restore_registers - &core_restore_code);
-
- restore_image();
- return 0;
-}
-
-/*
- * pfn_is_nosave - check if given pfn is in the 'nosave' section
- */
-
-int pfn_is_nosave(unsigned long pfn)
-{
- unsigned long nosave_begin_pfn = __pa_symbol(&__nosave_begin) >> PAGE_SHIFT;
- unsigned long nosave_end_pfn = PAGE_ALIGN(__pa_symbol(&__nosave_end)) >> PAGE_SHIFT;
- return (pfn >= nosave_begin_pfn) && (pfn < nosave_end_pfn);
-}
-
-struct restore_data_record {
- unsigned long jump_address;
- unsigned long cr3;
- unsigned long magic;
-};
-
-#define RESTORE_MAGIC 0x0123456789ABCDEFUL
-
-/**
- * arch_hibernation_header_save - populate the architecture specific part
- * of a hibernation image header
- * @addr: address to save the data at
- */
-int arch_hibernation_header_save(void *addr, unsigned int max_size)
-{
- struct restore_data_record *rdr = addr;
-
- if (max_size < sizeof(struct restore_data_record))
- return -EOVERFLOW;
- rdr->jump_address = restore_jump_address;
- rdr->cr3 = restore_cr3;
- rdr->magic = RESTORE_MAGIC;
- return 0;
-}
-
-/**
- * arch_hibernation_header_restore - read the architecture specific data
- * from the hibernation image header
- * @addr: address to read the data from
- */
-int arch_hibernation_header_restore(void *addr)
-{
- struct restore_data_record *rdr = addr;
-
- restore_jump_address = rdr->jump_address;
- restore_cr3 = rdr->cr3;
- return (rdr->magic == RESTORE_MAGIC) ? 0 : -EINVAL;
-}
-#endif /* CONFIG_HIBERNATION */
diff --git a/arch/x86/power/suspend.c b/arch/x86/power/hibernate_32.c
index a0020b913f3..f2b6e3f11bf 100644
--- a/arch/x86/power/suspend.c
+++ b/arch/x86/power/hibernate_32.c
@@ -1,5 +1,5 @@
/*
- * Suspend support specific for i386 - temporary page tables
+ * Hibernation support specific for i386 - temporary page tables
*
* Distribute under GPLv2
*
@@ -13,7 +13,7 @@
#include <asm/page.h>
#include <asm/pgtable.h>
-/* Defined in arch/i386/power/swsusp.S */
+/* Defined in hibernate_asm_32.S */
extern int restore_image(void);
/* References to section boundaries */
@@ -23,7 +23,7 @@ extern const void __nosave_begin, __nosave_end;
pgd_t *resume_pg_dir;
/* The following three functions are based on the analogous code in
- * arch/i386/mm/init.c
+ * arch/x86/mm/init_32.c
*/
/*
diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c
new file mode 100644
index 00000000000..b542355e0e3
--- /dev/null
+++ b/arch/x86/power/hibernate_64.c
@@ -0,0 +1,169 @@
+/*
+ * Hibernation support for x86-64
+ *
+ * Distribute under GPLv2
+ *
+ * Copyright (c) 2007 Rafael J. Wysocki <rjw@sisk.pl>
+ * Copyright (c) 2002 Pavel Machek <pavel@suse.cz>
+ * Copyright (c) 2001 Patrick Mochel <mochel@osdl.org>
+ */
+
+#include <linux/smp.h>
+#include <linux/suspend.h>
+#include <asm/proto.h>
+#include <asm/page.h>
+#include <asm/pgtable.h>
+#include <asm/mtrr.h>
+
+/* References to section boundaries */
+extern const void __nosave_begin, __nosave_end;
+
+/* Defined in hibernate_asm_64.S */
+extern int restore_image(void);
+
+/*
+ * Address to jump to in the last phase of restore in order to get to the image
+ * kernel's text (this value is passed in the image header).
+ */
+unsigned long restore_jump_address;
+
+/*
+ * Value of the cr3 register from before the hibernation (this value is passed
+ * in the image header).
+ */
+unsigned long restore_cr3;
+
+pgd_t *temp_level4_pgt;
+
+void *relocated_restore_code;
+
+static int res_phys_pud_init(pud_t *pud, unsigned long address, unsigned long end)
+{
+ long i, j;
+
+ i = pud_index(address);
+ pud = pud + i;
+ for (; i < PTRS_PER_PUD; pud++, i++) {
+ unsigned long paddr;
+ pmd_t *pmd;
+
+ paddr = address + i*PUD_SIZE;
+ if (paddr >= end)
+ break;
+
+ pmd = (pmd_t *)get_safe_page(GFP_ATOMIC);
+ if (!pmd)
+ return -ENOMEM;
+ set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
+ for (j = 0; j < PTRS_PER_PMD; pmd++, j++, paddr += PMD_SIZE) {
+ unsigned long pe;
+
+ if (paddr >= end)
+ break;
+ pe = __PAGE_KERNEL_LARGE_EXEC | paddr;
+ pe &= __supported_pte_mask;
+ set_pmd(pmd, __pmd(pe));
+ }
+ }
+ return 0;
+}
+
+static int set_up_temporary_mappings(void)
+{
+ unsigned long start, end, next;
+ int error;
+
+ temp_level4_pgt = (pgd_t *)get_safe_page(GFP_ATOMIC);
+ if (!temp_level4_pgt)
+ return -ENOMEM;
+
+ /* It is safe to reuse the original kernel mapping */
+ set_pgd(temp_level4_pgt + pgd_index(__START_KERNEL_map),
+ init_level4_pgt[pgd_index(__START_KERNEL_map)]);
+
+ /* Set up the direct mapping from scratch */
+ start = (unsigned long)pfn_to_kaddr(0);
+ end = (unsigned long)pfn_to_kaddr(end_pfn);
+
+ for (; start < end; start = next) {
+ pud_t *pud = (pud_t *)get_safe_page(GFP_ATOMIC);
+ if (!pud)
+ return -ENOMEM;
+ next = start + PGDIR_SIZE;
+ if (next > end)
+ next = end;
+ if ((error = res_phys_pud_init(pud, __pa(start), __pa(next))))
+ return error;
+ set_pgd(temp_level4_pgt + pgd_index(start),
+ mk_kernel_pgd(__pa(pud)));
+ }
+ return 0;
+}
+
+int swsusp_arch_resume(void)
+{
+ int error;
+
+ /* We have got enough memory and from now on we cannot recover */
+ if ((error = set_up_temporary_mappings()))
+ return error;
+
+ relocated_restore_code = (void *)get_safe_page(GFP_ATOMIC);
+ if (!relocated_restore_code)
+ return -ENOMEM;
+ memcpy(relocated_restore_code, &core_restore_code,
+ &restore_registers - &core_restore_code);
+
+ restore_image();
+ return 0;
+}
+
+/*
+ * pfn_is_nosave - check if given pfn is in the 'nosave' section
+ */
+
+int pfn_is_nosave(unsigned long pfn)
+{
+ unsigned long nosave_begin_pfn = __pa_symbol(&__nosave_begin) >> PAGE_SHIFT;
+ unsigned long nosave_end_pfn = PAGE_ALIGN(__pa_symbol(&__nosave_end)) >> PAGE_SHIFT;
+ return (pfn >= nosave_begin_pfn) && (pfn < nosave_end_pfn);
+}
+
+struct restore_data_record {
+ unsigned long jump_address;
+ unsigned long cr3;
+ unsigned long magic;
+};
+
+#define RESTORE_MAGIC 0x0123456789ABCDEFUL
+
+/**
+ * arch_hibernation_header_save - populate the architecture specific part
+ * of a hibernation image header
+ * @addr: address to save the data at
+ */
+int arch_hibernation_header_save(void *addr, unsigned int max_size)
+{
+ struct restore_data_record *rdr = addr;
+
+ if (max_size < sizeof(struct restore_data_record))
+ return -EOVERFLOW;
+ rdr->jump_address = restore_jump_address;
+ rdr->cr3 = restore_cr3;
+ rdr->magic = RESTORE_MAGIC;
+ return 0;
+}
+
+/**
+ * arch_hibernation_header_restore - read the architecture specific data
+ * from the hibernation image header
+ * @addr: address to read the data from
+ */
+int arch_hibernation_header_restore(void *addr)
+{
+ struct restore_data_record *rdr = addr;
+
+ restore_jump_address = rdr->jump_address;
+ restore_cr3 = rdr->cr3;
+ return (rdr->magic == RESTORE_MAGIC) ? 0 : -EINVAL;
+}
diff --git a/arch/x86/power/swsusp.S b/arch/x86/power/hibernate_asm_32.S
index 53662e05b39..b95aa6cfe3c 100644
--- a/arch/x86/power/swsusp.S
+++ b/arch/x86/power/hibernate_asm_32.S
@@ -1,7 +1,6 @@
.text
-/* Originally gcc generated, modified by hand
- *
+/*
* This may not use any stack, nor any variable that is not "NoSave":
*
* Its rewriting one kernel image with another. What is stack in "old"
diff --git a/arch/x86/kernel/suspend_asm_64.S b/arch/x86/power/hibernate_asm_64.S
index aeb9a4d7681..1deb3244b99 100644
--- a/arch/x86/kernel/suspend_asm_64.S
+++ b/arch/x86/power/hibernate_asm_64.S
@@ -1,7 +1,12 @@
-/* Copyright 2004,2005 Pavel Machek <pavel@suse.cz>, Andi Kleen <ak@suse.de>, Rafael J. Wysocki <rjw@sisk.pl>
+/*
+ * Hibernation support for x86-64
*
* Distribute under GPLv2.
*
+ * Copyright 2007 Rafael J. Wysocki <rjw@sisk.pl>
+ * Copyright 2005 Andi Kleen <ak@suse.de>
+ * Copyright 2004 Pavel Machek <pavel@suse.cz>
+ *
* swsusp_arch_resume must not use any stack or any nonlocal variables while
* copying pages:
*
@@ -9,7 +14,7 @@
* image could very well be data page in "new" image, and overwriting
* your own stack under you is bad idea.
*/
-
+
.text
#include <linux/linkage.h>
#include <asm/segment.h>
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 45aa771e73a..0144395448a 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -58,7 +58,7 @@
xmaddr_t arbitrary_virt_to_machine(unsigned long address)
{
- int level;
+ unsigned int level;
pte_t *pte = lookup_address(address, &level);
unsigned offset = address & PAGE_MASK;
@@ -71,7 +71,7 @@ void make_lowmem_page_readonly(void *vaddr)
{
pte_t *pte, ptev;
unsigned long address = (unsigned long)vaddr;
- int level;
+ unsigned int level;
pte = lookup_address(address, &level);
BUG_ON(pte == NULL);
@@ -86,7 +86,7 @@ void make_lowmem_page_readwrite(void *vaddr)
{
pte_t *pte, ptev;
unsigned long address = (unsigned long)vaddr;
- int level;
+ unsigned int level;
pte = lookup_address(address, &level);
BUG_ON(pte == NULL);
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index b3721fd6877..c39e1a5aa24 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -217,17 +217,17 @@ unsigned long long xen_sched_clock(void)
/* Get the CPU speed from Xen */
unsigned long xen_cpu_khz(void)
{
- u64 cpu_khz = 1000000ULL << 32;
+ u64 xen_khz = 1000000ULL << 32;
const struct vcpu_time_info *info =
&HYPERVISOR_shared_info->vcpu_info[0].time;
- do_div(cpu_khz, info->tsc_to_system_mul);
+ do_div(xen_khz, info->tsc_to_system_mul);
if (info->tsc_shift < 0)
- cpu_khz <<= -info->tsc_shift;
+ xen_khz <<= -info->tsc_shift;
else
- cpu_khz >>= info->tsc_shift;
+ xen_khz >>= info->tsc_shift;
- return cpu_khz;
+ return xen_khz;
}
/*