From 485761bd6a72d33b3d4fa884927b2b0d983b701e Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 26 Aug 2005 18:34:10 -0700 Subject: [PATCH] x86_64: Tell VM about holes in nodes Some nodes can have large holes on x86-64. This fixes problems with the VM allowing too many dirty pages because it overestimates the number of available RAM in a node. In extreme cases you can end up with all RAM filled with dirty pages which can lead to deadlocks and other nasty behaviour. This patch just tells the VM about the known holes from e820. Reserved (like the kernel text or mem_map) is still not taken into account, but that should be only a few percent error now. Small detail is that the flat setup uses the NUMA free_area_init_node() now too because it offers more flexibility. (akpm: lotsa thanks to Martin for working this problem out) Cc: Martin Bligh Signed-off-by: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/e820.c | 34 ++++++++++++++++++++++++++++++++++ arch/x86_64/mm/init.c | 16 ++++++++++++---- arch/x86_64/mm/numa.c | 8 +++++++- 3 files changed, 53 insertions(+), 5 deletions(-) (limited to 'arch/x86_64') diff --git a/arch/x86_64/kernel/e820.c b/arch/x86_64/kernel/e820.c index 6ded3a50dfe..b548dea4e5b 100644 --- a/arch/x86_64/kernel/e820.c +++ b/arch/x86_64/kernel/e820.c @@ -185,6 +185,40 @@ unsigned long __init e820_end_of_ram(void) } /* + * Compute how much memory is missing in a range. + * Unlike the other functions in this file the arguments are in page numbers. + */ +unsigned long __init +e820_hole_size(unsigned long start_pfn, unsigned long end_pfn) +{ + unsigned long ram = 0; + unsigned long start = start_pfn << PAGE_SHIFT; + unsigned long end = end_pfn << PAGE_SHIFT; + int i; + for (i = 0; i < e820.nr_map; i++) { + struct e820entry *ei = &e820.map[i]; + unsigned long last, addr; + + if (ei->type != E820_RAM || + ei->addr+ei->size <= start || + ei->addr >= end) + continue; + + addr = round_up(ei->addr, PAGE_SIZE); + if (addr < start) + addr = start; + + last = round_down(ei->addr + ei->size, PAGE_SIZE); + if (last >= end) + last = end; + + if (last > addr) + ram += last - addr; + } + return ((end - start) - ram) >> PAGE_SHIFT; +} + +/* * Mark e820 reserved areas as busy for the resource manager. */ void __init e820_reserve_resources(void) diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c index 72e4b364ed7..aa4a5189ece 100644 --- a/arch/x86_64/mm/init.c +++ b/arch/x86_64/mm/init.c @@ -322,18 +322,26 @@ void zap_low_mappings(void) void __init paging_init(void) { { - unsigned long zones_size[MAX_NR_ZONES] = {0, 0, 0}; + unsigned long zones_size[MAX_NR_ZONES]; + unsigned long holes[MAX_NR_ZONES]; unsigned int max_dma; + memset(zones_size, 0, sizeof(zones_size)); + memset(holes, 0, sizeof(holes)); + max_dma = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT; - if (end_pfn < max_dma) + if (end_pfn < max_dma) { zones_size[ZONE_DMA] = end_pfn; - else { + holes[ZONE_DMA] = e820_hole_size(0, end_pfn); + } else { zones_size[ZONE_DMA] = max_dma; + holes[ZONE_DMA] = e820_hole_size(0, max_dma); zones_size[ZONE_NORMAL] = end_pfn - max_dma; + holes[ZONE_NORMAL] = e820_hole_size(max_dma, end_pfn); } - free_area_init(zones_size); + free_area_init_node(0, NODE_DATA(0), zones_size, + __pa(PAGE_OFFSET) >> PAGE_SHIFT, holes); } return; } diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c index 70cb2904a90..6a156f5692a 100644 --- a/arch/x86_64/mm/numa.c +++ b/arch/x86_64/mm/numa.c @@ -126,9 +126,11 @@ void __init setup_node_zones(int nodeid) { unsigned long start_pfn, end_pfn; unsigned long zones[MAX_NR_ZONES]; + unsigned long holes[MAX_NR_ZONES]; unsigned long dma_end_pfn; memset(zones, 0, sizeof(unsigned long) * MAX_NR_ZONES); + memset(holes, 0, sizeof(unsigned long) * MAX_NR_ZONES); start_pfn = node_start_pfn(nodeid); end_pfn = node_end_pfn(nodeid); @@ -139,13 +141,17 @@ void __init setup_node_zones(int nodeid) dma_end_pfn = __pa(MAX_DMA_ADDRESS) >> PAGE_SHIFT; if (start_pfn < dma_end_pfn) { zones[ZONE_DMA] = dma_end_pfn - start_pfn; + holes[ZONE_DMA] = e820_hole_size(start_pfn, dma_end_pfn); zones[ZONE_NORMAL] = end_pfn - dma_end_pfn; + holes[ZONE_NORMAL] = e820_hole_size(dma_end_pfn, end_pfn); + } else { zones[ZONE_NORMAL] = end_pfn - start_pfn; + holes[ZONE_NORMAL] = e820_hole_size(start_pfn, end_pfn); } free_area_init_node(nodeid, NODE_DATA(nodeid), zones, - start_pfn, NULL); + start_pfn, holes); } void __init numa_init_array(void) -- cgit v1.2.3 From 69be8f189653cd81aae5a74e26615b12871bb72e Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 29 Aug 2005 11:44:09 -0400 Subject: [PATCH] convert signal handling of NODEFER to act like other Unix boxes. It has been reported that the way Linux handles NODEFER for signals is not consistent with the way other Unix boxes handle it. I've written a program to test the behavior of how this flag affects signals and had several reports from people who ran this on various Unix boxes, confirming that Linux seems to be unique on the way this is handled. The way NODEFER affects signals on other Unix boxes is as follows: 1) If NODEFER is set, other signals in sa_mask are still blocked. 2) If NODEFER is set and the signal is in sa_mask, then the signal is still blocked. (Note: this is the behavior of all tested but Linux _and_ NetBSD 2.0 *). The way NODEFER affects signals on Linux: 1) If NODEFER is set, other signals are _not_ blocked regardless of sa_mask (Even NetBSD doesn't do this). 2) If NODEFER is set and the signal is in sa_mask, then the signal being handled is not blocked. The patch converts signal handling in all current Linux architectures to the way most Unix boxes work. Unix boxes that were tested: DU4, AIX 5.2, Irix 6.5, NetBSD 2.0, SFU 3.5 on WinXP, AIX 5.3, Mac OSX, and of course Linux 2.6.13-rcX. * NetBSD was the only other Unix to behave like Linux on point #2. The main concern was brought up by point #1 which even NetBSD isn't like Linux. So with this patch, we leave NetBSD as the lonely one that behaves differently here with #2. Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/signal.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86_64') diff --git a/arch/x86_64/kernel/signal.c b/arch/x86_64/kernel/signal.c index 98590a989f3..d642fbf3da2 100644 --- a/arch/x86_64/kernel/signal.c +++ b/arch/x86_64/kernel/signal.c @@ -394,10 +394,11 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, #endif ret = setup_rt_frame(sig, ka, info, oldset, regs); - if (ret && !(ka->sa.sa_flags & SA_NODEFER)) { + if (ret) { spin_lock_irq(¤t->sighand->siglock); sigorsets(¤t->blocked,¤t->blocked,&ka->sa.sa_mask); - sigaddset(¤t->blocked,sig); + if (!(ka->sa.sa_flags & SA_NODEFER)) + sigaddset(¤t->blocked,sig); recalc_sigpending(); spin_unlock_irq(¤t->sighand->siglock); } -- cgit v1.2.3