From 5a6fe125950676015f5108fb71b2a67441755003 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 10 Feb 2009 14:02:27 +0000 Subject: Do not account for the address space used by hugetlbfs using VM_ACCOUNT When overcommit is disabled, the core VM accounts for pages used by anonymous shared, private mappings and special mappings. It keeps track of VMAs that should be accounted for with VM_ACCOUNT and VMAs that never had a reserve with VM_NORESERVE. Overcommit for hugetlbfs is much riskier than overcommit for base pages due to contiguity requirements. It avoids overcommiting on both shared and private mappings using reservation counters that are checked and updated during mmap(). This ensures (within limits) that hugepages exist in the future when faults occurs or it is too easy to applications to be SIGKILLed. As hugetlbfs makes its own reservations of a different unit to the base page size, VM_ACCOUNT should never be set. Even if the units were correct, we would double account for the usage in the core VM and hugetlbfs. VM_NORESERVE may be set because an application can request no reserves be made for hugetlbfs at the risk of getting killed later. With commit fc8744adc870a8d4366908221508bb113d8b72ee, VM_NORESERVE and VM_ACCOUNT are getting unconditionally set for hugetlbfs-backed mappings. This breaks the accounting for both the core VM and hugetlbfs, can trigger an OOM storm when hugepage pools are too small lockups and corrupted counters otherwise are used. This patch brings hugetlbfs more in line with how the core VM treats VM_NORESERVE but prevents VM_ACCOUNT being set. Signed-off-by: Mel Gorman Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 39 +++++++++++++++++++++++++-------------- 1 file changed, 25 insertions(+), 14 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 618e9830408..20746420954 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2269,14 +2269,12 @@ void hugetlb_change_protection(struct vm_area_struct *vma, int hugetlb_reserve_pages(struct inode *inode, long from, long to, - struct vm_area_struct *vma) + struct vm_area_struct *vma, + int acctflag) { - long ret, chg; + long ret = 0, chg; struct hstate *h = hstate_inode(inode); - if (vma && vma->vm_flags & VM_NORESERVE) - return 0; - /* * Shared mappings base their reservation on the number of pages that * are already allocated on behalf of the file. Private mappings need @@ -2285,22 +2283,25 @@ int hugetlb_reserve_pages(struct inode *inode, */ if (!vma || vma->vm_flags & VM_SHARED) chg = region_chg(&inode->i_mapping->private_list, from, to); - else { - struct resv_map *resv_map = resv_map_alloc(); - if (!resv_map) - return -ENOMEM; - + else chg = to - from; - set_vma_resv_map(vma, resv_map); - set_vma_resv_flags(vma, HPAGE_RESV_OWNER); - } - if (chg < 0) return chg; if (hugetlb_get_quota(inode->i_mapping, chg)) return -ENOSPC; + + /* + * Only apply hugepage reservation if asked. We still have to + * take the filesystem quota because it is an upper limit + * defined for the mount and not necessarily memory as a whole + */ + if (acctflag & VM_NORESERVE) { + reset_vma_resv_huge_pages(vma); + return 0; + } + ret = hugetlb_acct_memory(h, chg); if (ret < 0) { hugetlb_put_quota(inode->i_mapping, chg); @@ -2308,6 +2309,16 @@ int hugetlb_reserve_pages(struct inode *inode, } if (!vma || vma->vm_flags & VM_SHARED) region_add(&inode->i_mapping->private_list, from, to); + else { + struct resv_map *resv_map = resv_map_alloc(); + + if (!resv_map) + return -ENOMEM; + + set_vma_resv_map(vma, resv_map); + set_vma_resv_flags(vma, HPAGE_RESV_OWNER); + } + return 0; } -- cgit v1.2.3 From 17c9d12e126cb0de8d535dc1908c4819d712bc68 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 11 Feb 2009 16:34:16 +0000 Subject: Do not account for hugetlbfs quota at mmap() time if mapping [SHM|MAP]_NORESERVE Commit 5a6fe125950676015f5108fb71b2a67441755003 brought hugetlbfs more in line with the core VM by obeying VM_NORESERVE and not reserving hugepages for both shared and private mappings when [SHM|MAP]_NORESERVE are specified. However, it is still taking filesystem quota unconditionally. At fault time, if there are no reserves and attempt is made to allocate the page and account for filesystem quota. If either fail, the fault fails. The impact is that quota is getting accounted for twice. This patch partially reverts 5a6fe125950676015f5108fb71b2a67441755003. To help prevent this mistake happening again, it improves the documentation of hugetlb_reserve_pages() Reported-by: Andy Whitcroft Signed-off-by: Mel Gorman Acked-by: Andy Whitcroft Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 53 +++++++++++++++++++++++++++++++++-------------------- 1 file changed, 33 insertions(+), 20 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 20746420954..107da3d809a 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2272,9 +2272,17 @@ int hugetlb_reserve_pages(struct inode *inode, struct vm_area_struct *vma, int acctflag) { - long ret = 0, chg; + long ret, chg; struct hstate *h = hstate_inode(inode); + /* + * Only apply hugepage reservation if asked. At fault time, an + * attempt will be made for VM_NORESERVE to allocate a page + * and filesystem quota without using reserves + */ + if (acctflag & VM_NORESERVE) + return 0; + /* * Shared mappings base their reservation on the number of pages that * are already allocated on behalf of the file. Private mappings need @@ -2283,42 +2291,47 @@ int hugetlb_reserve_pages(struct inode *inode, */ if (!vma || vma->vm_flags & VM_SHARED) chg = region_chg(&inode->i_mapping->private_list, from, to); - else + else { + struct resv_map *resv_map = resv_map_alloc(); + if (!resv_map) + return -ENOMEM; + chg = to - from; + set_vma_resv_map(vma, resv_map); + set_vma_resv_flags(vma, HPAGE_RESV_OWNER); + } + if (chg < 0) return chg; + /* There must be enough filesystem quota for the mapping */ if (hugetlb_get_quota(inode->i_mapping, chg)) return -ENOSPC; /* - * Only apply hugepage reservation if asked. We still have to - * take the filesystem quota because it is an upper limit - * defined for the mount and not necessarily memory as a whole + * Check enough hugepages are available for the reservation. + * Hand back the quota if there are not */ - if (acctflag & VM_NORESERVE) { - reset_vma_resv_huge_pages(vma); - return 0; - } - ret = hugetlb_acct_memory(h, chg); if (ret < 0) { hugetlb_put_quota(inode->i_mapping, chg); return ret; } + + /* + * Account for the reservations made. Shared mappings record regions + * that have reservations as they are shared by multiple VMAs. + * When the last VMA disappears, the region map says how much + * the reservation was and the page cache tells how much of + * the reservation was consumed. Private mappings are per-VMA and + * only the consumed reservations are tracked. When the VMA + * disappears, the original reservation is the VMA size and the + * consumed reservations are stored in the map. Hence, nothing + * else has to be done for private mappings here + */ if (!vma || vma->vm_flags & VM_SHARED) region_add(&inode->i_mapping->private_list, from, to); - else { - struct resv_map *resv_map = resv_map_alloc(); - - if (!resv_map) - return -ENOMEM; - - set_vma_resv_map(vma, resv_map); - set_vma_resv_flags(vma, HPAGE_RESV_OWNER); - } - return 0; } -- cgit v1.2.3