[Devel] [PATCH RHEL7 COMMIT] ms/rmap: drop support of non-linear mappings
Kirill Tkhai
ktkhai at virtuozzo.com
Mon Oct 3 07:41:32 PDT 2016
The commit is pushed to "branch-rh7-3.10.0-327.36.1.vz7.18.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-327.36.1.vz7.18.3
------>
commit de430b3b071cfbeb42f515a1e869a86680a0bf86
Author: Kirill A. Shutemov <kirill.shutemov at linux.intel.com>
Date: Mon Oct 3 14:40:31 2016 +0000
ms/rmap: drop support of non-linear mappings
We don't create non-linear mappings anymore. Let's drop code which
handles them in rmap.
Signed-off-by: Kirill A. Shutemov <kirill.shutemov at linux.intel.com>
Signed-off-by: Andrew Morton <akpm at linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds at linux-foundation.org>
https://jira.sw.ru/browse/PSBM-52992
(cherry picked from commit 27ba0644ea9dfe6e7693abc85837b60e40583b96)
Signed-off-by: Andrey Ryabinin <aryabinin at virtuozzo.com>
Reviewed-by: Cyrill Gorcunov <gorcunov at openvz.org>
---
Documentation/cachetlb.txt | 8 +-
fs/inode.c | 1 -
include/linux/fs.h | 4 +-
include/linux/mm.h | 6 --
include/linux/mm_types.h | 4 +-
include/linux/rmap.h | 3 -
kernel/fork.c | 8 +-
mm/memory.c | 5 --
mm/mmap.c | 24 ++---
mm/rmap.c | 214 +--------------------------------------------
mm/swap.c | 4 +-
11 files changed, 18 insertions(+), 263 deletions(-)
diff --git a/Documentation/cachetlb.txt b/Documentation/cachetlb.txt
index 2431096..cab2a07 100644
--- a/Documentation/cachetlb.txt
+++ b/Documentation/cachetlb.txt
@@ -318,10 +318,10 @@ maps this page at its virtual address.
about doing this.
The idea is, first at flush_dcache_page() time, if
- page->mapping->i_mmap is an empty tree and ->i_mmap_nonlinear
- an empty list, just mark the architecture private page flag bit.
- Later, in update_mmu_cache(), a check is made of this flag bit,
- and if set the flush is done and the flag bit is cleared.
+ page->mapping->i_mmap is an empty tree, just mark the architecture
+ private page flag bit. Later, in update_mmu_cache(), a check is
+ made of this flag bit, and if set the flush is done and the flag
+ bit is cleared.
IMPORTANT NOTE: It is often important, if you defer the flush,
that the actual flush occurs on the same CPU
diff --git a/fs/inode.c b/fs/inode.c
index 71dc454..6404cc5 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -367,7 +367,6 @@ void address_space_init_once(struct address_space *mapping)
INIT_LIST_HEAD(&mapping->private_list);
spin_lock_init(&mapping->private_lock);
mapping->i_mmap = RB_ROOT;
- INIT_LIST_HEAD(&mapping->i_mmap_nonlinear);
INIT_LIST_HEAD(&mapping->i_peer_list);
}
EXPORT_SYMBOL(address_space_init_once);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 408bb24..41e13f1 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -559,7 +559,6 @@ struct address_space {
spinlock_t tree_lock; /* and lock protecting it */
atomic_t i_mmap_writable;/* count VM_SHARED mappings */
struct rb_root i_mmap; /* tree of private and shared mappings */
- struct list_head i_mmap_nonlinear;/*list VM_NONLINEAR mappings */
struct mutex i_mmap_mutex; /* protect tree, count, list */
/* Protected by tree_lock together with the radix tree */
unsigned long nrpages; /* number of total pages */
@@ -634,8 +633,7 @@ int mapping_tagged(struct address_space *mapping, int tag);
*/
static inline int mapping_mapped(struct address_space *mapping)
{
- return !RB_EMPTY_ROOT(&mapping->i_mmap) ||
- !list_empty(&mapping->i_mmap_nonlinear);
+ return !RB_EMPTY_ROOT(&mapping->i_mmap);
}
/*
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 4ad0b19..d0392e5 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1689,12 +1689,6 @@ struct vm_area_struct *vma_interval_tree_iter_next(struct vm_area_struct *node,
for (vma = vma_interval_tree_iter_first(root, start, last); \
vma; vma = vma_interval_tree_iter_next(vma, start, last))
-static inline void vma_nonlinear_insert(struct vm_area_struct *vma,
- struct list_head *list)
-{
- list_add_tail(&vma->shared.nonlinear, list);
-}
-
void anon_vma_interval_tree_insert(struct anon_vma_chain *node,
struct rb_root *root);
void anon_vma_interval_tree_remove(struct anon_vma_chain *node,
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 7a033ec..97c537a 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -277,15 +277,13 @@ struct vm_area_struct {
/*
* For areas with an address space and backing store,
- * linkage into the address_space->i_mmap interval tree, or
- * linkage of vma in the address_space->i_mmap_nonlinear list.
+ * linkage into the address_space->i_mmap interval tree.
*/
union {
struct {
struct rb_node rb;
unsigned long rb_subtree_last;
} linear;
- struct list_head nonlinear;
} shared;
/*
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 2491eae..10869e7 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -243,7 +243,6 @@ int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma);
* arg: passed to rmap_one() and invalid_vma()
* rmap_one: executed on each vma where page is mapped
* done: for checking traversing termination condition
- * file_nonlinear: for handling file nonlinear mapping
* anon_lock: for getting anon_lock by optimized way rather than default
* invalid_vma: for skipping uninterested vma
*/
@@ -252,8 +251,6 @@ struct rmap_walk_control {
int (*rmap_one)(struct page *page, struct vm_area_struct *vma,
unsigned long addr, void *arg);
int (*done)(struct page *page);
- int (*file_nonlinear)(struct page *, struct address_space *,
- struct vm_area_struct *vma);
struct anon_vma *(*anon_lock)(struct page *page);
bool (*invalid_vma)(struct vm_area_struct *vma, void *arg);
};
diff --git a/kernel/fork.c b/kernel/fork.c
index 2fcde98..1e7b897 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -439,12 +439,8 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
atomic_inc(&mapping->i_mmap_writable);
flush_dcache_mmap_lock(mapping);
/* insert tmp into the share list, just after mpnt */
- if (unlikely(tmp->vm_flags & VM_NONLINEAR))
- vma_nonlinear_insert(tmp,
- &mapping->i_mmap_nonlinear);
- else
- vma_interval_tree_insert_after(tmp, mpnt,
- &mapping->i_mmap);
+ vma_interval_tree_insert_after(tmp, mpnt,
+ &mapping->i_mmap);
flush_dcache_mmap_unlock(mapping);
mutex_unlock(&mapping->i_mmap_mutex);
}
diff --git a/mm/memory.c b/mm/memory.c
index e93aa91..8a55890 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4546,13 +4546,8 @@ restart:
vma_interval_tree_foreach(vma, &mapping->i_mmap, 0, ULONG_MAX)
if (synchronize_mapping_faults_vma(mapping, vma))
goto restart;
- list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.nonlinear)
- if (synchronize_mapping_faults_vma(mapping, vma))
- goto restart;
vma_interval_tree_foreach(vma, &mapping->i_mmap, 0, ULONG_MAX)
vma->vm_private_data = NULL;
- list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.nonlinear)
- vma->vm_private_data = NULL;
}
void close_mapping_peer(struct address_space *mapping)
diff --git a/mm/mmap.c b/mm/mmap.c
index eeebddc..207a8f9 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -223,10 +223,7 @@ static void __remove_shared_vm_struct(struct vm_area_struct *vma,
mapping_unmap_writable(mapping);
flush_dcache_mmap_lock(mapping);
- if (unlikely(vma->vm_flags & VM_NONLINEAR))
- list_del_init(&vma->shared.nonlinear);
- else
- vma_interval_tree_remove(vma, &mapping->i_mmap);
+ vma_interval_tree_remove(vma, &mapping->i_mmap);
flush_dcache_mmap_unlock(mapping);
}
@@ -626,10 +623,7 @@ static void __vma_link_file(struct vm_area_struct *vma)
atomic_inc(&mapping->i_mmap_writable);
flush_dcache_mmap_lock(mapping);
- if (unlikely(vma->vm_flags & VM_NONLINEAR))
- vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear);
- else
- vma_interval_tree_insert(vma, &mapping->i_mmap);
+ vma_interval_tree_insert(vma, &mapping->i_mmap);
flush_dcache_mmap_unlock(mapping);
}
}
@@ -761,14 +755,11 @@ again: remove_next = 1 + (end > next->vm_end);
if (file) {
mapping = file->f_mapping;
- if (!(vma->vm_flags & VM_NONLINEAR)) {
- root = &mapping->i_mmap;
- uprobe_munmap(vma, vma->vm_start, vma->vm_end);
+ root = &mapping->i_mmap;
+ uprobe_munmap(vma, vma->vm_start, vma->vm_end);
- if (adjust_next)
- uprobe_munmap(next, next->vm_start,
- next->vm_end);
- }
+ if (adjust_next)
+ uprobe_munmap(next, next->vm_start, next->vm_end);
mutex_lock(&mapping->i_mmap_mutex);
if (insert) {
@@ -3197,8 +3188,7 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)
*
* mmap_sem in write mode is required in order to block all operations
* that could modify pagetables and free pages without need of
- * altering the vma layout (for example populate_range() with
- * nonlinear vmas). It's also needed in write mode to avoid new
+ * altering the vma layout. It's also needed in write mode to avoid new
* anon_vmas to be associated with existing vmas.
*
* A single task can't take more than one mm_take_all_locks() in a row
diff --git a/mm/rmap.c b/mm/rmap.c
index f18fdbb..c778134 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -553,9 +553,8 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
if (!vma->anon_vma || !page__anon_vma ||
vma->anon_vma->root != page__anon_vma->root)
return -EFAULT;
- } else if (page->mapping && !(vma->vm_flags & VM_NONLINEAR)) {
- if (!vma->vm_file ||
- vma->vm_file->f_mapping != page->mapping)
+ } else if (page->mapping) {
+ if (!vma->vm_file || vma->vm_file->f_mapping != page->mapping)
return -EFAULT;
} else
return -EFAULT;
@@ -1302,7 +1301,6 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
if (pte_soft_dirty(pteval))
swp_pte = pte_swp_mksoft_dirty(swp_pte);
set_pte_at(mm, address, pte, swp_pte);
- BUG_ON(pte_file(*pte));
} else if (IS_ENABLED(CONFIG_MIGRATION) &&
(TTU_ACTION(flags) == TTU_MIGRATION)) {
/* Establish migration entry for a file page */
@@ -1344,133 +1342,6 @@ out_mlock:
return ret;
}
-/*
- * objrmap doesn't work for nonlinear VMAs because the assumption that
- * offset-into-file correlates with offset-into-virtual-addresses does not hold.
- * Consequently, given a particular page and its ->index, we cannot locate the
- * ptes which are mapping that page without an exhaustive linear search.
- *
- * So what this code does is a mini "virtual scan" of each nonlinear VMA which
- * maps the file to which the target page belongs. The ->vm_private_data field
- * holds the current cursor into that scan. Successive searches will circulate
- * around the vma's virtual address space.
- *
- * So as more replacement pressure is applied to the pages in a nonlinear VMA,
- * more scanning pressure is placed against them as well. Eventually pages
- * will become fully unmapped and are eligible for eviction.
- *
- * For very sparsely populated VMAs this is a little inefficient - chances are
- * there there won't be many ptes located within the scan cluster. In this case
- * maybe we could scan further - to the end of the pte page, perhaps.
- *
- * Mlocked pages: check VM_LOCKED under mmap_sem held for read, if we can
- * acquire it without blocking. If vma locked, mlock the pages in the cluster,
- * rather than unmapping them. If we encounter the "check_page" that vmscan is
- * trying to unmap, return SWAP_MLOCK, else default SWAP_AGAIN.
- */
-#define CLUSTER_SIZE min(32*PAGE_SIZE, PMD_SIZE)
-#define CLUSTER_MASK (~(CLUSTER_SIZE - 1))
-
-static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
- struct vm_area_struct *vma, struct page *check_page)
-{
- struct mm_struct *mm = vma->vm_mm;
- pmd_t *pmd;
- pte_t *pte;
- pte_t pteval;
- spinlock_t *ptl;
- struct page *page;
- unsigned long address;
- unsigned long mmun_start; /* For mmu_notifiers */
- unsigned long mmun_end; /* For mmu_notifiers */
- unsigned long end;
- int ret = SWAP_AGAIN;
- int locked_vma = 0;
-
- address = (vma->vm_start + cursor) & CLUSTER_MASK;
- end = address + CLUSTER_SIZE;
- if (address < vma->vm_start)
- address = vma->vm_start;
- if (end > vma->vm_end)
- end = vma->vm_end;
-
- pmd = mm_find_pmd(mm, address);
- if (!pmd)
- return ret;
-
- mmun_start = address;
- mmun_end = end;
- mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
-
- /*
- * If we can acquire the mmap_sem for read, and vma is VM_LOCKED,
- * keep the sem while scanning the cluster for mlocking pages.
- */
- if (down_read_trylock(&vma->vm_mm->mmap_sem)) {
- locked_vma = (vma->vm_flags & VM_LOCKED);
- if (!locked_vma)
- up_read(&vma->vm_mm->mmap_sem); /* don't need it */
- }
-
- pte = pte_offset_map_lock(mm, pmd, address, &ptl);
-
- /* Update high watermark before we lower rss */
- update_hiwater_rss(mm);
-
- for (; address < end; pte++, address += PAGE_SIZE) {
- if (!pte_present(*pte))
- continue;
- page = vm_normal_page(vma, address, *pte);
- BUG_ON(!page || PageAnon(page));
-
- if (locked_vma) {
- if (page == check_page) {
- /* we know we have check_page locked */
- mlock_vma_page(page);
- ret = SWAP_MLOCK;
- } else if (trylock_page(page)) {
- /*
- * If we can lock the page, perform mlock.
- * Otherwise leave the page alone, it will be
- * eventually encountered again later.
- */
- mlock_vma_page(page);
- unlock_page(page);
- }
- continue; /* don't unmap */
- }
-
- if (ptep_clear_flush_young_notify(vma, address, pte))
- continue;
-
- /* Nuke the page table entry. */
- flush_cache_page(vma, address, pte_pfn(*pte));
- pteval = ptep_clear_flush_notify(vma, address, pte);
-
- /* If nonlinear, store the file page offset in the pte. */
- if (page->index != linear_page_index(vma, address)) {
- pte_t ptfile = pgoff_to_pte(page->index);
- if (pte_soft_dirty(pteval))
- ptfile = pte_file_mksoft_dirty(ptfile);
- set_pte_at(mm, address, pte, ptfile);
- }
-
- /* Move the dirty bit to the physical page now the pte is gone. */
- if (pte_dirty(pteval))
- set_page_dirty_mm(page, mm);
-
- page_remove_rmap(page);
- page_cache_release(page);
- dec_mm_counter(mm, MM_FILEPAGES);
- (*mapcount)--;
- }
- pte_unmap_unlock(pte - 1, ptl);
- mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
- if (locked_vma)
- up_read(&vma->vm_mm->mmap_sem);
- return ret;
-}
-
bool is_vma_temporary_stack(struct vm_area_struct *vma)
{
int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP);
@@ -1560,10 +1431,6 @@ static int try_to_unmap_mapping(struct page *page,
pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
struct vm_area_struct *vma;
int ret = SWAP_AGAIN;
- unsigned long cursor;
- unsigned long max_nl_cursor = 0;
- unsigned long max_nl_size = 0;
- unsigned int mapcount;
if (PageHuge(page))
pgoff = page->index << compound_order(page);
@@ -1575,75 +1442,6 @@ static int try_to_unmap_mapping(struct page *page,
goto out;
}
- if (list_empty(&mapping->i_mmap_nonlinear))
- goto out;
-
- /*
- * We don't bother to try to find the munlocked page in nonlinears.
- * It's costly. Instead, later, page reclaim logic may call
- * try_to_unmap(TTU_MUNLOCK) and recover PG_mlocked lazily.
- */
- if (TTU_ACTION(flags) == TTU_MUNLOCK)
- goto out;
-
- list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
- shared.nonlinear) {
- cursor = (unsigned long) vma->vm_private_data;
- if (cursor > max_nl_cursor)
- max_nl_cursor = cursor;
- cursor = vma->vm_end - vma->vm_start;
- if (cursor > max_nl_size)
- max_nl_size = cursor;
- }
-
- if (max_nl_size == 0) { /* all nonlinears locked or reserved ? */
- ret = SWAP_FAIL;
- goto out;
- }
-
- /*
- * We don't try to search for this page in the nonlinear vmas,
- * and page_referenced wouldn't have found it anyway. Instead
- * just walk the nonlinear vmas trying to age and unmap some.
- * The mapcount of the page we came in with is irrelevant,
- * but even so use it as a guide to how hard we should try?
- */
- mapcount = page_mapcount(page);
- if (!mapcount)
- goto out;
- cond_resched();
-
- max_nl_size = (max_nl_size + CLUSTER_SIZE - 1) & CLUSTER_MASK;
- if (max_nl_cursor == 0)
- max_nl_cursor = CLUSTER_SIZE;
-
- do {
- list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
- shared.nonlinear) {
- cursor = (unsigned long) vma->vm_private_data;
- while ( cursor < max_nl_cursor &&
- cursor < vma->vm_end - vma->vm_start) {
- if (try_to_unmap_cluster(cursor, &mapcount,
- vma, page) == SWAP_MLOCK)
- ret = SWAP_MLOCK;
- cursor += CLUSTER_SIZE;
- vma->vm_private_data = (void *) cursor;
- if ((int)mapcount <= 0)
- goto out;
- }
- vma->vm_private_data = (void *) max_nl_cursor;
- }
- cond_resched();
- max_nl_cursor += CLUSTER_SIZE;
- } while (max_nl_cursor <= max_nl_size);
-
- /*
- * Don't loop forever (perhaps all the remaining pages are
- * in locked vmas). Reset cursor on all unreserved nonlinear
- * vmas, now forgetting on which ones it had fallen behind.
- */
- list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.nonlinear)
- vma->vm_private_data = NULL;
out:
return ret;
}
@@ -1837,14 +1635,6 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc)
goto done;
}
- if (!rwc->file_nonlinear)
- goto done;
-
- if (list_empty(&mapping->i_mmap_nonlinear))
- goto done;
-
- ret = rwc->file_nonlinear(page, mapping, vma);
-
done:
mutex_unlock(&mapping->i_mmap_mutex);
return ret;
diff --git a/mm/swap.c b/mm/swap.c
index 0a42ebf..6c7f5fe 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -1075,10 +1075,8 @@ void __init swap_setup(void)
int i;
bdi_init(swapper_spaces[0].backing_dev_info);
- for (i = 0; i < MAX_SWAPFILES; i++) {
+ for (i = 0; i < MAX_SWAPFILES; i++)
spin_lock_init(&swapper_spaces[i].tree_lock);
- INIT_LIST_HEAD(&swapper_spaces[i].i_mmap_nonlinear);
- }
#endif
/* Use a smaller cluster for small-memory machines */
More information about the Devel
mailing list