[Devel] [PATCH rh7 3/8] ms/mm: convert i_mmap_mutex to rwsem

Mon Nov 30 18:28:25 MSK 2020

From: Davidlohr Bueso <dave at stgolabs.net>

The i_mmap_mutex is a close cousin of the anon vma lock, both protecting
similar data, one for file backed pages and the other for anon memory.  To
this end, this lock can also be a rwsem.  In addition, there are some
important opportunities to share the lock when there are no tree
modifications.

This conversion is straightforward.  For now, all users take the write
lock.

[sfr at canb.auug.org.au: update fremap.c]
Signed-off-by: Davidlohr Bueso <dbueso at suse.de>
Reviewed-by: Rik van Riel <riel at redhat.com>
Acked-by: "Kirill A. Shutemov" <kirill at shutemov.name>
Acked-by: Hugh Dickins <hughd at google.com>
Cc: Oleg Nesterov <oleg at redhat.com>
Acked-by: Peter Zijlstra (Intel) <peterz at infradead.org>
Cc: Srikar Dronamraju <srikar at linux.vnet.ibm.com>
Acked-by: Mel Gorman <mgorman at suse.de>
Signed-off-by: Stephen Rothwell <sfr at canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm at linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds at linux-foundation.org>

https://jira.sw.ru/browse/PSBM-122663
(cherry picked from commit c8c06efa8b552608493b7066c234cfa82c47fcea)
Signed-off-by: Andrey Ryabinin <aryabinin at virtuozzo.com>
---
 Documentation/vm/locking     |  2 +-
 fs/hugetlbfs/inode.c         | 10 +++++-----
 fs/inode.c                   |  2 +-
 include/linux/fs.h           |  7 ++++---
 include/linux/mmu_notifier.h |  2 +-
 kernel/events/uprobes.c      |  2 +-
 mm/filemap.c                 | 10 +++++-----
 mm/hugetlb.c                 | 10 +++++-----
 mm/memory.c                  |  2 +-
 mm/mmap.c                    |  6 +++---
 mm/mremap.c                  |  2 +-
 mm/rmap.c                    |  4 ++--
 12 files changed, 30 insertions(+), 29 deletions(-)

diff --git a/Documentation/vm/locking b/Documentation/vm/locking
index f61228bd6395..fb6402884062 100644
--- a/Documentation/vm/locking
+++ b/Documentation/vm/locking
@@ -66,7 +66,7 @@ in some cases it is not really needed. Eg, vm_start is modified by
 expand_stack(), it is hard to come up with a destructive scenario without 
 having the vmlist protection in this case.
 
-The page_table_lock nests with the inode i_mmap_mutex and the kmem cache
+The page_table_lock nests with the inode i_mmap_rwsem and the kmem cache
 c_spinlock spinlocks.  This is okay, since the kmem code asks for pages after
 dropping c_spinlock.  The page_table_lock also nests with pagecache_lock and
 pagemap_lru_lock spinlocks, and no code asks for memory with these locks
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index fb40a55cc8f1..68f8f2f0eaf5 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -757,12 +757,12 @@ static struct inode *hugetlbfs_get_root(struct super_block *sb,
 }
 
 /*
- * Hugetlbfs is not reclaimable; therefore its i_mmap_mutex will never
+ * Hugetlbfs is not reclaimable; therefore its i_mmap_rwsem will never
  * be taken from reclaim -- unlike regular filesystems. This needs an
  * annotation because huge_pmd_share() does an allocation under hugetlb's
- * i_mmap_mutex.
+ * i_mmap_rwsem.
  */
-struct lock_class_key hugetlbfs_i_mmap_mutex_key;
+static struct lock_class_key hugetlbfs_i_mmap_rwsem_key;
 
 static struct inode *hugetlbfs_get_inode(struct super_block *sb,
 					struct inode *dir,
@@ -779,8 +779,8 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb,
 	if (inode) {
 		inode->i_ino = get_next_ino();
 		inode_init_owner(inode, dir, mode);
-		lockdep_set_class(&inode->i_mapping->i_mmap_mutex,
-				&hugetlbfs_i_mmap_mutex_key);
+		lockdep_set_class(&inode->i_mapping->i_mmap_rwsem,
+				&hugetlbfs_i_mmap_rwsem_key);
 		inode->i_mapping->a_ops = &hugetlbfs_aops;
 		inode->i_mapping->backing_dev_info =&hugetlbfs_backing_dev_info;
 		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
diff --git a/fs/inode.c b/fs/inode.c
index 5253272c3742..2423a30dda1b 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -356,7 +356,7 @@ void address_space_init_once(struct address_space *mapping)
 	memset(mapping, 0, sizeof(*mapping));
 	INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC);
 	spin_lock_init(&mapping->tree_lock);
-	mutex_init(&mapping->i_mmap_mutex);
+	init_rwsem(&mapping->i_mmap_rwsem);
 	INIT_LIST_HEAD(&mapping->private_list);
 	spin_lock_init(&mapping->private_lock);
 	mapping->i_mmap = RB_ROOT;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index e32cb9b71042..f422b0f7b02a 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -17,6 +17,7 @@
 #include <linux/pid.h>
 #include <linux/bug.h>
 #include <linux/mutex.h>
+#include <linux/rwsem.h>
 #include <linux/capability.h>
 #include <linux/semaphore.h>
 #include <linux/fiemap.h>
@@ -626,7 +627,7 @@ struct address_space {
 	RH_KABI_REPLACE(unsigned int i_mmap_writable,
 			 atomic_t i_mmap_writable) /* count VM_SHARED mappings */
 	struct rb_root		i_mmap;		/* tree of private and shared mappings */
-	struct mutex		i_mmap_mutex;	/* protect tree, count, list */
+	struct rw_semaphore	i_mmap_rwsem;	/* protect tree, count, list */
 	/* Protected by tree_lock together with the radix tree */
 	unsigned long		nrpages;	/* number of total pages */
 	/* number of shadow or DAX exceptional entries */
@@ -700,12 +701,12 @@ int mapping_tagged(struct address_space *mapping, int tag);
 
 static inline void i_mmap_lock_write(struct address_space *mapping)
 {
-	mutex_lock(&mapping->i_mmap_mutex);
+	down_write(&mapping->i_mmap_rwsem);
 }
 
 static inline void i_mmap_unlock_write(struct address_space *mapping)
 {
-	mutex_unlock(&mapping->i_mmap_mutex);
+	up_write(&mapping->i_mmap_rwsem);
 }
 
 /*
diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index 156b1506758a..ebcd0b4431f4 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -218,7 +218,7 @@ struct mmu_notifier_rhel7 {
  * Therefore notifier chains can only be traversed when either
  *
  * 1. mmap_sem is held.
- * 2. One of the reverse map locks is held (i_mmap_mutex or anon_vma->rwsem).
+ * 2. One of the reverse map locks is held (i_mmap_rwsem or anon_vma->rwsem).
  * 3. No other concurrent thread can access the list (release)
  */
 struct mmu_notifier {
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 816ad8e3d92f..9f312227a769 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -697,7 +697,7 @@ build_map_info(struct address_space *mapping, loff_t offset, bool is_register)
 
 		if (!prev && !more) {
 			/*
-			 * Needs GFP_NOWAIT to avoid i_mmap_mutex recursion through
+			 * Needs GFP_NOWAIT to avoid i_mmap_rwsem recursion through
 			 * reclaim. This is optimistic, no harm done if it fails.
 			 */
 			prev = kmalloc(sizeof(struct map_info),
diff --git a/mm/filemap.c b/mm/filemap.c
index 2bd5ca4e7528..950d92b6059b 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -66,16 +66,16 @@
 /*
  * Lock ordering:
  *
- *  ->i_mmap_mutex		(truncate_pagecache)
+ *  ->i_mmap_rwsem		(truncate_pagecache)
  *    ->private_lock		(__free_pte->__set_page_dirty_buffers)
  *      ->swap_lock		(exclusive_swap_page, others)
  *        ->mapping->tree_lock
  *
  *  ->i_mutex
- *    ->i_mmap_mutex		(truncate->unmap_mapping_range)
+ *    ->i_mmap_rwsem		(truncate->unmap_mapping_range)
  *
  *  ->mmap_sem
- *    ->i_mmap_mutex
+ *    ->i_mmap_rwsem
  *      ->page_table_lock or pte_lock	(various, mainly in memory.c)
  *        ->mapping->tree_lock	(arch-dependent flush_dcache_mmap_lock)
  *
@@ -89,7 +89,7 @@
  *    sb_lock			(fs/fs-writeback.c)
  *    ->mapping->tree_lock	(__sync_single_inode)
  *
- *  ->i_mmap_mutex
+ *  ->i_mmap_rwsem
  *    ->anon_vma.lock		(vma_adjust)
  *
  *  ->anon_vma.lock
@@ -109,7 +109,7 @@
  *    ->inode->i_lock		(zap_pte_range->set_page_dirty)
  *    ->private_lock		(zap_pte_range->__set_page_dirty_buffers)
  *
- * ->i_mmap_mutex
+ * ->i_mmap_rwsem
  *   ->tasklist_lock            (memory_failure, collect_procs_ao)
  */
 
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 569bfe91b2ba..b2c5fe4a8534 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3507,9 +3507,9 @@ void __unmap_hugepage_range_final(struct mmu_gather *tlb,
 	 * on its way out.  We're lucky that the flag has such an appropriate
 	 * name, and can in fact be safely cleared here. We could clear it
 	 * before the __unmap_hugepage_range above, but all that's necessary
-	 * is to clear it before releasing the i_mmap_mutex. This works
+	 * is to clear it before releasing the i_mmap_rwsem. This works
 	 * because in the context this is called, the VMA is about to be
-	 * destroyed and the i_mmap_mutex is held.
+	 * destroyed and the i_mmap_rwsem is held.
 	 */
 	vma->vm_flags &= ~VM_MAYSHARE;
 }
@@ -4448,9 +4448,9 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
 		spin_unlock(ptl);
 	}
 	/*
-	 * Must flush TLB before releasing i_mmap_mutex: x86's huge_pmd_unshare
+	 * Must flush TLB before releasing i_mmap_rwsem: x86's huge_pmd_unshare
 	 * may have cleared our pud entry and done put_page on the page table:
-	 * once we release i_mmap_mutex, another task can do the final put_page
+	 * once we release i_mmap_rwsem, another task can do the final put_page
 	 * and that page table be reused and filled with junk.
 	 */
 	flush_tlb_range(vma, start, end);
@@ -4650,7 +4650,7 @@ static bool vma_shareable(struct vm_area_struct *vma, unsigned long addr)
  * and returns the corresponding pte. While this is not necessary for the
  * !shared pmd case because we can allocate the pmd later as well, it makes the
  * code much cleaner. pmd allocation is essential for the shared case because
- * pud has to be populated inside the same i_mmap_mutex section - otherwise
+ * pud has to be populated inside the same i_mmap_rwsem section - otherwise
  * racing tasks could either miss the sharing (see huge_pte_offset) or select a
  * bad pmd for sharing.
  */
diff --git a/mm/memory.c b/mm/memory.c
index 2708bd660cbd..7e66dea08f3f 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4370,7 +4370,7 @@ restart:
 		spin_unlock(&inode->i_lock);
 	}
 
-	mutex_lock_nested(&peer->i_mmap_mutex, SINGLE_DEPTH_NESTING);
+	down_write_nested(&peer->i_mmap_rwsem, SINGLE_DEPTH_NESTING);
 	if (!peer->i_peer_file) {
 		i_mmap_unlock_write(peer);
 		goto restart;
diff --git a/mm/mmap.c b/mm/mmap.c
index 837a1acabfb9..e515507854b7 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -248,7 +248,7 @@ error:
 }
 
 /*
- * Requires inode->i_mapping->i_mmap_mutex
+ * Requires inode->i_mapping->i_mmap_rwsem
  */
 static void __remove_shared_vm_struct(struct vm_area_struct *vma,
 		struct file *file, struct address_space *mapping)
@@ -3200,7 +3200,7 @@ void exit_mmap(struct mm_struct *mm)
 
 /* Insert vm structure into process list sorted by address
  * and into the inode's i_mmap tree.  If vm_file is non-NULL
- * then i_mmap_mutex is taken here.
+ * then i_mmap_rwsem is taken here.
  */
 int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
 {
@@ -3476,7 +3476,7 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)
 		 */
 		if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags))
 			BUG();
-		mutex_lock_nest_lock(&mapping->i_mmap_mutex, &mm->mmap_sem);
+		down_write_nest_lock(&mapping->i_mmap_rwsem, &mm->mmap_sem);
 	}
 }
 
diff --git a/mm/mremap.c b/mm/mremap.c
index 1e07f34dbae4..6acb2f1c7a2e 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -117,7 +117,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
 	unsigned long len = old_end - old_addr;
 
 	/*
-	 * When need_rmap_locks is true, we take the i_mmap_mutex and anon_vma
+	 * When need_rmap_locks is true, we take the i_mmap_rwsem and anon_vma
 	 * locks to ensure that rmap will always observe either the old or the
 	 * new ptes. This is the easiest way to avoid races with
 	 * truncate_pagecache(), page migration, etc...
diff --git a/mm/rmap.c b/mm/rmap.c
index 478f384f1825..e72be32c3dae 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1716,14 +1716,14 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc)
 	 * The page lock not only makes sure that page->mapping cannot
 	 * suddenly be NULLified by truncation, it makes sure that the
 	 * structure at mapping cannot be freed and reused yet,
-	 * so we can safely take mapping->i_mmap_mutex.
+	 * so we can safely take mapping->i_mmap_rwsem.
 	 */
 	VM_BUG_ON(!PageLocked(page));
 
 	if (!mapping)
 		return ret;
 	pgoff = page_to_pgoff(page);
-	mutex_lock_nested(&mapping->i_mmap_mutex, SINGLE_DEPTH_NESTING);
+	down_write_nested(&mapping->i_mmap_rwsem, SINGLE_DEPTH_NESTING);
 	vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
 		unsigned long address = vma_address(page, vma);
 
-- 
2.26.2