[Devel] [PATCH RHEL7 COMMIT] mm/tcache: use lockless lookups in tcache's page tree.

Konstantin Khorenko khorenko at virtuozzo.com
Thu Jul 20 11:52:43 MSK 2017


The commit is pushed to "branch-rh7-3.10.0-514.26.1.vz7.33.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-514.26.1.vz7.33.11
------>
commit ba2ea161f25b4cc8c7de15be06f1001dd4d3ee24
Author: Andrey Ryabinin <aryabinin at virtuozzo.com>
Date:   Thu Jul 20 12:52:43 2017 +0400

    mm/tcache: use lockless lookups in tcache's page tree.
    
    Currently tcache looks up pages in the radix tree under
    'tcache_node_tree->lock'. After a page is found, it has to be deleted
    from the LRU list, which requires taking another lock,
    'tcache_nodeinfo->lock', while 'tcache_node_tree->lock' is still held.
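
    Schematically, the pre-patch invalidation loop nests the two locks;
    this is a condensed illustration based on the code removed below
    (tcache_lru_del() takes 'tcache_nodeinfo->lock' internally):

    	spin_lock_irq(&node->tree_lock);
    	radix_tree_for_each_slot(slot, &node->page_tree, &iter, index) {
    		page = radix_tree_deref_slot_protected(slot, &node->tree_lock);
    		__tcache_page_tree_delete(node, page->index, page);
    		tcache_lru_del(node->pool, page, false);	/* nested ni->lock */
    	}
    	spin_unlock_irq(&node->tree_lock);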
    
    So suppose we have gigabytes of data in tcache and the node is under
    memory pressure. Then direct_io() happens, and we need to invalidate
    gigabytes of data. 'tcache_node_tree->lock' and 'tcache_nodeinfo->lock'
    become so contended that we might hit a hard lockup:
    	Kernel panic - not syncing: Watchdog detected hard LOCKUP on cpu 2
    	 [<ffffffff811f559c>] __tcache_page_tree_delete+0x1c/0x1b0
    	 [<ffffffff811f6152>] tcache_invalidate_node_pages+0x72/0x1c0
    	 [<ffffffff811f6716>] tcache_cleancache_invalidate_inode+0x166/0x370
    	 [<ffffffff811f1b83>] __cleancache_invalidate_inode+0x83/0xa0
    	 [<ffffffff8118b311>] invalidate_inode_pages2_range+0x1c1/0x430
    	 [<ffffffff8118b597>] invalidate_inode_pages2+0x17/0x20
    	 [<ffffffffa0217673>] dio_invalidate_cache+0x23/0xf0 [pio_direct]
    	 [<ffffffffa0217818>] dio_prepare_merge+0xd8/0x1d0 [pio_direct]
    	 [<ffffffffa01ef35b>] ploop1_prepare_merge+0xab/0xd0 [pfmt_ploop1]
    	 [<ffffffffa02cd8d4>] ploop_ioctl+0x1194/0x2760 [ploop]
    	 [<ffffffff8120a222>] ? path_openat+0xc2/0x460
    	 [<ffffffff8120b8c2>] ? user_path_at_empty+0x72/0xc0
    	 [<ffffffff812d4a2f>] blkdev_ioctl+0x2df/0x770
    	 [<ffffffff81236bf1>] block_ioctl+0x41/0x50
    	 [<ffffffff8120da75>] do_vfs_ioctl+0x255/0x4f0
    	 [<ffffffff81218897>] ? __fd_install+0x47/0x60
    	 [<ffffffff8120dd64>] SyS_ioctl+0x54/0xa0
    	 [<ffffffff816449c9>] system_call_fastpath+0x16/0x1
    
    Tcache is basically a copy of the page cache, so we can implement
    something like the page cache's lockless lookup protocol for tcache.

    Lookups now require only the RCU read lock to be held; the
    'tcache_node_tree' lock is needed only for deleting a page from the
    radix tree. This also gets rid of the nested locking described above.
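
    In outline, the two halves of the protocol look like this; a condensed
    sketch of the code added below, not the exact patch (the page cache
    original is documented in include/linux/pagemap.h):

    	/* Lookup side: no tree_lock, only rcu_read_lock() held. */
    	rcu_read_lock();
    repeat:
    	pagep = radix_tree_lookup_slot(&node->page_tree, index);
    	page = radix_tree_deref_slot(pagep);
    	if (radix_tree_exception(page) && radix_tree_deref_retry(page))
    		goto repeat;		/* the slot was moved within the tree */
    	if (!page_cache_get_speculative(page))
    		goto repeat;		/* refcount was 0: page is being freed */
    	if (page != *pagep) {		/* recheck: was the page replaced? */
    		put_page(page);
    		goto repeat;
    	}
    	rcu_read_unlock();

    	/* Delete side: under node->tree_lock, freeze the refcount first. */
    	if (!page_ref_freeze(page, 2)) {	/* base ref + our lookup ref */
    		put_page(page);			/* a concurrent user won the race */
    		return NULL;
    	}
    	radix_tree_delete_item(&node->page_tree, index, page);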
    
    A simple test that runs in a small memcg and reads big files in
    parallel shows some improvement with this patch (elapsed time drops by
    roughly 7%).
    
    Before:
          10826.662747      task-clock (msec)         #    0.039 CPUs utilized
                52,324      context-switches          #    0.005 M/sec
                 1,178      cpu-migrations            #    0.109 K/sec
                19,392      page-faults               #    0.002 M/sec
    
         275.361955913 seconds time elapsed
    
    After:
          11088.596109      task-clock (msec)         #    0.043 CPUs utilized
                51,313      context-switches          #    0.005 M/sec
                 1,048      cpu-migrations            #    0.095 K/sec
                19,355      page-faults               #    0.002 M/sec
    
         255.423624009 seconds time elapsed
    
    https://jira.sw.ru/browse/PSBM-64727
    Signed-off-by: Andrey Ryabinin <aryabinin at virtuozzo.com>
---
 mm/tcache.c | 206 +++++++++++++++++++++++++++++++++++++-----------------------
 1 file changed, 129 insertions(+), 77 deletions(-)

diff --git a/mm/tcache.c b/mm/tcache.c
index 3778523..a77e3cf 100644
--- a/mm/tcache.c
+++ b/mm/tcache.c
@@ -15,6 +15,7 @@
 #include <linux/spinlock.h>
 #include <linux/rcupdate.h>
 #include <linux/rwsem.h>
+#include <linux/pagemap.h>
 #include <linux/rbtree.h>
 #include <linux/radix-tree.h>
 #include <linux/idr.h>
@@ -703,17 +704,10 @@ static inline void tcache_init_page(struct page *page,
 	page->index = index;
 }
 
-static inline void tcache_hold_page(struct page *page)
-{
-	get_page(page);
-}
-
 static inline void tcache_put_page(struct page *page)
 {
-	if (put_page_testzero(page)) {
-		page->mapping = NULL;	/* to make free_pages_check happy */
-		free_hot_cold_page(page, false);
-	}
+	page->mapping = NULL;
+	free_hot_cold_page(page, false);
 }
 
 static int tcache_page_tree_insert(struct tcache_node *node, pgoff_t index,
@@ -745,6 +739,11 @@ static int tcache_page_tree_insert(struct tcache_node *node, pgoff_t index,
 static struct page *__tcache_page_tree_delete(struct tcache_node *node,
 					      pgoff_t index, struct page *page)
 {
+	if (!page_ref_freeze(page, 2)) {
+		put_page(page);
+		return NULL;
+	}
+
 	page = radix_tree_delete_item(&node->page_tree, index, page);
 	if (page) {
 		if (!--node->nr_pages)
@@ -779,12 +778,10 @@ tcache_attach_page(struct tcache_node *node, pgoff_t index, struct page *page)
 
 	spin_lock_irqsave(&node->tree_lock, flags);
 	err = tcache_page_tree_insert(node, index, page);
-	if (!err) {
-		tcache_hold_page(page);
+	spin_unlock(&node->tree_lock);
+	if (!err)
 		tcache_lru_add(node->pool, page);
-	}
-
-	spin_unlock_irqrestore(&node->tree_lock, flags);
+	local_irq_restore(flags);
 	return err;
 }
 
@@ -795,61 +792,127 @@ tcache_attach_page(struct tcache_node *node, pgoff_t index, struct page *page)
 static struct page *tcache_detach_page(struct tcache_node *node, pgoff_t index,
 				       bool reused)
 {
+	void **pagep;
 	unsigned long flags;
 	struct page *page;
 
-	local_irq_save(flags);
-	page = tcache_page_tree_delete(node, index, NULL);
-	if (page)
-		tcache_lru_del(node->pool, page, reused);
-	local_irq_restore(flags);
+	rcu_read_lock();
+repeat:
+	page = NULL;
+	pagep = radix_tree_lookup_slot(&node->page_tree, index);
+	if (pagep) {
+		page = radix_tree_deref_slot(pagep);
+		if (unlikely(!page))
+			goto out;
+		if (radix_tree_exception(page)) {
+			if (radix_tree_deref_retry(page))
+				goto repeat;
+			WARN_ON(1);
+		}
+		if (!page_cache_get_speculative(page))
+			goto repeat;
+		/*
+		 * Has the page moved?
+		 * This is part of the lockless pagecache protocol. See
+		 * include/linux/pagemap.h for details.
+		 */
+		if (unlikely(page != *pagep)) {
+			put_page(page);
+			goto repeat;
+		}
+	}
+out:
+	rcu_read_unlock();
+
+	if (page) {
+		local_irq_save(flags);
+		page = tcache_page_tree_delete(node, index, page);
+		if (page)
+			tcache_lru_del(node->pool, page, reused);
+		local_irq_restore(flags);
+	}
 
 	return page;
 }
 
-static noinline_for_stack void
-tcache_invalidate_node_pages(struct tcache_node *node)
+static unsigned tcache_lookup(struct page **pages, struct tcache_node *node,
+			pgoff_t start, unsigned int nr_pages, pgoff_t *indices)
 {
 	struct radix_tree_iter iter;
-	struct page *page;
+	unsigned int ret = 0;
 	void **slot;
-	pgoff_t index = 0;
 
-	spin_lock_irq(&node->tree_lock);
+	if (!nr_pages)
+		return 0;
+
+	rcu_read_lock();
+restart:
+	radix_tree_for_each_slot(slot, &node->page_tree, &iter, start) {
+		struct page *page;
+repeat:
+		page = radix_tree_deref_slot(slot);
+		if (unlikely(!page))
+			continue;
+
+		if (radix_tree_exception(page) && radix_tree_deref_retry(page))
+			goto restart;
+
+		if (!page_cache_get_speculative(page))
+			goto repeat;
+
+		/* Has the page moved? */
+		if (unlikely(page != *slot)) {
+			page_cache_release(page);
+			goto repeat;
+		}
+
+		indices[ret] = iter.index;
+		pages[ret] = page;
+		if (++ret == nr_pages)
+			break;
+	}
+	rcu_read_unlock();
+	return ret;
+}
+
+#define TCACHE_PAGEVEC_SIZE 16
+static noinline_for_stack void
+tcache_invalidate_node_pages(struct tcache_node *node)
+{
+	pgoff_t indices[TCACHE_PAGEVEC_SIZE];
+	struct page *pages[TCACHE_PAGEVEC_SIZE];
+	pgoff_t index = 0;
+	unsigned nr_pages;
+	int i;
 
 	/*
 	 * First forbid new page insertions - see tcache_page_tree_replace.
 	 */
 	node->invalidated = true;
 
-	/*
-	 * Now truncate all pages. Be careful, because pages can still be
-	 * deleted from this node by the shrinker or by concurrent lookups.
-	 */
-restart:
-	radix_tree_for_each_slot(slot, &node->page_tree, &iter, index) {
-		page = radix_tree_deref_slot_protected(slot, &node->tree_lock);
-		BUG_ON(!__tcache_page_tree_delete(node, page->index, page));
-		tcache_lru_del(node->pool, page, false);
-		tcache_put_page(page);
-
-		if (need_resched()) {
-			spin_unlock_irq(&node->tree_lock);
-			cond_resched();
+	while ((nr_pages = tcache_lookup(pages, node, index,
+						TCACHE_PAGEVEC_SIZE, indices))) {
+		for (i = 0; i < nr_pages; i++) {
+			struct page *page = pages[i];
+
+			index = indices[i];
+
 			spin_lock_irq(&node->tree_lock);
-			/*
-			 * Restart iteration over the radix tree, because the
-			 * current node could have been freed when we dropped
-			 * the lock.
-			 */
-			index = iter.index + 1;
-			goto restart;
+			page = __tcache_page_tree_delete(node, page->index, page);
+			spin_unlock(&node->tree_lock);
+
+			if (page) {
+				tcache_lru_del(node->pool, page, false);
+				local_irq_enable();
+				tcache_put_page(page);
+			} else
+				local_irq_enable();
 		}
+		cond_resched();
+		index++;
 	}
 
-	BUG_ON(node->nr_pages != 0);
-
-	spin_unlock_irq(&node->tree_lock);
+	WARN_ON(node->nr_pages != 0);
 }
 
 static noinline_for_stack void
@@ -932,12 +995,16 @@ __tcache_lru_isolate(struct tcache_nodeinfo *ni,
 	struct tcache_node *node;
 	struct page *page;
 	int nr_isolated = 0;
+	int nr_scanned = nr_to_isolate;
 
-	while (nr_to_isolate > 0 && !list_empty(&pni->lru)) {
+	while (nr_to_isolate > 0 && !list_empty(&pni->lru) && nr_scanned--) {
 		page = list_first_entry(&pni->lru, struct page, lru);
+
+		if (unlikely(!page_cache_get_speculative(page)))
+			continue;
+
 		__tcache_lru_del(ni, pni, page);
 
-		tcache_hold_page(page);
 		/*
 		 * A node can be destroyed only if all its pages have been
 		 * removed both from the tree and the LRU list. Since we are
@@ -976,7 +1043,7 @@ tcache_lru_isolate(int nid, struct page **pages, int nr_to_isolate)
 	if (!tcache_grab_pool(pni->pool))
 		goto again;
 
-	nr = __tcache_lru_isolate(ni, pni, pages + nr_isolated, nr_to_isolate);
+	nr = __tcache_lru_isolate(ni, pni, pages, nr_to_isolate);
 	nr_isolated += nr;
 	nr_to_isolate -= nr;
 
@@ -984,9 +1051,6 @@ tcache_lru_isolate(int nid, struct page **pages, int nr_to_isolate)
 		__tcache_insert_reclaim_node(ni, pni);
 
 	tcache_put_pool(pni->pool);
-
-	if (nr_to_isolate > 0)
-		goto again;
 out:
 	spin_unlock_irq(&ni->lock);
 	return nr_isolated;
@@ -998,18 +1062,7 @@ static bool __tcache_reclaim_page(struct page *page)
 	bool ret;
 
 	node = tcache_page_node(page);
-	if (tcache_page_tree_delete(node, page->index, page)) {
-		/*
-		 * We deleted the page from the tree - drop the
-		 * corresponding reference.
-		 */
-		tcache_put_page(page);
-		ret = true;
-	} else
-		/* The page was deleted by a concurrent thread - abort. */
-		ret = false;
-
-	/* Drop the reference taken in __tcache_lru_isolate. */
+	ret = tcache_page_tree_delete(node, page->index, page);
 	tcache_put_node_and_pool(node);
 	return ret;
 }
@@ -1021,9 +1074,10 @@ static int tcache_reclaim_pages(struct page **pages, int nr)
 
 	local_irq_disable();
 	for (i = 0; i < nr; i++) {
-		nr_reclaimed += !!__tcache_reclaim_page(pages[i]);
-		/* Drop the reference taken in __tcache_lru_isolate. */
-		tcache_put_page(pages[i]);
+		if (__tcache_reclaim_page(pages[i])) {
+			nr_reclaimed++;
+			tcache_put_page(pages[i]);
+		}
 		pages[i] = NULL;
 	}
 	local_irq_enable();
@@ -1048,10 +1102,10 @@ tcache_try_to_reclaim_page(struct tcache_pool *pool, int nid)
 	if (!ret)
 		goto out;
 
-	if (!__tcache_reclaim_page(page)) {
-		tcache_put_page(page);
+	if (!__tcache_reclaim_page(page))
 		page = NULL;
-	}
+	else
+		page_ref_unfreeze(page, 1);
 out:
 	local_irq_restore(flags);
 	return page;
@@ -1135,13 +1189,11 @@ static void tcache_cleancache_put_page(int pool_id,
 		cache_page = tcache_alloc_page(node->pool);
 		if (cache_page) {
 			copy_highpage(cache_page, page);
-			/* cleancache does not care about failures */
-			(void)tcache_attach_page(node, index, cache_page);
+			if (tcache_attach_page(node, index, cache_page))
+				if (put_page_testzero(cache_page))
+					tcache_put_page(cache_page); /* free the page we failed to attach */
 		}
 		tcache_put_node_and_pool(node);
-		if (cache_page)
-			tcache_put_page(cache_page);
-
 	}
 }
 

