[Devel] [PATCH RHEL7 COMMIT] mm/tcache: use lockless lookups in tcache's page tree.
Konstantin Khorenko
khorenko at virtuozzo.com
Thu Jul 20 11:52:43 MSK 2017
The commit is pushed to "branch-rh7-3.10.0-514.26.1.vz7.33.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-514.26.1.vz7.33.11
------>
commit ba2ea161f25b4cc8c7de15be06f1001dd4d3ee24
Author: Andrey Ryabinin <aryabinin at virtuozzo.com>
Date: Thu Jul 20 12:52:43 2017 +0400
mm/tcache: use lockless lookups in tcache's page tree.
Currently tcache looks up a page in the radix tree under 'tcache_node_tree->lock'.
After the page is found, it has to be deleted from the LRU list, which requires
taking another lock, 'tcache_nodeinfo->lock', while 'tcache_node_tree->lock' is
still held.
So let's say we have gigabytes of data in tcache and the node is under memory
pressure. Then direct_io() happens, so we need to invalidate gigabytes of data.
The 'tcache_node_tree' and 'tcache_nodeinfo' locks become so contended that we
might hit a lockup:
Kernel panic - not syncing: Watchdog detected hard LOCKUP on cpu 2
[<ffffffff811f559c>] __tcache_page_tree_delete+0x1c/0x1b0
[<ffffffff811f6152>] tcache_invalidate_node_pages+0x72/0x1c0
[<ffffffff811f6716>] tcache_cleancache_invalidate_inode+0x166/0x370
[<ffffffff811f1b83>] __cleancache_invalidate_inode+0x83/0xa0
[<ffffffff8118b311>] invalidate_inode_pages2_range+0x1c1/0x430
[<ffffffff8118b597>] invalidate_inode_pages2+0x17/0x20
[<ffffffffa0217673>] dio_invalidate_cache+0x23/0xf0 [pio_direct]
[<ffffffffa0217818>] dio_prepare_merge+0xd8/0x1d0 [pio_direct]
[<ffffffffa01ef35b>] ploop1_prepare_merge+0xab/0xd0 [pfmt_ploop1]
[<ffffffffa02cd8d4>] ploop_ioctl+0x1194/0x2760 [ploop]
[<ffffffff8120a222>] ? path_openat+0xc2/0x460
[<ffffffff8120b8c2>] ? user_path_at_empty+0x72/0xc0
[<ffffffff812d4a2f>] blkdev_ioctl+0x2df/0x770
[<ffffffff81236bf1>] block_ioctl+0x41/0x50
[<ffffffff8120da75>] do_vfs_ioctl+0x255/0x4f0
[<ffffffff81218897>] ? __fd_install+0x47/0x60
[<ffffffff8120dd64>] SyS_ioctl+0x54/0xa0
[<ffffffff816449c9>] system_call_fastpath+0x16/0x1
Tcache is basically a copy of the page cache, so we can implement something
like the page-cache lockless lookup protocol for tcache.
Lookups now require only the RCU read lock to be held; the 'tcache_node_tree'
lock is needed only for deletion from the radix tree.
This also allows us to get rid of the nested-locking situation.
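For reference, the protocol boils down to a speculative reference on the
lookup side and a reference-count freeze on the delete side. Below is a
minimal sketch of that pattern; the helper names are illustrative only,
the real code is in tcache_detach_page() and __tcache_page_tree_delete()
in the diff that follows:

/* Lookup side: only rcu_read_lock() is held, no tree_lock. */
static struct page *tcache_lookup_lockless(struct tcache_node *node,
					   pgoff_t index)
{
	void **slot;
	struct page *page;

	rcu_read_lock();
repeat:
	page = NULL;
	slot = radix_tree_lookup_slot(&node->page_tree, index);
	if (slot) {
		page = radix_tree_deref_slot(slot);
		if (unlikely(!page))
			goto out;
		if (radix_tree_exception(page)) {
			if (radix_tree_deref_retry(page))
				goto repeat;	/* slot is being relocated */
			page = NULL;		/* tcache stores no exceptional entries */
			goto out;
		}
		/* Take a reference unless the refcount has already hit zero. */
		if (!page_cache_get_speculative(page))
			goto repeat;
		/* Recheck: the slot may have been reused for another page. */
		if (unlikely(page != *slot)) {
			put_page(page);
			goto repeat;
		}
	}
out:
	rcu_read_unlock();
	return page;
}

/*
 * Delete side: called with node->tree_lock held and with the reference
 * taken by the lookup above. Freezing the count to zero fails if any
 * other speculative getter holds a reference, so the delete cannot race
 * with a lockless lookup.
 */
static struct page *tcache_delete_locked(struct tcache_node *node,
					 pgoff_t index, struct page *page)
{
	/* 2 == base reference owned by the tree + our lookup reference */
	if (!page_ref_freeze(page, 2)) {
		put_page(page);
		return NULL;
	}
	return radix_tree_delete_item(&node->page_tree, index, page);
}

The recheck of the slot after page_cache_get_speculative() is what makes
the lockless lookup safe: if the page was removed and the slot reused
between the deref and the get, the lookup simply retries.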
A simple test that runs in a small memcg and just reads big files in parallel
shows some improvement with this patch.
Before:
10826.662747 task-clock (msec) # 0.039 CPUs utilized
52,324 context-switches # 0.005 M/sec
1,178 cpu-migrations # 0.109 K/sec
19,392 page-faults # 0.002 M/sec
275.361955913 seconds time elapsed
After:
11088.596109 task-clock (msec) # 0.043 CPUs utilized
51,313 context-switches # 0.005 M/sec
1,048 cpu-migrations # 0.095 K/sec
19,355 page-faults # 0.002 M/sec
255.423624009 seconds time elapsed
https://jira.sw.ru/browse/PSBM-64727
Signed-off-by: Andrey Ryabinin <aryabinin at virtuozzo.com>
---
mm/tcache.c | 206 +++++++++++++++++++++++++++++++++++++-----------------------
1 file changed, 129 insertions(+), 77 deletions(-)
diff --git a/mm/tcache.c b/mm/tcache.c
index 3778523..a77e3cf 100644
--- a/mm/tcache.c
+++ b/mm/tcache.c
@@ -15,6 +15,7 @@
#include <linux/spinlock.h>
#include <linux/rcupdate.h>
#include <linux/rwsem.h>
+#include <linux/pagemap.h>
#include <linux/rbtree.h>
#include <linux/radix-tree.h>
#include <linux/idr.h>
@@ -703,17 +704,10 @@ static inline void tcache_init_page(struct page *page,
page->index = index;
}
-static inline void tcache_hold_page(struct page *page)
-{
- get_page(page);
-}
-
static inline void tcache_put_page(struct page *page)
{
- if (put_page_testzero(page)) {
- page->mapping = NULL; /* to make free_pages_check happy */
- free_hot_cold_page(page, false);
- }
+ page->mapping = NULL;
+ free_hot_cold_page(page, false);
}
static int tcache_page_tree_insert(struct tcache_node *node, pgoff_t index,
@@ -745,6 +739,11 @@ static int tcache_page_tree_insert(struct tcache_node *node, pgoff_t index,
static struct page *__tcache_page_tree_delete(struct tcache_node *node,
pgoff_t index, struct page *page)
{
+ if (!page_ref_freeze(page, 2)) {
+ put_page(page);
+ return NULL;
+ }
+
page = radix_tree_delete_item(&node->page_tree, index, page);
if (page) {
if (!--node->nr_pages)
@@ -779,12 +778,10 @@ tcache_attach_page(struct tcache_node *node, pgoff_t index, struct page *page)
spin_lock_irqsave(&node->tree_lock, flags);
err = tcache_page_tree_insert(node, index, page);
- if (!err) {
- tcache_hold_page(page);
+ spin_unlock(&node->tree_lock);
+ if (!err)
tcache_lru_add(node->pool, page);
- }
-
- spin_unlock_irqrestore(&node->tree_lock, flags);
+ local_irq_restore(flags);
return err;
}
@@ -795,61 +792,127 @@ tcache_attach_page(struct tcache_node *node, pgoff_t index, struct page *page)
static struct page *tcache_detach_page(struct tcache_node *node, pgoff_t index,
bool reused)
{
+ void **pagep;
unsigned long flags;
struct page *page;
- local_irq_save(flags);
- page = tcache_page_tree_delete(node, index, NULL);
- if (page)
- tcache_lru_del(node->pool, page, reused);
- local_irq_restore(flags);
+ rcu_read_lock();
+repeat:
+ page = NULL;
+ pagep = radix_tree_lookup_slot(&node->page_tree, index);
+ if (pagep) {
+ page = radix_tree_deref_slot(pagep);
+ if (unlikely(!page))
+ goto out;
+ if (radix_tree_exception(page)) {
+ if (radix_tree_deref_retry(page))
+ goto repeat;
+ WARN_ON(1);
+ }
+ if (!page_cache_get_speculative(page))
+ goto repeat;
+ /*
+ * Has the page moved?
+ * This is part of the lockless pagecache protocol. See
+ * include/linux/pagemap.h for details.
+ */
+ if (unlikely(page != *pagep)) {
+ put_page(page);
+ goto repeat;
+ }
+ }
+out:
+ rcu_read_unlock();
+
+ if (page) {
+ local_irq_save(flags);
+ page = tcache_page_tree_delete(node, index, page);
+ if (page)
+ tcache_lru_del(node->pool, page, reused);
+ local_irq_restore(flags);
+ }
return page;
}
-static noinline_for_stack void
-tcache_invalidate_node_pages(struct tcache_node *node)
+static unsigned tcache_lookup(struct page **pages, struct tcache_node *node,
+ pgoff_t start, unsigned int nr_pages, pgoff_t *indices)
{
struct radix_tree_iter iter;
- struct page *page;
+ unsigned int ret = 0;
void **slot;
- pgoff_t index = 0;
- spin_lock_irq(&node->tree_lock);
+ if (!nr_pages)
+ return 0;
+
+ rcu_read_lock();
+restart:
+ radix_tree_for_each_slot(slot, &node->page_tree, &iter, start) {
+ struct page *page;
+repeat:
+ page = radix_tree_deref_slot(slot);
+ if (unlikely(!page))
+ continue;
+
+ if (radix_tree_exception(page) && radix_tree_deref_retry(page))
+ goto restart;
+
+ if (!page_cache_get_speculative(page))
+ goto repeat;
+
+ /* Has the page moved? */
+ if (unlikely(page != *slot)) {
+ page_cache_release(page);
+ goto repeat;
+ }
+
+ indices[ret] = iter.index;
+ pages[ret] = page;
+ if (++ret == nr_pages)
+ break;
+ }
+ rcu_read_unlock();
+ return ret;
+}
+
+#define TCACHE_PAGEVEC_SIZE 16
+static noinline_for_stack void
+tcache_invalidate_node_pages(struct tcache_node *node)
+{
+ pgoff_t indices[TCACHE_PAGEVEC_SIZE];
+ struct page *pages[TCACHE_PAGEVEC_SIZE];
+ pgoff_t index = 0;
+ unsigned nr_pages;
+ int i;
/*
* First forbid new page insertions - see tcache_page_tree_replace.
*/
node->invalidated = true;
- /*
- * Now truncate all pages. Be careful, because pages can still be
- * deleted from this node by the shrinker or by concurrent lookups.
- */
-restart:
- radix_tree_for_each_slot(slot, &node->page_tree, &iter, index) {
- page = radix_tree_deref_slot_protected(slot, &node->tree_lock);
- BUG_ON(!__tcache_page_tree_delete(node, page->index, page));
- tcache_lru_del(node->pool, page, false);
- tcache_put_page(page);
-
- if (need_resched()) {
- spin_unlock_irq(&node->tree_lock);
- cond_resched();
+ while ((nr_pages = tcache_lookup(pages, node, index,
+ TCACHE_PAGEVEC_SIZE, indices))) {
+ for (i = 0; i < nr_pages; i++) {
+ struct page *page = pages[i];
+
+ index = indices[i];
+
spin_lock_irq(&node->tree_lock);
- /*
- * Restart iteration over the radix tree, because the
- * current node could have been freed when we dropped
- * the lock.
- */
- index = iter.index + 1;
- goto restart;
+ page = __tcache_page_tree_delete(node, page->index, page);
+ spin_unlock(&node->tree_lock);
+
+ if (page) {
+ tcache_lru_del(node->pool, page, false);
+ local_irq_enable();
+ tcache_put_page(page);
+ } else
+ local_irq_enable();
}
+ cond_resched();
+ index++;
}
- BUG_ON(node->nr_pages != 0);
-
- spin_unlock_irq(&node->tree_lock);
+ WARN_ON(node->nr_pages != 0);
}
static noinline_for_stack void
@@ -932,12 +995,16 @@ __tcache_lru_isolate(struct tcache_nodeinfo *ni,
struct tcache_node *node;
struct page *page;
int nr_isolated = 0;
+ int nr_scanned = nr_to_isolate;
- while (nr_to_isolate > 0 && !list_empty(&pni->lru)) {
+ while (nr_to_isolate > 0 && !list_empty(&pni->lru) && nr_scanned--) {
page = list_first_entry(&pni->lru, struct page, lru);
+
+ if (unlikely(!page_cache_get_speculative(page)))
+ continue;
+
__tcache_lru_del(ni, pni, page);
- tcache_hold_page(page);
/*
* A node can be destroyed only if all its pages have been
* removed both from the tree and the LRU list. Since we are
@@ -976,7 +1043,7 @@ tcache_lru_isolate(int nid, struct page **pages, int nr_to_isolate)
if (!tcache_grab_pool(pni->pool))
goto again;
- nr = __tcache_lru_isolate(ni, pni, pages + nr_isolated, nr_to_isolate);
+ nr = __tcache_lru_isolate(ni, pni, pages, nr_to_isolate);
nr_isolated += nr;
nr_to_isolate -= nr;
@@ -984,9 +1051,6 @@ tcache_lru_isolate(int nid, struct page **pages, int nr_to_isolate)
__tcache_insert_reclaim_node(ni, pni);
tcache_put_pool(pni->pool);
-
- if (nr_to_isolate > 0)
- goto again;
out:
spin_unlock_irq(&ni->lock);
return nr_isolated;
@@ -998,18 +1062,7 @@ static bool __tcache_reclaim_page(struct page *page)
bool ret;
node = tcache_page_node(page);
- if (tcache_page_tree_delete(node, page->index, page)) {
- /*
- * We deleted the page from the tree - drop the
- * corresponding reference.
- */
- tcache_put_page(page);
- ret = true;
- } else
- /* The page was deleted by a concurrent thread - abort. */
- ret = false;
-
- /* Drop the reference taken in __tcache_lru_isolate. */
+ ret = tcache_page_tree_delete(node, page->index, page);
tcache_put_node_and_pool(node);
return ret;
}
@@ -1021,9 +1074,10 @@ static int tcache_reclaim_pages(struct page **pages, int nr)
local_irq_disable();
for (i = 0; i < nr; i++) {
- nr_reclaimed += !!__tcache_reclaim_page(pages[i]);
- /* Drop the reference taken in __tcache_lru_isolate. */
- tcache_put_page(pages[i]);
+ if (__tcache_reclaim_page(pages[i])) {
+ nr_reclaimed++;
+ tcache_put_page(pages[i]);
+ }
pages[i] = NULL;
}
local_irq_enable();
@@ -1048,10 +1102,10 @@ tcache_try_to_reclaim_page(struct tcache_pool *pool, int nid)
if (!ret)
goto out;
- if (!__tcache_reclaim_page(page)) {
- tcache_put_page(page);
+ if (!__tcache_reclaim_page(page))
page = NULL;
- }
+ else
+ page_ref_unfreeze(page, 1);
out:
local_irq_restore(flags);
return page;
@@ -1135,13 +1189,11 @@ static void tcache_cleancache_put_page(int pool_id,
cache_page = tcache_alloc_page(node->pool);
if (cache_page) {
copy_highpage(cache_page, page);
- /* cleancache does not care about failures */
- (void)tcache_attach_page(node, index, cache_page);
+ if (tcache_attach_page(node, index, cache_page))
+ if (put_page_testzero(cache_page))
+ tcache_put_page(page);
}
tcache_put_node_and_pool(node);
- if (cache_page)
- tcache_put_page(cache_page);
-
}
}