[Devel] [PATCH rh7 8/8] mm/swap: track shadow entries of swapped anon pages
Andrey Ryabinin
aryabinin at virtuozzo.com
Tue Feb 12 18:39:15 MSK 2019
This is mostly a copy of the page cache implementation: record refault
information when a page is swapped out and read it back on swap in.
https://pmc.acronis.com/browse/VSTOR-19037
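
For reference, a minimal userspace sketch of the shadow-entry idea this patch
applies to anon pages (names below are illustrative, not the kernel API): on
eviction the radix-tree slot keeps a "shadow" encoding an eviction counter,
and on refault the distance between the current counter and the shadow decides
whether the page is activated right away.

	/*
	 * Hypothetical model of shadow entries: pack_shadow() records the
	 * eviction "timestamp", shadow_refault() compares the refault
	 * distance against the size of the active list.
	 */
	#include <stdbool.h>
	#include <stdio.h>

	static unsigned long inactive_age;	/* bumped on every eviction */

	/* On swap-out: store the current eviction counter as the shadow. */
	static void *pack_shadow(void)
	{
		return (void *)(inactive_age++);
	}

	/* On swap-in: was the page evicted recently enough to re-activate? */
	static bool shadow_refault(void *shadow, unsigned long active_size)
	{
		unsigned long eviction = (unsigned long)shadow;
		unsigned long refault_distance = inactive_age - eviction;

		return refault_distance <= active_size;
	}

	int main(void)
	{
		unsigned long active_size = 4;
		void *shadow = pack_shadow();	/* page swapped out */

		inactive_age += 2;		/* other pages evicted meanwhile */

		if (shadow_refault(shadow, active_size))
			puts("refault within working set -> activate page");
		else
			puts("cold refault -> keep page inactive");
		return 0;
	}
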
Signed-off-by: Andrey Ryabinin <aryabinin at virtuozzo.com>
---
drivers/staging/zcache/zcache-main.c | 2 +-
include/linux/swap.h | 10 +--
mm/shmem.c | 2 +-
mm/swap_state.c | 123 ++++++++++++++++++++++++---
mm/swapfile.c | 2 +-
mm/tswap.c | 2 +-
mm/vmscan.c | 6 +-
mm/workingset.c | 3 +-
8 files changed, 125 insertions(+), 25 deletions(-)
diff --git a/drivers/staging/zcache/zcache-main.c b/drivers/staging/zcache/zcache-main.c
index 01e8446b04d0..732be2143e64 100644
--- a/drivers/staging/zcache/zcache-main.c
+++ b/drivers/staging/zcache/zcache-main.c
@@ -948,7 +948,7 @@ static int zcache_get_swap_cache_page(int type, pgoff_t offset,
/* May fail (-ENOMEM) if radix-tree node allocation failed. */
__set_page_locked(new_page);
SetPageSwapBacked(new_page);
- err = __add_to_swap_cache(new_page, entry);
+ err = __add_to_swap_cache(new_page, entry, NULL);
if (likely(!err)) {
radix_tree_preload_end();
lru_cache_add_anon(new_page);
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 7797cb88870b..2985b5f90ce5 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -455,9 +455,9 @@ extern struct address_space *swapper_spaces[];
extern unsigned long total_swapcache_pages(void);
extern void show_swap_cache_info(void);
extern int add_to_swap(struct page *, struct list_head *list);
-extern int add_to_swap_cache(struct page *, swp_entry_t, gfp_t);
-extern int __add_to_swap_cache(struct page *page, swp_entry_t entry);
-extern void __delete_from_swap_cache(struct page *);
+extern int add_to_swap_cache(struct page *, swp_entry_t, gfp_t, void **);
+extern int __add_to_swap_cache(struct page *page, swp_entry_t entry, void **shadow);
+extern void __delete_from_swap_cache(struct page *, void *shadow);
extern void delete_from_swap_cache(struct page *);
extern void free_page_and_swap_cache(struct page *);
extern void free_pages_and_swap_cache(struct page **, int);
@@ -592,12 +592,12 @@ static inline int add_to_swap(struct page *page, struct list_head *list)
}
static inline int add_to_swap_cache(struct page *page, swp_entry_t entry,
- gfp_t gfp_mask)
+ gfp_t gfp_mask, void **shadow)
{
return -1;
}
-static inline void __delete_from_swap_cache(struct page *page)
+static inline void __delete_from_swap_cache(struct page *page, void *shadow)
{
}
diff --git a/mm/shmem.c b/mm/shmem.c
index cda801a5496b..b25e1423d407 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -995,7 +995,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
if (list_empty(&info->swaplist))
list_add_tail(&info->swaplist, &shmem_swaplist);
- if (add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) {
+ if (add_to_swap_cache(page, swap, GFP_ATOMIC, NULL) == 0) {
spin_lock(&info->lock);
shmem_recalc_inode(inode);
info->swapped++;
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 83e48a7edb28..3931364e78a3 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -91,10 +91,12 @@ void show_swap_cache_info(void)
* __add_to_swap_cache resembles add_to_page_cache_locked on swapper_space,
* but sets SwapCache flag and private instead of mapping and index.
*/
-int __add_to_swap_cache(struct page *page, swp_entry_t entry)
+int __add_to_swap_cache(struct page *page, swp_entry_t entry, void **shadow)
{
int error;
+ void **slot;
struct address_space *address_space;
+ struct radix_tree_node *node;
VM_BUG_ON_PAGE(!PageLocked(page), page);
VM_BUG_ON_PAGE(PageSwapCache(page), page);
@@ -106,13 +108,46 @@ int __add_to_swap_cache(struct page *page, swp_entry_t entry)
address_space = swap_address_space(entry);
spin_lock_irq(&address_space->tree_lock);
- error = radix_tree_insert(&address_space->page_tree,
- entry.val, page);
- if (likely(!error)) {
- address_space->nrpages++;
- __inc_zone_page_state(page, NR_FILE_PAGES);
- INC_CACHE_INFO(add_total);
+ error = __radix_tree_create(&address_space->page_tree, entry.val, 0,
+ &node, &slot);
+ if (error)
+ goto out;
+ if (*slot) {
+ void *p;
+
+ p = radix_tree_deref_slot_protected(slot,
+ &address_space->tree_lock);
+ if (!radix_tree_very_exceptional_entry(p)) {
+ error = -EEXIST;
+ goto out;
+ }
+
+ address_space->nrexceptional--;
+ if (shadow)
+ *shadow = p;
+ if (node)
+ workingset_node_shadows_dec(node);
}
+ radix_tree_replace_slot(slot, page);
+ address_space->nrpages++;
+ __inc_zone_page_state(page, NR_FILE_PAGES);
+ INC_CACHE_INFO(add_total);
+ if (node) {
+ workingset_node_pages_inc(node);
+ /*
+ * Don't track node that contains actual pages.
+ *
+ * Avoid acquiring the list_lru lock if already
+ * untracked. The list_empty() test is safe as
+ * node->private_list is protected by
+ * mapping->tree_lock.
+ */
+ if (!list_empty(&node->private_list))
+ list_lru_del(&workingset_shadow_nodes,
+ &node->private_list);
+ }
+
+out:
spin_unlock_irq(&address_space->tree_lock);
if (unlikely(error)) {
@@ -131,23 +166,78 @@ int __add_to_swap_cache(struct page *page, swp_entry_t entry)
}
-int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
+int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask,
+ void **shadow)
{
int error;
error = radix_tree_maybe_preload(gfp_mask);
if (!error) {
- error = __add_to_swap_cache(page, entry);
+ error = __add_to_swap_cache(page, entry, shadow);
radix_tree_preload_end();
}
return error;
}
+static void page_swap_cache_delete(struct address_space *mapping,
+ struct page *page, void *shadow)
+{
+ struct radix_tree_node *node;
+ void **slot;
+
+ VM_BUG_ON(!PageLocked(page));
+
+ __radix_tree_lookup(&mapping->page_tree, page_private(page), &node, &slot);
+ radix_tree_clear_tags(&mapping->page_tree, node, slot);
+
+ if (!node) {
+ /*
+ * We need a node to properly account shadow
+ * entries. Don't plant any without. XXX
+ */
+ shadow = NULL;
+ }
+
+ radix_tree_replace_slot(slot, shadow);
+
+ if (shadow) {
+ mapping->nrexceptional++;
+ /*
+ * Make sure the nrexceptional update is committed before
+ * the nrpages update so that final truncate racing
+ * with reclaim does not see both counters 0 at the
+ * same time and miss a shadow entry.
+ */
+ smp_wmb();
+ }
+
+ if (!node)
+ return;
+
+ workingset_node_pages_dec(node);
+ if (shadow)
+ workingset_node_shadows_inc(node);
+ else
+ if (__radix_tree_delete_node(&mapping->page_tree, node))
+ return;
+
+ /*
+ * Track node that only contains shadow entries.
+ *
+ * Avoid acquiring the list_lru lock if already tracked. The
+ * list_empty() test is safe as node->private_list is
+ * protected by mapping->tree_lock.
+ */
+ if (!workingset_node_pages(node) && list_empty(&node->private_list)) {
+ node->private_data = mapping;
+ list_lru_add(&workingset_shadow_nodes, &node->private_list);
+ }
+}
/*
* This must be called only on pages that have
* been verified to be in the swap cache.
*/
-void __delete_from_swap_cache(struct page *page)
+void __delete_from_swap_cache(struct page *page, void *shadow)
{
swp_entry_t entry;
struct address_space *address_space;
@@ -158,7 +248,7 @@ void __delete_from_swap_cache(struct page *page)
entry.val = page_private(page);
address_space = swap_address_space(entry);
- radix_tree_delete(&address_space->page_tree, page_private(page));
+ page_swap_cache_delete(address_space, page, shadow);
set_page_private(page, 0);
ClearPageSwapCache(page);
address_space->nrpages--;
@@ -203,7 +293,7 @@ int add_to_swap(struct page *page, struct list_head *list)
* Add it to the swap cache.
*/
err = add_to_swap_cache(page, entry,
- __GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN);
+ __GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN, NULL);
if (!err) {
return 1;
@@ -232,7 +322,7 @@ void delete_from_swap_cache(struct page *page)
address_space = swap_address_space(entry);
spin_lock_irq(&address_space->tree_lock);
- __delete_from_swap_cache(page);
+ __delete_from_swap_cache(page, NULL);
spin_unlock_irq(&address_space->tree_lock);
swapcache_free(entry);
@@ -323,6 +413,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
{
struct page *found_page, *new_page = NULL;
struct address_space *swapper_space = swap_address_space(entry);
+ void *shadow = NULL;
int err;
*new_page_allocated = false;
@@ -395,9 +486,13 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
/* May fail (-ENOMEM) if radix-tree node allocation failed. */
__set_page_locked(new_page);
SetPageSwapBacked(new_page);
- err = __add_to_swap_cache(new_page, entry);
+ err = __add_to_swap_cache(new_page, entry, &shadow);
if (likely(!err)) {
radix_tree_preload_end();
+ if (shadow && workingset_refault(shadow)) {
+ SetPageActive(new_page);
+ workingset_activation(new_page);
+ }
/*
* Initiate read into locked page and return.
*/
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 14043e6bf776..ffc3981c8c60 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1208,7 +1208,7 @@ int reuse_swap_page(struct page *page)
address_space = swap_address_space(entry);
spin_lock_irq(&address_space->tree_lock);
- __delete_from_swap_cache(page);
+ __delete_from_swap_cache(page, NULL);
spin_unlock_irq(&address_space->tree_lock);
/* the page is still in use, do not uncharge */
diff --git a/mm/tswap.c b/mm/tswap.c
index 112a13d223d6..8b18bd17afcf 100644
--- a/mm/tswap.c
+++ b/mm/tswap.c
@@ -213,7 +213,7 @@ static int tswap_evict_page(struct page *page)
goto out_free_swapcache;
SetPageSwapBacked(page);
- err = __add_to_swap_cache(page, entry);
+ err = __add_to_swap_cache(page, entry, NULL);
if (err) {
ClearPageSwapBacked(page);
/* __add_to_swap_cache clears page->private on failure */
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 583ba1abfc44..fe034747bb31 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -781,8 +781,12 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
if (PageSwapCache(page)) {
swp_entry_t swap = { .val = page_private(page) };
+ void *shadow = NULL;
+
mem_cgroup_swapout(page, swap);
- __delete_from_swap_cache(page);
+
+ shadow = workingset_eviction(mapping, page);
+ __delete_from_swap_cache(page, shadow);
spin_unlock_irq(&mapping->tree_lock);
swapcache_free(swap);
} else {
diff --git a/mm/workingset.c b/mm/workingset.c
index 0b4cf96bb026..46865ad551ce 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -275,7 +275,8 @@ bool workingset_refault(void *shadow)
}
lruvec = mem_cgroup_zone_lruvec(zone, memcg);
refault = atomic_long_read(&lruvec->inactive_age);
- active_file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE);
+ active_file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE) +
+ lruvec_lru_size(lruvec, LRU_ACTIVE_ANON);
rcu_read_unlock();
/*
--
2.19.2