[Devel] [RK for VZ7] mm: Reduce access frequency to shrinker_rwsem during shrink_slab

Vasily Averin vvs at virtuozzo.com
Sun Aug 23 10:37:52 MSK 2020


Bug https://jira.sw.ru/browse/PSBM-99181 has introduced a problem: when
the kernel has opened NFS delegations and NFS server is not accessible
at the time when NFS shrinker is called, the whole shrinker list
execution gets stuck until NFS server is back. Being a problem in itself
it also introduces bigger problem - during that hang, the shrinker_rwsem
also gets locked, consequently no new mounts can be done at that time
because new superblock tries to register its own shrinker and also gets
stuck at acquiring shrinker_rwsem.

Commit 9e9e35d050955648449498827deb2d43be0564e1 is a workaround for that
problem. It is known that during single shrinker execution we do not
actually need to hold shrinker_rwsem so we release and reacquire the
rwsem for each shrinker in the list.

Because of this workaround shrink_slab function now experiences a major
slowdown, because shrinker_rwsem gets accessed for each shrinker in the
list twice. On an idle fresh-booted system shrinker_list could be
iterated up to 1600 times a second, although originally the problem was
local to only one NFS shrinker.

This patch fixes commit 9e9e35d050955648449498827deb2d43be0564e1 in a
way that before calling for up_read for shrinker_rwsem, we check that
this is really an NFS shrinker by checking NFS magic in superblock, if
it is accessible from shrinker.

https://jira.sw.ru/browse/PSBM-99181

Co-authored-by: Andrey Ryabinin <aryabinin at virtuozzo.com>
Signed-off-by: Valeriy Vdovin <valeriy.vdovin at virtuozzo.com>
Reviewed-by: Andrey Ryabinin <aryabinin at virtuozzo.com>

Changes:
  v2: Added missing 'rwsem_is_contended' check
... fixed by following ...
From: Valeriy Vdovin <valeriy.vdovin at virtuozzo.com>
Subject: [PATCH RHEL7] mm: Fixing rwsem_is_contented conditional code in shrink_slab_memcg
Date: Fri, 21 Aug 2020 14:08:44 +0300
Message-Id: <1598008124-415916-1-git-send-email-valeriy.vdovin at virtuozzo.com>

Fixes commit 38afbd5ecdd6 ("Reduce access frequency to shrinker_rwsem during shrink_slab")
that partially reverts code in shrink_slab_memcg by adding a missing line.

https://jira.sw.ru/browse/PSBM-99181
Signed-off-by: Valeriy Vdovin <valeriy.vdovin at virtuozzo.com>

RK-specific changes:
a) memcg_kmem_enabled() replaced by static_key_enabled(&memcg_kmem_enabled_key)
to avoid RK compilation error:
vmscan.o: Found a jump label at shrink_slab+0x66, key: memcg_kmem_enabled_key.
/usr/libexec/kpatch/create-diff-object: ERROR: vmscan.o:
 kpatch_regenerate_special_section: 1984: Found 1 jump label(s)
 in the patched code. Jump labels aren't currently supported.
 Use static_key_enabled() instead.

b) Use of the 'super_cache_scan' pointer in is_nfs_shrinker() was replaced by the
global variable rk_p_super_cache_scan to avoid static-to-global conversion
of super_cache_scan() in original patch to avoid RK compilation error:
ERROR: super.o: symbol info mismatch: super_cache_scan
/usr/libexec/kpatch/create-diff-object: unreconcilable difference
-------------- next part --------------
diff --git a/fs/super.c b/fs/super.c
index f131d14..4ba5df4 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -144,6 +144,8 @@ static unsigned long super_cache_scan(struct shrinker *shrink,
 	return freed;
 }
 
+unsigned long (* rk_p_super_cache_scan)(struct shrinker *, struct shrink_control *) = super_cache_scan;
+
 static unsigned long super_cache_count(struct shrinker *shrink,
 				       struct shrink_control *sc)
 {
diff --git a/mm/vmscan.c b/mm/vmscan.c
index d7082d2..ae97c21 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -453,6 +453,18 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
 	return freed;
 }
 
+extern unsigned long (* rk_p_super_cache_scan)(struct shrinker *, struct shrink_control *);
+static inline bool is_nfs_shrinker(struct shrinker *shrinker)
+{
+	struct super_block *sb = container_of(shrinker,
+		struct super_block, s_shrink);
+
+	if (shrinker->scan_objects == rk_p_super_cache_scan)
+		return sb->s_magic == NFS_SUPER_MAGIC;
+
+	return false;
+}
+
 struct shrinker *get_shrinker(struct shrinker *shrinker)
 {
 	/*
@@ -491,7 +503,7 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
 	unsigned long ret, freed = 0;
 	int i, nr_max;
 
-	if (!memcg_kmem_enabled())
+	if (!static_key_enabled(&memcg_kmem_enabled_key))
 		return 0;
 
 	if (!down_read_trylock(&shrinker_rwsem))
@@ -511,6 +523,7 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
 			.memcg = memcg,
 		};
 		struct shrinker *shrinker;
+		bool is_nfs;
 
 		shrinker = idr_find(&shrinker_idr, i);
 		if (unlikely(!shrinker)) {
@@ -518,6 +531,8 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
 			continue;
 		}
 
+		is_nfs = is_nfs_shrinker(shrinker);
+
 		/*
 		 * Take a refcnt on a shrinker so that it can't be freed or
 		 * removed from shrinker_idr (and shrinker_list). These way we
@@ -527,10 +542,16 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
 		 * take too much time to finish (e.g. on nfs). And holding
 		 * global shrinker_rwsem can block registring and unregistring
 		 * of shrinkers.
+		 *
+		 * The up_read logic should only be executed for nfs shrinker
+		 * path, because it has proven to hang. For others it should be
+		 * skipped to reduce performance penalties.
 		 */
-		if(!get_shrinker(shrinker))
-			continue;
-		up_read(&shrinker_rwsem);
+		if (is_nfs) {
+			if (!get_shrinker(shrinker))
+				continue;
+			up_read(&shrinker_rwsem);
+		}
 
 		ret = do_shrink_slab(&sc, shrinker, priority);
 		if (ret == SHRINK_EMPTY) {
@@ -565,14 +586,19 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
 		 * memcg_expand_one_shrinker_map if new shrinkers
 		 * were registred in the meanwhile.
 		 */
-		if (!down_read_trylock(&shrinker_rwsem)) {
-			freed = freed ? : 1;
+		if (is_nfs) {
+			if (!down_read_trylock(&shrinker_rwsem)) {
+				freed = freed ? : 1;
+				put_shrinker(shrinker);
+				return freed;
+			}
 			put_shrinker(shrinker);
-			return freed;
+			map = memcg_nid_shrinker_map(memcg, nid);
+			nr_max = min(shrinker_nr_max, map->nr_max);
+		} else if (rwsem_is_contended(&shrinker_rwsem)) {
+			freed = freed ? : 1;
+			break;
 		}
-		put_shrinker(shrinker);
-		map = memcg_nid_shrinker_map(memcg, nid);
-		nr_max = min(shrinker_nr_max, map->nr_max);
 	}
 unlock:
 	up_read(&shrinker_rwsem);
@@ -648,13 +674,17 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
 			.for_drop_caches = for_drop_caches,
 		};
 
+		bool is_nfs = is_nfs_shrinker(shrinker);
+
 		if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
 			sc.nid = 0;
 
 		/* See comment in shrink_slab_memcg. */
-		if(!get_shrinker(shrinker))
-			continue;
-		up_read(&shrinker_rwsem);
+		if (is_nfs) {
+			if (!get_shrinker(shrinker))
+				continue;
+			up_read(&shrinker_rwsem);
+		}
 
 		ret = do_shrink_slab(&sc, shrinker, priority);
 		if (ret == SHRINK_EMPTY)
@@ -665,12 +695,14 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
 		 * We can safely continue traverse of the shrinker_list as
 		 * our shrinker is still on the list due to refcount.
 		 */
-		if (!down_read_trylock(&shrinker_rwsem)) {
-			freed = freed ? : 1;
+		if (is_nfs) {
+			if (!down_read_trylock(&shrinker_rwsem)) {
+				freed = freed ? : 1;
+				put_shrinker(shrinker);
+				goto out;
+			}
 			put_shrinker(shrinker);
-			goto out;
 		}
-		put_shrinker(shrinker);
 	}
 
 	up_read(&shrinker_rwsem);


More information about the Devel mailing list