[Devel] [PATCH RHEL8 COMMIT] fuse: improve bdi dirty memory limits for fuse

Konstantin Khorenko khorenko at virtuozzo.com
Fri Apr 23 11:54:52 MSK 2021


The commit is pushed to "branch-rh8-4.18.0-240.1.1.vz8.5.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh8-4.18.0-240.1.1.vz8.5.19
------>
commit ed637d7538892a8811c7295aacfa33e531304e60
Author: Maxim Patlasov <MPatlasov at parallels.com>
Date:   Fri Apr 23 11:54:52 2021 +0300

    fuse: improve bdi dirty memory limits for fuse
    
    Port diff-fuse-improve-bdi-dirty-memory-limits-for-fuse-2 from 2.6.32-*:
    
    So far default dirty ratio for fuse was 1% of global dirty limit. The latter
    is 20% of RAM by default. This is too small for PCS:
    
    BdiDirtyThresh WriteRate
    31664k         55,7 MB/sec
    63328k         75,0 MB/sec
    126436k        92,0 MB/sec
    158096k        107 MB/sec
    199816k        106 MB/sec
    
    The patch increases per-bdi dirty ratio for fuse from 1% to 20%. It can be
    changed by writing to /sys/class/bdi/<bdi>/max_ratio.
    
    The patch also introduces per-bdi min/max limits for dirty memory (measured
    in pages). They are 64MB/256MB for fuse by default. They can be changed by
    writing to /sys/class/bdi/<bdi>/{min|max}_dirty_pages.
    
    Rationale:
    1. For machines with plenty of RAM (>16GB) we give 256MB per fuse mount.
    This should be enough to avoid BdiDirtyThresh being bottleneck.
    2. For machines with scarcity of RAM (1GB or less) we give 64MB per fuse
    mount. We believe it's not too much.
    3. For machines with moderate RAM size we get some value in 64MB..256MB range.
    E.g. in case of 4GB, BdiDirtyThresh should be (roughly) about 160MB (20% of
    20% of 4GB). According to measurements cited above this is enough to saturate
    PCS.
    
    https://jira.sw.ru/browse/PSBM-13700
    
    Signed-off-by: Maxim V. Patlasov <MPatlasov at parallels.com>
    
    Acked-by: Pavel Emelyanov <xemul at parallels.com>
    
    +++
    fs/mm: writeback: fix per bdi dirty background threshold calculation
    
    After patch [1] introduced upper and lower boundaries for per bdi dirty
    threshold (see bdi->min_dirty_pages and max_dirty_pages), it is
    incorrect to use bdi_dirty_limit() helper for calculating background
    threshold. E.g. on a 16 GB host, bdi_dirty_limit() would return the
    following values for a FUSE device if the upper boundary was unset:
    
      bdi_thresh = (16 GB * 20 / 100) * 20 / 100 = 655 MB
                    ^^^^^   ^^^^^^^^    ^^^^^^^^
                  RAM size           bdi->max_ratio
    
                        vm.dirty_ratio
    
      bdi_bg_thresh = (16 GB * 10 / 100) * 20 / 100 = 327 MB
                       ^^^^^   ^^^^^^^^    ^^^^^^^^
                     RAM size           bdi->max_ratio
    
                       vm.dirty_background_ratio
    
    which looks fine.
    
    However, with the default upper threshold of 256 MB for FUSE devices,
    both dirty and background thresholds will be equal to 256 MB. As a
    result the background flusher will only wake up once the writer is
    throttled. This obviously results in a huge write rate degradation.
    
    To fix this issue, let's use bdi_dirty_limit() helper only for
    calculating the throttle threshold, and compute the background threshold
    as follows:
    
      bdi_bg_thresh = bdi_thresh * global_background_thresh / global_thresh
    
    https://jira.sw.ru/browse/PSBM-45497
    
    mFixes: 2f5b9552e256d ("fuse: improve bdi dirty memory limits for fuse") [1]
    Signed-off-by: Vladimir Davydov <vdavydov at virtuozzo.com>
    
    Acked-by: Maxim Patlasov <mpatlasov at virtuozzo.com>
---
 fs/fuse/inode.c                  | 12 +++++++---
 include/linux/backing-dev-defs.h |  3 +++
 include/linux/backing-dev.h      |  2 ++
 mm/backing-dev.c                 | 47 ++++++++++++++++++++++++--------------
 mm/page-writeback.c              | 49 ++++++++++++++++++++++++++++++++++++++--
 5 files changed, 91 insertions(+), 22 deletions(-)

diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 3065503aa327..667e6c33cfdf 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -1277,10 +1277,10 @@ static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb)
 	sb->s_bdi->capabilities = BDI_CAP_NO_ACCT_WB | BDI_CAP_STRICTLIMIT;
 
 	/*
-	 * For a single fuse filesystem use max 1% of dirty +
+	 * For a single fuse filesystem use max 20% of dirty +
 	 * writeback threshold.
 	 *
-	 * This gives about 1M of write buffer for memory maps on a
+	 * This gives about 20M of write buffer for memory maps on a
 	 * machine with 1G and 10% dirty_ratio, which should be more
 	 * than enough.
 	 *
@@ -1288,7 +1288,13 @@ static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb)
 	 *
 	 *    /sys/class/bdi/<bdi>/max_ratio
 	 */
-	bdi_set_max_ratio(sb->s_bdi, 1);
+	bdi_set_max_ratio(sb->s_bdi, 20);
+
+	/*
+	 * These values have precedence over max_ratio
+	 */
+	bdi_set_max_dirty(sb->s_bdi, (256 * 1024 * 1024) / PAGE_SIZE);
+	bdi_set_min_dirty(sb->s_bdi, (64 * 1024 * 1024) / PAGE_SIZE);
 
 	return 0;
 }
diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h
index f864c7c3793f..40ee29bddc13 100644
--- a/include/linux/backing-dev-defs.h
+++ b/include/linux/backing-dev-defs.h
@@ -205,6 +205,9 @@ struct backing_dev_info {
 	unsigned int min_ratio;
 	unsigned int max_ratio, max_prop_frac;
 
+	unsigned int min_dirty_pages;
+	unsigned int max_dirty_pages;
+
 	/*
 	 * Sum of avg_write_bw of wbs with dirty inodes.  > 0 if there are
 	 * any dirty wbs, which is depended upon by bdi_has_dirty().
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index 504a2f123ad0..7fbccf522be2 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -109,6 +109,8 @@ static inline unsigned long wb_stat_error(void)
 
 int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio);
 int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio);
+int bdi_set_min_dirty(struct backing_dev_info *bdi, unsigned int min_dirty);
+int bdi_set_max_dirty(struct backing_dev_info *bdi, unsigned int max_dirty);
 
 /*
  * Flags in backing_dev_info::capability
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 9ae81bcb90a9..cc2a3c0e6ae5 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -175,43 +175,52 @@ static DEVICE_ATTR_RW(name);
 
 BDI_SHOW(read_ahead_kb, K(bdi->ra_pages))
 
-static ssize_t min_ratio_store(struct device *dev,
-		struct device_attribute *attr, const char *buf, size_t count)
+static inline ssize_t generic_uint_store(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t count,
+		int (*set_func) (struct backing_dev_info *, unsigned int))
 {
 	struct backing_dev_info *bdi = dev_get_drvdata(dev);
-	unsigned int ratio;
+	unsigned int val;
 	ssize_t ret;
 
-	ret = kstrtouint(buf, 10, &ratio);
+	ret = kstrtouint(buf, 10, &val);
 	if (ret < 0)
 		return ret;
 
-	ret = bdi_set_min_ratio(bdi, ratio);
+	ret = set_func(bdi, val);
 	if (!ret)
 		ret = count;
 
 	return ret;
 }
+
+static ssize_t min_ratio_store(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t count)
+{
+	return generic_uint_store(dev, attr, buf, count, bdi_set_min_ratio);
+}
 BDI_SHOW(min_ratio, bdi->min_ratio)
 
 static ssize_t max_ratio_store(struct device *dev,
 		struct device_attribute *attr, const char *buf, size_t count)
 {
-	struct backing_dev_info *bdi = dev_get_drvdata(dev);
-	unsigned int ratio;
-	ssize_t ret;
-
-	ret = kstrtouint(buf, 10, &ratio);
-	if (ret < 0)
-		return ret;
+	return generic_uint_store(dev, attr, buf, count, bdi_set_max_ratio);
+}
+BDI_SHOW(max_ratio, bdi->max_ratio)
 
-	ret = bdi_set_max_ratio(bdi, ratio);
-	if (!ret)
-		ret = count;
+static ssize_t min_dirty_pages_store(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t count)
+{
+	return generic_uint_store(dev, attr, buf, count, bdi_set_min_dirty);
+}
+BDI_SHOW(min_dirty_pages, bdi->min_dirty_pages)
 
-	return ret;
+static ssize_t max_dirty_pages_store(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t count)
+{
+	return generic_uint_store(dev, attr, buf, count, bdi_set_max_dirty);
 }
-BDI_SHOW(max_ratio, bdi->max_ratio)
+BDI_SHOW(max_dirty_pages, bdi->max_dirty_pages)
 
 static ssize_t stable_pages_required_show(struct device *dev,
 					  struct device_attribute *attr,
@@ -228,6 +237,8 @@ static struct attribute *bdi_dev_attrs[] = {
 	&dev_attr_read_ahead_kb.attr,
 	&dev_attr_min_ratio.attr,
 	&dev_attr_max_ratio.attr,
+	&dev_attr_min_dirty_pages.attr,
+	&dev_attr_max_dirty_pages.attr,
 	&dev_attr_stable_pages_required.attr,
 	NULL,
 };
@@ -854,6 +865,8 @@ static int bdi_init(struct backing_dev_info *bdi)
 	bdi->min_ratio = 0;
 	bdi->max_ratio = 100;
 	bdi->max_prop_frac = FPROP_FRAC_BASE;
+	bdi->min_dirty_pages = 0;
+	bdi->max_dirty_pages = 0;
 	INIT_LIST_HEAD(&bdi->bdi_list);
 	INIT_LIST_HEAD(&bdi->wb_list);
 	init_waitqueue_head(&bdi->wb_waitq);
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index ef85b08e228c..d37a1e7a3f0e 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -713,6 +713,41 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio)
 }
 EXPORT_SYMBOL(bdi_set_max_ratio);
 
+int bdi_set_min_dirty(struct backing_dev_info *bdi, unsigned min_dirty)
+{
+	int ret = 0;
+
+	spin_lock_bh(&bdi_lock);
+	if (min_dirty > bdi->max_dirty_pages) {
+		ret = -EINVAL;
+	} else {
+		bdi->min_dirty_pages = min_dirty;
+	}
+	spin_unlock_bh(&bdi_lock);
+
+	return ret;
+}
+EXPORT_SYMBOL(bdi_set_min_dirty);
+
+int bdi_set_max_dirty(struct backing_dev_info *bdi, unsigned max_dirty)
+{
+	int ret = 0;
+
+	if (max_dirty > get_num_physpages())
+		return -EINVAL;
+
+	spin_lock_bh(&bdi_lock);
+	if (bdi->min_dirty_pages > max_dirty) {
+		ret = -EINVAL;
+	} else {
+		bdi->max_dirty_pages = max_dirty;
+	}
+	spin_unlock_bh(&bdi_lock);
+
+	return ret;
+}
+EXPORT_SYMBOL(bdi_set_max_dirty);
+
 static unsigned long dirty_freerun_ceiling(unsigned long thresh,
 					   unsigned long bg_thresh)
 {
@@ -764,6 +799,7 @@ static void mdtc_calc_avail(struct dirty_throttle_control *mdtc,
 static unsigned long __wb_calc_thresh(struct dirty_throttle_control *dtc)
 {
 	struct wb_domain *dom = dtc_dom(dtc);
+	struct backing_dev_info *bdi = dtc->wb->bdi;
 	unsigned long thresh = dtc->thresh;
 	u64 wb_thresh;
 	long numerator, denominator;
@@ -785,6 +821,12 @@ static unsigned long __wb_calc_thresh(struct dirty_throttle_control *dtc)
 	if (wb_thresh > (thresh * wb_max_ratio) / 100)
 		wb_thresh = thresh * wb_max_ratio / 100;
 
+	if (bdi->min_dirty_pages && wb_thresh < bdi->min_dirty_pages)
+		wb_thresh = min((unsigned long)bdi->min_dirty_pages, thresh);
+
+	if (bdi->max_dirty_pages && wb_thresh > bdi->max_dirty_pages)
+		wb_thresh = bdi->max_dirty_pages;
+
 	return wb_thresh;
 }
 
@@ -1928,6 +1970,7 @@ bool wb_over_bg_thresh(struct bdi_writeback *wb)
 	struct dirty_throttle_control * const gdtc = &gdtc_stor;
 	struct dirty_throttle_control * const mdtc = mdtc_valid(&mdtc_stor) ?
 						     &mdtc_stor : NULL;
+	unsigned long bdi_thresh, bdi_bg_thresh;
 
 	/*
 	 * Similar to balance_dirty_pages() but ignores pages being written
@@ -1941,8 +1984,10 @@ bool wb_over_bg_thresh(struct bdi_writeback *wb)
 	if (gdtc->dirty > gdtc->bg_thresh)
 		return true;
 
-	if (wb_stat(wb, WB_RECLAIMABLE) >
-	    wb_calc_thresh(gdtc->wb, gdtc->bg_thresh))
+	bdi_thresh = __wb_calc_thresh(gdtc);
+	bdi_bg_thresh = gdtc->thresh ? div_u64((u64)bdi_thresh * gdtc->bg_thresh,
+					       gdtc->thresh) : 0;
+	if (wb_stat(wb, WB_RECLAIMABLE) > bdi_bg_thresh)
 		return true;
 
 	if (mdtc) {


More information about the Devel mailing list