[Devel] [PATCH RHEL8 COMMIT] fuse: improve bdi dirty memory limits for fuse
Konstantin Khorenko
khorenko at virtuozzo.com
Fri Apr 23 11:54:52 MSK 2021
The commit is pushed to "branch-rh8-4.18.0-240.1.1.vz8.5.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh8-4.18.0-240.1.1.vz8.5.19
------>
commit ed637d7538892a8811c7295aacfa33e531304e60
Author: Maxim Patlasov <MPatlasov at parallels.com>
Date: Fri Apr 23 11:54:52 2021 +0300
fuse: improve bdi dirty memory limits for fuse
Port diff-fuse-improve-bdi-dirty-memory-limits-for-fuse-2 from 2.6.32-*:
So far the default dirty ratio for fuse was 1% of the global dirty limit. The latter
is 20% of RAM by default. This is too small for PCS:
BdiDirtyThresh WriteRate
31664k 55,7 MB/sec
63328k 75,0 MB/sec
126436k 92,0 MB/sec
158096k 107 MB/sec
199816k 106 MB/sec
The patch increases the per-bdi dirty ratio for fuse from 1% to 20%. It can be
changed by writing to /sys/class/bdi/<bdi>/max_ratio.
The patch also introduces per-bdi min/max limits for dirty memory (measured
in pages). They are 64MB/256MB for fuse by default. They can be changed by
writing to /sys/class/bdi/<bdi>/{min|max}_dirty_pages.
Rationale:
1. For machines with plenty of RAM (>16GB) we give 256MB per fuse mount.
This should be enough to avoid BdiDirtyThresh being bottleneck.
2. For machines with scarcity of RAM (1GB or less) we give 64MB per fuse
mount. We believe it's not too much.
3. For machines with moderate RAM size we get some value in 64MB..256MB range.
E.g. in case of 4GB, BdiDirtyThresh should be (roughly) about 160MB (20% of
20% of 4GB). According to measurements cited above this is enough to saturate
PCS.
https://jira.sw.ru/browse/PSBM-13700
Signed-off-by: Maxim V. Patlasov <MPatlasov at parallels.com>
Acked-by: Pavel Emelyanov <xemul at parallels.com>
+++
fs/mm: writeback: fix per bdi dirty background threshold calculation
After patch [1] introduced upper and lower boundaries for per bdi dirty
threshold (see bdi->min_dirty_pages and max_dirty_pages), it is
incorrect to use bdi_dirty_limit() helper for calculating background
threshold. E.g. on a 16 GB host, bdi_dirty_limit() would return the
following values for a FUSE device if the upper boundary was unset:
bdi_thresh = (16 GB * 20 / 100) * 20 / 100 = 655 MB
              ^^^^^   ^^^^^^^^    ^^^^^^^^
            RAM size              bdi->max_ratio
                      vm.dirty_ratio
bdi_bg_thresh = (16 GB * 10 / 100) * 20 / 100 = 327 MB
                 ^^^^^   ^^^^^^^^    ^^^^^^^^
               RAM size              bdi->max_ratio
                         vm.dirty_background_ratio
which looks fine.
However, with the default upper threshold of 256 MB for FUSE devices,
both dirty and background thresholds will be equal to 256 MB. As a
result the background flusher will only wake up once the writer is
throttled. This obviously results in a huge write rate degradation.
To fix this issue, let's use bdi_dirty_limit() helper only for
calculating the throttle threshold, and compute the background threshold
as follows:
bdi_bg_thresh = bdi_thresh * global_background_thresh / global_thresh
https://jira.sw.ru/browse/PSBM-45497
Fixes: 2f5b9552e256d ("fuse: improve bdi dirty memory limits for fuse") [1]
Signed-off-by: Vladimir Davydov <vdavydov at virtuozzo.com>
Acked-by: Maxim Patlasov <mpatlasov at virtuozzo.com>
---
fs/fuse/inode.c | 12 +++++++---
include/linux/backing-dev-defs.h | 3 +++
include/linux/backing-dev.h | 2 ++
mm/backing-dev.c | 47 ++++++++++++++++++++++++--------------
mm/page-writeback.c | 49 ++++++++++++++++++++++++++++++++++++++--
5 files changed, 91 insertions(+), 22 deletions(-)
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 3065503aa327..667e6c33cfdf 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -1277,10 +1277,10 @@ static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb)
sb->s_bdi->capabilities = BDI_CAP_NO_ACCT_WB | BDI_CAP_STRICTLIMIT;
/*
- * For a single fuse filesystem use max 1% of dirty +
+ * For a single fuse filesystem use max 20% of dirty +
* writeback threshold.
*
- * This gives about 1M of write buffer for memory maps on a
+ * This gives about 20M of write buffer for memory maps on a
* machine with 1G and 10% dirty_ratio, which should be more
* than enough.
*
@@ -1288,7 +1288,13 @@ static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb)
*
* /sys/class/bdi/<bdi>/max_ratio
*/
- bdi_set_max_ratio(sb->s_bdi, 1);
+ bdi_set_max_ratio(sb->s_bdi, 20);
+
+ /*
+ * These values have precedence over max_ratio
+ */
+ bdi_set_max_dirty(sb->s_bdi, (256 * 1024 * 1024) / PAGE_SIZE);
+ bdi_set_min_dirty(sb->s_bdi, (64 * 1024 * 1024) / PAGE_SIZE);
return 0;
}
diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h
index f864c7c3793f..40ee29bddc13 100644
--- a/include/linux/backing-dev-defs.h
+++ b/include/linux/backing-dev-defs.h
@@ -205,6 +205,9 @@ struct backing_dev_info {
unsigned int min_ratio;
unsigned int max_ratio, max_prop_frac;
+ unsigned int min_dirty_pages;
+ unsigned int max_dirty_pages;
+
/*
* Sum of avg_write_bw of wbs with dirty inodes. > 0 if there are
* any dirty wbs, which is depended upon by bdi_has_dirty().
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index 504a2f123ad0..7fbccf522be2 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -109,6 +109,8 @@ static inline unsigned long wb_stat_error(void)
int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio);
int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio);
+int bdi_set_min_dirty(struct backing_dev_info *bdi, unsigned int min_dirty);
+int bdi_set_max_dirty(struct backing_dev_info *bdi, unsigned int max_dirty);
/*
* Flags in backing_dev_info::capability
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 9ae81bcb90a9..cc2a3c0e6ae5 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -175,43 +175,52 @@ static DEVICE_ATTR_RW(name);
BDI_SHOW(read_ahead_kb, K(bdi->ra_pages))
-static ssize_t min_ratio_store(struct device *dev,
- struct device_attribute *attr, const char *buf, size_t count)
+static inline ssize_t generic_uint_store(struct device *dev,
+ struct device_attribute *attr, const char *buf, size_t count,
+ int (*set_func) (struct backing_dev_info *, unsigned int))
{
struct backing_dev_info *bdi = dev_get_drvdata(dev);
- unsigned int ratio;
+ unsigned int val;
ssize_t ret;
- ret = kstrtouint(buf, 10, &ratio);
+ ret = kstrtouint(buf, 10, &val);
if (ret < 0)
return ret;
- ret = bdi_set_min_ratio(bdi, ratio);
+ ret = set_func(bdi, val);
if (!ret)
ret = count;
return ret;
}
+
+static ssize_t min_ratio_store(struct device *dev,
+ struct device_attribute *attr, const char *buf, size_t count)
+{
+ return generic_uint_store(dev, attr, buf, count, bdi_set_min_ratio);
+}
BDI_SHOW(min_ratio, bdi->min_ratio)
static ssize_t max_ratio_store(struct device *dev,
struct device_attribute *attr, const char *buf, size_t count)
{
- struct backing_dev_info *bdi = dev_get_drvdata(dev);
- unsigned int ratio;
- ssize_t ret;
-
- ret = kstrtouint(buf, 10, &ratio);
- if (ret < 0)
- return ret;
+ return generic_uint_store(dev, attr, buf, count, bdi_set_max_ratio);
+}
+BDI_SHOW(max_ratio, bdi->max_ratio)
- ret = bdi_set_max_ratio(bdi, ratio);
- if (!ret)
- ret = count;
+static ssize_t min_dirty_pages_store(struct device *dev,
+ struct device_attribute *attr, const char *buf, size_t count)
+{
+ return generic_uint_store(dev, attr, buf, count, bdi_set_min_dirty);
+}
+BDI_SHOW(min_dirty_pages, bdi->min_dirty_pages)
- return ret;
+static ssize_t max_dirty_pages_store(struct device *dev,
+ struct device_attribute *attr, const char *buf, size_t count)
+{
+ return generic_uint_store(dev, attr, buf, count, bdi_set_max_dirty);
}
-BDI_SHOW(max_ratio, bdi->max_ratio)
+BDI_SHOW(max_dirty_pages, bdi->max_dirty_pages)
static ssize_t stable_pages_required_show(struct device *dev,
struct device_attribute *attr,
@@ -228,6 +237,8 @@ static struct attribute *bdi_dev_attrs[] = {
&dev_attr_read_ahead_kb.attr,
&dev_attr_min_ratio.attr,
&dev_attr_max_ratio.attr,
+ &dev_attr_min_dirty_pages.attr,
+ &dev_attr_max_dirty_pages.attr,
&dev_attr_stable_pages_required.attr,
NULL,
};
@@ -854,6 +865,8 @@ static int bdi_init(struct backing_dev_info *bdi)
bdi->min_ratio = 0;
bdi->max_ratio = 100;
bdi->max_prop_frac = FPROP_FRAC_BASE;
+ bdi->min_dirty_pages = 0;
+ bdi->max_dirty_pages = 0;
INIT_LIST_HEAD(&bdi->bdi_list);
INIT_LIST_HEAD(&bdi->wb_list);
init_waitqueue_head(&bdi->wb_waitq);
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index ef85b08e228c..d37a1e7a3f0e 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -713,6 +713,41 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio)
}
EXPORT_SYMBOL(bdi_set_max_ratio);
+int bdi_set_min_dirty(struct backing_dev_info *bdi, unsigned min_dirty)
+{
+ int ret = 0;
+
+ spin_lock_bh(&bdi_lock);
+ if (min_dirty > bdi->max_dirty_pages) {
+ ret = -EINVAL;
+ } else {
+ bdi->min_dirty_pages = min_dirty;
+ }
+ spin_unlock_bh(&bdi_lock);
+
+ return ret;
+}
+EXPORT_SYMBOL(bdi_set_min_dirty);
+
+int bdi_set_max_dirty(struct backing_dev_info *bdi, unsigned max_dirty)
+{
+ int ret = 0;
+
+ if (max_dirty > get_num_physpages())
+ return -EINVAL;
+
+ spin_lock_bh(&bdi_lock);
+ if (bdi->min_dirty_pages > max_dirty) {
+ ret = -EINVAL;
+ } else {
+ bdi->max_dirty_pages = max_dirty;
+ }
+ spin_unlock_bh(&bdi_lock);
+
+ return ret;
+}
+EXPORT_SYMBOL(bdi_set_max_dirty);
+
static unsigned long dirty_freerun_ceiling(unsigned long thresh,
unsigned long bg_thresh)
{
@@ -764,6 +799,7 @@ static void mdtc_calc_avail(struct dirty_throttle_control *mdtc,
static unsigned long __wb_calc_thresh(struct dirty_throttle_control *dtc)
{
struct wb_domain *dom = dtc_dom(dtc);
+ struct backing_dev_info *bdi = dtc->wb->bdi;
unsigned long thresh = dtc->thresh;
u64 wb_thresh;
long numerator, denominator;
@@ -785,6 +821,12 @@ static unsigned long __wb_calc_thresh(struct dirty_throttle_control *dtc)
if (wb_thresh > (thresh * wb_max_ratio) / 100)
wb_thresh = thresh * wb_max_ratio / 100;
+ if (bdi->min_dirty_pages && wb_thresh < bdi->min_dirty_pages)
+ wb_thresh = min((unsigned long)bdi->min_dirty_pages, thresh);
+
+ if (bdi->max_dirty_pages && wb_thresh > bdi->max_dirty_pages)
+ wb_thresh = bdi->max_dirty_pages;
+
return wb_thresh;
}
@@ -1928,6 +1970,7 @@ bool wb_over_bg_thresh(struct bdi_writeback *wb)
struct dirty_throttle_control * const gdtc = &gdtc_stor;
struct dirty_throttle_control * const mdtc = mdtc_valid(&mdtc_stor) ?
&mdtc_stor : NULL;
+ unsigned long bdi_thresh, bdi_bg_thresh;
/*
* Similar to balance_dirty_pages() but ignores pages being written
@@ -1941,8 +1984,10 @@ bool wb_over_bg_thresh(struct bdi_writeback *wb)
if (gdtc->dirty > gdtc->bg_thresh)
return true;
- if (wb_stat(wb, WB_RECLAIMABLE) >
- wb_calc_thresh(gdtc->wb, gdtc->bg_thresh))
+ bdi_thresh = __wb_calc_thresh(gdtc);
+ bdi_bg_thresh = gdtc->thresh ? div_u64((u64)bdi_thresh * gdtc->bg_thresh,
+ gdtc->thresh) : 0;
+ if (wb_stat(wb, WB_RECLAIMABLE) > bdi_bg_thresh)
return true;
if (mdtc) {
More information about the Devel
mailing list