[Devel] [PATCH RHEL7 COMMIT] fs/fuse kio: share bandwidth/IOPS for prometheus stats
Konstantin Khorenko
khorenko at virtuozzo.com
Mon Nov 11 16:20:09 MSK 2019
The commit is pushed to "branch-rh7-3.10.0-1062.4.1.vz7.115.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-1062.4.1.vz7.115.12
------>
commit 98927dceeb4bdcc9dc8aca846e28633a1ad77a93
Author: Sergey Lysanov <slysanov at virtuozzo.com>
Date: Thu Nov 7 16:49:33 2019 +0300
fs/fuse kio: share bandwidth/IOPS for prometheus stats
Pass the following counters from KIO to prometheus through the debugfs "prometheus" file:
- reads_total
- read_bytes_total
- writes_total
- write_bytes_total
- flushes_total
Compatibility with the previous version of the prometheus proto is preserved -
the size of the histograms wasn't changed.
https://pmc.acronis.com/browse/VSTOR-20601
Signed-off-by: Sergey Lysanov <slysanov at virtuozzo.com>
Reviewed-by: Ildar Ismagilov <ildar.ismagilov at virtuozzo.com>
---
fs/fuse/fuse_i.h | 3 +-
fs/fuse/kio/pcs/fuse_io.c | 14 +++--
fs/fuse/kio/pcs/fuse_ktrace.h | 2 +-
fs/fuse/kio/pcs/fuse_prometheus_prot.h | 35 ++++++++---
fs/fuse/kio/pcs/pcs_cs.c | 2 +-
fs/fuse/kio/pcs/pcs_fuse_kdirect.c | 109 ++++++++++++++++++++-------------
6 files changed, 103 insertions(+), 62 deletions(-)
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 092916ce8c0e..1e9ba641922e 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -1183,7 +1183,8 @@ struct fuse_req *fuse_generic_request_alloc(struct fuse_conn *fc,
struct kmem_cache *cachep,
unsigned npages, gfp_t flags);
-void fuse_stat_account(struct fuse_conn * fc, int op, ktime_t val);
+void fuse_stat_observe(struct fuse_conn *fc, int op, ktime_t val);
+void fuse_stat_account(struct fuse_conn *fc, int op, u64 val);
int fuse_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
diff --git a/fs/fuse/kio/pcs/fuse_io.c b/fs/fuse/kio/pcs/fuse_io.c
index fe70f6c02bc0..cbee5f2eff06 100644
--- a/fs/fuse/kio/pcs/fuse_io.c
+++ b/fs/fuse/kio/pcs/fuse_io.c
@@ -47,7 +47,8 @@ static void on_read_done(struct pcs_fuse_req *r, size_t size)
clear_highpage(r->exec.io.bvec[i].bv_page);
}
}
- fuse_stat_account(pfc->fc, KFUSE_OP_READ, ktime_sub(ktime_get(), r->exec.ireq.ts));
+ fuse_stat_observe(pfc->fc, KFUSE_OP_READ, ktime_sub(ktime_get(), r->exec.ireq.ts));
+ fuse_stat_account(pfc->fc, KFUSE_OP_READ, size);
r->req.out.args[0].size = size;
fuse_read_dio_end(fi);
request_end(pfc->fc, &r->req);
@@ -58,7 +59,8 @@ static void on_sync_done(struct pcs_fuse_req *r)
struct pcs_fuse_cluster *pfc = cl_from_req(r);
DTRACE("do fuse_request_end req:%p op:%d err:%d\n", &r->req, r->req.in.h.opcode, r->req.out.h.error);
- fuse_stat_account(pfc->fc, KFUSE_OP_FSYNC, ktime_sub(ktime_get(), r->exec.ireq.ts));
+ fuse_stat_observe(pfc->fc, KFUSE_OP_FSYNC, ktime_sub(ktime_get(), r->exec.ireq.ts));
+ fuse_stat_account(pfc->fc, KFUSE_OP_FSYNC, 0);
request_end(pfc->fc, &r->req);
}
@@ -71,7 +73,8 @@ static void on_write_done(struct pcs_fuse_req *r, off_t pos, size_t size)
out->size = size;
DTRACE("do fuse_request_end req:%p op:%d err:%d\n", &r->req, r->req.in.h.opcode, r->req.out.h.error);
- fuse_stat_account(pfc->fc, KFUSE_OP_WRITE, ktime_sub(ktime_get(), r->exec.ireq.ts));
+ fuse_stat_observe(pfc->fc, KFUSE_OP_WRITE, ktime_sub(ktime_get(), r->exec.ireq.ts));
+ fuse_stat_account(pfc->fc, KFUSE_OP_WRITE, size);
fuse_write_dio_end(fi);
request_end(pfc->fc, &r->req);
}
@@ -82,7 +85,8 @@ static void on_fallocate_done(struct pcs_fuse_req *r, off_t pos, size_t size)
struct fuse_inode *fi = get_fuse_inode(r->req.io_inode);
DTRACE("do fuse_request_end req:%p op:%d err:%d\n", &r->req, r->req.in.h.opcode, r->req.out.h.error);
- fuse_stat_account(pfc->fc, KFUSE_OP_FALLOCATE, ktime_sub(ktime_get(), r->exec.ireq.ts));
+ fuse_stat_observe(pfc->fc, KFUSE_OP_FALLOCATE, ktime_sub(ktime_get(), r->exec.ireq.ts));
+ fuse_stat_account(pfc->fc, KFUSE_OP_FALLOCATE, 0);
fuse_write_dio_end(fi);
request_end(pfc->fc, &r->req);
@@ -268,7 +272,7 @@ static void falloc_req_complete(struct pcs_int_request *ireq)
spin_unlock(&di->kq_lock);
DTRACE("do fuse_request_end req:%p op:%d err:%d\n", &r->req, r->req.in.h.opcode, r->req.out.h.error);
- fuse_stat_account(pfc->fc, KFUSE_OP_FALLOCATE, ktime_sub(ktime_get(), ireq->ts));
+ fuse_stat_observe(pfc->fc, KFUSE_OP_FALLOCATE, ktime_sub(ktime_get(), ireq->ts));
fuse_write_dio_end(fi);
request_end(pfc->fc, &r->req);
diff --git a/fs/fuse/kio/pcs/fuse_ktrace.h b/fs/fuse/kio/pcs/fuse_ktrace.h
index 7cce9e26959a..45a4064aa6d0 100644
--- a/fs/fuse/kio/pcs/fuse_ktrace.h
+++ b/fs/fuse/kio/pcs/fuse_ktrace.h
@@ -19,7 +19,7 @@ struct fuse_ktrace
struct dentry *dir;
unsigned long __percpu *ovfl;
struct dentry *prometheus_dentry;
- struct kfuse_histogram * __percpu *prometheus_hist;
+ struct kfuse_metrics __percpu *prometheus_metrics;
u8 * __percpu buf;
};
diff --git a/fs/fuse/kio/pcs/fuse_prometheus_prot.h b/fs/fuse/kio/pcs/fuse_prometheus_prot.h
index e39f2337268f..2959b1e7ff14 100644
--- a/fs/fuse/kio/pcs/fuse_prometheus_prot.h
+++ b/fs/fuse/kio/pcs/fuse_prometheus_prot.h
@@ -8,23 +8,38 @@
#define KFUSE_OP_CS_WRITE_ZERO 4
#define KFUSE_OP_CS_FIEMAP 5
-#define KFUSE_OP_READ 6
-#define KFUSE_OP_WRITE 7
-#define KFUSE_OP_FSYNC 8
-#define KFUSE_OP_FALLOCATE 9
-#define KFUSE_OP_MAX 10
+#define KFUSE_OP_READ 6
+#define KFUSE_OP_WRITE 7
+#define KFUSE_OP_FSYNC 8
+#define KFUSE_OP_FALLOCATE 9
+#define KFUSE_OP_UNALIGNED_WRITE 10
+#define KFUSE_OP_UNALIGNED_READ 11
+#define KFUSE_OP_MAX 12
+/* Histograms contain latencies of all operations except unaligned
+ * writes and reads
+ */
+#define KFUSE_HISTOGRAM_MAX 10
#define KFUSE_PROM_MAX (9*5 + 2)
-struct kfuse_stat_rec
-{
+struct kfuse_histogram {
u64 buckets[KFUSE_PROM_MAX];
u64 sum;
};
-struct kfuse_histogram
-{
- struct kfuse_stat_rec metrics[KFUSE_OP_MAX];
+struct kfuse_counter {
+ u64 events;
+ u64 val_total;
+};
+
+struct kfuse_metrics {
+ /* Histograms are compatible with old version of proto
+ * between userspace and kio where the counters were skipped.
+ */
+ struct kfuse_histogram hists[KFUSE_HISTOGRAM_MAX];
+
+ /* Counters were added in 3.5 release */
+ struct kfuse_counter cnts[KFUSE_OP_MAX];
};
#endif /* __FUSE_PROMETHEUS_PROT__ */
diff --git a/fs/fuse/kio/pcs/pcs_cs.c b/fs/fuse/kio/pcs/pcs_cs.c
index c6ff456c59c2..e29039d03fea 100644
--- a/fs/fuse/kio/pcs/pcs_cs.c
+++ b/fs/fuse/kio/pcs/pcs_cs.c
@@ -289,7 +289,7 @@ void cs_log_io_times(struct pcs_int_request * ireq, struct pcs_msg * resp, unsig
struct pcs_cs_iohdr * h = (struct pcs_cs_iohdr *)msg_inline_head(resp);
int reqt = h->hdr.type != PCS_CS_SYNC_RESP ? ireq->iochunk.cmd : PCS_REQ_T_SYNC;
- fuse_stat_account(fc, reqt, ktime_sub(ktime_get(), ireq->ts_sent));
+ fuse_stat_observe(fc, reqt, ktime_sub(ktime_get(), ireq->ts_sent));
if (fc->ktrace && fc->ktrace_level >= LOG_TRACE) {
int n = 1;
struct fuse_trace_hdr * t;
diff --git a/fs/fuse/kio/pcs/pcs_fuse_kdirect.c b/fs/fuse/kio/pcs/pcs_fuse_kdirect.c
index 2bda2381bb8e..98dd0cf3ddd9 100644
--- a/fs/fuse/kio/pcs/pcs_fuse_kdirect.c
+++ b/fs/fuse/kio/pcs/pcs_fuse_kdirect.c
@@ -1350,17 +1350,8 @@ static void fuse_trace_free(struct fuse_ktrace *tr)
if (tr->prometheus_dentry) {
debugfs_remove(tr->prometheus_dentry);
}
- if (tr->prometheus_hist) {
- int cpu;
-
- for_each_possible_cpu(cpu) {
- struct kfuse_histogram ** histp;
- histp = per_cpu_ptr(tr->prometheus_hist, cpu);
- if (*histp)
- free_page((unsigned long)*histp);
- }
- free_percpu(tr->prometheus_hist);
- }
+ if (tr->prometheus_metrics)
+ free_percpu(tr->prometheus_metrics);
free_percpu(tr->buf);
debugfs_remove(tr->dir);
kfree(tr);
@@ -1408,20 +1399,20 @@ static struct rchan_callbacks relay_callbacks = {
.remove_buf_file = remove_buf_file_callback,
};
-void fuse_stat_account(struct fuse_conn * fc, int op, ktime_t val)
+void fuse_stat_observe(struct fuse_conn *fc, int op, ktime_t val)
{
struct fuse_ktrace * tr = fc->ktrace;
- BUG_ON(op >= KFUSE_OP_MAX);
+ BUG_ON(op >= KFUSE_HISTOGRAM_MAX);
if (tr) {
- struct kfuse_histogram ** histp;
+ struct kfuse_metrics *metrics;
int cpu;
cpu = get_cpu();
- histp = per_cpu_ptr(tr->prometheus_hist, cpu);
- if (histp && *histp) {
- struct kfuse_stat_rec * rec = (*histp)->metrics + op;
+ metrics = per_cpu_ptr(tr->prometheus_metrics, cpu);
+ if (metrics) {
+ struct kfuse_histogram *rec = &metrics->hists[op];
int bucket;
unsigned long long lat = ktime_to_ns(val)/1000;
@@ -1445,6 +1436,27 @@ void fuse_stat_account(struct fuse_conn * fc, int op, ktime_t val)
}
}
+void fuse_stat_account(struct fuse_conn *fc, int op, u64 val)
+{
+ struct fuse_ktrace *tr = fc->ktrace;
+
+ BUG_ON(op >= KFUSE_OP_MAX);
+
+ if (tr) {
+ struct kfuse_metrics *metrics;
+ int cpu;
+
+ cpu = get_cpu();
+ metrics = per_cpu_ptr(tr->prometheus_metrics, cpu);
+ if (metrics) {
+ struct kfuse_counter *cnt = &metrics->cnts[op];
+ cnt->val_total += val;
+ ++cnt->events;
+ }
+ put_cpu();
+ }
+}
+
static int prometheus_file_open(struct inode *inode, struct file *filp)
{
struct fuse_ktrace * tr = inode->i_private;
@@ -1465,48 +1477,57 @@ static int prometheus_file_release(struct inode *inode, struct file *filp)
return 0;
}
+/* NOTE: old versions of userspace could read only histograms */
static ssize_t prometheus_file_read(struct file *filp,
char __user *buffer,
size_t count,
loff_t *ppos)
{
- struct fuse_ktrace * tr = filp->private_data;
- struct kfuse_histogram * hist;
+ struct fuse_ktrace *tr = filp->private_data;
+ struct kfuse_metrics *stats;
int cpu;
- if (*ppos >= sizeof(struct kfuse_histogram))
+ if (*ppos >= sizeof(struct kfuse_metrics))
return 0;
- if (*ppos + count > sizeof(struct kfuse_histogram))
- count = sizeof(struct kfuse_histogram) - *ppos;
+ if (*ppos + count > sizeof(struct kfuse_metrics))
+ count = sizeof(struct kfuse_metrics) - *ppos;
- hist = (void*)get_zeroed_page(GFP_KERNEL);
- if (!hist)
+ stats = (void *)get_zeroed_page(GFP_KERNEL);
+ BUILD_BUG_ON(sizeof(*stats) > PAGE_SIZE);
+ if (!stats)
return -ENOMEM;
- if (!tr->prometheus_hist)
+ if (!tr->prometheus_metrics)
return -EINVAL;
for_each_possible_cpu(cpu) {
- struct kfuse_histogram ** histp;
+ struct kfuse_metrics *m;
- histp = per_cpu_ptr(tr->prometheus_hist, cpu);
- if (histp && *histp) {
+ m = per_cpu_ptr(tr->prometheus_metrics, cpu);
+ if (m) {
int i, k;
- for (i = 0; i < KFUSE_OP_MAX; i++) {
+ /* aggregate histograms from each cpu */
+ for (i = 0; i < KFUSE_HISTOGRAM_MAX; i++) {
for (k = 0; k < KFUSE_PROM_MAX; k++) {
- hist->metrics[i].buckets[k] += (*histp)->metrics[i].buckets[k];
+ stats->hists[i].buckets[k] += m->hists[i].buckets[k];
}
- hist->metrics[i].sum += (*histp)->metrics[i].sum;
+ stats->hists[i].sum += m->hists[i].sum;
+ }
+
+ /* aggregate counters from each cpu */
+ for (i = 0; i < KFUSE_OP_MAX; i++) {
+ stats->cnts[i].events += m->cnts[i].events;
+ stats->cnts[i].val_total += m->cnts[i].val_total;
}
}
}
- if (copy_to_user(buffer, (char*)hist + *ppos, count))
+ if (copy_to_user(buffer, (char *)stats + *ppos, count))
count = -EFAULT;
else
*ppos += count;
- free_page((unsigned long)hist);
+ free_page((unsigned long)stats);
return count;
}
@@ -1522,7 +1543,8 @@ static int fuse_ktrace_setup(struct fuse_conn * fc)
struct fuse_ktrace * tr = NULL;
struct fuse_ktrace * old_tr;
struct dentry * dir;
- struct kfuse_histogram * __percpu * hist;
+ struct kfuse_metrics __percpu * metrics;
+ int cpu;
char name[16];
if (!fuse_trace_root)
@@ -1554,19 +1576,18 @@ static int fuse_ktrace_setup(struct fuse_conn * fc)
tr->prometheus_dentry = debugfs_create_file("prometheus", S_IFREG|0444, dir, tr,
&prometheus_file_operations);
- hist = (void*)alloc_percpu(void *);
- if (hist) {
- int cpu;
- BUILD_BUG_ON(sizeof(struct kfuse_histogram) > PAGE_SIZE);
+ ret = -ENOMEM;
- for_each_possible_cpu(cpu) {
- struct kfuse_histogram ** histp;
- histp = per_cpu_ptr(hist, cpu);
- *histp = (void*)get_zeroed_page(GFP_KERNEL);
- }
- tr->prometheus_hist = hist;
+ metrics = alloc_percpu(struct kfuse_metrics);
+ if (!metrics)
+ goto err;
+ for_each_possible_cpu(cpu) {
+ struct kfuse_metrics *m;
+ m = per_cpu_ptr(metrics, cpu);
+ memset(m, 0, sizeof(*m));
}
+ tr->prometheus_metrics = metrics;
tr->buf = __alloc_percpu(KTRACE_LOG_BUF_SIZE, 16);
More information about the Devel
mailing list