[Devel] [PATCH RHEL7 COMMIT] fuse kio: Add pcs engine combo v0.8
Konstantin Khorenko
khorenko at virtuozzo.com
Mon Feb 19 14:22:33 MSK 2018
The commit is pushed to "branch-rh7-3.10.0-693.17.1.vz7.45.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-693.17.1.vz7.43.7
------>
commit f51eb62d39b6ffe05458cb548f4bd82bc6f47bc5
Author: Dmitry Monakhov <dmonakhov at openvz.org>
Date: Mon Feb 19 14:22:33 2018 +0300
fuse kio: Add pcs engine combo v0.8
https://jira.sw.ru/browse/PSBM-80680
Signed-off-by: Dmitry Monakhov <dmonakhov at openvz.org>
---
fs/fuse/Kconfig | 6 +
fs/fuse/Makefile | 11 +
fs/fuse/kio/pcs/fuse_io.c | 168 ++
fs/fuse/kio/pcs/log.h | 45 +
fs/fuse/kio/pcs/pcs_align.h | 18 +
fs/fuse/kio/pcs/pcs_client_types.h | 164 ++
fs/fuse/kio/pcs/pcs_cluster.c | 332 ++++
fs/fuse/kio/pcs/pcs_cluster.h | 106 ++
fs/fuse/kio/pcs/pcs_cluster_core.c | 214 +++
fs/fuse/kio/pcs/pcs_cs.c | 1067 +++++++++++
fs/fuse/kio/pcs/pcs_cs.h | 182 ++
fs/fuse/kio/pcs/pcs_cs_prot.h | 125 ++
fs/fuse/kio/pcs/pcs_error.h | 189 ++
fs/fuse/kio/pcs/pcs_flow_detect.h | 7 +
fs/fuse/kio/pcs/pcs_flow_detect_stub.h | 76 +
fs/fuse/kio/pcs/pcs_fuse_kdirect.c | 742 ++++++++
fs/fuse/kio/pcs/pcs_ioctl.h | 85 +
fs/fuse/kio/pcs/pcs_map.c | 2999 +++++++++++++++++++++++++++++++
fs/fuse/kio/pcs/pcs_map.h | 264 +++
fs/fuse/kio/pcs/pcs_mds_prot.h | 1335 ++++++++++++++
fs/fuse/kio/pcs/pcs_perfcounters.h | 7 +
fs/fuse/kio/pcs/pcs_perfcounters_stub.h | 30 +
fs/fuse/kio/pcs/pcs_prot_types.h | 451 +++++
fs/fuse/kio/pcs/pcs_req.c | 116 ++
fs/fuse/kio/pcs/pcs_req.h | 320 ++++
fs/fuse/kio/pcs/pcs_rpc.c | 1314 ++++++++++++++
fs/fuse/kio/pcs/pcs_rpc.h | 290 +++
fs/fuse/kio/pcs/pcs_rpc_prot.h | 97 +
fs/fuse/kio/pcs/pcs_sock_io.c | 702 ++++++++
fs/fuse/kio/pcs/pcs_sock_io.h | 236 +++
fs/fuse/kio/pcs/pcs_timer.h | 19 +
fs/fuse/kio/pcs/pcs_types.h | 38 +
32 files changed, 11755 insertions(+)
diff --git a/fs/fuse/Kconfig b/fs/fuse/Kconfig
index a0591e4b3a04..433a39957c9d 100644
--- a/fs/fuse/Kconfig
+++ b/fs/fuse/Kconfig
@@ -39,3 +39,9 @@ config FUSE_KIO_NULLIO
depends on FUSE_FS
help
This FUSE extension allows to handle io requests directly inside kernel
+
+config FUSE_KIO_PCS
+ tristate "Enable kdirect PCS io engine"
+ depends on FUSE_FS
+ help
+ This FUSE extension allows forwarding IO requests directly to PCS
diff --git a/fs/fuse/Makefile b/fs/fuse/Makefile
index f7500f0e832e..cdefac9c4fbe 100644
--- a/fs/fuse/Makefile
+++ b/fs/fuse/Makefile
@@ -11,4 +11,15 @@ fuse_kio_noop-objs := kio/kio_noop.o
obj-$(CONFIG_FUSE_KIO_NULLIO) += fuse_kio_nullio.o
fuse_kio_nullio-objs := kio/kio_nullio.o
+obj-$(CONFIG_FUSE_KIO_PCS) += fuse_kio_pcs.o
+fuse_kio_pcs-objs := kio/pcs/pcs_fuse_kdirect.o \
+ kio/pcs/pcs_sock_io.o \
+ kio/pcs/pcs_rpc.o \
+ kio/pcs/pcs_req.o \
+ kio/pcs/pcs_map.o \
+ kio/pcs/pcs_cluster.o \
+ kio/pcs/pcs_cluster_core.o \
+ kio/pcs/pcs_cs.o \
+ kio/pcs/fuse_io.o
+
fuse-objs := dev.o dir.o file.o inode.o control.o
diff --git a/fs/fuse/kio/pcs/fuse_io.c b/fs/fuse/kio/pcs/fuse_io.c
new file mode 100644
index 000000000000..c9eaa8d453db
--- /dev/null
+++ b/fs/fuse/kio/pcs/fuse_io.c
@@ -0,0 +1,168 @@
+#include <net/sock.h>
+#include <linux/net.h>
+#include <linux/kthread.h>
+#include <linux/types.h>
+#include <linux/pagemap.h>
+
+#include "pcs_types.h"
+#include "pcs_sock_io.h"
+#include "pcs_rpc.h"
+#include "pcs_sock_io.h"
+#include "pcs_req.h"
+#include "pcs_map.h"
+#include "pcs_cs.h"
+#include "pcs_cluster.h"
+#include "log.h"
+
+#include "../../fuse_i.h"
+
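+/* Completion of the API-level internal request: translate a PCS error into
+ * the PCS_REQ_F_ERROR/PCS_REQ_F_NOSPACE flags and invoke the api request's
+ * complete() callback.
+ */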
+static void intreq_complete(struct pcs_int_request *ireq)
+{
+ pcs_api_iorequest_t *req = ireq->apireq.req;
+
+ BUG_ON(ireq->type != PCS_IREQ_API);
+
+ if (pcs_if_error(&ireq->error)) {
+ req->flags |= PCS_REQ_F_ERROR;
+ if (ireq->error.value == PCS_ERR_NO_STORAGE ||
+ ireq->error.value == PCS_ERR_CSD_LACKING)
+ req->flags |= PCS_REQ_F_NOSPACE;
+ }
+ req->complete(req);
+}
+
+static void on_read_done(struct pcs_fuse_req *r, size_t size)
+{
+ struct pcs_fuse_cluster *pfc = cl_from_req(r);
+
+ DTRACE("do fuse_request_end req:%p op:%d err:%d\n", &r->req, r->req.in.h.opcode, r->req.out.h.error);
+ r->req.out.args[0].size = size;
+ request_end(pfc->fc, &r->req);
+}
+
+static void on_sync_done(struct pcs_fuse_req *r)
+{
+ struct pcs_fuse_cluster *pfc = cl_from_req(r);
+
+ DTRACE("do fuse_request_end req:%p op:%d err:%d\n", &r->req, r->req.in.h.opcode, r->req.out.h.error);
+ request_end(pfc->fc, &r->req);
+}
+
+static void on_write_done(struct pcs_fuse_req *r, off_t pos, size_t size)
+{
+ struct fuse_write_out *out = &r->req.misc.write.out;
+ struct pcs_fuse_cluster *pfc = cl_from_req(r);
+
+ out->size = size;
+
+ DTRACE("do fuse_request_end req:%p op:%d err:%d\n", &r->req, r->req.in.h.opcode, r->req.out.h.error);
+ request_end(pfc->fc, &r->req);
+}
+
+static void req_get_iter(void *data, unsigned int offset, struct iov_iter *it)
+{
+ struct pcs_fuse_req *r = data;
+
+ iov_iter_init_bvec(it, r->exec.io.bvec, r->exec.io.num_bvecs, r->exec.io.req.size, 0);
+ iov_iter_advance(it, offset);
+}
+
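+/* Build the bvec array describing the request payload: either reuse the bvec
+ * array already attached to the fuse request or construct one from its page
+ * descriptors. When zeroing is requested, pages that are not fully covered
+ * (bv_len < PAGE_SIZE) are cleared beforehand.
+ */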
+static inline void set_io_buff(struct pcs_fuse_req *r, off_t offset, size_t size,
+ int is_bvec, int zeroing)
+{
+ int i;
+ size_t count = 0;
+ if (is_bvec) {
+ r->exec.io.bvec = r->req.bvec;
+ r->exec.io.num_bvecs = r->req.num_bvecs;
+ } else {
+ r->exec.io.bvec = r->exec.io.inline_bvec;
+ r->exec.io.num_bvecs = r->req.num_pages;
+ for (i = 0; i < r->req.num_pages && count < size; i++) {
+ r->exec.io.bvec[i].bv_page = r->req.pages[i];
+ r->exec.io.bvec[i].bv_offset = r->req.page_descs[i].offset;
+ r->exec.io.bvec[i].bv_len = r->req.page_descs[i].length;
+ count += r->exec.io.bvec[i].bv_len;
+ }
+ }
+ count = 0;
+ for (i = 0; i < r->exec.io.num_bvecs; i++) {
+ count += r->exec.io.bvec[i].bv_len;
+ if (zeroing && r->exec.io.bvec[i].bv_len < PAGE_SIZE)
+ clear_highpage(r->exec.io.bvec[i].bv_page);
+ }
+ BUG_ON(size > count);
+ r->exec.io.req.pos = offset;
+ r->exec.io.req.size = size;
+}
+
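+/* Fill in the pcs_api_iorequest_t embedded in the fuse request and the inline
+ * internal request (PCS_IREQ_API) that the cluster core will process.
+ */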
+static void prepare_io_(struct pcs_fuse_req *r, unsigned short type, off_t offset, size_t size,
+ void (*complete)(struct _pcs_api_iorequest_t *))
+{
+ /* Use inline request structure */
+ struct pcs_int_request *ireq = &r->exec.ireq;
+
+ TRACE("INIT r(%p) ireq:%p {%ld, %ld}\n", r, ireq, offset, size);
+
+ /* Initialize IO request */
+ switch (type)
+ {
+ case PCS_REQ_T_READ:
+ BUG_ON(r->req.out.argbvec && r->req.out.argpages);
+ set_io_buff(r, offset, size, r->req.out.argbvec, r->req.out.page_zeroing);
+ break;
+ case PCS_REQ_T_WRITE:
+ BUG_ON(r->req.in.argbvec && r->req.in.argpages);
+ set_io_buff(r, offset, size, r->req.in.argbvec, 0);
+ break;
+ }
+
+ r->exec.io.req.type = type;
+ r->exec.io.req.datasource = r;
+ r->exec.io.req.get_iter = req_get_iter;
+ r->exec.io.req.complete = complete;
+
+ /* Initialize internal request structure */
+ ireq->type = PCS_IREQ_API;
+ ireq->apireq.req = &r->exec.io.req;
+ ireq->complete_cb = intreq_complete;
+ ireq->completion_data.parent = 0;
+ ireq->completion_data.ctx = r;
+ ireq->completion_data.priv = r;
+}
+
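+/* IO request completion: map the PCS_REQ_F_* error flags to an errno for fuse
+ * and finish the fuse request according to the operation type.
+ */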
+static void ioreq_complete(pcs_api_iorequest_t *ioreq)
+{
+ struct pcs_fuse_req *r = ioreq->datasource;
+
+ BUG_ON(ioreq != &r->exec.io.req);
+
+ if (ioreq->flags & PCS_REQ_F_ERROR) {
+ if (ioreq->flags & PCS_REQ_F_NOSPACE)
+ r->req.out.h.error = -ENOSPC;
+ else
+ r->req.out.h.error = -EIO;
+ } else {
+ r->req.out.h.error = 0;
+ }
+
+ switch (ioreq->type) {
+ case PCS_REQ_T_READ:
+ on_read_done(r, ioreq->size);
+ break;
+ case PCS_REQ_T_WRITE:
+ on_write_done(r, ioreq->pos, ioreq->size);
+ break;
+ case PCS_REQ_T_SYNC:
+ on_sync_done(r);
+ break;
+ default:
+ BUG();
+ }
+
+}
+
+void pcs_fuse_prep_io(struct pcs_fuse_req *r, unsigned short type, off_t offset, size_t size)
+{
+ prepare_io_(r, type, offset, size, ioreq_complete);
+}
diff --git a/fs/fuse/kio/pcs/log.h b/fs/fuse/kio/pcs/log.h
new file mode 100644
index 000000000000..ee524a8b7a34
--- /dev/null
+++ b/fs/fuse/kio/pcs/log.h
@@ -0,0 +1,45 @@
+#ifndef __PCSLOG_H__
+#define __PCSLOG_H__
+
+#include <linux/printk.h>
+
+/*
+ * Log level values and flags
+ */
+#define LOG_ERR 0
+#define LOG_WARN 1
+#define LOG_INFO 2
+#define LOG_DEBUG 4
+/* The high debug levels are used for dumping the system state */
+#define LOG_DEBUG2 5
+#define LOG_DEBUG3 6
+/* Tracing levels */
+#define LOG_TRACE 7
+#define LOG_DEBUG4 8
+#define LOG_DEBUG5 9
+#define LOG_LEVEL_MAX LOG_DEBUG5
+
+
+#define __PCS_DEBUG__ 1
+#define __PCS_DTRACE__ 1
+
+#ifndef __PCS_DEBUG__
+#define pcs_log(level, fmt, ...)
+#define TRACE(fmt, ...) do {} while (0)
+#define DTRACE(fmt, ...) do {} while (0)
+#else
+static int pcs_loglevel __attribute__ ((unused)) = LOG_DEBUG;
+#define pcs_log(level, fmt, args...) do \
+ { \
+ if (level <= pcs_loglevel) \
+ pr_debug(fmt , ##args); \
+ } while (0)
+#define TRACE(fmt, args...) trace_printk("%d: " fmt "\n", __LINE__, ## args)
+
+#ifndef __PCS_DTRACE__
+#define DTRACE(fmt, ...) do {} while (0)
+#else
+#define DTRACE(fmt, args...) trace_printk("%d: " fmt "\n", __LINE__, ## args)
+#endif
+#endif
+#endif /* __PCSLOG_H__ */
diff --git a/fs/fuse/kio/pcs/pcs_align.h b/fs/fuse/kio/pcs/pcs_align.h
new file mode 100644
index 000000000000..8dac73cb9713
--- /dev/null
+++ b/fs/fuse/kio/pcs/pcs_align.h
@@ -0,0 +1,18 @@
+#ifndef __PCS_ALIGN_H__
+#define __PCS_ALIGN_H__
+
+#include "pcs_types.h"
+
+/* ----- helpers ----- */
+
+#if defined(__GNUC__) || defined(__clang__)
+
+#define __pre_aligned(x)
+#define __pre_packed
+#define __unaligned __attribute__((packed, may_alias))
+#endif
+
+#define PCS_ALIGN_TO(sz, align) (((sz)+(align)-1)&~((align)-1))
+#define PCS_ALIGN(sz) PCS_ALIGN_TO(sz, 8)
+
+#endif /* __PCS_ALIGN_H__ */
diff --git a/fs/fuse/kio/pcs/pcs_client_types.h b/fs/fuse/kio/pcs/pcs_client_types.h
new file mode 100644
index 000000000000..3bffd4992221
--- /dev/null
+++ b/fs/fuse/kio/pcs/pcs_client_types.h
@@ -0,0 +1,164 @@
+#ifndef _PCS_CLIENT_TYPES_H_
+#define _PCS_CLIENT_TYPES_H_ 1
+
+#include "pcs_prot_types.h"
+#include "pcs_mds_prot.h"
+#include "pcs_flow_detect.h"
+
+/* Lease values. It is a value, not a bitmask. */
+#define PCS_LEASE_NONE 0
+#define PCS_LEASE_READ 1
+#define PCS_LEASE_WRITE 2
+#define PCS_LEASE_VALIDATE 3
+
+struct pcs_dentry_name {
+ const char *data;
+ int len;
+};
+
+struct pcs_dentry_id {
+ PCS_FILE_ID_T parent;
+ struct pcs_dentry_name name;
+};
+
+struct pcs_map_set {
+ struct list_lru lru;
+ struct list_lru dirty_lru;
+ struct list_head dirty_queue;
+ spinlock_t lock;
+ atomic_t count;
+ atomic_t dirty_count;
+ int map_thresh;
+ int map_dirty_thresh;
+ int map_max;
+ struct shrinker shrinker;
+
+ /* TODO: temporarily disabled */
+ struct pcs_flow_table_global ftab;
+};
+
+struct pcs_mapping {
+ struct pcs_cluster_core *cluster;
+ unsigned chunk_size_bits;
+ unsigned long nrmaps;
+ struct radix_tree_root map_tree; /* GFP_ATOMIC */
+ spinlock_t map_lock;
+ struct pcs_flow_table ftab;
+};
+
+struct fuse_inode;
+struct pcs_dentry_info {
+ struct pcs_dentry_id id;
+ struct pcs_mds_fileinfo fileinfo;
+ PCS_FILETIME_T local_mtime;
+ struct pcs_mapping mapping;
+ struct pcs_cluster_core *cluster;
+ struct fuse_inode *inode;
+};
+
+static inline void pcs_clear_fileinfo(struct pcs_dentry_info *i)
+{
+ struct pcs_mds_fileinfo *mi = (struct pcs_mds_fileinfo *)&i->fileinfo;
+
+ memset(mi, 0, sizeof(*mi));
+}
+
+static inline void pcs_set_fileinfo(struct pcs_dentry_info *i, const struct pcs_mds_fileinfo *finfo)
+{
+ struct pcs_mds_fileinfo *mi = &i->fileinfo;
+
+ *mi = *finfo;
+
+ if (mi->sys.stripe_depth == 0) {
+ mi->sys.stripe_depth = 1;
+ mi->sys.strip_width = mi->sys.chunk_size;
+ }
+ i->mapping.chunk_size_bits = ilog2(mi->sys.chunk_size);
+
+}
+
+/* Size constants */
+#define PCS_MAX_SYMLINK_SIZE 4095
+#define PCS_DFLT_MSS_WRITE (64*1024)
+#define PCS_DFLT_MSS_READ (128*1024)
+#define PCS_DFLT_MSS_LOCAL (512*1024)
+
+/* Minimal delay before retrying failed operation. */
+#define PCS_ERROR_DELAY 200
+/* Maximum delay before retrying failed operation. */
+#define PCS_ERROR_DELAY_MAX 5000
+#define PCS_LEASE_RETRY 3
+
+#define PCS_INFO_DIR_COMPAT ".pstorage.info"
+#define PCS_INFO_DIR ".vstorage.info"
+
+/* Special magic suffix. readlink() on a name with such a suffix from a fuse-mounted pcs
+ * gives the URI of the file, which can be accessed via the pcs api. If the file is a pstorage symlink,
+ * it returns its contents to run it through the VFS layer again: we cannot do this internally.
+ */
+#define PCS_API_URI_SUFFIX "#._PSTORAGE_URI_"
+
+enum {
+ PCS_REQ_T_READ = 0,
+ PCS_REQ_T_WRITE = 1,
+ PCS_REQ_T_SYNC = 2,
+};
+
+/* Request flags */
+#define PCS_REQ_F_ERROR 2
+#define PCS_REQ_F_NOSPACE 4
+#define PCS_REQ_F_CACHED 0x10
+
+struct iov_iter;
+typedef struct _pcs_api_iorequest_t {
+ off_t pos;
+ size_t size;
+ unsigned short type;
+ unsigned short flags;
+
+ void *datasource;
+ void (*get_iter)(void *datasource, unsigned int offset, struct iov_iter *it);
+
+ void (*complete)(struct _pcs_api_iorequest_t *);
+} pcs_api_iorequest_t;
+
+typedef struct _pcs_api_csconnreq_t {
+ PCS_NODE_ID_T id; /* CS id */
+ PCS_NET_ADDR_T addr; /* CS addr */
+ int error; /* pcs_errors.h */
+ void (*complete)(struct _pcs_api_csconnreq_t *, int);
+} pcs_api_csconnreq_t;
+
+/*
+ * Useful macros
+ */
+
+#define PCS_FILE_ID_FMT "[%08llx]"
+#define PCS_FILE_ID_ARGS(id) (unsigned long long)(id)
+#define DENTRY_NAME_FMT "%*.*s"
+#define DENTRY_FMT PCS_FILE_ID_FMT "/" DENTRY_NAME_FMT
+#define DENTRY_NAME_ARGS(n) (n).len, (n).len, (n).data
+#define DENTRY_ID_ARGS(id) PCS_FILE_ID_ARGS((id).parent), DENTRY_NAME_ARGS((id).name)
+#define DENTRY_ARGS(de) DENTRY_ID_ARGS(((struct pcs_dentry_info *)(de))->id)
+
+#define DENTRY_SIZE(de) ((de)->fileinfo.attr.size)
+#define DENTRY_CHUNK_SIZE(de) ((de)->fileinfo.sys.chunk_size)
+#define DENTRY_CHUNK_SIZE_BITS(de) ((de)->mapping.chunk_size_bits)
+
+void pcs_mapset_limit(struct pcs_map_set *maps, int limit);
+
+
+/* Inode id comparison function */
+static inline int pcs_dentry_id_cmp(struct pcs_dentry_id const *a, struct pcs_dentry_id const *b)
+{
+ int res;
+ res = memcmp(&a->parent, &b->parent, sizeof(a->parent));
+ if (res)
+ return res;
+ res = a->name.len - b->name.len;
+ if (res)
+ return res;
+ return memcmp(a->name.data, b->name.data, a->name.len);
+}
+
+#endif /* _PCS_CLIENT_TYPES_H_ */
diff --git a/fs/fuse/kio/pcs/pcs_cluster.c b/fs/fuse/kio/pcs/pcs_cluster.c
new file mode 100644
index 000000000000..7a9af9683e5e
--- /dev/null
+++ b/fs/fuse/kio/pcs/pcs_cluster.c
@@ -0,0 +1,332 @@
+#include <net/sock.h>
+#include <linux/net.h>
+#include <linux/kthread.h>
+#include <linux/types.h>
+#include <linux/rbtree.h>
+
+#include "pcs_types.h"
+#include "pcs_sock_io.h"
+#include "pcs_rpc.h"
+#include "pcs_sock_io.h"
+#include "pcs_req.h"
+#include "pcs_map.h"
+#include "pcs_cs.h"
+#include "pcs_cluster.h"
+#include "log.h"
+
+#include "../../fuse_i.h"
+
+static inline int is_file_inline(struct pcs_dentry_info *di)
+{
+ return di->fileinfo.attr.attrib & PCS_FATTR_INLINE;
+}
+
+
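+/* Completion of a chunk sub-request. On a recoverable error the sub-request is
+ * resubmitted or delayed; otherwise the error is propagated to the parent API
+ * request. The parent is completed once its last sub-request detaches.
+ */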
+void pcs_sreq_complete(struct pcs_int_request *sreq)
+{
+ struct pcs_int_request *ireq = sreq->completion_data.parent;
+ struct pcs_cluster_core *cluster = sreq->cc;
+
+ if (pcs_if_error(&sreq->error)) {
+ if (!pcs_if_error(&ireq->error)) {
+ /* If we decided to abort api request, do not redo chunk request
+ * even if the error is harmless. Otherwise, analyze sreq error
+ * and, most likely, resubmit request.
+ */
+ if (ireq_check_redo(sreq)) {
+ if (ireq_is_timed_out(sreq)) {
+ DTRACE("timeout while IO request on \"" DENTRY_FMT "\" last_err=%u",
+ DENTRY_ARGS(sreq->dentry), sreq->error.value);
+ }
+ if (sreq->type != PCS_IREQ_CUSTOM) {
+ map_notify_soft_error(sreq);
+
+ if (!(sreq->flags & IREQ_F_ONCE)) {
+ sreq->flags |= IREQ_F_ONCE;
+ pcs_clear_error(&sreq->error);
+ pcs_cc_submit(sreq->cc, sreq);
+ return;
+ }
+ }
+ pcs_clear_error(&sreq->error);
+ ireq_delay(sreq);
+ return;
+ }
+ pcs_copy_error(&ireq->error, &sreq->error);
+ }
+
+ if (sreq->type != PCS_IREQ_CUSTOM)
+ map_notify_iochunk_error(sreq);
+ }
+
+ if (sreq->type != PCS_IREQ_CUSTOM) {
+ if (!(sreq->flags & IREQ_F_CACHED))
+ ireq->flags &= ~IREQ_F_CACHED;
+ pcs_deaccount_ireq(sreq, &sreq->error);
+ } else if (sreq->custom.destruct)
+ sreq->custom.destruct(sreq);
+
+ if (!pcs_sreq_detach(sreq))
+ ireq_complete(ireq);
+
+ if (sreq->type == PCS_IREQ_IOCHUNK && sreq->iochunk.flow)
+ pcs_flow_put(sreq->iochunk.flow, &cluster->maps.ftab);
+
+ ireq_destroy(sreq);
+}
+
+void pcs_cc_process_ireq_chunk(struct pcs_int_request *ireq)
+{
+ struct pcs_map_entry *map;
+
+ TRACE(PCS_FILE_ID_FMT" [%llx]\n", ireq->dentry->fileinfo.attr.id,
+ (unsigned long long)ireq->iochunk.chunk);
+
+ map = pcs_find_get_map(ireq->dentry, ireq->iochunk.chunk +
+ ((ireq->flags & IREQ_F_MAPPED) ? 0 : ireq->iochunk.offset));
+
+ if (map_check_limit(map, ireq))
+ return;
+ if (ireq->iochunk.map)
+ pcs_map_put(ireq->iochunk.map);
+ ireq->iochunk.map = map;
+
+ map_submit(map, ireq, 0);
+}
+
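+/* Split an API read/write request into PCS_IREQ_IOCHUNK sub-requests along
+ * chunk and stripe boundaries. The parent's iocount keeps it alive until all
+ * sub-requests have been generated and completed.
+ */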
+/* TODO Remove noinline in production */
+static noinline void __pcs_cc_process_ireq_rw(struct pcs_int_request *ireq)
+{
+ struct pcs_dentry_info *di = ireq->dentry;
+ u64 pos = ireq->apireq.req->pos;
+ unsigned int sz = ireq->apireq.req->size;
+ unsigned int dio_offset = 0;
+ struct pcs_flow_node *fl;
+
+ if (di->fileinfo.sys.map_type != PCS_MAP_PLAIN) {
+ BUG();
+ return;
+ }
+
+ TRACE(DENTRY_FMT " %p op=%d at %llu [%llu]\n", DENTRY_ARGS(di), ireq, ireq->apireq.req->type,
+ (unsigned long long)ireq->apireq.req->pos, (unsigned long long)ireq->apireq.req->size);
+
+
+ atomic_set(&ireq->iocount, 1);
+ ireq->flags |= IREQ_F_CACHED;
+
+ fl = pcs_flow_record(&di->mapping.ftab, ireq->apireq.req->type == PCS_REQ_T_WRITE,
+ pos, sz, &di->cluster->maps.ftab);
+
+ while (sz) {
+ struct pcs_int_request *sreq;
+ unsigned int len;
+ u64 rpos, chunk, end_pos;
+
+ rpos = map_file_to_chunk(pos, di->fileinfo.sys.chunk_size, di->fileinfo.sys.stripe_depth, di->fileinfo.sys.strip_width);
+
+ chunk = rpos & ~((u64)di->fileinfo.sys.chunk_size - 1);
+ end_pos = ((rpos / di->fileinfo.sys.strip_width) + 1) * (u64)di->fileinfo.sys.strip_width;
+
+ sreq = ireq_alloc(di);
+ if (!sreq) {
+ pcs_set_local_error(&ireq->error, PCS_ERR_NOMEM);
+ break;
+ }
+
+ sreq->dentry = di;
+ sreq->type = PCS_IREQ_IOCHUNK;
+ sreq->iochunk.map = NULL;
+ sreq->iochunk.flow = pcs_flow_get(fl);
+ sreq->iochunk.direction = ireq->apireq.req->type;
+ sreq->iochunk.cs_index = 0;
+ sreq->iochunk.chunk = chunk;
+ sreq->iochunk.offset = rpos % di->fileinfo.sys.chunk_size;
+ sreq->iochunk.dio_offset = dio_offset;
+ len = di->fileinfo.sys.chunk_size - sreq->iochunk.offset;
+ if (len > sz)
+ len = sz;
+ if (rpos + len > end_pos)
+ len = end_pos - rpos;
+ sreq->iochunk.size = len;
+ sreq->iochunk.csl = NULL;
+ sreq->iochunk.banned_cs.val = 0;
+ sreq->iochunk.msg.destructor = NULL;
+ sreq->iochunk.msg.rpc = NULL;
+
+ pcs_sreq_attach(sreq, ireq);
+ sreq->complete_cb = pcs_sreq_complete;
+ pcs_cc_process_ireq_chunk(sreq);
+
+ pos += len;
+ sz -= len;
+ dio_offset += len;
+ }
+ pcs_flow_put(fl, &di->cluster->maps.ftab);
+ if (atomic_dec_and_test(&ireq->iocount))
+ ireq_complete(ireq);
+}
+
+static void pcs_cc_process_ireq_ioreq(struct pcs_int_request *ireq)
+{
+
+ if (ireq->apireq.req->type == PCS_REQ_T_SYNC) {
+ map_inject_flush_req(ireq);
+ return;
+ }
+ if (ireq->apireq.req->type != PCS_REQ_T_READ &&
+ ireq->apireq.req->type != PCS_REQ_T_WRITE) {
+ pcs_set_local_error(&ireq->error, PCS_ERR_PROTOCOL);
+ ireq_complete(ireq);
+ return;
+ }
+ __pcs_cc_process_ireq_rw(ireq);
+
+}
+
+static void ireq_process_(struct pcs_int_request *ireq)
+{
+ TRACE("enter " DENTRY_FMT " type=%u\n", DENTRY_ARGS(ireq->dentry), ireq->type);
+
+ switch (ireq->type) {
+ case PCS_IREQ_NOOP:
+ ireq_complete(ireq);
+ break;
+ case PCS_IREQ_IOCHUNK:
+ pcs_cc_process_ireq_chunk(ireq);
+ break;
+ case PCS_IREQ_API:
+ pcs_cc_process_ireq_ioreq(ireq);
+ break;
+ case PCS_IREQ_FLUSH:
+ process_flush_req(ireq);
+ break;
+ case PCS_IREQ_TRUNCATE:
+ process_ireq_truncate(ireq);
+ break;
+ case PCS_IREQ_CUSTOM:
+ ireq->custom.action(ireq);
+ break;
+ default:
+ BUG();
+ break;
+ }
+}
+
+static void ireq_notify_err(struct pcs_int_request *ireq, pcs_error_t *err)
+{
+ if (ireq->completion_data.parent)
+ ireq_notify_err(ireq->completion_data.parent, err);
+
+ else if (ireq->completion_data.priv) {
+ struct pcs_fuse_req *r = ireq->completion_data.priv;
+ r->exec.ctl.last_err = *err;
+ }
+}
+
+static void ireq_on_error_(struct pcs_int_request *ireq)
+{
+ /* Distinguish unrecoverable errors and recoverable ones.
+ * Recoverable errors must trigger restart.
+ */
+ ireq_notify_err(ireq, &ireq->error);
+ switch (ireq->error.value) {
+ /* This can happen if we lost connection for long time and lease has been lost.
+ * We should try to reacquire lock. Server must reject reacquisition, if someone
+ * took the lock after us.
+ */
+ case PCS_ERR_LEASE_REQUIRED:
+ case PCS_ERR_LEASE_EXPIRED:
+ case PCS_ERR_INTEGRITY_FAIL: {
+ /* TODO: tag ireq->dentry with EIO here */
+ goto fatal;
+ }
+ case PCS_ERR_CSD_LACKING:
+ goto fatal;
+ case PCS_ERR_INV_PARAMS:
+ case PCS_ERR_NOT_FOUND:
+ case PCS_ERR_NON_EMPTY_DIR:
+ case PCS_ERR_NOT_DIR:
+ case PCS_ERR_IS_DIR:
+ case PCS_ERR_NO_STORAGE:
+ case PCS_ERR_UNAVAIL:
+fatal:
+ printk(KERN_INFO "%s fatal error:%d nodeid:%llu\n", __func__,
+ ireq->error.value, ireq->dentry->inode->nodeid);
+ ireq->flags |= IREQ_F_FATAL;
+ break;
+ case PCS_ERR_LEASE_CONFLICT:
+ WARN_ON_ONCE(1);
+ break;
+ default:
+ break;
+ }
+}
+
+static int ireq_check_redo_(struct pcs_int_request *ireq)
+{
+ pcs_error_t *err = &ireq->error;
+
+ if (ireq->flags & IREQ_F_FATAL)
+ return 0;
+
+ if (ireq->completion_data.parent &&
+ pcs_if_error(&ireq->completion_data.parent->error) &&
+ !ireq_check_redo(ireq->completion_data.parent))
+ return 0;
+
+ /* Fatal errors */
+ switch (err->value) {
+ case PCS_ERR_PROTOCOL:
+ case PCS_ERR_INV_PARAMS:
+ case PCS_ERR_NOT_FOUND:
+ case PCS_ERR_IS_DIR:
+ case PCS_ERR_NOT_DIR:
+ return 0;
+ }
+
+ /* Remote errors are never fatal */
+ if (err->remote)
+ return 1;
+
+ /* Fatal errors */
+ switch (err->value) {
+ case PCS_ERR_NOMEM:
+ case PCS_ERR_LEASE_REQUIRED:
+ case PCS_ERR_LEASE_EXPIRED:
+ case PCS_ERR_INTEGRITY_FAIL:
+ case PCS_ERR_NO_STORAGE:
+ return 0;
+ }
+
+ return 1;
+}
+
+int pcs_cluster_init(struct pcs_fuse_cluster *pfc, struct workqueue_struct *wq,
+ struct fuse_conn *fc, PCS_CLUSTER_ID_T *cl_id,
+ PCS_NODE_ID_T *id)
+{
+ struct pcs_cluster_core_attr attr;
+
+ attr.cluster = *cl_id;
+ attr.node = *id;
+ attr.abort_timeout_ms = 0;
+
+ pfc->fc = fc;
+
+ /* core init */
+ if (pcs_cc_init(&pfc->cc, wq, NULL, &attr))
+ return -1;
+ pfc->cc.op.ireq_process = ireq_process_;
+ pfc->cc.op.ireq_on_error = ireq_on_error_;
+ pfc->cc.op.ireq_check_redo = ireq_check_redo_;
+
+ return 0;
+}
+
+void pcs_cluster_fini(struct pcs_fuse_cluster *pfc)
+{
+ pcs_cc_fini(&pfc->cc);
+ kfree(pfc);
+}
diff --git a/fs/fuse/kio/pcs/pcs_cluster.h b/fs/fuse/kio/pcs/pcs_cluster.h
new file mode 100644
index 000000000000..3a8116b705df
--- /dev/null
+++ b/fs/fuse/kio/pcs/pcs_cluster.h
@@ -0,0 +1,106 @@
+#ifndef _PCS_CLUSTER_H_
+#define _PCS_CLUSTER_H_ 1
+
+#include "pcs_req.h"
+#include "../../fuse_i.h"
+struct fuse_conn;
+
+/* Try to follow the pcs/client/fused structure style */
+struct pcs_fuse_exec_ctx {
+ struct pcs_int_request ireq;
+ struct {
+ pcs_api_iorequest_t req;
+ struct bio_vec *bvec;
+ unsigned num_bvecs;
+ /* Do not bother with memory economy, keep it simple for testing purposes.
+ TODO: implement a fuse_req iterator similar to the bvec one */
+ struct bio_vec inline_bvec[FUSE_MAX_PAGES_PER_REQ];
+ } io;
+ struct {
+ unsigned retry_cnt;
+ pcs_error_t last_err;
+ } ctl;
+};
+
+struct pcs_fuse_req {
+ struct fuse_req req;
+ struct pcs_fuse_exec_ctx exec; /* Zero initialized context */
+};
+
+struct pcs_fuse_cluster {
+ struct pcs_cluster_core cc;
+ struct fuse_conn *fc;
+};
+
+struct pcs_fuse_work {
+ struct work_struct work;
+ pcs_error_t status;
+ void *ctx;
+ void *ctx2;
+};
+
+int pcs_cluster_init(struct pcs_fuse_cluster *c, struct workqueue_struct *,
+ struct fuse_conn *fc, PCS_CLUSTER_ID_T *cl_id,
+ PCS_NODE_ID_T *id);
+void pcs_cluster_fini(struct pcs_fuse_cluster *c);
+
+static inline struct pcs_fuse_req *pcs_fuse_req_from_work(struct pcs_fuse_exec_ctx *ctx)
+{
+ return container_of(ctx, struct pcs_fuse_req, exec);
+}
+
+static inline struct fuse_req *fuse_req_from_pcs(struct pcs_fuse_req *r)
+{
+ return (struct fuse_req *)r;
+}
+
+static inline struct pcs_fuse_req *pcs_req_from_fuse(struct fuse_req *req)
+{
+ return container_of(req, struct pcs_fuse_req, req);
+}
+
+static inline struct pcs_fuse_cluster *pcs_cluster_from_cc(struct pcs_cluster_core *c)
+{
+ return container_of(c, struct pcs_fuse_cluster, cc);
+}
+
+static inline struct pcs_dentry_info *pcs_inode_from_fuse(struct fuse_inode *fi)
+{
+
+ BUG_ON(!fi->private);
+
+ return (struct pcs_dentry_info *)fi->private;
+}
+
+static inline struct pcs_fuse_cluster *cl_from_req(struct pcs_fuse_req *r)
+{
+ return pcs_cluster_from_cc(r->exec.ireq.cc);
+}
+
+static inline struct pcs_cluster_core *cc_from_rpc(struct pcs_rpc_engine *eng)
+{
+ return container_of(eng, struct pcs_cluster_core, eng);
+}
+
+/* from pcs_cluster_core.h */
+struct pcs_cluster_core_attr {
+ PCS_CLUSTER_ID_T cluster;
+ PCS_NODE_ID_T node;
+
+ /* Timeouts */
+ int abort_timeout_ms;
+};
+int pcs_cc_init(struct pcs_cluster_core *cc, struct workqueue_struct *wq,
+ const char *cluster_name, struct pcs_cluster_core_attr *attr);
+void pcs_cc_fini(struct pcs_cluster_core *cc);
+
+void pcs_fuse_prep_io(struct pcs_fuse_req *r, unsigned short type, off_t offset, size_t size);
+int fuse_pcs_csconn_send(struct fuse_conn *fc, struct pcs_rpc *ep, int flags);
+
+
+static inline void pcs_cc_set_abort_timeout(struct pcs_cluster_core *cc, int timeout)
+{
+ cc->cfg.def.abort_timeout = cc->cfg.curr.abort_timeout = timeout;
+}
+
+#endif /* _PCS_CLUSTER_H_ */
diff --git a/fs/fuse/kio/pcs/pcs_cluster_core.c b/fs/fuse/kio/pcs/pcs_cluster_core.c
new file mode 100644
index 000000000000..a5bdbc8ebd82
--- /dev/null
+++ b/fs/fuse/kio/pcs/pcs_cluster_core.c
@@ -0,0 +1,214 @@
+#include <net/sock.h>
+#include <linux/net.h>
+#include <linux/kthread.h>
+#include <linux/types.h>
+#include <linux/rbtree.h>
+
+#include "pcs_types.h"
+#include "pcs_sock_io.h"
+#include "pcs_rpc.h"
+#include "pcs_sock_io.h"
+#include "pcs_req.h"
+#include "pcs_map.h"
+#include "pcs_cs.h"
+#include "pcs_cluster.h"
+#include "log.h"
+
+#include "../../fuse_i.h"
+
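+/* Set map cache thresholds relative to the limit: the soft threshold is 3/4
+ * of it and the dirty threshold is 7/8 of it.
+ */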
+void pcs_mapset_limit(struct pcs_map_set *maps, int limit)
+{
+ maps->map_thresh = limit - limit/4;
+ maps->map_dirty_thresh = limit - limit/8;
+ maps->map_max = limit;
+}
+
+static unsigned long pcs_map_shrink_count(struct shrinker *shrinker, struct shrink_control *sc)
+{
+ struct pcs_map_set *maps = container_of(shrinker,
+ struct pcs_map_set, shrinker);
+
+ return list_lru_count_node(&maps->lru, sc->nid) +
+ list_lru_count_node(&maps->dirty_lru, sc->nid);
+}
+
+
+static int pcs_mapset_init(struct pcs_map_set *maps)
+{
+ if (list_lru_init(&maps->lru))
+ return -ENOMEM;
+
+ if (list_lru_init(&maps->dirty_lru)) {
+ list_lru_destroy(&maps->lru);
+ return -ENOMEM;
+ }
+ INIT_LIST_HEAD(&maps->dirty_queue);
+ atomic_set(&maps->count, 0);
+ atomic_set(&maps->dirty_count, 0);
+ pcs_mapset_limit(maps, PCS_MAP_LIMIT);
+ pcs_flow_table_global_init(&maps->ftab);
+
+ maps->shrinker.count_objects = pcs_map_shrink_count;
+ maps->shrinker.scan_objects = pcs_map_shrink_scan;
+ maps->shrinker.seeks = DEFAULT_SEEKS;
+ maps->shrinker.flags = SHRINKER_NUMA_AWARE;
+ register_shrinker(&maps->shrinker);
+
+ return 0;
+}
+
+static void pcs_mapset_fini(struct pcs_map_set *maps)
+{
+ unregister_shrinker(&maps->shrinker);
+
+ BUG_ON(list_lru_count(&maps->lru));
+ BUG_ON(list_lru_count(&maps->dirty_lru));
+ BUG_ON(!list_empty(&maps->dirty_queue));
+
+ list_lru_destroy(&maps->lru);
+ list_lru_destroy(&maps->dirty_lru);
+}
+
+static void init_def_mss(struct pcs_cluster_core *cc)
+{
+ cc->cfg.def.wmss = PCS_DFLT_MSS_WRITE;
+ cc->cfg.def.rmss = PCS_DFLT_MSS_READ;
+ cc->cfg.def.lmss = PCS_DFLT_MSS_LOCAL;
+}
+
+
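+/* Main work handler: drain cc->work_queue under the lock and process each
+ * internal request via the ireq_process operation.
+ */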
+static void cc_workqueue_handler(struct work_struct *w)
+{
+ LIST_HEAD(queue);
+ struct pcs_cluster_core *cc = (struct pcs_cluster_core *)
+ container_of(w, struct pcs_cluster_core, main_job);
+
+ spin_lock_irq(&cc->lock);
+ list_splice_tail_init(&cc->work_queue, &queue);
+ spin_unlock_irq(&cc->lock);
+
+ while (!list_empty(&queue)) {
+ struct pcs_int_request *ireq = list_first_entry(&queue, struct pcs_int_request, list);
+
+ list_del_init(&ireq->list);
+ TRACE("process ireq:%p" DENTRY_FMT " type=%u\n", ireq, DENTRY_ARGS(ireq->dentry), ireq->type);
+ cc->op.ireq_process(ireq);
+ }
+}
+
+static void cc_completion_handler(struct work_struct *w)
+{
+ struct pcs_cluster_core *cc = (struct pcs_cluster_core *)
+ container_of(w, struct pcs_cluster_core, completion_job);
+ LIST_HEAD(queue);
+
+ spin_lock_irq(&cc->lock);
+ list_splice_tail_init(&cc->completion_queue, &queue);
+ spin_unlock_irq(&cc->lock);
+
+ while (!list_empty(&queue)) {
+ struct pcs_int_request *ireq = list_first_entry(&queue, struct pcs_int_request, list);
+
+ list_del_init(&ireq->list);
+ TRACE("complete " DENTRY_FMT " type=%u\n", DENTRY_ARGS(ireq->dentry), ireq->type);
+ ireq_complete(ireq);
+ }
+}
+
+int pcs_cc_init(struct pcs_cluster_core *cc, struct workqueue_struct *wq,
+ const char *cluster_name, struct pcs_cluster_core_attr *attr)
+{
+ int err;
+ /* Ignore this for now, we have cluster_id and node_id */
+ /* if (cluster_name == NULL) */
+ /* return -1; */
+
+ spin_lock_init(&cc->lock);
+ INIT_LIST_HEAD(&cc->work_queue);
+ INIT_LIST_HEAD(&cc->completion_queue); /* completion queue only for sanity */
+ INIT_WORK(&cc->main_job, cc_workqueue_handler);
+ INIT_WORK(&cc->completion_job, cc_completion_handler);
+ cc->wq = wq;
+
+ pcs_csset_init(&cc->css);
+
+ err = pcs_mapset_init(&cc->maps);
+ if (err)
+ return err;
+
+ pcs_rpc_engine_init(&cc->eng, PCS_NODE_ROLE_TOOL);
+ pcs_rpc_init_gc(&cc->eng, 1024);
+ if (attr) {
+ pcs_rpc_set_cluster_id(&cc->eng, &attr->cluster);
+ pcs_rpc_set_host_id(&cc->eng, &attr->node);
+ if (attr->abort_timeout_ms)
+ pcs_cc_set_abort_timeout(cc, attr->abort_timeout_ms);
+ }
+ /* TODO: resurrect ratelimit and rng
+ * pcs_ratelimit_init(cc, &cc->rlim);
+ * pcs_srandomdev(&cc->rng);
+ */
+
+ memset(&cc->cfg, 0, sizeof(cc->cfg));
+ memset(&cc->op, 0, sizeof(cc->op));
+
+ init_def_mss(cc);
+ cc->cfg.def.kernel_cache_en = 1;
+ cc->cfg.curr = cc->cfg.def;
+ cc->cfg.sn = PCS_CONFIG_SEQ_ANY;
+
+ cc->io_locality = 0;
+ cc->io_tweaks = 0;
+ cc->netlat_cutoff = PCS_MAX_NETWORK_LATENCY*1000;
+ cc->iolat_cutoff = PCS_MAX_IO_LATENCY*1000;
+ cc->abort_callback = NULL;
+
+ TRACE("Ok cc->{ cl_id:" CLUSTER_ID_FMT ", node_id:" NODE_FMT ", f:%x}\n",
+ CLUSTER_ID_ARGS(cc->eng.cluster_id), NODE_ARGS(cc->eng.local_id),
+ cc->eng.flags);
+
+ return 0;
+}
+
+void pcs_cc_fini(struct pcs_cluster_core *cc)
+{
+ pcs_csset_fini(&cc->css);
+ pcs_mapset_fini(&cc->maps);
+ pcs_rpc_engine_fini(&cc->eng);
+
+ BUG_ON(!list_empty(&cc->completion_queue));
+ BUG_ON(!list_empty(&cc->work_queue));
+ pcs_flow_table_global_fini(&cc->maps.ftab);
+}
+
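+/* Queue an internal request for processing; kick the work item only if the
+ * queue was idle.
+ */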
+void pcs_cc_submit(struct pcs_cluster_core *cc, struct pcs_int_request *ireq)
+{
+ int was_idle = 0;
+ unsigned long flags;
+
+ spin_lock_irqsave(&cc->lock, flags);
+ was_idle = list_empty(&cc->work_queue);
+ list_add_tail(&ireq->list, &cc->work_queue);
+ spin_unlock_irqrestore(&cc->lock, flags);
+
+ if (was_idle)
+ queue_work(cc->wq, &cc->main_job);
+}
+
+/* Move request queue "q" back to the main work_queue; it will be processed from the very beginning. */
+void pcs_cc_requeue(struct pcs_cluster_core *cc, struct list_head *q)
+{
+ unsigned long flags;
+ int was_idle = 0;
+
+ if (list_empty(q))
+ return;
+
+ spin_lock_irqsave(&cc->lock, flags);
+ was_idle = list_empty(&cc->work_queue);
+ list_splice_tail_init(q, &cc->work_queue);
+ spin_unlock_irqrestore(&cc->lock, flags);
+
+ if (was_idle)
+ queue_work(cc->wq, &cc->main_job);
+}
diff --git a/fs/fuse/kio/pcs/pcs_cs.c b/fs/fuse/kio/pcs/pcs_cs.c
new file mode 100644
index 000000000000..0f7463e8f13a
--- /dev/null
+++ b/fs/fuse/kio/pcs/pcs_cs.c
@@ -0,0 +1,1067 @@
+#include <net/sock.h>
+#include <linux/net.h>
+#include <linux/kthread.h>
+#include <linux/types.h>
+#include <linux/rbtree.h>
+
+#include "pcs_types.h"
+#include "pcs_sock_io.h"
+#include "pcs_rpc.h"
+#include "pcs_sock_io.h"
+#include "pcs_req.h"
+#include "pcs_map.h"
+#include "pcs_cs.h"
+#include "pcs_cs_prot.h"
+#include "pcs_cluster.h"
+#include "pcs_ioctl.h"
+#include "log.h"
+
+/* Lock order: cs->lock -> css->lock (lru, hash, bl_list) */
+
+
+struct pcs_rpc_params cn_rpc_params = {
+ .alloc_hdr_size = sizeof(struct pcs_rpc_hdr),
+ .max_msg_size = PCS_CS_MSG_MAX_SIZE,
+ .holddown_timeout = HZ,
+ .connect_timeout = 5*HZ,
+ .response_timeout = 30*HZ,
+ .max_conn_retry = 3,
+ .flags = 0,
+};
+
+static void cs_aborting(struct pcs_rpc *ep, int error);
+static struct pcs_msg *cs_get_hdr(struct pcs_rpc *ep, struct pcs_rpc_hdr *h);
+static int cs_input(struct pcs_rpc *ep, struct pcs_msg *msg);
+static void cs_keep_waiting(struct pcs_rpc *ep, struct pcs_msg *req, struct pcs_msg *msg);
+static void cs_connect(struct pcs_rpc *ep);
+static void pcs_cs_isolate(struct pcs_cs *cs, struct list_head *dispose);
+static void pcs_cs_destroy(struct pcs_cs *cs);
+
+struct pcs_rpc_ops cn_rpc_ops = {
+ .demux_request = cs_input,
+ .get_hdr = cs_get_hdr,
+ .state_change = cs_aborting,
+ .keep_waiting = cs_keep_waiting,
+ .connect = cs_connect,
+};
+
+struct pcs_cs *pcs_cs_alloc(struct pcs_cs_set *css,
+ struct pcs_cluster_core *cc)
+{
+ struct pcs_cs *cs;
+
+ cs = kzalloc(sizeof(struct pcs_cs), GFP_NOIO);
+ if (cs == NULL)
+ return NULL;
+
+ INIT_HLIST_NODE(&cs->hlist);
+ INIT_LIST_HEAD(&cs->lru_link);
+ spin_lock_init(&cs->lock);
+ cs->css = css;
+ cs->in_flight = 0;
+ cs->cwnd = PCS_CS_INIT_CWND;
+ cs->eff_cwnd = PCS_CS_INIT_CWND;
+ cs->cwr_state = 0;
+ atomic_set(&cs->latency_avg, 0);
+ cs->net_latency_avg = 0;
+ cs->last_latency = 0;
+ cs->latency_stamp = 0;
+ cs->net_latency_stamp = 0;
+ cs->idle_stamp = 0;
+ cs->in_flight_hwm = 0;
+ cs->in_flight_hwm_stamp = 0;
+ pcs_cs_init_cong_queue(cs);
+ pcs_cs_init_active_list(cs);
+
+ cs->io_prio = -1;
+ cs->mds_flags = 0;
+ cs->io_prio_stamp = 0;
+
+ INIT_LIST_HEAD(&cs->flow_lru);
+ cs->nflows = 0;
+
+ cs->state = 0;
+ cs->is_probing = 0;
+ cs->is_dead = 0;
+ INIT_LIST_HEAD(&cs->bl_link);
+
+ cs->addr_serno = 0;
+
+ cs->rpc = pcs_rpc_create(&cc->eng, &cn_rpc_params, &cn_rpc_ops);
+ if (cs->rpc == NULL) {
+ kfree(cs);
+ return NULL;
+ }
+ cs->rpc->private = cs;
+ cs->nmaps = 0;
+ INIT_LIST_HEAD(&cs->map_list);
+ memset(&cs->stat, 0, sizeof(cs->stat));
+ return cs;
+}
+
+unsigned int pcs_cs_hash(PCS_NODE_ID_T *id)
+{
+ return *(unsigned int *)id % PCS_CS_HASH_SIZE;
+}
+
+static struct pcs_cs *
+__lookup_cs(struct pcs_cs_set *csset, PCS_NODE_ID_T *id)
+{
+ struct pcs_cs *cs;
+ hlist_for_each_entry_rcu(cs, &csset->ht[pcs_cs_hash(id)], hlist) {
+ if (memcmp(&cs->id, id, sizeof(cs->id)) == 0)
+ return cs;
+ }
+ return NULL;
+}
+
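+/* RCU hash lookup that returns the CS with cs->lock held; retries if the
+ * entry is being destroyed (is_dead).
+ */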
+static struct pcs_cs *
+lookup_and_lock_cs(struct pcs_cs_set *csset, PCS_NODE_ID_T *id)
+{
+ struct pcs_cs *cs;
+retry:
+ rcu_read_lock();
+ cs = __lookup_cs(csset, id);
+ if (!cs) {
+ rcu_read_unlock();
+ return NULL;
+ }
+ spin_lock(&cs->lock);
+ rcu_read_unlock();
+ if (cs->is_dead) {
+ spin_unlock(&cs->lock);
+ goto retry;
+ }
+ return cs;
+}
+
+static void add_cs(struct pcs_cs_set *csset, struct pcs_cs *cs)
+{
+ unsigned int hash = pcs_cs_hash(&cs->id);
+
+ assert_spin_locked(&csset->lock);
+
+ list_add_tail(&cs->lru_link, &csset->lru);
+ csset->ncs++;
+ hlist_add_head_rcu(&cs->hlist, &csset->ht[hash]);
+}
+
+static inline int netaddr_cmp(PCS_NET_ADDR_T const *addr1, PCS_NET_ADDR_T const *addr2, int ignore_port)
+{
+ unsigned int d;
+ size_t sz = 0;
+
+ d = addr1->type - addr2->type;
+ if (d)
+ return d;
+ d = addr1->port - addr2->port;
+ if (!ignore_port && d)
+ return d;
+
+ switch (addr1->type) {
+ case PCS_ADDRTYPE_IP:
+ sz = sizeof(struct in_addr);
+ break;
+ case PCS_ADDRTYPE_IP6:
+ sz = sizeof(struct in6_addr);
+ break;
+ default:
+ BUG();
+ }
+
+ return memcmp(addr1->address, addr2->address, sz);
+}
+
+int pcs_netaddr_cmp(PCS_NET_ADDR_T const *addr1, PCS_NET_ADDR_T const *addr2)
+{
+ return netaddr_cmp(addr1, addr2, 0);
+}
+
+/* Return locked cs */
+struct pcs_cs *pcs_cs_find_create(struct pcs_cs_set *csset, PCS_NODE_ID_T *id, PCS_NET_ADDR_T *addr, int flags)
+{
+ struct pcs_cs *cs;
+
+again:
+ cs = lookup_and_lock_cs(csset, id);
+ if (cs) {
+ /* If rpc is connected, leave it connected until failure.
+ * After current connect fails, reconnect will be done to new address
+ */
+ if (addr) {
+ if (pcs_netaddr_cmp(&cs->addr, addr)) {
+ cs->addr = *addr;
+ cs->addr_serno++;
+ if (!(flags & CS_FL_INACTIVE))
+ pcs_map_notify_addr_change(cs);
+ TRACE("Port change CS" NODE_FMT " seq=%d", NODE_ARGS(*id), cs->addr_serno);
+ pcs_rpc_set_address(cs->rpc, addr);
+
+ }
+ }
+ /* TODO: (flags & PCS_RPC_F_LOCAL) should be checked here */
+ return cs;
+ }
+ BUG_ON(addr == NULL);
+
+ cs = pcs_cs_alloc(csset, cc_from_csset(csset));
+ if (!cs)
+ return NULL;
+
+ cs->id = *id;
+
+ cs->addr = *addr;
+ cs->addr_serno = 1;
+
+ pcs_rpc_set_peer_id(cs->rpc, id, PCS_NODE_ROLE_CS);
+ pcs_rpc_set_address(cs->rpc, addr);
+
+
+ /* TODO: Init PCS_RPC_F_LOCAL if available here */
+
+ spin_lock(&cs->lock);
+ spin_lock(&csset->lock);
+ if (__lookup_cs(csset, id)) {
+ spin_unlock(&csset->lock);
+ cs->is_dead = 1;
+ spin_unlock(&cs->lock);
+ pcs_cs_destroy(cs);
+ goto again;
+ }
+ add_cs(csset, cs);
+ spin_unlock(&csset->lock);
+ return cs;
+}
+
+static void (*io_times_logger_cb)(struct pcs_int_request *ireq, struct pcs_msg *resp, u32 max_iolat, void *ctx) = NULL;
+static void *io_times_logger_ctx = NULL;
+
+void cs_set_io_times_logger(void (*logger)(struct pcs_int_request *ireq, struct pcs_msg *resp, u32 max_iolat, void *ctx), void *ctx)
+{
+ io_times_logger_cb = logger;
+ io_times_logger_ctx = ctx;
+}
+
+
+void pcs_cs_update_stat(struct pcs_cs *cs, u32 iolat, u32 netlat, int op_type)
+{
+ pcs_perfcounter_stat_update(&cs->stat.iolat, iolat);
+ pcs_perfcounter_stat_update(&cs->stat.netlat, netlat);
+ switch (op_type) {
+ case PCS_CS_WRITE_SYNC_RESP:
+ case PCS_CS_WRITE_RESP:
+ cs->stat.write_ops_rate.total++;
+ break;
+ case PCS_CS_READ_RESP:
+ cs->stat.read_ops_rate.total++;
+ break;
+ case PCS_CS_SYNC_RESP:
+ cs->stat.sync_ops_rate.total++;
+ break;
+ }
+}
+
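+/* Completion of a CS IO message: on success note whether the CS served the
+ * data from cache and verify the map sync state; on failure record the error
+ * in the internal request.
+ */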
+static void cs_response_done(struct pcs_msg *msg)
+{
+ struct pcs_int_request *ireq = ireq_from_msg(msg);
+
+ if (!pcs_if_error(&msg->error)) {
+ struct pcs_cs_iohdr *h = (struct pcs_cs_iohdr *)msg_inline_head(msg->response);
+
+ if (h->sync.misc & PCS_CS_IO_CACHED)
+ ireq->flags |= IREQ_F_CACHED;
+
+ pcs_map_verify_sync_state(ireq->dentry, ireq, msg);
+ } else {
+ TRACE(XID_FMT " IO error %d %lu : %llu:%u+%u\n", XID_ARGS(ireq->iochunk.hbuf.hdr.xid), msg->error.value, msg->error.remote ? (unsigned long)msg->error.offender.val : 0UL,
+ (unsigned long long)ireq->iochunk.chunk, (unsigned)ireq->iochunk.offset, ireq->iochunk.size);
+ }
+
+ pcs_copy_error_cond(&ireq->error, &msg->error);
+ if (msg->rpc) {
+ pcs_rpc_put(msg->rpc);
+ msg->rpc = NULL;
+ }
+ ireq_complete(ireq);
+}
+
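+/* Iterator for read responses: the header part comes from the inline buffer,
+ * while the payload is placed directly into the iterator provided by the API
+ * request, avoiding an intermediate copy.
+ */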
+static void cs_get_read_response_iter(struct pcs_msg *msg, int offset, struct iov_iter *it)
+{
+ if (offset < sizeof(struct pcs_cs_iohdr)) {
+ iov_iter_init_plain(it, msg->_inline_buffer,
+ sizeof(struct pcs_cs_iohdr), 0);
+ iov_iter_advance(it, offset);
+ TRACE("return msg:%p->size:%d off:%d it_len:%ld\n\n", msg, msg->size, offset, iov_iter_count(it));
+ return;
+ } else {
+ struct pcs_msg *req = msg->private;
+ struct pcs_int_request *ireq = req->private2;
+ struct pcs_int_request *parent = ireq->completion_data.parent;
+
+ if (parent->type == PCS_IREQ_API) {
+ pcs_api_iorequest_t *ar = parent->apireq.req;
+
+ /* Read directly to memory given by user */
+ BUG_ON(ireq->iochunk.direction != PCS_REQ_T_READ);
+
+ offset -= (unsigned int)sizeof(struct pcs_cs_iohdr);
+ ar->get_iter(ar->datasource, ireq->iochunk.dio_offset, it);
+ iov_iter_truncate(it, ireq->iochunk.size);
+ iov_iter_advance(it, offset);
+
+ TRACE("return msg:%p->size:%d off:%d it_len:%ld\n\n", msg, msg->size, offset, iov_iter_count(it));
+ return;
+ } else
+ BUG();
+ }
+}
+
+static void cs_connect(struct pcs_rpc *ep)
+{
+ struct pcs_cluster_core *cc = cc_from_rpc(ep->eng);
+ struct pcs_fuse_cluster *pfc = pcs_cluster_from_cc(cc);
+
+ ep->state = PCS_RPC_CONNECT;
+ if (fuse_pcs_csconn_send(pfc->fc, ep, PCS_IOC_CS_OPEN))
+ pcs_rpc_reset(ep);
+}
+
+static struct pcs_msg *cs_get_hdr(struct pcs_rpc *ep, struct pcs_rpc_hdr *h)
+{
+ struct pcs_msg *msg, *resp;
+ struct pcs_rpc_hdr *req_h;
+
+ if (!RPC_IS_RESPONSE(h->type))
+ return NULL;
+
+ if (h->type != PCS_CS_READ_RESP)
+ return NULL;
+
+ /* The goal is to avoid allocating a new msg and to reuse the one inlined in the ireq */
+
+ msg = pcs_rpc_lookup_xid(ep, &h->xid);
+ if (msg == NULL)
+ return NULL;
+
+ req_h = (struct pcs_rpc_hdr *)msg_inline_head(msg);
+ if (req_h->type != PCS_CS_READ_REQ)
+ return NULL;
+
+ resp = pcs_rpc_alloc_input_msg(ep, sizeof(struct pcs_cs_iohdr));
+ if (!resp)
+ return NULL;
+
+ memcpy(resp->_inline_buffer, h, sizeof(struct pcs_rpc_hdr));
+ resp->size = h->len;
+ resp->private = msg;
+ resp->get_iter = cs_get_read_response_iter;
+ resp->done = rpc_work_input;
+ pcs_msg_del_calendar(msg);
+
+ return resp;
+}
+
+static void cs_get_data(struct pcs_msg *msg, int offset, struct iov_iter *it)
+{
+ struct pcs_int_request *ireq = ireq_from_msg(msg);
+
+ if (offset < sizeof(struct pcs_cs_iohdr)) {
+ iov_iter_init_plain(it, (char *)&ireq->iochunk.hbuf,
+ sizeof(struct pcs_cs_iohdr), 0);
+ iov_iter_advance(it, offset);
+ TRACE("return msg:%p->size:%d off:%d it_len:%ld\n\n", msg, msg->size, offset, iov_iter_count(it));
+
+ return;
+ } else {
+ struct pcs_int_request *parent = ireq->completion_data.parent;
+ if (parent->type == PCS_IREQ_API) {
+ pcs_api_iorequest_t *ar = parent->apireq.req;
+
+ BUG_ON(ireq->iochunk.direction != PCS_REQ_T_WRITE);
+
+ offset -= (unsigned int)sizeof(struct pcs_cs_iohdr);
+ ar->get_iter(ar->datasource, ireq->iochunk.dio_offset, it);
+ iov_iter_truncate(it, ireq->iochunk.size);
+ iov_iter_advance(it, offset);
+
+ TRACE("return msg:%p->size:%d off:%d it_len:%ld\n\n", msg, msg->size, offset, iov_iter_count(it));
+ return;
+ } else
+ BUG();
+ }
+}
+
+static void cs_sent(struct pcs_msg *msg)
+{
+ msg->done = cs_response_done;
+ if (pcs_if_error(&msg->error)) {
+ msg->done(msg);
+ return;
+ }
+ pcs_rpc_sent(msg);
+}
+
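+/* Send an IO chunk request to a CS: fill the pcs_cs_iohdr embedded in the
+ * internal request, pick the read/write message type and timeout, and queue
+ * the message on the CS rpc.
+ */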
+void pcs_cs_submit(struct pcs_cs *cs, struct pcs_int_request *ireq)
+{
+ struct pcs_msg *msg = &ireq->iochunk.msg;
+ struct pcs_cs_iohdr *ioh;
+ struct pcs_cs_list *csl = ireq->iochunk.csl;
+
+ msg->private = cs;
+
+ BUG_ON(msg->rpc);
+ msg->private2 = ireq;
+
+ ioh = &ireq->iochunk.hbuf;
+ ioh->hdr.len = sizeof(struct pcs_cs_iohdr) +
+ (ireq->iochunk.direction ? ireq->iochunk.size : 0);
+ ioh->hdr.type = ireq->iochunk.direction ? PCS_CS_WRITE_REQ : PCS_CS_READ_REQ;
+ pcs_rpc_get_new_xid(&cc_from_cs(cs)->eng, &ioh->hdr.xid);
+ ioh->offset = ireq->iochunk.offset;
+ ioh->size = ireq->iochunk.size;
+ ioh->iocontext = (u32)ireq->dentry->fileinfo.attr.id;
+ ioh->_reserved = 0;
+ memset(&ioh->sync, 0, sizeof(ioh->sync));
+
+ if (ireq->flags & IREQ_F_SEQ)
+ ioh->sync.misc = PCS_CS_IO_SEQ;
+
+ msg->size = ioh->hdr.len;
+ msg->rpc = NULL;
+ pcs_clear_error(&msg->error);
+ msg->done = cs_sent;
+ msg->get_iter = cs_get_data;
+
+ /* TODO
+ * Theoretically, at this moment the map may already have become dead.
+ * What should we do then?
+ * This may happen only in case of an aio/dio vs truncate race.
+ */
+ BUG_ON(ireq->iochunk.map->state & PCS_MAP_DEAD);
+ ioh->map_version = csl->version;
+ if (ireq->iochunk.direction)
+ msg->timeout = csl->write_timeout;
+ else
+ msg->timeout = csl->read_timeout;
+ ireq->ts_sent = jiffies;
+ ireq->wait_origin.val = 0;
+
+
+ DTRACE(XID_FMT " About to send msg:%p, ireq:%p : %llu:%u+%u\n", XID_ARGS(ireq->iochunk.hbuf.hdr.xid),
+ msg, ireq,
+ (unsigned long long)ireq->iochunk.chunk,
+ (unsigned)ireq->iochunk.offset,
+ ireq->iochunk.size);
+
+/* TODO re-enable ratelimiting */
+#if 0
+ if (cc_from_cs(cs)->rlim.rate)
+ pcs_submit_ratelimited(&cc_from_cs(cs)->rlim, ireq);
+ else
+ pcs_rpc_send(cs->rpc, msg);
+#endif
+ pcs_rpc_queue(cs->rpc, msg);
+}
+
+static void handle_congestion(struct pcs_cs *cs, struct pcs_rpc_hdr *h)
+{
+ struct pcs_cs *who;
+
+ TRACE("Received congestion notification from CS" NODE_FMT, NODE_ARGS(h->xid.origin));
+
+ if (cs->id.val == h->xid.origin.val) {
+ who = cs;
+ spin_lock(&who->lock);
+ } else
+ who = lookup_and_lock_cs(cs->css, &h->xid.origin);
+
+ if (who && !who->cwr_state) {
+ /* Unless node is already reducing congestion window, shrink it
+ * to half of min(in_flight, cwnd) and enter congestion reduction state,
+ * where we ignore further congestion notifications until window is reduced
+ */
+ if (who->in_flight < who->cwnd)
+ who->cwnd = who->in_flight;
+ who->cwnd /= 2;
+ if (who->cwnd == 0)
+ who->cwnd = 1;
+ if (who->eff_cwnd > who->cwnd)
+ who->eff_cwnd = who->cwnd;
+ if (who->in_flight >= who->eff_cwnd)
+ who->cwr_state = 1;
+ }
+ if (who)
+ spin_unlock(&who->lock);
+}
+
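+/* Count replicas other than cs_id that are neither failed nor blacklisted;
+ * a non-zero result means a slow read may be rerouted to another CS.
+ */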
+static int may_reroute(struct pcs_cs_list *csl, PCS_NODE_ID_T cs_id)
+{
+ int i;
+ int legit = 0;
+
+ for (i = csl->nsrv - 1; i >= 0; i--) {
+ struct pcs_cs *cs = csl->cs[i].cslink.cs;
+
+ if (cs->id.val == cs_id.val)
+ continue;
+ if (test_bit(CS_SF_FAILED, &cs->state))
+ continue;
+ if (cs_is_blacklisted(cs))
+ continue;
+ if (test_bit(i, &csl->blacklist) &&
+ jiffies < READ_ONCE(csl->blacklist_expires))
+ continue;
+ legit++;
+ }
+ return legit;
+}
+
+static void cs_keep_waiting(struct pcs_rpc *ep, struct pcs_msg *req, struct pcs_msg *msg)
+{
+ struct pcs_rpc_hdr *h = (struct pcs_rpc_hdr *)msg_inline_head(msg);
+ struct pcs_cs *cs = ep->private;
+ struct pcs_cs *who;
+
+ /* Some CS reported it cannot complete local IO in time, close congestion window */
+ who = lookup_and_lock_cs(cs->css, &h->xid.origin);
+ if (who) {
+ struct pcs_int_request *ireq = req->private2;
+ abs_time_t lat = ireq ? ((jiffies - ireq->ts_sent) * 1000) / HZ : 0;
+
+ if (ireq)
+ ireq->wait_origin = h->xid.origin;
+
+ if (!who->cwr_state) {
+ DTRACE("Congestion window on CS" NODE_FMT " reducing %d/%d/%d", NODE_ARGS(h->xid.origin),
+ who->in_flight, who->eff_cwnd, who->cwnd);
+ if (who->in_flight < who->cwnd)
+ who->cwnd = who->in_flight;
+ who->cwnd /= 2;
+ if (who->cwnd == 0)
+ who->cwnd = 1;
+ if (who->eff_cwnd > who->cwnd)
+ who->eff_cwnd = who->cwnd;
+ if (who->in_flight >= who->eff_cwnd)
+ who->cwr_state = 1;
+ }
+ cs_update_io_latency(who, lat);
+ if (ireq && ireq->type == PCS_IREQ_IOCHUNK && ireq->iochunk.direction == 0) {
+ /* Force CS reselection */
+ pcs_map_force_reselect(who);
+
+ /* If request still has no banned CS and delayed for too long,
+ * cancel and reroute
+ */
+ if (ireq->iochunk.banned_cs.val == 0 && lat >= PCS_MAX_READ_IO_LATENCY*1000
+ && may_reroute(ireq->iochunk.csl, h->xid.origin)) {
+ TRACE("Canceling read on CS" NODE_FMT, NODE_ARGS(h->xid.origin));
+ ireq->iochunk.banned_cs = h->xid.origin;
+ spin_unlock(&who->lock);
+ pcs_rpc_cancel_request(req);
+ return;
+ }
+ }
+
+ spin_unlock(&who->lock);
+ }
+
+}
+
+static int cs_input(struct pcs_rpc *ep, struct pcs_msg *msg)
+{
+ struct pcs_rpc_hdr *h = (struct pcs_rpc_hdr *)msg->_inline_buffer;
+
+ switch (h->type) {
+ case PCS_CS_CONG_NOTIFY:
+ handle_congestion(ep->private, h);
+ msg->done(msg);
+ return 0;
+ default:
+ pcs_log(0, "Unsupported message type %u\n", h->type);
+ return PCS_ERR_PROTOCOL;
+ }
+}
+
+void pcs_cs_notify_error(struct pcs_cluster_core *cc, pcs_error_t *err)
+{
+ struct list_head queue;
+ struct pcs_cs *cs;
+
+ INIT_LIST_HEAD(&queue);
+
+ /* Filter out errors specific to a particular chunk.
+ * Probably, we should handle only timeouts here.
+ */
+ switch (err->value) {
+ case PCS_ERR_CSD_STALE_MAP:
+ case PCS_ERR_CSD_REPLICATING:
+ case PCS_ERR_PROTOCOL:
+ case PCS_ERR_CSD_RO_MAP:
+ return;
+ }
+
+ cs = lookup_and_lock_cs(&cc->css, &err->offender);
+ if (cs == NULL)
+ return;
+
+ list_splice_tail_init(&cs->cong_queue, &queue);
+ clear_bit(CS_SF_CONGESTED, &cs->state);
+ cs->cong_queue_len = 0;
+ cs_blacklist(cs, err->value, "notify error");
+ spin_unlock(&cs->lock);
+
+ pcs_cc_requeue(cc, &queue);
+
+}
+
+static void pcs_cs_isolate(struct pcs_cs *cs, struct list_head *dispose)
+{
+ assert_spin_locked(&cs->lock);
+
+ list_splice_tail_init(&cs->active_list, dispose);
+ list_splice_tail_init(&cs->cong_queue, dispose);
+ cs->active_list_len = 0;
+ cs->cong_queue_len = 0;
+ clear_bit(CS_SF_CONGESTED, &cs->state);
+
+ cs->is_dead = 1;
+ spin_lock(&cs->css->lock);
+ if (!hlist_unhashed(&cs->hlist))
+ hlist_del_rcu(&cs->hlist);
+ list_del(&cs->lru_link);
+ list_del(&cs->bl_link);
+ cs->css->ncs--;
+
+ if (list_empty(&cs->css->bl_list))
+ cancel_delayed_work(&cs->css->bl_work);
+ spin_unlock(&cs->css->lock);
+
+ while (!list_empty(&cs->map_list)) {
+ struct pcs_cs_link *csl = list_first_entry(&cs->map_list,
+ struct pcs_cs_link,
+ link);
+ csl->cs = NULL;
+ cs->nmaps--;
+ list_del_init(&csl->link);
+ }
+
+
+ BUG_ON(cs->nmaps);
+
+ if (!list_empty(&cs->flow_lru))
+ pcs_flow_cs_unbind_all(cs);
+ BUG_ON(cs->nflows);
+}
+
+static void cs_free_callback(struct rcu_head *head)
+{
+ struct pcs_cs *cs = container_of(head, struct pcs_cs, rcu);
+
+ kfree(cs);
+}
+
+static void pcs_cs_destroy(struct pcs_cs *cs)
+{
+ BUG_ON(!list_empty(&cs->active_list));
+ BUG_ON(!list_empty(&cs->cong_queue));
+ BUG_ON(!cs->is_dead);
+
+ if (cs->rpc) {
+ pcs_rpc_close(cs->rpc);
+ cs->rpc = NULL;
+ }
+ call_rcu(&cs->rcu, cs_free_callback);
+}
+
+
+void cs_aborting(struct pcs_rpc *ep, int error)
+{
+ pcs_rpc_reset(ep);
+}
+
+/* Latency is a difficult value to use for any decisions.
+ * It is sampled at random, and we do not know what is happening while
+ * we have no samples. For now we do the following: arriving samples
+ * are locked in and used as if latency stayed at that value until the next sample.
+ * If we have no samples, the latency value slowly decays. This prepared value
+ * is used to take the EWMA.
+ */
+
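+/* For example, with decay_period == CS_LAT_DECAY_INTERVAL a sample taken three
+ * periods ago is reported as lat >> 3, and samples older than 30 periods decay
+ * to zero.
+ */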
+static unsigned int lat_decay(unsigned int lat, unsigned decay_period,
+ abs_time_t now, abs_time_t stamp)
+{
+ unsigned int interval;
+
+ if (now < stamp + decay_period)
+ return lat;
+
+ if (stamp == 0 || now > stamp + 30 * decay_period)
+ return 0;
+
+ interval = (now - stamp) / decay_period;
+
+ return lat >> interval;
+
+}
+
+unsigned int __cs_get_avg_latency(struct pcs_cs *cs, abs_time_t now)
+{
+ return lat_decay(atomic_read(&cs->latency_avg), CS_LAT_DECAY_INTERVAL,
+ now, READ_ONCE(cs->latency_stamp));
+}
+
+unsigned int cs_get_avg_latency(struct pcs_cs *cs)
+{
+ return __cs_get_avg_latency(cs, jiffies);
+}
+unsigned int __cs_get_avg_net_latency(struct pcs_cs *cs, abs_time_t now)
+{
+ return lat_decay(READ_ONCE(cs->net_latency_avg), CS_LAT_DECAY_INTERVAL,
+ now, READ_ONCE(cs->net_latency_stamp));
+
+}
+
+unsigned int cs_get_avg_net_latency(struct pcs_cs *cs)
+{
+ return __cs_get_avg_net_latency(cs, jiffies);
+}
+
+void cs_account_latency(struct pcs_cs *cs, unsigned int cost)
+{
+ unsigned lat;
+ abs_time_t now = jiffies;
+
+ lat = __cs_get_avg_latency(cs, now);
+
+ atomic_add(cost, &cs->latency_avg);
+ WRITE_ONCE(cs->latency_stamp, now);
+}
+
+void cs_update_io_latency(struct pcs_cs *cs, u32 lat)
+{
+ abs_time_t now = jiffies;
+ u32 cur_latency;
+
+ cur_latency = __cs_get_avg_latency(cs, jiffies);
+
+ atomic_add((int)(lat - cur_latency) >> CS_LAT_EWMA_LOG, &cs->latency_avg);
+ WRITE_ONCE(cs->last_latency, lat);
+ WRITE_ONCE(cs->latency_stamp, now);
+}
+
+
+void cs_update_net_latency(struct pcs_cs *cs, u32 lat)
+{
+ abs_time_t now = jiffies;
+ struct pcs_rpc *ep = cs->rpc;
+ u32 cur_latency;
+
+ cur_latency = __cs_get_avg_net_latency(cs, now);
+
+ cur_latency += ((int)(lat - cur_latency) >> CS_LAT_EWMA_LOG);
+
+ WRITE_ONCE(cs->net_latency_avg, cur_latency);
+ WRITE_ONCE(cs->net_latency_stamp, now);
+
+ if (lat < READ_ONCE(ep->netlat_min))
+ WRITE_ONCE(ep->netlat_min, lat);
+ if (lat > READ_ONCE(ep->netlat_max))
+ WRITE_ONCE(ep->netlat_max, lat);
+ atomic_inc(&ep->netlat_cnt);
+ atomic64_add(lat, &ep->netlat_avg);
+}
+
+unsigned int cs_get_avg_in_flight(struct pcs_cs *cs)
+{
+ assert_spin_locked(&cs->lock);
+
+ if (cs->in_flight == 0) {
+ abs_time_t now;
+
+ now = jiffies;
+
+ if (now >= cs->idle_stamp + CS_LAT_DECAY_INTERVAL) {
+ if (cs->idle_stamp == 0 || now > cs->idle_stamp + 30*CS_LAT_DECAY_INTERVAL) {
+ cs->in_flight_avg = 0;
+ } else {
+ unsigned int interval;
+
+ interval = (now - cs->idle_stamp)/CS_LAT_DECAY_INTERVAL;
+ cs->idle_stamp = now;
+ cs->in_flight_avg >>= interval;
+ }
+ if (cs->cwnd > PCS_CS_INIT_CWND) {
+ cs->cwnd = PCS_CS_INIT_CWND;
+ if (cs->eff_cwnd > PCS_CS_INIT_CWND)
+ cs->eff_cwnd = PCS_CS_INIT_CWND;
+ }
+ }
+ }
+
+ return cs->in_flight_avg;
+}
+
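+/* Account newly submitted bytes: update the EWMA of in-flight data and track
+ * the high water mark used for congestion window management.
+ */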
+void cs_increment_in_flight(struct pcs_cs *cs, unsigned int to_add)
+{
+ unsigned int avg;
+
+ spin_lock(&cs->lock);
+ avg = cs_get_avg_in_flight(cs);
+
+ cs->in_flight += to_add;
+
+ cs->in_flight_avg = avg + (((int)(cs->in_flight - avg)) >> CS_LAT_EWMA_LOG);
+
+ if (cs->in_flight > cs->in_flight_hwm) {
+ cs->in_flight_hwm = cs->in_flight;
+ cs->in_flight_hwm_stamp = jiffies;
+ DTRACE("HWM on CS" NODE_FMT " is %u\n", NODE_ARGS(cs->id), cs->in_flight);
+ }
+ spin_unlock(&cs->lock);
+}
+
+void cs_decrement_in_flight(struct pcs_cs *cs, unsigned int to_dec)
+{
+ assert_spin_locked(&cs->lock);
+
+ cs->in_flight -= to_dec;
+
+ BUG_ON((int)cs->in_flight < 0);
+
+ if (cs->in_flight < cs->eff_cwnd) {
+ cs->cwr_state = 0;
+ pcs_cs_flush_cong_queue(cs);
+ }
+ if (cs->in_flight == 0)
+ cs->idle_stamp = jiffies;
+}
+
+/* Check that cwnd was used recently. If it was not used, drop it. */
+
+void cs_cwnd_use_or_lose(struct pcs_cs *cs)
+{
+ assert_spin_locked(&cs->lock);
+
+ if (cs->in_flight_hwm < cs->cwnd && cs->cwnd > PCS_CS_INIT_CWND) {
+ abs_time_t now = jiffies;
+
+ if (now > cs->in_flight_hwm_stamp + CS_LAT_DECAY_INTERVAL) {
+ unsigned int cwnd;
+
+ cwnd = cs->in_flight_hwm;
+ if (cwnd < PCS_CS_INIT_CWND)
+ cwnd = PCS_CS_INIT_CWND;
+
+ TRACE("Congestion window on CS#" NODE_FMT " was not used, shrink %u -> %u", NODE_ARGS(cs->id),
+ cs->cwnd, cwnd);
+ cs->cwnd = cwnd;
+ if (cs->eff_cwnd > cwnd)
+ cs->eff_cwnd = cwnd;
+ cs->in_flight_hwm_stamp = now;
+ cs->in_flight_hwm = cs->in_flight;
+ }
+ }
+}
+
+static void cs_probe_done(struct pcs_msg *msg)
+{
+ struct pcs_cs_set *css = msg->private;
+ struct pcs_cs *cs;
+
+ cs = lookup_and_lock_cs(css, &msg->rpc->peer_id);
+
+ if (cs) {
+ if (!pcs_if_error(&msg->error)) {
+ cs_whitelist(cs, "probe");
+ } else {
+ TRACE("probe error %d", msg->error.value);
+ cs_blacklist(cs, msg->error.value, "probe");
+ }
+ cs->is_probing = 0;
+ spin_unlock(&cs->lock);
+ }
+ pcs_free_msg(msg);
+}
+
+static struct pcs_msg *cs_prep_probe(struct pcs_cs *cs)
+{
+ struct pcs_msg *msg;
+ struct pcs_cs_map_prop *m;
+ unsigned int msg_sz = offsetof(struct pcs_cs_map_prop, nodes) + sizeof(struct pcs_cs_node_desc);
+
+
+ msg = pcs_rpc_alloc_output_msg(msg_sz);
+ if (!msg)
+ return NULL;
+
+ m = (struct pcs_cs_map_prop *)msg_inline_head(msg);
+ memset(m, 0, msg_sz);
+
+ m->hdr.h.type = PCS_CS_MAP_PROP_REQ;
+ m->hdr.h.len = msg_sz;
+
+ m->flags = CS_MAPF_PING;
+ m->nnodes = 1;
+ m->nodes[0].state = CS_OBJ_UNKNOWN;
+ m->nodes[0].info.id = cs->id;
+ m->nodes[0].info.addr = cs->rpc->addr;
+
+ msg->done = cs_probe_done;
+ msg->private = cs->css;
+ msg->timeout = PCS_CS_BLACKLIST_TIMER / 2;
+ return msg;
+}
+
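+/* Blacklist timer: walk the blacklisted CS list, destroy entries that no
+ * longer have maps, and send a PING map-prop probe to the rest.
+ */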
+static void bl_timer_work(struct work_struct *w)
+{
+ struct pcs_cs_set *css = container_of(w, struct pcs_cs_set, bl_work.work);
+ struct pcs_cluster_core *cc = cc_from_csset(css);
+ LIST_HEAD(local_lst);
+ LIST_HEAD(to_blacklist);
+ LIST_HEAD(to_resubmit);
+
+ spin_lock(&css->lock);
+ list_splice_tail_init(&css->bl_list, &local_lst);
+ spin_unlock(&css->lock);
+ if (list_empty(&local_lst))
+ return;
+
+ while (!list_empty(&local_lst)) {
+ struct pcs_cs *cs;
+ struct pcs_msg *msg;
+
+ cs = list_first_entry(&local_lst, struct pcs_cs, bl_link);
+
+ spin_lock(&cs->lock);
+ BUG_ON(cs->is_dead);
+ list_move(&cs->bl_link, &to_blacklist);
+ if (cs->is_probing) {
+ spin_unlock(&cs->lock);
+ continue;
+ }
+ if (!cs->nmaps) {
+ pcs_cs_isolate(cs, &to_resubmit);
+ spin_unlock(&cs->lock);
+ pcs_cs_destroy(cs);
+ continue;
+ }
+ cs->is_probing = 1;
+ spin_unlock(&cs->lock);
+ msg = cs_prep_probe(cs);
+ if (msg)
+ pcs_rpc_call(cs->rpc, msg);
+ spin_lock(&cs->lock);
+ if (!msg)
+ cs->is_probing = 0;
+ spin_unlock(&cs->lock);
+ }
+ spin_lock(&css->lock);
+ list_splice(&to_blacklist, &css->bl_list);
+ if (!list_empty(&css->bl_list))
+ mod_delayed_work(cc->wq, &css->bl_work, PCS_CS_BLACKLIST_TIMER);
+ spin_unlock(&css->lock);
+
+ pcs_cc_requeue(cc, &to_resubmit);
+}
+
+void pcs_csset_init(struct pcs_cs_set *css)
+{
+ unsigned int i;
+
+ for (i = 0; i < PCS_CS_HASH_SIZE; i++)
+ INIT_HLIST_HEAD(&css->ht[i]);
+
+ INIT_LIST_HEAD(&css->lru);
+ INIT_LIST_HEAD(&css->bl_list);
+ INIT_DELAYED_WORK(&css->bl_work, bl_timer_work);
+ css->ncs = 0;
+ spin_lock_init(&css->lock);
+}
+
+void pcs_csset_fini(struct pcs_cs_set *css)
+{
+ unsigned int i;
+ LIST_HEAD(to_resubmit);
+
+ for (i = 0; i < PCS_CS_HASH_SIZE; i++) {
+ spin_lock(&css->lock);
+ while (!hlist_empty(&css->ht[i])) {
+ struct pcs_cs *cs;
+
+ rcu_read_lock();
+ cs = hlist_entry(css->ht[i].first, struct pcs_cs, hlist);
+ hlist_del_init_rcu(&cs->hlist);
+ spin_unlock(&css->lock);
+
+ spin_lock(&cs->lock);
+ if (cs->is_dead) {
+ spin_unlock(&cs->lock);
+ rcu_read_unlock();
+ spin_lock(&css->lock);
+ continue;
+ }
+ rcu_read_unlock();
+ pcs_cs_isolate(cs, &to_resubmit);
+ spin_unlock(&cs->lock);
+ pcs_cs_destroy(cs);
+
+ spin_lock(&css->lock);
+ }
+ spin_unlock(&css->lock);
+
+ }
+ cancel_delayed_work_sync(&css->bl_work);
+ /* NOTE: the queue must already be empty at destruction time */
+ BUG_ON(!list_empty(&to_resubmit));
+ pcs_cc_requeue(cc_from_csset(css), &to_resubmit);
+
+ BUG_ON(timer_pending(&css->bl_work.timer));
+ BUG_ON(!list_empty(&css->bl_list));
+ BUG_ON(!list_empty(&css->lru));
+ BUG_ON(css->ncs);
+}
+
+int pcs_cs_for_each_entry(struct pcs_cs_set *set, int (*cb)(struct pcs_cs *cs, void *arg), void *arg)
+{
+ int rc = 0;
+ unsigned int i;
+ struct pcs_cs *cs;
+ struct hlist_node *node;
+
+ spin_lock(&set->lock);
+ for (i = 0; i < PCS_CS_HASH_SIZE; i++) {
+ hlist_for_each_entry_safe(cs, node, &set->ht[i], hlist) {
+ rc = cb(cs, arg);
+ if (rc < 0) {
+ spin_unlock(&set->lock);
+ return rc;
+ }
+ }
+ }
+ spin_unlock(&set->lock);
+ return rc;
+}
+
+static int do_update_stat(struct pcs_cs *cs, void *arg)
+{
+ (void)arg;
+ pcs_cs_stat_up(cs);
+ return 0;
+}
+
+void pcs_cs_set_stat_up(struct pcs_cs_set *set)
+{
+ pcs_cs_for_each_entry(set, do_update_stat, 0);
+}
+
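+/*
+ * Park a request on the per-CS congestion queue. Requests accumulate here
+ * while the CS congestion window is exhausted; once in_flight drops below
+ * eff_cwnd, pcs_cs_flush_cong_queue() moves the whole queue to active_list.
+ */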
+void pcs_cs_cong_enqueue(struct pcs_int_request *ireq, struct pcs_cs *cs)
+{
+ spin_lock(&cs->lock);
+ if (!test_bit(CS_SF_CONGESTED, &cs->state))
+ set_bit(CS_SF_CONGESTED, &cs->state);
+ list_add_tail(&ireq->list, &cs->cong_queue);
+ cs->cong_queue_len++;
+ if (!ireq->qdepth)
+ ireq->qdepth = cs->cong_queue_len + cs->active_list_len;
+ spin_unlock(&cs->lock);
+}
diff --git a/fs/fuse/kio/pcs/pcs_cs.h b/fs/fuse/kio/pcs/pcs_cs.h
new file mode 100644
index 000000000000..c04317e4a9a9
--- /dev/null
+++ b/fs/fuse/kio/pcs/pcs_cs.h
@@ -0,0 +1,182 @@
+#ifndef _PCS_CS_H_
+#define _PCS_CS_H_ 1
+
+#include "pcs_prot_types.h"
+#include "pcs_perfcounters.h"
+
+struct pcs_map_entry;
+
+#define PCS_CS_INIT_CWND (1*1024*1024)
+#define PCS_CS_MAX_CWND (16*1024*1024)
+#define PCS_MAX_NETWORK_LATENCY ((2000*3)/4)
+#define PCS_MAX_IO_LATENCY (8*HZ)
+#define PCS_MAX_READ_IO_LATENCY (5*HZ)
+
+/* io_prio received from MDS is valid for this long; after that it is stale and cannot be used */
+#define PCS_CS_IO_PRIO_VALID_TIME (60*HZ)
+
+/* When CS is idle its latency halves after CS_LAT_DECAY_INTERVAL */
+#define CS_LAT_DECAY_INTERVAL (HZ/2)
+
+/* When CS is active time constant is ln(2) * 2^CS_LAT_EWMA_LOG / IOPS,
+ * so that with IOPS=100 and CS_LAT_EWMA_LOG=6 we have ~400ms
+ */
+#define CS_LAT_EWMA_LOG (6)
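+/* A minimal sketch of the shift-based EWMA update this constant implies
+ * (the actual update lives in cs_update_io_latency() and may differ):
+ *
+ *	avg += ((int)(sample - avg)) >> CS_LAT_EWMA_LOG;
+ *
+ * i.e. with CS_LAT_EWMA_LOG = 6 each new sample contributes 1/64 of its
+ * deviation from the current average.
+ */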
+
+#define PCS_CS_BLACKLIST_TIMER (10*HZ)
+
+enum {
+ CS_SF_LOCAL,
+ CS_SF_LOCAL_SOCK,
+ CS_SF_INACTIVE,
+ CS_SF_REPLICATING,
+ CS_SF_FAILED,
+ CS_SF_BLACKLISTED,
+ CS_SF_ACTIVE,
+ CS_SF_CONGESTED,
+};
+
+struct pcs_cs {
+ struct hlist_node hlist;
+ union {
+ struct list_head lru_link;
+ struct rcu_head rcu;
+ };
+ spinlock_t lock;
+ struct pcs_cs_set *css;
+
+ PCS_NODE_ID_T id;
+
+ unsigned int in_flight;
+ unsigned int eff_cwnd;
+ unsigned int cwnd;
+ int cwr_state;
+ atomic_t latency_avg;
+ unsigned int net_latency_avg;
+ unsigned int in_flight_avg;
+ unsigned int last_latency;
+ unsigned int in_flight_hwm;
+ abs_time_t in_flight_hwm_stamp;
+ abs_time_t latency_stamp;
+ abs_time_t net_latency_stamp;
+ abs_time_t idle_stamp;
+ struct list_head cong_queue;
+ int cong_queue_len;
+ struct list_head active_list;
+ int active_list_len;
+
+ pcs_cs_io_prio_t io_prio;
+ pcs_cs_net_prio_t net_prio;
+ u8 mds_flags;
+ abs_time_t io_prio_stamp;
+
+ struct list_head flow_lru;
+ int nflows;
+
+ unsigned long state;
+ int blacklist_reason;
+ struct list_head bl_link;
+ unsigned is_probing:1;
+ unsigned is_dead:1;
+
+
+ int addr_serno;
+ PCS_NET_ADDR_T addr;
+
+ struct pcs_rpc *rpc;
+
+ int nmaps;
+ struct list_head map_list;
+
+ struct {
+ struct pcs_perf_stat_cnt iolat;
+ struct pcs_perf_stat_cnt netlat;
+ struct pcs_perf_rate_cnt read_ops_rate;
+ struct pcs_perf_rate_cnt write_ops_rate;
+ struct pcs_perf_rate_cnt sync_ops_rate;
+ } stat;
+};
+
+static inline void pcs_cs_init_cong_queue(struct pcs_cs *cs)
+{
+ INIT_LIST_HEAD(&cs->cong_queue);
+ cs->cong_queue_len = 0;
+ clear_bit(CS_SF_CONGESTED, &cs->state);
+}
+
+static inline void pcs_cs_init_active_list(struct pcs_cs *cs)
+{
+ INIT_LIST_HEAD(&cs->active_list);
+ cs->active_list_len = 0;
+}
+
+static inline void pcs_cs_flush_cong_queue(struct pcs_cs *cs)
+{
+ assert_spin_locked(&cs->lock);
+ list_splice_tail(&cs->cong_queue, &cs->active_list);
+ cs->active_list_len += cs->cong_queue_len;
+ pcs_cs_init_cong_queue(cs);
+}
+
+void pcs_cs_cong_enqueue(struct pcs_int_request *ireq, struct pcs_cs *cs);
+
+#define PCS_CS_HASH_SIZE 1024
+
+struct pcs_cs_set {
+ struct hlist_head ht[PCS_CS_HASH_SIZE];
+ struct list_head lru;
+ struct list_head bl_list;
+ struct delayed_work bl_work;
+ unsigned int ncs;
+ spinlock_t lock;
+};
+
+void pcs_cs_submit(struct pcs_cs *cs, struct pcs_int_request *ireq);
+struct pcs_cs *pcs_cs_find_create(struct pcs_cs_set *csset, PCS_NODE_ID_T *id, PCS_NET_ADDR_T *addr, int local);
+void pcs_cs_notify_error(struct pcs_cluster_core *cc, pcs_error_t *err);
+
+void cs_update_io_latency(struct pcs_cs *cs, u32 lat);
+unsigned int cs_get_avg_latency(struct pcs_cs *cs);
+unsigned int __cs_get_avg_latency(struct pcs_cs *cs, abs_time_t now);
+void cs_account_latency(struct pcs_cs *cs, unsigned int to_add);
+void cs_update_net_latency(struct pcs_cs *cs, u32 lat);
+unsigned int cs_get_avg_net_latency(struct pcs_cs *cs);
+unsigned int __cs_get_avg_net_latency(struct pcs_cs *cs, abs_time_t now);
+void cs_increment_in_flight(struct pcs_cs *cs, unsigned int to_add);
+void cs_decrement_in_flight(struct pcs_cs *cs, unsigned int to_dec);
+void cs_cwnd_use_or_lose(struct pcs_cs *cs);
+unsigned int cs_get_avg_in_flight(struct pcs_cs *cs);
+
+void pcs_csset_init(struct pcs_cs_set *css);
+void pcs_csset_fini(struct pcs_cs_set *css);
+
+struct pcs_cs *pcs_cs_alloc(struct pcs_cs_set *css, struct pcs_cluster_core *cc);
+
+void cs_log_io_times(struct pcs_int_request *ireq, struct pcs_msg *resp, unsigned int max_iolat);
+int pcs_cs_format_io_times(char *buf, int buflen, struct pcs_int_request *ireq, struct pcs_msg *resp);
+void cs_set_io_times_logger(void (*logger)(struct pcs_int_request *ireq, struct pcs_msg *resp, u32 max_iolat, void *ctx), void *ctx);
+
+int pcs_cs_for_each_entry(struct pcs_cs_set *set, int (*cb)(struct pcs_cs *cs, void *arg), void *arg);
+
+void pcs_cs_update_stat(struct pcs_cs *cs, u32 iolat, u32 netlat, int op_type);
+
+static inline void pcs_cs_stat_up(struct pcs_cs *cs)
+{
+#if 0
+ /* TODO: perf counters are temporarily disabled */
+ pcs_perfcounter_stat_up(&cs->stat.iolat);
+ pcs_perfcounter_stat_up(&cs->stat.netlat);
+ pcs_perfcounter_up_rate(&cs->stat.write_ops_rate);
+ pcs_perfcounter_up_rate(&cs->stat.read_ops_rate);
+ pcs_perfcounter_up_rate(&cs->stat.sync_ops_rate);
+#endif
+}
+
+static inline bool cs_is_blacklisted(struct pcs_cs *cs)
+{
+ return test_bit(CS_SF_BLACKLISTED, &cs->state);
+}
+
+void pcs_cs_set_stat_up(struct pcs_cs_set *set);
+
+#endif /* _PCS_CS_H_ */
diff --git a/fs/fuse/kio/pcs/pcs_cs_prot.h b/fs/fuse/kio/pcs/pcs_cs_prot.h
new file mode 100644
index 000000000000..f6b1c7f0dedf
--- /dev/null
+++ b/fs/fuse/kio/pcs/pcs_cs_prot.h
@@ -0,0 +1,125 @@
+#ifndef _PCS_CS_PROT_H_
+#define _PCS_CS_PROT_H_ 1
+
+#include "pcs_rpc_prot.h"
+
+#define PCS_CS_FLUSH_WEIGHT (128*1024)
+
+struct pcs_cs_sync_data
+{
+ PCS_INTEGRITY_SEQ_T integrity_seq; /* Invariant. Changed only on CS host crash */
+ PCS_SYNC_SEQ_T sync_epoch; /* Invariant. Changed on CSD startup. */
+ PCS_SYNC_SEQ_T sync_dirty; /* Sync number of CS upon completion of local write */
+ PCS_SYNC_SEQ_T sync_current; /* Current sync number of CS. If > sync_dirty, write is synced */
+
+ u64 misc; /* Message received by CS */
+ u32 ts_io; /* Local IO finished */
+ u32 ts_net; /* Net finished */
+ u64 _reserved; /* For future extensions */
+} __attribute__((aligned(8)));
+
+/* IO req/resp flags. Older versions have the flag field set to zero, so the zero value must stay neutral.
+ * We have room for 12 flags.
+ */
+#define PCS_CS_IO_CACHED (1ULL<<63) /* Resp: result is read from cache or written ahead to journal */
+#define PCS_CS_IO_SEQ (1ULL<<62) /* Req: request is part of sequential flow */
+
+#define PCS_CS_RESET_TS_RECV(sdata, ts) do { (sdata)->misc = ((u64)ts & 0xFFFFFFFFFFFFFULL); } while (0)
+#define PCS_CS_SET_TS_RECV(sdata, ts) do { (sdata)->misc = ((sdata)->misc & ~0xFFFFFFFFFFFFFULL) | ((u64)ts & 0xFFFFFFFFFFFFFULL); } while (0)
+#define PCS_CS_ADD_TS_RECV(sdata, ts) do { (sdata)->misc |= ((u64)ts & 0xFFFFFFFFFFFFFULL); } while (0)
+#define PCS_CS_GET_TS_RECV(sdata) ((sdata)->misc & 0xFFFFFFFFFFFFFULL)
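+/* The low 52 bits of ->misc carry the receive timestamp, the upper 12 bits
+ * are reserved for the PCS_CS_IO_* flags above. Illustrative usage only
+ * (variable names here are not from this patch):
+ *
+ *	PCS_CS_SET_TS_RECV(&hdr->sync, now);	// keeps the flag bits intact
+ *	ts = PCS_CS_GET_TS_RECV(&hdr->sync);	// strips the flag bits
+ */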
+
+struct pcs_cs_sync_resp {
+ PCS_NODE_ID_T cs_id;
+ struct pcs_cs_sync_data sync;
+} __attribute__((aligned(8)));
+
+struct pcs_cs_iohdr {
+ struct pcs_rpc_hdr hdr;
+
+ PCS_MAP_VERSION_T map_version;
+ PCS_CHUNK_UID_T uid;
+ u64 offset;
+ u32 size;
+ u32 iocontext;
+ u64 _reserved; /* For future extensions */
+ struct pcs_cs_sync_data sync; /* Filled in all requests and responses */
+ struct pcs_cs_sync_resp sync_resp[0]; /* Used only in response to write/sync */
+} __attribute__((aligned(8)));
+
+
+/* Maximal message size. The value is essentially arbitrary */
+#define PCS_CS_MSG_MAX_SIZE (1024*1024 + sizeof(struct pcs_cs_iohdr))
+
+#define PCS_CS_READ_REQ (PCS_RPC_CS_CLIENT_BASE)
+#define PCS_CS_READ_RESP (PCS_CS_READ_REQ|PCS_RPC_DIRECTION)
+
+#define PCS_CS_WRITE_REQ (PCS_RPC_CS_CLIENT_BASE + 2)
+#define PCS_CS_WRITE_RESP (PCS_CS_WRITE_REQ|PCS_RPC_DIRECTION)
+
+#define PCS_CS_REPLICATE_REQ (PCS_RPC_CS_CLIENT_BASE + 4)
+#define PCS_CS_REPLICATE_RESP (PCS_CS_REPLICATE_REQ|PCS_RPC_DIRECTION)
+
+#define PCS_CS_SYNC_REQ (PCS_RPC_CS_CLIENT_BASE + 6)
+#define PCS_CS_SYNC_RESP (PCS_CS_SYNC_REQ|PCS_RPC_DIRECTION)
+
+#define PCS_CS_WRITE_SYNC_REQ (PCS_RPC_CS_CLIENT_BASE + 8)
+#define PCS_CS_WRITE_SYNC_RESP (PCS_CS_WRITE_SYNC_REQ|PCS_RPC_DIRECTION)
+
+struct pcs_cs_cong_notification {
+ struct pcs_rpc_hdr hdr;
+
+ PCS_XID_T xid; /* XID of request triggered congestion notification */
+} __attribute__((aligned(8)));
+
+#define PCS_CS_CONG_NOTIFY (PCS_RPC_CS_CLIENT_BASE + 10)
+
+////////////////////////////////////////////
+//// from pcs_mds_cs_prot.h
+//// required for PCS_CS_MAP_PROP_REQ/ping to work
+struct pcs_cs_fs_info {
+ u64 free_space;
+ u64 total_space;
+};
+
+struct pcs_cs_node_desc {
+ s32 state; /* CS_OBJ_XXX */
+ u8 flags; /* CS_OBJF_XXX */
+ u8 role;
+ u16 csum_lo;
+ u32 status; /* PCS_ERR_XXX filled in response */
+ u16 csum_hi;
+ u8 parent_idx; /* Index of parent in replica tree. Undefined for root. */
+ u8 source_idx; /* Index of replication source for this replica */
+ u64 dirty_mask; /* Initialized by CS before forwarding the map downstream */
+ struct pcs_cs_info info; /* CS id and address */
+ struct pcs_cs_fs_info fs_info; /* Filled by CS in response */
+} __attribute__((aligned(8)));
+
+struct pcs_cs_map_prop {
+ struct pcs_mds_hdr hdr;
+
+ PCS_CHUNK_UID_T chunk_uid;
+ /* Messages with a version less than or equal to the current one (if available) are ignored unless
+ * the CS_MAPF_PING flag is set. With CS_MAPF_PING the version is ignored, as are the chunk state/flags.
+ */
+ PCS_MAP_VERSION_T version;
+ /* During replication this version indicates the newest dirty mask version allowed to be used for recovery. */
+ PCS_MAP_VERSION_T dirty_version;
+ u32 flags; /* CS_MAPF_XXX */
+ u32 chunk_size;
+ /* The maximum number of nodes in the chain. Intended to be used in timeout calculation. */
+ u16 chain_nodes;
+ u16 reserved;
+ u32 nnodes;
+ struct pcs_cs_node_desc nodes[0];
+} __attribute__((aligned(8)));
+
+#define CS_OBJ_UNKNOWN -1
+#define CS_MAPF_PING 0x1000
+#define PCS_CS_MAP_PROP_REQ (PCS_RPC_CS_CS_BASE + 2)
+#define PCS_CS_MAP_PROP_RESP (PCS_CS_MAP_PROP_REQ | PCS_RPC_DIRECTION)
+//////////////////////////////////////////// end pcs_mds_cs_prot.h
+
+
+#endif /* _PCS_CS_PROT_H_ */
diff --git a/fs/fuse/kio/pcs/pcs_error.h b/fs/fuse/kio/pcs/pcs_error.h
new file mode 100644
index 000000000000..f4ec588943dc
--- /dev/null
+++ b/fs/fuse/kio/pcs/pcs_error.h
@@ -0,0 +1,189 @@
+#ifndef _PCS_ERROR_H_
+#define _PCS_ERROR_H_ 1
+
+#include "pcs_types.h"
+
+typedef enum {
+ PCS_ERR_OK = 0, /* No error */
+ PCS_ERR_NOMEM = 1, /* Out of memory: malloc failure */
+ PCS_ERR_PROTOCOL = 2, /* Fatal protocol error. Some condition, which should happen
+ * only if we have some bug in protocol implementation
+ */
+ PCS_ERR_AUTH = 3, /* Authentication failure due to wrong credentials */
+ PCS_ERR_NET = 4, /* Misc network error */
+ PCS_ERR_NOSPACE = 5, /* ENOSPC/EDQUOT while local file io */
+ PCS_ERR_IO = 6, /* Misc error while local file io */
+ PCS_ERR_LOST_LOCK = 7, /* CN did not get response from MDS for lease update,
+ * it is generated by CN itself, sort of self-fencing
+ * Probably, useless and should be removed.
+ */
+
+ PCS_ERR_NOT_FOUND = 8, /* Requested object not found */
+ PCS_ERR_INTERRUPTED = 9, /* The operation was interrupted, should be retried */
+ PCS_ERR_NET_ABORT = 10, /* Message dropped due to abort of network connection */
+ PCS_ERR_CONNECT_TIMEOUT = 11, /* Failed connect() */
+ PCS_ERR_AUTH_TIMEOUT = 12, /* Authentication failure due to timeout */
+ PCS_ERR_RESPONSE_TIMEOUT= 13, /* Peer did not respond or missed the deadline */
+ PCS_ERR_WRITE_TIMEOUT = 14, /* Socket write() failed, peer is stuck or network is broken */
+
+ PCS_ERR_CANCEL_REQUEST = 18, /* Request was cancelled by user */
+ PCS_ERR_CANCEL_IO = 19, /* IO request was cancelled */
+
+ PCS_ERR_LEASE_REQUIRED = 20, /* Lease required */
+ PCS_ERR_LEASE_EXPIRED = 21, /* Lease is expired */
+ PCS_ERR_LEASE_CONFLICT = 22, /* Lease request conflicts with another lease */
+ PCS_ERR_INV_PATH = 23, /* The path is invalid. Usually means an attempt to make a directory a subdirectory of itself. */
+ PCS_ERR_NOT_DIR = 24, /* Attempt to read non-directory */
+ PCS_ERR_IS_DIR = 25, /* Attempt to access directory (resize/io) */
+ PCS_ERR_NON_EMPTY_DIR = 26, /* Attempt to rename/delete a non-empty directory */
+ PCS_ERR_ZERO_CHUNK = 27, /* The requested chunk was not written yet and contains zero data */
+ PCS_ERR_INVALID = 29, /* Object is invalid */
+ PCS_ERR_INV_PARAMS = 30, /* Invalid parameters */
+ PCS_ERR_NO_ID = 31, /* Request from the client without ID */
+ PCS_ERR_INVALID_ID = 32, /* The client or server ID is invalid or banned */
+ PCS_ERR_NORES = 33, /* Not enough resources (too many requests) */
+ PCS_ERR_UNAVAIL = 34, /* Service unavailable */
+ PCS_ERR_BAD_CLUSTER = 35, /* The cluster id specified by client is invalid */
+ PCS_ERR_READONLY = 36, /* Invalid operation on read-only object */
+ PCS_ERR_PERM = 37, /* Permission denied */
+ PCS_ERR_UNSUPPORTED = 38, /* Operation is not supported */
+
+ PCS_ERR_TEMP_UNAVAIL = 40, /* The resource is temporarily unavailable */
+ PCS_ERR_INTEGRITY = 41, /* Not enough alive replicas available */
+ PCS_ERR_INTEGRITY_FAIL = 42, /* Fatal. Returned by MDS to client, when it is known that
+ * some unsynced data could be lost.
+ */
+
+ PCS_ERR_NO_STORAGE = 50, /* The number of chunk servers in cluster is less than the required number of replicas */
+ PCS_ERR_NOT_ALLOWED = 51, /* Operation is not allowed due to licensing limitations */
+ PCS_ERR_CFG_VERSION = 60, /* Configuration version mismatch */
+ PCS_ERR_CLNT_VERSION = 61, /* Client version is incompatible with server version (outdated) */
+ PCS_ERR_EXISTS = 70, /* Specified object already exists */
+ PCS_ERR_EPOCH_MISMATCH = 72, /* Object epoch mismatch due to concurrent update */
+ PCS_ERR_NO_DIR = 75, /* Name directory does not exist */
+ PCS_ERR_DIR_INST_VER = 76, /* Name instance version mismatch */
+ PCS_ERR_CONTEXT_LOST = 80, /* Operation context is lost on server restart */
+ PCS_ERR_NS_LEASE_BUSY = 81, /* Lease wasn't acquired due to another active lease */
+ PCS_ERR_NS_LEASE_INVALID= 82, /* Active lease doesn't have a reference with the id provided in the request */
+ PCS_ERR_NS_LOCK_EXPIRED = 83, /* Lock at object's name NS has already expired */
+
+ PCS_ERR_CSD_STALE_MAP = 100, /* Old map (or no map) at CS */
+ PCS_ERR_CSD_RO_MAP = 101, /* Write request with read-only map */
+ PCS_ERR_CSD_WR_IN_PROGR = 102, /* Read only map is rejected due to write requests being processed */
+ PCS_ERR_CSD_REPLICATING = 103, /* Attempt to read from unfinished replica */
+ PCS_ERR_CSD_STALLED_REPL= 104, /* Replication stalled */
+ PCS_ERR_CANCEL_KEEPWAIT = 105, /* IO request was canceled and redirected to another CS */
+ PCS_ERR_CSD_LACKING = 110, /* Not enough CS servers available */
+ PCS_ERR_CSD_DROPPED = 120, /* The CS server was dropped by administrator */
+ PCS_ERR_MDS_NOT_MASTER = 200, /* The target MDS is not current master */
+ PCS_ERR_MDS_EXIST = 201, /* The MDS with such id already exist in cluster */
+ PCS_ERR_MDS_RM_TOOMANY = 202, /* Removing this MDS will make the cluster unusable */
+
+ PCS_ERR_LICENSE_LIMIT = 300, /* Operation can't be completed due to license limitations */
+ PCS_ERR_NO_LICENSE = 301, /* No active license */
+
+ PCS_ERR_SSL_CERTIFICATE_REVOKED = 400, /* Certificate revoked */
+ PCS_ERR_SSL_CERTIFICATE_EXPIRED = 401, /* Certificate expired */
+ PCS_ERR_SSL_UNKNOWN_CA = 402, /* Certificate issued by a CA the peer does not know and trust */
+ PCS_ERR_PEER_CERTIFICATE_REJECTED = 403, /* The peer certificate has failed the verification */
+
+ PCS_ERR_UNKNOWN = 4095, /* Unknown error */
+ PCS_ERR_MAX = PCS_ERR_UNKNOWN
+} pcs_err_t;
+
+/* Get long description of the error */
+const char *pcs_strerror(pcs_err_t errnum);
+
+/* Get short mnemonic */
+const char *pcs_errname(pcs_err_t errnum);
+
+/* Render string describing errno (on Linux and Mac) or Windows system error code. Return 0 on success or positive error number */
+int pcs_sys_strerror_r(int err, char *buf, int buflen);
+
+/* ----------------------------------------------------------------------------------- */
+
+/* Error code handling. "value" is one of error codes defined below,
+ * all the components share one error namespace. System errnos are not used,
+ * each subsystem, using syscalls, must recode errnos to one of PCS error codes.
+ * "remote" means that "offender" is valid. "offender" is node id, where this error
+ * was generated.
+ *
+ * XXX TODO there is one important case. Now "offender" is set when we have a connection
+ * to the peer and that peer returned an error via RPC. This is wrong (in many situations):
+ * we should return a remote error when communication with a node fails due to
+ * failure of the network between us and the peer, or of the peer itself. This is important.
+ * But tricky: we should remember that not all communication channels should generate
+ * remote errors. E.g. failure of communication with MDS is a local error for a CS communicating
+ * with MDS, and a remote error with that CS as offender for other nodes. It is easy to mess
+ * up and I did mess it up from the very beginning. :-)
+ */
+
+struct _pcs_error_t
+{
+ unsigned int value : 31, remote: 1;
+
+ PCS_NODE_ID_T offender;
+};
+typedef struct _pcs_error_t pcs_error_t;
+
+static __inline void pcs_clear_error(pcs_error_t * err)
+{
+ err->value = 0;
+}
+
+static __inline int pcs_if_error(pcs_error_t const* err)
+{
+ return err->value != 0;
+}
+
+static __inline void pcs_copy_error(pcs_error_t * dst, pcs_error_t const* src)
+{
+ dst->value = src->value;
+ dst->remote = src->remote;
+ if (dst->remote)
+ dst->offender = src->offender;
+}
+
+static __inline void pcs_copy_error_cond(pcs_error_t * dst, pcs_error_t const* src)
+{
+ if (src->value && !dst->value)
+ pcs_copy_error(dst, src);
+}
+
+static __inline void pcs_set_local_error(pcs_error_t * status, int err)
+{
+ status->value = err;
+ status->remote = 0;
+}
+
+int pcs_error_to_errno(pcs_error_t *);
+
+static __inline void *pcs_err_ptr(int err)
+{
+ return (void*)(~(ULONG_PTR)err);
+}
+
+static __inline int pcs_ptr_err(void *ptr)
+{
+ return (int)(~(ULONG_PTR)ptr);
+}
+
+static __inline int pcs_is_ptr_err(void *ptr)
+{
+ return 0 < ~(ULONG_PTR)ptr && ~(ULONG_PTR)ptr <= PCS_ERR_MAX;
+}
+
+static __inline int pcs_is_ptr_err_or_null(void *ptr)
+{
+ return !ptr || pcs_is_ptr_err(ptr);
+}
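+
+/* The helpers above encode a pcs_err_t into a pointer with bitwise NOT,
+ * similar in spirit to the kernel's ERR_PTR()/PTR_ERR(). A usage sketch
+ * (illustrative only):
+ *
+ *	void *p = pcs_err_ptr(PCS_ERR_NOMEM);
+ *	if (pcs_is_ptr_err(p))
+ *		return pcs_ptr_err(p);
+ */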
+
+/* Convert errno on Linux/Mac or Windows error code to pcs_err_t */
+pcs_err_t pcs_errno_to_err(int err);
+
+__must_check static inline int errno_eagain(int err)
+{
+ return err == EAGAIN || err == EWOULDBLOCK;
+}
+
+#endif /* _PCS_ERROR_H_ */
diff --git a/fs/fuse/kio/pcs/pcs_flow_detect.h b/fs/fuse/kio/pcs/pcs_flow_detect.h
new file mode 100644
index 000000000000..1ac936dcc28f
--- /dev/null
+++ b/fs/fuse/kio/pcs/pcs_flow_detect.h
@@ -0,0 +1,7 @@
+#ifndef _PCS_FLOW_DETECT_H_
+#define _PCS_FLOW_DETECT_H_ 1
+
+/* TODO:!!! this is a stub for flow detection */
+#include "pcs_flow_detect_stub.h"
+
+#endif /* _PCS_FLOW_DETECT_H_ */
diff --git a/fs/fuse/kio/pcs/pcs_flow_detect_stub.h b/fs/fuse/kio/pcs/pcs_flow_detect_stub.h
new file mode 100644
index 000000000000..f9d0ffe68829
--- /dev/null
+++ b/fs/fuse/kio/pcs/pcs_flow_detect_stub.h
@@ -0,0 +1,76 @@
+#ifndef _PCS_FLOW_DETECT_STUB_H_
+#define _PCS_FLOW_DETECT_STUB_H_ 1
+
+/* TODO:!!! this is a stub for flow detection */
+
+/* This should be enough for 1000 iops; otherwise the lifetime should be decreased and/or the limit increased. */
+#define PCS_FLOW_LIFETIME (512)
+#define PCS_FLOW_LIMIT_DFLT (512)
+
+#define PCS_FLOW_RECENTTIME (50)
+#define PCS_FLOW_THRESH (6)
+
+struct pcs_flow_node
+{
+ int STUMB;
+};
+
+struct pcs_flow_table
+{
+ struct pcs_flow_node *STUMB;
+};
+
+struct pcs_flow_table_global
+{
+ struct pcs_flow_table *STUMB;
+ int nflows;
+};
+
+struct pcs_cs;
+
+static void pcs_flow_table_global_init(struct pcs_flow_table_global * gtab) __attribute__((unused));
+static void pcs_flow_table_global_fini(struct pcs_flow_table_global * gtab) __attribute__((unused));
+static void pcs_flow_table_init(struct pcs_flow_table * tab, struct pcs_flow_table_global * gtab) __attribute__((unused));
+static void pcs_flow_table_fini(struct pcs_flow_table * tab, struct pcs_flow_table_global * gtab) __attribute__((unused));
+static struct pcs_flow_node * pcs_flow_record(struct pcs_flow_table * tab, int dir, u64 start, unsigned int len,
+ struct pcs_flow_table_global * gtab) __attribute__((unused));
+static void pcs_flow_confirm(struct pcs_flow_node * fl, struct pcs_flow_table * tab, int dir, u64 start, unsigned int len,
+ struct pcs_flow_table_global * gtab) __attribute__((unused));
+static void pcs_flow_truncate(struct pcs_flow_table * tab, u64 new_size, struct pcs_flow_table_global * gtab) __attribute__((unused));
+static int pcs_flow_analysis(struct pcs_flow_table_global * gtab) __attribute__((unused));
+static int pcs_flow_cs_analysis(struct pcs_cs * cs) __attribute__((unused));
+static void pcs_flow_bind_cs(struct pcs_flow_node * fl, struct pcs_cs * cs) __attribute__((unused));
+static void pcs_flow_cs_unbind_all(struct pcs_cs * cs) __attribute__((unused));
+static void pcs_flow_put(struct pcs_flow_node * fl, struct pcs_flow_table_global * gtab) __attribute__((unused));
+static struct pcs_flow_node * pcs_flow_get(struct pcs_flow_node * fl) __attribute__((unused));
+static int pcs_flow_sequential(struct pcs_flow_node * fl) __attribute__((unused));
+
+
+
+
+
+
+static void pcs_flow_table_global_init(struct pcs_flow_table_global * gtab) {}
+static void pcs_flow_table_global_fini(struct pcs_flow_table_global * gtab) {}
+static void pcs_flow_table_init(struct pcs_flow_table * tab, struct pcs_flow_table_global * gtab) {}
+static void pcs_flow_table_fini(struct pcs_flow_table * tab, struct pcs_flow_table_global * gtab) {}
+
+static struct pcs_flow_node * pcs_flow_record(struct pcs_flow_table * tab, int dir, u64 start, unsigned int len,
+ struct pcs_flow_table_global * gtab)
+{
+ return NULL;
+}
+static void pcs_flow_confirm(struct pcs_flow_node * fl, struct pcs_flow_table * tab, int dir, u64 start, unsigned int len,
+ struct pcs_flow_table_global * gtab) {}
+static void pcs_flow_truncate(struct pcs_flow_table * tab, u64 new_size, struct pcs_flow_table_global * gtab) {}
+static int pcs_flow_analysis(struct pcs_flow_table_global * gtab) { return 0; }
+static int pcs_flow_cs_analysis(struct pcs_cs * cs) {return 0;}
+static void pcs_flow_bind_cs(struct pcs_flow_node * fl, struct pcs_cs * cs) {}
+static void pcs_flow_cs_unbind_all(struct pcs_cs * cs) {}
+
+static void pcs_flow_put(struct pcs_flow_node * fl, struct pcs_flow_table_global * gtab) {}
+static struct pcs_flow_node * pcs_flow_get(struct pcs_flow_node * fl) {return NULL;}
+static int pcs_flow_sequential(struct pcs_flow_node * fl) {return 0;}
+
+
+#endif /* _PCS_FLOW_DETECT_STUB_H_ */
diff --git a/fs/fuse/kio/pcs/pcs_fuse_kdirect.c b/fs/fuse/kio/pcs/pcs_fuse_kdirect.c
new file mode 100644
index 000000000000..f7226021d469
--- /dev/null
+++ b/fs/fuse/kio/pcs/pcs_fuse_kdirect.c
@@ -0,0 +1,742 @@
+/*
+ * Implement kdirect API for PCS cluster client kernel implementation
+ */
+#include "../../fuse_i.h"
+
+#include <linux/pagemap.h>
+#include <linux/slab.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/module.h>
+#include <linux/compat.h>
+#include <linux/swap.h>
+#include <linux/aio.h>
+#include <linux/falloc.h>
+#include <linux/task_io_accounting_ops.h>
+#include <linux/virtinfo.h>
+#include <linux/file.h>
+#include <linux/workqueue.h>
+#include <linux/delay.h>
+#include <linux/socket.h>
+#include <linux/net.h>
+
+#include "pcs_ioctl.h"
+#include "pcs_cluster.h"
+#include "pcs_rpc.h"
+
+static struct kmem_cache *pcs_fuse_req_cachep;
+static struct kmem_cache *pcs_ireq_cachep;
+static struct workqueue_struct *pcs_wq;
+static struct fuse_kio_ops kio_pcs_ops;
+
+static void process_pcs_init_reply(struct fuse_conn *fc, struct fuse_req *req)
+{
+ struct pcs_fuse_cluster *pfc;
+ struct fuse_ioctl_out *arg = &req->misc.ioctl.out;
+ struct pcs_ioc_init_kdirect *info = req->out.args[1].value;
+
+ if (req->out.h.error || arg->result) {
+ printk("Fail to initialize has_kdirect {%d,%d}\n",
+ req->out.h.error, arg->result);
+ fc->conn_error = 1;
+ goto out;
+ }
+ pfc = kmalloc(sizeof(*pfc), GFP_NOIO);
+ if (!pfc) {
+ fc->conn_error = 1;
+ goto out;
+ }
+
+ if (pcs_cluster_init(pfc, pcs_wq, fc, &info->cluster_id, &info->node_id)) {
+ fc->conn_error = 1;
+ goto out;
+ }
+ /* TODO: Not yet implemented PSBM-80365 */
+ fc->no_fiemap = 1;
+ fc->no_fallocate = 1;
+
+ fc->kio.ctx = pfc;
+ printk("FUSE: kio_pcs: cl: " CLUSTER_ID_FMT ", clientid: " NODE_FMT "\n",
+ CLUSTER_ID_ARGS(info->cluster_id), NODE_ARGS(info->node_id));
+out:
+ kfree(info);
+ /* We are called from process_init_reply before the connection
+ * has been marked initialized. Do it now. */
+ fuse_set_initialized(fc);
+ wake_up_all(&fc->blocked_waitq);
+
+}
+
+int kpcs_conn_init(struct fuse_conn *fc)
+{
+ struct fuse_req *req;
+ struct fuse_ioctl_in *inarg;
+ struct fuse_ioctl_out *outarg;
+ struct pcs_ioc_init_kdirect *info;
+
+ BUG_ON(!fc->conn_init);
+
+ info = kzalloc(sizeof(*info), GFP_NOIO);
+ if (!info)
+ return -ENOMEM;
+
+ req = fuse_request_alloc(fc, 0);
+ if (IS_ERR(req)) {
+ kfree(info);
+ return PTR_ERR(req);
+ }
+
+ __set_bit(FR_BACKGROUND, &req->flags);
+ memset(&req->misc.ioctl, 0, sizeof(req->misc.ioctl));
+ /* filehandle and nodeid are null, but this is OK */
+ inarg = &req->misc.ioctl.in;
+ outarg = &req->misc.ioctl.out;
+ inarg->cmd = PCS_IOC_INIT_KDIRECT;
+
+ req->in.h.opcode = FUSE_IOCTL;
+ req->in.numargs = 1;
+ req->in.args[0].size = sizeof(*inarg);
+ req->in.args[0].value = inarg;
+ req->out.numargs = 2;
+ req->out.args[0].size = sizeof(*outarg);
+ req->out.args[0].value = outarg;
+ req->out.args[1].size = sizeof(*info);
+ req->out.args[1].value = info;
+ req->misc.ioctl.ctx = info;
+ req->end = process_pcs_init_reply;
+
+ fuse_request_send_background(fc, req);
+ return 0;
+}
+
+void kpcs_conn_fini(struct fuse_conn *fc)
+{
+ if (!fc->kio.ctx)
+ return;
+
+ TRACE("%s fc:%p\n", __FUNCTION__, fc);
+ flush_workqueue(pcs_wq);
+ pcs_cluster_fini((struct pcs_fuse_cluster *) fc->kio.ctx);
+}
+
+void kpcs_conn_abort(struct fuse_conn *fc)
+{
+ if (!fc->kio.ctx)
+ return;
+
+ //pcs_cluster_fini((struct pcs_fuse_cluster *) fc->kio.ctx);
+ printk("%s TODO: implement this method\n", __FUNCTION__);
+
+}
+
+static int kpcs_probe(struct fuse_conn *fc, char *name)
+{
+ printk("%s TODO IMPLEMENT check fuse_conn args here!\n", __FUNCTION__);
+ if (!strncmp(name, kio_pcs_ops.name, FUSE_KIO_NAME))
+ return 1;
+
+ return 0;
+}
+
+
+static int fuse_pcs_getfileinfo(struct fuse_conn *fc, struct file *file,
+ struct pcs_mds_fileinfo *info)
+{
+ struct fuse_file *ff = file->private_data;
+ struct fuse_req *req;
+ struct fuse_ioctl_in *inarg;
+ struct fuse_ioctl_out *outarg;
+ struct pcs_ioc_fileinfo ioc_info;
+ int err = 0;
+
+ req = fuse_get_req(fc, 0);
+ if (IS_ERR(req))
+ return PTR_ERR(req);
+
+ memset(&req->misc.ioctl, 0, sizeof(req->misc.ioctl));
+ inarg = &req->misc.ioctl.in;
+ outarg = &req->misc.ioctl.out;
+
+ req->in.h.opcode = FUSE_IOCTL;
+ req->in.h.nodeid = ff->nodeid;
+
+ inarg->cmd = PCS_IOC_GETFILEINFO;
+ inarg->fh = ff->fh;
+ inarg->arg = 0;
+ inarg->flags = 0;
+ req->in.numargs = 1;
+ req->in.args[0].size = sizeof(*inarg);
+ req->in.args[0].value = inarg;
+
+ memset(&ioc_info, 0, sizeof(ioc_info));
+
+ req->out.numargs = 2;
+ req->out.args[0].size = sizeof(*outarg);
+ req->out.args[0].value = outarg;
+ req->out.args[1].size = sizeof(ioc_info);
+ req->out.args[1].value = &ioc_info;
+
+ fuse_request_send(fc, req);
+
+ if (req->out.h.error || outarg->result) {
+ printk("%s:%d h.err:%d result:%d\n", __FUNCTION__, __LINE__,
+ req->out.h.error, outarg->result);
+ err = req->out.h.error ? req->out.h.error : outarg->result;
+ fuse_put_request(fc, req);
+ return err;
+ } else
+ *info = ioc_info.fileinfo;
+
+ fuse_put_request(fc, req);
+ return 0;
+}
+
+static int fuse_pcs_kdirect_claim_op(struct fuse_conn *fc, struct file *file,
+ bool claim)
+{
+ struct fuse_file *ff = file->private_data;
+ struct fuse_req *req;
+ struct fuse_ioctl_in *inarg;
+ struct fuse_ioctl_out *outarg;
+ int err = 0;
+
+ req = fuse_get_req(fc, 0);
+ if (IS_ERR(req))
+ return PTR_ERR(req);
+
+ memset(&req->misc.ioctl, 0, sizeof(req->misc.ioctl));
+ inarg = &req->misc.ioctl.in;
+ outarg = &req->misc.ioctl.out;
+
+ req->in.h.opcode = FUSE_IOCTL;
+ req->in.h.nodeid = ff->nodeid;
+
+ if (claim)
+ inarg->cmd = PCS_IOC_KDIRECT_CLAIM;
+ else
+ inarg->cmd = PCS_IOC_KDIRECT_RELEASE;
+
+ inarg->fh = ff->fh;
+ inarg->arg = 0;
+ inarg->flags = 0;
+ req->in.numargs = 1;
+ req->in.args[0].size = sizeof(*inarg);
+ req->in.args[0].value = inarg;
+ req->out.numargs = 1;
+ req->out.args[0].size = sizeof(*outarg);
+ req->out.args[0].value = outarg;
+ fuse_request_send(fc, req);
+ if (req->out.h.error || outarg->result) {
+ printk("%s:%d h.err:%d result:%d\n", __FUNCTION__, __LINE__,
+ req->out.h.error, outarg->result);
+ err = req->out.h.error ? req->out.h.error : outarg->result;
+ }
+
+ fuse_put_request(fc, req);
+ return err;
+}
+
+static int kpcs_do_file_open(struct fuse_conn *fc, struct file *file, struct inode *inode)
+{
+ struct pcs_mds_fileinfo info;
+ struct fuse_inode *fi = get_fuse_inode(inode);
+ struct pcs_fuse_cluster *pfc = (struct pcs_fuse_cluster*)fc->kio.ctx;
+ struct pcs_dentry_info *di = NULL;
+ int ret;
+
+ ret = fuse_pcs_getfileinfo(fc, file, &info);
+ if (ret)
+ return ret;
+
+ if (info.sys.map_type != PCS_MAP_PLAIN) {
+ TRACE("Unsupported map_type:%x, ignore\n", info.sys.map_type);
+ return 0;
+ }
+
+ di = kzalloc(sizeof(*di), GFP_KERNEL);
+ if (!di)
+ return -ENOMEM;
+
+ /* TODO Init fields */
+ /* di.id.parent = id->parent; */
+ /* di.id.name.data = name; */
+ /* di.id.name.len = id->name.len; */
+
+ pcs_mapping_init(&pfc->cc, &di->mapping);
+ pcs_set_fileinfo(di, &info);
+ di->cluster = &pfc->cc;
+ di->inode = fi;
+ TRACE("init id:%llu chunk_size:%d stripe_depth:%d strip_width:%d\n",
+ fi->nodeid, di->fileinfo.sys.chunk_size,
+ di->fileinfo.sys.stripe_depth, di->fileinfo.sys.strip_width);
+
+ mutex_lock(&inode->i_mutex);
+ /* Someone already initialized it under us? */
+ if (fi->private) {
+ mutex_unlock(&inode->i_mutex);
+ pcs_mapping_invalidate(&di->mapping);
+ pcs_mapping_deinit(&di->mapping);
+ kfree(di);
+ return 0;
+ }
+ ret = fuse_pcs_kdirect_claim_op(fc, file, true);
+ if (ret) {
+ mutex_unlock(&inode->i_mutex);
+ pcs_mapping_invalidate(&di->mapping);
+ pcs_mapping_deinit(&di->mapping);
+ kfree(di);
+ return ret;
+ }
+ /* TODO: proper initialization of the dentry should be here!!! */
+ fi->private = di;
+ mutex_unlock(&inode->i_mutex);
+ return 0;
+}
+
+int kpcs_file_open(struct fuse_conn *fc, struct file *file, struct inode *inode)
+{
+ struct fuse_inode *fi = get_fuse_inode(inode);
+
+ if (!S_ISREG(inode->i_mode))
+ return 0;
+ if (fi->nodeid - FUSE_ROOT_ID >= PCS_FUSE_INO_SPECIAL_)
+ return 0;
+ /* Already initialized */
+ if (fi->private) {
+ /* TODO: proper refcounting for claim_cnt should be here */
+ return 0;
+ }
+ return kpcs_do_file_open(fc, file, inode);
+}
+
+void kpcs_inode_release(struct fuse_inode *fi)
+{
+ struct pcs_dentry_info *di = fi->private;
+
+ if(!di)
+ return;
+
+ pcs_mapping_invalidate(&di->mapping);
+ pcs_mapping_deinit(&di->mapping);
+ /* TODO: properly destroy dentry info here!! */
+ kfree(di);
+}
+
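+/*
+ * Common completion path for kdirect FUSE_IOCTL requests: the fuse ->end
+ * callback only records a coarse error in the attached pcs_fuse_work and
+ * defers the real processing to pcs_wq, so RPC/map handling never runs in
+ * the fuse request-completion context.
+ */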
+static void pcs_fuse_reply_handle(struct fuse_conn *fc, struct fuse_req *req)
+{
+ struct pcs_fuse_work *work = (struct pcs_fuse_work*) req->misc.ioctl.ctx;
+ int err;
+
+ err = req->out.h.error ? req->out.h.error : req->misc.ioctl.out.result;
+ if (err) {
+ /* TODO: fine-grained error conversion here */
+ pcs_set_local_error(&work->status, PCS_ERR_PROTOCOL);
+ }
+ queue_work(pcs_wq, &work->work);
+}
+
+#define MAX_CS_CNT 32
+static void fuse_complete_map_work(struct work_struct *w)
+{
+ struct pcs_fuse_work *work = container_of(w, struct pcs_fuse_work, work);
+ struct pcs_map_entry *m = (struct pcs_map_entry *)work->ctx;
+ struct pcs_ioc_getmap *omap = (struct pcs_ioc_getmap *)work->ctx2;
+
+ BUG_ON(!m);
+ BUG_ON(!omap);
+ pcs_copy_error_cond(&omap->error, &work->status);
+ if (omap->cs_cnt > MAX_CS_CNT) {
+ printk("Corrupted cs_cnt from userspace");
+ pcs_set_local_error(&omap->error, PCS_ERR_PROTOCOL);
+ }
+
+ pcs_map_complete(m, omap);
+ kfree(omap);
+ kfree(work);
+}
+
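+/*
+ * Resolve a chunk map through userspace: encode the map request into a
+ * pcs_ioc_getmap with room for up to MAX_CS_CNT CS entries and send it as a
+ * background FUSE_IOCTL (PCS_IOC_GETMAP); the reply is handled by
+ * fuse_complete_map_work() via pcs_fuse_reply_handle().
+ */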
+int fuse_map_resolve(struct pcs_map_entry *m, int direction)
+{
+ struct pcs_dentry_info *di = pcs_dentry_from_mapping(m->mapping);
+ struct fuse_conn *fc = pcs_cluster_from_cc(di->cluster)->fc;
+ struct fuse_req *req;
+ struct fuse_ioctl_in *inarg;
+ struct fuse_ioctl_out *outarg;
+ struct pcs_ioc_getmap *map_ioc;
+ struct pcs_fuse_work *reply_work;
+ size_t map_sz;
+
+ DTRACE("enter m: " MAP_FMT ", dir:%d \n", MAP_ARGS(m), direction);
+
+ BUG_ON(!(m->state & PCS_MAP_RESOLVING));
+
+ map_sz = sizeof(*map_ioc) + MAX_CS_CNT * sizeof(struct pcs_cs_info);
+ map_ioc = kzalloc(map_sz, GFP_NOIO);
+ if (!map_ioc)
+ return -ENOMEM;
+
+ reply_work = kzalloc(sizeof(*reply_work), GFP_NOIO);
+ if (!reply_work) {
+ kfree(map_ioc);
+ return -ENOMEM;
+ }
+ req = fuse_get_req_for_background(fc, 0);
+ if (IS_ERR(req)) {
+ kfree(map_ioc);
+ kfree(reply_work);
+ return PTR_ERR(req);
+ }
+
+
+ memset(&req->misc.ioctl, 0, sizeof(req->misc.ioctl));
+ inarg = &req->misc.ioctl.in;
+ outarg = &req->misc.ioctl.out;
+ inarg->cmd = PCS_IOC_GETMAP;
+ map_ioc->cs_max = MAX_CS_CNT;
+
+ /* fill ioc_map struct */
+ if (pcs_map_encode_req(m, map_ioc, direction) != 0) {
+ kfree(map_ioc);
+ kfree(reply_work);
+ fuse_put_request(fc, req);
+ return 0;
+ }
+
+ /* Fill core ioctl */
+ req->in.h.opcode = FUSE_IOCTL;
+ /* FH is null, peer will lookup by nodeid */
+ inarg->fh = 0;
+ req->in.h.nodeid = di->inode->nodeid;
+ req->in.numargs = 2;
+ req->in.args[0].size = sizeof(*inarg);
+ req->in.args[0].value = inarg;
+ req->in.args[1].size = map_sz;
+ req->in.args[1].value = map_ioc;
+
+ req->out.numargs = 2;
+ /* TODO: make this ioctl variable-sized */
+ req->out.argvar = 1;
+ req->out.args[0].size = sizeof(*outarg);
+ req->out.args[0].value = outarg;
+ req->out.args[1].size = map_sz;
+ req->out.args[1].value = map_ioc;
+
+ INIT_WORK(&reply_work->work, fuse_complete_map_work);
+ reply_work->ctx = m;
+ reply_work->ctx2 = map_ioc;
+ req->misc.ioctl.ctx = reply_work;
+ req->end = pcs_fuse_reply_handle;
+
+ fuse_request_send_background(fc, req);
+
+ return 0;
+}
+
+static void process_pcs_csconn_work(struct work_struct *w)
+{
+ struct pcs_fuse_work *work = container_of(w, struct pcs_fuse_work, work);
+ struct pcs_rpc *ep = (struct pcs_rpc *)work->ctx;
+ struct socket *sock = (struct socket *)work->ctx2;
+ BUG_ON(!ep);
+
+ if (pcs_if_error(&work->status)) {
+ mutex_lock(&ep->mutex);
+ pcs_rpc_reset(ep);
+ mutex_unlock(&ep->mutex);
+ TRACE(PEER_FMT" fail with %d\n", PEER_ARGS(ep), work->status.value);
+ } else {
+ if (sock)
+ rpc_connect_done(ep, sock);
+ }
+ pcs_rpc_put(ep);
+ kfree(work);
+}
+
+static void process_pcs_csconn_reply(struct fuse_conn *fc, struct fuse_req *req)
+{
+ struct pcs_ioc_csconn *csconn = (struct pcs_ioc_csconn *)req->in.args[1].value;
+ struct fuse_ioctl_out *arg = &req->misc.ioctl.out;
+ struct pcs_fuse_work *work = (struct pcs_fuse_work*) req->misc.ioctl.ctx;
+ int is_open = csconn->flags & PCS_IOC_CS_OPEN;
+
+ if (req->out.h.error || arg->result < 0) {
+ pcs_set_local_error(&work->status, PCS_ERR_PROTOCOL);
+ goto out;
+ }
+ /* Grab socket from caller's context (fuse-evloop) and do the rest in kwork */
+ if (is_open) {
+ struct socket *sock;
+ struct file* filp;
+ int err;
+
+ filp = fget((unsigned int)arg->result);
+ arg->result = 0;
+ if (!filp) {
+ pcs_set_local_error(&work->status, PCS_ERR_PROTOCOL);
+ goto out;
+ }
+ sock = sock_from_file(filp, &err);
+ if (!sock) {
+ fput(filp);
+ pcs_set_local_error(&work->status, PCS_ERR_PROTOCOL);
+ } else
+ TRACE("id: "NODE_FMT" sock:%p\n", NODE_ARGS(csconn->id), sock);
+ work->ctx2 = sock;
+ }
+out:
+ kfree(csconn);
+ pcs_fuse_reply_handle(fc, req);
+
+}
+
+int fuse_pcs_csconn_send(struct fuse_conn *fc, struct pcs_rpc *ep, int flags)
+{
+ struct fuse_req *req;
+ struct fuse_ioctl_in *inarg;
+ struct fuse_ioctl_out *outarg;
+ struct pcs_ioc_csconn *csconn;
+ struct pcs_fuse_work *reply_work;
+
+ /* The socket must be freed from kernel space before requesting a new one */
+ BUG_ON(!(flags & PCS_IOC_CS_REOPEN));
+
+ TRACE("start %s cmd:%ld id:%lld flags:%x\n", __FUNCTION__,
+ PCS_IOC_CSCONN, ep->peer_id.val, flags);
+
+ csconn = kzalloc(sizeof(*csconn), GFP_NOIO);
+ if (!csconn)
+ return -ENOMEM;
+
+ reply_work = kzalloc(sizeof(*reply_work), GFP_NOIO);
+ if (!reply_work) {
+ kfree(csconn);
+ return -ENOMEM;
+ }
+
+ req = fuse_get_req_for_background(fc, 0);
+ if (IS_ERR(req)) {
+ kfree(csconn);
+ kfree(reply_work);
+ return PTR_ERR(req);
+ }
+
+ memset(&req->misc.ioctl, 0, sizeof(req->misc.ioctl));
+ inarg = &req->misc.ioctl.in;
+ outarg = &req->misc.ioctl.out;
+
+ inarg->cmd = PCS_IOC_CSCONN;
+ inarg->fh = 0;
+ inarg->arg = 0;
+ inarg->flags = 0;
+
+ csconn->id.val = ep->peer_id.val;
+ memcpy(&csconn->address, &ep->addr, sizeof(ep->addr));
+ csconn->flags = flags;
+
+ req->in.h.opcode = FUSE_IOCTL;
+ req->in.numargs = 2;
+ req->in.args[0].size = sizeof(*inarg);
+ req->in.args[0].value = inarg;
+ req->in.args[1].size = sizeof(*csconn);
+ req->in.args[1].value = csconn;
+
+ req->out.numargs = 1;
+ req->out.args[0].size = sizeof(*outarg);
+ req->out.args[0].value = outarg;
+
+ INIT_WORK(&reply_work->work, process_pcs_csconn_work);
+ reply_work->ctx = pcs_rpc_get(ep);
+ reply_work->ctx2 = NULL; /* the returned socket will be stored here */
+ req->misc.ioctl.ctx = reply_work;
+
+ req->end = process_pcs_csconn_reply;
+ fuse_request_send_background(fc, req);
+
+ return 0;
+}
+
+struct fuse_req *kpcs_req_alloc(struct fuse_conn *fc,
+ unsigned npages, gfp_t flags)
+{
+ return fuse_generic_request_alloc(fc, pcs_fuse_req_cachep,
+ npages, flags);
+}
+
+/* IOHOOKS */
+
+struct pcs_int_request * __ireq_alloc(void)
+{
+ return kmem_cache_alloc(pcs_ireq_cachep, GFP_NOIO);
+}
+void ireq_destroy(struct pcs_int_request *ireq)
+{
+ kmem_cache_free(pcs_ireq_cachep, ireq);
+}
+
+static void pcs_fuse_submit(struct pcs_fuse_cluster *pfc, struct fuse_req *req, int async)
+{
+ struct pcs_fuse_req *r = pcs_req_from_fuse(req);
+ struct fuse_inode *fi = get_fuse_inode(req->io_inode);
+ struct pcs_dentry_info *di = pcs_inode_from_fuse(fi);
+ struct pcs_int_request* ireq;
+
+ BUG_ON(!di);
+ BUG_ON(req->cache != pcs_fuse_req_cachep);
+
+ /* Init pcs_fuse_req */
+ memset(&r->exec.io, 0, sizeof(r->exec.io));
+ memset(&r->exec.ctl, 0, sizeof(r->exec.ctl));
+ /* Use inline request structure */
+ ireq = &r->exec.ireq;
+ ireq_init(di, ireq);
+
+ switch (r->req.in.h.opcode) {
+ case FUSE_WRITE: {
+ struct fuse_write_in *in = &r->req.misc.write.in;
+ struct fuse_write_out *out = &r->req.misc.write.out;
+ out->size = in->size;
+ break;
+ }
+ case FUSE_READ: {
+ struct fuse_read_in *in = &r->req.misc.read.in;
+ size_t size = in->size;
+
+ if (in->offset + in->size > di->fileinfo.attr.size) {
+ if (in->offset >= di->fileinfo.attr.size) {
+ req->out.args[0].size = 0;
+ break;
+ }
+ size = di->fileinfo.attr.size - in->offset;
+ }
+ pcs_fuse_prep_io(r, PCS_REQ_T_READ, in->offset, size);
+ goto submit;
+ }
+ case FUSE_FSYNC:
+ /*NOOP */
+ break;
+ }
+ r->req.out.h.error = 0;
+ DTRACE("do fuse_request_end req:%p op:%d err:%d\n", &r->req, r->req.in.h.opcode, r->req.out.h.error);
+
+ request_end(pfc->fc, &r->req);
+ return;
+submit:
+ if (async)
+ pcs_cc_submit(ireq->cc, ireq);
+ else
+ ireq_process(ireq);
+}
+
+
+int kpcs_req_send(struct fuse_conn* fc, struct fuse_req *req, bool bg, bool lk)
+{
+ struct pcs_fuse_cluster *pfc = (struct pcs_fuse_cluster*)fc->kio.ctx;
+ struct fuse_inode *fi = get_fuse_inode(req->io_inode);
+
+ if (!fc->initialized || fc->conn_error)
+ return 1;
+
+ BUG_ON(!pfc);
+ /* HYPOTHESIS #1
+ * AFAIU, at this point the request cannot belong to any list,
+ * so grabbing fc->lock here can be avoided entirely.
+ */
+ BUG_ON(!list_empty(&req->list));
+
+ TRACE(" Enter req:%p op:%d bg:%d lk:%d\n", req, req->in.h.opcode, bg, lk);
+
+ /* TODO: this is just a crutch; connection cleanup requires sane locking */
+ if (req->in.h.opcode == FUSE_DESTROY) {
+ kpcs_conn_fini(fc);
+ spin_lock(&fc->lock);
+ fc->kio.ctx = NULL;
+ spin_unlock(&fc->lock);
+ return 1;
+ }
+ if ((req->in.h.opcode != FUSE_READ &&
+ req->in.h.opcode != FUSE_WRITE))
+ return 1;
+
+ fi = get_fuse_inode(req->io_inode);
+ if (!fi->private)
+ return 1;
+
+ /* TODO, fetch only read requests for now */
+ if (req->in.h.opcode != FUSE_READ)
+ return 1;
+
+ __clear_bit(FR_BACKGROUND, &req->flags);
+ __clear_bit(FR_PENDING, &req->flags);
+ /* request_end below will do fuse_put_request() */
+ if (!bg)
+ atomic_inc(&req->count);
+ pcs_fuse_submit(pfc, req, lk);
+ if (!bg)
+ wait_event(req->waitq, test_bit(FR_FINISHED, &req->flags));
+
+ return 0;
+}
+
+
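+/*
+ * kio engine operations table, registered with fuse_register_kio();
+ * ->probe() matches the engine by name ("pcs"), the remaining hooks route
+ * connection, file and request handling into this module.
+ */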
+static struct fuse_kio_ops kio_pcs_ops = {
+ .name = "pcs",
+ .owner = THIS_MODULE,
+ .probe = kpcs_probe, /*TODO: check sb->dev name */
+
+ .conn_init = kpcs_conn_init,
+ .conn_fini = kpcs_conn_fini,
+ .conn_abort = kpcs_conn_abort,
+ .req_alloc = kpcs_req_alloc,
+ .req_send = kpcs_req_send,
+ .file_open = kpcs_file_open,
+ .inode_release = kpcs_inode_release,
+};
+
+
+static int __init kpcs_mod_init(void)
+{
+ int err = -ENOMEM;
+ pcs_fuse_req_cachep = kmem_cache_create("pcs_fuse_request",
+ sizeof(struct pcs_fuse_req),
+ 0, 0, NULL);
+
+ if (!pcs_fuse_req_cachep)
+ return err;
+
+ pcs_ireq_cachep = kmem_cache_create("pcs_ireq",
+ sizeof(struct pcs_int_request),
+ 0, SLAB_MEM_SPREAD, NULL);
+ if (!pcs_ireq_cachep)
+ goto free_fuse_cache;
+ pcs_wq = alloc_workqueue("pcs_cluster", WQ_MEM_RECLAIM, 0);
+ if (!pcs_wq)
+ goto free_ireq_cache;
+
+ if(fuse_register_kio(&kio_pcs_ops))
+ goto free_wq;
+ printk("%s fuse_c:%p ireq_c:%p pcs_wq:%p\n", __FUNCTION__,
+ pcs_fuse_req_cachep, pcs_ireq_cachep, pcs_wq);
+
+ return 0;
+free_wq:
+ destroy_workqueue(pcs_wq);
+free_ireq_cache:
+ kmem_cache_destroy(pcs_ireq_cachep);
+free_fuse_cache:
+ kmem_cache_destroy(pcs_fuse_req_cachep);
+ return err;
+}
+
+static void __exit kpcs_mod_exit(void)
+{
+ fuse_unregister_kio(&kio_pcs_ops);
+ destroy_workqueue(pcs_wq);
+ kmem_cache_destroy(pcs_ireq_cachep);
+ kmem_cache_destroy(pcs_fuse_req_cachep);
+}
+
+module_init(kpcs_mod_init);
+module_exit(kpcs_mod_exit);
+
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Virtuozzo <devel at openvz.org>");
diff --git a/fs/fuse/kio/pcs/pcs_ioctl.h b/fs/fuse/kio/pcs/pcs_ioctl.h
new file mode 100644
index 000000000000..6451baabb492
--- /dev/null
+++ b/fs/fuse/kio/pcs/pcs_ioctl.h
@@ -0,0 +1,85 @@
+#ifndef _PCS_IOCTL_H_
+#define _PCS_IOCTL_H_ 1
+
+#include <linux/ioctl.h>
+
+
+#include "pcs_prot_types.h"
+#include "pcs_mds_prot.h"
+#include "pcs_error.h"
+#include "pcs_map.h"
+#include "pcs_rpc.h"
+
+#define PCS_FUSE_INO_SPECIAL_ ((unsigned long long)-0x1000)
+
+struct pcs_client_lease_info
+{
+ u32 type;
+ u32 pad;
+ struct pcs_pc_lease_info info;
+};
+
+struct pcs_getleases_resp {
+ u32 nleases;
+ u32 nleases_total;
+ struct pcs_client_lease_info leases[0];
+};
+
+union pcs_getleases_ioc
+{
+ char path[4096];
+ struct pcs_getleases_resp resp;
+};
+
+struct pcs_ioc_init_kdirect
+{
+ PCS_NODE_ID_T node_id;
+ PCS_CLUSTER_ID_T cluster_id;
+};
+
+struct pcs_ioc_fileinfo
+{
+ struct pcs_mds_fileinfo fileinfo;
+};
+
+struct pcs_ioc_getmap
+{
+ PCS_CHUNK_UID_T uid; /* chunk unique id on out */
+ PCS_MAP_VERSION_T version; /* in (on retry) / out */
+ u64 chunk_start; /* in / out */
+ u64 chunk_end; /* out */
+ u32 state; /* in/out: PCS_IOC_MAP_S_XXX */
+#define PCS_IOC_MAP_S_READ 0x1
+#define PCS_IOC_MAP_S_WRITE 0x2
+#define PCS_IOC_MAP_S_NEW 0x4
+#define PCS_IOC_MAP_S_ERROR 0x8
+ pcs_error_t error; /* in/out */
+ u16 mds_flags; /* in/out */
+ u32 psize_ret; /* length of chunk on CS (out) */
+ u32 chunk_psize; /* physical size of chunk on CS on in */
+ u32 read_tout; /* read timeout (msec) on out */
+ u32 write_tout; /* write timeout (msec) on out */
+ /* TODO: cs array is only for OUT ? */
+ u32 cs_cnt; /* The number of CS (including root) entries that follows */
+ u32 cs_max; /* Max number of CS (including root) entries requested */
+ struct pcs_cs_info cs[0]; /* Array of CS including root */
+};
+
+struct pcs_ioc_csconn
+{
+ PCS_NODE_ID_T id;
+ PCS_NET_ADDR_T address;
+ u32 flags;
+#define PCS_IOC_CS_OPEN 0x1
+#define PCS_IOC_CS_CLOSE 0x2
+#define PCS_IOC_CS_REOPEN (PCS_IOC_CS_OPEN|PCS_IOC_CS_CLOSE)
+};
+
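+/*
+ * In this engine the codes below are carried as inarg->cmd of FUSE_IOCTL
+ * requests built by the kernel client and handled by the userspace PCS
+ * daemon; see kpcs_conn_init(), fuse_map_resolve() and
+ * fuse_pcs_csconn_send().
+ */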
+#define PCS_IOC_INIT_KDIRECT _IOR('V',32, struct pcs_ioc_init_kdirect)
+#define PCS_IOC_CSCONN _IOR('V',33, struct pcs_ioc_csconn)
+#define PCS_IOC_GETFILEINFO _IOR('V',34, struct pcs_ioc_fileinfo)
+#define PCS_IOC_KDIRECT_CLAIM _IO('V',35)
+#define PCS_IOC_KDIRECT_RELEASE _IO('V',36)
+#define PCS_IOC_GETMAP _IOWR('V',37, struct pcs_ioc_getmap)
+
+#endif /* _PCS_IOCTL_H_ */
diff --git a/fs/fuse/kio/pcs/pcs_map.c b/fs/fuse/kio/pcs/pcs_map.c
new file mode 100644
index 000000000000..32cfd073befd
--- /dev/null
+++ b/fs/fuse/kio/pcs/pcs_map.c
@@ -0,0 +1,2999 @@
+#include <net/sock.h>
+#include <linux/net.h>
+#include <linux/kthread.h>
+#include <linux/types.h>
+#include <linux/rbtree.h>
+#include <linux/highmem.h>
+#include <linux/log2.h>
+
+#include "pcs_types.h"
+#include "pcs_sock_io.h"
+#include "pcs_rpc.h"
+#include "pcs_sock_io.h"
+#include "pcs_req.h"
+#include "pcs_map.h"
+#include "pcs_cs.h"
+#include "pcs_ioctl.h"
+#include "log.h"
+
+/* Lock order:
+ *
+ *  map->lock
+ *    -> mapping->map_lock	(motivated by truncate)
+ *
+ *  map->lock
+ *    -> cs->lock		(pcs_map_set_cslist)
+ */
+#define MAP_BATCH 16
+
+static void pcs_ireq_queue_fail(struct list_head *queue, int error);
+
+abs_time_t get_real_time_ms(void)
+{
+ struct timespec tv = current_kernel_time();
+ return (abs_time_t)tv.tv_sec * 1000 + tv.tv_nsec / 1000000;
+}
+
+
+static inline unsigned int pcs_sync_timeout(struct pcs_cluster_core *cc)
+{
+ /* This is ~8 second distribution around PCS_SYNC_TIMEOUT */
+ //// TODO: timeout randomization temporarily disabled
+ ////return PCS_SYNC_TIMEOUT - 0x1000 + (pcs_random(&cc->rng) & 0x1FFF);
+ return PCS_SYNC_TIMEOUT;
+}
+
+static void cslist_destroy(struct pcs_cs_list * csl)
+{
+ int i;
+
+ TRACE("csl:%p csl->map:%p refcnt:%d\n", csl, csl->map, atomic_read(&csl->refcnt));
+ BUG_ON(csl->map);
+
+ for (i = 0; i < csl->nsrv; i++) {
+ struct pcs_cs_link * cslink = &csl->cs[i].cslink;
+
+ /* Possible after error inside cslist_alloc() */
+ if (!cslink->cs)
+ continue;
+
+ spin_lock(&cslink->cs->lock);
+ if (!list_empty(&cslink->link)) {
+ list_del_init(&cslink->link);
+ cslink->cs->nmaps--;
+ }
+ spin_unlock(&cslink->cs->lock);
+ }
+ kfree(csl);
+}
+
+static inline void cslist_get(struct pcs_cs_list * csl)
+{
+ TRACE("csl:%p csl->map:%p refcnt:%d\n", csl, csl->map, atomic_read(&csl->refcnt));
+
+ atomic_inc(&csl->refcnt);
+}
+static inline void cslist_put(struct pcs_cs_list * csl)
+{
+ TRACE("csl:%p csl->map:%p refcnt:%d\n", csl, csl->map, atomic_read(&csl->refcnt));
+ if (atomic_dec_and_test(&csl->refcnt))
+ cslist_destroy(csl);
+}
+
+static void map_drop_cslist(struct pcs_map_entry * m)
+{
+ assert_spin_locked(&m->lock);
+
+ if (m->cs_list == NULL)
+ return;
+
+ m->cs_list->map = NULL;
+ /* Barrier here is only for sanity checks in cslist_destroy() */
+ smp_mb__before_atomic_dec();
+ cslist_put(m->cs_list);
+ m->cs_list = NULL;
+}
+
+static void pcs_map_callback(struct rcu_head *head)
+{
+ struct pcs_map_entry *m = container_of(head, struct pcs_map_entry, rcu);
+
+ BUG_ON(atomic_read(&m->__refcnt));
+ BUG_ON(!list_empty(&m->queue));
+ BUG_ON(!(m->state & PCS_MAP_DEAD));
+ BUG_ON(m->cs_list);
+
+ kfree(m);
+}
+
+static void __pcs_map_free(struct pcs_map_entry *m)
+{
+ call_rcu(&m->rcu, pcs_map_callback);
+}
+
+void __pcs_map_put(struct pcs_map_entry *m)
+__releases(m->lock)
+{
+ TRACE(" %p id:%lld state:%x ref:%d\n",m, m->id, m->state, atomic_read(&m->__refcnt));
+
+ assert_spin_locked(&m->lock);
+ if (m->state & PCS_MAP_DEAD) {
+ spin_unlock(&m->lock);
+ __pcs_map_free(m);
+ return;
+ }
+ map_add_lru(m);
+ spin_unlock(&m->lock);
+}
+
+static struct pcs_map_entry * __pcs_map_get(struct pcs_map_entry *m)
+{
+ //TRACE( MAP_FMT " ref:%d, maps-count:%d \n", MAP_ARGS(m), m->__refcnt);
+ BUG_ON(atomic_inc_return(&m->__refcnt) <= 1);
+
+ return m;
+}
+
+static void pcs_map_reset(struct pcs_map_entry * m)
+{
+ m->state &= ~(PCS_MAP_READABLE|PCS_MAP_WRITEABLE);
+}
+static void pcs_ireq_queue_fail(struct list_head *queue, int error);
+static void map_sync_work_add(struct pcs_map_entry *m, unsigned long timeout);
+static void map_sync_work_del(struct pcs_map_entry *m);
+
+/* Truncate map from mapping */
+static void pcs_map_truncate(struct pcs_map_entry *m, struct list_head *queue)
+{
+
+ void *ret;
+
+ TRACE( MAP_FMT " ref:%d\n", MAP_ARGS(m), atomic_read(&m->__refcnt));
+
+ assert_spin_locked(&m->lock);
+ BUG_ON(m->state & PCS_MAP_DEAD);
+ BUG_ON(!m->mapping);
+ BUG_ON(!list_empty(&m->queue) && !queue);
+
+ spin_lock(&m->mapping->map_lock);
+ ret = radix_tree_delete(&m->mapping->map_tree, m->index);
+ BUG_ON(!ret || ret != m);
+ m->mapping->nrmaps--;
+ spin_unlock(&m->mapping->map_lock);
+
+ list_splice_tail_init(&m->queue, queue);
+ m->mapping = NULL;
+ map_sync_work_del(m);
+ pcs_map_reset(m);
+ m->state |= PCS_MAP_DEAD;
+ map_drop_cslist(m);
+}
+
+void pcs_mapping_init(struct pcs_cluster_core *cc, struct pcs_mapping * mapping)
+{
+ mapping->cluster = cc;
+ INIT_RADIX_TREE(&mapping->map_tree, GFP_ATOMIC);
+ spin_lock_init(&mapping->map_lock);
+ pcs_flow_table_init(&mapping->ftab, &cc->maps.ftab);
+}
+
+/* Must be called once right after lease is acquired. At that point we already
+ * have all the file attributes.
+ */
+void pcs_mapping_open(struct pcs_mapping * mapping)
+{
+ struct pcs_dentry_info *di = pcs_dentry_from_mapping(mapping);
+
+ switch (di->fileinfo.sys.map_type) {
+ default:
+ BUG();
+ case PCS_MAP_PLAIN:
+ return;
+ }
+}
+
+void pcs_mapping_dump(struct pcs_mapping * mapping)
+{
+ struct pcs_dentry_info *di = pcs_dentry_from_mapping(mapping);
+ unsigned long pos = 0;
+ struct pcs_map_entry *maps[MAP_BATCH];
+ int nr_maps, total = 0;
+
+ if (!mapping->nrmaps)
+ return;
+
+ DTRACE(DENTRY_FMT "\n", DENTRY_ARGS(di));
+
+ do {
+ int i;
+ rcu_read_lock();
+ nr_maps = radix_tree_gang_lookup(&mapping->map_tree,
+ (void **)maps, pos, MAP_BATCH);
+
+ for (i = 0; i < nr_maps; i++, total++) {
+ pos = maps[i]->index;
+ DTRACE("[%d] " MAP_FMT ", id:" CUID_FMT ", v:" VER_FMT " ref:%d\n", total, MAP_ARGS(maps[i]),
+ CUID_ARGS(maps[i]->id), VER_ARGS(maps[i]->version),
+ atomic_read(&maps[i]->__refcnt));
+ }
+ pos++;
+ rcu_read_unlock();
+ } while (nr_maps);
+}
+
+void map_truncate_tail(struct pcs_mapping * mapping, u64 offset)
+{
+
+ unsigned long pos = offset >> mapping->chunk_size_bits;
+ struct pcs_map_entry *maps[MAP_BATCH];
+ int nr_maps;
+ LIST_HEAD(dispose);
+
+ TRACE("%s " DENTRY_FMT "\n", __FUNCTION__, DENTRY_ARGS(pcs_dentry_from_mapping(mapping)));
+ do {
+ int i;
+
+ rcu_read_lock();
+ nr_maps = radix_tree_gang_lookup(&mapping->map_tree,
+ (void **)maps, pos, MAP_BATCH);
+
+ for (i = 0; i < nr_maps; i++) {
+ struct pcs_map_entry *m = maps[i];
+
+ spin_lock(&m->lock);
+ if (!pcs_map_get_locked(m)) {
+ spin_unlock(&m->lock);
+ continue;
+ }
+ pcs_map_truncate(m, &dispose);
+ map_del_lru(m);
+ spin_unlock(&m->lock);
+ pcs_map_put(m);
+ }
+ pos++;
+ rcu_read_unlock();
+ } while (nr_maps);
+
+ pcs_ireq_queue_fail(&dispose, PCS_ERR_NET_ABORT);
+}
+
+void pcs_mapping_invalidate(struct pcs_mapping * mapping)
+{
+ pcs_mapping_dump(mapping);
+ map_truncate_tail(mapping, 0);
+ /* If some CSes are still not shutdown, we can have some map entries referenced in their queues */
+ pcs_flow_table_fini(&mapping->ftab, &pcs_dentry_from_mapping(mapping)->cluster->maps.ftab);
+}
+
+void pcs_mapping_deinit(struct pcs_mapping * mapping)
+{
+
+ BUG_ON(mapping->nrmaps);
+}
+
+static inline int map_reclaimable(struct pcs_map_entry * m)
+{
+ return list_empty(&m->queue)
+ && !(m->state & (PCS_MAP_ERROR|PCS_MAP_RESOLVING));
+}
+
+static enum lru_status map_isolate(struct list_head *item,
+ struct list_lru_one *lru, spinlock_t *lru_lock, void *arg)
+{
+ struct list_head *dispose = arg;
+ struct pcs_map_entry *m = list_entry(item, struct pcs_map_entry, lru_link);
+
+ if (!spin_trylock(&m->lock))
+ return LRU_SKIP;
+
+ if (!map_reclaimable(m)) {
+ spin_unlock(&m->lock);
+ return LRU_SKIP;
+ }
+
+ pcs_map_truncate(m, NULL);
+ list_lru_isolate_move(lru, item, dispose);
+ spin_unlock(&m->lock);
+
+ return LRU_REMOVED;
+}
+
+static enum lru_status map_dirty_walk(struct list_head *item,
+ struct list_lru_one *lru, spinlock_t *lru_lock, void *arg)
+{
+ struct pcs_map_entry *m = list_entry(item, struct pcs_map_entry, lru_link);
+
+
+ if (!spin_trylock(&m->lock))
+ return LRU_SKIP;
+
+ BUG_ON(!(m->flags & PCS_MAP_DIRTY));
+ /* Flushes are not limited by ireq_delay(), so we have
+ * to suppress too frequent flushes when the MDS fails to update the map
+ * for any reason.
+ */
+ if (!(m->flags & (PCS_MAP_FLUSHING|PCS_MAP_DIRTY_GC)) &&
+ timer_pending(&m->sync_work.timer) &&
+ (jiffies >= m->error_tstamp + PCS_ERROR_DELAY)) {
+ m->flags |= PCS_MAP_DIRTY_GC;
+ map_sync_work_add(m, 0);
+ }
+ spin_unlock(&m->lock);
+ return LRU_SKIP;
+}
+
+unsigned long pcs_map_shrink_scan(struct shrinker *shrink,
+ struct shrink_control *sc)
+{
+ LIST_HEAD(dispose);
+ unsigned long freed = 0;
+ unsigned long nr_to_scan = sc->nr_to_scan;
+ struct pcs_map_set *maps = container_of(shrink,
+ struct pcs_map_set, shrinker);
+
+ /* This shrinker performs only atomic operations,
+ * so any GFP mask will work.
+ * if (!(sc->gfp_mask & __GFP_FS)) */
+ /* return SHRINK_STOP; */
+
+ freed = list_lru_walk_node(&maps->lru, sc->nid, map_isolate,
+ &dispose, &nr_to_scan);
+
+ if (nr_to_scan)
+ list_lru_walk_node(&maps->dirty_lru, sc->nid,
+ map_dirty_walk, NULL, &nr_to_scan);
+
+ while (!list_empty(&dispose)) {
+ struct pcs_map_entry *m;
+ m = list_first_entry(&dispose, struct pcs_map_entry, lru_link);
+ list_del_init(&m->lru_link);
+ __pcs_map_free(m);
+ }
+
+ if (!list_empty(&maps->dirty_queue)) {
+ INIT_LIST_HEAD(&dispose);
+ spin_lock(&maps->lock);
+ list_splice_tail(&maps->dirty_queue, &dispose);
+ spin_unlock(&maps->lock);
+ pcs_cc_requeue(container_of(maps, struct pcs_cluster_core, maps), &dispose);
+ }
+ TRACE(" lru_freed:%ld \n", freed);
+ return freed;
+}
+
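+/*
+ * Force a minimal shrink pass from the IO path (nr_to_scan = 1, GFP_NOIO,
+ * current NUMA node); used by map_queue_on_limit() to relieve dirty-map
+ * pressure.
+ */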
+unsigned long map_gc(struct pcs_map_set *maps)
+{
+ struct shrink_control sc = {
+ .gfp_mask = GFP_NOIO,
+ .nr_to_scan = 1,
+ .nid = numa_node_id(),
+ };
+
+ return pcs_map_shrink_scan(&maps->shrinker, &sc);
+}
+
+static inline int is_dirtying(struct pcs_map_entry * map, struct pcs_int_request *ireq)
+{
+ if (!ireq->iochunk.direction)
+ return 0;
+
+ /* Was not dirty? */
+ if (!(map->flags & PCS_MAP_DIRTY))
+ return 1;
+
+	/* Already dirty, but a flush is in progress right now. Wait for the flush to end. */
+ if (map->flags & (PCS_MAP_FLUSHING|PCS_MAP_DIRTY_GC))
+ return 1;
+
+ return 0;
+}
+
+static void map_queue_on_limit(struct pcs_int_request *ireq)
+{
+ struct pcs_map_set * maps = &ireq->dentry->cluster->maps;
+
+ TRACE("queueing due to dirty limit\n");
+
+ if (ireq_is_timed_out(ireq)) {
+ pcs_log(LOG_ERR, "timeout while map get on \"" DENTRY_FMT "\" last_err=%u",
+ DENTRY_ARGS(ireq->dentry), ireq->error.value);
+ BUG();
+ }
+
+ if (ireq->type == PCS_IREQ_IOCHUNK && ireq->iochunk.map) {
+ pcs_map_put(ireq->iochunk.map);
+ ireq->iochunk.map = NULL;
+ }
+
+ list_add_tail(&ireq->list, &maps->dirty_queue);
+ map_gc(maps);
+}
+
+/* TODO: this check differs from the original */
+int map_check_limit(struct pcs_map_entry * map, struct pcs_int_request *ireq)
+{
+ struct pcs_map_set * maps = &ireq->dentry->cluster->maps;
+
+ if (map == NULL) {
+ map_queue_on_limit(ireq);
+ return 1;
+ }
+
+ if (list_empty(&maps->dirty_queue))
+ return 0;
+
+	/* The goal is to queue requests which are going to increase pressure on the map limit. */
+
+	/* If the map failed, the request must pass. If it is under resolution, it can pass.
+	 *
+	 * This looks dangerous: error maps can overflow the map table.
+	 * Nevertheless, altogether this combines into another statement: if the map is not
+	 * reclaimable, the request passes. So, it really does not increase the pressure.
+	 */
+
+ if (!map_reclaimable(map))
+ return 0;
+	/*
+	 * When the map is new, the request definitely increases the pressure.
+	 *
+	 * It also does if the request is going to move a clean map to the dirty state.
+	 */
+ if (((map->state & PCS_MAP_NEW) || is_dirtying(map, ireq))) {
+ int nid = page_to_nid(virt_to_page(map));
+
+ if (list_lru_count_node(&maps->dirty_lru, nid) >
+ maps->map_dirty_thresh)
+ map_queue_on_limit(ireq);
+ return 1;
+ }
+ return 0;
+}
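+
+/*
+ * Decision summary for map_check_limit() above (descriptive only):
+ *  - map == NULL: queue the request on the dirty limit and return 1;
+ *  - maps->dirty_queue is empty, or the map is not reclaimable: pass (0);
+ *  - the map is NEW, or the write would dirty a clean map: return 1, and
+ *    additionally queue the request when the per-node dirty_lru count
+ *    exceeds maps->map_dirty_thresh;
+ *  - otherwise: pass (0).
+ */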
+
+static void map_sync_work_add(struct pcs_map_entry *m, unsigned long timeout)
+{
+ struct pcs_cluster_core *cc = cc_from_maps(m->maps);
+
+ assert_spin_locked(&m->lock);
+
+ if (!timer_pending(&m->sync_work.timer))
+ __pcs_map_get(m);
+ mod_delayed_work(cc->wq, &m->sync_work, timeout);
+}
+static void map_sync_work_del(struct pcs_map_entry *m)
+{
+ assert_spin_locked(&m->lock);
+
+ if (!timer_pending(&m->sync_work.timer))
+ return;
+ cancel_delayed_work(&m->sync_work);
+ pcs_map_put_locked(m);
+}
+static void sync_timer_work(struct work_struct *w);
+
+/* Returns map with incremented refcnt */
+struct pcs_map_entry * pcs_find_get_map(struct pcs_dentry_info *di, u64 offset)
+{
+ struct pcs_map_set * maps = &di->mapping.cluster->maps;
+ unsigned long idx = offset >> DENTRY_CHUNK_SIZE_BITS(di);
+ struct pcs_map_entry *m;
+
+again:
+ for (;;) {
+ rcu_read_lock();
+ m = radix_tree_lookup(&di->mapping.map_tree, idx);
+ if (m) {
+ BUG_ON(m->index != idx);
+ m = pcs_map_get(m);
+ rcu_read_unlock();
+ if (!m)
+ continue;
+ else
+ return m;
+ }
+ rcu_read_unlock();
+ /* No direct throttler here */
+ break;
+ }
+ m = kzalloc(sizeof(struct pcs_map_entry), GFP_NOIO);
+ if (!m)
+ return NULL;
+
+ if (radix_tree_preload(GFP_NOIO)) {
+ kfree(m);
+ return NULL;
+ }
+
+ m->mapping = NULL;
+ m->maps = NULL;
+ m->res_offset = offset;
+ m->chunk_psize = 0;
+ m->index = idx;
+
+ map_version_init(&m->version);
+ m->id = 0; /* For logging only, it is not used before map is completed */
+ m->state = PCS_MAP_NEW;
+ m->flags = 0;
+ atomic_set(&m->__refcnt, 1);
+ m->mds_flags = 0;
+ m->cs_list = NULL;
+ m->error_tstamp = 0;
+ m->mapping = &di->mapping;
+ INIT_DELAYED_WORK(&m->sync_work, sync_timer_work);
+ INIT_LIST_HEAD(&m->queue);
+ INIT_LIST_HEAD(&m->lru_link);
+ spin_lock_init(&m->lock);
+ atomic_inc(&maps->count);
+ m->maps = maps;
+
+ spin_lock(&di->mapping.map_lock);
+ m->mapping->nrmaps++;
+ if (radix_tree_insert(&di->mapping.map_tree, idx, m)) {
+ m->mapping->nrmaps--;
+ spin_unlock(&di->mapping.map_lock);
+ radix_tree_preload_end();
+ kfree(m);
+ goto again;
+ }
+ spin_unlock(&di->mapping.map_lock);
+ radix_tree_preload_end();
+
+ return m;
+}
+
+/* When CS goes up/down invalidate read_index on all the maps using this CS.
+ * This results in reevaluation of CS used for reads from this chunk at the next read.
+ */
+
+static void map_recalc_maps(struct pcs_cs * cs)
+{
+ struct pcs_cs_link * csl;
+ assert_spin_locked(&cs->lock);
+
+ list_for_each_entry(csl, &cs->map_list, link) {
+ struct pcs_cs_record * cs_rec;
+ struct pcs_cs_list * cs_list;
+ int read_idx;
+
+ cs_rec = container_of(csl, struct pcs_cs_record, cslink);
+ cs_list = container_of(cs_rec - csl->index, struct pcs_cs_list, cs[0]);
+ read_idx = READ_ONCE(cs_list->read_index);
+
+ if (read_idx >= 0 && (!cs_is_blacklisted(cs) ||
+ cs_list->cs[read_idx].cslink.cs == cs))
+ WRITE_ONCE(cs_list->read_index, -1);
+ }
+}
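+
+/*
+ * Note on the pointer arithmetic above: cs_rec points at cs_list->cs[csl->index],
+ * so (cs_rec - csl->index) is &cs_list->cs[0] and container_of() recovers the
+ * owning struct pcs_cs_list without storing a back pointer in every record.
+ */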
+
+void pcs_map_force_reselect(struct pcs_cs * cs)
+{
+ struct pcs_cs_link * csl;
+ assert_spin_locked(&cs->lock);
+
+ list_for_each_entry(csl, &cs->map_list, link) {
+ struct pcs_cs_record * cs_rec;
+ struct pcs_cs_list * cs_list;
+ int read_idx;
+
+ cs_rec = container_of(csl, struct pcs_cs_record, cslink);
+ cs_list = container_of(cs_rec - csl->index, struct pcs_cs_list, cs[0]);
+ read_idx = READ_ONCE(cs_list->read_index);
+
+ if (read_idx >= 0 && cs_list->cs[read_idx].cslink.cs == cs)
+ WRITE_ONCE(cs_list->read_index, -1);
+ }
+}
+
+static int all_blacklisted(struct pcs_cs_list * csl)
+{
+ int i = 0;
+
+ for (i = 0; i < csl->nsrv; i++) {
+ if (test_bit(i, &csl->blacklist)) {
+ if (jiffies < READ_ONCE(csl->blacklist_expires))
+ continue;
+ TRACE("expire replication blacklist");
+ clear_bit(i, &csl->blacklist);
+ }
+ if (!test_bit(CS_SF_BLACKLISTED, &csl->cs[i].cslink.cs->state))
+ break;
+ }
+ return i == csl->nsrv;
+}
+
+static int urgent_whitelist(struct pcs_cs * cs)
+{
+ struct pcs_cs_link * csl;
+ assert_spin_locked(&cs->lock);
+
+ list_for_each_entry(csl, &cs->map_list, link) {
+ struct pcs_cs_record * cs_rec;
+ struct pcs_cs_list * cs_list;
+
+ cs_rec = container_of(csl, struct pcs_cs_record, cslink);
+ cs_list = container_of(cs_rec - csl->index, struct pcs_cs_list, cs[0]);
+
+ if (cs_list->map == NULL)
+ continue;
+
+ if (all_blacklisted(cs_list))
+ return 1;
+ }
+ return 0;
+}
+
+void cs_blacklist(struct pcs_cs * cs, int error, char * reason)
+{
+ assert_spin_locked(&cs->lock);
+
+ if (!cs_is_blacklisted(cs)) {
+ spin_lock(&cs->css->lock);
+ set_bit(CS_SF_BLACKLISTED, &cs->state);
+ cs->blacklist_reason = error;
+ TRACE("Blacklisting CS" NODE_FMT " by %s, err=%d", NODE_ARGS(cs->id), reason, error);
+ if (list_empty(&cs->css->bl_list)) {
+ struct pcs_cluster_core *cc = cc_from_csset(cs->css);
+
+ mod_delayed_work(cc->wq, &cs->css->bl_work, PCS_CS_BLACKLIST_TIMER);
+ }
+ list_add_tail(&cs->bl_link, &cs->css->bl_list);
+ spin_unlock(&cs->css->lock);
+ map_recalc_maps(cs);
+ }
+}
+
+static void cs_blacklist_unlocked(struct pcs_cs * cs, int error, char * reason)
+{
+ spin_lock(&cs->lock);
+ cs_blacklist(cs, error, reason);
+ spin_unlock(&cs->lock);
+}
+
+void cs_whitelist(struct pcs_cs * cs, char * reason)
+{
+ assert_spin_locked(&cs->lock);
+
+ if (cs_is_blacklisted(cs)) {
+ clear_bit(CS_SF_BLACKLISTED, &cs->state);
+ TRACE("Whitelisting CS" NODE_FMT " by %s", NODE_ARGS(cs->id), reason);
+
+ map_recalc_maps(cs);
+
+ spin_lock(&cs->css->lock);
+ list_del_init(&cs->bl_link);
+ if (list_empty(&cs->css->bl_list))
+ cancel_delayed_work(&cs->css->bl_work);
+ spin_unlock(&cs->css->lock);
+ }
+}
+
+static inline void __map_error(struct pcs_map_entry *m , int remote, int error, u64 offender)
+{
+ assert_spin_locked(&m->lock);
+ m->state = PCS_MAP_ERROR;
+ m->iofailure.remote = remote;
+ m->iofailure.value = error;
+ m->iofailure.offender.val = offender;
+}
+
+static inline void map_remote_error_nolock(struct pcs_map_entry *m , int error, u64 offender)
+{
+ __map_error(m, 1 , error, offender);
+}
+static void map_remote_error(struct pcs_map_entry *m , int error, u64 offender)
+{
+ spin_lock(&m->lock);
+ map_remote_error_nolock(m, error, offender);
+ spin_unlock(&m->lock);
+}
+
+void pcs_map_notify_addr_change(struct pcs_cs * cs)
+{
+ struct pcs_cs_link * csl;
+ assert_spin_locked(&cs->lock);
+
+ cs_whitelist(cs, "addr update");
+
+ list_for_each_entry(csl, &cs->map_list, link) {
+ struct pcs_cs_record * cs_rec;
+ struct pcs_cs_list * cs_list;
+ struct pcs_map_entry * m;
+
+ cs_rec = container_of(csl, struct pcs_cs_record, cslink);
+ cs_list = container_of(cs_rec - csl->index, struct pcs_cs_list, cs[0]);
+
+ if (csl->addr_serno == cs->addr_serno)
+ continue;
+
+ if ((m = cs_list->map) == NULL)
+ continue;
+
+ spin_lock(&m->lock);
+ if ((m->state & PCS_MAP_DEAD) || m->cs_list != cs_list)
+ goto unlock;
+
+ if (m->state & (PCS_MAP_ERROR|PCS_MAP_RESOLVING|PCS_MAP_NEW))
+ goto unlock;
+
+ TRACE(MAP_FMT " invalidating due to address change of CS#"NODE_FMT,
+ MAP_ARGS(m), NODE_ARGS(cs->id));
+
+ map_remote_error_nolock(m, PCS_ERR_CSD_STALE_MAP, cs->id.val);
+ unlock:
+ spin_unlock(&m->lock);
+
+ }
+}
+
+noinline static void pcs_ireq_queue_fail(struct list_head *queue, int error)
+{
+ while (!list_empty(queue)) {
+ struct pcs_int_request *ireq = list_first_entry(queue, struct pcs_int_request, list);
+
+ list_del_init(&ireq->list);
+
+ pcs_set_local_error(&ireq->error, error);
+
+ if (ireq->type == PCS_IREQ_TRUNCATE) {
+ ireq_on_error(ireq);
+
+ if (!(ireq->flags & IREQ_F_FATAL)) {
+ if (ireq_is_timed_out(ireq)) {
+ pcs_log(LOG_ERR, "timeout while truncate(%d) request on \"" DENTRY_FMT "\" last err=%u",
+ ireq->type, DENTRY_ARGS(ireq->dentry), ireq->error.value);
+ BUG();
+ }
+ pcs_clear_error(&ireq->error);
+
+ TRACE("requeue truncate(%d) %llu@" DENTRY_FMT "\n", ireq->type,
+ (unsigned long long)ireq->truncreq.offset, DENTRY_ARGS(ireq->dentry));
+
+ ireq_delay(ireq);
+ continue;
+ }
+ }
+ ireq_complete(ireq);
+ }
+}
+
+void transfer_sync_data(struct pcs_cs_list * new_cs_list, struct pcs_cs_list * old_cs_list)
+{
+ int i, k;
+
+ if (new_cs_list->nsrv == 0 || old_cs_list->nsrv == 0)
+ return;
+
+ for (i = 0; i < new_cs_list->nsrv; i++) {
+ for (k = 0; k < old_cs_list->nsrv; k++) {
+ if (old_cs_list->cs[k].info.id.val == new_cs_list->cs[i].info.id.val) {
+ new_cs_list->cs[i].sync = old_cs_list->cs[k].sync;
+ break;
+ }
+ }
+ }
+}
+
+static int cs_is_dirty(struct cs_sync_state * sync)
+{
+ int res;
+
+ if (!sync->dirty_integrity || !sync->dirty_epoch || !sync->dirty_seq)
+ return 0;
+
+ res = pcs_sync_seq_compare(sync->dirty_epoch, sync->sync_epoch);
+ if (!res)
+ res = pcs_sync_seq_compare(sync->dirty_seq, sync->sync_seq);
+
+ return res >= 0;
+}
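+
+/*
+ * Example with illustrative numbers: with dirty_epoch=5/dirty_seq=10 the
+ * record stays dirty while sync_epoch=5 and sync_seq <= 10; once a sync
+ * response reports sync_seq > 10 (or a later sync_epoch), cs_is_dirty()
+ * returns 0 and evaluate_dirty_status() below may clear PCS_MAP_DIRTY.
+ */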
+
+static void evaluate_dirty_status(struct pcs_map_entry * m)
+{
+ int i;
+
+ assert_spin_locked(&m->lock);
+
+ if (m->flags & PCS_MAP_DIRTY) {
+ m->flags &= ~PCS_MAP_DIRTY;
+ atomic_dec(&m->maps->dirty_count);
+ }
+
+ if (m->cs_list == NULL)
+ return;
+
+ for (i = 0; i < m->cs_list->nsrv; i++) {
+ struct pcs_cs_record * rec = m->cs_list->cs + i;
+
+ BUG_ON(rec->info.integrity_seq == 0);
+
+ if (cs_is_dirty(&rec->sync)) {
+ if (rec->sync.dirty_integrity == rec->info.integrity_seq) {
+ if (!(m->flags & PCS_MAP_DIRTY)) {
+ m->flags |= PCS_MAP_DIRTY;
+ atomic_inc(&m->maps->dirty_count);
+ }
+ } else {
+ TRACE(MAP_FMT " integrity seq advanced on CS#"NODE_FMT,
+ MAP_ARGS(m), NODE_ARGS(rec->info.id));
+
+ rec->sync.dirty_integrity = 0;
+ rec->sync.dirty_epoch = 0;
+ rec->sync.dirty_seq = 0;
+ }
+ } else
+ rec->sync.dirty_integrity = 0;
+ }
+
+ if (!(m->flags & PCS_MAP_DIRTY)) {
+ map_sync_work_del(m);
+ pcs_log(LOG_DEBUG5, "map %p is clean", m);
+ } else {
+ pcs_log(LOG_DEBUG5, "map %p is dirty", m);
+ if (!timer_pending(&m->sync_work.timer) && !(m->flags & PCS_MAP_FLUSHING))
+ map_sync_work_add(m, pcs_sync_timeout(cc_from_map(m)));
+ }
+}
+
+int pcs_map_encode_req(struct pcs_map_entry*m, struct pcs_ioc_getmap *map, int direction)
+{
+ int i;
+
+ spin_lock(&m->lock);
+ BUG_ON(map_chunk_start(m) > m->res_offset);
+ BUG_ON(map_chunk_end(m) < m->res_offset);
+ /*
+	 * Someone truncated the mapping while IO was in progress:
+	 * an aio_dio vs truncate race?
+ */
+ if (m->state & PCS_MAP_DEAD) {
+ spin_unlock(&m->lock);
+ pcs_map_put(m);
+ return 1;
+ }
+
+ map->uid = m->id;
+ map->version = m->version;
+ map->chunk_start = m->res_offset;
+ map->chunk_end = map_chunk_end(m);
+ map->state = 0;
+ if (m->state & PCS_MAP_READABLE)
+ map->state |= PCS_IOC_MAP_S_READ;
+ if (m->state & PCS_MAP_WRITEABLE || direction)
+ map->state |= PCS_IOC_MAP_S_WRITE;
+ if (m->state & PCS_MAP_NEW)
+ map->state |= PCS_IOC_MAP_S_NEW;
+ if (m->state & PCS_MAP_ERROR) {
+ map->state |= PCS_IOC_MAP_S_ERROR;
+ map->error = m->iofailure;
+ }
+ map->mds_flags = m->mds_flags;
+ map->psize_ret = 0; /* UNUSED */
+ map->chunk_psize = 0; /* UNUSED */
+
+ if (m->cs_list && m->cs_list->nsrv) {
+ map->cs_cnt = m->cs_list->nsrv;
+ for (i = 0; i < m->cs_list->nsrv; i++) {
+ map->cs[i] = m->cs_list->cs[i].info;
+ if (!(m->flags & PCS_MAP_DIRTY) || !cs_is_dirty(&m->cs_list->cs[i].sync))
+ map->cs[i].integrity_seq = 0;
+ }
+ }
+
+#ifdef __PCS_DEBUG
+ printk("%s submit m(%p)->uid:%lld\n", __FUNCTION__, m, m->id);
+ printk("map {id:%lld [%lld, %lld] v:{" VER_FMT "} st:%x, cnt:%d max:%d SZ:%ld}\n",
+ m->id, map->chunk_start, map->chunk_end, VER_ARGS(m->version),
+ map->state, map->cs_cnt, map->cs_max, map_sz);
+
+ printk("cs_list: ");
+ for (i = 0; i < map->cs_cnt; i++) {
+ printk("[%d]{id:%lld fl:%x} ",
+ i, map->cs[i].id.val, map->cs[i].flags);
+ }
+ printk("\n.");
+#endif
+ spin_unlock(&m->lock);
+ return 0;
+}
+
+/*
+ * Alloc and initialize cslist, grab cs->lock inside
+ */
+struct pcs_cs_list* cslist_alloc( struct pcs_cs_set *css, struct pcs_cs_info *rec, int cs_cnt,
+ int read_tout, int write_tout, int error_clear)
+{
+ struct pcs_cs_list * cs_list = NULL;
+ int i;
+
+ cs_list = kzalloc(sizeof(struct pcs_cs_list) + cs_cnt * sizeof(struct pcs_cs_record), GFP_NOFS);
+ if (!cs_list)
+ return NULL;
+
+ atomic_set(&cs_list->refcnt, 1);
+ atomic_set(&cs_list->seq_read_in_flight, 0);
+ cs_list->read_index = -1;
+ cs_list->cong_index = -1 ;
+ cs_list->flags = 0;
+ cs_list->blacklist = 0;
+ cs_list->read_timeout = (read_tout * HZ) / 1000;
+ cs_list->write_timeout = (write_tout * HZ) / 1000;
+ cs_list->nsrv = cs_cnt;
+ for (i = 0; i < cs_cnt; i++) {
+ cs_list->cs[i].info = rec[i];
+ memset(&cs_list->cs[i].sync, 0, sizeof(cs_list->cs[i].sync));
+ cs_list->cs[i].cslink.cs = NULL;
+ INIT_LIST_HEAD(&cs_list->cs[i].cslink.link);
+ cs_list->cs[i].cslink.index = i;
+ }
+
+
+ for (i = 0; i < cs_cnt; i++) {
+ struct pcs_cs_link * cslink = &cs_list->cs[i].cslink;
+ struct pcs_cs * cs;
+
+ if (cs_list->cs[i].info.flags & CS_FL_REPLICATING) {
+ __set_bit(i, &cs_list->blacklist);
+ cs_list->blacklist_expires = jiffies + PCS_REPLICATION_BLACKLIST_TIMEOUT;
+ }
+
+ cs = pcs_cs_find_create(css, &cs_list->cs[i].info.id,
+ &cs_list->cs[i].info.addr, cs_list->cs[i].info.flags);
+
+ if (!cs) {
+ cslist_destroy(cs_list);
+ return NULL;
+ }
+ assert_spin_locked(&cs->lock);
+ BUG_ON(cs->is_dead);
+
+ cslink->cs = cs;
+ cslink->addr_serno = cs->addr_serno;
+
+ cs->io_prio = cs_list->cs[i].info.io_prio;
+ cs->net_prio = cs_list->cs[i].info.net_prio;
+ cs->io_prio_stamp = jiffies;
+
+ /* update cs state */
+ cs->mds_flags = cs_list->cs[i].info.flags;
+ if (cs->mds_flags & CS_FL_LOCAL) {
+ set_bit(CS_SF_LOCAL, &cs->state);
+ cs_list->flags |= CSL_FL_HAS_LOCAL;
+ }
+ if (cs->mds_flags & CS_FL_LOCAL_SOCK)
+ set_bit(CS_SF_LOCAL_SOCK, &cs->state);
+ if (cs->mds_flags & CS_FL_INACTIVE) {
+ set_bit(CS_SF_INACTIVE, &cs->state);
+ cs_blacklist(cs, PCS_ERR_NET_ABORT, "mds hint");
+ }
+ if (cs->mds_flags & CS_FL_REPLICATING)
+ set_bit(CS_SF_REPLICATING, &cs->state);
+ if (cs->mds_flags & CS_FL_FAILED)
+ set_bit(CS_SF_FAILED, &cs->state);
+
+ list_add(&cslink->link, &cs->map_list);
+ cs->nmaps++;
+ spin_unlock(&cs->lock);
+ }
+
+ for (i = cs_cnt - 1; i >= 0; i--) {
+ struct pcs_cs * cs = cs_list->cs[i].cslink.cs;
+ spin_lock(&cs->lock);
+ if (cs_is_blacklisted(cs) && !(test_bit(CS_SF_INACTIVE, &cs->state))) {
+ if (error_clear)
+ cs_whitelist(cs, "mds hint");
+ else if (urgent_whitelist(cs))
+ cs_whitelist(cs, "urgent");
+ }
+ spin_unlock(&cs->lock);
+ }
+
+ return cs_list;
+}
+
+void pcs_map_complete(struct pcs_map_entry *m, struct pcs_ioc_getmap *omap)
+{
+ pcs_error_t error = omap->error;
+ struct pcs_cs_list * cs_list = NULL;
+ struct list_head queue;
+ int error_sensed = 0;
+
+ INIT_LIST_HEAD(&queue);
+
+ spin_lock(&m->lock);
+
+ TRACE(" recv m: " MAP_FMT " resp{ st:%d, err:%d, v:" VER_FMT "}\n",
+ MAP_ARGS(m), omap->state, omap->error.value, VER_ARGS(omap->version));
+
+ if (pcs_if_error(&omap->error))
+ goto error;
+
+ if (m->state & PCS_MAP_DEAD) {
+ spin_unlock(&m->lock);
+ goto out_ignore;
+ }
+
+ error_sensed = m->state & PCS_MAP_ERROR;
+
+ if (omap->cs_cnt) {
+ spin_unlock(&m->lock);
+ cs_list = cslist_alloc(&cc_from_map(m)->css, omap->cs, omap->cs_cnt, omap->read_tout, omap->write_tout, error_sensed);
+ spin_lock(&m->lock);
+ if (!cs_list) {
+ pcs_set_local_error(&error, PCS_ERR_NOMEM);
+ goto error;
+ }
+ /* Recheck one more time because we drop the lock */
+ if (m->state & PCS_MAP_DEAD) {
+ spin_unlock(&m->lock);
+ goto out_ignore;
+ }
+ }
+
+ if (!(m->state & PCS_MAP_RESOLVING)) {
+		/* This may happen because __pcs_map_error() explicitly assigns
+		   m->state = PCS_MAP_ERROR;
+		   Once m->state becomes atomic bit fields this will be impossible.
+ */
+ spin_unlock(&m->lock);
+ goto out_ignore;
+ }
+ pcs_map_reset(m);
+ m->id = omap->uid;
+ m->version = omap->version;
+
+ if (cs_list) {
+ if (m->cs_list) {
+ transfer_sync_data(cs_list, m->cs_list);
+ map_drop_cslist(m);
+ }
+ cs_list->map = m;
+ cs_list->version = m->version;
+ m->cs_list = cs_list;
+ cs_list = NULL;
+ } else if (m->state & PCS_MAP_NEW) {
+		/* This is supposed to be a zero chunk */
+ BUG_ON(!(m->state & (PCS_MAP_READABLE|PCS_MAP_NEW)));
+ map_drop_cslist(m);
+ m->chunk_psize = 0;
+ if (m->flags & PCS_MAP_DIRTY) {
+ m->flags &= ~PCS_MAP_DIRTY;
+ atomic_dec(&m->maps->dirty_count);
+ }
+
+ }
+
+ m->state = 0;
+ if (omap->state & PCS_IOC_MAP_S_READ)
+ m->state |= PCS_MAP_READABLE;
+ if (omap->state & PCS_IOC_MAP_S_WRITE)
+ m->state |= PCS_MAP_WRITEABLE;
+ if (omap->state & PCS_IOC_MAP_S_ERROR)
+ m->state |= PCS_MAP_ERROR;
+ if (omap->state & PCS_IOC_MAP_S_NEW) {
+ m->state |= PCS_MAP_NEW;
+		/* Userspace has an optimization which may return a map
+		 * covering a larger range, but this complicates locking.
+		 * Simply ignore it for now. */
+ if (omap->chunk_start < map_chunk_start(m))
+ omap->chunk_start = map_chunk_start(m);
+ if (map_chunk_end(m) < omap->chunk_end)
+ omap->chunk_end = map_chunk_end(m);
+ }
+ m->mds_flags = omap->mds_flags;
+ m->chunk_psize = omap->chunk_psize; /* UNUSED */
+ m->res_offset = omap->chunk_start;
+ if (map_chunk_start(m) != omap->chunk_start ||
+ map_chunk_end(m) != omap->chunk_end) {
+ BUG();
+ }
+
+ evaluate_dirty_status(m);
+#ifdef __PCS_DEBUG
+ if (1) {
+ int i;
+ TRACE(MAP_FMT " -> " CUID_FMT " psize=%u %d node map { ",
+ MAP_ARGS(m), CUID_ARGS(m->id),
+ m->chunk_psize, m->cs_list ? m->cs_list->nsrv : 0);
+ if (m->cs_list) {
+ for (i = 0; i < m->cs_list->nsrv; i++)
+ printk( NODE_FMT ":%x:%u ",
+ NODE_ARGS(m->cs_list->cs[i].info.id),
+ m->cs_list->cs[i].info.flags,
+ CS_FL_ROLE_GET(m->cs_list->cs[i].info.flags));
+ }
+ printk("}\n");
+ }
+#endif
+ m->error_tstamp = 0;
+ list_splice_tail_init(&m->queue, &queue);
+ spin_unlock(&m->lock);
+
+ /* Success, resubmit waiting requests */
+ pcs_cc_requeue(cc_from_map(m), &queue);
+ BUG_ON(!list_empty(&queue));
+ pcs_map_put(m);
+
+ return;
+
+error:
+ TRACE(" map error: %d for " MAP_FMT "\n", error.value, MAP_ARGS(m));
+ BUG_ON(!pcs_if_error(&error));
+
+ m->state &= ~PCS_MAP_RESOLVING;
+ m->error_tstamp = jiffies;
+ list_splice_tail_init(&m->queue, &queue);
+ pcs_map_reset(m);
+ spin_unlock(&m->lock);
+
+ pcs_ireq_queue_fail(&queue, error.value);
+out_ignore:
+ BUG_ON(!list_empty(&queue));
+ pcs_map_put(m);
+ if (cs_list)
+ cslist_put(cs_list);
+}
+
+/* Atomically schedule map resolve and push ireq to wait completion */
+static void pcs_map_queue_resolve(struct pcs_map_entry * m, struct pcs_int_request *ireq, int direction)
+{
+
+ DTRACE("enter m: " MAP_FMT ", ireq:%p dir:%d \n", MAP_ARGS(m), ireq, direction);
+
+ spin_lock(&m->lock);
+ /* This should not happen unless aio_dio/fsync vs truncate race */
+ if (m->state & PCS_MAP_DEAD) {
+ struct list_head l;
+
+ spin_unlock(&m->lock);
+ INIT_LIST_HEAD(&l);
+ list_add(&ireq->list, &l);
+ pcs_ireq_queue_fail(&l, PCS_ERR_NET_ABORT);
+ return;
+ }
+ DTRACE("%p {%p %p}\n",ireq, ireq->list.next, ireq->list.prev);
+ BUG_ON(!list_empty(&ireq->list));
+
+ list_add_tail(&ireq->list, &m->queue);
+ if (m->state & PCS_MAP_RESOLVING) {
+ spin_unlock(&m->lock);
+ return;
+ }
+ /* If converting a hole, adjust res_offset */
+ if (direction && !m->cs_list && !(m->state & PCS_MAP_RESOLVING)
+ && ireq->type == PCS_IREQ_IOCHUNK)
+ m->res_offset = ireq->iochunk.chunk + ireq->iochunk.offset;
+
+ m->state |= PCS_MAP_RESOLVING;
+ __pcs_map_get(m); /* drop on pcs_map_complete */
+
+ spin_unlock(&m->lock);
+ /// TODO: THINK!!!!
+	/// Maybe it is reasonable to schedule fuse_map_resolve from a work queue?
+ fuse_map_resolve(m, direction);
+}
+
+/* If the version on m has not advanced yet, we must notify MDS about the error.
+ * If it has already advanced, we just ignore the error in the hope that the new
+ * map will work.
+ */
+static void map_notify_error(struct pcs_map_entry * m, struct pcs_int_request * sreq,
+ PCS_MAP_VERSION_T * failed_version, struct pcs_cs_list * csl)
+{
+ int cs_notify = 0;
+
+ spin_lock(&m->lock);
+ if (m->state & PCS_MAP_DEAD) {
+ spin_unlock(&m->lock);
+ return;
+ }
+ if (sreq->error.remote &&
+ !(m->state & (PCS_MAP_ERROR|PCS_MAP_NEW|PCS_MAP_RESOLVING|PCS_MAP_DEAD)) &&
+ map_version_compare(failed_version, &m->version) >= 0) {
+ int suppress_error = 0;
+
+ if (csl) {
+ int i;
+
+ for (i = 0; i < csl->nsrv; i++) {
+ if (csl->cs[i].info.id.val == sreq->error.offender.val) {
+ if (csl->cs[i].cslink.cs->addr_serno != csl->cs[i].cslink.addr_serno) {
+ TRACE("error for CS"NODE_FMT " has been suppressed", NODE_ARGS(sreq->error.offender));
+ suppress_error = 1;
+ }
+ break;
+ }
+ }
+ }
+ if (suppress_error)
+ map_remote_error_nolock(m, PCS_ERR_CSD_STALE_MAP, sreq->error.offender.val);
+ else {
+ map_remote_error_nolock(m, sreq->error.value, sreq->error.offender.val);
+ cs_notify = 1;
+ }
+ }
+ spin_unlock(&m->lock);
+ if (cs_notify)
+ pcs_cs_notify_error(sreq->dentry->cluster, &sreq->error);
+
+}
+
+/* This function notifies the map about a fatal error which does not result in a request restart.
+ * Even though the request is not retried internally, it can be retried by the client, so
+ * we have to force invalidation of the current version.
+ */
+ */
+void map_notify_iochunk_error(struct pcs_int_request * sreq)
+{
+ struct pcs_map_entry * m = sreq->iochunk.map;
+
+ if (!m || (m->state & PCS_MAP_DEAD))
+ return;
+
+ map_notify_error(m, sreq, &sreq->iochunk.hbuf.map_version, sreq->iochunk.csl);
+}
+
+static void map_replicating(struct pcs_int_request *ireq)
+{
+	struct pcs_cs_list * csl = ireq->iochunk.csl;
+	int read_idx;
+
+	BUG_ON(ireq->iochunk.direction);
+
+	/* csl may be NULL here, check it before dereferencing */
+	if (csl == NULL || csl->map == NULL)
+		return;
+
+	read_idx = READ_ONCE(csl->read_index);
+
+	TRACE("reading unfinished replica %lx %d", csl->blacklist, read_idx);
+
+ if (ireq->iochunk.cs_index != read_idx)
+ return;
+
+	BUG_ON(read_idx < 0 || read_idx >= csl->nsrv);
+
+ if (!ireq->error.remote ||
+ csl->cs[read_idx].cslink.cs->id.val != ireq->error.offender.val) {
+ TRACE("wrong cs id " NODE_FMT " " NODE_FMT, NODE_ARGS(csl->cs[read_idx].cslink.cs->id), NODE_ARGS(ireq->error.offender));
+ return;
+ }
+
+ /* If request was issued for the last CS in the list, clear error. */
+ pcs_clear_error(&ireq->error);
+ WRITE_ONCE(csl->blacklist_expires, jiffies + PCS_REPLICATION_BLACKLIST_TIMEOUT);
+
+ /* And blacklist the last replica */
+ if (!(test_bit(read_idx, &csl->blacklist))) {
+ WRITE_ONCE(csl->read_index, -1);
+ set_bit(read_idx, &csl->blacklist);
+ }
+}
+
+static void map_read_error(struct pcs_int_request *ireq)
+{
+ struct pcs_cs_list * csl = ireq->iochunk.csl;
+ struct pcs_cs * cs;
+
+ BUG_ON(ireq->iochunk.direction);
+
+ if (csl == NULL || csl->map == NULL || (csl->map->state & PCS_MAP_ERROR))
+ return;
+
+ cs = csl->cs[ireq->iochunk.cs_index].cslink.cs;
+
+ if (ireq->flags & IREQ_F_MAPPED) {
+ cs_blacklist_unlocked(cs, ireq->error.value, "error on directly mapped CS");
+ return;
+ }
+
+	/* If everything is already blacklisted, proceed with reporting the error to MDS */
+ if (all_blacklisted(csl)) {
+ cs_blacklist_unlocked(cs, ireq->error.value, "total read error");
+ return;
+ }
+
+ /* If this CS is already blacklisted, select another CS, we have spare ones */
+ if (cs_is_blacklisted(cs)) {
+ TRACE("Skipping CS" NODE_FMT, NODE_ARGS(cs->id));
+ WRITE_ONCE(csl->read_index, -1);
+ pcs_clear_error(&ireq->error);
+ return;
+ }
+
+	/* Mark the CS as dubious */
+ if (csl->cs[ireq->iochunk.cs_index].cslink.addr_serno == cs->addr_serno)
+ cs_blacklist_unlocked(cs, ireq->error.value, "read error");
+
+ /* If some clean CSes remained, select another one, otherwise report error to MDS */
+ if (!all_blacklisted(csl)) {
+ WRITE_ONCE(csl->read_index, -1);
+ pcs_clear_error(&ireq->error);
+ }
+}
+
+static unsigned int cong_roundup(unsigned int size)
+{
+ return (size + 65535) & ~65535;
+}
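+
+/*
+ * cong_roundup() rounds an IO size up to a 64K multiple for congestion
+ * accounting, e.g. 4096 -> 65536, 70000 -> 131072, 131072 -> 131072.
+ */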
+
+static int worth_to_grow(struct pcs_int_request *ireq, struct pcs_cs * cs)
+{
+ if (ireq->type == PCS_IREQ_FLUSH)
+ return 0;
+
+ return jiffies < ireq->ts_sent + cc_from_csset(cs->css)->netlat_cutoff;
+}
+
+static void pcs_cs_deaccount(struct pcs_int_request *ireq, struct pcs_cs * cs, int error)
+{
+ unsigned int cost;
+
+ spin_lock(&cs->lock);
+ if (ireq->type == PCS_IREQ_IOCHUNK)
+ cost = (ireq->flags & IREQ_F_RND_WEIGHT) ? 512*1024 : cong_roundup(ireq->iochunk.size);
+ else
+ cost = PCS_CS_FLUSH_WEIGHT;
+
+ if (!error) {
+ int iolat_cutoff = cc_from_csset(cs->css)->iolat_cutoff;
+
+ if (cs->last_latency > iolat_cutoff && ireq->type != PCS_IREQ_FLUSH) {
+ unsigned int clamp;
+
+ clamp = PCS_CS_INIT_CWND;
+ if (cs->last_latency > iolat_cutoff*8)
+ clamp = PCS_CS_INIT_CWND/8;
+ else if (cs->last_latency > iolat_cutoff*4)
+ clamp = PCS_CS_INIT_CWND/4;
+ else if (cs->last_latency > iolat_cutoff*2)
+ clamp = PCS_CS_INIT_CWND/2;
+
+ TRACE("IO latency on CS" NODE_FMT " is %u, cwnd %u, clamp %u", NODE_ARGS(cs->id), cs->last_latency, cs->cwnd, clamp);
+
+ if (cs->cwnd > clamp)
+ cs->cwnd = clamp;
+ } else if (cs->in_flight >= cs->cwnd && !cs->cwr_state && worth_to_grow(ireq, cs)) {
+ unsigned int cwnd;
+
+ if (cs->cwnd < PCS_CS_INIT_CWND)
+ cwnd = cs->cwnd + cost;
+ else
+ cwnd = cs->cwnd + 0x100000000ULL/cs->cwnd;
+
+ if (cwnd > PCS_CS_MAX_CWND)
+ cwnd = PCS_CS_MAX_CWND;
+ if (cwnd != cs->cwnd) {
+ cs->cwnd = cwnd;
+ DTRACE("Congestion window on CS" NODE_FMT " UP %u", NODE_ARGS(cs->id), cwnd);
+ }
+ }
+ cs->eff_cwnd = cs->cwnd;
+ cs_whitelist(cs, "io hint");
+ } else if (error > 0) {
+		/* In case of an error coming from some CS, temporarily shrink the congestion
+		 * window to the minimum, allowing one request in flight. It will come back to normal
+		 * as soon as the CS is probed for aliveness.
+		 */
+ TRACE("Congestion window on CS" NODE_FMT " is closed (%u)", NODE_ARGS(cs->id), cs->cwnd);
+ cs->eff_cwnd = 1;
+ }
+ cs_decrement_in_flight(cs, cost);
+ spin_unlock(&cs->lock);
+}
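+
+/*
+ * Congestion window dynamics in pcs_cs_deaccount(), in brief:
+ *  - a high IO latency clamps cwnd down to a fraction of PCS_CS_INIT_CWND
+ *    (1/2, 1/4 or 1/8 depending on how far last_latency exceeds iolat_cutoff);
+ *  - successful completions while the pipe is full grow cwnd either by the
+ *    request cost (while cwnd < PCS_CS_INIT_CWND) or additively by
+ *    2^32/cwnd per completion, e.g. with cwnd = 1M the step is
+ *    0x100000000ULL / 0x100000 = 4096;
+ *  - a remote error (positive code) collapses eff_cwnd to 1 until the CS is
+ *    probed for aliveness again.
+ */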
+
+static void pcs_cs_wakeup(struct pcs_cs * cs, int requeue)
+{
+ struct pcs_int_request * sreq;
+ struct pcs_map_entry * map;
+
+ while (1) {
+ spin_lock(&cs->lock);
+
+ if (cs->in_flight >= cs->eff_cwnd || list_empty(&cs->active_list)) {
+ spin_unlock(&cs->lock);
+ break;
+ }
+ sreq = list_first_entry(&cs->active_list, struct pcs_int_request, list);
+ BUG_ON(!cs->active_list_len);
+ list_del_init(&sreq->list);
+ cs->active_list_len--;
+ spin_unlock(&cs->lock);
+
+ if (sreq->type != PCS_IREQ_FLUSH) {
+ map = pcs_find_get_map(sreq->dentry, sreq->iochunk.chunk +
+ ((sreq->flags & IREQ_F_MAPPED) ? 0 : sreq->iochunk.offset));
+ if (map) {
+ if (sreq->iochunk.map)
+ pcs_map_put(sreq->iochunk.map);
+ sreq->iochunk.map = map;
+ if (sreq->iochunk.flow) {
+ struct pcs_int_request * preq = sreq->completion_data.parent;
+
+ pcs_flow_confirm(sreq->iochunk.flow, &map->mapping->ftab, preq->apireq.req->type == PCS_REQ_T_WRITE,
+ preq->apireq.req->pos, preq->apireq.req->size,
+ &sreq->cc->maps.ftab);
+ }
+ map_submit(map, sreq, requeue);
+ } else {
+ map_queue_on_limit(sreq);
+ }
+ } else {
+ map = sreq->flushreq.map;
+ if (map->state & PCS_MAP_DEAD) {
+ pcs_clear_error(&sreq->error);
+ ireq_complete(sreq);
+ } else
+ map_submit(map, sreq, requeue);
+ }
+ }
+}
+
+static int __pcs_cs_still_congested(struct pcs_cs * cs)
+{
+
+ assert_spin_locked(&cs->lock);
+
+ if (!list_empty(&cs->active_list)) {
+ BUG_ON(!cs->active_list_len);
+ list_splice_tail(&cs->active_list, &cs->cong_queue);
+ cs->cong_queue_len += cs->active_list_len;
+ set_bit(CS_SF_CONGESTED, &cs->state);
+ pcs_cs_init_active_list(cs);
+ } else if (list_empty(&cs->cong_queue)) {
+ BUG_ON(cs->cong_queue_len);
+ BUG_ON(test_bit(CS_SF_CONGESTED, &cs->state));
+ return 0;
+ } else {
+ BUG_ON(cs->active_list_len);
+ }
+
+ if (cs->in_flight >= cs->eff_cwnd)
+ return 0;
+
+	/* Exceptional situation: the CS is not congested, but still has a congestion queue.
+	 * This can happen e.g. when the CS was congested with reads and has some writes in the queue,
+	 * then all reads complete, but the writes cannot be sent because of congestion
+	 * on other CSes in the chain. This is absolutely normal, we just should queue
+	 * not on this CS, but on the actually congested CSes. With the current algorithm for preventing
+	 * reordering, we made a mistake and queued on a node which used to be congested.
+	 * The solution for now is to retry sending with the "requeue" flag set, which will requeue
+	 * the requests on other nodes. It is difficult to say how frequently this happens,
+	 * so we spit out a message. If we see lots of them in the logs, we have to select a
+	 * different solution.
+	 */
+
+ TRACE("CS#" NODE_FMT " is free, but still has queue", NODE_ARGS(cs->id));
+ pcs_cs_flush_cong_queue(cs);
+
+ return 1;
+}
+static int pcs_cs_still_congested(struct pcs_cs * cs)
+{
+ int ret;
+
+ spin_lock(&cs->lock);
+ ret = __pcs_cs_still_congested(cs);
+ spin_unlock(&cs->lock);
+ return ret;
+}
+
+void pcs_deaccount_ireq(struct pcs_int_request *ireq, pcs_error_t * err)
+{
+ int error = 0;
+ unsigned long long match_id = 0;
+ struct pcs_cs_list * csl, ** csl_p = 0;
+
+ switch (ireq->type) {
+ case PCS_IREQ_IOCHUNK:
+ csl_p = &ireq->iochunk.csl;
+ if (ireq->iochunk.map) {
+ pcs_map_put(ireq->iochunk.map);
+ ireq->iochunk.map = NULL;
+ }
+ break;
+ case PCS_IREQ_FLUSH:
+ csl_p = &ireq->flushreq.csl;
+ break;
+ default:
+ BUG();
+ }
+
+ if ((csl = *csl_p) == NULL)
+ return;
+
+ if (pcs_if_error(err)) {
+ if (!err->remote) {
+ error = -1;
+ } else {
+ match_id = err->offender.val;
+ error = err->value;
+
+ switch (error) {
+ case PCS_ERR_CSD_STALE_MAP:
+ case PCS_ERR_CSD_REPLICATING:
+ case PCS_ERR_CSD_RO_MAP:
+ error = 0;
+ }
+ }
+ }
+
+ if (ireq->type == PCS_IREQ_FLUSH || (ireq->iochunk.direction && !(ireq->flags & IREQ_F_MAPPED))) {
+ int i;
+ int requeue = 0;
+
+ for (i = csl->nsrv - 1; i >= 0; i--) {
+ if (!match_id || csl->cs[i].cslink.cs->id.val == match_id)
+ break;
+
+ pcs_cs_deaccount(ireq, csl->cs[i].cslink.cs, -1);
+ }
+
+ if (i >= 0) {
+ pcs_cs_deaccount(ireq, csl->cs[i].cslink.cs, error);
+ i--;
+ }
+
+ for ( ; i >= 0; i--) {
+ pcs_cs_deaccount(ireq, csl->cs[i].cslink.cs, 0);
+ }
+
+ do {
+ for (i = csl->nsrv - 1; i >= 0; i--)
+ pcs_cs_wakeup(csl->cs[i].cslink.cs, requeue);
+
+ requeue = 0;
+ for (i = csl->nsrv - 1; i >= 0; i--)
+ requeue += pcs_cs_still_congested(csl->cs[i].cslink.cs);
+ } while (requeue);
+ } else {
+ int requeue = 0;
+ struct pcs_cs * rcs = csl->cs[ireq->iochunk.cs_index].cslink.cs;
+
+ if (ireq->flags & IREQ_F_SEQ_READ) {
+ ireq->flags &= ~IREQ_F_SEQ_READ;
+ if (atomic_dec_and_test(&csl->seq_read_in_flight))
+ WRITE_ONCE(csl->select_stamp, jiffies);
+ }
+
+ pcs_cs_deaccount(ireq, rcs, error);
+
+ do {
+ pcs_cs_wakeup(rcs, requeue);
+
+ requeue = pcs_cs_still_congested(rcs);
+ } while (requeue);
+ }
+ *csl_p = NULL;
+ cslist_put(csl);
+}
+
+void map_notify_soft_error(struct pcs_int_request *ireq)
+{
+ pcs_error_t err;
+
+ if (ireq->error.value == PCS_ERR_CSD_REPLICATING)
+ map_replicating(ireq);
+
+ if (ireq->error.value == PCS_ERR_CANCEL_KEEPWAIT)
+ pcs_clear_error(&ireq->error);
+
+ err = ireq->error;
+
+ if (!ireq->iochunk.direction &&
+ pcs_if_error(&err) &&
+ err.remote &&
+ err.value != PCS_ERR_CSD_STALE_MAP &&
+ err.value != PCS_ERR_CSD_REPLICATING &&
+ err.value != PCS_ERR_CSD_RO_MAP)
+ map_read_error(ireq);
+
+ if (pcs_if_error(&ireq->error))
+ map_notify_iochunk_error(ireq);
+
+ if (map_version_compare(&ireq->iochunk.hbuf.map_version, &ireq->iochunk.map->version) < 0)
+ ireq->flags &= ~IREQ_F_ONCE;
+
+ pcs_deaccount_ireq(ireq, &err);
+}
+
+static unsigned int map_ioprio_to_latency(unsigned int io_prio)
+{
+ static unsigned int map[] = {
+ 50000,
+ 50000,
+ 10000,
+ 4000,
+ 2000,
+ };
+
+ if (io_prio < sizeof(map)/sizeof(map[0]))
+ return map[io_prio];
+ else
+ return 500;
+}
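+
+/*
+ * The table above substitutes an expected IO latency for a CS whose measured
+ * latency EWMA is still zero but whose advertised io_prio is fresh: e.g. in
+ * select_cs_for_read() below a CS with io_prio 3 is weighted as 4000 plus its
+ * network latency (in the same units as the measured CS latencies).
+ */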
+
+static int get_io_locality(struct pcs_cluster_core *cc)
+{
+ int io_locality;
+
+ io_locality = cc->io_locality;
+ if (io_locality == 0)
+ io_locality = cc->cfg.curr.io_locality;
+
+ return io_locality;
+}
+
+static unsigned int get_io_tweaks(struct pcs_cluster_core *cc)
+{
+ unsigned int io_tweaks;
+
+ io_tweaks = cc->io_tweaks;
+ if (io_tweaks == 0)
+ io_tweaks = cc->cfg.curr.io_tweaks;
+
+ return io_tweaks;
+}
+
+static int select_cs_for_read(struct pcs_cluster_core *cc, struct pcs_cs_list * csl, int is_seq, unsigned int pos, PCS_NODE_ID_T banned_cs)
+{
+ abs_time_t now = jiffies;
+ unsigned int local_min, remote_min, local_pipe, remote_pipe;
+ unsigned int local_mask, local_busy_mask;
+ int local_idx, remote_idx, selected;
+ int io_locality = get_io_locality(cc);
+ int io_cost;
+ int failed_cnt = 0;
+ int i;
+
+next_pass:
+
+ local_min = remote_min = local_pipe = remote_pipe = ~0U;
+ local_idx = remote_idx = -1;
+ local_mask = local_busy_mask = 0;
+
+ for (i = csl->nsrv - 1; i >= 0; i--) {
+ struct pcs_cs * cs = csl->cs[i].cslink.cs;
+ unsigned int w, io_lat, net_lat;
+ unsigned int in_flight;
+ abs_time_t io_prio_stamp;
+
+ if (failed_cnt >= 0 && ((test_bit(CS_SF_FAILED, &cs->state)) || cs->id.val == banned_cs.val)) {
+ failed_cnt++;
+ continue;
+ }
+
+ if (test_bit(i, &csl->blacklist)) {
+ if (jiffies < READ_ONCE(csl->blacklist_expires))
+ continue;
+ TRACE("expire replication blacklist");
+ clear_bit(i, &csl->blacklist);
+ }
+
+ if (cs_is_blacklisted(cs))
+ continue;
+
+ io_lat = __cs_get_avg_latency(cs, now);
+ net_lat = __cs_get_avg_net_latency(cs, now);
+ in_flight = READ_ONCE(cs->in_flight);
+ io_prio_stamp = READ_ONCE(cs->io_prio_stamp);
+
+ w = io_lat + net_lat;
+
+ if ((io_lat >> CS_LAT_EWMA_LOG) == 0 &&
+ now < io_prio_stamp + PCS_CS_IO_PRIO_VALID_TIME)
+ w = map_ioprio_to_latency(READ_ONCE(cs->io_prio)) + net_lat;
+
+ if (get_io_tweaks(cc) & PCS_TWEAK_USE_FLOW_LOAD)
+ w += pcs_flow_cs_analysis(cs) * 8000;
+
+ if (w <= remote_min) {
+
+ if (w < remote_min || in_flight <= remote_pipe) {
+ remote_min = w;
+ remote_pipe = in_flight;
+ remote_idx = i;
+ }
+ }
+
+ if (test_bit(CS_SF_LOCAL, &cs->state)) {
+ local_mask |= (1 << i);
+ if (io_lat > 1000)
+ local_busy_mask |= (1 << i);
+
+ if (w < local_min || (w == local_min && in_flight <= local_pipe)) {
+ local_min = w;
+ local_pipe = in_flight;
+ local_idx = i;
+ }
+ }
+ }
+
+ if (remote_idx < 0) {
+ if (failed_cnt > 0) {
+ failed_cnt = -1;
+ goto next_pass;
+ }
+ return -1;
+ }
+
+ /* If the flow is sequential, but we have too many sequential flows, consider
+ * all of them random, which is essentially true.
+ */
+ io_cost = 8000;
+ if (is_seq) {
+ int nflows = pcs_flow_analysis(&cc->maps.ftab);
+
+ if (nflows >= PCS_FLOW_THRESH && io_locality < 0)
+ is_seq = 0;
+
+ if (nflows < PCS_FLOW_THRESH)
+ io_cost = 500;
+ }
+
+ if (local_idx < 0)
+ selected = remote_idx;
+ else if (io_locality > 0)
+ selected = local_idx;
+ else if (io_locality == 0 && local_mask != local_busy_mask) {
+ selected = local_idx;
+ io_cost = local_min / 16;
+ } else if (get_io_tweaks(cc) & PCS_TWEAK_IGNORE_SEQUENTIAL)
+ selected = remote_idx;
+ else {
+ if (is_seq)
+ selected = local_idx;
+ else
+ selected = remote_idx;
+ }
+
+	/* Add a penalty. The result of the current decision will reflect itself in latency
+	 * only after at least one round-trip time. The penalty poisons the weight until that moment.
+	 * Ideally it should decay and be replaced by the EWMA average introduced by the increased latency.
+	 * Think about a better algorithm; maybe it is the key to a finally correct algorithm.
+	 */
+ if (!(get_io_tweaks(cc) & PCS_TWEAK_USE_FLOW_LOAD))
+ cs_account_latency(csl->cs[selected].cslink.cs, io_cost);
+
+ return selected;
+}
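+
+/*
+ * Selection summary for select_cs_for_read() (descriptive only): every
+ * non-blacklisted CS gets the weight w = io_latency + net_latency, optionally
+ * increased by 8000 per active flow when PCS_TWEAK_USE_FLOW_LOAD is set; the
+ * best local and best remote candidates are tracked separately and
+ * io_locality decides between them. Unless the flow-load tweak is on, the
+ * winner is then penalized by io_cost (500 for a sequential flow under
+ * PCS_FLOW_THRESH, local_min/16 for an idle local replica, 8000 otherwise)
+ * so that decisions made within one RTT do not all pile onto the same CS.
+ */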
+
+struct pcs_int_request *
+pcs_ireq_split(struct pcs_int_request *ireq, unsigned int iochunk, int noalign)
+{
+ struct pcs_int_request * sreq;
+
+ sreq = ireq_alloc(ireq->dentry);
+ if (!sreq)
+ return NULL;
+
+ sreq->dentry = ireq->dentry;
+ sreq->type = PCS_IREQ_IOCHUNK;
+ sreq->flags = ireq->flags;
+ sreq->iochunk.map = ireq->iochunk.map;
+ if (sreq->iochunk.map)
+ __pcs_map_get(sreq->iochunk.map);
+ sreq->iochunk.flow = pcs_flow_get(ireq->iochunk.flow);
+ sreq->iochunk.direction = ireq->iochunk.direction;
+ sreq->iochunk.role = ireq->iochunk.role;
+ sreq->iochunk.cs_index = ireq->iochunk.cs_index;
+ sreq->iochunk.chunk = ireq->iochunk.chunk;
+ sreq->iochunk.offset = ireq->iochunk.offset;
+ sreq->iochunk.dio_offset = ireq->iochunk.dio_offset;
+ if (!noalign &&
+ (sreq->iochunk.offset & 4095) &&
+ iochunk > (sreq->iochunk.offset & 4095) &&
+ ireq->iochunk.map &&
+ sreq->iochunk.chunk + sreq->iochunk.offset + iochunk != map_chunk_end(ireq->iochunk.map))
+ iochunk -= (sreq->iochunk.offset & 4095);
+ sreq->iochunk.size = iochunk;
+
+ if (ireq->flags & IREQ_F_LOC_TOKEN)
+ BUG();
+
+ sreq->iochunk.csl = NULL;
+ sreq->iochunk.banned_cs.val = 0;
+ sreq->complete_cb = ireq->complete_cb;
+ sreq->iochunk.msg.destructor = NULL;
+ sreq->iochunk.msg.rpc = NULL;
+ pcs_sreq_attach(sreq, ireq->completion_data.parent);
+
+ ireq->iochunk.size -= iochunk;
+ ireq->iochunk.offset += iochunk;
+ ireq->iochunk.dio_offset += iochunk;
+
+ return sreq;
+}
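+
+/*
+ * Alignment example for pcs_ireq_split(): with offset = 1000 (not 4K aligned)
+ * and iochunk = 65536 the first sub-request is shortened to 64536 bytes, so
+ * the remainder continues at offset 65536, which is 4K aligned. The shortening
+ * is skipped when the sub-request would end exactly at the chunk boundary or
+ * when noalign is set.
+ */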
+
+static int pcs_cslist_submit_read(struct pcs_int_request *ireq, struct pcs_cs_list * csl, int requeue)
+{
+ struct pcs_cluster_core *cc = ireq->cc;
+ struct pcs_cs * cs;
+ unsigned int iochunk;
+ int allot;
+ int i = -1;
+ int is_seq, csl_seq = atomic_read(&csl->seq_read_in_flight);
+
+ is_seq = csl_seq || pcs_flow_sequential(ireq->iochunk.flow);
+ i = READ_ONCE(csl->read_index);
+
+ if (i >= 0) {
+ abs_time_t now = jiffies;
+ abs_time_t selected = READ_ONCE(csl->select_stamp);
+
+ cs = csl->cs[i].cslink.cs;
+
+		/* Force a rebalance after a long timeout, or when there is no sequential IO
+		 * on this chunk and a new read begins from the chunk start.
+		 * Also rebalance after a short timeout, but only if one of the following conditions holds:
+		 * 1. No active sequential reads on this chunk, including this one.
+		 * 2. io_locality < 0
+		 * 3. No active sequential reads and the sequential read goes to a remote CS; maybe we want to switch to a local one.
+		 */
+ if (now > selected + PCS_MAP_MAX_REBALANCE_TIMEOUT ||
+ (!csl_seq && ireq->iochunk.offset == 0) ||
+ (get_io_tweaks(cc) & PCS_TWEAK_REBALANCE_ALWAYS) ||
+ (now > selected + PCS_MAP_MIN_REBALANCE_TIMEOUT &&
+ (!is_seq || get_io_locality(cc) < 0 ||
+ (!csl_seq &&
+ !(test_bit(CS_SF_LOCAL, &cs->state)) && (csl->flags & CSL_FL_HAS_LOCAL))))) {
+ i = -1;
+ WRITE_ONCE(csl->read_index, -1);
+ }
+ }
+
+ if (i < 0) {
+ i = select_cs_for_read(cc, csl, is_seq, ireq->iochunk.offset, ireq->iochunk.banned_cs);
+
+ if (i < 0) {
+			/* All CSes are blacklisted. Generate an error for the first one
+			 * and let MDS figure out what happened with the rest.
+			 */
+ cs = csl->cs[0].cslink.cs;
+ map_remote_error(ireq->iochunk.map, cs->blacklist_reason, cs->id.val);
+
+ TRACE("Read from " MAP_FMT " blocked by blacklist error %d, CS" NODE_FMT,
+ MAP_ARGS(ireq->iochunk.map), cs->blacklist_reason, NODE_ARGS(cs->id));
+ return -1;
+ }
+
+ WRITE_ONCE(csl->read_index, i);
+ WRITE_ONCE(csl->select_stamp, jiffies);
+
+ TRACE("Selected read map " MAP_FMT " to CS" NODE_FMT "; is_seq=%d\n", MAP_ARGS(ireq->iochunk.map),
+ NODE_ARGS(csl->cs[i].cslink.cs->id), is_seq);
+ pcs_flow_bind_cs(ireq->iochunk.flow, csl->cs[i].cslink.cs);
+ }
+ cs = csl->cs[i].cslink.cs;
+
+ ireq->iochunk.cs_index = i;
+
+ spin_lock(&cs->lock);
+ cs_cwnd_use_or_lose(cs);
+ allot = cs->eff_cwnd - cs->in_flight;
+ spin_unlock(&cs->lock);
+
+ if (allot < 0) {
+ pcs_cs_cong_enqueue(ireq, cs);
+
+ return 0;
+ }
+
+ if (allot < ireq->dentry->cluster->cfg.curr.lmss)
+ allot = ireq->dentry->cluster->cfg.curr.lmss;
+
+ if (test_bit(CS_SF_LOCAL, &cs->state))
+ iochunk = ireq->dentry->cluster->cfg.curr.lmss;
+ else
+ iochunk = ireq->dentry->cluster->cfg.curr.rmss;
+
+ for (;;) {
+ struct pcs_int_request * sreq = ireq;
+ unsigned int weight;
+
+ if (ireq->iochunk.size > iochunk) {
+ sreq = pcs_ireq_split(ireq, iochunk, 0);
+
+ if (sreq == NULL) {
+ pcs_set_local_error(&ireq->error, PCS_ERR_NOMEM);
+ ireq_complete(ireq);
+ return 0;
+ }
+ }
+
+ sreq->flags &= ~(IREQ_F_RND_WEIGHT | IREQ_F_SEQ);
+ BUG_ON(sreq->flags & IREQ_F_SEQ_READ);
+ if (pcs_flow_sequential(sreq->iochunk.flow)) {
+ sreq->flags |= IREQ_F_SEQ_READ | IREQ_F_SEQ;
+ atomic_inc(&csl->seq_read_in_flight);
+ weight = cong_roundup(sreq->iochunk.size);
+ } else if (sreq->iochunk.size >= 512*1024 || !(get_io_tweaks(cc) & PCS_TWEAK_USE_FLOW_WEIGHT)) {
+ weight = cong_roundup(sreq->iochunk.size);
+ } else {
+ sreq->flags |= IREQ_F_RND_WEIGHT;
+ weight = 512*1024;
+ }
+
+ cs_increment_in_flight(cs, weight);
+ allot -= weight;
+
+ BUG_ON(sreq->iochunk.csl);
+ cslist_get(csl);
+ sreq->iochunk.csl = csl;
+ pcs_cs_submit(cs, sreq);
+
+ if (sreq == ireq)
+ return 0;
+
+ if (allot < 0) {
+ pcs_cs_cong_enqueue(ireq, cs);
+ return 0;
+ }
+ }
+}
+
+static int pcs_cslist_submit_write(struct pcs_int_request *ireq, struct pcs_cs_list * csl, int requeue)
+{
+ struct pcs_cs * cs;
+ unsigned int iochunk;
+ int i;
+ int congested_idx;
+ int max_excess;
+ int allot;
+
+ ireq->iochunk.cs_index = 0;
+ iochunk = ireq->dentry->cluster->cfg.curr.lmss;
+
+restart:
+ congested_idx = -1;
+ max_excess = 0;
+ allot = ireq->iochunk.size;
+
+ for (i = 0; i < csl->nsrv; i++) {
+ int cs_allot;
+
+ cs = csl->cs[i].cslink.cs;
+ if (cs_is_blacklisted(cs)) {
+ map_remote_error(ireq->iochunk.map, cs->blacklist_reason, cs->id.val);
+ TRACE("Write to " MAP_FMT " blocked by blacklist error %d, CS" NODE_FMT,
+ MAP_ARGS(ireq->iochunk.map), cs->blacklist_reason, NODE_ARGS(cs->id));
+ return -1;
+ }
+ spin_lock(&cs->lock);
+ cs_cwnd_use_or_lose(cs);
+ cs_allot = cs->eff_cwnd - cs->in_flight;
+ spin_unlock(&cs->lock);
+
+ if (cs_allot < 0) {
+ cs_allot = -cs_allot;
+ if (cs_allot > max_excess) {
+ congested_idx = i;
+ max_excess = cs_allot;
+ }
+ } else {
+ if (cs_allot < allot)
+ allot = cs_allot;
+ }
+
+ if (!(test_bit(CS_SF_LOCAL, &cs->state)))
+ iochunk = ireq->dentry->cluster->cfg.curr.wmss;
+ }
+
+ if (congested_idx >= 0) {
+ int cur_cong_idx = READ_ONCE(csl->cong_index);
+
+
+ if (cur_cong_idx >= 0 && !requeue &&
+ (READ_ONCE(csl->cs[cur_cong_idx].cslink.cs->cong_queue_len) ||
+ READ_ONCE(csl->cs[cur_cong_idx].cslink.cs->active_list_len)))
+ congested_idx = cur_cong_idx;
+ else
+ WRITE_ONCE(csl->cong_index, congested_idx);
+
+ pcs_cs_cong_enqueue(ireq, csl->cs[congested_idx].cslink.cs);
+ return 0;
+ }
+ WRITE_ONCE(csl->cong_index, -1);
+
+ if (allot < ireq->dentry->cluster->cfg.curr.lmss)
+ allot = ireq->dentry->cluster->cfg.curr.lmss;
+
+ for (;;) {
+ struct pcs_int_request * sreq = ireq;
+ unsigned int weight;
+
+ if (ireq->iochunk.size > iochunk) {
+ sreq = pcs_ireq_split(ireq, iochunk, 0);
+
+ if (sreq == NULL) {
+ pcs_set_local_error(&ireq->error, PCS_ERR_NOMEM);
+ ireq_complete(ireq);
+ return 0;
+ }
+ }
+
+ sreq->flags &= ~(IREQ_F_RND_WEIGHT | IREQ_F_SEQ);
+ BUG_ON(sreq->flags & IREQ_F_SEQ_READ);
+ if (pcs_flow_sequential(sreq->iochunk.flow)) {
+ weight = cong_roundup(sreq->iochunk.size);
+ sreq->flags |= IREQ_F_SEQ;
+ } else if (!(get_io_tweaks(ireq->cc) & PCS_TWEAK_USE_FLOW_WEIGHT) ||
+ sreq->iochunk.size > 512*1024) {
+ weight = cong_roundup(sreq->iochunk.size);
+ } else {
+ weight = 512*1024;
+ sreq->flags |= IREQ_F_RND_WEIGHT;
+ }
+
+ for (i = 0; i < csl->nsrv; i++)
+ cs_increment_in_flight(csl->cs[i].cslink.cs, weight);
+
+ allot -= weight;
+ cs = csl->cs[0].cslink.cs;
+
+ cslist_get(csl);
+ BUG_ON(sreq->iochunk.csl);
+ sreq->iochunk.csl = csl;
+ pcs_cs_submit(cs, sreq);
+
+ if (ireq == sreq)
+ return 0;
+		/* The window for some of the CSes is closed. Restart processing of the remaining part
+		 * of the request. Note: if the state of the map has changed, it can even fail
+		 * and return to the caller with -1.
+		 */
+ if (allot < 0)
+ goto restart;
+ }
+}
+
+static int pcs_cslist_submit_flush(struct pcs_int_request *ireq, struct pcs_cs_list * csl, int requeue)
+{
+ struct pcs_cs * cs;
+ int i;
+ int congested_idx;
+ int max_excess;
+ int allot = PCS_CS_FLUSH_WEIGHT;
+ struct pcs_msg * msg;
+ struct pcs_cs_iohdr * ioh;
+
+ congested_idx = -1;
+ max_excess = 0;
+
+ for (i = 0; i < csl->nsrv; i++) {
+ int cs_allot;
+
+ cs = csl->cs[i].cslink.cs;
+
+ if (cs_is_blacklisted(cs)) {
+ map_remote_error(ireq->flushreq.map, cs->blacklist_reason, cs->id.val);
+ TRACE("Flush to " MAP_FMT " blocked by blacklist error %d, CS" NODE_FMT,
+ MAP_ARGS(ireq->flushreq.map), cs->blacklist_reason, NODE_ARGS(cs->id));
+ return -1;
+ }
+
+ spin_lock(&cs->lock);
+ cs_cwnd_use_or_lose(cs);
+ cs_allot = cs->eff_cwnd - cs->in_flight;
+ spin_unlock(&cs->lock);
+
+ if (cs_allot < 0) {
+ cs_allot = -cs_allot;
+ if (cs_allot > max_excess) {
+ congested_idx = i;
+ max_excess = cs_allot;
+ }
+ }
+ }
+
+ if (congested_idx >= 0) {
+ int cur_cong_idx = READ_ONCE(csl->cong_index);
+
+ if (cur_cong_idx >= 0 && !requeue &&
+ (READ_ONCE(csl->cs[cur_cong_idx].cslink.cs->cong_queue_len) ||
+ READ_ONCE(csl->cs[cur_cong_idx].cslink.cs->active_list_len)))
+ congested_idx = cur_cong_idx;
+ else
+ WRITE_ONCE(csl->cong_index, congested_idx);
+
+ pcs_cs_cong_enqueue(ireq, csl->cs[congested_idx].cslink.cs);
+ return 0;
+ }
+
+ WRITE_ONCE(csl->cong_index, -1);
+
+ for (i = 0; i < csl->nsrv; i++) {
+ cs = csl->cs[i].cslink.cs;
+ cs_increment_in_flight(cs, allot);
+ }
+
+ cs = csl->cs[0].cslink.cs;
+
+ BUG_ON(ireq->flushreq.csl);
+ cslist_get(csl);
+ ireq->flushreq.csl = csl;
+ ireq->ts_sent = jiffies;
+ ireq->wait_origin.val = 0;
+
+ msg = ireq->flushreq.msg;
+ msg->private2 = ireq;
+
+ ioh = (struct pcs_cs_iohdr *)msg->_inline_buffer;
+
+ if (msg->rpc) {
+ pcs_rpc_put(msg->rpc);
+ msg->rpc = NULL;
+ }
+ pcs_clear_error(&msg->error);
+ msg->timeout = csl->write_timeout;
+
+ pcs_rpc_get_new_xid(cs->rpc->eng, &ioh->hdr.xid);
+ ioh->map_version = csl->version;
+
+ pcs_rpc_call(cs->rpc, msg);
+ return 0;
+}
+
+
+
+int pcs_cslist_submit(struct pcs_int_request *ireq, struct pcs_cs_list *csl, int requeue)
+{
+ BUG_ON(!atomic_read(&csl->refcnt));
+
+ if (ireq->type == PCS_IREQ_FLUSH) {
+ BUG();
+ return pcs_cslist_submit_flush(ireq, csl, requeue);
+ } else if (!ireq->iochunk.direction) {
+ return pcs_cslist_submit_read(ireq, csl, requeue);
+ } else if (ireq->flags & IREQ_F_MAPPED) {
+ BUG();
+ return -EIO;
+ } else {
+ BUG();
+ return pcs_cslist_submit_write(ireq, csl, requeue);
+ }
+ BUG();
+ return -EIO;
+}
+
+void map_submit(struct pcs_map_entry * m, struct pcs_int_request *ireq, int requeue)
+{
+ int direction;
+ int done;
+
+ DTRACE("enter m: " MAP_FMT ", ireq:%p \n", MAP_ARGS(m), ireq);
+ BUG_ON(ireq->type != PCS_IREQ_IOCHUNK && ireq->type != PCS_IREQ_FLUSH);
+
+ if (ireq_is_timed_out(ireq)) {
+ pcs_log(LOG_ERR, "timeout while getting map \"" MAP_FMT "\", last err=%d",
+ MAP_ARGS(m), ireq->error.value);
+ BUG();
+ }
+
+ BUG_ON(pcs_if_error(&ireq->error));
+
+
+ direction = (ireq->type != PCS_IREQ_FLUSH ? ireq->iochunk.direction : 1);
+
+ do {
+ struct pcs_cs_list *csl = NULL;
+
+ spin_lock(&m->lock);
+ if (ireq->type == PCS_IREQ_IOCHUNK) {
+ ireq->iochunk.hbuf.map_version = m->version;
+ ireq->iochunk.hbuf.uid = ireq->iochunk.map->id;
+ }
+ if (!(m->state & (1 << direction))) {
+ spin_unlock(&m->lock);
+ pcs_map_queue_resolve(m, ireq, direction);
+ return;
+ }
+ csl = m->cs_list;
+ if (csl)
+ cslist_get(csl);
+ spin_unlock(&m->lock);
+
+ if (ireq->type != PCS_IREQ_FLUSH && !(ireq->flags & IREQ_F_MAPPED)) {
+ u64 pos = ireq->iochunk.chunk + ireq->iochunk.offset;
+ u64 len = map_chunk_end(m) - pos;
+
+ /*
+			 * For non-variable chunks all alignment should be done
+ * inside pcs_cc_process_ireq_ioreq();
+ */
+ BUG_ON(pos < map_chunk_start(m));
+ BUG_ON(ireq->iochunk.chunk != map_chunk_start(m));
+ BUG_ON(ireq->iochunk.offset != pos - ireq->iochunk.chunk);
+ if (ireq->iochunk.size > len) {
+ struct pcs_int_request * sreq;
+
+ sreq = pcs_ireq_split(ireq, len, 0);
+ if (ireq->iochunk.map) {
+ pcs_map_put(ireq->iochunk.map);
+ ireq->iochunk.map = NULL;
+ }
+ ireq->iochunk.chunk = map_chunk_end(m);
+ ireq->iochunk.offset = 0;
+ pcs_cc_submit(ireq->dentry->cluster, ireq);
+ ireq = sreq;
+ }
+ }
+
+ if (!csl) {
+ if (ireq->type != PCS_IREQ_FLUSH)
+ ireq_handle_hole(ireq);
+ else
+ ireq_complete(ireq);
+ return;
+ }
+
+ if (direction && ireq->type != PCS_IREQ_FLUSH)
+ ireq->dentry->local_mtime = get_real_time_ms();
+
+ done = !pcs_cslist_submit(ireq, csl, requeue);
+ cslist_put(csl);
+ } while (!done);
+}
+
+static int valid_for_truncate(struct pcs_map_entry * m, struct pcs_int_request *ireq)
+{
+	/* This weird test means that the map is valid, but points to a hole. In this case
+	 * truncate is a no-op.
+ */
+ if ((m->state & (PCS_MAP_ERROR|PCS_MAP_RESOLVING|PCS_MAP_NEW|PCS_MAP_READABLE)) ==
+ (PCS_MAP_NEW|PCS_MAP_READABLE))
+ return 1;
+
+ /* If we already have valid map, remember its version
+ * and switch to the next phase: invalidation and requesting
+ * new map.
+ */
+ if (!(m->state & (PCS_MAP_ERROR|PCS_MAP_RESOLVING|PCS_MAP_NEW))) {
+ map_remote_error_nolock(m, PCS_ERR_CSD_STALE_MAP, m->cs_list ? m->cs_list->cs[0].info.id.val : 0);
+ ireq->truncreq.version = m->version;
+ }
+ /* Otherwise lookup valid map first. */
+ return 0;
+}
+
+
+//// TODO: truncate should probably synchronously truncate the local mapping.
+void process_ireq_truncate(struct pcs_int_request *ireq)
+{
+ struct pcs_dentry_info *di = ireq->dentry;
+ struct pcs_map_entry * m;
+ u64 end;
+
+ /* Special case: full truncate */
+ if (ireq->truncreq.offset == 0) {
+ map_truncate_tail(&di->mapping, 0);
+ ireq_complete(ireq);
+ return;
+ }
+
+ m = pcs_find_get_map(di, ireq->truncreq.offset - 1);
+
+ TRACE("process TRUNCATE %llu@" DENTRY_FMT " %x",
+ (unsigned long long)ireq->truncreq.offset, DENTRY_ARGS(di), m ? m->state : -1);
+
+ if (m == NULL) {
+ map_queue_on_limit(ireq);
+ return;
+ }
+ end = map_chunk_end(m);
+ if (end <= ireq->truncreq.offset) {
+ map_truncate_tail(&di->mapping, end);
+ ireq_complete(ireq);
+ return;
+ }
+
+ if (ireq->truncreq.phase == 0) {
+ if (valid_for_truncate(m, ireq)) {
+ map_truncate_tail(&di->mapping, end);
+ ireq_complete(ireq);
+ return;
+ }
+ } else {
+ /* We already had some valid map. Must get new one. */
+
+
+ spin_lock(&m->lock);
+ if ((m->state & (PCS_MAP_ERROR|PCS_MAP_RESOLVING|PCS_MAP_NEW|PCS_MAP_READABLE)) ==
+ (PCS_MAP_NEW|PCS_MAP_READABLE)) {
+
+ spin_unlock(&m->lock);
+ pcs_log(LOG_INFO, "map " MAP_FMT " unexpectedly converted to hole", MAP_ARGS(m));
+ map_truncate_tail(&di->mapping, end);
+ ireq_complete(ireq);
+ return;
+ }
+
+ if (m->state & PCS_MAP_RESOLVING) {
+ list_add_tail(&ireq->list, &m->queue);
+ spin_unlock(&m->lock);
+ return;
+ }
+
+ if (!(m->state & (PCS_MAP_ERROR|PCS_MAP_NEW))) {
+ if (map_version_compare(&m->version, &ireq->truncreq.version) > 0) {
+ spin_unlock(&m->lock);
+ map_truncate_tail(&di->mapping, end);
+ ireq_complete(ireq);
+ return;
+ }
+
+ TRACE("map " MAP_FMT " is not updated yet", MAP_ARGS(m));
+ map_remote_error_nolock(m, PCS_ERR_CSD_STALE_MAP, m->cs_list ? m->cs_list->cs[0].info.id.val : 0);
+
+ }
+ spin_unlock(&m->lock);
+ }
+ pcs_map_queue_resolve(m, ireq, 1);
+}
+
+
+noinline void pcs_mapping_truncate(struct pcs_int_request *ireq, u64 old_size)
+{
+ struct pcs_dentry_info *di = ireq->dentry;
+ u64 new_size = DENTRY_SIZE(di);
+ u64 offset;
+ struct pcs_map_entry * m = NULL;
+ int queue = 0;
+
+ di->local_mtime = get_real_time_ms();
+
+ if (new_size < old_size)
+ pcs_flow_truncate(&di->mapping.ftab, new_size, &di->cluster->maps.ftab);
+
+ if (old_size < new_size)
+ offset = old_size;
+ else
+ offset = new_size;
+
+ ireq->truncreq.offset = offset;
+ ireq->truncreq.phase = 0;
+
+ if (offset == 0) {
+ map_truncate_tail(&di->mapping, offset);
+ ireq_complete(ireq);
+ return;
+ }
+
+ map_truncate_tail(&di->mapping, offset + 1);
+
+ m = pcs_find_get_map(di, offset - 1);
+
+ if (m) {
+ TRACE("mapping truncate %llu->%llu " DENTRY_FMT " %x", (unsigned long long)old_size,
+ (unsigned long long)new_size, DENTRY_ARGS(ireq->dentry), m ? m->state : -1);
+ }
+ if (m && map_chunk_end(m) == offset) {
+ map_truncate_tail(&di->mapping, offset);
+ ireq_complete(ireq);
+ return;
+ }
+
+
+	if (m == NULL) {
+		queue = 1;
+	} else {
+		spin_lock(&m->lock);
+		if (valid_for_truncate(m, ireq))
+			queue = 1;
+		spin_unlock(&m->lock);
+	}
+
+ if (queue) {
+ if (m) {
+ pcs_map_queue_resolve(m, ireq, 1);
+ } else {
+ map_queue_on_limit(ireq);
+ }
+ } else {
+ map_truncate_tail(&di->mapping, map_chunk_end(m));
+ ireq_complete(ireq);
+ }
+
+ if (m)
+ pcs_map_put(m);
+}
+
+static int commit_cs_record(struct pcs_map_entry * m, struct pcs_cs_record * rec,
+ struct pcs_cs_sync_data * sync, u32 lat, int op_type)
+{
+ int dirtify;
+ struct cs_sync_state * srec = &rec->sync;
+ if (sync->ts_net > sync->ts_io)
+ lat -= sync->ts_net;
+ else
+ lat -= sync->ts_io;
+
+ pcs_cs_update_stat(rec->cslink.cs, sync->ts_io, ((int)lat < 0) ? 0 : lat, op_type);
+ cs_update_io_latency(rec->cslink.cs, sync->ts_io);
+
+ /* First: verify integrity sequence. */
+ if (rec->info.integrity_seq != sync->integrity_seq) {
+ /* Now this is possible only if IO was issued and completed
+ * before CS rebooted, but we see the result after.
+ *
+		 * The request is restarted with a new map.
+ */
+ pcs_log(LOG_ERR, MAP_FMT " integrity seq mismatch CS" NODE_FMT " %d != %d, %d",
+ MAP_ARGS(m),
+ NODE_ARGS(rec->info.id),
+ rec->info.integrity_seq, sync->integrity_seq, srec->dirty_integrity);
+ return 1;
+ }
+
+ BUG_ON(srec->dirty_integrity && srec->dirty_integrity != sync->integrity_seq);
+
+ dirtify = (op_type == PCS_CS_WRITE_SYNC_RESP || op_type == PCS_CS_WRITE_RESP);
+	/* The following looks scary; it could be clearer.
+ * The goal is to update sync seq numbers:
+ *
+ * READ/SYNC (!dirtifying):
+ * - sync_epoch/sync_seq advance sync_epoch/seq
+ * WRITE/WRITE_SYNC (dirtifying):
+ * - sync_epoch/sync_seq advance sync_epoch/seq
+ * - sync_epoch/sync_dirty advance dirty_epoch/seq
+ */
+ if (dirtify && sync->sync_dirty) {
+ srec->dirty_integrity = sync->integrity_seq;
+
+ if (srec->dirty_epoch == 0 ||
+ pcs_sync_seq_compare(sync->sync_epoch, srec->dirty_epoch) > 0) {
+ srec->dirty_epoch = sync->sync_epoch;
+ srec->dirty_seq = sync->sync_dirty;
+ } else if (sync->sync_epoch == srec->dirty_epoch &&
+ pcs_sync_seq_compare(sync->sync_dirty, srec->dirty_seq) > 0) {
+ srec->dirty_seq = sync->sync_dirty;
+ }
+ }
+
+ if (srec->sync_epoch == 0 ||
+ pcs_sync_seq_compare(sync->sync_epoch, srec->sync_epoch) > 0) {
+ srec->sync_epoch = sync->sync_epoch;
+ srec->sync_seq = sync->sync_current;
+ } else if (sync->sync_epoch == srec->sync_epoch &&
+ pcs_sync_seq_compare(sync->sync_current, srec->sync_seq) > 0) {
+ srec->sync_seq = sync->sync_current;
+ }
+ return 0;
+}
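+
+/*
+ * Example of the bookkeeping above (illustrative numbers): a WRITE_SYNC
+ * response carrying sync_epoch=7, sync_dirty=15, sync_current=12 advances
+ * dirty_epoch/dirty_seq to 7/15 and sync_epoch/sync_seq to 7/12; the record
+ * then stays dirty (see cs_is_dirty()) until a later response reports
+ * sync_current beyond 15 or a newer sync_epoch. READ and SYNC responses only
+ * advance the sync_* pair.
+ */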
+
+static int commit_one_record(struct pcs_map_entry * m, PCS_NODE_ID_T cs_id,
+ struct pcs_cs_sync_data * sync, u32 lat, int op_type)
+{
+ int err = 0;
+ int i;
+
+ BUG_ON(sync->integrity_seq == 0);
+
+ if (m->cs_list == NULL)
+ return 0;
+
+ pcs_log(LOG_DEBUG5, "sync ["NODE_FMT",%u,%u,%u,%u]", NODE_ARGS(cs_id),
+ sync->integrity_seq, sync->sync_epoch, sync->sync_dirty, sync->sync_current);
+
+ for (i = 0; i < m->cs_list->nsrv; i++) {
+ if (m->cs_list->cs[i].info.id.val == cs_id.val) {
+ err = commit_cs_record(m, &m->cs_list->cs[i], sync, lat, op_type);
+
+			pcs_log(LOG_DEBUG5, "committed ["NODE_FMT",%u/%u,%u/%u,%u/%u]", NODE_ARGS(cs_id),
+ m->cs_list->cs[i].info.integrity_seq,
+ m->cs_list->cs[i].sync.dirty_integrity,
+ m->cs_list->cs[i].sync.dirty_epoch,
+ m->cs_list->cs[i].sync.dirty_seq,
+ m->cs_list->cs[i].sync.sync_epoch,
+ m->cs_list->cs[i].sync.sync_seq);
+ break;
+ }
+ }
+ return err;
+}
+
+static void update_net_latency(struct pcs_cs_list * csl, PCS_NODE_ID_T id,
+ struct pcs_cs_sync_data * sync, unsigned int lat)
+{
+ int i;
+
+ if (sync->ts_net > sync->ts_io)
+ lat -= sync->ts_net;
+ else
+ lat -= sync->ts_io;
+
+ if ((int)lat <= 0)
+ return;
+
+ for (i = 0; i < csl->nsrv; i++) {
+ if (id.val == csl->cs[i].info.id.val) {
+ struct pcs_cs * cs = csl->cs[i].cslink.cs;
+
+ if (i != 0 || !(test_bit(CS_SF_LOCAL, &cs->state)))
+ cs_update_net_latency(csl->cs[i].cslink.cs, lat);
+ break;
+ }
+ }
+}
+
+static inline u32 calc_latency(abs_time_t start)
+{
+ abs_time_t now = jiffies;
+	u64 elapsed = (now > start) ? now - start : 0;
+
+	return elapsed > ~0U ? ~0U : elapsed;
+}
+
+static int commit_sync_info(struct pcs_int_request *req,
+ struct pcs_map_entry * m, struct pcs_cs_list * csl,
+ struct pcs_msg * resp)
+{
+ struct pcs_cs_iohdr *h = (struct pcs_cs_iohdr *)resp->_inline_buffer;
+ int err = 0;
+ unsigned int max_iolat, lat = calc_latency(req->ts_sent);
+
+ err |= commit_one_record(m, resp->rpc->peer_id, &h->sync, lat, h->hdr.type);
+
+	/* Network latency is updated only for the first CS in the chain.
+	 * The results for the others are ignored, which is a pity because we lose
+	 * a lot of information. The problem is that the measured latency is actually
+	 * the sum of the network latencies in both directions, so if we averaged all
+	 * the results we would get not the CS latency but CS latency + average_over_cluster,
+	 * which is not even well defined when we use EWMA averaging (it would be defined
+	 * if we calculated the EWMA latency for each link, otherwise it is the EWMA of a random value).
+	 * If we fix one node (the client in this case), we measure the combined client + CS
+	 * latency, which is good enough to select the least loaded CS for reads.
+ */
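+	/* Rough illustration (comment only): if the whole round trip took
+	 * lat = 10ms and the CS reports ts_io = 6ms spent on local IO, the
+	 * remaining ~4ms is attributed to the client<->CS network path and fed
+	 * into that CS's EWMA estimate; replies where ts_net/ts_io consume the
+	 * whole budget contribute nothing (see update_net_latency()).
+	 */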
+ update_net_latency(csl, resp->rpc->peer_id, &h->sync, lat);
+ max_iolat = h->sync.ts_io;
+
+ if (h->hdr.type != PCS_CS_READ_RESP) {
+ struct pcs_cs_sync_resp * srec;
+ lat = h->sync.ts_net;
+ for (srec = (struct pcs_cs_sync_resp*)(h + 1);
+ (void*)(srec + 1) <= (void*)h + h->hdr.len;
+ srec++) {
+ err |= commit_one_record(m, srec->cs_id, &srec->sync, lat, h->hdr.type);
+ lat = srec->sync.ts_net;
+ if (max_iolat < srec->sync.ts_io)
+ max_iolat = srec->sync.ts_io;
+ }
+ }
+	// temporarily disable logging
+	// cs_log_io_times(req, resp, max_iolat);
+
+ evaluate_dirty_status(m);
+ return err;
+}
+
+void pcs_map_verify_sync_state(struct pcs_dentry_info *di, struct pcs_int_request *ireq, struct pcs_msg * msg)
+{
+ struct pcs_map_entry * m = ireq->iochunk.map;
+ struct pcs_msg * resp = msg->response;
+
+ if (!m)
+ return;
+
+ spin_lock (&m->lock);
+ if (m->cs_list == NULL || (m->state & PCS_MAP_DEAD)) {
+ spin_unlock(&m->lock);
+ return;
+ }
+ if (commit_sync_info(ireq, m, ireq->iochunk.csl, resp)) {
+ pcs_log(LOG_ERR, MAP_FMT " sync integrity error: map retry follows", MAP_ARGS(m));
+
+ msg->error.value = PCS_ERR_CSD_STALE_MAP;
+ msg->error.remote = 1;
+ msg->error.offender = m->cs_list->cs[0].info.id;
+ }
+ spin_unlock(&m->lock);
+
+ if (ireq->iochunk.flow) {
+ struct pcs_int_request * preq = ireq->completion_data.parent;
+
+ pcs_flow_confirm(ireq->iochunk.flow, &ireq->dentry->mapping.ftab,
+ preq->apireq.req->type == PCS_REQ_T_WRITE,
+ preq->apireq.req->pos, preq->apireq.req->size,
+ &ireq->cc->maps.ftab);
+ }
+
+}
+
+void sync_done(struct pcs_msg * msg)
+{
+ struct pcs_int_request * sreq = msg->private;
+ struct pcs_map_entry * m = sreq->flushreq.map;
+ struct pcs_msg * resp = msg->response;
+
+ spin_lock(&m->lock);
+ if (m->state & PCS_MAP_DEAD)
+ goto done;
+ if (!(m->flags & PCS_MAP_DIRTY))
+ goto done;
+
+ if (pcs_if_error(&msg->error)) {
+ pcs_copy_error(&sreq->error, &msg->error);
+ goto done;
+ }
+
+ if (commit_sync_info(sreq, m, sreq->flushreq.csl, resp)) {
+ pcs_log(LOG_ERR, MAP_FMT " sync integrity error: sync retry follows", MAP_ARGS(m));
+
+ sreq->error.remote = 1;
+ sreq->error.value = PCS_ERR_CSD_STALE_MAP;
+ sreq->error.offender = m->cs_list->cs[0].info.id;
+ }
+
+done:
+ spin_unlock(&m->lock);
+ ireq_complete(sreq);
+ return;
+}
+
+static int sync_is_finished(struct pcs_msg * msg, struct pcs_map_entry * m)
+{
+ struct pcs_cs_iohdr * h = (struct pcs_cs_iohdr *)msg->_inline_buffer;
+ struct pcs_cs_sync_resp * srec;
+
+ if (m->cs_list == NULL)
+ return 1;
+
+ for (srec = (struct pcs_cs_sync_resp *)(h + 1);
+ (void*)(srec + 1) <= (void*)h + h->hdr.len;
+ srec++) {
+ int i;
+
+ pcs_log(LOG_DEBUG5, "Checking cs="NODE_FMT" sync=[%d,%d,%d,%d]", NODE_ARGS(srec->cs_id), srec->sync.integrity_seq,
+ srec->sync.sync_epoch,
+ srec->sync.sync_dirty, srec->sync.sync_current);
+
+ for (i = 0; i < m->cs_list->nsrv; i++) {
+ if (m->cs_list->cs[i].info.id.val == srec->cs_id.val) {
+ pcs_log(LOG_DEBUG5, "Checking against sync=[%d,%d,%d,%d,%d]",
+ m->cs_list->cs[i].sync.dirty_integrity,
+ m->cs_list->cs[i].sync.dirty_epoch,
+ m->cs_list->cs[i].sync.dirty_seq,
+ m->cs_list->cs[i].sync.sync_epoch,
+ m->cs_list->cs[i].sync.sync_seq);
+ if (cs_is_dirty(&m->cs_list->cs[i].sync) &&
+ srec->sync.sync_epoch == m->cs_list->cs[i].sync.sync_epoch &&
+ pcs_sync_seq_compare(srec->sync.sync_current, m->cs_list->cs[i].sync.sync_seq) >= 0)
+ return 0;
+ break;
+ }
+ }
+ }
+ return 1;
+}
+
+void process_flush_req(struct pcs_int_request *ireq)
+{
+ struct pcs_map_entry * m = ireq->flushreq.map;
+
+ spin_lock(&m->lock);
+ if (m->state & PCS_MAP_DEAD)
+ goto done;
+
+ TRACE("process FLUSH " MAP_FMT, MAP_ARGS(m));
+
+ if (!(m->flags & PCS_MAP_DIRTY))
+ goto done;
+ if (sync_is_finished(ireq->flushreq.msg, m)) {
+ TRACE("finished");
+ goto done;
+ }
+ spin_unlock(&m->lock);
+ map_submit(m, ireq, 0);
+ return;
+
+done:
+ if (pcs_if_error(&ireq->error)) {
+ TRACE("oops, delete me %d", ireq->error.value);
+ pcs_clear_error(&ireq->error);
+ }
+ ireq_complete(ireq);
+}
+
+static void pcs_flushreq_complete(struct pcs_int_request * sreq)
+{
+ struct pcs_int_request *ireq = sreq->completion_data.parent;
+ struct pcs_map_entry * m = sreq->flushreq.map;
+ struct pcs_cs_iohdr * ioh = (struct pcs_cs_iohdr*)msg_inline_head(sreq->flushreq.msg);
+ int notify_error = 0;
+
+ spin_lock(&m->lock);
+ if (!ireq)
+ m->flags &= ~PCS_MAP_FLUSHING;
+ m->flags &= ~PCS_MAP_DIRTY_GC;
+
+ if (m->state & PCS_MAP_DEAD)
+ goto done;
+ if (!(m->flags & PCS_MAP_DIRTY))
+ goto done;
+
+ if (!pcs_if_error(&sreq->error)) {
+ if (sync_is_finished(sreq->flushreq.msg, m)) {
+ TRACE("finished");
+ goto done_dirty;
+ }
+ sreq->error.value = PCS_ERR_CSD_STALE_MAP;
+ sreq->error.remote = 1;
+ sreq->error.offender = m->cs_list->cs[0].info.id;
+ }
+
+ if (ireq && !pcs_if_error(&ireq->error)) {
+ if (ireq_check_redo(sreq)) {
+ if (ireq_is_timed_out(sreq)) {
+				pcs_log(LOG_ERR, "timeout of flush request on \"" DENTRY_FMT "\" last_err=%u",
+ DENTRY_ARGS(sreq->dentry), sreq->error.value);
+ BUG();
+ }
+ TRACE("restart after flush error %d", sreq->error.value);
+ if (map_version_compare(&ioh->map_version, &m->version) < 0)
+ sreq->flags &= ~IREQ_F_ONCE;
+ spin_unlock(&m->lock);
+
+ map_notify_error(m, sreq, &ioh->map_version, sreq->flushreq.csl);
+ pcs_deaccount_ireq(sreq, &sreq->error);
+ pcs_clear_error(&sreq->error);
+
+ if (!(sreq->flags & IREQ_F_ONCE)) {
+ sreq->flags |= IREQ_F_ONCE;
+ pcs_cc_submit(sreq->cc, sreq);
+ } else
+ ireq_delay(sreq);
+ return;
+ }
+ TRACE("flush error %d", sreq->error.value);
+ pcs_copy_error(&ireq->error, &sreq->error);
+ notify_error = 1;
+ }
+
+done_dirty:
+ if (!ireq)
+ map_sync_work_add(m, pcs_sync_timeout(cc_from_map(m)));
+done:
+ spin_unlock(&m->lock);
+ if (notify_error)
+ map_notify_error(m, sreq, &ioh->map_version, sreq->flushreq.csl);
+
+ pcs_deaccount_ireq(sreq, &sreq->error);
+
+ if (ireq) {
+ if (!pcs_sreq_detach(sreq))
+ ireq_complete(ireq);
+ }
+
+ pcs_free_msg(sreq->flushreq.msg);
+ pcs_map_put(m);
+ ireq_destroy(sreq);
+}
+
+/* Format the sync message (allocated by the caller). Important: this message holds the values
+ * of the sync counters as they are now. If the sync request fails and is retried, the message
+ * is not reallocated and the sync counters remain the same.
+ */
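+/* Resulting wire layout (reference sketch derived from the code below):
+ *
+ *	struct pcs_cs_iohdr		hdr.type = PCS_CS_SYNC_REQ, offset/size = 0
+ *	struct pcs_cs_sync_resp[]	one entry per CS record that is dirty,
+ *					carrying the dirty_* counters captured now
+ *
+ * hdr.len (and msg->size) covers the header plus all appended entries.
+ */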
+static void prepare_map_flush_msg(struct pcs_map_entry * m, struct pcs_int_request * sreq, struct pcs_msg * msg)
+{
+ struct pcs_cs_iohdr * ioh;
+ struct pcs_cs_sync_resp * arr;
+
+ assert_spin_locked(&m->lock);
+
+ ioh = (struct pcs_cs_iohdr *)msg->_inline_buffer;
+ arr = (struct pcs_cs_sync_resp *)(ioh + 1);
+
+ ioh->hdr.len = sizeof(struct pcs_cs_iohdr);
+ ioh->hdr.type = PCS_CS_SYNC_REQ;
+ memset(&ioh->sync, 0, sizeof(ioh->sync));
+ ioh->offset = 0;
+ ioh->size = 0;
+ ioh->_reserved = 0;
+ ioh->sync.misc = PCS_CS_IO_SEQ;
+
+ ioh->map_version = m->version;
+ ioh->uid = m->id;
+ ioh->iocontext = (u32)pcs_dentry_from_map(m)->fileinfo.attr.id;
+
+ if (m->cs_list) {
+ int i;
+
+ for (i = 0; i < m->cs_list->nsrv; i++) {
+ struct pcs_cs_record * rec = m->cs_list->cs + i;
+ if (cs_is_dirty(&rec->sync)) {
+ arr->cs_id = rec->info.id;
+ arr->sync.integrity_seq = rec->sync.dirty_integrity;
+ arr->sync.sync_epoch = rec->sync.dirty_epoch;
+ arr->sync.sync_dirty = rec->sync.dirty_seq;
+ arr->sync.sync_current = rec->sync.dirty_seq;
+ arr->sync.misc = 0;
+ arr->sync.ts_io = 0;
+ arr->sync.ts_net = 0;
+ arr->sync._reserved = 0;
+ ioh->hdr.len += sizeof(struct pcs_cs_sync_resp);
+ pcs_log(LOG_DEBUG5, "fill sync "NODE_FMT" [%d,%d,%d,%d]", NODE_ARGS(arr->cs_id),
+ arr->sync.integrity_seq, arr->sync.sync_epoch,
+ arr->sync.sync_dirty, arr->sync.sync_current);
+ arr++;
+ }
+ }
+ }
+ msg->size = ioh->hdr.len;
+ msg->private = sreq;
+ msg->done = sync_done;
+}
+
+static bool valid_for_flush(struct pcs_map_entry *m)
+{
+ if (m->state & PCS_MAP_DEAD)
+ return false;
+
+ if (!(m->flags & PCS_MAP_DIRTY))
+ return false;
+ if (m->flags & PCS_MAP_FLUSHING)
+ return false;
+
+ return true;
+}
+
+static int prepare_map_flush_ireq(struct pcs_map_entry *m, struct pcs_int_request **sreqp)
+{
+ struct pcs_dentry_info *de;
+ struct pcs_cs_list *cslist;
+ struct pcs_int_request *sreq;
+ struct pcs_msg * msg;
+
+ spin_lock(&m->lock);
+ if (!valid_for_flush(m)) {
+ spin_unlock(&m->lock);
+ return 0;
+ }
+
+ if (!m->cs_list || !m->cs_list->nsrv) {
+		/* TODO: userspace allows (cslist->nsrv == 0), but it does not make sense */
+ WARN_ON_ONCE(1);
+ spin_unlock(&m->lock);
+ return 0;
+ }
+
+ cslist = m->cs_list;
+ cslist_get(cslist);
+ /* TODO: Need to grab reference to de? */
+ de = pcs_dentry_from_map(m);
+ spin_unlock(&m->lock);
+
+ sreq = ireq_alloc(de);
+ if (!sreq)
+ goto err_cslist;
+
+ msg = pcs_rpc_alloc_output_msg(sizeof(struct pcs_cs_iohdr) +
+ cslist->nsrv * sizeof(struct pcs_cs_sync_resp));
+ if (!msg)
+ goto err_ireq;
+
+	/* All resources are allocated, recheck the map state */
+	spin_lock(&m->lock);
+	cslist_put(cslist);
+	if (!valid_for_flush(m) || m->cs_list != cslist) {
+		spin_unlock(&m->lock);
+		pcs_free_msg(msg);
+		ireq_destroy(sreq);
+		return 0;
+	}
+ prepare_map_flush_msg(m, sreq, msg);
+ sreq->type = PCS_IREQ_FLUSH;
+ sreq->ts = jiffies;
+ sreq->completion_data.parent = NULL;
+ sreq->flushreq.map = m;
+ sreq->flushreq.csl = NULL;
+ sreq->complete_cb = pcs_flushreq_complete;
+ sreq->flushreq.msg = msg;
+ TRACE("timed FLUSH " MAP_FMT, MAP_ARGS(m));
+ m->flags |= PCS_MAP_FLUSHING;
+ __pcs_map_get(m);
+ spin_unlock(&m->lock);
+ *sreqp = sreq;
+ return 0;
+
+err_ireq:
+ ireq_destroy(sreq);
+err_cslist:
+ cslist_put(cslist);
+ return -ENOMEM;
+}
+
+/* The timer injects a sync request for a dirty chunk when the sync timeout expires.
+ * If the request fails, we just retry later.
+ */
+static void sync_timer_work(struct work_struct *w)
+{
+ struct pcs_map_entry *m = container_of(w, struct pcs_map_entry, sync_work.work);
+ struct pcs_int_request * sreq = NULL;
+ int err;
+
+ err = prepare_map_flush_ireq(m, &sreq);
+ if (err) {
+ map_sync_work_add(m, HZ);
+ } else {
+ if (sreq)
+ map_submit(m, sreq, 0);
+ }
+ /* Counter part from map_sync_work_add */
+ pcs_map_put(m);
+}
+
+
+/* Handler for the PCS_REQ_T_SYNC API IO request. It scans through the current maps
+ * and constructs an internal subrequest for each chunk which is dirty at the moment.
+ * The current sync seq numbers are stored in the subrequest right away, so that future
+ * dirtifying writes will not delay execution of this request.
+ *
+ * XXX we can issue a lot of subrequests here: one per dirty chunk.
+ */
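+/* Example (comment only, assuming 64M chunks, i.e. DENTRY_CHUNK_SIZE_BITS == 26):
+ * a sync request covering pos = 100M, size = 30M spans chunk indices 1 and 2, so
+ * at most two flush subrequests are generated here, and only for the chunks that
+ * are actually dirty at the moment.
+ */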
+void map_inject_flush_req(struct pcs_int_request *ireq)
+{
+ struct pcs_dentry_info *di = ireq->dentry;
+ struct list_head ireq_list;
+ unsigned long idx, end_idx;
+ u64 end;
+ struct pcs_map_entry *maps[MAP_BATCH];
+ int nr_maps;
+
+ if (di->fileinfo.sys.map_type != PCS_MAP_PLAIN ||
+ di->fileinfo.sys.stripe_depth != 1) {
+ pcs_log(LOG_ERR, "bad map_type");
+ pcs_set_local_error(&ireq->error, PCS_ERR_PROTOCOL);
+ ireq_complete(ireq);
+ return;
+ }
+
+ atomic_set(&ireq->iocount, 1);
+ INIT_LIST_HEAD(&ireq_list);
+
+ idx = ireq->apireq.req->pos >> DENTRY_CHUNK_SIZE_BITS(di);
+	end = ireq->apireq.req->pos + ireq->apireq.req->size;
+	if (end <= ireq->apireq.req->pos)
+		end = ~0ULL;
+ end_idx = end >> DENTRY_CHUNK_SIZE_BITS(di);
+
+ do {
+ int i;
+
+ rcu_read_lock();
+ /* TODO !!!! use radix tree tags for DIRTY flags */
+ nr_maps = radix_tree_gang_lookup(&di->mapping.map_tree,
+ (void **)maps, idx, MAP_BATCH);
+
+ for (i = 0; i < nr_maps; i++) {
+ struct pcs_map_entry *m = maps[i];
+
+ idx = maps[i]->index;
+ if (idx > end_idx)
+ break;
+
+ spin_lock(&m->lock);
+ if (!(m->flags & PCS_MAP_DIRTY) || !pcs_map_get_locked(m))
+ maps[i] = NULL;
+ spin_unlock(&m->lock);
+
+ }
+ rcu_read_unlock();
+ for (i = 0; i < nr_maps; i++) {
+ struct pcs_int_request * sreq = NULL;
+ int err = 0;
+
+ if (idx > end_idx)
+ break;
+ if (!maps[i])
+ continue;
+ err = prepare_map_flush_ireq(maps[i], &sreq);
+ pcs_map_put(maps[i]);
+ if (err) {
+ pcs_set_local_error(&ireq->error, PCS_ERR_NOMEM);
+ break;
+ }
+ /* Request not prepared, so sync is not required */
+ if (!sreq)
+ continue;
+ pcs_sreq_attach(sreq, ireq);
+ list_add_tail(&sreq->list, &ireq_list);
+ }
+ idx++;
+ } while (nr_maps && idx < end_idx + 1);
+
+ pcs_cc_requeue(ireq->dentry->cluster, &ireq_list);
+
+ if (atomic_dec_and_test(&ireq->iocount))
+ ireq_complete(ireq);
+}
diff --git a/fs/fuse/kio/pcs/pcs_map.h b/fs/fuse/kio/pcs/pcs_map.h
new file mode 100644
index 000000000000..754e0f177d46
--- /dev/null
+++ b/fs/fuse/kio/pcs/pcs_map.h
@@ -0,0 +1,264 @@
+#ifndef _PCS_MAP_H_
+#define _PCS_MAP_H_ 1
+
+#include "pcs_client_types.h"
+#include "pcs_mds_prot.h"
+#include "pcs_flow_detect.h"
+#include "log.h"
+
+struct pcs_dentry_info;
+struct pcs_int_request;
+
+#define PCS_MAP_LIMIT 4096
+
+#define PCS_SYNC_TIMEOUT (20 * HZ)
+
+#define PCS_REPLICATION_BLACKLIST_TIMEOUT HZ
+
+//// TODO:
+#define PCS_MAP_MIN_REBALANCE_TIMEOUT (HZ / 5)
+#define PCS_MAP_MAX_REBALANCE_TIMEOUT (60 * HZ)
+
+#define PCS_TWEAK_REBALANCE_ALWAYS 1
+#define PCS_TWEAK_IGNORE_SEQUENTIAL 2
+#define PCS_TWEAK_USE_FLOW_LOAD 4
+#define PCS_TWEAK_USE_FLOW_WEIGHT 8
+
+struct pcs_cs_link
+{
+ struct pcs_cs * cs;
+ int index;
+ int addr_serno;
+ struct list_head link; /* Link in list of maps routed via cs,
+ * head is cs->map_list */
+};
+
+/*
+ * PCS_MAP_DEAD - mapping is under destruction
+ * PCS_MAP_NEW - version is invalid
+ * PCS_MAP_READABLE - read IO requests can be sent using this map.
+ * PCS_MAP_WRITEABLE - read/write IO requests can be sent using this map.
+ * PCS_MAP_RESOLVING - map is under resolution. If PCS_MAP_WRITEABLE/READABLE
+ * PCS_MAP_ERROR - an error occurred while communicating with a CS; the map requires revalidation.
+ *		   The version is valid, but will most likely be obsoleted.
+ */
+enum
+{
+ PCS_MAP_READABLE = 1,
+ PCS_MAP_WRITEABLE = 2,
+ PCS_MAP_RESOLVING = 4,
+ PCS_MAP_ERROR = 8,
+ PCS_MAP_NEW = 0x10,
+ PCS_MAP_DEAD = 0x20,
+ PCS_MAP_EOF = 0x40,
+};
+
+enum
+{
+ PCS_MAP_DIRTY = 1,
+ PCS_MAP_FLUSHING = 2,
+ PCS_MAP_DIRTY_GC = 4,
+ PCS_MAP_CLIENT_SIZE = 8, /* chunk size is controlled by client */
+ PCS_MAP_CLIENT_ALLOC = 0x10, /* chunk allocation is controlled by client */
+ PCS_MAP_CLIENT_PSIZE = 0x20, /* physical size of chunk on CS must be transmitted to MDS */
+};
+
+struct cs_sync_state
+{
+ PCS_INTEGRITY_SEQ_T dirty_integrity;
+ PCS_SYNC_SEQ_T dirty_epoch;
+ PCS_SYNC_SEQ_T dirty_seq;
+ PCS_SYNC_SEQ_T sync_epoch;
+ PCS_SYNC_SEQ_T sync_seq;
+};
+
+struct pcs_cs_record
+{
+ struct pcs_cs_info info;
+ struct cs_sync_state sync;
+ struct pcs_cs_link cslink;
+};
+
+struct pcs_cs_list
+{
+ struct pcs_map_entry *map;
+ atomic_t refcnt;
+ atomic_t seq_read_in_flight;
+ int read_index; /* volatile read hint */
+ int cong_index; /* volatile cong hint */
+ unsigned long blacklist; /* Atomic bit field */
+ abs_time_t blacklist_expires; /* volatile blacklist stamp */
+ abs_time_t select_stamp; /* volatile read hint stamp */
+	/* members below are immutable across the cslist lifetime */
+#define CSL_FL_HAS_LOCAL 1
+ unsigned int flags;
+ int read_timeout;
+ int write_timeout;
+ int nsrv;
+	PCS_MAP_VERSION_T version;	/* version inherited from map */
+ struct pcs_cs_record cs[0];
+};
+
+/* TODO, LOCKING!!!!!
+ * the only immutable values are id and
+ */
+struct pcs_map_entry
+{
+ unsigned long index;
+ union {
+ struct list_head lru_link;
+ struct rcu_head rcu;
+ };
+ struct pcs_mapping *mapping;
+ struct pcs_map_set *maps;
+
+ spinlock_t lock;
+ int state;
+ int flags;
+ atomic_t __refcnt;
+ u16 mds_flags;
+ u64 res_offset;
+
+ u32 chunk_psize;
+
+ PCS_MAP_VERSION_T version;
+ PCS_CHUNK_UID_T id;
+
+ pcs_error_t iofailure;
+ unsigned long long error_tstamp;
+
+ struct delayed_work sync_work;
+ struct pcs_cs_list *cs_list;
+ struct list_head queue;
+};
+
+static inline u64 map_chunk_start(struct pcs_map_entry *m)
+{
+ return m->index << m->mapping->chunk_size_bits;
+}
+
+static inline u64 map_chunk_end(struct pcs_map_entry *m)
+{
+	return (m->index + 1) << m->mapping->chunk_size_bits;
+}
+
+static inline struct pcs_dentry_info * pcs_dentry_from_mapping(struct pcs_mapping * mapping)
+{
+ return container_of(mapping, struct pcs_dentry_info, mapping);
+}
+
+static inline struct pcs_dentry_info * pcs_dentry_from_map(struct pcs_map_entry * m)
+{
+ return pcs_dentry_from_mapping(m->mapping);
+}
+
+static inline struct pcs_cluster_core *cc_from_map(struct pcs_map_entry * m)
+{
+ return pcs_dentry_from_mapping(m->mapping)->cluster;
+}
+
+void pcs_mapping_init(struct pcs_cluster_core *cc, struct pcs_mapping * mapping);
+void pcs_mapping_open(struct pcs_mapping * mapping);
+void pcs_mapping_invalidate(struct pcs_mapping * mapping);
+void pcs_mapping_deinit(struct pcs_mapping * mapping);
+void pcs_mapping_truncate(struct pcs_int_request *ireq, u64 old_size);
+void process_ireq_truncate(struct pcs_int_request *ireq);
+
+struct pcs_map_entry * pcs_find_get_map(struct pcs_dentry_info * de, u64 chunk);
+void map_submit(struct pcs_map_entry * m, struct pcs_int_request *ireq, int requeue);
+void map_notify_iochunk_error(struct pcs_int_request *ireq);
+void map_notify_soft_error(struct pcs_int_request *ireq);
+void __pcs_map_put(struct pcs_map_entry *m);
+
+void pcs_deaccount_ireq(struct pcs_int_request *ireq, pcs_error_t *);
+
+void cs_blacklist(struct pcs_cs * cs, int error, char * reason);
+void cs_whitelist(struct pcs_cs * cs, char * reason);
+void pcs_map_notify_addr_change(struct pcs_cs * cs);
+void pcs_map_force_reselect(struct pcs_cs * cs);
+
+struct pcs_msg;
+void pcs_map_verify_sync_state(struct pcs_dentry_info * de, struct pcs_int_request *ireq, struct pcs_msg *);
+void map_inject_flush_req(struct pcs_int_request *ireq);
+void process_flush_req(struct pcs_int_request *ireq);
+int map_check_limit(struct pcs_map_entry * map, struct pcs_int_request *ireq);
+int pcs_cslist_submit(struct pcs_int_request *ireq, struct pcs_cs_list *csl, int requeue);
+struct pcs_int_request * pcs_ireq_split(struct pcs_int_request *ireq, unsigned int iochunk, int noalign);
+int fuse_map_resolve(struct pcs_map_entry * m, int direction);
+struct pcs_ioc_getmap;
+void pcs_map_complete(struct pcs_map_entry *m, struct pcs_ioc_getmap *omap);
+int pcs_map_encode_req(struct pcs_map_entry*m, struct pcs_ioc_getmap *map, int direction);
+void map_truncate_tail(struct pcs_mapping *mapping, u64 offset);
+unsigned long pcs_map_shrink_scan(struct shrinker *, struct shrink_control *sc);
+
+#define MAP_FMT "(%p) 0x%lld s:%x" DENTRY_FMT
+#define MAP_ARGS(m) (m), (long long)(m)->index, (m)->state, DENTRY_ARGS(pcs_dentry_from_map((m)))
+
+static inline void pcs_map_put(struct pcs_map_entry *m)
+{
+	TRACE("m(%p)->index:%lu ref:%d\n", m, m->index, atomic_read(&m->__refcnt));
+
+ BUG_ON(atomic_read(&m->__refcnt) <= 0);
+ if (atomic_dec_and_lock(&m->__refcnt, &m->lock))
+ __pcs_map_put(m);
+}
+
+static inline void map_add_lru(struct pcs_map_entry *m)
+{
+ assert_spin_locked(&m->lock);
+ if (m->flags & PCS_MAP_DIRTY)
+ list_lru_add(&m->maps->dirty_lru, &m->lru_link);
+ else
+ list_lru_add(&m->maps->lru, &m->lru_link);
+}
+
+static inline void map_del_lru(struct pcs_map_entry *m)
+{
+ assert_spin_locked(&m->lock);
+ if (m->flags & PCS_MAP_DIRTY)
+ list_lru_del(&m->maps->dirty_lru, &m->lru_link);
+ else
+ list_lru_del(&m->maps->lru, &m->lru_link);
+}
+
+static inline void pcs_map_put_locked(struct pcs_map_entry *m)
+{
+	TRACE("m(%p)->index:%lu ref:%d\n", m, m->index, atomic_read(&m->__refcnt));
+
+ BUG_ON(atomic_read(&m->__refcnt) <= 0);
+ BUG_ON(m->state & PCS_MAP_DEAD);
+
+ if (atomic_dec_and_test(&m->__refcnt))
+ map_add_lru(m);
+}
+
+static inline bool pcs_map_get_locked(struct pcs_map_entry *m)
+{
+ TRACE( MAP_FMT " refcnt:%d\n", MAP_ARGS(m), atomic_read(&m->__refcnt));
+ BUG_ON(atomic_read(&m->__refcnt) < 0);
+
+ if (m->state & PCS_MAP_DEAD) {
+ spin_unlock(&m->lock);
+ return 0;
+ }
+
+ if (atomic_inc_return(&m->__refcnt) == 1)
+ map_del_lru(m);
+
+ return 1;
+}
+
+static inline struct pcs_map_entry *pcs_map_get(struct pcs_map_entry *m)
+{
+ spin_lock(&m->lock);
+ if (!pcs_map_get_locked(m)) {
+ spin_unlock(&m->lock);
+ m = NULL;
+ } else
+ spin_unlock(&m->lock);
+
+ return m;
+}
+
+
+#endif /* _PCS_MAP_H_ */
diff --git a/fs/fuse/kio/pcs/pcs_mds_prot.h b/fs/fuse/kio/pcs/pcs_mds_prot.h
new file mode 100644
index 000000000000..80c20fde1537
--- /dev/null
+++ b/fs/fuse/kio/pcs/pcs_mds_prot.h
@@ -0,0 +1,1335 @@
+#ifndef _PCS_MDS_PROT_H_
+#define _PCS_MDS_PROT_H_ 1
+
+#include "pcs_rpc_prot.h"
+
+
+/* This file contains client interface to MDS.
+ */
+
+/* ---- limits */
+
+#define PCS_MDS_MAX_MSG_SIZE	0x21000	/* So we can transfer a fuse request in a single message */
+#define PCS_MDS_MAX_RESP_MSG_SIZE PCS_MDS_MAX_MSG_SIZE
+#define PCS_MDS_MAX_PATH 0x10000
+
+/* ---- basic structures */
+
+/* The generation value represents the last paxos commit number. It is sent back and forth
+ * between the client and the MDS to ensure the MDS already has all commits necessary to process
+ * the client request. This approach guarantees consistency even if several MDSes are processing
+ * client requests in parallel.
+ */
+
+typedef u64 PCS_MDS_GEN_T;
+
+#define PCS_MDS_GEN_UNDEFINED 0
+
+/* Returns -1 if v1 is older than v2, 0 otherwise (undefined generations always compare equal) */
+static inline int mds_gen_compare(PCS_MDS_GEN_T v1, PCS_MDS_GEN_T v2)
+{
+ if (v1 == PCS_MDS_GEN_UNDEFINED || v2 == PCS_MDS_GEN_UNDEFINED)
+ return 0;
+ if ((s64)(v1 - v2) < 0)
+ return -1;
+ return 0;
+}
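+/* Usage sketch (comment only): a client may remember the largest mds_gen seen
+ * so far and treat a reply whose header satisfies
+ *
+ *	mds_gen_compare(hdr->mds_gen, last_seen_gen) < 0
+ *
+ * as coming from an MDS that lags behind commits already observed. Note the
+ * helper only distinguishes "older" (-1) from "not older" (0); it never
+ * returns +1.
+ */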
+
+/* Common header of all messages */
+struct pcs_mds_hdr
+{
+ struct pcs_rpc_hdr h;
+ PCS_MDS_GEN_T mds_gen;
+ PCS_CONFIG_SEQ_T cfg_version;
+ u32 cluster_version;
+ u32 flags; /* PCS_MDS_F_XXX */
+ u32 reserved;
+} __attribute__((aligned(8)));
+
+/* Request header flags */
+#define PCS_MDS_F_IS_MASTER 1 /* Set on reply if server is master */
+#define PCS_MDS_F_NEED_MASTER 2 /* Request will fail with PCS_ERR_MDS_NOT_MASTER error if server is not master */
+#define PCS_MDS_F_CLNT_VERSION 0x80 /* Client supply its version in the message */
+/* Check that the client version (passed in cluster_version) is not less than the cluster version.
+ * Returns PCS_ERR_CLNT_VERSION otherwise. */
+#define PCS_MDS_F_CHK_VERSION 0x100
+
+/*
+ * CS information
+ */
+
+typedef u16 pcs_cs_io_prio_t;
+typedef u8 pcs_cs_net_prio_t;
+
+/* CS info flags */
+enum {
+ CS_FL_LOCAL = 1, /* CS is on the same host as the client */
+ CS_FL_LOCAL_SOCK = 2, /* CS listens on local socket */
+ CS_FL_INACTIVE = 0x10, /* CS is not sending pings for some time */
+ CS_FL_REPLICATING = 0x20, /* This CS is replicating this map */
+ CS_FL_FAILED = 0x40, /* This CS has failed */
+ CS_FL_ROLE = 0xFF00,/* Role of this CS in raid array, 0..depth-1 are data chunks, the rest are syndrome */
+ CS_FL_ROLE_LOG = 8,
+};
+
+#define CS_FL_ROLE_GET(flags) (((flags) & CS_FL_ROLE) >> CS_FL_ROLE_LOG)
+#define CS_FL_ROLE_FLAGS(role) (CS_FL_ROLE & ((role) << CS_FL_ROLE_LOG))
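+/* Example (comment only): a CS holding data role 3 in a RAID-encoded file
+ * carries CS_FL_ROLE_FLAGS(3) == 0x300 in its flags, and
+ * CS_FL_ROLE_GET(flags) recovers 3 back from it.
+ */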
+
+struct pcs_cs_info {
+ /* CS node id */
+ PCS_NODE_ID_T id;
+ /* Integrity sequence number updated every time the CS restarts without properly flushing all client's data */
+ PCS_INTEGRITY_SEQ_T integrity_seq;
+ /* Access priority (higher values are preferable) based on the IO activity, 0 means unknown */
+ pcs_cs_io_prio_t io_prio;
+ /* Network priority (higher values are preferable) based on the network distance, 0 means unknown */
+ pcs_cs_net_prio_t net_prio;
+ /* QoS level of this CS (higher values are preferable) */
+ u8 qos;
+ /* Flags (CS_FL_XXX) */
+ u32 flags;
+ u32 reserved;
+ /* Primary network address */
+ PCS_NET_ADDR_T addr;
+} __attribute__((aligned(8)));
+
+struct pcs_cs_addr_info
+{
+ PCS_NODE_ID_T id;
+ PCS_INTEGRITY_SEQ_T integrity_seq;
+ u32 naddr;
+ PCS_NET_ADDR_T addr[1];
+} __attribute__((aligned(8)));
+
+/* ---- connection request
+ * The following structure serves as a payload for RPC connect messages to deliver the MDS server list to the client.
+ */
+
+#define PCS_MDS_CONNECT_PAYLOAD PCS_RPC_APP_PAYLOAD_BASE
+
+struct pcs_mds_node_info {
+ PCS_NODE_ID_T id;
+ PCS_NET_ADDR_T addr;
+} __attribute__((aligned(8)));
+
+struct pcs_mds_conn_payload
+{
+ PCS_MDS_GEN_T mds_gen; /* The last commit sequence number */
+ PCS_MASTER_GENID_T mds_master_ver; /* The mds epoch number (see master field of PCS_MAP_VERSION_T) */
+ u16 mds_list_len; /* The number of MDSes in list */
+ s16 mds_master_idx; /* The index of the master in the list (negative means no master is known) */
+ struct pcs_mds_node_info mds_list[1]; /* The list of MDS */
+} __attribute__((aligned(8)));
+
+/* ---- chunk server resolution request/response
+ * Client issues it to resolve server ID to network address
+ * The message is the same for request and response
+ */
+
+#define PCS_MDS_CS_RESOLVE_REQ (PCS_RPC_MDS_CLIENT_BASE + 0x20)
+#define PCS_MDS_CS_RESOLVE_RESP (PCS_MDS_CS_RESOLVE_REQ | PCS_RPC_DIRECTION)
+
+struct pcs_mds_cs_resolve_msg
+{
+ struct pcs_mds_hdr hdr;
+ struct pcs_cs_addr_info info; /* in/out */
+} __attribute__((aligned(8)));
+
+/* ---- lease requests
+ * A lease provides the mechanism for mutual exclusion of operations referencing a particular name. The name for
+ * which the lease is requested may or may not refer to an existing file. Getting an exclusive lease for a not yet
+ * existing file is required to ensure exclusive file creation semantics.
+ *
+ * Once the lease is granted it must be updated periodically by client alive requests and ultimately released. Failure
+ * to release an exclusive lease will have a significant performance impact since the MDS will have to stop the
+ * corresponding IO operations if the file is later accessed by another client.
+ *
+ * The message type is pcs_mds_lease_msg (same for request and response). If the lease cannot be acquired,
+ * pcs_rpc_error_resp will be returned.
+ */
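+/* Typical flow (comment only, derived from the flags below): the client takes
+ * PCS_LEASE_EXCL before create/rename/truncate/resize/write, keeps it alive
+ * periodically with PCS_LEASE_ALIVE (or PCS_LEASE_REFRESH for a single name)
+ * and finally releases it by sending PCS_LEASE_NONE for the same name.
+ */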
+
+#define PCS_MDS_LEASE_REQ (PCS_RPC_MDS_CLIENT_BASE + 2)
+#define PCS_MDS_LEASE_RESP (PCS_MDS_LEASE_REQ | PCS_RPC_DIRECTION)
+
+/* Lease flags. */
+enum
+{
+/* Release lease if being held. */
+ PCS_LEASE_NONE = 0,
+/* Shared lease. May be acquired for reading (not mandatory though). */
+ PCS_LEASE_SHARED,
+/* Exclusive lease. Mandatory for file creation, deletion, rename, truncation, resizing and write access. */
+ PCS_LEASE_EXCL,
+/* Lease type mask */
+ PCS_LEASE_TYPE_MASK = PCS_LEASE_SHARED|PCS_LEASE_EXCL,
+/* Just refresh the lease. Return an error if the lease didn't exist prior to the call. */
+ PCS_LEASE_REFRESH = 0x10,
+/* Use timeout from the message instead of the system-wide. */
+ PCS_LEASE_CUSTOM_TOUT = 0x20,
+/* Update all leases granted to the client. The name argument is ignored. If set, no other flags are allowed. */
+ PCS_LEASE_ALIVE = 0x100,
+/* Release all leases granted to the client. The name argument is ignored. */
+ PCS_LEASE_DROP_ALL = 0x200,
+/* Query file existence. Just saves one file message in some common use cases. */
+ PCS_LEASE_QUERY_FILE = 0x1000,
+/* Update file modification time */
+ PCS_LEASE_UP_FILE_MTIME = 0x2000,
+/* Enforce strict path checking on file lookup.
+ * If set, an attempt to look up a file with a directory object missing in the path will fail with PCS_ERR_NOT_FOUND.
+ */
+ PCS_LEASE_POSIX_PATH = 0x10000,
+/* The following bits are reserved, they can't be set by the client. */
+ PCS_LEASE_RESERVED_ = 0xff000000,
+};
+
+/* Result flags */
+enum
+{
+ PCS_LRES_GRANTED = 0x1,
+ PCS_LRES_RELEASED = 0x2,
+/* File exists flag. The file existence is being checked if PCS_LEASE_QUERY_FILE is set on input.
+ * If the flag is set the file_id is valid on output.
+ */
+ PCS_LRES_FILE_EXISTS = 0x100,
+/* The lease ID is returned (for compatibility with old code) */
+ PCS_LRES_ID_VALID = 0x200,
+};
+
+struct pcs_mds_lease_msg
+{
+ struct pcs_mds_hdr hdr;
+ u32 flags; /* request flags */
+ u32 result; /* result flags */
+ u32 tout; /* Lease expiration timeout (in milliseconds) on output.
+ * May be specified on input with PCS_LEASE_CUSTOM_TOUT flag.
+ * Client may use custom timeout to create lease with shorter lifetime than
+ * the default one.
+ */
+ u32 reserved;
+ struct pcs_mds_fileinfo finfo; /* file info (valid on output if PCS_LRES_FILE_EXISTS result flag is set) */
+ union {
+ PCS_FILE_ID_T root; /* root dir ID on input */
+ PCS_FILE_ID_T lease_id; /* lease inode id on output */
+ };
+ struct pcs_path name; /* path relative to the root dir */
+} __attribute__((aligned(8)));
+
+/*
+ * Refresh the list of leases identified by their IDs. The request message type is struct pcs_mds_lease_refresh_msg.
+ * The request will always succeed returning just pcs_mds_hdr.
+ */
+
+#define PCS_MDS_LEASE_REFRESH_REQ (PCS_RPC_MDS_CLIENT_BASE + 10)
+#define PCS_MDS_LEASE_REFRESH_RESP (PCS_MDS_LEASE_REFRESH_REQ | PCS_RPC_DIRECTION)
+
+struct pcs_mds_lease_refresh_msg
+{
+ struct pcs_mds_hdr hdr;
+ u64 reserved;
+ u32 nfailed; /* The number of leases that were failed to refresh */
+ u32 nleases; /* The number of lease ID that follows */
+ PCS_FILE_ID_T lease_id[0]; /* The array of lease ID to refresh */
+} __attribute__((aligned(8)));
+
+/* ---- file request
+ * Supports file create, rename, delete and query operations.
+ * The file model assumes that every file has a single name as well as a fixed-length ID assigned to it by the MDS itself. The file create
+ * and rename operations are made immune to MDS crashes so they can be safely restarted by the client. The MDS uses the client ID to detect
+ * restarted operations, so the client must ensure its uniqueness.
+ *
+ * The file attributes are filled on output whenever the file is referenced. The replication and optionally the size (with the PCS_FFL_RESIZE
+ * flag) attributes may be used on input as well. The operation to be performed is defined by the combination of the op and flags fields.
+ *
+ * The message type is pcs_mds_file_msg (same for request and response). On failure pcs_rpc_error_resp will be returned.
+ */
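+/* Example (comment only): exclusive file creation amounts to taking a
+ * PCS_LEASE_EXCL lease on the name, verifying non-existence via
+ * PCS_LEASE_QUERY_FILE, then sending op = PCS_FOP_RESOLVE with
+ * flags = PCS_FFL_CREATE; on success PCS_FRES_FILE_CREATED is set in result.
+ */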
+
+#define PCS_MDS_FILE_REQ (PCS_RPC_MDS_CLIENT_BASE + 4)
+#define PCS_MDS_FILE_RESP (PCS_MDS_FILE_REQ | PCS_RPC_DIRECTION)
+
+/* File map type (storage type) */
+enum
+{
+ PCS_MAP_PLAIN = 0, /* Plain replicas */
+ PCS_MAP_RAID6 = 1, /* RAID6 encoded replicas */
+ PCS_MAP_RS = 2, /* Reed-Solomon encoded replicas */
+ PCS_MAP_PLAIN_LOGSTREAM = 3, /* PCS_MAP_PLAIN, but LOGSTREAM is to be used */
+ /* Combined map types are implemented by the client as a collection of files placed in
+ * the container directory - see PCS_FATTR_CONTAINER.
+ */
+ PCS_MAP_COMBINED = 0x80,
+ PCS_MAP_LS = PCS_MAP_COMBINED, /* Log structured storage */
+};
+
+/* Max inline file size */
+#define PCS_MAX_INLINE_SIZE 0x100000 /* 1Mb */
+
+/* File operation. It determines the treatment of the file name and ID parameters in the message. */
+enum
+{
+/* Identify file by its ID. May be used to update file attributes depending on other flags.
+ * Combined with PCS_FFL_DELETE will delete the file.
+ */
+ PCS_FOP_TOUCH = 0,
+/* Similar to TOUCH but identify file by name, setting ID on output.
+ */
+ PCS_FOP_RESOLVE,
+/* Rename the file with the specified ID. An exclusive lease on both the current file name and the new one is required.
+ * If a file with the new name exists it will be replaced. If the client wants to ensure
+ * exclusive rename semantics it must check the target existence first (via a pcs_mds_lease_msg message). Fails if
+ * the file with the requested ID does not exist. Note that the rename operation will succeed if restarted.
+ */
+ PCS_FOP_RENAME,
+/* Rename file replacing the existing target identified by info.attr.id renaming the target
+ * at the same time. The source file is identified by info.attr.src_id.
+ * This operation is intended to use in scenarios when the file being deleted as
+ * a result of the rename operation is open by the client and should be renamed onto the
+ * temporary file.
+ */
+ PCS_FOP_REPLACE,
+};
+
+/* File operation flags */
+enum
+{
+/* Update existing file size.
+ * Valid with PCS_FOP_TOUCH, PCS_FOP_RESOLVE operations.
+ * The exclusive lease on the file is required.
+ */
+ PCS_FFL_RESIZE = 1,
+
+/* Create the file if it does not exist yet. Valid with the PCS_FOP_RESOLVE operation.
+ * An exclusive lease on the file name is required. If the client wants to ensure exclusive
+ * creation semantics it must check its existence first (via a pcs_mds_lease_msg message).
+ * Note that the create operation will succeed if restarted. If the object is already created it will
+ * be left intact and the response will contain its attributes.
+ */
+ PCS_FFL_CREATE = 0x10,
+
+/* Create file in container with specific map type (see PCS_MAP_XXX) passed in message as info.repl.policy.create_type.
+ * The lease may be acquired at the container level.
+ */
+ PCS_FFL_CREATE_IN_CONTAINER = 0x20,
+
+/* Delete the file being referenced. Valid with PCS_FOP_TOUCH, PCS_FOP_RESOLVE.
+ * The exclusive lease on the file is required. Not compatible with any other flags.
+ * Note that delete operation will succeed if restarted.
+ */
+ PCS_FFL_DELETE = 0x100,
+
+/* Enforce strict path checking. If the flag is set:
+ * - an attempt to create or resolve a file with a directory object missing in the path will fail with PCS_ERR_NOT_FOUND
+ * - an attempt to delete or rename an object with child objects will fail with PCS_ERR_NON_EMPTY_DIR
+ */
+ PCS_FFL_POSIX_PATH = 0x10000,
+
+/* Recursive action */
+ PCS_FFL_RECURSIVE = 0x100000,
+};
+
+/* File operation result */
+enum {
+ PCS_FRES_FILE_CREATED = 0x1,
+ PCS_FRES_FILE_RENAMED = 0x2,
+ PCS_FRES_FILE_DELETED = 0x8,
+/* Note that upon replacing the existing file on rename both PCS_FRES_FILE_RENAMED and PCS_FRES_FILE_DELETED will be set. */
+};
+
+struct pcs_mds_file_msg
+{
+ struct pcs_mds_hdr hdr;
+ u32 op;
+ u32 flags;
+ u32 result;
+ u32 reserved;
+ PCS_FILE_ID_T root; /* root dir ID on input */
+ struct pcs_mds_fileinfo info; /* file info */
+ struct pcs_path name; /* the path relative to the root */
+} __attribute__((aligned(8)));
+
+/* The aligned size of the pcs_path structure with 1 byte reserved for terminating 0.
+ * Note that the client is not required to zero-pad strings though the strings returned
+ * by MDS are always zero padded.
+ */
+#define PCS_MDS_FILENAME_SZ_ALIGN(sz) PCS_PATH_SZ_ALIGN(sz)
+#define PCS_MDS_FILENAME_SZ_ALIGNED(n) PCS_PATH_SZ_ALIGNED(n)
+
+/* ---- file attributes request
+ * Get/set the particular file attributes with optional possibility to apply them recursively.
+ * The message may contain data of arbitrary size depending on the op parameter.
+ * The valid_mask parameter may contain the bitmask of the individual valid data attributes.
+ * Some operations may support getting/setting parameters of the filesystem root which is
+ * equivalent to changing global configuration with optional possibility to apply new
+ * settings to all existing files.
+ */
+
+#define PCS_MDS_FATTR_REQ (PCS_RPC_MDS_CLIENT_BASE + 6)
+#define PCS_MDS_FATTR_RESP (PCS_MDS_FATTR_REQ | PCS_RPC_DIRECTION)
+
+struct pcs_mds_fattr_msg
+{
+ struct pcs_mds_hdr hdr;
+ u32 op; /* PCS_FA_XXX */
+ u32 reserved; /* currently 0 */
+ union {
+ u64 valid_mask; /* bitmask of valid attributes */
+ struct {
+ /* The offset and the size of the addressed data range. Used for associated
+ * data addressing (PCS_FA_DATA). Otherwise ignored.
+ */
+ u32 attr_offset;
+ u32 attr_size;
+ };
+ };
+ PCS_FILETIME_T modify_ts; /* modification time if appropriate */
+ /* the following field is reserved for the case when we can have more than one version of the attribute data structure */
+ u32 data_version; /* currently 0 */
+ u32 data_size; /* the size in bytes of the attribute data */
+ PCS_FILE_ID_T root; /* root dir ID on input */
+ struct pcs_path name; /* the path relative to the root */
+ /*
+ * The offset of the data relative to the name is PCS_MDS_FILENAME_SZ_ALIGNED(name)
+ */
+};
+
+/* The op field content */
+enum {
+ PCS_FA_SET = 0x80000000, /* Set attributes */
+ PCS_FA_RECURSIVE = 0x40000000, /* Set recursively */
+	PCS_FA_BY_ID = 0x20000000,	/* Identify file by ID, path is ignored. Use it with root=0 to address the root itself. */
+ PCS_FA_MASK_ = (PCS_FA_BY_ID-1), /* The bitmask for attribute type */
+ /* File attributes (set only). Currently only PCS_FATTR_INLINE may be set/cleared and only on the directory. */
+ PCS_FA_ATTRIB = 0x1,
+ /* Associated data. The file must have PCS_FATTR_INLINE attribute. The total size of the data equals to the size of the file. */
+ PCS_FA_DATA = 0x10,
+ /* System attributes represented by struct pcs_mds_sys_info (set only) */
+ PCS_FA_SYS = 0x80,
+ /* Replication attributes represented by struct pcs_mds_repl_info (set only) */
+ PCS_FA_REPL = 0x100,
+ /* Hot hosts represented by struct pcs_mds_hot_hosts (get only) */
+ PCS_FA_HOT_HOSTS = 0x200,
+ /* Don't set anything, just drop all leases */
+ PCS_FA_DROP_LEASES = 0x10000,
+ /* .. whatever you need .. */
+};
+
+/* Valid mask for system attributes (PCS_FA_SYS) */
+enum {
+ PCS_FA_SYS_MAP_TYPE = 0x1,
+ PCS_FA_SYS_CHUNK_SIZE = 0x10,
+ PCS_FA_SYS_STRIPE_DEPTH = 0x100,
+ PCS_FA_SYS_REDUNDANCY = 0x200,
+ PCS_FA_SYS_TOLERANCE = 0x400,
+ PCS_FA_SYS_STRIP_WIDTH = 0x1000,
+};
+
+/* Valid mask for replication attributes (PCS_FA_REPL) */
+enum {
+ PCS_FA_REPL_REPLICAS = 1,
+ PCS_FA_REPL_PLACEMENT = 0x10,
+ PCS_FA_REPL_QOS = 0x100,
+};
+
+#define PCS_N_HOT_HOSTS 8
+
+/* Hot hosts structure */
+struct pcs_mds_hot_hosts {
+ struct {
+ PCS_NODE_ID_T id;
+ u64 nrepl;
+ } host[PCS_N_HOT_HOSTS];
+} __attribute__((aligned(8)));
+
+/* ---- read dir request
+ * Read directory.
+ * The directory information is maintained by MDS treating / as path separator.
+ * The following paths are considered identical: /a/b, /a/b/, a/b, a//b
+ *
+ * The message type is pcs_mds_readdir_msg (same for request and response). On failure the pcs_rpc_error_resp will be returned.
+ */
+
+#define PCS_MDS_READDIR_REQ (PCS_RPC_MDS_CLIENT_BASE + 8)
+#define PCS_MDS_READDIR_RESP (PCS_MDS_READDIR_REQ | PCS_RPC_DIRECTION)
+
+/* The dir entry flags */
+enum {
+ /* The entry corresponds to the file */
+ PCS_DFL_FILE = 1,
+ /* The entry corresponds to the directory (file with PCS_FATTR_DIR) */
+ PCS_DFL_DIR = 2,
+ /* The entry has child objects */
+ PCS_DFL_HAS_CHILDREN = 4,
+	/* The entry corresponds to a symlink (file with PCS_FATTR_LINK) */
+ PCS_DFL_LINK = 8,
+ /* The entry is storage container */
+ PCS_DFL_CONTAINER = 0x40,
+ /* The dir end marker, the name is empty */
+ PCS_DFL_END = 0x100,
+ /* Entry is using extended format */
+ PCS_DFL_EX_INFO = 0x10000,
+ /* Entry is followed by symlink target */
+ PCS_DFL_EX_LINK = 0x20000
+};
+
+struct pcs_mds_dentry
+{
+ u32 flags;
+ u32 reserved;
+ PCS_FILE_ID_T id;
+ struct pcs_path name;
+} __attribute__((aligned(8)));
+
+#define PCS_MDS_DENTRY_SZ(d) (offsetof(struct pcs_mds_dentry, name.str) + (d).name.sz)
+#define PCS_MDS_DENTRY_SZ_ALIGN(sz) (offsetof(struct pcs_mds_dentry, name) + PCS_MDS_FILENAME_SZ_ALIGN(sz))
+#define PCS_MDS_DENTRY_SZ_ALIGNED(d) (offsetof(struct pcs_mds_dentry, name) + PCS_MDS_FILENAME_SZ_ALIGNED((d).name))
+
+struct pcs_mds_dentry_ex
+{
+ u32 flags;
+ u32 reserved;
+ struct pcs_mds_fileinfo info;
+ struct pcs_path name;
+} __attribute__((aligned(8)));
+
+#define PCS_MDS_DENTRY_EX_SZ(d) (offsetof(struct pcs_mds_dentry_ex, name.str) + (d).name.sz)
+#define PCS_MDS_DENTRY_EX_SZ_ALIGN(sz) (offsetof(struct pcs_mds_dentry_ex, name) + PCS_MDS_FILENAME_SZ_ALIGN(sz))
+#define PCS_MDS_DENTRY_EX_SZ_ALIGNED(d) (offsetof(struct pcs_mds_dentry_ex, name) + PCS_MDS_FILENAME_SZ_ALIGNED((d).name))
+
+/* The request flags */
+enum {
+/* The directory is identified by its ID, the path argument is ignored
+ */
+ PCS_READDIR_BY_ID = 0x100,
+/* Enforce strict path checking on path lookup.
+ * If it is set:
+ * - an attempt to resolve a path with a directory object missing will fail with PCS_ERR_NOT_FOUND
+ * - an attempt to resolve something that is not a directory will fail with PCS_ERR_NOT_DIR
+ * - child entries without dir/file objects won't be returned
+ */
+ PCS_READDIR_POSIX_PATH = 0x10000,
+/* Query extended info - returns pcs_mds_dentry_ex structures.
+ */
+ PCS_READDIR_EX_INFO = 0x100000,
+/* Pack links target right after extended info.
+ */
+ PCS_READDIR_EX_LINKS = 0x200000,
+};
+
+struct pcs_mds_readdir_msg
+{
+ struct pcs_mds_hdr hdr;
+ /* (in) The maximum number of entries to return, 0 - no limit */
+ u32 dent_max;
+ /* (in/out) The number of entries that follows */
+ u32 dent_cnt;
+ /* (in) The number of entries to skip */
+ u32 dent_skip;
+ /* (in) The limit on the message size in bytes, 0 - no limit */
+ u32 max_size;
+ /* (in) Flag bits */
+ u32 flags;
+ /* Reserved for future use */
+ u32 reserved;
+ /* (in) root dir ID or the directory ID if PCS_READDIR_BY_ID flag is set */
+ PCS_FILE_ID_T root;
+ /* (in) The path relative to the root (ignored if PCS_READDIR_BY_ID flag is set) */
+ struct pcs_path path;
+	/* After the end of the path, dent_cnt pcs_mds_dentry entries are placed sequentially with 8 byte alignment,
+	 * see PCS_MDS_FILENAME_SZ_ALIGNED, PCS_MDS_DENTRY_SZ_ALIGNED, PCS_MDS_FIRST_DENTRY_OFFSET for details.
+	 * In case there are more than dent_max-1 entries in the dir referred to by path or the max_size limit is exceeded,
+	 * the directory content may be returned by several calls. Every next call may either specify the dent_skip
+	 * count or pass the last returned entry as the single element of the pcs_mds_dentry list on input.
+	 * Either dent_max or max_size must have a nonzero value on input. The response may have zero dent_cnt
+	 * only in case max_size is too small for the next dentry to be returned.
+ */
+} __attribute__((aligned(8)));
+
+#define PCS_MDS_FIRST_DENTRY_OFFSET(msg) (offsetof(struct pcs_mds_readdir_msg, path) + PCS_MDS_FILENAME_SZ_ALIGNED((msg).path))
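+/* Walking a reply (comment only, assuming the default non-EX entry format):
+ * start at PCS_MDS_FIRST_DENTRY_OFFSET(msg) and advance by
+ * PCS_MDS_DENTRY_SZ_ALIGNED(*dentry) for each of the dent_cnt entries; an
+ * entry flagged PCS_DFL_END marks the end of the directory.
+ */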
+
+/* ---- chunk map request/response
+ * Before the client may start IO the replication path must be set up.
+ * The client will be given the map version and the id of the chunk server the IO
+ * messages must be sent to. All other details are hidden inside the MDS to CS protocol.
+ * In case an IO request returns an error, the client must set last_err accordingly, identify
+ * the failed CS by the offender field, request a new map and restart the failed IO operation.
+ *
+ * The message type is pcs_mds_map_msg (same for request and response).
+ */
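+/* Retry sketch (comment only): after an IO failure the client requests the map
+ * again with PCS_MDS_MAP_RETRY set in mode, copying the failed map's version
+ * and root CS as well as last_err and the offender CS id into the request,
+ * and then restarts the failed IO with the newly returned map.
+ */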
+
+#define PCS_MDS_MAP_REQ (PCS_RPC_MDS_CLIENT_BASE + 0x10)
+#define PCS_MDS_MAP_RESP (PCS_MDS_MAP_REQ | PCS_RPC_DIRECTION)
+
+/*
+ * Mode bits
+ */
+#define PCS_MDS_MAP_MODE_READ 1
+#define PCS_MDS_MAP_MODE_WRITE 2
+/*
+ * The retry bit must be set in case we are requesting the map after IO failure.
+ * The corresponding last_err, offender, version and root fields must be set in such case in accordance to the failed map.
+ */
+#define PCS_MDS_MAP_RETRY 0x100
+/* The dirty bit must be set when client completed some write, but it is still not synced */
+#define PCS_MDS_MAP_DIRTY 0x200
+/* The "new" bit is set by the client on RAID maps which require allocation of a new chunk. In this case
+ * "chunk_size" (usually left uninitialized by the client) defines the size of the chunk to be allocated by the MDS.
+ * NOTE: any map request on the last chunk may have a non-zero "chunk_size", which means the
+ * client wishes to expand the last chunk.
+ */
+#define PCS_MDS_MAP_NEW 0x400
+/* This bit is set by client in request, if it contains physical size of chunk for CS.
+ * It is used when MDS cannot calculate size of chunk on CS only from logical chunk size,
+ * which is the case for RAID encoded files with variable strip size. Unless this bit is set,
+ * physical size of chunk on CS is calculated from logical chunk size by formulae already
+ * implemented in MDS.
+ *
+ * MDS sets this flag when it returns physical size of chunk in "psize_ret", otherwise
+ * this flag must be cleared in messages in MDS->client direction. Normally, MDS should
+ * return "psize_ret" when it has chunk_psize in hands.
+ */
+#define PCS_MDS_MAP_PSIZE 0x800
+
+/* Dirty chunk size is 1M to cover 64M chunk with 64 bits. */
+#define PCS_DIRTY_CHUNK_SIZE (1024*1024)
+
+/* Map flags */
+#define PCS_MDS_MAP_ZERO_CHUNK 1 /* The chunk is not yet allocated, valid in response to read-only requests */
+
+struct pcs_mds_map_msg
+{
+ struct pcs_mds_hdr hdr;
+ PCS_CHUNK_ID_T chunkid; /* The chunk id (file ID, offset pair) - must be provided on input */
+ u16 mode; /* read/write mode and other client supplied flags */
+ u16 flags; /* flags set by the server (replicating) */
+ union {
+ u32 last_err; /* last error returned by CS if requesting map on IO retry (in) */
+ u32 psize_ret; /* length of chunk on CS (out) */
+ };
+ PCS_NODE_ID_T offender; /* the failed CS id on retry */
+ PCS_MAP_VERSION_T version; /* in (on retry) / out */
+ PCS_CHUNK_UID_T uid; /* chunk unique id on out */
+ union {
+ u32 read_tout; /* read timeout (msec) on out */
+ u32 chunk_psize; /* physical size of chunk on CS on in */
+ };
+ u32 write_tout; /* write timeout (msec) on out */
+ struct pcs_cs_info root; /* in (on retry) / out */
+ union {
+ struct {
+ u32 chunk_size; /* The chunk size */
+ u32 child_cs_cnt; /* The number of non-root CS entries that follows */
+ };
+ u64 zero_chunk_size;/* Size of hole, valid with PCS_MDS_MAP_ZERO_CHUNK */
+ };
+ /* The list of non-root chunk servers. Any of them may be used as the target for read requests */
+ struct pcs_cs_info child_cs_list[0];
+
+} __attribute__((aligned(8)));
+
+/* known types for ID generators */
+enum {
+ MDS_MID_GEN_TYPE = 0,
+ MDS_CSID_GEN_TYPE,
+ MDS_CID_GEN_TYPE,
+};
+
+/* ---- monitor mds state
+ */
+
+#define PCS_MDS_MONITOR_REQ (PCS_RPC_MDS_CLIENT_BASE + 0x14)
+#define PCS_MDS_MONITOR_RESP (PCS_MDS_MONITOR_REQ | PCS_RPC_DIRECTION)
+
+#define PCS_PERFCNT_MAXSIZE PCS_MDS_MAX_RESP_MSG_SIZE
+
+enum
+{
+ PCS_PC_GEN_UPTIME = 1, /* Milliseconds since program start */
+ PCS_PC_GEN_BUILD_VERSION = 2, /* Build version string */
+ PCS_PC_GEN_LOAD = 4, /* Activity time in msec */
+ PCS_PC_GEN_VERSION = 5, /* MDS's version */
+
+ PCS_PC_LJ_TX_COUNT = 0x10, /* The local journal transaction count / rate */
+ PCS_PC_LJ_TX_TOTAL_SZ = 0x11, /* The local journal transaction total size / rate */
+ PCS_PC_LJ_COMMIT_COUNT = 0x12, /* The local journal commit count / rate */
+ PCS_PC_LJ_WRITE_TOTAL = 0x13, /* The total time spent writing the local journal (msec) */
+ PCS_PC_LJ_WRITE_TIME = 0x14, /* The mean local journal transaction writing time (msec) */
+
+ PCS_PC_RJ_STATUS = 0x20, /* RJ_STATE_XXX, see rjournal.h */
+ PCS_PC_RJ_ROUND = 0x21, /* transaction number */
+ PCS_PC_RJ_MASTER_KNOWN = 0x22, /* is master known? */
+ PCS_PC_RJ_MASTER_ID = 0x23, /* master node id */
+ PCS_PC_RJ_MASTER_EPOCH = 0x24, /* master generation number */
+ PCS_PC_RJ_MASTER_UPTIME = 0x25, /* time since last master change (ms) */
+ PCS_PC_RJ_NODES_STATE = 0x26, /* paxos node's state */
+
+ PCS_PC_REPL_NORM = 0x31, /* normal number of replicas */
+ PCS_PC_REPL_LIMIT = 0x32, /* minimal number of replicas,
+ one cannot write to a chunk
+ that has less or equal
+ number of replicas */
+ PCS_PC_REPL_MAX = 0x33, /* maximum number of replicas */
+
+ PCS_PC_CL_VERSION = 0x40, /* MDS cluster version */
+ PCS_PC_CL_TOTAL_SPACE_TIER = 0x41, /* total space per tier */
+ PCS_PC_CL_FREE_SPACE_TIER = 0x42, /* free space per tier */
+ PCS_PC_CL_TOTAL_EFFECTIVE_TIER = 0x43, /* effective total space available for chunks allocation in tier */
+ PCS_PC_CL_AVAIL_SPACE_TIER = 0x44, /* the amount of free space available for chunks allocation in tier */
+
+ PCS_PC_CL_TOTAL_EFFECTIVE_X = 0x45, /* effective total space matrix per tier and locality */
+ PCS_PC_CL_AVAIL_SPACE_X = 0x46, /* effective available space matrix per tier and locality */
+
+ PCS_PC_CL_STOR_VERSION = 0x50, /* storage cluster version */
+ PCS_PC_CL_TOTAL_SPACE = 0x51, /* total space in the cluster */
+ PCS_PC_CL_FREE_SPACE = 0x52, /* free space in the cluster */
+ PCS_PC_CL_AVAIL_SPACE = 0x53, /* the amount of free space available for chunks allocation in the cluster */
+ PCS_PC_CL_TOTAL_EFFECTIVE = 0x54, /* effective total space available for chunks allocation in the cluster */
+ PCS_PC_CL_AVAIL_RAW = 0x55, /* same as PCS_PC_CL_AVAIL_SPACE but ignoring license limitations */
+ PCS_PC_CL_TOTAL_RAW = 0x56, /* same as PCS_PC_CL_TOTAL_EFFECTIVE but ignoring license limitations */
+
+ PCS_PC_CL_STATUS = 0x58, /* cluster status (pcs_cluster_status_t) */
+
+ PCS_PC_CL_NODES = 0x60, /* CS count */
+ PCS_PC_CL_NODES_ACTIVE = 0x61, /* count of CSs that send pings */
+ PCS_PC_CL_NODES_INACTIVE = 0x62, /* inactive CS count */
+ PCS_PC_CL_NODES_OFFLINE = 0x63, /* offline CS count */
+ PCS_PC_CL_NODES_DROPPED = 0x64, /* count of CSs dropped by administrator */
+ PCS_PC_CL_NODES_AVAIL = 0x68, /* available for allocation CS count */
+ PCS_PC_CL_NODES_REPLICATING = 0x69, /* nodes participating in cooperative replication */
+ PCS_PC_CL_AVER_COST = 0x6a, /* the average allocation cost for available CS */
+ PCS_PC_CL_NODES_FAILED = 0x6b, /* failed CS nodes count */
+ PCS_PC_CL_NODES_NOSPACE = 0x6c, /* count of CS nodes without space available for allocation */
+ PCS_PC_CL_NODES_HOT = 0x6d, /* count of CS nodes considered hot */
+
+ /* cluster chunk info */
+ PCS_PC_CL_CHUNKS_VOID = 0x70, /* unused chunks */
+ PCS_PC_CL_CHUNKS_PENDING = 0x71, /* top priority queue for replication, chunk is blocked, client is waiting */
+ PCS_PC_CL_CHUNKS_BLOCKED = 0x72, /* have too few replicas, writing is impossible */
+ PCS_PC_CL_CHUNKS_URGENT = 0x73, /* chunks that have limit replicas */
+ PCS_PC_CL_CHUNKS_DEGRADED = 0x74, /* chunks that have > limit and < normal replicas */
+ PCS_PC_CL_CHUNKS_STANDBY = 0x75, /* chunks with temporary standby replicas */
+ PCS_PC_CL_CHUNKS_HEALTHY = 0x76, /* chunks that have >= normal and <= max replicas */
+ PCS_PC_CL_CHUNKS_OVERCOMMITTED = 0x77, /* chunks that have > max replicas */
+ PCS_PC_CL_CHUNKS_REPLICATING = 0x78, /* chunks that replicate now */
+ PCS_PC_CL_CHUNKS_OFFLINE = 0x79, /* chunks that have no replicas */
+ PCS_PC_CL_REPL_DELETING = 0x7a, /* replicas queued for deletion */
+ PCS_PC_CL_CHUNKS_REPLICATED = 0x7b, /* the replicated chunks total / rate */
+ PCS_PC_CL_CHUNKS_REBALANCE_TOTAL= 0x7c, /* the total number of chunks being rebalanced (including committing) */
+ PCS_PC_CL_CHUNKS_REBALANCE_COMM = 0x7d, /* the number of rebalanced chunks being committed */
+ PCS_PC_CL_CHUNKS_REPLICATE = 0x7e, /* the number of replicas to add on replication */
+ PCS_PC_CL_CHUNKS_UNIQUE = 0x7f, /* the number of chunks with single replica */
+
+ PCS_PC_REQ_IN = 0x81, /* number of input requests */
+ PCS_PC_REQ_OUT = 0x82, /* number of output request */
+ PCS_PC_REQ_IN_ERR = 0x84, /* number of input requests with errors */
+ PCS_PC_REQ_IN_ERR_CODE = 0x85, /* code of the last error */
+ PCS_PC_REQ_IN_ERR_UPTIME = 0x86, /* time since last error (ms) */
+ PCS_PC_REQ_IN_LATENCY = 0x87, /* avg processing time (ms) */
+ PCS_PC_REQ_IN_COMMIT_LATENCY = 0x88, /* avg processing time for requests updating metadata (ms) */
+ PCS_PC_REQ_IN_MAP_LATENCY = 0x89, /* avg processing time for map requests (ms) */
+ PCS_PC_REQ_PENDING = 0x8e, /* number of requests being currently processed */
+
+ PCS_PC_LEASE_CNT = 0x101, /* number of currently active leases */
+ PCS_PC_LEASE_CLIENTS = 0x103, /* number of clients that have leases */
+
+ PCS_PC_FS_TOTAL_SIZE = 0x110, /* Total size of all files in bytes */
+ PCS_PC_FS_INODES = 0x111, /* inode count */
+ PCS_PC_FS_FILES = 0x112, /* file count */
+ PCS_PC_FS_FILE_MAPS = 0x113, /* file map count */
+ PCS_PC_FS_CHUNK_MAPS = 0x114, /* chunk map count */
+ PCS_PC_FS_CHUNK_NODES = 0x115, /* number of all replicas of all chunks */
+
+ PCS_PC_STOR_STAT = 0x200, /* struct pcs_perf_stor_stat */
+
+ /* cluster ops info */
+ /* rates are calculated in 5s intervals, every rate is a tuple:
+ * (1) total number of events, (2) 5 sec diff, (3) avg for last 1m interval, (4) avg for 5m, (5) avg for 15m */
+ PCS_PC_CL_READS = 0x1101, /* bytes read rate */
+ PCS_PC_CL_WRITES = 0x1102, /* bytes written rate */
+ PCS_PC_CL_REPL_READS = 0x1103, /* replication bytes read rate */
+ PCS_PC_CL_REPL_WRITES = 0x1104, /* replication bytes write rate */
+ PCS_PC_CL_READ_OPS = 0x1106, /* read ops rate */
+ PCS_PC_CL_WRITE_OPS = 0x1107, /* write ops rate */
+ PCS_PC_CL_MAPS = 0x1108, /* map request rate */
+ PCS_PC_CL_FSYNC = 0x1109, /* fsync() rate */
+ PCS_PC_CL_SYNC = 0x110a, /* syncfs() rate */
+
+ PCS_PC_CL_IO_LOAD_AVER = 0x1200, /* average IO load (queue length) across cluster
+ * (queue length 1.0 corresponds to 5000000) */
+ PCS_PC_CL_IO_LOAD_MAX = 0x1201, /* maximum IO load (queue length) across cluster */
+ PCS_PC_CL_IO_LAST_BALANCED = 0x1210, /* the number of hot CSs balanced last time */
+ PCS_PC_CL_IO_LAST_BALANCE_UPTIME= 0x1211, /* time since last balance attempt (ms) */
+
+ PCS_PC_MDS_NODES = 0x1800, /* the number of MDS nodes in cluster */
+ PCS_PC_MISC_FEATURE_MASK = 0x1801, /* returns 2 64bit feature mask registers */
+ PCS_PC_MDS_HOST_INFO = 0x1802, /* return pcs_host_info for MDS */
+ PCS_PC_MDS_HOST_VER_INFO = 0x1803, /* return pcs_mds_host_info */
+
+ PCS_PC_MEM_POOLS = 0x2000, /* overall memory pools statistics */
+ PCS_PC_MEM_POOL = 0x2001, /* the particular memory pool statistics */
+ PCS_PC_MEM_LJ_USED = 0x2011, /* mem allocated for local journal */
+ PCS_PC_MEM_RJ_USED = 0x2012, /* mem allocated for replicated journal */
+ PCS_PC_MEM_RJ_CACHE = 0x2018, /* the total size of the paxos cache */
+ PCS_PC_MEM_PGS_ALLOCATED = 0x2020, /* the total number of pages allocated for memory pools */
+ PCS_PC_MEM_PGS_FREE = 0x2021, /* the current number of free pool pages */
+ PCS_PC_MEM_PGS_STANDBY = 0x2022, /* the current number of standby pool pages */
+
+ PCS_PC_MEM_SNAPSHOTS = 0x2030, /* the number of snapshots */
+ PCS_PC_MEM_SNAP_OBJS = 0x2031, /* the number of objects tracked */
+ PCS_PC_MEM_SNAP_OBJS_ORPHAN = 0x2032, /* the number of deleted objects tracked */
+ PCS_PC_MEM_SNAP_COPIES = 0x2033, /* the number of serialized object copies */
+ PCS_PC_MEM_SNAP_COPIES_ORPHAN = 0x2034, /* the number of serialized copies of the deleted objects */
+
+ PCS_PC_MEM_LAST, /* max id used in mem info */
+
+ PCS_PC_PROC_MEM_RSS = 0x3101, /* number of pages the process has in real memory */
+ PCS_PC_PROC_MEM_VSIZE = 0x3102, /* virtual memory size of process in pages */
+
+ PCS_PC_CS_LIST = 0x4000, /* CS list */
+
+ PCS_PC_CS_ID = 0x20000, /* CS id */
+ PCS_PC_CS_CHUNKS = 0x20001, /* number of chunks in CS */
+ PCS_PC_CS_REG_UPTIME = 0x20002, /* time since last mds registration (ms) */
+ PCS_PC_CS_REG_ADDR = 0x20003, /* CS IP addresses currently registered */
+ PCS_PC_CS_VERSION = 0x20004, /* CS version */
+ PCS_PC_CS_ADM_STATUS = 0x20005, /* administration status, see PCS_CS_ADM_* */
+ PCS_PC_CS_ACT_STATUS = 0x20006, /* activity status, see PCS_CS_ACT_* */
+ PCS_PC_CS_AVAIL = 0x20008, /* 1 if CS is available for allocation */
+ PCS_PC_CS_COST = 0x2000a, /* allocation cost if available */
+ PCS_PC_CS_QOS = 0x2000b, /* qos assigned for CS */
+ PCS_PC_CS_NET_ADDR = 0x2000e, /* the CS connection source network address */
+ PCS_PC_CS_LOCATION = 0x2000f, /* the CS location and host id */
+
+ PCS_PC_CS_ERR_STATUS = 0x20010, /* the CS error status - if non-zero the CS is not currently used for chunks allocation */
+ PCS_PC_CS_LAST_ERR = 0x20011, /* local error status, see PCS_MAP_ERR_* */
+ PCS_PC_CS_LAST_ERR_UPTIME = 0x20012, /* time since last local error (ms) */
+ PCS_PC_CS_LAST_LINK_ERR = 0x20013, /* link error status, see PCS_MAP_ERR_* */
+ PCS_PC_CS_LAST_LINK_ERR_UPTIME = 0x20014, /* time since last link error (ms) */
+
+ PCS_PC_CS_TOTAL_SPACE = 0x20051, /* total space on CS */
+ PCS_PC_CS_FREE_SPACE = 0x20052, /* free space on CS */
+ PCS_PC_CS_AVAIL_SPACE = 0x20053, /* the amount of space available for chunk allocation on CS */
+
+ /* CS chunks info, see PCS_PC_CL_CHUNKS_* */
+ PCS_PC_CS_CHUNKS_VOID = 0x20071,
+ PCS_PC_CS_CHUNKS_BLOCKED = 0x20072,
+ PCS_PC_CS_CHUNKS_URGENT = 0x20073,
+ PCS_PC_CS_CHUNKS_DEGRADED = 0x20074,
+ PCS_PC_CS_CHUNKS_HEALTHY = 0x20075,
+ PCS_PC_CS_CHUNKS_OVERCOMMITTED = 0x20076,
+ PCS_PC_CS_CHUNKS_REPLICATING = 0x20077,
+ PCS_PC_CS_CHUNKS_OFFLINE = 0x20078,
+ PCS_PC_CS_REPL_DELETING = 0x20079,
+ PCS_PC_CS_CHUNKS_UNIQUE = 0x2007f,
+
+ /* CS ops info, see PCS_PC_CL_* */
+ PCS_PC_CS_READS = 0x20101,
+ PCS_PC_CS_WRITES = 0x20102,
+ PCS_PC_CS_REPL_READS = 0x20103,
+ PCS_PC_CS_REPL_WRITES = 0x20104,
+ PCS_PC_CS_IO_WAIT = 0x20105,
+ PCS_PC_CS_READ_OPS = 0x20106,
+ PCS_PC_CS_WRITE_OPS = 0x20107,
+ PCS_PC_CS_MAPS = 0x20108,
+ PCS_PC_CS_FSYNC = 0x20109,
+ PCS_PC_CS_SYNC = 0x2010a,
+ PCS_PC_CS_FEATURES = 0x2010b,
+ PCS_PC_CS_CLIENT_STAT = 0x2010c,
+ PCS_PC_CS_LATENCY = 0x2010d,
+ PCS_PC_CS_LATENCY_MAX = 0x2010e,
+ PCS_PC_CS_J_FULL = 0x2010f,
+ PCS_PC_CS_IO_QUEUE = 0x20110,
+ PCS_PC_CS_RMW_OPS = 0x20111,
+ PCS_PC_CS_SYNC_WAIT = 0x20112,
+ PCS_PC_CS_SYNC_LATENCY = 0x20113,
+ PCS_PC_CS_SYNC_LATENCY_MAX = 0x20114,
+ PCS_PC_CS_CRMW_OPS = 0x20115,
+ PCS_PC_CS_SMART_FAMILY = 0x20120,
+ PCS_PC_CS_SMART_DEVICE = 0x20121,
+ PCS_PC_CS_SMART_SN = 0x20122,
+ PCS_PC_CS_SMART_VENDOR_ATTR = 0x20123,
+
+ /* clients related info */
+ PCS_PC_CLIENTS_LIST = 0x20200,
+
+ PCS_PC_CLIENT_ID = 0x20201,
+ PCS_PC_CLIENT_LEASES = 0x20202,
+ PCS_PC_CLIENT_ADDR = 0x20203,
+ PCS_PC_CLIENT_READS = 0x20204,
+ PCS_PC_CLIENT_WRITES = 0x20205,
+ PCS_PC_CLIENT_READ_OPS = 0x20206,
+ PCS_PC_CLIENT_WRITE_OPS = 0x20207,
+ PCS_PC_CLIENT_FSYNC = 0x20208,
+ PCS_PC_CLIENT_PERIOD = 0x20209,
+ PCS_PC_CLIENT_IOWAIT = 0x2020a,
+ PCS_PC_CLIENT_LATENCY_MAX = 0x2020b,
+ PCS_PC_CLIENT_LATENCY = 0x2020c,
+ PCS_PC_CLIENT_HOST_INFO = 0x2020d,
+ PCS_PC_CLIENT_IO_QUEUE = 0x2020e,
+ PCS_PC_CLIENT_RMW_OPS = 0x2020f,
+
+ PCS_PC_LICENSE_KEYNUM = 0x20301,
+ PCS_PC_LICENSE_STATUS = 0x20302,
+ PCS_PC_LICENSE_CAPACITY = 0x20303,
+ PCS_PC_LICENSE_EXPIRATION = 0x20304,
+
+ PCS_PC_SH_LEASE_INFO = 0x20401,
+ PCS_PC_EX_LEASE_INFO = 0x20402,
+
+ PCS_PC_NETSTAT_NODE_INFO = 0x20501, /* struct pcs_netstat_node_info */
+
+ PCS_PC_DISK_INFO = 0x20601,
+ PCS_PC_DISK_INFO_SERVICE = 0x20602,
+ PCS_PC_DISK_INFO_ID = 0x20603,
+ PCS_PC_DISK_INFO_LIST = 0x20604, /* struct pcs_mds_disk_info_msg */
+ PCS_PC_DISK_INFO_CNT = 0x20605,
+ PCS_PC_DISK_INFO_HOST = 0x20606,
+ PCS_PC_DISK_INFO_CAPACITY = 0x20607,
+};
+
+/* Bits for PCS_PC_CS_FEATURES */
+enum {
+ PCS_CS_FEATURE_JOURNAL = 1,
+ PCS_CS_FEATURE_CHECKSUM = 2,
+ PCS_CS_JOURNAL_CLEAN = 4,
+ PCS_CS_USE_DIRECT_IO = 8,
+ PCS_CS_FAILED_STORAGE = 0x10,
+ PCS_CS_FAILED_CSUM = 0x20,
+ PCS_CS_FAILED_JOURNAL = 0x40,
+ PCS_CS_FAILED_JCSUM = 0x80,
+ PCS_CS_FAILED_REPO = 0x100,
+ PCS_CS_FAILED_TIMEOUT = 0x200,
+};
+
+#define PCS_CS_FAILED_MASK ((u64)PCS_CS_FAILED_STORAGE|PCS_CS_FAILED_CSUM|PCS_CS_FAILED_JOURNAL| \
+ PCS_CS_FAILED_JCSUM|PCS_CS_FAILED_REPO|PCS_CS_FAILED_TIMEOUT)
+
+/* The user-friendly cluster status */
+typedef enum {
+	PCS_CL_STATUS_UNKNOWN,	/* Not enough information yet. The MDS is either not the master or has been master only recently */
+	PCS_CL_STATUS_HEALTHY,	/* No inactive CSs */
+	PCS_CL_STATUS_DEGRADED,	/* Some CSs are inactive */
+	PCS_CL_STATUS_FAILURE,	/* Too many inactive CSs; automatic replication is disabled. */
+} pcs_cluster_status_t;
+
+/* The CS activity status */
+typedef enum {
+ PCS_CS_ACT_ACTIVE, /* CS is sending pings. */
+	PCS_CS_ACT_INACTIVE,	/* Has not sent pings for some time. Replication is not yet started. */
+	PCS_CS_ACT_OFFLINE,	/* Has not sent pings for quite some time; chunks are being replicated. */
+	PCS_CS_ACT_DROPPED,	/* Dropped by the administrator. Such a CS is banned forever, so its activity status doesn't matter anymore. */
+ PCS_CS_ACT_STATES_
+} pcs_cs_activity_status_t;
+
+struct pcs_mds_monitor_resp_msg
+{
+ struct pcs_mds_hdr hdr;
+ struct pcs_perf_counter counters[0];
+} __attribute__((aligned(8)));
+
+/* The perf counter types structures */
+
+struct pcs_pc_lease_info { /* PCS_PC_XX_LEASE_INFO */
+ PCS_NODE_ID_T clnt_id;
+ u32 age_sec; /* How long it exists */
+ s32 valid_sec; /* How long it will be valid (negative if expired) */
+ PCS_NET_ADDR_T clnt_addr;
+} __attribute__((aligned(8)));
+
+struct pcs_mds_host_info { /* PCS_PC_MDS_HOST_VER_INFO */
+ u32 version;
+ u32 mds_id;
+ struct pcs_host_info host;
+} __attribute__((aligned(8)));
+
+struct pcs_smart_vendor_attr { /* PCS_PC_CS_SMART_VENDOR_ATTR */
+ u32 id;
+ u32 flag;
+ u32 value;
+ u32 worst;
+ u32 thresh;
+ u64 reserved;
+ u64 raw_value;
+} __attribute__((aligned(8)));
+
+/* Request key values */
+enum {
+ PCS_PC_GET_INFO = 0, /* General server info */
+ PCS_PC_GET_CS_LIST, /* The list of known CSs */
+ PCS_PC_GET_CS_INFO, /* The particular CS info (CS ID as index) */
+ PCS_PC_GET_CLNT_LIST, /* The list of the client ID/IP/leases */
+ PCS_PC_GET_CLNT_TOP, /* Not yet implemented */
+ PCS_PC_GET_CLNT_INFO, /* The particular client info (ID as index) */
+ PCS_PC_GET_FILE_LEASES, /* The particular file lease owners ID/IP/lease type/age as the array of PCS_PC_LEASE_INFO */
+ PCS_PC_GET_NETSTAT, /* Get array of PCS_PC_NETSTAT_NODE_INFO */
+ PCS_PC_GET_STOR_STAT, /* Get array of struct pcs_perf_stor_stat entries given the directory ID as index */
+ PCS_PC_GET_MDS_INFO = 0x10, /* Get cluster MDSs host info as the array of PCS_PC_MDS_HOST_VER_INFO accompanied by PCS_PC_MDS_NODES entry */
+};
+
+struct pcs_mds_monitor_req_msg
+{
+ struct pcs_mds_hdr hdr;
+ u32 _reserved;
+ u32 key;
+ u64 index;
+} __attribute__((aligned(8)));
+
+/* ---- file map query request/response
+ * Returns the mapping of the file chunks to chunk servers as well as some valuable information
+ * regarding data integrity and chunk placement.
+ *
+ * The message type is pcs_mds_file_map_info_msg (same for request and response).
+ */
+
+#define PCS_MDS_FILE_MAP_INFO_REQ (PCS_RPC_MDS_CLIENT_BASE + 0x18)
+#define PCS_MDS_FILE_MAP_INFO_RESP (PCS_MDS_FILE_MAP_INFO_REQ | PCS_RPC_DIRECTION)
+
+/* Chunk flags */
+enum {
+ PCS_CH_FL_DEGRADED = 1, /* The number of online replicas is less than normal */
+ PCS_CH_FL_BLOCKED = 2, /* Not enough online replicas, writing is blocked */
+ PCS_CH_FL_OFFLINE = 4, /* No online replicas, any access is impossible */
+ PCS_CH_FL_OVERCOMMITTED = 0x10, /* Too many replicas, trimming is required */
+ PCS_CH_FL_REPLICATING = 0x100, /* Replication is in progress (to the last replica) */
+ PCS_CH_FL_ERROR = 0x400, /* Chunk has error flag on it */
+ PCS_CH_FL_HARD_ERROR = 0x800, /* Some replicas have hard (unrecoverable) error flag */
+ PCS_CH_FL_NOT_REGISTERED= 0x1000, /* Some CS are not registered (so their location info is not available) */
+ PCS_CH_FL_XINFO = 0x4000, /* The struct pcs_mds_chunk_info is followed by pcs_mds_chunk_xinfo extended info */
+ PCS_CH_FL_LOC_INFO = 0x8000, /* Extended format with per-replica location info */
+};
+
+struct pcs_mds_chunk_replica_loc_info {
+ PCS_NODE_ID_T cs_id;
+ struct pcs_host_info host;
+};
+
+struct pcs_mds_chunk_info
+{
+ u64 offset; /* Chunk offset */
+ u32 flags; /* Flags (PCS_CH_FL_XXX) */
+ u32 nreplicas; /* The number of valid replicas */
+ union {
+ /* The array of replica info */
+ PCS_NODE_ID_T replicas[1];
+ struct pcs_mds_chunk_replica_loc_info replicas_loc[1];
+ };
+} __attribute__((aligned(8)));
+
+/* Extension for the above structure */
+struct pcs_mds_chunk_xinfo
+{
+ u32 size; /* Chunk size */
+ u32 reserved[3];
+} __attribute__((aligned(8)));
+
+/* Request flags */
+enum {
+ PCS_MDS_FILE_MAP_FL_SKIP = 1, /* Skip chunk at last_offset (input). Used to restart query after incomplete response.
+ * If not set the start_offset is ignored on input. */
+ PCS_MDS_FILE_MAP_FL_OMIT_CHUNKS= 0x1000,/* Omit chunk data on output (input). Other fields will be valid though. */
+ PCS_MDS_FILE_MAP_FL_EOF = 0x8000,/* No more chunks in the file (output) - if not set the response is incomplete. */
+ PCS_MDS_FILE_MAP_FL_XINFO = 0x80000,/* Retrieve extended chunk info if available */
+ PCS_MDS_FILE_MAP_FL_LOC_INFO = 0x100000,/* Retrieve extended location info (see struct pcs_mds_chunk_replica_loc_info) */
+};
+
+/* The maximum locality value corresponding to the same host placement */
+#define PCS_HOST_LOCALITY (PCS_LOCATION_PATH_LEN+1)
+
+struct pcs_mds_file_map_info_msg
+{
+ struct pcs_mds_hdr hdr;
+ PCS_FILE_ID_T file_id; /* File id on input */
+ PCS_NODE_ID_T home_id; /* The ID of the 'home' node */
+ u64 total_chunks; /* The total number of chunks */
+ u64 last_offset; /* Last chunk offset - valid on output */
+ u32 req_flags; /* The request flags (PCS_MDS_FILE_MAP_FL_XXX) */
+ u16 chunk_flags; /* The OR-ed bitmap of chunk flags (PCS_CH_FL_XXX) */
+ u8 qos; /* Tier */
+ u8 placement; /* The placement policy */
+ u64 reserved[10]; /* Currently not used */
+ u64 per_qos_repl[PCS_NQOS]; /* Replicas per tier */
+ u8 repl_norm; /* Replication factor */
+ u8 repl_min; /* The minimum number of replicas allowed */
+ u8 repl_min_actual;/* Actual minimum number of uptodate replicas */
+ u8 repl_max_actual;/* Actual maximum number of uptodate replicas */
+ u32 nchunks; /* The number of chunks that follows */
+ struct pcs_mds_chunk_info chunks[0]; /* Chunk info array */
+} __attribute__((aligned(8)));
+
+#define PCS_MDS_NETSTAT_REPORT (PCS_RPC_MDS_CLIENT_BASE + 0x1C)
+
+/* Network stat for the particular link */
+struct pcs_connstat_rec
+{
+ PCS_NODE_ID_T id;
+ u32 retrans;
+ /* The following values are in microseconds */
+ u32 lat_min;
+ u32 lat_max;
+ u32 lat_cnt;
+ u64 lat_avg;
+} __attribute__((aligned(8)));
+
+/* Network stat averaged over all in/out links at the particular network node */
+struct pcs_netstat_node_info
+{
+ PCS_NODE_ID_T id;
+ u32 retrans;
+ /* The following values are in microseconds, ~0U means no data available */
+ u32 lat_avg; /* average over all links */
+ u32 lat_mmax; /* median of per link maximums */
+ u32 lat_max; /* top maximum over all links */
+} __attribute__((aligned(8)));
+
+struct pcs_mds_netstat_req
+{
+ struct pcs_mds_hdr hdr;
+ u32 count;
+ u32 reserved;
+ u64 reserved2[2];
+ struct pcs_connstat_rec data[0];
+} __attribute__((aligned(8)));
+
+/*
+ * Punch hole request - drops chunks in the given range. If the range size
+ * is zero, it drops the single chunk starting at the given offset, or returns an error
+ * if no such chunk exists. Currently this is the only supported scenario.
+ */
+
+#define PCS_MDS_PUNCH_HOLE_REQ (PCS_RPC_MDS_CLIENT_BASE + 0x24)
+#define PCS_MDS_PUNCH_HOLE_RESP (PCS_MDS_PUNCH_HOLE_REQ | PCS_RPC_DIRECTION)
+
+struct pcs_mds_punch_hole_msg
+{
+ struct pcs_mds_hdr hdr;
+ PCS_FILE_ID_T fileid; /* File ID */
+ u64 offset; /* Start offset */
+ u64 size; /* The hole size (may be zero - see comment above) */
+ u64 reserved[3];
+} __attribute__((aligned(8)));
+
+#define PCS_MDS_DATA_OBJ_REQ (PCS_RPC_MDS_CLIENT_BASE + 0x30)
+#define PCS_MDS_DATA_OBJ_RESP (PCS_MDS_DATA_OBJ_REQ | PCS_RPC_DIRECTION)
+
+/*
+ * Data objects are uniquely identified by (key, type) pair.
+ */
+
+#define PCS_MDS_DATA_OBJ_MAX_SIZE 0x20000
+
+enum {
+ PCS_DOP_SET = 1,
+ PCS_DOP_GET = 2,
+ // delete is currently not supported for safety
+};
+
+struct pcs_mds_data_obj_msg
+{
+ struct pcs_mds_hdr hdr;
+ u32 op;
+ u32 flags;
+ u64 reserved[4];
+ u64 key;
+ u64 attr;
+ u32 type;
+ u32 size;
+ // Object data follows
+} __attribute__((aligned(8)));
+
+/*
+ * Administration API.
+ */
+
+#define PCS_RPC_MDS_ADMIN_BASE (PCS_RPC_MDS_CLIENT_BASE + 0x80)
+
+/* ---- add mds node
+ * Add new mds node. The message type is pcs_mds_node_add_msg (same for request and response).
+ */
+
+#define PCS_MDS_NODE_ADD_REQ (PCS_RPC_MDS_ADMIN_BASE + 2)
+#define PCS_MDS_NODE_ADD_RESP (PCS_MDS_NODE_ADD_REQ | PCS_RPC_DIRECTION)
+
+struct pcs_mds_node_add_msg
+{
+ struct pcs_mds_hdr hdr;
+ PCS_NODE_ID_T id;
+ PCS_NET_ADDR_T addr;
+
+} __attribute__((aligned(8)));
+
+/* ---- remove mds node
+ * Remove existing mds node. The message type is pcs_mds_node_rm_msg (same for request and response).
+ */
+
+#define PCS_MDS_NODE_RM_REQ (PCS_RPC_MDS_ADMIN_BASE + 4)
+#define PCS_MDS_NODE_RM_RESP (PCS_MDS_NODE_RM_REQ | PCS_RPC_DIRECTION)
+
+struct pcs_mds_node_rm_msg
+{
+ struct pcs_mds_hdr hdr;
+ PCS_NODE_ID_T id;
+
+} __attribute__((aligned(8)));
+
+/* ---- remove cs node
+ * Adding a new (empty) CS node does not require any special commands. It will be added upon registration.
+ * Removing a CS node with some chunks allocated is a more complex process. First the node may be marked
+ * as releasing to initiate migration of the chunks to other nodes. After that the node may be ultimately dropped.
+ * A node in the releasing state may still contain valid data. It may go back to the normal state if the admin decides
+ * to cancel the release. In contrast, dropping a node drops all its chunks immediately so that they will never be accessed again.
+ * Dropping a CS node is irreversible.
+ *
+ * The node control operations return just pcs_mds_hdr on success.
+ */
+
+#define PCS_MDS_CS_SET_STATUS_REQ (PCS_RPC_MDS_ADMIN_BASE + 6)
+#define PCS_MDS_CS_SET_STATUS_RESP (PCS_MDS_CS_SET_STATUS_REQ | PCS_RPC_DIRECTION)
+
+struct pcs_mds_cs_set_status_msg
+{
+ struct pcs_mds_hdr hdr;
+ PCS_NODE_ID_T id;
+ u32 status;
+ u32 flags;
+
+} __attribute__((aligned(8)));
+
+/* CS administration status */
+typedef enum {
+ PCS_CS_ADM_NORMAL = 0,
+	/* Further chunk allocation is suppressed; the CS will be dropped as soon as all its chunks have replicas on other CSs.
+	 * This status is set manually by the administrator.
+ */
+ PCS_CS_ADM_RELEASING,
+ /* The hard IO error was detected so this CS is no longer considered reliable. */
+ PCS_CS_ADM_FAILED,
+ /* Same as PCS_CS_ADM_RELEASING but CS is considered failed */
+ PCS_CS_ADM_FAILED_RELEASING,
+ /* The CS is no longer used, its ID is banned forever */
+ PCS_CS_ADM_DROPPED = 0x10,
+} pcs_cs_adm_status_t;
+
+/* Flags */
+enum {
+	/* Force setting the particular status. Normally the MDS does not allow setting the dropped
+	 * status if it would lead to unrecoverable data loss. The following flag overrides
+	 * this limitation.
+ */
+ PCS_CS_ADM_FORCE = 1,
+};
+
+/* ---- client control
+ * The request type is pcs_mds_clnt_ctl_msg. The response type is struct pcs_mds_hdr on success.
+ */
+
+#define PCS_MDS_CLNT_CTL_REQ (PCS_RPC_MDS_ADMIN_BASE + 0x10)
+#define PCS_MDS_CLNT_CTL_RESP (PCS_MDS_CLNT_CTL_REQ | PCS_RPC_DIRECTION)
+
+/* Operation bits */
+enum {
+ PCS_MDS_CLNT_REVOKE_LEASES = 1,
+ PCS_MDS_CLNT_FINIT_LEASES = 2,
+ PCS_MDS_CLNT_BAN = 0x10000,
+};
+
+struct pcs_mds_clnt_ctl_msg
+{
+ struct pcs_mds_hdr hdr;
+ PCS_NODE_ID_T clnt_id;
+ u32 op;
+ u32 reserved;
+ PCS_FILETIME_T modify_ts;
+};
+
+/*
+ * Configuration interface.
+ * The configuration data is replicated among all MDS servers. Some data may belong to CS servers; they may query it via
+ * the public API described below.
+ */
+
+/* The message containing the array of configuration items */
+struct pcs_mds_cfg_msg {
+ struct pcs_mds_hdr hdr;
+ /* The configuration sequence number. Always valid on output. If set to PCS_CONFIG_SEQ_ANY
+ * the configuration will be updated regardless of the current version. Otherwise the operation
+	 * will fail with PCS_ERR_CFG_VERSION if the current version differs from the one provided by the client.
+ */
+ PCS_CONFIG_SEQ_T version;
+ unsigned nitems;
+ struct pcs_cfg_item items[1];
+} __attribute__((aligned(8)));
+
+/* ---- Get configuration request
+ * Get configuration data set matching the specified classes bitmap. The request type is struct pcs_mds_cfg_get_msg.
+ * The response type is struct pcs_mds_cfg_msg. On failure the pcs_rpc_error_resp will be returned.
+ */
+
+#define PCS_MDS_CFG_GET_REQ (PCS_RPC_MDS_ADMIN_BASE + 0x20)
+#define PCS_MDS_CFG_GET_RESP (PCS_MDS_CFG_GET_REQ | PCS_RPC_DIRECTION)
+
+struct pcs_mds_cfg_get_msg {
+ struct pcs_mds_hdr hdr;
+ /* The bitmap of the matching classes */
+ u16 classes;
+ u16 reserved[3];
+} __attribute__((aligned(8)));
+
+/* ---- Set configuration request
+ * Set configuration data set. The request type is struct pcs_mds_cfg_msg. The response type is struct pcs_mds_hdr on success.
+ * On failure the pcs_rpc_error_resp will be returned. The configuration will be updated in a single transaction so the data set will
+ * be either applied entirely or rejected as a whole.
+ */
+
+#define PCS_MDS_CFG_SET_REQ (PCS_RPC_MDS_ADMIN_BASE + 0x22)
+#define PCS_MDS_CFG_SET_RESP (PCS_MDS_CFG_SET_REQ | PCS_RPC_DIRECTION)
+
+/* ---- request new MDS ID ---- */
+#define PCS_MDS_GEN_ID_REQ (PCS_RPC_MDS_ADMIN_BASE + 0x24)
+#define PCS_MDS_GEN_ID_RESP (PCS_MDS_GEN_ID_REQ | PCS_RPC_DIRECTION)
+
+struct pcs_mds_gen_id_msg
+{
+ struct pcs_mds_hdr hdr;
+ PCS_NODE_ID_T id;
+} __attribute__((aligned(8)));
+
+
+
+#define PCS_MDS_DISK_INFO_REQ (PCS_RPC_MDS_ADMIN_BASE + 0x88)
+#define PCS_MDS_DISK_INFO_RESP (PCS_MDS_DISK_INFO_REQ | PCS_RPC_DIRECTION)
+
+#define PCS_MDS_DISK_ID_LEN 64
+
+struct pcs_mds_disk_info_msg {
+ struct pcs_mds_hdr hdr;
+ PCS_NODE_ID_T host_id;
+ u8 disk_id[PCS_MDS_DISK_ID_LEN];
+ u32 cnt;
+ struct pcs_perf_counter info[0];
+} __attribute__((aligned(8)));
+
+/* ---- That's all for now */
+
+/* The function translates a byte offset in the file to the byte offset in actual storage.
+ * This mapping is the identity for the plain layout and non-trivial for the RAID0 layout.
+ */
+static inline u64 map_file_to_chunk(u64 pos, unsigned int chunk_size, unsigned int stripe_depth, unsigned int strip_width)
+{
+ unsigned int strip_off, chunk_idx;
+ u64 base, strip_idx, chunk_off;
+ u64 group_size;
+
+ if (stripe_depth == 1)
+ return pos;
+
+ group_size = (u64)chunk_size * stripe_depth;
+
+ base = (pos / group_size) * group_size;
+ pos -= base;
+
+ strip_off = pos % strip_width;
+ strip_idx = pos / strip_width;
+ chunk_idx = strip_idx % stripe_depth;
+ chunk_off = strip_idx / stripe_depth;
+
+ return base + (chunk_idx * (chunk_size / strip_width) + chunk_off) * strip_width + strip_off;
+}
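A worked example of the RAID0 arithmetic above (illustrative only, not part of the patch): with a 64 KiB chunk, stripe_depth = 4 and strip_width = 4096, file offset 10000 falls into strip index 2 of the first stripe group, hence chunk index 2:

        /* strip_off = 10000 % 4096 = 1808, strip_idx = 2, chunk_idx = 2, chunk_off = 0 */
        u64 off = map_file_to_chunk(10000, 65536, 4, 4096);
        /* off == (2 * (65536 / 4096) + 0) * 4096 + 1808 == 132880 */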
+
+#endif /* _PCS_MDS_PROT_H_ */
diff --git a/fs/fuse/kio/pcs/pcs_perfcounters.h b/fs/fuse/kio/pcs/pcs_perfcounters.h
new file mode 100644
index 000000000000..f902ce06d72d
--- /dev/null
+++ b/fs/fuse/kio/pcs/pcs_perfcounters.h
@@ -0,0 +1,7 @@
+#ifndef _PCS_PERFCOUNTERS_H_
+#define _PCS_PERFCOUNTERS_H_ 1
+
+/* TODO:!!! this is a stub for flow_detection */
+#include "pcs_perfcounters_stub.h"
+
+#endif /* _PCS_PERFCOUNTERS_H_ */
diff --git a/fs/fuse/kio/pcs/pcs_perfcounters_stub.h b/fs/fuse/kio/pcs/pcs_perfcounters_stub.h
new file mode 100644
index 000000000000..17dae73fcd08
--- /dev/null
+++ b/fs/fuse/kio/pcs/pcs_perfcounters_stub.h
@@ -0,0 +1,30 @@
+#ifndef _PCS_PERFCOUNTERS_STUB_H_
+#define _PCS_PERFCOUNTERS_STUB_H_ 1
+
+
+struct pcs_perf_stat_cnt {
+ u64 val_total;
+ u64 events;
+ u64 curr_max;
+ u64 events_last;
+ u64 avg;
+ u64 maximum;
+};
+
+/* Generic event rate counter */
+struct pcs_perf_rate_cnt {
+ /* Total number of events */
+ u64 total;
+ u64 last_total;
+ /* The number of events for the last 5 sec interval */
+ u64 rate;
+ /* The number of events per 5 sec averaged over 1, 5, 15 min and shifted by AV_SHIFT to the left */
+ u64 av1;
+ u64 av5;
+};
+
+
+static inline void pcs_perfcounter_stat_update(struct pcs_perf_stat_cnt *cnt, u64 val) __attribute__((unused));
+
+static inline void pcs_perfcounter_stat_update(struct pcs_perf_stat_cnt *cnt, u64 val) {}
+#endif //_PCS_PERFCOUNTERS_STUB_H_
diff --git a/fs/fuse/kio/pcs/pcs_prot_types.h b/fs/fuse/kio/pcs/pcs_prot_types.h
new file mode 100644
index 000000000000..d8852f6ffda5
--- /dev/null
+++ b/fs/fuse/kio/pcs/pcs_prot_types.h
@@ -0,0 +1,451 @@
+#ifndef _PCS_PROT_TYPES_H_
+#define _PCS_PROT_TYPES_H_ 1
+
+#include "pcs_types.h"
+/* #include "pcs_net_addr.h" */
+/* #include "cluster_id.h" */
+
+/*
+ * Base types definitions shared by all the components.
+ */
+
+/* Current version */
+#define PCS_VERSION 121
+
+#define PCS_VZ7_VERSION 100
+
+/* milliseconds since Jan 1970 */
+typedef u64 PCS_FILETIME_T;
+
+typedef u64 PCS_FILE_ID_T;
+
+#define PCS_NODE_TYPE_BITS 2
+#define PCS_NODE_TYPE_SHIFT 10
+#define PCS_NODE_TYPE_MASK (((1ULL << PCS_NODE_TYPE_BITS) - 1) << PCS_NODE_TYPE_SHIFT)
+#define PCS_NODE_ID_MASK (~PCS_NODE_TYPE_MASK)
+
+typedef struct __pre_aligned(8) _PCS_CHUNK_ID_T {
+ PCS_FILE_ID_T fileid;
+ u64 offset;
+} PCS_CHUNK_ID_T __aligned(8);
+
+typedef struct __pre_aligned(8) _PCS_XID_T {
+ PCS_NODE_ID_T origin;
+ u64 val;
+} PCS_XID_T __aligned(8);
+
+/* Optional location of the machine. For now it is assumed that network topology
+ * and power supply topology are congruent. Default is all 0s.
+ */
+#define PCS_LOCATION_PATH_LEN 3
+
+struct __pre_aligned(8) pcs_location
+{
+ union {
+ struct {
+ u16 site;
+ u16 room;
+ u16 cabinet;
+ u16 reserved;
+ };
+ u16 path[PCS_LOCATION_PATH_LEN];
+ };
+} __aligned(8);
+
+struct __pre_aligned(8) pcs_host_info {
+ PCS_NODE_ID_T host_id;
+ struct pcs_location location;
+} __aligned(8);
+
+#define PCS_HOST_INFO_EQ(a, b) (!memcmp(&(a), &(b), offsetof(struct pcs_host_info, location.path[PCS_LOCATION_PATH_LEN])))
+#define PCS_TOPO_PATH_FMT "%u.%u.%u"
+#define PCS_HOST_ID_FMT "%016llx"
+#define PCS_HOST_INFO_FMT PCS_TOPO_PATH_FMT "." PCS_HOST_ID_FMT
+#define PCS_TOPO_PATH_ARGS(p) (p)[0], (p)[1], (p)[2]
+#define PCS_HOST_INFO_ARGS(h) PCS_TOPO_PATH_ARGS((h).location.path), (unsigned long long)(h).host_id.val
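A minimal usage sketch for the formatting macros above (the call site is hypothetical; it assumes a populated struct pcs_host_info named host):

        /* assuming 'host' is a populated struct pcs_host_info */
        printk(KERN_INFO "peer host " PCS_HOST_INFO_FMT "\n", PCS_HOST_INFO_ARGS(host));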
+
+typedef u32 PCS_MASTER_GENID_T;
+typedef u32 PCS_CLUSTER_GENID_T;
+typedef u32 PCS_FILE_GENID_T;
+typedef u32 PCS_LOST_LEASE_GENID_T;
+typedef u64 PCS_CHUNK_GENID_T;
+typedef u64 PCS_CHUNK_UID_T;
+typedef u64 PCS_LEASE_GEN_T;
+typedef u32 PCS_POLICY_GEN_T;
+
+/*
+ * File attributes
+ */
+
+struct __pre_aligned(8) pcs_mds_fattr
+{
+ PCS_FILE_ID_T id; /* internal ID */
+ u32 attrib; /* attribute flags */
+ u32 reserved; /* reserved for future use */
+ union {
+ struct {
+			u64	size;		/* the logical file size */
+ u64 phy_size; /* physical size */
+ };
+ struct {
+ PCS_FILE_ID_T src_id; /* ID of the source - used as some API operation parameter only */
+ PCS_FILETIME_T create_ts; /* file create timestamp (on create input only) */
+ };
+ };
+ PCS_NODE_ID_T create_cid; /* file create client ID */
+ PCS_FILETIME_T modify_ts; /* last file modification timestamp */
+ PCS_LEASE_GEN_T xlease_gen; /* lease generation updated on every exclusive lease release */
+ struct pcs_host_info last_host; /* last requested lease client host info */
+};
+
+struct __pre_aligned(8) pcs_mds_sys_info {
+ u32 map_type; /* reserved for RAID */
+ u32 chunk_size; /* global constant */
+ u8 stripe_depth; /* for RAID6/RS */
+ u8 redundancy; /* number of checksums for RAID6/RS */
+	u8	tolerance;	/* write tolerance (how many lost replicas we can tolerate while still allowing writes) */
+ u8 reserved8;
+ u32 strip_width; /* length of strip for RAID6/RS */
+ u32 lease_tout; /* lease expiration timeout (in milliseconds) */
+ u32 reserved;
+} __aligned(8);
+
+#define PCS_CHUNK_SIZE_MIN 4096u
+#define PCS_CHUNK_SIZE_MAX 2147483648u
+#define PCS_STRIPE_DEPTH_MAX 64
+#define PCS_REDUNDANCY_MAX 5
+#define PCS_RAID6_REDUNDANCY 2
+
+
+__pre_packed struct pcs_mds_repl_policy {
+ u8 placement; /* The placement policy. The 0 value corresponds to the maximum physical diversity. Increasing this
+ * number increases placement locality reducing transport latency (see comment on PCS_PLACEMENT_POLICY_CNT).
+ */
+ u8 qos; /* The default QoS */
+ u8 create_type; /* Map type for new file. Valid as parameter for PCS_MDS_FILE_REQ only if the
+ * PCS_FFL_CREATE_IN_CONTAINER flag is set.
+ */
+ u8 reserved[3];
+} __packed;
+
+struct __pre_aligned(8) pcs_mds_repl_info {
+ u8 norm; /* The number of replicas to maintain */
+ u8 limit; /* The minimum number of replicas required to write file */
+ struct pcs_mds_repl_policy policy; /* Replicas allocation policy */
+} __aligned(8);
+
+/* The location defines path to the host so we have 2 more entries in the full path - host itself and the CS node */
+#define PCS_TOPO_MAX_PATH (PCS_LOCATION_PATH_LEN+2)
+
+/* The number of placement policies. Policy 0 forces the topmost component of the path to be different for different chunks.
+ * The policy equal to PCS_LOCATION_PATH_LEN forces placing replicas on different hosts. The policy equal to PCS_LOCATION_PATH_LEN+1
+ * allows for placing replicas on the same host. Higher values are meaningless since replicas can't be allocated on the same CS more than once.
+ */
+#define PCS_PLACEMENT_POLICY_CNT PCS_TOPO_MAX_PATH
+
+/* The maximum allowed number of replicas */
+#define PCS_REPL_MAX 64
+
+/* The number of QoS levels supported */
+#define PCS_NQOS 4
+
+/* Replication info validation macro */
+#define PCS_PLACEMENT_VALID(pl) ((pl) < PCS_PLACEMENT_POLICY_CNT)
+#define PCS_QOS_VALID(q) ((q) < PCS_NQOS)
+#define PCS_POLICY_VALID(p) (PCS_PLACEMENT_VALID((p).placement) && PCS_QOS_VALID((p).qos))
+#define PCS_REPLICAS_VALID_(r) ((r).limit <= (r).norm && (r).norm <= PCS_REPL_MAX)
+#define PCS_REPLICAS_VALID(r) (PCS_REPLICAS_VALID_(r) && (r).limit > 0)
+#define PCS_REPL_VALID(r) (PCS_REPLICAS_VALID(r) && PCS_POLICY_VALID((r).policy))
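A hedged sketch of how the validation macros above might guard replication settings supplied by a client (the values and the surrounding helper are hypothetical; placement == PCS_LOCATION_PATH_LEN corresponds to host-level diversity):

        struct pcs_mds_repl_info r = {
                .norm   = 3,                            /* maintain 3 replicas */
                .limit  = 2,                            /* at least 2 replicas required for writes */
                .policy = { .placement = PCS_LOCATION_PATH_LEN, .qos = 0 },
        };

        if (!PCS_REPL_VALID(r))
                return -EINVAL;                         /* reject inconsistent settings */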
+
+struct __pre_aligned(8) pcs_mds_fileinfo
+{
+ struct pcs_mds_fattr attr; /* attributes */
+ struct pcs_mds_sys_info sys; /* system info */
+ struct pcs_mds_repl_info repl; /* replication info */
+} __aligned(8);
+
+/*
+ * Version numbers
+ */
+
+/* The version number corresponding to the deleted file */
+#define PCS_FILE_GEN_DELETED 0
+
+static inline int pcs_compare_master_ver(PCS_MASTER_GENID_T v1, PCS_MASTER_GENID_T v2)
+{
+ return (int)(v1 - v2);
+}
+
+typedef struct __pre_aligned(8) _PCS_MAP_VERSION_T {
+ /* Master generation is being incremented every time the master MDS is changed
+ * invalidating all maps issued by the previous master
+ */
+ PCS_MASTER_GENID_T master;
+	/* The cluster generation is incremented every time one of the CS servers is dropped.
+ */
+ PCS_CLUSTER_GENID_T cluster;
+	/* The file generation is incremented every time the file size changes.
+ */
+ PCS_FILE_GENID_T file;
+ /* The lost lease generation is being incremented every time the exclusive lease is expired and revoked to
+ * invalidate all maps issued to the previous client.
+ */
+ PCS_LOST_LEASE_GENID_T lost_lease;
+ /* The chunk generation is being incremented every time the chunk replica set is changed to invalidate all maps
+ * referencing the old replica set.
+ */
+ PCS_CHUNK_GENID_T chunk;
+} PCS_MAP_VERSION_T;
+
+static inline void map_version_init(PCS_MAP_VERSION_T * v)
+{
+ memset(v, 0, sizeof(*v));
+}
+
+/* Returns negative value if v1 is older than v2, positive if v1 is newer than v2, 0 if they are equal */
+static inline int map_version_compare(PCS_MAP_VERSION_T const* v1, PCS_MAP_VERSION_T const* v2)
+{
+ int d;
+
+ if ((d = v1->master - v2->master))
+ return d;
+
+ if ((d = v1->cluster - v2->cluster))
+ return d;
+
+ if (v1->file == PCS_FILE_GEN_DELETED) {
+ if (v2->file != PCS_FILE_GEN_DELETED)
+ return 1;
+ } else {
+ if (v2->file == PCS_FILE_GEN_DELETED)
+ return -1;
+ }
+
+ if ((d = v1->file - v2->file))
+ return d;
+
+ if ((d = v1->lost_lease - v2->lost_lease))
+ return d;
+
+ return (int)(v1->chunk - v2->chunk);
+}
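A small illustrative sketch (not taken from the patch) of the comparison semantics: a map whose chunk generation lags behind is considered stale; refresh_map() is a hypothetical helper.

        PCS_MAP_VERSION_T cached, reported;

        map_version_init(&cached);
        map_version_init(&reported);
        reported.chunk = 1;             /* the replica set changed once since the map was cached */
        if (map_version_compare(&cached, &reported) < 0)
                refresh_map();          /* hypothetical: the cached map is older and must be refreshed */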
+
+static inline int map_version_equal(PCS_MAP_VERSION_T * v1, PCS_MAP_VERSION_T *v2)
+{
+ return 0 == map_version_compare(v1, v2);
+}
+
+/* Other version numbers */
+typedef u32 PCS_INTEGRITY_SEQ_T;
+typedef u32 PCS_SYNC_SEQ_T;
+
+static inline int pcs_sync_seq_compare(PCS_SYNC_SEQ_T seq1, PCS_SYNC_SEQ_T seq2)
+{
+ return (int)(seq1 - seq2);
+}
+
+
+//// TODO: dmonakhov: perf counters temporarily disabled
+/*
+ * Performance counter.
+ */
+
+struct __pre_aligned(8) pcs_perf_counter
+{
+ u16 len;
+ u16 _reserved;
+ u32 key;
+ u64 value[0];
+} __aligned(8);
+
+#include "pcs_perfcounters.h"
+
+#define PCS_PERF_CNT_NEXT(p) ((struct pcs_perf_counter*)((char*)(p) + (p)->len))
+
+/* Core perf counters ID */
+enum {
+ PCS_PC_RPC_MSG_COUNT = 0x10001, /* number of currently processed RPC messages */
+ PCS_PC_RPC_CONNS = 0x10002, /* number of RPC connections */
+};
+
+/*
+ * Configuration interface.
+ */
+
+typedef u16 pcs_cfg_type_t;
+typedef u16 pcs_cfg_cls_t;
+
+struct __pre_aligned(8) pcs_cfg_data {
+ pcs_cfg_type_t type;
+ pcs_cfg_cls_t cls;
+ u32 size;
+ union {
+ s64 slong;
+ u64 ulong;
+ char string[1];
+ };
+} __aligned(8);
+
+/* Configuration classes */
+enum {
+ PCS_CFG_GENERIC = 1,
+ PCS_CFG_MDS = 2,
+ PCS_CFG_CS = 4,
+ PCS_CFG_CLIENT = 8,
+ PCS_CFG_INT = 0x1000,
+};
+
+/* Item type */
+enum {
+ PCS_DATA_NONE = 0, /* Used to delete the item regardless of its type */
+ PCS_DATA_SLONG = 1, /* Signed 64 bit value */
+ PCS_DATA_ULONG, /* Unsigned 64 bit value */
+ PCS_DATA_STRING = 0x10
+};
+
+/* The size of the data item. String data will include the terminating 0 */
+#define PCS_CFG_DATA_SZ(d) (offsetof(struct pcs_cfg_data, string)+(d).size+((d).type==PCS_DATA_STRING))
+
+struct __pre_aligned(8) pcs_cfg_item {
+ unsigned name_len;
+ unsigned pad;
+ union {
+ struct pcs_cfg_data data;
+ char buff[1];
+ };
+} __aligned(8);
+
+/* The name offset in the name buffer. Equals to the size of the configuration data. */
+#define PCS_CFG_NAME_OFF(i) PCS_CFG_DATA_SZ((i).data)
+/* The total size of the data item */
+#define PCS_CFG_ITEM_SZ(i) PCS_ALIGN(offsetof(struct pcs_cfg_item, buff)+PCS_CFG_NAME_OFF(i)+(i).name_len+1)
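A worked size calculation for illustration (the concrete name/value lengths are made up, and PCS_ALIGN from pcs_align.h is assumed to round up to an 8-byte boundary): a PCS_DATA_STRING item with an 11-character name and the 4-character value "auto".

        /*
         * PCS_CFG_DATA_SZ  = offsetof(struct pcs_cfg_data, string) + 4 + 1 = 8 + 5 = 13
         * PCS_CFG_NAME_OFF = 13                 (the name is stored right after the value)
         * PCS_CFG_ITEM_SZ  = PCS_ALIGN(offsetof(struct pcs_cfg_item, buff) + 13 + 11 + 1)
         *                  = PCS_ALIGN(33) = 40 bytes
         */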
+
+/* Configuration sequence number incremented every time the configuration is being updated */
+typedef u32 PCS_CONFIG_SEQ_T;
+
+/* The following configuration sequence numbers have special meaning */
+#define PCS_CONFIG_SEQ_ANY ((PCS_CONFIG_SEQ_T)~0U) /* Don't care on set */
+#define PCS_CONFIG_SEQ_INI 0 /* Initial (default) configuration */
+
+#define PCS_EVT_REC_SZ_ALIGN(msg_sz) PCS_ALIGN(offsetof(struct pcs_evt_rec, msg[msg_sz]))
+#define PCS_EVT_REC_SZ_ALIGNED(descr) PCS_EVT_REC_SZ_ALIGN((descr).size)
+
+/* Generic path representation */
+struct __pre_aligned(8) pcs_path {
+ u32 sz;
+ char str[1];
+} __aligned(8);
+
+/* The size of the pcs_path structure with 1 byte reserved for terminating 0 */
+#define PCS_PATH_SZ_(sz) (offsetof(struct pcs_path,str)+(sz)+1)
+#define PCS_PATH_SZ(path) PCS_PATH_SZ_((path).sz)
+
+/* Path alignment */
+#define PCS_PATH_SZ_ALIGN(sz) PCS_ALIGN(PCS_PATH_SZ_(sz))
+#define PCS_PATH_SZ_ALIGNED(n) PCS_PATH_SZ_ALIGN((n).sz)
+#define PCS_PATH_PAD_SZ(sz) (PCS_PATH_SZ_ALIGN(sz)-offsetof(struct pcs_path,str)-(sz))
+
+static inline int cmp_path(struct pcs_path const* p_a, struct pcs_path const* p_b)
+{
+ unsigned _sz = p_a->sz < p_b->sz ? p_a->sz : p_b->sz;
+ int r = memcmp(p_a->str, p_b->str, _sz);
+ if (r) return r;
+ return (int)p_a->sz - (int)p_b->sz;
+}
+
+/* Generic constant string representation */
+struct pcs_cstr {
+ unsigned sz;
+ const char* str;
+};
+
+static inline int cmp_cstr(struct pcs_cstr const* s_a, struct pcs_cstr const* s_b)
+{
+ unsigned _sz = s_a->sz < s_b->sz ? s_a->sz : s_b->sz;
+ int r = memcmp(s_a->str, s_b->str, _sz);
+ if (r) return r;
+ return (int)s_a->sz - (int)s_b->sz;
+}
+
+/* File attribute bits */
+enum
+{
+ /* Attributes used internally by the system components */
+ PCS_FATTR_INTERNAL_ = 0xff,
+
+ /* Attributes has the physical file size maintained */
+	/* The physical file size is maintained for the file */
+
+ /* The file object represents the directory */
+ PCS_FATTR_DIR = 0x1000,
+
+ /* The file object represents symbolic link */
+ PCS_FATTR_LINK = 0x2000,
+
+ /* The directory is the container for combined storage (set with PCS_FATTR_DIR only).
+ * It has several important properties:
+ * - only files are allowed as child objects
+ * - child leases can't be created, the only lease must be acquired on the container
+ * - client may implement IO on the container on its own
+ */
+ PCS_FATTR_CONTAINER = 0x10000,
+
+	/* Our file-inode abstraction is quite generic. A file may be attached to the inode tree at any level.
+	 * Inodes are created or deleted automatically, while the files are managed by clients. A file may
+	 * have child objects, but there is no way to create an empty inode except by creating a special file object
+	 * with the PCS_FATTR_DIR bit set. Resizing such an object as well as IO requests on it will fail with PCS_ERR_IS_DIR.
+	 *
+	 * The client may either ignore the directory tree or assume that all directories in the path must
+	 * be created prior to the file itself. In the latter case it should set the PCS_FFL_POSIX_PATH flag in the operation request.
+	 * If it is set:
+	 * - an attempt to create or resolve a file with a dir object missing in the path will fail with the PCS_ERR_NOT_FOUND error
+	 * - an attempt to delete or rename an object with child objects will fail with the PCS_ERR_NON_EMPTY_DIR error
+ */
+
+ /*
+	   The file has inline data. The MDS prohibits IO map queries for files with this flag set. The client in turn directs
+	   read/write requests to the MDS, getting/setting the file-associated data (see PCS_FA_DATA). May be set on a directory only.
+ Newly created files inherit it from the parent directory.
+ */
+ PCS_FATTR_INLINE = 0x1000000,
+ /*
+ The file consists of variable-length chunks where only the last one is writable. May be set on the directory only.
+ Newly created files inherit it from the parent directory.
+ */
+ PCS_FATTR_LOGSTREAM = 0x2000000,
+
+ /* Don't cache content on the client */
+ PCS_FATTR_NO_CLNT_CACHE = 0x10000000,
+
+ /* The following attributes are being inherited from the parent directory */
+ PCS_FATTR_INHERITABLE_MASK = 0xff000000,
+};
+
+/*
+ * Formatters
+ */
+
+#define VER_FMT "%u:%u:%u:%u:%llu"
+#define VER_ARGS(v) (v).master, (v).cluster, (v).file, (v).lost_lease, (unsigned long long)(v).chunk
+
+#define XID_FMT "[%u.%llu:%llu]"
+#define XID_ARGS(x) (unsigned)(((x).origin.val & PCS_NODE_TYPE_MASK) >> PCS_NODE_TYPE_SHIFT), \
+ NODE_ARGS((x).origin), (unsigned long long)((x).val)
+
+#define CLUSTER_ID_FMT "%08x%08x%08x%08x"
+#define CLUSTER_ID_ARGS(x) (*((unsigned int*)&((x).uuid[12]))), \
+ *((unsigned int*)&((x).uuid[8])), \
+ *((unsigned int*)&((x).uuid[4])), \
+ *((unsigned int*)&((x).uuid[0]))
+
+#define NODE_FMT "%llu"
+#define NODE_ARGS(id) (unsigned long long)((id).val)
+
+#define PEER_FMT "%s#" NODE_FMT
+#define PEER_ARGS(r) pcs_role_to_str((r)->peer_role), NODE_ARGS((r)->peer_id)
+
+#define CUID_FMT "O%08llx"
+#define CUID_ARGS(id) (unsigned long long)(id)
+
+
+#endif /* _PCS_PROT_TYPES_H_ */
diff --git a/fs/fuse/kio/pcs/pcs_req.c b/fs/fuse/kio/pcs/pcs_req.c
new file mode 100644
index 000000000000..117e050691d9
--- /dev/null
+++ b/fs/fuse/kio/pcs/pcs_req.c
@@ -0,0 +1,116 @@
+
+#include <net/sock.h>
+#include <linux/net.h>
+#include <linux/kthread.h>
+#include <linux/types.h>
+#include <linux/highmem.h>
+
+#include "pcs_types.h"
+#include "pcs_sock_io.h"
+#include "pcs_rpc.h"
+#include "pcs_req.h"
+#include "log.h"
+
+static void ireq_timer_handler(unsigned long arg)
+{
+
+ struct pcs_int_request *ireq = (struct pcs_int_request *)arg;
+ pcs_cc_submit(ireq->cc, ireq);
+}
+
+static void __ireq_init(struct pcs_dentry_info *di, struct pcs_cluster_core *cc,
+ struct pcs_int_request *ireq)
+{
+ memset(ireq, 0, sizeof(*ireq));
+ ireq->cc = cc;
+ ireq->ts = ireq->create_ts = jiffies;
+ setup_timer(&ireq->timer, ireq_timer_handler, (unsigned long)ireq);
+ INIT_HLIST_HEAD(&ireq->completion_data.child_list);
+ spin_lock_init(&ireq->completion_data.child_lock);
+ INIT_LIST_HEAD(&ireq->list);
+ ireq->dentry = di;
+}
+
+void ireq_init(struct pcs_dentry_info *di, struct pcs_int_request *ireq)
+{
+ __ireq_init(di, di->cluster, ireq);
+}
+
+void ireq_init_by_cluster(struct pcs_cluster_core *cc, struct pcs_int_request *ireq)
+{
+ __ireq_init(0, cc, ireq);
+}
+
+struct pcs_int_request *ireq_alloc(struct pcs_dentry_info *di)
+{
+ struct pcs_int_request *ireq;
+ ireq =__ireq_alloc();
+ if (!ireq)
+ return NULL;
+
+ __ireq_init(di, di->cluster, ireq);
+ return ireq;
+}
+
+struct pcs_int_request *ireq_alloc_by_cluster(struct pcs_cluster_core *cc)
+{
+ struct pcs_int_request *ireq;
+ ireq =__ireq_alloc();
+ if (!ireq)
+ return NULL;
+
+ __ireq_init(NULL, cc, ireq);
+ return ireq;
+}
+
+void ireq_delay(struct pcs_int_request *ireq)
+{
+ switch (ireq->error.value) {
+ case PCS_ERR_NORES:
+ if (!ireq->last_delay)
+ ireq->last_delay = PCS_ERROR_DELAY;
+ else if ((ireq->last_delay *= 2) > PCS_ERROR_DELAY_MAX)
+ ireq->last_delay = PCS_ERROR_DELAY_MAX;
+ break;
+ default:
+ ireq->last_delay = PCS_ERROR_DELAY;
+ }
+	mod_timer(&ireq->timer, jiffies + ireq->last_delay);
+}
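A hedged sketch of the retry pattern this helper supports; pcs_if_error() and ireq_check_redo() come from the headers included above, but the exact call site below is an assumption, not taken from the patch:

        if (pcs_if_error(&ireq->error) && ireq_check_redo(ireq))
                ireq_delay(ireq);       /* re-submits via the timer; on repeated NORES the delay doubles
                                         * from PCS_ERROR_DELAY up to PCS_ERROR_DELAY_MAX */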
+
+void ireq_handle_hole(struct pcs_int_request *ireq)
+{
+ unsigned int len;
+ unsigned int offset;
+ struct iov_iter it;
+ pcs_api_iorequest_t * ar = ireq->completion_data.parent->apireq.req;
+
+ BUG_ON(ireq->type != PCS_IREQ_IOCHUNK);
+ BUG_ON(ireq->iochunk.direction);
+
+ len = ireq->iochunk.size;
+ offset = 0;
+ iov_iter_init_bad(&it);
+
+ DTRACE("enter m: " MAP_FMT ", ireq:%p \n", MAP_ARGS(ireq->iochunk.map), ireq);
+
+ while (len > 0) {
+ void * map, *buf;
+ size_t copy;
+
+ if (!iov_iter_count(&it))
+ ar->get_iter(ar->datasource, ireq->iochunk.dio_offset + offset, &it);
+
+ map = iov_iter_kmap_atomic(&it, &buf, ©);
+ if (copy > len)
+ copy = len;
+ memset(buf, 0, copy);
+ if (map)
+ kunmap_atomic(map);
+ len -= copy;
+ offset += copy;
+ iov_iter_advance(&it, copy);
+ }
+
+ ireq_complete(ireq);
+}
diff --git a/fs/fuse/kio/pcs/pcs_req.h b/fs/fuse/kio/pcs/pcs_req.h
new file mode 100644
index 000000000000..c8481a48413a
--- /dev/null
+++ b/fs/fuse/kio/pcs/pcs_req.h
@@ -0,0 +1,320 @@
+#ifndef _PCS_REQ_H_
+#define _PCS_REQ_H_ 1
+
+#include <linux/workqueue.h>
+#include "pcs_error.h"
+#include "pcs_sock_io.h"
+#include "pcs_map.h"
+#include "pcs_cs_prot.h"
+#include "pcs_rpc.h"
+#include "pcs_cs.h"
+
+///////////////////////////
+
+enum
+{
+ PCS_IREQ_API = 0, /* IO request from API */
+ PCS_IREQ_IOCHUNK= 1, /* Internal IO request */
+ PCS_IREQ_LEASE = 2, /* Lease op request */
+ PCS_IREQ_FILE = 3, /* File op request */
+ PCS_IREQ_READDIR= 4, /* Readdir request */
+ PCS_IREQ_NOOP = 5, /* NOOP request */
+ PCS_IREQ_FINI = 6, /* Stop pcs process */
+ PCS_IREQ_TRUNCATE=7, /* Internal map truncate request */
+ PCS_IREQ_FLUSH = 8, /* Sync request */
+ PCS_IREQ_STATFS = 9, /* statfs request */
+ PCS_IREQ_LOOKUP = 10, /* lookup request */
+ PCS_IREQ_CSCONN = 11, /* connect to CS and auth */
+ PCS_IREQ_CUSTOM = 16, /* generic request */
+ PCS_IREQ_WRAID = 17, /* compound raid6 write request */
+ PCS_IREQ_RRAID = 18, /* compound raid6 read request */
+ PCS_IREQ_KAPI = 65 /* IO request from kernel API */
+};
+
+/* Generic request: all internal messages are queued using this struct.
+ * Messages are distinguished by their "type" field.
+ */
+
+struct pcs_int_request
+{
+ struct pcs_cluster_core* cc;
+
+ struct list_head list;
+ struct pcs_dentry_info* dentry;
+
+ unsigned int type;
+ pcs_error_t error;
+ int flags;
+#define IREQ_F_FATAL 1
+#define IREQ_F_ONCE 2
+#define IREQ_F_SEQ_READ 4
+#define IREQ_F_RND_WEIGHT 8
+#define IREQ_F_CACHED 0x10
+#define IREQ_F_SEQ 0x20
+#define IREQ_F_MAPPED 0x40
+#define IREQ_F_MAP_REQUIRED 0x80
+#define IREQ_F_LOC_TOKEN 0x100
+#define IREQ_F_NOFLUSH 0x200
+#define IREQ_F_WB_SUSP 0x400
+#define IREQ_F_RECV_SPLICE 0x800
+
+ atomic_t iocount;
+
+ int qdepth;
+ abs_time_t ts;
+ abs_time_t ts_sent;
+ PCS_NODE_ID_T wait_origin;
+
+ struct {
+ struct pcs_int_request * parent;
+ void* ctx;
+ void* priv;
+ struct hlist_head child_list;
+ struct hlist_node child_node;
+ spinlock_t child_lock;
+ } completion_data;
+
+ void (*complete_cb)(struct pcs_int_request *ireq);
+
+ abs_time_t create_ts;
+
+ pcs_timer_t timer;
+ unsigned last_delay;
+
+	/* TODO: the work struct is only required for API requests.
+	   It should probably be embedded in apireq.
+ */
+ struct work_struct worker;
+
+ union {
+ struct {
+ struct pcs_map_entry *map;
+			//// Temporarily disable flow
+ struct pcs_flow_node *flow;
+ ////struct pcs_splice_buf *splice_rbuf;
+ u8 direction;
+ u8 role;
+ short cs_index;
+ unsigned int size;
+ unsigned int dio_offset;
+ u64 chunk;
+ u64 offset;
+ struct pcs_cs_list *csl;
+ PCS_NODE_ID_T banned_cs;
+ struct pcs_msg msg;
+ struct pcs_cs_iohdr hbuf; /* Buffer for header.
+ * A little ugly
+ */
+ } iochunk;
+
+ struct {
+ struct pcs_map_entry *map; /* map to flush */
+ struct pcs_cs_list *csl;
+ struct pcs_msg *msg;
+ } flushreq;
+
+ struct {
+ u64 offset;
+ int phase;
+ PCS_MAP_VERSION_T version;
+ } truncreq;
+
+ struct {
+ unsigned int flags;
+ unsigned int tout;
+ int retries;
+ } leasereq;
+
+ struct {
+ unsigned int op;
+ unsigned int flags;
+ union {
+ struct pcs_dentry_info *dst_de; /* Only for rename */
+ off_t new_size; /* Only for resize */
+ const char *data; /* Only for symlink */
+ } arg;
+ } filereq;
+
+ struct {
+ pcs_api_csconnreq_t *req; /* Client request */
+ struct pcs_cluster_core *clu; /* dentry == NULL */
+ struct pcs_msg msg;
+ int out_fd;
+ } csconnreq;
+
+ struct {
+ void (*action)(struct pcs_int_request *ireq);
+ void (*destruct)(struct pcs_int_request *ireq);
+ void* ctx;
+ } custom;
+
+ struct {
+ pcs_api_iorequest_t * req; /* Client request */
+ unsigned int dio_offset; /* MBZ */
+ void* h; /* API handle */
+ } apireq;
+
+ };
+};
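A heavily hedged sketch of driving a generic (PCS_IREQ_CUSTOM) request through the engine. It assumes the processing loop dispatches such requests via custom.action (that dispatch lives outside this header and is not shown here); my_action and my_done are hypothetical:

        static void my_action(struct pcs_int_request *ireq)
        {
                /* ... perform the custom work ... */
                ireq_complete(ireq);                    /* ends up in ireq->complete_cb */
        }

        /* submission side */
        struct pcs_int_request *ireq = ireq_alloc_by_cluster(cc);

        if (!ireq)
                return -ENOMEM;
        ireq->type = PCS_IREQ_CUSTOM;
        ireq->custom.action   = my_action;
        ireq->custom.destruct = NULL;
        ireq->complete_cb     = my_done;                /* hypothetical completion callback */
        pcs_cc_submit(cc, ireq);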
+
+// FROM pcs_cluster_core.h
+
+struct pcs_clnt_config
+{
+ int map_timeout;
+ int abort_timeout;
+ int kernel_cache_en;
+ int wmss;
+ int rmss;
+ int lmss;
+ int lic_status;
+ int io_locality;
+ int io_tweaks;
+ int net_10gbit;
+ int local_sndbuf;
+ int tcp_sndbuf;
+ int tcp_rcvbuf;
+};
+
+struct pcs_cluster_core
+{
+ struct list_head work_queue; /* Internal queue */
+ struct list_head completion_queue;/* Internal queue for ireqs to complete */
+ struct work_struct main_job;
+ struct work_struct completion_job;
+
+ struct pcs_cs_set css; /* Table of all CSs */
+ struct pcs_map_set maps; /* Global map data */
+ struct pcs_rpc_engine eng; /* RPC engine */
+ struct workqueue_struct *wq;
+//// struct pcs_ratelimit rlim; /* Rate limiter */
+//// struct pcs_rng rng;
+ /* <SKIP */
+
+ struct {
+ struct pcs_clnt_config def;
+ struct pcs_clnt_config curr;
+ PCS_CONFIG_SEQ_T sn;
+ int in_progress;
+ } cfg;
+
+ int io_locality;
+ int io_tweaks;
+ int iolat_cutoff;
+ int netlat_cutoff;
+ int use_unix_socket;
+
+ /*
+	 * Our cluster core may be integrated into various implementations by customizing the following request processing methods.
+ * The core does not provide any of them out of the box. Note that only the first one is mandatory.
+ */
+ struct {
+ void (*ireq_process) (struct pcs_int_request *);
+ void (*ireq_on_error) (struct pcs_int_request *);
+ int (*ireq_check_redo)(struct pcs_int_request *);
+ } op;
+
+ int (*abort_callback)(struct pcs_cluster_core *cc, struct pcs_int_request *ireq);
+ struct fuse_conn *fc;
+ spinlock_t lock;
+};
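A minimal sketch of the embedding contract described in the comment inside the struct above. Only the first hook is mandatory; my_ireq_process and my_check_redo are embedder-provided placeholders, not functions from this patch:

        cc->op.ireq_process    = my_ireq_process;       /* mandatory: executes queued requests */
        cc->op.ireq_on_error   = NULL;                  /* optional error hook */
        cc->op.ireq_check_redo = my_check_redo;         /* optional: returning 0 means "do not retry" */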
+
+static inline struct pcs_cluster_core *cc_from_csset(struct pcs_cs_set * css)
+{
+ return container_of(css, struct pcs_cluster_core, css);
+}
+
+static inline struct pcs_cluster_core *cc_from_cs(struct pcs_cs * cs)
+{
+ return cc_from_csset(cs->css);
+}
+
+static inline struct pcs_cluster_core *cc_from_maps(struct pcs_map_set *maps)
+{
+ return container_of(maps, struct pcs_cluster_core, maps);
+}
+
+void pcs_cc_submit(struct pcs_cluster_core *cc, struct pcs_int_request* ireq);
+void pcs_cc_requeue(struct pcs_cluster_core *cc, struct list_head * q);
+////// FROM pcs_cluster.h
+static inline void pcs_sreq_attach(struct pcs_int_request * sreq, struct pcs_int_request * parent)
+{
+ sreq->completion_data.parent = parent;
+ sreq->ts = parent->ts;
+ spin_lock(&parent->completion_data.child_lock);
+ hlist_add_head(&sreq->completion_data.child_node, &parent->completion_data.child_list);
+ atomic_inc(&parent->iocount);
+ spin_unlock(&parent->completion_data.child_lock);
+}
+
+static inline int pcs_sreq_detach(struct pcs_int_request * sreq)
+{
+ struct pcs_int_request * parent = sreq->completion_data.parent;
+
+ BUG_ON(!parent);
+ BUG_ON(!atomic_read(&parent->iocount));
+
+ spin_lock(&parent->completion_data.child_lock);
+ hlist_del(&sreq->completion_data.child_node);
+ spin_unlock(&parent->completion_data.child_lock);
+
+ return !atomic_dec_and_test(&parent->iocount);
+}
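An illustrative pattern (not taken from the patch) for the attach/detach pair: the parent's iocount tracks outstanding children, and pcs_sreq_detach() returns 0 once the last child is gone, so the caller can complete the parent; submit_sub_request() is a hypothetical helper.

        /* fan-out side */
        pcs_sreq_attach(sreq, parent);
        submit_sub_request(sreq);                       /* hypothetical submission helper */

        /* child completion side */
        struct pcs_int_request *parent = sreq->completion_data.parent;

        if (!pcs_sreq_detach(sreq))
                ireq_complete(parent);                  /* the last child has detached */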
+
+
+static inline struct pcs_int_request *ireq_from_msg(struct pcs_msg *msg)
+{
+ return container_of(msg, struct pcs_int_request, iochunk.msg);
+}
+
+static inline void ireq_process(struct pcs_int_request *ireq)
+{
+ (ireq->cc->op.ireq_process)(ireq);
+}
+
+static inline void ireq_on_error(struct pcs_int_request *ireq)
+{
+ if (ireq->cc->op.ireq_on_error) (ireq->cc->op.ireq_on_error)(ireq);
+}
+
+static inline void ireq_complete(struct pcs_int_request *ireq)
+{
+ BUG_ON(!hlist_empty(&ireq->completion_data.child_list));
+
+ if (pcs_if_error(&ireq->error))
+ ireq_on_error(ireq);
+ ireq->complete_cb(ireq);
+}
+
+static inline int ireq_check_redo(struct pcs_int_request *ireq)
+{
+ if (ireq->flags & IREQ_F_FATAL)
+ return 0;
+ if (ireq->cc->op.ireq_check_redo)
+ return (ireq->cc->op.ireq_check_redo)(ireq);
+ return 1;
+}
+
+static inline int ireq_is_timed_out(struct pcs_int_request *ireq)
+{
+ int timed_out;
+ timed_out = ireq->cc->cfg.curr.abort_timeout &&
+ ireq->create_ts + ireq->cc->cfg.curr.abort_timeout < jiffies;
+ if (timed_out && ireq->cc->abort_callback)
+ return ireq->cc->abort_callback(ireq->cc, ireq);
+ return timed_out;
+}
+
+struct pcs_int_request * __ireq_alloc(void);
+struct pcs_int_request *ireq_alloc(struct pcs_dentry_info *di);
+struct pcs_int_request *ireq_alloc_by_cluster(struct pcs_cluster_core *cc);
+void ireq_init(struct pcs_dentry_info *di, struct pcs_int_request *);
+void ireq_init_by_cluster(struct pcs_cluster_core *cc, struct pcs_int_request *);
+void ireq_destroy(struct pcs_int_request *);
+
+void ireq_delay(struct pcs_int_request *ireq);
+void ireq_handle_hole(struct pcs_int_request *ireq);
+
+void pcs_process_ireq(struct pcs_int_request *ireq);
+
+#endif /* _PCS_REQ_H_ */
diff --git a/fs/fuse/kio/pcs/pcs_rpc.c b/fs/fuse/kio/pcs/pcs_rpc.c
new file mode 100644
index 000000000000..2ec7423a3f54
--- /dev/null
+++ b/fs/fuse/kio/pcs/pcs_rpc.c
@@ -0,0 +1,1314 @@
+/* An attempt at a universal RPC layer.
+ *
+ * All the components (except for MDS) used to assume asymmetrical communication:
+ * if some connection is open actively, it sends requests, but does not receive requests.
+ * If it is open passively, it receives requests, but sends only responses.
+ * This layer does not impose this limitation.
+ *
+ * API:
+ * pcs_rpc_create(struct pcs_rpc_engine * eng, struct pcs_rpc_params *parm, struct rpc_ops * ops)
+ * - create new rpc client with requested parameters/ops
+ * pcs_rpc_close(struct pcs_rpc * ep)
+ * - close client. Probably it will not be destroyed immediately, but it is guaranteed
+ * that ops will not be called anymore. If some messages are queued inside rpc engine,
+ * they will be completed before return from pcs_rpc_close(), but if messages are somewhere
+ * under control of client, msg->done() can be called later.
+ */
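A usage sketch of the lifecycle described above, following the prototypes in the comment; it assumes pcs_rpc_create() returns a struct pcs_rpc pointer (NULL on failure), and my_params / my_rpc_ops are embedder-supplied placeholders:

        struct pcs_rpc *ep = pcs_rpc_create(&cc->eng, &my_params, &my_rpc_ops);

        if (!ep)
                return -ENOMEM;
        /* ... send requests over ep ... */
        pcs_rpc_close(ep);              /* ops are guaranteed not to be called after this returns */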
+
+#include <net/sock.h>
+#include <linux/net.h>
+#include <linux/kthread.h>
+#include <linux/types.h>
+
+
+#include "pcs_types.h"
+#include "pcs_sock_io.h"
+#include "pcs_rpc.h"
+#include "pcs_cluster.h"
+#include "log.h"
+
+static void timer_work(struct work_struct *w);
+static int rpc_gc_classify(struct pcs_rpc * ep);
+
+static unsigned int rpc_hash(PCS_NODE_ID_T * id)
+{
+ return *(unsigned int*)id % PCS_RPC_HASH_SIZE;
+}
+
+static struct pcs_rpc *
+pcs_rpc_lookup(struct pcs_rpc_engine * eng, PCS_NODE_ID_T * id) __attribute__((unused));
+
+static struct pcs_rpc *
+pcs_rpc_lookup(struct pcs_rpc_engine * eng, PCS_NODE_ID_T * id)
+{
+ struct pcs_rpc * ep;
+
+ hlist_for_each_entry(ep, &eng->ht[rpc_hash(id)], link) {
+ if (memcmp(&ep->peer_id, id, sizeof(ep->peer_id)) == 0)
+ return pcs_rpc_get(ep);
+ }
+ return NULL;
+}
+static void rpc_add_hash(struct pcs_rpc * ep) __attribute__ ((unused));
+static void rpc_del_hash(struct pcs_rpc * ep) __attribute__ ((unused));
+
+static void rpc_add_hash(struct pcs_rpc * ep)
+{
+ if (!hlist_unhashed(&ep->link))
+ hlist_del(&ep->link);
+
+ if (!(ep->flags & PCS_RPC_F_HASHED)) {
+ ep->flags |= PCS_RPC_F_HASHED;
+ pcs_rpc_get(ep);
+ }
+
+ hlist_add_head(&ep->link, &ep->eng->ht[rpc_hash(&ep->peer_id)]);
+}
+
+static void rpc_del_hash(struct pcs_rpc * ep)
+{
+ if (ep->flags & PCS_RPC_F_HASHED) {
+ ep->flags &= ~PCS_RPC_F_HASHED;
+ hlist_del(&ep->link);
+ hlist_add_head(&ep->link, &ep->eng->unhashed);
+ pcs_rpc_put(ep);
+ }
+}
+
+
+struct pcs_msg * pcs_rpc_lookup_xid(struct pcs_rpc * ep, PCS_XID_T * xid)
+{
+ struct pcs_msg * msg;
+
+	/* TODO: the lookup may be optimized by using a hash instead of a list */
+ list_for_each_entry(msg, &ep->pending_queue, list) {
+ struct pcs_rpc_hdr * h = (struct pcs_rpc_hdr *)msg_inline_head(msg);
+ if (memcmp(&h->xid, xid, sizeof(PCS_XID_T)) == 0)
+ return msg;
+ }
+ return NULL;
+}
+
+static void pcs_set_rpc_error(pcs_error_t * err, int error, struct pcs_rpc * ep)
+{
+ err->value = error;
+
+ if (error == PCS_ERR_NOMEM) {
+		/* Sad exception: NOMEM is definitely a local error. XXX Find a way to beautify this. */
+ err->remote = 0;
+ } else {
+ err->remote = 1;
+ err->offender = ep->peer_id;
+ }
+}
+
+static void pcs_msg_add_calendar(struct pcs_msg * msg, bool update)
+{
+ unsigned int kill_slot;
+ struct pcs_rpc *ep = msg->rpc;
+
+ BUG_ON(!ep);
+ kill_slot = update? msg->rpc->kill_arrow + ((msg->timeout + HZ -1) / HZ) : msg->kill_slot;
+ kill_slot = kill_slot & (RPC_MAX_CALENDAR - 1);
+ hlist_add_head(&msg->kill_link, &ep->kill_calendar[kill_slot]);
+ msg->kill_slot = kill_slot;
+
+ if (unlikely(!timer_pending(&ep->calendar_work.timer))) {
+ struct pcs_cluster_core *cc = cc_from_rpc(ep->eng);
+
+ mod_delayed_work(cc->wq, &ep->calendar_work, HZ);
+ }
+
+}
+
+void pcs_msg_del_calendar(struct pcs_msg * msg)
+{
+ int kill_slot = msg->kill_slot;
+
+ if (hlist_unhashed(&msg->kill_link))
+ return;
+
+ BUG_ON(kill_slot >= RPC_MAX_CALENDAR);
+ BUG_ON(!msg->rpc);
+ BUG_ON((msg->kill_slot != kill_slot));
+
+ hlist_del_init(&msg->kill_link);
+
+}
+
+void rpc_abort(struct pcs_rpc * ep, int fatal, int error)
+{
+ int state = ep->state;
+ struct list_head failed_list;
+
+ BUG_ON(!mutex_is_locked(&ep->mutex));
+ TRACE("ep:%p->state:%d fatal:%d error:%d\n", ep, state, fatal, error);
+
+ ep->flags &= ~(PCS_RPC_F_PEER_VERIFIED | PCS_RPC_F_PEER_AUTHORIZED);
+ ep->flags &= ~PCS_RPC_F_PEER_ID;
+
+ if (state == PCS_RPC_DESTROY || state == PCS_RPC_ABORT)
+ return;
+
+ /* Passively open connections are not reconnected */
+ if (ep->flags & (PCS_RPC_F_PASSIVE|PCS_RPC_F_NO_RETRY|PCS_RPC_F_DEAD))
+ fatal = 1;
+
+ ep->state = fatal ? PCS_RPC_ABORT : PCS_RPC_UNCONN;
+ cancel_delayed_work(&ep->timer_work);
+
+ pcs_rpc_get(ep);
+ INIT_LIST_HEAD(&failed_list);
+
+ while (!list_empty(&ep->pending_queue)) {
+ struct pcs_msg * msg = list_first_entry(&ep->pending_queue, struct pcs_msg, list);
+ list_move_tail(&msg->list, &failed_list);
+ TRACE("aborted msg to " PEER_FMT ", tmo=%d, err=%d, %ld", PEER_ARGS(ep),
+ msg->timeout, error, (long)(msg->start_time + msg->timeout - jiffies));
+ pcs_msg_del_calendar(msg);
+ msg->stage = PCS_MSG_STAGE_NONE;
+ }
+ if (fatal) {
+ while (!list_empty(&ep->state_queue)) {
+ struct pcs_msg * msg = list_first_entry(&ep->state_queue, struct pcs_msg, list);
+ list_move_tail(&msg->list, &failed_list);
+ TRACE("aborted unsent msg to " PEER_FMT ", tmo=%d, err=%d", PEER_ARGS(ep),
+ msg->timeout, error);
+ pcs_msg_del_calendar(msg);
+ msg->stage = PCS_MSG_STAGE_NONE;
+ }
+ }
+
+ if (ep->conn) {
+ struct pcs_ioconn * ioconn = ep->conn;
+ struct pcs_sockio * conn = sio_from_ioconn(ioconn);
+
+ ep->conn = NULL;
+ if (ep->gc)
+ list_lru_del(&ep->gc->lru, &ep->lru_link);
+
+ conn->parent = NULL;
+ pcs_sock_error(conn, error);
+ }
+
+ if (ep->state == PCS_RPC_UNCONN) {
+ struct pcs_cluster_core *cc = cc_from_rpc(ep->eng);
+
+ ep->state = PCS_RPC_HOLDDOWN;
+ queue_delayed_work(cc->wq, &ep->timer_work, ep->params.holddown_timeout);
+ }
+
+ while (!list_empty(&failed_list)) {
+ struct pcs_msg * msg = list_first_entry(&failed_list, struct pcs_msg, list);
+ list_del_init(&msg->list);
+ pcs_set_rpc_error(&msg->error, error, ep);
+ BUG_ON(!hlist_unhashed(&msg->kill_link));
+ msg->done(msg);
+ }
+
+ if (ep->state != PCS_RPC_ABORT)
+ goto out;
+
+ if (!(ep->flags & PCS_RPC_F_DEAD)) {
+ /* RPC is aborted, notify its owner. Owner is supposed to close us. */
+ if (ep->ops->state_change)
+ ep->ops->state_change(ep, error);
+ }
+
+out:
+ pcs_rpc_put(ep);
+}
+
+/* Client close. */
+void pcs_rpc_close(struct pcs_rpc * ep)
+{
+ mutex_lock(&ep->mutex);
+ BUG_ON(ep->flags & PCS_RPC_F_DEAD);
+ BUG_ON(ep->flags & PCS_RPC_F_PASSIVE);
+
+ ep->flags |= PCS_RPC_F_DEAD;
+ rpc_abort(ep, 1, PCS_ERR_NET_ABORT);
+ ep->state = PCS_RPC_DESTROY;
+ mutex_unlock(&ep->mutex);
+
+ pcs_rpc_put(ep);
+
+}
+
+void pcs_rpc_attach_new_ep(struct pcs_rpc * ep, struct pcs_rpc_engine * eng)
+{
+ eng->nrpcs++;
+ hlist_add_head(&ep->link, &eng->unhashed);
+ ep->eng = eng;
+ ep->state = PCS_RPC_UNCONN;
+ ep->flags = 0;
+ atomic_set(&ep->refcnt, 1);
+ ep->retries = 0;
+ ep->peer_role = PCS_NODE_ROLE_TEST;
+ ep->peer_flags = 0;
+ ep->peer_version = ~0U;
+ ep->conn = NULL;
+ ep->private = NULL;
+ INIT_LIST_HEAD(&ep->pending_queue);
+ INIT_LIST_HEAD(&ep->state_queue);
+ INIT_LIST_HEAD(&ep->input_queue);
+ INIT_LIST_HEAD(&ep->lru_link);
+
+ spin_lock_init(&ep->q_lock);
+ mutex_init(&ep->mutex);
+ ep->accounted = 0;
+ ep->netlat_min = ~0U;
+ ep->netlat_max = 0;
+ atomic_set(&ep->netlat_cnt, 0);
+ atomic64_set(&ep->netlat_avg, 0);
+ ep->cpu = WORK_CPU_UNBOUND;
+
+ ep->gc = NULL;
+ if (eng->max_gc_index)
+ ep->gc = &eng->gc[0];
+
+ if (!timer_pending(&eng->stat_work.timer)) {
+ struct pcs_cluster_core *cc = cc_from_rpc(eng);
+
+ mod_delayed_work(cc->wq, &eng->stat_work, PCS_MSG_MAX_CALENDAR * HZ);
+ }
+}
+
+void pcs_rpc_destroy(struct pcs_rpc * ep)
+{
+ BUG_ON(ep->state != PCS_RPC_DESTROY);
+ BUG_ON(ep->flags & PCS_RPC_F_HASHED);
+ BUG_ON(!(ep->flags & PCS_RPC_F_DEAD));
+ BUG_ON(!list_empty(&ep->input_queue));
+ BUG_ON(!list_empty(&ep->state_queue));
+ BUG_ON(!list_empty(&ep->pending_queue));
+ BUG_ON(timer_pending(&ep->timer_work.timer));
+
+ /* pcs_free(ep->sun); */
+ /* ep->sun = NULL; */
+ if (ep->gc)
+ list_lru_del(&ep->gc->lru, &ep->lru_link);
+ hlist_del(&ep->link);
+ ep->eng->nrpcs--;
+ cancel_delayed_work_sync(&ep->calendar_work);
+ if (ep->eng->nrpcs == 0)
+ cancel_delayed_work_sync(&ep->eng->stat_work);
+
+ memset(ep, 0xFF, sizeof(*ep));
+ kfree(ep);
+}
+
+static void rpc_eof_cb(struct pcs_sockio * sio)
+{
+ struct pcs_rpc * ep = sio->parent;
+
+ if (ep == NULL)
+ return;
+
+	/* The dead socket is finally closed; we could have already opened another one.
+	 * I feel uneasy about this.
+ */
+ if (&sio->ioconn != ep->conn)
+ return;
+
+ rpc_abort(ep, 0, PCS_ERR_NET_ABORT);
+}
+
+
+struct pcs_msg * pcs_rpc_alloc_error_response(struct pcs_rpc * ep, struct pcs_rpc_hdr * req_hdr, int err, int size)
+{
+ struct pcs_msg * eresp;
+ struct pcs_rpc_error_resp * eh;
+
+ BUG_ON(size < sizeof(struct pcs_rpc_error_resp));
+
+ eresp = pcs_alloc_response(req_hdr, size);
+ if (eresp) {
+ eh = (struct pcs_rpc_error_resp *)eresp->_inline_buffer;
+ eh->hdr.type = PCS_RPC_ERROR_RESP;
+ eh->offender = ep->eng->local_id;
+ eh->code = err;
+ eh->npayloads = 0;
+ memset(&eh->payload, 0, sizeof(eh->payload));
+ }
+ return eresp;
+}
+
+void pcs_rpc_error_respond(struct pcs_rpc * ep, struct pcs_msg * msg, int err)
+{
+ struct pcs_msg * eresp;
+ struct pcs_rpc_hdr * h = (struct pcs_rpc_hdr *)msg->_inline_buffer;
+
+ if (ep->state < PCS_RPC_AUTH || ep->state > PCS_RPC_WORK)
+ return;
+
+ eresp = pcs_rpc_alloc_error_response(ep, h, err, sizeof(struct pcs_rpc_error_resp));
+ if (eresp) {
+ struct pcs_sockio *sio = sio_from_ioconn(ep->conn);
+ pcs_sock_sendmsg(sio, eresp);
+ }
+}
+
+/* After the client gets the csconn_complete() callback, it performs some actions and completes the switch
+ * to the WORK state by calling this function.
+ */
+static void pcs_rpc_enable(struct pcs_rpc * ep, int error)
+{
+ struct pcs_cluster_core *cc = cc_from_rpc(ep->eng);
+
+ BUG_ON(!mutex_is_locked(&ep->mutex));
+ BUG_ON(ep->state != PCS_RPC_APPWAIT);
+
+ if (error) {
+ rpc_abort(ep, 1, error);
+ return;
+ }
+
+ if (ep->gc) {
+ int idx = rpc_gc_classify(ep);
+
+ if (ep->eng->gc + idx != ep->gc) {
+ list_lru_del(&ep->gc->lru, &ep->lru_link);
+ ep->gc = ep->eng->gc + idx;
+ list_lru_add(&ep->gc->lru, &ep->lru_link);
+ }
+ }
+ TRACE("ep(%p)->state: WORK\n", ep);
+ ep->state = PCS_RPC_WORK;
+ queue_work(cc->wq, &ep->work);
+}
+
+static void handle_response(struct pcs_rpc * ep, struct pcs_msg * msg)
+{
+ struct pcs_rpc_hdr * h = (struct pcs_rpc_hdr *)msg->_inline_buffer;
+ struct pcs_msg * req;
+
+ /* Use of iocount is unusual and deserves an explanation. If response
+ * is processed synchronously, this iocount is unnecessary.
+ * But if done() needs to queue response, it can increase iocount to hold the message
+ * for itself.
+ */
+ pcs_msg_io_start(msg, pcs_free_msg);
+ req = pcs_rpc_lookup_xid(ep, &h->xid);
+ if (req == NULL)
+ goto drop;
+
+ pcs_msg_del_calendar(req);
+ list_del(&req->list);
+ if (h->type == PCS_RPC_ERROR_RESP) {
+ struct pcs_rpc_error_resp * eh = (struct pcs_rpc_error_resp *)msg->_inline_buffer;
+
+ if (msg->size < sizeof(struct pcs_rpc_error_resp))
+ pcs_set_rpc_error(&req->error, PCS_ERR_PROTOCOL, ep);
+ else {
+ req->error = (pcs_error_t){ .value = eh->code, .remote = 1, .offender = eh->offender };
+ req->response = msg;
+ }
+ } else {
+ struct pcs_rpc_hdr * req_h = (struct pcs_rpc_hdr *)msg_inline_head(req);
+
+ if ((req_h->type ^ h->type) & ~PCS_RPC_DIRECTION)
+ pcs_set_rpc_error(&req->error, PCS_ERR_PROTOCOL, ep);
+ else
+ req->response = msg;
+ }
+
+ if (ep->ops->hook_response)
+ ep->ops->hook_response(ep, req);
+
+ req->stage = PCS_MSG_STAGE_DONE;
+ BUG_ON(!hlist_unhashed(&msg->kill_link));
+ req->done(req);
+
+drop:
+ pcs_msg_io_end(msg);
+}
+
+static void handle_keep_waiting(struct pcs_rpc * ep, struct pcs_msg * msg)
+{
+ struct pcs_rpc_keep_waiting * h = (struct pcs_rpc_keep_waiting *)msg->_inline_buffer;
+ struct pcs_msg * req;
+
+ if (h->hdr.len < sizeof(struct pcs_rpc_keep_waiting))
+ return;
+
+ TRACE("Received keep wait from " NODE_FMT " for request " XID_FMT,
+ NODE_ARGS(h->hdr.xid.origin), XID_ARGS(h->xid));
+
+ req = pcs_rpc_lookup_xid(ep, &h->xid);
+ if (!req)
+ return;
+
+ if (ep->ops->keep_waiting)
+ ep->ops->keep_waiting(ep, req, msg);
+
+ /* Restart kill timer as if message arrived right now */
+ if (!hlist_unhashed(&req->kill_link)) {
+ pcs_msg_del_calendar(req);
+ pcs_msg_add_calendar(req, 1);
+ }
+
+ /* Requeue message to tail of pending queue and restart RPC timer */
+ if (req->stage == PCS_MSG_STAGE_WAIT) {
+ req->start_time = jiffies;
+ list_move_tail(&req->list, &ep->pending_queue);
+ }
+}
+
+void pcs_rpc_cancel_request(struct pcs_msg * msg)
+{
+ pcs_msg_del_calendar(msg);
+ list_del(&msg->list);
+ msg->stage = PCS_MSG_STAGE_NONE;
+ pcs_set_rpc_error(&msg->error, PCS_ERR_CANCEL_KEEPWAIT, msg->rpc);
+ msg->done(msg);
+}
+
+void rpc_work_input(struct pcs_msg * msg)
+{
+ struct pcs_rpc * ep = msg->rpc;
+ struct pcs_rpc_hdr * h = (struct pcs_rpc_hdr *)msg->_inline_buffer;
+
+ if (ep == NULL || ep->state != PCS_RPC_WORK)
+ goto drop;
+
+ msg->done = pcs_free_msg;
+
+ if (RPC_IS_RESPONSE(h->type)) {
+ handle_response(ep, msg);
+ return;
+ } else if (h->type == PCS_RPC_KEEP_WAITING) {
+ handle_keep_waiting(ep, msg);
+ } else {
+ int res;
+
+ res = ep->ops->demux_request(ep, msg);
+ /* Successfully demuxed */
+ if (res == 0)
+ return;
+
+ /* Client can return error code to pass back to requestor */
+ pcs_rpc_error_respond(ep, msg, res);
+ }
+
+drop:
+ pcs_free_msg(msg);
+}
+
+struct pcs_msg * rpc_get_hdr(struct pcs_sockio * sio)
+{
+ struct pcs_rpc * ep = (struct pcs_rpc *)sio->parent;
+ struct pcs_rpc_hdr * h = (struct pcs_rpc_hdr*)sio_inline_buffer(sio);
+ struct pcs_msg * msg;
+ void (*next_input)(struct pcs_msg *);
+
+ if (ep == NULL)
+ return NULL;
+
+ /* Fatal stream format error */
+ if (h->len < sizeof(struct pcs_rpc_hdr) || h->len > ep->params.max_msg_size) {
+ pcs_log(0, "Bad message header %u %u\n", h->len, h->type);
+ return NULL;
+ }
+
+ switch (ep->state) {
+ case PCS_RPC_WORK:
+ /* Client can override get_hdr to allocate special buffer. */
+ if (ep->ops->get_hdr) {
+ msg = ep->ops->get_hdr(ep, h);
+ if (msg)
+ return msg;
+ }
+ next_input = rpc_work_input;
+ break;
+ default:
+ pcs_log(0, "Received msg in bad state %u\n", ep->state);
+ BUG();
+ return NULL;
+
+ }
+
+ msg = pcs_rpc_alloc_input_msg(ep, h->len);
+ if (!msg) {
+ pcs_sock_throttle(sio);
+ return NULL;
+ }
+
+ memcpy(msg->_inline_buffer, h, sizeof(struct pcs_rpc_hdr));
+ msg->done = next_input;
+ msg->size = h->len;
+ msg->private = NULL;
+ return msg;
+}
+
+
+/* Start connect. It is triggered by a message sent to this peer, or it can be called
+ * explicitly if the caller needs to steal the csconn from userspace.
+ */
+void pcs_rpc_connect(struct pcs_rpc * ep)
+{
+
+ /* Nothing to do, connect is already initiated or in holddown state */
+ if (ep->state != PCS_RPC_UNCONN)
+ return;
+
+ if (ep->flags & PCS_RPC_F_LOCAL) {
+		/* TODO: the local path is temporarily disabled */
+ BUG_ON(1);
+ } else {
+ TRACE("Connecting to node " NODE_FMT "\n", NODE_ARGS(ep->peer_id));
+
+
+ BUG_ON(!ep->ops->connect);
+ ep->ops->connect(ep);
+ }
+}
+
+/* Send a notification which does not require waiting for a response from the peer.
+ * It is also used internally as a "raw" submit.
+ */
+static void pcs_rpc_send(struct pcs_rpc * ep, struct pcs_msg * msg, bool requeue)
+{
+ struct pcs_sockio *sio = sio_from_ioconn(ep->conn);
+
+ BUG_ON(!mutex_is_locked(&ep->mutex));
+ BUG_ON(msg->rpc != (requeue ? ep: NULL));
+
+ TRACE("ENTER ep:%p state:%d msg:%p\n", ep, ep->state, msg);
+
+ if (!requeue) {
+ msg->rpc = pcs_rpc_get(ep);
+ if (msg->timeout) {
+ pcs_msg_add_calendar(msg, 1);
+ } else {
+ msg->kill_slot = RPC_MAX_CALENDAR;
+ INIT_HLIST_NODE(&msg->kill_link);
+ }
+ } else /* Requeued messages must be scheduled in calendar */
+ BUG_ON(msg->timeout && hlist_unhashed(&msg->kill_link));
+
+ if (ep->state == PCS_RPC_WORK) {
+ BUG_ON(ep->conn == NULL);
+ if (msg->size)
+ pcs_sock_sendmsg(sio, msg);
+ else {
+ pcs_msg_del_calendar(msg);
+ msg->done(msg);
+ }
+ return;
+ }
+
+ if (ep->state == PCS_RPC_ABORT || ep->state == PCS_RPC_DESTROY) {
+ pcs_set_rpc_error(&msg->error, PCS_ERR_NET_ABORT, ep);
+ pcs_msg_del_calendar(msg);
+ msg->done(msg);
+ return;
+ }
+
+ list_add_tail(&msg->list, &ep->state_queue);
+ msg->stage = PCS_MSG_STAGE_UNSENT;
+
+ if (ep->state == PCS_RPC_UNCONN)
+ pcs_rpc_connect(ep);
+}
+
+void pcs_rpc_kick_queue(struct pcs_rpc * ep)
+{
+ struct pcs_cluster_core *cc = cc_from_rpc(ep->eng);
+
+ queue_work_on(ep->cpu, cc->wq, &ep->work);
+}
+
+void pcs_rpc_queue(struct pcs_rpc * ep, struct pcs_msg * msg)
+{
+ int was_idle;
+
+ spin_lock(&ep->q_lock);
+ was_idle = list_empty(&ep->input_queue);
+ list_add_tail(&msg->list, &ep->input_queue);
+
+ /* Naive socket-to-cpu binding approach */
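+	/* The endpoint is re-bound to the CPU that queues the first message into
+	 * an empty input queue, at most once per PCS_RPC_CPU_SLICE, so the RPC
+	 * worker tends to run on the same CPU as the submitter.
+	 */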
+ if (time_is_before_jiffies(ep->cpu_stamp) && was_idle) {
+ ep->cpu_stamp = jiffies + PCS_RPC_CPU_SLICE;
+ ep->cpu = smp_processor_id();
+ }
+ spin_unlock(&ep->q_lock);
+
+ if (was_idle)
+ pcs_rpc_kick_queue(ep);
+}
+
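+/* The kill calendar is a timer-wheel-like structure: every message with a timeout
+ * is hashed into one of RPC_MAX_CALENDAR slots, and kill_arrow advances by one slot
+ * per run. Messages still sitting in the current slot are treated as having exceeded
+ * their deadline and are completed with a timeout error.
+ */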
+static void calendar_work(struct work_struct *w)
+{
+ struct pcs_rpc * ep = container_of(w, struct pcs_rpc, calendar_work.work);
+ int kill_slot = ep->kill_arrow & (RPC_MAX_CALENDAR - 1);
+ struct pcs_cluster_core *cc = cc_from_rpc(ep->eng);
+ struct hlist_head * bucket;
+ int i, count = 0;
+
+ mutex_lock(&ep->mutex);
+ bucket = &ep->kill_calendar[kill_slot];
+ while (!hlist_empty(bucket)) {
+ struct pcs_msg * msg = hlist_entry(bucket->first, struct pcs_msg, kill_link);
+ struct pcs_rpc_hdr * h = (struct pcs_rpc_hdr *)msg_inline_head(msg);
+
+ (void)h;
+ TRACE("killing msg to " PEER_FMT " type=%u xid=" XID_FMT " stage=%d tmo=%d exp=%ld rem=%ld\n",
+ PEER_ARGS(msg->rpc), h->type, XID_ARGS(h->xid),
+ msg->stage, msg->timeout,
+ (long)(msg->start_time + msg->timeout - jiffies),
+ (long)(msg->start_time + msg->rpc->params.response_timeout - jiffies));
+
+ pcs_msg_del_calendar(msg);
+ switch (msg->stage) {
+ case PCS_MSG_STAGE_SEND:
+ if (pcs_sock_cancel_msg(msg)) {
+				/* The message is under network IO right now. We cannot kill it
+				 * without destroying the whole connection, so we just reschedule
+				 * the kill. When the IO completes, the message will be killed
+				 * without even waiting for a response. But if the IO gets stuck,
+				 * we violate the deadline, alas; hopefully this is the only place
+				 * where that can happen.
+				 */
+ msg->kill_slot = (msg->kill_slot + 1 ) & (RPC_MAX_CALENDAR - 1);
+ pcs_msg_add_calendar(msg, 0);
+ continue;
+ }
+ break;
+ default:
+ list_del(&msg->list);
+ break;
+ }
+
+ if (msg->stage == PCS_MSG_STAGE_WAIT) {
+			/* Leave the rpc timer running. If it expires before any (late)
+			 * response is received, the rpc will be shut down.
+			 */
+ pcs_set_rpc_error(&msg->error, PCS_ERR_RESPONSE_TIMEOUT, msg->rpc);
+ } else {
+ msg->stage = PCS_MSG_STAGE_SENT;
+ pcs_set_rpc_error(&msg->error, PCS_ERR_WRITE_TIMEOUT, msg->rpc);
+ }
+ BUG_ON(!hlist_unhashed(&msg->kill_link));
+ msg->done(msg);
+ count++;
+ }
+ if (count)
+ printk("%s %d messages to "PEER_FMT" destroyed\n", __FUNCTION__,
+ count, PEER_ARGS(ep));
+
+ for (i=0; i < RPC_MAX_CALENDAR-1; i++) {
+ kill_slot = (ep->kill_arrow + i) & (RPC_MAX_CALENDAR - 1);
+
+ if (!hlist_empty(&ep->kill_calendar[kill_slot])) {
+ /* FIXME: suboptimal scheduling */
+ mod_delayed_work(cc->wq, &ep->calendar_work, HZ);
+ break;
+ }
+ }
+ ep->kill_arrow++;
+ mutex_unlock(&ep->mutex);
+}
+
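+/* Re-arm the per-endpoint timer from the deadlines of the oldest pending request
+ * (response_timeout) and the oldest queued write (send_timeout); cancel it when
+ * both queues are empty.
+ */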
+static void update_xmit_timeout(struct pcs_rpc *ep)
+{
+ struct pcs_sockio *sio = sio_from_ioconn(ep->conn);
+ struct pcs_cluster_core *cc = cc_from_rpc(ep->eng);
+ struct pcs_msg * msg;
+ unsigned long timeout = 0;
+ unsigned long tx;
+
+ BUG_ON(ep->state != PCS_RPC_WORK);
+
+ if (list_empty(&ep->pending_queue) && list_empty(&sio->write_queue)) {
+ if (timer_pending(&ep->timer_work.timer))
+ cancel_delayed_work(&ep->timer_work);
+ return;
+ }
+ if (!list_empty(&ep->pending_queue)) {
+ msg = list_first_entry(&ep->pending_queue, struct pcs_msg, list);
+
+ timeout = msg->start_time + ep->params.response_timeout;
+ }
+ if (!list_empty(&sio->write_queue)) {
+ msg = list_first_entry(&sio->write_queue, struct pcs_msg, list);
+ tx = msg->start_time + sio->send_timeout;
+ if (time_after(tx, timeout))
+ timeout = tx;
+ }
+ if (time_is_before_jiffies(timeout))
+ timeout = 0;
+ else
+ timeout -= jiffies;
+
+ mod_delayed_work(cc->wq, &ep->timer_work, timeout);
+
+}
+static void rpc_queue_work(struct work_struct *w)
+{
+ LIST_HEAD(input_q);
+ LIST_HEAD(complete_q);
+ LIST_HEAD(state_q);
+ struct pcs_rpc *ep = pcs_rpc_from_work(w);
+ int repeat;
+
+ pcs_rpc_get(ep);
+again:
+ spin_lock(&ep->q_lock);
+ list_splice_tail_init(&ep->input_queue, &input_q);
+ spin_unlock(&ep->q_lock);
+
+ mutex_lock(&ep->mutex);
+
+ TRACE("Handle queues\n");
+
+ /* Process messages which are already in the sock queue */
+ if (ep->state == PCS_RPC_WORK) {
+ struct pcs_sockio *sio = sio_from_ioconn(ep->conn);
+
+ pcs_sockio_xmit(sio);
+ }
+
+ /* Process delayed ones */
+ while (!list_empty(&input_q)) {
+ struct pcs_msg * msg = list_first_entry(&input_q, struct pcs_msg, list);
+
+ list_del_init(&msg->list);
+ pcs_rpc_send(ep, msg, 0);
+ }
+ list_splice_tail_init(&ep->state_queue, &state_q);
+ while (!list_empty(&state_q)) {
+ struct pcs_msg * msg = list_first_entry(&state_q, struct pcs_msg, list);
+
+		/* The original code allowed msg->rpc to come from an alien RPC. That is a
+		   very strange assumption; it seems impossible and would break our locking. */
+ BUG_ON(msg->rpc != ep);
+
+ list_del_init(&msg->list);
+ pcs_rpc_send(ep, msg, 1);
+ }
+ repeat = 0;
+ if (ep->state == PCS_RPC_WORK) {
+ struct pcs_sockio *sio = sio_from_ioconn(ep->conn);
+
+ if (pcs_sockio_delayed_seg(sio))
+ repeat = 1;
+ update_xmit_timeout(ep);
+ }
+ mutex_unlock(&ep->mutex);
+ if (repeat)
+ goto again;
+ pcs_rpc_put(ep);
+
+}
+
+struct pcs_rpc * pcs_rpc_alloc_ep(void)
+{
+ return kzalloc(sizeof(struct pcs_rpc), GFP_NOIO);
+}
+
+void pcs_rpc_configure_new_ep(struct pcs_rpc * ep, struct pcs_rpc_params *parm, struct pcs_rpc_ops * ops)
+{
+ int i;
+
+ ep->params = *parm;
+ ep->ops = ops;
+ ep->kill_arrow = 0;
+
+ INIT_WORK(&ep->work, rpc_queue_work);
+ INIT_DELAYED_WORK(&ep->timer_work, timer_work);
+ INIT_DELAYED_WORK(&ep->calendar_work, calendar_work);
+
+ for (i = 0; i < RPC_MAX_CALENDAR; i++)
+ INIT_HLIST_HEAD(&ep->kill_calendar[i]);
+}
+
+struct pcs_rpc * pcs_rpc_create(struct pcs_rpc_engine * eng, struct pcs_rpc_params *parm, struct pcs_rpc_ops * ops)
+{
+ struct pcs_rpc * ep = pcs_rpc_alloc_ep();
+ pcs_rpc_attach_new_ep(ep, eng);
+ pcs_rpc_configure_new_ep(ep, parm, ops);
+ return ep;
+}
+
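+/* Called under ep->mutex once the request has been written out: put it on the
+ * pending queue to wait for the response, arm the response timer if it is not
+ * already running, and add messages with an explicit timeout to the kill calendar.
+ */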
+void pcs_rpc_sent(struct pcs_msg * msg)
+{
+ struct pcs_rpc * ep = msg->rpc;
+
+ BUG_ON(!mutex_is_locked(&ep->mutex));
+
+ msg->start_time = jiffies;
+ list_add_tail(&msg->list, &ep->pending_queue);
+ msg->stage = PCS_MSG_STAGE_WAIT;
+
+ if (!timer_pending(&ep->timer_work.timer)) {
+ struct pcs_cluster_core *cc = cc_from_rpc(ep->eng);
+
+ mod_delayed_work(cc->wq, &ep->timer_work, ep->params.response_timeout);
+ }
+
+ if (msg->timeout) {
+ BUG_ON(msg->kill_slot >= RPC_MAX_CALENDAR);
+
+ pcs_msg_add_calendar(msg, 0);
+ } else
+ INIT_HLIST_NODE(&msg->kill_link);
+}
+
+static void rpc_call_sent_cb(struct pcs_msg * clone)
+{
+ struct pcs_msg * msg = clone->private;
+ struct pcs_rpc * ep = clone->rpc;
+
+ BUG_ON(!mutex_is_locked(&ep->mutex));
+
+ /* Inherit kill slot */
+ msg->kill_slot = clone->kill_slot;
+
+ ///// TODO: dmonakhov@ optimize states
+ if (pcs_if_error(&clone->error)) {
+ switch (ep->state) {
+ case PCS_RPC_UNCONN:
+ case PCS_RPC_HOLDDOWN:
+ case PCS_RPC_CONNECT:
+ case PCS_RPC_AUTH:
+ case PCS_RPC_AUTHWAIT:
+ if (clone->timeout ||
+ clone->error.value == PCS_ERR_WRITE_TIMEOUT ||
+ clone->error.value == PCS_ERR_RESPONSE_TIMEOUT)
+ break;
+
+ pcs_clear_error(&clone->error);
+ list_add_tail(&clone->list, &ep->state_queue);
+ if (ep->state == PCS_RPC_UNCONN)
+ pcs_rpc_connect(ep);
+ return;
+ }
+
+ pcs_copy_error(&msg->error, &clone->error);
+ msg->done(msg);
+ pcs_free_msg(clone);
+ return;
+ }
+
+	/*
+	 * TODO: we should perform periodic rpc health checks the way userspace
+	 * does via rpc_trace_health.
+	 */
+ pcs_rpc_sent(msg);
+ pcs_free_msg(clone);
+}
+
+/* "User-friendly" send. It is not quite optimal (it uses a redundant clone), but
+ * appropriate for most simple rpc calls.
+ */
+
+static void rpc_msg_output_destructor(struct pcs_msg * msg)
+{
+ if (msg->rpc)
+ pcs_rpc_put(msg->rpc);
+ memset(msg, 0xFF, sizeof(*msg));
+ kfree(msg);
+}
+
+struct pcs_msg * pcs_rpc_clone_msg(struct pcs_msg * msg)
+{
+ struct pcs_msg *cloned_msg = pcs_clone_msg(msg);
+
+ if (cloned_msg)
+ cloned_msg->destructor = rpc_msg_output_destructor;
+ return cloned_msg;
+}
+
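+/* The caller's msg is never handed to the socket directly: pcs_rpc_call() queues a
+ * clone, and only after the clone has been transmitted does rpc_call_sent_cb() move
+ * the original message to the pending queue (pcs_rpc_sent()) to wait for a response.
+ */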
+void pcs_rpc_call(struct pcs_rpc * ep, struct pcs_msg * msg)
+{
+ struct pcs_msg * clone;
+
+ BUG_ON(msg->rpc != NULL);
+ msg->rpc = pcs_rpc_get(ep);
+
+ clone = pcs_rpc_clone_msg(msg);
+ if (clone == NULL) {
+ pcs_set_local_error(&msg->error, PCS_ERR_NOMEM);
+ BUG_ON(!hlist_unhashed(&msg->kill_link));
+ msg->done(msg);
+ return;
+ }
+
+ pcs_clear_error(&clone->error);
+ clone->rpc = NULL;
+ clone->done = rpc_call_sent_cb;
+ clone->timeout = msg->timeout;
+
+ pcs_rpc_queue(ep, clone);
+}
+/* TODO: this place may not scale well; in fact, xids need to be unique only
+   per RPC, so it may be reasonable to make the generator per-cpu.
+*/
+void pcs_rpc_get_new_xid(struct pcs_rpc_engine *eng, PCS_XID_T *xid)
+{
+ xid->origin = eng->local_id;
+ /* Remember, xids should be unique per peer. The only reliable way to ensure this is
+ * to generate xids globally.
+ */
+ xid->val = atomic64_inc_return(&eng->xid_generator);
+}
+
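+/* Input message accounting: exceeding the hard limit rejects the allocation
+ * outright; the soft limit triggers once this endpoint's accounted bytes exceed
+ * its fair share of the total (msg_allocated / accounted_rpcs), with CN peers
+ * allowed only a third of that share.
+ */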
+static int rpc_check_memlimit(struct pcs_rpc * ep)
+{
+ struct pcs_rpc_engine * eng = ep->eng;
+
+ if ((ep->flags & PCS_RPC_F_ACCT) &&
+ eng->msg_allocated >= eng->mem_pressure_thresh) {
+		/* If congestion avoidance works, this should not happen.
+		 * However, if it does happen, we must do something.
+		 */
+ if (eng->msg_allocated > eng->mem_limit) {
+ pcs_log(LOG_ERR, "Hard memory limit exceeded");
+ return 1;
+ }
+ if (ep->peer_role == PCS_NODE_ROLE_CN) {
+			/* A CN contributes 3x (repl.norm) the memory pressure on the cluster */
+ if (3 * ep->accounted * eng->accounted_rpcs >= eng->msg_allocated) {
+ TRACE("Soft memory limit exceeded " PEER_FMT, PEER_ARGS(ep));
+ return 1;
+ }
+ } else {
+ if (ep->accounted * eng->accounted_rpcs >= eng->msg_allocated) {
+ TRACE("Soft memory limit exceeded " PEER_FMT, PEER_ARGS(ep));
+ return 1;
+ }
+ }
+ }
+ return 0;
+}
+
+void pcs_rpc_deaccount_msg(struct pcs_msg * msg)
+{
+ struct pcs_rpc * ep = msg->rpc;
+
+ msg->rpc = NULL;
+ ep->eng->msg_count--;
+
+ if (msg->accounted) {
+ ep->accounted -= msg->accounted;
+ ep->eng->msg_allocated -= msg->accounted;
+ if (ep->accounted == 0)
+ ep->eng->accounted_rpcs--;
+ msg->accounted = 0;
+ if (ep->state == PCS_RPC_WORK)
+ pcs_sock_unthrottle((struct pcs_sockio *)ep->conn);
+ }
+ pcs_rpc_put(ep);
+}
+
+static void pcs_rpc_account_msg(struct pcs_rpc * ep, struct pcs_msg * msg, int accounted)
+{
+ msg->accounted = 0;
+ msg->rpc = pcs_rpc_get(ep);
+
+ ep->eng->msg_count++;
+
+ if (ep->flags & PCS_RPC_F_ACCT) {
+ msg->accounted = accounted;
+
+ if (ep->accounted == 0)
+ ep->eng->accounted_rpcs++;
+
+ ep->eng->msg_allocated += accounted;
+ ep->accounted += accounted;
+ }
+}
+
+void pcs_rpc_account_adjust(struct pcs_msg * msg, int adjustment)
+{
+ if (msg->accounted && (msg->rpc->flags & PCS_RPC_F_ACCT)) {
+ struct pcs_rpc * ep = msg->rpc;
+
+ msg->accounted += adjustment;
+ ep->eng->msg_allocated += adjustment;
+ ep->accounted += adjustment;
+ }
+}
+
+static void pcs_rpc_input_destructor(struct pcs_msg * msg)
+{
+ pcs_rpc_deaccount_msg(msg);
+ kfree(msg);
+}
+
+/* get_iter() handler for messages with embedded payload right after pcs_msg */
+void pcs_rpc_get_iter_inline(struct pcs_msg * msg, int offset, struct iov_iter *it)
+{
+ BUG_ON(offset >= msg->size);
+
+ iov_iter_init_plain(it, msg->_inline_buffer, msg->size, 0);
+ iov_iter_advance(it, offset);
+}
+
+void pcs_rpc_init_input_msg(struct pcs_rpc * ep, struct pcs_msg * msg, int account)
+{
+ pcs_msg_io_init(msg);
+ msg->timeout = 0;
+ INIT_HLIST_NODE(&msg->kill_link);
+ pcs_rpc_account_msg(ep, msg, account);
+ msg->destructor = pcs_rpc_input_destructor;
+}
+
+struct pcs_msg * pcs_rpc_alloc_input_msg(struct pcs_rpc * ep, int datalen)
+{
+ struct pcs_msg * msg;
+
+ if (rpc_check_memlimit(ep))
+ return NULL;
+
+ msg = kzalloc(sizeof(struct pcs_msg) + datalen, GFP_NOIO);
+ if (msg) {
+ pcs_rpc_init_input_msg(ep, msg, sizeof(struct pcs_msg) + datalen);
+ msg->size = datalen;
+ msg->get_iter = pcs_rpc_get_iter_inline;
+ }
+ return msg;
+}
+
+
+static void pcs_msg_output_destructor(struct pcs_msg * msg)
+{
+ if (msg->rpc)
+ pcs_rpc_put(msg->rpc);
+ kfree(msg);
+}
+
+void pcs_rpc_init_output_msg(struct pcs_msg * msg)
+{
+ pcs_msg_io_init(msg);
+ pcs_clear_error(&msg->error);
+ msg->timeout = 0;
+ msg->rpc = NULL;
+ INIT_HLIST_NODE(&msg->kill_link);
+ msg->destructor = pcs_msg_output_destructor;
+}
+
+struct pcs_msg * pcs_rpc_alloc_output_msg(int datalen)
+{
+ struct pcs_msg * msg;
+
+ msg = kzalloc(sizeof(struct pcs_msg) + datalen, GFP_NOIO);
+ if (msg) {
+ pcs_rpc_init_output_msg(msg);
+ msg->size = datalen;
+ msg->get_iter = pcs_rpc_get_iter_inline;
+ }
+ return msg;
+}
+
+void pcs_rpc_init_response(struct pcs_msg * msg, struct pcs_rpc_hdr * req_hdr, int size)
+{
+ struct pcs_rpc_hdr * h;
+
+ h = (struct pcs_rpc_hdr *)msg->_inline_buffer;
+ h->len = size;
+ h->type = req_hdr->type | PCS_RPC_DIRECTION;
+ h->xid = req_hdr->xid;
+}
+
+struct pcs_msg * pcs_alloc_response(struct pcs_rpc_hdr * req_hdr, int size)
+{
+ struct pcs_msg * msg;
+
+ msg = pcs_rpc_alloc_output_msg(size);
+ if (msg == NULL)
+ return NULL;
+
+ pcs_rpc_init_response(msg, req_hdr, size);
+
+ return msg;
+}
+
+void pcs_rpc_set_peer_id(struct pcs_rpc * ep, PCS_NODE_ID_T * id, u8 role)
+{
+ BUG_ON(ep->flags & (PCS_RPC_F_PEER_ID|PCS_RPC_F_HASHED));
+ ep->peer_role = role;
+ memcpy(&ep->peer_id, id, sizeof(PCS_NODE_ID_T));
+ ep->flags |= PCS_RPC_F_CLNT_PEER_ID;
+}
+
+int pcs_rpc_set_address(struct pcs_rpc * ep, PCS_NET_ADDR_T * addr)
+{
+ BUG_ON(ep->state != PCS_RPC_UNCONN);
+
+ ep->addr = *addr;
+ return 0;
+}
+
+/* Reset rpc engine, move it to unconnected state ready for further connects. */
+void pcs_rpc_reset(struct pcs_rpc * ep)
+{
+ rpc_abort(ep, 1, PCS_ERR_NET_ABORT);
+ ep->retries = 0;
+ if (ep->state == PCS_RPC_ABORT)
+ ep->state = PCS_RPC_UNCONN;
+}
+
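+/* Per-endpoint timer: in HOLDDOWN it re-initiates the connect; in WORK it means
+ * the oldest pending request or queued write has exceeded its deadline, so the
+ * connection is aborted.
+ */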
+static void timer_work(struct work_struct *w)
+{
+ struct pcs_rpc * ep = container_of(w, struct pcs_rpc, timer_work.work);
+
+ mutex_lock(&ep->mutex);
+ switch (ep->state) {
+ case PCS_RPC_HOLDDOWN:
+ ep->state = PCS_RPC_UNCONN;
+ pcs_rpc_connect(ep);
+ break;
+
+ case PCS_RPC_WORK: {
+ int err = list_empty(&ep->pending_queue) ? PCS_ERR_RESPONSE_TIMEOUT : PCS_ERR_WRITE_TIMEOUT;
+
+ TRACE("rpc timer expired, killing connection to " PEER_FMT ", %d",
+ PEER_ARGS(ep), err);
+ rpc_abort(ep, 0, err);
+ break;
+ }
+ /* TODO CLEAN unused states */
+ case PCS_RPC_AUTHWAIT:
+ case PCS_RPC_AUTH:
+ case PCS_RPC_CONNECT:
+ BUG_ON(1);
+ break;
+ }
+ mutex_unlock(&ep->mutex);
+}
+
+static void connstat_work(struct work_struct *w)
+{
+ struct pcs_rpc_engine * eng = container_of(w, struct pcs_rpc_engine, stat_work.work);
+ struct pcs_cluster_core *cc = cc_from_rpc(eng);
+
+ pcs_log(LOG_INFO, "TODO send connstat-s\n");
+ (void)eng;
+ /* account_connstat(eng); */
+ mod_delayed_work(cc->wq, &eng->stat_work, PCS_MSG_MAX_CALENDAR * HZ);
+}
+
+
+void pcs_rpc_engine_init(struct pcs_rpc_engine * eng, u8 role)
+{
+ int i;
+ memset(eng, 0, sizeof(*eng));
+ eng->role = role;
+ for (i = 0; i < RPC_GC_MAX_CLASS; i++)
+ list_lru_init(&eng->gc[i].lru);
+
+ INIT_DELAYED_WORK(&eng->stat_work, connstat_work);
+
+}
+
+void pcs_rpc_engine_fini(struct pcs_rpc_engine * eng)
+{
+ unsigned int i;
+
+ for (i = 0; i < PCS_RPC_HASH_SIZE; i++) {
+ while (!hlist_empty(&eng->ht[i])) {
+ struct pcs_rpc * ep = hlist_entry(eng->ht[i].first, struct pcs_rpc, link);
+
+ pcs_rpc_close(ep);
+ }
+ }
+
+ while (!hlist_empty(&eng->unhashed)) {
+ struct pcs_rpc * ep = hlist_entry(eng->unhashed.first, struct pcs_rpc, link);
+
+ pcs_rpc_close(ep);
+ }
+
+ for (i = 0; i < RPC_GC_MAX_CLASS; i++) {
+ BUG_ON(list_lru_count(&eng->gc[i].lru));
+ list_lru_destroy(&eng->gc[i].lru);
+ }
+}
+
+void pcs_rpc_set_host_id(struct pcs_rpc_engine *eng, PCS_NODE_ID_T *host_id)
+{
+ eng->my_host.host_id.val = host_id->val;
+ eng->flags |= PCS_KNOWN_HOSTID;
+}
+
+void pcs_rpc_set_cluster_id(struct pcs_rpc_engine * eng, PCS_CLUSTER_ID_T * id)
+{
+ memcpy(&eng->cluster_id, id, sizeof(*id));
+ eng->flags |= PCS_KNOWN_CLUSTERID;
+}
+
+void pcs_rpc_set_location(struct pcs_rpc_engine * eng, struct pcs_location * loc)
+{
+ memcpy(&eng->my_host.location, loc, sizeof(*loc));
+}
+
+static int rpc_gc_classify(struct pcs_rpc * ep)
+{
+ BUG_ON(ep->eng->role != PCS_NODE_ROLE_TOOL);
+
+ return 0;
+}
+
+void pcs_rpc_init_gc(struct pcs_rpc_engine * eng, unsigned int limit)
+{
+ eng->max_connections = limit;
+
+ switch (eng->role) {
+ case PCS_NODE_ROLE_MDS:
+ eng->max_gc_index = 3;
+ break;
+ case PCS_NODE_ROLE_CS:
+ eng->max_gc_index = 4;
+ break;
+ case PCS_NODE_ROLE_CN:
+ eng->max_gc_index = 2;
+ break;
+ default:
+ eng->max_gc_index = 1;
+ }
+}
+
+
+void pcs_rpc_set_memlimits(struct pcs_rpc_engine * eng, u64 thresh, u64 limit)
+{
+ eng->mem_pressure_thresh = thresh;
+ eng->mem_limit = limit;
+}
+
+void rpc_connect_done(struct pcs_rpc *ep, struct socket *sock)
+{
+ struct pcs_sockio * sio;
+
+ mutex_lock(&ep->mutex);
+
+ TRACE(PEER_FMT " ->state:%d sock:%p\n", PEER_ARGS(ep), ep->state, sock);
+ cancel_delayed_work(&ep->timer_work);
+ ep->retries++;
+
+ if (ep->state != PCS_RPC_CONNECT) {
+ pcs_log(LOG_ERR, "Invalid state: %u", ep->state);
+ BUG();
+ }
+
+ sio = pcs_sockio_init(sock, ep->params.alloc_hdr_size,
+ sizeof(struct pcs_rpc_hdr));
+ if (sio == NULL)
+ BUG();
+
+ ep->conn = &sio->ioconn;
+ sio->parent = ep;
+ sio->get_msg = rpc_get_hdr;
+ sio->eof = rpc_eof_cb;
+ //pcs_ioconn_register(ep->conn);
+ ep->retries = 0;
+ if (ep->gc)
+ list_lru_add(&ep->gc->lru, &ep->lru_link);
+
+ if (ep->flags & PCS_RPC_F_CLNT_PEER_ID)
+ ep->flags |= PCS_RPC_F_PEER_ID;
+ ep->state = PCS_RPC_APPWAIT;
+ pcs_rpc_enable(ep, 0);
+ mutex_unlock(&ep->mutex);
+
+}
diff --git a/fs/fuse/kio/pcs/pcs_rpc.h b/fs/fuse/kio/pcs/pcs_rpc.h
new file mode 100644
index 000000000000..264657328c53
--- /dev/null
+++ b/fs/fuse/kio/pcs/pcs_rpc.h
@@ -0,0 +1,290 @@
+#ifndef _PCS_RPC_H_
+#define _PCS_RPC_H_ 1
+
+//#include "pcs_defs.h"
+#include "pcs_rpc_prot.h"
+#include "pcs_sock_io.h"
+
+struct pcs_msg;
+
+#define PCS_RPC_HASH_SIZE 1024
+
+enum
+{
+ PCS_RPC_UNCONN = 0, /* Not connected */
+ PCS_RPC_CONNECT = 1, /* Connect in progress */
+ PCS_RPC_AUTH = 2, /* Connected. Auth request sent. */
+ PCS_RPC_AUTHWAIT= 3, /* Accepted. Waiting for auth request from peer. */
+ PCS_RPC_APPWAIT = 4, /* Auth complete, client is notified */
+ PCS_RPC_WORK = 5, /* Established */
+ PCS_RPC_HOLDDOWN = 6, /* Not connected. Connect must not be reinitiated. */
+ PCS_RPC_ABORT = 7, /* Aborted. Not reconnected automatically. */
+ PCS_RPC_DESTROY = 8 /* Destruction in progress */
+};
+
+struct pcs_rpc_params
+{
+ unsigned int alloc_hdr_size;
+ unsigned int max_msg_size;
+
+ unsigned int connect_timeout;
+ unsigned int holddown_timeout;
+ unsigned int response_timeout;
+
+ unsigned int max_conn_retry;
+
+ unsigned int flags;
+};
+
+#define MAX_BUILD_VERSION_LENGTH 30
+
+#define RPC_GC_MAX_CLASS 4
+
+struct rpc_gc_class
+{
+ struct list_lru lru;
+};
+
+
+/* from: cluster_id.h */
+typedef union __pre_aligned(8) _PCS_CLUSTER_ID_T {
+ unsigned char uuid[16]; /* For now it is opaque string */
+ u64 val[2];
+} PCS_CLUSTER_ID_T __aligned(8);
+
+#define PCS_CLUSTER_ID_VALID(clid) ((clid).val[0] || (clid).val[1])
+/////////////////////////////
+
+#define PCS_RPC_CPU_SLICE (100 * HZ / 1000) /* 100ms */
+struct pcs_rpc
+{
+ struct hlist_node link; /* Link in hash table */
+ struct list_head lru_link; /* Link in LRU */
+ struct rpc_gc_class *gc;
+ struct pcs_rpc_engine *eng; /* Reference to eng, where this peer is assigned to */
+
+ void *parent;
+
+ unsigned int state;
+ unsigned int flags;
+#define PCS_RPC_F_HASHED 1
+#define PCS_RPC_F_PASSIVE 2
+#define PCS_RPC_F_PEER_ID 4
+#define PCS_RPC_F_NO_RETRY 8
+#define PCS_RPC_F_DEAD 0x10
+#define PCS_RPC_F_LISTEN 0x20
+#define PCS_RPC_F_ACQ_ID 0x40
+#define PCS_RPC_F_PEER_VERIFIED 0x80
+#define PCS_RPC_F_CLNT_PEER_ID 0x100 /* peer id set by pcs_rpc_set_peer_id */
+#define PCS_RPC_F_ACCT 0x200
+#define PCS_RPC_F_LOCAL 0x400 /* local AF_UNIX connection */
+#define PCS_RPC_F_PEER_AUTHORIZED 0x800 /* peer authorized by secure method */
+#define PCS_RPC_F_LOCALAUTH 0x1000 /* skip authentication, it is provided by the transport */
+
+ struct pcs_rpc_params params;
+
+ atomic_t refcnt;
+ int retries;
+ PCS_NODE_ID_T peer_id;
+ u8 peer_role;
+ unsigned int peer_flags;
+ u32 peer_version;
+ struct pcs_host_info peer_host;
+ char peer_build_version[MAX_BUILD_VERSION_LENGTH+1];
+ struct work_struct work;
+ struct delayed_work timer_work;
+ PCS_NET_ADDR_T addr;
+/* TODO: Re-enable local sockets */
+#if 0
+ struct sockaddr_un * sun;
+#endif
+ struct pcs_ioconn * conn; /* Active connection for the peer */
+
+ struct pcs_rpc_ops * ops;
+
+ struct list_head pending_queue; /* Queue of requests sent to the peer */
+ struct list_head state_queue; /* Queue of requests waiting for proper peer state */
+
+ spinlock_t q_lock; /* Protects queues lists below*/
+ struct list_head input_queue; /* Queue of requests waiting to be handled */
+ int cpu;
+ unsigned long cpu_stamp;
+
+ struct mutex mutex;
+ u64 accounted;
+ u32 netlat_min;
+ u32 netlat_max;
+ atomic_t netlat_cnt;
+ atomic64_t netlat_avg;
+
+ struct delayed_work calendar_work;
+ unsigned kill_arrow;
+#define RPC_MAX_CALENDAR PCS_MSG_MAX_CALENDAR
+ struct hlist_head kill_calendar[RPC_MAX_CALENDAR];
+
+ void * private;
+
+ void * private2;
+};
+
+struct pcs_rpc_engine
+{
+ struct hlist_head ht[PCS_RPC_HASH_SIZE];
+ struct hlist_head unhashed;
+ unsigned int nrpcs;
+
+ PCS_CLUSTER_ID_T cluster_id;
+ PCS_NODE_ID_T local_id;
+ unsigned int flags;
+#define PCS_KNOWN_MYID 1
+#define PCS_KNOWN_CLUSTERID 2
+#define PCS_KNOWN_HOSTID 4
+ u8 role;
+ struct pcs_host_info my_host;
+
+ atomic64_t xid_generator; /* Current XID */
+ int msg_count;
+ int accounted_rpcs;
+ u64 msg_allocated;
+
+ u64 mem_pressure_thresh;
+ u64 mem_limit;
+
+ int local_sndbuf;
+ int tcp_sndbuf;
+ int tcp_rcvbuf;
+ struct delayed_work stat_work;
+ int max_connections;
+ int max_gc_index;
+ struct rpc_gc_class gc[RPC_GC_MAX_CLASS];
+
+};
+
+struct pcs_rpc_ops
+{
+ /* Called on each incoming request to process msg */
+ int (*demux_request)(struct pcs_rpc *, struct pcs_msg * msg);
+
+ /* Called on receiving response before done callback */
+ void (*hook_response)(struct pcs_rpc *, struct pcs_msg * msg);
+
+ /* Called after rpc header is received to allocate msg */
+ struct pcs_msg * (*get_hdr)(struct pcs_rpc *, struct pcs_rpc_hdr * h);
+
+ /* Called when rpc enters ABORT state due to peer abort */
+ void (*state_change)(struct pcs_rpc *, int error);
+
+ void (*connect)(struct pcs_rpc *);
+
+ /* Incoming connection was aborted */
+ void (*client_aborted)(struct pcs_rpc *ep, int error);
+
+ /* Called when peer asks to keep waiting on a request */
+ void (*keep_waiting)(struct pcs_rpc *, struct pcs_msg * req, struct pcs_msg * msg);
+
+ /* Submit connection statistics */
+ void (*send_stats)(struct pcs_rpc_engine *, struct pcs_msg * msg);
+};
+
+
+static inline struct pcs_rpc * pcs_rpc_get(struct pcs_rpc * p)
+{
+ BUG_ON(atomic_read(&p->refcnt) <=0);
+ atomic_inc(&p->refcnt);
+ return p;
+}
+
+void pcs_rpc_destroy(struct pcs_rpc * p);
+
+static inline void pcs_rpc_put(struct pcs_rpc * p)
+{
+ BUG_ON(atomic_read(&p->refcnt) <=0);
+ if (atomic_dec_and_test(&p->refcnt))
+ pcs_rpc_destroy(p);
+}
+
+/* Function provided by rpc engine */
+void pcs_rpc_engine_init(struct pcs_rpc_engine * eng, u8 role);
+void pcs_rpc_engine_fini(struct pcs_rpc_engine * eng);
+void pcs_rpc_init_gc(struct pcs_rpc_engine * eng, unsigned int limit);
+void pcs_rpc_get_new_xid(struct pcs_rpc_engine *eng, PCS_XID_T *xid);
+
+void pcs_rpc_set_cluster_id(struct pcs_rpc_engine * eng, PCS_CLUSTER_ID_T * id);
+void pcs_rpc_set_host_id(struct pcs_rpc_engine *eng, PCS_NODE_ID_T *host_id);
+
+/* Main set of functions */
+struct pcs_rpc * pcs_rpc_alloc_ep(void);
+void pcs_rpc_attach_new_ep(struct pcs_rpc * ep, struct pcs_rpc_engine * eng);
+void pcs_rpc_configure_new_ep(struct pcs_rpc * ep, struct pcs_rpc_params *parm,
+ struct pcs_rpc_ops * ops);
+/* All 3 above in one call */
+struct pcs_rpc * pcs_rpc_create(struct pcs_rpc_engine * eng, struct pcs_rpc_params *parm,
+ struct pcs_rpc_ops * ops);
+void pcs_rpc_close(struct pcs_rpc * ep);
+void pcs_rpc_reset(struct pcs_rpc * ep);
+
+int pcs_rpc_listen_ext(struct pcs_rpc * ep, PCS_NET_ADDR_T * addr, int flags);
+static inline int pcs_rpc_listen(struct pcs_rpc * ep, PCS_NET_ADDR_T * addr)
+{
+ return pcs_rpc_listen_ext(ep, addr, 0);
+}
+
+int pcs_rpc_listen_local(struct pcs_rpc * ep, const char *path, int noauth);
+void pcs_rpc_queue(struct pcs_rpc * ep, struct pcs_msg * msg);
+void pcs_rpc_kick_queue(struct pcs_rpc * ep);
+void pcs_rpc_respond(struct pcs_rpc * ep, struct pcs_msg * msg);
+void pcs_rpc_call(struct pcs_rpc * ep, struct pcs_msg * msg);
+void pcs_rpc_connect(struct pcs_rpc * ep);
+void pcs_rpc_cancel_request(struct pcs_msg * msg);
+void pcs_msg_del_calendar(struct pcs_msg * msg);
+
+/* Setting/getting parameters */
+void pcs_rpc_set_peer_id(struct pcs_rpc * ep, PCS_NODE_ID_T * id, u8 role);
+int pcs_rpc_set_address(struct pcs_rpc * ep, PCS_NET_ADDR_T * addr);
+
+int pcs_rpc_set_local(struct pcs_rpc * ep, const char *path, int noauth);
+int pcs_rpc_get_local_addr(struct pcs_rpc * ep, PCS_NET_ADDR_T * addr);
+
+/* Service functions, which are supposed to be used from callbacks */
+void pcs_rpc_sent(struct pcs_msg * msg);
+struct pcs_msg * pcs_rpc_lookup_xid(struct pcs_rpc * ep, PCS_XID_T * xid);
+void rpc_work_input(struct pcs_msg * msg);
+
+void pcs_rpc_error_respond(struct pcs_rpc * ep, struct pcs_msg * msg, int err);
+void rpc_abort(struct pcs_rpc * ep, int fatal, int error);
+/* Message allocation/initialization */
+struct pcs_msg * pcs_alloc_response(struct pcs_rpc_hdr * req_hdr, int size);
+struct pcs_msg * pcs_alloc_aligned_response(struct pcs_rpc_hdr * req_hdr, int size, int hdrlen);
+struct pcs_msg * pcs_rpc_alloc_error_response(struct pcs_rpc * ep, struct pcs_rpc_hdr * req_hdr, int err, int size);
+struct pcs_msg * pcs_rpc_alloc_input_msg(struct pcs_rpc * ep, int datalen);
+struct pcs_msg * pcs_rpc_alloc_aligned_msg(struct pcs_rpc * ep, int datalen, int hdrlen);
+struct pcs_msg * pcs_rpc_alloc_output_msg(int datalen);
+struct pcs_msg * pcs_rpc_clone_msg(struct pcs_msg * msg);
+void pcs_rpc_deaccount_msg(struct pcs_msg * msg);
+void pcs_rpc_init_input_msg(struct pcs_rpc * ep, struct pcs_msg * msg, int account);
+void pcs_rpc_init_output_msg(struct pcs_msg * msg);
+void pcs_rpc_init_response(struct pcs_msg * msg, struct pcs_rpc_hdr * req_hdr, int size);
+
+/* Allocate message and initialize header */
+struct pcs_msg * pcs_rpc_alloc_msg_w_hdr(int type, int size);
+
+void pcs_rpc_set_memlimits(struct pcs_rpc_engine * eng, u64 thresh, u64 limit);
+void pcs_rpc_account_adjust(struct pcs_msg * msg, int adjustment);
+
+struct pcs_perf_counter;
+void perfcnt_collect_rpc(char ** ptr, int * max_size, struct pcs_rpc_engine const*);
+
+int pcs_is_zero_cluster_id(PCS_CLUSTER_ID_T *id);
+int pcs_cluster_id_eq(PCS_CLUSTER_ID_T *id1, PCS_CLUSTER_ID_T *id2);
+
+void rpc_trace_health(struct pcs_rpc * ep);
+void pcs_rpc_enumerate_rpc(struct pcs_rpc_engine *eng, void (*cb)(struct pcs_rpc *ep, void *arg), void *arg);
+void pcs_rpc_set_sock(struct pcs_rpc *ep, struct pcs_sockio * sio);
+void rpc_connect_done(struct pcs_rpc *ep, struct socket *sock);
+
+static inline struct pcs_rpc *pcs_rpc_from_work(struct work_struct *wr)
+{
+ return container_of(wr, struct pcs_rpc, work);
+}
+
+#endif /* _PCS_RPC_H_ */
diff --git a/fs/fuse/kio/pcs/pcs_rpc_prot.h b/fs/fuse/kio/pcs/pcs_rpc_prot.h
new file mode 100644
index 000000000000..594670e9ead6
--- /dev/null
+++ b/fs/fuse/kio/pcs/pcs_rpc_prot.h
@@ -0,0 +1,97 @@
+#ifndef _PCS_NET_PROT_H_
+#define _PCS_NET_PROT_H_ 1
+
+#include "pcs_prot_types.h"
+
+/* Current version of the protocol. We promise to support all the messages forever,
+ * so that no version checks are required. However, we must not send new messages
+ * to old peers; that is where this version is required.
+ */
+#define PCS_VERSION_CURRENT 1U
+
+struct pcs_rpc_hdr
+{
+ u32 len;
+ u32 type;
+ PCS_XID_T xid;
+} __attribute__((aligned(8)));
+
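+/* The low bit of the message type encodes the direction: a response carries the
+ * request type with PCS_RPC_DIRECTION set (see pcs_rpc_init_response()).
+ */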
+#define PCS_RPC_DIRECTION 1
+
+#define RPC_IS_RESPONSE(type) (type & PCS_RPC_DIRECTION)
+
+
+#define PCS_RPC_ERROR_RESP 1
+
+struct pcs_rpc_payload
+{
+ u32 len;
+ u32 type;
+ /* Variable size data follows */
+} __attribute__((aligned(8)));
+
+
+struct pcs_rpc_error_resp
+{
+ struct pcs_rpc_hdr hdr;
+ PCS_NODE_ID_T offender;
+ u32 code;
+ u32 npayloads;
+ struct pcs_rpc_payload payload;
+} __attribute__((aligned(8)));
+
+
+#define PCS_RPC_CS_CLIENT_BASE 256
+#define PCS_RPC_MDS_CLIENT_BASE 512
+#define PCS_RPC_CS_CS_BASE 1024
+#define PCS_RPC_LOCAL_BASE 2048
+
+/* Payload types */
+#define PCS_RPC_EMPTY_PAYLOAD 0
+
+/* Authentication payload types */
+#define PCS_RPC_AUTH_TYPE_PAYLOAD 11
+#define PCS_RPC_SSL_PAYLOAD 12
+#define PCS_RPC_DIGEST_PAYLOAD 13
+#define PCS_RPC_AUTH_SIMPLE_PAYLOAD 14
+
+/* System payload types */
+#define PCS_RPC_SYS_PAYLOAD_BASE 128
+#define PCS_RPC_BUILD_VERSION_PAYLOAD PCS_RPC_SYS_PAYLOAD_BASE
+
+/* Application specific payload types */
+#define PCS_RPC_APP_PAYLOAD_BASE 512
+
+/* Node role */
+enum
+{
+ PCS_NODE_ROLE_TEST = 0, /* Can be used for diagnostics. Functionality is reduced. */
+ PCS_NODE_ROLE_CN = 1, /* Client */
+ PCS_NODE_ROLE_CS = 2, /* Chunk server */
+ PCS_NODE_ROLE_MDS = 3, /* Meta-data server */
+ PCS_NODE_ROLE_TOOL = 4, /* Similar to the client but not visible in stat */
+ PCS_NODE_ROLE_SVC = 5, /* Generic service */
+ PCS_NODE_ROLES_
+};
+
+static inline const char *pcs_role_to_str(u8 role)
+{
+ static const char *roles_str[PCS_NODE_ROLES_] = {
+ "TEST", "CN", "CS", "MDS", "TOOL", "SVC"
+ };
+
+	if (role >= PCS_NODE_ROLES_)
+ return "Unknown";
+ return roles_str[role];
+}
+
+struct pcs_rpc_keep_waiting
+{
+ struct pcs_rpc_hdr hdr;
+
+ PCS_XID_T xid; /* XID of request which should not timeout */
+} __attribute__((aligned(8)));
+
+#define PCS_RPC_KEEP_WAITING (12)
+
+#endif /* _PCS_RPC_PROT_H_ */
diff --git a/fs/fuse/kio/pcs/pcs_sock_io.c b/fs/fuse/kio/pcs/pcs_sock_io.c
new file mode 100644
index 000000000000..6936dede5b96
--- /dev/null
+++ b/fs/fuse/kio/pcs/pcs_sock_io.c
@@ -0,0 +1,702 @@
+#include <net/sock.h>
+#include <net/tcp.h>
+#include <linux/net.h>
+#include <linux/kthread.h>
+#include <linux/types.h>
+#include <linux/highmem.h>
+
+#include "pcs_types.h"
+#include "pcs_sock_io.h"
+#include "pcs_rpc.h"
+#include "log.h"
+
+
+static inline struct pcs_rpc * sock_to_rpc(struct sock *sk)
+{
+
+ return ((struct pcs_sockio *)sk->sk_user_data)->parent;
+}
+
+static void sio_msg_sent(struct pcs_msg * msg)
+{
+ msg->stage = PCS_MSG_STAGE_SENT;
+ if (msg->timeout) {
+ BUG_ON(msg->rpc == NULL);
+ BUG_ON(msg->kill_slot >= PCS_MSG_MAX_CALENDAR);
+ pcs_msg_del_calendar(msg);
+ }
+}
+
+void sio_push(struct pcs_sockio * sio)
+{
+ struct pcs_rpc *ep = sio->parent;
+
+ TRACE(PEER_FMT" flush \n", PEER_ARGS(ep));
+ if (sio->flags & PCS_SOCK_F_CORK) {
+ int optval = 1;
+ int ret;
+ ret = kernel_setsockopt(sio->ioconn.socket, SOL_TCP, TCP_NODELAY,
+ (char *)&optval, sizeof(optval));
+ if (ret)
+ TRACE("kernel_setsockopt(TCP_NODELAY) failed: %d", ret);
+
+ }
+}
+
+//// TODO: dmonakhov@ implement unregister and close;
+//// socket close must be synchronized with userspace. THINK:
+//// case A: userspace closes the socket and waits for kernelspace
+//// case B: kernelspace wants to close the socket and has to somehow
+//// notify userspace about this (NEW API REQUIRED)
+static void pcs_restore_sockets(struct pcs_ioconn *ioconn);
+void pcs_ioconn_unregister(struct pcs_ioconn *ioconn)
+{
+ if (!test_bit(PCS_IOCONN_BF_DEAD, &ioconn->flags)) {
+ set_bit(PCS_IOCONN_BF_DEAD, &ioconn->flags);
+ pcs_restore_sockets(ioconn);
+ }
+
+}
+
+void pcs_ioconn_close(struct pcs_ioconn *ioconn)
+{
+ kernel_sock_shutdown(ioconn->socket, SHUT_RDWR);
+}
+
+void sio_abort(struct pcs_sockio * sio, int error)
+{
+ if (sio->current_msg) {
+ pcs_free_msg(sio->current_msg);
+ sio->current_msg = NULL;
+ }
+
+ sio->flags &= ~(PCS_SOCK_F_POOLOUT|PCS_SOCK_F_POOLIN);
+ while (!list_empty(&sio->write_queue)) {
+ struct pcs_msg * msg = list_first_entry(&sio->write_queue, struct pcs_msg, list);
+ list_del(&msg->list);
+ sio->write_queue_len -= msg->size;
+ sio_msg_sent(msg);
+
+ pcs_set_local_error(&msg->error, error);
+ BUG_ON(!hlist_unhashed(&msg->kill_link));
+ msg->done(msg);
+ }
+ pcs_ioconn_unregister(&sio->ioconn);
+ pcs_ioconn_close(&sio->ioconn);
+ pcs_set_local_error(&sio->error, error);
+ if (sio->eof) {
+ void (*eof)(struct pcs_sockio *) = sio->eof;
+ sio->eof = NULL;
+ (*eof)(sio);
+ }
+}
+
+
+void pcs_sock_abort(struct pcs_sockio * sio)
+{
+ if (!sio)
+ return;
+
+ sio_abort(sio, PCS_ERR_NET_ABORT);
+}
+
+void pcs_sock_error(struct pcs_sockio * sio, int error)
+{
+ sio_abort(sio, error);
+}
+
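+/* Transmit one segment of the current iov_iter: when the iterator can hand out a
+ * page reference we use zero-copy kernel_sendpage(), otherwise we fall back to
+ * kernel_sendmsg() on a kmap()ed buffer. MSG_MORE is set while more of the
+ * message still follows.
+ */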
+static int do_send_one_seg(struct socket *sock, struct iov_iter *it, bool more)
+{
+ int ret;
+ size_t offset, len;
+ struct page *page;
+ int flags = (MSG_DONTWAIT | MSG_NOSIGNAL) | (more ? MSG_MORE : MSG_EOR);
+
+ DTRACE("sock(%p) len:%ld, more:%d\n", sock, iov_iter_count(it), more);
+
+ page = iov_iter_get_page(it, &offset, &len);
+ if (!page) {
+ /* No page, fallback to memcopy */
+ struct msghdr msg = { .msg_flags = flags};
+ struct page *page;
+ struct kvec vec;
+
+ page = iov_iter_kmap(it, &vec.iov_base, &vec.iov_len);
+ ret = kernel_sendmsg(sock, &msg, &vec, 1, vec.iov_len);
+ if (page)
+ kunmap(page);
+ } else {
+ /* Zerocopy */
+ ret = kernel_sendpage(sock, page, offset, len, flags);
+ put_page(page);
+ }
+
+ DTRACE("sock(%p) len:%ld, more:%d ret:%d\n", sock, iov_iter_count(it), more, ret);
+ return ret;
+}
+
+static int do_sock_recv(struct socket *sock, void *buf, size_t len)
+{
+
+ struct kvec iov = {buf, len};
+ struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
+ int ret;
+
+ ret = kernel_recvmsg(sock, &msg, &iov, 1, len, msg.msg_flags);
+
+ TRACE("RET: "PEER_FMT" len:%ld ret:%d\n", PEER_ARGS(sock_to_rpc(sock->sk)),
+ len, ret);
+ return ret;
+}
+
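+/* Receive state machine: first accumulate the fixed-size header in the sio inline
+ * buffer, then let ->get_msg() allocate the full message and stream the body
+ * through the iov_iter returned by msg->get_iter(). Each invocation is bounded by
+ * PCS_SIO_PREEMPT_LIMIT messages or PCS_SIO_SLICE jiffies.
+ */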
+static void pcs_sockio_recv(struct pcs_sockio *sio)
+{
+ struct pcs_ioconn* conn = &sio->ioconn;
+ struct iov_iter *it = &sio->read_iter;
+ struct pcs_rpc *ep = sio->parent;
+ int count = 0;
+ unsigned long loop_timeout = jiffies + PCS_SIO_SLICE;
+
+ (void)ep;
+ TRACE("ENTER:" PEER_FMT " sio:%p cur_msg:%p\n", PEER_ARGS(ep), sio, sio->current_msg);
+
+ while(!test_bit(PCS_IOCONN_BF_DEAD, &conn->flags)) {
+ int n;
+ struct pcs_msg * msg;
+
+ if (test_bit(PCS_IOCONN_BF_ERROR, &conn->flags)) {
+ sio_abort(sio, PCS_ERR_NET_ABORT);
+ return;
+ }
+ if (!sio->current_msg) {
+ /* New message */
+
+ int copy = (int)(sio->hdr_max - sio->hdr_ptr);
+
+ sio->read_offset = 0;
+ n = 0;
+
+ if (copy)
+ n = do_sock_recv(conn->socket, (char *)sio_inline_buffer(sio) + sio->hdr_ptr, copy);
+
+			if (n > 0 || n == copy /* recv returns 0 when copy is 0 */) {
+ sio->hdr_ptr += n;
+ if(sio->hdr_ptr != sio->hdr_max)
+ return;
+
+ msg = sio->get_msg(sio);
+ if (msg == NULL) {
+ if (sio->hdr_ptr < sio->hdr_max)
+ continue;
+ if (sio->flags & PCS_SOCK_F_THROTTLE)
+ continue;
+ sio_abort(sio, PCS_ERR_NOMEM);
+ return;
+ }
+ sio->read_offset = sio->hdr_ptr;
+ sio->hdr_ptr = 0;
+ sio->current_msg = msg;
+ msg->get_iter(msg, sio->read_offset, it);
+ TRACE(PEER_FMT" msg:%p read_off:%d iov_size:%ld\n", PEER_ARGS(ep), msg, sio->read_offset,
+ iov_iter_count(it));
+ } else {
+ if (n == -EAGAIN || n == 0)
+ return;
+
+ sio_abort(sio, PCS_ERR_NET_ABORT);
+ return;
+ }
+		} else { /* Continue receiving the message */
+ msg = sio->current_msg;
+
+ while (sio->read_offset < msg->size) {
+ void *buf;
+ size_t len;
+ struct page *page;
+
+ if (!iov_iter_count(it))
+ /* Current iter is exhausted, init new one */
+ msg->get_iter(msg, sio->read_offset, it);
+
+ TRACE(PEER_FMT" msg:%p->size:%d off:%d it_count:%ld\n",
+ PEER_ARGS(ep), msg, msg->size, sio->read_offset,
+ iov_iter_count(it));
+
+ BUG_ON(iov_iter_count(it) > msg->size - sio->read_offset);
+
+ page = iov_iter_kmap(it, &buf, &len);
+ if (len > msg->size - sio->read_offset)
+ len = msg->size - sio->read_offset;
+ n = do_sock_recv(conn->socket, buf, len);
+ if (page)
+ kunmap(page);
+
+ if (n > 0) {
+ sio->read_offset += n;
+ iov_iter_advance(it, n);
+ } else {
+ if (n == -EAGAIN || n == 0)
+ return;
+ sio_abort(sio, PCS_ERR_NET_ABORT);
+ return;
+ }
+ }
+ sio->current_msg = NULL;
+ iov_iter_init_bad(&sio->read_iter);
+ msg->done(msg);
+ if (++count >= PCS_SIO_PREEMPT_LIMIT ||
+ time_is_before_jiffies(loop_timeout)) {
+ sio->flags |= PCS_SOCK_F_POOLIN;
+ break;
+ }
+ }
+ }
+ if (count && !list_empty(&ep->lru_link) && ep->gc)
+ list_lru_add(&ep->gc->lru, &ep->lru_link);
+
+}
+
+static void pcs_sockio_send(struct pcs_sockio *sio)
+{
+ struct pcs_ioconn* conn = &sio->ioconn;
+ struct iov_iter *it = &sio->write_iter;
+ unsigned long loop_timeout = jiffies + PCS_SIO_SLICE;
+ struct pcs_msg * msg;
+ int done = 0;
+ int count = 0;
+ struct pcs_rpc *ep = sio->parent;
+ (void)ep;
+
+ while (!list_empty(&sio->write_queue)) {
+ msg = list_first_entry(&sio->write_queue, struct pcs_msg, list);
+
+ TRACE(PEER_FMT" sio(%p) offset:%d msg:%p\n", PEER_ARGS(ep), sio, sio->write_offset, msg);
+
+		/* This is the original check, but it is not clear how the connection can
+		   become dead before sio_abort() has been called. Let's simplify it with a BUG_ON:
+		   if (conn->dead) {
+			   pcs_set_local_error(&msg->error, PCS_ERR_NET_ABORT);
+			   goto done;
+		   }
+		*/
+ BUG_ON(test_bit(PCS_IOCONN_BF_DEAD, &conn->flags));
+
+ if (test_bit(PCS_IOCONN_BF_ERROR, &conn->flags)) {
+ sio_abort(sio, PCS_ERR_NET_ABORT);
+ return;
+ }
+
+ /* TODO: cond resched here? */
+ while (sio->write_offset < msg->size) {
+ size_t left = msg->size - sio->write_offset;
+ int n;
+
+ TRACE(PEER_FMT "offset:%d msg:%p left:%ld, it->len:%ld\n", PEER_ARGS(ep), sio->write_offset, msg,
+ left, iov_iter_count(it));
+
+ if (!iov_iter_count(it)) {
+ /* Current iter is exhausted, init new one */
+ msg->get_iter(msg, sio->write_offset, it);
+ }
+ BUG_ON(iov_iter_count(it) > left);
+ n = do_send_one_seg(conn->socket, it, iov_iter_single_seg_count(it) < left);
+ if (n > 0) {
+ sio->write_offset += n;
+ iov_iter_advance(it, n);
+ done = 1;
+ } else {
+ if (n == 0)
+ WARN_ON(1);
+
+				if (n == -EAGAIN) {
+					/* The socket buffer is full: abort only if the write
+					 * has timed out, otherwise wait for ->write_space()
+					 * to kick the queue again.
+					 */
+					unsigned long timeout = msg->start_time + sio->send_timeout;
+					if (time_is_before_jiffies(timeout))
+						sio_abort(sio, PCS_ERR_WRITE_TIMEOUT);
+					return;
+				}
+				sio_abort(sio, PCS_ERR_NET_ABORT);
+				return;
+ }
+ }
+ list_del_init(&msg->list);
+ sio->write_queue_len -= msg->size;
+
+ if (sio->write_queue_len == 0) {
+ if (sio->write_wakeup)
+ sio->write_wakeup(sio);
+ }
+ sio->write_offset = 0;
+ iov_iter_init_bad(it);
+ sio_msg_sent(msg);
+ msg->done(msg);
+ if (++count >= PCS_SIO_PREEMPT_LIMIT ||
+ time_is_before_jiffies(loop_timeout)) {
+ sio->flags |= PCS_SOCK_F_POOLOUT;
+ break;
+ }
+ }
+ if (done)
+ sio_push(sio);
+}
+
+void pcs_sockio_xmit(struct pcs_sockio *sio)
+{
+ struct pcs_rpc *ep = sio->parent;
+
+ BUG_ON(!mutex_is_locked(&ep->mutex));
+
+ sio->flags &= ~(PCS_SOCK_F_POOLOUT|PCS_SOCK_F_POOLIN);
+ pcs_sockio_recv(sio);
+ pcs_sockio_send(sio);
+}
+
+int pcs_sockio_delayed_seg(struct pcs_sockio *sio)
+{
+ return sio->flags & (PCS_SOCK_F_POOLOUT|PCS_SOCK_F_POOLIN);
+}
+
+void pcs_sock_sendmsg(struct pcs_sockio * sio, struct pcs_msg *msg)
+{
+ DTRACE("sio(%p) msg:%p\n", sio, msg);
+
+ if (pcs_if_error(&sio->error)) {
+ pcs_set_local_error(&msg->error, sio->error.value);
+ msg->done(msg);
+ return;
+ }
+ msg->sio = sio;
+
+ list_add_tail(&msg->list, &sio->write_queue);
+ sio->write_queue_len += msg->size;
+ msg->start_time = jiffies;
+ msg->stage = PCS_MSG_STAGE_SEND;
+
+ if (!(sio->flags & PCS_SOCK_F_POOLOUT))
+ sio->flags |= PCS_SOCK_F_POOLOUT;
+
+}
+
+/* Try to cancel a message send. If that is impossible because the message is in the
+ * middle of a write, do nothing and return an error.
+ */
+int pcs_sock_cancel_msg(struct pcs_msg * msg)
+{
+ struct pcs_sockio * sio = msg->sio;
+
+ BUG_ON(msg->sio == NULL);
+
+ if (sio->write_offset && sio->write_queue.next == &msg->list)
+ return -EBUSY;
+
+ list_del_init(&msg->list);
+ sio->write_queue_len -= msg->size;
+ msg->stage = PCS_MSG_STAGE_SENT;
+
+ if (!sio->write_queue_len) {
+ if (sio->write_wakeup)
+ sio->write_wakeup(sio);
+ }
+
+ return 0;
+}
+
+int pcs_sock_queuelen(struct pcs_sockio * sio)
+{
+ return sio->write_queue_len;
+}
+
+static void pcs_restore_sockets(struct pcs_ioconn *ioconn)
+{
+
+ struct sock *sk;
+
+ sk = ioconn->socket->sk;
+
+ write_lock_bh(&sk->sk_callback_lock);
+ sk->sk_user_data = ioconn->orig.user_data;
+ sk->sk_data_ready = ioconn->orig.data_ready;
+ sk->sk_write_space = ioconn->orig.write_space;
+ sk->sk_error_report = ioconn->orig.error_report;
+ //sock->sk->sk_state_change = pcs_state_chage;
+ write_unlock_bh(&sk->sk_callback_lock);
+
+ sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
+ sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
+}
+
+void pcs_sock_ioconn_destruct(struct pcs_ioconn *ioconn)
+{
+ struct pcs_sockio * sio = sio_from_ioconn(ioconn);
+
+ BUG_ON(sio->current_msg);
+ BUG_ON(!list_empty(&sio->write_queue));
+ BUG_ON(sio->write_queue_len);
+
+ pcs_ioconn_close(ioconn);
+
+ memset(sio, 0xFF, sizeof(*sio));
+ kfree(sio);
+}
+
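+/* The socket callbacks below run in bh context; they only mark an error bit or
+ * kick the endpoint's work queue, all real socket IO happens later in
+ * rpc_queue_work() under ep->mutex.
+ */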
+static void pcs_sk_data_ready(struct sock *sk, int count)
+{
+ struct pcs_sockio *sio = sk->sk_user_data;
+ struct pcs_rpc *ep = sio->parent;
+
+ TRACE(PEER_FMT" queue count:%d \n", PEER_ARGS(ep), count);
+
+ pcs_rpc_kick_queue(sio->parent);
+}
+static void pcs_sk_write_space(struct sock *sk)
+{
+ struct pcs_sockio *sio = sk->sk_user_data;
+ struct pcs_rpc *ep = sio->parent;
+
+ TRACE(PEER_FMT" queue \n", PEER_ARGS(ep));
+
+ pcs_rpc_kick_queue(sio->parent);
+
+}
+
+/* TODO this call back does not look correct, sane locking/error handling is required */
+static void pcs_sk_error_report(struct sock *sk)
+{
+ struct pcs_sockio * sio = sio_from_ioconn(sk->sk_user_data);
+
+ if (test_bit(PCS_IOCONN_BF_DEAD, &sio->ioconn.flags) ||
+ test_bit(PCS_IOCONN_BF_ERROR, &sio->ioconn.flags))
+ return;
+
+ set_bit(PCS_IOCONN_BF_ERROR, &sio->ioconn.flags);
+ pcs_rpc_kick_queue(sio->parent);
+}
+
+struct pcs_sockio * pcs_sockio_init(struct socket *sock,
+ int alloc_max, int hdr_max)
+{
+ struct pcs_sockio * sio;
+ struct sock *sk;
+
+ sio = kzalloc(sizeof(struct pcs_sockio) + alloc_max, GFP_NOIO);
+ if (!sio)
+ return NULL;
+
+ INIT_LIST_HEAD(&sio->write_queue);
+ sio->write_queue_len = 0;
+ sio->current_msg = NULL;
+ iov_iter_init_bad(&sio->read_iter);
+ iov_iter_init_bad(&sio->write_iter);
+ sio->read_offset = 0;
+ sio->write_offset = 0;
+ sio->hdr_max = hdr_max;
+ sio->hdr_ptr = 0;
+ sio->flags = PCS_SOCK_F_CORK;
+ sio->retrans = 0;
+
+ //// TODO:dmonakhov init ioconn here
+ INIT_LIST_HEAD(&sio->ioconn.list);
+ sk = sock->sk;
+ write_lock_bh(&sk->sk_callback_lock);
+
+	/* Back up the original callbacks */
+ sio->ioconn.orig.user_data = sk->sk_user_data;
+ sio->ioconn.orig.data_ready = sk->sk_data_ready;
+ sio->ioconn.orig.write_space = sk->sk_write_space;
+ sio->ioconn.orig.error_report = sk->sk_error_report;
+ //sio->ioconn.orig_state_change = sk->sk_state_change;
+
+ sk->sk_user_data = sio;
+ sk->sk_data_ready = pcs_sk_data_ready;
+ sk->sk_write_space = pcs_sk_write_space;
+ sk->sk_error_report = pcs_sk_error_report;
+ sk->sk_allocation = GFP_NOFS;
+
+ //sock->sk->sk_state_change = pcs_state_chage;
+
+ sk->sk_sndtimeo = PCS_SIO_TIMEOUT;
+ sio->send_timeout = PCS_SIO_TIMEOUT;
+ sio->ioconn.socket = sock;
+ sio->ioconn.destruct = pcs_sock_ioconn_destruct;
+ write_unlock_bh(&sock->sk->sk_callback_lock);
+
+ pcs_clear_error(&sio->error);
+ sio->get_msg = NULL;
+ sio->eof = NULL;
+ sio->write_wakeup = NULL;
+ return sio;
+}
+
+void pcs_sockio_start(struct pcs_sockio * sio)
+{
+ //// TODO: dmonakhov
+ ////pcs_ioconn_register(&sio->ioconn);
+}
+
+static void pcs_deaccount_msg(struct pcs_msg * msg)
+{
+ msg->sio = NULL;
+}
+
+static void pcs_account_msg(struct pcs_sockio * sio, struct pcs_msg * msg)
+{
+ msg->sio = sio;
+
+}
+
+static void pcs_msg_input_destructor(struct pcs_msg * msg)
+{
+ pcs_deaccount_msg(msg);
+ memset(msg, 0xFF, sizeof(*msg));
+ kfree(msg);
+}
+
+/* get_iter() handler for messages with embedded payload right after pcs_msg */
+void pcs_get_iter_inline(struct pcs_msg * msg, int offset, struct iov_iter *it)
+{
+ BUG_ON(offset >= msg->size);
+
+ iov_iter_init_plain(it, msg->_inline_buffer, msg->size, 0);
+ iov_iter_advance(it, offset);
+}
+
+struct pcs_msg * pcs_alloc_input_msg(struct pcs_sockio * sio, int datalen)
+{
+ struct pcs_msg * msg;
+
+ msg = kmalloc(sizeof(struct pcs_msg) + datalen, GFP_NOIO);
+ if (msg) {
+
+ pcs_msg_io_init(msg);
+ pcs_account_msg(sio, msg);
+ msg->destructor = pcs_msg_input_destructor;
+ msg->get_iter = pcs_get_iter_inline;
+ }
+ return msg;
+}
+
+static void pcs_io_msg_output_destructor(struct pcs_msg * msg)
+{
+ BUG_ON(msg->rpc);
+ memset(msg, 0xFF, sizeof(*msg));
+ kfree(msg);
+}
+
+
+struct pcs_msg * pcs_alloc_output_msg(int datalen)
+{
+ struct pcs_msg * msg;
+
+ msg = kmalloc(sizeof(struct pcs_msg) + datalen, GFP_NOIO);
+ if (msg) {
+ pcs_msg_io_init(msg);
+ msg->rpc = NULL;
+ msg->sio = NULL;
+ msg->destructor = pcs_io_msg_output_destructor;
+ msg->get_iter = pcs_get_iter_inline;
+ }
+ return msg;
+}
+
+void pcs_free_msg(struct pcs_msg * msg)
+{
+ pcs_msg_io_fini(msg);
+
+ if (msg->destructor)
+ msg->destructor(msg);
+}
+
+/* iter_iter() handler for cloned messages */
+static void get_iter_clone(struct pcs_msg * msg, int offset, struct iov_iter *it)
+{
+ struct pcs_msg * parent = msg->private;
+
+ BUG_ON(offset >= msg->size);
+
+ parent->get_iter(parent, offset, it);
+}
+
+void pcs_clone_done(struct pcs_msg * msg)
+{
+ struct pcs_msg * parent = msg->private;
+
+ pcs_copy_error_cond(&parent->error, &msg->error);
+
+ pcs_msg_io_end(parent);
+
+ pcs_free_msg(msg);
+}
+
+struct pcs_msg * pcs_clone_msg(struct pcs_msg * msg)
+{
+ struct pcs_msg * clone;
+
+ clone = kmalloc(sizeof(struct pcs_msg), GFP_NOIO);
+ if (clone) {
+ pcs_msg_io_init(clone);
+ clone->rpc = NULL;
+ clone->size = msg->size;
+ clone->timeout = 0;
+ clone->done = pcs_clone_done;
+ clone->destructor = pcs_io_msg_output_destructor;
+ clone->private = msg;
+ clone->get_iter = get_iter_clone;
+ }
+ return clone;
+}
+
+/* iter_iter() handler for cloned messages */
+static void get_iter_cow_clone(struct pcs_msg * msg, int offset, struct iov_iter *it)
+{
+ struct pcs_msg * parent = msg->private;
+
+ BUG_ON(offset >= msg->size);
+
+ if (offset < msg->_inline_len) {
+ iov_iter_init_plain(it, msg->_inline_buffer, msg->_inline_len, 0);
+ iov_iter_advance(it, offset);
+ } else {
+ parent->get_iter(parent, offset, it);
+ }
+}
+
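+/* Copy-on-write clone: the first copy_len bytes are snapshotted into the clone's
+ * inline buffer, while the rest of the payload is still fetched from the parent
+ * message via its get_iter().
+ */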
+struct pcs_msg * pcs_cow_msg(struct pcs_msg * msg, int copy_len)
+{
+ struct pcs_msg * clone;
+
+ clone = kmalloc(sizeof(struct pcs_msg) + copy_len, GFP_NOIO);
+ if (clone) {
+ pcs_msg_io_init(clone);
+ clone->rpc = NULL;
+ clone->size = msg->size;
+ clone->timeout = 0;
+ clone->done = pcs_clone_done;
+ clone->destructor = pcs_io_msg_output_destructor;
+ clone->private = msg;
+ BUG_ON(copy_len > SHRT_MAX);
+ clone->_inline_len = (short)copy_len;
+ memcpy(clone->_inline_buffer, msg_inline_head(msg), copy_len);
+ clone->get_iter = get_iter_cow_clone;
+ }
+ return clone;
+}
+
+void pcs_sock_throttle(struct pcs_sockio * sio)
+{
+ if ((sio->flags & PCS_SOCK_F_THROTTLE) ||
+ test_bit(PCS_IOCONN_BF_DEAD, &sio->ioconn.flags))
+ return;
+
+ DTRACE("Throttle on socket %p rpc=%p", sio, sio->parent);
+ sio->flags |= PCS_SOCK_F_THROTTLE;
+}
+
+void pcs_sock_unthrottle(struct pcs_sockio * sio)
+{
+ if (!(sio->flags & PCS_SOCK_F_THROTTLE) ||
+ test_bit(PCS_IOCONN_BF_DEAD, &sio->ioconn.flags))
+ return;
+
+ DTRACE("Unthrottle on socket %p rpc=%p", sio, sio->parent);
+ sio->flags &= ~PCS_SOCK_F_THROTTLE;
+ if ((sio->flags & PCS_SOCK_F_EOF))
+ return;
+}
diff --git a/fs/fuse/kio/pcs/pcs_sock_io.h b/fs/fuse/kio/pcs/pcs_sock_io.h
new file mode 100644
index 000000000000..c1dfd422b360
--- /dev/null
+++ b/fs/fuse/kio/pcs/pcs_sock_io.h
@@ -0,0 +1,236 @@
+#ifndef _PCS_SOCK_IO_H_
+#define _PCS_SOCK_IO_H_ 1
+
+#include <linux/net.h>
+
+#include "pcs_types.h"
+////#include "pcs_process.h"
+#include "pcs_error.h"
+#include "log.h"
+
+#define PCS_MSG_MAX_CALENDAR 64
+#define PCS_SIO_TIMEOUT (60*HZ)
+
+#define PCS_SIO_PREEMPT_LIMIT 16
+#define PCS_SIO_SLICE (5 * HZ / 1000) /* 5ms */
+
+
+struct pcs_api_channel
+{
+ unsigned sio_count;
+ unsigned msg_count;
+};
+
+__pre_packed struct pcs_msg
+{
+ struct __pre_aligned(16) {
+ struct list_head list;
+
+ pcs_error_t error;
+ abs_time_t start_time;
+
+ void *private;
+ void *private2; /* Huh? Need to do something else here. */
+		struct pcs_msg *response; /* Consider removing; it could be done by passing
+					   * a second argument to done().
+					   */
+ struct pcs_sockio *sio;
+ struct pcs_rpc *rpc;
+
+ int size;
+ int _iocount;
+ unsigned short timeout;
+ unsigned char kill_slot;
+ unsigned char stage;
+ abs_time_t io_start_time;
+
+ struct hlist_node kill_link;
+
+ void (*get_iter)(struct pcs_msg *, int offset, struct iov_iter *it);
+
+ void (*done)(struct pcs_msg *);
+ void (*destructor)(struct pcs_msg *);
+ void *pool;
+ struct iovec _inline_iovec;
+ int accounted;
+
+ short _align_offset;
+ short _inline_len;
+ } __aligned(16);
+ u64 __pad16_8;
+ char _inline_buffer[0];
+} __packed;
+
+static inline void * pcs_msg_aligned_data(struct pcs_msg * msg, int offset)
+{
+ return (void*)((char *)msg + msg->_align_offset + offset);
+}
+
+enum
+{
+ PCS_MSG_STAGE_NONE = 0, /* Initial state */
+ PCS_MSG_STAGE_UNSENT = 1, /* Message queued somewhere before send */
+ PCS_MSG_STAGE_SEND = 2, /* Message queued on socket queue */
+ PCS_MSG_STAGE_SENT = 3, /* Message is sent */
+	PCS_MSG_STAGE_WAIT = 4,	/* Message is waiting for response */
+ PCS_MSG_STAGE_DONE = 5, /* Response received */
+};
+
+enum
+{
+ PCS_SOCK_F_THROTTLE = 1,
+ PCS_SOCK_F_CORK = 2,
+ PCS_SOCK_F_DYNAMIC_SIZE = 4,
+ PCS_SOCK_F_EOF = 8,
+ PCS_SOCK_F_POOLIN = 0x10,
+ PCS_SOCK_F_POOLOUT = 0x20,
+};
+
+enum
+{
+ PCS_IOCONN_BF_DEAD = 0,
+ PCS_IOCONN_BF_ERROR = 1, /* Notify from ->sk_error_report */
+};
+struct pcs_ioconn {
+
+ struct list_head list;
+ struct socket *socket;
+
+ unsigned long flags; /* atomic bit ops */
+ /* Save original socket->sk callbacks */
+ struct {
+ void *user_data;
+ void (*state_change)(struct sock *sk);
+ void (*error_report)(struct sock *sk);
+ void (*data_ready)(struct sock *sk, int bytes);
+ void (*write_space)(struct sock *sk);
+ } orig;
+ void(*destruct)(struct pcs_ioconn *);
+
+};
+
+struct pcs_sockio
+{
+ struct pcs_ioconn ioconn;
+
+ struct list_head write_queue;
+ int write_queue_len;
+ spinlock_t q_lock;
+ void *parent;
+
+ pcs_error_t error;
+ int send_timeout;
+ int hdr_ptr;
+ int hdr_max;
+ unsigned int flags;
+ u32 retrans;
+
+ struct pcs_msg *current_msg;
+ int read_offset;
+ int write_offset;
+ struct iov_iter read_iter;
+ struct iov_iter write_iter;
+ struct mutex mutex;
+ struct pcs_msg * (*get_msg)(struct pcs_sockio *);
+ /* eof() handler could be called twice: once on graceful socket shutdown and from sio_abort() */
+ void (*eof)(struct pcs_sockio *);
+ void (*write_wakeup)(struct pcs_sockio *);
+
+ char _inline_buffer[0];
+};
+
+#define sio_from_ioconn(conn) container_of(conn, struct pcs_sockio, ioconn)
+
+struct pcs_sockio * pcs_sockio_init(struct socket* sock, int alloc_max, int hdr_max);
+void pcs_sockio_start(struct pcs_sockio * sio);
+void pcs_sock_sendmsg(struct pcs_sockio * sio, struct pcs_msg *msg);
+int pcs_sock_cancel_msg(struct pcs_msg * msg);
+void pcs_sockio_xmit(struct pcs_sockio *sio);
+int pcs_sockio_delayed_seg(struct pcs_sockio *sio);
+int pcs_sock_queuelen(struct pcs_sockio * sio);
+void pcs_sock_abort(struct pcs_sockio * sio);
+void pcs_sock_error(struct pcs_sockio * sio, int error);
+
+void pcs_sock_throttle(struct pcs_sockio * sio);
+void pcs_sock_unthrottle(struct pcs_sockio * sio);
+
+struct pcs_msg * pcs_alloc_input_msg(struct pcs_sockio * sio, int datalen);
+struct pcs_msg * pcs_alloc_output_msg(int datalen);
+struct pcs_msg * pcs_clone_msg(struct pcs_msg * msg);
+struct pcs_msg * pcs_cow_msg(struct pcs_msg * msg, int data_len);
+void pcs_clone_done(struct pcs_msg * msg);
+void pcs_free_msg(struct pcs_msg * msg);
+void pcs_get_iter_inline(struct pcs_msg * msg, int offset, struct iov_iter *it);
+
+static inline void * msg_inline_head(struct pcs_msg * msg)
+{
+ struct iov_iter i;
+ void *map, *buf;
+ size_t len;
+
+ msg->get_iter(msg, 0, &i);
+ map = iov_iter_kmap_atomic(&i, &buf, &len);
+ /* the inline head is always in kernel memory */
+ BUG_ON(map);
+ BUG_ON(len > msg->size);
+
+ return buf;
+}
+
+static inline void * sio_inline_buffer(struct pcs_sockio * sio)
+{
+ return sio->_inline_buffer;
+}
+
+static inline void pcs_msg_io_init(struct pcs_msg * msg)
+{
+ pcs_clear_error(&msg->error);
+ msg->_iocount = 0;
+ msg->done = pcs_free_msg;
+}
+
+static inline void pcs_msg_io_start(struct pcs_msg * msg, void (*done)(struct pcs_msg *))
+{
+ BUG_ON(msg->_iocount != 0);
+ msg->_iocount = 1;
+ msg->done = done;
+}
+
+static inline struct pcs_msg * pcs_msg_io_sched(struct pcs_msg * msg)
+{
+ BUG_ON(msg->_iocount <= 0);
+ msg->_iocount++;
+ return msg;
+}
+
+static inline void pcs_msg_io_end(struct pcs_msg * msg)
+{
+ BUG_ON(msg->_iocount <= 0);
+ if (--msg->_iocount == 0)
+ msg->done(msg);
+}
+
+static inline void pcs_msg_io_fini(struct pcs_msg * msg)
+{
+ BUG_ON(msg->_iocount != 0);
+}
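+
+/*
+ * Editor's note: illustrative sketch only, not part of the original patch.
+ * The _iocount helpers act as a manual reference count on in-flight I/O:
+ * io_start() takes the initial reference and installs done(), io_sched()
+ * takes one more per additional submission, io_end() drops one and fires
+ * done() when the count reaches zero. The submit callback parameter below
+ * is a made-up stand-in for whatever actually queues the message.
+ */
+static inline void example_msg_lifecycle(struct pcs_msg *msg,
+ void (*done)(struct pcs_msg *),
+ void (*example_submit)(struct pcs_msg *))
+{
+ pcs_msg_io_start(msg, done); /* _iocount = 1 */
+ example_submit(pcs_msg_io_sched(msg)); /* _iocount = 2 */
+ pcs_msg_io_end(msg); /* drop the initial ref; done() runs after the
+ * submitted I/O calls pcs_msg_io_end() too */
+}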
+
+
+struct bufqueue;
+
+/**
+ Present a portion of @bq as a pcs_msg that may be passed to pcs_sock_sendmsg().
+ Reading data from the pcs_msg will drain @bq.
+
+ \param @bq the buffer queue with the data of a message
+ \param @size the length of the head of @bq that will be presented as a pcs_msg
+ \returns a pcs_msg that reads data from @bq
+*/
+struct pcs_msg* bufqueue_as_pcs_output_msg(struct bufqueue *bq, u32 size);
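+
+/*
+ * Editor's note: illustrative sketch only, not part of the original patch.
+ * @bq is assumed to already hold at least @size bytes of an encoded
+ * message; sending the wrapping pcs_msg drains that head of the queue.
+ */
+static inline void example_send_from_bufqueue(struct pcs_sockio *sio,
+ struct bufqueue *bq, u32 size)
+{
+ struct pcs_msg *msg = bufqueue_as_pcs_output_msg(bq, size);
+
+ pcs_sock_sendmsg(sio, msg);
+}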
+
+
+void pcs_ioconn_unregister(struct pcs_ioconn *ioconn);
+void pcs_ioconn_close(struct pcs_ioconn *ioconn);
+
+
+#endif /* _PCS_SOCK_IO_H_ */
diff --git a/fs/fuse/kio/pcs/pcs_timer.h b/fs/fuse/kio/pcs/pcs_timer.h
new file mode 100644
index 000000000000..f5ab4375ace1
--- /dev/null
+++ b/fs/fuse/kio/pcs/pcs_timer.h
@@ -0,0 +1,19 @@
+#ifndef _PCS_TIMER_H_
+#define _PCS_TIMER_H_ 1
+
+#include "pcs_types.h"
+
+abs_time_t get_real_time_ms(void);
+
+static inline abs_time_t get_abs_time_fast_us(void)
+{
+ return ktime_to_ns(ktime_get()) / NSEC_PER_USEC;
+}
+
+static inline abs_time_t get_abs_time_us(void)
+{
+ return ktime_to_ns(ktime_get_real()) / NSEC_PER_USEC;
+}
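+
+/*
+ * Editor's note: illustrative sketch only, not part of the original patch.
+ * get_abs_time_fast_us() uses the monotonic clock, so it is the natural
+ * choice for measuring intervals; get_abs_time_us() follows wall-clock time.
+ */
+static inline abs_time_t example_elapsed_us(abs_time_t start)
+{
+ /* Microseconds elapsed since @start, immune to wall-clock jumps. */
+ return get_abs_time_fast_us() - start;
+}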
+
+
+#endif /* _PCS_TIMER_H_ */
diff --git a/fs/fuse/kio/pcs/pcs_types.h b/fs/fuse/kio/pcs/pcs_types.h
new file mode 100644
index 000000000000..f5c886e49619
--- /dev/null
+++ b/fs/fuse/kio/pcs/pcs_types.h
@@ -0,0 +1,38 @@
+#ifndef __PCS_TYPES_H__
+#define __PCS_TYPES_H__
+
+#include <linux/types.h>
+#include <linux/timer.h>
+
+typedef int pcs_fd_t;
+typedef int pcs_sock_t;
+typedef unsigned long ULONG_PTR;
+typedef unsigned long long abs_time_t;
+typedef struct timer_list pcs_timer_t;
+#define PCS_INVALID_FD (-1)
+#define PCS_API
+
+#include "pcs_align.h"
+
+typedef struct __pre_aligned(8) _PCS_NODE_ID_T {
+ u64 val;
+} PCS_NODE_ID_T __aligned(8);
+
+
+/* from: pcs_net_addr.h */
+enum
+{
+ PCS_ADDRTYPE_NONE = 0,
+ PCS_ADDRTYPE_IP = 1,
+ PCS_ADDRTYPE_IP6 = 2,
+ PCS_ADDRTYPE_UNIX = 3,
+};
+
+/* alignment makes it usable in binary protocols */
+typedef struct __pre_aligned(8) _PCS_NET_ADDR_T {
+ u32 type;
+ u32 port; /* network byteorder! */
+ u8 address[16];
+} PCS_NET_ADDR_T __aligned(8);
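+
+/*
+ * Editor's note: illustrative sketch only, not part of the original patch.
+ * The assumption that an IPv4 address occupies the first 4 bytes of
+ * address[] is the editor's; the real layout is defined by the protocol,
+ * not by this header. @port_be is taken to be already in network byte
+ * order, as the field comment above requires.
+ */
+static inline void example_fill_ipv4(PCS_NET_ADDR_T *na, __be32 ip, u32 port_be)
+{
+ memset(na, 0, sizeof(*na));
+ na->type = PCS_ADDRTYPE_IP;
+ na->port = port_be; /* kept in network byte order */
+ memcpy(na->address, &ip, sizeof(ip)); /* assumed: IPv4 in first 4 bytes */
+}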
+
+#endif /* __PCS_TYPES_H__ */