[Devel] [PATCH RHEL7 COMMIT] fuse kio: Add pcs engine combo v0.8

Konstantin Khorenko khorenko at virtuozzo.com
Mon Feb 19 14:22:33 MSK 2018


The commit is pushed to "branch-rh7-3.10.0-693.17.1.vz7.45.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-693.17.1.vz7.43.7
------>
commit f51eb62d39b6ffe05458cb548f4bd82bc6f47bc5
Author: Dmitry Monakhov <dmonakhov at openvz.org>
Date:   Mon Feb 19 14:22:33 2018 +0300

    fuse kio: Add pcs engine combo v0.8
    
    https://jira.sw.ru/browse/PSBM-80680
    Signed-off-by: Dmitry Monakhov <dmonakhov at openvz.org>
---
 fs/fuse/Kconfig                         |    6 +
 fs/fuse/Makefile                        |   11 +
 fs/fuse/kio/pcs/fuse_io.c               |  168 ++
 fs/fuse/kio/pcs/log.h                   |   45 +
 fs/fuse/kio/pcs/pcs_align.h             |   18 +
 fs/fuse/kio/pcs/pcs_client_types.h      |  164 ++
 fs/fuse/kio/pcs/pcs_cluster.c           |  332 ++++
 fs/fuse/kio/pcs/pcs_cluster.h           |  106 ++
 fs/fuse/kio/pcs/pcs_cluster_core.c      |  214 +++
 fs/fuse/kio/pcs/pcs_cs.c                | 1067 +++++++++++
 fs/fuse/kio/pcs/pcs_cs.h                |  182 ++
 fs/fuse/kio/pcs/pcs_cs_prot.h           |  125 ++
 fs/fuse/kio/pcs/pcs_error.h             |  189 ++
 fs/fuse/kio/pcs/pcs_flow_detect.h       |    7 +
 fs/fuse/kio/pcs/pcs_flow_detect_stub.h  |   76 +
 fs/fuse/kio/pcs/pcs_fuse_kdirect.c      |  742 ++++++++
 fs/fuse/kio/pcs/pcs_ioctl.h             |   85 +
 fs/fuse/kio/pcs/pcs_map.c               | 2999 +++++++++++++++++++++++++++++++
 fs/fuse/kio/pcs/pcs_map.h               |  264 +++
 fs/fuse/kio/pcs/pcs_mds_prot.h          | 1335 ++++++++++++++
 fs/fuse/kio/pcs/pcs_perfcounters.h      |    7 +
 fs/fuse/kio/pcs/pcs_perfcounters_stub.h |   30 +
 fs/fuse/kio/pcs/pcs_prot_types.h        |  451 +++++
 fs/fuse/kio/pcs/pcs_req.c               |  116 ++
 fs/fuse/kio/pcs/pcs_req.h               |  320 ++++
 fs/fuse/kio/pcs/pcs_rpc.c               | 1314 ++++++++++++++
 fs/fuse/kio/pcs/pcs_rpc.h               |  290 +++
 fs/fuse/kio/pcs/pcs_rpc_prot.h          |   97 +
 fs/fuse/kio/pcs/pcs_sock_io.c           |  702 ++++++++
 fs/fuse/kio/pcs/pcs_sock_io.h           |  236 +++
 fs/fuse/kio/pcs/pcs_timer.h             |   19 +
 fs/fuse/kio/pcs/pcs_types.h             |   38 +
 32 files changed, 11755 insertions(+)

diff --git a/fs/fuse/Kconfig b/fs/fuse/Kconfig
index a0591e4b3a04..433a39957c9d 100644
--- a/fs/fuse/Kconfig
+++ b/fs/fuse/Kconfig
@@ -39,3 +39,9 @@ config FUSE_KIO_NULLIO
 	depends on FUSE_FS
 	help
 	  This FUSE extension allows to handle io requests directly inside kernel
+
+config FUSE_KIO_PCS
+	tristate "Enable kdirect PCS io engine"
+	depends on FUSE_FS
+	help
+	  This FUSE extension allows forwarding io requests directly to PCS
diff --git a/fs/fuse/Makefile b/fs/fuse/Makefile
index f7500f0e832e..cdefac9c4fbe 100644
--- a/fs/fuse/Makefile
+++ b/fs/fuse/Makefile
@@ -11,4 +11,15 @@ fuse_kio_noop-objs := kio/kio_noop.o
 obj-$(CONFIG_FUSE_KIO_NULLIO)	+= fuse_kio_nullio.o
 fuse_kio_nullio-objs := kio/kio_nullio.o
 
+obj-$(CONFIG_FUSE_KIO_PCS)	+= fuse_kio_pcs.o
+fuse_kio_pcs-objs := kio/pcs/pcs_fuse_kdirect.o \
+	kio/pcs/pcs_sock_io.o \
+	kio/pcs/pcs_rpc.o \
+	kio/pcs/pcs_req.o \
+	kio/pcs/pcs_map.o \
+	kio/pcs/pcs_cluster.o \
+	kio/pcs/pcs_cluster_core.o \
+	kio/pcs/pcs_cs.o \
+	kio/pcs/fuse_io.o
+
 fuse-objs := dev.o dir.o file.o inode.o control.o
diff --git a/fs/fuse/kio/pcs/fuse_io.c b/fs/fuse/kio/pcs/fuse_io.c
new file mode 100644
index 000000000000..c9eaa8d453db
--- /dev/null
+++ b/fs/fuse/kio/pcs/fuse_io.c
@@ -0,0 +1,168 @@
+#include <net/sock.h>
+#include <linux/net.h>
+#include <linux/kthread.h>
+#include <linux/types.h>
+#include <linux/pagemap.h>
+
+#include "pcs_types.h"
+#include "pcs_sock_io.h"
+#include "pcs_rpc.h"
+#include "pcs_sock_io.h"
+#include "pcs_req.h"
+#include "pcs_map.h"
+#include "pcs_cs.h"
+#include "pcs_cluster.h"
+#include "log.h"
+
+#include "../../fuse_i.h"
+
+static void intreq_complete(struct pcs_int_request *ireq)
+{
+	pcs_api_iorequest_t *req = ireq->apireq.req;
+
+	BUG_ON(ireq->type != PCS_IREQ_API);
+
+	if (pcs_if_error(&ireq->error)) {
+		req->flags |= PCS_REQ_F_ERROR;
+		if (ireq->error.value == PCS_ERR_NO_STORAGE ||
+		    ireq->error.value == PCS_ERR_CSD_LACKING)
+			req->flags |= PCS_REQ_F_NOSPACE;
+	}
+	req->complete(req);
+}
+
+static void on_read_done(struct pcs_fuse_req *r, size_t size)
+{
+	struct pcs_fuse_cluster *pfc = cl_from_req(r);
+
+	DTRACE("do fuse_request_end req:%p op:%d err:%d\n", &r->req, r->req.in.h.opcode, r->req.out.h.error);
+	r->req.out.args[0].size = size;
+	request_end(pfc->fc, &r->req);
+}
+
+static void on_sync_done(struct pcs_fuse_req *r)
+{
+	struct pcs_fuse_cluster *pfc = cl_from_req(r);
+
+	DTRACE("do fuse_request_end req:%p op:%d err:%d\n", &r->req, r->req.in.h.opcode, r->req.out.h.error);
+	request_end(pfc->fc, &r->req);
+}
+
+static void on_write_done(struct pcs_fuse_req *r, off_t pos, size_t size)
+{
+	struct fuse_write_out *out = &r->req.misc.write.out;
+	struct pcs_fuse_cluster *pfc = cl_from_req(r);
+
+	out->size = size;
+
+	DTRACE("do fuse_request_end req:%p op:%d err:%d\n", &r->req, r->req.in.h.opcode, r->req.out.h.error);
+	request_end(pfc->fc, &r->req);
+}
+
+static void req_get_iter(void *data, unsigned int offset, struct iov_iter *it)
+{
+	struct pcs_fuse_req *r = data;
+
+	iov_iter_init_bvec(it, r->exec.io.bvec, r->exec.io.num_bvecs, r->exec.io.req.size, 0);
+	iov_iter_advance(it, offset);
+}
+
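+/* Build the bvec array describing the request payload: either reuse the bvecs already
+ * attached to the fuse_req, or wrap its pages into the inline bvec array. When zeroing
+ * is requested, partially filled pages are cleared so that short reads do not expose
+ * stale data.
+ */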
+static inline void set_io_buff(struct pcs_fuse_req *r, off_t offset, size_t size,
+			       int is_bvec, int zeroing)
+{
+	int i;
+	size_t count = 0;
+	if (is_bvec) {
+		r->exec.io.bvec = r->req.bvec;
+		r->exec.io.num_bvecs = r->req.num_bvecs;
+	} else {
+		r->exec.io.bvec = r->exec.io.inline_bvec;
+		r->exec.io.num_bvecs = r->req.num_pages;
+		for (i = 0; i < r->req.num_pages && count < size; i++) {
+			r->exec.io.bvec[i].bv_page = r->req.pages[i];
+			r->exec.io.bvec[i].bv_offset = r->req.page_descs[i].offset;
+			r->exec.io.bvec[i].bv_len = r->req.page_descs[i].length;
+			count += r->exec.io.bvec[i].bv_len;
+		}
+	}
+	count = 0;
+	for (i = 0; i < r->exec.io.num_bvecs; i++) {
+		count += r->exec.io.bvec[i].bv_len;
+		if (zeroing && r->exec.io.bvec[i].bv_len < PAGE_SIZE)
+			clear_highpage(r->exec.io.bvec[i].bv_page);
+	}
+	BUG_ON(size > count);
+	r->exec.io.req.pos = offset;
+	r->exec.io.req.size = size;
+}
+
+static void prepare_io_(struct pcs_fuse_req *r, unsigned short type, off_t offset, size_t size,
+		       void (*complete)(struct _pcs_api_iorequest_t *))
+{
+	/* Use inline request structure */
+	struct pcs_int_request *ireq = &r->exec.ireq;
+
+	TRACE("INIT r(%p) ireq:%p {%ld, %ld}\n", r, ireq, offset, size);
+
+	/* Initialize IO request */
+	switch (type)
+	{
+	case PCS_REQ_T_READ:
+		BUG_ON(r->req.out.argbvec && r->req.out.argpages);
+		set_io_buff(r, offset, size, r->req.out.argbvec, r->req.out.page_zeroing);
+		break;
+	case PCS_REQ_T_WRITE:
+		BUG_ON(r->req.in.argbvec && r->req.in.argpages);
+		set_io_buff(r, offset, size, r->req.in.argbvec, 0);
+		break;
+	}
+
+	r->exec.io.req.type = type;
+	r->exec.io.req.datasource = r;
+	r->exec.io.req.get_iter = req_get_iter;
+	r->exec.io.req.complete = complete;
+
+	/* Initialize internal request structure */
+	ireq->type = PCS_IREQ_API;
+	ireq->apireq.req = &r->exec.io.req;
+	ireq->complete_cb = intreq_complete;
+	ireq->completion_data.parent = 0;
+	ireq->completion_data.ctx = r;
+	ireq->completion_data.priv = r;
+}
+
+static void ioreq_complete(pcs_api_iorequest_t *ioreq)
+{
+	struct pcs_fuse_req *r = ioreq->datasource;
+
+	BUG_ON(ioreq != &r->exec.io.req);
+
+	if (ioreq->flags & PCS_REQ_F_ERROR) {
+		if (ioreq->flags & PCS_REQ_F_NOSPACE)
+			r->req.out.h.error = -ENOSPC;
+		else
+			r->req.out.h.error = -EIO;
+	} else {
+		r->req.out.h.error = 0;
+	}
+
+	switch (ioreq->type) {
+	case PCS_REQ_T_READ:
+		on_read_done(r, ioreq->size);
+		break;
+	case PCS_REQ_T_WRITE:
+		on_write_done(r, ioreq->pos, ioreq->size);
+		break;
+	case PCS_REQ_T_SYNC:
+		on_sync_done(r);
+		break;
+	default:
+		BUG();
+	}
+
+}
+
+void pcs_fuse_prep_io(struct pcs_fuse_req *r, unsigned short type, off_t offset, size_t size)
+{
+	prepare_io_(r, type, offset, size, ioreq_complete);
+}
diff --git a/fs/fuse/kio/pcs/log.h b/fs/fuse/kio/pcs/log.h
new file mode 100644
index 000000000000..ee524a8b7a34
--- /dev/null
+++ b/fs/fuse/kio/pcs/log.h
@@ -0,0 +1,45 @@
+#ifndef __PCSLOG_H__
+#define __PCSLOG_H__
+
+#include <linux/printk.h>
+
+/*
+ * Log level values and flags
+ */
+#define LOG_ERR		0
+#define LOG_WARN	1
+#define LOG_INFO	2
+#define LOG_DEBUG	4
+/* The high debug levels are used for dumping the system state */
+#define LOG_DEBUG2	5
+#define LOG_DEBUG3	6
+/* Tracing levels */
+#define LOG_TRACE	7
+#define LOG_DEBUG4	8
+#define LOG_DEBUG5	9
+#define LOG_LEVEL_MAX	LOG_DEBUG5
+
+
+#define __PCS_DEBUG__ 1
+#define __PCS_DTRACE__ 1
+
+#ifndef __PCS_DEBUG__
+#define pcs_log(level, fmt, ...)
+#define TRACE(fmt, ...) do {} while (0)
+#define DTRACE(fmt, ...) do {} while (0)
+#else
+static int pcs_loglevel __attribute__ ((unused)) = LOG_DEBUG;
+#define pcs_log(level, fmt, args...) do					\
+	{								\
+		if (level <= pcs_loglevel)				\
+			pr_debug(fmt , ##args);				\
+	} while (0)
+#define TRACE(fmt, args...)	trace_printk("%d: " fmt "\n", __LINE__, ## args)
+
+#ifndef __PCS_DTRACE__
+#define DTRACE(fmt, ...) do {} while (0)
+#else
+#define DTRACE(fmt, args...)	trace_printk("%d: " fmt "\n", __LINE__, ## args)
+#endif
+#endif
+#endif /* __PCSLOG_H__ */
diff --git a/fs/fuse/kio/pcs/pcs_align.h b/fs/fuse/kio/pcs/pcs_align.h
new file mode 100644
index 000000000000..8dac73cb9713
--- /dev/null
+++ b/fs/fuse/kio/pcs/pcs_align.h
@@ -0,0 +1,18 @@
+#ifndef __PCS_ALIGN_H__
+#define __PCS_ALIGN_H__
+
+#include "pcs_types.h"
+
+/* ----- helpers ----- */
+
+#if defined(__GNUC__) || defined(__clang__)
+
+#define __pre_aligned(x)
+#define __pre_packed
+#define __unaligned		__attribute__((packed, may_alias))
+#endif
+
+#define PCS_ALIGN_TO(sz, align) (((sz)+(align)-1)&~((align)-1))
+#define PCS_ALIGN(sz) PCS_ALIGN_TO(sz, 8)
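+/* For example, PCS_ALIGN_TO(13, 8) == 16 and PCS_ALIGN(8) == 8; the alignment is
+ * assumed to be a power of two. */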
+
+#endif /* __PCS_ALIGN_H__ */
diff --git a/fs/fuse/kio/pcs/pcs_client_types.h b/fs/fuse/kio/pcs/pcs_client_types.h
new file mode 100644
index 000000000000..3bffd4992221
--- /dev/null
+++ b/fs/fuse/kio/pcs/pcs_client_types.h
@@ -0,0 +1,164 @@
+#ifndef _PCS_CLIENT_TYPES_H_
+#define _PCS_CLIENT_TYPES_H_ 1
+
+#include "pcs_prot_types.h"
+#include "pcs_mds_prot.h"
+#include "pcs_flow_detect.h"
+
+/* Lease values. These are plain values, not a bitmask. */
+#define PCS_LEASE_NONE		0
+#define PCS_LEASE_READ		1
+#define PCS_LEASE_WRITE		2
+#define PCS_LEASE_VALIDATE	3
+
+struct pcs_dentry_name {
+	const char		*data;
+	int			len;
+};
+
+struct pcs_dentry_id {
+	PCS_FILE_ID_T		parent;
+	struct pcs_dentry_name	name;
+};
+
+struct pcs_map_set {
+	struct list_lru		lru;
+	struct list_lru		dirty_lru;
+	struct list_head	dirty_queue;
+	spinlock_t		lock;
+	atomic_t		count;
+	atomic_t		dirty_count;
+	int			map_thresh;
+	int			map_dirty_thresh;
+	int			map_max;
+	struct shrinker		shrinker;
+
+	/* TODO: temporarily disabled */
+	struct pcs_flow_table_global ftab;
+};
+
+struct pcs_mapping {
+	struct pcs_cluster_core	*cluster;
+	unsigned		chunk_size_bits;
+	unsigned long		nrmaps;
+	struct radix_tree_root  map_tree; /* GFP_ATOMIC */
+	spinlock_t		map_lock;
+	struct pcs_flow_table	ftab;
+};
+
+struct fuse_inode;
+struct pcs_dentry_info {
+	struct pcs_dentry_id	id;
+	struct pcs_mds_fileinfo	fileinfo;
+	PCS_FILETIME_T		local_mtime;
+	struct pcs_mapping	mapping;
+	struct pcs_cluster_core	*cluster;
+	struct fuse_inode	*inode;
+};
+
+static inline void pcs_clear_fileinfo(struct pcs_dentry_info *i)
+{
+	struct pcs_mds_fileinfo *mi = (struct pcs_mds_fileinfo *)&i->fileinfo;
+
+	memset(mi, 0, sizeof(*mi));
+}
+
+static inline void pcs_set_fileinfo(struct pcs_dentry_info *i, const struct pcs_mds_fileinfo *finfo)
+{
+	struct pcs_mds_fileinfo *mi = &i->fileinfo;
+
+	*mi = *finfo;
+
+	if (mi->sys.stripe_depth == 0) {
+		mi->sys.stripe_depth = 1;
+		mi->sys.strip_width = mi->sys.chunk_size;
+	}
+	i->mapping.chunk_size_bits = ilog2(mi->sys.chunk_size);
+
+}
+
+/* Size constants */
+#define PCS_MAX_SYMLINK_SIZE	4095
+#define PCS_DFLT_MSS_WRITE	(64*1024)
+#define PCS_DFLT_MSS_READ	(128*1024)
+#define PCS_DFLT_MSS_LOCAL	(512*1024)
+
+/* Minimal delay before retrying failed operation. */
+#define PCS_ERROR_DELAY		200
+/* Maximum delay before retrying failed operation. */
+#define PCS_ERROR_DELAY_MAX	5000
+#define PCS_LEASE_RETRY		3
+
+#define PCS_INFO_DIR_COMPAT	".pstorage.info"
+#define PCS_INFO_DIR ".vstorage.info"
+
+/* Special magic suffix. readlink() on a name with such a suffix on a fuse-mounted pcs
+ * returns the URI of the file, which can be accessed via the pcs api. If the file is a
+ * pstorage symlink, its contents are returned to be run through the VFS layer again: we
+ * cannot do this internally.
+ */
+#define PCS_API_URI_SUFFIX "#._PSTORAGE_URI_"
+
+enum {
+	PCS_REQ_T_READ = 0,
+	PCS_REQ_T_WRITE = 1,
+	PCS_REQ_T_SYNC = 2,
+};
+
+/* Request flags */
+#define PCS_REQ_F_ERROR		2
+#define PCS_REQ_F_NOSPACE	4
+#define PCS_REQ_F_CACHED	0x10
+
+struct iov_iter;
+typedef struct _pcs_api_iorequest_t {
+	off_t		pos;
+	size_t		size;
+	unsigned short	type;
+	unsigned short	flags;
+
+	void		*datasource;
+	void		(*get_iter)(void *datasource, unsigned int offset, struct iov_iter *it);
+
+	void		(*complete)(struct _pcs_api_iorequest_t *);
+} pcs_api_iorequest_t;
+
+typedef struct _pcs_api_csconnreq_t {
+	PCS_NODE_ID_T   id;    /* CS id */
+	PCS_NET_ADDR_T  addr;  /* CS addr */
+	int             error; /* pcs_errors.h */
+	void		(*complete)(struct _pcs_api_csconnreq_t *, int);
+} pcs_api_csconnreq_t;
+
+/*
+ * Useful macros
+ */
+
+#define PCS_FILE_ID_FMT       "[%08llx]"
+#define PCS_FILE_ID_ARGS(id)  (unsigned long long)(id)
+#define DENTRY_NAME_FMT       "%*.*s"
+#define DENTRY_FMT            PCS_FILE_ID_FMT "/" DENTRY_NAME_FMT
+#define DENTRY_NAME_ARGS(n)   (n).len, (n).len, (n).data
+#define DENTRY_ID_ARGS(id)    PCS_FILE_ID_ARGS((id).parent), DENTRY_NAME_ARGS((id).name)
+#define DENTRY_ARGS(de)	      DENTRY_ID_ARGS(((struct pcs_dentry_info *)(de))->id)
+
+#define DENTRY_SIZE(de)       ((de)->fileinfo.attr.size)
+#define DENTRY_CHUNK_SIZE(de) ((de)->fileinfo.sys.chunk_size)
+#define DENTRY_CHUNK_SIZE_BITS(de) ((de)->mapping.chunk_size_bits)
+
+void pcs_mapset_limit(struct pcs_map_set *maps, int limit);
+
+
+/* Inode id comparison function */
+static inline int pcs_dentry_id_cmp(struct pcs_dentry_id const *a, struct pcs_dentry_id const *b)
+{
+	int res;
+	res = memcmp(&a->parent, &b->parent, sizeof(a->parent));
+	if (res)
+		return res;
+	res = a->name.len - b->name.len;
+	if (res)
+		return res;
+	return memcmp(a->name.data, b->name.data, a->name.len);
+}
+
+#endif  /* _PCS_CLIENT_TYPES_H_ */
diff --git a/fs/fuse/kio/pcs/pcs_cluster.c b/fs/fuse/kio/pcs/pcs_cluster.c
new file mode 100644
index 000000000000..7a9af9683e5e
--- /dev/null
+++ b/fs/fuse/kio/pcs/pcs_cluster.c
@@ -0,0 +1,332 @@
+#include <net/sock.h>
+#include <linux/net.h>
+#include <linux/kthread.h>
+#include <linux/types.h>
+#include <linux/rbtree.h>
+
+#include "pcs_types.h"
+#include "pcs_sock_io.h"
+#include "pcs_rpc.h"
+#include "pcs_sock_io.h"
+#include "pcs_req.h"
+#include "pcs_map.h"
+#include "pcs_cs.h"
+#include "pcs_cluster.h"
+#include "log.h"
+
+#include "../../fuse_i.h"
+
+static inline int is_file_inline(struct pcs_dentry_info *di)
+{
+	return di->fileinfo.attr.attrib & PCS_FATTR_INLINE;
+}
+
+
+void pcs_sreq_complete(struct pcs_int_request *sreq)
+{
+	struct pcs_int_request *ireq = sreq->completion_data.parent;
+	struct pcs_cluster_core *cluster = sreq->cc;
+
+	if (pcs_if_error(&sreq->error)) {
+		if (!pcs_if_error(&ireq->error)) {
+			/* If we have decided to abort the api request, do not redo the chunk
+			 * request even if the error is harmless. Otherwise, analyze the sreq
+			 * error and, most likely, resubmit the request.
+			 */
+			if (ireq_check_redo(sreq)) {
+				if (ireq_is_timed_out(sreq)) {
+					DTRACE("timeout while IO request on \"" DENTRY_FMT "\" last_err=%u",
+						DENTRY_ARGS(sreq->dentry), sreq->error.value);
+				}
+				if (sreq->type != PCS_IREQ_CUSTOM) {
+					map_notify_soft_error(sreq);
+
+					if (!(sreq->flags & IREQ_F_ONCE)) {
+						sreq->flags |= IREQ_F_ONCE;
+						pcs_clear_error(&sreq->error);
+						pcs_cc_submit(sreq->cc, sreq);
+						return;
+					}
+				}
+				pcs_clear_error(&sreq->error);
+				ireq_delay(sreq);
+				return;
+			}
+			pcs_copy_error(&ireq->error, &sreq->error);
+		}
+
+		if (sreq->type != PCS_IREQ_CUSTOM)
+			map_notify_iochunk_error(sreq);
+	}
+
+	if (sreq->type != PCS_IREQ_CUSTOM) {
+		if (!(sreq->flags & IREQ_F_CACHED))
+			ireq->flags &= ~IREQ_F_CACHED;
+		pcs_deaccount_ireq(sreq, &sreq->error);
+	} else if (sreq->custom.destruct)
+		sreq->custom.destruct(sreq);
+
+	if (!pcs_sreq_detach(sreq))
+		ireq_complete(ireq);
+
+	if (sreq->type == PCS_IREQ_IOCHUNK && sreq->iochunk.flow)
+		pcs_flow_put(sreq->iochunk.flow, &cluster->maps.ftab);
+
+	ireq_destroy(sreq);
+}
+
+void pcs_cc_process_ireq_chunk(struct pcs_int_request *ireq)
+{
+	struct pcs_map_entry *map;
+
+	TRACE(PCS_FILE_ID_FMT" [%llx]\n", ireq->dentry->fileinfo.attr.id,
+	      (unsigned long long)ireq->iochunk.chunk);
+
+	map = pcs_find_get_map(ireq->dentry, ireq->iochunk.chunk +
+			   ((ireq->flags & IREQ_F_MAPPED) ? 0 : ireq->iochunk.offset));
+
+	if (map_check_limit(map, ireq))
+		return;
+	if (ireq->iochunk.map)
+		pcs_map_put(ireq->iochunk.map);
+	ireq->iochunk.map = map;
+
+	map_submit(map, ireq, 0);
+}
+
+/* TODO Remove noinline in production */
+static noinline void __pcs_cc_process_ireq_rw(struct pcs_int_request *ireq)
+{
+	struct pcs_dentry_info *di = ireq->dentry;
+	u64 pos = ireq->apireq.req->pos;
+	unsigned int sz = ireq->apireq.req->size;
+	unsigned int dio_offset = 0;
+	struct pcs_flow_node *fl;
+
+	if (di->fileinfo.sys.map_type != PCS_MAP_PLAIN) {
+		BUG_ON(1);
+		return;
+	}
+
+	TRACE(DENTRY_FMT " %p op=%d at %llu [%llu]\n", DENTRY_ARGS(di), ireq, ireq->apireq.req->type,
+	      (unsigned long long)ireq->apireq.req->pos, (unsigned long long)ireq->apireq.req->size);
+
+
+	atomic_set(&ireq->iocount, 1);
+	ireq->flags |= IREQ_F_CACHED;
+
+	fl =  pcs_flow_record(&di->mapping.ftab, ireq->apireq.req->type == PCS_REQ_T_WRITE,
+			      pos, sz, &di->cluster->maps.ftab);
+
+	while (sz) {
+		struct pcs_int_request *sreq;
+		unsigned int len;
+		u64 rpos, chunk, end_pos;
+
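+		/* rpos is the file position remapped into chunk space by map_file_to_chunk();
+		 * chunk is rpos rounded down to the chunk size and end_pos is the next strip
+		 * boundary, so each sub-request stays within a single strip of one chunk.
+		 */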
+		rpos = map_file_to_chunk(pos, di->fileinfo.sys.chunk_size, di->fileinfo.sys.stripe_depth, di->fileinfo.sys.strip_width);
+
+		chunk = rpos & ~((u64)di->fileinfo.sys.chunk_size - 1);
+		end_pos = ((rpos / di->fileinfo.sys.strip_width) + 1) * (u64)di->fileinfo.sys.strip_width;
+
+		sreq = ireq_alloc(di);
+		if (!sreq) {
+			pcs_set_local_error(&ireq->error, PCS_ERR_NOMEM);
+			break;
+		}
+
+		sreq->dentry = di;
+		sreq->type = PCS_IREQ_IOCHUNK;
+		sreq->iochunk.map = NULL;
+		sreq->iochunk.flow = pcs_flow_get(fl);
+		sreq->iochunk.direction = ireq->apireq.req->type;
+		sreq->iochunk.cs_index = 0;
+		sreq->iochunk.chunk = chunk;
+		sreq->iochunk.offset = rpos % di->fileinfo.sys.chunk_size;
+		sreq->iochunk.dio_offset = dio_offset;
+		len = di->fileinfo.sys.chunk_size - sreq->iochunk.offset;
+		if (len > sz)
+			len = sz;
+		if (rpos + len > end_pos)
+			len = end_pos - rpos;
+		sreq->iochunk.size = len;
+		sreq->iochunk.csl = NULL;
+		sreq->iochunk.banned_cs.val = 0;
+		sreq->iochunk.msg.destructor = NULL;
+		sreq->iochunk.msg.rpc = NULL;
+
+		pcs_sreq_attach(sreq, ireq);
+		sreq->complete_cb = pcs_sreq_complete;
+		pcs_cc_process_ireq_chunk(sreq);
+
+		pos += len;
+		sz -= len;
+		dio_offset += len;
+	}
+	pcs_flow_put(fl, &di->cluster->maps.ftab);
+	if (atomic_dec_and_test(&ireq->iocount))
+		ireq_complete(ireq);
+}
+
+static void pcs_cc_process_ireq_ioreq(struct pcs_int_request *ireq)
+{
+
+	if (ireq->apireq.req->type == PCS_REQ_T_SYNC) {
+		map_inject_flush_req(ireq);
+		return;
+	}
+	if (ireq->apireq.req->type != PCS_REQ_T_READ &&
+	    ireq->apireq.req->type != PCS_REQ_T_WRITE) {
+		pcs_set_local_error(&ireq->error, PCS_ERR_PROTOCOL);
+		ireq_complete(ireq);
+		return;
+	}
+	__pcs_cc_process_ireq_rw(ireq);
+
+}
+
+static void ireq_process_(struct pcs_int_request *ireq)
+{
+	TRACE("enter " DENTRY_FMT " type=%u\n", DENTRY_ARGS(ireq->dentry), ireq->type);
+
+	switch (ireq->type) {
+	case PCS_IREQ_NOOP:
+		ireq_complete(ireq);
+		break;
+	case PCS_IREQ_IOCHUNK:
+		pcs_cc_process_ireq_chunk(ireq);
+		break;
+	case PCS_IREQ_API:
+		pcs_cc_process_ireq_ioreq(ireq);
+		break;
+	case PCS_IREQ_FLUSH:
+		process_flush_req(ireq);
+		break;
+	case PCS_IREQ_TRUNCATE:
+		process_ireq_truncate(ireq);
+		break;
+	case PCS_IREQ_CUSTOM:
+		ireq->custom.action(ireq);
+		break;
+	default:
+		BUG();
+		break;
+	}
+}
+
+static void ireq_notify_err(struct pcs_int_request *ireq, pcs_error_t *err)
+{
+	if (ireq->completion_data.parent)
+		ireq_notify_err(ireq->completion_data.parent, err);
+
+	else if (ireq->completion_data.priv) {
+		struct pcs_fuse_req *r = ireq->completion_data.priv;
+		r->exec.ctl.last_err = *err;
+	}
+}
+
+static void ireq_on_error_(struct pcs_int_request *ireq)
+{
+	/* Distinguish unrecoverable errors and recoverable ones.
+	 * Recoverable errors must trigger restart.
+	 */
+	ireq_notify_err(ireq, &ireq->error);
+	switch (ireq->error.value) {
+		/* This can happen if we lost the connection for a long time and the lease has
+		 * been lost. We should try to reacquire the lock. The server must reject the
+		 * reacquisition if someone took the lock after us.
+		 */
+	case PCS_ERR_LEASE_REQUIRED:
+	case PCS_ERR_LEASE_EXPIRED:
+	case PCS_ERR_INTEGRITY_FAIL: {
+		/* TODO:  tag ireq->dentry with EIO here */
+		goto fatal;
+	}
+	case PCS_ERR_CSD_LACKING:
+		goto fatal;
+	case PCS_ERR_INV_PARAMS:
+	case PCS_ERR_NOT_FOUND:
+	case PCS_ERR_NON_EMPTY_DIR:
+	case PCS_ERR_NOT_DIR:
+	case PCS_ERR_IS_DIR:
+	case PCS_ERR_NO_STORAGE:
+	case PCS_ERR_UNAVAIL:
+fatal:
+		printk(KERN_INFO "%s fatal error:%d nodeid:%llu\n", __func__,
+		       ireq->error.value, ireq->dentry->inode->nodeid);
+		ireq->flags |= IREQ_F_FATAL;
+		break;
+	case PCS_ERR_LEASE_CONFLICT:
+		WARN_ON_ONCE(1);
+		break;
+	default:
+		break;
+	}
+}
+
+static int ireq_check_redo_(struct pcs_int_request *ireq)
+{
+	pcs_error_t *err = &ireq->error;
+
+	if (ireq->flags & IREQ_F_FATAL)
+		return 0;
+
+	if (ireq->completion_data.parent &&
+	    pcs_if_error(&ireq->completion_data.parent->error) &&
+	    !ireq_check_redo(ireq->completion_data.parent))
+		return 0;
+
+	/* Fatal errors */
+	switch (err->value) {
+	case PCS_ERR_PROTOCOL:
+	case PCS_ERR_INV_PARAMS:
+	case PCS_ERR_NOT_FOUND:
+	case PCS_ERR_IS_DIR:
+	case PCS_ERR_NOT_DIR:
+		return 0;
+	}
+
+	/* Remote errors are never fatal */
+	if (err->remote)
+		return 1;
+
+	/* Local fatal errors */
+	switch (err->value) {
+	case PCS_ERR_NOMEM:
+	case PCS_ERR_LEASE_REQUIRED:
+	case PCS_ERR_LEASE_EXPIRED:
+	case PCS_ERR_INTEGRITY_FAIL:
+	case PCS_ERR_NO_STORAGE:
+		return 0;
+	}
+
+	return 1;
+}
+
+int pcs_cluster_init(struct pcs_fuse_cluster *pfc, struct workqueue_struct *wq,
+		     struct fuse_conn *fc, PCS_CLUSTER_ID_T *cl_id,
+		     PCS_NODE_ID_T *id)
+{
+	struct pcs_cluster_core_attr attr;
+
+	attr.cluster = *cl_id;
+	attr.node = *id;
+	attr.abort_timeout_ms = 0;
+
+	pfc->fc = fc;
+
+	/* core init */
+	if (pcs_cc_init(&pfc->cc, wq, NULL, &attr))
+		return -1;
+	pfc->cc.op.ireq_process	   = ireq_process_;
+	pfc->cc.op.ireq_on_error   = ireq_on_error_;
+	pfc->cc.op.ireq_check_redo = ireq_check_redo_;
+
+	return 0;
+}
+
+void pcs_cluster_fini(struct pcs_fuse_cluster *pfc)
+{
+	pcs_cc_fini(&pfc->cc);
+	kfree(pfc);
+}
diff --git a/fs/fuse/kio/pcs/pcs_cluster.h b/fs/fuse/kio/pcs/pcs_cluster.h
new file mode 100644
index 000000000000..3a8116b705df
--- /dev/null
+++ b/fs/fuse/kio/pcs/pcs_cluster.h
@@ -0,0 +1,106 @@
+#ifndef _PCS_CLUSTER_H_
+#define _PCS_CLUSTER_H_ 1
+
+#include "pcs_req.h"
+#include "../../fuse_i.h"
+struct fuse_conn;
+
+/* Try to follow the pcs/client/fused structure style */
+struct pcs_fuse_exec_ctx {
+	struct pcs_int_request	ireq;
+	struct {
+		pcs_api_iorequest_t	req;
+		struct bio_vec		*bvec;
+		unsigned		num_bvecs;
+		/* Do not bother with memory economy, keep it simple for testing purposes.
+		   TODO: IMPLEMENT a fuse_req iterator similar to the bvec one */
+		struct bio_vec inline_bvec[FUSE_MAX_PAGES_PER_REQ];
+	} io;
+	struct {
+		unsigned		retry_cnt;
+		pcs_error_t		last_err;
+	} ctl;
+};
+
+struct pcs_fuse_req {
+	struct fuse_req req;
+	struct pcs_fuse_exec_ctx exec;	/* Zero initialized context */
+};
+
+struct pcs_fuse_cluster {
+	struct pcs_cluster_core cc;
+	struct fuse_conn *fc;
+};
+
+struct pcs_fuse_work {
+	struct work_struct work;
+	pcs_error_t status;
+	void *ctx;
+	void *ctx2;
+};
+
+int pcs_cluster_init(struct pcs_fuse_cluster *c, struct workqueue_struct *,
+		     struct fuse_conn *fc, PCS_CLUSTER_ID_T *cl_id,
+		     PCS_NODE_ID_T *id);
+void pcs_cluster_fini(struct pcs_fuse_cluster *c);
+
+static inline struct pcs_fuse_req *pcs_fuse_req_from_work(struct pcs_fuse_exec_ctx *ctx)
+{
+	return container_of(ctx, struct pcs_fuse_req, exec);
+}
+
+static inline struct fuse_req *fuse_req_from_pcs(struct pcs_fuse_req *r)
+{
+	return (struct fuse_req *)r;
+}
+
+static inline struct pcs_fuse_req *pcs_req_from_fuse(struct fuse_req *req)
+{
+	return container_of(req, struct pcs_fuse_req, req);
+}
+
+static inline struct pcs_fuse_cluster *pcs_cluster_from_cc(struct pcs_cluster_core *c)
+{
+	return container_of(c, struct pcs_fuse_cluster, cc);
+}
+
+static inline struct pcs_dentry_info *pcs_inode_from_fuse(struct fuse_inode *fi)
+{
+
+	BUG_ON(!fi->private);
+
+	return (struct pcs_dentry_info *)fi->private;
+}
+
+static inline struct pcs_fuse_cluster *cl_from_req(struct pcs_fuse_req *r)
+{
+	return pcs_cluster_from_cc(r->exec.ireq.cc);
+}
+
+static inline struct pcs_cluster_core *cc_from_rpc(struct pcs_rpc_engine *eng)
+{
+	return container_of(eng, struct pcs_cluster_core, eng);
+}
+
+/* from pcs_cluster_core.h */
+struct pcs_cluster_core_attr {
+	PCS_CLUSTER_ID_T	cluster;
+	PCS_NODE_ID_T		node;
+
+	/* Timeouts */
+	int			abort_timeout_ms;
+};
+int pcs_cc_init(struct pcs_cluster_core *cc, struct workqueue_struct *wq,
+		const char *cluster_name, struct pcs_cluster_core_attr *attr);
+void pcs_cc_fini(struct pcs_cluster_core *cc);
+
+void pcs_fuse_prep_io(struct pcs_fuse_req *r, unsigned short type, off_t offset, size_t size);
+int fuse_pcs_csconn_send(struct fuse_conn *fc, struct pcs_rpc *ep, int flags);
+
+
+static inline void pcs_cc_set_abort_timeout(struct pcs_cluster_core *cc, int timeout)
+{
+	cc->cfg.def.abort_timeout = cc->cfg.curr.abort_timeout = timeout;
+}
+
+#endif /* _PCS_CLUSTER_H_ */
diff --git a/fs/fuse/kio/pcs/pcs_cluster_core.c b/fs/fuse/kio/pcs/pcs_cluster_core.c
new file mode 100644
index 000000000000..a5bdbc8ebd82
--- /dev/null
+++ b/fs/fuse/kio/pcs/pcs_cluster_core.c
@@ -0,0 +1,214 @@
+#include <net/sock.h>
+#include <linux/net.h>
+#include <linux/kthread.h>
+#include <linux/types.h>
+#include <linux/rbtree.h>
+
+#include "pcs_types.h"
+#include "pcs_sock_io.h"
+#include "pcs_rpc.h"
+#include "pcs_sock_io.h"
+#include "pcs_req.h"
+#include "pcs_map.h"
+#include "pcs_cs.h"
+#include "pcs_cluster.h"
+#include "log.h"
+
+#include "../../fuse_i.h"
+
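+/* The thresholds are derived from the hard limit: map_thresh is roughly 3/4 of it
+ * and map_dirty_thresh roughly 7/8, with map_max being the limit itself.
+ */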
+void pcs_mapset_limit(struct pcs_map_set *maps, int limit)
+{
+	maps->map_thresh = limit - limit/4;
+	maps->map_dirty_thresh = limit - limit/8;
+	maps->map_max = limit;
+}
+
+static unsigned long pcs_map_shrink_count(struct shrinker *shrinker, struct shrink_control *sc)
+{
+	struct pcs_map_set *maps = container_of(shrinker,
+					struct pcs_map_set, shrinker);
+
+	return list_lru_count_node(&maps->lru, sc->nid) +
+		list_lru_count_node(&maps->dirty_lru, sc->nid);
+}
+
+
+static int pcs_mapset_init(struct pcs_map_set *maps)
+{
+	if (list_lru_init(&maps->lru))
+		return -ENOMEM;
+
+	if (list_lru_init(&maps->dirty_lru)) {
+		list_lru_destroy(&maps->lru);
+		return -ENOMEM;
+	}
+	INIT_LIST_HEAD(&maps->dirty_queue);
+	atomic_set(&maps->count, 0);
+	atomic_set(&maps->dirty_count, 0);
+	pcs_mapset_limit(maps, PCS_MAP_LIMIT);
+	pcs_flow_table_global_init(&maps->ftab);
+
+	maps->shrinker.count_objects = pcs_map_shrink_count;
+	maps->shrinker.scan_objects  = pcs_map_shrink_scan;
+	maps->shrinker.seeks = DEFAULT_SEEKS;
+	maps->shrinker.flags = SHRINKER_NUMA_AWARE;
+	register_shrinker(&maps->shrinker);
+
+	return 0;
+}
+
+static void pcs_mapset_fini(struct pcs_map_set *maps)
+{
+	unregister_shrinker(&maps->shrinker);
+
+	BUG_ON(list_lru_count(&maps->lru));
+	BUG_ON(list_lru_count(&maps->dirty_lru));
+	BUG_ON(!list_empty(&maps->dirty_queue));
+
+	list_lru_destroy(&maps->lru);
+	list_lru_destroy(&maps->dirty_lru);
+}
+
+static void init_def_mss(struct pcs_cluster_core *cc)
+{
+	cc->cfg.def.wmss = PCS_DFLT_MSS_WRITE;
+	cc->cfg.def.rmss = PCS_DFLT_MSS_READ;
+	cc->cfg.def.lmss = PCS_DFLT_MSS_LOCAL;
+}
+
+
+static void cc_workqueue_handler(struct work_struct *w)
+{
+	LIST_HEAD(queue);
+	struct pcs_cluster_core *cc = (struct pcs_cluster_core *)
+		container_of(w, struct pcs_cluster_core, main_job);
+
+	spin_lock_irq(&cc->lock);
+	list_splice_tail_init(&cc->work_queue, &queue);
+	spin_unlock_irq(&cc->lock);
+
+	while (!list_empty(&queue)) {
+		struct pcs_int_request *ireq = list_first_entry(&queue, struct pcs_int_request, list);
+
+		list_del_init(&ireq->list);
+		TRACE("process ireq:%p" DENTRY_FMT " type=%u\n", ireq, DENTRY_ARGS(ireq->dentry), ireq->type);
+		cc->op.ireq_process(ireq);
+	}
+}
+
+static void cc_completion_handler(struct work_struct *w)
+{
+	struct pcs_cluster_core *cc = (struct pcs_cluster_core *)
+		container_of(w, struct pcs_cluster_core, completion_job);
+	LIST_HEAD(queue);
+
+	spin_lock_irq(&cc->lock);
+	list_splice_tail_init(&cc->completion_queue, &queue);
+	spin_unlock_irq(&cc->lock);
+
+	while (!list_empty(&queue)) {
+		struct pcs_int_request *ireq = list_first_entry(&queue, struct pcs_int_request, list);
+
+		list_del_init(&ireq->list);
+		TRACE("complete " DENTRY_FMT " type=%u\n", DENTRY_ARGS(ireq->dentry), ireq->type);
+		ireq_complete(ireq);
+	}
+}
+
+int pcs_cc_init(struct pcs_cluster_core *cc, struct workqueue_struct *wq,
+		const char *cluster_name, struct pcs_cluster_core_attr *attr)
+{
+	int err;
+	/* Ignore this for now, we have cluster_id and node_id */
+	/* if (cluster_name == NULL) */
+	/*	   return -1; */
+
+	spin_lock_init(&cc->lock);
+	INIT_LIST_HEAD(&cc->work_queue);
+	INIT_LIST_HEAD(&cc->completion_queue); /* completion queue only for sanity */
+	INIT_WORK(&cc->main_job, cc_workqueue_handler);
+	INIT_WORK(&cc->completion_job, cc_completion_handler);
+	cc->wq = wq;
+
+	pcs_csset_init(&cc->css);
+
+	err = pcs_mapset_init(&cc->maps);
+	if (err)
+		return err;
+
+	pcs_rpc_engine_init(&cc->eng, PCS_NODE_ROLE_TOOL);
+	pcs_rpc_init_gc(&cc->eng, 1024);
+	if (attr) {
+		pcs_rpc_set_cluster_id(&cc->eng, &attr->cluster);
+		pcs_rpc_set_host_id(&cc->eng, &attr->node);
+		if (attr->abort_timeout_ms)
+			pcs_cc_set_abort_timeout(cc, attr->abort_timeout_ms);
+	}
+	/* TODO: resurrect ratelimit and rng initialization
+	 * pcs_ratelimit_init(cc, &cc->rlim);
+	 * pcs_srandomdev(&cc->rng);
+	 */
+
+	memset(&cc->cfg,   0, sizeof(cc->cfg));
+	memset(&cc->op,	   0, sizeof(cc->op));
+
+	init_def_mss(cc);
+	cc->cfg.def.kernel_cache_en = 1;
+	cc->cfg.curr = cc->cfg.def;
+	cc->cfg.sn = PCS_CONFIG_SEQ_ANY;
+
+	cc->io_locality = 0;
+	cc->io_tweaks = 0;
+	cc->netlat_cutoff = PCS_MAX_NETWORK_LATENCY*1000;
+	cc->iolat_cutoff = PCS_MAX_IO_LATENCY*1000;
+	cc->abort_callback = NULL;
+
+	TRACE("Ok cc->{ cl_id:" CLUSTER_ID_FMT ", node_id:" NODE_FMT ", f:%x}\n",
+	      CLUSTER_ID_ARGS(cc->eng.cluster_id), NODE_ARGS(cc->eng.local_id),
+	      cc->eng.flags);
+
+	return 0;
+}
+
+void pcs_cc_fini(struct pcs_cluster_core *cc)
+{
+	pcs_csset_fini(&cc->css);
+	pcs_mapset_fini(&cc->maps);
+	pcs_rpc_engine_fini(&cc->eng);
+
+	BUG_ON(!list_empty(&cc->completion_queue));
+	BUG_ON(!list_empty(&cc->work_queue));
+	pcs_flow_table_global_fini(&cc->maps.ftab);
+}
+
+void pcs_cc_submit(struct pcs_cluster_core *cc, struct pcs_int_request *ireq)
+{
+	int was_idle = 0;
+	unsigned long flags;
+
+	spin_lock_irqsave(&cc->lock, flags);
+	was_idle = list_empty(&cc->work_queue);
+	list_add_tail(&ireq->list, &cc->work_queue);
+	spin_unlock_irqrestore(&cc->lock, flags);
+
+	if (was_idle)
+		queue_work(cc->wq, &cc->main_job);
+}
+
+/* move request queue "q" back to main work_queue, it will be processed from the very beginning */
+void pcs_cc_requeue(struct pcs_cluster_core *cc, struct list_head *q)
+{
+	unsigned long flags;
+	int was_idle = 0;
+
+	if (list_empty(q))
+		return;
+
+	spin_lock_irqsave(&cc->lock, flags);
+	was_idle = list_empty(&cc->work_queue);
+	list_splice_tail_init(q, &cc->work_queue);
+	spin_unlock_irqrestore(&cc->lock, flags);
+
+	if (was_idle)
+		queue_work(cc->wq, &cc->main_job);
+}
diff --git a/fs/fuse/kio/pcs/pcs_cs.c b/fs/fuse/kio/pcs/pcs_cs.c
new file mode 100644
index 000000000000..0f7463e8f13a
--- /dev/null
+++ b/fs/fuse/kio/pcs/pcs_cs.c
@@ -0,0 +1,1067 @@
+#include <net/sock.h>
+#include <linux/net.h>
+#include <linux/kthread.h>
+#include <linux/types.h>
+#include <linux/rbtree.h>
+
+#include "pcs_types.h"
+#include "pcs_sock_io.h"
+#include "pcs_rpc.h"
+#include "pcs_sock_io.h"
+#include "pcs_req.h"
+#include "pcs_map.h"
+#include "pcs_cs.h"
+#include "pcs_cs_prot.h"
+#include "pcs_cluster.h"
+#include "pcs_ioctl.h"
+#include "log.h"
+
+/* Lock order: cs->lock -> css->lock (lru, hash, bl_list) */
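+/* This ordering can be seen e.g. in pcs_cs_find_create() and pcs_cs_isolate(), which
+ * take cs->lock first and only then the set-wide css->lock.
+ */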
+
+
+struct pcs_rpc_params cn_rpc_params = {
+	.alloc_hdr_size		= sizeof(struct pcs_rpc_hdr),
+	.max_msg_size		= PCS_CS_MSG_MAX_SIZE,
+	.holddown_timeout	= HZ,
+	.connect_timeout	= 5*HZ,
+	.response_timeout	= 30*HZ,
+	.max_conn_retry		= 3,
+	.flags			= 0,
+};
+
+static void cs_aborting(struct pcs_rpc *ep, int error);
+static struct pcs_msg *cs_get_hdr(struct pcs_rpc *ep, struct pcs_rpc_hdr *h);
+static int cs_input(struct pcs_rpc *ep, struct pcs_msg *msg);
+static void cs_keep_waiting(struct pcs_rpc *ep, struct pcs_msg *req, struct pcs_msg *msg);
+static void cs_connect(struct pcs_rpc *ep);
+static void pcs_cs_isolate(struct pcs_cs *cs, struct list_head *dispose);
+static void pcs_cs_destroy(struct pcs_cs *cs);
+
+struct pcs_rpc_ops cn_rpc_ops = {
+	.demux_request		= cs_input,
+	.get_hdr		= cs_get_hdr,
+	.state_change		= cs_aborting,
+	.keep_waiting		= cs_keep_waiting,
+	.connect		= cs_connect,
+};
+
+struct pcs_cs *pcs_cs_alloc(struct pcs_cs_set *css,
+			     struct pcs_cluster_core *cc)
+{
+	struct pcs_cs *cs;
+
+	cs = kzalloc(sizeof(struct pcs_cs), GFP_NOIO);
+	if (cs == NULL)
+		return NULL;
+
+	INIT_HLIST_NODE(&cs->hlist);
+	INIT_LIST_HEAD(&cs->lru_link);
+	spin_lock_init(&cs->lock);
+	cs->css = css;
+	cs->in_flight = 0;
+	cs->cwnd = PCS_CS_INIT_CWND;
+	cs->eff_cwnd = PCS_CS_INIT_CWND;
+	cs->cwr_state = 0;
+	atomic_set(&cs->latency_avg, 0);
+	cs->net_latency_avg = 0;
+	cs->last_latency = 0;
+	cs->latency_stamp = 0;
+	cs->net_latency_stamp = 0;
+	cs->idle_stamp = 0;
+	cs->in_flight_hwm = 0;
+	cs->in_flight_hwm_stamp = 0;
+	pcs_cs_init_cong_queue(cs);
+	pcs_cs_init_active_list(cs);
+
+	cs->io_prio = -1;
+	cs->mds_flags = 0;
+	cs->io_prio_stamp = 0;
+
+	INIT_LIST_HEAD(&cs->flow_lru);
+	cs->nflows = 0;
+
+	cs->state = 0;
+	cs->is_probing = 0;
+	cs->is_dead = 0;
+	INIT_LIST_HEAD(&cs->bl_link);
+
+	cs->addr_serno = 0;
+
+	cs->rpc = pcs_rpc_create(&cc->eng, &cn_rpc_params, &cn_rpc_ops);
+	if (cs->rpc == NULL) {
+		kfree(cs);
+		return NULL;
+	}
+	cs->rpc->private = cs;
+	cs->nmaps = 0;
+	INIT_LIST_HEAD(&cs->map_list);
+	memset(&cs->stat, 0, sizeof(cs->stat));
+	return cs;
+}
+
+unsigned int pcs_cs_hash(PCS_NODE_ID_T *id)
+{
+	return *(unsigned int *)id % PCS_CS_HASH_SIZE;
+}
+
+static struct pcs_cs *
+__lookup_cs(struct pcs_cs_set *csset, PCS_NODE_ID_T *id)
+{
+	struct pcs_cs *cs;
+	hlist_for_each_entry_rcu(cs, &csset->ht[pcs_cs_hash(id)], hlist) {
+		if (memcmp(&cs->id, id, sizeof(cs->id)) == 0)
+			return cs;
+	}
+	return NULL;
+}
+
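+/* Look the CS up under RCU and return it with cs->lock held; retry if the entry
+ * raced with pcs_cs_isolate() and is already marked dead.
+ */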
+static struct pcs_cs *
+lookup_and_lock_cs(struct pcs_cs_set *csset, PCS_NODE_ID_T *id)
+{
+	struct pcs_cs *cs;
+retry:
+	rcu_read_lock();
+	cs = __lookup_cs(csset, id);
+	if (!cs) {
+		rcu_read_unlock();
+		return NULL;
+	}
+	spin_lock(&cs->lock);
+	rcu_read_unlock();
+	if (cs->is_dead) {
+		spin_unlock(&cs->lock);
+		goto retry;
+	}
+	return cs;
+}
+
+static void add_cs(struct pcs_cs_set *csset, struct pcs_cs *cs)
+{
+	unsigned int hash = pcs_cs_hash(&cs->id);
+
+	assert_spin_locked(&csset->lock);
+
+	list_add_tail(&cs->lru_link, &csset->lru);
+	csset->ncs++;
+	hlist_add_head_rcu(&cs->hlist, &csset->ht[hash]);
+}
+
+static inline int netaddr_cmp(PCS_NET_ADDR_T const *addr1, PCS_NET_ADDR_T const *addr2, int ignore_port)
+{
+	unsigned int d;
+	size_t sz = 0;
+
+	d = addr1->type - addr2->type;
+	if (d)
+		return d;
+	d = addr1->port - addr2->port;
+	if (!ignore_port && d)
+		return d;
+
+	switch (addr1->type) {
+	case PCS_ADDRTYPE_IP:
+		sz = sizeof(struct in_addr);
+		break;
+	case PCS_ADDRTYPE_IP6:
+		sz = sizeof(struct in6_addr);
+		break;
+	default:
+		BUG();
+	}
+
+	return memcmp(addr1->address, addr2->address, sz);
+}
+
+int pcs_netaddr_cmp(PCS_NET_ADDR_T const *addr1, PCS_NET_ADDR_T const *addr2)
+{
+	return netaddr_cmp(addr1, addr2, 0);
+}
+
+/* Return locked cs */
+struct pcs_cs *pcs_cs_find_create(struct pcs_cs_set *csset, PCS_NODE_ID_T *id, PCS_NET_ADDR_T *addr, int flags)
+{
+	struct pcs_cs *cs;
+
+again:
+	cs = lookup_and_lock_cs(csset, id);
+	if (cs) {
+		/* If rpc is connected, leave it connected until failure.
+		 * After current connect fails, reconnect will be done to new address
+		 */
+		if (addr) {
+			if (pcs_netaddr_cmp(&cs->addr, addr)) {
+				cs->addr = *addr;
+				cs->addr_serno++;
+				if (!(flags & CS_FL_INACTIVE))
+					pcs_map_notify_addr_change(cs);
+				TRACE("Port change CS" NODE_FMT " seq=%d", NODE_ARGS(*id), cs->addr_serno);
+				pcs_rpc_set_address(cs->rpc, addr);
+
+			}
+		}
+		/* TODO: (flags & PCS_RPC_F_LOCAL) should be checker here */
+		return cs;
+	}
+	BUG_ON(addr == NULL);
+
+	cs = pcs_cs_alloc(csset, cc_from_csset(csset));
+	if (!cs)
+		return NULL;
+
+	cs->id = *id;
+
+	cs->addr = *addr;
+	cs->addr_serno = 1;
+
+	pcs_rpc_set_peer_id(cs->rpc, id, PCS_NODE_ROLE_CS);
+	pcs_rpc_set_address(cs->rpc, addr);
+
+
+	/* TODO: Init PCS_RPC_F_LOCAL if available here */
+
+	spin_lock(&cs->lock);
+	spin_lock(&csset->lock);
+	if (__lookup_cs(csset, id)) {
+		spin_unlock(&csset->lock);
+		cs->is_dead = 1;
+		spin_unlock(&cs->lock);
+		pcs_cs_destroy(cs);
+		goto again;
+	}
+	add_cs(csset, cs);
+	spin_unlock(&csset->lock);
+	return cs;
+}
+
+static void (*io_times_logger_cb)(struct pcs_int_request *ireq, struct pcs_msg *resp, u32 max_iolat, void *ctx) = NULL;
+static void *io_times_logger_ctx = NULL;
+
+void cs_set_io_times_logger(void (*logger)(struct pcs_int_request *ireq, struct pcs_msg *resp, u32 max_iolat, void *ctx), void *ctx)
+{
+	io_times_logger_cb = logger;
+	io_times_logger_ctx = ctx;
+}
+
+
+void pcs_cs_update_stat(struct pcs_cs *cs, u32 iolat, u32 netlat, int op_type)
+{
+	pcs_perfcounter_stat_update(&cs->stat.iolat, iolat);
+	pcs_perfcounter_stat_update(&cs->stat.netlat, netlat);
+	switch (op_type) {
+	case PCS_CS_WRITE_SYNC_RESP:
+	case PCS_CS_WRITE_RESP:
+		cs->stat.write_ops_rate.total++;
+		break;
+	case PCS_CS_READ_RESP:
+		cs->stat.read_ops_rate.total++;
+		break;
+	case PCS_CS_SYNC_RESP:
+		cs->stat.sync_ops_rate.total++;
+		break;
+	}
+}
+
+static void cs_response_done(struct pcs_msg *msg)
+{
+	struct pcs_int_request *ireq = ireq_from_msg(msg);
+
+	if (!pcs_if_error(&msg->error)) {
+		struct pcs_cs_iohdr *h = (struct pcs_cs_iohdr *)msg_inline_head(msg->response);
+
+		if (h->sync.misc & PCS_CS_IO_CACHED)
+			ireq->flags |= IREQ_F_CACHED;
+
+		pcs_map_verify_sync_state(ireq->dentry, ireq, msg);
+	} else {
+		TRACE(XID_FMT " IO error %d %lu : %llu:%u+%u\n", XID_ARGS(ireq->iochunk.hbuf.hdr.xid), msg->error.value, msg->error.remote ? (unsigned long)msg->error.offender.val : 0UL,
+		      (unsigned long long)ireq->iochunk.chunk, (unsigned)ireq->iochunk.offset, ireq->iochunk.size);
+	}
+
+	pcs_copy_error_cond(&ireq->error, &msg->error);
+	if (msg->rpc) {
+		pcs_rpc_put(msg->rpc);
+		msg->rpc = NULL;
+	}
+	ireq_complete(ireq);
+}
+
+static void cs_get_read_response_iter(struct pcs_msg *msg, int offset, struct iov_iter *it)
+{
+	if (offset < sizeof(struct pcs_cs_iohdr)) {
+		iov_iter_init_plain(it, msg->_inline_buffer,
+				  sizeof(struct pcs_cs_iohdr), 0);
+		iov_iter_advance(it, offset);
+		TRACE("return msg:%p->size:%d off:%d it_len:%ld\n\n", msg, msg->size, offset, iov_iter_count(it));
+		return;
+	} else {
+		struct pcs_msg *req = msg->private;
+		struct pcs_int_request *ireq = req->private2;
+		struct pcs_int_request *parent = ireq->completion_data.parent;
+
+		if (parent->type == PCS_IREQ_API) {
+			pcs_api_iorequest_t *ar = parent->apireq.req;
+
+			/* Read directly to memory given by user */
+			BUG_ON(ireq->iochunk.direction != PCS_REQ_T_READ);
+
+			offset -= (unsigned int)sizeof(struct pcs_cs_iohdr);
+			ar->get_iter(ar->datasource, ireq->iochunk.dio_offset, it);
+			iov_iter_truncate(it, ireq->iochunk.size);
+			iov_iter_advance(it, offset);
+
+			TRACE("return msg:%p->size:%d off:%d it_len:%ld\n\n", msg, msg->size, offset, iov_iter_count(it));
+			return;
+		} else
+			BUG();
+	}
+}
+
+static void cs_connect(struct pcs_rpc *ep)
+{
+	struct pcs_cluster_core *cc = cc_from_rpc(ep->eng);
+	struct pcs_fuse_cluster *pfc = pcs_cluster_from_cc(cc);
+
+	ep->state = PCS_RPC_CONNECT;
+	if (fuse_pcs_csconn_send(pfc->fc, ep, PCS_IOC_CS_OPEN))
+		pcs_rpc_reset(ep);
+}
+
+static struct pcs_msg *cs_get_hdr(struct pcs_rpc *ep, struct pcs_rpc_hdr *h)
+{
+	struct pcs_msg *msg, *resp;
+	struct pcs_rpc_hdr *req_h;
+
+	if (!RPC_IS_RESPONSE(h->type))
+		return NULL;
+
+	if (h->type != PCS_CS_READ_RESP)
+		return NULL;
+
+	/* The goal is to avoid allocating a new msg and to reuse the one inlined in the ireq */
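+	/* Read responses are special: only a header-sized input msg is allocated here, and
+	 * the payload is fed straight into the caller's pages by cs_get_read_response_iter().
+	 */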
+
+	msg = pcs_rpc_lookup_xid(ep, &h->xid);
+	if (msg == NULL)
+		return NULL;
+
+	req_h = (struct pcs_rpc_hdr *)msg_inline_head(msg);
+	if (req_h->type != PCS_CS_READ_REQ)
+		return NULL;
+
+	resp = pcs_rpc_alloc_input_msg(ep, sizeof(struct pcs_cs_iohdr));
+	if (!resp)
+		return NULL;
+
+	memcpy(resp->_inline_buffer, h, sizeof(struct pcs_rpc_hdr));
+	resp->size = h->len;
+	resp->private = msg;
+	resp->get_iter = cs_get_read_response_iter;
+	resp->done = rpc_work_input;
+	pcs_msg_del_calendar(msg);
+
+	return resp;
+}
+
+static void cs_get_data(struct pcs_msg *msg, int offset, struct iov_iter *it)
+{
+	struct pcs_int_request *ireq = ireq_from_msg(msg);
+
+	if (offset < sizeof(struct pcs_cs_iohdr)) {
+		iov_iter_init_plain(it, (char *)&ireq->iochunk.hbuf,
+				  sizeof(struct pcs_cs_iohdr), 0);
+		iov_iter_advance(it, offset);
+		TRACE("return msg:%p->size:%d off:%d it_len:%ld\n\n", msg, msg->size, offset, iov_iter_count(it));
+
+		return;
+	} else {
+		struct pcs_int_request *parent = ireq->completion_data.parent;
+		if (parent->type == PCS_IREQ_API) {
+			pcs_api_iorequest_t *ar = parent->apireq.req;
+
+			BUG_ON(ireq->iochunk.direction != PCS_REQ_T_WRITE);
+
+			offset -= (unsigned int)sizeof(struct pcs_cs_iohdr);
+			ar->get_iter(ar->datasource, ireq->iochunk.dio_offset, it);
+			iov_iter_truncate(it, ireq->iochunk.size);
+			iov_iter_advance(it, offset);
+
+			TRACE("return msg:%p->size:%d off:%d it_len:%ld\n\n", msg, msg->size, offset, iov_iter_count(it));
+			return;
+		} else
+			BUG();
+	}
+}
+
+static void cs_sent(struct pcs_msg *msg)
+{
+	msg->done = cs_response_done;
+	if (pcs_if_error(&msg->error)) {
+		msg->done(msg);
+		return;
+	}
+	pcs_rpc_sent(msg);
+}
+
+void pcs_cs_submit(struct pcs_cs *cs, struct pcs_int_request *ireq)
+{
+	struct pcs_msg *msg = &ireq->iochunk.msg;
+	struct pcs_cs_iohdr *ioh;
+	struct pcs_cs_list *csl = ireq->iochunk.csl;
+
+	msg->private = cs;
+
+	BUG_ON(msg->rpc);
+	msg->private2 = ireq;
+
+	ioh = &ireq->iochunk.hbuf;
+	ioh->hdr.len = sizeof(struct pcs_cs_iohdr) +
+		(ireq->iochunk.direction ? ireq->iochunk.size : 0);
+	ioh->hdr.type = ireq->iochunk.direction ? PCS_CS_WRITE_REQ : PCS_CS_READ_REQ;
+	pcs_rpc_get_new_xid(&cc_from_cs(cs)->eng, &ioh->hdr.xid);
+	ioh->offset = ireq->iochunk.offset;
+	ioh->size = ireq->iochunk.size;
+	ioh->iocontext = (u32)ireq->dentry->fileinfo.attr.id;
+	ioh->_reserved = 0;
+	memset(&ioh->sync, 0, sizeof(ioh->sync));
+
+	if (ireq->flags & IREQ_F_SEQ)
+		ioh->sync.misc = PCS_CS_IO_SEQ;
+
+	msg->size = ioh->hdr.len;
+	msg->rpc = NULL;
+	pcs_clear_error(&msg->error);
+	msg->done = cs_sent;
+	msg->get_iter = cs_get_data;
+
+	/* TODO
+	 * Theoretically, at this moment the map may already have become dead;
+	 * what should be done then?
+	 * This may happen only in case of an aio/dio vs truncate race
+	 */
+	BUG_ON(ireq->iochunk.map->state & PCS_MAP_DEAD);
+	ioh->map_version = csl->version;
+	if (ireq->iochunk.direction)
+		msg->timeout = csl->write_timeout;
+	else
+		msg->timeout = csl->read_timeout;
+	ireq->ts_sent = jiffies;
+	ireq->wait_origin.val = 0;
+
+
+	DTRACE(XID_FMT " About to send msg:%p, ireq:%p :  %llu:%u+%u\n", XID_ARGS(ireq->iochunk.hbuf.hdr.xid),
+	      msg, ireq,
+	      (unsigned long long)ireq->iochunk.chunk,
+	      (unsigned)ireq->iochunk.offset,
+	      ireq->iochunk.size);
+
+/* TODO re-enable ratelimiting */
+#if 0
+	if (cc_from_cs(cs)->rlim.rate)
+		pcs_submit_ratelimited(&cc_from_cs(cs)->rlim, ireq);
+	else
+		pcs_rpc_send(cs->rpc, msg);
+#endif
+	pcs_rpc_queue(cs->rpc, msg);
+}
+
+static void handle_congestion(struct pcs_cs *cs, struct pcs_rpc_hdr *h)
+{
+	struct pcs_cs *who;
+
+	TRACE("Received congestion notification from CS" NODE_FMT, NODE_ARGS(h->xid.origin));
+
+	if (cs->id.val == h->xid.origin.val) {
+		who = cs;
+		spin_lock(&who->lock);
+	} else
+		who = lookup_and_lock_cs(cs->css, &h->xid.origin);
+
+	if (who && !who->cwr_state) {
+		/* Unless node is already reducing congestion window, shrink it
+		 * to half of min(in_flight, cwnd) and enter congestion reduction state,
+		 * where we ignore further congestion notifications until window is reduced
+		 */
+		if (who->in_flight < who->cwnd)
+			who->cwnd = who->in_flight;
+		who->cwnd /= 2;
+		if (who->cwnd == 0)
+			who->cwnd = 1;
+		if (who->eff_cwnd > who->cwnd)
+			who->eff_cwnd = who->cwnd;
+		if (who->in_flight >= who->eff_cwnd)
+			who->cwr_state = 1;
+	}
+	spin_unlock(&who->lock);
+}
+
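+/* Count the replicas in the CS list, other than cs_id, that are neither failed nor
+ * blacklisted and thus could serve a rerouted read.
+ */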
+static int may_reroute(struct pcs_cs_list *csl, PCS_NODE_ID_T cs_id)
+{
+	int i;
+	int legit = 0;
+
+	for (i = csl->nsrv - 1; i >= 0; i--) {
+		struct pcs_cs *cs = csl->cs[i].cslink.cs;
+
+		if (cs->id.val == cs_id.val)
+			continue;
+		if (test_bit(CS_SF_FAILED, &cs->state))
+			continue;
+		if (cs_is_blacklisted(cs))
+			continue;
+		if (test_bit(i, &csl->blacklist) &&
+		    jiffies < READ_ONCE(csl->blacklist_expires))
+			continue;
+		legit++;
+	}
+	return legit;
+}
+
+static void cs_keep_waiting(struct pcs_rpc *ep, struct pcs_msg *req, struct pcs_msg *msg)
+{
+	struct pcs_rpc_hdr *h = (struct pcs_rpc_hdr *)msg_inline_head(msg);
+	struct pcs_cs *cs = ep->private;
+	struct pcs_cs *who;
+
+	/* Some CS reported it cannot complete local IO in time, close congestion window */
+	who = lookup_and_lock_cs(cs->css, &h->xid.origin);
+	if (who) {
+		struct pcs_int_request *ireq = req->private2;
+		abs_time_t lat = 0;
+
+		if (ireq) {
+			lat = ((jiffies - ireq->ts_sent) * 1000) / HZ;
+			ireq->wait_origin = h->xid.origin;
+		}
+
+		if (!who->cwr_state) {
+			DTRACE("Congestion window on CS" NODE_FMT " reducing %d/%d/%d", NODE_ARGS(h->xid.origin),
+			       who->in_flight, who->eff_cwnd, who->cwnd);
+			if (who->in_flight < who->cwnd)
+				who->cwnd = who->in_flight;
+			who->cwnd /= 2;
+			if (who->cwnd == 0)
+				who->cwnd = 1;
+			if (who->eff_cwnd > who->cwnd)
+				who->eff_cwnd = who->cwnd;
+			if (who->in_flight >= who->eff_cwnd)
+				who->cwr_state = 1;
+		}
+		cs_update_io_latency(who, lat);
+		if (ireq && ireq->type == PCS_IREQ_IOCHUNK && ireq->iochunk.direction == 0) {
+			/* Force CS reselection */
+			pcs_map_force_reselect(who);
+
+			/* If request still has no banned CS and delayed for too long,
+			 * cancel and reroute
+			 */
+			if (ireq->iochunk.banned_cs.val == 0 && lat >= PCS_MAX_READ_IO_LATENCY*1000
+			    && may_reroute(ireq->iochunk.csl, h->xid.origin)) {
+				TRACE("Canceling read on CS" NODE_FMT, NODE_ARGS(h->xid.origin));
+				ireq->iochunk.banned_cs = h->xid.origin;
+				spin_unlock(&who->lock);
+				pcs_rpc_cancel_request(req);
+				return;
+			}
+		}
+
+		spin_unlock(&who->lock);
+	}
+
+}
+
+static int cs_input(struct pcs_rpc *ep, struct pcs_msg *msg)
+{
+	struct pcs_rpc_hdr *h = (struct pcs_rpc_hdr *)msg->_inline_buffer;
+
+	switch (h->type) {
+	case PCS_CS_CONG_NOTIFY:
+		handle_congestion(ep->private, h);
+		msg->done(msg);
+		return 0;
+	default:
+		pcs_log(0, "Unsupported message type %u\n", h->type);
+		return PCS_ERR_PROTOCOL;
+	}
+}
+
+void pcs_cs_notify_error(struct pcs_cluster_core *cc, pcs_error_t *err)
+{
+	struct list_head queue;
+	struct pcs_cs *cs;
+
+	INIT_LIST_HEAD(&queue);
+
+	/* Filter out errors specific for particular chunk.
+	 * Probably, we should handle only timeouts here.
+	 */
+	switch (err->value) {
+	case PCS_ERR_CSD_STALE_MAP:
+	case PCS_ERR_CSD_REPLICATING:
+	case PCS_ERR_PROTOCOL:
+	case PCS_ERR_CSD_RO_MAP:
+		return;
+	}
+
+	cs = lookup_and_lock_cs(&cc->css, &err->offender);
+	if (cs == NULL)
+		return;
+
+	list_splice_tail_init(&cs->cong_queue, &queue);
+	clear_bit(CS_SF_CONGESTED, &cs->state);
+	cs->cong_queue_len = 0;
+	cs_blacklist(cs, err->value, "notify error");
+	spin_unlock(&cs->lock);
+
+	pcs_cc_requeue(cc, &queue);
+
+}
+
+static void pcs_cs_isolate(struct pcs_cs *cs, struct list_head *dispose)
+{
+	assert_spin_locked(&cs->lock);
+
+	list_splice_tail_init(&cs->active_list, dispose);
+	list_splice_tail_init(&cs->cong_queue, dispose);
+	cs->active_list_len = 0;
+	cs->cong_queue_len = 0;
+	clear_bit(CS_SF_CONGESTED, &cs->state);
+
+	cs->is_dead = 1;
+	spin_lock(&cs->css->lock);
+	if (!hlist_unhashed(&cs->hlist))
+		hlist_del_rcu(&cs->hlist);
+	list_del(&cs->lru_link);
+	list_del(&cs->bl_link);
+	cs->css->ncs--;
+
+	if (list_empty(&cs->css->bl_list))
+		cancel_delayed_work(&cs->css->bl_work);
+	spin_unlock(&cs->css->lock);
+
+	while (!list_empty(&cs->map_list)) {
+		struct pcs_cs_link *csl = list_first_entry(&cs->map_list,
+							       struct pcs_cs_link,
+							       link);
+		csl->cs = NULL;
+		cs->nmaps--;
+		list_del_init(&csl->link);
+	}
+
+
+	BUG_ON(cs->nmaps);
+
+	if (!list_empty(&cs->flow_lru))
+		pcs_flow_cs_unbind_all(cs);
+	BUG_ON(cs->nflows);
+}
+
+static void cs_free_callback(struct rcu_head *head)
+{
+	struct pcs_cs *cs = container_of(head, struct pcs_cs, rcu);
+
+	kfree(cs);
+}
+
+static void pcs_cs_destroy(struct pcs_cs *cs)
+{
+	BUG_ON(!list_empty(&cs->active_list));
+	BUG_ON(!list_empty(&cs->cong_queue));
+	BUG_ON(!cs->is_dead);
+
+	if (cs->rpc) {
+		pcs_rpc_close(cs->rpc);
+		cs->rpc = NULL;
+	}
+	call_rcu(&cs->rcu, cs_free_callback);
+}
+
+
+static void cs_aborting(struct pcs_rpc *ep, int error)
+{
+	pcs_rpc_reset(ep);
+}
+
+/* Latency is a difficult value to use for any decisions.
+ * It is sampled at random; we do not know what is happening while
+ * we have no samples. For now we do the following: an arriving sample
+ * is latched and used as if latency stayed at this value until the next sample.
+ * If we have no samples, the latency value slowly decays. This prepared value
+ * is used to take the EWMA.
+ */
+
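+/* For instance, if no sample arrived for k whole decay periods, the stored value is
+ * shifted right by k (halved k times); after more than 30 periods it is treated as zero.
+ */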
+static unsigned int lat_decay(unsigned int lat, unsigned decay_period,
+				 abs_time_t now, abs_time_t stamp)
+{
+	unsigned int interval;
+
+	if (now < stamp + decay_period)
+		return lat;
+
+	if (stamp  == 0 || now > stamp + 30 * decay_period)
+		return 0;
+
+	interval = (now - stamp) / decay_period;
+
+	return lat >> interval;
+}
+
+unsigned int __cs_get_avg_latency(struct pcs_cs *cs, abs_time_t now)
+{
+	return lat_decay(atomic_read(&cs->latency_avg), CS_LAT_DECAY_INTERVAL,
+			 now, READ_ONCE(cs->latency_stamp));
+}
+
+unsigned int cs_get_avg_latency(struct pcs_cs *cs)
+{
+	return __cs_get_avg_latency(cs, jiffies);
+}
+unsigned int __cs_get_avg_net_latency(struct pcs_cs *cs, abs_time_t now)
+{
+	return lat_decay(READ_ONCE(cs->net_latency_avg), CS_LAT_DECAY_INTERVAL,
+			 now, READ_ONCE(cs->net_latency_stamp));
+
+}
+
+unsigned int cs_get_avg_net_latency(struct pcs_cs *cs)
+{
+	return __cs_get_avg_net_latency(cs, jiffies);
+}
+
+void cs_account_latency(struct pcs_cs *cs, unsigned int cost)
+{
+	unsigned lat;
+	abs_time_t now = jiffies;
+
+	lat = __cs_get_avg_latency(cs, now);
+
+	atomic_add(cost, &cs->latency_avg);
+	WRITE_ONCE(cs->latency_stamp, now);
+}
+
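+/* EWMA update used below: avg += (sample - avg) >> CS_LAT_EWMA_LOG, i.e. the average
+ * moves a 1/2^CS_LAT_EWMA_LOG fraction of the way toward each new sample.
+ */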
+void cs_update_io_latency(struct pcs_cs *cs, u32 lat)
+{
+	abs_time_t now = jiffies;
+	u32 cur_latency;
+
+	cur_latency = __cs_get_avg_latency(cs, jiffies);
+
+	atomic_add((int)(lat - cur_latency) >> CS_LAT_EWMA_LOG, &cs->latency_avg);
+	WRITE_ONCE(cs->last_latency, lat);
+	WRITE_ONCE(cs->latency_stamp, now);
+}
+
+
+void cs_update_net_latency(struct pcs_cs *cs, u32 lat)
+{
+	abs_time_t now = jiffies;
+	struct pcs_rpc *ep = cs->rpc;
+	u32 cur_latency;
+
+	cur_latency = __cs_get_avg_net_latency(cs, now);
+
+	cur_latency += ((int)(lat - cur_latency) >> CS_LAT_EWMA_LOG);
+
+	WRITE_ONCE(cs->net_latency_avg, cur_latency);
+	WRITE_ONCE(cs->net_latency_stamp, now);
+
+	if (lat < READ_ONCE(ep->netlat_min))
+		WRITE_ONCE(ep->netlat_min, lat);
+	if (lat > READ_ONCE(ep->netlat_max))
+		WRITE_ONCE(ep->netlat_max, lat);
+	atomic_inc(&ep->netlat_cnt);
+	atomic64_add(lat, &ep->netlat_avg);
+}
+
+unsigned int cs_get_avg_in_flight(struct pcs_cs *cs)
+{
+	assert_spin_locked(&cs->lock);
+
+	if (cs->in_flight == 0) {
+		abs_time_t now;
+
+		now = jiffies;
+
+		if (now >= cs->idle_stamp + CS_LAT_DECAY_INTERVAL) {
+			if (cs->idle_stamp == 0 || now > cs->idle_stamp + 30*CS_LAT_DECAY_INTERVAL) {
+				cs->in_flight_avg = 0;
+			} else {
+				unsigned int interval;
+
+				interval = (now - cs->idle_stamp)/CS_LAT_DECAY_INTERVAL;
+				cs->idle_stamp = now;
+				cs->in_flight_avg >>= interval;
+			}
+			if (cs->cwnd > PCS_CS_INIT_CWND) {
+				cs->cwnd = PCS_CS_INIT_CWND;
+				if (cs->eff_cwnd > PCS_CS_INIT_CWND)
+					cs->eff_cwnd = PCS_CS_INIT_CWND;
+			}
+		}
+	}
+
+	return cs->in_flight_avg;
+}
+
+void cs_increment_in_flight(struct pcs_cs *cs, unsigned int to_add)
+{
+	unsigned int avg;
+
+	spin_lock(&cs->lock);
+	avg = cs_get_avg_in_flight(cs);
+
+	cs->in_flight += to_add;
+
+	cs->in_flight_avg = avg + (((int)(cs->in_flight - avg)) >> CS_LAT_EWMA_LOG);
+
+	if (cs->in_flight > cs->in_flight_hwm) {
+		cs->in_flight_hwm = cs->in_flight;
+		cs->in_flight_hwm_stamp = jiffies;
+		DTRACE("HWM on CS" NODE_FMT " is %u\n", NODE_ARGS(cs->id), cs->in_flight);
+	}
+	spin_unlock(&cs->lock);
+}
+
+void cs_decrement_in_flight(struct pcs_cs *cs, unsigned int to_dec)
+{
+	assert_spin_locked(&cs->lock);
+
+	cs->in_flight -= to_dec;
+
+	BUG_ON((int)cs->in_flight < 0);
+
+	if (cs->in_flight < cs->eff_cwnd) {
+		cs->cwr_state = 0;
+		pcs_cs_flush_cong_queue(cs);
+	}
+	if (cs->in_flight == 0)
+		cs->idle_stamp = jiffies;
+}
+
+/* Check that cwnd was used recently. If it was not used, drop it. */
+
+void cs_cwnd_use_or_lose(struct pcs_cs *cs)
+{
+	assert_spin_locked(&cs->lock);
+
+	if (cs->in_flight_hwm < cs->cwnd && cs->cwnd > PCS_CS_INIT_CWND) {
+		abs_time_t now = jiffies;
+
+		if (now > cs->in_flight_hwm_stamp + CS_LAT_DECAY_INTERVAL) {
+			unsigned int cwnd;
+
+			cwnd = cs->in_flight_hwm;
+			if (cwnd < PCS_CS_INIT_CWND)
+				cwnd = PCS_CS_INIT_CWND;
+
+			TRACE("Congestion window on CS#" NODE_FMT " was not used, shrink %u -> %u", NODE_ARGS(cs->id),
+			      cs->cwnd, cwnd);
+			cs->cwnd = cwnd;
+			if (cs->eff_cwnd > cwnd)
+				cs->eff_cwnd = cwnd;
+			cs->in_flight_hwm_stamp = now;
+			cs->in_flight_hwm = cs->in_flight;
+		}
+	}
+}
+
+static void cs_probe_done(struct pcs_msg *msg)
+{
+	struct pcs_cs_set *css = msg->private;
+	struct pcs_cs *cs;
+
+	cs = lookup_and_lock_cs(css, &msg->rpc->peer_id);
+
+	if (cs) {
+		if (!pcs_if_error(&msg->error)) {
+			cs_whitelist(cs, "probe");
+		} else {
+			TRACE("probe error %d", msg->error.value);
+			cs_blacklist(cs, msg->error.value, "probe");
+		}
+		cs->is_probing = 0;
+		spin_unlock(&cs->lock);
+	}
+	pcs_free_msg(msg);
+}
+
+static struct pcs_msg *cs_prep_probe(struct pcs_cs *cs)
+{
+	struct pcs_msg *msg;
+	struct pcs_cs_map_prop *m;
+	unsigned int msg_sz = offsetof(struct pcs_cs_map_prop, nodes) + sizeof(struct pcs_cs_node_desc);
+
+
+	msg = pcs_rpc_alloc_output_msg(msg_sz);
+	if (!msg)
+		return NULL;
+
+	m = (struct pcs_cs_map_prop *)msg_inline_head(msg);
+	memset(m, 0, msg_sz);
+
+	m->hdr.h.type = PCS_CS_MAP_PROP_REQ;
+	m->hdr.h.len = msg_sz;
+
+	m->flags = CS_MAPF_PING;
+	m->nnodes = 1;
+	m->nodes[0].state     = CS_OBJ_UNKNOWN;
+	m->nodes[0].info.id   = cs->id;
+	m->nodes[0].info.addr = cs->rpc->addr;
+
+	msg->done = cs_probe_done;
+	msg->private = cs->css;
+	msg->timeout = PCS_CS_BLACKLIST_TIMER / 2;
+	return msg;
+}
+
+static void bl_timer_work(struct work_struct *w)
+{
+	struct pcs_cs_set *css = container_of(w, struct pcs_cs_set, bl_work.work);
+	struct pcs_cluster_core *cc = cc_from_csset(css);
+	LIST_HEAD(local_lst);
+	LIST_HEAD(to_blacklist);
+	LIST_HEAD(to_resubmit);
+
+	spin_lock(&css->lock);
+	list_splice_tail_init(&css->bl_list, &local_lst);
+	spin_unlock(&css->lock);
+	if (list_empty(&local_lst))
+		return;
+
+	while (!list_empty(&local_lst)) {
+		struct pcs_cs *cs;
+		struct pcs_msg *msg;
+
+		cs = list_first_entry(&local_lst, struct pcs_cs, bl_link);
+
+		spin_lock(&cs->lock);
+		BUG_ON(cs->is_dead);
+		list_move(&cs->bl_link, &to_blacklist);
+		if (cs->is_probing) {
+			spin_unlock(&cs->lock);
+			continue;
+		}
+		if (!cs->nmaps) {
+			pcs_cs_isolate(cs, &to_resubmit);
+			spin_unlock(&cs->lock);
+			pcs_cs_destroy(cs);
+			continue;
+		}
+		cs->is_probing = 1;
+		spin_unlock(&cs->lock);
+		msg = cs_prep_probe(cs);
+		if (msg)
+			pcs_rpc_call(cs->rpc, msg);
+		spin_lock(&cs->lock);
+		if (!msg)
+			cs->is_probing = 0;
+		spin_unlock(&cs->lock);
+	}
+	spin_lock(&css->lock);
+	list_splice(&to_blacklist, &css->bl_list);
+	if (!list_empty(&css->bl_list))
+		mod_delayed_work(cc->wq, &css->bl_work, PCS_CS_BLACKLIST_TIMER);
+	spin_unlock(&css->lock);
+
+	pcs_cc_requeue(cc, &to_resubmit);
+}
+
+void pcs_csset_init(struct pcs_cs_set *css)
+{
+	unsigned int i;
+
+	for (i = 0; i < PCS_CS_HASH_SIZE; i++)
+		INIT_HLIST_HEAD(&css->ht[i]);
+
+	INIT_LIST_HEAD(&css->lru);
+	INIT_LIST_HEAD(&css->bl_list);
+	INIT_DELAYED_WORK(&css->bl_work, bl_timer_work);
+	css->ncs = 0;
+	spin_lock_init(&css->lock);
+}
+
+void pcs_csset_fini(struct pcs_cs_set *css)
+{
+	unsigned int i;
+	LIST_HEAD(to_resubmit);
+
+	for (i = 0; i < PCS_CS_HASH_SIZE; i++) {
+		spin_lock(&css->lock);
+		while (!hlist_empty(&css->ht[i])) {
+			struct pcs_cs *cs;
+
+			rcu_read_lock();
+			cs = hlist_entry(css->ht[i].first, struct pcs_cs, hlist);
+			hlist_del_init_rcu(&cs->hlist);
+			spin_unlock(&css->lock);
+
+			spin_lock(&cs->lock);
+			if (cs->is_dead) {
+				spin_unlock(&cs->lock);
+				rcu_read_unlock();
+				spin_lock(&css->lock);
+				continue;
+			}
+			rcu_read_unlock();
+			pcs_cs_isolate(cs, &to_resubmit);
+			spin_unlock(&cs->lock);
+			pcs_cs_destroy(cs);
+
+			spin_lock(&css->lock);
+		}
+		spin_unlock(&css->lock);
+
+	}
+	cancel_delayed_work_sync(&css->bl_work);
+	/* NOTE: it looks like this must be empty at destruction */
+	BUG_ON(!list_empty(&to_resubmit));
+	pcs_cc_requeue(cc_from_csset(css), &to_resubmit);
+
+	BUG_ON(timer_pending(&css->bl_work.timer));
+	BUG_ON(!list_empty(&css->bl_list));
+	BUG_ON(!list_empty(&css->lru));
+	BUG_ON(css->ncs);
+
+
+}
+
+int pcs_cs_for_each_entry(struct pcs_cs_set *set, int (*cb)(struct pcs_cs *cs, void *arg), void *arg)
+{
+	int rc = 0;
+	unsigned int i;
+	struct pcs_cs *cs;
+	struct hlist_node *node;
+
+	spin_lock(&set->lock);
+	for (i = 0; i < PCS_CS_HASH_SIZE; i++) {
+		hlist_for_each_entry_safe(cs, node, &set->ht[i], hlist) {
+			rc = cb(cs, arg);
+			if (rc < 0) {
+				spin_unlock(&set->lock);
+				return rc;
+			}
+		}
+	}
+	spin_unlock(&set->lock);
+	return rc;
+}
+
+static int do_update_stat(struct pcs_cs *cs, void *arg)
+{
+	(void)arg;
+	pcs_cs_stat_up(cs);
+	return 0;
+}
+
+void pcs_cs_set_stat_up(struct pcs_cs_set *set)
+{
+	pcs_cs_for_each_entry(set, do_update_stat, 0);
+}
+
+void pcs_cs_cong_enqueue(struct pcs_int_request *ireq, struct pcs_cs *cs)
+{
+	spin_lock(&cs->lock);
+	if (!test_bit(CS_SF_CONGESTED, &cs->state))
+		set_bit(CS_SF_CONGESTED, &cs->state);
+	list_add_tail(&ireq->list, &cs->cong_queue);
+	cs->cong_queue_len++;
+	if (!ireq->qdepth)
+		ireq->qdepth = cs->cong_queue_len + cs->active_list_len;
+	spin_unlock(&cs->lock);
+}
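
For orientation (a sketch outside the patch): the in-flight accounting above is a
fixed-point EWMA, updated as avg += (sample - avg) >> CS_LAT_EWMA_LOG. A minimal
user-space model, assuming the same shift of 6:

	#include <stdio.h>

	/* Toy model of the in_flight_avg update in cs_increment_in_flight();
	 * the shift of 6 mirrors CS_LAT_EWMA_LOG from pcs_cs.h. */
	#define EWMA_LOG 6

	static unsigned int ewma_update(unsigned int avg, unsigned int sample)
	{
		return avg + (((int)(sample - avg)) >> EWMA_LOG);
	}

	int main(void)
	{
		unsigned int avg = 0, i;

		/* Feed a constant in-flight value and watch the average converge. */
		for (i = 0; i < 300; i++)
			avg = ewma_update(avg, 1024 * 1024);
		printf("avg after 300 samples: %u\n", avg);
		return 0;
	}
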
diff --git a/fs/fuse/kio/pcs/pcs_cs.h b/fs/fuse/kio/pcs/pcs_cs.h
new file mode 100644
index 000000000000..c04317e4a9a9
--- /dev/null
+++ b/fs/fuse/kio/pcs/pcs_cs.h
@@ -0,0 +1,182 @@
+#ifndef _PCS_CS_H_
+#define _PCS_CS_H_ 1
+
+#include "pcs_prot_types.h"
+#include "pcs_perfcounters.h"
+
+struct pcs_map_entry;
+
+#define PCS_CS_INIT_CWND	(1*1024*1024)
+#define PCS_CS_MAX_CWND		(16*1024*1024)
+#define PCS_MAX_NETWORK_LATENCY	((2000*3)/4)
+#define PCS_MAX_IO_LATENCY	(8*HZ)
+#define PCS_MAX_READ_IO_LATENCY	(5*HZ)
+
+/* io_prio received from the MDS is valid for this long; after that it is stale and cannot be used */
+#define PCS_CS_IO_PRIO_VALID_TIME	(60*HZ)
+
+/* When CS is idle its latency halves after CS_LAT_DECAY_INTERVAL */
+#define CS_LAT_DECAY_INTERVAL	(HZ/2)
+
+/* When CS is active time constant is ln(2) * 2^CS_LAT_EWMA_LOG / IOPS,
+ * so that with IOPS=100 and CS_LAT_EWMA_LOG=6 we have ~400ms
+ */
+#define CS_LAT_EWMA_LOG		(6)
+
+#define PCS_CS_BLACKLIST_TIMER	(10*HZ)
+
+enum {
+	CS_SF_LOCAL,
+	CS_SF_LOCAL_SOCK,
+	CS_SF_INACTIVE,
+	CS_SF_REPLICATING,
+	CS_SF_FAILED,
+	CS_SF_BLACKLISTED,
+	CS_SF_ACTIVE,
+	CS_SF_CONGESTED,
+};
+
+struct pcs_cs {
+	struct hlist_node	hlist;
+	union {
+		struct list_head lru_link;
+		struct rcu_head	 rcu;
+	};
+	spinlock_t		lock;
+	struct pcs_cs_set	*css;
+
+	PCS_NODE_ID_T		id;
+
+	unsigned int		in_flight;
+	unsigned int		eff_cwnd;
+	unsigned int		cwnd;
+	int			cwr_state;
+	atomic_t		latency_avg;
+	unsigned int		net_latency_avg;
+	unsigned int		in_flight_avg;
+	unsigned int		last_latency;
+	unsigned int		in_flight_hwm;
+	abs_time_t		in_flight_hwm_stamp;
+	abs_time_t		latency_stamp;
+	abs_time_t		net_latency_stamp;
+	abs_time_t		idle_stamp;
+	struct list_head	cong_queue;
+	int			cong_queue_len;
+	struct list_head	active_list;
+	int			active_list_len;
+
+	pcs_cs_io_prio_t	io_prio;
+	pcs_cs_net_prio_t	net_prio;
+	u8			mds_flags;
+	abs_time_t		io_prio_stamp;
+
+	struct list_head		flow_lru;
+	int			nflows;
+
+	unsigned long		state;
+	int			blacklist_reason;
+	struct list_head	bl_link;
+	unsigned		is_probing:1;
+	unsigned		is_dead:1;
+
+
+	int			addr_serno;
+	PCS_NET_ADDR_T		addr;
+
+	struct pcs_rpc		*rpc;
+
+	int			nmaps;
+	struct list_head	map_list;
+
+	struct {
+		struct pcs_perf_stat_cnt iolat;
+		struct pcs_perf_stat_cnt netlat;
+		struct pcs_perf_rate_cnt read_ops_rate;
+		struct pcs_perf_rate_cnt write_ops_rate;
+		struct pcs_perf_rate_cnt sync_ops_rate;
+	} stat;
+};
+
+static inline void pcs_cs_init_cong_queue(struct pcs_cs *cs)
+{
+	INIT_LIST_HEAD(&cs->cong_queue);
+	cs->cong_queue_len = 0;
+	clear_bit(CS_SF_CONGESTED, &cs->state);
+}
+
+static inline void pcs_cs_init_active_list(struct pcs_cs *cs)
+{
+	INIT_LIST_HEAD(&cs->active_list);
+	cs->active_list_len = 0;
+}
+
+static inline void pcs_cs_flush_cong_queue(struct pcs_cs *cs)
+{
+	assert_spin_locked(&cs->lock);
+	list_splice_tail(&cs->cong_queue, &cs->active_list);
+	cs->active_list_len += cs->cong_queue_len;
+	pcs_cs_init_cong_queue(cs);
+}
+
+void pcs_cs_cong_enqueue(struct pcs_int_request *ireq, struct pcs_cs *cs);
+
+#define PCS_CS_HASH_SIZE 1024
+
+struct pcs_cs_set {
+	struct hlist_head	ht[PCS_CS_HASH_SIZE];
+	struct list_head	lru;
+	struct list_head	bl_list;
+	struct delayed_work	bl_work;
+	unsigned int		ncs;
+	spinlock_t		lock;
+};
+
+void pcs_cs_submit(struct pcs_cs *cs, struct pcs_int_request *ireq);
+struct pcs_cs *pcs_cs_find_create(struct pcs_cs_set *csset, PCS_NODE_ID_T *id, PCS_NET_ADDR_T *addr, int local);
+void pcs_cs_notify_error(struct pcs_cluster_core *cc, pcs_error_t *err);
+
+void cs_update_io_latency(struct pcs_cs *cs, u32 lat);
+unsigned int cs_get_avg_latency(struct pcs_cs *cs);
+unsigned int __cs_get_avg_latency(struct pcs_cs *cs, abs_time_t now);
+void cs_account_latency(struct pcs_cs *cs, unsigned int to_add);
+void cs_update_net_latency(struct pcs_cs *cs, u32 lat);
+unsigned int cs_get_avg_net_latency(struct pcs_cs *cs);
+unsigned int __cs_get_avg_net_latency(struct pcs_cs *cs, abs_time_t now);
+void cs_increment_in_flight(struct pcs_cs *cs, unsigned int to_add);
+void cs_decrement_in_flight(struct pcs_cs *cs, unsigned int to_dec);
+void cs_cwnd_use_or_lose(struct pcs_cs *cs);
+unsigned int cs_get_avg_in_flight(struct pcs_cs *cs);
+
+void pcs_csset_init(struct pcs_cs_set *css);
+void pcs_csset_fini(struct pcs_cs_set *css);
+
+struct pcs_cs *pcs_cs_alloc(struct pcs_cs_set *css, struct pcs_cluster_core *cc);
+
+void cs_log_io_times(struct pcs_int_request *ireq, struct pcs_msg *resp, unsigned int max_iolat);
+int pcs_cs_format_io_times(char *buf, int buflen, struct pcs_int_request *ireq, struct pcs_msg *resp);
+void cs_set_io_times_logger(void (*logger)(struct pcs_int_request *ireq, struct pcs_msg *resp, u32 max_iolat, void *ctx), void *ctx);
+
+int pcs_cs_for_each_entry(struct pcs_cs_set *set, int (*cb)(struct pcs_cs *cs, void *arg), void *arg);
+
+void pcs_cs_update_stat(struct pcs_cs *cs, u32 iolat, u32 netlat, int op_type);
+
+static inline void pcs_cs_stat_up(struct pcs_cs *cs)
+{
+#if 0
+	/* TODO: perf counters are temporarily disabled */
+	pcs_perfcounter_stat_up(&cs->stat.iolat);
+	pcs_perfcounter_stat_up(&cs->stat.netlat);
+	pcs_perfcounter_up_rate(&cs->stat.write_ops_rate);
+	pcs_perfcounter_up_rate(&cs->stat.read_ops_rate);
+	pcs_perfcounter_up_rate(&cs->stat.sync_ops_rate);
+#endif
+}
+
+static inline bool cs_is_blacklisted(struct pcs_cs *cs)
+{
+	return test_bit(CS_SF_BLACKLISTED, &cs->state);
+}
+
+void pcs_cs_set_stat_up(struct pcs_cs_set *set);
+
+#endif /* _PCS_CS_H_ */
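
To illustrate the idle decay that cs_get_avg_in_flight() applies to this average
(a user-space sketch, assuming the per-interval halving and the ~30-interval
cutoff are the only rules that matter here):

	/* Sketch of the idle decay performed by cs_get_avg_in_flight():
	 * the average is shifted right once per elapsed CS_LAT_DECAY_INTERVAL
	 * and dropped to zero after ~30 idle intervals. */
	static unsigned int decay_in_flight_avg(unsigned int avg,
						unsigned long idle_intervals)
	{
		if (idle_intervals == 0)
			return avg;		/* still busy, no decay */
		if (idle_intervals > 30)
			return 0;		/* idle long enough: forget history */
		return avg >> idle_intervals;	/* halve once per interval */
	}
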
diff --git a/fs/fuse/kio/pcs/pcs_cs_prot.h b/fs/fuse/kio/pcs/pcs_cs_prot.h
new file mode 100644
index 000000000000..f6b1c7f0dedf
--- /dev/null
+++ b/fs/fuse/kio/pcs/pcs_cs_prot.h
@@ -0,0 +1,125 @@
+#ifndef _PCS_CS_PROT_H_
+#define _PCS_CS_PROT_H_ 1
+
+#include "pcs_rpc_prot.h"
+
+#define PCS_CS_FLUSH_WEIGHT	(128*1024)
+
+struct pcs_cs_sync_data
+{
+	PCS_INTEGRITY_SEQ_T	integrity_seq;	/* Invariant. Changed only on CS host crash */
+	PCS_SYNC_SEQ_T		sync_epoch;	/* Invariant. Changed on CSD startup. */
+	PCS_SYNC_SEQ_T		sync_dirty;	/* Sync number of CS upon completion of local write */
+	PCS_SYNC_SEQ_T		sync_current;	/* Current sync number of CS. If > sync_dirty, write is synced */
+
+	u64			misc;		/* Message received by CS */
+	u32			ts_io;		/* Local IO finished */
+	u32			ts_net;		/* Net finished */
+	u64			_reserved;	/* For future extensions */
+} __attribute__((aligned(8)));
+
+/* IO req/resp flags. Older versions have the flag field zero, so the zero value should be neutral.
+ * We have room for 12 flags.
+ */
+#define PCS_CS_IO_CACHED	(1ULL<<63)	/* Resp: result is read from cache or written ahead to journal */
+#define PCS_CS_IO_SEQ		(1ULL<<62)	/* Req: request is part of sequential flow */
+
+#define PCS_CS_RESET_TS_RECV(sdata, ts)	do { (sdata)->misc = ((u64)ts & 0xFFFFFFFFFFFFFULL); } while (0)
+#define PCS_CS_SET_TS_RECV(sdata, ts)	do { (sdata)->misc = ((sdata)->misc & ~0xFFFFFFFFFFFFFULL) | ((u64)ts & 0xFFFFFFFFFFFFFULL); } while (0)
+#define PCS_CS_ADD_TS_RECV(sdata, ts)	do { (sdata)->misc |= ((u64)ts & 0xFFFFFFFFFFFFFULL); } while (0)
+#define PCS_CS_GET_TS_RECV(sdata)	((sdata)->misc & 0xFFFFFFFFFFFFFULL)
+
+struct pcs_cs_sync_resp {
+	PCS_NODE_ID_T		cs_id;
+	struct pcs_cs_sync_data	sync;
+} __attribute__((aligned(8)));
+
+struct pcs_cs_iohdr {
+	struct pcs_rpc_hdr	hdr;
+
+	PCS_MAP_VERSION_T	map_version;
+	PCS_CHUNK_UID_T		uid;
+	u64			offset;
+	u32			size;
+	u32			iocontext;
+	u64			_reserved;	/* For future extensions */
+	struct pcs_cs_sync_data	sync;		/* Filled in all requests and responses */
+	struct pcs_cs_sync_resp sync_resp[0];	/* Used only in response to write/sync */
+} __attribute__((aligned(8)));
+
+
+/* Maximal message size. The value is essentially arbitrary */
+#define PCS_CS_MSG_MAX_SIZE	(1024*1024 + sizeof(struct pcs_cs_iohdr))
+
+#define PCS_CS_READ_REQ		(PCS_RPC_CS_CLIENT_BASE)
+#define PCS_CS_READ_RESP	(PCS_CS_READ_REQ|PCS_RPC_DIRECTION)
+
+#define PCS_CS_WRITE_REQ	(PCS_RPC_CS_CLIENT_BASE + 2)
+#define PCS_CS_WRITE_RESP	(PCS_CS_WRITE_REQ|PCS_RPC_DIRECTION)
+
+#define PCS_CS_REPLICATE_REQ	(PCS_RPC_CS_CLIENT_BASE + 4)
+#define PCS_CS_REPLICATE_RESP	(PCS_CS_REPLICATE_REQ|PCS_RPC_DIRECTION)
+
+#define PCS_CS_SYNC_REQ		(PCS_RPC_CS_CLIENT_BASE + 6)
+#define PCS_CS_SYNC_RESP	(PCS_CS_SYNC_REQ|PCS_RPC_DIRECTION)
+
+#define PCS_CS_WRITE_SYNC_REQ	(PCS_RPC_CS_CLIENT_BASE + 8)
+#define PCS_CS_WRITE_SYNC_RESP	(PCS_CS_WRITE_SYNC_REQ|PCS_RPC_DIRECTION)
+
+struct pcs_cs_cong_notification {
+	struct pcs_rpc_hdr	hdr;
+
+	PCS_XID_T		xid;	/* XID of request triggered congestion notification */
+} __attribute__((aligned(8)));
+
+#define PCS_CS_CONG_NOTIFY	(PCS_RPC_CS_CLIENT_BASE + 10)
+
+////////////////////////////////////////////
+//// from pcs_mds_cs_prot.h
+//// required for PCS_CS_MAP_PROP_REQ/ping to work
+struct pcs_cs_fs_info {
+	u64	free_space;
+	u64	total_space;
+};
+
+struct pcs_cs_node_desc {
+	s32			state;	 /* CS_OBJ_XXX */
+	u8			flags;	 /* CS_OBJF_XXX */
+	u8			role;
+	u16			csum_lo;
+	u32			status;	 /* PCS_ERR_XXX filled in response */
+	u16			csum_hi;
+	u8			parent_idx; /* Index of parent in replica tree. Undefined for root. */
+	u8			source_idx; /* Index of replication source for this replica */
+	u64			dirty_mask; /* Initialized by CS before forwarding the map downstream */
+	struct pcs_cs_info	info;	 /* CS id and address */
+	struct pcs_cs_fs_info	fs_info; /* Filled by CS in response */
+} __attribute__((aligned(8)));
+
+struct pcs_cs_map_prop {
+	struct pcs_mds_hdr	hdr;
+
+	PCS_CHUNK_UID_T		chunk_uid;
+	/* Messages with a version less than or equal to the current one (if available) are ignored unless
+	 * the CS_MAPF_PING flag is set. When that flag is set, the version is ignored, as are the chunk state/flags.
+	 */
+	PCS_MAP_VERSION_T	version;
+	/* During replication this version indicates the newest dirty mask version allowed to be used for recovery. */
+	PCS_MAP_VERSION_T	dirty_version;
+	u32			flags;	/* CS_MAPF_XXX */
+	u32			chunk_size;
+	/* The maximum number of nodes in the chain. Intended to be used in timeout calculation. */
+	u16			chain_nodes;
+	u16			reserved;
+	u32			nnodes;
+	struct pcs_cs_node_desc	nodes[0];
+} __attribute__((aligned(8)));
+
+#define CS_OBJ_UNKNOWN		-1
+#define CS_MAPF_PING		0x1000
+#define PCS_CS_MAP_PROP_REQ	(PCS_RPC_CS_CS_BASE + 2)
+#define PCS_CS_MAP_PROP_RESP	(PCS_CS_MAP_PROP_REQ | PCS_RPC_DIRECTION)
+//////////////////////////////////////////// end pcs_mds_cs_prot.h
+
+
+#endif /* _PCS_CS_PROT_H_ */
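
A small user-space exercise of the misc-field packing above (the mask and bit
position are copied from this header; TS_MASK and IO_CACHED_BIT are local
stand-in names):

	#include <assert.h>
	#include <stdint.h>

	#define TS_MASK		0xFFFFFFFFFFFFFULL	/* low 52 bits: receive timestamp */
	#define IO_CACHED_BIT	(1ULL << 63)		/* corresponds to PCS_CS_IO_CACHED */

	int main(void)
	{
		uint64_t misc = 0, ts = 123456789ULL;

		/* PCS_CS_SET_TS_RECV: replace the timestamp, keep the flag bits */
		misc = (misc & ~TS_MASK) | (ts & TS_MASK);
		misc |= IO_CACHED_BIT;

		/* PCS_CS_GET_TS_RECV: the flag bits do not leak into the timestamp */
		assert((misc & TS_MASK) == ts);
		assert(misc & IO_CACHED_BIT);
		return 0;
	}
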
diff --git a/fs/fuse/kio/pcs/pcs_error.h b/fs/fuse/kio/pcs/pcs_error.h
new file mode 100644
index 000000000000..f4ec588943dc
--- /dev/null
+++ b/fs/fuse/kio/pcs/pcs_error.h
@@ -0,0 +1,189 @@
+#ifndef _PCS_ERROR_H_
+#define _PCS_ERROR_H_ 1
+
+#include "pcs_types.h"
+
+typedef enum {
+	PCS_ERR_OK		= 0,	/* No error */
+	PCS_ERR_NOMEM		= 1,	/* Out of memory: malloc failure */
+	PCS_ERR_PROTOCOL	= 2,	/* Fatal protocol error. A condition that should happen
+					 * only if there is a bug in the protocol implementation
+					 */
+	PCS_ERR_AUTH		= 3,	/* Authentication failure due to wrong credentials */
+	PCS_ERR_NET		= 4,	/* Misc network error */
+	PCS_ERR_NOSPACE		= 5,	/* ENOSPC/EDQUOT while local file io */
+	PCS_ERR_IO		= 6,	/* Misc error while local file io */
+	PCS_ERR_LOST_LOCK	= 7,	/* The CN did not get a response from the MDS for a lease update;
+					 * it is generated by the CN itself as a sort of self-fencing.
+					 * Probably useless and should be removed.
+					 */
+
+	PCS_ERR_NOT_FOUND	= 8,	/* Requested object not found */
+	PCS_ERR_INTERRUPTED	= 9,	/* The operation was interrupted, should be retried */
+	PCS_ERR_NET_ABORT	= 10,	/* Message dropped due to abort of network connection */
+	PCS_ERR_CONNECT_TIMEOUT	= 11,	/* Failed connect() */
+	PCS_ERR_AUTH_TIMEOUT	= 12,	/* Authentication failure due to timeout */
+	PCS_ERR_RESPONSE_TIMEOUT= 13,	/* Peer did not respond or missed the deadline */
+	PCS_ERR_WRITE_TIMEOUT	= 14,	/* Socket write() failed, peer is stuck or network is broken */
+
+	PCS_ERR_CANCEL_REQUEST	= 18,	/* Request was cancelled by user */
+	PCS_ERR_CANCEL_IO	= 19,	/* IO request was cancelled */
+
+	PCS_ERR_LEASE_REQUIRED	= 20,	/* Lease required */
+	PCS_ERR_LEASE_EXPIRED	= 21,	/* Lease is expired */
+	PCS_ERR_LEASE_CONFLICT	= 22,	/* Lease request conflicts with another lease */
+	PCS_ERR_INV_PATH	= 23,	/* The path is invalid. Usually means an attempt to make a directory a subdirectory of itself. */
+	PCS_ERR_NOT_DIR		= 24,	/* Attempt to read non-directory */
+	PCS_ERR_IS_DIR		= 25,	/* Attempt to access directory (resize/io) */
+	PCS_ERR_NON_EMPTY_DIR	= 26,	/* Attempt to rename/delete non empty directory */
+	PCS_ERR_ZERO_CHUNK	= 27,	/* The requested chunk was not written yet and contains zero data */
+	PCS_ERR_INVALID		= 29,	/* Object is invalid */
+	PCS_ERR_INV_PARAMS	= 30,	/* Invalid parameters */
+	PCS_ERR_NO_ID		= 31,	/* Request from the client without ID */
+	PCS_ERR_INVALID_ID	= 32,	/* The client or server ID is invalid or banned */
+	PCS_ERR_NORES		= 33,	/* Not enough resources (too many requests) */
+	PCS_ERR_UNAVAIL		= 34,	/* Service unavailable */
+	PCS_ERR_BAD_CLUSTER	= 35,	/* The cluster id specified by client is invalid */
+	PCS_ERR_READONLY	= 36,	/* Invalid operation on read-only object */
+	PCS_ERR_PERM		= 37,	/* Permission denied */
+	PCS_ERR_UNSUPPORTED	= 38,	/* Operation is not supported */
+
+	PCS_ERR_TEMP_UNAVAIL	= 40,	/* The resource is temporarily unavailable */
+	PCS_ERR_INTEGRITY	= 41,	/* Not enough alive replicas available */
+	PCS_ERR_INTEGRITY_FAIL	= 42,	/* Fatal. Returned by MDS to client, when it is known that
+					 * some unsynced data could be lost.
+					 */
+
+	PCS_ERR_NO_STORAGE	= 50,	/* The number of chunk servers in cluster is less than the required number of replicas */
+	PCS_ERR_NOT_ALLOWED	= 51,	/* Operation is not allowed due to licensing limitations */
+	PCS_ERR_CFG_VERSION	= 60,	/* Configuration version mismatch */
+	PCS_ERR_CLNT_VERSION	= 61,	/* Client version is incompatible with the server version (outdated) */
+	PCS_ERR_EXISTS		= 70,	/* Specified object already exists */
+	PCS_ERR_EPOCH_MISMATCH	= 72,	/* Object epoch mismatch due to concurrent update */
+	PCS_ERR_NO_DIR		= 75,	/* Name directory does not exist */
+	PCS_ERR_DIR_INST_VER	= 76,	/* Name instance version mismatch */
+	PCS_ERR_CONTEXT_LOST	= 80,	/* Operation context is lost on server restart */
+	PCS_ERR_NS_LEASE_BUSY	= 81,	/* Lease wasn't acquired due to other active lease */
+	PCS_ERR_NS_LEASE_INVALID= 82,	/* Active lease doesn't have reference with id provided in the request */
+	PCS_ERR_NS_LOCK_EXPIRED = 83,	/* Lock at object's name NS has already expired */
+
+	PCS_ERR_CSD_STALE_MAP	= 100,	/* Old map (or no map) at CS */
+	PCS_ERR_CSD_RO_MAP	= 101,	/* Write request with read-only map */
+	PCS_ERR_CSD_WR_IN_PROGR	= 102,	/* Read only map is rejected due to write requests being processed */
+	PCS_ERR_CSD_REPLICATING	= 103,	/* Attempt to read from unfinished replica */
+	PCS_ERR_CSD_STALLED_REPL= 104,	/* Replication stalled */
+	PCS_ERR_CANCEL_KEEPWAIT	= 105,	/* IO request was canceled and redirected to another CS */
+	PCS_ERR_CSD_LACKING	= 110,	/* Not enough CS servers available */
+	PCS_ERR_CSD_DROPPED	= 120,	/* The CS server was dropped by administrator */
+	PCS_ERR_MDS_NOT_MASTER	= 200,	/* The target MDS is not current master */
+	PCS_ERR_MDS_EXIST	= 201,	/* The MDS with such id already exist in cluster */
+	PCS_ERR_MDS_RM_TOOMANY	= 202,	/* Removing this MDS will make the cluster unusable */
+
+	PCS_ERR_LICENSE_LIMIT	= 300,	/* Operation can't be completed due to license limitations */
+	PCS_ERR_NO_LICENSE	= 301,	/* No active license */
+
+	PCS_ERR_SSL_CERTIFICATE_REVOKED	  = 400, /* Certificate revoked */
+	PCS_ERR_SSL_CERTIFICATE_EXPIRED	  = 401, /* Certificate expired */
+	PCS_ERR_SSL_UNKNOWN_CA		  = 402, /* Certificate issued by a CA the peer does not know and trust */
+	PCS_ERR_PEER_CERTIFICATE_REJECTED = 403, /* The peer certificate has failed the verification */
+
+	PCS_ERR_UNKNOWN		= 4095, /* Unknown error */
+	PCS_ERR_MAX		= PCS_ERR_UNKNOWN
+} pcs_err_t;
+
+/* Get long description of the error */
+const char *pcs_strerror(pcs_err_t errnum);
+
+/* Get short mnemonic */
+const char *pcs_errname(pcs_err_t errnum);
+
+/* Render string describing errno (on Linux and Mac) or Windows system error code. Return 0 on success or positive error number */
+int pcs_sys_strerror_r(int err, char *buf, int buflen);
+
+/* ----------------------------------------------------------------------------------- */
+
+/* Error code handling. "value" is one of error codes defined below,
+ * all the components share one error namespace. System errnos are not used,
+ * each subsystem, using syscalls, must recode errnos to one of PCS error codes.
+ * "remote" means that "offender" is valid. "offender" is node id, where this error
+ * was generated.
+ *
+ * XXX TODO there is one important case. Now "offender" is set when we have a connection
+ * to a peer and that peer returned an error via RPC. This is wrong (in many situations):
+ * we should return a remote error when communication with a node fails due to
+ * failure of the network between us and the peer, or of the peer itself. This is important.
+ * But it is tricky: we should remember that not all communication channels should generate
+ * remote errors. E.g. a failure of communication with the MDS is a local error for the CS
+ * communicating with the MDS, and a remote error with that CS as offender for other nodes.
+ * It is easy to mess up and I did mess it up from the very beginning. :-)
+ */
+
+struct _pcs_error_t
+{
+	unsigned int	value : 31, remote: 1;
+
+	PCS_NODE_ID_T		offender;
+};
+typedef struct _pcs_error_t pcs_error_t;
+
+static __inline void pcs_clear_error(pcs_error_t * err)
+{
+	err->value = 0;
+}
+
+static __inline int pcs_if_error(pcs_error_t const* err)
+{
+	return err->value != 0;
+}
+
+static __inline void pcs_copy_error(pcs_error_t * dst, pcs_error_t const* src)
+{
+	dst->value = src->value;
+	dst->remote = src->remote;
+	if (dst->remote)
+		dst->offender = src->offender;
+}
+
+static __inline void pcs_copy_error_cond(pcs_error_t * dst, pcs_error_t const* src)
+{
+	if (src->value && !dst->value)
+		pcs_copy_error(dst, src);
+}
+
+static __inline void pcs_set_local_error(pcs_error_t * status, int err)
+{
+	status->value = err;
+	status->remote = 0;
+}
+
+int pcs_error_to_errno(pcs_error_t *);
+
+static __inline void *pcs_err_ptr(int err)
+{
+	return (void*)(~(ULONG_PTR)err);
+}
+
+static __inline int pcs_ptr_err(void *ptr)
+{
+	return (int)(~(ULONG_PTR)ptr);
+}
+
+static __inline int pcs_is_ptr_err(void *ptr)
+{
+	return 0 < ~(ULONG_PTR)ptr && ~(ULONG_PTR)ptr <= PCS_ERR_MAX;
+}
+
+static __inline int pcs_is_ptr_err_or_null(void *ptr)
+{
+	return !ptr || pcs_is_ptr_err(ptr);
+}
+
+/* Convert errno on Linux/Mac or Windows error code to pcs_err_t */
+pcs_err_t pcs_errno_to_err(int err);
+
+__must_check static inline int errno_eagain(int err)
+{
+	return err == EAGAIN || err == EWOULDBLOCK;
+}
+
+#endif /* _PCS_ERROR_H_ */
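
A usage sketch of the error helpers declared above (the example() wrapper is an
illustrative name; the snippet assumes only this header):

	/* pcs_copy_error_cond() keeps the first recorded error, so a chain of
	 * operations reports the earliest failure. */
	static void example(void)
	{
		pcs_error_t status, err;

		pcs_clear_error(&status);

		pcs_set_local_error(&err, PCS_ERR_NET_ABORT);
		pcs_copy_error_cond(&status, &err);	/* status := PCS_ERR_NET_ABORT */

		pcs_set_local_error(&err, PCS_ERR_IO);
		pcs_copy_error_cond(&status, &err);	/* ignored: status already set */

		if (pcs_if_error(&status) && status.value == PCS_ERR_NET_ABORT)
			return;	/* the earliest failure is the one reported */
	}
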
diff --git a/fs/fuse/kio/pcs/pcs_flow_detect.h b/fs/fuse/kio/pcs/pcs_flow_detect.h
new file mode 100644
index 000000000000..1ac936dcc28f
--- /dev/null
+++ b/fs/fuse/kio/pcs/pcs_flow_detect.h
@@ -0,0 +1,7 @@
+#ifndef _PCS_FLOW_DETECT_H_
+#define _PCS_FLOW_DETECT_H_ 1
+
+/* TODO!!! This is a stub for flow detection */
+#include "pcs_flow_detect_stub.h"
+
+#endif /* _PCS_FLOW_DETECT_H_ */
diff --git a/fs/fuse/kio/pcs/pcs_flow_detect_stub.h b/fs/fuse/kio/pcs/pcs_flow_detect_stub.h
new file mode 100644
index 000000000000..f9d0ffe68829
--- /dev/null
+++ b/fs/fuse/kio/pcs/pcs_flow_detect_stub.h
@@ -0,0 +1,76 @@
+#ifndef _PCS_FLOW_DETECT_STUB_H_
+#define _PCS_FLOW_DETECT_STUB_H_ 1
+
+/* TODO!!! This is a stub for flow detection */
+
+/* This should be enough for 1000 iops; otherwise the lifetime should be decreased and/or the limit increased. */
+#define PCS_FLOW_LIFETIME	(512)
+#define PCS_FLOW_LIMIT_DFLT	(512)
+
+#define PCS_FLOW_RECENTTIME	(50)
+#define PCS_FLOW_THRESH		(6)
+
+struct pcs_flow_node
+{
+	int STUMB;
+};
+
+struct pcs_flow_table
+{
+	struct pcs_flow_node *STUMB;
+};
+
+struct pcs_flow_table_global
+{
+	struct pcs_flow_table *STUMB;
+	int		       nflows;
+};
+
+struct pcs_cs;
+
+static void pcs_flow_table_global_init(struct pcs_flow_table_global * gtab) __attribute__((unused));
+static void pcs_flow_table_global_fini(struct pcs_flow_table_global * gtab) __attribute__((unused));
+static void pcs_flow_table_init(struct pcs_flow_table * tab, struct pcs_flow_table_global * gtab) __attribute__((unused));
+static void pcs_flow_table_fini(struct pcs_flow_table * tab, struct pcs_flow_table_global * gtab) __attribute__((unused));
+static struct pcs_flow_node * pcs_flow_record(struct pcs_flow_table * tab, int dir, u64 start, unsigned int len,
+				       struct pcs_flow_table_global * gtab) __attribute__((unused));
+static void pcs_flow_confirm(struct pcs_flow_node * fl, struct pcs_flow_table * tab, int dir, u64 start, unsigned int len,
+			      struct pcs_flow_table_global * gtab) __attribute__((unused));
+static void pcs_flow_truncate(struct pcs_flow_table * tab, u64 new_size, struct pcs_flow_table_global * gtab) __attribute__((unused));
+static int pcs_flow_analysis(struct pcs_flow_table_global * gtab) __attribute__((unused));
+static int pcs_flow_cs_analysis(struct pcs_cs * cs) __attribute__((unused));
+static void pcs_flow_bind_cs(struct pcs_flow_node * fl, struct pcs_cs * cs) __attribute__((unused));
+static void pcs_flow_cs_unbind_all(struct pcs_cs * cs) __attribute__((unused));
+static void pcs_flow_put(struct pcs_flow_node * fl, struct pcs_flow_table_global * gtab) __attribute__((unused));
+static struct pcs_flow_node * pcs_flow_get(struct pcs_flow_node * fl) __attribute__((unused));
+static int pcs_flow_sequential(struct pcs_flow_node * fl) __attribute__((unused));
+
+
+
+
+
+
+static void pcs_flow_table_global_init(struct pcs_flow_table_global * gtab) {}
+static void pcs_flow_table_global_fini(struct pcs_flow_table_global * gtab) {}
+static void pcs_flow_table_init(struct pcs_flow_table * tab, struct pcs_flow_table_global * gtab) {}
+static void pcs_flow_table_fini(struct pcs_flow_table * tab, struct pcs_flow_table_global * gtab) {}
+
+static struct pcs_flow_node * pcs_flow_record(struct pcs_flow_table * tab, int dir, u64 start, unsigned int len,
+				       struct pcs_flow_table_global * gtab)
+{
+	return NULL;
+}
+static void pcs_flow_confirm(struct pcs_flow_node * fl, struct pcs_flow_table * tab, int dir, u64 start, unsigned int len,
+			      struct pcs_flow_table_global * gtab) {}
+static void pcs_flow_truncate(struct pcs_flow_table * tab, u64 new_size, struct pcs_flow_table_global * gtab) {}
+static int pcs_flow_analysis(struct pcs_flow_table_global * gtab) { return 0; }
+static int pcs_flow_cs_analysis(struct pcs_cs * cs) {return 0;}
+static void pcs_flow_bind_cs(struct pcs_flow_node * fl, struct pcs_cs * cs) {}
+static void pcs_flow_cs_unbind_all(struct pcs_cs * cs) {}
+
+static void pcs_flow_put(struct pcs_flow_node * fl, struct pcs_flow_table_global * gtab) {}
+static struct pcs_flow_node * pcs_flow_get(struct pcs_flow_node * fl) {return NULL;}
+static int pcs_flow_sequential(struct pcs_flow_node * fl) {return 0;}
+
+
+#endif /* _PCS_FLOW_DETECT_STUB_H_ */
diff --git a/fs/fuse/kio/pcs/pcs_fuse_kdirect.c b/fs/fuse/kio/pcs/pcs_fuse_kdirect.c
new file mode 100644
index 000000000000..f7226021d469
--- /dev/null
+++ b/fs/fuse/kio/pcs/pcs_fuse_kdirect.c
@@ -0,0 +1,742 @@
+/*
+ * Implements the kdirect API for the in-kernel PCS cluster client
+ */
+#include "../../fuse_i.h"
+
+#include <linux/pagemap.h>
+#include <linux/slab.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/module.h>
+#include <linux/compat.h>
+#include <linux/swap.h>
+#include <linux/aio.h>
+#include <linux/falloc.h>
+#include <linux/task_io_accounting_ops.h>
+#include <linux/virtinfo.h>
+#include <linux/file.h>
+#include <linux/workqueue.h>
+#include <linux/delay.h>
+#include <linux/socket.h>
+#include <linux/net.h>
+
+#include "pcs_ioctl.h"
+#include "pcs_cluster.h"
+#include "pcs_rpc.h"
+
+static struct kmem_cache *pcs_fuse_req_cachep;
+static struct kmem_cache *pcs_ireq_cachep;
+static struct workqueue_struct *pcs_wq;
+static struct fuse_kio_ops kio_pcs_ops;
+
+static void process_pcs_init_reply(struct fuse_conn *fc, struct fuse_req *req)
+{
+	struct pcs_fuse_cluster *pfc;
+	struct fuse_ioctl_out *arg = &req->misc.ioctl.out;
+	struct	pcs_ioc_init_kdirect *info = req->out.args[1].value;
+
+	if (req->out.h.error || arg->result) {
+		printk("Fail to initialize has_kdirect {%d,%d}\n",
+		       req->out.h.error, arg->result);
+		fc->conn_error = 1;
+		goto out;
+	}
+	pfc = kmalloc(sizeof(*pfc), GFP_NOIO);
+	if (!pfc) {
+		fc->conn_error = 1;
+		goto out;
+	}
+
+	if (pcs_cluster_init(pfc, pcs_wq, fc, &info->cluster_id, &info->node_id)) {
+		fc->conn_error = 1;
+		goto out;
+	}
+	/* TODO: Not yet implemented PSBM-80365 */
+	fc->no_fiemap = 1;
+	fc->no_fallocate = 1;
+
+	fc->kio.ctx = pfc;
+	printk("FUSE: kio_pcs: cl: " CLUSTER_ID_FMT ", clientid: " NODE_FMT "\n",
+	       CLUSTER_ID_ARGS(info->cluster_id), NODE_ARGS(info->node_id));
+out:
+	kfree(info);
+	/* We are called from process_init_reply before the connection
+	 * has been initialized. Do it now. */
+	fuse_set_initialized(fc);
+	wake_up_all(&fc->blocked_waitq);
+
+}
+
+int kpcs_conn_init(struct fuse_conn *fc)
+{
+	struct fuse_req *req;
+	struct fuse_ioctl_in *inarg;
+	struct fuse_ioctl_out *outarg;
+	struct pcs_ioc_init_kdirect *info;
+
+	BUG_ON(!fc->conn_init);
+
+	info = kzalloc(sizeof(*info), GFP_NOIO);
+	if (!info)
+		return -ENOMEM;
+
+	req = fuse_request_alloc(fc, 0);
+	if (IS_ERR(req)) {
+		kfree(info);
+		return PTR_ERR(req);
+	}
+
+	__set_bit(FR_BACKGROUND, &req->flags);
+	memset(&req->misc.ioctl, 0, sizeof(req->misc.ioctl));
+	/* filehandle and nodeid are null, but this is OK */
+	inarg = &req->misc.ioctl.in;
+	outarg = &req->misc.ioctl.out;
+	inarg->cmd = PCS_IOC_INIT_KDIRECT;
+
+	req->in.h.opcode = FUSE_IOCTL;
+	req->in.numargs = 1;
+	req->in.args[0].size = sizeof(*inarg);
+	req->in.args[0].value = inarg;
+	req->out.numargs = 2;
+	req->out.args[0].size = sizeof(*outarg);
+	req->out.args[0].value = outarg;
+	req->out.args[1].size = sizeof(*info);
+	req->out.args[1].value = info;
+	req->misc.ioctl.ctx = info;
+	req->end = process_pcs_init_reply;
+
+	fuse_request_send_background(fc, req);
+	return 0;
+}
+
+void kpcs_conn_fini(struct fuse_conn *fc)
+{
+	if (!fc->kio.ctx)
+		return;
+
+	TRACE("%s fc:%p\n", __FUNCTION__, fc);
+	flush_workqueue(pcs_wq);
+	pcs_cluster_fini((struct pcs_fuse_cluster *) fc->kio.ctx);
+}
+
+void kpcs_conn_abort(struct fuse_conn *fc)
+{
+	if (!fc->kio.ctx)
+		return;
+
+	//pcs_cluster_fini((struct pcs_fuse_cluster *) fc->kio.ctx);
+	printk("%s TODO: implement this method\n", __FUNCTION__);
+
+}
+
+static int kpcs_probe(struct fuse_conn *fc, char *name)
+
+{
+	printk("%s TODO IMPLEMENT check fuse_conn args here!\n", __FUNCTION__);
+	if (!strncmp(name, kio_pcs_ops.name, FUSE_KIO_NAME))
+		return 1;
+
+	return 0;
+}
+
+
+static int fuse_pcs_getfileinfo(struct fuse_conn *fc, struct file *file,
+				struct pcs_mds_fileinfo *info)
+{
+	struct fuse_file *ff = file->private_data;
+	struct fuse_req *req;
+	struct fuse_ioctl_in *inarg;
+	struct fuse_ioctl_out *outarg;
+	struct pcs_ioc_fileinfo ioc_info;
+	int err = 0;
+
+	req = fuse_get_req(fc, 0);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
+
+	memset(&req->misc.ioctl, 0, sizeof(req->misc.ioctl));
+	inarg = &req->misc.ioctl.in;
+	outarg = &req->misc.ioctl.out;
+
+	req->in.h.opcode = FUSE_IOCTL;
+	req->in.h.nodeid = ff->nodeid;
+
+	inarg->cmd = PCS_IOC_GETFILEINFO;
+	inarg->fh = ff->fh;
+	inarg->arg = 0;
+	inarg->flags = 0;
+	req->in.numargs = 1;
+	req->in.args[0].size = sizeof(*inarg);
+	req->in.args[0].value = inarg;
+
+	memset(&ioc_info, 0, sizeof(ioc_info));
+
+	req->out.numargs = 2;
+	req->out.args[0].size = sizeof(*outarg);
+	req->out.args[0].value = outarg;
+	req->out.args[1].size = sizeof(ioc_info);
+	req->out.args[1].value = &ioc_info;
+
+	fuse_request_send(fc, req);
+
+	if (req->out.h.error || outarg->result) {
+		printk("%s:%d h.err:%d result:%d\n", __FUNCTION__, __LINE__,
+		       req->out.h.error, outarg->result);
+		err = req->out.h.error ? req->out.h.error : outarg->result;
+		fuse_put_request(fc, req);
+		return err;
+	} else
+		*info = ioc_info.fileinfo;
+
+	fuse_put_request(fc, req);
+	return 0;
+}
+
+static int fuse_pcs_kdirect_claim_op(struct fuse_conn *fc, struct file *file,
+				     bool claim)
+{
+	struct fuse_file *ff = file->private_data;
+	struct fuse_req *req;
+	struct fuse_ioctl_in *inarg;
+	struct fuse_ioctl_out *outarg;
+	int err = 0;
+
+	req = fuse_get_req(fc, 0);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
+
+	memset(&req->misc.ioctl, 0, sizeof(req->misc.ioctl));
+	inarg = &req->misc.ioctl.in;
+	outarg = &req->misc.ioctl.out;
+
+	req->in.h.opcode = FUSE_IOCTL;
+	req->in.h.nodeid = ff->nodeid;
+
+	if (claim)
+		inarg->cmd = PCS_IOC_KDIRECT_CLAIM;
+	else
+		inarg->cmd = PCS_IOC_KDIRECT_RELEASE;
+
+	inarg->fh = ff->fh;
+	inarg->arg = 0;
+	inarg->flags = 0;
+	req->in.numargs = 1;
+	req->in.args[0].size = sizeof(*inarg);
+	req->in.args[0].value = inarg;
+	req->out.numargs = 1;
+	req->out.args[0].size = sizeof(*outarg);
+	req->out.args[0].value = outarg;
+	fuse_request_send(fc, req);
+	if (req->out.h.error || outarg->result) {
+		printk("%s:%d h.err:%d result:%d\n", __FUNCTION__, __LINE__,
+		       req->out.h.error, outarg->result);
+		err = req->out.h.error ? req->out.h.error : outarg->result;
+	}
+
+	fuse_put_request(fc, req);
+	return err;
+}
+
+static int kpcs_do_file_open(struct fuse_conn *fc, struct file *file, struct inode *inode)
+{
+	struct pcs_mds_fileinfo info;
+	struct fuse_inode *fi = get_fuse_inode(inode);
+	struct pcs_fuse_cluster *pfc = (struct pcs_fuse_cluster*)fc->kio.ctx;
+	struct pcs_dentry_info *di = NULL;
+	int ret;
+
+	ret = fuse_pcs_getfileinfo(fc, file, &info);
+	if (ret)
+		return ret;
+
+	if (info.sys.map_type != PCS_MAP_PLAIN) {
+		TRACE("Unsupported map_type:%x, ignore\n", info.sys.map_type);
+		return 0;
+	}
+
+	di = kzalloc(sizeof(*di), GFP_KERNEL);
+	if (!di)
+		return -ENOMEM;
+
+	/* TODO Init fields */
+	/* di.id.parent	    = id->parent; */
+	/* di.id.name.data  = name; */
+	/* di.id.name.len   = id->name.len; */
+
+	pcs_mapping_init(&pfc->cc, &di->mapping);
+	pcs_set_fileinfo(di, &info);
+	di->cluster = &pfc->cc;
+	di->inode = fi;
+	TRACE("init id:%llu chunk_size:%d stripe_depth:%d strip_width:%d\n",
+	      fi->nodeid, di->fileinfo.sys.chunk_size,
+	      di->fileinfo.sys.stripe_depth, di->fileinfo.sys.strip_width);
+
+	mutex_lock(&inode->i_mutex);
+	/* Someone already initialized it under us? */
+	if (fi->private) {
+		mutex_unlock(&inode->i_mutex);
+		pcs_mapping_invalidate(&di->mapping);
+		pcs_mapping_deinit(&di->mapping);
+		kfree(di);
+		return 0;
+	}
+	ret = fuse_pcs_kdirect_claim_op(fc, file, true);
+	if (ret) {
+		mutex_unlock(&inode->i_mutex);
+		pcs_mapping_invalidate(&di->mapping);
+		pcs_mapping_deinit(&di->mapping);
+		kfree(di);
+		return ret;
+	}
+	/* TODO: Proper initialization of the dentry should be here!!! */
+	fi->private = di;
+	mutex_unlock(&inode->i_mutex);
+	return 0;
+}
+
+int kpcs_file_open(struct fuse_conn *fc, struct file *file, struct inode *inode)
+{
+	struct fuse_inode *fi = get_fuse_inode(inode);
+
+	if (!S_ISREG(inode->i_mode))
+		return 0;
+	if (fi->nodeid - FUSE_ROOT_ID >= PCS_FUSE_INO_SPECIAL_)
+		return 0;
+	/* Already initialized */
+	if (fi->private) {
+		/* TODO: proper refcounting for claim_cnt should be here */
+		return 0;
+	}
+	return kpcs_do_file_open(fc, file, inode);
+}
+
+void kpcs_inode_release(struct fuse_inode *fi)
+{
+	struct pcs_dentry_info *di = fi->private;
+
+	if(!di)
+		return;
+
+	pcs_mapping_invalidate(&di->mapping);
+	pcs_mapping_deinit(&di->mapping);
+	/* TODO: properly destroy dentry info here!! */
+	kfree(di);
+}
+
+static void pcs_fuse_reply_handle(struct fuse_conn *fc, struct fuse_req *req)
+{
+	struct pcs_fuse_work *work = (struct pcs_fuse_work*) req->misc.ioctl.ctx;
+	int err;
+
+	err = req->out.h.error ? req->out.h.error : req->misc.ioctl.out.result;
+	if (err) {
+		/* TODO: fine-grained error conversion here */
+		pcs_set_local_error(&work->status, PCS_ERR_PROTOCOL);
+	}
+	queue_work(pcs_wq, &work->work);
+}
+
+#define MAX_CS_CNT 32
+static void fuse_complete_map_work(struct work_struct *w)
+{
+	struct pcs_fuse_work *work = container_of(w, struct pcs_fuse_work, work);
+	struct pcs_map_entry *m = (struct pcs_map_entry *)work->ctx;
+	struct pcs_ioc_getmap *omap = (struct pcs_ioc_getmap *)work->ctx2;
+
+	BUG_ON(!m);
+	BUG_ON(!omap);
+	pcs_copy_error_cond(&omap->error, &work->status);
+	if (omap->cs_cnt > MAX_CS_CNT) {
+		printk("Corrupted cs_cnt from userspace");
+		pcs_set_local_error(&omap->error, PCS_ERR_PROTOCOL);
+	}
+
+	pcs_map_complete(m, omap);
+	kfree(omap);
+	kfree(work);
+}
+
+int fuse_map_resolve(struct pcs_map_entry *m, int direction)
+{
+	struct pcs_dentry_info *di = pcs_dentry_from_mapping(m->mapping);
+	struct fuse_conn *fc = pcs_cluster_from_cc(di->cluster)->fc;
+	struct fuse_req *req;
+	struct fuse_ioctl_in *inarg;
+	struct fuse_ioctl_out *outarg;
+	struct pcs_ioc_getmap *map_ioc;
+	struct pcs_fuse_work *reply_work;
+	size_t map_sz;
+
+	DTRACE("enter m: " MAP_FMT ", dir:%d \n", MAP_ARGS(m),	direction);
+
+	BUG_ON(!(m->state & PCS_MAP_RESOLVING));
+
+	map_sz = sizeof(*map_ioc) + MAX_CS_CNT * sizeof(struct pcs_cs_info);
+	map_ioc = kzalloc(map_sz, GFP_NOIO);
+	if (!map_ioc)
+		return -ENOMEM;
+
+	reply_work = kzalloc(sizeof(*reply_work), GFP_NOIO);
+	if (!reply_work) {
+		kfree(map_ioc);
+		return -ENOMEM;
+	}
+	req = fuse_get_req_for_background(fc, 0);
+	if (IS_ERR(req)) {
+		kfree(map_ioc);
+		kfree(reply_work);
+		return PTR_ERR(req);
+	}
+
+
+	memset(&req->misc.ioctl, 0, sizeof(req->misc.ioctl));
+	inarg = &req->misc.ioctl.in;
+	outarg = &req->misc.ioctl.out;
+	inarg->cmd = PCS_IOC_GETMAP;
+	map_ioc->cs_max = MAX_CS_CNT;
+
+	/* fill ioc_map struct */
+	if (pcs_map_encode_req(m, map_ioc, direction) != 0) {
+		kfree(map_ioc);
+		kfree(reply_work);
+		fuse_put_request(fc, req);
+		return 0;
+	}
+
+	/* Fill core ioctl */
+	req->in.h.opcode = FUSE_IOCTL;
+	/* FH is null, peer will lookup by nodeid */
+	inarg->fh = 0;
+	req->in.h.nodeid = di->inode->nodeid;
+	req->in.numargs = 2;
+	req->in.args[0].size = sizeof(*inarg);
+	req->in.args[0].value = inarg;
+	req->in.args[1].size = map_sz;
+	req->in.args[1].value = map_ioc;
+
+	req->out.numargs = 2;
+	/* TODO: make this ioctl varsizable */
+	req->out.argvar = 1;
+	req->out.args[0].size = sizeof(*outarg);
+	req->out.args[0].value = outarg;
+	req->out.args[1].size = map_sz;
+	req->out.args[1].value = map_ioc;
+
+	INIT_WORK(&reply_work->work, fuse_complete_map_work);
+	reply_work->ctx = m;
+	reply_work->ctx2 = map_ioc;
+	req->misc.ioctl.ctx = reply_work;
+	req->end = pcs_fuse_reply_handle;
+
+	fuse_request_send_background(fc, req);
+
+	return 0;
+}
+static void process_pcs_csconn_work(struct work_struct *w)
+{
+	struct pcs_fuse_work *work = container_of(w, struct pcs_fuse_work, work);
+	struct pcs_rpc *ep  = (struct pcs_rpc *)work->ctx;
+	struct socket *sock = (struct socket *)work->ctx2;
+	BUG_ON(!ep);
+
+	if (pcs_if_error(&work->status)) {
+		mutex_lock(&ep->mutex);
+		pcs_rpc_reset(ep);
+		mutex_unlock(&ep->mutex);
+		TRACE(PEER_FMT" fail with %d\n", PEER_ARGS(ep), work->status.value);
+	} else	{
+		if (sock)
+			rpc_connect_done(ep, sock);
+	}
+	pcs_rpc_put(ep);
+	kfree(work);
+}
+
+static void process_pcs_csconn_reply(struct fuse_conn *fc, struct fuse_req *req)
+{
+	struct pcs_ioc_csconn *csconn = (struct pcs_ioc_csconn *)req->in.args[1].value;
+	struct fuse_ioctl_out *arg = &req->misc.ioctl.out;
+	struct pcs_fuse_work *work = (struct pcs_fuse_work*) req->misc.ioctl.ctx;
+	int is_open = csconn->flags & PCS_IOC_CS_OPEN;
+
+	if (req->out.h.error || arg->result < 0) {
+		pcs_set_local_error(&work->status, PCS_ERR_PROTOCOL);
+		goto out;
+	}
+	/* Grab socket from caller's context (fuse-evloop) and do the rest in kwork */
+	if (is_open) {
+		struct socket *sock;
+		struct file* filp;
+		int err;
+
+		filp = fget((unsigned int)arg->result);
+		arg->result = 0;
+		if (!filp) {
+			pcs_set_local_error(&work->status, PCS_ERR_PROTOCOL);
+			goto out;
+		}
+		sock = sock_from_file(filp, &err);
+		if (!sock) {
+			fput(filp);
+			pcs_set_local_error(&work->status, PCS_ERR_PROTOCOL);
+		} else
+			TRACE("id: "NODE_FMT" sock:%p\n", NODE_ARGS(csconn->id), sock);
+		work->ctx2 = sock;
+	}
+out:
+	kfree(csconn);
+	pcs_fuse_reply_handle(fc, req);
+
+}
+
+int fuse_pcs_csconn_send(struct fuse_conn *fc, struct pcs_rpc *ep, int flags)
+{
+	struct fuse_req *req;
+	struct fuse_ioctl_in *inarg;
+	struct fuse_ioctl_out *outarg;
+	struct pcs_ioc_csconn *csconn;
+	struct pcs_fuse_work *reply_work;
+
+	/* The socket must be freed from kernel space before requesting a new one */
+	BUG_ON(!(flags & PCS_IOC_CS_REOPEN));
+
+	TRACE("start %s cmd:%ld id:%lld flags:%x\n", __FUNCTION__,
+	      PCS_IOC_CSCONN, ep->peer_id.val, flags);
+
+	csconn = kzalloc(sizeof(*csconn), GFP_NOIO);
+	if (!csconn)
+		return -ENOMEM;
+
+	reply_work = kzalloc(sizeof(*reply_work), GFP_NOIO);
+	if (!reply_work) {
+		kfree(csconn);
+		return -ENOMEM;
+	}
+
+	req = fuse_get_req_for_background(fc, 0);
+	if (IS_ERR(req)) {
+		kfree(csconn);
+		kfree(reply_work);
+		return PTR_ERR(req);
+	}
+
+	memset(&req->misc.ioctl, 0, sizeof(req->misc.ioctl));
+	inarg = &req->misc.ioctl.in;
+	outarg = &req->misc.ioctl.out;
+
+	inarg->cmd = PCS_IOC_CSCONN;
+	inarg->fh = 0;
+	inarg->arg = 0;
+	inarg->flags = 0;
+
+	csconn->id.val = ep->peer_id.val;
+	memcpy(&csconn->address, &ep->addr, sizeof(ep->addr));
+	csconn->flags = flags;
+
+	req->in.h.opcode = FUSE_IOCTL;
+	req->in.numargs = 2;
+	req->in.args[0].size = sizeof(*inarg);
+	req->in.args[0].value = inarg;
+	req->in.args[1].size = sizeof(*csconn);
+	req->in.args[1].value = csconn;
+
+	req->out.numargs = 1;
+	req->out.args[0].size = sizeof(*outarg);
+	req->out.args[0].value = outarg;
+
+	INIT_WORK(&reply_work->work, process_pcs_csconn_work);
+	reply_work->ctx = pcs_rpc_get(ep);
+	reply_work->ctx2 = NULL; /* the returned socket will be stored here */
+	req->misc.ioctl.ctx = reply_work;
+
+	req->end = process_pcs_csconn_reply;
+	fuse_request_send_background(fc, req);
+
+	return 0;
+}
+
+struct fuse_req *kpcs_req_alloc(struct fuse_conn *fc,
+					unsigned npages, gfp_t flags)
+{
+	return fuse_generic_request_alloc(fc, pcs_fuse_req_cachep,
+					  npages, flags);
+}
+
+/* IOHOOKS */
+
+struct pcs_int_request * __ireq_alloc(void)
+{
+	return kmem_cache_alloc(pcs_ireq_cachep, GFP_NOIO);
+}
+void ireq_destroy(struct pcs_int_request *ireq)
+{
+	kmem_cache_free(pcs_ireq_cachep, ireq);
+}
+
+static void pcs_fuse_submit(struct pcs_fuse_cluster *pfc, struct fuse_req *req, int async)
+{
+	struct pcs_fuse_req *r = pcs_req_from_fuse(req);
+	struct fuse_inode *fi = get_fuse_inode(req->io_inode);
+	struct pcs_dentry_info *di = pcs_inode_from_fuse(fi);
+	struct pcs_int_request* ireq;
+
+	BUG_ON(!di);
+	BUG_ON(req->cache != pcs_fuse_req_cachep);
+
+	/* Init pcs_fuse_req */
+	memset(&r->exec.io, 0, sizeof(r->exec.io));
+	memset(&r->exec.ctl, 0, sizeof(r->exec.ctl));
+	/* Use inline request structure */
+	ireq = &r->exec.ireq;
+	ireq_init(di, ireq);
+
+	switch (r->req.in.h.opcode) {
+	case FUSE_WRITE: {
+		struct fuse_write_in *in = &r->req.misc.write.in;
+		struct fuse_write_out *out = &r->req.misc.write.out;
+		out->size = in->size;
+		break;
+	}
+	case FUSE_READ: {
+		struct fuse_read_in *in = &r->req.misc.read.in;
+		size_t size = in->size;
+
+		if (in->offset + in->size > di->fileinfo.attr.size) {
+			if (in->offset >= di->fileinfo.attr.size) {
+				req->out.args[0].size = 0;
+				break;
+			}
+			size = di->fileinfo.attr.size - in->offset;
+		}
+		pcs_fuse_prep_io(r, PCS_REQ_T_READ, in->offset, size);
+		goto submit;
+	}
+	case FUSE_FSYNC:
+		/*NOOP */
+		break;
+	}
+	r->req.out.h.error = 0;
+	DTRACE("do fuse_request_end req:%p op:%d err:%d\n", &r->req, r->req.in.h.opcode, r->req.out.h.error);
+
+	request_end(pfc->fc, &r->req);
+	return;
+submit:
+	if (async)
+		pcs_cc_submit(ireq->cc, ireq);
+	else
+		ireq_process(ireq);
+}
+
+
+int kpcs_req_send(struct fuse_conn* fc, struct fuse_req *req, bool bg, bool lk)
+{
+	struct pcs_fuse_cluster *pfc = (struct pcs_fuse_cluster*)fc->kio.ctx;
+	struct fuse_inode *fi = get_fuse_inode(req->io_inode);
+
+	if (!fc->initialized || fc->conn_error)
+		return 1;
+
+	BUG_ON(!pfc);
+	/* HYPOTHESIS #1
+	 * AFAIU, at this point the request cannot belong to any list,
+	 * so we can avoid grabbing fc->lock here at all.
+	 */
+	BUG_ON(!list_empty(&req->list));
+
+	TRACE(" Enter req:%p op:%d bg:%d lk:%d\n", req, req->in.h.opcode, bg, lk);
+
+	/* TODO: This is just a crutch; conn cleanup requires sane locking */
+	if (req->in.h.opcode == FUSE_DESTROY) {
+		kpcs_conn_fini(fc);
+		spin_lock(&fc->lock);
+		fc->kio.ctx = NULL;
+		spin_unlock(&fc->lock);
+		return 1;
+	}
+	if ((req->in.h.opcode != FUSE_READ &&
+	     req->in.h.opcode != FUSE_WRITE))
+		return 1;
+
+	fi = get_fuse_inode(req->io_inode);
+	if (!fi->private)
+		return 1;
+
+	/* TODO: handle only read requests for now */
+	if (req->in.h.opcode != FUSE_READ)
+		return 1;
+
+	__clear_bit(FR_BACKGROUND, &req->flags);
+	__clear_bit(FR_PENDING, &req->flags);
+	/* request_end below will do fuse_put_request() */
+	if (!bg)
+		atomic_inc(&req->count);
+	pcs_fuse_submit(pfc, req, lk);
+	if (!bg)
+		wait_event(req->waitq, test_bit(FR_FINISHED, &req->flags));
+
+	return 0;
+}
+
+
+static struct fuse_kio_ops kio_pcs_ops = {
+	.name		= "pcs",
+	.owner		= THIS_MODULE,
+	.probe		= kpcs_probe, /*TODO: check sb->dev name */
+
+	.conn_init	= kpcs_conn_init,
+	.conn_fini	= kpcs_conn_fini,
+	.conn_abort	= kpcs_conn_abort,
+	.req_alloc	= kpcs_req_alloc,
+	.req_send	= kpcs_req_send,
+	.file_open	= kpcs_file_open,
+	.inode_release	= kpcs_inode_release,
+};
+
+
+static int __init kpcs_mod_init(void)
+{
+	int err = -ENOMEM;
+	pcs_fuse_req_cachep = kmem_cache_create("pcs_fuse_request",
+						sizeof(struct pcs_fuse_req),
+						0, 0, NULL);
+
+	if (!pcs_fuse_req_cachep)
+		return err;
+
+	pcs_ireq_cachep = kmem_cache_create("pcs_ireq",
+					    sizeof(struct pcs_int_request),
+					    0, SLAB_MEM_SPREAD, NULL);
+	if (!pcs_ireq_cachep)
+		goto free_fuse_cache;
+	pcs_wq = alloc_workqueue("pcs_cluster", WQ_MEM_RECLAIM, 0);
+	if (!pcs_wq)
+		goto free_ireq_cache;
+
+	if(fuse_register_kio(&kio_pcs_ops))
+		goto free_wq;
+	printk("%s fuse_c:%p ireq_c:%p pcs_wq:%p\n", __FUNCTION__,
+	       pcs_fuse_req_cachep, pcs_ireq_cachep, pcs_wq);
+
+	return 0;
+free_wq:
+	destroy_workqueue(pcs_wq);
+free_ireq_cache:
+	kmem_cache_destroy(pcs_ireq_cachep);
+free_fuse_cache:
+	kmem_cache_destroy(pcs_fuse_req_cachep);
+	return err;
+}
+
+static void __exit kpcs_mod_exit(void)
+{
+	fuse_unregister_kio(&kio_pcs_ops);
+	destroy_workqueue(pcs_wq);
+	kmem_cache_destroy(pcs_ireq_cachep);
+	kmem_cache_destroy(pcs_fuse_req_cachep);
+}
+
+module_init(kpcs_mod_init);
+module_exit(kpcs_mod_exit);
+
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Virtuozzo <devel at openvz.org>");
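
As a quick orientation for the registration path (a sketch only: the demo_* names
are invented, and the assumption that callbacks other than probe may be stubbed is
not verified against fuse_i.h):

	/* Hypothetical no-op kio backend showing how an engine is plugged into
	 * the fuse_kio_ops registry used by this patch. */
	static int demo_probe(struct fuse_conn *fc, char *name)
	{
		return 0;	/* never claims a connection */
	}

	static struct fuse_kio_ops kio_demo_ops = {
		.name	= "demo",
		.owner	= THIS_MODULE,
		.probe	= demo_probe,
		/* conn_init, conn_fini, conn_abort, req_alloc, req_send,
		 * file_open and inode_release would be filled as in
		 * kio_pcs_ops above; whether any may be left NULL is an
		 * assumption not checked here. */
	};

	static int __init demo_init(void)
	{
		/* Assumption: a non-zero return from fuse_register_kio() means failure. */
		return fuse_register_kio(&kio_demo_ops) ? -EBUSY : 0;
	}

	static void __exit demo_exit(void)
	{
		fuse_unregister_kio(&kio_demo_ops);
	}
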
diff --git a/fs/fuse/kio/pcs/pcs_ioctl.h b/fs/fuse/kio/pcs/pcs_ioctl.h
new file mode 100644
index 000000000000..6451baabb492
--- /dev/null
+++ b/fs/fuse/kio/pcs/pcs_ioctl.h
@@ -0,0 +1,85 @@
+#ifndef _PCS_IOCTL_H_
+#define _PCS_IOCTL_H_ 1
+
+#include <linux/ioctl.h>
+
+
+#include "pcs_prot_types.h"
+#include "pcs_mds_prot.h"
+#include "pcs_error.h"
+#include "pcs_map.h"
+#include "pcs_rpc.h"
+
+#define PCS_FUSE_INO_SPECIAL_ ((unsigned long long)-0x1000)
+
+struct pcs_client_lease_info
+{
+	u32				type;
+	u32				pad;
+	struct pcs_pc_lease_info	info;
+};
+
+struct pcs_getleases_resp {
+	u32				nleases;
+	u32				nleases_total;
+	struct pcs_client_lease_info	leases[0];
+};
+
+union pcs_getleases_ioc
+{
+	char				path[4096];
+	struct pcs_getleases_resp	resp;
+};
+
+struct pcs_ioc_init_kdirect
+{
+	PCS_NODE_ID_T node_id;
+	PCS_CLUSTER_ID_T cluster_id;
+};
+
+struct pcs_ioc_fileinfo
+{
+	struct pcs_mds_fileinfo fileinfo;
+};
+
+struct pcs_ioc_getmap
+{
+	PCS_CHUNK_UID_T		uid;		/* chunk unique id on out */
+	PCS_MAP_VERSION_T	version;	/* in (on retry) / out */
+	u64			chunk_start;	/* in / out */
+	u64			chunk_end;	/* out */
+	u32			state;		/* in/out: PCS_IOC_MAP_S_XXX */
+#define PCS_IOC_MAP_S_READ	0x1
+#define PCS_IOC_MAP_S_WRITE	0x2
+#define PCS_IOC_MAP_S_NEW	0x4
+#define PCS_IOC_MAP_S_ERROR	0x8
+	pcs_error_t		error;		/* in/out */
+	u16			mds_flags;	/* in/out */
+	u32			psize_ret;	/* length of chunk on CS (out) */
+	u32			chunk_psize;	/* physical size of chunk on CS on in */
+	u32			read_tout;	/* read	 timeout (msec) on out */
+	u32			write_tout;	/* write timeout (msec) on out */
+	/* TODO: cs array is only for OUT ? */
+	u32			cs_cnt;		/* The number of CS (including root) entries that follows */
+	u32			cs_max;		/* Max number of CS (including root) entries requested */
+	struct pcs_cs_info	cs[0];		/* Array of CS including root */
+};
+
+struct pcs_ioc_csconn
+{
+	PCS_NODE_ID_T		id;
+	PCS_NET_ADDR_T		address;
+	u32			flags;
+#define PCS_IOC_CS_OPEN		0x1
+#define PCS_IOC_CS_CLOSE	0x2
+#define PCS_IOC_CS_REOPEN	(PCS_IOC_CS_OPEN|PCS_IOC_CS_CLOSE)
+};
+
+#define PCS_IOC_INIT_KDIRECT	_IOR('V',32, struct pcs_ioc_init_kdirect)
+#define PCS_IOC_CSCONN		_IOR('V',33, struct pcs_ioc_csconn)
+#define PCS_IOC_GETFILEINFO	_IOR('V',34, struct pcs_ioc_fileinfo)
+#define PCS_IOC_KDIRECT_CLAIM	_IO('V',35)
+#define PCS_IOC_KDIRECT_RELEASE _IO('V',36)
+#define PCS_IOC_GETMAP		_IOWR('V',37, struct pcs_ioc_getmap)
+
+#endif /* _PCS_IOCTL_H_ */
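
The getmap ioctl payload ends with a flexible array of CS descriptors, so callers
size the buffer for the worst case. A sketch mirroring how fuse_map_resolve()
earlier in this patch does it (getmap_alloc and max_cs are illustrative names;
kernel headers assumed):

	/* Allocate a getmap ioctl buffer with room for up to max_cs CS entries
	 * following the fixed header (cs[] is a flexible array member). */
	static struct pcs_ioc_getmap *getmap_alloc(u32 max_cs)
	{
		size_t map_sz = sizeof(struct pcs_ioc_getmap) +
				max_cs * sizeof(struct pcs_cs_info);
		struct pcs_ioc_getmap *map_ioc = kzalloc(map_sz, GFP_NOIO);

		if (map_ioc)
			map_ioc->cs_max = max_cs;
		return map_ioc;
	}
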
diff --git a/fs/fuse/kio/pcs/pcs_map.c b/fs/fuse/kio/pcs/pcs_map.c
new file mode 100644
index 000000000000..32cfd073befd
--- /dev/null
+++ b/fs/fuse/kio/pcs/pcs_map.c
@@ -0,0 +1,2999 @@
+#include <net/sock.h>
+#include <linux/net.h>
+#include <linux/kthread.h>
+#include <linux/types.h>
+#include <linux/rbtree.h>
+#include <linux/highmem.h>
+#include <linux/log2.h>
+
+#include "pcs_types.h"
+#include "pcs_sock_io.h"
+#include "pcs_rpc.h"
+#include "pcs_sock_io.h"
+#include "pcs_req.h"
+#include "pcs_map.h"
+#include "pcs_cs.h"
+#include "pcs_ioctl.h"
+#include "log.h"
+
+/* Lock order:
+ *
+ *   map->lock			(motivated by truncate)
+ *     -> mapping->map_lock
+ *
+ *   map->lock
+ *     -> cs->lock		(pcs_map_set_cslist)
+ */
+#define MAP_BATCH 16
+
+static void pcs_ireq_queue_fail(struct list_head *queue, int error);
+
+abs_time_t get_real_time_ms(void)
+{
+	struct timespec tv = current_kernel_time();
+	return (abs_time_t)tv.tv_sec * 1000 + tv.tv_nsec / 1000000;
+}
+
+
+static inline unsigned int pcs_sync_timeout(struct pcs_cluster_core *cc)
+{
+	/* This is ~8 second distribution around PCS_SYNC_TIMEOUT */
+	//// TODO: timeout randomization temporarily disabled
+	////return PCS_SYNC_TIMEOUT - 0x1000 + (pcs_random(&cc->rng) & 0x1FFF);
+	return PCS_SYNC_TIMEOUT;
+}
+
+static void cslist_destroy(struct pcs_cs_list * csl)
+{
+	int i;
+
+	TRACE("csl:%p csl->map:%p refcnt:%d\n", csl, csl->map, atomic_read(&csl->refcnt));
+	BUG_ON(csl->map);
+
+	for (i = 0; i < csl->nsrv; i++) {
+		struct pcs_cs_link * cslink = &csl->cs[i].cslink;
+
+		/* Possible after error inside cslist_alloc() */
+		if (!cslink->cs)
+			continue;
+
+		spin_lock(&cslink->cs->lock);
+		if (!list_empty(&cslink->link)) {
+			list_del_init(&cslink->link);
+			cslink->cs->nmaps--;
+		}
+		spin_unlock(&cslink->cs->lock);
+	}
+	kfree(csl);
+}
+
+static inline void cslist_get(struct pcs_cs_list * csl)
+{
+	TRACE("csl:%p csl->map:%p refcnt:%d\n", csl, csl->map, atomic_read(&csl->refcnt));
+
+	atomic_inc(&csl->refcnt);
+}
+static inline void cslist_put(struct pcs_cs_list * csl)
+{
+	TRACE("csl:%p csl->map:%p refcnt:%d\n", csl, csl->map, atomic_read(&csl->refcnt));
+	if (atomic_dec_and_test(&csl->refcnt))
+		cslist_destroy(csl);
+}
+
+static void map_drop_cslist(struct pcs_map_entry * m)
+{
+	assert_spin_locked(&m->lock);
+
+	if (m->cs_list == NULL)
+		return;
+
+	m->cs_list->map = NULL;
+	/* Barrier here is only for sanity checks in cslist_destroy() */
+	smp_mb__before_atomic_dec();
+	cslist_put(m->cs_list);
+	m->cs_list = NULL;
+}
+
+static void pcs_map_callback(struct rcu_head *head)
+{
+	struct pcs_map_entry *m = container_of(head, struct pcs_map_entry, rcu);
+
+	BUG_ON(atomic_read(&m->__refcnt));
+	BUG_ON(!list_empty(&m->queue));
+	BUG_ON(!(m->state & PCS_MAP_DEAD));
+	BUG_ON(m->cs_list);
+
+	kfree(m);
+}
+
+static void __pcs_map_free(struct pcs_map_entry *m)
+{
+	call_rcu(&m->rcu, pcs_map_callback);
+}
+
+void __pcs_map_put(struct pcs_map_entry *m)
+__releases(m->lock)
+{
+	TRACE(" %p id:%lld state:%x ref:%d\n",m, m->id, m->state, atomic_read(&m->__refcnt));
+
+	assert_spin_locked(&m->lock);
+	if (m->state & PCS_MAP_DEAD) {
+		spin_unlock(&m->lock);
+		__pcs_map_free(m);
+		return;
+	}
+	map_add_lru(m);
+	spin_unlock(&m->lock);
+}
+
+static struct pcs_map_entry *  __pcs_map_get(struct pcs_map_entry *m)
+{
+	//TRACE( MAP_FMT " ref:%d, maps-count:%d \n", MAP_ARGS(m), m->__refcnt);
+	BUG_ON(atomic_inc_return(&m->__refcnt) <= 1);
+
+	return m;
+}
+
+static void pcs_map_reset(struct pcs_map_entry * m)
+{
+	m->state &= ~(PCS_MAP_READABLE|PCS_MAP_WRITEABLE);
+}
+static void pcs_ireq_queue_fail(struct list_head *queue, int error);
+static void map_sync_work_add(struct pcs_map_entry *m, unsigned long timeout);
+static void map_sync_work_del(struct pcs_map_entry *m);
+
+/* Truncate map from mapping */
+static void pcs_map_truncate(struct pcs_map_entry *m, struct list_head *queue)
+{
+
+	void *ret;
+
+	TRACE( MAP_FMT " ref:%d\n", MAP_ARGS(m), atomic_read(&m->__refcnt));
+
+	assert_spin_locked(&m->lock);
+	BUG_ON(m->state & PCS_MAP_DEAD);
+	BUG_ON(!m->mapping);
+	BUG_ON(!list_empty(&m->queue) && !queue);
+
+	spin_lock(&m->mapping->map_lock);
+	ret = radix_tree_delete(&m->mapping->map_tree, m->index);
+	BUG_ON(!ret || ret != m);
+	m->mapping->nrmaps--;
+	spin_unlock(&m->mapping->map_lock);
+
+	list_splice_tail_init(&m->queue, queue);
+	m->mapping = NULL;
+	map_sync_work_del(m);
+	pcs_map_reset(m);
+	m->state |= PCS_MAP_DEAD;
+	map_drop_cslist(m);
+}
+
+void pcs_mapping_init(struct pcs_cluster_core *cc, struct pcs_mapping * mapping)
+{
+	mapping->cluster = cc;
+	INIT_RADIX_TREE(&mapping->map_tree, GFP_ATOMIC);
+	spin_lock_init(&mapping->map_lock);
+	pcs_flow_table_init(&mapping->ftab, &cc->maps.ftab);
+}
+
+/* Must be called once right after lease is acquired. At that point we already
+ * have all the file attributes.
+ */
+void pcs_mapping_open(struct pcs_mapping * mapping)
+{
+	struct pcs_dentry_info *di = pcs_dentry_from_mapping(mapping);
+
+	switch (di->fileinfo.sys.map_type) {
+	default:
+		BUG();
+	case PCS_MAP_PLAIN:
+		return;
+	}
+}
+
+void pcs_mapping_dump(struct pcs_mapping * mapping)
+{
+	struct pcs_dentry_info *di = pcs_dentry_from_mapping(mapping);
+	unsigned long pos = 0;
+	struct pcs_map_entry *maps[MAP_BATCH];
+	int nr_maps, total = 0;
+
+	if (!mapping->nrmaps)
+		return;
+
+	DTRACE(DENTRY_FMT "\n", DENTRY_ARGS(di));
+
+	do {
+		int i;
+		rcu_read_lock();
+		nr_maps = radix_tree_gang_lookup(&mapping->map_tree,
+				(void **)maps, pos, MAP_BATCH);
+
+		for (i = 0; i < nr_maps; i++, total++) {
+			pos = maps[i]->index;
+			DTRACE("[%d] " MAP_FMT ", id:" CUID_FMT ",  v:" VER_FMT " ref:%d\n", total,  MAP_ARGS(maps[i]),
+			       CUID_ARGS(maps[i]->id), VER_ARGS(maps[i]->version),
+			       atomic_read(&maps[i]->__refcnt));
+		}
+		pos++;
+		rcu_read_unlock();
+	} while (nr_maps);
+}
+
+void map_truncate_tail(struct pcs_mapping * mapping, u64 offset)
+{
+
+	unsigned long pos = offset >> mapping->chunk_size_bits;
+	struct pcs_map_entry *maps[MAP_BATCH];
+	int nr_maps;
+	LIST_HEAD(dispose);
+
+	TRACE("%s " DENTRY_FMT "\n", __FUNCTION__, DENTRY_ARGS(pcs_dentry_from_mapping(mapping)));
+	do {
+		int i;
+
+		rcu_read_lock();
+		nr_maps = radix_tree_gang_lookup(&mapping->map_tree,
+				(void **)maps, pos, MAP_BATCH);
+
+		for (i = 0; i < nr_maps; i++) {
+			struct pcs_map_entry *m = maps[i];
+
+			spin_lock(&m->lock);
+			if (!pcs_map_get_locked(m)) {
+				spin_unlock(&m->lock);
+				continue;
+			}
+			pcs_map_truncate(m, &dispose);
+			map_del_lru(m);
+			spin_unlock(&m->lock);
+			pcs_map_put(m);
+		}
+		pos++;
+		rcu_read_unlock();
+	} while (nr_maps);
+
+	pcs_ireq_queue_fail(&dispose, PCS_ERR_NET_ABORT);
+}
+
+void pcs_mapping_invalidate(struct pcs_mapping * mapping)
+{
+	pcs_mapping_dump(mapping);
+	map_truncate_tail(mapping, 0);
+	/* If some CSes are still not shut down, we can have some map entries referenced in their queues */
+	pcs_flow_table_fini(&mapping->ftab, &pcs_dentry_from_mapping(mapping)->cluster->maps.ftab);
+}
+
+void pcs_mapping_deinit(struct pcs_mapping * mapping)
+{
+
+	BUG_ON(mapping->nrmaps);
+}
+
+static inline int map_reclaimable(struct pcs_map_entry * m)
+{
+	return list_empty(&m->queue)
+		&& !(m->state & (PCS_MAP_ERROR|PCS_MAP_RESOLVING));
+}
+
+static enum lru_status map_isolate(struct list_head *item,
+		struct list_lru_one *lru, spinlock_t *lru_lock, void *arg)
+{
+	struct list_head *dispose = arg;
+	struct pcs_map_entry *m = list_entry(item, struct pcs_map_entry, lru_link);
+
+	if (!spin_trylock(&m->lock))
+		return LRU_SKIP;
+
+	if (!map_reclaimable(m)) {
+		spin_unlock(&m->lock);
+		return LRU_SKIP;
+	}
+
+	pcs_map_truncate(m, NULL);
+	list_lru_isolate_move(lru, item, dispose);
+	spin_unlock(&m->lock);
+
+	return LRU_REMOVED;
+}
+
+static enum lru_status map_dirty_walk(struct list_head *item,
+		struct list_lru_one *lru, spinlock_t *lru_lock, void *arg)
+{
+	struct pcs_map_entry *m = list_entry(item, struct pcs_map_entry, lru_link);
+
+	if (!spin_trylock(&m->lock))
+		return LRU_SKIP;
+
+	BUG_ON(!(m->flags & PCS_MAP_DIRTY));
+	/* Flushes are not limited by ireq_delay(). So, we have
+	 * to suppress too frequent flushes when MDS fails to update map
+	 * for any reason.
+	 */
+	if (!(m->flags & (PCS_MAP_FLUSHING|PCS_MAP_DIRTY_GC)) &&
+	    timer_pending(&m->sync_work.timer) &&
+	    (jiffies >= m->error_tstamp + PCS_ERROR_DELAY)) {
+		m->flags |= PCS_MAP_DIRTY_GC;
+		map_sync_work_add(m, 0);
+	}
+	spin_unlock(&m->lock);
+	return LRU_SKIP;
+}
+
+unsigned long pcs_map_shrink_scan(struct shrinker *shrink,
+					 struct shrink_control *sc)
+{
+	LIST_HEAD(dispose);
+	unsigned long freed = 0;
+	unsigned long nr_to_scan = sc->nr_to_scan;
+	struct pcs_map_set *maps = container_of(shrink,
+					struct pcs_map_set, shrinker);
+
+	/* This shrinker performs only atomic operations,
+	 * so any GFP mask will work.
+	 * if (!(sc->gfp_mask & __GFP_FS))
+	 *	return SHRINK_STOP;
+	 */
+
+	freed = list_lru_walk_node(&maps->lru, sc->nid, map_isolate,
+				       &dispose, &nr_to_scan);
+
+	if (nr_to_scan)
+		list_lru_walk_node(&maps->dirty_lru, sc->nid,
+				   map_dirty_walk, NULL, &nr_to_scan);
+
+	while (!list_empty(&dispose)) {
+		struct pcs_map_entry *m;
+		m = list_first_entry(&dispose, struct pcs_map_entry, lru_link);
+		list_del_init(&m->lru_link);
+		__pcs_map_free(m);
+	}
+
+	if (!list_empty(&maps->dirty_queue)) {
+		INIT_LIST_HEAD(&dispose);
+		spin_lock(&maps->lock);
+		list_splice_tail(&maps->dirty_queue, &dispose);
+		spin_unlock(&maps->lock);
+		pcs_cc_requeue(container_of(maps, struct pcs_cluster_core, maps), &dispose);
+	}
+	TRACE(" lru_freed:%ld \n", freed);
+	return freed;
+}
+
+unsigned long map_gc(struct pcs_map_set *maps)
+{
+	struct shrink_control sc = {
+		.gfp_mask = GFP_NOIO,
+		.nr_to_scan = 1,
+		.nid = numa_node_id(),
+	};
+
+	return pcs_map_shrink_scan(&maps->shrinker, &sc);
+}
+
+static inline int is_dirtying(struct pcs_map_entry * map, struct pcs_int_request *ireq)
+{
+	if (!ireq->iochunk.direction)
+		return 0;
+
+	/* Was not dirty? */
+	if (!(map->flags & PCS_MAP_DIRTY))
+		return 1;
+
+	/* Already dirty, but a flush is in progress right now. Wait for the end of the flush. */
+	if (map->flags & (PCS_MAP_FLUSHING|PCS_MAP_DIRTY_GC))
+		return 1;
+
+	return 0;
+}
+
+static void map_queue_on_limit(struct pcs_int_request *ireq)
+{
+	struct pcs_map_set * maps = &ireq->dentry->cluster->maps;
+
+	TRACE("queueing due to dirty limit\n");
+
+	if (ireq_is_timed_out(ireq)) {
+		pcs_log(LOG_ERR, "timeout while map get on \"" DENTRY_FMT "\" last_err=%u",
+			DENTRY_ARGS(ireq->dentry), ireq->error.value);
+		BUG();
+	}
+
+	if (ireq->type == PCS_IREQ_IOCHUNK && ireq->iochunk.map) {
+		pcs_map_put(ireq->iochunk.map);
+		ireq->iochunk.map = NULL;
+	}
+
+	list_add_tail(&ireq->list, &maps->dirty_queue);
+	map_gc(maps);
+}
+
+/* TODO: this check differs from the original */
+int map_check_limit(struct pcs_map_entry * map, struct pcs_int_request *ireq)
+{
+	struct pcs_map_set * maps = &ireq->dentry->cluster->maps;
+
+	if (map == NULL) {
+		map_queue_on_limit(ireq);
+		return 1;
+	}
+
+	if (list_empty(&maps->dirty_queue))
+		return 0;
+
+	/* The goal is to queue requests which are going to increase pressure on the map limit. */
+
+	/* If the map failed, the request must pass. If it is under resolution, it can pass.
+	 *
+	 * This looks dangerous: error maps can overflow the map table.
+	 * Nevertheless, altogether this combines into another statement: if the map is not
+	 * reclaimable, the request passes. So it really does not increase pressure.
+	 */
+
+	if (!map_reclaimable(map))
+		return 0;
+	/*
+	 * When the map is new, the request definitely increases the pressure.
+	 *
+	 * It also does if the request is going to move a clean map to the dirty state.
+	 */
+	if (((map->state & PCS_MAP_NEW) || is_dirtying(map, ireq))) {
+		int nid = page_to_nid(virt_to_page(map));
+
+		if (list_lru_count_node(&maps->dirty_lru, nid) >
+		    maps->map_dirty_thresh)
+			map_queue_on_limit(ireq);
+		return 1;
+	}
+	return 0;
+}
+
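+/* The delayed sync work holds a reference on the map while it is pending:
+ * map_sync_work_add() takes the reference only when arming a timer which was
+ * not already pending, and map_sync_work_del() drops it when a pending timer
+ * is cancelled.  Both helpers must be called under m->lock.
+ */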
+static void map_sync_work_add(struct pcs_map_entry *m, unsigned long timeout)
+{
+	struct pcs_cluster_core *cc = cc_from_maps(m->maps);
+
+	assert_spin_locked(&m->lock);
+
+	if (!timer_pending(&m->sync_work.timer))
+		__pcs_map_get(m);
+	mod_delayed_work(cc->wq, &m->sync_work, timeout);
+}
+static void map_sync_work_del(struct pcs_map_entry *m)
+{
+	assert_spin_locked(&m->lock);
+
+	if (!timer_pending(&m->sync_work.timer))
+		return;
+	cancel_delayed_work(&m->sync_work);
+	pcs_map_put_locked(m);
+}
+static void sync_timer_work(struct work_struct *w);
+
+/* Returns map with incremented refcnt */
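+/* The lookup is done under RCU; pcs_map_get() may fail (presumably when the
+ * entry is already on its way to being freed), in which case the lookup is
+ * simply retried.  On a miss a new entry is allocated and inserted into the
+ * radix tree under map_lock; if somebody else inserted the same index first,
+ * the freshly allocated entry is discarded and we restart from the lookup.
+ */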
+struct pcs_map_entry * pcs_find_get_map(struct pcs_dentry_info *di, u64 offset)
+{
+	struct pcs_map_set * maps = &di->mapping.cluster->maps;
+	unsigned long idx = offset >> DENTRY_CHUNK_SIZE_BITS(di);
+	struct pcs_map_entry *m;
+
+again:
+	for (;;) {
+		rcu_read_lock();
+		m = radix_tree_lookup(&di->mapping.map_tree, idx);
+		if (m) {
+			BUG_ON(m->index != idx);
+			m = pcs_map_get(m);
+			rcu_read_unlock();
+			if (!m)
+				continue;
+			else
+				return m;
+		}
+		rcu_read_unlock();
+		/* No direct throttler here */
+		break;
+	}
+	m = kzalloc(sizeof(struct pcs_map_entry), GFP_NOIO);
+	if (!m)
+		return NULL;
+
+	if (radix_tree_preload(GFP_NOIO)) {
+		kfree(m);
+		return NULL;
+	}
+
+	m->mapping = NULL;
+	m->maps = NULL;
+	m->res_offset = offset;
+	m->chunk_psize = 0;
+	m->index = idx;
+
+	map_version_init(&m->version);
+	m->id = 0;		/* For logging only, it is not used before map is completed */
+	m->state = PCS_MAP_NEW;
+	m->flags = 0;
+	atomic_set(&m->__refcnt, 1);
+	m->mds_flags = 0;
+	m->cs_list = NULL;
+	m->error_tstamp = 0;
+	m->mapping = &di->mapping;
+	INIT_DELAYED_WORK(&m->sync_work, sync_timer_work);
+	INIT_LIST_HEAD(&m->queue);
+	INIT_LIST_HEAD(&m->lru_link);
+	spin_lock_init(&m->lock);
+	atomic_inc(&maps->count);
+	m->maps = maps;
+
+	spin_lock(&di->mapping.map_lock);
+	m->mapping->nrmaps++;
+	if (radix_tree_insert(&di->mapping.map_tree, idx, m)) {
+		m->mapping->nrmaps--;
+		spin_unlock(&di->mapping.map_lock);
+		radix_tree_preload_end();
+		atomic_dec(&maps->count);
+		kfree(m);
+		goto again;
+	}
+	spin_unlock(&di->mapping.map_lock);
+	radix_tree_preload_end();
+
+	return m;
+}
+
+/* When CS goes up/down invalidate read_index on all the maps using this CS.
+ * This results in reevaluation of CS used for reads from this chunk at the next read.
+ */
+
+static void map_recalc_maps(struct pcs_cs * cs)
+{
+	struct pcs_cs_link * csl;
+	assert_spin_locked(&cs->lock);
+
+	list_for_each_entry(csl, &cs->map_list, link) {
+		struct pcs_cs_record * cs_rec;
+		struct pcs_cs_list * cs_list;
+		int read_idx;
+
+		cs_rec = container_of(csl, struct pcs_cs_record, cslink);
+		cs_list = container_of(cs_rec - csl->index, struct pcs_cs_list, cs[0]);
+		read_idx = READ_ONCE(cs_list->read_index);
+
+		if (read_idx >= 0 && (!cs_is_blacklisted(cs) ||
+				      cs_list->cs[read_idx].cslink.cs == cs))
+			WRITE_ONCE(cs_list->read_index, -1);
+	}
+}
+
+void pcs_map_force_reselect(struct pcs_cs * cs)
+{
+	struct pcs_cs_link * csl;
+	assert_spin_locked(&cs->lock);
+
+	list_for_each_entry(csl, &cs->map_list, link) {
+		struct pcs_cs_record * cs_rec;
+		struct pcs_cs_list * cs_list;
+		int read_idx;
+
+		cs_rec = container_of(csl, struct pcs_cs_record, cslink);
+		cs_list = container_of(cs_rec - csl->index, struct pcs_cs_list, cs[0]);
+		read_idx = READ_ONCE(cs_list->read_index);
+
+		if (read_idx >= 0 && cs_list->cs[read_idx].cslink.cs == cs)
+			WRITE_ONCE(cs_list->read_index, -1);
+	}
+}
+
+static int all_blacklisted(struct pcs_cs_list * csl)
+{
+	int i = 0;
+
+	for (i = 0; i < csl->nsrv; i++) {
+		if (test_bit(i, &csl->blacklist)) {
+			if (jiffies < READ_ONCE(csl->blacklist_expires))
+				continue;
+			TRACE("expire replication blacklist");
+			clear_bit(i, &csl->blacklist);
+		}
+		if (!test_bit(CS_SF_BLACKLISTED, &csl->cs[i].cslink.cs->state))
+			break;
+	}
+	return i == csl->nsrv;
+}
+
+static int urgent_whitelist(struct pcs_cs * cs)
+{
+	struct pcs_cs_link * csl;
+	assert_spin_locked(&cs->lock);
+
+	list_for_each_entry(csl, &cs->map_list, link) {
+		struct pcs_cs_record * cs_rec;
+		struct pcs_cs_list * cs_list;
+
+		cs_rec = container_of(csl, struct pcs_cs_record, cslink);
+		cs_list = container_of(cs_rec - csl->index, struct pcs_cs_list, cs[0]);
+
+		if (cs_list->map == NULL)
+			continue;
+
+		if (all_blacklisted(cs_list))
+			return 1;
+	}
+	return 0;
+}
+
+void cs_blacklist(struct pcs_cs * cs, int error, char * reason)
+{
+	assert_spin_locked(&cs->lock);
+
+	if (!cs_is_blacklisted(cs)) {
+		spin_lock(&cs->css->lock);
+		set_bit(CS_SF_BLACKLISTED, &cs->state);
+		cs->blacklist_reason = error;
+		TRACE("Blacklisting CS" NODE_FMT " by %s, err=%d", NODE_ARGS(cs->id), reason, error);
+		if (list_empty(&cs->css->bl_list)) {
+			struct pcs_cluster_core *cc = cc_from_csset(cs->css);
+
+			mod_delayed_work(cc->wq, &cs->css->bl_work, PCS_CS_BLACKLIST_TIMER);
+		}
+		list_add_tail(&cs->bl_link, &cs->css->bl_list);
+		spin_unlock(&cs->css->lock);
+		map_recalc_maps(cs);
+	}
+}
+
+static void cs_blacklist_unlocked(struct pcs_cs * cs, int error, char * reason)
+{
+	spin_lock(&cs->lock);
+	cs_blacklist(cs, error, reason);
+	spin_unlock(&cs->lock);
+}
+
+void cs_whitelist(struct pcs_cs * cs, char * reason)
+{
+	assert_spin_locked(&cs->lock);
+
+	if (cs_is_blacklisted(cs)) {
+		clear_bit(CS_SF_BLACKLISTED, &cs->state);
+		TRACE("Whitelisting CS" NODE_FMT " by %s", NODE_ARGS(cs->id), reason);
+
+		map_recalc_maps(cs);
+
+		spin_lock(&cs->css->lock);
+		list_del_init(&cs->bl_link);
+		if (list_empty(&cs->css->bl_list))
+			cancel_delayed_work(&cs->css->bl_work);
+		spin_unlock(&cs->css->lock);
+	}
+}
+
+static inline void __map_error(struct pcs_map_entry *m, int remote, int error, u64 offender)
+{
+	assert_spin_locked(&m->lock);
+	m->state = PCS_MAP_ERROR;
+	m->iofailure.remote = remote;
+	m->iofailure.value = error;
+	m->iofailure.offender.val = offender;
+}
+
+static inline void map_remote_error_nolock(struct pcs_map_entry *m, int error, u64 offender)
+{
+	__map_error(m, 1, error, offender);
+}
+
+static void map_remote_error(struct pcs_map_entry *m, int error, u64 offender)
+{
+	spin_lock(&m->lock);
+	map_remote_error_nolock(m, error, offender);
+	spin_unlock(&m->lock);
+}
+
+void pcs_map_notify_addr_change(struct pcs_cs * cs)
+{
+	struct pcs_cs_link * csl;
+	assert_spin_locked(&cs->lock);
+
+	cs_whitelist(cs, "addr update");
+
+	list_for_each_entry(csl, &cs->map_list, link) {
+		struct pcs_cs_record * cs_rec;
+		struct pcs_cs_list * cs_list;
+		struct pcs_map_entry * m;
+
+		cs_rec = container_of(csl, struct pcs_cs_record, cslink);
+		cs_list = container_of(cs_rec - csl->index, struct pcs_cs_list, cs[0]);
+
+		if (csl->addr_serno == cs->addr_serno)
+			continue;
+
+		if ((m = cs_list->map) == NULL)
+			continue;
+
+		spin_lock(&m->lock);
+		if ((m->state & PCS_MAP_DEAD) || m->cs_list != cs_list)
+			goto unlock;
+
+		if (m->state & (PCS_MAP_ERROR|PCS_MAP_RESOLVING|PCS_MAP_NEW))
+			goto unlock;
+
+		TRACE(MAP_FMT " invalidating due to address change of CS#"NODE_FMT,
+		      MAP_ARGS(m), NODE_ARGS(cs->id));
+
+		map_remote_error_nolock(m, PCS_ERR_CSD_STALE_MAP, cs->id.val);
+	unlock:
+		spin_unlock(&m->lock);
+	}
+}
+
+noinline static void pcs_ireq_queue_fail(struct list_head *queue, int error)
+{
+	while (!list_empty(queue)) {
+		struct pcs_int_request *ireq = list_first_entry(queue, struct pcs_int_request, list);
+
+		list_del_init(&ireq->list);
+
+		pcs_set_local_error(&ireq->error, error);
+
+		if (ireq->type == PCS_IREQ_TRUNCATE) {
+			ireq_on_error(ireq);
+
+			if (!(ireq->flags & IREQ_F_FATAL)) {
+				if (ireq_is_timed_out(ireq)) {
+					pcs_log(LOG_ERR, "timeout while truncate(%d) request on \"" DENTRY_FMT "\" last err=%u",
+						ireq->type, DENTRY_ARGS(ireq->dentry), ireq->error.value);
+					BUG();
+				}
+				pcs_clear_error(&ireq->error);
+
+				TRACE("requeue truncate(%d) %llu@" DENTRY_FMT "\n", ireq->type,
+				      (unsigned long long)ireq->truncreq.offset, DENTRY_ARGS(ireq->dentry));
+
+				ireq_delay(ireq);
+				continue;
+			}
+		}
+		ireq_complete(ireq);
+	}
+}
+
+void transfer_sync_data(struct pcs_cs_list * new_cs_list, struct pcs_cs_list * old_cs_list)
+{
+	int i, k;
+
+	if (new_cs_list->nsrv == 0 || old_cs_list->nsrv == 0)
+		return;
+
+	for (i = 0; i < new_cs_list->nsrv; i++) {
+		for (k = 0; k < old_cs_list->nsrv; k++) {
+			if (old_cs_list->cs[k].info.id.val == new_cs_list->cs[i].info.id.val) {
+				new_cs_list->cs[i].sync = old_cs_list->cs[k].sync;
+				break;
+			}
+		}
+	}
+}
+
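+/* A replica is considered dirty when its recorded dirty position
+ * (dirty_epoch, dirty_seq) is at or ahead of the last synced position
+ * (sync_epoch, sync_seq), i.e. it has acknowledged writes which have not
+ * been confirmed as synced yet.
+ */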
+static int cs_is_dirty(struct cs_sync_state * sync)
+{
+	int res;
+
+	if (!sync->dirty_integrity || !sync->dirty_epoch || !sync->dirty_seq)
+		return 0;
+
+	res = pcs_sync_seq_compare(sync->dirty_epoch, sync->sync_epoch);
+	if (!res)
+		res = pcs_sync_seq_compare(sync->dirty_seq, sync->sync_seq);
+
+	return res >= 0;
+}
+
+static void evaluate_dirty_status(struct pcs_map_entry * m)
+{
+	int i;
+
+	assert_spin_locked(&m->lock);
+
+	if (m->flags & PCS_MAP_DIRTY) {
+		m->flags &= ~PCS_MAP_DIRTY;
+		atomic_dec(&m->maps->dirty_count);
+	}
+
+	if (m->cs_list == NULL)
+		return;
+
+	for (i = 0; i < m->cs_list->nsrv; i++) {
+		struct pcs_cs_record * rec = m->cs_list->cs + i;
+
+		BUG_ON(rec->info.integrity_seq == 0);
+
+		if (cs_is_dirty(&rec->sync)) {
+			if (rec->sync.dirty_integrity == rec->info.integrity_seq) {
+				if (!(m->flags & PCS_MAP_DIRTY)) {
+					m->flags |= PCS_MAP_DIRTY;
+					atomic_inc(&m->maps->dirty_count);
+				}
+			} else {
+				TRACE(MAP_FMT " integrity seq advanced on CS#"NODE_FMT,
+				      MAP_ARGS(m), NODE_ARGS(rec->info.id));
+
+				rec->sync.dirty_integrity = 0;
+				rec->sync.dirty_epoch = 0;
+				rec->sync.dirty_seq = 0;
+			}
+		} else
+			rec->sync.dirty_integrity = 0;
+	}
+
+	if (!(m->flags & PCS_MAP_DIRTY)) {
+		map_sync_work_del(m);
+		pcs_log(LOG_DEBUG5, "map %p is clean", m);
+	} else {
+		pcs_log(LOG_DEBUG5, "map %p is dirty", m);
+		if (!timer_pending(&m->sync_work.timer) && !(m->flags & PCS_MAP_FLUSHING))
+			map_sync_work_add(m, pcs_sync_timeout(cc_from_map(m)));
+	}
+}
+
+int pcs_map_encode_req(struct pcs_map_entry*m, struct pcs_ioc_getmap *map, int direction)
+{
+	int i;
+
+	spin_lock(&m->lock);
+	BUG_ON(map_chunk_start(m) > m->res_offset);
+	BUG_ON(map_chunk_end(m) < m->res_offset);
+	/*
+	 * Someone truncated the mapping while IO is in progress;
+	 * an aio_dio vs truncate race?
+	 */
+	if (m->state & PCS_MAP_DEAD) {
+		spin_unlock(&m->lock);
+		pcs_map_put(m);
+		return 1;
+	}
+
+	map->uid = m->id;
+	map->version = m->version;
+	map->chunk_start = m->res_offset;
+	map->chunk_end = map_chunk_end(m);
+	map->state = 0;
+	if (m->state & PCS_MAP_READABLE)
+		map->state |= PCS_IOC_MAP_S_READ;
+	if (m->state & PCS_MAP_WRITEABLE || direction)
+		map->state |= PCS_IOC_MAP_S_WRITE;
+	if (m->state & PCS_MAP_NEW)
+		map->state |= PCS_IOC_MAP_S_NEW;
+	if (m->state & PCS_MAP_ERROR) {
+		map->state |= PCS_IOC_MAP_S_ERROR;
+		map->error = m->iofailure;
+	}
+	map->mds_flags = m->mds_flags;
+	map->psize_ret = 0;  /* UNUSED */
+	map->chunk_psize = 0; /* UNUSED */
+
+	if (m->cs_list && m->cs_list->nsrv) {
+		map->cs_cnt = m->cs_list->nsrv;
+		for (i = 0; i < m->cs_list->nsrv; i++) {
+			map->cs[i] = m->cs_list->cs[i].info;
+			if (!(m->flags & PCS_MAP_DIRTY) || !cs_is_dirty(&m->cs_list->cs[i].sync))
+				map->cs[i].integrity_seq = 0;
+		}
+	}
+
+#ifdef __PCS_DEBUG
+	printk("%s submit  m(%p)->uid:%lld\n", __FUNCTION__, m, m->id);
+	printk("map {id:%lld [%lld, %lld] v:{" VER_FMT "} st:%x, cnt:%d max:%d SZ:%ld}\n",
+	       m->id, map->chunk_start, map->chunk_end, VER_ARGS(m->version),
+	       map->state, map->cs_cnt, map->cs_max, map_sz);
+
+	printk("cs_list: ");
+	for (i = 0; i < map->cs_cnt; i++) {
+		printk("[%d]{id:%lld fl:%x} ",
+		       i, map->cs[i].id.val, map->cs[i].flags);
+	}
+	printk("\n.");
+#endif
+	spin_unlock(&m->lock);
+	return 0;
+}
+
+/*
+ * Alloc and initialize cslist, grab cs->lock inside
+ */
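+/* Note: read_tout and write_tout appear to be given in milliseconds; they are
+ * converted to jiffies below. */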
+struct pcs_cs_list* cslist_alloc( struct pcs_cs_set *css, struct pcs_cs_info *rec, int cs_cnt,
+				     int read_tout, int write_tout, int error_clear)
+{
+	struct pcs_cs_list * cs_list = NULL;
+	int i;
+
+	cs_list = kzalloc(sizeof(struct pcs_cs_list) + cs_cnt * sizeof(struct pcs_cs_record), GFP_NOFS);
+	if (!cs_list)
+		return NULL;
+
+	atomic_set(&cs_list->refcnt, 1);
+	atomic_set(&cs_list->seq_read_in_flight, 0);
+	cs_list->read_index = -1;
+	cs_list->cong_index = -1;
+	cs_list->flags = 0;
+	cs_list->blacklist = 0;
+	cs_list->read_timeout = (read_tout * HZ) / 1000;
+	cs_list->write_timeout = (write_tout * HZ) / 1000;
+	cs_list->nsrv = cs_cnt;
+	for (i = 0; i < cs_cnt; i++) {
+		cs_list->cs[i].info = rec[i];
+		memset(&cs_list->cs[i].sync, 0, sizeof(cs_list->cs[i].sync));
+		cs_list->cs[i].cslink.cs = NULL;
+		INIT_LIST_HEAD(&cs_list->cs[i].cslink.link);
+		cs_list->cs[i].cslink.index = i;
+	}
+
+	for (i = 0; i < cs_cnt; i++) {
+		struct pcs_cs_link * cslink = &cs_list->cs[i].cslink;
+		struct pcs_cs * cs;
+
+		if (cs_list->cs[i].info.flags & CS_FL_REPLICATING) {
+			__set_bit(i, &cs_list->blacklist);
+			cs_list->blacklist_expires = jiffies + PCS_REPLICATION_BLACKLIST_TIMEOUT;
+		}
+
+		cs = pcs_cs_find_create(css, &cs_list->cs[i].info.id,
+				 &cs_list->cs[i].info.addr, cs_list->cs[i].info.flags);
+
+		if (!cs) {
+			cslist_destroy(cs_list);
+			return NULL;
+		}
+		assert_spin_locked(&cs->lock);
+		BUG_ON(cs->is_dead);
+
+		cslink->cs = cs;
+		cslink->addr_serno = cs->addr_serno;
+
+		cs->io_prio = cs_list->cs[i].info.io_prio;
+		cs->net_prio = cs_list->cs[i].info.net_prio;
+		cs->io_prio_stamp = jiffies;
+
+		/* update cs state */
+		cs->mds_flags = cs_list->cs[i].info.flags;
+		if (cs->mds_flags & CS_FL_LOCAL) {
+			set_bit(CS_SF_LOCAL, &cs->state);
+			cs_list->flags |= CSL_FL_HAS_LOCAL;
+		}
+		if (cs->mds_flags & CS_FL_LOCAL_SOCK)
+			set_bit(CS_SF_LOCAL_SOCK, &cs->state);
+		if (cs->mds_flags & CS_FL_INACTIVE) {
+			set_bit(CS_SF_INACTIVE, &cs->state);
+			cs_blacklist(cs, PCS_ERR_NET_ABORT, "mds hint");
+		}
+		if (cs->mds_flags & CS_FL_REPLICATING)
+			set_bit(CS_SF_REPLICATING, &cs->state);
+		if (cs->mds_flags & CS_FL_FAILED)
+			set_bit(CS_SF_FAILED, &cs->state);
+
+		list_add(&cslink->link, &cs->map_list);
+		cs->nmaps++;
+		spin_unlock(&cs->lock);
+	}
+
+	for (i = cs_cnt - 1; i >= 0; i--) {
+		struct pcs_cs * cs = cs_list->cs[i].cslink.cs;
+		spin_lock(&cs->lock);
+		if (cs_is_blacklisted(cs) && !(test_bit(CS_SF_INACTIVE, &cs->state))) {
+			if (error_clear)
+				cs_whitelist(cs, "mds hint");
+			else if (urgent_whitelist(cs))
+				cs_whitelist(cs, "urgent");
+		}
+		spin_unlock(&cs->lock);
+	}
+
+	return cs_list;
+}
+
+void pcs_map_complete(struct pcs_map_entry *m, struct pcs_ioc_getmap *omap)
+{
+	pcs_error_t error = omap->error;
+	struct pcs_cs_list * cs_list = NULL;
+	struct list_head queue;
+	int error_sensed = 0;
+
+	INIT_LIST_HEAD(&queue);
+
+	spin_lock(&m->lock);
+
+	TRACE(" recv m: " MAP_FMT " resp{ st:%d, err:%d, v:" VER_FMT "}\n",
+	       MAP_ARGS(m), omap->state, omap->error.value, VER_ARGS(omap->version));
+
+	if (pcs_if_error(&omap->error))
+		goto error;
+
+	if (m->state & PCS_MAP_DEAD) {
+		spin_unlock(&m->lock);
+		goto out_ignore;
+	}
+
+	error_sensed = m->state & PCS_MAP_ERROR;
+
+	if (omap->cs_cnt) {
+		spin_unlock(&m->lock);
+		cs_list = cslist_alloc(&cc_from_map(m)->css, omap->cs, omap->cs_cnt, omap->read_tout, omap->write_tout, error_sensed);
+		spin_lock(&m->lock);
+		if (!cs_list) {
+			pcs_set_local_error(&error, PCS_ERR_NOMEM);
+			goto error;
+		}
+		/* Recheck one more time because we drop the lock */
+		if (m->state & PCS_MAP_DEAD) {
+			spin_unlock(&m->lock);
+			goto out_ignore;
+		}
+	}
+
+	if (!(m->state & PCS_MAP_RESOLVING)) {
+		/* This may happen because of the explicit assignment
+		 * m->state = PCS_MAP_ERROR in __pcs_map_error().
+		 * If m->state becomes atomic bit fields, this will be impossible.
+		 */
+		spin_unlock(&m->lock);
+		goto out_ignore;
+	}
+	pcs_map_reset(m);
+	m->id = omap->uid;
+	m->version = omap->version;
+
+	if (cs_list) {
+		if (m->cs_list) {
+			transfer_sync_data(cs_list, m->cs_list);
+			map_drop_cslist(m);
+		}
+		cs_list->map = m;
+		cs_list->version = m->version;
+		m->cs_list = cs_list;
+		cs_list = NULL;
+	} else if (m->state & PCS_MAP_NEW) {
+		/* This is supposed to be a zero chunk */
+		BUG_ON(!(m->state & (PCS_MAP_READABLE|PCS_MAP_NEW)));
+		map_drop_cslist(m);
+		m->chunk_psize = 0;
+		if (m->flags & PCS_MAP_DIRTY) {
+			m->flags &= ~PCS_MAP_DIRTY;
+			atomic_dec(&m->maps->dirty_count);
+		}
+
+	}
+
+	m->state = 0;
+	if (omap->state & PCS_IOC_MAP_S_READ)
+		m->state |= PCS_MAP_READABLE;
+	if (omap->state & PCS_IOC_MAP_S_WRITE)
+		m->state |= PCS_MAP_WRITEABLE;
+	if (omap->state & PCS_IOC_MAP_S_ERROR)
+		m->state |= PCS_MAP_ERROR;
+	if (omap->state & PCS_IOC_MAP_S_NEW) {
+		m->state |= PCS_MAP_NEW;
+		/* Userspace has an optimization which may return a map covering
+		 * a larger range, but this complicates locking.
+		 * Simply ignore it for now. */
+		if (omap->chunk_start < map_chunk_start(m))
+			omap->chunk_start = map_chunk_start(m);
+		if (map_chunk_end(m) < omap->chunk_end)
+			omap->chunk_end = map_chunk_end(m);
+	}
+	m->mds_flags = omap->mds_flags;
+	m->chunk_psize = omap->chunk_psize; /* UNUSED */
+	m->res_offset  = omap->chunk_start;
+	if (map_chunk_start(m) != omap->chunk_start ||
+	    map_chunk_end(m)   != omap->chunk_end) {
+		BUG();
+	}
+
+	evaluate_dirty_status(m);
+#ifdef __PCS_DEBUG
+	if (1) {
+		int i;
+		TRACE(MAP_FMT " -> " CUID_FMT " psize=%u %d node map { ",
+			MAP_ARGS(m), CUID_ARGS(m->id),
+		      m->chunk_psize, m->cs_list ? m->cs_list->nsrv : 0);
+		if (m->cs_list) {
+			for (i = 0; i < m->cs_list->nsrv; i++)
+				printk( NODE_FMT ":%x:%u ",
+				       NODE_ARGS(m->cs_list->cs[i].info.id),
+				       m->cs_list->cs[i].info.flags,
+				       CS_FL_ROLE_GET(m->cs_list->cs[i].info.flags));
+		}
+		printk("}\n");
+	}
+#endif
+	m->error_tstamp = 0;
+	list_splice_tail_init(&m->queue, &queue);
+	spin_unlock(&m->lock);
+
+	/* Success, resubmit waiting requests */
+	pcs_cc_requeue(cc_from_map(m), &queue);
+	BUG_ON(!list_empty(&queue));
+	pcs_map_put(m);
+
+	return;
+
+error:
+	TRACE(" map error: %d for " MAP_FMT "\n", error.value, MAP_ARGS(m));
+	BUG_ON(!pcs_if_error(&error));
+
+	m->state &= ~PCS_MAP_RESOLVING;
+	m->error_tstamp = jiffies;
+	list_splice_tail_init(&m->queue, &queue);
+	pcs_map_reset(m);
+	spin_unlock(&m->lock);
+
+	pcs_ireq_queue_fail(&queue, error.value);
+out_ignore:
+	BUG_ON(!list_empty(&queue));
+	pcs_map_put(m);
+	if (cs_list)
+		cslist_put(cs_list);
+}
+
+/* Atomically schedule map resolve and push ireq to wait completion */
+static void pcs_map_queue_resolve(struct pcs_map_entry * m, struct pcs_int_request *ireq, int direction)
+{
+	DTRACE("enter m: " MAP_FMT ", ireq:%p dir:%d \n", MAP_ARGS(m), ireq,   direction);
+
+	spin_lock(&m->lock);
+	/* This should not happen unless aio_dio/fsync vs truncate race */
+	if (m->state & PCS_MAP_DEAD) {
+		struct list_head l;
+
+		spin_unlock(&m->lock);
+		INIT_LIST_HEAD(&l);
+		list_add(&ireq->list, &l);
+		pcs_ireq_queue_fail(&l, PCS_ERR_NET_ABORT);
+		return;
+	}
+	DTRACE("%p {%p %p}\n",ireq,  ireq->list.next, ireq->list.prev);
+	BUG_ON(!list_empty(&ireq->list));
+
+	list_add_tail(&ireq->list, &m->queue);
+	if (m->state & PCS_MAP_RESOLVING) {
+		spin_unlock(&m->lock);
+		return;
+	}
+	/* If converting a hole, adjust res_offset */
+	if (direction && !m->cs_list && !(m->state & PCS_MAP_RESOLVING)
+	    && ireq->type == PCS_IREQ_IOCHUNK)
+		m->res_offset = ireq->iochunk.chunk + ireq->iochunk.offset;
+
+	m->state |= PCS_MAP_RESOLVING;
+	__pcs_map_get(m); /* drop on pcs_map_complete */
+
+	spin_unlock(&m->lock);
+	/// TODO: THINK!!!!
+	/// Maybe it is reasonable to schedule fuse_map_resolve from a workqueue?
+	fuse_map_resolve(m, direction);
+}
+
+/* If the version on m has not been advanced yet, we must notify MDS about the error.
+ * If it has already been advanced, we just ignore the error in the hope that the new
+ * map will work.
+ */
+static void map_notify_error(struct pcs_map_entry * m, struct pcs_int_request * sreq,
+			     PCS_MAP_VERSION_T * failed_version, struct pcs_cs_list * csl)
+{
+	int cs_notify = 0;
+
+	spin_lock(&m->lock);
+	if (m->state & PCS_MAP_DEAD) {
+		spin_unlock(&m->lock);
+		return;
+	}
+	if (sreq->error.remote &&
+	    !(m->state & (PCS_MAP_ERROR|PCS_MAP_NEW|PCS_MAP_RESOLVING|PCS_MAP_DEAD)) &&
+	    map_version_compare(failed_version, &m->version) >= 0) {
+		int suppress_error = 0;
+
+		if (csl) {
+			int i;
+
+			for (i = 0; i < csl->nsrv; i++) {
+				if (csl->cs[i].info.id.val == sreq->error.offender.val) {
+					if (csl->cs[i].cslink.cs->addr_serno != csl->cs[i].cslink.addr_serno) {
+						TRACE("error for CS"NODE_FMT " has been suppressed", NODE_ARGS(sreq->error.offender));
+						suppress_error = 1;
+					}
+					break;
+				}
+			}
+		}
+		if (suppress_error)
+			map_remote_error_nolock(m, PCS_ERR_CSD_STALE_MAP, sreq->error.offender.val);
+		else {
+			map_remote_error_nolock(m, sreq->error.value, sreq->error.offender.val);
+			cs_notify = 1;
+		}
+	}
+	spin_unlock(&m->lock);
+	if (cs_notify)
+		pcs_cs_notify_error(sreq->dentry->cluster, &sreq->error);
+}
+
+/* This function notifies map about fatal error, which does not result in request restart.
+ * Even though the request is not retried internally, it can be retried by client, so that
+ * we have to force invalidation of current version.
+ */
+void map_notify_iochunk_error(struct pcs_int_request * sreq)
+{
+	struct pcs_map_entry * m = sreq->iochunk.map;
+
+	if (!m || (m->state & PCS_MAP_DEAD))
+		return;
+
+	map_notify_error(m, sreq, &sreq->iochunk.hbuf.map_version, sreq->iochunk.csl);
+}
+
+static void map_replicating(struct pcs_int_request *ireq)
+{
+	struct pcs_cs_list * csl = ireq->iochunk.csl;
+	int read_idx = READ_ONCE(csl->read_index);
+
+	BUG_ON(ireq->iochunk.direction);
+
+	if (csl == NULL || csl->map == NULL)
+		return;
+
+	TRACE("reading unfinished replica %lx %d", csl->blacklist, read_idx);
+
+	if (ireq->iochunk.cs_index != read_idx)
+		return;
+
+	BUG_ON(read_idx < 0 || read_idx >= csl->nsrv);
+
+	if (!ireq->error.remote ||
+	    csl->cs[read_idx].cslink.cs->id.val != ireq->error.offender.val) {
+		TRACE("wrong cs id " NODE_FMT " " NODE_FMT, NODE_ARGS(csl->cs[read_idx].cslink.cs->id), NODE_ARGS(ireq->error.offender));
+		return;
+	}
+
+	/* If request was issued for the last CS in the list, clear error. */
+	pcs_clear_error(&ireq->error);
+	WRITE_ONCE(csl->blacklist_expires, jiffies + PCS_REPLICATION_BLACKLIST_TIMEOUT);
+
+	/* And blacklist the last replica */
+	if (!(test_bit(read_idx, &csl->blacklist))) {
+		WRITE_ONCE(csl->read_index, -1);
+		set_bit(read_idx, &csl->blacklist);
+	}
+}
+
+static void map_read_error(struct pcs_int_request *ireq)
+{
+	struct pcs_cs_list * csl = ireq->iochunk.csl;
+	struct pcs_cs * cs;
+
+	BUG_ON(ireq->iochunk.direction);
+
+	if (csl == NULL || csl->map == NULL || (csl->map->state & PCS_MAP_ERROR))
+		return;
+
+	cs = csl->cs[ireq->iochunk.cs_index].cslink.cs;
+
+	if (ireq->flags & IREQ_F_MAPPED) {
+		cs_blacklist_unlocked(cs, ireq->error.value, "error on directly mapped CS");
+		return;
+	}
+
+	/* If everything is already blacklisted, proceed with reporting the error to MDS */
+	if (all_blacklisted(csl)) {
+		cs_blacklist_unlocked(cs, ireq->error.value, "total read error");
+		return;
+	}
+
+	/* If this CS is already blacklisted, select another CS, we have spare ones */
+	if (cs_is_blacklisted(cs)) {
+		TRACE("Skipping CS" NODE_FMT, NODE_ARGS(cs->id));
+		WRITE_ONCE(csl->read_index, -1);
+		pcs_clear_error(&ireq->error);
+		return;
+	}
+
+	/* Mark CS as dubious */
+	if (csl->cs[ireq->iochunk.cs_index].cslink.addr_serno == cs->addr_serno)
+		cs_blacklist_unlocked(cs, ireq->error.value, "read error");
+
+	/* If some clean CSes remained, select another one, otherwise report error to MDS */
+	if (!all_blacklisted(csl)) {
+		WRITE_ONCE(csl->read_index, -1);
+		pcs_clear_error(&ireq->error);
+	}
+}
+
+static unsigned int cong_roundup(unsigned int size)
+{
+	return (size + 65535) & ~65535;
+}
+
+static int worth_to_grow(struct pcs_int_request *ireq, struct pcs_cs * cs)
+{
+	if (ireq->type == PCS_IREQ_FLUSH)
+		return 0;
+
+	return jiffies < ireq->ts_sent + cc_from_csset(cs->css)->netlat_cutoff;
+}
+
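+/* Congestion window management, a sketch of the arithmetic (cost and cwnd are
+ * in the same units as ireq->iochunk.size, i.e. bytes):
+ *
+ *  - on success with a high measured IO latency the window is clamped to a
+ *    fraction of PCS_CS_INIT_CWND (1/2, 1/4 or 1/8 depending on how far the
+ *    latency is above iolat_cutoff);
+ *  - on a fast success while the pipe is full the window grows: below
+ *    PCS_CS_INIT_CWND it grows by the request cost (slow start), above it by
+ *    2^32/cwnd per completion, e.g. with cwnd = 1MB the increment is
+ *    0x100000000 / 0x100000 = 4096, which approximates the classic
+ *    "one segment per window" congestion avoidance;
+ *  - on a remote error eff_cwnd is dropped to 1, so only a single probe
+ *    request stays in flight until the CS is known to be alive again.
+ */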
+static void pcs_cs_deaccount(struct pcs_int_request *ireq, struct pcs_cs * cs, int error)
+{
+	unsigned int cost;
+
+	spin_lock(&cs->lock);
+	if (ireq->type == PCS_IREQ_IOCHUNK)
+		cost = (ireq->flags & IREQ_F_RND_WEIGHT) ? 512*1024 : cong_roundup(ireq->iochunk.size);
+	else
+		cost = PCS_CS_FLUSH_WEIGHT;
+
+	if (!error) {
+		int iolat_cutoff = cc_from_csset(cs->css)->iolat_cutoff;
+
+		if (cs->last_latency > iolat_cutoff && ireq->type != PCS_IREQ_FLUSH) {
+			unsigned int clamp;
+
+			clamp = PCS_CS_INIT_CWND;
+			if (cs->last_latency > iolat_cutoff*8)
+				clamp = PCS_CS_INIT_CWND/8;
+			else if (cs->last_latency > iolat_cutoff*4)
+				clamp = PCS_CS_INIT_CWND/4;
+			else if (cs->last_latency > iolat_cutoff*2)
+				clamp = PCS_CS_INIT_CWND/2;
+
+			TRACE("IO latency on CS" NODE_FMT " is %u, cwnd %u, clamp %u", NODE_ARGS(cs->id), cs->last_latency, cs->cwnd, clamp);
+
+			if (cs->cwnd > clamp)
+				cs->cwnd = clamp;
+		} else if (cs->in_flight >= cs->cwnd && !cs->cwr_state && worth_to_grow(ireq, cs)) {
+			unsigned int cwnd;
+
+			if (cs->cwnd < PCS_CS_INIT_CWND)
+				cwnd = cs->cwnd + cost;
+			else
+				cwnd = cs->cwnd + 0x100000000ULL/cs->cwnd;
+
+			if (cwnd > PCS_CS_MAX_CWND)
+				cwnd = PCS_CS_MAX_CWND;
+			if (cwnd != cs->cwnd) {
+				cs->cwnd = cwnd;
+				DTRACE("Congestion window on CS" NODE_FMT " UP %u", NODE_ARGS(cs->id), cwnd);
+			}
+		}
+		cs->eff_cwnd = cs->cwnd;
+		cs_whitelist(cs, "io hint");
+	} else if (error > 0) {
+		/* In case of an error coming from some CS, temporarily shrink the congestion
+		 * window to the minimum, allowing one request in flight. It will come back to
+		 * normal as soon as the CS is probed for aliveness.
+		 */
+		TRACE("Congestion window on CS" NODE_FMT " is closed (%u)", NODE_ARGS(cs->id), cs->cwnd);
+		cs->eff_cwnd = 1;
+	}
+	cs_decrement_in_flight(cs, cost);
+	spin_unlock(&cs->lock);
+}
+
+static void pcs_cs_wakeup(struct pcs_cs * cs, int requeue)
+{
+	struct pcs_int_request * sreq;
+	struct pcs_map_entry * map;
+
+	while (1) {
+		spin_lock(&cs->lock);
+
+		if (cs->in_flight >= cs->eff_cwnd || list_empty(&cs->active_list)) {
+			spin_unlock(&cs->lock);
+			break;
+		}
+		sreq = list_first_entry(&cs->active_list, struct pcs_int_request, list);
+		BUG_ON(!cs->active_list_len);
+		list_del_init(&sreq->list);
+		cs->active_list_len--;
+		spin_unlock(&cs->lock);
+
+		if (sreq->type != PCS_IREQ_FLUSH) {
+			map = pcs_find_get_map(sreq->dentry, sreq->iochunk.chunk +
+						   ((sreq->flags & IREQ_F_MAPPED) ? 0 : sreq->iochunk.offset));
+			if (map) {
+				if (sreq->iochunk.map)
+					pcs_map_put(sreq->iochunk.map);
+				sreq->iochunk.map = map;
+				if (sreq->iochunk.flow) {
+					struct pcs_int_request * preq = sreq->completion_data.parent;
+
+					pcs_flow_confirm(sreq->iochunk.flow, &map->mapping->ftab, preq->apireq.req->type == PCS_REQ_T_WRITE,
+							 preq->apireq.req->pos, preq->apireq.req->size,
+							 &sreq->cc->maps.ftab);
+				}
+				map_submit(map, sreq, requeue);
+			} else {
+				map_queue_on_limit(sreq);
+			}
+		} else {
+			map = sreq->flushreq.map;
+			if (map->state & PCS_MAP_DEAD) {
+				pcs_clear_error(&sreq->error);
+				ireq_complete(sreq);
+			} else
+				map_submit(map, sreq, requeue);
+		}
+	}
+}
+
+static int __pcs_cs_still_congested(struct pcs_cs * cs)
+{
+	assert_spin_locked(&cs->lock);
+
+	if (!list_empty(&cs->active_list)) {
+		BUG_ON(!cs->active_list_len);
+		list_splice_tail(&cs->active_list, &cs->cong_queue);
+		cs->cong_queue_len += cs->active_list_len;
+		set_bit(CS_SF_CONGESTED, &cs->state);
+		pcs_cs_init_active_list(cs);
+	} else if (list_empty(&cs->cong_queue)) {
+		BUG_ON(cs->cong_queue_len);
+		BUG_ON(test_bit(CS_SF_CONGESTED, &cs->state));
+		return 0;
+	} else {
+		BUG_ON(cs->active_list_len);
+	}
+
+	if (cs->in_flight >= cs->eff_cwnd)
+		return 0;
+
+	/* Exceptional situation: the CS is not congested, but still has a congestion queue.
+	 * This can happen e.g. when the CS was congested with reads and has some writes in
+	 * the queue; then all the reads complete, but the writes cannot be sent because of
+	 * congestion on other CSes in the chain. This is absolutely normal, we just should
+	 * have queued not on this CS, but on the actually congested CSes. With the current
+	 * algorithm for preventing reordering we made a mistake and queued on a node which
+	 * used to be congested. The solution for now is to retry sending with the "requeue"
+	 * flag set, which will requeue the requests on other nodes. It is difficult to say
+	 * how frequently this happens, so we spit out a message. If we see lots of them in
+	 * the logs, we have to select a different solution.
+	 */
+
+	TRACE("CS#" NODE_FMT " is free, but still has queue", NODE_ARGS(cs->id));
+	pcs_cs_flush_cong_queue(cs);
+
+	return 1;
+}
+static int pcs_cs_still_congested(struct pcs_cs * cs)
+{
+	int ret;
+
+	spin_lock(&cs->lock);
+	ret = __pcs_cs_still_congested(cs);
+	spin_unlock(&cs->lock);
+	return ret;
+}
+
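+/* Release the congestion accounting taken when the request was submitted.
+ * For writes and flushes the cost was charged to every CS in the list, so it
+ * is released on all of them; the error (if any) is attributed only to the
+ * reported offender, the CSes after it in the list are deaccounted with an
+ * "unknown" result (-1) and the CSes before it as successful.  For reads only
+ * the CS the request was actually sent to is touched.
+ */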
+void pcs_deaccount_ireq(struct pcs_int_request *ireq, pcs_error_t * err)
+{
+	int error = 0;
+	unsigned long long match_id = 0;
+	struct pcs_cs_list * csl, ** csl_p = NULL;
+
+	switch (ireq->type) {
+	case PCS_IREQ_IOCHUNK:
+		csl_p = &ireq->iochunk.csl;
+		if (ireq->iochunk.map) {
+			pcs_map_put(ireq->iochunk.map);
+			ireq->iochunk.map = NULL;
+		}
+		break;
+	case PCS_IREQ_FLUSH:
+		csl_p = &ireq->flushreq.csl;
+		break;
+	default:
+		BUG();
+	}
+
+	if ((csl = *csl_p) == NULL)
+		return;
+
+	if (pcs_if_error(err)) {
+		if (!err->remote) {
+			error = -1;
+		} else {
+			match_id = err->offender.val;
+			error = err->value;
+
+			switch (error) {
+			case PCS_ERR_CSD_STALE_MAP:
+			case PCS_ERR_CSD_REPLICATING:
+			case PCS_ERR_CSD_RO_MAP:
+				error = 0;
+			}
+		}
+	}
+
+	if (ireq->type == PCS_IREQ_FLUSH || (ireq->iochunk.direction && !(ireq->flags & IREQ_F_MAPPED))) {
+		int i;
+		int requeue = 0;
+
+		for (i = csl->nsrv - 1; i >= 0; i--) {
+			if (!match_id || csl->cs[i].cslink.cs->id.val == match_id)
+				break;
+
+			pcs_cs_deaccount(ireq, csl->cs[i].cslink.cs, -1);
+		}
+
+		if (i >= 0) {
+			pcs_cs_deaccount(ireq, csl->cs[i].cslink.cs, error);
+			i--;
+		}
+
+		for ( ; i >= 0; i--) {
+			pcs_cs_deaccount(ireq, csl->cs[i].cslink.cs, 0);
+		}
+
+		do {
+			for (i = csl->nsrv - 1; i >= 0; i--)
+				pcs_cs_wakeup(csl->cs[i].cslink.cs, requeue);
+
+			requeue = 0;
+			for (i = csl->nsrv - 1; i >= 0; i--)
+				requeue += pcs_cs_still_congested(csl->cs[i].cslink.cs);
+		} while (requeue);
+	} else {
+		int requeue = 0;
+		struct pcs_cs * rcs = csl->cs[ireq->iochunk.cs_index].cslink.cs;
+
+		if (ireq->flags & IREQ_F_SEQ_READ) {
+			ireq->flags &= ~IREQ_F_SEQ_READ;
+			if (atomic_dec_and_test(&csl->seq_read_in_flight))
+				WRITE_ONCE(csl->select_stamp, jiffies);
+		}
+
+		pcs_cs_deaccount(ireq, rcs, error);
+
+		do {
+			pcs_cs_wakeup(rcs, requeue);
+
+			requeue = pcs_cs_still_congested(rcs);
+		} while (requeue);
+	}
+	*csl_p = NULL;
+	cslist_put(csl);
+}
+
+void map_notify_soft_error(struct pcs_int_request *ireq)
+{
+	pcs_error_t err;
+
+	if (ireq->error.value == PCS_ERR_CSD_REPLICATING)
+		map_replicating(ireq);
+
+	if (ireq->error.value == PCS_ERR_CANCEL_KEEPWAIT)
+		pcs_clear_error(&ireq->error);
+
+	err = ireq->error;
+
+	if (!ireq->iochunk.direction &&
+	    pcs_if_error(&err) &&
+	    err.remote &&
+	    err.value != PCS_ERR_CSD_STALE_MAP &&
+	    err.value != PCS_ERR_CSD_REPLICATING &&
+	    err.value != PCS_ERR_CSD_RO_MAP)
+		map_read_error(ireq);
+
+	if (pcs_if_error(&ireq->error))
+		map_notify_iochunk_error(ireq);
+
+	if (map_version_compare(&ireq->iochunk.hbuf.map_version, &ireq->iochunk.map->version) < 0)
+		ireq->flags &= ~IREQ_F_ONCE;
+
+	pcs_deaccount_ireq(ireq, &err);
+}
+
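+/* Translate the MDS-reported io_prio of a CS into a synthetic latency, used
+ * by select_cs_for_read() while the measured IO latency of the CS is still
+ * negligible and the priority report is recent.  Lower priorities map to
+ * larger values, so such CSes are avoided for reads until real measurements
+ * arrive (the values are presumably in the same units as
+ * __cs_get_avg_latency()).
+ */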
+static unsigned int map_ioprio_to_latency(unsigned int io_prio)
+{
+	static unsigned int map[] = {
+		50000,
+		50000,
+		10000,
+		4000,
+		2000,
+	};
+
+	if (io_prio < sizeof(map)/sizeof(map[0]))
+		return map[io_prio];
+	else
+		return 500;
+}
+
+static int get_io_locality(struct pcs_cluster_core *cc)
+{
+	int io_locality;
+
+	io_locality = cc->io_locality;
+	if (io_locality == 0)
+		io_locality = cc->cfg.curr.io_locality;
+
+	return io_locality;
+}
+
+static unsigned int get_io_tweaks(struct pcs_cluster_core *cc)
+{
+	unsigned int io_tweaks;
+
+	io_tweaks = cc->io_tweaks;
+	if (io_tweaks == 0)
+		io_tweaks = cc->cfg.curr.io_tweaks;
+
+	return io_tweaks;
+}
+
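+/* Pick the CS to read this chunk from.  Each candidate is weighted by the sum
+ * of its average IO and network latencies (plus an optional flow-load term);
+ * failed or banned CSes are skipped on the first pass and reconsidered only
+ * if nothing else is left, blacklisted CSes are never picked.  The final
+ * choice between the best local and the best remote candidate depends on
+ * io_locality and on whether the flow looks sequential; a latency penalty is
+ * then charged to the winner so that subsequent selections do not pile onto
+ * it before real latency feedback arrives.
+ */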
+static int select_cs_for_read(struct pcs_cluster_core *cc, struct pcs_cs_list * csl, int is_seq, unsigned int pos, PCS_NODE_ID_T banned_cs)
+{
+	abs_time_t now = jiffies;
+	unsigned int local_min, remote_min, local_pipe, remote_pipe;
+	unsigned int local_mask, local_busy_mask;
+	int local_idx, remote_idx, selected;
+	int io_locality = get_io_locality(cc);
+	int io_cost;
+	int failed_cnt = 0;
+	int i;
+
+next_pass:
+
+	local_min = remote_min = local_pipe = remote_pipe = ~0U;
+	local_idx = remote_idx = -1;
+	local_mask = local_busy_mask = 0;
+
+	for (i = csl->nsrv - 1; i >= 0; i--) {
+		struct pcs_cs * cs = csl->cs[i].cslink.cs;
+		unsigned int w, io_lat, net_lat;
+		unsigned int in_flight;
+		abs_time_t io_prio_stamp;
+
+		if (failed_cnt >= 0 && ((test_bit(CS_SF_FAILED, &cs->state)) || cs->id.val == banned_cs.val)) {
+			failed_cnt++;
+			continue;
+		}
+
+		if (test_bit(i, &csl->blacklist)) {
+			if (jiffies < READ_ONCE(csl->blacklist_expires))
+				continue;
+			TRACE("expire replication blacklist");
+			clear_bit(i, &csl->blacklist);
+		}
+
+		if (cs_is_blacklisted(cs))
+			continue;
+
+		io_lat = __cs_get_avg_latency(cs, now);
+		net_lat = __cs_get_avg_net_latency(cs, now);
+		in_flight = READ_ONCE(cs->in_flight);
+		io_prio_stamp = READ_ONCE(cs->io_prio_stamp);
+
+		w = io_lat + net_lat;
+
+		if ((io_lat >> CS_LAT_EWMA_LOG) == 0 &&
+		    now < io_prio_stamp + PCS_CS_IO_PRIO_VALID_TIME)
+			w = map_ioprio_to_latency(READ_ONCE(cs->io_prio)) + net_lat;
+
+		if (get_io_tweaks(cc) & PCS_TWEAK_USE_FLOW_LOAD)
+			w += pcs_flow_cs_analysis(cs) * 8000;
+
+		if (w <= remote_min) {
+			if (w < remote_min || in_flight <= remote_pipe) {
+				remote_min = w;
+				remote_pipe = in_flight;
+				remote_idx = i;
+			}
+		}
+
+		if (test_bit(CS_SF_LOCAL, &cs->state)) {
+			local_mask |= (1 << i);
+			if (io_lat > 1000)
+				local_busy_mask |= (1 << i);
+
+			if (w < local_min || (w == local_min && in_flight <= local_pipe)) {
+				local_min = w;
+				local_pipe = in_flight;
+				local_idx = i;
+			}
+		}
+	}
+
+	if (remote_idx < 0) {
+		if (failed_cnt > 0) {
+			failed_cnt = -1;
+			goto next_pass;
+		}
+		return -1;
+	}
+
+	/* If the flow is sequential, but we have too many sequential flows, consider
+	 * all of them random, which is essentially true.
+	 */
+	io_cost = 8000;
+	if (is_seq) {
+		int nflows = pcs_flow_analysis(&cc->maps.ftab);
+
+		if (nflows >= PCS_FLOW_THRESH && io_locality < 0)
+			is_seq = 0;
+
+		if (nflows < PCS_FLOW_THRESH)
+			io_cost = 500;
+	}
+
+	if (local_idx < 0)
+		selected = remote_idx;
+	else if (io_locality > 0)
+		selected = local_idx;
+	else if (io_locality == 0 && local_mask != local_busy_mask) {
+		selected = local_idx;
+		io_cost = local_min / 16;
+	} else if (get_io_tweaks(cc) & PCS_TWEAK_IGNORE_SEQUENTIAL)
+		selected = remote_idx;
+	else {
+		if (is_seq)
+			selected = local_idx;
+		else
+			selected = remote_idx;
+	}
+
+	/* Add a penalty. The result of the current decision will reflect itself in latency
+	 * after at least one round-trip time; the penalty poisons the weight until that moment.
+	 * Ideally it should decay and be replaced by the EWMA average introduced by the
+	 * increased latency. Think about a better algorithm; maybe it is the key to a finally
+	 * correct algorithm.
+	 */
+	if (!(get_io_tweaks(cc) & PCS_TWEAK_USE_FLOW_LOAD))
+		cs_account_latency(csl->cs[selected].cslink.cs, io_cost);
+
+	return selected;
+}
+
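+/* Split the head of ireq off into a new sub-request of at most 'iochunk'
+ * bytes and advance ireq past the split point.  Unless 'noalign' is set, the
+ * split size is trimmed by the misalignment of the offset, so that (for
+ * 4K-multiple split sizes) the remainder starts on a 4K boundary: e.g. with
+ * offset = 0x11200 and iochunk = 0x10000 the sub-request gets
+ * 0x10000 - 0x200 = 0xfe00 bytes and the remainder resumes at offset 0x21000.
+ */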
+struct pcs_int_request *
+pcs_ireq_split(struct pcs_int_request *ireq, unsigned int iochunk, int noalign)
+{
+	struct pcs_int_request * sreq;
+
+	sreq = ireq_alloc(ireq->dentry);
+	if (!sreq)
+		return NULL;
+
+	sreq->dentry = ireq->dentry;
+	sreq->type = PCS_IREQ_IOCHUNK;
+	sreq->flags = ireq->flags;
+	sreq->iochunk.map = ireq->iochunk.map;
+	if (sreq->iochunk.map)
+		__pcs_map_get(sreq->iochunk.map);
+	sreq->iochunk.flow = pcs_flow_get(ireq->iochunk.flow);
+	sreq->iochunk.direction = ireq->iochunk.direction;
+	sreq->iochunk.role = ireq->iochunk.role;
+	sreq->iochunk.cs_index = ireq->iochunk.cs_index;
+	sreq->iochunk.chunk = ireq->iochunk.chunk;
+	sreq->iochunk.offset = ireq->iochunk.offset;
+	sreq->iochunk.dio_offset = ireq->iochunk.dio_offset;
+	if (!noalign &&
+	    (sreq->iochunk.offset & 4095) &&
+	    iochunk > (sreq->iochunk.offset & 4095) &&
+	    ireq->iochunk.map &&
+	    sreq->iochunk.chunk + sreq->iochunk.offset + iochunk != map_chunk_end(ireq->iochunk.map))
+		iochunk -= (sreq->iochunk.offset & 4095);
+	sreq->iochunk.size = iochunk;
+
+	if (ireq->flags & IREQ_F_LOC_TOKEN)
+		BUG();
+
+	sreq->iochunk.csl = NULL;
+	sreq->iochunk.banned_cs.val = 0;
+	sreq->complete_cb = ireq->complete_cb;
+	sreq->iochunk.msg.destructor = NULL;
+	sreq->iochunk.msg.rpc = NULL;
+	pcs_sreq_attach(sreq, ireq->completion_data.parent);
+
+	ireq->iochunk.size -= iochunk;
+	ireq->iochunk.offset += iochunk;
+	ireq->iochunk.dio_offset += iochunk;
+
+	return sreq;
+}
+
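+/* Submit a read.  A single CS is (re)selected for the whole chunk, cached in
+ * csl->read_index and reused until a rebalance is forced.  The request is cut
+ * into lmss/rmss-sized pieces; each piece is charged to the congestion window
+ * of the selected CS and submitted, and once the window allotment is
+ * exhausted the remainder is parked on that CS congestion queue.
+ */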
+static int pcs_cslist_submit_read(struct pcs_int_request *ireq, struct pcs_cs_list * csl, int requeue)
+{
+	struct pcs_cluster_core *cc = ireq->cc;
+	struct pcs_cs * cs;
+	unsigned int iochunk;
+	int allot;
+	int i = -1;
+	int is_seq, csl_seq = atomic_read(&csl->seq_read_in_flight);
+
+	is_seq = csl_seq || pcs_flow_sequential(ireq->iochunk.flow);
+	i = READ_ONCE(csl->read_index);
+
+	if (i >= 0) {
+		abs_time_t now = jiffies;
+		abs_time_t selected = READ_ONCE(csl->select_stamp);
+
+		cs = csl->cs[i].cslink.cs;
+
+		/* Force a rebalance after a long timeout, or when there is no sequential IO
+		 * on this chunk and a new read begins from the chunk start.
+		 * Also rebalance after a short timeout, but only if one of the following conditions holds:
+		 * 1. No active sequential reads on this chunk, including this one.
+		 * 2. io_locality < 0
+		 * 3. No active sequential reads, and the sequential read goes to a remote CS. Maybe we want to switch to local.
+		 */
+		if (now > selected + PCS_MAP_MAX_REBALANCE_TIMEOUT ||
+		    (!csl_seq && ireq->iochunk.offset == 0) ||
+		    (get_io_tweaks(cc) & PCS_TWEAK_REBALANCE_ALWAYS) ||
+		    (now > selected + PCS_MAP_MIN_REBALANCE_TIMEOUT &&
+		     (!is_seq || get_io_locality(cc) < 0 ||
+		      (!csl_seq &&
+		       !(test_bit(CS_SF_LOCAL, &cs->state)) && (csl->flags & CSL_FL_HAS_LOCAL))))) {
+			i = -1;
+			WRITE_ONCE(csl->read_index, -1);
+		}
+	}
+
+	if (i < 0) {
+		i = select_cs_for_read(cc, csl, is_seq, ireq->iochunk.offset, ireq->iochunk.banned_cs);
+
+		if (i < 0) {
+			/* All CSes are blacklisted. Generate an error for the first one
+			 * and let MDS figure out what happened with the rest.
+			 */
+			cs = csl->cs[0].cslink.cs;
+			map_remote_error(ireq->iochunk.map, cs->blacklist_reason, cs->id.val);
+
+			TRACE("Read from " MAP_FMT " blocked by blacklist error %d, CS" NODE_FMT,
+			      MAP_ARGS(ireq->iochunk.map), cs->blacklist_reason, NODE_ARGS(cs->id));
+			return -1;
+		}
+
+		WRITE_ONCE(csl->read_index, i);
+		WRITE_ONCE(csl->select_stamp, jiffies);
+
+		TRACE("Selected read map " MAP_FMT " to CS" NODE_FMT "; is_seq=%d\n", MAP_ARGS(ireq->iochunk.map),
+		      NODE_ARGS(csl->cs[i].cslink.cs->id), is_seq);
+		pcs_flow_bind_cs(ireq->iochunk.flow, csl->cs[i].cslink.cs);
+	}
+	cs = csl->cs[i].cslink.cs;
+
+	ireq->iochunk.cs_index = i;
+
+	spin_lock(&cs->lock);
+	cs_cwnd_use_or_lose(cs);
+	allot = cs->eff_cwnd - cs->in_flight;
+	spin_unlock(&cs->lock);
+
+	if (allot < 0) {
+		pcs_cs_cong_enqueue(ireq, cs);
+
+		return 0;
+	}
+
+	if (allot < ireq->dentry->cluster->cfg.curr.lmss)
+		allot = ireq->dentry->cluster->cfg.curr.lmss;
+
+	if (test_bit(CS_SF_LOCAL, &cs->state))
+		iochunk = ireq->dentry->cluster->cfg.curr.lmss;
+	else
+		iochunk = ireq->dentry->cluster->cfg.curr.rmss;
+
+	for (;;) {
+		struct pcs_int_request * sreq = ireq;
+		unsigned int weight;
+
+		if (ireq->iochunk.size > iochunk) {
+			sreq = pcs_ireq_split(ireq, iochunk, 0);
+
+			if (sreq == NULL) {
+				pcs_set_local_error(&ireq->error, PCS_ERR_NOMEM);
+				ireq_complete(ireq);
+				return 0;
+			}
+		}
+
+		sreq->flags &= ~(IREQ_F_RND_WEIGHT | IREQ_F_SEQ);
+		BUG_ON(sreq->flags & IREQ_F_SEQ_READ);
+		if (pcs_flow_sequential(sreq->iochunk.flow)) {
+			sreq->flags |= IREQ_F_SEQ_READ | IREQ_F_SEQ;
+			atomic_inc(&csl->seq_read_in_flight);
+			weight = cong_roundup(sreq->iochunk.size);
+		} else if (sreq->iochunk.size >= 512*1024 || !(get_io_tweaks(cc) & PCS_TWEAK_USE_FLOW_WEIGHT)) {
+			weight = cong_roundup(sreq->iochunk.size);
+		} else {
+			sreq->flags |= IREQ_F_RND_WEIGHT;
+			weight = 512*1024;
+		}
+
+		cs_increment_in_flight(cs, weight);
+		allot -= weight;
+
+		BUG_ON(sreq->iochunk.csl);
+		cslist_get(csl);
+		sreq->iochunk.csl = csl;
+		pcs_cs_submit(cs, sreq);
+
+		if (sreq == ireq)
+			return 0;
+
+		if (allot < 0) {
+			pcs_cs_cong_enqueue(ireq, cs);
+			return 0;
+		}
+	}
+}
+
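+/* Submit a write.  The cost of every piece is charged to all CSes in the
+ * list (the data presumably travels through the whole replication chain),
+ * while the actual submission goes to cs[0].  If some CS has a closed window,
+ * the request is parked on the most congested one, or on the CS we already
+ * queued behind, to avoid reordering.
+ */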
+static int pcs_cslist_submit_write(struct pcs_int_request *ireq, struct pcs_cs_list * csl, int requeue)
+{
+	struct pcs_cs * cs;
+	unsigned int iochunk;
+	int i;
+	int congested_idx;
+	int max_excess;
+	int allot;
+
+	ireq->iochunk.cs_index = 0;
+	iochunk = ireq->dentry->cluster->cfg.curr.lmss;
+
+restart:
+	congested_idx = -1;
+	max_excess = 0;
+	allot = ireq->iochunk.size;
+
+	for (i = 0; i < csl->nsrv; i++) {
+		int cs_allot;
+
+		cs = csl->cs[i].cslink.cs;
+		if (cs_is_blacklisted(cs)) {
+			map_remote_error(ireq->iochunk.map, cs->blacklist_reason, cs->id.val);
+			TRACE("Write to " MAP_FMT " blocked by blacklist error %d, CS" NODE_FMT,
+			      MAP_ARGS(ireq->iochunk.map), cs->blacklist_reason, NODE_ARGS(cs->id));
+			return -1;
+		}
+		spin_lock(&cs->lock);
+		cs_cwnd_use_or_lose(cs);
+		cs_allot = cs->eff_cwnd - cs->in_flight;
+		spin_unlock(&cs->lock);
+
+		if (cs_allot < 0) {
+			cs_allot = -cs_allot;
+			if (cs_allot > max_excess) {
+				congested_idx = i;
+				max_excess = cs_allot;
+			}
+		} else {
+			if (cs_allot < allot)
+				allot = cs_allot;
+		}
+
+		if (!(test_bit(CS_SF_LOCAL, &cs->state)))
+			iochunk = ireq->dentry->cluster->cfg.curr.wmss;
+	}
+
+	if (congested_idx >= 0) {
+		int cur_cong_idx = READ_ONCE(csl->cong_index);
+
+		if (cur_cong_idx >= 0 && !requeue &&
+		    (READ_ONCE(csl->cs[cur_cong_idx].cslink.cs->cong_queue_len) ||
+		     READ_ONCE(csl->cs[cur_cong_idx].cslink.cs->active_list_len)))
+			congested_idx = cur_cong_idx;
+		else
+			WRITE_ONCE(csl->cong_index, congested_idx);
+
+		pcs_cs_cong_enqueue(ireq, csl->cs[congested_idx].cslink.cs);
+		return 0;
+	}
+	WRITE_ONCE(csl->cong_index, -1);
+
+	if (allot < ireq->dentry->cluster->cfg.curr.lmss)
+		allot = ireq->dentry->cluster->cfg.curr.lmss;
+
+	for (;;) {
+		struct pcs_int_request * sreq = ireq;
+		unsigned int weight;
+
+		if (ireq->iochunk.size > iochunk) {
+			sreq = pcs_ireq_split(ireq, iochunk, 0);
+
+			if (sreq == NULL) {
+				pcs_set_local_error(&ireq->error, PCS_ERR_NOMEM);
+				ireq_complete(ireq);
+				return 0;
+			}
+		}
+
+		sreq->flags &= ~(IREQ_F_RND_WEIGHT | IREQ_F_SEQ);
+		BUG_ON(sreq->flags & IREQ_F_SEQ_READ);
+		if (pcs_flow_sequential(sreq->iochunk.flow)) {
+			weight = cong_roundup(sreq->iochunk.size);
+			sreq->flags |= IREQ_F_SEQ;
+		} else if (!(get_io_tweaks(ireq->cc) & PCS_TWEAK_USE_FLOW_WEIGHT) ||
+			   sreq->iochunk.size > 512*1024) {
+			weight = cong_roundup(sreq->iochunk.size);
+		} else {
+			weight = 512*1024;
+			sreq->flags |= IREQ_F_RND_WEIGHT;
+		}
+
+		for (i = 0; i < csl->nsrv; i++)
+			cs_increment_in_flight(csl->cs[i].cslink.cs, weight);
+
+		allot -= weight;
+		cs = csl->cs[0].cslink.cs;
+
+		cslist_get(csl);
+		BUG_ON(sreq->iochunk.csl);
+		sreq->iochunk.csl = csl;
+		pcs_cs_submit(cs, sreq);
+
+		if (ireq == sreq)
+			return 0;
+
+		/* The window for some of the CSes is closed. Restart processing the remaining
+		 * part of the request. Note that if the state of the map has changed, it can
+		 * even fail and return to the caller with -1.
+		 */
+		if (allot < 0)
+			goto restart;
+	}
+}
+
+static int pcs_cslist_submit_flush(struct pcs_int_request *ireq, struct pcs_cs_list * csl, int requeue)
+{
+	struct pcs_cs * cs;
+	int i;
+	int congested_idx;
+	int max_excess;
+	int allot = PCS_CS_FLUSH_WEIGHT;
+	struct pcs_msg * msg;
+	struct pcs_cs_iohdr * ioh;
+
+	congested_idx = -1;
+	max_excess = 0;
+
+	for (i = 0; i < csl->nsrv; i++) {
+		int cs_allot;
+
+		cs = csl->cs[i].cslink.cs;
+
+		if (cs_is_blacklisted(cs)) {
+			map_remote_error(ireq->flushreq.map, cs->blacklist_reason, cs->id.val);
+			TRACE("Flush to " MAP_FMT " blocked by blacklist error %d, CS" NODE_FMT,
+			      MAP_ARGS(ireq->flushreq.map), cs->blacklist_reason, NODE_ARGS(cs->id));
+			return -1;
+		}
+
+		spin_lock(&cs->lock);
+		cs_cwnd_use_or_lose(cs);
+		cs_allot = cs->eff_cwnd - cs->in_flight;
+		spin_unlock(&cs->lock);
+
+		if (cs_allot < 0) {
+			cs_allot = -cs_allot;
+			if (cs_allot > max_excess) {
+				congested_idx = i;
+				max_excess = cs_allot;
+			}
+		}
+	}
+
+	if (congested_idx >= 0) {
+		int cur_cong_idx = READ_ONCE(csl->cong_index);
+
+		if (cur_cong_idx >= 0 && !requeue &&
+		    (READ_ONCE(csl->cs[cur_cong_idx].cslink.cs->cong_queue_len) ||
+		     READ_ONCE(csl->cs[cur_cong_idx].cslink.cs->active_list_len)))
+			congested_idx = cur_cong_idx;
+		else
+			WRITE_ONCE(csl->cong_index, congested_idx);
+
+		pcs_cs_cong_enqueue(ireq, csl->cs[congested_idx].cslink.cs);
+		return 0;
+	}
+
+	WRITE_ONCE(csl->cong_index, -1);
+
+	for (i = 0; i < csl->nsrv; i++) {
+		cs = csl->cs[i].cslink.cs;
+		cs_increment_in_flight(cs, allot);
+	}
+
+	cs = csl->cs[0].cslink.cs;
+
+	BUG_ON(ireq->flushreq.csl);
+	cslist_get(csl);
+	ireq->flushreq.csl = csl;
+	ireq->ts_sent = jiffies;
+	ireq->wait_origin.val = 0;
+
+	msg = ireq->flushreq.msg;
+	msg->private2 = ireq;
+
+	ioh = (struct pcs_cs_iohdr *)msg->_inline_buffer;
+
+	if (msg->rpc) {
+		pcs_rpc_put(msg->rpc);
+		msg->rpc = NULL;
+	}
+	pcs_clear_error(&msg->error);
+	msg->timeout = csl->write_timeout;
+
+	pcs_rpc_get_new_xid(cs->rpc->eng, &ioh->hdr.xid);
+	ioh->map_version = csl->version;
+
+	pcs_rpc_call(cs->rpc, msg);
+	return 0;
+}
+
+
+int pcs_cslist_submit(struct pcs_int_request *ireq, struct pcs_cs_list *csl, int requeue)
+{
+	BUG_ON(!atomic_read(&csl->refcnt));
+
+	if (ireq->type == PCS_IREQ_FLUSH) {
+		BUG();
+		return pcs_cslist_submit_flush(ireq, csl, requeue);
+	} else if (!ireq->iochunk.direction) {
+		return pcs_cslist_submit_read(ireq, csl, requeue);
+	} else if (ireq->flags & IREQ_F_MAPPED) {
+		BUG();
+		return -EIO;
+	} else {
+		BUG();
+		return pcs_cslist_submit_write(ireq, csl, requeue);
+	}
+	BUG();
+	return -EIO;
+}
+
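+/* Submit ireq using map m, resolving the map first if needed.  The
+ * "1 << direction" test below presumably relies on PCS_MAP_READABLE and
+ * PCS_MAP_WRITEABLE matching directions 0 and 1; when the required state bit
+ * is missing, the request is queued on the map and the map is (re)resolved.
+ */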
+void map_submit(struct pcs_map_entry * m, struct pcs_int_request *ireq, int requeue)
+{
+	int direction;
+	int done;
+
+	DTRACE("enter m: " MAP_FMT ", ireq:%p \n", MAP_ARGS(m),	 ireq);
+	BUG_ON(ireq->type != PCS_IREQ_IOCHUNK && ireq->type != PCS_IREQ_FLUSH);
+
+	if (ireq_is_timed_out(ireq)) {
+		pcs_log(LOG_ERR, "timeout while getting map \"" MAP_FMT "\", last err=%d",
+			MAP_ARGS(m), ireq->error.value);
+		BUG();
+	}
+
+	BUG_ON(pcs_if_error(&ireq->error));
+
+	direction = (ireq->type != PCS_IREQ_FLUSH ? ireq->iochunk.direction : 1);
+
+	do {
+		struct pcs_cs_list *csl = NULL;
+
+		spin_lock(&m->lock);
+		if (ireq->type == PCS_IREQ_IOCHUNK) {
+			ireq->iochunk.hbuf.map_version = m->version;
+			ireq->iochunk.hbuf.uid = ireq->iochunk.map->id;
+		}
+		if (!(m->state & (1 << direction))) {
+			spin_unlock(&m->lock);
+			pcs_map_queue_resolve(m, ireq, direction);
+			return;
+		}
+		csl = m->cs_list;
+		if (csl)
+			cslist_get(csl);
+		spin_unlock(&m->lock);
+
+		if (ireq->type != PCS_IREQ_FLUSH && !(ireq->flags & IREQ_F_MAPPED)) {
+			u64 pos = ireq->iochunk.chunk + ireq->iochunk.offset;
+			u64 len = map_chunk_end(m) - pos;
+
+			/*
+			 * For non-variable chunks all alignment should be done
+			 * inside pcs_cc_process_ireq_ioreq();
+			 */
+			BUG_ON(pos < map_chunk_start(m));
+			BUG_ON(ireq->iochunk.chunk != map_chunk_start(m));
+			BUG_ON(ireq->iochunk.offset != pos - ireq->iochunk.chunk);
+			if (ireq->iochunk.size > len) {
+				struct pcs_int_request * sreq;
+
+				sreq = pcs_ireq_split(ireq, len, 0);
+				if (sreq == NULL) {
+					if (csl)
+						cslist_put(csl);
+					pcs_set_local_error(&ireq->error, PCS_ERR_NOMEM);
+					ireq_complete(ireq);
+					return;
+				}
+				if (ireq->iochunk.map) {
+					pcs_map_put(ireq->iochunk.map);
+					ireq->iochunk.map = NULL;
+				}
+				ireq->iochunk.chunk = map_chunk_end(m);
+				ireq->iochunk.offset = 0;
+				pcs_cc_submit(ireq->dentry->cluster, ireq);
+				ireq = sreq;
+			}
+		}
+
+		if (!csl) {
+			if (ireq->type != PCS_IREQ_FLUSH)
+				ireq_handle_hole(ireq);
+			else
+				ireq_complete(ireq);
+			return;
+		}
+
+		if (direction && ireq->type != PCS_IREQ_FLUSH)
+			ireq->dentry->local_mtime = get_real_time_ms();
+
+		done = !pcs_cslist_submit(ireq, csl, requeue);
+		cslist_put(csl);
+	} while (!done);
+}
+
+static int valid_for_truncate(struct pcs_map_entry * m, struct pcs_int_request *ireq)
+{
+	/* This weird test means that the map is valid, but points to a hole. In this case
+	 * truncate is a no-op.
+	 */
+	if ((m->state & (PCS_MAP_ERROR|PCS_MAP_RESOLVING|PCS_MAP_NEW|PCS_MAP_READABLE)) ==
+	    (PCS_MAP_NEW|PCS_MAP_READABLE))
+		return 1;
+
+	/* If we already have valid map, remember its version
+	 * and switch to the next phase: invalidation and requesting
+	 * new map.
+	 */
+	if (!(m->state & (PCS_MAP_ERROR|PCS_MAP_RESOLVING|PCS_MAP_NEW))) {
+		map_remote_error_nolock(m, PCS_ERR_CSD_STALE_MAP, m->cs_list ? m->cs_list->cs[0].info.id.val : 0);
+		ireq->truncreq.version = m->version;
+	}
+	/* Otherwise lookup valid map first. */
+	return 0;
+}
+
+
+//// TODO: truncate should probably synchronously truncate the local mapping.
+void process_ireq_truncate(struct pcs_int_request *ireq)
+{
+	struct pcs_dentry_info *di = ireq->dentry;
+	struct pcs_map_entry * m;
+	u64 end;
+
+	/* Special case: full truncate */
+	if (ireq->truncreq.offset == 0) {
+		map_truncate_tail(&di->mapping, 0);
+		ireq_complete(ireq);
+		return;
+	}
+
+	m = pcs_find_get_map(di, ireq->truncreq.offset - 1);
+
+	TRACE("process TRUNCATE %llu@" DENTRY_FMT " %x",
+	      (unsigned long long)ireq->truncreq.offset, DENTRY_ARGS(di), m ? m->state : -1);
+
+	if (m == NULL) {
+		map_queue_on_limit(ireq);
+		return;
+	}
+	end = map_chunk_end(m);
+	if (end <= ireq->truncreq.offset) {
+		map_truncate_tail(&di->mapping, end);
+		ireq_complete(ireq);
+		return;
+	}
+
+	if (ireq->truncreq.phase == 0) {
+		if (valid_for_truncate(m, ireq)) {
+			map_truncate_tail(&di->mapping, end);
+			ireq_complete(ireq);
+			return;
+		}
+	} else {
+		/* We already had some valid map. Must get new one. */
+
+		spin_lock(&m->lock);
+		if ((m->state & (PCS_MAP_ERROR|PCS_MAP_RESOLVING|PCS_MAP_NEW|PCS_MAP_READABLE)) ==
+		    (PCS_MAP_NEW|PCS_MAP_READABLE)) {
+
+			spin_unlock(&m->lock);
+			pcs_log(LOG_INFO, "map " MAP_FMT " unexpectedly converted to hole", MAP_ARGS(m));
+			map_truncate_tail(&di->mapping, end);
+			ireq_complete(ireq);
+			return;
+		}
+
+		if (m->state & PCS_MAP_RESOLVING) {
+			list_add_tail(&ireq->list, &m->queue);
+			spin_unlock(&m->lock);
+			return;
+		}
+
+		if (!(m->state & (PCS_MAP_ERROR|PCS_MAP_NEW))) {
+			if (map_version_compare(&m->version, &ireq->truncreq.version) > 0) {
+				spin_unlock(&m->lock);
+				map_truncate_tail(&di->mapping, end);
+				ireq_complete(ireq);
+				return;
+			}
+
+			TRACE("map " MAP_FMT " is not updated yet", MAP_ARGS(m));
+			map_remote_error_nolock(m, PCS_ERR_CSD_STALE_MAP, m->cs_list ? m->cs_list->cs[0].info.id.val : 0);
+
+		}
+		spin_unlock(&m->lock);
+	}
+	pcs_map_queue_resolve(m, ireq, 1);
+}
+
+
+noinline void pcs_mapping_truncate(struct pcs_int_request *ireq, u64 old_size)
+{
+	struct pcs_dentry_info *di = ireq->dentry;
+	u64 new_size = DENTRY_SIZE(di);
+	u64 offset;
+	struct pcs_map_entry * m = NULL;
+	int queue = 0;
+
+	di->local_mtime = get_real_time_ms();
+
+	if (new_size < old_size)
+		pcs_flow_truncate(&di->mapping.ftab, new_size, &di->cluster->maps.ftab);
+
+	if (old_size < new_size)
+		offset = old_size;
+	else
+		offset = new_size;
+
+	ireq->truncreq.offset = offset;
+	ireq->truncreq.phase = 0;
+
+	if (offset == 0) {
+		map_truncate_tail(&di->mapping, offset);
+		ireq_complete(ireq);
+		return;
+	}
+
+	map_truncate_tail(&di->mapping, offset + 1);
+
+	m = pcs_find_get_map(di, offset - 1);
+
+	if (m) {
+		TRACE("mapping truncate %llu->%llu " DENTRY_FMT " %x", (unsigned long long)old_size,
+		      (unsigned long long)new_size, DENTRY_ARGS(ireq->dentry), m ? m->state : -1);
+	}
+	if (m && map_chunk_end(m) == offset) {
+		map_truncate_tail(&di->mapping, offset);
+		ireq_complete(ireq);
+		return;
+	}
+
+	if (m == NULL) {
+		queue = 1;
+	} else {
+		spin_lock(&m->lock);
+		if (valid_for_truncate(m, ireq))
+			queue = 1;
+		spin_unlock(&m->lock);
+	}
+
+	if (queue) {
+		if (m) {
+			pcs_map_queue_resolve(m, ireq, 1);
+		} else {
+			map_queue_on_limit(ireq);
+		}
+	} else {
+		map_truncate_tail(&di->mapping, map_chunk_end(m));
+		ireq_complete(ireq);
+	}
+
+	if (m)
+		pcs_map_put(m);
+}
+
+static int commit_cs_record(struct pcs_map_entry * m, struct pcs_cs_record * rec,
+			     struct pcs_cs_sync_data * sync, u32 lat, int op_type)
+{
+	int dirtify;
+	struct cs_sync_state * srec = &rec->sync;
+	if (sync->ts_net > sync->ts_io)
+		lat -= sync->ts_net;
+	else
+		lat -= sync->ts_io;
+
+	pcs_cs_update_stat(rec->cslink.cs, sync->ts_io, ((int)lat < 0) ? 0 : lat, op_type);
+	cs_update_io_latency(rec->cslink.cs, sync->ts_io);
+
+	/* First: verify integrity sequence. */
+	if (rec->info.integrity_seq != sync->integrity_seq) {
+		/* Now this is possible only if the IO was issued and completed
+		 * before the CS rebooted, but we see the result only afterwards.
+		 *
+		 * The request is restarted with a new map.
+		 */
+		pcs_log(LOG_ERR, MAP_FMT " integrity seq mismatch CS" NODE_FMT " %d != %d, %d",
+			MAP_ARGS(m),
+			NODE_ARGS(rec->info.id),
+			rec->info.integrity_seq, sync->integrity_seq, srec->dirty_integrity);
+		return 1;
+	}
+
+	BUG_ON(srec->dirty_integrity && srec->dirty_integrity != sync->integrity_seq);
+
+	dirtify = (op_type == PCS_CS_WRITE_SYNC_RESP || op_type == PCS_CS_WRITE_RESP);
+	/* The following looks scary but could be clearer.
+	 * The goal is to advance the stored sync seq numbers
+	 * from the values reported by the CS:
+	 *
+	 * READ/SYNC (not dirtifying):
+	 * - (sync_epoch, sync_current) advances (sync_epoch, sync_seq)
+	 * WRITE/WRITE_SYNC (dirtifying):
+	 * - (sync_epoch, sync_current) advances (sync_epoch, sync_seq)
+	 * - (sync_epoch, sync_dirty) additionally advances (dirty_epoch, dirty_seq)
+	 */
+	if (dirtify && sync->sync_dirty) {
+		srec->dirty_integrity = sync->integrity_seq;
+
+		if (srec->dirty_epoch == 0 ||
+		    pcs_sync_seq_compare(sync->sync_epoch, srec->dirty_epoch) > 0) {
+			srec->dirty_epoch = sync->sync_epoch;
+			srec->dirty_seq = sync->sync_dirty;
+		} else if (sync->sync_epoch == srec->dirty_epoch &&
+			   pcs_sync_seq_compare(sync->sync_dirty, srec->dirty_seq) > 0) {
+			srec->dirty_seq = sync->sync_dirty;
+		}
+	}
+
+	if (srec->sync_epoch == 0 ||
+	    pcs_sync_seq_compare(sync->sync_epoch, srec->sync_epoch) > 0) {
+		srec->sync_epoch = sync->sync_epoch;
+		srec->sync_seq = sync->sync_current;
+	} else if (sync->sync_epoch == srec->sync_epoch &&
+		   pcs_sync_seq_compare(sync->sync_current, srec->sync_seq) > 0) {
+		srec->sync_seq = sync->sync_current;
+	}
+	return 0;
+}
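
The epoch/seq advancement above depends on pcs_sync_seq_compare() being wrap-safe; its definition is not part of this hunk. A minimal sketch, assuming the sequence counters are unsigned 32-bit values (the actual typedef in pcs_prot_types.h may differ):

	/* Sketch only: wrap-safe comparison of sequence counters.
	 * Positive if a is newer than b, negative if older, 0 if equal. */
	static inline int pcs_sync_seq_compare_sketch(u32 a, u32 b)
	{
		return (s32)(a - b);
	}

With such a comparator, "advance" means the incoming value compares greater than the stored one even across a counter wrap.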
+
+static int commit_one_record(struct pcs_map_entry * m, PCS_NODE_ID_T cs_id,
+			     struct pcs_cs_sync_data * sync, u32 lat, int op_type)
+{
+	int err = 0;
+	int i;
+
+	BUG_ON(sync->integrity_seq == 0);
+
+	if (m->cs_list == NULL)
+		return 0;
+
+	pcs_log(LOG_DEBUG5, "sync ["NODE_FMT",%u,%u,%u,%u]", NODE_ARGS(cs_id),
+	      sync->integrity_seq, sync->sync_epoch, sync->sync_dirty, sync->sync_current);
+
+	for (i = 0; i < m->cs_list->nsrv; i++) {
+		if (m->cs_list->cs[i].info.id.val == cs_id.val) {
+			err = commit_cs_record(m, &m->cs_list->cs[i], sync, lat, op_type);
+
+			pcs_log(LOG_DEBUG5, "commited ["NODE_FMT",%u/%u,%u/%u,%u/%u]", NODE_ARGS(cs_id),
+			      m->cs_list->cs[i].info.integrity_seq,
+			      m->cs_list->cs[i].sync.dirty_integrity,
+			      m->cs_list->cs[i].sync.dirty_epoch,
+			      m->cs_list->cs[i].sync.dirty_seq,
+			      m->cs_list->cs[i].sync.sync_epoch,
+			      m->cs_list->cs[i].sync.sync_seq);
+			break;
+		}
+	}
+	return err;
+}
+
+static void update_net_latency(struct pcs_cs_list * csl, PCS_NODE_ID_T id,
+			       struct pcs_cs_sync_data * sync, unsigned int lat)
+{
+	int i;
+
+	if (sync->ts_net > sync->ts_io)
+		lat -= sync->ts_net;
+	else
+		lat -= sync->ts_io;
+
+	if ((int)lat <= 0)
+		return;
+
+	for (i = 0; i < csl->nsrv; i++) {
+		if (id.val == csl->cs[i].info.id.val) {
+			struct pcs_cs * cs = csl->cs[i].cslink.cs;
+
+			if (i != 0 || !(test_bit(CS_SF_LOCAL, &cs->state)))
+				cs_update_net_latency(csl->cs[i].cslink.cs, lat);
+			break;
+		}
+	}
+}
+
+static inline u32 calc_latency(abs_time_t start)
+{
+	abs_time_t now = jiffies;
+	u64 elapsed = (now > start)? now - start: 0;
+	return elapsed > ~0U ? ~0U : elapsed;
+}
+
+static int commit_sync_info(struct pcs_int_request *req,
+			struct pcs_map_entry * m, struct pcs_cs_list * csl,
+			struct pcs_msg * resp)
+{
+	struct pcs_cs_iohdr *h = (struct pcs_cs_iohdr *)resp->_inline_buffer;
+	int err = 0;
+	unsigned int max_iolat, lat = calc_latency(req->ts_sent);
+
+	err |= commit_one_record(m, resp->rpc->peer_id, &h->sync, lat, h->hdr.type);
+
+	/* Network latency is updated only for the first CS in the chain.
+	 * The results for the others are ignored, which looks sad because we lose
+	 * a lot of information. The thing is that the measured latency is actually
+	 * the sum of the network latencies in both directions, so if we averaged all
+	 * the results we would get not the CS latency but CS latency + average over the
+	 * cluster, which is even undefined when we use EWMA averaging (it would be defined
+	 * if we calculated the EWMA latency for each link, otherwise it is EWMA of a random number).
+	 * If we fix one node (the client in this case), we calculate the average of client
+	 * plus CS, which is enough to select the least loaded CS for reads.
+	 */
+	update_net_latency(csl, resp->rpc->peer_id, &h->sync, lat);
+	max_iolat = h->sync.ts_io;
+
+	if (h->hdr.type != PCS_CS_READ_RESP) {
+		struct pcs_cs_sync_resp * srec;
+		lat = h->sync.ts_net;
+		for (srec = (struct pcs_cs_sync_resp*)(h + 1);
+		     (void*)(srec + 1) <= (void*)h + h->hdr.len;
+		     srec++) {
+			err |= commit_one_record(m, srec->cs_id, &srec->sync, lat, h->hdr.type);
+			lat  = srec->sync.ts_net;
+			if (max_iolat < srec->sync.ts_io)
+				max_iolat = srec->sync.ts_io;
+		}
+	}
+	//// temporarily disable logging
+	////cs_log_io_times(req, resp, max_iolat);
+
+	evaluate_dirty_status(m);
+	return err;
+}
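
cs_update_net_latency() and pcs_cs_update_stat() live in pcs_cs.c and are not shown in this hunk; the reasoning in the comment above assumes the per-CS latency is kept as an exponentially weighted moving average. A sketch of such an update, using a hypothetical lat_ewma field and a weight of 1/8:

	/* Sketch only: EWMA update, new = old + (sample - old)/8 in integer arithmetic.
	 * cs->lat_ewma is a hypothetical field, not the real struct pcs_cs layout. */
	static inline void cs_update_net_latency_sketch(struct pcs_cs *cs, unsigned int lat)
	{
		cs->lat_ewma += ((int)lat - (int)cs->lat_ewma) >> 3;
	}

Fixing the client as the common endpoint makes this average an estimate of "client + that CS", which is still a valid ordering key when picking the least loaded CS for reads.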
+
+void pcs_map_verify_sync_state(struct pcs_dentry_info *di, struct pcs_int_request *ireq, struct pcs_msg * msg)
+{
+	struct pcs_map_entry * m = ireq->iochunk.map;
+	struct pcs_msg * resp = msg->response;
+
+	if (!m)
+		return;
+
+	spin_lock (&m->lock);
+	if (m->cs_list == NULL || (m->state & PCS_MAP_DEAD)) {
+		spin_unlock(&m->lock);
+		return;
+	}
+	if (commit_sync_info(ireq, m, ireq->iochunk.csl, resp)) {
+		pcs_log(LOG_ERR, MAP_FMT " sync integrity error: map retry follows", MAP_ARGS(m));
+
+		msg->error.value = PCS_ERR_CSD_STALE_MAP;
+		msg->error.remote = 1;
+		msg->error.offender = m->cs_list->cs[0].info.id;
+	}
+	spin_unlock(&m->lock);
+
+	if (ireq->iochunk.flow) {
+		struct pcs_int_request * preq = ireq->completion_data.parent;
+
+		pcs_flow_confirm(ireq->iochunk.flow, &ireq->dentry->mapping.ftab,
+				 preq->apireq.req->type == PCS_REQ_T_WRITE,
+				 preq->apireq.req->pos, preq->apireq.req->size,
+				 &ireq->cc->maps.ftab);
+	}
+
+}
+
+void sync_done(struct pcs_msg * msg)
+{
+	struct pcs_int_request * sreq = msg->private;
+	struct pcs_map_entry * m = sreq->flushreq.map;
+	struct pcs_msg * resp = msg->response;
+
+	spin_lock(&m->lock);
+	if (m->state & PCS_MAP_DEAD)
+		goto done;
+	if (!(m->flags & PCS_MAP_DIRTY))
+		goto done;
+
+	if (pcs_if_error(&msg->error)) {
+		pcs_copy_error(&sreq->error, &msg->error);
+		goto done;
+	}
+
+	if (commit_sync_info(sreq, m, sreq->flushreq.csl, resp)) {
+		pcs_log(LOG_ERR, MAP_FMT " sync integrity error: sync retry follows", MAP_ARGS(m));
+
+		sreq->error.remote = 1;
+		sreq->error.value = PCS_ERR_CSD_STALE_MAP;
+		sreq->error.offender = m->cs_list->cs[0].info.id;
+	}
+
+done:
+	spin_unlock(&m->lock);
+	ireq_complete(sreq);
+	return;
+}
+
+static int sync_is_finished(struct pcs_msg * msg, struct pcs_map_entry * m)
+{
+	struct pcs_cs_iohdr * h = (struct pcs_cs_iohdr *)msg->_inline_buffer;
+	struct pcs_cs_sync_resp * srec;
+
+	if (m->cs_list == NULL)
+		return 1;
+
+	for (srec = (struct pcs_cs_sync_resp *)(h + 1);
+	     (void*)(srec + 1) <= (void*)h + h->hdr.len;
+	     srec++) {
+		int i;
+
+		pcs_log(LOG_DEBUG5, "Checking cs="NODE_FMT" sync=[%d,%d,%d,%d]", NODE_ARGS(srec->cs_id), srec->sync.integrity_seq,
+		      srec->sync.sync_epoch,
+		      srec->sync.sync_dirty, srec->sync.sync_current);
+
+		for (i = 0; i < m->cs_list->nsrv; i++) {
+			if (m->cs_list->cs[i].info.id.val == srec->cs_id.val) {
+				pcs_log(LOG_DEBUG5, "Checking against sync=[%d,%d,%d,%d,%d]",
+				      m->cs_list->cs[i].sync.dirty_integrity,
+				      m->cs_list->cs[i].sync.dirty_epoch,
+				      m->cs_list->cs[i].sync.dirty_seq,
+				      m->cs_list->cs[i].sync.sync_epoch,
+				      m->cs_list->cs[i].sync.sync_seq);
+				if (cs_is_dirty(&m->cs_list->cs[i].sync) &&
+				    srec->sync.sync_epoch == m->cs_list->cs[i].sync.sync_epoch &&
+				    pcs_sync_seq_compare(srec->sync.sync_current, m->cs_list->cs[i].sync.sync_seq) >= 0)
+					return 0;
+				break;
+			}
+		}
+	}
+	return 1;
+}
+
+void process_flush_req(struct pcs_int_request *ireq)
+{
+	struct pcs_map_entry * m = ireq->flushreq.map;
+
+	spin_lock(&m->lock);
+	if (m->state & PCS_MAP_DEAD)
+		goto done;
+
+	TRACE("process FLUSH " MAP_FMT, MAP_ARGS(m));
+
+	if (!(m->flags & PCS_MAP_DIRTY))
+		goto done;
+	if (sync_is_finished(ireq->flushreq.msg, m)) {
+		TRACE("finished");
+		goto done;
+	}
+	spin_unlock(&m->lock);
+	map_submit(m, ireq, 0);
+	return;
+
+done:
+	if (pcs_if_error(&ireq->error)) {
+		TRACE("oops, delete me %d", ireq->error.value);
+		pcs_clear_error(&ireq->error);
+	}
+	ireq_complete(ireq);
+}
+
+static void pcs_flushreq_complete(struct pcs_int_request * sreq)
+{
+	struct pcs_int_request *ireq = sreq->completion_data.parent;
+	struct pcs_map_entry * m = sreq->flushreq.map;
+	struct pcs_cs_iohdr * ioh = (struct pcs_cs_iohdr*)msg_inline_head(sreq->flushreq.msg);
+	int notify_error = 0;
+
+	spin_lock(&m->lock);
+	if (!ireq)
+		m->flags &= ~PCS_MAP_FLUSHING;
+	m->flags &= ~PCS_MAP_DIRTY_GC;
+
+	if (m->state & PCS_MAP_DEAD)
+		goto done;
+	if (!(m->flags & PCS_MAP_DIRTY))
+		goto done;
+
+	if (!pcs_if_error(&sreq->error)) {
+		if (sync_is_finished(sreq->flushreq.msg, m)) {
+			TRACE("finished");
+			goto done_dirty;
+		}
+		sreq->error.value = PCS_ERR_CSD_STALE_MAP;
+		sreq->error.remote = 1;
+		sreq->error.offender = m->cs_list->cs[0].info.id;
+	}
+
+	if (ireq && !pcs_if_error(&ireq->error)) {
+		if (ireq_check_redo(sreq)) {
+			if (ireq_is_timed_out(sreq)) {
+				pcs_log(LOG_ERR, "timeout while flush request on \"" DENTRY_FMT "\" last_err=%u",
+					DENTRY_ARGS(sreq->dentry), sreq->error.value);
+				BUG();
+			}
+			TRACE("restart after flush error %d", sreq->error.value);
+			if (map_version_compare(&ioh->map_version, &m->version) < 0)
+				sreq->flags &= ~IREQ_F_ONCE;
+			spin_unlock(&m->lock);
+
+			map_notify_error(m, sreq, &ioh->map_version, sreq->flushreq.csl);
+			pcs_deaccount_ireq(sreq, &sreq->error);
+			pcs_clear_error(&sreq->error);
+
+			if (!(sreq->flags & IREQ_F_ONCE)) {
+				sreq->flags |= IREQ_F_ONCE;
+				pcs_cc_submit(sreq->cc, sreq);
+			} else
+				ireq_delay(sreq);
+			return;
+		}
+		TRACE("flush error %d", sreq->error.value);
+		pcs_copy_error(&ireq->error, &sreq->error);
+		notify_error = 1;
+	}
+
+done_dirty:
+	if (!ireq)
+		map_sync_work_add(m, pcs_sync_timeout(cc_from_map(m)));
+done:
+	spin_unlock(&m->lock);
+	if (notify_error)
+		map_notify_error(m, sreq, &ioh->map_version, sreq->flushreq.csl);
+
+	pcs_deaccount_ireq(sreq, &sreq->error);
+
+	if (ireq) {
+		if (!pcs_sreq_detach(sreq))
+			ireq_complete(ireq);
+	}
+
+	pcs_free_msg(sreq->flushreq.msg);
+	pcs_map_put(m);
+	ireq_destroy(sreq);
+}
+
+/* Allocate and format a sync message. Important: this message holds the values of the sync counters
+ * as they are now. If the sync request fails and is retried, the message is not reallocated
+ * and the sync counters remain the same.
+ */
+static void prepare_map_flush_msg(struct pcs_map_entry * m, struct pcs_int_request * sreq, struct pcs_msg * msg)
+{
+	struct pcs_cs_iohdr * ioh;
+	struct pcs_cs_sync_resp * arr;
+
+	assert_spin_locked(&m->lock);
+
+	ioh = (struct pcs_cs_iohdr *)msg->_inline_buffer;
+	arr = (struct pcs_cs_sync_resp *)(ioh + 1);
+
+	ioh->hdr.len = sizeof(struct pcs_cs_iohdr);
+	ioh->hdr.type = PCS_CS_SYNC_REQ;
+	memset(&ioh->sync, 0, sizeof(ioh->sync));
+	ioh->offset = 0;
+	ioh->size = 0;
+	ioh->_reserved = 0;
+	ioh->sync.misc = PCS_CS_IO_SEQ;
+
+	ioh->map_version = m->version;
+	ioh->uid = m->id;
+	ioh->iocontext = (u32)pcs_dentry_from_map(m)->fileinfo.attr.id;
+
+	if (m->cs_list) {
+		int i;
+
+		for (i = 0; i < m->cs_list->nsrv; i++) {
+			struct pcs_cs_record * rec = m->cs_list->cs + i;
+			if (cs_is_dirty(&rec->sync)) {
+				arr->cs_id = rec->info.id;
+				arr->sync.integrity_seq = rec->sync.dirty_integrity;
+				arr->sync.sync_epoch = rec->sync.dirty_epoch;
+				arr->sync.sync_dirty = rec->sync.dirty_seq;
+				arr->sync.sync_current = rec->sync.dirty_seq;
+				arr->sync.misc = 0;
+				arr->sync.ts_io = 0;
+				arr->sync.ts_net = 0;
+				arr->sync._reserved = 0;
+				ioh->hdr.len += sizeof(struct pcs_cs_sync_resp);
+				pcs_log(LOG_DEBUG5, "fill sync "NODE_FMT" [%d,%d,%d,%d]", NODE_ARGS(arr->cs_id),
+					arr->sync.integrity_seq, arr->sync.sync_epoch,
+					arr->sync.sync_dirty, arr->sync.sync_current);
+				arr++;
+			}
+		}
+	}
+	msg->size = ioh->hdr.len;
+	msg->private = sreq;
+	msg->done = sync_done;
+}
+
+static bool valid_for_flush(struct pcs_map_entry *m)
+{
+	if (m->state & PCS_MAP_DEAD)
+		return false;
+
+	if (!(m->flags & PCS_MAP_DIRTY))
+		return false;
+	if (m->flags & PCS_MAP_FLUSHING)
+		return false;
+
+	return true;
+}
+
+static int prepare_map_flush_ireq(struct pcs_map_entry *m, struct pcs_int_request **sreqp)
+{
+	struct pcs_dentry_info *de;
+	struct pcs_cs_list *cslist;
+	struct pcs_int_request *sreq;
+	struct pcs_msg * msg;
+
+	spin_lock(&m->lock);
+	if (!valid_for_flush(m)) {
+		spin_unlock(&m->lock);
+		return 0;
+	}
+
+	if (!m->cs_list || !m->cs_list->nsrv) {
+		/* TODO: userspace allows cslist->nsrv == 0, but IMHO it does not make sense */
+		WARN_ON_ONCE(1);
+		spin_unlock(&m->lock);
+		return 0;
+	}
+
+	cslist = m->cs_list;
+	cslist_get(cslist);
+	/* TODO: Need to grab reference to de? */
+	de = pcs_dentry_from_map(m);
+	spin_unlock(&m->lock);
+
+	sreq = ireq_alloc(de);
+	if (!sreq)
+		goto err_cslist;
+
+	msg = pcs_rpc_alloc_output_msg(sizeof(struct pcs_cs_iohdr) +
+				       cslist->nsrv * sizeof(struct pcs_cs_sync_resp));
+	if (!msg)
+		goto err_ireq;
+
+	/* All resources are allocated, recheck the map state */
+	spin_lock(&m->lock);
+	cslist_put(cslist);
+	if (!valid_for_flush(m) || m->cs_list != cslist) {
+		spin_unlock(&m->lock);
+		pcs_free_msg(msg);
+		ireq_destroy(sreq);
+		return 0;
+	}
+	prepare_map_flush_msg(m, sreq, msg);
+	sreq->type = PCS_IREQ_FLUSH;
+	sreq->ts = jiffies;
+	sreq->completion_data.parent = NULL;
+	sreq->flushreq.map = m;
+	sreq->flushreq.csl = NULL;
+	sreq->complete_cb = pcs_flushreq_complete;
+	sreq->flushreq.msg = msg;
+	TRACE("timed FLUSH " MAP_FMT, MAP_ARGS(m));
+	m->flags |= PCS_MAP_FLUSHING;
+	__pcs_map_get(m);
+	spin_unlock(&m->lock);
+	*sreqp	= sreq;
+	return 0;
+
+err_ireq:
+	ireq_destroy(sreq);
+err_cslist:
+	cslist_put(cslist);
+	return -ENOMEM;
+}
+
+/* The timer injects a sync request for a dirty chunk when the sync timeout expires.
+ * If the request fails, we just retry later.
+ */
+static void sync_timer_work(struct work_struct *w)
+{
+	struct pcs_map_entry *m = container_of(w, struct pcs_map_entry, sync_work.work);
+	struct pcs_int_request * sreq = NULL;
+	int err;
+
+	err = prepare_map_flush_ireq(m, &sreq);
+	if (err) {
+		map_sync_work_add(m, HZ);
+	} else {
+		if (sreq)
+			map_submit(m, sreq, 0);
+	}
+	/* Counter part from map_sync_work_add */
+	pcs_map_put(m);
+}
+
+
+/* Handler for the PCS_REQ_T_SYNC API IO request. It scans through the current maps
+ * and constructs an internal subrequest for each chunk which is dirty at the moment.
+ * The current sync seq numbers are stored in the subrequest right away, so that future
+ * dirtifying writes will not delay execution of this request.
+ *
+ * XXX we can issue a lot of subrequests here: one per dirty chunk.
+ */
+void map_inject_flush_req(struct pcs_int_request *ireq)
+{
+	struct pcs_dentry_info *di = ireq->dentry;
+	struct list_head ireq_list;
+	unsigned long idx, end_idx;
+	u64 end;
+	struct pcs_map_entry *maps[MAP_BATCH];
+	int nr_maps;
+
+	if (di->fileinfo.sys.map_type != PCS_MAP_PLAIN ||
+	    di->fileinfo.sys.stripe_depth != 1) {
+		pcs_log(LOG_ERR, "bad map_type");
+		pcs_set_local_error(&ireq->error, PCS_ERR_PROTOCOL);
+		ireq_complete(ireq);
+		return;
+	}
+
+	atomic_set(&ireq->iocount, 1);
+	INIT_LIST_HEAD(&ireq_list);
+
+	idx = ireq->apireq.req->pos >> DENTRY_CHUNK_SIZE_BITS(di);
+	end = (ireq->apireq.req->pos + ireq->apireq.req->size) >> DENTRY_CHUNK_SIZE_BITS(di);
+	if (end <= ireq->apireq.req->pos)
+		end = ~0ULL;
+	end_idx = end >> DENTRY_CHUNK_SIZE_BITS(di);
+
+	do {
+		int i;
+
+		rcu_read_lock();
+		/* TODO !!!! use radix tree tags for DIRTY flags */
+		nr_maps = radix_tree_gang_lookup(&di->mapping.map_tree,
+				(void **)maps, idx, MAP_BATCH);
+
+		for (i = 0; i < nr_maps; i++) {
+			struct pcs_map_entry *m = maps[i];
+
+			idx = maps[i]->index;
+			if (idx > end_idx)
+				break;
+
+			spin_lock(&m->lock);
+			if (!(m->flags & PCS_MAP_DIRTY) || !pcs_map_get_locked(m))
+				maps[i] = NULL;
+			spin_unlock(&m->lock);
+
+		}
+		rcu_read_unlock();
+		for (i = 0; i < nr_maps; i++) {
+			struct pcs_int_request * sreq = NULL;
+			int err = 0;
+
+			if (idx > end_idx)
+				break;
+			if (!maps[i])
+				continue;
+			err = prepare_map_flush_ireq(maps[i], &sreq);
+			pcs_map_put(maps[i]);
+			if (err) {
+				pcs_set_local_error(&ireq->error, PCS_ERR_NOMEM);
+				break;
+			}
+			/* Request not prepared, so sync is not required */
+			if (!sreq)
+				continue;
+			pcs_sreq_attach(sreq, ireq);
+			list_add_tail(&sreq->list, &ireq_list);
+		}
+		idx++;
+	} while (nr_maps && idx < end_idx + 1);
+
+	pcs_cc_requeue(ireq->dentry->cluster, &ireq_list);
+
+	if (atomic_dec_and_test(&ireq->iocount))
+		ireq_complete(ireq);
+}
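
One possible shape for the TODO above about radix tree tags (not part of this patch; PCS_MAP_TAG_DIRTY is a hypothetical tag index): tag the map when PCS_MAP_DIRTY is set, then restrict the gang lookup to tagged entries so clean chunks are never visited.

	/* when marking a map dirty, under the tree lock: */
	radix_tree_tag_set(&di->mapping.map_tree, m->index, PCS_MAP_TAG_DIRTY);

	/* in map_inject_flush_req(), instead of the plain gang lookup: */
	nr_maps = radix_tree_gang_lookup_tag(&di->mapping.map_tree,
			(void **)maps, idx, MAP_BATCH, PCS_MAP_TAG_DIRTY);

The rest of the loop (taking a reference, calling prepare_map_flush_ireq()) would stay the same.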
diff --git a/fs/fuse/kio/pcs/pcs_map.h b/fs/fuse/kio/pcs/pcs_map.h
new file mode 100644
index 000000000000..754e0f177d46
--- /dev/null
+++ b/fs/fuse/kio/pcs/pcs_map.h
@@ -0,0 +1,264 @@
+#ifndef _PCS_MAP_H_
+#define _PCS_MAP_H_ 1
+
+#include "pcs_client_types.h"
+#include "pcs_mds_prot.h"
+#include "pcs_flow_detect.h"
+#include "log.h"
+
+struct pcs_dentry_info;
+struct pcs_int_request;
+
+#define PCS_MAP_LIMIT		4096
+
+#define PCS_SYNC_TIMEOUT		(20 * HZ)
+
+#define PCS_REPLICATION_BLACKLIST_TIMEOUT  HZ
+
+//// TODO:
+#define PCS_MAP_MIN_REBALANCE_TIMEOUT	(HZ / 5)
+#define PCS_MAP_MAX_REBALANCE_TIMEOUT	(60 * HZ)
+
+#define PCS_TWEAK_REBALANCE_ALWAYS	1
+#define PCS_TWEAK_IGNORE_SEQUENTIAL	2
+#define PCS_TWEAK_USE_FLOW_LOAD		4
+#define PCS_TWEAK_USE_FLOW_WEIGHT	8
+
+struct pcs_cs_link
+{
+	struct pcs_cs	* cs;
+	int		index;
+	int		addr_serno;
+	struct list_head	link;  /* Link in list of maps routed via cs,
+					* head is cs->map_list */
+};
+
+/*
+ * PCS_MAP_DEAD		- mapping is under destruction
+ * PCS_MAP_NEW		- version is invalid
+ * PCS_MAP_READABLE	- read IO requests can be sent using this map.
+ * PCS_MAP_WRITEABLE	- read/write IO requests can be sent using this map.
+ * PCS_MAP_RESOLVING	- map is under resolution. If PCS_MAP_WRITEABLE/READABLE
+ * PCS_MAP_ERROR	- some error occurred while communicating with a CS; the map requires revalidation.
+ *			  The version is valid, but will most likely be obsoleted.
+ */
+enum
+{
+	PCS_MAP_READABLE	= 1,
+	PCS_MAP_WRITEABLE	= 2,
+	PCS_MAP_RESOLVING	= 4,
+	PCS_MAP_ERROR		= 8,
+	PCS_MAP_NEW		= 0x10,
+	PCS_MAP_DEAD		= 0x20,
+	PCS_MAP_EOF		= 0x40,
+};
+
+enum
+{
+	PCS_MAP_DIRTY		= 1,
+	PCS_MAP_FLUSHING	= 2,
+	PCS_MAP_DIRTY_GC	= 4,
+	PCS_MAP_CLIENT_SIZE	= 8,	/* chunk size is controlled by client */
+	PCS_MAP_CLIENT_ALLOC	= 0x10,	/* chunk allocation is controlled by client */
+	PCS_MAP_CLIENT_PSIZE	= 0x20, /* physical size of chunk on CS must be transmitted to MDS */
+};
+
+struct cs_sync_state
+{
+	PCS_INTEGRITY_SEQ_T	dirty_integrity;
+	PCS_SYNC_SEQ_T		dirty_epoch;
+	PCS_SYNC_SEQ_T		dirty_seq;
+	PCS_SYNC_SEQ_T		sync_epoch;
+	PCS_SYNC_SEQ_T		sync_seq;
+};
+
+struct pcs_cs_record
+{
+	struct pcs_cs_info	info;
+	struct cs_sync_state	sync;
+	struct pcs_cs_link	cslink;
+};
+
+struct pcs_cs_list
+{
+	struct pcs_map_entry	*map;
+	atomic_t		refcnt;
+	atomic_t		seq_read_in_flight;
+	int			read_index;		/* volatile read hint */
+	int			cong_index;		/* volatile cong hint */
+	unsigned long		blacklist;		/* Atomic bit field */
+	abs_time_t		blacklist_expires;	/* volatile blacklist stamp */
+	abs_time_t		select_stamp;		/* volatile read hint stamp */
+	/* members below are immutable across the cslist lifetime */
+#define CSL_FL_HAS_LOCAL	1
+	unsigned int		flags;
+	int			read_timeout;
+	int			write_timeout;
+	int			nsrv;
+	PCS_MAP_VERSION_T	version;		/* version inherited from map */
+	struct pcs_cs_record	cs[0];
+};
+
+/* TODO, LOCKING!!!!!
+ * the only immutable values are id and
+ */
+struct pcs_map_entry
+{
+	unsigned long		index;
+	union {
+		struct list_head lru_link;
+		struct rcu_head	 rcu;
+	};
+	struct pcs_mapping	*mapping;
+	struct pcs_map_set	*maps;
+
+	spinlock_t		lock;
+	int			state;
+	int			flags;
+	atomic_t		__refcnt;
+	u16			mds_flags;
+	u64			res_offset;
+
+	u32			chunk_psize;
+
+	PCS_MAP_VERSION_T	version;
+	PCS_CHUNK_UID_T		id;
+
+	pcs_error_t		iofailure;
+	unsigned long long	error_tstamp;
+
+	struct delayed_work	sync_work;
+	struct pcs_cs_list	*cs_list;
+	struct list_head	queue;
+};
+
+static inline u64 map_chunk_start(struct pcs_map_entry *m)
+{
+	return m->index << m->mapping->chunk_size_bits;
+}
+
+static inline u64 map_chunk_end(struct pcs_map_entry *m)
+{
+	return (m->index +1) << m->mapping->chunk_size_bits;
+}
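
The two helpers above convert a chunk index to a byte range; the inverse is a shift by the same chunk_size_bits. An illustrative helper (not part of the patch), useful when reading callers that pass byte offsets such as pcs_find_get_map(di, offset - 1):

	/* Illustrative only: byte offset -> chunk index for this mapping. */
	static inline u64 map_chunk_index(struct pcs_mapping *mapping, u64 offset)
	{
		return offset >> mapping->chunk_size_bits;
	}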
+
+static inline struct pcs_dentry_info * pcs_dentry_from_mapping(struct pcs_mapping * mapping)
+{
+	return container_of(mapping, struct pcs_dentry_info, mapping);
+}
+
+static inline struct pcs_dentry_info * pcs_dentry_from_map(struct pcs_map_entry * m)
+{
+	return pcs_dentry_from_mapping(m->mapping);
+}
+
+static inline struct pcs_cluster_core *cc_from_map(struct pcs_map_entry * m)
+{
+	return pcs_dentry_from_mapping(m->mapping)->cluster;
+}
+
+void pcs_mapping_init(struct pcs_cluster_core *cc, struct pcs_mapping * mapping);
+void pcs_mapping_open(struct pcs_mapping * mapping);
+void pcs_mapping_invalidate(struct pcs_mapping * mapping);
+void pcs_mapping_deinit(struct pcs_mapping * mapping);
+void pcs_mapping_truncate(struct pcs_int_request *ireq, u64 old_size);
+void process_ireq_truncate(struct pcs_int_request *ireq);
+
+struct pcs_map_entry * pcs_find_get_map(struct pcs_dentry_info * de, u64 chunk);
+void map_submit(struct pcs_map_entry * m, struct pcs_int_request *ireq, int requeue);
+void map_notify_iochunk_error(struct pcs_int_request *ireq);
+void map_notify_soft_error(struct pcs_int_request *ireq);
+void __pcs_map_put(struct pcs_map_entry *m);
+
+void pcs_deaccount_ireq(struct pcs_int_request *ireq, pcs_error_t *);
+
+void cs_blacklist(struct pcs_cs * cs, int error, char * reason);
+void cs_whitelist(struct pcs_cs * cs, char * reason);
+void pcs_map_notify_addr_change(struct pcs_cs * cs);
+void pcs_map_force_reselect(struct pcs_cs * cs);
+
+struct pcs_msg;
+void pcs_map_verify_sync_state(struct pcs_dentry_info * de, struct pcs_int_request *ireq, struct pcs_msg *);
+void map_inject_flush_req(struct pcs_int_request *ireq);
+void process_flush_req(struct pcs_int_request *ireq);
+int map_check_limit(struct pcs_map_entry * map, struct pcs_int_request *ireq);
+int pcs_cslist_submit(struct pcs_int_request *ireq, struct pcs_cs_list *csl, int requeue);
+struct pcs_int_request * pcs_ireq_split(struct pcs_int_request *ireq, unsigned int iochunk, int noalign);
+int  fuse_map_resolve(struct pcs_map_entry * m, int direction);
+struct pcs_ioc_getmap;
+void pcs_map_complete(struct pcs_map_entry *m, struct pcs_ioc_getmap *omap);
+int pcs_map_encode_req(struct pcs_map_entry*m, struct pcs_ioc_getmap *map, int direction);
+void map_truncate_tail(struct pcs_mapping *mapping, u64 offset);
+unsigned long pcs_map_shrink_scan(struct shrinker *,  struct shrink_control *sc);
+
+#define MAP_FMT	"(%p) 0x%lld s:%x" DENTRY_FMT
+#define MAP_ARGS(m) (m), (long long)(m)->index,	 (m)->state, DENTRY_ARGS(pcs_dentry_from_map((m)))
+
+static inline void pcs_map_put(struct pcs_map_entry *m)
+{
+	TRACE("m(%p)->index:%ld ref:%d \n", m, m->index, atomic_read(&m->__refcnt));
+
+	BUG_ON(atomic_read(&m->__refcnt) <= 0);
+	if (atomic_dec_and_lock(&m->__refcnt, &m->lock))
+		__pcs_map_put(m);
+}
+
+static inline void map_add_lru(struct pcs_map_entry *m)
+{
+	assert_spin_locked(&m->lock);
+	if (m->flags & PCS_MAP_DIRTY)
+		list_lru_add(&m->maps->dirty_lru, &m->lru_link);
+	else
+		list_lru_add(&m->maps->lru, &m->lru_link);
+}
+
+static inline void map_del_lru(struct pcs_map_entry *m)
+{
+	assert_spin_locked(&m->lock);
+	if (m->flags & PCS_MAP_DIRTY)
+		list_lru_del(&m->maps->dirty_lru, &m->lru_link);
+	else
+		list_lru_del(&m->maps->lru, &m->lru_link);
+}
+
+static inline void pcs_map_put_locked(struct pcs_map_entry *m)
+{
+	TRACE("m(%p)->index:%ld ref:%d \n", m, m->index, atomic_read(&m->__refcnt));
+
+	BUG_ON(atomic_read(&m->__refcnt) <= 0);
+	BUG_ON(m->state & PCS_MAP_DEAD);
+
+	if (atomic_dec_and_test(&m->__refcnt))
+		map_add_lru(m);
+}
+
+static inline bool pcs_map_get_locked(struct pcs_map_entry *m)
+{
+	TRACE( MAP_FMT " refcnt:%d\n", MAP_ARGS(m), atomic_read(&m->__refcnt));
+	BUG_ON(atomic_read(&m->__refcnt) < 0);
+
+	if (m->state & PCS_MAP_DEAD) {
+		spin_unlock(&m->lock);
+		return 0;
+	}
+
+	if (atomic_inc_return(&m->__refcnt) == 1)
+		map_del_lru(m);
+
+	return 1;
+}
+
+static inline struct pcs_map_entry *pcs_map_get(struct pcs_map_entry *m)
+{
+	spin_lock(&m->lock);
+	if (!pcs_map_get_locked(m)) {
+		spin_unlock(&m->lock);
+		m = NULL;
+	} else
+		spin_unlock(&m->lock);
+
+	return m;
+}
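
Taken together, the helpers above tie refcounting to the LRU: the first reference removes the map from the LRU (map_del_lru() in pcs_map_get_locked()), and the last put either re-adds it (pcs_map_put_locked()) or hands it to __pcs_map_put() (pcs_map_put()). The usage pattern seen throughout pcs_map.c is roughly:

	/* sketch of the typical caller pattern */
	struct pcs_map_entry *m = pcs_find_get_map(di, offset);	/* returns with a reference held */
	if (m) {
		/* inspect m->state / submit IO, taking m->lock where needed */
		pcs_map_put(m);		/* drop the reference; may free the map on the last put */
	}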
+
+
+#endif /* _PCS_MAP_H_ */
diff --git a/fs/fuse/kio/pcs/pcs_mds_prot.h b/fs/fuse/kio/pcs/pcs_mds_prot.h
new file mode 100644
index 000000000000..80c20fde1537
--- /dev/null
+++ b/fs/fuse/kio/pcs/pcs_mds_prot.h
@@ -0,0 +1,1335 @@
+#ifndef _PCS_MDS_PROT_H_
+#define _PCS_MDS_PROT_H_ 1
+
+#include "pcs_rpc_prot.h"
+
+
+/* This file contains client interface to MDS.
+ */
+
+/* ---- limits */
+
+#define PCS_MDS_MAX_MSG_SIZE		0x21000 /* So we can transfer a fuse request in a single message */
+#define PCS_MDS_MAX_RESP_MSG_SIZE	PCS_MDS_MAX_MSG_SIZE
+#define PCS_MDS_MAX_PATH		0x10000
+
+/* ---- basic structures */
+
+/* The generation value represents the last paxos commit number. It is sent back and forth
+ * to the client to ensure the MDS already has all the commits necessary to process the client request.
+ * This approach guarantees consistency even if several MDSes are processing client requests in parallel.
+ */
+
+typedef u64 PCS_MDS_GEN_T;
+
+#define PCS_MDS_GEN_UNDEFINED 0
+
+/* Returns -1 if v1 is older than v2, 0 otherwise (undefined generations compare as equal) */
+static inline int mds_gen_compare(PCS_MDS_GEN_T v1, PCS_MDS_GEN_T v2)
+{
+	if (v1 == PCS_MDS_GEN_UNDEFINED || v2 == PCS_MDS_GEN_UNDEFINED)
+		return 0;
+	if ((s64)(v1 - v2) < 0)
+		return -1;
+	return 0;
+}
+
+/* Common header of all messages */
+struct pcs_mds_hdr
+{
+	struct pcs_rpc_hdr	h;
+	PCS_MDS_GEN_T		mds_gen;
+	PCS_CONFIG_SEQ_T	cfg_version;
+	u32			cluster_version;
+	u32			flags; /* PCS_MDS_F_XXX */
+	u32			reserved;
+} __attribute__((aligned(8)));
+
+/* Request header flags */
+#define PCS_MDS_F_IS_MASTER   1	    /* Set on reply if server is master */
+#define PCS_MDS_F_NEED_MASTER 2	    /* Request will fail with PCS_ERR_MDS_NOT_MASTER error if server is not master */
+#define PCS_MDS_F_CLNT_VERSION 0x80 /* Client supply its version in the message */
+/* Check that the client version (passed in cluster_version) is not less than the cluster version.
+ * Returns PCS_ERR_CLNT_VERSION otherwise. */
+#define PCS_MDS_F_CHK_VERSION 0x100
+
+/*
+ * CS information
+ */
+
+typedef u16 pcs_cs_io_prio_t;
+typedef u8  pcs_cs_net_prio_t;
+
+/* CS info flags */
+enum {
+	CS_FL_LOCAL	  = 1,	  /* CS is on the same host as the client */
+	CS_FL_LOCAL_SOCK  = 2,	  /* CS listens on local socket */
+	CS_FL_INACTIVE	  = 0x10, /* CS is not sending pings for some time */
+	CS_FL_REPLICATING = 0x20, /* This CS is replicating this map */
+	CS_FL_FAILED	  = 0x40, /* This CS has failed */
+	CS_FL_ROLE	  = 0xFF00,/* Role of this CS in raid array, 0..depth-1 are data chunks, the rest are syndrome */
+	CS_FL_ROLE_LOG	  = 8,
+};
+
+#define CS_FL_ROLE_GET(flags) (((flags) & CS_FL_ROLE) >> CS_FL_ROLE_LOG)
+#define CS_FL_ROLE_FLAGS(role) (CS_FL_ROLE & ((role) << CS_FL_ROLE_LOG))
+
+struct pcs_cs_info {
+	/* CS node id */
+	PCS_NODE_ID_T		id;
+	/* Integrity sequence number updated every time the CS restarts without properly flushing all client's data */
+	PCS_INTEGRITY_SEQ_T	integrity_seq;
+	/* Access priority (higher values are preferable) based on the IO activity, 0 means unknown */
+	pcs_cs_io_prio_t	io_prio;
+	/* Network priority (higher values are preferable) based on the network distance, 0 means unknown */
+	pcs_cs_net_prio_t	net_prio;
+	/* QoS level of this CS (higher values are preferable) */
+	u8			qos;
+	/* Flags (CS_FL_XXX) */
+	u32			flags;
+	u32			reserved;
+	/* Primary network address */
+	PCS_NET_ADDR_T		addr;
+} __attribute__((aligned(8)));
+
+struct pcs_cs_addr_info
+{
+	PCS_NODE_ID_T		id;
+	PCS_INTEGRITY_SEQ_T	integrity_seq;
+	u32			naddr;
+	PCS_NET_ADDR_T		addr[1];
+} __attribute__((aligned(8)));
+
+/* ---- connection request
+ * The following structure serves as a payload for RPC connect messages to deliver MDS server list to the client.
+ */
+
+#define PCS_MDS_CONNECT_PAYLOAD PCS_RPC_APP_PAYLOAD_BASE
+
+struct pcs_mds_node_info {
+	PCS_NODE_ID_T	id;
+	PCS_NET_ADDR_T	addr;
+} __attribute__((aligned(8)));
+
+struct pcs_mds_conn_payload
+{
+	PCS_MDS_GEN_T		mds_gen;	 /* The last commit sequence number */
+	PCS_MASTER_GENID_T	mds_master_ver;	 /* The mds epoch number (see master field of PCS_MAP_VERSION_T) */
+	u16			mds_list_len;	 /* The number of MDSes in list */
+	s16			mds_master_idx;	 /* The index of the master in the list (negative means no master is known) */
+	struct pcs_mds_node_info mds_list[1];	  /* The list of MDS */
+} __attribute__((aligned(8)));
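
Since mds_list is declared as a one-element trailing array, the payload size on the wire presumably follows the usual variable-length pattern (illustrative only, assuming mds_list_len >= 1):

	/* illustrative size computation for a payload carrying n MDS entries */
	size_t payload_sz = offsetof(struct pcs_mds_conn_payload, mds_list) +
			    n * sizeof(struct pcs_mds_node_info);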
+
+/* ---- chunk server resolution request/response
+ * Client issues it to resolve server ID to network address
+ * The message is the same for request and response
+ */
+
+#define PCS_MDS_CS_RESOLVE_REQ	(PCS_RPC_MDS_CLIENT_BASE + 0x20)
+#define PCS_MDS_CS_RESOLVE_RESP	(PCS_MDS_CS_RESOLVE_REQ | PCS_RPC_DIRECTION)
+
+struct pcs_mds_cs_resolve_msg
+{
+	struct pcs_mds_hdr	hdr;
+	struct pcs_cs_addr_info	info; /* in/out */
+} __attribute__((aligned(8)));
+
+/* ---- lease requests
+ * A lease provides the mechanism for mutual exclusion of the operations referencing a particular name. The name for
+ * which the lease is being requested may or may not refer to an existing file. Getting an exclusive lease for a not yet existing
+ * file is required to ensure exclusive file creation semantics.
+ *
+ * Once the lease is granted it must be updated periodically by client alive requests and ultimately released. Failing
+ * to release an exclusive lease has a strong performance impact, since the MDS will have to stop the corresponding
+ * IO operations if the file is later accessed by another client.
+ *
+ * The message type is pcs_mds_lease_msg (same for request and response). If the lease cannot be acquired, pcs_rpc_error_resp
+ * will be returned.
+ */
+
+#define PCS_MDS_LEASE_REQ	(PCS_RPC_MDS_CLIENT_BASE + 2)
+#define PCS_MDS_LEASE_RESP	(PCS_MDS_LEASE_REQ | PCS_RPC_DIRECTION)
+
+/* Lease flags. */
+enum
+{
+/* Release lease if being held. */
+	PCS_LEASE_NONE = 0,
+/* Shared lease. May be acquired for reading (not mandatory though). */
+	PCS_LEASE_SHARED,
+/* Exclusive lease. Mandatory for file creation, deletion, rename, truncation, resizing and write access. */
+	PCS_LEASE_EXCL,
+/* Lease type mask */
+	PCS_LEASE_TYPE_MASK = PCS_LEASE_SHARED|PCS_LEASE_EXCL,
+/* Just refresh the lease. Return an error if the lease did not exist prior to the call. */
+	PCS_LEASE_REFRESH = 0x10,
+/* Use timeout from the message instead of the system-wide. */
+	PCS_LEASE_CUSTOM_TOUT = 0x20,
+/* Update all leases granted to the client. The name argument is ignored. If set no other flags are allowed. */
+	PCS_LEASE_ALIVE = 0x100,
+/* Release all leases granted to the client. The name argument is ignored. */
+	PCS_LEASE_DROP_ALL = 0x200,
+/* Query file existence. Just saves one file message in some common use cases. */
+	PCS_LEASE_QUERY_FILE = 0x1000,
+/* Update file modification time */
+	PCS_LEASE_UP_FILE_MTIME = 0x2000,
+/* Enforce strict path checking on file lookup.
+ * If it is set, an attempt to look up a file with a dir object missing in the path will fail with the PCS_ERR_NOT_FOUND error.
+ */
+	PCS_LEASE_POSIX_PATH = 0x10000,
+/* The following bits are reserved, they can't be set by the client. */
+	PCS_LEASE_RESERVED_ = 0xff000000,
+};
+
+/* Result flags */
+enum
+{
+	PCS_LRES_GRANTED  = 0x1,
+	PCS_LRES_RELEASED = 0x2,
+/* File exists flag. The file existence is being checked if PCS_LEASE_QUERY_FILE is set on input.
+ * If the flag is set the file_id is valid on output.
+ */
+	PCS_LRES_FILE_EXISTS = 0x100,
+/* The lease ID is returned (for compatibility with old code) */
+	PCS_LRES_ID_VALID    = 0x200,
+};
+
+struct pcs_mds_lease_msg
+{
+	struct pcs_mds_hdr	hdr;
+	u32			flags;	  /* request flags */
+	u32			result;	  /* result flags */
+	u32			tout;	  /* Lease expiration timeout (in milliseconds) on output.
+					   * May be specified on input with PCS_LEASE_CUSTOM_TOUT flag.
+					   * Client may use custom timeout to create lease with shorter lifetime than
+					   * the default one.
+					   */
+	u32			reserved;
+	struct pcs_mds_fileinfo	finfo;	  /* file info (valid on output if PCS_LRES_FILE_EXISTS result flag is set) */
+	union {
+		PCS_FILE_ID_T	root;	  /* root dir ID on input */
+		PCS_FILE_ID_T	lease_id; /* lease inode id on output */
+	};
+	struct pcs_path		name;	  /* path relative to the root dir */
+} __attribute__((aligned(8)));
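
To illustrate the conventions described above, a client requesting an exclusive lease and checking file existence in the same round trip might fill the request roughly as follows (a sketch only; allocation of the message with room for the path, the pcs_mds_hdr/RPC fields and the copy of the path string are omitted):

	/* sketch: request an exclusive lease on "name" under root_id,
	 * and ask whether the file exists (PCS_LEASE_QUERY_FILE) */
	msg->flags    = PCS_LEASE_EXCL | PCS_LEASE_QUERY_FILE;
	msg->result   = 0;
	msg->tout     = 0;		/* 0: use the system-wide timeout */
	msg->reserved = 0;
	msg->root     = root_id;	/* root dir ID on input */
	/* msg->name.sz / msg->name.str carry the path relative to the root */

On the response, PCS_LRES_GRANTED in result confirms the lease, and finfo is valid only when PCS_LRES_FILE_EXISTS is also set.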
+
+/*
+ * Refresh the list of leases identified by their IDs. The request message type is struct pcs_mds_lease_refresh_msg.
+ * The request always succeeds, returning just pcs_mds_hdr.
+ */
+
+#define PCS_MDS_LEASE_REFRESH_REQ	(PCS_RPC_MDS_CLIENT_BASE + 10)
+#define PCS_MDS_LEASE_REFRESH_RESP	(PCS_MDS_LEASE_REFRESH_REQ | PCS_RPC_DIRECTION)
+
+struct pcs_mds_lease_refresh_msg
+{
+	struct pcs_mds_hdr	hdr;
+	u64			reserved;
+	u32			nfailed;     /* The number of leases that were failed to refresh */
+	u32			nleases;     /* The number of lease ID that follows */
+	PCS_FILE_ID_T		lease_id[0]; /* The array of lease ID to refresh */
+} __attribute__((aligned(8)));
+
+/* ---- file request
+ * Supports file create, rename, delete and query operations.
+ * The file model assumes that every file has a single name as well as a fixed length ID assigned to it by the MDS itself. The file create and rename
+ * operations are made immune to MDS crashes so they can be safely restarted by the client. The MDS uses the client ID to detect restarted
+ * operations, so the client must ensure its uniqueness.
+ *
+ * The file attributes are filled on output whenever the file is referenced. The replication and optionally size (with PCS_FFL_RESIZE flag)
+ * attributes may be used on input as well. The operation to be done is defined by the combination of the op and flags fields.
+ *
+ * The message type is pcs_mds_file_msg (same for request and response). On failure pcs_rpc_error_resp will be returned.
+*/
+
+#define PCS_MDS_FILE_REQ	(PCS_RPC_MDS_CLIENT_BASE + 4)
+#define PCS_MDS_FILE_RESP	(PCS_MDS_FILE_REQ | PCS_RPC_DIRECTION)
+
+/* File map type (storage type) */
+enum
+{
+	PCS_MAP_PLAIN = 0, /* Plain replicas */
+	PCS_MAP_RAID6 = 1, /* RAID6 encoded replicas */
+	PCS_MAP_RS    = 2, /* Reed-Solomon encoded replicas */
+	PCS_MAP_PLAIN_LOGSTREAM = 3, /* PCS_MAP_PLAIN, but LOGSTREAM is to be used */
+	/* Combined map types are implemented by the client as a collection of files placed in
+	 * the container directory - see PCS_FATTR_CONTAINER.
+	 */
+	PCS_MAP_COMBINED = 0x80,
+	PCS_MAP_LS = PCS_MAP_COMBINED, /* Log structured storage */
+};
+
+/* Max inline file size */
+#define PCS_MAX_INLINE_SIZE 0x100000 /* 1Mb */
+
+/* File operation. It determines the treatment of the file name and ID parameters in the message. */
+enum
+{
+/* Identify file by its ID. May be used to update file attributes depending on other flags.
+ * Combined with PCS_FFL_DELETE will delete the file.
+ */
+	PCS_FOP_TOUCH = 0,
+/* Similar to TOUCH but identify file by name, setting ID on output.
+ */
+	PCS_FOP_RESOLVE,
+/* Rename the file with specified ID. The exclusive lease on both the current file name and the new one is required.
+ * If the file with new name exists it will be replaced. If the client wants to ensure
+ * exclusive rename semantic it must check the target existence first (via pcs_mds_lease_msg message). Fails if
+ * the file with requested ID does not exists. Note that rename operation will succeed if restarted.
+ */
+	PCS_FOP_RENAME,
+/* Rename file replacing the existing target identified by info.attr.id renaming the target
+ * at the same time. The source file is identified by info.attr.src_id.
+ * This operation is intended to use in scenarios when the file being deleted as
+ * a result of the rename operation is open by the client and should be renamed onto the
+ * temporary file.
+ */
+	PCS_FOP_REPLACE,
+};
+
+/* File operation flags */
+enum
+{
+/* Update existing file size.
+ * Valid with PCS_FOP_TOUCH, PCS_FOP_RESOLVE operations.
+ * The exclusive lease on the file is required.
+ */
+	PCS_FFL_RESIZE = 1,
+
+/* Create the file if it does not exist yet. Valid with the PCS_FOP_RESOLVE operation.
+ * The exclusive lease on the file name is required. If the client wants to ensure exclusive
+ * creation semantics it must check the file's existence first (via pcs_mds_lease_msg message).
+ * Note that the create operation will succeed if restarted. If the object is already created it will
+ * be left intact and the response will contain its attributes.
+*/
+	PCS_FFL_CREATE = 0x10,
+
+/* Create file in container with specific map type (see PCS_MAP_XXX) passed in message as info.repl.policy.create_type.
+ * The lease may be acquired at the container level.
+ */
+	PCS_FFL_CREATE_IN_CONTAINER = 0x20,
+
+/* Delete the file being referenced. Valid with PCS_FOP_TOUCH, PCS_FOP_RESOLVE.
+ * The exclusive lease on the file is required. Not compatible with any other flags.
+ * Note that delete operation will succeed if restarted.
+ */
+	PCS_FFL_DELETE = 0x100,
+
+/* Enforce strict path checking. If the flag is set:
+ *     - an attempt to create or resolve a file with a dir object missing in the path will fail with the PCS_ERR_NOT_FOUND error
+ *     - an attempt to delete or rename an object with child objects will fail with the PCS_ERR_NON_EMPTY_DIR error
+ */
+	PCS_FFL_POSIX_PATH = 0x10000,
+
+/* Recursive action */
+	PCS_FFL_RECURSIVE = 0x100000,
+};
+
+/* File operation result */
+enum {
+	PCS_FRES_FILE_CREATED  = 0x1,
+	PCS_FRES_FILE_RENAMED  = 0x2,
+	PCS_FRES_FILE_DELETED  = 0x8,
+/* Note that upon replacing the existing file on rename both PCS_FRES_FILE_RENAMED and PCS_FRES_FILE_DELETED will be set. */
+};
+
+struct pcs_mds_file_msg
+{
+	struct pcs_mds_hdr	hdr;
+	u32			op;
+	u32			flags;
+	u32			result;
+	u32			reserved;
+	PCS_FILE_ID_T		root; /* root dir ID on input */
+	struct pcs_mds_fileinfo info; /* file info */
+	struct pcs_path		name; /* the path relative to the root */
+} __attribute__((aligned(8)));
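
For example, a create-by-name request following the rules above would use op = PCS_FOP_RESOLVE with flags = PCS_FFL_CREATE, the root dir ID in root, the path in name, and the desired replication/size attributes (if any) in info; on success the response carries PCS_FRES_FILE_CREATED in result and the assigned file ID in info.attr. This is a reading of the flags documented above, not an additional wire rule.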
+
+/* The aligned size of the pcs_path structure with 1 byte reserved for terminating 0.
+ * Note that the client is not required to zero-pad strings though the strings returned
+ * by MDS are always zero padded.
+ */
+#define PCS_MDS_FILENAME_SZ_ALIGN(sz)  PCS_PATH_SZ_ALIGN(sz)
+#define PCS_MDS_FILENAME_SZ_ALIGNED(n) PCS_PATH_SZ_ALIGNED(n)
+
+/* ---- file attributes request
+ * Get or set particular file attributes, with an optional possibility to apply them recursively.
+ * The message may contain data of arbitrary size depending on the op parameter.
+ * The valid_mask parameter may contain a bitmask of the individual valid data attributes.
+ * Some operations support getting/setting parameters of the filesystem root, which is
+ * equivalent to changing the global configuration, with an optional possibility to apply the new
+ * settings to all existing files.
+ */
+
+#define PCS_MDS_FATTR_REQ	(PCS_RPC_MDS_CLIENT_BASE + 6)
+#define PCS_MDS_FATTR_RESP	(PCS_MDS_FATTR_REQ | PCS_RPC_DIRECTION)
+
+struct pcs_mds_fattr_msg
+{
+	struct pcs_mds_hdr	hdr;
+	u32			op;	      /* PCS_FA_XXX */
+	u32			reserved;     /* currently 0 */
+	union {
+		u64		valid_mask;   /* bitmask of valid attributes */
+		struct {
+			/* The offset and the size of the addressed data range. Used for associated
+			 * data addressing (PCS_FA_DATA). Otherwise ignored.
+			 */
+			u32	attr_offset;
+			u32	attr_size;
+		};
+	};
+	PCS_FILETIME_T		modify_ts;    /* modification time if appropriate */
+	/* the following field is reserved for the case when we can have more than one version of the attribute data structure */
+	u32			data_version; /* currently 0 */
+	u32			data_size;    /* the size in bytes of the attribute data */
+	PCS_FILE_ID_T		root;	      /* root dir ID on input */
+	struct pcs_path		name;	      /* the path relative to the root */
+	/*
+	 * The offset of the data relative to the name is PCS_MDS_FILENAME_SZ_ALIGNED(name)
+	 */
+};
+
+/* The op field content */
+enum {
+	PCS_FA_SET	   = 0x80000000,       /* Set attributes */
+	PCS_FA_RECURSIVE   = 0x40000000,       /* Set recursively */
+	PCS_FA_BY_ID	   = 0x20000000,       /* Identify file by ID, path is ignored. Use it with root=0 to address the root itself. */
+	PCS_FA_MASK_	   = (PCS_FA_BY_ID-1), /* The bitmask for attribute type */
+	/* File attributes (set only). Currently only PCS_FATTR_INLINE may be set/cleared and only on the directory. */
+	PCS_FA_ATTRIB = 0x1,
+	/* Associated data. The file must have PCS_FATTR_INLINE attribute. The total size of the data equals to the size of the file. */
+	PCS_FA_DATA = 0x10,
+	/* System attributes represented by struct pcs_mds_sys_info (set only) */
+	PCS_FA_SYS = 0x80,
+	/* Replication attributes represented by struct pcs_mds_repl_info (set only) */
+	PCS_FA_REPL = 0x100,
+	/* Hot hosts represented by struct pcs_mds_hot_hosts (get only) */
+	PCS_FA_HOT_HOSTS = 0x200,
+	/* Don't set anything, just drop all leases */
+	PCS_FA_DROP_LEASES = 0x10000,
+	/* .. whatever you need .. */
+};
+
+/* Valid mask for system attributes (PCS_FA_SYS) */
+enum {
+	PCS_FA_SYS_MAP_TYPE	= 0x1,
+	PCS_FA_SYS_CHUNK_SIZE	= 0x10,
+	PCS_FA_SYS_STRIPE_DEPTH = 0x100,
+	PCS_FA_SYS_REDUNDANCY	= 0x200,
+	PCS_FA_SYS_TOLERANCE	= 0x400,
+	PCS_FA_SYS_STRIP_WIDTH	= 0x1000,
+};
+
+/* Valid mask for replication attributes (PCS_FA_REPL) */
+enum {
+	PCS_FA_REPL_REPLICAS  = 1,
+	PCS_FA_REPL_PLACEMENT = 0x10,
+	PCS_FA_REPL_QOS	      = 0x100,
+};
+
+#define PCS_N_HOT_HOSTS 8
+
+/* Hot hosts structure */
+struct pcs_mds_hot_hosts {
+	struct {
+		PCS_NODE_ID_T	id;
+		u64		nrepl;
+	} host[PCS_N_HOT_HOSTS];
+} __attribute__((aligned(8)));
+
+/* ---- read dir request
+ * Read directory.
+ * The directory information is maintained by MDS treating / as path separator.
+ * The following paths are considered identical: /a/b, /a/b/, a/b, a//b
+ *
+ * The message type is pcs_mds_readdir_msg (same for request and response). On failure the pcs_rpc_error_resp will be returned.
+ */
+
+#define PCS_MDS_READDIR_REQ	(PCS_RPC_MDS_CLIENT_BASE + 8)
+#define PCS_MDS_READDIR_RESP	(PCS_MDS_READDIR_REQ | PCS_RPC_DIRECTION)
+
+/* The dir entry flags */
+enum {
+	/* The entry corresponds to the file */
+	PCS_DFL_FILE = 1,
+	/* The entry corresponds to the directory (file with PCS_FATTR_DIR) */
+	PCS_DFL_DIR  = 2,
+	/* The entry has child objects */
+	PCS_DFL_HAS_CHILDREN = 4,
+	/* The entry corresponds to a symlink (file with PCS_FATTR_LINK) */
+	PCS_DFL_LINK = 8,
+	/* The entry is storage container */
+	PCS_DFL_CONTAINER = 0x40,
+	/* The dir end marker, the name is empty */
+	PCS_DFL_END  = 0x100,
+	/* Entry is using extended format */
+	PCS_DFL_EX_INFO = 0x10000,
+	/* Entry is followed by symlink target */
+	PCS_DFL_EX_LINK = 0x20000
+};
+
+struct pcs_mds_dentry
+{
+	u32			flags;
+	u32			reserved;
+	PCS_FILE_ID_T		id;
+	struct pcs_path		name;
+} __attribute__((aligned(8)));
+
+#define PCS_MDS_DENTRY_SZ(d)	     (offsetof(struct pcs_mds_dentry, name.str) + (d).name.sz)
+#define PCS_MDS_DENTRY_SZ_ALIGN(sz)  (offsetof(struct pcs_mds_dentry, name) + PCS_MDS_FILENAME_SZ_ALIGN(sz))
+#define PCS_MDS_DENTRY_SZ_ALIGNED(d) (offsetof(struct pcs_mds_dentry, name) + PCS_MDS_FILENAME_SZ_ALIGNED((d).name))
+
+struct pcs_mds_dentry_ex
+{
+	u32			flags;
+	u32			reserved;
+	struct pcs_mds_fileinfo	info;
+	struct pcs_path		name;
+} __attribute__((aligned(8)));
+
+#define PCS_MDS_DENTRY_EX_SZ(d)		(offsetof(struct pcs_mds_dentry_ex, name.str) + (d).name.sz)
+#define PCS_MDS_DENTRY_EX_SZ_ALIGN(sz)	(offsetof(struct pcs_mds_dentry_ex, name) + PCS_MDS_FILENAME_SZ_ALIGN(sz))
+#define PCS_MDS_DENTRY_EX_SZ_ALIGNED(d) (offsetof(struct pcs_mds_dentry_ex, name) + PCS_MDS_FILENAME_SZ_ALIGNED((d).name))
+
+/* The request flags */
+enum {
+/* The directory is identified by its ID, the path argument is ignored
+ */
+	PCS_READDIR_BY_ID = 0x100,
+/* Enforce strict path checking on path lookup.
+ * If it is set:
+ *    - an attempt to resolve a path with a dir object missing will fail with the PCS_ERR_NOT_FOUND error
+ *    - an attempt to resolve something that is not a directory will fail with the PCS_ERR_NOT_DIR error
+ *    - child entries without dir/file objects won't be returned
+ */
+	PCS_READDIR_POSIX_PATH = 0x10000,
+/* Query extended info - returns pcs_mds_dentry_ex structures.
+ */
+	PCS_READDIR_EX_INFO = 0x100000,
+/* Pack links target right after extended info.
+ */
+	PCS_READDIR_EX_LINKS = 0x200000,
+};
+
+struct pcs_mds_readdir_msg
+{
+	struct pcs_mds_hdr	hdr;
+	/* (in) The maximum number of entries to return, 0 - no limit */
+	u32			dent_max;
+	/* (in/out) The number of entries that follows */
+	u32			dent_cnt;
+	/* (in) The number of entries to skip */
+	u32			dent_skip;
+	/* (in) The limit on the message size in bytes, 0 - no limit */
+	u32			max_size;
+	/* (in) Flag bits */
+	u32			flags;
+	/* Reserved for future use */
+	u32			reserved;
+	/* (in) root dir ID or the directory ID if PCS_READDIR_BY_ID flag is set */
+	PCS_FILE_ID_T		root;
+	/* (in) The path relative to the root (ignored if PCS_READDIR_BY_ID flag is set) */
+	struct pcs_path		path;
+	/* After the end of the path a number of pcs_mds_dentry entries are placed sequentially with 8 byte alignment,
+	 * see PCS_MDS_FILENAME_SZ_ALIGNED, PCS_MDS_DENTRY_SZ_ALIGNED, PCS_MDS_FIRST_DENTRY_OFFSET for details.
+	 * In case there are more than dent_max-1 entries in the dir referred to by path, or the max_size limit is exceeded,
+	 * the directory content may be returned by several calls. Every next call may either specify the dent_skip
+	 * count or pass the last returned entry as the single element of the pcs_mds_dentry list on input.
+	 * Either dent_max or max_size must have a nonzero value on input. The response may have zero dent_cnt
+	 * only in case max_size is too small for the dentry to be returned.
+	 */
+} __attribute__((aligned(8)));
+
+#define PCS_MDS_FIRST_DENTRY_OFFSET(msg) (offsetof(struct pcs_mds_readdir_msg, path) + PCS_MDS_FILENAME_SZ_ALIGNED((msg).path))
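
The layout comment in the structure implies the reply is walked by advancing a byte offset by the aligned entry size. A sketch of that walk for the default (non PCS_READDIR_EX_INFO) format, using the macros defined above and assuming resp points at a complete reply:

	/* sketch: iterate dentries packed after the path in the reply */
	char *p = (char *)resp + PCS_MDS_FIRST_DENTRY_OFFSET(*resp);
	u32 i;

	for (i = 0; i < resp->dent_cnt; i++) {
		struct pcs_mds_dentry *d = (struct pcs_mds_dentry *)p;

		if (d->flags & PCS_DFL_END)
			break;			/* end-of-directory marker, empty name */
		/* consume d->id, d->flags, d->name here */
		p += PCS_MDS_DENTRY_SZ_ALIGNED(*d);
	}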
+
+/* ---- chunk map request/response
+ * Before the client may start IO the replication path must be set up.
+ * The client will be given the map version and the id of the chunk server the IO
+ * messages must be sent to. All other details are hidden inside the MDS to CS protocol.
+ * In case an IO request returns an error, the client must set last_err accordingly, identify
+ * the failed CS via the offender field, request a new map and restart the failed IO operation.
+ *
+ * The message type is pcs_mds_map_msg (same for request and response).
+ */
+
+#define PCS_MDS_MAP_REQ		(PCS_RPC_MDS_CLIENT_BASE + 0x10)
+#define PCS_MDS_MAP_RESP	(PCS_MDS_MAP_REQ | PCS_RPC_DIRECTION)
+
+/*
+ * Mode bits
+ */
+#define PCS_MDS_MAP_MODE_READ	1
+#define PCS_MDS_MAP_MODE_WRITE	2
+/*
+ * The retry bit must be set in case we are requesting the map after IO failure.
+ * The corresponding last_err, offender, version and root fields must be set in such case in accordance to the failed map.
+ */
+#define PCS_MDS_MAP_RETRY 0x100
+/* The dirty bit must be set when client completed some write, but it is still not synced */
+#define PCS_MDS_MAP_DIRTY 0x200
+/* "new" bit is set by client on RAID maps, which require allocation of new chunk. In this case
+ * "chunk_size" usually uninitialized by client defines size of chunk to be allocated by MDS.
+ * NOTE: all map requests on the last chunk may have "chunk_size" non-zero and this means
+ * client wishes to expand the last chunk.
+ */
+#define PCS_MDS_MAP_NEW	  0x400
+/* This bit is set by client in request, if it contains physical size of chunk for CS.
+ * It is used when MDS cannot calculate size of chunk on CS only from logical chunk size,
+ * which is the case for RAID encoded files with variable strip size. Unless this bit is set,
+ * physical size of chunk on CS is calculated from logical chunk size by formulae already
+ * implemented in MDS.
+ *
+ * MDS sets this flag when it returns physical size of chunk in "psize_ret", otherwise
+ * this flag must be cleared in messages in MDS->client direction. Normally, MDS should
+ * return "psize_ret" when it has chunk_psize in hands.
+ */
+#define PCS_MDS_MAP_PSIZE 0x800
+
+/* Dirty chunk size is 1M to cover 64M chunk with 64 bits. */
+#define PCS_DIRTY_CHUNK_SIZE	(1024*1024)
+
+/* Map flags */
+#define PCS_MDS_MAP_ZERO_CHUNK	1	/* The chunk is not yet allocated, valid in response to read-only requests */
+
+struct pcs_mds_map_msg
+{
+	struct pcs_mds_hdr	hdr;
+	PCS_CHUNK_ID_T		chunkid;	/* The chunk id (file ID, offset pair) - must be provided on input */
+	u16			mode;		/* read/write mode and other client supplied flags */
+	u16			flags;		/* flags set by the server (replicating) */
+	union {
+		u32		last_err;	/* last error returned by CS if requesting map on IO retry (in) */
+		u32		psize_ret;	/* length of chunk on CS (out) */
+	};
+	PCS_NODE_ID_T		offender;	/* the failed CS id on retry */
+	PCS_MAP_VERSION_T	version;	/* in (on retry) / out */
+	PCS_CHUNK_UID_T		uid;		/* chunk unique id on out */
+	union {
+		u32		read_tout;	/* read	 timeout (msec) on out */
+		u32		chunk_psize;	/* physical size of chunk on CS on in */
+	};
+	u32			write_tout;	/* write timeout (msec) on out */
+	struct pcs_cs_info	root;		/* in (on retry) / out */
+	union {
+		struct {
+			u32	chunk_size;	/* The chunk size */
+			u32	child_cs_cnt;	/* The number of non-root CS entries that follows */
+		};
+		u64		zero_chunk_size;/* Size of hole, valid with PCS_MDS_MAP_ZERO_CHUNK */
+	};
+	/* The list of non-root chunk servers. Any of them may be used as the target for read requests */
+	struct pcs_cs_info	child_cs_list[0];
+
+} __attribute__((aligned(8)));
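
Tying this to the client code in pcs_map.c (map_remote_error_nolock() records PCS_ERR_CSD_STALE_MAP together with the offending CS id), a retry request presumably carries the failed map's identity back to the MDS along these lines (sketch only; the failed_* values are placeholders and header/allocation details are omitted):

	/* sketch: re-request a write map after a CS reported a stale map */
	msg->chunkid  = failed_chunk_id;	/* file ID + chunk offset */
	msg->mode     = PCS_MDS_MAP_MODE_WRITE | PCS_MDS_MAP_RETRY;
	msg->last_err = PCS_ERR_CSD_STALE_MAP;	/* error returned by the CS */
	msg->offender = failed_cs_id;		/* the CS that failed */
	msg->version  = failed_map_version;	/* version of the map we were using */
	msg->root     = failed_root_cs;		/* root CS info from the failed map */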
+
+/* known types for ID generators */
+enum {
+	MDS_MID_GEN_TYPE = 0,
+	MDS_CSID_GEN_TYPE,
+	MDS_CID_GEN_TYPE,
+};
+
+/* ---- monitor mds state
+ */
+
+#define PCS_MDS_MONITOR_REQ		(PCS_RPC_MDS_CLIENT_BASE + 0x14)
+#define PCS_MDS_MONITOR_RESP		(PCS_MDS_MONITOR_REQ | PCS_RPC_DIRECTION)
+
+#define PCS_PERFCNT_MAXSIZE		PCS_MDS_MAX_RESP_MSG_SIZE
+
+enum
+{
+	PCS_PC_GEN_UPTIME		= 1, /* Milliseconds since program start */
+	PCS_PC_GEN_BUILD_VERSION	= 2, /* Build version string */
+	PCS_PC_GEN_LOAD			= 4, /* Activity time in msec */
+	PCS_PC_GEN_VERSION		= 5, /* MDS's version */
+
+	PCS_PC_LJ_TX_COUNT		= 0x10, /* The local journal transaction count / rate */
+	PCS_PC_LJ_TX_TOTAL_SZ		= 0x11, /* The local journal transaction total size / rate */
+	PCS_PC_LJ_COMMIT_COUNT		= 0x12, /* The local journal commit count / rate */
+	PCS_PC_LJ_WRITE_TOTAL		= 0x13,	/* The total time spent writing the local journal (msec) */
+	PCS_PC_LJ_WRITE_TIME		= 0x14,	/* The mean local journal transaction writing time (msec) */
+
+	PCS_PC_RJ_STATUS		= 0x20, /* RJ_STATE_XXX, see rjournal.h */
+	PCS_PC_RJ_ROUND			= 0x21, /* transaction number */
+	PCS_PC_RJ_MASTER_KNOWN		= 0x22, /* is master known? */
+	PCS_PC_RJ_MASTER_ID		= 0x23, /* master node id */
+	PCS_PC_RJ_MASTER_EPOCH		= 0x24, /* master generation number */
+	PCS_PC_RJ_MASTER_UPTIME		= 0x25, /* time since last master change (ms) */
+	PCS_PC_RJ_NODES_STATE		= 0x26, /* paxos node's state */
+
+	PCS_PC_REPL_NORM		= 0x31, /* normal number of replicas */
+	PCS_PC_REPL_LIMIT		= 0x32, /* minimal number of replicas,
+						   one cannot write to a chunk
+						   that has less or equal
+						   number of replicas */
+	PCS_PC_REPL_MAX			= 0x33, /* maximum number of replicas */
+
+	PCS_PC_CL_VERSION		= 0x40, /* MDS cluster version */
+	PCS_PC_CL_TOTAL_SPACE_TIER	= 0x41, /* total space per tier */
+	PCS_PC_CL_FREE_SPACE_TIER	= 0x42, /* free space per tier */
+	PCS_PC_CL_TOTAL_EFFECTIVE_TIER	= 0x43, /* effective total space available for chunks allocation in tier */
+	PCS_PC_CL_AVAIL_SPACE_TIER	= 0x44, /* the amount of free space available for chunks allocation in tier */
+
+	PCS_PC_CL_TOTAL_EFFECTIVE_X	= 0x45, /* effective total space matrix per tier and locality */
+	PCS_PC_CL_AVAIL_SPACE_X		= 0x46, /* effective available space matrix per tier and locality */
+
+	PCS_PC_CL_STOR_VERSION		= 0x50, /* storage cluster version */
+	PCS_PC_CL_TOTAL_SPACE		= 0x51, /* total space in the cluster */
+	PCS_PC_CL_FREE_SPACE		= 0x52, /* free space in the cluster */
+	PCS_PC_CL_AVAIL_SPACE		= 0x53, /* the amount of free space available for chunks allocation in the cluster */
+	PCS_PC_CL_TOTAL_EFFECTIVE	= 0x54, /* effective total space available for chunks allocation in the cluster */
+	PCS_PC_CL_AVAIL_RAW		= 0x55, /* same as PCS_PC_CL_AVAIL_SPACE but ignoring license limitations */
+	PCS_PC_CL_TOTAL_RAW		= 0x56, /* same as PCS_PC_CL_TOTAL_EFFECTIVE but ignoring license limitations */
+
+	PCS_PC_CL_STATUS		= 0x58, /* cluster status (pcs_cluster_status_t) */
+
+	PCS_PC_CL_NODES			= 0x60, /* CS count */
+	PCS_PC_CL_NODES_ACTIVE		= 0x61, /* count of CSs that send pings */
+	PCS_PC_CL_NODES_INACTIVE	= 0x62, /* inactive CS count */
+	PCS_PC_CL_NODES_OFFLINE		= 0x63, /* offline CS count */
+	PCS_PC_CL_NODES_DROPPED		= 0x64, /* count of CSs dropped by administrator */
+	PCS_PC_CL_NODES_AVAIL		= 0x68, /* available for allocation CS count */
+	PCS_PC_CL_NODES_REPLICATING	= 0x69, /* nodes participating in cooperative replication */
+	PCS_PC_CL_AVER_COST		= 0x6a, /* the average allocation cost for available CS */
+	PCS_PC_CL_NODES_FAILED		= 0x6b, /* failed CS nodes count */
+	PCS_PC_CL_NODES_NOSPACE		= 0x6c, /* count of CS nodes without space available for allocation */
+	PCS_PC_CL_NODES_HOT		= 0x6d, /* count of CS nodes considered hot */
+
+	/* cluster chunk info */
+	PCS_PC_CL_CHUNKS_VOID		= 0x70, /* unused chunks */
+	PCS_PC_CL_CHUNKS_PENDING	= 0x71, /* top priority queue for replication, chunk is blocked, client is waiting */
+	PCS_PC_CL_CHUNKS_BLOCKED	= 0x72, /* have too few replicas, writing is impossible */
+	PCS_PC_CL_CHUNKS_URGENT		= 0x73, /* chunks that have limit replicas */
+	PCS_PC_CL_CHUNKS_DEGRADED	= 0x74, /* chunks that have > limit and < normal replicas */
+	PCS_PC_CL_CHUNKS_STANDBY	= 0x75, /* chunks with temporary standby replicas */
+	PCS_PC_CL_CHUNKS_HEALTHY	= 0x76, /* chunks that have >= normal and <= max replicas */
+	PCS_PC_CL_CHUNKS_OVERCOMMITTED	= 0x77, /* chunks that have > max replicas */
+	PCS_PC_CL_CHUNKS_REPLICATING	= 0x78, /* chunks that replicate now */
+	PCS_PC_CL_CHUNKS_OFFLINE	= 0x79, /* chunks that have no replicas */
+	PCS_PC_CL_REPL_DELETING		= 0x7a, /* replicas queued for deletion */
+	PCS_PC_CL_CHUNKS_REPLICATED	= 0x7b,	/* the replicated chunks total / rate */
+	PCS_PC_CL_CHUNKS_REBALANCE_TOTAL= 0x7c, /* the total number of chunks being rebalanced (including committing) */
+	PCS_PC_CL_CHUNKS_REBALANCE_COMM = 0x7d, /* the number of rebalanced chunks being committed */
+	PCS_PC_CL_CHUNKS_REPLICATE	= 0x7e, /* the number of replicas to add on replication */
+	PCS_PC_CL_CHUNKS_UNIQUE		= 0x7f, /* the number of chunks with single replica */
+
+	PCS_PC_REQ_IN			= 0x81, /* number of input requests */
+	PCS_PC_REQ_OUT			= 0x82, /* number of output requests */
+	PCS_PC_REQ_IN_ERR		= 0x84, /* number of input requests with errors */
+	PCS_PC_REQ_IN_ERR_CODE		= 0x85, /* code of the last error */
+	PCS_PC_REQ_IN_ERR_UPTIME	= 0x86, /* time since last error (ms) */
+	PCS_PC_REQ_IN_LATENCY		= 0x87, /* avg processing time (ms) */
+	PCS_PC_REQ_IN_COMMIT_LATENCY	= 0x88, /* avg processing time for requests updating metadata (ms) */
+	PCS_PC_REQ_IN_MAP_LATENCY	= 0x89, /* avg processing time for map requests (ms) */
+	PCS_PC_REQ_PENDING		= 0x8e, /* number of requests being currently processed */
+
+	PCS_PC_LEASE_CNT		= 0x101, /* number of currently active leases */
+	PCS_PC_LEASE_CLIENTS		= 0x103, /* number of clients that have leases */
+
+	PCS_PC_FS_TOTAL_SIZE		= 0x110, /* Total size of all files in bytes */
+	PCS_PC_FS_INODES		= 0x111, /* inode count */
+	PCS_PC_FS_FILES			= 0x112, /* file count */
+	PCS_PC_FS_FILE_MAPS		= 0x113, /* file map count */
+	PCS_PC_FS_CHUNK_MAPS		= 0x114, /* chunk map count */
+	PCS_PC_FS_CHUNK_NODES		= 0x115, /* number of all replicas of all chunks */
+
+	PCS_PC_STOR_STAT		= 0x200, /* struct pcs_perf_stor_stat */
+
+	/* cluster ops info */
+	/* rates are calculated in 5s intervals, every rate is a tuple:
+	 * (1) total number of events, (2) 5 sec diff, (3) avg for last 1m interval, (4) avg for 5m, (5) avg for 15m */
+	PCS_PC_CL_READS			= 0x1101, /* bytes read rate */
+	PCS_PC_CL_WRITES		= 0x1102, /* bytes written rate */
+	PCS_PC_CL_REPL_READS		= 0x1103, /* replication bytes read rate */
+	PCS_PC_CL_REPL_WRITES		= 0x1104, /* replication bytes write rate */
+	PCS_PC_CL_READ_OPS		= 0x1106, /* read ops rate */
+	PCS_PC_CL_WRITE_OPS		= 0x1107, /* write ops rate */
+	PCS_PC_CL_MAPS			= 0x1108, /* map request rate */
+	PCS_PC_CL_FSYNC			= 0x1109, /* fsync() rate */
+	PCS_PC_CL_SYNC			= 0x110a, /* syncfs() rate */
+
+	PCS_PC_CL_IO_LOAD_AVER		= 0x1200, /* average IO load (queue length) across cluster
+						   * (queue length 1.0 corresponds to 5000000) */
+	PCS_PC_CL_IO_LOAD_MAX		= 0x1201, /* maximum IO load (queue length) across cluster */
+	PCS_PC_CL_IO_LAST_BALANCED	= 0x1210, /* the number of hot CSs balanced last time */
+	PCS_PC_CL_IO_LAST_BALANCE_UPTIME= 0x1211, /* time since last balance attempt (ms) */
+
+	PCS_PC_MDS_NODES		= 0x1800, /* the number of MDS nodes in cluster */
+	PCS_PC_MISC_FEATURE_MASK	= 0x1801, /* returns 2 64bit feature mask registers */
+	PCS_PC_MDS_HOST_INFO		= 0x1802, /* return pcs_host_info for MDS */
+	PCS_PC_MDS_HOST_VER_INFO	= 0x1803, /* return pcs_mds_host_info  */
+
+	PCS_PC_MEM_POOLS		= 0x2000, /* overall memory pools statistics */
+	PCS_PC_MEM_POOL			= 0x2001, /* the particular memory pool statistics */
+	PCS_PC_MEM_LJ_USED		= 0x2011, /* mem allocated for local journal */
+	PCS_PC_MEM_RJ_USED		= 0x2012, /* mem allocated for replicated journal */
+	PCS_PC_MEM_RJ_CACHE		= 0x2018, /* the total size of the paxos cache	*/
+	PCS_PC_MEM_PGS_ALLOCATED	= 0x2020, /* the total number of pages allocated for memory pools */
+	PCS_PC_MEM_PGS_FREE		= 0x2021, /* the current number of free pool pages */
+	PCS_PC_MEM_PGS_STANDBY		= 0x2022, /* the current number of standby pool pages */
+
+	PCS_PC_MEM_SNAPSHOTS		= 0x2030, /* the number of snapshots */
+	PCS_PC_MEM_SNAP_OBJS		= 0x2031, /* the number of objects tracked */
+	PCS_PC_MEM_SNAP_OBJS_ORPHAN	= 0x2032, /* the number of deleted objects tracked */
+	PCS_PC_MEM_SNAP_COPIES		= 0x2033, /* the number of serialized object copies */
+	PCS_PC_MEM_SNAP_COPIES_ORPHAN	= 0x2034, /* the number of serialized copies of the deleted objects */
+
+	PCS_PC_MEM_LAST,			  /* max id used in mem info */
+
+	PCS_PC_PROC_MEM_RSS		= 0x3101, /* number of pages the process has in real memory */
+	PCS_PC_PROC_MEM_VSIZE		= 0x3102, /* virtual memory size of process in pages */
+
+	PCS_PC_CS_LIST			= 0x4000, /* CS list */
+
+	PCS_PC_CS_ID			= 0x20000, /* CS id */
+	PCS_PC_CS_CHUNKS		= 0x20001, /* number of chunks in CS */
+	PCS_PC_CS_REG_UPTIME		= 0x20002, /* time since last mds registration (ms) */
+	PCS_PC_CS_REG_ADDR		= 0x20003, /* CS IP addresses currently registered */
+	PCS_PC_CS_VERSION		= 0x20004, /* CS version */
+	PCS_PC_CS_ADM_STATUS		= 0x20005, /* administration status, see PCS_CS_ADM_* */
+	PCS_PC_CS_ACT_STATUS		= 0x20006, /* activity status,	     see PCS_CS_ACT_* */
+	PCS_PC_CS_AVAIL			= 0x20008, /* 1 if CS is available for allocation */
+	PCS_PC_CS_COST			= 0x2000a, /* allocation cost if available */
+	PCS_PC_CS_QOS			= 0x2000b, /* qos assigned for CS */
+	PCS_PC_CS_NET_ADDR		= 0x2000e, /* the CS connection source network address */
+	PCS_PC_CS_LOCATION		= 0x2000f, /* the CS location and host id */
+
+	PCS_PC_CS_ERR_STATUS		= 0x20010, /* the CS error status - if non-zero the CS is not currently used for chunks allocation */
+	PCS_PC_CS_LAST_ERR		= 0x20011, /* local error status, see PCS_MAP_ERR_* */
+	PCS_PC_CS_LAST_ERR_UPTIME	= 0x20012, /* time since last local error (ms) */
+	PCS_PC_CS_LAST_LINK_ERR		= 0x20013, /* link error status, see PCS_MAP_ERR_* */
+	PCS_PC_CS_LAST_LINK_ERR_UPTIME	= 0x20014, /* time since last link error (ms) */
+
+	PCS_PC_CS_TOTAL_SPACE		= 0x20051, /* total space on CS */
+	PCS_PC_CS_FREE_SPACE		= 0x20052, /* free space on CS */
+	PCS_PC_CS_AVAIL_SPACE		= 0x20053, /* the amount of space available for chunk allocation on CS */
+
+	/* CS chunks info, see PCS_PC_CL_CHUNKS_* */
+	PCS_PC_CS_CHUNKS_VOID		= 0x20071,
+	PCS_PC_CS_CHUNKS_BLOCKED	= 0x20072,
+	PCS_PC_CS_CHUNKS_URGENT		= 0x20073,
+	PCS_PC_CS_CHUNKS_DEGRADED	= 0x20074,
+	PCS_PC_CS_CHUNKS_HEALTHY	= 0x20075,
+	PCS_PC_CS_CHUNKS_OVERCOMMITTED	= 0x20076,
+	PCS_PC_CS_CHUNKS_REPLICATING	= 0x20077,
+	PCS_PC_CS_CHUNKS_OFFLINE	= 0x20078,
+	PCS_PC_CS_REPL_DELETING		= 0x20079,
+	PCS_PC_CS_CHUNKS_UNIQUE		= 0x2007f,
+
+	/* CS ops info, see PCS_PC_CL_* */
+	PCS_PC_CS_READS			= 0x20101,
+	PCS_PC_CS_WRITES		= 0x20102,
+	PCS_PC_CS_REPL_READS		= 0x20103,
+	PCS_PC_CS_REPL_WRITES		= 0x20104,
+	PCS_PC_CS_IO_WAIT		= 0x20105,
+	PCS_PC_CS_READ_OPS		= 0x20106,
+	PCS_PC_CS_WRITE_OPS		= 0x20107,
+	PCS_PC_CS_MAPS			= 0x20108,
+	PCS_PC_CS_FSYNC			= 0x20109,
+	PCS_PC_CS_SYNC			= 0x2010a,
+	PCS_PC_CS_FEATURES		= 0x2010b,
+	PCS_PC_CS_CLIENT_STAT		= 0x2010c,
+	PCS_PC_CS_LATENCY		= 0x2010d,
+	PCS_PC_CS_LATENCY_MAX		= 0x2010e,
+	PCS_PC_CS_J_FULL		= 0x2010f,
+	PCS_PC_CS_IO_QUEUE		= 0x20110,
+	PCS_PC_CS_RMW_OPS		= 0x20111,
+	PCS_PC_CS_SYNC_WAIT		= 0x20112,
+	PCS_PC_CS_SYNC_LATENCY		= 0x20113,
+	PCS_PC_CS_SYNC_LATENCY_MAX	= 0x20114,
+	PCS_PC_CS_CRMW_OPS		= 0x20115,
+	PCS_PC_CS_SMART_FAMILY		= 0x20120,
+	PCS_PC_CS_SMART_DEVICE		= 0x20121,
+	PCS_PC_CS_SMART_SN		= 0x20122,
+	PCS_PC_CS_SMART_VENDOR_ATTR	= 0x20123,
+
+	/* clients related info */
+	PCS_PC_CLIENTS_LIST	     = 0x20200,
+
+	PCS_PC_CLIENT_ID	     = 0x20201,
+	PCS_PC_CLIENT_LEASES	     = 0x20202,
+	PCS_PC_CLIENT_ADDR	     = 0x20203,
+	PCS_PC_CLIENT_READS	     = 0x20204,
+	PCS_PC_CLIENT_WRITES	     = 0x20205,
+	PCS_PC_CLIENT_READ_OPS	     = 0x20206,
+	PCS_PC_CLIENT_WRITE_OPS	     = 0x20207,
+	PCS_PC_CLIENT_FSYNC	     = 0x20208,
+	PCS_PC_CLIENT_PERIOD	     = 0x20209,
+	PCS_PC_CLIENT_IOWAIT	     = 0x2020a,
+	PCS_PC_CLIENT_LATENCY_MAX    = 0x2020b,
+	PCS_PC_CLIENT_LATENCY	     = 0x2020c,
+	PCS_PC_CLIENT_HOST_INFO	     = 0x2020d,
+	PCS_PC_CLIENT_IO_QUEUE	     = 0x2020e,
+	PCS_PC_CLIENT_RMW_OPS	     = 0x2020f,
+
+	PCS_PC_LICENSE_KEYNUM	     = 0x20301,
+	PCS_PC_LICENSE_STATUS	     = 0x20302,
+	PCS_PC_LICENSE_CAPACITY	     = 0x20303,
+	PCS_PC_LICENSE_EXPIRATION    = 0x20304,
+
+	PCS_PC_SH_LEASE_INFO	     = 0x20401,
+	PCS_PC_EX_LEASE_INFO	     = 0x20402,
+
+	PCS_PC_NETSTAT_NODE_INFO     = 0x20501, /* struct pcs_netstat_node_info */
+
+	PCS_PC_DISK_INFO	     = 0x20601,
+	PCS_PC_DISK_INFO_SERVICE     = 0x20602,
+	PCS_PC_DISK_INFO_ID	     = 0x20603,
+	PCS_PC_DISK_INFO_LIST	     = 0x20604, /* struct pcs_mds_disk_info_msg */
+	PCS_PC_DISK_INFO_CNT	     = 0x20605,
+	PCS_PC_DISK_INFO_HOST	     = 0x20606,
+	PCS_PC_DISK_INFO_CAPACITY    = 0x20607,
+};
+
+/* Bits for PCS_PC_CS_FEATURES */
+enum {
+	PCS_CS_FEATURE_JOURNAL		= 1,
+	PCS_CS_FEATURE_CHECKSUM		= 2,
+	PCS_CS_JOURNAL_CLEAN		= 4,
+	PCS_CS_USE_DIRECT_IO		= 8,
+	PCS_CS_FAILED_STORAGE		= 0x10,
+	PCS_CS_FAILED_CSUM		= 0x20,
+	PCS_CS_FAILED_JOURNAL		= 0x40,
+	PCS_CS_FAILED_JCSUM		= 0x80,
+	PCS_CS_FAILED_REPO		= 0x100,
+	PCS_CS_FAILED_TIMEOUT		= 0x200,
+};
+
+#define PCS_CS_FAILED_MASK ((u64)PCS_CS_FAILED_STORAGE|PCS_CS_FAILED_CSUM|PCS_CS_FAILED_JOURNAL| \
+		PCS_CS_FAILED_JCSUM|PCS_CS_FAILED_REPO|PCS_CS_FAILED_TIMEOUT)
+
+/* The user-friendly cluster status */
+typedef enum {
+	PCS_CL_STATUS_UNKNOWN,	/* Not enough information yet. MDS is either not the master or became master only recently */
+	PCS_CL_STATUS_HEALTHY,	/* No inactive CSs */
+	PCS_CL_STATUS_DEGRADED,	/* Some CSs are inactive */
+	PCS_CL_STATUS_FAILURE,	/* Too many inactive CSs. Automatic replication is disabled. */
+} pcs_cluster_status_t;
+
+/* The CS activity status */
+typedef enum {
+	PCS_CS_ACT_ACTIVE,	/* CS is sending pings. */
+	PCS_CS_ACT_INACTIVE,	/* Not sending pings for some time. Replication has not started yet. */
+	PCS_CS_ACT_OFFLINE,	/* Not sending pings for quite some time, chunks are being replicated. */
+	PCS_CS_ACT_DROPPED,	/* Dropped by administrator. Such a CS is banned forever, so its activity status doesn't matter anymore. */
+	PCS_CS_ACT_STATES_
+} pcs_cs_activity_status_t;
+
+struct pcs_mds_monitor_resp_msg
+{
+	struct pcs_mds_hdr	hdr;
+	struct pcs_perf_counter	counters[0];
+} __attribute__((aligned(8)));
+
+/* The perf counter types structures */
+
+struct pcs_pc_lease_info { /* PCS_PC_XX_LEASE_INFO */
+	PCS_NODE_ID_T	clnt_id;
+	u32		age_sec;	/* How long it has existed */
+	s32		valid_sec;	/* How long it will be valid (negative if expired) */
+	PCS_NET_ADDR_T	clnt_addr;
+} __attribute__((aligned(8)));
+
+struct pcs_mds_host_info { /* PCS_PC_MDS_HOST_VER_INFO */
+	u32			version;
+	u32			mds_id;
+	struct pcs_host_info	host;
+} __attribute__((aligned(8)));
+
+struct pcs_smart_vendor_attr { /* PCS_PC_CS_SMART_VENDOR_ATTR */
+	u32 id;
+	u32 flag;
+	u32 value;
+	u32 worst;
+	u32 thresh;
+	u64 reserved;
+	u64 raw_value;
+} __attribute__((aligned(8)));
+
+/* Request key values */
+enum {
+	PCS_PC_GET_INFO = 0,	/* General server info */
+	PCS_PC_GET_CS_LIST,	/* The list of known CSs */
+	PCS_PC_GET_CS_INFO,	/* The particular CS info (CS ID as index) */
+	PCS_PC_GET_CLNT_LIST,	/* The list of the client ID/IP/leases */
+	PCS_PC_GET_CLNT_TOP,	/* Not yet implemented */
+	PCS_PC_GET_CLNT_INFO,	/* The particular client info (ID as index) */
+	PCS_PC_GET_FILE_LEASES, /* The particular file lease owners ID/IP/lease type/age as the array of PCS_PC_LEASE_INFO */
+	PCS_PC_GET_NETSTAT,	/* Get array of PCS_PC_NETSTAT_NODE_INFO */
+	PCS_PC_GET_STOR_STAT,	/* Get array of struct pcs_perf_stor_stat entries given the directory ID as index */
+	PCS_PC_GET_MDS_INFO = 0x10, /* Get cluster MDSs host info as the array of PCS_PC_MDS_HOST_VER_INFO accompanied by PCS_PC_MDS_NODES entry */
+};
+
+struct pcs_mds_monitor_req_msg
+{
+	struct pcs_mds_hdr	hdr;
+	u32	_reserved;
+	u32	key;
+	u64	index;
+} __attribute__((aligned(8)));
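+
+/* The key field selects one of the PCS_PC_GET_* queries defined above; for the
+ * per-object queries the index field passes the object ID (e.g. the CS or client
+ * ID, see the comments on the request keys).
+ */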
+
+/* ---- file map query request/response
+ * Returns the mapping of the file chunks to chunk servers as well as some valuable information
+ * regarding data integrity and chunk placement.
+ *
+ * The message type is pcs_mds_file_map_info_msg (same for request and response).
+ */
+
+#define PCS_MDS_FILE_MAP_INFO_REQ		(PCS_RPC_MDS_CLIENT_BASE + 0x18)
+#define PCS_MDS_FILE_MAP_INFO_RESP		(PCS_MDS_FILE_MAP_INFO_REQ | PCS_RPC_DIRECTION)
+
+/* Chunk flags */
+enum {
+	PCS_CH_FL_DEGRADED	= 1,		/* The number of online replicas is less than normal */
+	PCS_CH_FL_BLOCKED	= 2,		/* Not enough online replicas, writing is blocked */
+	PCS_CH_FL_OFFLINE	= 4,		/* No online replicas, any access is impossible */
+	PCS_CH_FL_OVERCOMMITTED	= 0x10,		/* Too many replicas, trimming is required */
+	PCS_CH_FL_REPLICATING	= 0x100,	/* Replication is in progress (to the last replica) */
+	PCS_CH_FL_ERROR		= 0x400,	/* Chunk has error flag on it */
+	PCS_CH_FL_HARD_ERROR	= 0x800,	/* Some replicas have hard (unrecoverable) error flag */
+	PCS_CH_FL_NOT_REGISTERED= 0x1000,	/* Some CS are not registered (so their location info is not available) */
+	PCS_CH_FL_XINFO		= 0x4000,	/* The struct pcs_mds_chunk_info is followed by pcs_mds_chunk_xinfo extended info */
+	PCS_CH_FL_LOC_INFO	= 0x8000,	/* Extended format with per-replica location info */
+};
+
+struct pcs_mds_chunk_replica_loc_info {
+	PCS_NODE_ID_T		cs_id;
+	struct pcs_host_info	host;
+};
+
+struct pcs_mds_chunk_info
+{
+	u64		offset;		/* Chunk offset */
+	u32		flags;		/* Flags (PCS_CH_FL_XXX) */
+	u32		nreplicas;	/* The number of valid replicas */
+	union {
+		/* The array of replica info */
+		PCS_NODE_ID_T replicas[1];
+		struct pcs_mds_chunk_replica_loc_info replicas_loc[1];
+	};
+} __attribute__((aligned(8)));
+
+/* Extension for the above structure */
+struct pcs_mds_chunk_xinfo
+{
+	u32		size; /* Chunk size */
+	u32		reserved[3];
+} __attribute__((aligned(8)));
+
+/* Request flags */
+enum {
+	PCS_MDS_FILE_MAP_FL_SKIP       = 1,	/* Skip chunk at last_offset (input). Used to restart query after incomplete response.
+						 * If not set, last_offset is ignored on input. */
+	PCS_MDS_FILE_MAP_FL_OMIT_CHUNKS= 0x1000,/* Omit chunk data on output (input). Other fields will be valid though. */
+	PCS_MDS_FILE_MAP_FL_EOF	       = 0x8000,/* No more chunks in the file (output) - if not set the response is incomplete. */
+	PCS_MDS_FILE_MAP_FL_XINFO      = 0x80000,/* Retrieve extended chunk info if available */
+	PCS_MDS_FILE_MAP_FL_LOC_INFO   = 0x100000,/* Retrieve extended location info (see struct pcs_mds_chunk_replica_loc_info) */
+};
+
+/* The maximum locality value corresponding to the same host placement */
+#define PCS_HOST_LOCALITY (PCS_LOCATION_PATH_LEN+1)
+
+struct pcs_mds_file_map_info_msg
+{
+	struct pcs_mds_hdr		hdr;
+	PCS_FILE_ID_T			file_id;	/* File id on input */
+	PCS_NODE_ID_T			home_id;	/* The ID of the 'home' node */
+	u64				total_chunks;	/* The total number of chunks */
+	u64				last_offset;	/* Last chunk offset - valid on output */
+	u32				req_flags;	/* The request flags (PCS_MDS_FILE_MAP_FL_XXX) */
+	u16				chunk_flags;	/* The OR-ed bitmap of chunk flags (PCS_CH_FL_XXX) */
+	u8				qos;		/* Tier */
+	u8				placement;	/* The placement policy */
+	u64				reserved[10];	/* Currently not used */
+	u64				per_qos_repl[PCS_NQOS];	/* Replicas per tier */
+	u8				repl_norm;	/* Replication factor */
+	u8				repl_min;	/* The minimum number of replicas allowed */
+	u8				repl_min_actual;/* Actual minimum number of uptodate replicas */
+	u8				repl_max_actual;/* Actual maximum number of uptodate replicas */
+	u32				nchunks;	/* The number of chunks that follows */
+	struct pcs_mds_chunk_info	chunks[0];	/* Chunk info array */
+} __attribute__((aligned(8)));
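+
+/* Note that each pcs_mds_chunk_info entry is variable sized: it carries nreplicas
+ * elements of either replicas[] or replicas_loc[] (the latter when
+ * PCS_CH_FL_LOC_INFO is set), presumably followed by struct pcs_mds_chunk_xinfo
+ * when PCS_CH_FL_XINFO is set.
+ */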
+
+#define PCS_MDS_NETSTAT_REPORT		(PCS_RPC_MDS_CLIENT_BASE + 0x1C)
+
+/* Network stat for the particular link */
+struct pcs_connstat_rec
+{
+	PCS_NODE_ID_T	id;
+	u32		retrans;
+	/* The following values are in microseconds */
+	u32		lat_min;
+	u32		lat_max;
+	u32		lat_cnt;
+	u64		lat_avg;
+} __attribute__((aligned(8)));
+
+/* Network stat averaged over all in/out links at the particular network node */
+struct pcs_netstat_node_info
+{
+	PCS_NODE_ID_T	id;
+	u32		retrans;
+	/* The following values are in microseconds, ~0U means no data available */
+	u32		lat_avg;  /* average over all links */
+	u32		lat_mmax; /* median of per link maximums */
+	u32		lat_max;  /* top maximum over all links */
+} __attribute__((aligned(8)));
+
+struct pcs_mds_netstat_req
+{
+	struct pcs_mds_hdr		hdr;
+	u32				count;
+	u32				reserved;
+	u64				reserved2[2];
+	struct pcs_connstat_rec		data[0];
+} __attribute__((aligned(8)));
+
+/*
+ * Punch hole request - drop chunks in the given range. If the range size
+ * is zero, it drops the single chunk starting at the given offset or returns an error
+ * if no such chunk exists. Currently this is the only supported scenario.
+ */
+
+#define PCS_MDS_PUNCH_HOLE_REQ	(PCS_RPC_MDS_CLIENT_BASE + 0x24)
+#define PCS_MDS_PUNCH_HOLE_RESP	(PCS_MDS_PUNCH_HOLE_REQ | PCS_RPC_DIRECTION)
+
+struct pcs_mds_punch_hole_msg
+{
+	struct pcs_mds_hdr	hdr;
+	PCS_FILE_ID_T		fileid; /* File ID */
+	u64			offset; /* Start offset */
+	u64			size;	/* The hole size (may be zero - see comment above) */
+	u64			reserved[3];
+} __attribute__((aligned(8)));
+
+#define PCS_MDS_DATA_OBJ_REQ  (PCS_RPC_MDS_CLIENT_BASE + 0x30)
+#define PCS_MDS_DATA_OBJ_RESP (PCS_MDS_DATA_OBJ_REQ | PCS_RPC_DIRECTION)
+
+/*
+ * Data objects are uniquely identified by (key, type) pair.
+ */
+
+#define PCS_MDS_DATA_OBJ_MAX_SIZE 0x20000
+
+enum {
+	PCS_DOP_SET = 1,
+	PCS_DOP_GET = 2,
+	// delete is currently not supported for safety
+};
+
+struct pcs_mds_data_obj_msg
+{
+	struct pcs_mds_hdr	hdr;
+	u32			op;
+	u32			flags;
+	u64			reserved[4];
+	u64			key;
+	u64			attr;
+	u32			type;
+	u32			size;
+	// Object data follows
+} __attribute__((aligned(8)));
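+
+/* Layout note (assuming the usual variable-size message convention): the object
+ * payload of 'size' bytes (at most PCS_MDS_DATA_OBJ_MAX_SIZE) immediately follows
+ * the fixed header, so a PCS_DOP_SET request occupies
+ * sizeof(struct pcs_mds_data_obj_msg) + size bytes in total.
+ */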
+
+/*
+ * Administration API.
+ */
+
+#define PCS_RPC_MDS_ADMIN_BASE	(PCS_RPC_MDS_CLIENT_BASE + 0x80)
+
+/* ---- add mds node
+ * Add new mds node. The message type is pcs_mds_node_add_msg (same for request and response).
+ */
+
+#define PCS_MDS_NODE_ADD_REQ	(PCS_RPC_MDS_ADMIN_BASE + 2)
+#define PCS_MDS_NODE_ADD_RESP	(PCS_MDS_NODE_ADD_REQ | PCS_RPC_DIRECTION)
+
+struct pcs_mds_node_add_msg
+{
+	struct pcs_mds_hdr	hdr;
+	PCS_NODE_ID_T		id;
+	PCS_NET_ADDR_T		addr;
+
+} __attribute__((aligned(8)));
+
+/* ---- remove mds node
+ * Remove existing mds node. The message type is pcs_mds_node_rm_msg (same for request and response).
+ */
+
+#define PCS_MDS_NODE_RM_REQ	(PCS_RPC_MDS_ADMIN_BASE + 4)
+#define PCS_MDS_NODE_RM_RESP	(PCS_MDS_NODE_RM_REQ | PCS_RPC_DIRECTION)
+
+struct pcs_mds_node_rm_msg
+{
+	struct pcs_mds_hdr	hdr;
+	PCS_NODE_ID_T		id;
+
+} __attribute__((aligned(8)));
+
+/* ---- remove cs node
+ * Adding a new (empty) CS node does not require any special commands. It will be added upon registration.
+ * Removing a CS node with some chunks allocated is a more complex process. First the node may be marked
+ * as releasing to initiate migration of its chunks to other nodes. After that the node may be ultimately dropped.
+ * A node being released may still contain valid data. It may go back to the normal state if the administrator decides
+ * to cancel the release. In contrast, dropping a node drops all its chunks immediately so that they will never be accessed again.
+ * Dropping the CS node is irreversible.
+ *
+ * The node control operations return just pcs_mds_hdr on success.
+ */
+
+#define PCS_MDS_CS_SET_STATUS_REQ	(PCS_RPC_MDS_ADMIN_BASE + 6)
+#define PCS_MDS_CS_SET_STATUS_RESP	(PCS_MDS_CS_SET_STATUS_REQ | PCS_RPC_DIRECTION)
+
+struct pcs_mds_cs_set_status_msg
+{
+	struct pcs_mds_hdr	hdr;
+	PCS_NODE_ID_T		id;
+	u32			status;
+	u32			flags;
+
+} __attribute__((aligned(8)));
+
+/* CS administration status */
+typedef enum {
+	PCS_CS_ADM_NORMAL = 0,
+	/* Further chunk allocation is suppressed; the node will be dropped as soon as all its chunks have replicas on other CSs.
+	 * This status is set manually by the administrator.
+	 */
+	PCS_CS_ADM_RELEASING,
+	/* The hard IO error was detected so this CS is no longer considered reliable. */
+	PCS_CS_ADM_FAILED,
+	/* Same as PCS_CS_ADM_RELEASING but CS is considered failed */
+	PCS_CS_ADM_FAILED_RELEASING,
+	/* The CS is no longer used, its ID is banned forever */
+	PCS_CS_ADM_DROPPED = 0x10,
+} pcs_cs_adm_status_t;
+
+/* Flags */
+enum {
+	/* Force setting the particular status. Normally MDS does not allow setting the dropped
+	 * status if it leads to unrecoverable data loss. The following flag overrides
+	 * this limitation.
+	 */
+	PCS_CS_ADM_FORCE = 1,
+};
+
+/* ---- client control
+ * The request type is pcs_mds_clnt_ctl_msg. The response type is struct pcs_mds_hdr on success.
+ */
+
+#define PCS_MDS_CLNT_CTL_REQ	(PCS_RPC_MDS_ADMIN_BASE + 0x10)
+#define PCS_MDS_CLNT_CTL_RESP	(PCS_MDS_CLNT_CTL_REQ | PCS_RPC_DIRECTION)
+
+/* Operation bits */
+enum {
+	PCS_MDS_CLNT_REVOKE_LEASES = 1,
+	PCS_MDS_CLNT_FINIT_LEASES  = 2,
+	PCS_MDS_CLNT_BAN	   = 0x10000,
+};
+
+struct pcs_mds_clnt_ctl_msg
+{
+	struct pcs_mds_hdr	hdr;
+	PCS_NODE_ID_T		clnt_id;
+	u32			op;
+	u32			reserved;
+	PCS_FILETIME_T		modify_ts;
+};
+
+/*
+ * Configuration interface.
+ * The configuration data is replicated among all MDS servers. Some data may belong to CS servers; they may query it via
+ * the public API described below.
+ */
+
+/* The message containing the array of configuration items */
+struct pcs_mds_cfg_msg {
+	struct pcs_mds_hdr	hdr;
+	/* The configuration sequence number. Always valid on output. If set to PCS_CONFIG_SEQ_ANY
+	 * the configuration will be updated regardless of the current version. Otherwise the operation
+	 * will fail with PCS_ERR_CFG_VERSION if the current version differs from the one provided by the client.
+	 */
+	PCS_CONFIG_SEQ_T	version;
+	unsigned		nitems;
+	struct pcs_cfg_item	items[1];
+} __attribute__((aligned(8)));
+
+/* ---- Get configuration request
+ * Get configuration data set matching the specified classes bitmap. The request type is struct pcs_mds_cfg_get_msg.
+ * The response type is struct pcs_mds_cfg_msg. On failure the pcs_rpc_error_resp will be returned.
+ */
+
+#define PCS_MDS_CFG_GET_REQ	(PCS_RPC_MDS_ADMIN_BASE + 0x20)
+#define PCS_MDS_CFG_GET_RESP	(PCS_MDS_CFG_GET_REQ | PCS_RPC_DIRECTION)
+
+struct pcs_mds_cfg_get_msg {
+	struct pcs_mds_hdr	hdr;
+	/* The bitmap of the matching classes */
+	u16			classes;
+	u16			reserved[3];
+} __attribute__((aligned(8)));
+
+/* ---- Set configuration request
+ * Set configuration data set. The request type is struct pcs_mds_cfg_msg. The response type is struct pcs_mds_hdr on success.
+ * On failure the pcs_rpc_error_resp will be returned. The configuration will be updated in a single transaction so the data set will
+ * be either applied entirely or rejected as a whole.
+ */
+
+#define PCS_MDS_CFG_SET_REQ	(PCS_RPC_MDS_ADMIN_BASE + 0x22)
+#define PCS_MDS_CFG_SET_RESP	(PCS_MDS_CFG_SET_REQ | PCS_RPC_DIRECTION)
+
+/* ---- request new MDS ID ---- */
+#define PCS_MDS_GEN_ID_REQ	(PCS_RPC_MDS_ADMIN_BASE + 0x24)
+#define PCS_MDS_GEN_ID_RESP	(PCS_MDS_GEN_ID_REQ | PCS_RPC_DIRECTION)
+
+struct pcs_mds_gen_id_msg
+{
+	struct pcs_mds_hdr	hdr;
+	PCS_NODE_ID_T		id;
+} __attribute__((aligned(8)));
+
+
+
+#define PCS_MDS_DISK_INFO_REQ	(PCS_RPC_MDS_ADMIN_BASE + 0x88)
+#define PCS_MDS_DISK_INFO_RESP (PCS_MDS_DISK_INFO_REQ | PCS_RPC_DIRECTION)
+
+#define PCS_MDS_DISK_ID_LEN	64
+
+struct pcs_mds_disk_info_msg {
+	struct pcs_mds_hdr	hdr;
+	PCS_NODE_ID_T		host_id;
+	u8			disk_id[PCS_MDS_DISK_ID_LEN];
+	u32			cnt;
+	struct pcs_perf_counter info[0];
+} __attribute__((aligned(8)));
+
+/* ---- That's all for now */
+
+/* The function translates a byte offset in the file to the byte offset in actual storage.
+ * This mapping is the identity for the plain layout and non-trivial for the RAID0 layout.
+ */
+static inline u64 map_file_to_chunk(u64 pos, unsigned int chunk_size, unsigned int stripe_depth, unsigned int strip_width)
+{
+	unsigned int strip_off, chunk_idx;
+	u64 base, strip_idx, chunk_off;
+	u64 group_size;
+
+	if (stripe_depth == 1)
+		return pos;
+
+	group_size = (u64)chunk_size * stripe_depth;
+
+	base = (pos / group_size) * group_size;
+	pos -= base;
+
+	strip_off = pos % strip_width;
+	strip_idx = pos / strip_width;
+	chunk_idx = strip_idx % stripe_depth;
+	chunk_off = strip_idx / stripe_depth;
+
+	return base + (chunk_idx * (chunk_size / strip_width) + chunk_off) * strip_width + strip_off;
+}
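+
+/* For illustration, assuming chunk_size = 64K, stripe_depth = 4 and
+ * strip_width = 16K: the stripe group is 256K long and strips are laid out
+ * round-robin across the 4 chunks, so file offset 40K (the third 16K strip
+ * plus 8K) maps to 2 * 64K + 8K = 136K, i.e. the start of chunk #2 plus the
+ * in-strip offset.
+ */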
+
+#endif /* _PCS_MDS_PROT_H_ */
diff --git a/fs/fuse/kio/pcs/pcs_perfcounters.h b/fs/fuse/kio/pcs/pcs_perfcounters.h
new file mode 100644
index 000000000000..f902ce06d72d
--- /dev/null
+++ b/fs/fuse/kio/pcs/pcs_perfcounters.h
@@ -0,0 +1,7 @@
+#ifndef _PCS_PERFCOUNTERS_H_
+#define _PCS_PERFCOUNTERS_H_ 1
+
+/* TODO:!!! this is a stub for flow_detection */
+#include "pcs_perfcounters_stub.h"
+
+#endif /* _PCS_PERFCOUNTERS_H_ */
diff --git a/fs/fuse/kio/pcs/pcs_perfcounters_stub.h b/fs/fuse/kio/pcs/pcs_perfcounters_stub.h
new file mode 100644
index 000000000000..17dae73fcd08
--- /dev/null
+++ b/fs/fuse/kio/pcs/pcs_perfcounters_stub.h
@@ -0,0 +1,30 @@
+#ifndef _PCS_PERFCOUNTERS_STUB_H_
+#define _PCS_PERFCOUNTERS_STUB_H_ 1
+
+
+struct pcs_perf_stat_cnt {
+	u64	val_total;
+	u64	events;
+	u64	curr_max;
+	u64	events_last;
+	u64	avg;
+	u64	maximum;
+};
+
+/* Generic event rate counter */
+struct pcs_perf_rate_cnt {
+	/* Total number of events */
+	u64	total;
+	u64	last_total;
+	/* The number of events for the last 5 sec interval */
+	u64	rate;
+	/* The number of events per 5 sec averaged over 1, 5, 15 min and shifted by AV_SHIFT to the left */
+	u64	av1;
+	u64	av5;
+};
+
+
+static inline void pcs_perfcounter_stat_update(struct pcs_perf_stat_cnt *cnt, u64 val) __attribute__((unused));
+
+static inline void pcs_perfcounter_stat_update(struct pcs_perf_stat_cnt *cnt, u64 val) {}
+#endif //_PCS_PERFCOUNTERS_STUB_H_
diff --git a/fs/fuse/kio/pcs/pcs_prot_types.h b/fs/fuse/kio/pcs/pcs_prot_types.h
new file mode 100644
index 000000000000..d8852f6ffda5
--- /dev/null
+++ b/fs/fuse/kio/pcs/pcs_prot_types.h
@@ -0,0 +1,451 @@
+#ifndef _PCS_PROT_TYPES_H_
+#define _PCS_PROT_TYPES_H_ 1
+
+#include "pcs_types.h"
+/* #include "pcs_net_addr.h" */
+/* #include "cluster_id.h" */
+
+/*
+ * Base types definitions shared by all the components.
+ */
+
+/* Current version */
+#define PCS_VERSION 121
+
+#define PCS_VZ7_VERSION 100
+
+/* milliseconds since Jan 1970 */
+typedef u64 PCS_FILETIME_T;
+
+typedef u64 PCS_FILE_ID_T;
+
+#define PCS_NODE_TYPE_BITS	2
+#define PCS_NODE_TYPE_SHIFT	10
+#define PCS_NODE_TYPE_MASK	(((1ULL << PCS_NODE_TYPE_BITS) - 1) << PCS_NODE_TYPE_SHIFT)
+#define PCS_NODE_ID_MASK	(~PCS_NODE_TYPE_MASK)
+
+typedef struct __pre_aligned(8) _PCS_CHUNK_ID_T {
+	PCS_FILE_ID_T	fileid;
+	u64		offset;
+} PCS_CHUNK_ID_T __aligned(8);
+
+typedef struct __pre_aligned(8) _PCS_XID_T {
+	PCS_NODE_ID_T	origin;
+	u64		val;
+} PCS_XID_T __aligned(8);
+
+/* Optional location of the machine. For now it is assumed that network topology
+ * and power supply topology are congruent. Default is all 0s.
+ */
+#define PCS_LOCATION_PATH_LEN 3
+
+struct __pre_aligned(8) pcs_location
+{
+	union {
+		struct {
+		u16	site;
+		u16	room;
+		u16	cabinet;
+		u16	reserved;
+		};
+		u16	path[PCS_LOCATION_PATH_LEN];
+	};
+} __aligned(8);
+
+struct __pre_aligned(8) pcs_host_info {
+	PCS_NODE_ID_T		host_id;
+	struct pcs_location	location;
+} __aligned(8);
+
+#define PCS_HOST_INFO_EQ(a, b) (!memcmp(&(a), &(b), offsetof(struct pcs_host_info, location.path[PCS_LOCATION_PATH_LEN])))
+#define PCS_TOPO_PATH_FMT     "%u.%u.%u"
+#define PCS_HOST_ID_FMT	      "%016llx"
+#define PCS_HOST_INFO_FMT     PCS_TOPO_PATH_FMT "." PCS_HOST_ID_FMT
+#define PCS_TOPO_PATH_ARGS(p) (p)[0], (p)[1], (p)[2]
+#define PCS_HOST_INFO_ARGS(h) PCS_TOPO_PATH_ARGS((h).location.path), (unsigned long long)(h).host_id.val
+
+typedef u32 PCS_MASTER_GENID_T;
+typedef u32 PCS_CLUSTER_GENID_T;
+typedef u32 PCS_FILE_GENID_T;
+typedef u32 PCS_LOST_LEASE_GENID_T;
+typedef u64 PCS_CHUNK_GENID_T;
+typedef u64 PCS_CHUNK_UID_T;
+typedef u64 PCS_LEASE_GEN_T;
+typedef u32 PCS_POLICY_GEN_T;
+
+/*
+ * File attributes
+ */
+
+struct __pre_aligned(8) pcs_mds_fattr
+{
+	PCS_FILE_ID_T		id;	      /* internal ID */
+	u32			attrib;	      /* attribute flags */
+	u32			reserved;     /* reserved for future use */
+	union {
+	struct {
+		u64		size;	      /* the logical file size */
+		u64		phy_size;     /* physical size */
+	};
+	struct {
+		PCS_FILE_ID_T	src_id;	      /* ID of the source - used as some API operation parameter only */
+		PCS_FILETIME_T	create_ts;    /* file create timestamp (on create input only) */
+	};
+	};
+	PCS_NODE_ID_T		create_cid;   /* file create client ID */
+	PCS_FILETIME_T		modify_ts;    /* last file modification timestamp */
+	PCS_LEASE_GEN_T		xlease_gen;   /* lease generation updated on every exclusive lease release */
+	struct pcs_host_info	last_host;    /* last requested lease client host info */
+};
+
+struct __pre_aligned(8) pcs_mds_sys_info {
+	u32	map_type;     /* reserved for RAID */
+	u32	chunk_size;   /* global constant */
+	u8	stripe_depth; /* for RAID6/RS  */
+	u8	redundancy;   /* number of checksums for RAID6/RS */
+	u8	tolerance;    /* write-tolerance (how many lost replicas we can tolerate while still allowing writes) */
+	u8	reserved8;
+	u32	strip_width;  /* length of strip for RAID6/RS */
+	u32	lease_tout;   /* lease expiration timeout (in milliseconds) */
+	u32	reserved;
+} __aligned(8);
+
+#define PCS_CHUNK_SIZE_MIN 4096u
+#define PCS_CHUNK_SIZE_MAX 2147483648u
+#define PCS_STRIPE_DEPTH_MAX 64
+#define PCS_REDUNDANCY_MAX 5
+#define PCS_RAID6_REDUNDANCY 2
+
+
+__pre_packed struct pcs_mds_repl_policy {
+	u8	placement;	/* The placement policy. The value 0 corresponds to the maximum physical diversity. Increasing this
+				 * number increases placement locality, reducing transport latency (see comment on PCS_PLACEMENT_POLICY_CNT).
+				 */
+	u8	qos;		/* The default QoS */
+	u8	create_type;	/* Map type for new file. Valid as parameter for PCS_MDS_FILE_REQ only if the
+				 * PCS_FFL_CREATE_IN_CONTAINER flag is set.
+				 */
+	u8	reserved[3];
+} __packed;
+
+struct __pre_aligned(8) pcs_mds_repl_info {
+	u8	norm;		/* The number of replicas to maintain */
+	u8	limit;		/* The minimum number of replicas required to write file */
+	struct pcs_mds_repl_policy policy; /* Replicas allocation policy */
+} __aligned(8);
+
+/* The location defines path to the host so we have 2 more entries in the full path - host itself and the CS node */
+#define PCS_TOPO_MAX_PATH (PCS_LOCATION_PATH_LEN+2)
+
+/* The number of placement policies. Policy 0 forces the topmost component of the path to be different for different chunks.
+ * The policy equal to PCS_LOCATION_PATH_LEN forces placing replicas on different hosts. The policy equal to PCS_LOCATION_PATH_LEN+1
+ * allows placing replicas on the same host. Higher values are meaningless since replicas can't be allocated on the same CS more than once.
+ */
+#define PCS_PLACEMENT_POLICY_CNT PCS_TOPO_MAX_PATH
+
+/* The maximum allowed number of replicas */
+#define PCS_REPL_MAX 64
+
+/* The number of QoS levels supported */
+#define PCS_NQOS 4
+
+/* Replication info validation macro */
+#define PCS_PLACEMENT_VALID(pl) ((pl) < PCS_PLACEMENT_POLICY_CNT)
+#define PCS_QOS_VALID(q)	((q) < PCS_NQOS)
+#define PCS_POLICY_VALID(p)	(PCS_PLACEMENT_VALID((p).placement) && PCS_QOS_VALID((p).qos))
+#define PCS_REPLICAS_VALID_(r)	((r).limit <= (r).norm && (r).norm <= PCS_REPL_MAX)
+#define PCS_REPLICAS_VALID(r)	(PCS_REPLICAS_VALID_(r) && (r).limit > 0)
+#define PCS_REPL_VALID(r)	(PCS_REPLICAS_VALID(r) && PCS_POLICY_VALID((r).policy))
+
+struct __pre_aligned(8) pcs_mds_fileinfo
+{
+	struct pcs_mds_fattr		attr;  /* attributes */
+	struct pcs_mds_sys_info		sys;   /* system info */
+	struct pcs_mds_repl_info	repl;  /* replication info */
+} __aligned(8);
+
+/*
+ * Version numbers
+ */
+
+/* The version number corresponding to the deleted file */
+#define PCS_FILE_GEN_DELETED 0
+
+static inline int pcs_compare_master_ver(PCS_MASTER_GENID_T v1, PCS_MASTER_GENID_T v2)
+{
+	return (int)(v1 - v2);
+}
+
+typedef struct __pre_aligned(8) _PCS_MAP_VERSION_T {
+	/* Master generation is incremented every time the master MDS changes,
+	 * invalidating all maps issued by the previous master
+	 */
+	PCS_MASTER_GENID_T	master;
+	/* Cluster generation is incremented every time one of the CS servers is dropped.
+	 */
+	PCS_CLUSTER_GENID_T	cluster;
+	/* The file generation is incremented every time the file size changes.
+	 */
+	PCS_FILE_GENID_T	file;
+	/* The lost lease generation is incremented every time an exclusive lease expires and is revoked, to
+	 * invalidate all maps issued to the previous client.
+	 */
+	PCS_LOST_LEASE_GENID_T	lost_lease;
+	/* The chunk generation is incremented every time the chunk replica set changes, to invalidate all maps
+	 * referencing the old replica set.
+	 */
+	PCS_CHUNK_GENID_T	chunk;
+} PCS_MAP_VERSION_T;
+
+static inline void map_version_init(PCS_MAP_VERSION_T * v)
+{
+	memset(v, 0, sizeof(*v));
+}
+
+/* Returns negative value if v1 is older than v2, positive if v1 is newer than v2, 0 if they are equal */
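+/* Note that a deleted file (file == PCS_FILE_GEN_DELETED) compares as newer than any live file generation. */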
+static inline int map_version_compare(PCS_MAP_VERSION_T const* v1, PCS_MAP_VERSION_T const* v2)
+{
+	int d;
+
+	if ((d = v1->master - v2->master))
+		return d;
+
+	if ((d = v1->cluster - v2->cluster))
+		return d;
+
+	if (v1->file == PCS_FILE_GEN_DELETED) {
+		if (v2->file != PCS_FILE_GEN_DELETED)
+			return 1;
+	} else {
+		if (v2->file == PCS_FILE_GEN_DELETED)
+			return -1;
+	}
+
+	if ((d = v1->file - v2->file))
+		return d;
+
+	if ((d = v1->lost_lease - v2->lost_lease))
+		return d;
+
+	return (int)(v1->chunk - v2->chunk);
+}
+
+static inline int map_version_equal(PCS_MAP_VERSION_T * v1, PCS_MAP_VERSION_T *v2)
+{
+	return 0 == map_version_compare(v1, v2);
+}
+
+/* Other version numbers */
+typedef u32 PCS_INTEGRITY_SEQ_T;
+typedef u32 PCS_SYNC_SEQ_T;
+
+static inline int pcs_sync_seq_compare(PCS_SYNC_SEQ_T seq1, PCS_SYNC_SEQ_T seq2)
+{
+	return (int)(seq1 - seq2);
+}
+
+
+//// TODO: dmonakhov perf counters temporarily disabled
+/*
+ * Performance counter.
+ */
+
+struct __pre_aligned(8) pcs_perf_counter
+{
+	u16	len;
+	u16	_reserved;
+	u32	key;
+	u64	value[0];
+} __aligned(8);
+
+#include "pcs_perfcounters.h"
+
+#define PCS_PERF_CNT_NEXT(p) ((struct pcs_perf_counter*)((char*)(p) + (p)->len))
+
+/* Core perf counters ID */
+enum {
+	PCS_PC_RPC_MSG_COUNT	= 0x10001, /* number of currently processed RPC messages */
+	PCS_PC_RPC_CONNS	= 0x10002, /* number of RPC connections */
+};
+
+/*
+ * Configuration interface.
+ */
+
+typedef u16 pcs_cfg_type_t;
+typedef u16 pcs_cfg_cls_t;
+
+struct __pre_aligned(8) pcs_cfg_data {
+	pcs_cfg_type_t	type;
+	pcs_cfg_cls_t	cls;
+	u32		size;
+	union {
+		s64	slong;
+		u64	ulong;
+		char	string[1];
+	};
+} __aligned(8);
+
+/* Configuration classes */
+enum {
+	PCS_CFG_GENERIC = 1,
+	PCS_CFG_MDS	= 2,
+	PCS_CFG_CS	= 4,
+	PCS_CFG_CLIENT	= 8,
+	PCS_CFG_INT	= 0x1000,
+};
+
+/* Item type */
+enum {
+	PCS_DATA_NONE  = 0,	/* Used to delete the item regardless of its type */
+	PCS_DATA_SLONG = 1,	/* Signed 64 bit value */
+	PCS_DATA_ULONG,		/* Unsigned 64 bit value */
+	PCS_DATA_STRING = 0x10
+};
+
+/* The size of the data item. String data will include the terminating 0 */
+#define PCS_CFG_DATA_SZ(d) (offsetof(struct pcs_cfg_data, string)+(d).size+((d).type==PCS_DATA_STRING))
+
+struct __pre_aligned(8) pcs_cfg_item {
+	unsigned		name_len;
+	unsigned		pad;
+	union {
+	struct pcs_cfg_data	data;
+	char			buff[1];
+	};
+} __aligned(8);
+
+/* The name offset in the name buffer. Equals to the size of the configuration data. */
+#define PCS_CFG_NAME_OFF(i) PCS_CFG_DATA_SZ((i).data)
+/* The total size of the data item */
+#define PCS_CFG_ITEM_SZ(i)  PCS_ALIGN(offsetof(struct pcs_cfg_item, buff)+PCS_CFG_NAME_OFF(i)+(i).name_len+1)
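+
+/* Illustrative sketch, assuming the items in struct pcs_mds_cfg_msg are packed
+ * back to back (not spelled out by the protocol headers); a consumer could walk
+ * them as follows, where handle_item() is a hypothetical callback:
+ *
+ *	unsigned i;
+ *	struct pcs_cfg_item *it = msg->items;
+ *
+ *	for (i = 0; i < msg->nitems; i++, it = (struct pcs_cfg_item *)((char *)it + PCS_CFG_ITEM_SZ(*it)))
+ *		handle_item(it);
+ */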
+
+/* Configuration sequence number incremented every time the configuration is being updated */
+typedef u32 PCS_CONFIG_SEQ_T;
+
+/* The following configuration sequence numbers have special meaning */
+#define PCS_CONFIG_SEQ_ANY ((PCS_CONFIG_SEQ_T)~0U)	/* Don't care on set */
+#define PCS_CONFIG_SEQ_INI 0				/* Initial (default) configuration */
+
+#define PCS_EVT_REC_SZ_ALIGN(msg_sz)  PCS_ALIGN(offsetof(struct pcs_evt_rec, msg[msg_sz]))
+#define PCS_EVT_REC_SZ_ALIGNED(descr) PCS_EVT_REC_SZ_ALIGN((descr).size)
+
+/* Generic path representation */
+struct __pre_aligned(8) pcs_path {
+	u32			sz;
+	char			str[1];
+} __aligned(8);
+
+/* The size of the pcs_path structure with 1 byte reserved for terminating 0 */
+#define PCS_PATH_SZ_(sz) (offsetof(struct pcs_path,str)+(sz)+1)
+#define PCS_PATH_SZ(path) PCS_PATH_SZ_((path).sz)
+
+/* Path alignment */
+#define PCS_PATH_SZ_ALIGN(sz)	PCS_ALIGN(PCS_PATH_SZ_(sz))
+#define PCS_PATH_SZ_ALIGNED(n)	PCS_PATH_SZ_ALIGN((n).sz)
+#define PCS_PATH_PAD_SZ(sz)	(PCS_PATH_SZ_ALIGN(sz)-offsetof(struct pcs_path,str)-(sz))
+
+static inline int cmp_path(struct pcs_path const* p_a, struct pcs_path const* p_b)
+{
+	unsigned _sz = p_a->sz < p_b->sz ? p_a->sz : p_b->sz;
+	int r = memcmp(p_a->str, p_b->str, _sz);
+	if (r) return r;
+	return (int)p_a->sz - (int)p_b->sz;
+}
+
+/* Generic constant string representation */
+struct pcs_cstr {
+	unsigned sz;
+	const char* str;
+};
+
+static inline int cmp_cstr(struct pcs_cstr const* s_a, struct pcs_cstr const* s_b)
+{
+	unsigned _sz = s_a->sz < s_b->sz ? s_a->sz : s_b->sz;
+	int r = memcmp(s_a->str, s_b->str, _sz);
+	if (r) return r;
+	return (int)s_a->sz - (int)s_b->sz;
+}
+
+/* File attribute bits */
+enum
+{
+	/* Attributes used internally by the system components */
+	PCS_FATTR_INTERNAL_ = 0xff,
+
+	/* The file has its physical size maintained */
+	PCS_FATTR_HAS_PSIZE_ = 0x10,
+
+	/* The file object represents the directory */
+	PCS_FATTR_DIR = 0x1000,
+
+	/* The file object represents a symbolic link */
+	PCS_FATTR_LINK = 0x2000,
+
+	/* The directory is the container for combined storage (set with PCS_FATTR_DIR only).
+	 * It has several important properties:
+	 *  - only files are allowed as child objects
+	 *  - child leases can't be created, the only lease must be acquired on the container
+	 *  - client may implement IO on the container on its own
+	 */
+	PCS_FATTR_CONTAINER = 0x10000,
+
+	/* Our file-inode abstraction is quite generic. A file may be attached to the inode tree at any level.
+	 * Inodes are created or deleted automatically while the files are managed by clients. A file may
+	 * have child objects, but there is no way to create an empty inode except for creating the special file object
+	 * with the PCS_FATTR_DIR bit set. Resizing of such an object as well as IO requests will fail with PCS_ERR_IS_DIR.
+	 *
+	 * The client may either not care about the directory tree or assume that all directories in the path must
+	 * be created prior to the file itself. In the latter case it should set the PCS_FFL_POSIX_PATH flag in the operation request.
+	 * If it is set:
+	 *     - an attempt to create or resolve a file with a directory object missing in the path will fail with PCS_ERR_NOT_FOUND
+	 *     - an attempt to delete or rename an object with child objects will fail with PCS_ERR_NON_EMPTY_DIR
+	 */
+
+	/*
+	   The file has inline data. MDS prohibits IO map queries for files with this flag set. The client instead directs
+	   read/write requests to MDS, getting/setting file-associated data (see PCS_FA_DATA). May be set on a directory only.
+	   Newly created files inherit it from the parent directory.
+	 */
+	PCS_FATTR_INLINE    = 0x1000000,
+	/*
+	   The file consists of variable-length chunks where only the last one is writable. May be set on the directory only.
+	   Newly created files inherit it from the parent directory.
+	*/
+	PCS_FATTR_LOGSTREAM = 0x2000000,
+
+	/* Don't cache content on the client */
+	PCS_FATTR_NO_CLNT_CACHE = 0x10000000,
+
+	/* The following attributes are being inherited from the parent directory */
+	PCS_FATTR_INHERITABLE_MASK = 0xff000000,
+};
+
+/*
+ * Formatters
+ */
+
+#define VER_FMT "%u:%u:%u:%u:%llu"
+#define VER_ARGS(v) (v).master, (v).cluster, (v).file, (v).lost_lease, (unsigned long long)(v).chunk
+
+#define XID_FMT "[%u.%llu:%llu]"
+#define XID_ARGS(x) (unsigned)(((x).origin.val & PCS_NODE_TYPE_MASK) >> PCS_NODE_TYPE_SHIFT), \
+		NODE_ARGS((x).origin), (unsigned long long)((x).val)
+
+#define CLUSTER_ID_FMT	"%08x%08x%08x%08x"
+#define CLUSTER_ID_ARGS(x)	(*((unsigned int*)&((x).uuid[12]))), \
+		*((unsigned int*)&((x).uuid[8])),	\
+		*((unsigned int*)&((x).uuid[4])),	\
+		*((unsigned int*)&((x).uuid[0]))
+
+#define NODE_FMT "%llu"
+#define NODE_ARGS(id) (unsigned long long)((id).val)
+
+#define PEER_FMT "%s#" NODE_FMT
+#define PEER_ARGS(r)  pcs_role_to_str((r)->peer_role), NODE_ARGS((r)->peer_id)
+
+#define CUID_FMT "O%08llx"
+#define CUID_ARGS(id) (unsigned long long)(id)
+
+
+#endif /* _PCS_PROT_TYPES_H_ */
diff --git a/fs/fuse/kio/pcs/pcs_req.c b/fs/fuse/kio/pcs/pcs_req.c
new file mode 100644
index 000000000000..117e050691d9
--- /dev/null
+++ b/fs/fuse/kio/pcs/pcs_req.c
@@ -0,0 +1,116 @@
+
+#include <net/sock.h>
+#include <linux/net.h>
+#include <linux/kthread.h>
+#include <linux/types.h>
+#include <linux/highmem.h>
+
+#include "pcs_types.h"
+#include "pcs_sock_io.h"
+#include "pcs_rpc.h"
+#include "pcs_req.h"
+#include "log.h"
+
+static void ireq_timer_handler(unsigned long arg)
+{
+
+	struct pcs_int_request *ireq = (struct pcs_int_request *)arg;
+	pcs_cc_submit(ireq->cc, ireq);
+}
+
+static void __ireq_init(struct pcs_dentry_info *di, struct pcs_cluster_core *cc,
+		 struct pcs_int_request *ireq)
+{
+	memset(ireq, 0, sizeof(*ireq));
+	ireq->cc = cc;
+	ireq->ts = ireq->create_ts = jiffies;
+	setup_timer(&ireq->timer, ireq_timer_handler, (unsigned long)ireq);
+	INIT_HLIST_HEAD(&ireq->completion_data.child_list);
+	spin_lock_init(&ireq->completion_data.child_lock);
+	INIT_LIST_HEAD(&ireq->list);
+	ireq->dentry = di;
+}
+
+void ireq_init(struct pcs_dentry_info *di, struct pcs_int_request *ireq)
+{
+	__ireq_init(di, di->cluster, ireq);
+}
+
+void ireq_init_by_cluster(struct pcs_cluster_core *cc, struct pcs_int_request *ireq)
+{
+	__ireq_init(NULL, cc, ireq);
+}
+
+struct pcs_int_request *ireq_alloc(struct pcs_dentry_info *di)
+{
+	struct pcs_int_request *ireq;
+	ireq = __ireq_alloc();
+	if (!ireq)
+		return NULL;
+
+	__ireq_init(di, di->cluster, ireq);
+	return ireq;
+}
+
+struct pcs_int_request *ireq_alloc_by_cluster(struct pcs_cluster_core *cc)
+{
+	struct pcs_int_request *ireq;
+	ireq = __ireq_alloc();
+	if (!ireq)
+		return NULL;
+
+	__ireq_init(NULL, cc, ireq);
+	return ireq;
+}
+
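+/* Re-arm the request timer before a retry: for PCS_ERR_NORES the delay doubles
+ * on every attempt up to PCS_ERROR_DELAY_MAX, while any other error restarts
+ * from the base PCS_ERROR_DELAY.
+ */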
+void ireq_delay(struct pcs_int_request *ireq)
+{
+	switch (ireq->error.value) {
+	case PCS_ERR_NORES:
+		if (!ireq->last_delay)
+			ireq->last_delay = PCS_ERROR_DELAY;
+		else if ((ireq->last_delay *= 2) > PCS_ERROR_DELAY_MAX)
+			ireq->last_delay = PCS_ERROR_DELAY_MAX;
+		break;
+	default:
+		ireq->last_delay = PCS_ERROR_DELAY;
+	}
+	mod_timer(&ireq->timer, jiffies + ireq->last_delay);
+}
+
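+/* A read that falls into a hole (an unallocated chunk) is completed locally by
+ * zero-filling the destination iov of the parent API request, without issuing
+ * any IO to a CS.
+ */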
+void ireq_handle_hole(struct pcs_int_request *ireq)
+{
+	unsigned int len;
+	unsigned int offset;
+	struct iov_iter it;
+	pcs_api_iorequest_t * ar = ireq->completion_data.parent->apireq.req;
+
+	BUG_ON(ireq->type != PCS_IREQ_IOCHUNK);
+	BUG_ON(ireq->iochunk.direction);
+
+	len = ireq->iochunk.size;
+	offset = 0;
+	iov_iter_init_bad(&it);
+
+	DTRACE("enter m: " MAP_FMT ", ireq:%p \n", MAP_ARGS(ireq->iochunk.map),	 ireq);
+
+	while (len > 0) {
+		void * map, *buf;
+		size_t copy;
+
+		if (!iov_iter_count(&it))
+			ar->get_iter(ar->datasource, ireq->iochunk.dio_offset + offset, &it);
+
+		map = iov_iter_kmap_atomic(&it, &buf, &copy);
+		if (copy > len)
+			copy = len;
+		memset(buf, 0, copy);
+		if (map)
+			kunmap_atomic(map);
+		len -= copy;
+		offset += copy;
+		iov_iter_advance(&it, copy);
+	}
+
+	ireq_complete(ireq);
+}
diff --git a/fs/fuse/kio/pcs/pcs_req.h b/fs/fuse/kio/pcs/pcs_req.h
new file mode 100644
index 000000000000..c8481a48413a
--- /dev/null
+++ b/fs/fuse/kio/pcs/pcs_req.h
@@ -0,0 +1,320 @@
+#ifndef _PCS_REQ_H_
+#define _PCS_REQ_H_ 1
+
+#include <linux/workqueue.h>
+#include "pcs_error.h"
+#include "pcs_sock_io.h"
+#include "pcs_map.h"
+#include "pcs_cs_prot.h"
+#include "pcs_rpc.h"
+#include "pcs_cs.h"
+
+///////////////////////////
+
+enum
+{
+	PCS_IREQ_API	= 0,	/* IO request from API */
+	PCS_IREQ_IOCHUNK= 1,	/* Internal IO request */
+	PCS_IREQ_LEASE	= 2,	/* Lease op request */
+	PCS_IREQ_FILE	= 3,	/* File op request */
+	PCS_IREQ_READDIR= 4,	/* Readdir request */
+	PCS_IREQ_NOOP	= 5,	/* NOOP request */
+	PCS_IREQ_FINI	= 6,	/* Stop pcs process */
+	PCS_IREQ_TRUNCATE=7,	/* Internal map truncate request */
+	PCS_IREQ_FLUSH	= 8,	/* Sync request */
+	PCS_IREQ_STATFS	= 9,	/* statfs request */
+	PCS_IREQ_LOOKUP	= 10,	/* lookup request */
+	PCS_IREQ_CSCONN = 11,	/* connect to CS and auth */
+	PCS_IREQ_CUSTOM = 16,	/* generic request */
+	PCS_IREQ_WRAID	= 17,	/* compound raid6 write request */
+	PCS_IREQ_RRAID	= 18,	/* compound raid6 read request */
+	PCS_IREQ_KAPI	= 65	/* IO request from kernel API */
+};
+
+/* Generic request, all internal messages are queued using this struct.
+ * Messages can be of various "type".
+ */
+
+struct pcs_int_request
+{
+	struct pcs_cluster_core* cc;
+
+	struct list_head	list;
+	struct pcs_dentry_info*	dentry;
+
+	unsigned int		type;
+	pcs_error_t		error;
+	int			flags;
+#define IREQ_F_FATAL		1
+#define IREQ_F_ONCE		2
+#define IREQ_F_SEQ_READ		4
+#define IREQ_F_RND_WEIGHT	8
+#define IREQ_F_CACHED		0x10
+#define IREQ_F_SEQ		0x20
+#define IREQ_F_MAPPED		0x40
+#define IREQ_F_MAP_REQUIRED	0x80
+#define IREQ_F_LOC_TOKEN	0x100
+#define IREQ_F_NOFLUSH		0x200
+#define IREQ_F_WB_SUSP		0x400
+#define IREQ_F_RECV_SPLICE	0x800
+
+	atomic_t		iocount;
+
+	int			qdepth;
+	abs_time_t		ts;
+	abs_time_t		ts_sent;
+	PCS_NODE_ID_T		wait_origin;
+
+	struct {
+		struct pcs_int_request *	parent;
+		void*				ctx;
+		void*				priv;
+		struct hlist_head		child_list;
+		struct hlist_node		child_node;
+		spinlock_t			child_lock;
+	} completion_data;
+
+	void (*complete_cb)(struct pcs_int_request *ireq);
+
+	abs_time_t		create_ts;
+
+	pcs_timer_t		timer;
+	unsigned		last_delay;
+
+	/* TODO: the work struct is only required for API requests.
+	   Probably should be embedded into apireq
+	*/
+	struct work_struct worker;
+
+	union {
+		struct {
+			struct pcs_map_entry	*map;
+			//// Temporarily disabled flow
+			 struct pcs_flow_node	*flow;
+			////struct pcs_splice_buf	*splice_rbuf;
+			u8			direction;
+			u8			role;
+			short			cs_index;
+			unsigned int		size;
+			unsigned int		dio_offset;
+			u64			chunk;
+			u64			offset;
+			struct pcs_cs_list	*csl;
+			PCS_NODE_ID_T		banned_cs;
+			struct pcs_msg		msg;
+			struct pcs_cs_iohdr	hbuf;		/* Buffer for header.
+								 * A little ugly
+								 */
+		} iochunk;
+
+		struct {
+			struct pcs_map_entry	*map;		/* map to flush */
+			struct pcs_cs_list	*csl;
+			struct pcs_msg		*msg;
+		} flushreq;
+
+		struct {
+			u64			offset;
+			int			phase;
+			PCS_MAP_VERSION_T	version;
+		} truncreq;
+
+		struct {
+			unsigned int		flags;
+			unsigned int		tout;
+			int			retries;
+		} leasereq;
+
+		struct {
+			unsigned int		op;
+			unsigned int		flags;
+			union {
+				struct pcs_dentry_info	*dst_de;	/* Only for rename */
+				off_t			new_size;	/* Only for resize */
+				const char		*data;		/* Only for symlink */
+			} arg;
+		} filereq;
+
+		struct {
+			pcs_api_csconnreq_t *req; /* Client request */
+			struct pcs_cluster_core	 *clu; /* dentry == NULL */
+			struct pcs_msg	    msg;
+			int		    out_fd;
+		} csconnreq;
+
+		struct {
+			void			(*action)(struct pcs_int_request *ireq);
+			void			(*destruct)(struct pcs_int_request *ireq);
+			void*			ctx;
+		} custom;
+
+		struct {
+			pcs_api_iorequest_t *	req;		/* Client request */
+			unsigned int		dio_offset;	/* MBZ */
+			void*			h;		/* API handle */
+		} apireq;
+
+	};
+};
+
+// FROM pcs_cluster_core.h
+
+struct pcs_clnt_config
+{
+	int		map_timeout;
+	int		abort_timeout;
+	int		kernel_cache_en;
+	int		wmss;
+	int		rmss;
+	int		lmss;
+	int		lic_status;
+	int		io_locality;
+	int		io_tweaks;
+	int		net_10gbit;
+	int		local_sndbuf;
+	int		tcp_sndbuf;
+	int		tcp_rcvbuf;
+};
+
+struct pcs_cluster_core
+{
+	struct list_head	work_queue;	/* Internal queue */
+	struct list_head	completion_queue;/* Internal queue for ireqs to complete */
+	struct work_struct	main_job;
+	struct work_struct	completion_job;
+
+	struct pcs_cs_set	css;		/* Table of all CSs */
+	struct pcs_map_set	maps;		/* Global map data */
+	struct pcs_rpc_engine	eng;		/* RPC engine */
+	struct workqueue_struct *wq;
+////	struct pcs_ratelimit	rlim;		/* Rate limiter */
+////	struct pcs_rng		rng;
+	/* <SKIP */
+
+	struct {
+		struct pcs_clnt_config	def;
+		struct pcs_clnt_config	curr;
+		PCS_CONFIG_SEQ_T	sn;
+		int			in_progress;
+	} cfg;
+
+	int			io_locality;
+	int			io_tweaks;
+	int			iolat_cutoff;
+	int			netlat_cutoff;
+	int			use_unix_socket;
+
+	/*
+	 * Our cluster core may be integrated into various implementations by customizing the following request processing methods.
+	 * The core does not provide any of them out of the box. Note that only the first one is mandatory.
+	 */
+	struct {
+		void (*ireq_process)   (struct pcs_int_request *);
+		void (*ireq_on_error)  (struct pcs_int_request *);
+		int  (*ireq_check_redo)(struct pcs_int_request *);
+	} op;
+
+	int (*abort_callback)(struct pcs_cluster_core *cc, struct pcs_int_request *ireq);
+	struct fuse_conn *fc;
+	spinlock_t		lock;
+};
+
+static inline struct pcs_cluster_core *cc_from_csset(struct pcs_cs_set * css)
+{
+	return container_of(css, struct pcs_cluster_core, css);
+}
+
+static inline struct pcs_cluster_core *cc_from_cs(struct pcs_cs * cs)
+{
+	return cc_from_csset(cs->css);
+}
+
+static inline struct pcs_cluster_core *cc_from_maps(struct pcs_map_set *maps)
+{
+	return container_of(maps, struct pcs_cluster_core, maps);
+}
+
+void pcs_cc_submit(struct pcs_cluster_core *cc, struct pcs_int_request* ireq);
+void pcs_cc_requeue(struct pcs_cluster_core *cc, struct list_head * q);
+////// FROM pcs_cluster.h
+static inline void pcs_sreq_attach(struct pcs_int_request * sreq, struct pcs_int_request * parent)
+{
+	sreq->completion_data.parent = parent;
+	sreq->ts = parent->ts;
+	spin_lock(&parent->completion_data.child_lock);
+	hlist_add_head(&sreq->completion_data.child_node, &parent->completion_data.child_list);
+	atomic_inc(&parent->iocount);
+	spin_unlock(&parent->completion_data.child_lock);
+}
+
+static inline int pcs_sreq_detach(struct pcs_int_request * sreq)
+{
+	struct pcs_int_request * parent = sreq->completion_data.parent;
+
+	BUG_ON(!parent);
+	BUG_ON(!atomic_read(&parent->iocount));
+
+	spin_lock(&parent->completion_data.child_lock);
+	hlist_del(&sreq->completion_data.child_node);
+	spin_unlock(&parent->completion_data.child_lock);
+
+	return !atomic_dec_and_test(&parent->iocount);
+}
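+
+/* Usage pattern implied by the iocount accounting above: a compound request
+ * attaches one sub-request per unit of work with pcs_sreq_attach(); each
+ * completed sub-request calls pcs_sreq_detach(), and the parent may be
+ * completed once pcs_sreq_detach() returns 0, i.e. its iocount dropped to zero.
+ */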
+
+
+static inline struct pcs_int_request *ireq_from_msg(struct pcs_msg *msg)
+{
+	return container_of(msg, struct pcs_int_request, iochunk.msg);
+}
+
+static inline void ireq_process(struct pcs_int_request *ireq)
+{
+	(ireq->cc->op.ireq_process)(ireq);
+}
+
+static inline void ireq_on_error(struct pcs_int_request *ireq)
+{
+	if (ireq->cc->op.ireq_on_error) (ireq->cc->op.ireq_on_error)(ireq);
+}
+
+static inline void ireq_complete(struct pcs_int_request *ireq)
+{
+	BUG_ON(!hlist_empty(&ireq->completion_data.child_list));
+
+	if (pcs_if_error(&ireq->error))
+		ireq_on_error(ireq);
+	ireq->complete_cb(ireq);
+}
+
+static inline int ireq_check_redo(struct pcs_int_request *ireq)
+{
+	if (ireq->flags & IREQ_F_FATAL)
+		return 0;
+	if (ireq->cc->op.ireq_check_redo)
+		return (ireq->cc->op.ireq_check_redo)(ireq);
+	return 1;
+}
+
+static inline int ireq_is_timed_out(struct pcs_int_request *ireq)
+{
+	int timed_out;
+	timed_out = ireq->cc->cfg.curr.abort_timeout &&
+		ireq->create_ts + ireq->cc->cfg.curr.abort_timeout < jiffies;
+	if (timed_out && ireq->cc->abort_callback)
+		return ireq->cc->abort_callback(ireq->cc, ireq);
+	return timed_out;
+}
+
+struct pcs_int_request * __ireq_alloc(void);
+struct pcs_int_request *ireq_alloc(struct pcs_dentry_info *di);
+struct pcs_int_request *ireq_alloc_by_cluster(struct pcs_cluster_core *cc);
+void ireq_init(struct pcs_dentry_info *di, struct pcs_int_request *);
+void ireq_init_by_cluster(struct pcs_cluster_core *cc, struct pcs_int_request *);
+void ireq_destroy(struct pcs_int_request *);
+
+void ireq_delay(struct pcs_int_request *ireq);
+void ireq_handle_hole(struct pcs_int_request *ireq);
+
+void pcs_process_ireq(struct pcs_int_request *ireq);
+
+#endif /* _PCS_REQ_H_ */
diff --git a/fs/fuse/kio/pcs/pcs_rpc.c b/fs/fuse/kio/pcs/pcs_rpc.c
new file mode 100644
index 000000000000..2ec7423a3f54
--- /dev/null
+++ b/fs/fuse/kio/pcs/pcs_rpc.c
@@ -0,0 +1,1314 @@
+/* An attempt at a universal RPC layer.
+ *
+ * All the components (except for MDS) used to assume asymmetrical communication:
+ * if a connection is opened actively, it sends requests but does not receive them.
+ * If it is opened passively, it receives requests but sends only responses.
+ * This layer does not impose this limitation.
+ *
+ * API:
+ * pcs_rpc_create(struct pcs_rpc_engine * eng, struct pcs_rpc_params *parm, struct rpc_ops * ops)
+ *   - create new rpc client with requested parameters/ops
+ * pcs_rpc_close(struct pcs_rpc * ep)
+ *   - close the client. It will probably not be destroyed immediately, but it is guaranteed
+ *     that ops will not be called anymore. If some messages are queued inside the rpc engine,
+ *     they will be completed before pcs_rpc_close() returns, but if messages are still
+ *     under the client's control, msg->done() can be called later.
+ */
+
+#include <net/sock.h>
+#include <linux/net.h>
+#include <linux/kthread.h>
+#include <linux/types.h>
+
+
+#include "pcs_types.h"
+#include "pcs_sock_io.h"
+#include "pcs_rpc.h"
+#include "pcs_cluster.h"
+#include "log.h"
+
+static void timer_work(struct work_struct *w);
+static int rpc_gc_classify(struct pcs_rpc * ep);
+
+static unsigned int rpc_hash(PCS_NODE_ID_T * id)
+{
+	return *(unsigned int*)id % PCS_RPC_HASH_SIZE;
+}
+
+static struct pcs_rpc *
+pcs_rpc_lookup(struct pcs_rpc_engine * eng, PCS_NODE_ID_T * id) __attribute__((unused));
+
+static struct pcs_rpc *
+pcs_rpc_lookup(struct pcs_rpc_engine * eng, PCS_NODE_ID_T * id)
+{
+	struct pcs_rpc * ep;
+
+	hlist_for_each_entry(ep, &eng->ht[rpc_hash(id)], link) {
+		if (memcmp(&ep->peer_id, id, sizeof(ep->peer_id)) == 0)
+			return pcs_rpc_get(ep);
+	}
+	return NULL;
+}
+static void rpc_add_hash(struct pcs_rpc * ep) __attribute__ ((unused));
+static void rpc_del_hash(struct pcs_rpc * ep) __attribute__ ((unused));
+
+static void rpc_add_hash(struct pcs_rpc * ep)
+{
+	if (!hlist_unhashed(&ep->link))
+		hlist_del(&ep->link);
+
+	if (!(ep->flags & PCS_RPC_F_HASHED)) {
+		ep->flags |= PCS_RPC_F_HASHED;
+		pcs_rpc_get(ep);
+	}
+
+	hlist_add_head(&ep->link, &ep->eng->ht[rpc_hash(&ep->peer_id)]);
+}
+
+static void rpc_del_hash(struct pcs_rpc * ep)
+{
+	if (ep->flags & PCS_RPC_F_HASHED) {
+		ep->flags &= ~PCS_RPC_F_HASHED;
+		hlist_del(&ep->link);
+		hlist_add_head(&ep->link, &ep->eng->unhashed);
+		pcs_rpc_put(ep);
+	}
+}
+
+
+struct pcs_msg * pcs_rpc_lookup_xid(struct pcs_rpc * ep, PCS_XID_T * xid)
+{
+	struct pcs_msg * msg;
+
+	/* TODO: lookup may be optimized by using a hash instead of a list */
+	list_for_each_entry(msg, &ep->pending_queue, list) {
+		struct pcs_rpc_hdr * h = (struct pcs_rpc_hdr *)msg_inline_head(msg);
+		if (memcmp(&h->xid, xid, sizeof(PCS_XID_T)) == 0)
+			return msg;
+	}
+	return NULL;
+}
+
+static void pcs_set_rpc_error(pcs_error_t * err, int error, struct pcs_rpc * ep)
+{
+	err->value = error;
+
+	if (error == PCS_ERR_NOMEM) {
+		/* Sad exception, NOMEM is definitely a local error. XXX Find a way to beautify this. */
+		err->remote = 0;
+	} else {
+		err->remote = 1;
+		err->offender = ep->peer_id;
+	}
+}
+
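+/* Each RPC keeps a "kill calendar": a ring of RPC_MAX_CALENDAR hash lists
+ * indexed by the message expiry second (kill_arrow plus the timeout rounded up
+ * to whole seconds, masked by RPC_MAX_CALENDAR - 1) and serviced by the delayed
+ * calendar_work scheduled below.
+ */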
+static void pcs_msg_add_calendar(struct pcs_msg * msg,	bool update)
+{
+	unsigned int kill_slot;
+	struct pcs_rpc *ep = msg->rpc;
+
+	BUG_ON(!ep);
+	kill_slot = update? msg->rpc->kill_arrow + ((msg->timeout + HZ -1) / HZ) : msg->kill_slot;
+	kill_slot = kill_slot & (RPC_MAX_CALENDAR - 1);
+	hlist_add_head(&msg->kill_link, &ep->kill_calendar[kill_slot]);
+	msg->kill_slot = kill_slot;
+
+	if (unlikely(!timer_pending(&ep->calendar_work.timer))) {
+		struct pcs_cluster_core *cc = cc_from_rpc(ep->eng);
+
+		mod_delayed_work(cc->wq, &ep->calendar_work, HZ);
+	}
+
+}
+
+void pcs_msg_del_calendar(struct pcs_msg * msg)
+{
+	int kill_slot = msg->kill_slot;
+
+	if (hlist_unhashed(&msg->kill_link))
+		return;
+
+	BUG_ON(kill_slot >= RPC_MAX_CALENDAR);
+	BUG_ON(!msg->rpc);
+	BUG_ON((msg->kill_slot != kill_slot));
+
+	hlist_del_init(&msg->kill_link);
+
+}
+
+void rpc_abort(struct pcs_rpc * ep, int fatal, int error)
+{
+	int state = ep->state;
+	struct list_head failed_list;
+
+	BUG_ON(!mutex_is_locked(&ep->mutex));
+	TRACE("ep:%p->state:%d fatal:%d error:%d\n", ep, state, fatal, error);
+
+	ep->flags &= ~(PCS_RPC_F_PEER_VERIFIED | PCS_RPC_F_PEER_AUTHORIZED);
+	ep->flags &= ~PCS_RPC_F_PEER_ID;
+
+	if (state == PCS_RPC_DESTROY || state == PCS_RPC_ABORT)
+		return;
+
+	/* Passively open connections are not reconnected */
+	if (ep->flags & (PCS_RPC_F_PASSIVE|PCS_RPC_F_NO_RETRY|PCS_RPC_F_DEAD))
+		fatal = 1;
+
+	ep->state = fatal ? PCS_RPC_ABORT : PCS_RPC_UNCONN;
+	cancel_delayed_work(&ep->timer_work);
+
+	pcs_rpc_get(ep);
+	INIT_LIST_HEAD(&failed_list);
+
+	while (!list_empty(&ep->pending_queue)) {
+		struct pcs_msg * msg = list_first_entry(&ep->pending_queue, struct pcs_msg, list);
+		list_move_tail(&msg->list, &failed_list);
+		TRACE("aborted msg to " PEER_FMT ", tmo=%d, err=%d, %ld", PEER_ARGS(ep),
+		      msg->timeout, error, (long)(msg->start_time + msg->timeout - jiffies));
+		pcs_msg_del_calendar(msg);
+		msg->stage = PCS_MSG_STAGE_NONE;
+	}
+	if (fatal) {
+		while (!list_empty(&ep->state_queue)) {
+			struct pcs_msg * msg = list_first_entry(&ep->state_queue, struct pcs_msg, list);
+			list_move_tail(&msg->list, &failed_list);
+			TRACE("aborted unsent msg to " PEER_FMT ", tmo=%d, err=%d", PEER_ARGS(ep),
+			      msg->timeout, error);
+			pcs_msg_del_calendar(msg);
+			msg->stage = PCS_MSG_STAGE_NONE;
+		}
+	}
+
+	if (ep->conn) {
+		struct pcs_ioconn * ioconn = ep->conn;
+		struct pcs_sockio * conn = sio_from_ioconn(ioconn);
+
+		ep->conn = NULL;
+		if (ep->gc)
+			list_lru_del(&ep->gc->lru, &ep->lru_link);
+
+		conn->parent = NULL;
+		pcs_sock_error(conn, error);
+	}
+
+	if (ep->state == PCS_RPC_UNCONN) {
+		struct pcs_cluster_core *cc = cc_from_rpc(ep->eng);
+
+		ep->state = PCS_RPC_HOLDDOWN;
+		queue_delayed_work(cc->wq, &ep->timer_work, ep->params.holddown_timeout);
+	}
+
+	while (!list_empty(&failed_list)) {
+		struct pcs_msg * msg = list_first_entry(&failed_list, struct pcs_msg, list);
+		list_del_init(&msg->list);
+		pcs_set_rpc_error(&msg->error, error, ep);
+		BUG_ON(!hlist_unhashed(&msg->kill_link));
+		msg->done(msg);
+	}
+
+	if (ep->state != PCS_RPC_ABORT)
+		goto out;
+
+	if (!(ep->flags & PCS_RPC_F_DEAD)) {
+		/* RPC is aborted, notify its owner. Owner is supposed to close us. */
+		if (ep->ops->state_change)
+			ep->ops->state_change(ep, error);
+	}
+
+out:
+	pcs_rpc_put(ep);
+}
+
+/* Client close. */
+void pcs_rpc_close(struct pcs_rpc * ep)
+{
+	mutex_lock(&ep->mutex);
+	BUG_ON(ep->flags & PCS_RPC_F_DEAD);
+	BUG_ON(ep->flags & PCS_RPC_F_PASSIVE);
+
+	ep->flags |= PCS_RPC_F_DEAD;
+	rpc_abort(ep, 1, PCS_ERR_NET_ABORT);
+	ep->state = PCS_RPC_DESTROY;
+	mutex_unlock(&ep->mutex);
+
+	pcs_rpc_put(ep);
+
+}
+
+void pcs_rpc_attach_new_ep(struct pcs_rpc * ep, struct pcs_rpc_engine * eng)
+{
+	eng->nrpcs++;
+	hlist_add_head(&ep->link, &eng->unhashed);
+	ep->eng = eng;
+	ep->state = PCS_RPC_UNCONN;
+	ep->flags = 0;
+	atomic_set(&ep->refcnt, 1);
+	ep->retries = 0;
+	ep->peer_role = PCS_NODE_ROLE_TEST;
+	ep->peer_flags = 0;
+	ep->peer_version = ~0U;
+	ep->conn = NULL;
+	ep->private = NULL;
+	INIT_LIST_HEAD(&ep->pending_queue);
+	INIT_LIST_HEAD(&ep->state_queue);
+	INIT_LIST_HEAD(&ep->input_queue);
+	INIT_LIST_HEAD(&ep->lru_link);
+
+	spin_lock_init(&ep->q_lock);
+	mutex_init(&ep->mutex);
+	ep->accounted = 0;
+	ep->netlat_min = ~0U;
+	ep->netlat_max = 0;
+	atomic_set(&ep->netlat_cnt, 0);
+	atomic64_set(&ep->netlat_avg, 0);
+	ep->cpu = WORK_CPU_UNBOUND;
+
+	ep->gc = NULL;
+	if (eng->max_gc_index)
+		ep->gc = &eng->gc[0];
+
+	if (!timer_pending(&eng->stat_work.timer)) {
+		struct pcs_cluster_core *cc = cc_from_rpc(eng);
+
+		mod_delayed_work(cc->wq, &eng->stat_work, PCS_MSG_MAX_CALENDAR * HZ);
+	}
+}
+
+void pcs_rpc_destroy(struct pcs_rpc * ep)
+{
+	BUG_ON(ep->state != PCS_RPC_DESTROY);
+	BUG_ON(ep->flags & PCS_RPC_F_HASHED);
+	BUG_ON(!(ep->flags & PCS_RPC_F_DEAD));
+	BUG_ON(!list_empty(&ep->input_queue));
+	BUG_ON(!list_empty(&ep->state_queue));
+	BUG_ON(!list_empty(&ep->pending_queue));
+	BUG_ON(timer_pending(&ep->timer_work.timer));
+
+	/* pcs_free(ep->sun); */
+	/* ep->sun = NULL; */
+	if (ep->gc)
+		list_lru_del(&ep->gc->lru, &ep->lru_link);
+	hlist_del(&ep->link);
+	ep->eng->nrpcs--;
+	cancel_delayed_work_sync(&ep->calendar_work);
+	if (ep->eng->nrpcs == 0)
+		cancel_delayed_work_sync(&ep->eng->stat_work);
+
+	memset(ep, 0xFF, sizeof(*ep));
+	kfree(ep);
+}
+
+static void rpc_eof_cb(struct pcs_sockio * sio)
+{
+	struct pcs_rpc * ep = sio->parent;
+
+	if (ep == NULL)
+		return;
+
+	/* The dead socket is finally closed; we could already have opened another one.
+	 * This is awkward.
+	 */
+	if (&sio->ioconn != ep->conn)
+		return;
+
+	rpc_abort(ep, 0, PCS_ERR_NET_ABORT);
+}
+
+
+struct pcs_msg * pcs_rpc_alloc_error_response(struct pcs_rpc * ep, struct pcs_rpc_hdr * req_hdr, int err, int size)
+{
+	struct pcs_msg * eresp;
+	struct pcs_rpc_error_resp * eh;
+
+	BUG_ON(size < sizeof(struct pcs_rpc_error_resp));
+
+	eresp = pcs_alloc_response(req_hdr, size);
+	if (eresp) {
+		eh = (struct pcs_rpc_error_resp *)eresp->_inline_buffer;
+		eh->hdr.type = PCS_RPC_ERROR_RESP;
+		eh->offender = ep->eng->local_id;
+		eh->code = err;
+		eh->npayloads = 0;
+		memset(&eh->payload, 0, sizeof(eh->payload));
+	}
+	return eresp;
+}
+
+void pcs_rpc_error_respond(struct pcs_rpc * ep, struct pcs_msg * msg, int err)
+{
+	struct pcs_msg * eresp;
+	struct pcs_rpc_hdr * h = (struct pcs_rpc_hdr *)msg->_inline_buffer;
+
+	if (ep->state < PCS_RPC_AUTH || ep->state > PCS_RPC_WORK)
+		return;
+
+	eresp = pcs_rpc_alloc_error_response(ep, h, err, sizeof(struct pcs_rpc_error_resp));
+	if (eresp) {
+		struct pcs_sockio *sio = sio_from_ioconn(ep->conn);
+		pcs_sock_sendmsg(sio, eresp);
+	}
+}
+
+/* After the client gets the csconn_complete() callback, it performs its own setup and
+ * completes the switch to the WORK state by calling this function.
+ */
+static void pcs_rpc_enable(struct pcs_rpc * ep, int error)
+{
+	struct pcs_cluster_core *cc = cc_from_rpc(ep->eng);
+
+	BUG_ON(!mutex_is_locked(&ep->mutex));
+	BUG_ON(ep->state != PCS_RPC_APPWAIT);
+
+	if (error) {
+		rpc_abort(ep, 1, error);
+		return;
+	}
+
+	if (ep->gc) {
+		int idx = rpc_gc_classify(ep);
+
+		if (ep->eng->gc + idx != ep->gc) {
+			list_lru_del(&ep->gc->lru, &ep->lru_link);
+			ep->gc = ep->eng->gc + idx;
+			list_lru_add(&ep->gc->lru, &ep->lru_link);
+		}
+	}
+	TRACE("ep(%p)->state: WORK\n", ep);
+	ep->state = PCS_RPC_WORK;
+	queue_work(cc->wq, &ep->work);
+}
+
+static void handle_response(struct pcs_rpc * ep, struct pcs_msg * msg)
+{
+	struct pcs_rpc_hdr * h = (struct pcs_rpc_hdr *)msg->_inline_buffer;
+	struct pcs_msg * req;
+
+	/* The use of iocount is unusual and deserves an explanation. If the response
+	 * is processed synchronously, this iocount is unnecessary. But if done() needs
+	 * to queue the response, it can increase iocount to hold the message for itself.
+	 */
+	pcs_msg_io_start(msg, pcs_free_msg);
+	req = pcs_rpc_lookup_xid(ep, &h->xid);
+	if (req == NULL)
+		goto drop;
+
+	pcs_msg_del_calendar(req);
+	list_del(&req->list);
+	if (h->type == PCS_RPC_ERROR_RESP) {
+		struct pcs_rpc_error_resp * eh = (struct pcs_rpc_error_resp *)msg->_inline_buffer;
+
+		if (msg->size < sizeof(struct pcs_rpc_error_resp))
+			pcs_set_rpc_error(&req->error, PCS_ERR_PROTOCOL, ep);
+		else {
+			req->error = (pcs_error_t){ .value = eh->code, .remote = 1, .offender = eh->offender };
+			req->response = msg;
+		}
+	} else {
+		struct pcs_rpc_hdr * req_h = (struct pcs_rpc_hdr *)msg_inline_head(req);
+
+		if ((req_h->type ^ h->type) & ~PCS_RPC_DIRECTION)
+			pcs_set_rpc_error(&req->error, PCS_ERR_PROTOCOL, ep);
+		else
+			req->response = msg;
+	}
+
+	if (ep->ops->hook_response)
+		ep->ops->hook_response(ep, req);
+
+	req->stage = PCS_MSG_STAGE_DONE;
+	BUG_ON(!hlist_unhashed(&msg->kill_link));
+	req->done(req);
+
+drop:
+	pcs_msg_io_end(msg);
+}
+
+static void handle_keep_waiting(struct pcs_rpc * ep, struct pcs_msg * msg)
+{
+	struct pcs_rpc_keep_waiting * h = (struct pcs_rpc_keep_waiting *)msg->_inline_buffer;
+	struct pcs_msg * req;
+
+	if (h->hdr.len < sizeof(struct pcs_rpc_keep_waiting))
+		return;
+
+	TRACE("Received keep wait from " NODE_FMT " for request " XID_FMT,
+	      NODE_ARGS(h->hdr.xid.origin), XID_ARGS(h->xid));
+
+	req = pcs_rpc_lookup_xid(ep, &h->xid);
+	if (!req)
+		return;
+
+	if (ep->ops->keep_waiting)
+		ep->ops->keep_waiting(ep, req, msg);
+
+	/* Restart kill timer as if message arrived right now */
+	if (!hlist_unhashed(&req->kill_link)) {
+		pcs_msg_del_calendar(req);
+		pcs_msg_add_calendar(req, 1);
+	}
+
+	/* Requeue message to tail of pending queue and restart RPC timer */
+	if (req->stage == PCS_MSG_STAGE_WAIT) {
+		req->start_time = jiffies;
+		list_move_tail(&req->list, &ep->pending_queue);
+	}
+}
+
+void pcs_rpc_cancel_request(struct pcs_msg * msg)
+{
+	pcs_msg_del_calendar(msg);
+	list_del(&msg->list);
+	msg->stage = PCS_MSG_STAGE_NONE;
+	pcs_set_rpc_error(&msg->error, PCS_ERR_CANCEL_KEEPWAIT, msg->rpc);
+	msg->done(msg);
+}
+
+void rpc_work_input(struct pcs_msg * msg)
+{
+	struct pcs_rpc * ep = msg->rpc;
+	struct pcs_rpc_hdr * h = (struct pcs_rpc_hdr *)msg->_inline_buffer;
+
+	if (ep == NULL || ep->state != PCS_RPC_WORK)
+		goto drop;
+
+	msg->done = pcs_free_msg;
+
+	if (RPC_IS_RESPONSE(h->type)) {
+		handle_response(ep, msg);
+		return;
+	} else if (h->type == PCS_RPC_KEEP_WAITING) {
+		handle_keep_waiting(ep, msg);
+	} else {
+		int res;
+
+		res = ep->ops->demux_request(ep, msg);
+		/* Successfully demuxed */
+		if (res == 0)
+			return;
+
+		/* Client can return error code to pass back to requestor */
+		pcs_rpc_error_respond(ep, msg, res);
+	}
+
+drop:
+	pcs_free_msg(msg);
+}
+
+struct pcs_msg * rpc_get_hdr(struct pcs_sockio * sio)
+{
+	struct pcs_rpc * ep = (struct pcs_rpc *)sio->parent;
+	struct pcs_rpc_hdr * h = (struct pcs_rpc_hdr*)sio_inline_buffer(sio);
+	struct pcs_msg * msg;
+	void (*next_input)(struct pcs_msg *);
+
+	if (ep == NULL)
+		return NULL;
+
+	/* Fatal stream format error */
+	if (h->len < sizeof(struct pcs_rpc_hdr) || h->len > ep->params.max_msg_size) {
+		pcs_log(0, "Bad message header %u %u\n", h->len, h->type);
+		return NULL;
+	}
+
+	switch (ep->state) {
+	case PCS_RPC_WORK:
+		/* Client can override get_hdr to allocate special buffer. */
+		if (ep->ops->get_hdr) {
+			msg = ep->ops->get_hdr(ep, h);
+			if (msg)
+				return msg;
+		}
+		next_input = rpc_work_input;
+		break;
+	default:
+		pcs_log(0, "Received msg in bad state %u\n", ep->state);
+		BUG();
+		return NULL;
+
+	}
+
+	msg = pcs_rpc_alloc_input_msg(ep, h->len);
+	if (!msg) {
+		pcs_sock_throttle(sio);
+		return NULL;
+	}
+
+	memcpy(msg->_inline_buffer, h, sizeof(struct pcs_rpc_hdr));
+	msg->done = next_input;
+	msg->size = h->len;
+	msg->private = NULL;
+	return msg;
+}
+
+
+/* Start a connect. It is triggered by a message sent to this peer, or can be called
+ * explicitly if the caller needs to steal the csconn from userspace.
+ */
+void pcs_rpc_connect(struct pcs_rpc * ep)
+{
+
+	/* Nothing to do, connect is already initiated or in holddown state */
+	if (ep->state != PCS_RPC_UNCONN)
+		return;
+
+	if (ep->flags & PCS_RPC_F_LOCAL) {
+		/* TODO: the local path is temporarily disabled */
+		BUG_ON(1);
+	} else {
+		TRACE("Connecting to node " NODE_FMT "\n", NODE_ARGS(ep->peer_id));
+
+
+		BUG_ON(!ep->ops->connect);
+		ep->ops->connect(ep);
+	}
+}
+
+/* Send a notification which does not require waiting for a response from the peer.
+ * It is also used internally as a "raw" submit.
+ */
+static void pcs_rpc_send(struct pcs_rpc * ep, struct pcs_msg * msg, bool requeue)
+{
+	struct pcs_sockio *sio = sio_from_ioconn(ep->conn);
+
+	BUG_ON(!mutex_is_locked(&ep->mutex));
+	BUG_ON(msg->rpc != (requeue ? ep: NULL));
+
+	TRACE("ENTER ep:%p state:%d msg:%p\n", ep, ep->state, msg);
+
+	if (!requeue) {
+		msg->rpc = pcs_rpc_get(ep);
+		if (msg->timeout) {
+			pcs_msg_add_calendar(msg, 1);
+		} else {
+			msg->kill_slot = RPC_MAX_CALENDAR;
+			INIT_HLIST_NODE(&msg->kill_link);
+		}
+	} else /* Requeued messages must be scheduled in calendar */
+		BUG_ON(msg->timeout && hlist_unhashed(&msg->kill_link));
+
+	if (ep->state == PCS_RPC_WORK) {
+		BUG_ON(ep->conn == NULL);
+		if (msg->size)
+			pcs_sock_sendmsg(sio, msg);
+		else {
+			pcs_msg_del_calendar(msg);
+			msg->done(msg);
+		}
+		return;
+	}
+
+	if (ep->state == PCS_RPC_ABORT || ep->state == PCS_RPC_DESTROY) {
+		pcs_set_rpc_error(&msg->error, PCS_ERR_NET_ABORT, ep);
+		pcs_msg_del_calendar(msg);
+		msg->done(msg);
+		return;
+	}
+
+	list_add_tail(&msg->list, &ep->state_queue);
+	msg->stage = PCS_MSG_STAGE_UNSENT;
+
+	if (ep->state == PCS_RPC_UNCONN)
+		pcs_rpc_connect(ep);
+}
+
+void pcs_rpc_kick_queue(struct pcs_rpc * ep)
+{
+	struct pcs_cluster_core *cc = cc_from_rpc(ep->eng);
+
+	queue_work_on(ep->cpu, cc->wq, &ep->work);
+}
+
+void pcs_rpc_queue(struct pcs_rpc * ep, struct pcs_msg * msg)
+{
+	int was_idle;
+
+	spin_lock(&ep->q_lock);
+	was_idle = list_empty(&ep->input_queue);
+	list_add_tail(&msg->list, &ep->input_queue);
+
+	/* Naive socket-to-cpu binding approach */
+	if (time_is_before_jiffies(ep->cpu_stamp) && was_idle) {
+		ep->cpu_stamp = jiffies + PCS_RPC_CPU_SLICE;
+		ep->cpu = smp_processor_id();
+	}
+	spin_unlock(&ep->q_lock);
+
+	if (was_idle)
+		pcs_rpc_kick_queue(ep);
+}
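+
+/* A note on the binding in pcs_rpc_queue() above: while the input queue stays busy,
+ * work keeps running on ep->cpu; only when the queue was found idle and the previous
+ * slice (PCS_RPC_CPU_SLICE, 100 ms) has expired is the endpoint rebound to the CPU
+ * of the current submitter.
+ */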
+
+static void calendar_work(struct work_struct *w)
+{
+	struct pcs_rpc * ep = container_of(w, struct pcs_rpc, calendar_work.work);
+	int kill_slot = ep->kill_arrow & (RPC_MAX_CALENDAR - 1);
+	struct pcs_cluster_core *cc = cc_from_rpc(ep->eng);
+	struct hlist_head * bucket;
+	int i, count = 0;
+
+	mutex_lock(&ep->mutex);
+	bucket = &ep->kill_calendar[kill_slot];
+	while (!hlist_empty(bucket)) {
+		struct pcs_msg * msg = hlist_entry(bucket->first, struct pcs_msg, kill_link);
+		struct pcs_rpc_hdr * h = (struct pcs_rpc_hdr *)msg_inline_head(msg);
+
+		(void)h;
+		TRACE("killing msg to " PEER_FMT " type=%u xid=" XID_FMT " stage=%d tmo=%d exp=%ld rem=%ld\n",
+		      PEER_ARGS(msg->rpc), h->type, XID_ARGS(h->xid),
+		      msg->stage, msg->timeout,
+		      (long)(msg->start_time + msg->timeout - jiffies),
+		      (long)(msg->start_time + msg->rpc->params.response_timeout - jiffies));
+
+		pcs_msg_del_calendar(msg);
+		switch (msg->stage) {
+		case PCS_MSG_STAGE_SEND:
+			if (pcs_sock_cancel_msg(msg)) {
+				/* The message is under network IO right now. We cannot kill it
+				 * without destroying the whole connection, so we just reschedule
+				 * the kill. When the IO completes, the message will be killed without
+				 * even waiting for a response. But if the IO gets stuck, we will
+				 * violate the deadline, alas. Hopefully this is the only place where
+				 * we violate the deadline now.
+				 */
+				msg->kill_slot = (msg->kill_slot + 1) & (RPC_MAX_CALENDAR - 1);
+				pcs_msg_add_calendar(msg, 0);
+				continue;
+			}
+			break;
+		default:
+			list_del(&msg->list);
+			break;
+		}
+
+		if (msg->stage == PCS_MSG_STAGE_WAIT) {
+			/* Leave the rpc timer running. If it expires before any (late) response
+			 * is received, the rpc will be shut down.
+			 */
+			pcs_set_rpc_error(&msg->error, PCS_ERR_RESPONSE_TIMEOUT, msg->rpc);
+		} else {
+			msg->stage = PCS_MSG_STAGE_SENT;
+			pcs_set_rpc_error(&msg->error, PCS_ERR_WRITE_TIMEOUT, msg->rpc);
+		}
+		BUG_ON(!hlist_unhashed(&msg->kill_link));
+		msg->done(msg);
+		count++;
+	}
+	if (count)
+		printk("%s %d messages to "PEER_FMT" destroyed\n", __FUNCTION__,
+		       count, PEER_ARGS(ep));
+
+	for (i=0; i < RPC_MAX_CALENDAR-1; i++) {
+		kill_slot = (ep->kill_arrow  + i) & (RPC_MAX_CALENDAR - 1);
+
+		if (!hlist_empty(&ep->kill_calendar[kill_slot])) {
+			/* FIXME: suboptimal scheduling */
+			mod_delayed_work(cc->wq, &ep->calendar_work, HZ);
+			break;
+		}
+	}
+	ep->kill_arrow++;
+	mutex_unlock(&ep->mutex);
+}
+
+static void update_xmit_timeout(struct pcs_rpc *ep)
+{
+	struct pcs_sockio *sio = sio_from_ioconn(ep->conn);
+	struct pcs_cluster_core *cc = cc_from_rpc(ep->eng);
+	struct pcs_msg * msg;
+	unsigned long timeout = 0;
+	unsigned long tx;
+
+	BUG_ON(ep->state != PCS_RPC_WORK);
+
+	if (list_empty(&ep->pending_queue) && list_empty(&sio->write_queue)) {
+		if (timer_pending(&ep->timer_work.timer))
+			cancel_delayed_work(&ep->timer_work);
+		return;
+	}
+	if (!list_empty(&ep->pending_queue)) {
+		msg = list_first_entry(&ep->pending_queue, struct pcs_msg, list);
+
+		timeout = msg->start_time + ep->params.response_timeout;
+	}
+	if (!list_empty(&sio->write_queue)) {
+		msg = list_first_entry(&sio->write_queue, struct pcs_msg, list);
+		tx = msg->start_time + sio->send_timeout;
+		if (time_after(tx, timeout))
+			timeout = tx;
+	}
+	if (time_is_before_jiffies(timeout))
+		timeout = 0;
+	else
+		timeout -= jiffies;
+
+	mod_delayed_work(cc->wq, &ep->timer_work, timeout);
+
+}
+
+static void rpc_queue_work(struct work_struct *w)
+{
+	LIST_HEAD(input_q);
+	LIST_HEAD(complete_q);
+	LIST_HEAD(state_q);
+	struct pcs_rpc *ep = pcs_rpc_from_work(w);
+	int repeat;
+
+	pcs_rpc_get(ep);
+again:
+	spin_lock(&ep->q_lock);
+	list_splice_tail_init(&ep->input_queue, &input_q);
+	spin_unlock(&ep->q_lock);
+
+	mutex_lock(&ep->mutex);
+
+	TRACE("Handle queues\n");
+
+	/* Process messages which are already in the sock queue */
+	if (ep->state == PCS_RPC_WORK) {
+		struct pcs_sockio *sio = sio_from_ioconn(ep->conn);
+
+		pcs_sockio_xmit(sio);
+	}
+
+	/* Process delayed ones */
+	while (!list_empty(&input_q)) {
+		struct pcs_msg * msg = list_first_entry(&input_q, struct pcs_msg, list);
+
+		list_del_init(&msg->list);
+		pcs_rpc_send(ep, msg, 0);
+	}
+	list_splice_tail_init(&ep->state_queue, &state_q);
+	while (!list_empty(&state_q)) {
+		struct pcs_msg * msg = list_first_entry(&state_q, struct pcs_msg, list);
+
+		/* The original code allowed msg->rpc to come from an alien RPC. That is a very
+		   strange assumption; it seems impossible here, and it would screw up our locking. */
+		BUG_ON(msg->rpc != ep);
+
+		list_del_init(&msg->list);
+		pcs_rpc_send(ep, msg, 1);
+	}
+	repeat = 0;
+	if (ep->state == PCS_RPC_WORK) {
+		struct pcs_sockio *sio = sio_from_ioconn(ep->conn);
+
+		if (pcs_sockio_delayed_seg(sio))
+			repeat = 1;
+		update_xmit_timeout(ep);
+	}
+	mutex_unlock(&ep->mutex);
+	if (repeat)
+		goto again;
+	pcs_rpc_put(ep);
+
+}
+
+struct pcs_rpc * pcs_rpc_alloc_ep(void)
+{
+	return kzalloc(sizeof(struct pcs_rpc), GFP_NOIO);
+}
+
+void pcs_rpc_configure_new_ep(struct pcs_rpc * ep, struct pcs_rpc_params *parm, struct pcs_rpc_ops * ops)
+{
+	int i;
+
+	ep->params = *parm;
+	ep->ops = ops;
+	ep->kill_arrow = 0;
+
+	INIT_WORK(&ep->work, rpc_queue_work);
+	INIT_DELAYED_WORK(&ep->timer_work, timer_work);
+	INIT_DELAYED_WORK(&ep->calendar_work, calendar_work);
+
+	for (i = 0; i < RPC_MAX_CALENDAR; i++)
+		INIT_HLIST_HEAD(&ep->kill_calendar[i]);
+}
+
+struct pcs_rpc * pcs_rpc_create(struct pcs_rpc_engine * eng, struct pcs_rpc_params *parm, struct pcs_rpc_ops * ops)
+{
+	struct pcs_rpc * ep = pcs_rpc_alloc_ep();
+	pcs_rpc_attach_new_ep(ep, eng);
+	pcs_rpc_configure_new_ep(ep, parm, ops);
+	return ep;
+}
+
+void pcs_rpc_sent(struct pcs_msg * msg)
+{
+	struct pcs_rpc * ep = msg->rpc;
+
+	BUG_ON(!mutex_is_locked(&ep->mutex));
+
+	msg->start_time = jiffies;
+	list_add_tail(&msg->list, &ep->pending_queue);
+	msg->stage = PCS_MSG_STAGE_WAIT;
+
+	if (!timer_pending(&ep->timer_work.timer)) {
+		struct pcs_cluster_core *cc = cc_from_rpc(ep->eng);
+
+		mod_delayed_work(cc->wq, &ep->timer_work, ep->params.response_timeout);
+	}
+
+	if (msg->timeout) {
+		BUG_ON(msg->kill_slot >= RPC_MAX_CALENDAR);
+
+		pcs_msg_add_calendar(msg, 0);
+	} else
+		INIT_HLIST_NODE(&msg->kill_link);
+}
+
+static void rpc_call_sent_cb(struct pcs_msg * clone)
+{
+	struct pcs_msg * msg = clone->private;
+	struct pcs_rpc * ep = clone->rpc;
+
+	BUG_ON(!mutex_is_locked(&ep->mutex));
+
+	/* Inherit kill slot */
+	msg->kill_slot = clone->kill_slot;
+
+	///// TODO: dmonakhov@ optimize states
+	if (pcs_if_error(&clone->error)) {
+		switch (ep->state) {
+		case PCS_RPC_UNCONN:
+		case PCS_RPC_HOLDDOWN:
+		case PCS_RPC_CONNECT:
+		case PCS_RPC_AUTH:
+		case PCS_RPC_AUTHWAIT:
+			if (clone->timeout ||
+			    clone->error.value == PCS_ERR_WRITE_TIMEOUT ||
+			    clone->error.value == PCS_ERR_RESPONSE_TIMEOUT)
+				break;
+
+			pcs_clear_error(&clone->error);
+			list_add_tail(&clone->list, &ep->state_queue);
+			if (ep->state == PCS_RPC_UNCONN)
+				pcs_rpc_connect(ep);
+			return;
+		}
+
+		pcs_copy_error(&msg->error, &clone->error);
+		msg->done(msg);
+		pcs_free_msg(clone);
+		return;
+	}
+
+	/*
+	 * TODO: We should perform a periodic rpc health check, as userspace does
+	 * via rpc_trace_health.
+	 */
+	pcs_rpc_sent(msg);
+	pcs_free_msg(clone);
+}
+
+/* "User-friendly" send. It is not quite optimal (uses redundant clone), but appropriate
+ * for most of simple rpc calls
+ */
+
+static void rpc_msg_output_destructor(struct pcs_msg * msg)
+{
+	if (msg->rpc)
+		pcs_rpc_put(msg->rpc);
+	memset(msg, 0xFF, sizeof(*msg));
+	kfree(msg);
+}
+
+struct pcs_msg * pcs_rpc_clone_msg(struct pcs_msg * msg)
+{
+	struct pcs_msg *cloned_msg = pcs_clone_msg(msg);
+
+	if (cloned_msg)
+		cloned_msg->destructor = rpc_msg_output_destructor;
+	return cloned_msg;
+}
+
+void pcs_rpc_call(struct pcs_rpc * ep, struct pcs_msg * msg)
+{
+	struct pcs_msg * clone;
+
+	BUG_ON(msg->rpc != NULL);
+	msg->rpc = pcs_rpc_get(ep);
+
+	clone = pcs_rpc_clone_msg(msg);
+	if (clone == NULL) {
+		pcs_set_local_error(&msg->error, PCS_ERR_NOMEM);
+		BUG_ON(!hlist_unhashed(&msg->kill_link));
+		msg->done(msg);
+		return;
+	}
+
+	pcs_clear_error(&clone->error);
+	clone->rpc = NULL;
+	clone->done = rpc_call_sent_cb;
+	clone->timeout = msg->timeout;
+
+	pcs_rpc_queue(ep, clone);
+}
+
+/* TODO: This place may not scale well; in fact, xids only need to be unique
+   per RPC, so it may be reasonable to make the generator per-CPU.
+*/
+void pcs_rpc_get_new_xid(struct pcs_rpc_engine *eng, PCS_XID_T *xid)
+{
+	xid->origin = eng->local_id;
+	/* Remember, xids should be unique per peer. The only reliable way to ensure this is
+	 * to generate xids globally.
+	 */
+	xid->val = atomic64_inc_return(&eng->xid_generator);
+}
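+
+/* A minimal usage sketch for pcs_rpc_call() and pcs_rpc_get_new_xid() above.
+ * MY_REQ_TYPE and my_req_done() are hypothetical placeholders, and the NULL
+ * check after allocation is omitted:
+ *
+ *	struct pcs_msg *msg = pcs_rpc_alloc_output_msg(sizeof(struct pcs_rpc_hdr) + payload_size);
+ *	struct pcs_rpc_hdr *h = (struct pcs_rpc_hdr *)msg->_inline_buffer;
+ *
+ *	h->len = msg->size;
+ *	h->type = MY_REQ_TYPE;
+ *	pcs_rpc_get_new_xid(ep->eng, &h->xid);
+ *	msg->done = my_req_done;
+ *	msg->timeout = ep->params.response_timeout;
+ *	pcs_rpc_call(ep, msg);
+ *
+ * The result is delivered through msg->done(); on success msg->response holds the
+ * reply, otherwise msg->error is set.
+ */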
+
+static int rpc_check_memlimit(struct pcs_rpc * ep)
+{
+	struct pcs_rpc_engine * eng = ep->eng;
+
+	if ((ep->flags & PCS_RPC_F_ACCT) &&
+	    eng->msg_allocated >= eng->mem_pressure_thresh) {
+		/* If congestion avoidance works, this should not happen.
+		 * However, if it does happen, we must do something.
+		 */
+		if (eng->msg_allocated > eng->mem_limit) {
+			pcs_log(LOG_ERR, "Hard memory limit exceeded");
+			return 1;
+		}
+		if (ep->peer_role == PCS_NODE_ROLE_CN) {
+			/* A CN contributes 3x (repl.norm) the memory pressure on the cluster */
+			if (3 * ep->accounted * eng->accounted_rpcs >= eng->msg_allocated) {
+				TRACE("Soft memory limit exceeded " PEER_FMT, PEER_ARGS(ep));
+				return 1;
+			}
+		} else {
+			if (ep->accounted * eng->accounted_rpcs >= eng->msg_allocated) {
+				TRACE("Soft memory limit exceeded " PEER_FMT, PEER_ARGS(ep));
+				return 1;
+			}
+		}
+	}
+	return 0;
+}
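+
+/* Example of the limits in rpc_check_memlimit() above: with eng->accounted_rpcs == 4
+ * and eng->msg_allocated at 16MB (above the pressure threshold), a CS/MDS peer is
+ * throttled once its own ep->accounted reaches 4MB (its fair share), while a CN peer
+ * hits the limit already at ~1.3MB because of the 3x replication factor.
+ */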
+
+void pcs_rpc_deaccount_msg(struct pcs_msg * msg)
+{
+	struct pcs_rpc * ep = msg->rpc;
+
+	msg->rpc = NULL;
+	ep->eng->msg_count--;
+
+	if (msg->accounted) {
+		ep->accounted -= msg->accounted;
+		ep->eng->msg_allocated -= msg->accounted;
+		if (ep->accounted == 0)
+			ep->eng->accounted_rpcs--;
+		msg->accounted = 0;
+		if (ep->state == PCS_RPC_WORK)
+			pcs_sock_unthrottle((struct pcs_sockio *)ep->conn);
+	}
+	pcs_rpc_put(ep);
+}
+
+static void pcs_rpc_account_msg(struct pcs_rpc * ep, struct pcs_msg * msg, int accounted)
+{
+	msg->accounted = 0;
+	msg->rpc = pcs_rpc_get(ep);
+
+	ep->eng->msg_count++;
+
+	if (ep->flags & PCS_RPC_F_ACCT) {
+		msg->accounted = accounted;
+
+		if (ep->accounted == 0)
+			ep->eng->accounted_rpcs++;
+
+		ep->eng->msg_allocated += accounted;
+		ep->accounted += accounted;
+	}
+}
+
+void pcs_rpc_account_adjust(struct pcs_msg * msg, int adjustment)
+{
+	if (msg->accounted && (msg->rpc->flags & PCS_RPC_F_ACCT)) {
+		struct pcs_rpc * ep = msg->rpc;
+
+		msg->accounted += adjustment;
+		ep->eng->msg_allocated += adjustment;
+		ep->accounted += adjustment;
+	}
+}
+
+static void pcs_rpc_input_destructor(struct pcs_msg * msg)
+{
+	pcs_rpc_deaccount_msg(msg);
+	kfree(msg);
+}
+
+/* get_iter() handler for messages with embedded payload right after pcs_msg */
+void pcs_rpc_get_iter_inline(struct pcs_msg * msg, int offset, struct iov_iter *it)
+{
+	BUG_ON(offset >= msg->size);
+
+	iov_iter_init_plain(it, msg->_inline_buffer, msg->size, 0);
+	iov_iter_advance(it, offset);
+}
+
+void pcs_rpc_init_input_msg(struct pcs_rpc * ep, struct pcs_msg * msg, int account)
+{
+	pcs_msg_io_init(msg);
+	msg->timeout = 0;
+	INIT_HLIST_NODE(&msg->kill_link);
+	pcs_rpc_account_msg(ep, msg, account);
+	msg->destructor = pcs_rpc_input_destructor;
+}
+
+struct pcs_msg * pcs_rpc_alloc_input_msg(struct pcs_rpc * ep, int datalen)
+{
+	struct pcs_msg * msg;
+
+	if (rpc_check_memlimit(ep))
+		return NULL;
+
+	msg = kzalloc(sizeof(struct pcs_msg) + datalen, GFP_NOIO);
+	if (msg) {
+		pcs_rpc_init_input_msg(ep, msg, sizeof(struct pcs_msg) + datalen);
+		msg->size = datalen;
+		msg->get_iter = pcs_rpc_get_iter_inline;
+	}
+	return msg;
+}
+
+
+static void pcs_msg_output_destructor(struct pcs_msg * msg)
+{
+	if (msg->rpc)
+		pcs_rpc_put(msg->rpc);
+	kfree(msg);
+}
+
+void pcs_rpc_init_output_msg(struct pcs_msg * msg)
+{
+	pcs_msg_io_init(msg);
+	pcs_clear_error(&msg->error);
+	msg->timeout = 0;
+	msg->rpc = NULL;
+	INIT_HLIST_NODE(&msg->kill_link);
+	msg->destructor = pcs_msg_output_destructor;
+}
+
+struct pcs_msg * pcs_rpc_alloc_output_msg(int datalen)
+{
+	struct pcs_msg * msg;
+
+	msg = kzalloc(sizeof(struct pcs_msg) + datalen, GFP_NOIO);
+	if (msg) {
+		pcs_rpc_init_output_msg(msg);
+		msg->size = datalen;
+		msg->get_iter = pcs_rpc_get_iter_inline;
+	}
+	return msg;
+}
+
+void pcs_rpc_init_response(struct pcs_msg * msg, struct pcs_rpc_hdr * req_hdr, int size)
+{
+	struct pcs_rpc_hdr * h;
+
+	h = (struct pcs_rpc_hdr *)msg->_inline_buffer;
+	h->len = size;
+	h->type = req_hdr->type | PCS_RPC_DIRECTION;
+	h->xid = req_hdr->xid;
+}
+
+struct pcs_msg * pcs_alloc_response(struct pcs_rpc_hdr * req_hdr, int size)
+{
+	struct pcs_msg * msg;
+
+	msg = pcs_rpc_alloc_output_msg(size);
+	if (msg == NULL)
+		return NULL;
+
+	pcs_rpc_init_response(msg, req_hdr, size);
+
+	return msg;
+}
+
+void pcs_rpc_set_peer_id(struct pcs_rpc * ep, PCS_NODE_ID_T * id, u8 role)
+{
+	BUG_ON(ep->flags & (PCS_RPC_F_PEER_ID|PCS_RPC_F_HASHED));
+	ep->peer_role = role;
+	memcpy(&ep->peer_id, id, sizeof(PCS_NODE_ID_T));
+	ep->flags |= PCS_RPC_F_CLNT_PEER_ID;
+}
+
+int pcs_rpc_set_address(struct pcs_rpc * ep, PCS_NET_ADDR_T * addr)
+{
+	BUG_ON(ep->state != PCS_RPC_UNCONN);
+
+	ep->addr = *addr;
+	return 0;
+}
+
+/* Reset rpc engine, move it to unconnected state ready for further connects. */
+void pcs_rpc_reset(struct pcs_rpc * ep)
+{
+	rpc_abort(ep, 1, PCS_ERR_NET_ABORT);
+	ep->retries = 0;
+	if (ep->state == PCS_RPC_ABORT)
+		ep->state = PCS_RPC_UNCONN;
+}
+
+static void timer_work(struct work_struct *w)
+{
+	struct pcs_rpc * ep = container_of(w, struct pcs_rpc, timer_work.work);
+
+	mutex_lock(&ep->mutex);
+	switch (ep->state) {
+	case PCS_RPC_HOLDDOWN:
+		ep->state = PCS_RPC_UNCONN;
+		pcs_rpc_connect(ep);
+		break;
+
+	case PCS_RPC_WORK: {
+		int err = list_empty(&ep->pending_queue) ? PCS_ERR_RESPONSE_TIMEOUT : PCS_ERR_WRITE_TIMEOUT;
+
+		TRACE("rpc timer expired, killing connection to " PEER_FMT ", %d",
+		      PEER_ARGS(ep), err);
+		rpc_abort(ep, 0, err);
+		break;
+	}
+	/* TODO CLEAN unused states */
+	case PCS_RPC_AUTHWAIT:
+	case PCS_RPC_AUTH:
+	case PCS_RPC_CONNECT:
+		BUG_ON(1);
+		break;
+	}
+	mutex_unlock(&ep->mutex);
+}
+
+static void connstat_work(struct work_struct *w)
+{
+	struct pcs_rpc_engine * eng = container_of(w, struct pcs_rpc_engine, stat_work.work);
+	struct pcs_cluster_core *cc = cc_from_rpc(eng);
+
+	pcs_log(LOG_INFO, "TODO send connstat-s\n");
+	(void)eng;
+	/* account_connstat(eng); */
+	mod_delayed_work(cc->wq, &eng->stat_work, PCS_MSG_MAX_CALENDAR * HZ);
+}
+
+
+void pcs_rpc_engine_init(struct pcs_rpc_engine * eng, u8 role)
+{
+	int i;
+	memset(eng, 0, sizeof(*eng));
+	eng->role = role;
+	for (i = 0; i < RPC_GC_MAX_CLASS; i++)
+		list_lru_init(&eng->gc[i].lru);
+
+	INIT_DELAYED_WORK(&eng->stat_work, connstat_work);
+
+}
+
+void pcs_rpc_engine_fini(struct pcs_rpc_engine * eng)
+{
+	unsigned int i;
+
+	for (i = 0; i < PCS_RPC_HASH_SIZE; i++) {
+		while (!hlist_empty(&eng->ht[i])) {
+			struct pcs_rpc * ep = hlist_entry(eng->ht[i].first, struct pcs_rpc, link);
+
+			pcs_rpc_close(ep);
+		}
+	}
+
+	while (!hlist_empty(&eng->unhashed)) {
+		struct pcs_rpc * ep = hlist_entry(eng->unhashed.first, struct pcs_rpc, link);
+
+		pcs_rpc_close(ep);
+	}
+
+	for (i = 0; i < RPC_GC_MAX_CLASS; i++) {
+		BUG_ON(list_lru_count(&eng->gc[i].lru));
+		list_lru_destroy(&eng->gc[i].lru);
+	}
+}
+
+void pcs_rpc_set_host_id(struct pcs_rpc_engine *eng, PCS_NODE_ID_T *host_id)
+{
+	eng->my_host.host_id.val = host_id->val;
+	eng->flags |= PCS_KNOWN_HOSTID;
+}
+
+void pcs_rpc_set_cluster_id(struct pcs_rpc_engine * eng, PCS_CLUSTER_ID_T * id)
+{
+	memcpy(&eng->cluster_id, id, sizeof(*id));
+	eng->flags |= PCS_KNOWN_CLUSTERID;
+}
+
+void pcs_rpc_set_location(struct pcs_rpc_engine * eng, struct pcs_location * loc)
+{
+	memcpy(&eng->my_host.location, loc, sizeof(*loc));
+}
+
+static int rpc_gc_classify(struct pcs_rpc * ep)
+{
+	BUG_ON(ep->eng->role != PCS_NODE_ROLE_TOOL);
+
+	return 0;
+}
+
+void pcs_rpc_init_gc(struct pcs_rpc_engine * eng, unsigned int limit)
+{
+	eng->max_connections = limit;
+
+	switch (eng->role) {
+	case PCS_NODE_ROLE_MDS:
+		eng->max_gc_index = 3;
+		break;
+	case PCS_NODE_ROLE_CS:
+		eng->max_gc_index = 4;
+		break;
+	case PCS_NODE_ROLE_CN:
+		eng->max_gc_index = 2;
+		break;
+	default:
+		eng->max_gc_index = 1;
+	}
+}
+
+
+void pcs_rpc_set_memlimits(struct pcs_rpc_engine * eng, u64 thresh, u64 limit)
+{
+	eng->mem_pressure_thresh = thresh;
+	eng->mem_limit = limit;
+}
+
+void rpc_connect_done(struct pcs_rpc *ep, struct socket *sock)
+{
+	struct pcs_sockio * sio;
+
+	mutex_lock(&ep->mutex);
+
+	TRACE(PEER_FMT " ->state:%d sock:%p\n", PEER_ARGS(ep), ep->state, sock);
+	cancel_delayed_work(&ep->timer_work);
+	ep->retries++;
+
+	if (ep->state != PCS_RPC_CONNECT) {
+		pcs_log(LOG_ERR, "Invalid state: %u", ep->state);
+		BUG();
+	}
+
+	sio = pcs_sockio_init(sock, ep->params.alloc_hdr_size,
+			      sizeof(struct pcs_rpc_hdr));
+	if (sio == NULL)
+		BUG();
+
+	ep->conn = &sio->ioconn;
+	sio->parent = ep;
+	sio->get_msg = rpc_get_hdr;
+	sio->eof = rpc_eof_cb;
+	//pcs_ioconn_register(ep->conn);
+	ep->retries = 0;
+	if (ep->gc)
+		list_lru_add(&ep->gc->lru, &ep->lru_link);
+
+	if (ep->flags & PCS_RPC_F_CLNT_PEER_ID)
+		ep->flags |= PCS_RPC_F_PEER_ID;
+	ep->state = PCS_RPC_APPWAIT;
+	pcs_rpc_enable(ep, 0);
+	mutex_unlock(&ep->mutex);
+
+}
diff --git a/fs/fuse/kio/pcs/pcs_rpc.h b/fs/fuse/kio/pcs/pcs_rpc.h
new file mode 100644
index 000000000000..264657328c53
--- /dev/null
+++ b/fs/fuse/kio/pcs/pcs_rpc.h
@@ -0,0 +1,290 @@
+#ifndef _PCS_RPC_H_
+#define _PCS_RPC_H_ 1
+
+//#include "pcs_defs.h"
+#include "pcs_rpc_prot.h"
+#include "pcs_sock_io.h"
+
+struct pcs_msg;
+
+#define PCS_RPC_HASH_SIZE	1024
+
+enum
+{
+	PCS_RPC_UNCONN	= 0,		/* Not connected */
+	PCS_RPC_CONNECT	= 1,		/* Connect in progress */
+	PCS_RPC_AUTH	= 2,		/* Connected. Auth request sent. */
+	PCS_RPC_AUTHWAIT= 3,		/* Accepted. Waiting for auth request from peer. */
+	PCS_RPC_APPWAIT = 4,		/* Auth complete, client is notified */
+	PCS_RPC_WORK	= 5,		/* Established */
+	PCS_RPC_HOLDDOWN = 6,		/* Not connected. Connect must not be reinitiated. */
+	PCS_RPC_ABORT	= 7,		/* Aborted. Not reconnected automatically. */
+	PCS_RPC_DESTROY	= 8		/* Destruction in progress */
+};
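+
+/* Typical life cycle as implemented in pcs_rpc.c (roughly; AUTH/AUTHWAIT are kept
+ * for completeness but appear unused in this port):
+ *	UNCONN -> CONNECT -> APPWAIT -> WORK
+ * On errors WORK drops back to UNCONN and then HOLDDOWN before a reconnect attempt,
+ * or goes to ABORT (no automatic reconnect); pcs_rpc_close() finally moves the
+ * endpoint to DESTROY.
+ */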
+
+struct pcs_rpc_params
+{
+	unsigned int	alloc_hdr_size;
+	unsigned int	max_msg_size;
+
+	unsigned int	connect_timeout;
+	unsigned int	holddown_timeout;
+	unsigned int	response_timeout;
+
+	unsigned int	max_conn_retry;
+
+	unsigned int	flags;
+};
+
+#define MAX_BUILD_VERSION_LENGTH 30
+
+#define RPC_GC_MAX_CLASS 4
+
+struct rpc_gc_class
+{
+	struct list_lru		lru;
+};
+
+
+/* from: cluster_id.h */
+typedef union __pre_aligned(8) _PCS_CLUSTER_ID_T {
+	unsigned char uuid[16];		/* For now it is an opaque string */
+	u64	      val[2];
+} PCS_CLUSTER_ID_T __aligned(8);
+
+#define PCS_CLUSTER_ID_VALID(clid) ((clid).val[0] || (clid).val[1])
+/////////////////////////////
+
+#define PCS_RPC_CPU_SLICE (100 * HZ / 1000) /* 100ms */
+struct pcs_rpc
+{
+	struct hlist_node	link;		/* Link in hash table */
+	struct list_head	lru_link;	/* Link in LRU */
+	struct rpc_gc_class	*gc;
+	struct pcs_rpc_engine	*eng;		/* Reference to the engine this peer is assigned to */
+
+	void			*parent;
+
+	unsigned int		state;
+	unsigned int		flags;
+#define PCS_RPC_F_HASHED		1
+#define PCS_RPC_F_PASSIVE		2
+#define PCS_RPC_F_PEER_ID		4
+#define PCS_RPC_F_NO_RETRY		8
+#define PCS_RPC_F_DEAD			0x10
+#define PCS_RPC_F_LISTEN		0x20
+#define PCS_RPC_F_ACQ_ID		0x40
+#define PCS_RPC_F_PEER_VERIFIED		0x80
+#define PCS_RPC_F_CLNT_PEER_ID		0x100 /* peer id set by pcs_rpc_set_peer_id */
+#define PCS_RPC_F_ACCT			0x200
+#define PCS_RPC_F_LOCAL			0x400 /* local AF_UNIX connection */
+#define PCS_RPC_F_PEER_AUTHORIZED	0x800 /* peer authorized by secure method */
+#define PCS_RPC_F_LOCALAUTH		0x1000 /* skip authentication, it is provided by the transport */
+
+	struct pcs_rpc_params	params;
+
+	atomic_t		refcnt;
+	int			retries;
+	PCS_NODE_ID_T		peer_id;
+	u8			peer_role;
+	unsigned int		peer_flags;
+	u32			peer_version;
+	struct pcs_host_info	peer_host;
+	char			peer_build_version[MAX_BUILD_VERSION_LENGTH+1];
+	struct work_struct	work;
+	struct delayed_work	timer_work;
+	PCS_NET_ADDR_T		addr;
+/* TODO: Re-enable local sockets */
+#if 0
+	struct sockaddr_un *	sun;
+#endif
+	struct pcs_ioconn *	conn;		/* Active connection for the peer */
+
+	struct pcs_rpc_ops *	ops;
+
+	struct list_head	pending_queue;	/* Queue of requests sent to the peer */
+	struct list_head	state_queue;	/* Queue of requests waiting for proper peer state */
+
+	spinlock_t		q_lock;		/* Protects the queue lists below */
+	struct list_head	input_queue;	/* Queue of requests waiting to be handled */
+	int			cpu;
+	unsigned long		cpu_stamp;
+
+	struct mutex		mutex;
+	u64			accounted;
+	u32			netlat_min;
+	u32			netlat_max;
+	atomic_t		netlat_cnt;
+	atomic64_t		netlat_avg;
+
+	struct delayed_work	calendar_work;
+	unsigned		kill_arrow;
+#define RPC_MAX_CALENDAR	PCS_MSG_MAX_CALENDAR
+	struct hlist_head	kill_calendar[RPC_MAX_CALENDAR];
+
+	void *			private;
+
+	void *			private2;
+};
+
+struct pcs_rpc_engine
+{
+	struct hlist_head	ht[PCS_RPC_HASH_SIZE];
+	struct hlist_head	unhashed;
+	unsigned int		nrpcs;
+
+	PCS_CLUSTER_ID_T	cluster_id;
+	PCS_NODE_ID_T		local_id;
+	unsigned int		flags;
+#define PCS_KNOWN_MYID		1
+#define PCS_KNOWN_CLUSTERID	2
+#define PCS_KNOWN_HOSTID	4
+	u8			role;
+	struct pcs_host_info	my_host;
+
+	atomic64_t		xid_generator;		/* Current XID */
+	int			msg_count;
+	int			accounted_rpcs;
+	u64			msg_allocated;
+
+	u64			mem_pressure_thresh;
+	u64			mem_limit;
+
+	int			local_sndbuf;
+	int			tcp_sndbuf;
+	int			tcp_rcvbuf;
+	struct delayed_work	stat_work;
+	int			max_connections;
+	int			max_gc_index;
+	struct rpc_gc_class	gc[RPC_GC_MAX_CLASS];
+
+};
+
+struct pcs_rpc_ops
+{
+	/* Called on each incoming request to process msg */
+	int			(*demux_request)(struct pcs_rpc *, struct pcs_msg * msg);
+
+	/* Called on receiving response before done callback */
+	void			(*hook_response)(struct pcs_rpc *, struct pcs_msg * msg);
+
+	/* Called after rpc header is received to allocate msg */
+	struct pcs_msg *	(*get_hdr)(struct pcs_rpc *, struct pcs_rpc_hdr * h);
+
+	/* Called when rpc enters ABORT state due to peer abort */
+	void			(*state_change)(struct pcs_rpc *, int error);
+
+	void			(*connect)(struct pcs_rpc *);
+
+	/* Incoming connection was aborted */
+	void			(*client_aborted)(struct pcs_rpc *ep, int error);
+
+	/* Called when peer asks to keep waiting on a request */
+	void			(*keep_waiting)(struct pcs_rpc *, struct pcs_msg * req, struct pcs_msg * msg);
+
+	/* Submit connection statistics */
+	void			(*send_stats)(struct pcs_rpc_engine *, struct pcs_msg * msg);
+};
+
+
+static inline struct pcs_rpc * pcs_rpc_get(struct pcs_rpc * p)
+{
+	BUG_ON(atomic_read(&p->refcnt) <=0);
+	atomic_inc(&p->refcnt);
+	return p;
+}
+
+void pcs_rpc_destroy(struct pcs_rpc * p);
+
+static inline void pcs_rpc_put(struct pcs_rpc * p)
+{
+	BUG_ON(atomic_read(&p->refcnt) <=0);
+	if (atomic_dec_and_test(&p->refcnt))
+		pcs_rpc_destroy(p);
+}
+
+/* Function provided by rpc engine */
+void pcs_rpc_engine_init(struct pcs_rpc_engine * eng, u8 role);
+void pcs_rpc_engine_fini(struct pcs_rpc_engine * eng);
+void pcs_rpc_init_gc(struct pcs_rpc_engine * eng, unsigned int limit);
+void pcs_rpc_get_new_xid(struct pcs_rpc_engine *eng, PCS_XID_T *xid);
+
+void pcs_rpc_set_cluster_id(struct pcs_rpc_engine * eng, PCS_CLUSTER_ID_T * id);
+void pcs_rpc_set_host_id(struct pcs_rpc_engine *eng, PCS_NODE_ID_T *host_id);
+
+/* Main set of functions */
+struct pcs_rpc * pcs_rpc_alloc_ep(void);
+void pcs_rpc_attach_new_ep(struct pcs_rpc * ep, struct pcs_rpc_engine * eng);
+void pcs_rpc_configure_new_ep(struct pcs_rpc * ep, struct pcs_rpc_params *parm,
+				struct pcs_rpc_ops * ops);
+/* All 3 above in one call */
+struct pcs_rpc * pcs_rpc_create(struct pcs_rpc_engine * eng, struct pcs_rpc_params *parm,
+				struct pcs_rpc_ops * ops);
+void pcs_rpc_close(struct pcs_rpc * ep);
+void pcs_rpc_reset(struct pcs_rpc * ep);
+
+int pcs_rpc_listen_ext(struct pcs_rpc * ep, PCS_NET_ADDR_T * addr, int flags);
+static inline int pcs_rpc_listen(struct pcs_rpc * ep, PCS_NET_ADDR_T * addr)
+{
+	return pcs_rpc_listen_ext(ep, addr, 0);
+}
+
+int pcs_rpc_listen_local(struct pcs_rpc * ep, const char *path, int noauth);
+void pcs_rpc_queue(struct pcs_rpc * ep, struct pcs_msg * msg);
+void pcs_rpc_kick_queue(struct pcs_rpc * ep);
+void pcs_rpc_respond(struct pcs_rpc * ep, struct pcs_msg * msg);
+void pcs_rpc_call(struct pcs_rpc * ep, struct pcs_msg * msg);
+void pcs_rpc_connect(struct pcs_rpc * ep);
+void pcs_rpc_cancel_request(struct pcs_msg * msg);
+void pcs_msg_del_calendar(struct pcs_msg * msg);
+
+/* Setting/getting parameters */
+void pcs_rpc_set_peer_id(struct pcs_rpc * ep, PCS_NODE_ID_T * id, u8 role);
+int pcs_rpc_set_address(struct pcs_rpc * ep, PCS_NET_ADDR_T * addr);
+
+int pcs_rpc_set_local(struct pcs_rpc * ep, const char *path, int noauth);
+int pcs_rpc_get_local_addr(struct pcs_rpc * ep, PCS_NET_ADDR_T * addr);
+
+/* Service functions, which are supposed to be used from callbacks */
+void pcs_rpc_sent(struct pcs_msg * msg);
+struct pcs_msg * pcs_rpc_lookup_xid(struct pcs_rpc * ep, PCS_XID_T * xid);
+void rpc_work_input(struct pcs_msg * msg);
+
+void pcs_rpc_error_respond(struct pcs_rpc * ep, struct pcs_msg * msg, int err);
+void rpc_abort(struct pcs_rpc * ep, int fatal, int error);
+/* Message allocation/initialization */
+struct pcs_msg * pcs_alloc_response(struct pcs_rpc_hdr * req_hdr, int size);
+struct pcs_msg * pcs_alloc_aligned_response(struct pcs_rpc_hdr * req_hdr, int size, int hdrlen);
+struct pcs_msg * pcs_rpc_alloc_error_response(struct pcs_rpc * ep, struct pcs_rpc_hdr * req_hdr, int err, int size);
+struct pcs_msg * pcs_rpc_alloc_input_msg(struct pcs_rpc * ep, int datalen);
+struct pcs_msg * pcs_rpc_alloc_aligned_msg(struct pcs_rpc * ep, int datalen, int hdrlen);
+struct pcs_msg * pcs_rpc_alloc_output_msg(int datalen);
+struct pcs_msg * pcs_rpc_clone_msg(struct pcs_msg * msg);
+void pcs_rpc_deaccount_msg(struct pcs_msg * msg);
+void pcs_rpc_init_input_msg(struct pcs_rpc * ep, struct pcs_msg * msg, int account);
+void pcs_rpc_init_output_msg(struct pcs_msg * msg);
+void pcs_rpc_init_response(struct pcs_msg * msg, struct pcs_rpc_hdr * req_hdr, int size);
+
+/* Allocate message and initialize header */
+struct pcs_msg * pcs_rpc_alloc_msg_w_hdr(int type, int size);
+
+void pcs_rpc_set_memlimits(struct pcs_rpc_engine * eng, u64 thresh, u64 limit);
+void pcs_rpc_account_adjust(struct pcs_msg * msg, int adjustment);
+
+struct pcs_perf_counter;
+void perfcnt_collect_rpc(char ** ptr, int * max_size, struct pcs_rpc_engine const*);
+
+int pcs_is_zero_cluster_id(PCS_CLUSTER_ID_T *id);
+int pcs_cluster_id_eq(PCS_CLUSTER_ID_T *id1, PCS_CLUSTER_ID_T *id2);
+
+void rpc_trace_health(struct pcs_rpc * ep);
+void pcs_rpc_enumerate_rpc(struct pcs_rpc_engine *eng, void (*cb)(struct pcs_rpc *ep, void *arg), void *arg);
+void pcs_rpc_set_sock(struct pcs_rpc *ep, struct pcs_sockio * sio);
+void rpc_connect_done(struct pcs_rpc *ep, struct socket *sock);
+
+static inline struct pcs_rpc *pcs_rpc_from_work(struct work_struct *wr)
+{
+	return container_of(wr, struct pcs_rpc, work);
+}
+
+#endif /* _PCS_RPC_H_ */
diff --git a/fs/fuse/kio/pcs/pcs_rpc_prot.h b/fs/fuse/kio/pcs/pcs_rpc_prot.h
new file mode 100644
index 000000000000..594670e9ead6
--- /dev/null
+++ b/fs/fuse/kio/pcs/pcs_rpc_prot.h
@@ -0,0 +1,97 @@
+#ifndef _PCS_NET_PROT_H_
+#define _PCS_NET_PROT_H_ 1
+
+#include "pcs_prot_types.h"
+
+/* Current version of the protocol. We promise to support all the messages forever,
+ * so that no version checks are required. However, we must not send new messages
+ * to old peers; that is where this version is required.
+ */
+#define PCS_VERSION_CURRENT	1U
+
+struct pcs_rpc_hdr
+{
+	u32		len;
+	u32		type;
+	PCS_XID_T	xid;
+} __attribute__((aligned(8)));
+
+#define PCS_RPC_DIRECTION	1
+
+#define RPC_IS_RESPONSE(type) (type & PCS_RPC_DIRECTION)
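+
+/* The low bit of the type encodes direction: pcs_rpc_init_response() builds a reply
+ * header as req_hdr->type | PCS_RPC_DIRECTION, and handle_response() matches request
+ * and response types while ignoring this bit.
+ */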
+
+
+#define PCS_RPC_ERROR_RESP	1
+
+struct pcs_rpc_payload
+{
+	u32	len;
+	u32	type;
+	/* Variable size data follows */
+} __attribute__((aligned(8)));
+
+
+struct pcs_rpc_error_resp
+{
+	struct pcs_rpc_hdr	hdr;
+	PCS_NODE_ID_T		offender;
+	u32			code;
+	u32			npayloads;
+	struct pcs_rpc_payload	payload;
+} __attribute__((aligned(8)));
+
+
+#define PCS_RPC_CS_CLIENT_BASE	256
+#define PCS_RPC_MDS_CLIENT_BASE	512
+#define PCS_RPC_CS_CS_BASE	1024
+#define PCS_RPC_LOCAL_BASE	2048
+
+/* Payload types */
+#define PCS_RPC_EMPTY_PAYLOAD		0
+
+/* Authentication payload types */
+#define PCS_RPC_AUTH_TYPE_PAYLOAD	11
+#define PCS_RPC_SSL_PAYLOAD		12
+#define PCS_RPC_DIGEST_PAYLOAD		13
+#define PCS_RPC_AUTH_SIMPLE_PAYLOAD	14
+
+/* System payload types */
+#define PCS_RPC_SYS_PAYLOAD_BASE	128
+#define PCS_RPC_BUILD_VERSION_PAYLOAD	PCS_RPC_SYS_PAYLOAD_BASE
+
+/* Application specific payload types */
+#define PCS_RPC_APP_PAYLOAD_BASE	512
+
+/* Node role */
+enum
+{
+	PCS_NODE_ROLE_TEST	= 0,			/* Can be used for diagnostics. Functionality is reduced. */
+	PCS_NODE_ROLE_CN	= 1,			/* Client */
+	PCS_NODE_ROLE_CS	= 2,			/* Chunk server */
+	PCS_NODE_ROLE_MDS	= 3,			/* Meta-data server */
+	PCS_NODE_ROLE_TOOL	= 4,			/* Similar to the client but not visible in stat */
+	PCS_NODE_ROLE_SVC	= 5,			/* Generic service */
+	PCS_NODE_ROLES_
+};
+
+static inline const char *pcs_role_to_str(u8 role)
+{
+	static const char *roles_str[PCS_NODE_ROLES_] = {
+		"TEST", "CN", "CS", "MDS", "TOOL", "SVC"
+	};
+
+	if (role >= PCS_NODE_ROLES_)
+		return "Unknown";
+	return roles_str[role];
+}
+
+struct pcs_rpc_keep_waiting
+{
+	struct pcs_rpc_hdr	hdr;
+
+	PCS_XID_T		xid;	/* XID of the request which should not time out */
+} __attribute__((aligned(8)));
+
+#define PCS_RPC_KEEP_WAITING	(12)
+
+#endif /* _PCS_RPC_PROT_H_ */
diff --git a/fs/fuse/kio/pcs/pcs_sock_io.c b/fs/fuse/kio/pcs/pcs_sock_io.c
new file mode 100644
index 000000000000..6936dede5b96
--- /dev/null
+++ b/fs/fuse/kio/pcs/pcs_sock_io.c
@@ -0,0 +1,702 @@
+#include <net/sock.h>
+#include <net/tcp.h>
+#include <linux/net.h>
+#include <linux/kthread.h>
+#include <linux/types.h>
+#include <linux/highmem.h>
+
+#include "pcs_types.h"
+#include "pcs_sock_io.h"
+#include "pcs_rpc.h"
+#include "log.h"
+
+
+static inline struct pcs_rpc * sock_to_rpc(struct sock *sk)
+{
+
+	return ((struct pcs_sockio *)sk->sk_user_data)->parent;
+}
+
+static void sio_msg_sent(struct pcs_msg * msg)
+{
+	msg->stage = PCS_MSG_STAGE_SENT;
+	if (msg->timeout) {
+		BUG_ON(msg->rpc == NULL);
+		BUG_ON(msg->kill_slot >= PCS_MSG_MAX_CALENDAR);
+		pcs_msg_del_calendar(msg);
+	}
+}
+
+void sio_push(struct pcs_sockio * sio)
+{
+	struct pcs_rpc *ep = sio->parent;
+
+	TRACE(PEER_FMT" flush \n", PEER_ARGS(ep));
+	if (sio->flags & PCS_SOCK_F_CORK) {
+		int optval = 1;
+		int ret;
+		ret = kernel_setsockopt(sio->ioconn.socket, SOL_TCP, TCP_NODELAY,
+					(char *)&optval, sizeof(optval));
+		if (ret)
+			TRACE("kernel_setsockopt(TCP_NODELAY) failed: %d",  ret);
+
+	}
+}
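+
+/* The flush in sio_push() above relies on setting TCP_NODELAY, which makes the kernel
+ * push out any data it was still delaying; note that the option is never switched back
+ * off here, so after the first flush the socket effectively stays in no-delay mode.
+ */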
+
+//// TODO:dmonakhov@ implement unregister and close;
+//// socket close must be synchronized with userspace. THINK:
+//// case A: userspace closes the socket and waits for kernelspace
+//// case B: kernelspace wants to close the socket and has to somehow
+////	     notify userspace about this (NEW API REQUIRED)
+static void pcs_restore_sockets(struct pcs_ioconn *ioconn);
+void pcs_ioconn_unregister(struct pcs_ioconn *ioconn)
+{
+	if (!test_bit(PCS_IOCONN_BF_DEAD, &ioconn->flags)) {
+		set_bit(PCS_IOCONN_BF_DEAD, &ioconn->flags);
+		pcs_restore_sockets(ioconn);
+	}
+
+}
+
+void pcs_ioconn_close(struct pcs_ioconn *ioconn)
+{
+	kernel_sock_shutdown(ioconn->socket, SHUT_RDWR);
+}
+
+void sio_abort(struct pcs_sockio * sio, int error)
+{
+	if (sio->current_msg) {
+		pcs_free_msg(sio->current_msg);
+		sio->current_msg = NULL;
+	}
+
+	sio->flags &= ~(PCS_SOCK_F_POOLOUT|PCS_SOCK_F_POOLIN);
+	while (!list_empty(&sio->write_queue)) {
+		struct pcs_msg * msg = list_first_entry(&sio->write_queue, struct pcs_msg, list);
+		list_del(&msg->list);
+		sio->write_queue_len -= msg->size;
+		sio_msg_sent(msg);
+
+		pcs_set_local_error(&msg->error, error);
+		BUG_ON(!hlist_unhashed(&msg->kill_link));
+		msg->done(msg);
+	}
+	pcs_ioconn_unregister(&sio->ioconn);
+	pcs_ioconn_close(&sio->ioconn);
+	pcs_set_local_error(&sio->error, error);
+	if (sio->eof) {
+		void (*eof)(struct pcs_sockio *) = sio->eof;
+		sio->eof = NULL;
+		(*eof)(sio);
+	}
+}
+
+
+void pcs_sock_abort(struct pcs_sockio * sio)
+{
+	if (!sio)
+		return;
+
+	sio_abort(sio, PCS_ERR_NET_ABORT);
+}
+
+void pcs_sock_error(struct pcs_sockio * sio, int error)
+{
+	sio_abort(sio, error);
+}
+
+static int do_send_one_seg(struct socket *sock, struct iov_iter *it, bool more)
+{
+	int ret;
+	size_t offset, len;
+	struct page *page;
+	int flags = (MSG_DONTWAIT | MSG_NOSIGNAL) | (more ? MSG_MORE : MSG_EOR);
+
+	DTRACE("sock(%p)  len:%ld, more:%d\n", sock, iov_iter_count(it), more);
+
+	page = iov_iter_get_page(it, &offset, &len);
+	if (!page) {
+		/* No page, fall back to memcpy */
+		struct msghdr msg = { .msg_flags = flags};
+		struct page *page;
+		struct kvec vec;
+
+		page = iov_iter_kmap(it, &vec.iov_base, &vec.iov_len);
+		ret = kernel_sendmsg(sock, &msg, &vec, 1, vec.iov_len);
+		if (page)
+			kunmap(page);
+	} else {
+		/* Zerocopy */
+		ret = kernel_sendpage(sock, page, offset, len, flags);
+		put_page(page);
+	}
+
+	DTRACE("sock(%p) len:%ld, more:%d ret:%d\n", sock, iov_iter_count(it), more, ret);
+	return ret;
+}
+
+static int do_sock_recv(struct socket *sock, void *buf, size_t len)
+{
+
+	struct kvec iov = {buf, len};
+	struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
+	int ret;
+
+	ret =  kernel_recvmsg(sock, &msg, &iov, 1, len, msg.msg_flags);
+
+	TRACE("RET: "PEER_FMT" len:%ld ret:%d\n", PEER_ARGS(sock_to_rpc(sock->sk)),
+	      len, ret);
+	return ret;
+}
+
+static void pcs_sockio_recv(struct pcs_sockio *sio)
+{
+	struct pcs_ioconn* conn = &sio->ioconn;
+	struct iov_iter *it = &sio->read_iter;
+	struct pcs_rpc *ep = sio->parent;
+	int count = 0;
+	unsigned long loop_timeout = jiffies + PCS_SIO_SLICE;
+
+	(void)ep;
+	TRACE("ENTER:" PEER_FMT " sio:%p cur_msg:%p\n", PEER_ARGS(ep), sio, sio->current_msg);
+
+	while(!test_bit(PCS_IOCONN_BF_DEAD, &conn->flags)) {
+		int n;
+		struct pcs_msg * msg;
+
+		if (test_bit(PCS_IOCONN_BF_ERROR, &conn->flags)) {
+			sio_abort(sio, PCS_ERR_NET_ABORT);
+			return;
+		}
+		if (!sio->current_msg) {
+			/* New message */
+
+			int copy = (int)(sio->hdr_max - sio->hdr_ptr);
+
+			sio->read_offset = 0;
+			n = 0;
+
+			if (copy)
+				n = do_sock_recv(conn->socket, (char *)sio_inline_buffer(sio) + sio->hdr_ptr, copy);
+
+			if (n > 0 || n == copy /* recv returns 0 when copy is 0 */) {
+				sio->hdr_ptr += n;
+				if(sio->hdr_ptr != sio->hdr_max)
+					return;
+
+				msg = sio->get_msg(sio);
+				if (msg == NULL) {
+					if (sio->hdr_ptr < sio->hdr_max)
+						continue;
+					if (sio->flags & PCS_SOCK_F_THROTTLE)
+						continue;
+					sio_abort(sio, PCS_ERR_NOMEM);
+					return;
+				}
+				sio->read_offset = sio->hdr_ptr;
+				sio->hdr_ptr = 0;
+				sio->current_msg = msg;
+				msg->get_iter(msg, sio->read_offset, it);
+				TRACE(PEER_FMT" msg:%p read_off:%d iov_size:%ld\n", PEER_ARGS(ep), msg, sio->read_offset,
+				      iov_iter_count(it));
+			} else {
+				if (n == -EAGAIN || n == 0)
+					return;
+
+				sio_abort(sio, PCS_ERR_NET_ABORT);
+				return;
+			}
+		} else { /* Continue receiving the message */
+			msg = sio->current_msg;
+
+			while (sio->read_offset < msg->size) {
+				void *buf;
+				size_t len;
+				struct page *page;
+
+				if (!iov_iter_count(it))
+					/* Current iter is exhausted, init new one */
+					msg->get_iter(msg, sio->read_offset, it);
+
+				TRACE(PEER_FMT" msg:%p->size:%d off:%d it_count:%ld\n",
+				      PEER_ARGS(ep), msg, msg->size, sio->read_offset,
+				      iov_iter_count(it));
+
+				BUG_ON(iov_iter_count(it) > msg->size - sio->read_offset);
+
+				page = iov_iter_kmap(it, &buf, &len);
+				if (len > msg->size - sio->read_offset)
+					len = msg->size - sio->read_offset;
+				n = do_sock_recv(conn->socket, buf, len);
+				if (page)
+					kunmap(page);
+
+				if (n > 0) {
+					sio->read_offset += n;
+					iov_iter_advance(it, n);
+				} else {
+					if (n == -EAGAIN || n == 0)
+						return;
+					sio_abort(sio, PCS_ERR_NET_ABORT);
+					return;
+				}
+			}
+			sio->current_msg = NULL;
+			iov_iter_init_bad(&sio->read_iter);
+			msg->done(msg);
+			if (++count >= PCS_SIO_PREEMPT_LIMIT ||
+			    time_is_before_jiffies(loop_timeout)) {
+				sio->flags |= PCS_SOCK_F_POOLIN;
+				break;
+			}
+		}
+	}
+	if (count && !list_empty(&ep->lru_link) && ep->gc)
+		list_lru_add(&ep->gc->lru, &ep->lru_link);
+
+}
+
+static void pcs_sockio_send(struct pcs_sockio *sio)
+{
+	struct pcs_ioconn* conn = &sio->ioconn;
+	struct iov_iter *it = &sio->write_iter;
+	unsigned long loop_timeout = jiffies + PCS_SIO_SLICE;
+	struct pcs_msg * msg;
+	int done = 0;
+	int count = 0;
+	struct pcs_rpc *ep = sio->parent;
+	(void)ep;
+
+	while (!list_empty(&sio->write_queue)) {
+		msg = list_first_entry(&sio->write_queue, struct pcs_msg, list);
+
+		TRACE(PEER_FMT" sio(%p) offset:%d msg:%p\n", PEER_ARGS(ep), sio, sio->write_offset, msg);
+
+		/* This is the original check, but it is not clear how the connection can become
+		   dead before sio_abort() is called. Let's simplify it with a BUG_ON.
+		if (conn->dead) {
+			pcs_set_local_error(&msg->error, PCS_ERR_NET_ABORT);
+			goto done;
+		}
+		*/
+		BUG_ON(test_bit(PCS_IOCONN_BF_DEAD, &conn->flags));
+
+		if (test_bit(PCS_IOCONN_BF_ERROR, &conn->flags)) {
+			sio_abort(sio, PCS_ERR_NET_ABORT);
+			return;
+		}
+
+		/* TODO: cond resched here? */
+		while (sio->write_offset < msg->size) {
+			size_t left = msg->size - sio->write_offset;
+			int n;
+
+			TRACE(PEER_FMT "offset:%d msg:%p left:%ld, it->len:%ld\n", PEER_ARGS(ep), sio->write_offset, msg,
+			      left, iov_iter_count(it));
+
+			if (!iov_iter_count(it)) {
+				/* Current iter is exhausted, init new one */
+				msg->get_iter(msg, sio->write_offset, it);
+			}
+			BUG_ON(iov_iter_count(it) > left);
+			n = do_send_one_seg(conn->socket, it, iov_iter_single_seg_count(it) < left);
+			if (n > 0) {
+				sio->write_offset += n;
+				iov_iter_advance(it, n);
+				done = 1;
+			} else {
+				if (n == 0)
+					WARN_ON(1);
+
+				if (n == -EAGAIN) {
+					/* Not fatal: abort only if the send timeout has expired,
+					 * otherwise wait for the socket to become writable again.
+					 */
+					unsigned long timeout = msg->start_time + sio->send_timeout;
+					if (time_is_before_jiffies(timeout))
+						sio_abort(sio, PCS_ERR_WRITE_TIMEOUT);
+					return;
+				}
+				sio_abort(sio, PCS_ERR_NET_ABORT);
+				return;
+			}
+		}
+		list_del_init(&msg->list);
+		sio->write_queue_len -= msg->size;
+
+		if (sio->write_queue_len == 0) {
+			if (sio->write_wakeup)
+				sio->write_wakeup(sio);
+		}
+		sio->write_offset = 0;
+		iov_iter_init_bad(it);
+		sio_msg_sent(msg);
+		msg->done(msg);
+		if (++count >= PCS_SIO_PREEMPT_LIMIT ||
+		    time_is_before_jiffies(loop_timeout)) {
+			sio->flags |= PCS_SOCK_F_POOLOUT;
+			break;
+		}
+	}
+	if (done)
+		sio_push(sio);
+}
+
+void pcs_sockio_xmit(struct pcs_sockio *sio)
+{
+	struct pcs_rpc *ep = sio->parent;
+
+	BUG_ON(!mutex_is_locked(&ep->mutex));
+
+	sio->flags &= ~(PCS_SOCK_F_POOLOUT|PCS_SOCK_F_POOLIN);
+	pcs_sockio_recv(sio);
+	pcs_sockio_send(sio);
+}
+
+int pcs_sockio_delayed_seg(struct pcs_sockio *sio)
+{
+	return sio->flags & (PCS_SOCK_F_POOLOUT|PCS_SOCK_F_POOLIN);
+}
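+
+/* The receive and send loops above yield after PCS_SIO_PREEMPT_LIMIT messages or
+ * PCS_SIO_SLICE jiffies and raise PCS_SOCK_F_POOLIN/POOLOUT; rpc_queue_work() sees
+ * this through pcs_sockio_delayed_seg() and restarts the pass, so a single xmit
+ * call stays bounded even on a busy socket.
+ */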
+
+void pcs_sock_sendmsg(struct pcs_sockio * sio, struct pcs_msg *msg)
+{
+	DTRACE("sio(%p) msg:%p\n", sio, msg);
+
+	if (pcs_if_error(&sio->error)) {
+		pcs_set_local_error(&msg->error, sio->error.value);
+		msg->done(msg);
+		return;
+	}
+	msg->sio = sio;
+
+	list_add_tail(&msg->list, &sio->write_queue);
+	sio->write_queue_len += msg->size;
+	msg->start_time = jiffies;
+	msg->stage = PCS_MSG_STAGE_SEND;
+
+	if (!(sio->flags & PCS_SOCK_F_POOLOUT))
+		sio->flags |= PCS_SOCK_F_POOLOUT;
+
+}
+
+/* Try to cancel sending a message. If that is impossible because the message is in the
+ * middle of a write, do nothing and return an error.
+ */
+int pcs_sock_cancel_msg(struct pcs_msg * msg)
+{
+	struct pcs_sockio * sio = msg->sio;
+
+	BUG_ON(msg->sio == NULL);
+
+	if (sio->write_offset && sio->write_queue.next == &msg->list)
+		return -EBUSY;
+
+	list_del_init(&msg->list);
+	sio->write_queue_len -= msg->size;
+	msg->stage = PCS_MSG_STAGE_SENT;
+
+	if (!sio->write_queue_len) {
+		if (sio->write_wakeup)
+			sio->write_wakeup(sio);
+	}
+
+	return 0;
+}
+
+int pcs_sock_queuelen(struct pcs_sockio * sio)
+{
+	return sio->write_queue_len;
+}
+
+static void pcs_restore_sockets(struct pcs_ioconn *ioconn)
+{
+
+	struct sock *sk;
+
+	sk = ioconn->socket->sk;
+
+	write_lock_bh(&sk->sk_callback_lock);
+	sk->sk_user_data =    ioconn->orig.user_data;
+	sk->sk_data_ready =   ioconn->orig.data_ready;
+	sk->sk_write_space =  ioconn->orig.write_space;
+	sk->sk_error_report = ioconn->orig.error_report;
+	//sock->sk->sk_state_change = pcs_state_chage;
+	write_unlock_bh(&sk->sk_callback_lock);
+
+	sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
+	sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
+}
+
+void pcs_sock_ioconn_destruct(struct pcs_ioconn *ioconn)
+{
+	struct pcs_sockio * sio = sio_from_ioconn(ioconn);
+
+	BUG_ON(sio->current_msg);
+	BUG_ON(!list_empty(&sio->write_queue));
+	BUG_ON(sio->write_queue_len);
+
+	pcs_ioconn_close(ioconn);
+
+	memset(sio, 0xFF, sizeof(*sio));
+	kfree(sio);
+}
+
+static void pcs_sk_data_ready(struct sock *sk, int count)
+{
+	struct pcs_sockio *sio = sk->sk_user_data;
+	struct pcs_rpc *ep = sio->parent;
+
+	TRACE(PEER_FMT" queue count:%d \n", PEER_ARGS(ep), count);
+
+	pcs_rpc_kick_queue(sio->parent);
+}
+
+static void pcs_sk_write_space(struct sock *sk)
+{
+	struct pcs_sockio *sio = sk->sk_user_data;
+	struct pcs_rpc *ep = sio->parent;
+
+	TRACE(PEER_FMT" queue \n", PEER_ARGS(ep));
+
+	pcs_rpc_kick_queue(sio->parent);
+
+}
+
+/* TODO: this callback does not look correct; sane locking/error handling is required */
+static void pcs_sk_error_report(struct sock *sk)
+{
+	struct pcs_sockio * sio = sio_from_ioconn(sk->sk_user_data);
+
+	if (test_bit(PCS_IOCONN_BF_DEAD, &sio->ioconn.flags) ||
+		test_bit(PCS_IOCONN_BF_ERROR, &sio->ioconn.flags))
+		return;
+
+	set_bit(PCS_IOCONN_BF_ERROR, &sio->ioconn.flags);
+	pcs_rpc_kick_queue(sio->parent);
+}
+
+struct pcs_sockio * pcs_sockio_init(struct socket *sock,
+				    int alloc_max, int hdr_max)
+{
+	struct pcs_sockio * sio;
+	struct sock *sk;
+
+	sio = kzalloc(sizeof(struct pcs_sockio) + alloc_max, GFP_NOIO);
+	if (!sio)
+		return NULL;
+
+	INIT_LIST_HEAD(&sio->write_queue);
+	sio->write_queue_len = 0;
+	sio->current_msg = NULL;
+	iov_iter_init_bad(&sio->read_iter);
+	iov_iter_init_bad(&sio->write_iter);
+	sio->read_offset = 0;
+	sio->write_offset = 0;
+	sio->hdr_max = hdr_max;
+	sio->hdr_ptr = 0;
+	sio->flags = PCS_SOCK_F_CORK;
+	sio->retrans = 0;
+
+	//// TODO:dmonakhov init ioconn here
+	INIT_LIST_HEAD(&sio->ioconn.list);
+	sk = sock->sk;
+	write_lock_bh(&sk->sk_callback_lock);
+
+	/* Back up the original callbacks */
+	sio->ioconn.orig.user_data = sk->sk_user_data;
+	sio->ioconn.orig.data_ready = sk->sk_data_ready;
+	sio->ioconn.orig.write_space = sk->sk_write_space;
+	sio->ioconn.orig.error_report = sk->sk_error_report;
+	//sio->ioconn.orig_state_change = sk->sk_state_change;
+
+	sk->sk_user_data = sio;
+	sk->sk_data_ready = pcs_sk_data_ready;
+	sk->sk_write_space = pcs_sk_write_space;
+	sk->sk_error_report = pcs_sk_error_report;
+	sk->sk_allocation = GFP_NOFS;
+
+	//sock->sk->sk_state_change = pcs_state_chage;
+
+	sk->sk_sndtimeo = PCS_SIO_TIMEOUT;
+	sio->send_timeout = PCS_SIO_TIMEOUT;
+	sio->ioconn.socket = sock;
+	sio->ioconn.destruct = pcs_sock_ioconn_destruct;
+	write_unlock_bh(&sock->sk->sk_callback_lock);
+
+	pcs_clear_error(&sio->error);
+	sio->get_msg = NULL;
+	sio->eof = NULL;
+	sio->write_wakeup = NULL;
+	return sio;
+}
+
+void pcs_sockio_start(struct pcs_sockio * sio)
+{
+	//// TODO: dmonakhov
+	////pcs_ioconn_register(&sio->ioconn);
+}
+
+static void pcs_deaccount_msg(struct pcs_msg * msg)
+{
+	msg->sio = NULL;
+}
+
+static void pcs_account_msg(struct pcs_sockio * sio, struct pcs_msg * msg)
+{
+	msg->sio = sio;
+
+}
+
+static void pcs_msg_input_destructor(struct pcs_msg * msg)
+{
+	pcs_deaccount_msg(msg);
+	memset(msg, 0xFF, sizeof(*msg));
+	kfree(msg);
+}
+
+/* get_iter() handler for messages with embedded payload right after pcs_msg */
+void pcs_get_iter_inline(struct pcs_msg * msg, int offset, struct iov_iter *it)
+{
+	BUG_ON(offset >= msg->size);
+
+	iov_iter_init_plain(it, msg->_inline_buffer, msg->size, 0);
+	iov_iter_advance(it, offset);
+}
+
+struct pcs_msg * pcs_alloc_input_msg(struct pcs_sockio * sio, int datalen)
+{
+	struct pcs_msg * msg;
+
+	msg = kmalloc(sizeof(struct pcs_msg) + datalen, GFP_NOIO);
+	if (msg) {
+
+		pcs_msg_io_init(msg);
+		pcs_account_msg(sio, msg);
+		msg->destructor = pcs_msg_input_destructor;
+		msg->get_iter = pcs_get_iter_inline;
+	}
+	return msg;
+}
+
+static void pcs_io_msg_output_destructor(struct pcs_msg * msg)
+{
+	BUG_ON(msg->rpc);
+	memset(msg, 0xFF, sizeof(*msg));
+	kfree(msg);
+}
+
+struct pcs_msg * pcs_alloc_output_msg(int datalen)
+{
+	struct pcs_msg * msg;
+
+	msg = kmalloc(sizeof(struct pcs_msg) + datalen, GFP_NOIO);
+	if (msg) {
+		pcs_msg_io_init(msg);
+		msg->rpc = NULL;
+		msg->sio = NULL;
+		msg->destructor = pcs_io_msg_output_destructor;
+		msg->get_iter = pcs_get_iter_inline;
+	}
+	return msg;
+}
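+
+/*
+ * A minimal usage sketch (hypothetical; the real callers live in the rpc
+ * and cs code): allocate the message, fill the inline payload, set the
+ * size and hand it to the socket writer.  The names len/payload/sio here
+ * are illustrative only; by default ->done() simply frees the message.
+ *
+ *	struct pcs_msg *msg = pcs_alloc_output_msg(len);
+ *	if (!msg)
+ *		return -ENOMEM;
+ *	msg->size = len;
+ *	memcpy(msg->_inline_buffer, payload, len);
+ *	pcs_sock_sendmsg(sio, msg);
+ */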
+
+void pcs_free_msg(struct pcs_msg * msg)
+{
+	pcs_msg_io_fini(msg);
+
+	if (msg->destructor)
+		msg->destructor(msg);
+}
+
+/* get_iter() handler for cloned messages */
+static void get_iter_clone(struct pcs_msg * msg, int offset, struct iov_iter *it)
+{
+	struct pcs_msg * parent = msg->private;
+
+	BUG_ON(offset >= msg->size);
+
+	parent->get_iter(parent, offset, it);
+}
+
+void pcs_clone_done(struct pcs_msg * msg)
+{
+	struct pcs_msg * parent = msg->private;
+
+	pcs_copy_error_cond(&parent->error, &msg->error);
+
+	pcs_msg_io_end(parent);
+
+	pcs_free_msg(msg);
+}
+
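+/*
+ * A clone carries no payload of its own: get_iter_clone() redirects into
+ * the parent message, and pcs_clone_done() propagates the clone's error to
+ * the parent and drops one io reference of the parent before freeing the
+ * clone.
+ */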
+struct pcs_msg * pcs_clone_msg(struct pcs_msg * msg)
+{
+	struct pcs_msg * clone;
+
+	clone = kmalloc(sizeof(struct pcs_msg), GFP_NOIO);
+	if (clone) {
+		pcs_msg_io_init(clone);
+		clone->rpc = NULL;
+		clone->size = msg->size;
+		clone->timeout = 0;
+		clone->done = pcs_clone_done;
+		clone->destructor = pcs_io_msg_output_destructor;
+		clone->private = msg;
+		clone->get_iter = get_iter_clone;
+	}
+	return clone;
+}
+
+/* get_iter() handler for copy-on-write cloned messages */
+static void get_iter_cow_clone(struct pcs_msg * msg, int offset, struct iov_iter *it)
+{
+	struct pcs_msg * parent = msg->private;
+
+	BUG_ON(offset >= msg->size);
+
+	if (offset < msg->_inline_len) {
+		iov_iter_init_plain(it, msg->_inline_buffer, msg->_inline_len, 0);
+		iov_iter_advance(it, offset);
+	} else {
+		parent->get_iter(parent, offset, it);
+	}
+}
+
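+/*
+ * Copy-on-write clone: the first @copy_len bytes of the parent's inline
+ * head are duplicated into the clone (presumably so a header can be
+ * rewritten without touching the parent), while the rest of the payload
+ * is still served from the parent via its get_iter().
+ */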
+struct pcs_msg * pcs_cow_msg(struct pcs_msg * msg, int copy_len)
+{
+	struct pcs_msg * clone;
+
+	clone = kmalloc(sizeof(struct pcs_msg) + copy_len, GFP_NOIO);
+	if (clone) {
+		pcs_msg_io_init(clone);
+		clone->rpc = NULL;
+		clone->size = msg->size;
+		clone->timeout = 0;
+		clone->done = pcs_clone_done;
+		clone->destructor = pcs_io_msg_output_destructor;
+		clone->private = msg;
+		BUG_ON(copy_len > SHRT_MAX);
+		clone->_inline_len = (short)copy_len;
+		memcpy(clone->_inline_buffer, msg_inline_head(msg), copy_len);
+		clone->get_iter = get_iter_cow_clone;
+	}
+	return clone;
+}
+
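+/*
+ * Throttling only toggles PCS_SOCK_F_THROTTLE; the receive path is
+ * expected to check the flag and stop pulling data from the socket while
+ * it is set.
+ */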
+void pcs_sock_throttle(struct pcs_sockio * sio)
+{
+	if ((sio->flags & PCS_SOCK_F_THROTTLE) ||
+	    test_bit(PCS_IOCONN_BF_DEAD, &sio->ioconn.flags))
+		return;
+
+	DTRACE("Throttle on socket %p rpc=%p", sio, sio->parent);
+	sio->flags |= PCS_SOCK_F_THROTTLE;
+}
+
+void pcs_sock_unthrottle(struct pcs_sockio * sio)
+{
+	if (!(sio->flags & PCS_SOCK_F_THROTTLE) ||
+	    test_bit(PCS_IOCONN_BF_DEAD, &sio->ioconn.flags))
+		return;
+
+	DTRACE("Unthrottle on socket %p rpc=%p", sio, sio->parent);
+	sio->flags &= ~PCS_SOCK_F_THROTTLE;
+	if ((sio->flags & PCS_SOCK_F_EOF))
+		return;
+}
diff --git a/fs/fuse/kio/pcs/pcs_sock_io.h b/fs/fuse/kio/pcs/pcs_sock_io.h
new file mode 100644
index 000000000000..c1dfd422b360
--- /dev/null
+++ b/fs/fuse/kio/pcs/pcs_sock_io.h
@@ -0,0 +1,236 @@
+#ifndef _PCS_SOCK_IO_H_
+#define _PCS_SOCK_IO_H_ 1
+
+#include <linux/net.h>
+
+#include "pcs_types.h"
+////#include "pcs_process.h"
+#include "pcs_error.h"
+#include "log.h"
+
+#define PCS_MSG_MAX_CALENDAR	64
+#define PCS_SIO_TIMEOUT		(60*HZ)
+
+#define PCS_SIO_PREEMPT_LIMIT	16
+#define PCS_SIO_SLICE (5 * HZ / 1000) /* 5ms */
+
+struct pcs_api_channel
+{
+	unsigned	sio_count;
+	unsigned	msg_count;
+};
+
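+/*
+ * pcs_msg is the common unit of transfer.  _iocount is a plain io
+ * reference count manipulated by the pcs_msg_io_*() helpers below;
+ * _inline_buffer holds the payload of messages allocated with
+ * pcs_alloc_input_msg()/pcs_alloc_output_msg().
+ */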
+__pre_packed struct pcs_msg
+{
+	struct __pre_aligned(16) {
+		struct list_head list;
+
+		pcs_error_t	error;
+		abs_time_t	start_time;
+
+		void		*private;
+		void		*private2;	/* Huh? Need to do something else here. */
+		struct pcs_msg	*response;	/* Consider removing. It can be done passing the second
+						 * argument to done();
+						 */
+		struct pcs_sockio *sio;
+		struct pcs_rpc	*rpc;
+
+		int		size;
+		int		_iocount;
+		unsigned short	timeout;
+		unsigned char	kill_slot;
+		unsigned char	stage;
+		abs_time_t	io_start_time;
+
+		struct hlist_node	kill_link;
+
+		void		(*get_iter)(struct pcs_msg *, int offset, struct iov_iter *it);
+
+		void		(*done)(struct pcs_msg *);
+		void		(*destructor)(struct pcs_msg *);
+		void		*pool;
+		struct iovec	_inline_iovec;
+		int		accounted;
+
+		short		_align_offset;
+		short		_inline_len;
+	} __aligned(16);
+	u64		__pad16_8;
+	char		_inline_buffer[0];
+} __packed;
+
+static inline void * pcs_msg_aligned_data(struct pcs_msg * msg, int offset)
+{
+	return (void*)((char *)msg + msg->_align_offset + offset);
+}
+
+enum
+{
+	PCS_MSG_STAGE_NONE	= 0,	/* Initial state */
+	PCS_MSG_STAGE_UNSENT	= 1,	/* Message queued somewhere before send */
+	PCS_MSG_STAGE_SEND	= 2,	/* Message queued on socket queue */
+	PCS_MSG_STAGE_SENT	= 3,	/* Message is sent */
+	PCS_MSG_STAGE_WAIT	= 4,	/* Message is waiting for response */
+	PCS_MSG_STAGE_DONE	= 5,	/* Response received */
+};
+
+enum
+{
+	PCS_SOCK_F_THROTTLE		= 1,
+	PCS_SOCK_F_CORK			= 2,
+	PCS_SOCK_F_DYNAMIC_SIZE		= 4,
+	PCS_SOCK_F_EOF			= 8,
+	PCS_SOCK_F_POOLIN		= 0x10,
+	PCS_SOCK_F_POOLOUT		= 0x20,
+};
+
+enum
+{
+	PCS_IOCONN_BF_DEAD		= 0,
+	PCS_IOCONN_BF_ERROR		= 1, /* Notify from ->sk_error_report */
+};
+
+struct pcs_ioconn {
+	struct list_head	list;
+	struct socket		*socket;
+
+	unsigned long		flags;		/* atomic bit ops */
+	/* Save original socket->sk callbacks */
+	struct {
+		void			*user_data;
+		void			(*state_change)(struct sock *sk);
+		void			(*error_report)(struct sock *sk);
+		void			(*data_ready)(struct sock *sk, int bytes);
+		void			(*write_space)(struct sock *sk);
+	} orig;
+	void			(*destruct)(struct pcs_ioconn *);
+};
+
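+/*
+ * Per-socket io state.  The embedded ioconn owns the struct socket and the
+ * saved callbacks, write_queue holds outgoing pcs_msg's under q_lock, and
+ * parent points to the owning pcs_rpc.  The get_msg/eof/write_wakeup hooks
+ * are left to the owner (the rpc layer) to set.
+ */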
+struct pcs_sockio
+{
+	struct pcs_ioconn	ioconn;
+
+	struct list_head	write_queue;
+	int			write_queue_len;
+	spinlock_t		q_lock;
+	void			*parent;
+
+	pcs_error_t		error;
+	int			send_timeout;
+	int			hdr_ptr;
+	int			hdr_max;
+	unsigned int		flags;
+	u32			retrans;
+
+	struct pcs_msg		*current_msg;
+	int			read_offset;
+	int			write_offset;
+	struct iov_iter		read_iter;
+	struct iov_iter		write_iter;
+	struct mutex		mutex;
+	struct pcs_msg *	(*get_msg)(struct pcs_sockio *);
+	/* The eof() handler may be called twice: once on graceful socket shutdown and once from sio_abort() */
+	void			(*eof)(struct pcs_sockio *);
+	void			(*write_wakeup)(struct pcs_sockio *);
+
+	char			_inline_buffer[0];
+};
+
+#define sio_from_ioconn(conn) container_of(conn, struct pcs_sockio, ioconn)
+
+struct pcs_sockio * pcs_sockio_init(struct socket* sock, int alloc_max, int hdr_max);
+void pcs_sockio_start(struct pcs_sockio * sio);
+void pcs_sock_sendmsg(struct pcs_sockio * sio, struct pcs_msg *msg);
+int pcs_sock_cancel_msg(struct pcs_msg * msg);
+void pcs_sockio_xmit(struct pcs_sockio *sio);
+int  pcs_sockio_delayed_seg(struct pcs_sockio *sio);
+int pcs_sock_queuelen(struct pcs_sockio * sio);
+void pcs_sock_abort(struct pcs_sockio * sio);
+void pcs_sock_error(struct pcs_sockio * sio, int error);
+
+void pcs_sock_throttle(struct pcs_sockio * sio);
+void pcs_sock_unthrottle(struct pcs_sockio * sio);
+
+struct pcs_msg * pcs_alloc_input_msg(struct pcs_sockio * sio, int datalen);
+struct pcs_msg * pcs_alloc_output_msg(int datalen);
+struct pcs_msg * pcs_clone_msg(struct pcs_msg * msg);
+struct pcs_msg * pcs_cow_msg(struct pcs_msg * msg, int data_len);
+void pcs_clone_done(struct pcs_msg * msg);
+void pcs_free_msg(struct pcs_msg * msg);
+void pcs_get_iter_inline(struct pcs_msg * msg, int offset, struct iov_iter *it);
+
+static inline void * msg_inline_head(struct pcs_msg * msg)
+{
+	struct iov_iter i;
+	void *map, *buf;
+	size_t len;
+
+	msg->get_iter(msg, 0, &i);
+	map = iov_iter_kmap_atomic(&i, &buf, &len);
+	/* the inline head is always in kernel memory */
+	BUG_ON(map);
+	BUG_ON(len > msg->size);
+
+	return buf;
+}
+
+static inline void * sio_inline_buffer(struct pcs_sockio * sio)
+{
+	return sio->_inline_buffer;
+}
+
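+/*
+ * _iocount lifecycle helpers: io_init() clears the counter and sets the
+ * default ->done(), io_start() begins io with a given completion callback,
+ * io_sched() takes an extra reference for a rescheduled io, io_end() drops
+ * a reference and calls ->done() when the last one goes away, io_fini()
+ * asserts that no io is in flight.
+ */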
+static inline void pcs_msg_io_init(struct pcs_msg * msg)
+{
+	pcs_clear_error(&msg->error);
+	msg->_iocount = 0;
+	msg->done = pcs_free_msg;
+}
+
+static inline void pcs_msg_io_start(struct pcs_msg * msg, void (*done)(struct pcs_msg *))
+{
+	BUG_ON(msg->_iocount != 0);
+	msg->_iocount = 1;
+	msg->done = done;
+}
+
+static inline struct pcs_msg * pcs_msg_io_sched(struct pcs_msg * msg)
+{
+	BUG_ON(msg->_iocount <= 0);
+	msg->_iocount++;
+	return msg;
+}
+
+static inline void pcs_msg_io_end(struct pcs_msg * msg)
+{
+	BUG_ON(msg->_iocount <= 0);
+	if (--msg->_iocount == 0)
+		msg->done(msg);
+}
+
+static inline void pcs_msg_io_fini(struct pcs_msg * msg)
+{
+	BUG_ON(msg->_iocount != 0);
+}
+
+struct bufqueue;
+
+/**
+   Present a portion of @bq as a pcs_msg that may be passed to pcs_sock_sendmsg().
+   Reading data from the pcs_msg will drain @bq.
+
+   \param @bq the buffer queue with the data of a message
+   \param @size the length of the head of @bq that will be presented as a pcs_msg
+   \returns a pcs_msg that reads data from @bq
+*/
+struct pcs_msg* bufqueue_as_pcs_output_msg(struct bufqueue *bq, u32 size);
+
+void pcs_ioconn_unregister(struct pcs_ioconn *ioconn);
+void pcs_ioconn_close(struct pcs_ioconn *ioconn);
+
+#endif /* _PCS_SOCK_IO_H_ */
diff --git a/fs/fuse/kio/pcs/pcs_timer.h b/fs/fuse/kio/pcs/pcs_timer.h
new file mode 100644
index 000000000000..f5ab4375ace1
--- /dev/null
+++ b/fs/fuse/kio/pcs/pcs_timer.h
@@ -0,0 +1,19 @@
+#ifndef _PCS_TIMER_H_
+#define _PCS_TIMER_H_ 1
+
+#include "pcs_types.h"
+
+abs_time_t get_real_time_ms(void);
+
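+/* Monotonic clock (ktime_get()), in microseconds. */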
+static inline abs_time_t get_abs_time_fast_us(void)
+{
+	return ktime_to_ns(ktime_get()) / NSEC_PER_USEC;
+}
+
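+/* Wall-clock time (ktime_get_real()), in microseconds. */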
+static inline abs_time_t get_abs_time_us(void)
+{
+	return ktime_to_ns(ktime_get_real()) / NSEC_PER_USEC;
+}
+
+#endif /* _PCS_TIMER_H_ */
diff --git a/fs/fuse/kio/pcs/pcs_types.h b/fs/fuse/kio/pcs/pcs_types.h
new file mode 100644
index 000000000000..f5c886e49619
--- /dev/null
+++ b/fs/fuse/kio/pcs/pcs_types.h
@@ -0,0 +1,38 @@
+#ifndef __PCS_TYPES_H__
+#define __PCS_TYPES_H__
+
+#include <linux/types.h>
+#include <linux/timer.h>
+
+typedef int pcs_fd_t;
+typedef int pcs_sock_t;
+typedef unsigned long ULONG_PTR;
+typedef unsigned long long abs_time_t;
+typedef struct timer_list pcs_timer_t;
+#define PCS_INVALID_FD (-1)
+#define PCS_API
+
+#include "pcs_align.h"
+
+typedef struct __pre_aligned(8) _PCS_NODE_ID_T {
+	u64    val;
+} PCS_NODE_ID_T __aligned(8);
+
+/* from: pcs_net_addr.h */
+enum
+{
+	PCS_ADDRTYPE_NONE = 0,
+	PCS_ADDRTYPE_IP = 1,
+	PCS_ADDRTYPE_IP6 = 2,
+	PCS_ADDRTYPE_UNIX = 3,
+};
+
+/* alignment makes it usable in binary protocols */
+typedef struct __pre_aligned(8) _PCS_NET_ADDR_T {
+	u32	type;
+	u32	port;			/* network byteorder! */
+	u8	address[16];
+} PCS_NET_ADDR_T __aligned(8);
+
+#endif /* __PCS_TYPES_H__ */

