[Devel] [PATCH RHEL7 COMMIT] ms/kcmp: add KCMP_EPOLL_TFD mode to compare epoll target files

Konstantin Khorenko khorenko at virtuozzo.com
Wed May 31 06:25:57 PDT 2017


The commit is pushed to "branch-rh7-3.10.0-514.16.1.vz7.32.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-514.16.1.vz7.32.5
------>
commit fb23849f5380ab23997f11233ee2e1a1d2efe868
Author: Cyrill Gorcunov <gorcunov at virtuozzo.com>
Date:   Wed May 31 17:25:57 2017 +0400

    ms/kcmp: add KCMP_EPOLL_TFD mode to compare epoll target files
    
    Patchset description:
    kcmp: Add KCMP_EPOLL_TFD mode
    
    This series adds a new KCMP_EPOLL_TFD mode to the kcmp() syscall.
    It is needed to distinguish epoll target files with reopened file
    descriptors. CRIU support is currently in draft but will be posted
    to the mailing list soon.
    
    ====================
    This patch description:
    
    With the current epoll architecture target files are addressed by
    file_struct and file descriptor number, where the latter is not unique.
    Moreover, files can be transferred from another process via a unix socket,
    added to the queue and then closed, so we won't find this descriptor in
    the task fdinfo list.
    
    Thus to checkpoint and restore such processes CRIU needs to find out where
    exactly the target file is present to add it into epoll queue.  For this
    sake one can use kcmp call where some particular target file from the
    queue is compared with arbitrary file passed as an argument.
    
    Because epoll target files can have the same file descriptor number but
    a different file_struct, a caller should explicitly specify the offset
    within the sequence of targets sharing that number.
    
    To test whether some particular file matches an entry inside
    epoll one has to
    
     - fill kcmp_epoll_slot structure with epoll file descriptor,
       target file number and target file offset (in case if only
       one target is present then it should be 0)
    
     - call kcmp as kcmp(pid1, pid2, KCMP_EPOLL_TFD, fd, &kcmp_epoll_slot)
        - the kernel fetches the file pointer matching file descriptor @fd of pid1
        - looks up the file struct in the epoll queue of pid2 and returns the
          traditional 0,1,2 result for sorting purposes
    
    https://jira.sw.ru/browse/PSBM-60161
    
    Link: http://lkml.kernel.org/r/20170424154423.511592110@gmail.com
    Signed-off-by: Cyrill Gorcunov <gorcunov at openvz.org>
    
    Acked-by: Andrey Vagin <avagin at openvz.org>
    Cc: Al Viro <viro at zeniv.linux.org.uk>
    Cc: Pavel Emelyanov <xemul at virtuozzo.com>
    Cc: Michael Kerrisk <mtk.manpages at gmail.com>
    Cc: Jason Baron <jbaron at akamai.com>
    Cc: Andy Lutomirski <luto at amacapital.net>
    Signed-off-by: Andrew Morton <akpm at linux-foundation.org>
---
 fs/eventpoll.c            | 42 ++++++++++++++++++++++++++++++++++
 include/linux/eventpoll.h |  2 ++
 include/linux/kcmp.h      | 10 +++++++++
 kernel/kcmp.c             | 57 +++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 111 insertions(+)

diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 31b4741..1fbba90 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -997,6 +997,48 @@ static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd)
 	return epir;
 }
 
+static struct epitem *ep_find_tfd(struct eventpoll *ep, int tfd, unsigned long toff)
+{
+	struct rb_node *rbp;
+	struct epitem *epi;
+
+	for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) {	/* visit every item in ep->rbr */
+		epi = rb_entry(rbp, struct epitem, rbn);
+		if (epi->ffd.fd == tfd) {
+			if (toff == 0)
+				return epi;	/* toff matches were skipped: this is the requested one */
+			else
+				toff--;	/* same fd number, but not yet the requested occurrence */
+		}
+		cond_resched();
+	}
+
+	return NULL;	/* fewer than toff + 1 items carry fd number tfd */
+}
+
+struct file *get_epoll_tfile_raw_ptr(struct file *file, int tfd,
+				     unsigned long toff)
+{
+	struct file *file_raw;
+	struct eventpoll *ep;
+	struct epitem *epi;
+
+	if (!is_file_epoll(file))
+		return ERR_PTR(-EINVAL);	/* @file is not an epoll file */
+
+	ep = file->private_data;
+
+	mutex_lock(&ep->mtx);	/* ep->mtx is held for the whole lookup */
+	epi = ep_find_tfd(ep, tfd, toff);
+	if (epi)
+		file_raw = epi->ffd.file;	/* pointer value only: no get_file() is done here */
+	else
+		file_raw = ERR_PTR(-ENOENT);	/* no toff-th target with fd number tfd */
+	mutex_unlock(&ep->mtx);
+
+	return file_raw;	/* either the target's struct file pointer or an ERR_PTR() */
+}
+
 /*
  * This is the callback that is passed to the wait queue wakeup
  * mechanism. It is called by the stored file descriptors when they
diff --git a/include/linux/eventpoll.h b/include/linux/eventpoll.h
index 6daf6d4..f2b07df 100644
--- a/include/linux/eventpoll.h
+++ b/include/linux/eventpoll.h
@@ -22,6 +22,8 @@ struct file;
 
 #ifdef CONFIG_EPOLL
 
+struct file *get_epoll_tfile_raw_ptr(struct file *file, int tfd, unsigned long toff);
+
 /* Used to initialize the epoll bits inside the "struct file" */
 static inline void eventpoll_init_file(struct file *file)
 {
diff --git a/include/linux/kcmp.h b/include/linux/kcmp.h
index 2dcd1b3..88a7e6b 100644
--- a/include/linux/kcmp.h
+++ b/include/linux/kcmp.h
@@ -1,6 +1,8 @@
 #ifndef _LINUX_KCMP_H
 #define _LINUX_KCMP_H
 
+#include <linux/types.h>
+
 /* Comparison type */
 enum kcmp_type {
 	KCMP_FILE,
@@ -10,8 +12,16 @@ enum kcmp_type {
 	KCMP_SIGHAND,
 	KCMP_IO,
 	KCMP_SYSVSEM,
+	KCMP_EPOLL_TFD,
 
 	KCMP_TYPES,
 };
 
+/* Slot for KCMP_EPOLL_TFD: selects one target file inside an epoll instance */
+struct kcmp_epoll_slot {
+	__u32 efd;		/* epoll file descriptor */
+	__u32 tfd;		/* target file number */
+	__u32 toff;		/* target offset within same numbered sequence */
+};
+
 #endif /* _LINUX_KCMP_H */
diff --git a/kernel/kcmp.c b/kernel/kcmp.c
index 0aa69ea..b8ecf9d 100644
--- a/kernel/kcmp.c
+++ b/kernel/kcmp.c
@@ -11,6 +11,10 @@
 #include <linux/bug.h>
 #include <linux/err.h>
 #include <linux/kcmp.h>
+#include <linux/capability.h>
+#include <linux/list.h>
+#include <linux/eventpoll.h>
+#include <linux/file.h>
 
 #include <asm/unistd.h>
 
@@ -94,6 +98,56 @@ static int kcmp_lock(struct mutex *m1, struct mutex *m2)
 	return err;
 }
 
+#ifdef CONFIG_EPOLL
+static int kcmp_epoll_target(struct task_struct *task1,
+			     struct task_struct *task2,
+			     unsigned long idx1,
+			     struct kcmp_epoll_slot __user *uslot)
+{
+	struct file *filp, *filp_epoll, *filp_tgt;
+	struct kcmp_epoll_slot slot;
+	struct files_struct *files;
+
+	if (copy_from_user(&slot, uslot, sizeof(slot)))
+		return -EFAULT;
+
+	filp = get_file_raw_ptr(task1, idx1);	/* file behind fd idx1 of task1 */
+	if (!filp)
+		return -EBADF;
+
+	files = get_files_struct(task2);
+	if (!files)
+		return -EBADF;
+
+	spin_lock(&files->file_lock);
+	filp_epoll = fcheck_files(files, slot.efd);
+	if (filp_epoll)
+		get_file(filp_epoll);	/* hold a reference past the unlock; dropped by fput() below */
+	else
+		filp_tgt = ERR_PTR(-EBADF);	/* slot.efd is not open in task2 */
+	spin_unlock(&files->file_lock);
+	put_files_struct(files);
+
+	if (filp_epoll) {
+		filp_tgt = get_epoll_tfile_raw_ptr(filp_epoll, slot.tfd, slot.toff);
+		fput(filp_epoll);
+	}
+
+	if (IS_ERR(filp_tgt))	/* must run on both paths: the lookup above can fail too */
+		return PTR_ERR(filp_tgt);
+
+	return kcmp_ptr(filp, filp_tgt, KCMP_FILE);
+}
+#else
+static int kcmp_epoll_target(struct task_struct *task1,
+			     struct task_struct *task2,
+			     unsigned long idx1,
+			     struct kcmp_epoll_slot __user *uslot)
+{
+	return -EOPNOTSUPP;	/* stub used when CONFIG_EPOLL is not defined */
+}
+#endif
+
 SYSCALL_DEFINE5(kcmp, pid_t, pid1, pid_t, pid2, int, type,
 		unsigned long, idx1, unsigned long, idx2)
 {
@@ -165,6 +219,9 @@ SYSCALL_DEFINE5(kcmp, pid_t, pid1, pid_t, pid2, int, type,
 		ret = -EOPNOTSUPP;
 #endif
 		break;
+	case KCMP_EPOLL_TFD:
+		ret = kcmp_epoll_target(task1, task2, idx1, (void *)idx2);
+		break;
 	default:
 		ret = -EINVAL;
 		break;


More information about the Devel mailing list