[Devel] [PATCH RHEL7 COMMIT] ms/shm: add memfd_create() syscall

Konstantin Khorenko khorenko at virtuozzo.com
Thu Oct 15 04:04:18 PDT 2015


The commit is pushed to "branch-rh7-3.10.0-229.7.2.vz7.8.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-229.7.2.vz7.8.6
------>
commit 9e421edd0c467fb8d3a230520421a58f55e2a46e
Author: Andrew Vagin <avagin at openvz.org>
Date:   Thu Oct 15 15:04:18 2015 +0400

    ms/shm: add memfd_create() syscall
    
    The patch is required for CRIU.
    
    https://jira.sw.ru/browse/PSBM-39834
    
    ML: 9183df25fe7b194563db3fec6dc3202a5855839c
    
    memfd_create() is similar to mmap(MAP_ANON), but returns a file-descriptor
    that you can pass to mmap().  It can support sealing and avoids any
    connection to user-visible mount-points.  Thus, it is not subject to quotas
    on mounted file-systems and can be used like malloc()'ed memory, except
    that it comes with a file-descriptor.
    
    memfd_create() returns the raw shmem file, so calls like ftruncate() can
    be used to modify the underlying inode.  Also calls like fstat() will
    return proper information and mark the file as regular file.  If you want
    sealing, you can specify MFD_ALLOW_SEALING.  Otherwise, sealing is not
    supported (like on all other regular files).
    
    Compared to O_TMPFILE, it does not require a tmpfs mount-point and is not
    subject to a filesystem size limit.  It is still properly accounted to
    memcg limits, though, and to the same overcommit or no-overcommit
    accounting as all user memory.
    
    Signed-off-by: David Herrmann <dh.herrmann at gmail.com>
    Acked-by: Hugh Dickins <hughd at google.com>
    Cc: Michael Kerrisk <mtk.manpages at gmail.com>
    Cc: Ryan Lortie <desrt at desrt.ca>
    Cc: Lennart Poettering <lennart at poettering.net>
    Cc: Daniel Mack <zonque at gmail.com>
    Cc: Andy Lutomirski <luto at amacapital.net>
    Signed-off-by: Andrew Morton <akpm at linux-foundation.org>
    Signed-off-by: Linus Torvalds <torvalds at linux-foundation.org>
    
    Conflicts:
    
    	arch/x86/syscalls/syscall_32.tbl
    	arch/x86/syscalls/syscall_64.tbl
    Signed-off-by: Andrew Vagin <avagin at openvz.org>
---
 arch/x86/syscalls/syscall_32.tbl |  1 +
 arch/x86/syscalls/syscall_64.tbl |  1 +
 include/linux/syscalls.h         |  1 +
 kernel/sys_ni.c                  |  1 +
 mm/shmem.c                       | 73 ++++++++++++++++++++++++++++++++++++++++
 5 files changed, 77 insertions(+)

diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/syscalls/syscall_32.tbl
index 5d1de5d..4d0e1b4 100644
--- a/arch/x86/syscalls/syscall_32.tbl
+++ b/arch/x86/syscalls/syscall_32.tbl
@@ -357,6 +357,7 @@
 348	i386	process_vm_writev	sys_process_vm_writev		compat_sys_process_vm_writev
 349	i386	kcmp			sys_kcmp
 350	i386	finit_module		sys_finit_module
+356	i386	memfd_create		sys_memfd_create
 
 500	i386	fairsched_mknod		sys_fairsched_mknod
 501	i386	fairsched_rmnod		sys_fairsched_rmnod
diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl
index 3ed05b4..2415f42 100644
--- a/arch/x86/syscalls/syscall_64.tbl
+++ b/arch/x86/syscalls/syscall_64.tbl
@@ -321,6 +321,7 @@
 312	common	kcmp			sys_kcmp
 313	common	finit_module		sys_finit_module
 316	common	renameat2		sys_renameat2
+319	common	memfd_create		sys_memfd_create
 320	common	kexec_file_load		sys_kexec_file_load
 
 497	64	fairsched_nodemask	sys_fairsched_nodemask
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index c89c938..2c2e396 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -786,6 +786,7 @@ asmlinkage long sys_timerfd_settime(int ufd, int flags,
 asmlinkage long sys_timerfd_gettime(int ufd, struct itimerspec __user *otmr);
 asmlinkage long sys_eventfd(unsigned int count);
 asmlinkage long sys_eventfd2(unsigned int count, int flags);
+asmlinkage long sys_memfd_create(const char __user *uname_ptr, unsigned int flags);
 asmlinkage long sys_fallocate(int fd, int mode, loff_t offset, loff_t len);
 asmlinkage long sys_old_readdir(unsigned int, struct old_linux_dirent __user *, unsigned int);
 asmlinkage long sys_pselect6(int, fd_set __user *, fd_set __user *,
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 7c98d8f..75a69b0 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -194,6 +194,7 @@ cond_syscall(compat_sys_timerfd_settime);
 cond_syscall(compat_sys_timerfd_gettime);
 cond_syscall(sys_eventfd);
 cond_syscall(sys_eventfd2);
+cond_syscall(sys_memfd_create);
 
 /* performance counters: */
 cond_syscall(sys_perf_event_open);
diff --git a/mm/shmem.c b/mm/shmem.c
index 3964468..bc8e08b 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -66,7 +66,9 @@ static struct vfsmount *shm_mnt;
 #include <linux/highmem.h>
 #include <linux/seq_file.h>
 #include <linux/magic.h>
+#include <linux/syscalls.h>
 #include <linux/fcntl.h>
+#include <uapi/linux/memfd.h>
 
 #include <bc/beancounter.h>
 #include <bc/vmpages.h>
@@ -2854,6 +2856,77 @@ static int shmem_show_options(struct seq_file *seq, struct dentry *root)
 	shmem_show_mpol(seq, sbinfo->mpol);
 	return 0;
 }
+
+#define MFD_NAME_PREFIX "memfd:"	/* prepended to the user-supplied name */
+#define MFD_NAME_PREFIX_LEN (sizeof(MFD_NAME_PREFIX) - 1)	/* excludes the NUL */
+#define MFD_NAME_MAX_LEN (NAME_MAX - MFD_NAME_PREFIX_LEN)	/* user part only */
+
+#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING)	/* every accepted flag */
+/* memfd_create(): install an fd for a new shmem file named "memfd:" + uname. */
+SYSCALL_DEFINE2(memfd_create,
+		const char __user *, uname,
+		unsigned int, flags)
+{
+	struct shmem_inode_info *info;
+	struct file *file;
+	int fd, error;	/* fd is returned on success, error (negative errno) on failure */
+	char *name;	/* kmalloc'ed "memfd:" + user name, NUL-terminated */
+	long len;	/* user name length, counting its terminating zero */
+
+	if (flags & ~(unsigned int)MFD_ALL_FLAGS)	/* reject unknown flag bits */
+		return -EINVAL;
+
+	/* length includes terminating zero */
+	len = strnlen_user(uname, MFD_NAME_MAX_LEN + 1);
+	if (len <= 0)
+		return -EFAULT;
+	if (len > MFD_NAME_MAX_LEN + 1)	/* prefix + name would exceed NAME_MAX */
+		return -EINVAL;
+
+	name = kmalloc(len + MFD_NAME_PREFIX_LEN, GFP_TEMPORARY);
+	if (!name)
+		return -ENOMEM;
+
+	strcpy(name, MFD_NAME_PREFIX);
+	if (copy_from_user(&name[MFD_NAME_PREFIX_LEN], uname, len)) {	/* append after prefix */
+		error = -EFAULT;
+		goto err_name;
+	}
+
+	/* terminating-zero may have changed after strnlen_user() returned */
+	if (name[len + MFD_NAME_PREFIX_LEN - 1]) {	/* last copied byte must be NUL */
+		error = -EFAULT;
+		goto err_name;
+	}
+
+	fd = get_unused_fd_flags((flags & MFD_CLOEXEC) ? O_CLOEXEC : 0);
+	if (fd < 0) {
+		error = fd;	/* negative value is passed back to the caller as-is */
+		goto err_name;
+	}
+
+	file = shmem_file_setup(name, 0, VM_NORESERVE);
+	if (IS_ERR(file)) {
+		error = PTR_ERR(file);
+		goto err_fd;
+	}
+	info = SHMEM_I(file_inode(file));
+	file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE;
+	file->f_flags |= O_RDWR | O_LARGEFILE;
+	if (flags & MFD_ALLOW_SEALING)
+		info->seals &= ~F_SEAL_SEAL;	/* clear F_SEAL_SEAL to permit sealing */
+
+	fd_install(fd, file);
+	kfree(name);	/* the success path frees the name buffer too */
+	return fd;
+
+err_fd:
+	put_unused_fd(fd);	/* give back the fd from get_unused_fd_flags() */
+err_name:
+	kfree(name);
+	return error;
+}
+
 #endif /* CONFIG_TMPFS */
 
 static void shmem_put_super(struct super_block *sb)



More information about the Devel mailing list