[Devel] [PATCH RHEL7 COMMIT] ms/vfs: syscall: Add open_tree(2) to reference or clone a mount
Vasily Averin
vvs at virtuozzo.com
Wed Aug 26 09:47:34 MSK 2020
The commit is pushed to "branch-rh7-3.10.0-1127.18.2.vz7.163.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-1127.18.2.vz7.163.11
------>
commit 27cf55493f5c43e75a69a5a24c108b8d6886a964
Author: Pavel Tikhomirov <ptikhomirov at virtuozzo.com>
Date: Wed Aug 26 09:47:34 2020 +0300
ms/vfs: syscall: Add open_tree(2) to reference or clone a mount
Patchset description:
These syscalls were added as preparation step for new mount api (fsopen,
fsconfig, fsmount and fspick will be ported separately).
We can use them to implement "cross-namespace bind-mounting" like this:
fd = open_tree(AT_FDCWD, "/mnt", OPEN_TREE_CLONE);
setns(nsfd, CLONE_NEWNS);
move_mount(fd, "", AT_FDCWD, "/mnt2", MOVE_MOUNT_F_EMPTY_PATH);
This will allow us implementing feature of adding bindmounts to runing
container instead of having unreliable external propagations.
It is needed to VZ8, but does not apply cleanly so I will send it
separately.
https://jira.sw.ru/browse/PSBM-107263
Current patch description:
From: Al Viro <viro at zeniv.linux.org.uk>
open_tree(dfd, pathname, flags)
Returns an O_PATH-opened file descriptor or an error.
dfd and pathname specify the location to open, in usual
fashion (see e.g. fstatat(2)). flags should be an OR of
some of the following:
* AT_PATH_EMPTY, AT_NO_AUTOMOUNT, AT_SYMLINK_NOFOLLOW -
same meanings as usual
* OPEN_TREE_CLOEXEC - make the resulting descriptor
close-on-exec
* OPEN_TREE_CLONE or OPEN_TREE_CLONE | AT_RECURSIVE -
instead of opening the location in question, create a detached
mount tree matching the subtree rooted at location specified by
dfd/pathname. With AT_RECURSIVE the entire subtree is cloned,
without it - only the part within in the mount containing the
location in question. In other words, the same as mount --rbind
or mount --bind would've taken. The detached tree will be
dissolved on the final close of obtained file. Creation of such
detached trees requires the same capabilities as doing mount --bind.
Signed-off-by: Al Viro <viro at zeniv.linux.org.uk>
Signed-off-by: David Howells <dhowells at redhat.com>
cc: linux-api at vger.kernel.org
Signed-off-by: Al Viro <viro at zeniv.linux.org.uk>
vfs: syscall: Add open_tree(2) to reference or clone a mount
(cherry-picked from commit a07b20004793d8926f78d63eb5980559f7813404)
uapi, x86: Fix the syscall numbering of the mount API syscalls [ver #2]
(cherry-picked from commit 9c8ad7a2ff0bfe58f019ec0abc1fb965114dde7d)
fs/namespace: add __user to open_tree and move_mount syscalls
(cherry-picked from commit 2658ce095df583cdf9ede475ec4da0b3cc7f7b05)
https://jira.sw.ru/browse/PSBM-107263
Signed-off-by: Pavel Tikhomirov <ptikhomirov at virtuozzo.com>
---
arch/x86/syscalls/syscall_32.tbl | 2 +
arch/x86/syscalls/syscall_64.tbl | 2 +
fs/file_table.c | 9 ++-
fs/internal.h | 1 +
fs/namespace.c | 157 +++++++++++++++++++++++++++++++++------
include/linux/fs.h | 3 +
include/linux/syscalls.h | 2 +
include/uapi/linux/fcntl.h | 1 +
include/uapi/linux/fs.h | 6 ++
9 files changed, 158 insertions(+), 25 deletions(-)
diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/syscalls/syscall_32.tbl
index 307c296..7b32f7c 100644
--- a/arch/x86/syscalls/syscall_32.tbl
+++ b/arch/x86/syscalls/syscall_32.tbl
@@ -371,6 +371,8 @@
381 i386 pkey_alloc sys_pkey_alloc
382 i386 pkey_free sys_pkey_free
+428 i386 open_tree sys_open_tree
+
510 i386 getluid sys_getluid
511 i386 setluid sys_setluid
512 i386 setublimit sys_setublimit compat_sys_setublimit
diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl
index b304a2e..6b3a1d1 100644
--- a/arch/x86/syscalls/syscall_64.tbl
+++ b/arch/x86/syscalls/syscall_64.tbl
@@ -336,6 +336,8 @@
330 common pkey_alloc sys_pkey_alloc
331 common pkey_free sys_pkey_free
+428 common open_tree sys_open_tree
+
500 64 getluid sys_getluid
501 64 setluid sys_setluid
502 64 setublimit sys_setublimit
diff --git a/fs/file_table.c b/fs/file_table.c
index 4b82c0a8..16f1efd 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -208,6 +208,7 @@ static void __fput(struct file *file)
struct dentry *dentry = file->f_path.dentry;
struct vfsmount *mnt = file->f_path.mnt;
struct inode *inode = file->f_inode;
+ fmode_t mode = file->f_mode;
might_sleep();
@@ -228,14 +229,14 @@ static void __fput(struct file *file)
file->f_op->release(inode, file);
security_file_free(file);
if (unlikely(S_ISCHR(inode->i_mode) && inode->i_cdev != NULL &&
- !(file->f_mode & FMODE_PATH))) {
+ !(mode & FMODE_PATH))) {
cdev_put(inode->i_cdev);
}
fops_put(file->f_op);
put_pid(file->f_owner.pid);
- if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ)
+ if ((mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ)
i_readcount_dec(inode);
- if (file->f_mode & FMODE_WRITER) {
+ if (mode & FMODE_WRITER) {
put_write_access(inode);
__mnt_drop_write(mnt);
}
@@ -244,6 +245,8 @@ static void __fput(struct file *file)
file->f_inode = NULL;
file_free(file);
dput(dentry);
+ if (unlikely(mode & FMODE_NEED_UNMOUNT))
+ dissolve_on_fput(mnt);
mntput(mnt);
}
diff --git a/fs/internal.h b/fs/internal.h
index 3d3d0b5..1c50f43 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -76,6 +76,7 @@ extern void __mnt_drop_write(struct vfsmount *);
extern void __mnt_drop_write_file(struct file *);
extern void mnt_drop_write_file_path(struct file *);
+extern void dissolve_on_fput(struct vfsmount *);
/*
* fs_struct.c
*/
diff --git a/fs/namespace.c b/fs/namespace.c
index 59ec3235..694e3d6 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -19,6 +19,7 @@
#include <linux/init.h> /* init_rootfs */
#include <linux/fs_struct.h> /* get_fs_root et.al. */
#include <linux/fsnotify.h> /* fsnotify_vfsmount_delete */
+#include <linux/file.h>
#include <linux/uaccess.h>
#include <linux/proc_ns.h>
#include <linux/magic.h>
@@ -1975,6 +1976,21 @@ struct vfsmount *collect_mounts(const struct path *path)
return &tree->mnt;
}
+static void free_mnt_ns(struct mnt_namespace *);
+static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *, bool);
+
+void dissolve_on_fput(struct vfsmount *mnt)
+{
+ struct mnt_namespace *ns;
+ namespace_lock();
+ lock_mount_hash();
+ ns = real_mount(mnt)->mnt_ns;
+ umount_tree(real_mount(mnt), UMOUNT_CONNECTED);
+ unlock_mount_hash();
+ namespace_unlock();
+ free_mnt_ns(ns);
+}
+
void drop_collected_mounts(struct vfsmount *mnt)
{
namespace_lock();
@@ -2335,6 +2351,30 @@ static bool has_locked_children(struct mount *mnt, struct dentry *dentry)
return false;
}
+static struct mount *__do_loopback(struct path *old_path, int recurse)
+{
+ struct mount *mnt = ERR_PTR(-EINVAL), *old = real_mount(old_path->mnt);
+
+ if (IS_MNT_UNBINDABLE(old))
+ return mnt;
+
+ if (!check_mnt(old) && old_path->dentry->d_op != &ns_dentry_operations)
+ return mnt;
+
+ if (!recurse && has_locked_children(old, old_path->dentry))
+ return mnt;
+
+ if (recurse)
+ mnt = copy_tree(old, old_path->dentry, CL_COPY_MNT_NS_FILE);
+ else
+ mnt = clone_mnt(old, old_path->dentry, 0);
+
+ if (!IS_ERR(mnt))
+ mnt->mnt.mnt_flags &= ~MNT_LOCKED;
+
+ return mnt;
+}
+
/*
* do loopback mount.
*/
@@ -2342,7 +2382,7 @@ static int do_loopback(struct path *path, const char *old_name,
int recurse)
{
struct path old_path;
- struct mount *mnt = NULL, *old, *parent;
+ struct mount *mnt = NULL, *parent;
struct mountpoint *mp;
int err;
if (!old_name || !*old_name)
@@ -2356,38 +2396,21 @@ static int do_loopback(struct path *path, const char *old_name,
goto out;
mp = lock_mount(path);
- err = PTR_ERR(mp);
- if (IS_ERR(mp))
+ if (IS_ERR(mp)) {
+ err = PTR_ERR(mp);
goto out;
+ }
- old = real_mount(old_path.mnt);
parent = real_mount(path->mnt);
-
- err = -EINVAL;
- if (IS_MNT_UNBINDABLE(old))
- goto out2;
-
if (!check_mnt(parent))
goto out2;
- if (!check_mnt(old) && old_path.dentry->d_op != &ns_dentry_operations)
- goto out2;
-
- if (!recurse && has_locked_children(old, old_path.dentry))
- goto out2;
-
- if (recurse)
- mnt = copy_tree(old, old_path.dentry, CL_COPY_MNT_NS_FILE);
- else
- mnt = clone_mnt(old, old_path.dentry, 0);
-
+ mnt = __do_loopback(&old_path, recurse);
if (IS_ERR(mnt)) {
err = PTR_ERR(mnt);
goto out2;
}
- mnt->mnt.mnt_flags &= ~MNT_LOCKED;
-
err = graft_tree(mnt, parent, mp);
if (err) {
lock_mount_hash();
@@ -2401,6 +2424,96 @@ out:
return err;
}
+static struct file *open_detached_copy(struct path *path, bool recursive)
+{
+ struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns;
+ struct mnt_namespace *ns = alloc_mnt_ns(user_ns, true);
+ struct mount *mnt, *p;
+ struct file *file;
+
+ if (IS_ERR(ns))
+ return ERR_CAST(ns);
+
+ namespace_lock();
+ mnt = __do_loopback(path, recursive);
+ if (IS_ERR(mnt)) {
+ namespace_unlock();
+ free_mnt_ns(ns);
+ return ERR_CAST(mnt);
+ }
+
+ lock_mount_hash();
+ for (p = mnt; p; p = next_mnt(p, mnt)) {
+ p->mnt_ns = ns;
+ ns->mounts++;
+ }
+ ns->root = mnt;
+ list_add_tail(&ns->list, &mnt->mnt_list);
+ mntget(&mnt->mnt);
+ unlock_mount_hash();
+ namespace_unlock();
+
+ mntput(path->mnt);
+ path->mnt = &mnt->mnt;
+ file = dentry_open(path, O_PATH, current_cred());
+ if (IS_ERR(file))
+ dissolve_on_fput(path->mnt);
+ else
+ file->f_mode |= FMODE_NEED_UNMOUNT;
+ return file;
+}
+
+SYSCALL_DEFINE3(open_tree, int, dfd, const char __user *, filename, unsigned, flags)
+{
+ struct file *file;
+ struct path path;
+ int lookup_flags = LOOKUP_AUTOMOUNT | LOOKUP_FOLLOW;
+ bool detached = flags & OPEN_TREE_CLONE;
+ int error;
+ int fd;
+
+ BUILD_BUG_ON(OPEN_TREE_CLOEXEC != O_CLOEXEC);
+
+ if (flags & ~(AT_EMPTY_PATH | AT_NO_AUTOMOUNT | AT_RECURSIVE |
+ AT_SYMLINK_NOFOLLOW | OPEN_TREE_CLONE |
+ OPEN_TREE_CLOEXEC))
+ return -EINVAL;
+
+ if ((flags & (AT_RECURSIVE | OPEN_TREE_CLONE)) == AT_RECURSIVE)
+ return -EINVAL;
+
+ if (flags & AT_NO_AUTOMOUNT)
+ lookup_flags &= ~LOOKUP_AUTOMOUNT;
+ if (flags & AT_SYMLINK_NOFOLLOW)
+ lookup_flags &= ~LOOKUP_FOLLOW;
+ if (flags & AT_EMPTY_PATH)
+ lookup_flags |= LOOKUP_EMPTY;
+
+ if (detached && !may_mount())
+ return -EPERM;
+
+ fd = get_unused_fd_flags(flags & O_CLOEXEC);
+ if (fd < 0)
+ return fd;
+
+ error = user_path_at(dfd, filename, lookup_flags, &path);
+ if (unlikely(error)) {
+ file = ERR_PTR(error);
+ } else {
+ if (detached)
+ file = open_detached_copy(&path, flags & AT_RECURSIVE);
+ else
+ file = dentry_open(&path, O_PATH, current_cred());
+ path_put(&path);
+ }
+ if (IS_ERR(file)) {
+ put_unused_fd(fd);
+ return PTR_ERR(file);
+ }
+ fd_install(fd, file);
+ return fd;
+}
+
static int change_mount_flags(struct vfsmount *mnt, int ms_flags)
{
int error = 0;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index f2daf45..55a92ce 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -146,6 +146,9 @@ typedef void (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
/* The extended KABI iterate method in struct file_operations is present */
#define FMODE_KABI_ITERATE ((__force fmode_t)0x80000000)
+/* File represents mount that needs unmounting */
+#define FMODE_NEED_UNMOUNT ((__force fmode_t)0x10000000)
+
/*
* Flag for rw_copy_check_uvector and compat_rw_copy_check_uvector
* that indicates that they should check the contents of the iovec are
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 8146184..0e30297 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -891,4 +891,6 @@ asmlinkage long sys_membarrier(int cmd, int flags);
asmlinkage long sys_mlock2(unsigned long start, size_t len, int flags);
+asmlinkage long sys_open_tree(int dfd, const char __user *path, unsigned flags);
+
#endif
diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h
index beed138..aa700a3 100644
--- a/include/uapi/linux/fcntl.h
+++ b/include/uapi/linux/fcntl.h
@@ -63,5 +63,6 @@
#define AT_NO_AUTOMOUNT 0x800 /* Suppress terminal automount traversal */
#define AT_EMPTY_PATH 0x1000 /* Allow empty relative pathname */
+#define AT_RECURSIVE 0x8000 /* Apply to the entire subtree */
#endif /* _UAPI_LINUX_FCNTL_H */
diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
index 45e2752..0fc42f1 100644
--- a/include/uapi/linux/fs.h
+++ b/include/uapi/linux/fs.h
@@ -125,6 +125,12 @@ struct inodes_stat_t {
#define MS_MGC_VAL 0xC0ED0000
#define MS_MGC_MSK 0xffff0000
+/*
+ * open_tree() flags.
+ */
+#define OPEN_TREE_CLONE 1 /* Clone the target tree and attach the clone */
+#define OPEN_TREE_CLOEXEC O_CLOEXEC /* Close the file on execve() */
+
/* the read-only stuff doesn't really belong here, but any other place is
probably as bad and I don't want to create yet another include file. */
More information about the Devel
mailing list