[Devel] [PATCH RH7 20/32] take the targets of /proc/*/ns/* symlinks to separate fs
Pavel Tikhomirov
ptikhomirov at virtuozzo.com
Mon Jun 8 20:05:50 MSK 2020
From: Al Viro <viro at zeniv.linux.org.uk>
New pseudo-filesystem: nsfs. Targets of /proc/*/ns/* live there now.
It's not mountable (not even registered, so it's not in /proc/filesystems,
etc.). Files on it *are* bindable - we explicitly permit that in do_loopback().
This stuff lives in fs/nsfs.c now; proc_ns_fget() moved there as well.
get_proc_ns() is a macro now (it's simply returning ->i_private; would
have been an inline, if not for header ordering headache).
proc_ns_inode() is an ex-parrot. The interface used in procfs is
ns_get_path(path, task, ops) and ns_get_name(buf, size, task, ops).
Dentries and inodes are never hashed; a non-counting reference to dentry
is stashed in ns_common (removed by ->d_prune()) and reused by ns_get_path()
if present. See ns_get_path()/ns_prune_dentry/nsfs_evict() for details
of that mechanism.
As the result, proc_ns_follow_link() has stopped poking in nd->path.mnt;
it does nd_jump_link() on a consistent <vfsmount,dentry> pair it gets
from ns_get_path().
Signed-off-by: Al Viro <viro at zeniv.linux.org.uk>
(cherry picked from VZ8 commit e149ed2b805fefdccf7ccdfc19eca22fdd4514ac)
https://jira.sw.ru/browse/PSBM-102357
Signed-off-by: Pavel Tikhomirov <ptikhomirov at virtuozzo.com>
---
fs/Makefile | 2 +-
fs/internal.h | 5 ++
fs/namespace.c | 9 +-
fs/nsfs.c | 167 ++++++++++++++++++++++++++++++++++++
fs/proc/inode.c | 5 --
fs/proc/namespaces.c | 171 +++++--------------------------------
include/linux/ns_common.h | 1 +
include/linux/proc_ns.h | 32 ++++---
include/uapi/linux/magic.h | 1 +
init/main.c | 2 +
10 files changed, 221 insertions(+), 174 deletions(-)
create mode 100644 fs/nsfs.c
diff --git a/fs/Makefile b/fs/Makefile
index 15607740ad44..eaf855395381 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -11,7 +11,7 @@ obj-y := open.o read_write.o file_table.o super.o \
attr.o bad_inode.o file.o filesystems.o namespace.o \
seq_file.o xattr.o libfs.o fs-writeback.o \
pnode.o splice.o sync.o utimes.o \
- stack.o fs_struct.o statfs.o fs_pin.o
+ stack.o fs_struct.o statfs.o fs_pin.o nsfs.o
ifeq ($(CONFIG_BLOCK),y)
obj-y += buffer.o bio.o block_dev.o direct-io.o mpage.o ioprio.o
diff --git a/fs/internal.h b/fs/internal.h
index e3f25efb6a32..3d3d0b57c8ff 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -170,3 +170,8 @@ loff_t iomap_apply(struct inode *inode, loff_t pos, loff_t length,
*/
extern void group_pin_kill(struct hlist_head *p);
extern void mnt_pin_kill(struct mount *m);
+
+/*
+ * fs/nsfs.c
+ */
+extern struct dentry_operations ns_dentry_operations;
diff --git a/fs/namespace.c b/fs/namespace.c
index b88e6083d8d6..ab4fcd9a5b1f 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1877,8 +1877,8 @@ SYSCALL_DEFINE1(oldumount, char __user *, name)
static bool is_mnt_ns_file(struct dentry *dentry)
{
/* Is this a proxy for a mount namespace? */
- struct inode *inode = dentry->d_inode;
- return proc_ns_inode(inode) && dentry->d_fsdata == &mntns_operations;
+ return dentry->d_op == &ns_dentry_operations &&
+ dentry->d_fsdata == &mntns_operations;
}
struct mnt_namespace *to_mnt_ns(struct ns_common *ns)
@@ -2367,7 +2367,10 @@ static int do_loopback(struct path *path, const char *old_name,
if (IS_MNT_UNBINDABLE(old))
goto out2;
- if (!check_mnt(parent) || !check_mnt(old))
+ if (!check_mnt(parent))
+ goto out2;
+
+ if (!check_mnt(old) && old_path.dentry->d_op != &ns_dentry_operations)
goto out2;
if (!recurse && has_locked_children(old, old_path.dentry))
diff --git a/fs/nsfs.c b/fs/nsfs.c
new file mode 100644
index 000000000000..cb8323fac27f
--- /dev/null
+++ b/fs/nsfs.c
@@ -0,0 +1,167 @@
+#include <linux/mount.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/proc_ns.h>
+#include <linux/magic.h>
+#include <linux/ktime.h>
+
+static struct vfsmount *nsfs_mnt;
+
+static const struct file_operations ns_file_operations = {
+ .llseek = no_llseek,
+};
+
+static char *ns_dname(struct dentry *dentry, char *buffer, int buflen)
+{
+ struct inode *inode = dentry->d_inode;
+ const struct proc_ns_operations *ns_ops = dentry->d_fsdata;
+
+ return dynamic_dname(dentry, buffer, buflen, "%s:[%lu]",
+ ns_ops->name, inode->i_ino);
+}
+
+static void ns_prune_dentry(struct dentry *dentry)
+{
+ struct inode *inode = dentry->d_inode;
+ if (inode) {
+ struct ns_common *ns = inode->i_private;
+ atomic_long_set(&ns->stashed, 0);
+ }
+}
+
+static int ns_delete_dentry(const struct dentry *dentry)
+{
+ /* Don't cache namespace inodes when not in use */
+ return 1;
+}
+
+const struct dentry_operations ns_dentry_operations =
+{
+ .d_prune = ns_prune_dentry,
+ .d_delete = ns_delete_dentry,
+ .d_dname = ns_dname,
+};
+
+static void nsfs_evict(struct inode *inode)
+{
+ struct ns_common *ns = inode->i_private;
+ clear_inode(inode);
+ ns->ops->put(ns);
+}
+
+void *ns_get_path(struct path *path, struct task_struct *task,
+ const struct proc_ns_operations *ns_ops)
+{
+ struct vfsmount *mnt = mntget(nsfs_mnt);
+ struct qstr qname = { .name = "", };
+ struct dentry *dentry;
+ struct inode *inode;
+ struct ns_common *ns;
+ unsigned long d;
+
+again:
+ ns = ns_ops->get(task);
+ if (!ns) {
+ mntput(mnt);
+ return ERR_PTR(-ENOENT);
+ }
+ rcu_read_lock();
+ d = atomic_long_read(&ns->stashed);
+ if (!d)
+ goto slow;
+ dentry = (struct dentry *)d;
+ if (!lockref_get_not_dead(&dentry->d_lockref))
+ goto slow;
+ rcu_read_unlock();
+ ns_ops->put(ns);
+got_it:
+ path->mnt = mnt;
+ path->dentry = dentry;
+ return NULL;
+slow:
+ rcu_read_unlock();
+ inode = new_inode_pseudo(mnt->mnt_sb);
+ if (!inode) {
+ ns_ops->put(ns);
+ mntput(mnt);
+ return ERR_PTR(-ENOMEM);
+ }
+ inode->i_ino = ns->inum;
+ inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
+ inode->i_flags |= S_IMMUTABLE;
+ inode->i_mode = S_IFREG | S_IRUGO;
+ inode->i_fop = &ns_file_operations;
+ inode->i_private = ns;
+
+ dentry = d_alloc_pseudo(mnt->mnt_sb, &qname);
+ if (!dentry) {
+ iput(inode);
+ mntput(mnt);
+ return ERR_PTR(-ENOMEM);
+ }
+ d_instantiate(dentry, inode);
+ dentry->d_fsdata = (void *)ns_ops;
+ d = atomic_long_cmpxchg(&ns->stashed, 0, (unsigned long)dentry);
+ if (d) {
+ d_delete(dentry); /* make sure ->d_prune() does nothing */
+ dput(dentry);
+ cpu_relax();
+ goto again;
+ }
+ goto got_it;
+}
+
+int ns_get_name(char *buf, size_t size, struct task_struct *task,
+ const struct proc_ns_operations *ns_ops)
+{
+ struct ns_common *ns;
+ int res = -ENOENT;
+ ns = ns_ops->get(task);
+ if (ns) {
+ res = snprintf(buf, size, "%s:[%u]", ns_ops->name, ns->inum);
+ ns_ops->put(ns);
+ }
+ return res;
+}
+
+struct file *proc_ns_fget(int fd)
+{
+ struct file *file;
+
+ file = fget(fd);
+ if (!file)
+ return ERR_PTR(-EBADF);
+
+ if (file->f_op != &ns_file_operations)
+ goto out_invalid;
+
+ return file;
+
+out_invalid:
+ fput(file);
+ return ERR_PTR(-EINVAL);
+}
+
+static const struct super_operations nsfs_ops = {
+ .statfs = simple_statfs,
+ .evict_inode = nsfs_evict,
+};
+static struct dentry *nsfs_mount(struct file_system_type *fs_type,
+ int flags, const char *dev_name, void *data)
+{
+ return mount_pseudo(fs_type, "nsfs:", &nsfs_ops,
+ &ns_dentry_operations, NSFS_MAGIC);
+}
+static struct file_system_type nsfs = {
+ .name = "nsfs",
+ .mount = nsfs_mount,
+ .kill_sb = kill_anon_super,
+};
+
+void __init nsfs_init(void)
+{
+ nsfs_mnt = kern_mount(&nsfs);
+ if (IS_ERR(nsfs_mnt))
+ panic("can't set nsfs up\n");
+ nsfs_mnt->mnt_sb->s_flags &= ~MS_NOUSER;
+}
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 8918cf3060a8..fba30eb2aed6 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -32,7 +32,6 @@ static void proc_evict_inode(struct inode *inode)
{
struct proc_dir_entry *de;
struct ctl_table_header *head;
- struct ns_common *ns;
truncate_inode_pages_final(&inode->i_data);
clear_inode(inode);
@@ -50,10 +49,6 @@ static void proc_evict_inode(struct inode *inode)
rcu_assign_pointer(PROC_I(inode)->sysctl, NULL);
proc_sys_evict_inode(inode, head);
}
- /* Release any associated namespace */
- ns = PROC_I(inode)->ns.ns;
- if (ns && ns->ops)
- ns->ops->put(ns);
}
static struct kmem_cache * proc_inode_cachep;
diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c
index 1f152f9f6ec4..49d459f05546 100644
--- a/fs/proc/namespaces.c
+++ b/fs/proc/namespaces.c
@@ -1,10 +1,6 @@
#include <linux/proc_fs.h>
#include <linux/nsproxy.h>
-#include <linux/sched.h>
#include <linux/ptrace.h>
-#include <linux/fs_struct.h>
-#include <linux/mount.h>
-#include <linux/path.h>
#include <linux/namei.h>
#include <linux/file.h>
#include <linux/utsname.h>
@@ -34,151 +30,50 @@ static const struct proc_ns_operations *ns_entries[] = {
&mntns_operations,
};
-static const struct file_operations ns_file_operations = {
- .llseek = no_llseek,
-};
-
-static const struct inode_operations ns_inode_operations = {
- .setattr = proc_setattr,
-};
-
-static int ns_delete_dentry(const struct dentry *dentry)
-{
- /* Don't cache namespace inodes when not in use */
- return 1;
-}
-
-static char *ns_dname(struct dentry *dentry, char *buffer, int buflen)
-{
- struct inode *inode = dentry->d_inode;
- const struct proc_ns_operations *ns_ops = dentry->d_fsdata;
-
- return dynamic_dname(dentry, buffer, buflen, "%s:[%lu]",
- ns_ops->name, inode->i_ino);
-}
-
-const struct dentry_operations ns_dentry_operations =
-{
- .d_delete = ns_delete_dentry,
- .d_dname = ns_dname,
-};
-
-static struct dentry *proc_ns_get_dentry(struct super_block *sb,
- struct task_struct *task, const struct proc_ns_operations *ns_ops)
-{
- struct dentry *dentry, *result;
- struct inode *inode;
- struct proc_inode *ei;
- struct qstr qname = { .name = "", };
- struct ns_common *ns;
-
- ns = ns_ops->get(task);
- if (!ns)
- return ERR_PTR(-ENOENT);
-
- dentry = d_alloc_pseudo(sb, &qname);
- if (!dentry) {
- ns_ops->put(ns);
- return ERR_PTR(-ENOMEM);
- }
- dentry->d_fsdata = (void *)ns_ops;
-
- inode = iget_locked(sb, ns->inum);
- if (!inode) {
- dput(dentry);
- ns_ops->put(ns);
- return ERR_PTR(-ENOMEM);
- }
-
- ei = PROC_I(inode);
- if (inode->i_state & I_NEW) {
- inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
- inode->i_op = &ns_inode_operations;
- inode->i_mode = S_IFREG | S_IRUGO;
- inode->i_fop = &ns_file_operations;
- ei->ns.ns_ops = ns_ops;
- ei->ns.ns = ns;
- unlock_new_inode(inode);
- } else {
- ns_ops->put(ns);
- }
-
- d_set_d_op(dentry, &ns_dentry_operations);
- result = d_instantiate_unique(dentry, inode);
- if (result) {
- dput(dentry);
- dentry = result;
- }
-
- return dentry;
-}
-
static void *proc_ns_follow_link(struct dentry *dentry, struct nameidata *nd)
{
struct inode *inode = dentry->d_inode;
- struct super_block *sb = inode->i_sb;
- struct proc_inode *ei = PROC_I(inode);
+ const struct proc_ns_operations *ns_ops = PROC_I(inode)->ns.ns_ops;
struct task_struct *task;
struct path ns_path;
void *error = ERR_PTR(-EACCES);
task = get_proc_task(inode);
if (!task)
- goto out;
-
- if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS))
- goto out_put_task;
+ return error;
- ns_path.dentry = proc_ns_get_dentry(sb, task, ei->ns.ns_ops);
- if (IS_ERR(ns_path.dentry)) {
- error = ERR_CAST(ns_path.dentry);
- goto out_put_task;
+ if (ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) {
+ error = ns_get_path(&ns_path, task, ns_ops);
+ if (!error)
+ nd_jump_link(nd, &ns_path);
}
-
- ns_path.mnt = mntget(nd->path.mnt);
- nd_jump_link(nd, &ns_path);
- error = NULL;
-
-out_put_task:
put_task_struct(task);
-out:
return error;
}
static int proc_ns_readlink(struct dentry *dentry, char __user *buffer, int buflen)
{
struct inode *inode = dentry->d_inode;
- struct proc_inode *ei = PROC_I(inode);
- const struct proc_ns_operations *ns_ops = ei->ns.ns_ops;
+ const struct proc_ns_operations *ns_ops = PROC_I(inode)->ns.ns_ops;
struct task_struct *task;
- struct ns_common *ns;
char name[50];
int len = -EACCES;
task = get_proc_task(inode);
if (!task)
- goto out;
-
- if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS))
- goto out_put_task;
-
- len = -ENOENT;
- ns = ns_ops->get(task);
- if (!ns)
- goto out_put_task;
-
- snprintf(name, sizeof(name), "%s:[%u]", ns_ops->name, ns->inum);
- len = strlen(name);
-
- if (len > buflen)
- len = buflen;
- if (copy_to_user(buffer, name, len))
- len = -EFAULT;
-
- ns_ops->put(ns);
-out_put_task:
+ return len;
+ if (ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) {
+ len = ns_get_name(name, sizeof(name), task, ns_ops);
+ if (len >= 0) {
+ len = strlen(name);
+
+ if (len > buflen)
+ len = buflen;
+ if (copy_to_user(buffer, name, len))
+ len = -EFAULT;
+ }
+ }
put_task_struct(task);
-out:
return len;
}
@@ -318,31 +213,3 @@ const struct inode_operations proc_ns_dir_inode_operations = {
.getattr = pid_getattr,
.setattr = proc_setattr,
};
-
-struct file *proc_ns_fget(int fd)
-{
- struct file *file;
-
- file = fget(fd);
- if (!file)
- return ERR_PTR(-EBADF);
-
- if (file->f_op != &ns_file_operations)
- goto out_invalid;
-
- return file;
-
-out_invalid:
- fput(file);
- return ERR_PTR(-EINVAL);
-}
-
-struct ns_common *get_proc_ns(struct inode *inode)
-{
- return PROC_I(inode)->ns.ns;
-}
-
-bool proc_ns_inode(struct inode *inode)
-{
- return inode->i_fop == &ns_file_operations;
-}
diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h
index ce23cf4bbe69..85a5c8c16be9 100644
--- a/include/linux/ns_common.h
+++ b/include/linux/ns_common.h
@@ -4,6 +4,7 @@
struct proc_ns_operations;
struct ns_common {
+ atomic_long_t stashed;
const struct proc_ns_operations *ops;
unsigned int inum;
};
diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h
index 0068daaf7c3c..065d651fde7c 100644
--- a/include/linux/proc_ns.h
+++ b/include/linux/proc_ns.h
@@ -4,10 +4,13 @@
#ifndef _LINUX_PROC_NS_H
#define _LINUX_PROC_NS_H
+#include <linux/ns_common.h>
+
struct super_block;
+
struct pid_namespace;
struct nsproxy;
-struct ns_common;
+struct path;
struct proc_ns_operations {
const char *name;
@@ -39,11 +42,8 @@ enum {
extern int pid_ns_prepare_proc(struct pid_namespace *ns);
extern void pid_ns_release_proc(struct pid_namespace *ns);
-extern struct file *proc_ns_fget(int fd);
-extern struct ns_common *get_proc_ns(struct inode *);
extern int proc_alloc_inum(unsigned int *pino);
extern void proc_free_inum(unsigned int inum);
-extern bool proc_ns_inode(struct inode *inode);
extern bool proc_in_container(struct super_block *sb);
@@ -52,24 +52,30 @@ extern bool proc_in_container(struct super_block *sb);
static inline int pid_ns_prepare_proc(struct pid_namespace *ns) { return 0; }
static inline void pid_ns_release_proc(struct pid_namespace *ns) {}
-static inline struct file *proc_ns_fget(int fd)
-{
- return ERR_PTR(-EINVAL);
-}
-
-static inline struct ns_common *get_proc_ns(struct inode *inode) { return NULL; }
-
static inline int proc_alloc_inum(unsigned int *inum)
{
*inum = 1;
return 0;
}
static inline void proc_free_inum(unsigned int inum) {}
-static inline bool proc_ns_inode(struct inode *inode) { return false; }
#endif /* CONFIG_PROC_FS */
-#define ns_alloc_inum(ns) proc_alloc_inum(&(ns)->inum)
+static inline int ns_alloc_inum(struct ns_common *ns)
+{
+ atomic_long_set(&ns->stashed, 0);
+ return proc_alloc_inum(&ns->inum);
+}
+
#define ns_free_inum(ns) proc_free_inum((ns)->inum)
+extern struct file *proc_ns_fget(int fd);
+#define get_proc_ns(inode) ((struct ns_common *)(inode)->i_private)
+extern void *ns_get_path(struct path *path, struct task_struct *task,
+ const struct proc_ns_operations *ns_ops);
+
+extern int ns_get_name(char *buf, size_t size, struct task_struct *task,
+ const struct proc_ns_operations *ns_ops);
+extern void nsfs_init(void);
+
#endif /* _LINUX_PROC_NS_H */
diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h
index 8af2941feda3..3ee2a9a9392f 100644
--- a/include/uapi/linux/magic.h
+++ b/include/uapi/linux/magic.h
@@ -76,6 +76,7 @@
#define MTD_INODE_FS_MAGIC 0x11307854
#define ANON_INODE_FS_MAGIC 0x09041934
#define BTRFS_TEST_MAGIC 0x73727279
+#define NSFS_MAGIC 0x6e736673
#define BPF_FS_MAGIC 0xcafe4a11
#endif /* __LINUX_MAGIC_H__ */
diff --git a/init/main.c b/init/main.c
index 9d60adf86c36..0c2c6b41f4f0 100644
--- a/init/main.c
+++ b/init/main.c
@@ -77,6 +77,7 @@
#include <linux/random.h>
#include <linux/context_tracking.h>
#include <linux/list.h>
+#include <linux/proc_ns.h>
#include <linux/io.h>
#include <linux/jump_label.h>
#include <linux/ve.h>
@@ -671,6 +672,7 @@ asmlinkage void __init start_kernel(void)
#ifdef CONFIG_PROC_FS
proc_root_init();
#endif
+ nsfs_init();
cgroup_init();
ub_init_late();
cpuset_init();
--
2.24.1
More information about the Devel
mailing list