[Devel] Re: [PATCH] Restore task fs_root and pwd (v3)
Serge E. Hallyn
serue at us.ibm.com
Mon Jan 25 21:29:41 PST 2010
Quoting Oren Laadan (orenl at cs.columbia.edu):
> Checkpoint and restore task->fs. Tasks sharing task->fs will
> share them again after restart.
>
> Original patch by Serge Hallyn <serue at us.ibm.com>
>
> Changelog:
> Jan 25: [orenl] Addressed comments by .. myself:
> - add leak detection
> - change order of save/restore of chroot and cwd
> - save/restore fs only after file-table and mm
> - rename functions to adapt existing conventions
> Dec 28: [serge] Addressed comments by Oren (and Dave)
> - define and use {get,put}_fs_struct helpers
> - fix locking comment
> - define ckpt_read_fname() and use in checkpoint/files.c
>
> Signed-off-by: Oren Laadan <orenl at cs.columbia.edu>
> Signed-off-by: Serge E. Hallyn <serue at us.ibm.com>
Thanks, Oren. Looks good to me.
-serge
> ---
> checkpoint/files.c | 204 +++++++++++++++++++++++++++++++++++++++-
> checkpoint/objhash.c | 34 +++++++
> checkpoint/process.c | 17 ++++
> fs/fs_struct.c | 21 ++++
> fs/open.c | 53 ++++++-----
> include/linux/checkpoint.h | 8 ++-
> include/linux/checkpoint_hdr.h | 12 +++
> include/linux/fs.h | 4 +
> include/linux/fs_struct.h | 2 +
> 9 files changed, 329 insertions(+), 26 deletions(-)
>
> diff --git a/checkpoint/files.c b/checkpoint/files.c
> index ff486cd..797ce40 100644
> --- a/checkpoint/files.c
> +++ b/checkpoint/files.c
> @@ -15,6 +15,9 @@
> #include <linux/module.h>
> #include <linux/sched.h>
> #include <linux/file.h>
> +#include <linux/namei.h>
> +#include <linux/fs_struct.h>
> +#include <linux/fs.h>
> #include <linux/fdtable.h>
> #include <linux/fsnotify.h>
> #include <linux/pipe_fs_i.h>
> @@ -387,6 +390,62 @@ int checkpoint_obj_file_table(struct ckpt_ctx *ctx, struct task_struct *t)
> return objref;
> }
>
> +int checkpoint_obj_fs(struct ckpt_ctx *ctx, struct task_struct *t)
> +{
> + struct fs_struct *fs;
> + int fs_objref;
> +
> + task_lock(current);
> + fs = t->fs;
> + get_fs_struct(fs);
> + task_unlock(current);
> +
> + fs_objref = checkpoint_obj(ctx, fs, CKPT_OBJ_FS);
> + put_fs_struct(fs);
> +
> + return fs_objref;
> +}
> +
> +/* called with fs refcount bumped so it won't disappear */
> +static int do_checkpoint_fs(struct ckpt_ctx *ctx, struct fs_struct *fs)
> +{
> + struct ckpt_hdr_fs *h;
> + struct fs_struct *fscopy;
> + int ret;
> +
> + h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_FS);
> + if (!h)
> + return -ENOMEM;
> + ret = ckpt_write_obj(ctx, &h->h);
> + ckpt_hdr_put(ctx, h);
> + if (ret)
> + return ret;
> +
> + fscopy = copy_fs_struct(fs);
> + if (!fs)
> + return -ENOMEM;
> +
> + ret = checkpoint_fname(ctx, &fscopy->pwd, &ctx->root_fs_path);
> + if (ret < 0) {
> + ckpt_err(ctx, ret, "%(T)writing path of cwd");
> + goto out;
> + }
> + ret = checkpoint_fname(ctx, &fscopy->root, &ctx->root_fs_path);
> + if (ret < 0) {
> + ckpt_err(ctx, ret, "%(T)writing path of fs root");
> + goto out;
> + }
> + ret = 0;
> +out:
> + free_fs_struct(fscopy);
> + return ret;
> +}
> +
> +int checkpoint_fs(struct ckpt_ctx *ctx, void *ptr)
> +{
> + return do_checkpoint_fs(ctx, (struct fs_struct *) ptr);
> +}
> +
> /***********************************************************************
> * Collect
> */
> @@ -473,10 +532,41 @@ int ckpt_collect_file_table(struct ckpt_ctx *ctx, struct task_struct *t)
> return ret;
> }
>
> +int ckpt_collect_fs(struct ckpt_ctx *ctx, struct task_struct *t)
> +{
> + struct fs_struct *fs;
> + int ret;
> +
> + task_lock(t);
> + fs = t->fs;
> + get_fs_struct(fs);
> + task_unlock(t);
> +
> + ret = ckpt_obj_collect(ctx, fs, CKPT_OBJ_FS);
> +
> + put_fs_struct(fs);
> + return ret;
> +}
> +
> /**************************************************************************
> * Restart
> */
>
> +static int ckpt_read_fname(struct ckpt_ctx *ctx, char **fname)
> +{
> + int len;
> +
> + len = ckpt_read_payload(ctx, (void **) fname,
> + PATH_MAX, CKPT_HDR_FILE_NAME);
> + if (len < 0)
> + return len;
> +
> + (*fname)[len - 1] = '\0'; /* always play if safe */
> + ckpt_debug("read filename '%s'\n", *fname);
> +
> + return len;
> +}
> +
> /**
> * restore_open_fname - read a file name and open a file
> * @ctx: checkpoint context
> @@ -492,11 +582,9 @@ struct file *restore_open_fname(struct ckpt_ctx *ctx, int flags)
> if (flags & (O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC))
> return ERR_PTR(-EINVAL);
>
> - len = ckpt_read_payload(ctx, (void **) &fname,
> - PATH_MAX, CKPT_HDR_FILE_NAME);
> + len = ckpt_read_fname(ctx, &fname);
> if (len < 0)
> return ERR_PTR(len);
> - fname[len - 1] = '\0'; /* always play if safe */
> ckpt_debug("fname '%s' flags %#x\n", fname, flags);
>
> file = filp_open(fname, flags, 0);
> @@ -836,3 +924,113 @@ int restore_obj_file_table(struct ckpt_ctx *ctx, int files_objref)
>
> return 0;
> }
> +
> +/*
> + * Called by task restore code to set the restarted task's
> + * current->fs to an entry on the hash
> + */
> +int restore_obj_fs(struct ckpt_ctx *ctx, int fs_objref)
> +{
> + struct fs_struct *newfs, *oldfs;
> +
> + newfs = ckpt_obj_fetch(ctx, fs_objref, CKPT_OBJ_FS);
> + if (IS_ERR(newfs))
> + return PTR_ERR(newfs);
> +
> + task_lock(current);
> + get_fs_struct(newfs);
> + oldfs = current->fs;
> + current->fs = newfs;
> + task_unlock(current);
> + put_fs_struct(oldfs);
> +
> + return 0;
> +}
> +
> +static int restore_chroot(struct ckpt_ctx *ctx, struct fs_struct *fs, char *name)
> +{
> + struct nameidata nd;
> + int ret;
> +
> + ckpt_debug("attempting chroot to %s\n", name);
> + ret = path_lookup(name, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &nd);
> + if (ret) {
> + ckpt_err(ctx, ret, "%(T)Opening chroot dir %s", name);
> + return ret;
> + }
> + ret = do_chroot(fs, &nd.path);
> + path_put(&nd.path);
> + if (ret) {
> + ckpt_err(ctx, ret, "%(T)Setting chroot %s", name);
> + return ret;
> + }
> + return 0;
> +}
> +
> +static int restore_cwd(struct ckpt_ctx *ctx, struct fs_struct *fs, char *name)
> +{
> + struct nameidata nd;
> + int ret;
> +
> + ckpt_debug("attempting chdir to %s\n", name);
> + ret = path_lookup(name, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &nd);
> + if (ret) {
> + ckpt_err(ctx, ret, "%(T)Opening cwd %s", name);
> + return ret;
> + }
> + ret = do_chdir(fs, &nd.path);
> + path_put(&nd.path);
> + if (ret) {
> + ckpt_err(ctx, ret, "%(T)Setting cwd %s", name);
> + return ret;
> + }
> + return 0;
> +}
> +
> +/*
> + * Called by objhash when it runs into a CKPT_OBJ_FS entry. Creates
> + * an fs_struct with desired chroot/cwd and places it in the hash.
> + */
> +static struct fs_struct *do_restore_fs(struct ckpt_ctx *ctx)
> +{
> + struct ckpt_hdr_fs *h;
> + struct fs_struct *fs;
> + char *path;
> + int ret = 0;
> +
> + h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_FS);
> + if (IS_ERR(h))
> + return ERR_PTR(PTR_ERR(h));
> + ckpt_hdr_put(ctx, h);
> +
> + fs = copy_fs_struct(current->fs);
> + if (!fs)
> + return ERR_PTR(-ENOMEM);
> +
> + ret = ckpt_read_fname(ctx, &path);
> + if (ret < 0)
> + goto out;
> + ret = restore_cwd(ctx, fs, path);
> + kfree(path);
> + if (ret)
> + goto out;
> +
> + ret = ckpt_read_fname(ctx, &path);
> + if (ret < 0)
> + goto out;
> + ret = restore_chroot(ctx, fs, path);
> + kfree(path);
> +
> +out:
> + if (ret) {
> + free_fs_struct(fs);
> + return ERR_PTR(ret);
> + }
> + return fs;
> +}
> +
> +void *restore_fs(struct ckpt_ctx *ctx)
> +{
> + return (void *) do_restore_fs(ctx);
> +}
> +
> diff --git a/checkpoint/objhash.c b/checkpoint/objhash.c
> index 782661d..190983b 100644
> --- a/checkpoint/objhash.c
> +++ b/checkpoint/objhash.c
> @@ -15,6 +15,7 @@
> #include <linux/hash.h>
> #include <linux/file.h>
> #include <linux/fdtable.h>
> +#include <linux/fs_struct.h>
> #include <linux/sched.h>
> #include <linux/kref.h>
> #include <linux/ipc_namespace.h>
> @@ -127,6 +128,29 @@ static int obj_mm_users(void *ptr)
> return atomic_read(&((struct mm_struct *) ptr)->mm_users);
> }
>
> +static int obj_fs_grab(void *ptr)
> +{
> + get_fs_struct((struct fs_struct *) ptr);
> + return 0;
> +}
> +
> +static void obj_fs_drop(void *ptr, int lastref)
> +{
> + put_fs_struct((struct fs_struct *) ptr);
> +}
> +
> +static int obj_fs_users(void *ptr)
> +{
> + /*
> + * It's safe to not use fs->lock because the fs referenced.
> + * It's also sufficient for leak detection: with no leak the
> + * count can't change; with a leak it will be too big already
> + * (even if it's about to grow), and if it's about to shrink
> + * then it's as if we sampled the count a bit earlier.
> + */
> + return ((struct fs_struct *) ptr)->users;
> +}
> +
> static int obj_sighand_grab(void *ptr)
> {
> atomic_inc(&((struct sighand_struct *) ptr)->count);
> @@ -361,6 +385,16 @@ static struct ckpt_obj_ops ckpt_obj_ops[] = {
> .checkpoint = checkpoint_mm,
> .restore = restore_mm,
> },
> + /* struct fs_struct */
> + {
> + .obj_name = "FS",
> + .obj_type = CKPT_OBJ_FS,
> + .ref_drop = obj_fs_drop,
> + .ref_grab = obj_fs_grab,
> + .ref_users = obj_fs_users,
> + .checkpoint = checkpoint_fs,
> + .restore = restore_fs,
> + },
> /* sighand object */
> {
> .obj_name = "SIGHAND",
> diff --git a/checkpoint/process.c b/checkpoint/process.c
> index 6655cc7..94cd0c1 100644
> --- a/checkpoint/process.c
> +++ b/checkpoint/process.c
> @@ -232,6 +232,7 @@ static int checkpoint_task_objs(struct ckpt_ctx *ctx, struct task_struct *t)
> struct ckpt_hdr_task_objs *h;
> int files_objref;
> int mm_objref;
> + int fs_objref;
> int sighand_objref;
> int signal_objref;
> int first, ret;
> @@ -272,6 +273,13 @@ static int checkpoint_task_objs(struct ckpt_ctx *ctx, struct task_struct *t)
> return mm_objref;
> }
>
> + /* note: this must come *after* file-table and mm */
> + fs_objref = checkpoint_obj_fs(ctx, t);
> + if (fs_objref < 0) {
> + ckpt_err(ctx, fs_objref, "%(T)process fs\n");
> + return fs_objref;
> + }
> +
> sighand_objref = checkpoint_obj_sighand(ctx, t);
> ckpt_debug("sighand: objref %d\n", sighand_objref);
> if (sighand_objref < 0) {
> @@ -299,6 +307,7 @@ static int checkpoint_task_objs(struct ckpt_ctx *ctx, struct task_struct *t)
> return -ENOMEM;
> h->files_objref = files_objref;
> h->mm_objref = mm_objref;
> + h->fs_objref = fs_objref;
> h->sighand_objref = sighand_objref;
> h->signal_objref = signal_objref;
> ret = ckpt_write_obj(ctx, &h->h);
> @@ -477,6 +486,9 @@ int ckpt_collect_task(struct ckpt_ctx *ctx, struct task_struct *t)
> ret = ckpt_collect_mm(ctx, t);
> if (ret < 0)
> return ret;
> + ret = ckpt_collect_fs(ctx, t);
> + if (ret < 0)
> + return ret;
> ret = ckpt_collect_sighand(ctx, t);
>
> return ret;
> @@ -645,6 +657,11 @@ static int restore_task_objs(struct ckpt_ctx *ctx)
> if (ret < 0)
> goto out;
>
> + ret = restore_obj_fs(ctx, h->fs_objref);
> + ckpt_debug("fs: ret %d (%p)\n", ret, current->fs);
> + if (ret < 0)
> + return ret;
> +
> ret = restore_obj_sighand(ctx, h->sighand_objref);
> ckpt_debug("sighand: ret %d (%p)\n", ret, current->sighand);
> if (ret < 0)
> diff --git a/fs/fs_struct.c b/fs/fs_struct.c
> index eee0590..2a4c6f5 100644
> --- a/fs/fs_struct.c
> +++ b/fs/fs_struct.c
> @@ -6,6 +6,27 @@
> #include <linux/fs_struct.h>
>
> /*
> + * call with owning task locked
> + */
> +void get_fs_struct(struct fs_struct *fs)
> +{
> + write_lock(&fs->lock);
> + fs->users++;
> + write_unlock(&fs->lock);
> +}
> +
> +void put_fs_struct(struct fs_struct *fs)
> +{
> + int kill;
> +
> + write_lock(&fs->lock);
> + kill = !--fs->users;
> + write_unlock(&fs->lock);
> + if (kill)
> + free_fs_struct(fs);
> +}
> +
> +/*
> * Replace the fs->{rootmnt,root} with {mnt,dentry}. Put the old values.
> * It can block.
> */
> diff --git a/fs/open.c b/fs/open.c
> index 4f01e06..75c395d 100644
> --- a/fs/open.c
> +++ b/fs/open.c
> @@ -524,6 +524,18 @@ SYSCALL_DEFINE2(access, const char __user *, filename, int, mode)
> return sys_faccessat(AT_FDCWD, filename, mode);
> }
>
> +int do_chdir(struct fs_struct *fs, struct path *path)
> +{
> + int error;
> +
> + error = inode_permission(path->dentry->d_inode, MAY_EXEC | MAY_ACCESS);
> + if (error)
> + return error;
> +
> + set_fs_pwd(fs, path);
> + return 0;
> +}
> +
> SYSCALL_DEFINE1(chdir, const char __user *, filename)
> {
> struct path path;
> @@ -531,17 +543,10 @@ SYSCALL_DEFINE1(chdir, const char __user *, filename)
>
> error = user_path_dir(filename, &path);
> if (error)
> - goto out;
> -
> - error = inode_permission(path.dentry->d_inode, MAY_EXEC | MAY_ACCESS);
> - if (error)
> - goto dput_and_out;
> -
> - set_fs_pwd(current->fs, &path);
> + return error;
>
> -dput_and_out:
> + error = do_chdir(current->fs, &path);
> path_put(&path);
> -out:
> return error;
> }
>
> @@ -571,6 +576,21 @@ out:
> return error;
> }
>
> +int do_chroot(struct fs_struct *fs, struct path *path)
> +{
> + int error;
> +
> + error = inode_permission(path->dentry->d_inode, MAY_EXEC | MAY_ACCESS);
> + if (error)
> + return error;
> +
> + if (!capable(CAP_SYS_CHROOT))
> + return -EPERM;
> +
> + set_fs_root(fs, path);
> + return 0;
> +}
> +
> SYSCALL_DEFINE1(chroot, const char __user *, filename)
> {
> struct path path;
> @@ -578,21 +598,10 @@ SYSCALL_DEFINE1(chroot, const char __user *, filename)
>
> error = user_path_dir(filename, &path);
> if (error)
> - goto out;
> -
> - error = inode_permission(path.dentry->d_inode, MAY_EXEC | MAY_ACCESS);
> - if (error)
> - goto dput_and_out;
> -
> - error = -EPERM;
> - if (!capable(CAP_SYS_CHROOT))
> - goto dput_and_out;
> + return error;
>
> - set_fs_root(current->fs, &path);
> - error = 0;
> -dput_and_out:
> + error = do_chroot(current->fs, &path);
> path_put(&path);
> -out:
> return error;
> }
>
> diff --git a/include/linux/checkpoint.h b/include/linux/checkpoint.h
> index 90c8436..b2cbd30 100644
> --- a/include/linux/checkpoint.h
> +++ b/include/linux/checkpoint.h
> @@ -10,7 +10,7 @@
> * distribution for more details.
> */
>
> -#define CHECKPOINT_VERSION 4
> +#define CHECKPOINT_VERSION 5
>
> /* checkpoint user flags */
> #define CHECKPOINT_SUBTREE 0x1
> @@ -245,6 +245,12 @@ extern int checkpoint_file_common(struct ckpt_ctx *ctx, struct file *file,
> extern int restore_file_common(struct ckpt_ctx *ctx, struct file *file,
> struct ckpt_hdr_file *h);
>
> +extern int ckpt_collect_fs(struct ckpt_ctx *ctx, struct task_struct *t);
> +extern int checkpoint_obj_fs(struct ckpt_ctx *ctx, struct task_struct *t);
> +extern int restore_obj_fs(struct ckpt_ctx *ctx, int fs_objref);
> +extern int checkpoint_fs(struct ckpt_ctx *ctx, void *ptr);
> +extern void *restore_fs(struct ckpt_ctx *ctx);
> +
> /* credentials */
> extern int checkpoint_groupinfo(struct ckpt_ctx *ctx, void *ptr);
> extern int checkpoint_user(struct ckpt_ctx *ctx, void *ptr);
> diff --git a/include/linux/checkpoint_hdr.h b/include/linux/checkpoint_hdr.h
> index a1b67c8..53ae8bf 100644
> --- a/include/linux/checkpoint_hdr.h
> +++ b/include/linux/checkpoint_hdr.h
> @@ -144,6 +144,9 @@ enum {
> CKPT_HDR_MM_CONTEXT,
> #define CKPT_HDR_MM_CONTEXT CKPT_HDR_MM_CONTEXT
>
> + CKPT_HDR_FS, /* must be after file-table, mm */
> +#define CKPT_HDR_FS CKPT_HDR_FS
> +
> CKPT_HDR_IPC = 501,
> #define CKPT_HDR_IPC CKPT_HDR_IPC
> CKPT_HDR_IPC_SHM,
> @@ -218,6 +221,8 @@ enum obj_type {
> #define CKPT_OBJ_FILE CKPT_OBJ_FILE
> CKPT_OBJ_MM,
> #define CKPT_OBJ_MM CKPT_OBJ_MM
> + CKPT_OBJ_FS,
> +#define CKPT_OBJ_FS CKPT_OBJ_FS
> CKPT_OBJ_SIGHAND,
> #define CKPT_OBJ_SIGHAND CKPT_OBJ_SIGHAND
> CKPT_OBJ_SIGNAL,
> @@ -452,6 +457,7 @@ struct ckpt_hdr_task_objs {
>
> __s32 files_objref;
> __s32 mm_objref;
> + __s32 fs_objref;
> __s32 sighand_objref;
> __s32 signal_objref;
> } __attribute__((aligned(8)));
> @@ -489,6 +495,12 @@ enum restart_block_type {
> };
>
> /* file system */
> +struct ckpt_hdr_fs {
> + struct ckpt_hdr h;
> + /* char *fs_root */
> + /* char *fs_pwd */
> +} __attribute__((aligned(8)));
> +
> struct ckpt_hdr_file_table {
> struct ckpt_hdr h;
> __s32 fdt_nfds;
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index 089549b..c18b864 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -1826,6 +1826,10 @@ extern void drop_collected_mounts(struct vfsmount *);
>
> extern int vfs_statfs(struct dentry *, struct kstatfs *);
>
> +struct fs_struct;
> +extern int do_chdir(struct fs_struct *fs, struct path *path);
> +extern int do_chroot(struct fs_struct *fs, struct path *path);
> +
> extern int current_umask(void);
>
> /* /sys/fs */
> diff --git a/include/linux/fs_struct.h b/include/linux/fs_struct.h
> index 78a05bf..a73cbcb 100644
> --- a/include/linux/fs_struct.h
> +++ b/include/linux/fs_struct.h
> @@ -20,5 +20,7 @@ extern struct fs_struct *copy_fs_struct(struct fs_struct *);
> extern void free_fs_struct(struct fs_struct *);
> extern void daemonize_fs_struct(void);
> extern int unshare_fs_struct(void);
> +extern void get_fs_struct(struct fs_struct *);
> +extern void put_fs_struct(struct fs_struct *);
>
> #endif /* _LINUX_FS_STRUCT_H */
> --
> 1.6.3.3
_______________________________________________
Containers mailing list
Containers at lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers
More information about the Devel
mailing list