[Devel] Re: [RFC v4][PATCH 8/9] File descriprtors (dump)
MinChan Kim
minchan.kim at gmail.com
Wed Sep 10 22:02:57 PDT 2008
On Tue, Sep 9, 2008 at 4:42 PM, Oren Laadan <orenl at cs.columbia.edu> wrote:
> Dump the files_struct of a task with 'struct cr_hdr_files', followed by
> all open file descriptors. Since FDs can be shared, they are assigned an
> objref and registered in the object hash.
>
> For each open FD there is a 'struct cr_hdr_fd_ent' with the FD, its objref
> and its close-on-exec property. If the FD is to be saved (first time)
> then this is followed by a 'struct cr_hdr_fd_data' with the FD state.
> Then will come the next FD and so on.
>
> This patch only handles basic FDs - regular files, directories and also
> symbolic links.
>
> Signed-off-by: Oren Laadan <orenl at cs.columbia.edu>
> ---
> checkpoint/Makefile | 2 +-
> checkpoint/checkpoint.c | 4 +
> checkpoint/ckpt_file.c | 221 ++++++++++++++++++++++++++++++++++++++++++++++
> checkpoint/ckpt_file.h | 17 ++++
> include/linux/ckpt.h | 7 +-
> include/linux/ckpt_hdr.h | 34 +++++++-
> 6 files changed, 280 insertions(+), 5 deletions(-)
> create mode 100644 checkpoint/ckpt_file.c
> create mode 100644 checkpoint/ckpt_file.h
>
> diff --git a/checkpoint/Makefile b/checkpoint/Makefile
> index 9843fb9..7496695 100644
> --- a/checkpoint/Makefile
> +++ b/checkpoint/Makefile
> @@ -3,4 +3,4 @@
> #
>
> obj-$(CONFIG_CHECKPOINT_RESTART) += sys.o checkpoint.o restart.o objhash.o \
> - ckpt_mem.o rstr_mem.o
> + ckpt_mem.o rstr_mem.o ckpt_file.o
> diff --git a/checkpoint/checkpoint.c b/checkpoint/checkpoint.c
> index 4dae775..aebbf22 100644
> --- a/checkpoint/checkpoint.c
> +++ b/checkpoint/checkpoint.c
> @@ -217,6 +217,10 @@ static int cr_write_task(struct cr_ctx *ctx, struct task_struct *t)
> cr_debug("memory: ret %d\n", ret);
> if (ret < 0)
> goto out;
> + ret = cr_write_files(ctx, t);
> + cr_debug("files: ret %d\n", ret);
> + if (ret < 0)
> + goto out;
> ret = cr_write_thread(ctx, t);
> cr_debug("thread: ret %d\n", ret);
> if (ret < 0)
> diff --git a/checkpoint/ckpt_file.c b/checkpoint/ckpt_file.c
> new file mode 100644
> index 0000000..ca58b28
> --- /dev/null
> +++ b/checkpoint/ckpt_file.c
> @@ -0,0 +1,221 @@
> +/*
> + * Checkpoint file descriptors
> + *
> + * Copyright (C) 2008 Oren Laadan
> + *
> + * This file is subject to the terms and conditions of the GNU General Public
> + * License. See the file COPYING in the main directory of the Linux
> + * distribution for more details.
> + */
> +
> +#include <linux/kernel.h>
> +#include <linux/sched.h>
> +#include <linux/file.h>
> +#include <linux/fdtable.h>
> +#include <linux/ckpt.h>
> +#include <linux/ckpt_hdr.h>
> +
> +#include "ckpt_file.h"
> +
> +#define CR_DEFAULT_FDTABLE 256 /* an initial guess */
> +
> +/**
> + * cr_scan_fds - scan file table and construct array of open fds
> + * @files: files_struct pointer
> + * @fdtable: (output) array of open fds
> + * @return: the number of open fds found
> + *
> + * Allocates the file descriptors array (*fdtable), caller should free
> + */
> +int cr_scan_fds(struct files_struct *files, int **fdtable)
> +{
> + struct fdtable *fdt;
> + int *fdlist;
> + int i, n, max;
> +
> + n = 0;
> + max = CR_DEFAULT_FDTABLE;
max is read-only variable so that you don't need to declare local variable.
You can use macro.
> + fdlist = kmalloc(max * sizeof(*fdlist), GFP_KERNEL);
> + if (!fdlist)
> + return -ENOMEM;
> +
> + spin_lock(&files->file_lock);
> + fdt = files_fdtable(files);
> + for (i = 0; i < fdt->max_fds; i++) {
> + if (!fcheck_files(files, i))
> + continue;
> + if (n == max) {
> + /* fcheck_files() is safe with drop/re-acquire
> + * of the lock, as it tests: fd < max_fds */
> + spin_unlock(&files->file_lock);
> + max *= 2;
> + if (max < 0) { /* overflow ? */
> + n = -EMFILE;
> + goto out;
> + }
> + fdlist = krealloc(fdlist, max, GFP_KERNEL);
> + if (!fdlist) {
> + n = -ENOMEM;
> + goto out;
> + }
> + spin_lock(&files->file_lock);
> + }
> + fdlist[n++] = i;
> + }
> + spin_unlock(&files->file_lock);
> +
> + *fdtable = fdlist;
> + out:
> + return n;
> +}
> +
> +/* cr_write_fd_data - dump the state of a given file pointer */
> +static int cr_write_fd_data(struct cr_ctx *ctx, struct file *file, int parent)
> +{
> + struct cr_hdr h;
> + struct cr_hdr_fd_data *hh = cr_hbuf_get(ctx, sizeof(*hh));
> + struct dentry *dent = file->f_dentry;
> + struct inode *inode = dent->d_inode;
> + enum fd_type fd_type;
> + int ret;
> +
> + h.type = CR_HDR_FD_DATA;
> + h.len = sizeof(*hh);
> + h.parent = parent;
> +
> + hh->f_flags = file->f_flags;
> + hh->f_mode = file->f_mode;
> + hh->f_pos = file->f_pos;
> + hh->f_uid = file->f_uid;
> + hh->f_gid = file->f_gid;
> + hh->f_version = file->f_version;
> + /* FIX: need also file->f_owner */
> +
> + switch (inode->i_mode & S_IFMT) {
> + case S_IFREG:
> + fd_type = CR_FD_FILE;
> + break;
> + case S_IFDIR:
> + fd_type = CR_FD_DIR;
> + break;
> + case S_IFLNK:
> + fd_type = CR_FD_LINK;
> + break;
> + default:
> + return -EBADF;
> + }
> +
> + /* FIX: check if the file/dir/link is unlinked */
> + hh->fd_type = fd_type;
> +
> + ret = cr_write_obj(ctx, &h, hh);
> + cr_hbuf_put(ctx, sizeof(*hh));
> + if (ret < 0)
> + return ret;
> +
> + return cr_write_fname(ctx, &file->f_path, ctx->vfsroot);
> +}
> +
> +/**
> + * cr_write_fd_ent - dump the state of a given file descriptor
> + * @ctx: checkpoint context
> + * @files: files_struct pointer
> + * @fd: file descriptor
> + *
> + * Save the state of the file descriptor; look up the actual file pointer
> + * in the hash table, and if found save the matching objref, otherwise call
> + * cr_write_fd_data to dump the file pointer too.
> + */
> +static int
> +cr_write_fd_ent(struct cr_ctx *ctx, struct files_struct *files, int fd)
> +{
> + struct cr_hdr h;
> + struct cr_hdr_fd_ent *hh = cr_hbuf_get(ctx, sizeof(*hh));
> + struct file *file = NULL;
> + struct fdtable *fdt;
> + int coe, objref, new, ret;
> +
> + rcu_read_lock();
> + fdt = files_fdtable(files);
> + file = fcheck_files(files, fd);
> + if (file) {
> + coe = FD_ISSET(fd, fdt->close_on_exec);
> + get_file(file);
> + }
> + rcu_read_unlock();
> +
> + /* sanity check (although this shouldn't happen) */
> + if (!file)
> + return -EBADF;
> +
> + new = cr_obj_add_ptr(ctx, (void *) file, &objref, CR_OBJ_FILE, 0);
> + cr_debug("fd %d objref %d file %p c-o-e %d)\n", fd, objref, file, coe);
> +
> + if (new < 0)
> + return new;
> +
> + h.type = CR_HDR_FD_ENT;
> + h.len = sizeof(*hh);
> + h.parent = 0;
> +
> + hh->objref = objref;
> + hh->fd = fd;
> + hh->close_on_exec = coe;
> +
> + ret = cr_write_obj(ctx, &h, hh);
> + cr_hbuf_put(ctx, sizeof(*hh));
> + if (ret < 0)
> + return ret;
> +
> + /* new==1 if-and-only-if file was newly added to hash */
> + if (new)
> + ret = cr_write_fd_data(ctx, file, objref);
> +
> + fput(file);
> + return ret;
> +}
> +
> +int cr_write_files(struct cr_ctx *ctx, struct task_struct *t)
> +{
> + struct cr_hdr h;
> + struct cr_hdr_files *hh = cr_hbuf_get(ctx, sizeof(*hh));
> + struct files_struct *files;
> + int *fdtable;
> + int nfds, n, ret;
> +
> + h.type = CR_HDR_FILES;
> + h.len = sizeof(*hh);
> + h.parent = task_pid_vnr(t);
> +
> + files = get_files_struct(t);
> +
> + hh->objref = 0; /* will be meaningful with multiple processes */
> +
> + nfds = cr_scan_fds(files, &fdtable);
> + if (nfds < 0) {
> + ret = nfds;
> + goto out;
> + }
> +
> + hh->nfds = nfds;
> +
> + ret = cr_write_obj(ctx, &h, hh);
> + cr_hbuf_put(ctx, sizeof(*hh));
> + if (ret < 0)
> + goto clean;
> +
> + cr_debug("nfds %d\n", nfds);
> + for (n = 0; n < nfds; n++) {
> + ret = cr_write_fd_ent(ctx, files, n);
I think your intention is not 'n' but 'fdtable[n]' in argument.
> + if (ret < 0)
> + break;
> + }
> +
> + clean:
> + kfree(fdtable);
> + out:
> + put_files_struct(files);
> +
> + return ret;
> +}
> diff --git a/checkpoint/ckpt_file.h b/checkpoint/ckpt_file.h
> new file mode 100644
> index 0000000..9dc3eba
> --- /dev/null
> +++ b/checkpoint/ckpt_file.h
> @@ -0,0 +1,17 @@
> +#ifndef _CHECKPOINT_CKPT_FILE_H_
> +#define _CHECKPOINT_CKPT_FILE_H_
> +/*
> + * Checkpoint file descriptors
> + *
> + * Copyright (C) 2008 Oren Laadan
> + *
> + * This file is subject to the terms and conditions of the GNU General Public
> + * License. See the file COPYING in the main directory of the Linux
> + * distribution for more details.
> + */
> +
> +#include <linux/fdtable.h>
> +
> +int cr_scan_fds(struct files_struct *files, int **fdtable);
> +
> +#endif /* _CHECKPOINT_CKPT_FILE_H_ */
> diff --git a/include/linux/ckpt.h b/include/linux/ckpt.h
> index d73f79e..ad46baf 100644
> --- a/include/linux/ckpt.h
> +++ b/include/linux/ckpt.h
> @@ -13,7 +13,7 @@
> #include <linux/path.h>
> #include <linux/fs.h>
>
> -#define CR_VERSION 1
> +#define CR_VERSION 2
>
> struct cr_ctx {
> pid_t pid; /* container identifier */
> @@ -80,11 +80,12 @@ int cr_read_string(struct cr_ctx *ctx, void *str, int len);
> int cr_read_fname(struct cr_ctx *ctx, void *fname, int n);
> struct file *cr_read_open_fname(struct cr_ctx *ctx, int flags, int mode);
>
> +int do_checkpoint(struct cr_ctx *ctx);
> int cr_write_mm(struct cr_ctx *ctx, struct task_struct *t);
> -int cr_read_mm(struct cr_ctx *ctx);
> +int cr_write_files(struct cr_ctx *ctx, struct task_struct *t);
>
> -int do_checkpoint(struct cr_ctx *ctx);
> int do_restart(struct cr_ctx *ctx);
> +int cr_read_mm(struct cr_ctx *ctx);
>
> #define cr_debug(fmt, args...) \
> pr_debug("[CR:%s] " fmt, __func__, ## args)
> diff --git a/include/linux/ckpt_hdr.h b/include/linux/ckpt_hdr.h
> index f064cbb..f868dce 100644
> --- a/include/linux/ckpt_hdr.h
> +++ b/include/linux/ckpt_hdr.h
> @@ -17,7 +17,7 @@
> /*
> * To maintain compatibility between 32-bit and 64-bit architecture flavors,
> * keep data 64-bit aligned: use padding for structure members, and use
> - * __attribute__ ((aligned (8))) for the entire structure.
> + * __attribute__((aligned(8))) for the entire structure.
> */
>
> /* records: generic header */
> @@ -42,6 +42,10 @@ enum {
> CR_HDR_VMA,
> CR_HDR_MM_CONTEXT,
>
> + CR_HDR_FILES = 301,
> + CR_HDR_FD_ENT,
> + CR_HDR_FD_DATA,
> +
> CR_HDR_TAIL = 5001
> };
>
> @@ -112,4 +116,32 @@ struct cr_hdr_vma {
>
> } __attribute__((aligned(8)));
>
> +struct cr_hdr_files {
> + __u32 objref; /* identifier for shared objects */
> + __u32 nfds;
> +} __attribute__((aligned(8)));
> +
> +struct cr_hdr_fd_ent {
> + __u32 objref; /* identifier for shared objects */
> + __s32 fd;
> + __u32 close_on_exec;
> +} __attribute__((aligned(8)));
> +
> +/* fd types */
> +enum fd_type {
> + CR_FD_FILE = 1,
> + CR_FD_DIR,
> + CR_FD_LINK
> +};
> +
> +struct cr_hdr_fd_data {
> + __u16 fd_type;
> + __u16 f_mode;
> + __u32 f_flags;
> + __u32 f_uid;
> + __u32 f_gid;
> + __u64 f_pos;
> + __u64 f_version;
> +} __attribute__((aligned(8)));
> +
> #endif /* _CHECKPOINT_CKPT_HDR_H_ */
> --
> 1.5.4.3
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo at vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/
>
--
Kinds regards,
MinChan Kim
_______________________________________________
Containers mailing list
Containers at lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers
More information about the Devel
mailing list