[Devel] Re: [RFC v14-rc2][PATCH 17/29] Checkpoint open pipes
Serge E. Hallyn
serue at us.ibm.com
Wed Apr 1 12:47:26 PDT 2009
Quoting Oren Laadan (orenl at cs.columbia.edu):
> A pipe is essentially a double-headed inode with a buffer attached to
> it. We checkpoint the pipe buffer only once, as soon as we hit one
> side of the pipe, regardless of whether it is the read or the write end.
>
> To checkpoint a file descriptor that refers to a pipe (either end), we
> first lookup the inode in the hash table:
>
> If not found, it is the first encounter of this pipe. Besides the file
> descriptor, we also (a) save the pipe data, and (b) register the pipe
> inode in the hash. We save the 'objref' of the inode in '->fd_objref'
> of the file descriptor. The file descriptor type becomes CR_FD_PIPE.
>
> If found, it is the second encounter of this pipe, namely, when we hit
> the other end of the same pipe. In this case we need only record the
> reference ('objref') to the inode that we had saved before, and the
> file descriptor type is changed to CR_FD_OBJREF.
>
> The type CR_FD_PIPE will indicate to the kernel to create a new pipe;
> since both ends are created at the same time, one end will be used,
> and the other end will be deposited in the hash table for later use.
> The type CR_FD_OBJREF will indicate that the corresponding file
> descriptor is already setup and registered in the hash using the
> '->fd_objref' that it had been assigned.
>
> The format of the pipe data is as follows:
>
> struct cr_hdr_fd_pipe {
> __s32 nr_bufs;
> }
>
> cr_hdr + cr_hdr_fd_ent
> cr_hdr + cr_hdr_fd_data
> cr_hdr + cr_hdr_fd_pipe -> # buffers
> cr_hdr + cr_hdr_buffer -> 1st buffer
> cr_hdr + cr_hdr_buffer -> 2nd buffer
> cr_hdr + cr_hdr_buffer -> 3rd buffer
> ...
>
> Changelog[v14]:
> - Use 'fd_type' instead of 'hh->fd_objref' in cr_write_fd_data()
> - Revert change to pr_debug(), back to cr_debug()
> - Discard the 'h.parent' field
> - Check whether calls to cr_hbuf_get() fail
> - Test that a pipe's inode != ctx->file's inode to prevent deadlock
>
> Signed-off-by: Oren Laadan <orenl at cs.columbia.edu>
Acked-by: Serge Hallyn <serue at us.ibm.com>
But:
> ---
> checkpoint/ckpt_file.c | 2 +
> fs/pipe.c | 113 ++++++++++++++++++++++++++++++++++++++++
> include/linux/checkpoint_hdr.h | 8 +++-
> 3 files changed, 122 insertions(+), 1 deletions(-)
>
> diff --git a/checkpoint/ckpt_file.c b/checkpoint/ckpt_file.c
> index 0fe68bf..dd26b3d 100644
> --- a/checkpoint/ckpt_file.c
> +++ b/checkpoint/ckpt_file.c
> @@ -12,6 +12,7 @@
> #include <linux/sched.h>
> #include <linux/file.h>
> #include <linux/fdtable.h>
> +#include <linux/pipe_fs_i.h>
> #include <linux/checkpoint.h>
> #include <linux/checkpoint_hdr.h>
>
> @@ -72,6 +73,7 @@ int cr_scan_fds(struct files_struct *files, int **fdtable)
> return n;
> }
>
> +
> static int cr_write_file_generic(struct cr_ctx *ctx, struct file *file,
> struct cr_hdr_file *hh)
> {
> diff --git a/fs/pipe.c b/fs/pipe.c
> index 14f502b..0c3f391 100644
> --- a/fs/pipe.c
> +++ b/fs/pipe.c
> @@ -22,6 +22,9 @@
> #include <asm/uaccess.h>
> #include <asm/ioctls.h>
>
> +#include <linux/checkpoint.h>
> +#include <linux/checkpoint_hdr.h>
> +
> /*
> * We use a start+len construction, which provides full use of the
> * allocated memory.
> @@ -771,6 +774,113 @@ pipe_rdwr_open(struct inode *inode, struct file *filp)
> return 0;
> }
>
> +/* cr_write_pipebuf - dump contents of a pipe/fifo (assume i_mutex taken) */
> +static int cr_write_pipebuf(struct cr_ctx *ctx, struct pipe_inode_info *pipe)
> +{
> + struct cr_hdr h;
> + void *kbuf, *addr;
> + int i, ret = 0;
> +
> + kbuf = (void *) __get_free_page(GFP_KERNEL);
> + if (!kbuf)
> + return -ENOMEM;
> +
> + /* this is a simplified fs/pipe.c:read_pipe() */
The function is actually called pipe_read() :)
> +
> + for (i = 0; i < pipe->nrbufs; i++) {
> + int nn = (pipe->curbuf + i) & (PIPE_BUFFERS-1);
> + struct pipe_buffer *pbuf = pipe->bufs + nn;
> + const struct pipe_buf_operations *ops = pbuf->ops;
> +
> + ret = ops->confirm(pipe, pbuf);
> + if (ret < 0)
> + break;
Not that it seems to matter, but pipe_read() also returns an error
when ret > 0, not only when ret < 0.
> +
> + addr = ops->map(pipe, pbuf, 1);
> + memcpy(kbuf, addr + pbuf->offset, pbuf->len);
> + ops->unmap(pipe, pbuf, addr);
> +
> + h.type = CR_HDR_BUFFER;
> + h.len = pbuf->len;
> +
> + ret = cr_write_obj(ctx, &h, kbuf);
> + if (ret < 0)
> + break;
> + }
> +
> + free_page((unsigned long) kbuf);
> + return ret;
> +}
> +
> +/* cr_write_pipe - dump pipe (assume i_mutex taken) */
> +static int cr_write_pipe(struct cr_ctx *ctx, struct inode *inode)
> +{
> + struct cr_hdr h;
> + struct cr_hdr_fd_pipe *hh;
> + struct pipe_inode_info *pipe = inode->i_pipe;
> + int ret;
> +
> + h.type = CR_HDR_FD_PIPE;
> + h.len = sizeof(*hh);
> +
> + hh = cr_hbuf_get(ctx, sizeof(*hh));
> + if (!hh)
> + return -ENOMEM;
> +
> + hh->nr_bufs = pipe->nrbufs;
> +
> + ret = cr_write_obj(ctx, &h, hh);
> + cr_hbuf_put(ctx, sizeof(*hh));
> + if (ret < 0)
> + return ret;
> +
> + return cr_write_pipebuf(ctx, pipe);
> +}
> +
> +static int pipe_file_checkpoint(struct cr_ctx *ctx,
> + struct file *file, struct cr_hdr_file *hh)
> +{
> + struct cr_hdr h;
> + struct inode *inode = file->f_dentry->d_inode;
> + int new, objref;
> + int ret;
> +
> + /*
> + * We take the inode's mutex and later will call vfs_write(),
> + * which also takes an inode's mutex. To avoid deadlock, make
> + * sure that the two inodes are distinct.
> + */
> + if (ctx->file->f_dentry->d_inode == inode) {
> + pr_warning("c/r: writing to pipe that is checkpointed "
> + "may result in a deadlock ... aborting\n");
> + return -EDEADLK;
> + }
> +
> + h.type = CR_HDR_FILE;
> + h.len = sizeof(*hh);
> +
> + new = cr_obj_add_ptr(ctx, inode, &objref, CR_OBJ_INODE, 0);
> + cr_debug("objref %d inode %p new %d\n", objref, inode, new);
> + if (new < 0)
> + return new;
> +
> + hh->fd_type = (new ? CR_FD_PIPE : CR_FD_OBJREF);
The git commit message has a good explanation, but it's worth a comment
in the code as well: the first time we encounter a pipe we mark it
CR_FD_PIPE, and the second time CR_FD_OBJREF.
> + hh->fd_objref = objref;
> +
> + ret = cr_write_obj(ctx, &h, hh);
> + if (ret < 0)
> + return ret;
> +
> + if (new) {
> + mutex_lock(&inode->i_mutex);
> + ret = cr_write_pipe(ctx, inode);
> + mutex_unlock(&inode->i_mutex);
> + }
> +
> + return ret;
> +}
> +
> +
> /*
> * The file_operations structs are not static because they
> * are also used in linux/fs/fifo.c to do operations on FIFOs.
> @@ -787,6 +897,7 @@ const struct file_operations read_pipefifo_fops = {
> .open = pipe_read_open,
> .release = pipe_read_release,
> .fasync = pipe_read_fasync,
> + .checkpoint = pipe_file_checkpoint,
> };
>
> const struct file_operations write_pipefifo_fops = {
> @@ -799,6 +910,7 @@ const struct file_operations write_pipefifo_fops = {
> .open = pipe_write_open,
> .release = pipe_write_release,
> .fasync = pipe_write_fasync,
> + .checkpoint = pipe_file_checkpoint,
> };
>
> const struct file_operations rdwr_pipefifo_fops = {
> @@ -812,6 +924,7 @@ const struct file_operations rdwr_pipefifo_fops = {
> .open = pipe_rdwr_open,
> .release = pipe_rdwr_release,
> .fasync = pipe_rdwr_fasync,
> + .checkpoint = pipe_file_checkpoint,
> };
>
> struct pipe_inode_info * alloc_pipe_info(struct inode *inode)
> diff --git a/include/linux/checkpoint_hdr.h b/include/linux/checkpoint_hdr.h
> index 9ad845d..ce5d880 100644
> --- a/include/linux/checkpoint_hdr.h
> +++ b/include/linux/checkpoint_hdr.h
> @@ -57,6 +57,7 @@ enum {
> CR_HDR_FD_TABLE = 301,
> CR_HDR_FD_ENT,
> CR_HDR_FILE,
> + CR_HDR_FD_PIPE,
>
> CR_HDR_TAIL = 5001
> };
> @@ -152,7 +153,8 @@ struct cr_hdr_fd_ent {
> /* fd types */
> enum fd_type {
> CR_FD_OBJREF = 1,
> - CR_FD_GENERIC
> + CR_FD_GENERIC,
> + CR_FD_PIPE,
> };
>
> struct cr_hdr_file {
> @@ -165,4 +167,8 @@ struct cr_hdr_file {
> __u64 f_version;
> } __attribute__((aligned(8)));
>
> +struct cr_hdr_fd_pipe {
> + __s32 nr_bufs;
> +} __attribute__((aligned(8)));
> +
> #endif /* _CHECKPOINT_CKPT_HDR_H_ */
> --
> 1.5.4.3
_______________________________________________
Containers mailing list
Containers at lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers
More information about the Devel mailing list