[Devel] Re: [PATCH 2/2] Add checkpoint/restart support for epoll files.
Oren Laadan
orenl at librato.com
Tue Sep 29 15:56:06 PDT 2009
Matt Helsley wrote:
> Save/restore epoll items during checkpoint/restart respectively.
> kmalloc failures should be dealt with more kindly than just error-out
> because epoll is made to poll many thousands of file descriptors.
> Subsequent patches will change epoll c/r to "chunk" its output/input
> respectively.
[...]
>
> @@ -311,9 +313,11 @@ static int do_checkpoint_file_table(struct ckpt_ctx *ctx,
> }
>
> ret = deferqueue_run(ctx->files_deferq);
> - ckpt_debug("files_deferq ran %d entries\n", ret);
> - if (ret > 0)
> + if (ret > 0) {
> + ckpt_debug("file checkpoint deferred %d work items\n", ret);
> ret = 0;
> + }
> +
With your permission, I'll take this hunk as a separate patch (and
the restore counterpart too), so you can remove it from your patch.
> out:
> kfree(fdtable);
> return ret;
> @@ -604,6 +608,13 @@ static struct restore_file_ops restore_file_ops[] = {
> .file_type = CKPT_FILE_TTY,
> .restore = tty_file_restore,
> },
> +#ifdef CONFIG_EPOLL
> + {
> + .file_name = "EPOLL",
> + .file_type = CKPT_FILE_EPOLL,
> + .restore = ep_file_restore,
> + },
> +#endif
> };
>
> static struct file *do_restore_file(struct ckpt_ctx *ctx)
> @@ -731,9 +742,11 @@ static struct files_struct *do_restore_file_table(struct ckpt_ctx *ctx)
> }
>
> ret = deferqueue_run(ctx->files_deferq);
> - ckpt_debug("files_deferq ran %d entries\n", ret);
> - if (ret > 0)
> + if (ret > 0) {
> + ckpt_debug("file restore deferred %d work items\n", ret);
> ret = 0;
> + }
> +
(This part too).
> out:
> ckpt_hdr_put(ctx, h);
> if (!ret) {
[...]
>
> +#ifdef CONFIG_CHECKPOINT
> +static int ep_file_collect(struct ckpt_ctx *ctx, struct file *file)
> +{
> + struct rb_node *rbp;
> + struct eventpoll *ep;
> + int ret = 0;
> +
> + ep = file->private_data;
> + mutex_lock(&ep->mtx);
> + for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) {
> + struct epitem *epi;
> +
> + epi = rb_entry(rbp, struct epitem, rbn);
> + ret = ckpt_obj_collect(ctx, epi->ffd.file, CKPT_OBJ_FILE);
s/ckpt_obj_collect/ckpt_collect_file/
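IOW, something like this (untested; I'm assuming ckpt_collect_file()
takes just the ctx and the file pointer):

		ret = ckpt_collect_file(ctx, epi->ffd.file);

with the rest of the loop unchanged.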
> + if (ret < 0)
> + break;
> + }
> + mutex_unlock(&ep->mtx);
> + return ret;
> +}
> +
> +struct epoll_deferq_entry {
> + struct ckpt_ctx *ctx;
> + struct file *epfile;
> +};
> +
> +static int ep_items_checkpoint(void *data)
> +{
> + struct epoll_deferq_entry *ep_dq_entry = data;
> + struct ckpt_ctx *ctx;
> + struct file *file;
> + struct ckpt_hdr_eventpoll_items *h;
> + struct rb_node *rbp;
> + struct eventpoll *ep;
> + __s32 epfile_objref;
> + int i, ret;
> +
> + file = ep_dq_entry->epfile;
> + ctx = ep_dq_entry->ctx;
> +
> + epfile_objref = ckpt_obj_lookup(ctx, file, CKPT_OBJ_FILE);
> + BUG_ON(epfile_objref <= 0);
> +
> +
Nit: extraneous newline :p
> + ep = file->private_data;
> + mutex_lock(&ep->mtx);
> + for (i = 0, rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp), i++) {}
> + mutex_unlock(&ep->mtx);
> +
> + h = ckpt_hdr_get_type(ctx, sizeof(*h) + i*sizeof(h->items[0]),
> + CKPT_HDR_EPOLL_ITEMS);
> + if (!h)
> + return -ENOMEM;
> +
> + h->num_items = i;
> + h->epfile_objref = epfile_objref;
> +
> + ret = 0;
> + mutex_lock(&ep->mtx);
> + for (i = 0, rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp), i++) {
> + struct epitem *epi;
> + int objref;
> +
> + epi = rb_entry(rbp, struct epitem, rbn);
> + objref = ckpt_obj_lookup(ctx, epi->ffd.file, CKPT_OBJ_FILE);
> + if (objref <= 0) {
(Should never return 0)
> + ret = -EBUSY; /* checkpoint obj leak */
> + break;
> + }
> + h->items[i].fd = epi->ffd.fd;
> + h->items[i].file_objref = objref;
> + h->items[i].events = epi->event.events;
> + h->items[i].data = epi->event.data;
> + }
> + mutex_unlock(&ep->mtx);
> + if (!ret && (i != h->num_items))
> + /*
> + * We raced with another thread between our first and second
Not a thread (it cannot be a thread, because we checkpoint whole thread
groups); rather, any process that shares access to this file pointer.
> + * walks of the rbtree such that there weren't the same number
> + * of items. This means there is a checkpoint "leak".
> + */
> + ret = -EBUSY;
> + if (ret == -EBUSY)
> + ckpt_write_err(ctx, "ep_items_checkpoint(): checkpoint leak detected.\n", "");
It would be useful (for the user) to know which fd/item caused the
leak, or whether the leak was detected because i != h->num_items. (You
could use two ckpt_write_err() calls: one inside the mutex and one here.)
Also, the prototype of ckpt_write_err() has changed; see the code.
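Roughly like this (untested sketch; I'm showing a printf-style
ckpt_write_err() only for illustration, adjust to the new prototype):

		/* inside the loop, still under ep->mtx */
		if (objref <= 0) {
			ckpt_write_err(ctx, "epoll: leaked file behind fd %d\n",
				       epi->ffd.fd);
			ret = -EBUSY;
			break;
		}
	...
	mutex_unlock(&ep->mtx);
	/* after the walk: the item count changed under us */
	if (!ret && i != h->num_items) {
		ckpt_write_err(ctx, "epoll: number of items changed during checkpoint\n");
		ret = -EBUSY;
	}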
> + else if (!ret)
> + ret = ckpt_write_obj(ctx, &h->h);
What other value could @ret have?
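If none, a plain else would do:

	else
		ret = ckpt_write_obj(ctx, &h->h);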
> + ckpt_hdr_put(ctx, &h->h);
> + return ret;
> +}
> +
> +static int ep_eventpoll_checkpoint(struct ckpt_ctx *ctx, struct file *file)
> +{
> + struct ckpt_hdr_file *h;
> + struct epoll_deferq_entry ep_dq_entry;
> + int ret = -ENOMEM;
> +
> + h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_FILE);
> + if (!h)
> + return -ENOMEM;
> + h->f_type = CKPT_FILE_EPOLL;
> + ret = checkpoint_file_common(ctx, file, h);
> + if (ret < 0)
> + goto out;
> + ret = ckpt_write_obj(ctx, &h->h);
> + if (ret < 0)
> + goto out;
> +
> + /*
> + * Defer saving the epoll items until all of the ffd.file pointers
> + * have an objref; after the file table has been checkpointed.
> + */
> + ep_dq_entry.ctx = ctx;
> + ep_dq_entry.epfile = file;
> + ret = deferqueue_add(ctx->files_deferq, &ep_dq_entry,
> + sizeof(ep_dq_entry), ep_items_checkpoint, NULL);
> +out:
> + ckpt_hdr_put(ctx, h);
> + return ret;
> +}
> +
> +static int ep_items_restore(void *data)
> +{
> + struct ckpt_ctx *ctx = deferqueue_data_ptr(data);
> + struct ckpt_hdr_eventpoll_items *h;
> + struct eventpoll *ep;
> + struct file *epfile = NULL;
> + int ret, i = 0, remaining_watches;
> +
> + h = ckpt_read_obj(ctx, 0,
> + sizeof(*h) + max_user_watches*sizeof(h->items[0]));
s/ckpt_read_obj/ckpt_read_buf_type/
Saves you the @type and @len test below.
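I.e., something like (untested; assuming ckpt_read_buf_type() takes the
maximum length and the expected type and fails on a mismatch, which is
why the explicit @type/@len tests below can go):

	h = ckpt_read_buf_type(ctx, sizeof(*h) +
			       max_user_watches * sizeof(h->items[0]),
			       CKPT_HDR_EPOLL_ITEMS);
	if (IS_ERR(h))
		return PTR_ERR(h);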
> + if (IS_ERR(h))
> + return PTR_ERR(h);
> +
> + ret = -EINVAL;
> + if ((h->h.type != CKPT_HDR_EPOLL_ITEMS) ||
> + (h->h.len < sizeof(*h)))
> + goto out;
> +
> + /* Make sure the items match the size we expect */
> + if (h->num_items != ((h->h.len - sizeof(*h)) / sizeof(h->items[0])))
> + goto out;
> +
> + epfile = ckpt_obj_fetch(ctx, h->epfile_objref, CKPT_OBJ_FILE);
> + BUG_ON(IS_ERR(epfile));
> + BUG_ON(!is_file_epoll(epfile));
> +
> + /* Make sure there are enough watches left. */
> + ret = -ENOSPC;
> + ep = epfile->private_data;
> + remaining_watches = (max_user_watches -
> + atomic_read(&ep->user->epoll_watches));
> + if (h->num_items > remaining_watches)
> + goto out;
I'm unsure whether this is necessary: running out of watches will be
detected later on anyway. I wouldn't worry about early detection.
> +
> + ret = 0;
> + /* Restore the epoll items/watches */
> + for (i = 0; !ret && i < h->num_items; i++) {
> + struct epoll_event epev;
> + struct file *tfile;
> +
> + /* Get the file* for the target file */
> + if (h->items[i].file_objref <= 0) {
> + ret = -EINVAL;
> + break;
> + }
I should probably change the code elsewhere too, but this test
is unnecessary because ckpt_obj_fetch() will complain anyway.
> + tfile = ckpt_obj_fetch(ctx, h->items[i].file_objref,
> + CKPT_OBJ_FILE);
> + if (IS_ERR(tfile)) {
> + ret = PTR_ERR(tfile);
> + break;
> + }
> +
> + epev.events = h->items[i].events;
> + epev.data = h->items[i].data;
> +
> + ret = do_epoll_ctl(EPOLL_CTL_ADD, h->items[i].fd,
> + epfile, tfile, &epev);
> + }
> +out:
> + ckpt_hdr_put(ctx, h);
> + return ret;
> +}
> +
> +struct file* ep_file_restore(struct ckpt_ctx *ctx,
> + struct ckpt_hdr_file *h)
> +{
> + struct file *epfile;
> + int epfd, ret;
> +
> + if (h->h.type != CKPT_HDR_FILE ||
> + h->h.len != sizeof(*h) ||
> + h->f_type != CKPT_FILE_EPOLL)
> + return ERR_PTR(-EINVAL);
> +
> + epfd = sys_epoll_create1(h->f_flags & EPOLL_CLOEXEC);
> + if (epfd < 0)
> + return ERR_PTR(epfd);
> + epfile = fget(epfd);
> + BUG_ON(!epfile);
> +
> + /*
> + * Needed before we can properly restore the watches and enforce the
> + * limit on watch numbers.
> + */
> + ret = restore_file_common(ctx, epfile, h);
> + if (ret < 0)
> + goto fput_out;
> +
> + /*
> + * Defer restoring the epoll items until the file table is
> + * fully restored. Ensures that valid file objrefs will resolve.
> + */
> + ret = deferqueue_add_ptr(ctx->files_deferq, ctx, ep_items_restore, NULL);
> + if (ret < 0) {
> +fput_out:
> + fput(epfile);
> + epfile = ERR_PTR(ret);
> + }
> + sys_close(epfd); /* harmless even if an error occured */
Nit: you can move this up to right after fget().
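I.e. (untested):

	epfd = sys_epoll_create1(h->f_flags & EPOLL_CLOEXEC);
	if (epfd < 0)
		return ERR_PTR(epfd);
	epfile = fget(epfd);
	BUG_ON(!epfile);
	sys_close(epfd);	/* fd no longer needed; harmless even on error */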
[...]
Oren.