[CRIU] [PATCH] lazy-pages: use random read from page-pipe instead of splitting it

Mike Rapoport mike.rapoport at gmail.com
Wed Jun 14 10:28:04 MSK 2017


ping?

On Wed, Jun 7, 2017 at 11:53 AM, Mike Rapoport <rppt at linux.vnet.ibm.com> wrote:
> For the remote lazy pages case, to access pages in the middle of a pipe we
> split the page_pipe_buffers and iovecs and use splice() to move the
> data between the underlying pipes. After the splits we get a page_pipe_buffer
> with a single iovec that can be used to splice() the data further into the
> socket.
> This patch replaces the splitting and splicing with the use of a helper pipe
> and tee(). We tee() the pages from the beginning of the pipe up to the last
> requested page into a helper pipe, sink the unneeded head part into
> /dev/null, and get the requested pages ready for splice() into the
> socket.
> This allows the lazy-pages daemon to request the same page several times, which
> is required to properly support fork() after the restore.
> As an added bonus we simplify the code and reduce the number of pipes that live in
> the system.
>
> Signed-off-by: Mike Rapoport <rppt at linux.vnet.ibm.com>
> ---
>  criu/include/page-pipe.h | 10 +++++++
>  criu/page-pipe.c         | 73 ++++++++++++++++++++++++++++++++++++++++++++++++
>  criu/page-xfer.c         | 59 +++++++++++++-------------------------
>  3 files changed, 102 insertions(+), 40 deletions(-)
>
> diff --git a/criu/include/page-pipe.h b/criu/include/page-pipe.h
> index 10ae873..76ec1fd 100644
> --- a/criu/include/page-pipe.h
> +++ b/criu/include/page-pipe.h
> @@ -138,4 +138,14 @@ extern int page_pipe_split(struct page_pipe *pp, unsigned long addr,
>
>  extern void page_pipe_destroy_ppb(struct page_pipe_buf *ppb);
>
> +struct pipe_read_dest {
> +       int p[2];
> +       int sink_fd;
> +};
> +
> +extern int pipe_read_dest_init(struct pipe_read_dest *prd);
> +extern int page_pipe_read(struct page_pipe *pp, struct pipe_read_dest *prd,
> +                         unsigned long addr, unsigned int *nr_pages,
> +                         unsigned int ppb_flags);
> +
>  #endif /* __CR_PAGE_PIPE_H__ */
> diff --git a/criu/page-pipe.c b/criu/page-pipe.c
> index 4ebd0cb..4b4b3fc 100644
> --- a/criu/page-pipe.c
> +++ b/criu/page-pipe.c
> @@ -474,6 +474,79 @@ int page_pipe_split(struct page_pipe *pp, unsigned long addr,
>         return 0;
>  }
>
> +int pipe_read_dest_init(struct pipe_read_dest *prd)
> +{
> +       int ret;
> +
> +       if (pipe(prd->p)) {
> +               pr_perror("Cannot create pipe for reading from page-pipe");
> +               return -1;
> +       }
> +
> +       ret = fcntl(prd->p[0], F_SETPIPE_SZ, PIPE_MAX_SIZE * PAGE_SIZE);
> +       if (ret < 0)
> +               return -1;
> +
> +       prd->sink_fd = open("/dev/null", O_WRONLY);
> +       if (prd->sink_fd < 0) {
> +               pr_perror("Cannot open sink for reading from page-pipe");
> +               return -1;
> +       }
> +
> +       ret = fcntl(prd->p[0], F_GETPIPE_SZ, 0);
> +       pr_debug("Created tee pipe size %d\n", ret);
> +
> +       return 0;
> +}
> +
> +int page_pipe_read(struct page_pipe *pp, struct pipe_read_dest *prd,
> +                  unsigned long addr, unsigned int *nr_pages,
> +                  unsigned int ppb_flags)
> +{
> +       struct page_pipe_buf *ppb;
> +       struct iovec *iov = NULL;
> +       unsigned long skip = 0, len;
> +       int ret;
> +
> +       /*
> +        * Get ppb that contains addr and count length of data between
> +        * the beginning of the pipe and addr. If no ppb is found, the
> +        * requested page is mapped to zero pfn
> +        */
> +       ppb = get_ppb(pp, addr, &iov, &skip);
> +       if (!ppb) {
> +               *nr_pages = 0;
> +               return 0;
> +       }
> +
> +       if (!(ppb->flags & ppb_flags)) {
> +               pr_err("PPB flags mismatch: %x %x\n", ppb_flags, ppb->flags);
> +               return false;
> +       }
> +
> +       /* clamp the request if it passes the end of iovec */
> +       len = min((unsigned long)iov->iov_base + iov->iov_len - addr,
> +                 (unsigned long)(*nr_pages) * PAGE_SIZE);
> +       *nr_pages = len / PAGE_SIZE;
> +
> +       /* we should tee() the requested length + the beginning of the pipe */
> +       len += skip;
> +
> +       ret = tee(ppb->p[0], prd->p[1], len, 0);
> +       if (ret != len) {
> +               pr_perror("tee: %d", ret);
> +               return -1;
> +       }
> +
> +       ret = splice(prd->p[0], NULL, prd->sink_fd, NULL, skip, 0);
> +       if (ret != skip) {
> +               pr_perror("splice: %d", ret);
> +               return -1;
> +       }
> +
> +       return 0;
> +}
> +
>  void page_pipe_destroy_ppb(struct page_pipe_buf *ppb)
>  {
>         list_del(&ppb->l);
> diff --git a/criu/page-xfer.c b/criu/page-xfer.c
> index c557407..49693bb 100644
> --- a/criu/page-xfer.c
> +++ b/criu/page-xfer.c
> @@ -554,10 +554,19 @@ static struct page_xfer_job cxfer = {
>         .dst_id = ~0,
>  };
>
> +static struct pipe_read_dest pipe_read_dest = {
> +       .sink_fd = -1,
> +};
> +
>  static void page_server_close(void)
>  {
>         if (cxfer.dst_id != ~0)
>                 cxfer.loc_xfer.close(&cxfer.loc_xfer);
> +       if (pipe_read_dest.sink_fd != -1) {
> +               close(pipe_read_dest.sink_fd);
> +               close(pipe_read_dest.p[0]);
> +               close(pipe_read_dest.p[1]);
> +       }
>  }
>
>  static int page_server_open(int sk, struct page_server_iov *pi)
> @@ -653,43 +662,18 @@ static int page_server_add(int sk, struct page_server_iov *pi, u32 flags)
>         return 0;
>  }
>
> -static bool can_send_pages(struct page_pipe_buf *ppb, struct iovec *iov,
> -                          struct page_server_iov *pi)
> -{
> -       unsigned long len = pi->nr_pages * PAGE_SIZE;
> -
> -       if (!(ppb->flags & PPB_LAZY)) {
> -               pr_err("Requested pages are not lazy\n");
> -               return false;
> -       }
> -
> -       if (iov->iov_len != len) {
> -               pr_err("IOV len %zu does not match requested %lu\n",
> -                      iov->iov_len, len);
> -               return false;
> -       }
> -
> -       if(pi->vaddr != encode_pointer(iov->iov_base)) {
> -               pr_err("IOV start %p does not match requested addr %"PRIx64"\n",
> -                      iov->iov_base, pi->vaddr);
> -               return false;
> -       }
> -
> -       return true;
> -}
> -
>  static int page_server_get_pages(int sk, struct page_server_iov *pi)
>  {
>         struct pstree_item *item;
>         struct page_pipe *pp;
> -       struct page_pipe_buf *ppb;
> -       struct iovec *iov;
> +       unsigned long len;
>         int ret;
>
>         item = pstree_item_by_virt(pi->dst_id);
>         pp = dmpi(item)->mem_pp;
>
> -       ret = page_pipe_split(pp, pi->vaddr, &pi->nr_pages);
> +       ret = page_pipe_read(pp, &pipe_read_dest, pi->vaddr,
> +                            &pi->nr_pages, PPB_LAZY);
>         if (ret)
>                 return ret;
>
> @@ -699,23 +683,17 @@ static int page_server_get_pages(int sk, struct page_server_iov *pi)
>                 return send_psi(sk, PS_IOV_ZERO, 0, 0, 0);
>         }
>
> -       ppb = list_first_entry(&pp->bufs, struct page_pipe_buf, l);
> -       iov = &ppb->iov[0];
> -
> -       if (!can_send_pages(ppb, iov, pi))
> -               return -1;
> +       len = pi->nr_pages * PAGE_SIZE;
>
>         if (send_psi(sk, PS_IOV_ADD, pi->nr_pages, pi->vaddr, pi->dst_id))
>                 return -1;
>
> -       ret = splice(ppb->p[0], NULL, sk, NULL, iov->iov_len, SPLICE_F_MOVE);
> -       if (ret != iov->iov_len)
> +       ret = splice(pipe_read_dest.p[0], NULL, sk, NULL, len, SPLICE_F_MOVE);
> +       if (ret != len)
>                 return -1;
>
>         tcp_nodelay(sk, true);
>
> -       page_pipe_destroy_ppb(ppb);
> -
>         return 0;
>  }
>
> @@ -723,8 +701,9 @@ static int page_server_serve(int sk)
>  {
>         int ret = -1;
>         bool flushed = false;
> +       bool receiving_pages = !opts.lazy_pages;
>
> -       if (!opts.lazy_pages) {
> +       if (receiving_pages) {
>                 /*
>                  * This socket only accepts data except one thing -- it
>                  * writes back the has_parent bit from time to time, so
> @@ -741,6 +720,7 @@ static int page_server_serve(int sk)
>                 cxfer.pipe_size = fcntl(cxfer.p[0], F_GETPIPE_SZ, 0);
>                 pr_debug("Created xfer pipe size %u\n", cxfer.pipe_size);
>         } else {
> +               pipe_read_dest_init(&pipe_read_dest);
>                 tcp_cork(sk, true);
>         }
>
> @@ -800,7 +780,6 @@ static int page_server_serve(int sk)
>                         break;
>                 }
>                 case PS_IOV_GET:
> -                       flushed = true;
>                         ret = page_server_get_pages(sk, &pi);
>                         break;
>                 default:
> @@ -813,7 +792,7 @@ static int page_server_serve(int sk)
>                         break;
>         }
>
> -       if (!ret && !flushed) {
> +       if (receiving_pages && !ret && !flushed) {
>                 pr_err("The data were not flushed\n");
>                 ret = -1;
>         }
> --
> 2.7.4
>
> _______________________________________________
> CRIU mailing list
> CRIU at openvz.org
> https://lists.openvz.org/mailman/listinfo/criu



-- 
Sincerely yours,
Mike.


More information about the CRIU mailing list