[CRIU] [PATCH] lazy-pages: use random read from page-pipe instead of splitting it
Pavel Emelyanov
xemul at virtuozzo.com
Mon Jun 19 19:41:56 MSK 2017
On 06/07/2017 11:53 AM, Mike Rapoport wrote:
> For the remote lazy pages case, to access pages in the middle of a pipe we
> split the page_pipe_buffers and iovecs and use splice() to move the
> data between the underlying pipes. After the splits we get a page_pipe_buffer
> with a single iovec that can be used to splice() the data further into the
> socket.
> This patch replaces the splitting and splicing with use of a helper pipe
> and tee(). We tee() the pages from beginning of the pipe up to the last
> requested page into a helper pipe, sink the unneeded head part into
> /dev/null and we get the requested pages ready for splice() into the
> socket.
> This allows the lazy-pages daemon to request the same page several times, which
> is required to properly support fork() after the restore.
> As an added bonus, we simplify the code and reduce the number of pipes that
> live in the system.
>
> Signed-off-by: Mike Rapoport <rppt at linux.vnet.ibm.com>
Acked-by: Pavel Emelyanov <xemul at virtuozzo.com>
> ---
> criu/include/page-pipe.h | 10 +++++++
> criu/page-pipe.c | 73 ++++++++++++++++++++++++++++++++++++++++++++++++
> criu/page-xfer.c | 59 +++++++++++++-------------------------
> 3 files changed, 102 insertions(+), 40 deletions(-)
>
> diff --git a/criu/include/page-pipe.h b/criu/include/page-pipe.h
> index 10ae873..76ec1fd 100644
> --- a/criu/include/page-pipe.h
> +++ b/criu/include/page-pipe.h
> @@ -138,4 +138,14 @@ extern int page_pipe_split(struct page_pipe *pp, unsigned long addr,
>
> extern void page_pipe_destroy_ppb(struct page_pipe_buf *ppb);
>
> +struct pipe_read_dest {
> + int p[2];
> + int sink_fd;
> +};
> +
> +extern int pipe_read_dest_init(struct pipe_read_dest *prd);
> +extern int page_pipe_read(struct page_pipe *pp, struct pipe_read_dest *prd,
> + unsigned long addr, unsigned int *nr_pages,
> + unsigned int ppb_flags);
> +
> #endif /* __CR_PAGE_PIPE_H__ */
> diff --git a/criu/page-pipe.c b/criu/page-pipe.c
> index 4ebd0cb..4b4b3fc 100644
> --- a/criu/page-pipe.c
> +++ b/criu/page-pipe.c
> @@ -474,6 +474,79 @@ int page_pipe_split(struct page_pipe *pp, unsigned long addr,
> return 0;
> }
>
> +int pipe_read_dest_init(struct pipe_read_dest *prd)
> +{
> + int ret;
> +
> + if (pipe(prd->p)) {
> + pr_perror("Cannot create pipe for reading from page-pipe");
> + return -1;
> + }
> +
> + ret = fcntl(prd->p[0], F_SETPIPE_SZ, PIPE_MAX_SIZE * PAGE_SIZE);
> + if (ret < 0)
> + return -1;
> +
> + prd->sink_fd = open("/dev/null", O_WRONLY);
> + if (prd->sink_fd < 0) {
> + pr_perror("Cannot open sink for reading from page-pipe");
> + return -1;
> + }
> +
> + ret = fcntl(prd->p[0], F_GETPIPE_SZ, 0);
> + pr_debug("Created tee pipe size %d\n", ret);
> +
> + return 0;
> +}
> +
> +int page_pipe_read(struct page_pipe *pp, struct pipe_read_dest *prd,
> + unsigned long addr, unsigned int *nr_pages,
> + unsigned int ppb_flags)
> +{
> + struct page_pipe_buf *ppb;
> + struct iovec *iov = NULL;
> + unsigned long skip = 0, len;
> + int ret;
> +
> + /*
> + * Get ppb that contains addr and count length of data between
> + * the beginning of the pipe and addr. If no ppb is found, the
> + * requested page is mapped to zero pfn
> + */
> + ppb = get_ppb(pp, addr, &iov, &skip);
> + if (!ppb) {
> + *nr_pages = 0;
> + return 0;
> + }
> +
> + if (!(ppb->flags & ppb_flags)) {
> + pr_err("PPB flags mismatch: %x %x\n", ppb_flags, ppb->flags);
> + return false;
> + }
> +
> + /* clamp the request if it passes the end of iovec */
> + len = min((unsigned long)iov->iov_base + iov->iov_len - addr,
> + (unsigned long)(*nr_pages) * PAGE_SIZE);
> + *nr_pages = len / PAGE_SIZE;
> +
> + /* we should tee() the requested lenth + the beginning of the pipe */
> + len += skip;
> +
> + ret = tee(ppb->p[0], prd->p[1], len, 0);
> + if (ret != len) {
> + pr_perror("tee: %d", ret);
> + return -1;
> + }
> +
> + ret = splice(prd->p[0], NULL, prd->sink_fd, NULL, skip, 0);
> + if (ret != skip) {
> + pr_perror("splice: %d", ret);
> + return -1;
> + }
> +
> + return 0;
> +}
> +
> void page_pipe_destroy_ppb(struct page_pipe_buf *ppb)
> {
> list_del(&ppb->l);
> diff --git a/criu/page-xfer.c b/criu/page-xfer.c
> index c557407..49693bb 100644
> --- a/criu/page-xfer.c
> +++ b/criu/page-xfer.c
> @@ -554,10 +554,19 @@ static struct page_xfer_job cxfer = {
> .dst_id = ~0,
> };
>
> +static struct pipe_read_dest pipe_read_dest = {
> + .sink_fd = -1,
> +};
> +
> static void page_server_close(void)
> {
> if (cxfer.dst_id != ~0)
> cxfer.loc_xfer.close(&cxfer.loc_xfer);
> + if (pipe_read_dest.sink_fd != -1) {
> + close(pipe_read_dest.sink_fd);
> + close(pipe_read_dest.p[0]);
> + close(pipe_read_dest.p[1]);
> + }
> }
>
> static int page_server_open(int sk, struct page_server_iov *pi)
> @@ -653,43 +662,18 @@ static int page_server_add(int sk, struct page_server_iov *pi, u32 flags)
> return 0;
> }
>
> -static bool can_send_pages(struct page_pipe_buf *ppb, struct iovec *iov,
> - struct page_server_iov *pi)
> -{
> - unsigned long len = pi->nr_pages * PAGE_SIZE;
> -
> - if (!(ppb->flags & PPB_LAZY)) {
> - pr_err("Requested pages are not lazy\n");
> - return false;
> - }
> -
> - if (iov->iov_len != len) {
> - pr_err("IOV len %zu does not match requested %lu\n",
> - iov->iov_len, len);
> - return false;
> - }
> -
> - if(pi->vaddr != encode_pointer(iov->iov_base)) {
> - pr_err("IOV start %p does not match requested addr %"PRIx64"\n",
> - iov->iov_base, pi->vaddr);
> - return false;
> - }
> -
> - return true;
> -}
> -
> static int page_server_get_pages(int sk, struct page_server_iov *pi)
> {
> struct pstree_item *item;
> struct page_pipe *pp;
> - struct page_pipe_buf *ppb;
> - struct iovec *iov;
> + unsigned long len;
> int ret;
>
> item = pstree_item_by_virt(pi->dst_id);
> pp = dmpi(item)->mem_pp;
>
> - ret = page_pipe_split(pp, pi->vaddr, &pi->nr_pages);
> + ret = page_pipe_read(pp, &pipe_read_dest, pi->vaddr,
> + &pi->nr_pages, PPB_LAZY);
> if (ret)
> return ret;
>
> @@ -699,23 +683,17 @@ static int page_server_get_pages(int sk, struct page_server_iov *pi)
> return send_psi(sk, PS_IOV_ZERO, 0, 0, 0);
> }
>
> - ppb = list_first_entry(&pp->bufs, struct page_pipe_buf, l);
> - iov = &ppb->iov[0];
> -
> - if (!can_send_pages(ppb, iov, pi))
> - return -1;
> + len = pi->nr_pages * PAGE_SIZE;
>
> if (send_psi(sk, PS_IOV_ADD, pi->nr_pages, pi->vaddr, pi->dst_id))
> return -1;
>
> - ret = splice(ppb->p[0], NULL, sk, NULL, iov->iov_len, SPLICE_F_MOVE);
> - if (ret != iov->iov_len)
> + ret = splice(pipe_read_dest.p[0], NULL, sk, NULL, len, SPLICE_F_MOVE);
> + if (ret != len)
> return -1;
>
> tcp_nodelay(sk, true);
>
> - page_pipe_destroy_ppb(ppb);
> -
> return 0;
> }
>
> @@ -723,8 +701,9 @@ static int page_server_serve(int sk)
> {
> int ret = -1;
> bool flushed = false;
> + bool receiving_pages = !opts.lazy_pages;
>
> - if (!opts.lazy_pages) {
> + if (receiving_pages) {
> /*
> * This socket only accepts data except one thing -- it
> * writes back the has_parent bit from time to time, so
> @@ -741,6 +720,7 @@ static int page_server_serve(int sk)
> cxfer.pipe_size = fcntl(cxfer.p[0], F_GETPIPE_SZ, 0);
> pr_debug("Created xfer pipe size %u\n", cxfer.pipe_size);
> } else {
> + pipe_read_dest_init(&pipe_read_dest);
> tcp_cork(sk, true);
> }
>
> @@ -800,7 +780,6 @@ static int page_server_serve(int sk)
> break;
> }
> case PS_IOV_GET:
> - flushed = true;
> ret = page_server_get_pages(sk, &pi);
> break;
> default:
> @@ -813,7 +792,7 @@ static int page_server_serve(int sk)
> break;
> }
>
> - if (!ret && !flushed) {
> + if (receiving_pages && !ret && !flushed) {
> pr_err("The data were not flushed\n");
> ret = -1;
> }
>
More information about the CRIU
mailing list