[CRIU] [PATCH RFC 6/8] criu: page-xfer: add PS_IOV_GET interface
Mike Rapoport
rppt at linux.vnet.ibm.com
Mon May 30 05:29:16 PDT 2016
On Mon, May 30, 2016 at 02:00:41PM +0300, Pavel Emelyanov wrote:
> On 05/29/2016 09:52 AM, Mike Rapoport wrote:
> > On Fri, May 27, 2016 at 10:31:59PM +0300, Pavel Emelyanov wrote:
> >> On 05/21/2016 01:49 PM, Mike Rapoport wrote:
> >>> When dump side is acting as lazy pages server it should be able to respond
> >>> to random page access requests.
> >>> The protocol is quite simple:
> >>> - the restore side sends a PS_IOV_GET command with the PID, address and number
> >>> of pages it wishes to get
> >>
> >> Ack
> >>
> >>> - the dump side replies with PS_IOV_GET command.
> >>
> >> Oops. Why PS_IOV_GET? We have PS_IOV_ADD for sending pages.
> >
> > PS_IOV_ADD is for pushing pages. PS_IOV_GET is for pulling them :)
>
> Yes, _GET is what restore side sends to dump side, but then dump side
> should just do regular PS_IOV_ADD, no? (one more comment below).
What do you mean by PS_IOV_ADD? Set the pi.cmd to IOV_ADD or use
->write_pagemap and ->write_pages?
The latter requires a page_xfer, which in turn is pretty much a per-process
object, at least at the moment.
We can add a PID-agnostic, say, lazy-page-xfer, and helpers for
->write_pagemap and ->write_pages for use by both lazy-page-xfer and
page-server-xfer.
> >>> The nr_pages field is
> >>> updated to reflect the actual number of pages that the dump side is going to
> >>> send. If the pages in question are mapped to zero pfn, the entire
> >>> PS_IOV_GET reply is zeroed.
> >>> - After the PS_IOV_GET command the dump side sends actual page data
> >>>
> >>> Signed-off-by: Mike Rapoport <rppt at linux.vnet.ibm.com>
> >>> ---
> >>> criu/include/page-xfer.h | 2 +
> >>> criu/page-xfer.c | 99 ++++++++++++++++++++++++++++++++++++++++++++----
> >>> 2 files changed, 93 insertions(+), 8 deletions(-)
> >>>
> >>>
> >>> +static int page_server_get_pages(int sk, struct page_server_iov *pi)
> >>> +{
> >>> + struct pstree_item *item;
> >>> + struct page_pipe *pp;
> >>> + struct page_pipe_buf *ppb;
> >>> + struct iovec *iov;
> >>> + int ret;
> >>> +
> >>> + item = pstree_item_by_virt(pi->dst_id);
> >>> + pp = item->parasite_ctl->mem_pp;
> >>> +
> >>> + ret = page_pipe_split(pp, pi->vaddr, &pi->nr_pages);
> >>> + if (ret)
> >>> + return ret;
> >>> +
> >>> + if (pi->nr_pages == 0) {
> >>> + /* no iovs found means we've hit a zero page */
> >>> + pr_debug("no iovs found, zero pages\n");
> >>> + memset(pi, 0, sizeof(*pi));
>
> This looks like PS_IOV_HOLE. But even if it doesn't let's add special
> PS_IOV_..._ZERO_PAGE? command for this instead of zeroified pi.
If I understand correctly, HOLE means that pages are in parent checkpoint.
I'd go with ZERO_PAGE...
Again, we can add some helper to ->write_hole :)
> >>> +
> >>> + return write(sk, pi, sizeof(*pi)) != sizeof(*pi);
> >>> + }
> >>> +
> >>> + ppb = list_first_entry(&pp->bufs, struct page_pipe_buf, l);
> >>> + iov = &ppb->iov[0];
> >>> +
> >>> + BUG_ON(!(ppb->flags & PPB_LAZY));
> >>> + BUG_ON(iov->iov_len != pi->nr_pages * PAGE_SIZE);
> >>> + BUG_ON(pi->vaddr != encode_pointer(iov->iov_base));
> >>> +
> >>> + if (write(sk, pi, sizeof(*pi)) != sizeof(*pi))
> >>> + return -1;
> >>> +
> >>> + ret = splice(ppb->p[0], NULL, sk, NULL, iov->iov_len, SPLICE_F_MOVE);
> >>> + if (ret != iov->iov_len)
> >>> + return -1;
> >>> +
> >>> + return 0;
> >>> +}
> >>> +
> >>> static int page_server_check_parent(int sk, struct page_server_iov *pi);
> >>>
> >>> static int page_server_serve(int sk)
> >>> @@ -190,14 +233,16 @@ static int page_server_serve(int sk)
> >>> */
> >>> tcp_nodelay(sk, true);
> >>>
> >>> - if (pipe(cxfer.p)) {
> >>> - pr_perror("Can't make pipe for xfer");
> >>> - close(sk);
> >>> - return -1;
> >>> - }
> >>> + if (!opts.lazy_pages) {
> >>> + if (pipe(cxfer.p)) {
> >>> + pr_perror("Can't make pipe for xfer");
> >>> + close(sk);
> >>> + return -1;
> >>> + }
> >>>
> >>> - cxfer.pipe_size = fcntl(cxfer.p[0], F_GETPIPE_SZ, 0);
> >>> - pr_debug("Created xfer pipe size %u\n", cxfer.pipe_size);
> >>> + cxfer.pipe_size = fcntl(cxfer.p[0], F_GETPIPE_SZ, 0);
> >>> + pr_debug("Created xfer pipe size %u\n", cxfer.pipe_size);
> >>> + }
> >>>
> >>> while (1) {
> >>> struct page_server_iov pi;
> >>> @@ -249,6 +294,10 @@ static int page_server_serve(int sk)
> >>> flushed = true;
> >>> break;
> >>> }
> >>> + case PS_IOV_GET:
> >>> + flushed = true;
> >>> + ret = page_server_get_pages(sk, &pi);
> >>> + break;
> >>> default:
> >>> pr_err("Unknown command %u\n", pi.cmd);
> >>> ret = -1;
> >>> @@ -291,7 +340,8 @@ int cr_page_server(bool daemon_mode, int cfd)
> >>> int sk = -1;
> >>> int ret;
> >>>
> >>> - up_page_ids_base();
> >>> + if (!opts.lazy_pages)
> >>> + up_page_ids_base();
> >>>
> >>> if (opts.ps_socket != -1) {
> >>> ret = 0;
> >>> @@ -787,3 +837,36 @@ int check_parent_page_xfer(int fd_type, long id)
> >>> else
> >>> return check_parent_local_xfer(fd_type, id);
> >>> }
> >>> +
> >>> +int get_remote_pages(int pid, unsigned long addr, int nr_pages, void *dest)
> >>> +{
> >>> + int ret;
> >>> +
> >>> + struct page_server_iov pi = {
> >>> + .cmd = PS_IOV_GET,
> >>> + .nr_pages = nr_pages,
> >>> + .vaddr = addr,
> >>> + .dst_id = pid,
> >>> + };
> >>> +
> >>> + ret = write(page_server_sk, &pi, sizeof(pi));
> >>> + if (ret != sizeof(pi))
> >>> + return -1;
> >>> +
> >>> + ret = recv(page_server_sk, &pi, sizeof(pi), MSG_WAITALL);
> >>> + if (ret != sizeof(pi))
> >>> + return -1;
> >>> +
> >>> + /* zero page */
> >>> + if (pi.cmd == 0 && pi.vaddr == 0 && pi.nr_pages == 0 && pi.dst_id == 0)
> >>> + return 0;
> >>> +
> >>> + if (pi.nr_pages > nr_pages)
> >>> + return -1;
> >>> +
> >>> + ret = recv(page_server_sk, dest, PAGE_SIZE, MSG_WAITALL);
> >>> + if (ret != PAGE_SIZE)
> >>> + return -1;
> >>> +
> >>> + return 1;
> >>> +}
> >>>
> >>
> >
> > .
> >
>
More information about the CRIU
mailing list