[CRIU] [PATCH RFC 2/8] criu: page_pipe_buf: add PPB_LAZY flag
Mike Rapoport
rppt at linux.vnet.ibm.com
Sat May 28 23:46:54 PDT 2016
On Fri, May 27, 2016 at 10:26:53PM +0300, Pavel Emelyanov wrote:
> On 05/21/2016 01:49 PM, Mike Rapoport wrote:
> > for buffers that contain potentially lazy pages
>
> But... Why a buffer would contain lazy pages?
The pages of the dumpee should be somewhere. One option is to skip these
pages when parasite is doing vmsplice from the dumpee memory to the
buffers, keep parasite in the dumpee and make it capable of handling random
memory accesses when a page fault occurs at the restore side.
I've chosen to vmsplice everything and just add a flag that will
differentiate between the buffers that should be written to the image files
(e.g VDSO) and the buffers that contain pages that would be transferred
over the network, i.e. ANONYMOUS | PRIVATE. The page_xfer_dump_pages will
skip the PPB_LAZY buffers when writing pages-*.img, and when the restore
will request a lazy page, it will be extracted from the PPB_LAZY buffer and
sent over the network.
Probably I should have explained in more detail what's going on in the
first place, but better late than never. So, in a nutshell, this
implementation of remote lazy pages does the following:
* dump collects the process memory into the pipes and continues without
transferring the memory into images or to the page-server at the restore
side (pretty much like the pre-dump action).
* when the dump creates the page_pipe_bufs, it marks the buffers containing
potentially lazy pages with PPB_LAZY
* at the dump_finish stage, the dump side starts TCP server that will
handle page requests from the restore side
* the checkpoint directory is transferred to the restore side
* on the restore side lazy-pages daemon is started, it creates UNIX socket
to receive uffd's from the restore and a TCP socket to forward page
requests to the dump side
* restore creates memory mappings and fills the VMAs that cannot be handled
by uffd with the contents of the pages-*.img.
* restore registers lazy VMAs with uffd and sends the userfault file
descriptors to the lazy-pages daemon
* when a #PF occurs, the lazy-pages daemon sends PS_IOV_GET command to the dump
side; the command contains PID, the faulting address and amount of pages
(always 1 at the moment)
* the dump side extracts the requested pages from the pipe and splices them
into the TCP socket.
* the lazy-pages daemon copies the received pages into the restored process
address space
> > Signed-off-by: Mike Rapoport <rppt at linux.vnet.ibm.com>
> > ---
> > criu/include/page-pipe.h | 5 ++++-
> > criu/mem.c | 6 +++++-
> > criu/page-pipe.c | 30 ++++++++++++++++++------------
> > criu/shmem.c | 2 +-
> > 4 files changed, 28 insertions(+), 15 deletions(-)
> >
> > diff --git a/criu/include/page-pipe.h b/criu/include/page-pipe.h
> > index a2dc268..031f145 100644
> > --- a/criu/include/page-pipe.h
> > +++ b/criu/include/page-pipe.h
> > @@ -74,6 +74,8 @@ struct page_pipe_buf {
> > unsigned int pipe_size; /* how many pages can be fit into pipe */
> > unsigned int pages_in; /* how many pages are there */
> > unsigned int nr_segs; /* how many iov-s are busy */
> > +#define PPB_LAZY (1 << 0)
> > + unsigned int flags;
> > struct iovec *iov; /* vaddr:len map */
> > struct list_head l; /* links into page_pipe->bufs */
> > };
> > @@ -98,7 +100,8 @@ struct page_pipe {
> > extern struct page_pipe *create_page_pipe(unsigned int nr,
> > struct iovec *, bool chunk_mode);
> > extern void destroy_page_pipe(struct page_pipe *p);
> > -extern int page_pipe_add_page(struct page_pipe *p, unsigned long addr);
> > +extern int page_pipe_add_page(struct page_pipe *p, unsigned long addr,
> > + unsigned int flags);
> > extern int page_pipe_add_hole(struct page_pipe *p, unsigned long addr);
> >
> > extern void debug_show_page_pipe(struct page_pipe *pp);
> > diff --git a/criu/mem.c b/criu/mem.c
> > index becbd6d..0497975 100644
> > --- a/criu/mem.c
> > +++ b/criu/mem.c
> > @@ -137,6 +137,7 @@ static int generate_iovs(struct vma_area *vma, struct page_pipe *pp, u64 *map, u
> >
> > for (pfn = 0; pfn < nr_to_scan; pfn++) {
> > unsigned long vaddr;
> > + unsigned int ppb_flags = 0;
> > int ret;
> >
> > if (!should_dump_page(vma->e, at[pfn]))
> > @@ -144,6 +145,9 @@ static int generate_iovs(struct vma_area *vma, struct page_pipe *pp, u64 *map, u
> >
> > vaddr = vma->e->start + *off + pfn * PAGE_SIZE;
> >
> > + if (vma_entry_can_be_lazy(vma->e))
> > + ppb_flags |= PPB_LAZY;
> > +
> > /*
> > * If we're doing incremental dump (parent images
> > * specified) and page is not soft-dirty -- we dump
> > @@ -155,7 +159,7 @@ static int generate_iovs(struct vma_area *vma, struct page_pipe *pp, u64 *map, u
> > ret = page_pipe_add_hole(pp, vaddr);
> > pages[0]++;
> > } else {
> > - ret = page_pipe_add_page(pp, vaddr);
> > + ret = page_pipe_add_page(pp, vaddr, ppb_flags);
> > pages[1]++;
> > }
> >
> > diff --git a/criu/page-pipe.c b/criu/page-pipe.c
> > index db58f6a..78f8271 100644
> > --- a/criu/page-pipe.c
> > +++ b/criu/page-pipe.c
> > @@ -25,7 +25,7 @@ static inline void iov_init(struct iovec *iov, unsigned long addr)
> > iov->iov_len = PAGE_SIZE;
> > }
> >
> > -static int page_pipe_grow(struct page_pipe *pp)
> > +static int page_pipe_grow(struct page_pipe *pp, unsigned int flags)
> > {
> > struct page_pipe_buf *ppb;
> >
> > @@ -57,6 +57,7 @@ static int page_pipe_grow(struct page_pipe *pp)
> > out:
> > ppb->pages_in = 0;
> > ppb->nr_segs = 0;
> > + ppb->flags = flags;
> > ppb->iov = &pp->iovs[pp->free_iov];
> >
> > return 0;
> > @@ -84,7 +85,7 @@ struct page_pipe *create_page_pipe(unsigned int nr_segs,
> >
> > pp->chunk_mode = chunk_mode;
> >
> > - if (page_pipe_grow(pp))
> > + if (page_pipe_grow(pp, 0))
> > return NULL;
> > }
> >
> > @@ -120,13 +121,16 @@ void page_pipe_reinit(struct page_pipe *pp)
> >
> > pp->free_hole = 0;
> >
> > - if (page_pipe_grow(pp))
> > + if (page_pipe_grow(pp, 0))
> > BUG(); /* It can't fail, because ppb is in free_bufs */
> > }
> >
> > static inline int try_add_page_to(struct page_pipe *pp, struct page_pipe_buf *ppb,
> > - unsigned long addr)
> > + unsigned long addr, unsigned int flags)
> > {
> > + if (ppb->flags != flags)
> > + return 1;
> > +
> > if (ppb->pages_in == ppb->pipe_size) {
> > unsigned long new_size = ppb->pipe_size << 1;
> > int ret;
> > @@ -164,25 +168,27 @@ out:
> > return 0;
> > }
> >
> > -static inline int try_add_page(struct page_pipe *pp, unsigned long addr)
> > +static inline int try_add_page(struct page_pipe *pp, unsigned long addr,
> > + unsigned int flags)
> > {
> > BUG_ON(list_empty(&pp->bufs));
> > - return try_add_page_to(pp, list_entry(pp->bufs.prev, struct page_pipe_buf, l), addr);
> > + return try_add_page_to(pp, list_entry(pp->bufs.prev, struct page_pipe_buf, l), addr, flags);
> > }
> >
> > -int page_pipe_add_page(struct page_pipe *pp, unsigned long addr)
> > +int page_pipe_add_page(struct page_pipe *pp, unsigned long addr,
> > + unsigned int flags)
> > {
> > int ret;
> >
> > - ret = try_add_page(pp, addr);
> > + ret = try_add_page(pp, addr, flags);
> > if (ret <= 0)
> > return ret;
> >
> > - ret = page_pipe_grow(pp);
> > + ret = page_pipe_grow(pp, flags);
> > if (ret < 0)
> > return ret;
> >
> > - ret = try_add_page(pp, addr);
> > + ret = try_add_page(pp, addr, flags);
> > BUG_ON(ret > 0);
> > return ret;
> > }
> > @@ -222,8 +228,8 @@ void debug_show_page_pipe(struct page_pipe *pp)
> > pr_debug("* %u pipes %u/%u iovs:\n",
> > pp->nr_pipes, pp->free_iov, pp->nr_iovs);
> > list_for_each_entry(ppb, &pp->bufs, l) {
> > - pr_debug("\tbuf %u pages, %u iovs:\n",
> > - ppb->pages_in, ppb->nr_segs);
> > + pr_debug("\tbuf %u pages, %u iovs, flags: %x :\n",
> > + ppb->pages_in, ppb->nr_segs, ppb->flags);
> > for (i = 0; i < ppb->nr_segs; i++) {
> > iov = &ppb->iov[i];
> > pr_debug("\t\t%p %lu\n", iov->iov_base, iov->iov_len / PAGE_SIZE);
> > diff --git a/criu/shmem.c b/criu/shmem.c
> > index 493477e..05458c7 100644
> > --- a/criu/shmem.c
> > +++ b/criu/shmem.c
> > @@ -405,7 +405,7 @@ static int dump_one_shmem(struct shmem_info_dump *si)
> > if (!(map[pfn] & PAGE_RSS))
> > continue;
> > again:
> > - ret = page_pipe_add_page(pp, (unsigned long)addr + pfn * PAGE_SIZE);
> > + ret = page_pipe_add_page(pp, (unsigned long)addr + pfn * PAGE_SIZE, 0);
> > if (ret == -EAGAIN) {
> > ret = dump_pages(pp, &xfer, addr);
> > if (ret)
> >
>
More information about the CRIU
mailing list