[CRIU] Re: [PATCH cr] [RFC] shmem: rework of dumping shared memory

Pavel Emelyanov xemul at parallels.com
Mon Mar 19 03:29:35 EDT 2012


On 03/17/2012 11:57 PM, Andrey Vagin wrote:
> 
> vma_entry contains shmid and all shared memory are dumped in own files.

Cool!

> The most interesting thing is restore. A maping is restored by process with the
> smallest pid. The mamping is created before executing restorer.  We map a full
> mapping, then we open a file from /proc/pid/map_files and store a descriptor in
> vma_info. The mapping is unmaped.  Now we can map any region of this mapping in
> restorer.
> 
> The content are restored from crtools before the final executing processes.
> I don't want to restore it in restore, because in this case we need to wait
> when all tasks maps this mapping.
> 
> I don't know how to split this patch...

Shame on you :(

First of all, git-bisect-safe is not a strict requirement by now, so you can
split the patch even if the individual parts do not work. But even without this
easing, there's still smth you can split. Find a couple of obvious suggestions
inline.

> Signed-off-by: Andrey Vagin <avagin at openvz.org>
> ---
>  cr-dump.c          |  135 ++++++++++++++++--
>  cr-restore.c       |  406 ++++++++++++++++++++++++++--------------------------
>  cr-show.c          |   24 ---
>  crtools.c          |   17 +--
>  include/crtools.h  |   10 +-
>  include/image.h    |    9 +-
>  include/parasite.h |    4 -
>  include/restorer.h |   18 ++-
>  include/util.h     |    2 +-
>  parasite-syscall.c |   17 +--
>  parasite.c         |   17 +--
>  proc_parse.c       |    4 +-
>  restorer.c         |    8 +-
>  util.c             |    5 +-
>  14 files changed, 375 insertions(+), 301 deletions(-)
> 

> diff --git a/cr-dump.c b/cr-dump.c
> index 7be2a96..2d5ad20 100644
> --- a/cr-dump.c
> +++ b/cr-dump.c
> @@ -415,6 +415,33 @@ static int dump_task_files(pid_t pid, const struct cr_fdset *cr_fdset,
>  	return 0;
>  }
>  
> +struct shmem_info
> +{
> +	unsigned long	size;
> +	unsigned long	shmid;
> +	unsigned long	start;
> +	unsigned long	end;
> +	int		pid;
> +};
> +
> +int nr_shmems;
> +struct shmems {
> +	struct shmem_info entries[0];
> +} *shmems;

Either move the nr_shmems into struct shmems, or remove the struct completely
and use plain struct shmem_info pointer.

> +#define SHMEMS_SIZE	4096
> +
> +static struct shmem_info* shmem_find(unsigned long shmid)
> +{
> +	int i;
> +
> +	for (i = 0; i < nr_shmems; i++)
> +		if (shmems->entries[i].shmid == shmid)
> +			return &shmems->entries[i];
> +
> +	return NULL;
> +}
> +
>  static int dump_task_mappings(pid_t pid, const struct list_head *vma_area_list,
>  			      const struct cr_fdset *cr_fdset)
>  {
> @@ -1402,6 +1443,76 @@ err_free:
>  	return ret;
>  }
>  
> +static int cr_dump_shmem(int pid)
> +{
> +	int i, err;
> +	struct cr_fdset *cr_fdset = NULL;
> +
> +	for (i = 0; i < nr_shmems; i++) {
> +		struct shmem_info *si = &shmems->entries[i];
> +		char path[128];
> +		int pfd, fd;
> +		void *addr;
> +		unsigned long pfn, nrpages = (si->size + PAGE_SIZE -1) / PAGE_SIZE;
> +		unsigned char map[nrpages];

Please, no. Allocate memory for this with xmalloc.

> +
> +		pfd = open_proc(si->pid, "map_files");
> +		if (pfd < 0)
> +			goto err;
> +
> +		snprintf(path, 128, "%lx-%lx", si->start, si->end);
> +		fd = openat(pfd, path, O_RDONLY);

Does the open_proc work for you here? I believe it does.

> +		close(pfd);
> +		if (fd < 0) {
> +			pr_err("can't open file: %s", path);
> +			goto err;
> +		}
> +
> +		addr = mmap(NULL, si->size, PROT_READ, MAP_SHARED, fd, 0);
> +		close(fd);
> +		if (addr == MAP_FAILED) {
> +			pr_err("can't map file: %s", path);
> +			goto err;
> +		}
> +
> +		err = mincore(addr, si->size, map);
> +		if (err) {
> +			munmap(addr,  si->size);
> +			goto err;
> +		}
> +
> +		fd = open_image(CR_FD_SHMEM_PAGES, O_WRONLY | O_CREAT, si->shmid);
> +		if (fd < 0)
> +			goto err;
> +
> +		for (pfn = 0; pfn < nrpages; pfn++) {
> +			u64 offset = pfn * PAGE_SIZE;

offset = pfn << PAGE_SHIFT;

> +			if (!(map[pfn] & PAGE_RSS))
> +				continue;
> +
> +			if (write_img_buf(fd, &offset, sizeof(offset)))
> +				break;
> +			if (write_img_buf(fd, addr + offset, PAGE_SIZE))
> +				break;
> +		}
> +
> +		munmap(addr,  si->size);
> +
> +		if (pfn == nrpages) {
> +			write_img(fd, &zero_page_entry);

Error check missed.

> +			close(fd);
> +		} else {
> +			close(fd);
> +			goto err;
> +		}

This is ugly. Make the err_close mark at the end of a file and turn the pfn dumping
breaks into goto-s.

> +	}
> +
> +	return 0;
> +err:
> +	return -1;
> +}
> +
>  int cr_dump_tasks(pid_t pid, const struct cr_options *opts)
>  {
>  	LIST_HEAD(pstree_list);
> @@ -1440,6 +1551,11 @@ int cr_dump_tasks(pid_t pid, const struct cr_options *opts)
>  		goto err;
>  	close_cr_fdset(&cr_fdset);
>  
> +	nr_shmems = 0;
> +	shmems = malloc(SHMEMS_SIZE);

xmalloc

> +	if (!shmems)
> +		goto err;
> +
>  	list_for_each_entry(item, &pstree_list, list) {
>  		cr_fdset = cr_dump_fdset_open(item->pid, CR_FD_DESC_NONE, NULL);
>  		if (!cr_fdset)
> @@ -1463,8 +1579,9 @@ int cr_dump_tasks(pid_t pid, const struct cr_options *opts)
>  		if (opts->leader_only)
>  			break;
>  	}
> -	ret = 0;
>  
> +	ret = cr_dump_shmem(pid);
> +	free(shmems);

I'd move the free (well, xfree) into cr_dump_shmem.

>  err:
>  	pstree_switch_state(&pstree_list, opts);
>  	free_pstree(&pstree_list);
> diff --git a/cr-restore.c b/cr-restore.c
> index 9100c55..ad005b5 100644
> --- a/cr-restore.c
> +++ b/cr-restore.c
> @@ -199,20 +201,78 @@ static int collect_shmem(int pid, struct shmem_entry *e)
>  		return -1;
>  	}
>  
> -	memset(&shmems->entries[nr_shmems], 0, sizeof(shmems->entries[0]));
> +	si = &shmems->entries[nr_shmems];
> +	shmems->nr_shmems++;
>  
> -	entries[nr_shmems].start	= e->start;
> -	entries[nr_shmems].end		= e->end;
> -	entries[nr_shmems].shmid	= e->shmid;
> -	entries[nr_shmems].pid		= pid;
> +	si->start = vi->start;
> +	si->end	  = vi->end;
> +	si->shmid = vi->shmid;
> +	si->pid	  = pid;
> +	si->size  = size;
> +	si->fd    = -1;
>  
> -	cr_wait_init(&entries[nr_shmems].lock);
> -
> -	shmems->nr_shmems++;
> +	cr_wait_init(&si->lock);
>  
>  	return 0;
>  }
>  
> +static int prepare_shmem_pid(int pid)
> +{
> +	int fd, ret = -1;
> +	struct vma_entry vi;
> +	struct task_core_entry tc;
> +	struct image_header hdr;
> +
> +	fd = open_image_ro(CR_FD_CORE, pid);
> +	if (fd < 0)
> +		return -1;
> +
> +	if (read_img(fd, &hdr) < 0)
> +		goto out;

Use lseek instead. File format may change and the changer will forget
about this code relies on the fact that tc goes right after the hdr.

> +	if (read_img(fd, &tc) < 0)
> +		goto out;
> +
> +	if (tc.task_state == TASK_DEAD) {
> +		ret = 0;
> +		goto out;
> +	}

This one is unnecessary, as the subsequent reads will read no vmas.

> +	lseek(fd, GET_FILE_OFF_AFTER(struct core_entry), SEEK_SET);
> +
> +	while (1) {
> +		ret = read(fd, &vi, sizeof(vi));

Use read_img

> +		if (ret < 0) {
> +			pr_perror("%d: Can't read vma_entry", pid);
> +		} else if (ret != sizeof(vi)) {
> +			pr_err("%d: Incomplete vma_entry (%d != %ld)\n",
> +			       pid, ret, sizeof(vi));
> +			ret = -1;
> +			goto out;
> +		}
> +
> +		pr_info("%d: vma %lx %lx\n", pid, vi.start, vi.end);
> +
> +		if (final_vma_entry(&vi))
> +			break;
> +
> +		if (!vma_entry_is(&vi, VMA_ANON_SHARED))
> +			continue;
> +
> +		if (vma_entry_is(&vi, VMA_AREA_SYSVIPC))
> +			continue;
> +
> +		ret = collect_shmem(pid, &vi);
> +		if (ret)
> +			break;
> +	}
> +
> +out:
> +	close(fd);
> +	return ret;
> +}
> +
> +
>  static int collect_pipe(int pid, struct pipe_entry *e, int p_fd)
>  {
>  	int i;
> @@ -550,15 +491,47 @@ static int try_fixup_shared_map(int pid, struct vma_entry *vi, int fd)
>  			pr_perror("%d: Can't open shmem", pid);
>  			return -1;
>  		}
> -write_fd:
> -		lseek(fd, -sizeof(*vi), SEEK_CUR);
> -		vi->fd = sh_fd;
>  		pr_info("%d: Fixed %lx vma %lx/%d shmem -> %d\n",
>  			pid, vi->start, si->shmid, si->pid, sh_fd);
> -		if (write(fd, vi, sizeof(*vi)) != sizeof(*vi)) {
> -			pr_perror("%d: Can't write img", pid);
> -			return -1;
> -		}
> +		vi->fd = sh_fd;
> +	} else if (si->pid == pid) {

ROFL

> +		void *addr;
> +		int f;
> +		char path[128];
> +
> +		if (si->start == vi->start) {
> +			/* vi->pgoff may be not zero and a full shmem may be
> +			 * mapped here, so we use this dirty hack. */
> +			addr = mmap(NULL, si->size,
> +					PROT_WRITE | PROT_READ,
> +					MAP_SHARED | MAP_ANONYMOUS, -1, 0);
> +			if (addr == MAP_FAILED) {
> +				pr_err("Can't mmap shmid=0x%lx size=%ld\n",
> +							vi->shmid, si->size);
> +				return -1;
> +			}
> +
> +			snprintf(path, 128, "/proc/self/map_files/%lx-%lx",
> +					(unsigned long) addr,
> +					(unsigned long) addr + si->size);
> +
> +			f = open(path, O_RDWR);
> +			munmap(addr, si->size);

Ugly as sin :( Let's find better solution when you're in the office.

> +			if (f < 0)
> +				return -1;
> +
> +			vi->fd = si->fd = f;
> +		} else
> +			vi->fd = dup(si->fd);

si->fd shuold be checked to be valid.
dup error code should be checked.

> +	}
> +
> +write_fd:
> +	lseek(fd, -sizeof(*vi), SEEK_CUR);
> +	if (write(fd, vi, sizeof(*vi)) != sizeof(*vi)) {
> +		pr_perror("%d: Can't write img", pid);
> +		return -1;
>  	}
>  
>  	return 0;
> @@ -590,35 +563,20 @@ static int fixup_vma_fds(int pid, int fd)
>  			continue;
>  
>  		if (vma_entry_is(&vi, VMA_FILE_PRIVATE)	||
> -		    vma_entry_is(&vi, VMA_FILE_SHARED)	||
> -		    vma_entry_is(&vi, VMA_ANON_SHARED)) {
> +		    vma_entry_is(&vi, VMA_FILE_SHARED)) {
>  
>  			pr_info("%d: Fixing %016lx-%016lx %016lx vma\n",
>  				pid, vi.start, vi.end, vi.pgoff);
>  			if (try_fixup_file_map(pid, &vi, fd))
>  				return -1;
> +		}
>  
> +		if (vma_entry_is(&vi, VMA_ANON_SHARED))

The describing pr_info should be here.

>  			if (try_fixup_shared_map(pid, &vi, fd))
>  				return -1;
> -		}
>  	}
> -}
> -
> -static inline bool should_restore_page(int pid, unsigned long va)
> -{
> -	struct shmem_info *si;
> -	struct shmem_id *shmid;
> -
> -	/*
> -	 * If this is not a shmem virtual address
> -	 * we should restore such page.
> -	 */
> -	shmid = find_shmem_id(va);
> -	if (!shmid)
> -		return true;
>  
> -	si = find_shmem_page(shmems, va, shmid->shmid);
> -	return si->pid == pid;
> +	return 0;
>  }
>  
>  /*
> @@ -640,36 +598,14 @@ static int fixup_pages_data(int pid, int fd)
>  		if (read_img(pgfd, &va) < 0)
>  			goto out;
>  
> -		if (va == 0)
> +		if (final_page_va(va))

Chaning pfn terminator from 0 to ~0 deserves separate patch.

>  			break;
>  
>  		write(fd, &va, sizeof(va));
>  		sendfile(fd, pgfd, NULL, PAGE_SIZE);
>  	}
> -	close_safe(&pgfd);
> -
> -	pgfd = open_image_ro(CR_FD_PAGES_SHMEM, pid);
> -	if (pgfd < 0)
> -		return -1;
>  
> -	while (1) {
> -		if (read_img(pgfd, &va) < 0)
> -			goto out;
> -
> -		if (va == 0)
> -			break;
> -
> -		if (!should_restore_page(pid, va)) {
> -			lseek(pgfd, PAGE_SIZE, SEEK_CUR);
> -			continue;
> -		}
> -
> -		pr_info("%d: Restoring shared page: %16lx\n", pid, va);
> -		write(fd, &va, sizeof(va));
> -		sendfile(fd, pgfd, NULL, PAGE_SIZE);
> -	}
> -
> -	ret = write_img(fd, &zero_page_entry);
> +	ret = 0;
>  out:
>  	close_safe(&pgfd);
>  	return ret;
> @@ -1373,6 +1308,73 @@ static int restore_task_with_children(void *_arg)
>  	return restore_one_task(me->pid);
>  }
>  
> +static int restore_shmems(int pid)
> +{
> +	int i, pfd, fd;
> +	void *addr;
> +	int ret;
> +
> +	pr_info("Restore content of shared memory\n");
> +	for (i = 0; i < shmems->nr_shmems; i++) {
> +		struct shmem_info *si = &shmems->entries[i];
> +		char path[128];
> +		u64 offset;
> +
> +		pfd = open_proc(si->pid, "map_files");
> +		if (pfd < 0)
> +			goto err;
> +
> +		snprintf(path, PATH_MAX, "%lx-%lx",
> +						si->start, si->end);
> +
> +		fd = openat(pfd, path, O_RDWR);

Plain open_proc should work here as well.

> +		close(pfd);
> +		if (fd < 0) {
> +			pr_err("can't open file: %s\n", path);
> +			goto err;
> +		}
> +
> +		addr = mmap(NULL, si->size, PROT_WRITE | PROT_READ, MAP_SHARED, fd, 0);
> +		close(fd);
> +		if (addr == MAP_FAILED) {
> +			pr_err("can't map file: %s\n", path);
> +			goto err;
> +		}
> +
> +		fd = open_image_ro(CR_FD_SHMEM_PAGES, si->shmid);
> +		if (fd < 0) {
> +			munmap(addr,  si->size);
> +			return -1;
> +		}
> +
> +		while (1) {
> +			ret = read_img_buf(fd, &offset, sizeof(offset));
> +			if (ret <= 0)
> +				break;
> +
> +			if (final_page_va(offset))
> +				break;
> +
> +			if (offset + PAGE_SIZE > si->size) {
> +				break;

break? It should be an error.

> +			}

Cleanup braces.

> +			ret = read_img_buf(fd, addr + offset, PAGE_SIZE);
> +			if (ret <= 0)
> +				break;
> +		}
> +
> +		close(fd);
> +		munmap(addr,  si->size);
> +		if (ret < 0)
> +			goto err;

You can change this into break and avoid err: jump.

> +	}
> +
> +	return 0;
> +err:
> +	return -1;
> +}
> +
>  static int restore_root_task(int fd, struct cr_options *opts)
>  {
>  	struct pstree_entry e;
> @@ -1430,7 +1437,6 @@ static int restore_root_task(int fd, struct cr_options *opts)
>  		perror("sigaction() failed\n");
>  		return -1;
>  	}
> -

Trash

>  	pr_info("Go on!!!\n");
>  	cr_wait_set(&task_entries->start, CR_STATE_COMPLETE);
>  
> diff --git a/crtools.c b/crtools.c
> index a6e4739..5b39645 100644
> --- a/crtools.c
> +++ b/crtools.c
> @@ -22,7 +22,9 @@
>  #include "syscall.h"
>  
>  static struct cr_options opts;
> -struct page_entry zero_page_entry;
> +struct page_entry zero_page_entry = {
> +					.va = ~0LL,
> +					 };

Ugly formatting.

>  char image_dir[PATH_MAX];
>  
>  /*
> @@ -46,9 +48,8 @@ struct cr_fd_desc_tmpl fdset_template[CR_FD_MAX] = {
>  		.magic	= PAGES_MAGIC,
>  	},
>  
> -	/* shared memory pages data */
> -	[CR_FD_PAGES_SHMEM] = {
> -		.fmt	= FMT_FNAME_PAGES_SHMEM,
> +	[CR_FD_SHMEM_PAGES] = {
> +		.fmt	= FMT_FNAME_SHMEM_PAGES,
>  		.magic	= PAGES_MAGIC,

s/CR_FD_PAGES_SHMEM/CR_FD_SHMEM_PAGES/ can go in separate patch.

>  	},
>  
> @@ -70,12 +71,6 @@ struct cr_fd_desc_tmpl fdset_template[CR_FD_MAX] = {
>  		.magic	= PSTREE_MAGIC,
>  	},
>  
> -	/* info about which memory areas are shared */
> -	[CR_FD_SHMEM] = {
> -		.fmt	= FMT_FNAME_SHMEM,
> -		.magic	= SHMEM_MAGIC,
> -	},

You forgot to kill the SHMEM_MAGIC.

> -
>  	/* info about signal handlers */
>  	[CR_FD_SIGACT] = {
>  		.fmt	= FMT_FNAME_SIGACTS,
> diff --git a/include/image.h b/include/image.h
> index 2847440..98db9da 100644
> --- a/include/image.h
> +++ b/include/image.h
> @@ -55,12 +55,6 @@ struct fdinfo_entry {
>  	 ((fe)->type == FDINFO_CWD) ||	\
>  	 ((fe)->type == FDINFO_EXE))
>  
> -struct shmem_entry {
> -	u64	start;
> -	u64	end;
> -	u64	shmid;
> -} __packed;
> -
>  struct pstree_entry {
>  	u32	pid;
>  	u32	nr_children;
> @@ -113,6 +107,7 @@ struct vma_entry {
>  	u64	start;
>  	u64	end;
>  	u64	pgoff;
> +	u64	shmid;

Adding shmid can be in a separate patch. It will be small, but will make
the review simpler anyway.

>  	u32	prot;
>  	u32	flags;
>  	u32	status;
>  		}						\
> diff --git a/parasite-syscall.c b/parasite-syscall.c
> index d1d63a8..d7f9ab6 100644
> --- a/parasite-syscall.c
> +++ b/parasite-syscall.c
> @@ -536,10 +536,6 @@ int parasite_dump_pages_seized(struct parasite_ctl *ctl, struct list_head *vma_a
>  	if (ret < 0)
>  		goto out;
>  
> -	ret = parasite_prep_file(CR_FD_PAGES_SHMEM, ctl, cr_fdset);
> -	if (ret < 0)
> -		goto out;
> -
>  	ret = parasite_execute(PARASITE_CMD_DUMPPAGES_INIT, ctl, st, sizeof(*st));
>  	if (ret < 0) {
>  		pr_err("Dumping pages failed with %li (%li) at %li\n",
> @@ -565,15 +561,15 @@ int parasite_dump_pages_seized(struct parasite_ctl *ctl, struct list_head *vma_a
>  		if (vma_area->vma.status & VMA_AREA_SYSVIPC)
>  			continue;
>  
> +		if (vma_area_is(vma_area, VMA_ANON_SHARED))
> +			continue;
> +

This check shouldn't be here.

>  		pr_info_vma(vma_area);
>  		parasite_dumppages.vma_entry = vma_area->vma;
>  
>  		if (vma_area_is(vma_area, VMA_ANON_PRIVATE) ||
> -		    vma_area_is(vma_area, VMA_FILE_PRIVATE))
> -			parasite_dumppages.fd_type = PG_PRIV;
> -		else if (vma_area_is(vma_area, VMA_ANON_SHARED))
> -			parasite_dumppages.fd_type = PG_SHARED;
> -		else {
> +		    vma_area_is(vma_area, VMA_FILE_PRIVATE)) {
> +		} else {
>  			pr_warn("Unexpected VMA area found\n");
>  			continue;
>  		}

I don't see any changes in restorer.c which fixup the shmem mapping according
to its pgoff.

Thanks,
Pavel



More information about the CRIU mailing list