[CRIU] parasite-syscall redone

Cyrill Gorcunov gorcunov at openvz.org
Wed Feb 15 08:36:51 EST 2012


Hi guys, reimplementing parasite syscall code consumed
more time than I expected, but I personnaly more-less
satisfied with result.

The diff itself is huge and almost unreadable, so I
put the whole parasite-syscall.c here.

I'll review it more and clean it up a bit. But
tests are passed, except

Test: zdtm/live/static/msgque
====================== ERROR ======================
Dump log   : /home/crtools/test/dump/msgque/16786/dump.log
Restore log: /home/crtools/test/dump/msgque/16786/restore.log
make: *** [test] Error 1

which I'm not sure yet why. Anyway, to share code early,
here we go.

	Cyrill
---
[cyrill at moon crtools]$ git diff --stat 7c961a7b8f07c2d756758d2319c3bc34c54e0a7d
 cr-dump.c                  |    2 +-
 include/parasite-syscall.h |   29 ++-
 include/ptrace.h           |    1 +
 parasite-syscall.c         |  485 +++++++++++++++++++++++---------------------
 ptrace.c                   |   18 ++
 5 files changed, 294 insertions(+), 241 deletions(-)
[cyrill at moon crtools]$ 
---

parasite-syscall.c
==================
#include <stdio.h>
#include <stdlib.h>
#include <signal.h>
#include <limits.h>
#include <unistd.h>
#include <errno.h>
#include <string.h>

#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <sys/mman.h>
#include <sys/user.h>
#include <sys/wait.h>
#include <sys/socket.h>

#include "crtools.h"
#include "compiler.h"
#include "syscall.h"
#include "types.h"
#include "ptrace.h"
#include "util.h"
#include "util-net.h"
#include "log.h"

#include "processor-flags.h"
#include "parasite-syscall.h"
#include "parasite-blob.h"
#include "parasite.h"

#ifdef CONFIG_X86_64
static const char code_syscall[] = {0x0f, 0x05, 0xcc, 0xcc,
				    0xcc, 0xcc, 0xcc, 0xcc};

#define code_syscall_size	(round_up(sizeof(code_syscall), sizeof(long)))
#define parasite_size		(round_up(sizeof(parasite_blob), sizeof(long)))

static int can_run_syscall(unsigned long ip, unsigned long start, unsigned long end)
{
	return ip >= start && ip < (end - code_syscall_size);
}

static int syscall_fits_vma_area(struct vma_area *vma_area)
{
	return can_run_syscall((unsigned long)vma_area->vma.start,
			       (unsigned long)vma_area->vma.start,
			       (unsigned long)vma_area->vma.end);
}

static struct vma_area *get_vma_by_ip(struct list_head *vma_area_list, unsigned long ip)
{
	struct vma_area *vma_area;

	list_for_each_entry(vma_area, vma_area_list, list) {
		if (!in_vma_area(vma_area, ip))
			continue;
		if (!(vma_area->vma.prot & PROT_EXEC))
			continue;
		if (syscall_fits_vma_area(vma_area))
			return vma_area;
	}

	return NULL;
}

/* Note it's destructive on @regs */
static void parasite_setup_regs(unsigned long new_ip, user_regs_struct_t *regs)
{
	regs->ip = new_ip;

	/* Avoid end of syscall processing */
	regs->orig_ax = -1;

	/* Make sure flags are in known state */
	regs->flags &= ~(X86_EFLAGS_TF | X86_EFLAGS_DF | X86_EFLAGS_IF);
}

/*
 * @regs must already have been tuned up for parasite execution
 */
static int __parasite_execute(struct parasite_ctl *ctl, user_regs_struct_t *regs)
{
	pid_t pid = ctl->pid;
	siginfo_t siginfo;
	int status;
	int ret = -1;

again:
	if (ptrace(PTRACE_SETREGS, pid, NULL, regs)) {
		pr_err("Can't set registers (pid: %d)\n", pid);
		goto err;
	}

	/*
	 * Most ideas are taken from Tejun Heo's parasite thread
	 * https://code.google.com/p/ptrace-parasite/
	 */

	if (ptrace(PTRACE_CONT, pid, NULL, NULL)) {
		pr_err("Can't continue (pid: %d)\n", pid);
		goto err;
	}

	if (wait4(pid, &status, __WALL, NULL) != pid) {
		pr_err("Waited pid mismatch (pid: %d)\n", pid);
		goto err;
	}

	if (!WIFSTOPPED(status)) {
		pr_err("Task is still running (pid: %d)\n", pid);
		goto err;
	}

	if (ptrace(PTRACE_GETSIGINFO, pid, NULL, &siginfo)) {
		pr_err("Can't get siginfo (pid: %d)\n", pid);
		goto err;
	}

	if (ptrace(PTRACE_GETREGS, pid, NULL, regs)) {
		pr_err("Can't obtain registers (pid: %d)\n", pid);
			goto err;
	}

	if (WSTOPSIG(status) != SIGTRAP || siginfo.si_code != SI_KERNEL) {
retry_signal:
		pr_debug("** delivering signal %d si_code=%d\n",
			 siginfo.si_signo, siginfo.si_code);

		/* FIXME: jerr(siginfo.si_code > 0, err_restore); */

		/*
		 * This requires some explanation. If signal from original
		 * program delivered while we're trying to execute our
		 * injected code -- we need to setup original registers
		 * so the kernel would set up sigframe and update original
		 * program registers. Then we try to continue execution
		 * of our code. Note, the kernel does not modify any data
		 * except registers set, so we do not restore original program
		 * code here.
		 */

		if (ptrace(PTRACE_SETREGS, pid, NULL, &ctl->regs_orig)) {
			pr_panic("Can't set registers (pid: %d)\n", pid);
			goto err;
		}

		if (ptrace(PTRACE_INTERRUPT, pid, NULL, NULL)) {
			pr_panic("Can't interrupt (pid: %d)\n", pid);
			goto err;
		}

		if (ptrace(PTRACE_CONT, pid, NULL, (void *)(unsigned long)siginfo.si_signo)) {
			pr_err("Can't continue (pid: %d)\n", pid);
			goto err;
		}

		if (wait4(pid, &status, __WALL, NULL) != pid) {
			pr_err("Waited pid mismatch (pid: %d)\n", pid);
			goto err;
		}

		if (!WIFSTOPPED(status)) {
			pr_err("Task is still running (pid: %d)\n", pid);
			goto err;
		}

		if (ptrace(PTRACE_GETSIGINFO, pid, NULL, &siginfo)) {
			pr_err("Can't get siginfo (pid: %d)\n", pid);
			goto err;
		}

		if (siginfo.si_code >> 8 != PTRACE_EVENT_STOP)
			goto retry_signal;

		/*
		 * Signal is delivered, so we should have original
		 * registers modified, update them here.
		 */
		{
			user_regs_struct_t r;
			if (ptrace(PTRACE_GETREGS, pid, NULL, &r)) {
				pr_err("Can't obtain registers (pid: %d)\n", pid);
				goto err;
			}
			ctl->regs_orig = r;
		}

		goto again;
	}

	/*
	 * Our code is done.
	 */
	if (ptrace(PTRACE_INTERRUPT, pid, NULL, NULL)) {
		pr_panic("Can't interrupt (pid: %d)\n", pid);
		goto err;
	}

	if (ptrace(PTRACE_CONT, pid, NULL, NULL)) {
		pr_err("Can't continue (pid: %d)\n", pid);
		goto err;
	}

	if (wait4(pid, &status, __WALL, NULL) != pid) {
		pr_err("Waited pid mismatch (pid: %d)\n", pid);
		goto err;
	}

	if (!WIFSTOPPED(status)) {
		pr_err("Task is still running (pid: %d)\n", pid);
		goto err;
	}

	if (ptrace(PTRACE_GETSIGINFO, pid, NULL, &siginfo)) {
		pr_err("Can't get siginfo (pid: %d)\n", pid);
		goto err;
	}

	if (siginfo.si_code >> 8 != PTRACE_EVENT_STOP) {
		pr_err("si_code doesn't match (pid: %d si_code: %d)\n",
			pid, siginfo.si_code);
		goto err;
	}

	ret = 0;
err:
	return ret;
}

static int parasite_execute(unsigned long cmd, struct parasite_ctl *ctl,
			    parasite_status_t *args, int args_size)
{
	int ret;

	user_regs_struct_t regs = ctl->regs_orig;

	memcpy(ctl->addr_cmd, &cmd, sizeof(cmd));
	memcpy(ctl->addr_args, args, args_size);

	parasite_setup_regs(ctl->parasite_ip, &regs);

	ret = __parasite_execute(ctl, &regs);

	memcpy(args, ctl->addr_args, args_size);
	if (!ret)
		ret = args->ret;

	return ret;
}

static void *mmap_seized(struct parasite_ctl *ctl,
			 void *addr, size_t length, int prot,
			 int flags, int fd, off_t offset)
{
	user_regs_struct_t regs = ctl->regs_orig;
	void *map = NULL;
	int ret;

	regs.ax = (unsigned long)__NR_mmap;	/* mmap		*/
	regs.di = (unsigned long)addr;		/* @addr	*/
	regs.si = (unsigned long)length;	/* @length	*/
	regs.dx = (unsigned long)prot;		/* @prot	*/
	regs.r10= (unsigned long)flags;		/* @flags	*/
	regs.r8 = (unsigned long)fd;		/* @fd		*/
	regs.r9 = (unsigned long)offset;	/* @offset	*/

	parasite_setup_regs(ctl->syscall_ip, &regs);

	ret = __parasite_execute(ctl, &regs);
	if (ret)
		goto err;

	if ((long)regs.ax > 0)
		map = (void *)regs.ax;
err:
	return map;
}

static int munmap_seized(struct parasite_ctl *ctl, void *addr, size_t length)
{
	user_regs_struct_t regs = ctl->regs_orig;
	int ret;

	regs.ax = (unsigned long)__NR_munmap;	/* mmap		*/
	regs.di = (unsigned long)addr;		/* @addr	*/
	regs.si = (unsigned long)length;	/* @length	*/

	parasite_setup_regs(ctl->syscall_ip, &regs);

	ret = __parasite_execute(ctl, &regs);
	if (!ret)
		ret = (int)regs.ax;

	return ret;
}

static int get_socket_name(struct sockaddr_un *saddr, pid_t pid)
{
	int sun_len;

	saddr->sun_family = AF_UNIX;
	snprintf(saddr->sun_path, UNIX_PATH_MAX,
			"X/crtools-pr-%d", pid);

	sun_len = SUN_LEN(saddr);
	*saddr->sun_path = '\0';

	return sun_len;
}

static int parasite_send_fd(struct parasite_ctl *ctl, int fd)
{
	struct sockaddr_un saddr;
	int sun_len, ret = -1;
	int sock;

	sun_len = get_socket_name(&saddr, ctl->pid);

	sock = socket(PF_UNIX, SOCK_DGRAM, 0);
	if (sock < 0) {
		pr_perror("Can't create socket");
		return -1;
	}

	if (send_fd(sock, &saddr, sun_len, fd) < 0) {
		pr_perror("Can't send file descriptor");
		goto out;
	}
	ret = 0;
out:
	close(sock);
	return ret;
}

static int parasite_prep_file(int type, struct parasite_ctl *ctl,
			      struct cr_fdset *fdset)
{
	int ret;

	if (fchmod(fdset->fds[type], CR_FD_PERM_DUMP)) {
		pr_perror("Can't change permissions on %d file", type);
		return -1;
	}

	ret = parasite_send_fd(ctl, fdset->fds[type]);
	if (ret)
		return ret;

	return 0;
}

static int parasite_file_cmd(char *what, int cmd, int type,
			     struct parasite_ctl *ctl,
			     struct cr_fdset *cr_fdset)
{
	parasite_status_t args = { };
	int status, ret = -1;

	pr_info("\n");
	pr_info("Dumping %s (pid: %d)\n", what, ctl->pid);
	pr_info("----------------------------------------\n");

	ret = parasite_prep_file(type, ctl, cr_fdset);
	if (ret < 0)
		goto out;

	ret = parasite_execute(cmd, ctl, (parasite_status_t *)&args, sizeof(args));

err:
	fchmod(cr_fdset->fds[type], CR_FD_PERM);
out:
	pr_info("----------------------------------------\n");

	return ret;
}

static int parasite_init(struct parasite_ctl *ctl, pid_t pid)
{
	struct parasite_init_args args = { };

	args.sun_len = get_socket_name(&args.saddr, pid);

	return parasite_execute(PARASITE_CMD_INIT, ctl,
				(parasite_status_t *)&args, sizeof(args));
}

static int parasite_set_logfd(struct parasite_ctl *ctl, pid_t pid)
{
	parasite_status_t args = { };
	int ret;

	ret = parasite_send_fd(ctl, get_logfd());
	if (ret)
		return ret;

	ret = parasite_execute(PARASITE_CMD_SET_LOGFD, ctl, &args, sizeof(args));
	if (ret < 0)
		return ret;

	return 0;
}

int parasite_dump_sigacts_seized(struct parasite_ctl *ctl, struct cr_fdset *cr_fdset)
{
	return parasite_file_cmd("sigactions", PARASITE_CMD_DUMP_SIGACTS,
				 CR_FD_SIGACT, ctl, cr_fdset);
}

int parasite_dump_itimers_seized(struct parasite_ctl *ctl, struct cr_fdset *cr_fdset)
{
	return parasite_file_cmd("timers", PARASITE_CMD_DUMP_ITIMERS,
				 CR_FD_ITIMERS, ctl, cr_fdset);
}

int parasite_dump_misc_seized(struct parasite_ctl *ctl, struct parasite_dump_misc *misc)
{
	return parasite_execute(PARASITE_CMD_DUMP_MISC, ctl,
				(parasite_status_t *)misc,
				sizeof(struct parasite_dump_misc));
}

/*
 * This routine drives parasite code (been previously injected into a victim
 * process) and tells it to dump pages into the file.
 */
int parasite_dump_pages_seized(struct parasite_ctl *ctl, struct list_head *vma_area_list,
			       struct cr_fdset *cr_fdset)
{
	struct parasite_dump_pages_args parasite_dumppages = { };
	parasite_status_t *st = &parasite_dumppages.status;
	user_regs_struct_t regs, regs_orig;
	unsigned long nrpages_dumped = 0;
	struct vma_area *vma_area;
	siginfo_t siginfo;
	int status, ret = -1;

	pr_info("\n");
	pr_info("Dumping pages (type: %d pid: %d)\n", CR_FD_PAGES, ctl->pid);
	pr_info("----------------------------------------\n");

	ret = parasite_prep_file(CR_FD_PAGES, ctl, cr_fdset);
	if (ret < 0)
		goto out;

	ret = parasite_prep_file(CR_FD_PAGES_SHMEM, ctl, cr_fdset);
	if (ret < 0)
		goto out;

	ret = parasite_execute(PARASITE_CMD_DUMPPAGES_INIT, ctl, st, sizeof(*st));
	if (ret < 0) {
		pr_panic("Dumping pages failed with %li (%li) at %li\n",
				parasite_dumppages.status.ret,
				parasite_dumppages.status.sys_ret,
				parasite_dumppages.status.line);
		goto out;
	}

	list_for_each_entry(vma_area, vma_area_list, list) {

		/*
		 * The special areas are not dumped.
		 */
		if (!(vma_area->vma.status & VMA_AREA_REGULAR))
			continue;

		/* No dumps for file-shared mappings */
		if (vma_area->vma.status & VMA_FILE_SHARED)
			continue;

		pr_info_vma(vma_area);
		parasite_dumppages.vma_entry = vma_area->vma;

		if (vma_area_is(vma_area, VMA_ANON_PRIVATE) ||
		    vma_area_is(vma_area, VMA_FILE_PRIVATE))
			parasite_dumppages.fd_type = PG_PRIV;
		else if (vma_area_is(vma_area, VMA_ANON_SHARED))
			parasite_dumppages.fd_type = PG_SHARED;
		else {
			pr_warning("Unexpected VMA area found\n");
			continue;
		}

		ret = parasite_execute(PARASITE_CMD_DUMPPAGES, ctl,
				       (parasite_status_t *) &parasite_dumppages,
				       sizeof(parasite_dumppages));
		if (ret) {
			pr_panic("Dumping pages failed with %li (%li) at %li\n",
				 parasite_dumppages.status.ret,
				 parasite_dumppages.status.sys_ret,
				 parasite_dumppages.status.line);

			goto out;
		}

		pr_info("  (dumped: %16li pages)\n", parasite_dumppages.nrpages_dumped);
		nrpages_dumped += parasite_dumppages.nrpages_dumped;
	}

	parasite_execute(PARASITE_CMD_DUMPPAGES_FINI, ctl, st, sizeof(*st));

	if (write_img(cr_fdset->fds[CR_FD_PAGES], &zero_page_entry))
		goto out;
	if (write_img(cr_fdset->fds[CR_FD_PAGES_SHMEM], &zero_page_entry))
		goto out;

	pr_info("\n");
	pr_info("Summary: %16li pages dumped\n", nrpages_dumped);
	ret = 0;

out:
	fchmod(cr_fdset->fds[CR_FD_PAGES], CR_FD_PERM);
	fchmod(cr_fdset->fds[CR_FD_PAGES_SHMEM], CR_FD_PERM);
	pr_info("----------------------------------------\n");

	return ret;
}

int parasite_cure_seized(struct parasite_ctl *ctl)
{
	parasite_status_t args = { };
	struct vma_area *vma_area;
	user_regs_struct_t regs;
	int ret = 0;

	if (ctl->parasite_ip) {
		if (parasite_execute(PARASITE_CMD_FINI, ctl, &args, sizeof(args))) {
			pr_err("Can't finalize parasite (pid: %d) task\n", ctl->pid);
			ret = -1;
		}
	}

	if (ctl->remote_map) {
		if (munmap_seized(ctl, (void *)ctl->remote_map, ctl->map_length)) {
			pr_panic("munmap_seized failed (pid: %d)\n", ctl->pid);
			ret = -1;
		}
	}

	if (ctl->local_map)
		munmap(ctl->local_map, parasite_size);

	if (ptrace_poke_area(ctl->pid, (void *)ctl->code_orig,
			     (void *)ctl->syscall_ip, sizeof(ctl->code_orig))) {
		pr_panic("Can't restore syscall blob (pid: %d)\n", ctl->pid);
		ret = -1;
	}

	if (ptrace(PTRACE_SETREGS, ctl->pid, NULL, &ctl->regs_orig)) {
		pr_panic("Can't restore registers (pid: %d)\n", ctl->pid);
		ret = -1;
	}

	free(ctl);
	return ret;
}

struct parasite_ctl *parasite_infect_seized(pid_t pid, int pid_dir, struct list_head *vma_area_list)
{
	parasite_status_t args = { };
	user_regs_struct_t regs, regs_orig;
	struct parasite_ctl *ctl = NULL;
	struct vma_area *vma_area;
	char fname[128];
	int ret, fd;

	/*
	 * Control block early setup.
	 */
	ctl = xzalloc(sizeof(*ctl));
	if (!ctl) {
		pr_err("Parasite control block allocation failed (pid: %d)\n", pid);
		goto err;
	}

	if (ptrace(PTRACE_GETREGS, pid, NULL, &ctl->regs_orig)) {
		pr_err("Can't obtain registers (pid: %d)\n", pid);
		goto err;
	}

	vma_area = get_vma_by_ip(vma_area_list, ctl->regs_orig.ip);
	if (!vma_area) {
		pr_err("No suitable VMA found to run parasite "
		       "bootstrap code (pid: %d)\n", pid);
		goto err;
	}

	ctl->pid	= pid;
	ctl->syscall_ip	= vma_area->vma.start;

	/*
	 * Inject syscall instruction and remember original code
	 * we will need it to cure alien from parasite at the end.
	 */
	BUILD_BUG_ON(sizeof(code_syscall) != sizeof(ctl->code_orig));
	BUILD_BUG_ON(!is_log2(sizeof(code_syscall)));

	memcpy(ctl->code_orig, code_syscall, sizeof(ctl->code_orig));
	if (ptrace_swap_area(ctl->pid, (void *)ctl->syscall_ip,
			     (void *)ctl->code_orig, sizeof(ctl->code_orig))) {
		pr_err("Can't inject syscall blob (pid: %d)\n", pid);
		goto err;
	}

	/*
	 * Inject parasite engine. Ie allocate memory inside alien
	 * space and copy engine code there. Then re-map this engine
	 * locally so we would have easy way to access engine memory
	 * without using ptrace at all.
	 */
	ctl->remote_map = mmap_seized(ctl, NULL, (size_t)parasite_size,
				      PROT_READ | PROT_WRITE | PROT_EXEC,
				      MAP_ANONYMOUS | MAP_SHARED, -1, 0);
	if (!ctl->remote_map) {
		pr_err("Can't allocate memory for parasite blob (pid: %d)\n", pid);
		goto err_restore;
	}

	ctl->map_length = round_up(parasite_size, PAGE_SIZE);

	snprintf(fname, sizeof(fname), "map_files/%p-%p",
		 ctl->remote_map, ctl->remote_map + ctl->map_length);
	fd = openat(pid_dir, fname, O_RDWR);
	if (fd < 0) {
		pr_perror("Can't open remote parasite map");
		goto err_restore;
	}

	ctl->local_map = mmap(NULL, parasite_size, PROT_READ | PROT_WRITE,
			      MAP_SHARED | MAP_FILE, fd, 0);
	close(fd);

	if (ctl->local_map == MAP_FAILED) {
		ctl->local_map = NULL;
		pr_perror("Can't map remote parasite map");
		goto err_restore;
	}

	pr_info("Putting parasite blob into %p->%p\n", ctl->local_map, ctl->remote_map);
	memcpy(ctl->local_map, parasite_blob, sizeof(parasite_blob));

	/* Setup the rest of a control block */
	ctl->parasite_ip	= PARASITE_HEAD_ADDR((unsigned long)ctl->remote_map);
	ctl->addr_cmd		= (void *)PARASITE_CMD_ADDR((unsigned long)ctl->local_map);
	ctl->addr_args		= (void *)PARASITE_ARGS_ADDR((unsigned long)ctl->local_map);

	ret = parasite_init(ctl, pid);
	if (ret) {
		pr_err("%d: Can't create a transport socket\n", pid);
		goto err_restore;
	}

	ret = parasite_set_logfd(ctl, pid);
	if (ret) {
		pr_err("%d: Can't set a logging descriptor\n", pid);
		goto err_restore;
	}

	return ctl;

err_restore:
	parasite_cure_seized(ctl);

err:
	xfree(ctl);
	return NULL;
}

#else /* CONFIG_X86_64 */
# error x86-32 is not yet implemented
#endif /* CONFIG_X86_64 */


More information about the CRIU mailing list