[Devel] [PATCH 12/12] Move remaining checkpoint/* files into kernel/

Matt Helsley matthltc at us.ibm.com
Fri Feb 26 00:45:13 PST 2010
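
Move the remaining checkpoint/* sources out of the top-level checkpoint/
directory and into kernel/checkpoint/, and update the top-level Makefile
and init/Kconfig references to match. The files are moved verbatim; no
functional change intended.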


Signed-off-by: Matt Helsley <matthltc at us.ibm.com>
---
 Makefile                       |    2 +-
 checkpoint/Kconfig             |   20 -
 checkpoint/Makefile            |   10 -
 checkpoint/checkpoint.c        |  660 -------------------
 checkpoint/objhash.c           | 1083 ------------------------------
 checkpoint/process.c           |  929 --------------------------
 checkpoint/restart.c           | 1423 ----------------------------------------
 checkpoint/sys.c               |  719 --------------------
 init/Kconfig                   |    2 +-
 kernel/Makefile                |    1 +
 kernel/checkpoint/Kconfig      |   20 +
 kernel/checkpoint/Makefile     |   10 +
 kernel/checkpoint/checkpoint.c |  660 +++++++++++++++++++
 kernel/checkpoint/objhash.c    | 1083 ++++++++++++++++++++++++++++++
 kernel/checkpoint/process.c    |  929 ++++++++++++++++++++++++++
 kernel/checkpoint/restart.c    | 1423 ++++++++++++++++++++++++++++++++++++++++
 kernel/checkpoint/sys.c        |  719 ++++++++++++++++++++
 17 files changed, 4847 insertions(+), 4846 deletions(-)
 delete mode 100644 checkpoint/Kconfig
 delete mode 100644 checkpoint/Makefile
 delete mode 100644 checkpoint/checkpoint.c
 delete mode 100644 checkpoint/objhash.c
 delete mode 100644 checkpoint/process.c
 delete mode 100644 checkpoint/restart.c
 delete mode 100644 checkpoint/sys.c
 create mode 100644 kernel/checkpoint/Kconfig
 create mode 100644 kernel/checkpoint/Makefile
 create mode 100644 kernel/checkpoint/checkpoint.c
 create mode 100644 kernel/checkpoint/objhash.c
 create mode 100644 kernel/checkpoint/process.c
 create mode 100644 kernel/checkpoint/restart.c
 create mode 100644 kernel/checkpoint/sys.c

diff --git a/Makefile b/Makefile
index 58dd95e..c84fd64 100644
--- a/Makefile
+++ b/Makefile
@@ -650,7 +650,7 @@ export mod_strip_cmd
 
 
 ifeq ($(KBUILD_EXTMOD),)
-core-y		+= kernel/ mm/ fs/ ipc/ security/ crypto/ block/ checkpoint/
+core-y		+= kernel/ mm/ fs/ ipc/ security/ crypto/ block/
 
 vmlinux-dirs	:= $(patsubst %/,%,$(filter %/, $(init-y) $(init-m) \
 		     $(core-y) $(core-m) $(drivers-y) $(drivers-m) \
diff --git a/checkpoint/Kconfig b/checkpoint/Kconfig
deleted file mode 100644
index 4a2c845..0000000
--- a/checkpoint/Kconfig
+++ /dev/null
@@ -1,20 +0,0 @@
-# Architectures should define CHECKPOINT_SUPPORT when they have
-# implemented the hooks for processor state etc. needed by the
-# core checkpoint/restart code.
-
-config DEFERQUEUE
-	bool
-	default n
-
-config CHECKPOINT
-	bool "Checkpoint/restart (EXPERIMENTAL)"
-	depends on CHECKPOINT_SUPPORT && EXPERIMENTAL
-	depends on CGROUP_FREEZER
-	select DEFERQUEUE
-	help
-	  Application checkpoint/restart is the ability to save the
-	  state of a running application so that it can later resume
-	  its execution from the time at which it was checkpointed.
-
-	  Turning this option on will enable checkpoint and restart
-	  functionality in the kernel.
diff --git a/checkpoint/Makefile b/checkpoint/Makefile
deleted file mode 100644
index 5aa6a75..0000000
--- a/checkpoint/Makefile
+++ /dev/null
@@ -1,10 +0,0 @@
-#
-# Makefile for linux checkpoint/restart.
-#
-
-obj-$(CONFIG_CHECKPOINT) += \
-	sys.o \
-	objhash.o \
-	checkpoint.o \
-	restart.o \
-	process.o
diff --git a/checkpoint/checkpoint.c b/checkpoint/checkpoint.c
deleted file mode 100644
index b3c1c4f..0000000
--- a/checkpoint/checkpoint.c
+++ /dev/null
@@ -1,660 +0,0 @@
-/*
- *  Checkpoint logic and helpers
- *
- *  Copyright (C) 2008-2009 Oren Laadan
- *
- *  This file is subject to the terms and conditions of the GNU General Public
- *  License.  See the file COPYING in the main directory of the Linux
- *  distribution for more details.
- */
-
-/* default debug level for output */
-#define CKPT_DFLAG  CKPT_DSYS
-
-#include <linux/version.h>
-#include <linux/sched.h>
-#include <linux/freezer.h>
-#include <linux/ptrace.h>
-#include <linux/time.h>
-#include <linux/fs.h>
-#include <linux/file.h>
-#include <linux/fs_struct.h>
-#include <linux/dcache.h>
-#include <linux/mount.h>
-#include <linux/utsname.h>
-#include <linux/magic.h>
-#include <linux/hrtimer.h>
-#include <linux/deferqueue.h>
-#include <linux/checkpoint.h>
-#include <linux/checkpoint_hdr.h>
-
-/* unique checkpoint identifier (FIXME: should be per-container ?) */
-static atomic_t ctx_count = ATOMIC_INIT(0);
-
-/**
- * ckpt_write_obj - write an object
- * @ctx: checkpoint context
- * @h: object descriptor
- */
-int ckpt_write_obj(struct ckpt_ctx *ctx, struct ckpt_hdr *h)
-{
-	_ckpt_debug(CKPT_DRW, "type %d len %d\n", h->type, h->len);
-	return ckpt_kwrite(ctx, h, h->len);
-}
-
-/**
- * ckpt_write_obj_type - write an object (from a pointer)
- * @ctx: checkpoint context
- * @ptr: buffer pointer
- * @len: buffer size
- * @type: desired type
- *
- * If @ptr is NULL, then write only the header (payload to follow)
- */
-int ckpt_write_obj_type(struct ckpt_ctx *ctx, void *ptr, int len, int type)
-{
-	struct ckpt_hdr *h;
-	int ret;
-
-	h = ckpt_hdr_get(ctx, sizeof(*h));
-	if (!h)
-		return -ENOMEM;
-
-	h->type = type;
-	h->len = len + sizeof(*h);
-
-	_ckpt_debug(CKPT_DRW, "type %d len %d\n", h->type, h->len);
-	ret = ckpt_kwrite(ctx, h, sizeof(*h));
-	if (ret < 0)
-		goto out;
-	if (ptr)
-		ret = ckpt_kwrite(ctx, ptr, len);
- out:
-	_ckpt_hdr_put(ctx, h, sizeof(*h));
-	return ret;
-}
-
-/**
- * ckpt_write_buffer - write an object of type buffer
- * @ctx: checkpoint context
- * @ptr: buffer pointer
- * @len: buffer size
- */
-int ckpt_write_buffer(struct ckpt_ctx *ctx, void *ptr, int len)
-{
-	return ckpt_write_obj_type(ctx, ptr, len, CKPT_HDR_BUFFER);
-}
-
-/**
- * ckpt_write_string - write an object of type string
- * @ctx: checkpoint context
- * @str: string pointer
- * @len: string length
- */
-int ckpt_write_string(struct ckpt_ctx *ctx, char *str, int len)
-{
-	return ckpt_write_obj_type(ctx, str, len, CKPT_HDR_STRING);
-}
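-
-/*
- * Typical caller pattern for the helpers above (a sketch; CKPT_HDR_FOO
- * is a placeholder record type, not one defined here):
- *
- *	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_FOO);
- *	if (!h)
- *		return -ENOMEM;
- *	... fill in h's payload fields ...
- *	ret = ckpt_write_obj(ctx, &h->h);
- *	ckpt_hdr_put(ctx, h);
- *
- * checkpoint_write_tail() below is a concrete instance of this pattern.
- */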
-
-/***********************************************************************
- * Checkpoint
- */
-
-static void fill_kernel_const(struct ckpt_const *h)
-{
-	struct task_struct *tsk;
-	struct new_utsname *uts;
-
-	/* task */
-	h->task_comm_len = sizeof(tsk->comm);
-	/* mm->saved_auxv size */
-	h->at_vector_size = AT_VECTOR_SIZE;
-	/* signal */
-	h->signal_nsig = _NSIG;
-	/* uts */
-	h->uts_sysname_len = sizeof(uts->sysname);
-	h->uts_nodename_len = sizeof(uts->nodename);
-	h->uts_release_len = sizeof(uts->release);
-	h->uts_version_len = sizeof(uts->version);
-	h->uts_machine_len = sizeof(uts->machine);
-	h->uts_domainname_len = sizeof(uts->domainname);
-	/* rlimit */
-	h->rlimit_nlimits = RLIM_NLIMITS;
-	/* tty */
-	h->n_tty_buf_size = N_TTY_BUF_SIZE;
-	h->tty_termios_ncc = NCC;
-}
-
-/* write the checkpoint header */
-static int checkpoint_write_header(struct ckpt_ctx *ctx)
-{
-	struct ckpt_hdr_header *h;
-	struct new_utsname *uts;
-	struct timeval ktv;
-	int ret;
-
-	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_HEADER);
-	if (!h)
-		return -ENOMEM;
-
-	do_gettimeofday(&ktv);
-	uts = utsname();
-
-	h->arch_id = cpu_to_le16(CKPT_ARCH_ID);  /* see asm/checkpoint.h */
-
-	h->magic = CHECKPOINT_MAGIC_HEAD;
-	h->major = (LINUX_VERSION_CODE >> 16) & 0xff;
-	h->minor = (LINUX_VERSION_CODE >> 8) & 0xff;
-	h->patch = (LINUX_VERSION_CODE) & 0xff;
-
-	h->rev = CHECKPOINT_VERSION;
-
-	h->uflags = ctx->uflags;
-	h->time = ktv.tv_sec;
-
-	fill_kernel_const(&h->constants);
-
-	ret = ckpt_write_obj(ctx, &h->h);
-	ckpt_hdr_put(ctx, h);
-	if (ret < 0)
-		return ret;
-
-	down_read(&uts_sem);
-	ret = ckpt_write_buffer(ctx, uts->release, sizeof(uts->release));
-	if (ret < 0)
-		goto up;
-	ret = ckpt_write_buffer(ctx, uts->version, sizeof(uts->version));
-	if (ret < 0)
-		goto up;
-	ret = ckpt_write_buffer(ctx, uts->machine, sizeof(uts->machine));
- up:
-	up_read(&uts_sem);
-	if (ret < 0)
-		return ret;
-
-	return checkpoint_write_header_arch(ctx);
-}
-
-/* write the container configuration section */
-static int checkpoint_container(struct ckpt_ctx *ctx)
-{
-	struct ckpt_hdr_container *h;
-	int ret;
-
-	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_CONTAINER);
-	if (!h)
-		return -ENOMEM;
-	ret = ckpt_write_obj(ctx, &h->h);
-	ckpt_hdr_put(ctx, h);
-
-	if (ret < 0)
-		return ret;
-
-	memset(ctx->lsm_name, 0, CHECKPOINT_LSM_NAME_MAX + 1);
-	strlcpy(ctx->lsm_name, security_get_lsm_name(),
-				CHECKPOINT_LSM_NAME_MAX + 1);
-	ret = ckpt_write_buffer(ctx, ctx->lsm_name,
-				CHECKPOINT_LSM_NAME_MAX + 1);
-	if (ret < 0)
-		return ret;
-
-	return security_checkpoint_header(ctx);
-}
-
-/* write the checkpoint trailer */
-static int checkpoint_write_tail(struct ckpt_ctx *ctx)
-{
-	struct ckpt_hdr_tail *h;
-	int ret;
-
-	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_TAIL);
-	if (!h)
-		return -ENOMEM;
-
-	h->magic = CHECKPOINT_MAGIC_TAIL;
-
-	ret = ckpt_write_obj(ctx, &h->h);
-	ckpt_hdr_put(ctx, h);
-	return ret;
-}
-
-/* dump all tasks in ctx->tasks_arr[] */
-static int checkpoint_all_tasks(struct ckpt_ctx *ctx)
-{
-	int n, ret = 0;
-
-	for (n = 0; n < ctx->nr_tasks; n++) {
-		ckpt_debug("dumping task #%d\n", n);
-		ret = checkpoint_task(ctx, ctx->tasks_arr[n]);
-		if (ret < 0)
-			break;
-	}
-
-	return ret;
-}
-
-static int may_checkpoint_task(struct ckpt_ctx *ctx, struct task_struct *t)
-{
-	struct task_struct *root = ctx->root_task;
-	struct nsproxy *nsproxy;
-	int ret = 0;
-
-	ckpt_debug("check %d\n", task_pid_nr_ns(t, ctx->root_nsproxy->pid_ns));
-
-	if (t->exit_state == EXIT_DEAD) {
-		_ckpt_err(ctx, -EBUSY, "%(T)Task state EXIT_DEAD\n");
-		return -EBUSY;
-	}
-
-	if (!ptrace_may_access(t, PTRACE_MODE_ATTACH)) {
-		_ckpt_err(ctx, -EPERM, "%(T)Ptrace attach denied\n");
-		return -EPERM;
-	}
-
-	/* zombies are cool (and also don't have nsproxy, below...) */
-	if (t->exit_state)
-		return 0;
-
-	/* verify that all tasks belong to the same freezer cgroup */
-	if (t != current && !in_same_cgroup_freezer(t, ctx->root_freezer)) {
-		_ckpt_err(ctx, -EBUSY, "%(T)Not frozen or wrong cgroup\n");
-		return -EBUSY;
-	}
-
-	/* FIX: add support for ptraced tasks */
-	if (task_ptrace(t)) {
-		_ckpt_err(ctx, -EBUSY, "%(T)Task is ptraced\n");
-		return -EBUSY;
-	}
-
-	/*
-	 * FIX: for now, disallow siblings of container init created
-	 * via CLONE_PARENT (unclear if they will remain possible)
-	 */
-	if (ctx->root_init && t != root &&
-	    t->real_parent == root->real_parent && t->tgid != root->tgid) {
-		_ckpt_err(ctx, -EINVAL, "%(T)Task is sibling of root\n");
-		return -EINVAL;
-	}
-
-	rcu_read_lock();
-	nsproxy = task_nsproxy(t);
-	/* no support for >1 private mntns */
-	if (nsproxy->mnt_ns != ctx->root_nsproxy->mnt_ns) {
-		_ckpt_err(ctx, -EPERM, "%(T)Nested mnt_ns unsupported\n");
-		ret = -EPERM;
-	}
-	/* no support for >1 private netns */
-	if (nsproxy->net_ns != ctx->root_nsproxy->net_ns) {
-		_ckpt_err(ctx, -EPERM, "%(T)Nested net_ns unsupported\n");
-		ret = -EPERM;
-	}
-	/* no support for >1 private pidns */
-	if (nsproxy->pid_ns != ctx->root_nsproxy->pid_ns) {
-		_ckpt_err(ctx, -EPERM, "%(T)Nested pid_ns unsupported\n");
-		ret = -EPERM;
-	}
-	rcu_read_unlock();
-
-	return ret;
-}
-
-#define CKPT_HDR_PIDS_CHUNK	256
-
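-/*
- * Image layout produced below (a sketch): one CKPT_HDR_BUFFER header
- * announcing nr_tasks * sizeof(struct ckpt_pids) bytes of payload,
- * followed by the ckpt_pids entries themselves, written up to
- * CKPT_HDR_PIDS_CHUNK at a time in the DFS order of ctx->tasks_arr[].
- */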
-static int checkpoint_pids(struct ckpt_ctx *ctx)
-{
-	struct ckpt_pids *h;
-	struct pid_namespace *ns;
-	struct task_struct *task;
-	struct task_struct **tasks_arr;
-	int nr_tasks, n, pos = 0, ret = 0;
-
-	ns = ctx->root_nsproxy->pid_ns;
-	tasks_arr = ctx->tasks_arr;
-	nr_tasks = ctx->nr_tasks;
-	BUG_ON(nr_tasks <= 0);
-
-	ret = ckpt_write_obj_type(ctx, NULL,
-				  sizeof(*h) * nr_tasks,
-				  CKPT_HDR_BUFFER);
-	if (ret < 0)
-		return ret;
-
-	h = ckpt_hdr_get(ctx, sizeof(*h) * CKPT_HDR_PIDS_CHUNK);
-	if (!h)
-		return -ENOMEM;
-
-	do {
-		rcu_read_lock();
-		for (n = 0; n < min(nr_tasks, CKPT_HDR_PIDS_CHUNK); n++) {
-			task = tasks_arr[pos];
-
-			h[n].vpid = task_pid_nr_ns(task, ns);
-			h[n].vtgid = task_tgid_nr_ns(task, ns);
-			h[n].vpgid = task_pgrp_nr_ns(task, ns);
-			h[n].vsid = task_session_nr_ns(task, ns);
-			h[n].vppid = task_tgid_nr_ns(task->real_parent, ns);
-			ckpt_debug("task[%d]: vpid %d vtgid %d parent %d\n",
-				   pos, h[n].vpid, h[n].vtgid, h[n].vppid);
-			pos++;
-		}
-		rcu_read_unlock();
-
-		n = min(nr_tasks, CKPT_HDR_PIDS_CHUNK);
-		ret = ckpt_kwrite(ctx, h, n * sizeof(*h));
-		if (ret < 0)
-			break;
-
-		nr_tasks -= n;
-	} while (nr_tasks > 0);
-
-	_ckpt_hdr_put(ctx, h, sizeof(*h) * CKPT_HDR_PIDS_CHUNK);
-	return ret;
-}
-
-static int collect_objects(struct ckpt_ctx *ctx)
-{
-	int n, ret = 0;
-
-	for (n = 0; n < ctx->nr_tasks; n++) {
-		ckpt_debug("dumping task #%d\n", n);
-		ret = ckpt_collect_task(ctx, ctx->tasks_arr[n]);
-		if (ret < 0) {
-			ctx->tsk = ctx->tasks_arr[n];
-			ckpt_err(ctx, ret, "%(T)Collect failed\n");
-			ctx->tsk = NULL;
-			break;
-		}
-	}
-
-	return ret;
-}
-
-struct ckpt_cnt_tasks {
-	struct ckpt_ctx *ctx;
-	int nr;
-};
-
-/* count number of tasks in tree (and optionally fill pid's in array) */
-static int __tree_count_tasks(struct task_struct *task, void *data)
-{
-	struct ckpt_cnt_tasks *d = (struct ckpt_cnt_tasks *) data;
-	struct ckpt_ctx *ctx = d->ctx;
-	int ret;
-
-	ctx->tsk = task;  /* (for _ckpt_err()) */
-
-	/* is this task cool ? */
-	ret = may_checkpoint_task(ctx, task);
-	if (ret < 0)
-		goto out;
-
-	if (ctx->tasks_arr) {
-		if (d->nr == ctx->nr_tasks) {  /* unlikely... try again later */
-			_ckpt_err(ctx, -EBUSY, "%(T)Bad task count (%d)\n",
-				  d->nr);
-			ret = -EBUSY;
-			goto out;
-		}
-		ctx->tasks_arr[d->nr++] = task;
-		get_task_struct(task);
-	}
-
-	ret = 1;
- out:
-	ctx->tsk = NULL;
-	return ret;
-}
-
-static int tree_count_tasks(struct ckpt_ctx *ctx)
-{
-	struct ckpt_cnt_tasks data;
-	int ret;
-
-	data.ctx = ctx;
-	data.nr = 0;
-
-	ckpt_msg_lock(ctx);
-	ret = walk_task_subtree(ctx->root_task, __tree_count_tasks, &data);
-	ckpt_msg_unlock(ctx);
-	if (ret < 0)
-		_ckpt_msg_complete(ctx);
-	return ret;
-}
-
-/*
- * build_tree - scan the tasks tree in DFS order and fill in array
- * @ctx: checkpoint context
- *
- * Using DFS order simplifies the restart logic to re-create the tasks.
- *
- * On success, ctx->tasks_arr will be allocated and populated with all
- * tasks (reference taken), and ctx->nr_tasks will hold the total count.
- * The array is cleaned up by ckpt_ctx_free().
- */
-static int build_tree(struct ckpt_ctx *ctx)
-{
-	int n, m;
-
-	/* count tasks (no side effects) */
-	n = tree_count_tasks(ctx);
-	if (n < 0)
-		return n;
-
-	ctx->nr_tasks = n;
-	ctx->tasks_arr = kzalloc(n * sizeof(*ctx->tasks_arr), GFP_KERNEL);
-	if (!ctx->tasks_arr)
-		return -ENOMEM;
-
-	/* count again (now will fill array) */
-	m = tree_count_tasks(ctx);
-
-	/* unlikely, but ... (cleanup in ckpt_ctx_free) */
-	if (m < 0)
-		return m;
-	else if (m != n)
-		return -EBUSY;
-
-	return 0;
-}
-
-/* dump the array that describes the tasks tree */
-static int checkpoint_tree(struct ckpt_ctx *ctx)
-{
-	struct ckpt_hdr_tree *h;
-	int ret;
-
-	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_TREE);
-	if (!h)
-		return -ENOMEM;
-
-	h->nr_tasks = ctx->nr_tasks;
-
-	ret = ckpt_write_obj(ctx, &h->h);
-	ckpt_hdr_put(ctx, h);
-	if (ret < 0)
-		return ret;
-
-	ret = checkpoint_pids(ctx);
-	return ret;
-}
-
-static struct task_struct *get_freezer_task(struct task_struct *root_task)
-{
-	struct task_struct *p;
-
-	/*
-	 * For the duration of checkpoint we deep-freeze all tasks.
-	 * Normally do it through the root task's freezer cgroup.
-	 * However, if the root task is also the current task (doing
-	 * self-checkpoint) we can't freeze ourselves. In this case,
-	 * choose the next available (non-dead) task instead. We'll
-	 * use its freezer cgroup to verify that all tasks belong to
-	 * the same cgroup.
-	 */
-
-	if (root_task != current) {
-		get_task_struct(root_task);
-		return root_task;
-	}
-
-	/* search among threads, then children */
-	read_lock(&tasklist_lock);
-
-	for (p = next_thread(root_task); p != root_task; p = next_thread(p)) {
-		if (p->state == TASK_DEAD)
-			continue;
-		if (!in_same_cgroup_freezer(p, root_task))
-			goto out;
-	}
-
-	list_for_each_entry(p, &root_task->children, sibling) {
-		if (p->state == TASK_DEAD)
-			continue;
-		if (!in_same_cgroup_freezer(p, root_task))
-			goto out;
-	}
-
-	p = NULL;
- out:
-	read_unlock(&tasklist_lock);
-	if (p)
-		get_task_struct(p);
-	return p;
-}
-
-/* setup checkpoint-specific parts of ctx */
-static int init_checkpoint_ctx(struct ckpt_ctx *ctx, pid_t pid)
-{
-	struct task_struct *task;
-	struct nsproxy *nsproxy;
-	struct fs_struct *fs;
-
-	/*
-	 * No need for explicit cleanup here, because if an error
-	 * occurs then ckpt_ctx_free() is eventually called.
-	 */
-
-	ctx->root_pid = pid;
-
-	/* root task */
-	read_lock(&tasklist_lock);
-	task = find_task_by_vpid(pid);
-	if (task)
-		get_task_struct(task);
-	read_unlock(&tasklist_lock);
-	if (!task)
-		return -ESRCH;
-	else
-		ctx->root_task = task;
-
-	/* root nsproxy */
-	rcu_read_lock();
-	nsproxy = task_nsproxy(task);
-	if (nsproxy)
-		get_nsproxy(nsproxy);
-	rcu_read_unlock();
-	if (!nsproxy)
-		return -ESRCH;
-	else
-		ctx->root_nsproxy = nsproxy;
-
-	/* root freezer */
-	ctx->root_freezer = get_freezer_task(task);
-
-	/* container init ? */
-	ctx->root_init = is_container_init(task);
-
-	if (!(ctx->uflags & CHECKPOINT_SUBTREE) && !ctx->root_init) {
-		ckpt_err(ctx, -EINVAL, "Not container init\n");
-		return -EINVAL;  /* cleanup by ckpt_ctx_free() */
-	}
-
-	/* root vfs (FIX: WILL CHANGE with mnt-ns etc) */
-	task_lock(ctx->root_task);
-	fs = ctx->root_task->fs;
-	read_lock(&fs->lock);
-	ctx->root_fs_path = fs->root;
-	path_get(&ctx->root_fs_path);
-	read_unlock(&fs->lock);
-	task_unlock(ctx->root_task);
-
-	return 0;
-}
-
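-/*
- * do_checkpoint() drives the whole operation. On success the resulting
- * image contains, in order (a sketch of the stream, per the call
- * sequence below): the header and arch header, the container section,
- * the task tree, the per-task state, and the tail record.
- */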
-long do_checkpoint(struct ckpt_ctx *ctx, pid_t pid)
-{
-	long ret;
-
-	ret = init_checkpoint_ctx(ctx, pid);
-	if (ret < 0)
-		return ret;
-
-	if (ctx->root_freezer) {
-		ret = cgroup_freezer_begin_checkpoint(ctx->root_freezer);
-		if (ret < 0) {
-			ckpt_err(ctx, ret, "Freezer cgroup failed\n");
-			return ret;
-		}
-	}
-
-	ret = build_tree(ctx);
-	if (ret < 0)
-		goto out;
-
-	if (!(ctx->uflags & CHECKPOINT_SUBTREE)) {
-		/*
-		 * Verify that all objects are contained (no leaks):
-		 * First collect them all into the objhash while counting
-		 * users, and then compare to the objects' real user counts.
-		 */
-		ret = collect_objects(ctx);
-		if (ret < 0)
-			goto out;
-		if (!ckpt_obj_contained(ctx)) {
-			ret = -EBUSY;
-			goto out;
-		}
-	}
-
-	ret = checkpoint_write_header(ctx);
-	if (ret < 0)
-		goto out;
-	ret = checkpoint_container(ctx);
-	if (ret < 0)
-		goto out;
-	ret = checkpoint_tree(ctx);
-	if (ret < 0)
-		goto out;
-	ret = checkpoint_all_tasks(ctx);
-	if (ret < 0)
-		goto out;
-
-	ret = deferqueue_run(ctx->deferqueue);  /* run deferred work */
-	if (ret < 0)
-		goto out;
-
-	/* verify that all objects were indeed visited */
-	if (!ckpt_obj_visited(ctx)) {
-		ckpt_err(ctx, -EBUSY, "Leak: unvisited\n");
-		ret = -EBUSY;
-		goto out;
-	}
-
-	ret = checkpoint_write_tail(ctx);
-	if (ret < 0)
-		goto out;
-
-	/* on success, return (unique) checkpoint identifier */
-	ctx->crid = atomic_inc_return(&ctx_count);
-	ret = ctx->crid;
- out:
-	if (ret < 0)
-		ckpt_set_error(ctx, ret);
-	else
-		ckpt_set_success(ctx);
-
-	if (ctx->root_freezer)
-		cgroup_freezer_end_checkpoint(ctx->root_freezer);
-	return ret;
-}
diff --git a/checkpoint/objhash.c b/checkpoint/objhash.c
deleted file mode 100644
index 70c54f5..0000000
--- a/checkpoint/objhash.c
+++ /dev/null
@@ -1,1083 +0,0 @@
-/*
- *  Checkpoint-restart - object hash infrastructure to manage shared objects
- *
- *  Copyright (C) 2008-2009 Oren Laadan
- *
- *  This file is subject to the terms and conditions of the GNU General Public
- *  License.  See the file COPYING in the main directory of the Linux
- *  distribution for more details.
- */
-
-/* default debug level for output */
-#define CKPT_DFLAG  CKPT_DOBJ
-
-#include <linux/kernel.h>
-#include <linux/hash.h>
-#include <linux/file.h>
-#include <linux/fdtable.h>
-#include <linux/fs_struct.h>
-#include <linux/sched.h>
-#include <linux/kref.h>
-#include <linux/ipc_namespace.h>
-#include <linux/user_namespace.h>
-#include <linux/mnt_namespace.h>
-#include <linux/checkpoint.h>
-#include <linux/checkpoint_hdr.h>
-#include <net/sock.h>
-
-struct ckpt_obj {
-	int users;
-	int objref;
-	int flags;
-	void *ptr;
-	const struct ckpt_obj_ops *ops;
-	struct hlist_node hash;
-	struct hlist_node next;
-};
-
-/* object internal flags */
-#define CKPT_OBJ_CHECKPOINTED		0x1   /* object already checkpointed */
-#define CKPT_OBJ_VISITED		0x2   /* object already visited */
-
-struct ckpt_obj_hash {
-	struct hlist_head *head;
-	struct hlist_head list;
-	int next_free_objref;
-};
-
-/* helper grab/drop/users functions */
-
-static int obj_inode_grab(void *ptr)
-{
-	return igrab((struct inode *) ptr) ? 0 : -EBADF;
-}
-
-static void obj_inode_drop(void *ptr, int lastref)
-{
-	iput((struct inode *) ptr);
-}
-
-static int obj_file_table_grab(void *ptr)
-{
-	atomic_inc(&((struct files_struct *) ptr)->count);
-	return 0;
-}
-
-static void obj_file_table_drop(void *ptr, int lastref)
-{
-	put_files_struct((struct files_struct *) ptr);
-}
-
-static int obj_file_table_users(void *ptr)
-{
-	return atomic_read(&((struct files_struct *) ptr)->count);
-}
-
-static int obj_file_grab(void *ptr)
-{
-	get_file((struct file *) ptr);
-	return 0;
-}
-
-static void obj_file_drop(void *ptr, int lastref)
-{
-	fput((struct file *) ptr);
-}
-
-static int obj_file_users(void *ptr)
-{
-	return atomic_long_read(&((struct file *) ptr)->f_count);
-}
-
-static int obj_fs_grab(void *ptr)
-{
-	get_fs_struct((struct fs_struct *) ptr);
-	return 0;
-}
-
-static void obj_fs_drop(void *ptr, int lastref)
-{
-	put_fs_struct((struct fs_struct *) ptr);
-}
-
-static int obj_fs_users(void *ptr)
-{
-	/*
-	 * It's safe to not use fs->lock because the fs is referenced.
-	 * It's also sufficient for leak detection: with no leak the
-	 * count can't change; with a leak it will be too big already
-	 * (even if it's about to grow), and if it's about to shrink
-	 * then it's as if we sampled the count a bit earlier.
-	 */
-	return ((struct fs_struct *) ptr)->users;
-}
-
-static int obj_ipc_ns_grab(void *ptr)
-{
-	get_ipc_ns((struct ipc_namespace *) ptr);
-	return 0;
-}
-
-static void obj_ipc_ns_drop(void *ptr, int lastref)
-{
-	put_ipc_ns((struct ipc_namespace *) ptr);
-}
-
-static int obj_ipc_ns_users(void *ptr)
-{
-	return atomic_read(&((struct ipc_namespace *) ptr)->count);
-}
-
-static int obj_mnt_ns_grab(void *ptr)
-{
-	get_mnt_ns((struct mnt_namespace *) ptr);
-	return 0;
-}
-
-static void obj_mnt_ns_drop(void *ptr, int lastref)
-{
-	put_mnt_ns((struct mnt_namespace *) ptr);
-}
-
-static int obj_mnt_ns_users(void *ptr)
-{
-	return atomic_read(&((struct mnt_namespace *) ptr)->count);
-}
-
-static int obj_cred_grab(void *ptr)
-{
-	get_cred((struct cred *) ptr);
-	return 0;
-}
-
-static void obj_cred_drop(void *ptr, int lastref)
-{
-	put_cred((struct cred *) ptr);
-}
-
-static int obj_user_grab(void *ptr)
-{
-	struct user_struct *u = ptr;
-	(void) get_uid(u);
-	return 0;
-}
-
-static void obj_user_drop(void *ptr, int lastref)
-{
-	free_uid((struct user_struct *) ptr);
-}
-
-static int obj_groupinfo_grab(void *ptr)
-{
-	get_group_info((struct group_info *) ptr);
-	return 0;
-}
-
-static void obj_groupinfo_drop(void *ptr, int lastref)
-{
-	put_group_info((struct group_info *) ptr);
-}
-
-static int obj_sock_grab(void *ptr)
-{
-	sock_hold((struct sock *) ptr);
-	return 0;
-}
-
-static void obj_sock_drop(void *ptr, int lastref)
-{
-	struct sock *sk = (struct sock *) ptr;
-
-	/*
-	 * Sockets created during restart are graft()ed, i.e. have a
-	 * valid @sk->sk_socket. Because only an fput() results in the
-	 * necessary sock_release(), we may leak the struct socket of
-	 * sockets that were not attached to a file. Therefore, if
-	 * @lastref is set, we hereby invoke sock_release() on sockets
-	 * that we have put into the objhash but were never attached
-	 * to a file.
-	 */
-	if (lastref && sk->sk_socket && !sk->sk_socket->file) {
-		struct socket *sock = sk->sk_socket;
-		sock_orphan(sk);
-		sock->sk = NULL;
-		sock_release(sock);
-	}
-
-	sock_put((struct sock *) ptr);
-}
-
-static int obj_sock_users(void *ptr)
-{
-	return atomic_read(&((struct sock *) ptr)->sk_refcnt);
-}
-
-static int obj_tty_grab(void *ptr)
-{
-	tty_kref_get((struct tty_struct *) ptr);
-	return 0;
-}
-
-static void obj_tty_drop(void *ptr, int lastref)
-{
-	tty_kref_put((struct tty_struct *) ptr);
-}
-
-static int obj_tty_users(void *ptr)
-{
-	return atomic_read(&((struct tty_struct *) ptr)->kref.refcount);
-}
-
-void lsm_string_free(struct kref *kref)
-{
-	struct ckpt_lsm_string *s = container_of(kref, struct ckpt_lsm_string,
-					kref);
-	kfree(s->string);
-	kfree(s);
-}
-
-static int lsm_string_grab(void *ptr)
-{
-	struct ckpt_lsm_string *s = ptr;
-	kref_get(&s->kref);
-	return 0;
-}
-
-static void lsm_string_drop(void *ptr, int lastref)
-{
-	struct ckpt_lsm_string *s = ptr;
-	kref_put(&s->kref, lsm_string_free);
-}
-
-/* security context strings */
-static int checkpoint_lsm_string(struct ckpt_ctx *ctx, void *ptr);
-static struct ckpt_lsm_string *restore_lsm_string(struct ckpt_ctx *ctx);
-static void *restore_lsm_string_wrap(struct ckpt_ctx *ctx)
-{
-	return (void *)restore_lsm_string(ctx);
-}
-
-/* ignored object */
-static const struct ckpt_obj_ops ckpt_obj_ignored_ops = {
-	.obj_name = "IGNORED",
-	.obj_type = CKPT_OBJ_IGNORE,
-	.ref_drop = NULL,
-	.ref_grab = NULL,
-};
-
-/* inode object */
-static const struct ckpt_obj_ops ckpt_obj_inode_ops = {
-	.obj_name = "INODE",
-	.obj_type = CKPT_OBJ_INODE,
-	.ref_drop = obj_inode_drop,
-	.ref_grab = obj_inode_grab,
-};
-
-/* files_struct object */
-static const struct ckpt_obj_ops ckpt_obj_files_struct_ops = {
-	.obj_name = "FILE_TABLE",
-	.obj_type = CKPT_OBJ_FILE_TABLE,
-	.ref_drop = obj_file_table_drop,
-	.ref_grab = obj_file_table_grab,
-	.ref_users = obj_file_table_users,
-	.checkpoint = checkpoint_file_table,
-	.restore = restore_file_table,
-};
-/* file object */
-static const struct ckpt_obj_ops ckpt_obj_file_ops = {
-	.obj_name = "FILE",
-	.obj_type = CKPT_OBJ_FILE,
-	.ref_drop = obj_file_drop,
-	.ref_grab = obj_file_grab,
-	.ref_users = obj_file_users,
-	.checkpoint = checkpoint_file,
-	.restore = restore_file,
-};
-/* fs object */
-static const struct ckpt_obj_ops ckpt_obj_fs_ops = {
-	.obj_name = "FS",
-	.obj_type = CKPT_OBJ_FS,
-	.ref_drop = obj_fs_drop,
-	.ref_grab = obj_fs_grab,
-	.ref_users = obj_fs_users,
-	.checkpoint = checkpoint_fs,
-	.restore = restore_fs,
-};
-/* ipc_ns object */
-static const struct ckpt_obj_ops ckpt_obj_ipc_ns_ops = {
-	.obj_name = "IPC_NS",
-	.obj_type = CKPT_OBJ_IPC_NS,
-	.ref_drop = obj_ipc_ns_drop,
-	.ref_grab = obj_ipc_ns_grab,
-	.ref_users = obj_ipc_ns_users,
-	.checkpoint = checkpoint_ipc_ns,
-	.restore = restore_ipc_ns,
-};
-/* mnt_ns object */
-static const struct ckpt_obj_ops ckpt_obj_mnt_ns_ops = {
-	.obj_name = "MOUNTS NS",
-	.obj_type = CKPT_OBJ_MNT_NS,
-	.ref_grab = obj_mnt_ns_grab,
-	.ref_drop = obj_mnt_ns_drop,
-	.ref_users = obj_mnt_ns_users,
-};
-/* struct cred */
-static const struct ckpt_obj_ops ckpt_obj_cred_ops = {
-	.obj_name = "CRED",
-	.obj_type = CKPT_OBJ_CRED,
-	.ref_drop = obj_cred_drop,
-	.ref_grab = obj_cred_grab,
-	.checkpoint = checkpoint_cred,
-	.restore = restore_cred,
-};
-/* user object */
-static const struct ckpt_obj_ops ckpt_obj_user_ops = {
-	.obj_name = "USER",
-	.obj_type = CKPT_OBJ_USER,
-	.ref_drop = obj_user_drop,
-	.ref_grab = obj_user_grab,
-	.checkpoint = checkpoint_user,
-	.restore = restore_user,
-};
-/* struct groupinfo */
-static const struct ckpt_obj_ops ckpt_obj_groupinfo_ops = {
-	.obj_name = "GROUPINFO",
-	.obj_type = CKPT_OBJ_GROUPINFO,
-	.ref_drop = obj_groupinfo_drop,
-	.ref_grab = obj_groupinfo_grab,
-	.checkpoint = checkpoint_groupinfo,
-	.restore = restore_groupinfo,
-};
-/* sock object */
-static const struct ckpt_obj_ops ckpt_obj_sock_ops = {
-	.obj_name = "SOCKET",
-	.obj_type = CKPT_OBJ_SOCK,
-	.ref_drop = obj_sock_drop,
-	.ref_grab = obj_sock_grab,
-	.ref_users = obj_sock_users,
-	.checkpoint = checkpoint_sock,
-	.restore = restore_sock,
-};
-/* struct tty_struct */
-static const struct ckpt_obj_ops ckpt_obj_tty_ops = {
-	.obj_name = "TTY",
-	.obj_type = CKPT_OBJ_TTY,
-	.ref_drop = obj_tty_drop,
-	.ref_grab = obj_tty_grab,
-	.ref_users = obj_tty_users,
-	.checkpoint = checkpoint_tty,
-	.restore = restore_tty,
-};
-/*
- * LSM void *security on objhash - at checkpoint
- * We don't take a ref because we won't be doing
- * anything more with this void* - unless we happen
- * to run into it again through some other objects's
- * ->security (in which case that object has it pinned).
- */
-static const struct ckpt_obj_ops ckpt_obj_security_ptr_ops = {
-	.obj_name = "SECURITY PTR",
-	.obj_type = CKPT_OBJ_SECURITY_PTR,
-	.ref_drop = NULL,
-	.ref_grab = NULL,
-};
-/*
- * LSM security strings - at restart
- * This is a struct which we malloc during restart and
- * must be freed (by objhash cleanup) at the end of
- * restart
- */
-static const struct ckpt_obj_ops ckpt_obj_security_strings_ops = {
-	.obj_name = "SECURITY STRING",
-	.obj_type = CKPT_OBJ_SECURITY,
-	.ref_grab = lsm_string_grab,
-	.ref_drop = lsm_string_drop,
-	.checkpoint = checkpoint_lsm_string,
-	.restore = restore_lsm_string_wrap,
-};
-
-static const struct ckpt_obj_ops *ckpt_obj_ops[] = {
-	[CKPT_OBJ_IGNORE] = &ckpt_obj_ignored_ops,
-	[CKPT_OBJ_INODE] = &ckpt_obj_inode_ops,
-	[CKPT_OBJ_FILE_TABLE] = &ckpt_obj_files_struct_ops,
-	[CKPT_OBJ_FILE] = &ckpt_obj_file_ops,
-	[CKPT_OBJ_FS] = &ckpt_obj_fs_ops,
-	[CKPT_OBJ_IPC_NS] = &ckpt_obj_ipc_ns_ops,
-	[CKPT_OBJ_MNT_NS] = &ckpt_obj_mnt_ns_ops,
-	[CKPT_OBJ_USER_NS] = &ckpt_obj_mnt_ns_ops,
-	[CKPT_OBJ_CRED] = &ckpt_obj_cred_ops,
-	[CKPT_OBJ_USER] = &ckpt_obj_user_ops,
-	[CKPT_OBJ_GROUPINFO] = &ckpt_obj_groupinfo_ops,
-	[CKPT_OBJ_SOCK] = &ckpt_obj_sock_ops,
-	[CKPT_OBJ_TTY] = &ckpt_obj_tty_ops,
-	[CKPT_OBJ_SECURITY_PTR] = &ckpt_obj_security_ptr_ops,
-	[CKPT_OBJ_SECURITY] = &ckpt_obj_security_strings_ops,
-};
-
-void register_checkpoint_obj(const struct ckpt_obj_ops *ops)
-{
-	ckpt_obj_ops[ops->obj_type] = ops;
-}
-
-#define CKPT_OBJ_HASH_NBITS  10
-#define CKPT_OBJ_HASH_TOTAL  (1UL << CKPT_OBJ_HASH_NBITS)
-
-static void obj_hash_clear(struct ckpt_obj_hash *obj_hash)
-{
-	struct hlist_head *h = obj_hash->head;
-	struct hlist_node *n, *t;
-	struct ckpt_obj *obj;
-	int i;
-
-	for (i = 0; i < CKPT_OBJ_HASH_TOTAL; i++) {
-		hlist_for_each_entry_safe(obj, n, t, &h[i], hash) {
-			if (obj->ops->ref_drop)
-				obj->ops->ref_drop(obj->ptr, 1);
-			kfree(obj);
-		}
-	}
-}
-
-void ckpt_obj_hash_free(struct ckpt_ctx *ctx)
-{
-	struct ckpt_obj_hash *obj_hash = ctx->obj_hash;
-
-	if (obj_hash) {
-		obj_hash_clear(obj_hash);
-		kfree(obj_hash->head);
-		kfree(ctx->obj_hash);
-		ctx->obj_hash = NULL;
-	}
-}
-
-int ckpt_obj_hash_alloc(struct ckpt_ctx *ctx)
-{
-	struct ckpt_obj_hash *obj_hash;
-	struct hlist_head *head;
-
-	obj_hash = kzalloc(sizeof(*obj_hash), GFP_KERNEL);
-	if (!obj_hash)
-		return -ENOMEM;
-	head = kzalloc(CKPT_OBJ_HASH_TOTAL * sizeof(*head), GFP_KERNEL);
-	if (!head) {
-		kfree(obj_hash);
-		return -ENOMEM;
-	}
-
-	obj_hash->head = head;
-	obj_hash->next_free_objref = 1;
-	INIT_HLIST_HEAD(&obj_hash->list);
-
-	ctx->obj_hash = obj_hash;
-	return 0;
-}
-
-static struct ckpt_obj *obj_find_by_ptr(struct ckpt_ctx *ctx, void *ptr)
-{
-	struct hlist_head *h;
-	struct hlist_node *n;
-	struct ckpt_obj *obj;
-
-	h = &ctx->obj_hash->head[hash_long((unsigned long) ptr,
-					   CKPT_OBJ_HASH_NBITS)];
-	hlist_for_each_entry(obj, n, h, hash)
-		if (obj->ptr == ptr)
-			return obj;
-	return NULL;
-}
-
-static struct ckpt_obj *obj_find_by_objref(struct ckpt_ctx *ctx, int objref)
-{
-	struct hlist_head *h;
-	struct hlist_node *n;
-	struct ckpt_obj *obj;
-
-	h = &ctx->obj_hash->head[hash_long((unsigned long) objref,
-					   CKPT_OBJ_HASH_NBITS)];
-	hlist_for_each_entry(obj, n, h, hash)
-		if (obj->objref == objref)
-			return obj;
-	return NULL;
-}
-
-static inline int obj_alloc_objref(struct ckpt_ctx *ctx)
-{
-	return ctx->obj_hash->next_free_objref++;
-}
-
-/**
- * obj_new - add an object to the obj_hash
- * @ctx: checkpoint context
- * @ptr: pointer to object
- * @objref: object unique id
- * @type: object type
- *
- * Add the object to the obj_hash. If @objref is zero, assign a unique
- * object id and use @ptr as a hash key [checkpoint]. Else use @objref
- * as a key [restart].
- */
-static struct ckpt_obj *obj_new(struct ckpt_ctx *ctx, void *ptr,
-				int objref, enum obj_type type)
-{
-	const struct ckpt_obj_ops *ops = ckpt_obj_ops[type];
-	struct ckpt_obj *obj;
-	int i, ret;
-
-	/* explicitly disallow null pointers */
-	BUG_ON(!ptr);
-	/* make sure we don't change this accidentally */
-	BUG_ON(ops->obj_type != type);
-
-	obj = kzalloc(sizeof(*obj), GFP_KERNEL);
-	if (!obj)
-		return ERR_PTR(-ENOMEM);
-
-	obj->ptr = ptr;
-	obj->ops = ops;
-	obj->users = 2;  /* extra reference that objhash itself takes */
-
-	if (!objref) {
-		/* use @obj->ptr to index, assign objref (checkpoint) */
-		obj->objref = obj_alloc_objref(ctx);
-		i = hash_long((unsigned long) ptr, CKPT_OBJ_HASH_NBITS);
-	} else {
-		/* use @obj->objref to index (restart) */
-		obj->objref = objref;
-		i = hash_long((unsigned long) objref, CKPT_OBJ_HASH_NBITS);
-	}
-
-	if (ops->ref_grab)
-		ret = ops->ref_grab(obj->ptr);
-	else
-		ret = 0;
-	if (ret < 0) {
-		kfree(obj);
-		obj = ERR_PTR(ret);
-	} else {
-		hlist_add_head(&obj->hash, &ctx->obj_hash->head[i]);
-		hlist_add_head(&obj->next, &ctx->obj_hash->list);
-	}
-
-	return obj;
-}
-
-/**************************************************************************
- * Checkpoint
- */
-
-/**
- * obj_lookup_add - lookup object and add if not in objhash
- * @ctx: checkpoint context
- * @ptr: pointer to object
- * @type: object type
- * @first: [output] first encounter (added to table)
- *
- * Look up the object pointed to by @ptr in the hash table. If it isn't
- * already found there, add the object, and allocate a unique object
- * id. Grab a reference to every object that is added, and maintain the
- * reference until the entire hash is freed.
- */
-static struct ckpt_obj *obj_lookup_add(struct ckpt_ctx *ctx, void *ptr,
-				       enum obj_type type, int *first)
-{
-	struct ckpt_obj *obj;
-
-	obj = obj_find_by_ptr(ctx, ptr);
-	if (!obj) {
-		obj = obj_new(ctx, ptr, 0, type);
-		*first = 1;
-	} else {
-		BUG_ON(obj->ops->obj_type != type);
-		obj->users++;
-		*first = 0;
-	}
-	return obj;
-}
-
-/**
- * ckpt_obj_collect - collect object into objhash
- * @ctx: checkpoint context
- * @ptr: pointer to object
- * @type: object type
- *
- * [used during checkpoint].
- * Return: objref if object is new, 0 otherwise, or an error
- */
-int ckpt_obj_collect(struct ckpt_ctx *ctx, void *ptr, enum obj_type type)
-{
-	struct ckpt_obj *obj;
-	int first;
-
-	obj = obj_lookup_add(ctx, ptr, type, &first);
-	if (IS_ERR(obj))
-		return PTR_ERR(obj);
-	ckpt_debug("%s objref %d first %d\n",
-		   obj->ops->obj_name, obj->objref, first);
-	return first ? obj->objref : 0;
-}
-
-/**
- * ckpt_obj_lookup - lookup object (by pointer) in objhash
- * @ctx: checkpoint context
- * @ptr: pointer to object
- * @type: object type
- *
- * [used during checkpoint].
- * Return: objref (or zero if not found)
- */
-int ckpt_obj_lookup(struct ckpt_ctx *ctx, void *ptr, enum obj_type type)
-{
-	struct ckpt_obj *obj;
-
-	obj = obj_find_by_ptr(ctx, ptr);
-	BUG_ON(obj && obj->ops->obj_type != type);
-	if (obj)
-		ckpt_debug("%s objref %d\n", obj->ops->obj_name, obj->objref);
-	return obj ? obj->objref : 0;
-}
-
-static inline int obj_reverse_leak(struct ckpt_ctx *ctx, struct ckpt_obj *obj)
-{
-	/*
-	 * A "reverse" leak ?  All objects should already be in the
-	 * objhash by now. But an outside task may have created an
-	 * object while we were collecting, which we didn't catch.
-	 */
-	if (obj->ops->ref_users && !(ctx->uflags & CHECKPOINT_SUBTREE)) {
-		ckpt_err(ctx, -EBUSY, "%(O)%(P)Leak: reverse added late (%s)\n",
-			       obj->objref, obj->ptr, obj->ops->obj_name);
-		return -EBUSY;
-	}
-	return 0;
-}
-
-/**
- * ckpt_obj_lookup_add - lookup object and add if not in objhash
- * @ctx: checkpoint context
- * @ptr: pointer to object
- * @type: object type
- * @first: [output] first encounter (added to table)
- *
- * [used during checkpoint].
- * Return: objref
- */
-int ckpt_obj_lookup_add(struct ckpt_ctx *ctx, void *ptr,
-			enum obj_type type, int *first)
-{
-	struct ckpt_obj *obj;
-
-	obj = obj_lookup_add(ctx, ptr, type, first);
-	if (IS_ERR(obj))
-		return PTR_ERR(obj);
-	ckpt_debug("%s objref %d first %d\n",
-		   obj->ops->obj_name, obj->objref, *first);
-
-	if (*first && obj_reverse_leak(ctx, obj))
-		return -EBUSY;
-
-	obj->flags |= CKPT_OBJ_VISITED;
-	return obj->objref;
-}
-
-/**
- * ckpt_obj_reserve - reserve an objref
- * @ctx: checkpoint context
- *
- * The reserved objref will not be used for subsequent objects. This
- * gives an objref that can be safely used during restart without a
- * matching object in checkpoint.  [used during checkpoint].
- */
-int ckpt_obj_reserve(struct ckpt_ctx *ctx)
-{
-	return obj_alloc_objref(ctx);
-}
-
-/**
- * checkpoint_obj - if not already in hash, add object and checkpoint
- * @ctx: checkpoint context
- * @ptr: pointer to object
- * @type: object type
- *
- * Use obj_lookup_add() to lookup (and possibly add) the object to the
- * hash table. If the CKPT_OBJ_CHECKPOINTED flag isn't set, then also
- * save the object's state using its ops->checkpoint().
- *
- * [This is used during checkpoint].
- * Returns: objref
- */
-int checkpoint_obj(struct ckpt_ctx *ctx, void *ptr, enum obj_type type)
-{
-	struct ckpt_hdr_objref *h;
-	struct ckpt_obj *obj;
-	int new, ret = 0;
-
-	obj = obj_lookup_add(ctx, ptr, type, &new);
-	if (IS_ERR(obj))
-		return PTR_ERR(obj);
-
-	if (new && obj_reverse_leak(ctx, obj))
-		return -EBUSY;
-
-	if (!(obj->flags & CKPT_OBJ_CHECKPOINTED)) {
-		h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_OBJREF);
-		if (!h)
-			return -ENOMEM;
-
-		h->objtype = type;
-		h->objref = obj->objref;
-		ret = ckpt_write_obj(ctx, &h->h);
-		ckpt_hdr_put(ctx, h);
-
-		if (ret < 0)
-			return ret;
-
-		/* invoke callback to actually dump the state */
-		BUG_ON(!obj->ops->checkpoint);
-
-		obj->flags |= CKPT_OBJ_CHECKPOINTED;
-		ret = obj->ops->checkpoint(ctx, ptr);
-	}
-
-	obj->flags |= CKPT_OBJ_VISITED;
-	return (ret < 0 ? ret : obj->objref);
-}
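-
-/*
- * A typical checkpoint_obj() caller records the returned objref in its
- * own header so that restart can later map it back to the restored
- * object via ckpt_obj_fetch(). A sketch, following checkpoint_task_ns()
- * in process.c:
- *
- *	ns_objref = checkpoint_obj(ctx, nsproxy, CKPT_OBJ_NS);
- *	if (ns_objref < 0)
- *		return ns_objref;
- *	h->ns_objref = ns_objref;
- */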
-
-/**
- * ckpt_obj_visit - mark object as visited
- * @ctx: checkpoint context
- * @ptr: pointer to object
- * @type: object type
- *
- * [used during checkpoint].
- * Marks the object as visited, or fail if not found
- */
-int ckpt_obj_visit(struct ckpt_ctx *ctx, void *ptr, enum obj_type type)
-{
-	struct ckpt_obj *obj;
-
-	obj = obj_find_by_ptr(ctx, ptr);
-	BUG_ON(obj && obj->ops->obj_type != type);
-
-	if (!obj) {
-		if (!(ctx->uflags & CHECKPOINT_SUBTREE)) {
-			/* if not found report reverse leak (full container) */
-			ckpt_err(ctx, -EBUSY,
-				 "%(P)Leak: reverse unknown (type %d)\n",
-				 ptr, type);
-			return -EBUSY;
-		}
-	} else {
-		ckpt_debug("visit %s objref %d\n",
-			   obj->ops->obj_name, obj->objref);
-		obj->flags |= CKPT_OBJ_VISITED;
-	}
-	return 0;
-}
-
-/* increment the 'users' count of an object */
-static void ckpt_obj_users_inc(struct ckpt_ctx *ctx, void *ptr, int increment)
-{
-	struct ckpt_obj *obj;
-
-	obj = obj_find_by_ptr(ctx, ptr);
-	if (obj)
-		obj->users += increment;
-}
-
-/*
- * "Leak detection" - to guarantee a consistent checkpoint of a full
- * container we verify that all resources are confined and isolated in
- * that container:
- *
- * c/r code first walks through all tasks and collects all shared
- * resources into the objhash, while counting the references to them;
- * then, it compares this count to the object's real reference count,
- * and if they don't match it means that an object has "leaked" to the
- * outside.
- *
- * Otherwise, it is guaranteed that there are no references outside
- * (of container). c/r code now proceeds to walk through all tasks,
- * again, and checkpoints the resources. It ensures that all resources
- * are already in the objhash, and that all of them are checkpointed.
- * Otherwise it means that due to a race, an object was created or
- * destroyed during the first walk but not accounted for.
- *
- * For instance, consider an outside task A that shares a files_struct
- * with inside task B. Then, after B's files were collected, A opens
- * or closes a file and immediately exits - all before the first leak
- * test is performed, such that the test passes.
- */
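-
-/*
- * In code, the protocol above maps onto the sequence in do_checkpoint()
- * (checkpoint.c), roughly:
- *
- *	collect_objects(ctx);               first walk: count references
- *	if (!ckpt_obj_contained(ctx))       counts must match the kernel's
- *		fail;
- *	checkpoint_all_tasks(ctx);          second walk: dump state
- *	if (!ckpt_obj_visited(ctx))         every object must be visited
- *		fail;
- */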
-
-/**
- * obj_sock_adjust_users - remove implicit reference on DEAD sockets
- * @obj: CKPT_OBJ_SOCK object to adjust
- *
- * Sockets that have been disconnected from their struct file have
- * a reference count one less than normal sockets.  The objhash's
- * assumption of such a reference is therefore incorrect, so we correct
- * it here.
- */
-static inline void obj_sock_adjust_users(struct ckpt_obj *obj)
-{
-	struct sock *sk = (struct sock *)obj->ptr;
-
-	if (sock_flag(sk, SOCK_DEAD)) {
-		obj->users--;
-		ckpt_debug("Adjusting SOCK %i count to %i\n",
-			   obj->objref, obj->users);
-	}
-}
-
-/**
- * ckpt_obj_contained - test if shared objects are contained in checkpoint
- * @ctx: checkpoint context
- *
- * Loops through all objects in the table and compares the number of
- * references accumulated during checkpoint, with the reference count
- * reported by the kernel.
- *
- * Return 1 if respective counts match for all objects, 0 otherwise.
- */
-int ckpt_obj_contained(struct ckpt_ctx *ctx)
-{
-	struct ckpt_obj *obj;
-	struct hlist_node *node;
-
-	/* account for ctx->{file,logfile} (if in the table already) */
-	ckpt_obj_users_inc(ctx, ctx->file, 1);
-	if (ctx->logfile)
-		ckpt_obj_users_inc(ctx, ctx->logfile, 1);
-	/* account for ctx->root_nsproxy (if in the table already) */
-	ckpt_obj_users_inc(ctx, ctx->root_nsproxy, 1);
-
-	hlist_for_each_entry(obj, node, &ctx->obj_hash->list, next) {
-		if (!obj->ops->ref_users)
-			continue;
-
-		if (obj->ops->obj_type == CKPT_OBJ_SOCK)
-			obj_sock_adjust_users(obj);
-
-		if (obj->ops->ref_users(obj->ptr) != obj->users) {
-			ckpt_err(ctx, -EBUSY,
-				 "%(O)%(P)%(S)Usage leak (%d != %d)\n",
-				 obj->objref, obj->ptr, obj->ops->obj_name,
-				 obj->ops->ref_users(obj->ptr), obj->users);
-			return 0;
-		}
-	}
-
-	return 1;
-}
-
-/**
- * ckpt_obj_visited - test that all shared objects were visited
- * @ctx: checkpoint context
- *
- * Return 1 if all objects were visited, 0 otherwise.
- */
-int ckpt_obj_visited(struct ckpt_ctx *ctx)
-{
-	struct ckpt_obj *obj;
-	struct hlist_node *node;
-
-	hlist_for_each_entry(obj, node, &ctx->obj_hash->list, next) {
-		if (!(obj->flags & CKPT_OBJ_VISITED)) {
-			ckpt_err(ctx, -EBUSY,
-				 "%(O)%(P)%(S)Leak: not visited\n",
-				 obj->objref, obj->ptr, obj->ops->obj_name);
-			return 0;
-		}
-	}
-
-	return 1;
-}
-
-/**************************************************************************
- * Restart
- */
-
-/**
- * restore_obj - read in and restore a (first seen) shared object
- * @ctx: checkpoint context
- * @h: ckpt_hdr of shared object
- *
- * Read in the header payload (struct ckpt_hdr_objref). Lookup the
- * object to verify it isn't there.  Then restore the object's state
- * and add it to the objhash. No need to explicitly grab a reference -
- * we hold the initial instance of this object. (The object is
- * maintained until the entire hash is freed.)
- *
- * [This is used during restart].
- */
-int restore_obj(struct ckpt_ctx *ctx, struct ckpt_hdr_objref *h)
-{
-	const struct ckpt_obj_ops *ops;
-	struct ckpt_obj *obj;
-	void *ptr = NULL;
-
-	ckpt_debug("len %d ref %d type %d\n", h->h.len, h->objref, h->objtype);
-	if (h->objtype >= CKPT_OBJ_MAX)
-		return -EINVAL;
-	if (h->objref <= 0)
-		return -EINVAL;
-
-	ops = ckpt_obj_ops[h->objtype];
-	BUG_ON(ops->obj_type != h->objtype);
-
-	if (ops->restore)
-		ptr = ops->restore(ctx);
-	if (IS_ERR(ptr))
-		return PTR_ERR(ptr);
-
-	if (obj_find_by_objref(ctx, h->objref))
-		obj = ERR_PTR(-EINVAL);
-	else
-		obj = obj_new(ctx, ptr, h->objref, h->objtype);
-	/*
-	 * Drop an extra reference to the object returned by ops->restore:
-	 * On success, this clears the extra reference taken by obj_new(),
-	 * and on failure, this cleans up the object itself.
-	 */
-	if (ops->ref_drop)
-		ops->ref_drop(ptr, 0);
-	if (IS_ERR(obj)) {
-		if (ops->ref_drop)
-			ops->ref_drop(ptr, 1);
-		return PTR_ERR(obj);
-	}
-	return obj->objref;
-}
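-
-/*
- * restore_obj() is the read-side counterpart of checkpoint_obj(); it is
- * presumably invoked by the restart input loop (restart.c) whenever a
- * CKPT_HDR_OBJREF record is found in the image, after which later
- * records refer to the object by objref via ckpt_obj_fetch() below.
- */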
-
-/**
- * ckpt_obj_insert - add an object with a given objref to obj_hash
- * @ctx: checkpoint context
- * @ptr: pointer to object
- * @objref: unique object id
- * @type: object type
- *
- * Add the object pointer to by @ptr and identified by unique object id
- * @objref to the hash table (indexed by @objref).  Grab a reference to
- * every object added, and maintain it until the entire hash is freed.
- *
- * [This is used during restart].
- */
-int ckpt_obj_insert(struct ckpt_ctx *ctx, void *ptr,
-		    int objref, enum obj_type type)
-{
-	struct ckpt_obj *obj;
-
-	if (objref <= 0)
-		return -EINVAL;
-	if (obj_find_by_objref(ctx, objref))
-		return -EINVAL;
-	obj = obj_new(ctx, ptr, objref, type);
-	if (IS_ERR(obj))
-		return PTR_ERR(obj);
-	ckpt_debug("%s objref %d\n", obj->ops->obj_name, objref);
-	return obj->objref;
-}
-
-/**
- * ckpt_obj_try_fetch - fetch an object by its identifier
- * @ctx: checkpoint context
- * @objref: object id
- * @type: object type
- *
- * Look up the object identified by @objref in the hash table. Return
- * an error if not found.
- *
- * [This is used during restart].
- */
-void *ckpt_obj_try_fetch(struct ckpt_ctx *ctx, int objref, enum obj_type type)
-{
-	struct ckpt_obj *obj;
-
-	obj = obj_find_by_objref(ctx, objref);
-	if (!obj)
-		return ERR_PTR(-EINVAL);
-	ckpt_debug("%s ref %d\n", obj->ops->obj_name, obj->objref);
-	if (obj->ops->obj_type == type)
-		return obj->ptr;
-	return ERR_PTR(-ENOMSG);
-}
-
-void *ckpt_obj_fetch(struct ckpt_ctx *ctx, int objref, enum obj_type type)
-{
-	void *ret = ckpt_obj_try_fetch(ctx, objref, type);
-
-	if (unlikely(IS_ERR(ret)))
-		ckpt_err(ctx, PTR_ERR(ret), "%(O)Fetching object (type %d)\n",
-			 objref, type);
-	return ret;
-}
-
-/*
- * checkpoint a security context string.  This is done by
- * security/security.c:security_checkpoint_obj() when it checkpoints
- * a void*security whose context string has not yet been written out.
- * The objref for the void*security (which is not itself written out
- * to the checkpoint image) is stored alongside the context string,
- * as is the type of object which contained the void* security, i.e.
- * struct file, struct cred, etc.
- */
-static int checkpoint_lsm_string(struct ckpt_ctx *ctx, void *ptr)
-{
-	struct ckpt_hdr_lsm *h;
-	struct ckpt_lsm_string *l = ptr;
-	int ret;
-
-	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_SECURITY);
-	if (!h)
-		return -ENOMEM;
-	h->sectype = l->sectype;
-	h->ptrref = l->ptrref;
-	ret = ckpt_write_obj(ctx, &h->h);
-	ckpt_hdr_put(ctx, h);
-
-	if (ret < 0)
-		return ret;
-	return ckpt_write_string(ctx, l->string, strlen(l->string)+1);
-}
-
-/*
- * callback invoked when a security context string is found in a
- * checkpoint image at restart.  The context string is saved in the object
- * hash.  The objref under which the void* security was inserted in the
- * objhash at checkpoint is also found here, and we re-insert this context
- * string a second time under that objref.  This is because objects which
- * had this context will have the objref of the void*security, not of the
- * context string.
- */
-static struct ckpt_lsm_string *restore_lsm_string(struct ckpt_ctx *ctx)
-{
-	struct ckpt_hdr_lsm *h;
-	struct ckpt_lsm_string *l;
-
-	h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_SECURITY);
-	if (IS_ERR(h)) {
-		ckpt_debug("ckpt_read_obj_type returned %ld\n", PTR_ERR(h));
-		return ERR_PTR(PTR_ERR(h));
-	}
-
-	l = kzalloc(sizeof(*l), GFP_KERNEL);
-	if (!l) {
-		l = ERR_PTR(-ENOMEM);
-		goto out;
-	}
-	l->string = ckpt_read_string(ctx, CKPT_LSM_STRING_MAX);
-	if (IS_ERR(l->string)) {
-		void *s = l->string;
-		ckpt_debug("ckpt_read_string returned %ld\n", PTR_ERR(s));
-		kfree(l);
-		l = s;
-		goto out;
-	}
-	kref_init(&l->kref);
-	l->sectype = h->sectype;
-	/* l is just a placeholder, don't grab a ref */
-	ckpt_obj_insert(ctx, l, h->ptrref, CKPT_OBJ_SECURITY);
-
-out:
-	ckpt_hdr_put(ctx, h);
-	return l;
-}
diff --git a/checkpoint/process.c b/checkpoint/process.c
deleted file mode 100644
index 6e3e382..0000000
--- a/checkpoint/process.c
+++ /dev/null
@@ -1,929 +0,0 @@
-/*
- *  Checkpoint task structure
- *
- *  Copyright (C) 2008-2009 Oren Laadan
- *
- *  This file is subject to the terms and conditions of the GNU General Public
- *  License.  See the file COPYING in the main directory of the Linux
- *  distribution for more details.
- */
-
-/* default debug level for output */
-#define CKPT_DFLAG  CKPT_DSYS
-
-#include <linux/sched.h>
-#include <linux/nsproxy.h>
-#include <linux/posix-timers.h>
-#include <linux/futex.h>
-#include <linux/compat.h>
-#include <linux/poll.h>
-#include <linux/utsname.h>
-#include <linux/user_namespace.h>
-#include <linux/checkpoint.h>
-#include <linux/checkpoint_hdr.h>
-#include <linux/mm_checkpoint.h>
-#include <linux/syscalls.h>
-
-
-pid_t ckpt_pid_nr(struct ckpt_ctx *ctx, struct pid *pid)
-{
-	return pid ? pid_nr_ns(pid, ctx->root_nsproxy->pid_ns) : CKPT_PID_NULL;
-}
-
-/* must be called with tasklist_lock or rcu_read_lock() held */
-struct pid *_ckpt_find_pgrp(struct ckpt_ctx *ctx, pid_t pgid)
-{
-	struct task_struct *p;
-	struct pid *pgrp;
-
-	if (pgid == 0) {
-		/*
-		 * At checkpoint the pgid owner lived in an ancestor
-		 * pid-ns. The best we can do (sanely and safely) is
-		 * to examine the parent of this restart's root: if in
-		 * a distinct pid-ns, use its pgrp; otherwise fail.
-		 */
-		p = ctx->root_task->real_parent;
-		if (p->nsproxy->pid_ns == current->nsproxy->pid_ns)
-			return NULL;
-		pgrp = task_pgrp(p);
-	} else {
-		/*
-		 * Find the owner process of this pgid (it must exist
-		 * if pgrp exists). It must be a thread group leader.
-		 */
-		pgrp = find_vpid(pgid);
-		p = pid_task(pgrp, PIDTYPE_PID);
-		if (!p || !thread_group_leader(p))
-			return NULL;
-		/*
-		 * The pgrp must "belong" to our restart tree (compare
-		 * p->checkpoint_ctx to ours). This prevents malicious
-		 * input from (guessing and) using unrelated pgrps. If
-		 * the owner is dead, then it doesn't have a context,
-		 * so instead compare against its (real) parent's.
-		 */
-		if (p->exit_state == EXIT_ZOMBIE)
-			p = p->real_parent;
-		if (p->checkpoint_ctx != ctx)
-			return NULL;
-	}
-
-	if (task_session(current) != task_session(p))
-		return NULL;
-
-	return pgrp;
-}
-
-
-#ifdef CONFIG_FUTEX
-static void save_task_robust_futex_list(struct ckpt_hdr_task *h,
-					struct task_struct *t)
-{
-	/*
-	 * These are __user pointers and thus can be saved without
-	 * the objhash.
-	 */
-	h->robust_futex_list = (unsigned long)t->robust_list;
-	h->robust_futex_head_len = sizeof(*t->robust_list);
-#ifdef CONFIG_COMPAT
-	h->compat_robust_futex_list = ptr_to_compat(t->compat_robust_list);
-	h->compat_robust_futex_head_len = sizeof(*t->compat_robust_list);
-#endif
-}
-
-static void restore_task_robust_futex_list(struct ckpt_hdr_task *h)
-{
-	/* Since we restore the memory map, the address remains the same and
-	 * this is safe. This is the same as [compat_]sys_set_robust_list() */
-	if (h->robust_futex_list) {
-		struct robust_list_head __user *rfl;
-		rfl = (void __user *)(unsigned long) h->robust_futex_list;
-		do_set_robust_list(rfl, h->robust_futex_head_len);
-	}
-#ifdef CONFIG_COMPAT
-	if (h->compat_robust_futex_list) {
-		struct compat_robust_list_head __user *crfl;
-		crfl = compat_ptr(h->compat_robust_futex_list);
-		do_compat_set_robust_list(crfl, h->compat_robust_futex_head_len);
-	}
-#endif
-}
-#else /* !CONFIG_FUTEX */
-static inline void save_task_robust_futex_list(struct ckpt_hdr_task *h,
-					       struct task_struct *t)
-{
-}
-
-static inline void restore_task_robust_futex_list(struct ckpt_hdr_task *h)
-{
-}
-#endif /* CONFIG_FUTEX */
-
-
-/***********************************************************************
- * Checkpoint
- */
-
-/* dump the task_struct of a given task */
-static int checkpoint_task_struct(struct ckpt_ctx *ctx, struct task_struct *t)
-{
-	struct ckpt_hdr_task *h;
-	int ret;
-
-	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_TASK);
-	if (!h)
-		return -ENOMEM;
-
-	h->state = t->state;
-	h->exit_state = t->exit_state;
-	h->exit_code = t->exit_code;
-
-	if (t->exit_state) {
-		/* zombie - skip remaining state */
-		BUG_ON(t->exit_state != EXIT_ZOMBIE);
-	} else {
-		/* FIXME: save remaining relevant task_struct fields */
-		h->exit_signal = t->exit_signal;
-		h->pdeath_signal = t->pdeath_signal;
-
-		h->set_child_tid = (unsigned long) t->set_child_tid;
-		h->clear_child_tid = (unsigned long) t->clear_child_tid;
-		save_task_robust_futex_list(h, t);
-	}
-
-	ret = ckpt_write_obj(ctx, &h->h);
-	ckpt_hdr_put(ctx, h);
-	if (ret < 0)
-		return ret;
-
-	return ckpt_write_string(ctx, t->comm, TASK_COMM_LEN);
-}
-
-static int checkpoint_task_ns(struct ckpt_ctx *ctx, struct task_struct *t)
-{
-	struct ckpt_hdr_task_ns *h;
-	struct nsproxy *nsproxy;
-	int ns_objref;
-	int ret;
-
-	rcu_read_lock();
-	nsproxy = task_nsproxy(t);
-	get_nsproxy(nsproxy);
-	rcu_read_unlock();
-
-	ns_objref = checkpoint_obj(ctx, nsproxy, CKPT_OBJ_NS);
-	put_nsproxy(nsproxy);
-
-	ckpt_debug("nsproxy: objref %d\n", ns_objref);
-	if (ns_objref < 0)
-		return ns_objref;
-
-	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_TASK_NS);
-	if (!h)
-		return -ENOMEM;
-	h->ns_objref = ns_objref;
-	ret = ckpt_write_obj(ctx, &h->h);
-	ckpt_hdr_put(ctx, h);
-
-	return ret;
-}
-
-static int checkpoint_task_creds(struct ckpt_ctx *ctx, struct task_struct *t)
-{
-	int realcred_ref, ecred_ref;
-	struct cred *rcred, *ecred;
-	struct ckpt_hdr_task_creds *h;
-	int ret;
-
-	rcred = (struct cred *) get_cred(t->real_cred);
-	ecred = (struct cred *) get_cred(t->cred);
-
-	realcred_ref = checkpoint_obj(ctx, rcred, CKPT_OBJ_CRED);
-	if (realcred_ref < 0) {
-		ret = realcred_ref;
-		goto error;
-	}
-
-	ecred_ref = checkpoint_obj(ctx, ecred, CKPT_OBJ_CRED);
-	if (ecred_ref < 0) {
-		ret = ecred_ref;
-		goto error;
-	}
-
-	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_TASK_CREDS);
-	if (!h) {
-		ret = -ENOMEM;
-		goto error;
-	}
-
-	h->cred_ref = realcred_ref;
-	h->ecred_ref = ecred_ref;
-	ret = ckpt_write_obj(ctx, (struct ckpt_hdr *) h);
-	ckpt_hdr_put(ctx, h);
-
-error:
-	put_cred(rcred);
-	put_cred(ecred);
-	return ret;
-}
-
-static int checkpoint_task_objs(struct ckpt_ctx *ctx, struct task_struct *t)
-{
-	struct ckpt_hdr_task_objs *h;
-	int files_objref;
-	int mm_objref;
-	int fs_objref;
-	int sighand_objref;
-	int signal_objref;
-	int first, ret;
-
-	/*
-	 * Shared objects may have dependencies among them: task->mm
-	 * depends on task->nsproxy (by ipc_ns). Therefore first save
-	 * the namespaces, and then the remaining shared objects.
-	 * During restart a task will thus already have its namespaces
-	 * restored by the time it gets to restore, e.g., its memory.
-	 */
-
-	ret = checkpoint_task_creds(ctx, t);
-	ckpt_debug("cred: objref %d\n", ret);
-	if (ret < 0) {
-		ckpt_err(ctx, ret, "%(T)process credentials\n");
-		return ret;
-	}
-
-	ret = checkpoint_task_ns(ctx, t);
-	ckpt_debug("ns: objref %d\n", ret);
-	if (ret < 0) {
-		ckpt_err(ctx, ret, "%(T)process namespaces\n");
-		return ret;
-	}
-
-	files_objref = checkpoint_obj_file_table(ctx, t);
-	ckpt_debug("files: objref %d\n", files_objref);
-	if (files_objref < 0) {
-		ckpt_err(ctx, files_objref, "%(T)files_struct\n");
-		return files_objref;
-	}
-
-	mm_objref = checkpoint_obj_mm(ctx, t);
-	ckpt_debug("mm: objref %d\n", mm_objref);
-	if (mm_objref < 0) {
-		ckpt_err(ctx, mm_objref, "%(T)mm_struct\n");
-		return mm_objref;
-	}
-
-	/* note: this must come *after* file-table and mm */
-	fs_objref = checkpoint_obj_fs(ctx, t);
-	if (fs_objref < 0) {
-		ckpt_err(ctx, fs_objref, "%(T)process fs\n");
-		return fs_objref;
-	}
-
-	sighand_objref = checkpoint_obj_sighand(ctx, t);
-	ckpt_debug("sighand: objref %d\n", sighand_objref);
-	if (sighand_objref < 0) {
-		ckpt_err(ctx, sighand_objref, "%(T)sighand_struct\n");
-		return sighand_objref;
-	}
-
-	/*
-	 * Handle t->signal differently because the checkpoint method
-	 * for t->signal needs access to owning task_struct to access
-	 * t->sighand (to lock/unlock). First explicitly determine if
-	 * need to save, and only below invoke checkpoint_obj_signal()
-	 * if needed.
-	 */
-	signal_objref = ckpt_obj_lookup_add(ctx, t->signal,
-					    CKPT_OBJ_SIGNAL, &first);
-	ckpt_debug("signal: objref %d\n", signal_objref);
-	if (signal_objref < 0) {
-		ckpt_err(ctx, signal_objref, "%(T)process signals\n");
-		return signal_objref;
-	}
-
-	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_TASK_OBJS);
-	if (!h)
-		return -ENOMEM;
-	h->files_objref = files_objref;
-	h->mm_objref = mm_objref;
-	h->fs_objref = fs_objref;
-	h->sighand_objref = sighand_objref;
-	h->signal_objref = signal_objref;
-	ret = ckpt_write_obj(ctx, &h->h);
-	ckpt_hdr_put(ctx, h);
-	if (ret < 0)
-		return ret;
-
-	/* actually save t->signal, if needed */
-	if (first)
-		ret = checkpoint_obj_signal(ctx, t);
-	if (ret < 0)
-		ckpt_err(ctx, ret, "%(T)signal_struct\n");
-
-	return ret;
-}
-
-/* dump the restart block of a given task */
-int checkpoint_restart_block(struct ckpt_ctx *ctx, struct task_struct *t)
-{
-	struct ckpt_hdr_restart_block *h;
-	struct restart_block *restart_block;
-	long (*fn)(struct restart_block *);
-	s64 base, expire = 0;
-	int ret;
-
-	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_RESTART_BLOCK);
-	if (!h)
-		return -ENOMEM;
-
-	base = ktime_to_ns(ctx->ktime_begin);
-	restart_block = &task_thread_info(t)->restart_block;
-	fn = restart_block->fn;
-
-	/* FIX: enumerate clockid_t so we're immune to changes */
-
-	if (fn == do_no_restart_syscall) {
-
-		h->function_type = CKPT_RESTART_BLOCK_NONE;
-		ckpt_debug("restart_block: non\n");
-
-	} else if (fn == hrtimer_nanosleep_restart) {
-
-		h->function_type = CKPT_RESTART_BLOCK_HRTIMER_NANOSLEEP;
-		h->arg_0 = restart_block->nanosleep.index;
-		h->arg_1 = (unsigned long) restart_block->nanosleep.rmtp;
-		expire = restart_block->nanosleep.expires;
-		ckpt_debug("restart_block: hrtimer expire %lld now %lld\n",
-			 expire, base);
-
-	} else if (fn == posix_cpu_nsleep_restart) {
-		struct timespec ts;
-
-		h->function_type = CKPT_RESTART_BLOCK_POSIX_CPU_NANOSLEEP;
-		h->arg_0 = restart_block->arg0;
-		h->arg_1 = restart_block->arg1;
-		ts.tv_sec = restart_block->arg2;
-		ts.tv_nsec = restart_block->arg3;
-		expire = timespec_to_ns(&ts);
-		ckpt_debug("restart_block: posix_cpu expire %lld now %lld\n",
-			 expire, base);
-
-#ifdef CONFIG_COMPAT
-	} else if (fn == compat_nanosleep_restart) {
-
-		h->function_type = CKPT_RESTART_BLOCK_COMPAT_NANOSLEEP;
-		h->arg_0 = restart_block->nanosleep.index;
-		h->arg_1 = (unsigned long)restart_block->nanosleep.rmtp;
-		h->arg_2 = (unsigned long)restart_block->nanosleep.compat_rmtp;
-		expire = restart_block->nanosleep.expires;
-		ckpt_debug("restart_block: compat expire %lld now %lld\n",
-			 expire, base);
-
-	} else if (fn == compat_clock_nanosleep_restart) {
-
-		h->function_type = CKPT_RESTART_BLOCK_COMPAT_CLOCK_NANOSLEEP;
-		h->arg_0 = restart_block->nanosleep.index;
-		h->arg_1 = (unsigned long)restart_block->nanosleep.rmtp;
-		h->arg_2 = (unsigned long)restart_block->nanosleep.compat_rmtp;
-		expire = restart_block->nanosleep.expires;
-		ckpt_debug("restart_block: compat_clock expire %lld now %lld\n",
-			 expire, base);
-
-#endif
-	} else if (fn == futex_wait_restart) {
-
-		h->function_type = CKPT_RESTART_BLOCK_FUTEX;
-		h->arg_0 = (unsigned long) restart_block->futex.uaddr;
-		h->arg_1 = restart_block->futex.val;
-		h->arg_2 = restart_block->futex.flags;
-		h->arg_3 = restart_block->futex.bitset;
-		expire = restart_block->futex.time;
-		ckpt_debug("restart_block: futex expire %lld now %lld\n",
-			 expire, base);
-
-	} else if (fn == do_restart_poll) {
-		struct timespec ts;
-
-		h->function_type = CKPT_RESTART_BLOCK_POLL;
-		h->arg_0 = (unsigned long) restart_block->poll.ufds;
-		h->arg_1 = restart_block->poll.nfds;
-		h->arg_2 = restart_block->poll.has_timeout;
-		ts.tv_sec = restart_block->poll.tv_sec;
-		ts.tv_nsec = restart_block->poll.tv_nsec;
-		expire = timespec_to_ns(&ts);
-		ckpt_debug("restart_block: poll expire %lld now %lld\n",
-			 expire, base);
-
-	} else {
-
-		BUG();
-
-	}
-
-	/* common to all restart blocks: */
-	h->arg_4 = (base < expire ? expire - base : 0);
-
-	ckpt_debug("restart_block: args %#llx %#llx %#llx %#llx %#llx\n",
-		 h->arg_0, h->arg_1, h->arg_2, h->arg_3, h->arg_4);
-
-	ret = ckpt_write_obj(ctx, &h->h);
-	ckpt_hdr_put(ctx, h);
-
-	ckpt_debug("restart_block ret %d\n", ret);
-	return ret;
-}
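
Note the invariant established in the common tail above: only the remaining
time (h->arg_4 = expire - base, clamped at zero) is written to the image, so a
restored timer resumes with its leftover interval rather than an absolute
deadline that may have long passed. A minimal userspace sketch of the same
arithmetic, with illustrative names not taken from this patch:

#include <stdint.h>
#include <stdio.h>

/* mirrors h->arg_4 = (base < expire ? expire - base : 0) */
static int64_t remaining_ns(int64_t base, int64_t expire)
{
	return base < expire ? expire - base : 0;
}

int main(void)
{
	int64_t base = 1000000000LL;		/* checkpoint ktime, ns */
	int64_t expire = 1500000000LL;		/* timer due 0.5s later */
	int64_t new_base = 9000000000LL;	/* restart-time ktime_begin */

	/* cf. restore_restart_block(): expire = ktime_begin + arg_4 */
	printf("new expiry: %lld\n",
	       (long long)(new_base + remaining_ns(base, expire)));
	return 0;
}
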
-
-/* dump the entire state of a given task */
-int checkpoint_task(struct ckpt_ctx *ctx, struct task_struct *t)
-{
-	int ret;
-
-	ctx->tsk = t;
-
-	ret = checkpoint_task_struct(ctx, t);
-	ckpt_debug("task %d\n", ret);
-	if (ret < 0)
-		goto out;
-
-	/* zombie - we're done here */
-	if (t->exit_state)
-		return 0;
-
-	ret = checkpoint_thread(ctx, t);
-	ckpt_debug("thread %d\n", ret);
-	if (ret < 0)
-		goto out;
-	ret = checkpoint_restart_block(ctx, t);
-	ckpt_debug("restart-blocks %d\n", ret);
-	if (ret < 0)
-		goto out;
-	ret = checkpoint_cpu(ctx, t);
-	ckpt_debug("cpu %d\n", ret);
-	if (ret < 0)
-		goto out;
-	ret = checkpoint_task_objs(ctx, t);
-	ckpt_debug("objs %d\n", ret);
-	if (ret < 0)
-		goto out;
-	ret = checkpoint_task_signal(ctx, t);
-	ckpt_debug("task-signal %d\n", ret);
- out:
-	ctx->tsk = NULL;
-	return ret;
-}
-
-int ckpt_collect_task(struct ckpt_ctx *ctx, struct task_struct *t)
-{
-	int ret;
-
-	ret = ckpt_collect_ns(ctx, t);
-	if (ret < 0)
-		return ret;
-	ret = ckpt_collect_file_table(ctx, t);
-	if (ret < 0)
-		return ret;
-	ret = ckpt_collect_mm(ctx, t);
-	if (ret < 0)
-		return ret;
-	ret = ckpt_collect_fs(ctx, t);
-	if (ret < 0)
-		return ret;
-	ret = ckpt_collect_sighand(ctx, t);
-
-	return ret;
-}
-
-/***********************************************************************
- * Restart
- */
-
-static inline int valid_exit_code(int exit_code)
-{
-	if (exit_code >= 0x10000)
-		return 0;
-	if (exit_code & 0xff) {
-		if (exit_code & ~0xff)
-			return 0;
-		if (!valid_signal(exit_code & 0xff))
-			return 0;
-	}
-	return 1;
-}
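
valid_exit_code() encodes the usual wait-status convention: a normal exit
keeps its status in bits 8..15 with the low byte clear, while death by signal
keeps just the signal number in the low byte. A runnable restatement
(my_valid_signal() is an approximation here; the kernel checks against _NSIG):

#include <stdio.h>

#define MY_NSIG 64	/* assumption: stands in for the kernel's _NSIG */

static int my_valid_signal(int sig)
{
	return sig > 0 && sig <= MY_NSIG;
}

static int my_valid_exit_code(int exit_code)
{
	if (exit_code >= 0x10000)
		return 0;
	if (exit_code & 0xff) {		/* death by signal ... */
		if (exit_code & ~0xff)	/* ... allows no status bits */
			return 0;
		if (!my_valid_signal(exit_code & 0xff))
			return 0;
	}
	return 1;
}

int main(void)
{
	printf("%d\n", my_valid_exit_code(3 << 8));	/* exit(3): 1 */
	printf("%d\n", my_valid_exit_code(9));		/* SIGKILL: 1 */
	printf("%d\n", my_valid_exit_code(0x109));	/* mixed: 0 */
	return 0;
}
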
-
-/* read the task_struct into the current task */
-static int restore_task_struct(struct ckpt_ctx *ctx)
-{
-	struct ckpt_hdr_task *h;
-	struct task_struct *t = current;
-	int ret;
-
-	h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_TASK);
-	if (IS_ERR(h))
-		return PTR_ERR(h);
-
-	ret = -EINVAL;
-	if (h->state == TASK_DEAD) {
-		if (h->exit_state != EXIT_ZOMBIE)
-			goto out;
-		if (!valid_exit_code(h->exit_code))
-			goto out;
-		t->exit_code = h->exit_code;
-	} else {
-		if (h->exit_code)
-			goto out;
-		if ((thread_group_leader(t) && !valid_signal(h->exit_signal)) ||
-		    (!thread_group_leader(t) && h->exit_signal != -1))
-			goto out;
-		if (!valid_signal(h->pdeath_signal))
-			goto out;
-
-		/* FIXME: restore remaining relevant task_struct fields */
-		t->exit_signal = h->exit_signal;
-		t->pdeath_signal = h->pdeath_signal;
-
-		t->set_child_tid =
-			(int __user *) (unsigned long) h->set_child_tid;
-		t->clear_child_tid =
-			(int __user *) (unsigned long) h->clear_child_tid;
-		restore_task_robust_futex_list(h);
-	}
-
-	memset(t->comm, 0, TASK_COMM_LEN);
-	ret = _ckpt_read_string(ctx, t->comm, TASK_COMM_LEN);
-	if (ret < 0)
-		goto out;
-
-	/* return 1 for zombie, 0 otherwise */
-	ret = (h->state == TASK_DEAD ? 1 : 0);
- out:
-	ckpt_hdr_put(ctx, h);
-	return ret;
-}
-
-static int restore_task_ns(struct ckpt_ctx *ctx)
-{
-	struct ckpt_hdr_task_ns *h;
-	struct nsproxy *nsproxy;
-	int ret = 0;
-
-	h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_TASK_NS);
-	if (IS_ERR(h))
-		return PTR_ERR(h);
-
-	nsproxy = ckpt_obj_fetch(ctx, h->ns_objref, CKPT_OBJ_NS);
-	if (IS_ERR(nsproxy)) {
-		ret = PTR_ERR(nsproxy);
-		goto out;
-	}
-
-	if (nsproxy != task_nsproxy(current)) {
-		get_nsproxy(nsproxy);
-		switch_task_namespaces(current, nsproxy);
-	}
- out:
-	ckpt_debug("nsproxy: ret %d (%p)\n", ret, task_nsproxy(current));
-	ckpt_hdr_put(ctx, h);
-	return ret;
-}
-
-static int restore_task_creds(struct ckpt_ctx *ctx)
-{
-	struct ckpt_hdr_task_creds *h;
-	struct cred *realcred, *ecred;
-	int ret = 0;
-
-	h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_TASK_CREDS);
-	if (IS_ERR(h))
-		return PTR_ERR(h);
-
-	realcred = ckpt_obj_fetch(ctx, h->cred_ref, CKPT_OBJ_CRED);
-	if (IS_ERR(realcred)) {
-		ckpt_debug("Error %ld fetching realcred (ref %d)\n",
-			PTR_ERR(realcred), h->cred_ref);
-		ret = PTR_ERR(realcred);
-		goto out;
-	}
-	ecred = ckpt_obj_fetch(ctx, h->ecred_ref, CKPT_OBJ_CRED);
-	if (IS_ERR(ecred)) {
-		ckpt_debug("Error %ld fetching ecred (ref %d)\n",
-			PTR_ERR(ecred), h->ecred_ref);
-		ret = PTR_ERR(ecred);
-		goto out;
-	}
-	ctx->realcred = realcred;
-	ctx->ecred = ecred;
-
-out:
-	ckpt_debug("Returning %d\n", ret);
-	ckpt_hdr_put(ctx, h);
-	return ret;
-}
-
-static int restore_task_objs(struct ckpt_ctx *ctx)
-{
-	struct ckpt_hdr_task_objs *h;
-	int ret;
-
-	/*
-	 * Namespaces come first, because ->mm depends on ->nsproxy,
-	 * and because shared objects are restored before they are
-	 * referenced. See comment in checkpoint_task_objs.
-	 */
-	ret = restore_task_creds(ctx);
-	if (ret < 0) {
-		ckpt_debug("restore_task_creds returned %d\n", ret);
-		return ret;
-	}
-	ret = restore_task_ns(ctx);
-	if (ret < 0) {
-		ckpt_debug("restore_task_ns returned %d\n", ret);
-		return ret;
-	}
-
-	h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_TASK_OBJS);
-	if (IS_ERR(h)) {
-		ckpt_debug("Error fetching task obj\n");
-		return PTR_ERR(h);
-	}
-
-	ret = restore_obj_file_table(ctx, h->files_objref);
-	ckpt_debug("file_table: ret %d (%p)\n", ret, current->files);
-	if (ret < 0)
-		goto out;
-
-	ret = restore_obj_mm(ctx, h->mm_objref);
-	ckpt_debug("mm: ret %d (%p)\n", ret, current->mm);
-	if (ret < 0)
-		goto out;
-
-	ret = restore_obj_fs(ctx, h->fs_objref);
-	ckpt_debug("fs: ret %d (%p)\n", ret, current->fs);
-	if (ret < 0)
-		return ret;
-
-	ret = restore_obj_sighand(ctx, h->sighand_objref);
-	ckpt_debug("sighand: ret %d (%p)\n", ret, current->sighand);
-	if (ret < 0)
-		goto out;
-
-	ret = restore_obj_signal(ctx, h->signal_objref);
-	ckpt_debug("signal: ret %d (%p)\n", ret, current->signal);
- out:
-	ckpt_hdr_put(ctx, h);
-	return ret;
-}
-
-static int restore_creds(struct ckpt_ctx *ctx)
-{
-	int ret;
-	const struct cred *old;
-	struct cred *rcred, *ecred;
-
-	rcred = ctx->realcred;
-	ecred = ctx->ecred;
-
-	/* commit_creds will take one ref for the eff creds, but
-	 * expects us to hold a ref for the obj creds, so take a
-	 * ref here */
-	get_cred(rcred);
-	ret = commit_creds(rcred);
-	if (ret)
-		return ret;
-
-	if (ecred == rcred)
-		return 0;
-
-	old = override_creds(ecred); /* override_creds otoh takes new ref */
-	put_cred(old);
-
-	ctx->realcred = ctx->ecred = NULL;
-	return 0;
-}
-
-int restore_restart_block(struct ckpt_ctx *ctx)
-{
-	struct ckpt_hdr_restart_block *h;
-	struct restart_block restart_block;
-	struct timespec ts;
-	clockid_t clockid;
-	s64 expire;
-	int ret = 0;
-
-	h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_RESTART_BLOCK);
-	if (IS_ERR(h))
-		return PTR_ERR(h);
-
-	expire = ktime_to_ns(ctx->ktime_begin) + h->arg_4;
-	restart_block.fn = NULL;
-
-	ckpt_debug("restart_block: expire %lld begin %lld\n",
-		 expire, ktime_to_ns(ctx->ktime_begin));
-	ckpt_debug("restart_block: args %#llx %#llx %#llx %#llx %#llx\n",
-		 h->arg_0, h->arg_1, h->arg_2, h->arg_3, h->arg_4);
-
-	switch (h->function_type) {
-	case CKPT_RESTART_BLOCK_NONE:
-		restart_block.fn = do_no_restart_syscall;
-		break;
-	case CKPT_RESTART_BLOCK_HRTIMER_NANOSLEEP:
-		clockid = h->arg_0;
-		if (clockid < 0 || invalid_clockid(clockid))
-			break;
-		restart_block.fn = hrtimer_nanosleep_restart;
-		restart_block.nanosleep.index = clockid;
-		restart_block.nanosleep.rmtp =
-			(struct timespec __user *) (unsigned long) h->arg_1;
-		restart_block.nanosleep.expires = expire;
-		break;
-	case CKPT_RESTART_BLOCK_POSIX_CPU_NANOSLEEP:
-		clockid = h->arg_0;
-		if (clockid < 0 || invalid_clockid(clockid))
-			break;
-		restart_block.fn = posix_cpu_nsleep_restart;
-		restart_block.arg0 = clockid;
-		restart_block.arg1 = h->arg_1;
-		ts = ns_to_timespec(expire);
-		restart_block.arg2 = ts.tv_sec;
-		restart_block.arg3 = ts.tv_nsec;
-		break;
-#ifdef CONFIG_COMPAT
-	case CKPT_RESTART_BLOCK_COMPAT_NANOSLEEP:
-		clockid = h->arg_0;
-		if (clockid < 0 || invalid_clockid(clockid))
-			break;
-		restart_block.fn = compat_nanosleep_restart;
-		restart_block.nanosleep.index = clockid;
-		restart_block.nanosleep.rmtp =
-			(struct timespec __user *) (unsigned long) h->arg_1;
-		restart_block.nanosleep.compat_rmtp =
-			(struct compat_timespec __user *)
-				(unsigned long) h->arg_2;
-		restart_block.nanosleep.expires = expire;
-		break;
-	case CKPT_RESTART_BLOCK_COMPAT_CLOCK_NANOSLEEP:
-		clockid = h->arg_0;
-		if (clockid < 0 || invalid_clockid(clockid))
-			break;
-		restart_block.fn = compat_clock_nanosleep_restart;
-		restart_block.nanosleep.index = clockid;
-		restart_block.nanosleep.rmtp =
-			(struct timespec __user *) (unsigned long) h->arg_1;
-		restart_block.nanosleep.compat_rmtp =
-			(struct compat_timespec __user *)
-				(unsigned long) h->arg_2;
-		restart_block.nanosleep.expires = expire;
-		break;
-#endif
-	case CKPT_RESTART_BLOCK_FUTEX:
-		restart_block.fn = futex_wait_restart;
-		restart_block.futex.uaddr = (u32 *) (unsigned long) h->arg_0;
-		restart_block.futex.val = h->arg_1;
-		restart_block.futex.flags = h->arg_2;
-		restart_block.futex.bitset = h->arg_3;
-		restart_block.futex.time = expire;
-		break;
-	case CKPT_RESTART_BLOCK_POLL:
-		restart_block.fn = do_restart_poll;
-		restart_block.poll.ufds =
-			(struct pollfd __user *) (unsigned long) h->arg_0;
-		restart_block.poll.nfds = h->arg_1;
-		restart_block.poll.has_timeout = h->arg_2;
-		ts = ns_to_timespec(expire);
-		restart_block.poll.tv_sec = ts.tv_sec;
-		restart_block.poll.tv_nsec = ts.tv_nsec;
-		break;
-	default:
-		break;
-	}
-
-	if (restart_block.fn)
-		task_thread_info(current)->restart_block = restart_block;
-	else
-		ret = -EINVAL;
-
-	ckpt_hdr_put(ctx, h);
-	return ret;
-}
-
-static int restore_task_pgid(struct ckpt_ctx *ctx)
-{
-	struct task_struct *task = current;
-	struct pid *pgrp;
-	pid_t pgid;
-	int ret;
-
-	/*
-	 * We enforce the following restrictions on restoring pgrp:
-	 *  1) Only thread group leaders restore pgrp
-	 *  2) Session leader cannot change own pgrp
-	 *  3) Owner of pgrp must belong to same restart tree
-	 *  4) Must have same session as other tasks in same pgrp
-	 *  5) Change must pass setpgid security callback
-	 *
-	 * TODO - check if we need additional restrictions ?
-	 */
-
-	if (!thread_group_leader(task))  /* (1) */
-		return 0;
-
-	pgid = ctx->pids_arr[ctx->active_pid].vpgid;
-
-	if (pgid == task_pgrp_vnr(task))  /* nothing to do */
-		return 0;
-
-	if (task->signal->leader)  /* (2) */
-		return -EINVAL;
-
-	ret = -EINVAL;
-
-	write_lock_irq(&tasklist_lock);
-	pgrp = _ckpt_find_pgrp(ctx, pgid);  /* (3) and (4) */
-	if (pgrp && task_pgrp(task) != pgrp) {
-		ret = security_task_setpgid(task, pgid);  /* (5) */
-		if (!ret)
-			change_pid(task, PIDTYPE_PGID, pgrp);
-	}
-	write_unlock_irq(&tasklist_lock);
-
-	/* self-restart: be tolerant if old pgid isn't found */
-	if (ctx->uflags & RESTART_TASKSELF)
-		ret = 0;
-
-	return ret;
-}
-
-/* prepare the task for restore */
-int pre_restore_task(void)
-{
-	sigset_t sigset;
-
-	/*
-	 * Block task's signals to avoid interruptions due to signals,
-	 * say, from restored timers, file descriptors etc. Signals
-	 * will be unblocked when restore completes.
-	 *
-	 * NOTE: tasks with file descriptors set to send a SIGKILL as
-	 * i/o notification may fail the restart if a signal occurs
-	 * before that task completed its restore. FIX ?
-	 */
-	current->saved_sigmask = current->blocked;
-
-	sigfillset(&sigset);
-	sigdelset(&sigset, SIGKILL);
-	sigdelset(&sigset, SIGSTOP);
-	sigprocmask(SIG_SETMASK, &sigset, NULL);
-
-	return 0;
-}
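
The same block-everything-except-KILL/STOP pattern is reproducible from
userspace with sigprocmask(2), which can help when experimenting with the
signal races described in the comment above. SIGKILL and SIGSTOP cannot be
blocked anyway; deleting them just keeps the set honest. An illustrative
sketch, not part of the patch:

#include <signal.h>
#include <stdio.h>

int main(void)
{
	sigset_t all, saved;

	sigfillset(&all);
	sigdelset(&all, SIGKILL);	/* unblockable, but be explicit */
	sigdelset(&all, SIGSTOP);

	if (sigprocmask(SIG_SETMASK, &all, &saved) < 0) {
		perror("sigprocmask");
		return 1;
	}

	/* ... restore-like critical section runs undisturbed ... */

	/* cf. post_restore_task(): reinstate the original mask */
	sigprocmask(SIG_SETMASK, &saved, NULL);
	return 0;
}
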
-
-/* finish up task restore */
-void post_restore_task(void)
-{
-	/* only now is it safe to unblock the restored task's signals */
-	sigprocmask(SIG_SETMASK, &current->saved_sigmask, NULL);
-}
-
-/* read the entire state of the current task */
-int restore_task(struct ckpt_ctx *ctx)
-{
-	int ret;
-
-	ret = restore_task_struct(ctx);
-	ckpt_debug("task %d\n", ret);
-	if (ret < 0)
-		goto out;
-
-	/* zombie - we're done here */
-	if (ret)
-		goto out;
-
-	ret = restore_task_pgid(ctx);
-	if (ret < 0)
-		goto out;
-	ret = restore_thread(ctx);
-	ckpt_debug("thread %d\n", ret);
-	if (ret < 0)
-		goto out;
-	ret = restore_restart_block(ctx);
-	ckpt_debug("restart-blocks %d\n", ret);
-	if (ret < 0)
-		goto out;
-	ret = restore_cpu(ctx);
-	ckpt_debug("cpu %d\n", ret);
-	if (ret < 0)
-		goto out;
-	ret = restore_task_objs(ctx);
-	ckpt_debug("objs %d\n", ret);
-	if (ret < 0)
-		goto out;
-	ret = restore_creds(ctx);
-	ckpt_debug("creds: ret %d\n", ret);
-	if (ret < 0)
-		goto out;
-	ret = restore_task_signal(ctx);
-	ckpt_debug("signal: ret %d\n", ret);
- out:
-	return ret;
-}
diff --git a/checkpoint/restart.c b/checkpoint/restart.c
deleted file mode 100644
index 0891952..0000000
--- a/checkpoint/restart.c
+++ /dev/null
@@ -1,1423 +0,0 @@
-/*
- *  Restart logic and helpers
- *
- *  Copyright (C) 2008-2009 Oren Laadan
- *
- *  This file is subject to the terms and conditions of the GNU General Public
- *  License.  See the file COPYING in the main directory of the Linux
- *  distribution for more details.
- */
-
-/* default debug level for output */
-#define CKPT_DFLAG  CKPT_DSYS
-
-#include <linux/version.h>
-#include <linux/sched.h>
-#include <linux/wait.h>
-#include <linux/file.h>
-#include <linux/ptrace.h>
-#include <linux/freezer.h>
-#include <linux/magic.h>
-#include <linux/utsname.h>
-#include <linux/termios.h>
-#include <asm/syscall.h>
-#include <linux/elf.h>
-#include <linux/deferqueue.h>
-#include <linux/checkpoint.h>
-#include <linux/checkpoint_hdr.h>
-
-#define RESTART_DBG_ROOT	(1 << 0)
-#define RESTART_DBG_GHOST	(1 << 1)
-#define RESTART_DBG_COORD	(1 << 2)
-#define RESTART_DBG_TASK	(1 << 3)
-#define RESTART_DBG_WAITING	(1 << 4)
-#define RESTART_DBG_RUNNING	(1 << 5)
-#define RESTART_DBG_EXITED	(1 << 6)
-#define RESTART_DBG_FAILED	(1 << 7)
-#define RESTART_DBG_SUCCESS	(1 << 8)
-
-#ifdef CONFIG_CHECKPOINT_DEBUG
-
-/*
- * Track status of restarting tasks in a list off of checkpoint_ctx.
- * Print this info when the checkpoint_ctx is freed. Sample output:
- *
- * [3519:2:c/r:debug_task_status:207] 3 tasks registered, nr_tasks was 0 nr_total 0
- * [3519:2:c/r:debug_task_status:210] active pid was 1, ctx->errno 0
- * [3519:2:c/r:debug_task_status:212] kflags 6 uflags 0 oflags 1
- * [3519:2:c/r:debug_task_status:214] task 0 to run was 2
- * [3519:2:c/r:debug_task_status:217] pid 3517  C  r
- * [3519:2:c/r:debug_task_status:217] pid 3519  RN
- * [3519:2:c/r:debug_task_status:217] pid 3520   G
- */
-
-struct ckpt_task_status {
-	pid_t pid;
-	int flags;
-	int error;
-	struct list_head list;
-};
-
-static int restore_debug_task(struct ckpt_ctx *ctx, int flags)
-{
-	struct ckpt_task_status *s;
-
-	s = kmalloc(sizeof(*s), GFP_KERNEL);
-	if (!s) {
-		ckpt_debug("no memory to register ?!\n");
-		return -ENOMEM;
-	}
-	s->pid = current->pid;
-	s->error = 0;
-	s->flags = RESTART_DBG_WAITING | flags;
-	if (current == ctx->root_task)
-		s->flags |= RESTART_DBG_ROOT;
-
-	spin_lock(&ctx->lock);
-	list_add_tail(&s->list, &ctx->task_status);
-	spin_unlock(&ctx->lock);
-
-	return 0;
-}
-
-static struct ckpt_task_status *restore_debug_getme(struct ckpt_ctx *ctx)
-{
-	struct ckpt_task_status *s;
-
-	spin_lock(&ctx->lock);
-	list_for_each_entry(s, &ctx->task_status, list) {
-		if (s->pid == current->pid) {
-			spin_unlock(&ctx->lock);
-			return s;
-		}
-	}
-	spin_unlock(&ctx->lock);
-	return NULL;
-}
-
-static void restore_debug_error(struct ckpt_ctx *ctx, int err)
-{
-	struct ckpt_task_status *s = restore_debug_getme(ctx);
-
-	if (!s)		/* registration may have failed with -ENOMEM */
-		return;
-	s->error = err;
-	s->flags &= ~RESTART_DBG_WAITING;
-	s->flags &= ~RESTART_DBG_RUNNING;
-	if (err)
-		s->flags |= RESTART_DBG_FAILED;
-	else
-		s->flags |= RESTART_DBG_SUCCESS;
-}
-
-static void restore_debug_running(struct ckpt_ctx *ctx)
-{
-	struct ckpt_task_status *s = restore_debug_getme(ctx);
-
-	s->flags &= ~RESTART_DBG_WAITING;
-	s->flags |= RESTART_DBG_RUNNING;
-}
-
-static void restore_debug_exit(struct ckpt_ctx *ctx)
-{
-	struct ckpt_task_status *s = restore_debug_getme(ctx);
-
-	s->flags &= ~RESTART_DBG_WAITING;
-	s->flags |= RESTART_DBG_EXITED;
-}
-
-void restore_debug_free(struct ckpt_ctx *ctx)
-{
-	struct ckpt_task_status *s, *p;
-	int i, count = 0;
-	char *which, *state;
-
-	/*
-	 * See how many tasks registered.  Tasks which didn't reach
-	 * sys_restart() won't have registered.  So if this count is
-	 * not the same as ctx->nr_total, that's a warning bell.
-	 */
-	list_for_each_entry(s, &ctx->task_status, list)
-		count++;
-	ckpt_debug("%d tasks registered, nr_tasks was %d nr_total %d\n",
-		   count, ctx->nr_tasks, atomic_read(&ctx->nr_total));
-
-	ckpt_debug("active pid was %d, ctx->errno %d\n", ctx->active_pid,
-		   ctx->errno);
-	ckpt_debug("kflags %lu uflags %lu oflags %lu", ctx->kflags,
-		   ctx->uflags, ctx->oflags);
-	for (i = 0; i < ctx->nr_pids; i++)
-		ckpt_debug("task[%d] to run %d\n", i, ctx->pids_arr[i].vpid);
-
-	list_for_each_entry_safe(s, p, &ctx->task_status, list) {
-		if (s->flags & RESTART_DBG_COORD)
-			which = "Coord";
-		else if (s->flags & RESTART_DBG_ROOT)
-			which = "Root";
-		else if (s->flags & RESTART_DBG_GHOST)
-			which = "Ghost";
-		else if (s->flags & RESTART_DBG_TASK)
-			which = "Task";
-		else
-			which = "?????";
-		if (s->flags & RESTART_DBG_WAITING)
-			state = "Waiting";
-		else if (s->flags & RESTART_DBG_RUNNING)
-			state = "Running";
-		else if (s->flags & RESTART_DBG_FAILED)
-			state = "Failed";
-		else if (s->flags & RESTART_DBG_SUCCESS)
-			state = "Success";
-		else if (s->flags & RESTART_DBG_EXITED)
-			state = "Exited";
-		else
-			state = "??????";
-		ckpt_debug("pid %d type %s state %s\n", s->pid, which, state);
-		list_del(&s->list);
-		kfree(s);
-	}
-}
-
-#else
-
-static inline int restore_debug_task(struct ckpt_ctx *ctx, int flags)
-{
-	return 0;
-}
-static inline void restore_debug_error(struct ckpt_ctx *ctx, int err) {}
-static inline void restore_debug_running(struct ckpt_ctx *ctx) {}
-static inline void restore_debug_exit(struct ckpt_ctx *ctx) {}
-
-#endif /* CONFIG_CHECKPOINT_DEBUG */
-
-
-static int _ckpt_read_err(struct ckpt_ctx *ctx, struct ckpt_hdr *h)
-{
-	char *ptr;
-	int len, ret;
-
-	len = h->len - sizeof(*h);
-	ptr = kzalloc(len + 1, GFP_KERNEL);
-	if (!ptr) {
-		ckpt_debug("insufficient memory to report image error\n");
-		return -ENOMEM;
-	}
-
-	ret = ckpt_kread(ctx, ptr, len);
-	if (ret >= 0) {
-		ckpt_debug("%s\n", &ptr[1]);
-		ret = -EIO;
-	}
-
-	kfree(ptr);
-	return ret;
-}
-
-/**
- * _ckpt_read_objref - dispatch handling of a shared object
- * @ctx: checkpoint context
- * @hh: object descriptor
- */
-static int _ckpt_read_objref(struct ckpt_ctx *ctx, struct ckpt_hdr *hh)
-{
-	struct ckpt_hdr *h;
-	int ret;
-
-	h = ckpt_hdr_get(ctx, hh->len);
-	if (!h)
-		return -ENOMEM;
-
-	*h = *hh;	/* yay ! */
-
-	_ckpt_debug(CKPT_DOBJ, "shared len %d type %d\n", h->len, h->type);
-	ret = ckpt_kread(ctx, (h + 1), hh->len - sizeof(struct ckpt_hdr));
-	if (ret < 0)
-		goto out;
-
-	ret = restore_obj(ctx, (struct ckpt_hdr_objref *) h);
- out:
-	ckpt_hdr_put(ctx, h);
-	return ret;
-}
-
-/**
- * ckpt_read_obj_dispatch - dispatch ERRORs and OBJREFs; don't return them
- * @ctx: checkpoint context
- * @h: desired ckpt_hdr
- */
-static int ckpt_read_obj_dispatch(struct ckpt_ctx *ctx, struct ckpt_hdr *h)
-{
-	int ret;
-
-	while (1) {
-		ret = ckpt_kread(ctx, h, sizeof(*h));
-		if (ret < 0)
-			return ret;
-		_ckpt_debug(CKPT_DRW, "type %d len %d\n", h->type, h->len);
-		if (h->len < sizeof(*h))
-			return -EINVAL;
-
-		if (h->type == CKPT_HDR_ERROR) {
-			ret = _ckpt_read_err(ctx, h);
-			if (ret < 0)
-				return ret;
-		} else if (h->type == CKPT_HDR_OBJREF) {
-			ret = _ckpt_read_objref(ctx, h);
-			if (ret < 0)
-				return ret;
-		} else
-			return 0;
-	}
-}
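
In other words, ERROR and OBJREF records are in-band: any number of them may
precede the object a caller actually asked for, and the dispatch loop consumes
them transparently. Schematically, one logical read may walk a stream segment
like this (illustrative layout, not an exact on-disk format):

	[CKPT_HDR_OBJREF + payload]	-> restore_obj(), loop again
	[CKPT_HDR_OBJREF + payload]	-> restore_obj(), loop again
	[CKPT_HDR_TASK   + payload]	-> header handed back to the caller

A CKPT_HDR_ERROR record anywhere in that sequence aborts the read with the
error carried in its payload.
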
-
-/**
- * _ckpt_read_obj - read an object (ckpt_hdr followed by payload)
- * @ctx: checkpoint context
- * @h: desired ckpt_hdr
- * @ptr: desired buffer
- * @len: desired object length (if 0, flexible)
- * @max: maximum object length (if 0, flexible)
- *
- * If @ptr is NULL, then read only the header (payload to follow)
- */
-static int _ckpt_read_obj(struct ckpt_ctx *ctx, struct ckpt_hdr *h,
-			  void *ptr, int len, int max)
-{
-	int ret;
-
-	ret = ckpt_read_obj_dispatch(ctx, h);
-	if (ret < 0)
-		return ret;
-	_ckpt_debug(CKPT_DRW, "type %d len %d(%d,%d)\n",
-		    h->type, h->len, len, max);
-
-	/* if len specified, enforce, else if maximum specified, enforce */
-	if ((len && h->len != len) || (!len && max && h->len > max))
-		return -EINVAL;
-
-	if (ptr)
-		ret = ckpt_kread(ctx, ptr, h->len - sizeof(struct ckpt_hdr));
-	return ret;
-}
-
-/**
- * _ckpt_read_obj_type - read an object of some type
- * @ctx: checkpoint context
- * @ptr: provided buffer
- * @len: buffer length
- * @type: buffer type
- *
- * If @ptr is NULL, then read only the header (payload to follow).
- * @len specifies the expected buffer length (ignored if set to 0).
- * Returns: actual _payload_ length
- */
-int _ckpt_read_obj_type(struct ckpt_ctx *ctx, void *ptr, int len, int type)
-{
-	struct ckpt_hdr h;
-	int ret;
-
-	if (len)
-		len += sizeof(struct ckpt_hdr);
-	ret = _ckpt_read_obj(ctx, &h, ptr, len, len);
-	if (ret < 0)
-		return ret;
-	if (h.type != type)
-		return -EINVAL;
-	return h.len - sizeof(h);
-}
-
-/**
- * _ckpt_read_buffer - read an object of type buffer (set length)
- * @ctx: checkpoint context
- * @ptr: provided buffer
- * @len: buffer length
- *
- * If @ptr is NULL, then read only the header (payload to follow).
- * @len specifies the expected buffer length (ignored if set to 0).
- * Returns: _payload_ length.
- */
-int _ckpt_read_buffer(struct ckpt_ctx *ctx, void *ptr, int len)
-{
-	BUG_ON(!len);
-	return _ckpt_read_obj_type(ctx, ptr, len, CKPT_HDR_BUFFER);
-}
-
-/**
- * _ckpt_read_string - read an object of type string (set length)
- * @ctx: checkpoint context
- * @ptr: provided buffer
- * @len: string length (including '\0')
- *
- * If @ptr is NULL, then read only the header (payload to follow)
- */
-int _ckpt_read_string(struct ckpt_ctx *ctx, void *ptr, int len)
-{
-	int ret;
-
-	BUG_ON(!len);
-	ret = _ckpt_read_obj_type(ctx, ptr, len, CKPT_HDR_STRING);
-	if (ret < 0)
-		return ret;
-	if (ptr)
-		((char *) ptr)[len - 1] = '\0';	/* always play it safe */
-	return 0;
-}
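
Typical callers pass a fixed-size destination and let the helper enforce the
length; both of these appear verbatim elsewhere in this patch:

	/* restore_read_header(), below: fixed-size buffer object */
	ret = _ckpt_read_buffer(ctx, uts->release, sizeof(uts->release));

	/* restore_task_struct(), in process.c: NUL-terminated string */
	ret = _ckpt_read_string(ctx, t->comm, TASK_COMM_LEN);
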
-
-/**
- * ckpt_read_obj - allocate and read an object (ckpt_hdr followed by payload)
- * @ctx: checkpoint context
- * @h: object descriptor
- * @len: desired total length (if 0, flexible)
- * @max: maximum total length
- *
- * Return: new buffer allocated on success, error pointer otherwise
- */
-static void *ckpt_read_obj(struct ckpt_ctx *ctx, int len, int max)
-{
-	struct ckpt_hdr hh;
-	struct ckpt_hdr *h;
-	int ret;
-
-	ret = ckpt_read_obj_dispatch(ctx, &hh);
-	if (ret < 0)
-		return ERR_PTR(ret);
-	_ckpt_debug(CKPT_DRW, "type %d len %d(%d,%d)\n",
-		    hh.type, hh.len, len, max);
-
-	/* if len specified, enforce, else if maximum specified, enforce */
-	if ((len && hh.len != len) || (!len && max && hh.len > max))
-		return ERR_PTR(-EINVAL);
-
-	h = ckpt_hdr_get(ctx, hh.len);
-	if (!h)
-		return ERR_PTR(-ENOMEM);
-
-	*h = hh;	/* yay ! */
-
-	ret = ckpt_kread(ctx, (h + 1), hh.len - sizeof(struct ckpt_hdr));
-	if (ret < 0) {
-		ckpt_hdr_put(ctx, h);
-		h = ERR_PTR(ret);
-	}
-
-	return h;
-}
-
-/**
- * ckpt_read_obj_type - allocate and read an object of some type
- * @ctx: checkpoint context
- * @len: desired object length
- * @type: desired object type
- *
- * Return: new buffer allocated on success, error pointer otherwise
- */
-void *ckpt_read_obj_type(struct ckpt_ctx *ctx, int len, int type)
-{
-	struct ckpt_hdr *h;
-
-	BUG_ON(!len);
-
-	h = ckpt_read_obj(ctx, len, len);
-	if (IS_ERR(h)) {
-		ckpt_err(ctx, PTR_ERR(h), "Expecting to read type %d\n", type);
-		return h;
-	}
-
-	if (h->type != type) {
-		ckpt_err(ctx, -EINVAL, "Expected type %d but got %d\n",
-			 type, h->type);
-		ckpt_hdr_put(ctx, h);
-		h = ERR_PTR(-EINVAL);
-	}
-
-	return h;
-}
-
-/**
- * ckpt_read_buf_type - allocate and read an object of some type (flexible)
- * @ctx: checkpoint context
- * @max: maximum payload length
- * @type: desired object type
- *
- * This differs from ckpt_read_obj_type() in that the length of the
- * incoming object is flexible (up to the maximum specified by @max;
- * unlimited if @max is 0), as determined by the ckpt_hdr data.
- *
- * NOTE: for symmetry with checkpoint, @max is the maximum _payload_
- * size, excluding the header.
- *
- * Return: new buffer allocated on success, error pointer otherwise
- */
-void *ckpt_read_buf_type(struct ckpt_ctx *ctx, int max, int type)
-{
-	struct ckpt_hdr *h;
-
-	if (max)
-		max += sizeof(struct ckpt_hdr);
-
-	h = ckpt_read_obj(ctx, 0, max);
-	if (IS_ERR(h))
-		return h;
-
-	if (h->type != type) {
-		ckpt_hdr_put(ctx, h);
-		h = ERR_PTR(-EINVAL);
-	}
-
-	return h;
-}
-
-/**
- * ckpt_read_payload - allocate and read the payload of an object
- * @ctx: checkpoint context
- * @max: maximum payload length
- * @ptr: pointer to buffer to be allocated (caller must free)
- * @type: desired object type
- *
- * This can be used to read a variable-length _payload_ from the checkpoint
- * stream. @max limits the size of the resulting buffer.
- *
- * Return: actual _payload_ length
- */
-int ckpt_read_payload(struct ckpt_ctx *ctx, void **ptr, int max, int type)
-{
-	int len, ret;
-
-	len = _ckpt_read_obj_type(ctx, NULL, 0, type);
-	if (len < 0)
-		return len;
-	else if (len > max)
-		return -EINVAL;
-
-	*ptr = kmalloc(len, GFP_KERNEL);
-	if (!*ptr)
-		return -ENOMEM;
-
-	ret = ckpt_kread(ctx, *ptr, len);
-	if (ret < 0) {
-		kfree(*ptr);
-		return ret;
-	}
-
-	return len;
-}
-
-/**
- * ckpt_read_string - allocate and read a string (variable length)
- * @ctx: checkpoint context
- * @max: maximum acceptable length
- *
- * Return: allocated string or error pointer
- */
-char *ckpt_read_string(struct ckpt_ctx *ctx, int max)
-{
-	char *str;
-	int len;
-
-	len = ckpt_read_payload(ctx, (void **)&str, max, CKPT_HDR_STRING);
-	if (len < 0)
-		return ERR_PTR(len);
-	str[len - 1] = '\0';  	/* always play it safe */
-	return str;
-}
-
-/**
- * ckpt_read_consume - consume the next object of expected type
- * @ctx: checkpoint context
- * @len: desired object length
- * @type: desired object type
- *
- * This can be used to skip an object in the input stream when the
- * data is unnecessary for the restart. @len indicates the length of
- * the object; if @len is zero the length is unconstrained.
- */
-int ckpt_read_consume(struct ckpt_ctx *ctx, int len, int type)
-{
-	struct ckpt_hdr *h;
-	int ret = 0;
-
-	h = ckpt_read_obj(ctx, len, 0);
-	if (IS_ERR(h))
-		return PTR_ERR(h);
-
-	if (h->type != type)
-		ret = -EINVAL;
-
-	ckpt_hdr_put(ctx, h);
-	return ret;
-}
-
-/***********************************************************************
- * Restart
- */
-
-static int check_kernel_const(struct ckpt_const *h)
-{
-	struct task_struct *tsk;
-	struct new_utsname *uts;
-
-	/* task */
-	if (h->task_comm_len != sizeof(tsk->comm))
-		return -EINVAL;
-	/* mm->saved_auxv size */
-	if (h->at_vector_size != AT_VECTOR_SIZE)
-		return -EINVAL;
-	/* signal */
-	if (h->signal_nsig != _NSIG)
-		return -EINVAL;
-	/* uts */
-	if (h->uts_sysname_len != sizeof(uts->sysname))
-		return -EINVAL;
-	if (h->uts_nodename_len != sizeof(uts->nodename))
-		return -EINVAL;
-	if (h->uts_release_len != sizeof(uts->release))
-		return -EINVAL;
-	if (h->uts_version_len != sizeof(uts->version))
-		return -EINVAL;
-	if (h->uts_machine_len != sizeof(uts->machine))
-		return -EINVAL;
-	if (h->uts_domainname_len != sizeof(uts->domainname))
-		return -EINVAL;
-	/* rlimit */
-	if (h->rlimit_nlimits != RLIM_NLIMITS)
-		return -EINVAL;
-	/* tty */
-	if (h->n_tty_buf_size != N_TTY_BUF_SIZE)
-		return -EINVAL;
-	if (h->tty_termios_ncc != NCC)
-		return -EINVAL;
-
-	return 0;
-}
-
-/* read the checkpoint header */
-static int restore_read_header(struct ckpt_ctx *ctx)
-{
-	struct ckpt_hdr_header *h;
-	struct new_utsname *uts = NULL;
-	int ret;
-
-	h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_HEADER);
-	if (IS_ERR(h))
-		return PTR_ERR(h);
-
-	ret = -EINVAL;
-	if (le16_to_cpu(h->arch_id) != CKPT_ARCH_ID) {
-		ckpt_err(ctx, ret, "incompatible architecture id");
-		goto out;
-	}
-	if (h->magic != CHECKPOINT_MAGIC_HEAD ||
-	    h->rev != CHECKPOINT_VERSION ||
-	    h->major != ((LINUX_VERSION_CODE >> 16) & 0xff) ||
-	    h->minor != ((LINUX_VERSION_CODE >> 8) & 0xff) ||
-	    h->patch != ((LINUX_VERSION_CODE) & 0xff)) {
-		ckpt_err(ctx, ret, "incompatible kernel version");
-		goto out;
-	}
-	if (h->uflags & ~CHECKPOINT_USER_FLAGS) {
-		ckpt_err(ctx, ret, "incompatible restart user flags");
-		goto out;
-	}
-
-	ret = check_kernel_const(&h->constants);
-	if (ret < 0) {
-		ckpt_err(ctx, ret, "incompatible kernel constants");
-		goto out;
-	}
-
-	ret = -ENOMEM;
-	uts = kmalloc(sizeof(*uts), GFP_KERNEL);
-	if (!uts)
-		goto out;
-
-	ctx->oflags = h->uflags;
-
-	/* FIX: verify compatibility of release, version and machine */
-	ret = _ckpt_read_buffer(ctx, uts->release, sizeof(uts->release));
-	if (ret < 0)
-		goto out;
-	ret = _ckpt_read_buffer(ctx, uts->version, sizeof(uts->version));
-	if (ret < 0)
-		goto out;
-	ret = _ckpt_read_buffer(ctx, uts->machine, sizeof(uts->machine));
-	if (ret < 0)
-		goto out;
-
-	ret = restore_read_header_arch(ctx);
- out:
-	kfree(uts);
-	ckpt_hdr_put(ctx, h);
-	return ret;
-}
-
-/* read the LSM configuration section */
-static int restore_lsm(struct ckpt_ctx *ctx)
-{
-	int ret;
-	char *cur_lsm = security_get_lsm_name();
-
-	ret = _ckpt_read_buffer(ctx, ctx->lsm_name,
-				CHECKPOINT_LSM_NAME_MAX + 1);
-	if (ret < 0) {
-		ckpt_debug("Error %d reading lsm name\n", ret);
-		return ret;
-	}
-
-	if (!(ctx->uflags & RESTART_KEEP_LSM))
-		goto skip_lsm;
-
-	if (strncmp(cur_lsm, ctx->lsm_name, CHECKPOINT_LSM_NAME_MAX + 1) != 0) {
-		ckpt_debug("c/r: checkpointed LSM %s, current is %s.\n",
-			ctx->lsm_name, cur_lsm);
-		return -EPERM;
-	}
-
-	if (strcmp(ctx->lsm_name, "lsm_none") != 0 &&
-			strcmp(ctx->lsm_name, "smack") != 0 &&
-			strcmp(ctx->lsm_name, "selinux") != 0 &&
-			strcmp(ctx->lsm_name, "default") != 0) {
-		ckpt_debug("c/r: RESTART_KEEP_LSM unsupported for %s\n",
-				ctx->lsm_name);
-		return -ENOSYS;
-	}
-
-skip_lsm:
-	ret = security_may_restart(ctx);
-	if (ret < 0)
-		ckpt_debug("security_may_restart returned %d\n", ret);
-	return ret;
-}
-
-/* read the container configuration section */
-static int restore_container(struct ckpt_ctx *ctx)
-{
-	int ret = 0;
-	struct ckpt_hdr_container *h;
-
-	h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_CONTAINER);
-	if (IS_ERR(h))
-		return PTR_ERR(h);
-	ckpt_hdr_put(ctx, h);
-
-	/* read the LSM name and info which follow ("are a part of")
-	 * the ckpt_hdr_container */
-	ret = restore_lsm(ctx);
-	if (ret < 0)
-		ckpt_debug("Error %d on LSM configuration\n", ret);
-	return ret;
-}
-
-/* read the checkpoint trailer */
-static int restore_read_tail(struct ckpt_ctx *ctx)
-{
-	struct ckpt_hdr_tail *h;
-	int ret = 0;
-
-	h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_TAIL);
-	if (IS_ERR(h))
-		return PTR_ERR(h);
-
-	if (h->magic != CHECKPOINT_MAGIC_TAIL)
-		ret = -EINVAL;
-
-	ckpt_hdr_put(ctx, h);
-	return ret;
-}
-
-/* restore_read_tree - read the tasks tree into the checkpoint context */
-static int restore_read_tree(struct ckpt_ctx *ctx)
-{
-	struct ckpt_hdr_tree *h;
-	int size, ret;
-
-	h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_TREE);
-	if (IS_ERR(h))
-		return PTR_ERR(h);
-
-	ret = -EINVAL;
-	if (h->nr_tasks <= 0)
-		goto out;
-
-	ctx->nr_pids = h->nr_tasks;
-	size = sizeof(*ctx->pids_arr) * ctx->nr_pids;
-	if (size <= 0)		/* overflow ? */
-		goto out;
-
-	ctx->pids_arr = kmalloc(size, GFP_KERNEL);
-	if (!ctx->pids_arr) {
-		ret = -ENOMEM;
-		goto out;
-	}
-	ret = _ckpt_read_buffer(ctx, ctx->pids_arr, size);
- out:
-	ckpt_hdr_put(ctx, h);
-	return ret;
-}
-
-static inline int all_tasks_activated(struct ckpt_ctx *ctx)
-{
-	return (ctx->active_pid == ctx->nr_pids);
-}
-
-static inline pid_t get_active_pid(struct ckpt_ctx *ctx)
-{
-	int active = ctx->active_pid;
-	return active >= 0 ? ctx->pids_arr[active].vpid : 0;
-}
-
-static inline int is_task_active(struct ckpt_ctx *ctx, pid_t pid)
-{
-	return get_active_pid(ctx) == pid;
-}
-
-/*
- * If exiting a restart with error, then wake up all other tasks
- * in the restart context.
- */
-void restore_notify_error(struct ckpt_ctx *ctx)
-{
-	complete(&ctx->complete);
-	wake_up_all(&ctx->waitq);
-	wake_up_all(&ctx->ghostq);
-}
-
-static inline struct ckpt_ctx *get_task_ctx(struct task_struct *task)
-{
-	struct ckpt_ctx *ctx;
-
-	task_lock(task);
-	ctx = ckpt_ctx_get(task->checkpoint_ctx);
-	task_unlock(task);
-	return ctx;
-}
-
-/* returns 0 on success, 1 otherwise */
-static int set_task_ctx(struct task_struct *task, struct ckpt_ctx *ctx)
-{
-	int ret;
-
-	task_lock(task);
-	if (!task->checkpoint_ctx) {
-		task->checkpoint_ctx = ckpt_ctx_get(ctx);
-		ret = 0;
-	} else {
-		ckpt_debug("task %d has checkpoint_ctx\n", task_pid_vnr(task));
-		ret = 1;
-	}
-	task_unlock(task);
-	return ret;
-}
-
-static void clear_task_ctx(struct task_struct *task)
-{
-	struct ckpt_ctx *old;
-
-	task_lock(task);
-	old = task->checkpoint_ctx;
-	task->checkpoint_ctx = NULL;
-	task_unlock(task);
-
-	ckpt_debug("task %d clear checkpoint_ctx\n", task_pid_vnr(task));
-	ckpt_ctx_put(old);
-}
-
-static void restore_task_done(struct ckpt_ctx *ctx)
-{
-	if (atomic_dec_and_test(&ctx->nr_total))
-		complete(&ctx->complete);
-	BUG_ON(atomic_read(&ctx->nr_total) < 0);
-}
-
-static int restore_activate_next(struct ckpt_ctx *ctx)
-{
-	struct task_struct *task;
-	pid_t pid;
-
-	ctx->active_pid++;
-
-	BUG_ON(ctx->active_pid > ctx->nr_pids);
-
-	if (!all_tasks_activated(ctx)) {
-		/* wake up next task in line to restore its state */
-		pid = get_active_pid(ctx);
-
-		rcu_read_lock();
-		task = find_task_by_pid_ns(pid, ctx->root_nsproxy->pid_ns);
-		/* target task must have same restart context */
-		if (task && task->checkpoint_ctx == ctx)
-			wake_up_process(task);
-		else
-			task = NULL;
-		rcu_read_unlock();
-
-		if (!task) {
-			ckpt_err(ctx, -ESRCH, "task %d not found\n", pid);
-			return -ESRCH;
-		}
-	} else {
-		/* wake up ghost tasks so that they can terminate */
-		wake_up_all(&ctx->ghostq);
-	}
-
-	return 0;
-}
-
-static int wait_task_active(struct ckpt_ctx *ctx)
-{
-	pid_t pid = task_pid_vnr(current);
-	int ret;
-
-	ckpt_debug("pid %d waiting\n", pid);
-	ret = wait_event_interruptible(ctx->waitq,
-				       is_task_active(ctx, pid) ||
-				       ckpt_test_error(ctx));
-	ckpt_debug("active %d < %d (ret %d, errno %d)\n",
-		   ctx->active_pid, ctx->nr_pids, ret, ctx->errno);
-	if (ckpt_test_error(ctx))
-		return ckpt_get_error(ctx);
-	return 0;
-}
-
-static int wait_task_sync(struct ckpt_ctx *ctx)
-{
-	ckpt_debug("pid %d syncing\n", task_pid_vnr(current));
-	wait_event_interruptible(ctx->waitq, ckpt_test_complete(ctx));
-	ckpt_debug("task sync done (errno %d)\n", ctx->errno);
-	if (ckpt_test_error(ctx))
-		return ckpt_get_error(ctx);
-	return 0;
-}
-
-/* grabs a reference to the @ctx on success; caller should free */
-static struct ckpt_ctx *wait_checkpoint_ctx(void)
-{
-	DECLARE_WAIT_QUEUE_HEAD_ONSTACK(waitq);
-	struct ckpt_ctx *ctx;
-	int ret;
-
-	/*
-	 * Wait for coordinator to become visible, then grab a
-	 * reference to its restart context.
-	 */
-	ret = wait_event_interruptible(waitq, current->checkpoint_ctx);
-	if (ret < 0) {
-		ckpt_debug("wait_checkpoint_ctx: failed (%d)\n", ret);
-		return ERR_PTR(ret);
-	}
-
-	ctx = get_task_ctx(current);
-	if (!ctx) {
-		ckpt_debug("wait_checkpoint_ctx: checkpoint_ctx missing\n");
-		return ERR_PTR(-EAGAIN);
-	}
-
-	return ctx;
-}
-
-static int do_ghost_task(void)
-{
-	struct ckpt_ctx *ctx;
-	int ret;
-
-	ctx = wait_checkpoint_ctx();
-	if (IS_ERR(ctx))
-		return PTR_ERR(ctx);
-
-	ret = restore_debug_task(ctx, RESTART_DBG_GHOST);
-	if (ret < 0)
-		goto out;
-
-	current->flags |= PF_RESTARTING;
-	restore_debug_running(ctx);
-
-	ret = wait_event_interruptible(ctx->ghostq,
-				       all_tasks_activated(ctx) ||
-				       ckpt_test_error(ctx));
- out:
-	restore_debug_error(ctx, ret);
-	if (ret < 0)
-		ckpt_err(ctx, ret, "ghost restart failed\n");
-
-	current->exit_signal = -1;
-	restore_debug_exit(ctx);
-	ckpt_ctx_put(ctx);
-	do_exit(0);
-
-	/* NOT REACHED */
-}
-
-/*
- * Ensure that all members of a thread group are in sys_restart before
- * restoring any of them. Otherwise, restore may modify shared state
- * and crash or fault a thread still in userspace,
- */
-static int wait_sync_threads(void)
-{
-	struct task_struct *p = current;
-	atomic_t *count;
-	int nr = 0;
-	int ret = 0;
-
-	if (thread_group_empty(p))
-		return 0;
-
-	count = &p->signal->restart_count;
-
-	if (!atomic_read(count)) {
-		read_lock(&tasklist_lock);
-		for (p = next_thread(p); p != current; p = next_thread(p))
-			nr++;
-		read_unlock(&tasklist_lock);
-		/*
-		 * Testing that @count is 0 makes it unlikely that
-		 * multiple threads get here. But if they do, then
-		 * only one will succeed in initializing @count.
-		 */
-		atomic_cmpxchg(count, 0, nr + 1);
-	}
-
-	if (atomic_dec_and_test(count)) {
-		read_lock(&tasklist_lock);
-		for (p = next_thread(p); p != current; p = next_thread(p))
-			wake_up_process(p);
-		read_unlock(&tasklist_lock);
-	} else {
-		DECLARE_WAIT_QUEUE_HEAD_ONSTACK(waitq);
-		ret = wait_event_interruptible(waitq, !atomic_read(count));
-	}
-
-	return ret;
-}
-
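
The counting protocol above is a small init-once barrier: the first thread to
find the counter at zero sizes it to the whole group (racing initializers are
serialized by the cmpxchg, and only one succeeds), and the thread that
decrements it to zero releases the rest. A runnable userspace analogue using
C11 atomics; the pthread plumbing is illustrative and not part of the patch:

#include <pthread.h>
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define NR_THREADS 4

static atomic_int count;
static pthread_mutex_t mtx = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;

static void *worker(void *arg)
{
	int zero = 0;

	/* only one thread succeeds in initializing the counter */
	atomic_compare_exchange_strong(&count, &zero, NR_THREADS);

	pthread_mutex_lock(&mtx);
	if (atomic_fetch_sub(&count, 1) == 1)
		pthread_cond_broadcast(&cond);	/* last one in wakes all */
	else
		while (atomic_load(&count))
			pthread_cond_wait(&cond, &mtx);
	pthread_mutex_unlock(&mtx);

	printf("thread %ld past barrier\n", (long)(intptr_t)arg);
	return NULL;
}

int main(void)
{
	pthread_t t[NR_THREADS];
	long i;

	for (i = 0; i < NR_THREADS; i++)
		pthread_create(&t[i], NULL, worker, (void *)(intptr_t)i);
	for (i = 0; i < NR_THREADS; i++)
		pthread_join(t[i], NULL);
	return 0;
}
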
-static int do_restore_task(void)
-{
-	struct ckpt_ctx *ctx;
-	int zombie, ret;
-
-	ctx = wait_checkpoint_ctx();
-	if (IS_ERR(ctx))
-		return PTR_ERR(ctx);
-
-	ret = restore_debug_task(ctx, RESTART_DBG_TASK);
-	if (ret < 0)
-		goto out;
-
-	current->flags |= PF_RESTARTING;
-
-	ret = wait_sync_threads();
-	if (ret < 0)
-		goto out;
-
-	/* wait for our turn, do the restore, and tell next task in line */
-	ret = wait_task_active(ctx);
-	if (ret < 0)
-		goto out;
-
-	restore_debug_running(ctx);
-
-	ret = pre_restore_task();
-	if (ret < 0)
-		goto out;
-
-	zombie = restore_task(ctx);
-	if (zombie < 0) {
-		ret = zombie;
-		goto out;
-	}
-
-	ret = restore_activate_next(ctx);
-	if (ret < 0)
-		goto out;
-
-	/*
-	 * zombie: we're done here; do_exit() will notice the @ctx on
-	 * our current->checkpoint_ctx (and our PF_RESTARTING), will
-	 * call restore_task_done() and release the @ctx. This ensures
-	 * that we only report done after we really become zombie.
-	 */
-	if (zombie) {
-		restore_debug_exit(ctx);
-		post_restore_task();
-		ckpt_ctx_put(ctx);
-		do_exit(current->exit_code);
-	}
-
-	restore_task_done(ctx);
-	ret = wait_task_sync(ctx);
- out:
-	restore_debug_error(ctx, ret);
-	if (ret < 0)
-		ckpt_err(ctx, ret, "task restart failed\n");
-
-	post_restore_task();
-	current->flags &= ~PF_RESTARTING;
-	clear_task_ctx(current);
-	ckpt_ctx_put(ctx);
-	return ret;
-}
-
-/**
- * __prepare_descendants - set ->checkpoint_ctx of a descendant
- * @task: descendant task
- * @data: points to the checkpoint ctx
- */
-static int __prepare_descendants(struct task_struct *task, void *data)
-{
-	struct ckpt_ctx *ctx = (struct ckpt_ctx *) data;
-
-	ckpt_debug("consider task %d\n", task_pid_vnr(task));
-
-	if (!ptrace_may_access(task, PTRACE_MODE_ATTACH)) {
-		ckpt_debug("stranger task %d\n", task_pid_vnr(task));
-		return -EPERM;
-	}
-
-	if (task_ptrace(task) & PT_PTRACED) {
-		ckpt_debug("ptraced task %d\n", task_pid_vnr(task));
-		return -EBUSY;
-	}
-
-	/*
-	 * Set task->checkpoint_ctx of all non-zombie descendants.
-	 * If a descendant already has a ->checkpoint_ctx, it
-	 * must be a coordinator (for a different restart ?) so
-	 * we fail.
-	 *
-	 * Note that own ancestors cannot interfere since they
-	 * won't descend past us, as own ->checkpoint_ctx must
-	 * already be set.
-	 */
-	if (!task->exit_state) {
-		if (set_task_ctx(task, ctx))
-			return -EBUSY;
-		ckpt_debug("prepare task %d\n", task_pid_vnr(task));
-		wake_up_process(task);
-		return 1;
-	}
-
-	return 0;
-}
-
-/**
- * prepare_descendants - set ->checkpoint_ctx of all descendants
- * @ctx: checkpoint context
- * @root: root process for restart
- *
- * Called by the coordinator to set the ->checkpoint_ctx pointer of the
- * root task and all its descendants.
- */
-static int prepare_descendants(struct ckpt_ctx *ctx, struct task_struct *root)
-{
-	int nr_pids;
-
-	nr_pids = walk_task_subtree(root, __prepare_descendants, ctx);
-	ckpt_debug("nr %d/%d\n", ctx->nr_pids, nr_pids);
-	if (nr_pids < 0)
-		return nr_pids;
-
-	/*
-	 * The actual task count may exceed ctx->nr_pids because of
-	 * 'dead' tasks used as placeholders for PGIDs, but it must
-	 * not fall short.
-	 */
-	if (nr_pids < ctx->nr_pids)
-		return -ESRCH;
-
-	atomic_set(&ctx->nr_total, nr_pids);
-	return nr_pids;
-}
-
-static int wait_all_tasks_finish(struct ckpt_ctx *ctx)
-{
-	int ret;
-
-	BUG_ON(ctx->active_pid != -1);
-	ret = restore_activate_next(ctx);
-	if (ret < 0)
-		return ret;
-
-	ret = wait_for_completion_interruptible(&ctx->complete);
-	ckpt_debug("final sync kflags %#lx (ret %d)\n", ctx->kflags, ret);
-
-	return ret;
-}
-
-static struct task_struct *choose_root_task(struct ckpt_ctx *ctx, pid_t pid)
-{
-	struct task_struct *task;
-
-	if (ctx->uflags & RESTART_TASKSELF) {
-		ctx->root_pid = pid;
-		ctx->root_task = current;
-		get_task_struct(current);
-		return current;
-	}
-
-	read_lock(&tasklist_lock);
-	list_for_each_entry(task, &current->children, sibling) {
-		if (task_pid_vnr(task) == pid) {
-			get_task_struct(task);
-			ctx->root_task = task;
-			ctx->root_pid = pid;
-			break;
-		}
-	}
-	read_unlock(&tasklist_lock);
-
-	return ctx->root_task;
-}
-
-/* setup restart-specific parts of ctx */
-static int init_restart_ctx(struct ckpt_ctx *ctx, pid_t pid)
-{
-	struct nsproxy *nsproxy;
-
-	/*
-	 * No need for explicit cleanup here, because if an error
-	 * occurs then ckpt_ctx_free() is eventually called.
-	 */
-
-	if (!choose_root_task(ctx, pid))
-		return -ESRCH;
-
-	rcu_read_lock();
-	nsproxy = task_nsproxy(ctx->root_task);
-	if (nsproxy) {
-		get_nsproxy(nsproxy);
-		ctx->root_nsproxy = nsproxy;
-	}
-	rcu_read_unlock();
-	if (!nsproxy)
-		return -ESRCH;
-
-	ctx->active_pid = -1;	/* see restore_activate_next, get_active_pid */
-
-	return 0;
-}
-
-static int __destroy_descendants(struct task_struct *task, void *data)
-{
-	struct ckpt_ctx *ctx = (struct ckpt_ctx *) data;
-
-	if (task->checkpoint_ctx == ctx)
-		force_sig(SIGKILL, task);
-
-	return 0;
-}
-
-static void destroy_descendants(struct ckpt_ctx *ctx)
-{
-	walk_task_subtree(ctx->root_task, __destroy_descendants, ctx);
-}
-
-static int do_restore_coord(struct ckpt_ctx *ctx, pid_t pid)
-{
-	int ret;
-
-	ret = restore_debug_task(ctx, RESTART_DBG_COORD);
-	if (ret < 0)
-		return ret;
-	restore_debug_running(ctx);
-
-	ret = restore_read_header(ctx);
-	ckpt_debug("restore header: %d\n", ret);
-	if (ret < 0)
-		return ret;
-	ret = restore_container(ctx);
-	ckpt_debug("restore container: %d\n", ret);
-	if (ret < 0)
-		return ret;
-	ret = restore_read_tree(ctx);
-	ckpt_debug("restore tree: %d\n", ret);
-	if (ret < 0)
-		return ret;
-
-	if ((ctx->uflags & RESTART_TASKSELF) && ctx->nr_pids != 1)
-		return -EINVAL;
-
-	ret = init_restart_ctx(ctx, pid);
-	if (ret < 0)
-		return ret;
-
-	/*
-	 * Populate own ->checkpoint_ctx: if an ancestor attempts to
-	 * prepare_descendants() on us, it will fail. Furthermore,
-	 * that ancestor won't proceed deeper to interfere with our
-	 * descendants that are restarting.
-	 */
-	if (set_task_ctx(current, ctx)) {
-		/*
-		 * We are a misbehaving descendant: an ancestor must
-		 * have called prepare_descendants() on us as part of
-		 * a restart.
-		 */
-		ckpt_debug("coord already has checkpoint_ctx\n");
-		return -EBUSY;
-	}
-
-	/*
-	 * From now on we are committed to the restart. If anything
-	 * fails, we'll cleanup (that is, kill) those tasks in our
-	 * subtree that we marked for restart - see below.
-	 */
-
-	if (ctx->uflags & RESTART_TASKSELF) {
-		ret = pre_restore_task();
-		ckpt_debug("pre restore task: %d\n", ret);
-		if (ret < 0)
-			goto out;
-		ret = restore_task(ctx);
-		ckpt_debug("restore task: %d\n", ret);
-		if (ret < 0)
-			goto out;
-	} else {
-		/* make descendants' t->checkpoint_ctx point to the coord */
-		ret = prepare_descendants(ctx, ctx->root_task);
-		ckpt_debug("restore prepare: %d\n", ret);
-		if (ret < 0)
-			goto out;
-		/* wait for all other tasks to complete do_restore_task() */
-		ret = wait_all_tasks_finish(ctx);
-		ckpt_debug("restore finish: %d\n", ret);
-		if (ret < 0)
-			goto out;
-	}
-
-	ret = deferqueue_run(ctx->deferqueue);  /* run deferred work */
-	ckpt_debug("restore deferqueue: %d\n", ret);
-	if (ret < 0)
-		goto out;
-
-	ret = restore_read_tail(ctx);
-	ckpt_debug("restore tail: %d\n", ret);
-	if (ret < 0)
-		goto out;
-
-	if (ctx->uflags & RESTART_FROZEN) {
-		ret = cgroup_freezer_make_frozen(ctx->root_task);
-		ckpt_debug("freezing restart tasks ... %d\n", ret);
-	}
- out:
-	if (ctx->uflags & RESTART_TASKSELF)
-		post_restore_task();
-
-	restore_debug_error(ctx, ret);
-	if (ret < 0)
-		ckpt_err(ctx, ret, "restart failed (coordinator)\n");
-
-	if (ckpt_test_error(ctx)) {
-		destroy_descendants(ctx);
-		ret = ckpt_get_error(ctx);
-	} else {
-		ckpt_set_success(ctx);
-		wake_up_all(&ctx->waitq);
-	}
-
-	clear_task_ctx(current);
-	return ret;
-}
-
-static long restore_retval(void)
-{
-	struct pt_regs *regs = task_pt_regs(current);
-	long ret;
-
-	/*
-	 * For the restart, we entered the kernel via sys_restart(),
-	 * so our return path is via the syscall exit. In particular,
-	 * the code in entry.S will put the value that we will return
-	 * into a register (e.g. regs->eax in x86), thus passing it to
-	 * the caller task.
-	 *
-	 * What we do now depends on what happened to the checkpointed
-	 * task right before the checkpoint - there are three cases:
-	 *
-	 * 1) It was carrying out a syscall when it became frozen, or
-	 * 2) It was running in userspace, or
-	 * 3) It was doing a self-checkpoint
-	 *
-	 * In case #1, if the syscall succeeded, perhaps partially,
-	 * then the retval is non-negative. If it failed, the error
-	 * may be one of -ERESTART..., which is interpreted in the
-	 * signal handling code. If that is the case, we force the
-	 * signal handler to kick in by faking a signal to ourselves
-	 * (a la freeze/thaw) when ret < 0.
-	 *
-	 * In case #2, our return value would overwrite the original
-	 * value in the affected register. We work around this by
-	 * simply using the saved value of that register as our retval.
-	 *
-	 * In case #3, then the state was recorded while the task was
-	 * in the checkpoint(2) syscall. The syscall is expected to return
-	 * 0 when returning from a restart. Fortunately, this already
-	 * has been arranged for at checkpoint time (the register that
-	 * holds the retval, e.g. regs->eax in x86, was set to
-	 * zero).
-	 */
-
-	/* needed for all 3 cases: get old value/error/retval */
-	ret = syscall_get_return_value(current, regs);
-
-	/* if from a syscall and returning an error, kick in signal handling */
-	if (syscall_get_nr(current, regs) >= 0 && ret < 0)
-		set_tsk_thread_flag(current, TIF_SIGPENDING);
-
-	return ret;
-}
-
-long do_restart(struct ckpt_ctx *ctx, pid_t pid, unsigned long flags)
-{
-	long ret;
-
-	if (ctx)
-		ret = do_restore_coord(ctx, pid);
-	else if (flags & RESTART_GHOST)
-		ret = do_ghost_task();
-	else
-		ret = do_restore_task();
-
-	/* restart(2) isn't idempotent: should not be auto-restarted */
-	if (ret == -ERESTARTSYS || ret == -ERESTARTNOINTR ||
-	    ret == -ERESTARTNOHAND || ret == -ERESTART_RESTARTBLOCK)
-		ret = -EINTR;
-
-	/*
-	 * The value we return to the caller when all goes well is
-	 * either the retval from the original syscall that was
-	 * interrupted during checkpoint, or the contents of the
-	 * (saved) eax if the task was in userspace.
-	 *
-	 * The coordinator (ctx!=NULL) is exempt: don't adjust its retval.
-	 * But in self-restart (where RESTART_TASKSELF), the coordinator
-	 * _itself_ is a restarting task.
-	 */
-
-	if (!ctx || (ctx->uflags & RESTART_TASKSELF)) {
-		if (ret < 0) {
-			/* partial restore is undefined: terminate */
-			ckpt_debug("restart err %ld, exiting\n", ret);
-			force_sig(SIGKILL, current);
-		} else {
-			ret = restore_retval();
-		}
-	}
-
-	ckpt_debug("sys_restart returns %ld\n", ret);
-	return ret;
-}
-
-/**
- * exit_checkpoint - callback from do_exit to cleanup checkpoint state
- * @tsk: terminating task
- */
-void exit_checkpoint(struct task_struct *tsk)
-{
-	struct ckpt_ctx *ctx;
-
-	/* no one else will touch this, because @tsk is dead already */
-	ctx = tsk->checkpoint_ctx;
-
-	/* restarting zombies will activate next task in restart */
-	if (tsk->flags & PF_RESTARTING) {
-		BUG_ON(ctx->active_pid == -1);
-		restore_task_done(ctx);
-	}
-
-	ckpt_ctx_put(ctx);
-}
diff --git a/checkpoint/sys.c b/checkpoint/sys.c
deleted file mode 100644
index a420c02..0000000
--- a/checkpoint/sys.c
+++ /dev/null
@@ -1,719 +0,0 @@
-/*
- *  Generic container checkpoint-restart
- *
- *  Copyright (C) 2008-2009 Oren Laadan
- *
- *  This file is subject to the terms and conditions of the GNU General Public
- *  License.  See the file COPYING in the main directory of the Linux
- *  distribution for more details.
- */
-
-/* default debug level for output */
-#define CKPT_DFLAG  CKPT_DSYS
-
-#include <linux/sched.h>
-#include <linux/nsproxy.h>
-#include <linux/kernel.h>
-#include <linux/cgroup.h>
-#include <linux/syscalls.h>
-#include <linux/fs.h>
-#include <linux/file.h>
-#include <linux/uaccess.h>
-#include <linux/capability.h>
-#include <linux/checkpoint.h>
-#include <linux/mm_checkpoint.h> /* for ckpt_pgarr_free() */
-#include <linux/deferqueue.h>
-
-/*
- * ckpt_unpriv_allowed - sysctl controlled. If 0, do not allow
- * checkpoints or restarts unless the caller has CAP_SYS_ADMIN
- * (prevents unprivileged users from exploiting any privilege
- * escalation bugs). If it is 1, regular permission checks are
- * intended to do the job.
- */
-int ckpt_unpriv_allowed = 1;	/* default: allow */
-
-/*
- * Helpers to write(read) from(to) kernel space to(from) the checkpoint
- * image file descriptor (similar to how a core-dump is performed).
- *
- *   ckpt_kwrite() - write a kernel-space buffer to the checkpoint image
- *   ckpt_kread() - read from the checkpoint image to a kernel-space buffer
- */
-
-static inline int _ckpt_kwrite(struct file *file, void *addr, int count)
-{
-	void __user *uaddr = (__force void __user *) addr;
-	ssize_t nwrite;
-	int nleft;
-
-	for (nleft = count; nleft; nleft -= nwrite) {
-		loff_t pos = file_pos_read(file);
-		nwrite = vfs_write(file, uaddr, nleft, &pos);
-		file_pos_write(file, pos);
-		if (nwrite < 0) {
-			if (nwrite == -EAGAIN)
-				nwrite = 0;
-			else
-				return nwrite;
-		}
-		uaddr += nwrite;
-	}
-	return 0;
-}
-
-int ckpt_kwrite(struct ckpt_ctx *ctx, void *addr, int count)
-{
-	mm_segment_t fs;
-	int ret;
-
-	if (ckpt_test_error(ctx))
-		return ckpt_get_error(ctx);
-
-	fs = get_fs();
-	set_fs(KERNEL_DS);
-	ret = _ckpt_kwrite(ctx->file, addr, count);
-	set_fs(fs);
-
-	ctx->total += count;
-	return ret;
-}
-
-static inline int _ckpt_kread(struct file *file, void *addr, int count)
-{
-	void __user *uaddr = (__force void __user *) addr;
-	ssize_t nread;
-	int nleft;
-
-	for (nleft = count; nleft; nleft -= nread) {
-		loff_t pos = file_pos_read(file);
-		nread = vfs_read(file, uaddr, nleft, &pos);
-		file_pos_write(file, pos);
-		if (nread <= 0) {
-			if (nread == -EAGAIN) {
-				nread = 0;
-				continue;
-			} else if (nread == 0)
-				nread = -EPIPE;		/* unexpected EOF */
-			return nread;
-		}
-		uaddr += nread;
-	}
-	return 0;
-}
-
-int ckpt_kread(struct ckpt_ctx *ctx, void *addr, int count)
-{
-	mm_segment_t fs;
-	int ret;
-
-	if (ckpt_test_error(ctx))
-		return ckpt_get_error(ctx);
-
-	fs = get_fs();
-	set_fs(KERNEL_DS);
-	ret = _ckpt_kread(ctx->file, addr, count);
-	set_fs(fs);
-
-	ctx->total += count;
-	return ret;
-}
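
Both helpers use the classic transfer-it-all loop: -EAGAIN is treated as zero
progress and retried, and on the read side EOF is promoted to -EPIPE because a
truncated image can never be valid. A userspace equivalent of the read loop,
for illustration only:

#include <errno.h>
#include <unistd.h>

/* read exactly 'count' bytes; 0 on success, -1 on error or EOF */
static int read_full(int fd, void *buf, size_t count)
{
	char *p = buf;

	while (count) {
		ssize_t n = read(fd, p, count);

		if (n < 0) {
			if (errno == EAGAIN || errno == EINTR)
				continue;
			return -1;
		}
		if (n == 0)	/* unexpected EOF, cf. -EPIPE above */
			return -1;
		p += n;
		count -= n;
	}
	return 0;
}
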
-
-/**
- * ckpt_hdr_get - get a hdr of certain size
- * @ctx: checkpoint context
- * @len: desired length
- *
- * Returns pointer to header
- */
-void *ckpt_hdr_get(struct ckpt_ctx *ctx, int len)
-{
-	return kzalloc(len, GFP_KERNEL);
-}
-
-/**
- * _ckpt_hdr_put - free a hdr allocated with ckpt_hdr_get
- * @ctx: checkpoint context
- * @ptr: header to free
- * @len: header length
- *
- * (requiring 'ptr' makes it easily interchangeable with kmalloc/kfree)
- */
-void _ckpt_hdr_put(struct ckpt_ctx *ctx, void *ptr, int len)
-{
-	kfree(ptr);
-}
-
-/**
- * ckpt_hdr_put - free a hdr allocated with ckpt_hdr_get
- * @ctx: checkpoint context
- * @ptr: header to free
- *
- * It is assumed that @ptr begins with a 'struct ckpt_hdr'.
- */
-void ckpt_hdr_put(struct ckpt_ctx *ctx, void *ptr)
-{
-	struct ckpt_hdr *h = (struct ckpt_hdr *) ptr;
-	_ckpt_hdr_put(ctx, ptr, h->len);
-}
-
-/**
- * ckpt_hdr_get_type - get a hdr of certain size and type
- * @ctx: checkpoint context
- * @len: number of bytes to reserve
- * @type: object type
- *
- * Returns pointer to the header, with ->type and ->len filled in
- */
-void *ckpt_hdr_get_type(struct ckpt_ctx *ctx, int len, int type)
-{
-	struct ckpt_hdr *h;
-
-	h = ckpt_hdr_get(ctx, len);
-	if (!h)
-		return NULL;
-
-	h->type = type;
-	h->len = len;
-	return h;
-}
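-
-/*
- * The usual get/write/put sequence, sketched with the trailer header
- * as an example (checkpoint_write_tail() in checkpoint.c is exactly
- * this shape):
- *
- *	struct ckpt_hdr_tail *h;
- *
- *	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_TAIL);
- *	if (!h)
- *		return -ENOMEM;
- *	h->magic = CHECKPOINT_MAGIC_TAIL;
- *	ret = ckpt_write_obj(ctx, &h->h);
- *	ckpt_hdr_put(ctx, h);
- */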
-
-#define DUMMY_LSM_INFO "dummy"
-
-int ckpt_write_dummy_lsm_info(struct ckpt_ctx *ctx)
-{
-	return ckpt_write_obj_type(ctx, DUMMY_LSM_INFO,
-			strlen(DUMMY_LSM_INFO), CKPT_HDR_LSM_INFO);
-}
-
-/*
- * ckpt_snarf_lsm_info
- * If there is a CKPT_HDR_LSM_INFO field, toss it.
- * Used when the current LSM doesn't care about this field.
- */
-void ckpt_snarf_lsm_info(struct ckpt_ctx *ctx)
-{
-	struct ckpt_hdr *h;
-
-	h = ckpt_read_buf_type(ctx, CKPT_LSM_INFO_LEN, CKPT_HDR_LSM_INFO);
-	if (!IS_ERR(h))
-		ckpt_hdr_put(ctx, h);
-}
-
-/*
- * Helpers to manage c/r contexts: a context is allocated for each
- * checkpoint and/or restart operation, and persists until the
- * operation is completed.
- */
-
-static void task_arr_free(struct ckpt_ctx *ctx)
-{
-	int n;
-
-	for (n = 0; n < ctx->nr_tasks; n++) {
-		if (ctx->tasks_arr[n]) {
-			put_task_struct(ctx->tasks_arr[n]);
-			ctx->tasks_arr[n] = NULL;
-		}
-	}
-	kfree(ctx->tasks_arr);
-}
-
-static void ckpt_ctx_free(struct ckpt_ctx *ctx)
-{
-	BUG_ON(atomic_read(&ctx->refcount));
-
-	/* per task status debugging only during restart */
-	if (ctx->kflags & CKPT_CTX_RESTART)
-		restore_debug_free(ctx);
-
-	if (ctx->deferqueue)
-		deferqueue_destroy(ctx->deferqueue);
-
-	if (ctx->files_deferq)
-		deferqueue_destroy(ctx->files_deferq);
-
-	if (ctx->file)
-		fput(ctx->file);
-	if (ctx->logfile)
-		fput(ctx->logfile);
-
-	ckpt_obj_hash_free(ctx);
-	path_put(&ctx->root_fs_path);
-	ckpt_pgarr_free(ctx);
-
-	if (ctx->tasks_arr)
-		task_arr_free(ctx);
-
-	if (ctx->root_nsproxy)
-		put_nsproxy(ctx->root_nsproxy);
-	if (ctx->root_task)
-		put_task_struct(ctx->root_task);
-	if (ctx->root_freezer)
-		put_task_struct(ctx->root_freezer);
-
-	free_page((unsigned long) ctx->scratch_page);
-
-	kfree(ctx->pids_arr);
-
-	sock_listening_list_free(&ctx->listen_sockets);
-
-	kfree(ctx);
-}
-
-static struct ckpt_ctx *ckpt_ctx_alloc(int fd, unsigned long uflags,
-				       unsigned long kflags, int logfd)
-{
-	struct ckpt_ctx *ctx;
-	int err;
-
-	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
-	if (!ctx)
-		return ERR_PTR(-ENOMEM);
-
-	ctx->uflags = uflags;
-	ctx->kflags = kflags;
-	ctx->ktime_begin = ktime_get();
-
-	atomic_set(&ctx->refcount, 0);
-	INIT_LIST_HEAD(&ctx->pgarr_list);
-	INIT_LIST_HEAD(&ctx->pgarr_pool);
-	init_waitqueue_head(&ctx->waitq);
-	init_waitqueue_head(&ctx->ghostq);
-	init_completion(&ctx->complete);
-
-	init_rwsem(&ctx->errno_sem);
-	down_write(&ctx->errno_sem);
-
-#ifdef CONFIG_CHECKPOINT_DEBUG
-	INIT_LIST_HEAD(&ctx->task_status);
-	spin_lock_init(&ctx->lock);
-#endif
-
-	mutex_init(&ctx->msg_mutex);
-
-	INIT_LIST_HEAD(&ctx->listen_sockets);
-
-	err = -EBADF;
-	ctx->file = fget(fd);
-	if (!ctx->file)
-		goto err;
-	if (logfd == CHECKPOINT_FD_NONE)
-		goto nolog;
-	ctx->logfile = fget(logfd);
-	if (!ctx->logfile)
-		goto err;
-
- nolog:
-	err = -ENOMEM;
-	if (ckpt_obj_hash_alloc(ctx) < 0)
-		goto err;
-	ctx->deferqueue = deferqueue_create();
-	if (!ctx->deferqueue)
-		goto err;
-
-	ctx->files_deferq = deferqueue_create();
-	if (!ctx->files_deferq)
-		goto err;
-
-	ctx->scratch_page = (void *) __get_free_page(GFP_KERNEL);
-	if (!ctx->scratch_page)
-		goto err;
-
-	atomic_inc(&ctx->refcount);
-	return ctx;
- err:
-	ckpt_ctx_free(ctx);
-	return ERR_PTR(err);
-}
-
-struct ckpt_ctx *ckpt_ctx_get(struct ckpt_ctx *ctx)
-{
-	if (ctx)
-		atomic_inc(&ctx->refcount);
-	return ctx;
-}
-
-void ckpt_ctx_put(struct ckpt_ctx *ctx)
-{
-	if (ctx && atomic_dec_and_test(&ctx->refcount))
-		ckpt_ctx_free(ctx);
-}
-
-void ckpt_set_error(struct ckpt_ctx *ctx, int err)
-{
-	/* atomically set ctx->errno */
-	if (!ckpt_test_and_set_ctx_kflag(ctx, CKPT_CTX_ERROR)) {
-		ctx->errno = err;
-		/*
-		 * We initialized ctx->errno_sem write-held to prevent
-		 * other tasks from reading ctx->errno prematurely.
-		 */
-		up_write(&ctx->errno_sem);
-		/* on restart, notify all tasks in restarting subtree */
-		if (ctx->kflags & CKPT_CTX_RESTART)
-			restore_notify_error(ctx);
-	}
-}
-
-void ckpt_set_success(struct ckpt_ctx *ctx)
-{
-	ckpt_set_ctx_kflag(ctx, CKPT_CTX_SUCCESS);
-	/* avoid warning "lock still held" when freeing (was write-held) */
-	up_write(&ctx->errno_sem);
-}
-
-/* helpers to handle log/dbg/err messages */
-void ckpt_msg_lock(struct ckpt_ctx *ctx)
-{
-	if (!ctx)
-		return;
-	mutex_lock(&ctx->msg_mutex);
-	ctx->msg[0] = '\0';
-	ctx->msglen = 1;
-}
-
-void ckpt_msg_unlock(struct ckpt_ctx *ctx)
-{
-	if (!ctx)
-		return;
-	mutex_unlock(&ctx->msg_mutex);
-}
-
-static inline int is_special_flag(char *s)
-{
-	if (*s == '%' && s[1] == '(' && s[2] != '\0' && s[3] == ')')
-		return 1;
-	return 0;
-}
-
-/*
- * _ckpt_generate_fmt - handle the special flags in the enhanced format
- * strings used by checkpoint/restart error messages.
- * @ctx: checkpoint context
- * @fmt: message format
- *
- * The special flags are surrounded by %() to help them visually stand
- * out.  For instance, %(O) means an objref.  The following special
- * flags are recognized:
- *	O: objref
- *	P: pointer
- *	T: task
- *	S: string
- *	V: symbol (expanded via %pS)
- *
- * %(O) will be expanded to "[obj %d]".  Likewise, P, S, and V will
- * also expand to format flags requiring an argument to the subsequent
- * sprintf or printk.  T will be expanded to a string with no flags,
- * requiring no further arguments.
- *
- * These do not accept any extra flags (i.e. min field width, precision,
- * etc).
- *
- * The caller of ckpt_err() and _ckpt_err() must provide
- * the additional variables, in order, to match the @fmt (except for
- * the T key), e.g.:
- *
- *	ckpt_err(ctx, err, "%(T)FILE flags %d %(O)\n", flags, objref);
- *
- * May be called under spinlock.
- * Must be called with ctx->msg_mutex held.  The expanded format
- * will be placed in ctx->fmt.
- */
-static void _ckpt_generate_fmt(struct ckpt_ctx *ctx, char *fmt)
-{
-	char *s = ctx->fmt;
-	int len = 0;
-
-	for (; *fmt && len < CKPT_MSG_LEN; fmt++) {
-		if (!is_special_flag(fmt)) {
-			s[len++] = *fmt;
-			continue;
-		}
-		switch (fmt[2]) {
-		case 'O':
-			len += snprintf(s+len, CKPT_MSG_LEN-len, "[obj %%d]");
-			break;
-		case 'P':
-			len += snprintf(s+len, CKPT_MSG_LEN-len, "[ptr %%p]");
-			break;
-		case 'V':
-			len += snprintf(s+len, CKPT_MSG_LEN-len, "[sym %%pS]");
-			break;
-		case 'S':
-			len += snprintf(s+len, CKPT_MSG_LEN-len, "[str %%s]");
-			break;
-		case 'T':
-			if (ctx->tsk)
-				len += snprintf(s+len, CKPT_MSG_LEN-len,
-					"[pid %d tsk %s]",
-					task_pid_vnr(ctx->tsk), ctx->tsk->comm);
-			else
-				len += snprintf(s+len, CKPT_MSG_LEN-len,
-					"[pid -1 tsk NULL]");
-			break;
-		default:
-			printk(KERN_ERR "c/r: bad format specifier %c\n",
-					fmt[2]);
-			BUG();
-		}
-		fmt += 3;
-	}
-	if (len >= CKPT_MSG_LEN)
-		s[CKPT_MSG_LEN-1] = '\0';
-	else
-		s[len] = '\0';
-}
-
-static void _ckpt_msg_appendv(struct ckpt_ctx *ctx, int err, char *fmt,
-				va_list ap)
-{
-	int len = ctx->msglen;
-
-	if (err) {
-		len += snprintf(&ctx->msg[len], CKPT_MSG_LEN-len, "[err %d]",
-				 err);
-		if (len > CKPT_MSG_LEN)
-			goto full;
-	}
-
-	len += snprintf(&ctx->msg[len], CKPT_MSG_LEN-len, "[pos %lld]",
-			ctx->total);
-	len += vsnprintf(&ctx->msg[len], CKPT_MSG_LEN-len, fmt, ap);
-	if (len > CKPT_MSG_LEN) {
-full:
-		len = CKPT_MSG_LEN;
-		ctx->msg[CKPT_MSG_LEN-1] = '\0';
-	}
-	ctx->msglen = len;
-}
-
-void _ckpt_msg_append(struct ckpt_ctx *ctx, char *fmt, ...)
-{
-	va_list ap;
-
-	va_start(ap, fmt);
-	_ckpt_msg_appendv(ctx, 0, fmt, ap);
-	va_end(ap);
-}
-
-void _ckpt_msg_complete(struct ckpt_ctx *ctx)
-{
-	int ret;
-
-	/* Don't write an empty or uninitialized msg */
-	if (ctx->msglen <= 1)
-		return;
-
-	if (ctx->kflags & CKPT_CTX_CHECKPOINT && ckpt_test_error(ctx)) {
-		ret = ckpt_write_obj_type(ctx, NULL, 0, CKPT_HDR_ERROR);
-		if (!ret)
-			ret = ckpt_write_string(ctx, ctx->msg, ctx->msglen);
-		if (ret < 0)
-			printk(KERN_NOTICE "c/r: error string unsaved (%d): %s\n",
-			       ret, ctx->msg+1);
-	}
-
-	if (ctx->logfile) {
-		mm_segment_t fs = get_fs();
-		set_fs(KERNEL_DS);
-		ret = _ckpt_kwrite(ctx->logfile, ctx->msg+1, ctx->msglen-1);
-		set_fs(fs);
-	}
-
-#ifdef CONFIG_CHECKPOINT_DEBUG
-	printk(KERN_DEBUG "%s", ctx->msg+1);
-#endif
-
-	ctx->msglen = 0;
-}
-
-#define __do_ckpt_msg(ctx, err, fmt) do {		\
-	va_list ap;					\
-	_ckpt_generate_fmt(ctx, fmt);			\
-	va_start(ap, fmt);				\
-	_ckpt_msg_appendv(ctx, err, ctx->fmt, ap);	\
-	va_end(ap);					\
-} while (0)
-
-void _do_ckpt_msg(struct ckpt_ctx *ctx, int err, char *fmt, ...)
-{
-	__do_ckpt_msg(ctx, err, fmt);
-}
-
-void do_ckpt_msg(struct ckpt_ctx *ctx, int err, char *fmt, ...)
-{
-	if (!ctx)
-		return;
-
-	ckpt_msg_lock(ctx);
-	__do_ckpt_msg(ctx, err, fmt);
-	_ckpt_msg_complete(ctx);
-	ckpt_msg_unlock(ctx);
-
-	if (err)
-		ckpt_set_error(ctx, err);
-}
-
-/**
- * walk_task_subtree: iterate through a task's descendants
- * @root: subtree root task
- * @func: callback invoked on each task
- * @data: pointer passed to the callback
- *
- * The function will start with @root, and iterate through all the
- * descendants, including threads, in a DFS manner. Children of a task
- * are traversed before proceeding to the next thread of that task.
- *
- * For each task, the callback @func will be called providing the task
- * pointer and the @data. The callback is invoked while holding the
- * tasklist_lock for reading. If the callback fails it should return a
- * negative error, and the traversal ends. If the callback succeeds,
- * it returns a non-negative number, and these values are summed.
- *
- * On success, walk_task_subtree() returns the total summed. On
- * failure, it returns a negative value.
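- *
- * For example, a sketch of counting the tasks in a subtree with a
- * callback that returns 1 per task visited (tree_count_tasks() in
- * checkpoint.c works this way):
- *
- *	static int count_one(struct task_struct *t, void *data)
- *	{
- *		return 1;
- *	}
- *
- *	nr = walk_task_subtree(root, count_one, NULL);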
- */
-int walk_task_subtree(struct task_struct *root,
-		      int (*func)(struct task_struct *, void *),
-		      void *data)
-{
-	struct task_struct *leader = root;
-	struct task_struct *parent = NULL;
-	struct task_struct *task = root;
-	int total = 0;
-	int ret;
-
-	read_lock(&tasklist_lock);
-	while (1) {
-		/* invoke callback on this task */
-		ret = func(task, data);
-		if (ret < 0)
-			break;
-
-		total += ret;
-
-		/* if has children - proceed with child */
-		if (!list_empty(&task->children)) {
-			parent = task;
-			task = list_entry(task->children.next,
-					  struct task_struct, sibling);
-			continue;
-		}
-
-		while (task != root) {
-			/* if has sibling - proceed with sibling */
-			if (!list_is_last(&task->sibling, &parent->children)) {
-				task = list_entry(task->sibling.next,
-						  struct task_struct, sibling);
-				break;
-			}
-
-			/* else, trace back to parent and proceed */
-			task = parent;
-			parent = parent->real_parent;
-		}
-
-		if (task == root) {
-			/* in case root task is multi-threaded */
-			root = task = next_thread(task);
-			if (root == leader)
-				break;
-		}
-	}
-	read_unlock(&tasklist_lock);
-
-	ckpt_debug("total %d ret %d\n", total, ret);
-	return (ret < 0 ? ret : total);
-}
-
-/* checkpoint/restart syscalls */
-
-/**
- * do_sys_checkpoint - checkpoint a container
- * @pid: pid of the container init(1) process
- * @fd: file to which dump the checkpoint image
- * @flags: checkpoint operation flags
- * @logfd: fd to which to dump debug and error messages
- *
- * Returns a positive checkpoint identifier on success, 0 when the task
- * resumes from a restart, or a negative value on error
- */
-long do_sys_checkpoint(pid_t pid, int fd, unsigned long flags, int logfd)
-{
-	struct ckpt_ctx *ctx;
-	long ret;
-
-	if (flags & ~CHECKPOINT_USER_FLAGS)
-		return -EINVAL;
-
-	if (!ckpt_unpriv_allowed && !capable(CAP_SYS_ADMIN))
-		return -EPERM;
-
-	if (pid == 0)
-		pid = task_pid_vnr(current);
-	ctx = ckpt_ctx_alloc(fd, flags, CKPT_CTX_CHECKPOINT, logfd);
-	if (IS_ERR(ctx))
-		return PTR_ERR(ctx);
-
-	ret = do_checkpoint(ctx, pid);
-
-	if (!ret)
-		ret = ctx->crid;
-
-	ckpt_ctx_put(ctx);
-	return ret;
-}
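-
-/*
- * From userspace, a coordinator would drive this roughly as follows
- * (a sketch only; the libc wrapper name and syscall plumbing are
- * assumptions, not part of this file):
- *
- *	crid = checkpoint(pid, fd, CHECKPOINT_SUBTREE, logfd);
- *	if (crid < 0)
- *		perror("checkpoint");
- */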
-
-/**
- * do_sys_restart - restart a container
- * @pid: pid of task root (in coordinator's namespace), or 0
- * @fd: file from which read the checkpoint image
- * @flags: restart operation flags
- * @logfd: fd to which to dump debug and error messages
- *
- * Returns a negative value on error; otherwise it returns in the
- * context of the restarted task, as of the original checkpoint
- */
-long do_sys_restart(pid_t pid, int fd, unsigned long flags, int logfd)
-{
-	struct ckpt_ctx *ctx = NULL;
-	long ret;
-
-	/* no flags for now */
-	if (flags & ~RESTART_USER_FLAGS)
-		return -EINVAL;
-
-	if (!ckpt_unpriv_allowed && !capable(CAP_SYS_ADMIN))
-		return -EPERM;
-
-	if (pid)
-		ctx = ckpt_ctx_alloc(fd, flags, CKPT_CTX_RESTART, logfd);
-	if (IS_ERR(ctx))
-		return PTR_ERR(ctx);
-
-	ret = do_restart(ctx, pid, flags);
-
-	ckpt_ctx_put(ctx);
-	return ret;
-}
-
-
-/* 'ckpt_debug_level' controls the verbosity level of c/r code */
-#ifdef CONFIG_CHECKPOINT_DEBUG
-
-/* FIX: allow to change during runtime */
-unsigned long __read_mostly ckpt_debug_level = CKPT_DDEFAULT;
-
-static __init int ckpt_debug_setup(char *s)
-{
-	unsigned long val;
-	int ret;
-
-	ret = strict_strtoul(s, 10, &val);
-	if (ret < 0)
-		return ret;
-	ckpt_debug_level = val;
-	return 0;
-}
-
-__setup("ckpt_debug=", ckpt_debug_setup);
-
-#endif /* CONFIG_CHECKPOINT_DEBUG */
diff --git a/init/Kconfig b/init/Kconfig
index fb43090..5184f65 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -725,7 +725,7 @@ config NET_NS
 	  Allow user space to create what appear to be multiple instances
 	  of the network stack.
 
-source "checkpoint/Kconfig"
+source "kernel/checkpoint/Kconfig"
 
 config BLK_DEV_INITRD
 	bool "Initial RAM filesystem and RAM disk (initramfs/initrd) support"
diff --git a/kernel/Makefile b/kernel/Makefile
index 3c2c303..eea17e1 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -101,6 +101,7 @@ obj-$(CONFIG_SLOW_WORK_DEBUG) += slow-work-debugfs.o
 obj-$(CONFIG_PERF_EVENTS) += perf_event.o
 obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
 obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o
+obj-$(CONFIG_CHECKPOINT) += checkpoint/
 
 ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
 # According to Alan Modra <alan at linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/checkpoint/Kconfig b/kernel/checkpoint/Kconfig
new file mode 100644
index 0000000..4a2c845
--- /dev/null
+++ b/kernel/checkpoint/Kconfig
@@ -0,0 +1,20 @@
+# Architectures should define CHECKPOINT_SUPPORT when they have
+# implemented the hooks for processor state etc. needed by the
+# core checkpoint/restart code.
+
+config DEFERQUEUE
+	bool
+	default n
+
+config CHECKPOINT
+	bool "Checkpoint/restart (EXPERIMENTAL)"
+	depends on CHECKPOINT_SUPPORT && EXPERIMENTAL
+	depends on CGROUP_FREEZER
+	select DEFERQUEUE
+	help
+	  Application checkpoint/restart is the ability to save the
+	  state of a running application so that it can later resume
+	  its execution from the time at which it was checkpointed.
+
+	  Turning this option on will enable checkpoint and restart
+	  functionality in the kernel.
diff --git a/kernel/checkpoint/Makefile b/kernel/checkpoint/Makefile
new file mode 100644
index 0000000..5aa6a75
--- /dev/null
+++ b/kernel/checkpoint/Makefile
@@ -0,0 +1,10 @@
+#
+# Makefile for linux checkpoint/restart.
+#
+
+obj-$(CONFIG_CHECKPOINT) += \
+	sys.o \
+	objhash.o \
+	checkpoint.o \
+	restart.o \
+	process.o
diff --git a/kernel/checkpoint/checkpoint.c b/kernel/checkpoint/checkpoint.c
new file mode 100644
index 0000000..b3c1c4f
--- /dev/null
+++ b/kernel/checkpoint/checkpoint.c
@@ -0,0 +1,660 @@
+/*
+ *  Checkpoint logic and helpers
+ *
+ *  Copyright (C) 2008-2009 Oren Laadan
+ *
+ *  This file is subject to the terms and conditions of the GNU General Public
+ *  License.  See the file COPYING in the main directory of the Linux
+ *  distribution for more details.
+ */
+
+/* default debug level for output */
+#define CKPT_DFLAG  CKPT_DSYS
+
+#include <linux/version.h>
+#include <linux/sched.h>
+#include <linux/freezer.h>
+#include <linux/ptrace.h>
+#include <linux/time.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/fs_struct.h>
+#include <linux/dcache.h>
+#include <linux/mount.h>
+#include <linux/utsname.h>
+#include <linux/magic.h>
+#include <linux/hrtimer.h>
+#include <linux/deferqueue.h>
+#include <linux/checkpoint.h>
+#include <linux/checkpoint_hdr.h>
+
+/* unique checkpoint identifier (FIXME: should be per-container ?) */
+static atomic_t ctx_count = ATOMIC_INIT(0);
+
+/**
+ * ckpt_write_obj - write an object
+ * @ctx: checkpoint context
+ * @h: object descriptor
+ */
+int ckpt_write_obj(struct ckpt_ctx *ctx, struct ckpt_hdr *h)
+{
+	_ckpt_debug(CKPT_DRW, "type %d len %d\n", h->type, h->len);
+	return ckpt_kwrite(ctx, h, h->len);
+}
+
+/**
+ * ckpt_write_obj_type - write an object (from a pointer)
+ * @ctx: checkpoint context
+ * @ptr: buffer pointer
+ * @len: buffer size
+ * @type: desired type
+ *
+ * If @ptr is NULL, then write only the header (payload to follow)
+ */
+int ckpt_write_obj_type(struct ckpt_ctx *ctx, void *ptr, int len, int type)
+{
+	struct ckpt_hdr *h;
+	int ret;
+
+	h = ckpt_hdr_get(ctx, sizeof(*h));
+	if (!h)
+		return -ENOMEM;
+
+	h->type = type;
+	h->len = len + sizeof(*h);
+
+	_ckpt_debug(CKPT_DRW, "type %d len %d\n", h->type, h->len);
+	ret = ckpt_kwrite(ctx, h, sizeof(*h));
+	if (ret < 0)
+		goto out;
+	if (ptr)
+		ret = ckpt_kwrite(ctx, ptr, len);
+ out:
+	_ckpt_hdr_put(ctx, h, sizeof(*h));
+	return ret;
+}
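+
+/*
+ * Passing a NULL @ptr announces @len bytes of payload which the caller
+ * then streams out itself; a sketch of that pattern (checkpoint_pids()
+ * below uses it for the pids array):
+ *
+ *	ret = ckpt_write_obj_type(ctx, NULL, len, CKPT_HDR_BUFFER);
+ *	if (!ret)
+ *		ret = ckpt_kwrite(ctx, buf, len);
+ */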
+
+/**
+ * ckpt_write_buffer - write an object of type buffer
+ * @ctx: checkpoint context
+ * @ptr: buffer pointer
+ * @len: buffer size
+ */
+int ckpt_write_buffer(struct ckpt_ctx *ctx, void *ptr, int len)
+{
+	return ckpt_write_obj_type(ctx, ptr, len, CKPT_HDR_BUFFER);
+}
+
+/**
+ * ckpt_write_string - write an object of type string
+ * @ctx: checkpoint context
+ * @str: string pointer
+ * @len: string length
+ */
+int ckpt_write_string(struct ckpt_ctx *ctx, char *str, int len)
+{
+	return ckpt_write_obj_type(ctx, str, len, CKPT_HDR_STRING);
+}
+
+/***********************************************************************
+ * Checkpoint
+ */
+
+static void fill_kernel_const(struct ckpt_const *h)
+{
+	struct task_struct *tsk;
+	struct new_utsname *uts;
+
+	/* task */
+	h->task_comm_len = sizeof(tsk->comm);
+	/* mm->saved_auxv size */
+	h->at_vector_size = AT_VECTOR_SIZE;
+	/* signal */
+	h->signal_nsig = _NSIG;
+	/* uts */
+	h->uts_sysname_len = sizeof(uts->sysname);
+	h->uts_nodename_len = sizeof(uts->nodename);
+	h->uts_release_len = sizeof(uts->release);
+	h->uts_version_len = sizeof(uts->version);
+	h->uts_machine_len = sizeof(uts->machine);
+	h->uts_domainname_len = sizeof(uts->domainname);
+	/* rlimit */
+	h->rlimit_nlimits = RLIM_NLIMITS;
+	/* tty */
+	h->n_tty_buf_size = N_TTY_BUF_SIZE;
+	h->tty_termios_ncc = NCC;
+}
+
+/* write the checkpoint header */
+static int checkpoint_write_header(struct ckpt_ctx *ctx)
+{
+	struct ckpt_hdr_header *h;
+	struct new_utsname *uts;
+	struct timeval ktv;
+	int ret;
+
+	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_HEADER);
+	if (!h)
+		return -ENOMEM;
+
+	do_gettimeofday(&ktv);
+	uts = utsname();
+
+	h->arch_id = cpu_to_le16(CKPT_ARCH_ID);  /* see asm/checkpoint.h */
+
+	h->magic = CHECKPOINT_MAGIC_HEAD;
+	h->major = (LINUX_VERSION_CODE >> 16) & 0xff;
+	h->minor = (LINUX_VERSION_CODE >> 8) & 0xff;
+	h->patch = (LINUX_VERSION_CODE) & 0xff;
+
+	h->rev = CHECKPOINT_VERSION;
+
+	h->uflags = ctx->uflags;
+	h->time = ktv.tv_sec;
+
+	fill_kernel_const(&h->constants);
+
+	ret = ckpt_write_obj(ctx, &h->h);
+	ckpt_hdr_put(ctx, h);
+	if (ret < 0)
+		return ret;
+
+	down_read(&uts_sem);
+	ret = ckpt_write_buffer(ctx, uts->release, sizeof(uts->release));
+	if (ret < 0)
+		goto up;
+	ret = ckpt_write_buffer(ctx, uts->version, sizeof(uts->version));
+	if (ret < 0)
+		goto up;
+	ret = ckpt_write_buffer(ctx, uts->machine, sizeof(uts->machine));
+ up:
+	up_read(&uts_sem);
+	if (ret < 0)
+		return ret;
+
+	return checkpoint_write_header_arch(ctx);
+}
+
+/* write the container configuration section */
+static int checkpoint_container(struct ckpt_ctx *ctx)
+{
+	struct ckpt_hdr_container *h;
+	int ret;
+
+	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_CONTAINER);
+	if (!h)
+		return -ENOMEM;
+	ret = ckpt_write_obj(ctx, &h->h);
+	ckpt_hdr_put(ctx, h);
+
+	if (ret < 0)
+		return ret;
+
+	memset(ctx->lsm_name, 0, CHECKPOINT_LSM_NAME_MAX + 1);
+	strlcpy(ctx->lsm_name, security_get_lsm_name(),
+				CHECKPOINT_LSM_NAME_MAX + 1);
+	ret = ckpt_write_buffer(ctx, ctx->lsm_name,
+				CHECKPOINT_LSM_NAME_MAX + 1);
+	if (ret < 0)
+		return ret;
+
+	return security_checkpoint_header(ctx);
+}
+
+/* write the checkpoint trailer */
+static int checkpoint_write_tail(struct ckpt_ctx *ctx)
+{
+	struct ckpt_hdr_tail *h;
+	int ret;
+
+	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_TAIL);
+	if (!h)
+		return -ENOMEM;
+
+	h->magic = CHECKPOINT_MAGIC_TAIL;
+
+	ret = ckpt_write_obj(ctx, &h->h);
+	ckpt_hdr_put(ctx, h);
+	return ret;
+}
+
+/* dump all tasks in ctx->tasks_arr[] */
+static int checkpoint_all_tasks(struct ckpt_ctx *ctx)
+{
+	int n, ret = 0;
+
+	for (n = 0; n < ctx->nr_tasks; n++) {
+		ckpt_debug("dumping task #%d\n", n);
+		ret = checkpoint_task(ctx, ctx->tasks_arr[n]);
+		if (ret < 0)
+			break;
+	}
+
+	return ret;
+}
+
+static int may_checkpoint_task(struct ckpt_ctx *ctx, struct task_struct *t)
+{
+	struct task_struct *root = ctx->root_task;
+	struct nsproxy *nsproxy;
+	int ret = 0;
+
+	ckpt_debug("check %d\n", task_pid_nr_ns(t, ctx->root_nsproxy->pid_ns));
+
+	if (t->exit_state == EXIT_DEAD) {
+		_ckpt_err(ctx, -EBUSY, "%(T)Task state EXIT_DEAD\n");
+		return -EBUSY;
+	}
+
+	if (!ptrace_may_access(t, PTRACE_MODE_ATTACH)) {
+		_ckpt_err(ctx, -EPERM, "%(T)Ptrace attach denied\n");
+		return -EPERM;
+	}
+
+	/* zombies are cool (and also don't have nsproxy, below...) */
+	if (t->exit_state)
+		return 0;
+
+	/* verify that all tasks belong to the same freezer cgroup */
+	if (t != current && !in_same_cgroup_freezer(t, ctx->root_freezer)) {
+		_ckpt_err(ctx, -EBUSY, "%(T)Not frozen or wrong cgroup\n");
+		return -EBUSY;
+	}
+
+	/* FIX: add support for ptraced tasks */
+	if (task_ptrace(t)) {
+		_ckpt_err(ctx, -EBUSY, "%(T)Task is ptraced\n");
+		return -EBUSY;
+	}
+
+	/*
+	 * FIX: for now, disallow siblings of container init created
+	 * via CLONE_PARENT (unclear if they will remain possible)
+	 */
+	if (ctx->root_init && t != root &&
+	    t->real_parent == root->real_parent && t->tgid != root->tgid) {
+		_ckpt_err(ctx, -EINVAL, "%(T)Task is sibling of root\n");
+		return -EINVAL;
+	}
+
+	rcu_read_lock();
+	nsproxy = task_nsproxy(t);
+	/* no support for >1 private mntns */
+	if (nsproxy->mnt_ns != ctx->root_nsproxy->mnt_ns) {
+		_ckpt_err(ctx, -EPERM, "%(T)Nested mnt_ns unsupported\n");
+		ret = -EPERM;
+	}
+	/* no support for >1 private netns */
+	if (nsproxy->net_ns != ctx->root_nsproxy->net_ns) {
+		_ckpt_err(ctx, -EPERM, "%(T)Nested net_ns unsupported\n");
+		ret = -EPERM;
+	}
+	/* no support for >1 private pidns */
+	if (nsproxy->pid_ns != ctx->root_nsproxy->pid_ns) {
+		_ckpt_err(ctx, -EPERM, "%(T)Nested pid_ns unsupported\n");
+		ret = -EPERM;
+	}
+	rcu_read_unlock();
+
+	return ret;
+}
+
+#define CKPT_HDR_PIDS_CHUNK	256
+
+static int checkpoint_pids(struct ckpt_ctx *ctx)
+{
+	struct ckpt_pids *h;
+	struct pid_namespace *ns;
+	struct task_struct *task;
+	struct task_struct **tasks_arr;
+	int nr_tasks, n, pos = 0, ret = 0;
+
+	ns = ctx->root_nsproxy->pid_ns;
+	tasks_arr = ctx->tasks_arr;
+	nr_tasks = ctx->nr_tasks;
+	BUG_ON(nr_tasks <= 0);
+
+	ret = ckpt_write_obj_type(ctx, NULL,
+				  sizeof(*h) * nr_tasks,
+				  CKPT_HDR_BUFFER);
+	if (ret < 0)
+		return ret;
+
+	h = ckpt_hdr_get(ctx, sizeof(*h) * CKPT_HDR_PIDS_CHUNK);
+	if (!h)
+		return -ENOMEM;
+
+	do {
+		rcu_read_lock();
+		for (n = 0; n < min(nr_tasks, CKPT_HDR_PIDS_CHUNK); n++) {
+			task = tasks_arr[pos];
+
+			h[n].vpid = task_pid_nr_ns(task, ns);
+			h[n].vtgid = task_tgid_nr_ns(task, ns);
+			h[n].vpgid = task_pgrp_nr_ns(task, ns);
+			h[n].vsid = task_session_nr_ns(task, ns);
+			h[n].vppid = task_tgid_nr_ns(task->real_parent, ns);
+			ckpt_debug("task[%d]: vpid %d vtgid %d parent %d\n",
+				   pos, h[n].vpid, h[n].vtgid, h[n].vppid);
+			pos++;
+		}
+		rcu_read_unlock();
+
+		n = min(nr_tasks, CKPT_HDR_PIDS_CHUNK);
+		ret = ckpt_kwrite(ctx, h, n * sizeof(*h));
+		if (ret < 0)
+			break;
+
+		nr_tasks -= n;
+	} while (nr_tasks > 0);
+
+	_ckpt_hdr_put(ctx, h, sizeof(*h) * CKPT_HDR_PIDS_CHUNK);
+	return ret;
+}
+
+static int collect_objects(struct ckpt_ctx *ctx)
+{
+	int n, ret = 0;
+
+	for (n = 0; n < ctx->nr_tasks; n++) {
+		ckpt_debug("dumping task #%d\n", n);
+		ret = ckpt_collect_task(ctx, ctx->tasks_arr[n]);
+		if (ret < 0) {
+			ctx->tsk = ctx->tasks_arr[n];
+			ckpt_err(ctx, ret, "%(T)Collect failed\n");
+			ctx->tsk = NULL;
+			break;
+		}
+	}
+
+	return ret;
+}
+
+struct ckpt_cnt_tasks {
+	struct ckpt_ctx *ctx;
+	int nr;
+};
+
+/* count number of tasks in tree (and optionally fill pid's in array) */
+static int __tree_count_tasks(struct task_struct *task, void *data)
+{
+	struct ckpt_cnt_tasks *d = (struct ckpt_cnt_tasks *) data;
+	struct ckpt_ctx *ctx = d->ctx;
+	int ret;
+
+	ctx->tsk = task;  /* (for _ckpt_err()) */
+
+	/* is this task cool ? */
+	ret = may_checkpoint_task(ctx, task);
+	if (ret < 0)
+		goto out;
+
+	if (ctx->tasks_arr) {
+		if (d->nr == ctx->nr_tasks) {  /* unlikely... try again later */
+			_ckpt_err(ctx, -EBUSY, "%(T)Bad task count (%d)\n",
+				  d->nr);
+			ret = -EBUSY;
+			goto out;
+		}
+		ctx->tasks_arr[d->nr++] = task;
+		get_task_struct(task);
+	}
+
+	ret = 1;
+ out:
+	ctx->tsk = NULL;
+	return ret;
+}
+
+static int tree_count_tasks(struct ckpt_ctx *ctx)
+{
+	struct ckpt_cnt_tasks data;
+	int ret;
+
+	data.ctx = ctx;
+	data.nr = 0;
+
+	ckpt_msg_lock(ctx);
+	ret = walk_task_subtree(ctx->root_task, __tree_count_tasks, &data);
+	ckpt_msg_unlock(ctx);
+	if (ret < 0)
+		_ckpt_msg_complete(ctx);
+	return ret;
+}
+
+/*
+ * build_tree - scan the tasks tree in DFS order and fill in array
+ * @ctx: checkpoint context
+ *
+ * Using DFS order simplifies the restart logic to re-create the tasks.
+ *
+ * On success, ctx->tasks_arr will be allocated and populated with all
+ * tasks (reference taken), and ctx->nr_tasks will hold the total count.
+ * The array is cleaned up by ckpt_ctx_free().
+ */
+static int build_tree(struct ckpt_ctx *ctx)
+{
+	int n, m;
+
+	/* count tasks (no side effects) */
+	n = tree_count_tasks(ctx);
+	if (n < 0)
+		return n;
+
+	ctx->nr_tasks = n;
+	ctx->tasks_arr = kzalloc(n * sizeof(*ctx->tasks_arr), GFP_KERNEL);
+	if (!ctx->tasks_arr)
+		return -ENOMEM;
+
+	/* count again (now will fill array) */
+	m = tree_count_tasks(ctx);
+
+	/* unlikely, but ... (cleanup in ckpt_ctx_free) */
+	if (m < 0)
+		return m;
+	else if (m != n)
+		return -EBUSY;
+
+	return 0;
+}
+
+/* dump the array that describes the tasks tree */
+static int checkpoint_tree(struct ckpt_ctx *ctx)
+{
+	struct ckpt_hdr_tree *h;
+	int ret;
+
+	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_TREE);
+	if (!h)
+		return -ENOMEM;
+
+	h->nr_tasks = ctx->nr_tasks;
+
+	ret = ckpt_write_obj(ctx, &h->h);
+	ckpt_hdr_put(ctx, h);
+	if (ret < 0)
+		return ret;
+
+	ret = checkpoint_pids(ctx);
+	return ret;
+}
+
+static struct task_struct *get_freezer_task(struct task_struct *root_task)
+{
+	struct task_struct *p;
+
+	/*
+	 * For the duration of checkpoint we deep-freeze all tasks.
+	 * Normally we do it through the root task's freezer cgroup.
+	 * However, if the root task is also the current task (doing
+	 * self-checkpoint) we can't freeze ourselves. In this case,
+	 * choose the next available (non-dead) task instead. We'll
+	 * use its freezer cgroup to verify that all tasks belong to
+	 * the same cgroup.
+	 */
+
+	if (root_task != current) {
+		get_task_struct(root_task);
+		return root_task;
+	}
+
+	/* search among threads, then children */
+	read_lock(&tasklist_lock);
+
+	for (p = next_thread(root_task); p != root_task; p = next_thread(p)) {
+		if (p->state == TASK_DEAD)
+			continue;
+		if (!in_same_cgroup_freezer(p, root_task))
+			goto out;
+	}
+
+	list_for_each_entry(p, &root_task->children, sibling) {
+		if (p->state == TASK_DEAD)
+			continue;
+		if (!in_same_cgroup_freezer(p, root_task))
+			goto out;
+	}
+
+	p = NULL;
+ out:
+	read_unlock(&tasklist_lock);
+	if (p)
+		get_task_struct(p);
+	return p;
+}
+
+/* setup checkpoint-specific parts of ctx */
+static int init_checkpoint_ctx(struct ckpt_ctx *ctx, pid_t pid)
+{
+	struct task_struct *task;
+	struct nsproxy *nsproxy;
+	struct fs_struct *fs;
+
+	/*
+	 * No need for explicit cleanup here, because if an error
+	 * occurs then ckpt_ctx_free() is eventually called.
+	 */
+
+	ctx->root_pid = pid;
+
+	/* root task */
+	read_lock(&tasklist_lock);
+	task = find_task_by_vpid(pid);
+	if (task)
+		get_task_struct(task);
+	read_unlock(&tasklist_lock);
+	if (!task)
+		return -ESRCH;
+	else
+		ctx->root_task = task;
+
+	/* root nsproxy */
+	rcu_read_lock();
+	nsproxy = task_nsproxy(task);
+	if (nsproxy)
+		get_nsproxy(nsproxy);
+	rcu_read_unlock();
+	if (!nsproxy)
+		return -ESRCH;
+	else
+		ctx->root_nsproxy = nsproxy;
+
+	/* root freezer */
+	ctx->root_freezer = get_freezer_task(task);
+
+	/* container init ? */
+	ctx->root_init = is_container_init(task);
+
+	if (!(ctx->uflags & CHECKPOINT_SUBTREE) && !ctx->root_init) {
+		ckpt_err(ctx, -EINVAL, "Not container init\n");
+		return -EINVAL;  /* cleanup by ckpt_ctx_free() */
+	}
+
+	/* root vfs (FIX: WILL CHANGE with mnt-ns etc) */
+	task_lock(ctx->root_task);
+	fs = ctx->root_task->fs;
+	read_lock(&fs->lock);
+	ctx->root_fs_path = fs->root;
+	path_get(&ctx->root_fs_path);
+	read_unlock(&fs->lock);
+	task_unlock(ctx->root_task);
+
+	return 0;
+}
+
+long do_checkpoint(struct ckpt_ctx *ctx, pid_t pid)
+{
+	long ret;
+
+	ret = init_checkpoint_ctx(ctx, pid);
+	if (ret < 0)
+		return ret;
+
+	if (ctx->root_freezer) {
+		ret = cgroup_freezer_begin_checkpoint(ctx->root_freezer);
+		if (ret < 0) {
+			ckpt_err(ctx, ret, "Freezer cgroup failed\n");
+			return ret;
+		}
+	}
+
+	ret = build_tree(ctx);
+	if (ret < 0)
+		goto out;
+
+	if (!(ctx->uflags & CHECKPOINT_SUBTREE)) {
+		/*
+		 * Verify that all objects are contained (no leaks):
+		 * First collect them all into the objhash while counting
+		 * users, and then compare to the objects' real user counts.
+		 */
+		ret = collect_objects(ctx);
+		if (ret < 0)
+			goto out;
+		if (!ckpt_obj_contained(ctx)) {
+			ret = -EBUSY;
+			goto out;
+		}
+	}
+
+	ret = checkpoint_write_header(ctx);
+	if (ret < 0)
+		goto out;
+	ret = checkpoint_container(ctx);
+	if (ret < 0)
+		goto out;
+	ret = checkpoint_tree(ctx);
+	if (ret < 0)
+		goto out;
+	ret = checkpoint_all_tasks(ctx);
+	if (ret < 0)
+		goto out;
+
+	ret = deferqueue_run(ctx->deferqueue);  /* run deferred work */
+	if (ret < 0)
+		goto out;
+
+	/* verify that all objects were indeed visited */
+	if (!ckpt_obj_visited(ctx)) {
+		ckpt_err(ctx, -EBUSY, "Leak: unvisited\n");
+		ret = -EBUSY;
+		goto out;
+	}
+
+	ret = checkpoint_write_tail(ctx);
+	if (ret < 0)
+		goto out;
+
+	/* on success, return (unique) checkpoint identifier */
+	ctx->crid = atomic_inc_return(&ctx_count);
+	ret = ctx->crid;
+ out:
+	if (ret < 0)
+		ckpt_set_error(ctx, ret);
+	else
+		ckpt_set_success(ctx);
+
+	if (ctx->root_freezer)
+		cgroup_freezer_end_checkpoint(ctx->root_freezer);
+	return ret;
+}
diff --git a/kernel/checkpoint/objhash.c b/kernel/checkpoint/objhash.c
new file mode 100644
index 0000000..70c54f5
--- /dev/null
+++ b/kernel/checkpoint/objhash.c
@@ -0,0 +1,1083 @@
+/*
+ *  Checkpoint-restart - object hash infrastructure to manage shared objects
+ *
+ *  Copyright (C) 2008-2009 Oren Laadan
+ *
+ *  This file is subject to the terms and conditions of the GNU General Public
+ *  License.  See the file COPYING in the main directory of the Linux
+ *  distribution for more details.
+ */
+
+/* default debug level for output */
+#define CKPT_DFLAG  CKPT_DOBJ
+
+#include <linux/kernel.h>
+#include <linux/hash.h>
+#include <linux/file.h>
+#include <linux/fdtable.h>
+#include <linux/fs_struct.h>
+#include <linux/sched.h>
+#include <linux/kref.h>
+#include <linux/ipc_namespace.h>
+#include <linux/user_namespace.h>
+#include <linux/mnt_namespace.h>
+#include <linux/checkpoint.h>
+#include <linux/checkpoint_hdr.h>
+#include <net/sock.h>
+
+struct ckpt_obj {
+	int users;
+	int objref;
+	int flags;
+	void *ptr;
+	const struct ckpt_obj_ops *ops;
+	struct hlist_node hash;
+	struct hlist_node next;
+};
+
+/* object internal flags */
+#define CKPT_OBJ_CHECKPOINTED		0x1   /* object already checkpointed */
+#define CKPT_OBJ_VISITED		0x2   /* object already visited */
+
+struct ckpt_obj_hash {
+	struct hlist_head *head;
+	struct hlist_head list;
+	int next_free_objref;
+};
+
+/* helper grab/drop/users functions */
+
+static int obj_inode_grab(void *ptr)
+{
+	return igrab((struct inode *) ptr) ? 0 : -EBADF;
+}
+
+static void obj_inode_drop(void *ptr, int lastref)
+{
+	iput((struct inode *) ptr);
+}
+
+static int obj_file_table_grab(void *ptr)
+{
+	atomic_inc(&((struct files_struct *) ptr)->count);
+	return 0;
+}
+
+static void obj_file_table_drop(void *ptr, int lastref)
+{
+	put_files_struct((struct files_struct *) ptr);
+}
+
+static int obj_file_table_users(void *ptr)
+{
+	return atomic_read(&((struct files_struct *) ptr)->count);
+}
+
+static int obj_file_grab(void *ptr)
+{
+	get_file((struct file *) ptr);
+	return 0;
+}
+
+static void obj_file_drop(void *ptr, int lastref)
+{
+	fput((struct file *) ptr);
+}
+
+static int obj_file_users(void *ptr)
+{
+	return atomic_long_read(&((struct file *) ptr)->f_count);
+}
+
+static int obj_fs_grab(void *ptr)
+{
+	get_fs_struct((struct fs_struct *) ptr);
+	return 0;
+}
+
+static void obj_fs_drop(void *ptr, int lastref)
+{
+	put_fs_struct((struct fs_struct *) ptr);
+}
+
+static int obj_fs_users(void *ptr)
+{
+	/*
+	 * It's safe to not use fs->lock because the fs is referenced.
+	 * It's also sufficient for leak detection: with no leak the
+	 * count can't change; with a leak it will be too big already
+	 * (even if it's about to grow), and if it's about to shrink
+	 * then it's as if we sampled the count a bit earlier.
+	 */
+	return ((struct fs_struct *) ptr)->users;
+}
+
+static int obj_ipc_ns_grab(void *ptr)
+{
+	get_ipc_ns((struct ipc_namespace *) ptr);
+	return 0;
+}
+
+static void obj_ipc_ns_drop(void *ptr, int lastref)
+{
+	put_ipc_ns((struct ipc_namespace *) ptr);
+}
+
+static int obj_ipc_ns_users(void *ptr)
+{
+	return atomic_read(&((struct ipc_namespace *) ptr)->count);
+}
+
+static int obj_mnt_ns_grab(void *ptr)
+{
+	get_mnt_ns((struct mnt_namespace *) ptr);
+	return 0;
+}
+
+static void obj_mnt_ns_drop(void *ptr, int lastref)
+{
+	put_mnt_ns((struct mnt_namespace *) ptr);
+}
+
+static int obj_mnt_ns_users(void *ptr)
+{
+	return atomic_read(&((struct mnt_namespace *) ptr)->count);
+}
+
+static int obj_cred_grab(void *ptr)
+{
+	get_cred((struct cred *) ptr);
+	return 0;
+}
+
+static void obj_cred_drop(void *ptr, int lastref)
+{
+	put_cred((struct cred *) ptr);
+}
+
+static int obj_user_grab(void *ptr)
+{
+	struct user_struct *u = ptr;
+	(void) get_uid(u);
+	return 0;
+}
+
+static void obj_user_drop(void *ptr, int lastref)
+{
+	free_uid((struct user_struct *) ptr);
+}
+
+static int obj_groupinfo_grab(void *ptr)
+{
+	get_group_info((struct group_info *) ptr);
+	return 0;
+}
+
+static void obj_groupinfo_drop(void *ptr, int lastref)
+{
+	put_group_info((struct group_info *) ptr);
+}
+
+static int obj_sock_grab(void *ptr)
+{
+	sock_hold((struct sock *) ptr);
+	return 0;
+}
+
+static void obj_sock_drop(void *ptr, int lastref)
+{
+	struct sock *sk = (struct sock *) ptr;
+
+	/*
+	 * Sockets created during restart are graft()ed, i.e. have a
+	 * valid @sk->sk_socket. Because only an fput() results in the
+	 * necessary sock_release(), we may leak the struct socket of
+	 * sockets that were not attached to a file. Therefore, if
+	 * @lastref is set, we hereby invoke sock_release() on sockets
+	 * that we have put into the objhash but were never attached
+	 * to a file.
+	 */
+	if (lastref && sk->sk_socket && !sk->sk_socket->file) {
+		struct socket *sock = sk->sk_socket;
+		sock_orphan(sk);
+		sock->sk = NULL;
+		sock_release(sock);
+	}
+
+	sock_put((struct sock *) ptr);
+}
+
+static int obj_sock_users(void *ptr)
+{
+	return atomic_read(&((struct sock *) ptr)->sk_refcnt);
+}
+
+static int obj_tty_grab(void *ptr)
+{
+	tty_kref_get((struct tty_struct *) ptr);
+	return 0;
+}
+
+static void obj_tty_drop(void *ptr, int lastref)
+{
+	tty_kref_put((struct tty_struct *) ptr);
+}
+
+static int obj_tty_users(void *ptr)
+{
+	return atomic_read(&((struct tty_struct *) ptr)->kref.refcount);
+}
+
+void lsm_string_free(struct kref *kref)
+{
+	struct ckpt_lsm_string *s = container_of(kref, struct ckpt_lsm_string,
+					kref);
+	kfree(s->string);
+	kfree(s);
+}
+
+static int lsm_string_grab(void *ptr)
+{
+	struct ckpt_lsm_string *s = ptr;
+	kref_get(&s->kref);
+	return 0;
+}
+
+static void lsm_string_drop(void *ptr, int lastref)
+{
+	struct ckpt_lsm_string *s = ptr;
+	kref_put(&s->kref, lsm_string_free);
+}
+
+/* security context strings */
+static int checkpoint_lsm_string(struct ckpt_ctx *ctx, void *ptr);
+static struct ckpt_lsm_string *restore_lsm_string(struct ckpt_ctx *ctx);
+static void *restore_lsm_string_wrap(struct ckpt_ctx *ctx)
+{
+	return (void *)restore_lsm_string(ctx);
+}
+
+/* ignored object */
+static const struct ckpt_obj_ops ckpt_obj_ignored_ops = {
+	.obj_name = "IGNORED",
+	.obj_type = CKPT_OBJ_IGNORE,
+	.ref_drop = NULL,
+	.ref_grab = NULL,
+};
+
+/* inode object */
+static const struct ckpt_obj_ops ckpt_obj_inode_ops = {
+	.obj_name = "INODE",
+	.obj_type = CKPT_OBJ_INODE,
+	.ref_drop = obj_inode_drop,
+	.ref_grab = obj_inode_grab,
+};
+
+/* files_struct object */
+static const struct ckpt_obj_ops ckpt_obj_files_struct_ops = {
+	.obj_name = "FILE_TABLE",
+	.obj_type = CKPT_OBJ_FILE_TABLE,
+	.ref_drop = obj_file_table_drop,
+	.ref_grab = obj_file_table_grab,
+	.ref_users = obj_file_table_users,
+	.checkpoint = checkpoint_file_table,
+	.restore = restore_file_table,
+};
+/* file object */
+static const struct ckpt_obj_ops ckpt_obj_file_ops = {
+	.obj_name = "FILE",
+	.obj_type = CKPT_OBJ_FILE,
+	.ref_drop = obj_file_drop,
+	.ref_grab = obj_file_grab,
+	.ref_users = obj_file_users,
+	.checkpoint = checkpoint_file,
+	.restore = restore_file,
+};
+/* fs object */
+static const struct ckpt_obj_ops ckpt_obj_fs_ops = {
+	.obj_name = "FS",
+	.obj_type = CKPT_OBJ_FS,
+	.ref_drop = obj_fs_drop,
+	.ref_grab = obj_fs_grab,
+	.ref_users = obj_fs_users,
+	.checkpoint = checkpoint_fs,
+	.restore = restore_fs,
+};
+/* ipc_ns object */
+static const struct ckpt_obj_ops ckpt_obj_ipc_ns_ops = {
+	.obj_name = "IPC_NS",
+	.obj_type = CKPT_OBJ_IPC_NS,
+	.ref_drop = obj_ipc_ns_drop,
+	.ref_grab = obj_ipc_ns_grab,
+	.ref_users = obj_ipc_ns_users,
+	.checkpoint = checkpoint_ipc_ns,
+	.restore = restore_ipc_ns,
+};
+/* mnt_ns object */
+static const struct ckpt_obj_ops ckpt_obj_mnt_ns_ops = {
+	.obj_name = "MOUNTS NS",
+	.obj_type = CKPT_OBJ_MNT_NS,
+	.ref_grab = obj_mnt_ns_grab,
+	.ref_drop = obj_mnt_ns_drop,
+	.ref_users = obj_mnt_ns_users,
+};
+/* struct cred */
+static const struct ckpt_obj_ops ckpt_obj_cred_ops = {
+	.obj_name = "CRED",
+	.obj_type = CKPT_OBJ_CRED,
+	.ref_drop = obj_cred_drop,
+	.ref_grab = obj_cred_grab,
+	.checkpoint = checkpoint_cred,
+	.restore = restore_cred,
+};
+/* user object */
+static const struct ckpt_obj_ops ckpt_obj_user_ops = {
+	.obj_name = "USER",
+	.obj_type = CKPT_OBJ_USER,
+	.ref_drop = obj_user_drop,
+	.ref_grab = obj_user_grab,
+	.checkpoint = checkpoint_user,
+	.restore = restore_user,
+};
+/* struct groupinfo */
+static const struct ckpt_obj_ops ckpt_obj_groupinfo_ops = {
+	.obj_name = "GROUPINFO",
+	.obj_type = CKPT_OBJ_GROUPINFO,
+	.ref_drop = obj_groupinfo_drop,
+	.ref_grab = obj_groupinfo_grab,
+	.checkpoint = checkpoint_groupinfo,
+	.restore = restore_groupinfo,
+};
+/* sock object */
+static const struct ckpt_obj_ops ckpt_obj_sock_ops = {
+	.obj_name = "SOCKET",
+	.obj_type = CKPT_OBJ_SOCK,
+	.ref_drop = obj_sock_drop,
+	.ref_grab = obj_sock_grab,
+	.ref_users = obj_sock_users,
+	.checkpoint = checkpoint_sock,
+	.restore = restore_sock,
+};
+/* struct tty_struct */
+static const struct ckpt_obj_ops ckpt_obj_tty_ops = {
+	.obj_name = "TTY",
+	.obj_type = CKPT_OBJ_TTY,
+	.ref_drop = obj_tty_drop,
+	.ref_grab = obj_tty_grab,
+	.ref_users = obj_tty_users,
+	.checkpoint = checkpoint_tty,
+	.restore = restore_tty,
+};
+/*
+ * LSM void *security on objhash - at checkpoint
+ * We don't take a ref because we won't be doing
+ * anything more with this void* - unless we happen
+ * to run into it again through some other object's
+ * ->security (in which case that object has it pinned).
+ */
+static const struct ckpt_obj_ops ckpt_obj_security_ptr_ops = {
+	.obj_name = "SECURITY PTR",
+	.obj_type = CKPT_OBJ_SECURITY_PTR,
+	.ref_drop = NULL,
+	.ref_grab = NULL,
+};
+/*
+ * LSM security strings - at restart
+ * This is a struct which we malloc during restart and
+ * must be freed (by objhash cleanup) at the end of
+ * restart
+ */
+static const struct ckpt_obj_ops ckpt_obj_security_strings_ops = {
+	.obj_name = "SECURITY STRING",
+	.obj_type = CKPT_OBJ_SECURITY,
+	.ref_grab = lsm_string_grab,
+	.ref_drop = lsm_string_drop,
+	.checkpoint = checkpoint_lsm_string,
+	.restore = restore_lsm_string_wrap,
+};
+
+static const struct ckpt_obj_ops *ckpt_obj_ops[] = {
+	[CKPT_OBJ_IGNORE] = &ckpt_obj_ignored_ops,
+	[CKPT_OBJ_INODE] = &ckpt_obj_inode_ops,
+	[CKPT_OBJ_FILE_TABLE] = &ckpt_obj_files_struct_ops,
+	[CKPT_OBJ_FILE] = &ckpt_obj_file_ops,
+	[CKPT_OBJ_FS] = &ckpt_obj_fs_ops,
+	[CKPT_OBJ_IPC_NS] = &ckpt_obj_ipc_ns_ops,
+	[CKPT_OBJ_MNT_NS] = &ckpt_obj_mnt_ns_ops,
+	[CKPT_OBJ_USER_NS] = &ckpt_obj_mnt_ns_ops,
+	[CKPT_OBJ_CRED] = &ckpt_obj_cred_ops,
+	[CKPT_OBJ_USER] = &ckpt_obj_user_ops,
+	[CKPT_OBJ_GROUPINFO] = &ckpt_obj_groupinfo_ops,
+	[CKPT_OBJ_SOCK] = &ckpt_obj_sock_ops,
+	[CKPT_OBJ_TTY] = &ckpt_obj_tty_ops,
+	[CKPT_OBJ_SECURITY_PTR] = &ckpt_obj_security_ptr_ops,
+	[CKPT_OBJ_SECURITY] = &ckpt_obj_security_strings_ops,
+};
+
+void register_checkpoint_obj(const struct ckpt_obj_ops *ops)
+{
+	ckpt_obj_ops[ops->obj_type] = ops;
+}
+
+#define CKPT_OBJ_HASH_NBITS  10
+#define CKPT_OBJ_HASH_TOTAL  (1UL << CKPT_OBJ_HASH_NBITS)
+
+static void obj_hash_clear(struct ckpt_obj_hash *obj_hash)
+{
+	struct hlist_head *h = obj_hash->head;
+	struct hlist_node *n, *t;
+	struct ckpt_obj *obj;
+	int i;
+
+	for (i = 0; i < CKPT_OBJ_HASH_TOTAL; i++) {
+		hlist_for_each_entry_safe(obj, n, t, &h[i], hash) {
+			if (obj->ops->ref_drop)
+				obj->ops->ref_drop(obj->ptr, 1);
+			kfree(obj);
+		}
+	}
+}
+
+void ckpt_obj_hash_free(struct ckpt_ctx *ctx)
+{
+	struct ckpt_obj_hash *obj_hash = ctx->obj_hash;
+
+	if (obj_hash) {
+		obj_hash_clear(obj_hash);
+		kfree(obj_hash->head);
+		kfree(ctx->obj_hash);
+		ctx->obj_hash = NULL;
+	}
+}
+
+int ckpt_obj_hash_alloc(struct ckpt_ctx *ctx)
+{
+	struct ckpt_obj_hash *obj_hash;
+	struct hlist_head *head;
+
+	obj_hash = kzalloc(sizeof(*obj_hash), GFP_KERNEL);
+	if (!obj_hash)
+		return -ENOMEM;
+	head = kzalloc(CKPT_OBJ_HASH_TOTAL * sizeof(*head), GFP_KERNEL);
+	if (!head) {
+		kfree(obj_hash);
+		return -ENOMEM;
+	}
+
+	obj_hash->head = head;
+	obj_hash->next_free_objref = 1;
+	INIT_HLIST_HEAD(&obj_hash->list);
+
+	ctx->obj_hash = obj_hash;
+	return 0;
+}
+
+static struct ckpt_obj *obj_find_by_ptr(struct ckpt_ctx *ctx, void *ptr)
+{
+	struct hlist_head *h;
+	struct hlist_node *n;
+	struct ckpt_obj *obj;
+
+	h = &ctx->obj_hash->head[hash_long((unsigned long) ptr,
+					   CKPT_OBJ_HASH_NBITS)];
+	hlist_for_each_entry(obj, n, h, hash)
+		if (obj->ptr == ptr)
+			return obj;
+	return NULL;
+}
+
+static struct ckpt_obj *obj_find_by_objref(struct ckpt_ctx *ctx, int objref)
+{
+	struct hlist_head *h;
+	struct hlist_node *n;
+	struct ckpt_obj *obj;
+
+	h = &ctx->obj_hash->head[hash_long((unsigned long) objref,
+					   CKPT_OBJ_HASH_NBITS)];
+	hlist_for_each_entry(obj, n, h, hash)
+		if (obj->objref == objref)
+			return obj;
+	return NULL;
+}
+
+static inline int obj_alloc_objref(struct ckpt_ctx *ctx)
+{
+	return ctx->obj_hash->next_free_objref++;
+}
+
+/**
+ * obj_new - add an object to the obj_hash
+ * @ctx: checkpoint context
+ * @ptr: pointer to object
+ * @objref: object unique id
+ * @type: object type
+ *
+ * Add the object to the obj_hash. If @objref is zero, assign a unique
+ * object id and use @ptr as a hash key [checkpoint]. Else use @objref
+ * as a key [restart].
+ */
+static struct ckpt_obj *obj_new(struct ckpt_ctx *ctx, void *ptr,
+				int objref, enum obj_type type)
+{
+	const struct ckpt_obj_ops *ops = ckpt_obj_ops[type];
+	struct ckpt_obj *obj;
+	int i, ret;
+
+	/* explicitly disallow null pointers */
+	BUG_ON(!ptr);
+	/* make sure we don't change this accidentally */
+	BUG_ON(ops->obj_type != type);
+
+	obj = kzalloc(sizeof(*obj), GFP_KERNEL);
+	if (!obj)
+		return ERR_PTR(-ENOMEM);
+
+	obj->ptr = ptr;
+	obj->ops = ops;
+	obj->users = 2;  /* extra reference that objhash itself takes */
+
+	if (!objref) {
+		/* use @obj->ptr to index, assign objref (checkpoint) */
+		obj->objref = obj_alloc_objref(ctx);
+		i = hash_long((unsigned long) ptr, CKPT_OBJ_HASH_NBITS);
+	} else {
+		/* use @obj->objref to index (restart) */
+		obj->objref = objref;
+		i = hash_long((unsigned long) objref, CKPT_OBJ_HASH_NBITS);
+	}
+
+	if (ops->ref_grab)
+		ret = ops->ref_grab(obj->ptr);
+	else
+		ret = 0;
+	if (ret < 0) {
+		kfree(obj);
+		obj = ERR_PTR(ret);
+	} else {
+		hlist_add_head(&obj->hash, &ctx->obj_hash->head[i]);
+		hlist_add_head(&obj->next, &ctx->obj_hash->list);
+	}
+
+	return obj;
+}
+
+/**************************************************************************
+ * Checkpoint
+ */
+
+/**
+ * obj_lookup_add - lookup object and add if not in objhash
+ * @ctx: checkpoint context
+ * @ptr: pointer to object
+ * @type: object type
+ * @first: [output] first encounter (added to table)
+ *
+ * Look up the object pointed to by @ptr in the hash table. If it isn't
+ * already found there, add the object, and allocate a unique object
+ * id. Grab a reference to every object that is added, and maintain the
+ * reference until the entire hash is freed.
+ */
+static struct ckpt_obj *obj_lookup_add(struct ckpt_ctx *ctx, void *ptr,
+				       enum obj_type type, int *first)
+{
+	struct ckpt_obj *obj;
+
+	obj = obj_find_by_ptr(ctx, ptr);
+	if (!obj) {
+		obj = obj_new(ctx, ptr, 0, type);
+		*first = 1;
+	} else {
+		BUG_ON(obj->ops->obj_type != type);
+		obj->users++;
+		*first = 0;
+	}
+	return obj;
+}
+
+/**
+ * ckpt_obj_collect - collect object into objhash
+ * @ctx: checkpoint context
+ * @ptr: pointer to object
+ * @type: object type
+ *
+ * [used during checkpoint].
+ * Return: objref if object is new, 0 otherwise, or an error
+ */
+int ckpt_obj_collect(struct ckpt_ctx *ctx, void *ptr, enum obj_type type)
+{
+	struct ckpt_obj *obj;
+	int first;
+
+	obj = obj_lookup_add(ctx, ptr, type, &first);
+	if (IS_ERR(obj))
+		return PTR_ERR(obj);
+	ckpt_debug("%s objref %d first %d\n",
+		   obj->ops->obj_name, obj->objref, first);
+	return first ? obj->objref : 0;
+}
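+
+/*
+ * For example, a sketch of the collect pass for a task's shared file
+ * table (the real callers live in the per-subsystem c/r code):
+ *
+ *	ret = ckpt_obj_collect(ctx, task->files, CKPT_OBJ_FILE_TABLE);
+ *	if (ret < 0)
+ *		return ret;
+ */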
+
+/**
+ * ckpt_obj_lookup - lookup object (by pointer) in objhash
+ * @ctx: checkpoint context
+ * @ptr: pointer to object
+ * @type: object type
+ *
+ * [used during checkpoint].
+ * Return: objref (or zero if not found)
+ */
+int ckpt_obj_lookup(struct ckpt_ctx *ctx, void *ptr, enum obj_type type)
+{
+	struct ckpt_obj *obj;
+
+	obj = obj_find_by_ptr(ctx, ptr);
+	BUG_ON(obj && obj->ops->obj_type != type);
+	if (obj)
+		ckpt_debug("%s objref %d\n", obj->ops->obj_name, obj->objref);
+	return obj ? obj->objref : 0;
+}
+
+static inline int obj_reverse_leak(struct ckpt_ctx *ctx, struct ckpt_obj *obj)
+{
+	/*
+	 * A "reverse" leak ?  All objects should already be in the
+	 * objhash by now. But an outside task may have created an
+	 * object while we were collecting, which we didn't catch.
+	 */
+	if (obj->ops->ref_users && !(ctx->uflags & CHECKPOINT_SUBTREE)) {
+		ckpt_err(ctx, -EBUSY, "%(O)%(P)Leak: reverse added late (%s)\n",
+			       obj->objref, obj->ptr, obj->ops->obj_name);
+		return -EBUSY;
+	}
+	return 0;
+}
+
+/**
+ * ckpt_obj_lookup_add - lookup object and add if not in objhash
+ * @ctx: checkpoint context
+ * @ptr: pointer to object
+ * @type: object type
+ * @first: [output] first encounter (added to table)
+ *
+ * [used during checkpoint].
+ * Return: objref
+ */
+int ckpt_obj_lookup_add(struct ckpt_ctx *ctx, void *ptr,
+			enum obj_type type, int *first)
+{
+	struct ckpt_obj *obj;
+
+	obj = obj_lookup_add(ctx, ptr, type, first);
+	if (IS_ERR(obj))
+		return PTR_ERR(obj);
+	ckpt_debug("%s objref %d first %d\n",
+		   obj->ops->obj_name, obj->objref, *first);
+
+	if (*first && obj_reverse_leak(ctx, obj))
+		return -EBUSY;
+
+	obj->flags |= CKPT_OBJ_VISITED;
+	return obj->objref;
+}
+
+/**
+ * ckpt_obj_reserve - reserve an objref
+ * @ctx: checkpoint context
+ *
+ * The reserved objref will not be used for subsequent objects. This
+ * gives an objref that can be safely used during restart without a
+ * matching object in checkpoint.  [used during checkpoint].
+ */
+int ckpt_obj_reserve(struct ckpt_ctx *ctx)
+{
+	return obj_alloc_objref(ctx);
+}
+
+/**
+ * checkpoint_obj - if not already in hash, add object and checkpoint
+ * @ctx: checkpoint context
+ * @ptr: pointer to object
+ * @type: object type
+ *
+ * Use obj_lookup_add() to lookup (and possibly add) the object to the
+ * hash table. If the CKPT_OBJ_CHECKPOINTED flag isn't set, then also
+ * save the object's state using its ops->checkpoint().
+ *
+ * [This is used during checkpoint].
+ * Returns: objref
+ */
+int checkpoint_obj(struct ckpt_ctx *ctx, void *ptr, enum obj_type type)
+{
+	struct ckpt_hdr_objref *h;
+	struct ckpt_obj *obj;
+	int new, ret = 0;
+
+	obj = obj_lookup_add(ctx, ptr, type, &new);
+	if (IS_ERR(obj))
+		return PTR_ERR(obj);
+
+	if (new && obj_reverse_leak(ctx, obj))
+		return -EBUSY;
+
+	if (!(obj->flags & CKPT_OBJ_CHECKPOINTED)) {
+		h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_OBJREF);
+		if (!h)
+			return -ENOMEM;
+
+		h->objtype = type;
+		h->objref = obj->objref;
+		ret = ckpt_write_obj(ctx, &h->h);
+		ckpt_hdr_put(ctx, h);
+
+		if (ret < 0)
+			return ret;
+
+		/* invoke callback to actually dump the state */
+		BUG_ON(!obj->ops->checkpoint);
+
+		obj->flags |= CKPT_OBJ_CHECKPOINTED;
+		ret = obj->ops->checkpoint(ctx, ptr);
+	}
+
+	obj->flags |= CKPT_OBJ_VISITED;
+	return (ret < 0 ? ret : obj->objref);
+}
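+
+/*
+ * A sketch of the caller side: dump a shared object once, and record
+ * only its objref in the owning object's image (the field name below
+ * is illustrative; real callers are in the per-subsystem code):
+ *
+ *	objref = checkpoint_obj(ctx, task->files, CKPT_OBJ_FILE_TABLE);
+ *	if (objref < 0)
+ *		return objref;
+ *	h->files_objref = objref;
+ */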
+
+/**
+ * ckpt_obj_visit - mark object as visited
+ * @ctx: checkpoint context
+ * @ptr: pointer to object
+ * @type: object type
+ *
+ * [used during checkpoint].
+ * Marks the object as visited, or fails if not found
+ */
+int ckpt_obj_visit(struct ckpt_ctx *ctx, void *ptr, enum obj_type type)
+{
+	struct ckpt_obj *obj;
+
+	obj = obj_find_by_ptr(ctx, ptr);
+	BUG_ON(obj && obj->ops->obj_type != type);
+
+	if (!obj) {
+		if (!(ctx->uflags & CHECKPOINT_SUBTREE)) {
+			/* if not found report reverse leak (full container) */
+			ckpt_err(ctx, -EBUSY,
+				 "%(P)Leak: reverse unknown (type %d)\n",
+				 ptr, type);
+			return -EBUSY;
+		}
+	} else {
+		ckpt_debug("visit %s objref %d\n",
+			   obj->ops->obj_name, obj->objref);
+		obj->flags |= CKPT_OBJ_VISITED;
+	}
+	return 0;
+}
+
+/* increment the 'users' count of an object */
+static void ckpt_obj_users_inc(struct ckpt_ctx *ctx, void *ptr, int increment)
+{
+	struct ckpt_obj *obj;
+
+	obj = obj_find_by_ptr(ctx, ptr);
+	if (obj)
+		obj->users += increment;
+}
+
+/*
+ * "Leak detection" - to guarantee a consistent checkpoint of a full
+ * container we verify that all resources are confined and isolated in
+ * that container:
+ *
+ * c/r code first walks through all tasks and collects all shared
+ * resources into the objhash, while counting the references to them;
+ * then, it compares this count to the object's real reference count,
+ * and if they don't match it means that an object has "leaked" to the
+ * outside.
+ *
+ * Otherwise, it is guaranteed that there are no references outside
+ * (of container). c/r code now proceeds to walk through all tasks,
+ * again, and checkpoints the resources. It ensures that all resources
+ * are already in the objhash, and that all of them are checkpointed.
+ * Otherwise it means that due to a race, an object was created or
+ * destroyed during the first walk but not accounted for.
+ *
+ * For instance, consider an outside task A that shared files_struct
+ * with inside task B. Then, after B's files were collected, A opens
+ * or closes a file, and immediately exits - before the first leak
+ * test is performed, such that the test passes.
+ */
+
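+/*
+ * Concretely, a sketch with two tasks sharing one files_struct: each
+ * collect bumps obj->users, and the objhash's own grab adds one more,
+ * so the tally equals atomic_read(&files->count) exactly when no task
+ * outside the container holds a reference; any outside user makes the
+ * kernel count larger and the checkpoint fails with -EBUSY.
+ */
+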
+/**
+ * obj_sock_adjust_users - remove implicit reference on DEAD sockets
+ * @obj: CKPT_OBJ_SOCK object to adjust
+ *
+ * Sockets that have been disconnected from their struct file have
+ * a reference count one less than normal sockets.  The objhash's
+ * assumption of such a reference is therefore incorrect, so we correct
+ * it here.
+ */
+static inline void obj_sock_adjust_users(struct ckpt_obj *obj)
+{
+	struct sock *sk = (struct sock *)obj->ptr;
+
+	if (sock_flag(sk, SOCK_DEAD)) {
+		obj->users--;
+		ckpt_debug("Adjusting SOCK %i count to %i\n",
+			   obj->objref, obj->users);
+	}
+}
+
+/**
+ * ckpt_obj_contained - test if shared objects are contained in checkpoint
+ * @ctx: checkpoint context
+ *
+ * Loops through all objects in the table and compares the number of
+ * references accumulated during checkpoint, with the reference count
+ * reported by the kernel.
+ *
+ * Return 1 if respective counts match for all objects, 0 otherwise.
+ */
+int ckpt_obj_contained(struct ckpt_ctx *ctx)
+{
+	struct ckpt_obj *obj;
+	struct hlist_node *node;
+
+	/* account for ctx->{file,logfile} (if in the table already) */
+	ckpt_obj_users_inc(ctx, ctx->file, 1);
+	if (ctx->logfile)
+		ckpt_obj_users_inc(ctx, ctx->logfile, 1);
+	/* account for ctx->root_nsproxy (if in the table already) */
+	ckpt_obj_users_inc(ctx, ctx->root_nsproxy, 1);
+
+	hlist_for_each_entry(obj, node, &ctx->obj_hash->list, next) {
+		if (!obj->ops->ref_users)
+			continue;
+
+		if (obj->ops->obj_type == CKPT_OBJ_SOCK)
+			obj_sock_adjust_users(obj);
+
+		if (obj->ops->ref_users(obj->ptr) != obj->users) {
+			ckpt_err(ctx, -EBUSY,
+				 "%(O)%(P)%(S)Usage leak (%d != %d)\n",
+				 obj->objref, obj->ptr, obj->ops->obj_name,
+				 obj->ops->ref_users(obj->ptr), obj->users);
+			return 0;
+		}
+	}
+
+	return 1;
+}
+
+/**
+ * ckpt_obj_visited - test that all shared objects were visited
+ * @ctx: checkpoint context
+ *
+ * Return 1 if all objects were visited, 0 otherwise.
+ */
+int ckpt_obj_visited(struct ckpt_ctx *ctx)
+{
+	struct ckpt_obj *obj;
+	struct hlist_node *node;
+
+	hlist_for_each_entry(obj, node, &ctx->obj_hash->list, next) {
+		if (!(obj->flags & CKPT_OBJ_VISITED)) {
+			ckpt_err(ctx, -EBUSY,
+				 "%(O)%(P)%(S)Leak: not visited\n",
+				 obj->objref, obj->ptr, obj->ops->obj_name);
+			return 0;
+		}
+	}
+
+	return 1;
+}
+
+/**************************************************************************
+ * Restart
+ */
+
+/**
+ * restore_obj - read in and restore a (first seen) shared object
+ * @ctx: checkpoint context
+ * @h: ckpt_hdr of shared object
+ *
+ * Read in the header payload (struct ckpt_hdr_objref). Look up the
+ * objref to verify it isn't already there.  Then restore the object's
+ * state and add it to the objhash. No need to explicitly grab a
+ * reference - we hold the initial instance of this object. (The object
+ * is maintained until the entire hash is freed).
+ *
+ * [This is used during restart].
+ */
+int restore_obj(struct ckpt_ctx *ctx, struct ckpt_hdr_objref *h)
+{
+	const struct ckpt_obj_ops *ops;
+	struct ckpt_obj *obj;
+	void *ptr = NULL;
+
+	ckpt_debug("len %d ref %d type %d\n", h->h.len, h->objref, h->objtype);
+	if (h->objtype >= CKPT_OBJ_MAX)
+		return -EINVAL;
+	if (h->objref <= 0)
+		return -EINVAL;
+
+	ops = ckpt_obj_ops[h->objtype];
+	BUG_ON(ops->obj_type != h->objtype);
+
+	if (ops->restore)
+		ptr = ops->restore(ctx);
+	if (IS_ERR(ptr))
+		return PTR_ERR(ptr);
+
+	if (obj_find_by_objref(ctx, h->objref))
+		obj = ERR_PTR(-EINVAL);
+	else
+		obj = obj_new(ctx, ptr, h->objref, h->objtype);
+	/*
+	 * Drop an extra reference to the object returned by ops->restore:
+	 * On success, this clears the extra reference taken by obj_new(),
+	 * and on failure, this cleans up the object itself.
+	 */
+	if (ops->ref_drop)
+		ops->ref_drop(ptr, 0);
+	if (IS_ERR(obj)) {
+		if (ops->ref_drop)
+			ops->ref_drop(ptr, 1);
+		return PTR_ERR(obj);
+	}
+	return obj->objref;
+}
+
+/**
+ * ckpt_obj_insert - add an object with a given objref to obj_hash
+ * @ctx: checkpoint context
+ * @ptr: pointer to object
+ * @objref: unique object id
+ * @type: object type
+ *
+ * Add the object pointed to by @ptr and identified by unique object id
+ * @objref to the hash table (indexed by @objref).  Grab a reference to
+ * every object added, and maintain it until the entire hash is freed.
+ *
+ * [This is used during restart].
+ */
+int ckpt_obj_insert(struct ckpt_ctx *ctx, void *ptr,
+		    int objref, enum obj_type type)
+{
+	struct ckpt_obj *obj;
+
+	if (objref <= 0)
+		return -EINVAL;
+	if (obj_find_by_objref(ctx, objref))
+		return -EINVAL;
+	obj = obj_new(ctx, ptr, objref, type);
+	if (IS_ERR(obj))
+		return PTR_ERR(obj);
+	ckpt_debug("%s objref %d\n", obj->ops->obj_name, objref);
+	return obj->objref;
+}
+
+/**
+ * ckpt_obj_try_fetch - fetch an object by its identifier
+ * @ctx: checkpoint context
+ * @objref: object id
+ * @type: object type
+ *
+ * Look up the object identified by @objref in the hash table. Return
+ * an error if it is not found.
+ *
+ * [This is used during restart].
+ */
+void *ckpt_obj_try_fetch(struct ckpt_ctx *ctx, int objref, enum obj_type type)
+{
+	struct ckpt_obj *obj;
+
+	obj = obj_find_by_objref(ctx, objref);
+	if (!obj)
+		return ERR_PTR(-EINVAL);
+	ckpt_debug("%s ref %d\n", obj->ops->obj_name, obj->objref);
+	if (obj->ops->obj_type == type)
+		return obj->ptr;
+	return ERR_PTR(-ENOMSG);
+}
+
+void *ckpt_obj_fetch(struct ckpt_ctx *ctx, int objref, enum obj_type type)
+{
+	void *ret = ckpt_obj_try_fetch(ctx, objref, type);
+
+	if (unlikely(IS_ERR(ret)))
+		ckpt_err(ctx, PTR_ERR(ret), "%(O)Fetching object (type %d)\n",
+			 objref, type);
+	return ret;
+}
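+
+/*
+ * Typical restart-side usage (a sketch mirroring restore_task_creds()
+ * in process.c):
+ *
+ *	cred = ckpt_obj_fetch(ctx, h->cred_ref, CKPT_OBJ_CRED);
+ *	if (IS_ERR(cred))
+ *		return PTR_ERR(cred);
+ */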
+
+/*
+ * Checkpoint a security context string.  This is done by
+ * security/security.c:security_checkpoint_obj() when it checkpoints
+ * a void*security whose context string has not yet been written out.
+ * The objref for the void*security (which is not itself written out
+ * to the checkpoint image) is stored alongside the context string,
+ * as is the type of object which contained the void* security, e.g.
+ * struct file, struct cred, etc.
+ */
+static int checkpoint_lsm_string(struct ckpt_ctx *ctx, void *ptr)
+{
+	struct ckpt_hdr_lsm *h;
+	struct ckpt_lsm_string *l = ptr;
+	int ret;
+
+	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_SECURITY);
+	if (!h)
+		return -ENOMEM;
+	h->sectype = l->sectype;
+	h->ptrref = l->ptrref;
+	ret = ckpt_write_obj(ctx, &h->h);
+	ckpt_hdr_put(ctx, h);
+
+	if (ret < 0)
+		return ret;
+	return ckpt_write_string(ctx, l->string, strlen(l->string)+1);
+}
+
+/*
+ * callback invoked when a security context string is found in a
+ * checkpoint image at restart.  The context string is saved in the object
+ * hash.  The objref under which the void* security was inserted in the
+ * objhash at checkpoint is also found here, and we re-insert this context
+ * string a second time under that objref.  This is because objects which
+ * had this context will have the objref of the void*security, not of the
+ * context string.
+ */
+static struct ckpt_lsm_string *restore_lsm_string(struct ckpt_ctx *ctx)
+{
+	struct ckpt_hdr_lsm *h;
+	struct ckpt_lsm_string *l;
+
+	h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_SECURITY);
+	if (IS_ERR(h)) {
+		ckpt_debug("ckpt_read_obj_type returned %ld\n", PTR_ERR(h));
+		return ERR_PTR(PTR_ERR(h));
+	}
+
+	l = kzalloc(sizeof(*l), GFP_KERNEL);
+	if (!l) {
+		l = ERR_PTR(-ENOMEM);
+		goto out;
+	}
+	l->string = ckpt_read_string(ctx, CKPT_LSM_STRING_MAX);
+	if (IS_ERR(l->string)) {
+		void *s = l->string;
+		ckpt_debug("ckpt_read_string returned %ld\n", PTR_ERR(s));
+		kfree(l);
+		l = s;
+		goto out;
+	}
+	kref_init(&l->kref);
+	l->sectype = h->sectype;
+	/* l is just a placeholder, don't grab a ref */
+	ckpt_obj_insert(ctx, l, h->ptrref, CKPT_OBJ_SECURITY);
+
+out:
+	ckpt_hdr_put(ctx, h);
+	return l;
+}
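+
+/*
+ * Note the pairing: checkpoint_lsm_string() records the objref of the
+ * void *security it stands for (h->ptrref), and restore_lsm_string()
+ * re-inserts the restored string under that same objref, so objects
+ * (files, creds, ...) restored later resolve their security objref
+ * directly to the context string.
+ */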
diff --git a/kernel/checkpoint/process.c b/kernel/checkpoint/process.c
new file mode 100644
index 0000000..6e3e382
--- /dev/null
+++ b/kernel/checkpoint/process.c
@@ -0,0 +1,929 @@
+/*
+ *  Checkpoint task structure
+ *
+ *  Copyright (C) 2008-2009 Oren Laadan
+ *
+ *  This file is subject to the terms and conditions of the GNU General Public
+ *  License.  See the file COPYING in the main directory of the Linux
+ *  distribution for more details.
+ */
+
+/* default debug level for output */
+#define CKPT_DFLAG  CKPT_DSYS
+
+#include <linux/sched.h>
+#include <linux/nsproxy.h>
+#include <linux/posix-timers.h>
+#include <linux/futex.h>
+#include <linux/compat.h>
+#include <linux/poll.h>
+#include <linux/utsname.h>
+#include <linux/user_namespace.h>
+#include <linux/checkpoint.h>
+#include <linux/checkpoint_hdr.h>
+#include <linux/mm_checkpoint.h>
+#include <linux/syscalls.h>
+
+
+pid_t ckpt_pid_nr(struct ckpt_ctx *ctx, struct pid *pid)
+{
+	return pid ? pid_nr_ns(pid, ctx->root_nsproxy->pid_ns) : CKPT_PID_NULL;
+}
+
+/* must be called with tasklist_lock or rcu_read_lock() held */
+struct pid *_ckpt_find_pgrp(struct ckpt_ctx *ctx, pid_t pgid)
+{
+	struct task_struct *p;
+	struct pid *pgrp;
+
+	if (pgid == 0) {
+		/*
+		 * At checkpoint the pgid owner lived in an ancestor
+		 * pid-ns. The best we can do (sanely and safely) is
+		 * to examine the parent of this restart's root: if in
+		 * a distinct pid-ns, use its pgrp; otherwise fail.
+		 */
+		p = ctx->root_task->real_parent;
+		if (p->nsproxy->pid_ns == current->nsproxy->pid_ns)
+			return NULL;
+		pgrp = task_pgrp(p);
+	} else {
+		/*
+		 * Find the owner process of this pgid (it must exist
+		 * if pgrp exists). It must be a thread group leader.
+		 */
+		pgrp = find_vpid(pgid);
+		p = pid_task(pgrp, PIDTYPE_PID);
+		if (!p || !thread_group_leader(p))
+			return NULL;
+		/*
+		 * The pgrp must "belong" to our restart tree (compare
+		 * p->checkpoint_ctx to ours). This prevents malicious
+		 * input from (guessing and) using unrelated pgrps. If
+		 * the owner is dead, then it doesn't have a context,
+		 * so instead compare against its (real) parent's.
+		 */
+		if (p->exit_state == EXIT_ZOMBIE)
+			p = p->real_parent;
+		if (p->checkpoint_ctx != ctx)
+			return NULL;
+	}
+
+	if (task_session(current) != task_session(p))
+		return NULL;
+
+	return pgrp;
+}
+
+
+#ifdef CONFIG_FUTEX
+static void save_task_robust_futex_list(struct ckpt_hdr_task *h,
+					struct task_struct *t)
+{
+	/*
+	 * These are __user pointers and thus can be saved without
+	 * the objhash.
+	 */
+	h->robust_futex_list = (unsigned long)t->robust_list;
+	h->robust_futex_head_len = sizeof(*t->robust_list);
+#ifdef CONFIG_COMPAT
+	h->compat_robust_futex_list = ptr_to_compat(t->compat_robust_list);
+	h->compat_robust_futex_head_len = sizeof(*t->compat_robust_list);
+#endif
+}
+
+static void restore_task_robust_futex_list(struct ckpt_hdr_task *h)
+{
+	/* Since we restore the memory map the address remains the same and
+	 * this is safe. This is the same as [compat_]sys_set_robust_list() */
+	if (h->robust_futex_list) {
+		struct robust_list_head __user *rfl;
+		rfl = (void __user *)(unsigned long) h->robust_futex_list;
+		do_set_robust_list(rfl, h->robust_futex_head_len);
+	}
+#ifdef CONFIG_COMPAT
+	if (h->compat_robust_futex_list) {
+		struct compat_robust_list_head __user *crfl;
+		crfl = compat_ptr(h->compat_robust_futex_list);
+		do_compat_set_robust_list(crfl, h->compat_robust_futex_head_len);
+	}
+#endif
+}
+#else /* !CONFIG_FUTEX */
+static inline void save_task_robust_futex_list(struct ckpt_hdr_task *h,
+					       struct task_struct *t)
+{
+}
+
+static inline void restore_task_robust_futex_list(struct ckpt_hdr_task *h)
+{
+}
+#endif /* CONFIG_FUTEX */
+
+
+/***********************************************************************
+ * Checkpoint
+ */
+
+/* dump the task_struct of a given task */
+static int checkpoint_task_struct(struct ckpt_ctx *ctx, struct task_struct *t)
+{
+	struct ckpt_hdr_task *h;
+	int ret;
+
+	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_TASK);
+	if (!h)
+		return -ENOMEM;
+
+	h->state = t->state;
+	h->exit_state = t->exit_state;
+	h->exit_code = t->exit_code;
+
+	if (t->exit_state) {
+		/* zombie - skip remaining state */
+		BUG_ON(t->exit_state != EXIT_ZOMBIE);
+	} else {
+		/* FIXME: save remaining relevant task_struct fields */
+		h->exit_signal = t->exit_signal;
+		h->pdeath_signal = t->pdeath_signal;
+
+		h->set_child_tid = (unsigned long) t->set_child_tid;
+		h->clear_child_tid = (unsigned long) t->clear_child_tid;
+		save_task_robust_futex_list(h, t);
+	}
+
+	ret = ckpt_write_obj(ctx, &h->h);
+	ckpt_hdr_put(ctx, h);
+	if (ret < 0)
+		return ret;
+
+	return ckpt_write_string(ctx, t->comm, TASK_COMM_LEN);
+}
+
+static int checkpoint_task_ns(struct ckpt_ctx *ctx, struct task_struct *t)
+{
+	struct ckpt_hdr_task_ns *h;
+	struct nsproxy *nsproxy;
+	int ns_objref;
+	int ret;
+
+	rcu_read_lock();
+	nsproxy = task_nsproxy(t);
+	get_nsproxy(nsproxy);
+	rcu_read_unlock();
+
+	ns_objref = checkpoint_obj(ctx, nsproxy, CKPT_OBJ_NS);
+	put_nsproxy(nsproxy);
+
+	ckpt_debug("nsproxy: objref %d\n", ns_objref);
+	if (ns_objref < 0)
+		return ns_objref;
+
+	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_TASK_NS);
+	if (!h)
+		return -ENOMEM;
+	h->ns_objref = ns_objref;
+	ret = ckpt_write_obj(ctx, &h->h);
+	ckpt_hdr_put(ctx, h);
+
+	return ret;
+}
+
+static int checkpoint_task_creds(struct ckpt_ctx *ctx, struct task_struct *t)
+{
+	int realcred_ref, ecred_ref;
+	struct cred *rcred, *ecred;
+	struct ckpt_hdr_task_creds *h;
+	int ret;
+
+	rcred = (struct cred *) get_cred(t->real_cred);
+	ecred = (struct cred *) get_cred(t->cred);
+
+	realcred_ref = checkpoint_obj(ctx, rcred, CKPT_OBJ_CRED);
+	if (realcred_ref < 0) {
+		ret = realcred_ref;
+		goto error;
+	}
+
+	ecred_ref = checkpoint_obj(ctx, ecred, CKPT_OBJ_CRED);
+	if (ecred_ref < 0) {
+		ret = ecred_ref;
+		goto error;
+	}
+
+	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_TASK_CREDS);
+	if (!h) {
+		ret = -ENOMEM;
+		goto error;
+	}
+
+	h->cred_ref = realcred_ref;
+	h->ecred_ref = ecred_ref;
+	ret = ckpt_write_obj(ctx, (struct ckpt_hdr *) h);
+	ckpt_hdr_put(ctx, h);
+
+error:
+	put_cred(rcred);
+	put_cred(ecred);
+	return ret;
+}
+
+static int checkpoint_task_objs(struct ckpt_ctx *ctx, struct task_struct *t)
+{
+	struct ckpt_hdr_task_objs *h;
+	int files_objref;
+	int mm_objref;
+	int fs_objref;
+	int sighand_objref;
+	int signal_objref;
+	int first, ret;
+
+	/*
+	 * Shared objects may have dependencies among them: task->mm
+	 * depends on task->nsproxy (via ipc_ns). Therefore first save
+	 * the namespaces, and then the remaining shared objects.
+	 * During restart a task will already have its namespaces
+	 * restored by the time it restores, e.g., its memory.
+	 */
+
+	ret = checkpoint_task_creds(ctx, t);
+	ckpt_debug("cred: objref %d\n", ret);
+	if (ret < 0) {
+		ckpt_err(ctx, ret, "%(T)process credentials\n");
+		return ret;
+	}
+
+	ret = checkpoint_task_ns(ctx, t);
+	ckpt_debug("ns: objref %d\n", ret);
+	if (ret < 0) {
+		ckpt_err(ctx, ret, "%(T)process namespaces\n");
+		return ret;
+	}
+
+	files_objref = checkpoint_obj_file_table(ctx, t);
+	ckpt_debug("files: objref %d\n", files_objref);
+	if (files_objref < 0) {
+		ckpt_err(ctx, files_objref, "%(T)files_struct\n");
+		return files_objref;
+	}
+
+	mm_objref = checkpoint_obj_mm(ctx, t);
+	ckpt_debug("mm: objref %d\n", mm_objref);
+	if (mm_objref < 0) {
+		ckpt_err(ctx, mm_objref, "%(T)mm_struct\n");
+		return mm_objref;
+	}
+
+	/* note: this must come *after* file-table and mm */
+	fs_objref = checkpoint_obj_fs(ctx, t);
+	if (fs_objref < 0) {
+		ckpt_err(ctx, fs_objref, "%(T)process fs\n");
+		return fs_objref;
+	}
+
+	sighand_objref = checkpoint_obj_sighand(ctx, t);
+	ckpt_debug("sighand: objref %d\n", sighand_objref);
+	if (sighand_objref < 0) {
+		ckpt_err(ctx, sighand_objref, "%(T)sighand_struct\n");
+		return sighand_objref;
+	}
+
+	/*
+	 * Handle t->signal differently because the checkpoint method
+	 * for t->signal needs access to the owning task_struct to reach
+	 * t->sighand (to lock/unlock). First explicitly determine
+	 * whether it needs to be saved, and only invoke
+	 * checkpoint_obj_signal() below if so.
+	 */
+	signal_objref = ckpt_obj_lookup_add(ctx, t->signal,
+					    CKPT_OBJ_SIGNAL, &first);
+	ckpt_debug("signal: objref %d\n", signal_objref);
+	if (signal_objref < 0) {
+		ckpt_err(ctx, signal_objref, "%(T)process signals\n");
+		return signal_objref;
+	}
+
+	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_TASK_OBJS);
+	if (!h)
+		return -ENOMEM;
+	h->files_objref = files_objref;
+	h->mm_objref = mm_objref;
+	h->fs_objref = fs_objref;
+	h->sighand_objref = sighand_objref;
+	h->signal_objref = signal_objref;
+	ret = ckpt_write_obj(ctx, &h->h);
+	ckpt_hdr_put(ctx, h);
+	if (ret < 0)
+		return ret;
+
+	/* actually save t->signal, if needed */
+	if (first)
+		ret = checkpoint_obj_signal(ctx, t);
+	if (ret < 0)
+		ckpt_err(ctx, ret, "%(T)signal_struct\n");
+
+	return ret;
+}
+
+/* dump the task_struct of a given task */
+int checkpoint_restart_block(struct ckpt_ctx *ctx, struct task_struct *t)
+{
+	struct ckpt_hdr_restart_block *h;
+	struct restart_block *restart_block;
+	long (*fn)(struct restart_block *);
+	s64 base, expire = 0;
+	int ret;
+
+	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_RESTART_BLOCK);
+	if (!h)
+		return -ENOMEM;
+
+	base = ktime_to_ns(ctx->ktime_begin);
+	restart_block = &task_thread_info(t)->restart_block;
+	fn = restart_block->fn;
+
+	/* FIX: enumerate clockid_t so we're immune to changes */
+
+	if (fn == do_no_restart_syscall) {
+
+		h->function_type = CKPT_RESTART_BLOCK_NONE;
+		ckpt_debug("restart_block: non\n");
+
+	} else if (fn == hrtimer_nanosleep_restart) {
+
+		h->function_type = CKPT_RESTART_BLOCK_HRTIMER_NANOSLEEP;
+		h->arg_0 = restart_block->nanosleep.index;
+		h->arg_1 = (unsigned long) restart_block->nanosleep.rmtp;
+		expire = restart_block->nanosleep.expires;
+		ckpt_debug("restart_block: hrtimer expire %lld now %lld\n",
+			 expire, base);
+
+	} else if (fn == posix_cpu_nsleep_restart) {
+		struct timespec ts;
+
+		h->function_type = CKPT_RESTART_BLOCK_POSIX_CPU_NANOSLEEP;
+		h->arg_0 = restart_block->arg0;
+		h->arg_1 = restart_block->arg1;
+		ts.tv_sec = restart_block->arg2;
+		ts.tv_nsec = restart_block->arg3;
+		expire = timespec_to_ns(&ts);
+		ckpt_debug("restart_block: posix_cpu expire %lld now %lld\n",
+			 expire, base);
+
+#ifdef CONFIG_COMPAT
+	} else if (fn == compat_nanosleep_restart) {
+
+		h->function_type = CKPT_RESTART_BLOCK_COMPAT_NANOSLEEP;
+		h->arg_0 = restart_block->nanosleep.index;
+		h->arg_1 = (unsigned long)restart_block->nanosleep.rmtp;
+		h->arg_2 = (unsigned long)restart_block->nanosleep.compat_rmtp;
+		expire = restart_block->nanosleep.expires;
+		ckpt_debug("restart_block: compat expire %lld now %lld\n",
+			 expire, base);
+
+	} else if (fn == compat_clock_nanosleep_restart) {
+
+		h->function_type = CKPT_RESTART_BLOCK_COMPAT_CLOCK_NANOSLEEP;
+		h->arg_0 = restart_block->nanosleep.index;
+		h->arg_1 = (unsigned long)restart_block->nanosleep.rmtp;
+		h->arg_2 = (unsigned long)restart_block->nanosleep.compat_rmtp;
+		expire = restart_block->nanosleep.expires;
+		ckpt_debug("restart_block: compat_clock expire %lld now %lld\n",
+			 expire, base);
+
+#endif
+	} else if (fn == futex_wait_restart) {
+
+		h->function_type = CKPT_RESTART_BLOCK_FUTEX;
+		h->arg_0 = (unsigned long) restart_block->futex.uaddr;
+		h->arg_1 = restart_block->futex.val;
+		h->arg_2 = restart_block->futex.flags;
+		h->arg_3 = restart_block->futex.bitset;
+		expire = restart_block->futex.time;
+		ckpt_debug("restart_block: futex expire %lld now %lld\n",
+			 expire, base);
+
+	} else if (fn == do_restart_poll) {
+		struct timespec ts;
+
+		h->function_type = CKPT_RESTART_BLOCK_POLL;
+		h->arg_0 = (unsigned long) restart_block->poll.ufds;
+		h->arg_1 = restart_block->poll.nfds;
+		h->arg_2 = restart_block->poll.has_timeout;
+		ts.tv_sec = restart_block->poll.tv_sec;
+		ts.tv_nsec = restart_block->poll.tv_nsec;
+		expire = timespec_to_ns(&ts);
+		ckpt_debug("restart_block: poll expire %lld now %lld\n",
+			 expire, base);
+
+	} else {
+
+		BUG();
+
+	}
+
+	/* common to all restart blocks: time remaining until expiry */
+	h->arg_4 = (base < expire ? expire - base : 0);
+
+	ckpt_debug("restart_block: args %#llx %#llx %#llx %#llx %#llx\n",
+		 h->arg_0, h->arg_1, h->arg_2, h->arg_3, h->arg_4);
+
+	ret = ckpt_write_obj(ctx, &h->h);
+	ckpt_hdr_put(ctx, h);
+
+	ckpt_debug("restart_block ret %d\n", ret);
+	return ret;
+}
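+
+/*
+ * The expiry saved in arg_4 is relative to the checkpoint's
+ * ktime_begin; restore_restart_block() re-bases it on the restart's
+ * ktime_begin, so an interrupted sleep/poll resumes with (roughly)
+ * its remaining timeout rather than an absolute time in the past.
+ */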
+
+/* dump the entire state of a given task */
+int checkpoint_task(struct ckpt_ctx *ctx, struct task_struct *t)
+{
+	int ret;
+
+	ctx->tsk = t;
+
+	ret = checkpoint_task_struct(ctx, t);
+	ckpt_debug("task %d\n", ret);
+	if (ret < 0)
+		goto out;
+
+	/* zombie - we're done here */
+	if (t->exit_state)
+		return 0;
+
+	ret = checkpoint_thread(ctx, t);
+	ckpt_debug("thread %d\n", ret);
+	if (ret < 0)
+		goto out;
+	ret = checkpoint_restart_block(ctx, t);
+	ckpt_debug("restart-blocks %d\n", ret);
+	if (ret < 0)
+		goto out;
+	ret = checkpoint_cpu(ctx, t);
+	ckpt_debug("cpu %d\n", ret);
+	if (ret < 0)
+		goto out;
+	ret = checkpoint_task_objs(ctx, t);
+	ckpt_debug("objs %d\n", ret);
+	if (ret < 0)
+		goto out;
+	ret = checkpoint_task_signal(ctx, t);
+	ckpt_debug("task-signal %d\n", ret);
+ out:
+	ctx->tsk = NULL;
+	return ret;
+}
+
+int ckpt_collect_task(struct ckpt_ctx *ctx, struct task_struct *t)
+{
+	int ret;
+
+	ret = ckpt_collect_ns(ctx, t);
+	if (ret < 0)
+		return ret;
+	ret = ckpt_collect_file_table(ctx, t);
+	if (ret < 0)
+		return ret;
+	ret = ckpt_collect_mm(ctx, t);
+	if (ret < 0)
+		return ret;
+	ret = ckpt_collect_fs(ctx, t);
+	if (ret < 0)
+		return ret;
+	ret = ckpt_collect_sighand(ctx, t);
+
+	return ret;
+}
+
+/***********************************************************************
+ * Restart
+ */
+
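+/*
+ * Exit codes follow the wait(2) encoding: either an 8-bit exit
+ * status in bits 8..15 (low byte clear), or a valid signal number
+ * in the low byte (all higher bits clear). Anything else is rejected.
+ */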
+static inline int valid_exit_code(int exit_code)
+{
+	if (exit_code >= 0x10000)
+		return 0;
+	if (exit_code & 0xff) {
+		if (exit_code & ~0xff)
+			return 0;
+		if (!valid_signal(exit_code & 0xff))
+			return 0;
+	}
+	return 1;
+}
+
+/* read the task_struct into the current task */
+static int restore_task_struct(struct ckpt_ctx *ctx)
+{
+	struct ckpt_hdr_task *h;
+	struct task_struct *t = current;
+	int ret;
+
+	h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_TASK);
+	if (IS_ERR(h))
+		return PTR_ERR(h);
+
+	ret = -EINVAL;
+	if (h->state == TASK_DEAD) {
+		if (h->exit_state != EXIT_ZOMBIE)
+			goto out;
+		if (!valid_exit_code(h->exit_code))
+			goto out;
+		t->exit_code = h->exit_code;
+	} else {
+		if (h->exit_code)
+			goto out;
+		if ((thread_group_leader(t) && !valid_signal(h->exit_signal)) ||
+		    (!thread_group_leader(t) && h->exit_signal != -1))
+			goto out;
+		if (!valid_signal(h->pdeath_signal))
+			goto out;
+
+		/* FIXME: restore remaining relevant task_struct fields */
+		t->exit_signal = h->exit_signal;
+		t->pdeath_signal = h->pdeath_signal;
+
+		t->set_child_tid =
+			(int __user *) (unsigned long) h->set_child_tid;
+		t->clear_child_tid =
+			(int __user *) (unsigned long) h->clear_child_tid;
+		restore_task_robust_futex_list(h);
+	}
+
+	memset(t->comm, 0, TASK_COMM_LEN);
+	ret = _ckpt_read_string(ctx, t->comm, TASK_COMM_LEN);
+	if (ret < 0)
+		goto out;
+
+	/* return 1 for zombie, 0 otherwise */
+	ret = (h->state == TASK_DEAD ? 1 : 0);
+ out:
+	ckpt_hdr_put(ctx, h);
+	return ret;
+}
+
+static int restore_task_ns(struct ckpt_ctx *ctx)
+{
+	struct ckpt_hdr_task_ns *h;
+	struct nsproxy *nsproxy;
+	int ret = 0;
+
+	h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_TASK_NS);
+	if (IS_ERR(h))
+		return PTR_ERR(h);
+
+	nsproxy = ckpt_obj_fetch(ctx, h->ns_objref, CKPT_OBJ_NS);
+	if (IS_ERR(nsproxy)) {
+		ret = PTR_ERR(nsproxy);
+		goto out;
+	}
+
+	if (nsproxy != task_nsproxy(current)) {
+		get_nsproxy(nsproxy);
+		switch_task_namespaces(current, nsproxy);
+	}
+ out:
+	ckpt_debug("nsproxy: ret %d (%p)\n", ret, task_nsproxy(current));
+	ckpt_hdr_put(ctx, h);
+	return ret;
+}
+
+static int restore_task_creds(struct ckpt_ctx *ctx)
+{
+	struct ckpt_hdr_task_creds *h;
+	struct cred *realcred, *ecred;
+	int ret = 0;
+
+	h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_TASK_CREDS);
+	if (IS_ERR(h))
+		return PTR_ERR(h);
+
+	realcred = ckpt_obj_fetch(ctx, h->cred_ref, CKPT_OBJ_CRED);
+	if (IS_ERR(realcred)) {
+		ckpt_debug("Error %ld fetching realcred (ref %d)\n",
+			PTR_ERR(realcred), h->cred_ref);
+		ret = PTR_ERR(realcred);
+		goto out;
+	}
+	ecred = ckpt_obj_fetch(ctx, h->ecred_ref, CKPT_OBJ_CRED);
+	if (IS_ERR(ecred)) {
+		ckpt_debug("Error %ld fetching ecred (ref %d)\n",
+			PTR_ERR(ecred), h->ecred_ref);
+		ret = PTR_ERR(ecred);
+		goto out;
+	}
+	ctx->realcred = realcred;
+	ctx->ecred = ecred;
+
+out:
+	ckpt_debug("Returning %d\n", ret);
+	ckpt_hdr_put(ctx, h);
+	return ret;
+}
+
+static int restore_task_objs(struct ckpt_ctx *ctx)
+{
+	struct ckpt_hdr_task_objs *h;
+	int ret;
+
+	/*
+	 * Namespaces come first, because ->mm depends on ->nsproxy,
+	 * and because shared objects are restored before they are
+	 * referenced. See comment in checkpoint_task_objs.
+	 */
+	ret = restore_task_creds(ctx);
+	if (ret < 0) {
+		ckpt_debug("restore_task_creds returned %d\n", ret);
+		return ret;
+	}
+	ret = restore_task_ns(ctx);
+	if (ret < 0) {
+		ckpt_debug("restore_task_ns returned %d\n", ret);
+		return ret;
+	}
+
+	h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_TASK_OBJS);
+	if (IS_ERR(h)) {
+		ckpt_debug("Error fetching task obj\n");
+		return PTR_ERR(h);
+	}
+
+	ret = restore_obj_file_table(ctx, h->files_objref);
+	ckpt_debug("file_table: ret %d (%p)\n", ret, current->files);
+	if (ret < 0)
+		goto out;
+
+	ret = restore_obj_mm(ctx, h->mm_objref);
+	ckpt_debug("mm: ret %d (%p)\n", ret, current->mm);
+	if (ret < 0)
+		goto out;
+
+	ret = restore_obj_fs(ctx, h->fs_objref);
+	ckpt_debug("fs: ret %d (%p)\n", ret, current->fs);
+	if (ret < 0)
+		goto out;
+
+	ret = restore_obj_sighand(ctx, h->sighand_objref);
+	ckpt_debug("sighand: ret %d (%p)\n", ret, current->sighand);
+	if (ret < 0)
+		goto out;
+
+	ret = restore_obj_signal(ctx, h->signal_objref);
+	ckpt_debug("signal: ret %d (%p)\n", ret, current->signal);
+ out:
+	ckpt_hdr_put(ctx, h);
+	return ret;
+}
+
+static int restore_creds(struct ckpt_ctx *ctx)
+{
+	int ret;
+	const struct cred *old;
+	struct cred *rcred, *ecred;
+
+	rcred = ctx->realcred;
+	ecred = ctx->ecred;
+
+	/* commit_creds will take one ref for the eff creds, but
+	 * expects us to hold a ref for the obj creds, so take a
+	 * ref here */
+	get_cred(rcred);
+	ret = commit_creds(rcred);
+	if (ret)
+		return ret;
+
+	if (ecred == rcred)
+		return 0;
+
+	old = override_creds(ecred); /* override_creds otoh takes new ref */
+	put_cred(old);
+
+	ctx->realcred = ctx->ecred = NULL;
+	return 0;
+}
+
+int restore_restart_block(struct ckpt_ctx *ctx)
+{
+	struct ckpt_hdr_restart_block *h;
+	struct restart_block restart_block;
+	struct timespec ts;
+	clockid_t clockid;
+	s64 expire;
+	int ret = 0;
+
+	h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_RESTART_BLOCK);
+	if (IS_ERR(h))
+		return PTR_ERR(h);
+
+	expire = ktime_to_ns(ctx->ktime_begin) + h->arg_4;
+	restart_block.fn = NULL;
+
+	ckpt_debug("restart_block: expire %lld begin %lld\n",
+		 expire, ktime_to_ns(ctx->ktime_begin));
+	ckpt_debug("restart_block: args %#llx %#llx %#llx %#llx %#llx\n",
+		 h->arg_0, h->arg_1, h->arg_2, h->arg_3, h->arg_4);
+
+	switch (h->function_type) {
+	case CKPT_RESTART_BLOCK_NONE:
+		restart_block.fn = do_no_restart_syscall;
+		break;
+	case CKPT_RESTART_BLOCK_HRTIMER_NANOSLEEP:
+		clockid = h->arg_0;
+		if (clockid < 0 || invalid_clockid(clockid))
+			break;
+		restart_block.fn = hrtimer_nanosleep_restart;
+		restart_block.nanosleep.index = clockid;
+		restart_block.nanosleep.rmtp =
+			(struct timespec __user *) (unsigned long) h->arg_1;
+		restart_block.nanosleep.expires = expire;
+		break;
+	case CKPT_RESTART_BLOCK_POSIX_CPU_NANOSLEEP:
+		clockid = h->arg_0;
+		if (clockid < 0 || invalid_clockid(clockid))
+			break;
+		restart_block.fn = posix_cpu_nsleep_restart;
+		restart_block.arg0 = clockid;
+		restart_block.arg1 = h->arg_1;
+		ts = ns_to_timespec(expire);
+		restart_block.arg2 = ts.tv_sec;
+		restart_block.arg3 = ts.tv_nsec;
+		break;
+#ifdef CONFIG_COMPAT
+	case CKPT_RESTART_BLOCK_COMPAT_NANOSLEEP:
+		clockid = h->arg_0;
+		if (clockid < 0 || invalid_clockid(clockid))
+			break;
+		restart_block.fn = compat_nanosleep_restart;
+		restart_block.nanosleep.index = clockid;
+		restart_block.nanosleep.rmtp =
+			(struct timespec __user *) (unsigned long) h->arg_1;
+		restart_block.nanosleep.compat_rmtp =
+			(struct compat_timespec __user *)
+				(unsigned long) h->arg_2;
+		restart_block.nanosleep.expires = expire;
+		break;
+	case CKPT_RESTART_BLOCK_COMPAT_CLOCK_NANOSLEEP:
+		clockid = h->arg_0;
+		if (clockid < 0 || invalid_clockid(clockid))
+			break;
+		restart_block.fn = compat_clock_nanosleep_restart;
+		restart_block.nanosleep.index = clockid;
+		restart_block.nanosleep.rmtp =
+			(struct timespec __user *) (unsigned long) h->arg_1;
+		restart_block.nanosleep.compat_rmtp =
+			(struct compat_timespec __user *)
+				(unsigned long) h->arg_2;
+		restart_block.nanosleep.expires = expire;
+		break;
+#endif
+	case CKPT_RESTART_BLOCK_FUTEX:
+		restart_block.fn = futex_wait_restart;
+		restart_block.futex.uaddr = (u32 *) (unsigned long) h->arg_0;
+		restart_block.futex.val = h->arg_1;
+		restart_block.futex.flags = h->arg_2;
+		restart_block.futex.bitset = h->arg_3;
+		restart_block.futex.time = expire;
+		break;
+	case CKPT_RESTART_BLOCK_POLL:
+		restart_block.fn = do_restart_poll;
+		restart_block.poll.ufds =
+			(struct pollfd __user *) (unsigned long) h->arg_0;
+		restart_block.poll.nfds = h->arg_1;
+		restart_block.poll.has_timeout = h->arg_2;
+		ts = ns_to_timespec(expire);
+		restart_block.poll.tv_sec = ts.tv_sec;
+		restart_block.poll.tv_nsec = ts.tv_nsec;
+		break;
+	default:
+		break;
+	}
+
+	if (restart_block.fn)
+		task_thread_info(current)->restart_block = restart_block;
+	else
+		ret = -EINVAL;
+
+	ckpt_hdr_put(ctx, h);
+	return ret;
+}
+
+static int restore_task_pgid(struct ckpt_ctx *ctx)
+{
+	struct task_struct *task = current;
+	struct pid *pgrp;
+	pid_t pgid;
+	int ret;
+
+	/*
+	 * We enforce the following restrictions on restoring pgrp:
+	 *  1) Only thread group leaders restore pgrp
+	 *  2) Session leader cannot change own pgrp
+	 *  3) Owner of pgrp must belong to same restart tree
+	 *  4) Must have same session as other tasks in same pgrp
+	 *  5) Change must pass setpgid security callback
+	 *
+	 * TODO - check whether we need additional restrictions
+	 */
+
+	if (!thread_group_leader(task))  /* (1) */
+		return 0;
+
+	pgid = ctx->pids_arr[ctx->active_pid].vpgid;
+
+	if (pgid == task_pgrp_vnr(task))  /* nothing to do */
+		return 0;
+
+	if (task->signal->leader)  /* (2) */
+		return -EINVAL;
+
+	ret = -EINVAL;
+
+	write_lock_irq(&tasklist_lock);
+	pgrp = _ckpt_find_pgrp(ctx, pgid);  /* (3) and (4) */
+	if (pgrp && task_pgrp(task) != pgrp) {
+		ret = security_task_setpgid(task, pgid);  /* (5) */
+		if (!ret)
+			change_pid(task, PIDTYPE_PGID, pgrp);
+	}
+	write_unlock_irq(&tasklist_lock);
+
+	/* self-restart: be tolerant if old pgid isn't found */
+	if (ctx->uflags & RESTART_TASKSELF)
+		ret = 0;
+
+	return ret;
+}
+
+/* prepare the task for restore */
+int pre_restore_task(void)
+{
+	sigset_t sigset;
+
+	/*
+	 * Block task's signals to avoid interruptions due to signals,
+	 * say, from restored timers, file descriptors etc. Signals
+	 * will be unblocked when restore completes.
+	 *
+	 * NOTE: tasks with file descriptors set to send a SIGKILL as
+	 * i/o notification may fail the restart if a signal occurs
+	 * before that task completes its restore. FIX?
+	 */
+	current->saved_sigmask = current->blocked;
+
+	sigfillset(&sigset);
+	sigdelset(&sigset, SIGKILL);
+	sigdelset(&sigset, SIGSTOP);
+	sigprocmask(SIG_SETMASK, &sigset, NULL);
+
+	return 0;
+}
+
+/* finish up task restore */
+void post_restore_task(void)
+{
+	/* only now is it safe to unblock the restored task's signals */
+	sigprocmask(SIG_SETMASK, &current->saved_sigmask, NULL);
+}
+
+/* read the entire state of the current task */
+int restore_task(struct ckpt_ctx *ctx)
+{
+	int ret;
+
+	ret = restore_task_struct(ctx);
+	ckpt_debug("task %d\n", ret);
+	if (ret < 0)
+		goto out;
+
+	/* zombie - we're done here */
+	if (ret)
+		goto out;
+
+	ret = restore_task_pgid(ctx);
+	if (ret < 0)
+		goto out;
+	ret = restore_thread(ctx);
+	ckpt_debug("thread %d\n", ret);
+	if (ret < 0)
+		goto out;
+	ret = restore_restart_block(ctx);
+	ckpt_debug("restart-blocks %d\n", ret);
+	if (ret < 0)
+		goto out;
+	ret = restore_cpu(ctx);
+	ckpt_debug("cpu %d\n", ret);
+	if (ret < 0)
+		goto out;
+	ret = restore_task_objs(ctx);
+	ckpt_debug("objs %d\n", ret);
+	if (ret < 0)
+		goto out;
+	ret = restore_creds(ctx);
+	ckpt_debug("creds: ret %d\n", ret);
+	if (ret < 0)
+		goto out;
+	ret = restore_task_signal(ctx);
+	ckpt_debug("signal: ret %d\n", ret);
+ out:
+	return ret;
+}
diff --git a/kernel/checkpoint/restart.c b/kernel/checkpoint/restart.c
new file mode 100644
index 0000000..0891952
--- /dev/null
+++ b/kernel/checkpoint/restart.c
@@ -0,0 +1,1423 @@
+/*
+ *  Restart logic and helpers
+ *
+ *  Copyright (C) 2008-2009 Oren Laadan
+ *
+ *  This file is subject to the terms and conditions of the GNU General Public
+ *  License.  See the file COPYING in the main directory of the Linux
+ *  distribution for more details.
+ */
+
+/* default debug level for output */
+#define CKPT_DFLAG  CKPT_DSYS
+
+#include <linux/version.h>
+#include <linux/sched.h>
+#include <linux/wait.h>
+#include <linux/file.h>
+#include <linux/ptrace.h>
+#include <linux/freezer.h>
+#include <linux/magic.h>
+#include <linux/utsname.h>
+#include <linux/termios.h>
+#include <asm/syscall.h>
+#include <linux/elf.h>
+#include <linux/deferqueue.h>
+#include <linux/checkpoint.h>
+#include <linux/checkpoint_hdr.h>
+
+#define RESTART_DBG_ROOT	(1 << 0)
+#define RESTART_DBG_GHOST	(1 << 1)
+#define RESTART_DBG_COORD	(1 << 2)
+#define RESTART_DBG_TASK	(1 << 3)
+#define RESTART_DBG_WAITING	(1 << 4)
+#define RESTART_DBG_RUNNING	(1 << 5)
+#define RESTART_DBG_EXITED	(1 << 6)
+#define RESTART_DBG_FAILED	(1 << 7)
+#define RESTART_DBG_SUCCESS	(1 << 8)
+
+#ifdef CONFIG_CHECKPOINT_DEBUG
+
+/*
+ * Track status of restarting tasks in a list off of checkpoint_ctx.
+ * Print this info when the checkpoint_ctx is freed. Sample output:
+ *
+ * [3519:2:c/r:debug_task_status:207] 3 tasks registered, nr_tasks was 0 nr_total 0
+ * [3519:2:c/r:debug_task_status:210] active pid was 1, ctx->errno 0
+ * [3519:2:c/r:debug_task_status:212] kflags 6 uflags 0 oflags 1
+ * [3519:2:c/r:debug_task_status:214] task 0 to run was 2
+ * [3519:2:c/r:debug_task_status:217] pid 3517  C  r
+ * [3519:2:c/r:debug_task_status:217] pid 3519  RN
+ * [3519:2:c/r:debug_task_status:217] pid 3520   G
+ */
+
+struct ckpt_task_status {
+	pid_t pid;
+	int flags;
+	int error;
+	struct list_head list;
+};
+
+static int restore_debug_task(struct ckpt_ctx *ctx, int flags)
+{
+	struct ckpt_task_status *s;
+
+	s = kmalloc(sizeof(*s), GFP_KERNEL);
+	if (!s) {
+		ckpt_debug("no memory to register ?!\n");
+		return -ENOMEM;
+	}
+	s->pid = current->pid;
+	s->error = 0;
+	s->flags = RESTART_DBG_WAITING | flags;
+	if (current == ctx->root_task)
+		s->flags |= RESTART_DBG_ROOT;
+
+	spin_lock(&ctx->lock);
+	list_add_tail(&s->list, &ctx->task_status);
+	spin_unlock(&ctx->lock);
+
+	return 0;
+}
+
+static struct ckpt_task_status *restore_debug_getme(struct ckpt_ctx *ctx)
+{
+	struct ckpt_task_status *s;
+
+	spin_lock(&ctx->lock);
+	list_for_each_entry(s, &ctx->task_status, list) {
+		if (s->pid == current->pid) {
+			spin_unlock(&ctx->lock);
+			return s;
+		}
+	}
+	spin_unlock(&ctx->lock);
+	return NULL;
+}
+
+static void restore_debug_error(struct ckpt_ctx *ctx, int err)
+{
+	struct ckpt_task_status *s = restore_debug_getme(ctx);
+
+	s->error = err;
+	s->flags &= ~RESTART_DBG_WAITING;
+	s->flags &= ~RESTART_DBG_RUNNING;
+	if (err)
+		s->flags |= RESTART_DBG_FAILED;
+	else
+		s->flags |= RESTART_DBG_SUCCESS;
+}
+
+static void restore_debug_running(struct ckpt_ctx *ctx)
+{
+	struct ckpt_task_status *s = restore_debug_getme(ctx);
+
+	s->flags &= ~RESTART_DBG_WAITING;
+	s->flags |= RESTART_DBG_RUNNING;
+}
+
+static void restore_debug_exit(struct ckpt_ctx *ctx)
+{
+	struct ckpt_task_status *s = restore_debug_getme(ctx);
+
+	s->flags &= ~RESTART_DBG_WAITING;
+	s->flags |= RESTART_DBG_EXITED;
+}
+
+void restore_debug_free(struct ckpt_ctx *ctx)
+{
+	struct ckpt_task_status *s, *p;
+	int i, count = 0;
+	char *which, *state;
+
+	/*
+	 * See how many tasks registered.  Tasks which didn't reach
+	 * sys_restart() won't have registered.  So if this count is
+	 * not the same as ctx->nr_total, that's a warning bell.
+	 */
+	list_for_each_entry(s, &ctx->task_status, list)
+		count++;
+	ckpt_debug("%d tasks registered, nr_tasks was %d nr_total %d\n",
+		   count, ctx->nr_tasks, atomic_read(&ctx->nr_total));
+
+	ckpt_debug("active pid was %d, ctx->errno %d\n", ctx->active_pid,
+		   ctx->errno);
+	ckpt_debug("kflags %lu uflags %lu oflags %lu", ctx->kflags,
+		   ctx->uflags, ctx->oflags);
+	for (i = 0; i < ctx->nr_pids; i++)
+		ckpt_debug("task[%d] to run %d\n", i, ctx->pids_arr[i].vpid);
+
+	list_for_each_entry_safe(s, p, &ctx->task_status, list) {
+		if (s->flags & RESTART_DBG_COORD)
+			which = "Coord";
+		else if (s->flags & RESTART_DBG_ROOT)
+			which = "Root";
+		else if (s->flags & RESTART_DBG_GHOST)
+			which = "Ghost";
+		else if (s->flags & RESTART_DBG_TASK)
+			which = "Task";
+		else
+			which = "?????";
+		if (s->flags & RESTART_DBG_WAITING)
+			state = "Waiting";
+		else if (s->flags & RESTART_DBG_RUNNING)
+			state = "Running";
+		else if (s->flags & RESTART_DBG_FAILED)
+			state = "Failed";
+		else if (s->flags & RESTART_DBG_SUCCESS)
+			state = "Success";
+		else if (s->flags & RESTART_DBG_EXITED)
+			state = "Exited";
+		else
+			state = "??????";
+		ckpt_debug("pid %d type %s state %s\n", s->pid, which, state);
+		list_del(&s->list);
+		kfree(s);
+	}
+}
+
+#else
+
+static inline int restore_debug_task(struct ckpt_ctx *ctx, int flags)
+{
+	return 0;
+}
+static inline void restore_debug_error(struct ckpt_ctx *ctx, int err) {}
+static inline void restore_debug_running(struct ckpt_ctx *ctx) {}
+static inline void restore_debug_exit(struct ckpt_ctx *ctx) {}
+
+#endif /* CONFIG_CHECKPOINT_DEBUG */
+
+
+static int _ckpt_read_err(struct ckpt_ctx *ctx, struct ckpt_hdr *h)
+{
+	char *ptr;
+	int len, ret;
+
+	len = h->len - sizeof(*h);
+	ptr = kzalloc(len + 1, GFP_KERNEL);
+	if (!ptr) {
+		ckpt_debug("insufficient memory to report image error\n");
+		return -ENOMEM;
+	}
+
+	ret = ckpt_kread(ctx, ptr, len);
+	if (ret >= 0) {
+		ckpt_debug("%s\n", &ptr[1]);
+		ret = -EIO;
+	}
+
+	kfree(ptr);
+	return ret;
+}
+
+/**
+ * _ckpt_read_objref - dispatch handling of a shared object
+ * @ctx: checkpoint context
+ * @hh: object descriptor
+ */
+static int _ckpt_read_objref(struct ckpt_ctx *ctx, struct ckpt_hdr *hh)
+{
+	struct ckpt_hdr *h;
+	int ret;
+
+	h = ckpt_hdr_get(ctx, hh->len);
+	if (!h)
+		return -ENOMEM;
+
+	*h = *hh;	/* yay ! */
+
+	_ckpt_debug(CKPT_DOBJ, "shared len %d type %d\n", h->len, h->type);
+	ret = ckpt_kread(ctx, (h + 1), hh->len - sizeof(struct ckpt_hdr));
+	if (ret < 0)
+		goto out;
+
+	ret = restore_obj(ctx, (struct ckpt_hdr_objref *) h);
+ out:
+	ckpt_hdr_put(ctx, h);
+	return ret;
+}
+
+/**
+ * ckpt_read_obj_dispatch - dispatch ERRORs and OBJREFs; don't return them
+ * @ctx: checkpoint context
+ * @h: desired ckpt_hdr
+ */
+static int ckpt_read_obj_dispatch(struct ckpt_ctx *ctx, struct ckpt_hdr *h)
+{
+	int ret;
+
+	while (1) {
+		ret = ckpt_kread(ctx, h, sizeof(*h));
+		if (ret < 0)
+			return ret;
+		_ckpt_debug(CKPT_DRW, "type %d len %d\n", h->type, h->len);
+		if (h->len < sizeof(*h))
+			return -EINVAL;
+
+		if (h->type == CKPT_HDR_ERROR) {
+			ret = _ckpt_read_err(ctx, h);
+			if (ret < 0)
+				return ret;
+		} else if (h->type == CKPT_HDR_OBJREF) {
+			ret = _ckpt_read_objref(ctx, h);
+			if (ret < 0)
+				return ret;
+		} else
+			return 0;
+	}
+}
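+
+/*
+ * Thus the image stream may interleave CKPT_HDR_ERROR and
+ * CKPT_HDR_OBJREF records between regular objects; the dispatch loop
+ * above consumes them, so its callers only ever see the header of
+ * the next "real" object.
+ */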
+
+/**
+ * _ckpt_read_obj - read an object (ckpt_hdr followed by payload)
+ * @ctx: checkpoint context
+ * @h: desired ckpt_hdr
+ * @ptr: desired buffer
+ * @len: desired object length (if 0, flexible)
+ * @max: maximum object length (if 0, flexible)
+ *
+ * If @ptr is NULL, then read only the header (payload to follow)
+ */
+static int _ckpt_read_obj(struct ckpt_ctx *ctx, struct ckpt_hdr *h,
+			  void *ptr, int len, int max)
+{
+	int ret;
+
+	ret = ckpt_read_obj_dispatch(ctx, h);
+	if (ret < 0)
+		return ret;
+	_ckpt_debug(CKPT_DRW, "type %d len %d(%d,%d)\n",
+		    h->type, h->len, len, max);
+
+	/* if len specified, enforce, else if maximum specified, enforce */
+	if ((len && h->len != len) || (!len && max && h->len > max))
+		return -EINVAL;
+
+	if (ptr)
+		ret = ckpt_kread(ctx, ptr, h->len - sizeof(struct ckpt_hdr));
+	return ret;
+}
+
+/**
+ * _ckpt_read_obj_type - read an object of some type
+ * @ctx: checkpoint context
+ * @ptr: provided buffer
+ * @len: buffer length
+ * @type: buffer type
+ *
+ * If @ptr is NULL, then read only the header (payload to follow).
+ * @len specifies the expected buffer length (ignored if set to 0).
+ * Returns: actual _payload_ length
+ */
+int _ckpt_read_obj_type(struct ckpt_ctx *ctx, void *ptr, int len, int type)
+{
+	struct ckpt_hdr h;
+	int ret;
+
+	if (len)
+		len += sizeof(struct ckpt_hdr);
+	ret = _ckpt_read_obj(ctx, &h, ptr, len, len);
+	if (ret < 0)
+		return ret;
+	if (h.type != type)
+		return -EINVAL;
+	return h.len - sizeof(h);
+}
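+
+/*
+ * Example (sketch): the helpers below read a fixed-size payload
+ * straight into a caller-supplied buffer, e.g.:
+ *
+ *	ret = _ckpt_read_obj_type(ctx, ptr, len, CKPT_HDR_BUFFER);
+ */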
+
+/**
+ * _ckpt_read_buffer - read an object of type buffer (set length)
+ * @ctx: checkpoint context
+ * @ptr: provided buffer
+ * @len: buffer length
+ *
+ * If @ptr is NULL, then read only the header (payload to follow).
+ * @len specifies the expected buffer length (ignored if set to 0).
+ * Returns: _payload_ length.
+ */
+int _ckpt_read_buffer(struct ckpt_ctx *ctx, void *ptr, int len)
+{
+	BUG_ON(!len);
+	return _ckpt_read_obj_type(ctx, ptr, len, CKPT_HDR_BUFFER);
+}
+
+/**
+ * _ckpt_read_string - read an object of type string (set length)
+ * @ctx: checkpoint context
+ * @ptr: provided buffer
+ * @len: string length (including '\0')
+ *
+ * If @ptr is NULL, then read only the header (payload to follow)
+ */
+int _ckpt_read_string(struct ckpt_ctx *ctx, void *ptr, int len)
+{
+	int ret;
+
+	BUG_ON(!len);
+	ret = _ckpt_read_obj_type(ctx, ptr, len, CKPT_HDR_STRING);
+	if (ret < 0)
+		return ret;
+	if (ptr)
+		((char *) ptr)[len - 1] = '\0';	/* always play it safe */
+	return 0;
+}
+
+/**
+ * ckpt_read_obj - allocate and read an object (ckpt_hdr followed by payload)
+ * @ctx: checkpoint context
+ * @len: desired total length (if 0, flexible)
+ * @max: maximum total length (if 0, unlimited)
+ *
+ * Return: new buffer allocated on success, error pointer otherwise
+ */
+static void *ckpt_read_obj(struct ckpt_ctx *ctx, int len, int max)
+{
+	struct ckpt_hdr hh;
+	struct ckpt_hdr *h;
+	int ret;
+
+	ret = ckpt_read_obj_dispatch(ctx, &hh);
+	if (ret < 0)
+		return ERR_PTR(ret);
+	_ckpt_debug(CKPT_DRW, "type %d len %d(%d,%d)\n",
+		    hh.type, hh.len, len, max);
+
+	/* if len specified, enforce, else if maximum specified, enforce */
+	if ((len && hh.len != len) || (!len && max && hh.len > max))
+		return ERR_PTR(-EINVAL);
+
+	h = ckpt_hdr_get(ctx, hh.len);
+	if (!h)
+		return ERR_PTR(-ENOMEM);
+
+	*h = hh;	/* yay ! */
+
+	ret = ckpt_kread(ctx, (h + 1), hh.len - sizeof(struct ckpt_hdr));
+	if (ret < 0) {
+		ckpt_hdr_put(ctx, h);
+		h = ERR_PTR(ret);
+	}
+
+	return h;
+}
+
+/**
+ * ckpt_read_obj_type - allocate and read an object of some type
+ * @ctx: checkpoint context
+ * @len: desired object length
+ * @type: desired object type
+ *
+ * Return: new buffer allocated on success, error pointer otherwise
+ */
+void *ckpt_read_obj_type(struct ckpt_ctx *ctx, int len, int type)
+{
+	struct ckpt_hdr *h;
+
+	BUG_ON(!len);
+
+	h = ckpt_read_obj(ctx, len, len);
+	if (IS_ERR(h)) {
+		ckpt_err(ctx, PTR_ERR(h), "Expecting to read type %d\n", type);
+		return h;
+	}
+
+	if (h->type != type) {
+		ckpt_err(ctx, -EINVAL, "Expected type %d but got %d\n",
+			 type, h->type);
+		ckpt_hdr_put(ctx, h);
+		h = ERR_PTR(-EINVAL);
+	}
+
+	return h;
+}
+
+/**
+ * ckpt_read_buf_type - allocate and read an object of some type (flexible)
+ * @ctx: checkpoint context
+ * @max: maximum payload length
+ * @type: desired object type
+ *
+ * This differs from ckpt_read_obj_type() in that the length of the
+ * incoming object is flexible (up to the maximum specified by @max;
+ * unlimited if @max is 0), as determined by the ckpt_hdr data.
+ *
+ * NOTE: for symmetry with checkpoint, @max is the maximum _payload_
+ * size, excluding the header.
+ *
+ * Return: new buffer allocated on success, error pointer otherwise
+ */
+void *ckpt_read_buf_type(struct ckpt_ctx *ctx, int max, int type)
+{
+	struct ckpt_hdr *h;
+
+	if (max)
+		max += sizeof(struct ckpt_hdr);
+
+	h = ckpt_read_obj(ctx, 0, max);
+	if (IS_ERR(h))
+		return h;
+
+	if (h->type != type) {
+		ckpt_hdr_put(ctx, h);
+		h = ERR_PTR(-EINVAL);
+	}
+
+	return h;
+}
+
+/**
+ * ckpt_read_payload - allocate and read the payload of an object
+ * @ctx: checkpoint context
+ * @max: maximum payload length
+ * @ptr: pointer to buffer to be allocated (caller must free)
+ * @type: desired object type
+ *
+ * This can be used to read a variable-length _payload_ from the checkpoint
+ * stream. @max limits the size of the resulting buffer.
+ *
+ * Return: actual _payload_ length
+ */
+int ckpt_read_payload(struct ckpt_ctx *ctx, void **ptr, int max, int type)
+{
+	int len, ret;
+
+	len = _ckpt_read_obj_type(ctx, NULL, 0, type);
+	if (len < 0)
+		return len;
+	else if (len > max)
+		return -EINVAL;
+
+	*ptr = kmalloc(len, GFP_KERNEL);
+	if (!*ptr)
+		return -ENOMEM;
+
+	ret = ckpt_kread(ctx, *ptr, len);
+	if (ret < 0) {
+		kfree(*ptr);
+		return ret;
+	}
+
+	return len;
+}
+
+/**
+ * ckpt_read_string - allocate and read a string (variable length)
+ * @ctx: checkpoint context
+ * @max: maximum acceptable length
+ *
+ * Return: allocated string or error pointer
+ */
+char *ckpt_read_string(struct ckpt_ctx *ctx, int max)
+{
+	char *str;
+	int len;
+
+	len = ckpt_read_payload(ctx, (void **)&str, max, CKPT_HDR_STRING);
+	if (len < 0)
+		return ERR_PTR(len);
+	str[len - 1] = '\0';  	/* always play it safe */
+	return str;
+}
+
+/**
+ * ckpt_read_consume - consume the next object of expected type
+ * @ctx: checkpoint context
+ * @len: desired object length
+ * @type: desired object type
+ *
+ * This can be used to skip an object in the input stream when the
+ * data is unnecessary for the restart. @len indicates the expected
+ * length of the object; if @len is zero, the length is unconstrained.
+ */
+int ckpt_read_consume(struct ckpt_ctx *ctx, int len, int type)
+{
+	struct ckpt_hdr *h;
+	int ret = 0;
+
+	h = ckpt_read_obj(ctx, len, 0);
+	if (IS_ERR(h))
+		return PTR_ERR(h);
+
+	if (h->type != type)
+		ret = -EINVAL;
+
+	ckpt_hdr_put(ctx, h);
+	return ret;
+}
+
+/***********************************************************************
+ * Restart
+ */
+
+static int check_kernel_const(struct ckpt_const *h)
+{
+	struct task_struct *tsk;
+	struct new_utsname *uts;
+
+	/* task */
+	if (h->task_comm_len != sizeof(tsk->comm))
+		return -EINVAL;
+	/* mm->saved_auxv size */
+	if (h->at_vector_size != AT_VECTOR_SIZE)
+		return -EINVAL;
+	/* signal */
+	if (h->signal_nsig != _NSIG)
+		return -EINVAL;
+	/* uts */
+	if (h->uts_sysname_len != sizeof(uts->sysname))
+		return -EINVAL;
+	if (h->uts_nodename_len != sizeof(uts->nodename))
+		return -EINVAL;
+	if (h->uts_release_len != sizeof(uts->release))
+		return -EINVAL;
+	if (h->uts_version_len != sizeof(uts->version))
+		return -EINVAL;
+	if (h->uts_machine_len != sizeof(uts->machine))
+		return -EINVAL;
+	if (h->uts_domainname_len != sizeof(uts->domainname))
+		return -EINVAL;
+	/* rlimit */
+	if (h->rlimit_nlimits != RLIM_NLIMITS)
+		return -EINVAL;
+	/* tty */
+	if (h->n_tty_buf_size != N_TTY_BUF_SIZE)
+		return -EINVAL;
+	if (h->tty_termios_ncc != NCC)
+		return -EINVAL;
+
+	return 0;
+}
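+
+/*
+ * These constants were recorded in the image header at checkpoint
+ * time; a mismatch means the image came from a kernel built with
+ * incompatible sizes, so refuse to restore rather than misinterpret
+ * the stream.
+ */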
+
+/* read the checkpoint header */
+static int restore_read_header(struct ckpt_ctx *ctx)
+{
+	struct ckpt_hdr_header *h;
+	struct new_utsname *uts = NULL;
+	int ret;
+
+	h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_HEADER);
+	if (IS_ERR(h))
+		return PTR_ERR(h);
+
+	ret = -EINVAL;
+	if (le16_to_cpu(h->arch_id) != CKPT_ARCH_ID) {
+		ckpt_err(ctx, ret, "incompatible architecture id");
+		goto out;
+	}
+	if (h->magic != CHECKPOINT_MAGIC_HEAD ||
+	    h->rev != CHECKPOINT_VERSION ||
+	    h->major != ((LINUX_VERSION_CODE >> 16) & 0xff) ||
+	    h->minor != ((LINUX_VERSION_CODE >> 8) & 0xff) ||
+	    h->patch != ((LINUX_VERSION_CODE) & 0xff)) {
+		ckpt_err(ctx, ret, "incompatible kernel version");
+		goto out;
+	}
+	if (h->uflags & ~CHECKPOINT_USER_FLAGS) {
+		ckpt_err(ctx, ret, "incompatible restart user flags");
+		goto out;
+	}
+
+	ret = check_kernel_const(&h->constants);
+	if (ret < 0) {
+		ckpt_err(ctx, ret, "incompatible kernel constants");
+		goto out;
+	}
+
+	ret = -ENOMEM;
+	uts = kmalloc(sizeof(*uts), GFP_KERNEL);
+	if (!uts)
+		goto out;
+
+	ctx->oflags = h->uflags;
+
+	/* FIX: verify compatibility of release, version and machine */
+	ret = _ckpt_read_buffer(ctx, uts->release, sizeof(uts->release));
+	if (ret < 0)
+		goto out;
+	ret = _ckpt_read_buffer(ctx, uts->version, sizeof(uts->version));
+	if (ret < 0)
+		goto out;
+	ret = _ckpt_read_buffer(ctx, uts->machine, sizeof(uts->machine));
+	if (ret < 0)
+		goto out;
+
+	ret = restore_read_header_arch(ctx);
+ out:
+	kfree(uts);
+	ckpt_hdr_put(ctx, h);
+	return ret;
+}
+
+/* read the LSM configuration section */
+static int restore_lsm(struct ckpt_ctx *ctx)
+{
+	int ret;
+	char *cur_lsm = security_get_lsm_name();
+
+	ret = _ckpt_read_buffer(ctx, ctx->lsm_name,
+				CHECKPOINT_LSM_NAME_MAX + 1);
+	if (ret < 0) {
+		ckpt_debug("Error %d reading lsm name\n", ret);
+		return ret;
+	}
+
+	if (!(ctx->uflags & RESTART_KEEP_LSM))
+		goto skip_lsm;
+
+	if (strncmp(cur_lsm, ctx->lsm_name, CHECKPOINT_LSM_NAME_MAX + 1) != 0) {
+		ckpt_debug("c/r: checkpointed LSM %s, current is %s.\n",
+			ctx->lsm_name, cur_lsm);
+		return -EPERM;
+	}
+
+	if (strcmp(ctx->lsm_name, "lsm_none") != 0 &&
+			strcmp(ctx->lsm_name, "smack") != 0 &&
+			strcmp(ctx->lsm_name, "selinux") != 0 &&
+			strcmp(ctx->lsm_name, "default") != 0) {
+		ckpt_debug("c/r: RESTART_KEEP_LSM unsupported for %s\n",
+				ctx->lsm_name);
+		return -ENOSYS;
+	}
+
+skip_lsm:
+	ret = security_may_restart(ctx);
+	if (ret < 0)
+		ckpt_debug("security_may_restart returned %d\n", ret);
+	return ret;
+}
+
+/* read the container configuration section */
+static int restore_container(struct ckpt_ctx *ctx)
+{
+	int ret = 0;
+	struct ckpt_hdr_container *h;
+
+	h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_CONTAINER);
+	if (IS_ERR(h))
+		return PTR_ERR(h);
+	ckpt_hdr_put(ctx, h);
+
+	/* read the LSM name and info which follow (and are a part of)
+	 * the ckpt_hdr_container */
+	ret = restore_lsm(ctx);
+	if (ret < 0)
+		ckpt_debug("Error %d on LSM configuration\n", ret);
+	return ret;
+}
+
+/* read the checkpoint trailer */
+static int restore_read_tail(struct ckpt_ctx *ctx)
+{
+	struct ckpt_hdr_tail *h;
+	int ret = 0;
+
+	h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_TAIL);
+	if (IS_ERR(h))
+		return PTR_ERR(h);
+
+	if (h->magic != CHECKPOINT_MAGIC_TAIL)
+		ret = -EINVAL;
+
+	ckpt_hdr_put(ctx, h);
+	return ret;
+}
+
+/* restore_read_tree - read the tasks tree into the checkpoint context */
+static int restore_read_tree(struct ckpt_ctx *ctx)
+{
+	struct ckpt_hdr_tree *h;
+	int size, ret;
+
+	h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_TREE);
+	if (IS_ERR(h))
+		return PTR_ERR(h);
+
+	ret = -EINVAL;
+	if (h->nr_tasks <= 0)
+		goto out;
+
+	ctx->nr_pids = h->nr_tasks;
+	size = sizeof(*ctx->pids_arr) * ctx->nr_pids;
+	if (size <= 0)		/* overflow ? */
+		goto out;
+
+	ctx->pids_arr = kmalloc(size, GFP_KERNEL);
+	if (!ctx->pids_arr) {
+		ret = -ENOMEM;
+		goto out;
+	}
+	ret = _ckpt_read_buffer(ctx, ctx->pids_arr, size);
+ out:
+	ckpt_hdr_put(ctx, h);
+	return ret;
+}
+
+static inline int all_tasks_activated(struct ckpt_ctx *ctx)
+{
+	return (ctx->active_pid == ctx->nr_pids);
+}
+
+static inline pid_t get_active_pid(struct ckpt_ctx *ctx)
+{
+	int active = ctx->active_pid;
+	return (active >= 0 && active < ctx->nr_pids) ?
+		ctx->pids_arr[active].vpid : 0;
+}
+
+static inline int is_task_active(struct ckpt_ctx *ctx, pid_t pid)
+{
+	return get_active_pid(ctx) == pid;
+}
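+
+/*
+ * Restore proceeds serially: each task sleeps in wait_task_active()
+ * until ctx->active_pid indexes its own vpid, restores its state,
+ * then calls restore_activate_next() to advance active_pid and wake
+ * the next task in line (see do_restore_task() below).
+ */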
+
+/*
+ * If exiting a restart with error, then wake up all other tasks
+ * in the restart context.
+ */
+void restore_notify_error(struct ckpt_ctx *ctx)
+{
+	complete(&ctx->complete);
+	wake_up_all(&ctx->waitq);
+	wake_up_all(&ctx->ghostq);
+}
+
+static inline struct ckpt_ctx *get_task_ctx(struct task_struct *task)
+{
+	struct ckpt_ctx *ctx;
+
+	task_lock(task);
+	ctx = ckpt_ctx_get(task->checkpoint_ctx);
+	task_unlock(task);
+	return ctx;
+}
+
+/* returns 0 on success, 1 otherwise */
+static int set_task_ctx(struct task_struct *task, struct ckpt_ctx *ctx)
+{
+	int ret;
+
+	task_lock(task);
+	if (!task->checkpoint_ctx) {
+		task->checkpoint_ctx = ckpt_ctx_get(ctx);
+		ret = 0;
+	} else {
+		ckpt_debug("task %d has checkpoint_ctx\n", task_pid_vnr(task));
+		ret = 1;
+	}
+	task_unlock(task);
+	return ret;
+}
+
+static void clear_task_ctx(struct task_struct *task)
+{
+	struct ckpt_ctx *old;
+
+	task_lock(task);
+	old = task->checkpoint_ctx;
+	task->checkpoint_ctx = NULL;
+	task_unlock(task);
+
+	ckpt_debug("task %d clear checkpoint_ctx\n", task_pid_vnr(task));
+	ckpt_ctx_put(old);
+}
+
+static void restore_task_done(struct ckpt_ctx *ctx)
+{
+	if (atomic_dec_and_test(&ctx->nr_total))
+		complete(&ctx->complete);
+	BUG_ON(atomic_read(&ctx->nr_total) < 0);
+}
+
+static int restore_activate_next(struct ckpt_ctx *ctx)
+{
+	struct task_struct *task;
+	pid_t pid;
+
+	ctx->active_pid++;
+
+	BUG_ON(ctx->active_pid > ctx->nr_pids);
+
+	if (!all_tasks_activated(ctx)) {
+		/* wake up next task in line to restore its state */
+		pid = get_active_pid(ctx);
+
+		rcu_read_lock();
+		task = find_task_by_pid_ns(pid, ctx->root_nsproxy->pid_ns);
+		/* target task must have same restart context */
+		if (task && task->checkpoint_ctx == ctx)
+			wake_up_process(task);
+		else
+			task = NULL;
+		rcu_read_unlock();
+
+		if (!task) {
+			ckpt_err(ctx, -ESRCH, "task %d not found\n", pid);
+			return -ESRCH;
+		}
+	} else {
+		/* wake up ghost tasks so that they can terminate */
+		wake_up_all(&ctx->ghostq);
+	}
+
+	return 0;
+}
+
+static int wait_task_active(struct ckpt_ctx *ctx)
+{
+	pid_t pid = task_pid_vnr(current);
+	int ret;
+
+	ckpt_debug("pid %d waiting\n", pid);
+	ret = wait_event_interruptible(ctx->waitq,
+				       is_task_active(ctx, pid) ||
+				       ckpt_test_error(ctx));
+	ckpt_debug("active %d < %d (ret %d, errno %d)\n",
+		   ctx->active_pid, ctx->nr_pids, ret, ctx->errno);
+	if (ckpt_test_error(ctx))
+		return ckpt_get_error(ctx);
+	return 0;
+}
+
+static int wait_task_sync(struct ckpt_ctx *ctx)
+{
+	ckpt_debug("pid %d syncing\n", task_pid_vnr(current));
+	wait_event_interruptible(ctx->waitq, ckpt_test_complete(ctx));
+	ckpt_debug("task sync done (errno %d)\n", ctx->errno);
+	if (ckpt_test_error(ctx))
+		return ckpt_get_error(ctx);
+	return 0;
+}
+
+/* grabs a reference to the @ctx on success; caller should free */
+static struct ckpt_ctx *wait_checkpoint_ctx(void)
+{
+	DECLARE_WAIT_QUEUE_HEAD_ONSTACK(waitq);
+	struct ckpt_ctx *ctx;
+	int ret;
+
+	/*
+	 * Wait for the coordinator to become visible, then grab a
+	 * reference to its restart context. Nobody wakes this private
+	 * waitq; __prepare_descendants() wakes us directly with
+	 * wake_up_process(), after which the condition is re-tested.
+	 */
+	ret = wait_event_interruptible(waitq, current->checkpoint_ctx);
+	if (ret < 0) {
+		ckpt_debug("wait_checkpoint_ctx: failed (%d)\n", ret);
+		return ERR_PTR(ret);
+	}
+
+	ctx = get_task_ctx(current);
+	if (!ctx) {
+		ckpt_debug("wait_checkpoint_ctx: checkpoint_ctx missing\n");
+		return ERR_PTR(-EAGAIN);
+	}
+
+	return ctx;
+}
+
+static int do_ghost_task(void)
+{
+	struct ckpt_ctx *ctx;
+	int ret;
+
+	ctx = wait_checkpoint_ctx();
+	if (IS_ERR(ctx))
+		return PTR_ERR(ctx);
+
+	ret = restore_debug_task(ctx, RESTART_DBG_GHOST);
+	if (ret < 0)
+		goto out;
+
+	current->flags |= PF_RESTARTING;
+	restore_debug_running(ctx);
+
+	ret = wait_event_interruptible(ctx->ghostq,
+				       all_tasks_activated(ctx) ||
+				       ckpt_test_error(ctx));
+ out:
+	restore_debug_error(ctx, ret);
+	if (ret < 0)
+		ckpt_err(ctx, ret, "ghost restart failed\n");
+
+	current->exit_signal = -1;
+	restore_debug_exit(ctx);
+	ckpt_ctx_put(ctx);
+	do_exit(0);
+
+	/* NOT REACHED */
+}
+
+/*
+ * Ensure that all members of a thread group are in sys_restart before
+ * restoring any of them. Otherwise, restore may modify shared state
+ * and crash or fault a thread still in userspace.
+ */
+static int wait_sync_threads(void)
+{
+	struct task_struct *p = current;
+	atomic_t *count;
+	int nr = 0;
+	int ret = 0;
+
+	if (thread_group_empty(p))
+		return 0;
+
+	count = &p->signal->restart_count;
+
+	if (!atomic_read(count)) {
+		read_lock(&tasklist_lock);
+		for (p = next_thread(p); p != current; p = next_thread(p))
+			nr++;
+		read_unlock(&tasklist_lock);
+		/*
+		 * Testing that @count is 0 makes it unlikely that
+		 * multiple threads get here. But if they do, then
+		 * only one will succeed in initializing @count.
+		 */
+		atomic_cmpxchg(count, 0, nr + 1);
+	}
+
+	if (atomic_dec_and_test(count)) {
+		read_lock(&tasklist_lock);
+		for (p = next_thread(p); p != current; p = next_thread(p))
+			wake_up_process(p);
+		read_unlock(&tasklist_lock);
+	} else {
+		DECLARE_WAIT_QUEUE_HEAD_ONSTACK(waitq);
+		ret = wait_event_interruptible(waitq, !atomic_read(count));
+	}
+
+	return ret;
+}
+
+static int do_restore_task(void)
+{
+	struct ckpt_ctx *ctx;
+	int zombie, ret;
+
+	ctx = wait_checkpoint_ctx();
+	if (IS_ERR(ctx))
+		return PTR_ERR(ctx);
+
+	ret = restore_debug_task(ctx, RESTART_DBG_TASK);
+	if (ret < 0)
+		goto out;
+
+	current->flags |= PF_RESTARTING;
+
+	ret = wait_sync_threads();
+	if (ret < 0)
+		goto out;
+
+	/* wait for our turn, do the restore, and tell next task in line */
+	ret = wait_task_active(ctx);
+	if (ret < 0)
+		goto out;
+
+	restore_debug_running(ctx);
+
+	ret = pre_restore_task();
+	if (ret < 0)
+		goto out;
+
+	zombie = restore_task(ctx);
+	if (zombie < 0) {
+		ret = zombie;
+		goto out;
+	}
+
+	ret = restore_activate_next(ctx);
+	if (ret < 0)
+		goto out;
+
+	/*
+	 * zombie: we're done here; do_exit() will notice the @ctx on
+	 * our current->checkpoint_ctx (and our PF_RESTARTING), will
+	 * call restore_task_done() and release the @ctx. This ensures
+	 * that we only report done after we really become zombie.
+	 */
+	if (zombie) {
+		restore_debug_exit(ctx);
+		post_restore_task();
+		ckpt_ctx_put(ctx);
+		do_exit(current->exit_code);
+	}
+
+	restore_task_done(ctx);
+	ret = wait_task_sync(ctx);
+ out:
+	restore_debug_error(ctx, ret);
+	if (ret < 0)
+		ckpt_err(ctx, ret, "task restart failed\n");
+
+	post_restore_task();
+	current->flags &= ~PF_RESTARTING;
+	clear_task_ctx(current);
+	ckpt_ctx_put(ctx);
+	return ret;
+}
+
+/**
+ * __prepare_descendants - set ->checkpoint_ctx of a descendant
+ * @task: descendant task
+ * @data: points to the checkpoint ctx
+ */
+static int __prepare_descendants(struct task_struct *task, void *data)
+{
+	struct ckpt_ctx *ctx = (struct ckpt_ctx *) data;
+
+	ckpt_debug("consider task %d\n", task_pid_vnr(task));
+
+	if (!ptrace_may_access(task, PTRACE_MODE_ATTACH)) {
+		ckpt_debug("stranger task %d\n", task_pid_vnr(task));
+		return -EPERM;
+	}
+
+	if (task_ptrace(task) & PT_PTRACED) {
+		ckpt_debug("ptraced task %d\n", task_pid_vnr(task));
+		return -EBUSY;
+	}
+
+	/*
+	 * Set task->checkpoint_ctx of all non-zombie descendants.
+	 * If a descendant already has a ->checkpoint_ctx, it
+	 * must be a coordinator (for a different restart ?) so
+	 * we fail.
+	 *
+	 * Note that own ancestors cannot interfere since they
+	 * won't descend past us, as own ->checkpoint_ctx must
+	 * already be set.
+	 */
+	if (!task->exit_state) {
+		if (set_task_ctx(task, ctx))
+			return -EBUSY;
+		ckpt_debug("prepare task %d\n", task_pid_vnr(task));
+		wake_up_process(task);
+		return 1;
+	}
+
+	return 0;
+}
+
+/**
+ * prepare_descendants - set ->checkpoint_ctx of all descendants
+ * @ctx: checkpoint context
+ * @root: root process for restart
+ *
+ * Called by the coordinator to set the ->checkpoint_ctx pointer of the
+ * root task and all its descendants.
+ */
+static int prepare_descendants(struct ckpt_ctx *ctx, struct task_struct *root)
+{
+	int nr_pids;
+
+	nr_pids = walk_task_subtree(root, __prepare_descendants, ctx);
+	ckpt_debug("nr %d/%d\n", ctx->nr_pids, nr_pids);
+	if (nr_pids < 0)
+		return nr_pids;
+
+	/*
+	 * The actual task count may exceed ctx->nr_pids because of
+	 * 'dead' tasks used as placeholders for PGIDs, but it must
+	 * not fall short.
+	 */
+	if (nr_pids < ctx->nr_pids)
+		return -ESRCH;
+
+	atomic_set(&ctx->nr_total, nr_pids);
+	return nr_pids;
+}
+
+static int wait_all_tasks_finish(struct ckpt_ctx *ctx)
+{
+	int ret;
+
+	BUG_ON(ctx->active_pid != -1);
+	ret = restore_activate_next(ctx);
+	if (ret < 0)
+		return ret;
+
+	ret = wait_for_completion_interruptible(&ctx->complete);
+	ckpt_debug("final sync kflags %#lx (ret %d)\n", ctx->kflags, ret);
+
+	return ret;
+}
+
+static struct task_struct *choose_root_task(struct ckpt_ctx *ctx, pid_t pid)
+{
+	struct task_struct *task;
+
+	if (ctx->uflags & RESTART_TASKSELF) {
+		ctx->root_pid = pid;
+		ctx->root_task = current;
+		get_task_struct(current);
+		return current;
+	}
+
+	read_lock(&tasklist_lock);
+	list_for_each_entry(task, &current->children, sibling) {
+		if (task_pid_vnr(task) == pid) {
+			get_task_struct(task);
+			ctx->root_task = task;
+			ctx->root_pid = pid;
+			break;
+		}
+	}
+	read_unlock(&tasklist_lock);
+
+	return ctx->root_task;
+}
+
+/* setup restart-specific parts of ctx */
+static int init_restart_ctx(struct ckpt_ctx *ctx, pid_t pid)
+{
+	struct nsproxy *nsproxy;
+
+	/*
+	 * No need for explicit cleanup here, because if an error
+	 * occurs then ckpt_ctx_free() is eventually called.
+	 */
+
+	if (!choose_root_task(ctx, pid))
+		return -ESRCH;
+
+	rcu_read_lock();
+	nsproxy = task_nsproxy(ctx->root_task);
+	if (nsproxy) {
+		get_nsproxy(nsproxy);
+		ctx->root_nsproxy = nsproxy;
+	}
+	rcu_read_unlock();
+	if (!nsproxy)
+		return -ESRCH;
+
+	ctx->active_pid = -1;	/* see restore_activate_next, get_active_pid */
+
+	return 0;
+}
+
+static int __destroy_descendants(struct task_struct *task, void *data)
+{
+	struct ckpt_ctx *ctx = (struct ckpt_ctx *) data;
+
+	if (task->checkpoint_ctx == ctx)
+		force_sig(SIGKILL, task);
+
+	return 0;
+}
+
+static void destroy_descendants(struct ckpt_ctx *ctx)
+{
+	walk_task_subtree(ctx->root_task, __destroy_descendants, ctx);
+}
+
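+/*
+ * do_restore_coord - the coordinator's side of a restart: read the
+ * image header, container section and task tree; mark the subtree
+ * (or, with RESTART_TASKSELF, only itself) for restart; wait for all
+ * tasks to restore; then run deferred work and read the image tail.
+ */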
+static int do_restore_coord(struct ckpt_ctx *ctx, pid_t pid)
+{
+	int ret;
+
+	ret = restore_debug_task(ctx, RESTART_DBG_COORD);
+	if (ret < 0)
+		return ret;
+	restore_debug_running(ctx);
+
+	ret = restore_read_header(ctx);
+	ckpt_debug("restore header: %d\n", ret);
+	if (ret < 0)
+		return ret;
+	ret = restore_container(ctx);
+	ckpt_debug("restore container: %d\n", ret);
+	if (ret < 0)
+		return ret;
+	ret = restore_read_tree(ctx);
+	ckpt_debug("restore tree: %d\n", ret);
+	if (ret < 0)
+		return ret;
+
+	if ((ctx->uflags & RESTART_TASKSELF) && ctx->nr_pids != 1)
+		return -EINVAL;
+
+	ret = init_restart_ctx(ctx, pid);
+	if (ret < 0)
+		return ret;
+
+	/*
+	 * Populate our own ->checkpoint_ctx: if an ancestor attempts
+	 * prepare_descendants() on us, it will fail. Furthermore,
+	 * that ancestor won't proceed deeper to interfere with our
+	 * descendants that are restarting.
+	 */
+	if (set_task_ctx(current, ctx)) {
+		/*
+		 * We are a badly-behaving descendant: an ancestor must
+		 * have called prepare_descendants() on us as part of a
+		 * restart.
+		 */
+		ckpt_debug("coord already has checkpoint_ctx\n");
+		return -EBUSY;
+	}
+
+	/*
+	 * From now on we are committed to the restart. If anything
+	 * fails, we'll clean up (that is, kill) those tasks in our
+	 * subtree that we marked for restart - see below.
+	 */
+
+	if (ctx->uflags & RESTART_TASKSELF) {
+		ret = pre_restore_task();
+		ckpt_debug("pre restore task: %d\n", ret);
+		if (ret < 0)
+			goto out;
+		ret = restore_task(ctx);
+		ckpt_debug("restore task: %d\n", ret);
+		if (ret < 0)
+			goto out;
+	} else {
+		/* make descendants' ->checkpoint_ctx point to the coordinator */
+		ret = prepare_descendants(ctx, ctx->root_task);
+		ckpt_debug("restore prepare: %d\n", ret);
+		if (ret < 0)
+			goto out;
+		/* wait for all other tasks to complete do_restore_task() */
+		ret = wait_all_tasks_finish(ctx);
+		ckpt_debug("restore finish: %d\n", ret);
+		if (ret < 0)
+			goto out;
+	}
+
+	ret = deferqueue_run(ctx->deferqueue);  /* run deferred work */
+	ckpt_debug("restore deferqueue: %d\n", ret);
+	if (ret < 0)
+		goto out;
+
+	ret = restore_read_tail(ctx);
+	ckpt_debug("restore tail: %d\n", ret);
+	if (ret < 0)
+		goto out;
+
+	if (ctx->uflags & RESTART_FROZEN) {
+		ret = cgroup_freezer_make_frozen(ctx->root_task);
+		ckpt_debug("freezing restart tasks ... %d\n", ret);
+	}
+ out:
+	if (ctx->uflags & RESTART_TASKSELF)
+		post_restore_task();
+
+	restore_debug_error(ctx, ret);
+	if (ret < 0)
+		ckpt_err(ctx, ret, "restart failed (coordinator)\n");
+
+	if (ckpt_test_error(ctx)) {
+		destroy_descendants(ctx);
+		ret = ckpt_get_error(ctx);
+	} else {
+		ckpt_set_success(ctx);
+		wake_up_all(&ctx->waitq);
+	}
+
+	clear_task_ctx(current);
+	return ret;
+}
+
+static long restore_retval(void)
+{
+	struct pt_regs *regs = task_pt_regs(current);
+	long ret;
+
+	/*
+	 * For the restart, we entered the kernel via sys_restart(),
+	 * so our return path is via the syscall exit. In particular,
+	 * the code in entry.S will put the value that we will return
+	 * into a register (e.g. regs->eax in x86), thus passing it to
+	 * the caller task.
+	 *
+	 * What we do now depends on what happened to the checkpointed
+	 * task right before the checkpoint - there are three cases:
+	 *
+	 * 1) It was carrying out a syscall when it became frozen, or
+	 * 2) It was running in userspace, or
+	 * 3) It was doing a self-checkpoint
+	 *
+	 * In case #1, if the syscall succeeded, perhaps partially,
+	 * then the retval is non-negative. If it failed, the error
+	 * may be one of -ERESTART..., which is interpreted in the
+	 * signal handling code. If that is the case, we force the
+	 * signal handler to kick in by faking a signal to ourselves
+	 * (a la freeze/thaw) when ret < 0.
+	 *
+	 * In case #2, our return value would overwrite the original
+	 * value in the affected register. We work around this by
+	 * simply using the saved value of that register as our retval.
+	 *
+	 * In case #3, the state was recorded while the task was in
+	 * the checkpoint(2) syscall. The syscall is expected to
+	 * return 0 when returning from a restart. Fortunately, this
+	 * has already been arranged for at checkpoint time (the
+	 * register that holds the retval, e.g. regs->eax in x86, was
+	 * set to zero).
+	 */
+
+	/* needed for all 3 cases: get old value/error/retval */
+	ret = syscall_get_return_value(current, regs);
+
+	/* if from a syscall and returning error, kick in signal handling */
+	if (syscall_get_nr(current, regs) >= 0 && ret < 0)
+		set_tsk_thread_flag(current, TIF_SIGPENDING);
+
+	return ret;
+}
+
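+/*
+ * do_restart() is entered in one of three roles: the coordinator
+ * passes a non-NULL @ctx (set up by do_sys_restart); a ghost helper
+ * task passes a NULL @ctx with RESTART_GHOST set in @flags; every
+ * other restarting task passes neither and takes the
+ * do_restore_task() path.
+ */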
+long do_restart(struct ckpt_ctx *ctx, pid_t pid, unsigned long flags)
+{
+	long ret;
+
+	if (ctx)
+		ret = do_restore_coord(ctx, pid);
+	else if (flags & RESTART_GHOST)
+		ret = do_ghost_task();
+	else
+		ret = do_restore_task();
+
+	/* restart(2) isn't idempotent: should not be auto-restarted */
+	if (ret == -ERESTARTSYS || ret == -ERESTARTNOINTR ||
+	    ret == -ERESTARTNOHAND || ret == -ERESTART_RESTARTBLOCK)
+		ret = -EINTR;
+
+	/*
+	 * The retval we return to the caller when all goes well is
+	 * either the retval from the original syscall that was
+	 * interrupted during checkpoint, or the contents of the
+	 * (saved) eax if the task was in userspace.
+	 *
+	 * The coordinator (ctx!=NULL) is exempt: don't adjust its retval.
+	 * But in self-restart (where RESTART_TASKSELF), the coordinator
+	 * _itself_ is a restarting task.
+	 */
+
+	if (!ctx || (ctx->uflags & RESTART_TASKSELF)) {
+		if (ret < 0) {
+			/* partial restore is undefined: terminate */
+			ckpt_debug("restart err %ld, exiting\n", ret);
+			force_sig(SIGKILL, current);
+		} else {
+			ret = restore_retval();
+		}
+	}
+
+	ckpt_debug("sys_restart returns %ld\n", ret);
+	return ret;
+}
+
+/**
+ * exit_checkpoint - callback from do_exit to cleanup checkpoint state
+ * @tsk: terminating task
+ */
+void exit_checkpoint(struct task_struct *tsk)
+{
+	struct ckpt_ctx *ctx;
+
+	/* no one else will touch this, because @tsk is dead already */
+	ctx = tsk->checkpoint_ctx;
+
+	/* restarting zombies will activate next task in restart */
+	if (tsk->flags & PF_RESTARTING) {
+		BUG_ON(ctx->active_pid == -1);
+		restore_task_done(ctx);
+	}
+
+	ckpt_ctx_put(ctx);
+}
diff --git a/kernel/checkpoint/sys.c b/kernel/checkpoint/sys.c
new file mode 100644
index 0000000..a420c02
--- /dev/null
+++ b/kernel/checkpoint/sys.c
@@ -0,0 +1,719 @@
+/*
+ *  Generic container checkpoint-restart
+ *
+ *  Copyright (C) 2008-2009 Oren Laadan
+ *
+ *  This file is subject to the terms and conditions of the GNU General Public
+ *  License.  See the file COPYING in the main directory of the Linux
+ *  distribution for more details.
+ */
+
+/* default debug level for output */
+#define CKPT_DFLAG  CKPT_DSYS
+
+#include <linux/sched.h>
+#include <linux/nsproxy.h>
+#include <linux/kernel.h>
+#include <linux/cgroup.h>
+#include <linux/syscalls.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/uaccess.h>
+#include <linux/capability.h>
+#include <linux/checkpoint.h>
+#include <linux/mm_checkpoint.h> /* for ckpt_pgarr_free() */
+#include <linux/deferqueue.h>
+
+/*
+ * ckpt_unpriv_allowed - sysctl controlled. If 0, checkpoint and
+ * restart are not allowed unless the caller has CAP_SYS_ADMIN
+ * (preventing unprivileged users from exploiting any privilege
+ * escalation bugs). If 1, the regular permission checks are intended
+ * to do the job.
+ */
+int ckpt_unpriv_allowed = 1;	/* default: allow */
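+
+/*
+ * The knob itself is registered with sysctl outside this file; at
+ * runtime an administrator would toggle it through /proc/sys, for
+ * example (exact path assumed here):
+ *
+ *	echo 0 > /proc/sys/kernel/ckpt_unpriv_allowed
+ */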
+
+/*
+ * Helpers to write a kernel-space buffer to, or read one from, the
+ * checkpoint image file descriptor (similar to how a core-dump is
+ * performed).
+ *
+ *   ckpt_kwrite() - write a kernel-space buffer to the checkpoint image
+ *   ckpt_kread() - read from the checkpoint image to a kernel-space buffer
+ */
+
+static inline int _ckpt_kwrite(struct file *file, void *addr, int count)
+{
+	void __user *uaddr = (__force void __user *) addr;
+	ssize_t nwrite;
+	int nleft;
+
+	for (nleft = count; nleft; nleft -= nwrite) {
+		loff_t pos = file_pos_read(file);
+		nwrite = vfs_write(file, uaddr, nleft, &pos);
+		file_pos_write(file, pos);
+		if (nwrite < 0) {
+			if (nwrite == -EAGAIN)
+				nwrite = 0;
+			else
+				return nwrite;
+		}
+		uaddr += nwrite;
+	}
+	return 0;
+}
+
+int ckpt_kwrite(struct ckpt_ctx *ctx, void *addr, int count)
+{
+	mm_segment_t fs;
+	int ret;
+
+	if (ckpt_test_error(ctx))
+		return ckpt_get_error(ctx);
+
+	fs = get_fs();
+	set_fs(KERNEL_DS);
+	ret = _ckpt_kwrite(ctx->file, addr, count);
+	set_fs(fs);
+
+	ctx->total += count;
+	return ret;
+}
+
+static inline int _ckpt_kread(struct file *file, void *addr, int count)
+{
+	void __user *uaddr = (__force void __user *) addr;
+	ssize_t nread;
+	int nleft;
+
+	for (nleft = count; nleft; nleft -= nread) {
+		loff_t pos = file_pos_read(file);
+		nread = vfs_read(file, uaddr, nleft, &pos);
+		file_pos_write(file, pos);
+		if (nread <= 0) {
+			if (nread == -EAGAIN) {
+				nread = 0;
+				continue;
+			} else if (nread == 0)
+				nread = -EPIPE;		/* unexpected EOF */
+			return nread;
+		}
+		uaddr += nread;
+	}
+	return 0;
+}
+
+int ckpt_kread(struct ckpt_ctx *ctx, void *addr, int count)
+{
+	mm_segment_t fs;
+	int ret;
+
+	if (ckpt_test_error(ctx))
+		return ckpt_get_error(ctx);
+
+	fs = get_fs();
+	set_fs(KERNEL_DS);
+	ret = _ckpt_kread(ctx->file, addr, count);
+	set_fs(fs);
+
+	ctx->total += count;
+	return ret;
+}
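+
+/*
+ * A minimal usage sketch: both helpers return 0 on success or a
+ * negative errno, and account the bytes transferred in ctx->total.
+ * For example, to emit a header at checkpoint and read it back at
+ * restart:
+ *
+ *	struct ckpt_hdr h;
+ *
+ *	ret = ckpt_kwrite(ctx, &h, sizeof(h));	(checkpoint side)
+ *	...
+ *	ret = ckpt_kread(ctx, &h, sizeof(h));	(restart side)
+ */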
+
+/**
+ * ckpt_hdr_get - get a hdr of certain size
+ * @ctx: checkpoint context
+ * @len: desired length
+ *
+ * Returns pointer to header
+ */
+void *ckpt_hdr_get(struct ckpt_ctx *ctx, int len)
+{
+	return kzalloc(len, GFP_KERNEL);
+}
+
+/**
+ * _ckpt_hdr_put - free a hdr allocated with ckpt_hdr_get
+ * @ctx: checkpoint context
+ * @ptr: header to free
+ * @len: header length
+ *
+ * (Requiring @ptr makes it easily interchangeable with kmalloc/kfree.)
+ */
+void _ckpt_hdr_put(struct ckpt_ctx *ctx, void *ptr, int len)
+{
+	kfree(ptr);
+}
+
+/**
+ * ckpt_hdr_put - free a hdr allocated with ckpt_hdr_get
+ * @ctx: checkpoint context
+ * @ptr: header to free
+ *
+ * It is assumed that @ptr begins with a 'struct ckpt_hdr'.
+ */
+void ckpt_hdr_put(struct ckpt_ctx *ctx, void *ptr)
+{
+	struct ckpt_hdr *h = (struct ckpt_hdr *) ptr;
+	_ckpt_hdr_put(ctx, ptr, h->len);
+}
+
+/**
+ * ckpt_hdr_get_type - get a hdr of certain size and type
+ * @ctx: checkpoint context
+ * @len: desired length
+ * @type: header type
+ *
+ * Returns pointer to header, with its type and length fields set
+ */
+void *ckpt_hdr_get_type(struct ckpt_ctx *ctx, int len, int type)
+{
+	struct ckpt_hdr *h;
+
+	h = ckpt_hdr_get(ctx, len);
+	if (!h)
+		return NULL;
+
+	h->type = type;
+	h->len = len;
+	return h;
+}
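+
+/*
+ * The usual allocate/fill/write/free cycle built on these helpers,
+ * sketched with a hypothetical record type CKPT_HDR_FOO whose struct
+ * ckpt_hdr_foo embeds a struct ckpt_hdr named 'h' as its first
+ * member:
+ *
+ *	struct ckpt_hdr_foo *hh;
+ *
+ *	hh = ckpt_hdr_get_type(ctx, sizeof(*hh), CKPT_HDR_FOO);
+ *	if (!hh)
+ *		return -ENOMEM;
+ *	(... fill in the fields of *hh ...)
+ *	ret = ckpt_kwrite(ctx, hh, hh->h.len);
+ *	ckpt_hdr_put(ctx, hh);
+ */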
+
+#define DUMMY_LSM_INFO "dummy"
+
+int ckpt_write_dummy_lsm_info(struct ckpt_ctx *ctx)
+{
+	return ckpt_write_obj_type(ctx, DUMMY_LSM_INFO,
+			strlen(DUMMY_LSM_INFO), CKPT_HDR_LSM_INFO);
+}
+
+/*
+ * ckpt_snarf_lsm_info
+ * If there is a CKPT_HDR_LSM_INFO field, toss it.
+ * Used when the current LSM doesn't care about this field.
+ */
+void ckpt_snarf_lsm_info(struct ckpt_ctx *ctx)
+{
+	struct ckpt_hdr *h;
+
+	h = ckpt_read_buf_type(ctx, CKPT_LSM_INFO_LEN, CKPT_HDR_LSM_INFO);
+	if (!IS_ERR(h))
+		ckpt_hdr_put(ctx, h);
+}
+
+/*
+ * Helpers to manage c/r contexts: allocated for each checkpoint and/or
+ * restart operation, and persists until the operation is completed.
+ */
+
+static void task_arr_free(struct ckpt_ctx *ctx)
+{
+	int n;
+
+	for (n = 0; n < ctx->nr_tasks; n++) {
+		if (ctx->tasks_arr[n]) {
+			put_task_struct(ctx->tasks_arr[n]);
+			ctx->tasks_arr[n] = NULL;
+		}
+	}
+	kfree(ctx->tasks_arr);
+}
+
+static void ckpt_ctx_free(struct ckpt_ctx *ctx)
+{
+	BUG_ON(atomic_read(&ctx->refcount));
+
+	/* per task status debugging only during restart */
+	if (ctx->kflags & CKPT_CTX_RESTART)
+		restore_debug_free(ctx);
+
+	if (ctx->deferqueue)
+		deferqueue_destroy(ctx->deferqueue);
+
+	if (ctx->files_deferq)
+		deferqueue_destroy(ctx->files_deferq);
+
+	if (ctx->file)
+		fput(ctx->file);
+	if (ctx->logfile)
+		fput(ctx->logfile);
+
+	ckpt_obj_hash_free(ctx);
+	path_put(&ctx->root_fs_path);
+	ckpt_pgarr_free(ctx);
+
+	if (ctx->tasks_arr)
+		task_arr_free(ctx);
+
+	if (ctx->root_nsproxy)
+		put_nsproxy(ctx->root_nsproxy);
+	if (ctx->root_task)
+		put_task_struct(ctx->root_task);
+	if (ctx->root_freezer)
+		put_task_struct(ctx->root_freezer);
+
+	free_page((unsigned long) ctx->scratch_page);
+
+	kfree(ctx->pids_arr);
+
+	sock_listening_list_free(&ctx->listen_sockets);
+
+	kfree(ctx);
+}
+
+static struct ckpt_ctx *ckpt_ctx_alloc(int fd, unsigned long uflags,
+				       unsigned long kflags, int logfd)
+{
+	struct ckpt_ctx *ctx;
+	int err;
+
+	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+	if (!ctx)
+		return ERR_PTR(-ENOMEM);
+
+	ctx->uflags = uflags;
+	ctx->kflags = kflags;
+	ctx->ktime_begin = ktime_get();
+
+	atomic_set(&ctx->refcount, 0);
+	INIT_LIST_HEAD(&ctx->pgarr_list);
+	INIT_LIST_HEAD(&ctx->pgarr_pool);
+	init_waitqueue_head(&ctx->waitq);
+	init_waitqueue_head(&ctx->ghostq);
+	init_completion(&ctx->complete);
+
+	init_rwsem(&ctx->errno_sem);
+	down_write(&ctx->errno_sem);
+
+#ifdef CONFIG_CHECKPOINT_DEBUG
+	INIT_LIST_HEAD(&ctx->task_status);
+	spin_lock_init(&ctx->lock);
+#endif
+
+	mutex_init(&ctx->msg_mutex);
+
+	INIT_LIST_HEAD(&ctx->listen_sockets);
+
+	err = -EBADF;
+	ctx->file = fget(fd);
+	if (!ctx->file)
+		goto err;
+	if (logfd == CHECKPOINT_FD_NONE)
+		goto nolog;
+	ctx->logfile = fget(logfd);
+	if (!ctx->logfile)
+		goto err;
+
+ nolog:
+	err = -ENOMEM;
+	if (ckpt_obj_hash_alloc(ctx) < 0)
+		goto err;
+	ctx->deferqueue = deferqueue_create();
+	if (!ctx->deferqueue)
+		goto err;
+
+	ctx->files_deferq = deferqueue_create();
+	if (!ctx->files_deferq)
+		goto err;
+
+	ctx->scratch_page = (void *) __get_free_page(GFP_KERNEL);
+	if (!ctx->scratch_page)
+		goto err;
+
+	atomic_inc(&ctx->refcount);
+	return ctx;
+ err:
+	ckpt_ctx_free(ctx);
+	return ERR_PTR(err);
+}
+
+struct ckpt_ctx *ckpt_ctx_get(struct ckpt_ctx *ctx)
+{
+	if (ctx)
+		atomic_inc(&ctx->refcount);
+	return ctx;
+}
+
+void ckpt_ctx_put(struct ckpt_ctx *ctx)
+{
+	if (ctx && atomic_dec_and_test(&ctx->refcount))
+		ckpt_ctx_free(ctx);
+}
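+
+/*
+ * Lifecycle note: ckpt_ctx_alloc() returns the context with a single
+ * reference held. Additional users take their own reference with
+ * ckpt_ctx_get() and drop it with ckpt_ctx_put(); the final put
+ * frees the context via ckpt_ctx_free().
+ */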
+
+void ckpt_set_error(struct ckpt_ctx *ctx, int err)
+{
+	/* atomically set ctx->errno */
+	if (!ckpt_test_and_set_ctx_kflag(ctx, CKPT_CTX_ERROR)) {
+		ctx->errno = err;
+		/*
+		 * We initialized ctx->errno_sem write-held to prevent
+		 * other tasks from reading ctx->errno prematurely.
+		 */
+		up_write(&ctx->errno_sem);
+		/* on restart, notify all tasks in restarting subtree */
+		if (ctx->kflags & CKPT_CTX_RESTART)
+			restore_notify_error(ctx);
+	}
+}
+
+void ckpt_set_success(struct ckpt_ctx *ctx)
+{
+	ckpt_set_ctx_kflag(ctx, CKPT_CTX_SUCCESS);
+	/* avoid warning "lock still held" when freeing (was write-held) */
+	up_write(&ctx->errno_sem);
+}
+
+/* helpers to handle log/dbg/err messages */
+void ckpt_msg_lock(struct ckpt_ctx *ctx)
+{
+	if (!ctx)
+		return;
+	mutex_lock(&ctx->msg_mutex);
+	ctx->msg[0] = '\0';
+	ctx->msglen = 1;
+}
+
+void ckpt_msg_unlock(struct ckpt_ctx *ctx)
+{
+	if (!ctx)
+		return;
+	mutex_unlock(&ctx->msg_mutex);
+}
+
+static inline int is_special_flag(char *s)
+{
+	if (*s == '%' && s[1] == '(' && s[2] != '\0' && s[3] == ')')
+		return 1;
+	return 0;
+}
+
+/*
+ * _ckpt_generate_fmt - handle the special flags in the enhanced format
+ * strings used by checkpoint/restart error messages.
+ * @ctx: checkpoint context
+ * @fmt: message format
+ *
+ * The special flags are surrounded by %() to help them visually stand
+ * out.  For instance, %(O) means an objref.  The following special
+ * flags are recognized:
+ *	O: objref
+ *	P: pointer
+ *	T: task
+ *	S: string
+ *	V: variable
+ *
+ * %(O) will be expanded to "[obj %d]".  Likewise, P, S, and V will
+ * also expand to format flags requiring an argument to the subsequent
+ * sprintf or printk.  T will be expanded to a string with no flags,
+ * requiring no further arguments.
+ *
+ * These do not accept any extra flags (i.e. min field width, precision,
+ * etc).
+ *
+ * The caller of ckpt_err() and _ckpt_err() must provide
+ * the additional variables, in order, to match the @fmt (except for
+ * the T key), e.g.:
+ *
+ *	ckpt_err(ctx, err, "%(T)FILE flags %d %(O)\n", flags, objref);
+ *
+ * May be called under spinlock.
+ * Must be called with ctx->msg_mutex held.  The expanded format
+ * will be placed in ctx->fmt.
+ */
+static void _ckpt_generate_fmt(struct ckpt_ctx *ctx, char *fmt)
+{
+	char *s = ctx->fmt;
+	int len = 0;
+
+	for (; *fmt && len < CKPT_MSG_LEN; fmt++) {
+		if (!is_special_flag(fmt)) {
+			s[len++] = *fmt;
+			continue;
+		}
+		switch (fmt[2]) {
+		case 'O':
+			len += snprintf(s+len, CKPT_MSG_LEN-len, "[obj %%d]");
+			break;
+		case 'P':
+			len += snprintf(s+len, CKPT_MSG_LEN-len, "[ptr %%p]");
+			break;
+		case 'V':
+			len += snprintf(s+len, CKPT_MSG_LEN-len, "[sym %%pS]");
+			break;
+		case 'S':
+			len += snprintf(s+len, CKPT_MSG_LEN-len, "[str %%s]");
+			break;
+		case 'T':
+			if (ctx->tsk)
+				len += snprintf(s+len, CKPT_MSG_LEN-len,
+					"[pid %d tsk %s]",
+					task_pid_vnr(ctx->tsk), ctx->tsk->comm);
+			else
+				len += snprintf(s+len, CKPT_MSG_LEN-len,
+					"[pid -1 tsk NULL]");
+			break;
+		default:
+			printk(KERN_ERR "c/r: bad format specifier %c\n",
+					fmt[2]);
+			BUG();
+		}
+		fmt += 3;
+	}
+	if (len == CKPT_MSG_LEN)
+		s[CKPT_MSG_LEN-1] = '\0';
+	else
+		s[len] = '\0';
+}
+
+static void _ckpt_msg_appendv(struct ckpt_ctx *ctx, int err, char *fmt,
+				va_list ap)
+{
+	int len = ctx->msglen;
+
+	if (err) {
+		len += snprintf(&ctx->msg[len], CKPT_MSG_LEN-len, "[err %d]",
+				 err);
+		if (len > CKPT_MSG_LEN)
+			goto full;
+	}
+
+	len += snprintf(&ctx->msg[len], CKPT_MSG_LEN-len, "[pos %lld]",
+			ctx->total);
+	len += vsnprintf(&ctx->msg[len], CKPT_MSG_LEN-len, fmt, ap);
+	if (len > CKPT_MSG_LEN) {
+full:
+		len = CKPT_MSG_LEN;
+		ctx->msg[CKPT_MSG_LEN-1] = '\0';
+	}
+	ctx->msglen = len;
+}
+
+void _ckpt_msg_append(struct ckpt_ctx *ctx, char *fmt, ...)
+{
+	va_list ap;
+
+	va_start(ap, fmt);
+	_ckpt_msg_appendv(ctx, 0, fmt, ap);
+	va_end(ap);
+}
+
+void _ckpt_msg_complete(struct ckpt_ctx *ctx)
+{
+	int ret;
+
+	/* Don't write an empty or uninitialized msg */
+	if (ctx->msglen <= 1)
+		return;
+
+	if (ctx->kflags & CKPT_CTX_CHECKPOINT && ckpt_test_error(ctx)) {
+		ret = ckpt_write_obj_type(ctx, NULL, 0, CKPT_HDR_ERROR);
+		if (!ret)
+			ret = ckpt_write_string(ctx, ctx->msg, ctx->msglen);
+		if (ret < 0)
+			printk(KERN_NOTICE "c/r: error string not saved (%d): %s\n",
+			       ret, ctx->msg+1);
+	}
+
+	if (ctx->logfile) {
+		mm_segment_t fs = get_fs();
+		set_fs(KERNEL_DS);
+		ret = _ckpt_kwrite(ctx->logfile, ctx->msg+1, ctx->msglen-1);
+		set_fs(fs);
+	}
+
+#ifdef CONFIG_CHECKPOINT_DEBUG
+	printk(KERN_DEBUG "%s", ctx->msg+1);
+#endif
+
+	ctx->msglen = 0;
+}
+
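+/*
+ * Shared as a macro rather than a function because a varargs function
+ * cannot forward its '...' to another varargs function: the va_list
+ * must be built at each entry point (_do_ckpt_msg, do_ckpt_msg).
+ */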
+#define __do_ckpt_msg(ctx, err, fmt) do {		\
+	va_list ap;					\
+	_ckpt_generate_fmt(ctx, fmt);			\
+	va_start(ap, fmt);				\
+	_ckpt_msg_appendv(ctx, err, ctx->fmt, ap);	\
+	va_end(ap);					\
+} while (0)
+
+void _do_ckpt_msg(struct ckpt_ctx *ctx, int err, char *fmt, ...)
+{
+	__do_ckpt_msg(ctx, err, fmt);
+}
+
+void do_ckpt_msg(struct ckpt_ctx *ctx, int err, char *fmt, ...)
+{
+	if (!ctx)
+		return;
+
+	ckpt_msg_lock(ctx);
+	__do_ckpt_msg(ctx, err, fmt);
+	_ckpt_msg_complete(ctx);
+	ckpt_msg_unlock(ctx);
+
+	if (err)
+		ckpt_set_error(ctx, err);
+}
+
+/**
+ * walk_task_subtree - iterate through a task's descendants
+ * @root: subtree root task
+ * @func: callback invoked on each task
+ * @data: pointer passed to the callback
+ *
+ * The function will start with @root, and iterate through all the
+ * descendants, including threads, in a DFS manner. Children of a task
+ * are traversed before proceeding to the next thread of that task.
+ *
+ * For each task, the callback @func will be called providing the task
+ * pointer and the @data. The callback is invoked while holding the
+ * tasklist_lock for reading. If the callback fails, it should return
+ * a negative error and the traversal ends. If it succeeds, it returns
+ * a non-negative number, and these per-task values are summed.
+ *
+ * On success, walk_task_subtree() returns the sum of the callback
+ * return values. On failure, it returns a negative value.
+ */
+int walk_task_subtree(struct task_struct *root,
+		      int (*func)(struct task_struct *, void *),
+		      void *data)
+{
+	struct task_struct *leader = root;
+	struct task_struct *parent = NULL;
+	struct task_struct *task = root;
+	int total = 0;
+	int ret;
+
+	read_lock(&tasklist_lock);
+	while (1) {
+		/* invoke callback on this task */
+		ret = func(task, data);
+		if (ret < 0)
+			break;
+
+		total += ret;
+
+		/* if has children - proceed with child */
+		if (!list_empty(&task->children)) {
+			parent = task;
+			task = list_entry(task->children.next,
+					  struct task_struct, sibling);
+			continue;
+		}
+
+		while (task != root) {
+			/* if has sibling - proceed with sibling */
+			if (!list_is_last(&task->sibling, &parent->children)) {
+				task = list_entry(task->sibling.next,
+						  struct task_struct, sibling);
+				break;
+			}
+
+			/* else, trace back to parent and proceed */
+			task = parent;
+			parent = parent->real_parent;
+		}
+
+		if (task == root) {
+			/* in case root task is multi-threaded */
+			root = task = next_thread(task);
+			if (root == leader)
+				break;
+		}
+	}
+	read_unlock(&tasklist_lock);
+
+	ckpt_debug("total %d ret %d\n", total, ret);
+	return (ret < 0 ? ret : total);
+}
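+
+/*
+ * A minimal (hypothetical) callback, counting the tasks in a subtree;
+ * __prepare_descendants() in restart.c is a real caller:
+ *
+ *	static int __count_tasks(struct task_struct *task, void *data)
+ *	{
+ *		return 1;	(the per-task values are summed)
+ *	}
+ *
+ *	nr = walk_task_subtree(root, __count_tasks, NULL);
+ */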
+
+/* checkpoint/restart syscalls */
+
+/**
+ * do_sys_checkpoint - checkpoint a container
+ * @pid: pid of the container init(1) process
+ * @fd: file to which to dump the checkpoint image
+ * @flags: checkpoint operation flags
+ * @logfd: fd to which to dump debug and error messages
+ *
+ * Returns a positive checkpoint identifier on success, 0 when
+ * returning from a restart, or a negative value on error
+ */
+long do_sys_checkpoint(pid_t pid, int fd, unsigned long flags, int logfd)
+{
+	struct ckpt_ctx *ctx;
+	long ret;
+
+	if (flags & ~CHECKPOINT_USER_FLAGS)
+		return -EINVAL;
+
+	if (!ckpt_unpriv_allowed && !capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (pid == 0)
+		pid = task_pid_vnr(current);
+	ctx = ckpt_ctx_alloc(fd, flags, CKPT_CTX_CHECKPOINT, logfd);
+	if (IS_ERR(ctx))
+		return PTR_ERR(ctx);
+
+	ret = do_checkpoint(ctx, pid);
+
+	if (!ret)
+		ret = ctx->crid;
+
+	ckpt_ctx_put(ctx);
+	return ret;
+}
+
+/**
+ * do_sys_restart - restart a container
+ * @pid: pid of task root (in coordinator's namespace), or 0
+ * @fd: file from which to read the checkpoint image
+ * @flags: restart operation flags
+ * @logfd: fd to which to dump debug and error messages
+ *
+ * Returns a negative value on error; otherwise it returns in the
+ * context of the restarted task (see restore_retval())
+ */
+long do_sys_restart(pid_t pid, int fd, unsigned long flags, int logfd)
+{
+	struct ckpt_ctx *ctx = NULL;
+	long ret;
+
+	/* reject unknown flags */
+	if (flags & ~RESTART_USER_FLAGS)
+		return -EINVAL;
+
+	if (!ckpt_unpriv_allowed && !capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (pid)
+		ctx = ckpt_ctx_alloc(fd, flags, CKPT_CTX_RESTART, logfd);
+	if (IS_ERR(ctx))
+		return PTR_ERR(ctx);
+
+	ret = do_restart(ctx, pid, flags);
+
+	ckpt_ctx_put(ctx);
+	return ret;
+}
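+
+/*
+ * A sketch of the intended call sequence from userspace; the syscall
+ * wrapper names and numbers are arch-specific and assumed here:
+ *
+ *	crid = checkpoint(pid, fd, flags, logfd);	(> 0: checkpoint id)
+ *	...
+ *	ret = restart(pid, fd, flags, logfd);
+ *
+ * where restart() returns in the context of the restarted task on
+ * success - see restore_retval() in restart.c.
+ */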
+
+/* 'ckpt_debug_level' controls the verbosity level of c/r code */
+#ifdef CONFIG_CHECKPOINT_DEBUG
+
+/* FIXME: allow changing this at runtime */
+unsigned long __read_mostly ckpt_debug_level = CKPT_DDEFAULT;
+
+static __init int ckpt_debug_setup(char *s)
+{
+	unsigned long val;
+	int ret;
+
+	ret = strict_strtoul(s, 10, &val);
+	if (ret < 0)
+		return ret;
+	ckpt_debug_level = val;
+	return 0;
+}
+
+__setup("ckpt_debug=", ckpt_debug_setup);
+
+#endif /* CONFIG_CHECKPOINT_DEBUG */
-- 
1.6.3.3
