[Devel] [PATCH 06/12] Move checkpoint/files.c into fs/

Matt Helsley matthltc at us.ibm.com
Fri Feb 26 00:45:07 PST 2010


Subsequent patches break up fs/checkpoint.c into the file table checkpoint,
the fs_struct checkpoint, etc.

Signed-off-by: Matt Helsley <matthltc at us.ibm.com>
---
 checkpoint/Makefile |    1 -
 checkpoint/files.c  | 1041 ---------------------------------------------------
 fs/Makefile         |    1 +
 fs/checkpoint.c     | 1041 +++++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 1042 insertions(+), 1042 deletions(-)
 delete mode 100644 checkpoint/files.c
 create mode 100644 fs/checkpoint.c

diff --git a/checkpoint/Makefile b/checkpoint/Makefile
index f8a55df..02e66b6 100644
--- a/checkpoint/Makefile
+++ b/checkpoint/Makefile
@@ -9,6 +9,5 @@ obj-$(CONFIG_CHECKPOINT) += \
 	restart.o \
 	process.o \
 	namespace.o \
-	files.o \
 	memory.o \
 	signal.o
diff --git a/checkpoint/files.c b/checkpoint/files.c
deleted file mode 100644
index 2859cf9..0000000
--- a/checkpoint/files.c
+++ /dev/null
@@ -1,1041 +0,0 @@
-/*
- *  Checkpoint file descriptors
- *
- *  Copyright (C) 2008-2009 Oren Laadan
- *
- *  This file is subject to the terms and conditions of the GNU General Public
- *  License.  See the file COPYING in the main directory of the Linux
- *  distribution for more details.
- */
-
-/* default debug level for output */
-#define CKPT_DFLAG  CKPT_DFILE
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/sched.h>
-#include <linux/file.h>
-#include <linux/namei.h>
-#include <linux/fs_struct.h>
-#include <linux/fs.h>
-#include <linux/fdtable.h>
-#include <linux/fsnotify.h>
-#include <linux/pipe_fs_i.h>
-#include <linux/syscalls.h>
-#include <linux/deferqueue.h>
-#include <linux/checkpoint.h>
-#include <linux/checkpoint_hdr.h>
-#include <linux/eventpoll.h>
-#include <linux/eventfd.h>
-#include <net/sock.h>
-
-
-/**************************************************************************
- * Checkpoint
- */
-
-/**
- * ckpt_fill_fname - return pathname of a given file
- * @path: path name
- * @root: relative root
- * @buf: buffer for pathname
- * @len: buffer length (in) and pathname length (out)
- */
-char *ckpt_fill_fname(struct path *path, struct path *root, char *buf, int *len)
-{
-	struct path tmp = *root;
-	char *fname;
-
-	BUG_ON(!buf);
-	spin_lock(&dcache_lock);
-	fname = __d_path(path, &tmp, buf, *len);
-	spin_unlock(&dcache_lock);
-	if (IS_ERR(fname))
-		return fname;
-	*len = (buf + (*len) - fname);
-	/*
-	 * FIX: if __d_path() changed these, it must have stepped out of
-	 * init's namespace. Since currently we require a unified namespace
-	 * within the container: simply fail.
-	 */
-	if (tmp.mnt != root->mnt || tmp.dentry != root->dentry)
-		fname = ERR_PTR(-EBADF);
-
-	return fname;
-}
-
-/**
- * checkpoint_fname - write a file name
- * @ctx: checkpoint context
- * @path: path name
- * @root: relative root
- */
-int checkpoint_fname(struct ckpt_ctx *ctx, struct path *path, struct path *root)
-{
-	char *buf, *fname;
-	int ret, flen;
-
-	/*
-	 * FIXME: we can optimize and save memory (and storage) if we
-	 * share strings (through objhash) and reference them instead
-	 */
-
-	flen = PATH_MAX;
-	buf = kmalloc(flen, GFP_KERNEL);
-	if (!buf)
-		return -ENOMEM;
-
-	fname = ckpt_fill_fname(path, root, buf, &flen);
-	if (!IS_ERR(fname)) {
-		ret = ckpt_write_obj_type(ctx, fname, flen,
-					  CKPT_HDR_FILE_NAME);
-	} else {
-		ret = PTR_ERR(fname);
-		ckpt_err(ctx, ret, "%(T)%(S)Obtain filename\n",
-			 path->dentry->d_name.name);
-	}
-
-	kfree(buf);
-	return ret;
-}
-
-#define CKPT_DEFAULT_FDTABLE  256		/* an initial guess */
-
-/**
- * scan_fds - scan file table and construct array of open fds
- * @files: files_struct pointer
- * @fdtable: (output) array of open fds
- *
- * Returns the number of open fds found, and also the file table
- * array via *fdtable. The caller should free the array.
- *
- * The caller must validate the file descriptors collected in the
- * array before using them, e.g. by using fcheck_files(), in case
- * the task's fdtable changes in the meantime.
- */
-static int scan_fds(struct files_struct *files, int **fdtable)
-{
-	struct fdtable *fdt;
-	int *fds = NULL;
-	int i = 0, n = 0;
-	int tot = CKPT_DEFAULT_FDTABLE;
-
-	/*
-	 * We assume that all tasks possibly sharing the file table are
-	 * frozen (or we are a single process and we checkpoint ourselves).
-	 * Therefore, we can safely proceed after krealloc() from where we
-	 * left off. Otherwise the file table may be modified by another
-	 * task after we scan it. The behavior is this case is undefined,
-	 * and either checkpoint or restart will likely fail.
-	 */
- retry:
-	fds = krealloc(fds, tot * sizeof(*fds), GFP_KERNEL);
-	if (!fds)
-		return -ENOMEM;
-
-	rcu_read_lock();
-	fdt = files_fdtable(files);
-	for (/**/; i < fdt->max_fds; i++) {
-		if (!fcheck_files(files, i))
-			continue;
-		if (n == tot) {
-			rcu_read_unlock();
-			tot *= 2;	/* won't overflow: kmalloc will fail */
-			goto retry;
-		}
-		fds[n++] = i;
-	}
-	rcu_read_unlock();
-
-	*fdtable = fds;
-	return n;
-}
-
-#ifdef CONFIG_SECURITY
-int checkpoint_file_security(struct ckpt_ctx *ctx, struct file *file)
-{
-	return security_checkpoint_obj(ctx, file->f_security,
-				       CKPT_SECURITY_FILE);
-}
-#else
-int checkpoint_file_security(struct ckpt_ctx *ctx, struct file *file)
-{
-	return SECURITY_CTX_NONE;
-}
-#endif
-
-int checkpoint_file_common(struct ckpt_ctx *ctx, struct file *file,
-			   struct ckpt_hdr_file *h)
-{
-	struct cred *f_cred = (struct cred *) file->f_cred;
-
-	h->f_flags = file->f_flags;
-	h->f_mode = file->f_mode;
-	h->f_pos = file->f_pos;
-	h->f_version = file->f_version;
-
-	h->f_credref = checkpoint_obj(ctx, f_cred, CKPT_OBJ_CRED);
-	if (h->f_credref < 0)
-		return h->f_credref;
-
-	h->f_secref = checkpoint_file_security(ctx, file);
-	if (h->f_secref < 0) {
-		ckpt_err(ctx, h->f_secref, "%(T)file->f_security");
-		return h->f_secref;
-	}
-
-	ckpt_debug("file %s credref %d secref %d\n",
-		file->f_dentry->d_name.name, h->f_credref, h->f_secref);
-
-	/* FIX: need also file->f_owner, etc */
-
-	return 0;
-}
-
-int generic_file_checkpoint(struct ckpt_ctx *ctx, struct file *file)
-{
-	struct ckpt_hdr_file_generic *h;
-	int ret;
-
-	/*
-	 * FIXME: when we'll add support for unlinked files/dirs, we'll
-	 * need to distinguish between unlinked filed and unlinked dirs.
-	 */
-	if (d_unlinked(file->f_dentry)) {
-		ckpt_err(ctx, -EBADF, "%(T)%(P)Unlinked files unsupported\n",
-			 file);
-		return -EBADF;
-	}
-
-	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_FILE);
-	if (!h)
-		return -ENOMEM;
-
-	h->common.f_type = CKPT_FILE_GENERIC;
-
-	ret = checkpoint_file_common(ctx, file, &h->common);
-	if (ret < 0)
-		goto out;
-	ret = ckpt_write_obj(ctx, &h->common.h);
-	if (ret < 0)
-		goto out;
-	ret = checkpoint_fname(ctx, &file->f_path, &ctx->root_fs_path);
- out:
-	ckpt_hdr_put(ctx, h);
-	return ret;
-}
-EXPORT_SYMBOL(generic_file_checkpoint);
-
-/* checkpoint callback for file pointer */
-int checkpoint_file(struct ckpt_ctx *ctx, void *ptr)
-{
-	struct file *file = (struct file *) ptr;
-	int ret;
-
-	if (!file->f_op || !file->f_op->checkpoint) {
-		ckpt_err(ctx, -EBADF, "%(T)%(P)%(V)f_op lacks checkpoint\n",
-			       file, file->f_op);
-		return -EBADF;
-	}
-
-	if (is_dnotify_attached(file)) {
-		ckpt_err(ctx, -EBADF, "%(T)%(P)dnotify unsupported\n", file);
-		return -EBADF;
-	}
-
-	ret = file->f_op->checkpoint(ctx, file);
-	if (ret < 0)
-		ckpt_err(ctx, ret, "%(T)%(P)file checkpoint failed\n", file);
-	return ret;
-}
-
-/**
- * ckpt_write_file_desc - dump the state of a given file descriptor
- * @ctx: checkpoint context
- * @files: files_struct pointer
- * @fd: file descriptor
- *
- * Saves the state of the file descriptor; looks up the actual file
- * pointer in the hash table, and if found saves the matching objref,
- * otherwise calls ckpt_write_file to dump the file pointer too.
- */
-static int checkpoint_file_desc(struct ckpt_ctx *ctx,
-				struct files_struct *files, int fd)
-{
-	struct ckpt_hdr_file_desc *h;
-	struct file *file = NULL;
-	struct fdtable *fdt;
-	int objref, ret;
-	int coe = 0;	/* avoid gcc warning */
-	pid_t pid;
-
-	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_FILE_DESC);
-	if (!h)
-		return -ENOMEM;
-
-	rcu_read_lock();
-	fdt = files_fdtable(files);
-	file = fcheck_files(files, fd);
-	if (file) {
-		coe = FD_ISSET(fd, fdt->close_on_exec);
-		get_file(file);
-	}
-	rcu_read_unlock();
-
-	ret = find_locks_with_owner(file, files);
-	/*
-	 * find_locks_with_owner() returns an error when there
-	 * are no locks found, so we *want* it to return an error
-	 * code.  Its success means we have to fail the checkpoint.
-	 */
-	if (!ret) {
-		ret = -EBADF;
-		ckpt_err(ctx, ret, "%(T)fd %d has file lock or lease\n", fd);
-		goto out;
-	}
-
-	/* sanity check (although this shouldn't happen) */
-	ret = -EBADF;
-	if (!file) {
-		ckpt_err(ctx, ret, "%(T)fd %d gone?\n", fd);
-		goto out;
-	}
-
-	/*
-	 * TODO: Implement c/r of fowner and f_sigio.  Should be
-	 * trivial, but for now we just refuse its checkpoint
-	 */
-	pid = f_getown(file);
-	if (pid) {
-		ret = -EBUSY;
-		ckpt_err(ctx, ret, "%(T)fd %d has an owner (%d)\n", fd);
-		goto out;
-	}
-
-	/*
-	 * if seen first time, this will add 'file' to the objhash, keep
-	 * a reference to it, dump its state while at it.
-	 */
-	objref = checkpoint_obj(ctx, file, CKPT_OBJ_FILE);
-	ckpt_debug("fd %d objref %d file %p coe %d)\n", fd, objref, file, coe);
-	if (objref < 0) {
-		ret = objref;
-		goto out;
-	}
-
-	h->fd_objref = objref;
-	h->fd_descriptor = fd;
-	h->fd_close_on_exec = coe;
-
-	ret = ckpt_write_obj(ctx, &h->h);
-out:
-	ckpt_hdr_put(ctx, h);
-	if (file)
-		fput(file);
-	return ret;
-}
-
-static int do_checkpoint_file_table(struct ckpt_ctx *ctx,
-				    struct files_struct *files)
-{
-	struct ckpt_hdr_file_table *h;
-	int *fdtable = NULL;
-	int nfds, n, ret;
-
-	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_FILE_TABLE);
-	if (!h)
-		return -ENOMEM;
-
-	nfds = scan_fds(files, &fdtable);
-	if (nfds < 0) {
-		ret = nfds;
-		goto out;
-	}
-
-	h->fdt_nfds = nfds;
-
-	ret = ckpt_write_obj(ctx, &h->h);
-	ckpt_hdr_put(ctx, h);
-	if (ret < 0)
-		goto out;
-
-	ckpt_debug("nfds %d\n", nfds);
-	for (n = 0; n < nfds; n++) {
-		ret = checkpoint_file_desc(ctx, files, fdtable[n]);
-		if (ret < 0)
-			goto out;
-	}
-
-	ret = deferqueue_run(ctx->files_deferq);
-	ckpt_debug("files_deferq ran %d entries\n", ret);
-	if (ret > 0)
-		ret = 0;
- out:
-	kfree(fdtable);
-	return ret;
-}
-
-/* checkpoint callback for file table */
-int checkpoint_file_table(struct ckpt_ctx *ctx, void *ptr)
-{
-	return do_checkpoint_file_table(ctx, (struct files_struct *) ptr);
-}
-
-/* checkpoint wrapper for file table */
-int checkpoint_obj_file_table(struct ckpt_ctx *ctx, struct task_struct *t)
-{
-	struct files_struct *files;
-	int objref;
-
-	files = get_files_struct(t);
-	if (!files)
-		return -EBUSY;
-	objref = checkpoint_obj(ctx, files, CKPT_OBJ_FILE_TABLE);
-	put_files_struct(files);
-
-	return objref;
-}
-
-int checkpoint_obj_fs(struct ckpt_ctx *ctx, struct task_struct *t)
-{
-	struct fs_struct *fs;
-	int fs_objref;
-
-	task_lock(current);
-	fs = t->fs;
-	get_fs_struct(fs);
-	task_unlock(current);
-
-	fs_objref = checkpoint_obj(ctx, fs, CKPT_OBJ_FS);
-	put_fs_struct(fs);
-
-	return fs_objref;
-}
-
-/* called with fs refcount bumped so it won't disappear */
-static int do_checkpoint_fs(struct ckpt_ctx *ctx, struct fs_struct *fs)
-{
-	struct ckpt_hdr_fs *h;
-	struct fs_struct *fscopy;
-	int ret;
-
-	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_FS);
-	if (!h)
-		return -ENOMEM;
-	ret = ckpt_write_obj(ctx, &h->h);
-	ckpt_hdr_put(ctx, h);
-	if (ret)
-		return ret;
-
-	fscopy = copy_fs_struct(fs);
-	if (!fs)
-		return -ENOMEM;
-
-	ret = checkpoint_fname(ctx, &fscopy->pwd, &ctx->root_fs_path);
-	if (ret < 0) {
-		ckpt_err(ctx, ret, "%(T)writing path of cwd");
-		goto out;
-	}
-	ret = checkpoint_fname(ctx, &fscopy->root, &ctx->root_fs_path);
-	if (ret < 0) {
-		ckpt_err(ctx, ret, "%(T)writing path of fs root");
-		goto out;
-	}
-	ret = 0;
- out:
-	free_fs_struct(fscopy);
-	return ret;
-}
-
-int checkpoint_fs(struct ckpt_ctx *ctx, void *ptr)
-{
-	return do_checkpoint_fs(ctx, (struct fs_struct *) ptr);
-}
-
-/***********************************************************************
- * Collect
- */
-
-int ckpt_collect_file(struct ckpt_ctx *ctx, struct file *file)
-{
-	int ret;
-
-	ret = ckpt_obj_collect(ctx, file, CKPT_OBJ_FILE);
-	if (ret <= 0)
-		return ret;
-	/* if first time for this file (ret > 0), invoke ->collect() */
-	if (file->f_op->collect)
-		ret = file->f_op->collect(ctx, file);
-	if (ret < 0)
-		ckpt_err(ctx, ret, "%(T)%(P)File collect\n", file);
-	return ret;
-}
-
-static int collect_file_desc(struct ckpt_ctx *ctx,
-			     struct files_struct *files, int fd)
-{
-	struct fdtable *fdt;
-	struct file *file;
-	int ret;
-
-	rcu_read_lock();
-	fdt = files_fdtable(files);
-	file = fcheck_files(files, fd);
-	if (file)
-		get_file(file);
-	rcu_read_unlock();
-
-	if (!file) {
-		ckpt_err(ctx, -EBUSY, "%(T)%(P)File removed\n", file);
-		return -EBUSY;
-	}
-
-	ret = ckpt_collect_file(ctx, file);
-	fput(file);
-
-	return ret;
-}
-
-static int collect_file_table(struct ckpt_ctx *ctx, struct files_struct *files)
-{
-	int *fdtable;
-	int nfds, n;
-	int ret;
-
-	/* if already exists (ret == 0), nothing to do */
-	ret = ckpt_obj_collect(ctx, files, CKPT_OBJ_FILE_TABLE);
-	if (ret <= 0)
-		return ret;
-
-	/* if first time for this file table (ret > 0), proceed inside */
-	nfds = scan_fds(files, &fdtable);
-	if (nfds < 0)
-		return nfds;
-
-	for (n = 0; n < nfds; n++) {
-		ret = collect_file_desc(ctx, files, fdtable[n]);
-		if (ret < 0)
-			break;
-	}
-
-	kfree(fdtable);
-	return ret;
-}
-
-int ckpt_collect_file_table(struct ckpt_ctx *ctx, struct task_struct *t)
-{
-	struct files_struct *files;
-	int ret;
-
-	files = get_files_struct(t);
-	if (!files) {
-		ckpt_err(ctx, -EBUSY, "%(T)files_struct missing\n");
-		return -EBUSY;
-	}
-	ret = collect_file_table(ctx, files);
-	put_files_struct(files);
-
-	return ret;
-}
-
-int ckpt_collect_fs(struct ckpt_ctx *ctx, struct task_struct *t)
-{
-	struct fs_struct *fs;
-	int ret;
-
-	task_lock(t);
-	fs = t->fs;
-	get_fs_struct(fs);
-	task_unlock(t);
-
-	ret = ckpt_obj_collect(ctx, fs, CKPT_OBJ_FS);
-
-	put_fs_struct(fs);
-	return ret;
-}
-
-/**************************************************************************
- * Restart
- */
-
-static int ckpt_read_fname(struct ckpt_ctx *ctx, char **fname)
-{
-	int len;
-
-	len = ckpt_read_payload(ctx, (void **) fname,
-				PATH_MAX, CKPT_HDR_FILE_NAME);
-	if (len < 0)
-		return len;
-
-	(*fname)[len - 1] = '\0';	/* always play if safe */
-	ckpt_debug("read filename '%s'\n", *fname);
-
-	return len;
-}
-
-/**
- * restore_open_fname - read a file name and open a file
- * @ctx: checkpoint context
- * @flags: file flags
- */
-struct file *restore_open_fname(struct ckpt_ctx *ctx, int flags)
-{
-	struct file *file;
-	char *fname;
-	int len;
-
-	/* prevent bad input from doing bad things */
-	if (flags & (O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC))
-		return ERR_PTR(-EINVAL);
-
-	len = ckpt_read_fname(ctx, &fname);
-	if (len < 0)
-		return ERR_PTR(len);
-	ckpt_debug("fname '%s' flags %#x\n", fname, flags);
-
-	file = filp_open(fname, flags, 0);
-	kfree(fname);
-
-	return file;
-}
-
-static int close_all_fds(struct files_struct *files)
-{
-	int *fdtable;
-	int nfds;
-
-	nfds = scan_fds(files, &fdtable);
-	if (nfds < 0)
-		return nfds;
-	while (nfds--)
-		sys_close(fdtable[nfds]);
-	kfree(fdtable);
-	return 0;
-}
-
-/**
- * attach_file - attach a lonely file ptr to a file descriptor
- * @file: lonely file pointer
- */
-static int attach_file(struct file *file)
-{
-	int fd = get_unused_fd_flags(0);
-
-	if (fd >= 0) {
-		get_file(file);
-		fsnotify_open(file->f_path.dentry);
-		fd_install(fd, file);
-	}
-	return fd;
-}
-
-#define CKPT_SETFL_MASK  \
-	(O_APPEND | O_NONBLOCK | O_NDELAY | FASYNC | O_DIRECT | O_NOATIME)
-
-int restore_file_common(struct ckpt_ctx *ctx, struct file *file,
-			struct ckpt_hdr_file *h)
-{
-	fmode_t new_mode = file->f_mode;
-	fmode_t saved_mode = (__force fmode_t) h->f_mode;
-	int ret;
-	struct cred *cred;
-
-	/* FIX: need to restore owner etc */
-
-	/* restore the cred */
-	cred = ckpt_obj_fetch(ctx, h->f_credref, CKPT_OBJ_CRED);
-	if (IS_ERR(cred))
-		return PTR_ERR(cred);
-	put_cred(file->f_cred);
-	file->f_cred = get_cred(cred);
-
-	ret = security_restore_obj(ctx, (void *) file, CKPT_SECURITY_FILE,
-				   h->f_secref);
-	if (ret < 0) {
-		ckpt_err(ctx, ret, "file secref %(O)%(P)\n", h->f_secref,
-			 file);
-		return ret;
-	}
-
-	/* safe to set 1st arg (fd) to 0, as command is F_SETFL */
-	ret = vfs_fcntl(0, F_SETFL, h->f_flags & CKPT_SETFL_MASK, file);
-	if (ret < 0)
-		return ret;
-
-	/*
-	 * Normally f_mode is set by open, and modified only via
-	 * fcntl(), so its value now should match that at checkpoint.
-	 * However, a file may be downgraded from (read-)write to
-	 * read-only, e.g:
-	 *  - mark_files_ro() unsets FMODE_WRITE
-	 *  - nfs4_file_downgrade() too, and also sert FMODE_READ
-	 * Validate the new f_mode against saved f_mode, allowing:
-	 *  - new with FMODE_WRITE, saved without FMODE_WRITE
-	 *  - new without FMODE_READ, saved with FMODE_READ
-	 */
-	if ((new_mode & FMODE_WRITE) && !(saved_mode & FMODE_WRITE)) {
-		new_mode &= ~FMODE_WRITE;
-		if (!(new_mode & FMODE_READ) && (saved_mode & FMODE_READ))
-			new_mode |= FMODE_READ;
-	}
-	/* finally, at this point new mode should match saved mode */
-	if (new_mode ^ saved_mode)
-		return -EINVAL;
-
-	if (file->f_mode & FMODE_LSEEK)
-		ret = vfs_llseek(file, h->f_pos, SEEK_SET);
-
-	return ret;
-}
-
-static struct file *generic_file_restore(struct ckpt_ctx *ctx,
-					 struct ckpt_hdr_file *ptr)
-{
-	struct file *file;
-	int ret;
-
-	if (ptr->h.type != CKPT_HDR_FILE  ||
-	    ptr->h.len != sizeof(*ptr) || ptr->f_type != CKPT_FILE_GENERIC)
-		return ERR_PTR(-EINVAL);
-
-	file = restore_open_fname(ctx, ptr->f_flags);
-	if (IS_ERR(file))
-		return file;
-
-	ret = restore_file_common(ctx, file, ptr);
-	if (ret < 0) {
-		fput(file);
-		file = ERR_PTR(ret);
-	}
-	return file;
-}
-
-struct restore_file_ops {
-	char *file_name;
-	enum file_type file_type;
-	struct file * (*restore) (struct ckpt_ctx *ctx,
-				  struct ckpt_hdr_file *ptr);
-};
-
-static struct restore_file_ops restore_file_ops[] = {
-	/* ignored file */
-	{
-		.file_name = "IGNORE",
-		.file_type = CKPT_FILE_IGNORE,
-		.restore = NULL,
-	},
-	/* regular file/directory */
-	{
-		.file_name = "GENERIC",
-		.file_type = CKPT_FILE_GENERIC,
-		.restore = generic_file_restore,
-	},
-	/* pipes */
-	{
-		.file_name = "PIPE",
-		.file_type = CKPT_FILE_PIPE,
-		.restore = pipe_file_restore,
-	},
-	/* fifo */
-	{
-		.file_name = "FIFO",
-		.file_type = CKPT_FILE_FIFO,
-		.restore = fifo_file_restore,
-	},
-	/* socket */
-	{
-		.file_name = "SOCKET",
-		.file_type = CKPT_FILE_SOCKET,
-		.restore = sock_file_restore,
-	},
-	/* tty */
-	{
-		.file_name = "TTY",
-		.file_type = CKPT_FILE_TTY,
-		.restore = tty_file_restore,
-	},
-	/* epoll */
-	{
-		.file_name = "EPOLL",
-		.file_type = CKPT_FILE_EPOLL,
-		.restore = ep_file_restore,
-	},
-	/* eventfd */
-	{
-		.file_name = "EVENTFD",
-		.file_type = CKPT_FILE_EVENTFD,
-		.restore = eventfd_restore,
-	},
-};
-
-static struct file *do_restore_file(struct ckpt_ctx *ctx)
-{
-	struct restore_file_ops *ops;
-	struct ckpt_hdr_file *h;
-	struct file *file = ERR_PTR(-EINVAL);
-
-	/*
-	 * All 'struct ckpt_hdr_file_...' begin with ckpt_hdr_file,
-	 * but the actual object depends on the file type. The length
-	 * should never be more than page.
-	 */
-	h = ckpt_read_buf_type(ctx, PAGE_SIZE, CKPT_HDR_FILE);
-	if (IS_ERR(h))
-		return (struct file *) h;
-	ckpt_debug("flags %#x mode %#x type %d\n",
-		 h->f_flags, h->f_mode, h->f_type);
-
-	if (h->f_type >= CKPT_FILE_MAX)
-		goto out;
-
-	ops = &restore_file_ops[h->f_type];
-	BUG_ON(ops->file_type != h->f_type);
-
-	if (ops->restore)
-		file = ops->restore(ctx, h);
- out:
-	ckpt_hdr_put(ctx, h);
-	return file;
-}
-
-/* restore callback for file pointer */
-void *restore_file(struct ckpt_ctx *ctx)
-{
-	return (void *) do_restore_file(ctx);
-}
-
-/**
- * ckpt_read_file_desc - restore the state of a given file descriptor
- * @ctx: checkpoint context
- *
- * Restores the state of a file descriptor; looks up the objref (in the
- * header) in the hash table, and if found picks the matching file and
- * use it; otherwise calls restore_file to restore the file too.
- */
-static int restore_file_desc(struct ckpt_ctx *ctx)
-{
-	struct ckpt_hdr_file_desc *h;
-	struct file *file;
-	int newfd, ret;
-
-	h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_FILE_DESC);
-	if (IS_ERR(h))
-		return PTR_ERR(h);
-	ckpt_debug("ref %d fd %d c.o.e %d\n",
-		 h->fd_objref, h->fd_descriptor, h->fd_close_on_exec);
-
-	ret = -EINVAL;
-	if (h->fd_objref <= 0 || h->fd_descriptor < 0)
-		goto out;
-
-	file = ckpt_obj_fetch(ctx, h->fd_objref, CKPT_OBJ_FILE);
-	if (IS_ERR(file)) {
-		ret = PTR_ERR(file);
-		goto out;
-	}
-
-	newfd = attach_file(file);
-	if (newfd < 0) {
-		ret = newfd;
-		goto out;
-	}
-
-	ckpt_debug("newfd got %d wanted %d\n", newfd, h->fd_descriptor);
-
-	/* reposition if newfd isn't desired fd */
-	if (newfd != h->fd_descriptor) {
-		ret = sys_dup2(newfd, h->fd_descriptor);
-		if (ret < 0)
-			goto out;
-		sys_close(newfd);
-	}
-
-	set_close_on_exec(h->fd_descriptor, h->fd_close_on_exec);
-	ret = 0;
- out:
-	ckpt_hdr_put(ctx, h);
-	return ret;
-}
-
-/* restore callback for file table */
-static struct files_struct *do_restore_file_table(struct ckpt_ctx *ctx)
-{
-	struct ckpt_hdr_file_table *h;
-	struct files_struct *files;
-	int i, ret;
-
-	h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_FILE_TABLE);
-	if (IS_ERR(h))
-		return (struct files_struct *) h;
-
-	ckpt_debug("nfds %d\n", h->fdt_nfds);
-
-	ret = -EMFILE;
-	if (h->fdt_nfds < 0 || h->fdt_nfds > sysctl_nr_open)
-		goto out;
-
-	/*
-	 * We assume that restarting tasks, as created in user-space,
-	 * have distinct files_struct objects each. If not, we need to
-	 * call dup_fd() to make sure we don't overwrite an already
-	 * restored one.
-	 */
-
-	/* point of no return -- close all file descriptors */
-	ret = close_all_fds(current->files);
-	if (ret < 0)
-		goto out;
-
-	for (i = 0; i < h->fdt_nfds; i++) {
-		ret = restore_file_desc(ctx);
-		if (ret < 0)
-			goto out;
-	}
-
-	ret = deferqueue_run(ctx->files_deferq);
-	ckpt_debug("files_deferq ran %d entries\n", ret);
-	if (ret > 0)
-		ret = 0;
- out:
-	ckpt_hdr_put(ctx, h);
-	if (!ret) {
-		files = current->files;
-		atomic_inc(&files->count);
-	} else {
-		files = ERR_PTR(ret);
-	}
-	return files;
-}
-
-void *restore_file_table(struct ckpt_ctx *ctx)
-{
-	return (void *) do_restore_file_table(ctx);
-}
-
-int restore_obj_file_table(struct ckpt_ctx *ctx, int files_objref)
-{
-	struct files_struct *files;
-
-	files = ckpt_obj_fetch(ctx, files_objref, CKPT_OBJ_FILE_TABLE);
-	if (IS_ERR(files))
-		return PTR_ERR(files);
-
-	if (files != current->files) {
-		task_lock(current);
-		put_files_struct(current->files);
-		current->files = files;
-		task_unlock(current);
-		atomic_inc(&files->count);
-	}
-
-	return 0;
-}
-
-/*
- * Called by task restore code to set the restarted task's
- * current->fs to an entry on the hash
- */
-int restore_obj_fs(struct ckpt_ctx *ctx, int fs_objref)
-{
-	struct fs_struct *newfs, *oldfs;
-
-	newfs = ckpt_obj_fetch(ctx, fs_objref, CKPT_OBJ_FS);
-	if (IS_ERR(newfs))
-		return PTR_ERR(newfs);
-
-	task_lock(current);
-	get_fs_struct(newfs);
-	oldfs = current->fs;
-	current->fs = newfs;
-	task_unlock(current);
-	put_fs_struct(oldfs);
-
-	return 0;
-}
-
-static int restore_chroot(struct ckpt_ctx *ctx, struct fs_struct *fs, char *name)
-{
-	struct nameidata nd;
-	int ret;
-
-	ckpt_debug("attempting chroot to %s\n", name);
-	ret = path_lookup(name, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &nd);
-	if (ret) {
-		ckpt_err(ctx, ret, "%(T)Opening chroot dir %s", name);
-		return ret;
-	}
-	ret = do_chroot(fs, &nd.path);
-	path_put(&nd.path);
-	if (ret) {
-		ckpt_err(ctx, ret, "%(T)Setting chroot %s", name);
-		return ret;
-	}
-	return 0;
-}
-
-static int restore_cwd(struct ckpt_ctx *ctx, struct fs_struct *fs, char *name)
-{
-	struct nameidata nd;
-	int ret;
-
-	ckpt_debug("attempting chdir to %s\n", name);
-	ret = path_lookup(name, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &nd);
-	if (ret) {
-		ckpt_err(ctx, ret, "%(T)Opening cwd %s", name);
-		return ret;
-	}
-	ret = do_chdir(fs, &nd.path);
-	path_put(&nd.path);
-	if (ret) {
-		ckpt_err(ctx, ret, "%(T)Setting cwd %s", name);
-		return ret;
-	}
-	return 0;
-}
-
-/*
- * Called by objhash when it runs into a CKPT_OBJ_FS entry. Creates
- * an fs_struct with desired chroot/cwd and places it in the hash.
- */
-static struct fs_struct *do_restore_fs(struct ckpt_ctx *ctx)
-{
-	struct ckpt_hdr_fs *h;
-	struct fs_struct *fs;
-	char *path;
-	int ret = 0;
-
-	h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_FS);
-	if (IS_ERR(h))
-		return ERR_PTR(PTR_ERR(h));
-	ckpt_hdr_put(ctx, h);
-
-	fs = copy_fs_struct(current->fs);
-	if (!fs)
-		return ERR_PTR(-ENOMEM);
-
-	ret = ckpt_read_fname(ctx, &path);
-	if (ret < 0)
-		goto out;
-	ret = restore_cwd(ctx, fs, path);
-	kfree(path);
-	if (ret)
-		goto out;
-
-	ret = ckpt_read_fname(ctx, &path);
-	if (ret < 0)
-		goto out;
-	ret = restore_chroot(ctx, fs, path);
-	kfree(path);
-
-out:
-	if (ret) {
-		free_fs_struct(fs);
-		return ERR_PTR(ret);
-	}
-	return fs;
-}
-
-void *restore_fs(struct ckpt_ctx *ctx)
-{
-	return (void *) do_restore_fs(ctx);
-}
diff --git a/fs/Makefile b/fs/Makefile
index af6d047..93c4775 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -29,6 +29,7 @@ obj-$(CONFIG_EVENTFD)		+= eventfd.o
 obj-$(CONFIG_AIO)               += aio.o
 obj-$(CONFIG_FILE_LOCKING)      += locks.o
 obj-$(CONFIG_COMPAT)		+= compat.o compat_ioctl.o
+obj-$(CONFIG_CHECKPOINT)	+= checkpoint.o
 
 nfsd-$(CONFIG_NFSD)		:= nfsctl.o
 obj-y				+= $(nfsd-y) $(nfsd-m)
diff --git a/fs/checkpoint.c b/fs/checkpoint.c
new file mode 100644
index 0000000..2859cf9
--- /dev/null
+++ b/fs/checkpoint.c
@@ -0,0 +1,1041 @@
+/*
+ *  Checkpoint file descriptors
+ *
+ *  Copyright (C) 2008-2009 Oren Laadan
+ *
+ *  This file is subject to the terms and conditions of the GNU General Public
+ *  License.  See the file COPYING in the main directory of the Linux
+ *  distribution for more details.
+ */
+
+/* default debug level for output */
+#define CKPT_DFLAG  CKPT_DFILE
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/file.h>
+#include <linux/namei.h>
+#include <linux/fs_struct.h>
+#include <linux/fs.h>
+#include <linux/fdtable.h>
+#include <linux/fsnotify.h>
+#include <linux/pipe_fs_i.h>
+#include <linux/syscalls.h>
+#include <linux/deferqueue.h>
+#include <linux/checkpoint.h>
+#include <linux/checkpoint_hdr.h>
+#include <linux/eventpoll.h>
+#include <linux/eventfd.h>
+#include <net/sock.h>
+
+
+/**************************************************************************
+ * Checkpoint
+ */
+
+/**
+ * ckpt_fill_fname - return pathname of a given file
+ * @path: path name
+ * @root: relative root
+ * @buf: buffer for pathname
+ * @len: buffer length (in) and pathname length (out)
+ */
+char *ckpt_fill_fname(struct path *path, struct path *root, char *buf, int *len)
+{
+	struct path tmp = *root;
+	char *fname;
+
+	BUG_ON(!buf);
+	spin_lock(&dcache_lock);
+	fname = __d_path(path, &tmp, buf, *len);
+	spin_unlock(&dcache_lock);
+	if (IS_ERR(fname))
+		return fname;
+	*len = (buf + (*len) - fname);
+	/*
+	 * FIX: if __d_path() changed these, it must have stepped out of
+	 * init's namespace. Since currently we require a unified namespace
+	 * within the container: simply fail.
+	 */
+	if (tmp.mnt != root->mnt || tmp.dentry != root->dentry)
+		fname = ERR_PTR(-EBADF);
+
+	return fname;
+}
+
+/**
+ * checkpoint_fname - write a file name
+ * @ctx: checkpoint context
+ * @path: path name
+ * @root: relative root
+ */
+int checkpoint_fname(struct ckpt_ctx *ctx, struct path *path, struct path *root)
+{
+	char *buf, *fname;
+	int ret, flen;
+
+	/*
+	 * FIXME: we can optimize and save memory (and storage) if we
+	 * share strings (through objhash) and reference them instead
+	 */
+
+	flen = PATH_MAX;
+	buf = kmalloc(flen, GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+
+	fname = ckpt_fill_fname(path, root, buf, &flen);
+	if (!IS_ERR(fname)) {
+		ret = ckpt_write_obj_type(ctx, fname, flen,
+					  CKPT_HDR_FILE_NAME);
+	} else {
+		ret = PTR_ERR(fname);
+		ckpt_err(ctx, ret, "%(T)%(S)Obtain filename\n",
+			 path->dentry->d_name.name);
+	}
+
+	kfree(buf);
+	return ret;
+}
+
+#define CKPT_DEFAULT_FDTABLE  256		/* an initial guess */
+
+/**
+ * scan_fds - scan file table and construct array of open fds
+ * @files: files_struct pointer
+ * @fdtable: (output) array of open fds
+ *
+ * Returns the number of open fds found, and also the file table
+ * array via *fdtable. The caller should free the array.
+ *
+ * The caller must validate the file descriptors collected in the
+ * array before using them, e.g. by using fcheck_files(), in case
+ * the task's fdtable changes in the meantime.
+ */
+static int scan_fds(struct files_struct *files, int **fdtable)
+{
+	struct fdtable *fdt;
+	int *fds = NULL;
+	int i = 0, n = 0;
+	int tot = CKPT_DEFAULT_FDTABLE;
+
+	/*
+	 * We assume that all tasks possibly sharing the file table are
+	 * frozen (or we are a single process and we checkpoint ourselves).
+	 * Therefore, we can safely proceed after krealloc() from where we
+	 * left off. Otherwise the file table may be modified by another
+	 * task after we scan it. The behavior is this case is undefined,
+	 * and either checkpoint or restart will likely fail.
+	 */
+ retry:
+	fds = krealloc(fds, tot * sizeof(*fds), GFP_KERNEL);
+	if (!fds)
+		return -ENOMEM;
+
+	rcu_read_lock();
+	fdt = files_fdtable(files);
+	for (/**/; i < fdt->max_fds; i++) {
+		if (!fcheck_files(files, i))
+			continue;
+		if (n == tot) {
+			rcu_read_unlock();
+			tot *= 2;	/* won't overflow: kmalloc will fail */
+			goto retry;
+		}
+		fds[n++] = i;
+	}
+	rcu_read_unlock();
+
+	*fdtable = fds;
+	return n;
+}
+
+#ifdef CONFIG_SECURITY
+int checkpoint_file_security(struct ckpt_ctx *ctx, struct file *file)
+{
+	return security_checkpoint_obj(ctx, file->f_security,
+				       CKPT_SECURITY_FILE);
+}
+#else
+int checkpoint_file_security(struct ckpt_ctx *ctx, struct file *file)
+{
+	return SECURITY_CTX_NONE;
+}
+#endif
+
+int checkpoint_file_common(struct ckpt_ctx *ctx, struct file *file,
+			   struct ckpt_hdr_file *h)
+{
+	struct cred *f_cred = (struct cred *) file->f_cred;
+
+	h->f_flags = file->f_flags;
+	h->f_mode = file->f_mode;
+	h->f_pos = file->f_pos;
+	h->f_version = file->f_version;
+
+	h->f_credref = checkpoint_obj(ctx, f_cred, CKPT_OBJ_CRED);
+	if (h->f_credref < 0)
+		return h->f_credref;
+
+	h->f_secref = checkpoint_file_security(ctx, file);
+	if (h->f_secref < 0) {
+		ckpt_err(ctx, h->f_secref, "%(T)file->f_security");
+		return h->f_secref;
+	}
+
+	ckpt_debug("file %s credref %d secref %d\n",
+		file->f_dentry->d_name.name, h->f_credref, h->f_secref);
+
+	/* FIX: need also file->f_owner, etc */
+
+	return 0;
+}
+
+int generic_file_checkpoint(struct ckpt_ctx *ctx, struct file *file)
+{
+	struct ckpt_hdr_file_generic *h;
+	int ret;
+
+	/*
+	 * FIXME: when we'll add support for unlinked files/dirs, we'll
+	 * need to distinguish between unlinked filed and unlinked dirs.
+	 */
+	if (d_unlinked(file->f_dentry)) {
+		ckpt_err(ctx, -EBADF, "%(T)%(P)Unlinked files unsupported\n",
+			 file);
+		return -EBADF;
+	}
+
+	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_FILE);
+	if (!h)
+		return -ENOMEM;
+
+	h->common.f_type = CKPT_FILE_GENERIC;
+
+	ret = checkpoint_file_common(ctx, file, &h->common);
+	if (ret < 0)
+		goto out;
+	ret = ckpt_write_obj(ctx, &h->common.h);
+	if (ret < 0)
+		goto out;
+	ret = checkpoint_fname(ctx, &file->f_path, &ctx->root_fs_path);
+ out:
+	ckpt_hdr_put(ctx, h);
+	return ret;
+}
+EXPORT_SYMBOL(generic_file_checkpoint);
+
+/* checkpoint callback for file pointer */
+int checkpoint_file(struct ckpt_ctx *ctx, void *ptr)
+{
+	struct file *file = (struct file *) ptr;
+	int ret;
+
+	if (!file->f_op || !file->f_op->checkpoint) {
+		ckpt_err(ctx, -EBADF, "%(T)%(P)%(V)f_op lacks checkpoint\n",
+			       file, file->f_op);
+		return -EBADF;
+	}
+
+	if (is_dnotify_attached(file)) {
+		ckpt_err(ctx, -EBADF, "%(T)%(P)dnotify unsupported\n", file);
+		return -EBADF;
+	}
+
+	ret = file->f_op->checkpoint(ctx, file);
+	if (ret < 0)
+		ckpt_err(ctx, ret, "%(T)%(P)file checkpoint failed\n", file);
+	return ret;
+}
+
+/**
+ * ckpt_write_file_desc - dump the state of a given file descriptor
+ * @ctx: checkpoint context
+ * @files: files_struct pointer
+ * @fd: file descriptor
+ *
+ * Saves the state of the file descriptor; looks up the actual file
+ * pointer in the hash table, and if found saves the matching objref,
+ * otherwise calls ckpt_write_file to dump the file pointer too.
+ */
+static int checkpoint_file_desc(struct ckpt_ctx *ctx,
+				struct files_struct *files, int fd)
+{
+	struct ckpt_hdr_file_desc *h;
+	struct file *file = NULL;
+	struct fdtable *fdt;
+	int objref, ret;
+	int coe = 0;	/* avoid gcc warning */
+	pid_t pid;
+
+	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_FILE_DESC);
+	if (!h)
+		return -ENOMEM;
+
+	rcu_read_lock();
+	fdt = files_fdtable(files);
+	file = fcheck_files(files, fd);
+	if (file) {
+		coe = FD_ISSET(fd, fdt->close_on_exec);
+		get_file(file);
+	}
+	rcu_read_unlock();
+
+	ret = find_locks_with_owner(file, files);
+	/*
+	 * find_locks_with_owner() returns an error when there
+	 * are no locks found, so we *want* it to return an error
+	 * code.  Its success means we have to fail the checkpoint.
+	 */
+	if (!ret) {
+		ret = -EBADF;
+		ckpt_err(ctx, ret, "%(T)fd %d has file lock or lease\n", fd);
+		goto out;
+	}
+
+	/* sanity check (although this shouldn't happen) */
+	ret = -EBADF;
+	if (!file) {
+		ckpt_err(ctx, ret, "%(T)fd %d gone?\n", fd);
+		goto out;
+	}
+
+	/*
+	 * TODO: Implement c/r of fowner and f_sigio.  Should be
+	 * trivial, but for now we just refuse its checkpoint
+	 */
+	pid = f_getown(file);
+	if (pid) {
+		ret = -EBUSY;
+		ckpt_err(ctx, ret, "%(T)fd %d has an owner (%d)\n", fd);
+		goto out;
+	}
+
+	/*
+	 * if seen first time, this will add 'file' to the objhash, keep
+	 * a reference to it, dump its state while at it.
+	 */
+	objref = checkpoint_obj(ctx, file, CKPT_OBJ_FILE);
+	ckpt_debug("fd %d objref %d file %p coe %d)\n", fd, objref, file, coe);
+	if (objref < 0) {
+		ret = objref;
+		goto out;
+	}
+
+	h->fd_objref = objref;
+	h->fd_descriptor = fd;
+	h->fd_close_on_exec = coe;
+
+	ret = ckpt_write_obj(ctx, &h->h);
+out:
+	ckpt_hdr_put(ctx, h);
+	if (file)
+		fput(file);
+	return ret;
+}
+
+static int do_checkpoint_file_table(struct ckpt_ctx *ctx,
+				    struct files_struct *files)
+{
+	struct ckpt_hdr_file_table *h;
+	int *fdtable = NULL;
+	int nfds, n, ret;
+
+	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_FILE_TABLE);
+	if (!h)
+		return -ENOMEM;
+
+	nfds = scan_fds(files, &fdtable);
+	if (nfds < 0) {
+		ret = nfds;
+		goto out;
+	}
+
+	h->fdt_nfds = nfds;
+
+	ret = ckpt_write_obj(ctx, &h->h);
+	ckpt_hdr_put(ctx, h);
+	if (ret < 0)
+		goto out;
+
+	ckpt_debug("nfds %d\n", nfds);
+	for (n = 0; n < nfds; n++) {
+		ret = checkpoint_file_desc(ctx, files, fdtable[n]);
+		if (ret < 0)
+			goto out;
+	}
+
+	ret = deferqueue_run(ctx->files_deferq);
+	ckpt_debug("files_deferq ran %d entries\n", ret);
+	if (ret > 0)
+		ret = 0;
+ out:
+	kfree(fdtable);
+	return ret;
+}
+
+/* checkpoint callback for file table */
+int checkpoint_file_table(struct ckpt_ctx *ctx, void *ptr)
+{
+	return do_checkpoint_file_table(ctx, (struct files_struct *) ptr);
+}
+
+/* checkpoint wrapper for file table */
+int checkpoint_obj_file_table(struct ckpt_ctx *ctx, struct task_struct *t)
+{
+	struct files_struct *files;
+	int objref;
+
+	files = get_files_struct(t);
+	if (!files)
+		return -EBUSY;
+	objref = checkpoint_obj(ctx, files, CKPT_OBJ_FILE_TABLE);
+	put_files_struct(files);
+
+	return objref;
+}
+
+int checkpoint_obj_fs(struct ckpt_ctx *ctx, struct task_struct *t)
+{
+	struct fs_struct *fs;
+	int fs_objref;
+
+	task_lock(current);
+	fs = t->fs;
+	get_fs_struct(fs);
+	task_unlock(current);
+
+	fs_objref = checkpoint_obj(ctx, fs, CKPT_OBJ_FS);
+	put_fs_struct(fs);
+
+	return fs_objref;
+}
+
+/* called with fs refcount bumped so it won't disappear */
+static int do_checkpoint_fs(struct ckpt_ctx *ctx, struct fs_struct *fs)
+{
+	struct ckpt_hdr_fs *h;
+	struct fs_struct *fscopy;
+	int ret;
+
+	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_FS);
+	if (!h)
+		return -ENOMEM;
+	ret = ckpt_write_obj(ctx, &h->h);
+	ckpt_hdr_put(ctx, h);
+	if (ret)
+		return ret;
+
+	fscopy = copy_fs_struct(fs);
+	if (!fs)
+		return -ENOMEM;
+
+	ret = checkpoint_fname(ctx, &fscopy->pwd, &ctx->root_fs_path);
+	if (ret < 0) {
+		ckpt_err(ctx, ret, "%(T)writing path of cwd");
+		goto out;
+	}
+	ret = checkpoint_fname(ctx, &fscopy->root, &ctx->root_fs_path);
+	if (ret < 0) {
+		ckpt_err(ctx, ret, "%(T)writing path of fs root");
+		goto out;
+	}
+	ret = 0;
+ out:
+	free_fs_struct(fscopy);
+	return ret;
+}
+
+int checkpoint_fs(struct ckpt_ctx *ctx, void *ptr)
+{
+	return do_checkpoint_fs(ctx, (struct fs_struct *) ptr);
+}
+
+/***********************************************************************
+ * Collect
+ */
+
+int ckpt_collect_file(struct ckpt_ctx *ctx, struct file *file)
+{
+	int ret;
+
+	ret = ckpt_obj_collect(ctx, file, CKPT_OBJ_FILE);
+	if (ret <= 0)
+		return ret;
+	/* if first time for this file (ret > 0), invoke ->collect() */
+	if (file->f_op->collect)
+		ret = file->f_op->collect(ctx, file);
+	if (ret < 0)
+		ckpt_err(ctx, ret, "%(T)%(P)File collect\n", file);
+	return ret;
+}
+
+static int collect_file_desc(struct ckpt_ctx *ctx,
+			     struct files_struct *files, int fd)
+{
+	struct fdtable *fdt;
+	struct file *file;
+	int ret;
+
+	rcu_read_lock();
+	fdt = files_fdtable(files);
+	file = fcheck_files(files, fd);
+	if (file)
+		get_file(file);
+	rcu_read_unlock();
+
+	if (!file) {
+		ckpt_err(ctx, -EBUSY, "%(T)%(P)File removed\n", file);
+		return -EBUSY;
+	}
+
+	ret = ckpt_collect_file(ctx, file);
+	fput(file);
+
+	return ret;
+}
+
+static int collect_file_table(struct ckpt_ctx *ctx, struct files_struct *files)
+{
+	int *fdtable;
+	int nfds, n;
+	int ret;
+
+	/* if already exists (ret == 0), nothing to do */
+	ret = ckpt_obj_collect(ctx, files, CKPT_OBJ_FILE_TABLE);
+	if (ret <= 0)
+		return ret;
+
+	/* if first time for this file table (ret > 0), proceed inside */
+	nfds = scan_fds(files, &fdtable);
+	if (nfds < 0)
+		return nfds;
+
+	for (n = 0; n < nfds; n++) {
+		ret = collect_file_desc(ctx, files, fdtable[n]);
+		if (ret < 0)
+			break;
+	}
+
+	kfree(fdtable);
+	return ret;
+}
+
+int ckpt_collect_file_table(struct ckpt_ctx *ctx, struct task_struct *t)
+{
+	struct files_struct *files;
+	int ret;
+
+	files = get_files_struct(t);
+	if (!files) {
+		ckpt_err(ctx, -EBUSY, "%(T)files_struct missing\n");
+		return -EBUSY;
+	}
+	ret = collect_file_table(ctx, files);
+	put_files_struct(files);
+
+	return ret;
+}
+
+int ckpt_collect_fs(struct ckpt_ctx *ctx, struct task_struct *t)
+{
+	struct fs_struct *fs;
+	int ret;
+
+	task_lock(t);
+	fs = t->fs;
+	get_fs_struct(fs);
+	task_unlock(t);
+
+	ret = ckpt_obj_collect(ctx, fs, CKPT_OBJ_FS);
+
+	put_fs_struct(fs);
+	return ret;
+}
+
+/**************************************************************************
+ * Restart
+ */
+
+static int ckpt_read_fname(struct ckpt_ctx *ctx, char **fname)
+{
+	int len;
+
+	len = ckpt_read_payload(ctx, (void **) fname,
+				PATH_MAX, CKPT_HDR_FILE_NAME);
+	if (len < 0)
+		return len;
+
+	(*fname)[len - 1] = '\0';	/* always play if safe */
+	ckpt_debug("read filename '%s'\n", *fname);
+
+	return len;
+}
+
+/**
+ * restore_open_fname - read a file name and open a file
+ * @ctx: checkpoint context
+ * @flags: file flags
+ */
+struct file *restore_open_fname(struct ckpt_ctx *ctx, int flags)
+{
+	struct file *file;
+	char *fname;
+	int len;
+
+	/* prevent bad input from doing bad things */
+	if (flags & (O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC))
+		return ERR_PTR(-EINVAL);
+
+	len = ckpt_read_fname(ctx, &fname);
+	if (len < 0)
+		return ERR_PTR(len);
+	ckpt_debug("fname '%s' flags %#x\n", fname, flags);
+
+	file = filp_open(fname, flags, 0);
+	kfree(fname);
+
+	return file;
+}
+
+static int close_all_fds(struct files_struct *files)
+{
+	int *fdtable;
+	int nfds;
+
+	nfds = scan_fds(files, &fdtable);
+	if (nfds < 0)
+		return nfds;
+	while (nfds--)
+		sys_close(fdtable[nfds]);
+	kfree(fdtable);
+	return 0;
+}
+
+/**
+ * attach_file - attach a lonely file ptr to a file descriptor
+ * @file: lonely file pointer
+ */
+static int attach_file(struct file *file)
+{
+	int fd = get_unused_fd_flags(0);
+
+	if (fd >= 0) {
+		get_file(file);
+		fsnotify_open(file->f_path.dentry);
+		fd_install(fd, file);
+	}
+	return fd;
+}
+
+#define CKPT_SETFL_MASK  \
+	(O_APPEND | O_NONBLOCK | O_NDELAY | FASYNC | O_DIRECT | O_NOATIME)
+
+int restore_file_common(struct ckpt_ctx *ctx, struct file *file,
+			struct ckpt_hdr_file *h)
+{
+	fmode_t new_mode = file->f_mode;
+	fmode_t saved_mode = (__force fmode_t) h->f_mode;
+	int ret;
+	struct cred *cred;
+
+	/* FIX: need to restore owner etc */
+
+	/* restore the cred */
+	cred = ckpt_obj_fetch(ctx, h->f_credref, CKPT_OBJ_CRED);
+	if (IS_ERR(cred))
+		return PTR_ERR(cred);
+	put_cred(file->f_cred);
+	file->f_cred = get_cred(cred);
+
+	ret = security_restore_obj(ctx, (void *) file, CKPT_SECURITY_FILE,
+				   h->f_secref);
+	if (ret < 0) {
+		ckpt_err(ctx, ret, "file secref %(O)%(P)\n", h->f_secref,
+			 file);
+		return ret;
+	}
+
+	/* safe to set 1st arg (fd) to 0, as command is F_SETFL */
+	ret = vfs_fcntl(0, F_SETFL, h->f_flags & CKPT_SETFL_MASK, file);
+	if (ret < 0)
+		return ret;
+
+	/*
+	 * Normally f_mode is set by open, and modified only via
+	 * fcntl(), so its value now should match that at checkpoint.
+	 * However, a file may be downgraded from (read-)write to
+	 * read-only, e.g:
+	 *  - mark_files_ro() unsets FMODE_WRITE
+	 *  - nfs4_file_downgrade() too, and also sert FMODE_READ
+	 * Validate the new f_mode against saved f_mode, allowing:
+	 *  - new with FMODE_WRITE, saved without FMODE_WRITE
+	 *  - new without FMODE_READ, saved with FMODE_READ
+	 */
+	if ((new_mode & FMODE_WRITE) && !(saved_mode & FMODE_WRITE)) {
+		new_mode &= ~FMODE_WRITE;
+		if (!(new_mode & FMODE_READ) && (saved_mode & FMODE_READ))
+			new_mode |= FMODE_READ;
+	}
+	/* finally, at this point new mode should match saved mode */
+	if (new_mode ^ saved_mode)
+		return -EINVAL;
+
+	if (file->f_mode & FMODE_LSEEK)
+		ret = vfs_llseek(file, h->f_pos, SEEK_SET);
+
+	return ret;
+}
+
+static struct file *generic_file_restore(struct ckpt_ctx *ctx,
+					 struct ckpt_hdr_file *ptr)
+{
+	struct file *file;
+	int ret;
+
+	if (ptr->h.type != CKPT_HDR_FILE  ||
+	    ptr->h.len != sizeof(*ptr) || ptr->f_type != CKPT_FILE_GENERIC)
+		return ERR_PTR(-EINVAL);
+
+	file = restore_open_fname(ctx, ptr->f_flags);
+	if (IS_ERR(file))
+		return file;
+
+	ret = restore_file_common(ctx, file, ptr);
+	if (ret < 0) {
+		fput(file);
+		file = ERR_PTR(ret);
+	}
+	return file;
+}
+
+struct restore_file_ops {
+	char *file_name;
+	enum file_type file_type;
+	struct file * (*restore) (struct ckpt_ctx *ctx,
+				  struct ckpt_hdr_file *ptr);
+};
+
+static struct restore_file_ops restore_file_ops[] = {
+	/* ignored file */
+	{
+		.file_name = "IGNORE",
+		.file_type = CKPT_FILE_IGNORE,
+		.restore = NULL,
+	},
+	/* regular file/directory */
+	{
+		.file_name = "GENERIC",
+		.file_type = CKPT_FILE_GENERIC,
+		.restore = generic_file_restore,
+	},
+	/* pipes */
+	{
+		.file_name = "PIPE",
+		.file_type = CKPT_FILE_PIPE,
+		.restore = pipe_file_restore,
+	},
+	/* fifo */
+	{
+		.file_name = "FIFO",
+		.file_type = CKPT_FILE_FIFO,
+		.restore = fifo_file_restore,
+	},
+	/* socket */
+	{
+		.file_name = "SOCKET",
+		.file_type = CKPT_FILE_SOCKET,
+		.restore = sock_file_restore,
+	},
+	/* tty */
+	{
+		.file_name = "TTY",
+		.file_type = CKPT_FILE_TTY,
+		.restore = tty_file_restore,
+	},
+	/* epoll */
+	{
+		.file_name = "EPOLL",
+		.file_type = CKPT_FILE_EPOLL,
+		.restore = ep_file_restore,
+	},
+	/* eventfd */
+	{
+		.file_name = "EVENTFD",
+		.file_type = CKPT_FILE_EVENTFD,
+		.restore = eventfd_restore,
+	},
+};
+
+static struct file *do_restore_file(struct ckpt_ctx *ctx)
+{
+	struct restore_file_ops *ops;
+	struct ckpt_hdr_file *h;
+	struct file *file = ERR_PTR(-EINVAL);
+
+	/*
+	 * All 'struct ckpt_hdr_file_...' begin with ckpt_hdr_file,
+	 * but the actual object depends on the file type. The length
+	 * should never be more than page.
+	 */
+	h = ckpt_read_buf_type(ctx, PAGE_SIZE, CKPT_HDR_FILE);
+	if (IS_ERR(h))
+		return (struct file *) h;
+	ckpt_debug("flags %#x mode %#x type %d\n",
+		 h->f_flags, h->f_mode, h->f_type);
+
+	if (h->f_type >= CKPT_FILE_MAX)
+		goto out;
+
+	ops = &restore_file_ops[h->f_type];
+	BUG_ON(ops->file_type != h->f_type);
+
+	if (ops->restore)
+		file = ops->restore(ctx, h);
+ out:
+	ckpt_hdr_put(ctx, h);
+	return file;
+}
+
+/* restore callback for file pointer */
+void *restore_file(struct ckpt_ctx *ctx)
+{
+	return (void *) do_restore_file(ctx);
+}
+
+/**
+ * ckpt_read_file_desc - restore the state of a given file descriptor
+ * @ctx: checkpoint context
+ *
+ * Restores the state of a file descriptor; looks up the objref (in the
+ * header) in the hash table, and if found picks the matching file and
+ * use it; otherwise calls restore_file to restore the file too.
+ */
+static int restore_file_desc(struct ckpt_ctx *ctx)
+{
+	struct ckpt_hdr_file_desc *h;
+	struct file *file;
+	int newfd, ret;
+
+	h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_FILE_DESC);
+	if (IS_ERR(h))
+		return PTR_ERR(h);
+	ckpt_debug("ref %d fd %d c.o.e %d\n",
+		 h->fd_objref, h->fd_descriptor, h->fd_close_on_exec);
+
+	ret = -EINVAL;
+	if (h->fd_objref <= 0 || h->fd_descriptor < 0)
+		goto out;
+
+	file = ckpt_obj_fetch(ctx, h->fd_objref, CKPT_OBJ_FILE);
+	if (IS_ERR(file)) {
+		ret = PTR_ERR(file);
+		goto out;
+	}
+
+	newfd = attach_file(file);
+	if (newfd < 0) {
+		ret = newfd;
+		goto out;
+	}
+
+	ckpt_debug("newfd got %d wanted %d\n", newfd, h->fd_descriptor);
+
+	/* reposition if newfd isn't desired fd */
+	if (newfd != h->fd_descriptor) {
+		ret = sys_dup2(newfd, h->fd_descriptor);
+		if (ret < 0)
+			goto out;
+		sys_close(newfd);
+	}
+
+	set_close_on_exec(h->fd_descriptor, h->fd_close_on_exec);
+	ret = 0;
+ out:
+	ckpt_hdr_put(ctx, h);
+	return ret;
+}
+
+/* restore callback for file table */
+static struct files_struct *do_restore_file_table(struct ckpt_ctx *ctx)
+{
+	struct ckpt_hdr_file_table *h;
+	struct files_struct *files;
+	int i, ret;
+
+	h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_FILE_TABLE);
+	if (IS_ERR(h))
+		return (struct files_struct *) h;
+
+	ckpt_debug("nfds %d\n", h->fdt_nfds);
+
+	ret = -EMFILE;
+	if (h->fdt_nfds < 0 || h->fdt_nfds > sysctl_nr_open)
+		goto out;
+
+	/*
+	 * We assume that restarting tasks, as created in user-space,
+	 * have distinct files_struct objects each. If not, we need to
+	 * call dup_fd() to make sure we don't overwrite an already
+	 * restored one.
+	 */
+
+	/* point of no return -- close all file descriptors */
+	ret = close_all_fds(current->files);
+	if (ret < 0)
+		goto out;
+
+	for (i = 0; i < h->fdt_nfds; i++) {
+		ret = restore_file_desc(ctx);
+		if (ret < 0)
+			goto out;
+	}
+
+	ret = deferqueue_run(ctx->files_deferq);
+	ckpt_debug("files_deferq ran %d entries\n", ret);
+	if (ret > 0)
+		ret = 0;
+ out:
+	ckpt_hdr_put(ctx, h);
+	if (!ret) {
+		files = current->files;
+		atomic_inc(&files->count);
+	} else {
+		files = ERR_PTR(ret);
+	}
+	return files;
+}
+
+void *restore_file_table(struct ckpt_ctx *ctx)
+{
+	return (void *) do_restore_file_table(ctx);
+}
+
+int restore_obj_file_table(struct ckpt_ctx *ctx, int files_objref)
+{
+	struct files_struct *files;
+
+	files = ckpt_obj_fetch(ctx, files_objref, CKPT_OBJ_FILE_TABLE);
+	if (IS_ERR(files))
+		return PTR_ERR(files);
+
+	if (files != current->files) {
+		task_lock(current);
+		put_files_struct(current->files);
+		current->files = files;
+		task_unlock(current);
+		atomic_inc(&files->count);
+	}
+
+	return 0;
+}
+
+/*
+ * Called by task restore code to set the restarted task's
+ * current->fs to an entry on the hash
+ */
+int restore_obj_fs(struct ckpt_ctx *ctx, int fs_objref)
+{
+	struct fs_struct *newfs, *oldfs;
+
+	newfs = ckpt_obj_fetch(ctx, fs_objref, CKPT_OBJ_FS);
+	if (IS_ERR(newfs))
+		return PTR_ERR(newfs);
+
+	task_lock(current);
+	get_fs_struct(newfs);
+	oldfs = current->fs;
+	current->fs = newfs;
+	task_unlock(current);
+	put_fs_struct(oldfs);
+
+	return 0;
+}
+
+static int restore_chroot(struct ckpt_ctx *ctx, struct fs_struct *fs, char *name)
+{
+	struct nameidata nd;
+	int ret;
+
+	ckpt_debug("attempting chroot to %s\n", name);
+	ret = path_lookup(name, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &nd);
+	if (ret) {
+		ckpt_err(ctx, ret, "%(T)Opening chroot dir %s", name);
+		return ret;
+	}
+	ret = do_chroot(fs, &nd.path);
+	path_put(&nd.path);
+	if (ret) {
+		ckpt_err(ctx, ret, "%(T)Setting chroot %s", name);
+		return ret;
+	}
+	return 0;
+}
+
+static int restore_cwd(struct ckpt_ctx *ctx, struct fs_struct *fs, char *name)
+{
+	struct nameidata nd;
+	int ret;
+
+	ckpt_debug("attempting chdir to %s\n", name);
+	ret = path_lookup(name, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &nd);
+	if (ret) {
+		ckpt_err(ctx, ret, "%(T)Opening cwd %s", name);
+		return ret;
+	}
+	ret = do_chdir(fs, &nd.path);
+	path_put(&nd.path);
+	if (ret) {
+		ckpt_err(ctx, ret, "%(T)Setting cwd %s", name);
+		return ret;
+	}
+	return 0;
+}
+
+/*
+ * Called by objhash when it runs into a CKPT_OBJ_FS entry. Creates
+ * an fs_struct with desired chroot/cwd and places it in the hash.
+ */
+static struct fs_struct *do_restore_fs(struct ckpt_ctx *ctx)
+{
+	struct ckpt_hdr_fs *h;
+	struct fs_struct *fs;
+	char *path;
+	int ret = 0;
+
+	h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_FS);
+	if (IS_ERR(h))
+		return ERR_PTR(PTR_ERR(h));
+	ckpt_hdr_put(ctx, h);
+
+	fs = copy_fs_struct(current->fs);
+	if (!fs)
+		return ERR_PTR(-ENOMEM);
+
+	ret = ckpt_read_fname(ctx, &path);
+	if (ret < 0)
+		goto out;
+	ret = restore_cwd(ctx, fs, path);
+	kfree(path);
+	if (ret)
+		goto out;
+
+	ret = ckpt_read_fname(ctx, &path);
+	if (ret < 0)
+		goto out;
+	ret = restore_chroot(ctx, fs, path);
+	kfree(path);
+
+out:
+	if (ret) {
+		free_fs_struct(fs);
+		return ERR_PTR(ret);
+	}
+	return fs;
+}
+
+void *restore_fs(struct ckpt_ctx *ctx)
+{
+	return (void *) do_restore_fs(ctx);
+}
-- 
1.6.3.3

_______________________________________________
Containers mailing list
Containers at lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers




More information about the Devel mailing list