[Devel] Re: [RFC v2][PATCH 1/9] checkpoint-restart: general infrastructure

Matt Helsley matthltc at us.ibm.com
Thu Aug 28 20:34:58 PDT 2008


On Wed, 2008-08-20 at 12:25 -0700, Dave Hansen wrote:
> This patch adds those interfaces, as well as all of the helpers
> needed to easily manage the file format.
> 
> The code is roughly broken out as follows:
> 
> ckpt/sys.c - user/kernel data transfer, as well as setting up of the
> 	     checkpoint/restart context (a per-checkpoint data
> 	     structure for housekeeping)
> ckpt/checkpoint.c - output wrappers and basic checkpoint handling
> ckpt/restart.c - input wrappers and basic restart handling
> 
> Patches to add the per-architecture support as well as the actual
> work to do the memory checkpoint follow in subsequent patches.
> 
> changes from last version:
> - Moved over to pr_debug() from CR_PRINTK()
> - Moved magic number over to linux/magic.h
> 
> TODO:
> - Investigate using anon_inodes for the sys_checkpoint() side
> - Move all the structure declarations to somewhere that we
>   can easily export them to userspace.
> - Lots of ABI issues to work out.
> 
> Signed-off-by: Oren Laadan <orenl at cs.columbia.edu>
> ---
> 
>  oren-cr.git-dave/Makefile                |    2 
>  oren-cr.git-dave/checkpoint/Makefile     |    1 
>  oren-cr.git-dave/checkpoint/checkpoint.c |  208 +++++++++++++++++++++++++++
>  oren-cr.git-dave/checkpoint/ckpt.h       |   71 +++++++++
>  oren-cr.git-dave/checkpoint/ckpt_hdr.h   |   69 +++++++++
>  oren-cr.git-dave/checkpoint/restart.c    |  190 +++++++++++++++++++++++++
>  oren-cr.git-dave/checkpoint/sys.c        |  233 +++++++++++++++++++++++++++++++
>  oren-cr.git-dave/include/linux/magic.h   |    2 
>  8 files changed, 775 insertions(+), 1 deletion(-)
> 
> diff -puN /dev/null checkpoint/checkpoint.c
> --- /dev/null	2008-04-22 10:49:52.000000000 -0700
> +++ oren-cr.git-dave/checkpoint/checkpoint.c	2008-08-20 12:12:48.000000000 -0700
> @@ -0,0 +1,208 @@
> +/*
> + *  Checkpoint logic and helpers
> + *
> + *  Copyright (C) 2008 Oren Laadan
> + *
> + *  This file is subject to the terms and conditions of the GNU General Public
> + *  License.  See the file COPYING in the main directory of the Linux
> + *  distribution for more details.
> + */
> +
> +#include <linux/version.h>
> +#include <linux/sched.h>
> +#include <linux/time.h>
> +#include <linux/fs.h>
> +#include <linux/file.h>
> +#include <linux/dcache.h>
> +#include <linux/magic.h>
> +#include <linux/mount.h>
> +#include <asm/ptrace.h>
> +
> +#include "ckpt.h"
> +#include "ckpt_hdr.h"
> +
> +/**
> + * cr_get_fname - return pathname of a given file
> + * @file: file pointer
> + * @buf: buffer for pathname
> + * @n: buffer length (in) and pathname length (out)
> + *
> + * if the buffer provided by the caller is too small, allocate a new
> + * buffer; caller should call cr_put_fname() for cleanup
> + */
> +char *cr_get_fname(struct path *path, struct path *root, char *buf, int *n)
> +{
> +	char *fname;
> +
> +	fname = __d_path(path, root, buf, *n);
> +
> +	if (IS_ERR(fname) && PTR_ERR(fname) == -ENAMETOOLONG) {
> +		 if (!(buf = (char *) __get_free_pages(GFP_KERNEL, 0)))
> +			 return ERR_PTR(-ENOMEM);
> +		fname = __d_path(path, root, buf, PAGE_SIZE);
> +		if (IS_ERR(fname))
> +			free_pages((unsigned long) buf, 0);
> +	}
> +	if (!IS_ERR(fname))
> +		*n = (buf + *n - fname);
> +
> +	return fname;
> +}
> +
> +/**
> + * cr_put_fname - (possibly) cleanup pathname buffer
> + * @buf: original buffer that was given to cr_get_fname()
> + * @fname: resulting pathname from cr_get_fname()
> + * @n: length of original buffer
> + */
> +void cr_put_fname(char *buf, char *fname, int n)
> +{
> +	if (fname && (fname < buf || fname >= buf + n))
> +		free_pages((unsigned long) buf, 0);
> +}
> +
> +/**
> + * cr_write_obj - write a record described by a cr_hdr
> + * @ctx: checkpoint context
> + * @h: record descriptor
> + * @buf: record buffer
> + */
> +int cr_write_obj(struct cr_ctx *ctx, struct cr_hdr *h, void *buf)
> +{
> +	int ret;
> +
> +	if ((ret = cr_kwrite(ctx, h, sizeof(*h))) < 0)
> +		return ret;
> +	return cr_kwrite(ctx, buf, h->len);
> +}
> +
> +/**
> + * cr_write_str - write a string record
> + * @ctx: checkpoint context
> + * @str: string buffer
> + * @n: string length
> + */
> +int cr_write_str(struct cr_ctx *ctx, char *str, int n)
> +{
> +	struct cr_hdr h;
> +
> +	h.type = CR_HDR_STR;
> +	h.len = n;
> +	h.id = 0;
> +
> +	return cr_write_obj(ctx, &h, str);
> +}
> +
> +/* write the checkpoint header */
> +static int cr_write_hdr(struct cr_ctx *ctx)
> +{
> +	struct cr_hdr h;
> +	struct cr_hdr_head *hh = ctx->tbuf;
> +	struct timeval ktv;
> +
> +	h.type = CR_HDR_HEAD;
> +	h.len = sizeof(hh);
> +	h.id = 0;
> +
> +	do_gettimeofday(&ktv);
> +
> +	hh->magic = CR_HEADER_MAGIC;
> +	hh->major = (LINUX_VERSION_CODE >> 16) & 0xff;
> +	hh->minor = (LINUX_VERSION_CODE >> 8) & 0xff;
> +	hh->patch = (LINUX_VERSION_CODE) & 0xff;
> +
> +	hh->version = 1;
> +
> +	hh->flags = ctx->flags;
> +	hh->time = ktv.tv_sec;
> +
> +	return cr_write_obj(ctx, &h, hh);
> +}
> +
> +/* write the checkpoint trailer */
> +static int cr_write_tail(struct cr_ctx *ctx)
> +{
> +	struct cr_hdr h;
> +	struct cr_hdr_tail *hh = ctx->tbuf;
> +
> +	h.type = CR_HDR_TAIL;
> +	h.len = sizeof(*hh);
> +	h.id = 0;
> +
> +	hh->magic = CR_HEADER_MAGIC;
> +	hh->cksum[0] = hh->cksum[1] = 1;	/* TBD ... */
> +
> +	return cr_write_obj(ctx, &h, hh);
> +}
> +
> +/* dump the task_struct of a given task */
> +static int cr_write_task_struct(struct cr_ctx *ctx, struct task_struct *t)
> +{
> +	struct cr_hdr h;
> +	struct cr_hdr_task *hh = ctx->tbuf;
> +
> +	h.type = CR_HDR_TASK;
> +	h.len = sizeof(*hh);
> +	h.id = ctx->pid;
> +
> +	hh->state = t->state;
> +	hh->exit_state = t->exit_state;
> +	hh->exit_code = t->exit_code;
> +	hh->exit_signal = t->exit_signal;
> +
> +	hh->pid = t->pid;
> +	hh->tgid = t->tgid;
> +
> +	hh->utime = t->utime;
> +	hh->stime = t->stime;
> +	hh->utimescaled = t->utimescaled;
> +	hh->stimescaled = t->stimescaled;
> +	hh->gtime = t->gtime;
> +	hh->prev_utime = t->prev_utime;
> +	hh->prev_stime = t->prev_stime;
> +	hh->nvcsw = t->nvcsw;
> +	hh->nivcsw = t->nivcsw;
> +	hh->start_time_sec = t->start_time.tv_sec;
> +	hh->start_time_nsec = t->start_time.tv_nsec;
> +	hh->real_start_time_sec = t->real_start_time.tv_sec;
> +	hh->real_start_time_nsec = t->real_start_time.tv_nsec;
> +	hh->min_flt = t->min_flt;
> +	hh->maj_flt = t->maj_flt;
> +
> +	hh->task_comm_len = TASK_COMM_LEN;
> +	memcpy(hh->comm, t->comm, TASK_COMM_LEN);
> +
> +	return cr_write_obj(ctx, &h, hh);
> +}
> +
> +/* dump the entire state of a given task */
> +static int cr_write_task(struct cr_ctx *ctx, struct task_struct *t)
> +{
> +	int ret ;
> +
> +	BUG_ON(t->state == TASK_DEAD);
> +
> +	ret = cr_write_task_struct(ctx, t);
> +	pr_debug("ret (task_struct) %d\n", ret);
> +
> +	return ret;
> +}
> +
> +int do_checkpoint(struct cr_ctx *ctx)
> +{
> +	int ret;
> +
> +	/* FIX: need to test whether container is checkpointable */
> +
> +	ret = cr_write_hdr(ctx);
> +	if (!ret)
> +		ret = cr_write_task(ctx, current);
> +	if (!ret)
> +		ret = cr_write_tail(ctx);
> +
> +	/* on success, return (unique) checkpoint identifier */
> +	if (!ret)
> +		ret = ctx->crid;
> +
> +	return ret;

How about conforming to existing kernel style by inverting the ret tests
and using goto here:

	ret = cr_write_hdr(ctx);
	if (ret)
		goto out;
	ret = cr_write_task(ctx, current);
	if (ret)
		goto out;
	ret = cr_write_tail(ctx);
	if (ret)
		goto out;
	ret = ctx->crid;
out:
	return ret;

That way, once a step fails we jump straight to the exit, so we never
need to (re)test ret before the steps that follow. Granted, the compiler
output is probably the same, but I think this is much more readable for
reviewers. It may also be easier to maintain, since you won't have to
test ret before adding new code here.

> +}
> diff -puN /dev/null checkpoint/ckpt.h
> --- /dev/null	2008-04-22 10:49:52.000000000 -0700
> +++ oren-cr.git-dave/checkpoint/ckpt.h	2008-08-20 12:12:48.000000000 -0700
> @@ -0,0 +1,71 @@
> +#ifndef _CKPT_CKPT_H_
> +#define _CKPT_CKPT_H_
> +/*
> + *  Generic container checkpoint-restart
> + *
> + *  Copyright (C) 2008 Oren Laadan
> + *
> + *  This file is subject to the terms and conditions of the GNU General Public
> + *  License.  See the file COPYING in the main directory of the Linux
> + *  distribution for more details.
> + */
> +
> +#include <linux/path.h>
> +#include <linux/fs.h>
> +
> +struct cr_pgarr;
> +
> +struct cr_ctx {
> +	pid_t pid;		/* container identifier */
> +	int crid;		/* unique checkpoint id */
> +
> +	unsigned long flags;
> +	unsigned long oflags;	/* restart: old flags */
> +
> +	struct file *file;
> +	int total;		/* total read/written */
> +
> +	void *tbuf;		/* temp: to avoid many alloc/dealloc */
> +	void *hbuf;		/* header: to avoid many alloc/dealloc */
> +	int hpos;
> +
> +	struct cr_pgarr *pgarr;
> +	struct cr_pgarr *pgcur;
> +
> +	struct path *vfsroot;	/* container root */
> +};
> +
> +/* cr_ctx: flags */
> +#define CR_CTX_CKPT	0x1
> +#define CR_CTX_RSTR	0x2
> +
> +/* allocation defaults */
> +#define CR_ORDER_TBUF  1
> +#define CR_ORDER_HBUF  1
> +
> +#define CR_TBUF_TOTAL  ((PAGE_SIZE << CR_ORDER_TBUF) / sizeof(void *))
> +#define CR_HBUF_TOTAL  ((PAGE_SIZE << CR_ORDER_HBUF) / sizeof(void *))
> +
> +extern void cr_put_fname(char *buf, char *fname, int n);
> +extern char *cr_get_fname(struct path *path, struct path *root, char *buf, int *n);
> +
> +extern int cr_uwrite(struct cr_ctx *ctx, void *buf, int count);
> +extern int cr_kwrite(struct cr_ctx *ctx, void *buf, int count);
> +extern int cr_uread(struct cr_ctx *ctx, void *buf, int count);
> +extern int cr_kread(struct cr_ctx *ctx, void *buf, int count);
> +
> +struct cr_hdr;
> +
> +extern int cr_write_obj(struct cr_ctx *ctx, struct cr_hdr *h, void *buf);
> +extern int cr_write_str(struct cr_ctx *ctx, char *str, int n);
> +extern int cr_write_mm(struct cr_ctx *ctx, struct task_struct *t);
> +
> +extern int cr_read_obj(struct cr_ctx *ctx, struct cr_hdr *h, void *buf, int n);
> +extern int cr_read_obj_type(struct cr_ctx *ctx, void *buf, int n, int type);
> +extern int cr_read_str(struct cr_ctx *ctx, void *str, int n);
> +extern int cr_read_mm(struct cr_ctx *ctx);
> +
> +extern int do_checkpoint(struct cr_ctx *ctx);
> +extern int do_restart(struct cr_ctx *ctx);
> +
> +#endif /* _CKPT_CKPT_H_ */
> diff -puN /dev/null checkpoint/ckpt_hdr.h
> --- /dev/null	2008-04-22 10:49:52.000000000 -0700
> +++ oren-cr.git-dave/checkpoint/ckpt_hdr.h	2008-08-20 12:12:48.000000000 -0700
> @@ -0,0 +1,69 @@
> +/*
> + *  Generic container checkpoint-restart
> + *
> + *  Copyright (C) 2008 Oren Laadan
> + *
> + *  This file is subject to the terms and conditions of the GNU General Public
> + *  License.  See the file COPYING in the main directory of the Linux
> + *  distribution for more details.
> + */
> +
> +#include <linux/types.h>
> +
> +struct cr_hdr {
> +	__s16 type;
> +	__s16 len;
> +	__u32 id;
> +};
> +
> +enum {
> +	CR_HDR_HEAD = 1,
> +	CR_HDR_STR,
> +
> +	CR_HDR_TASK = 101,
> +	CR_HDR_THREAD,
> +	CR_HDR_CPU,
> +
> +	CR_HDR_MM = 201,
> +	CR_HDR_VMA,
> +	CR_HDR_MM_CONTEXT,
> +
> +	CR_HDR_TAIL = 5001
> +};
> +
> +struct cr_hdr_head {
> +	__u32 magic;
> +	__u16 major;
> +	__u16 minor;
> +	__u16 patch;
> +	__u16 version;
> +	__u32 flags;	/* checkpoint options */
> +	__u64 time;	/* when checkpoint taken */
> +};
> +
> +struct cr_hdr_tail {
> +	__u32 magic;
> +	__u32 cksum[2];
> +};
> +
> +struct cr_hdr_task {
> +	__u64 state;
> +	__u32 exit_state;
> +	__u32 exit_code, exit_signal;
> +
> +	__u16 pid;
> +	__u16 tgid;
> +
> +	__u64 utime, stime, utimescaled, stimescaled;
> +	__u64 gtime;
> +	__u64 prev_utime, prev_stime;
> +	__u64 nvcsw, nivcsw;
> +	__u64 start_time_sec, start_time_nsec;
> +	__u64 real_start_time_sec, real_start_time_nsec;
> +	__u64 min_flt, maj_flt;
> +
> +	__s16 task_comm_len;
> +	char comm[TASK_COMM_LEN];
> +};
> +
> +
> diff -puN /dev/null checkpoint/Makefile
> --- /dev/null	2008-04-22 10:49:52.000000000 -0700
> +++ oren-cr.git-dave/checkpoint/Makefile	2008-08-20 12:12:48.000000000 -0700
> @@ -0,0 +1 @@
> +obj-y += sys.o checkpoint.o restart.o
> diff -puN /dev/null checkpoint/restart.c
> --- /dev/null	2008-04-22 10:49:52.000000000 -0700
> +++ oren-cr.git-dave/checkpoint/restart.c	2008-08-20 12:12:48.000000000 -0700
> @@ -0,0 +1,190 @@
> +/*
> + *  Restart logic and helpers
> + *
> + *  Copyright (C) 2008 Oren Laadan
> + *
> + *  This file is subject to the terms and conditions of the GNU General Public
> + *  License.  See the file COPYING in the main directory of the Linux
> + *  distribution for more details.
> + */
> +
> +/*
> + * During restart the code reads in data from the checkpoint image into a
> + * temporary buffer (ctx->hbuf). Because operations can be nested, one
> + * should call cr_hbuf_get() to reserve space in the buffer, and then
> + * cr_hbuf_put() when it no longer needs that space
> + */
> +
> +#include <linux/version.h>
> +#include <linux/magic.h>
> +#include <linux/sched.h>
> +#include <linux/file.h>
> +
> +#include "ckpt.h"
> +#include "ckpt_hdr.h"
> +
> +/**
> + * cr_hbuf_get - reserve space on the hbuf
> + * @ctx: checkpoint context
> + * @n: number of bytes to reserve
> + */
> +void *cr_hbuf_get(struct cr_ctx *ctx, int n)
> +{
> +	void *ptr;
> +
> +	BUG_ON(ctx->hpos + n > CR_HBUF_TOTAL);
> +	ptr = (void *) (((char *) ctx->hbuf) + ctx->hpos);
> +	ctx->hpos += n;
> +	return ptr;
> +}
> +
> +/**
> + * cr_hbuf_put - unreserve space on the hbuf
> + * @ctx: checkpoint context
> + * @n: number of bytes to unreserve
> + */
> +void cr_hbuf_put(struct cr_ctx *ctx, int n)
> +{
> +	BUG_ON(ctx->hpos < n);
> +	ctx->hpos -= n;
> +}
> +
> +/**
> + * cr_read_obj - read a whole record (cr_hdr followed by payload)
> + * @ctx: checkpoint context
> + * @h: record descriptor
> + * @buf: record buffer
> + * @n: available buffer size
> + */
> +int cr_read_obj(struct cr_ctx *ctx, struct cr_hdr *h, void *buf, int n)
> +{
> +	int ret;
> +
> +	ret = cr_kread(ctx, h, sizeof(*h));
> +	if (ret < 0)
> +		return ret;
> +
> +	pr_debug("type %d len %d id %d (%d)\n", h->type, h->len, h->id, n);
> +	if (h->len < 0 || h->len > n)
> +		return -EINVAL;
> +
> +	return cr_kread(ctx, buf, h->len);
> +}
> +
> +/**
> + * cr_read_obj_type - read a whole record of expected type
> + * @ctx: checkpoint context
> + * @buf: record buffer
> + * @n: available buffer size
> + * @type: expected record type
> + */
> +int cr_read_obj_type(struct cr_ctx *ctx, void *buf, int n, int type)
> +{
> +	struct cr_hdr h;
> +	int ret;
> +
> +	ret = cr_read_obj(ctx, &h, buf, n);
> +	if (!ret)
> +		ret = (h.type == type ? h.id : -EINVAL);
> +	return ret;
> +}
> +
> +/**
> + * cr_read_str - read a string record
> + * @ctx: checkpoint context
> + * @str: string buffer
> + * @n: string length
> + */
> +int cr_read_str(struct cr_ctx *ctx, void *str, int n)
> +{
> +	return cr_read_obj_type(ctx, str, n, CR_HDR_STR);
> +}
> +
> +/* read the checkpoint header */
> +static int cr_read_hdr(struct cr_ctx *ctx)
> +{
> +	struct cr_hdr_head *hh = cr_hbuf_get(ctx, sizeof(*hh));
> +	int ret;
> +
> +	ret = cr_read_obj_type(ctx, hh, sizeof(*hh), CR_HDR_HEAD);
> +	if (ret < 0)
> +		return ret;
> +
> +	if (hh->magic != CR_HEADER_MAGIC || hh->version != 1 ||
> +	    hh->major != ((LINUX_VERSION_CODE >> 16) & 0xff) ||
> +	    hh->minor != ((LINUX_VERSION_CODE >> 8) & 0xff) ||
> +	    hh->patch != ((LINUX_VERSION_CODE) & 0xff))
> +		return -EINVAL;
> +
> +	if (hh->flags & ~CR_CTX_CKPT)
> +		return -EINVAL;
> +
> +	ctx->oflags = hh->flags;
> +
> +	cr_hbuf_put(ctx, sizeof(*hh));
> +	return 0;
> +}
> +
> +/* read the checkpoint trailer */
> +static int cr_read_tail(struct cr_ctx *ctx)
> +{
> +	struct cr_hdr_tail *hh = cr_hbuf_get(ctx, sizeof(*hh));
> +	int ret;
> +
> +	ret = cr_read_obj_type(ctx, hh, sizeof(*hh), CR_HDR_TAIL);
> +	if (ret < 0)
> +		return ret;
> +
> +	if (hh->magic != CR_HEADER_MAGIC ||
> +	    hh->cksum[0] != 1 || hh->cksum[1] != 1)
> +		return -EINVAL;
> +
> +	cr_hbuf_put(ctx, sizeof(*hh));
> +	return 0;
> +}
> +
> +/* read the task_struct into the current task */
> +static int cr_read_task_struct(struct cr_ctx *ctx)
> +{
> +	struct cr_hdr_task *hh = cr_hbuf_get(ctx, sizeof(*hh));
> +	struct task_struct *t = current;
> +	int ret;
> +
> +	ret = cr_read_obj_type(ctx, hh, sizeof(*hh), CR_HDR_TASK);
> +	if (ret < 0)
> +		return ret;
> +
> +	/* for now, only restore t->comm */
> +	if (hh->task_comm_len < 0 || hh->task_comm_len > TASK_COMM_LEN)
> +		return -EINVAL;
> +
> +	memset(t->comm, 0, TASK_COMM_LEN);
> +	memcpy(t->comm, hh->comm, hh->task_comm_len);
> +
> +	cr_hbuf_put(ctx, sizeof(*hh));
> +	return 0;
> +}
> +
> +/* read the entire state of the current task */
> +static int cr_read_task(struct cr_ctx *ctx)
> +{
> +	int ret;
> +
> +	ret = cr_read_task_struct(ctx);
> +	pr_debug("ret (task_struct) %d\n", ret);
> +
> +	return ret;
> +}
> +
> +int do_restart(struct cr_ctx *ctx)
> +{
> +	int ret;
> +
> +	ret = cr_read_hdr(ctx);
> +	if (!ret)
> +		ret = cr_read_task(ctx);
> +	if (!ret)
> +		ret = cr_read_tail(ctx);

Same comment as above: invert the ret tests and use goto here too.

> +
> +	return ret;
> +}
> diff -puN /dev/null checkpoint/sys.c
> --- /dev/null	2008-04-22 10:49:52.000000000 -0700
> +++ oren-cr.git-dave/checkpoint/sys.c	2008-08-20 12:12:48.000000000 -0700
> @@ -0,0 +1,233 @@
> +/*
> + *  Generic container checkpoint-restart
> + *
> + *  Copyright (C) 2008 Oren Laadan
> + *
> + *  This file is subject to the terms and conditions of the GNU General Public
> + *  License.  See the file COPYING in the main directory of the Linux
> + *  distribution for more details.
> + */
> +
> +#include <linux/sched.h>
> +#include <linux/fs.h>
> +#include <linux/file.h>
> +#include <linux/uaccess.h>
> +#include <linux/capability.h>
> +
> +#include "ckpt.h"
> +
> +/*
> + * helpers to write/read to/from the image file descriptor
> + *
> + *   cr_uwrite() - write a user-space buffer to the checkpoint image
> + *   cr_kwrite() - write a kernel-space buffer to the checkpoint image
> + *   cr_uread() - read from the checkpoint image to a user-space buffer
> + *   cr_kread() - read from the checkpoint image to a kernel-space buffer
> + *
> + */
> +
> +/* (temporarily added file_pos_read() and file_pos_write() because they
> + * are static in fs/read_write.c... should cleanup and remove later) */
> +static inline loff_t file_pos_read(struct file *file)
> +{
> +	return file->f_pos;
> +}
> +
> +static inline void file_pos_write(struct file *file, loff_t pos)
> +{
> +	file->f_pos = pos;
> +}
> +
> +int cr_uwrite(struct cr_ctx *ctx, void *buf, int count)
> +{
> +	struct file *file = ctx->file;
> +	ssize_t nwrite;
> +	int nleft;
> +
> +	for (nleft = count; nleft; nleft -= nwrite) {
> +		loff_t pos = file_pos_read(file);
> +		nwrite = vfs_write(file, (char __user *) buf, nleft, &pos);
> +		file_pos_write(file, pos);
> +		if (unlikely(nwrite <= 0))	/* zero tolerance */
> +			return (nwrite ? : -EIO);
> +		buf += nwrite;
> +	}
> +
> +	ctx->total += count;
> +	return 0;
> +}
> +
> +int cr_kwrite(struct cr_ctx *ctx, void *buf, int count)
> +{
> +	mm_segment_t oldfs;
> +	int ret;
> +
> +	oldfs = get_fs();
> +	set_fs(KERNEL_DS);
> +	ret = cr_uwrite(ctx, buf, count);
> +	set_fs(oldfs);
> +
> +	return ret;
> +}
> +
> +int cr_uread(struct cr_ctx *ctx, void *buf, int count)
> +{
> +	struct file *file = ctx->file;
> +	ssize_t nread;
> +	int nleft;
> +
> +	for (nleft = count; nleft; nleft -= nread) {
> +		loff_t pos = file_pos_read(file);
> +		nread = vfs_read(file, (char __user *) buf, nleft, &pos);
> +		file_pos_write(file, pos);
> +		if (unlikely(nread <= 0))	/* zero tolerance */
> +			return (nread ? : -EIO);
> +		buf += nread;
> +	}
> +
> +	ctx->total += count;
> +	return 0;
> +}
> +
> +int cr_kread(struct cr_ctx *ctx, void *buf, int count)
> +{
> +	mm_segment_t oldfs;
> +	int ret;
> +
> +	oldfs = get_fs();
> +	set_fs(KERNEL_DS);
> +	ret = cr_uread(ctx, buf, count);
> +	set_fs(oldfs);
> +
> +	return ret;
> +}
> +
> +
> +/*
> + * helpers to manage CR contexts: allocated for each checkpoint and/or
> + * restart operation, and persists until the operation is completed.
> + */
> +
> +static atomic_t cr_ctx_count;	/* unique checkpoint identifier */
> +
> +void cr_ctx_free(struct cr_ctx *ctx)
> +{
> +
> +	if (ctx->file)
> +		fput(ctx->file);
> +	if (ctx->vfsroot)
> +		path_put(ctx->vfsroot);
> +
> +	free_pages((unsigned long) ctx->tbuf, CR_ORDER_TBUF);
> +	free_pages((unsigned long) ctx->hbuf, CR_ORDER_HBUF);
> +
> +	kfree(ctx);
> +}
> +
> +struct cr_ctx *cr_ctx_alloc(pid_t pid, struct file *file, unsigned long flags)
> +{
> +	struct cr_ctx *ctx;
> +
> +	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
> +	if (!ctx)
> +		return NULL;
> +
> +	ctx->tbuf = (void *) __get_free_pages(GFP_KERNEL, CR_ORDER_TBUF);
> +	ctx->hbuf = (void *) __get_free_pages(GFP_KERNEL, CR_ORDER_HBUF);
> +	if (!ctx->tbuf || !ctx->hbuf)
> +		goto nomem;
> +
> +	ctx->pid = pid;
> +	ctx->flags = flags;
> +
> +	ctx->file = file;
> +	get_file(file);
> +
> +	/* assume checkpointer is in container's root vfs */
> +	ctx->vfsroot = &current->fs->root;
> +	path_get(ctx->vfsroot);
> +
> +	ctx->crid = atomic_inc_return(&cr_ctx_count);
> +
> +	return ctx;
> +
> + nomem:
> +	cr_ctx_free(ctx);
> +	return NULL;
> +}
> +
> +/**
> + * sys_checkpoint - checkpoint a container
> + * @pid: pid of the container init(1) process
> + * @fd: file to which dump the checkpoint image
> + * @flags: checkpoint operation flags
> + */
> +asmlinkage long sys_checkpoint(pid_t pid, int fd, unsigned long flags)
> +{
> +	struct cr_ctx *ctx;
> +	struct file *file;
> +	int fput_needed;
> +	int ret;
> +
> +	if (!capable(CAP_SYS_ADMIN))
> +		return -EPERM;
> +
> +	file = fget_light(fd, &fput_needed);
> +	if (!file)
> +		return -EBADF;
> +
> +	/* no flags for now */
> +	if (flags)
> +		return -EINVAL;
> +
> +	ctx = cr_ctx_alloc(pid, file, flags | CR_CTX_CKPT);
> +	if (!ctx) {
> +		fput_light(file, fput_needed);
> +		return -ENOMEM;
> +	}
> +
> +	ret = do_checkpoint(ctx);
> +
> +	cr_ctx_free(ctx);
> +	fput_light(file, fput_needed);
> +	pr_debug("ckpt retval = %d\n", ret);
> +	return ret;
> +}
> +
> +/**
> + * sys_restart - restart a container
> + * @crid: checkpoint image identifier
> + * @fd: file from which read the checkpoint image
> + * @flags: restart operation flags
> + */
> +asmlinkage long sys_restart(int crid, int fd, unsigned long flags)
> +{
> +	struct cr_ctx *ctx;
> +	struct file *file;
> +	int fput_needed;
> +	int ret;
> +
> +	if (!capable(CAP_SYS_ADMIN))
> +		return -EPERM;
> +
> +	file = fget_light(fd, &fput_needed);
> +	if (!file)
> +		return -EBADF;
> +
> +	/* no flags for now */
> +	if (flags)
> +		return -EINVAL;
> +
> +	ctx = cr_ctx_alloc(crid, file, flags | CR_CTX_RSTR);
> +	if (!ctx) {
> +		fput_light(file, fput_needed);
> +		return -ENOMEM;
> +	}
> +
> +	ret = do_restart(ctx);
> +
> +	cr_ctx_free(ctx);
> +	fput_light(file, fput_needed);
> +	pr_debug("restart retval = %d\n", ret);
> +	return ret;
> +}
> diff -puN include/linux/magic.h~0001-checkpoint-restart-general-infrastructure include/linux/magic.h
> --- oren-cr.git/include/linux/magic.h~0001-checkpoint-restart-general-infrastructure	2008-08-20 12:12:48.000000000 -0700
> +++ oren-cr.git-dave/include/linux/magic.h	2008-08-20 12:12:48.000000000 -0700
> @@ -42,4 +42,6 @@
>  #define FUTEXFS_SUPER_MAGIC	0xBAD1DEA
>  #define INOTIFYFS_SUPER_MAGIC	0x2BAD1DEA
> 
> +#define CR_HEADER_MAGIC		0x002d2a00
> +
>  #endif /* __LINUX_MAGIC_H__ */
> diff -puN Makefile~0001-checkpoint-restart-general-infrastructure Makefile
> --- oren-cr.git/Makefile~0001-checkpoint-restart-general-infrastructure	2008-08-20 12:12:48.000000000 -0700
> +++ oren-cr.git-dave/Makefile	2008-08-20 12:12:48.000000000 -0700
> @@ -619,7 +619,7 @@ export mod_strip_cmd
> 
> 
>  ifeq ($(KBUILD_EXTMOD),)
> -core-y		+= kernel/ mm/ fs/ ipc/ security/ crypto/ block/
> +core-y		+= kernel/ mm/ fs/ ipc/ security/ crypto/ block/ checkpoint/
> 
>  vmlinux-dirs	:= $(patsubst %/,%,$(filter %/, $(init-y) $(init-m) \
>  		     $(core-y) $(core-m) $(drivers-y) $(drivers-m) \
> _
> _______________________________________________
> Containers mailing list
> Containers at lists.linux-foundation.org
> https://lists.linux-foundation.org/mailman/listinfo/containers

_______________________________________________
Containers mailing list
Containers at lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers




More information about the Devel mailing list