[Devel] Re: [PATCH 08/10] Introduce functions to restart a process

Louis Rilling Louis.Rilling at kerlabs.com
Mon Oct 20 06:25:36 PDT 2008


On Sat, Oct 18, 2008 at 03:11:36AM +0400, Andrey Mirkin wrote:
> Functions to restart process, restore its state, fpu and registers are added.

[...]

> diff --git a/checkpoint/rst_process.c b/checkpoint/rst_process.c
> new file mode 100644
> index 0000000..b9f745e
> --- /dev/null
> +++ b/checkpoint/rst_process.c
> @@ -0,0 +1,277 @@
> +/*
> + *  Copyright (C) 2008 Parallels, Inc.
> + *
> + *  Author: Andrey Mirkin <major at openvz.org>
> + *
> + *  This program is free software; you can redistribute it and/or
> + *  modify it under the terms of the GNU General Public License as
> + *  published by the Free Software Foundation, version 2 of the
> + *  License.
> + *
> + */
> +
> +#include <linux/sched.h>
> +#include <linux/fs.h>
> +#include <linux/file.h>
> +#include <linux/version.h>
> +#include <linux/module.h>
> +
> +#include "checkpoint.h"
> +#include "cpt_image.h"
> +
> +#define HOOK_RESERVE	256
> +
> +struct thr_context {	/* argument block passed to restart_thread() */
> +	struct completion complete;	/* completed by restart_thread() once its restore attempt is over */
> +	int error;			/* restore result, stored by restart_thread() before it completes 'complete' */
> +	struct cpt_context *ctx;	/* checkpoint context restart_thread() reads the image objects through */
> +	struct task_struct *tsk;	/* not referenced in the code visible here; presumably the restored task - TODO confirm */
> +};
> +
> +/*
> + * Start a kernel thread running fn(arg) while pinning this module.
> + *
> + * A reference on THIS_MODULE is taken before the thread is created and is
> + * dropped again only when kernel_thread() fails; on success the reference
> + * stays held.  The pid argument is not used by this implementation.
> + *
> + * Returns the result of kernel_thread() (negative on failure), -EINVAL if
> + * the calling task has no fs, or -EBUSY if the module reference could not
> + * be obtained.
> + */
> +int local_kernel_thread(int (*fn)(void *), void * arg, unsigned long flags, pid_t pid)
> +{
> +	pid_t child;
> +
> +	/* do_fork_pid() hates processes without fs, oopses. */
> +	if (!current->fs) {
> +		eprintk("local_kernel_thread: current->fs==NULL\n");
> +		return -EINVAL;
> +	}
> +
> +	if (!try_module_get(THIS_MODULE))
> +		return -EBUSY;
> +
> +	child = kernel_thread(fn, arg, flags);
> +	if (child >= 0)
> +		return child;
> +
> +	/* Thread creation failed: nobody is left to own the reference. */
> +	module_put(THIS_MODULE);
> +	return child;
> +}
> +
> +/*
> + * Convert the task flags saved in a checkpoint image into kernel PF_* flags.
> + *
> + * In the image each flag is identified by a bit number (CPT_PF_*).  Only the
> + * flags listed in the table below are translated; every other bit of
> + * task_flags is ignored.
> + */
> +static unsigned int decode_task_flags(unsigned int task_flags)
> +{
> +	static const struct {
> +		unsigned int cpt_bit;	/* bit number inside the image flags */
> +		unsigned int pf_flag;	/* matching kernel PF_* mask */
> +	} flag_map[] = {
> +		{ CPT_PF_EXITING,	PF_EXITING },
> +		{ CPT_PF_FORKNOEXEC,	PF_FORKNOEXEC },
> +		{ CPT_PF_SUPERPRIV,	PF_SUPERPRIV },
> +		{ CPT_PF_DUMPCORE,	PF_DUMPCORE },
> +		{ CPT_PF_SIGNALED,	PF_SIGNALED },
> +	};
> +	unsigned int decoded = 0;
> +	unsigned int i;
> +
> +	for (i = 0; i < sizeof(flag_map) / sizeof(flag_map[0]); i++) {
> +		if (task_flags & (1 << flag_map[i].cpt_bit))
> +			decoded |= flag_map[i].pf_flag;
> +	}
> +
> +	return decoded;
> +}
> +
> +/*
> + * Restore the parts of a task_struct that are saved in the image: the task
> + * flags, the command name and the TLS descriptors.
> + *
> + * tsk - task being restored
> + * ti  - task image read from the checkpoint
> + * ctx - checkpoint context (not used here)
> + *
> + * Always returns 0.
> + */
> +int rst_restore_task_struct(struct task_struct *tsk, struct cpt_task_image *ti,
> +			    struct cpt_context *ctx)
> +{
> +	int i;
> +
> +	/* Restore only saved flags, comm and tls for now */
> +	/* Note: the whole flags word is replaced by the decoded image flags. */
> +	tsk->flags = decode_task_flags(ti->cpt_flags);
> +	clear_tsk_thread_flag(tsk, TIF_FREEZE);
> +
> +	memcpy(tsk->comm, ti->cpt_comm, TASK_COMM_LEN);
> +	/*
> +	 * The name comes from the image file; never leave comm without a
> +	 * terminating NUL, whatever the image contains.
> +	 */
> +	tsk->comm[TASK_COMM_LEN - 1] = '\0';
> +
> +	/*
> +	 * Each saved TLS entry is split into the two 32-bit halves of a
> +	 * descriptor: low word into .a, high word into .b.
> +	 * NOTE(review): assumes cpt_tls[] elements are 64 bits wide and the
> +	 * array has at least GDT_ENTRY_TLS_ENTRIES entries - confirm against
> +	 * cpt_image.h.
> +	 */
> +	for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++) {
> +		tsk->thread.tls_array[i].a = ti->cpt_tls[i] & 0xFFFFFFFF;
> +		tsk->thread.tls_array[i].b = ti->cpt_tls[i] >> 32;
> +	}
> +
> +	return 0;
> +}
> +
> +/*
> + * Restore the FPU state of tsk from the next CPT_OBJ_BITS object of the image.
> + *
> + * The "used math" mark of tsk is cleared first.  The object payload is then
> + * read and, if its content type matches the save format this CPU uses
> + * (fxsave when cpu_has_fxsr, the old fsave layout otherwise on 32-bit),
> + * copied into the thread's FPU state.  A payload of any other content type
> + * is read and discarded.
> + *
> + * tsk - task being restored
> + * ti  - task image; cpt_flags decides whether "used math" is set again
> + * ctx - checkpoint context used to read the image
> + *
> + * Returns 0 on success, -EINVAL if the payload is shorter than the state to
> + * be copied, -ENOMEM if the buffer cannot be allocated, or the error
> + * reported by rst_get_object() / ctx->read().
> + */
> +static int rst_restore_fpustate(struct task_struct *tsk, struct cpt_task_image *ti,
> +				struct cpt_context *ctx)
> +{
> +	struct cpt_obj_bits hdr;
> +	size_t state_len = 0;
> +	int err;
> +	char *buf;
> +
> +	clear_stopped_child_used_math(tsk);
> +
> +	err = rst_get_object(CPT_OBJ_BITS, &hdr, sizeof(hdr), ctx);
> +	if (err < 0)
> +		return err;
> +
> +	/* Number of payload bytes that will be copied on this CPU, if any. */
> +	if (hdr.cpt_content == CPT_CONTENT_X86_FPUSTATE && cpu_has_fxsr)
> +		state_len = sizeof(struct i387_fxsave_struct);
> +#ifndef CONFIG_X86_64
> +	else if (hdr.cpt_content == CPT_CONTENT_X86_FPUSTATE_OLD &&
> +			!cpu_has_fxsr)
> +		state_len = sizeof(struct i387_fsave_struct);
> +#endif
> +
> +	/*
> +	 * cpt_size comes straight from the image.  Refuse a payload that is
> +	 * too short, otherwise the memcpy() below would read past the end of
> +	 * the buffer (or from a zero-sized allocation).
> +	 */
> +	if (hdr.cpt_size < state_len)
> +		return -EINVAL;
> +
> +	buf = kmalloc(hdr.cpt_size, GFP_KERNEL);
> +	if (!buf)
> +		return -ENOMEM;
> +
> +	err = ctx->read(buf, hdr.cpt_size, ctx);
> +	if (err)
> +		goto out;
> +
> +	if (state_len) {
> +		/*
> +		 * NOTE(review): if thread.xstate is a pointer in the target
> +		 * kernel, &tsk->thread.xstate is the address of that pointer
> +		 * and not of the save area - confirm against struct
> +		 * thread_struct before merging.
> +		 */
> +		memcpy(&tsk->thread.xstate, buf, state_len);
> +		/*
> +		 * NOTE(review): decode_task_flags() treats CPT_PF_* as bit
> +		 * numbers, while CPT_PF_USED_MATH is tested here as a mask -
> +		 * verify against cpt_image.h and the dump side.
> +		 */
> +		if (ti->cpt_flags & CPT_PF_USED_MATH)
> +			set_stopped_child_used_math(tsk);
> +	}
> +
> +out:
> +	kfree(buf);
> +	return err;
> +}
> +
> +/*
> + * Translate a segment identifier from the image into a segment selector.
> + *
> + * The tests are made in this order:
> + *   CPT_SEG_ZERO          -> 0
> + *   up to CPT_SEG_TLS3    -> GDT TLS entry, selector built with "+ 3"
> + *   CPT_SEG_LDT and above -> index into the LDT, selector built with "| 7"
> + *   CPT_SEG_USER32_DS/CS  -> __USER_DS / __USER_CS
> + *
> + * Any other identifier is reported with eprintk() and decoded as 0.
> + */
> +static u32 decode_segment(u32 segid)
> +{
> +	u32 selector = 0;
> +
> +	if (segid == CPT_SEG_ZERO) {
> +		selector = 0;
> +	} else if (segid <= CPT_SEG_TLS3) {
> +		/* TLS descriptors */
> +		selector = ((GDT_ENTRY_TLS_MIN + segid - CPT_SEG_TLS1) << 3) + 3;
> +	} else if (segid >= CPT_SEG_LDT) {
> +		/* LDT descriptor, it is just an index to LDT array */
> +		selector = ((segid - CPT_SEG_LDT) << 3) | 7;
> +	} else if (segid == CPT_SEG_USER32_DS) {
> +		/* Check for one of standard descriptors */
> +		selector = __USER_DS;
> +	} else if (segid == CPT_SEG_USER32_CS) {
> +		selector = __USER_CS;
> +	} else {
> +		eprintk("Invalid segment reg %d\n", segid);
> +	}
> +
> +	return selector;
> +}
> +
> +/*
> + * Restore the saved register state of tsk from the next CPT_OBJ_X86_REGS
> + * object of the image.
> + *
> + * The user-visible registers are written into the task's pt_regs frame; the
> + * thread structure receives the kernel stack pointers, the resume address
> + * (i386_ret_from_resume), gs and the debug registers.  Finally HOOK_RESERVE
> + * bytes directly below the pt_regs frame are zeroed and thread.sp is left
> + * pointing at the start of that zeroed area.
> + *
> + * Returns 0 on success or the negative error from rst_get_object(); nothing
> + * is modified in the latter case.
> + */
> +static int rst_restore_registers(struct task_struct *tsk, struct cpt_context *ctx)
> +{
> +	extern char i386_ret_from_resume;
> +	struct pt_regs *frame = task_pt_regs(tsk);
> +	struct cpt_x86_regs img;
> +	int rc;
> +
> +	rc = rst_get_object(CPT_OBJ_X86_REGS, &img, sizeof(img), ctx);
> +	if (rc < 0)
> +		return rc;
> +
> +	/* Kernel-side thread context. */
> +	tsk->thread.sp0 = (unsigned long) (frame + 1);
> +	tsk->thread.ip = (unsigned long) &i386_ret_from_resume;
> +	tsk->thread.gs = decode_segment(img.cpt_gs);
> +
> +	/* Debug registers 0-3, 6 and 7 are the only ones carried over. */
> +	tsk->thread.debugreg0 = img.cpt_debugreg[0];
> +	tsk->thread.debugreg1 = img.cpt_debugreg[1];
> +	tsk->thread.debugreg2 = img.cpt_debugreg[2];
> +	tsk->thread.debugreg3 = img.cpt_debugreg[3];
> +	tsk->thread.debugreg6 = img.cpt_debugreg[6];
> +	tsk->thread.debugreg7 = img.cpt_debugreg[7];
> +
> +	/* General purpose registers, taken verbatim from the image. */
> +	frame->ax = img.cpt_ax;
> +	frame->bx = img.cpt_bx;
> +	frame->cx = img.cpt_cx;
> +	frame->dx = img.cpt_dx;
> +	frame->si = img.cpt_si;
> +	frame->di = img.cpt_di;
> +	frame->bp = img.cpt_bp;
> +	frame->sp = img.cpt_sp;
> +	frame->ip = img.cpt_ip;
> +	frame->flags = img.cpt_flags;
> +	frame->orig_ax = img.cpt_orig_ax;
> +
> +	/* Segment registers are stored as identifiers and need decoding. */
> +	frame->cs = decode_segment(img.cpt_cs);
> +	frame->ss = decode_segment(img.cpt_ss);
> +	frame->ds = decode_segment(img.cpt_ds);
> +	frame->es = decode_segment(img.cpt_es);
> +	frame->fs = decode_segment(img.cpt_fs);
> +
> +	/* Keep a zeroed HOOK_RESERVE area between thread.sp and the frame. */
> +	tsk->thread.sp = (unsigned long) frame - HOOK_RESERVE;
> +	memset((void *) tsk->thread.sp, 0, HOOK_RESERVE);
> +
> +	return 0;
> +}
> +
> +static int restart_thread(void *arg)
> +{
> +	struct thr_context *thr_ctx = arg;
> +	struct cpt_context *ctx;
> +	struct cpt_task_image *ti;
> +	int err;
> +
> +	current->state = TASK_UNINTERRUPTIBLE;
> +
> +	ctx = thr_ctx->ctx;
> +	ti = kmalloc(sizeof(*ti), GFP_KERNEL);
> +	if (!ti)
> +		return -ENOMEM;
> +
> +	err = rst_get_object(CPT_OBJ_TASK, ti, sizeof(*ti), ctx);
> +	if (!err)
> +		err = rst_restore_task_struct(current, ti, ctx);
> +	/* Restore mm here */
> +	if (!err)
> +		err = rst_restore_fpustate(current, ti, ctx);
> +	if (!err)
> +		err = rst_restore_registers(current, ctx);
> +
> +	thr_ctx->error = err;
> +	complete(&thr_ctx->complete);
> +
> +	if (!err && (ti->cpt_state & (EXIT_ZOMBIE|EXIT_DEAD))) {
> +		do_exit(ti->cpt_exit_code);
> +	} else {
> +		__set_current_state(TASK_UNINTERRUPTIBLE);
> +	}
> +
> +	kfree(ti);
> +	schedule();
> +
> +	eprintk("leaked %d/%d %p\n", task_pid_nr(current), task_pid_vnr(current), current->mm);
> +
> +	module_put(THIS_MODULE);

I'm sorry, I still do not understand what you are doing with this self-module
pinning stuff. AFAICS, we should not get here unless there is a bug. So the
checkpoint module ref count is never decreased, right?

Could you explain what this self-module pinning is for? As I already told you,
this looks like a bogus solution to avoid unloading the checkpoint module during
restart.

Thanks!

Louis

[...]

-- 
Dr Louis Rilling			Kerlabs
Skype: louis.rilling			Batiment Germanium
Phone: (+33|0) 6 80 89 08 23		80 avenue des Buttes de Coesmes
http://www.kerlabs.com/			35700 Rennes
-------------- next part --------------
A non-text attachment was scrubbed...
Name: signature.asc
Type: application/pgp-signature
Size: 189 bytes
Desc: Digital signature
URL: <http://lists.openvz.org/pipermail/devel/attachments/20081020/ad5c37be/attachment-0001.sig>
-------------- next part --------------
_______________________________________________
Containers mailing list
Containers at lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers


More information about the Devel mailing list