[Devel] Re: [PATCH 1/2] signal checkpoint: define /proc/pid/sig/

Serge E. Hallyn serue at us.ibm.com
Mon Jun 11 10:05:37 PDT 2007


Quoting Cedric Le Goater (clg at fr.ibm.com):
> Serge E. Hallyn wrote:
> > As I mentioned earlier, I don't know what sort of approach we want
> > to take to guide checkpoint and restart.  I.e. do we want it to be
> > a mostly userspace-orchestrated affair, or entirely done in the
> > kernel using the freezer or some other mechanism in response to a
> > single syscall or containerfs file write?
> > 
> > If we wanted to do a lot of the work in userspace, here is a pair of
> > patches to read and restore signal information.  It's entirely unsafe
> > wrt locking, bc i would assume that if we did in fact do c/r from
> > userspace, we would have some way of entirely pulling the task off
> > the runqueue while doing our thing...
> > 
> > Anyway, this is purely to start discussion.
> > 
> > thanks,
> > -serge
> > 
> >>From 30bbe322942e5ed86bad63861dad80595cd04063 Mon Sep 17 00:00:00 2001
> > From: Serge E. Hallyn <serue at us.ibm.com>
> > Date: Mon, 30 Apr 2007 16:22:44 -0400
> > Subject: [PATCH 1/2] signal checkpoint: define /proc/pid/sig/
> > 
> > Define /proc/<pid>/sig/ directory containing files to report
> > on a process' signal info.
> > 
> > Files defined:
> > 	action: list signal action
> > 	altstack: print sigaltstack location and size
> > 	blocked: print blocked signal mask
> > 	pending: print pending signals and siginfo
> > 	shared_pending: print shared pending signals and siginfo
> > 	waiters: list tasks wait4()ing on task PID.
> 
> some of these are already in /proc/<pid>/status and /proc/<pid>/stat 

Good point.  However they can't be set through that file.  So is it
worth making the files write-only, to avoid duplication, or do we prefer
to not make the checkpoint and restart use different files?

> should we continue to use /proc ? or switch to some other mechanisms
> like getnetlink (taskstats) to map kernel structures.

We want to avoid 'map'ping kernel structures, though, right?  We can
dump the data in a more generic fashion through netlink, dunno what we
prefer.  But this is very definately process information :), so /proc
does seem appropriate.

> > +/*
> > + * print a sigset_t to a buffer.  Return # characters printed,
> > + * not including the final ending '\0'.
> > + */
> > +static int print_sigset(char *buf, sigset_t *sig)
> > +{
> > +	int i;
> > +
> > +	for (i=0; i<_NSIG; i++) {
> > +		if (sigismember(sig, i))
> > +			buf[i] = '1';
> > +		else
> > +			buf[i] = '0';
> > +	}
> > +	buf[_NSIG] = '\0';
> > +
> > +	return _NSIG;
> > +}
> 
> you might want to use render_sigset_t() in fs/proc array.C

Thanks.  (I will incorporate improvements like this if i end up sending
another version)

> > +static int print_sigset_alloc(char **bufp, sigset_t *sig)
> > +{
> > +	char *buf;
> > +
> > +	*bufp = buf = kmalloc(_NSIG+1, GFP_KERNEL);
> > +	if (!buf)
> > +		return -ENOMEM;
> > +
> > +	return print_sigset(buf, sig);
> > +}
> > +
> > +static int print_wait_info(char **bufp, struct signal_struct *signal)
> > +{
> > +	char *buf;
> > +	wait_queue_t *wait;
> > +
> > +	*bufp = buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
> > +	if (!buf)
> > +		return -ENOMEM;
> > +
> > +	spin_lock(&signal->wait_chldexit.lock);
> > +
> > +	list_for_each_entry(wait, &signal->wait_chldexit.task_list, task_list) {
> > +		struct task_struct *tsk = wait->private;
> > +
> > +		if (buf - *bufp +50 > PAGE_SIZE) {
> > +			spin_unlock(&signal->wait_chldexit.lock);
> > +			kfree(buf);
> > +			*bufp = NULL;
> > +			return -ENOMEM;
> > +		}
> > +		WARN_ON(wait->func != default_wake_function);
> > +		buf += sprintf(buf, "%u %d\n", wait->flags, tsk->pid);
> > +	}
> > +
> > +	spin_unlock(&signal->wait_chldexit.lock);
> > +
> > +	return buf-*bufp;
> > +}
> > +
> > +static int print_sigpending_alloc(char **bufp, struct sigpending *pending)
> > +{
> > +	int alloced=0;
> > +	char *buf, *p;
> > +	struct sigqueue *q;
> > +	struct siginfo *info;
> > +
> > +	alloced = PAGE_SIZE;
> > +	p = buf = kmalloc(alloced, GFP_KERNEL);
> > +	if (!buf)
> > +		return -ENOMEM;
> > +	
> > +	p += print_sigset(buf, &pending->signal);
> > +	p += sprintf(p, "\n");
> > +
> > +	list_for_each_entry(q, &pending->list, list) {
> > +		info = &q->info;
> > +		if (p-buf+215 > alloced) {
> > +			int len=p-buf;
> > +			char *buf2;
> > +			alloced += PAGE_SIZE;
> > +			buf2 = kmalloc(alloced, GFP_KERNEL);
> > +			if (!buf2) {
> > +				kfree(buf);
> > +				return -ENOMEM;
> > +			}
> > +			memcpy(buf2, buf, alloced - PAGE_SIZE);
> > +			kfree(buf);
> > +			buf = buf2;
> > +			p = buf+len;
> > +		}
> > +
> > +		p += sprintf(p, "sig %d: user %d flags %d",
> > +			info->si_signo, (int)q->user->uid, q->flags);
> > +		p += sprintf(p, " errno %d code %d\n",
> > +			info->si_errno, info->si_code);
> > +
> > +		switch(info->si_signo) {
> > +		case SIGKILL:
> > +			p += sprintf(p, "  spid %d suid %d\n",
> > +				info->_sifields._kill._pid,
> > +				info->_sifields._kill._uid);
> > +			break;
> > +		/* XXX skipping posix1b timers and signals for now */
> > +		case SIGCHLD:
> > +			p += sprintf(p, "  pid %d uid %d status %d utime %lu stime %lu\n",
> > +				info->_sifields._sigchld._pid,
> > +				info->_sifields._sigchld._uid,
> > +				info->_sifields._sigchld._status,
> > +				info->_sifields._sigchld._utime,
> > +				info->_sifields._sigchld._stime);
> > +			break;
> > +		case SIGILL:
> > +		case SIGFPE:
> > +		case SIGSEGV:
> > +		case SIGBUS:
> > +#ifdef __ARCH_SI_TRAPNO
> > +			p += sprintf(p, "  addr %lu trapno %d\n",
> > +				(unsigned long)info->_sifields._sigfault._addr,
> > +				info->_sifields._sigfault._trapno);
> > +#else
> > +			p += sprintf(p, "  addr %lu\n",
> > +				(unsigned long)info->_sifields._sigfault._addr);
> > +#endif
> > +			break;
> > +		case SIGPOLL:
> > +			p += sprintf(p, "  band %ld fd %d\n",
> > +				(long)info->_sifields._sigpoll._band,
> > +				info->_sifields._sigpoll._fd);
> > +			break;
> > +		default:
> > +			p += sprintf(p, "  Unsupported siginfo for signal %d\n",
> > +				info->si_signo);
> > +			break;
> > +		}
> > +	}
> > +	*bufp = buf;
> > +	return p-buf;
> > +}
> 
> I think we are reaching the limit of /proc when we expose the pending siginfos.

Why?

-serge

> > +static int print_sigaction_list(char **bufp, struct sighand_struct *sighand)
> > +{
> > +	struct k_sigaction *action;
> > +	int maxlen;
> > +	int i;
> > +	char *buf;
> > +
> > +	/* two unsigned longs (20 chars), one int (10 chars), a sigset_t, 3 spaces, plus a newline */
> > +	maxlen = 10 + 20*2 + _NSIG + 4;
> > +	/* and we have _NSIG of those entries, plus an ending \0 */
> > +	maxlen *= _NSIG+1;
> > +
> > +	*bufp = buf = kmalloc(maxlen, GFP_KERNEL);
> > +	if (!buf)
> > +		return -ENOMEM;
> > +
> > +	spin_lock(&sighand->siglock);
> > +	for (i=0; i<_NSIG; i++) {
> > +		action = &sighand->action[i];
> > +		buf += sprintf(buf, "%10d %20lu ", i, action->sa.sa_flags);
> > +		buf += print_sigset(buf, &action->sa.sa_mask);
> > +		buf += sprintf(buf, " %20lu\n", (unsigned long)action->sa.sa_handler);
> > +		BUG_ON(buf-*bufp > maxlen);
> > +	}
> > +	spin_unlock(&sighand->siglock);
> > +	return buf-*bufp;
> > +}
> > +
> > +int task_read_procsig(struct task_struct *p, char *name, char **value)
> > +{
> > +	int ret;
> > +
> > +	if (current != p) {
> > +		ret = security_ptrace(current, p);
> > +		if (ret)
> > +			return ret;
> > +	}
> > +
> > +	ret = -EINVAL;
> > +
> > +	if (strcmp(name, "pending") == 0) {
> > +		/* not masking out blocked signals  yet */
> > +		ret = print_sigpending_alloc(value, &p->pending);
> > +	} else if (strcmp(name, "blocked") == 0) {
> > +		/* not masking out blocked signals  yet */
> > +		ret = print_sigset_alloc(value, &p->blocked);
> > +	} else if (strcmp(name, "shared_pending") == 0) {
> > +		ret = print_sigpending_alloc(value, &p->signal->shared_pending);
> > +	} else if (strcmp(name, "waiters") == 0) {
> > +		ret = print_wait_info(value, p->signal);
> > +	} else if (strcmp(name, "action") == 0) {
> > +		ret = print_sigaction_list(value, p->sighand);
> > +	}  else if (strcmp(name, "altstack") == 0) {
> > +		*value = kmalloc(40, GFP_KERNEL);
> > +		ret = -ENOMEM;
> > +		if (*value) {
> > +			ret = sprintf(*value, "%lu %zd", p->sas_ss_sp, p->sas_ss_size);
> > +		}
> > +	}
> > +
> > +	return ret;
> > +}
_______________________________________________
Containers mailing list
Containers at lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers




More information about the Devel mailing list