[CRIU] [PATCH 3/3] sysctl: move sysctl calls to usernsd

Tycho Andersen tycho.andersen at canonical.com
Mon Sep 21 07:01:06 PDT 2015


On Mon, Sep 21, 2015 at 12:11:57PM +0300, Pavel Emelyanov wrote:
> On 09/17/2015 06:02 AM, Tycho Andersen wrote:
> 
> > +int sysctl_op(struct sysctl_req *req, size_t nr_req, int op, unsigned int ns)
> > +{
> > +	int ret = 0, i;
> > +	struct sysctl_userns_req *userns_req;
> > +	struct sysctl_req *cur;
> > +
> > +	if (nr_req == 0)
> > +		return 0;
> > +
> > +	if (ns & !KNOWN_NS_MASK) {
> > +		pr_err("don't know how to restore some namespaces in %u\n", ns);
> > +		return -1;
> > +	}
> > +
> > +	userns_req = alloca(MAX_UNSFD_MSG_SIZE);
> > +	userns_req->op = op;
> > +	userns_req->nr_req = nr_req;
> > +	userns_req->ns = ns;
> > +	userns_req->reqs = (struct sysctl_req *) (&userns_req[1]);
> > +
> > +	cur = userns_req->reqs;
> > +	for (i = 0; i < nr_req; i++) {
> > +		int arg_len = sysctl_userns_arg_size(req[i].type);
> > +		int name_len = strlen(req[i].name) + 1;
> > +		int total_len = sizeof(*cur) + arg_len + name_len;
> > +
> > +		if (((char *) cur) + total_len >= ((char *) userns_req) + MAX_UNSFD_MSG_SIZE) {
> > +			pr_err("sysctl msg %s too big: %d\n", req[i].name, total_len);
> > +			return -1;
> > +		}
> > +
> > +		/* copy over the non-pointer fields */
> > +		cur->type = req[i].type;
> > +		cur->flags = req[i].flags;
> > +
> > +		cur->name = (char *) &cur[1];
> > +		strcpy(cur->name, req[i].name);
> > +
> > +		cur->arg = cur->name + name_len;
> > +		memcpy(cur->arg, req[i].arg, arg_len);
> > +
> > +		cur = (struct sysctl_req *) (((char *) cur) + total_len);
> 
> Can we avoid memory copyings when we know we'll not call the userns_call()?

I was hoping you wouldn't say that :). Yes, I can make the change.

> > +	}
> > +
> > +	/* Net namespaces can be restored without usernsd, since anything with
> > +	 * CAP_SYS_ADMIN in its namespace can write to net/ sysctls. The other
> > +	 * namespaces we allow to restore (IPC and UTS) must be restored via
> > +	 * usernsd.
> > +	 */
> > +	if (ns & CLONE_NEWNET)
> 
> If ns is 0 (as is done in e.g. kerndat) we'll go call userns_call :)
> 
> > +		ret = __sysctl_op(userns_req, -1, getpid());
> > +	else
> > +		ret = userns_call(__sysctl_op, UNS_ASYNC, userns_req, MAX_UNSFD_MSG_SIZE, -1);
> 
> Why UNS_ASYNC()? Async means that the routine won't wait for the result to finish,
> but you memcpy() is back below.

The key here is that we never use usernsd at the same time we do the
memcpy, that's the only way that it works (since usernsd doesn't send
the blob back, but we're copying out of the blob). I think with the
above fix that'll make it explicit, though so this little ugliness
will go away.

Tycho

> > +
> > +	if (ret < 0)
> > +		return -1;
> > +
> > +	if (op != CTL_READ)
> > +		return 0;
> > +
> > +	/*
> > +	 * Here, we use a little hack: since we only read in dump mode when
> > +	 * usernsd is not active, we know the above call happened in this
> > +	 * address space, so we can just copy the value read back out. If there
> > +	 * was an API to return stuff via userns_call(), that would be
> > +	 * preferable.
> > +	 */
> > +	cur = userns_req->reqs;
> > +	for (i = 0; i < nr_req; i++) {
> > +		int arg_len = sysctl_userns_arg_size(cur->type);
> > +		int name_len = strlen((char *) &cur[1]) + 1;
> > +		int total_len = sizeof(*cur) + arg_len + name_len;
> > +		void *arg = ((void *) &cur[1]) + name_len;
> > +
> > +		memcpy(req[i].arg, arg, arg_len);
> > +
> > +		cur = (struct sysctl_req *) (((char *) cur) + total_len);
> > +	}
> > +
> > +	return 0;
> > +}
> 
> > @@ -845,3 +845,23 @@ int fd_has_data(int lfd)
> >  
> >  	return ret;
> >  }
> > +
> > +const char *ns_to_string(unsigned int ns)
> > +{
> > +	switch (ns) {
> > +	case CLONE_NEWIPC:
> > +		return "ipc";
> > +	case CLONE_NEWNS:
> > +		return "mnt";
> > +	case CLONE_NEWNET:
> > +		return "net";
> > +	case CLONE_NEWPID:
> > +		return "pid";
> > +	case CLONE_NEWUSER:
> > +		return "user";
> > +	case CLONE_NEWUTS:
> > +		return "uts";
> > +	default:
> > +		return NULL;
> > +	}
> 
> We have ns_desc_array thing, that provides mapping between namespaces' names and flags.
> 
> > +}
> 
> -- Pavel


More information about the CRIU mailing list