[CRIU] [PATCH 3/3] sysctl: move sysctl calls to usernsd

Mon Sep 21 02:11:57 PDT 2015

On 09/17/2015 06:02 AM, Tycho Andersen wrote:

> +int sysctl_op(struct sysctl_req *req, size_t nr_req, int op, unsigned int ns)
> +{
> +	int ret = 0, i;
> +	struct sysctl_userns_req *userns_req;
> +	struct sysctl_req *cur;
> +
> +	if (nr_req == 0)
> +		return 0;
> +
> +	if (ns & !KNOWN_NS_MASK) {
> +		pr_err("don't know how to restore some namespaces in %u\n", ns);
> +		return -1;
> +	}
> +
> +	userns_req = alloca(MAX_UNSFD_MSG_SIZE);
> +	userns_req->op = op;
> +	userns_req->nr_req = nr_req;
> +	userns_req->ns = ns;
> +	userns_req->reqs = (struct sysctl_req *) (&userns_req[1]);
> +
> +	cur = userns_req->reqs;
> +	for (i = 0; i < nr_req; i++) {
> +		int arg_len = sysctl_userns_arg_size(req[i].type);
> +		int name_len = strlen(req[i].name) + 1;
> +		int total_len = sizeof(*cur) + arg_len + name_len;
> +
> +		if (((char *) cur) + total_len >= ((char *) userns_req) + MAX_UNSFD_MSG_SIZE) {
> +			pr_err("sysctl msg %s too big: %d\n", req[i].name, total_len);
> +			return -1;
> +		}
> +
> +		/* copy over the non-pointer fields */
> +		cur->type = req[i].type;
> +		cur->flags = req[i].flags;
> +
> +		cur->name = (char *) &cur[1];
> +		strcpy(cur->name, req[i].name);
> +
> +		cur->arg = cur->name + name_len;
> +		memcpy(cur->arg, req[i].arg, arg_len);
> +
> +		cur = (struct sysctl_req *) (((char *) cur) + total_len);

Can we avoid the memory copying when we know we won't call userns_call()?

> +	}
> +
> +	/* Net namespaces can be restored without usernsd, since anything with
> +	 * CAP_SYS_ADMIN in its namespace can write to net/ sysctls. The other
> +	 * namespaces we allow to restore (IPC and UTS) must be restored via
> +	 * usernsd.
> +	 */
> +	if (ns & CLONE_NEWNET)

If ns is 0 (as is done in e.g. kerndat), this check is false and we'll go and call userns_call() :)

> +		ret = __sysctl_op(userns_req, -1, getpid());
> +	else
> +		ret = userns_call(__sysctl_op, UNS_ASYNC, userns_req, MAX_UNSFD_MSG_SIZE, -1);

Why UNS_ASYNC? Async means that the routine won't wait for the call to finish,
but you memcpy() the result back below.

> +
> +	if (ret < 0)
> +		return -1;
> +
> +	if (op != CTL_READ)
> +		return 0;
> +
> +	/*
> +	 * Here, we use a little hack: since we only read in dump mode when
> +	 * usernsd is not active, we know the above call happened in this
> +	 * address space, so we can just copy the value read back out. If there
> +	 * was an API to return stuff via userns_call(), that would be
> +	 * preferable.
> +	 */
> +	cur = userns_req->reqs;
> +	for (i = 0; i < nr_req; i++) {
> +		int arg_len = sysctl_userns_arg_size(cur->type);
> +		int name_len = strlen((char *) &cur[1]) + 1;
> +		int total_len = sizeof(*cur) + arg_len + name_len;
> +		void *arg = ((void *) &cur[1]) + name_len;
> +
> +		memcpy(req[i].arg, arg, arg_len);
> +
> +		cur = (struct sysctl_req *) (((char *) cur) + total_len);
> +	}
> +
> +	return 0;
> +}

> @@ -845,3 +845,23 @@ int fd_has_data(int lfd)
>  
>  	return ret;
>  }
> +
> +const char *ns_to_string(unsigned int ns)
> +{
> +	switch (ns) {
> +	case CLONE_NEWIPC:
> +		return "ipc";
> +	case CLONE_NEWNS:
> +		return "mnt";
> +	case CLONE_NEWNET:
> +		return "net";
> +	case CLONE_NEWPID:
> +		return "pid";
> +	case CLONE_NEWUSER:
> +		return "user";
> +	case CLONE_NEWUTS:
> +		return "uts";
> +	default:
> +		return NULL;
> +	}

We have the ns_desc_array thing, which provides a mapping between namespaces' names and flags.

> +}

-- Pavel