[CRIU] [PATCH 3/3] sysctl: move sysctl calls to usernsd
Pavel Emelyanov
xemul at parallels.com
Mon Sep 21 02:11:57 PDT 2015
On 09/17/2015 06:02 AM, Tycho Andersen wrote:
> +int sysctl_op(struct sysctl_req *req, size_t nr_req, int op, unsigned int ns)
> +{
> + int ret = 0, i;
> + struct sysctl_userns_req *userns_req;
> + struct sysctl_req *cur;
> +
> + if (nr_req == 0)
> + return 0;
> +
> + if (ns & !KNOWN_NS_MASK) {
> + pr_err("don't know how to restore some namespaces in %u\n", ns);
> + return -1;
> + }
> +
> + userns_req = alloca(MAX_UNSFD_MSG_SIZE);
> + userns_req->op = op;
> + userns_req->nr_req = nr_req;
> + userns_req->ns = ns;
> + userns_req->reqs = (struct sysctl_req *) (&userns_req[1]);
> +
> + cur = userns_req->reqs;
> + for (i = 0; i < nr_req; i++) {
> + int arg_len = sysctl_userns_arg_size(req[i].type);
> + int name_len = strlen(req[i].name) + 1;
> + int total_len = sizeof(*cur) + arg_len + name_len;
> +
> + if (((char *) cur) + total_len >= ((char *) userns_req) + MAX_UNSFD_MSG_SIZE) {
> + pr_err("sysctl msg %s too big: %d\n", req[i].name, total_len);
> + return -1;
> + }
> +
> + /* copy over the non-pointer fields */
> + cur->type = req[i].type;
> + cur->flags = req[i].flags;
> +
> + cur->name = (char *) &cur[1];
> + strcpy(cur->name, req[i].name);
> +
> + cur->arg = cur->name + name_len;
> + memcpy(cur->arg, req[i].arg, arg_len);
> +
> + cur = (struct sysctl_req *) (((char *) cur) + total_len);
Can we avoid the memory copying when we know we won't call userns_call()?
> + }
> +
> + /* Net namespaces can be restored without usernsd, since anything with
> + * CAP_SYS_ADMIN in its namespace can write to net/ sysctls. The other
> + * namespaces we allow to restore (IPC and UTS) must be restored via
> + * usernsd.
> + */
> + if (ns & CLONE_NEWNET)
If ns is 0 (as is done in e.g. kerndat), we'll end up calling userns_call() :)
> + ret = __sysctl_op(userns_req, -1, getpid());
> + else
> + ret = userns_call(__sysctl_op, UNS_ASYNC, userns_req, MAX_UNSFD_MSG_SIZE, -1);
Why UNS_ASYNC? Async means that the call won't wait for the result,
but you memcpy() it back below.
> +
> + if (ret < 0)
> + return -1;
> +
> + if (op != CTL_READ)
> + return 0;
> +
> + /*
> + * Here, we use a little hack: since we only read in dump mode when
> + * usernsd is not active, we know the above call happened in this
> + * address space, so we can just copy the value read back out. If there
> + * was an API to return stuff via userns_call(), that would be
> + * preferable.
> + */
> + cur = userns_req->reqs;
> + for (i = 0; i < nr_req; i++) {
> + int arg_len = sysctl_userns_arg_size(cur->type);
> + int name_len = strlen((char *) &cur[1]) + 1;
> + int total_len = sizeof(*cur) + arg_len + name_len;
> + void *arg = ((void *) &cur[1]) + name_len;
> +
> + memcpy(req[i].arg, arg, arg_len);
> +
> + cur = (struct sysctl_req *) (((char *) cur) + total_len);
> + }
> +
> + return 0;
> +}
> @@ -845,3 +845,23 @@ int fd_has_data(int lfd)
>
> return ret;
> }
> +
> +const char *ns_to_string(unsigned int ns)
> +{
> + switch (ns) {
> + case CLONE_NEWIPC:
> + return "ipc";
> + case CLONE_NEWNS:
> + return "mnt";
> + case CLONE_NEWNET:
> + return "net";
> + case CLONE_NEWPID:
> + return "pid";
> + case CLONE_NEWUSER:
> + return "user";
> + case CLONE_NEWUTS:
> + return "uts";
> + default:
> + return NULL;
> + }
We have the ns_desc_array thing, which provides a mapping between namespaces' names and flags.
> +}
-- Pavel
More information about the CRIU
mailing list