[CRIU] Re: [PATCH 4/4] inotify: Add checkpoint/restore

Tue Apr 10 09:30:38 EDT 2012

> @@ -337,8 +342,10 @@ static int dump_one_fdinfo(struct fd_parms *p, int lfd,
>  	p->id = MAKE_FD_GENID(p->stat.st_dev, p->stat.st_ino, p->pos);
>  	if (S_ISFIFO(p->stat.st_mode))
>  		p->type = FDINFO_PIPE;
> -	else
> +	else if (S_ISREG(p->stat.st_mode))

This else covered not only S_ISREG-s, but also S_ISDIR and S_ISCHRDEV :\

>  		p->type = FDINFO_REG;
> +	else
> +		p->type = FDINFO_INOTIFY;
>  
>  	return do_dump_one_fdinfo(p, lfd, cr_fdset);
>  }

> @@ -429,7 +436,8 @@ static int dump_one_fd(pid_t pid, int fd, int lfd,
>  
>  	if (S_ISREG(p.stat.st_mode) ||
>              S_ISDIR(p.stat.st_mode) ||
> -            S_ISFIFO(p.stat.st_mode))
> +	    S_ISFIFO(p.stat.st_mode) ||
> +	    is_inotify(lfd))
>  		return dump_one_fdinfo(&p, lfd, cr_fdset);
>  
>  	return dump_unsupp_fd(&p);

<and>

> +/* Checks if file desciptor @fd is inotify */
> +int is_inotify(int fd)
> +{
> +	char link[PATH_MAX], path[32];
> +	ssize_t ret;
> +
> +	snprintf(path, sizeof(path), "/proc/self/fd/%d", fd);
> +	ret = readlink(path, link, sizeof(link));
> +	if (ret < 0) {
> +		pr_perror("Can't read link of fd %d\n", fd);
> +		return -1;
> +	}
> +	link[ret] = 0;
> +	if (!strcmp(link, "anon_inode:inotify"))
> +		return 1;
> +
> +	return 0;
> +}

This is wrong check. You should call statfs on fd and check that the fs magic
is anon-inode-fs one. Then (only then) parse the symlink. This is not only
the correct check, but will also facilitate signalfd, eventfd and other anonfd-s
dumping in the future.

> +struct inotify_file_entry {
> +	u32	id;
> +	u64	i_ino;
> +	u32	mask;
> +	u32	s_dev;
> +	u32	r_dev;
> +	u32	wd;
> +	fh_t	f_handle;
> +} __packed;
> +

Just putting watches in a raw is not enough. You should have an entry describing
inotify fd itself. The thing is that it has flags (O_NONBLOCK is only used now, but
still), fowner and we'll have to dump them there.

> +/* Returns path for mount device @s_dev */
> +static char *inotify_get_mnt_root(unsigned int s_dev)
> +{
> +	static int last = 0;
> +	int i;
> +
> +	/* Cache hit rate is big */
> +again:
> +	for (i = last; i < nr_mntinfo; i++) {
> +		if (s_dev == mntinfo[i].s_dev) {
> +			last = i;
> +			return mntinfo[i].mnt_root;
> +		}
> +	}
> +
> +	if (last) {
> +		last = 0;
> +		goto again;
> +	}
> +
> +	return NULL;
> +}

This is wrong, but enough for the first time (please, put this comment into the code).
Besides, this has to be in some generic .c file.

> +	snprintf(path, sizeof(path), "/proc/self/fd/%d", wd);
> +	ret = readlink(path, link, sizeof(link));
> +	close(wd);
> +	close(mntfd);
> +
> +	if (ret < 1) {
> +		pr_perror("Can't read self-link for %d\n", wd);
> +		return -1;
> +	}

Check if we can just push the /proc/self/fd/xxx path into inotify_add_watch.

> +	attempt = 10, wd = 1;

This means that you only can restore wd-s less than 10 (or 11, I don't try hard
on corner case). This is wrong.

> +	while (wd >= 0 && attempt--) {
> +		wd = inotify_add_watch(i_fd, link, fe->mask);
> +		if (wd < 0) {
> +			pr_err("Can't add watch for %d with %d\n", i_fd, fe->wd);
> +			break;
> +		} else if (wd == fe->wd) {
> +			ret = 0;
> +			break;
> +		}
> +
> +		pr_debug("\t\tWatch got %d but %d expected\n", wd, fe->wd);
> +		inotify_rm_watch(i_fd, wd);

Plus, you have to put a comment here stating that "kernel allocates wd-s sequentially,
this is suboptimal, but the kernel doesn't provide and API for this yet :(" and put 
the respective entry in your long-term TODO list.

> +	}

> +		pr_info("Collected inotify: id %8x wd %8x s_dev %8x i_ino %16lx mask %8x\n",
> +			info->ife.id, info->ife.wd, info->ife.s_dev, info->ife.i_ino, info->ife.mask);
> +		pr_info("\t[fhandle] bytes %8x type %8x __handle %016lx:%016lx\n",
> +			info->ife.f_handle.bytes, info->ife.f_handle.type,
> +			info->ife.f_handle.__handle[0], info->ife.f_handle.__handle[1]);
> +
> +		list_add(&info->list, &inotify_head);

Taking into account the way you restore them, this list should be sorted by wd
in an ascending order.