[CRIU] [PATCH bpf-next v1 3/8] io_uring: Implement eBPF iterator for registered files

Yonghong Song yhs at fb.com
Thu Nov 18 20:33:06 MSK 2021



On 11/15/21 9:42 PM, Kumar Kartikeya Dwivedi wrote:
> This change adds an eBPF iterator for files registered in an io_uring ctx.
> It gives access to the ctx, the index of the registered file, and a
> pointer to the struct file itself. This allows the iterator to save
> info related to files added to an io_uring instance that isn't easy
> to export using the fdinfo interface (like being able to match
> registered files to a task's file set). Access to the underlying
> struct file allows deduplication and efficient pairing with a task's
> file set (obtained using the task_file iterator).
> 
> The primary use case this enables is checkpoint/restore support.
> 
> Note that we need to use mutex_trylock in the seq_start functions when
> the iterator file is read, as the lock ordering is the opposite of what
> it would be when an io_uring operation reads the same file. We take
> seq_file->lock, then ctx->uring_lock, while io_uring would first take
> ctx->uring_lock and then seq_file->lock for the same ctx.
> 
> This can lead to a deadlock scenario described below:
> 
>        CPU 0                             CPU 1
> 
>        vfs_read
>        mutex_lock(&seq_file->lock)       io_read
> 					mutex_lock(&ctx->uring_lock)
>        mutex_lock(&ctx->uring_lock) # switched to mutex_trylock
> 					mutex_lock(&seq_file->lock)
> 
> The trylock also protects the case where io_uring tries to read from
> an iterator attached to itself (same ctx), where the lock order would
> be:
>   io_uring_enter
>    mutex_lock(&ctx->uring_lock) <-----------.
>    io_read                                   \
>     seq_read                                  \
>      mutex_lock(&seq_file->lock)              /
>      mutex_lock(&ctx->uring_lock) # deadlock-`
> 
> In both these cases (recursive read and contended uring_lock), -EDEADLK
> is returned to userspace.
> 
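
A rough sketch of what handling this looks like from userspace (a
hypothetical helper; iter_fd is assumed to come from bpf_iter_create() on
an attached io_uring_file iterator link, setup not shown):

  #include <errno.h>
  #include <unistd.h>

  /* Retry the iterator read when the kernel returns -EDEADLK because
   * ctx->uring_lock was contended at seq_start time.
   */
  static ssize_t read_iter_retry(int iter_fd, char *buf, size_t len)
  {
          ssize_t n;

          do {
                  n = read(iter_fd, buf, len);
          } while (n < 0 && errno == EDEADLK);

          return n;
  }

This covers the contended-lock case when the iterator is read with plain
read(2); the recursive case above only arises when the read is issued
through io_uring itself.
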
> With the advent of descriptorless files supported by io_uring, this
> iterator provides the required visibility and introspection of an
> io_uring instance for the purpose of dumping and restoring it.
> 
> In the future, this iterator will be extended to support direct
> inspection of more of the file state (currently descriptorless files
> are obtained using openat2 and socket) so that the state of these
> hidden files can be dumped. Later, we can explore filling in the gaps
> for dumping file state for more file types (those not hidden in the
> io_uring ctx). All of this is out of scope for the current series,
> but it builds upon this iterator.
> 
> Cc: Jens Axboe <axboe at kernel.dk>
> Cc: Pavel Begunkov <asml.silence at gmail.com>
> Cc: io-uring at vger.kernel.org
> Signed-off-by: Kumar Kartikeya Dwivedi <memxor at gmail.com>
> ---
>   fs/io_uring.c | 140 +++++++++++++++++++++++++++++++++++++++++++++++++-
>   1 file changed, 139 insertions(+), 1 deletion(-)
> 
> diff --git a/fs/io_uring.c b/fs/io_uring.c
> index 9e9df6767e29..7ac479c95d4e 100644
> --- a/fs/io_uring.c
> +++ b/fs/io_uring.c
> @@ -11132,6 +11132,7 @@ __initcall(io_uring_init);
>   BTF_ID_LIST(btf_io_uring_ids)
>   BTF_ID(struct, io_ring_ctx)
>   BTF_ID(struct, io_mapped_ubuf)
> +BTF_ID(struct, file)
>   
>   struct bpf_io_uring_seq_info {
>   	struct io_ring_ctx *ctx;
> @@ -11312,11 +11313,148 @@ const struct bpf_func_proto bpf_page_to_pfn_proto = {
>   	.arg1_btf_id	= &btf_page_to_pfn_ids[0],
>   };
>   
> +/* io_uring iterator for registered files */
> +
> +struct bpf_iter__io_uring_file {
> +	__bpf_md_ptr(struct bpf_iter_meta *, meta);
> +	__bpf_md_ptr(struct io_ring_ctx *, ctx);
> +	__bpf_md_ptr(struct file *, file);
> +	unsigned long index;

change "unisnged long" to either u32 or u64, maybe just u64?

> +};
> +
> +static void *__bpf_io_uring_file_seq_get_next(struct bpf_io_uring_seq_info *info)
> +{
> +	struct file *file = NULL;
> +
> +	if (info->index < info->ctx->nr_user_files) {
> +		/* file set can be sparse */
> +		file = io_file_from_index(info->ctx, info->index++);
> +		/* use info as a distinct pointer to distinguish between empty
> +		 * slot and valid file, since we cannot return NULL for this
> +		 * case if we want iter prog to still be invoked with file ==
> +		 * NULL.
> +		 */
> +		if (!file)
> +			return info;
> +	}
> +
> +	return file;
> +}
> +
> +static void *bpf_io_uring_file_seq_start(struct seq_file *seq, loff_t *pos)
> +{
> +	struct bpf_io_uring_seq_info *info = seq->private;
> +	struct file *file;
> +
> +	/* Indicate to userspace that the uring lock is contended */
> +	if (!mutex_trylock(&info->ctx->uring_lock))
> +		return ERR_PTR(-EDEADLK);
> +
> +	file = __bpf_io_uring_file_seq_get_next(info);
> +	if (!file)
> +		return NULL;
> +
> +	if (*pos == 0)
> +		++*pos;
> +	return file;
> +}
> +
> +static void *bpf_io_uring_file_seq_next(struct seq_file *seq, void *v, loff_t *pos)
> +{
> +	struct bpf_io_uring_seq_info *info = seq->private;
> +
> +	++*pos;
> +	return __bpf_io_uring_file_seq_get_next(info);
> +}
> +
> +DEFINE_BPF_ITER_FUNC(io_uring_file, struct bpf_iter_meta *meta,
> +		     struct io_ring_ctx *ctx, struct file *file,
> +		     unsigned long index)

unsigned long => u64?
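
Also, for reference, a rough sketch of what a BPF-side consumer of this
iterator could look like (untested; it assumes a vmlinux.h generated from
a kernel with this series applied, and the SEC name just follows the usual
"iter/<target>" convention for DEFINE_BPF_ITER_FUNC):

  #include "vmlinux.h"
  #include <bpf/bpf_helpers.h>
  #include <bpf/bpf_tracing.h>    /* BPF_SEQ_PRINTF */

  char LICENSE[] SEC("license") = "GPL";

  SEC("iter/io_uring_file")
  int dump_io_uring_file(struct bpf_iter__io_uring_file *ctx)
  {
          struct seq_file *seq = ctx->meta->seq;
          struct file *file = ctx->file;

          /* sparse slots are still visited, with file == NULL */
          if (!file)
                  return 0;

          BPF_SEQ_PRINTF(seq, "index %lu ctx %p file %p\n",
                         ctx->index, ctx->ctx, file);
          return 0;
  }

A CRIU-style dumper could then correlate this output with the task_file
iterator's output to pair registered files with a task's fd table, as the
commit message describes.
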

> +
> +static int __bpf_io_uring_file_seq_show(struct seq_file *seq, void *v, bool in_stop)
> +{
> +	struct bpf_io_uring_seq_info *info = seq->private;
> +	struct bpf_iter__io_uring_file ctx;
> +	struct bpf_iter_meta meta;
> +	struct bpf_prog *prog;
> +
> +	meta.seq = seq;
> +	prog = bpf_iter_get_info(&meta, in_stop);
> +	if (!prog)
> +		return 0;
> +
> +	ctx.meta = &meta;
> +	ctx.ctx = info->ctx;
> +	/* when we encounter empty slot, v will point to info */
> +	ctx.file = v == info ? NULL : v;
> +	ctx.index = info->index ? info->index - !in_stop : 0;
> +
> +	return bpf_iter_run_prog(prog, &ctx);
> +}
> +
> +static int bpf_io_uring_file_seq_show(struct seq_file *seq, void *v)
> +{
> +	return __bpf_io_uring_file_seq_show(seq, v, false);
> +}
> +
[...]

