[CRIU] [PATCH bpf-next v1 3/8] io_uring: Implement eBPF iterator for registered files
Yonghong Song
yhs at fb.com
Thu Nov 18 20:33:06 MSK 2021
On 11/15/21 9:42 PM, Kumar Kartikeya Dwivedi wrote:
> This change adds eBPF iterator for buffers registered in io_uring ctx.
> It gives access to the ctx, the index of the registered buffer, and a
> pointer to the struct file itself. This allows the iterator to save
> info related to the file added to an io_uring instance, that isn't easy
> to export using the fdinfo interface (like being able to match
> registered files to a task's file set). Getting access to underlying
> struct file allows deduplication and efficient pairing with task file
> set (obtained using task_file iterator).
>
> The primary usecase this is enabling is checkpoint/restore support.
>
> Note that we need to use mutex_trylock when the file is read from, in
> seq_start functions, as the order of lock taken is opposite of what it
> would be when io_uring operation reads the same file. We take
> seq_file->lock, then ctx->uring_lock, while io_uring would first take
> ctx->uring_lock and then seq_file->lock for the same ctx.
>
> This can lead to a deadlock scenario described below:
>
> CPU 0                              CPU 1
>
> vfs_read
> mutex_lock(&seq_file->lock)        io_read
>                                    mutex_lock(&ctx->uring_lock)
> mutex_lock(&ctx->uring_lock)       mutex_lock(&seq_file->lock)
>   ^ switched to mutex_trylock
>
> The trylock also protects the case where io_uring tries to read from
> iterator attached to itself (same ctx), where the order of locks would
> be:
> io_uring_enter
> mutex_lock(&ctx->uring_lock) <-----------.
> io_read \
> seq_read \
> mutex_lock(&seq_file->lock) /
> mutex_lock(&ctx->uring_lock) # deadlock-`
>
> In both these cases (recursive read and contended uring_lock), -EDEADLK
> is returned to userspace.
>
> With the advent of descriptorless files supported by io_uring, this
> iterator provides the required visibility and introspection of io_uring
> instance for the purposes of dumping and restoring it.
>
> In the future, this iterator will be extended to support direct
> inspection of a lot of file state (currently descriptorless files
> are obtained using openat2 and socket) to dump file state for these
> hidden files. Later, we can explore filling in the gaps for dumping
> file state for more file types (those not hidden in io_uring ctx).
> All this is out of scope for the current series however, but builds
> upon this iterator.
>
> Cc: Jens Axboe <axboe at kernel.dk>
> Cc: Pavel Begunkov <asml.silence at gmail.com>
> Cc: io-uring at vger.kernel.org
> Signed-off-by: Kumar Kartikeya Dwivedi <memxor at gmail.com>
> ---
> fs/io_uring.c | 140 +++++++++++++++++++++++++++++++++++++++++++++++++-
> 1 file changed, 139 insertions(+), 1 deletion(-)
>
> diff --git a/fs/io_uring.c b/fs/io_uring.c
> index 9e9df6767e29..7ac479c95d4e 100644
> --- a/fs/io_uring.c
> +++ b/fs/io_uring.c
> @@ -11132,6 +11132,7 @@ __initcall(io_uring_init);
> BTF_ID_LIST(btf_io_uring_ids)
> BTF_ID(struct, io_ring_ctx)
> BTF_ID(struct, io_mapped_ubuf)
> +BTF_ID(struct, file)
>
> struct bpf_io_uring_seq_info {
> struct io_ring_ctx *ctx;
> @@ -11312,11 +11313,148 @@ const struct bpf_func_proto bpf_page_to_pfn_proto = {
> .arg1_btf_id = &btf_page_to_pfn_ids[0],
> };
>
> +/* io_uring iterator for registered files */
> +
> +struct bpf_iter__io_uring_file {
> + __bpf_md_ptr(struct bpf_iter_meta *, meta);
> + __bpf_md_ptr(struct io_ring_ctx *, ctx);
> + __bpf_md_ptr(struct file *, file);
> + unsigned long index;
change "unsigned long" to either u32 or u64, maybe just u64?
> +};
> +
> +static void *__bpf_io_uring_file_seq_get_next(struct bpf_io_uring_seq_info *info)
> +{
> + struct file *file = NULL;
> +
> + if (info->index < info->ctx->nr_user_files) {
> + /* file set can be sparse */
> + file = io_file_from_index(info->ctx, info->index++);
> + /* use info as a distinct pointer to distinguish between empty
> + * slot and valid file, since we cannot return NULL for this
> + * case if we want iter prog to still be invoked with file ==
> + * NULL.
> + */
> + if (!file)
> + return info;
> + }
> +
> + return file;
> +}
> +
> +static void *bpf_io_uring_file_seq_start(struct seq_file *seq, loff_t *pos)
> +{
> + struct bpf_io_uring_seq_info *info = seq->private;
> + struct file *file;
> +
> + /* Indicate to userspace that the uring lock is contended */
> + if (!mutex_trylock(&info->ctx->uring_lock))
> + return ERR_PTR(-EDEADLK);
> +
> + file = __bpf_io_uring_file_seq_get_next(info);
> + if (!file)
> + return NULL;
> +
> + if (*pos == 0)
> + ++*pos;
> + return file;
> +}
> +
> +static void *bpf_io_uring_file_seq_next(struct seq_file *seq, void *v, loff_t *pos)
> +{
> + struct bpf_io_uring_seq_info *info = seq->private;
> +
> + ++*pos;
> + return __bpf_io_uring_file_seq_get_next(info);
> +}
> +
> +DEFINE_BPF_ITER_FUNC(io_uring_file, struct bpf_iter_meta *meta,
> + struct io_ring_ctx *ctx, struct file *file,
> + unsigned long index)
unsigned long => u64?
> +
> +static int __bpf_io_uring_file_seq_show(struct seq_file *seq, void *v, bool in_stop)
> +{
> + struct bpf_io_uring_seq_info *info = seq->private;
> + struct bpf_iter__io_uring_file ctx;
> + struct bpf_iter_meta meta;
> + struct bpf_prog *prog;
> +
> + meta.seq = seq;
> + prog = bpf_iter_get_info(&meta, in_stop);
> + if (!prog)
> + return 0;
> +
> + ctx.meta = &meta;
> + ctx.ctx = info->ctx;
> + /* when we encounter empty slot, v will point to info */
> + ctx.file = v == info ? NULL : v;
> + ctx.index = info->index ? info->index - !in_stop : 0;
> +
> + return bpf_iter_run_prog(prog, &ctx);
> +}
> +
> +static int bpf_io_uring_file_seq_show(struct seq_file *seq, void *v)
> +{
> + return __bpf_io_uring_file_seq_show(seq, v, false);
> +}
> +
[...]
More information about the CRIU
mailing list