[CRIU] [PATCH v2 15/24] proc/fd: In proc_readfd_common use task_lookup_next_fd_rcu
Eric W. Biederman
ebiederm at xmission.com
Wed Dec 9 01:38:09 MSK 2020
Al Viro <viro at zeniv.linux.org.uk> writes:
> On Fri, Nov 20, 2020 at 05:14:32PM -0600, Eric W. Biederman wrote:
>> When discussing[1] exec and posix file locks it was realized that none
>> of the callers of get_files_struct fundamentally needed to call
>> get_files_struct, and that by switching them to helper functions
>> instead it will both simplify their code and remove unnecessary
>> increments of files_struct.count. Those unnecessary increments can
>> result in exec unnecessarily unsharing files_struct which breaking
>> posix locks, and it can result in fget_light having to fallback to
>> fget reducing system performance.
>>
>> Using task_lookup_next_fd_rcu simplifies proc_readfd_common, by moving
>> the checking for the maximum file descritor into the generic code, and
>> by remvoing the need for capturing and releasing a reference on
>> files_struct.
>>
>> As task_lookup_fd_rcu may update the fd ctx->pos has been changed
>> to be the fd +2 after task_lookup_fd_rcu returns.
>
>
>> + for (fd = ctx->pos - 2;; fd++) {
>> struct file *f;
>> struct fd_data data;
>> char name[10 + 1];
>> unsigned int len;
>>
>> - f = files_lookup_fd_rcu(files, fd);
>> + f = task_lookup_next_fd_rcu(p, &fd);
>
> Ugh... That makes for a massive cacheline pingpong on task_lock -
> instead of grabbing/dropping task_lock() once in the beginning, we do
> that for every damn descriptor.
>
> I really don't like this one. If anything, I would rather have
> a helper that would collect a bunch of pairs (fd,mode) into an
> array and have lookups batched into it. With the loop in that
> sucker grabbing a reasonable amount into a local array, then
> doing proc_fill_cache() for each collected.
Is there any reason we don't simply rcu free the files_struct?
That would remove the need for the task_lock entirely.
Something like this?
diff --git a/fs/file.c b/fs/file.c
index 412033d8cfdf..7aaeac9965f3 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -379,7 +379,7 @@ struct files_struct *dup_fd(struct files_struct *oldf, unsigned int max_fds, int
return NULL;
}
-static struct fdtable *close_files(struct files_struct * files)
+static void close_files(struct files_struct * files)
{
/*
* It is safe to dereference the fd table without RCU or
@@ -407,19 +407,25 @@ static struct fdtable *close_files(struct files_struct * files)
set >>= 1;
}
}
+}
- return fdt;
+static void free_files_struct_rcu(struct rcu_head *rcu)
+{
+ struct files_struct *files =
+ container_of(rcu, struct files_struct, fdtab.rcu);
+ struct fdtable *fdt = rcu_dereference_raw(files->fdt);
+
+ /* free the arrays if they are not embedded */
+ if (fdt != &files->fdtab)
+ __free_fdtable(fdt);
+ kmem_cache_free(files_cachep, files);
}
void put_files_struct(struct files_struct *files)
{
if (atomic_dec_and_test(&files->count)) {
- struct fdtable *fdt = close_files(files);
-
- /* free the arrays if they are not embedded */
- if (fdt != &files->fdtab)
- __free_fdtable(fdt);
- kmem_cache_free(files_cachep, files);
+ close_files(files);
+ call_rcu(&files->fdtab.rcu, free_files_struct_rcu);
}
}
@@ -822,12 +828,14 @@ EXPORT_SYMBOL(fget_raw);
struct file *fget_task(struct task_struct *task, unsigned int fd)
{
+ struct files_struct *files;
struct file *file = NULL;
- task_lock(task);
- if (task->files)
- file = __fget_files(task->files, fd, 0, 1);
- task_unlock(task);
+ rcu_read_lock();
+ files = rcu_dereference(task->files);
+ if (files)
+ file = __fget_files(files, fd, 0, 1);
+ rcu_read_unlock();
return file;
}
@@ -838,11 +846,9 @@ struct file *task_lookup_fd_rcu(struct task_struct *task, unsigned int fd)
struct files_struct *files;
struct file *file = NULL;
- task_lock(task);
- files = task->files;
+ files = rcu_dereference(task->files);
if (files)
file = files_lookup_fd_rcu(files, fd);
- task_unlock(task);
return file;
}
@@ -854,8 +860,7 @@ struct file *task_lookup_next_fd_rcu(struct task_struct *task, unsigned int *ret
unsigned int fd = *ret_fd;
struct file *file = NULL;
- task_lock(task);
- files = task->files;
+ files = rcu_dereference(task->files);
if (files) {
for (; fd < files_fdtable(files)->max_fds; fd++) {
file = files_lookup_fd_rcu(files, fd);
@@ -863,7 +868,6 @@ struct file *task_lookup_next_fd_rcu(struct task_struct *task, unsigned int *ret
break;
}
}
- task_unlock(task);
*ret_fd = fd;
return file;
}
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 063cd120b459..57cdedb39e31 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -934,7 +934,7 @@ struct task_struct {
struct fs_struct *fs;
/* Open file information: */
- struct files_struct *files;
+ struct files_struct __rcu *files;
#ifdef CONFIG_IO_URING
struct io_uring_task *io_uring;
Eric
More information about the CRIU
mailing list