[CRIU] [PATCH v2 15/24] proc/fd: In proc_readfd_common use task_lookup_next_fd_rcu

Eric W. Biederman ebiederm at xmission.com
Wed Dec 9 01:38:09 MSK 2020


Al Viro <viro at zeniv.linux.org.uk> writes:

> On Fri, Nov 20, 2020 at 05:14:32PM -0600, Eric W. Biederman wrote:
>> When discussing[1] exec and posix file locks it was realized that none
>> of the callers of get_files_struct fundamentally needed to call
>> get_files_struct, and that by switching them to helper functions
>> instead it will both simplify their code and remove unnecessary
>> increments of files_struct.count.  Those unnecessary increments can
>> result in exec unnecessarily unsharing files_struct which breaking
>> posix locks, and it can result in fget_light having to fallback to
>> fget reducing system performance.
>> 
>> Using task_lookup_next_fd_rcu simplifies proc_readfd_common, by moving
>> the checking for the maximum file descritor into the generic code, and
>> by remvoing the need for capturing and releasing a reference on
>> files_struct.
>> 
>> As task_lookup_fd_rcu may update the fd ctx->pos has been changed
>> to be the fd +2 after task_lookup_fd_rcu returns.
>
>
>> +	for (fd = ctx->pos - 2;; fd++) {
>>  		struct file *f;
>>  		struct fd_data data;
>>  		char name[10 + 1];
>>  		unsigned int len;
>>  
>> -		f = files_lookup_fd_rcu(files, fd);
>> +		f = task_lookup_next_fd_rcu(p, &fd);
>
> Ugh...  That makes for a massive cacheline pingpong on task_lock -
> instead of grabbing/dropping task_lock() once in the beginning, we do
> that for every damn descriptor.
>
> I really don't like this one.  If anything, I would rather have
> a helper that would collect a bunch of pairs (fd,mode) into an
> array and have lookups batched into it.  With the loop in that
> sucker grabbing a reasonable amount into a local array, then
> doing proc_fill_cache() for each collected.

Is there any reason we don't simply rcu free the files_struct?
That would remove the need for the task_lock entirely.

Something like this?


diff --git a/fs/file.c b/fs/file.c
index 412033d8cfdf..7aaeac9965f3 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -379,7 +379,7 @@ struct files_struct *dup_fd(struct files_struct *oldf, unsigned int max_fds, int
 	return NULL;
 }
 
-static struct fdtable *close_files(struct files_struct * files)
+static void close_files(struct files_struct * files)
 {
 	/*
 	 * It is safe to dereference the fd table without RCU or
@@ -407,19 +407,25 @@ static struct fdtable *close_files(struct files_struct * files)
 			set >>= 1;
 		}
 	}
+}
 
-	return fdt;
+static void free_files_struct_rcu(struct rcu_head *rcu)
+{
+	struct files_struct *files =
+		container_of(rcu, struct files_struct, fdtab.rcu);
+	struct fdtable *fdt = rcu_dereference_raw(files->fdt);
+
+	/* free the arrays if they are not embedded */
+	if (fdt != &files->fdtab)
+		__free_fdtable(fdt);
+	kmem_cache_free(files_cachep, files);
 }
 
 void put_files_struct(struct files_struct *files)
 {
 	if (atomic_dec_and_test(&files->count)) {
-		struct fdtable *fdt = close_files(files);
-
-		/* free the arrays if they are not embedded */
-		if (fdt != &files->fdtab)
-			__free_fdtable(fdt);
-		kmem_cache_free(files_cachep, files);
+		close_files(files);
+		call_rcu(&files->fdtab.rcu, free_files_struct_rcu);
 	}
 }
 
@@ -822,12 +828,14 @@ EXPORT_SYMBOL(fget_raw);
 
 struct file *fget_task(struct task_struct *task, unsigned int fd)
 {
+	struct files_struct *files;
 	struct file *file = NULL;
 
-	task_lock(task);
-	if (task->files)
-		file = __fget_files(task->files, fd, 0, 1);
-	task_unlock(task);
+	rcu_read_lock();
+	files = rcu_dereference(task->files);
+	if (files)
+		file = __fget_files(files, fd, 0, 1);
+	rcu_read_unlock();
 
 	return file;
 }
@@ -838,11 +846,9 @@ struct file *task_lookup_fd_rcu(struct task_struct *task, unsigned int fd)
 	struct files_struct *files;
 	struct file *file = NULL;
 
-	task_lock(task);
-	files = task->files;
+	files = rcu_dereference(task->files);
 	if (files)
 		file = files_lookup_fd_rcu(files, fd);
-	task_unlock(task);
 
 	return file;
 }
@@ -854,8 +860,7 @@ struct file *task_lookup_next_fd_rcu(struct task_struct *task, unsigned int *ret
 	unsigned int fd = *ret_fd;
 	struct file *file = NULL;
 
-	task_lock(task);
-	files = task->files;
+	files = rcu_dereference(task->files);
 	if (files) {
 		for (; fd < files_fdtable(files)->max_fds; fd++) {
 			file = files_lookup_fd_rcu(files, fd);
@@ -863,7 +868,6 @@ struct file *task_lookup_next_fd_rcu(struct task_struct *task, unsigned int *ret
 				break;
 		}
 	}
-	task_unlock(task);
 	*ret_fd = fd;
 	return file;
 }
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 063cd120b459..57cdedb39e31 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -934,7 +934,7 @@ struct task_struct {
 	struct fs_struct		*fs;
 
 	/* Open file information: */
-	struct files_struct		*files;
+	struct files_struct __rcu	*files;
 
 #ifdef CONFIG_IO_URING
 	struct io_uring_task		*io_uring;

Eric


More information about the CRIU mailing list