[Devel] [PATCH RHEL7 COMMIT] ext4: fix potential race between online resizing and write operations

Konstantin Khorenko khorenko at virtuozzo.com
Fri Mar 6 18:28:55 MSK 2020


The commit is pushed to "branch-rh7-3.10.0-1062.12.1.vz7.131.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-1062.12.1.vz7.131.8
------>
commit c5564a68e660418562d7758f6adc8eb10ca4d497
Author: Theodore Ts'o <tytso at mit.edu>
Date:   Fri Mar 6 18:28:55 2020 +0300

    ext4: fix potential race between online resizing and write operations
    
    ms commit 1d0c3924a92e
    
    During an online resize an array of pointers to buffer heads gets
    replaced so it can get enlarged.  If there is a racing block
    allocation or deallocation which uses the old array, and the old array
    has gotten reused this can lead to a GPF or some other random kernel
    memory getting modified.
    
    Link: https://bugzilla.kernel.org/show_bug.cgi?id=206443
    Link: https://lore.kernel.org/r/20200221053458.730016-2-tytso@mit.edu
    Reported-by: Suraj Jitindar Singh <surajjs at amazon.com>
    Signed-off-by: Theodore Ts'o <tytso at mit.edu>
    Cc: stable at kernel.org
    
    https://jira.sw.ru/browse/PSBM-101798
    [ktkhai: adopted for our kernel]
    
    Signed-off-by: Kirill Tkhai <ktkhai at virtuozzo.com>
---
 fs/ext4/balloc.c | 14 +++++++++++---
 fs/ext4/ext4.h   | 20 +++++++++++++++++++-
 fs/ext4/resize.c | 48 +++++++++++++++++++++++++++++++++++++++++-------
 fs/ext4/super.c  | 39 ++++++++++++++++++++++++++-------------
 4 files changed, 97 insertions(+), 24 deletions(-)

diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 0cad980666963..27862962791ef 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -281,6 +281,7 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb,
 	ext4_group_t ngroups = ext4_get_groups_count(sb);
 	struct ext4_group_desc *desc;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct buffer_head *bh_p;
 
 	if (block_group >= ngroups) {
 		ext4_error(sb, "block_group >= groups_count - block_group = %u,"
@@ -291,7 +292,14 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb,
 
 	group_desc = block_group >> EXT4_DESC_PER_BLOCK_BITS(sb);
 	offset = block_group & (EXT4_DESC_PER_BLOCK(sb) - 1);
-	if (!sbi->s_group_desc[group_desc]) {
+	bh_p = sbi_array_rcu_deref(sbi, s_group_desc, group_desc);
+	/*
+	 * sbi_array_rcu_deref returns with rcu unlocked, this is ok since
+	 * the pointer being dereferenced won't be dereferenced again. By
+	 * looking at the usage in add_new_gdb() the value isn't modified,
+	 * just the pointer, and so it remains valid.
+	 */
+	if (!bh_p) {
 		ext4_error(sb, "Group descriptor not loaded - "
 			   "block_group = %u, group_desc = %u, desc = %u",
 			   block_group, group_desc, offset);
@@ -299,10 +307,10 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb,
 	}
 
 	desc = (struct ext4_group_desc *)(
-		(__u8 *)sbi->s_group_desc[group_desc]->b_data +
+		(__u8 *)bh_p->b_data +
 		offset * EXT4_DESC_SIZE(sb));
 	if (bh)
-		*bh = sbi->s_group_desc[group_desc];
+		*bh = bh_p;
 	return desc;
 }
 
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index db93aa5b6a9c6..0d919cb545a65 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1285,7 +1285,7 @@ struct ext4_sb_info {
 	loff_t s_bitmap_maxbytes;	/* max bytes for bitmap files */
 	struct buffer_head * s_sbh;	/* Buffer containing the super block */
 	struct ext4_super_block *s_es;	/* Pointer to the super block in the buffer */
-	struct buffer_head **s_group_desc;
+	struct buffer_head * __rcu *s_group_desc;
 	unsigned int s_mount_opt;
 	unsigned int s_mount_opt2;
 	unsigned int s_mount_flags;
@@ -1478,6 +1478,23 @@ static inline void ext4_inode_aio_set(struct inode *inode, ext4_io_end_t *io)
 	inode->i_private = io;
 }
 
+/*
+ * Returns: sbi->field[index]
+ * Used to access an array element from the following sbi fields which require
+ * rcu protection to avoid dereferencing an invalid pointer due to reassignment
+ * - s_group_desc
+ * - s_group_info
+ * - s_flex_group
+ */
+#define sbi_array_rcu_deref(sbi, field, index)				   \
+({									   \
+	typeof(*((sbi)->field)) _v;					   \
+	rcu_read_lock();						   \
+	_v = ((typeof(_v)*)rcu_dereference((sbi)->field))[index];	   \
+	rcu_read_unlock();						   \
+	_v;								   \
+})
+
 /*
  * Inode dynamic state flags
  */
@@ -2279,6 +2296,7 @@ extern int ext4_generic_delete_entry(handle_t *handle,
 				     int csum_size);
 
 /* resize.c */
+extern void ext4_kvfree_array_rcu(void *to_free);
 extern int ext4_group_add(struct super_block *sb,
 				struct ext4_new_group_data *input);
 extern int ext4_group_extend(struct super_block *sb,
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 60aef84eb8542..821f30f83b4e7 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -16,6 +16,33 @@
 
 #include "ext4_jbd2.h"
 
+struct ext4_rcu_ptr {
+	struct rcu_head rcu;
+	void *ptr;
+};
+
+static void ext4_rcu_ptr_callback(struct rcu_head *head)
+{
+	struct ext4_rcu_ptr *ptr;
+
+	ptr = container_of(head, struct ext4_rcu_ptr, rcu);
+	kvfree(ptr->ptr);
+	kfree(ptr);
+}
+
+void ext4_kvfree_array_rcu(void *to_free)
+{
+	struct ext4_rcu_ptr *ptr = kzalloc(sizeof(*ptr), GFP_KERNEL);
+
+	if (ptr) {
+		ptr->ptr = to_free;
+		call_rcu(&ptr->rcu, ext4_rcu_ptr_callback);
+		return;
+	}
+	synchronize_rcu();
+	kvfree(to_free);
+}
+
 int ext4_resize_begin(struct super_block *sb)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -541,8 +568,8 @@ static int setup_new_flex_group_blocks(struct super_block *sb,
 				brelse(gdb);
 				goto out;
 			}
-			memcpy(gdb->b_data, sbi->s_group_desc[j]->b_data,
-			       gdb->b_size);
+			memcpy(gdb->b_data, sbi_array_rcu_deref(sbi,
+				s_group_desc, j)->b_data, gdb->b_size);
 			set_buffer_uptodate(gdb);
 
 			err = ext4_handle_dirty_metadata(handle, NULL, gdb);
@@ -839,7 +866,9 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
 	}
 	brelse(dind);
 
-	EXT4_SB(sb)->s_group_desc[gdb_num] = gdb_bh;
+	rcu_read_lock();
+	rcu_dereference(EXT4_SB(sb)->s_group_desc)[gdb_num] = gdb_bh;
+	rcu_read_unlock();
 	EXT4_SB(sb)->s_gdb_count++;
 
 	le16_add_cpu(&es->s_reserved_gdt_blocks, -1);
@@ -876,8 +905,11 @@ static int add_new_gdb_meta_bg(struct super_block *sb,
 	if (!gdb_bh)
 		return -EIO;
 
-	EXT4_SB(sb)->s_group_desc[gdb_num] = gdb_bh;
+	rcu_read_lock();
+	rcu_dereference(EXT4_SB(sb)->s_group_desc)[gdb_num] = gdb_bh;
+	rcu_read_unlock();
 	EXT4_SB(sb)->s_gdb_count++;
+
 	BUFFER_TRACE(gdb_bh, "get_write_access");
 	err = ext4_journal_get_write_access(handle, gdb_bh);
 	if (unlikely(err))
@@ -1143,7 +1175,8 @@ static int ext4_add_new_descs(handle_t *handle, struct super_block *sb,
 		 * use non-sparse filesystems anymore.  This is already checked above.
 		 */
 		if (gdb_off) {
-			gdb_bh = sbi->s_group_desc[gdb_num];
+			gdb_bh = sbi_array_rcu_deref(sbi, s_group_desc,
+						     gdb_num);
 			BUFFER_TRACE(gdb_bh, "get_write_access");
 			err = ext4_journal_get_write_access(handle, gdb_bh);
 
@@ -1225,7 +1258,7 @@ static int ext4_setup_new_descs(handle_t *handle, struct super_block *sb,
 		/*
 		 * get_write_access() has been called on gdb_bh by ext4_add_new_desc().
 		 */
-		gdb_bh = sbi->s_group_desc[gdb_num];
+		gdb_bh = sbi_array_rcu_deref(sbi, s_group_desc, gdb_num);
 		/* Update group descriptor block for new group */
 		gdp = (struct ext4_group_desc *)(gdb_bh->b_data +
 						 gdb_off * EXT4_DESC_SIZE(sb));
@@ -1455,7 +1488,8 @@ static int ext4_flex_group_add(struct super_block *sb,
 		for (; gdb_num <= gdb_num_end; gdb_num++) {
 			struct buffer_head *gdb_bh;
 
-			gdb_bh = sbi->s_group_desc[gdb_num];
+			gdb_bh = sbi_array_rcu_deref(sbi, s_group_desc,
+						     gdb_num);
 			if (old_gdb == gdb_bh->b_blocknr)
 				continue;
 			update_backups(sb, gdb_bh->b_blocknr, gdb_bh->b_data,
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 3adeeb00f118f..f752e0190cdf2 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -893,6 +893,7 @@ static inline void ext4_quota_off_umount(struct super_block *sb)
 static void ext4_put_super(struct super_block *sb)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct buffer_head **group_desc;
 	struct ext4_super_block *es = sbi->s_es;
 	int aborted = 0;
 	struct workqueue_struct * rsv_conversion_wq = sbi->rsv_conversion_wq;
@@ -935,9 +936,12 @@ static void ext4_put_super(struct super_block *sb)
 	}
 	kobject_del(&sbi->s_kobj);
 
+	rcu_read_lock();
+	group_desc = rcu_dereference(sbi->s_group_desc);
 	for (i = 0; i < sbi->s_gdb_count; i++)
-		brelse(sbi->s_group_desc[i]);
-	kvfree(sbi->s_group_desc);
+		brelse(group_desc[i]);
+	kvfree(group_desc);
+	rcu_read_unlock();
 	kvfree(sbi->s_flex_groups);
 	percpu_counter_destroy(&sbi->s_freeclusters_counter);
 	percpu_counter_destroy(&sbi->s_freeinodes_counter);
@@ -2208,13 +2212,14 @@ int ext4_alloc_group_desc_bh_array(struct super_block *sb, ext4_group_t ngroup)
 		return -ENOMEM;
 	}
 
-	o_group_desc = sbi->s_group_desc;
+	rcu_read_lock();
+	o_group_desc = rcu_dereference(EXT4_SB(sb)->s_group_desc);
 	memcpy(n_group_desc, o_group_desc,
 	       sbi->s_gdb_count * sizeof(struct buffer_head *));
-	WRITE_ONCE(sbi->s_group_desc, n_group_desc);
+	rcu_read_unlock();
+	rcu_assign_pointer(EXT4_SB(sb)->s_group_desc, n_group_desc);
 
-	/* FIXME: rcu is needed here. See ms commit 1d0c3924a92e */
-	kvfree(o_group_desc);
+	ext4_kvfree_array_rcu(o_group_desc);
 	return 0;
 }
 
@@ -3854,7 +3859,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 {
 	struct dax_device *dax_dev = fs_dax_get_by_bdev(sb->s_bdev);
 	char *orig_data = kstrdup(data, GFP_KERNEL);
-	struct buffer_head *bh;
+	struct buffer_head *bh, **group_desc;
 	struct ext4_super_block *es = NULL;
 	struct ext4_sb_info *sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
 	ext4_fsblk_t block;
@@ -4417,9 +4422,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 			goto failed_mount;
 		}
 	}
-	sbi->s_group_desc = kvmalloc(db_count *
+	rcu_assign_pointer(sbi->s_group_desc,
+			   kvmalloc(db_count *
 					  sizeof(struct buffer_head *),
-					  GFP_KERNEL);
+					  GFP_KERNEL));
 	if (sbi->s_group_desc == NULL) {
 		ext4_msg(sb, KERN_ERR, "not enough memory");
 		ret = -ENOMEM;
@@ -4436,14 +4442,18 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	bgl_lock_init(sbi->s_blockgroup_lock);
 
 	for (i = 0; i < db_count; i++) {
+		struct buffer_head *tmp_bh;
 		block = descriptor_loc(sb, logical_sb_block, i);
-		sbi->s_group_desc[i] = sb_bread(sb, block);
-		if (!sbi->s_group_desc[i]) {
+		tmp_bh = sb_bread(sb, block);
+		if (!tmp_bh) {
 			ext4_msg(sb, KERN_ERR,
 			       "can't read group descriptor %d", i);
 			db_count = i;
 			goto failed_mount2;
 		}
+		rcu_read_lock();
+		rcu_dereference(sbi->s_group_desc)[i] = tmp_bh;
+		rcu_read_unlock();
 	}
 	sbi->s_gdb_count = db_count;
 	if (!ext4_check_descriptors(sb, logical_sb_block, &first_not_zeroed)) {
@@ -4815,9 +4825,12 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	if (sbi->s_mmp_tsk)
 		kthread_stop(sbi->s_mmp_tsk);
 failed_mount2:
+	rcu_read_lock();
+	group_desc = rcu_dereference(sbi->s_group_desc);
 	for (i = 0; i < db_count; i++)
-		brelse(sbi->s_group_desc[i]);
-	kvfree(sbi->s_group_desc);
+		brelse(group_desc[i]);
+	kvfree(group_desc);
+	rcu_read_unlock();
 failed_mount:
 	if (sbi->s_chksum_driver)
 		crypto_free_shash(sbi->s_chksum_driver);


More information about the Devel mailing list