[Devel] [PATCH RHEL COMMIT] ext4: replace ext4_kvmalloc() with kvmalloc()

Konstantin Khorenko khorenko at virtuozzo.com
Thu Sep 30 16:37:49 MSK 2021


Dropping the patch: mainstream (ms) code already uses kvmalloc(), which was the main idea of our series (to avoid
high-order memory allocations with kmalloc()).

--
Best regards,

Konstantin Khorenko,
Virtuozzo Linux Kernel Team

On 30.09.2021 16:03, Konstantin Khorenko wrote:
> The commit is pushed to "branch-rh9-5.14.vz9.1.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
> after ark-5.14
> ------>
> commit b90d6aec7e49f5fef4074b121ed67297781ff885
> Author: Oleg Babin <obabin at virtuozzo.com>
> Date:   Thu Sep 30 16:03:55 2021 +0300
> 
>      ext4: replace ext4_kvmalloc() with kvmalloc()
>      
>      ext4_kvmalloc() is used to allocate the table of group descriptors
>      blocks. It is called in GFP_NOFS context which is not vmalloc()
>      compatible so it cannot be directly replaced with kvmalloc().
>      
>      In order to use kvmalloc() with GFP_KERNEL flag the memory allocation
>      is moved from add_new_gdb() / add_new_gdb_meta_bg() functions to the
>      beginning of the resize process before any journaling is started and
>      any FS locks are taken.
>      
>      After this we do not need ext4_kvmalloc/ext4_kvfree functions any more.
>      
>      https://jira.sw.ru/browse/PSBM-83044
>      
>      Signed-off-by: Oleg Babin <obabin at virtuozzo.com>
>      
>      Signed-off-by: Jan Dakinevich <jan.dakinevich at virtuozzo.com>
>      
>      +++
>      ext4: fix out of bounds access in ext4_alloc_group_desc_bh_array()
>      
>      https://jira.sw.ru/browse/PSBM-87413
>      
>      mFixes: d695abe ("ext4: replace ext4_kvmalloc() with kvmalloc()")
>      Signed-off-by: Jan Dakinevich <jan.dakinevich at virtuozzo.com>
>      
>      (cherry-picked from vz7 commit cfd1ff8794a4 ("ext4: replace ext4_kvmalloc() with
>      kvmalloc()"))
>      
>      https://jira.sw.ru/browse/PSBM-127849
>      Signed-off-by: Valeriy Vdovin <valeriy.vdovin at virtuozzo.com>
>      
>      +++
>      ext4: Fix high probable use-after-free
>      
>      Here we have an even worse race than in mainstream.
>      
>      https://jira.sw.ru/browse/PSBM-101798
>      
>      mFixes: 86521524314e "ext4: replace ext4_kvmalloc() with kvmalloc()"
>      Signed-off-by: Kirill Tkhai <ktkhai at virtuozzo.com>
>      
>      (cherry picked from vz7 commit 601cc650f4ef ("ext4: Fix high probable
>      use-after-free"))
>      
>      mFixes: vz8 commit 5e0235ca2ae8 ("ext4: replace ext4_kvmalloc() with
>      kvmalloc()")
>      
>      In the scope of https://jira.sw.ru/browse/PSBM-127850
>      Signed-off-by: Konstantin Khorenko <khorenko at virtuozzo.com>
>      
>      (cherry picked from vz8 commit 781a1d7846ae4c0b1e9cb3e3ad0b8897242027fb)
>      Signed-off-by: Andrey Zhadchenko <andrey.zhadchenko at virtuozzo.com>
> ---
>   fs/ext4/ext4.h   |  2 ++
>   fs/ext4/resize.c | 49 ++++++++++---------------------------------------
>   fs/ext4/super.c  | 31 +++++++++++++++++++++++++++++++
>   3 files changed, 43 insertions(+), 39 deletions(-)
> 
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index 3c51e243450d..ca89e6880e24 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -3050,6 +3050,8 @@ extern int ext4_calculate_overhead(struct super_block *sb);
>   extern void ext4_superblock_csum_set(struct super_block *sb);
>   extern int ext4_alloc_flex_bg_array(struct super_block *sb,
>   				    ext4_group_t ngroup);
> +extern int ext4_alloc_group_desc_bh_array(struct super_block *sb,
> +					  ext4_group_t ngroup);
>   extern const char *ext4_decode_error(struct super_block *sb, int errno,
>   				     char nbuf[16]);
>   extern void ext4_mark_group_bitmap_corrupted(struct super_block *sb,
> diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
> index 7a9f1adef679..d9d97ee7dcbc 100644
> --- a/fs/ext4/resize.c
> +++ b/fs/ext4/resize.c
> @@ -797,7 +797,6 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
>   	struct ext4_super_block *es = EXT4_SB(sb)->s_es;
>   	unsigned long gdb_num = group / EXT4_DESC_PER_BLOCK(sb);
>   	ext4_fsblk_t gdblock = EXT4_SB(sb)->s_sbh->b_blocknr + 1 + gdb_num;
> -	struct buffer_head **o_group_desc, **n_group_desc = NULL;
>   	struct buffer_head *dind = NULL;
>   	struct buffer_head *gdb_bh = NULL;
>   	int gdbackups;
> @@ -858,15 +857,6 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
>   	if (unlikely(err))
>   		goto errout;
>   
> -	n_group_desc = kvmalloc((gdb_num + 1) * sizeof(struct buffer_head *),
> -				GFP_KERNEL);
> -	if (!n_group_desc) {
> -		err = -ENOMEM;
> -		ext4_warning(sb, "not enough memory for %lu groups",
> -			     gdb_num + 1);
> -		goto errout;
> -	}
> -
>   	/*
>   	 * Finally, we have all of the possible failures behind us...
>   	 *
> @@ -894,15 +884,8 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
>   	}
>   	brelse(dind);
>   
> -	rcu_read_lock();
> -	o_group_desc = rcu_dereference(EXT4_SB(sb)->s_group_desc);
> -	memcpy(n_group_desc, o_group_desc,
> -	       EXT4_SB(sb)->s_gdb_count * sizeof(struct buffer_head *));
> -	rcu_read_unlock();
> -	n_group_desc[gdb_num] = gdb_bh;
> -	rcu_assign_pointer(EXT4_SB(sb)->s_group_desc, n_group_desc);
> +	rcu_assign_pointer(EXT4_SB(sb)->s_group_desc[gdb_num], gdb_bh);
>   	EXT4_SB(sb)->s_gdb_count++;
> -	ext4_kvfree_array_rcu(o_group_desc);
>   
>   	lock_buffer(EXT4_SB(sb)->s_sbh);
>   	le16_add_cpu(&es->s_reserved_gdt_blocks, -1);
> @@ -913,7 +896,6 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
>   		ext4_std_error(sb, err);
>   	return err;
>   errout:
> -	kvfree(n_group_desc);
>   	brelse(iloc.bh);
>   	brelse(dind);
>   	brelse(gdb_bh);
> @@ -929,7 +911,6 @@ static int add_new_gdb_meta_bg(struct super_block *sb,
>   			       handle_t *handle, ext4_group_t group) {
>   	ext4_fsblk_t gdblock;
>   	struct buffer_head *gdb_bh;
> -	struct buffer_head **o_group_desc, **n_group_desc;
>   	unsigned long gdb_num = group / EXT4_DESC_PER_BLOCK(sb);
>   	int err;
>   
> @@ -938,34 +919,16 @@ static int add_new_gdb_meta_bg(struct super_block *sb,
>   	gdb_bh = ext4_sb_bread(sb, gdblock, 0);
>   	if (IS_ERR(gdb_bh))
>   		return PTR_ERR(gdb_bh);
> -	n_group_desc = kvmalloc((gdb_num + 1) * sizeof(struct buffer_head *),
> -				GFP_KERNEL);
> -	if (!n_group_desc) {
> -		brelse(gdb_bh);
> -		err = -ENOMEM;
> -		ext4_warning(sb, "not enough memory for %lu groups",
> -			     gdb_num + 1);
> -		return err;
> -	}
> -
> -	rcu_read_lock();
> -	o_group_desc = rcu_dereference(EXT4_SB(sb)->s_group_desc);
> -	memcpy(n_group_desc, o_group_desc,
> -	       EXT4_SB(sb)->s_gdb_count * sizeof(struct buffer_head *));
> -	rcu_read_unlock();
> -	n_group_desc[gdb_num] = gdb_bh;
>   
>   	BUFFER_TRACE(gdb_bh, "get_write_access");
>   	err = ext4_journal_get_write_access(handle, gdb_bh);
>   	if (err) {
> -		kvfree(n_group_desc);
>   		brelse(gdb_bh);
>   		return err;
>   	}
>   
> -	rcu_assign_pointer(EXT4_SB(sb)->s_group_desc, n_group_desc);
> +	rcu_assign_pointer(EXT4_SB(sb)->s_group_desc[gdb_num], gdb_bh);
>   	EXT4_SB(sb)->s_gdb_count++;
> -	ext4_kvfree_array_rcu(o_group_desc);
>   	return err;
>   }
>   
> @@ -1688,6 +1651,10 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
>   	if (err)
>   		goto out;
>   
> +	err = ext4_alloc_group_desc_bh_array(sb, input->group + 1);
> +	if (err)
> +		goto out;
> +
>   	err = ext4_mb_alloc_groupinfo(sb, input->group + 1);
>   	if (err)
>   		goto out;
> @@ -2066,6 +2033,10 @@ int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count)
>   	if (err)
>   		goto out;
>   
> +	err = ext4_alloc_group_desc_bh_array(sb, n_group + 1);
> +	if (err)
> +		goto out;
> +
>   	err = ext4_mb_alloc_groupinfo(sb, n_group + 1);
>   	if (err)
>   		goto out;
> diff --git a/fs/ext4/super.c b/fs/ext4/super.c
> index befbb0892fdd..0186d0421c2b 100644
> --- a/fs/ext4/super.c
> +++ b/fs/ext4/super.c
> @@ -2752,6 +2752,37 @@ int ext4_alloc_flex_bg_array(struct super_block *sb, ext4_group_t ngroup)
>   	return 0;
>   }
>   
> +/*
> + * Allocate the top-level s_group_desc array for the specified number
> + * of groups. As the memory is allocated before the journaling is started
> + * we can safely use kvmalloc() with GFP_KERNEL flag here.
> + */
> +int ext4_alloc_group_desc_bh_array(struct super_block *sb, ext4_group_t ngroup)
> +{
> +	struct ext4_sb_info *sbi = EXT4_SB(sb);
> +	unsigned long num_desc = DIV_ROUND_UP(ngroup,  EXT4_DESC_PER_BLOCK(sb));
> +	struct buffer_head **o_group_desc, **n_group_desc;
> +
> +	if (num_desc <= sbi->s_gdb_count)
> +		return 0;
> +
> +	n_group_desc = kvmalloc(num_desc * sizeof(struct buffer_head *),
> +				GFP_KERNEL);
> +	if (!n_group_desc) {
> +		ext4_warning(sb, "not enough memory for %lu groups", num_desc);
> +		return -ENOMEM;
> +	}
> +
> +	o_group_desc = sbi->s_group_desc;
> +	memcpy(n_group_desc, o_group_desc,
> +	       sbi->s_gdb_count * sizeof(struct buffer_head *));
> +	WRITE_ONCE(sbi->s_group_desc, n_group_desc);
> +
> +	/* FIXME: rcu is needed here. See ms commit 1d0c3924a92e */
> +	kvfree(o_group_desc);
> +	return 0;
> +}
> +
>   static int ext4_fill_flex_info(struct super_block *sb)
>   {
>   	struct ext4_sb_info *sbi = EXT4_SB(sb);
> .
> 


More information about the Devel mailing list