[Devel] [PATCH vz7 v2] ext4: replace ext4_kvmalloc() with kvmalloc()

Kirill Tkhai ktkhai at virtuozzo.com
Wed May 16 12:49:42 MSK 2018


On 11.05.2018 18:37, Oleg Babin wrote:
> ext4_kvmalloc() is used to allocate the table of group descriptor
> blocks. It is called in GFP_NOFS context, which is not vmalloc()
> compatible, so it cannot be directly replaced with kvmalloc().
> 
> In order to use kvmalloc() with GFP_KERNEL flag the memory allocation
> is moved from add_new_gdb() / add_new_gdb_meta_bg() functions to the
> beginning of the resize process before any journaling is started and
> any FS locks are taken.
> 
> After this we do not need ext4_kvmalloc/ext4_kvfree functions any more
> so remove them.
> 
> v2: replace incorrect 'return err' with 'goto' to a clean up code.
> 
> https://jira.sw.ru/browse/PSBM-83044
> Signed-off-by: Oleg Babin <obabin at virtuozzo.com>
> ---
>  fs/ext4/ext4.h    |   3 --
>  fs/ext4/mballoc.c |   6 +--
>  fs/ext4/resize.c  | 112 +++++++++++++++++++++++++++++++++++-------------------
>  fs/ext4/super.c   |  39 +++----------------
>  4 files changed, 80 insertions(+), 80 deletions(-)
> 
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index 00903ac..ceb69e8 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -2304,9 +2304,6 @@ extern int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count);
>  /* super.c */
>  extern int ext4_calculate_overhead(struct super_block *sb);
>  extern void ext4_superblock_csum_set(struct super_block *sb);
> -extern void *ext4_kvmalloc(size_t size, gfp_t flags);
> -extern void *ext4_kvzalloc(size_t size, gfp_t flags);
> -extern void ext4_kvfree(void *ptr);
>  extern int ext4_alloc_flex_bg_array(struct super_block *sb,
>  				    ext4_group_t ngroup);
>  extern const char *ext4_decode_error(struct super_block *sb, int errno,
> diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
> index b3ddecb..d9778e4 100644
> --- a/fs/ext4/mballoc.c
> +++ b/fs/ext4/mballoc.c
> @@ -2358,7 +2358,7 @@ int ext4_mb_alloc_groupinfo(struct super_block *sb, ext4_group_t ngroups)
>  	if (sbi->s_group_info) {
>  		memcpy(new_groupinfo, sbi->s_group_info,
>  		       sbi->s_group_info_size * sizeof(*sbi->s_group_info));
> -		ext4_kvfree(sbi->s_group_info);
> +		kvfree(sbi->s_group_info);
>  	}
>  	sbi->s_group_info = new_groupinfo;
>  	sbi->s_group_info_size = size / sizeof(*sbi->s_group_info);
> @@ -2495,7 +2495,7 @@ err_freebuddy:
>  		kfree(sbi->s_group_info[i]);
>  	iput(sbi->s_buddy_cache);
>  err_freesgi:
> -	ext4_kvfree(sbi->s_group_info);
> +	kvfree(sbi->s_group_info);
>  	return -ENOMEM;
>  }
>  
> @@ -2710,7 +2710,7 @@ int ext4_mb_release(struct super_block *sb)
>  			EXT4_DESC_PER_BLOCK_BITS(sb);
>  		for (i = 0; i < num_meta_group_infos; i++)
>  			kfree(sbi->s_group_info[i]);
> -		ext4_kvfree(sbi->s_group_info);
> +		kvfree(sbi->s_group_info);
>  	}
>  	kfree(sbi->s_mb_offsets);
>  	kfree(sbi->s_mb_maxs);
> diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
> index 218fd6f..ec2f5c3 100644
> --- a/fs/ext4/resize.c
> +++ b/fs/ext4/resize.c
> @@ -79,6 +79,11 @@ static ext4_grpblk_t ext4_group_overhead_blocks(struct super_block *sb,
>  	return overhead;
>  }
>  
> +static unsigned long num_desc_blocks(struct super_block *sb, ext4_group_t groups)
> +{
> +	return (groups + EXT4_DESC_PER_BLOCK(sb) - 1) / EXT4_DESC_PER_BLOCK(sb);
> +}
> +
>  #define outside(b, first, last)	((b) < (first) || (b) >= (last))
>  #define inside(b, first, last)	((b) >= (first) && (b) < (last))
>  
> @@ -738,6 +743,49 @@ static int verify_reserved_gdb(struct super_block *sb,
>  }
>  
>  /*
> + * Allocate the top-level s_group_desc array for the specified number
> + * of groups. As the memory is allocated before the journaling is started
> + * we can safely use kvmalloc() with GFP_KERNEL flag here.
> + */
> +static int ext4_alloc_group_desc_bh_array(struct super_block *sb,
> +					  ext4_group_t n_groups_count)
> +{
> +	struct ext4_sb_info *sbi = EXT4_SB(sb);
> +	unsigned long o_gdb_count = sbi->s_gdb_count;
> +	unsigned long n_gdb_count = num_desc_blocks(sb, n_groups_count);
> +	struct buffer_head **o_group_desc, **n_group_desc;
> +
> +	if (n_gdb_count <= o_gdb_count)
> +		return 0;
> +
> +	o_group_desc = sbi->s_group_desc;
> +	n_group_desc = kvmalloc(n_gdb_count * sizeof(struct buffer_head *),
> +				GFP_KERNEL);
> +	if (!n_group_desc) {
> +		ext4_warning(sb, "not enough memory for %lu groups", n_gdb_count);
> +		return -ENOMEM;
> +	}
> +
> +	memcpy(n_group_desc, o_group_desc,
> +	       o_gdb_count * sizeof(struct buffer_head *));
> +
> +	memset(n_group_desc + o_gdb_count * sizeof(struct buffer_head *),
> +	       0, (n_gdb_count - o_gdb_count) * sizeof(struct buffer_head *));
> +
> +	sbi->s_group_desc = n_group_desc;

We used to change sbi->s_group_desc in another place and in another context.
What protects this structure member from concurrent users? I don't
see anything particularly wrong, but the place has changed and we should
be sure that the protection still holds.

> +
> +	/*
> +	 * Do not update sbi->s_gdb_count here as it can be used later
> +	 * in reserve_backup_gdb(). The field will be upated later in
> +	 * add_new_gdb() / add_new_gdb_meta_bg().
> +	 */
> +
> +	kvfree(o_group_desc);
> +
> +	return 0;
> +}
> +
> +/*
>   * Called when we need to bring a reserved group descriptor table block into
>   * use from the resize inode.  The primary copy of the new GDT block currently
>   * is an indirect block (under the double indirect block in the resize inode).
> @@ -757,7 +805,6 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
>  	struct ext4_super_block *es = EXT4_SB(sb)->s_es;
>  	unsigned long gdb_num = group / EXT4_DESC_PER_BLOCK(sb);
>  	ext4_fsblk_t gdblock = EXT4_SB(sb)->s_sbh->b_blocknr + 1 + gdb_num;
> -	struct buffer_head **o_group_desc, **n_group_desc;
>  	struct buffer_head *dind;
>  	struct buffer_head *gdb_bh;
>  	int gdbackups;
> @@ -815,16 +862,6 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
>  	if (unlikely(err))
>  		goto exit_dind;
>  
> -	n_group_desc = ext4_kvmalloc((gdb_num + 1) *
> -				     sizeof(struct buffer_head *),
> -				     GFP_NOFS);
> -	if (!n_group_desc) {
> -		err = -ENOMEM;
> -		ext4_warning(sb, "not enough memory for %lu groups",
> -			     gdb_num + 1);
> -		goto exit_inode;
> -	}
> -
>  	/*
>  	 * Finally, we have all of the possible failures behind us...
>  	 *
> @@ -850,13 +887,14 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
>  	}
>  	brelse(dind);
>  
> -	o_group_desc = EXT4_SB(sb)->s_group_desc;
> -	memcpy(n_group_desc, o_group_desc,
> -	       EXT4_SB(sb)->s_gdb_count * sizeof(struct buffer_head *));
> -	n_group_desc[gdb_num] = gdb_bh;
> -	EXT4_SB(sb)->s_group_desc = n_group_desc;
> +	/*
> +	 * The s_group_desc array was reallocated at the beginning of the
> +	 * resize process but the s_gdb_count field had not been updated
> +	 * then because we use its old value in the reserve_backup_gdb().
> +	 */
> +	BUG_ON(gdb_num != EXT4_SB(sb)->s_gdb_count);
> +	EXT4_SB(sb)->s_group_desc[gdb_num] = gdb_bh;
>  	EXT4_SB(sb)->s_gdb_count++;
> -	ext4_kvfree(o_group_desc);
>  
>  	le16_add_cpu(&es->s_reserved_gdt_blocks, -1);
>  	err = ext4_handle_dirty_super(handle, sb);
> @@ -866,7 +904,6 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
>  	return err;
>  
>  exit_inode:
> -	ext4_kvfree(n_group_desc);
>  	brelse(iloc.bh);
>  exit_dind:
>  	brelse(dind);
> @@ -884,7 +921,6 @@ static int add_new_gdb_meta_bg(struct super_block *sb,
>  			       handle_t *handle, ext4_group_t group) {
>  	ext4_fsblk_t gdblock;
>  	struct buffer_head *gdb_bh;
> -	struct buffer_head **o_group_desc, **n_group_desc;
>  	unsigned long gdb_num = group / EXT4_DESC_PER_BLOCK(sb);
>  	int err;
>  
> @@ -893,23 +929,16 @@ static int add_new_gdb_meta_bg(struct super_block *sb,
>  	gdb_bh = sb_bread(sb, gdblock);
>  	if (!gdb_bh)
>  		return -EIO;
> -	n_group_desc = ext4_kvmalloc((gdb_num + 1) *
> -				     sizeof(struct buffer_head *),
> -				     GFP_NOFS);
> -	if (!n_group_desc) {
> -		err = -ENOMEM;
> -		ext4_warning(sb, "not enough memory for %lu groups",
> -			     gdb_num + 1);
> -		return err;
> -	}
>  
> -	o_group_desc = EXT4_SB(sb)->s_group_desc;
> -	memcpy(n_group_desc, o_group_desc,
> -	       EXT4_SB(sb)->s_gdb_count * sizeof(struct buffer_head *));
> -	n_group_desc[gdb_num] = gdb_bh;
> -	EXT4_SB(sb)->s_group_desc = n_group_desc;
> +	/*
> +	 * The s_group_desc array was reallocated at the beginning of the
> +	 * resize process but the s_gdb_count field had not been updated
> +	 * then because we use its old value in the reserve_backup_gdb().
> +	 */
> +	BUG_ON(gdb_num != EXT4_SB(sb)->s_gdb_count);
> +	EXT4_SB(sb)->s_group_desc[gdb_num] = gdb_bh;
>  	EXT4_SB(sb)->s_gdb_count++;
> -	ext4_kvfree(o_group_desc);
> +
>  	BUFFER_TRACE(gdb_bh, "get_write_access");
>  	err = ext4_journal_get_write_access(handle, gdb_bh);
>  	if (unlikely(err))
> @@ -1623,6 +1652,10 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
>  	if (err)
>  		goto out;
>  
> +	err = ext4_alloc_group_desc_bh_array(sb, input->group + 1);
> +	if (err)
> +		goto out;
> +

Hm. This patch adds a call to this function in two places. But it looks like
we could try to add it in a single place, say, in ext4_flex_group_add().
Does this function start in NOFS context? I doubt it, since at first sight
it is called in GFP_KERNEL context:

1)ext4_group_add()
   ext4_alloc_flex_bg_array()
     kvzalloc(GFP_KERNEL)
   ext4_mb_alloc_groupinfo()
     kvzalloc(GFP_KERNEL)
   ext4_flex_group_add()

2)ext4_resize_fs()
   ext4_mb_alloc_groupinfo()
     kvzalloc(GFP_KERNEL)
   alloc_flex_gd()
     kmalloc(GFP_NOFS) <--- GFP_NOFS here, but is it really required?
   ext4_flex_group_add()

   Could you please check whether it's possible to replace GFP_NOFS
   with GFP_KERNEL in alloc_flex_gd()? At first sight, I don't see
   what may change the context after ext4_mb_alloc_groupinfo(), which
   does not require GFP_NOFS.

If we can kill GFP_NOFS in alloc_flex_gd(), we may insert
ext4_alloc_group_desc_bh_array() in a single place somewhere at
the beginning of ext4_flex_group_add().

>  	err = ext4_alloc_flex_bg_array(sb, input->group + 1);
>  	if (err)
>  		goto out;
> @@ -1772,11 +1805,6 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
>  } /* ext4_group_extend */
>  
>  
> -static int num_desc_blocks(struct super_block *sb, ext4_group_t groups)
> -{
> -	return (groups + EXT4_DESC_PER_BLOCK(sb) - 1) / EXT4_DESC_PER_BLOCK(sb);
> -}
> -
>  /*
>   * Release the resize inode and drop the resize_inode feature if there
>   * are no more reserved gdt blocks, and then convert the file system
> @@ -1973,9 +2001,13 @@ retry:
>  	if (ext4_blocks_count(es) == n_blocks_count)
>  		goto out;
>  
> +	err = ext4_alloc_group_desc_bh_array(sb, n_group + 1);
> +	if (err)
> +		goto out;
> +
>  	err = ext4_alloc_flex_bg_array(sb, n_group + 1);
>  	if (err)
> -		return err;
> +		goto out;
>  
>  	err = ext4_mb_alloc_groupinfo(sb, n_group + 1);
>  	if (err)
> diff --git a/fs/ext4/super.c b/fs/ext4/super.c
> index f644bf0..3fbcf86 100644
> --- a/fs/ext4/super.c
> +++ b/fs/ext4/super.c
> @@ -157,35 +157,6 @@ void ext4_superblock_csum_set(struct super_block *sb)
>  	es->s_checksum = ext4_superblock_csum(sb, es);
>  }
>  
> -void *ext4_kvmalloc(size_t size, gfp_t flags)
> -{
> -	void *ret;
> -
> -	ret = kmalloc(size, flags);
> -	if (!ret)
> -		ret = __vmalloc(size, flags, PAGE_KERNEL);
> -	return ret;
> -}
> -
> -void *ext4_kvzalloc(size_t size, gfp_t flags)
> -{
> -	void *ret;
> -
> -	ret = kzalloc(size, flags);
> -	if (!ret)
> -		ret = __vmalloc(size, flags | __GFP_ZERO, PAGE_KERNEL);
> -	return ret;
> -}
> -
> -void ext4_kvfree(void *ptr)
> -{
> -	if (is_vmalloc_addr(ptr))
> -		vfree(ptr);
> -	else
> -		kfree(ptr);
> -
> -}
> -
>  ext4_fsblk_t ext4_block_bitmap(struct super_block *sb,
>  			       struct ext4_group_desc *bg)
>  {
> @@ -947,8 +918,8 @@ static void ext4_put_super(struct super_block *sb)
>  
>  	for (i = 0; i < sbi->s_gdb_count; i++)
>  		brelse(sbi->s_group_desc[i]);
> -	ext4_kvfree(sbi->s_group_desc);
> -	ext4_kvfree(sbi->s_flex_groups);
> +	kvfree(sbi->s_group_desc);
> +	kvfree(sbi->s_flex_groups);
>  	percpu_counter_destroy(&sbi->s_freeclusters_counter);
>  	percpu_counter_destroy(&sbi->s_freeinodes_counter);
>  	percpu_counter_destroy(&sbi->s_dirs_counter);
> @@ -2191,7 +2162,7 @@ int ext4_alloc_flex_bg_array(struct super_block *sb, ext4_group_t ngroup)
>  		memcpy(new_groups, sbi->s_flex_groups,
>  		       (sbi->s_flex_groups_allocated *
>  			sizeof(struct flex_groups)));
> -		ext4_kvfree(sbi->s_flex_groups);
> +		kvfree(sbi->s_flex_groups);

I don't see that sbi->s_flex_groups was allocated via ext4_kvmalloc().
So this looks like another functional change, a refactoring, made just
because you want to delete ext4_kvfree().

I'd suggest splitting your patch in two. The first patch will change
the place where EXT4_SB(sb)->s_group_desc is allocated. The second
one will remove the then-unused ext4_kvmalloc() and ext4_kvfree(), and replace
ext4_kvfree(sbi->s_flex_groups) with kvfree(sbi->s_flex_groups).

>  	}
>  	sbi->s_flex_groups = new_groups;
>  	sbi->s_flex_groups_allocated = size / sizeof(struct flex_groups);
> @@ -4674,7 +4645,7 @@ failed_mount7:
>  failed_mount6:
>  	ext4_mb_release(sb);
>  	if (sbi->s_flex_groups)
> -		ext4_kvfree(sbi->s_flex_groups);
> +		kvfree(sbi->s_flex_groups);
>  	percpu_counter_destroy(&sbi->s_freeclusters_counter);
>  	percpu_counter_destroy(&sbi->s_freeinodes_counter);
>  	percpu_counter_destroy(&sbi->s_dirs_counter);
> @@ -4706,7 +4677,7 @@ failed_mount3:
>  failed_mount2:
>  	for (i = 0; i < db_count; i++)
>  		brelse(sbi->s_group_desc[i]);
> -	ext4_kvfree(sbi->s_group_desc);
> +	kvfree(sbi->s_group_desc);
>  failed_mount:
>  	if (sbi->s_chksum_driver)
>  		crypto_free_shash(sbi->s_chksum_driver);

Kirill


More information about the Devel mailing list