[Devel] [PATCH vz7 v2] ext4: replace ext4_kvmalloc() with kvmalloc()
Kirill Tkhai
ktkhai at virtuozzo.com
Wed May 16 12:49:42 MSK 2018
On 11.05.2018 18:37, Oleg Babin wrote:
> ext4_kvmalloc() is used to allocate the table of group descriptor
> blocks. It is called in GFP_NOFS context which is not vmalloc()
> compatible so it cannot be directly replaced with kvmalloc().
>
> In order to use kvmalloc() with GFP_KERNEL flag the memory allocation
> is moved from add_new_gdb() / add_new_gdb_meta_bg() functions to the
> beginning of the resize process before any journaling is started and
> any FS locks are taken.
>
> After this we do not need ext4_kvmalloc/ext4_kvfree functions any more
> so remove them.
>
> v2: replace incorrect 'return err' with 'goto' to the cleanup code.
>
> https://jira.sw.ru/browse/PSBM-83044
> Signed-off-by: Oleg Babin <obabin at virtuozzo.com>
> ---
> fs/ext4/ext4.h | 3 --
> fs/ext4/mballoc.c | 6 +--
> fs/ext4/resize.c | 112 +++++++++++++++++++++++++++++++++++-------------------
> fs/ext4/super.c | 39 +++----------------
> 4 files changed, 80 insertions(+), 80 deletions(-)
>
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index 00903ac..ceb69e8 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -2304,9 +2304,6 @@ extern int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count);
> /* super.c */
> extern int ext4_calculate_overhead(struct super_block *sb);
> extern void ext4_superblock_csum_set(struct super_block *sb);
> -extern void *ext4_kvmalloc(size_t size, gfp_t flags);
> -extern void *ext4_kvzalloc(size_t size, gfp_t flags);
> -extern void ext4_kvfree(void *ptr);
> extern int ext4_alloc_flex_bg_array(struct super_block *sb,
> ext4_group_t ngroup);
> extern const char *ext4_decode_error(struct super_block *sb, int errno,
> diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
> index b3ddecb..d9778e4 100644
> --- a/fs/ext4/mballoc.c
> +++ b/fs/ext4/mballoc.c
> @@ -2358,7 +2358,7 @@ int ext4_mb_alloc_groupinfo(struct super_block *sb, ext4_group_t ngroups)
> if (sbi->s_group_info) {
> memcpy(new_groupinfo, sbi->s_group_info,
> sbi->s_group_info_size * sizeof(*sbi->s_group_info));
> - ext4_kvfree(sbi->s_group_info);
> + kvfree(sbi->s_group_info);
> }
> sbi->s_group_info = new_groupinfo;
> sbi->s_group_info_size = size / sizeof(*sbi->s_group_info);
> @@ -2495,7 +2495,7 @@ err_freebuddy:
> kfree(sbi->s_group_info[i]);
> iput(sbi->s_buddy_cache);
> err_freesgi:
> - ext4_kvfree(sbi->s_group_info);
> + kvfree(sbi->s_group_info);
> return -ENOMEM;
> }
>
> @@ -2710,7 +2710,7 @@ int ext4_mb_release(struct super_block *sb)
> EXT4_DESC_PER_BLOCK_BITS(sb);
> for (i = 0; i < num_meta_group_infos; i++)
> kfree(sbi->s_group_info[i]);
> - ext4_kvfree(sbi->s_group_info);
> + kvfree(sbi->s_group_info);
> }
> kfree(sbi->s_mb_offsets);
> kfree(sbi->s_mb_maxs);
> diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
> index 218fd6f..ec2f5c3 100644
> --- a/fs/ext4/resize.c
> +++ b/fs/ext4/resize.c
> @@ -79,6 +79,11 @@ static ext4_grpblk_t ext4_group_overhead_blocks(struct super_block *sb,
> return overhead;
> }
>
> +static unsigned long num_desc_blocks(struct super_block *sb, ext4_group_t groups)
> +{
> + return (groups + EXT4_DESC_PER_BLOCK(sb) - 1) / EXT4_DESC_PER_BLOCK(sb);
> +}
> +
> #define outside(b, first, last) ((b) < (first) || (b) >= (last))
> #define inside(b, first, last) ((b) >= (first) && (b) < (last))
>
> @@ -738,6 +743,49 @@ static int verify_reserved_gdb(struct super_block *sb,
> }
>
> /*
> + * Allocate the top-level s_group_desc array for the specified number
> + * of groups. As the memory is allocated before the journaling is started
> + * we can safely use kvmalloc() with GFP_KERNEL flag here.
> + */
> +static int ext4_alloc_group_desc_bh_array(struct super_block *sb,
> + ext4_group_t n_groups_count)
> +{
> + struct ext4_sb_info *sbi = EXT4_SB(sb);
> + unsigned long o_gdb_count = sbi->s_gdb_count;
> + unsigned long n_gdb_count = num_desc_blocks(sb, n_groups_count);
> + struct buffer_head **o_group_desc, **n_group_desc;
> +
> + if (n_gdb_count <= o_gdb_count)
> + return 0;
> +
> + o_group_desc = sbi->s_group_desc;
> + n_group_desc = kvmalloc(n_gdb_count * sizeof(struct buffer_head *),
> + GFP_KERNEL);
> + if (!n_group_desc) {
> + ext4_warning(sb, "not enough memory for %lu groups", n_gdb_count);
> + return -ENOMEM;
> + }
> +
> + memcpy(n_group_desc, o_group_desc,
> + o_gdb_count * sizeof(struct buffer_head *));
> +
> + memset(n_group_desc + o_gdb_count * sizeof(struct buffer_head *),
> + 0, (n_gdb_count - o_gdb_count) * sizeof(struct buffer_head *));
> +
> + sbi->s_group_desc = n_group_desc;
We used to change sbi->s_group_desc in another place and in another context.
What protects this structure member from concurrent users? I don't
see anything obviously wrong, but the place has changed, and we should
be sure that the protection still holds.
> +
> + /*
> + * Do not update sbi->s_gdb_count here as it can be used later
> + * in reserve_backup_gdb(). The field will be updated later in
> + * add_new_gdb() / add_new_gdb_meta_bg().
> + */
> +
> + kvfree(o_group_desc);
> +
> + return 0;
> +}
> +
> +/*
> * Called when we need to bring a reserved group descriptor table block into
> * use from the resize inode. The primary copy of the new GDT block currently
> * is an indirect block (under the double indirect block in the resize inode).
> @@ -757,7 +805,6 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
> struct ext4_super_block *es = EXT4_SB(sb)->s_es;
> unsigned long gdb_num = group / EXT4_DESC_PER_BLOCK(sb);
> ext4_fsblk_t gdblock = EXT4_SB(sb)->s_sbh->b_blocknr + 1 + gdb_num;
> - struct buffer_head **o_group_desc, **n_group_desc;
> struct buffer_head *dind;
> struct buffer_head *gdb_bh;
> int gdbackups;
> @@ -815,16 +862,6 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
> if (unlikely(err))
> goto exit_dind;
>
> - n_group_desc = ext4_kvmalloc((gdb_num + 1) *
> - sizeof(struct buffer_head *),
> - GFP_NOFS);
> - if (!n_group_desc) {
> - err = -ENOMEM;
> - ext4_warning(sb, "not enough memory for %lu groups",
> - gdb_num + 1);
> - goto exit_inode;
> - }
> -
> /*
> * Finally, we have all of the possible failures behind us...
> *
> @@ -850,13 +887,14 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
> }
> brelse(dind);
>
> - o_group_desc = EXT4_SB(sb)->s_group_desc;
> - memcpy(n_group_desc, o_group_desc,
> - EXT4_SB(sb)->s_gdb_count * sizeof(struct buffer_head *));
> - n_group_desc[gdb_num] = gdb_bh;
> - EXT4_SB(sb)->s_group_desc = n_group_desc;
> + /*
> + * The s_group_desc array was reallocated at the beginning of the
> + * resize process but the s_gdb_count field had not been updated
> + * then because we use its old value in the reserve_backup_gdb().
> + */
> + BUG_ON(gdb_num != EXT4_SB(sb)->s_gdb_count);
> + EXT4_SB(sb)->s_group_desc[gdb_num] = gdb_bh;
> EXT4_SB(sb)->s_gdb_count++;
> - ext4_kvfree(o_group_desc);
>
> le16_add_cpu(&es->s_reserved_gdt_blocks, -1);
> err = ext4_handle_dirty_super(handle, sb);
> @@ -866,7 +904,6 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
> return err;
>
> exit_inode:
> - ext4_kvfree(n_group_desc);
> brelse(iloc.bh);
> exit_dind:
> brelse(dind);
> @@ -884,7 +921,6 @@ static int add_new_gdb_meta_bg(struct super_block *sb,
> handle_t *handle, ext4_group_t group) {
> ext4_fsblk_t gdblock;
> struct buffer_head *gdb_bh;
> - struct buffer_head **o_group_desc, **n_group_desc;
> unsigned long gdb_num = group / EXT4_DESC_PER_BLOCK(sb);
> int err;
>
> @@ -893,23 +929,16 @@ static int add_new_gdb_meta_bg(struct super_block *sb,
> gdb_bh = sb_bread(sb, gdblock);
> if (!gdb_bh)
> return -EIO;
> - n_group_desc = ext4_kvmalloc((gdb_num + 1) *
> - sizeof(struct buffer_head *),
> - GFP_NOFS);
> - if (!n_group_desc) {
> - err = -ENOMEM;
> - ext4_warning(sb, "not enough memory for %lu groups",
> - gdb_num + 1);
> - return err;
> - }
>
> - o_group_desc = EXT4_SB(sb)->s_group_desc;
> - memcpy(n_group_desc, o_group_desc,
> - EXT4_SB(sb)->s_gdb_count * sizeof(struct buffer_head *));
> - n_group_desc[gdb_num] = gdb_bh;
> - EXT4_SB(sb)->s_group_desc = n_group_desc;
> + /*
> + * The s_group_desc array was reallocated at the beginning of the
> + * resize process but the s_gdb_count field had not been updated
> + * then because we use its old value in the reserve_backup_gdb().
> + */
> + BUG_ON(gdb_num != EXT4_SB(sb)->s_gdb_count);
> + EXT4_SB(sb)->s_group_desc[gdb_num] = gdb_bh;
> EXT4_SB(sb)->s_gdb_count++;
> - ext4_kvfree(o_group_desc);
> +
> BUFFER_TRACE(gdb_bh, "get_write_access");
> err = ext4_journal_get_write_access(handle, gdb_bh);
> if (unlikely(err))
> @@ -1623,6 +1652,10 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
> if (err)
> goto out;
>
> + err = ext4_alloc_group_desc_bh_array(sb, input->group + 1);
> + if (err)
> + goto out;
> +
Hm. This patch adds a call to this function in two places. But it looks like
we could try to call it in a single place, say, in ext4_flex_group_add().
Does ext4_flex_group_add() start in NOFS context? I doubt it, since at first
sight it is called in GFP_KERNEL context:
1)ext4_group_add()
ext4_alloc_flex_bg_array()
kvzalloc(GFP_KERNEL)
ext4_mb_alloc_groupinfo()
kvzalloc(GFP_KERNEL)
ext4_flex_group_add()
2)ext4_resize_fs()
ext4_mb_alloc_groupinfo()
kvzalloc(GFP_KERNEL)
alloc_flex_gd()
kmalloc(GFP_NOFS) <--- GFP_NOFS here, but is it really required?
ext4_flex_group_add()
Could you please check whether it's possible to replace GFP_NOFS
with GFP_KERNEL in alloc_flex_gd()? At first sight, I don't see
what could change the context after ext4_mb_alloc_groupinfo(), which
does not require GFP_NOFS.
If we can kill GFP_NOFS in alloc_flex_gd(), we can insert the
ext4_alloc_group_desc_bh_array() call in a single place, somewhere at
the beginning of ext4_flex_group_add().
> err = ext4_alloc_flex_bg_array(sb, input->group + 1);
> if (err)
> goto out;
> @@ -1772,11 +1805,6 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
> } /* ext4_group_extend */
>
>
> -static int num_desc_blocks(struct super_block *sb, ext4_group_t groups)
> -{
> - return (groups + EXT4_DESC_PER_BLOCK(sb) - 1) / EXT4_DESC_PER_BLOCK(sb);
> -}
> -
> /*
> * Release the resize inode and drop the resize_inode feature if there
> * are no more reserved gdt blocks, and then convert the file system
> @@ -1973,9 +2001,13 @@ retry:
> if (ext4_blocks_count(es) == n_blocks_count)
> goto out;
>
> + err = ext4_alloc_group_desc_bh_array(sb, n_group + 1);
> + if (err)
> + goto out;
> +
> err = ext4_alloc_flex_bg_array(sb, n_group + 1);
> if (err)
> - return err;
> + goto out;
>
> err = ext4_mb_alloc_groupinfo(sb, n_group + 1);
> if (err)
> diff --git a/fs/ext4/super.c b/fs/ext4/super.c
> index f644bf0..3fbcf86 100644
> --- a/fs/ext4/super.c
> +++ b/fs/ext4/super.c
> @@ -157,35 +157,6 @@ void ext4_superblock_csum_set(struct super_block *sb)
> es->s_checksum = ext4_superblock_csum(sb, es);
> }
>
> -void *ext4_kvmalloc(size_t size, gfp_t flags)
> -{
> - void *ret;
> -
> - ret = kmalloc(size, flags);
> - if (!ret)
> - ret = __vmalloc(size, flags, PAGE_KERNEL);
> - return ret;
> -}
> -
> -void *ext4_kvzalloc(size_t size, gfp_t flags)
> -{
> - void *ret;
> -
> - ret = kzalloc(size, flags);
> - if (!ret)
> - ret = __vmalloc(size, flags | __GFP_ZERO, PAGE_KERNEL);
> - return ret;
> -}
> -
> -void ext4_kvfree(void *ptr)
> -{
> - if (is_vmalloc_addr(ptr))
> - vfree(ptr);
> - else
> - kfree(ptr);
> -
> -}
> -
> ext4_fsblk_t ext4_block_bitmap(struct super_block *sb,
> struct ext4_group_desc *bg)
> {
> @@ -947,8 +918,8 @@ static void ext4_put_super(struct super_block *sb)
>
> for (i = 0; i < sbi->s_gdb_count; i++)
> brelse(sbi->s_group_desc[i]);
> - ext4_kvfree(sbi->s_group_desc);
> - ext4_kvfree(sbi->s_flex_groups);
> + kvfree(sbi->s_group_desc);
> + kvfree(sbi->s_flex_groups);
> percpu_counter_destroy(&sbi->s_freeclusters_counter);
> percpu_counter_destroy(&sbi->s_freeinodes_counter);
> percpu_counter_destroy(&sbi->s_dirs_counter);
> @@ -2191,7 +2162,7 @@ int ext4_alloc_flex_bg_array(struct super_block *sb, ext4_group_t ngroup)
> memcpy(new_groups, sbi->s_flex_groups,
> (sbi->s_flex_groups_allocated *
> sizeof(struct flex_groups)));
> - ext4_kvfree(sbi->s_flex_groups);
> + kvfree(sbi->s_flex_groups);
I don't see that sbi->s_flex_groups was allocated via ext4_kvmalloc().
So this looks like another functional change (a refactoring), made just
because you want to delete ext4_kvfree().
I'd suggest splitting your patch in two. The first patch will change
the place where EXT4_SB(sb)->s_group_desc is allocated. The second
one will remove the then-unused ext4_kvmalloc() and ext4_kvfree(), and replace
ext4_kvfree(sbi->s_flex_groups) with kvfree(sbi->s_flex_groups).
> }
> sbi->s_flex_groups = new_groups;
> sbi->s_flex_groups_allocated = size / sizeof(struct flex_groups);
> @@ -4674,7 +4645,7 @@ failed_mount7:
> failed_mount6:
> ext4_mb_release(sb);
> if (sbi->s_flex_groups)
> - ext4_kvfree(sbi->s_flex_groups);
> + kvfree(sbi->s_flex_groups);
> percpu_counter_destroy(&sbi->s_freeclusters_counter);
> percpu_counter_destroy(&sbi->s_freeinodes_counter);
> percpu_counter_destroy(&sbi->s_dirs_counter);
> @@ -4706,7 +4677,7 @@ failed_mount3:
> failed_mount2:
> for (i = 0; i < db_count; i++)
> brelse(sbi->s_group_desc[i]);
> - ext4_kvfree(sbi->s_group_desc);
> + kvfree(sbi->s_group_desc);
> failed_mount:
> if (sbi->s_chksum_driver)
> crypto_free_shash(sbi->s_chksum_driver);
Kirill
More information about the Devel
mailing list