[Devel] [PATCH 2/3] ext4: fix lost truncate due to race with writeback

Dmitry Monakhov dmonakhov at openvz.org
Wed Jul 1 07:46:14 PDT 2015


Cyrill Gorcunov <gorcunov at virtuozzo.com> writes:

> From: Jan Kara <jack at suse.cz>
>
> The following race can lead to the loss of an i_disksize update from
> truncate, thus resulting in a wrong inode size if the inode size isn't
> updated again before the inode is reclaimed:
>
> ext4_setattr()				mpage_map_and_submit_extent()
>   EXT4_I(inode)->i_disksize = attr->ia_size;
>   ...					  ...
> 					  disksize = ((loff_t)mpd->first_page) << PAGE_CACHE_SHIFT
> 					  /* False because i_size isn't
> 					   * updated yet */
> 					  if (disksize > i_size_read(inode))
> 					  /* True, because i_disksize is
> 					   * already truncated */
> 					  if (disksize > EXT4_I(inode)->i_disksize)
> 					    /* Overwrite i_disksize
> 					     * update from truncate */
> 					    ext4_update_i_disksize()
>   i_size_write(inode, attr->ia_size);
>
> For other places updating i_disksize such a race cannot happen because
> i_mutex prevents it. Writeback is the only place where we do
> not hold i_mutex, and we cannot grab it there because of lock ordering.
>
> We fix the race by doing both the i_disksize and i_size updates in truncate
> atomically under i_data_sem, and in mpage_map_and_submit_extent() we move
> the check against i_size under i_data_sem as well.
>
> gorcunov@:
>  - ML 90e775b71ac4e685898c7995756fe58c135adaa6
>  - https://jira.sw.ru/browse/PSBM-34383
>
> Signed-off-by: Jan Kara <jack at suse.cz>
> Signed-off-by: "Theodore Ts'o" <tytso at mit.edu>
> Signed-off-by: Cyrill Gorcunov <gorcunov at virtuozzo.com>
ACK
> ---
>  fs/ext4/ext4.h  |   24 ++++++++++++++++++++----
>  fs/ext4/inode.c |   15 ++++++++++++---
>  2 files changed, 32 insertions(+), 7 deletions(-)
>
> Index: linux-pcs7.git/fs/ext4/ext4.h
> ===================================================================
> --- linux-pcs7.git.orig/fs/ext4/ext4.h
> +++ linux-pcs7.git/fs/ext4/ext4.h
> @@ -2400,16 +2400,32 @@ do {								\
>  #define EXT4_FREECLUSTERS_WATERMARK 0
>  #endif
>  
> +/* Update i_disksize. Requires i_mutex to avoid races with truncate */
>  static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize)
>  {
> -	/*
> -	 * XXX: replace with spinlock if seen contended -bzzz
> -	 */
> +	WARN_ON_ONCE(S_ISREG(inode->i_mode) &&
> +		     !mutex_is_locked(&inode->i_mutex));
>  	down_write(&EXT4_I(inode)->i_data_sem);
>  	if (newsize > EXT4_I(inode)->i_disksize)
>  		EXT4_I(inode)->i_disksize = newsize;
>  	up_write(&EXT4_I(inode)->i_data_sem);
> -	return ;
> +}
> +
> +/*
> + * Update i_disksize after writeback has been started. Races with truncate
> + * are avoided by checking i_size under i_data_sem.
> + */
> +static inline void ext4_wb_update_i_disksize(struct inode *inode, loff_t newsize)
> +{
> +	loff_t i_size;
> +
> +	down_write(&EXT4_I(inode)->i_data_sem);
> +	i_size = i_size_read(inode);
> +	if (newsize > i_size)
> +		newsize = i_size;
> +	if (newsize > EXT4_I(inode)->i_disksize)
> +		EXT4_I(inode)->i_disksize = newsize;
> +	up_write(&EXT4_I(inode)->i_data_sem);
>  }
>  
>  struct ext4_group_info {
> Index: linux-pcs7.git/fs/ext4/inode.c
> ===================================================================
> --- linux-pcs7.git.orig/fs/ext4/inode.c
> +++ linux-pcs7.git/fs/ext4/inode.c
> @@ -1788,7 +1788,7 @@ static void mpage_da_map_and_submit(stru
>  	if (disksize > i_size_read(mpd->inode))
>  		disksize = i_size_read(mpd->inode);
>  	if (disksize > EXT4_I(mpd->inode)->i_disksize) {
> -		ext4_update_i_disksize(mpd->inode, disksize);
> +		ext4_wb_update_i_disksize(mpd->inode, disksize);
>  		err = ext4_mark_inode_dirty(handle, mpd->inode);
>  		if (err)
>  			ext4_error(mpd->inode->i_sb,
> @@ -4831,18 +4831,27 @@ int ext4_setattr(struct dentry *dentry,
>  				error = ext4_orphan_add(handle, inode);
>  				orphan = 1;
>  			}
> +			down_write(&EXT4_I(inode)->i_data_sem);
>  			EXT4_I(inode)->i_disksize = attr->ia_size;
>  			rc = ext4_mark_inode_dirty(handle, inode);
>  			if (!error)
>  				error = rc;
> +			/*
> +			 * We have to update i_size under i_data_sem together
> +			 * with i_disksize to avoid races with writeback code
> +			 * running ext4_wb_update_i_disksize().
> +			 */
> +			if (!error)
> +				i_size_write(inode, attr->ia_size);
> +			up_write(&EXT4_I(inode)->i_data_sem);
>  			ext4_journal_stop(handle);
>  			if (error) {
>  				ext4_orphan_del(NULL, inode);
>  				goto err_out;
>  			}
> -		}
> +		} else
> +			i_size_write(inode, attr->ia_size);
>  
> -		i_size_write(inode, attr->ia_size);
>  		/*
>  		 * Blocks are going to be removed from the inode. Wait
>  		 * for dio in flight.  Temporarily disable
-------------- next part --------------
A non-text attachment was scrubbed...
Name: signature.asc
Type: application/pgp-signature
Size: 472 bytes
Desc: not available
URL: <http://lists.openvz.org/pipermail/devel/attachments/20150701/9aa2afc1/attachment-0001.sig>


More information about the Devel mailing list