[Devel] [PATCH 2/3] ext4: fix lost truncate due to race with writeback

Cyrill Gorcunov gorcunov at virtuozzo.com
Tue Jun 30 04:07:36 PDT 2015


From: Jan Kara <jack at suse.cz>

The following race can lead to a loss of i_disksize update from truncate
thus resulting in a wrong inode size if the inode size isn't updated
again before inode is reclaimed:

ext4_setattr()				mpage_map_and_submit_extent()
  EXT4_I(inode)->i_disksize = attr->ia_size;
  ...					  ...
					  disksize = ((loff_t)mpd->first_page) << PAGE_CACHE_SHIFT
					  /* False because i_size isn't
					   * updated yet */
					  if (disksize > i_size_read(inode))
					  /* True, because i_disksize is
					   * already truncated */
					  if (disksize > EXT4_I(inode)->i_disksize)
					    /* Overwrite i_disksize
					     * update from truncate */
					    ext4_update_i_disksize()
  i_size_write(inode, attr->ia_size);

For other places updating i_disksize such race cannot happen because
i_mutex prevents these races. Writeback is the only place where we do
not hold i_mutex and we cannot grab it there because of lock ordering.

We fix the race by doing both i_disksize and i_size update in truncate
atomically under i_data_sem and in mpage_map_and_submit_extent() we move
the check against i_size under i_data_sem as well.

gorcunov@:
 - ML 90e775b71ac4e685898c7995756fe58c135adaa6
 - https://jira.sw.ru/browse/PSBM-34383

Signed-off-by: Jan Kara <jack at suse.cz>
Signed-off-by: "Theodore Ts'o" <tytso at mit.edu>
Signed-off-by: Cyrill Gorcunov <gorcunov at virtuozzo.com>
---
 fs/ext4/ext4.h  |   24 ++++++++++++++++++++----
 fs/ext4/inode.c |   15 ++++++++++++---
 2 files changed, 32 insertions(+), 7 deletions(-)

Index: linux-pcs7.git/fs/ext4/ext4.h
===================================================================
--- linux-pcs7.git.orig/fs/ext4/ext4.h
+++ linux-pcs7.git/fs/ext4/ext4.h
@@ -2400,16 +2400,32 @@ do {								\
 #define EXT4_FREECLUSTERS_WATERMARK 0
 #endif
 
+/* Update i_disksize. Requires i_mutex to avoid races with truncate */
 static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize)
 {
-	/*
-	 * XXX: replace with spinlock if seen contended -bzzz
-	 */
+	WARN_ON_ONCE(S_ISREG(inode->i_mode) &&
+		     !mutex_is_locked(&inode->i_mutex));
 	down_write(&EXT4_I(inode)->i_data_sem);
 	if (newsize > EXT4_I(inode)->i_disksize)
 		EXT4_I(inode)->i_disksize = newsize;
 	up_write(&EXT4_I(inode)->i_data_sem);
-	return ;
+}
+
+/*
+ * Update i_disksize after writeback has been started. Races with truncate
+ * are avoided by checking i_size under i_data_sem.
+ */
+static inline void ext4_wb_update_i_disksize(struct inode *inode, loff_t newsize)
+{
+	loff_t i_size;
+
+	down_write(&EXT4_I(inode)->i_data_sem);
+	i_size = i_size_read(inode);
+	if (newsize > i_size)
+		newsize = i_size;
+	if (newsize > EXT4_I(inode)->i_disksize)
+		EXT4_I(inode)->i_disksize = newsize;
+	up_write(&EXT4_I(inode)->i_data_sem);
 }
 
 struct ext4_group_info {
Index: linux-pcs7.git/fs/ext4/inode.c
===================================================================
--- linux-pcs7.git.orig/fs/ext4/inode.c
+++ linux-pcs7.git/fs/ext4/inode.c
@@ -1788,7 +1788,7 @@ static void mpage_da_map_and_submit(stru
 	if (disksize > i_size_read(mpd->inode))
 		disksize = i_size_read(mpd->inode);
 	if (disksize > EXT4_I(mpd->inode)->i_disksize) {
-		ext4_update_i_disksize(mpd->inode, disksize);
+		ext4_wb_update_i_disksize(mpd->inode, disksize);
 		err = ext4_mark_inode_dirty(handle, mpd->inode);
 		if (err)
 			ext4_error(mpd->inode->i_sb,
@@ -4831,18 +4831,27 @@ int ext4_setattr(struct dentry *dentry,
 				error = ext4_orphan_add(handle, inode);
 				orphan = 1;
 			}
+			down_write(&EXT4_I(inode)->i_data_sem);
 			EXT4_I(inode)->i_disksize = attr->ia_size;
 			rc = ext4_mark_inode_dirty(handle, inode);
 			if (!error)
 				error = rc;
+			/*
+			 * We have to update i_size under i_data_sem together
+			 * with i_disksize to avoid races with writeback code
+			 * running ext4_wb_update_i_disksize().
+			 */
+			if (!error)
+				i_size_write(inode, attr->ia_size);
+			up_write(&EXT4_I(inode)->i_data_sem);
 			ext4_journal_stop(handle);
 			if (error) {
 				ext4_orphan_del(NULL, inode);
 				goto err_out;
 			}
-		}
+		} else
+			i_size_write(inode, attr->ia_size);
 
-		i_size_write(inode, attr->ia_size);
 		/*
 		 * Blocks are going to be removed from the inode. Wait
 		 * for dio in flight.  Temporarily disable



More information about the Devel mailing list