[Devel] [PATCH RHEL9 COMMIT] fuse: Add a new fuse inode state FUSE_I_INVAL_FILES

Konstantin Khorenko khorenko at virtuozzo.com
Tue May 17 17:48:03 MSK 2022


The commit is pushed to "branch-rh9-5.14.0-42.vz9.15.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh9-5.14.0-42.vz9.15.2
------>
commit a907ab7c73091115b1d63b96d1946faa359473a7
Author: Liu Kui <Kui.Liu at acronis.com>
Date:   Thu Mar 3 13:42:46 2022 +0800

    fuse: Add a new fuse inode state FUSE_I_INVAL_FILES
    
    There are many places where it is critical to know if invalidate files
    operation is in progress, otherwise deadlock would happen. Currently
    this can be done by checking  the FUSE_S_FAIL_IMMEDIATELY flag bit from
    fuse_file state. However this is inconvenient in case where a fuse_file
    is not directly available, for example in fuse_writepages(), where it
    has to be done by indirectly checking the fuse_files in fi->rw_files via
    piece of code like this:
    
           spin_lock(&fi->lock);
           if (!list_empty(&fi->rw_files)) {
                   struct fuse_file *ff = list_entry(fi->rw_files.next,
                                                     struct fuse_file, rw_entry);
                   if (!test_bit(FUSE_S_FAIL_IMMEDIATELY, &ff->ff_state))
                           blocked = true;
           }
           spin_unlock(&fi->lock);
    
    Apparently above piece of code is too heavy for just checking whether we
    are in a particular state. However it can be easily solved by add a new
    state flag bit to fuse inode state. With this newly added flag bit, we
    can greatly simplify code in following scenario:
    
    1. Replace above mentioned code with following just a single line of code
       in places where only the 'inode' parameters is directly available.
    
            test_bit(FUSE_I_INVAL_FILES, &get_fuse_inode(inode)->state)
    
    2. We can make fuse_range_is_writeback() return false if invalidate files
       operation is in progress. This makes sense in that page cache will be
       invalidated and all pending page write will be discarded anyway once
       invalidate files operation starts. This is critical for avoidng deadlock
       in places where fuse_wait_on_page_writeback() is needed, and we don't
       have to use the variant fuse_wait_on_page_writeback_or_invalidate(),
       because fuse_wait_on_page_writeback() can now be woken up by
       fuse_notify_inval_files().
    
       Another benefit is we can guarante fuse_launder_page() always returns
       success in case invalidate files is in progress.
    
    3. The fuse_dummy_writpage() case can be merged into fuse_writepages_fill()
       by simply adding the following check at the begining of fuse_writepages_fill()
    
           if (!wpa && test_bit(FUSE_I_INVAL_FILES, &fi->state)) {
                  unlock_page(page);
                  return 0;
           }
    
       This makes sure that fuse_writepages() returns immediately when it is called via
       fuse_notify_inval_files(), which is essentially what fuse_dummy_writpage() does.
    
       Additionaly,it also can make fuse_writepages() returns faster if it was initiated
       by others and still in the middle of write_cache_pages() when invalidate files
       operation started.
    
    Signed-off-by: Liu Kui <Kui.Liu at acronis.com>
    Feature: vStorage
---
 fs/fuse/file.c   | 111 +++++++++++++++++++------------------------------------
 fs/fuse/fuse_i.h |  12 ++++++
 fs/fuse/inode.c  |   5 +++
 3 files changed, 55 insertions(+), 73 deletions(-)

diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 1cce68a2feb5..e666c660ef1c 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -586,6 +586,16 @@ static bool fuse_range_is_writeback(struct inode *inode, pgoff_t idx_from,
 	found = fuse_find_writeback(fi, idx_from, idx_to);
 	spin_unlock(&fi->lock);
 
+	/*
+	* Return false if invalidate files is in progress, which makes
+	* sense in that page cache will be invalidated and all pending
+	* write requests will be discarded anyway.
+	*
+	* This can make fuse_wait_on_page_writeback() return when
+	* FUSE_NOTIFY_INVAL_FILES is in progress.
+	*/
+	found = found && !test_bit(FUSE_I_INVAL_FILES, &fi->state);
+
 	return found;
 }
 
@@ -607,20 +617,6 @@ static void fuse_wait_on_page_writeback(struct inode *inode, pgoff_t index)
 	wait_event(fi->page_waitq, !fuse_page_is_writeback(inode, index));
 }
 
-/*
- * Can be woken up by FUSE_NOTIFY_INVAL_FILES
- */
-static void fuse_wait_on_page_writeback_or_invalidate(struct inode *inode,
-						      struct file *file,
-						      pgoff_t index)
-{
-	struct fuse_inode *fi = get_fuse_inode(inode);
-	struct fuse_file *ff = file->private_data;
-
-	wait_event(fi->page_waitq, !fuse_page_is_writeback(inode, index) ||
-		   test_bit(FUSE_S_FAIL_IMMEDIATELY, &ff->ff_state));
-}
-
 /*
  * Wait for all pending writepages on the inode to finish.
  *
@@ -1079,10 +1075,11 @@ static int fuse_do_readpage(struct file *file, struct page *page,
 	 * Page writeback can extend beyond the lifetime of the
 	 * page-cache page, so make sure we read a properly synced
 	 * page.
-	 *
-	 * But we can't wait if FUSE_NOTIFY_INVAL_FILES is in progress.
 	 */
-	fuse_wait_on_page_writeback_or_invalidate(inode, file, page->index);
+	fuse_wait_on_page_writeback(inode, page->index);
+
+	if (fuse_file_fail_immediately(file))
+		return -EIO;
 
 	attr_ver = fuse_get_attr_version(fm->fc);
 
@@ -1246,8 +1243,7 @@ static void fuse_readahead(struct readahead_control *rac)
 		ap = &ia->ap;
 		nr_pages = __readahead_batch(rac, ap->pages, nr_pages);
 		for (i = 0; i < nr_pages; i++) {
-			/* we can't wait if FUSE_NOTIFY_INVAL_FILES is in progress */
-			fuse_wait_on_page_writeback_or_invalidate(inode, rac->file,
+			fuse_wait_on_page_writeback(inode,
 						    readahead_index(rac) + i);
 			ap->descs[i].length = PAGE_SIZE;
 		}
@@ -2404,21 +2400,8 @@ static inline bool fuse_blocked_for_wb(struct inode *inode)
 {
 	struct fuse_conn *fc = get_fuse_conn(inode);
 	struct fuse_inode *fi = get_fuse_inode(inode);
-	bool blocked = false;
-
-	if (!fc->blocked)
-		return false;
 
-	spin_lock(&fi->lock);
-	if (!list_empty(&fi->rw_files)) {
-		struct fuse_file *ff = list_entry(fi->rw_files.next,
-						  struct fuse_file, rw_entry);
-		if (!test_bit(FUSE_S_FAIL_IMMEDIATELY, &ff->ff_state))
-			blocked = true;
-	}
-	spin_unlock(&fi->lock);
-
-	return blocked;
+	return fc->blocked && !test_bit(FUSE_I_INVAL_FILES, &fi->state);
 }
 
 void fuse_release_ff(struct inode *inode, struct fuse_file *ff);
@@ -2438,6 +2421,15 @@ static int fuse_writepages_fill(struct page *page,
 
 	BUG_ON(wpa && !data->ff);
 
+	/* More than optimization: writeback pages to /dev/null; fused would
+	 * drop our FUSE_WRITE requests anyway, but it will be blocked while
+	 * sending NOTIFY_INVAL_FILES until we return!
+	 */
+	if (!wpa && test_bit(FUSE_I_INVAL_FILES, &fi->state)) {
+		unlock_page(page);
+		return 0;
+	}
+
 	if (wpa && fuse_writepage_need_send(fc, page, ap, data)) {
 		fuse_writepages_send(data);
 		fuse_release_ff(inode, data->ff);
@@ -2516,7 +2508,6 @@ static int fuse_writepages_fill(struct page *page,
 		end_page_writeback(page);
 		fuse_release_ff(inode, data->ff);
 		data->ff = NULL;
-		goto out_unlock;
 	}
 out_unlock:
 	unlock_page(page);
@@ -2527,14 +2518,6 @@ static int fuse_writepages_fill(struct page *page,
 	return err;
 }
 
-static int fuse_dummy_writepage(struct page *page,
-				struct writeback_control *wbc,
-				void *data)
-{
-	unlock_page(page);
-	return 0;
-}
-
 static int fuse_writepages(struct address_space *mapping,
 			   struct writeback_control *wbc)
 {
@@ -2555,29 +2538,9 @@ static int fuse_writepages(struct address_space *mapping,
 	if (wbc->sync_mode != WB_SYNC_NONE)
 		wait_event(fc->blocked_waitq, !fuse_blocked_for_wb(inode));
 
-	/* More than optimization: writeback pages to /dev/null; fused would
-	 * drop our FUSE_WRITE requests anyway, but it will be blocked while
-	 * sending NOTIFY_INVAL_FILES until we return!
-	 *
-	 * NB: We can't wait till fuse_send_writepages() because
-	 * fuse_writepages_fill() would possibly deadlock on
-	 * fuse_page_is_writeback().
-	 */
- 	data.ff = __fuse_write_file_get(fc, get_fuse_inode(inode));
-	if (data.ff && test_bit(FUSE_S_FAIL_IMMEDIATELY, &data.ff->ff_state)) {
-		err = write_cache_pages(mapping, wbc, fuse_dummy_writepage,
-					mapping);
-		fuse_release_ff(inode, data.ff);
-		data.ff = NULL;
-		goto out;
-	}
-	if (data.ff) {
-		fuse_release_ff(inode, data.ff);
-		data.ff = NULL;
-	}
-
 	data.inode = inode;
 	data.wpa = NULL;
+	data.ff = NULL;
 
 	err = -ENOMEM;
 	data.orig_pages = kcalloc(fc->max_pages,
@@ -2601,12 +2564,7 @@ static int fuse_writepages(struct address_space *mapping,
 	return err;
 }
 
-static inline bool fuse_file_fail_immediately(struct file *file)
-{
-	struct fuse_file *ff = file->private_data;
 
-	return test_bit(FUSE_S_FAIL_IMMEDIATELY, &ff->ff_state);
-}
 
 /*
  * It's worthy to make sure that space is reserved on disk for the write,
@@ -2629,13 +2587,13 @@ static int fuse_write_begin(struct file *file, struct address_space *mapping,
 	if (!page)
 		goto error;
 
+	fuse_wait_on_page_writeback(mapping->host, page->index);
+
 	if (fuse_file_fail_immediately(file)) {
 		err = -EIO;
 		goto cleanup;
 	}
 
-	fuse_wait_on_page_writeback(mapping->host, page->index);
-
 	if (PageUptodate(page) || len == PAGE_SIZE)
 		goto success;
 	/*
@@ -2701,6 +2659,11 @@ static int fuse_launder_page(struct page *page)
 
 		/* Serialize with pending writeback for the same page */
 		fuse_wait_on_page_writeback(inode, page->index);
+
+		/* Return success if FUSE_NOTIFY_INVAL_FILES is in progress */
+		if (test_bit(FUSE_I_INVAL_FILES, &get_fuse_inode(inode)->state))
+			return 0;
+
 		err = fuse_writepage_locked(page);
 		if (!err)
 			fuse_wait_on_page_writeback(inode, page->index);
@@ -2748,10 +2711,12 @@ static vm_fault_t fuse_page_mkwrite(struct vm_fault *vmf)
 		return VM_FAULT_NOPAGE;
 	}
 
-	if (fuse_file_fail_immediately(vmf->vma->vm_file))
-		return -EIO;
-
 	fuse_wait_on_page_writeback(inode, page->index);
+	if (fuse_file_fail_immediately(vmf->vma->vm_file)) {
+		unlock_page(page);
+		return VM_FAULT_SIGSEGV;
+	}
+
 	return VM_FAULT_LOCKED;
 }
 
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index aa74b63b2392..3221892765b5 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -198,6 +198,8 @@ enum {
 
 	/** kdirect open try has already made */
 	FUSE_I_KIO_OPEN_TRY_MADE,
+	/** Operation invalidating files is in progress */
+	FUSE_I_INVAL_FILES,
 };
 
 struct fuse_conn;
@@ -1357,6 +1359,16 @@ static inline void fuse_dio_wait(struct fuse_inode *fi)
 	fuse_write_dio_wait(fi);
 }
 
+static inline bool __fuse_file_fail_immediately(struct fuse_file *ff)
+{
+	return test_bit(FUSE_S_FAIL_IMMEDIATELY, &ff->ff_state);
+}
+
+static inline bool fuse_file_fail_immediately(struct file *file)
+{
+	return  __fuse_file_fail_immediately(file->private_data);
+}
+
 /**
  * Scan all fuse_mounts belonging to fc to find the first where
  * ilookup5() returns a result.  Return that result and the
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index cdae748e4720..1ad28906bd25 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -511,12 +511,16 @@ int fuse_invalidate_files(struct fuse_conn *fc, u64 nodeid)
 		return -ENOENT;
 
 	fi = get_fuse_inode(inode);
+
 	spin_lock(&fi->lock);
 	list_for_each_entry(ff, &fi->rw_files, rw_entry) {
 		set_bit(FUSE_S_FAIL_IMMEDIATELY, &ff->ff_state);
 	}
 	spin_unlock(&fi->lock);
 
+	/* Mark that invalidate files is in progress */
+	set_bit(FUSE_I_INVAL_FILES, &fi->state);
+
 	/* let them see FUSE_S_FAIL_IMMEDIATELY */
 	wake_up_all(&fc->blocked_waitq);
 
@@ -552,6 +556,7 @@ int fuse_invalidate_files(struct fuse_conn *fc, u64 nodeid)
 	if (!err)
 		fuse_invalidate_attr(inode);
 
+	clear_bit(FUSE_I_INVAL_FILES, &fi->state);
 	iput(inode);
 	return err;
 }


More information about the Devel mailing list