[Devel] [PATCH RHEL7 COMMIT] ms/shm: wait for pins to be released when sealing

Konstantin Khorenko khorenko at virtuozzo.com
Thu Oct 15 04:04:20 PDT 2015


The commit is pushed to "branch-rh7-3.10.0-229.7.2.vz7.8.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-229.7.2.vz7.8.6
------>
commit c49f7bf80c6e70a3992c13f6b7f7a60b44c81dce
Author: Andrew Vagin <avagin at openvz.org>
Date:   Thu Oct 15 15:04:19 2015 +0400

    ms/shm: wait for pins to be released when sealing
    
    The patch is required for CRIU.
    
    https://jira.sw.ru/browse/PSBM-39834
    
    From: David Herrmann <dh.herrmann at gmail.com>
    
    ML: 05f65b5c70909ef686f865f0a85406d74d75f70f
    
    If we set SEAL_WRITE on a file, we must make sure there cannot be any
    ongoing write-operations on the file.  For write() calls, we simply lock
    the inode mutex; for mmap() we simply verify there are no writable
    mappings.  However, there might be pages pinned by AIO, Direct-IO and
    similar operations via GUP.  We must make sure those do not write to the
    memfd file after we set SEAL_WRITE.
    
    As there is no way to notify GUP users to drop pages or to wait for them
    to be done, we implement the wait ourselves: when setting SEAL_WRITE, we
    check all pages for their ref-count.  If it's bigger than 1, we know
    there's some user of the page.  We then mark the page and wait for up to
    150ms for those ref-counts to be dropped.  If the ref-counts are not
    dropped in time, we refuse the seal operation.
    
    Signed-off-by: David Herrmann <dh.herrmann at gmail.com>
    Acked-by: Hugh Dickins <hughd at google.com>
    Cc: Michael Kerrisk <mtk.manpages at gmail.com>
    Cc: Ryan Lortie <desrt at desrt.ca>
    Cc: Lennart Poettering <lennart at poettering.net>
    Cc: Daniel Mack <zonque at gmail.com>
    Cc: Andy Lutomirski <luto at amacapital.net>
    Signed-off-by: Andrew Morton <akpm at linux-foundation.org>
    Signed-off-by: Linus Torvalds <torvalds at linux-foundation.org>
    Signed-off-by: Andrew Vagin <avagin at openvz.org>
---
 mm/shmem.c | 110 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 109 insertions(+), 1 deletion(-)

diff --git a/mm/shmem.c b/mm/shmem.c
index bc8e08b..fd563aa 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1903,9 +1903,117 @@ static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence)
 	return offset;
 }
 
+/*
+ * We need a tag: a new tag would expand every radix_tree_node by 8 bytes,
+ * so reuse a tag which we firmly believe is never set or cleared on shmem.
+ */
+#define SHMEM_TAG_PINNED        PAGECACHE_TAG_TOWRITE
+#define LAST_SCAN               4       /* about 150ms max */
+
+static void shmem_tag_pins(struct address_space *mapping)
+{
+	struct radix_tree_iter iter;
+	void **slot;
+	pgoff_t start;
+	struct page *page;
+
+	lru_add_drain();
+	start = 0;
+	rcu_read_lock();
+
+restart:
+	radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
+		page = radix_tree_deref_slot(slot);
+		if (!page || radix_tree_exception(page)) {
+			if (radix_tree_deref_retry(page))
+				goto restart;
+		} else if (page_count(page) - page_mapcount(page) > 1) {
+			spin_lock_irq(&mapping->tree_lock);
+			radix_tree_tag_set(&mapping->page_tree, iter.index,
+					   SHMEM_TAG_PINNED);
+			spin_unlock_irq(&mapping->tree_lock);
+		}
+
+		if (need_resched()) {
+			cond_resched_rcu();
+			start = iter.index + 1;
+			goto restart;
+		}
+	}
+	rcu_read_unlock();
+}
+
+/*
+ * Setting SEAL_WRITE requires us to verify there's no pending writer. However,
+ * via get_user_pages(), drivers might have some pending I/O without any active
+ * user-space mappings (e.g., direct-IO, AIO). Therefore, we look at all pages
+ * and see whether any has an elevated ref-count. If so, we tag those pages and
+ * wait for the extra references to be dropped.
+ * The caller must guarantee that no new user will acquire writable references
+ * to those pages to avoid races.
+ */
 static int shmem_wait_for_pins(struct address_space *mapping)
 {
-	return 0;
+	struct radix_tree_iter iter;
+	void **slot;
+	pgoff_t start;
+	struct page *page;
+	int error, scan;
+
+	shmem_tag_pins(mapping);
+
+	error = 0;
+	for (scan = 0; scan <= LAST_SCAN; scan++) {
+		if (!radix_tree_tagged(&mapping->page_tree, SHMEM_TAG_PINNED))
+			break;
+
+		if (!scan)
+			lru_add_drain_all();
+		else if (schedule_timeout_killable((HZ << scan) / 200))
+			scan = LAST_SCAN;
+
+		start = 0;
+		rcu_read_lock();
+restart:
+		radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter,
+					   start, SHMEM_TAG_PINNED) {
+
+			page = radix_tree_deref_slot(slot);
+			if (radix_tree_exception(page)) {
+				if (radix_tree_deref_retry(page))
+					goto restart;
+
+				page = NULL;
+			}
+
+			if (page &&
+			    page_count(page) - page_mapcount(page) != 1) {
+				if (scan < LAST_SCAN)
+					goto continue_resched;
+
+				/*
+				 * On the last scan, we clean up all those tags
+				 * we inserted; but make a note that we still
+				 * found pages pinned.
+				 */
+				error = -EBUSY;
+			}
+
+			spin_lock_irq(&mapping->tree_lock);
+			radix_tree_tag_clear(&mapping->page_tree,
+					     iter.index, SHMEM_TAG_PINNED);
+			spin_unlock_irq(&mapping->tree_lock);
+continue_resched:
+			if (need_resched()) {
+				cond_resched_rcu();
+				start = iter.index + 1;
+				goto restart;
+			}
+		}
+		rcu_read_unlock();
+	}
+
+	return error;
 }
 
 #define F_ALL_SEALS (F_SEAL_SEAL | \



More information about the Devel mailing list