[CRIU] [PATCH] arm: Workaround shmat() page coloring alignment

Dmitry Safonov dsafonov at virtuozzo.com
Wed Apr 12 10:29:34 PDT 2017


Please see the in-code comment about the issue.
There was an attempt to fix this in the kernel:
  http://www.spinics.net/lists/arm-kernel/msg258870.html

I'll send a kernel patch with a proper justification.
Still, it's worth working around this on older kernels.
mremap() after shmat() makes it possible to C/R shmem between ARMv7 CPUs.
Without it, C/R can fail even on the same platform.
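
As an aside, here is a minimal, hypothetical user-space repro of the kernel
behaviour (not part of this patch); it assumes an ARMv7 box where
shmat(shmid, NULL, ...) may hand out an address that is page-aligned but not
SHMLBA-aligned, so a second attach at exactly the same address fails:

	/* repro.c -- sketch only, most error handling omitted */
	#include <stdio.h>
	#include <sys/ipc.h>
	#include <sys/shm.h>

	int main(void)
	{
		int shmid = shmget(IPC_PRIVATE, 2 * 4096, IPC_CREAT | 0600);
		/* The kernel picks the address; on a !VIPT-aliasing CPU it may
		 * be only page-aligned, not SHMLBA-aligned. */
		void *addr = shmat(shmid, NULL, 0);

		printf("attached at %p\n", addr);
		shmdt(addr);

		/* Re-attach at the very same address, as restore has to do;
		 * this fails with EINVAL if addr is not SHMLBA-aligned. */
		if (shmat(shmid, addr, 0) == (void *)-1)
			perror("shmat at old address");
		return 0;
	}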

It is possible that this workaround turns a *restore failure* into
*corruption* of shared memory after restore.
Still, I think it's worth applying, for the following reasons:
1. No ARMv7 CPU has a VIPT aliasing data cache.
   ARMv6 CPUs may have either, but they are not popular because of UMP.
2. The SysV IPC shmem has to be writable and not SHMLBA-aligned
   (if it's read-only, then it's OK).
3. For data corruption we would need to migrate from a non-VIPT-cached CPU
   to a VIPT-aliasing one.
4. As this is shmem migration between platforms, quite likely we have
   a copy of it (in the case of a whole IPC namespace dump, in the images).
5. Before this patch, C/R even on the same CPU may fail (quite likely:
   more than 50% of zdtm/transition/ipc runs failed while I was testing this).

So, I think it's very unlikely that someone uses CRIU to migrate unaligned
writable shmem from ARMv7 to ARMv6, but if they do, there is a BIG WARNING
on restore.

And after all: we've restored the application as it was at checkpoint time;
we can't reliably tell whether the shmem needs to be aligned on the target
platform, but we do warn about it during restore.

At first I wanted to check the CPU ID registers for VIPT caching, but they live
in the cp15 coprocessor, which means they are unavailable from userspace. The
only reliable way to tell is to check the first couple of boot lines in dmesg:
[    0.000000] CPU: PIPT / VIPT nonaliasing data cache, VIPT aliasing instruction cache
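
If one really wanted to automate that check from userspace, a rough sketch
(again, an assumption of mine, not something this patch adds) would be to scan
the kernel log for the data-cache description, e.g.:

	/* Hypothetical helper: 1 if the boot log reports a VIPT aliasing
	 * data cache, 0 if not, -1 if the log could not be read. */
	#include <stdio.h>
	#include <string.h>

	static int data_cache_is_vipt_aliasing(void)
	{
		FILE *f = popen("dmesg", "r");
		char line[256];
		int ret = 0;

		if (!f)
			return -1;
		while (fgets(line, sizeof(line), f)) {
			char *dc = strstr(line, " data cache");

			if (!dc)
				continue;
			*dc = '\0';	/* keep only the data-cache half of the line */
			if (strstr(line, "VIPT aliasing"))
				ret = 1;
			break;
		}
		pclose(f);
		return ret;
	}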

Signed-off-by: Dmitry Safonov <dsafonov at virtuozzo.com>
---
 criu/arch/arm/include/asm/restorer.h |  3 ++
 criu/arch/arm/restorer.c             | 59 ++++++++++++++++++++++++++++++++++++
 criu/pie/restorer.c                  | 12 +++++++-
 3 files changed, 73 insertions(+), 1 deletion(-)

diff --git a/criu/arch/arm/include/asm/restorer.h b/criu/arch/arm/include/asm/restorer.h
index 7e4b09c893b6..f7b963baec13 100644
--- a/criu/arch/arm/include/asm/restorer.h
+++ b/criu/arch/arm/include/asm/restorer.h
@@ -57,6 +57,9 @@
 
 int restore_gpregs(struct rt_sigframe *f, UserArmRegsEntry *r);
 int restore_nonsigframe_gpregs(UserArmRegsEntry *r);
+#define ARCH_HAS_SHMAT_HOOK
+unsigned long arch_shmat(int shmid, void *shmaddr,
+			int shmflg, unsigned long size);
 
 static inline void restore_tls(tls_t *ptls) {
 	asm (
diff --git a/criu/arch/arm/restorer.c b/criu/arch/arm/restorer.c
index 21234b95d33e..588c1c074fc5 100644
--- a/criu/arch/arm/restorer.c
+++ b/criu/arch/arm/restorer.c
@@ -7,8 +7,67 @@
 #include "log.h"
 #include <compel/asm/fpu.h>
 #include "cpu.h"
+#include "page.h"
+#include "common/err.h"
 
 int restore_nonsigframe_gpregs(UserArmRegsEntry *r)
 {
 	return 0;
 }
+
+/*
+ * On ARMv6 CPUs with VIPT caches there are aliasing issues:
+ * if two different cache line indexes correspond to the same physical
+ * address, then changes made to one of the aliases might be lost or they
+ * can overwrite each other. To overcome the aliasing issues, page coloring
+ * with a 4-page alignment for shared mappings (SHMLBA) was introduced in the kernel.
+ * This results in a unique physical address behind any tag in the cache
+ * (because the two upper bits corresponding to the page address are unused in tags).
+ *
+ * The problem here is in the shmat() syscall:
+ * 1. if shmaddr is NULL then do_shmat() uses arch_get_unmapped_area()
+ *    to allocate the shared mapping, which checks whether the CPU cache
+ *    is VIPT and only then uses SHMLBA alignment.
+ * 2. if shmaddr is specified then do_shmat() checks that the address is
+ *    SHMLBA-aligned regardless of CPU cache aliasing.
+ *
+ * All of the above means that on a non-VIPT CPU (like any ARMv7) we can get
+ * a non-SHMLBA- but page-aligned address with shmat(shmid, NULL, shmflg),
+ * but we can't restore it with shmat(shmid, shmaddr, shmflg).
+ * As a result, we can dump e.g. an application with shmem aligned
+ * on 2 pages, but can't restore it on the same ARMv7 CPU.
+ *
+ * To work around this kernel behaviour, use mremap() on the shmem mapping
+ * allocated with shmat(shmid, NULL, shmflg).
+ */
+#define SHMLBA (4UL * PAGE_SIZE)
+unsigned long arch_shmat(int shmid, void *shmaddr,
+			int shmflg, unsigned long size)
+{
+	unsigned long smap;
+
+	/* SHMLBA-aligned, direct call shmat() */
+	if (!((unsigned long)shmaddr & (SHMLBA - 1)))
+		return sys_shmat(shmid, shmaddr, shmflg);
+
+	smap = sys_shmat(shmid, NULL, shmflg);
+	if (IS_ERR_VALUE(smap)) {
+		pr_err("shmat() with NULL shmaddr failed: %d\n", (int)smap);
+		return smap;
+	}
+
+	/* We're lucky! */
+	if (smap == (unsigned long)shmaddr)
+		return smap;
+
+	/* Warn ALOUD */
+	pr_warn("Restoring shmem %p unaligned to SHMLBA.\n", shmaddr);
+	pr_warn("Make sure that you don't migrate shmem from non-VIPT cached CPU to VIPT cached (e.g., ARMv7 -> ARMv6)\n");
+	pr_warn("Otherwise YOU HAVE A CHANCE OF DATA CORRUPTIONS in writeable shmem\n");
+
+	smap = sys_mremap(smap, size, size,
+			MREMAP_FIXED | MREMAP_MAYMOVE, (unsigned long)shmaddr);
+	if (IS_ERR_VALUE(smap))
+		pr_err("mremap() for shmem failed: %d\n", (int)smap);
+	return smap;
+}
diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c
index 53ecaa0520e2..70ebcdc18a83 100644
--- a/criu/pie/restorer.c
+++ b/criu/pie/restorer.c
@@ -543,6 +543,14 @@ static long restore_self_exe_late(struct task_restore_args *args)
 	return ret;
 }
 
+#ifndef ARCH_HAS_SHMAT_HOOK
+unsigned long arch_shmat(int shmid, void *shmaddr,
+			int shmflg, unsigned long size)
+{
+	return sys_shmat(shmid, shmaddr, shmflg);
+}
+#endif
+
 static unsigned long restore_mapping(VmaEntry *vma_entry)
 {
 	int prot	= vma_entry->prot;
@@ -551,6 +559,8 @@ static unsigned long restore_mapping(VmaEntry *vma_entry)
 
 	if (vma_entry_is(vma_entry, VMA_AREA_SYSVIPC)) {
 		int att_flags;
+		void *shmaddr = decode_pointer(vma_entry->start);
+		unsigned long shmsize = (vma_entry->end - vma_entry->start);
 		/*
 		 * See comment in open_shmem_sysv() for what SYSV_SHMEM_SKIP_FD
 		 * means and why we check for PROT_EXEC few lines below.
@@ -565,7 +575,7 @@ static unsigned long restore_mapping(VmaEntry *vma_entry)
 			att_flags = SHM_RDONLY;
 
 		pr_info("Attach SYSV shmem %d at %"PRIx64"\n", (int)vma_entry->fd, vma_entry->start);
-		return sys_shmat(vma_entry->fd, decode_pointer(vma_entry->start), att_flags);
+		return arch_shmat(vma_entry->fd, shmaddr, att_flags, shmsize);
 	}
 
 	/*
-- 
2.12.2
