[CRIU] [PATCH] Punch holes in input files when restoring anonymous non-shared memory if --auto-dedup is enabled.
Andrei Vagin
avagin at virtuozzo.com
Fri Jul 27 04:59:45 MSK 2018
Applied, thanks! Here is one in-line comment:
On Tue, Jul 24, 2018 at 12:12:27PM +0200, Pawel Stradomski wrote:
> This reduces memory usage if image files are stored on tmpfs.
>
> Signed-off-by: Pawel Stradomski <pstradomski at google.com>
> ---
> .../arch/arm/plugins/std/syscalls/syscall.def | 1 +
> .../plugins/std/syscalls/syscall-ppc64.tbl | 1 +
> .../plugins/std/syscalls/syscall-s390.tbl | 1 +
> .../x86/plugins/std/syscalls/syscall_32.tbl | 1 +
> .../x86/plugins/std/syscalls/syscall_64.tbl | 1 +
> criu/mem.c | 6 +++-
> criu/pie/restorer.c | 31 +++++++++++++++++++
> 7 files changed, 41 insertions(+), 1 deletion(-)
>
> diff --git a/compel/arch/arm/plugins/std/syscalls/syscall.def b/compel/arch/arm/plugins/std/syscalls/syscall.def
> index b68f9f2f..bcd61d4a 100644
> --- a/compel/arch/arm/plugins/std/syscalls/syscall.def
> +++ b/compel/arch/arm/plugins/std/syscalls/syscall.def
> @@ -109,3 +109,4 @@ seccomp 277 383 (unsigned int op, unsigned int flags, const char *uargs)
> gettimeofday 169 78 (struct timeval *tv, struct timezone *tz)
> preadv_raw 69 361 (int fd, struct iovec *iov, unsigned long nr, unsigned long pos_l, unsigned long pos_h)
> userfaultfd 282 388 (int flags)
> +fallocate 47 352 (int fd, int mode, loff_t offset, loff_t len)
> diff --git a/compel/arch/ppc64/plugins/std/syscalls/syscall-ppc64.tbl b/compel/arch/ppc64/plugins/std/syscalls/syscall-ppc64.tbl
> index fa0b034e..62e0bc1a 100644
> --- a/compel/arch/ppc64/plugins/std/syscalls/syscall-ppc64.tbl
> +++ b/compel/arch/ppc64/plugins/std/syscalls/syscall-ppc64.tbl
> @@ -89,6 +89,7 @@ __NR_set_robust_list 300 sys_set_robust_list (struct robust_list_head *head, si
> __NR_get_robust_list 299 sys_get_robust_list (int pid, struct robust_list_head **head_ptr, size_t *len_ptr)
> __NR_vmsplice 285 sys_vmsplice (int fd, const struct iovec *iov, unsigned long nr_segs, unsigned int flags)
> __NR_openat 286 sys_openat (int dfd, const char *filename, int flags, int mode)
> +__NR_fallocate 309 sys_fallocate (int fd, int mode, loff_t offset, loff_t len)
> __NR_timerfd_settime 311 sys_timerfd_settime (int ufd, int flags, const struct itimerspec *utmr, struct itimerspec *otmr)
> __NR_signalfd4 313 sys_signalfd (int fd, k_rtsigset_t *mask, size_t sizemask, int flags)
> __NR_rt_tgsigqueueinfo 322 sys_rt_tgsigqueueinfo (pid_t tgid, pid_t pid, int sig, siginfo_t *info)
> diff --git a/compel/arch/s390/plugins/std/syscalls/syscall-s390.tbl b/compel/arch/s390/plugins/std/syscalls/syscall-s390.tbl
> index bc77ae97..3521e915 100644
> --- a/compel/arch/s390/plugins/std/syscalls/syscall-s390.tbl
> +++ b/compel/arch/s390/plugins/std/syscalls/syscall-s390.tbl
> @@ -89,6 +89,7 @@ __NR_set_robust_list 304 sys_set_robust_list (struct robust_list_head *head, si
> __NR_get_robust_list 305 sys_get_robust_list (int pid, struct robust_list_head **head_ptr, size_t *len_ptr)
> __NR_vmsplice 309 sys_vmsplice (int fd, const struct iovec *iov, unsigned long nr_segs, unsigned int flags)
> __NR_openat 288 sys_openat (int dfd, const char *filename, int flags, int mode)
> +__NR_fallocate 314 sys_fallocate (int fd, int mode, loff_t offset, loff_t len)
> __NR_timerfd_settime 320 sys_timerfd_settime (int ufd, int flags, const struct itimerspec *utmr, struct itimerspec *otmr)
> __NR_signalfd4 322 sys_signalfd (int fd, k_rtsigset_t *mask, size_t sizemask, int flags)
> __NR_rt_tgsigqueueinfo 330 sys_rt_tgsigqueueinfo (pid_t tgid, pid_t pid, int sig, siginfo_t *info)
> diff --git a/compel/arch/x86/plugins/std/syscalls/syscall_32.tbl b/compel/arch/x86/plugins/std/syscalls/syscall_32.tbl
> index 9e1de281..a6c55b83 100644
> --- a/compel/arch/x86/plugins/std/syscalls/syscall_32.tbl
> +++ b/compel/arch/x86/plugins/std/syscalls/syscall_32.tbl
> @@ -83,6 +83,7 @@ __NR_set_robust_list 311 sys_set_robust_list (struct robust_list_head *head, si
> __NR_get_robust_list 312 sys_get_robust_list (int pid, struct robust_list_head **head_ptr, size_t *len_ptr)
> __NR_vmsplice 316 sys_vmsplice (int fd, const struct iovec *iov, unsigned int nr_segs, unsigned int flags)
> __NR_signalfd 321 sys_signalfd (int ufd, const k_rtsigset_t *sigmask, size_t sigsetsize)
> +__NR_fallocate 324 sys_fallocate (int fd, int mode, loff_t offset, loff_t len)
> __NR_timerfd_settime 325 sys_timerfd_settime (int ufd, int flags, const struct itimerspec *utmr, struct itimerspec *otmr)
> __NR_preadv 333 sys_preadv_raw (int fd, struct iovec *iov, unsigned long nr, unsigned long pos_l, unsigned long pos_h)
> __NR_rt_tgsigqueueinfo 335 sys_rt_tgsigqueueinfo (pid_t tgid, pid_t pid, int sig, siginfo_t *uinfo)
> diff --git a/compel/arch/x86/plugins/std/syscalls/syscall_64.tbl b/compel/arch/x86/plugins/std/syscalls/syscall_64.tbl
> index 726fa797..64271514 100644
> --- a/compel/arch/x86/plugins/std/syscalls/syscall_64.tbl
> +++ b/compel/arch/x86/plugins/std/syscalls/syscall_64.tbl
> @@ -94,6 +94,7 @@ __NR_set_robust_list 273 sys_set_robust_list (struct robust_list_head *head, s
> __NR_get_robust_list 274 sys_get_robust_list (int pid, struct robust_list_head **head_ptr, size_t *len_ptr)
> __NR_seccomp 317 sys_seccomp (unsigned int op, unsigned int flags, const char *uargs)
> __NR_vmsplice 278 sys_vmsplice (int fd, const struct iovec *iov, unsigned long nr_segs, unsigned int flags)
> +__NR_fallocate 285 sys_fallocate (int fd, int mode, loff_t offset, loff_t len)
> __NR_timerfd_settime 286 sys_timerfd_settime (int ufd, int flags, const struct itimerspec *utmr, struct itimerspec *otmr)
> __NR_signalfd4 289 sys_signalfd (int fd, k_rtsigset_t *mask, size_t sizemask, int flags)
> __NR_preadv 295 sys_preadv_raw (int fd, struct iovec *iov, unsigned long nr, unsigned long pos_l, unsigned long pos_h)
> diff --git a/criu/mem.c b/criu/mem.c
> index d020b7fd..44d0e258 100644
> --- a/criu/mem.c
> +++ b/criu/mem.c
> @@ -1271,7 +1271,11 @@ static int prepare_vma_ios(struct pstree_item *t, struct task_restore_args *ta)
> {
> struct cr_img *pages;
>
> - pages = open_image(CR_FD_PAGES, O_RSTR, rsti(t)->pages_img_id);
> + /* if auto-dedup is on we need RDWR mode to be able to punch holes
> + * in the input files (in restorer.c)
> + */
> + pages = open_image(CR_FD_PAGES, opts.auto_dedup ? O_RDWR : O_RSTR,
> + rsti(t)->pages_img_id);
This doesn't work for userns tests:
[root@fc24 criu]# python test/zdtm.py run -t zdtm/static/env00 -f uns --dedup
=== Run 1/1 ================ zdtm/static/env00
========================= Run zdtm/static/env00 in uns =========================
Start test
./env00 --pidfile=env00.pid --outfile=env00.out --envname=ENV_00_TEST
Run criu dump
files stat: fs/file-max 400903, fs/nr_open 1048576
rlimit: RLIMIT_NOFILE unlimited for self
Loaded kdat cache from /run/criu.kdat
Run criu restore
files stat: fs/file-max 400903, fs/nr_open 1048576
rlimit: RLIMIT_NOFILE unlimited for self
Loaded kdat cache from /run/criu.kdat
=[log]=> dump/zdtm/static/env00/43/1/restore.log
------------------------ grep Error ------------------------
(00.195834) 1: `- FD 1 pid 4
(00.195841) 1: `- FD 2 pid 4
(00.195848) 1: `- type 1 ID 0xa
(00.195855) 1: `- FD 3 pid 4
(00.278816) 1: Error (criu/image.c:432): Unable to open pages-1.img: Permission denied
(00.296743) uns: calling exit_usernsd (-1, 1)
(00.296822) uns: daemon calls 0x4675b0 (62, -1, 1)
(00.296836) uns: `- daemon exits w/ 0
(00.298277) uns: daemon stopped
(00.298298) Error (criu/cr-restore.c:2308): Restoring FAILED.
------------------------ ERROR OVER ------------------------
################# Test zdtm/static/env00 FAIL at CRIU restore ##################
##################################### FAIL #####################################
CRIU opens an image from a target userns and fails to open it for read-write:
-rw-r--r-- 1 root root 106496 Jul 27 04:54 test/dump/zdtm/static/env00/43/1/pages-1.img
Probably, we need to use userns_call() to open images in this case.
> if (!pages)
> return -1;
>
> diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c
> index f990e9b7..7c70d02a 100644
> --- a/criu/pie/restorer.c
> +++ b/criu/pie/restorer.c
> @@ -50,6 +50,15 @@
> #define PR_SET_PDEATHSIG 1
> #endif
>
> +#ifndef FALLOC_FL_KEEP_SIZE
> +#define FALLOC_FL_KEEP_SIZE 0x01
> +#endif
> +
> +#ifndef FALLOC_FL_PUNCH_HOLE
> +#define FALLOC_FL_PUNCH_HOLE 0x02
> +#endif
> +
> +
> #define sys_prctl_safe(opcode, val1, val2, val3) \
> ({ \
> long __ret = sys_prctl(opcode, val1, val2, val3, 0); \
> @@ -646,6 +655,14 @@ static unsigned long restore_mapping(VmaEntry *vma_entry)
> !(vma_entry->status & VMA_NO_PROT_WRITE))
> prot |= PROT_WRITE;
>
> + /* TODO: Drop MAP_LOCKED bit and restore it after reading memory.
> + *
> + * Code below tries to limit memory usage by running fallocate()
> + * after each preadv() to avoid doubling memory usage (once in
> + * image files, once in process). Unfortunately, MAP_LOCKED defeats
> + * that mechanism as it causes the process to be charged for memory
> + * immediately upon mmap, not later upon preadv().
> + */
> pr_debug("\tmmap(%"PRIx64" -> %"PRIx64", %x %x %d)\n",
> vma_entry->start, vma_entry->end,
> prot, flags, (int)vma_entry->fd);
> @@ -1355,6 +1372,11 @@ long __export_restore_task(struct task_restore_args *args)
> struct iovec *iovs = rio->iovs;
> int nr = rio->nr_iovs;
> ssize_t r;
> + int file_flags = sys_fcntl(args->vma_ios_fd, F_GETFL, 0);
> + if (file_flags < 0) {
> + pr_err("Can't check file flags\n");
> + file_flags = 0;
> + }
>
> while (nr) {
> pr_debug("Preadv %lx:%d... (%d iovs)\n",
> @@ -1367,6 +1389,15 @@ long __export_restore_task(struct task_restore_args *args)
> }
>
> pr_debug("`- returned %ld\n", (long)r);
> + /* If the file is open for writing, then it means we should punch holes
> + * in it. */
> + if (r > 0 && (file_flags & O_RDWR)) {
> + int fr = sys_fallocate(args->vma_ios_fd, FALLOC_FL_KEEP_SIZE|FALLOC_FL_PUNCH_HOLE,
> + rio->off, r);
> + if (fr < 0) {
> + pr_debug("Failed to punch holes with fallocate: %d\n", fr);
> + }
> + }
> rio->off += r;
> /* Advance the iovecs */
> do {
> --
> 2.18.0.233.g985f88cf7e-goog
>
> _______________________________________________
> CRIU mailing list
> CRIU at openvz.org
> https://lists.openvz.org/mailman/listinfo/criu
More information about the CRIU mailing list